擅长:python、mysql、java
<p>使用<code>lxml</code>会更快:</p>
<pre><code>from urllib.request import urlopen
#from bs4 import BeautifulSoup, Comment
from lxml import html
# Download the season's standard-pitching page; the context manager closes
# the HTTP response as soon as the body has been read.
with urlopen("https://www.baseball-reference.com/leagues/MLB/2018-standard-pitching.shtml") as response:
    content = response.read()
tree = html.fromstring(content)

# The target table is wrapped in an HTML comment, so locate the comment node
# whose text mentions the table id. Indexing with [0] raises IndexError if the
# page no longer contains such a comment.
comment_node = tree.xpath('//comment()[contains(., "players_standard_pitching")]')[0]

# A comment node's .text is its content without the surrounding comment
# delimiters, so no string replacement is needed to strip the markup.
comment_html = comment_node.text

# Parse the commented-out markup as its own HTML fragment.
tree = html.fromstring(comment_html)

# Print the csk attribute of the player cell for every "full_table" row.
for pitcher_row in tree.xpath('//table[@id="players_standard_pitching"]/tbody/tr[contains(@class, "full_table")]'):
    csk = pitcher_row.xpath('./td[@data-stat="player"]/@csk')[0]
    print(csk)
</code></pre>