<p>我建议完全不要使用<code>Items</code>类,并使用<code>start_requests</code>方法而不是<code>start_urls</code>,因为它们确实令人困惑。请参阅此处的完整工作代码。还要注意<code>match_heading</code>变量。</p>
<pre><code>class CrawlbotSpider(Spider):
name = 'bigcrawler'
allowed_domains = ['www.matchstat.com']
start_urls = [
'https://matchstat.com/tennis/tournaments/w/Taipei/2015',
'https://matchstat.com/tennis/tournaments/w/Hong%20Kong/2017',
]
def start_requests(self):
match_urls = [
'https://matchstat.com/tennis/tournaments/w/Taipei/2015',
'https://matchstat.com/tennis/tournaments/w/Hong%20Kong/2017',
]
for url in match_urls:
yield Request(url=url, callback=self.parse_matches)
def parse_matches(self , response):
match_heading = response.xpath('//*[@id="AWS"]/div/h3/text()').extract_first()
for row in response.css('tr.match'):
match = {}
match['heading'] = match_heading
match['round'] = row.css(".round::text").extract_first()
match['event1'] = row.css(".event-name a::text").extract_first()
match['player_1'] = row.css(".player-name:nth-child(2) a::text").extract_first()
match['player_2'] = row.css(".player-name:nth-child(3) a::text").extract_first()
match['player_1_odds'] = row.css(".odds-td.odds-0 [payout]::text").extract_first()
match['player_2_odds'] = row.css(".odds-td.odds-1 [payout]::text").extract_first()
match['h_2_h'] = row.css("a.h2h::text::text").extract_first()
yield match
</code></pre>