Scraping multiple tables and storing each table's header with every row in a CSV

Published 2024-10-02 10:20:28


I am trying to scrape several tables whose names are stored in h3 tags. I can scrape the column data without any problem, and when the next URL is fed in I can append that data to a CSV file. The problem I cannot solve is grabbing each table's header and storing it against every row of that table, so that when the next table comes through I know which table each row belongs to. Is it possible to use a len() loop to work out the table's length and then write the header into every row? And can that then be exported to CSV?
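The "write the header into every row" idea can be done without a len() loop: iterate the rows of each table and emit the heading alongside each one. A minimal stdlib sketch, using hypothetical data in place of the scraped tables:

```python
import csv
import io

# Hypothetical scraped data: one heading per table, a few rows each.
tables = {
    'Taipei 2015': [{'round': 'R1'}, {'round': 'R2'}],
    'Hong Kong 2017': [{'round': 'QF'}],
}

buf = io.StringIO()
writer = csv.DictWriter(buf, fieldnames=['heading', 'round'])
writer.writeheader()
for heading, rows in tables.items():
    for row in rows:  # no len() loop needed; just iterate the rows
        # Repeat the table heading in every row it belongs to.
        writer.writerow({'heading': heading, **row})
```

With the approach in the answer below, Scrapy's feed exporter achieves the same result: `scrapy crawl bigcrawler -o matches.csv` writes one CSV row per yielded dict, heading included.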

Here is my code (spider.py):

from scrapy import Spider, Request

from bigcrawler.items import BigcrawlerItem, MatchStatItemLoader


class CrawlbotSpider(Spider):
    name = 'bigcrawler'
    allowed_domains = ['www.matchstat.com']
    start_urls = [
        'https://matchstat.com/tennis/tournaments/w/Taipei/2015',
        'https://matchstat.com/tennis/tournaments/w/Hong%20Kong/2017',
    ]

    def parse_header(self, response):
        for tb in response.css('tr.match'):
            heading = tb.xpath('//*[@id="AWS"]/div/h3/text()').extract_first()
            for td in tb.xpath(".//td[contains(@class, 'round')]/text()"):
                il = BigcrawlerItem(selector=td)
                il.add_value('event_title', heading)
                yield il.load_item()

    def parse(self, response):
        for row in response.css('tr.match'):
            il = MatchStatItemLoader(selector=row)
            il.add_css('round', '.round::text')
            il.add_css('event1', '.event-name a::text')
            il.add_css('player_1', '.player-name:nth-child(2) a::text')
            il.add_css('player_2', '.player-name:nth-child(3) a::text')
            il.add_css('player_1_odds', '.odds-td.odds-0 [payout]::text')
            il.add_css('player_2_odds', '.odds-td.odds-1 [payout]::text')
            il.add_css('h_2_h', 'a.h2h::text')
            yield il.load_item()

And items.py:

(the items.py code was lost when the post was archived)

2 Answers

I'd suggest not using the Item classes at all, and using the start_requests method instead of start_urls, since Items are genuinely confusing. Full working code is below; note the match_heading variable.

class CrawlbotSpider(Spider):
    name = 'bigcrawler'
    allowed_domains = ['www.matchstat.com']

    def start_requests(self):
        match_urls = [
            'https://matchstat.com/tennis/tournaments/w/Taipei/2015',
            'https://matchstat.com/tennis/tournaments/w/Hong%20Kong/2017',
        ]
        for url in match_urls:
            yield Request(url=url, callback=self.parse_matches)

    def parse_matches(self, response):
        # Grab the table heading once per page...
        match_heading = response.xpath('//*[@id="AWS"]/div/h3/text()').extract_first()

        for row in response.css('tr.match'):
            # ...and attach it to every row, so each exported row
            # records which table it came from.
            match = {}
            match['heading'] = match_heading
            match['round'] = row.css(".round::text").extract_first()
            match['event1'] = row.css(".event-name a::text").extract_first()
            match['player_1'] = row.css(".player-name:nth-child(2) a::text").extract_first()
            match['player_2'] = row.css(".player-name:nth-child(3) a::text").extract_first()
            match['player_1_odds'] = row.css(".odds-td.odds-0 [payout]::text").extract_first()
            match['player_2_odds'] = row.css(".odds-td.odds-1 [payout]::text").extract_first()
            match['h_2_h'] = row.css("a.h2h::text").extract_first()

            yield match

If there is only one heading and it does not need to be relative to the current node, try this:

il.add_xpath('event_title', '//*[@id="AWS"]//h3/text()')

But if you need it relative to the current node, you can also do something like this:

(the relative-XPath snippet was lost when the post was archived)
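The absolute-versus-relative distinction above is the crux: in Scrapy, an XPath starting with `//` searches the whole document even when called on a row selector, while `.//` stays inside the current node. A stdlib sketch of the same idea, using hypothetical markup mimicking the page structure described in the question:

```python
import xml.etree.ElementTree as ET

# Hypothetical markup: one <h3> heading above a table of match rows.
doc = ET.fromstring(
    "<div id='AWS'>"
    "<h3>Taipei 2015</h3>"
    "<table>"
    "<tr class='match'><td class='round'>R1</td></tr>"
    "<tr class='match'><td class='round'>R2</td></tr>"
    "</table>"
    "</div>"
)

# Document-wide lookup: one heading covers the whole table.
heading = doc.find('.//h3').text

# Relative lookup from each row: the row itself contains no <h3>,
# so the heading must be captured once per page and attached to
# every row record, as in the answer's parse_matches.
rows = doc.findall(".//tr[@class='match']")
for row in rows:
    assert row.find('.//h3') is None
    record = {'heading': heading, 'round': row.find("td[@class='round']").text}
```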
