Scrapy：由于多个for循环，项目字段重复

import scrapy class Product(scrapy.Item): Date = scrapy.Field() Name = scrapy.Field() Winner_Name= scrapy.Field() Match_Duration= scrapy.Field() Loser_Url= scrapy.Field() Winner_Birthday= scrapy.Field() Loser_Birthday= scrapy.Field() pass

import scrapy from urllib.parse import urljoin from items import Product start = 2018 finish =2019 class QuotesSpider(scrapy.Spider): name = "brief" custom_settings = {'CONCURRENT_REQUESTS':1,'DOWNLOAD_DELAY':1} def start_requests(self): urls = "https://www.atptour.com/en/scores/results-archive?year=2018" for year in range(start, finish): next_page = urljoin(urls,"?year=" + str(year)) yield scrapy.Request(next_page, callback=self.parse, meta={'dont_obey_robotstxt': True},dont_filter=True) def parse(self, response): for tournament in response.css('tr.tourney-result'): item = Product() item['Date']=tournament.css('span.tourney-dates::text').get().replace("\r", '').replace("\n", '').replace("\t", '').strip(), item['Name']= tournament.css('span.tourney-title::text').get().replace("\r", '').replace("\n", '').replace("\t", '').strip(), Tourney_URL= response.urljoin(tournament.css('a.button-border::attr(href)').get()) yield scrapy.Request(Tourney_URL, callback=self.tourney_info, meta={'dont_obey_robotstxt': True, 'item':item},dont_filter=True) def tourney_info(self, response): table = response.css('table.day-table tbody') rows = table.css('tr') for row in rows: item = response.meta['item'] names = row.css('td.day-table-name a::text').getall() item['Winner_Name']= names[0].replace("\r", '').replace("\n", '').replace("\t", '').strip(), Match_URL = response.urljoin(row.css('td.day-table-score a::attr(href)').get()) yield scrapy.Request(Match_URL, callback=self.matchinfo, meta={'dont_obey_robotstxt': True,'item':item},dont_filter=True) def matchinfo(self, response): item = response.meta['item'] table = response.css('table.match-stats-table') mytime = response.css('td.time::text').get().replace("\r", '').replace("\n", '').replace("\t", '').strip() factors = (60, 1, 1 / 60) t1 = sum(i * j for i, j in zip(map(int, mytime.split(':')), factors)) item['Match_Duration'] = t1 item['Loser_Url']=response.urljoin(response.css('div.player-right-image a::attr(href)').get()) winner_url = response.urljoin(response.css('div.player-left-image a::attr(href)').get()) yield scrapy.Request(winner_url, callback=self.winnerinfo, meta={'dont_obey_robotstxt': True,'item':item},dont_filter=True) def winnerinfo(self, response): item = response.meta['item'] item['Winner_Birthday'] = response.css('span.table-birthday::text').get().replace("\r", '').replace("\n", '').replace("\t", '').strip() yield scrapy.Request(item['Loser_Url'], callback=self.loserinfo, meta={'dont_obey_robotstxt': True, 'item': item},dont_filter=True) def loserinfo(self, response): item = response.meta['item'] item['Loser_Birthday'] = response.css('span.table-birthday::text').get().replace("\r", '').replace("\n", '').replace("\t", '').strip() yield item

1条回答

网友

1楼 · 发布于 2024-05-20 18:43:05

您用来存储所刮取数据的项是可变的。你应该照做项目.副本（）然后将其传递给下一个请求，例如在parse上（可能也在tourney\u info上）。由于每个方法都在json文件上生成一个en条目，但是您持有对同一项的多个引用，因此预期会发生这种情况。你知道吗

如果你想的话，也可以选择https://docs.scrapy.org/en/latest/topics/items.html#copying-items。你知道吗

相关问题更多 >

编程相关推荐

热门问题

热门文章