Scrapy: populating an item with an item loader across multiple pages

Posted 2024-10-04 07:31:17


I am trying to crawl and scrape multiple web pages, given multiple URLs. I am testing with Wikipedia, and to keep things simple I use the same XPath selector on every page, but eventually I want to use many different XPath selectors unique to each page, so each page gets its own separate parsePage method.

This code works perfectly when I don't use an item loader and just populate the item directly. When I use an item loader, the items are populated strangely: it seems to completely ignore the callback assigned in the parse method and only use the start_urls for the parsePage methods.

import scrapy
from scrapy import Request
from testanother.items import TestItems, TheLoader

class tester(scrapy.Spider):
    name = 'vs'
    handle_httpstatus_list = [404, 200, 300]
    # Usually, I only get data from the first start url
    start_urls = [
        'https://en.wikipedia.org/wiki/SANZAAR',
        'https://en.wikipedia.org/wiki/2016_Rugby_Championship',
        'https://en.wikipedia.org/wiki/2016_Super_Rugby_season',
    ]

    def parse(self, response):
        # item = TestItems()
        l = TheLoader(item=TestItems(), response=response)
        # When I use an item loader, the url in the request is completely
        # ignored. Without the item loader, it works properly.
        request = Request("https://en.wikipedia.org/wiki/2016_Rugby_Championship",
                          callback=self.parsePage1, meta={'loadernext': l},
                          dont_filter=True)
        yield request

        request = Request("https://en.wikipedia.org/wiki/SANZAAR",
                          callback=self.parsePage2, meta={'loadernext1': l},
                          dont_filter=True)
        yield request

        yield Request("https://en.wikipedia.org/wiki/2016_Super_Rugby_season",
                      callback=self.parsePage3, meta={'loadernext2': l},
                      dont_filter=True)

    def parsePage1(self, response):
        loadernext = response.meta['loadernext']
        loadernext.add_xpath('title1', '//*[@id="firstHeading"]/text()')
        return loadernext.load_item()

    # I'm not sure if this return and load_item is the problem, because I've
    # tried yielding/returning to another method that does the item loading
    # instead and the first start url is still the only url scraped.
    def parsePage2(self, response):
        loadernext1 = response.meta['loadernext1']
        loadernext1.add_xpath('title2', '//*[@id="firstHeading"]/text()')
        return loadernext1.load_item()

    def parsePage3(self, response):
        loadernext2 = response.meta['loadernext2']
        loadernext2.add_xpath('title3', '//*[@id="firstHeading"]/text()')
        return loadernext2.load_item()

Here are the results when I don't use an item loader: (the output block was not preserved in the original post)

And here is part of the log when using the item loaders:

{'title2': u'SANZAAR'}
2016-09-24 14:30:43 [scrapy] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/2016_Rugby_Championship> (referer: https://en.wikipedia.org/wiki/SANZAAR)
2016-09-24 14:30:43 [scrapy] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/2016_Rugby_Championship> (referer: https://en.wikipedia.org/wiki/2016_Rugby_Championship)
2016-09-24 14:30:43 [scrapy] DEBUG: Scraped from <200 https://en.wikipedia.org/wiki/2016_Super_Rugby_season>
{'title2': u'SANZAAR', 'title3': u'SANZAAR'}
2016-09-24 14:30:43 [scrapy] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/SANZAAR> (referer: https://en.wikipedia.org/wiki/2016_Rugby_Championship)
2016-09-24 14:30:43 [scrapy] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/2016_Rugby_Championship> (referer: https://en.wikipedia.org/wiki/2016_Super_Rugby_season)
2016-09-24 14:30:43 [scrapy] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/2016_Super_Rugby_season> (referer: https://en.wikipedia.org/wiki/2016_Rugby_Championship)
2016-09-24 14:30:43 [scrapy] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/2016_Super_Rugby_season> (referer: https://en.wikipedia.org/wiki/2016_Super_Rugby_season)
2016-09-24 14:30:43 [scrapy] DEBUG: Scraped from <200 https://en.wikipedia.org/wiki/2016_Rugby_Championship>
{'title1': u'SANZAAR', 'title2': u'SANZAAR', 'title3': u'SANZAAR'}
2016-09-24 14:30:43 [scrapy] DEBUG: Scraped from <200 https://en.wikipedia.org/wiki/2016_Rugby_Championship>
{'title1': u'2016 Rugby Championship'}
2016-09-24 14:30:43 [scrapy] DEBUG: Scraped from <200 https://en.wikipedia.org/wiki/SANZAAR>
{'title1': u'2016 Rugby Championship', 'title2': u'2016 Rugby Championship'}
2016-09-24 14:30:43 [scrapy] DEBUG: Scraped from <200 https://en.wikipedia.org/wiki/2016_Rugby_Championship>
{'title1': u'2016 Super Rugby season'}
2016-09-24 14:30:43 [scrapy] DEBUG: Crawled (200) <GET https://en.wikipedia.org/wiki/SANZAAR> (referer: https://en.wikipedia.org/wiki/2016_Super_Rugby_season)
2016-09-24 14:30:43 [scrapy] DEBUG: Scraped from <200 https://en.wikipedia.org/wiki/2016_Super_Rugby_season>
{'title1': u'2016 Rugby Championship',
 'title2': u'2016 Rugby Championship',
 'title3': u'2016 Rugby Championship'}
2016-09-24 14:30:43 [scrapy] DEBUG: Scraped from <200 https://en.wikipedia.org/wiki/2016_Super_Rugby_season>
{'title1': u'2016 Super Rugby season', 'title3': u'2016 Super Rugby season'}
2016-09-24 14:30:43 [scrapy] DEBUG: Scraped from <200 https://en.wikipedia.org/wiki/SANZAAR>
{'title1': u'2016 Super Rugby season',
 'title2': u'2016 Super Rugby season',
 'title3': u'2016 Super Rugby season'}
 2016-09-24 14:30:43 [scrapy] INFO: Clos

What exactly is going wrong? Thanks!


1 Answer

One problem is that you are passing multiple references to the same item loader instance into multiple callbacks: there are several yield request statements in parse, and each request carries the same loader.

Also, in the follow-up callbacks the loader is still using the old response object; for example, in parsePage1 the item loader still operates on the response from parse.
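This response binding can be demonstrated without Scrapy. The stand-in classes below (FakeResponse/FakeLoader are illustrative names, not Scrapy API) mimic how an ItemLoader holds on to the response it was constructed with, so add_xpath-style calls in a later callback still read from the original page:

```python
class FakeResponse:
    """Stand-in for scrapy.http.Response: just carries a page title."""
    def __init__(self, url, title):
        self.url = url
        self.title = title

class FakeLoader:
    """Mimics ItemLoader: bound to ONE response at construction time."""
    def __init__(self, item, response):
        self.item = item
        self.response = response  # extraction always reads from here

    def add_title(self, field):
        # Like add_xpath: extracts from the response bound at __init__,
        # NOT from whatever response the current callback received.
        self.item[field] = self.response.title

    def load_item(self):
        return dict(self.item)

first = FakeResponse("https://en.wikipedia.org/wiki/SANZAAR", "SANZAAR")
later = FakeResponse("https://en.wikipedia.org/wiki/2016_Rugby_Championship",
                     "2016 Rugby Championship")

l = FakeLoader(item={}, response=first)   # created in parse()
# ... the request fires and the callback receives `later` as its response ...
l.add_title("title1")                     # still reads from `first`
item = l.load_item()
print(item)  # {'title1': 'SANZAAR'} -- not the page the callback fetched
```

This is exactly the pattern in the question's logs: every title field is filled from whichever page the loader was created on, regardless of which callback ran.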

In most cases it is not a good idea to pass an item loader to another callback. Instead, you may find it works better to pass the item object directly.

Here is a brief (and incomplete) example, based on your code:

def parse(self, response):
    l = TheLoader(item=TestItems(), response=response)
    request = Request(
        "https://en.wikipedia.org/wiki/2016_Rugby_Championship",
        callback=self.parsePage1,
        meta={'item': l.load_item()},
        dont_filter=True
    )
    yield request

def parsePage1(self, response):
    loadernext = TheLoader(item=response.meta['item'], response=response)
    loadernext.add_xpath('title1', '//*[@id="firstHeading"]/text()')
    return loadernext.load_item()
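If the end goal is a single item holding all three titles, the usual fix is to chain the requests: each callback builds the next request itself, passes the partially filled item forward in meta, and constructs a fresh loader for its own response. The sketch below simulates that chain with plain functions and stand-in objects so the flow is visible without running a crawler (PAGES and the Response class are test fixtures, not Scrapy API; in a real spider each step would be a TheLoader bound to the current response):

```python
# Stand-in "web": url -> page title, mimicking the three Wikipedia pages.
PAGES = {
    "https://en.wikipedia.org/wiki/SANZAAR": "SANZAAR",
    "https://en.wikipedia.org/wiki/2016_Rugby_Championship": "2016 Rugby Championship",
    "https://en.wikipedia.org/wiki/2016_Super_Rugby_season": "2016 Super Rugby season",
}

class Response:
    """Minimal stand-in for scrapy.http.Response with a meta dict."""
    def __init__(self, url, meta=None):
        self.url = url
        self.title = PAGES[url]
        self.meta = meta or {}

def parse(response):
    # First callback: start the item, hand it to the next request via meta.
    item = {"title1": response.title}
    return ("https://en.wikipedia.org/wiki/2016_Rugby_Championship",
            parse_page2, {"item": item})

def parse_page2(response):
    # Fresh extraction bound to THIS response; continue the same item.
    item = response.meta["item"]
    item["title2"] = response.title
    return ("https://en.wikipedia.org/wiki/2016_Super_Rugby_season",
            parse_page3, {"item": item})

def parse_page3(response):
    item = response.meta["item"]
    item["title3"] = response.title
    return item  # final item: fields from all three pages

# Drive the chain the way Scrapy's engine would: fetch, call callback, repeat.
url, callback, meta = parse(Response("https://en.wikipedia.org/wiki/SANZAAR"))
url, callback, meta = callback(Response(url, meta))
final = callback(Response(url, meta))
print(final)
```

Because each step extracts from the response it actually received, the final item mixes titles from all three pages instead of repeating the first page's title, which is what the broken logs in the question show.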
