使用Scrapy的多个请求

import scrapy import re import requests class ProbRentBaySpider(scrapy.Spider): name = 'prob_rent_bay' start_urls = [ 'https://www.propbay.co.za/for-sale-property/residential/za/western_cape/7', 'https://www.rentbay.co.za/to-rent-property/residential/za/western_cape/7' ] headers = { 'authority': 'www.propbay.co.za', 'pragma': 'no-cache', 'cache-control': 'no-cache', 'sec-ch-ua': '"Chromium";v="86", "\\"Not\\\\A;Brand";v="99", "Google Chrome";v="86"', 'accept': '*/*', 'x-requested-with': 'XMLHttpRequest', 'sec-ch-ua-mobile': '?1', 'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36', 'content-type': 'application/json', 'sec-fetch-site': 'same-origin', 'sec-fetch-mode': 'cors', 'sec-fetch-dest': 'empty', 'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8', 'cookie': '_ga=GA1.3.1758254184.1598490742; ASP.NET_SessionId=v1muaskigrgnn40m42lsqzct; __RequestVerificationToken=AIEv13vh8ksXZeG6Tf_o-vLCscKt7sYKJjwB0kz0CfqmCe8ZpYRQQdGk2BnN095p2A6wlFf7o_lVYyxe1Jro-I5vHE01; _gid=GA1.3.900892753.1605696808', } headers2 = { 'authority': 'www.rentbay.co.za', 'pragma': 'no-cache', 'cache-control': 'no-cache', 'sec-ch-ua': '"Chromium";v="86", "\\"Not\\\\A;Brand";v="99", "Google Chrome";v="86"', 'accept': '*/*', 'x-requested-with': 'XMLHttpRequest', 'sec-ch-ua-mobile': '?1', 'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36', 'content-type': 'application/json', 'sec-fetch-site': 'same-origin', 'sec-fetch-mode': 'cors', 'sec-fetch-dest': 'empty', 'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8', 'cookie': 'ASP.NET_SessionId=rexgmrrgju10aw4rirulzrmk; _ga=GA1.3.225479946.1606814269; __RequestVerificationToken=az6ZATA2H0dJfBQ6KuwDwz39XGSiSuIjc4iZwRT8BGSD2surYfA6iOmkIQk2p835G51hYqJd5FFoiSQYsvx-V3Ndx6s1; _gid=GA1.3.1154633027.1607081144; _gat_gtag_UA_7413963_2=1', } base_url_sale = ['https://www.propbay.co.za/Property/ListByCityAjax?city=Cape%20Town&province=western%20cape&cityCode=199041&category=3&page=', 'https://www.propbay.co.za/Property/ListByCityAjax?city=Milnerton&province=western%20cape&cityCode=199014&category=3&page=' ] base_url_rent = ['https://www.rentbay.co.za/Property/ListByCityAjax?city=Cape%20Town&province=western%20cape&cityCode=199041&category=3&page=', 'https://www.rentbay.co.za/Property/ListByCityAjax?city=Milnerton&province=western%20cape&cityCode=199014&category=3&page=' ] def parse(self,response): for page in range(2, 8):# specify page range you would like to scrape data for for link in self.base_url_sale: next_page = link + str(page) response = requests.get(url=next_page, headers=self.headers) for product in response.json()['Data']['Suburbs']: area_url = 'https://www.propbay.co.za'+ product['FriendlyUrl'] yield scrapy.Request(area_url,callback=self.parse_sale) for page2 in range(2, 8):# specify page range you would like to scrape data for for link2 in self.base_url_rent: next_page2 = link2 + str(page2) response2 = requests.get(url=next_page2, headers=self.headers2) for product2 in response2.json()['Data']['Suburbs']: area_url_2 = 'https://www.rentbay.co.za'+ product2['FriendlyUrl'] yield scrapy.Request(area_url_2,callback=self.parse_rent) def parse_sale(self, response): # follow links to property pages for href in response.xpath('//a[@class="u-text-uppercase"]/@href').getall(): follow_link = 'https://www.propbay.co.za'+ href yield response.follow(follow_link, self.parse_property) # follow pagination links for href in response.xpath('//*[@id="btnNext"]/@href'): yield response.follow(href, self.parse_sale) def parse_rent(self, response): # follow links to property pages for href in response.xpath('//a[@class="u-text-uppercase"]/@href').getall(): follow_link = 'https://www.rentbay.co.za'+ href yield response.follow(follow_link, self.parse_property) # follow pagination links for href in response.xpath('//*[@id="btnNext"]/@href').getall(): yield response.follow(href, self.parse_rent) def parse_property(self, response): title = response.css('span.u-text-capitalize::text').get() bedrooms = response.xpath('//span[contains(text(), "Bedrooms")]/following-sibling::span/text()').get() bedrooms = bedrooms.split()[0] if bedrooms is not None else None ...

def parse(self,response): for page in range(2, 8):# specify page range you would like to scrape data for for link in self.base_url_sale: next_page = link + str(page) response = requests.get(url=next_page, headers=self.headers) for product in response.json()['Data']['Suburbs']: area_url = 'https://www.propbay.co.za'+ product['FriendlyUrl'] yield scrapy.Request(area_url,callback=self.parse_rent) def parse_rent(self, response): for page2 in range(2, 8):# specify page range you would like to scrape data for for link2 in self.base_url_rent: next_page2 = link2 + str(page2) response = requests.get(url=next_page2, headers=self.headers2) for product2 in response.json()['Data']['Suburbs']: item = dict() area_url_2 = 'https://www.rentbay.co.za'+ product2['FriendlyUrl'] yield scrapy.Request(area_url_2,callback=self.parse_all) def parse_all(self, response): # follow links to property pages for href in response.xpath('//a[@class="u-text-uppercase"]/@href').getall(): follow_link = 'https://www.propbay.co.za'+ href yield response.follow(follow_link, self.parse_property) # follow pagination links for href in response.xpath('//*[@id="btnNext"]/@href'): yield response.follow(href, self.parse_all)

1条回答

网友

1楼 · 发布于 2024-09-21 03:19:58

首先，您需要覆盖start_请求并为每个url指定不同的回调，然后将parse()逻辑拆分为这两个方法。否则，在通过propbay或rentbay部分循环之前，您至少可以对url进行if检查。目前，两个URL的响应在您的解析中被以相同的方式处理。因此，可能是第一次请求不正确，因为您的响应是针对propbay的，但第二次请求正确时，将由dupefilter进行过滤

对于即时修复，您可以尝试在parse方法中将don't_filter=True添加到您的请求中

相关问题更多 >

编程相关推荐

热门问题

热门文章