我正在尝试删除以下URL,但是,由于URL不同,我需要创建两个不同的请求,一个用于出售物业,另一个用于出租
当我运行我拥有的代码时,我只能解析待售(“propbay”)和不待租(“rentbay”)的属性。我不确定我在第二个请求上做错了什么
有人有什么建议吗?这是我的密码:
import scrapy
import re
import requests
class ProbRentBaySpider(scrapy.Spider):
name = 'prob_rent_bay'
start_urls = [
'https://www.propbay.co.za/for-sale-property/residential/za/western_cape/7',
'https://www.rentbay.co.za/to-rent-property/residential/za/western_cape/7'
]
headers = {
'authority': 'www.propbay.co.za',
'pragma': 'no-cache',
'cache-control': 'no-cache',
'sec-ch-ua': '"Chromium";v="86", "\\"Not\\\\A;Brand";v="99", "Google Chrome";v="86"',
'accept': '*/*',
'x-requested-with': 'XMLHttpRequest',
'sec-ch-ua-mobile': '?1',
'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36',
'content-type': 'application/json',
'sec-fetch-site': 'same-origin',
'sec-fetch-mode': 'cors',
'sec-fetch-dest': 'empty',
'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
'cookie': '_ga=GA1.3.1758254184.1598490742; ASP.NET_SessionId=v1muaskigrgnn40m42lsqzct; __RequestVerificationToken=AIEv13vh8ksXZeG6Tf_o-vLCscKt7sYKJjwB0kz0CfqmCe8ZpYRQQdGk2BnN095p2A6wlFf7o_lVYyxe1Jro-I5vHE01; _gid=GA1.3.900892753.1605696808',
}
headers2 = {
'authority': 'www.rentbay.co.za',
'pragma': 'no-cache',
'cache-control': 'no-cache',
'sec-ch-ua': '"Chromium";v="86", "\\"Not\\\\A;Brand";v="99", "Google Chrome";v="86"',
'accept': '*/*',
'x-requested-with': 'XMLHttpRequest',
'sec-ch-ua-mobile': '?1',
'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36',
'content-type': 'application/json',
'sec-fetch-site': 'same-origin',
'sec-fetch-mode': 'cors',
'sec-fetch-dest': 'empty',
'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
'cookie': 'ASP.NET_SessionId=rexgmrrgju10aw4rirulzrmk; _ga=GA1.3.225479946.1606814269; __RequestVerificationToken=az6ZATA2H0dJfBQ6KuwDwz39XGSiSuIjc4iZwRT8BGSD2surYfA6iOmkIQk2p835G51hYqJd5FFoiSQYsvx-V3Ndx6s1; _gid=GA1.3.1154633027.1607081144; _gat_gtag_UA_7413963_2=1',
}
base_url_sale = ['https://www.propbay.co.za/Property/ListByCityAjax?city=Cape%20Town&province=western%20cape&cityCode=199041&category=3&page=',
'https://www.propbay.co.za/Property/ListByCityAjax?city=Milnerton&province=western%20cape&cityCode=199014&category=3&page=' ]
base_url_rent = ['https://www.rentbay.co.za/Property/ListByCityAjax?city=Cape%20Town&province=western%20cape&cityCode=199041&category=3&page=',
'https://www.rentbay.co.za/Property/ListByCityAjax?city=Milnerton&province=western%20cape&cityCode=199014&category=3&page=' ]
def parse(self,response):
for page in range(2, 8):# specify page range you would like to scrape data for
for link in self.base_url_sale:
next_page = link + str(page)
response = requests.get(url=next_page, headers=self.headers)
for product in response.json()['Data']['Suburbs']:
area_url = 'https://www.propbay.co.za'+ product['FriendlyUrl']
yield scrapy.Request(area_url,callback=self.parse_sale)
for page2 in range(2, 8):# specify page range you would like to scrape data for
for link2 in self.base_url_rent:
next_page2 = link2 + str(page2)
response2 = requests.get(url=next_page2, headers=self.headers2)
for product2 in response2.json()['Data']['Suburbs']:
area_url_2 = 'https://www.rentbay.co.za'+ product2['FriendlyUrl']
yield scrapy.Request(area_url_2,callback=self.parse_rent)
def parse_sale(self, response):
# follow links to property pages
for href in response.xpath('//a[@class="u-text-uppercase"]/@href').getall():
follow_link = 'https://www.propbay.co.za'+ href
yield response.follow(follow_link, self.parse_property)
# follow pagination links
for href in response.xpath('//*[@id="btnNext"]/@href'):
yield response.follow(href, self.parse_sale)
def parse_rent(self, response):
# follow links to property pages
for href in response.xpath('//a[@class="u-text-uppercase"]/@href').getall():
follow_link = 'https://www.rentbay.co.za'+ href
yield response.follow(follow_link, self.parse_property)
# follow pagination links
for href in response.xpath('//*[@id="btnNext"]/@href').getall():
yield response.follow(href, self.parse_rent)
def parse_property(self, response):
title = response.css('span.u-text-capitalize::text').get()
bedrooms = response.xpath('//span[contains(text(), "Bedrooms")]/following-sibling::span/text()').get()
bedrooms = bedrooms.split()[0] if bedrooms is not None else None
...
编辑代码: 我已经尝试了单独的解析功能,但是,我只得到出租的房产,不知道如何也得到出售的房产
def parse(self,response):
for page in range(2, 8):# specify page range you would like to scrape data for
for link in self.base_url_sale:
next_page = link + str(page)
response = requests.get(url=next_page, headers=self.headers)
for product in response.json()['Data']['Suburbs']:
area_url = 'https://www.propbay.co.za'+ product['FriendlyUrl']
yield scrapy.Request(area_url,callback=self.parse_rent)
def parse_rent(self, response):
for page2 in range(2, 8):# specify page range you would like to scrape data for
for link2 in self.base_url_rent:
next_page2 = link2 + str(page2)
response = requests.get(url=next_page2, headers=self.headers2)
for product2 in response.json()['Data']['Suburbs']:
item = dict()
area_url_2 = 'https://www.rentbay.co.za'+ product2['FriendlyUrl']
yield scrapy.Request(area_url_2,callback=self.parse_all)
def parse_all(self, response):
# follow links to property pages
for href in response.xpath('//a[@class="u-text-uppercase"]/@href').getall():
follow_link = 'https://www.propbay.co.za'+ href
yield response.follow(follow_link, self.parse_property)
# follow pagination links
for href in response.xpath('//*[@id="btnNext"]/@href'):
yield response.follow(href, self.parse_all)
首先,您需要覆盖start_请求并为每个url指定不同的回调,然后将
parse()
逻辑拆分为这两个方法。否则,在通过propbay或rentbay部分循环之前,您至少可以对url进行if检查。目前,两个URL的响应在您的解析中被以相同的方式处理。因此,可能是第一次请求不正确,因为您的响应是针对propbay的,但第二次请求正确时,将由dupefilter进行过滤对于即时修复,您可以尝试在parse方法中将
don't_filter=True
添加到您的请求中相关问题 更多 >
编程相关推荐