使用Scrapy的多个请求

2024-09-21 03:19:58 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在尝试删除以下URL,但是,由于URL不同,我需要创建两个不同的请求,一个用于出售物业,另一个用于出租

当我运行我拥有的代码时,我只能解析待售(“propbay”)和不待租(“rentbay”)的属性。我不确定我在第二个请求上做错了什么

有人有什么建议吗?这是我的密码:

import scrapy
import re
import requests


class ProbRentBaySpider(scrapy.Spider):
    name = 'prob_rent_bay'
    start_urls = [
    'https://www.propbay.co.za/for-sale-property/residential/za/western_cape/7',
    'https://www.rentbay.co.za/to-rent-property/residential/za/western_cape/7'
    ]

    headers = {
    'authority': 'www.propbay.co.za',
    'pragma': 'no-cache',
    'cache-control': 'no-cache',
    'sec-ch-ua': '"Chromium";v="86", "\\"Not\\\\A;Brand";v="99", "Google Chrome";v="86"',
    'accept': '*/*',
    'x-requested-with': 'XMLHttpRequest',
    'sec-ch-ua-mobile': '?1',
    'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36',
    'content-type': 'application/json',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'cors',
    'sec-fetch-dest': 'empty',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    'cookie': '_ga=GA1.3.1758254184.1598490742; ASP.NET_SessionId=v1muaskigrgnn40m42lsqzct; __RequestVerificationToken=AIEv13vh8ksXZeG6Tf_o-vLCscKt7sYKJjwB0kz0CfqmCe8ZpYRQQdGk2BnN095p2A6wlFf7o_lVYyxe1Jro-I5vHE01; _gid=GA1.3.900892753.1605696808',
}
    headers2 = {
    'authority': 'www.rentbay.co.za',
    'pragma': 'no-cache',
    'cache-control': 'no-cache',
    'sec-ch-ua': '"Chromium";v="86", "\\"Not\\\\A;Brand";v="99", "Google Chrome";v="86"',
    'accept': '*/*',
    'x-requested-with': 'XMLHttpRequest',
    'sec-ch-ua-mobile': '?1',
    'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36',
    'content-type': 'application/json',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'cors',
    'sec-fetch-dest': 'empty',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    'cookie': 'ASP.NET_SessionId=rexgmrrgju10aw4rirulzrmk; _ga=GA1.3.225479946.1606814269; __RequestVerificationToken=az6ZATA2H0dJfBQ6KuwDwz39XGSiSuIjc4iZwRT8BGSD2surYfA6iOmkIQk2p835G51hYqJd5FFoiSQYsvx-V3Ndx6s1; _gid=GA1.3.1154633027.1607081144; _gat_gtag_UA_7413963_2=1',
}

    base_url_sale = ['https://www.propbay.co.za/Property/ListByCityAjax?city=Cape%20Town&province=western%20cape&cityCode=199041&category=3&page=',
                    'https://www.propbay.co.za/Property/ListByCityAjax?city=Milnerton&province=western%20cape&cityCode=199014&category=3&page=' ]
    base_url_rent = ['https://www.rentbay.co.za/Property/ListByCityAjax?city=Cape%20Town&province=western%20cape&cityCode=199041&category=3&page=',
                    'https://www.rentbay.co.za/Property/ListByCityAjax?city=Milnerton&province=western%20cape&cityCode=199014&category=3&page=' ]

    def parse(self,response):     
        for page in range(2, 8):# specify page range you would like to scrape data for
            for link in self.base_url_sale:
                next_page = link + str(page)
                response = requests.get(url=next_page, headers=self.headers)
                for product in response.json()['Data']['Suburbs']:
                    area_url = 'https://www.propbay.co.za'+ product['FriendlyUrl']
                yield scrapy.Request(area_url,callback=self.parse_sale)

        for page2 in range(2, 8):# specify page range you would like to scrape data for
            for link2 in self.base_url_rent:
                next_page2 = link2 + str(page2)
                response2 = requests.get(url=next_page2, headers=self.headers2)
                for product2 in response2.json()['Data']['Suburbs']:
                    area_url_2 = 'https://www.rentbay.co.za'+ product2['FriendlyUrl']
                yield scrapy.Request(area_url_2,callback=self.parse_rent)

    def parse_sale(self, response):
        # follow links to property pages
        for href in response.xpath('//a[@class="u-text-uppercase"]/@href').getall():
            follow_link = 'https://www.propbay.co.za'+ href
            yield response.follow(follow_link, self.parse_property)

        # follow pagination links
        for href in response.xpath('//*[@id="btnNext"]/@href'):
            yield response.follow(href, self.parse_sale)

    def parse_rent(self, response):
        # follow links to property pages
        for href in response.xpath('//a[@class="u-text-uppercase"]/@href').getall():
            follow_link = 'https://www.rentbay.co.za'+ href
            yield response.follow(follow_link, self.parse_property)

        # follow pagination links
        for href in response.xpath('//*[@id="btnNext"]/@href').getall():
            yield response.follow(href, self.parse_rent)

    def parse_property(self, response):

        title = response.css('span.u-text-capitalize::text').get()
        bedrooms = response.xpath('//span[contains(text(), "Bedrooms")]/following-sibling::span/text()').get()
        bedrooms = bedrooms.split()[0] if bedrooms is not None else None
...

编辑代码: 我已经尝试了单独的解析功能,但是,我只得到出租的房产,不知道如何也得到出售的房产

   def parse(self,response):
        for page in range(2, 8):# specify page range you would like to scrape data for
            for link in self.base_url_sale:
                next_page = link + str(page)
                response = requests.get(url=next_page, headers=self.headers)
                for product in response.json()['Data']['Suburbs']:
                    area_url = 'https://www.propbay.co.za'+ product['FriendlyUrl']
                    yield scrapy.Request(area_url,callback=self.parse_rent)

    
    def parse_rent(self, response):
        for page2 in range(2, 8):# specify page range you would like to scrape data for
            for link2 in self.base_url_rent:
                next_page2 = link2 + str(page2)
                response = requests.get(url=next_page2, headers=self.headers2)
                for product2 in response.json()['Data']['Suburbs']:
                    item = dict()
                    area_url_2 = 'https://www.rentbay.co.za'+ product2['FriendlyUrl']
                    yield scrapy.Request(area_url_2,callback=self.parse_all)

    def parse_all(self, response):
        # follow links to property pages
        for href in response.xpath('//a[@class="u-text-uppercase"]/@href').getall():
            follow_link = 'https://www.propbay.co.za'+ href
            yield response.follow(follow_link, self.parse_property)

        # follow pagination links
        for href in response.xpath('//*[@id="btnNext"]/@href'):
            yield response.follow(href, self.parse_all)

Tags: inhttpsselfurlforparseresponsewww
1条回答
网友
1楼 · 发布于 2024-09-21 03:19:58

首先,您需要覆盖start_请求并为每个url指定不同的回调,然后将parse()逻辑拆分为这两个方法。否则,在通过propbay或rentbay部分循环之前,您至少可以对url进行if检查。目前,两个URL的响应在您的解析中被以相同的方式处理。因此,可能是第一次请求不正确,因为您的响应是针对propbay的,但第二次请求正确时,将由dupefilter进行过滤

对于即时修复,您可以尝试在parse方法中将don't_filter=True添加到您的请求中

相关问题 更多 >

    热门问题