Site is crawled but not scraped

Posted on 2024-10-06 13:02:18


I've been scraping this site and trying to store the listing attributes. Some of the attributes are scraped correctly, but others are only crawled and never actually scraped:

import scrapy


class CapeWaterfrontSpider(scrapy.Spider):
    name = "cape_waterfront"
    start_urls = ['https://www.capewaterfrontestates.co.za/template/Properties.vm/listingtype/SALES']

    def parse(self, response):
        for prop in response.css('div.col-sm-6.col-md-12.grid-sizer.grid-item'):

            link = prop.css('div.property-image a::attr(href)').get()

            bedrooms = prop.css('div.property-details li.bedrooms::text').getall()
            bathrooms = prop.css('div.property-details li.bathrooms::text').getall()
            gar = prop.css('div.property-details li.garages::text').getall()

            if len(bedrooms) == 0:
                bedrooms.append(None)
            else:
                bedrooms = bedrooms[1].split()
            if len(bathrooms) == 0:
                bathrooms.append(None)
            else:
                bathrooms = bathrooms[1].split()
            if len(gar) == 0:
                gar.append(None)
            else:
                gar = gar[1].split()

            yield scrapy.Request(
                link,
                meta={'item': {
                    'agency': self.name,
                    'url': link,
                    'title': ' '.join(prop.css('div.property-details p.intro::text').get().split()),
                    'price': ''.join(prop.css('div.property-details p.price::text').get().split()),
                    'bedrooms': str(bedrooms),
                    'bathroom':  str(bathrooms),
                    'garages': str(gar)
                }},
                callback=self.get_loc,
            )

        next_page = response.css('p.form-control-static.pagination-link a::attr(href)').get()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)

Any suggestions on how to make this work? Many thanks in advance.


Tags: text, self, div, get, response, page, property, details
1 Answer

The way you define your selectors is error-prone, and a few of the selectors are simply wrong and don't work at all. The next-page link doesn't work either: it only ever goes to the first page and then the spider quits. Finally, I'm not aware of any next_sibling support in CSS selectors, so I had to dig the following sibling out in a somewhat awkward way.

import scrapy


class CapeWaterfrontSpider(scrapy.Spider):
    name = "cape_waterfront"
    start_urls = ['https://www.capewaterfrontestates.co.za/template/Properties.vm/listingtype/SALES']

    def parse(self, response):

        for prop in response.css('.grid-item'):
            link = prop.css('.property-image a::attr(href)').get()

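            # grab every text node inside the <li>, strip the whitespace, and keep the
            # second-to-last entry, which holds the number (same pattern for bathrooms and garages)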
            bedrooms = [elem.strip() for elem in prop.css(".bedrooms::text").getall()]
            bedrooms = bedrooms[-2] if len(bedrooms) >= 2 else None

            bathrooms = [elem.strip() for elem in prop.css(".bathrooms::text").getall()]
            bathrooms = bathrooms[-2] if len(bathrooms) >= 2 else None

            gar = [elem.strip() for elem in prop.css(".garages::text").getall()]
            gar = gar[-2] if len(gar) >= 2 else None

            yield scrapy.Request(
                link,
                meta={'item': {
                    'agency': self.name,
                    'url': link,
                    'bedrooms': bedrooms,
                    'bathroom':  bathrooms,
                    'garages': gar
                }},
                callback=self.get_loc,
            )

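        # target the "next" arrow specifically; the original selector only ever picked up the first pagination link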
        next_page = response.css('.pagination-link a.next::attr(href)').get()
        if next_page:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)

    def get_loc(self,response):
        items = response.meta['item']
        print(items)

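The get_loc above only prints the item it receives through meta. As a rough sketch of how that callback might finish the job, assuming the detail page exposes some kind of location element (the div.property-location selector is a placeholder, not taken from the real page), you could replace it with something like:

    def get_loc(self, response):
        item = response.meta['item']
        # placeholder selector -- swap in whatever the detail page actually uses
        item['location'] = response.css('div.property-location::text').get(default='').strip()
        yield item

Running the spider with something like scrapy runspider cape_waterfront.py -o listings.json (the filename is assumed) would then write the finished items out instead of just printing them.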
If you want a cleaner way to get those three items, I think XPath is what you want to stick with:

for prop in response.css('.grid-item'):
    link = prop.css('.property-image a::attr(href)').get()
    bedrooms = prop.xpath("normalize-space(.//*[contains(@class,'bedrooms')]/label/following::text())").get()
    bathrooms = prop.xpath("normalize-space(.//*[contains(@class,'bathrooms')]/label/following::text())").get()
    gar = prop.xpath("normalize-space(.//*[contains(@class,'garages')]/label/following::text())").get()

For brevity I've left out two or three of the fields; I'm sure you can handle those yourself.
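
If it helps to see what that normalize-space() expression returns on its own, here is a small standalone check against a made-up fragment shaped roughly like one listing card (the markup is illustrative, not copied from the site):

from scrapy.selector import Selector

# illustrative HTML only -- shaped like one listing card
snippet = """
<ul class="property-details">
    <li class="bedrooms"><label>Bedrooms</label>
        3   </li>
</ul>
"""

sel = Selector(text=snippet)
print(sel.xpath(
    "normalize-space(.//*[contains(@class,'bedrooms')]/label/following::text())"
).get())
# prints: 3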
