Can't retrieve the next-page link with Scrapy and JavaScript in Python

Posted 2024-10-06 10:21:04


I'm having trouble retrieving the next-page link in Python.

Code

import scrapy
from scrapy.http import Request
from scrapy.loader import ItemLoader
from itemloaders.processors import MapCompose
from gharbheti.items import GharbhetiItem


class ListSpider(scrapy.Spider):
    name = 'list'
    allowed_domains = ['gharbheti.com']
    start_urls = ['https://www.gharbheti.com/sale', 'https://www.gharbheti.com/rent']

    def parse(self, response):
        properties = response.xpath('//li[@class="col-md-6 Search_building"]/descendant::a')
        for property in properties:
            link = property.xpath('./@href').extract_first()
            url = response.urljoin(link)
            yield Request(url, callback=self.parse_property, meta={'URL': url})

    def parse_property(self, response):
        l = ItemLoader(item=GharbhetiItem(), response=response)
        l.add_value('URL', response.url)
        l.add_xpath('Title', '//div[@class="product-page-meta"]/h4/em/text()', MapCompose(str.strip, str.title))
        l.add_xpath('Offering', '//figcaption[contains(text(), "For Sale")]/text()|//figcaption[contains(text(), "For Rent")]/text()', MapCompose(lambda i: i.replace('For', ''), str.strip))
        l.add_xpath('Price', '//div[@class="deal-pricebox"]/descendant::h3/text()', MapCompose(str.strip))
        l.add_xpath('Type', '//ul[@class="suitable-for"]/li/text()', MapCompose(str.strip))
        bike_parking = response.xpath('//i[@class="fa fa-motorcycle"]/following-sibling::em/text()').extract_first()
        car_parking = response.xpath('//i[@class="fa fa-car"]/following-sibling::em/text()').extract_first()
        l.add_value('Parking', "Bike Parking: {} Car Parking: {}".format(bike_parking, car_parking))
        l.add_xpath('Description', '//div[@class="comment more"]/text()', MapCompose(str.strip))
        l.add_xpath('Bedroom', '//i[@class="fa fa-bed"]/following-sibling::text()', MapCompose(lambda i: i.replace('Total Bed Room:', ''), str.strip, int))
        l.add_xpath('Livingroom', '//i[@class="fa fa-inbox"]/following-sibling::text()', MapCompose(lambda i: i.replace('Total Living Room:', ''), str.strip, int))
        l.add_xpath('Kitchen', '//i[@class="fa fa-cutlery"]/following-sibling::text()', MapCompose(lambda i: i.replace('Total kitchen Room:', ''), str.strip, int))
        l.add_xpath('Bathroom', '//i[@class="fa fa-puzzle-piece"]/following-sibling::text()', MapCompose(lambda i: i.replace('Total Toilet/Bathroom:', ''), str.strip, int))
        l.add_xpath('Address', '//b[contains(text(), "Map")]/text()', MapCompose(lambda i: i.replace('Map Loaction :-', ''), str.strip))
        l.add_xpath('Features', '//div[@class="list main-list"]/ul/li/text()', MapCompose(str.strip))

        # Extract image URLs from the src attribute directly instead of
        # string-mangling the raw <img> markup.
        images = response.xpath('//div[@class="carousel-inner dtl-carousel-inner text-center"]/descendant::img/@src').extract()
        images = [i.split('?')[0] for i in images]
        l.add_value('Images', [response.urljoin(i) for i in images])

        return l.load_item()

I can't retrieve the next page from the network. For another site (simple pagination without JavaScript), this is what I did:

next_page = response.urljoin(response.xpath('//a[contains(text(), "Next")]/@href').extract_first())

yield Request(next_page, callback=self.parse)

1 Answer

#1 · Posted 2024-10-06 10:21:04

Because the pagination uses JavaScript, the link is not present in the page source.

To see what's happening:

  1. Open your browser's inspector (F12 in Chrome) and go to the Network tab
  2. Click the "Load More" button in the page UI

The inspector will show that the site sends an asynchronous POST form request to https://www.gharbheti.com/RoomRentHome/GetPropertiesForRent with two form values:

  1. RentTypeId: 0 (not sure what this is, but I'm sure you can figure it out if you need to)
  2. page: 1 (increments with each click of "Load More")

You'll have to take a programmatic approach using Scrapy's FormRequest. It looks like each page yields about ten more properties, so if you want the next 1000 properties after the initial page load, you could write

for i in range(1, 101):
    <send a FormRequest with i as the page value>

I assume the data returned by the POST is formatted differently from the site's main page, so you'll probably need to define another callback to parse it.
