How do I follow links in Scrapy when there is no href?

Posted 2024-09-28 22:01:33


I am trying to follow links in Scrapy from a page that I have already parsed and extracted information from. The problem is that the page has no href, so I cannot simply follow the link. I extended my XPath query with @data-param and ended up with this result: page=2

The problem is that I am not sure how to follow this link, because I want to pass listName["listLinkMaker"] to my URL generator/maker.

Should I make another "def", say def parse_pagination, to follow the links?
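
Roughly, this is the pattern I have in mind (just a sketch, not working code; parse_pagination and the exact selector here are placeholders):

def parse(self, response):
    # ... extract the item fields from the current page here ...

    # the "next" button has no href, only a @data-param value like "page=2"
    next_param = response.xpath("//li[@class='next']/button/@data-param").extract_first()
    if next_param:
        # response.follow() resolves the relative "?page=2" against the current URL
        yield response.follow("?" + next_param, callback=self.parse_pagination)

def parse_pagination(self, response):
    # parse the next page here, then repeat the same pagination check
    ...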

The JSON used in the code is very simple:

[
{"storeName": "Interspar", "storeLinkMaker": "https://popusti.njuskalo.hr/trgovina/Interspar"}
]

The code is as follows:

# -*- coding: utf-8 -*-
import scrapy
import json


class LoclocatorTestSpider(scrapy.Spider):
    name = "loclocator_test"
    start_urls = []

    with open("test_one_url.json", encoding="utf-8") as json_file:
        data = json.load(json_file)
        for store in data:
            storeName = store["storeName"]
            storeLinkUrl = store["storeLinkMaker"]
            start_urls.append(storeLinkUrl)

    def parse(self, response):
        selector = "//div[@class='mainContentWrapInner cf']"

        store_name_selector = ".//h1[@class='title']/text()"
        store_branches_selector = ".//li/a[@class='xiti']/@href"

        for basic_info in response.xpath(selector):
            store_branches = {}

            store_branches["storeName"] = basic_info.xpath(store_name_selector).extract_first()
            # This specific XPath extracts the 1st part of the link needed to crawl all store branches
            store_branches["storeBranchesLink"] = basic_info.xpath(store_branches_selector).extract_first() + "?"

            store_branches_url = basic_info.xpath(store_branches_selector).extract_first()
            yield response.follow(store_branches_url, self.parse_branches, meta={"store_branches": store_branches})


    def parse_branches(self, response):
        store_branches_name_selector = "//li[@class='xiti']"
        store_branches = response.meta["store_branches"]

        for store_branch in response.xpath(store_branches_name_selector):
            store_branches["storeBranchName"] = store_branch.xpath(".//span[@class='title']/text()").extract_first()

            yield store_branches

        # This specific XPath extracts the 2nd part of the link needed to crawl all store branches
        # The URL should look like: https://popusti.njuskalo.hr/trgovina/Interspar?page=n where n > 0
        links = response.selector.xpath("//li[@class='next']/button[@class='nBtn link xiti']/@data-param").extract()
        for link in links:
            absolute_url = ...  # LIST FROM FIRST PARSE (i.e. store_branches["storeBranchesLink"]) + link
            yield scrapy.Request(absolute_url, callback=self.parse_branches)

Thanks, everyone!


Tags: storename, json, url, data, parse, response, def
1 Answer

#1 · Posted 2024-09-28 22:01:33

I managed to work out a solution on my own, and I was already fairly close to it.

Under this section:

    # This specific XPath extracts the 2nd part of the link needed to crawl all store branches
    # The URL should look like: https://popusti.njuskalo.hr/trgovina/Interspar?page=n where n > 0
    links = response.selector.xpath("//@data-param").extract()
    store_branches = response.meta["store_branches"]
    for link in links:
        absolute_url = store_branches["storeBranchesLink"] + link
        # pass store_branches along so the next page's callback can read it again
        yield scrapy.Request(absolute_url, callback=self.parse_branches, meta={"store_branches": store_branches})

I think the solution is adding store_branches from the response, since that makes it possible to find all the possible pages (?page=n, where n > 0). If anyone can add more technical detail, please do post an answer, since my understanding of the code is still fairly basic.
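
To make the mechanics explicit, here is a condensed sketch of the resulting parse_branches (the same logic as above, with the concrete URL values from the Interspar example in the comments; the extra meta= on the recursive request is needed so the next page can read store_branches again):

def parse_branches(self, response):
    store_branches = response.meta["store_branches"]

    # yield one item per branch listed on the current page
    for store_branch in response.xpath("//li[@class='xiti']"):
        store_branches["storeBranchName"] = store_branch.xpath(".//span[@class='title']/text()").extract_first()
        yield store_branches

    # the "next" button has no href, only @data-param holding e.g. "page=2"
    for link in response.xpath("//@data-param").extract():
        # storeBranchesLink is e.g. "https://popusti.njuskalo.hr/trgovina/Interspar?"
        # so absolute_url becomes "https://popusti.njuskalo.hr/trgovina/Interspar?page=2"
        absolute_url = store_branches["storeBranchesLink"] + link
        yield scrapy.Request(absolute_url, callback=self.parse_branches, meta={"store_branches": store_branches})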
