Scrapy返回重复的项目

2024-05-03 08:30:06 发布

您现在位置:Python中文网/ 问答频道 /正文

运行Srapy时返回重复的项目。我相信这是因为我没有在第二个方法def parse_branches中设置新字典,但如果我在该方法下创建新字典,Scrapy将返回没有第一个方法parse的数据。返回的数据示例:

[
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Zadar", "storeBranchLink": "/poslovnica/intersport-zadar-ulbleiburskih-zrtava-17-23000-zadar", "storeBranchAddress": "Ul.Bleibur\u0161kih \u017ertava 17", "storeBranchAddressCounty": "23000 Zadar", "storeBranchNameFull": "Intersport Westgate, Zapre\u0161i\u0107ka 2, 10290 Zapre\u0161i\u0107", "storeBranchLongLat": ["45.871201", "15.827459"]},
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Zadar", "storeBranchLink": "/poslovnica/intersport-zadar-ulbleiburskih-zrtava-17-23000-zadar", "storeBranchAddress": "Ul.Bleibur\u0161kih \u017ertava 17", "storeBranchAddressCounty": "23000 Zadar", "storeBranchNameFull": "Intersport Velika Gorica , Slavka Kolara 6, 10410 Velika Gorica", "storeBranchLongLat": ["45.714564", "16.070628"]},
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Zadar", "storeBranchLink": "/poslovnica/intersport-zadar-ulbleiburskih-zrtava-17-23000-zadar", "storeBranchAddress": "Ul.Bleibur\u0161kih \u017ertava 17", "storeBranchAddressCounty": "23000 Zadar", "storeBranchNameFull": "Intersport Supernova Karlovac, Prilaz Ve\u0107eslava Holjevca 12, 47000 Karlovac", "storeBranchLongLat": ["45.505645", "15.544222"]},
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Zadar", "storeBranchLink": "/poslovnica/intersport-zadar-ulbleiburskih-zrtava-17-23000-zadar", "storeBranchAddress": "Ul.Bleibur\u0161kih \u017ertava 17", "storeBranchAddressCounty": "23000 Zadar", "storeBranchNameFull": "Intersport Kri\u017eevci , Ul. Petra Zrinskog 8, 48260 Kri\u017eevci", "storeBranchLongLat": ["46.022143", "16.548462"]},
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Zadar", "storeBranchLink": "/poslovnica/intersport-zadar-ulbleiburskih-zrtava-17-23000-zadar", "storeBranchAddress": "Ul.Bleibur\u0161kih \u017ertava 17", "storeBranchAddressCounty": "23000 Zadar", "storeBranchNameFull": "Intersport Karlovac , Trg Milana \u0160uflaja 1, 47000 Karlovac", "storeBranchLongLat": ["45.493079", "15.5495"]},
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Zadar", "storeBranchLink": "/poslovnica/intersport-zadar-ulbleiburskih-zrtava-17-23000-zadar", "storeBranchAddress": "Ul.Bleibur\u0161kih \u017ertava 17", "storeBranchAddressCounty": "23000 Zadar", "storeBranchNameFull": "Intersport Samobor , Ulica Kralja Petra Kre\u0161imira IV 4, 10430 Samobor", "storeBranchLongLat": ["45.801553", "15.726537"]},
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Zadar", "storeBranchLink": "/poslovnica/intersport-zadar-ulbleiburskih-zrtava-17-23000-zadar", "storeBranchAddress": "Ul.Bleibur\u0161kih \u017ertava 17", "storeBranchAddressCounty": "23000 Zadar", "storeBranchNameFull": "Intersport Samobor , Ul.Bleibur\u0161kih \u017ertava 1945, 10430 Samobor", "storeBranchLongLat": ["45.810583", "15.712344"]},
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Zadar", "storeBranchLink": "/poslovnica/intersport-zadar-ulbleiburskih-zrtava-17-23000-zadar", "storeBranchAddress": "Ul.Bleibur\u0161kih \u017ertava 17", "storeBranchAddressCounty": "23000 Zadar", "storeBranchNameFull": "Intersport Arena Park Zagreb, Jaru\u0161\u010dica 4, 10020 Zagreb", "storeBranchLongLat": ["45.76936", "15.942252"]},
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Zadar", "storeBranchLink": "/poslovnica/intersport-zadar-ulbleiburskih-zrtava-17-23000-zadar", "storeBranchAddress": "Ul.Bleibur\u0161kih \u017ertava 17", "storeBranchAddressCounty": "23000 Zadar", "storeBranchNameFull": "Intersport City Center one East Zagreb, Slavonska avenija 11d, 10000 Zagreb", "storeBranchLongLat": ["45.801812", "16.050655"]},
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Zadar", "storeBranchLink": "/poslovnica/intersport-zadar-ulbleiburskih-zrtava-17-23000-zadar", "storeBranchAddress": "Ul.Bleibur\u0161kih \u017ertava 17", "storeBranchAddressCounty": "23000 Zadar", "storeBranchNameFull": "Intersport Arena Centar Zagreb, Ulica Vice Vukova 6, 10020 Zagreb", "storeBranchLongLat": ["45.770106", "15.937922"]},
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Zadar", "storeBranchLink": "/poslovnica/intersport-zadar-ulbleiburskih-zrtava-17-23000-zadar", "storeBranchAddress": "Ul.Bleibur\u0161kih \u017ertava 17", "storeBranchAddressCounty": "23000 Zadar", "storeBranchNameFull": "Intersport Supernova Garden Mall Zagreb , Rudolfa Kolaka 14, 10 040 Zagreb", "storeBranchLongLat": ["45.836791", "16.046129"]},
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Zadar", "storeBranchLink": "/poslovnica/intersport-zadar-ulbleiburskih-zrtava-17-23000-zadar", "storeBranchAddress": "Ul.Bleibur\u0161kih \u017ertava 17", "storeBranchAddressCounty": "23000 Zadar", "storeBranchNameFull": "Intersport Meridijan 16 Zagreb, Gra\u010danska cesta 208, 10000 Zagreb", "storeBranchLongLat": ["45.861938", "15.983691"]},
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Zadar", "storeBranchLink": "/poslovnica/intersport-zadar-ulbleiburskih-zrtava-17-23000-zadar", "storeBranchAddress": "Ul.Bleibur\u0161kih \u017ertava 17", "storeBranchAddressCounty": "23000 Zadar", "storeBranchNameFull": "Intersport Zagreb , Zagreba\u010dka avenija 94, 10000 Zagreb", "storeBranchLongLat": ["45.796563", "15.909759"]},
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Zadar", "storeBranchLink": "/poslovnica/intersport-zadar-ulbleiburskih-zrtava-17-23000-zadar", "storeBranchAddress": "Ul.Bleibur\u0161kih \u017ertava 17", "storeBranchAddressCounty": "23000 Zadar", "storeBranchNameFull": "Intersport Zagreb Petrinjska , Petrinjska 3 , 10000 Zagreb", "storeBranchLongLat": ["45.811949", "15.979349"]},
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Metkovi\u0107", "storeBranchLink": "/poslovnica/intersport-metkovic-splitska-1-20350-metkovic", "storeBranchAddress": "Splitska 1", "storeBranchAddressCounty": "20350 Metkovi\u0107", "storeBranchNameFull": "Intersport Po\u017eega , Osje\u010dka 10, 34 000 Po\u017eega", "storeBranchLongLat": ["45.339676", "17.682629"]}
]

下面是剪贴代码:

# -*- coding: utf-8 -*-
import scrapy
import json


class LoclocatorTestSpider(scrapy.Spider):
    name = "loclocator_test"
    start_urls = []

    with open("test_one_url.json", encoding="utf-8") as json_file:
        data = json.load(json_file)
        for store in data:
            storeName = store["storeName"]
            storeLinkUrl = store["storeLinkMaker"]
            start_urls.append(storeLinkUrl)

    def parse(self, response):
        selector = "//div[@class='mainContentWrapInner cf']"

        store_branches_selector = ".//li/a[@class='xiti']/@href"

        for basic_info in response.xpath(selector):
            store_branches = {}

            store_branches["storeBranchesLink"] = basic_info.xpath(store_branches_selector).extract_first()

            store_branches_url = basic_info.xpath(store_branches_selector).extract_first()
            yield response.follow(store_branches_url, self.parse_branches, meta={"store_branches": store_branches})

    def parse_branches(self, response):
        store_branches_name_selector = response.xpath("//li[@class='xiti']")
        store_branches = response.meta["store_branches"]

        for store_branch in store_branches_name_selector:
            store_branch["storeBranchName"] = store_branch.xpath(".//span[@class='title']/text()").extract_first()
            store_branch["storeBranchLink"] = store_branch.xpath("./a/@href").extract_first()
            store_branch["storeBranchAddress"] = store_branch.xpath(".//address/p[1]/text()").extract_first(default="")
            store_branch["storeBranchAddressCounty"] = store_branch.xpath(".//address/p[2]/text()").extract_first(default="")

            store_single_branch_url = store_branch.xpath("./a/@href").extract_first()
            yield response.follow(store_single_branch_url, self.parse_single_branch, meta={"store_branches": store_branch})

        links = response.selector.xpath("//@data-param").extract()
        for link in links:
            absolute_url = store_branches["storeBranchesLink"] + "?" + link
            yield scrapy.Request(absolute_url, callback=self.parse_branches, meta={"store_branches": store_branches})

    def parse_single_branch(self, response):
        single_branch_selector = "//header[@class='mainContentHeader']"
        f_store_branches = response.meta["store_branches"]

        for single_branch in response.xpath(single_branch_selector):
            f_store_branches["storeBranchNameFull"] = single_branch.xpath("//h1[@class='title']/text()").extract_first(default="")
            f_store_branches["storeBranchLongLat"] = single_branch.xpath(".//ul[@class='simpleListBox']/li[1]/a[@class='xiti']/@href").extract_first(default="")[29:].split(",")

            yield f_store_branches

Tags: storebranchulbranchesstorebranchnamestorebranchaddressstorebranchlinkstorebrancheslink