运行Srapy时返回重复的项目。我相信这是因为我没有在第二个方法def parse_branches
中设置新字典,但如果我在该方法下创建新字典,Scrapy将返回没有第一个方法parse
的数据。返回的数据示例:
[
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Zadar", "storeBranchLink": "/poslovnica/intersport-zadar-ulbleiburskih-zrtava-17-23000-zadar", "storeBranchAddress": "Ul.Bleibur\u0161kih \u017ertava 17", "storeBranchAddressCounty": "23000 Zadar", "storeBranchNameFull": "Intersport Westgate, Zapre\u0161i\u0107ka 2, 10290 Zapre\u0161i\u0107", "storeBranchLongLat": ["45.871201", "15.827459"]},
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Zadar", "storeBranchLink": "/poslovnica/intersport-zadar-ulbleiburskih-zrtava-17-23000-zadar", "storeBranchAddress": "Ul.Bleibur\u0161kih \u017ertava 17", "storeBranchAddressCounty": "23000 Zadar", "storeBranchNameFull": "Intersport Velika Gorica , Slavka Kolara 6, 10410 Velika Gorica", "storeBranchLongLat": ["45.714564", "16.070628"]},
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Zadar", "storeBranchLink": "/poslovnica/intersport-zadar-ulbleiburskih-zrtava-17-23000-zadar", "storeBranchAddress": "Ul.Bleibur\u0161kih \u017ertava 17", "storeBranchAddressCounty": "23000 Zadar", "storeBranchNameFull": "Intersport Supernova Karlovac, Prilaz Ve\u0107eslava Holjevca 12, 47000 Karlovac", "storeBranchLongLat": ["45.505645", "15.544222"]},
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Zadar", "storeBranchLink": "/poslovnica/intersport-zadar-ulbleiburskih-zrtava-17-23000-zadar", "storeBranchAddress": "Ul.Bleibur\u0161kih \u017ertava 17", "storeBranchAddressCounty": "23000 Zadar", "storeBranchNameFull": "Intersport Kri\u017eevci , Ul. Petra Zrinskog 8, 48260 Kri\u017eevci", "storeBranchLongLat": ["46.022143", "16.548462"]},
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Zadar", "storeBranchLink": "/poslovnica/intersport-zadar-ulbleiburskih-zrtava-17-23000-zadar", "storeBranchAddress": "Ul.Bleibur\u0161kih \u017ertava 17", "storeBranchAddressCounty": "23000 Zadar", "storeBranchNameFull": "Intersport Karlovac , Trg Milana \u0160uflaja 1, 47000 Karlovac", "storeBranchLongLat": ["45.493079", "15.5495"]},
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Zadar", "storeBranchLink": "/poslovnica/intersport-zadar-ulbleiburskih-zrtava-17-23000-zadar", "storeBranchAddress": "Ul.Bleibur\u0161kih \u017ertava 17", "storeBranchAddressCounty": "23000 Zadar", "storeBranchNameFull": "Intersport Samobor , Ulica Kralja Petra Kre\u0161imira IV 4, 10430 Samobor", "storeBranchLongLat": ["45.801553", "15.726537"]},
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Zadar", "storeBranchLink": "/poslovnica/intersport-zadar-ulbleiburskih-zrtava-17-23000-zadar", "storeBranchAddress": "Ul.Bleibur\u0161kih \u017ertava 17", "storeBranchAddressCounty": "23000 Zadar", "storeBranchNameFull": "Intersport Samobor , Ul.Bleibur\u0161kih \u017ertava 1945, 10430 Samobor", "storeBranchLongLat": ["45.810583", "15.712344"]},
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Zadar", "storeBranchLink": "/poslovnica/intersport-zadar-ulbleiburskih-zrtava-17-23000-zadar", "storeBranchAddress": "Ul.Bleibur\u0161kih \u017ertava 17", "storeBranchAddressCounty": "23000 Zadar", "storeBranchNameFull": "Intersport Arena Park Zagreb, Jaru\u0161\u010dica 4, 10020 Zagreb", "storeBranchLongLat": ["45.76936", "15.942252"]},
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Zadar", "storeBranchLink": "/poslovnica/intersport-zadar-ulbleiburskih-zrtava-17-23000-zadar", "storeBranchAddress": "Ul.Bleibur\u0161kih \u017ertava 17", "storeBranchAddressCounty": "23000 Zadar", "storeBranchNameFull": "Intersport City Center one East Zagreb, Slavonska avenija 11d, 10000 Zagreb", "storeBranchLongLat": ["45.801812", "16.050655"]},
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Zadar", "storeBranchLink": "/poslovnica/intersport-zadar-ulbleiburskih-zrtava-17-23000-zadar", "storeBranchAddress": "Ul.Bleibur\u0161kih \u017ertava 17", "storeBranchAddressCounty": "23000 Zadar", "storeBranchNameFull": "Intersport Arena Centar Zagreb, Ulica Vice Vukova 6, 10020 Zagreb", "storeBranchLongLat": ["45.770106", "15.937922"]},
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Zadar", "storeBranchLink": "/poslovnica/intersport-zadar-ulbleiburskih-zrtava-17-23000-zadar", "storeBranchAddress": "Ul.Bleibur\u0161kih \u017ertava 17", "storeBranchAddressCounty": "23000 Zadar", "storeBranchNameFull": "Intersport Supernova Garden Mall Zagreb , Rudolfa Kolaka 14, 10 040 Zagreb", "storeBranchLongLat": ["45.836791", "16.046129"]},
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Zadar", "storeBranchLink": "/poslovnica/intersport-zadar-ulbleiburskih-zrtava-17-23000-zadar", "storeBranchAddress": "Ul.Bleibur\u0161kih \u017ertava 17", "storeBranchAddressCounty": "23000 Zadar", "storeBranchNameFull": "Intersport Meridijan 16 Zagreb, Gra\u010danska cesta 208, 10000 Zagreb", "storeBranchLongLat": ["45.861938", "15.983691"]},
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Zadar", "storeBranchLink": "/poslovnica/intersport-zadar-ulbleiburskih-zrtava-17-23000-zadar", "storeBranchAddress": "Ul.Bleibur\u0161kih \u017ertava 17", "storeBranchAddressCounty": "23000 Zadar", "storeBranchNameFull": "Intersport Zagreb , Zagreba\u010dka avenija 94, 10000 Zagreb", "storeBranchLongLat": ["45.796563", "15.909759"]},
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Zadar", "storeBranchLink": "/poslovnica/intersport-zadar-ulbleiburskih-zrtava-17-23000-zadar", "storeBranchAddress": "Ul.Bleibur\u0161kih \u017ertava 17", "storeBranchAddressCounty": "23000 Zadar", "storeBranchNameFull": "Intersport Zagreb Petrinjska , Petrinjska 3 , 10000 Zagreb", "storeBranchLongLat": ["45.811949", "15.979349"]},
{"storeBranchesLink": "https://popusti.njuskalo.hr/f/intersport-poslovnice", "storeBranchName": "Intersport Metkovi\u0107", "storeBranchLink": "/poslovnica/intersport-metkovic-splitska-1-20350-metkovic", "storeBranchAddress": "Splitska 1", "storeBranchAddressCounty": "20350 Metkovi\u0107", "storeBranchNameFull": "Intersport Po\u017eega , Osje\u010dka 10, 34 000 Po\u017eega", "storeBranchLongLat": ["45.339676", "17.682629"]}
]
下面是剪贴代码:
# -*- coding: utf-8 -*-
import scrapy
import json
class LoclocatorTestSpider(scrapy.Spider):
name = "loclocator_test"
start_urls = []
with open("test_one_url.json", encoding="utf-8") as json_file:
data = json.load(json_file)
for store in data:
storeName = store["storeName"]
storeLinkUrl = store["storeLinkMaker"]
start_urls.append(storeLinkUrl)
def parse(self, response):
selector = "//div[@class='mainContentWrapInner cf']"
store_branches_selector = ".//li/a[@class='xiti']/@href"
for basic_info in response.xpath(selector):
store_branches = {}
store_branches["storeBranchesLink"] = basic_info.xpath(store_branches_selector).extract_first()
store_branches_url = basic_info.xpath(store_branches_selector).extract_first()
yield response.follow(store_branches_url, self.parse_branches, meta={"store_branches": store_branches})
def parse_branches(self, response):
store_branches_name_selector = response.xpath("//li[@class='xiti']")
store_branches = response.meta["store_branches"]
for store_branch in store_branches_name_selector:
store_branch["storeBranchName"] = store_branch.xpath(".//span[@class='title']/text()").extract_first()
store_branch["storeBranchLink"] = store_branch.xpath("./a/@href").extract_first()
store_branch["storeBranchAddress"] = store_branch.xpath(".//address/p[1]/text()").extract_first(default="")
store_branch["storeBranchAddressCounty"] = store_branch.xpath(".//address/p[2]/text()").extract_first(default="")
store_single_branch_url = store_branch.xpath("./a/@href").extract_first()
yield response.follow(store_single_branch_url, self.parse_single_branch, meta={"store_branches": store_branch})
links = response.selector.xpath("//@data-param").extract()
for link in links:
absolute_url = store_branches["storeBranchesLink"] + "?" + link
yield scrapy.Request(absolute_url, callback=self.parse_branches, meta={"store_branches": store_branches})
def parse_single_branch(self, response):
single_branch_selector = "//header[@class='mainContentHeader']"
f_store_branches = response.meta["store_branches"]
for single_branch in response.xpath(single_branch_selector):
f_store_branches["storeBranchNameFull"] = single_branch.xpath("//h1[@class='title']/text()").extract_first(default="")
f_store_branches["storeBranchLongLat"] = single_branch.xpath(".//ul[@class='simpleListBox']/li[1]/a[@class='xiti']/@href").extract_first(default="")[29:].split(",")
yield f_store_branches
目前没有回答
相关问题 更多 >
编程相关推荐