class KextraSpider(CrawlSpider):
    """Spider for k-extra.fi: follows the paginated offer listing and
    scrapes each offer on every page into a KextraItem.
    """

    name = "kextra"
    allowed_domains = ["k-extra.fi"]
    start_urls = ["http://www.k-extra.fi/Tarjoukset/"]

    # NOTE: the link extractor canonicalizes URLs before applying `allow`,
    # which (among other things) sorts query parameters alphabetically.
    # The original pattern r'\?id=5&epslanguage=fi&sivu=\d' hard-coded the
    # site's parameter order and therefore never matched any canonicalized
    # link.  Anchoring on the pagination parameter alone is
    # order-independent and matches the same set of pages.
    rules = (
        Rule(
            SgmlLinkExtractor(allow=(r"sivu=\d",)),
            callback="parse_items",
            follow=True,
        ),
    )

    def parse_items(self, response):
        """Extract every offer listed on *response*.

        Returns a list of KextraItem objects, one per
        ``div.offerListItem`` element on the page.  Each field holds the
        (possibly empty) list produced by ``.extract()``.
        """
        sel = Selector(response)
        kxitems = []
        for site in sel.xpath('//div[@class="offerListItem"]'):
            item = KextraItem()
            # Fixed: original XPath used '@ src' (space after '@'),
            # which is not a valid attribute reference.
            item["image"] = site.xpath(
                'div[@class="offerListLeftColumn"]/img/@src').extract()
            item["product_primary"] = site.xpath(
                'div[@class="offerListRightColumn"]/h4/text()').extract()
            item["product_secondary"] = site.xpath(
                'div[@class="offerListRightColumn"]/h3/text()').extract()
            item["discount"] = site.xpath(
                'div[@class="offerListRightColumn"]'
                '/div[@class="plussaDiscount"]'
                '/div[@class="plussaAmount"]/text()').extract()
            item["priceEuros"] = site.xpath(
                'div[@class="offerListPriceContainer"]'
                '/div[@class="price"]/p[@class="euros"]/text()').extract()
            item["priceCents"] = site.xpath(
                'div[@class="offerListPriceContainer"]'
                '/div[@class="price"]/p[@class="euros"]'
                '/span[@class="cents"]/text()').extract()
            kxitems.append(item)
        return kxitems
问题是爬虫没有按照指定的 allow 模式跟踪链接。如果把 allow 留空，所有链接都会被跟踪（说明爬虫本身工作正常）。那么 allow 中的正则表达式可能有什么问题？
尝试使用此链接提取器:
scrapy shell会话示例:
相关问题 更多 >
编程相关推荐