处理URL时出错

import scrapy
from scrapy.exceptions import CloseSpider
from scrapy_crawls.items import Vino


class BodebocaSpider(scrapy.Spider):
    """Crawl the Spanish wine listing on bodeboca.com, page by page.

    For every listing page, ``parse`` schedules one request per product
    link (handled by ``parse_item``) and then one request for the next
    listing page, until the pager has no "next" link.
    """

    name = "Bodeboca"
    allowed_domains = ["bodeboca.com"]
    start_urls = (
        'http://www.bodeboca.com/vino/espana',
    )
    counter = 1
    next_url = ""
    vino = None

    def __init__(self, *args, **kwargs):
        """Initialise the spider and remember the first listing URL.

        Positional and keyword arguments are forwarded to
        ``scrapy.Spider.__init__`` so the base class is set up properly
        and spider arguments passed by the crawler are accepted.
        """
        super(BodebocaSpider, self).__init__(*args, **kwargs)
        self.next_url = self.start_urls[0]

    @staticmethod
    def _headers(referer):
        """Return the request headers shared by every request, for *referer*."""
        return {
            'Referer': referer,
            'Accept-Language': 'es-ES,es;q=0.8,en-US;q=0.5,en;q=0.3',
        }

    def parse(self, response):
        """Yield requests for each product on the page and for the next page.

        Args:
            response: the downloaded listing page.

        Yields:
            scrapy.Request: one per product link (callback ``parse_item``),
            followed by one for the next listing page, if there is one.

        Raises:
            CloseSpider: if the page shows no product-count element.
        """
        for sel in response.xpath(
                '//div[@id="venta-main-wrapper"]/div[@id="venta-main"]'
                '/div/div/div/div/div/div/span'):
            href = sel.xpath('.//a/@href').extract_first()
            if not href:
                # A span without a link has nothing to follow.
                continue
            # The page uses relative links (e.g. /vino/...); Request needs an
            # absolute URL, otherwise Scrapy rejects it for lacking a scheme.
            item_url = response.urljoin(href)
            self.logger.debug("Product URL: %s", item_url)
            # NOTE(review): parse_item is not defined in this excerpt; it must
            # exist on the spider for these requests to be processed.
            yield scrapy.Request(item_url, callback=self.parse_item,
                                 headers=self._headers(response.url))

        # Next page. The product-count element is used as the sign that the
        # page actually returned results.
        results = response.xpath(
            '//div[@id="wrapper"]/article/div[@id="article-inner"]'
            '/div[@id="default-filter-form-wrapper"]'
            '/div[@id="venta-main-wrapper"]'
            '/div[@class="bb-product-info-sort bb-sort-behavior-attached"]'
            '/div[@class="bb-product-info"]'
            '/span[@class="bb-product-info-count"]').extract()
        if not results:
            raise CloseSpider

        next_href = response.xpath(
            '//div[@id="venta-main-wrapper"]/div[@class="item-list"]'
            '/ul[@class="pager"]/li[@class="pager-next"]/a/@href'
        ).extract_first()
        if not next_href:
            # Last page: the pager has no "next" link, so stop paginating.
            return

        self.next_url = response.urljoin(next_href)
        yield scrapy.Request(self.next_url, callback=self.parse,
                             headers=self._headers(response.url))

1条回答

网友

1楼 · 发布于 2024-09-28 21:29:12

简单回答：你从页面中提取到的是相对URL，例如：/vino/terra-cuques-2014

要发出Scrapy请求，URL必须是完整的：http://www.bodeboca.com/vino/terra-cuques-2014。你可以使用Scrapy的 response.urljoin() 方法创建完整的URL，例如：full_url = response.urljoin(url)。

尽量不要使用这样的XPath表达式：/div[@id="venta-main"]/div/div/div/div/div/div/span——它很难阅读，而且很容易因页面的细微变化而失效。相反，你可以直接使用基于class的XPath：//a[@class="verficha"]。

你可以像这样重写爬虫（spider）的一部分：

def parse(self, response):
    """Yield one request per product link found on the listing page.

    Links are selected by class rather than by a long positional path, and
    each relative href is made absolute before the request is created.
    """
    for link in response.xpath('//a[@class="verficha"]'):
        url = link.xpath('@href').extract_first()
        if not url:
            # No href on this anchor; urljoin(None) would just return the
            # current page URL, so skip it instead.
            continue
        full_url = response.urljoin(url)
        # parse_item is the item callback referenced by the spider in the
        # question; substitute your own callback if it is named differently.
        yield scrapy.Request(full_url, callback=self.parse_item)

要提取下一页的URL，可以使用XPath：next_page = response.xpath('//li[@class="pager-next"]/a/@href').extract_first()，然后同样调用 response.urljoin(next_page) 得到完整的URL，依此类推。

相关问题更多 >

编程相关推荐

热门问题

热门文章