我的爬虫不返回来自亚马逊网站

2024-06-28 19:34:45 发布

您现在位置:Python中文网/ 问答频道 /正文

伙计们,我已经编写了几个星期的基于垃圾的网络爬虫。他们似乎在如期工作。我成了一个讨厌的粉丝。但最近几天,我最新的爬行器拒绝爬行亚马逊网站。我没有得到任何结果。我也没有收到任何错误代码。我甚至试过这个破壳。它只是不返回任何结果。我怀疑问题出在xpath或css表达式中,但我无法解决。如有任何帮助,我们将不胜感激。在

这是我的蜘蛛长什么样的 我的代码打印了xxxxx,之后什么都没有

import scrapy

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from amazon.items import LowesItem
from amazon.items import SwatchcolorItem


class SattySpider(scrapy.Spider):
    name = "faucets"
    allowed_domains = ["amazon.com"]
    start_urls = [
        "https://www.amazon.com/s?ie=UTF8&page=1&rh=n%3A228013%2Ck%3Abathroom%20faucets"
    ]

    rules = (
        Rule(LinkExtractor(allow='amazon\.com/[A-Z][a-zA-Z_/]+$'),
            'parse_category', follow=True,
        ),
    )


    def parse(self, response):
        print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
        #####
        # I even tried xpath
        #for sel in response.xpath('.//li[@class="s-result-item  celwidget s-hidden-sponsored-item"]'):
        #    prodDesc= sel.xpath('.//div[@class="s-item-container"]//div[@class="a-row a-spacing-none"]//a[@title]').extract()
        #####
        for sel in response.css("li.s-result-item.celwidget.s-hidden-sponsored-item > div.s-item-container > div > div > a::attr('href')"):
        #for sel in response.xpath('.//li[@class="s-result-item  celwidget s-hidden-sponsored-item"]'):

            prodDesc= sel.xpath('.//div[@class="s-item-container"]//div[@class="a-row a-spacing-none"]//a[@title]').extract()
            print prodDesc
            produrls = sel.xpath('.//@data-producturl').extract()
            urls = sel.xpath('.//@data-productimg').extract()

            #prod_url_det = response.urljoin(produrl.extract())

            lowi= LowesItem()
            lowi['swatcharray'] = {}


            for idx,swatch in enumerate(sel.xpath('.//div[@class="product-container js-product-container"]//a//div[@class="pvs pvs-options-height v-spacing-small"]//ul/li')):
                swatchcolor = swatch.xpath('.//img//@alt').extract()

        lowi['swatcharray'][idx] =swatchcolor

                #yield lowi

            #url_prod_det = response.urljoin(produrl)
            for idx1,url in enumerate(urls):
                url_prod_det = response.urljoin(produrls[idx1])
                yield scrapy.Request(url_prod_det,
                     meta={'lowes': LowesItem(prod=prod[idx1], swatcharray=lowi['swatcharray'], file_urls=['http:' + url])},
                                 callback=self.parse_productdetail)


        for next in response.css("div.grid-parent.v-spacing-extra-large > nav > ul > li.page-next > a::attr('href')"):
            url_next = response.urljoin(next.extract())
            print " url_next : " + url_next
            yield scrapy.Request(url_next, callback=self.parse)

    def parse_productdetail(self, response):

    print 'Testing....'
#        for model in response.xpath('//div[@class="pd-numbers grid-50 tablet-grid-100"]//p[@class="secondary-text small-type"]').re('<strong> Model # </strong>'):
        for model in response.xpath('//div[@class="pd-numbers grid-50 tablet-grid-100"]//p[@class="secondary-text small-type"]'):
                #print model.extract()
                modelname = model.xpath('./text()').extract()
                #print modelname
                #yield lowesItem  


    lowesItem = response.meta['lowes']
    lowesItem['model']=modelname[1]
    lowesItem['category']='default'
    lowesItem['subcategory']='default'
    lowesItem['vendor']='Lowes'

        for namevals in response.xpath('//div[@id="collapseSpecs"]//div[@class="panel-body"]//div[@class="grid-100 grid-parent"]//div[@class="grid-50"]//table[@class="table full-width no-borders"]//tbody//tr'):
    #print namevals  
        name = namevals.xpath('.//th/text()').extract()
        val = namevals.xpath('.//td//span/text()').extract()

        if 'Faucet Type' in name:
            lowesItem['faucettype']=val[0]
        elif 'Number of Faucet Handles' in name:   
            lowesItem['numofhandles']=val[0]
        elif 'ADA Compliant' in name:
            lowesItem['ada']=val[0]
        elif 'Built-In Water Filter' in name:   
            lowesItem['builtinwaterfilter']=val[0]
        elif 'Mounting Location' in name:
            lowesItem['mountingloc']=val[0]
        elif 'Color/Finish Family' in name:   
                lowesItem['color']=val[0]
        elif 'Manufacturer Color/Finish' in name:   
            lowesItem['manufacturercolor']=val[0]
        elif 'Collection Name' in name:
            lowesItem['collection']=val[0]
        elif 'Soap or Lotion Dispenser' in name:   
            lowesItem['soapdispenser']=val[0]
        elif 'Spout Height (Inches)' in name:
            lowesItem['spoutheight']=val[0]
        elif 'Max Flow Rate' in name:   
                lowesItem['maxflowrate']=val[0]
        yield lowesItem          

Tags: nameindivurlforresponseextractval