Python running intermittently

Posted 2024-09-30 02:23:19


My pipeline's process_item writes to two different CSV files and also makes an external SOAP request.

Sometimes the files get written, but most of the time they don't. For the crawl command I run either scrapy crawl parts or scrapy crawl parts -o results.json. When I output to results.json, there are always results.

It seems random. Sometimes the pipeline creates the files, and sometimes it just appends to files that already exist.
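
Part of what I suspect is that the file objects opened inside process_item are never explicitly closed, so flushing is left to the garbage collector; if the process dies or the handle is never collected, buffered rows may not reach disk. A minimal sketch of the difference (the path is just a placeholder):

import csv

# handle is never closed; flushing the last buffered rows is left to the garbage collector
csv.writer(open('/tmp/example.csv', 'a+')).writerow(['part', 'value'])

# closing the file (here via a context manager) flushes the buffer every time
with open('/tmp/example.csv', 'a+', newline='') as f:
    csv.writer(f).writerow(['part', 'value'])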

Here is my pipeline:

Some information has been omitted.

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import csv
import zeep
from XXX import settings

class XXXPipeline(object):
    def process_item(self, item, spider):
        data = make_request(item)
        if (data):
            mainCsv = csv.writer(open(settings.csv_file_path, 'a+'), delimiter=',')
            imgPath = '/' + item['images'][0]['path']

            mainCsv.writerow([
                item['partNumber'], # sku,
                'simple', # _type
                'base', # _product_websites
                'Default', # _attribute_set
                4, # visiblity
                1, # status
                1, # is_in_stock
                10, # qty
                2, # tax_class_id
                1, # weight
                item['partNumber'] + data.PartDescription, # name
                9999, # price
                item['partNumber'] + ' ' + data.PartDescription, # description
                item['partNumber'] + ' ' + data.PartDescription, # short_description
                item['make'], # manufacturer_code
                imgPath, # image
                imgPath, # small_image
                imgPath, # thumbnail
                '"2,3"', # category_ids // Change based on site's categories
            ])

            imgCsv = csv.writer(open(settings.img_csv_file_path, 'a+'), delimiter=',')

            iterimg = iter(item['images'])
            next(iterimg)
            for img in iterimg:
                imgCsv.writerow([
                    item['partNumber'],
                    '/' + img['path']
                ])

        return item

def make_request(item):
    wsdl = 'XXX'
    client = zeep.Client(wsdl=wsdl)
    try:
        data = client.service.ExactPartLookup(
            userName='XXX',
            password='XXX',
            make=item['make'],
            partNumber=item['partNumber']
        )

        return data.PartInformation_v2[0]
    except:
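        # note: a bare except silently swallows any SOAP/lookup failure,
        # so data is None and no rows get written for that item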
        pass

The spider:

# -*- coding: utf-8 -*-
import scrapy
import html
import json
from bs4 import BeautifulSoup
from urllib.parse import urlparse


class PartsSpiderPySpider(scrapy.Spider):
    name = "parts"
    allowed_domains = ["XXX"]
    start_urls = ['https://XXX/']

    def parse(self, response):
        data = {'UserName': 'XXX', 'Password': 'XXX'}
        return scrapy.http.FormRequest(
            url='https://XXX/UserLogin/DoLogin',
            callback=self.after_login,
            formdata=data
        )

    def after_login(self, response):
        for i in range(1, 34):
            request = scrapy.Request(
                url='https://XXX/Landing/AppendMoreParts?type=1&page=' + str(i),
                callback=self.get_parts,
            )
            yield request

    def get_parts(self, response):
        res = json.loads(response.body_as_unicode())
        soup = BeautifulSoup(html.unescape(res['HTMLForGeneral']), 'html.parser')

        for part in soup.findAll('li'):
            item = {
                'partNumber': part.h5.a.string,
                'make': part.findAll('span')[0].string
            }
            yield scrapy.Request(
                url='https://XXX/Product/ProductImageListPartial?part=' + str(item['partNumber']) + '&make=' + str(item['make']),
                callback=self.get_img_urls,
                meta={'item': item},
            )

    def get_cross_info(self, response):
        item = response.meta['item']
        item['crossReference'] = response.css('span span::text').extract()

        yield scrapy.Request(
            url='https://XXX/Product/GetPartModelPaging?make=' + str(item['make']) + '&partNumber=' + str(item['partNumber']),
            callback=self.get_related_models,
            meta={'item': item},
        )

    def get_related_models(self, response):
        item = response.meta['item']
        res = json.loads(response.body_as_unicode())
        if res['Result']:
            soup = BeautifulSoup(html.unescape(res['Message']), 'html.parser')
            models = []
            for model in soup.findAll('a'):
                models.append(model.string)

            item['models'] = models
            return item

    def get_img_urls(self, response):
        item = response.meta['item']
        soup = BeautifulSoup(response.body, 'html.parser')
        imgs = []
        for div in soup.findAll('div', {'class': 'tumbimagepart'}):
            url = div.img['src']
            o = urlparse(url)
            imgs.append(o.scheme + "://" + o.netloc + o.path + '?width=750&mode=crop')

        item['image_urls'] = imgs

        yield scrapy.Request(
            url='https://XXX/Product/CrossReferencePartInfo?make=' + str(item['make']) + '&partNumber=' + str(item['partNumber']),
            callback=self.get_cross_info,
            meta={'item': item},
        )

Updated pipeline:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import csv
import zeep
from XXX import settings

class XXXPipeline(object):
    # def from_crawler(cls, crawler):
    #     self.settings = crawler.settings
    #     return cls()

    def spider_open(self, spider):
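        # note: Scrapy's pipeline hooks are named open_spider()/close_spider();
        # a method named spider_open() is never called automatically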
        self.data_file = open(settings.csv_file_path, 'a+')
        self.data_writer = csv.writer(self.data_file, delimiter=',')
        self.img_file = open(settings.img_csv_file_path, 'a+')
        self.img_writer = csv.writer(self.img_file, delimiter=',')

    def process_item(self, item, spider):
        data = make_request(item)
        if (data):
            mainCsv = self.data_writer
            imgPath = '/' + item['images'][0]['path']

            mainCsv.writerow([
                item['partNumber'], # sku,
                'simple', # _type
                'base', # _product_websites
                'Default', # _attribute_set
                4, # visiblity
                1, # status
                1, # is_in_stock
                10, # qty
                2, # tax_class_id
                1, # weight
                item['partNumber'] + data.PartDescription, # name
                9999, # price
                item['partNumber'] + ' ' + data.PartDescription, # description
                item['partNumber'] + ' ' + data.PartDescription, # short_description
                item['make'], # manufacturer_code
                imgPath, # image
                imgPath, # small_image
                imgPath, # thumbnail
                '"2,3"', # category_ids // Change based on site's categories
            ])

            imgCsv = self.img_writer

            iterimg = iter(item['images'])
            next(iterimg)
            for img in iterimg:
                imgCsv.writerow([
                    item['partNumber'],
                    '/' + img['path']
                ])

        return item

    def spider_close(self, spider):
        self.data_file.close()
        self.image_file.close()

def make_request(item):
    wsdl = 'https://XXX/b2b/parts_v2.asmx?WSDL'
    client = zeep.Client(wsdl=wsdl)
    # try:
    data = client.service.ExactPartLookup(
        userName='XXX',
        password='XXX',
        make=str(item['make']),
        partNumber=str(item['partNumber'])
    )

    return data.PartInformation_v2[0]
    # except:
    #     raise Exception('Couldn\'t get part information!')

Edit: I found that my problem is actually caused by this error: maximum recursion depth exceeded while getting the str of an object

It seems I get this error when the make_request function is called. I'm not entirely sure what is causing it.
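
I'm not sure yet whether this is the cause, but one thing I may try is building the zeep client once (parsing the WSDL on every item is expensive) and converting the SOAP response into plain Python data with zeep.helpers.serialize_object instead of returning zeep objects. A rough sketch (WSDL URL and credentials are placeholders, as above):

import zeep
from zeep.helpers import serialize_object

# build the client once instead of once per item
client = zeep.Client(wsdl='https://XXX/b2b/parts_v2.asmx?WSDL')

def make_request(item):
    data = client.service.ExactPartLookup(
        userName='XXX',
        password='XXX',
        make=str(item['make']),
        partNumber=str(item['partNumber'])
    )
    # turn the zeep response object into plain dicts/lists;
    # process_item would then read data['PartDescription'] instead of data.PartDescription
    return serialize_object(data.PartInformation_v2[0])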


1 Answer

You should open the files when the pipeline is initialized. Scrapy pipelines have a convenient method, open_spider(), which works like an __init__ but runs only once, when the spider starts.

So you want to:

  1. When the spider opens - open the files and create the csv writer objects
  2. While the spider runs - write each item as a row
  3. When the spider closes - close your files

Another important thing to note is that you should retrieve the settings object from the crawler, because settings.py is not the only place Scrapy settings can come from, for example the command line (e.g. scrapy crawl myspider -s CSV_FILE_LOCATION=/home/me/something.csv).

Putting all of this together should look something like this:

class MyPipeline:

    @classmethod
    def from_crawler(cls, crawler):
        # you want to get settings from crawler because settings.py is
        # not the only place that can have some settings
        pipeline = cls()
        pipeline.settings = crawler.settings
        return pipeline

    def open_spider(self, spider):
        # initiate file and writer objects once the spider opens
        self.data_file = open(self.settings.get('CSV_DATA'), 'a+')
        self.data_writer = csv.writer(self.data_file)
        self.image_file = open(self.settings.get('CSV_IMAGE'), 'a+')
        self.image_writer = csv.writer(self.image_file)

    def process_item(self, item, spider):
        # write some rows!
        if 'image' in item:
            self.image_writer.writerow(item.values())
        else:
            self.data_writer.writerow(item.values())
        return item

    def close_spider(self, spider):
        # close the file objects
        self.data_file.close()
        self.image_file.close()
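
And don't forget that the pipeline still has to be enabled and the two CSV locations defined, for example in settings.py (the module path here is just a placeholder for your project package):

# settings.py
ITEM_PIPELINES = {
    'myproject.pipelines.MyPipeline': 300,  # 'myproject' is a placeholder
}
CSV_DATA = '/path/to/parts.csv'
CSV_IMAGE = '/path/to/part_images.csv'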
