Writing scrape statistics to a database with the Scrapy Stats Collector

Published 2024-10-03 00:29:52


I'm completely new to Stack Overflow and to the world of coding, so apologies in advance if this question is badly formatted!

I've built spiders that scrape the key information I need from websites and write the output to a database. Now I'd like to write some key statistics about each scrape to a separate table in that database.

I've tried using the Scrapy Stats Collector, but so far nothing gets written to the database, and I'm struggling to find examples of this.
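For reference, my understanding of the Stats Collector API (based on the Scrapy docs) is roughly the following; the spider name, start URL and stat keys here are just placeholders, not my real code:

import scrapy


class ExampleSpider(scrapy.Spider):
    # minimal illustration of the Stats Collector calls I've been trying to use
    name = "example"
    start_urls = ["https://www.example.com"]

    def parse(self, response):
        # inc_value adds to a named counter; set_value overwrites the stat outright
        self.crawler.stats.inc_value("items_found")
        self.crawler.stats.set_value("last_url", response.url)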

Can someone point me in the right direction? Below is an attempt to write just one column (the total number of entries the spider found during the scrape). The crawl itself runs fine, but my stats collector writes nothing. Specifically, I'd like to know:

1) Am I approaching this the right way, especially as I add more spiders and more complexity to the project?

2) How can I write a statistic (e.g. the total number of products found) to the database only once per crawl? At the moment I believe it tries to write to the database every time parse_main_item is called, which is dozens of times across the dozens of sub-categories — see the sketch after this list.
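One pattern I've been considering, sketched below purely from my reading of the Scrapy docs (the pipeline class name and write_stats_row are placeholders, not working code), is to increment a counter in process_item and only touch the database once in close_spider:

class StatsToDbPipeline:
    # sketch only: count items as they pass through, write a single row at the end

    def process_item(self, item, spider):
        # count every item that reaches the pipeline
        spider.crawler.stats.inc_value("total_items_scraped")
        return item

    def close_spider(self, spider):
        # called once when the crawl finishes, so the database is written to only once
        total = spider.crawler.stats.get_value("total_items_scraped", 0)
        self.write_stats_row(total)  # placeholder for my SQLAlchemy insert/commit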

Thanks in advance for any help!

# One of my spiders
import scrapy

from ..items import PriceMonitorItem


class Retailer1(scrapy.Spider):
    name = 'R1_monitor'
    allowed_domains = ['example.com']
    start_urls = ['https://www.example.com']

    def __init__(self):
        self.declare_xpath()

    def declare_xpath(self):
        # site-specific XPaths (omitted)
        ...

    def parse(self, response):
        # yields a request for each category link in the home page menu
        yield scrapy.Request(url=url, callback=self.parse_category)

    def parse_category(self, response):
        # yields a request for each sub-category link sitting beneath a category
        yield scrapy.Request(url, callback=self.parse_page_url)

    def parse_page_url(self, response):
        # uses a product count in the html to build a url that displays all products
        yield scrapy.Request(paginated_url, callback=self.parse_main_item, dont_filter=True)

    def parse_main_item(self, response):
        items_by_id = {}

        # zips the extracted fields together (the list names here stand in for
        # my extracted product ids, names, urls and images)
        for product in zip(product_ids, product_names, product_urls, product_images):
            item = PriceMonitorItem()
            item['product_id'] = product[0]
            item['product_name'] = product[1]
            item['product_url'] = product[2]
            item['product_image'] = product[3]
            item['retailer_site'] = host
            items_by_id[product[0]] = item

        # yield all of the items from the main parse function
        for items in items_by_id.values():
            yield items

        # Here I'd like to count the total number of items that are successfully
        # parsed and sent to the pipeline
        self.total_items = 0
        self.total_items += len(items_by_id)

        total_entries = self.crawler.stats.set_value("Total items scraped", self.total_items, spider=DdcMonitorSpider)
        yield total_entries
In pipelines.py

from datetime import date

import pytz
from sqlalchemy.dialects.mysql import insert  # on_duplicate_key_update comes from the MySQL dialect insert

# ScrapeStats model and the self.Session factory come from my own modules (imports omitted)


class PriceCrawlerStatsPipeline(object):
    def __init__(self):
        # db session factory (self.Session) is created here
        ...

    def process_item(self, total_entries, spider):
        session = self.Session()
        self.scrape_stats = ScrapeStats()
        self.scrape_stats.total_entries = total_entries
        columns_to_dict = lambda obj: {c.name: getattr(obj, c.name) for c in obj.__table__.columns}

        try:
            insert_stmt = insert(ScrapeStats).values(
                **columns_to_dict(self.scrape_stats))

            on_duplicate_key_stmt = insert_stmt.on_duplicate_key_update(
                {'date_scraped': date.today(), 'time_scraped': pytz.timezone('GMT')})

            session.execute(on_duplicate_key_stmt)
            session.commit()
        except Exception:
            # error handling trimmed from my original snippet
            session.rollback()
            raise

        return total_entries

Finally, this is what I get back in the log:

Traceback (most recent call last):
  File "C:\Users\sibla\Documents\CodingNomads\price_crawler\price_monitor\price_monitor\pipelines.py", line 112, in process_item
    session.execute(on_duplicate_key_stmt)
  File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\orm\session.py", line 1269, in execute
    clause, params or {}
  File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\engine\base.py", line 982, in execute
    return meth(self, multiparams, params)
  File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\elements.py", line 287, in _execute_on_connection
    return connection._execute_clauseelement(self, multiparams, params)
  File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\engine\base.py", line 1092, in _execute_clauseelement
    else None,
  File "<string>", line 1, in <lambda>
  File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\elements.py", line 462, in compile
    return self._compiler(dialect, bind=bind, **kw)
  File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\elements.py", line 468, in _compiler
    return dialect.statement_compiler(dialect, self, **kw)
  File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\compiler.py", line 571, in __init__
    Compiled.__init__(self, dialect, statement, **kwargs)
  File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\compiler.py", line 319, in __init__
    self.string = self.process(self.statement, **compile_kwargs)
  File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\compiler.py", line 350, in process
    return obj._compiler_dispatch(self, **kwargs)
  File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\visitors.py", line 92, in _compiler_dispatch
    return meth(self, **kw)
  File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\compiler.py", line 2405, in visit_insert
    self, insert_stmt, crud.ISINSERT, **kw
  File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\crud.py", line 62, in _setup_crud_params
    return _get_crud_params(compiler, stmt, **kw)
  File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\crud.py", line 177, in _get_crud_params
    % (", ".join("%s" % c for c in check))
sqlalchemy.exc.CompileError: Unconsumed column names: id, total_entries
