我对Stackoverflow和编码的世界是完全陌生的,所以如果这个问题格式不好,请提前道歉!你知道吗
我已经构建了爬行器,可以在网站上搜索关键信息,然后将输出写入数据库。现在,我想将每个scrape的一些关键统计信息写入数据库中的一个单独的表中。你知道吗
我尝试过使用scrapyStats Collector,但是到目前为止还没有任何东西被写入数据库,我正在努力寻找这方面的例子。你知道吗
有人能帮我指出正确的方向吗?下面是一个尝试只写一列的示例(爬行器在刮取过程中找到的全部条目)。爬虫程序成功了,但我的StatCollector什么也没写。我特别想知道:
1)我是否以正确的方式处理这个问题,特别是当我为项目添加了更多的spider和更复杂的内容时
2)如何在每次爬网时只向数据库写入一个统计数据(例如找到的产品总数)。目前,我相信它每次调用parse\u main\u项时都会尝试写入数据库,这对于几十个子类别来说是几十次。你知道吗
提前感谢您的帮助!你知道吗
# One of my spiders
class Retailer1(scrapy.Spider):
name = 'R1_monitor'
allowed_domains = ['example.com']
start_urls = ['https://www.example.com']
def __init__(self):
self.declare_xpath()
def declare_xpath(self):
#site specific xpaths
def parse(self, response):
# returns all the category links in the home page menu
yield scrapy.Request(url=url, callback=self.parse_category, )
def parse_category(self, response):
# returns all the sub category links sitting beneath each category
yield scrapy.Request(url, callback=self.parse_page_url, )
def parse_page_url(self, response):
# uses a product count in the html to return a url that will display all products
yield scrapy.Request(paginated_url, callback=self.parse_main_item, dont_filter=True,)
def parse_main_item(self, response):
items_by_id = {}
for product in zip(
#zips things like product name, price etc together ):
item = PriceMonitorItem()
item['product_id'] = product[0]
item['product_name'] = product[1]
item['product_url'] = product[2]
item['product_image'] = product[3]
item['retailer_site'] = host
items_by_id[product[0]] = item
#yield all of the items from the main parse function
for items in items_by_id.values():
yield items
#Here I'd like to count the total number of items that are successfully parsed and sent to the pipeline
self.total_items = 0
self.total_items += len(items_by_id)
total_entries = self.crawler.stats.set_value("Total items scraped", self.total_items, spider=DdcMonitorSpider)
yield total_entries
In pipelines.py
class PriceCrawlerStatsPipeline(object):
def __init__(self):
#db session is created
def process_item(self, total_entries, spider):
session = self.Session()
**self.scrape_stats = ScrapeStats()
self.scrape_stats.total_entries = total_entries**
columns_to_dict = lambda obj: {c.name: getattr(obj, c.name) for c in obj.__table__.columns}
try:
insert_stmt = insert(ScrapeStats).values(
**columns_to_dict(self.scrape_stats))
on_duplicate_key_stmt = insert_stmt.on_duplicate_key_update(
{'date_scraped': date.today(), 'time_scraped': pytz.timezone('GMT')} )
session.execute(on_duplicate_key_stmt)
session.commit()
return total_entries
最后,这是我在日志中得到的回报
Traceback (most recent call last):
File "C:\Users\sibla\Documents\CodingNomads\price_crawler\price_monitor\price_monitor\pipelines.py", line 112, in process_item
session.execute(on_duplicate_key_stmt)
File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\orm\session.py", line 1269, in execute
clause, params or {}
File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\engine\base.py", line 982, in execute
return meth(self, multiparams, params)
File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\elements.py", line 287, in _execute_on_connection
return connection._execute_clauseelement(self, multiparams, params)
File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\engine\base.py", line 1092, in _execute_clauseelement
else None,
File "<string>", line 1, in <lambda>
File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\elements.py", line 462, in compile
return self._compiler(dialect, bind=bind, **kw)
File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\elements.py", line 468, in _compiler
return dialect.statement_compiler(dialect, self, **kw)
File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\compiler.py", line 571, in __init__
Compiled.__init__(self, dialect, statement, **kwargs)
File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\compiler.py", line 319, in __init__
self.string = self.process(self.statement, **compile_kwargs)
File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\compiler.py", line 350, in process
return obj._compiler_dispatch(self, **kwargs)
File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\visitors.py", line 92, in _compiler_dispatch
return meth(self, **kw)
File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\compiler.py", line 2405, in visit_insert
self, insert_stmt, crud.ISINSERT, **kw
File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\crud.py", line 62, in _setup_crud_params
return _get_crud_params(compiler, stmt, **kw)
File "c:\users\sibla\documents\codingnomads\price_crawler\venv\lib\site-packages\sqlalchemy\sql\crud.py", line 177, in _get_crud_params
% (", ".join("%s" % c for c in check))
**sqlalchemy.exc.CompileError: Unconsumed column names: id, total_entries**
目前没有回答
相关问题 更多 >
编程相关推荐