I need to crawl a number of websites, and I only want to crawl a certain number of pages from each site. How can I implement this?
My idea is to use a dict whose keys are domain names and whose values are the number of pages already stored in MongoDB: whenever a page is crawled and successfully stored in the database, the count for that domain is incremented by one. Once the count exceeds the maximum, the spider should stop crawling that site.
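To make the bookkeeping concrete, here is a minimal sketch of that idea on its own (page_stored is a hypothetical helper, not part of my code below; the limit of 100 matches what the spider uses):

    # domain -> number of pages already stored in MongoDB
    crawledPagesPerSite = {}
    maximumPagesPerSite = 100

    def page_stored(domain):
        # increment the counter and report whether this site may continue
        crawledPagesPerSite[domain] = crawledPagesPerSite.get(domain, 0) + 1
        return crawledPagesPerSite[domain] <= maximumPagesPerSite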
Below is my code, but it does not work: the spider keeps crawling even when spider.crawledPagesPerSite[domain_name] is greater than spider.maximumPagesPerSite:
    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor

    import parse_page  # my own HTML-to-text helper module
    from myproject.items import AnExampleItem  # adjust the import path to your project

    class AnExampleSpider(CrawlSpider):
        name = "anexample"
        rules = (
            Rule(LinkExtractor(allow=r"/*.html"), callback="parse_url", follow=True),
        )

        def __init__(self, url_file):  # , N=10, *a, **kw
            # url_file holds one domain per line
            data = open(url_file, 'r').readlines()  # [:N]
            self.allowed_domains = [i.strip() for i in data]
            self.start_urls = ['http://' + domain for domain in self.allowed_domains]
            super(AnExampleSpider, self).__init__()  # *a, **kw
            self.maximumPagesPerSite = 100  # maximum pages for each site
            self.crawledPagesPerSite = {}   # domain -> pages stored so far

        def parse_url(self, response):
            url = response.url
            item = AnExampleItem()
            html_text = response.body
            extracted_text = parse_page.parse_page(html_text)
            item["url"] = url
            item["extracted_text"] = extracted_text
            return item
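The items returned above only reach the pipeline below if it is enabled in settings.py, roughly like this (the myproject.pipelines path is a guess at the project layout):

    ITEM_PIPELINES = {
        'myproject.pipelines.MongoDBPipeline': 300,
    }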
    import pymongo
    import tldextract
    from scrapy import log  # old-style Scrapy logging, as used below
    from scrapy.exceptions import DropItem
    from scrapy.conf import settings

    class MongoDBPipeline(object):
        def __init__(self):
            self.connection = pymongo.MongoClient(
                settings['MONGODB_SERVER'], settings['MONGODB_PORT'])

        def process_item(self, item, spider):
            domain_name = tldextract.extract(item['url']).domain
            db = self.connection[domain_name]  # use the domain name as the database name
            self.collection = db[settings['MONGODB_COLLECTION']]
            valid = True
            for data in item:
                if not item[data]:  # was "if not data:", which tests the field *name*, not its value
                    valid = False
                    raise DropItem("Missing {0}!".format(data))
            if valid:
                self.collection.insert(dict(item))
                log.msg("Item added to MongoDB database!", level=log.DEBUG, spider=spider)
            # count this page against the domain's quota
            if domain_name in spider.crawledPagesPerSite:
                spider.crawledPagesPerSite[domain_name] += 1
            else:
                spider.crawledPagesPerSite[domain_name] = 1
            # past the limit: try to stop crawling this site
            if spider.crawledPagesPerSite[domain_name] > spider.maximumPagesPerSite:
                suffix = tldextract.extract(item['url']).suffix
                domain_and_suffix = domain_name + "." + suffix
                if domain_and_suffix in spider.allowed_domains:
                    spider.allowed_domains.remove(domain_and_suffix)
                    spider.rules[0].link_extractor.allow_domains.remove(domain_and_suffix)
                    return None
            return item
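As far as I can tell, one likely reason the removal above has no effect is that Scrapy's OffsiteMiddleware compiles its allowed-domains pattern once when the spider opens, so changes to spider.allowed_domains mid-crawl are not picked up; also, the LinkExtractor above was created without allow_domains, so that attribute is an empty set and calling .remove() on it would raise KeyError. Here is a sketch of an alternative that avoids mutating allowed_domains: a downloader middleware that drops requests for any domain that has reached its quota (DomainLimitMiddleware and the myproject.middlewares path are names I made up):

    # A sketch, not a drop-in fix: refuse to download pages for any domain
    # whose stored-page count has reached the spider's limit.
    import tldextract
    from scrapy.exceptions import IgnoreRequest

    class DomainLimitMiddleware(object):
        def process_request(self, request, spider):
            domain = tldextract.extract(request.url).domain
            if spider.crawledPagesPerSite.get(domain, 0) >= spider.maximumPagesPerSite:
                # drop the request before it is downloaded
                raise IgnoreRequest("page limit reached for %s" % domain)
            return None  # let other requests through unchanged

Enabling it in settings.py would look roughly like:

    DOWNLOADER_MIDDLEWARES = {
        'myproject.middlewares.DomainLimitMiddleware': 543,
    }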