How can I crawl a limited number of pages per site with Scrapy?

I need to crawl several websites, and I only want to fetch a certain number of pages from each one. How can I achieve this?

My idea is to use a dict whose keys are domain names and whose values are the number of pages already stored in MongoDB. So when a page is crawled and successfully stored in the database, the page count for its domain is incremented by one. If the count exceeds the maximum, the spider should stop crawling that site.

Below is my code, but it does not work: the spider keeps crawling even when spider.crawledPagesPerSite[domain_name] is greater than spider.maximumPagesPerSite.

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

# AnExampleItem and parse_page are defined elsewhere in this project

class AnExampleSpider(CrawlSpider):
    name = "anexample"
    rules = (
        Rule(LinkExtractor(allow=r"/*.html"), callback="parse_url", follow=True),
    )

    def __init__(self, url_file, *a, **kw):
        # url_file holds one domain per line
        with open(url_file, 'r') as f:
            self.allowed_domains = [line.strip() for line in f]
        self.start_urls = ['http://' + domain for domain in self.allowed_domains]
        super(AnExampleSpider, self).__init__(*a, **kw)

        self.maximumPagesPerSite = 100  # maximum pages to keep per site
        self.crawledPagesPerSite = {}   # domain -> pages stored so far

    def parse_url(self, response):
        item = AnExampleItem()
        item["url"] = response.url
        item["extracted_text"] = parse_page.parse_page(response.body)
        return item
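(As an aside, not part of the original post: a spider argument like url_file is normally supplied on the command line with -a, e.g. scrapy crawl anexample -a url_file=domains.txt.)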

import pymongo
import tldextract
from scrapy.exceptions import DropItem
from scrapy.utils.project import get_project_settings

settings = get_project_settings()


class MongoDBPipeline(object):
    def __init__(self):
        self.connection = pymongo.MongoClient(
            settings['MONGODB_SERVER'], settings['MONGODB_PORT'])

    def process_item(self, item, spider):
        domain_name = tldextract.extract(item['url']).domain
        db = self.connection[domain_name]  # use the domain name as the database name
        collection = db[settings['MONGODB_COLLECTION']]

        # Drop the item if any of its fields is empty
        for field in item:
            if not item[field]:
                raise DropItem("Missing {0}!".format(field))

        collection.insert_one(dict(item))
        spider.logger.debug("Item added to MongoDB database!")

        # Count the stored page for this domain
        if domain_name in spider.crawledPagesPerSite:
            spider.crawledPagesPerSite[domain_name] += 1
        else:
            spider.crawledPagesPerSite[domain_name] = 1

        # Once the limit is reached, try to stop crawling this site by
        # removing it from the allowed domains
        if spider.crawledPagesPerSite[domain_name] > spider.maximumPagesPerSite:
            suffix = tldextract.extract(item['url']).suffix
            domain_and_suffix = domain_name + "." + suffix
            if domain_and_suffix in spider.allowed_domains:
                spider.allowed_domains.remove(domain_and_suffix)
                spider.rules[0].link_extractor.allow_domains.remove(domain_and_suffix)
                return None
        return item
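One likely reason the removal has no effect: Scrapy's OffsiteMiddleware compiles allowed_domains into a regex when the spider opens, so mutating the list mid-crawl is ignored, and requests that are already scheduled get downloaded either way. A more direct way to enforce the quota is a downloader middleware that refuses further requests for a saturated domain. Below is a minimal sketch, not from the original thread; the middleware name is made up, and it assumes the crawledPagesPerSite and maximumPagesPerSite spider attributes defined above:

import tldextract
from scrapy.exceptions import IgnoreRequest

class DomainLimitMiddleware(object):
    """Drop requests to domains that have reached their page quota.

    Assumes the spider defines crawledPagesPerSite and
    maximumPagesPerSite, as in the question's code.
    """
    def process_request(self, request, spider):
        domain = tldextract.extract(request.url).domain
        if spider.crawledPagesPerSite.get(domain, 0) >= spider.maximumPagesPerSite:
            raise IgnoreRequest("page quota reached for %s" % domain)
        return None  # let the request proceed normally

The middleware would then be enabled via DOWNLOADER_MIDDLEWARES in settings.py.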

1 Answer

What about this:

def parse_url(self, response):
    url = response.url
    domain_name = tldextract.extract(url).domain
    if domain_name in self.crawledPagesPerSite:
        # Enough pages visited for this domain: stop producing items
        if self.crawledPagesPerSite[domain_name] > self.maximumPagesPerSite:
            return
        self.crawledPagesPerSite[domain_name] += 1
    else:
        self.crawledPagesPerSite[domain_name] = 1
    print(self.crawledPagesPerSite[domain_name])
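Note that returning early here only stops new items from being produced: with a CrawlSpider rule using follow=True, link extraction happens independently of the callback, and requests already in the scheduler will still be downloaded, so the number of fetched pages can overshoot maximumPagesPerSite. Pairing this counter with a request-dropping downloader middleware (like the sketch above), or with Scrapy's built-in CLOSESPIDER_PAGECOUNT setting for a global cap, gives a harder limit.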
