How to kill a single instance of a function in Python


I'm scraping with Scrapy. The code is as follows:

import scrapy
from scrapy.exceptions import CloseSpider

class IrnaSpider(scrapy.Spider):
    name = 'irna'
    base_url = 'http://www.irna.ir/en/services/161'
    next_page = 162


    def start_requests(self):
        yield scrapy.Request(self.base_url, meta={'page_number': 1})

    def parse(self, response):

        for article_url in response.css('.DataListContainer h3 a::attr(href)').extract():
            yield scrapy.Request(response.urljoin(article_url), callback=self.parse_article)

        page_number = response.meta['page_number'] + 1
        if response.css('#MoreButton'):
            yield scrapy.Request('{}/page{}'.format(self.base_url, page_number),
                callback=self.parse, meta={'page_number': page_number})

        # note: these follow-up requests carry no 'page_number' meta, so
        # parse() will KeyError on response.meta['page_number'] for them
        for next_article in ('/en/services/162/', '/en/services/163/', '/en/services/164/'):
            yield response.follow(next_article, callback=self.parse)


    def parse_article(self, response):
        # timestamp of the newest article seen on the previous run
        with open("irnadate.txt", "rt") as in_file:
            irnadate = in_file.read()

        articleday = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'(.*)/.*/.*'))
        articlemonth = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'.*/(.*)/.*'))
        articleyear = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'.*/.*/(.*)'))
        articletime = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel3"]/text()').re(r'(.*):(.*)'))
        articlestamp = articleyear + articlemonth + articleday + articletime

        articlestampint = int(articlestamp)
        irnadateint = int(irnadate)

        if articlestampint <= irnadateint:
            # CloseSpider stops the entire crawl, not just this article
            raise CloseSpider('duplicate article')

        yield {
            'date': ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'(.*)/(.*)/(.*)')),
            'time': ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel3"]/text()').re(r'(.*):(.*)')),
            'title': ''.join(response.xpath('//*[@id="col-3"]/div/div[1]/div/h1/text()').extract_first()),
            'text': ''.join(response.xpath('//p[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_BodyLabel"]/text()').extract()),
            'tags': [tag.strip() for tag in response.xpath('//div[@class="Tags"]/p/a/text()').extract() if tag.strip()]

        }

I want it to scrape only links published since the last run, so every time it reads an article it compares the article's publication date with the date the program was last run; if the article is older, it does not scrape it, and the program terminates.
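Note that the code only ever reads irnadate.txt; for this scheme to work, the file also has to be rewritten after each run with the newest stamp seen. A rough sketch of that side, assuming a hypothetical newest_stamp attribute that parse_article keeps up to date:

import scrapy

class IrnaSpider(scrapy.Spider):
    name = 'irna'
    newest_stamp = 0  # hypothetical: bumped in parse_article for each new article

    # ... start_requests / parse / parse_article as above ...

    def closed(self, reason):
        # Scrapy calls this when the spider finishes; persist the
        # high-water mark so the next run knows where to stop
        if self.newest_stamp:
            with open("irnadate.txt", "wt") as out_file:
                out_file.write(str(self.newest_stamp))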

The problem here is that several categories are scraped at the same time with this code, so it is possible to hit an old article in one category before getting through all of the new articles in another.

Is it possible to raise something that kills only a single instance of a function, so that the scraper can keep going through the other categories?
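For reference, the distinction in Scrapy terms: raising CloseSpider shuts down the whole crawl, while a callback that simply returns stops producing requests from its own branch only. A minimal sketch of the latter:

import scrapy

class BranchStopSpider(scrapy.Spider):
    # hypothetical minimal spider, just to illustrate the distinction
    name = 'branch_stop_demo'
    start_urls = ['http://www.irna.ir/en/services/161']

    def parse(self, response):
        is_stale = True  # stand-in for the articlestampint <= irnadateint check
        if is_stale:
            # ends only this callback; requests already scheduled for
            # other categories are still processed normally
            return
        yield {'url': response.url}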

Edit:

import scrapy
from scrapy.exceptions import CloseSpider

class IrnaSpider(scrapy.Spider):
    name = 'irna'
    base_urls = [
    'http://www.irna.ir/en/services/161',
    'http://www.irna.ir/en/services/162',
    'http://www.irna.ir/en/services/163',
    'http://www.irna.ir/en/services/164',
    ]

    def start_requests(self):
        for base_url in self.base_urls:
            yield scrapy.Request(base_url, meta={'page_number': 1, 'base_url': base_url})


    def parse(self, response):
        with open("irnadate.txt", "rt") as in_file:
            irnadate = in_file.read()

        for article_url in response.css('.DataListContainer h3 a::attr(href)').extract():
            # NOTE: these XPaths query the listing-page response, not the
            # article itself, so they return the same values on every pass
            articleday = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'(.*)/.*/.*'))
            articlemonth = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'.*/(.*)/.*'))
            articleyear = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'.*/.*/(.*)'))
            articletime = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel3"]/text()').re(r'(.*):(.*)'))
            articlestamp = articleyear + articlemonth + articleday + articletime

            articlestampint = int(articlestamp)
            irnadateint = int(irnadate)

            if articlestampint <= irnadateint:
                break

            yield scrapy.Request(response.urljoin(article_url), callback=self.parse_article)


        page_number = response.meta['page_number'] + 1
        base_url = response.meta['base_url']

        if response.css('#MoreButton'):
            # keep base_url in meta so the next parse() call can read it
            yield scrapy.Request('{}/page{}'.format(base_url, page_number),
                callback=self.parse, meta={'page_number': page_number, 'base_url': base_url})



    def parse_article(self, response):

        yield {
            'date': ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'(.*)/(.*)/(.*)')),
            'time': ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel3"]/text()').re(r'(.*):(.*)')),
            'title': ''.join(response.xpath('//*[@id="col-3"]/div/div[1]/div/h1/text()').extract_first()),
            'text': ''.join(response.xpath('//p[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_BodyLabel"]/text()').extract()),
            'tags': [tag.strip() for tag in response.xpath('//div[@class="Tags"]/p/a/text()').extract() if tag.strip()]

        }

The problem with this is that it looks like I cannot determine an article's date without loading the article itself first.
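If the date really is only available on the article page, one way out is to keep the comparison in parse_article but skip just that article instead of raising CloseSpider; it still costs one request per stale article, but it kills only that branch of the crawl. A sketch under that assumption:

import scrapy

class IrnaSpider(scrapy.Spider):
    name = 'irna'
    # ... start_requests / parse as in the edit above ...

    def parse_article(self, response):
        with open("irnadate.txt", "rt") as in_file:
            irnadateint = int(in_file.read())

        date_xpath = '//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()'
        time_xpath = '//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel3"]/text()'
        day = ''.join(response.xpath(date_xpath).re(r'(.*)/.*/.*'))
        month = ''.join(response.xpath(date_xpath).re(r'.*/(.*)/.*'))
        year = ''.join(response.xpath(date_xpath).re(r'.*/.*/(.*)'))
        hhmm = ''.join(response.xpath(time_xpath).re(r'(.*):(.*)'))

        if int(year + month + day + hhmm) <= irnadateint:
            # drop only this stale article; requests already scheduled
            # for the other categories keep being crawled
            return

        yield {
            'date': ''.join(response.xpath(date_xpath).re(r'(.*)/(.*)/(.*)')),
            'time': ''.join(response.xpath(time_xpath).re(r'(.*):(.*)')),
            # ... title / text / tags as in the original item ...
        }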


1 Answer

Your spider needs to be restructured. First, you should not use

    for next_article in ('/en/services/162/', '/en/services/163/', '/en/services/164/'):
        yield response.follow(next_article, callback=self.parse)

because every time you get a results page you schedule the same URLs again and again, so after the first request Scrapy's duplicate filter drops them. Instead, put them all in your base URLs:

base_urls = [
    'http://www.irna.ir/en/services/161',
    'http://www.irna.ir/en/services/162',
    'http://www.irna.ir/en/services/163',
    'http://www.irna.ir/en/services/164',
    ]

def start_requests(self):
    for base_url in self.base_urls:
        yield scrapy.Request(base_url, meta={'page_number': 1, 'base_url': base_url})

Next, take each article's date from the results page itself:

def parse(self, response):

    for article_url in response.css('.DataListContainer h3 a::attr(href)').extract():
        # get the date for this article from the result page itself,
        # before deciding whether to request the article at all
        date_already_processed = <-Get the date from the result page->

        if date_already_processed:
            break
        yield scrapy.Request(response.urljoin(article_url), callback=self.parse_article)

    page_number = response.meta['page_number'] + 1
    base_url = response.meta['base_url']

    if response.css('#MoreButton'):
        yield scrapy.Request('{}/page{}'.format(base_url, page_number),
                             callback=self.parse,
                             meta={'page_number': page_number, 'base_url': base_url})
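Putting the pieces together, here is a fuller sketch of this approach. The per-row date selector is a guess, since the listing markup is not shown here, and has to be adapted; everything else follows the structure above:

import scrapy

class IrnaSpider(scrapy.Spider):
    name = 'irna'
    base_urls = [
        'http://www.irna.ir/en/services/161',
        'http://www.irna.ir/en/services/162',
        'http://www.irna.ir/en/services/163',
        'http://www.irna.ir/en/services/164',
    ]

    def start_requests(self):
        # read the last-run watermark once, up front
        with open("irnadate.txt", "rt") as in_file:
            self.irnadateint = int(in_file.read())
        for base_url in self.base_urls:
            yield scrapy.Request(base_url, meta={'page_number': 1, 'base_url': base_url})

    def parse(self, response):
        found_old_article = False
        for heading in response.css('.DataListContainer h3'):
            # hypothetical selector for a date node next to each heading;
            # also assumes its digits concatenate into a comparable stamp
            stamp = ''.join(heading.xpath('following-sibling::*[@class="date"]/text()').re(r'(\d+)'))
            if stamp and int(stamp) <= self.irnadateint:
                found_old_article = True
                break  # everything below this row is older still
            article_url = heading.css('a::attr(href)').extract_first()
            if article_url:
                yield scrapy.Request(response.urljoin(article_url),
                                     callback=self.parse_article)

        # only paginate this category while it still yields new articles
        if not found_old_article and response.css('#MoreButton'):
            page_number = response.meta['page_number'] + 1
            base_url = response.meta['base_url']
            yield scrapy.Request('{}/page{}'.format(base_url, page_number),
                                 callback=self.parse,
                                 meta={'page_number': page_number,
                                       'base_url': base_url})

    # parse_article stays as in the question's edit

This way, hitting an old article breaks out of only that category's loop and stops its pagination, while the requests already scheduled for the other categories keep running.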
