用scrapy递归地刮

import scrapy from medium.items import MediumItem from scrapy.linkextractors import LinkExtractor from scrapy.spiders import CrawlSpider, Rule class DataSpider(CrawlSpider): name = 'data' allowed_domains = ['medium.com'] start_urls = ['https://medium.com/tag/python/archive/'] rules = ( Rule( LinkExtractor( allow=(), restrict_xpaths=('//div[@class="timebucket u-inlineBlock u-width50"]/a')), follow=True, callback='years'), Rule( LinkExtractor( allow=(), restrict_xpaths=('//div[@class="timebucket u-inlineBlock u-width80"]/a')), follow=True, callback='months' ), Rule( LinkExtractor( allow=(),restrict_xpaths=('//div[@class="timebucket u-inlineBlock u-width35"]/a')),callback='days' )) def years(self,response): for year in response.xpath('//div[@class="timebucket u-inlineBlock u-width50"]/a/@href').extract(): yield response.follow(year, callback=self.months) def months(self,response): for month in response.xpath('//div[@class="timebucket u-inlineBlock u-width50"]/a/@href').extract(): yield response.follow(month,callback=self.days) def days(self,response): for day in response.xpath('//div[@class="timebucket u-inlineBlock u-width35"]/a/@href').extract(): yield response.follow(day,callback=self.parse_item) def parse_item(self,response): dic = {} dic['title1'] = 'h3[@class="graf graf--h3 graf-after--figure graf--title"]/text()' dic['title2'] = 'h3[@class="graf graf--h3 graf-after--figure graf--trailing graf--title"]/text()' dic['title3'] = 'h3[@class="graf graf--h3 graf-after--h4 graf--trailing graf--title"]/text()' dic['title4'] = 'h3[@class="graf graf--h3 graf--leading graf--title"]/text()' dic['title5'] = 'h3[@class="graf graf--h3 graf-after--figure graf--title"]/span[@class="markup--anchor markup--h3-anchor"]/text()' dic['title6'] = 'h3[@class="graf graf--h3 graf-after--h3 graf--trailing"]/text()' articles = response.xpath('//div[@class="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls"]') def title(dict, articles): for article in articles: for num in dic.keys(): if article.xpath('.//' + dic.get(f'{num}')).extract_first() == None: continue else: title = article.xpath('.//' + dic.get(f'{num}')).extract_first() return title for article in articles: item = MediumItem() item['date'] = article.css('time::text').extract_first() + ' 2019' item['read'] = article.css("span::attr(title)").extract_first() item['publication'] = article.xpath('.//a[@class="ds-link ds-link--styleSubtle link--darken link--accent u-accentColor--textNormal"]/text()').extract_first() item['name'] = article.xpath('.//a[@class="ds-link ds-link--styleSubtle link link--darken link--accent u-accentColor--textNormal u-accentColor--textDarken"]/text()').extract_first() item['claps'] = article.xpath('.//button[@class="button button--chromeless u-baseColor--buttonNormal js-multirecommendCountButton u-disablePointerEvents"]/text()').extract_first() item['responses'] = article.xpath('.//a[@class="button button--chromeless u-baseColor--buttonNormal"]/text()').extract_first() item['text'] = title(dict,articles) yield item link = article.xpath('.//a[@class="link link--darken"]/@href').extract_first() yield response.follow(link, callback=self.get_link, meta={'item':item}) def get_tag(self,response): item = response.meta['item'] item['tag'] = response.css("ul > li > a::text").getall() yield item

2020-02-06 16:56:38 [scrapy_user_agents.middlewares] DEBUG: Assigned User-Agent Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36 2020-02-06 16:56:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://medium.com/tag/python/archive/2019/03/04> (referer: https://medium.com/tag/python/archive/2019/03) ['cached'] 2020-02-06 16:56:38 [scrapy_user_agents.middlewares] DEBUG: Assigned User-Agent Mozilla/5.0 (Windows NT 6.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36 2020-02-06 16:56:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://medium.com/tag/python/archive/2019/03/03> (referer: https://medium.com/tag/python/archive/2019/03) ['cached'] 2020-02-06 16:56:38 [scrapy_user_agents.middlewares] DEBUG: Assigned User-Agent Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36 2020-02-06 16:56:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://medium.com/tag/python/archive/2019/03/02> (referer: https://medium.com/tag/python/archive/2019/03) ['cached'] 2020-02-06 16:56:38 [scrapy.core.engine] INFO: Closing spider (finished) 2020-02-06 16:56:38 [scrapy.statscollectors] INFO: Dumping Scrapy stats: {'downloader/request_bytes': 1236771, 'downloader/request_count': 1984, 'downloader/request_method_count/GET': 1984, 'downloader/response_bytes': 137377143, 'downloader/response_count': 1984, 'downloader/response_status_count/200': 1984, 'dupefilter/filtered': 52095, 'elapsed_time_seconds': 76.366135, 'finish_reason': 'finished', 'finish_time': datetime.datetime(2020, 2, 6, 16, 56, 38, 573687), 'httpcache/hit': 1984, 'log_count/DEBUG': 3970, 'log_count/INFO': 11, 'log_count/WARNING': 45, 'request_depth_max': 4, 'response_received_count': 1984, 'robotstxt/request_count': 1, 'robotstxt/response_count': 1, 'robotstxt/response_status_count/200': 1, 'scheduler/dequeued': 1983, 'scheduler/dequeued/memory': 1983, 'scheduler/enqueued': 1983, 'scheduler/enqueued/memory': 1983, 'start_time': datetime.datetime(2020, 2, 6, 16, 55, 22, 207552)} 2020-02-06 16:56:38 [scrapy.core.engine] INFO: Spider closed (finished)

1条回答

网友

1楼 · 发布于 2024-09-27 18:06:35

删除年、月、日这三个函数

rules = (
         Rule(
            LinkExtractor(
                restrict_xpaths=('//div[@class="timebucket u-inlineBlock u-width50"]/a')),),
        Rule(
            LinkExtractor(

                restrict_xpaths=('//div[@class="timebucket u-inlineBlock u-width80"]/a')), ),
        Rule(
            LinkExtractor(follow=True,
                restrict_xpaths=('//div[@class="timebucket u-inlineBlock u-width35"]/a')),callback='parse_item'
                ))

def parse_item(self,response):
    #your code,

相关问题更多 >

编程相关推荐

热门问题

热门文章