抓取以JSON形式提供的无限加载结果（“查看更多”）

import json

import scrapy


class SpidyQuotesSpider(scrapy.Spider):
    """Crawl the paginated quotes JSON API, following pages while more exist.

    Each response is parsed as JSON; every entry under ``quotes`` is emitted
    as an item, and the next page is requested while ``has_next`` is truthy.
    """

    name = 'spidyquotes'
    # Template for a page URL; ``%s`` is replaced with the page number.
    quotes_base_url = 'http://spidyquotes.herokuapp.com/api/quotes?page=%s'
    start_urls = [quotes_base_url % 1]
    download_delay = 1.5

    def parse(self, response):
        """Yield one item per quote, then a request for the following page.

        Args:
            response: the Scrapy response whose body is a JSON document.

        Yields:
            dicts with ``text``, ``author`` and ``tags`` keys, followed by a
            ``scrapy.Request`` for the next page when one is advertised.
        """
        data = json.loads(response.body)
        for item in data.get('quotes', []):
            # ``or {}`` also covers an explicit null author, which the
            # ``.get('author', {})`` default alone would not.
            author = item.get('author') or {}
            yield {
                'text': item.get('text'),
                'author': author.get('name'),
                'tags': item.get('tags'),
            }
        # A missing ``has_next`` is treated as "no more pages" instead of
        # raising KeyError after the items were already emitted.
        if data.get('has_next'):
            next_page = data['page'] + 1
            yield scrapy.Request(self.quotes_base_url % next_page)

2条回答

网友

1楼 · 编辑于 2024-09-23 04:26:51

这段代码可能有效，也可能无效，但针对您面临的问题，我会采用这种方法。您可以在起始URL中插入{}占位符，以便使用format方法填入页码。此外，当您遍历data['quotes']时，您处理的是一个JSON对象，而不是Scrapy选择器，因此不需要调用.get()。

import json
import scrapy


class SpidyQuotesSpider(scrapy.Spider):
    """Spider that pages through a JSON endpoint, one request per page.

    ``start_urls[0]`` is a URL template: its ``p={}`` slot is filled with the
    page number before each request is issued.
    """

    name = 'spidyquotes'
    start_urls = ['https://www.1177.se/api/hjv/search?batchsize=10&caretype=&componentname&cs=false&location=&p={}&q=&s=name&sortorder=name&st=4af2ed43-1154-4363-ae6b-718f9b84d23a']

    def start_requests(self):
        """Issue the first request with the page placeholder set to ``'1'``."""
        # The endpoint may also expect the headers a browser sends with this
        # request; none are set here.
        first_page_url = self.start_urls[0].format('1')
        yield scrapy.Request(first_page_url)

    def parse(self, response):
        """Emit one item per entry in ``quotes`` and follow the next page.

        NOTE(review): the keys read here (``quotes``, ``has_next``, ``page``,
        and per-entry ``text``/``name``/``tags``) are assumed to exist in the
        response; confirm them against the actual API payload.
        """
        payload = json.loads(response.body)

        # Entries are plain decoded JSON values, not Scrapy selectors, so
        # they are indexed directly.
        for entry in payload['quotes']:
            yield dict(
                text=entry['text'],
                author=entry['name'],
                tags=entry['tags'],
            )

        if not payload['has_next']:
            return

        # ``page`` is coerced to int so that the increment is arithmetic.
        following_page = int(payload['page']) + 1
        following_url = self.start_urls[0].format(following_page)
        yield scrapy.Request(following_url, callback=self.parse)

网友

2楼 · 编辑于 2024-09-23 04:26:51

这应该可以做到：

# Browser-like HTTP request headers (the user-agent identifies Chrome 79 on
# 64-bit Windows 10). Keys are lower-case header names.
Headerz = {
    "accept": "text/html, */*; q=0.01",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "en-US,en;q=0.9",
    # Both cache-control and pragma ask for an uncached response.
    "cache-control": "no-cache",
    "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
    "pragma": "no-cache",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    # Split across adjacent literals for readability; the value is unchanged.
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/79.0.3945.130 Safari/537.36"
    ),
}

class SpidyQuotesSpider(scrapy.Spider):
    """Spider that follows the ``NextPage`` value of a paginated JSON API.

    ``start_urls[0]`` is a URL template whose ``p={}`` slot receives the page
    number. Every request is sent with the module-level ``Headerz`` headers.
    """

    name = 'spidyquotes'
    start_urls = ['https://www.1177.se/api/hjv/search?batchsize=10&caretype=&componentname&cs=false&location=&p={}&q=&s=name&sortorder=name&st=4af2ed43-1154-4363-ae6b-718f9b84d23a']

    def start_requests(self):
        """Issue the first request with the page placeholder set to ``'1'``."""
        yield scrapy.Request(self.start_urls[0].format('1'), headers=Headerz)

    def parse(self, response):
        """Decode the JSON body and request the next page when one is given.

        Args:
            response: the Scrapy response whose body is a JSON document.

        Yields:
            a ``scrapy.Request`` for the next page when the payload carries a
            non-null ``NextPage`` value; nothing otherwise.
        """
        data = json.loads(response.body)
        # The decoded payload is available in ``data``; item extraction would
        # go here.

        # Only the key lookup is guarded, and only against the errors it can
        # raise, so that unrelated failures are no longer silently swallowed.
        try:
            nextpage_number = data['NextPage']
        except (KeyError, TypeError):
            self.logger.debug('No NextPage entry in response from %s', response.url)
            return

        if nextpage_number is not None:
            nexturl = self.start_urls[0].format(str(nextpage_number))
            yield scrapy.Request(nexturl, headers=Headerz)

这里的诀窍是使用正确的请求头（headers）

相关问题更多 >

编程相关推荐

热门问题

热门文章