How to load more AJAX requests in Scrapy

Posted 2024-09-27 04:28:53


I am using Windows 10 and Python 3. I never get the data for the second page. Please take a look.

Thanks in advance!

In my terminal I opened a Scrapy shell:

scrapy shell "https://www.industrystock.com/html/hydraulic-cylinder/product-result-uk-19931-0.html"

Then, inside the shell:

url = 'https://www.industrystock.com/html/hydraulic-cylinder/product-result-uk-19931-0.html'

form = {
    'lang': 'en',
    'beta': 'false',
    'action': 'RESULTPAGE_AJAX#getOverview',
    'content': 'resultpage',
    'subContent': 'result',
    'company_id': '0',
    'override_id': '0',
    'domain_id': '0',
    'user_id': '0',
    'keyword_id': '19931',
    'JSONStr': '{"key":"company","length":9,"keyword_id":null,"index":6,"filter":{},"override":{"key":"company"},"query":"Hydraulic Cylinder"}',
}

headers = {
    'Content-Type': 'json/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

req = scrapy.FormRequest(url, method='POST', formdata=form, headers=headers)
fetch(req)
view(response)

I would like the crawl to load more pages and data!
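(A quick debugging step in the shell is to check whether the response that came back is actually JSON. This is only a minimal sketch; run it in the same Scrapy shell session after fetch(req), so the response object above is available.)

import json

print(response.status)                       # 200 alone does not mean success
print(response.headers.get('Content-Type'))  # what the server actually sent back
try:
    data = json.loads(response.text)         # only parses if the body is JSON
    print(list(data.keys()))
except json.JSONDecodeError:
    print('Not JSON - the POST likely hit the HTML page rather than the AJAX endpoint')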


1 Answer

I tried to find a way to do this without rendering the page:

from scrapy import Spider
import scrapy
import json
import logging


class IndustrystockSpider(Spider):
    name = "industry_stock"
    allowed_domains = ['industrystock.com']
    start_urls = ["https://www.industrystock.com/html/hydraulic-cylinder/product-result-uk-19931-0.html"]

    custom_settings = {'ROBOTSTXT_OBEY': False}

    ajax_url = 'https://www.industrystock.com/ajax/ajax_live.php'
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': 'https://www.industrystock.com/html/hydraulic-cylinder/product-result-uk-19931-0.html',
        'Origin': 'https://www.industrystock.com',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    }

    data = {
        'lang': 'en',
        'beta': 'false',
        'action': 'RESULTPAGE_AJAX#getOverview',
        'content': 'resultpage',
        'subContent': 'result',
        'company_id': '0',
        'override_id': '0',
        'domain_id': '0',
        'user_id': '0',
        'keyword_id': '19931',
    }

    @staticmethod
    def construct_json_str(index):
        # Build the JSONStr payload the AJAX endpoint expects; only "index"
        # changes between requests and selects which page of results to return.
        return '{"key":"company","length":9,"keyword_id":null,"index":' + \
               str(index) + \
               ',"filter":{},"override":{"key":"company"},"query":"Hydraulic Cylinder"}'

    def parse(self, response):
        # Start the AJAX pagination at page index 0 instead of parsing the HTML page.
        index = 0
        data = self.data
        data['JSONStr'] = self.construct_json_str(index)
        logging.info(f"data is {data}")
        yield scrapy.FormRequest(self.ajax_url,
                                 callback=self.parse_detail,
                                 method='POST',
                                 formdata=data,
                                 headers=self.headers,
                                 meta={'index': index})

    def parse_detail(self, response):
        company_data = json.loads(response.body)
        overview = company_data['result']['overview']
        # An empty "overview" means there are no more result pages, so no
        # further request is scheduled and the crawl stops.
        if overview:
            for company in overview:
                company_id = company['company_id']
                logging.info(f"company_id {company_id}")

            # Queue the request for the next page index.
            previous_index = response.meta['index']
            index = previous_index + 1
            data = self.data
            data['JSONStr'] = self.construct_json_str(index)
            yield scrapy.FormRequest(self.ajax_url,
                                     callback=self.parse_detail,
                                     method='POST',
                                     formdata=data,
                                     headers=self.headers,
                                     dont_filter=True,
                                     meta={'index': index})
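As a side note, the JSONStr payload can also be built with json.dumps instead of string concatenation, which is less error-prone. A sketch equivalent to the construct_json_str helper above (json.dumps serializes None as null):

import json

def construct_json_str(index):
    # Same payload the endpoint expects; only "index" varies between pages.
    return json.dumps({
        "key": "company",
        "length": 9,
        "keyword_id": None,
        "index": index,
        "filter": {},
        "override": {"key": "company"},
        "query": "Hydraulic Cylinder",
    })

The spider is run as usual, e.g. scrapy crawl industry_stock; since parse_detail only logs the company IDs rather than yielding items, the results appear in the log output.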
