Scrapy FormRequest parameters have no effect, all results are returned

Posted 2024-10-04 09:21:50


I am scraping this page with Scrapy FormRequest: https://researchgrant.gov.sg/eservices/advanced-search/?keyword=&source=sharepoint&type=project&status=open&_pp_projectstatus=&_pp_hiname=&_pp_piname=&_pp_source=sharepoint&_pp_details=#project. My code is below. With _pp_hiname set to ab and _pp_piname set to pua, response.text should contain only 1 result, but instead it returns all results as HTML. The parameters clearly have no effect, and I can't see what's wrong.

def start_requests(self):
    params = {
        'keyword': '',
        'source': 'sharepoint',
        'type': 'project',
        'status': 'open',
        'page': '1',
        '_pp_projectstatus': '',
        '_pp_hiname': 'ab',
        '_pp_piname': 'pua',
        '_pp_source': '',
        '_pp_details': '',
        'name': 'advancesearchawardedprojectsp'
    }
    yield scrapy.FormRequest(
        'https://researchgrant.gov.sg/eservices/mvcgrid',
        callback=self.parse_item,
        method='POST',
        formdata=params,
        headers={'X-Requested-With': 'XMLHttpRequest'}
    )

def parse_item(self, response):
    print(response.text)

It should return only one entry (screenshot: actual result).

But it clearly shows all entries (screenshot: all results).

Latest update:

class ToScrapeCSSSpiderSG(scrapy.Spider):
    name = "toscrapesg-css"
    # start_urls = [
    #     'https://researchgrant.gov.sg/eservices/mvcgrid',
    # ]

    params = {
        'name': 'advancesearchawardedprojectsp'
    }

    args = {
        'keyword': '',
        'source': 'sharepoint',
        'type': 'project',
        'status': 'open',
        'page': 1,
        '_pp_projectstatus': '',
        '_pp_hiname': 'ab',
        '_pp_piname': '',
        '_pp_source': '',
        '_pp_details': '',
        'name': 'advancesearchawardedprojectsp'
    }

    def start_requests(self):
        args = urllib.parse.urlencode(self.args)
        url = 'https://researchgrant.gov.sg/eservices/mvcgrid?' + args
        yield scrapy.FormRequest(
            url,
            callback=self.parse_item,
            method='POST',
            formdata=self.params,
            headers={'X-Requested-With': 'XMLHttpRequest'}
        )

    def parse_item(self, response):
        for quote in response.xpath('//div[contains(@style,"overflow-x:auto")]'):
            for row in quote.xpath('./table[contains(@class,"table-striped")]/tbody/tr'):
                link = row.xpath('td[1]/a/@href').extract_first()
                yield scrapy.Request(link, callback=self.parse_product)

        onclick = response.xpath('//a[@aria-label="Next page"]/@onclick').get()
        if onclick:
            self.args['page'] += 1
            args = urllib.parse.urlencode(self.args)
            url = 'https://researchgrant.gov.sg/eservices/mvcgrid?' + args
            yield scrapy.FormRequest(
                url,
                callback=self.parse_item,
                method='POST',
                formdata=self.params,
                headers={'X-Requested-With': 'XMLHttpRequest'}
            )

    def parse_product(self, response):
        text = response.xpath('//span[contains(@id,"ctl00_ctl47_g_b43c0a74_fae0_498f_b75e_c103772db011_ctl00_lblProjectTitle")]/text()').extract()
        print(text)

Console output: (screenshot)


1 Answer

#1 · Posted 2024-10-04 09:21:50

It only sends name=advancesearchawardedprojectsp in the POST body. The other parameters should go in the URL as a query string.
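To make the distinction concrete, here is a minimal sketch (my illustration, not part of the original answer) showing that a POST FormRequest urlencodes its formdata into the request body while the URL stays untouched, so the mvcgrid endpoint never sees the filters as query arguments:

import scrapy

# formdata is urlencoded into the POST body, not appended to the URL,
# so the server-side filter never sees _pp_hiname here
req = scrapy.FormRequest(
    'https://researchgrant.gov.sg/eservices/mvcgrid',
    formdata={'name': 'advancesearchawardedprojectsp', '_pp_hiname': 'ab'},
)
print(req.url)   # https://researchgrant.gov.sg/eservices/mvcgrid
print(req.body)  # b'name=advancesearchawardedprojectsp&_pp_hiname=ab'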

So the URL should be

https://researchgrant.gov.sg/eservices/mvcgrid?keyword=&source=sharepoint&type=project&status=open&page=1&_pp_projectstatus=&_pp_hiname=ab&_pp_piname=pua&_pp_source=&_pp_details=

You can build that query string with urllib.parse.urlencode(args).

This gives me exactly one result.

import urllib.parse

def start_requests(self):

    params = {
        'name':'advancesearchawardedprojectsp'
    }

    args = {
        'keyword': '',
        'source': 'sharepoint',
        'type': 'project',
        'status': 'open',
        'page': '1',
        '_pp_projectstatus': '',
        '_pp_hiname': 'ab',
        '_pp_piname': 'pua',
        '_pp_source': '',
        '_pp_details': '',
    }

    args = urllib.parse.urlencode(args)

    url = 'https://researchgrant.gov.sg/eservices/mvcgrid?' + args

    yield scrapy.FormRequest(
        url,
        callback=self.parse_item,
        method='POST',
        formdata=params,
        headers={'X-Requested-With': 'XMLHttpRequest'}
    )
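As a quick sanity check (my own addition, reusing the table XPath from the question's code), parse_item can simply count the rows of the results table; with the filters in the query string it should log a single row instead of the full list:

def parse_item(self, response):
    # count the rows of the results table returned by mvcgrid;
    # with _pp_hiname/_pp_piname applied this should be 1
    rows = response.xpath('//table[contains(@class,"table-striped")]/tbody/tr')
    self.logger.info('%d result row(s) received', len(rows))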

EDIT: example that loads the following pages and uses the Next page button to decide when to stop.

EDIT: it now saves the results to a CSV file.

import scrapy
import urllib.parse

class MySpider(scrapy.Spider):

    name = 'myspider'
    #allowed_domains = []

    params = {
        'name': 'advancesearchawardedprojectsp'
    }

    args = {
        'keyword': '',
        'source': 'sharepoint',
        'type': 'project',
        'status': 'open',
        'page': 1,
        '_pp_projectstatus': '',

        #'_pp_hiname': 'tan',
        #'_pp_piname': '',
        '_pp_hiname': 'ab',
        '_pp_piname': '', #'pua',

        '_pp_source': '',
        '_pp_details': '',
    }

    def start_requests(self):

        # create request for first page
        args = urllib.parse.urlencode(self.args)

        url = 'https://researchgrant.gov.sg/eservices/mvcgrid?' + args

        yield scrapy.FormRequest(url, callback=self.parse_item, method='POST', formdata=self.params, headers={'X-Requested-With': 'XMLHttpRequest'})


    def parse_item(self,response):
        #print('parse_item] url:', response.url)
        #print('parse_item] text:', response.text)

        #for quote in response.xpath('//div[contains(@style,"overflow-x:auto")]'):
        #    for row in quote.xpath('./table[contains(@class,"table-striped")]/tbody/tr'):
        #        link = row.xpath('td[1]/a/@href').extract_first()
        #        yield scrapy.Request(link, callback=self.parse_product)

        for row in response.xpath('//table[@name="MVCGridTable_advancesearchawardedprojectsp"]/tbody/tr'):
            link = row.xpath('.//a/@href').get()
            #title = row.xpath('.//a/text()').get()
            yield scrapy.Request(link, callback=self.parse_product)

        # create request for next page
        onclick = response.xpath('//a[@aria-label="Next page"]/@onclick').get()

        if onclick:
            # next page 
            self.args['page'] += 1
            args = urllib.parse.urlencode(self.args)
            url = 'https://researchgrant.gov.sg/eservices/mvcgrid?' + args
            yield scrapy.FormRequest(url, callback=self.parse_item, method='POST', formdata=self.params, headers={'X-Requested-With': 'XMLHttpRequest'})

    def parse_product(self, response):
        #print('parse_product] url:', response.url)

        # .extract_first() or .get() instead of .extract()
        project_id = response.xpath('//span[@id="ctl00_ctl47_g_b43c0a74_fae0_498f_b75e_c103772db011_ctl00_lblProjIdExt"]/text()').get()
        title = response.xpath('//span[@id="ctl00_ctl47_g_b43c0a74_fae0_498f_b75e_c103772db011_ctl00_lblProjectTitle"]/text()').get()
        pi = response.xpath('//span[@id="ctl00_ctl47_g_b43c0a74_fae0_498f_b75e_c103772db011_ctl00_lblLeadPIName"]/text()').get()
        hi = response.xpath('//span[@id="ctl00_ctl47_g_b43c0a74_fae0_498f_b75e_c103772db011_ctl00_lblHostInstName"]/text()').get()
        date = response.xpath('//span[@id="ctl00_ctl47_g_b43c0a74_fae0_498f_b75e_c103772db011_ctl00_dtPickerStartDate"]/text()').get()
        # etc.

        item = {
            'id': project_id,
            'title': title,
            'pi': pi,
            'hi': hi,
            'date': date,
        }

        yield item

#  - run without project and save in `output.csv`  -

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save in file CSV, JSON or XML
    'FEED_FORMAT': 'csv',     # csv, json, xml
    'FEED_URI': 'output.csv', #
})
c.crawl(MySpider)
c.start()
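If the whole snippet above is saved as a single file (say myspider.py, a name chosen here for illustration), running python myspider.py starts the crawl without creating a Scrapy project and writes the items to output.csv. Note that newer Scrapy releases deprecate FEED_FORMAT/FEED_URI in favour of the FEEDS setting, e.g. 'FEEDS': {'output.csv': {'format': 'csv'}}.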
