scrapy formrequest产生不匹配和缺失的结果

2024-10-02 18:27:41 发布

您现在位置:Python中文网/ 问答频道 /正文

我在scrapy中运行一系列formrequest,从一页跳到下一页,同时在一页中从一行跳到下一行,并从每一行中抓取文档编号和名称。

但是,当输出到csv时,所刮取的数据似乎不匹配,有时甚至丢失。你知道为什么会这样吗

    # Full ASP.NET WebForms postback payload for the "search" button.
    # Every field the page posts back must be present, even when empty,
    # or the server-side UpdatePanel rejects the request.
    formdata_pre = {
        # Async-postback bookkeeping: which UpdatePanel/button triggered it.
        'ScriptManager1': "SearchFormEx1$UpdatePanel|SearchFormEx1$btnSearch",
        'ScriptManager1_HiddenField': '',
        # Search options.
        'Navigator1$SearchOptions1$SavePrintCriteriaCheck': 'on',
        'Navigator1$SearchOptions1$SaveOrderCriteriaCheck': 'on',
        'SearchCriteriaOffice1$DDL_OfficeName': 'Recorded Land',
        'SearchCriteriaName1$DDL_SearchName': 'Recorded Land Name Search',
        # The actual search criteria: party name, doc type 29, all towns,
        # date range from the epoch of the records to 10/14/2000.
        'SearchFormEx1$ACSTextBox_LastName1': 'mortgage electronic',
        'SearchFormEx1$ACSTextBox_FirstName1': '',
        'SearchFormEx1$ACSRadioButtonList_PartyType1': '',
        'SearchFormEx1$ACSTextBox_LastName2': '',
        'SearchFormEx1$ACSTextBox_FirstName2': '',
        'SearchFormEx1$ACSRadioButtonList_PartyType2': '',
        'SearchFormEx1$ACSRadioButtonList_Search': '3',
        'SearchFormEx1$ACSDropDownList_DocumentType': '29',
        'SearchFormEx1$ACSDropDownList_Towns': '-2',
        'SearchFormEx1$ACSTextBox_DateFrom': '1/1/1753',
        'SearchFormEx1$ACSTextBox_DateTo': '10/14/2000',
        # Viewer widgets: posted back empty / at defaults.
        'ImageViewer1$ScrollPos': '',
        'ImageViewer1$ScrollPosChange': '',
        'ImageViewer1$_imgContainerWidth': '0',
        'ImageViewer1$_imgContainerHeight': '0',
        'ImageViewer1$isImageViewerVisible': 'true',
        'ImageViewer1$hdnWidgetSize': '',
        'ImageViewer1$DragResizeExtender_ClientState': '',
        'CertificateViewer1$ScrollPos': '',
        'CertificateViewer1$ScrollPosChange': '',
        'CertificateViewer1$_imgContainerWidth': '0',
        'CertificateViewer1$_imgContainerHeight': '0',
        'CertificateViewer1$isImageViewerVisible': 'true',
        'CertificateViewer1$hdnWidgetSize': '',
        'CertificateViewer1$DragResizeExtender_ClientState': '',
        'PTAXViewer1$ScrollPos': '',
        'PTAXViewer1$ScrollPosChange': '',
        'PTAXViewer1$_imgContainerWidth': '0',
        'PTAXViewer1$_imgContainerHeight': '0',
        'PTAXViewer1$isImageViewerVisible': 'true',
        'PTAXViewer1$hdnWidgetSize': '',
        'PTAXViewer1$DragResizeExtender_ClientState': '',
        # Result-list / refinement / basket controls: empty placeholders.
        'DocList1$ctl12': '',
        'DocList1$ctl14': '',
        'RefinementCtrl1$ctl01': '',
        'RefinementCtrl1$ctl03': '',
        'NameList1$ScrollPos': '',
        'NameList1$ScrollPosChange': '',
        'NameList1$_SortExpression': '',
        'NameList1$ctl03': '',
        'NameList1$ctl05': '',
        'DocDetails1$PageSize': '',
        'DocDetails1$PageIndex': '',
        'DocDetails1$SortExpression': '',
        'BasketCtrl1$ctl01': '',
        'BasketCtrl1$ctl03': '',
        'OrderList1$ctl01': '',
        'OrderList1$ctl03': '',
        # Standard WebForms hidden fields.
        '__EVENTTARGET': '',
        '__EVENTARGUMENT': '',
        '__LASTFOCUS': '',
        '__VIEWSTATE': '',
        '__ASYNCPOST': 'true',
        # The button that was "clicked".
        'SearchFormEx1$btnSearch': 'Search',
    }

上面的 formdata_pre 就是下面代码中 after_login_1 里用到的 formdata——Stack Overflow 对这个 dict 的格式化显示有点奇怪。

import scrapy
from scrapy import FormRequest
from scrapy.shell import inspect_response
import re

class FormySpider(scrapy.Spider):
    """Scrape document book/page numbers and party names from an ASP.NET
    WebForms land-records search (i2a.uslandrecords.com) by replaying its
    __doPostBack requests.

    The site keeps all search/result state in the server-side session, so
    two in-flight postbacks against the same session clobber each other's
    state.  That is what produced the mismatched and missing book/page/name
    rows in the CSV output — the fix is to serialize requests (see
    ``custom_settings`` below).
    """

    name = 'formy'
    allowed_domains = ['i2a.uslandrecords.com']
    start_urls = ['https://i2a.uslandrecords.com/ME/Cumberland/D/Default.aspx']

    # Every postback goes to the same page with the same UA; hoisted here
    # instead of being re-declared inside every callback.
    URL = 'https://i2a.uslandrecords.com/ME/Cumberland/D/Default.aspx'
    HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}

    # BUG FIX (the question's actual problem): run one request at a time so
    # concurrent postbacks cannot corrupt the shared server-side session.
    custom_settings = {'CONCURRENT_REQUESTS': 1}

    def parse(self, response):
        """Step 1: toggle the 'document images' checkbox postback so the
        session is primed before submitting the search."""
        yield FormRequest(
            url=self.URL,
            method='POST',
            headers=self.HEADERS,
            formdata={
                'ScriptManager1': 'Navigator1$SearchOptions1$UpdatePanel|Navigator1$SearchOptions1$DocImagesCheck',
                '__EVENTTARGET': 'Navigator1$SearchOptions1$DocImagesCheck',
                '__ASYNCPOST': ' true'},
            dont_filter=True,
            callback=self.after_login_1,
        )

    def after_login_1(self, response):
        """Step 2: submit the search form proper.

        BUG FIX: the original read ``formdata = formdata_pre,`` — the trailing
        comma made the local a 1-tuple ``(formdata_pre,)``, so FormRequest was
        handed a tuple instead of the payload dict.
        """
        yield FormRequest(
            url=self.URL,
            method='POST',
            headers=self.HEADERS,
            formdata=formdata_pre,  # module-level dict, not a tuple
            dont_filter=True,
            callback=self.after_login_2,
        )

    def after_login_2(self, response):
        """Step 3: walk every result row on this page, then page forward.

        Each row's href holds a ``__doPostBack('<event>','')`` call; replaying
        it opens that document's detail view (handled in after_login_3).
        """
        if response.xpath("(//a[contains(@id,'ButtonRow_Doc')])[1]").get():
            # Debug output: visible link texts on the result page.
            print(response.xpath("//a[contains(@id,'LinkButton')]/text()").getall())
            print(response.xpath("//a[contains(@id,'ButtonRow_Doc')]/text()").getall())
            for row in response.xpath("//a[contains(@id,'ButtonRow_Doc')]/@href").getall():
                event = re.findall(r'doPostBack\(\'([^()].*)\'\,\'\'\)', row)[0]
                scriptmanager = 'DocList1$UpdatePanel|' + event
                ndoc = re.findall(r'\#\_(.*)\'\,', row)[0]
                # NOTE(review): positional indexes [3]/[4] assume a fixed
                # column layout (book in col 3, page in col 4) — confirm
                # against the live page markup.
                nbook = response.xpath("(//a[contains(@href,\"_{}',\")])[3]/text()".format(ndoc)).get()
                npage = response.xpath("(//a[contains(@href,\"_{}',\")])[4]/text()".format(ndoc)).get()
                yield FormRequest(
                    url=self.URL,
                    method='POST',
                    headers=self.HEADERS,
                    formdata={
                        'ScriptManager1': scriptmanager,
                        '__EVENTTARGET': event,
                        '__ASYNCPOST': ' true'},
                    dont_filter=True,
                    # Carry the row's book/page along so after_login_3 can
                    # pair them with the names from the detail view.
                    meta={'nbook': nbook, 'npage': npage},
                    callback=self.after_login_3,
                )
            # BUG FIX: pagination was yielded inside the row loop above,
            # firing one "next page" request PER ROW (duplicated crawling
            # since dont_filter=True).  Hoisted so it fires once per page.
            if response.xpath("//a[@id='DocList1_LinkButtonNext']").get():
                yield FormRequest(
                    url=self.URL,
                    method='POST',
                    headers=self.HEADERS,
                    formdata={
                        'ScriptManager1': 'DocList1$UpdatePanel|DocList1$LinkButtonNext',
                        '__EVENTTARGET': 'DocList1$LinkButtonNext',
                        '__ASYNCPOST': ' true'},
                    dont_filter=True,
                    callback=self.after_login_2,
                )
            else:
                print("empty week")

    def after_login_3(self, response):
        """Step 4: emit one item per document — the grantor/grantee names
        from the detail view plus the book/page carried in via meta."""
        names = response.xpath("//a[contains(@id,'GrantorGrantee')]/text()").getall()
        yield {
            'nbook': response.request.meta['nbook'],
            'npage': response.request.meta['npage'],
            'names': names,
        }

就输出而言,以下是一个不匹配的示例:

15643 | | | 20 | |{FRANTZ RICHARD C,抵押电子登记系统公司,舰队抵押公司}

在上面的例子中,书名与页码和书号不匹配,“休斯·安德鲁”这个名字完全不存在。这只是一个例子

我猜测是 scrapy 请求发得太快,服务器的响应来不及生成;可能正是这种时序问题造成了数据错位和对象缺失。


Tags: selftrueurlresponseloginxpathheaderscontains
1条回答
网友
1楼 · 发布于 2024-10-02 18:27:41

您没有提供代码或输出,因此很难帮助您。听起来页面是根据您的会话(session)显示结果的;由于 Scrapy 请求是并发的,会话状态可能会相互覆盖。

您可以使用参数 meta={'cookiejar': different_values_here} 为每个请求设置独立会话,更多内容可参阅 Scrapy 官方文档中关于 cookiejar 的说明。

相关问题 更多 >