我在 Scrapy 中运行一系列 FormRequest：从一页翻到下一页，同时在每一页中逐行遍历，并从每一行中抓取文档编号和名称。
但是，输出到 CSV 时，抓取到的数据似乎互相错位，有时甚至丢失。你知道为什么会这样吗？
# Full ASP.NET WebForms postback payload for the "Recorded Land Name Search"
# submit. The page is a single Default.aspx driven by async partial postbacks,
# so every hidden field the form renders must be echoed back; most are sent
# empty. Used by FormySpider.after_login_1 below.
formdata_pre={
# ScriptManager target: which UpdatePanel/control triggers this async post.
'ScriptManager1' : "SearchFormEx1$UpdatePanel|SearchFormEx1$btnSearch",
'ScriptManager1_HiddenField': '',
'Navigator1$SearchOptions1$SavePrintCriteriaCheck' : 'on',
'Navigator1$SearchOptions1$SaveOrderCriteriaCheck' : 'on',
# Actual search criteria: name search for "mortgage electronic" in
# Recorded Land, doc type 29, all towns (-2), full date range.
'SearchCriteriaOffice1$DDL_OfficeName' : 'Recorded Land',
'SearchCriteriaName1$DDL_SearchName' : 'Recorded Land Name Search',
'SearchFormEx1$ACSTextBox_LastName1' : 'mortgage electronic',
'SearchFormEx1$ACSTextBox_FirstName1' : '',
'SearchFormEx1$ACSRadioButtonList_PartyType1' : '',
'SearchFormEx1$ACSTextBox_LastName2' : '',
'SearchFormEx1$ACSTextBox_FirstName2' : '',
'SearchFormEx1$ACSRadioButtonList_PartyType2' : '',
'SearchFormEx1$ACSRadioButtonList_Search' : '3',
'SearchFormEx1$ACSDropDownList_DocumentType' : '29',
'SearchFormEx1$ACSDropDownList_Towns' : '-2',
'SearchFormEx1$ACSTextBox_DateFrom' : '1/1/1753',
'SearchFormEx1$ACSTextBox_DateTo' : '10/14/2000',
# Viewer widgets (image/certificate/PTAX): state fields the page expects,
# all sent with their rendered defaults.
'ImageViewer1$ScrollPos' : '',
'ImageViewer1$ScrollPosChange' : '',
'ImageViewer1$_imgContainerWidth' : '0',
'ImageViewer1$_imgContainerHeight' : '0',
'ImageViewer1$isImageViewerVisible' : 'true',
'ImageViewer1$hdnWidgetSize' : '',
'ImageViewer1$DragResizeExtender_ClientState' : '',
'CertificateViewer1$ScrollPos' : '',
'CertificateViewer1$ScrollPosChange' : '',
'CertificateViewer1$_imgContainerWidth' : '0',
'CertificateViewer1$_imgContainerHeight' : '0',
'CertificateViewer1$isImageViewerVisible' : 'true',
'CertificateViewer1$hdnWidgetSize' : '',
'CertificateViewer1$DragResizeExtender_ClientState' : '',
'PTAXViewer1$ScrollPos' : '',
'PTAXViewer1$ScrollPosChange' : '',
'PTAXViewer1$_imgContainerWidth' : '0',
'PTAXViewer1$_imgContainerHeight' : '0',
'PTAXViewer1$isImageViewerVisible' : 'true',
'PTAXViewer1$hdnWidgetSize' : '',
'PTAXViewer1$DragResizeExtender_ClientState' : '',
# Result-list / navigator controls, empty at search time.
'DocList1$ctl12' : '',
'DocList1$ctl14' : '',
'RefinementCtrl1$ctl01' : '',
'RefinementCtrl1$ctl03' : '',
'NameList1$ScrollPos' : '',
'NameList1$ScrollPosChange' : '',
'NameList1$_SortExpression' : '',
'NameList1$ctl03' : '',
'NameList1$ctl05' : '',
'DocDetails1$PageSize' : '',
'DocDetails1$PageIndex' : '',
'DocDetails1$SortExpression' : '',
'BasketCtrl1$ctl01' : '',
'BasketCtrl1$ctl03' : '',
'OrderList1$ctl01' : '',
'OrderList1$ctl03' : '',
# Standard WebForms hidden fields. NOTE(review): __VIEWSTATE is sent empty —
# presumably Scrapy's FormRequest/session echoes the server's value; if the
# site rejects empty viewstate, it must be read from the prior response.
'__EVENTTARGET' : '',
'__EVENTARGUMENT' : '',
'__LASTFOCUS' : '',
'__VIEWSTATE' : '',
'__ASYNCPOST' : 'true',
'SearchFormEx1$btnSearch' : 'Search'
}
上面的 formdata_pre 就是下面代码中用到的 formdata——Stack Overflow 对这个 dict 的排版显示有些奇怪。
import scrapy
from scrapy import FormRequest
from scrapy.shell import inspect_response
import re
class FormySpider(scrapy.Spider):
    """Scrape document book/page numbers and grantor/grantee names from the
    uslandrecords.com WebForms search.

    The target site is a single ASP.NET page driven entirely by async
    postbacks, so every step POSTs back to the same Default.aspx URL:

      parse          -> toggle the "document images" option (initial postback)
      after_login_1  -> submit the search form (module-level ``formdata_pre``)
      after_login_2  -> walk the result rows, then paginate via "Next"
      after_login_3  -> emit one item per row: book, page, and party names
    """

    name = 'formy'
    allowed_domains = ['i2a.uslandrecords.com']
    start_urls = ['https://i2a.uslandrecords.com/ME/Cumberland/D/Default.aspx']

    # BUG FIX (the mismatched/missing CSV rows): the server keeps the search
    # and row-selection state in the cookie-identified session, so concurrent
    # postbacks interleave and clobber each other's state. Run the postbacks
    # strictly one at a time. (Alternative: one cookiejar per request chain.)
    custom_settings = {'CONCURRENT_REQUESTS': 1}

    # Every request posts back to the same page with the same User-Agent;
    # hoisted here instead of being re-declared inside each callback.
    URL = 'https://i2a.uslandrecords.com/ME/Cumberland/D/Default.aspx'
    HEADERS = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}

    def parse(self, response):
        """Fire the initial async postback that enables document images."""
        yield FormRequest(
            url=self.URL,
            method='POST',
            headers=self.HEADERS,
            formdata={
                'ScriptManager1' : 'Navigator1$SearchOptions1$UpdatePanel|Navigator1$SearchOptions1$DocImagesCheck',
                '__EVENTTARGET' : 'Navigator1$SearchOptions1$DocImagesCheck',
                '__ASYNCPOST' : ' true',
            },
            dont_filter=True,
            callback=self.after_login_1,
        )

    def after_login_1(self, response):
        """Submit the prepared search form."""
        # BUG FIX: the original did ``formdata=formdata_pre,`` as a statement —
        # the trailing comma bound a 1-tuple, so the request posted a tuple
        # instead of the dict. Pass the module-level dict directly.
        yield FormRequest(
            url=self.URL,
            method='POST',
            headers=self.HEADERS,
            formdata=formdata_pre,
            dont_filter=True,
            callback=self.after_login_2,
        )

    def after_login_2(self, response):
        """Walk every document row on this result page, then paginate."""
        # Debug output: which rows/links the current page actually contains.
        # (BUG FIX: originally three statements jammed onto one line.)
        if response.xpath("(//a[contains(@id,'ButtonRow_Doc')])[1]").get():
            print(response.xpath("//a[contains(@id,'LinkButton')]/text()").getall())
            print(response.xpath("//a[contains(@id,'ButtonRow_Doc')]/text()").getall())
        for row in response.xpath("//a[contains(@id,'ButtonRow_Doc')]/@href").getall():
            # The href is a javascript:__doPostBack('<target>','') call;
            # extract this row's postback target.
            event = re.findall(r'doPostBack\(\'([^()].*)\'\,\'\'\)', row)[0]
            scriptmanager = 'DocList1$UpdatePanel|' + event
            # Row ordinal embedded in the href; used to find the book/page
            # cells belonging to the same row (3rd and 4th links).
            ndoc = re.findall(r'\#\_(.*)\'\,', row)[0]
            nbook = response.xpath("(//a[contains(@href,\"_{}',\")])[3]/text()".format(ndoc)).get()
            npage = response.xpath("(//a[contains(@href,\"_{}',\")])[4]/text()".format(ndoc)).get()
            yield FormRequest(
                url=self.URL,
                method='POST',
                headers=self.HEADERS,
                formdata={
                    'ScriptManager1' : scriptmanager,
                    '__EVENTTARGET' : event,
                    '__ASYNCPOST' : ' true',
                },
                dont_filter=True,
                # Carry this row's book/page through to the detail callback so
                # the yielded item is tied to the row that triggered it.
                meta={'nbook' : nbook, 'npage' : npage},
                callback=self.after_login_3,
            )
        # Paginate: keep posting "Next" back into this same callback.
        if response.xpath("//a[@id='DocList1_LinkButtonNext']").get():
            yield FormRequest(
                url=self.URL,
                method='POST',
                headers=self.HEADERS,
                formdata={
                    'ScriptManager1' : 'DocList1$UpdatePanel|DocList1$LinkButtonNext',
                    '__EVENTTARGET' : 'DocList1$LinkButtonNext',
                    '__ASYNCPOST' : ' true',
                },
                dont_filter=True,
                callback=self.after_login_2,
            )
        else:
            print("empty week")

    def after_login_3(self, response):
        """Emit one item: the row's book/page plus all grantor/grantee names."""
        names = response.xpath("//a[contains(@id,'GrantorGrantee')]/text()").getall()
        yield {
            'nbook' : response.request.meta['nbook'],
            'npage' : response.request.meta['npage'],
            'names' : names,
        }
就输出而言，以下是一个数据错位的示例：
15643 | | | 20 | |{FRANTZ RICHARD C,抵押电子登记系统公司,舰队抵押公司}
在上面的例子中，这些名称与卷号和页码并不对应，而且"休斯·安德鲁"这个名字完全没有出现。这只是其中一个例子。
我认为是 Scrapy 发出请求的速度太快，服务器端的响应状态来不及更新，这种错位导致了数据对不上以及条目丢失。
您没有提供完整的代码和输出，所以很难进一步帮您。听起来该页面是根据您的会话（session）来返回结果的；由于 Scrapy 的请求是并发的，多个请求的会话状态可能互相干扰。
您可以在每个请求上使用参数
meta={'cookiejar': different_values_here}
为每条请求链设置独立的会话，详情可参阅 Scrapy 文档中关于 cookies 中间件（cookiejar）的说明。
编程相关推荐