我有最后一个问题:last question
现在我已经尽我最大的努力去思考和改进我的蜘蛛的结构。然而,由于某些原因,我的蜘蛛还是没有开始爬行。
我还检查了 xpath,它们工作正常(在 Chrome 控制台中验证过)。
我用href加入了url,因为href始终只返回参数。我在最后一个问题上附上了一个链接格式示例。(我希望不要让这篇文章变得冗长)
我的蜘蛛:
class kmssSpider(scrapy.Spider):
    """Log in to a Lotus Quickr site, then crawl the left-hand tab list,
    folders and file attachments reachable from the room home page.

    Yields ``CrawlkmssItem`` for left-hand tabs, ``CrawlkmssFolder`` for
    folders and ``CrawlkmssFile`` for files.
    """
    name = 'kmss'
    start_url = 'https://kmssqkr.hksarg/LotusQuickr/dept/Main.nsf/h_RoomHome/ade682e34fc59d274825770b0037d278/?OpenDocument#{unid=ADE682E34FC59D274825770B0037D278}'
    login_page = 'https://kmssqkr.hksarg/LotusQuickr/dept/Main.nsf?OpenDatabase&Login'
    # BUG FIX: Scrapy's OffsiteMiddleware reads `allowed_domains` (plural);
    # the original `allowed_domain` attribute was silently ignored.
    allowed_domains = ["kmssqkr.hksarg"]

    def start_requests(self):
        # dont_filter: the login URL may be re-requested after redirects.
        yield Request(url=self.login_page, callback=self.login,
                      dont_filter=True)

    def login(self, response):
        """Submit the login form found on the login page."""
        return FormRequest.from_response(
            response,
            formdata={'user': 'usename', 'password': 'pw'},
            callback=self.check_login_response)

    def check_login_response(self, response):
        """Verify the login succeeded before requesting the room home page."""
        # BUG FIX: `response.body` is bytes on Python 3, so the original
        # `'Welcome' in response.body` raised TypeError; use the decoded text.
        if 'Welcome' in response.text:
            self.log("\n\n\n\n Successfully logged in \n\n\n ")
            # No explicit callback: Scrapy routes the response to self.parse.
            yield Request(url=self.start_url,
                          cookies={'LtpaToken2': 'jHxHvqs+NeT...'})
        else:
            self.log("\n\n You are not logged in \n\n ")

    def parse(self, response):
        """Extract left-hand tabs, folder links and file links from the page."""
        listattheleft = response.xpath(
            "*//*[@class='qlist']/li[not(contains(@role,'menuitem'))]")
        anyfolder = response.xpath("*//*[@class='q-folderItem']/h4")
        anyfile = response.xpath("*//*[@class='q-otherItem']/h4")

        # The href values are only URL parameters; a full URL is built from
        # everything before the '#' fragment of the current page URL.
        # Hoisted out of the loops — it is loop-invariant.
        base_url = response.url[:response.url.find('#', 0)]

        for each_tab in listattheleft:
            item = CrawlkmssItem()
            # BUG FIX: .extract() returns a *list*; the later .find() and
            # slicing need a string, so take the first match.
            parameter = each_tab.xpath('a/@href').extract_first(default='')
            item['url'] = parameter
            item['title'] = each_tab.xpath('a/text()').extract_first()
            # BUG FIX: the original tested `'unid' not in [...list...]`,
            # which checks list membership, never a substring of the href.
            if 'unid' not in parameter:
                locatetheroom = parameter.find('PageLibrary')
                item['room'] = parameter[locatetheroom:]
                yield Request(url=base_url + parameter,
                              cookies={'LtpaToken2': 'jHxHvqs+NeT...'})
            # BUG FIX: yield the item only after all fields (incl. 'room')
            # are assigned; the original yielded it first, then mutated it.
            yield item

        for folder in anyfolder:
            folderparameter = folder.xpath('a/@href').extract_first(default='')
            # BUG FIX: on a plain scrapy.Spider the callback must be a
            # callable, not the string 'parse_folder'; the string form is
            # only understood by CrawlSpider rules.
            yield Request(url=base_url + folderparameter,
                          callback=self.parse_folder,
                          cookies={'LtpaToken2': 'jHxHvqs+NeT...'})

        for attachment in anyfile:
            fileparameter = attachment.xpath('a/@href').extract_first(default='')
            yield Request(url=base_url + fileparameter,
                          callback=self.parse_file,
                          cookies={'LtpaToken2': 'jHxHvqs+NeT...'})

    def parse_folder(self, response):
        """Build a CrawlkmssFolder item from a folder page."""
        findfolder = response.xpath("//div[@class='lotusHeader']")
        folderitem = CrawlkmssFolder()
        folderitem['foldername'] = findfolder.xpath(
            'h1/span/span/text()').extract()
        # Keep only the document id that follows 'unid=' in the URL.
        folderitem['url'] = response.url[response.url.find("unid=") + 5:]
        yield folderitem

    def parse_file(self, response):
        """Build a CrawlkmssFile item from a file-attachment page."""
        findfile = response.xpath("//div[@class='lotusContent']")
        fileitem = CrawlkmssFile()
        fileitem['filename'] = findfile.xpath('a/text()').extract()
        fileitem['title'] = findfile.xpath(
            ".//div[@class='qkrTitle']/span/@title").extract()
        fileitem['author'] = findfile.xpath(
            ".//div[@class='lotusMeta']/span[3]/span/text()").extract()
        yield fileitem
我想搜集的信息:
左侧栏:
文件夹:
日志:
谢谢你的帮助!
您的日志中有一个警告,并且您的回溯表明在打开 httpConnection 时发生了错误。
我觉得你把事情搞得太复杂了:为什么要继承
scrapy.Spider
自己做这些繁重的工作,而不直接使用 CrawlSpider?Spider
通常用于抓取给定的页面列表,而 CrawlSpider
用于按规则爬取整个网站。相关问题 更多 >
编程相关推荐