Scrapy：检查页面是否包含HTML表单元素

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class MySpider(CrawlSpider):
    """Crawl some.url.com and dump pages that match a form XPath to disk."""

    name = 'mps'
    allowed_domains = ['some.url.com']
    start_urls = ['https://some.url.com/']

    # Follow every extracted link and hand each response to parse_item.
    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Write the response body to '<segment>.html' when the XPath matches.

        The XPath selects a <form> that is a direct child of the <form>
        whose id is 'aspnetForm'. When nothing matches, the page is skipped.
        The file name is the second-to-last "/"-separated piece of the URL.
        """
        # NOTE(review): this expression requires a form nested inside
        # another form, so it presumably matches far fewer pages than
        # intended -- confirm against the target markup.
        sentinel = 'not-found'
        match = response.xpath(
            "//form[@id = 'aspnetForm']/form"
        ).extract_first(default=sentinel)
        if match == sentinel:
            return

        target = response.url.split("/")[-2] + '.html'
        with open(target, 'wb') as out:
            out.write(response.body)

1条回答

网友
1楼 · 发布于 2024-09-30 01:21:47

示例
import hashlib
import re
from urllib.parse import urlsplit

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


def _filename_for(url):
    """Return a filesystem-safe file name that is unique per URL.

    The name keeps a readable slug built from the host and path, and
    appends a short digest of the complete URL so that pages sharing a
    slug (for example, differing only in the query string) do not
    overwrite each other.
    """
    parts = urlsplit(url)
    # Collapse every run of characters that is unsafe in a file name.
    slug = re.sub(r'[^A-Za-z0-9._-]+', '_', parts.netloc + parts.path).strip('_')
    digest = hashlib.sha1(url.encode('utf-8')).hexdigest()[:8]
    # Cap the slug length so very long paths still yield a valid name.
    return 'test-%s-%s.html' % (slug[:100] or 'index', digest)


class MySpider(CrawlSpider):
    """Crawl some.url.com and save every page that contains a <form>."""

    name = 'mps'
    allowed_domains = ['some.url.com']
    start_urls = ['https://some.url.com/']

    # Follow every extracted link and hand each response to parse_item.
    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Save the raw response body when the page has at least one <form>.

        Pages without a <form> element are skipped. The output file name is
        derived from the full URL (see ``_filename_for``) because the
        second-to-last URL segment alone is shared by many pages, which
        made later pages overwrite earlier ones.
        """
        # An empty selector list is falsy, so no sentinel string is needed.
        if not response.xpath("//form"):
            return

        with open(_filename_for(response.url), 'wb') as f:
            f.write(response.body)

相关问题更多 >

编程相关推荐

热门问题

热门文章