如何使用scrapy获取表数据

from scrapy import Spider
from scrapy.http import Request, FormRequest
from scrapy.utils.response import open_in_browser


class LarmSpider(Spider):
    """Spider that queries the 'Personbil' search endpoint on larmtjanst.se.

    The start page is fetched first; ``parse`` then posts the search form and
    ``parse_form`` reads registration numbers out of the returned table.
    """

    name = 'larm'
    allowed_domains = ['larmtjanst.se']
    start_urls = ['https://www.larmtjanst.se/Efterlysta-objekt/Personbil/?s=True']

    def parse(self, response):
        """Post the search form and route its response to ``parse_form``."""
        search_request = FormRequest(
            'https://www.larmtjanst.se/StolenItemsHelper/SearchAjax?category=Personbil',
            formdata={'category': 'Personbil'},
            callback=self.parse_form,
        )
        yield search_request

    def parse_form(self, response):
        """Yield one ``{'Register Number': ...}`` item per row of the result table.

        The value is the text of the first ``td/a`` link in the row, or
        ``None`` when the row has no such link.
        """
        # Debugging aid: opens the received response in a local browser.
        open_in_browser(response)

        # The second node whose class contains "searchResultTable" is used;
        # indexing raises IndexError if fewer than two nodes match.
        result_tables = response.xpath('//*[contains(@class, "searchResultTable")]')
        table = result_tables[1]

        for row in table.xpath('.//tr'):
            yield {
                'Register Number': row.xpath('.//td/a/text()').extract_first()
            }

1条回答

网友

1楼 · 发布于 2024-10-03 09:11:03

除了提供表单数据之外，您还需要设置一些请求头（headers）才能得到期望的输出。您可以使用 https://michael-shub.github.io/curl2scrapy/ 在 scrapy shell 中调试请求。下面这段代码对我有效：

from scrapy import Request

# Scrapy-shell snippet: POST to the SearchAjax endpoint with browser-like
# headers. No body or form data is passed to the Request, so `category`
# travels only in the URL query string.
url = 'https://www.larmtjanst.se/StolenItemsHelper/SearchAjax?category=Personbil'

# Headers sent with the POST. The accompanying answer states that form data
# alone is not enough and that some of these headers are required.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:82.0) Gecko/20100101 Firefox/82.0",
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Language": "en-US,en;q=0.5",
    # Hard-coded verification token; per the answer it can be found in the
    # page body. NOTE(review): presumably tied to one session/page load —
    # confirm, and extract a fresh value from the start page before reuse.
    "__RequestVerificationToken": "__5gL8glld19CRsUT8xTXmXEr-YzXF6_0VeJd_aLgoWHOqq2QU-D-UQSbB9Jjn60_N15L742nDFo_2W_DW-wOdoCJK9Zambqf7Qr0OwNgM81:6CZOtCzlWWD2SGiKILKI0rCfY5yHMPcVCCw6NBTpKtmp8HTIJpIkoPPa8NI2-7tPE-hMJT0MYDsmyh3aLDGpyf8IY_kPPoHT6CIQxu6Yfrs1",
    "IsJson": "true",
    "X-Requested-With": "XMLHttpRequest",
    "Origin": "https://www.larmtjanst.se",
    "Connection": "keep-alive",
    "Referer": "https://www.larmtjanst.se/Efterlysta-objekt/Personbil/?s=True",
    "Cache-Control": "max-age=0",
    "TE": "Trailers"
}

# dont_filter=True asks Scrapy not to drop this request as a duplicate.
request = Request(
    url=url,
    method='POST',
    dont_filter=True,
    headers=headers,
)

# `fetch` is neither defined nor imported in this snippet; presumably the
# Scrapy shell shortcut, so this is meant to run inside `scrapy shell`.
fetch(request)

正如您所看到的，您需要设置 __RequestVerificationToken，它可以在页面正文中找到。其他一些请求头也是必需的。

相关问题更多 >

编程相关推荐

热门问题

热门文章