XPath 语句未按预期解析

import scrapy


class IMDBcrawler(scrapy.Spider):
    """Spider that requests ten IMDb title pages and extracts title and year."""

    name = 'imdb'

    def start_requests(self):
        """Yield one request per title id, from tt0000001 through tt0000010."""
        first, last = 1, 10
        for number in range(first, last + 1):
            # Zero-pad the number to seven digits to form the title id.
            url = 'https://www.imdb.com/title/tt' + '%07d' % number
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Yield a dict with the first matching title text and year text."""
        title = response.xpath('//div[@class="title_wrapper"]/h1/text()').extract_first()
        year = response.xpath('//span[@id="titleYear"]/a/text()').extract_first()
        yield {
            'nom': title,
            'ano': year,
        }

[ { "nom": "Chinese Opium Den\u00a0", "ano": "<a href=\"\/year\/1894\/?ref_=tt_ov_inf\">1894<\/a>" }, { "nom": "Pauvre Pierrot\u00a0", "ano": "<a href=\"\/year\/1892\/?ref_=tt_ov_inf\">1892<\/a>" }, { "nom": "Carmencita\u00a0", "ano": "<a href=\"\/year\/1894\/?ref_=tt_ov_inf\">1894<\/a>" }, { "nom": "Un bon bock\u00a0", "ano": "<a href=\"\/year\/1892\/?ref_=tt_ov_inf\">1892<\/a>" }, { "nom": "Blacksmith Scene\u00a0", "ano": "<a href=\"\/year\/1893\/?ref_=tt_ov_inf\">1893<\/a>" }, { "nom": "Corbett and Courtney Before the Kinetograph\u00a0", "ano": "<a href=\"\/year\/1894\/?ref_=tt_ov_inf\">1894<\/a>" }, { "nom": "Employees Leaving the Lumi\u00e8re Factory\u00a0", "ano": "<a href=\"\/year\/1895\/?ref_=tt_ov_inf\">1895<\/a>" }, { "nom": "Miss Jerry\u00a0", "ano": "<a href=\"\/year\/1894\/?ref_=tt_ov_inf\">1894<\/a>" }, { "nom": "Le clown et ses chiens\u00a0", "ano": "<a href=\"\/year\/1892\/?ref_=tt_ov_inf\">1892<\/a>" }, { "nom": "Edison Kinetoscopic Record of a Sneeze\u00a0", "ano": "<a href=\"\/year\/1894\/?ref_=tt_ov_inf\">1894<\/a>" } ]

1条回答

网友

1楼 · 发布于 2024-09-29 00:18:50

不确定 Scrapy 这边的问题出在哪里，但借助 requests 库直接使用 lxml，并配合 findtext 使用更简单的 XPath，效果很好：

import requests

from lxml import html

# Collect one {'nom': ..., 'ano': ...} record per title page.
pages = []

for title_number in range(1, 10):
    # Zero-pad the number to seven digits to form the title id in the URL.
    url = 'https://www.imdb.com/title/tt' + '%07d' % title_number
    tree = html.fromstring(requests.get(url).text)
    record = {
        'nom': tree.findtext('.//div[@class="title_wrapper"]/h1'),
        'ano': tree.findtext('.//span[@id="titleYear"]/a'),
    }
    pages.append(record)

结果：

In [40]: pages
Out[40]:
[{'ano': '1894', 'nom': 'Carmencita\xa0'},
 {'ano': '1892', 'nom': 'Le clown et ses chiens\xa0'},
 {'ano': '1892', 'nom': 'Pauvre Pierrot\xa0'},
 {'ano': '1892', 'nom': 'Un bon bock\xa0'},
 {'ano': '1893', 'nom': 'Blacksmith Scene\xa0'},
 {'ano': '1894', 'nom': 'Chinese Opium Den\xa0'},
 {'ano': '1894', 'nom': 'Corbett and Courtney Before the Kinetograph\xa0'},
 {'ano': '1894', 'nom': 'Edison Kinetoscopic Record of a Sneeze\xa0'},
 {'ano': '1894', 'nom': 'Miss Jerry\xa0'}]

相关问题更多 >

编程相关推荐

热门问题

热门文章