在循环中获取数据时出现问题（Scrapy）

def parse(self, response):
    """Start an item for the current page and follow the jockey link.

    Stores the page URL under ``Caballo`` and the jockey link text under
    ``jockey``, then requests the jockey page, handing the partially
    filled item to ``parse_jockey`` through ``meta['item']``.
    """
    # The jockey's name and the link to the jockey page are read from the
    # same <a> element, so the long path is written only once.
    jockey_link = (
        './/*[@id="site-content"]/div/main/div/div[1]/div[1]/div/div/div'
        '/div[1]/div[2]/div[2]/div/div[1]/ul/li[4]/a'
    )
    jockey_url = response.xpath(jockey_link + '/@href').get()

    loader = ItemLoader(item=DailyItem(), response=response)
    loader.add_value('Caballo', response.url)
    loader.add_xpath('jockey', jockey_link + '/text()')
    base_item = loader.load_item()

    yield response.follow(
        jockey_url, self.parse_jockey, meta={'item': base_item}
    )


def parse_jockey(self, response):
    """Yield one item per row of the all-time form table.

    Each yielded item carries the fields collected in ``parse`` plus the
    ``Type`` and ``Rate`` cells of that row.
    """
    base_item = response.meta['item']
    rows = response.xpath('//*[@id="tab-form-alltime"]/div/table/tbody/tr')
    for row in rows:
        # Work on a copy: wrapping the same item object in every loader
        # makes all iterations mutate (and yield) one shared item, which
        # shows up as identical rows in the output.
        loader = ItemLoader(item=base_item.copy(), selector=row)
        loader.add_xpath('Type', './/td[1]/text()')
        loader.add_xpath('Rate', './/td[6]/text()')
        yield loader.load_item()

{"Caballo": "https://www.attheraces.com/form/horse/Alexanderthegreat/FR/3022995?raceid=1149928", "jockey": "Jason Hart", "Type": "Flat Turf", "Rate": "11.57%"}, {"Caballo": "https://www.attheraces.com/form/horse/Alexanderthegreat/FR/3022995?raceid=1149928", "jockey": "Jason Hart", "Type": "Flat Turf", "Rate": "11.57%"}, {"Caballo": "https://www.attheraces.com/form/horse/Alexanderthegreat/FR/3022995?raceid=1149928", "jockey": "Jason Hart", "Type": "Flat Turf", "Rate": "11.57%"},

1条回答

网友

1楼 · 发布于 2024-06-26 01:31:57

所以我认为问题的症结在于你的 XPath 选择器

代码示例test.py

import scrapy
from ..items import DailyItem
from scrapy.loader import ItemLoader

class TestSpider(scrapy.Spider):
    """Spider that combines page-level fields with per-row table data.

    ``parse`` fills the page-level fields of a ``DailyItem`` and follows a
    second URL; ``parse_jockey`` emits one item per qualifying table row of
    that second page.
    """

    name = 'test'
    allowed_domains = ['www.attheraces.com']
    start_urls = ['https://www.attheraces.com/form/jockey/Jason-Hart/1354728?raceid=1149928']

    def parse(self, response):
        """Build the base item from the start page and follow the next URL.

        The partially filled item is passed to ``parse_jockey`` through
        ``meta['item']``.
        """
        # NOTE(review): the start URL is a jockey page and the followed URL
        # is a horse page, although the names suggest the opposite - confirm
        # this is intended.
        jockey_url = 'https://www.attheraces.com/form/horse/Strongbowe/FR/3091730?raceid=1150331'

        loader = ItemLoader(item=DailyItem(), response=response)
        loader.add_value('Caballo', response.url)
        loader.add_xpath('Jockey', '//h1[@class="h3"]/text()')
        base_item = loader.load_item()

        yield response.follow(jockey_url, self.parse_jockey, meta={'item': base_item})

    def parse_jockey(self, response):
        """Yield one item per table row that has a type label.

        Rows without text at ``td[1]/div/span[2]`` are skipped.
        """
        base_item = response.meta['item']
        rows = response.xpath('//div[@id="tab-form-flat-form"]/div[2]/table/tbody/tr')
        for row in rows:
            if not row.xpath('.//td[1]/div/span[2]/text()'):
                continue
            # Copy the base item for every row; loading into the same item
            # object each time would make all yielded items share (and
            # overwrite) one set of values.
            loader = ItemLoader(item=base_item.copy(), selector=row)
            loader.add_xpath('Type', './/td[1]/div/span[2]/text()')
            loader.add_xpath('Rate', './/td[6]/text()')
            yield loader.load_item()

代码示例items.py

import scrapy
from scrapy.loader.processors import TakeFirst

class DailyItem(scrapy.Item):
    """Container for the values collected by the spider.

    Every field is declared with ``TakeFirst`` as its output processor.
    """
    # URL of the page the item was started from (the spider stores
    # response.url here).
    Caballo = scrapy.Field(output_processor=TakeFirst())
    # Text of the page heading selected by the spider (jockey name in the
    # sample output).
    Jockey = scrapy.Field(output_processor=TakeFirst())
    # Label taken from the first cell of a table row.
    Type = scrapy.Field(output_processor=TakeFirst())
    # Text of the sixth cell of the same row (a percentage in the sample
    # output).
    Rate = scrapy.Field(output_processor=TakeFirst())

输出

{"Caballo": "https://www.attheraces.com/form/jockey/Jason-Hart/1354728?raceid=1149928", "Jockey": "Jason Hart", "Type": "Turf", "Rate": "50.0%"}

提示

1.如果试图提取的HTML标记有class或id属性，请尝试使用该属性来获取信息，而不是XPATH中的一长串div。

2.对于较长的属性值，请使用XPATH中的contains函数，它会匹配属性值中包含您指定内容的元素

比如说

   '//div[contains(@class,"jock")]'

将获取class属性中包含jock的任何div

3.在编写代码之前，先使用 Scrapy shell 验证XPATH选择器，这样可以节省大量时间，尽早确认是否获得了所需的数据

代码示例test.py

代码示例items.py

输出

提示

相关问题更多 >

编程相关推荐

热门问题

热门文章