在Scrapy上的多个链接中爬行

# Build `splits`: a mapping from each sidebar link's cleaned label to its
# absolute URL.
sidebar = response.xpath('//*[@id="sidebar"]/div[1]/nav/ul/li[5]/div')
splits = {}
for anchor_html in sidebar.xpath('*//a').extract():
    # Splitting the serialized <a> element on double quotes: index 1 is the
    # text inside the first quoted attribute (presumably the href -- verify
    # against the page markup), index 2 is whatever follows that quote.
    pieces = anchor_html.split('"')
    link, date = pieces[1], pieces[2]
    # Keep every character except '>', '<', 'a' and '/'. Note this also
    # drops any letter "a" that occurs in the label itself.
    clean_date = "".join(re.findall("[^><a/]", date))
    splits[clean_date] = "http://www.ylioppilastutkinto.fi" + str(link)

# Convert the content table into `data_dict`: the "Category" entry holds the
# tag-stripped cells of the first row (minus its first cell); every later row
# is stored under the tag-stripped text of its own first cell.
# NOTE(review): '//tr' is an absolute XPath, so it selects every <tr> in the
# document rather than only the rows beneath `table` -- confirm this is
# intended.
table = resp.xpath('//*[@id="content"]/table/tbody')
rows = table.xpath('//tr')
header_cells = rows[0].xpath('td')[1:]
data_dict = {
    "Category": [w3lib.html.remove_tags(cell.get()) for cell in header_cells]
}
for row in rows[1:]:
    cells = row.xpath('td')
    title = w3lib.html.remove_tags(cells[0].get())
    data_dict[title] = [w3lib.html.remove_tags(cell.get()) for cell in cells[1:]]

import scrapy
import regex as re
from scrapy.http import HtmlResponse
import w3lib.html


class MainSpider(scrapy.Spider):
    """Spider that reads the sidebar links of the start page and tries to
    yield one ``{'Date': ..., 'Scores': ...}`` item per link."""

    name = 'links'
    allowed_domains = ['www.ylioppilastutkinto.fi/ylioppilastutkinto/pisterajat']
    start_urls = ['https://www.ylioppilastutkinto.fi/ylioppilastutkinto/pisterajat/']

    def parse(self, response):
        """Collect the sidebar links, then build a score table for each one."""
        # Step 1: map each cleaned link label to its absolute URL.
        sidebar = response.xpath('//*[@id="sidebar"]/div[1]/nav/ul/li[5]/div')
        splits = {}
        for anchor_html in sidebar.xpath('*//a').extract():
            # Index 1 is the text of the first quoted attribute (presumably
            # the href -- verify), index 2 is the markup after that quote.
            pieces = anchor_html.split('"')
            link, label = pieces[1], pieces[2]
            # Drops the characters '>', '<', 'a' and '/' from the label.
            clean_date = "".join(re.findall("[^><a/]", label))
            splits[clean_date] = "http://www.ylioppilastutkinto.fi" + str(link)

        # Step 2: parse the table behind every collected URL.
        for date, url in splits.items():
            # NOTE(review): HtmlResponse(url) only constructs a response
            # object for that URL; nothing is downloaded here, so the
            # selectors below run against an empty body.
            resp = HtmlResponse(url)
            # NOTE(review): '//tr' is absolute and therefore document-wide.
            rows = resp.xpath('//*[@id="content"]/table/tbody').xpath('//tr')
            header_cells = rows[0].xpath('td')[1:]
            data_dict = {
                "Category": [w3lib.html.remove_tags(cell.get()) for cell in header_cells]
            }
            for row in rows[1:]:
                cells = row.xpath('td')
                title = w3lib.html.remove_tags(cells[0].get())
                data_dict[title] = [
                    w3lib.html.remove_tags(cell.get()) for cell in cells[1:]
                ]
            yield {'Date': date, 'Scores': data_dict}

1条回答

网友

1楼 · 发布于 2024-10-02 08:29:26

初始化HtmlResponse(url)不会完成任何事情，因为类本身不会发出请求

要向scrapy的调度程序添加请求，您需要生成一个请求，例如：yield scrapy.Request(url, callback=self.parse)

也就是说，您可以对spider进行许多改进

使用scrapy内置的LinkExtractor而不是字符串拆分
使用css选择器而不是硬编码的XPath
使用`selector.root.text`而不是`w3lib.html.remove_tags`（从而完全去掉对w3lib的依赖）

以下是一个工作示例：

import scrapy
from scrapy.linkextractors import LinkExtractor


class MainSpider(scrapy.Spider):
    """Follow the sidebar links of the start page and scrape each linked table.

    ``parse`` schedules one request per link found in the sidebar block;
    ``parse_table`` turns the table on the linked page into items.
    """

    name = 'links'
    allowed_domains = ['www.ylioppilastutkinto.fi']
    start_urls = ['https://www.ylioppilastutkinto.fi/ylioppilastutkinto/pisterajat/']

    def parse(self, response):
        """Yield a request for every sidebar link.

        The link's text is forwarded to ``parse_table`` as the ``date``
        keyword argument.
        """
        le = LinkExtractor(
            allow_domains=self.allowed_domains,
            restrict_xpaths='//*[@id="sidebar"]/div[1]/nav/ul/li[5]/div',
        )
        for link in le.extract_links(response):
            yield scrapy.Request(
                url=link.url,
                callback=self.parse_table,
                cb_kwargs={'date': link.text},
            )

    def parse_table(self, response, date):
        """Yield one item per data row of the table inside ``#content``.

        Each item carries ``'Date'`` (the value passed in by ``parse``),
        ``'Category'`` (texts taken from the first row) and one extra key
        named after the row's first cell whose value is the list of the
        remaining cell texts.
        """
        rows = response.css('#content table tbody tr')
        if not rows:
            # Report through the spider's logger so the message lands in the
            # crawl log instead of bare stdout.
            self.logger.warning('No table found for url: %s', response.url)
            return

        # Prefer <strong> elements inside the header cells; fall back to the
        # bare cells when the header has none.
        category = [char.root.text for char in rows[0].css('td strong')[1:]]
        if not category:
            category = [char.root.text for char in rows[0].css('td')[1:]]

        for row in rows[1:]:
            cols = row.css('td')
            if not cols:
                # A row without <td> cells has no title cell; skip it rather
                # than fail on cols[0] and abort the whole callback.
                continue
            title = cols[0].root.text
            nums = [col.root.text for col in cols[1:]]
            yield {
                'Date': date,
                'Category': category,
                title: nums,
            }

注意您的类别解析似乎不起作用。我不确定你想提取什么，所以我把这个留给你

相关问题更多 >

编程相关推荐

热门问题

热门文章