使用 Scrapy 爬取 extratorrent.cc

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from extra.items import *


class extraSpider(CrawlSpider):
    """Spider for extratorrent.cc that builds a ``TorrentItem`` per page.

    Links matching the ``rules`` pattern are handed to ``parse_torrent``.
    """

    name = 'extraSpider'
    allowed_domains = ['extratorrent.cc']
    start_urls = ['http://www.extratorrent.cc/torrent']
    # Raw string: ``\d`` and ``\S`` are regex classes, not string escapes.
    rules = [Rule(LinkExtractor(allow=[r'/\d+/\S+']), 'parse_torrent')]

    def parse_torrent(self, response):
        """Fill a ``TorrentItem`` from *response* and return it.

        Each field other than ``url`` holds the list returned by
        ``.extract()`` for its XPath, which is empty when nothing matches.
        """
        torrent = TorrentItem()
        torrent['url'] = response.url
        # NOTE(review): these absolute paths contain ``tbody`` steps, which
        # browsers commonly insert into the DOM but which may be absent from
        # the raw HTML the spider downloads -- verify against the page source.
        torrent['name'] = response.xpath(
            "/html/body/table/tbody/tr[3]/td/table/tbody/tr/td[2]/table[2]/tbody/tr/td[2]/h1"
        ).extract()
        torrent['description'] = response.xpath(
            "/html/body/table/tbody/tr[3]/td/table/tbody/tr/td[2]/div[4]"
        ).extract()
        torrent['size'] = response.xpath(
            "/html/body/table/tbody/tr[3]/td/table/tbody/tr/td[2]/table[3]/tbody/tr/td[1]/table/tbody/tr[10]/td[2]"
        ).extract()
        return torrent

1条回答

网友

1楼 · 发布于 2024-10-02 18:15:54

我对代码做了一些修改，这可能会有帮助

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from extra.items import *


class extraSpider(CrawlSpider):
    """Spider for extratorrent.cc that yields one ``TorrentItem`` per page.

    Links matching the ``rules`` pattern are handed to ``parse_torrent``.
    """

    name = 'extraSpider'
    allowed_domains = ['extratorrent.cc']
    start_urls = ['http://www.extratorrent.cc/torrent']
    # Raw string: ``\d`` and ``\S`` are regex classes, not string escapes.
    rules = [Rule(LinkExtractor(allow=[r'/\d+/\S+']), 'parse_torrent')]

    def parse_torrent(self, response):
        """Extract url, name, description and size from *response*.

        Every field other than ``url`` falls back to ``'N/A'`` when its
        XPath matches nothing.

        Yields:
            A single ``TorrentItem`` populated with the four fields.
        """
        url = response.url

        # First text node of a <b> inside an <h1>, trimmed.
        name = response.xpath(
            "//h1/b/text()").extract()
        name = name[0].strip() if name else 'N/A'

        # Join every text node under the "borderdark" div, then collapse
        # all runs of whitespace into single spaces.
        description = response.xpath(
            '//div[@class="borderdark"]//text()').extract()
        description = ' '.join(
            ' '.join(description).split()) if description else 'N/A'

        # Value cell immediately following the "Total Size:" label cell.
        size = response.xpath(
            '//td[@class="tabledata1" and contains(text(), "Total Size:")]/following-sibling::td[@class="tabledata0" and position()=1]/text()').extract()
        # Replace non-breaking spaces with ordinary spaces.
        size = size[0].strip().replace(u'\xa0', ' ') if size else 'N/A'

        torrent = TorrentItem(
            url=url,
            name=name,
            description=description,
            size=size)
        yield torrent

我在这里附加了一些示例输出

^{pr2}$

相关问题更多 >

编程相关推荐

热门问题

热门文章