Python/Scrapy: CrawlSpider stops after fetching the start URLs

Posted 2024-09-19 23:45:10


I have wasted several days now trying to figure this out, reading documentation and various blogs and Q&As... and now I am doing what men supposedly hate most: asking for directions ;-) The problem: my spider opens, fetches the start URLs and then apparently does nothing. Instead, it closes immediately, and that's it. Apparently I never even reach the first self.log() statement.

This is what I have so far:

# -*- coding: utf-8 -*-
import scrapy
# from scrapy.shell import inspect_response
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.http import HtmlResponse, FormRequest, Request
from KiPieSpider.items import *
from KiPieSpider.settings import *

class KiSpider(CrawlSpider):
    name = "KiSpider"
    allowed_domains = ['www.kiweb.de', 'kiweb.de']
    start_urls = (
        # ST Regra start page:
        'https://www.kiweb.de/default.aspx?pageid=206',
            # follow ST Regra links in the form of:
            # https://www.kiweb.de/default.aspx?pageid=206&page=\d+
            # https://www.kiweb.de/default.aspx?pageid=299&docid=\d{6}
        # ST Thermo start page:
        'https://www.kiweb.de/default.aspx?pageid=202&page=1',
            # follow ST Thermo links in the form of:
            # https://www.kiweb.de/default.aspx?pageid=202&page=\d+ 
            # https://www.kiweb.de/default.aspx?pageid=299&docid=\d{6}
    )
    rules = (
        # First rule that matches a given link is followed / parsed.
        # Follow category pagination without further parsing:
        Rule(
            LinkExtractor(
                # Extract links in the form:
                allow=r'Default\.aspx?pageid=(202|206])&page=\d+',
                # but only within the pagination table cell:
                restrict_xpaths=('//td[@id="ctl04_teaser_next"]'),
            ),
            follow=True,
        ),
        # Follow links to category (202|206) articles and parse them:
        Rule(
            LinkExtractor(
                # Extract links in the form:
                allow=r'Default\.aspx?pageid=299&docid=\d+',
                # but only within article preview cells:
                restrict_xpaths=("//td[@class='TOC-zelle TOC-text']"),
            ),
            # and parse the resulting pages for article content:
            callback='parse_init',
            follow=False,
        ),
    )

    # Once an article page is reached, check whether a login is necessary:
    def parse_init(self, response):
        self.log('Parsing article: %s' % response.url)
        if not response.xpath('input[@value="Logout"]'):
            # Note: response.xpath() is a shortcut of response.selector.xpath()
            self.log('Not logged in. Logging in...\n')
            return self.login(response)
        else:
            self.log('Already logged in. Continue crawling...\n')
            return self.parse_item(response)


    def login(self, response):
        self.log("Trying to log in...\n")
        self.username = self.settings['KI_USERNAME']
        self.password = self.settings['KI_PASSWORD']
        return FormRequest.from_response(
            response,
            formname='Form1',
            formdata={
                # needs name, not id attributes!
                'ctl04$Header$ctl01$textbox_username': self.username,
                'ctl04$Header$ctl01$textbox_password': self.password,
                'ctl04$Header$ctl01$textbox_logindaten_typ': 'Username_Passwort',
                'ctl04$Header$ctl01$checkbox_permanent': 'True',
            },
            callback = self.parse_item,
        )

    def parse_item(self, response):
        articles = response.xpath('//div[@id="artikel"]')
        items = []
        for article in articles:
            item = KiSpiderItem()
            item['link'] = response.url
            item['title'] = articles.xpath("div[@class='ct1']/text()").extract()
            item['subtitle'] = articles.xpath("div[@class='ct2']/text()").extract()
            item['article'] = articles.extract()
            item['published'] = articles.xpath("div[@class='biblio']/text()").re(r"(\d{2}.\d{2}.\d{4}) PIE")
            item['artid'] = articles.xpath("div[@class='biblio']/text()").re(r"PIE \[(d+)-\d+\]")
            item['lang'] = 'de-DE'
            items.append(item)
#       return(items)
        yield items
#       what is the difference between return and yield?? found both on web.

Running scrapy crawl KiSpider results in:

(Scrapy log output missing from the original post; as described above, the spider opens, fetches the start URLs and then closes immediately.)

Shouldn't the login routine end not with a callback, but with some kind of return/yield statement? Or am I doing something else wrong? Unfortunately, the documentation and tutorials I have seen so far only give me a vague idea of how all the pieces fit together; the Scrapy documentation in particular seems to be written as a reference for people who already know Scrapy quite well.
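On the return/yield question raised in the code comment above: a Scrapy callback may either return an iterable of items/requests or yield them one at a time, but yielding a whole list as a single object (as in `yield items`) is not what Scrapy expects. A minimal sketch of the more idiomatic per-item form, assuming the same spider class and the KiSpiderItem import shown above (it also uses the loop variable `article` rather than the full `articles` selector list, and escapes the dot and digit classes in the regexes):

    def parse_item(self, response):
        # Yield one item per matching <div id="artikel"> instead of
        # collecting everything in a list and yielding the list itself.
        for article in response.xpath('//div[@id="artikel"]'):
            item = KiSpiderItem()
            item['link'] = response.url
            # Use the loop variable "article", not the whole "articles" list:
            item['title'] = article.xpath("div[@class='ct1']/text()").extract()
            item['subtitle'] = article.xpath("div[@class='ct2']/text()").extract()
            item['article'] = article.extract()
            item['published'] = article.xpath("div[@class='biblio']/text()").re(
                r"(\d{2}\.\d{2}\.\d{4}) PIE")
            item['artid'] = article.xpath("div[@class='biblio']/text()").re(
                r"PIE \[(\d+)-\d+\]")
            item['lang'] = 'de-DE'
            yield item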

Somewhat frustrated greetings, Christopher


1 Answer

Posted 2024-09-19 23:45:10
rules = (
        # First rule that matches a given link is followed / parsed.
        # Follow category pagination without further parsing:
        Rule(
            LinkExtractor(
                # Extract links in the form:
                # allow=r'Default\.aspx?pageid=(202|206])&page=\d+',

                # but only within the pagination table cell:
                restrict_xpaths=('//td[@id="ctl04_teaser_next"]'),
            ),
            follow=True,
        ),
        # Follow links to category (202|206) articles and parse them:
        Rule(
            LinkExtractor(
                # Extract links in the form:
                # allow=r'Default\.aspx?pageid=299&docid=\d+',
                # but only within article preview cells:
                restrict_xpaths=("//td[@class='TOC-zelle TOC-text']"),
            ),
            # and parse the resulting pages for article content:
            callback='parse_init',
            follow=False,
        ),
    )

You don't need the allow parameter at all, because the table cell selected by the XPath contains only one link.

I don't fully understand the regex in your allow parameter, but at the very least you should escape the ? (write \?): an unescaped ? is a regex quantifier that makes the preceding character optional, so the pattern never matches the literal ? that separates the path from the query string.
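For illustration, a minimal standalone sketch using plain re.search (the allow patterns are ordinary regexes applied to each extracted link URL). The sample URL follows the pagination form described in the comments of start_urls, and the stray "]" from the original pattern is removed to isolate the "?" issue:

    import re

    # Sample pagination URL of the form described in the start_urls comments
    url = 'https://www.kiweb.de/default.aspx?pageid=206&page=2'

    # Pattern as in the question: the unescaped "?" makes the preceding "x"
    # optional instead of matching the literal "?" before the query string.
    broken = r'Default\.aspx?pageid=(202|206)&page=\d+'

    # Escaped variant: "\?" matches the literal question mark. Note the
    # lowercase "default" as well, since re.search() is case-sensitive.
    fixed = r'default\.aspx\?pageid=(202|206)&page=\d+'

    print(bool(re.search(broken, url)))  # False
    print(bool(re.search(fixed, url)))   # True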
