我想返回链接的锚文本（即 <a> 标签中的文字），而不是整个URL

# Spider that follows every country link on the start page and records the
# undergraduate entry requirements found on each linked page.
from scrapy.spider import BaseSpider
from project.items import QualificationItem
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from urlparse import urljoin

# User-Agent header attached to every follow-up request issued by the spider.
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'


class recursiveSpider(BaseSpider):
    """Crawl the country list and scrape one item per linked country page."""

    name = 'usw1'
    # Two separate entries: without the comma Python concatenates adjacent
    # string literals into a single, meaningless domain. Entries are bare
    # domain names, so the trailing slash is dropped as well.
    allowed_domains = [
        'international.southwales.ac.uk',
        'eu.southwales.ac.uk',
    ]
    start_urls = ['http://international.southwales.ac.uk/countries']

    def parse(self, response):
        """Yield a request for every link matched on the start page.

        Each href is resolved against ``response.url`` and the resulting
        page is handed to :meth:`parse_linkpage`.
        """
        hxs = HtmlXPathSelector(response)
        xpath = '/html/body/div[1]/div[4]/div[2]/ul/li/a/@href'
        for link in hxs.select(xpath).extract():
            yield Request(
                urljoin(response.url, link),
                headers={'User-Agent': USER_AGENT},
                callback=self.parse_linkpage,
                # Ask Scrapy not to drop this request as a duplicate.
                dont_filter=True,
            )

    def parse_linkpage(self, response):
        """Return a ``QualificationItem`` built from a linked page.

        ``Qualification`` holds a list with at most one entry: the first
        ``li`` of a ``ul`` following the "Entry Requirements - Undergraduate"
        heading. ``Url`` holds the URL of the page itself.
        """
        hxs = HtmlXPathSelector(response)
        item = QualificationItem()
        xpath = (
            ' //h4[normalize-space(.)="Entry Requirements - Undergraduate"]'
            ' /following-sibling::ul/li '
        )
        # Slicing (rather than indexing) yields an empty list instead of
        # raising IndexError when nothing matches.
        item['Qualification'] = hxs.select(xpath).extract()[0:1]
        item['Url'] = response.url
        return item

1条回答

网友

1楼 · 发布于 2024-09-30 01:19:31

这可以通过 Request 的 meta 参数来实现：在发起请求时把链接文字放进 meta，回调函数中再通过 response.meta 取出。Scrapy 官方文档中关于 Request.meta 的部分对此有说明。

将parse方法更改为：

def parse(self, response):
    """Request every linked page, passing the link's anchor text along.

    For each ``<a>`` matched on the page, the href is resolved against
    ``response.url`` and requested with ``parse_linkpage`` as callback.
    The anchor's text is attached as ``meta['a_of_the_link']`` so the
    callback can read it from ``response.meta``.
    """
    hxs = HtmlXPathSelector(response)
    anchors_xpath = '/html/body/div[1]/div[4]/div[2]/ul/li/a'
    # Read href and text relative to the same <a> element. Extracting them
    # with two independent queries and zipping the results pairs the wrong
    # text with a link as soon as one anchor has no (or several) text nodes.
    for anchor in hxs.select(anchors_xpath):
        hrefs = anchor.select('@href').extract()
        if not hrefs:
            # An anchor without an href gives nothing to follow.
            continue
        text = u''.join(anchor.select('text()').extract())
        yield Request(urljoin(response.url, hrefs[0]),
                      meta={'a_of_the_link': text},
                      headers={'User-Agent': USER_AGENT},
                      callback=self.parse_linkpage,
                      dont_filter=True)

然后您可以在回调函数 parse_linkpage 中通过 response.meta 访问它：

# Inside parse_linkpage: store the anchor text that parse() attached to the
# request through meta, in place of response.url.
item['Url'] = response.meta['a_of_the_link']

希望这有帮助

相关问题更多 >

编程相关推荐

热门问题

热门文章