如何让当前代码返回链接的 <a> 标签文本,而不是整个链接 URL?下面是我目前返回整个链接的写法,但我只想要 <a> 标签中的文本。
item ['Url'] = response.url
例如,对于 http://international.southwales.ac.uk/country/iran/en/ ,我想得到的是“伊朗”。
from scrapy.spider import BaseSpider
from project.items import QualificationItem
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from urlparse import urljoin
# Browser-style User-Agent string (Firefox 27 on Linux x86_64); it is passed
# explicitly as the 'User-Agent' header on the requests the spider creates.
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'
class recursiveSpider(BaseSpider):
    """Follow the links listed on the start page and scrape each linked page.

    ``parse`` extracts the hrefs from the list on the start page and
    schedules one request per link; ``parse_linkpage`` builds a
    ``QualificationItem`` from each response.
    """

    name = 'usw1'
    # One bare domain name per entry. The original list was missing the comma
    # between the two literals, so implicit string concatenation fused them
    # into a single bogus domain; the trailing '/' is dropped as well because
    # entries are domain names, not URLs.
    allowed_domains = [
        'international.southwales.ac.uk',
        'eu.southwales.ac.uk',
    ]
    start_urls = ['http://international.southwales.ac.uk/countries']

    def parse(self, response):
        """Yield a request for every link found in the start page's list.

        Each request is handled by ``parse_linkpage``.
        """
        hxs = HtmlXPathSelector(response)
        xpath = '/html/body/div[1]/div[4]/div[2]/ul/li/a/@href'
        for link in hxs.select(xpath).extract():
            # hrefs may be relative, so resolve them against the page URL.
            yield Request(urljoin(response.url, link),
                          headers={'User-Agent': USER_AGENT},
                          callback=self.parse_linkpage,
                          dont_filter=True)

    def parse_linkpage(self, response):
        """Build a ``QualificationItem`` from a linked page.

        ``Qualification`` holds at most one extracted ``<li>`` (the first one
        following the "Entry Requirements - Undergraduate" heading), as a
        list; ``Url`` holds the URL of the response.
        """
        hxs = HtmlXPathSelector(response)
        item = QualificationItem()
        xpath = (
            '\n'
            '//h4[normalize-space(.)="Entry Requirements - Undergraduate"]\n'
            '/following-sibling::ul/li\n'
        )
        # Slicing (rather than indexing) keeps this safe when nothing matches:
        # the result is then an empty list instead of an IndexError.
        item['Qualification'] = hxs.select(xpath).extract()[0:1]
        item['Url'] = response.url
        return item
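这可以通过 Request 的 meta 属性来实现,Scrapy 文档中对此有说明。将 parse 方法改为在创建 Request 时通过 meta 传入链接文本(例如 meta={'country': 链接文本}),然后就可以在回调方法(原回答中称为 parse_item,即上面代码中的 parse_linkpage)里通过 response.meta['country'] 访问它。希望这有帮助。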
相关问题 更多 >
编程相关推荐