Scrapy 输出问题

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import request
from scrapy.selector import HtmlXPathSelector

from texashealth.items import TexashealthItem


class texashealthspider(CrawlSpider):
    """Crawl the jobs.texashealth.org search pages and collect job postings.

    Starting from the search URL, every link whose URL matches ``search/``
    is followed and handed to :meth:`parse_health`, which builds one
    ``TexashealthItem`` per row of the results table.
    """

    name = "texashealth"
    allowed_domains = ['jobs.texashealth.org']
    start_urls = [
        'http://jobs.texashealth.org/search/?&q=&title=Filter%3A%20title&facility=Filter%3A%20facility&location=Filter%3A%20city&date=Filter%3A%20date',
    ]

    rules = (
        Rule(
            SgmlLinkExtractor(allow=("search/",)),
            callback="parse_health",
            follow=True,
        ),
        # Rule(SgmlLinkExtractor(allow=("startrow=\d",)),callback="parse_health",follow=True),
    )

    def parse_health(self, response):
        """Extract the job postings listed on a search-results page.

        Args:
            response: The downloaded search-results page.

        Returns:
            A list of ``TexashealthItem`` objects, one per table row. Each
            field holds the list returned by ``extract()`` and is empty
            when the row has no matching ``<span>``.
        """
        hxs = HtmlXPathSelector(response)

        # Iterate over table rows, not individual cells: the spans holding
        # title, shift type and location sit in different <td> cells of the
        # same <tr>, so looping over cells yields mostly-empty items.
        # NOTE(review): the "searchresults" table id is taken from the
        # suggested fix for this page -- confirm against the live markup.
        rows = hxs.select('//table[@id="searchresults"]/tbody/tr')

        items = []
        for row in rows:
            item = TexashealthItem()
            # The leading ".//" keeps each query relative to the current
            # row while still reaching spans nested inside its cells.
            item['title'] = row.select(
                './/span[@class="jobTitle"]/a/text()').extract()
            item['link'] = row.select(
                './/span[@class="jobTitle"]/a/@href').extract()
            item['shifttype'] = row.select(
                './/span[@class="jobShiftType"]/text()').extract()
            item['location'] = row.select(
                './/span[@class="jobLocation"]/text()').extract()
            items.append(item)

        # Debug output kept from the original; the call form prints the
        # same text on Python 2 and is also valid on Python 3.
        print(items)
        return items

1条回答

网友

1楼 · 发布于 2024-09-29 01:37:44

您应该遍历表格的行（tr 元素），而不是表格的单元格（td 元素）。

我建议您使用 hxs.select('//table[@id="searchresults"]/tbody/tr') 选出每一行，然后在每次循环迭代中使用以 .// 开头的相对 XPath（例如 .//span...）。这样查询只会在当前行内部进行，同一职位的标题、链接、班次和地点才会落在同一个 item 中：

titles=hxs.select('//table[@id="searchresults"]/tbody/tr')
items = []
for titles in titles:
    item['title']=titles.select('.//span[@class="jobTitle"]/a/text()').extract()
    item['link']=titles.select('.//span[@class="jobTitle"]/a/@href').extract()
    item['shifttype']=titles.select('.//span[@class="jobShiftType"]/text()').extract()
    item['location']=titles.select('.//span[@class="jobLocation"]/text()').extract()
    items.append(item)
return items

相关问题更多 >

编程相关推荐

热门问题

热门文章