所以我想知道我的getURLs
函数可能有什么问题。我试图从包含主体的字符串中获取所有URL。在
我的爬虫程序没有抓取任何内容,因为我的输入URL无效。在
# Get all URLs contained within the body string
def getURLs(body):
urls = []
tempArr = body.split("a href=")
index = 1
for part in tempArr:
if part[0] == '"':
while (part[index] != '"' and index < len(part)):
index += 1
if index < len(part):
urls.append(part[1:index-1])
index = 1
return urls
# Open file which contains input urls
with open("test_urls.txt","rU") as infile:
urls = [row.strip("\n") for row in infile]
class BackpageSpider(CrawlSpider):
name = 'backpage'
allowed_domains = ['backpage.com']
start_urls = urls
def parse(self,response):
#print response.url
if response.status < 600:
# all_links = response.xpath("//div[contains(@class,'cat')]/a/@href").extract()
#all the links FOR THE ESCORTS on whatever page we're on
todays_links = []
#all the links for today's date
backpage_date = backpage_date_today()
yesterday_date = backpage_date_yesterday()
if backpage_date in response.body:
todays_section = response.body.split(backpage_date)[1].split(yesterday_date)[0].decode('utf-8')
# todays_links = todays_section.xpath("//div[contains(@class,'cat')]/a/@href").extract
todays_links = getURLs(todays_section)
# for url in todays_links:
# todays_links.append(url)
# for url in all_links:
# if url in todays_section:
# todays_links.append(url)
for url in todays_links:
yield scrapy.Request(url,callback=self.parse_ad_into_content)####HERE
for url in set(response.xpath('//a[@class="pagination next"]/@href').extract()):
yield scrapy.Request(url,callback=self.parse)
else:
time.sleep(600)
yield scrapy.Request(response.url,callback=self.parse)
def parse_ad_into_content(self,response):
#ipdb.set_trace()
item = items.BackpageScrapeItem(
url=response.url,
backpage_id=response.url.split('.')[0].split('/')[2].encode('utf-8'),
text = response.body,
posting_body= response.xpath("//div[@class='postingBody']").extract()[0].encode('utf-8'),
date = datetime.utcnow()-timedelta(hours=5),
posted_date = response.xpath("//div[@class='adInfo']/text()").extract()[0].encode('utf-8'),
posted_age = response.xpath("//p[@class='metaInfoDisplay']/text()").extract()[0].encode('utf-8'),
posted_title = response.xpath("//div[@id='postingTitle']//h1/text()").extract()[0].encode('utf-8')
)
return item
网页是:http://grandisland.backpage.com/FemaleEscorts/?layout=date
目前没有回答
相关问题 更多 >
编程相关推荐