从正文字符串获取URL的正则表达式

2024-09-28 01:26:43 发布

您现在位置:Python中文网/ 问答频道 /正文

我想知道我的 getURLs 函数可能有什么问题。我试图从包含页面正文(body)的字符串中提取所有 URL。

我的爬虫程序没有抓取到任何内容,因为它得到的输入 URL 是无效的。

# Get all URLs contained within the body string
def getURLs(body):
    """Return the double-quoted href values of every ``a href="..."`` in *body*.

    The text is split on the literal marker ``a href=``; for each piece that
    follows a marker and starts with a double quote, the text between that
    quote and the next double quote is collected.

    Args:
        body: Markup text to scan.

    Returns:
        A list of the extracted href values, in order of appearance.  Values
        are returned verbatim (they may be relative URLs).  Hrefs that are not
        double-quoted, or whose closing quote is missing, are skipped.
    """
    urls = []
    # The first piece is the text *before* the first marker, so it can never
    # be an href value; only the pieces after a marker are candidates.
    for part in body.split('a href=')[1:]:
        # startswith() is safe on an empty piece, unlike part[0].
        if not part.startswith('"'):
            continue
        closing = part.find('"', 1)
        if closing == -1:
            # Unterminated attribute value: nothing reliable to extract.
            continue
        # Slice up to (not including) the closing quote so the full URL is kept.
        urls.append(part[1:closing])
    return urls


# Open file which contains input urls (one URL per line).
# The default text mode is used instead of "rU": the "U" flag is deprecated
# and raises ValueError on Python 3.11+.  Surrounding whitespace (including a
# stray "\r") is stripped and blank lines are skipped so that no empty or
# padded string ends up as a start URL.
with open("test_urls.txt") as infile:
    urls = [row.strip() for row in infile if row.strip()]

class BackpageSpider(CrawlSpider):
    """Spider that follows today's ad links on listing pages and scrapes each ad.

    Listing pages are handled by ``parse``; every ad page is handled by
    ``parse_ad_into_content``, which builds a ``BackpageScrapeItem``.

    NOTE(review): this class overrides ``parse`` on a ``CrawlSpider`` and
    defines no ``rules``.  The Scrapy documentation warns that ``CrawlSpider``
    uses ``parse`` for its own rule handling; confirm that a plain ``Spider``
    base class is not what was intended.
    """
    name = 'backpage'
    allowed_domains = ['backpage.com']
    start_urls = urls

    def parse(self, response):
        """Yield requests for today's ads and for the next listing page.

        The section of the raw body between today's date marker and
        yesterday's date marker is scanned with ``getURLs``; each href found
        there is requested with ``parse_ad_into_content`` as callback.  Every
        distinct "pagination next" link is requested with ``parse`` again.
        """
        if response.status < 600:
            # Links that belong to today's date section (empty if the marker
            # for today is not present on this page).
            todays_links = []

            backpage_date = backpage_date_today()
            yesterday_date = backpage_date_yesterday()

            if backpage_date in response.body:
                # Keep only the markup after today's marker and before
                # yesterday's marker.
                todays_section = response.body.split(backpage_date)[1].split(yesterday_date)[0].decode('utf-8')
                todays_links = getURLs(todays_section)

            for url in todays_links:
                # hrefs are taken verbatim from the markup and may be relative;
                # resolve them against the page URL so the request is valid.
                yield scrapy.Request(response.urljoin(url), callback=self.parse_ad_into_content)

            for url in set(response.xpath('//a[@class="pagination next"]/@href').extract()):
                yield scrapy.Request(response.urljoin(url), callback=self.parse)

        else:
            # NOTE(review): with Scrapy's default HttpError handling,
            # unsuccessful responses normally do not reach the callback at
            # all - confirm this branch is reachable in the project's setup.
            # NOTE(review): time.sleep() blocks the whole crawler (Scrapy runs
            # callbacks in a single-threaded event loop); consider a
            # download-delay / retry setting instead.
            time.sleep(600)
            # dont_filter=True: this URL has already been requested, so the
            # duplicate filter could otherwise silently drop the retry.
            yield scrapy.Request(response.url, callback=self.parse, dont_filter=True)

    def parse_ad_into_content(self, response):
        """Build a ``BackpageScrapeItem`` from a single ad page.

        Raises:
            IndexError: if any of the expected page elements is missing
                (each XPath result is indexed with ``[0]``).
        """
        def first(xpath):
            # First match of the XPath, encoded as UTF-8 bytes.
            return response.xpath(xpath).extract()[0].encode('utf-8')

        item = items.BackpageScrapeItem(
            url=response.url,
            # Third "/"-separated piece of the URL text before its first ".".
            backpage_id=response.url.split('.')[0].split('/')[2].encode('utf-8'),
            text=response.body,
            posting_body=first("//div[@class='postingBody']"),
            # Current UTC time shifted back five hours.
            # NOTE(review): presumably a fixed US Eastern offset - confirm.
            date=datetime.utcnow() - timedelta(hours=5),
            posted_date=first("//div[@class='adInfo']/text()"),
            posted_age=first("//p[@class='metaInfoDisplay']/text()"),
            posted_title=first("//div[@id='postingTitle']//h1/text()"),
        )
        return item

网页是:http://grandisland.backpage.com/FemaleEscorts/?layout=date


Tags: in, url, for, date, index, response, extract, body

热门问题