从python请求的txt文件中的链接获取所有分页URL的列表

2024-09-28 01:32:06 发布

您现在位置:Python中文网/ 问答频道 /正文

嗨,伙计们定义了一个函数,从python的txt文件中的链接中获取底部所有分页的url的列表。在

这是我需要做的一个例子。在

输入链接

http://www.apartmentguide.com/apartments/Alabama/Hartselle/

期望输出

^{pr2}$

以限制每个输入Url的限制。在

这是我目前为止编写的函数,但它不起作用我也不擅长Python。在

import requests
#from bs4 import BeautifulSoup
from scrapy import Selector as Se
import urllib2


lists = open("C:\Users\Administrator\Desktop\\3.txt","r")
read_list = lists.read()
line = read_list.split("\n")


def get_links(line):
    for each in line:
        r = requests.get(each)
        sel = Se(text=r.text, type="html")
        next_ = sel.xpath('//a[@class="next sprite"]//@href').extract()
        for next_1 in next_:
            next_2 = "http://www.apartmentguide.com"+next_1
            print next_2
        get_links(next_1)

get_links(line)

Tags: 函数fromimporttxtcomhttpreadget
2条回答

下面是两种方法。在

import mechanize

import requests
from bs4 import BeautifulSoup, SoupStrainer
import urlparse

import pprint

#  Mechanize  
br = mechanize.Browser()

def get_links_mechanize(root):
    links = []
    br.open(root)

    for link in br.links():
        try:
            if dict(link.attrs)['class'] == 'page':
                links.append(link.absolute_url)
        except:
            pass
    return links


#  Requests / BeautifulSoup / urlparse  
def get_links_bs(root):
    links = []
    r = requests.get(root)

    for link in BeautifulSoup(r.text, parse_only=SoupStrainer('a')):
        if link.has_attr('href') and link.has_attr('class') and 'page' in link.get('class'):
            links.append(urlparse.urljoin(root, link.get('href')))

    return links


#with open("C:\Users\Administrator\Desktop\\3.txt","r") as f:
#    for root in f:
#        links = get_links(root) 
#        # <Do something with links>
root = 'http://www.apartmentguide.com/apartments/Alabama/Hartselle/'

print "Mech:"
pprint.pprint( get_links_mechanize(root) )
print "Requests/BS4/urlparse:"
pprint.pprint( get_links_bs(root) )

一种使用mechanize使用url会更聪明一些,但速度要慢得多,而且可能会过度依赖于你正在做的其他事情。在

另一个使用requests来获取页面(urllib2就足够了),BeautifulSoup来解析标记,而{}则从你列出的页面中的相对url中形成绝对url。在

请注意,这两个函数都返回以下列表:

^{pr2}$

有重复的。你可以通过改变

return links

return list(set(links))

不管你选择什么方法。在

编辑:

我注意到上面的函数只返回到第2-5页的链接,而您必须浏览这些页面才能看到实际上有10个页面。在

一种完全不同的方法是从“根”页面中获取结果的数量,然后预测将产生多少个页面,然后从中构建链接。在

由于每页有20个结果,计算出多少页是很简单的,考虑一下:

import requests, re, math, pprint

def scrape_results(root):
    links = []
    r = requests.get(root)

    mat = re.search(r'We have (\d+) apartments for rent', r.text)
    num_results = int(mat.group(1))                     # 182 at the moment
    num_pages = int(math.ceil(num_results/20.0))        # ceil(182/20) => 10

    # Construct links for pages 1-10
    for i in range(num_pages):
        links.append("%s?page=%d" % (root, (i+1)))

    return links

pprint.pprint(scrape_results(root))

这将是3种方法中最快的,但可能更容易出错。在

编辑2

可能是这样的:

import re, math, pprint
import requests, urlparse
from bs4 import BeautifulSoup, SoupStrainer

def get_pages(root):
    links = []
    r = requests.get(root)

    mat = re.search(r'We have (\d+) apartments for rent', r.text)
    num_results = int(mat.group(1))                     # 182 at the moment
    num_pages = int(math.ceil(num_results/20.0))        # ceil(182/20) => 10

    # Construct links for pages 1-10
    for i in range(num_pages):
        links.append("%s?page=%d" % (root, (i+1)))

    return links

def get_listings(page):
    links = []
    r = requests.get(page)

    for link in BeautifulSoup(r.text, parse_only=SoupStrainer('a')):
        if link.has_attr('href') and link.has_attr('data-listingid') and 'name' in link.get('class'):
            links.append(urlparse.urljoin(root, link.get('href')))

    return links

root='http://www.apartmentguide.com/apartments/Alabama/Hartselle/'
listings = []
for page in get_pages(root):
    listings += get_listings(page)

pprint.pprint(listings)
print(len(listings))

对于Re我不确定,所以尝试了xpath。在

links = open("C:\Users\ssamant\Desktop\Anida\Phase_II\Apartmentfinder\\2.txt","r")
read_list = links.read()
line = read_list.split("\n")

for each in line:
    lines = []
    r = requests.get(each)
    sel = Selector(text=r.text,type="html")
    mat = sel.xpath('//h1//strong/text()').extract()
    mat = str(mat)
    mat1 = mat.replace(" apartments for rent']","")
    mat2 = mat1.replace("[u'","")
    mat3 = int(mat2)
    num_pages = int(math.ceil(mat3/20.0))
    for i in range(num_pages):
        lines.append("%s/Page%d" % (each, (i+1)))
    with open('C:\Users\ssamant\Desktop\Anida\Phase_II\Apartmentfinder\\test.csv', 'ab') as f:
        writer = csv.writer(f)
        for val in lines:
            writer.writerow([val])

相关问题 更多 >

    热门问题