I'm using BeautifulSoup — how do I get the link after a redirect?

I want to get the final link after the download link on an article page redirects.

For example: https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/

On that article page, there is the following download link: https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/yz5cw79mbn3a/ECNHOgoNYk0MIkEoFlUkFlY5Vj5WVSRQACVKfx8EOw8ReVs+FFs=

If you open this link directly, it does not redirect to the real download link; you have to open it from the article page.
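
A minimal way to see this (a sketch; what the server actually returns for a direct request without a referer is an assumption on my part):

import requests

article = 'https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/'
# the opaque token is copied from the download link on the article page
download = article + 'yz5cw79mbn3a/ECNHOgoNYk0MIkEoFlUkFlY5Vj5WVSRQACVKfx8EOw8ReVs+FFs='

# request the link directly, without following redirects;
# the response never points at the real file
r = requests.get(download, allow_redirects=False)
print(r.status_code, r.headers.get('Location'))

Here is my script so far: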

# coding=utf-8

import sys

import requests
from bs4 import BeautifulSoup


def urlopen(url):
    '''
    Use requests instead of urllib.request.urlopen.
    Return the page HTML as text.
    '''
    headers = {"User-Agent": "Mozilla/5.0"}
    r = requests.get(url, headers=headers)
    return r.text

def generate_pages(subTitle, fromPage, toPage):
    '''
    Return a list of category page URLs for the given page range.
    '''
    # allow the subject to be passed as "programming" or "/programming"
    if subTitle and not subTitle.startswith('/'):
        subTitle = '/' + subTitle
    pages = []
    if 0 < fromPage <= toPage:
        for i in range(fromPage, toPage + 1):
            pages.append('https://scanlibs.com/category/books' + subTitle + '/page/' + str(i))
    return pages
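
# e.g. generate_pages('/programming', 1, 2) would return (illustrative values):
# ['https://scanlibs.com/category/books/programming/page/1',
#  'https://scanlibs.com/category/books/programming/page/2']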



def get_book_sites_of_one_page(page):
    '''
    Collect the book page URLs listed on one category page.
    input: category page URL
    output: list of book page URLs
    '''
    html = urlopen(page)
    soup = BeautifulSoup(html, 'html.parser')
    # each book is linked twice (title and thumbnail), so take every other link
    linkList = soup.find('main').findAll('a', {'rel': 'bookmark'})
    bookSites = []
    for link in linkList[::2]:
        if 'href' in link.attrs:
            bookSites.append(link.attrs['href'])
    return bookSites


def get_book_urls(bookSite):
    '''
    Given a book page URL, find the download links on that page
    and return them as a list.
    '''
    bookURLs = []
    html = urlopen(bookSite)
    soup = BeautifulSoup(html, 'lxml')
    # download links also appear in duplicate pairs; take every other one
    linkList = soup.findAll("a", {"target": "_blank"})
    for link in linkList[::2]:
        if 'href' in link.attrs:
            bookURLs.append(link.attrs['href'])
    return bookURLs


def get_all_book_urls(fromPage=1, toPage=1, subTitle=''):
    bookSites = []
    bookURLs = []
    pages = generate_pages(subTitle,fromPage, toPage)

    for page in pages:
        bookSites.extend(get_book_sites_of_one_page(page))

    for bookSite in bookSites:
        bookURLs += get_book_urls(bookSite)

    for bookURL in bookURLs:
        print(bookURL)

    #with open(filename, 'w') as f:
    #    f.write(bookURLs)


def main():
    if len(sys.argv) == 4:
        '''
        python getUrl.py 1 100 programming
        from page 1 to page 100 in the subject "programming"
        '''
        fromPage = int(sys.argv[1])
        toPage = int(sys.argv[2])
        subTitle = str(sys.argv[3])
        get_all_book_urls(fromPage, toPage, subTitle)

    elif len(sys.argv) == 3:
        '''
        python getUrl.py 1 100
        from page 1 to page 100
        '''
        fromPage = int(sys.argv[1])
        toPage = int(sys.argv[2])
        subTitle = ''
        get_all_book_urls(fromPage, toPage, subTitle)

    elif len(sys.argv) == 2:
        '''
        python getUrl.py 10
        only fetch books on page 10
        '''
        fromPage = int(sys.argv[1])
        toPage = fromPage
        subTitle = ''
        get_all_book_urls(fromPage, toPage, subTitle)

    elif len(sys.argv) == 1:
        # default page range
        fromPage = 1
        toPage = 2
        subTitle = ''
        get_all_book_urls(fromPage, toPage, subTitle)

    else:
        print("Error: too many arguments")



if __name__ == '__main__':
    main()

Thanks for your help!


1 Answer

This site checks whether the Referer header is set when it processes the redirect. You can simply supply the original article URL as the referer in the request headers and bypass the check. You can also see that the referer is used as a URL parameter in the final download link.

import requests
from bs4 import BeautifulSoup

s = requests.Session()
url = 'https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/'
html = s.get(url).text
soup = BeautifulSoup(html, 'html.parser')

# the download anchor on the article page holds a relative link
relative_link = soup.find('a', {'id': 'download'})['href']
download_redirect_link = url + relative_link

# set the article page as the referer so the site allows the redirect
headers = {"referer": url}
r2 = requests.get(download_redirect_link, headers=headers)
print(r2.url)
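
If you also want to see the hop itself, requests keeps the followed redirects in r2.history, and you can read the raw Location header by turning off automatic redirects. A small follow-up sketch (assuming the site answers with an HTTP redirect rather than a meta refresh):

# each response requests followed on the way to the final URL
for hop in r2.history:
    print(hop.status_code, hop.url)

# or stop at the redirect and read the Location header directly
r3 = requests.get(download_redirect_link, headers=headers, allow_redirects=False)
print(r3.status_code, r3.headers.get('Location'))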

