无法使用此python脚本下载多个文件

2024-09-30 16:33:13 发布

您现在位置:Python中文网/ 问答频道 /正文

这个脚本使用BeautifulSoup解析网站特定页面上的所有pdf文档。脚本成功下载了一个文件,但不会下载返回的所有文件。我需要帮助使这个下载所有的pdf文件,我已经解析

我做过研究,但没有找到答案

import requests
from bs4 import BeautifulSoup 
import html5lib
import lxml

#RFP_Import = ('http://www.staffordmsd.org/cms/One.aspx?    portalId=895956&pageId=1606144')
RFP_Import =     ('http://www.staffordmsd.org/departments/business_operations/bids_and_proposals')
place_hoder = ('http://www.staffordmsd.org')

def get_pdf_links():
    r = requests.get(RFP_Import)
    soup= BeautifulSoup(r.content, 'html5lib')
    links = soup.find_all('a')
    pdf_links = [place_hoder + link['href'] for link in links if     link['href'].endswith('pdf')]
    return pdf_links



def download_pdf_links (pdf_links):
    for link in pdf_links:
        file_name = link.split("/")[-1]
        print ("Downloading file:%s"%file_name)
        r = requests.get(link, stream = True)
        with open(file_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size = 1024*1024):
                if chunk:
                    f.write(chunk)
        print ('%s downloaded!\n'%file_name)
        print ('all RFPs downloaded!')
        return 

if __name__ == "__main__":
        pdf_links = get_pdf_links()
        download_pdf_links(pdf_links)

成功下载第一个pdf文档,然后停止

import requests
from bs4 import BeautifulSoup 
import html5lib
import lxml

#RFP_Import = ('http://www.staffordmsd.org/cms/One.aspx?       portalId=895956&pageId=1606144')
RFP_Import =     ('http://www.staffordmsd.org/departments/business_operations/bids_and_proposals')
place_hoder = ('http://www.staffordmsd.org')

def get_pdf_links():
    r = requests.get(RFP_Import)
    soup= BeautifulSoup(r.content, 'html5lib')
    links = soup.find_all('a')
    pdf_links = [place_hoder + link['href'] for link in links if     link['href'].endswith('pdf')]
return pdf_links



def download_pdf_links (pdf_links):
    for link in pdf_links:
        file_name = link.split("/")[-1]
        print ("Downloading file:%s"%file_name)
        r = requests.get(link, stream = True)
        with open(file_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size = 1024*1024):
                if chunk:
                    f.write(chunk)
        print ('%s downloaded!\n'%file_name)
        print ('all RFPs downloaded!')
        return 

if __name__ == "__main__":
        pdf_links = get_pdf_links()
        download_pdf_links(pdf_links)

Tags: nameorgimporthttpgetpdfwwwlink
2条回答

download_pdf_links()内部,return未对齐。它应该与for对齐。否则,它是for循环的一部分,函数在第一次迭代后终止

这对于print ('all RFPs downloaded!')可能也是如此。我猜您希望在for循环结束时,在您浏览了所有链接之后,打印出来

download_pdf_link中,您正在循环内使用return,循环第一次迭代后返回,停止下载文件。您需要在循环结束后返回,方法是将其放在与循环开始相同的缩进上,如下所示:

def download_pdf_links (pdf_links):
    for link in pdf_links:
        file_name = link.split("/")[-1]
        print ("Downloading file:%s"%file_name)
        r = requests.get(link, stream = True)
        with open(file_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size = 1024*1024):
                if chunk:
                    f.write(chunk)
        print ('%s downloaded!\n'%file_name)
    # Un-indented so it happens after the loop finishes.
    print ('all RFPs downloaded!')
    return 

相关问题 更多 >