这个脚本使用BeautifulSoup解析网站特定页面上的所有pdf文档。脚本成功下载了一个文件,但不会下载返回的所有文件。我需要帮助使这个下载所有的pdf文件,我已经解析
我做过研究,但没有找到答案
import requests
from bs4 import BeautifulSoup
import html5lib
import lxml
#RFP_Import = ('http://www.staffordmsd.org/cms/One.aspx? portalId=895956&pageId=1606144')
RFP_Import = ('http://www.staffordmsd.org/departments/business_operations/bids_and_proposals')
place_hoder = ('http://www.staffordmsd.org')
def get_pdf_links():
r = requests.get(RFP_Import)
soup= BeautifulSoup(r.content, 'html5lib')
links = soup.find_all('a')
pdf_links = [place_hoder + link['href'] for link in links if link['href'].endswith('pdf')]
return pdf_links
def download_pdf_links (pdf_links):
for link in pdf_links:
file_name = link.split("/")[-1]
print ("Downloading file:%s"%file_name)
r = requests.get(link, stream = True)
with open(file_name, 'wb') as f:
for chunk in r.iter_content(chunk_size = 1024*1024):
if chunk:
f.write(chunk)
print ('%s downloaded!\n'%file_name)
print ('all RFPs downloaded!')
return
if __name__ == "__main__":
pdf_links = get_pdf_links()
download_pdf_links(pdf_links)
成功下载第一个pdf文档,然后停止
import requests
from bs4 import BeautifulSoup
import html5lib
import lxml
#RFP_Import = ('http://www.staffordmsd.org/cms/One.aspx? portalId=895956&pageId=1606144')
RFP_Import = ('http://www.staffordmsd.org/departments/business_operations/bids_and_proposals')
place_hoder = ('http://www.staffordmsd.org')
def get_pdf_links():
r = requests.get(RFP_Import)
soup= BeautifulSoup(r.content, 'html5lib')
links = soup.find_all('a')
pdf_links = [place_hoder + link['href'] for link in links if link['href'].endswith('pdf')]
return pdf_links
def download_pdf_links (pdf_links):
for link in pdf_links:
file_name = link.split("/")[-1]
print ("Downloading file:%s"%file_name)
r = requests.get(link, stream = True)
with open(file_name, 'wb') as f:
for chunk in r.iter_content(chunk_size = 1024*1024):
if chunk:
f.write(chunk)
print ('%s downloaded!\n'%file_name)
print ('all RFPs downloaded!')
return
if __name__ == "__main__":
pdf_links = get_pdf_links()
download_pdf_links(pdf_links)
在
download_pdf_links()
内部,return
未对齐。它应该与for
对齐。否则,它是for
循环的一部分,函数在第一次迭代后终止这对于
print ('all RFPs downloaded!')
可能也是如此。我猜您希望在for循环结束时,在您浏览了所有链接之后,打印出来在
download_pdf_link
中,您正在循环内使用return
,循环第一次迭代后返回,停止下载文件。您需要在循环结束后返回,方法是将其放在与循环开始相同的缩进上,如下所示:相关问题 更多 >
编程相关推荐