Fetching files from a web page with Selenium and Requests in Python 3

Posted 2024-09-24 16:34:18


I'm hoping to get some help with a problem I've run into. I'm fairly new to Python and have been working through Al Sweigart's "Automate the Boring Stuff with Python", trying to streamline some very tedious work.

Here's an overview of the problem: I'm trying to access a web page and use the Requests and BeautifulSoup modules to parse the site, grab the URLs pointing to the files I need, and then download those files. The process works great except for one small issue... the page has a ReportDropDown option that filters the results displayed. The problem is that even though the page results update with new information, the page URL doesn't change, so my requests.get() only pulls the information for the default filter.

So, to get around that, I tried using Selenium to change the report selection... that also works, except I can't feed the Requests module from the Selenium browser instance I have open.

So it looks like I can use Requests and BeautifulSoup to get the information for the "default" page dropdown filter, and I can use Selenium to change the ReportDropDown option, but I can't combine the two.


Part 1:

#! python3
import os, requests, bs4
os.chdir('C:\\Standards')
standardURL = 'http://www.nerc.net/standardsreports/standardssummary.aspx'
res = requests.get(standardURL)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, 'html.parser')

# this is the url pattern when inspecting the elements on the page
linkElems = soup.select('.style97 a')

# I wanted to save the hyperlinks into a list
splitStandards = []
for link in range(len(linkElems)):
    splitStandards.append(linkElems[link].get('href'))

# Next, I wanted to create the pdf's and copy them locally
print(' STARTING STANDARDS DOWNLOAD '.center(80, '=') + '\n')
for item in splitStandards:
    j = os.path.basename(item)      # BAL-001-2.pdf, etc...
    f = open(j, 'wb')
    ires = requests.get(item)
    # http://www.nerc.com/pa/Stand/Reliability%20Standards/BAL-001-2.pdf
    ires.raise_for_status()
    for chunk in ires.iter_content(1000000):    # 1MB chunks
        f.write(chunk)
    print('Completing download for: ' + str(j) + '.')
    f.close()
print()
print(' STANDARDS DOWNLOAD COMPLETE '.center(80, '='))

This pattern works great, except that I can't change the ReportDropDown selection and then use Requests to grab the new page information. I've tinkered with requests.get(), requests.post(url, data={}), selenium-requests, etc...
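For context, this is roughly what the requests.post() attempt looked like, assuming the page is a standard ASP.NET WebForms postback (the hidden-field handling is generic ASP.NET; the exact form field name and value for ReportDropDown are assumptions that would have to be read out of the page source):

#! python3
# A sketch of a requests-only postback attempt, assuming a standard ASP.NET form.
# The dropdown's form field name/value below are assumptions, not verified.
import requests, bs4

standardURL = 'http://www.nerc.net/standardsreports/standardssummary.aspx'
session = requests.Session()
res = session.get(standardURL)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, 'html.parser')

# ASP.NET pages keep their state in hidden inputs (__VIEWSTATE, __EVENTVALIDATION, ...);
# they have to be echoed back in the POST or the server ignores the change.
payload = {field.get('name'): field.get('value', '')
           for field in soup.select('input[type="hidden"]') if field.get('name')}
payload['ReportDropDown'] = 'Standards Subject to Future Enforcement'   # assumed field name/value

res = session.post(standardURL, data=payload)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, 'html.parser')
linkElems = soup.select('.style97 a')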


Part 2:

Using Selenium seemed simple enough, but I couldn't get requests.get() to pull from the correct browser instance. Also, I had to create a Firefox profile (seleniumDefault) with an about:config change (Windows+R, firefox.exe -p). Update: the about:config change was temporary: browser.tabs.remote.autostart = true

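Roughly, the Selenium side of the attempt looked like this (a minimal sketch; the profile path is the same placeholder as below, and the preference can also be set programmatically instead of editing about:config by hand):

#! python3
# A minimal sketch of driving the page with Selenium; the profile path is a placeholder.
from selenium import webdriver

# Load the pre-made profile that already carries the about:config change...
fp = webdriver.FirefoxProfile('C:\\pathto\\Firefox\\Profiles\\seleniumDefault')
# ...or set the (temporary) preference programmatically instead:
# fp.set_preference('browser.tabs.remote.autostart', True)
browser = webdriver.Firefox(fp)
browser.get('http://www.nerc.net/standardsreports/standardssummary.aspx')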

So, my final question is: how do I select each page and pull the appropriate data for each one?

My preference would be to use only the requests and bs4 modules, but if I'm going to use Selenium, then how do I feed Requests from the Selenium browser instance I have open?

I've tried to be as thorough as I can, and I'm still fairly new to Python, so any help would be greatly appreciated. Also, since I'm still learning a lot of this, any beginner-to-intermediate level explanations would be awesome, thanks!

=============================================================

Thanks again for the help, it got me past the wall that was blocking me. Here's the final product... I had to add some sleep statements so that everything would load completely before grabbing the page information.

Final revised version:

#! python3

# _nercTest.py - Opens the nerc.net website and pulls down all
# pdf's for the present, future, and inactive standards.

import os, requests, bs4, time, datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select

os.chdir('C:\\Standards')

def nercStandards(standardURL):
    logFile = open('_logFile.txt', 'w')
    logFile.write('Standard\t\tHyperlinks or Errors\t\t' +
                  str(datetime.datetime.now().strftime("%m-%d-%Y %H:%M:%S")) + '\n\n')
    logFile.close()
    fp = webdriver.FirefoxProfile('C:\\pathto\\Firefox\\Profiles\\seleniumDefault')
    browser = webdriver.Firefox(fp)
    wait = WebDriverWait(browser, 10)

    currentOption = 'Mandatory Standards Subject to Enforcement'
    futureOption = 'Standards Subject to Future Enforcement'
    inactiveOption = 'Inactive Reliability Standards'

    dropdownList = [currentOption, futureOption, inactiveOption]

    print()
    print(' STARTING STANDARDS DOWNLOAD '.center(80, '=') + '\n')
    for option in dropdownList:
        standardName = []   # Capture all the standard names accurately
        standardLink = []   # Capture all the href links for each standard
        standardDict = {}   # combine the standardName and standardLink into a dictionary 
        browser.get(standardURL)
        dropdown = Select(browser.find_element_by_id("ReportDropDown"))
        dropdown.select_by_visible_text(option)
        wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, 'div > span[class="style12"]'), option))

        time.sleep(3)   # Needed for the 'inactive' page to completely load consistently
        page_source = browser.page_source
        soup = bs4.BeautifulSoup(page_source, 'html.parser')
        soupElems = soup.select('.style97 a')

        # standardLink list generated here
        for link in range(len(soupElems)):
            standardLink.append(soupElems[link].get('href'))
            # http://www.nerc.com/pa/Stand/Reliability%20Standards/BAL-001-2.pdf

        # standardName list generated here
        if option == currentOption:
            print(' Mandatory Standards Subject to Enforcement '.center(80, '.') + '\n')
            currentElems = soup.select('.style99 span[class="style30"]')
            for currentStandard in range(len(currentElems)):
                standardName.append(currentElems[currentStandard].getText())
                # BAL-001-2
        elif option == futureOption:
            print()
            print(' Standards Subject to Future Enforcement '.center(80, '.') + '\n')
            futureElems = soup.select('.style99 span[class="style30"]')
            for futureStandard in range(len(futureElems)):
                standardName.append(futureElems[futureStandard].getText())
                # COM-001-3
        elif option == inactiveOption:
            print()
            print(' Inactive Reliability Standards '.center(80, '.') + '\n')
            inactiveElems = soup.select('.style104 font[face="Verdana"]')
            for inactiveStandard in range(len(inactiveElems)):
                standardName.append(inactiveElems[inactiveStandard].getText())
                # BAL-001-0

        # if number of names and links match, then create key:value pairs in standardDict
        if len(standardName) == len(standardLink):
            for x in range(len(standardName)):
                standardDict[standardName[x]] = standardLink[x]
        else:
            print('Error: items in standardName and standardLink are not equal!')
            logFile = open('_logFile.txt', 'a')
            logFile.write('\nError: items in standardName and standardLink are not equal!\n')
            logFile.close()

        # URL correction for PRC-005-1b
        # if 'PRC-005-1b' in standardDict:
        #     standardDict['PRC-005-1b'] = 'http://www.nerc.com/files/PRC-005-1.1b.pdf'

        for k, v in standardDict.items():
            logFile = open('_logFile.txt', 'a')
            f = open(k + '.pdf', 'wb')
            ires = requests.get(v)
            try:
                ires.raise_for_status()
                logFile.write(k + '\t\t' + v + '\n')
                # only write the pdf once we know the download succeeded
                for chunk in ires.iter_content(1000000):    # 1MB chunks
                    f.write(chunk)
            except Exception as exc:
                print('\nThere was a problem on %s: \n%s' % (k, exc))
                logFile.write('There was a problem on %s: \n%s\n' % (k, exc))
            f.close()
            logFile.close()
            print(k + ': \n\t' + v)
    print()
    print(' STANDARDS DOWNLOAD COMPLETE '.center(80, '='))

nercStandards('http://www.nerc.net/standardsreports/standardssummary.aspx')

2 Answers

@HenryM is correct, except that before you read .page_source and pass it to BeautifulSoup for further parsing, you need to make sure the data you want has actually been loaded. For that, use the WebDriverWait class.

For example, after you select the "Standards Filed and Pending Regulatory Approval" option, you need to wait for the report header to update; that indicates the new results have loaded. Something along these lines:

from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select

# ...

wait = WebDriverWait(browser, 10)

option_text = "Standards Filed and Pending Regulatory Approval" 

# select the dropdown value
dropdown = Select(browser.find_element_by_id("ReportDropDown"))
dropdown.select_by_visible_text(option_text)

# wait for results to be loaded
wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, "#panel5 > div > span"), option_text))

soup = BeautifulSoup(browser.page_source,'html.parser')
# TODO: parse the results

Also note the use of the Select class to manipulate the dropdown.

Once you've used Selenium to click the buttons and finish the work, you need to hand the page over to BeautifulSoup:

    page_source = browser.page_source
    link_soup = bs4.BeautifulSoup(page_source,'html.parser')
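From there, link_soup can be queried exactly like the soup in the requests-only version; for example, reusing the selector from the question:

    # reusing the question's selector to collect the pdf links
    links = [a.get('href') for a in link_soup.select('.style97 a')]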
