Looping over URLs when web-scraping tables with Selenium?

Posted 2024-07-07 07:09:00


I am trying to download tables from the Humane Society Legislative Fund website. The following code successfully pulls the data from one of the pages:

import time
import pandas as pd
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

browser = webdriver.Chrome(ChromeDriverManager().install())


browser.get('https://hslf.org/scorecards/2007-senate-midterm')
time.sleep(10)


html = browser.page_source

humane_sc_tables = pd.read_html(html)
humane_sc_data = humane_sc_tables[0]

I now need to loop through multiple URLs and export each page's results to a CSV file:

import time
import pandas as pd
from selenium import webdriver
from selenium.common import exceptions
from webdriver_manager.chrome import ChromeDriverManager

# browser = webdriver.Chrome(ChromeDriverManager().install())

URL_list = ["https://hslf.org/scorecards/2007-senate-midterm",
            "https://hslf.org/scorecards/2008-senate-final",
            "https://hslf.org/scorecards/2008-house-final",
            "https://hslf.org/scorecards/2009-senate-midterm",
            "https://hslf.org/scorecards/2009-house-midterm",
            "https://hslf.org/scorecards/2010-house-final",
            "https://hslf.org/scorecards/2010-senate-final",
            "https://hslf.org/scorecards/2011-house-midterm",
            "https://hslf.org/scorecards/2011-senate-midterm",
            "https://hslf.org/scorecards/2012-house-final",
            "https://hslf.org/scorecards/2012-senate-final",
            "https://hslf.org/scorecards/2013-house-midterm",
            "https://hslf.org/scorecards/2013-senate-midterm",
            "https://hslf.org/scorecards/2014-house-final",
            "https://hslf.org/scorecards/2014-senate-final",
            "https://hslf.org/scorecards/2015-house-midterm",
            "https://hslf.org/scorecards/2015-senate-midterm",
            "https://hslf.org/scorecards/2016-house-final",
            "https://hslf.org/scorecards/2016-senate-final",
            "https://hslf.org/scorecards/2017-house-midterm",
            "https://hslf.org/scorecards/2017-senate-midterm",
            "https://hslf.org/scorecards/2018-house-final",
            "https://hslf.org/scorecards/2018-senate-final"]

for url in URL_list:
    browser = webdriver.Chrome(ChromeDriverManager().install())
    time.sleep(5)

    print("Current session is {}".format(browser.session_id))
    browser.quit()
    try:
        browser.get(url)
    except exceptions.InvalidSessionIdException as e:
        print(e.message)

    html = browser.page_source
    humane_sc_tables = pd.read_html(html)
    humane_sc_data = humane_sc_tables[0]
    humane_sc_data = humane_sc_data.drop(humane_sc_data.columns[[0,5,7]], axis = 1)
    browser.close()
    humane_sc_data.to_csv(f'humane_scores{url}.csv')

However, I get the following error:

MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=55494): Max retries exceeded with url: /session/7e430735b2d015147dc20049f3b78b10/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f9c018aa210>: Failed to establish a new connection: [Errno 61] Connection refused'))

Please advise.


2 Answers

Got it working. See the code below:

import time
import pandas as pd
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

'''

Note:
The link https://hslf.org/scorecards/2007-house-midterm doesn't work 
and is therefore excluded

'''
URL_list = ["https://hslf.org/scorecards/2007-senate-midterm",
            "https://hslf.org/scorecards/2008-senate-final",
            "https://hslf.org/scorecards/2008-house-final",
            "https://hslf.org/scorecards/2009-senate-midterm",
            "https://hslf.org/scorecards/2009-house-midterm",
            "https://hslf.org/scorecards/2010-house-final",
            "https://hslf.org/scorecards/2010-senate-final",
            "https://hslf.org/scorecards/2011-house-midterm",
            "https://hslf.org/scorecards/2011-senate-midterm",
            "https://hslf.org/scorecards/2012-house-final",
            "https://hslf.org/scorecards/2012-senate-final",
            "https://hslf.org/scorecards/2013-house-midterm",
            "https://hslf.org/scorecards/2013-senate-midterm",
            "https://hslf.org/scorecards/2014-house-final",
            "https://hslf.org/scorecards/2014-senate-final",
            "https://hslf.org/scorecards/2015-house-midterm",
            "https://hslf.org/scorecards/2015-senate-midterm",
            "https://hslf.org/scorecards/2016-house-final",
            "https://hslf.org/scorecards/2016-senate-final",
            "https://hslf.org/scorecards/2017-house-midterm",
            "https://hslf.org/scorecards/2017-senate-midterm",
            "https://hslf.org/scorecards/2018-house-final",
            "https://hslf.org/scorecards/2018-senate-final"]

for url in URL_list:
    browser = webdriver.Chrome(ChromeDriverManager().install())
    browser.get(url)
    time.sleep(10)
    
    html = browser.page_source
    tables = pd.read_html(html)
    tables = pd.concat(tables)
    
    data = tables.iloc[:, [0,2]]
    
    browser.close()
    browser.quit()
    
    # drop the "https://hslf.org/scorecards/" prefix (28 characters)
    filename = url[28:].replace("/","_")
    data.to_csv(filename+'.csv', index=False)
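As a side note, the `url[28:]` slice only works while every URL shares the same 28-character prefix. Deriving the slug from the URL path with `urllib.parse` is less brittle; `scorecard_filename` below is a hypothetical helper for illustration, not part of the original answer:

```python
from urllib.parse import urlparse

def scorecard_filename(url):
    """Build a CSV filename from the last path segment of a scorecard URL."""
    slug = urlparse(url).path.rstrip("/").rsplit("/", 1)[-1]
    return slug + ".csv"
```

For example, `scorecard_filename("https://hslf.org/scorecards/2007-senate-midterm")` returns `"2007-senate-midterm.csv"` regardless of the hostname or prefix length.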

Your call to browser.quit() below:

print("Current session is {}".format(browser.session_id))
browser.quit()
try:
    browser.get(url)
except exceptions.InvalidSessionIdException as e:
    print(e.message)

quit() is a webdriver command which calls the driver.dispose method, which in turn closes all the browser windows and terminates the WebDriver session.

So it appears you are closing the browser instance before the browser.get() call that actually retrieves the content. Try moving browser.quit() to the end of the loop instead, so that a new session is created on the next iteration.
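To make the failure mode concrete without launching a real browser, here is a toy stand-in for the session lifecycle. `FakeBrowser` is an illustration only, not Selenium's API; it just models the one property that matters here, that quit() invalidates the session:

```python
class FakeBrowser:
    """Toy model of a WebDriver session: quit() invalidates it."""
    def __init__(self):
        self.alive = True

    def get(self, url):
        if not self.alive:
            # mirrors Selenium's InvalidSessionIdException
            raise RuntimeError("invalid session id")
        return "<html>" + url + "</html>"

    def quit(self):
        self.alive = False

# Ordering from the question: quit() before get() -> the request fails.
browser = FakeBrowser()
browser.quit()
try:
    browser.get("https://hslf.org/scorecards/2007-senate-midterm")
    failed = False
except RuntimeError:
    failed = True

# Corrected ordering: get() first, quit() at the end of the iteration.
browser = FakeBrowser()
page = browser.get("https://hslf.org/scorecards/2007-senate-midterm")
browser.quit()
```

With the corrected ordering, `page` holds the retrieved content and `failed` from the buggy ordering is True, which is exactly the pattern behind the MaxRetryError in the question.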
