尝试使用 BeautifulSoup 灵活解析公司年度报告（10-K）

# Load two SEC 10-K filings in Chrome via Selenium, parse the rendered HTML
# with BeautifulSoup, and report how many <table> elements each contains.
from selenium import webdriver
from bs4 import BeautifulSoup

chrome_path = r"C:\webdrivers\chromedriver.exe"
# NOTE(review): passing the driver path positionally is what the original
# does; newer Selenium releases may expect a Service object instead --
# confirm against the installed Selenium version.
browser = webdriver.Chrome(chrome_path)

try:
    # Microsoft
    browser.get("https://www.sec.gov/Archives/edgar/data/789019/000156459019027952/msft-10k_20190630.htm")
    msft = browser.page_source

    # Walmart
    browser.get("https://www.sec.gov/Archives/edgar/data/104169/000010416919000016/wmtform10-kx1312019.htm")
    wmt = browser.page_source
finally:
    # Always shut the browser down, even if a page load fails; otherwise the
    # Chrome/chromedriver processes are left running after the script ends.
    browser.quit()

# Parsing only needs the captured page source, so it runs after the browser
# has been released.
page_msft = BeautifulSoup(msft, 'html.parser')
tables_msft = page_msft.find_all("table")

page_wmt = BeautifulSoup(wmt, 'html.parser')
tables_wmt = page_wmt.find_all("table")

print(f"MSFT Result Table Count: {len(tables_msft)}")
print(f"Walmart Result Table Count: {len(tables_wmt)}")

1条回答

网友

1楼 · 发布于 2024-10-02 06:32:14

首先，您不需要 Selenium：使用 requests 库会更快，也能避免启动浏览器的开销。我找到了一种可以部分提取所需数据的方法，但由于微软和沃尔玛两张表的列数不同，它们无法直接合并在一起。下面的代码会生成两个所需的 DataFrame，一个对应 Microsoft，一个对应 Walmart；您仍然需要自行调整列名。思路是先找到内容为 "Age" 的单元格，再取它所在的表格，因为该文本在整个页面中是唯一的。如果您需要进一步说明，请告诉我：

from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np


# SEC EDGAR asks automated clients to identify themselves through the
# User-Agent header and may reject requests that do not.
# Replace the value below with your own name/organisation and contact e-mail.
SEC_HEADERS = {"User-Agent": "Sample Company Name admin@example.com"}


def _fetch_age_table(url):
    """Download a filing and return the table containing an "Age" cell.

    The page at *url* is fetched with ``requests``, the first text node that
    is exactly ``"Age"`` is located, and its enclosing ``<table>`` is
    converted to a DataFrame.  The first ``<tr>`` of that table is skipped
    (presumably the header row -- confirm against the filing), rows whose
    cells are all empty are discarded, empty strings become NaN, and columns
    that are entirely NaN are dropped.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if no "Age" cell inside a table is found on the page.
    """
    response = requests.get(url, headers=SEC_HEADERS, timeout=30)
    response.raise_for_status()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.text, 'html.parser')

    marker = soup.find(string="Age")
    table = marker.find_parent("table") if marker is not None else None
    if table is None:
        raise ValueError(f'No table containing an "Age" cell found at {url}')

    rows = [
        [cell.get_text(strip=True) for cell in row.find_all("td")]
        for row in table.find_all("tr")[1:]
    ]
    # Keep only rows that have at least one non-empty cell.
    non_empty = [cells for cells in rows if any(cells)]

    frame = pd.DataFrame.from_records(non_empty)
    # Spacer cells come through as empty strings; mark them as missing so
    # that columns consisting only of spacers can be dropped.
    frame[frame == ''] = np.nan
    return frame.dropna(axis=1, how='all')


#Microsoft
df_msft = _fetch_age_table("https://www.sec.gov/Archives/edgar/data/789019/000156459019027952/msft-10k_20190630.htm")

#Walmart
df_wmt = _fetch_age_table("https://www.sec.gov/Archives/edgar/data/104169/000010416919000016/wmtform10-kx1312019.htm")

相关问题更多 >

编程相关推荐

热门问题

热门文章