How to efficiently parse large HTML div-class and span data with Python BeautifulSoup?

Posted 2024-09-25 00:30:46


Required data

I want to scrape two web pages, one here: https://finance.yahoo.com/quote/AAPL/balance-sheet?p=AAPL, and the other: https://finance.yahoo.com/quote/AAPL/financials?p=AAPL. From the first page I need the values of the row named Total Assets. There are 5 values in that row: 365725000 375319000 321686000 290479000 231839000. Then I need the 5 values of the row named Total Current Liabilities: 43658000 38542000 27970000 20722000 11506000. From the second link I need the values of the row named Operating Income or Loss: 52503000 48999000 55241000 33790000 18385000.

I would also need to be able to pull values for more than 5 years. Thanks. Here is the logic of what I want: I want to run this module and, when it runs, have the output be:

TTM array: 365725000, 116866000, 64423000
year1 array: 375319000, 100814000, 70898000
year2 array: 321686000, 79006000, 80610000

My code

Here is what I have written so far. If I put it in the variable below, I can place it in the class. But how do I loop over the 'div' classes efficiently, given that there are thousands of them on the page? In other words, how can I find just the values I want?

[the asker's code block was not preserved on this page]

Tags: data, https, com, webpage, array, yahoo, sheet, quote
3 Answers

Update

Jack Fleeting's answer is likely the best approach for solving the OP's question.

My answer works, but it pales in comparison to Jack's approach. I'm leaving my answer here because I want it to serve as a placeholder to help others who need to tackle a similar problem in the future.


Original answer

Here is another answer, which could likely be refined by someone more skilled with BeautifulSoup than I am.

I put the collected data into two dictionaries, named balance_sheet_dict and financials_dict. I also extracted the dates associated with the columns, because I will use them in other functions, and I reformatted those dates from %m/%d/%Y to %m%d%Y.
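For instance, the date conversion alone looks like this (a minimal sketch using one of the column dates from the page):

from datetime import datetime

# '09/29/2018' in %m/%d/%Y format becomes '09292018' in %m%d%Y format
print(datetime.strptime('09/29/2018', '%m/%d/%Y').strftime('%m%d%Y'))  # 09292018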

I also used BeautifulSoup's find_all_next(name, limit=int) to collect only the required child tags. You can adjust the limit to gather the items you need from the table.
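Here is a minimal sketch of how find_all_next() with a limit works, on an invented HTML fragment:

from bs4 import BeautifulSoup

html = '<div><span>Breakdown</span><span>ttm</span><span>9/29/2018</span><span>9/29/2017</span></div>'
soup = BeautifulSoup(html, 'html.parser')

label = soup.find('span')                        # the 'Breakdown' span
# collect at most 3 <span> tags that appear after the label in document order
following = label.find_all_next('span', limit=3)
print([span.text for span in following])         # ['ttm', '9/29/2018', '9/29/2017']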

Overall, this was an interesting question that required some extra thought. Thanks for posting it.

import requests
from datetime import datetime
from bs4 import BeautifulSoup
import re as regex

operating_income_or_loss_keys = []
operating_income_or_loss_values = []

def get_operating_income_or_loss(soup):
    for rows in soup.find_all('div', {'class': 'D(tbr)'}):
        # the header row holds 'ttm' plus the column dates
        for date_row in rows.find_all('div', {'class': 'D(ib)'}):
            chart_dates = date_row.find_all_next('span', limit=8)
            for dates in chart_dates[1:]:
                if dates.text == 'ttm':
                    operating_income_or_loss_keys.append(dates.text)
                else:
                    date_format = regex.match(r'(\d{1,2}/\d{2}/\d{4})', dates.text)
                    if date_format:
                        # reformat the column date from %m/%d/%Y to %m%d%Y
                        reformatted_date = datetime.strptime(dates.text, '%m/%d/%Y').strftime('%m%d%Y')
                        operating_income_or_loss_keys.append(reformatted_date)

        # pull one value per column from the 'Operating Income or Loss' row
        for sub_row in rows.find_all('div', {'class': 'D(tbc)'}):
            for row_item in sub_row.find_all('span', {'class': 'Va(m)'}):
                if row_item.text == 'Operating Income or Loss':
                    operating_income_or_loss = row_item.find_all_next('span', limit=len(operating_income_or_loss_keys))
                    for item in operating_income_or_loss[1:]:
                        if len(item) == 0 or item.text == '-':
                            operating_income_or_loss_values.append('no value provided')
                        else:
                            operating_income_or_loss_values.append(item.text)
    return


total_assets_values = []
total_current_liabilities = []
balance_sheet_keys = []

def get_total_assets(soup):
    # pull the column dates from the 'Breakdown' header row
    for rows in soup.find_all('div', {'class': 'D(tbr)'}):
        for date_row in rows.find_all('div', {'class': 'D(ib)'}):
            if date_row.text == 'Breakdown':
                chart_dates = date_row.find_all_next('span', limit=8)
                for dates in chart_dates[1:]:
                    date_format = regex.match(r'(\d{1,2}/\d{2}/\d{4})', dates.text)
                    if date_format:
                        reformatted_date = datetime.strptime(dates.text, '%m/%d/%Y').strftime('%m%d%Y')
                        balance_sheet_keys.append(reformatted_date)

    # pull one value per column date from the 'Total Assets' row
    for rows in soup.find_all('div', {'class': 'D(tbr)'}):
        for sub_row in rows.find_all('div', {'class': 'D(tbc)'}):
            for row_item in sub_row.find_all('span', {'class': 'Va(m)'}):
                if row_item.text == 'Total Assets':
                    total_assets = row_item.find_all_next('span', limit=len(balance_sheet_keys))
                    for item in total_assets:
                        if len(item) == 0 or item.text == '-':
                            total_assets_values.append('no value provided')
                        else:
                            total_assets_values.append(item.text)
    return

def get_total_current_liabilities(soup):
    # pull one value per column date from the 'Total Current Liabilities' row
    for rows in soup.find_all('div', {'class': 'D(tbr)'}):
        for sub_row in rows.find_all('div', {'class': 'D(tbc)'}):
            for row_item in sub_row.find_all('span', {'class': 'Va(m)'}):
                if row_item.text == 'Total Current Liabilities':
                    current_liabilities = row_item.find_all_next('span', limit=len(balance_sheet_keys))
                    for item in current_liabilities:
                        if len(item) == 0 or item.text == '-':
                            total_current_liabilities.append('no value provided')
                        else:
                            total_current_liabilities.append(item.text)
    return


urls = ['https://finance.yahoo.com/quote/AAPL/balance-sheet?p=AAPL',
        'https://finance.yahoo.com/quote/AAPL/financials?p=AAPL']

for url in urls:

    # the ticker symbol is everything after '?p='
    stock_symbol = url.rpartition('?p=')[-1]

    if 'balance-sheet' in url:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        get_total_assets(soup)
        get_total_current_liabilities(soup)
        # pair each date key with a (total assets, current liabilities) tuple
        balance_sheet_dict = {k: v for k, v in zip(balance_sheet_keys, zip(total_assets_values, total_current_liabilities))}
        print('*' * 10, f'Balance sheet results for {stock_symbol}', '*' * 10)
        for key, values in balance_sheet_dict.items():
            total_asset = values[0]
            current_liabilities = values[1]
            print(f'Year: {key}, Total Asset: {total_asset}')
            print(f'Year: {key}, Current liabilities: {current_liabilities}')
        # output
        # ********** Balance sheet results for AAPL **********
        # Year: 09292018, Total Asset: 365,725,000
        # Year: 09292018, Current liabilities: 116,866,000
        # Year: 09292017, Total Asset: 375,319,000
        # Year: 09292017, Current liabilities: 100,814,000
        # Year: 09292016, Total Asset: 321,686,000
        # Year: 09292016, Current liabilities: 79,006,000

    elif 'financials' in url:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        get_operating_income_or_loss(soup)
        financials_dict = {k: v for k, v in zip(operating_income_or_loss_keys, operating_income_or_loss_values)}
        print('*' * 10, f'Financials results for {stock_symbol}', '*' * 10)
        for key, value in financials_dict.items():
            print(f'Year: {key}, Operating income or loss: {value}')
        # output
        # ********** Financials results for AAPL **********
        # Year: ttm, Operating income or loss: 64,423,000
        # Year: 09292018, Operating income or loss: 70,898,000
        # Year: 09292017, Operating income or loss: 61,344,000
        # Year: 09292016, Operating income or loss: 60,024,000
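To see what the dictionary comprehension above does, here is a minimal sketch of the nested zip, with values taken from the output:

keys = ['09292018', '09292017']
assets = ['365,725,000', '375,319,000']
liabilities = ['116,866,000', '100,814,000']

# zip(assets, liabilities) yields value pairs; the outer zip attaches a date key to each pair
balance_sheet_dict = {k: v for k, v in zip(keys, zip(assets, liabilities))}
print(balance_sheet_dict)
# {'09292018': ('365,725,000', '116,866,000'), '09292017': ('375,319,000', '100,814,000')}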

In reply to your comment about querying the balance_sheet_dict dictionary for 2016:

[code block not preserved on this page]

Here are some suggestions for parsing HTML with BeautifulSoup that have been useful to me and may also help you:

  1. Use 'id' to locate the element instead of 'class', because 'class' changes more frequently than 'id'.
  2. Use structure info to locate the element instead of 'class'; structure info changes less frequently.
  3. Using headers with user-agent info to get the response is always better than sending no headers. In this case, if you do not specify headers, you cannot find the id 'Col1-1-Financials-Proxy', only 'Col1-3-Financials-Proxy', which does not match the result in the Chrome inspector (see the sketch after this list).
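As a minimal sketch of point 3, using the same URL, headers, and id as the full code below (which id actually appears still depends on the markup Yahoo serves):

import requests
from bs4 import BeautifulSoup

url = 'https://finance.yahoo.com/quote/AAPL/financials?p=AAPL'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}

# with a browser-like User-Agent, the served markup should contain the expected id
soup = BeautifulSoup(requests.get(url, headers=headers).text, 'html.parser')
print(soup.find(id='Col1-1-Financials-Proxy') is not None)

# without headers, a different page shell may be served instead
bare_soup = BeautifulSoup(requests.get(url).text, 'html.parser')
print(bare_soup.find(id='Col1-1-Financials-Proxy') is not None)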

Below is runnable code for your requirements that uses structure info to locate the elements. You can certainly adapt it to use 'class' info instead. Just remember: when your code stops working, check the website's source code.

# import libraries
import requests
from bs4 import BeautifulSoup

# set the URL you want to webscrape from
first_page_url = 'https://finance.yahoo.com/quote/AAPL/balance-sheet?p=AAPL'
second_page_url = 'https://finance.yahoo.com/quote/AAPL/financials?p=AAPL'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}

#################
# first page
#################

print('*' * 10, ' FIRST PAGE RESULT ', '*' * 10)

total_assets = {}
total_current_liabilities = {}
operating_income_or_loss = {}
page1_table_keys = []
page2_table_keys = []

# connect to the first page URL
response = requests.get(first_page_url, headers=headers)

# parse HTML and save to a BeautifulSoup object
soup = BeautifulSoup(response.text, "html.parser")
# the nearest id to get the result
sheet = soup.find(id='Col1-1-Financials-Proxy')
sheet_section_divs = sheet.section.find_all('div', recursive=False)
# last child
sheet_data_div = sheet_section_divs[-1]
div_ele_table = sheet_data_div.find('div').find('div').find_all('div', recursive=False)
# table header
div_ele_header = div_ele_table[0].find('div').find_all('div', recursive=False)
# first element is label, the remaining element containing data, so use range(1, len())
for i in range(1, len(div_ele_header)):
    page1_table_keys.append(div_ele_header[i].find('span').text)
# table body
div_ele = div_ele_table[-1]
div_eles = div_ele.find_all('div', recursive=False)
tgt_div_ele1 = div_eles[0].find_all('div', recursive=False)[-1]
tgt_div_ele1_row = tgt_div_ele1.find_all('div', recursive=False)[-1]
tgt_div_ele1_row_eles = tgt_div_ele1_row.find('div').find_all('div', recursive=False)
# first element is label, the remaining element containing data, so use range(1, len())
for i in range(1, len(tgt_div_ele1_row_eles)):
    total_assets[page1_table_keys[i - 1]] = tgt_div_ele1_row_eles[i].find('span').text
tgt_div_ele2 = div_eles[1].find_all('div', recursive=False)[-1]
tgt_div_ele2 = tgt_div_ele2.find('div').find_all('div', recursive=False)[-1]
tgt_div_ele2 = tgt_div_ele2.find('div').find_all('div', recursive=False)[-1]
tgt_div_ele2_row = tgt_div_ele2.find_all('div', recursive=False)[-1]
tgt_div_ele2_row_eles = tgt_div_ele2_row.find('div').find_all('div', recursive=False)
# first element is label, the remaining element containing data, so use range(1, len())
for i in range(1, len(tgt_div_ele2_row_eles)):
    total_current_liabilities[page1_table_keys[i - 1]] = tgt_div_ele2_row_eles[i].find('span').text

print('Total Assets', total_assets)
print('Total Current Liabilities', total_current_liabilities)

#################
# second page, same logic as the first page
#################

print('*' * 10, ' SECOND PAGE RESULT ', '*' * 10)

# Connect to the second page URL
response = requests.get(second_page_url, headers=headers)

# Parse HTML and save to a BeautifulSoup object
soup = BeautifulSoup(response.text, "html.parser")
# the nearest id to get the result
sheet = soup.find(id='Col1-1-Financials-Proxy')
sheet_section_divs = sheet.section.find_all('div', recursive=False)
# last child
sheet_data_div = sheet_section_divs[-1]
div_ele_table = sheet_data_div.find('div').find('div').find_all('div', recursive=False)
# table header
div_ele_header = div_ele_table[0].find('div').find_all('div', recursive=False)
# first element is label, the remaining element containing data, so use range(1, len())
for i in range(1, len(div_ele_header)):
    page2_table_keys.append(div_ele_header[i].find('span').text)
# table body
div_ele = div_ele_table[-1]
div_eles = div_ele.find_all('div', recursive=False)
tgt_div_ele_row = div_eles[4]
tgt_div_ele_row_eles = tgt_div_ele_row.find('div').find_all('div', recursive=False)
for i in range(1, len(tgt_div_ele_row_eles)):
    operating_income_or_loss[page2_table_keys[i - 1]] = tgt_div_ele_row_eles[i].find('span').text

print('Operating Income or Loss', operating_income_or_loss)

Output with the headers info:

[output block not preserved on this page]
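The structural navigation above leans on find_all('div', recursive=False), which only searches direct children rather than all descendants; a minimal sketch with invented HTML:

from bs4 import BeautifulSoup

html = '<div id="outer"><div>child</div><div><div>grandchild</div></div></div>'
soup = BeautifulSoup(html, 'html.parser')
outer = soup.find(id='outer')

# the default recursive search also descends into nested divs;
# recursive=False restricts the search to direct children only
print(len(outer.find_all('div')))                   # 3: both children plus the grandchild
print(len(outer.find_all('div', recursive=False)))  # 2: direct children only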

EDIT - per @Life is complex's request, edited to add the date headings.

Try this, using lxml:

import requests
from lxml import html

url = 'https://finance.yahoo.com/quote/AAPL/balance-sheet?p=AAPL'
url2 = 'https://finance.yahoo.com/quote/AAPL/financials?p=AAPL'
page = requests.get(url)
page2 = requests.get(url2)

tree = html.fromstring(page.content)
tree2 = html.fromstring(page2.content)

total_assets = []
Total_Current_Liabilities = []
Operating_Income_or_Loss = []
heads = []

# each financial-statement row is a div with data-test="fin-row"
path = '//div[@class="rw-expnded"][@data-test="fin-row"][@data-reactid]'
# the figures sit in sibling spans two levels up from the row title
data_path = '../../div/span/text()'
# the bold, right-aligned header spans hold the column dates
heads_path = '//div[contains(@class,"D(ib) Fw(b) Ta(end)")]/span/text()'

dats = [tree.xpath(path), tree2.xpath(path)]

for entry in dats:
    heads.append(entry[0].xpath(heads_path))
    for d in entry[0]:
        # match rows by their title attribute rather than by class
        for s in d.xpath('//div[@title]'):
            if s.attrib['title'] == 'Total Assets':
                total_assets.append(s.xpath(data_path))
            if s.attrib['title'] == 'Total Current Liabilities':
                Total_Current_Liabilities.append(s.xpath(data_path))
            if s.attrib['title'] == 'Operating Income or Loss':
                Operating_Income_or_Loss.append(s.xpath(data_path))

# the loop above collects each row more than once; drop the duplicate first copy
del total_assets[0]
del Total_Current_Liabilities[0]
del Operating_Income_or_Loss[0]

print('Date   Total Assets   Total Current Liabilities:')
for date, asset, current in zip(heads[0], total_assets[0], Total_Current_Liabilities[0]):
    print(date, asset, current)
print('Operating Income or Loss:')
for head, income in zip(heads[1], Operating_Income_or_Loss[0]):
    print(head, income)

Output:

[output block not preserved on this page]

Of course, this could easily be combined into a pandas DataFrame if required.
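For instance, a minimal sketch of that, reusing the heads, total_assets, and Total_Current_Liabilities lists built above (zip truncates to the shortest list, just like the print loops):

import pandas as pd

# one row per reported date, mirroring the balance-sheet print loop above
rows = list(zip(heads[0], total_assets[0], Total_Current_Liabilities[0]))
df = pd.DataFrame(rows, columns=['Date', 'Total Assets', 'Total Current Liabilities'])
print(df)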
