<p><strong>更新</strong></p>
<blockquote>
<p>Jack Fleeting's answer is likely the best approach for solving the OP's question. </p>
<p>My answer works, but it pales in comparison to Jack's approach.
I'm leaving my answer here because I want it to serve as a placeholder to help others who need to tackle a similar problem in the future. </p>
</blockquote>
<hr/>
<p><strong>原始回复</strong></p>
<p>这是另一个答案,也许会有比我更熟悉 BeautifulSoup 的人来进一步完善它。</p>
<p>我把收集到的数据放入两个字典中,分别命名为<em>balance_sheet_dict</em>和<em>financials_dict</em>。我还提取了与各列关联的日期,因为我将在其他函数中使用它们。我还将这些日期的格式从 %m/%d/%Y 重新格式化为 %m%d%Y。</p>
<p>我还使用了 BeautifulSoup 的 find_all_next(tag_name, limit=int),只收集所需的子标签。您可以调整 limit 的值,以便从表中收集所需的项。</p>
<p>总的来说,这是一个有趣的问题,需要一些额外的思考。谢谢你发帖提问。</p>
<pre><code>import requests
from datetime import datetime
from bs4 import BeautifulSoup
import re as regex
# Column labels ('ttm' or MMDDYYYY dates) and the row values gathered by
# get_operating_income_or_loss(). Both lists are module-level and are only
# ever appended to, so they keep growing if the function is called again.
operating_income_or_loss_keys = []
operating_income_or_loss_values = []


def get_operating_income_or_loss(soup):
    """Collect the 'Operating Income or Loss' row from a parsed page.

    For every ``div`` with class ``D(tbr)`` in *soup* the function first
    gathers column labels and then looks for the row values:

    * Labels: for each nested ``div`` with class ``D(ib)``, up to eight
      following ``span`` elements are read and the first one is skipped.
      The text ``'ttm'`` is stored verbatim; text starting with a
      ``m/dd/yyyy`` date is stored reformatted as ``%m%d%Y``; anything
      else is ignored.
    * Values: for each ``span`` with class ``Va(m)`` (inside a ``div`` with
      class ``D(tbc)``) whose text is exactly ``'Operating Income or Loss'``,
      the following ``span`` elements are read, the first one is skipped,
      and each remaining text is stored. An empty element or a ``'-'``
      is stored as ``'no value provided'``.

    Args:
        soup: Parsed document offering the BeautifulSoup ``find_all`` /
            ``find_all_next`` API.

    Returns:
        None. Results are appended to ``operating_income_or_loss_keys`` and
        ``operating_income_or_loss_values``.
    """
    for rows in soup.find_all('div', {'class': 'D(tbr)'}):
        # NOTE(review): unlike get_total_assets there is no 'Breakdown'
        # check here, so labels are collected for every D(ib) div in every
        # row; confirm that repeated labels are intended.
        for date_row in rows.find_all('div', {'class': 'D(ib)'}):
            chart_dates = date_row.find_all_next('span', limit=8)
            # Skip the first span; presumably it is the header label rather
            # than a date column (verify against the live markup).
            for dates in chart_dates[1:]:
                if dates.text == 'ttm':
                    operating_income_or_loss_keys.append(dates.text)
                else:
                    date_format = regex.match(r'(\d{1,2}/\d{2}/\d{4})', dates.text)
                    if date_format:
                        reformatted_date = datetime.strptime(dates.text, '%m/%d/%Y').strftime('%m%d%Y')
                        operating_income_or_loss_keys.append(reformatted_date)

        # The value scan belongs to the same row: it reads ``rows`` and has
        # no loop of its own, so it must sit inside the outer loop.
        for sub_row in rows.find_all('div', {'class': 'D(tbc)'}):
            for row_item in sub_row.find_all('span', {'class': 'Va(m)'}):
                if row_item.text == 'Operating Income or Loss':
                    # The number of spans read is tied to how many labels
                    # have been collected so far.
                    # NOTE(review): if no labels were found the limit is 0;
                    # confirm how find_all_next treats limit=0.
                    operating_income_or_loss = row_item.find_all_next(
                        'span', limit=len(operating_income_or_loss_keys))
                    for item in operating_income_or_loss[1:]:
                        if len(item) == 0 or item.text == '-':
                            operating_income_or_loss_values.append('no value provided')
                        else:
                            operating_income_or_loss_values.append(item.text)
# Balance-sheet results. The lists are module-level and are only ever
# appended to:
#   balance_sheet_keys        - column dates as MMDDYYYY (get_total_assets)
#   total_assets_values       - 'Total Assets' row (get_total_assets)
#   total_current_liabilities - 'Total Current Liabilities' row
#                               (get_total_current_liabilities)
total_assets_values = []
total_current_liabilities = []
balance_sheet_keys = []


def get_total_assets(soup):
    """Collect the column dates and the 'Total Assets' row from a parsed page.

    Works in two passes over the ``div`` elements with class ``D(tbr)``:

    1. Dates: for each nested ``div`` with class ``D(ib)`` whose text is
       exactly ``'Breakdown'``, up to eight following ``span`` elements are
       read and the first one is skipped. Text starting with a ``m/dd/yyyy``
       date is stored in ``balance_sheet_keys`` reformatted as ``%m%d%Y``.
    2. Values: for each ``span`` with class ``Va(m)`` (inside a ``div`` with
       class ``D(tbc)``) whose text is exactly ``'Total Assets'``, the
       following ``span`` elements, limited to the number of collected
       dates, are stored in ``total_assets_values``. An empty element or a
       ``'-'`` is stored as ``'no value provided'``.

    Args:
        soup: Parsed document offering the BeautifulSoup ``find_all`` /
            ``find_all_next`` API.

    Returns:
        None. Results are appended to ``balance_sheet_keys`` and
        ``total_assets_values``.
    """
    # Pass 1: column dates, taken from the header cell labelled 'Breakdown'.
    for rows in soup.find_all('div', {'class': 'D(tbr)'}):
        for date_row in rows.find_all('div', {'class': 'D(ib)'}):
            if date_row.text == 'Breakdown':
                chart_dates = date_row.find_all_next('span', limit=8)
                # Skip the first span; presumably it is not a date column
                # (verify against the live markup).
                for dates in chart_dates[1:]:
                    date_format = regex.match(r'(\d{1,2}/\d{2}/\d{4})', dates.text)
                    if date_format:
                        reformatted_date = datetime.strptime(dates.text, '%m/%d/%Y').strftime('%m%d%Y')
                        balance_sheet_keys.append(reformatted_date)

    # Pass 2: row values. This runs after all dates are known because the
    # number of spans to read is len(balance_sheet_keys).
    for rows in soup.find_all('div', {'class': 'D(tbr)'}):
        for sub_row in rows.find_all('div', {'class': 'D(tbc)'}):
            for row_item in sub_row.find_all('span', {'class': 'Va(m)'}):
                if row_item.text == 'Total Assets':
                    # NOTE(review): if no dates were found the limit is 0;
                    # confirm how find_all_next treats limit=0.
                    total_assets = row_item.find_all_next('span', limit=len(balance_sheet_keys))
                    for item in total_assets:
                        if len(item) == 0 or item.text == '-':
                            total_assets_values.append('no value provided')
                        else:
                            total_assets_values.append(item.text)
def get_total_current_liabilities(soup):
    """Collect the 'Total Current Liabilities' row from a parsed page.

    For each ``span`` with class ``Va(m)`` (inside a ``div`` with class
    ``D(tbc)``, inside a ``div`` with class ``D(tbr)``) whose text is exactly
    ``'Total Current Liabilities'``, the following ``span`` elements are
    appended to the module-level ``total_current_liabilities`` list. An
    empty element or a ``'-'`` is stored as ``'no value provided'``.

    The number of spans read is ``len(balance_sheet_keys)``. In this file
    that list is only filled by ``get_total_assets``, so call that function
    on the same page first.

    Args:
        soup: Parsed document offering the BeautifulSoup ``find_all`` /
            ``find_all_next`` API.

    Returns:
        None. Results are appended to ``total_current_liabilities``.
    """
    for rows in soup.find_all('div', {'class': 'D(tbr)'}):
        for sub_row in rows.find_all('div', {'class': 'D(tbc)'}):
            for row_item in sub_row.find_all('span', {'class': 'Va(m)'}):
                if row_item.text == 'Total Current Liabilities':
                    # NOTE(review): if balance_sheet_keys is empty the limit
                    # is 0; confirm how find_all_next treats limit=0.
                    current_liabilities = row_item.find_all_next('span', limit=len(balance_sheet_keys))
                    for item in current_liabilities:
                        if len(item) == 0 or item.text == '-':
                            total_current_liabilities.append('no value provided')
                        else:
                            total_current_liabilities.append(item.text)
# Pages to scrape; the branch taken below is chosen by a substring of the URL.
urls = ['https://finance.yahoo.com/quote/AAPL/balance-sheet?p=AAPL',
        'https://finance.yahoo.com/quote/AAPL/financials?p=AAPL']

for url in urls:
    # The ticker symbol is whatever follows the '?p=' marker in the URL.
    stock_symbol = url.rpartition('?p=')[-1]

    if 'balance-sheet' in url:
        # A timeout keeps the script from hanging on an unresponsive server.
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")
        # get_total_assets must run first: it fills balance_sheet_keys,
        # which get_total_current_liabilities uses as its span limit.
        get_total_assets(soup)
        get_total_current_liabilities(soup)
        # Map each date to its (total assets, current liabilities) pair.
        balance_sheet_dict = dict(zip(balance_sheet_keys,
                                      zip(total_assets_values, total_current_liabilities)))
        print('*' * 10, f'Balance sheet results for {stock_symbol}', '*' * 10)
        for key, (total_asset, current_liabilities) in balance_sheet_dict.items():
            print(f'Year: {key}, Total Asset: {total_asset}')
            print(f'Year: {key}, Current liabilities: {current_liabilities}')
        # output
        # ********** Balance sheet results for AAPL **********
        # Year: 09292018, Total Asset: 365,725,000
        # Year: 09292018, Current liabilities: 116,866,000
        # Year: 09292017, Total Asset: 375,319,000
        # Year: 09292017, Current liabilities: 100,814,000
        # Year: 09292016, Total Asset: 321,686,000
        # Year: 09292016, Current liabilities: 79,006,000

    elif 'financials' in url:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")
        get_operating_income_or_loss(soup)
        # Map each column label ('ttm' or a date) to its row value.
        financials_dict = dict(zip(operating_income_or_loss_keys, operating_income_or_loss_values))
        print('*' * 10, f'Financials results for {stock_symbol}', '*' * 10)
        for key, value in financials_dict.items():
            print(f'Year: {key}, Operating income or loss: {value}')
        # output
        # ********** Financials results for AAPL **********
        # Year: ttm, Operating income or loss: 64,423,000
        # Year: 09292018, Operating income or loss: 70,898,000
        # Year: 09292017, Operating income or loss: 61,344,000
        # Year: 09292016, Operating income or loss: 60,024,000
</code></pre>
<p>关于您在评论中提到的、在名为<em>balance_sheet_dict</em>的字典中查询2016年数据的问题,请参考:</p>
^{pr2}$