Pandas从已发布的GSheet（数据库）中读取嵌套的HTML多个表

url = r'https://docs.google.com/spreadsheets/d/e/2PACX-1vSc_2y5N0I67wDU38DjDh35IZSIS30rQf7_NYZhtYYGU1jJYT6_kDx4YpF-qw0LSlGsBYP8pqM_a1Pd/pubhtml' import pandas as pd import requests rtext = requests.get(url).text

<table class="waffle" cellspacing="0" cellpadding="0"><thead> <tr><th class="row-header freezebar-vertical-handle header-shim row-header-shim"></th><th id="0C0" style="width:54px" class="header-shim"></th><th id="0C1" style="width:60px" class="header-shim"></th><th id="0C2" style="width:72px" class="header-shim"></th><th id="0C4" style="width:50px" class="header-shim"></th><th id="0C5" style="width:48px" class="header-shim"></th><th id="0C6" style="width:111px" class="header-shim"></th><th id="0C7" style="width:130px" class="header-shim"></th><th id="0C8" style="width:201px" class="header-shim"></th><th id="0C9" style="width:69px" class="header-shim"></th><th id="0C10" style="width:124px" class="header-shim"></th><th id="0C11" style="width:190px" class="header-shim"></th><th id="0C12" style="width:96px" class="header-shim"></th><th id="0C13" style="width:99px" class="header-shim"></th><th id="0C14" style="width:99px" class="header-shim"></th><th id="0C15" style="width:99px" class="header-shim"></th><th id="0C16" style="width:258px" class="header-shim"></th><th id="0C17" style="width:96px" class="header-shim"></th><th id="0C18" style="width:96px" class="header-shim"></th><th id="0C19" style="width:100px" class="header-shim"></th><th id="0C20" style="width:100px" class="header-shim"></th><th id="0C21" style="width:100px" class="header-shim"></th><th id="0C22" style="width:100px" class="header-shim"></th><th id="0C23" style="width:100px" class="header-shim"></th><th id="0C24" style="width:100px" class="header-shim"></th><th id="0C25" style="width:100px" class="header-shim"></th><th id="0C26" style="width:100px" class="header-shim"></th></tr></thead><tbody> <tr style="height:46px;"><th id="0R0" style="height: 46px;" class="row-headers-background row-header-shim"><div class="row-header-wrapper" style="line-height: 46px;">1</div></th> <td class="s0" dir="ltr">Patient Number</td><td class="s1" dir="ltr">State Patient Number</td><td class="s2">Date Announced</td><td class="s2">Age Bracket</td><td class="s0">Gender</td><td class="s0">Detected City</td><td class="s0" dir="ltr">Detected District</td><td class="s2" dir="ltr">Detected State</td><td class="s0" dir="ltr">State code</td><td class="s0" dir="ltr">Current Status</td><td class="s3" dir="ltr">Notes</td><td class="s1" dir="ltr">Contracted from which Patient (Suspected)</td><td class="s2" dir="ltr">Nationality</td><td class="s2" dir="ltr">Type of transmission</td><td class="s2">Status Change Date</td><td class="s3" dir="ltr">Source_1</td><td class="s3" dir="ltr">Source_2</td><td class="s3" dir="ltr">Source_3</td><td class="s1" dir="ltr">Backup Notes</td><td class="s4"></td><td class="s4"></td><td class="s4"></td><td class="s4"></td><td class="s4"></td><td class="s4"></td><td class="s4"></td></tr> <tr><th style="height:3px" class="freezebar-cell freezebar-horizontal-handle row-header-shim"></th><td class="freezebar-cell"></td><td class="freezebar-cell"></td><td class="freezebar-cell"></td><td class="freezebar-cell"></td><td class="freezebar-cell"></td><td class="freezebar-cell"></td><td class="freezebar-cell"></td><td class="freezebar-cell"></td><td class="freezebar-cell"></td><td class="freezebar-cell"></td><td class="freezebar-cell"></td><td class="freezebar-cell"></td><td class="freezebar-cell"></td><td class="freezebar-cell"></td><td class="freezebar-cell"></td><td class="freezebar-cell"></td><td class="freezebar-cell"></td><td class="freezebar-cell"></td><td class="freezebar-cell"></td><td class="freezebar-cell"></td><td class="freezebar-cell"></td><td class="freezebar-cell"></td><td class="freezebar-cell"></td><td class="freezebar-cell"></td><td class="freezebar-cell"></td><td class="freezebar-cell"></td></tr> <tr style="height:20px;"><th id="0R1" style="height: 20px;" class="row-headers-background row-header-shim"><div class="row-header-wrapper" style="line-height: 20px;">2</div></th><td class="s5" dir="ltr">1</td><td class="s6" dir="ltr">KL-TS-P1</td><td class="s5" dir="ltr">30/01/2020</td><td class="s5" dir="ltr">20</td><td class="s6" dir="ltr">F</td><td class="s6" dir="ltr">Thrissur</td><td class="s6" dir="ltr">Thrissur</td><td class="s6" dir="ltr">Kerala</td><td class="s6">KL</td><td class="s6" dir="ltr">Recovered</td><td class="s7" dir="ltr">Travelled from Wuhan</td><td class="s8"></td><td class="s5" dir="ltr">India</td><td class="s5" dir="ltr">Imported</td><td class="s9" dir="ltr">14/02/2020</td><td class="s10" dir="ltr"><a target="_blank" rel="noreferrer" href="https://www.google.com/url?q=https://twitter.com/vijayanpinarayi/status/1222819465143832577&sa=D&ust=1586180154769000&usg=AFQjCNEwtlOETa2v9D30Pjoe-fJxrVA9PA">https://twitter.com/vijayanpinarayi/status/1222819465143832577</a></td><td class="s11" dir="ltr" colspan="2"><a target="_blank" rel="noreferrer" href="https://www.google.com/url?q=https://weather.com/en-IN/india/news/news/2020-02-14-kerala-defeats-coronavirus-indias-three-covid-19-patients-successfully&sa=D&ust=1586180154769000&usg=AFQjCNGWVTymYTvejeSjCqq583NMJ3jbTA">https://weather.com/en-IN/india/news/news/2020-02-14-kerala-defeats-coronavirus-indias-three-covid-19-patients-successfully</a></td><td class="s12 softmerge" dir="ltr"><div class="softmerge-inner" style="width: 198px; left: -1px;">Student from Wuhan</div></td><td class="s13"></td><td class="s13"></td><td></td><td></td><td></td><td></td><td></td></tr>

1条回答

网友

1楼 · 发布于 2024-10-02 08:27:14

我在下面列出了我自己的答案

其他资源：Parsing HTML Tables in Python

import pandas as pd
import requests
from datetime import datetime
import bs4

print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

# get url (published from google sheets, cannot download, only html)
url = r'https://docs.google.com/spreadsheets/d/e/2PACX-1vSc_2y5N0I67wDU38DjDh35IZSIS30rQf7_NYZhtYYGU1jJYT6_kDx4YpF-qw0LSlGsBYP8pqM_a1Pd/pubhtml'

# reques read the url and put as text
rtext = requests.get(url).text
print(len(rtext)) #13.9 Million Char

# beautiful soup to identify the number tables
# this is to speed up read_html, otherwise read_html will take 15 mins for 13.9 Million characters to parse
# now it takes < 10 seconds

bs = bs4.BeautifulSoup(rtext)
tables = bs.findAll("table") 

# first table is what we want, help to find the table.

t0 = tables[0]

# read_html returns a list of dataframes even for 1 table.
list_df = pd.read_html(str(t0))
list_df

# get dataframe and process
dfindia = list_df[0]
dfindia.columns = dfindia.iloc[0]
dfindia = dfindia.drop([0,1])
dfindia = dfindia.drop(1, axis=1)

dfindia = dfindia[~dfindia['Patient Number'].isna()]
dfindia = dfindia[~dfindia['Date Announced'].isna()]

print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))


dfindia

结果从下载到在Google Colab中打印只需14秒

2020-04-07 15:16:53
13989044
2020-04-07 15:17:07

相关问题更多 >

编程相关推荐

热门问题

热门文章