Scraping an Excel file and reading it dynamically


I am trying to get an Excel file from a website (https://rigcount.bakerhughes.com/na-rig-count), download it and keep it in memory so it can be read with Pandas. The file is an .xlsb with more than 700,000 rows.

With the code I am using, I only get 1457 rows... I tried playing with the chunk size, but it didn't help.

Here is my code:

from bs4 import BeautifulSoup
import requests
import os
import pandas as pd
from io import BytesIO, StringIO 

url = "https://rigcount.bakerhughes.com/na-rig-count"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
link_temp = soup.find_all('div', attrs={'class': 'file-link'})

for i in link_temp:
    if 'Rig Count Pivot Table' in i.find().text:
        link = i
        break
    
href = link.a.get('href')

response = requests.get(href)

#Store file in memory
f = BytesIO()

for line in response.iter_content():
    f.write(line)
    
f.seek(0)

pd.read_excel(f, engine='pyxlsb', sheet_name = 1, skiprows=0)

I tried saving it locally and opening it, but there is an encoding problem that I haven't been able to solve.
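For reference, a minimal sketch of the save-to-disk approach, reusing the href extracted above (the filename is only illustrative); writing the response bytes in binary mode involves no text decoding, so there is nothing to mis-encode:

import requests
import pandas as pd

# Assumes `href` already holds the .xlsb link found above
resp = requests.get(href)

# Write the raw bytes; 'wb' mode performs no text decoding
with open("rig_count_pivot_table.xlsb", "wb") as out:
    out.write(resp.content)

# The file can then be read exactly like the in-memory buffer above
df = pd.read_excel("rig_count_pivot_table.xlsb", engine="pyxlsb", sheet_name=1)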

Thanks for your help! :)


1 Answer
import trio
import httpx
from bs4 import BeautifulSoup
import pandas as pd
from functools import partial


async def main(url):
    async with httpx.AsyncClient(timeout=None) as client:
        r = await client.get(url)
        soup = BeautifulSoup(r.text, 'lxml')
        # Grab the href of the "Rig Count Pivot Table" file link
        tfile = soup.select_one('.file-link:-soup-contains(Table)').a['href']
        # Stream the .xlsb download to disk chunk by chunk
        async with client.stream('GET', tfile) as r:
            fname = r.headers.get('content-disposition').split('=')[-1]
            async with await trio.open_file(fname, 'wb') as f:
                async for chunk in r.aiter_bytes():
                    await f.write(chunk)

        # Parse the workbook in a worker thread so the event loop isn't blocked
        df = await trio.to_thread.run_sync(partial(pd.read_excel, fname, sheet_name=3, engine="pyxlsb"))
        print(df)

if __name__ == "__main__":
    trio.run(main, 'https://rigcount.bakerhughes.com/na-rig-count')

Output:

              Country      County        Basin DrillFor  ... Week RigCount State/Province  PublishDate
0       UNITED STATES      SABINE  Haynesville      Gas  ...   13        1      LOUISIANA        40634    
1       UNITED STATES  TERREBONNE        Other      Oil  ...   13        1      LOUISIANA        40634    
2       UNITED STATES   VERMILION        Other      Gas  ...   13        1      LOUISIANA        40634    
3       UNITED STATES   VERMILION        Other      Gas  ...   13        1      LOUISIANA        40634    
4       UNITED STATES        EDDY      Permian      Oil  ...   13        1     NEW MEXICO        40634    
...               ...         ...          ...      ...  ...  ...      ...            ...          ...    
769390  UNITED STATES        KERN        Other      Oil  ...   29        1     CALIFORNIA        44393    
769391  UNITED STATES        KERN        Other      Oil  ...   29        1     CALIFORNIA        44393    
769392  UNITED STATES        KERN        Other      Oil  ...   29        1     CALIFORNIA        44393    
769393  UNITED STATES        KERN        Other      Oil  ...   29        1     CALIFORNIA        44393    
769394  UNITED STATES        KERN        Other      Oil  ...   29        1     CALIFORNIA        44393    

[769395 rows x 13 columns]
Note: it seems you faced a bug in the 'pyxlsb' reader. Reading the sheet by its index is the cause, but using sheet_name='Master Data' works as well.
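For example, reading the workbook by sheet name instead of by index (a sketch, assuming the file was saved as fname as in the code above):

import pandas as pd

# Select the sheet by its name rather than its position
df = pd.read_excel(fname, sheet_name='Master Data', engine='pyxlsb')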

Update

The problem is that the Excel file has 2 hidden sheets, and the 2nd sheet really does have 1457 rows; the Master Data sheet is actually the 4th sheet, so sheet_name=3 will work.
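A quick way to verify this is to list all of the workbook's sheet names, hidden ones included (a sketch, assuming the downloaded file is still available as fname from the code above):

import pandas as pd

# ExcelFile exposes every sheet, including hidden ones;
# 'Master Data' shows up at index 3
workbook = pd.ExcelFile(fname, engine='pyxlsb')
print(workbook.sheet_names)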

Last update

Following the Python DRY principle, I noticed that we don't need to save the file locally, or even buffer it into memory ourselves, before loading it into pandas.

The response content is already held in memory, so we can load everything in one go by passing r.content directly to pandas.

Using the following code:

import trio
import httpx
from bs4 import BeautifulSoup
import pandas as pd
from functools import partial


async def main(url):
    async with httpx.AsyncClient(timeout=None) as client:
        r = await client.get(url)
        soup = BeautifulSoup(r.text, 'lxml')
        tfile = soup.select_one('.file-link:-soup-contains(Table)').a['href']
        r = await client.get(tfile)
        # Hand the in-memory response bytes straight to pandas; no temp file needed
        df = await trio.to_thread.run_sync(partial(pd.read_excel, r.content, sheet_name=3, engine="pyxlsb"))
        print(df)

if __name__ == "__main__":
    trio.run(main, 'https://rigcount.bakerhughes.com/na-rig-count')
