Scraping HTML tables from multiple pages with BeautifulSoup


I'm new to Python and BeautifulSoup. I want to scrape several pages into a CSV, but when I try to store the data from these 3 links, only the last one ends up in the CSV.

How can I fix this?

## importing bs4, requests, fake_useragent and csv modules
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
import csv

## create an array with URLs
urls = [
'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=750300360&editable_length=10',
'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=030780118&editable_length=10',
'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=620103432&editable_length=10'
]

## initializing the UserAgent object
user_agent = UserAgent()

## starting the loop
for url in urls:
    ## getting the reponse from the page using get method of requests module
    page = requests.get(url, headers={"user-agent": user_agent.chrome})

    ## storing the content of the page in a variable
    html = page.content

    ## creating BeautifulSoup object
    soup = BeautifulSoup(html, "html.parser")
    table = soup.findAll("table", {"class":"table"})[0]
    rows = table.findAll("tr")

with open("test.csv", "wt+", newline="") as f:
    writer = csv.writer(f)
    for row in rows:
        csv_row = []
        for cell in row.findAll(["td", "th"]):
            csv_row.append(cell.get_text())
        writer.writerow(csv_row)

Thank you very much!


2 Answers

In your code you never accumulate the rows variable anywhere, so only the values from the last URL get written to the CSV file. This example writes the values from all three URLs:

import csv
import requests
from bs4 import BeautifulSoup


urls = [
'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=750300360&editable_length=10',
'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=030780118&editable_length=10',
'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=620103432&editable_length=10'
]

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0'}

all_data = []
for url in urls:
    page = requests.get(url, headers=headers)

    soup = BeautifulSoup(page.content, "html.parser")
    table = soup.findAll("table", {"class":"table"})[0]

    # here I store all rows to list `all_data`
    for row in table.findAll('tr'):
        tds = [cell.get_text(strip=True, separator=' ') for cell in row.findAll(["td", "th"])]
        all_data.append(tds)
        print(*tds)

# write list `all_data` to CSV
with open("test.csv", "wt+", newline="") as f:
    writer = csv.writer(f)
    for row in all_data:
        writer.writerow(row)

test.csv now contains the rows from all three URLs:

[screenshot of the resulting CSV opened in LibreOffice]
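As a small side note, csv.writer also has a writerows method that accepts a whole iterable of rows, so the final loop could be collapsed into a single call. A minimal equivalent sketch:

# same output as the loop above: write every row collected in all_data at once
with open("test.csv", "wt+", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(all_data)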

To make reading the table rows even simpler, you can also use pandas:

import requests
from bs4 import BeautifulSoup
import pandas as pd


urls = [
'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=750300360&editable_length=10',
'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=030780118&editable_length=10',
'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=620103432&editable_length=10'
]

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0'}

all_data = []
for url in urls:
    page = requests.get(url, headers=headers)

    soup = BeautifulSoup(page.content, "html.parser")
    table = soup.findAll("table", {"class":"table"})[0]
    
    df_table = pd.read_html(str(table))[0]
    
    #add a column with additional info
    df_table['hit'] = soup.find("span", {"class":"c"}).text.strip() 
    
    #store the table in a list of tables
    all_data.append(df_table)

#concat the tables and export them to csv
pd.concat(all_data).to_csv('test.csv',index=False)
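If you don't need the extra 'hit' column, a further simplification is to let pandas parse the table straight from the downloaded HTML and skip BeautifulSoup entirely. This is just a sketch, reusing the urls and headers lists defined above and assuming each page's table keeps the class "table":

import requests
import pandas as pd

# urls and headers are the same lists defined in the snippets above
all_data = []
for url in urls:
    page = requests.get(url, headers=headers)
    # read_html returns one DataFrame per matching <table>;
    # attrs={"class": "table"} mirrors soup.findAll("table", {"class": "table"})
    all_data.append(pd.read_html(page.text, attrs={"class": "table"})[0])

pd.concat(all_data).to_csv('test.csv', index=False)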
