Python BS4:仅根据表头从 Wikipedia 表格中提取子表,并保存为字典

import re

import requests
from bs4 import BeautifulSoup

# Download the German Wikipedia article on Stuttgart and parse the HTML.
r = requests.get("https://de.wikipedia.org/wiki/Stuttgart")
soup = BeautifulSoup(r.text, "html.parser")

# Find the header cell whose text matches "Basisdaten" and walk up to the
# table that contains it.  ``re`` must be imported for this lookup; without
# the import the call fails with NameError.
# NOTE(review): recent Beautiful Soup releases name this argument
# ``string=`` instead of ``text=`` -- confirm against the installed version.
soup.find('th', text=re.compile('Basisdaten')).find_parent('table')

def extractDict(y):
    """Return label/value pairs from an infobox table as a dict.

    Looks up the first ``th`` with ``colspan="2"`` in *y*, takes the table
    that contains it, and reads a fixed slice of that table's ``td`` cells.
    Consecutive cells are paired up: even positions become keys, odd
    positions become values.

    The cell list is built locally on every call, so repeated calls do not
    accumulate cells from earlier calls.

    Args:
        y: A parsed BeautifulSoup document (or tag) to search in.

    Returns:
        dict mapping the cleaned label text to the cleaned value text.
    """
    # NOTE(review): the [3:35] window is hard-coded; presumably it brackets
    # the wanted rows of this particular page -- verify if the page changes.
    cells = y.find("th", {"colspan": "2"}).find_parent('table').select('td')[3:35]
    # Strip whitespace, non-breaking spaces, colons and the "[1]" footnote
    # marker from every cell.
    data = [
        cell.text.strip().replace('\xa0', '').replace(':', '').replace('[1]', '')
        for cell in cells
    ]
    return dict(zip(data[::2], data[1::2]))


basisdaten = extractDict(soup)
basisdaten

{'Adresse derStadtverwaltung': 'Marktplatz 170173 Stuttgart', 'Bevölkerungsdichte': '3029Einwohner je km2', 'Bundesland': 'Baden-Württemberg', 'Einwohner': '628.032 (31.Dez.2016)', 'Fläche': '207,35km2', 'Gemeindeschlüssel': '08111000', 'Höhe': '247m ü.NHN', 'Kfz-Kennzeichen': 'S', 'LOCODE': 'DE STR', 'NUTS': 'DE111', 'Oberbürgermeister': 'Fritz Kuhn (Bündnis 90/Die Grünen)', 'Postleitzahlen': '70173–70619', 'Regierungsbezirk': 'Stuttgart', 'Stadtgliederung': '23 Stadtbezirkemit 152 Stadtteilen', 'Vorwahl': '0711', 'Webpräsenz': 'www.stuttgart.de'}

1条回答

网友

1楼 · 发布于 2024-09-30 07:32:28

这样应该可行:

from bs4 import BeautifulSoup
import requests

def clean_data(cell):
    """Return the cell's text stripped of whitespace, NBSPs and colons."""
    return cell.get_text().strip().replace('\xa0', '').replace(':', '')


# Fetch the article and collect every row of the table(s) whose id
# contains "Infobox".
html = requests.get("https://de.wikipedia.org/wiki/Stuttgart").text
soup = BeautifulSoup(html, "lxml")
trs = soup.select('table[id*="Infobox"] tr')

is_in_basisdaten = False
data = {}
for tr in trs:
    if tr.th:
        # get_text() always returns a str, whereas .string is None as soon
        # as the header cell contains nested markup, which would make the
        # "in" test raise TypeError.
        header = tr.th.get_text()
        if "Basisdaten" in header:
            is_in_basisdaten = True
        elif is_in_basisdaten:
            # The next header row after "Basisdaten" ends the section.
            break
    elif is_in_basisdaten:
        cells = tr.select('td')
        # Only label/value rows are wanted; skip rows with any other
        # number of cells instead of failing on the unpacking below.
        if len(cells) != 2:
            continue
        key, val = cells
        data[clean_data(key)] = clean_data(val)

print(data)

相关问题更多 >

编程相关推荐

热门问题

热门文章