from bs4 import BeautifulSoup
import urllib.request
import re
# What you want to check: case-insensitive regex fragments of the column
# headers whose values should be printed.
Iwant = ['nitrogen', 'update']
# Case-insensitive regex fragment of the site name (first column of a row).
about = 'Edinburgh'


def wanted_columns(header_texts, wanted):
    """Return ``{column number: stripped header text}`` for wanted headers.

    A header is kept when any pattern in *wanted* matches it, ignoring case.
    Column numbers are looked up by name because the table layout may change.
    """
    # Flags are passed explicitly: a trailing inline '(?iu)' is rejected by
    # Python 3.11+, which requires global flags at the start of the pattern.
    patterns = [re.compile(pattern, re.IGNORECASE) for pattern in wanted]
    return {
        colnum: text.strip()
        for colnum, text in enumerate(header_texts)
        if any(pattern.search(text) for pattern in patterns)
    }


def clean_cell(text):
    """Strip a cell's text, normalise non-breaking spaces and drop the graph link label."""
    return text.strip().replace(u'\xa0', u' ').replace(u'\n Timeseries Graph', u'')


def row_values(row_content, site, columns):
    """Return ``[(header, value), ...]`` for a row whose first cell matches *site*.

    Returns an empty list when the row has no cells or is about another site.
    Columns that fall outside the row (e.g. rows using colspan) are skipped.
    """
    if not row_content or not re.search(site, row_content[0], re.IGNORECASE):
        return []
    return [
        (header, row_content[colnum])
        for colnum, header in columns.items()
        if colnum < len(row_content)
    ]


def main():
    """Fetch the current-levels page and print the wanted measures for ``about``."""
    # A User-Agent is sent because the server may answer 403 to blank ones
    # (presumably - confirm against the live site); the timeout prevents the
    # script from hanging forever on a stalled connection.
    request = urllib.request.Request(
        'https://uk-air.defra.gov.uk/latest/currentlevels?view=region',
        headers={'User-Agent': 'Mozilla/5.0'},
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        htmlData = response.read()
    soup = BeautifulSoup(htmlData, 'html5lib')
    tables = soup.find_all('table', attrs={'class': 'current_levels_table'})

    for table in tables:
        table_head = table.find('thead')
        table_body = table.find('tbody')
        if table_head is None or table_body is None:
            continue
        # The measure names are in the second header row.
        headrows = table_head.find_all('tr')
        if len(headrows) < 2:
            continue
        # Built fresh for every table: column numbers are table-specific.
        index = wanted_columns(
            [measure.text for measure in headrows[1].find_all('th')], Iwant)

        # Get the table content and look for the wanted site.
        for row in table_body.find_all('tr'):
            rowContent = [clean_cell(cel.text) for cel in row.find_all('td')]
            for measurewanted, value in row_values(rowContent, about, index):
                print(measurewanted, ':', value)


if __name__ == '__main__':
    main()
import requests
from bs4 import BeautifulSoup
# Request the page. A non-blank User-Agent is sent to prevent 403 Forbidden,
# and the timeout keeps the script from hanging forever on a stalled connection.
page = requests.get(
    url='https://uk-air.defra.gov.uk/latest/currentlevels',
    headers={'User-Agent': 'Not blank'},
    timeout=30,
)
# Fail loudly on an HTTP error instead of silently parsing an error page.
page.raise_for_status()

# Get html from page
html = page.text

# BeautifulSoup object
soup = BeautifulSoup(html, 'html5lib')

# Print all tables on the page
for table in soup.find_all('table'):
    print(table)
将页面中的所有 HTML 表格抓取到一个表格列表中。表格的索引可能会变化,因此不应依赖固定的行/列索引。脚本中有一部分会查找所需数据所在列的索引。此外,它还会打印表头名称,这样您就能知道获得的是什么数据。
利用 d2718nis 的建议,您可以这样做。当然,也有很多其他方法可行。
首先,找到包含“Edinburgh St Leonards”(爱丁堡圣伦纳德)文本的链接。然后找到该链接元素的祖父元素,它是一个 `tr` 元素。接着找出这个 `tr` 中的所有 `td` 元素。检查表格时,您会看到想要的列是第 4 列和第 7 列;按从 0 开始的索引,它们在所有 `td` 元素中分别是第 3 个和第 6 个。最后,输出这些元素的原始文本。您还需要做一些额外处理,才能从这些结果中提取出可读的字符串。
你可以从这个开始:
相关问题 更多 >
编程相关推荐