从html页面的列表元素中筛选数据

from BeautifulSoup import BeautifulSoup
import requests
import re

# One dictionary intended to receive the scraped name, url and email values.
person_dict = {}

# ..... <snip> <snip> .....
# NOTE(review): the author elided code here; presumably it performs the
# request that defines `response` used below -- confirm against the full file.

soup = BeautifulSoup(response.text)
div = soup.find('div', {'id': 'object-a'})
ul = div.find('ul', {'id': 'object-a-1'})
li_a = ul.findAll('a', {'class': 'title'})
li_p = ul.findAll('p', {'class': 'url word'})
li_po = ul.findAll('p')

# Every loop below writes to one fixed key of the same dictionary, so each
# iteration replaces the value stored by the previous one.
for a in li_a:
    nametemp = a.text
    name = (nametemp.split(' - ')[0])
    person_dict.update({'Name': name})  #I attempted updating

for lip in li_p:
    person_dict['url'] = lip.text  #I attempted adding directly

for email in li_po:
    reg_emails = re.compile('[a-zA-Z0-9.]*' + '@')
    person_dict['email'] = reg_emails.findall(email.text)

# Single-argument parenthesised print gives the same output on Python 2.
print(person_dict)  # results in 1 entry being returned

2条回答

网友

1楼 · 编辑于 2024-05-17 05:06:21

是否需要使用字典取决于您自己，但是如果您选择使用字典，则最好为每个列表项单独使用一个字典，而不是为所有条目共用一个字典。

我建议你把所有的条目都存储在一个列表中。下面的代码展示了两种做法：要么使用元组（tuple）来存储每个条目的各项信息，要么使用字典。

如果您只想显示信息或将其写入文件，tuple解决方案会更快。

# The same scraped records are kept in two interchangeable shapes so the
# approaches can be compared: a list of tuples and a list of dictionaries.
entries_tuples = []
entries_dictionary = []

soup = BeautifulSoup(text)

# Narrow the search to the <ul id="object-a-1"> inside <div id="object-a">.
container = soup.find('div', {'id': 'object-a'})
listing = container.find('ul', {'id': 'object-a-1'})

for item in listing.findAll('li'):
    link = item.find('a', {'class': 'title'})
    record = {
        'url_href': link.get('href'),
        'person': link.text,
        'url_word': item.find('p', {'class': 'url word'}).text,
        # The second <p> of the item is searched; findall keeps every hit,
        # which allows for multiple emails per entry.
        'emails': re.findall(r'\s+(\S+@\S+)(?:\s+|\Z)', item.findAll('p')[1].text, re.M),
    }

    entries_dictionary.append(record)
    entries_tuples.append(
        (record['url_href'], record['person'], record['url_word'], record['emails'])
    )

# One shared layout for both listings: three padded columns, then the emails.
row_layout = '{:25} {:10} {:25} {}'

for row in entries_tuples:
    print(row_layout.format(*row))

# Blank separator line between the two listings.
print('')

for record in entries_dictionary:
    print(row_layout.format(
        record['url_href'], record['person'], record['url_word'], record['emails']
    ))

对于示例HTML，将显示以下内容：

^{pr2}$

注意，从文本中提取电子邮件地址本身就是一个独立的难题。上面的解决方案很容易匹配到实际上格式并不规范的电子邮件地址，但在这里已经足够。

网友

2楼 · 编辑于 2024-05-17 05:06:21

你可能走错了路。试试这样的方法：

from BeautifulSoup import BeautifulSoup
import re

# Parse the saved page. The ``with`` block closes the file handle once
# BeautifulSoup has been constructed from it, instead of leaving it open.
with open('soup.html') as text:  # You are opening the file differently
    soup = BeautifulSoup(text)
list_items = soup.findAll('li')

# Compiled once, outside the loop, because the pattern never changes.
email_pattern = re.compile(r'[\w\.-]+@[\w\.-]+')

# One dictionary per list item, collected in a list.
people = []

for item in list_items:
    name = item.find('a', {'class': 'title'}).text
    url = item.find('p', {'class': 'url word'}).text
    # The second <p> of the item is the one searched for an address.
    email_text = item.findAll('p')[1].text
    match = email_pattern.search(email_text)
    # search() returns None when nothing matches; store None for that entry
    # rather than failing on match.group(0).
    email = match.group(0) if match else None

    person = {'name': name, 'url': url, 'email': email}
    people.append(person)

# Single-argument parenthesised print gives the same output on Python 2.
print(people)

相关问题更多 >

编程相关推荐

热门问题

热门文章