使用BeautifulSoup从博客归档页面中提取多个帖子（不含脚本内容）

import urllib2
from bs4 import BeautifulSoup
import re

# Read a list of saved HTML file paths (one per line) from hafiles.txt and
# append the <p> elements found in each page's post body to haposts.txt.
# Both files are managed by a single `with`, so they are closed even when a
# page fails to open or parse part-way through the list.
with open("hafiles.txt", "r") as file_list, open("haposts.txt", "w") as posts_file:
    for indurl in file_list:
        indurl = indurl.rstrip("\n")
        if not indurl:
            # A blank line (e.g. a trailing newline in the list) is not a path.
            continue

        with open(indurl, "r") as ha_file:
            # Name the parser explicitly; otherwise Beautiful Soup picks
            # whichever parser is installed and results can differ between
            # machines.
            soup_ha = BeautifulSoup(ha_file, "html.parser")

        # This is the way it looks on the page: <div class='post-body'>.
        # The second find (find_all("p")) gets rid of the "sociable" markup.
        # NOTE: find() returns only the first matching div on the page.
        posts = soup_ha.find("div", class_="post-body").find_all("p")

        # Tried a trick seen on
        # http://stackoverflow.com/questions/24458353/cleaning-text-string-after-getting-body-text-using-beautifulsoup
        # -- no joy:
        # posts = soup_ha.find("div", class_="post-body")
        # text = [''.join(s.findAll(text=True))for s in posts.findAll('p')]

        # NOTE: str(posts) is the repr of the result list, tags included.
        text = str(posts) + "\n" + "\n"
        posts_file.write(text)

print("All done!")

1条回答

网友

1楼 · 发布于 2024-10-03 00:23:41

下面是一个包含多个帖子的单页示例：

from bs4 import BeautifulSoup


# Parse the saved page. The context manager closes the file handle as soon as
# parsing is done, and the parser is named explicitly so the result does not
# depend on which parsers happen to be installed.
with open('test.html') as html_file:
    soup = BeautifulSoup(html_file, 'html.parser')

# Build one dict per <div class="post"> found on the page.
posts = []
for post in soup.find_all('div', class_='post'):
    title = post.find('h3', class_='post-title').text.strip()
    # Drop the "Posted by" label so that only the author's name remains.
    author = post.find('span', class_='post-author').text.replace('Posted by', '').strip()
    # Only the first <p> inside the post body is used as the content.
    content = post.find('div', class_='post-body').p.text.strip()
    # The date is read from the <h2 class="date-header"> sibling that
    # precedes the post's div.
    date = post.find_previous_sibling('h2', class_='date-header').text.strip()

    posts.append({'title': title,
                  'author': author,
                  'content': content,
                  'date': date})

# Function-call form works on Python 3 and prints the same on Python 2.
print(posts)

对于您发布的html，它将打印：

（原文此处的输出示例在转载时丢失。）

相关问题更多 >

编程相关推荐

热门问题

热门文章