BeautifulSoup 递归解析数据并在输出时保持原有结构

#!/usr/bin/env python import sys import urllib2 from pprint import pprint from bs4 import BeautifulSoup def dataList(element): categoryList = [] try: for ul in categorySoup('ul', recursive=True): for li in ul('li', recursive=True): categoryList.append(li.a.contents) categoryList.append("new ccategory"); return categoryList except: return ['broken!'] categories = ['20081', '550', '2984', '267', '12576', '625', '15032', '11450', '11116', '1', '58058', '293', '14339', '237', '11232', '45100', '99', '172008', '26395', '11700', '281', '11233', '619', '1281', '870', '10542', '316', '888', '64482', '260', '1305', '220', '3252', '1249'] print "\nSetting user agent...", user_agent = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.63 Safari/534.3' print "DONE" print "Setting headers...", headers = { 'User-Agent' : user_agent } print "DONE" data = {} print "Iterating through dictionary of categories\n" for rootID in categories: print "Requesting source code...", url = 'http://www.isoldwhat.com/getcats/fullcategorytree.php?RootID=%s' % rootID req = urllib2.Request(url, None, headers) response = urllib2.urlopen(req) print "DONE" print "Turning HTML into soup..." text = response.read() soup = BeautifulSoup(text, 'html.parser') categorySoup = soup.find('div', id='catnumbers') print "DONE" print "Parsing data...", pprint(dataList(categorySoup)) print "DONE\n" response.close() # its always safe to close an open connection sys.exit() print "Turning data into JSON...", #data = find_li(soup) data = json.dumps(data, ensure_ascii=False) print "DONE\n" print "Finished doing. Enjoy!"

1条回答

网友

1楼 · 发布于 2024-09-30 22:18:42

例如，可以用 soup = BeautifulSoup("<b></b>") 创建一个新的根节点，然后只递归地追加类别和标签，同时保持与原文档相同的结构。可以参考下面的写法：

def getCategory(root, new_tag=None):
    """Return a text-free skeleton copy of ``root`` that keeps its tag nesting.

    A new element with the same name as ``root`` is created, and the same is
    done recursively for every child tag, so the result mirrors the structure
    of the original tree. Text children are skipped. The original tree is not
    modified.

    Args:
        root: The tag to copy; must expose ``name`` and ``contents``.
        new_tag: Optional factory called as ``new_tag(name)`` to build each
            new element, e.g. ``soup.new_tag``. When omitted, the factory of a
            fresh, empty BeautifulSoup document is used.

    Returns:
        The newly built element corresponding to ``root``.
    """
    if new_tag is None:
        # new_tag() lives on the BeautifulSoup object, not on individual
        # tags, so a throwaway document supplies the factory.
        from bs4 import BeautifulSoup
        new_tag = BeautifulSoup("", "html.parser").new_tag

    # An element without children simply becomes an empty element of the same
    # name. Other information (such as the category) could be stored here.
    skeleton = new_tag(root.name)
    for child in root.contents:
        # Text nodes carry no tag name and have no children to recurse into.
        if getattr(child, "name", None) is None:
            continue
        skeleton.append(getCategory(child, new_tag))
    return skeleton

希望这能解决问题：）

相关问题更多 >

编程相关推荐

热门问题

热门文章