用python解析无序HTML页面的最佳方法是什么？

# Fetch one USPTO full-text patent page and print selected bibliographic
# fields.  Fields are located by searching the page's flattened text nodes for
# fixed label strings, so a label that is missing from the page makes the
# corresponding list.index() call raise ValueError.
import requests
from bs4 import BeautifulSoup

PTiD = 7680560
url = "http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO1&Sect2=HITOFF&d=PALL&p=1&u=/netahtml/PTO/srchnum.htm&r=1&f=G&l=50&s1=" + str(PTiD) + ".PN.&OS=PN/" + str(PTiD) + "&RS=PN/" + str(PTiD)
# The old `prefetch=True` keyword is not passed: current requests releases
# reject it, and the response body is downloaded eagerly by default.
res = requests.get(url)
raw_html = res.content
print("Parser Started.. ")
bs_html = BeautifulSoup(raw_html, "lxml")

# Initialize all the Search Lists
fonts = bs_html.find_all('font')
para = bs_html.find_all('p')
bs_text = bs_html.find_all(text=True)
# Drop text nodes that are exactly a newline or a single space.
onlytext = [x for x in bs_text if x != '\n' and x != ' ']

# Initialize the Indexes (position of each label inside onlytext)
AppNumIndex = onlytext.index('Appl. No.:\n')
FiledIndex = onlytext.index('Filed:\n ')
InventorsIndex = onlytext.index('Inventors: ')
AssigneeIndex = onlytext.index('Assignee:')
ClaimsIndex = onlytext.index('Claims')
DescriptionIndex = onlytext.index(' Description')
CurrentUSClassIndex = onlytext.index('Current U.S. Class:')
CurrentIntClassIndex = onlytext.index('Current International Class: ')
PrimaryExaminerIndex = onlytext.index('Primary Examiner:')
AttorneyOrAgentIndex = onlytext.index('Attorney, Agent or Firm:')
RefByIndex = onlytext.index('[Referenced By]')

# ~~Title~~
# Start from an empty title so the print below cannot hit an unbound name
# when no <font size="+1"> element exists.
d_title = ''
for a in fonts:
    # Tag.get() returns None for a missing attribute, replacing the
    # deprecated Tag.has_key() check.
    if a.get('size') == '+1':
        d_title = a.string
print("title: " + d_title)

# ~~Abstract~~~
d_abstract = para[0].string
print("abstract: " + d_abstract)

# ~~Assignee Name~~
d_assigneeName = onlytext[AssigneeIndex + 1]
print("as name: " + d_assigneeName)

# ~~Application number~~
d_appNum = onlytext[AppNumIndex + 1]
print("ap num: " + d_appNum)

# ~~Application date~~
d_appDate = onlytext[FiledIndex + 1]
print("ap date: " + d_appDate)

# ~~ Patent Number~~
d_PatNum = onlytext[0].split(':')[1].strip()
print("patnum: " + d_PatNum)

# ~~Issue Date~~
# NOTE(review): relies on the issue date being the 11th text node -- confirm
# against other patent pages.
d_IssueDate = onlytext[10].strip('\n')
print("issue date: " + d_IssueDate)

# ~~Inventors Name~~
# Between the two labels the code reads alternating entries: names at the odd
# offsets, "(city, state)" locations at the even offsets.
d_InventorsName = ''.join(onlytext[InventorsIndex + 1:AssigneeIndex:2])
print("inv name: " + d_InventorsName)

# ~~Inventors City~~
# NOTE(review): cities are concatenated without a separator, unlike the states
# below -- confirm whether a ',' was intended.
d_InventorsCity = ''.join(
    location.split(',')[0].strip().strip('(')
    for location in onlytext[InventorsIndex + 2:AssigneeIndex:2]
)
d_InventorsCity = d_InventorsCity.strip(',').strip().strip(')')
print("inv city: " + d_InventorsCity)

# ~~Inventors State~~
d_InventorsState = ','.join(
    location.split(',')[1].strip(')').strip()
    for location in onlytext[InventorsIndex + 2:AssigneeIndex:2]
)
d_InventorsState = d_InventorsState.strip(',').strip()
print("inv state: " + d_InventorsState)

# ~~ Assignee City / State ~~
# The location is parsed the same way as for inventors: the part before the
# comma (after the opening parenthesis) is the city, the part after it is the
# state.  The two indexes were previously swapped.
assignee_location = onlytext[AssigneeIndex + 2].split(',')
d_AssigneeCity = assignee_location[0].strip('\n').strip().strip('(')
print("asign city: " + d_AssigneeCity)
d_AssigneeState = assignee_location[1].strip().strip('\n').strip(')')
print("asign state: " + d_AssigneeState)

# ~~Current US Class~~
# Uses CurrentUSClassIndex; the previous spelling "CuurentUSClassIndex" was an
# undefined name.
d_CurrentUSClass = ''.join(onlytext[CurrentUSClassIndex + 1:CurrentIntClassIndex])
print("cur us class: " + d_CurrentUSClass)

# ~~ Current Int Class~~
d_CurrentIntlClass = onlytext[CurrentIntClassIndex + 1]
print("cur intl class: " + d_CurrentIntlClass)

# ~~~Primary Examiner~~~
d_PrimaryExaminer = onlytext[PrimaryExaminerIndex + 1]
print("prim ex: " + d_PrimaryExaminer)

# ~~d_AttorneyOrAgent~~
d_AttorneyOrAgent = onlytext[AttorneyOrAgentIndex + 1]
print("agent: " + d_AttorneyOrAgent)

# ~~ Referenced by ~~
# The accumulator must exist before the first "+=".  Slicing (instead of
# indexing up to RefByIndex + 400) cannot run past the end of the list.
d_ReferencedBy = ''
for text in onlytext[RefByIndex + 2:RefByIndex + 400]:
    if 'Foreign' in text or 'Primary' in text:
        break
    d_ReferencedBy += text
print("ref by: " + d_ReferencedBy)

# ~~Claims~~
d_Claims = ''.join(onlytext[ClaimsIndex:DescriptionIndex])
print("claims: " + d_Claims)

2条回答

网友

1楼 · 编辑于 2024-10-01 13:33:18

我认为最简单的解决方案是使用pyquery库 http://packages.python.org/pyquery/api.html

您可以使用jQuery选择器来选择页面中的元素。

网友

2楼 · 编辑于 2024-10-01 13:33:18

如果您使用BeautifulSoup，并且有DOM `<p>123</p>`，调用`find_all(text=True)`，那么您将得到`['123']`

但是，如果您有DOM `<p>12<b>3</b></p>`，它与前面的语义相同，但是BeautifulSoup将给您`['12','3']`

也许您可以准确地找到阻碍您得到['123']的标记，然后先忽略/消除该标记。

一些关于如何消除`<b>`标记的示例代码

import re
# Strip <b> / </b> tags (with or without attributes) from an HTML snippet so
# that the text they split apart becomes contiguous again.
html = '<p>12<b>3</b></p>'
# A raw string avoids invalid escape sequences.  Requiring either '>' or
# whitespace right after the "b" keeps other tags that merely start with "b"
# (<br>, <body>, <blockquote>, ...) from being removed as well.
reExp = r'</?b(?:\s[^<>]*)?>'
print(re.sub(reExp, '', html))

对于正则表达式模式，可以使用以下写法：

^{pr2}$

相关问题更多 >

编程相关推荐

热门问题

热门文章