Python 和 BeautifulSoup 问题：删除 soup 对象中的空标签

# Load the cursor/recordset
myrecordset = mycursor.fetchall()

# Outer loop: one HTML page per record
for y in myrecordset:
    # NOTE(review): this format string has no placeholder, so "%" raises
    # TypeError for a plain string/number in y[2]; the real template
    # presumably contains "%s" -- confirm against the actual path.
    myfilepath = "myexample.html" % y[2]

    # Parse inside a context manager so the file handle is always closed.
    with open(myfilepath) as page_file:
        soup = BeautifulSoup(page_file, "html.parser")

    PageName = soup.find("h1", {"class": "topictitle1"})

    # Drop empty <dt class="dlterm"></dt> tags from the tree first; otherwise
    # they end up in FieldName and shift every later term against its data
    # type and description when the three lists are zipped together.
    # find_all() returns an already-built list, so decomposing tags while
    # looping over it is safe.
    for term_tag in soup.find_all("dt", {"class": "dlterm"}):
        if not term_tag.get_text(strip=True):
            term_tag.decompose()

    FieldName = soup.find_all("dt", {"class": "dlterm"})
    FieldDataType = soup.find_all("samp", {"class": "codeph"})
    FieldDesc = soup.find_all("dd", {"class": "ddexpand"})

    # inner loop
    innercounter1 = 0
    # zip walks the three lists in lockstep: term, data type, description
    for (fn, fdt, fd) in zip(FieldName, FieldDataType, FieldDesc):
        fntemp = fn.text
        fdttemp = fdt.text

        # clean the description string
        if 'One of:' in fd.text:
            # hold onto the double return while the single ones are replaced
            fdtemp = fd.text.replace('\n\n', '<<nn>>')
            fdtemp = fdtemp.replace('\n', ', ')
            fdtemp = fdtemp.replace('<<nn>>', '\n')
        else:
            fdtemp = fd.text.replace('\n', ' ')
        fdtemp = fdtemp.strip()

        # remove all redundant spaces from the string
        # NOTE(review): split() also swallows the '\n' restored above, so the
        # preserved paragraph break does not survive -- confirm this is intended.
        fdtemp = " ".join(fdtemp.split())

        # have to escape single quotes in text so it will insert correctly
        # NOTE(review): a parameterized query would make this manual escaping
        # unnecessary; the INSERT itself is outside this snippet.
        fdtemp = fdtemp.replace("'", "''")

        #Insert into SQL
        # ... code continued

<div class="section"> <h2 class="sectiontitle">Title</h2> <dl> <dt class="dlterm">Term1</dt><dd><samp class="codeph">nonNegativeInteger</samp></dd><dd class="ddexpand">Blah blah blah about term1</dd> <dt class="dlterm">Term2</dt><dd><samp class="codeph">nonNegativeInteger</samp></dd><dd class="ddexpand">Blah blah blah about term2</dd> <dt class="dlterm"></dt><dt class="dlterm">Term3</dt><dd><samp class="codeph">nonNegativeInteger</samp></dd><dd class="ddexpand">Blah blah about term3</dd> </dl></div>

1条回答

网友
1楼 · 发布于 2024-10-04 11:33:06

使用 decompose() 就足以解决您的问题。
# Remove empty <dt class="dlterm"> tags from a parsed HTML fragment and
# print the cleaned document.
from bs4 import BeautifulSoup

# Sample markup: the third row is preceded by an empty <dt class="dlterm">.
html = (
    ' <div class="section"> <h2 class="sectiontitle">Title</h2> <dl> '
    '<dt class="dlterm">Term1</dt>'
    '<dd><samp class="codeph">nonNegativeInteger</samp></dd>'
    '<dd class="ddexpand">Blah blah blah about term1</dd> '
    '<dt class="dlterm">Term2</dt>'
    '<dd><samp class="codeph">nonNegativeInteger</samp></dd>'
    '<dd class="ddexpand">Blah blah blah about term2</dd> '
    '<dt class="dlterm"></dt>'
    '<dt class="dlterm">Term3</dt>'
    '<dd><samp class="codeph">nonNegativeInteger</samp></dd>'
    '<dd class="ddexpand">Blah blah about term3</dd> '
    '</dl></div> '
)

soup = BeautifulSoup(html, 'html.parser')

# find_all() returns an already-built list, so removing tags from the tree
# while looping over the result is safe.
for tag in soup.find_all('dt', attrs={"class": "dlterm"}):  # all dt tags with class dlterm
    # get_text(strip=True) treats whitespace-only tags as empty too,
    # not just tags with no text at all.
    if not tag.get_text(strip=True):
        tag.decompose()  # remove the tag from the tree and destroy it

print(soup)
输出
<div class="section"> <h2 class="sectiontitle">Title</h2> <dl> <dt class="dlterm">Term1</dt><dd><samp class="codeph">nonNegativeInteger</samp></dd><dd class="ddexpand">Blah blah blah about term1</dd> <dt class="dlterm">Term2</dt><dd><samp class="codeph">nonNegativeInteger</samp></dd><dd class="ddexpand">Blah blah blah about term2</dd> <dt class="dlterm">Term3</dt><dd><samp class="codeph">nonNegativeInteger</samp></dd><dd class="ddexpand">Blah blah about term3</dd> </dl></div>

相关问题更多 >

编程相关推荐

热门问题

热门文章