如何使用 Beautiful Soup 抓取标签属性列表？

from bs4 import BeautifulSoup

# Read the whole XML document up front; the context manager closes the file.
with open("new2.xml", "r") as file:
    url = file.read()

# html.parser lower-cases tag names, so <Opinion> elements are matched
# as "opinion" below.
soup = BeautifulSoup(url, "html.parser")
required0 = soup.find_all("sentences")

text = []   # plain text of every <sentence> that carries an id
text1 = []  # one list of (sentence id, opinions) pairs per <sentences> element
for i in required0:
    sent_id = []
    category = []
    # Search inside the current <sentences> element only; searching the whole
    # soup here would repeat every sentence once per <sentences> element.
    for sentences in i.find_all("sentence"):
        # Skip sentences without an id so sent_id and category stay aligned
        # when they are zipped together below.
        if not sentences.has_attr("id"):
            continue
        sent_id.append(sentences["id"])
        text.append(sentences.get_text())
        # One (category, polarity, target) tuple per opinion of THIS sentence;
        # a missing attribute becomes None instead of dropping the whole tuple.
        cat_list = [
            (
                opinions.get("category"),
                opinions.get("polarity"),
                opinions.get("target"),
            )
            for opinions in sentences.find_all("opinion")
        ]
        category.append(cat_list)
    # Pair each sentence id with its own opinion list (the original zipped
    # `category` with itself, duplicating the same data in every tuple).
    catlist = list(zip(sent_id, category))
    text1.append(catlist)

1条回答

网友

1楼 · 发布于 2024-09-30 07:33:01

下面是我用来提取每个句子的 id 及其所有 Opinion 属性的解决方案，它是在您的代码基础上修改而来的。

from bs4 import BeautifulSoup

data = """<sentences>    
<sentence id="1126814:0">
<Opinions>
    <Opinion target="Leon" category="RESTAURANT#GENERAL" polarity="positive" from="0" to="4"/>
    <Opinion target="Leon" category="AMBIENCE#GENERAL" polarity="positive" from="0" to="4"/>
    <Opinion target="specials" category="FOOD#QUALITY" polarity="positive" from="95" to="103"/>
    <Opinion target="atmosphere" category="AMBIENCE#GENERAL" polarity="positive" from="123" to="133"/>
    <Opinion target="French bistro fare" category="FOOD#QUALITY" polarity="positive" from="70" to="88"/>
    </Opinions>
</sentence>"""

# Parse the document. html.parser lower-cases tag names, which is why the
# <Opinion> elements are looked up as "opinion" below.
soup = BeautifulSoup(data, "html.parser")

# Attributes collected from each opinion, in output order.
attr_order = ("category", "polarity", "target")

# One tuple per <sentence>: its id (when present) followed by one list of
# attribute values per nested opinion.
result = []
for sentence_tag in soup.find_all("sentence"):
    row = [sentence_tag["id"]] if sentence_tag.has_attr("id") else []
    for opinion_tag in sentence_tag.find_all("opinion"):
        # Attributes that are absent are skipped, so a list may hold fewer
        # than three values.
        row.append(
            [opinion_tag[name] for name in attr_order if opinion_tag.has_attr(name)]
        )
    result.append(tuple(row))
print(result)

结果

[('1126814:0', ['RESTAURANT#GENERAL', 'positive', 'Leon'], ['AMBIENCE#GENERAL', 'positive', 'Leon'], ['FOOD#QUALITY', 'positive', 'specials'], ['AMBIENCE#GENERAL', 'positive', 'atmosphere'], ['FOOD#QUALITY', 'positive', 'French bistro fare'])]

请告诉我这是否是您想要的结果格式。

相关问题更多 >

编程相关推荐

热门问题

热门文章