删除 Beautiful Soup 字符串中的重音符号

def strip_accents_1(text):
    """Return *text* with accents removed, keeping only ASCII characters.

    NFD normalization splits each accented letter into a base letter plus
    combining marks; encoding to ASCII with ``'ignore'`` then drops the
    marks together with any other non-ASCII character.
    """
    text = unicodedata.normalize('NFD', text)\
        .encode('ascii', 'ignore')\
        .decode("utf-8")
    return str(text)


def strip_accents_2(text):
    """Return *text* reduced to ASCII after NFKD (compatibility) decomposition.

    Same approach as ``strip_accents_1`` but with the NFKD form, and with
    ``'ignore'`` also passed to the final decode.
    """
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')


# This works correctly
print(strip_accents_1('Jesús'))


def html_imp(h):
    """Collect selected ``<td>`` texts from the HTML file *h* and strip accents.

    Returns (and prints) a three-element list: the fourth selected value as
    read from the file, followed by the results of ``strip_accents_1`` and
    ``strip_accents_2`` applied to it.
    """
    # NOTE(review): the file is opened without an explicit encoding, so the
    # platform default text encoding is used to decode it.
    soup = BeautifulSoup(open(h), features = "lxml")
    tdTags = []
    values =[]
    sort = []
    # NOTE(review): the outer loop variable is shadowed by the inner one, so
    # the full list of <td> texts is simply appended once per outer pass.
    for i in range(4,40):
        for i in soup.find_all('td'):
            tdTags.append(i.text)
    # NOTE(review): loop nesting was reconstructed from a snippet that had
    # lost its line breaks; this loop is assumed to be at function level.
    for i in [7,25,9,15,5,11]:
        values.append(tdTags[i])
    # Original name with accent
    sort.append(values[3])
    # Strip accents
    sort.append(strip_accents_1(values[3]))
    sort.append(strip_accents_2(values[3]))
    print(sort)
    return sort

2条回答

网友

1楼 · 编辑于 2024-05-19 20:12:18

我知道你可能不想再额外安装一个软件包

但我发现 Gensim 有一个很棒的去重音工具（accent remover），效果非常好：

# deaccent comes from gensim.utils; the surrounding answer describes it as
# an accent remover.
from gensim.utils import deaccent

# Example call on a sentence containing accented letters.
deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
# NOTE(review): the next line appears to be the result displayed for the call
# above, shown after a ">>>" marker; it is not itself runnable Python.
>>> u'Sef chomutovskych komunistu dostal postou bily prasek'

事实证明，它的源代码非常简单，只有几行，并且只用到了 unicodedata。也许你可以直接把它复制过来，源代码可以在这里（here）查看

网友

2楼 · 编辑于 2024-05-19 20:12:18

看起来问题的关键在于：您读取文件时使用的是 Python 的默认编码，而不是该文件本身的实际编码

为了调试这个问题，我简化了您的代码，希望它能说明核心问题：

import unicodedata
from bs4 import BeautifulSoup

def strip_accents(text):
    """Return *text* with accents removed, keeping only ASCII characters.

    The string is NFD-normalized so that accented letters become a base
    letter followed by combining marks; encoding to ASCII with ``'ignore'``
    discards those marks along with any other non-ASCII character.
    """
    return (
        unicodedata.normalize('NFD', text)
        .encode('ascii', 'ignore')
        .decode("utf-8")
    )
    
# A simplified version of your code
def html_imp_old(h):
    """Print the accent-stripped text of every ``<td>`` in the HTML file *h*.

    The file is opened without an ``encoding`` argument, so it is decoded
    with the platform's default text encoding; this variant exists to show
    what happens in that case.
    """
    # No encoding on purpose (see docstring). The context manager closes the
    # file handle once BeautifulSoup has been constructed from it.
    with open(h) as html_file:
        soup = BeautifulSoup(html_file, features = "lxml")
    tags = [strip_accents(tag.text) for tag in soup.find_all('td')]
    print(tags)

# Same as _old, just specifying an encoding when reading the file
def html_imp_new(h):
    """Print the accent-stripped text of every ``<td>`` in the HTML file *h*.

    The file is decoded explicitly as UTF-8 rather than with the platform's
    default text encoding.
    """
    # The context manager closes the file handle once BeautifulSoup has been
    # constructed from it.
    with open(h, encoding="utf-8") as html_file:
        soup = BeautifulSoup(html_file, features = "lxml")
    tags = [strip_accents(tag.text) for tag in soup.find_all('td')]
    print(tags)

# Keep the snippet self-contained: write a one-cell HTML fragment to disk,
# encoded as UTF-8.
sample_markup = "<TD WIDTH=\"80\" ALIGN=\"center\">Jes\u00fas</TD>\n"
with open("temp.html", mode="wt", encoding="utf-8") as out_file:
    out_file.write(sample_markup)

# Direct call on an in-memory string: works correctly, outputs "Jesus".
print(strip_accents('Jes\u00fas'))

# Reads the file with the default encoding: the author reports "JesAs" here
# and assumes the result is OS dependent.
html_imp_old("temp.html")

# Reads the file as UTF-8: works correctly, outputs "Jesus".
html_imp_new("temp.html")

相关问题更多 >

编程相关推荐

热门问题

热门文章