如何在NLTK（python）中处理中文？

# -*- coding: utf-8 -*- import nltk tparse = nltk.tree.Tree.fromstring import sys reload(sys) sys.setdefaultencoding('utf8') class cal_prob: def __init__(self): pass def input_dataset(self, path="CTB-auto-pos/"): trainfile = open(path+"train.txt", "r+") datas = trainfile.read().split("\n") for data in datas: data = unicode(data) # change them to unicode print data tree = tparse(data) print tree print unicode(str(tree)).decode("utf8") print unicode(str(tree)).encode("utf8") break # a = u"(IP \n (NP (NP (NR \u4e0a\u6d77) (NR \u6d66\u4e1c)) (NP (NN \u5f00\u53d1) (NP (CC \u4e0e) (NP (NN \u6cd5\u5236) (NN \u5efa\u8bbe))))) (VP (VV \u540c\u6b65)))" print a print a.decode("utf8") trainfile.close() a = cal_prob() a.input_dataset()

1条回答

网友

1楼 · 发布于 2024-09-26 21:57:16

下面是一个正确打开带编码文件的示例。不需要使用 reload(sys) 技巧（参见 https://anonbadger.wordpress.com/2015/06/16/why-sys-setdefaultencoding-will-break-code/），也不需要其他的编码/解码操作。

tree.pformat() 会按您期望的方式显示树：

import nltk
import io

# Read the bracketed-tree file and print every parsed tree twice: once by
# printing the Tree object itself and once through tree.pformat().
# io.open with an explicit encoding yields already-decoded text lines, so no
# further encode/decode step is needed afterwards.
with io.open('train.txt', encoding='utf8') as trainfile:
    for line in trainfile:
        # Skip blank lines: they contain no bracketed tree to parse.
        if not line.strip():
            continue
        # Parse the bracketed line into a Tree before printing it.
        tree = nltk.tree.Tree.fromstring(line)
        # Parenthesized single-argument prints behave the same as print
        # statements on Python 2 and as function calls on Python 3.
        print(tree)
        print('')
        print(tree.pformat())

输出：

（此处的输出示例在原文中缺失。）

相关问题更多 >

编程相关推荐

热门问题

热门文章