脚本经 cx_Freeze 打包后无法检测到文件中的汉字

def initialize(self):
    """Copy the original file to a UTF-8 working file and process it line by line.

    Every line of the working copy is passed through ``readfilebylines`` and
    the (re-joined) text is written to the output file.

    ``writefile``, ``readFileN`` and ``myconcat`` are expected to be defined by
    the elided set-up code marked below.
    """
    # several imports here
    # several filename operations here
    with codecs.open(writefile, "w", "utf-8") as writefileF:
        # copy the original to another with utf-8 encoding (to be safe)
        # NOTE(review): no encoding is given for the source file, so io.open
        # decodes it with the platform default; if that does not match the
        # file's real encoding the Chinese text is lost or the copy fails.
        needs_raw_copy = False
        with io.open(self.orig_filename) as source:
            with io.open(readFileN, mode='w', encoding='utf-8') as target:
                try:
                    shutil.copyfileobj(source, target)
                except (UnicodeError, EnvironmentError):
                    needs_raw_copy = True
        if needs_raw_copy:
            # Run the fallback only after ``target`` is closed, so its final
            # flush cannot overwrite the freshly copied file.
            print('trying single copy file with no metadata.. ')
            # NOTE(review): this reads ``self.filename`` while the copy above
            # reads ``self.orig_filename`` -- confirm which one is intended.
            shutil.copyfile(self.filename, readFileN)

        with codecs.open(readFileN, "r", "utf-8") as readFile:
            # generator func call
            for iterator in self.readfilebylines(readFile):
                endd = myconcat.join(iterator[0])
                writefileF.writelines(myconcat.join(endd))


def readfilebylines(self, myfileobj):
    """Yield ``(line, line_number)`` for every line of ``myfileobj``.

    Line numbers are zero-based. A notice is printed for each line in which
    ``regularexpmatch`` finds Chinese characters.
    """
    # Iterate the file object directly so the file is streamed instead of
    # being loaded in full, and let enumerate advance the line counter.
    for linenum, lines in enumerate(myfileobj):
        if self.regularexpmatch(lines):
            print('chinese word detected')
            # do translation
        yield lines, linenum


def regularexpmatch(self, mytext):
    """Return the first match of CJK ideographs (U+4E00-U+9FFF) in ``mytext``.

    Returns ``None`` when ``mytext`` contains no such character.
    """
    chinese_compile = re.compile(u'[\u4e00-\u9fff]+')
    return chinese_compile.search(mytext)

1条回答

网友

1楼 · 发布于 2024-10-01 04:53:26

我折腾了几个小时，终于找到了解决办法。问题在于：如果不指定原始文件的编码，脚本就无法正确地转换文件编码。

所以就我而言：

def copyfile_inUTF8(self,orig_file,copy_file):
    """Copy ``orig_file`` to ``copy_file``, re-encoding its content as UTF-8.

    The source encoding is guessed with ``chardet``; the detected encoding of
    both the original and the written copy is printed.

    Args:
        orig_file: path of the file to read.
        copy_file: path of the UTF-8 copy to create (overwritten if present).

    Raises:
        UnicodeDecodeError: if the content cannot be decoded with the
            detected (or fallback) encoding.
    """
    import chardet

    target_en = 'utf-8'
    # Read raw bytes: encoding detection needs the bytes exactly as stored.
    with open(orig_file, 'rb') as source:
        raw_data = source.read()
    # detect sourcefile encoding
    orig_en = chardet.detect(raw_data)['encoding']
    print('original file encoding: %s' % orig_en)
    # chardet reports None when it cannot guess (e.g. for an empty file);
    # fall back to the target encoding instead of failing on a None codec.
    decoded = raw_data.decode(orig_en or target_en)
    # Close the copy before re-reading it, otherwise the check below may see
    # an empty or partially flushed file.
    with open(copy_file, 'wb') as target:
        target.write(decoded.encode(target_en))
    with open(copy_file, 'rb') as written:
        copy_en = chardet.detect(written.read())['encoding']
    print('copy file encoding: %s' % copy_en)

使用这个函数，我可以检测出原始文件的编码，并将内容转换为目标编码（即 UTF-8）。

相关问题更多 >

编程相关推荐

热门问题

热门文章