解码使用CP437制表符编码的文件

# Convert a file stored in a single-byte code page to UTF-8 using a
# hand-written UTF-8 encoder.
#
# Inputs:  CP437.txt    - mapping file, one "<byte in decimal> <code point in hex>"
#                         pair per line
#          386intel.txt - the file to convert
# Output:  rez.txt      - the UTF-8 encoded result

# Bit layouts of the multi-byte UTF-8 sequences, keyed by sequence length.
# An 'x' marks a payload slot that receives one bit of the code point.
# https://stackoverflow.com/questions/6240055/manually-converting-unicode-codepoints-into-utf-8-and-utf-16
_UTF8_TEMPLATES = {
    2: "110xxxxx" "10xxxxxx",
    3: "1110xxxx" "10xxxxxx" "10xxxxxx",
    4: "11110xxx" "10xxxxxx" "10xxxxxx" "10xxxxxx",
}


def convertBinToHex(binary):
    """Return the uppercase hexadecimal form of a string of binary digits.

    Leading zeros are not preserved: "00001010" becomes "A".
    """
    return format(int(binary, 2), "X")


def convertUnicodeToUTF(unicodeBin, symbolDecimal, returnBin):
    """Encode one Unicode code point as UTF-8.

    Args:
        unicodeBin: the code point written as a string of binary digits.
        symbolDecimal: the same code point as an integer; it selects the
            length of the UTF-8 sequence, so it must be the code point
            itself and not the byte value it was looked up from.
        returnBin: when true, return the encoded bits as a binary-digit
            string; otherwise return them as uppercase hexadecimal.

    Returns:
        The encoded sequence as a string, or None when symbolDecimal lies
        outside the range U+0000..U+10FFFF.
    """
    if 0x0000 <= symbolDecimal <= 0x007F:
        # Single-byte sequences are the code point bits unchanged.
        encoded = unicodeBin
    else:
        if 0x0080 <= symbolDecimal <= 0x07FF:
            bytesCount = 2
        elif 0x0800 <= symbolDecimal <= 0xFFFF:
            bytesCount = 3
        elif 0x10000 <= symbolDecimal <= 0x10FFFF:
            bytesCount = 4
        else:
            return None

        template = _UTF8_TEMPLATES[bytesCount]
        # Right-align the code point bits in the payload slots and pad the
        # unused high slots with zeros.
        payloadWidth = template.count("x")
        payload = iter(unicodeBin[-payloadWidth:].rjust(payloadWidth, "0"))
        encoded = "".join(
            next(payload) if slot == "x" else slot for slot in template
        )

    return encoded if returnBin else convertBinToHex(encoded)


def loadCodePage(path):
    """Read the mapping file at *path* into a dict.

    Each non-blank line holds two whitespace-separated fields; the first
    becomes the key and the second the value, both kept as strings.
    """
    codePage = {}
    with open(path) as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            key, val = line.split()
            codePage[key] = val
    return codePage


def convertByteToUTF8(byteValue, codePage):
    """Return the UTF-8 bytes for one input byte.

    Args:
        byteValue: the input byte as an integer.
        codePage: mapping from the decimal byte value (as a string) to the
            Unicode code point in hexadecimal (as a string).

    Raises:
        KeyError: if the byte has no entry in codePage.
        ValueError: if the mapped code point is outside the Unicode range.
    """
    codePoint = int(codePage[str(byteValue)], 16)
    # The sequence length must be chosen from the mapped code point, not
    # from the raw byte value: e.g. byte 179 maps to U+2502, which needs
    # three UTF-8 bytes even though 179 itself would fit in two.
    utf8Bits = convertUnicodeToUTF("{:08b}".format(codePoint), codePoint, True)
    if utf8Bits is None:
        raise ValueError(
            "code point {:X} for byte {} is outside the Unicode range".format(
                codePoint, byteValue
            )
        )
    return bytes(
        int(utf8Bits[i:i + 8], 2) for i in range(0, len(utf8Bits), 8)
    )


def main():
    """Convert 386intel.txt to UTF-8 in rez.txt using the CP437.txt mapping."""
    codePage = loadCodePage("CP437.txt")

    # Read the input before touching the output so a missing input file
    # does not leave behind an empty rez.txt.
    with open("386intel.txt", "rb") as f:
        data = f.read()
    print("End of file")

    # The context manager guarantees the output is flushed and closed.
    with open("rez.txt", "wb") as resultFile:
        for byteValue in data:
            resultFile.write(convertByteToUTF8(byteValue, codePage))


if __name__ == "__main__":
    main()

1条回答

网友

1楼 · 发布于 2024-10-01 09:28:04

未测试，因为您忽略了提供输入文件：

#!/usr/bin/env perl
# Convert 386intel.txt to UTF-8 using the byte-to-code-point table in
# CP437.txt, writing the result to 386intel.txt.utf8.
use strict;
use warnings;
use autodie;

# Lookup table: index is the octet value, element is the mapped character.
my @cp;
{
    # Each mapping line holds two whitespace-separated fields: the octet
    # value (used as a numeric array index) and the code point in hex.
    open my $fh, '<', 'CP437.txt';
    while (my $line = readline $fh) {
        chomp $line;
        my ($k, $v) = split ' ', $line;
        next unless defined $v;     # skip blank or incomplete lines
        $cp[$k] = chr hex $v;
    }
}
{
    open my $in, '<:raw', '386intel.txt';
    open my $out, '>:encoding(UTF-8)', '386intel.txt.utf8';
    while (my $line = readline $in) {
        $out->print(
            join '',            # 5. join characters into string
            map {               # 2. loop over octets
                $cp[            # 4. look up character corresponding to
                                    # octet numeric value
                    ord         # 3. numeric value of octet
                ]
            }
            split '', $line     # 1. split line into octets
        );
    }
    # Close explicitly so autodie reports write errors that only surface
    # when the buffered output is flushed.
    close $out;
}

这个程序非常容易理解，只有大约10行重要的代码（如果需要，还可以很容易地移植到Python）。

如果文件CP437.txt遵循标准，那么它将变为：

› piconv -f CP437 -t UTF-8 < 386intel.txt > 386intel.txt.utf8

如果作业确实要求手动进行UTF-8编码而不是使用库，那么请替换代码中调用chr函数的位置。

相关问题更多 >

编程相关推荐

热门问题

热门文章