Python删除uuencoding行

GRAPHIC 18 g438975g32h99a01.jpg begin 644 g438975g32h99a01.jpg M_]C_X``02D9)1@`!`@$`8`!@``#_[0G64&AO;=&]S:&]P(#,N,``X0DE-`^T` M`````!``8`````$``0!@`````0`!.$))300-```````$````'CA"24T$&0`` M````!````!XX0DE-`_,```````D```````````$`.$))300*```````!```X M0DE-)Q````````H``0`````````".$))30/U``````!(`"]F9@`!`&QF9;@`& M```````!`"]F9@`!`*&9F@`&```````!`#(````!`%H````&```````!`#4` M```!`"T````&```````!.$))30/X``````!P``#_____________________ M________`^@`````_____________________________P/H`````/______ M______________________\#Z`````#_____________________________ M`^@``#A"24T$"```````$`````$```)````"0``````X0DE-!!X```````0` M````.$))300:``````!M````!@``````````````)P```+`````&`&<`,P`R M`&@`.0`Y`````0`````````````````````````!``````````````"P```` M)P`````````````````````````````````````````````X0DE-!!$````` M``$!`#A"24T$%```````!`````(X0DE-!`P`````!SH````!````<````!D` M``%0```@T```!QX`&``!_]C_X``02D9)1@`!`@$`2`!(``#_[@`.061O8F4` M9(`````!_]L`A``,"`@("0@,"0D,$0L*"Q$5#PP,#Q48$Q,5$Q,8$0P,#`P, M#!$,#`P,#`P,#`P,#`P,#`P,#`P,#`P,#`P,#`P,`0T+"PT.#1`.#A`4#@X. M%!0.#@X.%!$,#`P,#!$1#`P,#`P,$0P,#`P,#`P,#`P,#`P,#`P,#`P,#`P, M#`P,#`S_P``1"``9`'`#`2(``A$!`Q$!_]T`!``'_\0!/P```04!`0$!`0$` M`````````P`!`@0%!@<("0H+`0`!!0$!`0$!`0`````````!``(#!`4&!P@) M"@L0``$$`0,"!`(%!P8(!0,,,P$``A$#!"$2,05!46$3(G&!,@84D:&Q0B;,D M%5+!8C,T<H+10P)E\K.$P]-U MX_-&)Y2DA;25Q-3D]*6UQ=7E]59F=H:6IK;&UN;V-T=79W>'EZ>WQ]?G]Q$` M`@(!`@0$`P0%!@<'!@4U`0`"$0,A,1($05%A<2(3!3*!D12AL4(CP5+1\#,D M8N%R@I)#4Q5C<S3Q)086HK*#!R8UPM)$DU2C%V1%539T9>+RLX3#TW7C\T:4 MI(6TE<34Y/2EM<75Y?569G:&EJ;:VQM;F]B

# Copy fileWithBegin644.txt to strip_fileWithBegin644.txt, keeping only the
# text that precedes the first uuencode "begin 644" header.
start_marker = 'begin 644'
source_name = 'fileWithBegin644.txt'

# The output is opened once, next to the input: reopening it in "w" mode for
# every line would truncate it each time and keep only the last line written.
with open(source_name) as inf, open('strip_' + source_name, 'w') as out:
    for line in inf:
        if start_marker in line:
            # Nothing after the header is ever copied, so stop reading here.
            break
        out.write(line)

1条回答

网友

1楼 · 发布于 2024-07-07 09:06:08

我写了一个简单的生成器（generator）。由于uuencode规范有些繁琐（为什么要在不同的行上放两个独立的结束标记？），这段代码相当笨重，但还是贴在这里。它应该也能同时充当uuencode的验证器，不过我只在非常有限的场景下测试过。

import re

def unuuencode(iterator, collector=None, ignore_length_errors=False):
    """
    Yield lines from iterator except when they are in an uuencode blob.

    A blob starts at a line matching ``begin <3-6 octal digits> <name>``.
    Data lines follow; the first data line whose prefix character is not
    ``M`` is treated as the last one.  The blob must then be closed by a
    line holding a single backtick and a line holding ``end``.

    If collector is not None, append to it the uuencoded blobs as a list
    of a list of lines, one for each uuencoded blob (header, data lines
    and both terminator lines included).

    If ignore_length_errors is true, data lines whose length does not
    agree with their prefix character are accepted instead of rejected.

    Raises ValueError when a terminator line is not the expected one, or
    (unless ignore_length_errors is set) when a data line is empty or its
    length disagrees with its prefix.

    A blob that is still open when the iterator is exhausted is dropped
    without an error and is not appended to collector.
    """
    state = None  # one of { None, 'in_blob', 'closing', 'closed' }
    collectitem = None
    regex = re.compile(r'^begin\s+[0-7]{3,6}\s+.*?(?:\r?\n)?$')

    for line in iterator:
        if state is None:
            if regex.match(line):
                if collector is not None:
                    collectitem = [line]
                state = 'in_blob'
            else:
                yield line
            continue

        stripped = line.rstrip('\r\n')

        # A line starting with a backtick can only be the first terminator.
        if state == 'in_blob' and line.startswith('`'):
            state = 'closing'

        if state == 'closing':
            if stripped != '`':
                raise ValueError('Expected "`" but got "%s"' % line)
            state = 'closed'
        elif state == 'closed':
            if stripped != 'end':
                raise ValueError('Expected "end" but got "%s"' % line)
            state = None
        else:
            # Data line: the prefix character encodes the number of bytes
            # on the line as chr(32 + n); the bytes follow as groups of
            # four characters, each group carrying up to three bytes.
            prefix = stripped[0:1]
            expect = ord(prefix) - 32 if prefix else 0
            actual = len(stripped)
            payload = actual - 1
            seen = max(payload, 0) * 6 // 8  # bytes the payload can hold
            # The last group is padded, so the payload is 4 * ceil(n / 3)
            # characters; comparing n with `seen` directly would reject
            # every byte count that is not a multiple of three.
            needed = 4 * ((expect + 2) // 3)
            if not prefix or payload != needed:
                if not ignore_length_errors:
                    raise ValueError('Wrong prefix on line: %s '
                        '(indicated %i, 6/8 %i, actual length %i)' % (
                            line, expect, seen, actual))
            if prefix != 'M':
                # Only full 45-byte lines start with 'M'; anything else is
                # taken to be the final data line of the blob.
                state = 'closing'

        if collectitem:
            collectitem.append(line)

        if state is None and collectitem:
            # The blob was closed by this line: hand it over and reset.
            collector.append(collectitem)
            collectitem = None

这样使用：

with open(file, 'r') as f:
    lines = [x for x in unuuencode(f)]

或者像这样：

# Keep the plain lines in `lines`; every removed uuencoded blob ends up in
# `blobs` as its own list of lines.
blobs = []
with open(file, 'r') as f:
    lines = list(unuuencode(f, collector=blobs))

或者像这样：

# Any iterable of lines works, not just a file object; here the content is
# read in one go and split into newline-free lines first.
with open(file, 'r') as f:
    content = f.read()
lines = content.split('\n')
lines = list(unuuencode(lines))

或者，针对你似乎正在使用的那段代码：

# For every file named on the command line, drop the uuencoded blobs, extract
# the text from the remaining markup and save it as "strip_<name>".
for fi in sys.argv[1:]:
    with open(fi) as markup:
        # Length errors are ignored so that a blob with a bad length prefix
        # is still removed instead of aborting the run.
        soup = BeautifulSoup(''.join(unuuencode(markup, ignore_length_errors=True)))
    # The text is encoded to UTF-8 bytes before writing, so the output file
    # must be opened in binary mode (text mode rejects bytes on Python 3).
    with open("strip_" + fi, "wb") as f:
        f.write(soup.get_text().encode('utf-8'))

您链接的示例文件中，第二个uuencode编码块里有一个无效的长度指示符，所以我添加了一个选项（ignore_length_errors）来忽略这类错误。

相关问题更多 >

编程相关推荐

热门问题

热门文章