解析含空值的 XML 时出错

<title> <name>Test Name</name> <alt name /> <file local="C:\file\file1.doc" type="word">http://filestore/file1.doc</file> <file local="" type="excel">http://filestore/file2.xls</file> <file local="C:\file\file3.xls" type="excel" /> <file local="" type="ppt" /> </title>

dict = {'title': [{'name': 'Test Name', 'alt name': '', 'file': [{'local': 'C:\file\file1.doc', 'type': 'word', 'url': 'http://filestore/file1.doc'}, {'local': '', 'type': 'excel', 'url': 'http://filestore/file2.xls'}, {'local': 'C:\file\file3.xls', 'type': 'excel', 'url': ''}, {'local': '', 'type': 'ppt', 'url': ''}] }]}

1条回答

网友

1楼 · 发布于 2024-06-14 06:57:11

所以我最终写了一个自定义解析器，虽然不是很理想，但它能用。有人向我建议，lxml 和 html.parser 解析器可能更适合解析格式不规范的 XML，但我最后还是采用了下面这个方案。

我仍然非常欢迎任何反馈，无论是针对这个方案，还是关于其他可行的方法。

import re

def merge_dicts(*dict_args):
    """Combine any number of dictionaries into one new dictionary.

    The inputs are applied left to right, so when the same key appears in
    more than one argument the value from the right-most one wins. None of
    the arguments is modified.
    """
    merged = {}
    for mapping in dict_args:
        # Later mappings overwrite keys set by earlier ones.
        merged.update(mapping)
    return merged

def make_dict(str_arg, op):
    """Build a dictionary from a comma-separated list of ``key<op>value`` pairs.

    Args:
        str_arg: Text such as ``'local="",type="ppt"'``.
        op: Separator between a key and its value, e.g. ``'='``.

    Returns:
        A dict mapping each key to its value. Values are returned exactly as
        written (surrounding quotes are not stripped). An empty ``str_arg``
        yields an empty dict.

    Raises:
        ValueError: If a comma-separated segment does not contain ``op``.
    """
    if not str_arg:
        return {}
    # Split on the first separator only, so a value that itself contains the
    # separator (for example a URL with "?q=1") stays in one piece.
    return dict(s.split(op, 1) for s in str_arg.split(","))

'''
Samples
lst = r'  <name>Test Name</name>'
lst = r'  <alt name />'
lst = r'  <file local="C:\file\file1.doc" type="word">http://filestore/file1.doc</file>'
lst = r'  <file local="" type="excel">http://filestore/file2.xls</file>'
lst = r'  <file local="C:\file\file3.xls" type="excel" />'
lst = r'  <file local="" type="ppt" />'
'''
def match_pattern(file_str):
    """Parse one line of the pseudo-XML file into a single-entry dictionary.

    Four line shapes are tried in order:

    1. ``<tag>body</tag>``                    -> ``{tag: body}``
    2. ``<tag />``                            -> ``{tag: ''}``
    3. ``<tag a1=.. a2=..>body</tag>``        -> ``{tag: {a1.., a2.., 'url': body}}``
    4. ``<tag a1=.. a2=.. />``                -> ``{tag: {a1.., a2.., 'url': ''}}``

    Attribute values are returned as written in the line, including their
    surrounding quotes.

    Args:
        file_str: A single line of text (no embedded newlines expected).

    Returns:
        A one-key dict for the first shape that matches, or an empty dict
        (after printing ``No match!!``) when none of them does.
    """
    # Shape 1: element with a body and no attributes,
    # e.g. <description>desc blah</description>
    pattern1 = r'''(?x)
    ^
    \s*                                             # cut leading whitespace
    (?P<whole_thing>
        < (?P<tag_open> (\w+?|\w*\s\w+?)+) \b       # word boundary, so we can
        >                                           # skip attributes
        (?P<tag_body> .+? )                         # insides
        </ (?P<tag_close> (\w+?|\w*\s\w+?)+) >      # closing tag, nothing interesting
    )
    $'''

    # Shape 2: self-closing element without attributes, e.g. <alt name />
    pattern2 = r'''(?x)
    ^
    \s*
    (?P<whole_thing>
        < (?P<tag_open> (\w+?|\w*\s\w+?)+) \b
        \s/>
    )
    $'''

    # Shape 3: element with two attributes and a body,
    # e.g. <file local="C:\file\file1.doc" type="word">http://filestore/file1.doc</file>
    # NOTE(review): the "\w+!=?" alternative only matches names followed by a
    # literal "!" - possibly meant as a lookahead; left unchanged.
    pattern3 = r'''(?x)
        ^
        \s*
        (?P<whole_thing>
            < (?P<tag_open> (\w+?|\w*\s\w+!=?)+) \b
            \s
            (?P<tag_attrib1> (\w*\=.*?))            # 1st attribute
            \s
            (?P<tag_attrib2> (\w*\=.*))             # 2nd attribute
            .*? >
            (?P<tag_body> .+? )
            </ (?P<tag_close> (\w+?|\w*\s\w+?)+) >
        )
        $'''

    # Shape 4: self-closing element with two attributes,
    # e.g. <file local="" type="ppt" />
    pattern4 = r'''(?x)
        ^
        \s*
        (?P<whole_thing>
            < (?P<tag_open> (\w+?|\w*\s\w+!=?)+) \b
            \s
            (?P<tag_attrib1> (\w*\=.*?))            # 1st attribute
            \s
            (?P<tag_attrib2> (\w*\=.*))             # 2nd attribute
            \s/>
        )
        $'''

    # Try the shapes in a fixed order via a plain tuple instead of building
    # variable names and eval()-ing them.
    patterns = (pattern1, pattern2, pattern3, pattern4)
    for pat_val, pattern in enumerate(patterns, start=1):
        # re.L is deliberately not passed: the LOCALE flag is rejected for
        # str patterns on Python 3.6+ (ValueError).
        matchObj = re.match(pattern, file_str, re.M)
        if not matchObj:
            continue

        tag = matchObj.group('tag_open')
        if pat_val == 1:
            return {tag: matchObj.group('tag_body')}
        if pat_val == 2:
            return {tag: ''}

        # Shapes 3 and 4 share the attribute handling; only shape 3 has a body.
        attr1 = make_dict(matchObj.group('tag_attrib1'), '=')
        attr2 = make_dict(matchObj.group('tag_attrib2'), '=')
        url = matchObj.group('tag_body') if pat_val == 3 else ''
        return {tag: merge_dicts(attr1, attr2, {'url': url})}

    print("No match!!")
    # Return an empty dict rather than None so callers can treat the result
    # as a mapping without an extra None check.
    return {}

#print(match_pattern(lst))

def in_file(file):
    """Read a pseudo-XML file and collect its elements into a dictionary.

    The first and last lines are treated as the wrapping root element and are
    not parsed; the first line is printed with its ``<``, ``/`` and ``>``
    characters removed. Every other non-empty line is parsed with
    ``match_pattern``. ``file`` elements are accumulated in a list under the
    ``'file'`` key; any other element is stored under its own tag name (and
    echoed with an ``else`` prefix). Lines that ``match_pattern`` cannot
    parse are skipped.

    Args:
        file: Path of the file to read.

    Returns:
        A dict of the parsed elements; empty if nothing could be parsed.
    """
    result = {}
    with open(file, "r") as handle:
        data = handle.read().splitlines()

    last = len(data) - 1
    # enumerate() gives the real position of each line; looking the line up
    # with data.index() would pick the wrong position for repeated lines.
    for position, d in enumerate(data):
        if position == 0:
            print(re.sub('<|/|>', '', d))
            continue
        if position == last or not d:
            continue

        # Parse each line exactly once and reuse the result.
        parsed = match_pattern(d)
        if not parsed:
            # Unrecognised line (match_pattern returned nothing usable).
            continue

        if 'file' in parsed:
            result.setdefault('file', []).append(parsed['file'])
        else:
            result.update(parsed)
            print('else', parsed)
    return result

# Script entry: parse C:\test.nfo with in_file() and print the resulting dict.
print(in_file('C:\\test.nfo'))

注意：与原帖相比，我去掉了最外层的字典。

相关问题更多 >

编程相关推荐

热门问题

热门文章