通过SAX输出带有子元素的大型元素

2024-05-17 10:18:00 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在使用SAX将圣经文本处理成XML(请不要和我争论SAX与lxml孰优孰劣之类的问题;SAX在其他方面对我来说非常好用,而且我更喜欢只使用标准库)。

除了我还没有弄清楚如何在文件头中生成一个大的样板文本之外,所有的工作都很好。目前我的脚本中有以下内容,但我确实不喜欢它,原因很明显:

def startDocument(self):
        """Write the opening OSIS tags and the whole <header> boilerplate.

        The ``osis`` and ``osisText`` elements are started here and are not
        ended within this method; ``header`` and everything inside it is
        fully opened and closed. Leaf elements go through ``self.elem``
        (defined elsewhere; presumably it emits a start tag, text and end
        tag, with an optional attribute dict as third argument).
        """
        out = self.ds

        # Containers that stay open after this method returns.
        out.startElement('osis', attrs={
            'xmlns': 'http://www.bibletechnologies.net/2003/OSIS/namespace',
            'xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance'
        })
        out.startElement('osisText', {
            'osisIDWork': 'CzeCSP',
            'osisRefWork': 'bible',
            'xml:lang': 'cs',
            'canonical': 'true',
        })
        out.startElement('header', attrs={})

        # Revision description.
        out.startElement('revisionDesc', {'resp': 'Matěj Cepl'})
        for leaf in (
            ('date', '2010.09.07'),
            ('p', 'This is just an information about the book.'),
        ):
            self.elem(*leaf)
        out.endElement('revisionDesc')

        # Metadata of the main work. Each tuple is passed to self.elem
        # as-is, so two-item entries call it without an attribute dict.
        main_work_leaves = (
            ('title', 'Český studijní překlad Bible'),
            ('creator', 'Nadační fond překladu Bible'),
            ('date', '2012-03-06', {'event': 'eversion', 'type': 'Gregorian'}),
            ('publisher', 'Nadační fond překladu Bible'),
            ('type', 'Bible', {'type': 'OSIS'}),
            ('identifier', 'Bible.cs.CSP', {'type': 'OSIS'}),
            ('source', 'http://www.biblecsp.cz/'),
            ('language', 'CES', {'type': 'SIL'}),
            ('coverage', 'Czech 2010'),
            ('rights', 'Copyright 2009 Nakladatelství KMS',
             {'type': 'x-copyright'}),
            ('rights', 'CC BY-NC-ND 3.0 CZ', {'type': 'x-license'}),
            ('rights', 'http://creativecommons.org/licenses/by-nc-nd/3.0/cz/',
             {'type': 'x-license-url'}),
            ('rights', 'Email comments to office in domain biblescp.cz',
             {'type': 'x-comments-to'}),
            ('refSystem', 'Bible.MT'),
        )
        out.startElement('work', {'osisWork': "CzeCSP"})
        for leaf in main_work_leaves:
            self.elem(*leaf)
        out.endElement('work')

        # Auxiliary works carry nothing but their reference system.
        for work_id, ref_system in (
            ('strong', 'Dict.Strongs'),
            ('robinson', 'Dict.Robinson'),
        ):
            out.startElement('work', {'osisWork': work_id})
            self.elem('refSystem', ref_system)
            out.endElement('work')

        out.endElement('header')

我想要一些更优雅的方式,例如

self.reprocessintoevents('''
<osis xmlns='...' ...>
<osisText>...
''')

这样我就可以把样板内容直接写成普通的XML,作为参数传给这个函数。

有人已经实现过类似的东西吗?


Tags: self, http, www, type, ds, xml, bible, work
1条回答
网友
1楼 · 发布于 2024-05-17 10:18:00

最后,我的办法是自己编写一个xml.sax.ContentHandler:

# Header boilerplate that gets parsed by xml.sax and replayed as SAX events.
# It must be a complete, well-formed XML document (proper comment syntax,
# every element closed), otherwise xml.sax.parseString raises
# SAXParseException. CopyHandler is what keeps <osisText> and <osis> from
# being closed downstream.
HEADER_DATA = '''<osis xmlns="http://www.bibletechnologies.net/2003/OSIS/namespace"
      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <osisText osisIDWork="CzeCSP" osisRefWork="bible" xml:lang="cs"
      canonical="true">
      <!-- remainder of the XML element to be inserted -->
    </osisText>
</osis>'''

class CopyHandler(xml.sax.handler.ContentHandler):
    """Replay parsed XML as SAX events on a downstream handler.

    Start tags and character data are forwarded unchanged. End tags are
    forwarded as well, except for the element names in ``keep_open``: the
    parsed snippet has to be balanced, well-formed XML, yet the caller may
    want some wrapper elements to stay open downstream so that more content
    can still be written into them.

    Args:
        downstream: Object providing ``startElement``, ``endElement`` and
            ``characters``; it receives the forwarded events.
        keep_open: Element names whose end tags are swallowed. A single
            name may be given as a plain string. Defaults to the OSIS
            wrapper elements ``osisText`` and ``osis``.
    """

    def __init__(self, downstream, keep_open=('osisText', 'osis')):
        super().__init__()
        self.ds = downstream
        # A bare string would otherwise be split into single characters.
        if isinstance(keep_open, str):
            keep_open = (keep_open,)
        self.keep_open = frozenset(keep_open)

    def startElement(self, name, attrs):
        """Forward the start tag, including its attributes."""
        self.ds.startElement(name, attrs)

    def endElement(self, name):
        """Forward the end tag unless the element must stay open."""
        if name not in self.keep_open:
            self.ds.endElement(name)

    def characters(self, content):
        """Forward character data (whitespace between tags included)."""
        self.ds.characters(content)

# Call this from inside another sax.XMLFilterBase or sax.ContentHandler,
# wherever the header should be emitted. In my case that is the
# startDocument handler.

class CEPProcessor(BaseProcessor):
    """Processor that emits the canned header when the document starts."""

    # ...

    def startDocument(self):
        """Parse HEADER_DATA and replay it as SAX events on ``self.ds``."""
        header_bytes = HEADER_DATA.encode('utf8')
        replayer = CopyHandler(self.ds)
        xml.sax.parseString(header_bytes, replayer)

相关问题 更多 >