<blockquote>
<p><strong>Question</strong>: programmatically combine the content of certain HTML tags </p>
</blockquote>
<p>本例使用<code>lxml</code>解析XHTML文件并构建新的XHTML树。</p>
<pre><code>import io, os
from lxml import etree
# Sample XHTML document used as in-memory demo input: one 'calibre5'
# paragraph followed by several 'calibre1' paragraphs inside <body>.
# Kept as bytes because the XML declaration carries its own encoding (Latin1).
XHTML = b"""<?xml version='1.0' encoding='Latin1'?>
<html xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
<body class="calibre">
<p class="calibre5" id="calibre_pb_62">Note for Tyler</p>
<p class="calibre1">In the California registry, there was</p>
<p class="calibre1">a calm breeze blowing through the room. A woman</p>
<p class="calibre1">who must have just walked in quietly beckoned for the</p>
<p class="calibre1">counterman to approach to store her slip.</p>
<p class="calibre1">642</p>
</body></html>"""
class Calibre2EPUB(etree.iterparse):
    """
    Rebuild an XHTML document, merging all <p class="calibre1"> paragraphs.

    The source is consumed through 'iterparse'; a new tree is built while
    iterating and is available, serialized, via the 'xhtml' property.
    """

    def __init__(self, fh):
        """
        Initialize 'iterparse' to only generate 'start' and 'end' events
        and build the new tree right away.

        :param fh: File Handle from the XHTML File to parse
        """
        super().__init__(fh, events=('start', 'end'))
        self.parse()

    def element(self, elem, parent=None):
        """
        Copy 'elem' with attributes and text to new Element.
        Child elements and tail text are not copied.

        :param elem: Source Element
        :param parent: Parent of the new Element; None creates a root Element
        :return: New Element
        """
        if parent is None:
            namespace = etree.QName(elem).namespace
            # Only declare a default namespace if the source element has one.
            nsmap = {None: namespace} if namespace else None
            e = etree.Element(elem.tag, nsmap=nsmap)
        else:
            e = etree.SubElement(parent, elem.tag)
        for key, value in elem.attrib.items():
            e.set(key, value)
        if elem.text:
            e.text = elem.text
        return e

    def parse(self):
        """
        Parse all Elements, copy <html>, <body> and <p> Elements 1:1,
        except <p class="calibre1"> Elements.
        Aggregate the text of all <p class="calibre1"> Elements into one
        Element, separated by single spaces. The aggregated Element is placed
        where the first such paragraph is encountered.

        :return: None
        """
        self.calibre1 = None
        for event, elem in self:
            # Compare the namespace-free local name, so tags that merely end
            # with the same letters (<tbody>, <sup>, ...) are not matched.
            name = etree.QName(elem).localname
            if event == 'start':
                if name == 'html':
                    self._xhtml = self.element(elem)
                elif name == 'body':
                    self.body = self.element(elem, parent=self._xhtml)
            elif event == 'end' and name == 'p':
                # Paragraphs are handled on 'end', when their text is available.
                # .get() tolerates <p> Elements without a class attribute.
                if elem.get('class') != 'calibre1':
                    self.element(elem, parent=self.body)
                elif self.calibre1 is None:
                    self.calibre1 = self.element(elem, parent=self.body)
                elif elem.text:
                    # Either side may be empty (text is None); never
                    # concatenate None with str.
                    if self.calibre1.text:
                        self.calibre1.text += ' ' + elem.text
                    else:
                        self.calibre1.text = elem.text

    @property
    def xhtml(self):
        """
        :return: The new Element Tree XHTML, serialized as Latin1 encoded
                 bytes including the XML declaration
        """
        return etree.tostring(self._xhtml, xml_declaration=True, encoding='Latin1', pretty_print=True)
</code></pre>
<blockquote>
<p><strong>Usage</strong>:</p>
</blockquote>
<pre><code>if __name__ == "__main__":
# with open(os.path.join(pathname, file_name), 'rb', encoding="Latin1") as in_file:
with io.BytesIO(XHTML) as in_file:
print(Calibre2EPUB(in_file).xhtml.decode())
#with open(os.path.join(pathname, '_modified_' + file_name), 'wb') as out_file:
# out_file.write(Calibre2EPUB(xml_file).xhtml)
</code></pre>
<blockquote>
<p><strong>Output</strong>:</p>
<pre><code><?xml version='1.0' encoding='Latin1'?>
<html xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
<body class="calibre">
<p class="calibre5" id="calibre_pb_62">Note for Tyler</p>
<p class="calibre1">In the California registry, ... (omitted for brevity) ... to store her slip. 642</p>
</body></html>
</code></pre>
</blockquote>
<p>用 Python 3.5 测试</p>