<p>感谢 @scanny 的反馈。由于 <code>python-docx</code> 无法做到这一点,而我本来就要把文档转换成 PDF 格式,所以我决定在 Word 文档转换成 PDF 之后,使用 <code>pdfminer</code> 来获取页码。这段代码可能有点长,但可以完成任务。</p>
<pre><code>import re
from cStringIO import StringIO

from pdfminer.converter import XMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
def xmlToLines(xml):
    """Join ``xml`` into one string and split it on newline characters.

    ``xml`` may be a single string or any iterable of strings; the pieces
    are concatenated without a separator before splitting on ``'\\n'``.
    Returns the resulting list of lines (``['']`` for empty input).
    """
    return ''.join(xml).split('\n')
def convert_pdf_to_xml(path):
    """Convert the PDF at ``path`` to XML and return the XML as a list of lines.

    Each page yielded by ``PDFPage.get_pages`` is processed by a
    ``PDFPageInterpreter`` whose output device is an ``XMLConverter`` writing
    into an in-memory ``StringIO`` buffer. The buffered text is then split
    into lines with ``xmlToLines``.

    Args:
        path: Full path of the PDF file to read from.

    Returns:
        The converter output split on newlines, as a list of strings.

    The PDF file, the converter device and the buffer are always closed,
    even when page processing raises.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = XMLConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        # open() instead of file(): file() only exists on Python 2.
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Single-argument print(...) emits the same text on Python 2 and 3.
            print('Converting following file from PDF to XML: \n - ' + str(path))
            # Empty page-number set and maxpages=0 are passed through unchanged.
            pages = PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        # Snapshot the buffer before the device is closed, so anything the
        # converter writes on close is not part of the returned lines.
        lines = xmlToLines(retstr.getvalue())
    finally:
        # Close out the converter and the in-memory buffer.
        device.close()
        retstr.close()
    return lines
def getPagesWithField(wordPdfPath, field):
    """Return every page number of the PDF on which ``field`` is found.

    The PDF at ``wordPdfPath`` is converted to XML lines with
    ``convert_pdf_to_xml``. Characters are accumulated from ``<text font=``
    lines; whenever a line equal to ``'<text>'`` is reached, the accumulated
    text is checked for the field marker and then reset.

    Args:
        wordPdfPath: Path of the PDF file to scan.
        field: Marker text to look for.

    Returns:
        A sorted list of unique page numbers (ints) where the marker occurred.
    """
    xml_lines = convert_pdf_to_xml(wordPdfPath)

    page_marker = re.compile(r'page id="[0-9]*"')
    char_marker = re.compile(r'<text font=')
    digits = re.compile(r'[0-9]{1,}')
    char_body = re.compile(r'>[^\r\n]*</text>')

    # Only the first character after '>' is collected per text line below, so
    # angle brackets in the marker are mapped to '&' -- presumably because the
    # XML writes them as entities starting with '&'; confirm against real output.
    needle = field.replace('<', '&').replace('>', '&')

    hits = set()
    collected = ''
    for line in xml_lines:
        if page_marker.search(line):
            # New page line: the first number on the line is the page id.
            page = int(digits.findall(line)[0])
        elif line == '<text>':
            # Treated as the end of a text line: test what was collected.
            if needle in collected:
                hits.add(page)
            collected = ''
        elif char_marker.search(line):
            # Line carrying one text character: append it to the running text.
            collected = str(collected + char_body.findall(line)[0][1])

    return sorted(hits)
</code></pre>
<p>在此之后,可以使用 <code>PyPDF2</code> 轻松完成 PDF 页面的插入/合并。</p>