我有一个6303页的大PDF文件。我编写了一个python程序,它在每个页面中查找特定的ID,用该ID重命名文件,并将其保存在另一个目录中。可能有多个具有相同ID的页面(后续页面),这些页面需要保存为单个文件。该程序运行良好,但需要几个小时才能完成。有什么方法可以优化代码以加快进程吗?我是Python新手,非常感谢您在这方面的帮助
我已经使用了pdfplumber和PyPDF2库来完成任务。我同时使用pdfplumber的原因是PyPDF2无法正确读取(提取)我的PDF文档中的文本
提前谢谢
"""Split one large consolidated payslip PDF into per-employee PDF files.

Each page is scanned (via pdfplumber) for a line containing the search
text ('personalnummer'); the last whitespace-separated token on that line
is the employee ID.  Consecutive pages sharing the same ID (including
pages where no ID line is found) form one group, and each group is
written (via PyPDF2) to OUTPUT_DIR as 'Payslip-<ID>.pdf'.

Key fix vs. the original: the PyPDF2 reader is opened ONCE for the whole
run instead of being re-opened (and the 6000+-page xref re-parsed) for
every output file — that repeated re-parsing was the main slowdown.
"""
import sys
from pathlib import Path

# Case-insensitive marker line that carries the employee ID.
SEARCH_TEXT = 'personalnummer'
INPUT_PATH = r'C:\Users\102398\OneDrive - Neeyamo Enterprise Solutions Pvt. Ltd\Work in progress_Allan\PDF Automation for SGRE\Inputs and Reference\Consolidated payslips.pdf'
OUTPUT_DIR = r'C:\Users\102398\OneDrive - Neeyamo Enterprise Solutions Pvt. Ltd\Work in progress_Allan\PDF Automation for SGRE\PDF Folder'


def find_employee_id(lines):
    """Return the employee ID found in *lines*, or None.

    Scans for the first line containing SEARCH_TEXT (case-insensitive)
    and returns that line's last whitespace-separated token — the same
    extraction rule the original script used.
    """
    for line in lines:
        if SEARCH_TEXT in line.lower():
            return line.split()[-1]
    return None


def write_group(reader, page_indices, employee_id, output_dir):
    """Copy *page_indices* from *reader* into 'Payslip-<employee_id>.pdf'.

    The reader is shared across all groups; only the writer is per-file.
    """
    # Imported here so the module can be loaded without PyPDF2 installed.
    import PyPDF2

    writer = PyPDF2.PdfFileWriter()
    for idx in page_indices:
        writer.addPage(reader.getPage(idx))
    out_path = Path(output_dir) / ('Payslip-' + employee_id + '.pdf')
    # 'with' guarantees the output handle is closed (the original leaked
    # one open input handle per group and relied on GC for cleanup).
    with open(out_path, 'wb') as out_file:
        writer.write(out_file)


def main():
    # Third-party imports kept local so importing this module never fails.
    import pdfplumber
    import PyPDF2

    pages_processed = 0
    current_id = None     # ID of the group being accumulated
    group = []            # 0-based page indices of the current group

    # Open BOTH handles once for the entire run.  Re-creating
    # PdfFileReader per group (as the original did) re-parses the whole
    # document's xref each time and dominated the multi-hour runtime.
    with pdfplumber.open(INPUT_PATH) as plumber_pdf, \
            open(INPUT_PATH, 'rb') as raw_pdf:
        reader = PyPDF2.PdfFileReader(raw_pdf)

        for index, page in enumerate(plumber_pdf.pages):
            pages_processed += 1
            page_id = find_employee_id(page.extract_text().splitlines())

            # A new ID starts a new group; pages with no ID line stay in
            # the current group (matching the original's behavior of
            # leaving strtPg unchanged when no match was found).
            if page_id is not None and page_id != current_id:
                if group and current_id is not None:
                    write_group(reader, group, current_id, OUTPUT_DIR)
                current_id = page_id
                group = []
            group.append(index)

        # Flush the final group.  The original silently dropped it when
        # the last page contained no 'personalnummer' line.
        if group and current_id is not None:
            write_group(reader, group, current_id, OUTPUT_DIR)

    print(str(pages_processed) + ' pages were processed')


if __name__ == '__main__':
    main()
我刚刚想出了一个相对更快的解决方案。显然,问题似乎在于pdf(6303)中的总页数。当我试图从一个大约有100页的pdf中读取和提取页面时,python在不到几秒钟的时间内就完成了。因此,我编写了一个函数,将pdf页面按一定数量(本例中为500页)拆分,并根据ID从包含500页的分块版本中提取页面。这在大约15到20分钟内完成了任务
相关问题 更多 >
编程相关推荐