使用Python在PDF中读写6000页需要几个小时

2024-10-06 06:47:07 发布

您现在位置:Python中文网/ 问答频道 /正文

我有一个6303页的大PDF文件。我编写了一个python程序,它在每个页面中查找特定的ID,用该ID重命名文件,并将其保存在另一个目录中。可能有多个具有相同ID的页面(后续页面),这些页面需要保存为单个文件。该程序运行良好,但需要几个小时才能完成。有什么方法可以优化代码以加快进程吗?我是Python新手,非常感谢您在这方面的帮助

我使用了pdfplumber和PyPDF2两个库来完成这项任务。之所以还要用pdfplumber,是因为PyPDF2无法正确读取我的PDF文档

提前谢谢

import pdfplumber
import PyPDF2
import sys

# Split one consolidated payslip PDF into one PDF per employee.
#
# Each page is scanned for a line containing ``searchTxt``; the last
# whitespace-separated token of that line is taken as the employee ID.
# Consecutive pages with the same ID (and pages without an ID line, which are
# treated as continuation pages) are written together as
# ``Payslip-<ID>.pdf`` in ``outputPath``.
searchTxt = 'personalnummer'
Path = r'C:\Users\102398\OneDrive - Neeyamo Enterprise Solutions Pvt. Ltd\Work in progress_Allan\PDF Automation for SGRE\Inputs and Reference\Consolidated payslips.pdf'
outputPath = r'C:\Users\102398\OneDrive - Neeyamo Enterprise Solutions Pvt. Ltd\Work in progress_Allan\PDF Automation for SGRE\PDF Folder'
strtPg = 0   # index of the first page of the payslip currently being collected
c = 0        # number of pages scanned so far
EEID = None  # employee ID that owns the pages collected since strtPg


def _find_employee_id(page_text, marker):
    """Return the employee ID found in *page_text*, or None if there is none.

    The ID is the last whitespace-separated token of the first line whose
    lower-cased text contains *marker*. *page_text* may be None or empty
    (a page without extractable text).
    """
    for line in (page_text or '').splitlines():
        if marker in line.lower():
            return line.split()[-1]
    return None


def _write_payslip(reader, first, stop, employee_id, out_dir):
    """Write pages ``first`` .. ``stop - 1`` of *reader* to Payslip-<ID>.pdf."""
    writer = PyPDF2.PdfFileWriter()
    for pg in range(first, stop):
        writer.addPage(reader.getPage(pg))
    # ``with`` guarantees the output handle is closed even if write() fails.
    with open(out_dir + '\\' + 'Payslip-' + employee_id + '.pdf', 'wb') as outputFile:
        writer.write(outputFile)


# The source is opened ONCE for text extraction and ONCE for page copying.
# Building a new PdfFileReader for every payslip (as the script used to do)
# re-parses the whole document each time, which is what made a 6000-page run
# take hours; it also leaked one file handle per payslip.
with pdfplumber.open(Path) as PayslipPdf, open(Path, 'rb') as pyPayslipPdf:
    reader = PyPDF2.PdfFileReader(pyPayslipPdf)
    lastPg = len(PayslipPdf.pages) - 1

    for page, pdfPage in enumerate(PayslipPdf.pages):
        c += 1
        CrID = _find_employee_id(pdfPage.extract_text(), searchTxt)
        if CrID is None:
            # No ID line: continuation page of the payslip being collected.
            continue
        if page == lastPg:
            print('entered the last page')  # progress marker kept from the original script
        if CrID == EEID:
            continue
        if EEID is not None:
            # The ID changed: everything collected so far belongs to EEID.
            _write_payslip(reader, strtPg, page, EEID, outputPath)
            strtPg = page
        # For the very first ID strtPg stays 0, so leading pages without an
        # ID are kept with the first payslip instead of being dropped.
        EEID = CrID

    # Flush the payslip still being collected. This runs even when the last
    # page carries no ID line, so the trailing payslip is never lost.
    if EEID is not None:
        _write_payslip(reader, strtPg, c, EEID, outputPath)

print(str(c) + ' pages were processed')

Tags: in, for, line, page, pages, open, reader, writer
1条回答
网友
1楼 · 发布于 2024-10-06 06:47:07

我刚刚想出了一个相对更快的解决方案。问题似乎出在PDF的总页数(6303页)上:当我从一个大约只有100页的PDF中读取并提取页面时,Python只用几秒钟就完成了。因此,我编写了一个函数,先把PDF按固定页数(本例中为500页)拆分成若干个小文件,再根据ID从这些约500页的分块文件中提取页面。这样整个任务大约15到20分钟就能完成

import pdfplumber
import PyPDF2
import sys
import os

# Lower-case marker text: a page line containing it is treated as the line
# whose last whitespace-separated token is the employee ID.
searchTxt = 'personalnummer'

# Source PDF passed to PdfSplitter as the file to cut into chunks.
SplitInpPath = r'C:\Users\102398\OneDrive - Neeyamo Enterprise Solutions Pvt. Ltd\Work in progress_Allan\PDF Automation for SGRE\Inputs and Reference\Consolidated payslips.pdf'
# Directory passed to PdfSplitter as the destination for the chunk PDFs.
SplitOutPath = r'C:\Users\102398\OneDrive - Neeyamo Enterprise Solutions Pvt. Ltd\Work in progress_Allan\PDF Automation for SGRE\Split Pages'

def PdfSplitter(filePath, pgs, SrTxt, OutputPath):
    """Cut a large PDF into chunk files of roughly *pgs* pages each.

    Chunk boundaries nominally fall every *pgs* pages, but a chunk is extended
    past its boundary until the employee whose pages straddle the boundary is
    complete, so no payslip is split across two chunk files. Each chunk is
    written to ``OutputPath`` as ``Output_<N>.pdf`` where N is the 1-based
    number of the last page contained in that chunk.

    Args:
        filePath: path of the source PDF.
        pgs: nominal number of pages per chunk; must be >= 1.
        SrTxt: lower-case marker text; the last whitespace-separated token of
            the first page line containing it is taken as the employee ID.
        OutputPath: existing directory that receives the chunk files.

    Raises:
        ValueError: if *pgs* is smaller than 1.
    """
    if pgs < 1:
        raise ValueError('pgs must be a positive number of pages')

    def _employee_id(page):
        # extract_text() can yield nothing for a page without text.
        for line in (page.extract_text() or '').splitlines():
            if SrTxt in line.lower():
                return line.split()[-1]
        return None

    # Both handles are closed on exit, including when an exception is raised.
    with pdfplumber.open(filePath) as PayslipPdf, open(filePath, 'rb') as pdfFile:
        reader = PyPDF2.PdfFileReader(pdfFile)
        totalPages = reader.numPages

        strtpg = 0
        # Clamp so that a document shorter than one chunk is still written.
        numpgs = min(pgs, totalPages)

        while strtpg < totalPages:
            writer = PyPDF2.PdfFileWriter()
            for i in range(strtpg, numpgs):
                writer.addPage(reader.getPage(i))

            nxt = numpgs  # index of the first page NOT yet in this chunk
            if nxt < totalPages:
                # Find who owns the chunk's last page. Walk backwards because
                # that page may be a continuation page without an ID line.
                EmpID = None
                for back in range(numpgs - 1, strtpg - 1, -1):
                    EmpID = _employee_id(PayslipPdf.pages[back])
                    if EmpID is not None:
                        break

                # Pull in following pages until a different ID shows up or
                # the document ends (bounded, so it cannot index past the end).
                while nxt < totalPages:
                    CurID = _employee_id(PayslipPdf.pages[nxt])
                    if CurID is not None and CurID != EmpID:
                        break
                    writer.addPage(reader.getPage(nxt))
                    nxt += 1

            # Name by the last page contained; this is strictly increasing, so
            # a later chunk can never overwrite an earlier one.
            with open(OutputPath + '\\Output_' + str(nxt) + '.pdf', 'wb') as outputFile:
                writer.write(outputFile)

            strtpg = nxt
            numpgs += pgs
            # If the extension ran past the next nominal boundary, skip ahead
            # so the next chunk does not start before strtpg.
            while numpgs <= strtpg:
                numpgs += pgs
            numpgs = min(numpgs, totalPages)

# Step 1: cut the consolidated PDF into chunk files of about 500 pages.
PdfSplitter(SplitInpPath, 500, searchTxt, SplitOutPath)

# Step 2: split every chunk file into one PDF per employee.
Path = r'C:\Users\102398\OneDrive - Neeyamo Enterprise Solutions Pvt. Ltd\Work in progress_Allan\PDF Automation for SGRE\Split Pages'
outputPath = r'C:\Users\102398\OneDrive - Neeyamo Enterprise Solutions Pvt. Ltd\Work in progress_Allan\PDF Automation for SGRE\PDF Folder'


def _find_employee_id(page_text, marker):
    """Return the employee ID found in *page_text*, or None if there is none.

    The ID is the last whitespace-separated token of the first line whose
    lower-cased text contains *marker*. *page_text* may be None or empty
    (a page without extractable text).
    """
    for line in (page_text or '').splitlines():
        if marker in line.lower():
            return line.split()[-1]
    return None


def _write_payslip(reader, first, stop, employee_id, out_dir):
    """Write pages ``first`` .. ``stop - 1`` of *reader* to Payslip-<ID>.pdf."""
    writer = PyPDF2.PdfFileWriter()
    for pg in range(first, stop):
        writer.addPage(reader.getPage(pg))
    with open(out_dir + '\\' + 'Payslip-' + employee_id + '.pdf', 'wb') as outputFile:
        writer.write(outputFile)


def _split_by_employee(pdf_path, marker, out_dir):
    """Write one Payslip-<ID>.pdf per run of pages sharing an employee ID.

    Pages without an ID line are treated as continuation pages of the payslip
    being collected. Returns the number of pages scanned.
    """
    scanned = 0
    # One text-extraction handle and one PyPDF2 reader per chunk file; both
    # are closed on exit. Re-creating the reader for every payslip would
    # re-parse the chunk each time and leak a file handle per payslip.
    with pdfplumber.open(pdf_path) as PayslipPdf, open(pdf_path, 'rb') as pyPayslipPdf:
        reader = PyPDF2.PdfFileReader(pyPayslipPdf)
        lastPg = len(PayslipPdf.pages) - 1
        EEID = None   # owner of the pages collected since strtPg
        strtPg = 0    # always starts at 0 for each file; no state leaks across files

        for page, pdfPage in enumerate(PayslipPdf.pages):
            scanned += 1
            CrID = _find_employee_id(pdfPage.extract_text(), marker)
            if CrID is None:
                continue
            if page == lastPg:
                print('entered the last page')  # progress marker kept from the original script
            if CrID == EEID:
                continue
            if EEID is not None:
                # The ID changed: flush the pages that belong to EEID.
                _write_payslip(reader, strtPg, page, EEID, out_dir)
                strtPg = page
            # For the first ID strtPg stays 0, so leading pages without an ID
            # are kept with the first payslip instead of being dropped.
            EEID = CrID

        # Flush the trailing payslip, even if the last page had no ID line.
        if EEID is not None:
            _write_payslip(reader, strtPg, scanned, EEID, out_dir)
    return scanned


for file in os.listdir(Path):
    # The folder may contain entries that are not chunk PDFs; skip them
    # instead of letting the PDF parser fail on them.
    if not file.lower().endswith('.pdf'):
        continue
    print(Path + '\\' + file)
    # A one-page chunk is handled like any other; the script no longer calls
    # sys.exit() there, which used to abandon all remaining chunk files.
    _split_by_employee(Path + '\\' + file, searchTxt, outputPath)

相关问题 更多 >