如何修复“无法序列化”错误_io.BufferedReader文件在python中运行多处理时的“object”

2024-05-04 17:47:21 发布

您现在位置:Python中文网/ 问答频道 /正文

我试图使用python中的多处理将多页pdf拆分为多个单页pdf,但出现了错误。我在StackoverFlow上找了一些类似的帖子,但没有找到答案。你知道吗

这是我的密码:

from PyPDF2 import PdfFileWriter, PdfFileReader
import os
from os import listdir
from os.path import isfile, join
from pdf2image import convert_from_path, convert_from_bytes
from wand.image import Image
import multiprocessing as mp
from functools import partial

def extract_page(pgnum, inputPDF, pdfFolder, file_name):
    output = PdfFileWriter()
    output.addPage(inputPDF.getPage(pgnum))
    page = pdfFolder + "/" + file_name + "-%s.pdf" % (pgnum + 1)
    with open(page, "wb") as outputStream:
        output.write(outputStream)

def parallel_run(pages, inputPDF, pdfFolder, file_name):
    pool = mp.Pool(2)
    split_pdf = partial(extract_page, inputPDF = inputPDF, pdfFolder = pdfFolder, file_name = file_name)
    results = pool.map(split_pdf, pages)
    pool.close()

def splitPdf(file):
    listPdf = []
    inputpdf = PdfFileReader(open(file, "rb"))
    file_name = os.path.basename(file)[:-4]
    #create a folder to store splitted pdf
    pdfFolder = os.path.dirname(file)  +"/"+ file_name + "_split"
    try:
        os.mkdir(pdfFolder)
    except OSError:
        print("Directory %s is already existed" % pdfFolder)
        listPdf = [pdfFolder + "/" + f for f in listdir(pdfFolder) if isfile(join(pdfFolder, f))]
        listPdf = [f for f in listPdf if f.endswith(".pdf")]
    else:
        print("Successfully created the directory %s " % pdfFolder)
        pgs = list(range(0, inputpdf.numPages))
        parallel_run(pgs, inputpdf, pdfFolder, file_name)
        listPdf = [pdfFolder + "/" + f for f in listdir(pdfFolder) if isfile(join(pdfFolder, f))]
        listPdf = [f for f in listPdf if f.endswith(".pdf")]
    return listPdf

这就是我犯的错误。有谁曾经遇到过这个问题?你知道吗

Traceback (most recent call last):
  File "C:/Users/tkdang/PycharmProjects/listSPDP/data_no_index2/split_pdf.py", line 79, in <module>
    listPDF = splitPdf(path)
  File "C:/Users/tkdang/PycharmProjects/listSPDP/data_no_index2/split_pdf.py", line 41, in splitPdf
    parallel_run(pgs, inputpdf, pdfFolder, file_name)
  File "C:/Users/tkdang/PycharmProjects/listSPDP/data_no_index2/split_pdf.py", line 22, in parallel_run
    results = pool.map(split_pdf, pages)
  File "C:\Users\tkdang\AppData\Local\Continuum\anaconda3\lib\multiprocessing\pool.py", line 290, in map
    return self._map_async(func, iterable, mapstar, chunksize).get()
  File "C:\Users\tkdang\AppData\Local\Continuum\anaconda3\lib\multiprocessing\pool.py", line 683, in get
    raise self._value
  File "C:\Users\tkdang\AppData\Local\Continuum\anaconda3\lib\multiprocessing\pool.py", line 457, in _handle_tasks
    put(task)
  File "C:\Users\tkdang\AppData\Local\Continuum\anaconda3\lib\multiprocessing\connection.py", line 206, in send
    self._send_bytes(_ForkingPickler.dumps(obj))
  File "C:\Users\tkdang\AppData\Local\Continuum\anaconda3\lib\multiprocessing\reduction.py", line 51, in dumps
    cls(buf, protocol).dump(obj)
TypeError: cannot serialize '_io.BufferedReader' object

Tags: nameinfrompyimportpdflineusers