使用PyPDF从PDF中提取图像，xObject中不带“/Filter”标记

import PyPDF4
from PIL import Image
from pathlib import Path
import os

# Script: dump every image XObject found on one page of a PDF into a
# per-page output folder, choosing the file type from the stream's /Filter.

PDFFilePath = Path("somefile.pdf")
OutputFolder = "somedirectory"
pdfPage = 0  # index of the page passed to getPage()

with open(PDFFilePath, 'rb') as pdf_reader:
    pdf_object = PyPDF4.PdfFileReader(pdf_reader)

    # One folder per page: <OutputFolder>/<pdf stem>.<page index>
    PageFolder = Path(OutputFolder).joinpath(Path(PDFFilePath.stem + '.' + str(pdfPage)))
    os.makedirs(PageFolder, exist_ok=True)

    CurrentPage = pdf_object.getPage(pdfPage)
    xObject = CurrentPage['/Resources']['/XObject'].getObject()

    for obj_index, obj in enumerate(xObject):
        if xObject[obj]['/Subtype'] != '/Image':
            continue

        size = (xObject[obj]['/Width'], xObject[obj]['/Height'])
        data = xObject[obj].getData()

        # NOTE(review): every colour space other than /DeviceRGB is treated
        # as palette mode "P", as in the original code — grayscale or CMYK
        # images may need a different Pillow mode; confirm against real files.
        if xObject[obj]['/ColorSpace'] == '/DeviceRGB':
            mode = "RGB"
        else:
            mode = "P"

        # Output file name without extension: <pdf stem>.<page>.<object index>
        base_name = Path(PDFFilePath).stem + "." + str(pdfPage) + "." + str(obj_index)

        # /Filter is optional on a stream; indexing it unconditionally raised
        # KeyError for images stored without one.
        if '/Filter' in xObject[obj]:
            image_filter = xObject[obj]['/Filter']
        else:
            image_filter = None

        if image_filter == '/FlateDecode':
            img = Image.frombytes(mode, size, data)
            # The format is inferred from the ".png" suffix; the second
            # positional argument of Image.save is a format name, not a
            # file mode, so 'wb' must not be passed here.
            img.save(PageFolder.joinpath(base_name + ".png"))
            continue

        # For these filters the stream bytes are written to disk as-is.
        if image_filter == '/DCTDecode':
            extension = ".jpg"
        elif image_filter == '/JPXDecode':
            extension = ".jp2"
        elif image_filter == '/CCITTFaxDecode':
            extension = ".tiff"
        else:
            print(f"Skipping image {obj}: unsupported or missing /Filter ({image_filter!r})")
            continue

        with open(PageFolder.joinpath(base_name + extension), 'wb') as img:
            img.write(data)

1条回答

网友

1楼 · 发布于 2024-09-27 09:29:07

图像显然是.tiff图像，但缺少文件头（header）。我找到了这个回答：https://stackoverflow.com/a/34555343/13919892

我在代码中添加了此函数：

import struct

def tiff_header_for_CCITT(width, height, img_size, CCITT_group=4):
    """Build a minimal little-endian TIFF header for a single-strip image.

    The header consists of the 8-byte TIFF file header followed by one IFD
    with eight entries. The image data is expected to be appended directly
    after the returned bytes (StripOffsets points at the end of the header).

    Args:
        width: Image width in pixels (ImageWidth tag).
        height: Image height in pixels (ImageLength and RowsPerStrip tags).
        img_size: Length in bytes of the image data that will follow the
            header (StripByteCounts tag).
        CCITT_group: Value written verbatim into the Compression tag.

    Returns:
        The packed header as ``bytes``.

    Raises:
        struct.error: If a value does not fit its unsigned field.
    """
    # Layout: byte order (2s), magic (H), first-IFD offset (L), entry count
    # (H), eight 12-byte IFD entries (HHLL each), then the next-IFD offset.
    # TIFF fields are unsigned, and the next-IFD offset is 4 bytes wide; a
    # 2-byte terminator would make readers take the first two image bytes as
    # part of that offset.
    tiff_header_struct = '<' + '2s' + 'H' + 'L' + 'H' + 'HHLL' * 8 + 'L'
    return struct.pack(tiff_header_struct,
                       b'II',  # Byte order indication: little endian
                       42,  # Version number (always 42)
                       8,  # Offset to first IFD
                       8,  # Number of tags in IFD
                       256, 4, 1, width,  # ImageWidth, LONG, 1, width
                       257, 4, 1, height,  # ImageLength, LONG, 1, length
                       258, 3, 1, 1,  # BitsPerSample, SHORT, 1, 1
                       259, 3, 1, CCITT_group,  # Compression, SHORT, 1, 4 = CCITT Group 4 fax encoding
                       262, 3, 1, 0,  # PhotometricInterpretation, SHORT, 1, 0 = WhiteIsZero
                       273, 4, 1, struct.calcsize(tiff_header_struct),  # StripOffsets, LONG, 1, len of header
                       278, 4, 1, height,  # RowsPerStrip, LONG, 1, length
                       279, 4, 1, img_size,  # StripByteCounts, LONG, 1, size of image
                       0  # Offset of next IFD: 0 = this is the last IFD
                       )

然后将其添加到我的代码中：

if not '/Filter' in xObject[obj]:
    tiff_header = tiff_header_for_CCITT(size[0],size[1],len(data),1) # Using the group "1" because it works for some reason
    inv_data = bytes((~bit + 256 for bit in data)) # for some reason the bits are inverted?
    tiff_data = tiff_header + inv_data # Add the header to the inverted data
    # Write the tiff file
    img = open(PageFolder.joinpath(Path(PDFFilePath).stem +"."+ str(pdfPage) + "."+ str(obj_index)+ ".tiff"),'wb')
    img.write(tiff_data)
    img.close()
    continue

我需要知道如何识别位是否需要反转，或者使用什么“CCITT组”

我会先把这个标记为答案，之后也许会就这一点另开一个新问题

相关问题更多 >

编程相关推荐

热门问题

热门文章