pyPDF:startxref 处的外部参照(xref)字符错误

2024-09-29 23:25:17 发布

您现在位置:Python中文网/ 问答频道 /正文

我使用pyPDF进行pdf页面提取和合并。我的问题并不完全取决于pyPDF,因为我以前用pdfSharp处理同一个pdf文件时也遇到过相同类型的错误。

问题是,当我试图读入我们从供应商那里收到的一些pdf文档时,我遇到了一个错误。我不能叫他们去修理,所以我得在我们这边处理。现在我在java中使用iText来处理pdf合并,它对这些文件没有任何问题,但是iText比pyPDF慢,更难维护。pyPDF有一个用于读取外部参照表的部分。在该部分有几个选项,要么行以“xref”开头,要么以数字开头,要么以“xref”开头,但在x之前有一个额外的字符

在我的例子中,该行以“196 0 obj”开头,而下一行是“<</Length 197 0 R”。pyPDF和pdfSharp没有意识到这一点,它们试图把它当作交叉引用表来读取,并抛出异常。对于如何避免这种情况或者给pyPDF打补丁,有什么建议吗?这个文件可能格式不规范,但我需要像Acrobat和iText那样处理它。

以下代码来自pyPDF库中的pdf.py文件。代码很多,但重要的是以 if x == "x" 开头的那一串if/elif/else语句:

# read all cross reference tables and their trailers
    self.xref = {}
    self.xref_objStm = {}
    self.trailer = DictionaryObject()
    while 1:
        # load the xref table
        stream.seek(startxref, 0)
        x = stream.read(1)
        if x == "x":
            # standard cross-reference table
            ref = stream.read(4)
            if ref[:3] != "ref":
                raise utils.PdfReadError, "xref table read error"
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            while 1:
                num = readObject(stream, self)
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                size = readObject(stream, self)
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                cnt = 0
                while cnt < size:
                    line = stream.read(20)
                    # It's very clear in section 3.4.3 of the PDF spec
                    # that all cross-reference table lines are a fixed
                    # 20 bytes.  However... some malformed PDF files
                    # use a single character EOL without a preceeding
                    # space.  Detect that case, and seek the stream
                    # back one character.  (0-9 means we've bled into
                    # the next xref entry, t means we've bled into the
                    # text "trailer"):
                    if line[-1] in "0123456789t":
                        stream.seek(-1, 1)
                    offset, generation = line[:16].split(" ")
                    offset, generation = int(offset), int(generation)
                    if not self.xref.has_key(generation):
                        self.xref[generation] = {}
                    if self.xref[generation].has_key(num):
                        # It really seems like we should allow the last
                        # xref table in the file to override previous
                        # ones. Since we read the file backwards, assume
                        # any existing key is already set correctly.
                        pass
                    else:
                        self.xref[generation][num] = offset
                    cnt += 1
                    num += 1
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                trailertag = stream.read(7)
                if trailertag != "trailer":
                    # more xrefs!
                    stream.seek(-7, 1)
                else:
                    break
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            newTrailer = readObject(stream, self)
            for key, value in newTrailer.items():
                if not self.trailer.has_key(key):
                    self.trailer[key] = value
            if newTrailer.has_key("/Prev"):
                startxref = newTrailer["/Prev"]
            else:
                break
        elif x.isdigit():
            # PDF 1.5+ Cross-Reference Stream
            stream.seek(-1, 1)
            idnum, generation = self.readObjectHeader(stream)
            xrefstream = readObject(stream, self)
            assert xrefstream["/Type"] == "/XRef"
            self.cacheIndirectObject(generation, idnum, xrefstream)
            streamData = StringIO(xrefstream.getData())
            idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")])
            entrySizes = xrefstream.get("/W")
            for num, size in self._pairs(idx_pairs):
                cnt = 0
                while cnt < size:
                    for i in range(len(entrySizes)):
                        d = streamData.read(entrySizes[i])
                        di = convertToInt(d, entrySizes[i])
                        if i == 0:
                            xref_type = di
                        elif i == 1:
                            if xref_type == 0:
                                next_free_object = di
                            elif xref_type == 1:
                                byte_offset = di
                            elif xref_type == 2:
                                objstr_num = di
                        elif i == 2:
                            if xref_type == 0:
                                next_generation = di
                            elif xref_type == 1:
                                generation = di
                            elif xref_type == 2:
                                obstr_idx = di
                    if xref_type == 0:
                        pass
                    elif xref_type == 1:
                        if not self.xref.has_key(generation):
                            self.xref[generation] = {}
                        if not num in self.xref[generation]:
                            self.xref[generation][num] = byte_offset
                    elif xref_type == 2:
                        if not num in self.xref_objStm:
                            self.xref_objStm[num] = [objstr_num, obstr_idx]
                    cnt += 1
                    num += 1
            trailerKeys = "/Root", "/Encrypt", "/Info", "/ID"
            for key in trailerKeys:
                if xrefstream.has_key(key) and not self.trailer.has_key(key):
                    self.trailer[NameObject(key)] = xrefstream.raw_get(key)
            if xrefstream.has_key("/Prev"):
                startxref = xrefstream["/Prev"]
            else:
                break
        else:
            # bad xref character at startxref.  Let's see if we can find
            # the xref table nearby, as we've observed this error with an
            # off-by-one before.
            stream.seek(-11, 1)
            tmp = stream.read(20)
            print tmp
            xref_loc = tmp.find("xref")
            if xref_loc != -1:
                startxref -= (10 - xref_loc)
                continue
            else:
                # no xref table found at specified location
                assert False
                break

注: 我的示例是在最后三行中抛出assert False


Tags: thekeyinselfreadstreamiftype

热门问题