使用Python获取PDF附件问题的回答

使用Python获取PDF附件

回答此问题可获得 20 贡献值，回答如果被采纳可获得 50 分。

0 条评论
分类：Python问答

默认排序 | 时间排序

1 个回答

匿名 1天前

　擅长：python、mysql、java

"""Survey every PDF below a folder and write a semicolon-separated report.

Notes carried over from the original answer: this can still be improved, but
it has been tested (using PyMuPDF). It detects corrupt PDF files, encryption,
attachments (embedded files), annotations and portfolios. The output has not
yet been compared with the author's internal classification. The result is a
semicolon-delimited file that can be imported into Excel.
"""
import csv
import os

import fitz  # = PyMuPDF

# Placeholder written to every column when a file cannot be inspected.
_INVALID = "Not a valid PDF"

# Column titles of the report, in output order.
_HEADER = [
    "filepath",
    "encrypted",
    "pages",
    "embedded",
    "attachments",
    "annotations",
    "portfolio",
]


def _inspect_pdf(filepath):
    """Return the six report fields for one PDF file.

    The fields are, in order: encrypted flag, page count, number of embedded
    files, names of the embedded files, whether any page has annotations,
    and whether any PDF object mentions "Collection" (used here as the
    portfolio marker).

    If PyMuPDF raises while opening or inspecting the file, every field is
    the "Not a valid PDF" placeholder, so one bad file never aborts the scan.
    """
    try:
        doc = fitz.open(filepath)
    except Exception:  # best-effort scan: any open failure marks the file invalid
        return [_INVALID] * 6

    try:
        encrypted = doc.is_encrypted
        pages = doc.page_count
        embedded_count = doc.embfile_count()
        embedded_names = doc.embfile_names()
        has_annots = doc.has_annots()
        # Computed fresh for every file; xref 0 is skipped because it is not
        # a real object.
        portfolio = any(
            "Collection" in doc.xref_object(xref, compressed=False)
            for xref in range(1, doc.xref_length())
        )
    except Exception:  # damaged content discovered after a successful open
        return [_INVALID] * 6
    finally:
        # Release the file handle even when inspection fails.
        doc.close()

    return [
        encrypted,
        pages,
        embedded_count,
        embedded_names,
        has_annots,
        portfolio,
    ]


folder = "C:/Users/me/Downloads"
report_path = "C:/Users/me/Downloads/testPDF3.txt"

# newline="" lets the csv module control line endings (avoids blank rows on
# Windows); the context manager closes the report even if the walk fails.
with open(report_path, "w", encoding="utf-8", newline="") as outfile:
    # csv.writer quotes any field containing ';', so unusual paths or
    # attachment names cannot shift the columns.
    writer = csv.writer(outfile, delimiter=";")
    writer.writerow(_HEADER)
    for subdir, _dirs, files in os.walk(folder):
        for filename in files:
            # Case-insensitive match so "REPORT.PDF" is not skipped.
            if not filename.lower().endswith(".pdf"):
                continue
            filepath = os.path.join(subdir, filename)
            writer.writerow([filepath, *_inspect_pdf(filepath)])

使用Python获取PDF附件

1 个回答

相关Python问题