从PDF提取的文本字符串中，提取紧跟在某个字符之后（中间没有空格）的数字

# Extract AFE numbers (the letter "A" followed by exactly six digits) from the
# first page of a PDF and print them, after asking the user for confirmation.
import re

# Path of the PDF that is scanned when no text is passed in explicitly.
PDF_PATH = '5302.pdf'

# An AFE number is a capital "A" immediately followed by exactly six digits.
AFE_PATTERN = re.compile(r"A[0-9]{6}")


def read_first_page_text(path=PDF_PATH):
    """Return the text extracted from the first page of the PDF at *path*."""
    # Imported lazily so the text-matching logic can be used (and tested)
    # without PyPDF2 being installed.
    import PyPDF2

    # NOTE(review): PdfFileReader / getPage / extractText are the PyPDF2 names
    # the original script used; newer PyPDF2 releases appear to rename them
    # (PdfReader / .pages / extract_text) -- confirm against the installed
    # version before upgrading.
    # The context manager closes the file, which the original never did.
    with open(path, 'rb') as pdf_file:
        read_pdf = PyPDF2.PdfFileReader(pdf_file)
        return read_pdf.getPage(0).extractText()


def get_afenumbers(Y, page_content=None):
    """Print and return the AFE numbers found in *page_content*.

    Args:
        Y: The user's answer. A value starting with "Y" (any case) lists the
            AFE numbers; a value starting with "N" prints a goodbye message.
        page_content: Text to search. When ``None``, the text of the first
            page of ``PDF_PATH`` is read and searched instead.

    Returns:
        The AFE numbers in order of appearance. The list is empty when the
        user declined or when no AFE number was found.

    Raises:
        ValueError: If *Y* starts with neither "Y" nor "N". The original
            code spun forever in that case.
    """
    answer = Y.upper()

    if answer.startswith("N"):
        print("HAVE A GREAT DAY - GOODBYE!!!")
        return []

    if not answer.startswith("Y"):
        raise ValueError(f"expected an answer starting with Y or N, got {Y!r}")

    if page_content is None:
        page_content = read_first_page_text()

    # findall returns every non-overlapping match, so each AFE is exactly
    # "A" plus six digits rather than everything after the first "A".
    afelist = AFE_PATTERN.findall(page_content)

    if afelist:
        # One AFE per line.
        print('\n'.join(afelist))
    else:
        # The original assigned this message but never printed it, and then
        # looped forever; report it and return instead.
        print("No AFE numbers found...")

    return afelist


def main():
    """Prompt until the user answers Y or N, then act on the answer."""
    user_input = ""
    while user_input not in ("Y", "N"):
        user_input = input('List AFE numbers? Y or N: ').strip().upper()
        if user_input not in ("Y", "N"):
            print('"', user_input, '"', 'is an invalid input')
    get_afenumbers(user_input)


if __name__ == "__main__":
    main()

1条回答

网友

1楼 · 发布于 2024-06-26 14:05:07

可以使用正则表达式来提取匹配项

先不考虑您代码中的循环，我们可以用下面的方式定义要搜索的示例文本：

text = '''A12345612341234 asdfa we'a aslkfj4353 alsdfasA345678asA858585943'''

现在我们要匹配任意一个大写字母（[A-Z]），后面紧跟恰好6位数字（[0-9]{6}）。从您的代码来看，似乎只需要匹配字母A，因此可以把模式中的[A-Z]替换为A（即 'A[0-9]{6}'）。下面的示例仍使用[A-Z]：

import re 
# Collect every "capital letter followed by exactly six digits" match in text.
re.compile(r'[A-Z][0-9]{6}').findall(text)

得到的结果是：

['A123456', 'A345678', 'A858585']

相关问题更多 >

编程相关推荐

热门问题

热门文章