我尝试了一些脚本,我捕获pdf文件,然后将该文件转换为txt文件。我得到的文本文件类型为_io.textIoWrapper。现在我用_io.textIoWrapper对象调用itertools.isslice()。如果文件格式的纯io.textIoWrapper没有给出任何错误,则生成ValueError: I/O operation on closed file.
。我试图打开textIoWrapper对象,但结果是expect only str , bytes or None not TextIoWrapper type
views.py
def save_file(cls, user, file, file_format, project_id,file1):
project = get_object_or_404(Project, pk=project_id)
parser = cls.select_parser(file_format)
if file_format == 'pdf':
path = default_storage.save('text_pdf_file.pdf', ContentFile(file.read()))
return_file = convert_pdf_txt(path,file_format)
print(type(return_file)) # _io.textIoWrapper
file = return_file
data = parser.parse(file,file_format)
storage = project.get_storage(data)
storage.save(user)
utils.py
class PlainTextParser(FileParser):
def parse(self, file,file_format):
if file_format == 'plain':
file = EncodedIO(file)
file = io.TextIOWrapper(file, encoding=file.encoding)
while True:
batch = list(itertools.islice(file, settings.IMPORT_BATCH_SIZE))
if not batch:
break
yield [{'text': line.strip()} for line in batch]
convert.py
import os
from docx import Document
import pdfplumber as pp
import io
import unidecode
import re
def remove_accented_chars(text):
text = unidecode.unidecode(text)
return text
def remove_extra_spaces(line):
return re.sub(' +|\t+',' ',line)
def remove_special_char(line):
return re.sub(r"[^a-zA-Z0-9%.@]+",' ', line)
def preprocessing(lines,fileExtension):
example_text_file = "ex_txt.txt"
for line in lines:
if fileExtension == "docx":
x=str(line.text)
elif fileExtension == "pdf":
x = line.extract_text()
else:
x = line
x=remove_accented_chars(x)
x=remove_extra_spaces(x)
x=remove_special_char(x)
with open(example_text_file,"a+",) as pre:
if len(x)>=5:
pre.write(x+"\n")
return pre
def convert_pdf_txt(path,fileExtension):
with pp.open(path) as pdf :
pages = pdf.pages
pre = preprocessing(pages,fileExtension)
return pre
目前没有回答
相关问题 更多 >
编程相关推荐