从特定路径读取多个文本文件，并使用 TF-IDF 获取每个文件中前十个最重要的单词

Traceback (most recent call last):
  File "C:/Python27/cluster.py", line 51, in <module>
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
  File "C:\Python27\lib\site-packages\textblob\decorators.py", line 24, in __get__
    value = obj.__dict__[self.func.__name__] = self.func(obj)
  File "C:\Python27\lib\site-packages\textblob\blob.py", line 643, in words
    return WordList(word_tokenize(self.raw, include_punc=False))
  File "C:\Python27\lib\site-packages\textblob\blob.py", line 218, in __init__
    self._collection = [Word(w) for w in collection]
  File "C:\Python27\lib\site-packages\textblob\blob.py", line 74, in __new__
    return super(Word, cls).__new__(cls, string)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xef in position 0: ordinal not in range(128)

from __future__ import division, unicode_literals import sys, getopt import glob,os import math from cStringIO import StringIO from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter from pdfminer.converter import TextConverter from pdfminer.layout import LAParams from pdfminer.pdfpage import PDFPage from textblob import TextBlob as tb sys.path.append("C:\Python27\Lib\site-packages\pdfminer.six-20160614- py2.7.egg\pdfminer") def tf(word, blob): return blob.words.count(word) / len(blob.words) def n_containing(word, bloblist): return sum(1 for blob in bloblist if word in blob) def idf(word, bloblist): return math.log(len(bloblist) / (1 + n_containing(word, bloblist))) def tfidf(word, blob, bloblist): return tf(word, blob) * idf(word, bloblist) file_names = glob.glob("C:\Python27\PDF2text\Text\\*.txt") corpus = [] for file_path in file_names: with open(file_path) as f_input: corpus.append(f_input.read()) print corpus bloblist = map(tb,corpus) for i, blob in enumerate(bloblist): print("Document {}".format(i + 1)) scores = {word: tfidf(word, blob, bloblist) for word in blob.words} sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True) for word, score in sorted_words[:10]: print("Word: {}, TF-IDF: {}".format(word, round(score, 5)))

0条回答

目前没有回答

相关问题更多 >

编程相关推荐

热门问题

热门文章