from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Our toy corpus: three short documents.
corpus = ['I like dog', 'I love cat', 'I interested in cat']

# Convert the text documents into a term-frequency (raw count) matrix.
cv = CountVectorizer()
counts = cv.fit_transform(corpus)

# Learn IDF weights from the count matrix and produce the tf-idf matrix.
tfidf_transformer = TfidfTransformer()
tfidf_matrix = tfidf_transformer.fit_transform(counts)

# Map each vocabulary word to its IDF weight.
# NOTE: idf_ holds corpus-wide inverse-document-frequency values, one per
# vocabulary term — per-document tf-idf scores live in tfidf_matrix.
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() instead.
word2tfidf = dict(zip(cv.get_feature_names_out(), tfidf_transformer.idf_))
for word, score in word2tfidf.items():
    print(word, score)
In [1]: from sklearn.datasets import fetch_20newsgroups
In [2]: data = fetch_20newsgroups(categories=['rec.autos'])
In [3]: from sklearn.feature_extraction.text import TfidfVectorizer
In [4]: cv = TfidfVectorizer()
In [5]: X = cv.fit_transform(data.data)
In [6]: cv.vocabulary_
@kinkajou,不是的,TF 和 IDF 是不同的概念,但它们同属于 TF-IDF 算法,即词频-逆文档频率(term frequency–inverse document frequency)
这是另一个有
CountVectorizer
和TfidfTransformer
的解决方案,可以找到每个单词的Tfidf
分数:输出:
是的。请参见已拟合(fit)/已转换(transform)的 TF-IDF 向量化器上的
.vocabulary_
属性。它是一个形式如下的字典:
{word : column index in array}
相关问题 更多 >
编程相关推荐