词干提取后使用CountVectorizer对文本数据集进行向量化时得到的结果全为零

# Stemming
from nltk.stem import PorterStemmer

ps = PorterStemmer()
# Stem every token of the filtered text (filtered_sent comes from an earlier
# step that is not shown in this snippet).
stemmed_words = [ps.stem(w) for w in filtered_sent]
print("Filtered Sentence:", filtered_sent[0:50])
print("Stemmed Sentence:", stemmed_words[0:50])

Stemmed Sentence: ['0', 'crack', 'adam', 'disco', 'cooki', 'ecstasi', 'discard', 'travel', '...', '1', 'o.k', '.', 'o.k', '.', 'o.k', '.', 'o.k', '.', 'lar', 'play', 'joke', 'joke', 'joke', 'jo', '...', '2', 'free', 'peopl', 'introduct', 'record', 'entranc', 'entra', '...', '3', 'brawl', 'caus', 'doctor', 'osteopathi', 'unreal', '...', '4', 'nobelium', 'nobelium', 'brawl', 'act', 'doctor', 'osteop', '...', '5', 'freemsg']

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Count bigrams; each element of stemmed_words is handed to the vectorizer
# as its own document.
cv = CountVectorizer(analyzer='word', ngram_range=(2, 2))
text_counts = cv.fit_transform(stemmed_words)
print(text_counts[0:10].toarray())

print(text_counts[0:10].toarray()) [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]

1条回答

网友

1楼 · 发布于 2024-05-28 11:17:29

您需要将未分词（un-tokenized）的完整句子输入到CountVectorizer或TfidfVectorizer，否则它会把每个单词都当作一个单独的句子（文档）。因此结果全为零，因为在您的输入中找不到任何bi-gram

更简单的方法是重写向量器的build_analyzer方法（修改self.analyzer == 'word':之后的部分），这样您就不必单独执行词干提取（stemming）

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.stem import PorterStemmer
import pandas as pd

# Module-level stemmer instance; CustomVectorizer's word analyzer calls
# ps.stem() on every token.
ps = PorterStemmer()

class CustomVectorizer(CountVectorizer):
    """CountVectorizer whose ``'word'`` analyzer stems tokens before n-grams.

    Each token produced by the tokenizer is passed through the module-level
    ``ps`` stemmer before word n-grams are assembled, so raw (un-stemmed)
    sentences can be fed directly to ``fit`` / ``fit_transform``.

    Only the ``'word'`` analyzer is customised; any other ``analyzer``
    setting is handled by ``CountVectorizer`` itself.
    """

    def build_analyzer(self):
        """Return a callable that preprocesses, tokenizes and stems a document.

        Returns:
            A function taking one raw document and returning its list of
            features.  For ``analyzer='word'`` these are n-grams built from
            stemmed tokens; for every other setting the parent's analyzer is
            returned unchanged.

        Raises:
            ValueError: presumably raised by the parent class when
                ``analyzer`` is not a recognised scheme (behaviour inherited
                from ``CountVectorizer``; not re-implemented here).
        """
        if self.analyzer != 'word':
            # Callable, 'char', 'char_wb' and invalid settings need no
            # stemming hook, so defer to the parent implementation instead of
            # carrying a copy of its internals.
            return super().build_analyzer()

        preprocess = self.build_preprocessor()
        tokenize = self.build_tokenizer()
        stop_words = self.get_stop_words()

        def analyze(doc):
            """Decode, preprocess, tokenize, stem, then build word n-grams."""
            tokens = tokenize(preprocess(self.decode(doc)))
            stems = [ps.stem(token) for token in tokens]
            # NOTE: stop words are handed over together with the already
            # stemmed tokens, so a custom stop list presumably needs to hold
            # stemmed forms to match.
            return self._word_ngrams(stems, stop_words)

        return analyze

# Two small demo documents; each string is one document for the vectorizer.
sentences = [
    " crack adam disco cooki ecstasi discard another sentence",
    "another sentences",
]

vec = CustomVectorizer(analyzer='word', ngram_range=(2, 2))
text_counts = vec.fit_transform(sentences)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on releases that predate it.
if hasattr(vec, "get_feature_names_out"):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()

# One row per document, one column per stemmed bigram.
print(pd.DataFrame(text_counts.toarray(), columns=feature_names))

   adam disco  anoth sentenc  cooki ecstasi  crack adam  discard anoth  disco cooki  ecstasi discard
0           1              1              1           1              1            1                1
1           0              1              0           0              0            0                0

相关问题更多 >

编程相关推荐

热门问题

热门文章