对Word2Vec词向量求平均值时出错

np.set_printoptions(threshold=sys.maxsize)

# Pre-processor applied to every tweet before training / vectorizing.
pre_processor = TextPreProcessor(
    omit=['url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'url', 'date', 'number'],
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'url', 'date', 'number'],
    segmenter="twitter",
    corrector="twitter",
    unpack_hashtags=True,
    unpack_contractions=True,
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[emoticons],
)


# Averaging word vectors to create a sentence embedding
def word_averaging(wv, words):
    """Collapse the vectors of ``words`` into a single sentence vector.

    Items that are already ``np.ndarray`` are used as-is; items found in
    ``wv.vocab`` are looked up in ``wv.syn0norm``; anything else is skipped.

    Returns the unit-length mean of the collected vectors as ``float32``.
    When nothing was collected, logs a warning and returns a zero vector of
    length ``wv.vector_size``.
    """
    all_words, mean = set(), []

    for word in words:
        if isinstance(word, np.ndarray):
            mean.append(word)
        elif word in wv.vocab:
            # NOTE(review): this is the line the accompanying traceback points
            # at ("'NoneType' object is not subscriptable"). wv.syn0norm
            # appears to be None unless it has been initialised beforehand
            # (e.g. via wv.init_sims()) -- confirm for the gensim version used.
            mean.append(wv.syn0norm[wv.vocab[word].index])
            all_words.add(wv.vocab[word].index)

    if not mean:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        return np.zeros(wv.vector_size,)

    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    return mean


def word_averaging_list(wv, text_list):
    """Stack the ``word_averaging`` vector of every post in ``text_list`` row-wise."""
    return np.vstack([word_averaging(wv, post) for post in text_list ])


# Loading data
# All tweets, used to give word2vec a bigger training corpus
raw_aggregate_tweets = pandas.read_excel('E:\\aggregate.xlsx').iloc[:,0]
# Train tweets and their labels (11 emotions)
raw_train_tweets = pandas.read_excel('E:\\train.xlsx').iloc[:,1]
train_labels = np.array(pandas.read_excel('E:\\train.xlsx').iloc[:,2:13])
# Test tweets and their gold labels (11 emotions)
raw_test_tweets = pandas.read_excel('E:\\test.xlsx').iloc[:,1]
test_gold_labels = np.array(pandas.read_excel('E:\\test.xlsx').iloc[:,2:13])
print("please wait")

# Pre-processing: run every raw tweet through the pre-processor
aggregate_tweets=[]
train_tweets=[]
test_tweets=[]
for tweets in raw_aggregate_tweets:
    aggregate_tweets.append(pre_processor.pre_process_doc(tweets))
for tweets in raw_train_tweets:
    train_tweets.append(pre_processor.pre_process_doc(tweets))
for tweets in raw_test_tweets:
    test_tweets.append(pre_processor.pre_process_doc(tweets))

# Vectorizing: train word2vec on the aggregate corpus, then average per tweet
w2v_model = gensim.models.Word2Vec(aggregate_tweets, min_count = 10, size = 300, window = 8)
train_array = word_averaging_list(w2v_model.wv,train_tweets)
test_array = word_averaging_list(w2v_model.wv,test_tweets)

TypeError                                 Traceback (most recent call last)
<ipython-input-1-8a5fe4dbf144> in <module>
    110 print(w2v_model.wv.vectors.shape)
    111 
--> 112 train_array = word_averaging_list(w2v_model.wv,train_tweets)
    113 test_array = word_averaging_list(w2v_model.wv,test_tweets)
    114 

<ipython-input-1-8a5fe4dbf144> in word_averaging_list(wv, text_list)
     70 
     71 def word_averaging_list(wv, text_list):
---> 72     return np.vstack([word_averaging(wv, post) for post in text_list ])
     73 
     74 #Averaging Words Vectors to Create Sentence Embedding

<ipython-input-1-8a5fe4dbf144> in <listcomp>(.0)
     70 
     71 def word_averaging_list(wv, text_list):
---> 72     return np.vstack([word_averaging(wv, post) for post in text_list ])
     73 
     74 #Averaging Words Vectors to Create Sentence Embedding

<ipython-input-1-8a5fe4dbf144> in word_averaging(wv, words)
     58             mean.append(word)
     59         elif word in wv.vocab:
---> 60             mean.append(wv.syn0norm[wv.vocab[word].index])
     61             all_words.add(wv.vocab[word].index)
     62 

TypeError: 'NoneType' object is not subscriptable

第二种求平均值的方法

# Averaging word vectors to create a sentence embedding
def get_mean_vector(word2vec_model, words):
    """Return the element-wise mean of the vectors of the in-vocabulary ``words``.

    Words missing from ``word2vec_model.vocab`` are dropped first. If none
    remain, a zero vector of length ``word2vec_model.vector_size`` is returned.
    """
    # remove out-of-vocabulary words
    known_words = [token for token in words if token in word2vec_model.vocab]
    if not known_words:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(word2vec_model[known_words], axis=0)


# Vectorizing: train word2vec on the aggregate corpus, then average per tweet
w2v_model = gensim.models.Word2Vec(aggregate_tweets, min_count = 11, size = 400, window = 18, sg=1)

train_array=[]
test_array=[]

for tweet in train_tweets:
    vec = get_mean_vector(w2v_model.wv, tweet)
    if len(vec) > 0:
        train_array.append(vec)

for tweet in test_tweets:
    vec = get_mean_vector(w2v_model.wv, tweet)
    if len(vec) > 0:
        test_array.append(vec)

1条回答

网友

1楼 · 发布于 2024-09-25 00:31:50

错误“'NoneType' object is not subscriptable”（'NoneType'对象不可下标）表示您试图对一个实际值为None的变量进行下标操作（即使用[]进行索引访问）。

查看突出显示的行，wv.syn0norm可能是None

它不会自动存在：它只在需要时才会被创建，例如在执行.most_similar()操作时。不过，一旦训练完成，您可以通过调用.init_sims()手动触发其创建：

# Manually trigger creation of the normalised vectors (wv.syn0norm) once
# training is done, so they exist before being indexed.
w2v_model.wv.init_sims()

（请注意，您可能会从代码中得到一个弃用警告：在最近的gensim版本中，该属性被重命名为vectors_norm。此外，出于某些目的，使用这些单位长度规范化向量可能不如原始向量好。）

二次平均法

相关问题更多 >

编程相关推荐

热门问题

热门文章