我想使用gensim在我的tweet数据集上创建Word2Vec向量。该代码用于基于tweets的多标签情感分类。我已经聚合了tweets文件,其中包含107k条tweets。我使用它基于创建Word2Vec向量。我的代码:
np.set_printoptions(threshold=sys.maxsize)
#Pre-Processor Function
pre_processor = TextPreProcessor(
omit=['url', 'email', 'percent', 'money', 'phone', 'user',
'time', 'url', 'date', 'number'],
normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
'time', 'url', 'date', 'number'],
segmenter="twitter",
corrector="twitter",
unpack_hashtags=True,
unpack_contractions=True,
tokenizer=SocialTokenizer(lowercase=True).tokenize,
dicts=[emoticons]
)
#Averaging Words Vectors to Create Sentence Embedding
def word_averaging(wv, words):
all_words, mean = set(), []
for word in words:
if isinstance(word, np.ndarray):
mean.append(word)
elif word in wv.vocab:
mean.append(wv.syn0norm[wv.vocab[word].index])
all_words.add(wv.vocab[word].index)
if not mean:
logging.warning("cannot compute similarity with no input %s", words)
# FIXME: remove these examples in pre-processing
return np.zeros(wv.vector_size,)
mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
return mean
def word_averaging_list(wv, text_list):
return np.vstack([word_averaging(wv, post) for post in text_list ])
#Loading data
raw_aggregate_tweets = pandas.read_excel('E:\\aggregate.xlsx').iloc[:,0] #Loading all tweets to have a bigger word2vec corpus
raw_train_tweets = pandas.read_excel('E:\\train.xlsx').iloc[:,1] #Loading all train tweets
train_labels = np.array(pandas.read_excel('E:\\train.xlsx').iloc[:,2:13]) #Loading corresponding train labels (11 emotions)
raw_test_tweets = pandas.read_excel('E:\\test.xlsx').iloc[:,1] #Loading all test tweets
test_gold_labels = np.array(pandas.read_excel('E:\\test.xlsx').iloc[:,2:13]) #Loading corresponding test labels (11 emotions)
print("please wait")
#Pre-Processing
aggregate_tweets=[]
train_tweets=[]
test_tweets=[]
for tweets in raw_aggregate_tweets:
aggregate_tweets.append(pre_processor.pre_process_doc(tweets))
for tweets in raw_train_tweets:
train_tweets.append(pre_processor.pre_process_doc(tweets))
for tweets in raw_test_tweets:
test_tweets.append(pre_processor.pre_process_doc(tweets))
#Vectorizing
w2v_model = gensim.models.Word2Vec(aggregate_tweets, min_count = 10, size = 300, window = 8)
train_array = word_averaging_list(w2v_model.wv,train_tweets)
test_array = word_averaging_list(w2v_model.wv,test_tweets)
但我得到了这个错误:
TypeError Traceback (most recent call last)
<ipython-input-1-8a5fe4dbf144> in <module>
110 print(w2v_model.wv.vectors.shape)
111
--> 112 train_array = word_averaging_list(w2v_model.wv,train_tweets)
113 test_array = word_averaging_list(w2v_model.wv,test_tweets)
114
<ipython-input-1-8a5fe4dbf144> in word_averaging_list(wv, text_list)
70
71 def word_averaging_list(wv, text_list):
---> 72 return np.vstack([word_averaging(wv, post) for post in text_list ])
73
74 #Averaging Words Vectors to Create Sentence Embedding
<ipython-input-1-8a5fe4dbf144> in <listcomp>(.0)
70
71 def word_averaging_list(wv, text_list):
---> 72 return np.vstack([word_averaging(wv, post) for post in text_list ])
73
74 #Averaging Words Vectors to Create Sentence Embedding
<ipython-input-1-8a5fe4dbf144> in word_averaging(wv, words)
58 mean.append(word)
59 elif word in wv.vocab:
---> 60 mean.append(wv.syn0norm[wv.vocab[word].index])
61 all_words.add(wv.vocab[word].index)
62
TypeError: 'NoneType' object is not subscriptable
看起来你的帖子大部分都是代码;请添加更多详细信息。这个站点的错误是什么?天哪。我没有更多的细节。抱歉,我必须这样做才能绕过错误
#Averaging Words Vectors to Create Sentence Embedding
def get_mean_vector(word2vec_model, words):
# remove out-of-vocabulary words
words = [word for word in words if word in word2vec_model.vocab]
if len(words) >= 1:
return np.mean(word2vec_model[words], axis=0)
else:
return np.zeros(word2vec_model.vector_size)
#Vectorizing
w2v_model = gensim.models.Word2Vec(aggregate_tweets, min_count = 11, size = 400, window = 18, sg=1)
train_array=[]
test_array=[]
for tweet in train_tweets:
vec = get_mean_vector(w2v_model.wv, tweet)
if len(vec) > 0:
train_array.append(vec)
for tweet in test_tweets:
vec = get_mean_vector(w2v_model.wv, tweet)
if len(vec) > 0:
test_array.append(vec)
错误“'NoneType'对象不可下标”表示您试图下标(使用
[]
进行索引访问)一个实际为None
的变量查看突出显示的行,
wv.syn0norm
可能是None
它不会自动存在:它只在需要时创建,例如通过
.most_similar()
操作创建。但是,一旦培训完成,您可以通过调用.init_sims()
手动触发其创建:(请注意,您可能会从代码中得到一个弃用警告:在最近的gensim版本中,该属性被重命名为
vectors_norm
。此外,出于某些目的,使用这些单位长度规范化向量可能不如原始向量好。)相关问题 更多 >
编程相关推荐