def get_nouns(text):
    """Return the nouns found in *text*, joined by single spaces.

    The text is analysed with a freshly created ``MeCab.Tagger``. Every
    output line is expected to look like ``surface<TAB>feature,...``; a token
    is kept when the first comma-separated feature field equals ``名詞``.

    Args:
        text: The string to analyse (presumably Japanese, given the ``名詞``
            tag that is matched -- confirm against callers).

    Returns:
        The surface forms of all matching tokens, in order of appearance,
        separated by single spaces. Empty string when nothing matched.
    """
    tagger = MeCab.Tagger()
    nouns = []
    # The last line of the parser output is dropped, exactly as before
    # (presumably MeCab's "EOS" terminator -- verify for custom output formats).
    for line in tagger.parse(text).splitlines()[:-1]:
        # Split on the first tab only: the previous two-name unpacking of
        # ``split('\t')`` raised ValueError for any line with extra tabs.
        surface, tab, feature = line.partition('\t')
        if not tab:
            # No tab at all -> not a "surface<TAB>feature" line; skip it.
            continue
        if feature.split(',')[0] == '名詞':  # noun
            nouns.append(surface)
    return ' '.join(nouns)
def bio(howmany=10):
    """Cluster the education texts found in ``read`` and plot them in 2-D.

    Steps:
      1. For every item of the module-level ``read`` whose ``"education"``
         value is non-empty, join that value with spaces and reduce it to its
         nouns with ``get_nouns``.
      2. Vectorize the documents (bag of words -> TF-IDF with L2 norm and
         sublinear term frequency).
      3. Run k-means on the full TF-IDF matrix.
      4. Project documents and cluster centres to two dimensions with
         ``TruncatedSVD`` and draw them with matplotlib.
      5. Print the documents of each cluster, one list per cluster.

    NOTE: clustering happens in the full TF-IDF space while the plot only
    shows a 2-component projection, so clusters may overlap visually even
    when they are separated in the original space.

    Args:
        howmany: Number of k-means clusters. Defaults to 10, the value that
            was previously hard-coded.

    Returns:
        None. Results are printed and shown in a matplotlib window.

    Raises:
        ValueError: If no item in ``read`` has education data.
    """
    # Build one noun-only document per entry that has education data.
    # Assumes giin["education"] is a sequence of strings -- TODO confirm.
    biolist = []
    for giin in read:
        education = giin["education"]
        if not education:
            continue
        biolist.append(get_nouns(" ".join(education)))
    if not biolist:
        # Fail early with a clear message instead of an opaque vectorizer error.
        raise ValueError("no education entries to cluster")

    # Bag-of-words counts, then TF-IDF weighting.
    bags = CountVectorizer().fit_transform(biolist)
    tfidf = TfidfTransformer(norm='l2', sublinear_tf=True).fit_transform(bags)

    # Only the fitted model (labels_, cluster_centers_) is needed, not the
    # distance matrix that fit_transform would return.
    km_model = KMeans(n_clusters=howmany, init='k-means++')
    km_model.fit(tfidf)

    # Fit the 2-D projection on the documents once, then reuse the SAME
    # projection for the centres. Re-fitting on the centres (as before) put
    # them in a different coordinate system from the plotted points.
    lsa2 = TruncatedSVD(2)
    compressed_text_list = lsa2.fit_transform(tfidf)
    compressed_center_list = lsa2.transform(km_model.cluster_centers_)

    # Group documents by label in a single pass; labels_ is aligned with
    # biolist by position, so no list.index() lookup is required.
    clus_list = [[] for _ in range(howmany)]
    for doc, label in zip(biolist, km_model.labels_):
        clus_list[label].append(doc)
    for cluster in clus_list:
        print(cluster)
        print(" ")

    # Documents coloured by cluster label; centres drawn as red plus signs.
    plt.scatter(compressed_text_list[:, 0], compressed_text_list[:, 1],
                c=km_model.labels_)
    plt.scatter(compressed_center_list[:, 0], compressed_center_list[:, 1],
                c="r", marker="+")
    plt.show()
我有一段代码,用来把人们的教育经历分成 10 组。我的散点图是这样的。
如您所见,散点图中的点并没有真正按组分开,不同的颜色相互混在一起。如何修改这段代码,使分组更加精确?
目前没有回答
相关问题 更多 >
编程相关推荐