How to match KMeans cluster labels to true labels in Python

Published 2024-10-03 11:18:06


I have a problem labeling data with the KMeans algorithm. My test sentence is assigned to the correct cluster, but I don't get the correct label. I matched the clusters to the true labels with numpy, but KMeans can shuffle the cluster indices, so the true labels no longer correspond to the cluster numbers. I need help solving this. Here is my code:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re
import numpy as np
from collections import Counter

stop = set(stopwords.words('indonesian'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()

# Cleaning the text sentences so that punctuation marks, stop words & digits are removed  
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    processed = re.sub(r"\d+","",normalized)
    y = processed.split()
    #print (y)
    return y

path = "coba.txt"

train_clean_sentences = []
with open(path, 'r') as fp:
    for line in fp:
        line = line.strip()
        cleaned = clean(line)
        cleaned = ' '.join(cleaned)
        train_clean_sentences.append(cleaned)

#print(train_clean_sentences)
       
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(train_clean_sentences)

# Clustering the training 30 sentences with K-means technique
modelkmeans = KMeans(n_clusters=3, init='k-means++', max_iter=200, n_init=100)
modelkmeans.fit(X)

teks_satu = "Aplikasi Machine Learning untuk mengenali daun mangga dengan metode CNN"

test_clean_sentence = []

cleaned_test = clean(teks_satu)
cleaned = ' '.join(cleaned_test)
cleaned = re.sub(r"\d+","",cleaned)
test_clean_sentence.append(cleaned)
    
Test = vectorizer.transform(test_clean_sentence) 

true_test_labels = ['AI','VR','Sistem Informasi']

predicted_labels_kmeans = modelkmeans.predict(Test)
print(predicted_labels_kmeans)

print ("\n-------------------------------PREDICTIONS BY K-Means--------------------------------------")
print ("\nIndex of Virtual Reality : ",Counter(modelkmeans.labels_[5:10]).most_common(1)[0][0])
print ("Index of Machine Learning : ",Counter(modelkmeans.labels_[0:5]).most_common(1)[0][0]) 
print ("Index of Sistem Informasi : ",Counter(modelkmeans.labels_[10:15]).most_common(1)[0][0])
print ("\n",teks_satu,":",true_test_labels[np.int(predicted_labels_kmeans)],":",predicted_labels_kmeans)


2 Answers

I had the same problem: my clustering (KMeans) returned cluster numbers that were different from the true classes, so the actual and predicted labels did not match. The solution that worked for me was this code (scroll down to "Permutation maximizing the sum of the diagonal elements"). Although that approach works, I think it can give the wrong result in some cases.
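
For reference, here is a minimal sketch of that diagonal-maximizing idea using scipy.optimize.linear_sum_assignment (the Hungarian algorithm); the toy label arrays below are illustrative assumptions, not part of the linked answer:

import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import confusion_matrix

# Hypothetical true labels and raw KMeans cluster assignments
y_true = np.array([0, 0, 1, 1, 2, 2])
y_pred = np.array([2, 2, 0, 0, 1, 1])   # clusters are permuted relative to y_true

cm = confusion_matrix(y_true, y_pred)

# linear_sum_assignment minimizes cost, so negate cm to maximize the diagonal sum
row_ind, col_ind = linear_sum_assignment(-cm)

# col_ind[k] is the cluster matched to true class row_ind[k]; invert that mapping
cluster_to_class = dict(zip(col_ind, row_ind))
y_aligned = np.array([cluster_to_class[c] for c in y_pred])
print(y_aligned)   # [0 0 1 1 2 2], now consistent with y_true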

Here is a concrete example showing how to match KMeans cluster IDs to the training-data labels. The basic idea is that the confusion_matrix should have large values on its diagonal when the classification is correct. This is the confusion matrix before the cluster IDs are associated with the training labels:

cm = 
array([[  0, 395,   0,   5,   0],
       [  0,   2,   5, 391,   2],
       [  2,   0,   0,   0, 398],
       [  0,   0, 400,   0,   0],
       [398,   0,   0,   0,   2]])

Now we just need to rearrange the confusion matrix so that its large values end up on the diagonal. This is easy to do:

cm_argmax = cm.argmax(axis=0)   # for each cluster column, the dominant true class
cm_argmax
y_pred_ = np.array([cm_argmax[i] for i in y_pred])   # relabel predictions accordingly

This gives us the new confusion matrix, which now looks familiar, right?

cm_ = 
array([[395,   5,   0,   0,   0],
       [  2, 391,   2,   5,   0],
       [  0,   0, 398,   0,   2],
       [  0,   0,   0, 400,   0],
       [  0,   0,   2,   0, 398]])

You can further verify the result with accuracy_score:

y_pred_ = np.array([cm_argmax[i] for i in y_pred])
accuracy_score(y,y_pred_)
# 0.991

The complete, self-contained code looks like this:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import confusion_matrix, accuracy_score

blob_centers = np.array(
    [[ 0.2,  2.3],
     [-1.5 ,  2.3],
     [-2.8,  1.8],
     [-2.8,  2.8],
     [-2.8,  1.3]])
blob_std = np.array([0.4, 0.3, 0.1, 0.1, 0.1])
X, y = make_blobs(n_samples=2000, centers=blob_centers,
                  cluster_std=blob_std, random_state=7)

def plot_clusters(X, y=None):
    plt.scatter(X[:, 0], X[:, 1], c=y, s=1)
    plt.xlabel("$x_1$", fontsize=14)
    plt.ylabel("$x_2$", fontsize=14, rotation=0)

plt.figure(figsize=(8, 4))
plot_clusters(X)
plt.show()

k = 5
kmeans = KMeans(n_clusters=k, random_state=42)
y_pred = kmeans.fit_predict(X)
cm = confusion_matrix(y, y_pred)
print(cm)                      # clusters are permuted relative to the true classes

# For each cluster column, pick the true class that dominates it
cm_argmax = cm.argmax(axis=0)
print(cm_argmax)

# Relabel the predictions with the matched true-class indices
y_pred_ = np.array([cm_argmax[i] for i in y_pred])
cm_ = confusion_matrix(y, y_pred_)
print(cm_)                     # large values are now on the diagonal
print(accuracy_score(y, y_pred_))
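
Applied back to the question's text-clustering code, the same trick looks roughly like the sketch below. It reuses modelkmeans, Test, and teks_satu from the question, and it assumes the 15 training sentences in coba.txt are ordered as in the prints above (indices 0-4 = AI/Machine Learning, 5-9 = VR, 10-14 = Sistem Informasi); the true_train_labels array and label_names list are assumptions for illustration.

import numpy as np
from sklearn.metrics import confusion_matrix

# Assumed ordering of the training sentences (0-4 AI, 5-9 VR, 10-14 Sistem Informasi)
true_train_labels = np.array([0]*5 + [1]*5 + [2]*5)
label_names = ['AI', 'VR', 'Sistem Informasi']

# Map each cluster id to the true class that dominates it
cm = confusion_matrix(true_train_labels, modelkmeans.labels_)
cluster_to_label = cm.argmax(axis=0)

# Translate the predicted cluster of the test sentence into a readable label
predicted_cluster = modelkmeans.predict(Test)[0]
print(teks_satu, ":", label_names[cluster_to_label[predicted_cluster]])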
