Python聚类数值d

import random
from math import sqrt

# Sample 2-D points used as clustering input.
rows = [(1, 1), (3, 6), (11, 2), (7, 19), (22, 11), (32, 11)]


def pearson(v1, v2):
    """Return the Pearson distance ``1 - r`` between two numeric vectors.

    ``r`` is the Pearson correlation coefficient, so the result lies in
    ``[0, 2]``: 0 for perfectly correlated vectors, 2 for perfectly
    anti-correlated ones.

    Returns 0 when the correlation is undefined, i.e. when either vector is
    constant (zero variance) or the vectors are empty.  This mirrors the
    original "denominator is zero" convention.
    """
    # float() keeps the divisions below true divisions even for int inputs
    # on interpreters where ``/`` truncates integers.
    n = float(len(v1))
    if n == 0:
        return 0

    # Plain sums.
    sum1 = sum(v1)
    sum2 = sum(v2)

    # Sums of the squares.
    sum1_sq = sum(v * v for v in v1)
    sum2_sq = sum(v * v for v in v2)

    # Sum of the element-wise products.
    p_sum = sum(a * b for a, b in zip(v1, v2))

    # Pearson r = num / den.
    num = p_sum - (sum1 * sum2 / n)
    spread = (sum1_sq - sum1 * sum1 / n) * (sum2_sq - sum2 * sum2 / n)

    # ``spread`` is mathematically >= 0, but rounding can push it slightly
    # below zero, which would make sqrt() raise ValueError.
    if spread <= 0:
        return 0

    return 1.0 - num / sqrt(spread)


def kmeans(rows, distance=pearson, k=3, max_iterations=100):
    """Cluster ``rows`` into ``k`` groups with Lloyd-style k-means.

    Args:
        rows: sequence of equal-length numeric sequences (the data points).
        distance: callable ``distance(centroid, row)`` returning a number;
            smaller means closer.
        k: number of clusters.
        max_iterations: upper bound on assignment/update rounds.

    Returns:
        A list of ``k`` lists; the i-th list holds the indices (into
        ``rows``) of the points assigned to cluster i.  For empty ``rows``
        every list is empty.

    NOTE: centroids are updated by averaging members.  That update is only
    guaranteed to decrease the objective for squared-Euclidean-like
    distances; with other measures (including the default ``pearson``) the
    loop may simply run until ``max_iterations``.
    """
    if not rows:
        return [[] for _ in range(k)]

    dims = len(rows[0])

    # Min and max of every coordinate, used to place the initial centroids.
    ranges = [
        (min(row[i] for row in rows), max(row[i] for row in rows))
        for i in range(dims)
    ]

    # k randomly placed centroids inside the bounding box of the data.
    clusters = [
        [random.random() * (high - low) + low for low, high in ranges]
        for _ in range(k)
    ]

    lastmatches = None
    bestmatches = [[] for _ in range(k)]
    for t in range(max_iterations):
        print('Iteration %d' % t)
        bestmatches = [[] for _ in range(k)]

        # Assign every row to its closest centroid (first one wins ties).
        for j, row in enumerate(rows):
            bestmatch = 0
            best_distance = distance(clusters[0], row)
            for i in range(1, k):
                d = distance(clusters[i], row)
                if d < best_distance:
                    bestmatch, best_distance = i, d
            bestmatches[bestmatch].append(j)

        # Assignments stopped changing: converged.
        if bestmatches == lastmatches:
            break
        lastmatches = bestmatches

        # Move each centroid to the average of its members; a centroid
        # with no members stays where it is.
        for i, members in enumerate(bestmatches):
            if not members:
                continue
            totals = [0.0] * dims
            for rowid in members:
                for m, value in enumerate(rows[rowid]):
                    totals[m] += value
            clusters[i] = [total / len(members) for total in totals]

    return bestmatches

1条回答

网友

1楼 · 发布于 2024-09-27 18:04:00

不要使用pearson相关系数的k均值

这可能会失败，因为Pearson相关系数与取平均值的操作不兼容，可能会阻止算法收敛。更糟糕的是，它可能产生无效值。

如果你取这两个向量

1 2 3 4 5
9 8 7 6 5

那么平均值是

5 5 5 5 5

所得的平均值不能与Pearson相关系数一起使用，因为它是常数向量（方差为零，相关系数无定义）。

K均值仅适用于Bregman散度，例如平方欧几里德距离，因为它的目标是最小化方差，而不是最小化任意距离。

K-means不能用于任意距离。如果您有其他距离，请使用k-medoids（PAM）或其他聚类算法。

不要使用pearson相关系数的k均值

K均值仅适用于Bregman散度，例如平方欧几里德距离，因为它的目标是最小化方差，而不是最小化任意距离。

相关问题更多 >

编程相关推荐

热门问题

热门文章