python-kmedoids更高效地计算新的medoid中心

###helper function here### def compute_d_p(X, medoids, p): m = len(X) medoids_shape = medoids.shape # If a 1-D array is provided, # it will be reshaped to a single row 2-D array if len(medoids_shape) == 1: medoids = medoids.reshape((1,len(medoids))) k = len(medoids) S = np.empty((m, k)) for i in range(m): d_i = np.linalg.norm(X[i, :] - medoids, ord=p, axis=1) S[i, :] = d_i**p return S

for datap in cluster_points: new_medoid = datap new_dissimilarity= np.sum(compute_d_p(X, datap, p)) if new_dissimilarity < avg_dissimilarity : avg_dissimilarity = new_dissimilarity out_medoids[i] = datap

# Imports import pandas as pd import numpy as np import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import Axes3D from sklearn import datasets from sklearn.decomposition import PCA # Dataset iris = datasets.load_iris() data = pd.DataFrame(iris.data,columns = iris.feature_names) target = iris.target_names labels = iris.target #Scaling from sklearn.preprocessing import MinMaxScaler scaler = MinMaxScaler() data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns) #PCA Transformation from sklearn.decomposition import PCA pca = PCA(n_components=3) principalComponents = pca.fit_transform(data) PCAdf = pd.DataFrame(data = principalComponents , columns = ['principal component 1', 'principal component 2','principal component 3']) datapoints = PCAdf.values m, f = datapoints.shape k = 3 def init_medoids(X, k): from numpy.random import choice from numpy.random import seed seed(1) samples = choice(len(X), size=k, replace=False) return X[samples, :] medoids_initial = init_medoids(datapoints, 3) def compute_d_p(X, medoids, p): m = len(X) medoids_shape = medoids.shape # If a 1-D array is provided, # it will be reshaped to a single row 2-D array if len(medoids_shape) == 1: medoids = medoids.reshape((1,len(medoids))) k = len(medoids) S = np.empty((m, k)) for i in range(m): d_i = np.linalg.norm(X[i, :] - medoids, ord=p, axis=1) S[i, :] = d_i**p return S S = compute_d_p(datapoints, medoids_initial, 2) def assign_labels(S): return np.argmin(S, axis=1) labels = assign_labels(S) def update_medoids(X, medoids, p): S = compute_d_p(points, medoids, p) labels = assign_labels(S) out_medoids = medoids for i in set(labels): avg_dissimilarity = np.sum(compute_d_p(points, medoids[i], p)) cluster_points = points[labels == i] for datap in cluster_points: new_medoid = datap new_dissimilarity= np.sum(compute_d_p(points, datap, p)) if new_dissimilarity < avg_dissimilarity : avg_dissimilarity = new_dissimilarity out_medoids[i] = datap return out_medoids def has_converged(old_medoids, medoids): return set([tuple(x) for x in old_medoids]) == set([tuple(x) for x in medoids]) #Full algorithm def kmedoids(X, k, p, starting_medoids=None, max_steps=np.inf): if starting_medoids is None: medoids = init_medoids(X, k) else: medoids = starting_medoids converged = False labels = np.zeros(len(X)) i = 1 while (not converged) and (i <= max_steps): old_medoids = medoids.copy() S = compute_d_p(X, medoids, p) labels = assign_labels(S) medoids = update_medoids(X, medoids, p) converged = has_converged(old_medoids, medoids) i += 1 return (medoids,labels) results = kmedoids(datapoints, 3, 2) final_medoids = results[0] data['clusters'] = results[1]

1条回答

网友

1楼 · 发布于 2024-05-19 14:31:53

numpy的广播功能很有可能会有所帮助。让广播在3+维中工作有点棘手，我通常不得不求助于一些尝试和错误来获得正确的细节

这里使用linalg.norm会使事情更加复杂，因为我的代码版本不会为所有输入提供与linalg.norm相同的结果。但我相信，在这种情况下，它将为所有相关输入提供相同的结果

我在代码中添加了一些注释来解释某些细节背后的想法

def compute_d_p_broadcasted(X, medoids, p):

    # If a 1-D array is provided, 
    # it will be reshaped to a single row 2-D array

    if len(medoids.shape) == 1: 
        medoids = medoids.reshape((1,len(medoids)))

    # In general, broadcasting n-dim arrays requires that the last
    # dim of the first array be a singleton dimension, and that the
    # first dim of the second array be a singleton dimension. We can
    # quickly accomplish that by slicing with `None` in the appropriate
    # places. (`np.newaxis` is a slightly more self-documenting way
    # of spelling `None`, but I rarely bother.)

    # In this case, the shapes of the other two dimensions also
    # have to align in the same way you'd expect for a dot product.
    # So we pass `medoids.T`.

    diff = np.abs(X[:, :, None] - medoids.T[None, :, :])

    # The last tricky bit is to figure out which axis to sum. Right
    # now, the array is a 3-dimensional array, with the first 
    # dimension corresponding to the rows of `X` and the last 
    # dimension corresponding to the columns of `medoids.T`. 
    # The middle dimension corresponds to the underlying dimensionality
    # of the space; that's what we want to sum for a sum of squares.
    # (Or sum of cubes for L3 norm, etc.)

    return (diff ** p).sum(axis=1)

def compute_d_p(X, medoids, p):
    m = len(X)
    medoids_shape = medoids.shape
    # If a 1-D array is provided, 
    # it will be reshaped to a single row 2-D array
    if len(medoids_shape) == 1: 
        medoids = medoids.reshape((1,len(medoids)))
    k = len(medoids)
    S = np.empty((m, k))
    for i in range(m):
        d_i = np.linalg.norm(X[i, :] - medoids, ord=p, axis=1)
        S[i, :] = d_i**p
    return S

# A couple of simple tests:
X = np.array([[ 1.0,  2,  3],
              [ 4,  5,  6],
              [ 7,  8,  9],
              [10, 11, 12]])

medoids = X[[0, 2], :]

np.allclose(compute_d_p(X, medoids, 2), 
            compute_d_p_broadcasted(X, medoids, 2))
# Returns True
np.allclose(compute_d_p(X, medoids, 3),
            compute_d_p_broadcasted(X, medoids, 3))
# Returns True

当然，这些测试并不能说明这是否真的会带来显著的加速。您必须自己检查相关用例。但我怀疑这至少会有所帮助

相关问题更多 >

编程相关推荐

热门问题

热门文章