Python中的高斯混合模型

2024-09-29 21:36:59 发布

您现在位置:Python中文网/ 问答频道 /正文

请帮忙

我收到错误:ValueError: Array 'mean' must be a vector of length 4

我的代码如下:

#初始化:

def initialize(data, K):  
    mu_0 = np.mean(data, 0)
    cov = np.cov(data.T)
    init_mus = mvn.rvs(size=K, mean=mu_0, cov=cov)
    init_covs = np.tile(cov,(K,1,1))
    init_pi = np.ones(K) / K
    return init_mus, init_covs, init_pi

#E-step

from scipy import stats
def e_step(data, mus, covs, pi):  
    np.log([stats.multivariate_normal(mus, covs).pdf(data)])
    log_p_y_x = np.log([1-pi, pi])[np.newaxis, ...] + \
                np.log([stats.multivariate_normal(mus, covs).pdf(data)]).T
    log_p_y_x_norm = logsumexp(log_p_y_x, axis=1)
    return log_gammas

#M步

def m_step(data, mus, covs, pi):
   total_count = data.shape[0]
    _, heuristics = e_step(data, mus, covs, pi)
    heuristic1 = heuristics[:, 1]
    sum_heuristic1 = np.sum(heuristic1)
    pi = (sum_heuristic1/total_count)
    mus = (heuristic1[..., np.newaxis].T.dot(data)/sum_heuristic1).flatten()
    diff1 = data - mus
    covs = diff1.T.dot(diff1 * heuristic1[..., np.newaxis]) / sum_heuristic1
    return mus, covs, pi

#原木可能性

def log_likelihood(data, mus, covs, pi,k): 
   k=int
    loglikelihood, _ = e_step(data, mus, covs, pi)
    ll= np(loglikelihood)
    return ll

#费用最大化

def EM(data, K, max_iters):
    init_mus, init_covs, init_pi = initialize(data, K)
    mus = init_mus
    covs = init_covs
    pi = init_pi
    lls = []
    ll_prev = -np.inf
    for i in range(max_iters):
        k=int
        ll =log_likelihood(data,mus, covs, pi,k)
        lls.append(ll)
    return mus, covs, pi, log_gammas, lls

#培训

def fit_gmm(data, K, num_restarts = 300):
   ll_best = -np.inf
    for i in range(num_restarts):
        mus, covs, pi, log_gammas, lls = EM(data, K, max_iters=1000)
        if lls[-1] > ll_best:
            mus_best = mus
            covs_best = covs
            pi_best = pi
            log_gammas_best = log_gammas
            ll_best = lls[-1]
            lls_best = lls
    return mus_best, covs_best, pi_best, log_gammas_best, lls_best

fig, axes = plt.subplots(3, 4, figsize=(10, 8), sharey='row')

print("================= data 1 ==================")
data1_GMM_results = []
for K in range(2, 6):
    mus_best, covs_best, pi_best, log_gammas_best, LLs_best = fit_gmm(data1, K, num_restarts = 10)
    cluster_assignments = np.argmax(log_gammas_best, axis=1)
    plot_clusters(data1, mus_best, covs_best, labels1, K, axes[0,K-2])
    data1_GMM_results.append((mus_best, covs_best, cluster_assignments, LLs_best[-1]))
print("================= data 2 ==================")
data2_GMM_results = []
for K in range(2, 6):
    mus_best, covs_best, pi_best, log_gammas_best, LLs_best = fit_gmm(data2, K, num_restarts = 10)
    cluster_assignments = np.argmax(log_gammas_best, axis=1)
    plot_clusters(data2, mus_best, covs_best, labels2, K, axes[1,K-2])
    data2_GMM_results.append((mus_best, covs_best, cluster_assignments, LLs_best[-1]))
print("================= data 3 ==================")
data3_GMM_results = []
for K in range(2, 6):
    mus_best, covs_best, pi_best, log_gammas_best, LLs_best = fit_gmm(data3, K, num_restarts = 10)
    cluster_assignments = np.argmax(log_gammas_best, axis=1)
    plot_clusters(data3, mus_best, covs_best, labels3, K, axes[2,K-2])
    data3_GMM_results.append((mus_best, covs_best, cluster_assignments, LLs_best[-1]))
for K in range(2, 6):
    axes[0,K-2].set_title("K = %d" % K)
for i in range(1, 4):
    axes[i-1,0].set_ylabel("Dataset %d" % i)
plt.tight_layout()

错误:

 ================= data 1 ==================
    ---------------------------------------------------------------------------
    ValueError                                Traceback (most recent call last)
    <ipython-input-11-e50708c0dbad> in <module>
          4 data1_GMM_results = []
          5 for K in range(2, 6):
    ----> 6     mus_best, covs_best, pi_best, log_gammas_best, LLs_best = fit_gmm(data1, K, num_restarts = 10)
          7     cluster_assignments = np.argmax(log_gammas_best, axis=1)
          8     plot_clusters(data1, mus_best, covs_best, labels1, K, axes[0,K-2])
    
    <ipython-input-10-114ce239b4c1> in fit_gmm(data, K, num_restarts)
          6     ll_best = -np.inf
          7     for i in range(num_restarts):
    ----> 8         mus, covs, pi, log_gammas, lls = EM(data, K, max_iters=1000)
          9         if lls[-1] > ll_best:
         10             mus_best = mus
    
    <ipython-input-9-4e5aa1b04da1> in EM(data, K, max_iters)
         20     for i in range(max_iters):
         21         k=int
    ---> 22         ll =log_likelihood(data,mus, covs, pi,k)
         23         lls.append(ll)
         24     return mus, covs, pi, log_gammas, lls
    
    <ipython-input-8-7c3f0c433124> in log_likelihood(data, mus, covs, pi, k)
          8     """
          9     k=int
    ---> 10     loglikelihood, _ = e_step(data, mus, covs, pi)
         11     ll= np(loglikelihood)
         12     return ll
    
    <ipython-input-6-400f900216a0> in e_step(data, mus, covs, pi)
          7     log_gammas: N x K, the matrix specifying the log probability of each point belonging to cluster k
          8     """
    ----> 9     np.log([stats.multivariate_normal(mus, covs).pdf(data)])
         10     log_p_y_x = np.log([1-pi, pi])[np.newaxis, ...] + \
         11                 np.log([stats.multivariate_normal(mus, covs).pdf(data)]).T
    
    ~\Anaconda3\lib\site-packages\scipy\stats\_multivariate.py in __call__(self, mean, cov, allow_singular, seed)
        361         return multivariate_normal_frozen(mean, cov,
        362                                           allow_singular=allow_singular,
    --> 363                                           seed=seed)
        364 
        365     def _process_parameters(self, dim, mean, cov):
    
    ~\Anaconda3\lib\site-packages\scipy\stats\_multivariate.py in __init__(self, mean, cov, allow_singular, seed, maxpts, abseps, releps)
        733         self._dist = multivariate_normal_gen(seed)
        734         self.dim, self.mean, self.cov = self._dist._process_parameters(
    --> 735                                                             None, mean, cov)
        736         self.cov_info = _PSD(self.cov, allow_singular=allow_singular)
        737         if not maxpts:
    
    ~\Anaconda3\lib\site-packages\scipy\stats\_multivariate.py in _process_parameters(self, dim, mean, cov)
        405         if mean.ndim != 1 or mean.shape[0] != dim:
        406             raise ValueError("Array 'mean' must be a vector of length %d." %
    --> 407                              dim)
        408         if cov.ndim == 0:
        409             cov = cov * np.eye(dim)
    
    ValueError: Array 'mean' must be a vector of length 4.

Tags: inselflogdatainitnppimean

热门问题