我试图构建一个可以执行Kruskal-Wallis测试的类。该类使用以下公式计算H:
但是,它产生的H值与scipy的kruskal
函数不同。有人知道为什么会这样吗
import numpy as np
from scipy.stats import rankdata
from scipy.stats import kruskal
class Kruskal_Wallis():
def __init__(self):
pass
def fit(self, groups):
"""
Performs Kruskal-Wallis test.
:param groups: list containing 1D group arrays
Adds the following attributes:
- n: size of total population
- n_groups: number of groups (n_groups = len(n_i) = len(r_i))
- n_i: array containing group sizes
- df: degrees of freedom
- r2_i: array containing the square of the sum of ranks for each group
- h: kruskal-wallis statistic
"""
def sum_ranks_per_group(groups):
n_groups = len(groups)
n_i = np.array([group.shape[0] for group in groups])
data = np.array([])
for group in groups:
data = np.concatenate((data, group), axis=0)
ranked_data = rankdata(data, method="average")
ranked_groups = ranked_data.reshape((n_groups, n_i[0])) #works only if groups have equal size
summed_ranks = ranked_groups.sum(axis=1)
return summed_ranks
def get_h(n, r2_i, n_i):
summed_r2_i_per_n_i = (r2_i/n_i).sum()
h = (12/(n*(n-1)) * summed_r2_i_per_n_i) - 3*(n+1)
return h
n_groups = len(groups)
n_i = np.array([group.shape[0] for group in groups])
n = sum(n_i)
df = n_groups - 1
r2_i = sum_ranks_per_group(groups)**2
h = get_h(n, r2_i, n_i)
self.n_groups = n_groups
self.n_i = n_i
self.n = n
self.df = df
self.r2_i = r2_i
self.h = h
## Compare results yielded by scipy.stats.kruskal and Kruskal_Wallis class
groups = [np.arange(1,3),
np.arange(3,5)]
res = kruskal(groups[0], groups[1])
kruskal_wallis = Kruskal_Wallis()
kruskal_wallis.fit(groups)
print(res)
print(kruskal_wallis.h)
答案之间的差异可能是由python处理division operations中的
float
类型的方式造成的不要使用pythonic除法(
/
),尝试使用numpy的true division相关问题 更多 >
编程相关推荐