NumPy或SciPy计算加权中值

# Boilerplate & import files
import csv
import scipy as sp
from scipy import stats
from scipy.stats import norm
import numpy as np
from numpy import genfromtxt
import pandas as pd
import matplotlib.pyplot as plt

# Load the comma-separated input file into a NumPy array.
inputFile = '/Users/cl/prov.csv'
origArray = genfromtxt(inputFile, delimiter=",")
nArray = np.array(origArray)
dimensions = nArray.shape
shape = np.asarray(dimensions)

# Mask values == 0
maTest = np.ma.masked_equal(nArray, 0)

# Create array of masses the same shape as the weights (nArray)
fieldLength = shape[0]
rowLength = shape[1]
# The list must exist before the loop appends to it; the original never
# initialised it, which raises NameError on the first append.
massArr = []
for i in range(rowLength):
    # One column of masses per input column: 0, 10, 20, ... (step of 10 is
    # hard-coded here).
    createArr = np.arange(0, fieldLength*10, 10)
    nCreateArr = np.array(createArr)
    massArr.append(nCreateArr)
nCreateArr = np.array(massArr)
# Transpose so the result has shape (fieldLength, rowLength), like nArray.
nmassArr = nCreateArr.transpose()

2条回答

网友

1楼 · 编辑于 2024-10-02 22:28:53

如果我对你的问题理解正确，我们可以这样做：把各项观测数（频数）加总后除以2，就得到中位数对应的观测序号；接下来需要找出这个序号对应的是哪一个观测值。

这里的一个技巧是用np.cumsum计算观测和。这给了我们一个连续的累积和。

示例：
np.cumsum([1,2,3,4]) -> [ 1, 3, 6, 10]
每个元素都是它之前所有元素与自身之和。这里一共有10个观测，所以中位数是第5个观测（最后一个元素除以2得到5）。
现在看一下cumsum的结果，很容易看出第5个观测一定落在第二个和第三个元素之间（即累积和3与6之间）。

所以我们需要做的就是找出中位数（5）在累积和中的索引。
np.searchsorted正是我们所需要的。它将找到将元素插入数组的索引，以便保持排序。

这样做的代码如下：

import numpy as np

# Test data: each row is one set of frequency counts; column j holds the
# number of observations whose mass is j * 10.
freq_count = np.array([[30, 191, 9, 0],
                       [10, 20, 300, 10],
                       [10, 20, 30, 40],
                       [100, 10, 10, 10],
                       [1, 1, 1, 100]])

# Running total of observations along each row; the last entry of a row is
# that row's total number of observations.
c = np.cumsum(freq_count, axis=1)

# The median observation is number total/2. searchsorted returns the first
# column whose cumulative count reaches that number, i.e. the bin that
# contains the median observation.
indices = [np.searchsorted(row, row[-1] / 2.0) for row in c]

# Correct if the masses are indeed 0, 10, 20, ...
# int() converts the NumPy integer scalars so the list prints as plain numbers.
masses = [int(i) * 10 for i in indices]

# This is just for explanation.
print("median masses is:", masses)
print(freq_count)
# Cumulative sums with each row's half-total appended as an extra column.
print(np.hstack((c, c[:, -1, np.newaxis] / 2.0)))

输出为：

median masses is: [10 20 20  0 30]  
[[ 30 191   9   0]  <- The test data
 [ 10  20 300  10]  
 [ 10  20  30  40]  
 [100  10  10  10]  
 [  1   1   1 100]]  
[[  30.   221.   230.   230.   115. ]  <- cumsum results with median added to the end.
 [  10.    30.   330.   340.   170. ]     you can see from this where they fit in.
 [  10.    30.    60.   100.    50. ]  
 [ 100.   110.   120.   130.    65. ]  
 [   1.     2.     3.   103.    51.5]]

网友

2楼 · 编辑于 2024-10-02 22:28:53

分享一些我参与的代码。这允许您在excel电子表格的每一列上运行统计信息。

import xlrd
import sys
import csv
import numpy as np
import itertools
from itertools import chain

# Compute per-column statistics (median, mean, max, min, standard deviation)
# of masses weighted by the counts stored in each column of a spreadsheet,
# and write one row of statistics per column to a CSV file.
book = xlrd.open_workbook('/filepath/workbook.xlsx')
sh = book.sheet_by_name("Sheet1")
ofile = '/outputfilepath/workbook.csv'

masses = sh.col_values(0, start_rowx=1)  # first column has mass
# The original bound this to `age` but then iterated over the undefined name
# `ages`; the header row is what the loops below walk over.
ages = sh.row_values(0, start_colx=1)    # first row has age ranges

# One list of counts per age column (columns 1..len(ages), header row skipped).
# The original initialised an unused `mass` list instead of this one.
age = [sh.col_values(col, start_rowx=1) for col in range(1, len(ages) + 1)]

stats = []
for counts in age:
    # Replicate each mass as many times as its count, flattened into one list,
    # so ordinary (unweighted) statistics on it equal the weighted ones.
    medianlist = list(chain.from_iterable(
        itertools.repeat(m, int(n)) for m, n in zip(masses, counts)))

    # convert to array and mask out zeroes
    npa = np.ma.masked_equal(np.array(medianlist), 0)

    # np.ma.median honours the mask; plain np.median is not mask-aware.
    median = np.ma.median(npa)
    meanMass = np.average(npa)
    maxMass = np.max(npa)
    minMass = np.min(npa)
    stdev = np.std(npa)

    stats1 = [median, meanMass, maxMass, minMass, stdev]
    print(stats1)

    stats.append(stats1)

# NOTE: fmt="%d" writes every statistic as an integer, so fractional parts of
# the mean and standard deviation are dropped in the output file.
np.savetxt(ofile, stats, fmt="%d")

相关问题更多 >

编程相关推荐

热门问题

热门文章