First off, I know there are a lot of questions on similar topics, but after a day of searching, reading and testing I still can't find an answer.
I have a Python function that computes the pairwise correlations of a numpy ndarray (m x n). I originally did this purely in numpy, but that version also computed the reciprocal pairs (i.e. as well as the correlation between row A and row B of the matrix, it computed the correlation between row B and row A), so I took a slightly different approach that is faster for matrices with large m (the actual size in my problem is m ~ 8000).
This works great, but it is still a bit slow: there will be many such matrices, and doing them all will take a long time. So I started looking into Cython as a way of speeding things up. From what I've read, though, Cython won't really speed up numpy. Is that true, or is there something I'm missing?
I think the bottlenecks below are np.sqrt, np.dot, the call to the ndarray's .T method, and np.absolute. I've seen people use sqrt from libc.math to replace np.sqrt, so I suppose my first question is: are there analogous functions in libc.math for the other methods that I could use? I'm afraid I'm completely unfamiliar with C/C++ or any of the C family of languages, so this typing and Cython business is new territory to me; apologies if the reason/solution is obvious.
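For what it's worth, Cython does ship C-level declarations for several of these: libc.math provides sqrt, fabs (the scalar counterpart of np.absolute), pow, exp and log, which can be cimported directly. A minimal sketch, with the caveat that these are scalar functions and only pay off inside typed loops, not as drop-in replacements for whole-array NumPy calls:

```cython
# Scalar C math functions available to Cython via libc.math
from libc.math cimport sqrt, fabs

cdef double scaled_norm(double a, double b):
    # sqrt and fabs here compile to plain C calls with no Python overhead
    return sqrt(a * a + b * b) * fabs(a)
```

There is no C equivalent of .T or np.dot in libc.math; those stay at the NumPy level unless you hand-write the loops.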
Failing that, is there anything else I can do to improve performance?
Below are my pyx code, the setup code, and the call to the pyx function. I don't know if it matters, but when I run python setup.py build_ext --inplace it works, but with a lot of warnings that I don't really understand. Could that also be a reason why I'm not seeing a speed-up?
Any help is greatly appreciated, and apologies for the super-long post.
setup.py:
from distutils.core import setup
from distutils.extension import Extension
import numpy
from Cython.Distutils import build_ext
setup(
    cmdclass = {'build_ext': build_ext},
    ext_modules = [Extension("calcBrownCombinedP",
                             ["calcBrownCombinedP.pyx"],
                             include_dirs=[numpy.get_include()])]
)
The pyx code, 'calcBrownCombinedP.pyx':
import numpy as np
cimport numpy as np
from scipy import stats

DTYPE = np.int
ctypedef np.int_t DTYPE_t

def calcBrownCombinedP(np.ndarray genotypeArray):
    cdef int nSNPs, i
    cdef np.ndarray ms, datam, datass, d, rs, temp
    cdef float runningSum, sigmaSq, E, df
    nSNPs = genotypeArray.shape[0]
    ms = genotypeArray.mean(axis=1)[(slice(None,None,None),None)]
    datam = genotypeArray - ms
    datass = np.sqrt(stats.ss(datam,axis=1))
    runningSum = 0
    for i in xrange(nSNPs):
        temp = np.dot(datam[i:],datam[i].T)
        d = (datass[i:]*datass[i])
        rs = temp / d
        rs = np.absolute(rs)[1:]
        runningSum += sum(rs*(3.25+(0.75*rs)))
    sigmaSq = 4*nSNPs+2*runningSum
    E = 2*nSNPs
    df = (2*(E*E))/sigmaSq
    runningSum = sigmaSq/(2*E)
    return runningSum
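As a side note, the inner loop above computes the Pearson correlations of row i against rows i onward, which can be cross-checked against np.corrcoef. A minimal sketch (using (datam**2).sum(axis=1) in place of stats.ss, which computes the same row sums of squares):

```python
import numpy as np

rng = np.random.RandomState(0)
genotypeArray = rng.randint(0, 3, size=(50, 20)).astype(float)

# Centre each row and take row norms, mirroring the function above
datam = genotypeArray - genotypeArray.mean(axis=1)[:, None]
datass = np.sqrt((datam ** 2).sum(axis=1))

i = 3
rs = np.dot(datam[i:], datam[i]) / (datass[i:] * datass[i])

# The same values come straight out of the full correlation matrix
assert np.allclose(rs, np.corrcoef(genotypeArray)[i, i:])
```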
Code testing the above against pure Python, 'test.py':
import numpy as np
from scipy import stats
import random
import time
from calcBrownCombinedP import calcBrownCombinedP
from PycalcBrownCombinedP import PycalcBrownCombinedP

ms = [10,50,100,500,1000,5000]
for m in ms:
    print '---testing implementation with m = {0}---'.format(m)
    genotypeArray = np.empty((m,20),dtype=int)
    for i in xrange(m):
        genotypeArray[i] = [random.randint(0,2) for j in xrange(20)]
    print genotypeArray.shape

    start = time.time()
    print calcBrownCombinedP(genotypeArray)
    print 'cython implementation took {0}'.format(time.time() - start)

    start = time.time()
    print PycalcBrownCombinedP(genotypeArray)
    print 'python implementation took {0}'.format(time.time() - start)
The output of the code is:
---testing implementation with m = 10---
(10L, 20L)
2.13660168648
cython implementation took 0.000999927520752
2.13660167749
python implementation took 0.000999927520752
---testing implementation with m = 50---
(50L, 20L)
8.82721138
cython implementation took 0.00399994850159
8.82721130234
python implementation took 0.00500011444092
---testing implementation with m = 100---
(100L, 20L)
16.7438983917
cython implementation took 0.0139999389648
16.7438965333
python implementation took 0.0120000839233
---testing implementation with m = 500---
(500L, 20L)
80.5343856812
cython implementation took 0.183000087738
80.5343694046
python implementation took 0.161000013351
---testing implementation with m = 1000---
(1000L, 20L)
160.122573853
cython implementation took 0.615000009537
160.122491308
python implementation took 0.598000049591
---testing implementation with m = 5000---
(5000L, 20L)
799.813842773
cython implementation took 10.7159998417
799.813880445
python implementation took 11.2510001659
And finally, the pure Python implementation, 'PycalcBrownCombinedP.py':
import numpy as np
from scipy import stats

def PycalcBrownCombinedP(genotypeArray):
    nSNPs = genotypeArray.shape[0]
    ms = genotypeArray.mean(axis=1)[(slice(None,None,None),None)]
    datam = genotypeArray - ms
    datass = np.sqrt(stats.ss(datam,axis=1))
    runningSum = 0
    for i in xrange(nSNPs):
        temp = np.dot(datam[i:],datam[i].T)
        d = (datass[i:]*datass[i])
        rs = temp / d
        rs = np.absolute(rs)[1:]
        runningSum += sum(rs*(3.25+(0.75*rs)))
    sigmaSq = 4*nSNPs+2*runningSum
    E = 2*nSNPs
    df = (2*(E*E))/sigmaSq
    runningSum = sigmaSq/(2*E)
    return runningSum
Profiling shows that the bottleneck is the last line of the loop:

runningSum += sum(rs*(3.25+(0.75*rs)))

This is not surprising, since both the Python and the Cython version use the Python built-in function sum. Switching to np.sum speeds the code up by 4.5x when the input array has shape (5000, 20). If a small loss of precision is acceptable, you can exploit linear algebra to speed that last line up further: the sum is really a vector dot product. Even then it is still suboptimal, because it loops over rs three times and constructs two temporary arrays the size of rs. Using elementary algebra, the expression can be rewritten so that it not only gives the original result without the rounding error of the previous version, but loops over rs only twice and uses constant memory. (*)

The bottleneck is then np.dot, so installing a better BLAS library will buy you more than rewriting the whole thing in Cython.

(*) Or logarithmic memory in the latest NumPy, which has a recursive reimplementation of np.sum that is faster than the old iterative one.
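The chain of rewrites described above can be sketched as follows; the exact snippets were lost from the answer, so this is a reconstruction going from built-in sum to np.sum, then to a dot product, and finally to the two-pass constant-memory form:

```python
import numpy as np

rng = np.random.RandomState(1)
rs = np.abs(rng.randn(5000))  # stand-in for the correlation vector

# Original: Python built-in sum, slow element-by-element iteration
v0 = sum(rs * (3.25 + 0.75 * rs))

# Step 1: np.sum avoids the Python-level loop
v1 = np.sum(rs * (3.25 + 0.75 * rs))

# Step 2: the sum of rs * (...) is a vector dot product
v2 = np.dot(rs, 3.25 + 0.75 * rs)

# Step 3: expand the product; two passes over rs, constant extra memory
v3 = 3.25 * rs.sum() + 0.75 * np.dot(rs, rs)

assert np.allclose([v0, v1, v2], v3)
```

To see which BLAS np.dot is linked against, np.__config__.show() prints the build configuration.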