cython中的嵌套循环优化：有没有更快的方法来设置这个代码示例？

import numpy as np import math cimport numpy as np cimport cython DTYPE = np.float ctypedef np.float_t DTYPE_t cdef extern from "math.h": double log(double x) nogil double exp(double x) nogil double pow(double x, double y) nogil def example(double[::1] xbg_PSF_compressed, double[::1] theta, double[::1] f_ary, double[::1] df_rho_div_f_ary, double[::1] PS_dist_compressed, int[::1] data, double Sc = 1000.0): return example_int(xbg_PSF_compressed,theta, f_ary, df_rho_div_f_ary, PS_dist_compressed, data, Sc) @cython.boundscheck(False) @cython.wraparound(False) @cython.cdivision(True) @cython.initializedcheck(False) cdef double example_int(double[::1] xbg_PSF_compressed, double[::1] theta, double[::1] f_ary, double[::1] df_rho_div_f_ary, double[::1] PS_dist_compressed, int[::1] data, double Sc ): cdef int k_max = np.max(data) + 1 cdef double A = np.float(theta[0]) cdef double n1 = np.float(theta[1]) cdef double n2 = np.float(theta[2]) cdef double Sb = np.float(theta[3]) cdef int npixROI = len(xbg_PSF_compressed) cdef double f2 = 0.0 cdef double df_rho_div_f2 = 0.0 cdef double[:,::1] x_m_ary = np.zeros((k_max + 1,npixROI), dtype=DTYPE) cdef double[::1] x_m_sum = np.zeros(npixROI, dtype=DTYPE) cdef double[:,::1] x_m_ary_f = np.zeros((k_max + 1, npixROI), dtype=DTYPE) cdef double[::1] x_m_sum_f = np.zeros(npixROI, dtype=DTYPE) cdef double[::1] g1_ary_f = np.zeros(k_max + 1, dtype=DTYPE) cdef double[::1] g2_ary_f = np.zeros(k_max + 1, dtype=DTYPE) cdef Py_ssize_t f_index, p, k, n #calculations for PS cdef int do_half = 0 cdef double term1 = 0.0 cdef double term2 = 0.0 cdef double second_2_a = 0.0 cdef double second_2_b = 0.0 cdef double second_2_c = 0.0 cdef double second_2_d = 0.0 cdef double second_1_a = 0.0 cdef double second_1_b = 0.0 cdef double second_1_c = 0.0 cdef double second_1_d = 0.0 for f_index in range(len(f_ary)): f2 = f_ary[f_index] df_rho_div_f2 = df_rho_div_f_ary[f_index] g1_ary_f = np.random.random(k_max+1) g2_ary_f = np.random.random(k_max+1) term1 = (A * Sb * f2) \ * (1./(n1-1.) + 1./(1.-n2) - pow(Sb / Sc, n1-1.)/(n1-1.) \ - (pow(Sb * f2, n1-1.) * g1_ary_f[0] + pow(Sb * f2, n2-1.) * g2_ary_f[0])) second_1_a = A * pow(Sb * f2, n1) second_1_b = A * pow(Sb * f2, n2) for p in range(npixROI): x_m_sum_f[p] = term1 * PS_dist_compressed[p] x_m_sum[p] += df_rho_div_f2*x_m_sum_f[p] second_1_c = second_1_a * PS_dist_compressed[p] second_1_d = second_1_b * PS_dist_compressed[p] for k in range(data[p]+1): x_m_ary_f[k,p] = second_1_c * g1_ary_f[k] + second_1_d * g2_ary_f[k] x_m_ary[k,p] += df_rho_div_f2*x_m_ary_f[k,p] cdef double[::1] nu_ary = np.zeros(k_max + 1, dtype=DTYPE) cdef double[::1] f0_ary = np.zeros(npixROI, dtype=DTYPE) cdef double[::1] f1_ary = np.zeros(npixROI, dtype=DTYPE) cdef double[:,::1] nu_mat = np.zeros((k_max+1, npixROI), dtype=DTYPE) cdef double ll = 0. for p in range(npixROI): f0_ary[p] = -(xbg_PSF_compressed[p] + x_m_sum[p]) f1_ary[p] = (xbg_PSF_compressed[p] + x_m_ary[1,p]) nu_mat[0,p] = exp(f0_ary[p]) nu_mat[1,p] = nu_mat[0,p] * f1_ary[p] for k in range(2,data[p]+1): for n in range(0, k - 1): nu_mat[k,p] += (k-n)/ float(k) * x_m_ary[k-n,p] * nu_mat[n,p] nu_mat[k,p] += f1_ary[p] * nu_mat[k-1,p] / float(k) ll+=log( nu_mat[data[p],p]) if math.isnan(ll) ==True or math.isinf(ll) ==True: ll = -10.1**10. return ll

1条回答

网友

1楼 · 发布于 2024-09-30 14:23:29

在代码中调用cython -a表明几乎所有相关的部分都是用纯C运行的，所以这里没有太多好处。在

不过，您过度使用数组，其中标量就足够了。或者你在使用矩阵，而一个一维数组就足够了。执行此优化将删除大量内存访问，如下所示：

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
@cython.initializedcheck(False)
cdef double example_int(double[::1] xbg_PSF_compressed, double[::1] theta, double[::1] f_ary, double[::1] df_rho_div_f_ary, double[::1] PS_dist_compressed, int[::1] data, double Sc ):

    cdef int k_max = np.max(data) + 1

    cdef double A = np.float(theta[0])
    cdef double n1 = np.float(theta[1])
    cdef double n2 = np.float(theta[2])
    cdef double Sb = np.float(theta[3])

    cdef int npixROI = len(xbg_PSF_compressed)

    cdef double f2 = 0.0
    cdef double df_rho_div_f2 = 0.0


    cdef double[:,::1] x_m_ary = np.zeros((k_max + 1,npixROI), dtype=DTYPE)
    cdef double[::1] x_m_sum = np.zeros(npixROI, dtype=DTYPE)
    cdef double x_m_ary_f
    cdef double x_m_sum_f

    cdef double[::1] g1_ary_f = np.zeros(k_max + 1, dtype=DTYPE)
    cdef double[::1] g2_ary_f = np.zeros(k_max + 1, dtype=DTYPE)

    cdef Py_ssize_t f_index, p, k, n


    #calculations for PS

    cdef int do_half = 0

    cdef double term1 = 0.0
    cdef double term2 = 0.0
    cdef double second_2_a = 0.0
    cdef double second_2_b = 0.0
    cdef double second_2_c = 0.0
    cdef double second_2_d = 0.0
    cdef double second_1_a = 0.0
    cdef double second_1_b = 0.0
    cdef double second_1_c = 0.0
    cdef double second_1_d = 0.0



    for f_index in range(len(f_ary)):
        f2 = f_ary[f_index]
        df_rho_div_f2 = df_rho_div_f_ary[f_index]
        g1_ary_f = np.random.random(k_max+1)
        g2_ary_f = np.random.random(k_max+1)
        term1 = (A * Sb * f2) \
                             * (1./(n1-1.) + 1./(1.-n2) - pow(Sb / Sc, n1-1.)/(n1-1.) \
                                - (pow(Sb * f2, n1-1.) * g1_ary_f[0] + pow(Sb * f2, n2-1.) * g2_ary_f[0]))
        second_1_a =  A  * pow(Sb * f2, n1)
        second_1_b = A * pow(Sb * f2, n2)

        for p in range(npixROI):
            x_m_sum_f = term1 * PS_dist_compressed[p]
            x_m_sum[p] += df_rho_div_f2*x_m_sum_f

            second_1_c = second_1_a * PS_dist_compressed[p]
            second_1_d = second_1_b * PS_dist_compressed[p]
            for k in range(data[p]+1):
                x_m_ary_f = second_1_c  * g1_ary_f[k] + second_1_d * g2_ary_f[k] 
                x_m_ary[k,p] += df_rho_div_f2*x_m_ary_f

    cdef double[::1] nu_ary = np.zeros(k_max + 1, dtype=DTYPE)

    cdef double f0_ary
    cdef double f1_ary

    cdef double[:] nu_mat = np.zeros((k_max+1), dtype=DTYPE)


    cdef double ll = 0.

    for p in range(npixROI):
        f0_ary = -(xbg_PSF_compressed[p] + x_m_sum[p])
        f1_ary = (xbg_PSF_compressed[p] + x_m_ary[1,p])
        nu_mat[0] = exp(f0_ary)
        nu_mat[1] = nu_mat[0] * f1_ary

        for k in range(2,data[p]+1):
            for n in range(0, k - 1):
                nu_mat[k] += (k-n)/ float(k) * x_m_ary[k-n,p] * nu_mat[n]
            nu_mat[k] += f1_ary * nu_mat[k-1] / float(k)
        ll+=log( nu_mat[data[p]])

    if math.isnan(ll) or math.isinf(ll):
        ll = -10.1**10.

    return ll

在这个版本上运行您的基准测试可以得到：

^{pr2}$

当原始代码运行速度慢得多时：

>>> %timeit example(xbg_PSF_compressed, theta, f_ary, df_rho_div_f_ary, PS_dist_compressed, data)
1 loops, best of 3: 146 ms per loop

相关问题更多 >

编程相关推荐

热门问题

热门文章