cython中的嵌套循环优化:有没有更快的方法来设置这个代码示例?

2024-09-30 14:23:29 发布

您现在位置:Python中文网/ 问答频道 /正文

作为一大块代码的一部分,我需要使用不同的参数调用(简化)函数example(粘贴在下面)多次(几十万次)。因此,我需要这个模块快速运行。在

该模块的主要问题似乎是多个嵌套循环。但是,我不确定是否真的存在与这些循环相关的不必要的开销(如编写的那样),或者代码是否真的像它所能达到的那样快。在

一般来说,在cython中处理多个嵌套for循环时,是否有可以用来减少开销和加快代码速度的循环优化技术?这些技术是否适用于下面粘贴的示例代码?在

我也在用extra_compile_args=["-ffast-math",'-O3']编译cython,尽管这似乎没有什么大的区别。在

如果这段代码在cython中真的不能更快(我希望不是这样),用C或Fortran编写这个模块的全部或部分有什么好处吗?在

import numpy as np
import math
cimport numpy as np
cimport cython

DTYPE = np.float
ctypedef np.float_t DTYPE_t

cdef extern from "math.h":
    double log(double x) nogil
    double exp(double x) nogil
    double pow(double x, double y) nogil


def example(double[::1] xbg_PSF_compressed, double[::1] theta, double[::1] f_ary, double[::1] df_rho_div_f_ary, double[::1] PS_dist_compressed, int[::1] data, double Sc = 1000.0):
    return example_int(xbg_PSF_compressed,theta, f_ary, df_rho_div_f_ary, PS_dist_compressed, data, Sc)

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
@cython.initializedcheck(False)
cdef double example_int(double[::1] xbg_PSF_compressed, double[::1] theta, double[::1] f_ary, double[::1] df_rho_div_f_ary, double[::1] PS_dist_compressed, int[::1] data, double Sc ):

    cdef int k_max = np.max(data) + 1

    cdef double A = np.float(theta[0])
    cdef double n1 = np.float(theta[1])
    cdef double n2 = np.float(theta[2])
    cdef double Sb = np.float(theta[3])

    cdef int npixROI = len(xbg_PSF_compressed)

    cdef double f2 = 0.0
    cdef double df_rho_div_f2 = 0.0


    cdef double[:,::1] x_m_ary = np.zeros((k_max + 1,npixROI), dtype=DTYPE)
    cdef double[::1] x_m_sum = np.zeros(npixROI, dtype=DTYPE)
    cdef double[:,::1] x_m_ary_f = np.zeros((k_max + 1, npixROI), dtype=DTYPE)
    cdef double[::1] x_m_sum_f = np.zeros(npixROI, dtype=DTYPE)

    cdef double[::1] g1_ary_f = np.zeros(k_max + 1, dtype=DTYPE)
    cdef double[::1] g2_ary_f = np.zeros(k_max + 1, dtype=DTYPE)

    cdef Py_ssize_t f_index, p, k, n


    #calculations for PS

    cdef int do_half = 0

    cdef double term1 = 0.0
    cdef double term2 = 0.0
    cdef double second_2_a = 0.0
    cdef double second_2_b = 0.0
    cdef double second_2_c = 0.0
    cdef double second_2_d = 0.0
    cdef double second_1_a = 0.0
    cdef double second_1_b = 0.0
    cdef double second_1_c = 0.0
    cdef double second_1_d = 0.0



    for f_index in range(len(f_ary)):
        f2 = f_ary[f_index]
        df_rho_div_f2 = df_rho_div_f_ary[f_index]
        g1_ary_f = np.random.random(k_max+1)
        g2_ary_f = np.random.random(k_max+1)
        term1 = (A * Sb * f2) \
                             * (1./(n1-1.) + 1./(1.-n2) - pow(Sb / Sc, n1-1.)/(n1-1.) \
                                - (pow(Sb * f2, n1-1.) * g1_ary_f[0] + pow(Sb * f2, n2-1.) * g2_ary_f[0]))
        second_1_a =  A  * pow(Sb * f2, n1)
        second_1_b = A * pow(Sb * f2, n2)

        for p in range(npixROI):
            x_m_sum_f[p] = term1 * PS_dist_compressed[p]
            x_m_sum[p] += df_rho_div_f2*x_m_sum_f[p]

            second_1_c = second_1_a * PS_dist_compressed[p]
            second_1_d = second_1_b * PS_dist_compressed[p]
            for k in range(data[p]+1):            
                x_m_ary_f[k,p] = second_1_c  * g1_ary_f[k] + second_1_d * g2_ary_f[k] 
                x_m_ary[k,p] += df_rho_div_f2*x_m_ary_f[k,p]

    cdef double[::1] nu_ary = np.zeros(k_max + 1, dtype=DTYPE)

    cdef double[::1] f0_ary = np.zeros(npixROI, dtype=DTYPE) 
    cdef double[::1] f1_ary = np.zeros(npixROI, dtype=DTYPE) 

    cdef double[:,::1] nu_mat = np.zeros((k_max+1, npixROI), dtype=DTYPE)


    cdef double ll = 0.

    for p in range(npixROI):
        f0_ary[p] = -(xbg_PSF_compressed[p] + x_m_sum[p])
        f1_ary[p] = (xbg_PSF_compressed[p] + x_m_ary[1,p])
        nu_mat[0,p] = exp(f0_ary[p])
        nu_mat[1,p] = nu_mat[0,p] * f1_ary[p]

        for k in range(2,data[p]+1):
            for n in range(0, k - 1):
                nu_mat[k,p] += (k-n)/ float(k) * x_m_ary[k-n,p] * nu_mat[n,p]
            nu_mat[k,p] += f1_ary[p] * nu_mat[k-1,p] / float(k)
        ll+=log( nu_mat[data[p],p])

    if math.isnan(ll) ==True or math.isinf(ll) ==True:
        ll = -10.1**10.

    return ll

作为参考,当尝试运行此代码时,示例参数为

^{pr2}$

这个例子可以称为example(k_max,xbg_PSF_compressed,theta,f_ary,df_rho_div_f_ary, PS_dist_compressed)。对于计时,我发现这个例子在~10 loops, best of 3: 147 ms per loop中计算。由于完整的代码需要数小时才能运行,因此运行时间的任何减少都会对性能产生很大的整体影响。在


Tags: dfnpzeroscompressedmaxnuf2double
1条回答
网友
1楼 · 发布于 2024-09-30 14:23:29

在代码中调用cython -a表明几乎所有相关的部分都是用纯C运行的,所以这里没有太多好处。在

不过,您过度使用数组,其中标量就足够了。或者你在使用矩阵,而一个一维数组就足够了。执行此优化将删除大量内存访问,如下所示:

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
@cython.initializedcheck(False)
cdef double example_int(double[::1] xbg_PSF_compressed, double[::1] theta, double[::1] f_ary, double[::1] df_rho_div_f_ary, double[::1] PS_dist_compressed, int[::1] data, double Sc ):

    cdef int k_max = np.max(data) + 1

    cdef double A = np.float(theta[0])
    cdef double n1 = np.float(theta[1])
    cdef double n2 = np.float(theta[2])
    cdef double Sb = np.float(theta[3])

    cdef int npixROI = len(xbg_PSF_compressed)

    cdef double f2 = 0.0
    cdef double df_rho_div_f2 = 0.0


    cdef double[:,::1] x_m_ary = np.zeros((k_max + 1,npixROI), dtype=DTYPE)
    cdef double[::1] x_m_sum = np.zeros(npixROI, dtype=DTYPE)
    cdef double x_m_ary_f
    cdef double x_m_sum_f

    cdef double[::1] g1_ary_f = np.zeros(k_max + 1, dtype=DTYPE)
    cdef double[::1] g2_ary_f = np.zeros(k_max + 1, dtype=DTYPE)

    cdef Py_ssize_t f_index, p, k, n


    #calculations for PS

    cdef int do_half = 0

    cdef double term1 = 0.0
    cdef double term2 = 0.0
    cdef double second_2_a = 0.0
    cdef double second_2_b = 0.0
    cdef double second_2_c = 0.0
    cdef double second_2_d = 0.0
    cdef double second_1_a = 0.0
    cdef double second_1_b = 0.0
    cdef double second_1_c = 0.0
    cdef double second_1_d = 0.0



    for f_index in range(len(f_ary)):
        f2 = f_ary[f_index]
        df_rho_div_f2 = df_rho_div_f_ary[f_index]
        g1_ary_f = np.random.random(k_max+1)
        g2_ary_f = np.random.random(k_max+1)
        term1 = (A * Sb * f2) \
                             * (1./(n1-1.) + 1./(1.-n2) - pow(Sb / Sc, n1-1.)/(n1-1.) \
                                - (pow(Sb * f2, n1-1.) * g1_ary_f[0] + pow(Sb * f2, n2-1.) * g2_ary_f[0]))
        second_1_a =  A  * pow(Sb * f2, n1)
        second_1_b = A * pow(Sb * f2, n2)

        for p in range(npixROI):
            x_m_sum_f = term1 * PS_dist_compressed[p]
            x_m_sum[p] += df_rho_div_f2*x_m_sum_f

            second_1_c = second_1_a * PS_dist_compressed[p]
            second_1_d = second_1_b * PS_dist_compressed[p]
            for k in range(data[p]+1):
                x_m_ary_f = second_1_c  * g1_ary_f[k] + second_1_d * g2_ary_f[k] 
                x_m_ary[k,p] += df_rho_div_f2*x_m_ary_f

    cdef double[::1] nu_ary = np.zeros(k_max + 1, dtype=DTYPE)

    cdef double f0_ary
    cdef double f1_ary

    cdef double[:] nu_mat = np.zeros((k_max+1), dtype=DTYPE)


    cdef double ll = 0.

    for p in range(npixROI):
        f0_ary = -(xbg_PSF_compressed[p] + x_m_sum[p])
        f1_ary = (xbg_PSF_compressed[p] + x_m_ary[1,p])
        nu_mat[0] = exp(f0_ary)
        nu_mat[1] = nu_mat[0] * f1_ary

        for k in range(2,data[p]+1):
            for n in range(0, k - 1):
                nu_mat[k] += (k-n)/ float(k) * x_m_ary[k-n,p] * nu_mat[n]
            nu_mat[k] += f1_ary * nu_mat[k-1] / float(k)
        ll+=log( nu_mat[data[p]])

    if math.isnan(ll) or math.isinf(ll):
        ll = -10.1**10.

    return ll

在这个版本上运行您的基准测试可以得到:

^{pr2}$

当原始代码运行速度慢得多时:

>>> %timeit example(xbg_PSF_compressed, theta, f_ary, df_rho_div_f_ary, PS_dist_compressed, data)
1 loops, best of 3: 146 ms per loop

相关问题 更多 >