我正在用PyCuda开发一种遗传细胞自动机。每个细胞都有大量的基因组数据,以及细胞参数。我想知道什么是最有效的方法来1)将单元格数据传递到CUDA内核,然后2)处理这些数据。在
我从一个特别糟糕的(imo)开始,但仍然有效的解决方案。它在一个单独的数组中传递每个参数,然后用switch case和大量重复代码处理它们。在
然后,我意识到我可以很快得到每个内核函数的大量参数,并决定重写它。在
第二种解决方案是将所有的单元参数存储在一个具有额外维度的数组中。这在代码中要优雅得多,但令人惊讶的是,代码运行速度慢了10倍!在
为了更清楚地说明,每个单元格需要存储的数据的完整列表:
我把一个自动机步骤分成两个阶段。第一个(发射阶段)是计算(Fi,Mi,Ti)每个相邻单元。第二个阶段(吸收阶段)是将8x(Fi,Mi,Ti)值与当前电池的状态混合。目前还没有基因组或注册,但我需要它的数据以备将来使用。在
所以,我的第一个解决方案的代码是:
Mk = 64
Tk = 1000
emit_gpu = ElementwiseKernel("int3 *cells, int3 *dcells0, int3 *dcells1, int3 *dcells2, int3 *dcells3, int3 *dcells4, int3 *dcells5, int3 *dcells6, int3 *dcells7, int w, int h", """
int x = i / h;
int y = i % h;
int3 cell = cells[i];
float M = (float) cell.y;
float T = (float) cell.z;
int Mi = (int) (fmin(1, T / Tk) * M);
cells[i].y -= Mi;
cells[i].z -= (int) (T * fmin(1, T / Tk) / 1);
int Fi = cell.x;
int Mbase = Mi / 8;
int Mpart = Mi % 8;
int Madd;
int Ti = cell.z;
int ii, xo, yo;
for (int cc = 0; cc < 9; cc++) {
int c = (cc + Fi) % 9;
if (c == 4) continue;
xo = x + c%3 - 1;
if (xo < 0) xo = w + xo;
if (xo >= w) xo = xo - w;
yo = y + c/3 - 1;
if (yo < 0) yo = h + yo;
if (xo >= w) yo = yo - h;
ii = xo * h + yo;
if (Mpart > 0) { Madd = 1; Mpart--;} else Madd = 0;
switch(c) {
case 0: dcells0[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
case 1: dcells1[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
case 2: dcells2[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
case 3: dcells3[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
case 5: dcells4[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
case 6: dcells5[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
case 7: dcells6[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
case 8: dcells7[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
default: break;
}
}
""", "ca_prepare", preamble="""
#define Tk %s
""" % Tk)
absorb_gpu = ElementwiseKernel("int3 *cells, int3 *dcells0, int3 *dcells1, int3 *dcells2, int3 *dcells3, int3 *dcells4, int3 *dcells5, int3 *dcells6, int3 *dcells7, int *img, int w, int h", """
int3 cell = cells[i];
int3 dcell = dcells0[i];
cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
cell.x = cell.x % 360;
if (cell.x < 0) cell.x += 360;
dcell = dcells1[i];
cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
cell.x = cell.x % 360;
if (cell.x < 0) cell.x += 360;
if (cell.z > Tk) cell.z = Tk;
dcell = dcells2[i];
cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
cell.x = cell.x % 360;
if (cell.x < 0) cell.x += 360;
if (cell.z > Tk) cell.z = Tk;
dcell = dcells3[i];
cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
cell.x = cell.x % 360;
if (cell.x < 0) cell.x += 360;
if (cell.z > Tk) cell.z = Tk;
dcell = dcells4[i];
cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
cell.x = cell.x % 360;
if (cell.x < 0) cell.x += 360;
if (cell.z > Tk) cell.z = Tk;
dcell = dcells5[i];
cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
cell.x = cell.x % 360;
if (cell.x < 0) cell.x += 360;
if (cell.z > Tk) cell.z = Tk;
dcell = dcells6[i];
cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
cell.x = cell.x % 360;
if (cell.x < 0) cell.x += 360;
if (cell.z > Tk) cell.z = Tk;
dcell = dcells7[i];
cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
cell.x = cell.x % 360;
if (cell.x < 0) cell.x += 360;
if (cell.z > Tk) cell.z = Tk;
cells[i] = cell;
img[i] = hsv2rgb(cell);
""", "ca_calc", preamble="""
#include <math.h>
#define Mk %s
#define Tk %s
__device__ int3 operator+(const int3 &a, const int3 &b) {
return make_int3(a.x+b.x, a.y+b.y, a.z+b.z);
}
__device__ int3 calc_d(int Fc, int Mc, int Tc, int Fi, int Mi, int Ti) {
int dF = Fi - Fc;
if (dF > 180) Fc += 360;
if (dF < -180) Fc -= 360;
float sM = Mi + Mc;
if (sM != 0) sM = Mi / sM; else sM = 0;
dF = (int) (Fi - Fc) * sM;
int dM = Mi;
int dT = fabs((float) (Fi - Fc)) * fmin((float) Mc, (float) Mi) / Mk + (Ti - Tc) * sM;
return make_int3(dF, dM, dT);
}
__device__ uint hsv2rgb(int3 pixel) {
// skipped for brevity
}
""" % (Mk, Tk, RAM))
第二个也是当前的解决方案:
^{pr2}$这两个变体产生完全相同的CA行为,但后者的运行速度慢得多。在
GTX泰坦:
630万吨:
如果您需要:
欢迎提供任何线索或建议:
好吧,我设法使第二个解决方案的运行速度快了15倍。进行了以下更改:
int
转换为int4
。这使得它比int3
的解决方案更快。不过,多余的空间未使用(.w维)。[3x加速]优化后的代码如下所示:
感谢Park Young Bae提供了重新打包的线索,也感谢Alexey Shchepin提供了一些优化问题。在
相关问题 更多 >
编程相关推荐