我正在尝试使用pycuda在网络上进行数据传输来实现base64。 我需要将float转换为byte或unsigned char,在我发现memcpy在CPU上运行良好后,我就用cudamemcpy完成了转换。 我的意思是,我只需要cudamem复制一些浮点值,然后通过“unsigendchar*”在内核中获取这些值,将其视为字节数组
我还看到我的c++/cuda代码也很好用,但同样的代码在pycuda中不起作用
部分代码快照如下所示; C++/CUDA
#include <cuda_runtime.h>
#include <stdio.h>
#include <iostream>
using namespace std;
#define CHECK(call) \
{ \
const cudaError_t error = call; \
if (error != cudaSuccess) \
{ \
fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__); \
fprintf(stderr, "code: %d, reason: %s\n", error, \
cudaGetErrorString(error)); \
exit(1); \
} \
}
// grid 2D block 2D
__global__ void base64_encode(int N, unsigned char* in, unsigned char* out) //////////////// not float type, but uchar to treat it as byte array!!
{
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < N){
out[idx] = in[idx];
printf("cuda thread %d : %02x \n",idx, in[idx]);
}
}
int main(int argc, char **argv)
{
printf("%s Starting...\n", argv[0]);
int dev = 0;
cudaDeviceProp deviceProp;
CHECK(cudaGetDeviceProperties(&deviceProp, dev));
printf("Using Device %d: %s\n", dev, deviceProp.name);
CHECK(cudaSetDevice(dev));
int nx = 1 << 2;
int ny = 1 << 2;
int nxy = nx * ny;
int nBytes = nxy * sizeof(float);
printf("Matrix size: nx %d ny %d\n", nx, ny);
float *h_A, *hostRef;
unsigned char * gpuRef;
h_A = (float *)malloc(nBytes);
hostRef = (float *)malloc(nBytes);
gpuRef = (unsigned char *)malloc(nBytes);
int size= (int)(nxy/ sizeof(float));
unsigned char b[nxy];//sizeof(float)
for (int i = 0; i < size; i++)
{
h_A[i] = (float)(i & 0xFF);
cout << h_A[i] << ", " << endl;
}
memset(hostRef, 0, nBytes);
memcpy(b, &h_A, nxy);
memset(gpuRef, 0, nBytes);
unsigned char *d_input, *d_output;
CHECK(cudaMalloc((void **)&d_input, nBytes));
CHECK(cudaMalloc((void **)&d_output, nBytes));
CHECK(cudaMemcpy(d_input, h_A, nBytes, cudaMemcpyHostToDevice));
int dimx = 4*4;
dim3 block(dimx, 1);
dim3 grid((nxy + block.x - 1) / block.x );
base64_encode<<<grid, block>>>(nxy, d_input, d_output);
CHECK(cudaDeviceSynchronize());
CHECK(cudaGetLastError());
CHECK(cudaMemcpy(gpuRef, d_output, nBytes, cudaMemcpyDeviceToHost));
for (int i = 0; i < nxy; i++)
printf("%02x, ",gpuRef[i]);
CHECK(cudaFree(d_input));
CHECK(cudaFree(d_output));
free(h_A);
free(hostRef);
free(gpuRef);
CHECK(cudaDeviceReset());
return (0);
}
结果看起来不错
0,
1,
2,
3,
cuda thread 0 : 00
cuda thread 1 : 00
cuda thread 2 : 00
cuda thread 3 : 00
cuda thread 4 : 00
cuda thread 5 : 00
cuda thread 6 : 80
cuda thread 7 : 3f
cuda thread 8 : 00
cuda thread 9 : 00
cuda thread 10 : 00
cuda thread 11 : 40
cuda thread 12 : 00
cuda thread 13 : 00
cuda thread 14 : 40
cuda thread 15 : 40
00, 00, 00, 00, 00, 00, 80, 3f, 00, 00, 00, 40, 00, 00, 40, 40
PyCUDA代码
import numpy as np
import matplotlib.pyplot as plt
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray
kernel = SourceModule("""
#include <stdio.h>
using namespace std;
__global__ void base64_encode(int N, unsigned char* in, unsigned char* out){
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < N){
out[idx] = in[idx];
printf("cuda thread %d : %02x \\n",idx, in[idx]);
}
}
""")
def gpu_rgb2gray():
floatValue = np.asarray(1.0).astype(np.float32)
floatValue_gpu = cuda.mem_alloc(floatValue.nbytes)
cuda.memcpy_htod(floatValue_gpu, floatValue)
h_output = np.asarray(0.0).astype(np.float32)
d_output = cuda.mem_alloc(h_output.nbytes)
cuda.memcpy_htod(d_output, h_output)
base64_encoder = kernel.get_function("base64_encode")
blockDim = (4, 1, 1)
gridDim = (1, 1, 1)
base64_encoder(4, floatValue_gpu, d_output, block=blockDim, grid=gridDim)
h_output2 = np.array(d_output.get(), dtype=np.ubyte)
return 0#h_output
此代码显示和错误:TypeError:参数#0的类型无效(基于0) 我可以请任何人帮我吗
首先,这项进口:
与代码的其余部分不匹配。要匹配代码的其余部分,应该是:
关于你的问题。pycuda抱怨的参数是此行中的参数0(即第一个参数):
这与
float
的用法或你在问题中提出的任何主题无关。在内核定义中,您需要一个32位整数:但是像python中那样的简单常量显然是另一回事。您可以通过如下方式修改调用来解决此问题:
当我进行这两个更改并运行
gpu_rgb2gray()
函数时,我从内核获得了外观合理的printf
输出进行这些更改后,尽管您在发布的代码中没有真正使用它,但您将遇到的下一个问题是:
您的
d_output
是DeviceAllocation
{a1},而不是GPUArray
{a2},因此它没有get
属性/方法。为了解决这个问题,只需进行最少的更改,我将颠倒您用于填充该对象的方法:下面是一个完整的示例:
相关问题 更多 >
编程相关推荐