Can PyCUDA reinterpret a float as an unsigned char array, the way C++/CUDA can?

Posted 2024-09-26 22:52:18


I am trying to implement base64 with PyCUDA for transferring data over a network. I need to convert floats to bytes (unsigned char). Since memcpy works fine for this on the CPU, I did the conversion with cudaMemcpy: I simply cudaMemcpy some float values to the device and then read them inside the kernel through an unsigned char *, treating them as a byte array.

I also verified that my C++/CUDA code works fine this way, but the same approach does not work in PyCUDA.

A partial snapshot of the code is shown below. C++/CUDA:

#include <cuda_runtime.h>
#include <stdio.h>
#include <iostream>
 
using namespace std;

#define CHECK(call)                                                            \
{                                                                              \
    const cudaError_t error = call;                                            \
    if (error != cudaSuccess)                                                  \
    {                                                                          \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
        fprintf(stderr, "code: %d, reason: %s\n", error,                       \
                cudaGetErrorString(error));                                    \
        exit(1);                                                               \
    }                                                                          \
}  
// grid 2D block 2D
__global__ void base64_encode(int N, unsigned char* in, unsigned char* out) // not float, but unsigned char: treat the input as a raw byte array
{
    unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; 
    if (idx < N){
        out[idx] = in[idx];
        printf("cuda thread %d : %02x \n",idx, in[idx]);
    }
}

int main(int argc, char **argv)
{
    printf("%s Starting...\n", argv[0]);
 
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Using Device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));
 
    int nx = 1 << 2;
    int ny = 1 << 2;

    int nxy = nx * ny;
    int nBytes = nxy * sizeof(float);
    printf("Matrix size: nx %d ny %d\n", nx, ny);
 
    float *h_A, *hostRef;
    unsigned char * gpuRef;
    h_A = (float *)malloc(nBytes); 
    hostRef = (float *)malloc(nBytes);
    gpuRef = (unsigned char *)malloc(nBytes);
    int size= (int)(nxy/ sizeof(float));

    unsigned char b[nxy];//sizeof(float)
    for (int i = 0; i < size; i++)
    {
        h_A[i] = (float)(i & 0xFF);
        cout << h_A[i]  << ", " << endl;
    }
    memset(hostRef, 0, nBytes);   
    memcpy(b, h_A, nxy);   // copy the data itself, not the address of the pointer
    memset(gpuRef, 0, nBytes);
    
    unsigned char *d_input, *d_output;
    CHECK(cudaMalloc((void **)&d_input, nBytes)); 
    CHECK(cudaMalloc((void **)&d_output, nBytes));
 
    CHECK(cudaMemcpy(d_input, h_A, nBytes, cudaMemcpyHostToDevice)); 
 
    int dimx = 4*4; 
    dim3 block(dimx, 1);
    dim3 grid((nxy + block.x - 1) / block.x );
 
    base64_encode<<<grid, block>>>(nxy, d_input, d_output);
    CHECK(cudaDeviceSynchronize());  
    CHECK(cudaGetLastError()); 
    CHECK(cudaMemcpy(gpuRef, d_output, nBytes, cudaMemcpyDeviceToHost));
  
    for (int i = 0; i < nxy; i++) 
        printf("%02x, ",gpuRef[i]); 
 
    CHECK(cudaFree(d_input)); 
    CHECK(cudaFree(d_output));
 
    free(h_A); 
    free(hostRef);
    free(gpuRef);
 
    CHECK(cudaDeviceReset());

    return (0);
}

The result looks correct:

0, 
1, 
2, 
3, 
cuda thread 0 : 00 
cuda thread 1 : 00 
cuda thread 2 : 00 
cuda thread 3 : 00 
cuda thread 4 : 00 
cuda thread 5 : 00 
cuda thread 6 : 80 
cuda thread 7 : 3f 
cuda thread 8 : 00 
cuda thread 9 : 00 
cuda thread 10 : 00 
cuda thread 11 : 40 
cuda thread 12 : 00 
cuda thread 13 : 00 
cuda thread 14 : 40 
cuda thread 15 : 40 
00, 00, 00, 00, 00, 00, 80, 3f, 00, 00, 00, 40, 00, 00, 40, 40

PyCUDA code:

import numpy as np
import matplotlib.pyplot as plt
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray

kernel = SourceModule("""
#include <stdio.h>  
using namespace std;

__global__ void base64_encode(int N, unsigned char* in, unsigned char* out){
    int idx = threadIdx.x + blockIdx.x * blockDim.x; 
    if (idx < N){
        out[idx] = in[idx];
        printf("cuda thread %d : %02x \\n",idx, in[idx]);
    }
}
""")
def gpu_rgb2gray(): 
    floatValue = np.asarray(1.0).astype(np.float32)
    floatValue_gpu = cuda.mem_alloc(floatValue.nbytes) 
    cuda.memcpy_htod(floatValue_gpu, floatValue)
    
    h_output = np.asarray(0.0).astype(np.float32)      
    d_output = cuda.mem_alloc(h_output.nbytes)
    cuda.memcpy_htod(d_output, h_output) 
    base64_encoder = kernel.get_function("base64_encode") 
    blockDim = (4, 1, 1)  
    gridDim = (1, 1, 1)  
    base64_encoder(4, floatValue_gpu, d_output, block=blockDim, grid=gridDim)
    
    h_output2 = np.array(d_output.get(), dtype=np.ubyte) 
    return 0#h_output

This code produces an error: TypeError: invalid type on parameter #0 (0-based). Could anyone help me with this?


Tags: pycuda, output, check, np, float, block, thread, cuda
1 Answer

Answer #1 · Posted 2024-09-26 22:52:18

First of all, this import:

import pycuda.driver as drv

does not match the rest of the code. To match the rest of the code, it should be:

import pycuda.driver as cuda

Regarding your question: the argument PyCUDA is complaining about is parameter #0 (i.e. the first argument) on this line:

base64_encoder(4, floatValue_gpu, d_output, block=blockDim, grid=gridDim)
               ^

This has nothing to do with the use of float or with any of the topics you raise in your question. In the kernel definition you are expecting a 32-bit integer:

__global__ void base64_encode(int N, ...
                              ^^^

but a bare Python constant like 4 is evidently something else. You can fix this by modifying the call as follows:

base64_encoder(np.int32(4), floatValue_gpu, d_output, block=blockDim, grid=gridDim)
               ^^^^^^^^^^^
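
More generally, every scalar kernel argument in PyCUDA has to be passed as a sized numpy scalar (np.int32, np.float32, and so on), because the argument buffer is built from the numpy dtypes and a plain Python int carries no size information. A minimal, self-contained sketch of that rule, using a hypothetical add_scalar kernel purely for illustration:

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void add_scalar(int N, float offset, float *data){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) data[idx] += offset;
}
""")

data = np.arange(4, dtype=np.float32)
add_scalar = mod.get_function("add_scalar")
# scalars as sized numpy types (np.int32 / np.float32); the array goes through the InOut wrapper
add_scalar(np.int32(data.size), np.float32(10.0), cuda.InOut(data),
           block=(32, 1, 1), grid=(1, 1, 1))
print(data)   # [10. 11. 12. 13.]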

When I make those two changes and run the gpu_rgb2gray() function, I get sensible-looking printf output from the kernel.

After those changes, the next problem you will hit, although you don't actually use the result in the code you posted, is this line:

h_output2 = np.array(d_output.get(), dtype=np.ubyte) 

Your d_output is a DeviceAllocation, not a GPUArray, so it has no get attribute/method. To fix that with the smallest possible change, I would reverse the method you used to fill that object:

h_output2 = np.empty(floatValue.nbytes, dtype=np.ubyte)
cuda.memcpy_dtoh(h_output2, d_output)
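
Alternatively, if you prefer the get() style, allocate the output with pycuda.gpuarray instead of a raw mem_alloc; a GPUArray can be passed directly to the kernel call. This is not the smallest change to your code, just another option, sketched here under that assumption:

# alternative: make d_output a GPUArray, so get() exists and returns a numpy array
d_output = gpuarray.zeros(floatValue.nbytes, dtype=np.ubyte)
base64_encoder(np.int32(4), floatValue_gpu, d_output, block=blockDim, grid=gridDim)
h_output2 = d_output.get()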

Here is a complete example:

$ cat t29.py
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from   pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray

kernel = SourceModule("""
#include <stdio.h>
using namespace std;

__global__ void base64_encode(int N, unsigned char* in, unsigned char* out){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N){
        out[idx] = in[idx];
        printf("cuda thread %d : %02x \\n",idx, in[idx]);
    }
}
""")
def gpu_rgb2gray():
    floatValue = np.asarray(1.0).astype(np.float32)
    floatValue_gpu = cuda.mem_alloc(floatValue.nbytes)
    cuda.memcpy_htod(floatValue_gpu, floatValue)

    h_output = np.asarray(0.0).astype(np.float32)
    d_output = cuda.mem_alloc(h_output.nbytes)
    cuda.memcpy_htod(d_output, h_output)
    base64_encoder = kernel.get_function("base64_encode")
    blockDim = (4, 1, 1)
    gridDim = (1, 1, 1)
    base64_encoder(np.int32(4), floatValue_gpu, d_output, block=blockDim, grid=gridDim)
    h_output2 = np.empty(floatValue.nbytes, dtype=np.ubyte)
    cuda.memcpy_dtoh(h_output2, d_output)
    return h_output2

print(gpu_rgb2gray())

$ cuda-memcheck  python t29.py
========= CUDA-MEMCHECK
cuda thread 0 : 00
cuda thread 1 : 00
cuda thread 2 : 80
cuda thread 3 : 3f
[  0   0 128  63]
========= ERROR SUMMARY: 0 errors
$
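
Coming back to the question in the title, treating a float as a byte array: on the PyCUDA side the reinterpretation can be done entirely on the host with numpy, by viewing a float32 array as uint8 (same buffer, no copy) and letting the driver move those bytes to the device. The sketch below is one way to do it, not the only one; the copy_bytes kernel and the use of the cuda.In/cuda.Out convenience wrappers are my own choices, not part of the original post:

import base64
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void copy_bytes(int N, unsigned char *in, unsigned char *out){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) out[idx] = in[idx];
}
""")

floats = np.array([1.0, 2.0, 3.0], dtype=np.float32)
in_bytes = floats.view(np.uint8)      # reinterpret the float buffer as raw bytes, no copy
out_bytes = np.zeros_like(in_bytes)   # host buffer for the bytes coming back from the GPU

copy_bytes = mod.get_function("copy_bytes")
copy_bytes(np.int32(in_bytes.size), cuda.In(in_bytes), cuda.Out(out_bytes),
           block=(32, 1, 1), grid=(1, 1, 1))

print(out_bytes)                              # bytes of 1.0, 2.0, 3.0: 00 00 80 3f 00 00 00 40 00 00 40 40
print(base64.b64encode(out_bytes.tobytes()))  # these bytes are what base64 actually needs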
