Can PyCUDA reinterpret a float as an unsigned char array, the way C++/CUDA can?

Posted 2024-09-26 22:52:18


I am trying to implement base64 with PyCUDA for transferring data over a network. I need to convert floats to bytes (unsigned char). Since memcpy works fine for this on the CPU, I did the conversion with cudaMemcpy: I simply cudaMemcpy some float values to the device and then read them inside the kernel through an unsigned char *, treating them as a byte array.

I also verified that my C++/CUDA code works fine this way, but the same approach does not work in PyCUDA.

A partial snapshot of the code is shown below. C++/CUDA:

#include <cuda_runtime.h>
#include <stdio.h>
#include <iostream>
 
using namespace std;

#define CHECK(call)                                                            \
{                                                                              \
    const cudaError_t error = call;                                            \
    if (error != cudaSuccess)                                                  \
    {                                                                          \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
        fprintf(stderr, "code: %d, reason: %s\n", error,                       \
                cudaGetErrorString(error));                                    \
        exit(1);                                                               \
    }                                                                          \
}  
// grid 2D block 2D
__global__ void base64_encode(int N, unsigned char* in, unsigned char* out) // not float, but unsigned char: treat the input as a raw byte array
{
    unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; 
    if (idx < N){
        out[idx] = in[idx];
        printf("cuda thread %d : %02x \n",idx, in[idx]);
    }
}

int main(int argc, char **argv)
{
    printf("%s Starting...\n", argv[0]);
 
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Using Device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));
 
    int nx = 1 << 2;
    int ny = 1 << 2;

    int nxy = nx * ny;
    int nBytes = nxy * sizeof(float);
    printf("Matrix size: nx %d ny %d\n", nx, ny);
 
    float *h_A, *hostRef;
    unsigned char * gpuRef;
    h_A = (float *)malloc(nBytes); 
    hostRef = (float *)malloc(nBytes);
    gpuRef = (unsigned char *)malloc(nBytes);
    int size= (int)(nxy/ sizeof(float));

    unsigned char b[nxy];//sizeof(float)
    for (int i = 0; i < size; i++)
    {
        h_A[i] = (float)(i & 0xFF);
        cout << h_A[i]  << ", " << endl;
    }
    memset(hostRef, 0, nBytes);   
    memcpy(b, h_A, nxy);   // copy the data itself, not the address of the pointer
    memset(gpuRef, 0, nBytes);
    
    unsigned char *d_input, *d_output;
    CHECK(cudaMalloc((void **)&d_input, nBytes)); 
    CHECK(cudaMalloc((void **)&d_output, nBytes));
 
    CHECK(cudaMemcpy(d_input, h_A, nBytes, cudaMemcpyHostToDevice)); 
 
    int dimx = 4*4; 
    dim3 block(dimx, 1);
    dim3 grid((nxy + block.x - 1) / block.x );
 
    base64_encode<<<grid, block>>>(nxy, d_input, d_output);
    CHECK(cudaDeviceSynchronize());  
    CHECK(cudaGetLastError()); 
    CHECK(cudaMemcpy(gpuRef, d_output, nBytes, cudaMemcpyDeviceToHost));
  
    for (int i = 0; i < nxy; i++) 
        printf("%02x, ",gpuRef[i]); 
 
    CHECK(cudaFree(d_input)); 
    CHECK(cudaFree(d_output));
 
    free(h_A); 
    free(hostRef);
    free(gpuRef);
 
    CHECK(cudaDeviceReset());

    return (0);
}

The result looks correct:

0, 
1, 
2, 
3, 
cuda thread 0 : 00 
cuda thread 1 : 00 
cuda thread 2 : 00 
cuda thread 3 : 00 
cuda thread 4 : 00 
cuda thread 5 : 00 
cuda thread 6 : 80 
cuda thread 7 : 3f 
cuda thread 8 : 00 
cuda thread 9 : 00 
cuda thread 10 : 00 
cuda thread 11 : 40 
cuda thread 12 : 00 
cuda thread 13 : 00 
cuda thread 14 : 40 
cuda thread 15 : 40 
00, 00, 00, 00, 00, 00, 80, 3f, 00, 00, 00, 40, 00, 00, 40, 40

PyCUDA code:

import numpy as np
import matplotlib.pyplot as plt
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray

kernel = SourceModule("""
#include <stdio.h>  
using namespace std;

__global__ void base64_encode(int N, unsigned char* in, unsigned char* out){
    int idx = threadIdx.x + blockIdx.x * blockDim.x; 
    if (idx < N){
        out[idx] = in[idx];
        printf("cuda thread %d : %02x \\n",idx, in[idx]);
    }
}
""")
def gpu_rgb2gray(): 
    floatValue = np.asarray(1.0).astype(np.float32)
    floatValue_gpu = cuda.mem_alloc(floatValue.nbytes) 
    cuda.memcpy_htod(floatValue_gpu, floatValue)
    
    h_output = np.asarray(0.0).astype(np.float32)      
    d_output = cuda.mem_alloc(h_output.nbytes)
    cuda.memcpy_htod(d_output, h_output) 
    base64_encoder = kernel.get_function("base64_encode") 
    blockDim = (4, 1, 1)  
    gridDim = (1, 1, 1)  
    base64_encoder(4, floatValue_gpu, d_output, block=blockDim, grid=gridDim)
    
    h_output2 = np.array(d_output.get(), dtype=np.ubyte) 
    return 0#h_output

This code produces an error: TypeError: invalid type on parameter #0 (0-based). Could anyone help me with this?


Tags: pycuda, output, check, np, float, block, thread, cuda
1 Answer

Answer #1 · Posted 2024-09-26 22:52:18

First of all, this import:

import pycuda.driver as drv

does not match the rest of the code. To match the rest of the code, it should be:

import pycuda.driver as cuda

Regarding your question: the argument PyCUDA is complaining about is parameter #0 (i.e. the first argument) on this line:

base64_encoder(4, floatValue_gpu, d_output, block=blockDim, grid=gridDim)
               ^

This has nothing to do with the use of float or with any of the topics you raise in your question. In the kernel definition you are expecting a 32-bit integer:

__global__ void base64_encode(int N, ...
                              ^^^

but a bare Python constant like 4 is evidently something else. You can fix this by modifying the call as follows:

base64_encoder(np.int32(4), floatValue_gpu, d_output, block=blockDim, grid=gridDim)
               ^^^^^^^^^^^
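
More generally, every scalar kernel argument in PyCUDA has to be passed as a sized numpy scalar (np.int32, np.float32, and so on), because the argument buffer is built from the numpy dtypes and a plain Python int carries no size information. A minimal, self-contained sketch of that rule, using a hypothetical add_scalar kernel purely for illustration:

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void add_scalar(int N, float offset, float *data){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) data[idx] += offset;
}
""")

data = np.arange(4, dtype=np.float32)
add_scalar = mod.get_function("add_scalar")
# scalars as sized numpy types (np.int32 / np.float32); the array goes through the InOut wrapper
add_scalar(np.int32(data.size), np.float32(10.0), cuda.InOut(data),
           block=(32, 1, 1), grid=(1, 1, 1))
print(data)   # [10. 11. 12. 13.]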

When I make those two changes and run the gpu_rgb2gray() function, I get sensible-looking printf output from the kernel.

After those changes, the next problem you will hit, although you don't actually use the result in the code you posted, is this line:

h_output2 = np.array(d_output.get(), dtype=np.ubyte) 

Your d_output is a DeviceAllocation, not a GPUArray, so it has no get attribute/method. To fix that with the smallest possible change, I would reverse the method you used to fill that object:

h_output2 = np.empty(floatValue.nbytes, dtype=np.ubyte)
cuda.memcpy_dtoh(h_output2, d_output)
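
Alternatively, if you prefer the get() style, allocate the output with pycuda.gpuarray instead of a raw mem_alloc; a GPUArray can be passed directly to the kernel call. This is not the smallest change to your code, just another option, sketched here under that assumption:

# alternative: make d_output a GPUArray, so get() exists and returns a numpy array
d_output = gpuarray.zeros(floatValue.nbytes, dtype=np.ubyte)
base64_encoder(np.int32(4), floatValue_gpu, d_output, block=blockDim, grid=gridDim)
h_output2 = d_output.get()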

Here is a complete example:

$ cat t29.py
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from   pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray

kernel = SourceModule("""
#include <stdio.h>
using namespace std;

__global__ void base64_encode(int N, unsigned char* in, unsigned char* out){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N){
        out[idx] = in[idx];
        printf("cuda thread %d : %02x \\n",idx, in[idx]);
    }
}
""")
def gpu_rgb2gray():
    floatValue = np.asarray(1.0).astype(np.float32)
    floatValue_gpu = cuda.mem_alloc(floatValue.nbytes)
    cuda.memcpy_htod(floatValue_gpu, floatValue)

    h_output = np.asarray(0.0).astype(np.float32)
    d_output = cuda.mem_alloc(h_output.nbytes)
    cuda.memcpy_htod(d_output, h_output)
    base64_encoder = kernel.get_function("base64_encode")
    blockDim = (4, 1, 1)
    gridDim = (1, 1, 1)
    base64_encoder(np.int32(4), floatValue_gpu, d_output, block=blockDim, grid=gridDim)
    h_output2 = np.empty(floatValue.nbytes, dtype=np.ubyte)
    cuda.memcpy_dtoh(h_output2, d_output)
    return h_output2

print(gpu_rgb2gray())

$ cuda-memcheck  python t29.py
========= CUDA-MEMCHECK
cuda thread 0 : 00
cuda thread 1 : 00
cuda thread 2 : 80
cuda thread 3 : 3f
[  0   0 128  63]
========= ERROR SUMMARY: 0 errors
$
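
Coming back to the question in the title, treating a float as a byte array: on the PyCUDA side the reinterpretation can be done entirely on the host with numpy, by viewing a float32 array as uint8 (same buffer, no copy) and letting the driver move those bytes to the device. The sketch below is one way to do it, not the only one; the copy_bytes kernel and the use of the cuda.In/cuda.Out convenience wrappers are my own choices, not part of the original post:

import base64
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void copy_bytes(int N, unsigned char *in, unsigned char *out){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) out[idx] = in[idx];
}
""")

floats = np.array([1.0, 2.0, 3.0], dtype=np.float32)
in_bytes = floats.view(np.uint8)      # reinterpret the float buffer as raw bytes, no copy
out_bytes = np.zeros_like(in_bytes)   # host buffer for the bytes coming back from the GPU

copy_bytes = mod.get_function("copy_bytes")
copy_bytes(np.int32(in_bytes.size), cuda.In(in_bytes), cuda.Out(out_bytes),
           block=(32, 1, 1), grid=(1, 1, 1))

print(out_bytes)                              # bytes of 1.0, 2.0, 3.0: 00 00 80 3f 00 00 00 40 00 00 40 40
print(base64.b64encode(out_bytes.tobytes()))  # these bytes are what base64 actually needs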
