Answer the question
In order to leave comments, you need to log in
What is the correct way to use memcpy inside the __device__ function of the CUDA kernel?
Please tell me how to use memcpy correctly (and is it even possible) inside the __device__ function in the CUDA kernel. Simple example:
#include <stdio.h>
#include <stdint.h>
// Simulate _mm_unpacklo_epi32
__device__ void unpacklo32(unsigned char *t, unsigned char *a, unsigned char *b)
{
unsigned char tmp[16];
memcpy(tmp, a, 4);
memcpy(tmp + 4, b, 4);
memcpy(tmp + 8, a + 4, 4);
memcpy(tmp + 12, b + 4, 4);
memcpy(t, tmp, 16);
}
__global__ void printme(unsigned char *t, unsigned char *a, unsigned char *b) {
printf("threadIdx.x = %d, blockIdx.x = %d, gridDim.x = %d\n",threadIdx.x, blockIdx.x, gridDim.x);
int i;
printf("T: "); for (i=0; i<16; i++) printf("%02x", t[i]); printf("\n");
printf("A: "); for (i=0; i<16; i++) printf("%02x", a[i]); printf("\n");
printf("B: "); for (i=0; i<16; i++) printf("%02x", b[i]); printf("\n");
unpacklo32(t, a, b);
printf("T: "); for (i=0; i<16; i++) printf("%02x", t[i]); printf("\n");
printf("A: "); for (i=0; i<16; i++) printf("%02x", a[i]); printf("\n");
printf("B: "); for (i=0; i<16; i++) printf("%02x", b[i]); printf("\n");
}
int main() {
unsigned char *t = NULL;
unsigned char *t_cuda = NULL;
unsigned char *a = NULL;
unsigned char *a_cuda = NULL;
unsigned char *b = NULL;
unsigned char *b_cuda = NULL;
// a = (unsigned char *) malloc (16);
cudaMallocHost((void**)&a, 16);
cudaMalloc(&a_cuda, 16);
// b = (unsigned char *) malloc (16);
cudaMallocHost((void**)&b, 16);
cudaMalloc(&b_cuda, 16);
cudaMallocHost((void**)&t, 16);
cudaMalloc(&t_cuda, 16);
int i;
for (i=0; i<16; i++) t[i] = 0x00;
for (i=0; i<16; i++) a[i] = 0xa0 | i;
for (i=0; i<16; i++) b[i] = 0xb0 | i;
cudaMemcpy(a_cuda, a, 16, cudaMemcpyHostToDevice);
cudaMemcpy(b_cuda, b, 16, cudaMemcpyHostToDevice);
cudaMemcpy(t_cuda, t, 16, cudaMemcpyHostToDevice);
printme<<< 1 , 1 >>>(t_cuda, a_cuda, b_cuda);
cudaDeviceSynchronize();
return 0;
}
threadIdx.x = 0, blockIdx.x = 0, gridDim.x = 1
T: 00000000000000000000000000000000
A: a0a1a2a3a4a5a6a7a8a9aaabacadaeaf
B: b0b1b2b3b4b5b6b7b8b9babbbcbdbebf
T: a0a1a2a3b0000000a4000000b4000000
A: a0a1a2a3a4a5a6a7a8a9aaabacadaeaf
B: b0b1b2b3b4b5b6b7b8b9babbbcbdbebf
*((uint32_t *)tmp + 1) = *((uint32_t *)b);
instead of memcpy(tmp + 4, b, 4);
. But I would like to understand the meaning - why this happens. Those. why memcpy does not work correctly in this case.
Answer the question
In order to leave comments, you need to log in
Didn't find what you were looking for?
Ask your questionAsk a Question
731 491 924 answers to any question