D
D
Decker2018-06-28 21:11:18
CUDA
Decker, 2018-06-28 21:11:18

What is the correct way to use memcpy inside the __device__ function of the CUDA kernel?

Please tell me how to use memcpy correctly (and is it even possible) inside the __device__ function in the CUDA kernel. Simple example:

#include <stdio.h>
#include <stdint.h>

// Simulate _mm_unpacklo_epi32
__device__ void unpacklo32(unsigned char *t, unsigned char *a, unsigned char *b) 
{
    unsigned char tmp[16];
    memcpy(tmp, a, 4);
    memcpy(tmp + 4, b, 4);
    memcpy(tmp + 8, a + 4, 4);
    memcpy(tmp + 12, b + 4, 4);
    memcpy(t, tmp, 16);
}

__global__ void printme(unsigned char *t, unsigned char *a, unsigned char *b) {

  printf("threadIdx.x = %d, blockIdx.x = %d, gridDim.x = %d\n",threadIdx.x, blockIdx.x, gridDim.x);

  int i;
  printf("T: "); for (i=0; i<16; i++) printf("%02x", t[i]); printf("\n");
  printf("A: "); for (i=0; i<16; i++) printf("%02x", a[i]); printf("\n");
  printf("B: "); for (i=0; i<16; i++) printf("%02x", b[i]); printf("\n");

        unpacklo32(t, a, b);

  printf("T: "); for (i=0; i<16; i++) printf("%02x", t[i]); printf("\n");
  printf("A: "); for (i=0; i<16; i++) printf("%02x", a[i]); printf("\n");
  printf("B: "); for (i=0; i<16; i++) printf("%02x", b[i]); printf("\n");
}

int main() {

  unsigned char *t = NULL;
  unsigned char *t_cuda = NULL;
  unsigned char *a = NULL;
  unsigned char *a_cuda = NULL;
  unsigned char *b = NULL;
  unsigned char *b_cuda = NULL;

  // a = (unsigned char *) malloc (16);
  cudaMallocHost((void**)&a, 16);
  cudaMalloc(&a_cuda, 16);
  // b = (unsigned char *) malloc (16);
  cudaMallocHost((void**)&b, 16);
  cudaMalloc(&b_cuda, 16);
  cudaMallocHost((void**)&t, 16);
  cudaMalloc(&t_cuda, 16);
  
  int i;
  for (i=0; i<16; i++) t[i] = 0x00;
  for (i=0; i<16; i++) a[i] = 0xa0 | i;
  for (i=0; i<16; i++) b[i] = 0xb0 | i;

  cudaMemcpy(a_cuda, a, 16, cudaMemcpyHostToDevice);
  cudaMemcpy(b_cuda, b, 16, cudaMemcpyHostToDevice);
  cudaMemcpy(t_cuda, t, 16, cudaMemcpyHostToDevice);

  printme<<< 1 , 1 >>>(t_cuda, a_cuda, b_cuda);
  cudaDeviceSynchronize();
  return  0;
}

Execution result:
threadIdx.x = 0, blockIdx.x = 0, gridDim.x = 1
T: 00000000000000000000000000000000           
A: a0a1a2a3a4a5a6a7a8a9aaabacadaeaf           
B: b0b1b2b3b4b5b6b7b8b9babbbcbdbebf           
T: a0a1a2a3b0000000a4000000b4000000           
A: a0a1a2a3a4a5a6a7a8a9aaabacadaeaf           
B: b0b1b2b3b4b5b6b7b8b9babbbcbdbebf

Those. we see that inside the device the first memcpy worked successfully and copied 4 bytes to T, but the second one, which was supposed to copy 4 bytes B to T + 4, instead of b0b1b2b3 copied b0000000. Question - why so?
Educational example. It is clear that in principle no tmp is needed here and you can copy directly or do something like *((uint32_t *)tmp + 1) = *((uint32_t *)b);instead of memcpy(tmp + 4, b, 4);. But I would like to understand the meaning - why this happens. Those. why memcpy does not work correctly in this case.

Answer the question

In order to leave comments, you need to log in

Didn't find what you were looking for?

Ask your question

Ask a Question

731 491 924 answers to any question