What is the correct way to use memcpy inside the __device_

D

Decker2018-06-28 21:11:18

CUDA

Decker, 2018-06-28 21:11:18

What is the correct way to use memcpy inside the __device__ function of the CUDA kernel?

Please tell me how to use memcpy correctly (and is it even possible) inside the __device__ function in the CUDA kernel. Simple example:

#include <stdio.h>
#include <stdint.h>

// Simulate _mm_unpacklo_epi32
__device__ void unpacklo32(unsigned char *t, unsigned char *a, unsigned char *b) 
{
    unsigned char tmp[16];
    memcpy(tmp, a, 4);
    memcpy(tmp + 4, b, 4);
    memcpy(tmp + 8, a + 4, 4);
    memcpy(tmp + 12, b + 4, 4);
    memcpy(t, tmp, 16);
}

__global__ void printme(unsigned char *t, unsigned char *a, unsigned char *b) {

  printf("threadIdx.x = %d, blockIdx.x = %d, gridDim.x = %d\n",threadIdx.x, blockIdx.x, gridDim.x);

  int i;
  printf("T: "); for (i=0; i<16; i++) printf("%02x", t[i]); printf("\n");
  printf("A: "); for (i=0; i<16; i++) printf("%02x", a[i]); printf("\n");
  printf("B: "); for (i=0; i<16; i++) printf("%02x", b[i]); printf("\n");

        unpacklo32(t, a, b);

  printf("T: "); for (i=0; i<16; i++) printf("%02x", t[i]); printf("\n");
  printf("A: "); for (i=0; i<16; i++) printf("%02x", a[i]); printf("\n");
  printf("B: "); for (i=0; i<16; i++) printf("%02x", b[i]); printf("\n");
}

int main() {

  unsigned char *t = NULL;
  unsigned char *t_cuda = NULL;
  unsigned char *a = NULL;
  unsigned char *a_cuda = NULL;
  unsigned char *b = NULL;
  unsigned char *b_cuda = NULL;

  // a = (unsigned char *) malloc (16);
  cudaMallocHost((void**)&a, 16);
  cudaMalloc(&a_cuda, 16);
  // b = (unsigned char *) malloc (16);
  cudaMallocHost((void**)&b, 16);
  cudaMalloc(&b_cuda, 16);
  cudaMallocHost((void**)&t, 16);
  cudaMalloc(&t_cuda, 16);
  
  int i;
  for (i=0; i<16; i++) t[i] = 0x00;
  for (i=0; i<16; i++) a[i] = 0xa0 | i;
  for (i=0; i<16; i++) b[i] = 0xb0 | i;

  cudaMemcpy(a_cuda, a, 16, cudaMemcpyHostToDevice);
  cudaMemcpy(b_cuda, b, 16, cudaMemcpyHostToDevice);
  cudaMemcpy(t_cuda, t, 16, cudaMemcpyHostToDevice);

  printme<<< 1 , 1 >>>(t_cuda, a_cuda, b_cuda);
  cudaDeviceSynchronize();
  return  0;
}

Execution result:

threadIdx.x = 0, blockIdx.x = 0, gridDim.x = 1
T: 00000000000000000000000000000000           
A: a0a1a2a3a4a5a6a7a8a9aaabacadaeaf           
B: b0b1b2b3b4b5b6b7b8b9babbbcbdbebf           
T: a0a1a2a3b0000000a4000000b4000000           
A: a0a1a2a3a4a5a6a7a8a9aaabacadaeaf           
B: b0b1b2b3b4b5b6b7b8b9babbbcbdbebf

Those. we see that inside the device the first memcpy worked successfully and copied 4 bytes to T, but the second one, which was supposed to copy 4 bytes B to T + 4, instead of b0b1b2b3 copied b0000000. Question - why so?
Educational example. It is clear that in principle no tmp is needed here and you can copy directly or do something like *((uint32_t *)tmp + 1) = *((uint32_t *)b);instead of memcpy(tmp + 4, b, 4);. But I would like to understand the meaning - why this happens. Those. why memcpy does not work correctly in this case.