GPGPU

Clay, 2019-04-26 17:05:26

The neural network on Cuda is slow, how to speed it up?

I read about a simple neural network for recognizing digits in T. Rashid's book "Make Your Own Neural Network". I implemented it in Python, then decided to rewrite it in CUDA. The problem is that it trains many times slower than the Python version, or even a plain C++ one. I have already tried everything I could think of, and I would really appreciate your help.
PS I'm new to programming, so please don't be too harsh about the code style and any silly mistakes.

class neuralNet {
  // NOTE(review): the node counts are stored as float but are used throughout
  // as array sizes, loop bounds and cuBLAS integer dimensions, forcing an
  // implicit float->int conversion at every use.  They should be int.
  float input_nodes;    // number of input-layer nodes
  float hidden_nodes;   // number of hidden-layer nodes
  float output_nodes;   // number of output-layer nodes
  float learning_grade; // learning rate used by train()
  // Weight matrices, stored column-major (cuBLAS convention, see train()):
  float* wih;           // input->hidden weights,  hidden_nodes x input_nodes
  float* who;           // hidden->output weights, output_nodes x hidden_nodes
public:
  // Constructs the network: stores the layer sizes and learning rate, then
  // fills both weight matrices with uniform random values in [-0.5, 0.5).
  //   in  - number of input nodes
  //   hid - number of hidden nodes
  //   out - number of output nodes
  //   lr  - learning rate used by train()
  neuralNet(int in, int hid, int out, float lr) {
    input_nodes = in;
    hidden_nodes = hid;
    output_nodes = out;
    learning_grade = lr;

    // Compute the element counts once as ints: using the float-typed members
    // directly as new[] bounds / loop limits relies on non-portable implicit
    // float->integer conversions.
    const int wih_size = hid * in;
    const int who_size = out * hid;

    wih = new float[wih_size];
    who = new float[who_size];

    // 0.5f (not 0.5): keeps the arithmetic in float, no double round trip.
    for (int i = 0; i < wih_size; i++)
      wih[i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX) - 0.5f;

    for (int i = 0; i < who_size; i++)
      who[i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX) - 0.5f;
  }
  // Releases the two weight matrices allocated in the constructor.
  ~neuralNet() {
    delete[] who;
    delete[] wih;
  }
  // Element-wise logistic sigmoid: result[i] = 1 / (1 + e^(-a[i])).
  //   a - input array of n floats (not modified)
  //   n - element count
  // Returns a newly allocated array of n floats; the caller owns it and must
  // delete[] it (train() does).
  float* activation_function(float* a, int n) {
    float* tmp = new float[n];
    for (int i = 0; i < n; i++)
      // expf + 1.0f: the original 1 / (1 + exp(-a[i])) promoted every element
      // to double and back, which is needlessly slow on GPUs and in hot loops.
      tmp[i] = 1.0f / (1.0f + expf(-a[i]));
    return tmp;
  }
  // One online-SGD training step for a single (inputs, targets) pair:
  // forward pass through the hidden and output layers, backprop of the output
  // error, and in-place update of the weight matrices `who` and `wih`.
  //   inputs  - input vector, length input_nodes   (host memory)
  //   targets - target vector, length output_nodes (host memory)
  //
  // NOTE(review): everything below (cublasCreate, every cudaMalloc, every
  // host<->device copy) is redone on EVERY call.  cublasCreate alone is very
  // expensive, and per-call cudaMalloc/cudaMemcpy easily dominate the tiny
  // gemms here -- this is the most likely reason the GPU version trains slower
  // than the CPU one.  Create the handle and the device buffers once (e.g. in
  // the constructor) and keep the weights resident on the GPU between calls.
  // NOTE(review): no return status of any CUDA/cuBLAS call is ever checked.
  void train(float*inputs, float*targets) {

    //==================================  FORWARD PASS (QUERY)  =========================================

    cublasStatus_t stat; // CUBLAS functions status
    cublasHandle_t handle; // CUBLAS context

    float *hid_in, *hid_out, *in_out, *out_out;
    float * wih_d, *inputs_d, *hid_in_d, *in_out_d, *who_d, *hid_out_d;

    hid_in = new float[hidden_nodes];
    in_out = new float[output_nodes];

    // Allocate GPU memory, first for computing hid_in
    cudaMalloc((float**)& hid_in_d, hidden_nodes * sizeof(float));
    cudaMalloc((float**)& inputs_d, input_nodes * sizeof(float));
    cudaMalloc((float**)& wih_d, input_nodes *hidden_nodes * sizeof(float));
    // Allocate GPU memory for computing in_out
    cudaMalloc((float**)& in_out_d, output_nodes * sizeof(float));
    cudaMalloc((float**)& who_d, output_nodes *hidden_nodes * sizeof(float));
    cudaMalloc((float**)& hid_out_d, hidden_nodes * sizeof(float));

    // Upload the host matrices/vectors to the device (cuBLAS is column-major;
    // the leading dimension is the number of rows)
    stat = cublasSetMatrix(hidden_nodes, input_nodes, sizeof(*wih), wih, hidden_nodes, wih_d, hidden_nodes);
    stat = cublasSetMatrix(input_nodes, 1, sizeof(*inputs), inputs, input_nodes, inputs_d, input_nodes);
    // NOTE(review): hid_in is uninitialized here; this upload is unnecessary
    // because the gemm below fully overwrites hid_in_d (beta = 0).
    stat = cublasSetMatrix(hidden_nodes, 1, sizeof(*hid_in), hid_in, hidden_nodes, hid_in_d, hidden_nodes);
    // Same for the output layer
    stat = cublasSetMatrix(output_nodes, hidden_nodes, sizeof(*who), who, output_nodes, who_d, output_nodes);
    // NOTE(review): in_out is also uninitialized; upload unnecessary (beta = 0).
    stat = cublasSetMatrix(output_nodes, 1, sizeof(*in_out), in_out, output_nodes, in_out_d, output_nodes);
    //

    // NOTE(review): per the author's own follow-up, this single call costs
    // ~300 ms -- it must be hoisted out of train() and done once.
    stat = cublasCreate(&handle);
    // gemm scalars: result = al*[a]*[b] + bet*[c]
    float al = 1.0f;
    float bet = 0.0f;
    // hid_in_d = WIH * inputs
    stat = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, hidden_nodes, 1, input_nodes, &al, wih_d,
      hidden_nodes, inputs_d, input_nodes, &bet, hid_in_d, hidden_nodes);
    // Download the result into hid_in
    stat = cublasGetMatrix(hidden_nodes, 1, sizeof(*hid_in), hid_in_d, hidden_nodes, hid_in, hidden_nodes);

    // Apply the activation function to get hid_out
    // NOTE(review): this round-trips through the host; a small device-side
    // sigmoid kernel would avoid two transfers per layer.
    hid_out = activation_function(hid_in, hidden_nodes);

    // Upload hid_out back to the GPU as a column vector
    stat = cublasSetMatrix(hidden_nodes, 1, sizeof(*hid_out), hid_out, hidden_nodes, hid_out_d, hidden_nodes);

    // in_out_d = WHO * hid_out
    stat = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, output_nodes, 1, hidden_nodes, &al, who_d,
      output_nodes, hid_out_d, hidden_nodes, &bet, in_out_d, output_nodes);

    // Download the result into in_out
    stat = cublasGetMatrix(output_nodes, 1, sizeof(*in_out), in_out_d, output_nodes, in_out, output_nodes);

    out_out = activation_function(in_out, output_nodes);

    //==================================  COMPUTE ERROR1 (output-layer error)  =========================================

    float * error1;

    // Output-layer error; done on the CPU (only output_nodes elements, GPU not worth it)
    error1 = new float[output_nodes];
    for (int i = 0; i < output_nodes; i++)
      error1[i] = targets[i] - out_out[i];

    //==================================  COMPUTE ERROR2 (hidden-layer error)  =========================================

    float* error2;
    error2 = new float[hidden_nodes];

    float*who_t_h;  // transposed on CPU
    float *who_t_d; // transposed on GPU

    who_t_h = new float[hidden_nodes*output_nodes];

    cudaMalloc((void**)&who_t_d, output_nodes*hidden_nodes * sizeof(float));

    // Transpose WHO on the device
    // NOTE(review): integer division truncates -- if the dimensions are not
    // exact multiples of BLOCK_DIM, the tail rows/columns are never transposed.
    // NOTE(review): the explicit transpose can be avoided entirely by passing
    // CUBLAS_OP_T to the gemm below.
    dim3 Grid(output_nodes / BLOCK_DIM, hidden_nodes / BLOCK_DIM);
    dim3 Block(BLOCK_DIM, BLOCK_DIM);
    transposeMatrixFast << <Grid, Block >> > (who_d, who_t_d, output_nodes, hidden_nodes);

    // Download the transposed matrix into who_t_h
    // NOTE(review): this download is only used to re-upload the same data via
    // cublasSetMatrix(who_t_h, ...) below -- both transfers are redundant,
    // who_t_d already holds the data on the device.
    cudaMemcpy(who_t_h, who_t_d, output_nodes*hidden_nodes * sizeof(float), cudaMemcpyDeviceToHost);

    //__________________________________  WHO(TRANSP) * ERROR1  _________________________________________

    float *error1_d, *error2_d;

    // First allocate device memory for the operands
    cudaMalloc((void**)&error1_d, output_nodes * sizeof(float));
    cudaMalloc((void**)&error2_d, hidden_nodes * sizeof(float));

    // Upload the operands (rows x columns, column-major)
    stat = cublasSetMatrix(output_nodes, 1, sizeof(*error1), error1, output_nodes, error1_d, output_nodes);
    stat = cublasSetMatrix(hidden_nodes, output_nodes, sizeof(*who_t_h), who_t_h, hidden_nodes, who_t_d, hidden_nodes);
    // NOTE(review): error2 is uninitialized; upload unnecessary (beta = 0).
    stat = cublasSetMatrix(hidden_nodes, 1, sizeof(*error2), error2, hidden_nodes, error2_d, hidden_nodes);

    // error2_d = WHO^T * error1
    stat = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, hidden_nodes, 1, output_nodes, &al, who_t_d,
      hidden_nodes, error1_d, output_nodes, &bet, error2_d, hidden_nodes);

    // Download the result into error2
    stat = cublasGetMatrix(hidden_nodes, 1, sizeof(*error2), error2_d, hidden_nodes, error2, hidden_nodes);

    //==================================  UPDATE WHO  =========================================

    // Output-layer delta: left_part[i] = lr * error1[i] * out_out[i] * (1 - out_out[i])
    float*left_part = new float[output_nodes];
    for (int i = 0; i < output_nodes; i++)
      left_part[i] = learning_grade*(error1[i] * out_out[i] * (1 - out_out[i]));

    float *left_part_d, *who_dd;
    // Allocate device memory for computing the new WHO
    cudaMalloc((void**)&left_part_d, output_nodes * sizeof(float));
    cudaMalloc((void**)&who_dd, hidden_nodes*output_nodes * sizeof(float));
    // Upload the operands
    stat = cublasSetMatrix(output_nodes, 1, sizeof(*error1), left_part, output_nodes, left_part_d, output_nodes);
    // NOTE(review): who_dd is fully overwritten by the gemm below (beta = 0),
    // so this upload of who is unnecessary.
    stat = cublasSetMatrix(output_nodes, hidden_nodes, sizeof(*who), who, output_nodes, who_dd, output_nodes);
    stat = cublasSetMatrix(1, hidden_nodes, sizeof(*hid_out), hid_out, 1, hid_out_d, 1); // already transposed

    // who_dd = left_part * hid_out^T (outer product = the weight delta)
    stat = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, output_nodes, hidden_nodes, 1, &al, left_part_d,
      output_nodes, hid_out_d, 1, &bet, who_dd, output_nodes);
    
    // who += delta: upload who, add on the device, download the result
    float* temporary;
    cudaMalloc((void**)&temporary, hidden_nodes*output_nodes * sizeof(float));
    cudaMemcpy(temporary, who, hidden_nodes*output_nodes * sizeof(float), cudaMemcpyHostToDevice);
    sum << <(output_nodes*hidden_nodes + 127) / 128, 128 >> > (temporary, who_dd, output_nodes*hidden_nodes);                // NOTE(review): the gemm above could fold this in directly with bet = 1
    // Copy the new weights back to the CPU
    cudaMemcpy(who, temporary, output_nodes*hidden_nodes * sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(temporary);

    //==================================  UPDATE WIH  =========================================

    // Hidden-layer delta, computed by a kernel:
    // left_part2 = lr * error2 * hid_out * (1 - hid_out)
    float *left_part2 = new float[hidden_nodes];
    float*left_part2_d;
    cudaMalloc((void**)&left_part2_d, hidden_nodes * sizeof(float));
    special << <(hidden_nodes + 127) / 128, 128 >> > (left_part2_d, error2_d, hid_out_d, hidden_nodes, learning_grade);

    float  *wih_dd;

    // Allocate device memory for computing the new WIH
    cudaMalloc((void**)&wih_dd, hidden_nodes*input_nodes * sizeof(float));
    // Upload the operands
    // NOTE(review): wih_dd is overwritten by the gemm (beta = 0); upload unnecessary.
    stat = cublasSetMatrix(hidden_nodes, input_nodes, sizeof(*wih), wih, hidden_nodes, wih_dd, hidden_nodes);
    stat = cublasSetMatrix(1, input_nodes, sizeof(*inputs), inputs, 1, inputs_d, 1); // already transposed

    // wih_dd = left_part2 * inputs^T (outer product = the weight delta)
    stat = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, hidden_nodes, input_nodes, 1, &al, left_part2_d,
      hidden_nodes, inputs_d, 1, &bet, wih_dd, hidden_nodes);
    
    // wih += delta: upload wih, add on the device, download the result
    float* temporary1;
    cudaMalloc((void**)&temporary1, hidden_nodes*input_nodes * sizeof(float));
    cudaMemcpy(temporary1, wih, hidden_nodes*input_nodes * sizeof(float), cudaMemcpyHostToDevice);
    sum << <(input_nodes*hidden_nodes + 127) / 128, 128 >> > (temporary1, wih_dd, input_nodes*hidden_nodes);
    // Copy the new weights back to the CPU
    cudaMemcpy(wih, temporary1, input_nodes*hidden_nodes * sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(temporary1);

    //==================================  FREE MEMORY  =========================================

    cudaFree(left_part2_d);
    cudaFree(wih_dd);
    cudaFree(who_dd);
    delete[] left_part2;
    //
    cudaFree(who_t_d);
    //
    cudaFree(error1_d);
    cudaFree(error2_d);
    // Clean up after ourselves
    cudaFree(hid_in_d);
    cudaFree(inputs_d);
    cudaFree(wih_d);
    // ... 
    cudaFree(in_out_d);
    cudaFree(who_d);
    cudaFree(hid_out_d);
    //... 
    cudaFree(left_part_d);
    delete[] left_part;
    //
    delete[] hid_in;
    delete[] in_out;
    delete[] hid_out;
    delete[] out_out;

    delete[] error1;
    delete[] error2;

    // NOTE(review): who_t_h is never freed -- a host memory leak of
    // hidden_nodes*output_nodes floats on every train() call.
    cublasDestroy(handle);
  }

Answer the question

In order to leave comments, you need to log in

4 answer(s)
D
Dimonchik, 2019-04-26
@dimonchik2013

profiling would be nice

C
Clay, 2019-04-26
@Pushunter

So, I decided to measure how long it takes to allocate memory on the device and copy elements there. I wrote a separate program to compare the speed of matrix multiplication on the CPU and on the GPU. The multiplication itself takes 0.07 ms on the GPU versus 0.001 s on the CPU, but allocation, copying back and forth, and the multiplication together take 300 ms on the GPU. What a shock! I started timing each line separately and found out...
stat = cublasCreate(&handle); — this line alone takes 300 ms. I have not yet figured out how to avoid it, but at least I found what was eating up so much of the speed...

O
Otrivin, 2017-03-24
@Otrivin

Would suggesting image compression count as stating the obvious?
I reduce the color depth of PNG graphics and process JPEGs through an advanced JPEG compressor.

A
Alex Bond, 2017-03-24
@AlexBond

Services often lie. In fact, I checked through the office Internet - page loading takes only 3-4 seconds. Your page weight is small (about 2MB), so this is not a problem.
Of course, you can transfer to CDN if you are not using it yet. Maybe you will win 100-200ms.

Didn't find what you were looking for?

Ask your question

Ask a Question

731 491 924 answers to any question