The neural network on CUDA is slow, how can I speed it up?
I read about a simple neural network for recognizing digits in Tariq Rashid's book "Make Your Own Neural Network". I implemented it in Python and then decided to rewrite it in CUDA. The problem is that it trains many times slower than the Python version or the plain C++ one. I have already tried everything and am really hoping for your help.
P.S. I'm new to programming, so please don't be too harsh about the code style and the possibly silly mistakes.
#include <cstdlib>        // rand, RAND_MAX
#include <cmath>          // exp
#include <cuda_runtime.h>
#include <cublas_v2.h>

class neuralNet {
    int input_nodes;
    int hidden_nodes;
    int output_nodes;
    float learning_grade;
    float* wih;
    float* who;
public:
    neuralNet(int in, int hid, int out, float lr) {
        input_nodes = in;
        hidden_nodes = hid;
        output_nodes = out;
        learning_grade = lr;
        wih = new float[hidden_nodes * input_nodes];
        who = new float[hidden_nodes * output_nodes];
        for (int i = 0; i < input_nodes * hidden_nodes; i++)
            wih[i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX) - 0.5f;
        for (int i = 0; i < hidden_nodes * output_nodes; i++)
            who[i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX) - 0.5f;
    }
    ~neuralNet() {
        delete[] wih;
        delete[] who;
    }
    float* activation_function(float* a, int n) {
        float* tmp = new float[n];
        for (int i = 0; i < n; i++)
            tmp[i] = 1.0f / (1.0f + exp(-a[i]));
        return tmp;
    }
    void train(float* inputs, float* targets) {
        //================================== COMPUTE QUERY (forward pass) =========================================
        cublasStatus_t stat;   // cuBLAS functions status
        cublasHandle_t handle; // cuBLAS context
        float *hid_in, *hid_out, *in_out, *out_out;
        float *wih_d, *inputs_d, *hid_in_d, *in_out_d, *who_d, *hid_out_d;
        hid_in = new float[hidden_nodes];
        in_out = new float[output_nodes];
        // Allocate GPU memory, first for computing hid_in
        cudaMalloc((float**)&hid_in_d, hidden_nodes * sizeof(float));
        cudaMalloc((float**)&inputs_d, input_nodes * sizeof(float));
        cudaMalloc((float**)&wih_d, input_nodes * hidden_nodes * sizeof(float));
        // Allocate GPU memory for computing in_out
        cudaMalloc((float**)&in_out_d, output_nodes * sizeof(float));
        cudaMalloc((float**)&who_d, output_nodes * hidden_nodes * sizeof(float));
        cudaMalloc((float**)&hid_out_d, hidden_nodes * sizeof(float));
        // Upload the matrices to the GPU, specifying rows and columns so the multiplication is laid out correctly
        stat = cublasSetMatrix(hidden_nodes, input_nodes, sizeof(*wih), wih, hidden_nodes, wih_d, hidden_nodes);
        stat = cublasSetMatrix(input_nodes, 1, sizeof(*inputs), inputs, input_nodes, inputs_d, input_nodes);
        stat = cublasSetMatrix(hidden_nodes, 1, sizeof(*hid_in), hid_in, hidden_nodes, hid_in_d, hidden_nodes);
        // Same for the second layer
        stat = cublasSetMatrix(output_nodes, hidden_nodes, sizeof(*who), who, output_nodes, who_d, output_nodes);
        stat = cublasSetMatrix(output_nodes, 1, sizeof(*in_out), in_out, output_nodes, in_out_d, output_nodes);
        //
        stat = cublasCreate(&handle);
        // GEMM constants c1 and c2: c1*[a]*[b] + c2*[c]
        float al = 1.0f;
        float bet = 0.0f;
        // Multiply WIH by inputs
        stat = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, hidden_nodes, 1, input_nodes, &al, wih_d,
                           hidden_nodes, inputs_d, input_nodes, &bet, hid_in_d, hidden_nodes);
        // Copy the result back into hid_in
        stat = cublasGetMatrix(hidden_nodes, 1, sizeof(*hid_in), hid_in_d, hidden_nodes, hid_in, hidden_nodes);
        // Apply the activation function to get hid_out
        hid_out = activation_function(hid_in, hidden_nodes);
        // Upload hid_out to the GPU, treating it as a matrix by specifying rows and columns...
        stat = cublasSetMatrix(hidden_nodes, 1, sizeof(*hid_out), hid_out, hidden_nodes, hid_out_d, hidden_nodes);
        // Multiply WHO by hid_out
        stat = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, output_nodes, 1, hidden_nodes, &al, who_d,
                           output_nodes, hid_out_d, hidden_nodes, &bet, in_out_d, output_nodes);
        // Copy the result into in_out
        stat = cublasGetMatrix(output_nodes, 1, sizeof(*in_out), in_out_d, output_nodes, in_out, output_nodes);
        out_out = activation_function(in_out, output_nodes);
        //================================== COMPUTE ERROR1 =========================================
        float* error1;
        // Output-layer errors (no need for the GPU here, there are only 10 outputs)
        error1 = new float[output_nodes];
        for (int i = 0; i < output_nodes; i++)
            error1[i] = targets[i] - out_out[i];
        //================================== COMPUTE ERROR2 =========================================
        float* error2;
        error2 = new float[hidden_nodes];
        float* who_t_h; // transposed, on the CPU
        float* who_t_d; // transposed, on the GPU
        who_t_h = new float[hidden_nodes * output_nodes];
        cudaMalloc((void**)&who_t_d, output_nodes * hidden_nodes * sizeof(float));
        // Transpose the WHO matrix
        dim3 Grid(output_nodes / BLOCK_DIM, hidden_nodes / BLOCK_DIM);
        dim3 Block(BLOCK_DIM, BLOCK_DIM);
        transposeMatrixFast<<<Grid, Block>>>(who_d, who_t_d, output_nodes, hidden_nodes);
        // Copy the transposed matrix into who_t_h
        cudaMemcpy(who_t_h, who_t_d, output_nodes * hidden_nodes * sizeof(float), cudaMemcpyDeviceToHost);
        //__________________________________ WHO(TRANSPOSED) * ERROR1 _________________________________________
        float *error1_d, *error2_d;
        // First allocate memory for the GPU variables
        cudaMalloc((void**)&error1_d, output_nodes * sizeof(float));
        cudaMalloc((void**)&error2_d, hidden_nodes * sizeof(float));
        // Upload the matrices, specifying their row and column counts
        stat = cublasSetMatrix(output_nodes, 1, sizeof(*error1), error1, output_nodes, error1_d, output_nodes);
        stat = cublasSetMatrix(hidden_nodes, output_nodes, sizeof(*who_t_h), who_t_h, hidden_nodes, who_t_d, hidden_nodes);
        stat = cublasSetMatrix(hidden_nodes, 1, sizeof(*error2), error2, hidden_nodes, error2_d, hidden_nodes);
        // Multiply who_t and error1
        stat = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, hidden_nodes, 1, output_nodes, &al, who_t_d,
                           hidden_nodes, error1_d, output_nodes, &bet, error2_d, hidden_nodes);
        // Copy the result into error2
        stat = cublasGetMatrix(hidden_nodes, 1, sizeof(*error2), error2_d, hidden_nodes, error2, hidden_nodes);
        //================================== COMPUTE NEW WHO =========================================
        float* left_part = new float[output_nodes];
        for (int i = 0; i < output_nodes; i++)
            left_part[i] = learning_grade * (error1[i] * out_out[i] * (1 - out_out[i]));
        float *left_part_d, *who_dd;
        // Allocate GPU memory for computing the new WHO
        cudaMalloc((void**)&left_part_d, output_nodes * sizeof(float));
        cudaMalloc((void**)&who_dd, hidden_nodes * output_nodes * sizeof(float));
        // Upload the matrices, specifying their shapes
        stat = cublasSetMatrix(output_nodes, 1, sizeof(*error1), left_part, output_nodes, left_part_d, output_nodes);
        stat = cublasSetMatrix(output_nodes, hidden_nodes, sizeof(*who), who, output_nodes, who_dd, output_nodes);
        stat = cublasSetMatrix(1, hidden_nodes, sizeof(*hid_out), hid_out, 1, hid_out_d, 1); // already transposed
        // Multiply to get the update for WHO
        stat = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, output_nodes, hidden_nodes, 1, &al, left_part_d,
                           output_nodes, hid_out_d, 1, &bet, who_dd, output_nodes);
        float* temporary;
        cudaMalloc((void**)&temporary, hidden_nodes * output_nodes * sizeof(float));
        cudaMemcpy(temporary, who, hidden_nodes * output_nodes * sizeof(float), cudaMemcpyHostToDevice);
        sum<<<(output_nodes * hidden_nodes + 127) / 128, 128>>>(temporary, who_dd, output_nodes * hidden_nodes); // maybe this can be sped up
        // Copy the new weights back to the CPU
        cudaMemcpy(who, temporary, output_nodes * hidden_nodes * sizeof(float), cudaMemcpyDeviceToHost);
        cudaFree(temporary);
        //================================== COMPUTE NEW WIH =========================================
        float* left_part2 = new float[hidden_nodes];
        float* left_part2_d;
        cudaMalloc((void**)&left_part2_d, hidden_nodes * sizeof(float));
        special<<<(hidden_nodes + 127) / 128, 128>>>(left_part2_d, error2_d, hid_out_d, hidden_nodes, learning_grade);
        float* wih_dd;
        // Allocate GPU memory for computing the new WIH
        cudaMalloc((void**)&wih_dd, hidden_nodes * input_nodes * sizeof(float));
        // Upload the matrices, specifying their shapes
        stat = cublasSetMatrix(hidden_nodes, input_nodes, sizeof(*wih), wih, hidden_nodes, wih_dd, hidden_nodes);
        stat = cublasSetMatrix(1, input_nodes, sizeof(*inputs), inputs, 1, inputs_d, 1); // already transposed
        // Multiply to get the update for WIH
        stat = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, hidden_nodes, input_nodes, 1, &al, left_part2_d,
                           hidden_nodes, inputs_d, 1, &bet, wih_dd, hidden_nodes);
        float* temporary1;
        cudaMalloc((void**)&temporary1, hidden_nodes * input_nodes * sizeof(float));
        cudaMemcpy(temporary1, wih, hidden_nodes * input_nodes * sizeof(float), cudaMemcpyHostToDevice);
        sum<<<(input_nodes * hidden_nodes + 127) / 128, 128>>>(temporary1, wih_dd, input_nodes * hidden_nodes);
        // Copy the new weights back to the CPU
        cudaMemcpy(wih, temporary1, input_nodes * hidden_nodes * sizeof(float), cudaMemcpyDeviceToHost);
        cudaFree(temporary1);
        //================================== FREE MEMORY =========================================
        cudaFree(left_part2_d);
        cudaFree(wih_dd);
        cudaFree(who_dd);
        delete[] left_part2;
        //
        cudaFree(who_t_d);
        delete[] who_t_h;
        //
        cudaFree(error1_d);
        cudaFree(error2_d);
        // Clean up after ourselves
        cudaFree(hid_in_d);
        cudaFree(inputs_d);
        cudaFree(wih_d);
        // ...
        cudaFree(in_out_d);
        cudaFree(who_d);
        cudaFree(hid_out_d);
        //...
        cudaFree(left_part_d);
        delete[] left_part;
        //
        delete[] hid_in;
        delete[] in_out;
        delete[] hid_out;
        delete[] out_out;
        delete[] error1;
        delete[] error2;
        cublasDestroy(handle);
    }
};
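As a side note on the listing above (an observation, not part of the original code): hid_in is copied back to the host only so that activation_function() can apply the sigmoid, and the result is then uploaded again. A device-side sigmoid would keep the data on the GPU. A minimal sketch, with a hypothetical kernel name sigmoid_kernel:

// Hypothetical device-side sigmoid: applying it to hid_in_d in place would remove
// the cublasGetMatrix / cublasSetMatrix round trip around activation_function().
__global__ void sigmoid_kernel(float* a, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        a[i] = 1.0f / (1.0f + expf(-a[i]));
}

// Usage sketch: sigmoid_kernel<<<(hidden_nodes + 127) / 128, 128>>>(hid_in_d, hidden_nodes);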
So, I decided to check how long it takes to allocate memory on the device and copy elements there... I wrote a separate program to compare the speed of matrix multiplication on the CPU and on the GPU. The multiplication itself takes 0.07 ms on the GPU and 0.001 s on the CPU, but allocation, copying back and forth, and the multiplication together take 300 ms on the GPU. I was shocked! So I started timing each line separately and found the culprit...
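For reference, the usual way to time individual GPU calls like this is with CUDA events. Below is a minimal generic sketch; the function name time_gpu_ms and the placement of the measured call are placeholders, not something from the code above.

#include <cuda_runtime.h>

// Generic GPU timing helper: records an event before and after the measured call
// and returns the elapsed time in milliseconds.
float time_gpu_ms() {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    // ... place the call being measured here, e.g. cublasCreate(&handle) or a cublasSgemm(...) ...
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);   // wait until the GPU has actually finished the work
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms;
}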
The line stat = cublasCreate(&handle); alone takes 300 ms. I haven't figured out how to work around it yet, but at least it's good that I found out what was eating up so much time...
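A common way to avoid paying that cost on every call is to create the cuBLAS handle once, for example in the constructor, reuse it in train(), and destroy it in the destructor; the per-call cudaMalloc/cudaFree pairs can be hoisted the same way. A minimal sketch under that assumption (the handle and the device weight buffers become class members; everything else is elided, so this is not a drop-in rewrite of the class above):

#include <cuda_runtime.h>
#include <cublas_v2.h>

class neuralNet {
    // ... existing members (input_nodes, hidden_nodes, output_nodes, wih, who, ...) ...
    cublasHandle_t handle;  // created once, reused by every train() call
    float* wih_d;           // persistent device copy of wih
    float* who_d;           // persistent device copy of who

public:
    neuralNet(int in, int hid, int out, float lr) {
        // ... existing host-side initialization ...
        cublasCreate(&handle);                                 // pay the ~300 ms start-up cost once
        cudaMalloc((void**)&wih_d, (size_t)hid * in * sizeof(float));
        cudaMalloc((void**)&who_d, (size_t)hid * out * sizeof(float));
    }

    ~neuralNet() {
        cudaFree(wih_d);
        cudaFree(who_d);
        cublasDestroy(handle);
        // ... existing cleanup (delete[] wih; delete[] who;) ...
    }

    void train(float* inputs, float* targets) {
        // use the member handle and the pre-allocated buffers here instead of
        // calling cublasCreate() and cudaMalloc() on every invocation
    }
};

With this layout the one-time cublasCreate cost no longer shows up in the per-sample timing, and train() only has to upload the current input and download the weights when they are actually needed.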