Why does this CUDA kernel produce different results than the CPU version?
I ported this piece of code to CUDA:
if (_layersCount > 1)
{
    for (int i = _layersCount - 2; i >= 0; i--)
    {
        for (int j = 0; j < _neuronsPerLayerCount[i]; j++) // cuda kernel
        {
            localGradients[indexByLayerAndNeuron(i, j)] = 0;

            for (int k = 0; k < _neuronsPerLayerCount[i+1]; k++)
            {
                localGradients[indexByLayerAndNeuron(i, j)] += _neuronsInputsWeights[indexByLayerNeuronAndInput(i+1, k, j)]
                        * localGradients[indexByLayerAndNeuron(i+1, k)];
            }

            localGradients[indexByLayerAndNeuron(i, j)] *= derivatives[indexByLayerAndNeuron(i, j)];
        }
    }
}
And here is the CUDA version, a host-side loop that launches one kernel per layer:

if (_layersCount > 1)
{
    for (int i = _layersCount - 2; i >= 0; i--)
    {
        // calculateLocalGradientsForAnotherLayers
        blocksCount = floor((double) _neuronsPerLayerCount[i] / threads.x) + 1;
        blocks = dim3(blocksCount, 1);

        calculateLocalGradientsForAnotherLayers <<<blocks, threads>>> (
            deviceLocalGradients, _neuronsInputsWeights, deviceDerivatives,
            _neuronsPerLayerCount[i], _neuronsInPreviousLayers[i],
            _neuronsInPreviousLayers[i+1], _neuronsPerLayerCount[i+1],
            _inputsInPreviousLayers[i], _inputsInCurrentLayer[i]);
    }
}
__global__ void calculateLocalGradientsForAnotherLayers(double * localGradients,
        double * neuronsInputsWeights, double * derivatives, int neuronsCount,
        int neuronsInPreviousLayers, int neuronsInPreviousLayersWithCurrent,
        int neuronsInNextLayer, int inputsInPreviousLayers, int inputsInCurrentLayer)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < neuronsCount)
    {
        // Global index of neuron 'idx' within the current layer.
        int neuron = neuronsInPreviousLayers + idx;
        localGradients[neuron] = 0;

        // this to Kernel, then reduce localGradients.
        for (int k = 0; k < neuronsInNextLayer; k++)
        {
            localGradients[neuron] += neuronsInputsWeights[inputsInPreviousLayers + k * inputsInCurrentLayer + idx]
                    * localGradients[neuronsInPreviousLayersWithCurrent + k];
        }

        localGradients[neuron] *= derivatives[neuron];
    }
}
Perhaps the problem is that the kernels in the loop are launched asynchronously: the result of one kernel depends on the result of the previous one, but the next kernel in the loop may start executing without waiting for the previous one to finish.
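If that is the cause, one way to test it is to force the host to wait for each layer's kernel before launching the next one. A minimal sketch, assuming the same host variables as in the question:

// Hypothetical debugging sketch: synchronize after every launch so each
// layer's gradients are fully written before the next layer reads them.
for (int i = _layersCount - 2; i >= 0; i--)
{
    blocksCount = floor((double) _neuronsPerLayerCount[i] / threads.x) + 1;
    blocks = dim3(blocksCount, 1);

    calculateLocalGradientsForAnotherLayers <<<blocks, threads>>> (
        deviceLocalGradients, _neuronsInputsWeights, deviceDerivatives,
        _neuronsPerLayerCount[i], _neuronsInPreviousLayers[i],
        _neuronsInPreviousLayers[i+1], _neuronsPerLayerCount[i+1],
        _inputsInPreviousLayers[i], _inputsInCurrentLayer[i]);

    cudaDeviceSynchronize(); // block the host until this layer is done
}

If the results still differ with the explicit synchronization in place, the cause lies elsewhere (for example in the indexing or in precision).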
Regarding double precision: unless you have a Tesla K20 or one of the older Tesla 20-series cards, there is little point in using it on NVIDIA consumer cards. The GeForce GT 555M is not particularly strong even in single precision, and in double precision it is an order of magnitude slower. A modern 4-core CPU will be faster at double-precision arithmetic.
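If you do switch to single precision, a low-effort way is a precision typedef. Here is a minimal sketch of the kernel from the question; the name `real` and the register accumulator `sum` are introduced here for illustration, they are not from the original code:

// Sketch only: 'real' is an illustrative typedef, not from the original code.
typedef float real; // use double only on cards with strong FP64 throughput

__global__ void calculateLocalGradientsForAnotherLayers(real * localGradients,
        real * neuronsInputsWeights, real * derivatives, int neuronsCount,
        int neuronsInPreviousLayers, int neuronsInPreviousLayersWithCurrent,
        int neuronsInNextLayer, int inputsInPreviousLayers, int inputsInCurrentLayer)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < neuronsCount)
    {
        int neuron = neuronsInPreviousLayers + idx;

        // Accumulate in a register instead of read-modify-write on global memory.
        real sum = 0;
        for (int k = 0; k < neuronsInNextLayer; k++)
        {
            sum += neuronsInputsWeights[inputsInPreviousLayers + k * inputsInCurrentLayer + idx]
                    * localGradients[neuronsInPreviousLayersWithCurrent + k];
        }

        localGradients[neuron] = sum * derivatives[neuron];
    }
}

Keep in mind that a float GPU run will not match a double CPU run bit for bit; compare the results with a small tolerance rather than exact equality.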