I need to use a function like cudaDeviceSynchronize to wait for a kernel to finish execution. However after version 11.6, it is no longer possible to use any form of synchronization within device functions. How can we synchronize subkernels in device function?
Here's the code that I'm trying to run:
__global__ void NNFeedForwardNormalMultiple(double* __restrict__ values, double* __restrict__ weigths, double* result, int inputsize, int outputsize) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int outputidx = idx / outputsize;
int inputidx = idx % outputsize;
if (outputidx >= outputsize || inputidx >= inputsize) {
return;
}
atomicAdd(&result[outputidx], values[inputidx] * weigths[outputsize*outputidx + inputidx]);
}
__global__ void NNFeedForwardNormalActivate(double* __restrict__ biases, double* result, int size) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx >= size) {
return;
}
result[idx] = 1.0 / (1.0 + exp(-(result[idx] + biases[idx])));
}
__global__ void NNFeedForwardNormal(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) {
int blocksize = (inputsize * outputsize + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
NNFeedForwardNormalMultiple<<<blocksize, THREADS_PER_BLOCK>>>(values, weigths, result, inputsize, outputsize);
//normally cudaDeviceSynchronize() kind of function to wait for child kernel to finish;
NNFeedForwardNormalActivate<<<(outputsize + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(biases, result, outputsize);
}
Thanks!
发布者:admin,转转请注明出处:http://www.yc00.com/questions/1744869146a4598155.html
评论列表(0条)