
Untitled
By: a guest on
Aug 10th, 2012 | syntax:
None | size: 0.75 KB | hits: 9 | expires: Never
CUDA: atomicAdd takes too much time, serializing threads
// Compacts the (startId, toId) pairs of all colliding threads into `output`.
//
// Steps:
//   1. Every colliding thread obtains a block-local slot number `index`;
//      the shared counter sColCnt ends up holding the number of colliding
//      threads in the block.
//   2. Thread 0 reserves sColCnt consecutive slots by adding that count to
//      the global counter *gColCnt; the returned old value (sIndex) is the
//      block's first slot.
//   3. If the reserved range would run past outputSize the whole block
//      returns without writing. *gColCnt has already been advanced at that
//      point, so it keeps counting past outputSize.
//   4. Otherwise each colliding thread stores its pair at sIndex + index.
//
// Step 1 used to issue one atomicAdd on sColCnt per colliding thread; all of
// those hit the same shared address and are serialized. Here each warp
// counts its colliding lanes with a single ballot and only lane 0 performs
// one atomicAdd for the whole warp. The slot order inside a block differs
// from the per-thread version, but that order was arbitrary to begin with.
//
// Parameters visible in this snippet:
//   gColCnt - global counter of output slots reserved so far.
//
// NOTE(review): `colliding`, `output`, `outputSize`, `startId` and `toId`
// are declared in the elided parts of the kernel; their types are not
// visible here.
// NOTE(review): assumes a 1D thread block (the lane id is derived from
// threadIdx.x only, and the thread-0 leader test already relies on that)
// and that every thread of the block reaches the barriers below.
// NOTE(review): the full-warp mask is intended for block sizes that are a
// multiple of warpSize - confirm against the launch configuration.
// Requires the *_sync warp intrinsics (CUDA 9 or newer).
__global__ void mykernel(..., unsigned int *gColCnt) {
    ...
    __shared__ unsigned int sColCnt;  // number of colliding threads in this block
    __shared__ unsigned int sIndex;   // first slot of `output` reserved for this block

    if (threadIdx.x == 0) {
        sColCnt = 0;
    }
    __syncthreads();  // the zeroed counter must be visible before any warp adds to it

    const unsigned int kFullMask = 0xffffffffu;
    const unsigned int lane = threadIdx.x % warpSize;

    // Bit i of `hits` is set when lane i of this warp is colliding.
    const unsigned int hits = __ballot_sync(kFullMask, colliding ? 1 : 0);

    // Number of colliding lanes below this one = this lane's rank in the warp.
    const unsigned int rank = __popc(hits & ((1u << lane) - 1u));

    // Lane 0 reserves slots for the whole warp with a single atomic, then
    // broadcasts the warp's base slot to the other lanes.
    unsigned int warpBase = 0;
    if (lane == 0 && hits != 0) {
        warpBase = atomicAdd(&sColCnt, (unsigned int)__popc(hits));
    }
    warpBase = __shfl_sync(kFullMask, warpBase, 0);

    unsigned int index = 0;
    if (colliding) {
        index = warpBase + rank;
    }
    __syncthreads();  // sColCnt is final once every warp has contributed

    if (threadIdx.x == 0) {
        // One global atomic per block reserves the block's output range.
        sIndex = atomicAdd(gColCnt, sColCnt);
    }
    __syncthreads();  // publish sIndex to the whole block

    if (sColCnt + sIndex > outputSize) {  // output buffer is not large enough
        // printf("Exceeds outputsize: %d + %d > %d\n", sColCnt, sIndex, outputSize);
        return;
    }

    if (colliding) {
        output[sIndex + index] = make_uint2(startId, toId);
    }
}