Don't like ads? PRO users don't see any ads ;-)
Guest

Untitled

By: a guest on Aug 10th, 2012  |  syntax: None  |  size: 0.75 KB  |  hits: 9  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. CUDA: atomicAdd takes too much time, serializing threads
  // Kernel fragment: every thread whose `colliding` flag is set reserves one
  // slot in `output` and stores make_uint2(startId, toId) there.
  // Slots are reserved in two stages: a per-block counter in shared memory
  // (sColCnt), then one atomicAdd per block on the global counter *gColCnt,
  // which claims the index range [sIndex, sIndex + sColCnt) for this block.
  // The `...` parts are elided in this paste; `colliding`, `output`,
  // `outputSize`, `startId` and `toId` are not defined in the visible lines.
  // NOTE(review): the `threadIdx.x == 0` guards presumably assume a 1-D thread
  // block (one such thread per block) -- confirm against the launch config.
  2. __global__ void mykernel(..., unsigned int *gColCnt) {
  3.     ...
  4.     // Block-shared scratch: sColCnt counts colliding threads, sIndex holds the block's base index.
  5.     __shared__ unsigned int sColCnt;
  6.     __shared__ unsigned int sIndex;
  7.     // Shared variables start uninitialized, so one thread zeroes the counter.
  8.     if (threadIdx.x == 0) {
  9.         sColCnt = 0;
  10.     }
  11.     // Barrier: the zeroed counter must be visible before any thread increments it.
  12.     __syncthreads();
  13.     // Stage 1: each colliding thread takes a block-local offset (non-colliding threads keep 0, unused).
  14.     unsigned int index = 0;
  15.     if (colliding)
  16.         index = atomicAdd(&sColCnt, 1); // old count = this thread's offset. Author's note: time consuming.
         // Every colliding thread in the block updates the same shared address here.
         // NOTE(review): a per-warp aggregate (__ballot_sync + __popc, one atomicAdd
         // per warp) would cut the number of atomics -- only valid if all lanes of
         // the warp reach this point together; confirm against the elided code.
  17.     // Barrier: all increments must finish before thread 0 reads the block total.
  18.     __syncthreads();
  19.     // Stage 2: one global atomic per block; the returned old value is the block's base index.
  20.     if (threadIdx.x == 0)
  21.         sIndex = atomicAdd(gColCnt, sColCnt);
  22.     // Barrier: publish sIndex to the whole block.
  23.     __syncthreads();
  24.     // Both operands on the left are shared values, so the whole block takes the same branch
         // (assuming outputSize is the same for every thread -- TODO confirm).
         // *gColCnt has already been advanced at this point, so it can end up larger
         // than outputSize. NOTE(review): presumably the host uses that to detect
         // overflow -- confirm with the caller.
  25.     if (sColCnt + sIndex > outputSize) { // this block's range would run past the output buffer
  26.         //printf("Exceeds outputsize: %d + %d > %d\n", sColCnt, sIndex, outputSize);
  27.         return; // no barrier follows in the visible code, so returning here skips none
  28.     }
  29.     // Write phase: base index of the block plus the thread's block-local offset.
  30.     if (colliding) {
  31.         output[sIndex + index] = make_uint2(startId, toId);
  32.     }
  33. }