drout

LPW Indexed Search (parallel algorithm in CUDA)

Mar 17th, 2019
172
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 0.95 KB | None | 0 0
  // CUDA kernel
  //
  // This is an answer to a StackOverflow question: "Search an ordered array in a CUDA kernel"
  // -- "I have an ordered array a of n unsigned integers (the first one is always 0) stored in shared memory
  //     each thread has to find the array index i such that a[i] <= threadIdx.x and a[i + 1] > threadIdx.x."
  //
  // Approach: build a histogram of the values of a[] in shared memory, then
  // take its inclusive prefix sum. After the scan, aux[t] holds the number of
  // elements with value <= t, so (aux[t] - 1) is the index of the last element
  // of the ordered array whose value is <= t.
  //
  // Parameters:
  //   a - array of n non-decreasing, non-negative values (a[0] is expected to
  //       be 0, which guarantees the result is >= 0 for every thread).
  //   n - number of elements in a.
  //
  // Launch requirements:
  //   - 1D thread block; the search key of each thread is threadIdx.x.
  //   - blockDim.x must not exceed MAX_THREADS_PER_BLOCK (size of the
  //     shared scratch array).
  //   - Every thread of the block must execute the kernel to the end of the
  //     scan, because the scan uses block-wide barriers.

  __global__ void lpw_indexed_search( int *a, int n )
  {
      const int idx = threadIdx.x;

      __shared__ int aux[ MAX_THREADS_PER_BLOCK /*1024*/ ];

      // Clear this thread's histogram bin.
      aux[ idx ] = 0;

      // All bins must be cleared before any thread starts counting; without
      // this barrier a thread could increment a bin that its owner has not
      // zeroed yet, and the increment would be lost.
      __syncthreads();

      // Histogram of the values. The loop is block-strided so that arrays
      // longer than the block are fully counted (duplicates may push relevant
      // values past index blockDim.x).
      for( int k = idx; k < n; k += blockDim.x )
      {
          const int value = a[ k ];

          // Only values in [0, blockDim.x) can influence any thread's result.
          // The unsigned comparison also rejects negative values, so aux[] is
          // never written out of bounds.
          if( static_cast<unsigned int>( value ) < blockDim.x )
              atomicAdd( &aux[ value ], 1 ); // atomics in case there are duplicates
      }

      __syncthreads();

      // Inclusive scan over the first blockDim.x bins.
      // The loop bound is uniform across the block, so every thread reaches
      // every barrier the same number of times.
      for( int j = 1; j < blockDim.x; j <<= 1 )
      {
          int tmp = 0;

          if( idx >= j ) tmp = aux[ idx - j ];
          __syncthreads(); // all reads of this round finish before any write
          if( idx >= j ) aux[ idx ] += tmp;
          __syncthreads(); // all writes finish before the next round reads
      }

      // result in "i": index of the last element with a[i] <= threadIdx.x
      int i = aux[ idx ] - 1;

      // use "i" here...
      // ...
  }
Add Comment
Please, Sign In to add comment