#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <float.h>

// Assumed compile-time constants; the original defines these elsewhere.
#define THREADSPERBLOCK 256      // must be a power of two for the reduction
#define MAXBLOCKSPERGRID 65535   // max grid x-dimension on the target device

// Pack each input value together with its index into a float2:
// .x holds the value, .y holds the index. Each thread writes its own element,
// so no shared-memory staging is needed. Note that storing the index in a
// float is only exact for indices below 2^24.
__global__ void parallel_maxIndexPrep(float const * const array_in, float2 *array_out, int const N) {

    int i = blockIdx.x*blockDim.x + threadIdx.x;

    if( i < N ) {
        array_out[i].x = array_in[i];
        array_out[i].y = (float) i;
    }
}

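// A minimal launch sketch for the prep kernel (the pointer names d_in and
// d_pairs are illustrative, not from the original): one thread per element,
// with the grid size rounded up.
/*
    int nb = (N + THREADSPERBLOCK - 1) / THREADSPERBLOCK;
    parallel_maxIndexPrep<<<nb, THREADSPERBLOCK>>>(d_in, d_pairs, N);
*/
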
// Block-level max reduction over (value, index) pairs. Each block reduces
// 2*blockDim.x consecutive input elements: every thread combines two elements
// while loading into shared memory, then the block reduces its shared array
// down to a single pair.
__global__ void parallel_maxIndex(float2 const * const array_in, float2 * const array_out, int const N) {

    extern __shared__ float2 sdata[];

    int tpb = blockDim.x;
    int bid = blockIdx.x;
    int tid = threadIdx.x;

    int i = bid*tpb*2 + tid;

    // Load into shared memory; the last block may have fewer than 2*tpb
    // elements left over and needs special handling.
    if(gridDim.x - 1 == bid) {

        // Number of input elements remaining for the last block.
        int R = N - (2*(gridDim.x-1))*tpb;

        // More than one block's worth remains: threads in the overlap combine
        // a pair on load; the rest load a single element.
        if( R > tpb ) {
            if( tid < (R-tpb) ) {
                sdata[tid] = (array_in[i+tpb].x > array_in[i].x) ? array_in[i+tpb] : array_in[i];
            }
            else {
                sdata[tid] = array_in[i];
            }
        }
        // At most one block's worth remains: pad the tail with -FLT_MAX so
        // the padding can never win the max (padding with 0 would be wrong
        // for all-negative inputs).
        else {
            if(tid < R) {
                sdata[tid] = array_in[i];
            }
            else {
                sdata[tid].x = -FLT_MAX;
                sdata[tid].y = 0;
            }
        }
    }
    else {
        // Full block: every thread combines two elements on load.
        sdata[tid] = (array_in[i+tpb].x > array_in[i].x) ? array_in[i+tpb] : array_in[i];
    }

    __syncthreads();

    // Tree reduction in shared memory. The interleaved-addressing variant
    // below keeps warp divergence low, but its strided indexing causes
    // shared-memory bank conflicts; it is kept only for reference:
/*
    for(unsigned int s=1; s<tpb; s*=2) {

        int index = 2*s*tid;

        if(index < tpb) {
            if(sdata[index+s].x > sdata[index].x) {
                sdata[index] = sdata[index+s];
            }
        }
        __syncthreads();
    }
*/
    // Sequential addressing: the active threads stay contiguous and the
    // shared-memory accesses are bank-conflict free. Requires tpb to be a
    // power of two.
    for(unsigned int s=tpb/2; s>0; s>>=1) {

        if(tid < s) {
            if(sdata[tid+s].x > sdata[tid].x) {
                sdata[tid] = sdata[tid+s];
            }
        }
        __syncthreads();
    }

    // Thread 0 writes the block's winning pair to the output.
    if(tid == 0) {
        array_out[bid] = sdata[0];
    }
}

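// A simple sequential reference (not in the original paste) for validating
// the kernels: computes the same (max value, index) pair on the host. On
// ties, the tree reduction may report a different index than this
// first-occurrence-wins scan.
static float2 host_maxIndex(float const *a, int const N) {
    float2 best;
    best.x = a[0];
    best.y = 0.0f;
    for(int i = 1; i < N; ++i) {
        if(a[i] > best.x) {
            best.x = a[i];
            best.y = (float) i;
        }
    }
    return best;
}
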
// Recursively reduce (value, index) pairs until a single pair remains.
// Frees d_array_in at every level; the returned buffer is owned by the
// original caller, which must cudaFree() it.
float2 *recursive_parallel_maxIndex(float2 * d_array_in, int const N) {

    // Each block reduces 2*THREADSPERBLOCK elements (integer ceiling divide).
    int nblocks = (N + 2*THREADSPERBLOCK - 1) / (2*THREADSPERBLOCK);

    float2 *d_array_out;
    cudaMalloc((void **) &d_array_out, sizeof(*d_array_out)*nblocks);

    int smem = sizeof(float2) * THREADSPERBLOCK;

    // The kernel has no grid-stride loop, so every block must actually be
    // launched; guard against exceeding the device's grid limit instead of
    // silently clamping the block count.
    assert(nblocks <= MAXBLOCKSPERGRID);
//  printf("Launching parallel_maxIndex() with N=%d points....\n", N);
//  printf("num points: %d\nnum blocks launched: %d\nthreads per block: %d\n", N, nblocks, THREADSPERBLOCK);

    parallel_maxIndex<<< nblocks, THREADSPERBLOCK, smem >>>(d_array_in, d_array_out, N);

    // *****DEBUG******************************************
/*
    printf("______________________________________________________\n");
    float2 *h_array_input = (float2 *) malloc(sizeof(float2)*N);
    cudaMemcpy(h_array_input, d_array_in, sizeof(float2)*N, cudaMemcpyDeviceToHost);
    print_array(h_array_input, N);
    printf("||||||||||||||||||||||||||||||||||||||||||||||||||||||\n");
    float2 *h_array_debug = (float2 *) malloc(sizeof(float2)*nblocks);
    cudaMemcpy(h_array_debug, d_array_out, sizeof(float2)*nblocks, cudaMemcpyDeviceToHost);
    print_array(h_array_debug, nblocks);
    printf("Size of nblocks %d\n", nblocks);
*/
    // *****DEBUG******************************************

    cudaFree(d_array_in);

    // More than one partial result: recurse on the per-block winners.
    if(nblocks > 1) {
        return recursive_parallel_maxIndex(d_array_out, nblocks);
    }
    // Termination: a single pair remains. This buffer must NOT be freed here;
    // it is handed back to the very first caller of
    // recursive_parallel_maxIndex().
    else {
        //printf("Termination reached!\n");
        printf("Target memory address: %p\n", d_array_out);
        return d_array_out;
    }
}
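
// ---------------------------------------------------------------------------
// Illustrative driver (not part of the original paste): fills an array with
// random values, packs (value, index) pairs, reduces them on the device, and
// checks the result against the host reference above. Ownership of the final
// device buffer falls to this caller, as noted in the recursion.
// ---------------------------------------------------------------------------
int main(void) {
    int const N = 1 << 20;

    float *h_in = (float *) malloc(sizeof(*h_in) * N);
    for(int i = 0; i < N; ++i)
        h_in[i] = (float) rand() / (float) RAND_MAX;

    float *d_in;
    float2 *d_pairs;
    cudaMalloc((void **) &d_in, sizeof(*d_in) * N);
    cudaMalloc((void **) &d_pairs, sizeof(*d_pairs) * N);
    cudaMemcpy(d_in, h_in, sizeof(*d_in) * N, cudaMemcpyHostToDevice);

    int nb = (N + THREADSPERBLOCK - 1) / THREADSPERBLOCK;
    parallel_maxIndexPrep<<<nb, THREADSPERBLOCK>>>(d_in, d_pairs, N);

    // d_pairs is freed inside the recursion; only d_result remains to free.
    float2 *d_result = recursive_parallel_maxIndex(d_pairs, N);

    float2 h_result;
    cudaMemcpy(&h_result, d_result, sizeof(h_result), cudaMemcpyDeviceToHost);
    printf("device: max = %f at index %d\n", h_result.x, (int) h_result.y);

    float2 ref = host_maxIndex(h_in, N);
    printf("host:   max = %f at index %d\n", ref.x, (int) ref.y);

    cudaFree(d_result);
    cudaFree(d_in);
    free(h_in);
    return 0;
}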