Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// Pack each input value with its own index into a float2 (.x = value,
// .y = index) so a subsequent max-reduction can carry the argmax along.
// Launch: 1D grid covering N elements; no shared memory required.
// NOTE(review): the index is stored in a float, so indices above 2^24
// lose precision -- confirm N stays below that bound.
__global__ void parallel_maxIndexPrep(float const * const array_in, float2 *array_out, int const N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) {
        // Write straight to global memory. The original staged the pair
        // through extern shared memory, but no thread ever read another
        // thread's slot, so the round-trip (and the dynamic-smem launch
        // requirement) was pure overhead.
        float2 p;
        p.x = array_in[i];
        p.y = (float)i;
        array_out[i] = p;
    }
}
// Block-wise max+argmax reduction over float2 pairs (.x = value, .y = index).
// Each block reduces up to 2*blockDim.x consecutive input pairs and writes
// the block's winner to array_out[blockIdx.x].
// Requirements:
//   - blockDim.x must be a power of two (the tree reduction halves the stride)
//   - dynamic shared memory: blockDim.x * sizeof(float2)
//   - gridDim.x must equal ceil(N / (2*blockDim.x)): the tail handling below
//     assumes the last block is the only partial one.
__global__ void parallel_maxIndex(float2 const * const array_in, float2 * const array_out, int const N) {
    extern __shared__ float2 sdata[];
    int tpb = blockDim.x;
    int bid = blockIdx.x;
    int tid = threadIdx.x;
    int i = bid*tpb*2 + tid;   // this thread owns elements i and i+tpb
    // Load phase: each thread reduces its two elements into shared memory.
    if (gridDim.x - 1 == bid) {
        // Last (possibly partial) block: R elements remain.
        int R = N - (2*(gridDim.x-1))*tpb;
        if (R > tpb) {
            // Second tile is partial: threads past its end carry only their
            // first element forward.
            if (tid < (R - tpb)) {
                sdata[tid] = (array_in[i+tpb].x > array_in[i].x) ? array_in[i+tpb] : array_in[i];
            } else {
                sdata[tid] = array_in[i];
            }
        } else {
            // At most one tile left: pad the rest with -FLT_MAX so padding
            // can never win the max. (The original padded with 0, which gave
            // a wrong result whenever every real input was negative.)
            if (tid < R) {
                sdata[tid] = array_in[i];
            } else {
                sdata[tid].x = -3.402823466e38f; // -FLT_MAX: identity for max
                sdata[tid].y = 0.0f;
            }
        }
    } else {
        // Full block: both elements exist.
        sdata[tid] = (array_in[i+tpb].x > array_in[i].x) ? array_in[i+tpb] : array_in[i];
    }
    __syncthreads();
    // Tree reduction with sequential addressing: the stride halves each pass,
    // keeping active threads contiguous (avoids shared-memory bank conflicts
    // and minimizes intra-warp divergence).
    for (unsigned int s = tpb/2; s > 0; s >>= 1) {
        if (tid < s) {
            if (sdata[tid+s].x > sdata[tid].x) {
                sdata[tid] = sdata[tid+s];
            }
        }
        // Barrier is outside the divergent branch -- all threads reach it.
        __syncthreads();
    }
    // Thread 0 publishes this block's winner.
    if (tid == 0) {
        array_out[bid] = sdata[0];
    }
}
// Recursively reduces d_array_in (N float2 {value,index} pairs in device
// memory) down to a single pair holding the maximum value and its index.
// Frees every intermediate buffer, INCLUDING the d_array_in it is handed --
// the caller must not reuse that pointer afterwards.
// Returns a device pointer to a 1-element float2 result (caller owns it and
// must cudaFree it), or NULL on allocation/launch failure.
float2 *recursive_parallel_maxIndex(float2 * d_array_in, int const N) {
    // Each block reduces 2*THREADSPERBLOCK elements; exact integer ceil-div
    // (the original went through float ceil(), which is inexact for large N).
    int nblocks = (N + 2*THREADSPERBLOCK - 1) / (2*THREADSPERBLOCK);
    float2 *d_array_out;
    cudaError_t err = cudaMalloc((void **) &d_array_out, sizeof(*d_array_out)*nblocks);
    if (err != cudaSuccess) {
        fprintf(stderr, "recursive_parallel_maxIndex: cudaMalloc failed: %s\n",
                cudaGetErrorString(err));
        return NULL;
    }
    int smem = sizeof(float2) * THREADSPERBLOCK;
    // The kernel's tail handling assumes gridDim.x == nblocks, so the grid
    // must never be capped below nblocks. (The original clamped the launch to
    // min(65536, nblocks), which would silently leave data unreduced for very
    // large N.) Fail loudly instead.
    assert(nblocks < MAXBLOCKSPERGRID);
    parallel_maxIndex<<< nblocks, THREADSPERBLOCK, smem >>>(d_array_in, d_array_out, N);
    err = cudaGetLastError();   // catches bad launch configuration
    if (err != cudaSuccess) {
        fprintf(stderr, "recursive_parallel_maxIndex: launch failed: %s\n",
                cudaGetErrorString(err));
        cudaFree(d_array_out);
        return NULL;
    }
    // The input buffer is an intermediate of the previous level; release it.
    cudaFree(d_array_in);
    if (nblocks > 1) {
        // More than one partial result remains -- reduce again.
        return recursive_parallel_maxIndex(d_array_out, nblocks);
    }
    // nblocks == 1: d_array_out already holds the final {max, argmax} pair.
    // (The original recursed whenever N > THREADSPERBLOCK, forcing one
    // pointless extra kernel pass for THREADSPERBLOCK < N <= 2*THREADSPERBLOCK.)
    printf("Target memory address: %p\n", d_array_out);
    return d_array_out;
}
Add Comment
Please, Sign In to add comment