Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// Pack each input value with its own index into a float2 (.x = value,
// .y = index) so a subsequent max-reduction can carry the argmax along.
// Launch: 1D grid covering N elements; no shared memory required.
// NOTE(review): the index is stored in a float, so indices above 2^24
// lose precision -- confirm N stays below that bound.
__global__ void parallel_maxIndexPrep(float const * const array_in, float2 *array_out, int const N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) {
        // Write straight to global memory. The original staged the pair
        // through extern shared memory, but no thread ever read another
        // thread's slot, so the round-trip (and the dynamic-smem launch
        // requirement) was pure overhead.
        float2 p;
        p.x = array_in[i];
        p.y = (float)i;
        array_out[i] = p;
    }
}
// Block-wise max+argmax reduction over float2 pairs (.x = value, .y = index).
// Each block reduces up to 2*blockDim.x consecutive input pairs and writes
// the block's winner to array_out[blockIdx.x].
// Requirements:
//   - blockDim.x must be a power of two (the tree reduction halves the stride)
//   - dynamic shared memory: blockDim.x * sizeof(float2)
//   - gridDim.x must equal ceil(N / (2*blockDim.x)): the tail handling below
//     assumes the last block is the only partial one.
__global__ void parallel_maxIndex(float2 const * const array_in, float2 * const array_out, int const N) {
    extern __shared__ float2 sdata[];
    int tpb = blockDim.x;
    int bid = blockIdx.x;
    int tid = threadIdx.x;
    int i = bid*tpb*2 + tid;   // this thread owns elements i and i+tpb
    // Load phase: each thread reduces its two elements into shared memory.
    if (gridDim.x - 1 == bid) {
        // Last (possibly partial) block: R elements remain.
        int R = N - (2*(gridDim.x-1))*tpb;
        if (R > tpb) {
            // Second tile is partial: threads past its end carry only their
            // first element forward.
            if (tid < (R - tpb)) {
                sdata[tid] = (array_in[i+tpb].x > array_in[i].x) ? array_in[i+tpb] : array_in[i];
            } else {
                sdata[tid] = array_in[i];
            }
        } else {
            // At most one tile left: pad the rest with -FLT_MAX so padding
            // can never win the max. (The original padded with 0, which gave
            // a wrong result whenever every real input was negative.)
            if (tid < R) {
                sdata[tid] = array_in[i];
            } else {
                sdata[tid].x = -3.402823466e38f; // -FLT_MAX: identity for max
                sdata[tid].y = 0.0f;
            }
        }
    } else {
        // Full block: both elements exist.
        sdata[tid] = (array_in[i+tpb].x > array_in[i].x) ? array_in[i+tpb] : array_in[i];
    }
    __syncthreads();
    // Tree reduction with sequential addressing: the stride halves each pass,
    // keeping active threads contiguous (avoids shared-memory bank conflicts
    // and minimizes intra-warp divergence).
    for (unsigned int s = tpb/2; s > 0; s >>= 1) {
        if (tid < s) {
            if (sdata[tid+s].x > sdata[tid].x) {
                sdata[tid] = sdata[tid+s];
            }
        }
        // Barrier is outside the divergent branch -- all threads reach it.
        __syncthreads();
    }
    // Thread 0 publishes this block's winner.
    if (tid == 0) {
        array_out[bid] = sdata[0];
    }
}
// Recursively reduces d_array_in (N float2 {value,index} pairs in device
// memory) down to a single pair holding the maximum value and its index.
// Frees every intermediate buffer, INCLUDING the d_array_in it is handed --
// the caller must not reuse that pointer afterwards.
// Returns a device pointer to a 1-element float2 result (caller owns it and
// must cudaFree it), or NULL on allocation/launch failure.
float2 *recursive_parallel_maxIndex(float2 * d_array_in, int const N) {
    // Each block reduces 2*THREADSPERBLOCK elements; exact integer ceil-div
    // (the original went through float ceil(), which is inexact for large N).
    int nblocks = (N + 2*THREADSPERBLOCK - 1) / (2*THREADSPERBLOCK);
    float2 *d_array_out;
    cudaError_t err = cudaMalloc((void **) &d_array_out, sizeof(*d_array_out)*nblocks);
    if (err != cudaSuccess) {
        fprintf(stderr, "recursive_parallel_maxIndex: cudaMalloc failed: %s\n",
                cudaGetErrorString(err));
        return NULL;
    }
    int smem = sizeof(float2) * THREADSPERBLOCK;
    // The kernel's tail handling assumes gridDim.x == nblocks, so the grid
    // must never be capped below nblocks. (The original clamped the launch to
    // min(65536, nblocks), which would silently leave data unreduced for very
    // large N.) Fail loudly instead.
    assert(nblocks < MAXBLOCKSPERGRID);
    parallel_maxIndex<<< nblocks, THREADSPERBLOCK, smem >>>(d_array_in, d_array_out, N);
    err = cudaGetLastError();   // catches bad launch configuration
    if (err != cudaSuccess) {
        fprintf(stderr, "recursive_parallel_maxIndex: launch failed: %s\n",
                cudaGetErrorString(err));
        cudaFree(d_array_out);
        return NULL;
    }
    // The input buffer is an intermediate of the previous level; release it.
    cudaFree(d_array_in);
    if (nblocks > 1) {
        // More than one partial result remains -- reduce again.
        return recursive_parallel_maxIndex(d_array_out, nblocks);
    }
    // nblocks == 1: d_array_out already holds the final {max, argmax} pair.
    // (The original recursed whenever N > THREADSPERBLOCK, forcing one
    // pointless extra kernel pass for THREADSPERBLOCK < N <= 2*THREADSPERBLOCK.)
    printf("Target memory address: %p\n", d_array_out);
    return d_array_out;
}
Add Comment
Please, Sign In to add comment