Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // MP Scan
- // Given a list (lst) of length n
- // Output its prefix sum = {lst[0], lst[0] + lst[1], lst[0] + lst[1] + ...
- // +
- // lst[n-1]}
- #include <wb.h>
- #define BLOCK_SIZE 1024 //@@ You can change this
- #define SEC_SIZE 2*BLOCK_SIZE //section size is how many elements it takes in once
- #define wbCheck(stmt) \
- do { \
- cudaError_t err = stmt; \
- if (err != cudaSuccess) { \
- wbLog(ERROR, "Failed to run stmt ", #stmt); \
- wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \
- return -1; \
- } \
- } while (0)
__global__ void add(float *input, float *aux, int len){
  // Completes the hierarchical scan: every block except block 0 offsets its
  // section of `input` by the scanned sum of all preceding sections, which
  // the auxiliary pass left in aux[blockIdx.x - 1].
  // Layout: each block covers 2 * blockDim.x elements; each thread updates
  // two of them, BLOCK_SIZE apart (assumes blockDim.x == BLOCK_SIZE, as
  // launched from main). Modifies `input` in place.
  if (blockIdx.x == 0)
    return; // first section needs no offset

  const float offset = aux[blockIdx.x - 1];
  const int first = 2 * blockIdx.x * blockDim.x + threadIdx.x;
  const int second = first + BLOCK_SIZE;

  if (first < len)
    input[first] += offset;
  if (second < len)
    input[second] += offset;
}
- //use flag to tell GPU not to create an auxillary array when scanning auxillary array itself
//use flag to tell GPU not to create an auxillary array when scanning auxillary array itself
__global__ void scan(float *input, float *output, float *aux, int flag, int len) {
  // Brent-Kung inclusive prefix sum. Each block scans one section of
  // 2 * blockDim.x elements of `input` into `output`.
  //
  // Parameters:
  //   input/output - device arrays; `len` is the number of VALID elements
  //                  (positions past `len` are treated as 0 and not stored).
  //   aux          - when `flag` is non-zero, the section total is written to
  //                  aux[blockIdx.x] so a second scan pass over `aux` can
  //                  produce per-block offsets. Ignored (may be NULL) when
  //                  `flag` is 0.
  //
  // Precondition: blockDim.x <= BLOCK_SIZE (shared XY holds SEC_SIZE floats).
  __shared__ float XY[SEC_SIZE];
  const int sectionSize = 2 * blockDim.x; // elements this block actually scans
  int i = 2 * blockIdx.x * blockDim.x + threadIdx.x;
  int t = threadIdx.x;
  // Zero-fill so a partially filled last section still scans correctly.
  XY[t] = 0;
  XY[t + blockDim.x] = 0;
  if (i < len)
    XY[t] = input[i];
  // BUGFIX: the guard previously tested `i + BLOCK_SIZE < len` while the
  // index used blockDim.x; the two disagree whenever the kernel is launched
  // with fewer than BLOCK_SIZE threads (the auxiliary-array pass), causing
  // out-of-bounds accesses. Use blockDim.x consistently.
  if (i + blockDim.x < len)
    XY[t + blockDim.x] = input[i + blockDim.x];
  // Reduction (up-sweep) phase: build partial sums at power-of-two strides.
  // Bounded by sectionSize (== SEC_SIZE when blockDim.x == BLOCK_SIZE) so a
  // smaller launch never touches shared slots beyond its own section.
  for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) {
    __syncthreads(); // barrier is outside any divergent branch
    int index = (t + 1) * 2 * stride - 1;
    if (index < sectionSize) {
      XY[index] += XY[index - stride];
    }
  }
  // Distribution (down-sweep) phase: propagate the partial sums back down.
  for (int stride = sectionSize / 4; stride > 0; stride /= 2) {
    __syncthreads();
    int index = (t + 1) * stride * 2 - 1;
    if (index + stride < sectionSize) {
      XY[index + stride] += XY[index];
    }
  }
  __syncthreads();
  // Write back only the valid prefix of the section.
  if (i < len)
    output[i] = XY[t];
  if (i + blockDim.x < len)
    output[i + blockDim.x] = XY[t + blockDim.x];
  // First pass only: last thread publishes the section total for the
  // hierarchical (auxiliary) scan.
  if (flag && t == blockDim.x - 1)
    aux[blockIdx.x] = XY[sectionSize - 1];
}
int main(int argc, char **argv) {
  // Host driver: imports the list, runs the three-step hierarchical scan
  // (scan sections -> scan section totals -> add offsets), and checks the
  // result against the expected solution.
  wbArg_t args;
  float *hostInput;  // The input 1D list
  float *hostOutput; // The output list
  float *deviceInput;
  float *deviceOutput;
  float *aux;        // per-section totals, scanned in place into offsets
  int numElements;   // number of elements in the list
  args = wbArg_read(argc, argv);
  wbTime_start(Generic, "Importing data and creating memory on host");
  hostInput = (float *)wbImport(wbArg_getInputFile(args, 0), &numElements);
  hostOutput = (float *)malloc(numElements * sizeof(float));
  wbTime_stop(Generic, "Importing data and creating memory on host");
  // Number of scan sections (= blocks); each covers 2*BLOCK_SIZE elements.
  int num = (int)ceil(numElements / (2.0 * BLOCK_SIZE));
  wbLog(TRACE, "The number of input elements in the input is ",
        numElements);
  wbTime_start(GPU, "Allocating GPU memory.");
  wbCheck(cudaMalloc((void **)&deviceInput, numElements * sizeof(float)));
  wbCheck(cudaMalloc((void **)&deviceOutput, numElements * sizeof(float)));
  wbCheck(cudaMalloc((void **)&aux, num * sizeof(float))); // auxiliary array
  wbTime_stop(GPU, "Allocating GPU memory.");
  wbTime_start(GPU, "Clearing output memory.");
  wbCheck(cudaMemset(deviceOutput, 0, numElements * sizeof(float)));
  wbTime_stop(GPU, "Clearing output memory.");
  wbTime_start(GPU, "Copying input memory to the GPU.");
  wbCheck(cudaMemcpy(deviceInput, hostInput, numElements * sizeof(float),
                     cudaMemcpyHostToDevice));
  wbTime_stop(GPU, "Copying input memory to the GPU.");
  // Launch configuration: `num` blocks of BLOCK_SIZE threads, each block
  // scanning a section of 2*BLOCK_SIZE elements.
  dim3 dimGrid(num);
  dim3 dimBlock(BLOCK_SIZE);
  // BUGFIX: launch the auxiliary scan with BLOCK_SIZE threads (not
  // ceil(num/2)) so blockDim.x matches the BLOCK_SIZE used in the kernel's
  // bounds guards. A single block of BLOCK_SIZE threads covers up to
  // 2*BLOCK_SIZE section totals — same capacity limit as before
  // (numElements <= 4 * BLOCK_SIZE^2; larger inputs would need a third level).
  dim3 auxDimGrid(1);
  dim3 auxDimBlock(BLOCK_SIZE);
  wbTime_start(Compute, "Performing CUDA computation");
  // Step 1: scan each section and emit per-section totals into aux.
  scan<<<dimGrid, dimBlock>>>(deviceInput, deviceOutput, aux, 1, numElements);
  wbCheck(cudaGetLastError()); // catch bad launch configuration
  wbCheck(cudaDeviceSynchronize());
  // Step 2: scan the section totals in place.
  // BUGFIX: pass the auxiliary array's real length `num` as len — previously
  // numElements was passed, which made the kernel read and write past the
  // end of the num-float aux allocation.
  scan<<<auxDimGrid, auxDimBlock>>>(aux, aux, NULL, 0, num);
  wbCheck(cudaGetLastError());
  wbCheck(cudaDeviceSynchronize());
  // Step 3: add each block's offset (scanned total of preceding sections).
  add<<<dimGrid, dimBlock>>>(deviceOutput, aux, numElements);
  wbCheck(cudaGetLastError());
  wbCheck(cudaDeviceSynchronize());
  wbTime_stop(Compute, "Performing CUDA computation");
  wbTime_start(Copy, "Copying output memory to the CPU");
  wbCheck(cudaMemcpy(hostOutput, deviceOutput, numElements * sizeof(float),
                     cudaMemcpyDeviceToHost));
  wbTime_stop(Copy, "Copying output memory to the CPU");
  wbTime_start(GPU, "Freeing GPU Memory");
  cudaFree(deviceInput);
  cudaFree(deviceOutput);
  cudaFree(aux);
  wbTime_stop(GPU, "Freeing GPU Memory");
  wbSolution(args, hostOutput, numElements);
  free(hostInput);
  free(hostOutput);
  return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement