Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// MP Scan
// Given a list (lst) of length n, output its inclusive prefix sum:
// {lst[0], lst[0] + lst[1], ..., lst[0] + lst[1] + ... + lst[n-1]}
- #include <wb.h>
- #define BLOCK_SIZE 512 //@@ You can change this
// Error-checking wrapper for CUDA runtime calls: logs the failing statement
// text and the CUDA error string via wbLog, then returns -1 from the
// enclosing function. do/while(0) makes it expand as a single statement.
#define wbCheck(stmt) \
do { \
cudaError_t err = stmt; \
if (err != cudaSuccess) { \
wbLog(ERROR, "Failed to run stmt ", #stmt); \
wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \
return -1; \
} \
} while (0)
// Completes the hierarchical scan: adds the scanned total of all preceding
// blocks (blockSums[blockIdx.x - 1], already an inclusive scan) to both
// elements this thread covers.  Block 0 has no preceding blocks and is left
// untouched.  Expected launch: same grid/block shape as scan() — one thread
// per two output elements.
__global__ void addfinalscan(float* blockSums, float* scannedOutput, int len){
    int first  = blockIdx.x * blockDim.x * 2 + threadIdx.x;
    int second = first + blockDim.x;
    if (blockIdx.x == 0)
        return;  // no __syncthreads below, so an early exit is safe
    float offset = blockSums[blockIdx.x - 1];
    if (first < len)
        scannedOutput[first] += offset;
    if (second < len)
        scannedOutput[second] += offset;
}
// In-place inclusive Brent-Kung scan of the per-block partial sums.
// Launched as a SINGLE block of BLOCK_SIZE threads, so it can scan at most
// 2*BLOCK_SIZE values.  Precondition: blockDim.x == BLOCK_SIZE (the shared
// tile is sized by the macro, not by blockDim).
__global__ void finalscan(float *blockSums, int len){
    __shared__ float buf[2 * BLOCK_SIZE];
    const int tid  = threadIdx.x;
    const int base = blockIdx.x * blockDim.x * 2 + tid;

    // Stage two elements per thread; zero-pad past-the-end slots so the tree
    // walk never reads uninitialized shared memory.
    buf[tid] = (base < len) ? blockSums[base] : 0.0f;
    buf[blockDim.x + tid] =
        (blockDim.x + base < len) ? blockSums[blockDim.x + base] : 0.0f;

    // Up-sweep (reduction tree).
    for (int stride = 1; stride < 2 * BLOCK_SIZE; stride <<= 1) {
        __syncthreads();
        int pos = (tid + 1) * stride * 2 - 1;
        if (pos < 2 * BLOCK_SIZE && pos - stride >= 0)
            buf[pos] += buf[pos - stride];
    }

    // Down-sweep: push partial sums back down the tree.
    for (int stride = BLOCK_SIZE / 2; stride > 0; stride >>= 1) {
        __syncthreads();
        int pos = (tid + 1) * 2 * stride - 1;
        if (pos + stride < 2 * BLOCK_SIZE)
            buf[pos + stride] += buf[pos];
    }

    __syncthreads();
    // Write the scanned values back over the input array.
    if (base < len)
        blockSums[base] = buf[tid];
    if (base + blockDim.x < len)
        blockSums[base + blockDim.x] = buf[tid + blockDim.x];
}
// Phase-1 inclusive Brent-Kung scan: each block scans its own tile of
// 2*BLOCK_SIZE input elements into `output`, and publishes the tile's total
// to blockSums[blockIdx.x] for the later hierarchical fix-up.
// Precondition: blockDim.x == BLOCK_SIZE (shared tile is sized by the macro).
__global__ void scan(float *input, float* blockSums, float *output, int len) {
    __shared__ float T[2 * BLOCK_SIZE];
    int t = threadIdx.x;
    int idx = (blockIdx.x * blockDim.x * 2) + t;

    // Load two elements per thread, zero-padding past-the-end slots so the
    // tree traversal never reads uninitialized shared memory.
    T[t] = (idx < len) ? input[idx] : 0.0f;
    T[blockDim.x + t] = (blockDim.x + idx < len) ? input[blockDim.x + idx] : 0.0f;

    // Up-sweep (reduction tree).  index >= 2*stride - 1 >= stride, so the
    // original `(index - stride) >= 0` guard was provably redundant.
    int stride = 1;
    while (stride < 2 * BLOCK_SIZE) {
        __syncthreads();
        int index = (t + 1) * stride * 2 - 1;
        if (index < 2 * BLOCK_SIZE)
            T[index] += T[index - stride];
        stride *= 2;
    }

    // Down-sweep: distribute partial sums back down the tree.
    stride = BLOCK_SIZE / 2;
    while (stride > 0) {
        __syncthreads();
        int index = (t + 1) * 2 * stride - 1;
        if (index + stride < 2 * BLOCK_SIZE)
            T[index + stride] += T[index];
        stride /= 2;
    }

    __syncthreads();
    if (idx < len)
        output[idx] = T[t];
    if (idx + blockDim.x < len)
        output[idx + blockDim.x] = T[t + blockDim.x];

    // BUG FIX: the original guard was `idx == blockDim.x - 1`, which holds
    // only in block 0 (for blockIdx.x >= 1, idx >= 2*blockDim.x), so
    // blockSums[1..] were never written and held cudaMalloc garbage that
    // addfinalscan() then added into the output.  Every block must publish
    // its tile total (last slot of the scanned tile) exactly once.
    if (t == blockDim.x - 1)
        blockSums[blockIdx.x] = T[2 * BLOCK_SIZE - 1];
}
int main(int argc, char **argv) {
    wbArg_t args;
    float *hostInput;   // The input 1D list
    float *hostOutput;  // The output list
    float *deviceInput;
    float *deviceOutput;
    int numElements;    // number of elements in the list
    float *blockSums;   // device scratch: one partial sum per scan block

    args = wbArg_read(argc, argv);

    wbTime_start(Generic, "Importing data and creating memory on host");
    hostInput = (float *)wbImport(wbArg_getInputFile(args, 0), &numElements);
    hostOutput = (float *)malloc(numElements * sizeof(float));
    wbTime_stop(Generic, "Importing data and creating memory on host");
    wbLog(TRACE, "The number of input elements in the input is ",
          numElements);

    // Each scan block covers 2*BLOCK_SIZE elements.  finalscan runs as one
    // block, so this two-level hierarchy supports at most 2*BLOCK_SIZE
    // partial sums, i.e. 4*BLOCK_SIZE*BLOCK_SIZE input elements.
    unsigned int blockScanSize = ceil(numElements / (BLOCK_SIZE * 2.0));

    wbTime_start(GPU, "Allocating GPU memory.");
    wbCheck(cudaMalloc((void **)&deviceInput, numElements * sizeof(float)));
    wbCheck(cudaMalloc((void **)&blockSums, blockScanSize * sizeof(float)));
    wbCheck(cudaMalloc((void **)&deviceOutput, numElements * sizeof(float)));
    wbTime_stop(GPU, "Allocating GPU memory.");

    wbTime_start(GPU, "Clearing output memory.");
    wbCheck(cudaMemset(deviceOutput, 0, numElements * sizeof(float)));
    wbTime_stop(GPU, "Clearing output memory.");

    wbTime_start(GPU, "Copying input memory to the GPU.");
    wbCheck(cudaMemcpy(deviceInput, hostInput, numElements * sizeof(float),
                       cudaMemcpyHostToDevice));
    wbTime_stop(GPU, "Copying input memory to the GPU.");

    // One thread per two elements, BLOCK_SIZE threads per block.
    dim3 DimGrid(blockScanSize, 1, 1);
    dim3 DimBlock(BLOCK_SIZE, 1, 1);

    wbTime_start(Compute, "Performing CUDA computation");
    // Phase 1: per-block inclusive scan; each block also emits its total.
    scan<<<DimGrid, DimBlock>>>(deviceInput, blockSums, deviceOutput, numElements);
    wbCheck(cudaGetLastError());        // catch bad launch configurations
    wbCheck(cudaDeviceSynchronize());   // catch async execution faults

    // Phase 2: scan the per-block totals (single block).
    dim3 DimGridblockSum(1, 1, 1);
    finalscan<<<DimGridblockSum, DimBlock>>>(blockSums, blockScanSize);
    wbCheck(cudaGetLastError());
    wbCheck(cudaDeviceSynchronize());

    // Phase 3: add each preceding blocks' total back into the block's tile.
    addfinalscan<<<DimGrid, DimBlock>>>(blockSums, deviceOutput, numElements);
    wbCheck(cudaGetLastError());
    wbCheck(cudaDeviceSynchronize());
    wbTime_stop(Compute, "Performing CUDA computation");

    wbTime_start(Copy, "Copying output memory to the CPU");
    wbCheck(cudaMemcpy(hostOutput, deviceOutput, numElements * sizeof(float),
                       cudaMemcpyDeviceToHost));
    wbTime_stop(Copy, "Copying output memory to the CPU");

    // Debug dump, moved OUT of the timed copy region above so host-side
    // printf no longer skews the "Copying output memory to the CPU" number.
    for (int i = 0; i < numElements; i++) {
        printf("% d %f %f\n", i, hostInput[i], hostOutput[i]);
    }
    float sum = 0;
    for (int j = 0; j < numElements; j++) {
        sum += hostInput[j];
    }
    printf("Expected: %f\n", sum);

    wbTime_start(GPU, "Freeing GPU Memory");
    wbCheck(cudaFree(deviceInput));
    wbCheck(cudaFree(deviceOutput));
    wbCheck(cudaFree(blockSums));
    wbTime_stop(GPU, "Freeing GPU Memory");

    wbSolution(args, hostOutput, numElements);

    free(hostInput);
    free(hostOutput);
    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement