SHARE
TWEET

Untitled

a guest Oct 12th, 2017 66 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  // MP Scan
  // Given a list (lst) of length n,
  // output its inclusive prefix sum:
  //   {lst[0], lst[0] + lst[1], ..., lst[0] + lst[1] + ... + lst[n-1]}
  6.  
  7. #include <wb.h>
  8.  
  // Threads per block for the scan kernels. Must be a power of two (the scan
  // trees halve/double their stride) and at most 1024.
  #define BLOCK_SIZE 1024 //@@ You can change this
  // Section size: how many elements one block scans at once (two per thread).
  // Parenthesized so the macro is safe inside larger expressions such as
  // `x / SEC_SIZE` or `x % SEC_SIZE`.
  #define SEC_SIZE (2 * BLOCK_SIZE)
  11.  
  // Evaluates a CUDA runtime call once. On any result other than cudaSuccess it
  // logs the failing statement text and the CUDA error string, then returns -1
  // from the enclosing function (so it is only usable inside functions that
  // return an int-compatible type).
  #define wbCheck(stmt)                                                     \
    do {                                                                    \
      cudaError_t err = stmt;                                               \
      if (err == cudaSuccess) {                                             \
        break;                                                              \
      }                                                                     \
      wbLog(ERROR, "Failed to run stmt ", #stmt);                           \
      wbLog(ERROR, "Got CUDA error ...  ", cudaGetErrorString(err));        \
      return -1;                                                            \
    } while (0)
  21.  
  // Adds the running total of all preceding sections to this block's section.
  //
  // Launch layout: 1D grid / 1D block, using the same configuration as the
  // per-section scan that produced `input`; block b owns the 2 * blockDim.x
  // elements starting at 2 * b * blockDim.x (two elements per thread).
  //
  //   input - per-section scanned values, updated in place (len elements)
  //   aux   - scanned section totals; aux[b] is expected to hold the sum of
  //           sections 0..b, so block b adds aux[b - 1]
  //   len   - number of valid elements in input
  __global__  void add(float *input, float *aux, int len){
    // The first section has nothing before it, so it is already final.
    if (blockIdx.x == 0)
      return;

    // Every thread of the block adds the same offset; read it once.
    const float offset = aux[blockIdx.x - 1];

    // All offsets come from blockDim.x so the kernel stays correct for any
    // launch size, not only blockDim.x == BLOCK_SIZE.
    const int first  = 2 * blockIdx.x * blockDim.x + threadIdx.x;
    const int second = first + blockDim.x;

    if (first < len)
      input[first] += offset;
    if (second < len)
      input[second] += offset;
  }
  34.  
  35.  
  // Per-section inclusive prefix sum (work-efficient reduction tree followed by
  // a distribution tree in shared memory).
  //
  // Launch layout: 1D grid / 1D block with blockDim.x <= BLOCK_SIZE. Block b
  // scans the section of 2 * blockDim.x elements starting at
  // 2 * b * blockDim.x (two elements per thread). Uses a static shared tile of
  // SEC_SIZE floats. BLOCK_SIZE must be a power of two.
  //
  //   input  - values to scan (len elements)
  //   output - receives each section's inclusive prefix sums; may be the same
  //            buffer as input, because every thread loads its two elements
  //            into shared memory before writing those same two positions
  //   aux    - when flag is non-zero, aux[blockIdx.x] receives the section
  //            total; never dereferenced when flag is zero (may be NULL)
  //   flag   - non-zero to emit section totals (first pass); zero when scanning
  //            the auxiliary array itself
  //   len    - number of valid elements in input/output
  __global__ void scan(float *input, float *output, float *aux, int flag, int len) {
    //@@ Modify the body of this function to complete the functionality of
    //@@ the scan on the device
    //@@ You may need multiple kernel calls; write your kernels before this
    //@@ function and call them from here

    __shared__ float XY[SEC_SIZE];

    const int t = threadIdx.x;
    const int half = blockDim.x;      // offset between a thread's two elements
    const int section = 2 * half;     // elements handled by this block
    const int lo = 2 * blockIdx.x * blockDim.x + t;
    const int hi = lo + half;

    // Load the section, padding past-the-end slots with zero. The guard and
    // the access use the same offset so the guard always protects the access.
    XY[t] = (lo < len) ? input[lo] : 0.0f;
    XY[t + half] = (hi < len) ? input[hi] : 0.0f;

    // Reduction phase: build partial sums up the tree. Only slots inside the
    // loaded section are touched, so no uninitialized shared memory is read.
    for (int stride = 1; stride <= half; stride *= 2) {
      __syncthreads();
      const int index = (t + 1) * 2 * stride - 1;
      if (index < section) {
        XY[index] += XY[index - stride];
      }
    }

    // Distribution phase: push partial sums back down the tree. Strides that
    // are too large for this section simply perform no work.
    for (int stride = (SEC_SIZE) / 4; stride > 0; stride /= 2) {
      __syncthreads();
      const int index = (t + 1) * stride * 2 - 1;
      if (index + stride < section) {
        XY[index + stride] += XY[index];
      }
    }

    // Make the final tile visible to every thread before copying it out.
    __syncthreads();
    if (lo < len)
      output[lo] = XY[t];
    if (hi < len)
      output[hi] = XY[t + half];

    // First pass only: the last slot of the section is the section total.
    if (flag && t == half - 1)
      aux[blockIdx.x] = XY[section - 1];
  }
  83.  
  84.  
  // Host driver: reads the input list, computes its inclusive prefix sum on
  // the GPU in three steps (per-section scan, scan of the section totals,
  // add the scanned totals back), and hands the result to wbSolution.
  // Returns 0 on success and -1 on any CUDA error or unsupported input size.
  int main(int argc, char **argv) {
    wbArg_t args;
    float *hostInput;  // The input 1D list
    float *hostOutput; // The output list
    float *deviceInput;
    float *deviceOutput;
    float *aux;        // device array of per-section totals
    int numElements;   // number of elements in the list
    args = wbArg_read(argc, argv);

    wbTime_start(Generic, "Importing data and creating memory on host");
    hostInput = (float *)wbImport(wbArg_getInputFile(args, 0), &numElements);
    hostOutput = (float *)malloc(numElements * sizeof(float));
    wbTime_stop(Generic, "Importing data and creating memory on host");

    if (numElements > 0 && hostOutput == NULL) {
      wbLog(ERROR, "Failed to allocate host output buffer");
      free(hostInput);
      return -1;
    }

    // Number of sections (= thread blocks): integer ceil-division, written so
    // that it cannot overflow for large numElements.
    const int sectionSize = 2 * BLOCK_SIZE;
    const int num =
        numElements / sectionSize + (numElements % sectionSize != 0 ? 1 : 0);

    wbLog(TRACE, "The number of input elements in the input is ",
          numElements);

    // The section totals are scanned by a single block, which can handle at
    // most one section's worth of totals.
    if (num > sectionSize) {
      wbLog(ERROR, "Input is too large for a single-level auxiliary scan");
      free(hostInput);
      free(hostOutput);
      return -1;
    }

    wbTime_start(GPU, "Allocating GPU memory.");
    wbCheck(cudaMalloc((void **)&deviceInput, numElements * sizeof(float)));
    wbCheck(cudaMalloc((void **)&deviceOutput, numElements * sizeof(float)));
    wbCheck(cudaMalloc((void **)&aux, num * sizeof(float))); // one total per section
    wbTime_stop(GPU, "Allocating GPU memory.");

    wbTime_start(GPU, "Clearing output memory.");
    wbCheck(cudaMemset(deviceOutput, 0, numElements * sizeof(float)));
    wbTime_stop(GPU, "Clearing output memory.");

    wbTime_start(GPU, "Copying input memory to the GPU.");
    wbCheck(cudaMemcpy(deviceInput, hostInput, numElements * sizeof(float),
                       cudaMemcpyHostToDevice));
    wbTime_stop(GPU, "Copying input memory to the GPU.");

    //@@ Initialize the grid and block dimensions here
    dim3 dimGrid(num);         // one block per section
    dim3 dimBlock(BLOCK_SIZE); // each thread handles two elements

    wbTime_start(Compute, "Performing CUDA computation");
    //@@ Modify this to complete the functionality of the scan
    //@@ on the deivce
    if (num > 0) { // a zero-block launch is an invalid configuration
      // Pass 1: scan every section and record each section's total in aux.
      scan <<<dimGrid, dimBlock>>> (deviceInput, deviceOutput, aux, 1, numElements);
      wbCheck(cudaGetLastError());

      if (num > 1) {
        // Pass 2: scan the section totals in place with a single full-size
        // block. The length is the number of totals (num), not numElements,
        // so the kernel never touches memory beyond the aux allocation.
        scan <<<dim3(1), dimBlock>>> (aux, aux, NULL, 0, num);
        wbCheck(cudaGetLastError());

        // Pass 3: add the preceding sections' total to each later section.
        add <<<dimGrid, dimBlock>>> (deviceOutput, aux, numElements);
        wbCheck(cudaGetLastError());
      }

      // The launches above are ordered on the default stream; one sync here
      // surfaces any asynchronous execution error before timing stops.
      wbCheck(cudaDeviceSynchronize());
    }
    wbTime_stop(Compute, "Performing CUDA computation");

    wbTime_start(Copy, "Copying output memory to the CPU");
    wbCheck(cudaMemcpy(hostOutput, deviceOutput, numElements * sizeof(float),
                       cudaMemcpyDeviceToHost));
    wbTime_stop(Copy, "Copying output memory to the CPU");

    wbTime_start(GPU, "Freeing GPU Memory");
    wbCheck(cudaFree(deviceInput));
    wbCheck(cudaFree(deviceOutput));
    wbCheck(cudaFree(aux));
    wbTime_stop(GPU, "Freeing GPU Memory");

    wbSolution(args, hostOutput, numElements);

    free(hostInput);
    free(hostOutput);

    return 0;
  }
RAW Paste Data
Top