// MP Scan
// Given a list (lst) of length n, output its prefix sum:
// {lst[0], lst[0] + lst[1], ..., lst[0] + lst[1] + ... + lst[n-1]}
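// A small worked example (values chosen here purely for illustration): an
// inclusive scan of the input {3, 1, 7, 0, 4} produces {3, 4, 11, 11, 15}.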

#include <wb.h>

#define BLOCK_SIZE 512 //@@ You can change this

#define wbCheck(stmt)                                                     \
  do {                                                                    \
    cudaError_t err = stmt;                                               \
    if (err != cudaSuccess) {                                             \
      wbLog(ERROR, "Failed to run stmt ", #stmt);                         \
      wbLog(ERROR, "Got CUDA error ...  ", cudaGetErrorString(err));      \
      return -1;                                                          \
    }                                                                     \
  } while (0)
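// Overall approach (three kernel launches from the host):
//   1. scan         - each block performs a work-efficient (Brent-Kung style)
//                     inclusive scan over a 2*BLOCK_SIZE tile in shared memory
//                     and records its tile total in blockSums.
//   2. finalscan    - a single block scans the blockSums array in place.
//   3. addfinalscan - each block (other than block 0) adds the scanned sum of
//                     all preceding blocks to its tile of the output.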
// Adds the scanned block sums back into the per-block scan results: every
// element in block b (b > 0) gets the inclusive sum of blocks 0..b-1.
__global__ void addfinalscan(float *blockSums, float *scannedOutput, int len) {
  int t = threadIdx.x;
  int idx = (blockIdx.x * blockDim.x * 2) + t; // each block covers 2*blockDim.x elements
  if (blockIdx.x > 0 && idx < len)
    scannedOutput[idx] += blockSums[blockIdx.x - 1];
  if (blockIdx.x > 0 && (blockDim.x + idx) < len)
    scannedOutput[idx + blockDim.x] += blockSums[blockIdx.x - 1];
}
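// For illustration (made-up numbers): if the per-block totals were {11, 5, 8},
// finalscan turns blockSums into {11, 16, 24}; addfinalscan then adds 11 to
// every element of block 1 and 16 to every element of block 2, while block 0
// is left unchanged.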
// Scans the array of per-block sums in place. Launched with a single block,
// so it handles at most 2*BLOCK_SIZE block sums (i.e. inputs of up to
// 2*BLOCK_SIZE * 2*BLOCK_SIZE elements with the launch configuration below).
__global__ void finalscan(float *blockSums, int len) {
  __shared__ float T[2 * BLOCK_SIZE];
  int t = threadIdx.x;
  int idx = (blockIdx.x * blockDim.x * 2) + t;
  // Load two elements per thread, padding with zeros past the end of the input.
  if (idx < len)
    T[t] = blockSums[idx];
  else
    T[t] = 0.0;
  if ((blockDim.x + idx) < len)
    T[blockDim.x + t] = blockSums[blockDim.x + idx];
  else
    T[blockDim.x + t] = 0.0;

  //@@ Traverse the reduction tree (up-sweep)
  int stride = 1;
  while (stride < 2 * BLOCK_SIZE) {
    __syncthreads();
    int index = (t + 1) * stride * 2 - 1;
    if (index < 2 * BLOCK_SIZE && (index - stride) >= 0)
      T[index] += T[index - stride];
    stride *= 2;
  }
  // Root-to-leaf traversal: distribute partial sums back down the balanced
  // binary tree (post-scan phase).
  stride = BLOCK_SIZE / 2;
  while (stride > 0) {
    __syncthreads();
    int index = (t + 1) * 2 * stride - 1;
    if ((index + stride) < 2 * BLOCK_SIZE)
      T[index + stride] += T[index];
    stride /= 2;
  }
  __syncthreads();
  if (idx < len)
    blockSums[idx] = T[t];
  if ((idx + blockDim.x) < len)
    blockSums[idx + blockDim.x] = T[t + blockDim.x];
}
// Per-block work-efficient inclusive scan. Each block scans a tile of
// 2*BLOCK_SIZE input elements in shared memory, writes the scanned tile to
// output, and stores the tile total in blockSums[blockIdx.x].
__global__ void scan(float *input, float *blockSums, float *output, int len) {
  //@@ Modify the body of this function to complete the functionality of
  //@@ the scan on the device
  //@@ You may need multiple kernel calls; write your kernels before this
  //@@ function and call them from the host
  __shared__ float T[2 * BLOCK_SIZE];
  int t = threadIdx.x;
  int idx = (blockIdx.x * blockDim.x * 2) + t;
  // Load two elements per thread, padding with zeros past the end of the input.
  if (idx < len)
    T[t] = input[idx];
  else
    T[t] = 0.0;
  if ((blockDim.x + idx) < len)
    T[blockDim.x + t] = input[blockDim.x + idx];
  else
    T[blockDim.x + t] = 0.0;

  //@@ Traverse the reduction tree (up-sweep)
  int stride = 1;
  while (stride < 2 * BLOCK_SIZE) {
    __syncthreads();
    int index = (t + 1) * stride * 2 - 1;
    if (index < 2 * BLOCK_SIZE && (index - stride) >= 0)
      T[index] += T[index - stride];
    stride *= 2;
  }
  // Root-to-leaf traversal: distribute partial sums back down the balanced
  // binary tree (post-scan phase).
  stride = BLOCK_SIZE / 2;
  while (stride > 0) {
    __syncthreads();
    int index = (t + 1) * 2 * stride - 1;
    if ((index + stride) < 2 * BLOCK_SIZE)
      T[index + stride] += T[index];
    stride /= 2;
  }

  __syncthreads();
  if (idx < len)
    output[idx] = T[t];
  if (idx + blockDim.x < len)
    output[idx + blockDim.x] = T[t + blockDim.x];
  // One thread per block records the tile total for the block-sum scan
  // (the original condition compared idx instead of t, so only block 0 ever
  // wrote its sum).
  if (t == blockDim.x - 1)
    blockSums[blockIdx.x] = T[2 * BLOCK_SIZE - 1];
}
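
// Host-side reference (a helper added here for checking results; not part of
// the original assignment skeleton): a straightforward sequential inclusive
// scan to compare against the device output element by element.
void hostReferenceScan(const float *in, float *out, int n) {
  float running = 0.0f;
  for (int i = 0; i < n; i++) {
    running += in[i];
    out[i] = running;
  }
}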

int main(int argc, char **argv) {
  wbArg_t args;
  float *hostInput;  // The input 1D list
  float *hostOutput; // The output list
  float *deviceInput;
  float *deviceOutput;
  int numElements;   // Number of elements in the list
  float *blockSums;  // Device array of per-block sums
  args = wbArg_read(argc, argv);

  wbTime_start(Generic, "Importing data and creating memory on host");
  hostInput = (float *)wbImport(wbArg_getInputFile(args, 0), &numElements);
  hostOutput = (float *)malloc(numElements * sizeof(float));
  wbTime_stop(Generic, "Importing data and creating memory on host");

  wbLog(TRACE, "The number of elements in the input is ", numElements);
  // Each scan block covers 2*BLOCK_SIZE input elements.
  unsigned int blockScanSize = ceil(numElements / (BLOCK_SIZE * 2.0));
  wbTime_start(GPU, "Allocating GPU memory.");
  wbCheck(cudaMalloc((void **)&deviceInput, numElements * sizeof(float)));
  wbCheck(cudaMalloc((void **)&blockSums, blockScanSize * sizeof(float)));
  wbCheck(cudaMalloc((void **)&deviceOutput, numElements * sizeof(float)));
  wbTime_stop(GPU, "Allocating GPU memory.");

  wbTime_start(GPU, "Clearing output memory.");
  wbCheck(cudaMemset(deviceOutput, 0, numElements * sizeof(float)));
  wbTime_stop(GPU, "Clearing output memory.");

  wbTime_start(GPU, "Copying input memory to the GPU.");
  wbCheck(cudaMemcpy(deviceInput, hostInput, numElements * sizeof(float),
                     cudaMemcpyHostToDevice));
  wbTime_stop(GPU, "Copying input memory to the GPU.");

  //@@ Initialize the grid and block dimensions here
  dim3 DimGrid(blockScanSize, 1, 1);
  dim3 DimBlock(BLOCK_SIZE, 1, 1);
  wbTime_start(Compute, "Performing CUDA computation");
  //@@ Modify this to complete the functionality of the scan
  //@@ on the device
  // Step 1: per-block scan, which also collects the per-block totals.
  scan<<<DimGrid, DimBlock>>>(deviceInput, blockSums, deviceOutput, numElements);
  cudaDeviceSynchronize();
  // Step 2: scan the block sums with a single block.
  dim3 DimGridblockSum(1, 1, 1);
  finalscan<<<DimGridblockSum, DimBlock>>>(blockSums, blockScanSize);
  cudaDeviceSynchronize();
  // Step 3: add the scanned block sums back into each block's output.
  addfinalscan<<<DimGrid, DimBlock>>>(blockSums, deviceOutput, numElements);
  cudaDeviceSynchronize();
  wbTime_stop(Compute, "Performing CUDA computation");
  159.  
  160.   wbTime_start(Copy, "Copying output memory to the CPU");
  161.   wbCheck(cudaMemcpy(hostOutput, deviceOutput, numElements * sizeof(float),
  162.                      cudaMemcpyDeviceToHost));
  163.   for(int i = 0; i < numElements; i++) {
  164.     printf("% d %f %f\n", i, hostInput[i], hostOutput[i]);
  165.   }
  166.   float sum = 0;
  167.   for(int j = 0; j < numElements; j++) {
  168.     sum += hostInput[j];
  169.   }
  170.   printf("Expected: %f\n", sum);
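  // Extra check (uses the hostReferenceScan helper added above, which is not
  // part of the original skeleton): compare the device result element by
  // element against a sequential scan on the host.
  float *referenceOutput = (float *)malloc(numElements * sizeof(float));
  hostReferenceScan(hostInput, referenceOutput, numElements);
  int mismatches = 0;
  for (int k = 0; k < numElements; k++) {
    if (fabs(referenceOutput[k] - hostOutput[k]) > 1e-3) {
      mismatches++;
    }
  }
  printf("Element-wise mismatches vs. sequential scan: %d\n", mismatches);
  free(referenceOutput);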
  wbTime_stop(Copy, "Copying output memory to the CPU");

  wbTime_start(GPU, "Freeing GPU Memory");
  cudaFree(deviceInput);
  cudaFree(deviceOutput);
  cudaFree(blockSums);
  wbTime_stop(GPU, "Freeing GPU Memory");

  wbSolution(args, hostOutput, numElements);

  free(hostInput);
  free(hostOutput);

  return 0;
}