#include "com2039_lab5_reduce_gpu.h"

int main(void) {

    // Allocate unified memory for the input array, the per-block partial
    // sums, and the final result.
    float *arr, *intermediate, *final;
    cudaMallocManaged(&arr, N * sizeof(float));
    cudaMallocManaged(&intermediate, (N / BLOCK_SIZE) * sizeof(float));
    cudaMallocManaged(&final, sizeof(float));

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Initialise the array: every element is 1.0f, so the sum should equal N
    for (int i = 0; i < N; i++) {
        arr[i] = 1.0f;
    }

    int grid_size = N / BLOCK_SIZE; // assumes N is an exact multiple of BLOCK_SIZE
    printf("Grid Size is: %d\n", grid_size);
    printf("Block Size is: %d\n", BLOCK_SIZE);

    cudaEventRecord(start); // Start timing

    // First pass: each block reduces BLOCK_SIZE elements to one partial sum
    reduceKernel<<<grid_size, BLOCK_SIZE>>>(intermediate, arr);

    // Wait for GPU to finish before accessing on host
    cudaDeviceSynchronize();

    // Second pass: a single block reduces the partial sums to the final value.
    // This only works while grid_size is a power of two no larger than BLOCK_SIZE.
    reduceKernel<<<1, grid_size>>>(final, intermediate);

    // Wait for GPU to finish before accessing on host
    cudaDeviceSynchronize();

    cudaEventRecord(stop); // Stop timing

  37.     //printf("Input Array: \n");
  38.     //for (int n = 0; n < N; n++) {
  39.         //printf("%f ", arr[n]);
  40.     //}
  41.  
  42.     //printf("\n");
  43.  
  44.     //printf("Intermediate Array: \n");
  45.     //for (int n = 0; n < grid_size; n++) {
  46.         //printf("%f ", intermediate[n]);
  47.     //}
  48.  
  49.     //printf("\n");
  50.  
  51.     printf("And the final reduction is: %f\n", final[0]);
  52.  
  53.     // Calculate the difference in times of the two events.
  54.     cudaEventSynchronize(stop);
  55.     float milliseconds = 0;
  56.     cudaEventElapsedTime(&milliseconds, start, stop);
  57.     printf("Elapsed time was: %f milliseconds.\n", milliseconds);
  58.  
  59.     cudaFree(arr);
  60.     cudaFree(intermediate);
  61.     cudaFree(final);
  62. }

__global__ void reduceKernel(float* out, float* in) {
    int myId = threadIdx.x + blockDim.x * blockIdx.x; // ID relative to the whole array
    int tid = threadIdx.x;                            // local ID within the current block

    // Stage this block's slice of the input in shared memory
    __shared__ float temp[BLOCK_SIZE];
    temp[tid] = in[myId];
    __syncthreads(); // make sure the entire block is loaded!

    // Do the reduction in shared memory, halving the number of active
    // threads at each step (requires blockDim.x to be a power of two)
    for (unsigned int s = blockDim.x / 2; s >= 1; s >>= 1) {
        if (tid < s) {
            temp[tid] += temp[tid + s];
        }
        __syncthreads(); // make sure all adds at this stage are done!
    }

    // Only thread 0 writes this block's result back to global memory
    if (tid == 0) {
        out[blockIdx.x] = temp[0];
    }
}
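
The lab header com2039_lab5_reduce_gpu.h is not part of the paste, so its exact contents are unknown. The program relies on it for N, BLOCK_SIZE and the reduceKernel prototype, with N a power-of-two multiple of BLOCK_SIZE and N / BLOCK_SIZE no larger than BLOCK_SIZE. A minimal sketch of what such a header could look like, with purely illustrative values, is:

// com2039_lab5_reduce_gpu.h -- hypothetical sketch; the real lab header is
// not shown in the paste. N and BLOCK_SIZE are illustrative values chosen so
// that N is a power-of-two multiple of BLOCK_SIZE and N / BLOCK_SIZE <= BLOCK_SIZE.
#ifndef COM2039_LAB5_REDUCE_GPU_H
#define COM2039_LAB5_REDUCE_GPU_H

#include <cstdio>
#include <cuda_runtime.h>

#define N (1 << 20)       // 1,048,576 input elements (illustrative)
#define BLOCK_SIZE 1024   // threads per block (illustrative)

__global__ void reduceKernel(float* out, float* in);

#endif

Assuming the source above lives in a file such as com2039_lab5_reduce_gpu.cu (the filename is a guess), it would typically be built with nvcc, e.g. nvcc com2039_lab5_reduce_gpu.cu -o reduce.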