task_8_stencil
alexsetyaev | Oct 1st, 2021 | C++
// UCSC CMPE220 Advanced Parallel Processing
// Prof. Heiner Leitz
// Author: Marcelo Siero.
// Modified from code by: Andreas Goetz (agoetz@sdsc.edu)
// CUDA program to perform 1D stencil operation in parallel on the GPU
//
// /* FIXME */ COMMENTS THAT REQUIRE ATTENTION

#include <iostream>
#include <stdio.h>
#include <cmath>          // fabs() used in checkResults()
#include <cstdlib>        // exit() used in cudaErrorCheck()
#include <cuda_runtime.h> // host-side CUDA runtime API

// define vector length, stencil radius
//#define N (1024*1024*512l)
#define N 100000000
#define RADIUS 7
#define BLOCKSIZE 1000
// Ceiling division: the smallest grid for which gridSize * BLOCKSIZE
// covers the N - 2 * RADIUS stencil outputs.
#define GRIDSIZE (((N) - 1 - 2 * (RADIUS)) / (BLOCKSIZE) + 1)
#define EPS 1e-9

#define SAFE_CALL( CallInstruction ) { \
    cudaError_t cuerr = CallInstruction; \
    if(cuerr != cudaSuccess) { \
         printf("CUDA error: %s at call \"" #CallInstruction "\"\n", cudaGetErrorString(cuerr)); \
         throw "error in CUDA API function, aborting..."; \
    } \
}

#define SAFE_KERNEL_CALL( KernelCallInstruction ){ \
    KernelCallInstruction; \
    cudaError_t cuerr = cudaGetLastError(); \
    if(cuerr != cudaSuccess) { \
        printf("CUDA error in kernel launch: %s at kernel \"" #KernelCallInstruction "\"\n", cudaGetErrorString(cuerr)); \
        throw "error in CUDA kernel launch, aborting..."; \
    } \
    cuerr = cudaDeviceSynchronize(); \
    if(cuerr != cudaSuccess) { \
        printf("CUDA error in kernel execution: %s at kernel \"" #KernelCallInstruction "\"\n", cudaGetErrorString(cuerr)); \
        throw "error in CUDA kernel execution, aborting..."; \
    } \
}

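// Example usage (this is exactly how main() invokes the kernel below):
//   SAFE_KERNEL_CALL((stencil_1D<<<gridSize,blockSize>>>(d_in, d_out, N)));
// The extra parentheses keep the commas inside <<<...>>> from being parsed
// as macro argument separators.
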
int gridSize  = GRIDSIZE;
int blockSize = BLOCKSIZE;
cudaEvent_t start_gpu, stop_gpu;

void cudaErrorCheck() {
   // Fetch the last error raised by any preceding CUDA runtime call.
   // On error, print the message provided by the CUDA API and exit
   // with a failure code.
   cudaError_t cuerr = cudaGetLastError();
   if (cuerr != cudaSuccess) {
      printf("CUDA error: %s\n", cudaGetErrorString(cuerr));
      exit(EXIT_FAILURE);
   }
}

void start_timer() {
   cudaEventCreate(&start_gpu);
   cudaEventCreate(&stop_gpu);
   cudaEventRecord(start_gpu);
}

float stop_timer() {
   float time_gpu = 0;

   // Record the stop event, wait for it to complete, then read the
   // elapsed time (in milliseconds) between the two events.
   cudaEventRecord(stop_gpu);
   cudaEventSynchronize(stop_gpu);
   cudaEventElapsedTime(&time_gpu, start_gpu, stop_gpu);

   cudaEventDestroy(start_gpu);
   cudaEventDestroy(stop_gpu);
   return(time_gpu);
}

cudaDeviceProp prop;
void getDeviceProperties() {
   // Acquire and print the device properties listed below.
   // These properties can be useful to dynamically optimize programs.  For
   // instance the number of SMs can be useful as a heuristic to determine
   // how many is a good number of blocks to use.  The total device global
   // memory might be important to know just how much data to operate on at
   // once.
   cudaGetDeviceProperties(&prop, 0);
   printf("CUDA capability (major.minor) = %d.%d\n", prop.major, prop.minor);
   printf("Total global memory           = %zu MB\n", prop.totalGlobalMem / (1024 * 1024));
   printf("Shared memory per block       = %zu KB\n", prop.sharedMemPerBlock / 1024);
   printf("Registers per block           = %d\n", prop.regsPerBlock);
   printf("Warp size                     = %d\n", prop.warpSize);
   printf("Max threads per block         = %d\n", prop.maxThreadsPerBlock);
   printf("Multiprocessors (SMs)         = %d\n", prop.multiProcessorCount);
   printf("Max threads per block dim     = (%d, %d, %d)\n",
          prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
   printf("Max blocks per grid dim       = (%d, %d, %d)\n",
          prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
}

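// A minimal sketch (not used by this program) of how the SM count could
// seed a launch-configuration heuristic, e.g. a few resident blocks per SM:
//   int heuristicBlocks = prop.multiProcessorCount * 4;  // assumption: 4 blocks/SM
// A kernel launched with that few blocks would need a grid-stride loop to
// cover all N elements; stencil_1D below assumes one element per thread.
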
void newline() { std::cout << std::endl; };

void printThreadSizes() {
   int noOfThreads = gridSize * blockSize;
   printf("Blocks            = %d\n", gridSize);  // no. of blocks to launch.
   printf("Threads per block = %d\n", blockSize); // no. of threads to launch.
   printf("Total threads     = %d\n", noOfThreads);
   printf("Number of grids   = %d\n", (N + noOfThreads -1)/ noOfThreads);
}

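// With the defaults above (N = 100000000, BLOCKSIZE = 1000) this prints:
//   Blocks            = 100000
//   Threads per block = 1000
//   Total threads     = 100000000
//   Number of grids   = 1
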
// -------------------------------------------------------
// CUDA device function that performs 1D stencil operation
// -------------------------------------------------------
__global__ void stencil_1D(double *in, double *out, long dim){

    long gindex = blockDim.x * blockIdx.x + threadIdx.x;
    // Shared buffer holds one block of input plus a halo of RADIUS
    // elements on each side.
    const int sh_mem_size = BLOCKSIZE + 2 * RADIUS;

    __shared__ double shmem[sh_mem_size];

    // Each thread stages three elements, RADIUS apart; together the block
    // fills shmem[0 .. blockDim.x + 2*RADIUS - 1] with the corresponding
    // run of in[].  Overlapping writes store identical values, so the
    // result is well defined (this assumes RADIUS <= blockDim.x).
    shmem[threadIdx.x] = in[gindex];
    shmem[threadIdx.x + RADIUS] = in[gindex + RADIUS];
    shmem[threadIdx.x + 2 * RADIUS] = in[gindex + 2 * RADIUS];

    __syncthreads();

    // Apply the stencil out of shared memory.
    double result = 0;
    for (int offset = -RADIUS; offset <= RADIUS; offset++) {
        int index = threadIdx.x + RADIUS + offset;
        result += shmem[index];
    }

    // Store the result; out[i] is centered on in[i], with i = gindex + RADIUS.
    if (gindex < dim + RADIUS) {
        out[gindex + RADIUS] = result;
    }
}

__global__ void stencil_1D_slow(double *in, double *out, long dim){

  long gindex = threadIdx.x + blockDim.x * blockIdx.x + RADIUS;

  /* FIXME PART 2 - MODIFY PROGRAM TO USE SHARED MEMORY. */

  // Apply the stencil, reading every input element directly from
  // global memory.
  double result = 0;
  for (int offset = -RADIUS; offset <= RADIUS; offset++) {
    if (gindex + offset < dim + RADIUS)
      result += in[gindex + offset];
  }

  // Store the result
  if (gindex < dim + RADIUS)
    out[gindex] = result;
}

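// Note: in this global-memory version each thread issues 2 * RADIUS + 1
// separate loads from in[]; stencil_1D above stages the same data once per
// block in shared memory and re-reads it from there, which is why it is
// the faster of the two kernels.
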
#define True  1
#define False 0
void checkResults(double *h_in, double *h_out, int DoCheck=True) {
   // DO NOT CHANGE THIS CODE.
   // CPU calculates the stencil from data in *h_in
   // if DoCheck is True (default) it compares it with *h_out
   // to check the operation of this code.
   // If DoCheck is set to False, it can be used to time the CPU.
   int i, j, ij;
   double result;
   int err = 0;
   for (i=RADIUS; i<N; i++) {  // major index.
      result = 0;
      for (j=-RADIUS; j<=RADIUS; j++){
         ij = i+j;
         if (ij < N + RADIUS)
            result += h_in[ij];
      }
      if (DoCheck) {  // print out some errors for debugging purposes.
         if (fabs(h_out[i] - result) >= EPS) { // count errors.
            err++;
            if (err < 8) { // help debug
               printf("h_out[%d]=%lf should be %lf\n",i,h_out[i], result);
            };
         }
      } else {  // for timing purposes.
         h_out[i - RADIUS] = result;
      }
   }

   if (DoCheck) { // report results.
      if (err != 0){
         printf("Error, %d elements do not match!\n", err);
      } else {
         printf("Success! All elements match CPU result.\n");
      }
   }
}

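// Sanity check for the expected values: h_in[i] = i + 1 for i >= RADIUS,
// so for interior points (2 * RADIUS <= i < N - RADIUS) the window sum is
//   sum_{j=-RADIUS..RADIUS} (i + j + 1) = (2 * RADIUS + 1) * (i + 1),
// since the symmetric j terms cancel.  With RADIUS = 7 that is 15 * (i + 1).
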
// ------------
// main program
// ------------
int main(void)
{
  double *h_in, *h_out;
  double *d_in, *d_out;
  long size =  (N + 2 * RADIUS) * sizeof(double);
  int i;

  // allocate host memory
  h_in = new double[N + 2 * RADIUS];
  h_out = new double[N + 2 * RADIUS];

  getDeviceProperties();

  // initialize vector, zero-padding the halo regions at both ends so
  // that no uninitialized values are read
  for (i=0; i<N + 2 * RADIUS; i++) {
    if (i < RADIUS || i >= N)
      h_in[i] = 0;
    else
      h_in[i] = (i+1);
  }
  newline();

  // allocate device memory
  cudaMalloc((void **)&d_in, size);
  cudaMalloc((void **)&d_out, size);
  cudaErrorCheck();

  // copy input data to device
  cudaMemcpy(d_in, h_in, size, cudaMemcpyHostToDevice);
  cudaErrorCheck();

  // Apply stencil by launching a sufficient number of blocks
  printf("\n---------------------------\n");
  printf("Launching 1D stencil kernel\n");
  printf("---------------------------\n");
  printf("Vector length     = %ld (%ld MB)\n", (long)N,
         (long)(N*sizeof(double)/1024/1024));
  printf("Stencil radius    = %d\n",RADIUS);

  //----------------------------------------------------------
  // CODE TO RUN AND TIME THE STENCIL KERNEL.
  //----------------------------------------------------------
  newline();
  printThreadSizes();
  start_timer();
  SAFE_KERNEL_CALL((stencil_1D<<<gridSize,blockSize>>>(d_in, d_out, N)));
  std::cout << "Elapsed time: " << stop_timer() << std::endl;
  // copy results back to host
  cudaMemcpy(h_out, d_out, size, cudaMemcpyDeviceToHost);
  cudaErrorCheck();
  checkResults(h_in, h_out);
  //----------------------------------------------------------

  // deallocate device memory
  cudaFree(d_in);
  cudaFree(d_out);
  cudaErrorCheck();
  //=====================================================
  // Evaluate total time of execution with just the CPU.
  //=====================================================
  newline();
  std::cout << "Running stencil with the CPU.\n";
  start_timer();
  // Use checkResults to time CPU version of the stencil with False flag.
  checkResults(h_in, h_out, False);
  std::cout << "Elapsed time: " << stop_timer() << std::endl;
  //=====================================================

  // deallocate host memory (allocated with new[], so use delete[])
  delete[] h_in;
  delete[] h_out;

  return 0;
}

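// Build and run (assuming the file is saved as task_8_stencil.cu; the
// name is a guess from the paste title):
//   nvcc -O2 -o task_8_stencil task_8_stencil.cu
//   ./task_8_stencil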