concurrentKernels.cu

/*
 * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

//
// This sample demonstrates the use of streams for concurrent execution. It also illustrates how to
// introduce dependencies between CUDA streams with the cudaStreamWaitEvent function introduced
// in CUDA 3.2.
//
// Devices of compute capability 1.x will run the kernels one after another.
// Devices of compute capability 2.0 or higher can overlap the kernels.
//
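//
// The pattern used below: each of the nkernels clock_block kernels is launched into its own
// stream and followed by an event record in that stream. The last stream is made to wait on
// each of those events with cudaStreamWaitEvent, so the final sum kernel and the asynchronous
// copy back to the host start only after all clock_block kernels have completed.
//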
#include <stdio.h>
#include <helper_functions.h>
#include <helper_cuda.h>

// This is a kernel that does no real work but runs at least for a specified number of clocks
__global__ void clock_block(clock_t *d_o, clock_t clock_count)
{
    unsigned int start_clock = (unsigned int) clock();

    clock_t clock_offset = 0;

    while (clock_offset < clock_count)
    {
        unsigned int end_clock = (unsigned int) clock();

        // The code below should work like
        // this (thanks to modular arithmetic):
        //
        // clock_offset = (clock_t) (end_clock > start_clock ?
        //                           end_clock - start_clock :
        //                           end_clock + (0xffffffffu - start_clock));
        //
        // Indeed, let m = 2^32; then
        // end - start = end + m - start (mod m).
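        //
        // For example, if start_clock = 0xFFFFFFF0 and the counter wraps so that end_clock = 0x00000010,
        // the unsigned subtraction end_clock - start_clock yields 0x20 = 32 elapsed clocks,
        // which equals end + 2^32 - start.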

        clock_offset = (clock_t)(end_clock - start_clock);
    }

    d_o[0] = clock_offset;
}


// Single warp reduction kernel
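// The kernel is launched below with a single block of 32 threads (one warp), so the
// shared-memory tree reduction can assume blockDim.x == 32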
__global__ void sum(clock_t *d_clocks, int N)
{
    __shared__ clock_t s_clocks[32];

    clock_t my_sum = 0;

    for (int i = threadIdx.x; i < N; i += blockDim.x)
    {
        my_sum += d_clocks[i];
    }

    s_clocks[threadIdx.x] = my_sum;
    __syncthreads();

    for (int i = 16; i > 0; i /= 2)
    {
        if (threadIdx.x < i)
        {
            s_clocks[threadIdx.x] += s_clocks[threadIdx.x + i];
        }

        __syncthreads();
    }

    d_clocks[0] = s_clocks[0];
}

int main(int argc, char **argv)
{
    int nkernels = 8;               // number of concurrent kernels
    int nstreams = nkernels + 1;    // use one more stream than concurrent kernels
    int nbytes = nkernels * sizeof(clock_t);   // number of data bytes
    float kernel_time = 10;         // time each kernel should run in ms
    float elapsed_time;             // timing variable
    int cuda_device = 0;

    printf("[%s] - Starting...\n", argv[0]);

    // get number of kernels if overridden on the command line
    if (checkCmdLineFlag(argc, (const char **)argv, "nkernels"))
    {
        nkernels = getCmdLineArgumentInt(argc, (const char **)argv, "nkernels");
        nstreams = nkernels + 1;
        nbytes   = nkernels * sizeof(clock_t);   // keep the buffer size in sync with nkernels
    }

    // use the command-line specified CUDA device, otherwise use the device with highest Gflops/s
    cuda_device = findCudaDevice(argc, (const char **)argv);

    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDevice(&cuda_device));

    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));

    if (deviceProp.concurrentKernels == 0)
    {
        printf("> GPU does not support concurrent kernel execution\n");
        printf("  CUDA kernel runs will be serialized\n");
    }

    printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
           deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);

    // allocate host memory
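    // (pinned via cudaMallocHost so that the cudaMemcpyAsync issued later can overlap with execution)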
    clock_t *a = 0;                     // pointer to the array data in host memory
    checkCudaErrors(cudaMallocHost((void **)&a, nbytes));

    // allocate device memory
    clock_t *d_a = 0;                   // pointer to the array data in device memory
    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));

    // allocate and initialize an array of stream handles
    cudaStream_t *streams = (cudaStream_t *) malloc(nstreams * sizeof(cudaStream_t));

    for (int i = 0; i < nstreams; i++)
    {
        checkCudaErrors(cudaStreamCreate(&(streams[i])));
    }

    // create CUDA event handles
    cudaEvent_t start_event, stop_event;
    checkCudaErrors(cudaEventCreate(&start_event));
    checkCudaErrors(cudaEventCreate(&stop_event));

    // the per-kernel events are used for synchronization only and hence do not need to record timings;
    // creating them with cudaEventDisableTiming also means that recording them does not introduce
    // global sync points, which is critical to get overlap
    cudaEvent_t *kernelEvent;
    kernelEvent = (cudaEvent_t *) malloc(nkernels * sizeof(cudaEvent_t));

    for (int i = 0; i < nkernels; i++)
    {
        checkCudaErrors(cudaEventCreateWithFlags(&(kernelEvent[i]), cudaEventDisableTiming));
    }

    //////////////////////////////////////////////////////////////////////
    // time execution with nkernels streams
    clock_t total_clocks = 0;
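    // deviceProp.clockRate is given in kHz, i.e. clock cycles per millisecond, so multiplying by
    // kernel_time (in ms) yields the number of cycles each clock_block kernel should spin for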
    clock_t time_clocks = (clock_t)(kernel_time * deviceProp.clockRate);

    checkCudaErrors(cudaEventRecord(start_event, 0));

    // queue nkernels in separate streams and record when they are done
    for (int i = 0; i < nkernels; ++i)
    {
        clock_block<<<1,1,0,streams[i]>>>(&d_a[i], time_clocks);
        total_clocks += time_clocks;
        checkCudaErrors(cudaEventRecord(kernelEvent[i], streams[i]));

        // make the last stream wait on the event just recorded in stream i
        checkCudaErrors(cudaStreamWaitEvent(streams[nstreams-1], kernelEvent[i], 0));
    }

    // queue a sum kernel and a copy back to host in the last stream.
    // the commands in this stream are executed only after all the recorded kernel events have completed
    sum<<<1,32,0,streams[nstreams-1]>>>(d_a, nkernels);
    checkCudaErrors(cudaMemcpyAsync(a, d_a, sizeof(clock_t), cudaMemcpyDeviceToHost, streams[nstreams-1]));

    // at this point the CPU has dispatched all work for the GPU and can continue processing other tasks in parallel

    // in this sample we just wait until the GPU is done
    checkCudaErrors(cudaEventRecord(stop_event, 0));
    checkCudaErrors(cudaEventSynchronize(stop_event));
    checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));

    printf("Expected time for serial execution of %d kernels = %.3fs\n", nkernels, nkernels * kernel_time/1000.0f);
    printf("Expected time for concurrent execution of %d kernels = %.3fs\n", nkernels, kernel_time/1000.0f);
    printf("Measured time for sample = %.3fs\n", elapsed_time/1000.0f);

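    // each clock_block spins until it has seen at least time_clocks cycles, so the reduced sum
    // in a[0] is expected to exceed total_clocks (= nkernels * time_clocks) if everything ran correctly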
    bool bTestResult = (a[0] > total_clocks);

    // release resources
    for (int i = 0; i < nkernels; i++)
    {
        cudaEventDestroy(kernelEvent[i]);
    }

    // nstreams (= nkernels + 1) streams were created, so destroy all of them
    for (int i = 0; i < nstreams; i++)
    {
        cudaStreamDestroy(streams[i]);
    }

    free(streams);
    free(kernelEvent);

    cudaEventDestroy(start_event);
    cudaEventDestroy(stop_event);
    cudaFreeHost(a);
    cudaFree(d_a);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    if (!bTestResult)
    {
        printf("Test failed!\n");
        exit(EXIT_FAILURE);
    }

    printf("Test passed\n");
    exit(EXIT_SUCCESS);
}