pi_stuff

Test CPU usage during CUDA kernels, cudaSetDeviceFlags

Sep 15th, 2021
779
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. /*
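  2.   Measure host CPU usage while waiting on a CUDA kernel under each cudaSetDeviceFlags scheduling mode (default, spin, yield, block).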
  3.  
  4.   https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g69e73c7dda3fc05306ae7c811a690fac
  5.  
  6. Linux
  7.  
  8. $ ./test_cpu_usage 25000000 128 default
  9. Wall clock time 0.978 seconds, cpu time 0.977 seconds
  10.  
  11. $ ./test_cpu_usage 25000000 128 spin
  12. Wall clock time 0.995 seconds, cpu time 0.995 seconds
  13.  
  14. $ ./test_cpu_usage 25000000 128 yield
  15. Wall clock time 0.998 seconds, cpu time 0.997 seconds
  16.  
  17. $ ./test_cpu_usage 25000000 128 block
  18. Wall clock time 1.001 seconds, cpu time 0.000 seconds
  19.  
  20.  
  21. Windows
  22.  
  23. $ ./test_cpu_usage 25000000 128 default
  24. Wall clock time 0.974 seconds, cpu time 0.984 seconds
  25.  
  26. $ ./test_cpu_usage 25000000 128 spin
  27. Wall clock time 0.993 seconds, cpu time 1.000 seconds
  28.  
  29. $ ./test_cpu_usage 25000000 128 yield
  30. Wall clock time 0.969 seconds, cpu time 0.297 seconds
  31.  
  32. $ ./test_cpu_usage 25000000 128 block
  33. Wall clock time 0.972 seconds, cpu time 0.000 seconds
  34.  
  35. */
  36.  
  #include <cerrno>
  #include <climits>
  #include <cstdint>
  #include <cstdio>
  #include <cstdlib>
  #include <cstring>
  #include <time.h>
  #include <cuda.h>
  #include <cuda_runtime.h>
  42.  
  // Evaluates a CUDA runtime call and terminates the program with exit(1) if
  // the call did not return cudaSuccess. The failing expression, its source
  // location and the runtime's error string are written to stderr.
  //
  // The body is wrapped in do { ... } while (0) so the macro expands to a
  // single statement and can be used in an unbraced if/else. The argument is
  // evaluated exactly once, and the temporary uses an unusual name so it cannot
  // capture a variable called `err` that appears in the argument expression.
  #define CUDA_ERROR_CHECK(expr)                                          \
    do {                                                                  \
      cudaError_t cuda_check_err_ = (expr);                               \
      if (cuda_check_err_ != cudaSuccess) {                               \
        fprintf(stderr, "CUDA call failed at %s:%d: %s\n%s\n", __FILE__,  \
                __LINE__, #expr, cudaGetErrorString(cuda_check_err_));    \
        exit(1);                                                          \
      }                                                                   \
    } while (0)
  53.  
  54.  
  55. #ifdef _WIN32
  56. #include <windows.h>
  57.  
  58. double getSeconds(void) {
  59.   static LARGE_INTEGER queryPerfFrequency = {0};
  60.   LARGE_INTEGER counter;
  61.  
  62.   if (queryPerfFrequency.QuadPart == 0) {
  63.     QueryPerformanceFrequency(&queryPerfFrequency);
  64.   }
  65.  
  66.   QueryPerformanceCounter(&counter);
  67.  
  68.   return counter.QuadPart / (double) queryPerfFrequency.QuadPart;
  69. }
  70.  
  71.  
  72. double getCPUSeconds() {
  73.   FILETIME creation_time, exit_time, kernel_time, user_time;
  74.   GetProcessTimes(GetCurrentProcess(), &creation_time, &exit_time, &kernel_time, &user_time);
  75.   uint64_t intervals_100_nanos = user_time.dwLowDateTime + ((uint64_t)user_time.dwHighDateTime << 32);
  76.   return intervals_100_nanos / 10000000.;
  77. }
  78.  
  79. #else
  80.  
  // Returns the current CLOCK_MONOTONIC reading in seconds.
  // Callers measure elapsed wall-clock time by subtracting two readings.
  double getSeconds() {
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);

    const double whole_seconds = now.tv_sec;
    const double fractional_seconds = 1e-9 * now.tv_nsec;
    return whole_seconds + fractional_seconds;
  }
  86.  
  87.  
  // Returns the processor time reported by clock(), converted to seconds.
  double getCPUSeconds() {
    const clock_t ticks = clock();
    return static_cast<double>(ticks) / CLOCKS_PER_SEC;
  }
  91.  
  92. #endif
  93.  
  // Keeps the GPU occupied for a caller-controlled number of iterations.
  //
  // Each participating thread owns one element of `data`: it adds 3.0 once and
  // then applies x = 2x - 3 to that element `iters` times. Starting from 0.0
  // the element becomes 3.0 and stays there, since 2 * 3 - 3 == 3.
  //
  // Launch layout: 1-D blocks in a 1-D grid. The element index is the flat
  // global thread index, so a single-block launch behaves as before and a
  // multi-block launch gives every thread a distinct element.
  //
  // Parameters:
  //   iters - number of loop iterations each thread performs
  //   size  - number of elements in `data`; threads whose index is >= size,
  //           and all threads when size <= 0, return without touching memory
  //   data  - pointer to at least `size` doubles, updated in place
  __global__ void busy_kernel(int iters, int size, double *data) {
    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Explicit non-positive check: comparing the unsigned index against a
    // negative int directly would convert it to a huge unsigned value and let
    // every thread through.
    if (size <= 0 || idx >= (unsigned int) size) return;

    data[idx] += 3.0;

    for (int i = 0; i < iters; i++) {
      data[idx] = data[idx] * 2. - 3.;
    }
  }
  103.  
  104.  
  // Parses a base-10 integer in [min_value, INT_MAX] from `text`.
  // Returns true and stores the value in *out on success. Returns false when
  // `text` has no digits, has trailing characters, or is out of range.
  static bool parseIntArg(const char *text, int min_value, int *out) {
    char *end = NULL;
    errno = 0;
    const long value = strtol(text, &end, 10);
    if (end == text || *end != '\0' || errno == ERANGE) return false;
    if (value < min_value || value > INT_MAX) return false;
    *out = (int) value;
    return true;
  }


  // Entry point.
  //
  // Usage: test_cpu_usage <iters> <GPU thread count> default|spin|yield|block
  //
  // Applies the requested scheduling flag with cudaSetDeviceFlags, launches
  // busy_kernel as one block of <GPU thread count> threads, waits for it with
  // cudaDeviceSynchronize, and prints the wall-clock and CPU time spent between
  // the launch and the end of the wait.
  //
  // Returns 0 on success and 1 on a usage or argument error. A failing CUDA call
  // terminates the program through CUDA_ERROR_CHECK.
  int main(int argc, char **argv) {
    int iters, size;

    if (argc != 4) {
      printf("\n  test_cpu_usage <iters> <GPU thread count> default|spin|yield|block\n\n");
      return 1;
    }

    // Reject non-numeric or out-of-range arguments up front. A negative size
    // would otherwise reach new[] and the launch configuration.
    if (!parseIntArg(argv[1], 0, &iters)) {
      printf("Invalid iteration count: %s\n", argv[1]);
      return 1;
    }
    if (!parseIntArg(argv[2], 1, &size)) {
      printf("Invalid GPU thread count: %s\n", argv[2]);
      return 1;
    }

    unsigned int sync_flag = cudaDeviceScheduleAuto;
    if (!strcmp(argv[3], "spin")) {
      sync_flag = cudaDeviceScheduleSpin;
    } else if (!strcmp(argv[3], "yield")) {
      sync_flag = cudaDeviceScheduleYield;
    } else if (!strcmp(argv[3], "block")) {
      sync_flag = cudaDeviceScheduleBlockingSync;
    } else if (strcmp(argv[3], "default")) {
      printf("Invalid sync mode: %s\n", argv[3]);
      return 1;
    }

    CUDA_ERROR_CHECK(cudaSetDeviceFlags(sync_flag));

    double *data, *data_dev;

    // The host buffer exists only to zero-initialize the device buffer.
    data = new double[size];
    memset(data, 0, sizeof(double) * size);

    CUDA_ERROR_CHECK(cudaMalloc((void**) &data_dev, sizeof(double) * size));
    CUDA_ERROR_CHECK(cudaMemcpy(data_dev, data, sizeof(double) * size, cudaMemcpyHostToDevice));

    double wall_time = getSeconds();
    double cpu_time = getCPUSeconds();
    busy_kernel<<<1, size>>>(iters, size, data_dev);

    // A kernel launch does not return an error code. An invalid configuration,
    // such as a thread count above the device's per-block limit, is reported
    // through cudaGetLastError.
    CUDA_ERROR_CHECK(cudaGetLastError());

    CUDA_ERROR_CHECK(cudaDeviceSynchronize());
    wall_time = getSeconds() - wall_time;
    cpu_time = getCPUSeconds() - cpu_time;

    printf("Wall clock time %.3f seconds, cpu time %.3f seconds\n", wall_time, cpu_time);

    CUDA_ERROR_CHECK(cudaFree(data_dev));
    delete[] data;

    return 0;
  }
  152.  
RAW Paste Data