// Test CPU usage during CUDA kernels, cudaSetDeviceFlags

/*
  Test blocking modes.

  https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g69e73c7dda3fc05306ae7c811a690fac

Linux

$ ./test_cpu_usage 25000000 128 default
Wall clock time 0.978 seconds, cpu time 0.977 seconds

$ ./test_cpu_usage 25000000 128 spin
Wall clock time 0.995 seconds, cpu time 0.995 seconds

$ ./test_cpu_usage 25000000 128 yield
Wall clock time 0.998 seconds, cpu time 0.997 seconds

$ ./test_cpu_usage 25000000 128 block
Wall clock time 1.001 seconds, cpu time 0.000 seconds


Windows

$ ./test_cpu_usage 25000000 128 default
Wall clock time 0.974 seconds, cpu time 0.984 seconds

$ ./test_cpu_usage 25000000 128 spin
Wall clock time 0.993 seconds, cpu time 1.000 seconds

$ ./test_cpu_usage 25000000 128 yield
Wall clock time 0.969 seconds, cpu time 0.297 seconds

$ ./test_cpu_usage 25000000 128 block
Wall clock time 0.972 seconds, cpu time 0.000 seconds

*/

#include <cstdio>
#include <cstring>
#include <cstdint>
#include <time.h>
#include <cuda.h>

// A macro for checking the error codes of cuda runtime calls.
//
// Evaluates `expr` exactly once. If the result is not cudaSuccess, prints the
// source location and the cudaGetErrorString() text to stderr, then
// terminates the process with exit status 1.
//
// The body is wrapped in do { } while (0) so that `CUDA_ERROR_CHECK(x);`
// behaves as a single statement (safe in an unbraced if/else), and the local
// uses a reserved-looking name so it cannot capture a caller variable that
// appears inside `expr`.
#define CUDA_ERROR_CHECK(expr)                                          \
  do {                                                                  \
    const cudaError_t cuda_error_check_status_ = (expr);                \
    if (cuda_error_check_status_ != cudaSuccess) {                      \
      fprintf(stderr, "CUDA call failed at %s:%d\n%s\n", __FILE__,      \
              __LINE__, cudaGetErrorString(cuda_error_check_status_));  \
      exit(1);                                                          \
    }                                                                   \
  } while (0)


#ifdef _WIN32
#include <windows.h>

double getSeconds(void) {
  static LARGE_INTEGER queryPerfFrequency = {0};
  LARGE_INTEGER counter;

  if (queryPerfFrequency.QuadPart == 0) {
    QueryPerformanceFrequency(&queryPerfFrequency);
  }

  QueryPerformanceCounter(&counter);

  return counter.QuadPart / (double) queryPerfFrequency.QuadPart;
}


double getCPUSeconds() {
  FILETIME creation_time, exit_time, kernel_time, user_time;
  GetProcessTimes(GetCurrentProcess(), &creation_time, &exit_time, &kernel_time, &user_time);
  uint64_t intervals_100_nanos = user_time.dwLowDateTime + ((uint64_t)user_time.dwHighDateTime << 32);
  return intervals_100_nanos / 10000000.;
}

#else

// Returns the CLOCK_MONOTONIC reading in seconds as a double. Only
// differences between two readings are meaningful.
double getSeconds() {
  struct timespec now;
  clock_gettime(CLOCK_MONOTONIC, &now);

  const double whole_seconds = now.tv_sec;
  const double fractional_seconds = 1e-9 * now.tv_nsec;
  return whole_seconds + fractional_seconds;
}


// Returns the processor time reported by clock(), converted from clock ticks
// to seconds.
double getCPUSeconds() {
  const clock_t ticks = clock();
  const double ticks_as_double = (double) ticks;
  return ticks_as_double / CLOCKS_PER_SEC;
}

#endif

/*
  Keeps the GPU occupied for a time proportional to `iters`.

  Launch layout: 1D grid of 1D blocks, one thread per element of `data`.
  Threads whose flat global index is >= `size` exit immediately, so the grid
  may be larger than `size`. No shared memory is used.

  iters  number of multiply-subtract iterations each thread performs
  size   number of elements in `data`; nothing is touched if it is <= 0
  data   buffer of at least `size` doubles, updated in place
*/
__global__ void busy_kernel(int iters, int size, double *data) {
  // Reject non-positive sizes explicitly: comparing the unsigned thread
  // index against a negative int would otherwise convert it to a huge
  // unsigned bound and let every thread through.
  if (size <= 0) return;

  // Flat global index, so each thread owns a distinct element even when the
  // kernel is launched with more than one block.
  const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= (unsigned int) size) return;

  data[idx] += 3.0;

  for (int i = 0; i < iters; i++) {
    data[idx] = data[idx] * 2. - 3.;
  }
}


/*
  Launches busy_kernel once and reports the wall clock time and the host CPU
  time that elapse between the launch and the return of
  cudaDeviceSynchronize(), under the requested host scheduling mode.

  Usage: test_cpu_usage <iters> <GPU thread count> default|spin|yield|block

  Returns 0 on success and 1 on invalid arguments. Any failing CUDA call
  terminates the process through CUDA_ERROR_CHECK.
*/
int main(int argc, char **argv) {
  if (argc != 4) {
    printf("\n  test_cpu_usage <iters> <GPU thread count> default|spin|yield|block\n\n");
    return 1;
  }

  const int iters = atoi(argv[1]);
  const int size = atoi(argv[2]);

  // atoi yields 0 for non-numeric text, and a non-positive thread count can
  // neither size the buffers nor form a valid launch configuration.
  if (size <= 0) {
    printf("Invalid GPU thread count: %s\n", argv[2]);
    return 1;
  }

  // Translate the mode name into the matching cudaSetDeviceFlags flag.
  unsigned int sync_flag = cudaDeviceScheduleAuto;
  if (!strcmp(argv[3], "spin")) {
    sync_flag = cudaDeviceScheduleSpin;
  } else if (!strcmp(argv[3], "yield")) {
    sync_flag = cudaDeviceScheduleYield;
  } else if (!strcmp(argv[3], "block")) {
    sync_flag = cudaDeviceScheduleBlockingSync;
  } else if (strcmp(argv[3], "default")) {
    printf("Invalid sync mode: %s\n", argv[3]);
    return 1;
  }

  CUDA_ERROR_CHECK(cudaSetDeviceFlags(sync_flag));

  const size_t bytes = sizeof(double) * size;

  // Zero-filled host staging buffer used only to initialize the device data.
  double *data = new double[size]();
  double *data_dev = NULL;

  CUDA_ERROR_CHECK(cudaMalloc((void**) &data_dev, bytes));
  CUDA_ERROR_CHECK(cudaMemcpy(data_dev, data, bytes, cudaMemcpyHostToDevice));

  // The blocking copy has returned, so the host buffer is no longer needed.
  delete[] data;

  double wall_time = getSeconds();
  double cpu_time = getCPUSeconds();
  busy_kernel<<<1, size>>>(iters, size, data_dev);

  // A launch does not return an error itself; an invalid configuration (for
  // example more threads than a single block allows) is only visible through
  // cudaGetLastError(). Without this check the timing below would silently
  // describe a kernel that never ran.
  CUDA_ERROR_CHECK(cudaGetLastError());

  CUDA_ERROR_CHECK(cudaDeviceSynchronize());
  wall_time = getSeconds() - wall_time;
  cpu_time = getCPUSeconds() - cpu_time;

  printf("Wall clock time %.3f seconds, cpu time %.3f seconds\n", wall_time, cpu_time);

  CUDA_ERROR_CHECK(cudaFree(data_dev));

  return 0;
}