daily pastebin goal
24%
SHARE
TWEET

Untitled

a guest Mar 22nd, 2019 61 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/time.h>
  #include <cuda.h>
  #include <cuda_runtime.h>
  #include <math.h>
  5.  
  /*
   * Returns the current wall-clock time in seconds as a double, combining the
   * whole-second and microsecond fields reported by gettimeofday().
   */
  double wtime()
  {
      struct timeval now;
      gettimeofday(&now, NULL);

      const double whole_seconds = (double)now.tv_sec;
      const double microseconds = (double)now.tv_usec;
      return whole_seconds + microseconds * 1E-6;
  }
  12.  
  /*
   * Element-wise kernel: stores (b_device[i] + c_device[i]) squared into
   * a_device[i], where i is the thread's flat 1D global index.
   *
   * There is no bounds check, so the total number of launched threads
   * (gridDim.x * blockDim.x) must not exceed the length of any of the three
   * arrays.
   */
  __global__ void add(float *a_device, float *b_device, float *c_device)
  {
      const int idx = blockIdx.x * blockDim.x + threadIdx.x;

      /* Keep the sum in a register and write the squared value once. */
      const float sum = b_device[idx] + c_device[idx];
      a_device[idx] = sum * sum;
  }
  19.  
  /* Prints a diagnostic and terminates the program if a CUDA runtime call failed. */
  static void checkCuda(cudaError_t status, const char *what)
  {
      if (status != cudaSuccess)
      {
          fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
          exit(EXIT_FAILURE);
      }
  }

  /*
   * Compares host<->device transfer times for ordinary (calloc) host buffers
   * copied with cudaMemcpy against page-locked (cudaHostAlloc) host buffers
   * copied with cudaMemcpyAsync, running the `add` kernel in between.
   *
   * Both sets of host input buffers are filled with the same values, so the
   * kernel sees valid input no matter which upload ran last. The asynchronous
   * copies are issued on the default stream and the timer is stopped only
   * after cudaDeviceSynchronize(), so the measurement covers the transfer
   * itself rather than just the cost of enqueueing it.
   *
   * Returns 0 on success; exits with EXIT_FAILURE on any allocation or CUDA
   * error.
   */
  int main()
  {
      const int threads = 32;
      const int num_of_blocks = 386;
      /* One thread per element: the kernel has no bounds check. */
      const int N = threads * num_of_blocks;
      const size_t bytes = (size_t)N * sizeof(float);

      /* Ordinary host buffers. */
      float *a = (float *)calloc(N, sizeof(*a));
      float *b = (float *)calloc(N, sizeof(*b));
      float *c = (float *)calloc(N, sizeof(*c));
      if (a == NULL || b == NULL || c == NULL)
      {
          fprintf(stderr, "host allocation failed\n");
          free(a);
          free(b);
          free(c);
          return EXIT_FAILURE;
      }

      float *a_device = NULL;
      float *b_device = NULL;
      float *c_device = NULL;

      /* Page-locked host buffers used for the asynchronous copies. */
      float *a_async = NULL;
      float *b_async = NULL;
      float *c_async = NULL;

      checkCuda(cudaMalloc((void **)&a_device, bytes), "cudaMalloc(a_device)");
      checkCuda(cudaMalloc((void **)&b_device, bytes), "cudaMalloc(b_device)");
      checkCuda(cudaMalloc((void **)&c_device, bytes), "cudaMalloc(c_device)");
      checkCuda(cudaHostAlloc((void **)&a_async, bytes, cudaHostAllocDefault), "cudaHostAlloc(a_async)");
      checkCuda(cudaHostAlloc((void **)&b_async, bytes, cudaHostAllocDefault), "cudaHostAlloc(b_async)");
      checkCuda(cudaHostAlloc((void **)&c_async, bytes, cudaHostAllocDefault), "cudaHostAlloc(c_async)");

      /* Fill both input sets identically; the pinned buffers are uploaded last
         and therefore are what the kernel actually consumes. */
      for (int i = 0; i < N; i++)
      {
          b[i] = (float)i;
          c[i] = (float)i;
          b_async[i] = (float)i;
          c_async[i] = (float)i;
      }

      double cpyDef = -wtime();
      checkCuda(cudaMemcpy(b_device, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b)");
      checkCuda(cudaMemcpy(c_device, c, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(c)");
      cpyDef += wtime();

      double cpyAsync = -wtime();
      checkCuda(cudaMemcpyAsync(b_device, b_async, bytes, cudaMemcpyHostToDevice), "cudaMemcpyAsync(b_async)");
      checkCuda(cudaMemcpyAsync(c_device, c_async, bytes, cudaMemcpyHostToDevice), "cudaMemcpyAsync(c_async)");
      /* Wait for the transfers to finish before stopping the clock. */
      checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize(host->device)");
      cpyAsync += wtime();

      printf("host->device: \n" "def: %lf\n" "paging: %lf\n\n", cpyDef, cpyAsync);

      add <<< num_of_blocks, threads >>> (a_device, b_device, c_device);
      /* Launch-configuration errors are reported here; execution errors at the sync. */
      checkCuda(cudaGetLastError(), "add kernel launch");
      checkCuda(cudaDeviceSynchronize(), "add kernel execution");

      double backDef = -wtime();
      checkCuda(cudaMemcpy(a, a_device, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(a)");
      backDef += wtime();

      double backAsync = -wtime();
      checkCuda(cudaMemcpyAsync(a_async, a_device, bytes, cudaMemcpyDeviceToHost), "cudaMemcpyAsync(a_async)");
      checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize(device->host)");
      backAsync += wtime();

      printf("device->host:\n" "def: %lf\n" "paging: %lf\n\n", backDef, backAsync);
      printf("summary:\n" "def: %lf\n" "paging: %lf\n", cpyDef + backDef, cpyAsync + backAsync);

      /* Release pinned, device and ordinary host memory. */
      checkCuda(cudaFreeHost(a_async), "cudaFreeHost(a_async)");
      checkCuda(cudaFreeHost(b_async), "cudaFreeHost(b_async)");
      checkCuda(cudaFreeHost(c_async), "cudaFreeHost(c_async)");
      checkCuda(cudaFree(a_device), "cudaFree(a_device)");
      checkCuda(cudaFree(b_device), "cudaFree(b_device)");
      checkCuda(cudaFree(c_device), "cudaFree(c_device)");
      free(a);
      free(b);
      free(c);

      return 0;
  }
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top