Advertisement
alexsetyaev

task_9_pinned_unified

Oct 1st, 2021
1,105
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 6.03 KB | None | 0 0
  1. //
  2. //  main.cpp
  3. //  
  4. //
  5. //  Created by Elijah Afanasiev on 25.09.2018.
  6. //
  7. //
  8.  
// System includes
#include <assert.h>
#include <stdio.h>

#include <cerrno>
#include <cfloat>
#include <chrono>
#include <climits>
#include <cstdlib>
#include <iostream>

// CUDA runtime
#include <cuda.h>
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>
  20.  
  21. using namespace std;
  22.  
  23. #ifndef MAX
  24. #define MAX(a,b) (a > b ? a : b)
  25. #endif
  26.  
  27. __global__ void vectorAddGPU(float *a, float *b, float *c, int N)
  28. {
  29.     int idx = blockIdx.x*blockDim.x + threadIdx.x;
  30.    
  31.     if (idx < N)
  32.     {
  33.         c[idx] = a[idx] + b[idx];
  34.     }
  35. }
  36.  
  37. void unified_sample(int size = 1048576)
  38. {  
  39.     printf("unified\n");
  40.     int n = size;
  41.    
  42.     int nBytes = n*sizeof(float);
  43.    
  44.     float *a, *b;  // host data
  45.     float *c;  // results
  46.    
  47.     dim3 block(256);
  48.     dim3 grid((unsigned int)ceil(n/(float)block.x));
  49.    
  50.     cudaEvent_t start, stop;
  51.     cudaEventCreate(&start);
  52.     cudaEventCreate(&stop);
  53.    
  54.     cudaEventRecord(start);
  55.  
  56.  
  57.    
  58.     cudaMallocManaged(&a, nBytes);
  59.     cudaMallocManaged(&b, nBytes);
  60.     cudaMallocManaged(&c, nBytes);
  61.  
  62.  
  63.  
  64.     for(int i=0;i<n;i++)
  65.     {
  66.         a[i] = rand() / (float)RAND_MAX;
  67.         b[i] = rand() / (float)RAND_MAX;
  68.         c[i] = 0;
  69.     }
  70.    
  71.     vectorAddGPU<<<grid, block>>>(a, b, c, n);
  72.    
  73.     cudaEventRecord(stop);
  74.     cudaEventSynchronize(stop);
  75.     float milliseconds = 0;
  76.     cudaEventElapsedTime(&milliseconds, start, stop);
  77.     printf("time: %f ms\n", milliseconds);
  78.    
  79.     cudaThreadSynchronize();
  80.    
  81.     cudaFree(a);
  82.     cudaFree(b);
  83.     cudaFree(c);
  84. }
  85.  
  86. void pinned_sample(int size = 1048576)
  87. {  
  88.     printf("pinned\n");
  89.     int n = size;
  90.    
  91.     int nBytes = n*sizeof(float);
  92.    
  93.     float *a, *b;  // host data
  94.     float *c;  // results
  95.    
  96.     auto start1 = std::chrono::steady_clock::now();
  97.     a = (float *)malloc(nBytes);
  98.     b = (float *)malloc(nBytes);
  99.     c = (float *)malloc(nBytes);
  100.     cudaHostRegister(a, nBytes, 0);
  101.     cudaHostRegister(b, nBytes, 0);
  102.     cudaHostRegister(c, nBytes, 0);
  103.     auto end = std::chrono::steady_clock::now();
  104.     std::chrono::duration<double> elapsed_seconds = end-start1;
  105.     cout << "Pinned host malloc time: " << (elapsed_seconds.count())*1000.0 << " ms" << endl;
  106.    
  107.     float *a_d,*b_d,*c_d;
  108.    
  109.     dim3 block(256);
  110.     dim3 grid((unsigned int)ceil(n/(float)block.x));
  111.    
  112.     for(int i=0;i<n;i++)
  113.     {
  114.         a[i] = rand() / (float)RAND_MAX;
  115.         b[i] = rand() / (float)RAND_MAX;
  116.         c[i] = 0;
  117.     }
  118.    
  119.     start1 = std::chrono::steady_clock::now();
  120.     cudaMalloc((void **)&a_d,n*sizeof(float));
  121.     cudaMalloc((void **)&b_d,n*sizeof(float));
  122.     cudaMalloc((void **)&c_d,n*sizeof(float));
  123.     end = std::chrono::steady_clock::now();
  124.     elapsed_seconds = end-start1;
  125.     cout << "Pinned device malloc time: " << (elapsed_seconds.count())*1000.0 << " ms" << endl;
  126.  
  127.    
  128.    
  129.     cudaEvent_t start, stop;
  130.     cudaEventCreate(&start);
  131.     cudaEventCreate(&stop);
  132.    
  133.     cudaEventRecord(start);
  134.  
  135.     start1 = std::chrono::steady_clock::now();
  136.     cudaMemcpy(a_d,a,n*sizeof(float), cudaMemcpyHostToDevice);
  137.     cudaMemcpy(b_d,b,n*sizeof(float), cudaMemcpyHostToDevice);
  138.     end = std::chrono::steady_clock::now();
  139.     elapsed_seconds = end-start1;
  140.     cout << "Pinned copy time: " << (elapsed_seconds.count())*1000.0 << " ms" << endl;
  141.    
  142.     vectorAddGPU<<<grid, block>>>(a_d, b_d, c_d, n);
  143.    
  144.     cudaEventRecord(stop);
  145.     cudaEventSynchronize(stop);
  146.     float milliseconds = 0;
  147.     cudaEventElapsedTime(&milliseconds, start, stop);
  148.     printf("time: %f ms\n", milliseconds);
  149.    
  150.     cudaThreadSynchronize();
  151.    
  152.     cudaFree(a_d);
  153.     cudaFree(b_d);
  154.     cudaFree(c_d);
  155.     cudaHostUnregister(a);
  156.     cudaHostUnregister(b);
  157.     cudaHostUnregister(c);
  158.     free(a);
  159.     free(b);
  160.     free(c);
  161. }
  162.  
  163. void usual_sample(int size = 1048576)
  164. {  
  165.     printf("usual\n");
  166.     int n = size;
  167.    
  168.     int nBytes = n*sizeof(float);
  169.    
  170.     float *a, *b;  // host data
  171.     float *c;  // results
  172.    
  173.     auto start1 = std::chrono::steady_clock::now();
  174.     a = (float *)malloc(nBytes);
  175.     b = (float *)malloc(nBytes);
  176.     c = (float *)malloc(nBytes);
  177.     auto end = std::chrono::steady_clock::now();
  178.     std::chrono::duration<double> elapsed_seconds = end-start1;
  179.     cout << "Usual host malloc time: " << (elapsed_seconds.count())*1000.0 << " ms" << endl;
  180.    
  181.     float *a_d,*b_d,*c_d;
  182.    
  183.     dim3 block(256);
  184.     dim3 grid((unsigned int)ceil(n/(float)block.x));
  185.    
  186.     for(int i=0;i<n;i++)
  187.     {
  188.         a[i] = rand() / (float)RAND_MAX;
  189.         b[i] = rand() / (float)RAND_MAX;
  190.         c[i] = 0;
  191.     }
  192.    
  193.    
  194.     start1 = std::chrono::steady_clock::now();
  195.     cudaMalloc((void **)&a_d,n*sizeof(float));
  196.     cudaMalloc((void **)&b_d,n*sizeof(float));
  197.     cudaMalloc((void **)&c_d,n*sizeof(float));
  198.     end = std::chrono::steady_clock::now();
  199.     elapsed_seconds = end-start1;
  200.     cout << "Usual device malloc time: " << (elapsed_seconds.count())*1000.0 << " ms" << endl;
  201.    
  202.    
  203.     cudaEvent_t start, stop;
  204.     cudaEventCreate(&start);
  205.     cudaEventCreate(&stop);
  206.    
  207.     cudaEventRecord(start);
  208.  
  209.     start1 = std::chrono::steady_clock::now();
  210.     cudaMemcpy(a_d,a,n*sizeof(float), cudaMemcpyHostToDevice);
  211.     cudaMemcpy(b_d,b,n*sizeof(float), cudaMemcpyHostToDevice);
  212.     end = std::chrono::steady_clock::now();
  213.     elapsed_seconds = end-start1;
  214.     cout << "Usual copy time: " << (elapsed_seconds.count())*1000.0 << " ms" << endl;
  215.    
  216.    
  217.     vectorAddGPU<<<grid, block>>>(a_d, b_d, c_d, n);
  218.    
  219.     cudaEventRecord(stop);
  220.     cudaEventSynchronize(stop);
  221.     float milliseconds = 0;
  222.     cudaEventElapsedTime(&milliseconds, start, stop);
  223.     printf("time: %f ms\n", milliseconds);
  224.    
  225.     cudaThreadSynchronize();
  226.    
  227.     cudaFree(a_d);
  228.     cudaFree(b_d);
  229.     cudaFree(c_d);
  230. }
  231.  
  232.  
  233. int main(int argc, char **argv)
  234. {
  235.     int n = atoi(argv[1]);
  236.     usual_sample(n);
  237.     printf("\n");
  238.     pinned_sample(n);
  239.     printf("\n");
  240.     unified_sample(n);
  241.    
  242.     return 0;
  243. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement