Advertisement
Guest User

Untitled

a guest
Jun 29th, 2016
68
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  // Element-wise vector addition on the device: c[i] = a[i] + b[i] for every
  // i in [0, n).
  //
  // Launch layout: 1D grid of 1D blocks. Any grid/block size is valid because
  // the kernel uses a grid-stride loop: each thread starts at its global index
  // and advances by the total number of launched threads until it passes n.
  // Uses no shared memory.
  //
  // Parameters:
  //   a, b - input arrays, each holding at least n ints, readable by the device
  //   c    - output array with room for at least n ints, writable by the device
  //   n    - number of elements to process; n <= 0 makes the kernel a no-op
  __global__ void add(const int* a, const int* b, int* c, int n)
  {
    // Index and stride are kept in 64 bits: with a plain int, adding the stride
    // to an index just below n can overflow when n is close to INT_MAX.
    const long long stride = (long long)blockDim.x * gridDim.x;
    for (long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         idx < n;
         idx += stride)
    {
      c[idx] = a[idx] + b[idx];
    }
  }
  10.  
  // Fills the first n elements of `a` with the value T(1).
  // Despite the name, no random numbers are generated: every element gets the
  // same constant. Does nothing when n <= 0.
  template<typename T>
  void random_ints(T* a, int n)
  {
    T* cursor = a;
    int remaining = n;
    while (remaining > 0)
    {
      *cursor = T(1);
      ++cursor;
      --remaining;
    }
  }
  19.  
  // Verifies that each of the first n elements of `c` equals 2 and reports the
  // outcome on standard output.
  //
  // Prints "OK <n>" when every element matches (or "OK 0" when n <= 0), and
  // "WA <index>" with the index of the first mismatching element otherwise.
  void check_results(int* c, int n)
  {
    // Advance past the leading run of correct values.
    int i = 0;
    while (i < n && c[i] == 2)
    {
      ++i;
    }

    // Reaching (or starting beyond) the end means no mismatch was found.
    const char* verdict = (i >= n) ? "OK" : "WA";
    std::cout << verdict << " " << i << std::endl;
  }
  33.  
  // Prints the CUDA error string and terminates the process when `status` is
  // not cudaSuccess; does nothing otherwise.
  static void checkCudaOrExit(cudaError_t status)
  {
    if (status != cudaSuccess)
    {
      // print the CUDA error message and exit
      printf("CUDA error: %s\n", cudaGetErrorString(status));
      exit(-1);
    }
  }

  // Adds two N-element vectors of ones on device 0, splitting the work into
  // StreamsCount equal contiguous chunks. Each chunk gets its own stream that
  // runs: host-to-device copy of both inputs, the add() kernel, and the
  // device-to-host copy of the result. After all streams finish, the result is
  // validated with check_results().
  //
  // Any CUDA failure prints "CUDA error: ..." and exits the process with -1.
  void AddVectorsViaStreams()
  {
    const int StreamsCount = 2;
    const int N = 67108864;
    const int M = 1024; // threads per block

    // N is an exact multiple of StreamsCount, so equal chunks cover everything.
    const int chunkLen = N / StreamsCount;

    // cudaMalloc / cudaMemcpyAsync take sizes in BYTES, not in elements.
    const size_t totalBytes = static_cast<size_t>(N) * sizeof(int);
    const size_t chunkBytes = static_cast<size_t>(chunkLen) * sizeof(int);

    int *a, *b, *c;       // host copies of a, b, c
    int *d_a, *d_b, *d_c; // device copies of a, b, c

    // Alloc space for device copies of a, b, c
    checkCudaOrExit(cudaMalloc((void **)&d_a, totalBytes));
    checkCudaOrExit(cudaMalloc((void **)&d_b, totalBytes));
    checkCudaOrExit(cudaMalloc((void **)&d_c, totalBytes));

    // Alloc page-locked host buffers: cudaMemcpyAsync can only overlap with
    // other work when the host side of the transfer is pinned memory.
    checkCudaOrExit(cudaMallocHost((void **)&a, totalBytes));
    checkCudaOrExit(cudaMallocHost((void **)&b, totalBytes));
    checkCudaOrExit(cudaMallocHost((void **)&c, totalBytes));

    // Setup input values; zero the output so stale data cannot pass the check.
    random_ints(a, N);
    random_ints(b, N);
    for (int i = 0; i < N; ++i)
    {
      c[i] = 0;
    }

    cudaDeviceProp devProp;
    checkCudaOrExit(cudaGetDeviceProperties(&devProp, 0));

    cudaStream_t streams[StreamsCount];
    for (int i = 0; i < StreamsCount; ++i)
    {
      checkCudaOrExit(cudaStreamCreate(&streams[i]));
    }

    // The launch is one-dimensional, so the x-dimension limit (index 0)
    // applies. The kernel's grid-stride loop keeps the result correct even
    // when the grid is clamped below ceil(chunkLen / M).
    int blocksCount = std::min(devProp.maxGridSize[0], (chunkLen + M - 1) / M);
    std::cout << "blocks count: " << blocksCount << std::endl;
    std::cout << "threads per block: " << M << std::endl;

    for (int i = 0; i < StreamsCount; ++i)
    {
      const int offset = i * chunkLen; // first element owned by stream i

      checkCudaOrExit(cudaMemcpyAsync(d_a + offset, a + offset, chunkBytes,
        cudaMemcpyHostToDevice, streams[i]));

      checkCudaOrExit(cudaMemcpyAsync(d_b + offset, b + offset, chunkBytes,
        cudaMemcpyHostToDevice, streams[i]));

      add<<<blocksCount, M, 0, streams[i]>>>(d_a + offset, d_b + offset,
        d_c + offset, chunkLen);
      // Kernel launches return nothing; launch-configuration errors surface here.
      checkCudaOrExit(cudaGetLastError());

      checkCudaOrExit(cudaMemcpyAsync(c + offset, d_c + offset, chunkBytes,
        cudaMemcpyDeviceToHost, streams[i]));
    }

    // Wait for every stream; asynchronous execution errors are reported here.
    checkCudaOrExit(cudaDeviceSynchronize());

    // The per-stream device-to-host copies have already filled c.
    check_results(c, N);

    // Cleanup
    for (int i = 0; i < StreamsCount; ++i)
    {
      checkCudaOrExit(cudaStreamDestroy(streams[i]));
    }

    checkCudaOrExit(cudaFreeHost(a));
    checkCudaOrExit(cudaFreeHost(b));
    checkCudaOrExit(cudaFreeHost(c));
    checkCudaOrExit(cudaFree(d_a));
    checkCudaOrExit(cudaFree(d_b));
    checkCudaOrExit(cudaFree(d_c));
  }
  138.  
  // Program entry point: runs the stream-based vector addition and returns 0.
  // Calls to AddVectors() and MulMatrices() are left disabled.
  int main(void)
  {
    AddVectorsViaStreams();

    return 0;
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement