albaSANDROS

Untitled

Apr 1st, 2021 (edited)
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <curand.h>
#include <stdio.h>
#include <iostream>
#include <iomanip>
#include <chrono>
#include <intrin.h>
#ifndef __CUDACC__
#define __CUDACC__
#endif
#include <cuda_runtime_api.h>

using namespace std;

#pragma intrinsic(__rdtsc)

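// Matrix dimensions: the data is N x M bytes; the *_int values halve each
// dimension, so the N_int x M_int grid of 4-byte words covers the same N * M bytes.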
#define N 1000
#define N_int (N / 2)
#define M 2000
#define M_int (M / 2)

// One thread per 4-byte word: the kernel works on the N_int x M_int grid of ints.
#define THREAD_COUNT (N_int * M_int)
#define BLOCK_COUNT ((THREAD_COUNT + 127) / 128)   // integer ceiling, 128 threads per block

#define CHECK_ERROR( call )                                                     \
do {                                                                            \
    cudaError_t result = call;                                                  \
    if ( cudaSuccess != result ) {                                              \
        cerr << "CUDA error " << result << " in " << __FILE__ << ":" << __LINE__\
             << ": " << cudaGetErrorString( result ) << " (" << #call << ")"    \
             << endl;                                                           \
        exit(1);                                                                \
    }                                                                           \
} while (0)

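// Fill the host buffer a with N * M random bytes generated on the device with
// cuRAND, and return the device copy so the kernel can read it directly.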
int* randomInit(char* a) {
    int size = N * M;                                          // size in bytes
    curandGenerator_t generator;
    curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_XORWOW);
    curandSetPseudoRandomGeneratorSeed(generator, 0ULL);
    int* d_a;
    CHECK_ERROR(cudaMalloc(&d_a, size));
    curandGenerate(generator, (unsigned int*)d_a, size / 4);   // 4 bytes per generated word
    CHECK_ERROR(cudaMemcpy(a, d_a, size, cudaMemcpyDeviceToHost));
    curandDestroyGenerator(generator);
    return d_a;
}

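// Print the top-left 10 x 10 corner of a result matrix for a quick visual check.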
void printPart(char* b) {
    for (int i = 0; i < 10; i++) {
        for (int j = 0; j < 10; j++) {
            cout << setw(15) << (int)b[i * N + j] << " |";
        }
        cout << endl;
    }
}

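// CPU reference: mirror the N x M source across its secondary diagonal,
// i.e. b[i][j] = a[N-1-j][M-1-i], one byte at a time.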
void CPU(char* a, char* b) {
    for (int i = 0; i < M; i++) {
        for (int j = 0; j < N; j++) {
            b[i * N + j] = a[(N - j) * M - i - 1];
        }
    }
}

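// GPU version of the same transform on 4-byte words over the N_int x M_int grid.
// Each 128-thread block stages its words in shared memory before writing them out.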
__global__ void GPU(int* a, int* b) {
    __shared__ char temp[512];          // 128 threads * 4 bytes per thread
    char buffer[4];

    int currentThread = blockIdx.x * blockDim.x + threadIdx.x;
    int i = currentThread / N_int;
    int j = currentThread % N_int;
    bool inRange = currentThread < N_int * M_int;

    if (inRange) {
        *(int*)buffer = a[(N_int - j) * M_int - i - 1];

        temp[4 * threadIdx.x]     = buffer[0];
        temp[4 * threadIdx.x + 1] = buffer[1];
        temp[4 * threadIdx.x + 2] = buffer[2];
        temp[4 * threadIdx.x + 3] = buffer[3];
    }

    // Every thread of the block must reach the barrier, in range or not.
    __syncthreads();

    if (inRange)
        b[i * N_int + j] = *(int*)(&temp[4 * threadIdx.x]);
}

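// Byte-wise comparison of the CPU and GPU results.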
bool equal(char* b1, char* b2) {
    for (int i = 0; i < N * M; i++) {
        if (b1[i] != b2[i]) {
            return false;
        }
    }
    return true;
}

int main()
{
    char* a = (char*)malloc(N * M);
    char* b = (char*)malloc(N * M);
    char* gpuResult = (char*)malloc(N * M);
    //CHECK_ERROR(cudaMallocHost(&a, N * M));
    //CHECK_ERROR(cudaMallocHost(&b, N * M));

    cout << setw(30) << left << "Threads count: " << THREAD_COUNT << endl;
    cout << setw(30) << left << "Blocks count: " << BLOCK_COUNT << endl;
    //cout << setw(30) << left << "Thread per block: " << threadsInBlock << endl << endl;

    int* d_a = randomInit(a);              // fills a on the host, returns the device copy
    int* d_b;

    CHECK_ERROR(cudaMalloc(&d_b, N * M));

    cout << sizeof(a[0]) << endl;          // sanity check: char vs. int element sizes
    cout << sizeof(d_a[0]) << endl;

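    // Time the CPU reference with the TSC and the kernel with CUDA events.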
    cudaEvent_t start, stop;
    CHECK_ERROR(cudaEventCreate(&start));
    CHECK_ERROR(cudaEventCreate(&stop));

    auto start_time = __rdtsc();
    CPU(a, b);
    auto end_time = __rdtsc();
    double time = (double)(end_time - start_time);
    cout << setw(30) << left << "CPU time: " << time / 3590000 << " ms" << endl; // 3.59 GHz TSC: ticks / 3590000 gives ms

    float timeGPU = 0.0f;

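    // Launch the transform, time it with the events, then copy the result back
    // for verification against the CPU version.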
    CHECK_ERROR(cudaEventRecord(start));
    GPU <<< BLOCK_COUNT, 128 >>> (d_a, d_b);
    CHECK_ERROR(cudaGetLastError());                 // catch launch failures
    CHECK_ERROR(cudaEventRecord(stop));
    CHECK_ERROR(cudaEventSynchronize(stop));
    CHECK_ERROR(cudaEventElapsedTime(&timeGPU, start, stop));
    cout << setw(30) << left << "GPU time: " << timeGPU << " ms" << endl;
    CHECK_ERROR(cudaMemcpy(gpuResult, d_b, N * M * sizeof(char), cudaMemcpyDeviceToHost));
    CHECK_ERROR(cudaEventDestroy(start));
    CHECK_ERROR(cudaEventDestroy(stop));
    CHECK_ERROR(cudaFree(d_b));
    CHECK_ERROR(cudaFree(d_a));

    printPart(b);
    cout << endl;
    printPart(gpuResult);

    cout << (equal(b, gpuResult) ? "Results are equal!" : "Results are not equal!") << endl;

    //CHECK_ERROR(cudaFreeHost(a));
    //CHECK_ERROR(cudaFreeHost(b));
    free(a);
    free(b);
    free(gpuResult);
    return 0;
}