Aaaaa988

Untitled

Jun 12th, 2020
98
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #include <stdio.h>
  2. #include <cublas_v2.h>
  3. #include <thrust/host_vector.h>
  4. #include <thrust/device_vector.h>
  5. #include <thrust/transform.h>
  6. #include <thrust/fill.h>
  7. #include <thrust/sequence.h>
  8. #define N 9000000
  9.  
  /**
   * Binary function object computing a * x + y for a single pair of floats
   * (the per-element step of SAXPY).
   *
   * The scalar `a` is captured at construction and never changes afterwards.
   * Both the constructor and the call operator are usable from host and
   * device code.
   */
  struct saxpyFunctor {
      const float a;  // scalar multiplier applied to x

      /** Stores the scalar multiplier `_a`. */
      __host__ __device__ saxpyFunctor(float _a) : a(_a) {}

      /**
       * Returns a * x + y.
       *
       * Declared const because it only reads `a`; this lets the functor be
       * invoked through const objects and const references.
       */
      __host__ __device__ float operator()(float x, float y) const {
          return a * x + y;
      }
  };
  17.  
  /**
   * SAXPY kernel: y[i] = alpha * x[i] + y[i] for every i in [0, n).
   *
   * Launch layout: 1-D grid of 1-D blocks. Any grid and block size is valid:
   * each thread starts at its global index and advances by the total number
   * of launched threads (grid-stride loop), so the whole range is covered even
   * when the grid holds fewer than n threads, and no thread touches an index
   * >= n when the grid holds more.
   *
   * @param alpha scalar multiplier
   * @param x     input vector, at least n floats, only read
   * @param y     input/output vector, at least n floats, updated in place
   * @param n     number of elements to process; defaults to the file-level N
   *              so existing three-argument launches keep working
   */
  __global__ void cudaSaxpy(float alpha, const float *x, float *y, int n = N) {
      // A non-positive n means there is nothing to do.
      const size_t count = n > 0 ? (size_t)n : 0;
      // Widen before multiplying so the index arithmetic cannot wrap in 32 bits.
      const size_t stride = (size_t)gridDim.x * blockDim.x;

      for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < count; i += stride) {
          y[i] = alpha * x[i] + y[i];
      }
  }
  22.  
  /**
   * Thrust SAXPY: overwrites y with a * x + y, element by element.
   *
   * @param a scalar multiplier
   * @param x first input vector; its range [begin, end) decides how many
   *          elements are processed
   * @param y second input vector, also used as the output; it is read and
   *          written starting from y.begin()
   */
  void saxpy(float a, thrust::device_vector<float>& x, thrust::device_vector<float>& y) {
      thrust::transform(x.begin(), x.end(),  // first operand range
                        y.begin(),           // second operand
                        y.begin(),           // results are written back over y
                        saxpyFunctor(a));
  }
  27.  
  /* Reports a failed CUDA runtime call on stderr; returns true only for cudaSuccess. */
  static bool cudaKernelStepOk(cudaError_t status, const char *step) {
      if (status == cudaSuccess) {
          return true;
      }
      fprintf(stderr, "CudaKernel: %s failed: %s\n", step, cudaGetErrorString(status));
      return false;
  }

  /**
   * Benchmarks the hand-written cudaSaxpy kernel on N floats and prints the
   * elapsed time in milliseconds, measured with CUDA events around the launch.
   *
   * x is filled with 0, 1, 2, ... and y with 0.87; the kernel is run with
   * alpha = 3 and the result is copied back to the host before printing.
   *
   * Every CUDA call is checked. On the first failure a message goes to stderr,
   * the remaining steps are skipped, no time is printed, and all resources
   * acquired so far (host buffers, device buffers, events) are released.
   */
  void CudaKernel(){
      const int threadsPerBlock = 256;
      const size_t bytes = N * sizeof(float);

      float elapsedTime = 0.0f;
      cudaEvent_t start = NULL, stop = NULL;
      float *x_d = NULL, *y_d = NULL;
      float *x_h = NULL, *y_h = NULL;

      bool ok = cudaKernelStepOk(cudaEventCreate(&start), "cudaEventCreate(start)")
             && cudaKernelStepOk(cudaEventCreate(&stop), "cudaEventCreate(stop)")
             && cudaKernelStepOk(cudaMalloc((void**)&x_d, bytes), "cudaMalloc(x_d)")
             && cudaKernelStepOk(cudaMalloc((void**)&y_d, bytes), "cudaMalloc(y_d)");

      if (ok) {
          x_h = (float*)calloc(N, sizeof(float));
          y_h = (float*)calloc(N, sizeof(float));
          if (x_h == NULL || y_h == NULL) {
              fprintf(stderr, "CudaKernel: host allocation failed\n");
              ok = false;
          }
      }

      if (ok) {
          for (int i = 0; i < N; i++) {
              x_h[i] = (float)i;
              y_h[i] = 0.87f;
          }
          ok = cudaKernelStepOk(cudaMemcpy(x_d, x_h, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(x_d <- x_h)")
            && cudaKernelStepOk(cudaMemcpy(y_d, y_h, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(y_d <- y_h)");
      }

      if (ok) {
          ok = cudaKernelStepOk(cudaEventRecord(start, 0), "cudaEventRecord(start)");
      }

      if (ok) {
          // Integer division truncates, so the grid never holds more than N
          // threads. The kernel is called without a length argument, so
          // rounding the block count up here could launch threads past the
          // end of the buffers.
          // NOTE(review): when N is not a multiple of threadsPerBlock, whether
          // the last N % threadsPerBlock elements are processed depends on the
          // kernel iterating beyond its own grid size -- confirm.
          cudaSaxpy <<< N / threadsPerBlock, threadsPerBlock >>> (3.0f, x_d, y_d);

          // The launch itself returns nothing; configuration errors surface
          // through cudaGetLastError(), execution errors through the event
          // synchronisation below. No host-side device synchronisation is
          // placed between the two event records, so the measured interval is
          // bounded by the events alone.
          ok = cudaKernelStepOk(cudaGetLastError(), "cudaSaxpy launch")
            && cudaKernelStepOk(cudaEventRecord(stop, 0), "cudaEventRecord(stop)")
            && cudaKernelStepOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)")
            && cudaKernelStepOk(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime")
            && cudaKernelStepOk(cudaMemcpy(y_h, y_d, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(y_h <- y_d)");
      }

      if (ok) {
          printf("CUDA Kernel Time:\n \t %f ms\n", elapsedTime);
      }

      // Cleanup runs on every path; free(NULL) and cudaFree(NULL) are no-ops.
      free(x_h);
      free(y_h);
      cudaFree(y_d);
      cudaFree(x_d);
      if (start != NULL) {
          cudaEventDestroy(start);
      }
      if (stop != NULL) {
          cudaEventDestroy(stop);
      }
  }
  61.  
  /* Reports a failed CUDA runtime call on stderr; returns true only for cudaSuccess. */
  static bool cudaCublasRuntimeOk(cudaError_t status, const char *step) {
      if (status == cudaSuccess) {
          return true;
      }
      fprintf(stderr, "cudaCublas: %s failed: %s\n", step, cudaGetErrorString(status));
      return false;
  }

  /* Reports a failed cuBLAS call on stderr; returns true only for CUBLAS_STATUS_SUCCESS. */
  static bool cudaCublasBlasOk(cublasStatus_t status, const char *step) {
      if (status == CUBLAS_STATUS_SUCCESS) {
          return true;
      }
      fprintf(stderr, "cudaCublas: %s failed with cuBLAS status %d\n", step, (int)status);
      return false;
  }

  /**
   * Benchmarks cublasSaxpy on N floats and prints the elapsed time in
   * milliseconds, measured with CUDA events around the cuBLAS call.
   *
   * Host buffers are allocated with cudaMallocHost. x is filled with
   * 0, 1, 2, ... and y with 0.87; both are uploaded as N x 1 matrices, SAXPY
   * is run with alpha = 3 and unit stride, and both vectors are downloaded
   * again before printing.
   *
   * Every CUDA and cuBLAS call is checked. On the first failure a message goes
   * to stderr, the remaining steps are skipped, no time is printed, and all
   * resources acquired so far (buffers, cuBLAS handle, events) are released.
   */
  void cudaCublas(){
      const size_t bytes = N * sizeof(float);
      const int num_rows = N;
      const int num_cols = 1;
      const int elem_size = (int)sizeof(float);  // cuBLAS takes the element size as int
      const int stride = 1;
      float alpha = 3.0f;

      float elapsedTime = 0.0f;
      cudaEvent_t start = NULL, stop = NULL;
      float *cx_h = NULL, *cy_h = NULL;
      float *cx_d = NULL, *cy_d = NULL;
      cublasHandle_t cublas_handle = NULL;

      bool ok = cudaCublasRuntimeOk(cudaEventCreate(&start), "cudaEventCreate(start)")
             && cudaCublasRuntimeOk(cudaEventCreate(&stop), "cudaEventCreate(stop)")
             && cudaCublasRuntimeOk(cudaMallocHost((void**)&cx_h, bytes), "cudaMallocHost(cx_h)")
             && cudaCublasRuntimeOk(cudaMallocHost((void**)&cy_h, bytes), "cudaMallocHost(cy_h)")
             && cudaCublasRuntimeOk(cudaMalloc((void**)&cx_d, bytes), "cudaMalloc(cx_d)")
             && cudaCublasRuntimeOk(cudaMalloc((void**)&cy_d, bytes), "cudaMalloc(cy_d)");

      if (ok) {
          for (int i = 0; i < N; i++) {
              cx_h[i] = (float) i;
              cy_h[i] = 0.87f;
          }
          ok = cudaCublasBlasOk(cublasCreate(&cublas_handle), "cublasCreate");
      }

      if (ok) {
          // The vectors are transferred as single-column matrices whose
          // leading dimension equals the row count.
          ok = cudaCublasBlasOk(cublasSetMatrix(num_rows, num_cols, elem_size, cx_h, num_rows, cx_d, num_rows),
                                "cublasSetMatrix(cx)")
            && cudaCublasBlasOk(cublasSetMatrix(num_rows, num_cols, elem_size, cy_h, num_rows, cy_d, num_rows),
                                "cublasSetMatrix(cy)");
      }

      if (ok) {
          ok = cudaCublasRuntimeOk(cudaEventRecord(start, 0), "cudaEventRecord(start)")
            && cudaCublasBlasOk(cublasSaxpy(cublas_handle, N, &alpha, cx_d, stride, cy_d, stride), "cublasSaxpy")
            && cudaCublasRuntimeOk(cudaEventRecord(stop, 0), "cudaEventRecord(stop)")
            && cudaCublasRuntimeOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)")
            && cudaCublasRuntimeOk(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
      }

      if (ok) {
          ok = cudaCublasBlasOk(cublasGetMatrix(num_rows, num_cols, elem_size, cx_d, num_rows, cx_h, num_rows),
                                "cublasGetMatrix(cx)")
            && cudaCublasBlasOk(cublasGetMatrix(num_rows, num_cols, elem_size, cy_d, num_rows, cy_h, num_rows),
                                "cublasGetMatrix(cy)");
      }

      if (ok) {
          printf("cuBLAS Time:\n \t %f ms\n", elapsedTime);
      }

      // Cleanup runs on every path; only resources that were actually
      // acquired are released.
      if (cublas_handle != NULL) {
          cublasDestroy(cublas_handle);
      }
      if (cx_h != NULL) {
          cudaFreeHost(cx_h);
      }
      if (cy_h != NULL) {
          cudaFreeHost(cy_h);
      }
      cudaFree(cx_d);
      cudaFree(cy_d);
      if (start != NULL) {
          cudaEventDestroy(start);
      }
      if (stop != NULL) {
          cudaEventDestroy(stop);
      }
  }
  108.  
  109.  
  /* Reports a failed CUDA runtime call on stderr; returns true only for cudaSuccess. */
  static bool thrustLibStepOk(cudaError_t status, const char *step) {
      if (status == cudaSuccess) {
          return true;
      }
      fprintf(stderr, "ThrustLib: %s failed: %s\n", step, cudaGetErrorString(status));
      return false;
  }

  /**
   * Benchmarks the Thrust-based saxpy() on N floats and prints the elapsed
   * time in milliseconds, measured with CUDA events around the call.
   *
   * The first host vector is filled with thrust::sequence, the second with
   * 0.87; both are copied into device vectors, saxpy() is run with a = 3, and
   * both device vectors are copied back to the host before printing.
   *
   * The CUDA event calls are checked. On the first failure a message goes to
   * stderr, the remaining steps are skipped and no time is printed. Events
   * that were created are destroyed before returning.
   */
  void ThrustLib(){
      float elapsedTime = 0.0f;
      cudaEvent_t start = NULL, stop = NULL;

      bool ok = thrustLibStepOk(cudaEventCreate(&start), "cudaEventCreate(start)")
             && thrustLibStepOk(cudaEventCreate(&stop), "cudaEventCreate(stop)");

      if (ok) {
          thrust::host_vector<float> h1(N);
          thrust::host_vector<float> h2(N);
          thrust::sequence(h1.begin(), h1.end());
          thrust::fill(h2.begin(), h2.end(), 0.87f);

          thrust::device_vector<float> d1 = h1;
          thrust::device_vector<float> d2 = h2;

          ok = thrustLibStepOk(cudaEventRecord(start, 0), "cudaEventRecord(start)");
          if (ok) {
              saxpy(3.0f, d1, d2);
              ok = thrustLibStepOk(cudaEventRecord(stop, 0), "cudaEventRecord(stop)")
                && thrustLibStepOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)")
                && thrustLibStepOk(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
          }

          if (ok) {
              // Copy the results back to the host before reporting.
              h2 = d2;
              h1 = d1;
              printf("Thrust Time:\n \t %f ms\n\n", elapsedTime);
          }
      }

      if (start != NULL) {
          cudaEventDestroy(start);
      }
      if (stop != NULL) {
          cudaEventDestroy(stop);
      }
  }
  137.  
  138.  
  /**
   * Program entry point: prints the assignment banner, then runs the three
   * SAXPY benchmarks in order (raw CUDA kernel, cuBLAS, Thrust).
   *
   * The banner text is in Russian: assignment title, course name
   * ("GPU programming"), the topic (performance analysis of linear-algebra
   * programs using Thrust, cuBLAS and raw CUDA C) and the authors.
   */
  int main() {
      // None of the banner strings contains a conversion specifier, so they
      // are written verbatim.
      static const char *const banner[] = {
          "\t\tРасчетно-графическое задание\n",
          "\tПо дисциплине ''Программирование графических процессоров''\n",
          "\tТема: Провести анализ производительности программ,\nреализующих алгоритмы линейной алгебры с использованием библиотек\nThrust, cuBLAS и «сырого» кода на CUDA C.\n",
          "\nВыполнили: студенты группы ИП-715 Киселев В.С. и Пляскина А.Ю.\n\n"
      };
      const int bannerCount = (int)(sizeof(banner) / sizeof(banner[0]));

      for (int line = 0; line < bannerCount; ++line) {
          fputs(banner[line], stdout);
      }

      CudaKernel();
      cudaCublas();
      ThrustLib();
      return 0;
  }
RAW Paste Data