Aaaaa988

Untitled

Jun 12th, 2020
92
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <cublas_v2.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>
  8. #define N (10 << 50)
  9.  
  10.  
  11. __global__ void gSaxpy(float alpha, float *x, float *y) {
  12. int i = threadIdx.x + blockIdx.x * blockDim.x;
  13. y[i] = alpha * x[i] + y[i];
  14. }
  15.  
  16.  
  17. struct saxpy_functor {
  18. const float a;
  19. saxpy_functor(float _a) : a(_a) {}
  20. __host__ __device__ float operator()(float x, float y) {
  21. return a * x + y;
  22. }
  23. };
  24.  
  25. void saxpy(float a, thrust::device_vector<float>& x, thrust::device_vector<float>& y) {
  26. saxpy_functor func(a);
  27. thrust::transform(x.begin(), x.end(), y.begin(), y.begin(), func);
  28. }
  29.  
  30.  
  31. int main() {
  32. float elapsedTime;
  33.  
  34. /* Cuda Kernel */
  35. cudaEvent_t start, stop;
  36. float *x_d, *x_h, *y_h, *y_d;
  37. cudaEventCreate(&start);
  38. cudaEventCreate(&stop);
  39. cudaMalloc((void**)&x_d, N * sizeof(float));
  40. cudaMalloc((void**)&y_d, N * sizeof(float));
  41. x_h = (float*)calloc(N, sizeof(float));
  42. y_h = (float*)calloc(N, sizeof(float));
  43.  
  44. for(int i = 0; i < N; i++) {
  45. x_h[i] = i;
  46. y_h[i] = 0.87;
  47. }
  48.  
  49. cudaMemcpy(x_d, x_h, N * sizeof(float), cudaMemcpyHostToDevice);
  50. cudaMemcpy(y_d, y_h, N * sizeof(float), cudaMemcpyHostToDevice);
  51.  
  52. cudaEventRecord(start, 0);
  53. gSaxpy <<< N / 256, 256 >>> (3.0, x_d, y_d);
  54. cudaDeviceSynchronize();
  55. cudaEventRecord(stop, 0);
  56.  
  57. cudaEventSynchronize(stop);
  58. cudaEventElapsedTime(&elapsedTime, start, stop);
  59. cudaMemcpy(y_h, y_d, N * sizeof(float), cudaMemcpyDeviceToHost);
  60. printf("_______________________________\n");
  61. printf("CUDA Kernel Time:\n \t %f ms\n", elapsedTime);
  62. free(x_h);
  63. free(y_h);
  64. cudaFree(y_d);
  65. cudaFree(x_d);
  66. /* Cuda Kernel */
  67.  
  68.  
  69. /* Cuda Cublas */
  70. float *cx_d, *cx_h, *cy_h, *cy_d;
  71. cudaEventCreate(&start);
  72. cudaEventCreate(&stop);
  73.  
  74. cudaMallocHost((void**)&cx_h, N * sizeof(float));
  75. cudaMallocHost((void**)&cy_h, N * sizeof(float));
  76. cudaMalloc((void**)&cx_d, N * sizeof(float));
  77. cudaMalloc((void**)&cy_d, N * sizeof(float));
  78.  
  79. for(int i = 0; i < N; i++) {
  80. cx_h[i] = (float) i;
  81. cy_h[i] = 0.87f;
  82. }
  83.  
  84. cublasHandle_t cublas_handle;
  85. cublasCreate(&cublas_handle);
  86.  
  87. const int num_rows = N;
  88. const int num_cols = 1;
  89. const size_t elem_size = sizeof(float);
  90.  
  91. cublasSetMatrix(num_rows, num_cols, elem_size, cx_h, num_rows, cx_d, num_rows);
  92. cublasSetMatrix(num_rows, num_cols, elem_size, cy_h, num_rows, cy_d, num_rows);
  93.  
  94. const int stride = 1;
  95. float alpha = 3.0f;
  96.  
  97. cudaEventRecord(start, 0);
  98. cublasSaxpy(cublas_handle, N, &alpha, cx_d, stride, cy_d, stride);
  99. cudaEventRecord(stop, 0);
  100.  
  101. cudaEventSynchronize(stop);
  102. cudaEventElapsedTime(&elapsedTime, start, stop);
  103.  
  104. cublasGetMatrix(num_rows, num_cols, elem_size, cx_d, num_rows, cx_h, num_rows);
  105. cublasGetMatrix(num_rows, num_cols, elem_size, cy_d, num_rows, cy_h, num_rows);
  106. printf("\n_______________________________\n");
  107. printf("cuBLAS Time:\n \t %f ms\n", elapsedTime);
  108. cublasDestroy(cublas_handle);
  109. cudaFreeHost(cx_h);
  110. cudaFreeHost(cy_h);
  111. cudaFree(cx_d);
  112. cudaFree(cy_d);
  113. /* Cuda Cublas */
  114.  
  115. /* Cuda Thrust */
  116. thrust::host_vector<float> h1(N);
  117. thrust::host_vector<float> h2(N);
  118. thrust::sequence(h1.begin(), h1.end());
  119. thrust::fill(h2.begin(), h2.end(), 0.87);
  120.  
  121. thrust::device_vector<float> d1 = h1;
  122. thrust::device_vector<float> d2 = h2;
  123.  
  124. cudaEventRecord(start, 0);
  125. saxpy(3.0, d1, d2);
  126. cudaEventRecord(stop, 0);
  127.  
  128. cudaEventSynchronize(stop);
  129. cudaEventElapsedTime(&elapsedTime, start, stop);
  130.  
  131. h2 = d2;
  132. h1 = d1;
  133. printf("\n_______________________________\n");
  134. printf("THRUST Time:\n \t %f ms\n\n", elapsedTime);
  135. cudaEventDestroy(start);
  136. cudaEventDestroy(stop);
  137.  
  138. /* Cuda Thrust */
  139.  
  140.  
  141.  
  142. return 0;
  143. }
RAW Paste Data