Aaaaa988

Untitled

Jun 13th, 2020
87
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #include <cuda.h>
  2. #include <stdio.h>
  3. #include <thrust/host_vector.h>
  4. #include <thrust/device_vector.h>
  5. #include <thrust/transform.h>
  6. #include <thrust/fill.h>
  7. #include <thrust/sequence.h>
  8. #include <cublas_v2.h>
  9. #include <iostream>
  10. #include <ctime>
  11. #include <time.h>
  12.  
  13. using namespace std;
  14.  
  /**
   * Binary functor computing one SAXPY element: a * x + y.
   *
   * The scale factor `a` is fixed at construction. The call operator is
   * const-qualified so the functor can be invoked through const objects and
   * const references, and both the constructor and the call operator are
   * usable from host and device code.
   */
  struct SaxpyFunctor {
      const float a;  // scale factor applied to every x operand

      __host__ __device__ SaxpyFunctor(const float a) : a(a) {}

      // Returns a * x + y; does not modify the functor.
      __host__ __device__ float operator ()(float x, float y) const {
          return a * x + y;
      }
  };
  24.  
  // SAXPY kernel: each thread updates a single element, y[i] = a * x[i] + y[i].
  //
  // The element index is derived from the x components of the block and thread
  // coordinates (1D launch). There is no bounds check in this kernel, so every
  // launched thread must map to a valid element of both x and y.
  __global__ void saxpyKernel(float a, float *x, float *y) {
      const int idx = blockIdx.x * blockDim.x + threadIdx.x;
      const float xi = x[idx];
      const float yi = y[idx];
      y[idx] = a * xi + yi;
  }
  29.  
  // Prints a diagnostic for a failed CUDA runtime call; returns true on success.
  static bool cudaOk(cudaError_t status, const char *what) {
      if (status == cudaSuccess)
          return true;
      fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
      return false;
  }

  // Prints a diagnostic for a failed cuBLAS call; returns true on success.
  static bool cublasOk(cublasStatus_t status, const char *what) {
      if (status == CUBLAS_STATUS_SUCCESS)
          return true;
      fprintf(stderr, "cuBLAS error in %s: status %d\n", what, static_cast<int>(status));
      return false;
  }

  // Writes 0, 1, ..., n - 1 (converted to float) into dst[0..n).
  static void fillWithIndices(float *dst, int n) {
      for (int i = 0; i < n; i++)
          dst[i] = static_cast<float>(i);
  }

  // Records `stop`, waits for it to complete and prints "<label>: <milliseconds>"
  // for the interval between `start` and `stop`. Returns false if any call fails.
  static bool reportElapsed(const char *label, cudaEvent_t start, cudaEvent_t stop) {
      float ms = 0.0f;
      if (!cudaOk(cudaEventRecord(stop), "cudaEventRecord(stop)") ||
          !cudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)") ||
          !cudaOk(cudaEventElapsedTime(&ms, start, stop), "cudaEventElapsedTime"))
          return false;
      printf("%s: %f\n", label, ms);
      return true;
  }

  // Times the hand-written kernel. `n` and `maxThreadsPerBlock` must both be
  // powers of two; `host`, `deviceX` and `deviceY` must each hold n floats.
  // On success `host` contains the kernel's output.
  static bool benchPureCuda(float a, int n, int maxThreadsPerBlock, float *host,
                            float *deviceX, float *deviceY,
                            cudaEvent_t start, cudaEvent_t stop) {
      const size_t bytes = static_cast<size_t>(n) * sizeof(float);

      fillWithIndices(host, n);
      if (!cudaOk(cudaMemcpy(deviceX, host, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(deviceX)") ||
          !cudaOk(cudaMemcpy(deviceY, host, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(deviceY)"))
          return false;

      // Clamp the block size for small inputs: with n < maxThreadsPerBlock the
      // plain quotient n / maxThreadsPerBlock would be 0, which is an invalid
      // grid size. Because both values are powers of two, gridSize * blockSize
      // equals n exactly, which the kernel relies on (it has no bounds check).
      const int blockSize = n < maxThreadsPerBlock ? n : maxThreadsPerBlock;
      const int gridSize = n / blockSize;

      if (!cudaOk(cudaEventRecord(start), "cudaEventRecord(start)"))
          return false;
      saxpyKernel<<<gridSize, blockSize>>>(a, deviceX, deviceY);
      // Launch-configuration errors are only visible through cudaGetLastError().
      if (!cudaOk(cudaGetLastError(), "saxpyKernel launch") ||
          !reportElapsed("Pure cuda", start, stop))
          return false;

      return cudaOk(cudaMemcpy(host, deviceY, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(host)");
  }

  // Times thrust::transform with SaxpyFunctor on two freshly allocated
  // device vectors of n elements, each filled with 0, 1, ..., n - 1.
  static bool benchThrust(float a, int n, cudaEvent_t start, cudaEvent_t stop) {
      try {
          thrust::device_vector<float> thrustX(n);
          thrust::device_vector<float> thrustY(n);

          thrust::sequence(thrustX.begin(), thrustX.end());
          thrust::sequence(thrustY.begin(), thrustY.end());

          if (!cudaOk(cudaEventRecord(start), "cudaEventRecord(start)"))
              return false;
          thrust::transform(thrustX.begin(), thrustX.end(), thrustY.begin(), thrustY.begin(), SaxpyFunctor(a));
          return reportElapsed("Thrust", start, stop);

          // Debug aid: print the first results.
          //thrust::copy(thrustY.begin(), thrustY.begin() + 10, std::ostream_iterator<float>(std::cout, "\n"));
      } catch (const std::exception &e) {
          // Thrust reports allocation and execution failures by throwing.
          fprintf(stderr, "Thrust error: %s\n", e.what());
          return false;
      }
  }

  // Times cublasSaxpy. `host`, `deviceX` and `deviceY` must each hold n floats.
  // On success `host` contains the cuBLAS output.
  static bool benchCublas(float a, int n, float *host, float *deviceX, float *deviceY,
                          cudaEvent_t start, cudaEvent_t stop) {
      cublasHandle_t cublasHandle;
      if (!cublasOk(cublasCreate(&cublasHandle), "cublasCreate"))
          return false;

      fillWithIndices(host, n);

      const int stride = 1;
      const bool ok =
          cublasOk(cublasSetVector(n, sizeof(float), host, stride, deviceX, stride), "cublasSetVector(deviceX)") &&
          cublasOk(cublasSetVector(n, sizeof(float), host, stride, deviceY, stride), "cublasSetVector(deviceY)") &&
          cudaOk(cudaEventRecord(start), "cudaEventRecord(start)") &&
          cublasOk(cublasSaxpy(cublasHandle, n, &a, deviceX, stride, deviceY, stride), "cublasSaxpy") &&
          reportElapsed("cuBLAS", start, stop) &&
          cublasOk(cublasGetVector(n, sizeof(float), deviceY, stride, host, stride), "cublasGetVector");

      // The handle is released whether or not the benchmark succeeded.
      cublasDestroy(cublasHandle);

      /* Debug aid: print the first results.
      for(int i = 0; i < 10; i++) {
      printf("%f\n", host[i]);
      }*/

      return ok;
  }

  // Benchmarks SAXPY (y = a * x + y) on vectors of length 2^t, with t read from
  // standard input, using a hand-written kernel, Thrust and cuBLAS in turn.
  // Prints one timing line (milliseconds) per implementation.
  // Returns 0 on success and 1 on invalid input or any CUDA/cuBLAS/Thrust failure.
  int main() {
      const float a = 100.0f;
      const int threadsPerBlock = 1 << 10;
      const int maxExponent = 30;  // 1 << 31 does not fit in a signed 32-bit int

      int t = 0;
      if (!(std::cin >> t) || t < 0 || t > maxExponent) {
          fprintf(stderr, "Expected an integer exponent between 0 and %d\n", maxExponent);
          return 1;
      }
      const int n = 1 << t;
      const size_t bytes = static_cast<size_t>(n) * sizeof(float);

      float *host = new float[n];
      float *deviceX = NULL, *deviceY = NULL;
      cudaEvent_t start = NULL, stop = NULL;

      // Short-circuit evaluation stops at the first failing step; the cleanup
      // below then releases whatever had been acquired up to that point.
      const bool ok =
          cudaOk(cudaEventCreate(&start), "cudaEventCreate(start)") &&
          cudaOk(cudaEventCreate(&stop), "cudaEventCreate(stop)") &&
          cudaOk(cudaMalloc((void**)&deviceX, bytes), "cudaMalloc(deviceX)") &&
          cudaOk(cudaMalloc((void**)&deviceY, bytes), "cudaMalloc(deviceY)") &&
          benchPureCuda(a, n, threadsPerBlock, host, deviceX, deviceY, start, stop) &&
          benchThrust(a, n, start, stop) &&
          benchCublas(a, n, host, deviceX, deviceY, start, stop);

      if (start)
          cudaEventDestroy(start);
      if (stop)
          cudaEventDestroy(stop);
      cudaFree(deviceX);  // cudaFree accepts a null pointer
      cudaFree(deviceY);
      delete[] host;

      return ok ? 0 : 1;
  }
RAW Paste Data