Advertisement
Mickey13

Untitled

Dec 15th, 2017
68
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.09 KB | None | 0 0
  1. // Includes
  2. #include <stdio.h>
  3. #include <ctime>
  4. #include <iostream>
  5. #include <algorithm>
  6.  
  7.  
  8. // includes CUDA
  9. #include <cuda_runtime.h>
  10.  
  11. #include "cudautil.cuh"
  12. #include "cuda_call.h"
  13.  
  14. using namespace std;
  15.  
  16. const int DIM_PORTION = 32;
  17.  
  18.  
  19.  
  20. // GPU code (global memory only, no shared memory)
  /*
   * Naive matrix product C = A * B on the GPU, reading operands directly from
   * the pointers it is given (no shared-memory tiling).
   *
   * A, B and C are indexed as square n x n row-major arrays: element
   * (row, col) lives at [row * n + col].
   *
   * Launch layout: 2D grid of 2D blocks, one thread per output element.
   * x indexes the column, y indexes the row. The grid may overshoot n in either
   * dimension; surplus threads exit without touching memory.
   *
   * Parameters:
   *   A, B  input matrices (read only)
   *   C     output matrix (every in-range element is overwritten)
   *   n     matrix dimension
   */
  __global__ void gpu_matrix_mult(float *A, float *B, float *C, int n)
  {
      // Use the real launch dimensions so the kernel is correct for any block shape.
      const int x = blockIdx.x * blockDim.x + threadIdx.x; // column
      const int y = blockIdx.y * blockDim.y + threadIdx.y; // row

      // Threads that fall outside the matrix must not read or write anything.
      if (x >= n || y >= n)
          return;

      // Widen before multiplying: y * n can exceed INT_MAX for large n.
      const size_t row_offset = (size_t)y * (size_t)n;

      float sum_tmp = 0.0f;
      for (int k = 0; k < n; ++k) {
          sum_tmp += A[row_offset + k] * B[(size_t)k * (size_t)n + x];
      }

      C[row_offset + x] = sum_tmp;
  }
  39.  
  40.  
  41. // CPU code
  /*
   * Fill the n x n row-major matrix A with the deterministic pattern
   * A[row][col] = (row * col) / n / n.
   */
  void genmat(float *A, int n)
  {
      const float dim = (float)n;

      for (int row = 0; row < n; ++row) {
          float *out = A + row * n;
          for (int col = 0; col < n; ++col) {
              // Integer product first, then two successive float divisions.
              const float product = (float)(row * col);
              out[col] = product / dim / dim;
          }
      }
  }
  48.  
  /*
   * Print the n x n row-major matrix A to stdout, one matrix row per line,
   * with values separated by tabs.
   */
  void dispmat(float *A, int n) {
      for (int row = 0; row < n; ++row) {
          for (int col = 0; col < n; ++col) {
              // The last value of a row ends the line; all others are tab separated.
              const char *separator = "\t";
              if (col == n - 1) {
                  separator = "\n";
              }
              printf("%f%s", A[row * n + col], separator);
          }
      }
  }
  54.  
  /*
   * Return the largest absolute element-wise difference between the two
   * n x n matrices A and B (max-norm of A - B).
   *
   * Returns 0 when n <= 0. If any difference is NaN, that NaN is returned
   * immediately so a corrupted result cannot be reported as a small error.
   */
  float verify(const float *A, const float* B, int n)
  {
      float error = 0.0f;

      if (n <= 0)
          return error;

      // Both matrices are walked as flat arrays; widen before multiplying so
      // n * n cannot overflow int.
      const size_t count = (size_t)n * (size_t)n;

      for (size_t idx = 0; idx < count; ++idx) {
          // Absolute value computed by hand: an unqualified abs() may resolve to
          // the C int overload and truncate the difference to an integer.
          float diff = A[idx] - B[idx];
          if (diff < 0.0f)
              diff = -diff;

          // NaN compares unequal to itself; surface it rather than skipping it.
          if (diff != diff)
              return diff;

          if (diff > error)
              error = diff;
      }

      return error;
  }
  67.  
  68.  
  /*
   * Reference matrix product on the host: h_C = h_A * h_B for N x N row-major
   * matrices. Prints a progress line to stdout every 256 output rows.
   */
  void cpu_matrix_mult(float *h_A, float *h_B, float *h_C, int N) {
      for (int row = 0; row < N; ++row) {
          // Progress indicator
          if (!(row % 256)) {
              printf("x = %d\n", row);
          }

          const float *a_row = h_A + row * N;
          float *c_row = h_C + row * N;

          for (int col = 0; col < N; ++col) {
              // Dot product of row `row` of A with column `col` of B,
              // accumulated in increasing k order.
              float acc = 0.f;
              int k = 0;
              while (k < N) {
                  acc += a_row[k] * h_B[k * N + col];
                  ++k;
              }
              c_row[col] = acc;
          }
      }
  }
  94.  
  95.  
  96.  
  97.  
  98. // Host code
  /*
   * Host driver: builds two n x n test matrices, multiplies them on the CPU and
   * on the GPU, prints the maximum difference between the two results, then
   * times the GPU kernel with CUDA events and prints the elapsed time and the
   * achieved throughput.
   *
   * Returns 0 on success, 1 if the current CUDA device cannot be queried.
   */
  int main(void) {

      // Make sure a CUDA device is usable before doing any work.
      int dev;
      cudaError_t check = cudaGetDevice(&dev);

      if (check == cudaSuccess) {
          printf("cudaSuccess\n");
      } else {
          fprintf(stderr, "cudaGetDevice failed: %s\n", cudaGetErrorString(check));
          return 1;
      }

      int n = 2048;
      // Widen before multiplying so the element count is never computed in int.
      const size_t count = (size_t)n * (size_t)n;
      size_t size = count * sizeof(float);

      // CUDA initialisation
      //checkCUDA(0);

      // Host matrices
      float *h_A = NULL, *h_B = NULL, *h_Cgpu = NULL, *h_Ccpu = NULL;

      // Device matrices
      float *d_A = NULL, *d_B = NULL, *d_C = NULL;

      // Allocate the matrices in host memory
      h_A = new float[count];
      h_B = new float[count];
      h_Cgpu = new float[count];
      h_Ccpu = new float[count];

      // Allocate the matrices in device memory
      CUDA_SAFE_CALL(cudaMalloc((void **)&d_A, size));
      CUDA_SAFE_CALL(cudaMalloc((void **)&d_B, size));
      CUDA_SAFE_CALL(cudaMalloc((void **)&d_C, size));

      // Initialise matrices A and B
      genmat(h_A, n);
      genmat(h_B, n);

      // Reference result computed on the CPU
      cpu_matrix_mult(h_A, h_B, h_Ccpu, n);

      // Copy A and B to device memory
      CUDA_SAFE_CALL(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
      CUDA_SAFE_CALL(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

      // Launch configuration: one thread per output element. The grid size is
      // rounded up; for n = 2048 this is exactly n / DIM_PORTION.
      dim3 dimBlock(DIM_PORTION, DIM_PORTION);
      dim3 dimGrid((n + DIM_PORTION - 1) / DIM_PORTION,
                   (n + DIM_PORTION - 1) / DIM_PORTION);

      gpu_matrix_mult <<<dimGrid, dimBlock >>>(d_A, d_B, d_C, n);
      // A kernel launch returns nothing; a bad configuration is only visible here.
      CUDA_SAFE_CALL(cudaGetLastError());

      // Copy the result back to the host
      CUDA_SAFE_CALL(cudaMemcpy(h_Cgpu, d_C, size, cudaMemcpyDeviceToHost));

      // Verification against the CPU reference
      printf("Erreur max: %e\n", verify(h_Ccpu, h_Cgpu, n));

      // Timing
      cudaEvent_t start, stop;
      CUDA_SAFE_CALL(cudaEventCreate(&start));
      CUDA_SAFE_CALL(cudaEventCreate(&stop));

      const int nb = 1;
      CUDA_SAFE_CALL(cudaEventRecord(start, 0));

      for (int i = 0; i < nb; i++) {
          gpu_matrix_mult <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, n);
          CUDA_SAFE_CALL(cudaGetLastError());
      }

      CUDA_SAFE_CALL(cudaEventRecord(stop, 0));
      CUDA_SAFE_CALL(cudaEventSynchronize(stop));

      float t_ms;
      CUDA_SAFE_CALL(cudaEventElapsedTime(&t_ms, start, stop));
      t_ms /= nb; // average time of one launch

      CUDA_SAFE_CALL(cudaEventDestroy(start));
      CUDA_SAFE_CALL(cudaEventDestroy(stop));

      // Each of the n*n outputs needs n multiplications and n additions,
      // i.e. 2*n^3 floating-point operations per product.
      const double flop = 2.0 * (double)n * (double)n * (double)n;

      printf("Time for the kernel: %f ms\n", t_ms); // Print elapsed time
      // flop / (t_ms * 1e-3 s) / 1e9 == flop / (t_ms * 1e6)
      printf(" Performance: %f GFlop/s\n", flop / ((double)t_ms * 1e6));

      // Release device memory
      CUDA_SAFE_CALL(cudaFree(d_A));
      CUDA_SAFE_CALL(cudaFree(d_B));
      CUDA_SAFE_CALL(cudaFree(d_C));

      // Release host memory
      delete[] h_A;
      delete[] h_B;
      delete[] h_Cgpu;
      delete[] h_Ccpu;

      system("PAUSE");
      return 0;
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement