Advertisement
Guest User

Untitled

a guest
Dec 15th, 2017
47
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.50 KB | None | 0 0
// Includes
#include <stdio.h>
#include <cstdlib>
#include <cmath>
#include <ctime>
#include <chrono>
#include <iostream>

// includes CUDA
#include <cuda_runtime.h>

#include "../include/cudautil.cuh"
#include "../include/cuda_call.h"
  13.  
  14. #define DIM_PORTION 32
  15. #define LIGNES_BLOC 8
  16.  
  17. #define MAT_SIZE 1024
  18. #define NB_THREAD_BLOC 256
  19.  
  20. #define KERNEL_LAUNCH_NB 10
  21.  
  22. // Code GPU
  23. __global__ void matrix_multi(const float* A, const float* B, float* C){
  24. __shared__ float A_SM[DIM_PORTION][DIM_PORTION+1];
  25. __shared__ float B_SM[DIM_PORTION][DIM_PORTION+1];
  26.  
  27. //column
  28. int x = blockIdx.x * DIM_PORTION + threadIdx.x;
  29. //line
  30. int y = blockIdx.y * DIM_PORTION + threadIdx.y;
  31. int largeur = gridDim.x * DIM_PORTION;
  32.  
  33. for (int j = 0; j < DIM_PORTION; j += LIGNES_BLOC){
  34. B_SM[threadIdx.y+j][threadIdx.x] = B[(y+j)*largeur + x];
  35. A_SM[threadIdx.y+j][threadIdx.x] = A[(y+j)*largeur + x];
  36. }
  37.  
  38. __syncthreads();
  39. //line
  40. x = blockIdx.y * DIM_PORTION + threadIdx.x;
  41. //column
  42. y = blockIdx.x * DIM_PORTION + threadIdx.y;
  43.  
  44. float C_val = 0;
  45.  
  46. for (int j = 0; j < DIM_PORTION; j += LIGNES_BLOC){
  47. C_val += A_SM[threadIdx.x][threadIdx.y+j] * B_SM[threadIdx.y+j][threadIdx.x];
  48. }
  49. C[y*largeur + x] = C_val;
  50. }
  51.  
  52. // Code CPU
  53. void genmat(float *A, int n){
  54. for (int i=0; i<n; i++)
  55. for (int j=0; j<n; j++)
  56. A[i*n + j] = rand()/(float) RAND_MAX;
  57. }
  58.  
  59. void matMultAdd(const float *A, const float *B, float *C){
  60. for (int i=0; i<MAT_SIZE; i++){
  61. for (int j=0; j<MAT_SIZE; j++){
  62. C[i*MAT_SIZE + j]=0;
  63. for (int k=0; k<MAT_SIZE; k++){
  64. C[i*MAT_SIZE + j] += A[i*MAT_SIZE+k]*B[j+k*MAT_SIZE];
  65. }
  66. }
  67. }
  68. }
  69.  
  70. float verify(const float *A, const float* B, int n){
  71. float error = 0;
  72. float num = 0;
  73. float den = 0;
  74. for (int i=0; i<n; i++){
  75. for (int j=0; j<n; j++){
  76. num += (A[i*n + j] - B[i*n + j])*(A[i*n + j] - B[i*n + j]);
  77. den += (A[i*n + j]*A[i*n + j]);
  78. }
  79. }
  80. error =std::sqrt(num)/std::sqrt(den);
  81.  
  82. return error;
  83. }
  84.  
  85. // Host code
  86. int main(int argc, char** argv){
  87. printf("Multiplication de matrice méthode avec shared memory\n");
  88. size_t size = MAT_SIZE*MAT_SIZE*sizeof(float);
  89.  
  90. // Initialisation de CUDA
  91. checkCUDA(0);
  92.  
  93. // Matrices CPU
  94. float *h_A, *h_B, *h_C;
  95. // Matrices GPU
  96. float *d_A, *d_B, *d_C;
  97.  
  98. // Allocatation des vecteurs dans la mémoire CPU
  99. h_A = new float[MAT_SIZE*MAT_SIZE];
  100. h_B = new float[MAT_SIZE*MAT_SIZE];
  101. h_C = new float[MAT_SIZE*MAT_SIZE];
  102.  
  103. // Allocation des vecteurs dans la mémoire GPU
  104. CUDA_SAFE_CALL(cudaMalloc((void **)&d_A, size));
  105. CUDA_SAFE_CALL(cudaMalloc((void **)&d_B, size));
  106. CUDA_SAFE_CALL(cudaMalloc((void **)&d_C, size));
  107.  
  108. // Initialisation de la matrice A
  109. genmat(h_A, MAT_SIZE);
  110. genmat(h_B, MAT_SIZE);
  111.  
  112. // Copie de la matrice A dans la mémoire GPU
  113. CUDA_SAFE_CALL(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
  114. CUDA_SAFE_CALL(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));
  115.  
  116. // Appel du kernel
  117. dim3 threadsPerBlock(DIM_PORTION, LIGNES_BLOC);
  118. dim3 numBlocks(MAT_SIZE/DIM_PORTION, MAT_SIZE/DIM_PORTION);
  119.  
  120. // Timing
  121. cudaEvent_t start, stop;
  122. CUDA_SAFE_CALL(cudaEventCreate(&start));
  123. CUDA_SAFE_CALL(cudaEventCreate(&stop));
  124.  
  125. CUDA_SAFE_CALL(cudaEventRecord(start, 0));
  126. for (int i = 0; i < KERNEL_LAUNCH_NB; i++)
  127. matrix_multi<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C);
  128.  
  129. CUDA_SAFE_CALL(cudaEventRecord(stop, 0));
  130. CUDA_SAFE_CALL(cudaEventSynchronize(stop));
  131. float t_ms;
  132. CUDA_SAFE_CALL(cudaEventElapsedTime(&t_ms, start, stop));
  133. t_ms /= KERNEL_LAUNCH_NB;
  134.  
  135. // Copie du résultat
  136. CUDA_SAFE_CALL(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));
  137.  
  138. // Verification
  139. float *h_RESP;
  140. h_RESP = new float[MAT_SIZE*MAT_SIZE];
  141.  
  142. //Multiplication
  143. auto h_start = std::chrono::system_clock::now();
  144.  
  145. matMultAdd(h_A, h_B, h_RESP);
  146. auto h_end = std::chrono::system_clock::now();
  147.  
  148. std::chrono::duration<double> h_elapsed = h_end-h_start;
  149.  
  150. printf("Erreur max: %e\n", verify(h_RESP, h_C, MAT_SIZE));
  151.  
  152. //performances en GFlops pour le CPU
  153. printf("Performance pour le GPU : %f GFlops\n", ((MAT_SIZE*MAT_SIZE*(2*MAT_SIZE-1))/(t_ms))/1e6);
  154.  
  155. //performances en GFlops
  156. printf("Performance pour le CPU : %f GFlops\n", ((MAT_SIZE*MAT_SIZE*(2*MAT_SIZE-1))/(h_elapsed.count()))/1e9);
  157.  
  158. // Deallocation de la memoire GPU
  159. // A compléter
  160. CUDA_SAFE_CALL(cudaFree (d_A));
  161. CUDA_SAFE_CALL(cudaFree (d_B));
  162. CUDA_SAFE_CALL(cudaFree (d_C));
  163.  
  164. // Deallocation de la memoire CPU
  165. delete [] h_A;
  166. delete [] h_B;
  167. delete [] h_C;
  168. delete [] h_RESP;
  169.  
  170. return 0;
  171. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement