Untitled (Mickey13, Nov 17th, 2017)
// Includes
#include <stdio.h>
#include <cstdlib>
#include <ctime>
#include <cmath>
#include <iostream>
#include <algorithm>

// CUDA includes
#include <cuda_runtime.h>

#include "cudautil.cuh"
#include "cuda_call.h"

using namespace std;

const int DIM_PORTION = 32;   // width and height of the tile handled by one block
const int LIGNES_BLOC = 8;    // number of thread rows per block

// GPU code
__global__ void copymat_device(const float* A, float* B)
{
    int x = blockIdx.x * DIM_PORTION + threadIdx.x;
    int y = blockIdx.y * DIM_PORTION + threadIdx.y;
    int largeur = gridDim.x * DIM_PORTION;   // matrix width in elements

    // Each thread copies DIM_PORTION / LIGNES_BLOC rows of its tile,
    // stepping LIGNES_BLOC rows at a time.
    for (int j = 0; j < DIM_PORTION; j += LIGNES_BLOC) {
        B[(y + j) * largeur + x] = A[(y + j) * largeur + x];
    }
}

// CPU code
void genmat(float *A, int n)
{
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            A[i*n + j] = rand() / (float) RAND_MAX;
}

float verify(const float *A, const float *B, int n)
{
    float error = 0;
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            error = (std::max)(error, std::abs(A[i*n + j] - B[i*n + j]));

    return error;
}

// Host code
int main(void)
{
    int dev;
    cudaError_t check = cudaGetDevice(&dev);

    if (check == cudaSuccess)
        printf("OK\n");

    printf("Matrix copy\n");

    // n must not exceed 256 (memory allocation issue)
    int n = 256;
    size_t size = n*n*sizeof(float);

    // CUDA initialization
    //checkCUDA(0);

    // CPU matrices
    float *h_A = NULL, *h_B = NULL;

    // GPU matrices
    float *d_A = NULL, *d_B = NULL;

    // Allocate the vectors in CPU memory
    h_A = new float[n*n];
    h_B = new float[n*n];
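
    // Optional sketch, not in the original: seed the host RNG so each run
    // generates a different matrix; rand() otherwise always starts from the
    // same default seed (<ctime> is already included above).
    srand((unsigned) time(NULL));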

    // Allocate the vectors in GPU memory
    CUDA_SAFE_CALL(cudaMalloc((void **) &d_A, size));
    CUDA_SAFE_CALL(cudaMalloc((void **) &d_B, size));

    // Initialize matrix A
    genmat(h_A, n);

    // Copy matrix A to GPU memory
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);

    // Kernel launch: each block covers a DIM_PORTION x DIM_PORTION tile
    // with DIM_PORTION x LIGNES_BLOC threads.
    dim3 threadsPerBlock(DIM_PORTION, LIGNES_BLOC);
    dim3 numBlocks(n / DIM_PORTION, n / DIM_PORTION);

    copymat_device<<<numBlocks, threadsPerBlock>>>(d_A, d_B);
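
    // Optional sketch, not in the original: check that the launch succeeded
    // before reading back the result, assuming CUDA_SAFE_CALL accepts any
    // cudaError_t as it does elsewhere in this file.
    CUDA_SAFE_CALL(cudaGetLastError());        // launch/configuration errors
    CUDA_SAFE_CALL(cudaDeviceSynchronize());   // errors during kernel execution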

    // Copy the result back
    cudaMemcpy(h_B, d_B, size, cudaMemcpyDeviceToHost);

    // Verification
    printf("Max error: %e\n", verify(h_A, h_B, n));

    // Timing
    cudaEvent_t start, stop;
    CUDA_SAFE_CALL(cudaEventCreate(&start));
    CUDA_SAFE_CALL(cudaEventCreate(&stop));

    const int nb = 10;
    CUDA_SAFE_CALL(cudaEventRecord(start, 0));

    for (int i = 0; i < nb; i++) {
        copymat_device<<<numBlocks, threadsPerBlock>>>(d_A, d_B);
    }

    CUDA_SAFE_CALL(cudaEventRecord(stop, 0));
    CUDA_SAFE_CALL(cudaEventSynchronize(stop));

    float t_ms;
    cudaEventElapsedTime(&t_ms, start, stop);
    t_ms /= nb;   // average time per kernel launch

    printf("Time for the kernel: %f ms\n", t_ms);
    // Each copy reads and writes `size` bytes; t_ms is in milliseconds,
    // hence the 1e-3 factor to obtain bytes per second.
    printf("Bandwidth: %f GiB/s\n",
           (2.0 * size / (t_ms * 1e-3)) / (1024.0 * 1024.0 * 1024.0));

    // Free GPU memory
    CUDA_SAFE_CALL(cudaFree(d_A));
    CUDA_SAFE_CALL(cudaFree(d_B));
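
    // Optional sketch, not in the original: release the timing events as well.
    CUDA_SAFE_CALL(cudaEventDestroy(start));
    CUDA_SAFE_CALL(cudaEventDestroy(stop));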

    // Free CPU memory
    delete [] h_A;
    delete [] h_B;
}