Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// Includes
#include <stdio.h>
#include <chrono>
#include <cmath>
#include <ctime>
#include <iostream>
// includes CUDA
#include <cuda_runtime.h>
#include "../include/cudautil.cuh"
#include "../include/cuda_call.h"
- #define DIM_PORTION 32
- #define LIGNES_BLOC 8
- #define MAT_SIZE 1024
- #define NB_THREAD_BLOC 256
- #define KERNEL_LAUNCH_NB 10
- // Code GPU
// GPU code
// Tiled matrix multiply: C = A * B for square row-major matrices of size
// n = gridDim.x * DIM_PORTION.
// Preconditions: the matrix size is a multiple of DIM_PORTION and
// blockDim == (DIM_PORTION, LIGNES_BLOC). Each block produces one
// DIM_PORTION x DIM_PORTION tile of C; each thread accumulates
// DIM_PORTION / LIGNES_BLOC output elements, one per LIGNES_BLOC-row slice.
__global__ void matrix_multi(const float* A, const float* B, float* C){
    // +1 padding on the inner dimension avoids shared-memory bank conflicts
    // when a column of A_SM is read in the product loop.
    __shared__ float A_SM[DIM_PORTION][DIM_PORTION + 1];
    __shared__ float B_SM[DIM_PORTION][DIM_PORTION + 1];

    const int col  = blockIdx.x * DIM_PORTION + threadIdx.x; // global column of C
    const int row0 = blockIdx.y * DIM_PORTION + threadIdx.y; // first row owned by this thread
    const int n    = gridDim.x * DIM_PORTION;                // matrix dimension

    // One accumulator per row slice handled by this thread.
    float acc[DIM_PORTION / LIGNES_BLOC] = {0.0f};

    // Walk the k dimension one DIM_PORTION-wide tile at a time:
    // A's tile moves rightwards, B's tile moves downwards.
    for (int t = 0; t < n; t += DIM_PORTION) {
        // Cooperative load of one tile of A and one tile of B.
        // Consecutive threadIdx.x values touch consecutive addresses,
        // so both loads are fully coalesced.
        for (int j = 0; j < DIM_PORTION; j += LIGNES_BLOC) {
            A_SM[threadIdx.y + j][threadIdx.x] = A[(row0 + j) * n + (t + threadIdx.x)];
            B_SM[threadIdx.y + j][threadIdx.x] = B[(t + threadIdx.y + j) * n + col];
        }
        __syncthreads(); // tiles fully written before any thread reads them

        for (int k = 0; k < DIM_PORTION; ++k) {
            const float b = B_SM[k][threadIdx.x];
            #pragma unroll
            for (int r = 0; r < DIM_PORTION / LIGNES_BLOC; ++r)
                acc[r] += A_SM[threadIdx.y + r * LIGNES_BLOC][k] * b;
        }
        __syncthreads(); // all reads done before the next iteration overwrites tiles
    }

    // Write back the row slice owned by this thread.
    for (int r = 0; r < DIM_PORTION / LIGNES_BLOC; ++r)
        C[(row0 + r * LIGNES_BLOC) * n + col] = acc[r];
}
- // Code CPU
// CPU code
// Fill the n x n matrix A with uniform pseudo-random values in [0, 1],
// drawn from rand() (caller controls seeding via srand).
void genmat(float *A, int n){
    const int total = n * n;
    for (int idx = 0; idx < total; ++idx)
        A[idx] = rand() / (float)RAND_MAX;
}
// Reference CPU matrix multiply: C = A * B for n x n row-major matrices.
// n defaults to MAT_SIZE so existing callers keep their behavior;
// the parameter generalizes the routine to arbitrary square sizes.
void matMultAdd(const float *A, const float *B, float *C, int n = MAT_SIZE){
    for (int i = 0; i < n; i++){
        for (int j = 0; j < n; j++){
            // Local accumulator: same summation order as in-place +=,
            // but avoids re-reading C[i*n + j] on every k iteration.
            float sum = 0.0f;
            for (int k = 0; k < n; k++){
                sum += A[i*n + k] * B[k*n + j];
            }
            C[i*n + j] = sum;
        }
    }
}
// Relative Frobenius-norm error between the n x n matrices A and B:
// ||A - B||_F / ||A||_F. A is treated as the reference.
float verify(const float *A, const float* B, int n){
    float num = 0.0f;
    float den = 0.0f;
    const int total = n * n;
    for (int idx = 0; idx < total; ++idx){
        const float diff = A[idx] - B[idx];
        num += diff * diff;
        den += A[idx] * A[idx];
    }
    // Keep sqrt(num)/sqrt(den) (not sqrt(num/den)) so rounding matches exactly.
    return std::sqrt(num) / std::sqrt(den);
}
- // Host code
// Host code
// Allocates host/device matrices, times KERNEL_LAUNCH_NB GPU launches with
// CUDA events, runs the CPU reference, and reports the relative error and
// GFlops for both sides.
int main(int argc, char** argv){
    printf("Multiplication de matrice méthode avec shared memory\n");
    size_t size = MAT_SIZE*MAT_SIZE*sizeof(float);
    // CUDA initialisation / device selection
    checkCUDA(0);
    // Host matrices
    float *h_A, *h_B, *h_C;
    // Device matrices
    float *d_A, *d_B, *d_C;
    // Host allocations
    h_A = new float[MAT_SIZE*MAT_SIZE];
    h_B = new float[MAT_SIZE*MAT_SIZE];
    h_C = new float[MAT_SIZE*MAT_SIZE];
    // Device allocations
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_A, size));
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_B, size));
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_C, size));
    // Fill the inputs with random data
    genmat(h_A, MAT_SIZE);
    genmat(h_B, MAT_SIZE);
    // Copy the inputs to the GPU
    CUDA_SAFE_CALL(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));
    // Launch configuration: one block per DIM_PORTION x DIM_PORTION tile of C
    dim3 threadsPerBlock(DIM_PORTION, LIGNES_BLOC);
    dim3 numBlocks(MAT_SIZE/DIM_PORTION, MAT_SIZE/DIM_PORTION);
    // Timing via CUDA events
    cudaEvent_t start, stop;
    CUDA_SAFE_CALL(cudaEventCreate(&start));
    CUDA_SAFE_CALL(cudaEventCreate(&stop));
    // Warm-up launch so one-time costs (module load, caches) stay out of the timing
    matrix_multi<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C);
    CUDA_SAFE_CALL(cudaGetLastError());
    CUDA_SAFE_CALL(cudaDeviceSynchronize());
    CUDA_SAFE_CALL(cudaEventRecord(start, 0));
    for (int i = 0; i < KERNEL_LAUNCH_NB; i++)
        matrix_multi<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C);
    // Kernel launches do not return errors directly: check explicitly
    CUDA_SAFE_CALL(cudaGetLastError());
    CUDA_SAFE_CALL(cudaEventRecord(stop, 0));
    CUDA_SAFE_CALL(cudaEventSynchronize(stop));
    float t_ms;
    CUDA_SAFE_CALL(cudaEventElapsedTime(&t_ms, start, stop));
    t_ms /= KERNEL_LAUNCH_NB; // average time per launch
    // Copy the result back (blocking memcpy synchronizes with the kernels)
    CUDA_SAFE_CALL(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));
    // CPU reference for verification
    float *h_RESP = new float[MAT_SIZE*MAT_SIZE];
    auto h_start = std::chrono::system_clock::now();
    matMultAdd(h_A, h_B, h_RESP);
    auto h_end = std::chrono::system_clock::now();
    std::chrono::duration<double> h_elapsed = h_end-h_start;
    printf("Erreur max: %e\n", verify(h_RESP, h_C, MAT_SIZE));
    // 2*n^3 - n^2 flops; computed in double because the same expression in
    // 32-bit int sits one step below INT_MAX at MAT_SIZE == 1024 and would
    // overflow for anything larger.
    const double flops = (double)MAT_SIZE * MAT_SIZE * (2.0 * MAT_SIZE - 1.0);
    // GPU GFlops (t_ms is in milliseconds, hence /1e6)
    printf("Performance pour le GPU : %f GFlops\n", (flops / t_ms) / 1e6);
    // CPU GFlops (h_elapsed is in seconds, hence /1e9)
    printf("Performance pour le CPU : %f GFlops\n", (flops / h_elapsed.count()) / 1e9);
    // Release GPU resources
    CUDA_SAFE_CALL(cudaEventDestroy(start));
    CUDA_SAFE_CALL(cudaEventDestroy(stop));
    CUDA_SAFE_CALL(cudaFree(d_A));
    CUDA_SAFE_CALL(cudaFree(d_B));
    CUDA_SAFE_CALL(cudaFree(d_C));
    // Release host memory
    delete [] h_A;
    delete [] h_B;
    delete [] h_C;
    delete [] h_RESP;
    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement