Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// Includes
#include <stdio.h>
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <iostream>
// includes CUDA
#include <cuda_runtime.h>
#include "cudautil.cuh"
#include "cuda_call.h"

using namespace std;
// Threads per block edge: kernels are launched with DIM_PORTION x DIM_PORTION threads.
const int DIM_PORTION = 32;

// Naive (non-shared-memory) dense matrix multiply: C = A * B.
// A, B, C are row-major n x n matrices in device global memory.
// Expects a 2D launch with blockDim = (DIM_PORTION, DIM_PORTION) and a grid
// covering the n x n output (ceil-div grids are now safe, see guard below).
__global__ void gpu_matrix_mult(float *A, float *B, float *C, int n)
{
    int x = blockIdx.x * DIM_PORTION + threadIdx.x; // column index in C
    int y = blockIdx.y * DIM_PORTION + threadIdx.y; // row index in C

    // Bounds guard: without it, any n that is not a multiple of DIM_PORTION
    // makes the edge blocks read and write out of bounds. (The original code
    // only worked because n = 2048 happens to be divisible by 32.)
    if (x >= n || y >= n)
        return;

    float sum_tmp = 0.0f;
    for (int k = 0; k < n; k++) {
        // B is read with stride n here (uncoalesced for B); acceptable for the
        // deliberately naive version this kernel documents itself as.
        sum_tmp += A[y * n + k] * B[k * n + x];
    }
    C[y * n + x] = sum_tmp;
}
- // Code CPU
// Fill the row-major n x n matrix A with A[row][col] = row*col / n^2.
// Values lie in [0, (n-1)^2 / n^2) ⊂ [0, 1).
void genmat(float *A, int n)
{
    for (int row = 0; row < n; row++) {
        float *line = A + (size_t)row * n;
        for (int col = 0; col < n; col++) {
            // Same expression as before (int product, two float divisions)
            // so the generated values are bit-identical.
            line[col] = row * col / (float)n / (float)n;
        }
    }
}
// Print the row-major n x n matrix A: columns tab-separated, one row per line.
void dispmat(float *A, int n) {
    for (int r = 0; r < n; r++) {
        for (int c = 0; c < n; c++) {
            const char *sep = (c == n - 1) ? "\n" : "\t";
            printf("%f%s", A[r * n + c], sep);
        }
    }
}
// Return the largest absolute element-wise difference between the row-major
// n x n matrices A and B (the infinity norm of A - B), used to compare the
// CPU and GPU results.
float verify(const float *A, const float* B, int n)
{
    float error = 0;
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++) {
            // fabsf, not unqualified abs: without <cmath> in scope, abs(float)
            // can resolve to the C int abs(int) overload, truncating every
            // sub-1.0 difference to zero and making verification vacuous.
            error = (std::max)(error, fabsf(A[i*n + j] - B[i*n + j]));
        }
    return error;
}
// Reference CPU implementation of C = A * B for row-major N x N matrices,
// used as ground truth for the GPU kernel. Prints a progress line every
// 256 rows (this run takes a while at N = 2048).
void cpu_matrix_mult(float *h_A, float *h_B, float *h_C, int N) {
    for (int row = 0; row < N; row++) {
        if (row % 256 == 0)
            printf("x = %d\n", row);
        for (int col = 0; col < N; col++) {
            // Accumulate the dot product of row `row` of A with column `col`
            // of B, in the same order as before so float results match.
            float acc = 0.f;
            for (int k = 0; k < N; k++)
                acc += h_A[row * N + k] * h_B[k * N + col];
            h_C[row * N + col] = acc;
        }
    }
}
- // Host code
// Host driver: builds two n x n matrices, multiplies them on CPU and GPU,
// reports the max CPU/GPU discrepancy, then times the kernel with CUDA events.
int main(void) {
    int dev;
    cudaError_t check = cudaGetDevice(&dev);
    if (check == cudaSuccess)
        printf("cudaSuccess\n");

    const int n = 2048;
    const size_t size = (size_t)n * n * sizeof(float);

    // Host matrices
    float *h_A = new float[n*n];
    float *h_B = new float[n*n];
    float *h_Cgpu = new float[n*n];
    float *h_Ccpu = new float[n*n];

    // Device matrices
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_A, size));
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_B, size));
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_C, size));

    // Initialize inputs and compute the CPU reference result.
    genmat(h_A, n);
    genmat(h_B, n);
    cpu_matrix_mult(h_A, h_B, h_Ccpu, n);

    // Upload inputs (checked: a failed copy previously went unnoticed).
    CUDA_SAFE_CALL(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

    // Ceil-div grid so the launch also covers sizes that are not a multiple
    // of DIM_PORTION (for n = 2048 this is identical to n / DIM_PORTION).
    dim3 dimBlock(DIM_PORTION, DIM_PORTION);
    dim3 dimGrid((n + DIM_PORTION - 1) / DIM_PORTION,
                 (n + DIM_PORTION - 1) / DIM_PORTION);
    gpu_matrix_mult<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, n);
    CUDA_SAFE_CALL(cudaGetLastError()); // catch launch-configuration errors

    // Blocking device-to-host copy: also synchronizes with the kernel above.
    CUDA_SAFE_CALL(cudaMemcpy(h_Cgpu, d_C, size, cudaMemcpyDeviceToHost));
    printf("Erreur max: %e\n", verify(h_Ccpu, h_Cgpu, n));

    // --- Timing with CUDA events ---
    cudaEvent_t start, stop;
    CUDA_SAFE_CALL(cudaEventCreate(&start));
    CUDA_SAFE_CALL(cudaEventCreate(&stop));
    const int nb = 1; // number of timed launches; elapsed time is averaged
    CUDA_SAFE_CALL(cudaEventRecord(start, 0));
    for (int i = 0; i < nb; i++)
        gpu_matrix_mult<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, n);
    CUDA_SAFE_CALL(cudaEventRecord(stop, 0));
    CUDA_SAFE_CALL(cudaEventSynchronize(stop));
    float t_ms;
    CUDA_SAFE_CALL(cudaEventElapsedTime(&t_ms, start, stop));
    t_ms /= nb;
    CUDA_SAFE_CALL(cudaEventDestroy(start));
    CUDA_SAFE_CALL(cudaEventDestroy(stop));
    printf("Time for the kernel: %f ms\n", t_ms);
    // Dense n x n matmul performs 2*n^3 flops (n^3 multiplies + n^3 adds).
    // The original "3 * size" counted bytes, not floating-point operations.
    // 2.0 leads the product so the arithmetic is done in double (n^3
    // overflows 32-bit int). t_ms is milliseconds, hence the 1e6 scale.
    printf("Performance: %f GFlops/s\n", 2.0 * n * n * n / (t_ms * 1e6));

    // Free device memory (the original leaked d_C).
    CUDA_SAFE_CALL(cudaFree(d_A));
    CUDA_SAFE_CALL(cudaFree(d_B));
    CUDA_SAFE_CALL(cudaFree(d_C));
    // Free host memory (the original leaked h_Cgpu and h_Ccpu).
    delete[] h_A;
    delete[] h_B;
    delete[] h_Cgpu;
    delete[] h_Ccpu;

    system("PAUSE"); // Windows-only pause; keeps the console window open
    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement