Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// Includes
#include <stdio.h>
#include <cmath>
#include <ctime>
#include <iostream>
#include <algorithm>
// includes CUDA
#include <cuda_runtime.h>
#include "cudautil.cuh"
#include "cuda_call.h"
- using namespace std;
- const int DIM_PORTION = 32;
- const int LIGNES_BLOC = 8;
- // Code GPU
// GPU code: copy matrix A into B, one element per thread.
//
// Expected launch (as set up in main()): blockDim = (DIM_PORTION, DIM_PORTION)
// and a grid that exactly tiles an n x n row-major matrix, i.e.
// n == gridDim.x * DIM_PORTION — there is therefore no partial tile to guard.
//
// Fix: the original looped j over [0, DIM_PORTION) in steps of LIGNES_BLOC,
// a pattern that assumes blockDim.y == LIGNES_BLOC (8 rows per thread-block
// slice). main() launches 32x32 blocks, so each element was copied 4 times
// and threads with threadIdx.y >= LIGNES_BLOC wrote up to
// DIM_PORTION - LIGNES_BLOC rows past the end of the matrix (out-of-bounds
// device writes). With a 32x32 block, one element per thread is the correct
// (and fully coalesced) mapping.
__global__ void copymat_device(float* A, float* B)
{
    int x = blockIdx.x * DIM_PORTION + threadIdx.x;
    int y = blockIdx.y * DIM_PORTION + threadIdx.y;
    int largeur = gridDim.x * DIM_PORTION;  // matrix width, grid tiles it exactly
    B[y * largeur + x] = A[y * largeur + x];
}
- // Code CPU
// CPU code: fill the n x n row-major matrix A with pseudo-random values
// in [0, 1], drawn sequentially from the C library rand() generator.
void genmat(float *A, int n)
{
    const int total = n * n;
    for (int k = 0; k < total; ++k)
        A[k] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
}
// Return the maximum absolute elementwise difference between the
// n x n row-major matrices A and B (0 for identical matrices or n == 0).
//
// Fix: use std::fabs instead of unqualified abs. The file never includes
// <cmath>, so with `using namespace std` the call could resolve to the
// integer ::abs from <cstdlib>, truncating every sub-1.0 difference to 0
// and silently reporting a max error of zero.
float verify(const float *A, const float* B, int n)
{
    float error = 0;
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            error = (std::max)(error, std::fabs(A[i*n + j] - B[i*n + j]));
    return error;
}
- // Host code
// Host code: generate an n x n matrix, copy it on the GPU with
// copymat_device, verify the result against the host copy, then time the
// kernel over several runs and report the achieved memory bandwidth.
int main(void)
{
    // Sanity check that a CUDA device is usable.
    int dev;
    cudaError_t check = cudaGetDevice(&dev);
    if (check == cudaSuccess)
        printf("OK\n");
    printf("Copie de matrice\n");

    // n must not exceed 256 (memory allocation limit) and must be a
    // multiple of block_size: the grid below assumes an exact tiling.
    int n = 256;
    size_t size = n * n * sizeof(float);

    // Host matrices.
    float *h_A = new float[n * n];
    float *h_B = new float[n * n];

    // Device matrices.
    float *d_A = NULL, *d_B = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void **) &d_A, size));
    CUDA_SAFE_CALL(cudaMalloc((void **) &d_B, size));

    // Initialize A and upload it to the GPU.
    genmat(h_A, n);
    CUDA_SAFE_CALL(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));

    // Kernel launch: one 32x32 thread block per 32x32 tile of the matrix.
    const int block_size = 32;
    dim3 threadsPerBlock(block_size, block_size);
    dim3 numBlocks(n / threadsPerBlock.x, n / threadsPerBlock.y);
    copymat_device<<<numBlocks, threadsPerBlock>>>(d_A, d_B);
    // Launches return no status themselves; surface config errors explicitly.
    CUDA_SAFE_CALL(cudaGetLastError());

    // Download the result (blocking copy, so the kernel has completed).
    CUDA_SAFE_CALL(cudaMemcpy(h_B, d_B, size, cudaMemcpyDeviceToHost));

    // Verification.
    printf("Erreur max: %e\n", verify(h_A, h_B, n));

    // Timing: average the kernel over nb runs using CUDA events.
    cudaEvent_t start, stop;
    CUDA_SAFE_CALL(cudaEventCreate(&start));
    CUDA_SAFE_CALL(cudaEventCreate(&stop));
    const int nb = 10;
    CUDA_SAFE_CALL(cudaEventRecord(start, 0));
    for (int i = 0; i < nb; i++) {
        copymat_device<<<numBlocks, threadsPerBlock>>>(d_A, d_B);
    }
    CUDA_SAFE_CALL(cudaGetLastError());
    CUDA_SAFE_CALL(cudaEventRecord(stop, 0));
    CUDA_SAFE_CALL(cudaEventSynchronize(stop));
    float t_ms;
    CUDA_SAFE_CALL(cudaEventElapsedTime(&t_ms, start, stop));
    t_ms /= nb;  // average time of one kernel run, in milliseconds
    printf("Time for the kernel: %f ms\n", t_ms);
    // Bandwidth: one read plus one write of the full matrix per run.
    // Fix: cudaEventElapsedTime reports milliseconds, so seconds = t_ms * 1e-3.
    // The original divided by t_ms * 1e-6, overstating the bandwidth 1000x.
    printf("Bande passante: %e B/s\n", (2.0 * size) / (t_ms * 1e-3));

    // Release timing events.
    CUDA_SAFE_CALL(cudaEventDestroy(start));
    CUDA_SAFE_CALL(cudaEventDestroy(stop));

    // Free GPU memory, then host memory.
    CUDA_SAFE_CALL(cudaFree(d_A));
    CUDA_SAFE_CALL(cudaFree(d_B));
    delete [] h_A;
    delete [] h_B;

    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement