Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// Includes
#include <stdio.h>
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <iostream>
// includes CUDA
#include <cuda_runtime.h>
#include "../include/cudautil.cu"
#include "../include/cuda_call.h"
- #define DIM_PORTION 32
- #define LIGNES_BLOC 8
- // Code GPU
/**
 * Device kernel: copies matrix A into matrix B, one DIM_PORTION x DIM_PORTION
 * tile per thread block.
 *
 * Launch layout: 2D grid with one block per tile. The row length is derived
 * from the grid as gridDim.x * DIM_PORTION and the number of rows covered is
 * gridDim.y * DIM_PORTION; both matrices are indexed row-major
 * (row * width + column).
 *
 * Preconditions: the matrix dimensions are not passed in, so no bounds check
 * is possible here. A and B must each hold at least
 * (gridDim.x * DIM_PORTION) * (gridDim.y * DIM_PORTION) floats.
 *
 * The threads of a block stride over their tile using blockDim, so the kernel
 * is correct for any block shape, e.g. (DIM_PORTION, LIGNES_BLOC) or
 * (DIM_PORTION, DIM_PORTION). The previous fixed row stride was only valid
 * for blockDim.y == LIGNES_BLOC and accessed rows outside the tile (and
 * outside the buffers in the last block row) for taller blocks.
 *
 * No shared memory is used.
 *
 * @param A source matrix (device pointer, read-only)
 * @param B destination matrix (device pointer)
 */
__global__ void copymat_device(const float* A, float* B){
    // Row length of both matrices, in elements.
    const size_t largeur = (size_t)gridDim.x * DIM_PORTION;
    // Top-left corner of the tile handled by this block.
    const size_t tile_col = (size_t)blockIdx.x * DIM_PORTION;
    const size_t tile_row = (size_t)blockIdx.y * DIM_PORTION;

    // Each thread walks the tile rows it owns, blockDim.y rows at a time.
    for (int j = threadIdx.y; j < DIM_PORTION; j += blockDim.y) {
        const size_t row_start = (tile_row + j) * largeur + tile_col;
        // Threads with consecutive threadIdx.x touch consecutive addresses.
        for (int i = threadIdx.x; i < DIM_PORTION; i += blockDim.x) {
            B[row_start + i] = A[row_start + i];
        }
    }
}
- // Code CPU
/**
 * Fills the n x n row-major matrix A with pseudo-random values obtained from
 * rand() divided by RAND_MAX (so each value lies in [0, 1]).
 *
 * @param A destination buffer holding at least n*n floats
 * @param n matrix dimension; nothing is written when n <= 0
 */
void genmat(float *A, int n){
    const float denom = (float) RAND_MAX;
    for (int row = 0; row < n; ++row) {
        // Start of the current row in the flat buffer.
        float *line = A + row * n;
        for (int col = 0; col < n; ++col) {
            line[col] = rand() / denom;
        }
    }
}
/**
 * Returns the largest absolute element-wise difference between two n x n
 * row-major matrices.
 *
 * Uses std::fabs explicitly: an unqualified abs() on a float only works when
 * a floating-point overload happens to be visible in the global namespace;
 * otherwise it binds to the integer abs() and truncates the difference.
 *
 * @param A first matrix (n*n floats)
 * @param B second matrix (n*n floats)
 * @param n matrix dimension
 * @return max |A[k] - B[k]| over all elements; 0 when n <= 0. A NaN
 *         difference never compares greater than the running maximum, so it
 *         does not change the result.
 */
float verify(const float *A, const float* B, int n){
    float error = 0.0f;
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            // size_t index avoids int overflow of i*n for large n.
            const size_t k = (size_t)i * n + j;
            error = std::max(error, std::fabs(A[k] - B[k]));
        }
    }
    return error;
}
- // Host code
/**
 * Host driver for the matrix-copy benchmark.
 *
 * Steps: allocate an n x n matrix on host and device, fill it with random
 * values, copy it on the GPU with copymat_device, check the result against
 * the source, then time nb kernel launches with CUDA events and print the
 * effective memory bandwidth.
 *
 * Fixes relative to the previous version:
 *  - the kernel is launched and its result copied back BEFORE verification
 *    (h_B used to be compared while still uninitialised);
 *  - blocks are DIM_PORTION x LIGNES_BLOC threads, matching the kernel's
 *    tile scheme (32x32 blocks made it access memory out of bounds);
 *  - bandwidth is 2*size bytes (one read + one write) per second, without
 *    the extra factor 4 and with milliseconds converted to seconds;
 *  - launch errors are checked and the timing events are destroyed.
 *
 * @return 0 on success (CUDA failures are handled by CUDA_SAFE_CALL)
 */
int main(int argc, char** argv){
    printf("Copie de matrice\n");

    // n must be a multiple of DIM_PORTION: the kernel has no bounds check.
    const int n = 256;
    const size_t count = (size_t)n * n;
    const size_t size = count * sizeof(float);

    // CUDA initialisation
    checkCUDA(0);

    // Host matrices
    float *h_A = new float[count];
    float *h_B = new float[count];

    // Device matrices
    float *d_A = NULL, *d_B = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_A, size));
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_B, size));

    // Initialise A on the host and upload it
    genmat(h_A, n);
    CUDA_SAFE_CALL(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));

    // Launch configuration: one block per DIM_PORTION x DIM_PORTION tile,
    // each block being DIM_PORTION columns by LIGNES_BLOC rows of threads.
    dim3 threadsPerBlock(DIM_PORTION, LIGNES_BLOC);
    dim3 numBlocks(n / DIM_PORTION, n / DIM_PORTION);

    // First launch: produces the result that gets verified (also a warm-up).
    copymat_device<<<numBlocks, threadsPerBlock>>>(d_A, d_B);
    CUDA_SAFE_CALL(cudaGetLastError());

    // Blocking copy of the result back to the host, then verification
    CUDA_SAFE_CALL(cudaMemcpy(h_B, d_B, size, cudaMemcpyDeviceToHost));
    printf("Erreur max: %e\n", verify(h_A, h_B, n));

    // Timing
    cudaEvent_t start, stop;
    CUDA_SAFE_CALL(cudaEventCreate(&start));
    CUDA_SAFE_CALL(cudaEventCreate(&stop));
    const int nb = 10;
    CUDA_SAFE_CALL(cudaEventRecord(start, 0));
    for (int i = 0; i < nb; i++)
        copymat_device<<<numBlocks, threadsPerBlock>>>(d_A, d_B);
    CUDA_SAFE_CALL(cudaEventRecord(stop, 0));
    CUDA_SAFE_CALL(cudaGetLastError());
    CUDA_SAFE_CALL(cudaEventSynchronize(stop));
    float t_ms;
    CUDA_SAFE_CALL(cudaEventElapsedTime(&t_ms, start, stop));
    t_ms /= nb;
    CUDA_SAFE_CALL(cudaEventDestroy(start));
    CUDA_SAFE_CALL(cudaEventDestroy(stop));

    // Each launch reads and writes the whole matrix: 2*size bytes moved.
    // t_ms is in milliseconds; 1073741824 = 2^30 bytes.
    const double seconds = t_ms * 1.0e-3;
    const double bande_passante = (2.0 * (double)size / seconds) / 1073741824.0;
    printf("Bande passante: %f Go/s\n", bande_passante);

    // Release device memory
    CUDA_SAFE_CALL(cudaFree(d_A));
    CUDA_SAFE_CALL(cudaFree(d_B));

    // Release host memory
    delete [] h_A;
    delete [] h_B;
    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement