Untitled (Mickey13, Nov 17th, 2017)
// Includes
#include <stdio.h>
#include <cstdlib>
#include <ctime>
#include <cmath>
#include <iostream>
#include <algorithm>

// CUDA includes
#include <cuda_runtime.h>

#include "cudautil.cuh"
#include "cuda_call.h"

using namespace std;

const int DIM_PORTION = 32;   // width and height of the tile handled by one block
const int LIGNES_BLOC = 8;    // number of thread rows per block

// GPU code
__global__ void copymat_device(const float* A, float* B)
{
    int x = blockIdx.x * DIM_PORTION + threadIdx.x;
    int y = blockIdx.y * DIM_PORTION + threadIdx.y;
    int largeur = gridDim.x * DIM_PORTION;   // matrix width in elements

    // Each thread copies DIM_PORTION / LIGNES_BLOC rows of its tile,
    // stepping LIGNES_BLOC rows at a time.
    for (int j = 0; j < DIM_PORTION; j += LIGNES_BLOC) {
        B[(y + j) * largeur + x] = A[(y + j) * largeur + x];
    }
}

// CPU code
void genmat(float *A, int n)
{
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            A[i*n + j] = rand() / (float) RAND_MAX;
}

float verify(const float *A, const float *B, int n)
{
    float error = 0;
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            error = (std::max)(error, std::abs(A[i*n + j] - B[i*n + j]));

    return error;
}

// Host code
int main(void)
{
    int dev;
    cudaError_t check = cudaGetDevice(&dev);

    if (check == cudaSuccess)
        printf("OK\n");

    printf("Matrix copy\n");

    // n must not exceed 256 (memory allocation issue)
    int n = 256;
    size_t size = n*n*sizeof(float);

    // CUDA initialization
    //checkCUDA(0);

    // CPU matrices
    float *h_A = NULL, *h_B = NULL;

    // GPU matrices
    float *d_A = NULL, *d_B = NULL;

    // Allocate the vectors in CPU memory
    h_A = new float[n*n];
    h_B = new float[n*n];
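
    // Optional sketch, not in the original: seed the host RNG so each run
    // generates a different matrix; rand() otherwise always starts from the
    // same default seed (<ctime> is already included above).
    srand((unsigned) time(NULL));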

    // Allocate the vectors in GPU memory
    CUDA_SAFE_CALL(cudaMalloc((void **) &d_A, size));
    CUDA_SAFE_CALL(cudaMalloc((void **) &d_B, size));

    // Initialize matrix A
    genmat(h_A, n);

    // Copy matrix A to GPU memory
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);

    // Kernel launch: each block covers a DIM_PORTION x DIM_PORTION tile
    // with DIM_PORTION x LIGNES_BLOC threads.
    dim3 threadsPerBlock(DIM_PORTION, LIGNES_BLOC);
    dim3 numBlocks(n / DIM_PORTION, n / DIM_PORTION);

    copymat_device<<<numBlocks, threadsPerBlock>>>(d_A, d_B);
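
    // Optional sketch, not in the original: check that the launch succeeded
    // before reading back the result, assuming CUDA_SAFE_CALL accepts any
    // cudaError_t as it does elsewhere in this file.
    CUDA_SAFE_CALL(cudaGetLastError());        // launch/configuration errors
    CUDA_SAFE_CALL(cudaDeviceSynchronize());   // errors during kernel execution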

    // Copy the result back
    cudaMemcpy(h_B, d_B, size, cudaMemcpyDeviceToHost);

    // Verification
    printf("Max error: %e\n", verify(h_A, h_B, n));

    // Timing
    cudaEvent_t start, stop;
    CUDA_SAFE_CALL(cudaEventCreate(&start));
    CUDA_SAFE_CALL(cudaEventCreate(&stop));

    const int nb = 10;
    CUDA_SAFE_CALL(cudaEventRecord(start, 0));

    for (int i = 0; i < nb; i++) {
        copymat_device<<<numBlocks, threadsPerBlock>>>(d_A, d_B);
    }

    CUDA_SAFE_CALL(cudaEventRecord(stop, 0));
    CUDA_SAFE_CALL(cudaEventSynchronize(stop));

    float t_ms;
    cudaEventElapsedTime(&t_ms, start, stop);
    t_ms /= nb;   // average time per kernel launch

    printf("Time for the kernel: %f ms\n", t_ms);
    // Each copy reads and writes `size` bytes; t_ms is in milliseconds,
    // hence the 1e-3 factor to obtain bytes per second.
    printf("Bandwidth: %f GiB/s\n",
           (2.0 * size / (t_ms * 1e-3)) / (1024.0 * 1024.0 * 1024.0));

    // Free GPU memory
    CUDA_SAFE_CALL(cudaFree(d_A));
    CUDA_SAFE_CALL(cudaFree(d_B));
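
    // Optional sketch, not in the original: release the timing events as well.
    CUDA_SAFE_CALL(cudaEventDestroy(start));
    CUDA_SAFE_CALL(cudaEventDestroy(stop));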

    // Free CPU memory
    delete [] h_A;
    delete [] h_B;
}