// Untitled

// Includes
#include <stdio.h>
#include <algorithm>
#include <cmath>
#include <ctime>
#include <iostream>

// includes CUDA
#include <cuda_runtime.h>

#include "../include/cudautil.cu"
#include "../include/cuda_call.h"

// Side length, in elements, of the square tile that one thread block of
// copymat_device covers; the kernel also derives the matrix row length as
// gridDim.x * DIM_PORTION.
#define DIM_PORTION 32
// Number of tile rows handled per pass over a tile, so a tile takes
// DIM_PORTION / LIGNES_BLOC passes.
// NOTE(review): this presumes the kernel is launched with
// blockDim.y == LIGNES_BLOC -- confirm at the launch site.
#define LIGNES_BLOC 8

// Code GPU
/**
 * Copies matrix A into matrix B, one DIM_PORTION x DIM_PORTION tile per block.
 *
 * Launch layout: a 2D grid with one block per tile. The matrix row length is
 * taken to be gridDim.x * DIM_PORTION elements and the kernel touches
 * gridDim.y * DIM_PORTION rows, so both buffers must hold at least
 * (gridDim.x * DIM_PORTION) * (gridDim.y * DIM_PORTION) floats. There is no
 * size parameter and therefore no bounds check against the real matrix size.
 *
 * The intended block shape is (DIM_PORTION, LIGNES_BLOC), but any 2D block
 * shape works: each thread strides through its tile by blockDim.x / blockDim.y,
 * so every tile element is copied exactly once and no thread ever leaves its
 * own tile. (Stepping by a fixed row count instead would make threads with
 * threadIdx.y beyond that count run past the tile, and past the end of the
 * buffers in the last block row.)
 *
 * @param A source matrix, row-major, device pointer
 * @param B destination matrix, same layout as A, device pointer
 */
__global__ void copymat_device(const float* A, float* B){
	// Row length in elements, derived from the launch grid.
	const int largeur = gridDim.x * DIM_PORTION;

	// Top-left corner (column, row) of the tile owned by this block.
	const int x0 = blockIdx.x * DIM_PORTION;
	const int y0 = blockIdx.y * DIM_PORTION;

	// threadIdx.x maps to consecutive columns, so the threads of a warp
	// access consecutive addresses within a row.
	for (int ty = threadIdx.y; ty < DIM_PORTION; ty += blockDim.y) {
		for (int tx = threadIdx.x; tx < DIM_PORTION; tx += blockDim.x) {
			// index = row * row_length + column
			const int idx = (y0 + ty) * largeur + (x0 + tx);
			B[idx] = A[idx];
		}
	}
}


// Code CPU
/**
 * Fills the n x n matrix A with pseudo-random values in [0, 1].
 *
 * Elements are written in increasing memory order (row after row), one
 * rand() call per element, so the result depends only on the current
 * rand() seed.
 *
 * @param A destination buffer holding at least n*n floats
 * @param n matrix dimension; nothing is written when n <= 0
 */
void genmat(float *A, int n){
	float *cursor = A;
	for (int row = 0; row < n; ++row) {
		// Walk one full row through the running pointer.
		float *const row_end = cursor + n;
		while (cursor != row_end) {
			*cursor = rand()/(float) RAND_MAX;
			++cursor;
		}
	}
}


/**
 * Returns the largest absolute element-wise difference between two n x n
 * matrices stored row-major.
 *
 * @param A first matrix (n*n floats)
 * @param B second matrix (n*n floats)
 * @param n matrix dimension
 * @return max over all (i, j) of |A[i][j] - B[i][j]|, or 0 when n <= 0
 */
float verify(const float *A, const float* B, int n){
	float error = 0.0f;
	for (int i=0; i<n; i++)
		for (int j=0; j<n; j++)
			// std::fabs keeps the difference in floating point. An unqualified
			// abs() may bind to the C int abs(int), which truncates every
			// error below 1.0 to 0 and makes the check always pass.
			error = std::max(error, std::fabs(A[i*n + j] - B[i*n + j]));

	return error;
}

// Host code
/**
 * Host driver: copies a random n x n matrix from one device buffer to another
 * with copymat_device, times nb launches with CUDA events, then checks the
 * device result against the host matrix and prints the maximum error and the
 * measured bandwidth.
 *
 * @return 0 on success, 1 if n is not a positive multiple of DIM_PORTION
 */
int main(int argc, char** argv){
	printf("Copie de  matrice\n");
	int n = 256;
	// Widen before multiplying so the byte count cannot overflow int.
	size_t size = (size_t)n*n*sizeof(float);

	// The kernel has no bounds check: it copies whole DIM_PORTION-wide tiles,
	// so the matrix dimension must be an exact multiple of the tile size.
	if (n <= 0 || n % DIM_PORTION != 0) {
		fprintf(stderr, "Erreur: n (%d) doit etre un multiple de %d\n", n, DIM_PORTION);
		return 1;
	}

	// CUDA initialisation
	checkCUDA(0);

	// Host matrices
	float *h_A = NULL, *h_B = NULL;
	// Device matrices
	float *d_A = NULL, *d_B = NULL;

	// Allocate the matrices in host memory
	h_A = new float[(size_t)n*n];
	h_B = new float[(size_t)n*n];

	// Allocate the matrices in device memory
	CUDA_SAFE_CALL( cudaMalloc((void **)&d_A, size));
	CUDA_SAFE_CALL( cudaMalloc((void **)&d_B, size));

	// Initialise matrix A
	genmat(h_A, n);

	// Copy matrix A to device memory
	CUDA_SAFE_CALL(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));

	// Launch configuration: one block per DIM_PORTION x DIM_PORTION tile, with
	// DIM_PORTION x LIGNES_BLOC threads; each thread copies
	// DIM_PORTION / LIGNES_BLOC rows of its column, which is the layout the
	// kernel's row loop is written for.
	dim3 threadsPerBlock(DIM_PORTION, LIGNES_BLOC);
	dim3 numBlocks(n / DIM_PORTION, n / DIM_PORTION);

	// Timing
	cudaEvent_t start, stop;
	CUDA_SAFE_CALL(cudaEventCreate(&start));
	CUDA_SAFE_CALL(cudaEventCreate(&stop));

	const int nb = 10;
	CUDA_SAFE_CALL(cudaEventRecord(start, 0));
	for (int i = 0; i < nb; i++)
		copymat_device<<<numBlocks, threadsPerBlock>>>(d_A, d_B);

	CUDA_SAFE_CALL(cudaEventRecord(stop, 0));
	// Kernel launches do not return a status; pick up launch errors here.
	CUDA_SAFE_CALL(cudaGetLastError());
	CUDA_SAFE_CALL(cudaEventSynchronize(stop));
	float t_ms;
	CUDA_SAFE_CALL(cudaEventElapsedTime(&t_ms, start, stop));
	t_ms /= nb;

	// Copy the result back to the host
	CUDA_SAFE_CALL(cudaMemcpy(h_B, d_B, size, cudaMemcpyDeviceToHost));

	// Verification: only meaningful once h_B holds the device result.
	printf("Erreur max: %e\n", verify(h_A, h_B, n));

	// Each launch reads `size` bytes and writes `size` bytes. t_ms is the
	// average time per launch in milliseconds, so convert it to seconds
	// before dividing; 1073741824 = 2^30 bytes.
	printf("Bande passante: %f  Go/s\n", (2.0*size/(t_ms*1e-3))/1073741824.0);

	// Release the timing events
	CUDA_SAFE_CALL(cudaEventDestroy(start));
	CUDA_SAFE_CALL(cudaEventDestroy(stop));

	// Free device memory
	CUDA_SAFE_CALL(cudaFree (d_A));
	CUDA_SAFE_CALL(cudaFree (d_B));

	// Free host memory
	delete [] h_A;
	delete [] h_B;

	return 0;
}