// Untitled

// Includes
#include <stdio.h>
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <iostream>


// includes CUDA
#include <cuda_runtime.h>

#include "cudautil.cuh"
#include "cuda_call.h"

using namespace std;

const int DIM_PORTION = 32;


// GPU code (no shared memory)
/**
 * Naive matrix product C = A * B for square n x n matrices stored row-major
 * (element (row, col) lives at index row * n + col).
 *
 * Launch layout: a 2D grid of 2D blocks; each thread computes one element of
 * C, with the x dimension mapped to columns and the y dimension to rows.
 * Threads that fall outside the n x n matrix do nothing, so the grid may
 * overshoot the matrix. No shared memory is used.
 *
 * @param A  left operand, n * n floats
 * @param B  right operand, n * n floats
 * @param C  output, n * n floats
 * @param n  matrix dimension
 */
__global__ void gpu_matrix_mult(float *A, float *B, float *C, int n)
{
	// Derive the global coordinates from the actual launch configuration
	// rather than from a compile-time tile size.
	const int x = blockIdx.x * blockDim.x + threadIdx.x;	 // column
	const int y = blockIdx.y * blockDim.y + threadIdx.y;	 // row

	// Guard the tail: without it, threads past the matrix edge would read
	// and write out of bounds.
	if (x >= n || y >= n)
		return;

	// Widen before multiplying so row * n cannot overflow a 32-bit int.
	const size_t stride = (size_t)n;
	const size_t row_base = (size_t)y * stride;

	float sum_tmp = 0.0f;

	// Dot product of row y of A with column x of B.
	for (int k = 0; k < n; k += 1) {
		sum_tmp += A[row_base + k] * B[(size_t)k * stride + x];
	}

	C[row_base + x] = sum_tmp;
}


// CPU code
/**
 * Fills the n x n row-major matrix A with deterministic test data:
 * element (row, col) receives row * col / n / n.
 *
 * @param A  destination buffer holding at least n * n floats
 * @param n  matrix dimension
 */
void genmat(float *A, int n)
{
	const float dim = (float)n;

	for (int row = 0; row < n; ++row) {
		float *line = A + row * n;

		// The product is formed in integer arithmetic, then divided twice by n.
		for (int col = 0; col < n; ++col)
			line[col] = row * col / dim / dim;
	}
}

/**
 * Prints the n x n row-major matrix A to standard output, one matrix row per
 * text line, with values separated by tabs.
 *
 * @param A  matrix to print, n * n floats
 * @param n  matrix dimension
 */
void dispmat(float *A, int n) {
	for (int row = 0; row < n; ++row) {
		const float *line = A + row * n;

		for (int col = 0; col < n; ++col) {
			// The last value of a row ends the line; the others are tab-separated.
			const char *sep = (col + 1 == n) ? "\n" : "\t";
			printf("%f%s", line[col], sep);
		}
	}
}

/**
 * Returns the largest absolute element-wise difference between two n x n
 * matrices.
 *
 * If any difference is NaN (for example because one of the inputs holds a
 * NaN), NaN is returned so that a corrupted result cannot be mistaken for a
 * perfect match.
 *
 * @param A  first matrix, n * n floats
 * @param B  second matrix, n * n floats
 * @param n  matrix dimension; values <= 0 yield 0
 * @return   max |A[i] - B[i]| over all elements, or NaN as described above
 */
float verify(const float *A, const float* B, int n)
{
	float error = 0.0f;

	// Both matrices are contiguous, so one flat pass visits every element.
	const size_t count = (n > 0) ? (size_t)n * (size_t)n : (size_t)0;

	for (size_t idx = 0; idx < count; ++idx) {
		// std::fabs keeps the computation in floating point; an unqualified
		// abs may resolve to the integer overload and truncate the difference.
		const float diff = std::fabs(A[idx] - B[idx]);

		// std::max would silently discard a NaN operand, so report it here.
		if (std::isnan(diff))
			return diff;

		error = (std::max)(error, diff);
	}

	return error;
}


/**
 * Reference CPU implementation of h_C = h_A * h_B for square N x N row-major
 * matrices. Prints a progress line ("x = <row>") every 256 rows.
 *
 * @param h_A  left operand, N * N floats
 * @param h_B  right operand, N * N floats
 * @param h_C  output, N * N floats
 * @param N    matrix dimension
 */
void cpu_matrix_mult(float *h_A, float *h_B, float *h_C, int N) {

	for (int row = 0; row < N; ++row) {  // output row

		// Progress trace.
		if (row % 256 == 0)
		   printf("x = %d\n", row);

		const float *a_line = h_A + row * N;
		float *c_line = h_C + row * N;

		for (int col = 0; col < N; ++col) {  // output column

			const float *b_col = h_B + col;
			float acc = 0.f;

			// Dot product of row `row` of h_A with column `col` of h_B.
			for (int k = 0; k < N; ++k)
				acc += a_line[k] * b_col[k * N];

			c_line[col] = acc;
		}
	}
}


// Host code
/**
 * Benchmark driver: builds two n x n test matrices, multiplies them on the
 * CPU and on the GPU, prints the maximum absolute difference between the two
 * results, then times the GPU kernel with CUDA events and prints the elapsed
 * time and the achieved floating-point rate.
 *
 * @return 0 on success, 1 if the CUDA device cannot be queried or the matrix
 *         size does not match the launch tiling
 */
int main(void) {

	int dev;
	cudaError_t check = cudaGetDevice(&dev);

	// Without a usable device nothing below can work: report and stop.
	if (check != cudaSuccess) {
		fprintf(stderr, "cudaGetDevice failed: %s\n", cudaGetErrorString(check));
		return 1;
	}
	printf("cudaSuccess\n");


	const int n = 2048;

	// The grid below is sized as n / DIM_PORTION blocks per dimension, which
	// covers the whole matrix only when n is a multiple of DIM_PORTION.
	if (n % DIM_PORTION != 0) {
		fprintf(stderr, "n (%d) must be a multiple of DIM_PORTION (%d)\n", n, DIM_PORTION);
		return 1;
	}

	// Widen before multiplying so the element and byte counts are computed in size_t.
	const size_t count = (size_t)n * (size_t)n;
	const size_t size = count * sizeof(float);

	// CUDA initialisation
	//checkCUDA(0);

	// CPU matrices
	float *h_A = NULL, *h_B = NULL, *h_Cgpu = NULL, *h_Ccpu = NULL;

	// GPU matrices
	float *d_A = NULL, *d_B = NULL, *d_C = NULL;

	// Allocate the matrices in CPU memory
	h_A = new float[count];
	h_B = new float[count];
	h_Cgpu = new float[count];
	h_Ccpu = new float[count];


	// Allocate the matrices in GPU memory
	CUDA_SAFE_CALL(cudaMalloc((void **)&d_A, size));
	CUDA_SAFE_CALL(cudaMalloc((void **)&d_B, size));
	CUDA_SAFE_CALL(cudaMalloc((void **)&d_C, size));

	// Initialise matrix A
	genmat(h_A, n);


	// Initialise matrix B
	genmat(h_B, n);

	// CPU reference multiplication
	cpu_matrix_mult(h_A, h_B, h_Ccpu, n);

	// Copy matrix A to GPU memory
	CUDA_SAFE_CALL(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));

	// Copy matrix B to GPU memory
	CUDA_SAFE_CALL(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

	// Kernel launch
	dim3 dimBlock(DIM_PORTION, DIM_PORTION);
	dim3 dimGrid(n / DIM_PORTION, n / DIM_PORTION);

 	gpu_matrix_mult <<<dimGrid, dimBlock >>>(d_A, d_B, d_C, n);
	// A launch does not return a status; query it explicitly.
	CUDA_SAFE_CALL(cudaGetLastError());

	// Copy the result back (blocking copy, so the kernel has finished when it returns)
	CUDA_SAFE_CALL(cudaMemcpy(h_Cgpu, d_C, size, cudaMemcpyDeviceToHost));


	// Verification
	printf("Erreur max: %e\n", verify(h_Ccpu, h_Cgpu, n));


	// Timing
	cudaEvent_t start, stop;
	CUDA_SAFE_CALL(cudaEventCreate(&start));
	CUDA_SAFE_CALL(cudaEventCreate(&stop));

	const int nb = 1;
	CUDA_SAFE_CALL(cudaEventRecord(start, 0));

	for (int i = 0; i < nb; i++)
		gpu_matrix_mult <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, n);
	CUDA_SAFE_CALL(cudaGetLastError());


	CUDA_SAFE_CALL(cudaEventRecord(stop, 0));
	CUDA_SAFE_CALL(cudaEventSynchronize(stop));

	float t_ms;
	CUDA_SAFE_CALL(cudaEventElapsedTime(&t_ms, start, stop));
	t_ms /= nb;

	CUDA_SAFE_CALL(cudaEventDestroy(start));
	CUDA_SAFE_CALL(cudaEventDestroy(stop));


	printf("Time for the kernel: %f ms\n", t_ms);	// Print Elapsed time
	// An n x n product performs n^3 multiply-add pairs, i.e. 2 * n^3 floating-point
	// operations; dividing by milliseconds and then by 1e6 yields GFlop/s.
	printf(" Performance: %f\tGFlops/s\n", (2.0 * n * n * n / t_ms / 1e6));


	// Release GPU memory
	CUDA_SAFE_CALL(cudaFree(d_A));
	CUDA_SAFE_CALL(cudaFree(d_B));
	CUDA_SAFE_CALL(cudaFree(d_C));

	// Release CPU memory
	delete[] h_A;
	delete[] h_B;
	delete[] h_Cgpu;
	delete[] h_Ccpu;

	system("PAUSE");

	return 0;
}