// Untitled

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>


// Measured kernel time: 652.052 us - naive, 1.942 ms - shared memory
// CPU: 5 ms
// memcpy - H to D : 10.368 us, 5 calls, total: 86,008 kB
// memcpy - D to H : 3.904 us, 2 calls, total: 32,768 kB
// Total: 450.74 ms.

// Debug switches: uncomment to dump the corresponding buffer to stdout.
//#define COMP_SW__PRINT_SIGNAL
//#define COMP_SW__PRINT_TEMPLATE
//#define COMP_SW__PRINT_TEMPLATE_PADD
//#define COMP_SW__PRINT_MEDII_PADD

// Dump the CPU / GPU cross-correlation results to stdout.
#define COMP_SW__PRINT_RESULT_CPU
#define COMP_SW__PRINT_RESULT_GPU


// Number of samples in the signal, in the template and in the cross-correlation result.
#define DIM_SIGNAL      (2559u)
#define DIM_TEMPLATE    (512u)
#define DIM_XCORR       (DIM_SIGNAL - DIM_TEMPLATE + 1)

// The launch uses NO_BLOCKS * NO_THREADS_BLOCK threads, one per output sample.
// That product equals DIM_XCORR only when DIM_SIGNAL % DIM_TEMPLATE == DIM_TEMPLATE - 1,
// so the sizes are validated at compile time.
#if ((DIM_SIGNAL % DIM_TEMPLATE) != (DIM_TEMPLATE - 1))
#error "Choose a different signal / template size: DIM_SIGNAL % DIM_TEMPLATE must equal DIM_TEMPLATE - 1."
#endif

#define NO_THREADS_BLOCK	DIM_TEMPLATE
#define NO_BLOCKS			(DIM_SIGNAL / DIM_TEMPLATE)
// NO_BLOCKS = DIM_SIGNAL / DIM_TEMPLATE, valid for DIM_SIGNAL % DIM_TEMPLATE == (DIM_TEMPLATE - 1)

// Draws one value from rand() and maps it linearly from [0, RAND_MAX] onto [min, max].
double getRandom(double min, double max)
{
	// Number of rand() steps that correspond to one unit of the output interval.
	const double stepsPerUnit = RAND_MAX / (max - min);
	return min + ((double)rand() / stepsPerUnit);
}
//_______________________________________________________________________
// Template matching with cross-correlation --- sequential          (CPU)
//_______________________________________________________________________

// Sequential reference implementation of normalized cross-correlation.
//
// For every offset t in [0, dimSig - dimTem] computes
//   xcorr[t] = sum((sig[t+i] - medSig) * (tem[i] - medTem))
//              / (sqrt(sum((sig[t+i] - medSig)^2)) * sqrt(sum((tem[i] - medTem)^2)))
// with i running over [0, dimTem).
//
// xcorr   output, must hold at least dimSig - dimTem + 1 values
// sig     signal samples, dimSig values
// tem     template samples, dimTem values
// medSig  mean subtracted from every signal sample
// medTem  mean subtracted from every template sample
//
// Nothing is written when dimTem is 0 or larger than dimSig.
void CPU_xCorr(double* xcorr, double* sig, double* tem, double medSig, double medTem, unsigned int dimTem, unsigned int dimSig)
{
	// dimSig - dimTem is unsigned arithmetic: without this guard a template longer
	// than the signal wraps around and the loop below runs far out of bounds.
	if (dimTem == 0u || dimTem > dimSig)
	{
		return;
	}

	// The template term of the denominator does not depend on the offset, so it is computed once.
	double sumNumitor_T = 0.0;
	for (unsigned int i = 0; i < dimTem; ++i)
	{
		double l_T = tem[i] - medTem;
		sumNumitor_T += l_T * l_T;
	}
	const double normTem = sqrt(sumNumitor_T);

	const unsigned int lastOffset = dimSig - dimTem;
	for (unsigned int t = 0; t <= lastOffset; ++t)
	{
		double sumNumarator = 0.0;
		double sumNumitor_S = 0.0;

		for (unsigned int i = 0; i < dimTem; ++i)
		{
			double l_S = sig[i + t] - medSig;
			double l_T = tem[i] - medTem;

			sumNumarator += l_S * l_T;
			sumNumitor_S += l_S * l_S;
		}
		xcorr[t] = sumNumarator / (sqrt(sumNumitor_S) * normTem);
	}
}
//_______________________________________________________________________
// Template matching with cross-correlation --- parallel            (GPU)
//_______________________________________________________________________

// Normalized cross-correlation straight from global memory, one thread per output offset.
//
// Thread (blockDim.x * blockIdx.x + threadIdx.x) computes xcorr[offset] over the window
// sig[offset .. offset + dimTem - 1]; only the x dimension of the launch is used.
// Threads whose offset lies past dimSig - dimTem do nothing, so the grid may be
// larger than the number of outputs.
//
// xcorr   output, at least dimSig - dimTem + 1 values
// sig     signal samples, dimSig values
// tem     template samples, at least dimTem values
// medSig  mean subtracted from every signal sample
// medTem  mean subtracted from every template sample
__global__ void naive__GPU_xCorr_Template_Matching(double* xcorr, double* sig, double* tem, double medSig, double medTem, unsigned int dimTem, unsigned int dimSig)
{
	const unsigned int thrd_offs = blockDim.x * blockIdx.x + threadIdx.x;

	// The first two tests keep the unsigned subtraction from wrapping around;
	// the last one drops surplus threads at the tail of the grid.
	if (dimTem == 0u || dimTem > dimSig || thrd_offs > dimSig - dimTem)
	{
		return;
	}

	double sumNumarator = 0.0;
	double sumNumitor_S = 0.0;
	double sumNumitor_T = 0.0;

	for (unsigned int i = 0; i < dimTem; ++i)
	{
		double l_S = sig[thrd_offs + i] - medSig;
		double l_T = tem[i] - medTem;

		sumNumarator += l_S * l_T;
		sumNumitor_S += l_S * l_S;
		sumNumitor_T += l_T * l_T;
	}
	xcorr[thrd_offs] = sumNumarator / (sqrt(sumNumitor_S) * sqrt(sumNumitor_T));
}

// Normalized cross-correlation using a shared-memory tile, one thread per output offset.
//
// Each block stages into shared memory
//   - the 2 * DIM_TEMPLATE - 1 signal samples starting at its first offset
//     (enough for DIM_TEMPLATE overlapping windows of DIM_TEMPLATE samples), and
//   - one copy of the template (the first DIM_TEMPLATE values of `tem`).
// Thread t of the block then correlates tile[t .. t + DIM_TEMPLATE - 1] with the template,
// accumulating in the same order as the CPU reference.
//
// Launch requirements: 1D grid and block, blockDim.x <= DIM_TEMPLATE.
// Static shared memory: (3 * DIM_TEMPLATE - 1) doubles per block.
//
// xcorr   output, at least DIM_XCORR values
// sig     signal samples, DIM_SIGNAL values
// tem     template samples; only the first DIM_TEMPLATE values are read
// medSig  per-thread signal mean, indexed by the global thread offset
// medTem  per-thread template mean, indexed by the global thread offset
__global__ void shamem__GPU_xCorr_Template_Matching(double* xcorr, double* sig, double* tem, double* medSig, double* medTem)
{
	__shared__ double sham_sig[2 * DIM_TEMPLATE - 1];
	__shared__ double sham_tem[DIM_TEMPLATE];

	const unsigned int blk_offs = blockDim.x * blockIdx.x;
	const unsigned int thrd_offs = blk_offs + threadIdx.x;

	// Cooperative load of the signal tile: tile[k] = sig[blk_offs + k].
	// Positions past the end of the signal are zero-filled; they are only ever
	// touched by windows that belong to out-of-range offsets.
	for (unsigned int k = threadIdx.x; k < 2 * DIM_TEMPLATE - 1; k += blockDim.x)
	{
		const unsigned int g = blk_offs + k;
		sham_sig[k] = (g < DIM_SIGNAL) ? sig[g] : 0.0;
	}
	// Cooperative load of the template.
	for (unsigned int k = threadIdx.x; k < DIM_TEMPLATE; k += blockDim.x)
	{
		sham_tem[k] = tem[k];
	}
	// Every thread of the block reaches this barrier before anyone reads the tiles.
	__syncthreads();

	// Surplus threads leave only after the barrier above.
	if (threadIdx.x >= DIM_TEMPLATE || thrd_offs >= DIM_XCORR)
	{
		return;
	}

	// The means are read once per thread and kept in registers.
	const double l_medSig = medSig[thrd_offs];
	const double l_medTem = medTem[thrd_offs];

	double sumNumarator = 0.0;
	double sumNumitor_S = 0.0;
	double sumNumitor_T = 0.0;

	for (unsigned int i = 0; i < DIM_TEMPLATE; ++i)
	{
		double l_S = sham_sig[threadIdx.x + i] - l_medSig;
		double l_T = sham_tem[i] - l_medTem;

		sumNumarator += l_S * l_T;
		sumNumitor_S += l_S * l_S;
		sumNumitor_T += l_T * l_T;
	}

	xcorr[thrd_offs] = sumNumarator / (sqrt(sumNumitor_S) * sqrt(sumNumitor_T));
}

// Reports a failed CUDA runtime call on stderr. Returns 1 on cudaSuccess, 0 otherwise.
static int cudaCallSucceeded(cudaError_t status, const char* what)
{
	if (status == cudaSuccess)
	{
		return 1;
	}
	fprintf(stderr, "%s (%s)\n", what, cudaGetErrorString(status));
	return 0;
}

// Returns 1 when the first `count` values of both arrays agree within `tol`, 0 otherwise.
static int xcorrResultsMatch(const double* lhs, const double* rhs, unsigned int count, double tol)
{
	for (unsigned int i = 0; i < count; ++i)
	{
		// Negated form so that a NaN on either side counts as a mismatch.
		if (!(fabs(lhs[i] - rhs[i]) <= tol))
		{
			return 0;
		}
	}
	return 1;
}

// Benchmark driver: builds a random signal and template, runs the shared-memory kernel,
// the CPU reference and the naive kernel, prints the timings and compares the results.
// Returns EXIT_SUCCESS only when the naive kernel result matches the CPU reference.
int main(void)
{
	//_______________________________________________________________________
	// Launch configuration: NO_THREADS_BLOCK threads per block, NO_BLOCKS blocks.
	//_______________________________________________________________________
	dim3 dimGrid(NO_BLOCKS);
	dim3 dimBlock(NO_THREADS_BLOCK);

	// Every local is declared here, ahead of the first `goto Cleanup`,
	// so that no jump crosses an initialisation.
	const double minn = -10.0;
	const double maxx = 10.0;
	// Host and device may round differently (e.g. fused multiply-add on the GPU),
	// so results are compared with a tolerance instead of bit-exact equality.
	const double tolerance = 1e-9;
	const size_t bytesSignal = sizeof(double) * DIM_SIGNAL;
	const size_t bytesPadded = sizeof(double) * DIM_TEMPLATE * NO_BLOCKS;
	const size_t bytesXcorr = sizeof(double) * DIM_XCORR;

	int exitCode = EXIT_FAILURE;
	cudaEvent_t start = NULL, stop = NULL;
	clock_t cpu_start;
	float elapsedTime = 0.0f;
	double meanSignal = 0.0;
	double meanTemplate = 0.0;

	double *dev_signal = NULL, *hos_signal = NULL;
	double *dev_templa = NULL, *hos_templa = NULL;
	double *dev_xcorr = NULL, *hos_xcorr = NULL, *hos_xcorr_compare = NULL;
	double *dev_medSignal = NULL, *dev_medTemplate = NULL;

	double hos_medTemplate[DIM_TEMPLATE * NO_BLOCKS];
	double hos_medSignal[DIM_TEMPLATE * NO_BLOCKS];

	srand((unsigned int)time(NULL));

	//_______________________________________________________________________
	// Allocate host and device memory for: signal, template, result, means.
	//_______________________________________________________________________
	hos_signal = (double*)malloc(bytesSignal);
	hos_templa = (double*)malloc(bytesPadded);
	hos_xcorr = (double*)malloc(bytesXcorr);
	hos_xcorr_compare = (double*)malloc(bytesXcorr);
	if (hos_signal == NULL || hos_templa == NULL || hos_xcorr == NULL || hos_xcorr_compare == NULL)
	{
		fprintf(stderr, "malloc failed! - host buffers\n");
		goto Cleanup;
	}

	printf("\ndim_template: %u\n", DIM_TEMPLATE);
	printf("\ndim_threads_per_block: %u\n", NO_THREADS_BLOCK);
	printf("\ndim_grid: %u\n", NO_BLOCKS);
	printf("\ndim_xcorr: %u\n", DIM_XCORR);

	if (!cudaCallSucceeded(cudaSetDevice(0), "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?")) goto Cleanup;

	if (!cudaCallSucceeded(cudaEventCreate(&start), "cudaEventCreate failed! - start")) goto Cleanup;
	if (!cudaCallSucceeded(cudaEventCreate(&stop), "cudaEventCreate failed! - stop")) goto Cleanup;

	if (!cudaCallSucceeded(cudaMalloc((void**)&dev_signal, bytesSignal), "cudaMalloc failed! - device signal")) goto Cleanup;
	if (!cudaCallSucceeded(cudaMalloc((void**)&dev_templa, bytesPadded), "cudaMalloc failed! - device template")) goto Cleanup;
	if (!cudaCallSucceeded(cudaMalloc((void**)&dev_xcorr, bytesXcorr), "cudaMalloc failed! - device xcorr")) goto Cleanup;
	if (!cudaCallSucceeded(cudaMalloc((void**)&dev_medTemplate, bytesPadded), "cudaMalloc failed! - device medTemplate")) goto Cleanup;
	if (!cudaCallSucceeded(cudaMalloc((void**)&dev_medSignal, bytesPadded), "cudaMalloc failed! - device medSignal")) goto Cleanup;

	//_______________________________________________________________________
	// Fill the signal with random values in [minn, maxx] and compute its mean.
	//_______________________________________________________________________
	for (unsigned int i = 0; i < DIM_SIGNAL; i++)
	{
		hos_signal[i] = getRandom(minn, maxx);
		meanSignal += hos_signal[i];
	}
	meanSignal /= (double)DIM_SIGNAL;
	printf("\nmedia semnal: %f\n", meanSignal);
	//_______________________________________________________________________
	// Fill the template with random values in [-1, 1] and compute its mean.
	//_______________________________________________________________________
	for (unsigned int i = 0; i < DIM_TEMPLATE; i++)
	{
		hos_templa[i] = getRandom(-1.0, 1.0);
		meanTemplate += hos_templa[i];
	}
	meanTemplate /= (double)DIM_TEMPLATE;
	printf("\nmedia template: %f\n", meanTemplate);

#ifdef COMP_SW__PRINT_SIGNAL
	// Print the signal.
	{
		printf("SIGNAL: RAND(%f, %f) :\n\n", minn, maxx);
		for (unsigned int i = 0; i < DIM_SIGNAL; ++i)
		{
			if (i % 10 == 0) printf("\n");
			printf("%f ", hos_signal[i]);
		}
		printf("\n\n");
	}
#endif
#ifdef COMP_SW__PRINT_TEMPLATE
	// Print the template.
	{
		printf("TEMPLATE: RAND(-1, 1) :\n\n");
		for (unsigned int i = 0; i < DIM_TEMPLATE; ++i)
		{
			if (i % 10 == 0) printf("\n");
			printf("%f ", hos_templa[i]);
		}
		printf("\n\n");
	}
#endif
	// Template padding: repeat the template once per block.
	for (unsigned int i = DIM_TEMPLATE; i < DIM_TEMPLATE * NO_BLOCKS; i++)
	{
		hos_templa[i] = hos_templa[i % DIM_TEMPLATE];
	}
	// Mean padding: one copy of each mean per thread of the grid.
	for (unsigned int i = 0; i < DIM_TEMPLATE * NO_BLOCKS; i++)
	{
		hos_medSignal[i] = meanSignal;
		hos_medTemplate[i] = meanTemplate;
	}
#ifdef COMP_SW__PRINT_TEMPLATE_PADD
	// Print the padded template.
	{
		printf("TEMPLATE padded: RAND(-1, 1) :\n\n");
		for (unsigned int i = 0; i < DIM_TEMPLATE * NO_BLOCKS; ++i)
		{
			if (i % 10 == 0) printf("\n");
			printf("%f ", hos_templa[i]);
		}
		printf("\n\n");
	}
#endif
#ifdef COMP_SW__PRINT_MEDII_PADD
	// Print the padded means.
	{
		printf("Medii: Signal:\n\n");
		for (unsigned int i = 0; i < DIM_TEMPLATE; ++i)
		{
			if (i % 10 == 0) printf("\n");
			printf("%f ", hos_medSignal[i]);
		}
		printf("\n\n");

		printf("Medii: Template:\n\n");
		for (unsigned int i = 0; i < DIM_TEMPLATE; ++i)
		{
			if (i % 10 == 0) printf("\n");
			printf("%f ", hos_medTemplate[i]);
		}
		printf("\n\n");
	}
#endif

	//_______________________________________________________________________
	// Send the input data to the device (GPU).
	//_______________________________________________________________________
	if (!cudaCallSucceeded(cudaMemcpy(dev_signal, hos_signal, bytesSignal, cudaMemcpyHostToDevice), "cudaMemcpy failed! - to device, signal")) goto Cleanup;
	if (!cudaCallSucceeded(cudaMemcpy(dev_templa, hos_templa, bytesPadded, cudaMemcpyHostToDevice), "cudaMemcpy failed! - to device, template")) goto Cleanup;
	if (!cudaCallSucceeded(cudaMemcpy(dev_medTemplate, hos_medTemplate, bytesPadded, cudaMemcpyHostToDevice), "cudaMemcpy failed! - to device, medTemplate")) goto Cleanup;
	if (!cudaCallSucceeded(cudaMemcpy(dev_medSignal, hos_medSignal, bytesPadded, cudaMemcpyHostToDevice), "cudaMemcpy failed! - to device, medSignal")) goto Cleanup;
	// The output buffer is cleared on the device; the host buffer holds nothing worth copying yet.
	if (!cudaCallSucceeded(cudaMemset(dev_xcorr, 0, bytesXcorr), "cudaMemset failed! - device xcorr")) goto Cleanup;

	printf("II transfer");

	//_______________________________________________________________________
	// Shared-memory kernel: launch and time.
	//_______________________________________________________________________
	if (!cudaCallSucceeded(cudaEventRecord(start, 0), "cudaEventRecord failed! - start")) goto Cleanup;
	shamem__GPU_xCorr_Template_Matching <<< dimGrid, dimBlock >>>(dev_xcorr, dev_signal, dev_templa, dev_medSignal, dev_medTemplate);
	{
		// Launch-configuration errors are only visible through cudaGetLastError().
		const cudaError_t launchStatus = cudaGetLastError();
		printf("\nCudaLastError:\n%s\n\n", cudaGetErrorString(launchStatus));
		if (!cudaCallSucceeded(launchStatus, "kernel launch failed! - shamem")) goto Cleanup;
	}
	if (!cudaCallSucceeded(cudaEventRecord(stop, 0), "cudaEventRecord failed! - stop")) goto Cleanup;
	if (!cudaCallSucceeded(cudaEventSynchronize(stop), "cudaEventSynchronize failed! - shamem")) goto Cleanup;
	// Errors raised while the kernel was running surface at this synchronisation.
	if (!cudaCallSucceeded(cudaDeviceSynchronize(), "kernel execution failed! - shamem")) goto Cleanup;

	if (!cudaCallSucceeded(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime failed! - shamem")) goto Cleanup;
	printf("time[KERNEL_SHAMEM]: %f\n\n", elapsedTime);

	if (!cudaCallSucceeded(cudaMemcpy(hos_xcorr, dev_xcorr, bytesXcorr, cudaMemcpyDeviceToHost), "cudaMemcpy failed! - to host, xcorr")) goto Cleanup;
#ifdef COMP_SW__PRINT_RESULT_GPU
	for (unsigned int i = 0; i < DIM_XCORR; ++i)
	{
		if (i % 10 == 0) printf("\n");
		printf("%f ", hos_xcorr[i]);
	}
	printf("\n_______________________________________________________________________\n\n");
#endif
	//_______________________________________________________________________
	// CPU reference: compute and time.
	//_______________________________________________________________________
	cpu_start = clock();
	CPU_xCorr(hos_xcorr_compare, hos_signal, hos_templa, hos_medSignal[0], hos_medTemplate[0], DIM_TEMPLATE, DIM_SIGNAL);
	printf("\ntime[CPU]: %f\n", ((double)(clock() - cpu_start))/CLOCKS_PER_SEC);

#ifdef COMP_SW__PRINT_RESULT_CPU
	for (unsigned int i = 0; i < DIM_XCORR; ++i)
	{
		if (i % 10 == 0) printf("\n");
		printf("%f ", hos_xcorr_compare[i]);
	}
#endif

	// A mismatch of the shared-memory kernel is reported but does not stop the run;
	// the naive kernel is tried next in either case.
	if (!xcorrResultsMatch(hos_xcorr, hos_xcorr_compare, DIM_XCORR, tolerance))
	{
		printf("\n\n\nNOT EQUAL\n\n\n");
	}

	//_______________________________________________________________________
	// Naive kernel: launch and time.
	//_______________________________________________________________________
	printf("trying old_ver\n\n");
	// Clear the previous result so that a kernel which writes nothing cannot pass the comparison.
	if (!cudaCallSucceeded(cudaMemset(dev_xcorr, 0, bytesXcorr), "cudaMemset failed! - device xcorr")) goto Cleanup;

	if (!cudaCallSucceeded(cudaEventRecord(start, 0), "cudaEventRecord failed! - start")) goto Cleanup;
	naive__GPU_xCorr_Template_Matching <<< dimGrid, dimBlock >>>(dev_xcorr, dev_signal, dev_templa, hos_medSignal[0], hos_medTemplate[0], DIM_TEMPLATE, DIM_SIGNAL);
	if (!cudaCallSucceeded(cudaGetLastError(), "kernel launch failed! - naive")) goto Cleanup;
	if (!cudaCallSucceeded(cudaEventRecord(stop, 0), "cudaEventRecord failed! - stop")) goto Cleanup;
	if (!cudaCallSucceeded(cudaEventSynchronize(stop), "cudaEventSynchronize failed! - naive")) goto Cleanup;
	if (!cudaCallSucceeded(cudaDeviceSynchronize(), "kernel execution failed! - naive")) goto Cleanup;
	if (!cudaCallSucceeded(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime failed! - naive")) goto Cleanup;
	printf("time[KERNEL_NAIVE]: %f\n\n", elapsedTime);

	if (!cudaCallSucceeded(cudaMemcpy(hos_xcorr, dev_xcorr, bytesXcorr, cudaMemcpyDeviceToHost), "cudaMemcpy failed! - to host, xcorr")) goto Cleanup;

	if (!xcorrResultsMatch(hos_xcorr, hos_xcorr_compare, DIM_XCORR, tolerance))
	{
		printf("\n\n\nNOT EQUAL\n\n\n");
		goto Cleanup;
	}
	printf("\n\n\nRESULTS EQUAL\n\n\n");
	exitCode = EXIT_SUCCESS;

	//_______________________________________________________________________
	// Error reporting and release of all resources (reached on every path).
	//_______________________________________________________________________
Cleanup:
	if (exitCode != EXIT_SUCCESS)
	{
		printf("Eroare!\n\n");
	}

	// Host buffers come from malloc() and are released with free(); free(NULL) is a no-op.
	free(hos_xcorr);
	printf("freed hos_xcorr!\n\n");
	free(hos_xcorr_compare);
	printf("freed hos_xcorr_compare!\n\n");
	free(hos_signal);
	printf("freed hos_signal!\n\n");
	free(hos_templa);
	printf("freed hos_templa!\n\n");

	// Device pointers start out NULL, so pointers that were never allocated are safe to pass.
	cudaFree(dev_xcorr);
	printf("freed dev_xcorr!\n\n");
	cudaFree(dev_signal);
	printf("freed dev_signal!\n\n");
	cudaFree(dev_templa);
	printf("freed dev_templa!\n\n");
	cudaFree(dev_medSignal);
	printf("freed dev_medSignal!\n\n");
	cudaFree(dev_medTemplate);
	printf("freed dev_medTemplate!\n\n");

	if (start != NULL) cudaEventDestroy(start);
	if (stop != NULL) cudaEventDestroy(stop);

	return exitCode;
}