Untitled


#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>


#define MAX_SIZE 4096
#define MAX_BLOCK_SIZE 16
#define GPU_MODE 0
#define CPU_MODE 1
#define CMP_MODE 2

//typedef double matrix[MAX_SIZE][MAX_SIZE];

int	N;					/* matrix size		*/
int	maxnum;				/* max number of element*/
const char *Init;		/* matrix init type	*/
int	PRINT;				/* print switch		*/
int MODE;
double	* A;			/* matrix A		*/
double  * A_seq;
double	b[MAX_SIZE];	/* vector b             */
double	y[MAX_SIZE];	/* vector y             */
double	b_seq[MAX_SIZE];	/* vector b             */
double	y_seq[MAX_SIZE];	/* vector y             */
size_t nrOfThreads;

						/* forward declarations */
void Init_Matrix(void);
void Print_Matrix(void);
void Print_Matrix_seq(void);
void Init_Default(void);
void scout_for_errors(void);
int Read_Options(int, char **);

cudaError_t gaussCuda();

__global__ void gaussKernelDivide(double * A_d, double * y_d, double * b_d, size_t pitch, size_t k, size_t stride, size_t size)
{
	size_t colLen = pitch / sizeof(double);
	int y = threadIdx.y + blockIdx.y * blockDim.y;
	if (k%stride == y) {
		for (size_t x = k + 1; x <= size; x++)
			A_d[k*colLen + x] = A_d[k*colLen + x] / A_d[k*colLen + k]; // Division step
		y_d[k] = b_d[k] / A_d[k*colLen + k];
		A_d[k*colLen + k] = 1.0;
	}
}

__global__ void gaussKernelEliminate(double * A_d, double * y_d, double * b_d, size_t pitch, size_t k, size_t stride, size_t size)
{
	size_t colLen = pitch / sizeof(double);
	int y = threadIdx.y + blockIdx.y * blockDim.y;
	for (int i = y; i < size; i+=stride) {
		if (i != k) {
			for (size_t x = k + 1; x < size; x++) {
				A_d[i*colLen + x] = A_d[i*colLen + x] - A_d[i*colLen + k] * A_d[k*colLen + x]; // Elimination step
			}
			y_d[i] = y_d[i] - A_d[i*colLen + k] * y_d[k];
			b_d[i] = b_d[i] - A_d[i*colLen + k] * y_d[k];
			A_d[i*colLen + k] = 0.0;
		}
	}
}

void gaussSeq(void)
{
	int i, j, k;

	/* Gaussian elimination algorithm, Algo 8.4 from Grama */
	for (k = 0; k < N; k++) { /* Outer loop */
		for (j = k + 1; j < N; j++)
			A_seq[(k*N)+j] = A_seq[(k*N) + j] / A_seq[(k*N) + k]; /* Division step */
		y_seq[k] = b_seq[k] / A_seq[(k*N) + k];
		A_seq[(k*N) + k] = 1.0;
		/*for (i = k + 1; i < N; i++) {
			for (j = k + 1; j < N; j++)
				A_seq[(i*N) + j] = A_seq[(i*N) + j] - A_seq[(i*N) + k] * A_seq[(k*N) + j]; // Elimination step
			b_seq[i] = b_seq[i] - A_seq[(i*N) + k] * y_seq[k];
			A_seq[(i*N) + k] = 0.0;
		}*/
		for (i = 0; i < N; i++) {
			if (i != k) {
				for (j = k + 1; j < N; j++)
					A_seq[(i*N) + j] = A_seq[(i*N) + j] - A_seq[(i*N) + k] * A_seq[(k*N) + j]; /* Elimination step */
				y_seq[i] = y_seq[i] - A_seq[(i*N) + k] * y_seq[k];
				b_seq[i] = b_seq[i] - A_seq[(i*N) + k] * y_seq[k];
				A_seq[(i*N) + k] = 0.0;
			}
		}
	}
}

int main(int argc, char * argv[])
{
	clock_t start, end;
	double cpu_time_used;
	//int exit_code = 0;

	Init_Default();		/* Init default values	*/
	Read_Options(argc, argv);	/* Read arguments	*/
	Init_Matrix();		/* Init the matrix	*/
	cudaError_t cudaStatus;
	if (MODE == GPU_MODE || MODE == CMP_MODE) {
		start = clock();
		cudaStatus = gaussCuda();
		end = clock();
		cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
		printf("GPU took %f seconds to execute \n", cpu_time_used);
	}
	if (MODE == CPU_MODE || MODE == CMP_MODE) {
		start = clock();
		gaussSeq();
		end = clock();
		cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
		printf("CPU took %f seconds to execute \n", cpu_time_used);
	}

	if (PRINT == 1) {
		if (MODE == GPU_MODE || MODE == CMP_MODE)
			Print_Matrix();
		if (MODE == CPU_MODE || MODE == CMP_MODE)
			Print_Matrix_seq();
	}

	if (MODE == CMP_MODE) {
		printf("memcmp A: %d", memcmp(A, A_seq, N*N * sizeof(double)));
		printf("memcmp y: %d", memcmp(y, y_seq, N * sizeof(double)));
		scout_for_errors();
	}


    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
cudaError_t gaussCuda()
{
    cudaError_t cudaStatus;
	double * A_d;
	size_t pitch;
	double * b_d;
	double * y_d;
	size_t blocksize, nrOfBlocks, stride;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output)    .
    cudaStatus = cudaMallocPitch((void**)&A_d, &pitch, N * sizeof(double), N);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
	cudaStatus = cudaMalloc((void**)&b_d, N * sizeof(double));
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMalloc failed!");
		goto Error;
	}
	cudaStatus = cudaMalloc((void**)&y_d, N * sizeof(double));
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMalloc failed!");
		goto Error;
	}

    // Copy input vectors from host memory to GPU buffers.
	cudaStatus = cudaMemcpy2D(A_d, pitch, A, N * sizeof(double), N * sizeof(double), N, cudaMemcpyHostToDevice);
    //cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
	cudaStatus = cudaMemcpy(b_d,b, N*sizeof(double), cudaMemcpyHostToDevice);
	//cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMemcpy failed!");
		goto Error;
	}
	cudaStatus = cudaMemcpy(y_d, y, N * sizeof(double), cudaMemcpyHostToDevice);
	//cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMemcpy failed!");
		goto Error;
	}

	if (nrOfThreads > MAX_BLOCK_SIZE) {
		blocksize = MAX_BLOCK_SIZE;
		nrOfBlocks = ceil(nrOfThreads / (double)blocksize);
	}
	else {
		blocksize = nrOfThreads;
		nrOfBlocks = 1;
	}
	stride = nrOfThreads;

	printf("nrOfBlocks: %zd, blocksize: %zd\n", nrOfBlocks, blocksize);
	dim3 gridSize(1,nrOfBlocks,1);
	dim3 blockSize(1, blocksize,1);
	for (size_t i = 0; i < N; i++)
	{
		printf("N: %zd, Pitch: %zd, k: %zd, stride: %zd\n", N, pitch, i, stride);
		gaussKernelDivide<<<gridSize, blockSize>>> (A_d, y_d, b_d, pitch, i, stride, N);
		gaussKernelEliminate<<<gridSize, blockSize>>> (A_d, y_d, b_d, pitch, i, stride, N);
		//gaussKernel <<< gridSize, blockSize >>> (A_d, y_d, b_d, pitch, i);
	}
	printf("Donezo\n");

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
	printf("Donezo1\n");


    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }
	printf("Donezo2\n");


    // Copy output vector from GPU buffer to host memory.
	//cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
	cudaStatus = cudaMemcpy2D(A, N * sizeof(double), A_d, pitch, N * sizeof(double), N, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
	printf("Donezo3\n");

	cudaStatus = cudaMemcpy(b, b_d, N * sizeof(double), cudaMemcpyDeviceToHost);
	//cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMemcpy failed!");
		goto Error;
	}
	printf("Donezo4\n");

	cudaStatus = cudaMemcpy(y, y_d, N * sizeof(double), cudaMemcpyDeviceToHost);
	//cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMemcpy failed!");
		goto Error;
	}
	printf("Donezo5\n");

Error:
	cudaFree(A_d);
	cudaFree(b_d);
	cudaFree(y_d);
	printf("Donezo6\n");

    return cudaStatus;
}

void
Init_Matrix()
{
	int i, j;

	printf("\nsize      = %dx%d ", N, N);
	printf("\nmaxnum    = %d \n", maxnum);
	printf("Init	  = %s \n", Init);
	printf("Initializing matrix...");
	A = (double *)malloc((N*N) * sizeof(double));
	A_seq = (double *)malloc((N*N) * sizeof(double));
	if (strcmp(Init, "rand") == 0) {
		for (i = 0; i < N; i++) {
			for (j = 0; j < N; j++) {
				if (i == j) /* diagonal dominance */
					A[(i*N) + j] = (double)(rand() % maxnum) + 5.0;
				else
					A[(i*N) + j] = (double)(rand() % maxnum) + 1.0;
			}
		}
	}
	if (strcmp(Init, "fast") == 0) {
		for (i = 0; i < N; i++) {
			for (j = 0; j < N; j++) {
				if (i == j) /* diagonal dominance */
					A[(i*N) + j] = 5.0;
				else
					A[(i*N) + j] = 2.0;
			}
		}
	}

	/* Initialize vectors b and y */
	for (i = 0; i < N; i++) {
		b[i] = 1.0;
		y[i] = 1.0;
	}

	memcpy(A_seq, A, N*N * sizeof(double));
	memcpy(y_seq, y, N * sizeof(double));
	memcpy(b_seq, b, N * sizeof(double));

	printf("done \n\n");
	if (PRINT == 1) {
		Print_Matrix();
		Print_Matrix_seq();
	}

}

void
Print_Matrix()
{
	int i, j;

	printf("Matrix A:\n");
	for (i = 0; i < N; i++) {
		//printf("[");
		//printf("\n");
		for (j = 0; j < N; j++) {
			if (j < N-1)
				printf(" %5.2f,", A[(i*N) + j]);
			else
				printf(" %5.2f", A[(i*N) + j]);
		}
		//printf("]\n");
		printf("\n");
	}
	//printf("Vector b:\n[");
	//for (j = 0; j < N; j++)
	//	printf(" %5.8f,", b[j]);
	//printf("]\n");
	printf("Vector y:\n[");
	for (j = 0; j < N; j++)
		printf(" %5.8f\n", y[j]);
	printf("]\n");
	printf("\n\n");
}

void
Print_Matrix_seq()
{
	int i, j;

	printf("Matrix A_seq:\n");
	for (i = 0; i < N; i++) {
		//printf("[");
		//printf("\n");
		for (j = 0; j < N; j++) {
			if (j < N - 1)
				printf(" %5.2f,", A_seq[(i*N) + j]);
			else
				printf(" %5.2f", A_seq[(i*N) + j]);
		}
		//printf("]\n");
		printf("\n");
	}
	//printf("Vector b:\n[");
	//for (j = 0; j < N; j++)
	//	printf(" %5.8f,", b[j]);
	//printf("]\n");
	printf("Vector y_seq:\n[");
	for (j = 0; j < N; j++)
		printf(" %5.8f\n", y_seq[j]);
	printf("]\n");
	printf("\n\n");
}

void
Init_Default()
{
	N = 2048;
	Init = "rand";
	maxnum = 15.0;
	PRINT = 0;
}

void scout_for_errors() {
	//int ones = 0;
	//int ones_seq = 0;
	printf("N: %d\n", N);
	for (int i = 0; i < N; i++) {
		for (int j = 0; j < N; j++) {
			//if (memcmp(A[(i*N)+j]), A[(i*N) + j]))
			//if (A[(i*N) + j] == 1.0)
			//	ones += 1;
			//if (A_seq[(i*N) + j] == 1.0)
			//	ones_seq += 1;
			if (i*N + j == 192)
				puts("its time");
			if (A[i*N + j] != A_seq[i*N + j])
			{
				printf("WRONG ");
				// printf("%5.8f | %5.8f @ idx [%d][%d]\n", A[(i*N) + j], A_seq[(i*N) + j], i, j);
			}
			else
			{
				// printf("%d\n", i*N + j);
			}
		}
		//if (y[i] != y_seq[i])
		//	printf("%5.8f | %5.8f @ idx [%d]\n", y[i], y_seq[i], i);
	}

}

int
Read_Options(int argc, char **argv)
{
	char    *prog;

	prog = *argv;
	while (++argv, --argc > 0)
		if (**argv == '-')
			switch (*++*argv) {
			case 'n':
				--argc;
				N = atoi(*++argv);
				break;
			case 'G':
				--argc;
				MODE = atoi(*++argv);
				break;
			case 'h':
				printf("\nHELP: try sor -u \n\n");
				exit(0);
				break;
			case 'u':
				printf("\nUsage: sor [-n problemsize]\n");
				printf("           [-D] show default values \n");
				printf("           [-h] help \n");
				printf("           [-I init_type] fast/rand \n");
				printf("           [-m maxnum] max random no \n");
				printf("           [-P print_switch] 0/1 \n");
				exit(0);
				break;
			case 'D':
				printf("\nDefault:  n         = %d ", N);
				printf("\n          Init      = rand");
				printf("\n          maxnum    = 5 ");
				printf("\n          P         = 0 \n\n");
				exit(0);
				break;
			case 'I':
				--argc;
				Init = *++argv;
				break;
			case 'm':
				--argc;
				maxnum = atoi(*++argv);
				break;
			case 'P':
				--argc;
				PRINT = atoi(*++argv);
				break;
			case 't':
				--argc;
				nrOfThreads = atoi(*++argv);
				break;
			default:
				printf("%s: ignored option: -%s\n", prog, *argv);
				printf("HELP: try %s -u \n\n", prog);
				break;
			}
}