Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include "cuda_runtime.h"
- #include "device_launch_parameters.h"
- #include <stdio.h>
- #include <stdlib.h>
- #include <time.h>
- #define MAX_SIZE 4096
- #define MAX_BLOCK_SIZE 16
- #define GPU_MODE 0
- #define CPU_MODE 1
- #define CMP_MODE 2
- //typedef double matrix[MAX_SIZE][MAX_SIZE];
- int N; /* matrix size */
- int maxnum; /* max number of element*/
- const char *Init; /* matrix init type */
- int PRINT; /* print switch */
- int MODE;
- double * A; /* matrix A */
- double * A_seq;
- double b[MAX_SIZE]; /* vector b */
- double y[MAX_SIZE]; /* vector y */
- double b_seq[MAX_SIZE]; /* vector b */
- double y_seq[MAX_SIZE]; /* vector y */
- size_t nrOfThreads;
- /* forward declarations */
- void Init_Matrix(void);
- void Print_Matrix(void);
- void Print_Matrix_seq(void);
- void Init_Default(void);
- void scout_for_errors(void);
- int Read_Options(int, char **);
- cudaError_t gaussCuda();
- __global__ void gaussKernelDivide(double * A_d, double * y_d, double * b_d, size_t pitch, size_t k, size_t stride, size_t size)
- {
- size_t colLen = pitch / sizeof(double);
- int y = threadIdx.y + blockIdx.y * blockDim.y;
- if (k%stride == y) {
- for (size_t x = k + 1; x <= size; x++)
- A_d[k*colLen + x] = A_d[k*colLen + x] / A_d[k*colLen + k]; // Division step
- y_d[k] = b_d[k] / A_d[k*colLen + k];
- A_d[k*colLen + k] = 1.0;
- }
- }
- __global__ void gaussKernelEliminate(double * A_d, double * y_d, double * b_d, size_t pitch, size_t k, size_t stride, size_t size)
- {
- size_t colLen = pitch / sizeof(double);
- int y = threadIdx.y + blockIdx.y * blockDim.y;
- for (int i = y; i < size; i+=stride) {
- if (i != k) {
- for (size_t x = k + 1; x < size; x++) {
- A_d[i*colLen + x] = A_d[i*colLen + x] - A_d[i*colLen + k] * A_d[k*colLen + x]; // Elimination step
- }
- y_d[i] = y_d[i] - A_d[i*colLen + k] * y_d[k];
- b_d[i] = b_d[i] - A_d[i*colLen + k] * y_d[k];
- A_d[i*colLen + k] = 0.0;
- }
- }
- }
- void gaussSeq(void)
- {
- int i, j, k;
- /* Gaussian elimination algorithm, Algo 8.4 from Grama */
- for (k = 0; k < N; k++) { /* Outer loop */
- for (j = k + 1; j < N; j++)
- A_seq[(k*N)+j] = A_seq[(k*N) + j] / A_seq[(k*N) + k]; /* Division step */
- y_seq[k] = b_seq[k] / A_seq[(k*N) + k];
- A_seq[(k*N) + k] = 1.0;
- /*for (i = k + 1; i < N; i++) {
- for (j = k + 1; j < N; j++)
- A_seq[(i*N) + j] = A_seq[(i*N) + j] - A_seq[(i*N) + k] * A_seq[(k*N) + j]; // Elimination step
- b_seq[i] = b_seq[i] - A_seq[(i*N) + k] * y_seq[k];
- A_seq[(i*N) + k] = 0.0;
- }*/
- for (i = 0; i < N; i++) {
- if (i != k) {
- for (j = k + 1; j < N; j++)
- A_seq[(i*N) + j] = A_seq[(i*N) + j] - A_seq[(i*N) + k] * A_seq[(k*N) + j]; /* Elimination step */
- y_seq[i] = y_seq[i] - A_seq[(i*N) + k] * y_seq[k];
- b_seq[i] = b_seq[i] - A_seq[(i*N) + k] * y_seq[k];
- A_seq[(i*N) + k] = 0.0;
- }
- }
- }
- }
- int main(int argc, char * argv[])
- {
- clock_t start, end;
- double cpu_time_used;
- //int exit_code = 0;
- Init_Default(); /* Init default values */
- Read_Options(argc, argv); /* Read arguments */
- Init_Matrix(); /* Init the matrix */
- cudaError_t cudaStatus;
- if (MODE == GPU_MODE || MODE == CMP_MODE) {
- start = clock();
- cudaStatus = gaussCuda();
- end = clock();
- cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
- printf("GPU took %f seconds to execute \n", cpu_time_used);
- }
- if (MODE == CPU_MODE || MODE == CMP_MODE) {
- start = clock();
- gaussSeq();
- end = clock();
- cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
- printf("CPU took %f seconds to execute \n", cpu_time_used);
- }
- if (PRINT == 1) {
- if (MODE == GPU_MODE || MODE == CMP_MODE)
- Print_Matrix();
- if (MODE == CPU_MODE || MODE == CMP_MODE)
- Print_Matrix_seq();
- }
- if (MODE == CMP_MODE) {
- printf("memcmp A: %d", memcmp(A, A_seq, N*N * sizeof(double)));
- printf("memcmp y: %d", memcmp(y, y_seq, N * sizeof(double)));
- scout_for_errors();
- }
- // cudaDeviceReset must be called before exiting in order for profiling and
- // tracing tools such as Nsight and Visual Profiler to show complete traces.
- cudaStatus = cudaDeviceReset();
- if (cudaStatus != cudaSuccess) {
- fprintf(stderr, "cudaDeviceReset failed!");
- return 1;
- }
- return 0;
- }
- // Helper function for using CUDA to add vectors in parallel.
- cudaError_t gaussCuda()
- {
- cudaError_t cudaStatus;
- double * A_d;
- size_t pitch;
- double * b_d;
- double * y_d;
- size_t blocksize, nrOfBlocks, stride;
- // Choose which GPU to run on, change this on a multi-GPU system.
- cudaStatus = cudaSetDevice(0);
- if (cudaStatus != cudaSuccess) {
- fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
- goto Error;
- }
- // Allocate GPU buffers for three vectors (two input, one output) .
- cudaStatus = cudaMallocPitch((void**)&A_d, &pitch, N * sizeof(double), N);
- if (cudaStatus != cudaSuccess) {
- fprintf(stderr, "cudaMalloc failed!");
- goto Error;
- }
- cudaStatus = cudaMalloc((void**)&b_d, N * sizeof(double));
- if (cudaStatus != cudaSuccess) {
- fprintf(stderr, "cudaMalloc failed!");
- goto Error;
- }
- cudaStatus = cudaMalloc((void**)&y_d, N * sizeof(double));
- if (cudaStatus != cudaSuccess) {
- fprintf(stderr, "cudaMalloc failed!");
- goto Error;
- }
- // Copy input vectors from host memory to GPU buffers.
- cudaStatus = cudaMemcpy2D(A_d, pitch, A, N * sizeof(double), N * sizeof(double), N, cudaMemcpyHostToDevice);
- //cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
- if (cudaStatus != cudaSuccess) {
- fprintf(stderr, "cudaMemcpy failed!");
- goto Error;
- }
- cudaStatus = cudaMemcpy(b_d,b, N*sizeof(double), cudaMemcpyHostToDevice);
- //cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
- if (cudaStatus != cudaSuccess) {
- fprintf(stderr, "cudaMemcpy failed!");
- goto Error;
- }
- cudaStatus = cudaMemcpy(y_d, y, N * sizeof(double), cudaMemcpyHostToDevice);
- //cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
- if (cudaStatus != cudaSuccess) {
- fprintf(stderr, "cudaMemcpy failed!");
- goto Error;
- }
- if (nrOfThreads > MAX_BLOCK_SIZE) {
- blocksize = MAX_BLOCK_SIZE;
- nrOfBlocks = ceil(nrOfThreads / (double)blocksize);
- }
- else {
- blocksize = nrOfThreads;
- nrOfBlocks = 1;
- }
- stride = nrOfThreads;
- printf("nrOfBlocks: %zd, blocksize: %zd\n", nrOfBlocks, blocksize);
- dim3 gridSize(1,nrOfBlocks,1);
- dim3 blockSize(1, blocksize,1);
- for (size_t i = 0; i < N; i++)
- {
- printf("N: %zd, Pitch: %zd, k: %zd, stride: %zd\n", N, pitch, i, stride);
- gaussKernelDivide<<<gridSize, blockSize>>> (A_d, y_d, b_d, pitch, i, stride, N);
- gaussKernelEliminate<<<gridSize, blockSize>>> (A_d, y_d, b_d, pitch, i, stride, N);
- //gaussKernel <<< gridSize, blockSize >>> (A_d, y_d, b_d, pitch, i);
- }
- printf("Donezo\n");
- // Check for any errors launching the kernel
- cudaStatus = cudaGetLastError();
- if (cudaStatus != cudaSuccess) {
- fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
- goto Error;
- }
- printf("Donezo1\n");
- // cudaDeviceSynchronize waits for the kernel to finish, and returns
- // any errors encountered during the launch.
- cudaStatus = cudaDeviceSynchronize();
- if (cudaStatus != cudaSuccess) {
- fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
- goto Error;
- }
- printf("Donezo2\n");
- // Copy output vector from GPU buffer to host memory.
- //cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
- cudaStatus = cudaMemcpy2D(A, N * sizeof(double), A_d, pitch, N * sizeof(double), N, cudaMemcpyDeviceToHost);
- if (cudaStatus != cudaSuccess) {
- fprintf(stderr, "cudaMemcpy failed!");
- goto Error;
- }
- printf("Donezo3\n");
- cudaStatus = cudaMemcpy(b, b_d, N * sizeof(double), cudaMemcpyDeviceToHost);
- //cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
- if (cudaStatus != cudaSuccess) {
- fprintf(stderr, "cudaMemcpy failed!");
- goto Error;
- }
- printf("Donezo4\n");
- cudaStatus = cudaMemcpy(y, y_d, N * sizeof(double), cudaMemcpyDeviceToHost);
- //cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
- if (cudaStatus != cudaSuccess) {
- fprintf(stderr, "cudaMemcpy failed!");
- goto Error;
- }
- printf("Donezo5\n");
- Error:
- cudaFree(A_d);
- cudaFree(b_d);
- cudaFree(y_d);
- printf("Donezo6\n");
- return cudaStatus;
- }
- void
- Init_Matrix()
- {
- int i, j;
- printf("\nsize = %dx%d ", N, N);
- printf("\nmaxnum = %d \n", maxnum);
- printf("Init = %s \n", Init);
- printf("Initializing matrix...");
- A = (double *)malloc((N*N) * sizeof(double));
- A_seq = (double *)malloc((N*N) * sizeof(double));
- if (strcmp(Init, "rand") == 0) {
- for (i = 0; i < N; i++) {
- for (j = 0; j < N; j++) {
- if (i == j) /* diagonal dominance */
- A[(i*N) + j] = (double)(rand() % maxnum) + 5.0;
- else
- A[(i*N) + j] = (double)(rand() % maxnum) + 1.0;
- }
- }
- }
- if (strcmp(Init, "fast") == 0) {
- for (i = 0; i < N; i++) {
- for (j = 0; j < N; j++) {
- if (i == j) /* diagonal dominance */
- A[(i*N) + j] = 5.0;
- else
- A[(i*N) + j] = 2.0;
- }
- }
- }
- /* Initialize vectors b and y */
- for (i = 0; i < N; i++) {
- b[i] = 1.0;
- y[i] = 1.0;
- }
- memcpy(A_seq, A, N*N * sizeof(double));
- memcpy(y_seq, y, N * sizeof(double));
- memcpy(b_seq, b, N * sizeof(double));
- printf("done \n\n");
- if (PRINT == 1) {
- Print_Matrix();
- Print_Matrix_seq();
- }
- }
- void
- Print_Matrix()
- {
- int i, j;
- printf("Matrix A:\n");
- for (i = 0; i < N; i++) {
- //printf("[");
- //printf("\n");
- for (j = 0; j < N; j++) {
- if (j < N-1)
- printf(" %5.2f,", A[(i*N) + j]);
- else
- printf(" %5.2f", A[(i*N) + j]);
- }
- //printf("]\n");
- printf("\n");
- }
- //printf("Vector b:\n[");
- //for (j = 0; j < N; j++)
- // printf(" %5.8f,", b[j]);
- //printf("]\n");
- printf("Vector y:\n[");
- for (j = 0; j < N; j++)
- printf(" %5.8f\n", y[j]);
- printf("]\n");
- printf("\n\n");
- }
- void
- Print_Matrix_seq()
- {
- int i, j;
- printf("Matrix A_seq:\n");
- for (i = 0; i < N; i++) {
- //printf("[");
- //printf("\n");
- for (j = 0; j < N; j++) {
- if (j < N - 1)
- printf(" %5.2f,", A_seq[(i*N) + j]);
- else
- printf(" %5.2f", A_seq[(i*N) + j]);
- }
- //printf("]\n");
- printf("\n");
- }
- //printf("Vector b:\n[");
- //for (j = 0; j < N; j++)
- // printf(" %5.8f,", b[j]);
- //printf("]\n");
- printf("Vector y_seq:\n[");
- for (j = 0; j < N; j++)
- printf(" %5.8f\n", y_seq[j]);
- printf("]\n");
- printf("\n\n");
- }
- void
- Init_Default()
- {
- N = 2048;
- Init = "rand";
- maxnum = 15.0;
- PRINT = 0;
- }
- void scout_for_errors() {
- //int ones = 0;
- //int ones_seq = 0;
- printf("N: %d\n", N);
- for (int i = 0; i < N; i++) {
- for (int j = 0; j < N; j++) {
- //if (memcmp(A[(i*N)+j]), A[(i*N) + j]))
- //if (A[(i*N) + j] == 1.0)
- // ones += 1;
- //if (A_seq[(i*N) + j] == 1.0)
- // ones_seq += 1;
- if (i*N + j == 192)
- puts("its time");
- if (A[i*N + j] != A_seq[i*N + j])
- {
- printf("WRONG ");
- // printf("%5.8f | %5.8f @ idx [%d][%d]\n", A[(i*N) + j], A_seq[(i*N) + j], i, j);
- }
- else
- {
- // printf("%d\n", i*N + j);
- }
- }
- //if (y[i] != y_seq[i])
- // printf("%5.8f | %5.8f @ idx [%d]\n", y[i], y_seq[i], i);
- }
- }
- int
- Read_Options(int argc, char **argv)
- {
- char *prog;
- prog = *argv;
- while (++argv, --argc > 0)
- if (**argv == '-')
- switch (*++*argv) {
- case 'n':
- --argc;
- N = atoi(*++argv);
- break;
- case 'G':
- --argc;
- MODE = atoi(*++argv);
- break;
- case 'h':
- printf("\nHELP: try sor -u \n\n");
- exit(0);
- break;
- case 'u':
- printf("\nUsage: sor [-n problemsize]\n");
- printf(" [-D] show default values \n");
- printf(" [-h] help \n");
- printf(" [-I init_type] fast/rand \n");
- printf(" [-m maxnum] max random no \n");
- printf(" [-P print_switch] 0/1 \n");
- exit(0);
- break;
- case 'D':
- printf("\nDefault: n = %d ", N);
- printf("\n Init = rand");
- printf("\n maxnum = 5 ");
- printf("\n P = 0 \n\n");
- exit(0);
- break;
- case 'I':
- --argc;
- Init = *++argv;
- break;
- case 'm':
- --argc;
- maxnum = atoi(*++argv);
- break;
- case 'P':
- --argc;
- PRINT = atoi(*++argv);
- break;
- case 't':
- --argc;
- nrOfThreads = atoi(*++argv);
- break;
- default:
- printf("%s: ignored option: -%s\n", prog, *argv);
- printf("HELP: try %s -u \n\n", prog);
- break;
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement