Advertisement
Guest User

Untitled

a guest
Dec 13th, 2019
85
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 12.74 KB | None | 0 0
  1.  
  2. #include "cuda_runtime.h"
  3. #include "device_launch_parameters.h"
  4. #include <stdio.h>
  5. #include <stdlib.h>
  6. #include <time.h>
  7.  
  8.  
  9. #define MAX_SIZE 4096
  10. #define MAX_BLOCK_SIZE 16
  11. #define GPU_MODE 0
  12. #define CPU_MODE 1
  13. #define CMP_MODE 2
  14.  
  15. //typedef double matrix[MAX_SIZE][MAX_SIZE];
  16.  
  17. int N; /* matrix size */
  18. int maxnum; /* max number of element*/
  19. const char *Init; /* matrix init type */
  20. int PRINT; /* print switch */
  21. int MODE;
  22. double * A; /* matrix A */
  23. double * A_seq;
  24. double b[MAX_SIZE]; /* vector b */
  25. double y[MAX_SIZE]; /* vector y */
  26. double b_seq[MAX_SIZE]; /* vector b */
  27. double y_seq[MAX_SIZE]; /* vector y */
  28. size_t nrOfThreads;
  29.  
  30. /* forward declarations */
  31. void Init_Matrix(void);
  32. void Print_Matrix(void);
  33. void Print_Matrix_seq(void);
  34. void Init_Default(void);
  35. void scout_for_errors(void);
  36. int Read_Options(int, char **);
  37.  
  38. cudaError_t gaussCuda();
  39.  
  40. __global__ void gaussKernelDivide(double * A_d, double * y_d, double * b_d, size_t pitch, size_t k, size_t stride, size_t size)
  41. {
  42. size_t colLen = pitch / sizeof(double);
  43. int y = threadIdx.y + blockIdx.y * blockDim.y;
  44. if (k%stride == y) {
  45. for (size_t x = k + 1; x <= size; x++)
  46. A_d[k*colLen + x] = A_d[k*colLen + x] / A_d[k*colLen + k]; // Division step
  47. y_d[k] = b_d[k] / A_d[k*colLen + k];
  48. A_d[k*colLen + k] = 1.0;
  49. }
  50. }
  51.  
  52. __global__ void gaussKernelEliminate(double * A_d, double * y_d, double * b_d, size_t pitch, size_t k, size_t stride, size_t size)
  53. {
  54. size_t colLen = pitch / sizeof(double);
  55. int y = threadIdx.y + blockIdx.y * blockDim.y;
  56. for (int i = y; i < size; i+=stride) {
  57. if (i != k) {
  58. for (size_t x = k + 1; x < size; x++) {
  59. A_d[i*colLen + x] = A_d[i*colLen + x] - A_d[i*colLen + k] * A_d[k*colLen + x]; // Elimination step
  60. }
  61. y_d[i] = y_d[i] - A_d[i*colLen + k] * y_d[k];
  62. b_d[i] = b_d[i] - A_d[i*colLen + k] * y_d[k];
  63. A_d[i*colLen + k] = 0.0;
  64. }
  65. }
  66. }
  67.  
  68. void gaussSeq(void)
  69. {
  70. int i, j, k;
  71.  
  72. /* Gaussian elimination algorithm, Algo 8.4 from Grama */
  73. for (k = 0; k < N; k++) { /* Outer loop */
  74. for (j = k + 1; j < N; j++)
  75. A_seq[(k*N)+j] = A_seq[(k*N) + j] / A_seq[(k*N) + k]; /* Division step */
  76. y_seq[k] = b_seq[k] / A_seq[(k*N) + k];
  77. A_seq[(k*N) + k] = 1.0;
  78. /*for (i = k + 1; i < N; i++) {
  79. for (j = k + 1; j < N; j++)
  80. A_seq[(i*N) + j] = A_seq[(i*N) + j] - A_seq[(i*N) + k] * A_seq[(k*N) + j]; // Elimination step
  81. b_seq[i] = b_seq[i] - A_seq[(i*N) + k] * y_seq[k];
  82. A_seq[(i*N) + k] = 0.0;
  83. }*/
  84. for (i = 0; i < N; i++) {
  85. if (i != k) {
  86. for (j = k + 1; j < N; j++)
  87. A_seq[(i*N) + j] = A_seq[(i*N) + j] - A_seq[(i*N) + k] * A_seq[(k*N) + j]; /* Elimination step */
  88. y_seq[i] = y_seq[i] - A_seq[(i*N) + k] * y_seq[k];
  89. b_seq[i] = b_seq[i] - A_seq[(i*N) + k] * y_seq[k];
  90. A_seq[(i*N) + k] = 0.0;
  91. }
  92. }
  93. }
  94. }
  95.  
  96. int main(int argc, char * argv[])
  97. {
  98. clock_t start, end;
  99. double cpu_time_used;
  100. //int exit_code = 0;
  101.  
  102. Init_Default(); /* Init default values */
  103. Read_Options(argc, argv); /* Read arguments */
  104. Init_Matrix(); /* Init the matrix */
  105. cudaError_t cudaStatus;
  106. if (MODE == GPU_MODE || MODE == CMP_MODE) {
  107. start = clock();
  108. cudaStatus = gaussCuda();
  109. end = clock();
  110. cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
  111. printf("GPU took %f seconds to execute \n", cpu_time_used);
  112. }
  113. if (MODE == CPU_MODE || MODE == CMP_MODE) {
  114. start = clock();
  115. gaussSeq();
  116. end = clock();
  117. cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
  118. printf("CPU took %f seconds to execute \n", cpu_time_used);
  119. }
  120.  
  121. if (PRINT == 1) {
  122. if (MODE == GPU_MODE || MODE == CMP_MODE)
  123. Print_Matrix();
  124. if (MODE == CPU_MODE || MODE == CMP_MODE)
  125. Print_Matrix_seq();
  126. }
  127.  
  128. if (MODE == CMP_MODE) {
  129. printf("memcmp A: %d", memcmp(A, A_seq, N*N * sizeof(double)));
  130. printf("memcmp y: %d", memcmp(y, y_seq, N * sizeof(double)));
  131. scout_for_errors();
  132. }
  133.  
  134.  
  135. // cudaDeviceReset must be called before exiting in order for profiling and
  136. // tracing tools such as Nsight and Visual Profiler to show complete traces.
  137. cudaStatus = cudaDeviceReset();
  138. if (cudaStatus != cudaSuccess) {
  139. fprintf(stderr, "cudaDeviceReset failed!");
  140. return 1;
  141. }
  142.  
  143. return 0;
  144. }
  145.  
  146. // Helper function for using CUDA to add vectors in parallel.
  147. cudaError_t gaussCuda()
  148. {
  149. cudaError_t cudaStatus;
  150. double * A_d;
  151. size_t pitch;
  152. double * b_d;
  153. double * y_d;
  154. size_t blocksize, nrOfBlocks, stride;
  155.  
  156. // Choose which GPU to run on, change this on a multi-GPU system.
  157. cudaStatus = cudaSetDevice(0);
  158. if (cudaStatus != cudaSuccess) {
  159. fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
  160. goto Error;
  161. }
  162.  
  163. // Allocate GPU buffers for three vectors (two input, one output) .
  164. cudaStatus = cudaMallocPitch((void**)&A_d, &pitch, N * sizeof(double), N);
  165. if (cudaStatus != cudaSuccess) {
  166. fprintf(stderr, "cudaMalloc failed!");
  167. goto Error;
  168. }
  169. cudaStatus = cudaMalloc((void**)&b_d, N * sizeof(double));
  170. if (cudaStatus != cudaSuccess) {
  171. fprintf(stderr, "cudaMalloc failed!");
  172. goto Error;
  173. }
  174. cudaStatus = cudaMalloc((void**)&y_d, N * sizeof(double));
  175. if (cudaStatus != cudaSuccess) {
  176. fprintf(stderr, "cudaMalloc failed!");
  177. goto Error;
  178. }
  179.  
  180. // Copy input vectors from host memory to GPU buffers.
  181. cudaStatus = cudaMemcpy2D(A_d, pitch, A, N * sizeof(double), N * sizeof(double), N, cudaMemcpyHostToDevice);
  182. //cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
  183. if (cudaStatus != cudaSuccess) {
  184. fprintf(stderr, "cudaMemcpy failed!");
  185. goto Error;
  186. }
  187. cudaStatus = cudaMemcpy(b_d,b, N*sizeof(double), cudaMemcpyHostToDevice);
  188. //cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
  189. if (cudaStatus != cudaSuccess) {
  190. fprintf(stderr, "cudaMemcpy failed!");
  191. goto Error;
  192. }
  193. cudaStatus = cudaMemcpy(y_d, y, N * sizeof(double), cudaMemcpyHostToDevice);
  194. //cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
  195. if (cudaStatus != cudaSuccess) {
  196. fprintf(stderr, "cudaMemcpy failed!");
  197. goto Error;
  198. }
  199.  
  200. if (nrOfThreads > MAX_BLOCK_SIZE) {
  201. blocksize = MAX_BLOCK_SIZE;
  202. nrOfBlocks = ceil(nrOfThreads / (double)blocksize);
  203. }
  204. else {
  205. blocksize = nrOfThreads;
  206. nrOfBlocks = 1;
  207. }
  208. stride = nrOfThreads;
  209.  
  210. printf("nrOfBlocks: %zd, blocksize: %zd\n", nrOfBlocks, blocksize);
  211. dim3 gridSize(1,nrOfBlocks,1);
  212. dim3 blockSize(1, blocksize,1);
  213. for (size_t i = 0; i < N; i++)
  214. {
  215. printf("N: %zd, Pitch: %zd, k: %zd, stride: %zd\n", N, pitch, i, stride);
  216. gaussKernelDivide<<<gridSize, blockSize>>> (A_d, y_d, b_d, pitch, i, stride, N);
  217. gaussKernelEliminate<<<gridSize, blockSize>>> (A_d, y_d, b_d, pitch, i, stride, N);
  218. //gaussKernel <<< gridSize, blockSize >>> (A_d, y_d, b_d, pitch, i);
  219. }
  220. printf("Donezo\n");
  221.  
  222. // Check for any errors launching the kernel
  223. cudaStatus = cudaGetLastError();
  224. if (cudaStatus != cudaSuccess) {
  225. fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
  226. goto Error;
  227. }
  228. printf("Donezo1\n");
  229.  
  230.  
  231. // cudaDeviceSynchronize waits for the kernel to finish, and returns
  232. // any errors encountered during the launch.
  233. cudaStatus = cudaDeviceSynchronize();
  234. if (cudaStatus != cudaSuccess) {
  235. fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
  236. goto Error;
  237. }
  238. printf("Donezo2\n");
  239.  
  240.  
  241. // Copy output vector from GPU buffer to host memory.
  242. //cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
  243. cudaStatus = cudaMemcpy2D(A, N * sizeof(double), A_d, pitch, N * sizeof(double), N, cudaMemcpyDeviceToHost);
  244. if (cudaStatus != cudaSuccess) {
  245. fprintf(stderr, "cudaMemcpy failed!");
  246. goto Error;
  247. }
  248. printf("Donezo3\n");
  249.  
  250. cudaStatus = cudaMemcpy(b, b_d, N * sizeof(double), cudaMemcpyDeviceToHost);
  251. //cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
  252. if (cudaStatus != cudaSuccess) {
  253. fprintf(stderr, "cudaMemcpy failed!");
  254. goto Error;
  255. }
  256. printf("Donezo4\n");
  257.  
  258. cudaStatus = cudaMemcpy(y, y_d, N * sizeof(double), cudaMemcpyDeviceToHost);
  259. //cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
  260. if (cudaStatus != cudaSuccess) {
  261. fprintf(stderr, "cudaMemcpy failed!");
  262. goto Error;
  263. }
  264. printf("Donezo5\n");
  265.  
  266. Error:
  267. cudaFree(A_d);
  268. cudaFree(b_d);
  269. cudaFree(y_d);
  270. printf("Donezo6\n");
  271.  
  272. return cudaStatus;
  273. }
  274.  
  275. void
  276. Init_Matrix()
  277. {
  278. int i, j;
  279.  
  280. printf("\nsize = %dx%d ", N, N);
  281. printf("\nmaxnum = %d \n", maxnum);
  282. printf("Init = %s \n", Init);
  283. printf("Initializing matrix...");
  284. A = (double *)malloc((N*N) * sizeof(double));
  285. A_seq = (double *)malloc((N*N) * sizeof(double));
  286. if (strcmp(Init, "rand") == 0) {
  287. for (i = 0; i < N; i++) {
  288. for (j = 0; j < N; j++) {
  289. if (i == j) /* diagonal dominance */
  290. A[(i*N) + j] = (double)(rand() % maxnum) + 5.0;
  291. else
  292. A[(i*N) + j] = (double)(rand() % maxnum) + 1.0;
  293. }
  294. }
  295. }
  296. if (strcmp(Init, "fast") == 0) {
  297. for (i = 0; i < N; i++) {
  298. for (j = 0; j < N; j++) {
  299. if (i == j) /* diagonal dominance */
  300. A[(i*N) + j] = 5.0;
  301. else
  302. A[(i*N) + j] = 2.0;
  303. }
  304. }
  305. }
  306.  
  307. /* Initialize vectors b and y */
  308. for (i = 0; i < N; i++) {
  309. b[i] = 1.0;
  310. y[i] = 1.0;
  311. }
  312.  
  313. memcpy(A_seq, A, N*N * sizeof(double));
  314. memcpy(y_seq, y, N * sizeof(double));
  315. memcpy(b_seq, b, N * sizeof(double));
  316.  
  317. printf("done \n\n");
  318. if (PRINT == 1) {
  319. Print_Matrix();
  320. Print_Matrix_seq();
  321. }
  322.  
  323. }
  324.  
  325. void
  326. Print_Matrix()
  327. {
  328. int i, j;
  329.  
  330. printf("Matrix A:\n");
  331. for (i = 0; i < N; i++) {
  332. //printf("[");
  333. //printf("\n");
  334. for (j = 0; j < N; j++) {
  335. if (j < N-1)
  336. printf(" %5.2f,", A[(i*N) + j]);
  337. else
  338. printf(" %5.2f", A[(i*N) + j]);
  339. }
  340. //printf("]\n");
  341. printf("\n");
  342. }
  343. //printf("Vector b:\n[");
  344. //for (j = 0; j < N; j++)
  345. // printf(" %5.8f,", b[j]);
  346. //printf("]\n");
  347. printf("Vector y:\n[");
  348. for (j = 0; j < N; j++)
  349. printf(" %5.8f\n", y[j]);
  350. printf("]\n");
  351. printf("\n\n");
  352. }
  353.  
  354. void
  355. Print_Matrix_seq()
  356. {
  357. int i, j;
  358.  
  359. printf("Matrix A_seq:\n");
  360. for (i = 0; i < N; i++) {
  361. //printf("[");
  362. //printf("\n");
  363. for (j = 0; j < N; j++) {
  364. if (j < N - 1)
  365. printf(" %5.2f,", A_seq[(i*N) + j]);
  366. else
  367. printf(" %5.2f", A_seq[(i*N) + j]);
  368. }
  369. //printf("]\n");
  370. printf("\n");
  371. }
  372. //printf("Vector b:\n[");
  373. //for (j = 0; j < N; j++)
  374. // printf(" %5.8f,", b[j]);
  375. //printf("]\n");
  376. printf("Vector y_seq:\n[");
  377. for (j = 0; j < N; j++)
  378. printf(" %5.8f\n", y_seq[j]);
  379. printf("]\n");
  380. printf("\n\n");
  381. }
  382.  
  383. void
  384. Init_Default()
  385. {
  386. N = 2048;
  387. Init = "rand";
  388. maxnum = 15.0;
  389. PRINT = 0;
  390. }
  391.  
  392. void scout_for_errors() {
  393. //int ones = 0;
  394. //int ones_seq = 0;
  395. printf("N: %d\n", N);
  396. for (int i = 0; i < N; i++) {
  397. for (int j = 0; j < N; j++) {
  398. //if (memcmp(A[(i*N)+j]), A[(i*N) + j]))
  399. //if (A[(i*N) + j] == 1.0)
  400. // ones += 1;
  401. //if (A_seq[(i*N) + j] == 1.0)
  402. // ones_seq += 1;
  403. if (i*N + j == 192)
  404. puts("its time");
  405. if (A[i*N + j] != A_seq[i*N + j])
  406. {
  407. printf("WRONG ");
  408. // printf("%5.8f | %5.8f @ idx [%d][%d]\n", A[(i*N) + j], A_seq[(i*N) + j], i, j);
  409. }
  410. else
  411. {
  412. // printf("%d\n", i*N + j);
  413. }
  414. }
  415. //if (y[i] != y_seq[i])
  416. // printf("%5.8f | %5.8f @ idx [%d]\n", y[i], y_seq[i], i);
  417. }
  418.  
  419. }
  420.  
  421. int
  422. Read_Options(int argc, char **argv)
  423. {
  424. char *prog;
  425.  
  426. prog = *argv;
  427. while (++argv, --argc > 0)
  428. if (**argv == '-')
  429. switch (*++*argv) {
  430. case 'n':
  431. --argc;
  432. N = atoi(*++argv);
  433. break;
  434. case 'G':
  435. --argc;
  436. MODE = atoi(*++argv);
  437. break;
  438. case 'h':
  439. printf("\nHELP: try sor -u \n\n");
  440. exit(0);
  441. break;
  442. case 'u':
  443. printf("\nUsage: sor [-n problemsize]\n");
  444. printf(" [-D] show default values \n");
  445. printf(" [-h] help \n");
  446. printf(" [-I init_type] fast/rand \n");
  447. printf(" [-m maxnum] max random no \n");
  448. printf(" [-P print_switch] 0/1 \n");
  449. exit(0);
  450. break;
  451. case 'D':
  452. printf("\nDefault: n = %d ", N);
  453. printf("\n Init = rand");
  454. printf("\n maxnum = 5 ");
  455. printf("\n P = 0 \n\n");
  456. exit(0);
  457. break;
  458. case 'I':
  459. --argc;
  460. Init = *++argv;
  461. break;
  462. case 'm':
  463. --argc;
  464. maxnum = atoi(*++argv);
  465. break;
  466. case 'P':
  467. --argc;
  468. PRINT = atoi(*++argv);
  469. break;
  470. case 't':
  471. --argc;
  472. nrOfThreads = atoi(*++argv);
  473. break;
  474. default:
  475. printf("%s: ignored option: -%s\n", prog, *argv);
  476. printf("HELP: try %s -u \n\n", prog);
  477. break;
  478. }
  479. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement