Advertisement
Guest User

Untitled

a guest
Jan 23rd, 2018
95
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 13.65 KB | None | 0 0
  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <math.h>
  4. #include <float.h>
  5. #include <cuda.h>
  6.  
  7. typedef struct {
  8. float posx;
  9. float posy;
  10. float range;
  11. float temp;
  12. } heatsrc_t;
  13.  
  14. typedef struct {
  15. unsigned maxiter; // maximum number of iterations
  16. unsigned resolution; // spatial resolution
  17. int algorithm; // 0=>Jacobi, 1=>Gauss
  18.  
  19. unsigned visres; // visualization resolution
  20.  
  21. float *u, *uhelp;
  22. float *uvis;
  23.  
  24. unsigned numsrcs; // number of heat sources
  25. heatsrc_t *heatsrcs;
  26. } algoparam_t;
  27.  
  28. // function declarations
  29. int read_input( FILE *infile, algoparam_t *param );
  30. void print_params( algoparam_t *param );
  31. int initialize( algoparam_t *param );
  32. int finalize( algoparam_t *param );
  33. void write_image( FILE * f, float *u,
  34. unsigned sizex, unsigned sizey );
  35. int coarsen(float *uold, unsigned oldx, unsigned oldy ,
  36. float *unew, unsigned newx, unsigned newy );
  37.  
  38.  
  39. __global__ void gpu_Heat (float *h, float *g, int N);
  40.  
  41. #define NB 8
  42. #define min(a,b) ( ((a) < (b)) ? (a) : (b) )
  43.  
  44. float cpu_residual (float *u, float *utmp, unsigned sizex, unsigned sizey)
  45. {
  46. float diff, sum=0.0;
  47.  
  48. for (int i=1; i<sizex-1; i++)
  49. for (int j=1; j<sizey-1; j++) {
  50. diff = utmp[i*sizey+j] - u[i*sizey + j];
  51. sum += diff * diff;
  52. }
  53. return(sum);
  54. }
  55.  
  56. float cpu_jacobi (float *u, float *utmp, unsigned sizex, unsigned sizey)
  57. {
  58. float diff, sum=0.0;
  59. int nbx, bx, nby, by;
  60.  
  61. nbx = NB;
  62. bx = sizex/nbx;
  63. nby = NB;
  64. by = sizey/nby;
  65. for (int ii=0; ii<nbx; ii++)
  66. for (int jj=0; jj<nby; jj++)
  67. for (int i=1+ii*bx; i<=min((ii+1)*bx, sizex-2); i++)
  68. for (int j=1+jj*by; j<=min((jj+1)*by, sizey-2); j++) {
  69. utmp[i*sizey+j]= 0.25 * (u[ i*sizey + (j-1) ]+ // left
  70. u[ i*sizey + (j+1) ]+ // right
  71. u[ (i-1)*sizey + j ]+ // top
  72. u[ (i+1)*sizey + j ]); // bottom
  73. diff = utmp[i*sizey+j] - u[i*sizey + j];
  74. sum += diff * diff;
  75. }
  76. return(sum);
  77. }
  78.  
  79. void usage( char *s )
  80. {
  81. fprintf(stderr, "Usage: %s <input file> -t threads -b blocks\n", s);
  82. fprintf(stderr, " -t number of threads per block in each dimension (e.g. 16)\n");
  83. }
  84.  
  85. int main( int argc, char *argv[] ) {
  86. unsigned iter;
  87. FILE *infile, *resfile;
  88. char *resfilename;
  89.  
  90. // algorithmic parameters
  91. algoparam_t param;
  92. int np;
  93.  
  94. // check arguments
  95. if( argc < 4 ) {
  96. usage( argv[0] );
  97. return 1;
  98. }
  99.  
  100. // check input file
  101. if( !(infile=fopen(argv[1], "r")) ) {
  102. fprintf(stderr,
  103. "\nError: Cannot open \"%s\" for reading.\n\n", argv[1]);
  104.  
  105. usage(argv[0]);
  106. return 1;
  107. }
  108.  
  109. // check result file
  110. resfilename="heat.ppm";
  111.  
  112. if( !(resfile=fopen(resfilename, "w")) ) {
  113. fprintf(stderr,
  114. "\nError: Cannot open \"%s\" for writing.\n\n",
  115. resfilename);
  116. usage(argv[0]);
  117. return 1;
  118. }
  119.  
  120. // check input
  121. if( !read_input(infile, &param) )
  122. {
  123. fprintf(stderr, "\nError: Error parsing input file.\n\n");
  124. usage(argv[0]);
  125. return 1;
  126. }
  127.  
  128. // full size (param.resolution are only the inner points)
  129. np = param.resolution + 2;
  130.  
  131. int Grid_Dim, Block_Dim; // Grid and Block structure values
  132. if (strcmp(argv[2], "-t")==0) {
  133. Block_Dim = atoi(argv[3]);
  134. Grid_Dim = np/Block_Dim + ((np%Block_Dim)!=0);;
  135. if ((Block_Dim*Block_Dim) > 512) {
  136. printf("Error -- too many threads in block, try again\n");
  137. return 1;
  138. }
  139. }
  140. else {
  141. fprintf(stderr, "Usage: %s <input file> -t threads -b blocks\n", argv[0]);
  142. fprintf(stderr, " -t number of threads per block in each dimension (e.g. 16)\n");
  143. return 0;
  144. }
  145.  
  146. fprintf(stderr, "\nSolving Heat equation on the CPU and the GPU\n");
  147. fprintf(stderr, "--------------------------------------------\n");
  148. print_params(&param);
  149.  
  150. fprintf(stdout, "\nExecution on CPU (sequential)\n-----------------------------\n");
  151. if( !initialize(&param) )
  152. {
  153. fprintf(stderr, "Error in Solver initialization.\n\n");
  154. return 1;
  155. }
  156.  
  157. // starting time
  158. float elapsed_time_ms; // which is applicable for asynchronous code also
  159. cudaEvent_t start, stop; // using cuda events to measure time
  160. cudaEventCreate( &start ); // instrument code to measure start time
  161. cudaEventCreate( &stop );
  162.  
  163. cudaEventRecord( start, 0 );
  164. cudaEventSynchronize( start );
  165.  
  166. iter = 0;
  167. float residual;
  168. while(1) {
  169. residual = cpu_jacobi(param.u, param.uhelp, np, np);
  170. float * tmp = param.u;
  171. param.u = param.uhelp;
  172. param.uhelp = tmp;
  173.  
  174. iter++;
  175.  
  176. // solution good enough ?
  177. if (residual < 0.00005) break;
  178.  
  179. // max. iteration reached ? (no limit with maxiter=0)
  180. if (iter>=param.maxiter) break;
  181. }
  182.  
  183. cudaEventRecord( stop, 0 ); // instrument code to measue end time
  184. cudaEventSynchronize( stop );
  185. cudaEventElapsedTime( &elapsed_time_ms, start, stop );
  186.  
  187. // Flop count after iter iterations
  188. float flop = iter * 11.0 * param.resolution * param.resolution;
  189.  
  190. fprintf(stdout, "Time on CPU in ms.= %f ", elapsed_time_ms);
  191. fprintf(stdout, "(%3.3f GFlop => %6.2f MFlop/s)\n",
  192. flop/1000000000.0,
  193. flop/elapsed_time_ms/1000);
  194. fprintf(stdout, "Convergence to residual=%f: %d iterations\n", residual, iter);
  195.  
  196. finalize( &param );
  197.  
  198. fprintf(stdout, "\nExecution on GPU\n----------------\n");
  199. fprintf(stderr, "Number of threads per block in each dimension = %d\n", Block_Dim);
  200. fprintf(stderr, "Number of blocks per grid in each dimension = %d\n", Grid_Dim);
  201.  
  202. if( !initialize(&param) )
  203. {
  204. fprintf(stderr, "Error in Solver initialization.\n\n");
  205. return 1;
  206. }
  207.  
  208. dim3 Grid(Grid_Dim, Grid_Dim);
  209. dim3 Block(Block_Dim, Block_Dim);
  210.  
  211. // starting time
  212. cudaEventRecord( start, 0 );
  213. cudaEventSynchronize( start );
  214.  
  215. float *dev_u, *dev_uhelp;
  216.  
  217. // TODO: Allocation on GPU for matrices u and uhelp
  218. //...
  219.  
  220. // TODO: Copy initial values in u and uhelp from host to GPU
  221. //...
  222.  
  223. iter = 0;
  224. while(1) {
  225. gpu_Heat<<<Grid,Block>>>(dev_u, dev_uhelp, np);
  226. cudaThreadSynchronize(); // wait for all threads to complete
  227.  
  228. // TODO: residual is computed on host, we need to get from GPU values computed in u and uhelp
  229. //...
  230. residual = cpu_residual (param.u, param.uhelp, np, np);
  231.  
  232. float * tmp = dev_u;
  233. dev_u = dev_uhelp;
  234. dev_uhelp = tmp;
  235.  
  236. iter++;
  237.  
  238. // solution good enough ?
  239. if (residual < 0.00005) break;
  240.  
  241. // max. iteration reached ? (no limit with maxiter=0)
  242. if (iter>=param.maxiter) break;
  243. }
  244.  
  245. // TODO: get result matrix from GPU
  246. //...
  247.  
  248. // TODO: free memory used in GPU
  249. //...
  250.  
  251. cudaEventRecord( stop, 0 ); // instrument code to measue end time
  252. cudaEventSynchronize( stop );
  253. cudaEventElapsedTime( &elapsed_time_ms, start, stop );
  254.  
  255. fprintf(stdout, "\nTime on GPU in ms. = %f ", elapsed_time_ms);
  256. fprintf(stdout, "(%3.3f GFlop => %6.2f MFlop/s)\n",
  257. flop/1000000000.0,
  258. flop/elapsed_time_ms/1000);
  259. fprintf(stdout, "Convergence to residual=%f: %d iterations\n", residual, iter);
  260.  
  261. cudaEventDestroy(start);
  262. cudaEventDestroy(stop);
  263.  
  264. // for plot...
  265. coarsen( param.u, np, np,
  266. param.uvis, param.visres+2, param.visres+2 );
  267.  
  268. write_image( resfile, param.uvis,
  269. param.visres+2,
  270. param.visres+2 );
  271.  
  272. finalize( &param );
  273.  
  274. return 0;
  275. }
  276.  
  277.  
  278. /*
  279. * Initialize the iterative solver
  280. * - allocate memory for matrices
  281. * - set boundary conditions according to configuration
  282. */
  283. int initialize( algoparam_t *param )
  284.  
  285. {
  286. int i, j;
  287. float dist;
  288.  
  289. // total number of points (including border)
  290. const int np = param->resolution + 2;
  291.  
  292. //
  293. // allocate memory
  294. //
  295. (param->u) = (float*)calloc( sizeof(float),np*np );
  296. (param->uhelp) = (float*)calloc( sizeof(float),np*np );
  297. (param->uvis) = (float*)calloc( sizeof(float),
  298. (param->visres+2) *
  299. (param->visres+2) );
  300.  
  301.  
  302. if( !(param->u) || !(param->uhelp) || !(param->uvis) )
  303. {
  304. fprintf(stderr, "Error: Cannot allocate memory\n");
  305. return 0;
  306. }
  307.  
  308. for( i=0; i<param->numsrcs; i++ )
  309. {
  310. /* top row */
  311. for( j=0; j<np; j++ )
  312. {
  313. dist = sqrt( pow((float)j/(float)(np-1) -
  314. param->heatsrcs[i].posx, 2)+
  315. pow(param->heatsrcs[i].posy, 2));
  316.  
  317. if( dist <= param->heatsrcs[i].range )
  318. {
  319. (param->u)[j] +=
  320. (param->heatsrcs[i].range-dist) /
  321. param->heatsrcs[i].range *
  322. param->heatsrcs[i].temp;
  323. }
  324. }
  325.  
  326. /* bottom row */
  327. for( j=0; j<np; j++ )
  328. {
  329. dist = sqrt( pow((float)j/(float)(np-1) -
  330. param->heatsrcs[i].posx, 2)+
  331. pow(1-param->heatsrcs[i].posy, 2));
  332.  
  333. if( dist <= param->heatsrcs[i].range )
  334. {
  335. (param->u)[(np-1)*np+j]+=
  336. (param->heatsrcs[i].range-dist) /
  337. param->heatsrcs[i].range *
  338. param->heatsrcs[i].temp;
  339. }
  340. }
  341.  
  342. /* leftmost column */
  343. for( j=1; j<np-1; j++ )
  344. {
  345. dist = sqrt( pow(param->heatsrcs[i].posx, 2)+
  346. pow((float)j/(float)(np-1) -
  347. param->heatsrcs[i].posy, 2));
  348.  
  349. if( dist <= param->heatsrcs[i].range )
  350. {
  351. (param->u)[ j*np ]+=
  352. (param->heatsrcs[i].range-dist) /
  353. param->heatsrcs[i].range *
  354. param->heatsrcs[i].temp;
  355. }
  356. }
  357.  
  358. /* rightmost column */
  359. for( j=1; j<np-1; j++ )
  360. {
  361. dist = sqrt( pow(1-param->heatsrcs[i].posx, 2)+
  362. pow((float)j/(float)(np-1) -
  363. param->heatsrcs[i].posy, 2));
  364.  
  365. if( dist <= param->heatsrcs[i].range )
  366. {
  367. (param->u)[ j*np+(np-1) ]+=
  368. (param->heatsrcs[i].range-dist) /
  369. param->heatsrcs[i].range *
  370. param->heatsrcs[i].temp;
  371. }
  372. }
  373. }
  374.  
  375. // Copy u into uhelp
  376. float *putmp, *pu;
  377. pu = param->u;
  378. putmp = param->uhelp;
  379. for( j=0; j<np; j++ )
  380. for( i=0; i<np; i++ )
  381. *putmp++ = *pu++;
  382.  
  383. return 1;
  384. }
  385.  
  386. /*
  387. * free used memory
  388. */
  389. int finalize( algoparam_t *param )
  390. {
  391. if( param->u ) {
  392. free(param->u);
  393. param->u = 0;
  394. }
  395.  
  396. if( param->uhelp ) {
  397. free(param->uhelp);
  398. param->uhelp = 0;
  399. }
  400.  
  401. if( param->uvis ) {
  402. free(param->uvis);
  403. param->uvis = 0;
  404. }
  405.  
  406. return 1;
  407. }
  408.  
  409.  
  410. /*
  411. * write the given temperature u matrix to rgb values
  412. * and write the resulting image to file f
  413. */
  414. void write_image( FILE * f, float *u,
  415. unsigned sizex, unsigned sizey )
  416. {
  417. // RGB table
  418. unsigned char r[1024], g[1024], b[1024];
  419. int i, j, k;
  420.  
  421. float min, max;
  422.  
  423. j=1023;
  424.  
  425. // prepare RGB table
  426. for( i=0; i<256; i++ )
  427. {
  428. r[j]=255; g[j]=i; b[j]=0;
  429. j--;
  430. }
  431. for( i=0; i<256; i++ )
  432. {
  433. r[j]=255-i; g[j]=255; b[j]=0;
  434. j--;
  435. }
  436. for( i=0; i<256; i++ )
  437. {
  438. r[j]=0; g[j]=255; b[j]=i;
  439. j--;
  440. }
  441. for( i=0; i<256; i++ )
  442. {
  443. r[j]=0; g[j]=255-i; b[j]=255;
  444. j--;
  445. }
  446.  
  447. min=DBL_MAX;
  448. max=-DBL_MAX;
  449.  
  450. // find minimum and maximum
  451. for( i=0; i<sizey; i++ )
  452. {
  453. for( j=0; j<sizex; j++ )
  454. {
  455. if( u[i*sizex+j]>max )
  456. max=u[i*sizex+j];
  457. if( u[i*sizex+j]<min )
  458. min=u[i*sizex+j];
  459. }
  460. }
  461.  
  462.  
  463. fprintf(f, "P3\n");
  464. fprintf(f, "%u %u\n", sizex, sizey);
  465. fprintf(f, "%u\n", 255);
  466.  
  467. for( i=0; i<sizey; i++ )
  468. {
  469. for( j=0; j<sizex; j++ )
  470. {
  471. k=(int)(1023.0*(u[i*sizex+j]-min)/(max-min));
  472. fprintf(f, "%d %d %d ", r[k], g[k], b[k]);
  473. }
  474. fprintf(f, "\n");
  475. }
  476. }
  477.  
  478.  
  479. int coarsen( float *uold, unsigned oldx, unsigned oldy ,
  480. float *unew, unsigned newx, unsigned newy )
  481. {
  482. int i, j;
  483.  
  484. int stepx;
  485. int stepy;
  486. int stopx = newx;
  487. int stopy = newy;
  488.  
  489. if (oldx>newx)
  490. stepx=oldx/newx;
  491. else {
  492. stepx=1;
  493. stopx=oldx;
  494. }
  495. if (oldy>newy)
  496. stepy=oldy/newy;
  497. else {
  498. stepy=1;
  499. stopy=oldy;
  500. }
  501.  
  502. // NOTE: this only takes the top-left corner,
  503. // and doesnt' do any real coarsening
  504. for( i=0; i<stopy-1; i++ )
  505. {
  506. for( j=0; j<stopx-1; j++ )
  507. {
  508. unew[i*newx+j]=uold[i*oldx*stepy+j*stepx];
  509. }
  510. }
  511.  
  512. return 1;
  513. }
  514.  
  515. #define BUFSIZE 100
  516. int read_input( FILE *infile, algoparam_t *param )
  517. {
  518. int i, n;
  519. char buf[BUFSIZE];
  520.  
  521. fgets(buf, BUFSIZE, infile);
  522. n = sscanf( buf, "%u", &(param->maxiter) );
  523. if( n!=1)
  524. return 0;
  525.  
  526. fgets(buf, BUFSIZE, infile);
  527. n = sscanf( buf, "%u", &(param->resolution) );
  528. if( n!=1 )
  529. return 0;
  530.  
  531. param->visres = param->resolution;
  532.  
  533. fgets(buf, BUFSIZE, infile);
  534. n = sscanf(buf, "%u", &(param->numsrcs) );
  535. if( n!=1 )
  536. return 0;
  537.  
  538. (param->heatsrcs) =
  539. (heatsrc_t*) malloc( sizeof(heatsrc_t) * (param->numsrcs) );
  540.  
  541. for( i=0; i<param->numsrcs; i++ )
  542. {
  543. fgets(buf, BUFSIZE, infile);
  544. n = sscanf( buf, "%f %f %f %f",
  545. &(param->heatsrcs[i].posx),
  546. &(param->heatsrcs[i].posy),
  547. &(param->heatsrcs[i].range),
  548. &(param->heatsrcs[i].temp) );
  549.  
  550. if( n!=4 )
  551. return 0;
  552. }
  553.  
  554. return 1;
  555. }
  556.  
  557.  
  558. void print_params( algoparam_t *param )
  559. {
  560. int i;
  561.  
  562. fprintf(stdout, "Iterations : %u\n", param->maxiter);
  563. fprintf(stdout, "Resolution : %u\n", param->resolution);
  564. fprintf(stdout, "Num. Heat sources : %u\n", param->numsrcs);
  565.  
  566. for( i=0; i<param->numsrcs; i++ )
  567. {
  568. fprintf(stdout, " %2d: (%2.2f, %2.2f) %2.2f %2.2f \n",
  569. i+1,
  570. param->heatsrcs[i].posx,
  571. param->heatsrcs[i].posy,
  572. param->heatsrcs[i].range,
  573. param->heatsrcs[i].temp );
  574. }
  575. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement