Data hosted with ♥ by Pastebin.com - Download Raw - See Original
  1. diff -r -w -u msieve/common/lanczos/cpu/lanczos_vv.c msieve_lacuda_1015/common/lanczos/cpu/lanczos_vv.c
  2. --- msieve/common/lanczos/cpu/lanczos_vv.c 2017-07-11 00:56:18.000000000 -0500
  3. +++ msieve_lacuda_1015/common/lanczos/cpu/lanczos_vv.c 2017-07-10 16:58:45.000000000 -0500
  4. @@ -612,12 +612,12 @@
  5.  
  6. global_xor(xy, xytmp, VBITS, matrix->mpi_ncols,
  7. matrix->mpi_la_col_rank,
  8. - matrix->mpi_la_row_grid);
  9. + matrix->mpi_word, matrix->mpi_la_row_grid);
  10.  
  11. /* combine the results across an entire MPI column */
  12.  
  13. global_xor(xytmp, xy, VBITS, matrix->mpi_nrows,
  14. matrix->mpi_la_row_rank,
  15. - matrix->mpi_la_col_grid);
  16. + matrix->mpi_word, matrix->mpi_la_col_grid);
  17. #endif
  18. }
  19. diff -r -w -u msieve/common/lanczos/lanczos.c msieve_lacuda_1015/common/lanczos/lanczos.c
  20. --- msieve/common/lanczos/lanczos.c 2017-07-11 00:56:18.000000000 -0500
  21. +++ msieve_lacuda_1015/common/lanczos/lanczos.c 2017-07-10 16:24:05.000000000 -0500
  22. @@ -544,22 +544,22 @@
  23. /* gather v into MPI row 0 */
  24.  
  25. MPI_TRY(MPI_Gatherv(v,
  26. - VWORDS * packed_matrix->nsubcols,
  27. - MPI_LONG_LONG, scratch,
  28. + packed_matrix->nsubcols,
  29. + obj->mpi_word, scratch,
  30. packed_matrix->subcol_counts,
  31. packed_matrix->subcol_offsets,
  32. - MPI_LONG_LONG, 0,
  33. + obj->mpi_word, 0,
  34. obj->mpi_la_col_grid))
  35.  
  36. /* gather row 0 into the root node */
  37.  
  38. if (obj->mpi_la_row_rank == 0) {
  39. MPI_TRY(MPI_Gatherv(scratch,
  40. - VWORDS * packed_matrix->ncols,
  41. - MPI_LONG_LONG, out,
  42. + packed_matrix->ncols,
  43. + obj->mpi_word, out,
  44. packed_matrix->col_counts,
  45. packed_matrix->col_offsets,
  46. - MPI_LONG_LONG, 0,
  47. + obj->mpi_word, 0,
  48. obj->mpi_la_row_grid))
  49. }
  50.  
  51. @@ -581,11 +581,11 @@
  52.  
  53. if (obj->mpi_la_col_rank == 0) {
  54. MPI_TRY(MPI_Gatherv(scratch,
  55. - VWORDS * packed_matrix->nrows,
  56. - MPI_LONG_LONG, out,
  57. + packed_matrix->nrows,
  58. + obj->mpi_word, out,
  59. packed_matrix->row_counts,
  60. packed_matrix->row_offsets,
  61. - MPI_LONG_LONG, 0,
  62. + obj->mpi_word, 0,
  63. obj->mpi_la_col_grid))
  64. }
  65.  
  66. @@ -603,18 +603,18 @@
  67. if (obj->mpi_la_row_rank == 0)
  68. MPI_TRY(MPI_Scatterv(in, packed_matrix->col_counts,
  69. packed_matrix->col_offsets,
  70. - MPI_LONG_LONG, scratch,
  71. - VWORDS * packed_matrix->ncols,
  72. - MPI_LONG_LONG, 0,
  73. + obj->mpi_word, scratch,
  74. + packed_matrix->ncols,
  75. + obj->mpi_word, 0,
  76. obj->mpi_la_row_grid))
  77.  
  78. /* push down each column */
  79.  
  80. MPI_TRY(MPI_Scatterv(scratch, packed_matrix->subcol_counts,
  81. packed_matrix->subcol_offsets,
  82. - MPI_LONG_LONG, out,
  83. - VWORDS * packed_matrix->ncols,
  84. - MPI_LONG_LONG, 0,
  85. + obj->mpi_word, out,
  86. + packed_matrix->ncols,
  87. + obj->mpi_word, 0,
  88. obj->mpi_la_col_grid))
  89. }
  90. #endif
  91. @@ -1521,6 +1521,11 @@
  92. max_nrows -= POST_LANCZOS_ROWS;
  93.  
  94. #else
  95. + /* Construct and commit the MPI datatype for one vector element: VWORDS contiguous MPI_LONG_LONG words */
  96. +
  97. + MPI_TRY(MPI_Type_contiguous(VWORDS, MPI_LONG_LONG, (MPI_Datatype *)&obj->mpi_word));
  98. + MPI_TRY(MPI_Type_commit((MPI_Datatype *)&obj->mpi_word));
  99. +
  100. /* tell all the MPI processes whether a post lanczos matrix
  101. was constructed */
  102.  
  103. @@ -1573,40 +1578,22 @@
  104. packed_matrix.subcol_offsets,
  105. 1, MPI_INT, obj->mpi_la_col_grid))
  106.  
  107. -#if VWORDS > 1
  108. - /* scatter-gather operations count 64-bit words and not
  109. - VBITS-bit vectors, so scale the counts */
  110. - {
  111. - uint32 i;
  112. - for (i = 0; i < obj->mpi_nrows; i++) {
  113. - packed_matrix.row_counts *= VWORDS;
  114. - packed_matrix.row_offsets *= VWORDS;
  115. - packed_matrix.subcol_counts *= VWORDS;
  116. - packed_matrix.subcol_offsets *= VWORDS;
  117. - }
  118. - for (i = 0; i < obj->mpi_ncols; i++) {
  119. - packed_matrix.col_counts *= VWORDS;
  120. - packed_matrix.col_offsets *= VWORDS;
  121. - }
  122. - }
  123. -#endif
  124. -
  125. /* if using a post-lanczos matrix, gather the matrix elements
  126. at the root node since all of them will be necessary at once */
  127.  
  128. if (post_lanczos_matrix != NULL && obj->mpi_la_row_rank == 0) {
  129. if (obj->mpi_la_col_rank == 0) {
  130. post_lanczos_matrix = xrealloc(post_lanczos_matrix,
  131. - max_ncols * sizeof(uint64));
  132. + max_ncols * sizeof(v_t));
  133. }
  134.  
  135. MPI_TRY(MPI_Gatherv((obj->mpi_la_col_rank == 0) ?
  136. MPI_IN_PLACE : post_lanczos_matrix,
  137. - VWORDS * ncols, MPI_LONG_LONG,
  138. + ncols, obj->mpi_word,
  139. post_lanczos_matrix,
  140. packed_matrix.col_counts,
  141. packed_matrix.col_offsets,
  142. - MPI_LONG_LONG, 0, obj->mpi_la_row_grid))
  143. + obj->mpi_word, 0, obj->mpi_la_row_grid))
  144.  
  145. if (obj->mpi_la_col_rank != 0) {
  146. free(post_lanczos_matrix);
  147. @@ -1664,6 +1651,9 @@
  148. matrix structures, and also frees the column entries from
  149. the input matrix (whether packed or not) */
  150.  
  151. +#ifdef HAVE_MPI
  152. + MPI_Type_free((MPI_Datatype *)&obj->mpi_word);
  153. +#endif
  154. packed_matrix_free(&packed_matrix);
  155. aligned_free(lanczos_output);
  156. return dependencies;
  157. diff -r -w -u msieve/common/lanczos/lanczos.h msieve_lacuda_1015/common/lanczos/lanczos.h
  158. --- msieve/common/lanczos/lanczos.h 2017-07-11 00:56:18.000000000 -0500
  159. +++ msieve_lacuda_1015/common/lanczos/lanczos.h 2017-07-10 17:04:17.000000000 -0500
  160. @@ -179,6 +179,7 @@
  161. uint32 mpi_ncols;
  162. uint32 mpi_la_row_rank;
  163. uint32 mpi_la_col_rank;
  164. + MPI_Datatype mpi_word;
  165. MPI_Comm mpi_la_row_grid;
  166. MPI_Comm mpi_la_col_grid;
  167.  
  168. @@ -235,19 +236,19 @@
  169. #ifdef HAVE_MPI
  170. void global_xor(void *send_buf, void *recv_buf,
  171. uint32 bufsize, uint32 mpi_nodes,
  172. - uint32 mpi_rank, MPI_Comm comm);
  173. + uint32 mpi_rank, MPI_Datatype, MPI_Comm comm);
  174.  
  175. void global_chunk_info(uint32 total_size, uint32 num_nodes,
  176. uint32 my_id, uint32 *chunk_size, uint32 *chunk_start);
  177.  
  178. void global_allgather(void *send_buf, void *recv_buf,
  179. uint32 bufsize, uint32 mpi_nodes,
  180. - uint32 mpi_rank, MPI_Comm comm);
  181. + uint32 mpi_rank, MPI_Datatype, MPI_Comm comm);
  182.  
  183. void global_xor_scatter(void *send_buf, void *recv_buf,
  184. void *scratch, uint32 bufsize,
  185. uint32 mpi_nodes, uint32 mpi_rank,
  186. - MPI_Comm comm);
  187. + MPI_Datatype, MPI_Comm comm);
  188. #endif
  189.  
  190. /* top-level calls for vector-vector operations */
  191. diff -r -w -u msieve/common/lanczos/lanczos_matmul.c msieve_lacuda_1015/common/lanczos/lanczos_matmul.c
  192. --- msieve/common/lanczos/lanczos_matmul.c 2017-07-11 00:56:18.000000000 -0500
  193. +++ msieve_lacuda_1015/common/lanczos/lanczos_matmul.c 2017-07-10 17:06:01.000000000 -0500
  194. @@ -115,6 +115,7 @@
  195. p->mpi_la_col_rank = obj->mpi_la_col_rank;
  196. p->mpi_la_row_grid = obj->mpi_la_row_grid;
  197. p->mpi_la_col_grid = obj->mpi_la_col_grid;
  198. + p->mpi_word = obj->mpi_word;
  199. #endif
  200.  
  201. matrix_extra_init(obj, p, first_block_size);
  202. @@ -148,7 +149,7 @@
  203. /* make each MPI column gather its own part of x */
  204.  
  205. global_allgather(x, scratch, A->ncols, A->mpi_nrows,
  206. - A->mpi_la_row_rank, A->mpi_la_col_grid);
  207. + A->mpi_la_row_rank, A->mpi_word, A->mpi_la_col_grid);
  208.  
  209. mul_core(A, scratch, scratch2);
  210.  
  211. @@ -158,7 +159,7 @@
  212. so it's not worth removing the redundancy */
  213.  
  214. global_xor(scratch2, scratch, A->nrows, A->mpi_ncols,
  215. - A->mpi_la_col_rank, A->mpi_la_row_grid);
  216. + A->mpi_la_col_rank, A->mpi_word, A->mpi_la_row_grid);
  217.  
  218. #endif
  219. }
  220. @@ -186,20 +187,20 @@
  221. /* make each MPI column gather its own part of x */
  222.  
  223. global_allgather(x, scratch, A->ncols, A->mpi_nrows,
  224. - A->mpi_la_row_rank, A->mpi_la_col_grid);
  225. + A->mpi_la_row_rank, A->mpi_word, A->mpi_la_col_grid);
  226.  
  227. mul_core(A, scratch, scratch2);
  228.  
  229. /* make each MPI row combine its own part of A*x */
  230.  
  231. global_xor(scratch2, scratch, A->nrows, A->mpi_ncols,
  232. - A->mpi_la_col_rank, A->mpi_la_row_grid);
  233. + A->mpi_la_col_rank, A->mpi_word, A->mpi_la_row_grid);
  234.  
  235. mul_trans_core(A, scratch, scratch2);
  236.  
  237. /* make each MPI row combine and scatter its own part of A^T * A*x */
  238.  
  239. global_xor_scatter(scratch2, b, scratch, A->ncols, A->mpi_nrows,
  240. - A->mpi_la_row_rank, A->mpi_la_col_grid);
  241. + A->mpi_la_row_rank, A->mpi_word, A->mpi_la_col_grid);
  242. #endif
  243. }
  244. diff -r -w -u msieve/common/lanczos/matmul_util.c msieve_lacuda_1015/common/lanczos/matmul_util.c
  245. --- msieve/common/lanczos/matmul_util.c 2017-07-11 00:56:18.000000000 -0500
  246. +++ msieve_lacuda_1015/common/lanczos/matmul_util.c 2017-07-10 17:18:36.000000000 -0500
  247. @@ -38,7 +38,7 @@
  248. /*------------------------------------------------------------------*/
  249. static void global_xor_async(v_t *send_buf, v_t *recv_buf,
  250. uint32 total_size, uint32 num_nodes,
  251. - uint32 my_id, MPI_Comm comm) {
  252. + uint32 my_id, MPI_Datatype mpi_word, MPI_Comm comm) {
  253.  
  254. uint32 i;
  255. uint32 m, size, chunk, remainder;
  256. @@ -74,7 +74,7 @@
  257. /* asynchronously send the current chunk */
  258.  
  259. MPI_TRY(MPI_Isend(curr_buf + m * chunk, size,
  260. - MPI_LONG_LONG, next_id, 97,
  261. + mpi_word, next_id, 97,
  262. comm, &mpi_req))
  263.  
  264. /* switch to the recvbuf after the first send */
  265. @@ -90,8 +90,8 @@
  266. /* don't wait for send to finish, start the recv
  267. from the previous node */
  268.  
  269. - MPI_TRY(MPI_Recv(curr_buf + m * chunk, VWORDS * size,
  270. - MPI_LONG_LONG, prev_id, 97,
  271. + MPI_TRY(MPI_Recv(curr_buf + m * chunk, size,
  272. + mpi_word, prev_id, 97,
  273. comm, &mpi_status))
  274.  
  275. /* combine the new chunk with our own */
  276. @@ -114,7 +114,7 @@
  277.  
  278. /* async send to chunk the next proc in circle */
  279.  
  280. - MPI_TRY(MPI_Isend(curr_buf, VWORDS * size, MPI_LONG_LONG,
  281. + MPI_TRY(MPI_Isend(curr_buf, size, mpi_word,
  282. next_id, 98, comm, &mpi_req))
  283.  
  284. size = chunk;
  285. @@ -128,7 +128,7 @@
  286. from the previous proc in circle, put the new
  287. data just where it should be in recv_buf */
  288.  
  289. - MPI_TRY(MPI_Recv(curr_buf, VWORDS * size, MPI_LONG_LONG,
  290. + MPI_TRY(MPI_Recv(curr_buf, size, mpi_word,
  291. prev_id, 98, comm, &mpi_status))
  292.  
  293. /* now wait for the send to end */
  294. @@ -140,7 +140,7 @@
  295. /*------------------------------------------------------------------*/
  296. void global_xor(void *send_buf_in, void *recv_buf_in,
  297. uint32 total_size, uint32 num_nodes,
  298. - uint32 my_id, MPI_Comm comm) {
  299. + uint32 my_id, MPI_Datatype mpi_word, MPI_Comm comm) {
  300.  
  301. v_t *send_buf = (v_t *)send_buf_in;
  302. v_t *recv_buf = (v_t *)recv_buf_in;
  303. @@ -148,16 +148,16 @@
  304. /* only get fancy for large buffers; even the
  305. fancy method is only faster when many nodes
  306. are involved */
  307. -
  308. + /*
  309. if (total_size < GLOBAL_BREAKOVER || num_nodes < 2) {
  310. MPI_TRY(MPI_Allreduce(send_buf,
  311. recv_buf, VWORDS * total_size,
  312. MPI_LONG_LONG, MPI_BXOR, comm))
  313. return;
  314. - }
  315. + } */
  316.  
  317. global_xor_async(send_buf, recv_buf,
  318. - total_size, num_nodes, my_id, comm);
  319. + total_size, num_nodes, my_id, mpi_word, comm);
  320. }
  321.  
  322. /*------------------------------------------------------------------*/
  323. @@ -185,7 +185,7 @@
  324. void global_xor_scatter(void *send_buf_in, void *recv_buf_in,
  325. void *scratch_in, uint32 total_size,
  326. uint32 num_nodes, uint32 my_id,
  327. - MPI_Comm comm) {
  328. + MPI_Datatype mpi_word, MPI_Comm comm) {
  329.  
  330. v_t *send_buf = (v_t *)send_buf_in;
  331. v_t *recv_buf = (v_t *)recv_buf_in;
  332. @@ -226,7 +226,7 @@
  333. /* asynchroniously send the current chunk */
  334.  
  335. MPI_TRY(MPI_Isend(send_buf + m * chunk,
  336. - VWORDS * size, MPI_LONG_LONG, next_id, 95,
  337. + size, mpi_word, next_id, 95,
  338. comm, &mpi_req))
  339.  
  340. /* switch to the recvbuf after the first send */
  341. @@ -240,8 +240,8 @@
  342. /* don't wait for send to finish, start the recv
  343. from the previous node */
  344.  
  345. - MPI_TRY(MPI_Recv(scratch, VWORDS * size,
  346. - MPI_LONG_LONG, prev_id, 95,
  347. + MPI_TRY(MPI_Recv(scratch, size,
  348. + mpi_word, prev_id, 95,
  349. comm, &mpi_status))
  350.  
  351. /* combine the new chunk with our own */
  352. @@ -256,7 +256,7 @@
  353. /* asynchronously send the current chunk */
  354.  
  355. MPI_TRY(MPI_Isend(send_buf + m * chunk,
  356. - VWORDS * size, MPI_LONG_LONG,
  357. + size, mpi_word,
  358. next_id, 95, comm, &mpi_req))
  359.  
  360. /* switch to the recvbuf after the first send */
  361. @@ -270,8 +270,8 @@
  362. /* don't wait for send to finish, start the recv
  363. from the previous node */
  364.  
  365. - MPI_TRY(MPI_Recv(recv_buf, VWORDS * size,
  366. - MPI_LONG_LONG, prev_id, 95,
  367. + MPI_TRY(MPI_Recv(recv_buf, size,
  368. + mpi_word, prev_id, 95,
  369. comm, &mpi_status))
  370.  
  371. /* combine the new chunk with our own */
  372. @@ -286,7 +286,7 @@
  373. /*------------------------------------------------------------------*/
  374. void global_allgather(void *send_buf_in, void *recv_buf_in,
  375. uint32 total_size, uint32 num_nodes,
  376. - uint32 my_id, MPI_Comm comm) {
  377. + uint32 my_id, MPI_Datatype mpi_word, MPI_Comm comm) {
  378.  
  379. v_t *send_buf = (v_t *)send_buf_in;
  380. v_t *recv_buf = (v_t *)recv_buf_in;
  381. @@ -326,7 +326,7 @@
  382.  
  383. /* async send to chunk the next proc in circle */
  384.  
  385. - MPI_TRY(MPI_Isend(curr_buf, VWORDS * size, MPI_LONG_LONG,
  386. + MPI_TRY(MPI_Isend(curr_buf, size, mpi_word,
  387. next_id, 96, comm, &mpi_req))
  388.  
  389. size = chunk;
  390. @@ -340,7 +340,7 @@
  391. from the previous proc in circle, put the new
  392. data just where it should be in recv_buf */
  393.  
  394. - MPI_TRY(MPI_Recv(curr_buf, VWORDS * size, MPI_LONG_LONG,
  395. + MPI_TRY(MPI_Recv(curr_buf, size, mpi_word,
  396. prev_id, 96, comm, &mpi_status))
  397.  
  398. /* now wait for the send to end */
  399. diff -r -w -u msieve/include/msieve.h msieve_lacuda_1015/include/msieve.h
  400. --- msieve/include/msieve.h 2017-07-11 00:56:26.000000000 -0500
  401. +++ msieve_lacuda_1015/include/msieve.h 2017-07-10 15:39:52.000000000 -0500
  402. @@ -152,6 +152,7 @@
  403. MPI_Comm mpi_la_col_grid; /* communicator for the current MPI col */
  404. uint32 mpi_la_row_rank;
  405. uint32 mpi_la_col_rank;
  406. + MPI_Datatype mpi_word; /* MPI datatype for one vector element (VWORDS contiguous MPI_LONG_LONG words) */
  407. #endif
  408.  
  409. char *mp_sprintf_buf; /* scratch space for printing big integers */