diff -r -w -u msieve/common/lanczos/cpu/lanczos_vv.c msieve_lacuda_1015/common/lanczos/cpu/lanczos_vv.c
--- msieve/common/lanczos/cpu/lanczos_vv.c 2017-07-11 00:56:18.000000000 -0500
+++ msieve_lacuda_1015/common/lanczos/cpu/lanczos_vv.c 2017-07-10 16:58:45.000000000 -0500
@@ -612,12 +612,12 @@
global_xor(xy, xytmp, VBITS, matrix->mpi_ncols,
matrix->mpi_la_col_rank,
- matrix->mpi_la_row_grid);
+ matrix->mpi_word, matrix->mpi_la_row_grid);
/* combine the results across an entire MPI column */
global_xor(xytmp, xy, VBITS, matrix->mpi_nrows,
matrix->mpi_la_row_rank,
- matrix->mpi_la_col_grid);
+ matrix->mpi_word, matrix->mpi_la_col_grid);
#endif
}
diff -r -w -u msieve/common/lanczos/lanczos.c msieve_lacuda_1015/common/lanczos/lanczos.c
--- msieve/common/lanczos/lanczos.c 2017-07-11 00:56:18.000000000 -0500
+++ msieve_lacuda_1015/common/lanczos/lanczos.c 2017-07-10 16:24:05.000000000 -0500
@@ -544,22 +544,22 @@
/* gather v into MPI row 0 */
MPI_TRY(MPI_Gatherv(v,
- VWORDS * packed_matrix->nsubcols,
- MPI_LONG_LONG, scratch,
+ packed_matrix->nsubcols,
+ obj->mpi_word, scratch,
packed_matrix->subcol_counts,
packed_matrix->subcol_offsets,
- MPI_LONG_LONG, 0,
+ obj->mpi_word, 0,
obj->mpi_la_col_grid))
/* gather row 0 into the root node */
if (obj->mpi_la_row_rank == 0) {
MPI_TRY(MPI_Gatherv(scratch,
- VWORDS * packed_matrix->ncols,
- MPI_LONG_LONG, out,
+ packed_matrix->ncols,
+ obj->mpi_word, out,
packed_matrix->col_counts,
packed_matrix->col_offsets,
- MPI_LONG_LONG, 0,
+ obj->mpi_word, 0,
obj->mpi_la_row_grid))
}
@@ -581,11 +581,11 @@
if (obj->mpi_la_col_rank == 0) {
MPI_TRY(MPI_Gatherv(scratch,
- VWORDS * packed_matrix->nrows,
- MPI_LONG_LONG, out,
+ packed_matrix->nrows,
+ obj->mpi_word, out,
packed_matrix->row_counts,
packed_matrix->row_offsets,
- MPI_LONG_LONG, 0,
+ obj->mpi_word, 0,
obj->mpi_la_col_grid))
}
@@ -603,18 +603,18 @@
if (obj->mpi_la_row_rank == 0)
MPI_TRY(MPI_Scatterv(in, packed_matrix->col_counts,
packed_matrix->col_offsets,
- MPI_LONG_LONG, scratch,
- VWORDS * packed_matrix->ncols,
- MPI_LONG_LONG, 0,
+ obj->mpi_word, scratch,
+ packed_matrix->ncols,
+ obj->mpi_word, 0,
obj->mpi_la_row_grid))
/* push down each column */
MPI_TRY(MPI_Scatterv(scratch, packed_matrix->subcol_counts,
packed_matrix->subcol_offsets,
- MPI_LONG_LONG, out,
- VWORDS * packed_matrix->ncols,
- MPI_LONG_LONG, 0,
+ obj->mpi_word, out,
+ packed_matrix->ncols,
+ obj->mpi_word, 0,
obj->mpi_la_col_grid))
}
#endif
@@ -1521,6 +1521,11 @@
max_nrows -= POST_LANCZOS_ROWS;
#else
+ /* build the MPI datatype for one vector element: VWORDS contiguous
+    MPI_LONG_LONG words, so MPI counts/offsets are in elements */
+ MPI_TRY(MPI_Type_contiguous(VWORDS, MPI_LONG_LONG, &obj->mpi_word))
+ MPI_TRY(MPI_Type_commit(&obj->mpi_word))
+
/* tell all the MPI processes whether a post lanczos matrix
was constructed */
@@ -1573,40 +1578,22 @@
packed_matrix.subcol_offsets,
1, MPI_INT, obj->mpi_la_col_grid))
-#if VWORDS > 1
- /* scatter-gather operations count 64-bit words and not
- VBITS-bit vectors, so scale the counts */
- {
- uint32 i;
- for (i = 0; i < obj->mpi_nrows; i++) {
- packed_matrix.row_counts[i] *= VWORDS;
- packed_matrix.row_offsets[i] *= VWORDS;
- packed_matrix.subcol_counts[i] *= VWORDS;
- packed_matrix.subcol_offsets[i] *= VWORDS;
- }
- for (i = 0; i < obj->mpi_ncols; i++) {
- packed_matrix.col_counts[i] *= VWORDS;
- packed_matrix.col_offsets[i] *= VWORDS;
- }
- }
-#endif
-
/* if using a post-lanczos matrix, gather the matrix elements
at the root node since all of them will be necessary at once */
if (post_lanczos_matrix != NULL && obj->mpi_la_row_rank == 0) {
if (obj->mpi_la_col_rank == 0) {
post_lanczos_matrix = xrealloc(post_lanczos_matrix,
- max_ncols * sizeof(uint64));
+ max_ncols * sizeof(v_t));
}
MPI_TRY(MPI_Gatherv((obj->mpi_la_col_rank == 0) ?
MPI_IN_PLACE : post_lanczos_matrix,
- VWORDS * ncols, MPI_LONG_LONG,
+ ncols, obj->mpi_word,
post_lanczos_matrix,
packed_matrix.col_counts,
packed_matrix.col_offsets,
- MPI_LONG_LONG, 0, obj->mpi_la_row_grid))
+ obj->mpi_word, 0, obj->mpi_la_row_grid))
if (obj->mpi_la_col_rank != 0) {
free(post_lanczos_matrix);
@@ -1664,6 +1651,9 @@
matrix structures, and also frees the column entries from
the input matrix (whether packed or not) */
+#ifdef HAVE_MPI
+ MPI_TRY(MPI_Type_free(&obj->mpi_word))
+#endif
packed_matrix_free(&packed_matrix);
aligned_free(lanczos_output);
return dependencies;
diff -r -w -u msieve/common/lanczos/lanczos.h msieve_lacuda_1015/common/lanczos/lanczos.h
--- msieve/common/lanczos/lanczos.h 2017-07-11 00:56:18.000000000 -0500
+++ msieve_lacuda_1015/common/lanczos/lanczos.h 2017-07-10 17:04:17.000000000 -0500
@@ -179,6 +179,7 @@
uint32 mpi_ncols;
uint32 mpi_la_row_rank;
uint32 mpi_la_col_rank;
+ MPI_Datatype mpi_word;
MPI_Comm mpi_la_row_grid;
MPI_Comm mpi_la_col_grid;
@@ -235,19 +236,19 @@
#ifdef HAVE_MPI
void global_xor(void *send_buf, void *recv_buf,
uint32 bufsize, uint32 mpi_nodes,
- uint32 mpi_rank, MPI_Comm comm);
+ uint32 mpi_rank, MPI_Datatype mpi_word, MPI_Comm comm);
void global_chunk_info(uint32 total_size, uint32 num_nodes,
uint32 my_id, uint32 *chunk_size, uint32 *chunk_start);
void global_allgather(void *send_buf, void *recv_buf,
uint32 bufsize, uint32 mpi_nodes,
- uint32 mpi_rank, MPI_Comm comm);
+ uint32 mpi_rank, MPI_Datatype mpi_word, MPI_Comm comm);
void global_xor_scatter(void *send_buf, void *recv_buf,
void *scratch, uint32 bufsize,
uint32 mpi_nodes, uint32 mpi_rank,
- MPI_Comm comm);
+ MPI_Datatype mpi_word, MPI_Comm comm);
#endif
/* top-level calls for vector-vector operations */
diff -r -w -u msieve/common/lanczos/lanczos_matmul.c msieve_lacuda_1015/common/lanczos/lanczos_matmul.c
--- msieve/common/lanczos/lanczos_matmul.c 2017-07-11 00:56:18.000000000 -0500
+++ msieve_lacuda_1015/common/lanczos/lanczos_matmul.c 2017-07-10 17:06:01.000000000 -0500
@@ -115,6 +115,7 @@
p->mpi_la_col_rank = obj->mpi_la_col_rank;
p->mpi_la_row_grid = obj->mpi_la_row_grid;
p->mpi_la_col_grid = obj->mpi_la_col_grid;
+ p->mpi_word = obj->mpi_word;
#endif
matrix_extra_init(obj, p, first_block_size);
@@ -148,7 +149,7 @@
/* make each MPI column gather its own part of x */
global_allgather(x, scratch, A->ncols, A->mpi_nrows,
- A->mpi_la_row_rank, A->mpi_la_col_grid);
+ A->mpi_la_row_rank, A->mpi_word, A->mpi_la_col_grid);
mul_core(A, scratch, scratch2);
@@ -158,7 +159,7 @@
so it's not worth removing the redundancy */
global_xor(scratch2, scratch, A->nrows, A->mpi_ncols,
- A->mpi_la_col_rank, A->mpi_la_row_grid);
+ A->mpi_la_col_rank, A->mpi_word, A->mpi_la_row_grid);
#endif
}
@@ -186,20 +187,20 @@
/* make each MPI column gather its own part of x */
global_allgather(x, scratch, A->ncols, A->mpi_nrows,
- A->mpi_la_row_rank, A->mpi_la_col_grid);
+ A->mpi_la_row_rank, A->mpi_word, A->mpi_la_col_grid);
mul_core(A, scratch, scratch2);
/* make each MPI row combine its own part of A*x */
global_xor(scratch2, scratch, A->nrows, A->mpi_ncols,
- A->mpi_la_col_rank, A->mpi_la_row_grid);
+ A->mpi_la_col_rank, A->mpi_word, A->mpi_la_row_grid);
mul_trans_core(A, scratch, scratch2);
/* make each MPI row combine and scatter its own part of A^T * A*x */
global_xor_scatter(scratch2, b, scratch, A->ncols, A->mpi_nrows,
- A->mpi_la_row_rank, A->mpi_la_col_grid);
+ A->mpi_la_row_rank, A->mpi_word, A->mpi_la_col_grid);
#endif
}
diff -r -w -u msieve/common/lanczos/matmul_util.c msieve_lacuda_1015/common/lanczos/matmul_util.c
--- msieve/common/lanczos/matmul_util.c 2017-07-11 00:56:18.000000000 -0500
+++ msieve_lacuda_1015/common/lanczos/matmul_util.c 2017-07-10 17:18:36.000000000 -0500
@@ -38,7 +38,7 @@
/*------------------------------------------------------------------*/
static void global_xor_async(v_t *send_buf, v_t *recv_buf,
uint32 total_size, uint32 num_nodes,
- uint32 my_id, MPI_Comm comm) {
+ uint32 my_id, MPI_Datatype mpi_word, MPI_Comm comm) {
uint32 i;
uint32 m, size, chunk, remainder;
@@ -74,7 +74,7 @@
/* asynchronously send the current chunk */
MPI_TRY(MPI_Isend(curr_buf + m * chunk, size,
- MPI_LONG_LONG, next_id, 97,
+ mpi_word, next_id, 97,
comm, &mpi_req))
/* switch to the recvbuf after the first send */
@@ -90,8 +90,8 @@
/* don't wait for send to finish, start the recv
from the previous node */
- MPI_TRY(MPI_Recv(curr_buf + m * chunk, VWORDS * size,
- MPI_LONG_LONG, prev_id, 97,
+ MPI_TRY(MPI_Recv(curr_buf + m * chunk, size,
+ mpi_word, prev_id, 97,
comm, &mpi_status))
/* combine the new chunk with our own */
@@ -114,7 +114,7 @@
/* async send to chunk the next proc in circle */
- MPI_TRY(MPI_Isend(curr_buf, VWORDS * size, MPI_LONG_LONG,
+ MPI_TRY(MPI_Isend(curr_buf, size, mpi_word,
next_id, 98, comm, &mpi_req))
size = chunk;
@@ -128,7 +128,7 @@
from the previous proc in circle, put the new
data just where it should be in recv_buf */
- MPI_TRY(MPI_Recv(curr_buf, VWORDS * size, MPI_LONG_LONG,
+ MPI_TRY(MPI_Recv(curr_buf, size, mpi_word,
prev_id, 98, comm, &mpi_status))
/* now wait for the send to end */
@@ -140,7 +140,7 @@
/*------------------------------------------------------------------*/
void global_xor(void *send_buf_in, void *recv_buf_in,
uint32 total_size, uint32 num_nodes,
- uint32 my_id, MPI_Comm comm) {
+ uint32 my_id, MPI_Datatype mpi_word, MPI_Comm comm) {
v_t *send_buf = (v_t *)send_buf_in;
v_t *recv_buf = (v_t *)recv_buf_in;
@@ -148,16 +148,16 @@
/* only get fancy for large buffers; even the
fancy method is only faster when many nodes
are involved */
-
+ /* NOTE(review): small-buffer MPI_Allreduce path disabled by this patch; reason not recorded -- confirm before merging
if (total_size < GLOBAL_BREAKOVER || num_nodes < 2) {
MPI_TRY(MPI_Allreduce(send_buf,
recv_buf, VWORDS * total_size,
MPI_LONG_LONG, MPI_BXOR, comm))
return;
- }
+ } */
global_xor_async(send_buf, recv_buf,
- total_size, num_nodes, my_id, comm);
+ total_size, num_nodes, my_id, mpi_word, comm);
}
/*------------------------------------------------------------------*/
@@ -185,7 +185,7 @@
void global_xor_scatter(void *send_buf_in, void *recv_buf_in,
void *scratch_in, uint32 total_size,
uint32 num_nodes, uint32 my_id,
- MPI_Comm comm) {
+ MPI_Datatype mpi_word, MPI_Comm comm) {
v_t *send_buf = (v_t *)send_buf_in;
v_t *recv_buf = (v_t *)recv_buf_in;
@@ -226,7 +226,7 @@
/* asynchroniously send the current chunk */
MPI_TRY(MPI_Isend(send_buf + m * chunk,
- VWORDS * size, MPI_LONG_LONG, next_id, 95,
+ size, mpi_word, next_id, 95,
comm, &mpi_req))
/* switch to the recvbuf after the first send */
@@ -240,8 +240,8 @@
/* don't wait for send to finish, start the recv
from the previous node */
- MPI_TRY(MPI_Recv(scratch, VWORDS * size,
- MPI_LONG_LONG, prev_id, 95,
+ MPI_TRY(MPI_Recv(scratch, size,
+ mpi_word, prev_id, 95,
comm, &mpi_status))
/* combine the new chunk with our own */
@@ -256,7 +256,7 @@
/* asynchronously send the current chunk */
MPI_TRY(MPI_Isend(send_buf + m * chunk,
- VWORDS * size, MPI_LONG_LONG,
+ size, mpi_word,
next_id, 95, comm, &mpi_req))
/* switch to the recvbuf after the first send */
@@ -270,8 +270,8 @@
/* don't wait for send to finish, start the recv
from the previous node */
- MPI_TRY(MPI_Recv(recv_buf, VWORDS * size,
- MPI_LONG_LONG, prev_id, 95,
+ MPI_TRY(MPI_Recv(recv_buf, size,
+ mpi_word, prev_id, 95,
comm, &mpi_status))
/* combine the new chunk with our own */
@@ -286,7 +286,7 @@
/*------------------------------------------------------------------*/
void global_allgather(void *send_buf_in, void *recv_buf_in,
uint32 total_size, uint32 num_nodes,
- uint32 my_id, MPI_Comm comm) {
+ uint32 my_id, MPI_Datatype mpi_word, MPI_Comm comm) {
v_t *send_buf = (v_t *)send_buf_in;
v_t *recv_buf = (v_t *)recv_buf_in;
@@ -326,7 +326,7 @@
/* async send to chunk the next proc in circle */
- MPI_TRY(MPI_Isend(curr_buf, VWORDS * size, MPI_LONG_LONG,
+ MPI_TRY(MPI_Isend(curr_buf, size, mpi_word,
next_id, 96, comm, &mpi_req))
size = chunk;
@@ -340,7 +340,7 @@
from the previous proc in circle, put the new
data just where it should be in recv_buf */
- MPI_TRY(MPI_Recv(curr_buf, VWORDS * size, MPI_LONG_LONG,
+ MPI_TRY(MPI_Recv(curr_buf, size, mpi_word,
prev_id, 96, comm, &mpi_status))
/* now wait for the send to end */
diff -r -w -u msieve/include/msieve.h msieve_lacuda_1015/include/msieve.h
--- msieve/include/msieve.h 2017-07-11 00:56:26.000000000 -0500
+++ msieve_lacuda_1015/include/msieve.h 2017-07-10 15:39:52.000000000 -0500
@@ -152,6 +152,7 @@
MPI_Comm mpi_la_col_grid; /* communicator for the current MPI col */
uint32 mpi_la_row_rank;
uint32 mpi_la_col_rank;
+ MPI_Datatype mpi_word; /* Word size for MPI */
#endif
char *mp_sprintf_buf; /* scratch space for printing big integers */