CUDA IPC

Aug 26th, 2014
#include <mpi.h>
#include <boost/mpi.hpp>
#include <thrust/host_vector.h>

int main(int argc, char **argv) {
    // setup mpi + cuda
    // read input data (similarities)
    // decompose
    // execute algorithm
    // run the same computation five times on identical input to compare results
    thrust::host_vector<value_type> cluster_idx = compute(world, similarities, options);
    cluster_idx = compute(world, similarities, options);
    cluster_idx = compute(world, similarities, options);
    cluster_idx = compute(world, similarities, options);
    cluster_idx = compute(world, similarities, options);
    return 0;
}
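The MPI + CUDA setup is only hinted at above. Since the mpirun flags further down (btl_smcuda_use_cuda_ipc, btl_smcuda_use_cuda_ipc_same_gpu) only matter for ranks exchanging GPU buffers on the same node, and the _same_gpu variant only when two ranks share one device, the rank-to-GPU mapping is relevant context. A minimal sketch of one common way to bind each rank to a device; this is an illustrative assumption, not code from the paste:

#include <boost/mpi.hpp>
#include <cuda_runtime.h>

// Hypothetical sketch (not from the original paste): round-robin binding of
// MPI ranks to the GPUs visible on this node, done before any Thrust work.
int select_gpu(const boost::mpi::communicator& world) {
    int device_count = 0;
    cudaGetDeviceCount(&device_count);          // GPUs visible to this process
    int device = world.rank() % device_count;   // simple rank -> device mapping
    cudaSetDevice(device);                      // later CUDA calls use this device
    return device;
}

With -np 2 on a single node this puts the two ranks on different GPUs when two are present, or on the same GPU otherwise, which is the situation the _same_gpu flag addresses.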

template<class T>
thrust::host_vector<T> compute(
        const boost::mpi::communicator& communicator,
        const thrust::host_vector<T>& sim,
        const mpi_options_t& options) {
    //...
    //begin iterative algorithm
    for( int i=0; i<1000; i++ ) {
        //...
        //--- communicate partial results
        value_type* d_srp = thrust::raw_pointer_cast(&srp[0]);
        // boost::mpi::all_gather(communicator, boost::mpi::inplace(d_srp+columns*communicator.rank()).buffer, columns, d_srp);
        MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                      d_srp, columns, MPI_DOUBLE, communicator);
        //...
        //--- communicate exemplars
        value_type* d_dec = thrust::raw_pointer_cast(&dec[0]);
        size_t rows = sim.size()/columns;
        // boost::mpi::all_gather(communicator, boost::mpi::inplace(d_dec+options.lineoffset).buffer, rows, d_dec);
        MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                      d_dec, rows, MPI_DOUBLE, communicator);
    }
    //post-process: seems to have the same sync problem, since the number of
    //identified clusters does not match either (completely different results
    //than computed above)
    //...
    MPI_Allgather(....)
    //...
    MPI_Allgather(....)
    //... return cluster indices
}

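The commented-out boost::mpi::all_gather lines and the "sync problems" note above both revolve around handing a device pointer to an in-place MPI_Allgather. A minimal diagnostic sketch, assuming the buffer is a thrust::device_vector<double> and a CUDA-aware Open MPI build; the helper name is hypothetical and only illustrates forcing the device to finish writing the buffer before MPI reads it:

#include <cassert>
#include <mpi.h>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>

// Hypothetical diagnostic helper (not from the original paste): in-place
// all-gather of a device buffer with an explicit device synchronization,
// so no kernel is still writing the buffer when MPI reads the pointer.
void allgather_device_inplace(thrust::device_vector<double>& buf,
                              int count_per_rank, MPI_Comm comm) {
    double* d_buf = thrust::raw_pointer_cast(buf.data());
    cudaError_t err = cudaDeviceSynchronize();   // flush all pending kernels
    assert(err == cudaSuccess);
    MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                  d_buf, count_per_rank, MPI_DOUBLE, comm);
}

If the results still differ from run to run with such a synchronization in place, a missing host/device sync can be ruled out and the remaining variable is the CUDA IPC path that the mpirun flags below disable.
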
mpirun --mca btl_smcuda_use_cuda_ipc 0 --mca btl_smcuda_use_cuda_ipc_same_gpu 0 -np 2 ./double_test ../data/similarities20000.double.-300 ex.20000.double.2.gpus 1000 1000 0.9 &>cout.20000.double.2.gpus

# datatype: double
# datapoints: 20000
# max_iterations: 1000
# conv_iterations: 1000
# damping: 0.9
# communicator.size: 2
# time elapsed [s]; iterations executed; convergent since; clusters identified
121.* 1000 807 20
121.* 1000 807 20
121.* 1000 807 20
121.* 1000 820 9
121.* 1000 820 9

mpirun --mca btl_smcuda_use_cuda_ipc 0 --mca btl_smcuda_use_cuda_ipc_same_gpu 0 -np 2 ./double_test ../data/similarities20000.double.-300 ex.20000.double.2.gpus 1000 1000 0.9 &>cout.20000.double.2.gpus

# datatype: double
# datapoints: 20000
# max_iterations: 1000
# conv_iterations: 1000
# damping: 0.9
# communicator.size: 2
# time elapsed [s]; iterations executed; convergent since; clusters identified
121.* 1000 807 20
121.* 1000 807 20
121.* 1000 807 20
121.* 1000 807 20
121.* 1000 807 20