// task_11_multi_gpu

#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>

/*
 * Element-wise vector addition: C[i] = A[i] + B[i] for every i in [0, n).
 *
 * Each thread handles at most one element, selected by its flat 1D global
 * index (blockIdx.x, blockDim.x, threadIdx.x). Threads whose index is at or
 * beyond n return without touching memory, so the launch may contain more
 * threads than elements.
 *
 * A, B : input arrays, each readable for at least n doubles
 * C    : output array, writable for at least n doubles
 * n    : number of elements to process
 */
__global__ void vecAdd(double *A, double *B, double *C, int n){
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;

    // Surplus threads in the last block have nothing to do.
    if (gid >= n){
        return;
    }

    C[gid] = A[gid] + B[gid];
}

// Report a failed CUDA runtime call (file, line, error string) and terminate.
// Kernel launches do not return a status; follow them with
// CUDA_CHECK(cudaGetLastError()).
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t cuda_check_err_ = (call);                               \
        if (cuda_check_err_ != cudaSuccess) {                               \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(cuda_check_err_));                   \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)
#endif

/*
 * Adds two vectors of length n (taken from argv[1]) using two GPUs.
 *
 * The host arrays are filled with sin^2(i) and cos^2(i). Device 0 processes
 * the first ceil(n/2) elements and device 1 processes the remaining
 * n - ceil(n/2) elements. Each device receives its slice with asynchronous
 * copies from page-locked host memory, runs vecAdd on it, and copies its
 * slice of the result back. The result is then compared on the host against
 * h_a[i] + h_b[i], and "Equal"/"Not equal" plus the elapsed time in
 * milliseconds (as measured by CUDA events on device 0) are printed.
 *
 * Returns 0 on success, 1 on bad arguments, fewer than two devices or a failed
 * host allocation. Any failing CUDA call terminates the process through
 * CUDA_CHECK.
 */
int main(int argc, char **argv){
    if (argc != 2){
        std::cerr << "Wrong arguments" << std::endl;
        return 1;
    }

    // atoi yields 0 for non-numeric input; zero or negative lengths would make
    // the size computations below meaningless, so reject them up front.
    const int n = atoi(argv[1]);
    if (n <= 0){
        std::cerr << "Vector length must be a positive integer" << std::endl;
        return 1;
    }

    int device_count = 0;
    CUDA_CHECK(cudaGetDeviceCount(&device_count));
    if (device_count < 2){
        std::cerr << "At least two CUDA devices are required" << std::endl;
        return 1;
    }

    // Device 0 takes the larger half; device 1 takes what is left. For odd n
    // the halves differ by one element, and for n == 1 device 1 gets nothing.
    // Sizing both halves as ceil(n/2) would run one element past the end of
    // the host buffers whenever n is odd.
    const int n0 = (n - 1)/2 + 1;
    const int n1 = n - n0;

    const size_t bytes  = (size_t)n  * sizeof(double);
    const size_t bytes0 = (size_t)n0 * sizeof(double);
    const size_t bytes1 = (size_t)n1 * sizeof(double);

    double *h_a = (double*)malloc(bytes);
    double *h_b = (double*)malloc(bytes);
    double *h_c = (double*)malloc(bytes);
    if (h_a == NULL || h_b == NULL || h_c == NULL){
        std::cerr << "Host allocation failed" << std::endl;
        free(h_a);
        free(h_b);
        free(h_c);
        return 1;
    }

    // Page-lock the host buffers so cudaMemcpyAsync can be truly asynchronous.
    // Portable registration makes the pinning valid for both devices' contexts.
    CUDA_CHECK(cudaHostRegister(h_a, bytes, cudaHostRegisterPortable));
    CUDA_CHECK(cudaHostRegister(h_b, bytes, cudaHostRegisterPortable));
    CUDA_CHECK(cudaHostRegister(h_c, bytes, cudaHostRegisterPortable));

    for (int i = 0; i < n; ++i){
        h_a[i] = sin(i)*sin(i);
        h_b[i] = cos(i)*cos(i);
    }

    double *d_a0 = NULL, *d_b0 = NULL, *d_c0 = NULL;
    double *d_a1 = NULL, *d_b1 = NULL, *d_c1 = NULL;

    CUDA_CHECK(cudaSetDevice(0));
    CUDA_CHECK(cudaMalloc(&d_a0, bytes0));
    CUDA_CHECK(cudaMalloc(&d_b0, bytes0));
    CUDA_CHECK(cudaMalloc(&d_c0, bytes0));

    if (n1 > 0){
        CUDA_CHECK(cudaSetDevice(1));
        CUDA_CHECK(cudaMalloc(&d_a1, bytes1));
        CUDA_CHECK(cudaMalloc(&d_b1, bytes1));
        CUDA_CHECK(cudaMalloc(&d_c1, bytes1));
    }

    // Timing events live on device 0; both are recorded while it is current.
    CUDA_CHECK(cudaSetDevice(0));

    cudaEvent_t start_gpu, stop_gpu;
    CUDA_CHECK(cudaEventCreate(&start_gpu));
    CUDA_CHECK(cudaEventCreate(&stop_gpu));

    CUDA_CHECK(cudaEventRecord(start_gpu));

    const int block_size = 1024;

    // Device 0: elements [0, n0).
    const int grid_size0 = (n0 - 1)/block_size + 1;

    CUDA_CHECK(cudaMemcpyAsync(d_a0, h_a, bytes0, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpyAsync(d_b0, h_b, bytes0, cudaMemcpyHostToDevice));

    vecAdd<<<grid_size0, block_size>>>(d_a0, d_b0, d_c0, n0);
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaMemcpyAsync(h_c, d_c0, bytes0, cudaMemcpyDeviceToHost));

    // Device 1: elements [n0, n). Skipped when there is nothing left, because
    // a zero-block launch is an invalid configuration.
    if (n1 > 0){
        const int grid_size1 = (n1 - 1)/block_size + 1;

        CUDA_CHECK(cudaSetDevice(1));

        CUDA_CHECK(cudaMemcpyAsync(d_a1, h_a + n0, bytes1, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpyAsync(d_b1, h_b + n0, bytes1, cudaMemcpyHostToDevice));

        vecAdd<<<grid_size1, block_size>>>(d_a1, d_b1, d_c1, n1);
        CUDA_CHECK(cudaGetLastError());

        CUDA_CHECK(cudaMemcpyAsync(h_c + n0, d_c1, bytes1, cudaMemcpyDeviceToHost));

        CUDA_CHECK(cudaDeviceSynchronize());
    }

    // Wait for device 0 as well before stopping the clock, so the measured
    // interval covers the work of both devices.
    CUDA_CHECK(cudaSetDevice(0));
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaEventRecord(stop_gpu));
    CUDA_CHECK(cudaEventSynchronize(stop_gpu));

    float delta = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&delta, start_gpu, stop_gpu));

    // Compare against the same double-precision sum computed on the host.
    bool all_equal = true;
    for (int i = 0; i < n; ++i){
        if (h_c[i] != h_a[i] + h_b[i]){
            all_equal = false;
            break;
        }
    }
    std::cout << (all_equal ? "Equal" : "Not equal") << std::endl;

    printf("Elapsed time %lf\n", delta);

    CUDA_CHECK(cudaEventDestroy(start_gpu));
    CUDA_CHECK(cudaEventDestroy(stop_gpu));

    CUDA_CHECK(cudaFree(d_a0));
    CUDA_CHECK(cudaFree(d_b0));
    CUDA_CHECK(cudaFree(d_c0));

    if (n1 > 0){
        CUDA_CHECK(cudaSetDevice(1));

        CUDA_CHECK(cudaFree(d_a1));
        CUDA_CHECK(cudaFree(d_b1));
        CUDA_CHECK(cudaFree(d_c1));

        CUDA_CHECK(cudaSetDevice(0));
    }

    CUDA_CHECK(cudaHostUnregister(h_a));
    CUDA_CHECK(cudaHostUnregister(h_b));
    CUDA_CHECK(cudaHostUnregister(h_c));

    free(h_a);
    free(h_b);
    free(h_c);

    return 0;
}