// Untitled

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#include <cuda.h>
#include <cuda_runtime.h>

/*
 * Wall-clock timestamp helper.
 *
 * Queries gettimeofday() and returns the result as a single double:
 * the whole-second field plus the microsecond field scaled by 1e-6.
 * Callers take the difference of two calls to obtain an elapsed time
 * in seconds.
 */
double wtime()
{
    struct timeval now;
    gettimeofday(&now, NULL);

    const double whole_seconds = (double)now.tv_sec;
    const double microseconds  = (double)now.tv_usec;

    return whole_seconds + microseconds * 1E-6;
}

/*
 * Element-wise kernel: a_device[i] = (b_device[i] + c_device[i])^2.
 *
 * Each thread handles exactly one element, selected by its flat 1D global
 * index (blockIdx.x * blockDim.x + threadIdx.x).
 *
 * Precondition: there is no bounds check, so every index produced by the
 * launch configuration must be a valid element of all three arrays, i.e.
 * the arrays must hold at least gridDim.x * blockDim.x floats.
 */
__global__ void add(float *a_device, float *b_device, float *c_device)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Form the sum once in a local, then store its square.
    const float sum = b_device[idx] + c_device[idx];
    a_device[idx] = sum * sum;
}

/*
 * Evaluate a CUDA runtime call and abort with a file/line diagnostic if it
 * does not return cudaSuccess. Used by main() so that a failed allocation,
 * copy or launch is reported instead of silently producing bogus timings.
 */
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t cuda_check_err_ = (call);                                \
        if (cuda_check_err_ != cudaSuccess) {                                \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(cuda_check_err_));                    \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

/*
 * Times host->device and device->host copies of N floats in two variants and
 * prints the results:
 *   "def"    - blocking cudaMemcpy using calloc'd host buffers;
 *   "paging" - cudaMemcpyAsync using host buffers from cudaHostAlloc.
 * Between the two directions the `add` kernel is run once on the device data.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any allocation or CUDA
 * call fails.
 */
int main()
{
    const int threads = 32;
    const int num_of_blocks = 386;
    // `add` has no bounds check, so the element count must equal the number
    // of launched threads exactly; derive it instead of hard-coding it twice.
    const int N = threads * num_of_blocks;
    const size_t bytes = (size_t)N * sizeof(float);

    float *a = (float *)calloc(N, sizeof(*a));
    float *b = (float *)calloc(N, sizeof(*b));
    float *c = (float *)calloc(N, sizeof(*c));
    if (a == NULL || b == NULL || c == NULL)
    {
        fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    float *a_device = NULL;
    float *b_device = NULL;
    float *c_device = NULL;

    float *a_async = NULL;
    float *b_async = NULL;
    float *c_async = NULL;

    CUDA_CHECK(cudaMalloc((void **)&a_device, bytes));
    CUDA_CHECK(cudaMalloc((void **)&b_device, bytes));
    CUDA_CHECK(cudaMalloc((void **)&c_device, bytes));
    CUDA_CHECK(cudaHostAlloc((void **)&a_async, bytes, cudaHostAllocDefault));
    CUDA_CHECK(cudaHostAlloc((void **)&b_async, bytes, cudaHostAllocDefault));
    CUDA_CHECK(cudaHostAlloc((void **)&c_async, bytes, cudaHostAllocDefault));

    // Fill BOTH sets of input buffers. The cudaHostAlloc buffers are copied
    // to the device last (just before the kernel), so leaving them
    // uninitialized would feed garbage to the kernel.
    for (int i = 0; i < N; i++)
    {
        const float value = (float)i;
        b[i] = value;
        c[i] = value;
        b_async[i] = value;
        c_async[i] = value;
    }

    double cpyDef = -wtime();
    CUDA_CHECK(cudaMemcpy(b_device, b, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(c_device, c, bytes, cudaMemcpyHostToDevice));
    cpyDef += wtime();

    double cpyAsync = -wtime();
    CUDA_CHECK(cudaMemcpyAsync(b_device, b_async, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpyAsync(c_device, c_async, bytes, cudaMemcpyHostToDevice));
    // The async copies may return before the transfer finishes; wait for
    // completion so the timer covers the transfer, not just the enqueue.
    CUDA_CHECK(cudaDeviceSynchronize());
    cpyAsync += wtime();

    printf("host->device: \n" "def: %lf\n" "paging: %lf\n\n", cpyDef, cpyAsync);

    add <<< num_of_blocks, threads >>>  (a_device, b_device, c_device);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised during execution

    double backDef = -wtime();
    CUDA_CHECK(cudaMemcpy(a, a_device, bytes, cudaMemcpyDeviceToHost));
    backDef += wtime();

    double backAsync = -wtime();
    CUDA_CHECK(cudaMemcpyAsync(a_async, a_device, bytes, cudaMemcpyDeviceToHost));
    // Same as above: include the actual transfer in the measured interval
    // and make a_async safe to read afterwards.
    CUDA_CHECK(cudaDeviceSynchronize());
    backAsync += wtime();

    printf("device->host:\n" "def: %lf\n" "paging: %lf\n\n", backDef, backAsync);
    printf("summary:\n" "def: %lf\n" "paging: %lf\n", cpyDef + backDef, cpyAsync + backAsync);

    // Release every allocation with the deallocator matching its allocator.
    CUDA_CHECK(cudaFreeHost(a_async));
    CUDA_CHECK(cudaFreeHost(b_async));
    CUDA_CHECK(cudaFreeHost(c_async));
    CUDA_CHECK(cudaFree(a_device));
    CUDA_CHECK(cudaFree(b_device));
    CUDA_CHECK(cudaFree(c_device));
    free(a);
    free(b);
    free(c);

    return 0;
}