// SAXPY benchmark: hand-written CUDA kernel vs. cuBLAS vs. Thrust.

#include <stdio.h>
#include <cublas_v2.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>
#define N 9000000

/**
 * Binary functor that evaluates a * x + y for a fixed scalar a.
 *
 * The call operator is const-qualified so the functor can be invoked through
 * a const object or const reference; it never modifies its own state.
 */
struct saxpyFunctor {
    const float a;  // scalar multiplier, fixed at construction

    // Callable from host and device code so the functor can be built on either side.
    __host__ __device__ saxpyFunctor(float _a) : a(_a) {}

    /**
     * @param x element of the scaled vector
     * @param y element of the accumulated vector
     * @return a * x + y
     */
    __host__ __device__ float operator()(float x, float y) const {
        return a * x + y;
    }
};

/**
 * SAXPY kernel: y[i] = alpha * x[i] + y[i] for every i in [0, n).
 *
 * Launch layout: 1-D grid of 1-D blocks of any size. Each thread starts at its
 * global index and advances by the total number of launched threads, so all n
 * elements are processed and no thread touches memory at or beyond index n,
 * even when the grid does not divide n evenly.
 *
 * @param alpha scalar multiplier
 * @param x     pointer to at least n input floats (only read)
 * @param y     pointer to at least n floats, updated in place
 * @param n     number of elements; defaults to N so three-argument launches
 *              keep working unchanged
 */
__global__ void cudaSaxpy(float alpha, float *x, float *y, int n = N) {
    // Index arithmetic is done in size_t so that neither the initial index
    // nor the stride increment can overflow a 32-bit int.
    const size_t count = n > 0 ? (size_t)n : 0;
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < count; i += stride) {
        y[i] = alpha * x[i] + y[i];
    }
}

/**
 * Thrust-based SAXPY: overwrites y with a * x + y, element by element.
 *
 * The processed range is [x.begin(), x.end()); y is both read and written
 * through iterators starting at y.begin().
 *
 * @param a scalar multiplier
 * @param x device vector supplying the first operand
 * @param y device vector supplying the second operand and receiving the result
 */
void saxpy(float a, thrust::device_vector<float>& x, thrust::device_vector<float>& y) {
    thrust::transform(x.begin(), x.end(),  // first input range
                      y.begin(),           // second input
                      y.begin(),           // output, in place over y
                      saxpyFunctor(a));
}

// Reports a failed CUDA runtime call made by CudaKernel() on stderr.
// Returns true when the call succeeded.
static bool cudaKernelStepOk(cudaError_t status, const char *step) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CudaKernel: %s failed: %s\n", step, cudaGetErrorString(status));
    return false;
}

/**
 * Benchmarks the hand-written cudaSaxpy kernel.
 *
 * Fills x[i] = i and y[i] = 0.87 for N elements on the host, copies both
 * arrays to the device, runs cudaSaxpy with alpha = 3 between two CUDA events
 * and prints the elapsed time in milliseconds. The result is copied back to
 * the host afterwards (outside the timed region).
 *
 * Every allocation and CUDA call is checked; on the first failure a message is
 * written to stderr, the remaining steps are skipped and all resources that
 * were acquired (host buffers, device buffers, events) are released.
 */
void CudaKernel(){
    const size_t bytes = N * sizeof(float);
    float elapsedTime = 0.0f;
    cudaEvent_t start = NULL, stop = NULL;
    float *x_d = NULL, *y_d = NULL;
    float *x_h = (float*)calloc(N, sizeof(float));
    float *y_h = (float*)calloc(N, sizeof(float));

    bool ok = (x_h != NULL) && (y_h != NULL);
    if (!ok) {
        fprintf(stderr, "CudaKernel: host allocation failed\n");
    }

    ok = ok && cudaKernelStepOk(cudaEventCreate(&start), "cudaEventCreate(start)");
    ok = ok && cudaKernelStepOk(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    ok = ok && cudaKernelStepOk(cudaMalloc((void**)&x_d, bytes), "cudaMalloc(x_d)");
    ok = ok && cudaKernelStepOk(cudaMalloc((void**)&y_d, bytes), "cudaMalloc(y_d)");

    if (ok) {
        for (int i = 0; i < N; i++) {
            x_h[i] = (float)i;
            y_h[i] = 0.87f;
        }
    }

    ok = ok && cudaKernelStepOk(cudaMemcpy(x_d, x_h, bytes, cudaMemcpyHostToDevice),
                                "cudaMemcpy(x, host to device)");
    ok = ok && cudaKernelStepOk(cudaMemcpy(y_d, y_h, bytes, cudaMemcpyHostToDevice),
                                "cudaMemcpy(y, host to device)");

    // Timed region: only the kernel sits between the two events. The host
    // waits on the stop event afterwards instead of synchronising the whole
    // device inside the measurement.
    ok = ok && cudaKernelStepOk(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    if (ok) {
        // N / 256 truncates. Rounding the block count up would start threads
        // whose index is >= N, which is only safe when the kernel guards its
        // index, so the launch geometry is intentionally left as it was.
        cudaSaxpy <<< N / 256, 256 >>> (3.0f, x_d, y_d);
        // Launch-configuration errors are only visible through cudaGetLastError().
        ok = cudaKernelStepOk(cudaGetLastError(), "cudaSaxpy launch");
    }
    ok = ok && cudaKernelStepOk(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    ok = ok && cudaKernelStepOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    ok = ok && cudaKernelStepOk(cudaEventElapsedTime(&elapsedTime, start, stop),
                                "cudaEventElapsedTime");

    ok = ok && cudaKernelStepOk(cudaMemcpy(y_h, y_d, bytes, cudaMemcpyDeviceToHost),
                                "cudaMemcpy(y, device to host)");

    if (ok) {
        printf("CUDA Kernel Time:\n \t %f ms\n", elapsedTime);
    }

    // Cleanup runs on every path; free() and cudaFree() accept null pointers.
    if (start != NULL) {
        cudaEventDestroy(start);
    }
    if (stop != NULL) {
        cudaEventDestroy(stop);
    }
    free(x_h);
    free(y_h);
    cudaFree(y_d);
    cudaFree(x_d);
}

// Reports a failed CUDA runtime call made by cudaCublas() on stderr.
// Returns true when the call succeeded.
static bool cublasBenchCudaOk(cudaError_t status, const char *step) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "cudaCublas: %s failed: %s\n", step, cudaGetErrorString(status));
    return false;
}

// Reports a failed cuBLAS call made by cudaCublas() on stderr.
// Returns true when the call succeeded.
static bool cublasBenchBlasOk(cublasStatus_t status, const char *step) {
    if (status == CUBLAS_STATUS_SUCCESS) {
        return true;
    }
    fprintf(stderr, "cudaCublas: %s failed with cuBLAS status %d\n", step, (int)status);
    return false;
}

/**
 * Benchmarks cublasSaxpy.
 *
 * Fills x[i] = i and y[i] = 0.87 for N elements in page-locked host memory,
 * uploads both vectors, runs cublasSaxpy with alpha = 3 between two CUDA
 * events and prints the elapsed time in milliseconds. The updated y vector is
 * downloaded afterwards (outside the timed region).
 *
 * Every CUDA and cuBLAS call is checked; on the first failure a message is
 * written to stderr, the remaining steps are skipped and all resources that
 * were acquired (events, buffers, cuBLAS handle) are released.
 */
void cudaCublas(){
    const size_t bytes = N * sizeof(float);
    const int elem_size = (int)sizeof(float);
    const int stride = 1;
    const float alpha = 3.0f;
    float elapsedTime = 0.0f;
    cudaEvent_t start = NULL, stop = NULL;
    float *cx_d = NULL, *cx_h = NULL, *cy_h = NULL, *cy_d = NULL;
    cublasHandle_t cublas_handle = NULL;

    bool ok = cublasBenchCudaOk(cudaEventCreate(&start), "cudaEventCreate(start)");
    ok = ok && cublasBenchCudaOk(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    ok = ok && cublasBenchCudaOk(cudaMallocHost((void**)&cx_h, bytes), "cudaMallocHost(cx_h)");
    ok = ok && cublasBenchCudaOk(cudaMallocHost((void**)&cy_h, bytes), "cudaMallocHost(cy_h)");
    ok = ok && cublasBenchCudaOk(cudaMalloc((void**)&cx_d, bytes), "cudaMalloc(cx_d)");
    ok = ok && cublasBenchCudaOk(cudaMalloc((void**)&cy_d, bytes), "cudaMalloc(cy_d)");

    if (ok) {
        for (int i = 0; i < N; i++) {
            cx_h[i] = (float) i;
            cy_h[i] = 0.87f;
        }
    }

    ok = ok && cublasBenchBlasOk(cublasCreate(&cublas_handle), "cublasCreate");

    // The data are plain contiguous vectors, so the vector transfer helpers
    // are used with unit stride on both sides.
    ok = ok && cublasBenchBlasOk(cublasSetVector(N, elem_size, cx_h, stride, cx_d, stride),
                                 "cublasSetVector(x)");
    ok = ok && cublasBenchBlasOk(cublasSetVector(N, elem_size, cy_h, stride, cy_d, stride),
                                 "cublasSetVector(y)");

    // Timed region: only the cuBLAS call sits between the two events.
    ok = ok && cublasBenchCudaOk(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    ok = ok && cublasBenchBlasOk(cublasSaxpy(cublas_handle, N, &alpha, cx_d, stride, cy_d, stride),
                                 "cublasSaxpy");
    ok = ok && cublasBenchCudaOk(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");

    ok = ok && cublasBenchCudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    ok = ok && cublasBenchCudaOk(cudaEventElapsedTime(&elapsedTime, start, stop),
                                 "cudaEventElapsedTime");

    // SAXPY only writes y, so x is not copied back.
    ok = ok && cublasBenchBlasOk(cublasGetVector(N, elem_size, cy_d, stride, cy_h, stride),
                                 "cublasGetVector(y)");

    if (ok) {
        printf("cuBLAS Time:\n \t %f ms\n", elapsedTime);
    }

    // Cleanup runs on every path; only resources that were actually acquired
    // are released (cudaFree accepts null pointers).
    if (cublas_handle != NULL) {
        cublasDestroy(cublas_handle);
    }
    if (cx_h != NULL) {
        cudaFreeHost(cx_h);
    }
    if (cy_h != NULL) {
        cudaFreeHost(cy_h);
    }
    cudaFree(cx_d);
    cudaFree(cy_d);
    if (start != NULL) {
        cudaEventDestroy(start);
    }
    if (stop != NULL) {
        cudaEventDestroy(stop);
    }
}


// Reports a failed CUDA runtime call made by ThrustLib() on stderr.
// Returns true when the call succeeded.
static bool thrustBenchOk(cudaError_t status, const char *step) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "ThrustLib: %s failed: %s\n", step, cudaGetErrorString(status));
    return false;
}

/**
 * Benchmarks the Thrust-based saxpy() wrapper.
 *
 * Builds x = 0, 1, 2, ... and y = 0.87 for N elements on the host, copies both
 * to device vectors, runs saxpy() with alpha = 3 between two CUDA events and
 * prints the elapsed time in milliseconds. The result vector is copied back
 * to the host afterwards (outside the timed region).
 *
 * The return code of every CUDA event call is checked; on failure a message is
 * written to stderr and no timing is printed. Both events are destroyed on
 * every path that created them.
 */
void ThrustLib(){
    cudaEvent_t start = NULL, stop = NULL;
    if (!thrustBenchOk(cudaEventCreate(&start), "cudaEventCreate(start)")) {
        return;
    }
    if (!thrustBenchOk(cudaEventCreate(&stop), "cudaEventCreate(stop)")) {
        cudaEventDestroy(start);
        return;
    }

    thrust::host_vector<float> h1(N);
    thrust::host_vector<float> h2(N);
    thrust::sequence(h1.begin(), h1.end());
    thrust::fill(h2.begin(), h2.end(), 0.87f);

    thrust::device_vector<float> d1 = h1;
    thrust::device_vector<float> d2 = h2;

    float elapsedTime = 0.0f;

    // Timed region: only the saxpy() call sits between the two events.
    bool ok = thrustBenchOk(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    if (ok) {
        saxpy(3.0f, d1, d2);
        ok = thrustBenchOk(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    }

    ok = ok && thrustBenchOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    ok = ok && thrustBenchOk(cudaEventElapsedTime(&elapsedTime, start, stop),
                             "cudaEventElapsedTime");

    if (ok) {
        // saxpy() writes only into its second vector, so only d2 is copied back.
        h2 = d2;

        printf("Thrust Time:\n \t %f ms\n\n", elapsedTime);
    }

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}


/**
 * Program entry point.
 *
 * Prints the assignment banner (title, course name, topic and authors) and
 * then runs the three SAXPY benchmarks in order: raw CUDA kernel, cuBLAS,
 * Thrust.
 *
 * @return always 0
 */
int main() {
    // Banner text is emitted verbatim; none of the lines contains a format
    // specifier, so they are written as plain strings.
    static const char *const banner[] = {
        "\t\tРасчетно-графическое задание\n",
        "\tПо дисциплине ''Программирование графических процессоров''\n",
        "\tТема: Провести  анализ  производительности  программ,\nреализующих  алгоритмы линейной алгебры с использованием библиотек\nThrust, cuBLAS и «сырого» кода на CUDA C.\n",
        "\nВыполнили: студенты группы ИП-715 Киселев В.С. и Пляскина А.Ю.\n\n"
    };
    const int bannerLines = (int)(sizeof(banner) / sizeof(banner[0]));

    for (int line = 0; line < bannerLines; ++line) {
        fputs(banner[line], stdout);
    }

    CudaKernel();
    cudaCublas();
    ThrustLib();

    return 0;
}