Untitled

/**
* Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/

/**
* Matrix multiplication: C = A * B.
* Host code.
*
* This sample implements matrix multiplication as described in Chapter 3
* of the programming guide.
* It has been written for clarity of exposition to illustrate various CUDA
* programming principles, not with the goal of providing the most
* performant generic kernel for matrix multiplication.
*
* See also:
* V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra,"
* in Proc. 2008 ACM/IEEE Conf. on Superconducting (SC '08),
* Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11.
*/

// System includes
#define WIN32
#include <stdio.h>
#include <assert.h>

// CUDA runtime
#include <cuda_runtime.h>

// Helper functions and utilities to work with CUDA
#include <helper_functions.h>

/**
* Matrix multiplication (CUDA Kernel) on the device: C = A * B
* wA is A's width and wB is B's width
*/

/**
badanie wpływu organizacji dostępu do pamięci globalnej na efektywność przetwarzania (wersje 2,3,4 ) –
przygotować kody z dostępami łączonymi (w miarę możliwości)
2. grid wieloblokowy, jeden wątek oblicza jeden element macierzy wynikowej, obliczenia przy wykorzystaniu
pamięci globalnej,
3. grid wieloblokowy, jeden wątek oblicza jeden element macierzy wynikowej, obliczenia przy
wykorzystaniu pamięci współdzielonej bloku wątków,
4. grid wieloblokowy, jeden wątek oblicza jeden element macierzy wynikowej, obliczenia danych przy
wykorzystaniu pamięci współdzielonej bloku wątków ze zrównolegleniem obliczeń i pobierania danych z
pamięci globalnej w ramach każdego bloku wątków,
*/
template <int BLOCK_SIZE> __global__ void
matrixMulCUDA_v2(float *C, float *A, float *B, int wA, int wB)
{
    int row;
    row = blockIdx.y * blockDim.y + threadIdx.y;
    int col;
    col = blockIdx.x * blockDim.x + threadIdx.x;

    float Csub;
    Csub = 0;

    for (int i = 0; i < wA; i++)
    {
        Csub += A[row*wA + i] * B[i + col*wA];
    }

    C[row*wA + col] = Csub;
}

template <int BLOCK_SIZE> __global__ void
matrixMulCUDA_v3(float *C, float *A, float *B, int wA, int wB)
{
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int row;
    row = blockIdx.y * blockDim.y + threadIdx.y;
    int col;
    col = blockIdx.x * blockDim.x + threadIdx.x;

    const int SUB_WIDTH = 32;
    __shared__ float Ads[SUB_WIDTH][SUB_WIDTH];
    __shared__ float Bds[SUB_WIDTH][SUB_WIDTH];

    float Csub;
    Csub = 0;

    for (int i = 0; i < wA/SUB_WIDTH; i++)
    {
        Ads[tx][ty] = A[row*wA + i * SUB_WIDTH + tx];
        Bds[tx][ty] = B[i * SUB_WIDTH + ty + col*wA];

        __syncthreads();

        for (int k = 0; k < SUB_WIDTH; k++)
        {
            Csub += Ads[tx][k] * Bds[k][ty];

        }
        __syncthreads();
    }

    C[row*wA + col] = Csub;
}

template <int BLOCK_SIZE> __global__ void
matrixMulCUDA_v4(float *C, float *A, float *B, int wA, int wB)
{
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int row;
    row = blockIdx.y * blockDim.y + threadIdx.y;
    int col;
    col = blockIdx.x * blockDim.x + threadIdx.x;

    const int SUB_WIDTH = 16;
    __shared__ float Ads[SUB_WIDTH][SUB_WIDTH];
    __shared__ float Bds[SUB_WIDTH][SUB_WIDTH];

    __shared__ float A_shared[SUB_WIDTH][SUB_WIDTH];
    __shared__ float B_shared[SUB_WIDTH][SUB_WIDTH];

    float Csub;
    Csub = 0;

    //pierwsze pobranie danych z pamieci globalnej do pamieci wspoldzielonej 'A'
    A_shared[tx][ty] = A[row*wA + 0 * SUB_WIDTH + tx];
    B_shared[tx][ty] = B[0 * SUB_WIDTH + ty + col*wA];

    for (int i = 0; i < wA / SUB_WIDTH; i++)
    {
        //przepisanie danych z 'A' do pamiêci wspó³dzielonej 'B'
        Ads[tx][ty] = A_shared[tx][ty];
        Bds[tx][ty] = B_shared[tx][ty];

        __syncthreads();

        //pobranie kolejnego bloku danych z pamieci globalnej do 'A'
        A_shared[tx][ty] = A[row*wA + i * SUB_WIDTH + tx];
        B_shared[tx][ty] = B[i * SUB_WIDTH + ty + col*wA];

        //obliczenia na pamieci 'B'
        for (int k = 0; k < SUB_WIDTH; k++)
        {
            Csub += Ads[tx][k] * Bds[k][ty];

        }
        __syncthreads();
    }

    C[row*wA + col] = Csub;
}


void constantInit(float *data, int size, float val)
{
    for (int i = 0; i < size; ++i) {
        data[i] = val;
    }
}

/**
* Run a simple test of matrix multiplication using CUDA
*/
int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
{
    // Allocate host memory for matrices A and B
    unsigned int size_A = dimsA.x * dimsA.y;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A = (float *)malloc(mem_size_A);
    unsigned int size_B = dimsB.x * dimsB.y;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B = (float *)malloc(mem_size_B);

    // Initialize host memory
    const float valB = 0.01f;
    constantInit(h_A, size_A, 1.0f);
    constantInit(h_B, size_B, valB);

    // Allocate device memory
    float *d_A, *d_B, *d_C;

    // Allocate host matrix C
    dim3 dimsC(dimsB.x, dimsA.y, 1);
    unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
    float *h_C = (float *)malloc(mem_size_C);

    if (h_C == NULL) {
        fprintf(stderr, "Failed to allocate host matrix C!\n");
        exit(EXIT_FAILURE);
    }

    cudaError_t error;

    //error = cudaMallocHost((void **)&d_A, mem_size_A);
    // error = cudaMalloc((void **)&d_A, mem_size_A);
    error = cudaHostAlloc(&d_A, mem_size_A, cudaHostAllocMapped); //zmiany

    if (error != cudaSuccess) {
        printf("cudaMalloc d_A returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    //error = cudaMallocHost((void **)&d_B, mem_size_B);
    // error = cudaMalloc((void **)&d_B, mem_size_B);
    error = cudaHostAlloc(&d_B, mem_size_B, cudaHostAllocMapped); //zmiany

    if (error != cudaSuccess) {
        printf("cudaMalloc d_B returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    //error = cudaMallocHost((void **)&d_C, mem_size_C);
    // error = cudaMalloc((void **)&d_C, mem_size_C);
    error = cudaHostAlloc(&d_C, mem_size_B, cudaHostAllocMapped); //zmiany

    if (error != cudaSuccess) {
        printf("cudaMalloc d_C returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    cudaStream_t stream1, stream2, stream3;
    cudaStreamCreate(&stream1);
    cudaStreamCreate(&stream2);
    cudaStreamCreate(&stream3);

    // copy host memory to device
    //error = cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream1);
    // error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
    error = cudaHostGetDevicePointer(&d_A, h_A, 0); //zmiany

    if (error != cudaSuccess) {
        printf("cudaMemcpy (d_A,h_A) returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    //error = cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream2);
    // error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
    error = cudaHostGetDevicePointer(&d_B, h_B, 0); //zmiany

    if (error != cudaSuccess) {
        printf("cudaMemcpy (d_B,h_B) returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // Setup execution parameters
    dim3 threads(block_size, block_size);
    dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);

    // Create and start timer
    printf("Computing result using CUDA Kernel...\n");

    // Performs warmup operation using matrixMul CUDA kernel
    int version = 3;
    switch (version) {
        case 2:
            if (block_size == 16) {
                matrixMulCUDA_v2<16> << < grid, threads >> >(d_C, d_A, d_B, dimsA.x, dimsB.x);
            }
            else {
                matrixMulCUDA_v2<32> << < grid, threads >> >(d_C, d_A, d_B, dimsA.x, dimsB.x);
            }
            break;
        case 3:
            if (block_size == 16) {
                matrixMulCUDA_v3<16> << < grid, threads >> >(d_C, d_A, d_B, dimsA.x, dimsB.x);
            }
            else {
                matrixMulCUDA_v3<32> << < grid, threads >> >(d_C, d_A, d_B, dimsA.x, dimsB.x);
            }
            break;
        case 4:
            if (block_size == 16) {
                matrixMulCUDA_v4<16> << < grid, threads >> >(d_C, d_A, d_B, dimsA.x, dimsB.x);
            }
            else {
                matrixMulCUDA_v4<32> << < grid, threads >> >(d_C, d_A, d_B, dimsA.x, dimsB.x);
            }
            break;
    }

    printf("done\n");

    cudaDeviceSynchronize();

    // Allocate CUDA events that we'll use for timing
    cudaEvent_t start;
    error = cudaEventCreate(&start);


    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    cudaEvent_t stop;
    error = cudaEventCreate(&stop);

    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Record the start event
    error = cudaEventRecord(start, NULL);

    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Execute the kernel
    int nIter = 300;
    switch (version) {
        case 2:
            for (int j = 0; j < nIter; j++) {
                if (block_size == 16) {
                    matrixMulCUDA_v2<16> << < grid, threads >> >(d_C, d_A, d_B, dimsA.x, dimsB.x);
                }
                else {
                    matrixMulCUDA_v2<32> << < grid, threads >> >(d_C, d_A, d_B, dimsA.x, dimsB.x);
                }
            }
            break;
        case 3:
            for (int j = 0; j < nIter; j++) {
                if (block_size == 16) {
                    matrixMulCUDA_v3<16> << < grid, threads >> >(d_C, d_A, d_B, dimsA.x, dimsB.x);
                }
                else {
                    matrixMulCUDA_v3<32> << < grid, threads >> >(d_C, d_A, d_B, dimsA.x, dimsB.x);
                }
            }
            break;
        case 4:
            for (int j = 0; j < nIter; j++) {
                if (block_size == 16) {
                    matrixMulCUDA_v4<16> << < grid, threads >> >(d_C, d_A, d_B, dimsA.x, dimsB.x);
                }
                else {
                    matrixMulCUDA_v4<32> << < grid, threads >> >(d_C, d_A, d_B, dimsA.x, dimsB.x);
                }
            }
            break;
    }

    // Record the stop event
    error = cudaEventRecord(stop, NULL);
    //dobra teraz mamy tak mala macierz ze mozemy to sami obliczyc, moze dowiemy sie co jest nei tak
    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Wait for the stop event to complete
    error = cudaEventSynchronize(stop);

    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    float msecTotal = 0.0f;
    error = cudaEventElapsedTime(&msecTotal, start, stop);

    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Compute and print the performance
    float msecPerMatrixMul = msecTotal / nIter;
    double flopsPerMatrixMul = 2.0 * (double)dimsA.x * (double)dimsA.y * (double)dimsB.x;
    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
    printf(
        "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops, WorkgroupSize= %u threads/block\n",
        gigaFlops,
        msecPerMatrixMul,
        flopsPerMatrixMul,
        threads.x * threads.y);

    // Copy result from device to host
    //error = cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream3);
    // error = cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);
    error = cudaHostGetDevicePointer(&h_C, d_C, 0); //zmiany

    if (error != cudaSuccess) {
        printf("cudaMemcpy (h_C,d_C) returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    printf("Checking computed result for correctness: ");
    bool correct = true;

    for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++) {
        if (fabs(h_C[i] - (dimsA.x * valB)) > 1e-5) {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i, h_C[i], dimsA.x*valB);
            correct = false;
        }
    }

    printf("%s\n", correct ? "OK" : "FAIL");

    // Clean up memory
    free(h_A);
    free(h_B);
    free(h_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    //ZMIANY NADCHODZĄ #zmiany
    cudaStreamDestroy(stream1);
    cudaStreamDestroy(stream2);
    cudaStreamDestroy(stream3);

    printf("\nNote: For peak performance, please refer to the matrixMulCUBLAS example.\n");

    cudaDeviceReset();

    if (correct) {
        return EXIT_SUCCESS;
    } else {
        return EXIT_FAILURE;
    }
}


/**
* Program main
*/
int main(int argc, char **argv)
{
    printf("[Matrix Multiply Using CUDA] - Starting...\n");

    if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
        checkCmdLineFlag(argc, (const char **)argv, "?")) {
        printf("Usage -device=n (n >= 0 for deviceID)\n");
        printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
        printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
        printf("  Note: Outer matrix dimensions of A & B matrices must be equal.\n");

        exit(EXIT_SUCCESS);
    }

    // By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
    int devID = 0;

    if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
        devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
        cudaSetDevice(devID);
    }

    cudaError_t error;
    cudaDeviceProp deviceProp;
    error = cudaGetDevice(&devID);

    if (error != cudaSuccess) {
        printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);
    }

    error = cudaGetDeviceProperties(&deviceProp, devID);

    if (deviceProp.computeMode == cudaComputeModeProhibited){
        fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
        exit(EXIT_SUCCESS);
    }

    if (error != cudaSuccess){
        printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
    } else {
        printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
    }

    cudaGetDeviceProperties(&prop, 0);
    if (!prop.canMapHostMemory) {
        printf("Nie obsluguje zero-copy")
        exit(0);
    }
    cudaSetDeviceFlags(cudaDeviceMapHost);

    // Use a larger block size for Fermi and above
    int block_size = (deviceProp.major < 2) ? 16 : 32;
    int multiplier = 8;
    dim3 dimsA(multiplier * block_size, multiplier * block_size, 1);
    dim3 dimsB(multiplier * block_size, multiplier * block_size, 1);

    // width of Matrix A
    if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
        dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
    }

    // height of Matrix A
    if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
        dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
    }

    // width of Matrix B
    if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
        dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
    }

    // height of Matrix B
    if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
        dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
    }

    if (dimsA.x != dimsB.y) {
        printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
            dimsA.x, dimsB.y);
        exit(EXIT_FAILURE);
    }

    printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);

    int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);
    system("PAUSE");
    exit(matrix_result);
}