CUDA Matrix Multiplication
JRsz, Nov 15th, 2016

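/*
 * CUDA matrix multiplication benchmark.
 *
 * Multiplies two randomly filled matrices M (m_x rows by m_y columns) and N (n_x rows by
 * n_y columns) three ways: a sequential CPU loop used as the reference, a naive CUDA
 * kernel, and a tiled CUDA kernel that stages sub-blocks of M and N in shared memory.
 * CPU time is measured with std::chrono, GPU time with CUDA events, and each GPU result
 * is checked against the CPU result.
 *
 * Example build on Linux (the file name is assumed, it is not part of the original paste):
 *   nvcc -std=c++11 -o matmul matmul.cu
 */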
#include "cuda_runtime.h"
#include "device_functions.h"
#include "device_launch_parameters.h"

#include <chrono>
#include <math.h>   // fabsf, used when verifying the GPU results
#include <time.h>
#include <stdio.h>
#include <stdlib.h>

cudaError_t matrMulCuda(float *matrR, const float *matrM, const float *matrN, int m_x, int m_y, int n_x, int n_y, int tile_width, bool shredMem);
void sequentialMul(float *matrVrfy, const float *matrM, const float *matrN, int m_x, int m_y, int n_x, int n_y);
__global__ void mulKernel(float *matrR, const float *matrM, const float *matrN, const int m_x, const int m_y, const int n_x, const int n_y);
__global__ void mulKernelSM(float *matrR, const float *matrM, const float *matrN, const int m_x, const int m_y, const int n_x, const int n_y, const int tile_width);

int main()
{
    bool match;
    int i, j, k, l, amount, m_x, m_y, n_x, n_y, sizeM, sizeN, sizeR;
    cudaError_t cudaStatus;

    float *matrM;
    float *matrN;
    float *matrR;
    float *matrVrfy;

    bool shared[2] = { false, true };
    //Note: a 64x64 block would be 4096 threads, which exceeds the CUDA limit of
    //1024 threads per block, so the last configuration cannot be launched as-is.
    int tile_width[3] = { 16, 32, 64 };
    int amountTiles = 1; //TODO: Change me back to three

    int dimensions[5][4] = { { 300, 1000, 1000, 500 },
                             { 30, 50, 50, 40 },
                             { 1000, 1000, 1000, 1000 },
                             { 50, 10000, 10000, 80 },
                             { 5000, 40, 40, 7000 } };
    int amountDimensions = 1;

    srand((unsigned int)time(NULL));

    for (i = 0; i < amountDimensions; i++)
    {
        m_x = dimensions[i][0];
        m_y = dimensions[i][1];
        n_x = dimensions[i][2];
        n_y = dimensions[i][3];

        //allocate memory for the one-dimensional representation of the 2D arrays
        sizeM = m_x * m_y;// x down and y right -> 300 rows and 1000 cols
        sizeN = n_x * n_y;// 1000 rows and 500 cols
        sizeR = m_x * n_y;
        matrM = (float*)malloc(sizeM * sizeof(float));
        matrN = (float*)malloc(sizeN * sizeof(float));
        matrR = (float*)malloc(sizeR * sizeof(float));
        matrVrfy = (float*)malloc(sizeR * sizeof(float));

        //Fill both matrices with random values
        for (j = 0; j < sizeM; j++)
        {
            matrM[j] = static_cast<float>(rand()) / (static_cast<float>(RAND_MAX / 1000.0)); // random floats between 0 and 1000
        }

        for (j = 0; j < sizeN; j++)
        {
            matrN[j] = static_cast<float>(rand()) / (static_cast<float>(RAND_MAX / 1000.0)); // random floats between 0 and 1000
        }

        /*float tmp = 1.0;
        //Fill both matrices with sequential values (useful for debugging)
        for (j = 0; j < sizeM; j++)
        {
            matrM[j] = tmp;
            tmp++;
        }
        tmp = 1.0;
        for (j = 0; j < sizeN; j++)
        {
            matrN[j] = tmp;
            tmp++;
        }*/

        //calc sequential version to confirm the GPU results
        auto start = std::chrono::high_resolution_clock::now();

        sequentialMul(matrVrfy, matrM, matrN, m_x, m_y, n_x, n_y);

        auto end = std::chrono::high_resolution_clock::now();

        std::chrono::duration<double> diff = end - start;
        printf("Matrix Multiplication with M: %dx%d, N: %dx%d:\n", m_x, m_y, n_x, n_y);
        printf("CPU Time:            %f ms\n", diff.count() * 1000);

        for (j = 0; j < amountTiles; j++)
        {
            for (k = 0; k < 2; k++)
            {
                //Multiply both matrices on the GPU
                cudaStatus = matrMulCuda(matrR, matrM, matrN, m_x, m_y, n_x, n_y, tile_width[j], shared[k]);
                if (cudaStatus != cudaSuccess)
                {
                    fprintf(stderr, "matrMulCuda failed!\n");
                    return 1;
                }

                match = true;

                amount = m_x * n_y;
                for (l = 0; l < amount; l++)//300*500
                {
                    //printf("(l, R, Vrfy: %d, %f, %f)\n", l, matrR[l], matrVrfy[l]);

                    //Compare with a small tolerance: the GPU accumulates the products in a
                    //different order than the CPU, so bit-exact equality cannot be expected.
                    float tol = 1e-3f * fabsf(matrVrfy[l]) + 1e-3f;
                    if (fabsf(matrR[l] - matrVrfy[l]) > tol)
                    {
                        printf("Result does not match! (l, R, Vrfy: %d, %f, %f)\n", l, matrR[l], matrVrfy[l]);
                        match = false;

                        break;
                    }
                }

                if (match)
                {
                    //printf("Result matches sequential calculation.\n");
                }

                //Reset result for next use
                free(matrR);
                matrR = (float*)malloc(sizeR * sizeof(float));
            }

            printf("\n");
        }

        printf("\n");

        //free memory
        free(matrM);
        free(matrN);
        free(matrR);
        free(matrVrfy);
    }

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceReset failed!\n");
        return 1;
    }

    //printf("\n");
    //printf("Press Enter to exit program...");
    //getchar();

    return 0;
}

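// mulKernel: naive matrix multiplication, R = M * N.
// Each thread computes one element of R by reading a full row of M and a full column
// of N directly from global memory and accumulating the dot product.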
__global__ void mulKernel(float *matrR, const float *matrM, const float *matrN, const int m_x, const int m_y, const int n_x, const int n_y)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int i;

    if ((row < m_x) && (col < n_y))
    {
        float tmp = 0.0;
        for (i = 0; i < m_y; i++)
        {
            tmp += matrM[row * m_y + i] * matrN[col + n_y * i];
        }

        matrR[row * n_y + col] = tmp;
    }
}

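// mulKernelSM: tiled matrix multiplication using dynamically allocated shared memory.
// The shared buffer passed at launch holds two tile_width x tile_width tiles, one for M
// and one for N. Each iteration of the outer loop stages one tile of each input into
// shared memory (zero-padding out-of-range elements), synchronizes, and accumulates the
// partial dot products before moving on to the next tile.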
__global__ void mulKernelSM(float *matrR, const float *matrM, const float *matrN, const int m_x, const int m_y, const int n_x, const int n_y, const int tile_width)
{
    int i, j;
    extern __shared__ float shared[];
    float *matrM_sm = shared;
    float *matrN_sm = &shared[tile_width * tile_width];

    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    int row = by * tile_width + ty;
    int col = bx * tile_width + tx;

    // The accumulator must survive across all tile iterations; resetting it inside the
    // loop would discard every partial sum except the one from the last tile.
    float tmp = 0.0;
    int limit = ceil(m_y / (float) tile_width);
    for (i = 0; i < limit; i++)
    {
        if (i * tile_width + tx < m_y && row < m_x)
            matrM_sm[ty * tile_width + tx] = matrM[row * m_y + (i * tile_width + tx)];
        else
            matrM_sm[ty * tile_width + tx] = 0.0;

        if (i * tile_width + ty < n_x && col < n_y)
            matrN_sm[ty * tile_width + tx] = matrN[col + (i * tile_width + ty) * n_y];
        else
            matrN_sm[ty * tile_width + tx] = 0.0;

        __syncthreads();

        for (j = 0; j < tile_width; j++)
            tmp += matrM_sm[ty * tile_width + j] * matrN_sm[j * tile_width + tx];

        __syncthreads();
    }

    if (row < m_x && col < n_y)
        matrR[row * n_y + col] = tmp;
}

// Helper function for using CUDA to multiply matrices M and N.
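// It allocates device buffers, copies M and N to the GPU, launches either mulKernel or
// mulKernelSM (selected by the shredMem flag), times the launch with CUDA events and
// copies the result R back into host memory.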
cudaError_t matrMulCuda(float *matrR, const float *matrM, const float *matrN, int m_x, int m_y, int n_x, int n_y, int tile_width, bool shredMem)
{
    float *dev_matrR = 0;
    float *dev_matrM = 0;
    float *dev_matrN = 0;
    cudaError_t cudaStatus;
    dim3 dim_grid, dim_block;
    cudaEvent_t start, stop;
    float elapsed = 0; // declared before the first goto so no initialization is skipped

    int sizeR = m_x * n_y;
    int sizeM = m_x * m_y;
    int sizeN = n_x * n_y;
    int shared = 2 * tile_width * tile_width * sizeof(float); // one tile of M plus one tile of N

    dim_block.x = tile_width;
    dim_block.y = tile_width;
    dim_block.z = 1;

    dim_grid.x = (n_y + dim_block.x - 1) / tile_width;
    dim_grid.y = (m_x + dim_block.y - 1) / tile_width;
    dim_grid.z = 1;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?\n");
        goto Error;
    }

    // Allocate GPU buffers for the three matrices.
    cudaStatus = cudaMalloc((void**)&dev_matrR, sizeR * sizeof(float));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_matrM, sizeM * sizeof(float));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_matrN, sizeN * sizeof(float));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }

    // Copy the input matrices from host memory to the GPU buffers.
    cudaStatus = cudaMemcpy(dev_matrM, matrM, sizeM * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!\n");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_matrN, matrN, sizeN * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!\n");
        goto Error;
    }

    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    if (shredMem)
    {
        cudaEventRecord(start);
        // Launch a kernel on the GPU with one thread for each element of the result.
        mulKernelSM<<<dim_grid, dim_block, shared>>>(dev_matrR, dev_matrM, dev_matrN, m_x, m_y, n_x, n_y, tile_width);

        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
    }
    else
    {
        cudaEventRecord(start);
        // Launch a kernel on the GPU with one thread for each element of the result.
        mulKernel<<<dim_grid, dim_block>>>(dev_matrR, dev_matrM, dev_matrN, m_x, m_y, n_x, n_y);

        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
    }

    cudaEventElapsedTime(&elapsed, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    if (shredMem)
    {
        printf("GPU Time with SM:    %f ms, tile_width: %d\n", elapsed, tile_width);
    }
    else
    {
        printf("GPU Time without SM: %f ms, tile_width: %d\n", elapsed, tile_width);
    }

    // Check for any errors launching the kernel.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "mulKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching mulKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy the output matrix from the GPU buffer back to host memory.
    cudaStatus = cudaMemcpy(matrR, dev_matrR, sizeR * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!\n");
        goto Error;
    }

Error:
    cudaFree(dev_matrR);
    cudaFree(dev_matrM);
    cudaFree(dev_matrN);

    return cudaStatus;
}

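// sequentialMul: straightforward triple-loop CPU implementation used as the reference
// result. Each element of matrVrfy is the dot product of a row of M and a column of N.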
void sequentialMul(float *matrVrfy, const float *matrM, const float *matrN, int m_x, int m_y, int n_x, int n_y)
{
    int i, j, k, curPos;
    float tmp;

    for (i = 0; i < m_x; i++)//300
    {
        for (j = 0; j < n_y; j++)//500
        {
            curPos = i * n_y + j;

            tmp = 0.0;
            for (k = 0; k < m_y; k++)
            {
                tmp += matrM[i * m_y + k] * matrN[j + n_y * k];
            }

            matrVrfy[curPos] = tmp;
        }
    }
}