// Transfer

#include "time.h"
#include "stdio.h"
#include "stdlib.h"
#include "assert.h"
#include "cuda.h"


// Shorthand for the calling thread's x / y index inside its block
// (only meaningful in device code).
#define tx threadIdx.x
#define ty threadIdx.y


// Selector flags. None of them is referenced anywhere in the visible source.
#define KER_MONO 0x0
#define KER_GRID 0x1
#define MEM_STD 0x0
#define MEM_UNI 0x1

// 0x20 == 32: edge length of the shared-memory tiles declared in
// cudaTiledMatrixMultiplication, and the block edge main() passes to
// exec_cuda_tiled.
#define KER_SIZE 0x20


// Problem size: op1 is OP1_ROWS x OP1_COLS and the result holds
// OP1_ROWS x OP2_COLS elements.
// SIZE is the element count of op1. The exec_* functions also use it to
// allocate and fill op2; with OP1_COLS rows and OP2_COLS columns op2 has the
// same element count only because OP1_ROWS == OP2_COLS here.
const int OP1_ROWS = 2000,
          OP1_COLS = 500,
          OP2_COLS = 2000,
          SIZE = OP1_ROWS * OP1_COLS,
          ACC_SIZE = OP1_ROWS * OP2_COLS;


////////////////////////////DEVICE CODE HERE////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////
/*
 * Kernel: writes `value` into buffer[0 .. len-1].
 *
 * Each thread handles at most one element, selected by its flat x index
 * (blockIdx.x * blockDim.x + threadIdx.x). Threads whose index is not below
 * `len` do nothing, so the launch only has to provide at least `len` threads
 * along x for the whole buffer to be written.
 */
__global__
void cudaFill(float *buffer, float value, const int len) //aka matrixInit
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // Surplus threads of the last block have nothing to write.
    if(idx >= len)
    {
        return;
    }

    buffer[idx] = value;
}


/*
 * Kernel: naive matrix product acc = op1 * op2 on row-major buffers.
 *
 *   op1 : op1Rows x dim      (row pitch = dim)
 *   op2 : dim     x op2Cols  (row pitch = op2Cols)
 *   acc : op1Rows x op2Cols  (row pitch = op2Cols)
 *
 * The x dimension of the launch walks the rows of the result and the y
 * dimension walks its columns. Both walks advance by the total number of
 * threads in that dimension, so every output element is produced regardless
 * of how the grid size relates to the matrix size.
 */
__global__
void cudaMatrixMultiplication(float *op1, float *op2, float *acc, const int dim, const int op1Rows, const int op2Cols)
{
    const int rowStride = blockDim.x * gridDim.x;
    const int colStride = blockDim.y * gridDim.y;

    for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < op1Rows; i += rowStride)
    {
        for(int j = blockIdx.y * blockDim.y + threadIdx.y; j < op2Cols; j += colStride)
        {
            float scalarProduct = 0.0f;

            for(int k = 0; k < dim; k++)
            {
                // op2 has op2Cols elements per row, so row k starts at
                // k * op2Cols (not k * dim).
                scalarProduct += op1[i * dim + k] * op2[k * op2Cols + j];
            }

            // acc has op2Cols elements per row, so row i starts at
            // i * op2Cols (not i * op1Rows).
            acc[i * op2Cols + j] = scalarProduct;
        }
    }
}


/*
 * Kernel: shared-memory tiled matrix product acc = op1 * op2 on row-major
 * buffers.
 *
 *   op1 : op1Rows x dim      (row pitch = dim)
 *   op2 : dim     x op2Cols  (row pitch = op2Cols)
 *   acc : op1Rows x op2Cols  (row pitch = op2Cols)
 *
 * Launch requirements:
 *   - blockDim must be (KER_SIZE, KER_SIZE): the shared tiles are
 *     KER_SIZE x KER_SIZE and are indexed directly with threadIdx.
 *   - Any grid size works; each block walks the output in steps of the whole
 *     grid's extent.
 *
 * The y dimension of the launch maps to output rows, the x dimension to
 * output columns.
 */
__global__
void cudaTiledMatrixMultiplication(float *op1, float *op2, float *acc, const int dim, const int op1Rows, const int op2Cols)
{
    // The tiles hold matrix elements, so they must be float; an integer tile
    // would truncate every operand.
    __shared__ float s_op1[KER_SIZE][KER_SIZE];
    __shared__ float s_op2[KER_SIZE][KER_SIZE];

    // Any other block shape would leave tile cells unwritten or index past
    // the tile.
    assert(blockDim.x == KER_SIZE && blockDim.y == KER_SIZE);

    const int rowStride = blockDim.y * gridDim.y,
              colStride = blockDim.x * gridDim.x,
              tileCount = (dim + KER_SIZE - 0x1) / KER_SIZE;

    // The loops run over the block's tile origin, which is identical for all
    // threads of the block. Every thread therefore executes the same number
    // of iterations and reaches each __syncthreads() below; threads that fall
    // outside the matrix are masked by the bounds checks instead.
    for(int rowBase = blockIdx.y * blockDim.y; rowBase < op1Rows; rowBase += rowStride)
    {
        const int row = rowBase + ty;

        for(int colBase = blockIdx.x * blockDim.x; colBase < op2Cols; colBase += colStride)
        {
            const int col = colBase + tx;
            float scalarProduct = 0.0f;

            for(int p = 0x0; p < tileCount; p++)
            {
                const int op1Col = p * KER_SIZE + tx; // column of op1 this thread stages
                const int op2Row = p * KER_SIZE + ty; // row of op2 this thread stages

                // Out-of-range cells are zero so they contribute nothing to
                // the partial dot product.
                if(op1Col < dim && row < op1Rows)
                {
                    s_op1[ty][tx] = op1[row * dim + op1Col];
                }
                else
                {
                    s_op1[ty][tx] = 0.0f;
                }

                if(op2Row < dim && col < op2Cols)
                {
                    s_op2[ty][tx] = op2[op2Row * op2Cols + col];
                }
                else
                {
                    s_op2[ty][tx] = 0.0f;
                }

                // Both tiles must be fully written before anyone reads them.
                __syncthreads();

                for(int k = 0x0; k < KER_SIZE; k++)
                {
                    scalarProduct += s_op1[ty][k] * s_op2[k][tx];
                }

                // Nobody may overwrite the tiles while a neighbour still
                // reads them.
                __syncthreads();
            }

            if(row < op1Rows && col < op2Cols)
            {
                acc[row * op2Cols + col] = scalarProduct;
            }
        }
    }
}
//////////////////////////////////////////////////////////////////////////////


////////////////////////////HOST CODE HERE////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////
// Forward declarations of the host-side helpers defined below.

// CPU triple-loop matrix product of op1 and op2, written into acc.
void matrixMultiplication(float *op1, float *op2, float *acc, const int dim, const int op1Rows, const int op2Cols);
// Sets the first len elements of buffer to value.
void fill(float *buffer, const float value, const int len);
// Prints the first element of buffer that differs from value, if there is one.
void assertfy(float *buffer, const float value, const int len);
// Prints the first len elements of buffer on a single line.
void printUpTo(float *buffer, const int len);


/*
 * CPU reference matrix product acc = op1 * op2 on row-major buffers.
 *
 *   op1 : op1Rows x dim      (row pitch = dim)
 *   op2 : dim     x op2Cols  (row pitch = op2Cols)
 *   acc : op1Rows x op2Cols  (row pitch = op2Cols)
 *
 * Every element of acc is overwritten; op1 and op2 are only read.
 */
void matrixMultiplication(float *op1, float *op2, float *acc, const int dim, const int op1Rows, const int op2Cols)
{
    for(int i = 0x0; i < op1Rows; i++)
    {
        const float *op1Row = op1 + i * dim;
        // Rows of acc are op2Cols elements long (not op1Rows).
        float *accRow = acc + i * op2Cols;

        for(int j = 0x0; j < op2Cols; j++)
        {
            float scalarProduct = 0.0f;

            for(int k = 0x0; k < dim; k++)
            {
                // Rows of op2 are op2Cols elements long (not dim).
                scalarProduct += op1Row[k] * op2[k * op2Cols + j];
            }

            accRow[j] = scalarProduct;
        }
    }
}


/*
 * Host-side fill: stores `value` into buffer[0 .. len-1].
 * Nothing is written when len is zero or negative.
 */
void fill(float *buffer, const float value, const int len)
{
    float *cursor = buffer;
    int remaining = len;

    while(remaining > 0x0)
    {
        *cursor = value;
        ++cursor;
        --remaining;
    }
}


/*
 * Scans buffer[0 .. len-1] for the first element that is not exactly equal
 * to `value`. If one is found it is printed and the scan stops; the function
 * never aborts the program and never modifies the buffer.
 */
void assertfy(float *buffer, const float value, const int len)
{
    const float *cursor = buffer;

    for(int remaining = len; remaining > 0x0; --remaining, ++cursor)
    {
        if(*cursor != value)
        {
            printf("[x] Illegal Found: %1.2f\n", *cursor);
            return;
        }

        // Holds by construction: a mismatching element has already returned.
        assert(*cursor == value);
    }
}


/*
 * Prints buffer[0 .. len-1] to stdout, each value with two decimals and
 * followed by a tab, then terminates the line.
 */
void printUpTo(float *buffer, const int len)
{
    const float *cursor = buffer;

    for(int remaining = len; remaining > 0x0; --remaining)
    {
        printf("%1.2f\t", *cursor);
        ++cursor;
    }

    printf("\n");
}
//////////////////////////////////////////////////////////////////////////////


//////////////////////EXECUTION TYPE HERE/////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////
// Forward declarations of the benchmark drivers defined below.

// Runs the CPU-only multiplication and prints the elapsed time.
void exec_host_only();
// Runs cudaMatrixMultiplication on managed memory; the thread block is
// built as dim3(ky, kx, 1).
void exec_cuda_std(const int ky, const int kx);
// Runs cudaTiledMatrixMultiplication on managed memory; the thread block is
// built as dim3(ky, kx, 1).
void exec_cuda_tiled(const int ky, const int kx);


void exec_host_only()
{
        float *op1,
          *op2,
          *acc;

    op1 = (float*) malloc(SIZE * sizeof(float));
    op2 = (float*) malloc(SIZE * sizeof(float));
    acc = (float*) malloc(ACC_SIZE * sizeof(float));

    clock_t begin = clock();

    fill(op1, 1.0f, SIZE);
    fill(op2, 2.0f, SIZE);

    matrixMultiplication(op1, op2, acc, OP1_COLS, OP1_ROWS, OP2_COLS);

    clock_t end = clock();
    float elapsedTime = (float)(end - begin) / CLOCKS_PER_SEC;

    printf("Elapsed Time: %1.2f\n", elapsedTime);

    assertfy(acc, 1000.0f, ACC_SIZE);
    printUpTo(acc, 0xa);

    free(op1);
    free(op2);
    free(acc);
}


/* Prints a diagnostic and terminates the program if a CUDA runtime call failed. */
static void checkCuda(cudaError_t status, const char *what)
{
    if(status != cudaSuccess)
    {
        fprintf(stderr, "[x] %s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}


/*
 * Allocates op1, op2 and acc as managed memory and launches cudaFill so that
 * op1 holds 1.0f and op2 holds 2.0f. The fills are only launched here; the
 * caller's later kernels run after them in the same (default) stream.
 */
static void setupOperands(float **op1, float **op2, float **acc)
{
    // op2 has OP1_COLS rows and OP2_COLS columns; size it from its own
    // dimensions instead of relying on it matching op1's element count.
    const int op2Size = OP1_COLS * OP2_COLS;
    const int fillThreads = 0x400;

    checkCuda(cudaMallocManaged((void**) op1, SIZE * sizeof(float), cudaMemAttachGlobal), "cudaMallocManaged(op1)");
    checkCuda(cudaMallocManaged((void**) op2, op2Size * sizeof(float), cudaMemAttachGlobal), "cudaMallocManaged(op2)");
    checkCuda(cudaMallocManaged((void**) acc, ACC_SIZE * sizeof(float), cudaMemAttachGlobal), "cudaMallocManaged(acc)");

    // Integer ceil-division: enough blocks to give every element a thread.
    dim3 fillBlock(fillThreads, 0x1, 0x1);
    dim3 op1Grid((SIZE + fillThreads - 0x1) / fillThreads, 0x1, 0x1);
    dim3 op2Grid((op2Size + fillThreads - 0x1) / fillThreads, 0x1, 0x1);

    cudaFill<<<op1Grid, fillBlock>>>(*op1, 1.0f, SIZE);
    checkCuda(cudaGetLastError(), "cudaFill(op1) launch");

    cudaFill<<<op2Grid, fillBlock>>>(*op2, 2.0f, op2Size);
    checkCuda(cudaGetLastError(), "cudaFill(op2) launch");
}


/*
 * Waits for all device work, checks and prints the result, and releases the
 * three managed buffers.
 */
static void verifyAndRelease(float *op1, float *op2, float *acc)
{
    // Kernel execution errors surface here; the host must not touch acc
    // before the device has finished.
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    // Each output element is a sum of OP1_COLS products 1.0f * 2.0f.
    assertfy(acc, 1.0f * 2.0f * OP1_COLS, ACC_SIZE);
    printUpTo(acc, 0xa);

    checkCuda(cudaFree(op1), "cudaFree(op1)");
    checkCuda(cudaFree(op2), "cudaFree(op2)");
    checkCuda(cudaFree(acc), "cudaFree(acc)");
}


/*
 * Runs the naive kernel cudaMatrixMultiplication on managed memory.
 *
 * ky, kx: thread-block extent; the block is built as dim3(ky, kx, 1), i.e.
 *         ky is the x extent and kx the y extent.
 *
 * Any CUDA failure terminates the program with a message on stderr.
 */
void exec_cuda_std(const int ky, const int kx)
{
    float *op1,
          *op2,
          *acc;

    setupOperands(&op1, &op2, &acc);

    dim3 blockDim_(ky, kx, 0x1);
    dim3 gridDim_(0x20, 0x20, 0x1);

    printf("[*] gridDim(%u, %u)\n", gridDim_.x, gridDim_.y);
    printf("[*] blockDim(%u, %u)\n", blockDim_.x, blockDim_.y);

    cudaMatrixMultiplication<<<gridDim_, blockDim_>>>(op1, op2, acc, OP1_COLS, OP1_ROWS, OP2_COLS);
    checkCuda(cudaGetLastError(), "cudaMatrixMultiplication launch");

    verifyAndRelease(op1, op2, acc);
}


/*
 * Runs the shared-memory kernel cudaTiledMatrixMultiplication on managed
 * memory.
 *
 * ky, kx: thread-block extent; the block is built as dim3(ky, kx, 1). Both
 *         must equal KER_SIZE because the kernel's shared tiles are
 *         KER_SIZE x KER_SIZE and are indexed with threadIdx; any other
 *         value is rejected with a message on stderr.
 *
 * Any CUDA failure terminates the program with a message on stderr.
 */
void exec_cuda_tiled(const int ky, const int kx)
{
    if(ky != KER_SIZE || kx != KER_SIZE)
    {
        fprintf(stderr, "[x] Tiled kernel requires a %dx%d block, got %dx%d\n", KER_SIZE, KER_SIZE, ky, kx);
        return;
    }

    float *op1,
          *op2,
          *acc;

    setupOperands(&op1, &op2, &acc);

    dim3 blockDim_(ky, kx, 0x1);
    // NOTE(review): 512 x 512 blocks of 32 x 32 threads span far more than the
    // 2000 x 2000 output. The kernel's strided loops make the surplus blocks
    // harmless, but a ceil-div grid would be sufficient.
    dim3 gridDim_(0x200, 0x200, 0x1);

    printf("[*] gridDim(%u, %u)\n", gridDim_.x, gridDim_.y);
    printf("[*] blockDim(%u, %u)\n", blockDim_.x, blockDim_.y);

    cudaTiledMatrixMultiplication<<<gridDim_, blockDim_>>>(op1, op2, acc, OP1_COLS, OP1_ROWS, OP2_COLS);
    checkCuda(cudaGetLastError(), "cudaTiledMatrixMultiplication launch");

    verifyAndRelease(op1, op2, acc);
}
//////////////////////////////////////////////////////////////////////////////


/*
 * Entry point: runs the tiled CUDA benchmark with a KER_SIZE x KER_SIZE
 * thread block. Command-line arguments are not consulted.
 */
int main(int argc, char const *argv[])
{
    (void) argc;
    (void) argv;

    exec_cuda_tiled(KER_SIZE, KER_SIZE);

    return 0;
}