Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <iostream>
- #include <fstream>
- using namespace std;
- #define TILE_WIDTH 16
// Fill the first `size` slots of `data` with pseudo-random floats in
// [0, 1], drawn from the C library rand() generator (seeded, if at all,
// by the caller via srand()).
void randomFloats(float* data, int size){
    int remaining = size;
    float* cursor = data;
    while (remaining-- > 0) {
        *cursor++ = rand() / (float)RAND_MAX;
    }
}
// Compute C = A * B with shared-memory tiling.
// Expected launch geometry: blockDim = (TILE_WIDTH, TILE_WIDTH),
// gridDim = (ceil(numCColumns/TILE_WIDTH), ceil(numCRows/TILE_WIDTH)).
// Dimensions that are not multiples of TILE_WIDTH are handled by
// zero-padding the staged tiles, so the dot product is unaffected.
__global__ void matrixMultiplyShared(float * A, float * B, float * C,
                                     int numARows, int numAColumns,
                                     int numBRows, int numBColumns,
                                     int numCRows, int numCColumns) {
    __shared__ float ds_A[TILE_WIDTH][TILE_WIDTH];
    __shared__ float ds_B[TILE_WIDTH][TILE_WIDTH];

    int bx = blockIdx.x; int by = blockIdx.y;
    int tx = threadIdx.x; int ty = threadIdx.y;
    int row = by * TILE_WIDTH + ty;   // global row of C this thread owns
    int col = bx * TILE_WIDTH + tx;   // global column of C this thread owns

    float Cvalue = 0.0f;  // float literal — avoid silent double promotion

    // Walk the tiles along A's columns / B's rows.
    for (int m = 0; m < (numAColumns - 1) / TILE_WIDTH + 1; ++m) {
        // Stage one tile of A and one of B, zero-padding out-of-range
        // elements; branchless selects avoid divergent load paths.
        ds_A[ty][tx] = (row < numARows && m * TILE_WIDTH + tx < numAColumns)
                           ? A[row * numAColumns + m * TILE_WIDTH + tx]
                           : 0.0f;
        ds_B[ty][tx] = (m * TILE_WIDTH + ty < numBRows && col < numBColumns)
                           ? B[(m * TILE_WIDTH + ty) * numBColumns + col]
                           : 0.0f;
        __syncthreads();  // tiles fully written before any thread reads them

        // Tiles are zero-padded, so every thread may accumulate without a
        // bounds guard (the original guarded here, causing warp divergence);
        // out-of-range threads compute a value that is simply never stored.
        #pragma unroll
        for (int k = 0; k < TILE_WIDTH; ++k)
            Cvalue += ds_A[ty][k] * ds_B[k][tx];
        __syncthreads();  // all reads done before the next tile overwrites
    }

    if (row < numCRows && col < numCColumns)
        C[row * numCColumns + col] = Cvalue;
}
// Host driver: builds two random 4096x4096 matrices, multiplies them on
// the GPU with matrixMultiplyShared, and copies the product back.
// The commented-out dump of C to "output.raw" is kept for optional
// verification, as in the original.
int main(int argc, char ** argv) {
    float * hostA; // The A matrix
    float * hostB; // The B matrix
    float * hostC; // The output C matrix
    float * deviceA;
    float * deviceB;
    float * deviceC;
    int numARows;    // number of rows in the matrix A
    int numAColumns; // number of columns in the matrix A
    int numBRows;    // number of rows in the matrix B
    int numBColumns; // number of columns in the matrix B
    int numCRows;    // number of rows in the matrix C
    int numCColumns; // number of columns in the matrix C
    ofstream fout("output.raw");

    numARows = numAColumns = 4096;
    numBRows = numBColumns = 4096;

    hostA = (float*)malloc(numARows * numAColumns * sizeof(float));
    hostB = (float*)malloc(numBRows * numBColumns * sizeof(float));
    if (hostA == NULL || hostB == NULL) {
        cerr << "Host allocation of A/B failed" << endl;
        free(hostA); free(hostB);   // free(NULL) is a no-op
        return 1;
    }
    randomFloats(hostA, numARows * numAColumns);
    randomFloats(hostB, numBRows * numBColumns);

    // C inherits A's row count and B's column count.
    numCRows = numARows;
    numCColumns = numBColumns;
    hostC = (float*)malloc(numCRows * numCColumns * sizeof(float));
    if (hostC == NULL) {
        cerr << "Host allocation of C failed" << endl;
        free(hostA); free(hostB);
        return 1;
    }

    // Device allocations — check each call; the original ignored failures,
    // which would surface later as a cryptic memcpy/kernel error.
    cudaError_t err = cudaMalloc((void**)&deviceA, numARows * numAColumns * sizeof(float));
    if (err == cudaSuccess)
        err = cudaMalloc((void**)&deviceB, numBRows * numBColumns * sizeof(float));
    if (err == cudaSuccess)
        err = cudaMalloc((void**)&deviceC, numCRows * numCColumns * sizeof(float));
    if (err != cudaSuccess) {
        cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << endl;
        cudaFree(deviceA); cudaFree(deviceB); cudaFree(deviceC);
        free(hostA); free(hostB); free(hostC);
        return 1;
    }

    cudaMemcpy(deviceA, hostA, numARows * numAColumns * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(deviceB, hostB, numBRows * numBColumns * sizeof(float), cudaMemcpyHostToDevice);

    // Block shape must match the kernel's TILE_WIDTH assumption; the
    // original hard-coded 16 here, which would silently break if
    // TILE_WIDTH were ever changed.
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
    dim3 dimGrid((numCColumns + dimBlock.x - 1) / dimBlock.x,
                 (numCRows    + dimBlock.y - 1) / dimBlock.y);

    matrixMultiplyShared<<<dimGrid, dimBlock>>>(deviceA, deviceB, deviceC,
                                                numARows, numAColumns,
                                                numBRows, numBColumns,
                                                numCRows, numCColumns);
    err = cudaGetLastError();           // catch launch-configuration errors
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();  // replaces deprecated cudaThreadSynchronize()
    if (err != cudaSuccess)
        cerr << "Kernel execution failed: " << cudaGetErrorString(err) << endl;

    cudaMemcpy(hostC, deviceC, numCRows * numCColumns * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(deviceA);
    cudaFree(deviceB);
    cudaFree(deviceC);

    /*
    for(int i=0; i<numCRows; i++){
        for(int j=0; j<numCColumns; j++){
            fout << hostC[i*numCColumns+j] << " ";
        }
        fout << "\n";
    }
    */

    free(hostA);
    free(hostB);
    free(hostC);
    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement