// Tiled matrix multiplication (OpenCL kernel).

// Edge length of the square __local tiles used by matrixMultiplyShared.
// NOTE(review): the kernel indexes those tiles with the local ids directly, so
// it appears to expect a TILE_WIDTH x TILE_WIDTH work-group — confirm on the host side.
#define TILE_WIDTH 16
/*
 * Tiled matrix multiplication: C = A * B.
 *
 * All three matrices are flat, row-major float buffers (element (r, c) of a
 * matrix with `cols` columns lives at index r * cols + c).
 *
 * Each work-item produces at most one element of C. The work-group cooperates
 * by staging one TILE_WIDTH x TILE_WIDTH tile of A and one of B into __local
 * memory per iteration, then every work-item accumulates the partial dot
 * product for its own (row, col) from those tiles.
 *
 * The tiles are indexed directly by the local ids and the global row/column
 * are derived from group_id * TILE_WIDTH + local_id, so the kernel must be
 * launched with a 2-D work-group of exactly TILE_WIDTH x TILE_WIDTH items.
 * The global size may overshoot the matrix dimensions: out-of-range loads are
 * replaced with zeros and out-of-range work-items do not write to C.
 *
 * Parameters:
 *   A            input matrix, numARows x numAColumns (read only)
 *   B            input matrix, numBRows x numBColumns (read only)
 *   C            output matrix, numCRows x numCColumns
 *   numARows     rows of A
 *   numAColumns  columns of A; also the length of the reduction dimension
 *   numBRows     rows of B (used only to bound loads from B)
 *   numBColumns  columns of B
 *   numCRows     rows of C
 *   numCColumns  columns of C
 */
__kernel void matrixMultiplyShared(const __global float *A,
                                   const __global float *B,
                                   __global float *C,
                                   int numARows,
                                   int numAColumns,
                                   int numBRows,
                                   int numBColumns,
                                   int numCRows,
                                   int numCColumns)
{
    // Work-group-shared staging tiles (local memory, not private memory).
    __local float ds_A[TILE_WIDTH][TILE_WIDTH];
    __local float ds_B[TILE_WIDTH][TILE_WIDTH];

    const int bx = get_group_id(0);
    const int by = get_group_id(1);
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);

    // Element of C this work-item is responsible for.
    const int row = by * TILE_WIDTH + ty;
    const int col = bx * TILE_WIDTH + tx;

    // Loop invariants: whether this work-item maps to a real element of C,
    // and how many tiles cover the reduction dimension (ceiling division).
    const bool inOutput = (row < numCRows) && (col < numCColumns);
    const int numTiles = (numAColumns - 1) / TILE_WIDTH + 1;

    // Single-precision literal: avoids a double constant on devices without fp64.
    float Cvalue = 0.0f;

    for (int m = 0; m < numTiles; ++m) {
        const int aCol = m * TILE_WIDTH + tx;  // column of A loaded by this item
        const int bRow = m * TILE_WIDTH + ty;  // row of B loaded by this item

        // Every work-item stores into both tiles, even when its own (row, col)
        // is outside C, so that the tiles are fully defined; positions that
        // fall outside the source matrices are padded with zero.
        if (row < numARows && aCol < numAColumns) {
            ds_A[ty][tx] = A[row * numAColumns + aCol];
        } else {
            ds_A[ty][tx] = 0.0f;
        }

        if (bRow < numBRows && col < numBColumns) {
            ds_B[ty][tx] = B[bRow * numBColumns + col];
        } else {
            ds_B[ty][tx] = 0.0f;
        }

        // Both tiles must be completely written before anyone reads them.
        barrier(CLK_LOCAL_MEM_FENCE);

        if (inOutput) {
            for (int k = 0; k < TILE_WIDTH; ++k) {
                Cvalue += ds_A[ty][k] * ds_B[k][tx];
            }
        }

        // Keep the tiles intact until every work-item has finished reading
        // them; the next iteration overwrites them. The barrier is outside
        // the conditional so all work-items in the group reach it.
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (inOutput) {
        C[row * numCColumns + col] = Cvalue;
    }
}