Untitled

/**
 * Tiled kernel computing pairwise squared Euclidean distances.
 *
 * Despite its name, this kernel does not form a matrix product. It uses the
 * access pattern of a tiled matrix multiply but accumulates (a - b)^2 instead
 * of a * b:
 *
 *     C[r][c] = sum over k in [0, wA) of (A[r][k] - B[k][c])^2
 *
 * Storage layout, as implied by the indexing below:
 *   A (objects)        : hA x wA, row-major
 *   B (deviceClusters) : wA x wB, row-major
 *   C (deviceC)        : hA x wB, row-major
 *
 * Launch requirements:
 *   - blockDim must be exactly (32, 32): each thread fills one slot of each
 *     32x32 shared tile, and every thread then reads a full tile row/column.
 *   - gridDim.x must cover the wB output columns and gridDim.y the hA output
 *     rows, i.e. at least (ceil(wB / 32), ceil(hA / 32)).
 *   - Uses 2 * 32 * 32 floats (8 KB) of static shared memory.
 *
 * Matrix dimensions need not be multiples of 32: out-of-range tile slots are
 * zero-filled in BOTH tiles, so they contribute (0 - 0)^2 = 0 to the sum.
 */
__global__ static
void matrixMultiply(int wA,                    // columns of A (== rows of B)
                    int hA,                    // rows of A
                    int wB,                    // columns of B
                    float *objects,            // matrix A
                    float *deviceClusters,     // matrix B
                    float *deviceC)            // matrix C
{
    const int TILE = 32;

    // No `volatile` needed: every cross-thread access to the tiles is
    // separated by __syncthreads(), which also orders shared-memory accesses.
    __shared__ float AS[TILE][TILE];
    __shared__ float BS[TILE][TILE];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Output element owned by this thread.
    const int row = blockIdx.y * TILE + ty;
    const int col = blockIdx.x * TILE + tx;
    const bool rowValid = row < hA;
    const bool colValid = col < wB;

    float cSub = 0.0f;

    // Walk the shared dimension (columns of A / rows of B) one tile at a time.
    for (int kBase = 0; kBase < wA; kBase += TILE)
    {
        const int aCol = kBase + tx;   // column of A loaded by this thread
        const int bRow = kBase + ty;   // row of B loaded by this thread

        // Load one element per thread, zero-filling anything out of range.
        // The B guard must test this thread's own row (kBase + ty), not just
        // the tile's first row; otherwise the last tile reads past the end of
        // B whenever wA is not a multiple of TILE and corrupts the result.
        // Offsets are computed in size_t so large matrices cannot overflow int.
        AS[ty][tx] = (rowValid && aCol < wA)
                         ? objects[(size_t)row * wA + aCol]
                         : 0.0f;
        BS[ty][tx] = (colValid && bRow < wA)
                         ? deviceClusters[(size_t)bRow * wB + col]
                         : 0.0f;

        __syncthreads();   // both tiles fully written before anyone reads them

        for (int i = 0; i < TILE; i++)
        {
            const float diff = AS[ty][i] - BS[i][tx];
            cSub += diff * diff;
        }

        __syncthreads();   // everyone done reading before the tiles are overwritten
    }

    // Store the result; threads outside the hA x wB output write nothing.
    if (rowValid && colValid)
        deviceC[(size_t)row * wB + col] = cSub;
}