Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/**
 * Tiled pairwise squared-difference kernel.
 *
 * Despite its name, this kernel does not compute a matrix product. For every
 * row r of A and every column c of B it accumulates
 *
 *     C[r][c] = sum over k in [0, wA) of (A[r][k] - B[k][c])^2
 *
 * All three matrices are indexed as dense row-major arrays:
 *     objects        (A) : hA rows x wA columns
 *     deviceClusters (B) : wA rows x wB columns
 *     deviceC        (C) : hA rows x wB columns
 *
 * Launch requirements:
 *   - blockDim must be (32, 32): threadIdx.x / threadIdx.y index directly
 *     into the 32 x 32 shared-memory tiles.
 *   - blockIdx.x selects a 32-column strip of C and blockIdx.y a 32-row
 *     strip, so the grid needs at least ceil(wB / 32) x ceil(hA / 32) blocks
 *     to cover all of C. Surplus threads are masked out by bounds checks.
 *   - Uses 2 * 32 * 32 * sizeof(float) = 8 KiB of static shared memory.
 *
 * @param wA              number of columns of A (== number of rows of B)
 * @param hA              number of rows of A
 * @param wB              number of columns of B
 * @param objects         matrix A, read only
 * @param deviceClusters  matrix B, read only
 * @param deviceC         matrix C, written only
 */
__global__ static
void matrixMultiply(int wA,                 // A columns / B rows
                    int hA,                 // A rows
                    int wB,                 // B columns
                    float *objects,         // matrix A
                    float *deviceClusters,  // matrix B
                    float *deviceC)         // matrix C
{
    const int TILE = 32;

    // No `volatile`: visibility between threads is provided by the
    // __syncthreads() barriers below, and volatile would force every tile
    // read in the inner loop to be re-issued.
    __shared__ float tileA[TILE][TILE];
    __shared__ float tileB[TILE][TILE];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Element of C owned by this thread.
    const int row = blockIdx.y * TILE + ty;
    const int col = blockIdx.x * TILE + tx;
    const bool rowInRange = row < hA;
    const bool colInRange = col < wB;

    float cSub = 0.0f;

    // Walk the shared dimension k in TILE-wide steps.
    for (int kBase = 0; kBase < wA; kBase += TILE)
    {
        const int kA = kBase + tx;  // column of A loaded by this thread
        const int kB = kBase + ty;  // row of B loaded by this thread

        // Out-of-range elements are padded with 0 in BOTH tiles so that they
        // contribute (0 - 0)^2 == 0 to the sum. The B guard must test the
        // actual row (kBase + ty), not just the tile base; otherwise the last
        // partial tile reads past the end of B and pollutes the result.
        // Offsets are computed in size_t to avoid 32-bit overflow on large
        // matrices.
        tileA[ty][tx] = (rowInRange && kA < wA)
                            ? objects[(size_t)row * wA + kA]
                            : 0.0f;
        tileB[ty][tx] = (colInRange && kB < wA)
                            ? deviceClusters[(size_t)kB * wB + col]
                            : 0.0f;

        // Both tiles must be fully populated before any thread reads them.
        __syncthreads();

        for (int i = 0; i < TILE; ++i)
        {
            const float diff = tileA[ty][i] - tileB[i][tx];
            cSub += diff * diff;
        }

        // Every thread must be done reading the tiles before the next
        // iteration overwrites them (tileB columns are shared across warps).
        __syncthreads();
    }

    // Store the result; threads outside C only helped with tile loading.
    if (rowInRange && colInRange)
        deviceC[(size_t)row * wB + col] = cSub;
}
Advertisement
Add Comment
Please sign in to add a comment
Advertisement