Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/*
 * Edge length of the square tile processed by one work-group.
 * May be overridden when the program is built (e.g. "-D TILE_WIDTH=8");
 * the local work size used at enqueue time must then match.
 */
#ifndef TILE_WIDTH
#define TILE_WIDTH 16
#endif

/**
 * Tiled single-precision matrix multiply: C = A * B.
 *
 * All three matrices are indexed row-major (element (r, c) of a matrix with
 * `cols` columns lives at index r * cols + c).
 *
 * Each work-item accumulates one element of C. The work-group walks along the
 * shared dimension one tile at a time, staging a TILE_WIDTH x TILE_WIDTH tile
 * of A and of B in __local memory so every staged value is reused by
 * TILE_WIDTH work-items instead of being re-read from __global memory.
 *
 * Launch contract (not checked by the kernel):
 *   - 2-D NDRange; dimension 0 maps to columns of C, dimension 1 to rows.
 *   - The local work size must be exactly TILE_WIDTH x TILE_WIDTH: the local
 *     arrays and the row/col computation are sized for that shape.
 *   - Work-items that fall outside C are harmless (they stage zeros or valid
 *     data and skip the final store), so the global size may be rounded up
 *     to a multiple of TILE_WIDTH.
 *
 * @param A            input matrix, numARows x numAColumns (read only)
 * @param B            input matrix, numBRows x numBColumns (read only)
 * @param C            output matrix, numCRows x numCColumns
 * @param numARows     rows of A
 * @param numAColumns  columns of A; also determines how many tiles are walked
 * @param numBRows     rows of B (presumably equal to numAColumns; rows beyond
 *                     numBRows are treated as zero rather than read)
 * @param numBColumns  columns of B
 * @param numCRows     rows of C (presumably equal to numARows)
 * @param numCColumns  columns of C (presumably equal to numBColumns)
 */
__kernel void matrixMultiplyShared(__global const float *A,
                                   __global const float *B,
                                   __global float *C,
                                   int numARows,
                                   int numAColumns,
                                   int numBRows,
                                   int numBColumns,
                                   int numCRows,
                                   int numCColumns)
{
    /* Staging tiles in __local memory, shared by the whole work-group. */
    __local float ds_A[TILE_WIDTH][TILE_WIDTH];
    __local float ds_B[TILE_WIDTH][TILE_WIDTH];

    const int tx = (int)get_local_id(0);
    const int ty = (int)get_local_id(1);

    /* Element of C this work-item is responsible for. */
    const int row = (int)get_group_id(1) * TILE_WIDTH + ty;
    const int col = (int)get_group_id(0) * TILE_WIDTH + tx;

    /* Loop invariants: whether (row, col) is inside C, and the tile count. */
    const bool insideC = (row < numCRows) && (col < numCColumns);
    const int numTiles = (numAColumns - 1) / TILE_WIDTH + 1;

    float Cvalue = 0.0f;

    for (int m = 0; m < numTiles; ++m) {
        const int aCol = m * TILE_WIDTH + tx; /* column of A staged by this item */
        const int bRow = m * TILE_WIDTH + ty; /* row of B staged by this item    */

        /*
         * Out-of-range elements are staged as zero so partial edge tiles add
         * nothing to the dot product. Linear indices are formed in size_t so
         * that rows * cols does not overflow int on large matrices; every
         * operand is known to be non-negative inside the guarded branch.
         */
        if (row < numARows && aCol < numAColumns) {
            ds_A[ty][tx] = A[(size_t)row * (size_t)numAColumns + (size_t)aCol];
        } else {
            ds_A[ty][tx] = 0.0f;
        }

        if (bRow < numBRows && col < numBColumns) {
            ds_B[ty][tx] = B[(size_t)bRow * (size_t)numBColumns + (size_t)col];
        } else {
            ds_B[ty][tx] = 0.0f;
        }

        /* Both tiles must be fully staged before any work-item reads them. */
        barrier(CLK_LOCAL_MEM_FENCE);

        /*
         * The bounds test wraps only the arithmetic, never a barrier, so
         * every work-item in the group reaches both barriers.
         */
        if (insideC) {
            for (int k = 0; k < TILE_WIDTH; ++k) {
                Cvalue += ds_A[ty][k] * ds_B[k][tx];
            }
        }

        /* Nobody may overwrite the tiles until all reads of them are done. */
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (insideC) {
        C[(size_t)row * (size_t)numCColumns + (size_t)col] = Cvalue;
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement