Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/*
 * Tiled integer matrix multiply: C = A * B for N x N matrices.
 *
 * Elements are addressed through the file's AT(matrix, N, row, col) macro.
 * Each thread produces one element of C. The row handled by a thread is
 * mirrored (N-1 - global y index), so block row 0 covers the last rows.
 *
 * Launch requirements:
 *   - blockDim must be (PARTITION_SIZE, PARTITION_SIZE): every thread fills
 *     exactly one slot of each PARTITION_SIZE x PARTITION_SIZE shared tile.
 *   - the grid must cover at least N x N threads; surplus threads are
 *     harmless (they help load tiles but never write to C).
 *   - static shared memory: 2 * PARTITION_SIZE * PARTITION_SIZE ints.
 *
 * N does not have to be a multiple of PARTITION_SIZE; tile slots that fall
 * outside the matrices are filled with zeros.
 */
__global__ void global_mmul (int *A, int *B, int *C, int N)
{
    // Mirrored row index; negative for surplus threads past the matrix edge.
    const int i = N-1 - (blockIdx.y * blockDim.y + threadIdx.y);
    const int j = blockIdx.x * blockDim.x + threadIdx.x;

    // Position inside the tile. C++ '%' keeps the sign of the dividend, so
    // fold a negative remainder back into [0, PARTITION_SIZE).
    int i_part = i % PARTITION_SIZE;
    if (i_part < 0)
        i_part += PARTITION_SIZE;
    const int j_part = j % PARTITION_SIZE;

    const bool rowInRange = (i >= 0) && (i < N);
    const bool colInRange = (j < N);

    // Ceil-div so a trailing partial tile is still processed.
    const int numTiles = (N + PARTITION_SIZE - 1) / PARTITION_SIZE;

    __shared__ int Apart[PARTITION_SIZE][PARTITION_SIZE];
    __shared__ int Bpart[PARTITION_SIZE][PARTITION_SIZE];

    // Accumulate in a register; C is written exactly once at the end.
    int acc = 0;

    for (int n = 0; n < numTiles; n++)
    {
        const int aCol = n*PARTITION_SIZE + j_part;
        const int bRow = n*PARTITION_SIZE + i_part;

        // Out-of-range slots contribute 0 to the dot product.
        Apart[i_part][j_part] = (rowInRange && aCol < N) ? AT(A, N, i, aCol) : 0;
        Bpart[i_part][j_part] = (bRow < N && colInRange) ? AT(B, N, bRow, j) : 0;

        // Both tiles must be fully populated before anyone reads them.
        __syncthreads();

        for (int k = 0; k < PARTITION_SIZE; k++)
            acc += Apart[i_part][k]*Bpart[k][j_part];

        // All reads of this tile must finish before the next iteration
        // overwrites it; without this barrier the tiles are raced on.
        __syncthreads();
    }

    // Barriers above are reached by every thread; only the store is guarded.
    if (rowInRange && colInRange)
        AT(C, N, i, j) = acc;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement