Advertisement
Guest User

Untitled

a guest
Jun 19th, 2019
84
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.72 KB | None | 0 0
  /*
   * Tiled integer matrix multiply: C = A * B for N x N matrices.
   *
   * Each thread block stages one PARTITION_SIZE x PARTITION_SIZE tile of A and
   * one of B in shared memory per step, and every thread accumulates the dot
   * product for a single element of C.
   *
   * Launch requirements:
   *   - blockDim.x == blockDim.y == PARTITION_SIZE (one thread per tile slot).
   *   - The grid must cover at least N threads in x and in y; surplus threads
   *     are masked off, and N need not be a multiple of PARTITION_SIZE.
   *   - Static shared memory: 2 * PARTITION_SIZE * PARTITION_SIZE ints.
   *
   * Parameters:
   *   A, B  input matrices, accessed through AT(M, N, row, col)
   *   C     output matrix, written through AT(C, N, row, col)
   *   N     matrix dimension
   *
   * NOTE(review): AT and PARTITION_SIZE are defined outside this block; AT is
   * presumably a row/column accessor for an N-wide matrix -- confirm.
   */
  __global__ void global_mmul (int *A, int *B, int *C, int N)
  {
      const int ty = threadIdx.y;
      const int tx = threadIdx.x;

      // The row index runs in reverse: global y == 0 maps to row N-1.
      const int i = N - 1 - (blockIdx.y * blockDim.y + ty);
      const int j = blockIdx.x * blockDim.x + tx;

      // i can never exceed N-1, so only the lower bound needs checking.
      const bool rowInRange = (i >= 0);
      const bool colInRange = (j < N);

      // Ceil-div so that a partial trailing tile is still processed.
      const int numTiles = (N + PARTITION_SIZE - 1) / PARTITION_SIZE;

      __shared__ int Apart[PARTITION_SIZE][PARTITION_SIZE];
      __shared__ int Bpart[PARTITION_SIZE][PARTITION_SIZE];

      // Accumulate in a register instead of read-modify-writing global C
      // inside the innermost loop.
      int acc = 0;

      for (int n = 0; n < numTiles; n++)
      {
          const int aCol = n * PARTITION_SIZE + tx;
          const int bRow = n * PARTITION_SIZE + ty;

          // Out-of-range slots are zero-filled so they contribute nothing to
          // the sum; every thread still takes part in the load and reaches
          // both barriers.
          Apart[ty][tx] = (rowInRange && aCol < N) ? AT(A, N, i, aCol) : 0;
          Bpart[ty][tx] = (colInRange && bRow < N) ? AT(B, N, bRow, j) : 0;

          // Tiles must be fully written before any thread reads them.
          __syncthreads();

          for (int k = 0; k < PARTITION_SIZE; k++)
              acc += Apart[ty][k] * Bpart[k][tx];

          // All reads of this tile must finish before the next iteration
          // overwrites it; without this barrier the kernel races.
          __syncthreads();
      }

      if (rowInRange && colInRange)
          AT(C, N, i, j) = acc;
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement