SHARE
TWEET

Untitled

a guest Jun 19th, 2019 60 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  /*
   * Tiled integer matrix product: accumulates, for every output element
   * (i, j), the sum over k of A(i, k) * B(k, j) and stores it in C(i, j).
   *
   * Parameters:
   *   A, B  input matrices, read through the AT() accessor macro
   *   C     output matrix, written through the AT() accessor macro
   *   N     matrix dimension passed to AT() for all three matrices
   *
   * Launch requirements:
   *   - 2D thread blocks with blockDim.x == blockDim.y == PARTITION_SIZE
   *     (the shared tiles are indexed directly by threadIdx).
   *   - A 2D grid covering at least N x N threads; surplus threads are
   *     masked out, and N does not have to be a multiple of PARTITION_SIZE.
   *   - Static shared memory: two PARTITION_SIZE x PARTITION_SIZE int tiles.
   *
   * NOTE(review): AT and PARTITION_SIZE are defined outside this block; the
   * bounds checks below assume AT(M, N, r, c) addresses row r, column c of
   * an N x N matrix with valid indices in [0, N) -- confirm against the macro.
   */
  __global__ void global_mmul (int *A, int *B, int *C, int N)
  {
      // Tile-local coordinates. Taken from threadIdx instead of (i % PARTITION_SIZE)
      // so they stay inside the shared arrays even for threads whose global
      // row index is negative because the grid overshoots N.
      const int ty = threadIdx.y;
      const int tx = threadIdx.x;

      // Global output coordinates. The row index is mirrored: thread row 0
      // of the grid handles matrix row N-1.
      const int i = N-1 - (blockIdx.y * blockDim.y + ty);
      const int j = blockIdx.x * blockDim.x + tx;

      // i can never exceed N-1 and j can never be negative, so only one
      // side of each range needs checking.
      const bool rowOk = (i >= 0);
      const bool colOk = (j < N);

      // Ceil-div so a trailing partial tile is processed as well.
      const int numTiles = (N + PARTITION_SIZE - 1) / PARTITION_SIZE;

      __shared__ int Apart[PARTITION_SIZE][PARTITION_SIZE];
      __shared__ int Bpart[PARTITION_SIZE][PARTITION_SIZE];

      // Accumulate in a register; C is written exactly once at the end.
      int acc = 0;

      for (int n = 0; n < numTiles; n++)
      {
          const int aCol = n*PARTITION_SIZE + tx;
          const int bRow = n*PARTITION_SIZE + ty;

          // Out-of-range elements are loaded as 0 so they do not affect the
          // sum, while every thread still takes part in the tile load.
          Apart[ty][tx] = (rowOk && aCol < N) ? AT(A, N, i, aCol) : 0;
          Bpart[ty][tx] = (colOk && bRow < N) ? AT(B, N, bRow, j) : 0;

          // Both tiles must be completely written before anyone reads them.
          __syncthreads();

          for (int k = 0; k < PARTITION_SIZE; k++)
              acc += Apart[ty][k] * Bpart[k][tx];

          // Everyone must be done reading before the next iteration
          // overwrites the tiles (this barrier was missing originally).
          __syncthreads();
      }

      // Only threads that map to a real matrix element store a result.
      // The guard sits after the loop so the barriers above are reached by
      // every thread in the block.
      if (rowOk && colOk)
          AT(C, N, i, j) = acc;
  }
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top