Advertisement
matheus__serpa

Untitled

Jan 23rd, 2021
1,019
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.83 KB | None | 0 0
  1. bpg = 50
  2. tpb = 32
  3. n = bpg * tpb
  4.  
  5. @jit(argtypes=[float32[:,:], float32[:,:], float32[:,:]], target='gpu')
  6. def cu_square_matrix_mul(A, B, C):
  7.     sA = cuda.shared.array(shape=(tpb, tpb), dtype=float32)
  8.     sB = cuda.shared.array(shape=(tpb, tpb), dtype=float32)
  9.  
  10.     tx = cuda.threadIdx.x
  11.     ty = cuda.threadIdx.y
  12.     bx = cuda.blockIdx.x
  13.     by = cuda.blockIdx.y
  14.     bw = cuda.blockDim.x
  15.     bh = cuda.blockDim.y
  16.  
  17.     x = tx + bx * bw
  18.     y = ty + by * bh
  19.  
  20.     acc = 0.
  21.     for i in range(bpg):
  22.         if x < n and y < n:
  23.             sA[ty, tx] = A[y, tx + i * tpb]
  24.             sB[ty, tx] = B[ty + i * tpb, x]
  25.  
  26.         cuda.syncthreads()
  27.  
  28.         if x < n and y < n:
  29.             for j in range(tpb):
  30.                 acc += sA[ty, j] * sB[j, tx]
  31.  
  32.         cuda.syncthreads()
  33.  
  34.     if x < n and y < n:
  35.         C[y, x] = acc
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement