Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Problem-size constants read by cu_square_matrix_mul.
# NOTE(review): the names look like "blocks per grid" / "threads per block";
# the launch configuration is not visible here, so confirm against the caller.
bpg = 50       # number of tiles the kernel steps through along the shared dimension
tpb = 32       # edge length of each square shared-memory tile
n = bpg * tpb  # matrix edge length the kernel bounds-checks against
@jit(argtypes=[float32[:,:], float32[:,:], float32[:,:]], target='gpu')
def cu_square_matrix_mul(A, B, C):
    """Tiled square matrix multiply: write the product of A and B into C.

    Each thread owns one output element ``C[y, x]``. The shared dimension
    is walked in ``bpg`` steps of ``tpb`` elements; on every step the
    block stages one ``tpb x tpb`` tile of ``A`` and one of ``B`` in
    shared memory, and each thread accumulates its partial dot product
    from those tiles.

    Args:
        A: Left operand, declared as a 2-D float32 array by the signature.
        B: Right operand, declared as a 2-D float32 array by the signature.
        C: Output, declared as a 2-D float32 array by the signature;
            element ``[y, x]`` is overwritten for every in-range thread.

    Relies on the module-level ``bpg``, ``tpb`` and ``n``. Indices up to
    ``n - 1`` are read on both axes of ``A`` and ``B``, so the arrays must
    be at least ``n x n``.

    NOTE(review): the tiles are indexed directly by ``threadIdx``, so this
    presumably expects a launch with ``tpb x tpb`` threads per block;
    the launch site is not visible here, so confirm against the caller.
    """
    # One staging tile per operand, allocated in shared memory.
    sA = cuda.shared.array(shape=(tpb, tpb), dtype=float32)
    sB = cuda.shared.array(shape=(tpb, tpb), dtype=float32)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    bw = cuda.blockDim.x
    bh = cuda.blockDim.y

    # Global column (x) and row (y) of the output element this thread owns.
    x = tx + bx * bw
    y = ty + by * bh

    # Constant for the lifetime of the thread, so evaluate it once instead
    # of on every tile iteration.
    in_bounds = x < n and y < n

    acc = 0.
    for i in range(bpg):
        if in_bounds:
            # Stage the i-th tile: this thread loads one element of A's
            # row band and one element of B's column band.
            sA[ty, tx] = A[y, tx + i * tpb]
            sB[ty, tx] = B[ty + i * tpb, x]

        # Barriers stay outside the bounds guard so every thread in the
        # block reaches them. Wait until the whole tile has been written.
        cuda.syncthreads()

        if in_bounds:
            for j in range(tpb):
                acc += sA[ty, j] * sB[j, tx]

        # Wait until every thread is done reading before the next
        # iteration overwrites the tiles.
        cuda.syncthreads()

    if in_bounds:
        C[y, x] = acc
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement