/**
 * Matrix multiplication (CUDA kernel) on the device: C = A * B
 * width is the width of both A and B (the matrices share this dimension)
 */
template <int BLOCK_SIZE> __global__ void
matrixMulCUDA_WSP(float *C, float *A, float *B, int width)
{
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;

    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Index of the first sub-matrix of A processed by the block
    int aBegin = width * BLOCK_SIZE * by;

    // Index of the last sub-matrix of A processed by the block
    int aEnd = aBegin + width - 1;

    // Step size used to iterate through the sub-matrices of A
    int aStep = BLOCK_SIZE;

    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;

    // Step size used to iterate through the sub-matrices of B
    int bStep = BLOCK_SIZE * width;

    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;

    // Declaration of the shared memory array As used to
    // store the sub-matrix of A
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

    // Declaration of the shared memory array Bs used to
    // store the sub-matrix of B
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin;
         a <= aEnd;
         a += aStep, b += bStep) {

        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        As[ty][tx] = A[a + width * ty + tx];
        Bs[ty][tx] = B[b + width * ty + tx];

        // Synchronize to make sure the matrices are loaded
        __syncthreads();

        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
#pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k) {
            Csub += As[ty][k] * Bs[k][tx];
        }

        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write the block sub-matrix to device memory;
    // each thread writes one element
    int c = width * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    C[c + width * ty + tx] = Csub;
}
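
For completeness, below is a minimal host-side sketch of how the kernel above could be launched. It is not part of the original paste: it assumes square width x width row-major matrices with width a multiple of BLOCK_SIZE, the buffer names (h_A, d_A, and so on) and sizes are illustrative, and CUDA error checking is omitted for brevity.

// Minimal host-side launch sketch (assumption: square width x width
// matrices, width a multiple of BLOCK_SIZE; names are illustrative).
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

int main()
{
    const int block = 16;    // BLOCK_SIZE template argument
    const int width = 512;   // must be a multiple of block
    const size_t bytes = (size_t)width * width * sizeof(float);

    // Host buffers: A is all 1s, B is all 2s, so every C element is 2 * width
    float *h_A = (float *)malloc(bytes);
    float *h_B = (float *)malloc(bytes);
    float *h_C = (float *)malloc(bytes);
    for (int i = 0; i < width * width; ++i) { h_A[i] = 1.0f; h_B[i] = 2.0f; }

    // Device buffers
    float *d_A, *d_B, *d_C;
    cudaMalloc((void **)&d_A, bytes);
    cudaMalloc((void **)&d_B, bytes);
    cudaMalloc((void **)&d_C, bytes);
    cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice);

    // One thread per output element, one block per BLOCK_SIZE x BLOCK_SIZE tile
    dim3 threads(block, block);
    dim3 grid(width / block, width / block);
    matrixMulCUDA_WSP<block><<<grid, threads>>>(d_C, d_A, d_B, width);
    cudaDeviceSynchronize();

    cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost);
    printf("C[0] = %f (expected %f)\n", h_C[0], 2.0f * width);

    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    free(h_A); free(h_B); free(h_C);
    return 0;
}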