esteppan89

store operations

Aug 26th, 2025
25
0
29 days
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 0.90 KB | None | 0 0
  1. void linear(int m, int n, int k, const float* A, int lda, const float* B, int ldb, float* scores, int lds, float* bias)
  2. {
  3.     int num_threads = n/32;
  4.     omp_set_num_threads(num_threads);
  5. #pragma omp parallel
  6.     {
  7.         __m256 b0 = _mm256_load_ps(&bias[32*t + 0]);
  8.         __m256 b1 = _mm256_load_ps(&bias[32 * t + 8]);
  9.         __m256 b2 = _mm256_load_ps(&bias[32 * t + 16]);
  10.         __m256 b3 = _mm256_load_ps(&bias[32 * t + 24]);
  11.         for (int i = 0; i < m; i++) {
  12.             int storeOffset = i * lds + t * slice_rows;
  13.             __m256 intSum[4] = { b0, b1, b2, b3 };         
  14.             for (int j = 0; j < k-16; j += 16) {
  15.                 //standard FMA operations for multiplication of matrix
  16.             }
  17.             _mm256_stream_ps(scores + storeOffset, intSum[0]);
  18.             _mm256_stream_ps(scores + storeOffset + 8, intSum[1]);
  19.             _mm256_stream_ps(scores + storeOffset + 16, intSum[2]);
  20.             _mm256_stream_ps(scores + storeOffset + 24, intSum[3]);
  21.         }
  22.     }
  23. }
Advertisement
Add Comment
Please, Sign In to add comment