Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/*
 * mult_block -- accumulate a matrix product into C:  C += A * B.
 *
 * A is read as M x K, B as K x N, and C is updated as M x N.  All three
 * arrays are indexed the same way: element (row, col) lives at
 * index row * m + col, so m is the common row stride.
 *
 *   a, b  input matrices (read only)
 *   c     output matrix; the product is ADDED to its current contents
 *   m     row stride shared by a, b and c
 *   M     number of rows of A and C to process
 *   N     number of columns of B and C to process
 *   K     number of columns of A / rows of B to sum over
 *
 * The bulk of the work is done on 4x4 tiles of C whose 16 partial sums are
 * held in scalar locals.  Rows and columns that do not fill a whole tile
 * (M or N not a multiple of 4) are finished by a plain scalar loop, so
 * nothing outside the requested M x N block is ever read or written.
 */
void
mult_block (const double * a, const double * b, double * c, size_t m,
            size_t M, size_t N, size_t K)
{
  /* Largest multiples of the tile size (4) that fit into M and N. */
  const size_t rows_tiled = M - M % 4;
  const size_t cols_tiled = N - N % 4;
  size_t i, j, k;

  /* Full 4x4 tiles. */
  for (i = 0; i < rows_tiled; i += 4)
    {
      for (j = 0; j < cols_tiled; j += 4)
        {
          const double * pa = a + i * m;   /* A(i, 0)  */
          const double * pb = b + j;       /* B(0, j)  */
          double * pc = c + i * m + j;     /* C(i, j)  */
          double s00 = 0, s01 = 0, s02 = 0, s03 = 0;
          double s10 = 0, s11 = 0, s12 = 0, s13 = 0;
          double s20 = 0, s21 = 0, s22 = 0, s23 = 0;
          double s30 = 0, s31 = 0, s32 = 0, s33 = 0;

          for (k = 0; k < K; k++)
            {
              /* One column slice of A (4 rows) and one row slice of B
                 (4 columns) feed all 16 partial sums. */
              const double a0 = pa[0];
              const double a1 = pa[m];
              const double a2 = pa[2 * m];
              const double a3 = pa[3 * m];
              const double b0 = pb[0];
              const double b1 = pb[1];
              const double b2 = pb[2];
              const double b3 = pb[3];

              s00 += a0 * b0;  s01 += a0 * b1;  s02 += a0 * b2;  s03 += a0 * b3;
              s10 += a1 * b0;  s11 += a1 * b1;  s12 += a1 * b2;  s13 += a1 * b3;
              s20 += a2 * b0;  s21 += a2 * b1;  s22 += a2 * b2;  s23 += a2 * b3;
              s30 += a3 * b0;  s31 += a3 * b1;  s32 += a3 * b2;  s33 += a3 * b3;

              pa += 1;   /* next column of A */
              pb += m;   /* next row of B    */
            }

          pc[0] += s00;          pc[1] += s01;
          pc[2] += s02;          pc[3] += s03;
          pc[m] += s10;          pc[m + 1] += s11;
          pc[m + 2] += s12;      pc[m + 3] += s13;
          pc[2 * m] += s20;      pc[2 * m + 1] += s21;
          pc[2 * m + 2] += s22;  pc[2 * m + 3] += s23;
          pc[3 * m] += s30;      pc[3 * m + 1] += s31;
          pc[3 * m + 2] += s32;  pc[3 * m + 3] += s33;
        }
    }

  /* Leftovers: for rows covered by the tiled pass only the trailing columns
     remain; rows past the tiled area still need every column.  The sum is
     built in a local first and then added to C, exactly like the tiles. */
  for (i = 0; i < M; i++)
    {
      for (j = (i < rows_tiled) ? cols_tiled : 0; j < N; j++)
        {
          double s = 0;

          for (k = 0; k < K; k++)
            s += a[i * m + k] * b[k * m + j];
          c[i * m + j] += s;
        }
    }
}
/* Prototype kept ahead of the definition, as in the original source. */
void
mult_block__ (const double * a, const double * b, double * c, size_t m,
              size_t M, size_t N, size_t K);

/*
 * mult_block__ -- accumulate a matrix product into C:  C += A * B.
 *
 * A is read as M x K, B as K x N, and C is updated as M x N.  All three
 * arrays are indexed the same way: element (row, col) lives at
 * index row * m + col, so m is the common row stride.
 *
 *   a, b  input matrices (read only)
 *   c     output matrix; the product is ADDED to its current contents
 *   m     row stride shared by a, b and c
 *   M     number of rows of A and C to process
 *   N     number of columns of B and C to process
 *   K     number of columns of A / rows of B to sum over
 *
 * The bulk of the work is done on tiles of C that are 3 rows by 4 columns,
 * with the 12 partial sums held in scalar locals.  Rows and columns that do
 * not fill a whole tile (M not a multiple of 3, or N not a multiple of 4)
 * are finished by a plain scalar loop, so nothing outside the requested
 * M x N block is ever read or written.
 */
void
mult_block__ (const double * a, const double * b, double * c, size_t m,
              size_t M, size_t N, size_t K)
{
  /* Largest multiples of the tile height (3) and width (4) that fit. */
  const size_t rows_tiled = M - M % 3;
  const size_t cols_tiled = N - N % 4;
  size_t i, j, k;

  /* Full 3x4 tiles. */
  for (i = 0; i < rows_tiled; i += 3)
    {
      for (j = 0; j < cols_tiled; j += 4)
        {
          const double * pa = a + i * m;   /* A(i, 0)  */
          const double * pb = b + j;       /* B(0, j)  */
          double * pc = c + i * m + j;     /* C(i, j)  */
          double s00 = 0, s01 = 0, s02 = 0, s03 = 0;
          double s10 = 0, s11 = 0, s12 = 0, s13 = 0;
          double s20 = 0, s21 = 0, s22 = 0, s23 = 0;

          for (k = 0; k < K; k++)
            {
              /* One column slice of A (3 rows) and one row slice of B
                 (4 columns) feed all 12 partial sums. */
              const double a0 = pa[0];
              const double a1 = pa[m];
              const double a2 = pa[2 * m];
              const double b0 = pb[0];
              const double b1 = pb[1];
              const double b2 = pb[2];
              const double b3 = pb[3];

              s00 += a0 * b0;  s01 += a0 * b1;  s02 += a0 * b2;  s03 += a0 * b3;
              s10 += a1 * b0;  s11 += a1 * b1;  s12 += a1 * b2;  s13 += a1 * b3;
              s20 += a2 * b0;  s21 += a2 * b1;  s22 += a2 * b2;  s23 += a2 * b3;

              pa += 1;   /* next column of A */
              pb += m;   /* next row of B    */
            }

          pc[0] += s00;          pc[1] += s01;
          pc[2] += s02;          pc[3] += s03;
          pc[m] += s10;          pc[m + 1] += s11;
          pc[m + 2] += s12;      pc[m + 3] += s13;
          pc[2 * m] += s20;      pc[2 * m + 1] += s21;
          pc[2 * m + 2] += s22;  pc[2 * m + 3] += s23;
        }
    }

  /* Leftovers: for rows covered by the tiled pass only the trailing columns
     remain; rows past the tiled area still need every column.  The sum is
     built in a local first and then added to C, exactly like the tiles. */
  for (i = 0; i < M; i++)
    {
      for (j = (i < rows_tiled) ? cols_tiled : 0; j < N; j++)
        {
          double s = 0;

          for (k = 0; k < K; k++)
            s += a[i * m + k] * b[k * m + j];
          c[i * m + j] += s;
        }
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement