Zgragselus

SSE Matrix Inverse

Nov 8th, 2025
176
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 3.45 KB | None | 0 0
  1.  
  2.         /// <summary>
  3.         /// Matrix inversion
  4.         /// </summary>
  5.         /// <param name="m">Matrix to invert</param>
  6.         /// <returns>Inverse of matrix m</returns>
  7.         friend inline mat4 inverse(const mat4& m)
  8.         {
  9.             __m128 f1 = _mm_sub_ps(_mm_mul_ps(_mm_shuffle_ps(m.m3, m.m2, 0xAA),
  10.                 _mm_shuffle_ps(_mm_shuffle_ps(m.m4, m.m3, 0xFF), _mm_shuffle_ps(m.m4, m.m3, 0xFF), 0x80)),
  11.                 _mm_mul_ps(_mm_shuffle_ps(_mm_shuffle_ps(m.m4, m.m3, 0xAA), _mm_shuffle_ps(m.m4, m.m3, 0xAA), 0x80),
  12.                 _mm_shuffle_ps(m.m3, m.m2, 0xFF)));
  13.  
  14.             __m128 f2 = _mm_sub_ps(_mm_mul_ps(_mm_shuffle_ps(m.m3, m.m2, 0x55),
  15.                 _mm_shuffle_ps(_mm_shuffle_ps(m.m4, m.m3, 0xFF), _mm_shuffle_ps(m.m4, m.m3, 0xFF), 0x80)),
  16.                 _mm_mul_ps(_mm_shuffle_ps(_mm_shuffle_ps(m.m4, m.m3, 0x55), _mm_shuffle_ps(m.m4, m.m3, 0x55), 0x80),
  17.                 _mm_shuffle_ps(m.m3, m.m2, 0xFF)));
  18.  
  19.             __m128 f3 = _mm_sub_ps(_mm_mul_ps(_mm_shuffle_ps(m.m3, m.m2, 0x55),
  20.                 _mm_shuffle_ps(_mm_shuffle_ps(m.m4, m.m3, 0xAA), _mm_shuffle_ps(m.m4, m.m3, 0xAA), 0x80)),
  21.                 _mm_mul_ps(_mm_shuffle_ps(_mm_shuffle_ps(m.m4, m.m3, 0x55), _mm_shuffle_ps(m.m4, m.m3, 0x55), 0x80),
  22.                 _mm_shuffle_ps(m.m3, m.m2, 0xAA)));
  23.  
  24.             __m128 f4 = _mm_sub_ps(_mm_mul_ps(_mm_shuffle_ps(m.m3, m.m2, 0x00),
  25.                 _mm_shuffle_ps(_mm_shuffle_ps(m.m4, m.m3, 0xFF), _mm_shuffle_ps(m.m4, m.m3, 0xFF), 0x80)),
  26.                 _mm_mul_ps(_mm_shuffle_ps(_mm_shuffle_ps(m.m4, m.m3, 0x00), _mm_shuffle_ps(m.m4, m.m3, 0x00), 0x80),
  27.                 _mm_shuffle_ps(m.m3, m.m2, 0xFF)));
  28.  
  29.             __m128 f5 = _mm_sub_ps(_mm_mul_ps(_mm_shuffle_ps(m.m3, m.m2, 0x00),
  30.                 _mm_shuffle_ps(_mm_shuffle_ps(m.m4, m.m3, 0xAA), _mm_shuffle_ps(m.m4, m.m3, 0xAA), 0x80)),
  31.                 _mm_mul_ps(_mm_shuffle_ps(_mm_shuffle_ps(m.m4, m.m3, 0x00), _mm_shuffle_ps(m.m4, m.m3, 0x00), 0x80),
  32.                 _mm_shuffle_ps(m.m3, m.m2, 0xAA)));
  33.  
  34.             __m128 f6 = _mm_sub_ps(_mm_mul_ps(_mm_shuffle_ps(m.m3, m.m2, 0x00),
  35.                 _mm_shuffle_ps(_mm_shuffle_ps(m.m4, m.m3, 0x55), _mm_shuffle_ps(m.m4, m.m3, 0x55), 0x80)),
  36.                 _mm_mul_ps(_mm_shuffle_ps(_mm_shuffle_ps(m.m4, m.m3, 0x00), _mm_shuffle_ps(m.m4, m.m3, 0x00), 0x80),
  37.                 _mm_shuffle_ps(m.m3, m.m2, 0x55)));
  38.  
  39.             __m128 v1 = _mm_shuffle_ps(_mm_shuffle_ps(m.m2, m.m1, 0x00), _mm_shuffle_ps(m.m2, m.m1, 0x00), 0xA8);
  40.             __m128 v2 = _mm_shuffle_ps(_mm_shuffle_ps(m.m2, m.m1, 0x55), _mm_shuffle_ps(m.m2, m.m1, 0x55), 0xA8);
  41.             __m128 v3 = _mm_shuffle_ps(_mm_shuffle_ps(m.m2, m.m1, 0xAA), _mm_shuffle_ps(m.m2, m.m1, 0xAA), 0xA8);
  42.             __m128 v4 = _mm_shuffle_ps(_mm_shuffle_ps(m.m2, m.m1, 0xFF), _mm_shuffle_ps(m.m2, m.m1, 0xFF), 0xA8);
  43.             __m128 s1 = _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f);
  44.             __m128 s2 = _mm_set_ps(0.0f, -0.0f, 0.0f, -0.0f);
  45.             __m128 i1 = _mm_xor_ps(s1, _mm_add_ps(_mm_sub_ps(_mm_mul_ps(v2, f1),
  46.                 _mm_mul_ps(v3, f2)),
  47.                 _mm_mul_ps(v4, f3)));
  48.             __m128 i2 = _mm_xor_ps(s2, _mm_add_ps(_mm_sub_ps(_mm_mul_ps(v1, f1),
  49.                 _mm_mul_ps(v3, f4)),
  50.                 _mm_mul_ps(v4, f5)));
  51.             __m128 i3 = _mm_xor_ps(s1, _mm_add_ps(_mm_sub_ps(_mm_mul_ps(v1, f2),
  52.                 _mm_mul_ps(v2, f4)),
  53.                 _mm_mul_ps(v4, f6)));
  54.             __m128 i4 = _mm_xor_ps(s2, _mm_add_ps(_mm_sub_ps(_mm_mul_ps(v1, f3),
  55.                 _mm_mul_ps(v2, f5)),
  56.                 _mm_mul_ps(v3, f6)));
  57.             __m128 d = _mm_mul_ps(m.m1, _mm_movelh_ps(_mm_unpacklo_ps(i1, i2), _mm_unpacklo_ps(i3, i4)));
  58.             d = _mm_add_ps(d, _mm_shuffle_ps(d, d, 0x4E));
  59.             d = _mm_add_ps(d, _mm_shuffle_ps(d, d, 0x11));
  60.             d = _mm_div_ps(_mm_set1_ps(1.0f), d);
  61.             return mat4(float4(_mm_mul_ps(i1, d)),
  62.                 float4(_mm_mul_ps(i2, d)),
  63.                 float4(_mm_mul_ps(i3, d)),
  64.                 float4(_mm_mul_ps(i4, d)));
  65.         }
Advertisement
Add Comment
Please, Sign In to add comment