kmixer_intrinsics.txt as of 8-18-23

----BY EXTENSION----



AVX: {
arithmetic: {
__m256 _mm256_add_ps (__m256 a, __m256 b): add packed f32
__m256 _mm256_div_ps (__m256 a, __m256 b): divide packed f32
__m256 _mm256_mul_ps (__m256 a, __m256 b): multiply packed f32
__m256 _mm256_sub_ps (__m256 a, __m256 b): subtract packed f32

__m256 _mm256_hadd_ps (__m256 a, __m256 b): horiz. add packed f32 pairs
}
convert: {
__m256i _mm256_cvtps_epi32 (__m256 a): packed f32 to i32 (w/ rounding)
__m256i _mm256_cvttps_epi32 (__m256 a): packed f32 to i32 (w/ truncation)
__m256 _mm256_cvtepi32_ps (__m256i a): packed i32 to f32 (w/ rounding)
}
load: {
__m256 _mm256_load_ps (float const* mem_addr): f32 (mem_addr must be 32B-aligned)
__m256 _mm256_loadu_ps (float const* mem_addr): f32 (no alignment required)
__m256i _mm256_load_si256 (__m256i const* mem_addr): int (mem_addr must be 32B-aligned)
__m256i _mm256_loadu_si256 (__m256i const* mem_addr): int (no alignment required)
}
store: {
void _mm256_store_ps (float* mem_addr, __m256 a): f32 (mem_addr must be 32B-aligned)
void _mm256_storeu_ps (float* mem_addr, __m256 a): f32 (no alignment required)
void _mm256_store_si256 (__m256i* mem_addr, __m256i a): int (mem_addr must be 32B-aligned)
void _mm256_storeu_si256 (__m256i* mem_addr, __m256i a): int (no alignment required)
}
set: {
__m256i _mm256_set1_epi8 (char a): set all elements of dst to i8 value
__m256i _mm256_set1_epi16 (short a): set all elements of dst to i16 value
__m256i _mm256_set1_epi32 (int a): set all elements of dst to i32 value
__m256 _mm256_set1_ps (float a): set all elements of dst to f32 value
}
special math: {
__m256 _mm256_max_ps (__m256 a, __m256 b): max() on packed f32
__m256 _mm256_min_ps (__m256 a, __m256 b): min() on packed f32
__m256 _mm256_round_ps (__m256 a, int rounding): round packed f32
}
swizzle: {
__m256 _mm256_unpackhi_ps (__m256 a, __m256 b): unpack & interleave f32 from high 128b of a&b

__m256 _mm256_unpacklo_ps (__m256 a, __m256 b): unpack & interleave f32 from low 128b of a&b
}
}
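
example sketch (not part of the original list): the AVX ops above combine
into a basic mix loop that scales a source stream by a gain and accumulates
it into a mix buffer. Function and variable names are made up; assumes n is
a multiple of 8 and both pointers are 32B-aligned.

#include <immintrin.h>

void mixF32_avx(float* dst, const float* src, float gain, int n){
    __m256 vgain = _mm256_set1_ps(gain);        /* broadcast the gain */
    for(int i = 0; i < n; i += 8){
        __m256 s = _mm256_load_ps(src + i);     /* 8 source samples */
        __m256 d = _mm256_load_ps(dst + i);     /* 8 accumulated samples */
        d = _mm256_add_ps(d, _mm256_mul_ps(s, vgain));
        _mm256_store_ps(dst + i, d);
    }
}
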
AVX2: {
arithmetic: {
__m256i _mm256_add_epi8 (__m256i a, __m256i b): add packed i8
__m256i _mm256_add_epi16 (__m256i a, __m256i b): add packed i16
__m256i _mm256_add_epi32 (__m256i a, __m256i b): add packed i32

__m256i _mm256_adds_epi8 (__m256i a, __m256i b): add packed i8 w/ saturation
__m256i _mm256_adds_epi16 (__m256i a, __m256i b): add packed i16 w/ saturation
__m256i _mm256_adds_epu8 (__m256i a, __m256i b): add packed u8 w/ saturation
__m256i _mm256_adds_epu16 (__m256i a, __m256i b): add packed u16 w/ saturation

__m256i _mm256_sub_epi8 (__m256i a, __m256i b): subtract packed i8
__m256i _mm256_sub_epi16 (__m256i a, __m256i b): subtract packed i16
__m256i _mm256_sub_epi32 (__m256i a, __m256i b): subtract packed i32

__m256i _mm256_subs_epi8 (__m256i a, __m256i b): subtract packed i8 w/ saturation
__m256i _mm256_subs_epi16 (__m256i a, __m256i b): subtract packed i16 w/ saturation
__m256i _mm256_subs_epu8 (__m256i a, __m256i b): subtract packed u8 w/ saturation
__m256i _mm256_subs_epu16 (__m256i a, __m256i b): subtract packed u16 w/ saturation

__m256i _mm256_hadd_epi16 (__m256i a, __m256i b): horiz. add packed i16 pairs
__m256i _mm256_hadds_epi16 (__m256i a, __m256i b): horiz. add packed i16 pairs w/ saturation
__m256i _mm256_hadd_epi32 (__m256i a, __m256i b): horiz. add packed i32 pairs
}
convert: {
__m256i _mm256_cvtepi8_epi16 (__m128i a): sign-extend packed i8 to i16
__m256i _mm256_cvtepi8_epi32 (__m128i a): sign-extend packed i8 to i32
__m256i _mm256_cvtepi16_epi32 (__m128i a): sign-extend packed i16 to i32

__m256i _mm256_cvtepu8_epi16 (__m128i a): zero-extend packed u8 to i16
__m256i _mm256_cvtepu8_epi32 (__m128i a): zero-extend packed u8 to i32
__m256i _mm256_cvtepu16_epi32 (__m128i a): zero-extend packed u16 to i32
}
prob/stat: {
__m256i _mm256_avg_epu8 (__m256i a, __m256i b): average packed u8
__m256i _mm256_avg_epu16 (__m256i a, __m256i b): average packed u16
}
special math: {
__m256i _mm256_max_epi8 (__m256i a, __m256i b): max() on packed i8
__m256i _mm256_max_epi16 (__m256i a, __m256i b): max() on packed i16
__m256i _mm256_max_epi32 (__m256i a, __m256i b): max() on packed i32
__m256i _mm256_max_epu8 (__m256i a, __m256i b): max() on packed u8
__m256i _mm256_max_epu16 (__m256i a, __m256i b): max() on packed u16
__m256i _mm256_max_epu32 (__m256i a, __m256i b): max() on packed u32

__m256i _mm256_min_epi8 (__m256i a, __m256i b): min() on packed i8
__m256i _mm256_min_epi16 (__m256i a, __m256i b): min() on packed i16
__m256i _mm256_min_epi32 (__m256i a, __m256i b): min() on packed i32
__m256i _mm256_min_epu8 (__m256i a, __m256i b): min() on packed u8
__m256i _mm256_min_epu16 (__m256i a, __m256i b): min() on packed u16
__m256i _mm256_min_epu32 (__m256i a, __m256i b): min() on packed u32
}
misc: {
__m256i _mm256_packs_epi32 (__m256i a, __m256i b): packed i32 to i16 (w/ signed saturation)
__m256i _mm256_packs_epi16 (__m256i a, __m256i b): packed i16 to i8 (w/ signed saturation)
__m256i _mm256_packus_epi32 (__m256i a, __m256i b): packed i32 to i16 (w/ unsigned saturation)
__m256i _mm256_packus_epi16 (__m256i a, __m256i b): packed i16 to i8 (w/ unsigned saturation)
}
swizzle: {
__m256i _mm256_unpackhi_epi8 (__m256i a, __m256i b): unpack and interleave i8 from high 128b of a&b
__m256i _mm256_unpackhi_epi16 (__m256i a, __m256i b): unpack and interleave i16 from high 128b of a&b
__m256i _mm256_unpackhi_epi32 (__m256i a, __m256i b): unpack and interleave i32 from high 128b of a&b

__m256i _mm256_unpacklo_epi8 (__m256i a, __m256i b): unpack and interleave i8 from low 128b of a&b
__m256i _mm256_unpacklo_epi16 (__m256i a, __m256i b): unpack and interleave i16 from low 128b of a&b
__m256i _mm256_unpacklo_epi32 (__m256i a, __m256i b): unpack and interleave i32 from low 128b of a&b
}
}
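
example sketch (made-up names): mixing two i16 streams with
_mm256_adds_epi16, which clips at the i16 limits instead of wrapping;
loadu/storeu are used since no alignment is assumed. Assumes n % 16 == 0.

#include <immintrin.h>

void mixI16_avx2(short* dst, const short* src, int n){
    for(int i = 0; i < n; i += 16){
        __m256i a = _mm256_loadu_si256((const __m256i*)(dst + i));
        __m256i b = _mm256_loadu_si256((const __m256i*)(src + i));
        _mm256_storeu_si256((__m256i*)(dst + i), _mm256_adds_epi16(a, b));
    }
}
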

SSE: {
arithmetic: {
__m128 _mm_add_ps (__m128 a, __m128 b): add packed f32
__m128 _mm_div_ps (__m128 a, __m128 b): divide packed f32
__m128 _mm_mul_ps (__m128 a, __m128 b): multiply packed f32
__m128 _mm_sub_ps (__m128 a, __m128 b): subtract packed f32
}
convert: {
__m128 _mm_cvtpi16_ps (__m64 a): packed i16 to f32
__m128 _mm_cvtpu16_ps (__m64 a): packed u16 to f32
__m64 _mm_cvttps_pi32 (__m128 a): packed f32 to i32 (w/ truncation)
__m64 _mm_cvtt_ps2pi (__m128 a): alias of _mm_cvttps_pi32
}
load: {
__m128 _mm_load_ps1 (float const* mem_addr): load one f32 and broadcast it to all elements (an f32 set1, but from memory)
__m128 _mm_load1_ps (float const* mem_addr): alias of _mm_load_ps1
__m128 _mm_load_ps (float const* mem_addr): f32 (mem_addr must be 16B-aligned)
__m128 _mm_loadu_ps (float const* mem_addr): f32 (no alignment required)
}
store: {
void _mm_store_ps (float* mem_addr, __m128 a): f32 (mem_addr must be 16B-aligned)
void _mm_storeu_ps (float* mem_addr, __m128 a): f32 (no alignment required)
}
move: {
__m128 _mm_movehl_ps (__m128 a, __m128 b): f32; b's hi 2 to dst's low 2, a's hi 2 to dst's high 2
}
prob/stat: {
__m64 _mm_avg_pu8 (__m64 a, __m64 b): average packed u8
__m64 _mm_avg_pu16 (__m64 a, __m64 b): average packed u16
__m64 _m_pavgb (__m64 a, __m64 b): average packed u8 (alias of _mm_avg_pu8)
__m64 _m_pavgw (__m64 a, __m64 b): average packed u16 (alias of _mm_avg_pu16)
}
set: {
__m128 _mm_set1_ps (float a): set all elements of dst to f32
}
special math: {
__m64 _mm_max_pu8 (__m64 a, __m64 b): max() packed u8
__m64 _mm_max_pi16 (__m64 a, __m64 b): max() packed i16
__m128 _mm_max_ps (__m128 a, __m128 b): max() packed f32

__m64 _mm_min_pu8 (__m64 a, __m64 b): min() packed u8
__m64 _mm_min_pi16 (__m64 a, __m64 b): min() packed i16
__m128 _mm_min_ps (__m128 a, __m128 b): min() packed f32

__m64 _m_pmaxub (__m64 a, __m64 b): max() packed u8 (alias of _mm_max_pu8)
__m64 _m_pmaxsw (__m64 a, __m64 b): max() packed i16 (alias of _mm_max_pi16)

__m64 _m_pminub (__m64 a, __m64 b): min() packed u8 (alias of _mm_min_pu8)
__m64 _m_pminsw (__m64 a, __m64 b): min() packed i16 (alias of _mm_min_pi16)
}
swizzle: {
__m128 _mm_unpackhi_ps (__m128 a, __m128 b): unpack & interleave f32 from high 64b of a&b

__m128 _mm_unpacklo_ps (__m128 a, __m128 b): unpack & interleave f32 from low 64b of a&b

__m128 _mm_shuffle_ps (__m128 a, __m128 b, unsigned int imm8): shuffle f32 using control
}
}
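
example sketch: a horizontal sum of a __m128 built only from the SSE ops
above plus _mm_cvtss_f32 (SSE, not listed here) to extract lane 0; useful
for summing e.g. filter taps without SSE3's hadd.

#include <xmmintrin.h>

float hsumF32_sse(__m128 v){
    __m128 hi  = _mm_movehl_ps(v, v);       /* lanes 2,3 -> lanes 0,1 */
    __m128 sum = _mm_add_ps(v, hi);         /* lane0 = v0+v2, lane1 = v1+v3 */
    hi  = _mm_shuffle_ps(sum, sum, 0x55);   /* broadcast lane 1 */
    sum = _mm_add_ps(sum, hi);              /* lane0 = (v0+v2)+(v1+v3) */
    return _mm_cvtss_f32(sum);
}
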
SSE2: {
arithmetic: {
__m128i _mm_add_epi8 (__m128i a, __m128i b): add packed i8
__m128i _mm_add_epi16 (__m128i a, __m128i b): add packed i16
__m128i _mm_add_epi32 (__m128i a, __m128i b): add packed i32
__m128i _mm_adds_epi8 (__m128i a, __m128i b): add packed i8 (w/ saturation)
__m128i _mm_adds_epi16 (__m128i a, __m128i b): add packed i16 (w/ saturation)
__m128i _mm_adds_epu8 (__m128i a, __m128i b): add packed u8 (w/ saturation)
__m128i _mm_adds_epu16 (__m128i a, __m128i b): add packed u16 (w/ saturation)

__m128i _mm_sub_epi8 (__m128i a, __m128i b): subtract packed i8
__m128i _mm_sub_epi16 (__m128i a, __m128i b): subtract packed i16
__m128i _mm_sub_epi32 (__m128i a, __m128i b): subtract packed i32
__m128i _mm_subs_epi8 (__m128i a, __m128i b): subtract packed i8 (w/ saturation)
__m128i _mm_subs_epi16 (__m128i a, __m128i b): subtract packed i16 (w/ saturation)
__m128i _mm_subs_epu8 (__m128i a, __m128i b): subtract packed u8 (w/ saturation)
__m128i _mm_subs_epu16 (__m128i a, __m128i b): subtract packed u16 (w/ saturation)
}
convert: {
__m128 _mm_cvtepi32_ps (__m128i a): packed i32 to f32
__m128i _mm_cvtps_epi32 (__m128 a): packed f32 to i32 (w/ rounding)
__m128i _mm_cvttps_epi32 (__m128 a): packed f32 to i32 (w/ truncation)
}
load: {
__m128i _mm_load_si128 (__m128i const* mem_addr): 128-bit int (mem_addr must be 16B-aligned)
__m128i _mm_loadu_si128 (__m128i const* mem_addr): 128-bit int (no alignment required)
}
store: {
void _mm_store_si128 (__m128i* mem_addr, __m128i a): 128-bit int (mem_addr must be 16B-aligned)
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a): 128-bit int (no alignment required)
}
prob/stat: {
__m128i _mm_avg_epu8 (__m128i a, __m128i b): average packed u8
__m128i _mm_avg_epu16 (__m128i a, __m128i b): average packed u16
}
set: {
__m128i _mm_set1_epi8 (char a): set all elements of dst to i8
__m128i _mm_set1_epi16 (short a): set all elements of dst to i16
__m128i _mm_set1_epi32 (int a): set all elements of dst to i32
}
special math: {
__m128i _mm_max_epu8 (__m128i a, __m128i b): max() packed u8
__m128i _mm_max_epi16 (__m128i a, __m128i b): max() packed i16

__m128i _mm_min_epu8 (__m128i a, __m128i b): min() packed u8
__m128i _mm_min_epi16 (__m128i a, __m128i b): min() packed i16
}
swizzle: {
__m128i _mm_unpackhi_epi8 (__m128i a, __m128i b): unpack & interleave i8 from high 64b of a&b
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b): unpack & interleave i16 from high 64b of a&b
__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b): unpack & interleave i32 from high 64b of a&b

__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b): unpack & interleave i8 from low 64b of a&b
__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b): unpack & interleave i16 from low 64b of a&b
__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b): unpack & interleave i32 from low 64b of a&b
}
}
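
example sketch (made-up names): quantizing f32 samples back to i16 using
the SSE2 converts above plus _mm_packs_epi32 (also SSE2, not listed here),
which narrows i32 to i16 w/ signed saturation. Assumes samples are already
scaled to the i16 range and n % 8 == 0.

#include <emmintrin.h>

void f32toI16_sse2(short* dst, const float* src, int n){
    for(int i = 0; i < n; i += 8){
        __m128i lo = _mm_cvtps_epi32(_mm_loadu_ps(src + i));      /* rounds */
        __m128i hi = _mm_cvtps_epi32(_mm_loadu_ps(src + i + 4));
        _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(lo, hi));
    }
}
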
SSE3: {
arithmetic: {
__m128 _mm_hadd_ps (__m128 a, __m128 b): horiz. add packed f32 pairs
}
}
SSE41: {
convert: {
__m128i _mm_cvtepi8_epi16 (__m128i a): sign-extend packed i8 to i16
__m128i _mm_cvtepi8_epi32 (__m128i a): sign-extend packed i8 to i32
__m128i _mm_cvtepi16_epi32 (__m128i a): sign-extend packed i16 to i32

__m128i _mm_cvtepu8_epi16 (__m128i a): zero-extend packed u8 to i16
__m128i _mm_cvtepu8_epi32 (__m128i a): zero-extend packed u8 to i32
__m128i _mm_cvtepu16_epi32 (__m128i a): zero-extend packed u16 to i32
}
special math: {
__m128i _mm_max_epi8 (__m128i a, __m128i b): max() packed i8
__m128i _mm_max_epi32 (__m128i a, __m128i b): max() packed i32
__m128i _mm_max_epu16 (__m128i a, __m128i b): max() packed u16
__m128i _mm_max_epu32 (__m128i a, __m128i b): max() packed u32

__m128i _mm_min_epi8 (__m128i a, __m128i b): min() packed i8
__m128i _mm_min_epi32 (__m128i a, __m128i b): min() packed i32
__m128i _mm_min_epu16 (__m128i a, __m128i b): min() packed u16
__m128i _mm_min_epu32 (__m128i a, __m128i b): min() packed u32

__m128 _mm_round_ps (__m128 a, int rounding): round packed f32
}
}
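
example sketch: the SSE41 min/max pair makes a branchless clamp, e.g. to
bound i32 sample values before narrowing (names are made up).

#include <smmintrin.h>

__m128i clampI32_sse41(__m128i v, int lo, int hi){
    v = _mm_max_epi32(v, _mm_set1_epi32(lo));    /* raise to lower bound */
    return _mm_min_epi32(v, _mm_set1_epi32(hi)); /* cap at upper bound */
}
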



----BY OPERATION----



add: {
SSE,  __m128 ->__m128 : f32 (_mm_add_ps)
SSE2, __m128i->__m128i: i8  (_mm_add_epi8)
SSE2, __m128i->__m128i: i16 (_mm_add_epi16)
SSE2, __m128i->__m128i: i32 (_mm_add_epi32)
AVX,  __m256 ->__m256 : f32 (_mm256_add_ps)
AVX2, __m256i->__m256i: i8  (_mm256_add_epi8)
AVX2, __m256i->__m256i: i16 (_mm256_add_epi16)
AVX2, __m256i->__m256i: i32 (_mm256_add_epi32)
}

divide: {
SSE,  __m128 ->__m128 : f32 (_mm_div_ps)
AVX,  __m256 ->__m256 : f32 (_mm256_div_ps)
}

multiply: {
SSE,  __m128 ->__m128 : f32 (_mm_mul_ps)
AVX,  __m256 ->__m256 : f32 (_mm256_mul_ps)
}

subtract: {
SSE,  __m128 ->__m128 : f32 (_mm_sub_ps)
SSE2, __m128i->__m128i: i8  (_mm_sub_epi8)
SSE2, __m128i->__m128i: i16 (_mm_sub_epi16)
SSE2, __m128i->__m128i: i32 (_mm_sub_epi32)
AVX,  __m256 ->__m256 : f32 (_mm256_sub_ps)
AVX2, __m256i->__m256i: i8  (_mm256_sub_epi8)
AVX2, __m256i->__m256i: i16 (_mm256_sub_epi16)
AVX2, __m256i->__m256i: i32 (_mm256_sub_epi32)
}


add w/ saturation: {
SSE2, __m128i->__m128i: i8  (_mm_adds_epi8)
SSE2, __m128i->__m128i: i16 (_mm_adds_epi16)
SSE2, __m128i->__m128i: u8  (_mm_adds_epu8)
SSE2, __m128i->__m128i: u16 (_mm_adds_epu16)
AVX2, __m256i->__m256i: i8  (_mm256_adds_epi8)
AVX2, __m256i->__m256i: i16 (_mm256_adds_epi16)
AVX2, __m256i->__m256i: u8  (_mm256_adds_epu8)
AVX2, __m256i->__m256i: u16 (_mm256_adds_epu16)
}
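
example sketch of why the saturating forms matter for mixing: a plain
_mm_add_epi16 wraps around on overflow (harsh artifacts), while
_mm_adds_epi16 clips to 32767 (ordinary clipping).

#include <emmintrin.h>
#include <stdio.h>

int main(void){
    __m128i a = _mm_set1_epi16(30000), b = _mm_set1_epi16(10000);
    short wrap[8], sat[8];
    _mm_storeu_si128((__m128i*)wrap, _mm_add_epi16(a, b));
    _mm_storeu_si128((__m128i*)sat,  _mm_adds_epi16(a, b));
    printf("wrap: %d  sat: %d\n", wrap[0], sat[0]);  /* -25536 vs 32767 */
    return 0;
}
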

subtract w/ saturation: {
SSE2, __m128i->__m128i: i8  (_mm_subs_epi8)
SSE2, __m128i->__m128i: i16 (_mm_subs_epi16)
SSE2, __m128i->__m128i: u8  (_mm_subs_epu8)
SSE2, __m128i->__m128i: u16 (_mm_subs_epu16)
AVX2, __m256i->__m256i: i8  (_mm256_subs_epi8)
AVX2, __m256i->__m256i: i16 (_mm256_subs_epi16)
AVX2, __m256i->__m256i: u8  (_mm256_subs_epu8)
AVX2, __m256i->__m256i: u16 (_mm256_subs_epu16)
}

horiz. add: {
SSE3, __m128 ->__m128 : f32 (_mm_hadd_ps)
AVX,  __m256 ->__m256 : f32 (_mm256_hadd_ps)
AVX2, __m256i->__m256i: i16 (_mm256_hadd_epi16)
AVX2, __m256i->__m256i: i32 (_mm256_hadd_epi32)
}

horiz. add w/ saturation: {
AVX2, __m256i->__m256i: i16 (_mm256_hadds_epi16)
}
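
example sketch: hadd's output ordering is easy to misread:
_mm_hadd_ps(a, b) yields [a0+a1, a2+a3, b0+b1, b2+b3], and the 256-bit
forms apply that same pattern per 128-bit lane.

#include <pmmintrin.h>
#include <stdio.h>

int main(void){
    __m128 a = _mm_set_ps(4.f, 3.f, 2.f, 1.f);  /* lanes 0..3 = 1,2,3,4 */
    __m128 b = _mm_set_ps(8.f, 7.f, 6.f, 5.f);  /* lanes 0..3 = 5,6,7,8 */
    float r[4];
    _mm_storeu_ps(r, _mm_hadd_ps(a, b));
    printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]);  /* 3 7 11 15 */
    return 0;
}
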


convert: {
rounding or n/a: {
SSE,  __m64  ->__m128 : i16 to f32 (_mm_cvtpi16_ps)
SSE,  __m64  ->__m128 : u16 to f32 (_mm_cvtpu16_ps)
SSE2, __m128i->__m128 : i32 to f32 (_mm_cvtepi32_ps)
SSE2, __m128 ->__m128i: f32 to i32 (_mm_cvtps_epi32)
AVX,  __m256 ->__m256i: f32 to i32 (_mm256_cvtps_epi32)
AVX,  __m256i->__m256 : i32 to f32 (_mm256_cvtepi32_ps)
}
truncation: {
SSE,  __m128->__m64  : f32 to i32 (_mm_cvttps_pi32)
SSE2, __m128->__m128i: f32 to i32 (_mm_cvttps_epi32)
AVX,  __m256->__m256i: f32 to i32 (_mm256_cvttps_epi32)
}
signed saturation: {
(none here; see the pack groups below for narrowing w/ saturation)
}
}
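
example sketch: the rounding converts follow the current MXCSR rounding
mode (round-to-nearest-even by default), while the cvtt forms always
truncate toward zero; a real difference when quantizing samples.

#include <emmintrin.h>
#include <stdio.h>

int main(void){
    __m128 v = _mm_set_ps(-1.5f, -0.7f, 0.7f, 1.5f); /* lanes 0..3 = 1.5, 0.7, -0.7, -1.5 */
    int r[4], t[4];
    _mm_storeu_si128((__m128i*)r, _mm_cvtps_epi32(v));  /* 2, 1, -1, -2 */
    _mm_storeu_si128((__m128i*)t, _mm_cvttps_epi32(v)); /* 1, 0,  0, -1 */
    printf("round: %d %d %d %d\n", r[0], r[1], r[2], r[3]);
    printf("trunc: %d %d %d %d\n", t[0], t[1], t[2], t[3]);
    return 0;
}
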


load (alignment is 16B for m128, 32B for m256): {
SSE,  float*  ->__m128 : f32 (_mm_load_ps)
SSE2, __m128i*->__m128i: int (_mm_load_si128)
AVX,  float*  ->__m256 : f32 (_mm256_load_ps)
AVX,  __m256i*->__m256i: int (_mm256_load_si256)
}

load unaligned: {
SSE,  float*  ->__m128 : f32 (_mm_load1_ps) (acts like set1)
SSE,  float*  ->__m128 : f32 (_mm_loadu_ps)
SSE2, __m128i*->__m128i: int (_mm_loadu_si128)
AVX,  float*  ->__m256 : f32 (_mm256_loadu_ps)
AVX,  __m256i*->__m256i: int (_mm256_loadu_si256)
}


store (alignment is 16B for m128, 32B for m256): {
SSE,  __m128 ->float*  : f32 (_mm_store_ps)
SSE2, __m128i->__m128i*: int (_mm_store_si128)
AVX,  __m256 ->float*  : f32 (_mm256_store_ps)
AVX,  __m256i->__m256i*: int (_mm256_store_si256)
}

store unaligned: {
SSE,  __m128 ->float*  : f32 (_mm_storeu_ps)
SSE2, __m128i->__m128i*: int (_mm_storeu_si128)
AVX,  __m256 ->float*  : f32 (_mm256_storeu_ps)
AVX,  __m256i->__m256i*: int (_mm256_storeu_si256)
}
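
note/sketch: the aligned load/store forms fault on a misaligned pointer,
so buffers fed to them need matching alignment, e.g. from _mm_malloc
(paired with _mm_free) or C11 aligned_alloc.

#include <immintrin.h>
#include <stddef.h>

float* allocSamples32(size_t count){
    /* 32B alignment satisfies both the m128 (16B) and m256 (32B) forms */
    return (float*)_mm_malloc(count * sizeof(float), 32);
}
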


movehl: {
SSE, __m128->__m128: f32 (_mm_movehl_ps)
}


shuffle: {
SSE, __m128->__m128: f32 using control (_mm_shuffle_ps)
}


set1: {
SSE,  float->__m128 : f32 (_mm_set1_ps)
SSE2, char ->__m128i: i8  (_mm_set1_epi8)
SSE2, short->__m128i: i16 (_mm_set1_epi16)
SSE2, int  ->__m128i: i32 (_mm_set1_epi32)
AVX,  char ->__m256i: i8  (_mm256_set1_epi8)
AVX,  short->__m256i: i16 (_mm256_set1_epi16)
AVX,  int  ->__m256i: i32 (_mm256_set1_epi32)
AVX,  float->__m256 : f32 (_mm256_set1_ps)
}


unpack high: {
SSE,  __m128 ->__m128 : f32 (_mm_unpackhi_ps)
SSE2, __m128i->__m128i: i8  (_mm_unpackhi_epi8)
SSE2, __m128i->__m128i: i16 (_mm_unpackhi_epi16)
SSE2, __m128i->__m128i: i32 (_mm_unpackhi_epi32)
AVX,  __m256 ->__m256 : f32 (_mm256_unpackhi_ps)
AVX2, __m256i->__m256i: i8  (_mm256_unpackhi_epi8)
AVX2, __m256i->__m256i: i16 (_mm256_unpackhi_epi16)
AVX2, __m256i->__m256i: i32 (_mm256_unpackhi_epi32)
}

unpack low: {
SSE,  __m128 ->__m128 : f32 (_mm_unpacklo_ps)
SSE2, __m128i->__m128i: i8  (_mm_unpacklo_epi8)
SSE2, __m128i->__m128i: i16 (_mm_unpacklo_epi16)
SSE2, __m128i->__m128i: i32 (_mm_unpacklo_epi32)
AVX,  __m256 ->__m256 : f32 (_mm256_unpacklo_ps)
AVX2, __m256i->__m256i: i8  (_mm256_unpacklo_epi8)
AVX2, __m256i->__m256i: i16 (_mm256_unpacklo_epi16)
AVX2, __m256i->__m256i: i32 (_mm256_unpacklo_epi32)
}
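
example sketch: unpacking a register against itself duplicates every
element, which turns 4 mono f32 samples into interleaved stereo pairs
(L R L R ...); relevant to the mono/stereo reminder at the end of this
file. Names are made up.

#include <xmmintrin.h>

void monoToStereo4(float* stereo /*8 floats out*/, const float* mono /*4 in*/){
    __m128 m = _mm_loadu_ps(mono);
    _mm_storeu_ps(stereo,     _mm_unpacklo_ps(m, m));  /* s0 s0 s1 s1 */
    _mm_storeu_ps(stereo + 4, _mm_unpackhi_ps(m, m));  /* s2 s2 s3 s3 */
}
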

pack w/ signed saturation: {
AVX2, __m256i->__m256i: i32 to i16 (_mm256_packs_epi32)
AVX2, __m256i->__m256i: i16 to i8  (_mm256_packs_epi16)
}

pack w/ unsigned saturation: {
AVX2, __m256i->__m256i: i32 to i16 (_mm256_packus_epi32)
AVX2, __m256i->__m256i: i16 to i8  (_mm256_packus_epi16)
}
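
caveat/sketch: the 256-bit packs operate per 128-bit lane, so
_mm256_packs_epi32(a, b) comes out ordered a-lo, b-lo, a-hi, b-hi;
_mm256_permute4x64_epi64 (AVX2, not listed above) restores linear order.

#include <immintrin.h>

__m256i packI32toI16_linear(__m256i a, __m256i b){
    __m256i p = _mm256_packs_epi32(a, b);                        /* per-lane pack */
    return _mm256_permute4x64_epi64(p, _MM_SHUFFLE(3, 1, 2, 0)); /* reorder quadwords */
}
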


sign-extend: {
SSE41, __m128i->__m128i: i8 to i16  (_mm_cvtepi8_epi16)
SSE41, __m128i->__m128i: i8 to i32  (_mm_cvtepi8_epi32)
SSE41, __m128i->__m128i: i16 to i32 (_mm_cvtepi16_epi32)
AVX2,  __m128i->__m256i: i8 to i16  (_mm256_cvtepi8_epi16)
AVX2,  __m128i->__m256i: i8 to i32  (_mm256_cvtepi8_epi32)
AVX2,  __m128i->__m256i: i16 to i32 (_mm256_cvtepi16_epi32)
}

zero-extend: {
SSE41, __m128i->__m128i: u8 to i16  (_mm_cvtepu8_epi16)
SSE41, __m128i->__m128i: u8 to i32  (_mm_cvtepu8_epi32)
SSE41, __m128i->__m128i: u16 to i32 (_mm_cvtepu16_epi32)
AVX2,  __m128i->__m256i: u8 to i16  (_mm256_cvtepu8_epi16)
AVX2,  __m128i->__m256i: u8 to i32  (_mm256_cvtepu8_epi32)
AVX2,  __m128i->__m256i: u16 to i32 (_mm256_cvtepu16_epi32)
}
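
example sketch (made-up names): widening i16 samples to i32 before
accumulating gives headroom when summing many streams; one
_mm256_cvtepi16_epi32 sign-extends 8 samples at a time.

#include <immintrin.h>

__m256i accumWide(__m256i acc32, __m128i samples16){
    __m256i wide = _mm256_cvtepi16_epi32(samples16);  /* i16 -> i32 */
    return _mm256_add_epi32(acc32, wide);             /* sum w/ i32 headroom */
}
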


average: {
SSE,  __m64  ->__m64  : u8  (_mm_avg_pu8)
SSE,  __m64  ->__m64  : u16 (_mm_avg_pu16)
SSE2, __m128i->__m128i: u8  (_mm_avg_epu8)
SSE2, __m128i->__m128i: u16 (_mm_avg_epu16)
AVX2, __m256i->__m256i: u8  (_mm256_avg_epu8)
AVX2, __m256i->__m256i: u16 (_mm256_avg_epu16)
}
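
note/sketch: the avg ops compute (a+b+1)>>1 per element (rounding up),
which doubles as a cheap 50/50 blend of two unsigned streams.

#include <emmintrin.h>

__m128i blend50_u8(__m128i a, __m128i b){
    return _mm_avg_epu8(a, b);  /* per-byte (a+b+1)>>1 */
}
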

max: {
SSE,   __m64  ->__m64  : u8  (_mm_max_pu8)
SSE,   __m64  ->__m64  : i16 (_mm_max_pi16)
SSE,   __m128 ->__m128 : f32 (_mm_max_ps)
SSE2,  __m128i->__m128i: u8  (_mm_max_epu8)
SSE2,  __m128i->__m128i: i16 (_mm_max_epi16)
SSE41, __m128i->__m128i: i8  (_mm_max_epi8)
SSE41, __m128i->__m128i: i32 (_mm_max_epi32)
SSE41, __m128i->__m128i: u16 (_mm_max_epu16)
SSE41, __m128i->__m128i: u32 (_mm_max_epu32)
AVX,   __m256 ->__m256 : f32 (_mm256_max_ps)
AVX2,  __m256i->__m256i: i8  (_mm256_max_epi8)
AVX2,  __m256i->__m256i: i16 (_mm256_max_epi16)
AVX2,  __m256i->__m256i: i32 (_mm256_max_epi32)
AVX2,  __m256i->__m256i: u8  (_mm256_max_epu8)
AVX2,  __m256i->__m256i: u16 (_mm256_max_epu16)
AVX2,  __m256i->__m256i: u32 (_mm256_max_epu32)
}

min: {
SSE,   __m64  ->__m64  : u8  (_mm_min_pu8)
SSE,   __m64  ->__m64  : i16 (_mm_min_pi16)
SSE,   __m128 ->__m128 : f32 (_mm_min_ps)
SSE2,  __m128i->__m128i: u8  (_mm_min_epu8)
SSE2,  __m128i->__m128i: i16 (_mm_min_epi16)
SSE41, __m128i->__m128i: i8  (_mm_min_epi8)
SSE41, __m128i->__m128i: i32 (_mm_min_epi32)
SSE41, __m128i->__m128i: u16 (_mm_min_epu16)
SSE41, __m128i->__m128i: u32 (_mm_min_epu32)
AVX,   __m256 ->__m256 : f32 (_mm256_min_ps)
AVX2,  __m256i->__m256i: i8  (_mm256_min_epi8)
AVX2,  __m256i->__m256i: i16 (_mm256_min_epi16)
AVX2,  __m256i->__m256i: i32 (_mm256_min_epi32)
AVX2,  __m256i->__m256i: u8  (_mm256_min_epu8)
AVX2,  __m256i->__m256i: u16 (_mm256_min_epu16)
AVX2,  __m256i->__m256i: u32 (_mm256_min_epu32)
}

round: {
SSE41, __m128->__m128: f32 (_mm_round_ps)
AVX,   __m256->__m256: f32 (_mm256_round_ps)
}
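
note/sketch: the 'rounding' argument takes _MM_FROUND_* flags (declared
w/ the SSE41 header), e.g. floor, ceil, or nearest:

#include <smmintrin.h>

__m128 roundNearest(__m128 v){
    return _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
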

(reminder: allow mono streams to have stereo volume applied to them)
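
example sketch toward the reminder above (made-up names): duplicate each
mono sample into a stereo pair, then one multiply applies independent
left/right volumes. Assumes interleaved LRLR output and n % 4 == 0.

#include <xmmintrin.h>

void mixMonoStereoVol(float* out, const float* mono,
                      float volL, float volR, int n){
    __m128 vol = _mm_set_ps(volR, volL, volR, volL);  /* lanes: L R L R */
    for(int i = 0; i < n; i += 4){
        __m128 m  = _mm_loadu_ps(mono + i);
        __m128 lo = _mm_mul_ps(_mm_unpacklo_ps(m, m), vol); /* s0L s0R s1L s1R */
        __m128 hi = _mm_mul_ps(_mm_unpackhi_ps(m, m), vol); /* s2L s2R s3L s3R */
        _mm_storeu_ps(out + 2*i,     lo);
        _mm_storeu_ps(out + 2*i + 4, hi);
    }
}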