Advertisement
Guest User

Untitled

a guest
Mar 4th, 2019
137
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.92 KB | None | 0 0
  1. diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
  2. index 517fd63638..817a1bfb52 100644
  3. --- a/libavutil/x86/float_dsp.asm
  4. +++ b/libavutil/x86/float_dsp.asm
  5. @@ -439,29 +439,54 @@ VECTOR_FMUL_REVERSE
  6. %endif
  7.  
  8. ; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
  9. -INIT_XMM sse
  10. +; Sums v1[i]*v2[i] over len floats; v1/v2 must be 16-byte aligned, len a multiple of 4.
  11. +%macro SCALARPRODUCT_FLOAT 0
  12. cglobal scalarproduct_float, 3,3,2, v1, v2, offset
  13. shl offsetd, 2
  14. add v1q, offsetq
  15. add v2q, offsetq
  16. neg offsetq
  17. - xorps xmm0, xmm0
  18. + xorps m0, m0
  19. + add offsetq, mmsize ; bias: offset <= 0 now means a full vector is still left
  20. + jg .loop_end ; fewer than mmsize bytes in total
  21. .loop:
  22. - movaps xmm1, [v1q+offsetq]
  23. - mulps xmm1, [v2q+offsetq]
  24. - addps xmm0, xmm1
  25. - add offsetq, 16
  26. - js .loop
  27. - movhlps xmm1, xmm0
  28. - addps xmm0, xmm1
  29. - movss xmm1, xmm0
  30. - shufps xmm0, xmm0, 1
  31. - addss xmm0, xmm1
  32. +%if mmsize == 32
  33. + movups m1, [v1q+offsetq-mmsize] ; inputs are only 16-byte aligned
  34. +%else
  35. + movaps m1, [v1q+offsetq-mmsize]
  36. +%endif
  37. + mulps m1, [v2q+offsetq-mmsize]
  38. + addps m0, m1
  39. + add offsetq, mmsize
  40. + jle .loop
  41. +.loop_end:
  42. +%if mmsize == 32
  43. + vextractf128 xm1, m0, 1 ; fold the upper lane into the lower one
  44. + addps xm0, xm1
  45. + cmp offsetq, mmsize
  46. + jge .hsum ; len was a multiple of 8: nothing left
  47. + movaps xm1, [v1q-16] ; last 4 floats when len % 8 == 4
  48. + mulps xm1, [v2q-16]
  49. + addps xm0, xm1
  50. +.hsum:
  51. +%endif
  52. + movhlps xm1, xm0 ; horizontal sum of the 4 partial sums
  53. + addps xm0, xm1
  54. + movss xm1, xm0
  55. + shufps xm0, xm0, 1
  56. + addss xm0, xm1
  57. %if ARCH_X86_64 == 0
  58. - movss r0m, xmm0
  59. + movss r0m, xm0 ; scalar store needs the xmm view; m0 is ymm0 in the AVX build
  60. fld dword r0m
  61. %endif
  62. RET
  63. +%endmacro
  64. +INIT_XMM sse
  65. +SCALARPRODUCT_FLOAT
  66. +%if HAVE_AVX_EXTERNAL
  67. +INIT_YMM avx
  68. +SCALARPRODUCT_FLOAT
  69. +%endif
  70.  
  71. ;-----------------------------------------------------------------------------
  72. ; void ff_butterflies_float(float *src0, float *src1, int len);
  73. diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c
  74. index 8826e4e2c9..abd2472245 100644
  75. --- a/libavutil/x86/float_dsp_init.c
  76. +++ b/libavutil/x86/float_dsp_init.c
  77. @@ -77,6 +77,8 @@ void ff_vector_fmul_reverse_avx2(float *dst, const float *src0,
  78.  
  79. float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
  80.  
  81. +float ff_scalarproduct_float_avx(const float *v1, const float *v2, int order);
  82. +
  83. void ff_butterflies_float_sse(float *av_restrict src0, float *av_restrict src1, int len);
  84.  
  85. av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
  86. @@ -109,6 +111,7 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
  87. fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_avx;
  88. fdsp->vector_fmul_add = ff_vector_fmul_add_avx;
  89. fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
  90. + fdsp->scalarproduct_float = ff_scalarproduct_float_avx;
  91. }
  92. if (EXTERNAL_AVX2_FAST(cpu_flags)) {
  93. fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx2;
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement