Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
- index 517fd63638..817a1bfb52 100644
- --- a/libavutil/x86/float_dsp.asm
- +++ b/libavutil/x86/float_dsp.asm
- @@ -439,29 +439,54 @@ VECTOR_FMUL_REVERSE
- %endif
- ; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
- -INIT_XMM sse
- -cglobal scalarproduct_float, 3,3,2, v1, v2, offset
- +; SCALARPRODUCT_FLOAT: emit scalarproduct_float for the vector width chosen by
- +; the preceding INIT_* (one mmsize-byte vector of floats per loop iteration).
- +; Vector registers: m0 = accumulator, m1 = scratch, xm2 = upper lane (ymm only).
- +; NOTE(review): movaps and the mmsize stride assume v1/v2 are mmsize-aligned
- +; and that len*4 is a multiple of mmsize -- confirm callers guarantee this
- +; for the 32-byte version before enabling it.
- +%macro SCALARPRODUCT_FLOAT 0
- +cglobal scalarproduct_float, 3,3,3, v1, v2, offset
- shl offsetd, 2
- add v1q, offsetq
- add v2q, offsetq
- neg offsetq
- - xorps xmm0, xmm0
- + xorps m0, m0 ; m0 = per-element running sums
- .loop:
- - movaps xmm1, [v1q+offsetq]
- - mulps xmm1, [v2q+offsetq]
- - addps xmm0, xmm1
- - add offsetq, 16
- + movaps m1, [v1q+offsetq]
- + mulps m1, [v2q+offsetq]
- + addps m0, m1
- + add offsetq, mmsize ; negative byte offset counts up towards 0
- js .loop
- - movhlps xmm1, xmm0
- - addps xmm0, xmm1
- - movss xmm1, xmm0
- - shufps xmm0, xmm0, 1
- - addss xmm0, xmm1
- +%if mmsize == 32
- + vextractf128 xm2, m0, 1 ; save upper lane: VEX xmm writes to xm0 zero it
- +%endif
- + movhlps xm1, xm0
- + addps xm0, xm1
- + movss xm1, xm0
- + shufps xm0, xm0, 1
- + addss xm0, xm1 ; xm0[0] = sum of the low 128-bit lane
- +%if mmsize == 32
- + movhlps xm1, xm2
- + addps xm2, xm1
- + movss xm1, xm2
- + shufps xm2, xm2, 1
- + addss xm2, xm1 ; xm2[0] = sum of the upper 128-bit lane
- + addss xm0, xm2
- +%endif
- %if ARCH_X86_64 == 0
- - movss r0m, xmm0
- + movss r0m, xm0 ; movss takes an xmm operand even when m0 is ymm0
- fld dword r0m
- %endif
- RET
- +%endmacro
- +
- +INIT_XMM sse ; expand the macro with xmm registers (ff_scalarproduct_float_sse)
- +SCALARPRODUCT_FLOAT
- +%if HAVE_AVX_EXTERNAL
- +INIT_YMM avx ; expand it again with ymm registers (ff_scalarproduct_float_avx)
- +SCALARPRODUCT_FLOAT
- +%endif
- ;-----------------------------------------------------------------------------
- ; void ff_butterflies_float(float *src0, float *src1, int len);
- diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c
- index 8826e4e2c9..abd2472245 100644
- --- a/libavutil/x86/float_dsp_init.c
- +++ b/libavutil/x86/float_dsp_init.c
- @@ -77,6 +77,8 @@ void ff_vector_fmul_reverse_avx2(float *dst, const float *src0,
- float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
- +float ff_scalarproduct_float_avx(const float *v1, const float *v2, int order);
- +
- void ff_butterflies_float_sse(float *av_restrict src0, float *av_restrict src1, int len);
- av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
- @@ -109,6 +111,7 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
- fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_avx;
- fdsp->vector_fmul_add = ff_vector_fmul_add_avx;
- fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
- + fdsp->scalarproduct_float = ff_scalarproduct_float_avx;
- }
- if (EXTERNAL_AVX2_FAST(cpu_flags)) {
- fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx2;
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement