Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- From 9a4242ea1581a448fdb757708ce29032ec3e56b7 Mon Sep 17 00:00:00 2001
- From: Daniel Kang <daniel.d.kang@gmail.com>
- Date: Sun, 23 Oct 2011 12:21:33 -0400
- Subject: [PATCH] h264 qpel WIP
- ---
- libavcodec/x86/Makefile | 1 +
- libavcodec/x86/dsputil_mmx.c | 105 ++--
- libavcodec/x86/h264_qpel.asm | 812 +++++++++++++++++++++++
- libavcodec/x86/h264_qpel_mmx.c | 1388 ++++------------------------------------
- 4 files changed, 996 insertions(+), 1310 deletions(-)
- create mode 100644 libavcodec/x86/h264_qpel.asm
- diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
- index ab13109..ab67045 100644
- --- a/libavcodec/x86/Makefile
- +++ b/libavcodec/x86/Makefile
- @@ -50,6 +50,7 @@ MMX-OBJS-$(HAVE_YASM) += x86/dsputil_yasm.o \
- x86/fmtconvert.o \
- x86/h264_chromamc.o \
- x86/h264_chromamc_10bit.o \
- + x86/h264_qpel.o \
- x86/h264_qpel_10bit.o \
- $(YASM-OBJS-yes)
- diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
- index 58620d6..1cd86c9 100644
- --- a/libavcodec/x86/dsputil_mmx.c
- +++ b/libavcodec/x86/dsputil_mmx.c
- @@ -2635,16 +2635,30 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
- SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2, );
- SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2, );
- +#define H264_QPEL_FUNCS_TMP(x, y, CPU)\
- + c->put_h264_qpel_pixels_tab[0][x+y*4] = ff_put_h264_qpel16_mc##x##y##_##CPU;\
- + c->put_h264_qpel_pixels_tab[1][x+y*4] = ff_put_h264_qpel8_mc##x##y##_##CPU;\
- + c->put_h264_qpel_pixels_tab[2][x+y*4] = ff_put_h264_qpel4_mc##x##y##_##CPU;\
- + c->avg_h264_qpel_pixels_tab[0][x+y*4] = ff_avg_h264_qpel16_mc##x##y##_##CPU;\
- + c->avg_h264_qpel_pixels_tab[1][x+y*4] = ff_avg_h264_qpel8_mc##x##y##_##CPU;\
- + c->avg_h264_qpel_pixels_tab[2][x+y*4] = ff_avg_h264_qpel4_mc##x##y##_##CPU;
- +
- +#define H264_QPEL_FUNCS_TMP48(x, y, CPU)\
- + c->put_h264_qpel_pixels_tab[1][x+y*4] = ff_put_h264_qpel8_mc##x##y##_##CPU;\
- + c->put_h264_qpel_pixels_tab[2][x+y*4] = ff_put_h264_qpel4_mc##x##y##_##CPU;\
- + c->avg_h264_qpel_pixels_tab[1][x+y*4] = ff_avg_h264_qpel8_mc##x##y##_##CPU;\
- + c->avg_h264_qpel_pixels_tab[2][x+y*4] = ff_avg_h264_qpel4_mc##x##y##_##CPU;
- +
- +#if HAVE_YASM
- if (!high_bit_depth) {
- - SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, );
- - SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2, );
- - SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2, );
- - SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, );
- - SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2, );
- - SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2, );
- + SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
- + SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmxext, );
- + SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmxext, );
- + SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
- + SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmxext, );
- + SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmxext, );
- }
- else if (bit_depth == 10) {
- -#if HAVE_YASM
- #if !ARCH_X86_64
- SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
- SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
- @@ -2653,8 +2667,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
- #endif
- SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
- SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
- -#endif
- }
- +#endif
- SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, );
- SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2, );
- @@ -2724,15 +2738,6 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
- SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow, );
- SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow, );
- - if (!high_bit_depth) {
- - SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, );
- - SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow, );
- - SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow, );
- - SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, );
- - SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow, );
- - SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow, );
- - }
- -
- SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, );
- SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow, );
- SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, );
- @@ -2747,37 +2752,33 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
- #endif
- }
- -
- +#if HAVE_YASM
- #define H264_QPEL_FUNCS(x, y, CPU)\
- - c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
- - c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
- - c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
- - c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
- - if((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW)){
- - // these functions are slower than mmx on AMD, but faster on Intel
- - if (!high_bit_depth) {
- - c->put_pixels_tab[0][0] = put_pixels16_sse2;
- - c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
- - c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
- - H264_QPEL_FUNCS(0, 0, sse2);
- - }
- - }
- + c->put_h264_qpel_pixels_tab[0][x+y*4] = ff_put_h264_qpel16_mc##x##y##_##CPU;\
- + c->put_h264_qpel_pixels_tab[1][x+y*4] = ff_put_h264_qpel8_mc##x##y##_##CPU;\
- + c->avg_h264_qpel_pixels_tab[0][x+y*4] = ff_avg_h264_qpel16_mc##x##y##_##CPU;\
- + c->avg_h264_qpel_pixels_tab[1][x+y*4] = ff_avg_h264_qpel8_mc##x##y##_##CPU;
- +#define H264_QPEL_FUNCS8(x, y, CPU)\
- + c->put_h264_qpel_pixels_tab[1][x+y*4] = ff_put_h264_qpel8_mc##x##y##_##CPU;
- if(mm_flags & AV_CPU_FLAG_SSE2){
- if (!high_bit_depth) {
- - H264_QPEL_FUNCS(0, 1, sse2);
- - H264_QPEL_FUNCS(0, 2, sse2);
- - H264_QPEL_FUNCS(0, 3, sse2);
- - H264_QPEL_FUNCS(1, 1, sse2);
- - H264_QPEL_FUNCS(1, 2, sse2);
- - H264_QPEL_FUNCS(1, 3, sse2);
- - H264_QPEL_FUNCS(2, 1, sse2);
- - H264_QPEL_FUNCS(2, 2, sse2);
- - H264_QPEL_FUNCS(2, 3, sse2);
- - H264_QPEL_FUNCS(3, 1, sse2);
- - H264_QPEL_FUNCS(3, 2, sse2);
- - H264_QPEL_FUNCS(3, 3, sse2);
- + //H264_QPEL_FUNCS(0, 0, sse2);
- + H264_QPEL_FUNCS(0, 1, sse2);
- + H264_QPEL_FUNCS(0, 2, sse2);
- + H264_QPEL_FUNCS(0, 3, sse2);
- + H264_QPEL_FUNCS(1, 0, sse2);
- + H264_QPEL_FUNCS(1, 1, sse2);
- + H264_QPEL_FUNCS(1, 2, sse2);
- + H264_QPEL_FUNCS(1, 3, sse2);
- + H264_QPEL_FUNCS(2, 0, sse2);
- + H264_QPEL_FUNCS(2, 1, sse2);
- + H264_QPEL_FUNCS(2, 2, sse2);
- + H264_QPEL_FUNCS(2, 3, sse2);
- + H264_QPEL_FUNCS(3, 0, sse2);
- + H264_QPEL_FUNCS(3, 1, sse2);
- + H264_QPEL_FUNCS(3, 2, sse2);
- + H264_QPEL_FUNCS(3, 3, sse2);
- }
- -#if HAVE_YASM
- #define H264_QPEL_FUNCS_10(x, y, CPU)\
- c->put_h264_qpel_pixels_tab[0][x+y*4] = ff_put_h264_qpel16_mc##x##y##_10_##CPU;\
- c->put_h264_qpel_pixels_tab[1][x+y*4] = ff_put_h264_qpel8_mc##x##y##_10_##CPU;\
- @@ -2799,22 +2800,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
- }
- #if HAVE_SSSE3
- if(mm_flags & AV_CPU_FLAG_SSSE3){
- - if (!high_bit_depth) {
- - H264_QPEL_FUNCS(1, 0, ssse3);
- - H264_QPEL_FUNCS(1, 1, ssse3);
- - H264_QPEL_FUNCS(1, 2, ssse3);
- - H264_QPEL_FUNCS(1, 3, ssse3);
- - H264_QPEL_FUNCS(2, 0, ssse3);
- - H264_QPEL_FUNCS(2, 1, ssse3);
- - H264_QPEL_FUNCS(2, 2, ssse3);
- - H264_QPEL_FUNCS(2, 3, ssse3);
- - H264_QPEL_FUNCS(3, 0, ssse3);
- - H264_QPEL_FUNCS(3, 1, ssse3);
- - H264_QPEL_FUNCS(3, 2, ssse3);
- - H264_QPEL_FUNCS(3, 3, ssse3);
- - }
- #if HAVE_YASM
- - else if (bit_depth == 10) {
- + if (bit_depth == 10) {
- H264_QPEL_FUNCS_10(1, 0, ssse3_cache64)
- H264_QPEL_FUNCS_10(2, 0, ssse3_cache64)
- H264_QPEL_FUNCS_10(3, 0, ssse3_cache64)
- diff --git a/libavcodec/x86/h264_qpel.asm b/libavcodec/x86/h264_qpel.asm
- new file mode 100644
- index 0000000..4fa7c64
- --- /dev/null
- +++ b/libavcodec/x86/h264_qpel.asm
- @@ -0,0 +1,812 @@
- +;*****************************************************************************
- +;* MMX/SSE2-optimized H.264 qpel code
- +;*****************************************************************************
- +;* Copyright (C) 2011 Daniel Kang
- +;*
- +;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
- +;*
- +;* This file is part of Libav.
- +;*
- +;* Libav is free software; you can redistribute it and/or
- +;* modify it under the terms of the GNU Lesser General Public
- +;* License as published by the Free Software Foundation; either
- +;* version 2.1 of the License, or (at your option) any later version.
- +;*
- +;* Libav is distributed in the hope that it will be useful,
- +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
- +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- +;* Lesser General Public License for more details.
- +;*
- +;* You should have received a copy of the GNU Lesser General Public
- +;* License along with Libav; if not, write to the Free Software
- +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- +;******************************************************************************
- +
- +%include "x86inc.asm"
- +%include "x86util.asm"
- +
- +SECTION_RODATA 32
- +
- +cextern pw_16
- +cextern pw_5
- +cextern pb_0
- +
- +SECTION .text
- +
- +%macro AVG_MOV 2
- +; In the internal 16x16 calls, the pointer is incremented by 8 bytes.
- +; This breaks 16-byte alignment, so movement to mmx is required.
- +; Is there a way around this?
- +%if mmsize == 16
- + movdq2q mm0, %2
- + pavgb mm0, %1
- + movq %1, mm0
- +%else
- + pavgb %2, %1
- + MOV_OP %1, %2
- +%endif
- +%endmacro
- +
- +%macro MC 1
- +%define OP_MOV movh
- +INIT_MMX
- +%1 mmxext, put, 4
- +INIT_XMM
- +%1 sse2 , put, 8
- +
- +%define OP_MOV AVG_MOV
- +INIT_MMX
- +%1 mmxext, avg, 4
- +INIT_XMM
- +%1 sse2 , avg, 8
- +%endmacro
- +
- +%macro MCAxA 9
- +%ifdef ARCH_X86_64
- +%ifnidn %1,mmxext
- +MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8,%9
- +%endif
- +%else
- +MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8,%9
- +%endif
- +%endmacro
- +
- +%macro MCAxA_OP 9
- +cglobal %2_h264_qpel%5_%3_%1, %6,%7,%8
- +%ifdef ARCH_X86_32
- + mov r3d, %4*2
- + call stub_%2_h264_qpel%4_%3_%1.skip_prologue
- + mov r0, r0m
- + mov r1, r1m
- + add r0, %4
- + add r1, %4
- + mov r3d, %4*2
- + call stub_%2_h264_qpel%4_%3_%1.skip_prologue
- +%if %9
- + mov r0, r0m
- + mov r1, r1m
- + lea r0, [r0+r2*%4]
- + lea r1, [r1+r2*%4]
- + call stub_%2_h264_qpel%4_%3_%1.skip_prologue
- + mov r0, r0m
- + mov r1, r1m
- + lea r0, [r0+r2*%4+%4]
- + lea r1, [r1+r2*%4+%4]
- + call stub_%2_h264_qpel%4_%3_%1.skip_prologue
- +%endif
- + RET
- +%else ; ARCH_X86_64
- + mov r10, r0
- + mov r11, r1
- + mov r3d, %4*2
- + call stub_%2_h264_qpel%4_%3_%1.skip_prologue
- + lea r0, [r10+%4]
- + lea r1, [r11+%4]
- + mov r3d, %4*2
- +%if %9
- + call stub_%2_h264_qpel%4_%3_%1.skip_prologue
- + lea r0, [r10+r2*%4]
- + lea r1, [r11+r2*%4]
- + call stub_%2_h264_qpel%4_%3_%1.skip_prologue
- + lea r0, [r10+r2*%4+%4]
- + lea r1, [r11+r2*%4+%4]
- +%endif ; %9
- +%ifndef UNIX64 ; fall through to function
- + call stub_%2_h264_qpel%4_%3_%1.skip_prologue
- + RET
- +%endif ; UNIX64
- +%endif ; ARCH
- +%endmacro
- +
- +;cpu, put/avg, mc, 4/8, cglobal args, call 2x or 4x
- +%macro cglobal_mc 7-8 1
- +%assign i %4*2
- +MCAxA %1, %2, %3, %4, i, %5,%6,%7, %8
- +
- +cglobal %2_h264_qpel%4_%3_%1, %5,%6,%7
- +%ifndef UNIX64 ; no prologue or epilogue for UNIX64
- + call stub_%2_h264_qpel%4_%3_%1
- + RET
- +%endif
- +
- +stub_%2_h264_qpel%4_%3_%1:
- +%endmacro
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro COPY4 1
- + %1 m0, [r1 ]
- + OP_MOV [r0 ], m0
- + %1 m0, [r1+r2 ]
- + OP_MOV [r0+r2 ], m0
- + %1 m0, [r1+r2*2]
- + OP_MOV [r0+r2*2], m0
- + %1 m0, [r1+r3 ]
- + OP_MOV [r0+r3 ], m0
- +%endmacro
- +
- +%macro MC00 1
- +INIT_MMX
- +%define MOV_OP movh
- +cglobal %1_h264_qpel4_mc00_mmxext,3,4
- + lea r3, [r2*3 ]
- + COPY4 movh
- + RET
- +
- +%define MOV_OP mova
- +cglobal %1_h264_qpel8_mc00_mmxext,3,4
- + lea r3, [r2*3 ]
- + COPY4 movu
- + lea r0, [r0+r2*4]
- + lea r1, [r1+r2*4]
- + COPY4 movu
- + RET
- +
- +INIT_XMM
- +cglobal %1_h264_qpel16_mc00_sse2,3,5
- + lea r3, [r2*3 ]
- + mov r4d, 4
- +.loop:
- + COPY4 movu
- + lea r0, [r0+r2*4]
- + lea r1, [r1+r2*4]
- + dec r4d
- + jg .loop
- + REP_RET
- +%endmacro
- +
- +%macro AVG_MOV_MC00 2
- +; See AVG_MOV above why this is necessary -- any way around it?
- + pavgb %2, %1
- + MOV_OP %1, %2
- +%endmacro
- +
- +INIT_MMX
- +%define OP_MOV MOV_OP
- +MC00 put
- +
- +INIT_MMX
- +%define OP_MOV AVG_MOV_MC00
- +MC00 avg
- +
- +%define MOV_OP movh ; After mc00, it should be movh
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro MC20 3
- +cglobal_mc %1, %2, mc20, %3, 3,4,8, 0
- + mov r3d, %3
- +.skip_prologue:
- + pxor m7, m7
- + mova m4, [pw_5]
- + mova m5, [pw_16]
- +.nextrow:
- + movh m1, [r1-1]
- + movh m2, [r1+0]
- + movh m3, [r1+1]
- + movh m0, [r1+2]
- + punpcklbw m1, m7
- + punpcklbw m2, m7
- + punpcklbw m3, m7
- + punpcklbw m0, m7
- + paddw m1, m0
- + paddw m2, m3
- + movh m0, [r1-2]
- + movh m3, [r1+3]
- + punpcklbw m0, m7
- + punpcklbw m3, m7
- + paddw m0, m3
- + psllw m2, 2
- + psubw m2, m1
- + pmullw m2, m4
- + paddw m0, m5
- + paddw m0, m2
- + psraw m0, 5
- + packuswb m0, m0
- + OP_MOV [r0], m0
- + add r0, r2
- + add r1, r2
- + dec r3d
- + jg .nextrow
- + rep ret
- +%endmacro
- +
- +MC MC20
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro MC30 3
- +cglobal_mc %1, %2, mc30, %3, 3,5,8, 0
- + mov r3d, %3
- +.skip_prologue:
- + lea r4, [r1+1]
- + jmp stub_%2_h264_qpel%3_mc10_%1.body
- +%endmacro
- +
- +MC MC30
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro MC10 3
- +cglobal_mc %1, %2, mc10, %3, 3,5,8, 0
- + mov r3d, %3
- +.skip_prologue:
- + mov r4, r1
- +.body:
- + pxor m7, m7
- + mova m4, [pw_5]
- + mova m5, [pw_16]
- +.nextrow:
- + movh m1, [r1-1]
- + movh m2, [r1+0]
- + movh m3, [r1+1]
- + movh m0, [r1+2]
- + punpcklbw m1, m7
- + punpcklbw m2, m7
- + punpcklbw m3, m7
- + punpcklbw m0, m7
- + paddw m1, m0
- + paddw m2, m3
- + movh m0, [r1-2]
- + movh m3, [r1+3]
- + punpcklbw m0, m7
- + punpcklbw m3, m7
- + paddw m0, m3
- + psllw m2, 2
- + psubw m2, m1
- + pmullw m2, m4
- + paddw m0, m5
- + paddw m0, m2
- + psraw m0, 5
- + packuswb m0, m0
- + movh m3, [r4]
- + pavgb m0, m3
- + OP_MOV [r0], m0
- + add r0, r2
- + add r1, r2
- + add r4, r2
- + dec r3d
- + jg .nextrow
- + rep ret
- +%endmacro
- +
- +MC MC10
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro V_FILT 3
- +v_filt%1_%2_%3:
- + add r4, r2
- +.no_addr4:
- + mova m6, m2
- + movh m5, [r1]
- + paddw m6, m3
- + psllw m6, 2
- + psubw m6, m1
- + psubw m6, m4
- + punpcklbw m5, m7
- + pmullw m6, [pw_5]
- + paddw m0, [pw_16]
- + paddw m0, m5
- + paddw m0, m6
- + psraw m0, 5
- + packuswb m0, m0
- + add r1, r2
- + add r0, r2
- + ret
- +%endmacro
- +
- +INIT_MMX
- +RESET_MM_PERMUTATION
- +%assign i 0
- +%rep 4
- +V_FILT 4, i, mmxext
- +SWAP 0,1,2,3,4,5
- +%assign i i+1
- +%endrep
- +
- +INIT_XMM
- +RESET_MM_PERMUTATION
- +%assign i 0
- +%rep 6
- +V_FILT 8, i, sse2
- +SWAP 0,1,2,3,4,5
- +%assign i i+1
- +%endrep
- +
- +%macro PRELOAD_V 0
- + pxor m7, m7
- + lea r3, [r2*3]
- + sub r1, r3
- + movh m0, [r1+r2]
- + movh m1, [r1+r2*2]
- + add r1, r3
- + movu m2, [r1]
- + movu m3, [r1+r2]
- + movu m4, [r1+r2*2]
- + add r1, r3
- + punpcklbw m0, m7
- + punpcklbw m1, m7
- + punpcklbw m2, m7
- + punpcklbw m3, m7
- + punpcklbw m4, m7
- +%endmacro
- +
- +%macro MC02 3
- +cglobal_mc %1, %2, mc02, %3, 3,4,8
- +.skip_prologue:
- + PRELOAD_V
- +
- + sub r0, r2
- +%assign j 0
- +%rep %3
- + %assign i (j % 6)
- + call v_filt%3_ %+ i %+ _%1.no_addr4
- + OP_MOV [r0], m0
- + SWAP 0,1,2,3,4,5
- + %assign j j+1
- +%endrep
- + ret
- +%endmacro
- +
- +MC MC02
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro MC01 3
- +cglobal_mc %1, %2, mc01, %3, 3,5,8
- +.skip_prologue:
- + mov r4, r1
- +.body:
- + PRELOAD_V
- +
- + sub r4, r2
- + sub r0, r2
- +%assign j 0
- +%rep %3
- + %assign i (j % 6)
- + call v_filt%3_ %+ i %+ _%1
- + movu m6, [r4]
- + pavgb m0, m6
- + OP_MOV [r0], m0
- + SWAP 0,1,2,3,4,5
- + %assign j j+1
- +%endrep
- + ret
- +%endmacro
- +
- +MC MC01
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro MC03 3
- +cglobal_mc %1, %2, mc03, %3, 3,5,8
- +.skip_prologue:
- + lea r4, [r1+r2]
- + jmp stub_%2_h264_qpel%3_mc01_%1.body
- +%endmacro
- +
- +MC MC03
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro H_FILT_AVG 3-4
- +h_filt%2_%3_%1:
- +;FILT_H with fewer registers and averaged with the FILT_V result
- +;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used in the next iteration
- +;unfortunately I need three registers, so m5 will have to be re-read from memory
- + movh m6, [r4-1]
- + movh m5, [r4+2]
- + punpcklbw m6, m7 ; still 0
- + punpcklbw m5, m7
- + paddw m6, m5
- + movh m5, [r4+0]
- + movh m7, [r4+1]
- + punpcklbw m5, [pb_0]
- + punpcklbw m7, [pb_0]
- + paddw m7, m5
- + psllw m7, 2
- + psubw m7, m6
- + pmullw m7, [pw_5]
- + movh m5, [r4-2]
- + movh m6, [r4+3]
- + punpcklbw m5, [pb_0]
- + punpcklbw m6, [pb_0]
- + paddw m5, m6
- + paddw m5, [pw_16]
- + paddw m5, m7
- + psraw m5, 5
- + packuswb m5, m5
- +;avg FILT_V, FILT_H
- + pavgb m0, m5
- +%if %0!=4
- + movh m5, [r1+r5]
- + punpcklbw m5, [pb_0]
- +%endif
- + ret
- +%endmacro
- +
- +INIT_MMX
- +RESET_MM_PERMUTATION
- +%assign i 0
- +%rep 3
- +H_FILT_AVG mmxext, 4, i
- +SWAP 0,1,2,3,4,5
- +%assign i i+1
- +%endrep
- +H_FILT_AVG mmxext, 4, i, 0
- +
- +INIT_XMM
- +RESET_MM_PERMUTATION
- +%assign i 0
- +%rep 6
- +%if i==1
- +H_FILT_AVG sse2, 8, i, 0
- +%else
- +H_FILT_AVG sse2, 8, i
- +%endif
- +SWAP 0,1,2,3,4,5
- +%assign i i+1
- +%endrep
- +
- +%macro MC11 3
- +; this REALLY needs x86_64
- +cglobal_mc %1, %2, mc11, %3, 3,6,8
- +.skip_prologue:
- + mov r4, r1
- +.body:
- + PRELOAD_V
- +
- + sub r0, r2
- + sub r4, r2
- + mov r5, r2
- + neg r5
- +%assign j 0
- +%rep %3
- + %assign i (j % 6)
- + call v_filt%3_ %+ i %+ _%1
- + call h_filt%3_ %+ i %+ _%1
- + pxor m7, m7
- +%if %3==8 && i==1
- + movh m5, [r1+r5]
- + punpcklbw m5, m7
- +%endif
- + OP_MOV [r0], m0
- + SWAP 0,1,2,3,4,5
- + %assign j j+1
- +%endrep
- + ret
- +%endmacro
- +
- +MC MC11
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro MC31 3
- +cglobal_mc %1, %2, mc31, %3, 3,6,8
- +.skip_prologue:
- + mov r4, r1
- + add r1, 1
- + jmp stub_%2_h264_qpel%3_mc11_%1.body
- +%endmacro
- +
- +MC MC31
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro MC13 3
- +cglobal_mc %1, %2, mc13, %3, 3,6,8
- +.skip_prologue:
- + lea r4, [r1+r2]
- + jmp stub_%2_h264_qpel%3_mc11_%1.body
- +%endmacro
- +
- +MC MC13
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro MC33 3
- +cglobal_mc %1, %2, mc33, %3, 3,6,8
- +.skip_prologue:
- + lea r4, [r1+r2]
- + add r1, 1
- + jmp stub_%2_h264_qpel%3_mc11_%1.body
- +%endmacro
- +
- +MC MC33
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro FILT_VNRD 0
- + movh m5, [r1]
- + punpcklbw m5, m7
- + paddw m0, m5 ; -2,+3
- + paddw m6, m1, m4 ; -1,+2
- + paddw m7, m2, m3 ; +0,+1
- + psllw m7, 2
- + paddw m0, [pw_16]
- + psubw m7, m6
- + pmullw m7, [pw_5]
- + paddw m0, m7
- + pxor m7, m7
- +%endmacro
- +
- +%macro HV 2
- +%ifidn %1,sse2
- +%define PAD 12
- +%define COUNT 2
- +%else
- +%define PAD 0
- +%define COUNT 3
- +%endif
- +put_hv%2_%1:
- + neg r2 ; This actually saves instructions
- + lea r1, [r1+r2*2-(mmsize-PAD)/2]
- + lea r4, [rsp+PAD+gprsize]
- + mov r3d, COUNT
- + pxor m7, m7
- +.v_loop:
- + movh m0, [r1]
- + punpcklbw m0, m7
- + sub r1, r2
- + movh m1, [r1]
- + punpcklbw m1, m7
- + sub r1, r2
- + movh m2, [r1]
- + punpcklbw m2, m7
- + sub r1, r2
- + movh m3, [r1]
- + punpcklbw m3, m7
- + sub r1, r2
- + movh m4, [r1]
- + punpcklbw m4, m7
- + sub r1, r2
- +%assign i 0
- +%rep %2-1
- + FILT_VNRD
- + movu [r4+i*mmsize*3], m0
- + sub r1, r2
- + SWAP 0,1,2,3,4,5
- +%assign i i+1
- +%endrep
- + FILT_VNRD
- + movu [r4+i*mmsize*3], m0
- + add r4, mmsize
- + lea r1, [r1+r2*8+mmsize/2]
- +%if %2==8
- + lea r1, [r1+r2*4]
- +%endif
- + dec r3d
- + jg .v_loop
- + neg r2
- + ret
- +%endmacro
- +
- +INIT_MMX
- +HV mmxext, 4
- +INIT_XMM
- +HV sse2 , 8
- +
- +%macro H_LOOP 2
- +h%2_loop_op_%1:
- + movu m1, [r1+mmsize-4]
- + movu m2, [r1+mmsize-2]
- + mova m3, [r1+mmsize+0]
- + movu m4, [r1+mmsize+2]
- + movu m5, [r1+mmsize+4]
- + movu m6, [r1+mmsize+6]
- + paddw m3, m4
- + paddw m2, m5
- + paddw m1, m6
- + psubw m1, m2
- + psraw m1, 2
- + psubw m1, m2
- + paddsw m1, m3
- + psraw m1, 2
- + paddw m1, m3
- + psraw m1, 6
- + packuswb m1, m1
- + add r1, mmsize*3
- + ret
- +%endmacro
- +
- +INIT_MMX
- +H_LOOP mmxext, 4
- +INIT_XMM
- +H_LOOP sse2 , 8
- +
- +%macro MC22 3
- +cglobal_mc %1, %2, mc22, %3, 3,7,8
- +%define PAD mmsize*16*4 ; SIZE*16*4 -- so 8xX can be called only twice in 16x16
- +.skip_prologue:
- + mov r6, rsp ; backup stack pointer
- + and rsp, ~(mmsize-1) ; align stack
- + sub rsp, PAD
- +
- + call put_hv%3_%1
- +
- + mova m0, [pw_5]
- + mova m7, [pw_16]
- + mov r3d, %3
- + mov r1, rsp
- +.h_loop:
- + call h%3_loop_op_%1
- +
- + OP_MOV [r0], m1
- + add r0, r2
- + dec r3d
- + jg .h_loop
- +
- + mov rsp, r6 ; restore stack pointer
- + ret
- +%endmacro
- +
- +MC MC22
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro MC12 3
- +cglobal_mc %1, %2, mc12, %3, 3,7,8
- +%define PAD mmsize*16*4 ; SIZE*16*4*sizeof(pixel)
- +.skip_prologue:
- + mov r6, rsp ; backup stack pointer
- + and rsp, ~(mmsize-1) ; align stack
- + sub rsp, PAD
- +
- + call put_hv%3_%1
- +
- + xor r4d, r4d
- +.body
- + mov r3d, %3
- + mova m0, [pw_5]
- + mova m7, [pw_16]
- + mov r1, rsp
- +.h_loop:
- + call h%3_loop_op_%1
- +
- + movu m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc
- + psraw m3, 5
- + packuswb m3, m3
- + pavgb m1, m3
- +
- + OP_MOV [r0], m1
- + add r0, r2
- + dec r3d
- + jg .h_loop
- +
- + mov rsp, r6 ; restore stack pointer
- + ret
- +%endmacro
- +
- +MC MC12
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro MC32 3
- +cglobal_mc %1, %2, mc32, %3, 3,7,8
- +%define PAD mmsize*16*4 ; SIZE*16*4*sizeof(pixel)
- +.skip_prologue:
- + mov r6, rsp ; backup stack pointer
- + and rsp, ~(mmsize-1) ; align stack
- + sub rsp, PAD
- +
- + call put_hv%3_%1
- +
- + mov r4d, 2 ; sizeof(pixel)
- + jmp stub_%2_h264_qpel%3_mc12_%1.body
- +%endmacro
- +
- +MC MC32
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro H_NRD 2
- +put_h%2_%1:
- + add rsp, gprsize
- + mov r3d, %2
- + xor r4d, r4d
- + pxor m7, m7
- + mova m4, [pw_5]
- + mova m5, [pw_16]
- +.nextrow
- + movh m1, [r5-1]
- + movh m2, [r5+0]
- + movh m3, [r5+1]
- + movh m0, [r5+2]
- + punpcklbw m1, m7
- + punpcklbw m2, m7
- + punpcklbw m3, m7
- + punpcklbw m0, m7
- + paddw m1, m0
- + paddw m2, m3
- + movh m0, [r5-2]
- + movh m3, [r5+3]
- + punpcklbw m0, m7
- + punpcklbw m3, m7
- + paddw m0, m3
- + psllw m2, 2
- + psubw m2, m1
- + pmullw m2, m4
- + paddw m0, m5
- + paddw m2, m0
- + mova [rsp+r4], m2
- + add r4d, mmsize*3
- + add r5, r2
- + dec r3d
- + jg .nextrow
- + sub rsp, gprsize
- + ret
- +%endmacro
- +
- +INIT_MMX
- +H_NRD mmxext, 4
- +INIT_XMM
- +H_NRD sse2 , 8
- +
- +%macro MC21 3
- +cglobal_mc %1, %2, mc21, %3, 3,7,8
- +.skip_prologue:
- + mov r5, r1
- +.body
- +%define PAD mmsize*16*4 ; SIZE*16*4*sizeof(pixel)
- + mov r6, rsp ; backup stack pointer
- + and rsp, ~(mmsize-1) ; align stack
- +
- + sub rsp, PAD
- + call put_h%3_%1
- +
- + sub rsp, PAD
- + call put_hv%3_%1
- +
- + mov r4d, PAD-mmsize ; H buffer
- + jmp stub_%2_h264_qpel%3_mc12_%1.body
- +%endmacro
- +
- +MC MC21
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro MC23 3
- +cglobal_mc %1, %2, mc23, %3, 3,7,8
- +.skip_prologue:
- + lea r5, [r1+r2]
- + jmp stub_%2_h264_qpel%3_mc21_%1.body
- +%endmacro
- +
- +MC MC23
- diff --git a/libavcodec/x86/h264_qpel_mmx.c b/libavcodec/x86/h264_qpel_mmx.c
- index b7a4183..9231bc1 100644
- --- a/libavcodec/x86/h264_qpel_mmx.c
- +++ b/libavcodec/x86/h264_qpel_mmx.c
- @@ -21,1191 +21,9 @@
- #include "dsputil_mmx.h"
- -/***********************************/
- -/* motion compensation */
- -
- -#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\
- - "mov"#q" "#C", "#T" \n\t"\
- - "mov"#d" (%0), "#F" \n\t"\
- - "paddw "#D", "#T" \n\t"\
- - "psllw $2, "#T" \n\t"\
- - "psubw "#B", "#T" \n\t"\
- - "psubw "#E", "#T" \n\t"\
- - "punpcklbw "#Z", "#F" \n\t"\
- - "pmullw "MANGLE(ff_pw_5)", "#T"\n\t"\
- - "paddw "MANGLE(ff_pw_16)", "#A"\n\t"\
- - "add %2, %0 \n\t"\
- - "paddw "#F", "#A" \n\t"\
- - "paddw "#A", "#T" \n\t"\
- - "psraw $5, "#T" \n\t"\
- - "packuswb "#T", "#T" \n\t"\
- - OP(T, (%1), A, d)\
- - "add %3, %1 \n\t"
- -
- -#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\
- - "mov"#q" "#C", "#T" \n\t"\
- - "mov"#d" (%0), "#F" \n\t"\
- - "paddw "#D", "#T" \n\t"\
- - "psllw $2, "#T" \n\t"\
- - "paddw "MANGLE(ff_pw_16)", "#A"\n\t"\
- - "psubw "#B", "#T" \n\t"\
- - "psubw "#E", "#T" \n\t"\
- - "punpcklbw "#Z", "#F" \n\t"\
- - "pmullw "MANGLE(ff_pw_5)", "#T"\n\t"\
- - "paddw "#F", "#A" \n\t"\
- - "add %2, %0 \n\t"\
- - "paddw "#A", "#T" \n\t"\
- - "mov"#q" "#T", "#OF"(%1) \n\t"
- -
- -#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q)
- -#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q)
- -#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa)
- -#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa)
- -
- -
- -#define QPEL_H264(OPNAME, OP, MMX)\
- -static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- - int h=4;\
- -\
- - __asm__ volatile(\
- - "pxor %%mm7, %%mm7 \n\t"\
- - "movq "MANGLE(ff_pw_5) ", %%mm4\n\t"\
- - "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
- - "1: \n\t"\
- - "movd -1(%0), %%mm1 \n\t"\
- - "movd (%0), %%mm2 \n\t"\
- - "movd 1(%0), %%mm3 \n\t"\
- - "movd 2(%0), %%mm0 \n\t"\
- - "punpcklbw %%mm7, %%mm1 \n\t"\
- - "punpcklbw %%mm7, %%mm2 \n\t"\
- - "punpcklbw %%mm7, %%mm3 \n\t"\
- - "punpcklbw %%mm7, %%mm0 \n\t"\
- - "paddw %%mm0, %%mm1 \n\t"\
- - "paddw %%mm3, %%mm2 \n\t"\
- - "movd -2(%0), %%mm0 \n\t"\
- - "movd 3(%0), %%mm3 \n\t"\
- - "punpcklbw %%mm7, %%mm0 \n\t"\
- - "punpcklbw %%mm7, %%mm3 \n\t"\
- - "paddw %%mm3, %%mm0 \n\t"\
- - "psllw $2, %%mm2 \n\t"\
- - "psubw %%mm1, %%mm2 \n\t"\
- - "pmullw %%mm4, %%mm2 \n\t"\
- - "paddw %%mm5, %%mm0 \n\t"\
- - "paddw %%mm2, %%mm0 \n\t"\
- - "psraw $5, %%mm0 \n\t"\
- - "packuswb %%mm0, %%mm0 \n\t"\
- - OP(%%mm0, (%1),%%mm6, d)\
- - "add %3, %0 \n\t"\
- - "add %4, %1 \n\t"\
- - "decl %2 \n\t"\
- - " jnz 1b \n\t"\
- - : "+a"(src), "+c"(dst), "+g"(h)\
- - : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
- - : "memory"\
- - );\
- -}\
- -static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
- - int h=4;\
- - __asm__ volatile(\
- - "pxor %%mm7, %%mm7 \n\t"\
- - "movq %0, %%mm4 \n\t"\
- - "movq %1, %%mm5 \n\t"\
- - :: "m"(ff_pw_5), "m"(ff_pw_16)\
- - );\
- - do{\
- - __asm__ volatile(\
- - "movd -1(%0), %%mm1 \n\t"\
- - "movd (%0), %%mm2 \n\t"\
- - "movd 1(%0), %%mm3 \n\t"\
- - "movd 2(%0), %%mm0 \n\t"\
- - "punpcklbw %%mm7, %%mm1 \n\t"\
- - "punpcklbw %%mm7, %%mm2 \n\t"\
- - "punpcklbw %%mm7, %%mm3 \n\t"\
- - "punpcklbw %%mm7, %%mm0 \n\t"\
- - "paddw %%mm0, %%mm1 \n\t"\
- - "paddw %%mm3, %%mm2 \n\t"\
- - "movd -2(%0), %%mm0 \n\t"\
- - "movd 3(%0), %%mm3 \n\t"\
- - "punpcklbw %%mm7, %%mm0 \n\t"\
- - "punpcklbw %%mm7, %%mm3 \n\t"\
- - "paddw %%mm3, %%mm0 \n\t"\
- - "psllw $2, %%mm2 \n\t"\
- - "psubw %%mm1, %%mm2 \n\t"\
- - "pmullw %%mm4, %%mm2 \n\t"\
- - "paddw %%mm5, %%mm0 \n\t"\
- - "paddw %%mm2, %%mm0 \n\t"\
- - "movd (%2), %%mm3 \n\t"\
- - "psraw $5, %%mm0 \n\t"\
- - "packuswb %%mm0, %%mm0 \n\t"\
- - PAVGB" %%mm3, %%mm0 \n\t"\
- - OP(%%mm0, (%1),%%mm6, d)\
- - "add %4, %0 \n\t"\
- - "add %4, %1 \n\t"\
- - "add %3, %2 \n\t"\
- - : "+a"(src), "+c"(dst), "+d"(src2)\
- - : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
- - : "memory"\
- - );\
- - }while(--h);\
- -}\
- -static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- - src -= 2*srcStride;\
- - __asm__ volatile(\
- - "pxor %%mm7, %%mm7 \n\t"\
- - "movd (%0), %%mm0 \n\t"\
- - "add %2, %0 \n\t"\
- - "movd (%0), %%mm1 \n\t"\
- - "add %2, %0 \n\t"\
- - "movd (%0), %%mm2 \n\t"\
- - "add %2, %0 \n\t"\
- - "movd (%0), %%mm3 \n\t"\
- - "add %2, %0 \n\t"\
- - "movd (%0), %%mm4 \n\t"\
- - "add %2, %0 \n\t"\
- - "punpcklbw %%mm7, %%mm0 \n\t"\
- - "punpcklbw %%mm7, %%mm1 \n\t"\
- - "punpcklbw %%mm7, %%mm2 \n\t"\
- - "punpcklbw %%mm7, %%mm3 \n\t"\
- - "punpcklbw %%mm7, %%mm4 \n\t"\
- - QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
- - QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
- - QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
- - QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
- - \
- - : "+a"(src), "+c"(dst)\
- - : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
- - : "memory"\
- - );\
- -}\
- -static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
- - int h=4;\
- - int w=3;\
- - src -= 2*srcStride+2;\
- - while(w--){\
- - __asm__ volatile(\
- - "pxor %%mm7, %%mm7 \n\t"\
- - "movd (%0), %%mm0 \n\t"\
- - "add %2, %0 \n\t"\
- - "movd (%0), %%mm1 \n\t"\
- - "add %2, %0 \n\t"\
- - "movd (%0), %%mm2 \n\t"\
- - "add %2, %0 \n\t"\
- - "movd (%0), %%mm3 \n\t"\
- - "add %2, %0 \n\t"\
- - "movd (%0), %%mm4 \n\t"\
- - "add %2, %0 \n\t"\
- - "punpcklbw %%mm7, %%mm0 \n\t"\
- - "punpcklbw %%mm7, %%mm1 \n\t"\
- - "punpcklbw %%mm7, %%mm2 \n\t"\
- - "punpcklbw %%mm7, %%mm3 \n\t"\
- - "punpcklbw %%mm7, %%mm4 \n\t"\
- - QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\
- - QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
- - QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
- - QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
- - \
- - : "+a"(src)\
- - : "c"(tmp), "S"((x86_reg)srcStride)\
- - : "memory"\
- - );\
- - tmp += 4;\
- - src += 4 - 9*srcStride;\
- - }\
- - tmp -= 3*4;\
- - __asm__ volatile(\
- - "1: \n\t"\
- - "movq (%0), %%mm0 \n\t"\
- - "paddw 10(%0), %%mm0 \n\t"\
- - "movq 2(%0), %%mm1 \n\t"\
- - "paddw 8(%0), %%mm1 \n\t"\
- - "movq 4(%0), %%mm2 \n\t"\
- - "paddw 6(%0), %%mm2 \n\t"\
- - "psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\
- - "psraw $2, %%mm0 \n\t"/*(a-b)/4 */\
- - "psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\
- - "paddsw %%mm2, %%mm0 \n\t"\
- - "psraw $2, %%mm0 \n\t"/*((a-b)/4-b+c)/4 */\
- - "paddw %%mm2, %%mm0 \n\t"/*(a-5*b+20*c)/16 */\
- - "psraw $6, %%mm0 \n\t"\
- - "packuswb %%mm0, %%mm0 \n\t"\
- - OP(%%mm0, (%1),%%mm7, d)\
- - "add $24, %0 \n\t"\
- - "add %3, %1 \n\t"\
- - "decl %2 \n\t"\
- - " jnz 1b \n\t"\
- - : "+a"(tmp), "+c"(dst), "+g"(h)\
- - : "S"((x86_reg)dstStride)\
- - : "memory"\
- - );\
- -}\
- -\
- -static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- - int h=8;\
- - __asm__ volatile(\
- - "pxor %%mm7, %%mm7 \n\t"\
- - "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
- - "1: \n\t"\
- - "movq (%0), %%mm0 \n\t"\
- - "movq 1(%0), %%mm2 \n\t"\
- - "movq %%mm0, %%mm1 \n\t"\
- - "movq %%mm2, %%mm3 \n\t"\
- - "punpcklbw %%mm7, %%mm0 \n\t"\
- - "punpckhbw %%mm7, %%mm1 \n\t"\
- - "punpcklbw %%mm7, %%mm2 \n\t"\
- - "punpckhbw %%mm7, %%mm3 \n\t"\
- - "paddw %%mm2, %%mm0 \n\t"\
- - "paddw %%mm3, %%mm1 \n\t"\
- - "psllw $2, %%mm0 \n\t"\
- - "psllw $2, %%mm1 \n\t"\
- - "movq -1(%0), %%mm2 \n\t"\
- - "movq 2(%0), %%mm4 \n\t"\
- - "movq %%mm2, %%mm3 \n\t"\
- - "movq %%mm4, %%mm5 \n\t"\
- - "punpcklbw %%mm7, %%mm2 \n\t"\
- - "punpckhbw %%mm7, %%mm3 \n\t"\
- - "punpcklbw %%mm7, %%mm4 \n\t"\
- - "punpckhbw %%mm7, %%mm5 \n\t"\
- - "paddw %%mm4, %%mm2 \n\t"\
- - "paddw %%mm3, %%mm5 \n\t"\
- - "psubw %%mm2, %%mm0 \n\t"\
- - "psubw %%mm5, %%mm1 \n\t"\
- - "pmullw %%mm6, %%mm0 \n\t"\
- - "pmullw %%mm6, %%mm1 \n\t"\
- - "movd -2(%0), %%mm2 \n\t"\
- - "movd 7(%0), %%mm5 \n\t"\
- - "punpcklbw %%mm7, %%mm2 \n\t"\
- - "punpcklbw %%mm7, %%mm5 \n\t"\
- - "paddw %%mm3, %%mm2 \n\t"\
- - "paddw %%mm5, %%mm4 \n\t"\
- - "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
- - "paddw %%mm5, %%mm2 \n\t"\
- - "paddw %%mm5, %%mm4 \n\t"\
- - "paddw %%mm2, %%mm0 \n\t"\
- - "paddw %%mm4, %%mm1 \n\t"\
- - "psraw $5, %%mm0 \n\t"\
- - "psraw $5, %%mm1 \n\t"\
- - "packuswb %%mm1, %%mm0 \n\t"\
- - OP(%%mm0, (%1),%%mm5, q)\
- - "add %3, %0 \n\t"\
- - "add %4, %1 \n\t"\
- - "decl %2 \n\t"\
- - " jnz 1b \n\t"\
- - : "+a"(src), "+c"(dst), "+g"(h)\
- - : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
- - : "memory"\
- - );\
- -}\
- -\
- -static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
- - int h=8;\
- - __asm__ volatile(\
- - "pxor %%mm7, %%mm7 \n\t"\
- - "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
- - "1: \n\t"\
- - "movq (%0), %%mm0 \n\t"\
- - "movq 1(%0), %%mm2 \n\t"\
- - "movq %%mm0, %%mm1 \n\t"\
- - "movq %%mm2, %%mm3 \n\t"\
- - "punpcklbw %%mm7, %%mm0 \n\t"\
- - "punpckhbw %%mm7, %%mm1 \n\t"\
- - "punpcklbw %%mm7, %%mm2 \n\t"\
- - "punpckhbw %%mm7, %%mm3 \n\t"\
- - "paddw %%mm2, %%mm0 \n\t"\
- - "paddw %%mm3, %%mm1 \n\t"\
- - "psllw $2, %%mm0 \n\t"\
- - "psllw $2, %%mm1 \n\t"\
- - "movq -1(%0), %%mm2 \n\t"\
- - "movq 2(%0), %%mm4 \n\t"\
- - "movq %%mm2, %%mm3 \n\t"\
- - "movq %%mm4, %%mm5 \n\t"\
- - "punpcklbw %%mm7, %%mm2 \n\t"\
- - "punpckhbw %%mm7, %%mm3 \n\t"\
- - "punpcklbw %%mm7, %%mm4 \n\t"\
- - "punpckhbw %%mm7, %%mm5 \n\t"\
- - "paddw %%mm4, %%mm2 \n\t"\
- - "paddw %%mm3, %%mm5 \n\t"\
- - "psubw %%mm2, %%mm0 \n\t"\
- - "psubw %%mm5, %%mm1 \n\t"\
- - "pmullw %%mm6, %%mm0 \n\t"\
- - "pmullw %%mm6, %%mm1 \n\t"\
- - "movd -2(%0), %%mm2 \n\t"\
- - "movd 7(%0), %%mm5 \n\t"\
- - "punpcklbw %%mm7, %%mm2 \n\t"\
- - "punpcklbw %%mm7, %%mm5 \n\t"\
- - "paddw %%mm3, %%mm2 \n\t"\
- - "paddw %%mm5, %%mm4 \n\t"\
- - "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
- - "paddw %%mm5, %%mm2 \n\t"\
- - "paddw %%mm5, %%mm4 \n\t"\
- - "paddw %%mm2, %%mm0 \n\t"\
- - "paddw %%mm4, %%mm1 \n\t"\
- - "psraw $5, %%mm0 \n\t"\
- - "psraw $5, %%mm1 \n\t"\
- - "movq (%2), %%mm4 \n\t"\
- - "packuswb %%mm1, %%mm0 \n\t"\
- - PAVGB" %%mm4, %%mm0 \n\t"\
- - OP(%%mm0, (%1),%%mm5, q)\
- - "add %5, %0 \n\t"\
- - "add %5, %1 \n\t"\
- - "add %4, %2 \n\t"\
- - "decl %3 \n\t"\
- - "jg 1b \n\t"\
- - : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
- - : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
- - : "memory"\
- - );\
- -}\
- -\
- -static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
- - int w= 2;\
- - src -= 2*srcStride;\
- - \
- - while(w--){\
- - __asm__ volatile(\
- - "pxor %%mm7, %%mm7 \n\t"\
- - "movd (%0), %%mm0 \n\t"\
- - "add %2, %0 \n\t"\
- - "movd (%0), %%mm1 \n\t"\
- - "add %2, %0 \n\t"\
- - "movd (%0), %%mm2 \n\t"\
- - "add %2, %0 \n\t"\
- - "movd (%0), %%mm3 \n\t"\
- - "add %2, %0 \n\t"\
- - "movd (%0), %%mm4 \n\t"\
- - "add %2, %0 \n\t"\
- - "punpcklbw %%mm7, %%mm0 \n\t"\
- - "punpcklbw %%mm7, %%mm1 \n\t"\
- - "punpcklbw %%mm7, %%mm2 \n\t"\
- - "punpcklbw %%mm7, %%mm3 \n\t"\
- - "punpcklbw %%mm7, %%mm4 \n\t"\
- - QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
- - QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
- - QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
- - QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
- - QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
- - QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
- - QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
- - QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
- - "cmpl $16, %4 \n\t"\
- - "jne 2f \n\t"\
- - QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
- - QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
- - QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
- - QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
- - QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
- - QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
- - QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
- - QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
- - "2: \n\t"\
- - \
- - : "+a"(src), "+c"(dst)\
- - : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "rm"(h)\
- - : "memory"\
- - );\
- - src += 4-(h+5)*srcStride;\
- - dst += 4-h*dstStride;\
- - }\
- -}\
- -static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
- - int w = (size+8)>>2;\
- - src -= 2*srcStride+2;\
- - while(w--){\
- - __asm__ volatile(\
- - "pxor %%mm7, %%mm7 \n\t"\
- - "movd (%0), %%mm0 \n\t"\
- - "add %2, %0 \n\t"\
- - "movd (%0), %%mm1 \n\t"\
- - "add %2, %0 \n\t"\
- - "movd (%0), %%mm2 \n\t"\
- - "add %2, %0 \n\t"\
- - "movd (%0), %%mm3 \n\t"\
- - "add %2, %0 \n\t"\
- - "movd (%0), %%mm4 \n\t"\
- - "add %2, %0 \n\t"\
- - "punpcklbw %%mm7, %%mm0 \n\t"\
- - "punpcklbw %%mm7, %%mm1 \n\t"\
- - "punpcklbw %%mm7, %%mm2 \n\t"\
- - "punpcklbw %%mm7, %%mm3 \n\t"\
- - "punpcklbw %%mm7, %%mm4 \n\t"\
- - QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\
- - QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
- - QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
- - QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
- - QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
- - QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
- - QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
- - QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
- - "cmpl $16, %3 \n\t"\
- - "jne 2f \n\t"\
- - QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\
- - QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\
- - QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
- - QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
- - QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
- - QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
- - QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
- - QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
- - "2: \n\t"\
- - : "+a"(src)\
- - : "c"(tmp), "S"((x86_reg)srcStride), "rm"(size)\
- - : "memory"\
- - );\
- - tmp += 4;\
- - src += 4 - (size+5)*srcStride;\
- - }\
- -}\
- -static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
- - int w = size>>4;\
- - do{\
- - int h = size;\
- - __asm__ volatile(\
- - "1: \n\t"\
- - "movq (%0), %%mm0 \n\t"\
- - "movq 8(%0), %%mm3 \n\t"\
- - "movq 2(%0), %%mm1 \n\t"\
- - "movq 10(%0), %%mm4 \n\t"\
- - "paddw %%mm4, %%mm0 \n\t"\
- - "paddw %%mm3, %%mm1 \n\t"\
- - "paddw 18(%0), %%mm3 \n\t"\
- - "paddw 16(%0), %%mm4 \n\t"\
- - "movq 4(%0), %%mm2 \n\t"\
- - "movq 12(%0), %%mm5 \n\t"\
- - "paddw 6(%0), %%mm2 \n\t"\
- - "paddw 14(%0), %%mm5 \n\t"\
- - "psubw %%mm1, %%mm0 \n\t"\
- - "psubw %%mm4, %%mm3 \n\t"\
- - "psraw $2, %%mm0 \n\t"\
- - "psraw $2, %%mm3 \n\t"\
- - "psubw %%mm1, %%mm0 \n\t"\
- - "psubw %%mm4, %%mm3 \n\t"\
- - "paddsw %%mm2, %%mm0 \n\t"\
- - "paddsw %%mm5, %%mm3 \n\t"\
- - "psraw $2, %%mm0 \n\t"\
- - "psraw $2, %%mm3 \n\t"\
- - "paddw %%mm2, %%mm0 \n\t"\
- - "paddw %%mm5, %%mm3 \n\t"\
- - "psraw $6, %%mm0 \n\t"\
- - "psraw $6, %%mm3 \n\t"\
- - "packuswb %%mm3, %%mm0 \n\t"\
- - OP(%%mm0, (%1),%%mm7, q)\
- - "add $48, %0 \n\t"\
- - "add %3, %1 \n\t"\
- - "decl %2 \n\t"\
- - " jnz 1b \n\t"\
- - : "+a"(tmp), "+c"(dst), "+g"(h)\
- - : "S"((x86_reg)dstStride)\
- - : "memory"\
- - );\
- - tmp += 8 - size*24;\
- - dst += 8 - size*dstStride;\
- - }while(w--);\
- -}\
- -\
- -static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
- -}\
- -static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
- - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
- -}\
- -\
- -static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
- - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
- - src += 8*srcStride;\
- - dst += 8*dstStride;\
- - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
- - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
- -}\
- -\
- -static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
- - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
- - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
- - src += 8*dstStride;\
- - dst += 8*dstStride;\
- - src2 += 8*src2Stride;\
- - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
- - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
- -}\
- -\
- -static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
- - put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
- - OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
- -}\
- -static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
- - OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
- -}\
- -\
- -static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
- - OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
- -}\
- -\
- -static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
- -{\
- - __asm__ volatile(\
- - "movq (%1), %%mm0 \n\t"\
- - "movq 24(%1), %%mm1 \n\t"\
- - "psraw $5, %%mm0 \n\t"\
- - "psraw $5, %%mm1 \n\t"\
- - "packuswb %%mm0, %%mm0 \n\t"\
- - "packuswb %%mm1, %%mm1 \n\t"\
- - PAVGB" (%0), %%mm0 \n\t"\
- - PAVGB" (%0,%3), %%mm1 \n\t"\
- - OP(%%mm0, (%2), %%mm4, d)\
- - OP(%%mm1, (%2,%4), %%mm5, d)\
- - "lea (%0,%3,2), %0 \n\t"\
- - "lea (%2,%4,2), %2 \n\t"\
- - "movq 48(%1), %%mm0 \n\t"\
- - "movq 72(%1), %%mm1 \n\t"\
- - "psraw $5, %%mm0 \n\t"\
- - "psraw $5, %%mm1 \n\t"\
- - "packuswb %%mm0, %%mm0 \n\t"\
- - "packuswb %%mm1, %%mm1 \n\t"\
- - PAVGB" (%0), %%mm0 \n\t"\
- - PAVGB" (%0,%3), %%mm1 \n\t"\
- - OP(%%mm0, (%2), %%mm4, d)\
- - OP(%%mm1, (%2,%4), %%mm5, d)\
- - :"+a"(src8), "+c"(src16), "+d"(dst)\
- - :"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\
- - :"memory");\
- -}\
- -static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
- -{\
- - do{\
- - __asm__ volatile(\
- - "movq (%1), %%mm0 \n\t"\
- - "movq 8(%1), %%mm1 \n\t"\
- - "movq 48(%1), %%mm2 \n\t"\
- - "movq 8+48(%1), %%mm3 \n\t"\
- - "psraw $5, %%mm0 \n\t"\
- - "psraw $5, %%mm1 \n\t"\
- - "psraw $5, %%mm2 \n\t"\
- - "psraw $5, %%mm3 \n\t"\
- - "packuswb %%mm1, %%mm0 \n\t"\
- - "packuswb %%mm3, %%mm2 \n\t"\
- - PAVGB" (%0), %%mm0 \n\t"\
- - PAVGB" (%0,%3), %%mm2 \n\t"\
- - OP(%%mm0, (%2), %%mm5, q)\
- - OP(%%mm2, (%2,%4), %%mm5, q)\
- - ::"a"(src8), "c"(src16), "d"(dst),\
- - "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\
- - :"memory");\
- - src8 += 2L*src8Stride;\
- - src16 += 48;\
- - dst += 2L*dstStride;\
- - }while(h-=2);\
- -}\
- -static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
- -{\
- - OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
- - OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
- -}\
- -
- -
- -#if ARCH_X86_64
- -#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
- -static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
- - int h=16;\
- - __asm__ volatile(\
- - "pxor %%xmm15, %%xmm15 \n\t"\
- - "movdqa %6, %%xmm14 \n\t"\
- - "movdqa %7, %%xmm13 \n\t"\
- - "1: \n\t"\
- - "lddqu 6(%0), %%xmm1 \n\t"\
- - "lddqu -2(%0), %%xmm7 \n\t"\
- - "movdqa %%xmm1, %%xmm0 \n\t"\
- - "punpckhbw %%xmm15, %%xmm1 \n\t"\
- - "punpcklbw %%xmm15, %%xmm0 \n\t"\
- - "punpcklbw %%xmm15, %%xmm7 \n\t"\
- - "movdqa %%xmm1, %%xmm2 \n\t"\
- - "movdqa %%xmm0, %%xmm6 \n\t"\
- - "movdqa %%xmm1, %%xmm3 \n\t"\
- - "movdqa %%xmm0, %%xmm8 \n\t"\
- - "movdqa %%xmm1, %%xmm4 \n\t"\
- - "movdqa %%xmm0, %%xmm9 \n\t"\
- - "movdqa %%xmm0, %%xmm12 \n\t"\
- - "movdqa %%xmm1, %%xmm11 \n\t"\
- - "palignr $10,%%xmm0, %%xmm11\n\t"\
- - "palignr $10,%%xmm7, %%xmm12\n\t"\
- - "palignr $2, %%xmm0, %%xmm4 \n\t"\
- - "palignr $2, %%xmm7, %%xmm9 \n\t"\
- - "palignr $4, %%xmm0, %%xmm3 \n\t"\
- - "palignr $4, %%xmm7, %%xmm8 \n\t"\
- - "palignr $6, %%xmm0, %%xmm2 \n\t"\
- - "palignr $6, %%xmm7, %%xmm6 \n\t"\
- - "paddw %%xmm0 ,%%xmm11 \n\t"\
- - "palignr $8, %%xmm0, %%xmm1 \n\t"\
- - "palignr $8, %%xmm7, %%xmm0 \n\t"\
- - "paddw %%xmm12,%%xmm7 \n\t"\
- - "paddw %%xmm3, %%xmm2 \n\t"\
- - "paddw %%xmm8, %%xmm6 \n\t"\
- - "paddw %%xmm4, %%xmm1 \n\t"\
- - "paddw %%xmm9, %%xmm0 \n\t"\
- - "psllw $2, %%xmm2 \n\t"\
- - "psllw $2, %%xmm6 \n\t"\
- - "psubw %%xmm1, %%xmm2 \n\t"\
- - "psubw %%xmm0, %%xmm6 \n\t"\
- - "paddw %%xmm13,%%xmm11 \n\t"\
- - "paddw %%xmm13,%%xmm7 \n\t"\
- - "pmullw %%xmm14,%%xmm2 \n\t"\
- - "pmullw %%xmm14,%%xmm6 \n\t"\
- - "lddqu (%2), %%xmm3 \n\t"\
- - "paddw %%xmm11,%%xmm2 \n\t"\
- - "paddw %%xmm7, %%xmm6 \n\t"\
- - "psraw $5, %%xmm2 \n\t"\
- - "psraw $5, %%xmm6 \n\t"\
- - "packuswb %%xmm2,%%xmm6 \n\t"\
- - "pavgb %%xmm3, %%xmm6 \n\t"\
- - OP(%%xmm6, (%1), %%xmm4, dqa)\
- - "add %5, %0 \n\t"\
- - "add %5, %1 \n\t"\
- - "add %4, %2 \n\t"\
- - "decl %3 \n\t"\
- - "jg 1b \n\t"\
- - : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
- - : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
- - "m"(ff_pw_5), "m"(ff_pw_16)\
- - : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , \
- - "%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" , \
- - "%xmm8" , "%xmm9" , "%xmm10", "%xmm11", \
- - "%xmm12", "%xmm13", "%xmm14", "%xmm15",)\
- - "memory"\
- - );\
- -}
- -#else // ARCH_X86_64
- -#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
- -static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
- - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
- - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
- - src += 8*dstStride;\
- - dst += 8*dstStride;\
- - src2 += 8*src2Stride;\
- - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
- - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
- -}
- -#endif // ARCH_X86_64
- -
- -#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
- -static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
- - int h=8;\
- - __asm__ volatile(\
- - "pxor %%xmm7, %%xmm7 \n\t"\
- - "movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\
- - "1: \n\t"\
- - "lddqu -2(%0), %%xmm1 \n\t"\
- - "movdqa %%xmm1, %%xmm0 \n\t"\
- - "punpckhbw %%xmm7, %%xmm1 \n\t"\
- - "punpcklbw %%xmm7, %%xmm0 \n\t"\
- - "movdqa %%xmm1, %%xmm2 \n\t"\
- - "movdqa %%xmm1, %%xmm3 \n\t"\
- - "movdqa %%xmm1, %%xmm4 \n\t"\
- - "movdqa %%xmm1, %%xmm5 \n\t"\
- - "palignr $2, %%xmm0, %%xmm4 \n\t"\
- - "palignr $4, %%xmm0, %%xmm3 \n\t"\
- - "palignr $6, %%xmm0, %%xmm2 \n\t"\
- - "palignr $8, %%xmm0, %%xmm1 \n\t"\
- - "palignr $10,%%xmm0, %%xmm5 \n\t"\
- - "paddw %%xmm5, %%xmm0 \n\t"\
- - "paddw %%xmm3, %%xmm2 \n\t"\
- - "paddw %%xmm4, %%xmm1 \n\t"\
- - "psllw $2, %%xmm2 \n\t"\
- - "movq (%2), %%xmm3 \n\t"\
- - "psubw %%xmm1, %%xmm2 \n\t"\
- - "paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\
- - "pmullw %%xmm6, %%xmm2 \n\t"\
- - "paddw %%xmm0, %%xmm2 \n\t"\
- - "psraw $5, %%xmm2 \n\t"\
- - "packuswb %%xmm2, %%xmm2 \n\t"\
- - "pavgb %%xmm3, %%xmm2 \n\t"\
- - OP(%%xmm2, (%1), %%xmm4, q)\
- - "add %5, %0 \n\t"\
- - "add %5, %1 \n\t"\
- - "add %4, %2 \n\t"\
- - "decl %3 \n\t"\
- - "jg 1b \n\t"\
- - : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
- - : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
- - : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
- - "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
- - "memory"\
- - );\
- -}\
- -QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
- -\
- -static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- - int h=8;\
- - __asm__ volatile(\
- - "pxor %%xmm7, %%xmm7 \n\t"\
- - "movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\
- - "1: \n\t"\
- - "lddqu -2(%0), %%xmm1 \n\t"\
- - "movdqa %%xmm1, %%xmm0 \n\t"\
- - "punpckhbw %%xmm7, %%xmm1 \n\t"\
- - "punpcklbw %%xmm7, %%xmm0 \n\t"\
- - "movdqa %%xmm1, %%xmm2 \n\t"\
- - "movdqa %%xmm1, %%xmm3 \n\t"\
- - "movdqa %%xmm1, %%xmm4 \n\t"\
- - "movdqa %%xmm1, %%xmm5 \n\t"\
- - "palignr $2, %%xmm0, %%xmm4 \n\t"\
- - "palignr $4, %%xmm0, %%xmm3 \n\t"\
- - "palignr $6, %%xmm0, %%xmm2 \n\t"\
- - "palignr $8, %%xmm0, %%xmm1 \n\t"\
- - "palignr $10,%%xmm0, %%xmm5 \n\t"\
- - "paddw %%xmm5, %%xmm0 \n\t"\
- - "paddw %%xmm3, %%xmm2 \n\t"\
- - "paddw %%xmm4, %%xmm1 \n\t"\
- - "psllw $2, %%xmm2 \n\t"\
- - "psubw %%xmm1, %%xmm2 \n\t"\
- - "paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\
- - "pmullw %%xmm6, %%xmm2 \n\t"\
- - "paddw %%xmm0, %%xmm2 \n\t"\
- - "psraw $5, %%xmm2 \n\t"\
- - "packuswb %%xmm2, %%xmm2 \n\t"\
- - OP(%%xmm2, (%1), %%xmm4, q)\
- - "add %3, %0 \n\t"\
- - "add %4, %1 \n\t"\
- - "decl %2 \n\t"\
- - " jnz 1b \n\t"\
- - : "+a"(src), "+c"(dst), "+g"(h)\
- - : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
- - : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
- - "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
- - "memory"\
- - );\
- -}\
- -static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
- - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
- - src += 8*srcStride;\
- - dst += 8*dstStride;\
- - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
- - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
- -}\
- -
- -#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
- -static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
- - src -= 2*srcStride;\
- - \
- - __asm__ volatile(\
- - "pxor %%xmm7, %%xmm7 \n\t"\
- - "movq (%0), %%xmm0 \n\t"\
- - "add %2, %0 \n\t"\
- - "movq (%0), %%xmm1 \n\t"\
- - "add %2, %0 \n\t"\
- - "movq (%0), %%xmm2 \n\t"\
- - "add %2, %0 \n\t"\
- - "movq (%0), %%xmm3 \n\t"\
- - "add %2, %0 \n\t"\
- - "movq (%0), %%xmm4 \n\t"\
- - "add %2, %0 \n\t"\
- - "punpcklbw %%xmm7, %%xmm0 \n\t"\
- - "punpcklbw %%xmm7, %%xmm1 \n\t"\
- - "punpcklbw %%xmm7, %%xmm2 \n\t"\
- - "punpcklbw %%xmm7, %%xmm3 \n\t"\
- - "punpcklbw %%xmm7, %%xmm4 \n\t"\
- - QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
- - QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
- - QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
- - QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
- - QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
- - QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
- - QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
- - QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
- - "cmpl $16, %4 \n\t"\
- - "jne 2f \n\t"\
- - QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
- - QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
- - QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
- - QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
- - QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
- - QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
- - QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
- - QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
- - "2: \n\t"\
- - \
- - : "+a"(src), "+c"(dst)\
- - : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "rm"(h)\
- - : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
- - "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
- - "memory"\
- - );\
- -}\
- -static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
- -}\
- -static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
- - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
- -}
- -
- -static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
- - int w = (size+8)>>3;
- - src -= 2*srcStride+2;
- - while(w--){
- - __asm__ volatile(
- - "pxor %%xmm7, %%xmm7 \n\t"
- - "movq (%0), %%xmm0 \n\t"
- - "add %2, %0 \n\t"
- - "movq (%0), %%xmm1 \n\t"
- - "add %2, %0 \n\t"
- - "movq (%0), %%xmm2 \n\t"
- - "add %2, %0 \n\t"
- - "movq (%0), %%xmm3 \n\t"
- - "add %2, %0 \n\t"
- - "movq (%0), %%xmm4 \n\t"
- - "add %2, %0 \n\t"
- - "punpcklbw %%xmm7, %%xmm0 \n\t"
- - "punpcklbw %%xmm7, %%xmm1 \n\t"
- - "punpcklbw %%xmm7, %%xmm2 \n\t"
- - "punpcklbw %%xmm7, %%xmm3 \n\t"
- - "punpcklbw %%xmm7, %%xmm4 \n\t"
- - QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48)
- - QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48)
- - QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)
- - QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)
- - QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)
- - QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)
- - QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)
- - QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
- - "cmpl $16, %3 \n\t"
- - "jne 2f \n\t"
- - QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48)
- - QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48)
- - QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
- - QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
- - QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
- - QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)
- - QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
- - QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
- - "2: \n\t"
- - : "+a"(src)
- - : "c"(tmp), "S"((x86_reg)srcStride), "rm"(size)
- - : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",
- - "%xmm4", "%xmm5", "%xmm6", "%xmm7",)
- - "memory"
- - );
- - tmp += 8;
- - src += 8 - (size+5)*srcStride;
- - }
- -}
- -
- -#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\
- -static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
- - int h = size;\
- - if(size == 16){\
- - __asm__ volatile(\
- - "1: \n\t"\
- - "movdqa 32(%0), %%xmm4 \n\t"\
- - "movdqa 16(%0), %%xmm5 \n\t"\
- - "movdqa (%0), %%xmm7 \n\t"\
- - "movdqa %%xmm4, %%xmm3 \n\t"\
- - "movdqa %%xmm4, %%xmm2 \n\t"\
- - "movdqa %%xmm4, %%xmm1 \n\t"\
- - "movdqa %%xmm4, %%xmm0 \n\t"\
- - "palignr $10, %%xmm5, %%xmm0 \n\t"\
- - "palignr $8, %%xmm5, %%xmm1 \n\t"\
- - "palignr $6, %%xmm5, %%xmm2 \n\t"\
- - "palignr $4, %%xmm5, %%xmm3 \n\t"\
- - "palignr $2, %%xmm5, %%xmm4 \n\t"\
- - "paddw %%xmm5, %%xmm0 \n\t"\
- - "paddw %%xmm4, %%xmm1 \n\t"\
- - "paddw %%xmm3, %%xmm2 \n\t"\
- - "movdqa %%xmm5, %%xmm6 \n\t"\
- - "movdqa %%xmm5, %%xmm4 \n\t"\
- - "movdqa %%xmm5, %%xmm3 \n\t"\
- - "palignr $8, %%xmm7, %%xmm4 \n\t"\
- - "palignr $2, %%xmm7, %%xmm6 \n\t"\
- - "palignr $10, %%xmm7, %%xmm3 \n\t"\
- - "paddw %%xmm6, %%xmm4 \n\t"\
- - "movdqa %%xmm5, %%xmm6 \n\t"\
- - "palignr $6, %%xmm7, %%xmm5 \n\t"\
- - "palignr $4, %%xmm7, %%xmm6 \n\t"\
- - "paddw %%xmm7, %%xmm3 \n\t"\
- - "paddw %%xmm6, %%xmm5 \n\t"\
- - \
- - "psubw %%xmm1, %%xmm0 \n\t"\
- - "psubw %%xmm4, %%xmm3 \n\t"\
- - "psraw $2, %%xmm0 \n\t"\
- - "psraw $2, %%xmm3 \n\t"\
- - "psubw %%xmm1, %%xmm0 \n\t"\
- - "psubw %%xmm4, %%xmm3 \n\t"\
- - "paddw %%xmm2, %%xmm0 \n\t"\
- - "paddw %%xmm5, %%xmm3 \n\t"\
- - "psraw $2, %%xmm0 \n\t"\
- - "psraw $2, %%xmm3 \n\t"\
- - "paddw %%xmm2, %%xmm0 \n\t"\
- - "paddw %%xmm5, %%xmm3 \n\t"\
- - "psraw $6, %%xmm0 \n\t"\
- - "psraw $6, %%xmm3 \n\t"\
- - "packuswb %%xmm0, %%xmm3 \n\t"\
- - OP(%%xmm3, (%1), %%xmm7, dqa)\
- - "add $48, %0 \n\t"\
- - "add %3, %1 \n\t"\
- - "decl %2 \n\t"\
- - " jnz 1b \n\t"\
- - : "+a"(tmp), "+c"(dst), "+g"(h)\
- - : "S"((x86_reg)dstStride)\
- - : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
- - "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
- - "memory"\
- - );\
- - }else{\
- - __asm__ volatile(\
- - "1: \n\t"\
- - "movdqa 16(%0), %%xmm1 \n\t"\
- - "movdqa (%0), %%xmm0 \n\t"\
- - "movdqa %%xmm1, %%xmm2 \n\t"\
- - "movdqa %%xmm1, %%xmm3 \n\t"\
- - "movdqa %%xmm1, %%xmm4 \n\t"\
- - "movdqa %%xmm1, %%xmm5 \n\t"\
- - "palignr $10, %%xmm0, %%xmm5 \n\t"\
- - "palignr $8, %%xmm0, %%xmm4 \n\t"\
- - "palignr $6, %%xmm0, %%xmm3 \n\t"\
- - "palignr $4, %%xmm0, %%xmm2 \n\t"\
- - "palignr $2, %%xmm0, %%xmm1 \n\t"\
- - "paddw %%xmm5, %%xmm0 \n\t"\
- - "paddw %%xmm4, %%xmm1 \n\t"\
- - "paddw %%xmm3, %%xmm2 \n\t"\
- - "psubw %%xmm1, %%xmm0 \n\t"\
- - "psraw $2, %%xmm0 \n\t"\
- - "psubw %%xmm1, %%xmm0 \n\t"\
- - "paddw %%xmm2, %%xmm0 \n\t"\
- - "psraw $2, %%xmm0 \n\t"\
- - "paddw %%xmm2, %%xmm0 \n\t"\
- - "psraw $6, %%xmm0 \n\t"\
- - "packuswb %%xmm0, %%xmm0 \n\t"\
- - OP(%%xmm0, (%1), %%xmm7, q)\
- - "add $48, %0 \n\t"\
- - "add %3, %1 \n\t"\
- - "decl %2 \n\t"\
- - " jnz 1b \n\t"\
- - : "+a"(tmp), "+c"(dst), "+g"(h)\
- - : "S"((x86_reg)dstStride)\
- - : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
- - "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
- - "memory"\
- - );\
- - }\
- -}
- -
- -#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
- -static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
- - put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
- - OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
- -}\
- -static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
- - OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
- -}\
- -static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
- - OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
- -}\
- -
- -#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2
- -#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2
- -#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2
- -#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2
- -#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2
- -#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2
- -#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2
- -#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2
- -
- -#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2
- -#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2
- -#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2
- -#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2
- -#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2
- -#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2
- -#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2
- -#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2
- -
- -#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2
- -#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2
- -#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2
- -#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2
- -
- -#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2
- -#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2
- -#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2
- -#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2
- -
- -#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2
- -#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2
- -
- -#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
- -H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
- -H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
- -H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
- -H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\
- -
- -static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
- - put_pixels16_sse2(dst, src, stride, 16);
- -}
- -static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
- - avg_pixels16_sse2(dst, src, stride, 16);
- -}
- -#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2
- -#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2
- -
- -#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
- -static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
- - OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
- -}\
- -
- -#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
- -static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
- -}\
- -\
- -static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
- -}\
- -\
- -static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
- -}\
- -
- -#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
- -static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
- - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
- - OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
- -}\
- -\
- -static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- - OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
- -}\
- -\
- -static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
- - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
- - OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
- -}\
- -
- -#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
- -static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
- - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
- - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
- -}\
- -\
- -static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
- - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
- - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
- -}\
- -\
- -static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
- - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
- - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
- -}\
- -\
- -static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
- - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
- - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
- -}\
- -\
- -static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- - DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\
- - OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
- -}\
- -\
- -static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
- - uint8_t * const halfHV= temp;\
- - int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
- - assert(((int)temp & 7) == 0);\
- - put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
- - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
- -}\
- -\
- -static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
- - uint8_t * const halfHV= temp;\
- - int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
- - assert(((int)temp & 7) == 0);\
- - put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
- - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
- -}\
- -\
- -static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
- - uint8_t * const halfHV= temp;\
- - int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
- - assert(((int)temp & 7) == 0);\
- - put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
- - OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
- -}\
- -\
- -static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
- - uint8_t * const halfHV= temp;\
- - int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
- - assert(((int)temp & 7) == 0);\
- - put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
- - OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
- -}\
- -
- -#define H264_MC_4816(MMX)\
- -H264_MC(put_, 4, MMX, 8)\
- -H264_MC(put_, 8, MMX, 8)\
- -H264_MC(put_, 16,MMX, 8)\
- -H264_MC(avg_, 4, MMX, 8)\
- -H264_MC(avg_, 8, MMX, 8)\
- -H264_MC(avg_, 16,MMX, 8)\
- -
- -#define H264_MC_816(QPEL, XMM)\
- -QPEL(put_, 8, XMM, 16)\
- -QPEL(put_, 16,XMM, 16)\
- -QPEL(avg_, 8, XMM, 16)\
- -QPEL(avg_, 16,XMM, 16)\
- -
- -
- -#define AVG_3DNOW_OP(a,b,temp, size) \
- -"mov" #size " " #b ", " #temp " \n\t"\
- -"pavgusb " #temp ", " #a " \n\t"\
- -"mov" #size " " #a ", " #b " \n\t"
- -#define AVG_MMX2_OP(a,b,temp, size) \
- -"mov" #size " " #b ", " #temp " \n\t"\
- -"pavgb " #temp ", " #a " \n\t"\
- -"mov" #size " " #a ", " #b " \n\t"
- -
- -#define PAVGB "pavgusb"
- -QPEL_H264(put_, PUT_OP, 3dnow)
- -QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
- -#undef PAVGB
- -#define PAVGB "pavgb"
- -QPEL_H264(put_, PUT_OP, mmx2)
- -QPEL_H264(avg_, AVG_MMX2_OP, mmx2)
- -QPEL_H264_V_XMM(put_, PUT_OP, sse2)
- -QPEL_H264_V_XMM(avg_, AVG_MMX2_OP, sse2)
- -QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
- -QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, sse2)
- -#if HAVE_SSSE3
- -QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
- -QPEL_H264_H_XMM(avg_, AVG_MMX2_OP, ssse3)
- -QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3)
- -QPEL_H264_HV2_XMM(avg_, AVG_MMX2_OP, ssse3)
- -QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
- -QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, ssse3)
- -#endif
- -#undef PAVGB
- -
- -H264_MC_4816(3dnow)
- -H264_MC_4816(mmx2)
- -H264_MC_816(H264_MC_V, sse2)
- -H264_MC_816(H264_MC_HV, sse2)
- -#if HAVE_SSSE3
- -H264_MC_816(H264_MC_H, ssse3)
- -H264_MC_816(H264_MC_HV, ssse3)
- -#endif
- -
- -
- -
- //10bit
- #define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \
- -void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT \
- +void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## DEPTH ## _ ## OPT \
- (uint8_t *dst, uint8_t *src, int stride);
- #define LUMA_MC_ALL(DEPTH, TYPE, OPT) \
- @@ -1222,78 +40,146 @@ void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT \
- LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
- LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)
- -LUMA_MC_ALL(10, mc00, mmxext)
- -LUMA_MC_ALL(10, mc10, mmxext)
- -LUMA_MC_ALL(10, mc20, mmxext)
- -LUMA_MC_ALL(10, mc30, mmxext)
- -LUMA_MC_ALL(10, mc01, mmxext)
- -LUMA_MC_ALL(10, mc11, mmxext)
- -LUMA_MC_ALL(10, mc21, mmxext)
- -LUMA_MC_ALL(10, mc31, mmxext)
- -LUMA_MC_ALL(10, mc02, mmxext)
- -LUMA_MC_ALL(10, mc12, mmxext)
- -LUMA_MC_ALL(10, mc22, mmxext)
- -LUMA_MC_ALL(10, mc32, mmxext)
- -LUMA_MC_ALL(10, mc03, mmxext)
- -LUMA_MC_ALL(10, mc13, mmxext)
- -LUMA_MC_ALL(10, mc23, mmxext)
- -LUMA_MC_ALL(10, mc33, mmxext)
- -
- -LUMA_MC_816(10, mc00, sse2)
- -LUMA_MC_816(10, mc10, sse2)
- -LUMA_MC_816(10, mc10, sse2_cache64)
- -LUMA_MC_816(10, mc10, ssse3_cache64)
- -LUMA_MC_816(10, mc20, sse2)
- -LUMA_MC_816(10, mc20, sse2_cache64)
- -LUMA_MC_816(10, mc20, ssse3_cache64)
- -LUMA_MC_816(10, mc30, sse2)
- -LUMA_MC_816(10, mc30, sse2_cache64)
- -LUMA_MC_816(10, mc30, ssse3_cache64)
- -LUMA_MC_816(10, mc01, sse2)
- -LUMA_MC_816(10, mc11, sse2)
- -LUMA_MC_816(10, mc21, sse2)
- -LUMA_MC_816(10, mc31, sse2)
- -LUMA_MC_816(10, mc02, sse2)
- -LUMA_MC_816(10, mc12, sse2)
- -LUMA_MC_816(10, mc22, sse2)
- -LUMA_MC_816(10, mc32, sse2)
- -LUMA_MC_816(10, mc03, sse2)
- -LUMA_MC_816(10, mc13, sse2)
- -LUMA_MC_816(10, mc23, sse2)
- -LUMA_MC_816(10, mc33, sse2)
- -
- -#define QPEL16_OPMC(OP, MC, MMX)\
- -void ff_ ## OP ## _h264_qpel16_ ## MC ## _10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- - ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\
- - ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
- +LUMA_MC_ALL(, mc00, mmxext)
- +LUMA_MC_ALL(, mc01, mmxext)
- +LUMA_MC_ALL(, mc02, mmxext)
- +LUMA_MC_ALL(, mc03, mmxext)
- +LUMA_MC_ALL(, mc10, mmxext)
- +LUMA_MC_ALL(, mc11, mmxext)
- +LUMA_MC_ALL(, mc12, mmxext)
- +LUMA_MC_ALL(, mc13, mmxext)
- +LUMA_MC_ALL(, mc20, mmxext)
- +LUMA_MC_ALL(, mc21, mmxext)
- +LUMA_MC_ALL(, mc22, mmxext)
- +LUMA_MC_ALL(, mc23, mmxext)
- +LUMA_MC_ALL(, mc30, mmxext)
- +LUMA_MC_ALL(, mc31, mmxext)
- +LUMA_MC_ALL(, mc32, mmxext)
- +LUMA_MC_ALL(, mc33, mmxext)
- +
- +#define ff_put_h264_qpel8_mc00_sse2 ff_put_h264_qpel8_mc00_mmxext
- +#define ff_avg_h264_qpel8_mc00_sse2 ff_avg_h264_qpel8_mc00_mmxext
- +LUMA_MC_ALL(, mc00, sse2)
- +LUMA_MC_ALL(, mc01, sse2)
- +LUMA_MC_ALL(, mc02, sse2)
- +LUMA_MC_ALL(, mc03, sse2)
- +LUMA_MC_ALL(, mc10, sse2)
- +LUMA_MC_ALL(, mc11, sse2)
- +LUMA_MC_ALL(, mc12, sse2)
- +LUMA_MC_ALL(, mc13, sse2)
- +LUMA_MC_ALL(, mc20, sse2)
- +LUMA_MC_ALL(, mc21, sse2)
- +LUMA_MC_ALL(, mc22, sse2)
- +LUMA_MC_ALL(, mc23, sse2)
- +LUMA_MC_ALL(, mc30, sse2)
- +LUMA_MC_ALL(, mc31, sse2)
- +LUMA_MC_ALL(, mc32, sse2)
- +LUMA_MC_ALL(, mc33, sse2)
- +
- +LUMA_MC_ALL(_10, mc00, mmxext)
- +LUMA_MC_ALL(_10, mc10, mmxext)
- +LUMA_MC_ALL(_10, mc20, mmxext)
- +LUMA_MC_ALL(_10, mc30, mmxext)
- +LUMA_MC_ALL(_10, mc01, mmxext)
- +LUMA_MC_ALL(_10, mc11, mmxext)
- +LUMA_MC_ALL(_10, mc21, mmxext)
- +LUMA_MC_ALL(_10, mc31, mmxext)
- +LUMA_MC_ALL(_10, mc02, mmxext)
- +LUMA_MC_ALL(_10, mc12, mmxext)
- +LUMA_MC_ALL(_10, mc22, mmxext)
- +LUMA_MC_ALL(_10, mc32, mmxext)
- +LUMA_MC_ALL(_10, mc03, mmxext)
- +LUMA_MC_ALL(_10, mc13, mmxext)
- +LUMA_MC_ALL(_10, mc23, mmxext)
- +LUMA_MC_ALL(_10, mc33, mmxext)
- +
- +LUMA_MC_816(_10, mc00, sse2)
- +LUMA_MC_816(_10, mc10, sse2)
- +LUMA_MC_816(_10, mc10, sse2_cache64)
- +LUMA_MC_816(_10, mc10, ssse3_cache64)
- +LUMA_MC_816(_10, mc20, sse2)
- +LUMA_MC_816(_10, mc20, sse2_cache64)
- +LUMA_MC_816(_10, mc20, ssse3_cache64)
- +LUMA_MC_816(_10, mc30, sse2)
- +LUMA_MC_816(_10, mc30, sse2_cache64)
- +LUMA_MC_816(_10, mc30, ssse3_cache64)
- +LUMA_MC_816(_10, mc01, sse2)
- +LUMA_MC_816(_10, mc11, sse2)
- +LUMA_MC_816(_10, mc21, sse2)
- +LUMA_MC_816(_10, mc31, sse2)
- +LUMA_MC_816(_10, mc02, sse2)
- +LUMA_MC_816(_10, mc12, sse2)
- +LUMA_MC_816(_10, mc22, sse2)
- +LUMA_MC_816(_10, mc32, sse2)
- +LUMA_MC_816(_10, mc03, sse2)
- +LUMA_MC_816(_10, mc13, sse2)
- +LUMA_MC_816(_10, mc23, sse2)
- +LUMA_MC_816(_10, mc33, sse2)
- +
- +#define QPEL16_OPMC(OP, MC, MMX, DEPTH, OFFSET)\
- +void ff_ ## OP ## _h264_qpel16_ ## MC ## DEPTH ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- + ff_ ## OP ## _h264_qpel8_ ## MC ## DEPTH ## _ ## MMX(dst , src , stride);\
- + ff_ ## OP ## _h264_qpel8_ ## MC ## DEPTH ## _ ## MMX(dst+OFFSET, src+OFFSET, stride);\
- src += 8*stride;\
- dst += 8*stride;\
- - ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\
- - ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
- + ff_ ## OP ## _h264_qpel8_ ## MC ## DEPTH ## _ ## MMX(dst , src , stride);\
- + ff_ ## OP ## _h264_qpel8_ ## MC ## DEPTH ## _ ## MMX(dst+OFFSET, src+OFFSET, stride);\
- }
- -#define QPEL16_OP(MC, MMX)\
- -QPEL16_OPMC(put, MC, MMX)\
- -QPEL16_OPMC(avg, MC, MMX)
- -
- -#define QPEL16(MMX)\
- -QPEL16_OP(mc00, MMX)\
- -QPEL16_OP(mc01, MMX)\
- -QPEL16_OP(mc02, MMX)\
- -QPEL16_OP(mc03, MMX)\
- -QPEL16_OP(mc10, MMX)\
- -QPEL16_OP(mc11, MMX)\
- -QPEL16_OP(mc12, MMX)\
- -QPEL16_OP(mc13, MMX)\
- -QPEL16_OP(mc20, MMX)\
- -QPEL16_OP(mc21, MMX)\
- -QPEL16_OP(mc22, MMX)\
- -QPEL16_OP(mc23, MMX)\
- -QPEL16_OP(mc30, MMX)\
- -QPEL16_OP(mc31, MMX)\
- -QPEL16_OP(mc32, MMX)\
- -QPEL16_OP(mc33, MMX)
- +#define QPEL16_OP(MC, MMX, DEPTH, OFFSET)\
- +QPEL16_OPMC(put, MC, MMX, DEPTH, OFFSET)\
- +QPEL16_OPMC(avg, MC, MMX, DEPTH, OFFSET)
- +
- +#define QPEL16(MMX, DEPTH, OFFSET)\
- +QPEL16_OP(mc00, MMX, DEPTH, OFFSET)\
- +QPEL16_OP(mc01, MMX, DEPTH, OFFSET)\
- +QPEL16_OP(mc02, MMX, DEPTH, OFFSET)\
- +QPEL16_OP(mc03, MMX, DEPTH, OFFSET)\
- +QPEL16_OP(mc10, MMX, DEPTH, OFFSET)\
- +QPEL16_OP(mc11, MMX, DEPTH, OFFSET)\
- +QPEL16_OP(mc12, MMX, DEPTH, OFFSET)\
- +QPEL16_OP(mc13, MMX, DEPTH, OFFSET)\
- +QPEL16_OP(mc20, MMX, DEPTH, OFFSET)\
- +QPEL16_OP(mc21, MMX, DEPTH, OFFSET)\
- +QPEL16_OP(mc22, MMX, DEPTH, OFFSET)\
- +QPEL16_OP(mc23, MMX, DEPTH, OFFSET)\
- +QPEL16_OP(mc30, MMX, DEPTH, OFFSET)\
- +QPEL16_OP(mc31, MMX, DEPTH, OFFSET)\
- +QPEL16_OP(mc32, MMX, DEPTH, OFFSET)\
- +QPEL16_OP(mc33, MMX, DEPTH, OFFSET)
- #if ARCH_X86_32 && HAVE_YASM // ARCH_X86_64 implies sse2+
- -QPEL16(mmxext)
- +QPEL16(mmxext, _10, 16)
- +
- +QPEL16(mmxext, , 8)
- #endif
- +
- +#define H264_MC_TMP(OPNAME, MC, SIZE, MMX) \
- +static void OPNAME ## _h264_qpel ## SIZE ## _ ## MC ## _ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
- + ff_ ## OPNAME ## _h264_qpel ## SIZE ## _ ## MC ## _ ## MMX(dst,src,stride);\
- +}\
- +
- +#define H264_MC_TMP_ALL(MC, MMX) \
- +H264_MC_TMP(put, MC, 4, MMX);\
- +H264_MC_TMP(avg, MC, 4, MMX);\
- +H264_MC_TMP(put, MC, 8, MMX);\
- +H264_MC_TMP(avg, MC, 8, MMX);\
- +H264_MC_TMP(put, MC, 16, MMX);\
- +H264_MC_TMP(avg, MC, 16, MMX);
- +
- +H264_MC_TMP_ALL(mc00, mmxext)
- +H264_MC_TMP_ALL(mc01, mmxext)
- +H264_MC_TMP_ALL(mc02, mmxext)
- +H264_MC_TMP_ALL(mc03, mmxext)
- +H264_MC_TMP_ALL(mc10, mmxext)
- +H264_MC_TMP_ALL(mc11, mmxext)
- +H264_MC_TMP_ALL(mc12, mmxext)
- +H264_MC_TMP_ALL(mc13, mmxext)
- +H264_MC_TMP_ALL(mc20, mmxext)
- +H264_MC_TMP_ALL(mc21, mmxext)
- +H264_MC_TMP_ALL(mc22, mmxext)
- +H264_MC_TMP_ALL(mc23, mmxext)
- +H264_MC_TMP_ALL(mc30, mmxext)
- +H264_MC_TMP_ALL(mc31, mmxext)
- +H264_MC_TMP_ALL(mc32, mmxext)
- +H264_MC_TMP_ALL(mc33, mmxext)
- --
- 1.7.5.1
Add Comment
Please sign in to add a comment