Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- From 489134cf2343edbb65bb17d2ed99dc2c114657bf Mon Sep 17 00:00:00 2001
- From: Daniel Kang <daniel.d.kang@gmail.com>
- Date: Wed, 22 Jun 2011 17:40:50 -0400
- Subject: [PATCH 1/5] luma mc first pass done \o/
- ---
- libavcodec/x86/Makefile | 1 +
- libavcodec/x86/dsputil_mmx.c | 47 ++
- libavcodec/x86/h264_qpel_10bit.asm | 813 ++++++++++++++++++++++++++++++++++
- libavcodec/x86/h264_qpel_mmx_10bit.c | 141 ++++++
- 4 files changed, 1002 insertions(+), 0 deletions(-)
- create mode 100644 libavcodec/x86/h264_qpel_10bit.asm
- create mode 100755 libavcodec/x86/h264_qpel_mmx_10bit.c
- diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
- index 022ab27..d3cf0da 100644
- --- a/libavcodec/x86/Makefile
- +++ b/libavcodec/x86/Makefile
- @@ -46,6 +46,7 @@ MMX-OBJS-$(HAVE_YASM) += x86/dsputil_yasm.o \
- x86/fmtconvert.o \
- x86/h264_chromamc.o \
- x86/h264_chromamc_10bit.o \
- + x86/h264_qpel_10bit.o \
- $(YASM-OBJS-yes)
- MMX-OBJS-$(CONFIG_FFT) += x86/fft.o
- diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
- index 5c5ecb2..43ff26a 100644
- --- a/libavcodec/x86/dsputil_mmx.c
- +++ b/libavcodec/x86/dsputil_mmx.c
- @@ -1896,6 +1896,7 @@ PREFETCH(prefetch_3dnow, prefetch)
- #undef PREFETCH
- #include "h264_qpel_mmx.c"
- +#include "h264_qpel_mmx_10bit.c"
- void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
- @@ -2649,6 +2650,33 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
- SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2);
- SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2);
- }
- +#if HAVE_YASM
- +#define SET_QPEL_FUNCS_10(PFX, IDX, SIZE, CPU) \
- + c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## SIZE ## _mc00_10_ ## CPU; \
- + c->PFX ## _pixels_tab[IDX][ 1] = ff_ ## PFX ## SIZE ## _mc10_10_ ## CPU; \
- + c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## SIZE ## _mc20_10_ ## CPU; \
- + c->PFX ## _pixels_tab[IDX][ 3] = ff_ ## PFX ## SIZE ## _mc30_10_ ## CPU; \
- + c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## SIZE ## _mc01_10_ ## CPU; \
- + c->PFX ## _pixels_tab[IDX][ 5] = ff_ ## PFX ## SIZE ## _mc11_10_ ## CPU; \
- + c->PFX ## _pixels_tab[IDX][ 6] = ff_ ## PFX ## SIZE ## _mc21_10_ ## CPU; \
- + c->PFX ## _pixels_tab[IDX][ 7] = ff_ ## PFX ## SIZE ## _mc31_10_ ## CPU; \
- + c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## SIZE ## _mc02_10_ ## CPU; \
- + c->PFX ## _pixels_tab[IDX][ 9] = ff_ ## PFX ## SIZE ## _mc12_10_ ## CPU; \
- + c->PFX ## _pixels_tab[IDX][10] = ff_ ## PFX ## SIZE ## _mc22_10_ ## CPU; \
- + c->PFX ## _pixels_tab[IDX][11] = ff_ ## PFX ## SIZE ## _mc32_10_ ## CPU; \
- + c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## SIZE ## _mc03_10_ ## CPU; \
- + c->PFX ## _pixels_tab[IDX][13] = ff_ ## PFX ## SIZE ## _mc13_10_ ## CPU; \
- + c->PFX ## _pixels_tab[IDX][14] = ff_ ## PFX ## SIZE ## _mc23_10_ ## CPU; \
- + c->PFX ## _pixels_tab[IDX][15] = ff_ ## PFX ## SIZE ## _mc33_10_ ## CPU
- + else if (bit_depth == 10) {
- + SET_QPEL_FUNCS_10(put_h264_qpel, 0, 16, mmxext);
- + SET_QPEL_FUNCS_10(put_h264_qpel, 1, 8, mmxext);
- + SET_QPEL_FUNCS_10(put_h264_qpel, 2, 4, mmxext);
- + SET_QPEL_FUNCS_10(avg_h264_qpel, 0, 16, mmxext);
- + SET_QPEL_FUNCS_10(avg_h264_qpel, 1, 8, mmxext);
- + SET_QPEL_FUNCS_10(avg_h264_qpel, 2, 4, mmxext);
- + }
- +#endif
- SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2);
- SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2);
- @@ -2777,7 +2805,26 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
- H264_QPEL_FUNCS(3, 3, sse2);
- }
- #if HAVE_YASM
- +#define H264_QPEL_FUNCS_10(x, y, CPU)\
- + c->put_h264_qpel_pixels_tab[0][x+y*4] = ff_put_h264_qpel16_mc##x##y##_10_##CPU;\
- + c->put_h264_qpel_pixels_tab[1][x+y*4] = ff_put_h264_qpel8_mc##x##y##_10_##CPU;\
- + c->avg_h264_qpel_pixels_tab[0][x+y*4] = ff_avg_h264_qpel16_mc##x##y##_10_##CPU;\
- + c->avg_h264_qpel_pixels_tab[1][x+y*4] = ff_avg_h264_qpel8_mc##x##y##_10_##CPU;
- if (bit_depth == 10) {
- + H264_QPEL_FUNCS_10(0, 0, sse2);
- + H264_QPEL_FUNCS_10(0, 1, sse2);
- + H264_QPEL_FUNCS_10(0, 2, sse2);
- + H264_QPEL_FUNCS_10(0, 3, sse2);
- + H264_QPEL_FUNCS_10(1, 1, sse2);
- + H264_QPEL_FUNCS_10(1, 2, sse2);
- + H264_QPEL_FUNCS_10(1, 3, sse2);
- + H264_QPEL_FUNCS_10(2, 1, sse2);
- + H264_QPEL_FUNCS_10(2, 2, sse2);
- + H264_QPEL_FUNCS_10(2, 3, sse2);
- + H264_QPEL_FUNCS_10(3, 1, sse2);
- + H264_QPEL_FUNCS_10(3, 2, sse2);
- + H264_QPEL_FUNCS_10(3, 3, sse2);
- +
- c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_sse2;
- c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_sse2;
- }
- diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm
- new file mode 100644
- index 0000000..fc44e85
- --- /dev/null
- +++ b/libavcodec/x86/h264_qpel_10bit.asm
- @@ -0,0 +1,813 @@
- +;*****************************************************************************
- +;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code
- +;*****************************************************************************
- +;* Copyright (C) 2005-2011 x264 project
- +;*
- +;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
- +;*
- +;* This file is part of Libav.
- +;*
- +;* Libav is free software; you can redistribute it and/or
- +;* modify it under the terms of the GNU Lesser General Public
- +;* License as published by the Free Software Foundation; either
- +;* version 2.1 of the License, or (at your option) any later version.
- +;*
- +;* Libav is distributed in the hope that it will be useful,
- +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
- +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- +;* Lesser General Public License for more details.
- +;*
- +;* You should have received a copy of the GNU Lesser General Public
- +;* License along with Libav; if not, write to the Free Software
- +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- +;******************************************************************************
- +
- +%include "x86inc.asm"
- +%include "x86util.asm"
- +
- +SECTION_RODATA 32
- +
- +cextern pw_16
- +cextern pw_1
- +cextern pb_0
- +
- +pw_pixel_max: times 8 dw ((1 << 10)-1)
- +
- +pad10: times 8 dw 10*1023
- +pad20: times 8 dw 20*1023
- +pad30: times 8 dw 30*1023
- +depad: times 4 dd 32*20*1023 + 512
- +depad2: times 8 dw 20*1023 + 16*1022 + 16
- +unpad: times 8 dw 16*1022/32 ; needs to be mod 16
- +
- +tap1: times 4 dw 1, -5
- +tap2: times 4 dw 20, 20
- +tap3: times 4 dw -5, 1
- +pd_0f: times 4 dd 0xffff
- +
- +SECTION .text
- +
- +; All of the 2x2 functions are probably no faster than the C version.
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro MC00 1
- +INIT_MMX
- +cglobal %1_h264_qpel4_mc00_10_mmxext, 3,4
- + lea r3, [r2*3]
- + movq m0, [r1 ]
- + OP_MOV [r0 ], m0
- + movq m0, [r1+r2 ]
- + OP_MOV [r0+r2 ], m0
- + movq m0, [r1+r2*2]
- + OP_MOV [r0+r2*2], m0
- + movq m0, [r1+r3 ]
- + OP_MOV [r0+r3 ], m0
- + RET
- +
- +INIT_XMM
- +cglobal %1_h264_qpel8_mc00_10_sse2, 3,3
- +%rep 4
- + movu m0, [r1 ]
- + OP_MOV [r0 ], m0
- + movu m0, [r1+r2]
- + OP_MOV [r0+r2], m0
- + lea r0, [r0+r2*2]
- + lea r1, [r1+r2*2]
- +%endrep
- + RET
- +
- +cglobal %1_h264_qpel16_mc00_10_sse2, 3,3
- +%rep 8
- + movu m0, [r1 ]
- + movu m1, [r1 +16]
- + OP_MOV [r0 ], m0
- + OP_MOV [r0 +16], m1
- + movu m0, [r1+r2 ]
- + movu m1, [r1+r2+16]
- + OP_MOV [r0+r2 ], m0
- + OP_MOV [r0+r2+16], m1
- + lea r0, [r0+r2*2]
- + lea r1, [r1+r2*2]
- +%endrep
- + RET
- +%endmacro
- +
- +%macro AVG_MOV 2
- + pavgw %2, %1
- + mova %1, %2
- +%endmacro
- +
- +%define OP_MOV mova
- +MC00 put
- +
- +%define OP_MOV AVG_MOV
- +MC00 avg
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro FILT_H 4
- + paddw %1, %4
- + psubw %1, %2 ; a-b
- + psraw %1, 2 ; (a-b)/4
- + psubw %1, %2 ; (a-b)/4-b
- + paddw %1, %3 ; (a-b)/4-b+c
- + psraw %1, 2 ; ((a-b)/4-b+c)/4
- + paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- +%endmacro
- +
- +%macro ADDW 3
- +%if mmsize == 8
- + paddw %1, %2
- +%else
- + movu %3, %2
- + paddw %1, %3
- +%endif
- +%endmacro
- +
- +%macro MC20 3
- +cglobal %2_h264_qpel%3_mc20_10_%1, 3,4,7
- + mov r3d, %3
- + pxor m0, m0
- + mova m1, [pw_pixel_max]
- + mova m6, [pw_16]
- +.nextrow
- + movu m2, [r1-4]
- + movu m3, [r1-2]
- + movu m4, [r1+0]
- + ADDW m4, [r1+2], m5
- + ADDW m3, [r1+4], m5
- + ADDW m2, [r1+6], m5
- +
- + FILT_H m2, m3, m4, m6
- + psraw m2, 1
- + CLIPW m2, m0, m1
- + OP_MOV [r0], m2
- + add r0, r2
- + add r1, r2
- + dec r3d
- + jg .nextrow
- + REP_RET
- +%endmacro
- +
- +%define OP_MOV mova
- +INIT_MMX
- +MC20 mmxext, put, 4
- +INIT_XMM
- +MC20 sse2 , put, 8
- +
- +%define OP_MOV AVG_MOV
- +INIT_MMX
- +MC20 mmxext, avg, 4
- +INIT_XMM
- +MC20 sse2 , avg, 8
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro MC30 3
- +cglobal %2_h264_qpel%3_mc30_10_%1, 3,5,6
- + lea r4, [r1+2]
- + jmp mangle(ff_%2_h264_qpel%3_mc10_10_%1.body)
- +%endmacro
- +
- +%define OP_MOV mova
- +INIT_MMX
- +MC30 mmxext, put, 4
- +INIT_XMM
- +MC30 sse2 , put, 8
- +
- +%define OP_MOV AVG_MOV
- +INIT_MMX
- +MC30 mmxext, avg, 4
- +INIT_XMM
- +MC30 sse2 , avg, 8
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro MC10 3
- +cglobal %2_h264_qpel%3_mc10_10_%1, 3,5,7
- + mov r4, r1
- +.body
- + mov r3d, %3
- + pxor m0, m0
- + mova m1, [pw_pixel_max]
- + mova m6, [pw_16]
- +.nextrow
- + movu m2, [r1-4]
- + movu m3, [r1-2]
- + movu m4, [r1+0]
- + ADDW m4, [r1+2], m5
- + ADDW m3, [r1+4], m5
- + ADDW m2, [r1+6], m5
- +
- + FILT_H m2, m3, m4, m6
- + psraw m2, 1
- + CLIPW m2, m0, m1
- + movu m3, [r4]
- + pavgw m2, m3
- + OP_MOV [r0], m2
- + add r0, r2
- + add r1, r2
- + add r4, r2
- + dec r3d
- + jg .nextrow
- + REP_RET
- +%endmacro
- +
- +%define OP_MOV mova
- +INIT_MMX
- +MC10 mmxext, put, 4
- +INIT_XMM
- +MC10 sse2 , put, 8
- +
- +%define OP_MOV AVG_MOV
- +INIT_MMX
- +MC10 mmxext, avg, 4
- +INIT_XMM
- +MC10 sse2 , avg, 8
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro FILT_V 8
- + movu %6, [r1]
- + paddw %1, %6
- + mova %7, %2
- + paddw %7, %5
- + mova %8, %3
- + paddw %8, %4
- + FILT_H %1, %7, %8, [pw_16]
- + psraw %1, 1
- + CLIPW %1, [pb_0], [pw_pixel_max]
- +%endmacro
- +
- +%macro MC02 3
- +cglobal %2_h264_qpel%3_mc02_10_%1, 3,4,8
- + lea r3, [r2*2]
- + sub r1, r3
- + movu m0, [r1]
- + movu m1, [r1+r2]
- + add r1, r3
- + movu m2, [r1]
- + movu m3, [r1+r2]
- + add r1, r3
- + movu m4, [r1]
- + add r1, r2
- +
- +%rep %3-1
- + FILT_V m0, m1, m2, m3, m4, m5, m6, m7
- + OP_MOV [r0], m0
- + add r1, r2
- + add r0, r2
- + SWAP 0,1,2,3,4,5
- +%endrep
- + FILT_V m0, m1, m2, m3, m4, m5, m6, m7
- + OP_MOV [r0], m0
- + RET
- +%endmacro
- +
- +%define OP_MOV mova
- +INIT_MMX
- +MC02 mmxext, put, 4
- +INIT_XMM
- +MC02 sse2 , put, 8
- +
- +%define OP_MOV AVG_MOV
- +INIT_MMX
- +MC02 mmxext, avg, 4
- +INIT_XMM
- +MC02 sse2 , avg, 8
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro MC01 3
- +cglobal %2_h264_qpel%3_mc01_10_%1, 3,5,8
- + mov r4, r1
- +.body
- + lea r3, [r2*2]
- + sub r1, r3
- + movu m0, [r1]
- + movu m1, [r1+r2]
- + add r1, r3
- + movu m2, [r1]
- + movu m3, [r1+r2]
- + add r1, r3
- + movu m4, [r1]
- + add r1, r2
- +
- +%rep %3-1
- + FILT_V m0, m1, m2, m3, m4, m5, m6, m7
- + movu m7, [r4]
- + pavgw m0, m7
- + OP_MOV [r0], m0
- + add r4, r2
- + add r1, r2
- + add r0, r2
- + SWAP 0,1,2,3,4,5
- +%endrep
- + FILT_V m0, m1, m2, m3, m4, m5, m6, m7
- + movu m7, [r4]
- + pavgw m0, m7
- + OP_MOV [r0], m0
- + RET
- +%endmacro
- +
- +%define OP_MOV mova
- +INIT_MMX
- +MC01 mmxext, put, 4
- +INIT_XMM
- +MC01 sse2 , put, 8
- +
- +%define OP_MOV AVG_MOV
- +INIT_MMX
- +MC01 mmxext, avg, 4
- +INIT_XMM
- +MC01 sse2 , avg, 8
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro MC03 3
- +cglobal %2_h264_qpel%3_mc03_10_%1, 3,5,8
- + lea r4, [r1+r2]
- + jmp mangle(ff_%2_h264_qpel%3_mc01_10_%1.body)
- +%endmacro
- +
- +%define OP_MOV mova
- +INIT_MMX
- +MC03 mmxext, put, 4
- +INIT_XMM
- +MC03 sse2 , put, 8
- +
- +%define OP_MOV AVG_MOV
- +INIT_MMX
- +MC03 mmxext, avg, 4
- +INIT_XMM
- +MC03 sse2 , avg, 8
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro MC11 3
- +; this REALLY needs x86_64
- +cglobal %2_h264_qpel%3_mc11_10_%1, 3,5,8
- + mov r4, r1
- +.body
- + lea r3, [r2*2]
- + sub r1, r3
- + movu m0, [r1]
- + movu m1, [r1+r2]
- + add r1, r3
- + movu m2, [r1]
- + movu m3, [r1+r2]
- + add r1, r3
- + movu m4, [r1]
- + add r1, r2
- +
- +%assign i 0
- +%rep %3
- +%assign i i+1
- + FILT_V m0, m1, m2, m3, m4, m5, m6, m7
- +;now do FILT_H with fewer registers. probably faster than doing FILT_V then FILT_H
- +;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used next in the next iteration
- +;unfortunately I need three registers, so m5 will have to be re-read from memory
- + movu m5, [r4-4]
- + ADDW m5, [r4+6], m7
- + movu m6, [r4-2]
- + ADDW m6, [r4+4], m7
- + paddw m5, [pw_16]
- + psubw m5, m6 ; a-b
- + psraw m5, 2 ; (a-b)/4
- + psubw m5, m6 ; (a-b)/4-b
- +;now I need to load c...
- + movu m6, [r4+0]
- + ADDW m6, [r4+2], m7
- + paddw m5, m6 ; (a-b)/4-b+c
- + psraw m5, 2 ; ((a-b)/4-b+c)/4
- + paddw m5, m6 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- + psraw m5, 1
- + CLIPW m5, [pb_0], [pw_pixel_max]
- +;avg FILT_V, FILT_H and reload m5
- + pavgw m0, m5
- + OP_MOV [r0], m0
- +%if i<%3
- + movu m5, [r1]
- + add r4, r2
- + add r1, r2
- + add r0, r2
- + SWAP 0,1,2,3,4,5
- +%endif
- +%endrep
- + RET
- +%endmacro
- +
- +%define OP_MOV mova
- +INIT_MMX
- +MC11 mmxext, put, 4
- +INIT_XMM
- +MC11 sse2 , put, 8
- +
- +%define OP_MOV AVG_MOV
- +INIT_MMX
- +MC11 mmxext, avg, 4
- +INIT_XMM
- +MC11 sse2 , avg, 8
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro MC31 3
- +cglobal %2_h264_qpel%3_mc31_10_%1, 3,5,8
- + mov r4, r1
- + add r1, 2
- + jmp mangle(ff_%2_h264_qpel%3_mc11_10_%1.body)
- +%endmacro
- +
- +%define OP_MOV mova
- +INIT_MMX
- +MC31 mmxext, put, 4
- +INIT_XMM
- +MC31 sse2 , put, 8
- +
- +%define OP_MOV AVG_MOV
- +INIT_MMX
- +MC31 mmxext, avg, 4
- +INIT_XMM
- +MC31 sse2 , avg, 8
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro MC13 3
- +cglobal %2_h264_qpel%3_mc13_10_%1, 3,5,8
- + lea r4, [r1+r2]
- + jmp mangle(ff_%2_h264_qpel%3_mc11_10_%1.body)
- +%endmacro
- +
- +%define OP_MOV mova
- +INIT_MMX
- +MC13 mmxext, put, 4
- +INIT_XMM
- +MC13 sse2 , put, 8
- +
- +%define OP_MOV AVG_MOV
- +INIT_MMX
- +MC13 mmxext, avg, 4
- +INIT_XMM
- +MC13 sse2 , avg, 8
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro MC33 3
- +cglobal %2_h264_qpel%3_mc33_10_%1, 3,5,8
- + lea r4, [r1+r2]
- + add r1, 2
- + jmp mangle(ff_%2_h264_qpel%3_mc11_10_%1.body)
- +%endmacro
- +
- +%define OP_MOV mova
- +INIT_MMX
- +MC33 mmxext, put, 4
- +INIT_XMM
- +MC33 sse2 , put, 8
- +
- +%define OP_MOV AVG_MOV
- +INIT_MMX
- +MC33 mmxext, avg, 4
- +INIT_XMM
- +MC33 sse2 , avg, 8
- +
- +
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro FILT_H2 3
- + psubw %1, %2 ; a-b
- + psubw %2, %3 ; b-c
- + psllw %2, 2
- + psubw %1, %2 ; a-5*b+4*c
- + psllw %3, 4
- + paddw %1, %3 ; a-5*b+20*c
- +%endmacro
- +
- +%macro FILT_VNRD 8
- + movu %6, [r1]
- + paddw %1, %6
- + mova %7, %2
- + paddw %7, %5
- + mova %8, %3
- + paddw %8, %4
- + FILT_H2 %1, %7, %8
- +%endmacro
- +
- +%macro MC22 3
- +%2_hv%3_10_%1:
- + add rsp, gprsize
- + neg r2 ; This actually saves instructions
- + lea r1, [r1+r2*2]
- + sub r1, mmsize
- + xor r4, r4
- + mov r3, 3
- +.v_loop:
- + movu m0, [r1]
- + sub r1, r2
- + movu m1, [r1]
- + sub r1, r2
- + movu m2, [r1]
- + sub r1, r2
- + movu m3, [r1]
- + sub r1, r2
- + movu m4, [r1]
- + sub r1, r2
- +%assign i 0
- +%rep %3-1
- + FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
- + psubw m0, [pad20]
- + mova [rsp+r4+i*mmsize*3], m0
- + sub r1, r2
- + SWAP 0,1,2,3,4,5
- +%assign i i+1
- +%endrep
- + FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
- + psubw m0, [pad20]
- + mova [rsp+r4+i*mmsize*3], m0
- + add r4, mmsize
- + lea r1, [r1+r2*8+mmsize]
- +%if %3==8
- + lea r1, [r1+r2*4]
- +%endif
- + dec r3
- + jg .v_loop
- + sub rsp, gprsize
- + neg r2
- + ret
- +
- +cglobal %2_h264_qpel%3_mc22_10_%1, 3,7,10
- + mov r6, rsp ; backup stack pointer
- + and rsp, ~(mmsize-1) ; align stack
- + sub rsp, 4096 ; TODO: calculate this correctly
- +
- + call %2_hv%3_10_%1
- +
- + mov r4, mmsize
- + mov r3d, %3
- + mova m0, [tap1]
- + mova m7, [tap3]
- +%if num_mmregs > 8
- + mova m8, [tap2]
- + mova m9, [depad]
- + %define s1 m8
- + %define s2 m9
- +%else
- + %define s1 [tap2]
- + %define s2 [depad]
- +%endif
- +.h_loop:
- + movu m1, [rsp+r4-4]
- + movu m2, [rsp+r4-2]
- + mova m3, [rsp+r4+0]
- + movu m4, [rsp+r4+2]
- + movu m5, [rsp+r4+4]
- + movu m6, [rsp+r4+6]
- + pmaddwd m1, m0
- + pmaddwd m2, m0
- + pmaddwd m3, s1
- + pmaddwd m4, s1
- + pmaddwd m5, m7
- + pmaddwd m6, m7
- + paddd m1, s2
- + paddd m2, s2
- + paddd m3, m5
- + paddd m4, m6
- + paddd m1, m3
- + paddd m2, m4
- + psrad m1, 10
- + psrad m2, 10
- + pslld m2, 16
- + pand m1, [pd_0f]
- + por m1, m2
- + CLIPW m1, [pb_0], [pw_pixel_max]
- + OP_MOV [r0], m1
- + add r4, mmsize*3
- + add r0, r2
- + dec r3d
- + jg .h_loop
- +
- + mov rsp, r6 ; restore stack pointer
- + RET
- +%endmacro
- +
- +%define OP_MOV mova
- +INIT_MMX
- +MC22 mmxext, put, 4
- +INIT_XMM
- +MC22 sse2 , put, 8
- +
- +%define OP_MOV AVG_MOV
- +INIT_MMX
- +MC22 mmxext, avg, 4
- +INIT_XMM
- +MC22 sse2 , avg, 8
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro MC12 3
- +cglobal %2_h264_qpel%3_mc12_10_%1, 3,7,10
- + mov r6, rsp ; backup stack pointer
- + and rsp, ~(mmsize-1) ; align stack
- + sub rsp, 4096 ; TODO: calculate this correctly
- +
- + call %2_hv%3_10_%1
- +
- + xor r4, r4
- +.body
- + mov r3d, %3
- + mova m0, [tap1]
- + mova m7, [tap3]
- +%if num_mmregs > 8
- + mova m8, [tap2]
- + mova m9, [depad]
- + %define s1 m8
- + %define s2 m9
- +%else
- + %define s1 [tap2]
- + %define s2 [depad]
- +%endif
- +.h_loop:
- + movu m1, [rsp+mmsize-4]
- + movu m2, [rsp+mmsize-2]
- + mova m3, [rsp+mmsize+0]
- + movu m4, [rsp+mmsize+2]
- + movu m5, [rsp+mmsize+4]
- + movu m6, [rsp+mmsize+6]
- + pmaddwd m1, m0
- + pmaddwd m2, m0
- + pmaddwd m3, s1
- + pmaddwd m4, s1
- + pmaddwd m5, m7
- + pmaddwd m6, m7
- + paddd m1, s2
- + paddd m2, s2
- + paddd m3, m5
- + paddd m4, m6
- + paddd m1, m3
- + paddd m2, m4
- + psrad m1, 10
- + psrad m2, 10
- + pslld m2, 16
- + pand m1, [pd_0f]
- + por m1, m2
- + CLIPW m1, [pw_0], [pw_pixel_max]
- +
- + movu m3, [rsp+r4+mmsize] ; movu needed for mc32
- + paddw m3, [depad2]
- + psrlw m3, 5
- + psubw m3, [unpad]
- + CLIPW m3, [pw_0], [pw_pixel_max]
- + pavgw m1, m3
- +
- + OP_MOV [r0], m1
- + add rsp, mmsize*3
- + add r0, r2
- + dec r3d
- + jg .h_loop
- +
- + mov rsp, r6 ; restore stack pointer
- + RET
- +%endmacro
- +
- +%define OP_MOV mova
- +INIT_MMX
- +MC12 mmxext, put, 4
- +INIT_XMM
- +MC12 sse2 , put, 8
- +
- +%define OP_MOV AVG_MOV
- +INIT_MMX
- +MC12 mmxext, avg, 4
- +INIT_XMM
- +MC12 sse2 , avg, 8
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro MC32 3
- +cglobal %2_h264_qpel%3_mc32_10_%1, 3,7,10
- + mov r6, rsp ; backup stack pointer
- + and rsp, ~(mmsize-1) ; align stack
- + sub rsp, 4096 ; TODO: calculate this correctly
- +
- + call %2_hv%3_10_%1
- +
- + mov r4, 2 ; sizeof(pixel)
- + jmp mangle(ff_%2_h264_qpel%3_mc12_10_%1.body)
- +%endmacro
- +
- +%define OP_MOV mova
- +INIT_MMX
- +MC32 mmxext, put, 4
- +INIT_XMM
- +MC32 sse2 , put, 8
- +
- +%define OP_MOV AVG_MOV
- +INIT_MMX
- +MC32 mmxext, avg, 4
- +INIT_XMM
- +MC32 sse2 , avg, 8
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro MC21 3
- +%2_h%3_10_%1:
- + add rsp, gprsize
- + mov r3d, %3
- + xor r4, r4
- + mova m6, [pad20]
- +.nextrow
- + movu m2, [r5-4]
- + movu m3, [r5-2]
- + movu m4, [r5+0]
- + ADDW m4, [r5+2], m5
- + ADDW m3, [r5+4], m5
- + ADDW m2, [r5+6], m5
- +
- + FILT_H2 m2, m3, m4
- + psubw m2, m6
- + mova [rsp+r4], m2
- + add r4, mmsize*3
- + add r5, r2
- + dec r3d
- + jg .nextrow
- + sub rsp, gprsize
- + ret
- +
- +cglobal %2_h264_qpel%3_mc21_10_%1, 3,7,10
- + mov r6, rsp ; backup stack pointer
- + and rsp, ~(mmsize-1) ; align stack
- + sub rsp, 4096 ; TODO: calculate this correctly
- +
- + mov r5, r1
- + call %2_hv%3_10_%1
- +
- +%define PAD mmsize*16*3*2 ; SIZE*16*3*sizeof(pixel)
- + add rsp, PAD
- + call %2_h%3_10_%1
- + sub rsp, PAD
- +
- + mov r4, PAD-mmsize ; H buffer
- + jmp mangle(ff_%2_h264_qpel%3_mc12_10_%1.body)
- +%endmacro
- +
- +%define OP_MOV mova
- +INIT_MMX
- +MC21 mmxext, put, 4
- +INIT_XMM
- +MC21 sse2 , put, 8
- +
- +%define OP_MOV AVG_MOV
- +INIT_MMX
- +MC21 mmxext, avg, 4
- +INIT_XMM
- +MC21 sse2 , avg, 8
- +
- +;-----------------------------------------------------------------------------
- +; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
- +;-----------------------------------------------------------------------------
- +%macro MC23 3
- +cglobal %2_h264_qpel%3_mc23_10_%1, 3,7,10
- + mov r6, rsp ; backup stack pointer
- + and rsp, ~(mmsize-1) ; align stack
- + sub rsp, 4096 ; TODO: calculate this correctly
- +
- + lea r5, [r1+r2]
- + call %2_hv%3_10_%1
- +
- +%define PAD mmsize*16*3*2 ; SIZE*16*3*sizeof(pixel)
- + add rsp, PAD
- + call %2_h%3_10_%1
- + sub rsp, PAD
- +
- + mov r4, PAD-mmsize ; H buffer
- + jmp mangle(ff_%2_h264_qpel%3_mc12_10_%1.body)
- +%endmacro
- +
- +%define OP_MOV mova
- +INIT_MMX
- +MC23 mmxext, put, 4
- +INIT_XMM
- +MC23 sse2 , put, 8
- +
- +%define OP_MOV AVG_MOV
- +INIT_MMX
- +MC23 mmxext, avg, 4
- +INIT_XMM
- +MC23 sse2 , avg, 8
- diff --git a/libavcodec/x86/h264_qpel_mmx_10bit.c b/libavcodec/x86/h264_qpel_mmx_10bit.c
- new file mode 100755
- index 0000000..98cf6da
- --- /dev/null
- +++ b/libavcodec/x86/h264_qpel_mmx_10bit.c
- @@ -0,0 +1,141 @@
- +/*
- + * Copyright (c) 2011 Daniel Kang
- + *
- + * This file is part of Libav.
- + *
- + * Libav is free software; you can redistribute it and/or
- + * modify it under the terms of the GNU Lesser General Public
- + * License as published by the Free Software Foundation; either
- + * version 2.1 of the License, or (at your option) any later version.
- + *
- + * Libav is distributed in the hope that it will be useful,
- + * but WITHOUT ANY WARRANTY; without even the implied warranty of
- + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- + * Lesser General Public License for more details.
- + *
- + * You should have received a copy of the GNU Lesser General Public
- + * License along with Libav; if not, write to the Free Software
- + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- + */
- +
- +#include "dsputil_mmx.h"
- +
- +#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \
- +void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT \
- + (uint8_t *dst, uint8_t *src, int stride);
- +
- +#define LUMA_MC_ALL(DEPTH, TYPE, OPT) \
- + LUMA_MC_OP(put, 4, DEPTH, TYPE, OPT) \
- + LUMA_MC_OP(avg, 4, DEPTH, TYPE, OPT) \
- + LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \
- + LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \
- + LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
- + LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)
- +
- +#define LUMA_MC_816(DEPTH, TYPE, OPT) \
- + LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \
- + LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \
- + LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
- + LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)
- +
- +LUMA_MC_ALL(10, mc00, mmxext)
- +LUMA_MC_ALL(10, mc10, mmxext)
- +LUMA_MC_ALL(10, mc20, mmxext)
- +LUMA_MC_ALL(10, mc30, mmxext)
- +LUMA_MC_ALL(10, mc01, mmxext)
- +LUMA_MC_ALL(10, mc11, mmxext)
- +LUMA_MC_ALL(10, mc21, mmxext)
- +LUMA_MC_ALL(10, mc31, mmxext)
- +LUMA_MC_ALL(10, mc02, mmxext)
- +LUMA_MC_ALL(10, mc12, mmxext)
- +LUMA_MC_ALL(10, mc22, mmxext)
- +LUMA_MC_ALL(10, mc32, mmxext)
- +LUMA_MC_ALL(10, mc03, mmxext)
- +LUMA_MC_ALL(10, mc13, mmxext)
- +LUMA_MC_ALL(10, mc23, mmxext)
- +LUMA_MC_ALL(10, mc33, mmxext)
- +
- +LUMA_MC_816(10, mc00, sse2)
- +LUMA_MC_816(10, mc10, sse2)
- +LUMA_MC_816(10, mc20, sse2)
- +LUMA_MC_816(10, mc30, sse2)
- +LUMA_MC_816(10, mc01, sse2)
- +LUMA_MC_816(10, mc11, sse2)
- +LUMA_MC_816(10, mc21, sse2)
- +LUMA_MC_816(10, mc31, sse2)
- +LUMA_MC_816(10, mc02, sse2)
- +LUMA_MC_816(10, mc12, sse2)
- +LUMA_MC_816(10, mc22, sse2)
- +LUMA_MC_816(10, mc32, sse2)
- +LUMA_MC_816(10, mc03, sse2)
- +LUMA_MC_816(10, mc13, sse2)
- +LUMA_MC_816(10, mc23, sse2)
- +LUMA_MC_816(10, mc33, sse2)
- +
- +#define QPEL8_OPMC(OP, MC, MMX)\
- +void ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- + ff_ ## OP ## _h264_qpel4_ ## MC ## _10_ ## MMX(dst , src , stride);\
- + ff_ ## OP ## _h264_qpel4_ ## MC ## _10_ ## MMX(dst+8, src+8, stride);\
- + src += 4*stride;\
- + dst += 4*stride;\
- + ff_ ## OP ## _h264_qpel4_ ## MC ## _10_ ## MMX(dst , src , stride);\
- + ff_ ## OP ## _h264_qpel4_ ## MC ## _10_ ## MMX(dst+8, src+8, stride);\
- +}
- +
- +#define QPEL8_OP(MC, MMX)\
- +QPEL8_OPMC(put, MC, MMX)\
- +QPEL8_OPMC(avg, MC, MMX)
- +
- +#define QPEL8(MMX)\
- +QPEL8_OP(mc00, MMX)\
- +QPEL8_OP(mc01, MMX)\
- +QPEL8_OP(mc02, MMX)\
- +QPEL8_OP(mc03, MMX)\
- +QPEL8_OP(mc10, MMX)\
- +QPEL8_OP(mc11, MMX)\
- +QPEL8_OP(mc12, MMX)\
- +QPEL8_OP(mc13, MMX)\
- +QPEL8_OP(mc20, MMX)\
- +QPEL8_OP(mc21, MMX)\
- +QPEL8_OP(mc22, MMX)\
- +QPEL8_OP(mc23, MMX)\
- +QPEL8_OP(mc30, MMX)\
- +QPEL8_OP(mc31, MMX)\
- +QPEL8_OP(mc32, MMX)\
- +QPEL8_OP(mc33, MMX)
- +
- +#define QPEL16_OPMC(OP, MC, MMX)\
- +void ff_ ## OP ## _h264_qpel16_ ## MC ## _10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- + ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\
- + ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
- + src += 8*stride;\
- + dst += 8*stride;\
- + ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\
- + ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
- +}
- +
- +#define QPEL16_OP(MC, MMX)\
- +QPEL16_OPMC(put, MC, MMX)\
- +QPEL16_OPMC(avg, MC, MMX)
- +
- +#define QPEL16(MMX)\
- +QPEL16_OP(mc01, MMX)\
- +QPEL16_OP(mc02, MMX)\
- +QPEL16_OP(mc03, MMX)\
- +QPEL16_OP(mc10, MMX)\
- +QPEL16_OP(mc11, MMX)\
- +QPEL16_OP(mc12, MMX)\
- +QPEL16_OP(mc13, MMX)\
- +QPEL16_OP(mc20, MMX)\
- +QPEL16_OP(mc21, MMX)\
- +QPEL16_OP(mc22, MMX)\
- +QPEL16_OP(mc23, MMX)\
- +QPEL16_OP(mc30, MMX)\
- +QPEL16_OP(mc31, MMX)\
- +QPEL16_OP(mc32, MMX)\
- +QPEL16_OP(mc33, MMX)
- +
- +QPEL8(mmxext)
- +QPEL16_OP(mc00, mmxext)
- +QPEL16(mmxext)
- +QPEL16(sse2)
- --
- 1.7.5.1
- From 970a7a164d0ae110821c174a7b9081f93172c095 Mon Sep 17 00:00:00 2001
- From: Daniel Kang <daniel.d.kang@gmail.com>
- Date: Sat, 25 Jun 2011 13:28:49 -0400
- Subject: [PATCH 2/5] improvement?
- ---
- libavcodec/x86/h264_qpel_10bit.asm | 40 ++++++++++++++++++++---------------
- 1 files changed, 23 insertions(+), 17 deletions(-)
- diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm
- index fc44e85..bc0e78d 100644
- --- a/libavcodec/x86/h264_qpel_10bit.asm
- +++ b/libavcodec/x86/h264_qpel_10bit.asm
- @@ -620,7 +620,7 @@ MC22 sse2 , avg, 8
- ; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro MC12 3
- -cglobal %2_h264_qpel%3_mc12_10_%1, 3,7,10
- +cglobal %2_h264_qpel%3_mc12_10_%1, 3,7,12
- mov r6, rsp ; backup stack pointer
- and rsp, ~(mmsize-1) ; align stack
- sub rsp, 4096 ; TODO: calculate this correctly
- @@ -630,16 +630,22 @@ cglobal %2_h264_qpel%3_mc12_10_%1, 3,7,10
- xor r4, r4
- .body
- mov r3d, %3
- - mova m0, [tap1]
- - mova m7, [tap3]
- + pxor m0, m0
- + mova m7, [pw_pixel_max]
- %if num_mmregs > 8
- - mova m8, [tap2]
- - mova m9, [depad]
- + mova m8, [tap1]
- + mova m9, [tap2]
- + mova m10, [tap3]
- + mova m11, [depad]
- %define s1 m8
- %define s2 m9
- + %define s3 m10
- + %define d1 m11
- %else
- - %define s1 [tap2]
- - %define s2 [depad]
- + %define s1 [tap1]
- + %define s2 [tap2]
- + %define s3 [tap3]
- + %define d1 [depad]
- %endif
- .h_loop:
- movu m1, [rsp+mmsize-4]
- @@ -648,14 +654,14 @@ cglobal %2_h264_qpel%3_mc12_10_%1, 3,7,10
- movu m4, [rsp+mmsize+2]
- movu m5, [rsp+mmsize+4]
- movu m6, [rsp+mmsize+6]
- - pmaddwd m1, m0
- - pmaddwd m2, m0
- - pmaddwd m3, s1
- - pmaddwd m4, s1
- - pmaddwd m5, m7
- - pmaddwd m6, m7
- - paddd m1, s2
- - paddd m2, s2
- + pmaddwd m1, s1
- + pmaddwd m2, s1
- + pmaddwd m3, s2
- + pmaddwd m4, s2
- + pmaddwd m5, s3
- + pmaddwd m6, s3
- + paddd m1, d1
- + paddd m2, d1
- paddd m3, m5
- paddd m4, m6
- paddd m1, m3
- @@ -665,13 +671,13 @@ cglobal %2_h264_qpel%3_mc12_10_%1, 3,7,10
- pslld m2, 16
- pand m1, [pd_0f]
- por m1, m2
- - CLIPW m1, [pw_0], [pw_pixel_max]
- + CLIPW m1, m0, m7
- movu m3, [rsp+r4+mmsize] ; movu needed for mc32
- paddw m3, [depad2]
- psrlw m3, 5
- psubw m3, [unpad]
- - CLIPW m3, [pw_0], [pw_pixel_max]
- + CLIPW m3, m0, m7
- pavgw m1, m3
- OP_MOV [r0], m1
- --
- 1.7.5.1
- From 6e61dab2e3c479e7c8258368a2025f41074e8bae Mon Sep 17 00:00:00 2001
- From: Daniel Kang <daniel.d.kang@gmail.com>
- Date: Sun, 26 Jun 2011 00:07:08 -0400
- Subject: [PATCH 3/5] pengvado's fixes pt 1
- ---
- libavcodec/x86/h264_qpel_10bit.asm | 354 ++++++++++++------------------------
- 1 files changed, 115 insertions(+), 239 deletions(-)
- diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm
- index bc0e78d..ec17cf6 100644
- --- a/libavcodec/x86/h264_qpel_10bit.asm
- +++ b/libavcodec/x86/h264_qpel_10bit.asm
- @@ -47,7 +47,56 @@ pd_0f: times 4 dd 0xffff
- SECTION .text
- -; All of the 2x2 functions are probably no faster than the C version.
- +
- +%macro AVG_MOV 2
- + pavgw %2, %1
- + mova %1, %2
- +%endmacro
- +
- +%macro ADDW 3
- +%if mmsize == 8
- + paddw %1, %2
- +%else
- + movu %3, %2
- + paddw %1, %3
- +%endif
- +%endmacro
- +
- +%macro FILT_H 4
- + paddw %1, %4
- + psubw %1, %2 ; a-b
- + psraw %1, 2 ; (a-b)/4
- + psubw %1, %2 ; (a-b)/4-b
- + paddw %1, %3 ; (a-b)/4-b+c
- + psraw %1, 2 ; ((a-b)/4-b+c)/4
- + paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- +%endmacro
- +
- +%macro FILT_V 8
- + movu %6, [r1]
- + paddw %1, %6
- + mova %7, %2
- + paddw %7, %5
- + mova %8, %3
- + paddw %8, %4
- + FILT_H %1, %7, %8, [pw_16]
- + psraw %1, 1
- + CLIPW %1, [pb_0], [pw_pixel_max]
- +%endmacro
- +
- +%macro MC 1
- +%define OP_MOV mova
- +INIT_MMX
- +%1 mmxext, put, 4
- +INIT_XMM
- +%1 sse2 , put, 8
- +
- +%define OP_MOV AVG_MOV
- +INIT_MMX
- +%1 mmxext, avg, 4
- +INIT_XMM
- +%1 sse2 , avg, 8
- +%endmacro
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
- @@ -94,11 +143,6 @@ cglobal %1_h264_qpel16_mc00_10_sse2, 3,3
- RET
- %endmacro
- -%macro AVG_MOV 2
- - pavgw %2, %1
- - mova %1, %2
- -%endmacro
- -
- %define OP_MOV mova
- MC00 put
- @@ -108,25 +152,6 @@ MC00 avg
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- -%macro FILT_H 4
- - paddw %1, %4
- - psubw %1, %2 ; a-b
- - psraw %1, 2 ; (a-b)/4
- - psubw %1, %2 ; (a-b)/4-b
- - paddw %1, %3 ; (a-b)/4-b+c
- - psraw %1, 2 ; ((a-b)/4-b+c)/4
- - paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- -%endmacro
- -
- -%macro ADDW 3
- -%if mmsize == 8
- - paddw %1, %2
- -%else
- - movu %3, %2
- - paddw %1, %3
- -%endif
- -%endmacro
- -
- %macro MC20 3
- cglobal %2_h264_qpel%3_mc20_10_%1, 3,4,7
- mov r3d, %3
- @@ -137,9 +162,9 @@ cglobal %2_h264_qpel%3_mc20_10_%1, 3,4,7
- movu m2, [r1-4]
- movu m3, [r1-2]
- movu m4, [r1+0]
- - ADDW m4, [r1+2], m5
- - ADDW m3, [r1+4], m5
- ADDW m2, [r1+6], m5
- + ADDW m3, [r1+4], m5
- + ADDW m4, [r1+2], m5
- FILT_H m2, m3, m4, m6
- psraw m2, 1
- @@ -152,17 +177,7 @@ cglobal %2_h264_qpel%3_mc20_10_%1, 3,4,7
- REP_RET
- %endmacro
- -%define OP_MOV mova
- -INIT_MMX
- -MC20 mmxext, put, 4
- -INIT_XMM
- -MC20 sse2 , put, 8
- -
- -%define OP_MOV AVG_MOV
- -INIT_MMX
- -MC20 mmxext, avg, 4
- -INIT_XMM
- -MC20 sse2 , avg, 8
- +MC MC20
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
- @@ -173,17 +188,7 @@ cglobal %2_h264_qpel%3_mc30_10_%1, 3,5,6
- jmp mangle(ff_%2_h264_qpel%3_mc10_10_%1.body)
- %endmacro
- -%define OP_MOV mova
- -INIT_MMX
- -MC30 mmxext, put, 4
- -INIT_XMM
- -MC30 sse2 , put, 8
- -
- -%define OP_MOV AVG_MOV
- -INIT_MMX
- -MC30 mmxext, avg, 4
- -INIT_XMM
- -MC30 sse2 , avg, 8
- +MC MC30
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
- @@ -200,9 +205,9 @@ cglobal %2_h264_qpel%3_mc10_10_%1, 3,5,7
- movu m2, [r1-4]
- movu m3, [r1-2]
- movu m4, [r1+0]
- - ADDW m4, [r1+2], m5
- - ADDW m3, [r1+4], m5
- ADDW m2, [r1+6], m5
- + ADDW m3, [r1+4], m5
- + ADDW m4, [r1+2], m5
- FILT_H m2, m3, m4, m6
- psraw m2, 1
- @@ -218,33 +223,11 @@ cglobal %2_h264_qpel%3_mc10_10_%1, 3,5,7
- REP_RET
- %endmacro
- -%define OP_MOV mova
- -INIT_MMX
- -MC10 mmxext, put, 4
- -INIT_XMM
- -MC10 sse2 , put, 8
- -
- -%define OP_MOV AVG_MOV
- -INIT_MMX
- -MC10 mmxext, avg, 4
- -INIT_XMM
- -MC10 sse2 , avg, 8
- +MC MC10
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- -%macro FILT_V 8
- - movu %6, [r1]
- - paddw %1, %6
- - mova %7, %2
- - paddw %7, %5
- - mova %8, %3
- - paddw %8, %4
- - FILT_H %1, %7, %8, [pw_16]
- - psraw %1, 1
- - CLIPW %1, [pb_0], [pw_pixel_max]
- -%endmacro
- -
- %macro MC02 3
- cglobal %2_h264_qpel%3_mc02_10_%1, 3,4,8
- lea r3, [r2*2]
- @@ -270,17 +253,7 @@ cglobal %2_h264_qpel%3_mc02_10_%1, 3,4,8
- RET
- %endmacro
- -%define OP_MOV mova
- -INIT_MMX
- -MC02 mmxext, put, 4
- -INIT_XMM
- -MC02 sse2 , put, 8
- -
- -%define OP_MOV AVG_MOV
- -INIT_MMX
- -MC02 mmxext, avg, 4
- -INIT_XMM
- -MC02 sse2 , avg, 8
- +MC MC02
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
- @@ -317,17 +290,7 @@ cglobal %2_h264_qpel%3_mc01_10_%1, 3,5,8
- RET
- %endmacro
- -%define OP_MOV mova
- -INIT_MMX
- -MC01 mmxext, put, 4
- -INIT_XMM
- -MC01 sse2 , put, 8
- -
- -%define OP_MOV AVG_MOV
- -INIT_MMX
- -MC01 mmxext, avg, 4
- -INIT_XMM
- -MC01 sse2 , avg, 8
- +MC MC01
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
- @@ -338,17 +301,7 @@ cglobal %2_h264_qpel%3_mc03_10_%1, 3,5,8
- jmp mangle(ff_%2_h264_qpel%3_mc01_10_%1.body)
- %endmacro
- -%define OP_MOV mova
- -INIT_MMX
- -MC03 mmxext, put, 4
- -INIT_XMM
- -MC03 sse2 , put, 8
- -
- -%define OP_MOV AVG_MOV
- -INIT_MMX
- -MC03 mmxext, avg, 4
- -INIT_XMM
- -MC03 sse2 , avg, 8
- +MC MC03
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
- @@ -406,17 +359,7 @@ cglobal %2_h264_qpel%3_mc11_10_%1, 3,5,8
- RET
- %endmacro
- -%define OP_MOV mova
- -INIT_MMX
- -MC11 mmxext, put, 4
- -INIT_XMM
- -MC11 sse2 , put, 8
- -
- -%define OP_MOV AVG_MOV
- -INIT_MMX
- -MC11 mmxext, avg, 4
- -INIT_XMM
- -MC11 sse2 , avg, 8
- +MC MC11
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
- @@ -428,17 +371,7 @@ cglobal %2_h264_qpel%3_mc31_10_%1, 3,5,8
- jmp mangle(ff_%2_h264_qpel%3_mc11_10_%1.body)
- %endmacro
- -%define OP_MOV mova
- -INIT_MMX
- -MC31 mmxext, put, 4
- -INIT_XMM
- -MC31 sse2 , put, 8
- -
- -%define OP_MOV AVG_MOV
- -INIT_MMX
- -MC31 mmxext, avg, 4
- -INIT_XMM
- -MC31 sse2 , avg, 8
- +MC MC31
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
- @@ -449,17 +382,7 @@ cglobal %2_h264_qpel%3_mc13_10_%1, 3,5,8
- jmp mangle(ff_%2_h264_qpel%3_mc11_10_%1.body)
- %endmacro
- -%define OP_MOV mova
- -INIT_MMX
- -MC13 mmxext, put, 4
- -INIT_XMM
- -MC13 sse2 , put, 8
- -
- -%define OP_MOV AVG_MOV
- -INIT_MMX
- -MC13 mmxext, avg, 4
- -INIT_XMM
- -MC13 sse2 , avg, 8
- +MC MC13
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
- @@ -471,19 +394,7 @@ cglobal %2_h264_qpel%3_mc33_10_%1, 3,5,8
- jmp mangle(ff_%2_h264_qpel%3_mc11_10_%1.body)
- %endmacro
- -%define OP_MOV mova
- -INIT_MMX
- -MC33 mmxext, put, 4
- -INIT_XMM
- -MC33 sse2 , put, 8
- -
- -%define OP_MOV AVG_MOV
- -INIT_MMX
- -MC33 mmxext, avg, 4
- -INIT_XMM
- -MC33 sse2 , avg, 8
- -
- -
- +MC MC33
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
- @@ -507,8 +418,8 @@ MC33 sse2 , avg, 8
- FILT_H2 %1, %7, %8
- %endmacro
- -%macro MC22 3
- -%2_hv%3_10_%1:
- +%macro HV 2
- +put_hv%2_10_%1:
- add rsp, gprsize
- neg r2 ; This actually saves instructions
- lea r1, [r1+r2*2]
- @@ -527,7 +438,7 @@ MC33 sse2 , avg, 8
- movu m4, [r1]
- sub r1, r2
- %assign i 0
- -%rep %3-1
- +%rep %2-1
- FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
- psubw m0, [pad20]
- mova [rsp+r4+i*mmsize*3], m0
- @@ -540,7 +451,7 @@ MC33 sse2 , avg, 8
- mova [rsp+r4+i*mmsize*3], m0
- add r4, mmsize
- lea r1, [r1+r2*8+mmsize]
- -%if %3==8
- +%if %2==8
- lea r1, [r1+r2*4]
- %endif
- dec r3
- @@ -548,13 +459,21 @@ MC33 sse2 , avg, 8
- sub rsp, gprsize
- neg r2
- ret
- +%endmacro
- +INIT_MMX
- +HV mmxext, 4
- +INIT_XMM
- +HV sse2 , 8
- +
- +%macro MC22 3
- cglobal %2_h264_qpel%3_mc22_10_%1, 3,7,10
- +%define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel)
- mov r6, rsp ; backup stack pointer
- and rsp, ~(mmsize-1) ; align stack
- - sub rsp, 4096 ; TODO: calculate this correctly
- + sub rsp, PAD
- - call %2_hv%3_10_%1
- + call put_hv%3_10_%1
- mov r4, mmsize
- mov r3d, %3
- @@ -604,28 +523,19 @@ cglobal %2_h264_qpel%3_mc22_10_%1, 3,7,10
- RET
- %endmacro
- -%define OP_MOV mova
- -INIT_MMX
- -MC22 mmxext, put, 4
- -INIT_XMM
- -MC22 sse2 , put, 8
- -
- -%define OP_MOV AVG_MOV
- -INIT_MMX
- -MC22 mmxext, avg, 4
- -INIT_XMM
- -MC22 sse2 , avg, 8
- +MC MC22
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro MC12 3
- cglobal %2_h264_qpel%3_mc12_10_%1, 3,7,12
- - mov r6, rsp ; backup stack pointer
- - and rsp, ~(mmsize-1) ; align stack
- - sub rsp, 4096 ; TODO: calculate this correctly
- +%define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel)
- + mov r6, rsp ; backup stack pointer
- + and rsp, ~(mmsize-1) ; align stack
- + sub rsp, PAD
- - call %2_hv%3_10_%1
- + call put_hv%3_10_%1
- xor r4, r4
- .body
- @@ -640,7 +550,7 @@ cglobal %2_h264_qpel%3_mc12_10_%1, 3,7,12
- %define s1 m8
- %define s2 m9
- %define s3 m10
- - %define s1 m11
- + %define d1 m11
- %else
- %define s1 [tap1]
- %define s2 [tap2]
- @@ -673,7 +583,7 @@ cglobal %2_h264_qpel%3_mc12_10_%1, 3,7,12
- por m1, m2
- CLIPW m1, m0, m7
- - movu m3, [rsp+r4+mmsize] ; movu needed for mc32
- + movu m3, [rsp+r4+mmsize] ; movu needed for mc32, etc
- paddw m3, [depad2]
- psrlw m3, 5
- psubw m3, [unpad]
- @@ -690,61 +600,42 @@ cglobal %2_h264_qpel%3_mc12_10_%1, 3,7,12
- RET
- %endmacro
- -%define OP_MOV mova
- -INIT_MMX
- -MC12 mmxext, put, 4
- -INIT_XMM
- -MC12 sse2 , put, 8
- -
- -%define OP_MOV AVG_MOV
- -INIT_MMX
- -MC12 mmxext, avg, 4
- -INIT_XMM
- -MC12 sse2 , avg, 8
- +MC MC12
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro MC32 3
- cglobal %2_h264_qpel%3_mc32_10_%1, 3,7,10
- +%define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel)
- mov r6, rsp ; backup stack pointer
- and rsp, ~(mmsize-1) ; align stack
- - sub rsp, 4096 ; TODO: calculate this correctly
- + sub rsp, PAD
- - call %2_hv%3_10_%1
- + call put_hv%3_10_%1
- mov r4, 2 ; sizeof(pixel)
- jmp mangle(ff_%2_h264_qpel%3_mc12_10_%1.body)
- %endmacro
- -%define OP_MOV mova
- -INIT_MMX
- -MC32 mmxext, put, 4
- -INIT_XMM
- -MC32 sse2 , put, 8
- -
- -%define OP_MOV AVG_MOV
- -INIT_MMX
- -MC32 mmxext, avg, 4
- -INIT_XMM
- -MC32 sse2 , avg, 8
- +MC MC32
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- -%macro MC21 3
- -%2_h%3_10_%1:
- +%macro H_NRD 2
- +put_h%2_10_%1:
- add rsp, gprsize
- - mov r3d, %3
- + mov r3d, %2
- xor r4, r4
- mova m6, [pad20]
- .nextrow
- movu m2, [r5-4]
- movu m3, [r5-2]
- movu m4, [r5+0]
- - ADDW m4, [r5+2], m5
- - ADDW m3, [r5+4], m5
- ADDW m2, [r5+6], m5
- + ADDW m3, [r5+4], m5
- + ADDW m4, [r5+2], m5
- FILT_H2 m2, m3, m4
- psubw m2, m6
- @@ -755,65 +646,50 @@ MC32 sse2 , avg, 8
- jg .nextrow
- sub rsp, gprsize
- ret
- +%endmacro
- +INIT_MMX
- +H_NRD mmxext, 4
- +INIT_XMM
- +H_NRD sse2 , 8
- +
- +%macro MC21 3
- cglobal %2_h264_qpel%3_mc21_10_%1, 3,7,10
- +%define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel)
- mov r6, rsp ; backup stack pointer
- and rsp, ~(mmsize-1) ; align stack
- - sub rsp, 4096 ; TODO: calculate this correctly
- + sub rsp, PAD
- mov r5, r1
- - call %2_hv%3_10_%1
- + call put_h%3_10_%1
- -%define PAD mmsize*16*3*2 ; SIZE*16*3*sizeof(pixel)
- - add rsp, PAD
- - call %2_h%3_10_%1
- sub rsp, PAD
- + call put_hv%3_10_%1
- - mov r4, PAD-mmsize ; H buffer
- + mov r4, PAD-mmsize ; H buffer
- jmp mangle(ff_%2_h264_qpel%3_mc12_10_%1.body)
- %endmacro
- -%define OP_MOV mova
- -INIT_MMX
- -MC21 mmxext, put, 4
- -INIT_XMM
- -MC21 sse2 , put, 8
- -
- -%define OP_MOV AVG_MOV
- -INIT_MMX
- -MC21 mmxext, avg, 4
- -INIT_XMM
- -MC21 sse2 , avg, 8
- +MC MC21
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro MC23 3
- cglobal %2_h264_qpel%3_mc23_10_%1, 3,7,10
- +%define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel)
- mov r6, rsp ; backup stack pointer
- and rsp, ~(mmsize-1) ; align stack
- - sub rsp, 4096 ; TODO: calculate this correctly
- + sub rsp, PAD
- lea r5, [r1+r2]
- - call %2_hv%3_10_%1
- + call put_h%3_10_%1
- -%define PAD mmsize*16*3*2 ; SIZE*16*3*sizeof(pixel)
- - add rsp, PAD
- - call %2_h%3_10_%1
- sub rsp, PAD
- + call put_hv%3_10_%1
- - mov r4, PAD-mmsize ; H buffer
- + mov r4, PAD-mmsize ; H buffer
- jmp mangle(ff_%2_h264_qpel%3_mc12_10_%1.body)
- %endmacro
- -%define OP_MOV mova
- -INIT_MMX
- -MC23 mmxext, put, 4
- -INIT_XMM
- -MC23 sse2 , put, 8
- -
- -%define OP_MOV AVG_MOV
- -INIT_MMX
- -MC23 mmxext, avg, 4
- -INIT_XMM
- -MC23 sse2 , avg, 8
- +MC MC23
- --
- 1.7.5.1
- From d1ccd3f604b353aa32cc5460a3934fbbbaca0841 Mon Sep 17 00:00:00 2001
- From: Daniel Kang <daniel.d.kang@gmail.com>
- Date: Sun, 26 Jun 2011 13:04:22 -0400
- Subject: [PATCH 4/5] more fixes
- ---
- libavcodec/x86/h264_qpel_10bit.asm | 58 ++++++++++++++++++++++++++++--------
- 1 files changed, 45 insertions(+), 13 deletions(-)
- diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm
- index ec17cf6..b5dfa8a 100644
- --- a/libavcodec/x86/h264_qpel_10bit.asm
- +++ b/libavcodec/x86/h264_qpel_10bit.asm
- @@ -228,6 +228,30 @@ MC MC10
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- +%macro V_FILT 11
- +v_filt%9_%10_10_%11:
- + FILT_V m0, m1, m2, m3, m4, m5, m6, m7
- + ret
- +%endmacro
- +
- +INIT_MMX
- +RESET_MM_PERMUTATION
- +%assign i 0
- +%rep 4
- +V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i, mmxext
- +SWAP 0,1,2,3,4,5
- +%assign i i+1
- +%endrep
- +
- +INIT_XMM
- +RESET_MM_PERMUTATION
- +%assign i 0
- +%rep 6
- +V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i, sse2
- +SWAP 0,1,2,3,4,5
- +%assign i i+1
- +%endrep
- +
- %macro MC02 3
- cglobal %2_h264_qpel%3_mc02_10_%1, 3,4,8
- lea r3, [r2*2]
- @@ -241,15 +265,19 @@ cglobal %2_h264_qpel%3_mc02_10_%1, 3,4,8
- movu m4, [r1]
- add r1, r2
- -%rep %3-1
- - FILT_V m0, m1, m2, m3, m4, m5, m6, m7
- +%assign i 0
- +%assign j 0
- +%rep %3
- + call v_filt%3_ %+ i %+ _10_%1
- OP_MOV [r0], m0
- +%if j<%3-1
- add r1, r2
- add r0, r2
- SWAP 0,1,2,3,4,5
- +%endif
- + %assign j j+1
- + %assign i (j % 6)
- %endrep
- - FILT_V m0, m1, m2, m3, m4, m5, m6, m7
- - OP_MOV [r0], m0
- RET
- %endmacro
- @@ -273,20 +301,22 @@ cglobal %2_h264_qpel%3_mc01_10_%1, 3,5,8
- movu m4, [r1]
- add r1, r2
- -%rep %3-1
- - FILT_V m0, m1, m2, m3, m4, m5, m6, m7
- +%assign i 0
- +%assign j 0
- +%rep %3
- + call v_filt%3_ %+ i %+ _10_%1
- movu m7, [r4]
- pavgw m0, m7
- OP_MOV [r0], m0
- +%if i<%3-1
- add r4, r2
- add r1, r2
- add r0, r2
- SWAP 0,1,2,3,4,5
- +%endif
- + %assign j j+1
- + %assign i (j % 6)
- %endrep
- - FILT_V m0, m1, m2, m3, m4, m5, m6, m7
- - movu m7, [r4]
- - pavgw m0, m7
- - OP_MOV [r0], m0
- RET
- %endmacro
- @@ -323,9 +353,9 @@ cglobal %2_h264_qpel%3_mc11_10_%1, 3,5,8
- add r1, r2
- %assign i 0
- +%assign j 0
- %rep %3
- -%assign i i+1
- - FILT_V m0, m1, m2, m3, m4, m5, m6, m7
- + call v_filt%3_ %+ i %+ _10_%1
- ;now do FILT_H with fewer registers. probably faster than doing FILT_V then FILT_H
- ;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used next in the next iteration
- ;unfortunately I need three registers, so m5 will have to be re-read from memory
- @@ -348,13 +378,15 @@ cglobal %2_h264_qpel%3_mc11_10_%1, 3,5,8
- ;avg FILT_V, FILT_H and reload m5
- pavgw m0, m5
- OP_MOV [r0], m0
- -%if i<%3
- +%if j<%3-1
- movu m5, [r1]
- add r4, r2
- add r1, r2
- add r0, r2
- SWAP 0,1,2,3,4,5
- %endif
- + %assign j j+1
- + %assign i (j % 6)
- %endrep
- RET
- %endmacro
- --
- 1.7.5.1
- From ba151cdffadece22319d6a4722c01fe48eee10d3 Mon Sep 17 00:00:00 2001
- From: Daniel Kang <daniel.d.kang@gmail.com>
- Date: Sun, 26 Jun 2011 16:27:12 -0400
- Subject: [PATCH 5/5] fixes pt 3
- ---
- libavcodec/x86/h264_qpel_10bit.asm | 113 ++++++++++++++----------------------
- 1 files changed, 43 insertions(+), 70 deletions(-)
- diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm
- index b5dfa8a..cb9b077 100644
- --- a/libavcodec/x86/h264_qpel_10bit.asm
- +++ b/libavcodec/x86/h264_qpel_10bit.asm
- @@ -72,6 +72,18 @@ SECTION .text
- paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- %endmacro
- +%macro PRELOAD_V 0
- + lea r3, [r2*3]
- + sub r1, r3
- + movu m0, [r1+r2]
- + movu m1, [r1+r2*2]
- + add r1, r3
- + movu m2, [r1]
- + movu m3, [r1+r2]
- + movu m4, [r1+r2*2]
- + add r1, r3
- +%endmacro
- +
- %macro FILT_V 8
- movu %6, [r1]
- paddw %1, %6
- @@ -127,8 +139,9 @@ cglobal %1_h264_qpel8_mc00_10_sse2, 3,3
- %endrep
- RET
- -cglobal %1_h264_qpel16_mc00_10_sse2, 3,3
- -%rep 8
- +cglobal %1_h264_qpel16_mc00_10_sse2_asm, 3,4
- + mov r3d, 8
- +.loop:
- movu m0, [r1 ]
- movu m1, [r1 +16]
- OP_MOV [r0 ], m0
- @@ -139,7 +152,8 @@ cglobal %1_h264_qpel16_mc00_10_sse2, 3,3
- OP_MOV [r0+r2+16], m1
- lea r0, [r0+r2*2]
- lea r1, [r1+r2*2]
- -%endrep
- + dec r3d
- + jg .loop
- RET
- %endmacro
- @@ -230,7 +244,11 @@ MC MC10
- ;-----------------------------------------------------------------------------
- %macro V_FILT 11
- v_filt%9_%10_10_%11:
- + add r4, r2
- +.no_addr4:
- FILT_V m0, m1, m2, m3, m4, m5, m6, m7
- + add r1, r2
- + add r0, r2
- ret
- %endmacro
- @@ -254,29 +272,16 @@ SWAP 0,1,2,3,4,5
- %macro MC02 3
- cglobal %2_h264_qpel%3_mc02_10_%1, 3,4,8
- - lea r3, [r2*2]
- - sub r1, r3
- - movu m0, [r1]
- - movu m1, [r1+r2]
- - add r1, r3
- - movu m2, [r1]
- - movu m3, [r1+r2]
- - add r1, r3
- - movu m4, [r1]
- - add r1, r2
- + PRELOAD_V
- -%assign i 0
- + sub r0, r2
- %assign j 0
- %rep %3
- - call v_filt%3_ %+ i %+ _10_%1
- + %assign i (j % 6)
- + call v_filt%3_ %+ i %+ _10_%1.no_addr4
- OP_MOV [r0], m0
- -%if j<%3-1
- - add r1, r2
- - add r0, r2
- SWAP 0,1,2,3,4,5
- -%endif
- %assign j j+1
- - %assign i (j % 6)
- %endrep
- RET
- %endmacro
- @@ -290,32 +295,19 @@ MC MC02
- cglobal %2_h264_qpel%3_mc01_10_%1, 3,5,8
- mov r4, r1
- .body
- - lea r3, [r2*2]
- - sub r1, r3
- - movu m0, [r1]
- - movu m1, [r1+r2]
- - add r1, r3
- - movu m2, [r1]
- - movu m3, [r1+r2]
- - add r1, r3
- - movu m4, [r1]
- - add r1, r2
- + PRELOAD_V
- -%assign i 0
- + sub r4, r2
- + sub r0, r2
- %assign j 0
- %rep %3
- + %assign i (j % 6)
- call v_filt%3_ %+ i %+ _10_%1
- movu m7, [r4]
- pavgw m0, m7
- OP_MOV [r0], m0
- -%if i<%3-1
- - add r4, r2
- - add r1, r2
- - add r0, r2
- SWAP 0,1,2,3,4,5
- -%endif
- %assign j j+1
- - %assign i (j % 6)
- %endrep
- RET
- %endmacro
- @@ -338,23 +330,18 @@ MC MC03
- ;-----------------------------------------------------------------------------
- %macro MC11 3
- ; this REALLY needs x86_64
- -cglobal %2_h264_qpel%3_mc11_10_%1, 3,5,8
- +cglobal %2_h264_qpel%3_mc11_10_%1, 3,6,8
- mov r4, r1
- .body
- - lea r3, [r2*2]
- - sub r1, r3
- - movu m0, [r1]
- - movu m1, [r1+r2]
- - add r1, r3
- - movu m2, [r1]
- - movu m3, [r1+r2]
- - add r1, r3
- - movu m4, [r1]
- - add r1, r2
- + PRELOAD_V
- -%assign i 0
- + sub r0, r2
- + sub r4, r2
- + mov r5, r2
- + neg r5
- %assign j 0
- %rep %3
- + %assign i (j % 6)
- call v_filt%3_ %+ i %+ _10_%1
- ;now do FILT_H with fewer registers. probably faster than doing FILT_V then FILT_H
- ;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used next in the next iteration
- @@ -379,14 +366,10 @@ cglobal %2_h264_qpel%3_mc11_10_%1, 3,5,8
- pavgw m0, m5
- OP_MOV [r0], m0
- %if j<%3-1
- - movu m5, [r1]
- - add r4, r2
- - add r1, r2
- - add r0, r2
- + movu m5, [r1+r5]
- SWAP 0,1,2,3,4,5
- %endif
- %assign j j+1
- - %assign i (j % 6)
- %endrep
- RET
- %endmacro
- @@ -397,7 +380,7 @@ MC MC11
- ; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro MC31 3
- -cglobal %2_h264_qpel%3_mc31_10_%1, 3,5,8
- +cglobal %2_h264_qpel%3_mc31_10_%1, 3,6,8
- mov r4, r1
- add r1, 2
- jmp mangle(ff_%2_h264_qpel%3_mc11_10_%1.body)
- @@ -409,7 +392,7 @@ MC MC31
- ; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro MC13 3
- -cglobal %2_h264_qpel%3_mc13_10_%1, 3,5,8
- +cglobal %2_h264_qpel%3_mc13_10_%1, 3,6,8
- lea r4, [r1+r2]
- jmp mangle(ff_%2_h264_qpel%3_mc11_10_%1.body)
- %endmacro
- @@ -420,7 +403,7 @@ MC MC13
- ; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro MC33 3
- -cglobal %2_h264_qpel%3_mc33_10_%1, 3,5,8
- +cglobal %2_h264_qpel%3_mc33_10_%1, 3,6,8
- lea r4, [r1+r2]
- add r1, 2
- jmp mangle(ff_%2_h264_qpel%3_mc11_10_%1.body)
- @@ -687,12 +670,13 @@ H_NRD sse2 , 8
- %macro MC21 3
- cglobal %2_h264_qpel%3_mc21_10_%1, 3,7,10
- + mov r5, r1
- +.body
- %define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel)
- mov r6, rsp ; backup stack pointer
- and rsp, ~(mmsize-1) ; align stack
- sub rsp, PAD
- - mov r5, r1
- call put_h%3_10_%1
- sub rsp, PAD
- @@ -709,19 +693,8 @@ MC MC21
- ;-----------------------------------------------------------------------------
- %macro MC23 3
- cglobal %2_h264_qpel%3_mc23_10_%1, 3,7,10
- -%define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel)
- - mov r6, rsp ; backup stack pointer
- - and rsp, ~(mmsize-1) ; align stack
- -
- - sub rsp, PAD
- lea r5, [r1+r2]
- - call put_h%3_10_%1
- -
- - sub rsp, PAD
- - call put_hv%3_10_%1
- -
- - mov r4, PAD-mmsize ; H buffer
- - jmp mangle(ff_%2_h264_qpel%3_mc12_10_%1.body)
- + jmp mangle(ff_%2_h264_qpel%3_mc21_10_%1.body)
- %endmacro
- MC MC23
- --
- 1.7.5.1
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement