Advertisement
Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
- From 39d579269603036f388628cfa19a5abbc9012fb0 Mon Sep 17 00:00:00 2001
- From: Daniel Kang <daniel.d.kang@gmail.com>
- Date: Mon, 11 Jul 2011 13:48:40 -0400
- Subject: [PATCH] H.264: Add optimizations to predict x86 assembly.
- ---
- libavcodec/x86/h264_intrapred.asm | 5 +-
- libavcodec/x86/h264_intrapred_10bit.asm | 933 +++++++++++++------------------
- libavcodec/x86/h264_intrapred_init.c | 29 +-
- 3 files changed, 417 insertions(+), 550 deletions(-)
- diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm
- index cbf3cf7..c1cd5c4 100644
- --- a/libavcodec/x86/h264_intrapred.asm
- +++ b/libavcodec/x86/h264_intrapred.asm
- @@ -2611,12 +2611,11 @@ cglobal pred4x4_down_left_mmxext, 3,3
- punpckldq m1, [r1]
- movq m2, m1
- movq m3, m1
- - movq m4, m1
- psllq m1, 8
- pxor m2, m1
- psrlq m2, 8
- - pxor m3, m2
- - PRED4x4_LOWPASS m0, m1, m3, m4, m5
- + pxor m2, m3
- + PRED4x4_LOWPASS m0, m1, m2, m3, m4
- lea r1, [r0+r2*2]
- psrlq m0, 8
- movd [r0+r2*1], m0
- diff --git a/libavcodec/x86/h264_intrapred_10bit.asm b/libavcodec/x86/h264_intrapred_10bit.asm
- index 24a7bfa..5da7ee5 100644
- --- a/libavcodec/x86/h264_intrapred_10bit.asm
- +++ b/libavcodec/x86/h264_intrapred_10bit.asm
- @@ -27,8 +27,6 @@
- SECTION_RODATA
- -SECTION .text
- -
- cextern pw_16
- cextern pw_8
- cextern pw_4
- @@ -42,6 +40,8 @@ pw_512: times 8 dw 512
- pd_17: times 4 dd 17
- pd_16: times 4 dd 16
- +SECTION .text
- +
- ; dest, left, right, src
- ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
- %macro PRED4x4_LOWPASS 4
- @@ -64,13 +64,11 @@ cglobal pred4x4_down_right_10_%1, 3,3
- movq m3, [r0]
- punpckhdq m1, m2
- PALIGNR m3, m1, 10, m1
- - mova m1, m3
- movhps m4, [r1+r2*1-8]
- - PALIGNR m3, m4, 14, m4
- - mova m2, m3
- + PALIGNR m0, m3, m4, 14, m4
- movhps m4, [r1+r2*2-8]
- - PALIGNR m3, m4, 14, m4
- - PRED4x4_LOWPASS m0, m3, m1, m2
- + PALIGNR m2, m0, m4, 14, m4
- + PRED4x4_LOWPASS m0, m2, m3, m0
- movq [r1+r2*2], m0
- psrldq m0, 2
- movq [r1+r2*1], m0
- @@ -104,22 +102,20 @@ cglobal pred4x4_vertical_right_10_%1, 3,3,6
- pavgw m5, m0
- movhps m1, [r0+r2*1-8]
- PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0
- - mova m1, m0
- movhps m2, [r0+r2*2-8]
- - PALIGNR m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1
- - mova m2, m0
- + PALIGNR m1, m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1
- movhps m3, [r1+r2*1-8]
- - PALIGNR m0, m3, 14, m3 ; t3t2t1t0ltl0l1l2
- - PRED4x4_LOWPASS m3, m1, m0, m2
- - pslldq m1, m3, 12
- - psrldq m3, 4
- + PALIGNR m2, m1, m3, 14, m3 ; t3t2t1t0ltl0l1l2
- + PRED4x4_LOWPASS m1, m0, m2, m1
- + pslldq m0, m1, 12
- + psrldq m1, 4
- movq [r0+r2*1], m5
- - movq [r0+r2*2], m3
- - PALIGNR m5, m1, 14, m2
- - pslldq m1, 2
- + movq [r0+r2*2], m1
- + PALIGNR m5, m0, 14, m2
- + pslldq m0, 2
- movq [r1+r2*1], m5
- - PALIGNR m3, m1, 14, m1
- - movq [r1+r2*2], m3
- + PALIGNR m1, m0, 14, m0
- + movq [r1+r2*2], m1
- RET
- %endmacro
- @@ -152,9 +148,9 @@ cglobal pred4x4_horizontal_down_10_%1, 3,3
- punpckhdq m1, m2 ; l0 l1 l2 l3
- punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
- psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1
- - psrldq m2, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2
- - pavgw m5, m1, m2
- - PRED4x4_LOWPASS m3, m1, m0, m2
- + psrldq m3, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2
- + pavgw m5, m1, m3
- + PRED4x4_LOWPASS m3, m1, m0, m3
- punpcklwd m5, m3
- psrldq m3, 8
- PALIGNR m3, m5, 12, m4
- @@ -224,13 +220,12 @@ cglobal pred4x4_dc_10_mmxext, 3,3
- %macro PRED4x4_DL 1
- cglobal pred4x4_down_left_10_%1, 3,3
- sub r0, r2
- - movq m1, [r0]
- - movhps m1, [r1]
- - pslldq m5, m1, 2
- - pxor m2, m5, m1
- - psrldq m2, 2
- - pxor m3, m1, m2
- - PRED4x4_LOWPASS m0, m5, m3, m1
- + movq m0, [r0]
- + movhps m0, [r1]
- + psrldq m2, m0, 2
- + pslldq m3, m0, 2
- + pshufhw m2, m2, 10100100b
- + PRED4x4_LOWPASS m0, m3, m2, m0
- lea r1, [r0+r2*2]
- movhps [r1+r2*2], m0
- psrldq m0, 2
- @@ -257,10 +252,10 @@ cglobal pred4x4_vertical_left_10_%1, 3,3
- sub r0, r2
- movu m1, [r0]
- movhps m1, [r1]
- - psrldq m3, m1, 2
- + psrldq m0, m1, 2
- psrldq m2, m1, 4
- - pavgw m4, m3, m1
- - PRED4x4_LOWPASS m0, m1, m2, m3
- + pavgw m4, m0, m1
- + PRED4x4_LOWPASS m0, m1, m2, m0
- lea r1, [r0+r2*2]
- movq [r0+r2*1], m4
- movq [r0+r2*2], m0
- @@ -333,7 +328,7 @@ cglobal pred8x8_vertical_10_sse2, 2,2
- ;-----------------------------------------------------------------------------
- INIT_XMM
- cglobal pred8x8_horizontal_10_sse2, 2,3
- - mov r2, 4
- + mov r2d, 4
- .loop:
- movq m0, [r0+r1*0-8]
- movq m1, [r0+r1*1-8]
- @@ -344,7 +339,7 @@ cglobal pred8x8_horizontal_10_sse2, 2,3
- mova [r0+r1*0], m0
- mova [r0+r1*1], m1
- lea r0, [r0+r1*2]
- - dec r2
- + dec r2d
- jg .loop
- REP_RET
- @@ -402,7 +397,7 @@ cglobal pred8x8_dc_10_%1, 2,4
- punpcklwd m2, m3
- punpckldq m0, m2 ; s0, s1, s2, s3
- %2 m3, m0, 11110110b ; s2, s1, s3, s3
- - lea r2, [r1+r1*2]
- + lea r2, [r1*3]
- %2 m0, m0, 01110100b ; s0, s1, s3, s1
- paddw m0, m3
- lea r3, [r0+r1*4]
- @@ -445,7 +440,7 @@ cglobal pred8x8_top_dc_10_%1, 2,4
- movq m1, [r0+8]
- HADDW m0, m2
- HADDW m1, m3
- - lea r2, [r1+r1*2]
- + lea r2, [r1*3]
- paddw m0, [pw_2]
- paddw m1, [pw_2]
- lea r3, [r0+r1*4]
- @@ -478,7 +473,7 @@ PRED8x8_TOP_DC sse2 , pshuflw
- INIT_XMM
- cglobal pred8x8_plane_10_sse2, 2,7,7
- sub r0, r1
- - lea r2, [r1+r1*2]
- + lea r2, [r1*3]
- lea r3, [r0+r1*4]
- mova m2, [r0]
- pmaddwd m2, [pw_m32101234]
- @@ -500,7 +495,7 @@ cglobal pred8x8_plane_10_sse2, 2,7,7
- movzx r5d, word [r3+r2*1-2] ; src[6*stride-1]
- movzx r6d, word [r0+r1*1-2] ; src[0*stride-1]
- sub r5d, r6d
- - lea r5d, [r5+r5*2]
- + lea r5d, [r5*3]
- add r4d, r5d
- movzx r6d, word [r3+r1*4-2] ; src[7*stride-1]
- movzx r5d, word [r0+r1*0-2] ; src[ -stride-1]
- @@ -541,7 +536,7 @@ cglobal pred8x8_plane_10_sse2, 2,7,7
- %macro PRED8x8L_128_DC 1
- cglobal pred8x8l_128_dc_10_%1, 4,4
- mova m0, [pw_512]
- - lea r1, [r3+r3*2]
- + lea r1, [r3*3]
- lea r2, [r0+r3*4]
- MOV8 r0+r3*0, m0, m0
- MOV8 r0+r3*1, m0, m0
- @@ -562,38 +557,69 @@ PRED8x8L_128_DC sse2
- ;-----------------------------------------------------------------------------
- ; void pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride)
- ;-----------------------------------------------------------------------------
- +%macro FIX_LT_2 3-4
- +%ifidn %1,sse4
- + pblendw %2, %3, 00000001b
- +%elifidn %1,avx
- + pblendw %2, %3, 00000001b
- +%else
- + mova %4, %3
- + pxor %4, %2
- + pslldq %4, 14
- + psrldq %4, 14
- + pxor %2, %4
- +%endif
- +%endmacro
- +
- +%macro FIX_TR_1 3-4
- +%ifidn %1,sse4
- + pblendw %2, %3, 10000000b
- +%elifidn %1,avx
- + pblendw %2, %3, 10000000b
- +%else
- + mova %4, %3
- + pxor %4, %2
- + psrldq %4, 14
- + pslldq %4, 14
- + pxor %2, %4
- +%endif
- +%endmacro
- +
- +%macro FIX_LT_1 3-5
- +%ifidn %1, sse4
- + pblendw %2, %3, 01000000b
- +%elifidn %1,avx
- + pblendw %2, %3, 01000000b
- +%else
- + mova %5, %3
- + pxor %5, %4
- + psrldq %5, 14
- + pslldq %5, 12
- + pxor %2, %5
- +%endif
- +%endmacro
- +
- %macro PRED8x8L_TOP_DC 1
- cglobal pred8x8l_top_dc_10_%1, 4,4,6
- sub r0, r3
- - pxor m7, m7
- mova m0, [r0-16]
- mova m3, [r0]
- mova m1, [r0+16]
- - mova m2, m3
- - mova m4, m3
- - PALIGNR m2, m0, 14, m0
- - PALIGNR m1, m4, 2, m4
- - test r1, r1 ; top_left
- + PALIGNR m2, m3, m0, 14, m0
- + PALIGNR m1, m3, 2, m4
- + test r1d, r1d ; top_left
- jz .fix_lt_2
- - test r2, r2 ; top_right
- + test r2d, r2d ; top_right
- jz .fix_tr_1
- jmp .body
- .fix_lt_2:
- - mova m5, m3
- - pxor m5, m2
- - pslldq m5, 14
- - psrldq m5, 14
- - pxor m2, m5
- - test r2, r2 ; top_right
- + FIX_LT_2 %1, m2, m3, m5
- + test r2d, r2d ; top_right
- jnz .body
- .fix_tr_1:
- - mova m5, m3
- - pxor m5, m1
- - psrldq m5, 14
- - pslldq m5, 14
- - pxor m1, m5
- + FIX_TR_1 %1, m1, m3, m5
- .body
- - lea r1, [r3+r3*2]
- + lea r1, [r3*3]
- lea r2, [r0+r3*4]
- PRED4x4_LOWPASS m0, m2, m1, m3
- HADDW m0, m1
- @@ -616,98 +642,73 @@ INIT_XMM
- PRED8x8L_TOP_DC sse2
- %define PALIGNR PALIGNR_SSSE3
- PRED8x8L_TOP_DC ssse3
- +PRED8x8L_TOP_DC sse4
- ;-----------------------------------------------------------------------------
- ;void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
- ;-----------------------------------------------------------------------------
- ;TODO: see if scalar is faster
- %macro PRED8x8L_DC 1
- -cglobal pred8x8l_dc_10_%1, 4,5,8
- +cglobal pred8x8l_dc_10_%1, 4,6,6
- sub r0, r3
- - lea r4, [r0+r3*2]
- + lea r4, [r0+r3*4]
- + lea r5, [r3*3]
- mova m0, [r0+r3*1-16]
- punpckhwd m0, [r0+r3*0-16]
- - mova m1, [r4+r3*1-16]
- + mova m1, [r0+r5*1-16]
- punpckhwd m1, [r0+r3*2-16]
- - mov r4, r0
- punpckhdq m1, m0
- - lea r0, [r0+r3*4]
- - mova m2, [r0+r3*1-16]
- - punpckhwd m2, [r0+r3*0-16]
- - lea r0, [r0+r3*2]
- - mova m3, [r0+r3*1-16]
- - punpckhwd m3, [r0+r3*0-16]
- + mova m2, [r4+r3*1-16]
- + punpckhwd m2, [r4+r3*0-16]
- + mova m3, [r4+r5*1-16]
- + punpckhwd m3, [r4+r3*2-16]
- punpckhdq m3, m2
- punpckhqdq m3, m1
- - lea r0, [r0+r3*2]
- - mova m0, [r0+r3*0-16]
- + mova m0, [r4+r3*4-16]
- mova m1, [r4]
- - mov r0, r4
- - mova m4, m3
- - mova m2, m3
- - PALIGNR m4, m0, 14, m0
- - PALIGNR m1, m2, 2, m2
- - test r1, r1
- + PALIGNR m4, m3, m0, 14, m0
- + PALIGNR m1, m3, 2, m2
- + test r1d, r1d
- jnz .do_left
- .fix_lt_1:
- - mova m5, m3
- - pxor m5, m4
- - psrldq m5, 14
- - pslldq m5, 12
- - pxor m1, m5
- + FIX_LT_1 %1, m1, m3, m4, m5
- jmp .do_left
- .fix_lt_2:
- - mova m5, m3
- - pxor m5, m2
- - pslldq m5, 14
- - psrldq m5, 14
- - pxor m2, m5
- - test r2, r2
- + FIX_LT_2 %1, m2, m3, m4
- + test r2d, r2d
- jnz .body
- .fix_tr_1:
- - mova m5, m3
- - pxor m5, m1
- - psrldq m5, 14
- - pslldq m5, 14
- - pxor m1, m5
- + FIX_TR_1 %1, m1, m3, m4
- jmp .body
- .do_left:
- - mova m0, m4
- PRED4x4_LOWPASS m2, m1, m4, m3
- - mova m4, m0
- - mova m7, m2
- - PRED4x4_LOWPASS m1, m3, m0, m4
- + PRED4x4_LOWPASS m1, m3, m4, m4
- pslldq m1, 14
- - PALIGNR m7, m1, 14, m3
- + PALIGNR m5, m2, m1, 14, m3
- mova m0, [r0-16]
- mova m3, [r0]
- mova m1, [r0+16]
- - mova m2, m3
- - mova m4, m3
- - PALIGNR m2, m0, 14, m0
- - PALIGNR m1, m4, 2, m4
- - test r1, r1
- + PALIGNR m2, m3, m0, 14, m0
- + PALIGNR m1, m3, 2, m4
- + test r1d, r1d
- jz .fix_lt_2
- - test r2, r2
- + test r2d, r2d
- jz .fix_tr_1
- -.body
- - lea r1, [r3+r3*2]
- - PRED4x4_LOWPASS m6, m2, m1, m3
- - HADDW m7, m0
- - HADDW m6, m0
- - lea r2, [r0+r3*4]
- - paddw m7, [pw_8]
- - paddw m7, m6
- - psrlw m7, 4
- - SPLATW m7, m7
- - mova [r0+r3*1], m7
- - mova [r0+r3*2], m7
- - mova [r0+r1*1], m7
- - mova [r0+r3*4], m7
- - mova [r2+r3*1], m7
- - mova [r2+r3*2], m7
- - mova [r2+r1*1], m7
- - mova [r2+r3*4], m7
- +.body:
- + PRED4x4_LOWPASS m4, m2, m1, m3
- + paddw m5, m4
- + HADDW m5, m0
- + paddw m5, [pw_8]
- + psrlw m5, 4
- + SPLATW m5, m5
- + mova [r0+r3*1], m5
- + mova [r0+r3*2], m5
- + mova [r0+r5*1], m5
- + mova [r0+r3*4], m5
- + mova [r4+r3*1], m5
- + mova [r4+r3*2], m5
- + mova [r4+r5*1], m5
- + mova [r4+r3*4], m5
- RET
- %endmacro
- @@ -716,6 +717,11 @@ INIT_XMM
- PRED8x8L_DC sse2
- %define PALIGNR PALIGNR_SSSE3
- PRED8x8L_DC ssse3
- +PRED8x8L_DC sse4
- +%ifdef HAVE_AVX
- +INIT_AVX
- +PRED8x8L_DC avx
- +%endif
- ;-----------------------------------------------------------------------------
- ; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride)
- @@ -723,36 +729,17 @@ PRED8x8L_DC ssse3
- %macro PRED8x8L_VERTICAL 1
- cglobal pred8x8l_vertical_10_%1, 4,4,6
- sub r0, r3
- - mova m0, [r0-16]
- - mova m3, [r0]
- - mova m1, [r0+16]
- - mova m2, m3
- - mova m4, m3
- - PALIGNR m2, m0, 14, m0
- - PALIGNR m1, m4, 2, m4
- - test r1, r1 ; top_left
- - jz .fix_lt_2
- - test r2, r2 ; top_right
- - jz .fix_tr_1
- - jmp .body
- -.fix_lt_2:
- - mova m5, m3
- - pxor m5, m2
- - pslldq m5, 14
- - psrldq m5, 14
- - pxor m2, m5
- - test r2, r2 ; top_right
- - jnz .body
- -.fix_tr_1:
- - mova m5, m3
- - pxor m5, m1
- - psrldq m5, 14
- - pslldq m5, 14
- - pxor m1, m5
- -.body
- - lea r1, [r3+r3*2]
- + mova m0, [r0]
- + shr r1d, 14
- + shr r2d, 13
- + neg r1
- + pslldq m1, m0, 2
- + psrldq m2, m0, 2
- + pinsrw m1, [r0+r1], 0
- + pinsrw m2, [r0+r2+14], 7
- + lea r1, [r3*3]
- lea r2, [r0+r3*4]
- - PRED4x4_LOWPASS m0, m2, m1, m3
- + PRED4x4_LOWPASS m0, m2, m1, m0
- mova [r0+r3*1], m0
- mova [r0+r3*2], m0
- mova [r0+r1*1], m0
- @@ -765,70 +752,57 @@ cglobal pred8x8l_vertical_10_%1, 4,4,6
- %endmacro
- INIT_XMM
- -%define PALIGNR PALIGNR_MMX
- PRED8x8L_VERTICAL sse2
- -%define PALIGNR PALIGNR_SSSE3
- -PRED8x8L_VERTICAL ssse3
- +%ifdef HAVE_AVX
- +INIT_AVX
- +PRED8x8L_VERTICAL avx
- +%endif
- ;-----------------------------------------------------------------------------
- ; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
- ;-----------------------------------------------------------------------------
- %macro PRED8x8L_HORIZONTAL 1
- -cglobal pred8x8l_horizontal_10_%1, 4,4,8
- - sub r0, r3
- - lea r2, [r0+r3*2]
- - mova m0, [r0+r3*1-16]
- - test r1, r1
- - lea r1, [r0+r3]
- - cmovnz r1, r0
- - punpckhwd m0, [r1+r3*0-16]
- - mova m1, [r2+r3*1-16]
- - punpckhwd m1, [r0+r3*2-16]
- +cglobal pred8x8l_horizontal_10_%1, 4,4,5
- + mova m0, [r0-16]
- mov r2, r0
- + sub r2, r3
- + test r1d, r1d
- + cmovz r2, r0
- + punpckhwd m0, [r2+r3*0-16]
- + mova m1, [r0+r3*2-16]
- + punpckhwd m1, [r0+r3*1-16]
- + lea r1, [r3*3]
- punpckhdq m1, m0
- - lea r0, [r0+r3*4]
- - mova m2, [r0+r3*1-16]
- - punpckhwd m2, [r0+r3*0-16]
- - lea r0, [r0+r3*2]
- - mova m3, [r0+r3*1-16]
- - punpckhwd m3, [r0+r3*0-16]
- + mova m2, [r0+r3*4-16]
- + punpckhwd m2, [r0+r1-16]
- + lea r2, [r0+r3*4]
- + mova m3, [r2+r3*2-16]
- + punpckhwd m3, [r2+r3*1-16]
- punpckhdq m3, m2
- punpckhqdq m3, m1
- - lea r0, [r0+r3*2]
- - mova m0, [r0+r3*0-16]
- - mova m1, [r1+r3*0-16]
- - mov r0, r2
- - mova m4, m3
- - mova m2, m3
- - PALIGNR m4, m0, 14, m0
- - PALIGNR m1, m2, 2, m2
- - mova m0, m4
- - PRED4x4_LOWPASS m2, m1, m4, m3
- - mova m4, m0
- - mova m7, m2
- - PRED4x4_LOWPASS m1, m3, m0, m4
- - pslldq m1, 14
- - PALIGNR m7, m1, 14, m3
- - lea r1, [r3+r3*2]
- - punpckhwd m3, m7, m7
- - punpcklwd m7, m7
- + mova m0, [r2+r1-16]
- + PALIGNR m4, m3, m0, 14, m0
- + pslldq m0, m4, 2
- + pshuflw m0, m0, 11100101b
- + PRED4x4_LOWPASS m4, m3, m0, m4
- + punpckhwd m3, m4, m4
- + punpcklwd m4, m4
- pshufd m0, m3, 0xff
- pshufd m1, m3, 0xaa
- - lea r2, [r0+r3*4]
- pshufd m2, m3, 0x55
- pshufd m3, m3, 0x00
- - pshufd m4, m7, 0xff
- - pshufd m5, m7, 0xaa
- - pshufd m6, m7, 0x55
- - pshufd m7, m7, 0x00
- - mova [r0+r3*1], m0
- - mova [r0+r3*2], m1
- - mova [r0+r1*1], m2
- - mova [r0+r3*4], m3
- - mova [r2+r3*1], m4
- - mova [r2+r3*2], m5
- - mova [r2+r1*1], m6
- - mova [r2+r3*4], m7
- + mova [r0+r3*0], m0
- + mova [r0+r3*1], m1
- + mova [r0+r3*2], m2
- + mova [r0+r1*1], m3
- + pshufd m0, m4, 0xff
- + pshufd m1, m4, 0xaa
- + pshufd m2, m4, 0x55
- + pshufd m3, m4, 0x00
- + mova [r2+r3*0], m0
- + mova [r2+r3*1], m1
- + mova [r2+r3*2], m2
- + mova [r2+r1*1], m3
- RET
- %endmacro
- @@ -837,114 +811,77 @@ INIT_XMM
- PRED8x8L_HORIZONTAL sse2
- %define PALIGNR PALIGNR_SSSE3
- PRED8x8L_HORIZONTAL ssse3
- +%ifdef HAVE_AVX
- +INIT_AVX
- +PRED8x8L_HORIZONTAL avx
- +%endif
- ;-----------------------------------------------------------------------------
- ;void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride)
- ;-----------------------------------------------------------------------------
- %macro PRED8x8L_DOWN_LEFT 1
- -cglobal pred8x8l_down_left_10_%1, 4,4,8
- +cglobal pred8x8l_down_left_10_%1, 4,4,7
- sub r0, r3
- mova m0, [r0-16]
- mova m3, [r0]
- mova m1, [r0+16]
- - mova m2, m3
- - mova m4, m3
- - PALIGNR m2, m0, 14, m0
- - PALIGNR m1, m4, 2, m4
- - test r1, r1
- + PALIGNR m2, m3, m0, 14, m0
- + PALIGNR m1, m3, 2, m4
- + test r1d, r1d
- jz .fix_lt_2
- - test r2, r2
- + test r2d, r2d
- jz .fix_tr_1
- jmp .do_top
- .fix_lt_2:
- - mova m5, m3
- - pxor m5, m2
- - pslldq m5, 14
- - psrldq m5, 14
- - pxor m2, m5
- - test r2, r2
- + FIX_LT_2 %1, m2, m3, m5
- + test r2d, r2d
- jnz .do_top
- .fix_tr_1:
- - mova m5, m3
- - pxor m5, m1
- - psrldq m5, 14
- - pslldq m5, 14
- - pxor m1, m5
- + FIX_TR_1 %1, m1, m3, m5
- jmp .do_top
- .fix_tr_2:
- punpckhwd m3, m3
- pshufd m1, m3, 0xFF
- jmp .do_topright
- .do_top:
- - PRED4x4_LOWPASS m4, m2, m1, m3
- - mova m7, m4
- - test r2, r2
- + PRED4x4_LOWPASS m6, m2, m1, m3
- + test r2d, r2d
- jz .fix_tr_2
- - mova m0, [r0+16]
- - mova m5, m0
- - mova m2, m0
- - mova m4, m0
- - psrldq m5, 14
- - PALIGNR m2, m3, 14, m3
- - PALIGNR m5, m4, 2, m4
- - PRED4x4_LOWPASS m1, m2, m5, m0
- + mova m1, [r0+16]
- + psrldq m5, m1, 2
- + PALIGNR m2, m1, m3, 14, m3
- + pshufhw m5, m5, 10100100b
- + PRED4x4_LOWPASS m1, m2, m5, m1
- .do_topright:
- - lea r1, [r3+r3*2]
- - mova m6, m1
- - psrldq m1, 14
- - mova m4, m1
- + lea r1, [r3*3]
- + psrldq m5, m1, 14
- lea r2, [r0+r3*4]
- - mova m2, m6
- - PALIGNR m2, m7, 2, m0
- - mova m3, m6
- - PALIGNR m3, m7, 14, m0
- - PALIGNR m4, m6, 2, m0
- - mova m5, m7
- - mova m1, m7
- - mova m7, m6
- - pslldq m1, 2
- - PRED4x4_LOWPASS m0, m1, m2, m5
- - PRED4x4_LOWPASS m1, m3, m4, m7
- + PALIGNR m2, m1, m6, 2, m0
- + PALIGNR m3, m1, m6, 14, m0
- + PALIGNR m5, m1, 2, m0
- + pslldq m4, m6, 2
- + PRED4x4_LOWPASS m6, m4, m2, m6
- + PRED4x4_LOWPASS m1, m3, m5, m1
- mova [r2+r3*4], m1
- - mova m2, m0
- - pslldq m1, 2
- - psrldq m2, 14
- - pslldq m0, 2
- - por m1, m2
- + PALIGNR m1, m6, 14, m2
- + pslldq m6, 2
- mova [r2+r1*1], m1
- - mova m2, m0
- - pslldq m1, 2
- - psrldq m2, 14
- - pslldq m0, 2
- - por m1, m2
- + PALIGNR m1, m6, 14, m2
- + pslldq m6, 2
- mova [r2+r3*2], m1
- - mova m2, m0
- - pslldq m1, 2
- - psrldq m2, 14
- - pslldq m0, 2
- - por m1, m2
- + PALIGNR m1, m6, 14, m2
- + pslldq m6, 2
- mova [r2+r3*1], m1
- - mova m2, m0
- - pslldq m1, 2
- - psrldq m2, 14
- - pslldq m0, 2
- - por m1, m2
- + PALIGNR m1, m6, 14, m2
- + pslldq m6, 2
- mova [r0+r3*4], m1
- - mova m2, m0
- - pslldq m1, 2
- - psrldq m2, 14
- - pslldq m0, 2
- - por m1, m2
- + PALIGNR m1, m6, 14, m2
- + pslldq m6, 2
- mova [r0+r1*1], m1
- - mova m2, m0
- - pslldq m1, 2
- - psrldq m2, 14
- - pslldq m0, 2
- - por m1, m2
- + PALIGNR m1, m6, 14, m2
- + pslldq m6, 2
- mova [r0+r3*2], m1
- - pslldq m1, 2
- - psrldq m0, 14
- - por m1, m0
- + PALIGNR m1, m6, 14, m6
- mova [r0+r3*1], m1
- RET
- %endmacro
- @@ -954,139 +891,91 @@ INIT_XMM
- PRED8x8L_DOWN_LEFT sse2
- %define PALIGNR PALIGNR_SSSE3
- PRED8x8L_DOWN_LEFT ssse3
- +PRED8x8L_DOWN_LEFT sse4
- +%ifdef HAVE_AVX
- +INIT_AVX
- +PRED8x8L_DOWN_LEFT avx
- +%endif
- ;-----------------------------------------------------------------------------
- -;void pred8x8l_down_right_mxext(pixel *src, int has_topleft, int has_topright, int stride)
- +;void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride)
- ;-----------------------------------------------------------------------------
- %macro PRED8x8L_DOWN_RIGHT 1
- -cglobal pred8x8l_down_right_10_%1, 4,5,8
- +cglobal pred8x8l_down_right_10_%1, 4,6,8
- sub r0, r3
- - lea r4, [r0+r3*2]
- + lea r4, [r0+r3*4]
- + lea r5, [r3*3]
- mova m0, [r0+r3*1-16]
- punpckhwd m0, [r0+r3*0-16]
- - mova m1, [r4+r3*1-16]
- + mova m1, [r0+r5*1-16]
- punpckhwd m1, [r0+r3*2-16]
- - mov r4, r0
- punpckhdq m1, m0
- - lea r0, [r0+r3*4]
- - mova m2, [r0+r3*1-16]
- - punpckhwd m2, [r0+r3*0-16]
- - lea r0, [r0+r3*2]
- - mova m3, [r0+r3*1-16]
- - punpckhwd m3, [r0+r3*0-16]
- + mova m2, [r4+r3*1-16]
- + punpckhwd m2, [r4+r3*0-16]
- + mova m3, [r4+r5*1-16]
- + punpckhwd m3, [r4+r3*2-16]
- punpckhdq m3, m2
- punpckhqdq m3, m1
- - lea r0, [r0+r3*2]
- - mova m0, [r0+r3*0-16]
- - mova m1, [r4]
- - mov r0, r4
- - mova m4, m3
- - mova m2, m3
- - PALIGNR m4, m0, 14, m0
- - PALIGNR m1, m2, 2, m2
- - test r1, r1 ; top_left
- + mova m0, [r4+r3*4-16]
- + mova m1, [r0]
- + PALIGNR m4, m3, m0, 14, m0
- + PALIGNR m1, m3, 2, m2
- + test r1d, r1d
- jz .fix_lt_1
- .do_left:
- - mova m0, m4
- - PRED4x4_LOWPASS m2, m1, m4, m3
- - mova m4, m0
- - mova m7, m2
- - mova m6, m2
- - PRED4x4_LOWPASS m1, m3, m0, m4
- - pslldq m1, 14
- - PALIGNR m7, m1, 14, m3
- + pslldq m0, m4, 2
- + pshuflw m0, m0, 11100101b
- + PRED4x4_LOWPASS m6, m1, m4, m3
- + PRED4x4_LOWPASS m4, m3, m0, m4
- mova m0, [r0-16]
- mova m3, [r0]
- mova m1, [r0+16]
- - mova m2, m3
- - mova m4, m3
- - PALIGNR m2, m0, 14, m0
- - PALIGNR m1, m4, 2, m4
- - test r1, r1 ; top_left
- + PALIGNR m2, m3, m0, 14, m0
- + PALIGNR m1, m3, 2, m7
- + test r1d, r1d ; top_left
- jz .fix_lt_2
- - test r2, r2 ; top_right
- + test r2d, r2d ; top_right
- jz .fix_tr_1
- .do_top:
- - PRED4x4_LOWPASS m4, m2, m1, m3
- - mova m5, m4
- + PRED4x4_LOWPASS m3, m2, m1, m3
- jmp .body
- .fix_lt_1:
- - mova m5, m3
- - pxor m5, m4
- - psrldq m5, 14
- - pslldq m5, 12
- - pxor m1, m5
- + FIX_LT_1 %1, m1, m3, m4, m5
- jmp .do_left
- .fix_lt_2:
- - mova m5, m3
- - pxor m5, m2
- - pslldq m5, 14
- - psrldq m5, 14
- - pxor m2, m5
- - test r2, r2 ; top_right
- + FIX_LT_2 %1, m2, m3, m5
- + test r2d, r2d ; top_right
- jnz .do_top
- .fix_tr_1:
- - mova m5, m3
- - pxor m5, m1
- - psrldq m5, 14
- - pslldq m5, 14
- - pxor m1, m5
- + FIX_TR_1 %1, m1, m3, m5
- jmp .do_top
- .body
- - lea r1, [r3+r3*2]
- - mova m1, m7
- - mova m7, m5
- - mova m5, m6
- - mova m2, m7
- - lea r2, [r0+r3*4]
- - PALIGNR m2, m6, 2, m0
- - mova m3, m7
- - PALIGNR m3, m6, 14, m0
- - mova m4, m7
- - psrldq m4, 2
- - PRED4x4_LOWPASS m0, m1, m2, m5
- - PRED4x4_LOWPASS m1, m3, m4, m7
- - mova [r2+r3*4], m0
- - mova m2, m1
- - psrldq m0, 2
- - pslldq m2, 14
- - psrldq m1, 2
- - por m0, m2
- - mova [r2+r1*1], m0
- - mova m2, m1
- - psrldq m0, 2
- - pslldq m2, 14
- - psrldq m1, 2
- - por m0, m2
- - mova [r2+r3*2], m0
- - mova m2, m1
- - psrldq m0, 2
- - pslldq m2, 14
- - psrldq m1, 2
- - por m0, m2
- - mova [r2+r3*1], m0
- - mova m2, m1
- - psrldq m0, 2
- - pslldq m2, 14
- - psrldq m1, 2
- - por m0, m2
- - mova [r0+r3*4], m0
- - mova m2, m1
- - psrldq m0, 2
- - pslldq m2, 14
- - psrldq m1, 2
- - por m0, m2
- - mova [r0+r1*1], m0
- - mova m2, m1
- - psrldq m0, 2
- - pslldq m2, 14
- - psrldq m1, 2
- - por m0, m2
- - mova [r0+r3*2], m0
- - psrldq m0, 2
- - pslldq m1, 14
- - por m0, m1
- - mova [r0+r3*1], m0
- + PALIGNR m2, m3, m6, 2, m0
- + PALIGNR m5, m3, m6, 14, m0
- + psrldq m7, m3, 2
- + PRED4x4_LOWPASS m6, m4, m2, m6
- + PRED4x4_LOWPASS m3, m5, m7, m3
- + mova [r4+r3*4], m6
- + PALIGNR m3, m6, 14, m2
- + pslldq m6, 2
- + mova [r0+r3*1], m3
- + PALIGNR m3, m6, 14, m2
- + pslldq m6, 2
- + mova [r0+r3*2], m3
- + PALIGNR m3, m6, 14, m2
- + pslldq m6, 2
- + mova [r0+r5*1], m3
- + PALIGNR m3, m6, 14, m2
- + pslldq m6, 2
- + mova [r0+r3*4], m3
- + PALIGNR m3, m6, 14, m2
- + pslldq m6, 2
- + mova [r4+r3*1], m3
- + PALIGNR m3, m6, 14, m2
- + pslldq m6, 2
- + mova [r4+r3*2], m3
- + PALIGNR m3, m6, 14, m6
- + mova [r4+r5*1], m3
- RET
- %endmacro
- @@ -1095,114 +984,86 @@ INIT_XMM
- PRED8x8L_DOWN_RIGHT sse2
- %define PALIGNR PALIGNR_SSSE3
- PRED8x8L_DOWN_RIGHT ssse3
- +PRED8x8L_DOWN_RIGHT sse4
- +%ifdef HAVE_AVX
- +INIT_AVX
- +PRED8x8L_DOWN_RIGHT avx
- +%endif
- ;-----------------------------------------------------------------------------
- ; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride)
- ;-----------------------------------------------------------------------------
- %macro PRED8x8L_VERTICAL_RIGHT 1
- -cglobal pred8x8l_vertical_right_10_%1, 4,5,8
- +cglobal pred8x8l_vertical_right_10_%1, 4,6,8
- sub r0, r3
- - lea r4, [r0+r3*2]
- + lea r4, [r0+r3*4]
- + lea r5, [r3*3]
- mova m0, [r0+r3*1-16]
- punpckhwd m0, [r0+r3*0-16]
- - mova m1, [r4+r3*1-16]
- + mova m1, [r0+r5*1-16]
- punpckhwd m1, [r0+r3*2-16]
- - mov r4, r0
- punpckhdq m1, m0
- - lea r0, [r0+r3*4]
- - mova m2, [r0+r3*1-16]
- - punpckhwd m2, [r0+r3*0-16]
- - lea r0, [r0+r3*2]
- - mova m3, [r0+r3*1-16]
- - punpckhwd m3, [r0+r3*0-16]
- + mova m2, [r4+r3*1-16]
- + punpckhwd m2, [r4+r3*0-16]
- + mova m3, [r4+r5*1-16]
- + punpckhwd m3, [r4+r3*2-16]
- punpckhdq m3, m2
- punpckhqdq m3, m1
- - lea r0, [r0+r3*2]
- - mova m0, [r0+r3*0-16]
- - mova m1, [r4]
- - mov r0, r4
- - mova m4, m3
- - mova m2, m3
- - PALIGNR m4, m0, 14, m0
- - PALIGNR m1, m2, 2, m2
- - test r1, r1
- - jz .fix_lt_1
- - jmp .do_left
- + mova m0, [r4+r3*4-16]
- + mova m1, [r0]
- + PALIGNR m4, m3, m0, 14, m0
- + PALIGNR m1, m3, 2, m2
- + test r1d, r1d
- + jnz .do_left
- .fix_lt_1:
- - mova m5, m3
- - pxor m5, m4
- - psrldq m5, 14
- - pslldq m5, 12
- - pxor m1, m5
- + FIX_LT_1 %1, m1, m3, m4, m5
- jmp .do_left
- .fix_lt_2:
- - mova m5, m3
- - pxor m5, m2
- - pslldq m5, 14
- - psrldq m5, 14
- - pxor m2, m5
- - test r2, r2
- + FIX_LT_2 %1, m7, m2, m5
- + test r2d, r2d
- jnz .do_top
- .fix_tr_1:
- - mova m5, m3
- - pxor m5, m1
- - psrldq m5, 14
- - pslldq m5, 14
- - pxor m1, m5
- + FIX_TR_1 %1, m1, m2, m5
- jmp .do_top
- .do_left:
- - mova m0, m4
- - PRED4x4_LOWPASS m2, m1, m4, m3
- - mova m7, m2
- + PRED4x4_LOWPASS m3, m1, m4, m3
- mova m0, [r0-16]
- - mova m3, [r0]
- + mova m2, [r0]
- mova m1, [r0+16]
- - mova m2, m3
- - mova m4, m3
- - PALIGNR m2, m0, 14, m0
- - PALIGNR m1, m4, 2, m4
- - test r1, r1
- + PALIGNR m7, m2, m0, 14, m0
- + PALIGNR m1, m2, 2, m4
- + test r1d, r1d
- jz .fix_lt_2
- - test r2, r2
- + test r2d, r2d
- jz .fix_tr_1
- .do_top
- - PRED4x4_LOWPASS m6, m2, m1, m3
- - lea r1, [r3+r3*2]
- - mova m2, m6
- - mova m3, m6
- - PALIGNR m3, m7, 14, m0
- - PALIGNR m6, m7, 12, m1
- - mova m4, m3
- - pavgw m3, m2
- - lea r2, [r0+r3*4]
- - PRED4x4_LOWPASS m0, m6, m2, m4
- - mova [r0+r3*1], m3
- + PRED4x4_LOWPASS m2, m7, m1, m2
- + PALIGNR m6, m2, m3, 12, m1
- + PALIGNR m7, m2, m3, 14, m0
- + PRED4x4_LOWPASS m0, m6, m2, m7
- + pavgw m2, m7
- mova [r0+r3*2], m0
- - mova m5, m0
- - mova m6, m3
- - mova m1, m7
- - mova m2, m1
- - pslldq m2, 2
- - mova m3, m1
- - pslldq m3, 4
- - PRED4x4_LOWPASS m0, m1, m3, m2
- - PALIGNR m6, m0, 14, m2
- - mova [r0+r1*1], m6
- - pslldq m0, 2
- - PALIGNR m5, m0, 14, m1
- - mova [r0+r3*4], m5
- - pslldq m0, 2
- - PALIGNR m6, m0, 14, m2
- - mova [r2+r3*1], m6
- - pslldq m0, 2
- - PALIGNR m5, m0, 14, m1
- - mova [r2+r3*2], m5
- - pslldq m0, 2
- - PALIGNR m6, m0, 14, m2
- - mova [r2+r1*1], m6
- - pslldq m0, 2
- - PALIGNR m5, m0, 14, m1
- - mova [r2+r3*4], m5
- + mova [r0+r3*1], m2
- + pslldq m6, m3, 4
- + pslldq m1, m3, 2
- + PRED4x4_LOWPASS m1, m3, m6, m1
- + PALIGNR m2, m1, 14, m4
- + mova [r0+r5*1], m2
- + pslldq m1, 2
- + PALIGNR m0, m1, 14, m3
- + mova [r0+r3*4], m0
- + pslldq m1, 2
- + PALIGNR m2, m1, 14, m4
- + mova [r4+r3*1], m2
- + pslldq m1, 2
- + PALIGNR m0, m1, 14, m3
- + mova [r4+r3*2], m0
- + pslldq m1, 2
- + PALIGNR m2, m1, 14, m4
- + mova [r4+r5*1], m2
- + pslldq m1, 2
- + PALIGNR m0, m1, 14, m1
- + mova [r4+r3*4], m0
- RET
- %endmacro
- @@ -1211,84 +1072,63 @@ INIT_XMM
- PRED8x8L_VERTICAL_RIGHT sse2
- %define PALIGNR PALIGNR_SSSE3
- PRED8x8L_VERTICAL_RIGHT ssse3
- +PRED8x8L_VERTICAL_RIGHT sse4
- +%ifdef HAVE_AVX
- +INIT_AVX
- +PRED8x8L_VERTICAL_RIGHT avx
- +%endif
- ;-----------------------------------------------------------------------------
- ; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride)
- ;-----------------------------------------------------------------------------
- %macro PRED8x8L_HORIZONTAL_UP 1
- cglobal pred8x8l_horizontal_up_10_%1, 4,4,8
- - sub r0, r3
- - lea r2, [r0+r3*2]
- - mova m0, [r0+r3*1-16]
- - test r1, r1
- - lea r1, [r0+r3]
- - cmovnz r1, r0
- - punpckhwd m0, [r1+r3*0-16]
- - mova m1, [r2+r3*1-16]
- - punpckhwd m1, [r0+r3*2-16]
- + mova m0, [r0-16]
- mov r2, r0
- + sub r2, r3
- + test r1d, r1d
- + cmovz r2, r0
- + mova m1, [r0+r3*2-16]
- + punpckhwd m0, [r2+r3*0-16]
- + punpckhwd m1, [r0+r3*1-16]
- + lea r1, [r3*3]
- + lea r2, [r0+r3*4]
- punpckhdq m1, m0
- - lea r0, [r0+r3*4]
- - mova m2, [r0+r3*1-16]
- - punpckhwd m2, [r0+r3*0-16]
- - lea r0, [r0+r3*2]
- - mova m3, [r0+r3*1-16]
- - punpckhwd m3, [r0+r3*0-16]
- + mova m2, [r0+r3*4-16]
- + punpckhwd m2, [r0+r1-16]
- + mova m3, [r2+r3*2-16]
- + punpckhwd m3, [r2+r3*1-16]
- punpckhdq m3, m2
- punpckhqdq m3, m1
- - lea r0, [r0+r3*2]
- - mova m0, [r0+r3*0-16]
- - mova m1, [r1+r3*0-16]
- - mov r0, r2
- - mova m4, m3
- - mova m2, m3
- - PALIGNR m4, m0, 14, m0
- - PALIGNR m1, m2, 2, m2
- - mova m0, m4
- - PRED4x4_LOWPASS m2, m1, m4, m3
- - mova m4, m0
- - mova m7, m2
- - PRED4x4_LOWPASS m1, m3, m0, m4
- - pslldq m1, 14
- - PALIGNR m7, m1, 14, m3
- - lea r1, [r3+r3*2]
- + mova m0, [r2+r1-16]
- + PALIGNR m7, m3, m0, 14, m0
- + pslldq m4, m7, 2
- + pshuflw m4, m4, 11100101b
- + PRED4x4_LOWPASS m7, m3, m4, m7
- pshufd m0, m7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
- - pslldq m7, 14 ; l7 .. .. .. .. .. .. ..
- - mova m2, m0
- + psrld m5, m0, 16
- pslld m0, 16
- - psrld m2, 16
- - por m2, m0 ; l7 l6 l5 l4 l3 l2 l1 l0
- - mova m3, m2
- - mova m4, m2
- - mova m5, m2
- - psrldq m2, 2
- - psrldq m3, 4
- - lea r2, [r0+r3*4]
- - por m2, m7 ; l7 l7 l6 l5 l4 l3 l2 l1
- - punpckhwd m7, m7
- - por m3, m7 ; l7 l7 l7 l6 l5 l4 l3 l2
- - pavgw m4, m2
- - PRED4x4_LOWPASS m1, m3, m5, m2
- - mova m5, m4
- + por m5, m0 ; l7 l6 l5 l4 l3 l2 l1 l0
- + PALIGNR m6, m7, m5, 2, m0
- + PALIGNR m7, m6, 2, m1
- + pavgw m4, m5, m6
- + PRED4x4_LOWPASS m1, m7, m5, m6
- + punpckhwd m5, m4, m1 ; p8 p7 p6 p5
- punpcklwd m4, m1 ; p4 p3 p2 p1
- - punpckhwd m5, m1 ; p8 p7 p6 p5
- - mova m6, m5
- - mova m7, m5
- - mova m0, m5
- - PALIGNR m5, m4, 4, m1
- - pshufd m1, m6, 11111001b
- - PALIGNR m6, m4, 8, m2
- - pshufd m2, m7, 11111110b
- - PALIGNR m7, m4, 12, m3
- - pshufd m3, m0, 11111111b
- - mova [r0+r3*1], m4
- - mova [r0+r3*2], m5
- - mova [r0+r1*1], m6
- - mova [r0+r3*4], m7
- + mova [r2+r3*0], m5
- + mova [r0+r3*0], m4
- + pshufd m0, m5, 11111001b
- + pshufd m1, m5, 11111110b
- + pshufd m2, m5, 11111111b
- mova [r2+r3*1], m0
- mova [r2+r3*2], m1
- mova [r2+r1*1], m2
- - mova [r2+r3*4], m3
- + PALIGNR m7, m5, m4, 4, m0
- + PALIGNR m6, m5, m4, 8, m1
- + PALIGNR m5, m5, m4, 12, m4
- + mova [r0+r3*1], m7
- + mova [r0+r3*2], m6
- + mova [r0+r1*1], m5
- RET
- %endmacro
- @@ -1297,7 +1137,10 @@ INIT_XMM
- PRED8x8L_HORIZONTAL_UP sse2
- %define PALIGNR PALIGNR_SSSE3
- PRED8x8L_HORIZONTAL_UP ssse3
- -
- +%ifdef HAVE_AVX
- +INIT_AVX
- +PRED8x8L_HORIZONTAL_UP avx
- +%endif
- ;-----------------------------------------------------------------------------
- @@ -1315,7 +1158,7 @@ PRED8x8L_HORIZONTAL_UP ssse3
- %macro PRED16x16_VERTICAL 1
- cglobal pred16x16_vertical_10_%1, 2,3
- sub r0, r1
- - mov r2, 8
- + mov r2d, 8
- mova m0, [r0+ 0]
- mova m1, [r0+mmsize]
- %if mmsize==8
- @@ -1326,7 +1169,7 @@ cglobal pred16x16_vertical_10_%1, 2,3
- MOV16 r0+r1*1, m0, m1, m2, m3
- MOV16 r0+r1*2, m0, m1, m2, m3
- lea r0, [r0+r1*2]
- - dec r2
- + dec r2d
- jg .loop
- REP_RET
- %endmacro
- @@ -1341,7 +1184,7 @@ PRED16x16_VERTICAL sse2
- ;-----------------------------------------------------------------------------
- %macro PRED16x16_HORIZONTAL 1
- cglobal pred16x16_horizontal_10_%1, 2,3
- - mov r2, 8
- + mov r2d, 8
- .vloop:
- movd m0, [r0+r1*0-4]
- movd m1, [r0+r1*1-4]
- @@ -1350,7 +1193,7 @@ cglobal pred16x16_horizontal_10_%1, 2,3
- MOV16 r0+r1*0, m0, m0, m0, m0
- MOV16 r0+r1*1, m1, m1, m1, m1
- lea r0, [r0+r1*2]
- - dec r2
- + dec r2d
- jg .vloop
- REP_RET
- %endmacro
- @@ -1364,8 +1207,8 @@ PRED16x16_HORIZONTAL sse2
- ; void pred16x16_dc(pixel *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro PRED16x16_DC 1
- -cglobal pred16x16_dc_10_%1, 2,7
- - mov r4, r0
- +cglobal pred16x16_dc_10_%1, 2,6
- + mov r5, r0
- sub r0, r1
- mova m0, [r0+0]
- paddw m0, [r0+mmsize]
- @@ -1375,17 +1218,17 @@ cglobal pred16x16_dc_10_%1, 2,7
- %endif
- HADDW m0, m2
- - sub r0, 2
- - movzx r3d, word [r0+r1*1]
- - movzx r5d, word [r0+r1*2]
- + lea r0, [r0+r1-2]
- + movzx r3d, word [r0]
- + movzx r4d, word [r0+r1]
- %rep 7
- lea r0, [r0+r1*2]
- - movzx r2d, word [r0+r1*1]
- + movzx r2d, word [r0]
- add r3d, r2d
- - movzx r2d, word [r0+r1*2]
- - add r5d, r2d
- + movzx r2d, word [r0+r1]
- + add r4d, r2d
- %endrep
- - lea r3d, [r3+r5+16]
- + lea r3d, [r3+r4+16]
- movd m1, r3d
- paddw m0, m1
- @@ -1393,9 +1236,9 @@ cglobal pred16x16_dc_10_%1, 2,7
- SPLATW m0, m0
- mov r3d, 8
- .loop:
- - MOV16 r4+r1*0, m0, m0, m0, m0
- - MOV16 r4+r1*1, m0, m0, m0, m0
- - lea r4, [r4+r1*2]
- + MOV16 r5+r1*0, m0, m0, m0, m0
- + MOV16 r5+r1*1, m0, m0, m0, m0
- + lea r5, [r5+r1*2]
- dec r3d
- jg .loop
- REP_RET
- diff --git a/libavcodec/x86/h264_intrapred_init.c b/libavcodec/x86/h264_intrapred_init.c
- index 62e4c87..332a464 100644
- --- a/libavcodec/x86/h264_intrapred_init.c
- +++ b/libavcodec/x86/h264_intrapred_init.c
- @@ -56,22 +56,33 @@ void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int has_tople
- PRED8x8L(dc, 10, sse2)
- PRED8x8L(dc, 10, ssse3)
- +PRED8x8L(dc, 10, sse4)
- +PRED8x8L(dc, 10, avx)
- PRED8x8L(128_dc, 10, mmxext)
- PRED8x8L(128_dc, 10, sse2)
- PRED8x8L(top_dc, 10, sse2)
- PRED8x8L(top_dc, 10, ssse3)
- +PRED8x8L(top_dc, 10, sse4)
- PRED8x8L(vertical, 10, sse2)
- -PRED8x8L(vertical, 10, ssse3)
- +PRED8x8L(vertical, 10, avx)
- PRED8x8L(horizontal, 10, sse2)
- PRED8x8L(horizontal, 10, ssse3)
- +PRED8x8L(horizontal, 10, avx)
- PRED8x8L(down_left, 10, sse2)
- PRED8x8L(down_left, 10, ssse3)
- +PRED8x8L(down_left, 10, sse4)
- +PRED8x8L(down_left, 10, avx)
- PRED8x8L(down_right, 10, sse2)
- PRED8x8L(down_right, 10, ssse3)
- +PRED8x8L(down_right, 10, sse4)
- +PRED8x8L(down_right, 10, avx)
- PRED8x8L(vertical_right, 10, sse2)
- PRED8x8L(vertical_right, 10, ssse3)
- +PRED8x8L(vertical_right, 10, sse4)
- +PRED8x8L(vertical_right, 10, avx)
- PRED8x8L(horizontal_up, 10, sse2)
- PRED8x8L(horizontal_up, 10, ssse3)
- +PRED8x8L(horizontal_up, 10, avx)
- #define PRED16x16(TYPE, DEPTH, OPT)\
- void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int stride);
- @@ -344,18 +355,32 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth
- h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_ssse3;
- h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_ssse3;
- - h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_ssse3;
- h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_ssse3;
- h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_ssse3;
- h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_ssse3;
- h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_ssse3;
- }
- + if (mm_flags & AV_CPU_FLAG_SSE4) {
- + h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_sse4;
- + h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_sse4;
- + h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_sse4;
- + h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_sse4;
- + h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_sse4;
- + }
- #if HAVE_AVX
- if (mm_flags & AV_CPU_FLAG_AVX) {
- h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_avx;
- h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_avx;
- h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_avx;
- h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_avx;
- +
- + h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_avx;
- + h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_avx;
- + h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_avx;
- + h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_avx;
- + h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_avx;
- + h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_avx;
- + h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_avx;
- }
- #endif /* HAVE_AVX */
- }
- --
- 1.7.5.1
Advertisement
Add Comment
Please sign in to add a comment.
Advertisement