Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- From 4b02d14340062021871b44a010396855c991183f Mon Sep 17 00:00:00 2001
- From: Daniel Kang <daniel.d.kang@gmail.com>
- Date: Sat, 15 Jan 2011 13:44:45 -0500
- Subject: [PATCH] Add AVX functions where instructions with 3+ operands are useful.
- TODO in later patches:
- Change functions to use the move approach in AVX SUMSUB*
- AVX palignr
- Port predict functions to AVX
- ---
- common/bitstream.c | 3 +
- common/dct.c | 29 ++++++
- common/deblock.c | 26 ++++++
- common/pixel.c | 24 +++++
- common/quant.c | 3 +
- common/x86/bitstream-a.asm | 18 ++--
- common/x86/dct-32.asm | 50 ++++-------
- common/x86/dct-64.asm | 31 +++----
- common/x86/dct-a.asm | 150 ++++++++++++++++---------------
- common/x86/dct.h | 8 ++
- common/x86/deblock-a.asm | 209 +++++++++++++++++++++++++-----------------
- common/x86/mc-a.asm | 65 ++++++++------
- common/x86/mc-a2.asm | 153 ++++++++++++++++---------------
- common/x86/mc-c.c | 47 ++++++++++-
- common/x86/pixel-a.asm | 140 ++++++++++++++++-------------
- common/x86/pixel.h | 16 +++-
- common/x86/predict-a.asm | 216 ++++++++++++++++++++++----------------------
- common/x86/predict-c.c | 40 ++++++++
- common/x86/quant-a.asm | 23 +++--
- common/x86/quant.h | 3 +
- common/x86/sad-a.asm | 21 ++---
- common/x86/x86inc.asm | 2 +
- common/x86/x86util.asm | 69 ++++++++++-----
- 23 files changed, 808 insertions(+), 538 deletions(-)
- diff --git a/common/bitstream.c b/common/bitstream.c
- index 5acffd0..d7c2c51 100644
- --- a/common/bitstream.c
- +++ b/common/bitstream.c
- @@ -42,6 +42,7 @@ static uint8_t *x264_nal_escape_c( uint8_t *dst, uint8_t *src, uint8_t *end )
- #if HAVE_MMX
- uint8_t *x264_nal_escape_mmxext( uint8_t *dst, uint8_t *src, uint8_t *end );
- uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
- +uint8_t *x264_nal_escape_avx( uint8_t *dst, uint8_t *src, uint8_t *end );
- #endif
- /****************************************************************************
- @@ -93,5 +94,7 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
- pf->nal_escape = x264_nal_escape_mmxext;
- if( (cpu&X264_CPU_SSE2) && (cpu&X264_CPU_SSE2_IS_FAST) )
- pf->nal_escape = x264_nal_escape_sse2;
- + if( cpu&X264_CPU_AVX )
- + pf->nal_escape = x264_nal_escape_avx;
- #endif
- }
- diff --git a/common/dct.c b/common/dct.c
- index fef004e..22bd4d7 100644
- --- a/common/dct.c
- +++ b/common/dct.c
- @@ -738,6 +738,8 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
- pf->scan_4x4 = x264_zigzag_scan_4x4_field_sse2;
- if( cpu&X264_CPU_SSE4 )
- pf->scan_8x8 = x264_zigzag_scan_8x8_field_sse4;
- + if( cpu&X264_CPU_AVX )
- + pf->scan_8x8 = x264_zigzag_scan_8x8_field_avx;
- #endif // HAVE_MMX
- #else
- #if HAVE_MMX
- @@ -751,6 +753,13 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
- pf->sub_4x4 = x264_zigzag_sub_4x4_field_ssse3;
- pf->sub_4x4ac= x264_zigzag_sub_4x4ac_field_ssse3;
- }
- + if( cpu&X264_CPU_AVX )
- + {
- + pf->sub_4x4 = x264_zigzag_sub_4x4_field_avx;
- +#if ARCH_X86_64
- + pf->sub_4x4ac= x264_zigzag_sub_4x4ac_field_avx;
- +#endif
- + }
- #endif // HAVE_MMX
- #if HAVE_ALTIVEC
- if( cpu&X264_CPU_ALTIVEC )
- @@ -772,6 +781,13 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
- pf->scan_4x4 = x264_zigzag_scan_4x4_frame_sse2;
- pf->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
- }
- +#if ARCH_X86_64
- + if( cpu&X264_CPU_AVX )
- + {
- + pf->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
- + pf->scan_8x8 = x264_zigzag_scan_8x8_frame_avx;
- + }
- +#endif // ARCH_X86_64
- #endif // HAVE_MMX
- #else
- #if HAVE_MMX
- @@ -789,6 +805,15 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
- if( cpu&X264_CPU_SHUFFLE_IS_FAST )
- pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
- }
- + if( cpu&X264_CPU_AVX )
- + {
- + pf->sub_4x4 = x264_zigzag_sub_4x4_frame_avx;
- +#if ARCH_X86_64
- + pf->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx;
- +#endif
- + if( cpu&X264_CPU_SHUFFLE_IS_FAST )
- + pf->scan_4x4 = x264_zigzag_scan_4x4_frame_avx; // TODO(review): confirm this AVX override should be gated on X264_CPU_SHUFFLE_IS_FAST; the author has not verified what that flag means
- + }
- #endif // HAVE_MMX
- #if HAVE_ALTIVEC
- if( cpu&X264_CPU_ALTIVEC )
- @@ -806,11 +831,15 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
- #if HIGH_BIT_DEPTH
- if( cpu&X264_CPU_SSE2 )
- pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
- + if( cpu&X264_CPU_AVX )
- + pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
- #else
- if( cpu&X264_CPU_MMX )
- pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
- if( cpu&X264_CPU_SHUFFLE_IS_FAST )
- pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
- + if( cpu&X264_CPU_AVX )
- + pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
- #endif // HIGH_BIT_DEPTH
- #endif
- }
- diff --git a/common/deblock.c b/common/deblock.c
- index ff7c99f..5122068 100644
- --- a/common/deblock.c
- +++ b/common/deblock.c
- @@ -437,12 +437,20 @@ void x264_macroblock_deblock( x264_t *h )
- #if HAVE_MMX
- void x264_deblock_v_luma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
- +void x264_deblock_v_luma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
- void x264_deblock_h_luma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
- +void x264_deblock_h_luma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
- +void x264_deblock_v_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
- void x264_deblock_v_chroma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
- +void x264_deblock_h_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
- void x264_deblock_h_chroma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
- +void x264_deblock_v_luma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
- void x264_deblock_v_luma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
- +void x264_deblock_h_luma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
- void x264_deblock_h_luma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
- +void x264_deblock_v_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
- void x264_deblock_v_chroma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
- +void x264_deblock_h_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
- void x264_deblock_h_chroma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
- void x264_deblock_strength_mmxext( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
- int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4],
- @@ -453,6 +461,9 @@ void x264_deblock_strength_sse2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X
- void x264_deblock_strength_ssse3 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
- int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4],
- int mvy_limit, int bframe );
- +void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
- + int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4],
- + int mvy_limit, int bframe );
- #if ARCH_X86
- void x264_deblock_h_luma_mmxext( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
- void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
- @@ -537,6 +548,21 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
- }
- if( cpu&X264_CPU_SSSE3 )
- pf->deblock_strength = x264_deblock_strength_ssse3;
- + if( cpu&X264_CPU_AVX )
- + {
- + pf->deblock_strength = x264_deblock_strength_avx;
- + if( !(cpu&X264_CPU_STACK_MOD4) )
- + {
- + pf->deblock_luma[1] = x264_deblock_v_luma_avx;
- + pf->deblock_luma[0] = x264_deblock_h_luma_avx;
- + pf->deblock_chroma[1] = x264_deblock_v_chroma_avx;
- + pf->deblock_chroma[0] = x264_deblock_h_chroma_avx;
- + pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_avx;
- + pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_avx;
- + pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_avx;
- + pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_avx;
- + }
- + }
- }
- #endif
- diff --git a/common/pixel.c b/common/pixel.c
- index 979c731..ebc0570 100644
- --- a/common/pixel.c
- +++ b/common/pixel.c
- @@ -488,6 +488,7 @@ SATD_X_DECL7( _mmxext )
- SATD_X_DECL6( _sse2 )
- SATD_X_DECL7( _ssse3 )
- SATD_X_DECL7( _sse4 )
- +SATD_X_DECL7( _avx )
- #endif // !HIGH_BIT_DEPTH
- #endif
- @@ -1030,6 +1031,29 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
- /* Slower on Conroe, so only enable under SSE4 */
- pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3;
- }
- +
- + if( cpu&X264_CPU_AVX )
- + {
- + INIT7( satd, _avx );
- + INIT7( satd_x3, _avx );
- + INIT7( satd_x4, _avx );
- + pixf->ads[PIXEL_16x16] = x264_pixel_ads4_avx;
- + pixf->ads[PIXEL_16x8] = x264_pixel_ads2_avx;
- + if( !(cpu&X264_CPU_STACK_MOD4) )
- + {
- + INIT4( hadamard_ac, _avx );
- + }
- + INIT5( ssd, _avx );
- +#if ARCH_X86_64
- + pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx;
- + pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx;
- + pixf->intra_sa8d_x3_8x8= x264_intra_sa8d_x3_8x8_avx;
- +#endif
- + pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx;
- + pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx;
- + pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_avx;
- + pixf->ssim_end4 = x264_pixel_ssim_end4_avx;
- + }
- #endif //HAVE_MMX
- #if HAVE_ARMV6
- diff --git a/common/quant.c b/common/quant.c
- index afc50a3..54d9b5c 100644
- --- a/common/quant.c
- +++ b/common/quant.c
- @@ -477,6 +477,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
- if( cpu&X264_CPU_AVX )
- {
- + pf->dequant_4x4 = x264_dequant_4x4_avx;
- + pf->dequant_8x8 = x264_dequant_8x8_avx;
- + pf->dequant_4x4_dc = x264_dequant_4x4dc_avx;
- pf->denoise_dct = x264_denoise_dct_avx;
- }
- #endif // HAVE_MMX
- diff --git a/common/x86/bitstream-a.asm b/common/x86/bitstream-a.asm
- index c80b925..97c7500 100644
- --- a/common/x86/bitstream-a.asm
- +++ b/common/x86/bitstream-a.asm
- @@ -44,19 +44,17 @@ SECTION .text
- jmp %1_continue
- ALIGN 16
- %1:
- - mova m3, m1
- - mova m2, m0
- - pcmpeqb m1, m4
- - pcmpeqb m0, m4
- - pmovmskb r3d, m1
- - %2 [r0+r1], m2
- - pmovmskb r4d, m0
- + pcmpeqb m3, m1, m4
- + pcmpeqb m2, m0, m4
- + pmovmskb r3d, m3
- + %2 [r0+r1], m0
- + pmovmskb r4d, m2
- shl r3d, mmsize
- mova m0, [r1+r2+2*mmsize]
- or r4d, r3d
- - mova m1, [r1+r2+3*mmsize]
- + %2 [r0+r1+mmsize], m1
- lea r3d, [r4+r4+1]
- - %2 [r0+r1+mmsize], m3
- + mova m1, [r1+r2+3*mmsize]
- and r4d, r3d
- jnz %1_escape
- %1_continue:
- @@ -129,3 +127,5 @@ INIT_MMX
- NAL_ESCAPE mmxext
- INIT_XMM
- NAL_ESCAPE sse2
- +INIT_AVX
- +NAL_ESCAPE avx
- diff --git a/common/x86/dct-32.asm b/common/x86/dct-32.asm
- index f766f8c..fa106c0 100644
- --- a/common/x86/dct-32.asm
- +++ b/common/x86/dct-32.asm
- @@ -48,39 +48,30 @@ cextern hsub_mul
- SUMSUB_BA w, m%6, m%5 ; %6 = dst0, %5 = dst4
- mova [%9+0x00], m%6
- mova [%9+0x40], m%5
- - mova m%6, m%7 ; a3
- - psraw m%6, 1 ; a3>>1
- + psraw m%6, m%7, 1 ; a3>>1
- paddw m%6, m%8 ; a2 + (a3>>1)
- psraw m%8, 1 ; a2>>1
- psubw m%8, m%7 ; (a2>>1) - a3
- mova [%9+0x60], m%8
- - mova m%5, m%3
- - psraw m%5, 1
- + psraw m%5, m%3, 1
- paddw m%5, m%3 ; d25+(d25>>1)
- - mova m%7, m%1
- - psubw m%7, m%4 ; a5 = d07-d34-(d25+(d25>>1))
- + psubw m%7, m%1, m%4 ; a5 = d07-d34-(d25+(d25>>1))
- psubw m%7, m%5
- - mova m%5, m%2
- - psraw m%5, 1
- + psraw m%5, m%2, 1
- paddw m%5, m%2 ; d16+(d16>>1)
- - mova m%8, m%1
- - paddw m%8, m%4
- + paddw m%8, m%1, m%4
- psubw m%8, m%5 ; a6 = d07+d34-(d16+(d16>>1))
- - mova m%5, m%1
- - psraw m%5, 1
- + psraw m%5, m%1, 1
- paddw m%5, m%1 ; d07+(d07>>1)
- paddw m%5, m%2
- paddw m%5, m%3 ; a4 = d16+d25+(d07+(d07>>1))
- - mova m%1, m%4
- - psraw m%1, 1
- + psraw m%1, m%4, 1
- paddw m%1, m%4 ; d34+(d34>>1)
- paddw m%1, m%2
- psubw m%1, m%3 ; a7 = d16-d25+(d34+(d34>>1))
- - mova m%4, m%1
- - psraw m%4, 2
- + psraw m%4, m%1, 2
- paddw m%4, m%5 ; a4 + (a7>>2)
- - mova m%3, m%8
- - psraw m%3, 2
- + psraw m%3, m%8, 2
- paddw m%3, m%7 ; a5 + (a6>>2)
- psraw m%5, 2
- psraw m%7, 2
- @@ -92,19 +83,17 @@ cextern hsub_mul
- ; in: 0,4 in mem, rest in regs
- ; out: m0..m7
- %macro IDCT8_1D 9
- - mova m%1, m%3
- - mova m%5, m%7
- - psraw m%3, 1
- - psraw m%7, 1
- + psraw m%1, m%3, 1
- + SWAP m%1, m%3
- + psraw m%5, m%7, 1
- + SWAP m%5, m%7
- psubw m%3, m%5
- paddw m%7, m%1
- - mova m%5, m%2
- - psraw m%5, 1
- + psraw m%5, m%2, 1
- paddw m%5, m%2
- paddw m%5, m%4
- paddw m%5, m%6
- - mova m%1, m%6
- - psraw m%1, 1
- + psraw m%1, m%6, 1
- paddw m%1, m%6
- paddw m%1, m%8
- psubw m%1, m%2
- @@ -116,10 +105,8 @@ cextern hsub_mul
- psraw m%8, 1
- psubw m%2, m%4
- psubw m%6, m%8
- - mova m%4, m%5
- - mova m%8, m%1
- - psraw m%4, 2
- - psraw m%8, 2
- + psraw m%4, m%5, 2
- + psraw m%8, m%1, 2
- paddw m%4, m%6
- paddw m%8, m%2
- psraw m%6, 2
- @@ -246,9 +233,8 @@ idct8_mmx:
- %macro ADD_STORE_ROW 3
- movq m1, [r0+%1*FDEC_STRIDE]
- - movq m2, m1
- + punpckhbw m2, m1, m0
- punpcklbw m1, m0
- - punpckhbw m2, m0
- paddw m1, %2
- paddw m2, %3
- packuswb m1, m2
- diff --git a/common/x86/dct-64.asm b/common/x86/dct-64.asm
- index 112c7ba..fe404d9 100644
- --- a/common/x86/dct-64.asm
- +++ b/common/x86/dct-64.asm
- @@ -45,14 +45,12 @@ INIT_XMM
- SUMSUB_BA w, m%6, m%7, m%10 ; %6=a1, %7=a3
- SUMSUB_BA w, m%5, m%8, m%10 ; %5=a0, %8=a2
- - movdqa m%9, m%1
- - psraw m%9, 1
- + psraw m%9, m%1, 1
- paddw m%9, m%1
- paddw m%9, m%2
- paddw m%9, m%3 ; %9=a4
- - movdqa m%10, m%4
- - psraw m%10, 1
- + psraw m%10, m%4, 1
- paddw m%10, m%4
- paddw m%10, m%2
- psubw m%10, m%3 ; %10=a7
- @@ -65,22 +63,19 @@ INIT_XMM
- psubw m%1, m%3 ; %1=a5
- psubw m%4, m%2 ; %4=a6
- - movdqa m%2, m%10
- - psraw m%2, 2
- + psraw m%2, m%10, 2
- paddw m%2, m%9 ; %2=b1
- psraw m%9, 2
- psubw m%9, m%10 ; %9=b7
- SUMSUB_BA w, m%6, m%5, m%10 ; %6=b0, %5=b4
- - movdqa m%3, m%7
- - psraw m%3, 1
- + psraw m%3, m%7, 1
- paddw m%3, m%8 ; %3=b2
- psraw m%8, 1
- psubw m%8, m%7 ; %8=b6
- - movdqa m%7, m%4
- - psraw m%7, 2
- + psraw m%7, m%4, 2
- paddw m%7, m%1 ; %7=b3
- psraw m%1, 2
- psubw m%4, m%1 ; %4=b5
- @@ -91,20 +86,18 @@ INIT_XMM
- %macro IDCT8_1D 10
- SUMSUB_BA w, m%5, m%1, m%9 ; %5=a0, %1=a2
- - movdqa m%9, m%2
- - psraw m%9, 1
- + psraw m%9, m%2, 1
- paddw m%9, m%2
- paddw m%9, m%4
- paddw m%9, m%6 ; %9=a7
- - movdqa m%10, m%3
- - psraw m%3, 1
- + psraw m%10, m%3, 1
- + SWAP m%10, m%3
- psubw m%3, m%7 ; %3=a4
- psraw m%7, 1
- paddw m%7, m%10 ; %7=a6
- - movdqa m%10, m%6
- - psraw m%10, 1
- + psraw m%10, m%6, 1
- paddw m%10, m%6
- paddw m%10, m%8
- psubw m%10, m%2 ; %10=a5
- @@ -118,8 +111,7 @@ INIT_XMM
- psubw m%2, m%4 ; %2=a3
- psubw m%6, m%8 ; %6=a1
- - movdqa m%4, m%9
- - psraw m%4, 2
- + psraw m%4, m%9, 2
- paddw m%4, m%6 ; %4=b1
- psraw m%6, 2
- psubw m%9, m%6 ; %9=b7
- @@ -127,8 +119,7 @@ INIT_XMM
- SUMSUB_BA w, m%7, m%5, m%6 ; %7=b0, %5=b6
- SUMSUB_BA w, m%3, m%1, m%6 ; %3=b2, %1=b4
- - movdqa m%8, m%10
- - psraw m%8, 2
- + psraw m%8, m%10, 2
- paddw m%8, m%2 ; %8=b3
- psraw m%2, 2
- psubw m%2, m%10 ; %2=b5
- diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
- index ab33a59..f320823 100644
- --- a/common/x86/dct-a.asm
- +++ b/common/x86/dct-a.asm
- @@ -285,10 +285,10 @@ INIT_XMM
- cglobal add4x4_idct_sse4, 2,2,6
- mova m0, [r1+0x00] ; row1/row0
- mova m2, [r1+0x10] ; row3/row2
- - mova m1, m0 ; row1/row0
- - psraw m0, 1 ; row1>>1/...
- - mova m3, m2 ; row3/row2
- - psraw m2, 1 ; row3>>1/...
- + psraw m1, m0, 1 ; row1>>1/...
- + psraw m3, m2, 1 ; row3>>1/...
- + SWAP m0, m1
- + SWAP m2, m3
- movsd m0, m1 ; row1>>1/row0
- movsd m2, m3 ; row3>>1/row2
- psubw m0, m3 ; row1>>1-row3/row0-2
- @@ -304,8 +304,8 @@ cglobal add4x4_idct_sse4, 2,2,6
- mova m1, [pw_32_0]
- paddw m1, m0 ; row1/row0 corrected
- psraw m0, 1 ; row1>>1/...
- - mova m3, m2 ; row3/row2
- - psraw m2, 1 ; row3>>1/...
- + psraw m3, m2, 1 ; row3>>1/...
- + SWAP m2, m3
- movsd m0, m1 ; row1>>1/row0
- movsd m2, m3 ; row3>>1/row2
- psubw m0, m3 ; row1>>1-row3/row0-2
- @@ -638,22 +638,19 @@ cglobal add16x16_idct_dc_sse2, 2,2,8
- add r1, 16
- punpcklwd xmm0, xmm0
- punpcklwd xmm2, xmm2
- - pxor xmm1, xmm1
- pxor xmm3, xmm3
- paddw xmm0, [pw_32]
- paddw xmm2, [pw_32]
- psraw xmm0, 6
- psraw xmm2, 6
- - psubw xmm1, xmm0
- - psubw xmm3, xmm2
- + psubw xmm1, xmm3, xmm0
- packuswb xmm0, xmm1
- + psubw xmm3, xmm2
- + punpckhbw xmm1, xmm0, xmm0
- packuswb xmm2, xmm3
- - movdqa xmm1, xmm0
- - movdqa xmm3, xmm2
- + punpckhbw xmm3, xmm2, xmm2
- punpcklbw xmm0, xmm0
- punpcklbw xmm2, xmm2
- - punpckhbw xmm1, xmm1
- - punpckhbw xmm3, xmm3
- IDCT_DC_STORE FDEC_STRIDE*-4, xmm0, xmm1
- IDCT_DC_STORE 0, xmm2, xmm3
- ret
- @@ -677,12 +674,10 @@ cglobal add16x16_idct_dc_ssse3, 2,2,8
- movdqa xmm6, [pb_idctdc_unpack2]
- packuswb xmm0, xmm0
- packuswb xmm1, xmm1
- - movdqa xmm2, xmm0
- - movdqa xmm3, xmm1
- + pshufb xmm2, xmm0, xmm6
- pshufb xmm0, xmm5
- - pshufb xmm2, xmm6
- + pshufb xmm3, xmm1, xmm6
- pshufb xmm1, xmm5
- - pshufb xmm3, xmm6
- IDCT_DC_STORE FDEC_STRIDE*-4, xmm0, xmm1
- IDCT_DC_STORE 0, xmm2, xmm3
- ret
- @@ -835,7 +830,7 @@ cglobal zigzag_scan_8x8_frame_%1, 2,2,8
- movdq2q mm5, xmm5
- PALIGNR xmm6, xmm6, 10, xmm3
- movdq2q mm6, xmm6
- -%ifidn %1, ssse3
- +%ifnidn %1, sse2
- PALIGNR xmm7, xmm7, 8, xmm3
- movdq2q mm7, xmm7
- %else
- @@ -868,12 +863,10 @@ cglobal zigzag_scan_8x8_frame_%1, 2,2,8
- movq [r0+2*56], mm5
- movq [r0+2*60], mm3
- - movdqa xmm3, xmm0
- - movdqa xmm7, xmm4
- + punpckhdq xmm3, xmm0, xmm2
- punpckldq xmm0, xmm2
- + punpckhdq xmm7, xmm4, xmm6
- punpckldq xmm4, xmm6
- - punpckhdq xmm3, xmm2
- - punpckhdq xmm7, xmm6
- pshufhw xmm0, xmm0, 0x1b
- pshuflw xmm4, xmm4, 0x1b
- pshufhw xmm3, xmm3, 0x1b
- @@ -909,14 +902,12 @@ cglobal zigzag_scan_8x8_frame_%1, 2,2,8*(mmsize/16)
- movu m2, [r1+14*SIZEOF_DCTCOEF]
- movu m3, [r1+21*SIZEOF_DCTCOEF]
- mova m4, [r1+28*SIZEOF_DCTCOEF]
- - mova m5, m0
- - mova m6, m1
- + punpckl%5 m5, m0, m1
- psrl%3 m0, %2
- + punpckh%5 m6, m1, m0
- + punpckl%4 m5, m0
- punpckl%4 m1, m1
- - punpckl%5 m5, m6
- punpckh%5 m1, m3
- - punpckh%5 m6, m0
- - punpckl%4 m5, m0
- mova m7, [r1+52*SIZEOF_DCTCOEF]
- mova m0, [r1+60*SIZEOF_DCTCOEF]
- punpckh%5 m1, m2
- @@ -936,10 +927,9 @@ cglobal zigzag_scan_8x8_frame_%1, 2,2,8*(mmsize/16)
- movu [r0+47*SIZEOF_DCTCOEF], m4
- punpckh%5 m7, m0
- psll%3 m0, %2
- - mova m3, m5
- + punpckh%4 m3, m5, m5
- punpckl%5 m5, m1
- punpckh%5 m1, m2
- - punpckh%4 m3, m3
- mova [r0+52*SIZEOF_DCTCOEF], m6
- movu [r0+13*SIZEOF_DCTCOEF], m5
- movu m4, [r1+11*SIZEOF_DCTCOEF]
- @@ -957,12 +947,10 @@ cglobal zigzag_scan_8x8_frame_%1, 2,2,8*(mmsize/16)
- punpckl%5 m6, m7
- punpckh%5 m1, m3
- punpckh%5 m5, m7
- - mova m3, m6
- - mova m7, m5
- + punpckh%4 m3, m6, m4
- + punpckh%4 m7, m5, m1
- punpckl%4 m6, m4
- punpckl%4 m5, m1
- - punpckh%4 m3, m4
- - punpckh%4 m7, m1
- movu m4, [r1+35*SIZEOF_DCTCOEF]
- movu m1, [r1+49*SIZEOF_DCTCOEF]
- pshuf%6 m6, m6, 0x1b
- @@ -975,18 +963,14 @@ cglobal zigzag_scan_8x8_frame_%1, 2,2,8*(mmsize/16)
- mova [r0+32*SIZEOF_DCTCOEF], m7
- movu [r0+10*SIZEOF_DCTCOEF], m6
- movu [r0+21*SIZEOF_DCTCOEF], m5
- - mova m3, m0
- - mova m7, m2
- + punpckh%5 m3, m0, m4
- + punpckh%5 m7, m2, m1
- punpckl%5 m0, m4
- punpckl%5 m2, m1
- - punpckh%5 m3, m4
- - punpckh%5 m7, m1
- - mova m4, m2
- - mova m1, m7
- + punpckl%4 m4, m2, m0
- + punpckl%4 m1, m7, m3
- punpckh%4 m2, m0
- punpckh%4 m7, m3
- - punpckl%4 m4, m0
- - punpckl%4 m1, m3
- pshuf%6 m2, m2, 0x1b
- pshuf%6 m7, m7, 0x1b
- mova [r0+28*SIZEOF_DCTCOEF], m4
- @@ -999,6 +983,8 @@ cglobal zigzag_scan_8x8_frame_%1, 2,2,8*(mmsize/16)
- %ifdef HIGH_BIT_DEPTH
- INIT_XMM
- SCAN_8x8_FRAME sse2 , 4 , dq, qdq, dq, d
- +INIT_AVX
- +SCAN_8x8_FRAME avx , 4 , dq, qdq, dq, d
- %else
- INIT_MMX
- SCAN_8x8_FRAME mmxext, 16, q , dq , wd, w
- @@ -1013,7 +999,7 @@ cglobal zigzag_scan_4x4_frame_%1, 2,2,8*(mmsize)/16
- mova m1, [r1+ 4*SIZEOF_DCTCOEF]
- mova m2, [r1+ 8*SIZEOF_DCTCOEF]
- mova m3, [r1+12*SIZEOF_DCTCOEF]
- - mova m4, m0
- + punpckl%5 m4, m0, m1
- mova m5, m1
- mova m6, m2
- mova m7, m3
- @@ -1021,7 +1007,6 @@ cglobal zigzag_scan_4x4_frame_%1, 2,2,8*(mmsize)/16
- psrl%3 m0, %2
- punpckl%4 m2, m2
- punpckh%4 m1, m1
- - punpckl%5 m4, m5
- punpckl%5 m5, m3
- punpckl%4 m4, m0
- punpckh%5 m5, m2
- @@ -1039,27 +1024,35 @@ cglobal zigzag_scan_4x4_frame_%1, 2,2,8*(mmsize)/16
- %ifdef HIGH_BIT_DEPTH
- INIT_XMM
- SCAN_4x4 sse2, 4 , dq, qdq, dq
- +INIT_AVX
- +SCAN_4x4 avx , 4 , dq, qdq, dq
- %else
- INIT_MMX
- SCAN_4x4 mmx , 16, q , dq , wd
- -%endif
- ;-----------------------------------------------------------------------------
- ; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
- ;-----------------------------------------------------------------------------
- -cglobal zigzag_scan_4x4_frame_ssse3, 2,2
- +%macro SCAN_4x4_FRAME 1
- +cglobal zigzag_scan_4x4_frame_%1, 2,2
- movdqa xmm1, [r1+16]
- movdqa xmm0, [r1]
- pshufb xmm1, [pb_scan4frameb]
- pshufb xmm0, [pb_scan4framea]
- - movdqa xmm2, xmm1
- - psrldq xmm1, 6
- - palignr xmm2, xmm0, 6
- + psrldq xmm2, xmm1, 6
- + palignr xmm1, xmm0, 6
- pslldq xmm0, 10
- - palignr xmm1, xmm0, 10
- - movdqa [r0], xmm2
- - movdqa [r0+16], xmm1
- + palignr xmm2, xmm0, 10
- + movdqa [r0], xmm1
- + movdqa [r0+16], xmm2
- RET
- +%endmacro
- +
- +INIT_XMM
- +SCAN_4x4_FRAME ssse3
- +INIT_AVX
- +SCAN_4x4_FRAME avx
- +%endif ; !HIGH_BIT_DEPTH
- %ifdef HIGH_BIT_DEPTH
- INIT_XMM
- @@ -1121,25 +1114,23 @@ cglobal zigzag_scan_8x8_field_%1, 2,3,8*(mmsize/16)
- pshuf%2 m2, m2, 000111001b ; 08 11 10 09
- punpckl%3 m3, m1 ; 05 03 04 03
- pinsr%2 m0, r2d, 3 ; 08 02 01 00
- - mova m4, m2
- - punpckl%3 m2, m3 ; 04 10 03 09
- - pshuf%2 m2, m2, 010110100b ; 10 04 03 09
- + punpckl%3 m4, m2, m3 ; 04 10 03 09
- + pshuf%2 m4, m4, 010110100b ; 10 04 03 09
- mova [r0+ 0*SIZEOF_DCTCOEF], m0 ; 08 02 01 00
- - mova [r0+ 4*SIZEOF_DCTCOEF], m2 ; 10 04 03 09
- + mova [r0+ 4*SIZEOF_DCTCOEF], m4 ; 10 04 03 09
- mova m3, [r1+12*SIZEOF_DCTCOEF] ; 15 14 13 12
- mova m5, [r1+16*SIZEOF_DCTCOEF] ; 19 18 17 16
- punpckl%4 m6, m5 ; 17 16 XX XX
- psrl%5 m1, %6 ; XX 07 06 05
- - punpckh%3 m6, m4 ; 08 17 11 16
- + punpckh%3 m6, m2 ; 08 17 11 16
- punpckl%4 m6, m1 ; 06 05 11 16
- mova [r0+ 8*SIZEOF_DCTCOEF], m6 ; 06 05 11 16
- psrl%5 m1, %6 ; XX XX 07 06
- punpckl%3 m1, m5 ; 17 07 16 06
- mova m0, [r1+20*SIZEOF_DCTCOEF] ; 23 22 21 20
- mova m2, [r1+24*SIZEOF_DCTCOEF] ; 27 26 25 24
- - mova m6, m3
- punpckh%4 m1, m1 ; 17 07 17 07
- - punpckl%3 m6, m2 ; 25 13 24 12
- + punpckl%3 m6, m3, m2 ; 25 13 24 12
- pextr%2 r2d, m5, 2
- mova [r0+24*SIZEOF_DCTCOEF], m0 ; 23 22 21 20
- punpckl%3 m1, m6 ; 24 17 12 07
- @@ -1181,9 +1172,8 @@ cglobal zigzag_scan_8x8_field_%1, 2,3,8*(mmsize/16)
- mova [r0+48*SIZEOF_DCTCOEF], m7
- mova m0, [r1+56*SIZEOF_DCTCOEF] ; 59 58 57 56
- mova m1, [r1+52*SIZEOF_DCTCOEF] ; 55 54 53 52
- - mova m2, m0
- mova m7, [r1+60*SIZEOF_DCTCOEF]
- - punpckl%4 m2, m1 ; 53 52 57 56
- + punpckl%4 m2, m0, m1 ; 53 52 57 56
- punpckh%4 m1, m0 ; 59 58 55 54
- mova [r0+52*SIZEOF_DCTCOEF], m2
- mova [r0+56*SIZEOF_DCTCOEF], m1
- @@ -1193,6 +1183,8 @@ cglobal zigzag_scan_8x8_field_%1, 2,3,8*(mmsize/16)
- %ifdef HIGH_BIT_DEPTH
- INIT_XMM
- SCAN_8x8 sse4 , d, dq, qdq, dq, 4
- +INIT_AVX
- +SCAN_8x8 avx , d, dq, qdq, dq, 4
- %else
- INIT_MMX
- SCAN_8x8 mmxext, w, wd, dq , q , 16
- @@ -1201,11 +1193,11 @@ SCAN_8x8 mmxext, w, wd, dq , q , 16
- ;-----------------------------------------------------------------------------
- ; void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *src, uint8_t *dst )
- ;-----------------------------------------------------------------------------
- -%macro ZIGZAG_SUB_4x4 2
- +%macro ZIGZAG_SUB_4x4 3
- %ifidn %1, ac
- -cglobal zigzag_sub_4x4%1_%2_ssse3, 4,4,8
- +cglobal zigzag_sub_4x4%1_%2_%3, 4,4,8
- %else
- -cglobal zigzag_sub_4x4%1_%2_ssse3, 3,3,8
- +cglobal zigzag_sub_4x4%1_%2_%3, 3,3,8
- %endif
- movd xmm0, [r1+0*FENC_STRIDE]
- movd xmm1, [r1+1*FENC_STRIDE]
- @@ -1233,12 +1225,10 @@ cglobal zigzag_sub_4x4%1_%2_ssse3, 3,3,8
- pshufb xmm0, xmm7
- pshufb xmm4, xmm7
- pxor xmm6, xmm6
- - movdqa xmm1, xmm0
- - movdqa xmm5, xmm4
- + punpckhbw xmm1, xmm0, xmm6
- + punpckhbw xmm5, xmm4, xmm6
- punpcklbw xmm0, xmm6
- - punpckhbw xmm1, xmm6
- punpcklbw xmm4, xmm6
- - punpckhbw xmm5, xmm6
- psubw xmm0, xmm4
- psubw xmm1, xmm5
- %ifidn %1, ac
- @@ -1259,10 +1249,16 @@ cglobal zigzag_sub_4x4%1_%2_ssse3, 3,3,8
- RET
- %endmacro
- -ZIGZAG_SUB_4x4 , frame
- -ZIGZAG_SUB_4x4 ac, frame
- -ZIGZAG_SUB_4x4 , field
- -ZIGZAG_SUB_4x4 ac, field
- +INIT_XMM
- +ZIGZAG_SUB_4x4 , frame, ssse3
- +ZIGZAG_SUB_4x4 ac, frame, ssse3
- +ZIGZAG_SUB_4x4 , field, ssse3
- +ZIGZAG_SUB_4x4 ac, field, ssse3
- +INIT_AVX
- +ZIGZAG_SUB_4x4 , frame, avx
- +ZIGZAG_SUB_4x4 ac, frame, avx
- +ZIGZAG_SUB_4x4 , field, avx
- +ZIGZAG_SUB_4x4 ac, field, avx
- ;-----------------------------------------------------------------------------
- ; void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
- @@ -1314,6 +1310,8 @@ cglobal zigzag_interleave_8x8_cavlc_%1, 3,3,8*(mmsize/16)
- %ifdef HIGH_BIT_DEPTH
- INIT_XMM
- ZIGZAG_8x8_CAVLC sse2, D
- +INIT_AVX
- +ZIGZAG_8x8_CAVLC avx , D
- %else
- INIT_MMX
- ZIGZAG_8x8_CAVLC mmx , W
- @@ -1350,8 +1348,8 @@ ZIGZAG_8x8_CAVLC mmx , W
- %endmacro
- %ifndef HIGH_BIT_DEPTH
- -INIT_XMM
- -cglobal zigzag_interleave_8x8_cavlc_sse2, 3,3,8
- +%macro ZIGZAG_8x8_CAVLC 1
- +cglobal zigzag_interleave_8x8_cavlc_%1, 3,3,8
- INTERLEAVE_XMM 0
- INTERLEAVE_XMM 16
- packsswb m2, m3
- @@ -1365,4 +1363,10 @@ cglobal zigzag_interleave_8x8_cavlc_sse2, 3,3,8
- shr r0d, 16
- mov [r2+8], r0w
- RET
- +%endmacro
- +
- +INIT_XMM
- +ZIGZAG_8x8_CAVLC sse2
- +INIT_AVX
- +ZIGZAG_8x8_CAVLC avx
- %endif ; !HIGH_BIT_DEPTH
- diff --git a/common/x86/dct.h b/common/x86/dct.h
- index 9361132..d6e3b68 100644
- --- a/common/x86/dct.h
- +++ b/common/x86/dct.h
- @@ -71,21 +71,29 @@ void x264_add16x16_idct8_mmx ( uint8_t *dst, int16_t dct[4][64] );
- void x264_add8x8_idct8_sse2 ( uint8_t *dst, int16_t dct [64] );
- void x264_add16x16_idct8_sse2( uint8_t *dst, int16_t dct[4][64] );
- +void x264_zigzag_scan_8x8_frame_avx ( dctcoef level[64], dctcoef dct[64] );
- void x264_zigzag_scan_8x8_frame_ssse3 ( int16_t level[64], int16_t dct[64] );
- void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] );
- void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[64] );
- +void x264_zigzag_scan_4x4_frame_avx ( dctcoef level[16], dctcoef dct[16] );
- void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[16] );
- void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] );
- void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] );
- void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] );
- void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[16] );
- +void x264_zigzag_scan_8x8_field_avx ( int32_t level[64], int32_t dct[64] );
- void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] );
- void x264_zigzag_scan_8x8_field_mmxext( int16_t level[64], int16_t dct[64] );
- +int x264_zigzag_sub_4x4_frame_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst );
- int x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
- +int x264_zigzag_sub_4x4ac_frame_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
- int x264_zigzag_sub_4x4ac_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
- +int x264_zigzag_sub_4x4_field_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst );
- int x264_zigzag_sub_4x4_field_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
- +int x264_zigzag_sub_4x4ac_field_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
- int x264_zigzag_sub_4x4ac_field_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
- void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz );
- void x264_zigzag_interleave_8x8_cavlc_sse2( dctcoef *dst, dctcoef *src, uint8_t *nnz );
- +void x264_zigzag_interleave_8x8_cavlc_avx( dctcoef *dst, dctcoef *src, uint8_t *nnz );
- #endif
- diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
- index 42cafa9..80d803d 100644
- --- a/common/x86/deblock-a.asm
- +++ b/common/x86/deblock-a.asm
- @@ -47,20 +47,16 @@ cextern pw_pixel_max
- ; out: %4 = |%1-%2|-%3
- ; clobbers: %5
- %macro ABS_SUB 5
- - mova %5, %2
- - mova %4, %1
- - psubusw %5, %1
- - psubusw %4, %2
- + psubusw %5, %2, %1
- + psubusw %4, %1, %2
- por %4, %5
- psubw %4, %3
- %endmacro
- ; out: %4 = |%1-%2|<%3
- %macro DIFF_LT 5
- - mova %4, %2
- - mova %5, %1
- - psubusw %4, %1
- - psubusw %5, %2
- + psubusw %4, %2, %1
- + psubusw %5, %1, %2
- por %5, %4 ; |%1-%2|
- pxor %4, %4
- psubw %5, %3 ; |%1-%2|-%3
- @@ -105,11 +101,10 @@ cextern pw_pixel_max
- ; out: %1=p0', m2=q0'
- %macro DEBLOCK_P0_Q0 7
- psubw %3, %4
- - mova %6, %2
- pxor %7, %7
- paddw %3, [pw_4]
- psubw %7, %5
- - psubw %6, %1
- + psubw %6, %2, %1
- psllw %6, 2
- paddw %3, %6
- psraw %3, 3
- @@ -124,8 +119,7 @@ cextern pw_pixel_max
- ; in: %1=x2, %2=x1, %3=p0, %4=q0 %5=mask&tc, %6=tmp
- %macro LUMA_Q1 6
- - mova %6, %3
- - pavgw %6, %4 ; (p0+q0+1)>>1
- + pavgw %6, %3, %4 ; (p0+q0+1)>>1
- paddw %1, %6
- pxor %6, %6
- psraw %1, 1
- @@ -350,7 +344,8 @@ INIT_XMM
- SWAP m3, m9
- %endmacro
- -cglobal deblock_v_luma_sse2, 5,5,15
- +%macro DEBLOCK_LUMA_64 1
- +cglobal deblock_v_luma_%1, 5,5,15
- %define p2 m8
- %define p1 m0
- %define p0 m1
- @@ -386,7 +381,7 @@ cglobal deblock_v_luma_sse2, 5,5,15
- jg .loop
- REP_RET
- -cglobal deblock_h_luma_sse2, 5,7,15
- +cglobal deblock_h_luma_%1, 5,7,15
- add r1, r1
- LOAD_AB m12, m13, r2, r3
- mov r2, r1
- @@ -423,6 +418,12 @@ cglobal deblock_h_luma_sse2, 5,7,15
- dec r6
- jg .loop
- REP_RET
- +%endmacro
- +
- +INIT_XMM
- +DEBLOCK_LUMA_64 sse2
- +INIT_AVX
- +DEBLOCK_LUMA_64 avx
- %endif
- %macro SWAPMOVA 2
- @@ -437,10 +438,16 @@ cglobal deblock_h_luma_sse2, 5,7,15
- ; %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
- ; %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
- %macro LUMA_INTRA_P012 12 ; p0..p3 in memory
- +%ifdef ARCH_X86_64
- + paddw t0, %3, %2
- + mova t2, %4
- + paddw t2, %3
- +%else
- mova t0, %3
- mova t2, %4
- paddw t0, %2
- paddw t2, %3
- +%endif
- paddw t0, %1
- paddw t2, t2
- paddw t0, %5
- @@ -448,9 +455,8 @@ cglobal deblock_h_luma_sse2, 5,7,15
- paddw t0, %9 ; (p2 + p1 + p0 + q0 + 2)
- paddw t2, t0 ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)
- - mova t1, t0
- psrlw t2, 3
- - psrlw t1, 2
- + psrlw t1, t0, 2
- psubw t2, %3
- psubw t1, %2
- pand t2, %8
- @@ -459,8 +465,7 @@ cglobal deblock_h_luma_sse2, 5,7,15
- paddw t1, %2
- SWAPMOVA %11, t1
- - mova t1, t0
- - psubw t1, %3
- + psubw t1, t0, %3
- paddw t0, t0
- psubw t1, %5
- psubw t0, %3
- @@ -500,9 +505,14 @@ cglobal deblock_h_luma_sse2, 5,7,15
- LOAD_AB t0, t1, r2d, r3d
- mova %1, t0
- LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
- +%ifdef ARCH_X86_64
- + mova %2, t0 ; mask0
- + psrlw t3, %1, 2
- +%else
- mova t3, %1
- mova %2, t0 ; mask0
- psrlw t3, 2
- +%endif
- paddw t3, [pw_2] ; alpha/4+2
- DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2
- pand t2, %2
- @@ -593,11 +603,11 @@ cglobal deblock_h_luma_sse2, 5,7,15
- %endmacro
- %ifdef ARCH_X86_64
- -INIT_XMM
- ;-----------------------------------------------------------------------------
- ; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
- ;-----------------------------------------------------------------------------
- -cglobal deblock_v_luma_intra_sse2, 4,7,16
- +%macro DEBLOCK_LUMA_INTRA_64 1
- +cglobal deblock_v_luma_intra_%1, 4,7,16
- %define t0 m1
- %define t1 m2
- %define t2 m4
- @@ -646,7 +656,7 @@ cglobal deblock_v_luma_intra_sse2, 4,7,16
- ;-----------------------------------------------------------------------------
- ; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
- ;-----------------------------------------------------------------------------
- -cglobal deblock_h_luma_intra_sse2, 4,7,16
- +cglobal deblock_h_luma_intra_%1, 4,7,16
- %define t0 m15
- %define t1 m14
- %define t2 m2
- @@ -702,6 +712,13 @@ cglobal deblock_h_luma_intra_sse2, 4,7,16
- jg .loop
- ADD rsp, pad
- RET
- +%endmacro
- +
- +INIT_XMM
- +DEBLOCK_LUMA_INTRA_64 sse2
- +INIT_AVX
- +DEBLOCK_LUMA_INTRA_64 avx
- +
- %endif
- %macro DEBLOCK_LUMA_INTRA 1
- @@ -780,6 +797,9 @@ DEBLOCK_LUMA_INTRA mmxext
- INIT_XMM
- DEBLOCK_LUMA sse2
- DEBLOCK_LUMA_INTRA sse2
- +INIT_AVX
- +DEBLOCK_LUMA avx
- +DEBLOCK_LUMA_INTRA avx
- %endif
- %endif ; HIGH_BIT_DEPTH
- @@ -815,28 +835,23 @@ DEBLOCK_LUMA_INTRA sse2
- punpckl%2 m4, m5
- punpckh%2 m6, m5
- - mova m1, m0
- - mova m3, m2
- + punpckh%3 m1, m0, m4
- + punpckh%3 m3, m2, m6
- punpckl%3 m0, m4
- - punpckh%3 m1, m4
- punpckl%3 m2, m6
- - punpckh%3 m3, m6
- %endmacro
- ; in: 4 rows of 8 bytes in m0..m3
- ; out: 8 rows of 4 bytes in %1..%8
- %macro TRANSPOSE8x4B_STORE 8
- - mova m4, m0
- - mova m5, m1
- - mova m6, m2
- - punpckhdq m4, m4
- - punpckhdq m5, m5
- - punpckhdq m6, m6
- + punpckhdq m4, m0, m0
- + punpckhdq m5, m1, m1
- + punpckhdq m6, m2, m2
- punpcklbw m0, m1
- punpcklbw m2, m3
- - mova m1, m0
- - punpcklwd m0, m2
- + punpcklwd m1, m0, m2
- + SWAP m0, m1
- punpckhwd m1, m2
- movh %1, m0
- punpckhdq m0, m0
- @@ -848,8 +863,8 @@ DEBLOCK_LUMA_INTRA sse2
- punpckhdq m3, m3
- punpcklbw m4, m5
- punpcklbw m6, m3
- - mova m5, m4
- - punpcklwd m4, m6
- + punpcklwd m5, m4, m6
- + SWAP m4, m5
- punpckhwd m5, m6
- movh %5, m4
- punpckhdq m4, m4
- @@ -877,19 +892,18 @@ DEBLOCK_LUMA_INTRA sse2
- %endmacro
- %macro TRANSPOSE8x2W_STORE 8
- - mova m0, m1
- + punpckhwd m0, m1, m2
- punpcklwd m1, m2
- - punpckhwd m0, m2
- %if mmsize==8
- - movd %1, m1
- movd %3, m0
- + movd %1, m1
- psrlq m1, 32
- psrlq m0, 32
- movd %2, m1
- movd %4, m0
- %else
- - movd %1, m1
- movd %5, m0
- + movd %1, m1
- psrldq m1, 4
- psrldq m0, 4
- movd %2, m1
- @@ -906,9 +920,8 @@ DEBLOCK_LUMA_INTRA sse2
- %endmacro
- %macro SBUTTERFLY3 4
- - movq %4, %2
- + punpckh%1 %4, %2, %3
- punpckl%1 %2, %3
- - punpckh%1 %4, %3
- %endmacro
- ; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
- @@ -984,10 +997,15 @@ DEBLOCK_LUMA_INTRA sse2
- ; out: %4 = |%1-%2|>%3
- ; clobbers: %5
- %macro DIFF_GT 5
- +%if avx_enabled == 0
- mova %5, %2
- mova %4, %1
- psubusb %5, %1
- psubusb %4, %2
- +%else
- + psubusb %5, %2, %1
- + psubusb %4, %1, %2
- +%endif
- por %4, %5
- psubusb %4, %3
- %endmacro
- @@ -995,10 +1013,15 @@ DEBLOCK_LUMA_INTRA sse2
- ; out: %4 = |%1-%2|>%3
- ; clobbers: %5
- %macro DIFF_GT2 5
- +%ifdef ARCH_X86_64
- + psubusb %5, %2, %1
- + psubusb %4, %1, %2
- +%else
- mova %5, %2
- mova %4, %1
- psubusb %5, %1
- psubusb %4, %2
- +%endif
- psubusb %5, %3
- psubusb %4, %3
- pcmpeqb %4, %5
- @@ -1030,8 +1053,7 @@ DEBLOCK_LUMA_INTRA sse2
- ; out: m1=p0' m2=q0'
- ; clobbers: m0,3-6
- %macro DEBLOCK_P0_Q0 0
- - mova m5, m1
- - pxor m5, m2 ; p0^q0
- + pxor m5, m1, m2 ; p0^q0
- pand m5, [pb_1] ; (p0^q0)&1
- pcmpeqb m4, m4
- pxor m3, m4
- @@ -1057,14 +1079,12 @@ DEBLOCK_LUMA_INTRA sse2
- ; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
- ; clobbers: q2, tmp, tc0
- %macro LUMA_Q1 6
- - mova %6, m1
- - pavgb %6, m2
- + pavgb %6, m1, m2
- pavgb %2, %6 ; avg(p2,avg(p0,q0))
- pxor %6, %3
- pand %6, [pb_1] ; (p2^avg(p0,q0))&1
- psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
- - mova %6, %1
- - psubusb %6, %5
- + psubusb %6, %1, %5
- paddusb %5, %1
- pmaxub %2, %6
- pminub %2, %5
- @@ -1075,8 +1095,8 @@ DEBLOCK_LUMA_INTRA sse2
- ;-----------------------------------------------------------------------------
- ; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
- ;-----------------------------------------------------------------------------
- -INIT_XMM
- -cglobal deblock_v_luma_sse2, 5,5,10
- +%macro DEBLOCK_LUMA 1
- +cglobal deblock_v_luma_%1, 5,5,10
- movd m8, [r4] ; tc0
- lea r4, [r1*3]
- dec r2d ; alpha-1
- @@ -1100,8 +1120,7 @@ cglobal deblock_v_luma_sse2, 5,5,10
- movdqa m3, [r4] ; p2
- DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
- pand m6, m9
- - mova m7, m8
- - psubb m7, m6
- + psubb m7, m8, m6
- pand m6, m8
- LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
- @@ -1122,7 +1141,7 @@ cglobal deblock_v_luma_sse2, 5,5,10
- ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
- ;-----------------------------------------------------------------------------
- INIT_MMX
- -cglobal deblock_h_luma_sse2, 5,7
- +cglobal deblock_h_luma_%1, 5,7
- movsxd r10, r1d
- lea r11, [r10+r10*2]
- lea r6, [r0-4]
- @@ -1149,7 +1168,7 @@ cglobal deblock_h_luma_sse2, 5,7
- %ifdef WIN64
- mov [rsp+0x20], r4
- %endif
- - call deblock_v_luma_sse2
- + call deblock_v_luma_%1
- ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
- add r6, 2
- @@ -1176,6 +1195,12 @@ cglobal deblock_h_luma_sse2, 5,7
- add rsp, 0x68
- %endif
- RET
- +%endmacro
- +
- +INIT_XMM
- +DEBLOCK_LUMA sse2
- +INIT_AVX
- +DEBLOCK_LUMA avx
- %else
- @@ -1212,8 +1237,7 @@ cglobal deblock_%2_luma_%1, 5,5
- DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
- pand m6, m4
- pand m4, [esp+%3] ; tc
- - mova m7, m4
- - psubb m7, m6
- + psubb m7, m4, m6
- pand m6, m4
- LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
- @@ -1295,22 +1319,34 @@ INIT_MMX
- DEBLOCK_LUMA mmxext, v8, 8
- INIT_XMM
- DEBLOCK_LUMA sse2, v, 16
- +INIT_AVX
- +DEBLOCK_LUMA avx, v, 16
- %endif ; ARCH
- %macro LUMA_INTRA_P012 4 ; p0..p3 in memory
- +%ifdef ARCH_X86_64
- + pavgb t0, p2, p1
- + pavgb t1, p0, q0
- +%else
- mova t0, p2
- mova t1, p0
- pavgb t0, p1
- pavgb t1, q0
- +%endif
- pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
- mova t5, t1
- +%ifdef ARCH_X86_64
- + paddb t2, p2, p1
- + paddb t3, p0, q0
- +%else
- mova t2, p2
- mova t3, p0
- paddb t2, p1
- paddb t3, q0
- +%endif
- paddb t2, t3
- mova t3, t2
- mova t4, t2
- @@ -1320,10 +1356,15 @@ DEBLOCK_LUMA sse2, v, 16
- pand t2, mpb_1
- psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;
- +%ifdef ARCH_X86_64
- + pavgb t1, p2, q1
- + psubb t2, p2, q1
- +%else
- mova t1, p2
- mova t2, p2
- pavgb t1, q1
- psubb t2, q1
- +%endif
- paddb t3, t3
- psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
- pand t2, mpb_1
- @@ -1336,10 +1377,8 @@ DEBLOCK_LUMA sse2, v, 16
- pand t3, mpb_1
- psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8
- - mova t3, p0
- - mova t2, p0
- - pxor t3, q1
- - pavgb t2, q1
- + pxor t3, p0, q1
- + pavgb t2, p0, q1
- pand t3, mpb_1
- psubb t2, t3
- pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4
- @@ -1353,9 +1392,8 @@ DEBLOCK_LUMA sse2, v, 16
- mova %1, t1 ; store p0
- mova t1, %4 ; p3
- - mova t2, t1
- + paddb t2, t1, p2
- pavgb t1, p2
- - paddb t2, p2
- pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
- paddb t2, t2
- paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
- @@ -1553,6 +1591,8 @@ cglobal deblock_h_luma_intra_%1, 2,4
- INIT_XMM
- DEBLOCK_LUMA_INTRA sse2, v
- +INIT_AVX
- +DEBLOCK_LUMA_INTRA avx , v
- %ifndef ARCH_X86_64
- INIT_MMX
- DEBLOCK_LUMA_INTRA mmxext, v8
- @@ -1566,9 +1606,8 @@ DEBLOCK_LUMA_INTRA mmxext, v8
- mova %6, [pw_2]
- paddw %6, %3
- paddw %6, %4
- - mova %7, %6
- + paddw %7, %6, %2
- paddw %6, %1
- - paddw %7, %2
- paddw %6, %3
- paddw %7, %4
- psraw %6, 2
- @@ -1604,12 +1643,10 @@ DEBLOCK_LUMA_INTRA mmxext, v8
- punpckldq m2, m7 ; q0 ... q1 ...
- punpckldq m4, m1
- punpckldq m6, m3
- - mova m1, m0
- - mova m3, m2
- + punpckhqdq m1, m0, m4 ; p0
- punpcklqdq m0, m4 ; p1
- - punpckhqdq m1, m4 ; p0
- + punpckhqdq m3, m2, m6 ; q1
- punpcklqdq m2, m6 ; q0
- - punpckhqdq m3, m6 ; q1
- %endif
- %endmacro
- @@ -1743,6 +1780,8 @@ DEBLOCK_CHROMA mmxext
- %endif
- INIT_XMM
- DEBLOCK_CHROMA sse2
- +INIT_AVX
- +DEBLOCK_CHROMA avx
- %endif ; HIGH_BIT_DEPTH
- %ifndef HIGH_BIT_DEPTH
- @@ -1839,6 +1878,8 @@ chroma_inter_body_%1:
- INIT_XMM
- DEBLOCK_CHROMA sse2
- +INIT_AVX
- +DEBLOCK_CHROMA avx
- %ifndef ARCH_X86_64
- INIT_MMX
- DEBLOCK_CHROMA mmxext
- @@ -1848,12 +1889,11 @@ DEBLOCK_CHROMA mmxext
- ; in: %1=p0 %2=p1 %3=q1
- ; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
- %macro CHROMA_INTRA_P0 3
- - mova m4, %1
- - pxor m4, %3
- + pxor m4, %1, %3
- pand m4, [pb_1] ; m4 = (p0^q1)&1
- pavgb %1, %3
- psubusb %1, m4
- - pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
- + pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
- %endmacro
- %define t5 r4
- @@ -1905,6 +1945,8 @@ chroma_intra_body_%1:
- INIT_XMM
- DEBLOCK_CHROMA_INTRA sse2
- +INIT_AVX
- +DEBLOCK_CHROMA_INTRA avx
- %ifndef ARCH_X86_64
- INIT_MMX
- DEBLOCK_CHROMA_INTRA mmxext
- @@ -1993,9 +2035,8 @@ DEBLOCK_CHROMA_INTRA mmxext
- shufps m2, m1, 0xdd ; cur nnz, all rows
- pslldq m1, 1
- shufps m0, m1, 0xdd ; left neighbors
- - mova m1, m2
- + pslldq m1, m2, 4
- movd m3, [%1-8] ; could be palignr if nnz was aligned
- - pslldq m1, 4
- por m1, m3 ; top neighbors
- %endmacro
- @@ -2065,23 +2106,19 @@ cglobal deblock_strength_%1, 6,6,8
- por m5, m1
- ; Check mvs
- -%ifidn %1, ssse3
- - mova m3, [mv+4*8*0]
- - mova m2, [mv+4*8*1]
- - mova m0, m3
- - mova m1, m2
- - palignr m3, [mv+4*8*0-16], 12
- - palignr m2, [mv+4*8*1-16], 12
- +%ifnidn %1, sse2
- + mova m0, [mv+4*8*0]
- + mova m1, [mv+4*8*1]
- + palignr m3, m0, [mv+4*8*0-16], 12
- + palignr m2, m1, [mv+4*8*1-16], 12
- psubw m0, m3
- psubw m1, m2
- packsswb m0, m1
- - mova m3, [mv+4*8*2]
- - mova m7, [mv+4*8*3]
- - mova m2, m3
- - mova m1, m7
- - palignr m3, [mv+4*8*2-16], 12
- - palignr m7, [mv+4*8*3-16], 12
- + mova m2, [mv+4*8*2]
- + mova m1, [mv+4*8*3]
- + palignr m3, m2, [mv+4*8*2-16], 12
- + palignr m7, m1, [mv+4*8*3-16], 12
- psubw m2, m3
- psubw m1, m7
- packsswb m2, m1
- @@ -2153,3 +2190,5 @@ INIT_XMM
- DEBLOCK_STRENGTH_XMM sse2
- %define ABSB2 ABSB2_SSSE3
- DEBLOCK_STRENGTH_XMM ssse3
- +INIT_AVX
- +DEBLOCK_STRENGTH_XMM avx
- diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
- index 121d14e..e7cbf19 100644
- --- a/common/x86/mc-a.asm
- +++ b/common/x86/mc-a.asm
- @@ -155,7 +155,6 @@ cextern pd_32
- %endmacro
- %ifdef HIGH_BIT_DEPTH
- -
- %macro BIWEIGHT_ROW 4
- BIWEIGHT [%2], [%3]
- %if %4==mmsize/4
- @@ -456,6 +455,11 @@ WEIGHTER 16, sse2
- WEIGHTER 20, sse2
- %ifdef HIGH_BIT_DEPTH
- WEIGHTER 12, sse2
- +INIT_AVX
- +WEIGHTER 8, avx
- +WEIGHTER 12, avx
- +WEIGHTER 16, avx
- +WEIGHTER 20, avx
- %else
- %define WEIGHT WEIGHT_SSSE3
- %define WEIGHT_START WEIGHT_START_SSSE3
- @@ -465,6 +469,10 @@ INIT_XMM
- WEIGHTER 8, ssse3
- WEIGHTER 16, ssse3
- WEIGHTER 20, ssse3
- +INIT_AVX
- +WEIGHTER 8, avx
- +WEIGHTER 16, avx
- +WEIGHTER 20, avx
- %endif
- %macro OFFSET_OP 7
- @@ -541,8 +549,15 @@ INIT_XMM
- OFFSETPN 12, sse2
- OFFSETPN 16, sse2
- OFFSETPN 20, sse2
- +INIT_AVX
- +OFFSETPN 12, avx
- +OFFSETPN 16, avx
- +OFFSETPN 20, avx
- %ifdef HIGH_BIT_DEPTH
- +INIT_XMM
- OFFSETPN 8, sse2
- +INIT_AVX
- +OFFSETPN 8, avx
- %endif
- %undef LOAD_HEIGHT
- %undef HEIGHT_REG
- @@ -616,7 +631,6 @@ AVGH 16, 16, mmxext
- AVGH 16, 8, mmxext
- INIT_XMM
- -
- AVG_FUNC 4, movq, movq, sse2
- AVGH 4, 8, sse2
- AVGH 4, 4, sse2
- @@ -1450,16 +1464,15 @@ cglobal prefetch_ref_mmxext, 3,3
- %macro UNPACK_UNALIGNED 4
- movu %1, [%4+0]
- movu %2, [%4+4]
- - mova %3, %1
- + punpckhwd %3, %1, %2
- punpcklwd %1, %2
- - punpckhwd %3, %2
- - mova %2, %1
- %if mmsize == 8
- + mova %2, %1
- punpcklwd %1, %3
- punpckhwd %2, %3
- %else
- + shufps %2, %1, %3, 11011101b
- shufps %1, %3, 10001000b
- - shufps %2, %3, 11011101b
- %endif
- %endmacro
- %else ; !HIGH_BIT_DEPTH
- @@ -1620,12 +1633,10 @@ ALIGN 4
- movu m1, [r3+mmsize/2]
- UNPACK_UNALIGNED m0, m2, [r3+2]
- UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
- - mova m2, m0
- - mova m3, m1
- + psrlw m2, m0, 8
- + psrlw m3, m1, 8
- pand m0, [pw_00ff]
- pand m1, [pw_00ff]
- - psrlw m2, 8
- - psrlw m3, 8
- %endif
- pmaddwd m0, m7
- pmaddwd m2, m7
- @@ -1652,12 +1663,10 @@ ALIGN 4
- movu m1, [r3+mmsize/2]
- UNPACK_UNALIGNED m0, m2, [r3+2]
- UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
- - mova m2, m0
- - mova m3, m1
- + psrlw m2, m0, 8
- + psrlw m3, m1, 8
- pand m0, [pw_00ff]
- pand m1, [pw_00ff]
- - psrlw m2, 8
- - psrlw m3, 8
- pmaddwd m0, m7
- pmaddwd m2, m7
- pmaddwd m1, m7
- @@ -1668,9 +1677,8 @@ ALIGN 4
- pmullw m4, m6
- pmullw m5, m6
- mova m2, [pw_32]
- - mova m3, m2
- + paddw m3, m2, m5
- paddw m2, m4
- - paddw m3, m5
- mova m4, m0
- mova m5, m1
- pmullw m0, multy0
- @@ -1799,12 +1807,10 @@ ALIGN 4
- movhps m0, [r3]
- movhps m1, [r3+r6]
- %endif
- - mova m2, m0
- - mova m3, m1
- + psrlw m2, m0, 8
- + psrlw m3, m1, 8
- pand m0, [pw_00ff]
- pand m1, [pw_00ff]
- - psrlw m2, 8
- - psrlw m3, 8
- %endif ; HIGH_BIT_DEPTH
- pmullw m0, m4
- pmullw m1, m5
- @@ -1868,9 +1874,8 @@ ALIGN 4
- %endmacro ; MC_CHROMA
- -%macro MC_CHROMA_SSSE3 0-1
- -INIT_XMM
- -cglobal mc_chroma_ssse3%1, 0,6,9
- +%macro MC_CHROMA_SSSE3 1-2
- +cglobal mc_chroma_%1, 0,6,9
- MC_CHROMA_START
- and r5d, 7
- and t2d, 7
- @@ -1884,7 +1889,7 @@ cglobal mc_chroma_ssse3%1, 0,6,9
- imul r5d, t0d ; (x*255+8)*(8-y)
- movd m6, t2d
- movd m7, r5d
- -%ifidn %1, _cache64
- +%ifidn %2, _cache64
- mov t0d, r3d
- and t0d, 7
- %ifdef PIC
- @@ -1913,11 +1918,10 @@ cglobal mc_chroma_ssse3%1, 0,6,9
- pshufb m1, m5
- movu m3, [r3+r4*2]
- pshufb m3, m5
- - mova m2, m1
- mova m4, m3
- pmaddubsw m0, m7
- + pmaddubsw m2, m1, m7
- pmaddubsw m1, m6
- - pmaddubsw m2, m7
- pmaddubsw m3, m6
- paddw m0, [pw_32]
- paddw m2, [pw_32]
- @@ -2008,6 +2012,8 @@ INIT_MMX
- MC_CHROMA mmxext
- INIT_XMM
- MC_CHROMA sse2
- +INIT_AVX
- +MC_CHROMA avx
- %else ; !HIGH_BIT_DEPTH
- INIT_MMX
- %define UNPACK_UNALIGNED UNPACK_UNALIGNED_MEM
- @@ -2016,6 +2022,9 @@ INIT_XMM
- MC_CHROMA sse2_misalign
- %define UNPACK_UNALIGNED UNPACK_UNALIGNED_LOAD
- MC_CHROMA sse2
- -MC_CHROMA_SSSE3
- -MC_CHROMA_SSSE3 _cache64
- +MC_CHROMA_SSSE3 ssse3
- +MC_CHROMA_SSSE3 ssse3_cache64, _cache64
- +INIT_AVX
- +MC_CHROMA_SSSE3 avx
- +MC_CHROMA_SSSE3 avx_cache64, _cache64
- %endif ; HIGH_BIT_DEPTH
- diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
- index cba9773..d472678 100644
- --- a/common/x86/mc-a2.asm
- +++ b/common/x86/mc-a2.asm
- @@ -73,12 +73,10 @@ cextern pd_ffff
- %macro LOAD_ADD_2 6
- mova %5, %3
- mova %1, %4
- - mova %6, %5
- - mova %2, %1
- + punpckhbw %6, %5, m0
- punpcklbw %5, m0
- + punpckhbw %2, %1, m0
- punpcklbw %1, m0
- - punpckhbw %6, m0
- - punpckhbw %2, m0
- paddw %1, %5
- paddw %2, %6
- %endmacro
- @@ -301,7 +299,7 @@ cglobal hpel_filter_h_%1, 3,4,8*(mmsize/16)
- mova [r0+r2-mmsize*1], m4
- jl .loop
- REP_RET
- -%endmacro
- +%endmacro ; HPEL_FILTER
- INIT_MMX
- HPEL_FILTER mmxext
- @@ -310,8 +308,6 @@ HPEL_FILTER sse2
- %endif ; HIGH_BIT_DEPTH
- %ifndef HIGH_BIT_DEPTH
- -INIT_MMX
- -
- %macro HPEL_V 1-2 0
- ;-----------------------------------------------------------------------------
- ; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
- @@ -326,13 +322,20 @@ cglobal hpel_filter_v_%1, 5,6,%2
- add r0, r4
- lea r2, [r2+r4*2]
- neg r4
- -%ifnidn %1, ssse3
- +; only the ssse3 and avx builds take the filt_mul15 path; mmxext and sse2 need m0 = 0
- +%assign hpel_v_ssse3 0
- +%ifidn %1, ssse3
- +%assign hpel_v_ssse3 1
- +%elifidn %1, avx
- +%assign hpel_v_ssse3 1
- +%endif
- +%if hpel_v_ssse3 == 0
- pxor m0, m0
- %else
- mova m0, [filt_mul15]
- %endif
- .loop:
- -%ifidn %1, ssse3
- +%if hpel_v_ssse3
- mova m1, [r1]
- mova m4, [r1+r3]
- mova m2, [r5+r3*2]
- @@ -370,7 +366,6 @@ cglobal hpel_filter_v_%1, 5,6,%2
- jl .loop
- REP_RET
- %endmacro
- -HPEL_V mmxext
- ;-----------------------------------------------------------------------------
- ; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
- @@ -585,19 +580,14 @@ cglobal hpel_filter_h_ssse3, 3,3
- mova m7, [pw_16]
- .loop:
- mova m2, [src+16]
- - mova m3, m1
- - palignr m3, m0, 14
- - mova m4, m1
- - palignr m4, m0, 15
- - mova m0, m2
- - palignr m0, m1, 2
- + palignr m3, m1, m0, 14
- + palignr m4, m1, m0, 15
- + palignr m0, m2, m1, 2
- pmaddubsw m3, [filt_mul15]
- pmaddubsw m4, [filt_mul15]
- pmaddubsw m0, [filt_mul51]
- - mova m5, m2
- - palignr m5, m1, 1
- - mova m6, m2
- - palignr m6, m1, 3
- + palignr m5, m2, m1, 1
- + palignr m6, m2, m1, 3
- paddw m3, m0
- mova m0, m1
- pmaddubsw m1, [filt_mul20]
- @@ -613,9 +603,12 @@ cglobal hpel_filter_h_ssse3, 3,3
- add r2, 16
- jl .loop
- REP_RET
- -%endif
- +%endif ; !ARCH_X86_64
- %define PALIGNR PALIGNR_MMX
- +INIT_MMX
- +HPEL_V mmxext
- +INIT_XMM
- %ifndef ARCH_X86_64
- HPEL_C sse2
- %endif
- @@ -624,9 +617,11 @@ HPEL_C sse2_misalign
- %define PALIGNR PALIGNR_SSSE3
- HPEL_C ssse3
- HPEL_V ssse3
- +INIT_AVX
- +HPEL_C avx
- +HPEL_V avx
- %ifdef ARCH_X86_64
- -
- %macro DO_FILT_V 6
- ;The optimum prefetch distance is difficult to determine in checkasm:
- ;any prefetch seems slower than not prefetching.
- @@ -634,22 +629,19 @@ HPEL_V ssse3
- ;+16 is picked somewhat arbitrarily here based on the fact that even one
- ;loop iteration is going to take longer than the prefetch.
- prefetcht0 [r1+r2*2+16]
- -%ifidn %6, ssse3
- +%ifnidn %6, sse2
- mova m1, [r3]
- mova m2, [r3+r2]
- mova %3, [r3+r2*2]
- mova m3, [r1]
- mova %1, [r1+r2]
- mova %2, [r1+r2*2]
- - mova m4, m1
- + punpckhbw m4, m1, m2
- punpcklbw m1, m2
- - punpckhbw m4, m2
- - mova m2, %1
- + punpckhbw m2, %1, %2
- punpcklbw %1, %2
- - punpckhbw m2, %2
- - mova %2, m3
- + punpckhbw %2, m3, %3
- punpcklbw m3, %3
- - punpckhbw %2, %3
- pmaddubsw m1, m12
- pmaddubsw m4, m12
- @@ -677,7 +669,18 @@ HPEL_V ssse3
- movntps [r11+r4+%5], m1
- %endmacro
- -%macro FILT_C 4
- +%macro FILT_C 5
- +%ifidn %5, avx
- + palignr m1, %2, %1, 12
- + palignr m2, %2, %1, 14
- + palignr m3, %3, %2, 4
- + palignr m4, %3, %2, 2
- + paddw m3, m2
- + mova %1, %3
- + PALIGNR %3, %2, 6, m2
- + paddw m4, %2
- + paddw %3, m1
- +%else
- mova m1, %2
- PALIGNR m1, %1, 12, m2
- mova m2, %2
- @@ -691,28 +694,34 @@ HPEL_V ssse3
- PALIGNR %3, %2, 6, m2
- paddw m4, %2
- paddw %3, m1
- +%endif
- FILT_H %3, m3, m4
- %endmacro
- -%macro DO_FILT_C 4
- - FILT_C %1, %2, %3, 6
- - FILT_C %2, %1, %4, 6
- +%macro DO_FILT_C 5
- + FILT_C %1, %2, %3, 6, %5
- + FILT_C %2, %1, %4, 6, %5
- FILT_PACK %3, %4, 6, m15
- movntps [r5+r4], %3
- %endmacro
- %macro ADD8TO16 5
- - mova %3, %1
- - mova %4, %2
- + punpckhbw %3, %1, %5
- punpcklbw %1, %5
- + punpcklbw %4, %2, %5
- punpckhbw %2, %5
- - punpckhbw %3, %5
- - punpcklbw %4, %5
- paddw %2, %3
- paddw %1, %4
- %endmacro
- %macro DO_FILT_H 4
- +%ifidn %4, avx
- + palignr m1, %2, %1, 14
- + palignr m2, %2, %1, 15
- + palignr m4, %3, %2, 1
- + palignr m5, %3, %2, 2
- + palignr m6, %3, %2, 3
- +%else
- mova m1, %2
- PALIGNR m1, %1, 14, m3
- mova m2, %2
- @@ -723,6 +732,7 @@ HPEL_V ssse3
- PALIGNR m5, %2, 2 , m3
- mova m6, %3
- PALIGNR m6, %2, 3 , m3
- +%endif
- mova %1, %2
- %ifidn %4, sse2
- ADD8TO16 m1, m6, m12, m3, m0 ; a
- @@ -730,7 +740,7 @@ HPEL_V ssse3
- ADD8TO16 %2, m4, m12, m3, m0 ; c
- FILT_V2 m1, m2, %2, m6, m5, m4
- FILT_PACK m1, m6, 5, m15
- -%else ; ssse3
- +%else ; ssse3, avx
- pmaddubsw m1, m12
- pmaddubsw m2, m12
- pmaddubsw %2, m14
- @@ -790,7 +800,7 @@ cglobal hpel_filter_%1, 7,7,16
- DO_FILT_V m6, m5, m11, m12, 16, %1
- .lastx:
- paddw m15, m15 ; pw_32
- - DO_FILT_C m9, m8, m7, m6
- + DO_FILT_C m9, m8, m7, m6, %1
- psrlw m15, 1 ; pw_16
- movdqa m7, m5
- DO_FILT_H m10, m13, m11, %1
- @@ -813,11 +823,14 @@ cglobal hpel_filter_%1, 7,7,16
- RET
- %endmacro
- +INIT_XMM
- %define PALIGNR PALIGNR_MMX
- HPEL sse2
- %define PALIGNR PALIGNR_SSSE3
- HPEL ssse3
- -%endif
- +INIT_AVX
- +HPEL avx
- +%endif ; ARCH_X86_64
- %undef movntq
- %undef movntps
- @@ -890,9 +903,8 @@ cglobal plane_copy_core_mmxext, 6,7
- %rep 16/mmsize
- mov%4 m0, [%2+(x/2)*mmsize]
- mov%4 m1, [%3+(x/2)*mmsize]
- - mova m2, m0
- + punpckhwd m2, m0, m1
- punpcklwd m0, m1
- - punpckhwd m2, m1
- mov%5a [%1+(x+0)*mmsize], m0
- mov%5a [%1+(x+1)*mmsize], m2
- %assign x (x+2)
- @@ -909,9 +921,8 @@ cglobal plane_copy_core_mmxext, 6,7
- mov%5a [%1], m0
- %else
- movq m1, [%3]
- - mova m2, m0
- + punpckhbw m2, m0, m1
- punpcklbw m0, m1
- - punpckhbw m2, m1
- mov%5a [%1+0], m0
- mov%5a [%1+8], m2
- %endif
- @@ -924,12 +935,10 @@ cglobal plane_copy_core_mmxext, 6,7
- %rep 16/mmsize
- mova m0, [%3+(n+0)*mmsize]
- mova m1, [%3+(n+1)*mmsize]
- - mova m2, m0
- - mova m3, m1
- + psrld m2, m0, 16
- + psrld m3, m1, 16
- pand m0, %6
- pand m1, %6
- - psrld m2, 16
- - psrld m3, 16
- packssdw m0, m1
- packssdw m2, m3
- mov%7 [%1+(n/2)*mmsize], m0
- @@ -1142,6 +1151,9 @@ PLANE_DEINTERLEAVE mmx
- INIT_XMM
- PLANE_INTERLEAVE sse2
- PLANE_DEINTERLEAVE sse2
- +INIT_AVX
- +PLANE_INTERLEAVE avx
- +PLANE_DEINTERLEAVE avx
- %else
- INIT_MMX
- PLANE_INTERLEAVE mmxext
- @@ -1258,7 +1270,8 @@ cglobal integral_init4h_sse4, 3,4
- jl .loop
- REP_RET
- -cglobal integral_init8h_sse4, 3,4
- +%macro INTEGRAL_INIT8H 1
- +cglobal integral_init8h_%1, 3,4
- lea r3, [r0+r2*2]
- add r1, r2
- neg r2
- @@ -1267,12 +1280,10 @@ cglobal integral_init8h_sse4, 3,4
- movdqa m0, [r1+r2]
- movdqa m1, [r1+r2+16]
- palignr m1, m0, 8
- - movdqa m2, m0
- - movdqa m3, m1
- + mpsadbw m2, m0, m4, 4
- + mpsadbw m3, m1, m4, 4
- mpsadbw m0, m4, 0
- mpsadbw m1, m4, 0
- - mpsadbw m2, m4, 4
- - mpsadbw m3, m4, 4
- paddw m0, [r0+r2*2]
- paddw m1, [r0+r2*2+16]
- paddw m0, m2
- @@ -1282,6 +1293,12 @@ cglobal integral_init8h_sse4, 3,4
- add r2, 16
- jl .loop
- REP_RET
- +%endmacro
- +
- +INIT_XMM
- +INTEGRAL_INIT8H sse4
- +INIT_AVX
- +INTEGRAL_INIT8H avx
- %macro INTEGRAL_INIT_8V 1
- ;-----------------------------------------------------------------------------
- @@ -1394,12 +1411,10 @@ cglobal integral_init4v_ssse3, 3,5
- PALIGNR %2, %4, 1, m6
- pavgb %1, %3
- pavgb %2, %4
- - mova %5, %1
- - mova %6, %2
- + psrlw %5, %1, 8
- + psrlw %6, %2, 8
- pand %1, m7
- pand %2, m7
- - psrlw %5, 8
- - psrlw %6, 8
- %endmacro
- %macro FILT16x2 4
- @@ -1411,12 +1426,10 @@ cglobal integral_init4v_ssse3, 3,5
- pavgb %1, m3
- PALIGNR m3, m2, 1, m6
- pavgb m3, m2
- - mova m5, m3
- - mova m4, %1
- + psrlw m5, m3, 8
- + psrlw m4, %1, 8
- pand m3, m7
- pand %1, m7
- - psrlw m5, 8
- - psrlw m4, 8
- packuswb m3, %1
- packuswb m5, m4
- mova [%2], m3
- @@ -1435,12 +1448,10 @@ cglobal integral_init4v_ssse3, 3,5
- pavgb m0, [r0+%3+r5+1]
- pavgb m1, m3
- pavgb m0, m2
- - mova m3, m1
- - mova m2, m0
- + psrlw m3, m1, 8
- + psrlw m2, m0, 8
- pand m1, m7
- pand m0, m7
- - psrlw m3, 8
- - psrlw m2, 8
- packuswb m0, m1
- packuswb m2, m3
- mova [%1], m0
- @@ -1458,12 +1469,10 @@ cglobal integral_init4v_ssse3, 3,5
- pavgw m0, [r0+%3+r5+2]
- pavgw m1, m3
- pavgw m0, m2
- - mova m3, m1
- - mova m2, m0
- + psrld m3, m1, 16
- + psrld m2, m0, 16
- pand m1, m7
- pand m0, m7
- - psrld m3, 16
- - psrld m2, 16
- packssdw m0, m1
- packssdw m2, m3
- movu [%1], m0
- @@ -1479,12 +1488,10 @@ cglobal integral_init4v_ssse3, 3,5
- pavgw %1, m3
- PALIGNR m3, m2, 2, m6
- pavgw m3, m2
- - mova m5, m3
- - mova m4, %1
- + psrld m5, m3, 16
- + psrld m4, %1, 16
- pand m3, m7
- pand %1, m7
- - psrld m5, 16
- - psrld m4, 16
- packssdw m3, %1
- packssdw m5, m4
- mova [%2], m3
- diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
- index 16097a2..f5116f1 100644
- --- a/common/x86/mc-c.c
- +++ b/common/x86/mc-c.c
- @@ -71,6 +71,11 @@ MC_WEIGHT( 8, ssse3 )
- MC_WEIGHT( 12, ssse3 )
- MC_WEIGHT( 16, ssse3 )
- MC_WEIGHT( 20, ssse3 )
- +MC_WEIGHT( 4, avx )
- +MC_WEIGHT( 8, avx )
- +MC_WEIGHT( 12, avx )
- +MC_WEIGHT( 16, avx )
- +MC_WEIGHT( 20, avx )
- #undef MC_OFFSET
- #undef MC_WEIGHT
- @@ -92,6 +97,9 @@ void x264_plane_copy_interleave_core_mmxext( pixel *dst, int i_dst,
- void x264_plane_copy_interleave_core_sse2( pixel *dst, int i_dst,
- pixel *srcu, int i_srcu,
- pixel *srcv, int i_srcv, int w, int h );
- +void x264_plane_copy_interleave_core_avx( pixel *dst, int i_dst,
- + pixel *srcu, int i_srcu,
- + pixel *srcv, int i_srcv, int w, int h );
- void x264_plane_copy_interleave_c( pixel *dst, int i_dst,
- pixel *srcu, int i_srcu,
- pixel *srcv, int i_srcv, int w, int h );
- @@ -104,20 +112,27 @@ void x264_plane_copy_deinterleave_sse2( pixel *dstu, int i_dstu,
- void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, int i_dstu,
- uint8_t *dstv, int i_dstv,
- uint8_t *src, int i_src, int w, int h );
- +void x264_plane_copy_deinterleave_avx( uint16_t *dstu, int i_dstu,
- + uint16_t *dstv, int i_dstv,
- + uint16_t *src, int i_src, int w, int h );
- void x264_store_interleave_8x8x2_mmxext( pixel *dst, int i_dst, pixel *srcu, pixel *srcv );
- void x264_store_interleave_8x8x2_sse2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv );
- +void x264_store_interleave_8x8x2_avx( pixel *dst, int i_dst, pixel *srcu, pixel *srcv );
- void x264_load_deinterleave_8x8x2_fenc_mmx( pixel *dst, pixel *src, int i_src );
- void x264_load_deinterleave_8x8x2_fenc_sse2( pixel *dst, pixel *src, int i_src );
- void x264_load_deinterleave_8x8x2_fenc_ssse3( uint8_t *dst, uint8_t *src, int i_src );
- +void x264_load_deinterleave_8x8x2_fenc_avx( uint16_t *dst, uint16_t *src, int i_src );
- void x264_load_deinterleave_8x8x2_fdec_mmx( pixel *dst, pixel *src, int i_src );
- void x264_load_deinterleave_8x8x2_fdec_sse2( pixel *dst, pixel *src, int i_src );
- void x264_load_deinterleave_8x8x2_fdec_ssse3( uint8_t *dst, uint8_t *src, int i_src );
- +void x264_load_deinterleave_8x8x2_fdec_avx( uint16_t *dst, uint16_t *src, int i_src );
- void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
- void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
- void x264_memzero_aligned_mmx( void * dst, int n );
- void x264_memzero_aligned_sse2( void * dst, int n );
- void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride );
- void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, int stride );
- +void x264_integral_init8h_avx ( uint16_t *sum, uint8_t *pix, int stride );
- void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride );
- void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int stride );
- void x264_integral_init8v_mmx( uint16_t *sum8, int stride );
- @@ -135,6 +150,8 @@ MC_CHROMA(sse2)
- MC_CHROMA(sse2_misalign)
- MC_CHROMA(ssse3)
- MC_CHROMA(ssse3_cache64)
- +MC_CHROMA(avx)
- +MC_CHROMA(avx_cache64)
- #define LOWRES(cpu)\
- void x264_frame_init_lowres_core_##cpu( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,\
- @@ -415,15 +432,17 @@ static void x264_hpel_filter_##cpu( pixel *dsth, pixel *dstv, pixel *dstc, pixel
- HPEL(8, mmxext, mmxext, mmxext, mmxext)
- #if HIGH_BIT_DEPTH
- -HPEL(16, sse2, sse2, sse2, sse2 )
- +HPEL(16, sse2, sse2, sse2, sse2)
- #else // !HIGH_BIT_DEPTH
- HPEL(16, sse2_amd, mmxext, mmxext, sse2)
- #if ARCH_X86_64
- void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
- void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
- +void x264_hpel_filter_avx( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
- #else
- HPEL(16, sse2, sse2, sse2, sse2)
- HPEL(16, ssse3, ssse3, ssse3, ssse3)
- +HPEL(16, avx, avx, avx, ssse3)
- #endif
- HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2)
- #endif // HIGH_BIT_DEPTH
- @@ -465,6 +484,9 @@ static void x264_plane_copy_interleave_##cpu( pixel *dst, int i_dst,\
- PLANE_INTERLEAVE(mmxext)
- PLANE_INTERLEAVE(sse2)
- +#if HIGH_BIT_DEPTH
- +PLANE_INTERLEAVE(avx)
- +#endif
- void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
- {
- @@ -570,6 +592,18 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
- if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) )
- pf->integral_init4v = x264_integral_init4v_ssse3;
- +
- + if( !(cpu&X264_CPU_AVX) )
- + return;
- +
- + pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_avx;
- + pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_avx;
- + pf->plane_copy_interleave = x264_plane_copy_interleave_avx;
- + pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx;
- + pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_avx;
- +
- + if( !(cpu&X264_CPU_STACK_MOD4) )
- + pf->mc_chroma = x264_mc_chroma_avx;
- #else // !HIGH_BIT_DEPTH
- pf->prefetch_fenc = x264_prefetch_fenc_mmxext;
- pf->prefetch_ref = x264_prefetch_ref_mmxext;
- @@ -685,5 +719,16 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
- pf->integral_init4h = x264_integral_init4h_sse4;
- pf->integral_init8h = x264_integral_init8h_sse4;
- +
- + if( !(cpu&X264_CPU_AVX) )
- + return;
- +
- + pf->integral_init8h = x264_integral_init8h_avx;
- + pf->hpel_filter = x264_hpel_filter_avx;
- + if( !(cpu&X264_CPU_STACK_MOD4) )
- + pf->mc_chroma = x264_mc_chroma_avx;
- +
- + if( cpu&X264_CPU_CACHELINE_64 && !(cpu&X264_CPU_STACK_MOD4) )
- + pf->mc_chroma = x264_mc_chroma_avx_cache64;
- #endif // HIGH_BIT_DEPTH
- }
- diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
- index 4858bf0..67d52c0 100644
- --- a/common/x86/pixel-a.asm
- +++ b/common/x86/pixel-a.asm
- @@ -231,12 +231,10 @@ SSD_ONE 16, 16, sse2
- psubusb m%3, m%7
- por m%1, m%2
- por m%3, m%4
- - mova m%2, m%1
- - mova m%4, m%3
- + punpcklbw m%2, m%1, m%5
- punpckhbw m%1, m%5
- - punpcklbw m%2, m%5
- + punpcklbw m%4, m%3, m%5
- punpckhbw m%3, m%5
- - punpcklbw m%4, m%5
- %endif
- pmaddwd m%1, m%1
- pmaddwd m%2, m%2
- @@ -263,12 +261,10 @@ SSD_ONE 16, 16, sse2
- %macro SSD_CORE_SSSE3 7-8
- %ifidn %8, FULL
- - mova m%6, m%1
- - mova m%7, m%3
- + punpckhbw m%6, m%1, m%2
- + punpckhbw m%7, m%3, m%4
- punpcklbw m%1, m%2
- punpcklbw m%3, m%4
- - punpckhbw m%6, m%2
- - punpckhbw m%7, m%4
- SWAP %6, %2, %3
- SWAP %7, %4
- %endif
- @@ -324,6 +320,8 @@ cglobal pixel_ssd_%1x%2_%3, 0,0,0
- %ifidn %3, ssse3
- mova m7, [hsub_mul]
- +%elifidn %3, avx
- + mova m7, [hsub_mul]
- %elifidn %3, sse2
- mova m7, [pw_00ff]
- %elif %1 >= mmsize
- @@ -376,6 +374,12 @@ SSD 8, 8, ssse3, 8
- SSD 16, 8, ssse3, 8
- SSD 8, 16, ssse3, 8
- SSD 8, 4, ssse3, 8
- +INIT_AVX
- +SSD 16, 16, avx, 8
- +SSD 8, 8, avx, 8
- +SSD 16, 8, avx, 8
- +SSD 8, 16, avx, 8
- +SSD 8, 4, avx, 8
- INIT_MMX
- SSD 4, 4, ssse3
- SSD 4, 8, ssse3
- @@ -432,12 +436,10 @@ cglobal pixel_ssd_nv12_core_%1, 6,7,7*(mmsize/16)
- jl .loopx
- %if mmsize==16 ; using HADDD would remove the mmsize/32 part from the
- ; equation above, putting the width limit at 8208
- - mova m0, m2
- - mova m1, m3
- + punpckhdq m0, m2, m6
- + punpckhdq m1, m3, m6
- punpckldq m2, m6
- punpckldq m3, m6
- - punpckhdq m0, m6
- - punpckhdq m1, m6
- paddq m3, m2
- paddq m1, m0
- paddq m4, m3
- @@ -506,14 +508,13 @@ cglobal pixel_ssd_nv12_core_%1, 6,7
- psubusb m0, m1
- psubusb m1, [r0+r6]
- por m0, m1
- - mova m2, m0
- + psrlw m2, m0, 8
- + add r6, mmsize
- pand m0, m5
- - psrlw m2, 8
- - pmaddwd m0, m0
- pmaddwd m2, m2
- + pmaddwd m0, m0
- paddd m3, m0
- paddd m4, m2
- - add r6, mmsize
- jl .loopx
- add r0, r1
- add r2, r3
- @@ -530,12 +531,14 @@ cglobal pixel_ssd_nv12_core_%1, 6,7
- movq [r4], m4
- RET
- %endmacro ; SSD_NV12
- -%endif ; !X264_HIGHT_BIT_DEPTH
- +%endif ; !HIGH_BIT_DEPTH
- INIT_MMX
- SSD_NV12 mmxext
- INIT_XMM
- SSD_NV12 sse2
- +INIT_AVX
- +SSD_NV12 avx
- ;=============================================================================
- ; variance
- @@ -598,11 +601,10 @@ SSD_NV12 sse2
- mova m4, [r0+%1+mmsize]
- %else ; !HIGH_BIT_DEPTH
- mova m0, [r0]
- - mova m1, m0
- + punpckhbw m1, m0, m7
- mova m3, [r0+%1]
- mova m4, m3
- punpcklbw m0, m7
- - punpckhbw m1, m7
- %endif ; HIGH_BIT_DEPTH
- %ifidn %1, r1
- lea r0, [r0+%1*2]
- @@ -634,15 +636,15 @@ cglobal pixel_var_8x8_mmxext, 2,3
- VAR_2ROW r1, 4
- VAR_END 8, 8
- -INIT_XMM
- %ifdef HIGH_BIT_DEPTH
- -cglobal pixel_var_16x16_sse2, 2,3,8
- +%macro VAR 1
- +cglobal pixel_var_16x16_%1, 2,3,8
- FIX_STRIDES r1
- VAR_START 0
- VAR_2ROW r1, 8
- VAR_END 16, 16
- -cglobal pixel_var_8x8_sse2, 2,3,8
- +cglobal pixel_var_8x8_%1, 2,3,8
- lea r2, [r1*3]
- VAR_START 0
- mova m0, [r0]
- @@ -657,10 +659,17 @@ cglobal pixel_var_8x8_sse2, 2,3,8
- mova m4, [r0+r2*2]
- VAR_CORE
- VAR_END 8, 8
- +%endmacro ; VAR
- +
- +INIT_XMM
- +VAR sse2
- +INIT_AVX
- +VAR avx
- %endif ; HIGH_BIT_DEPTH
- %ifndef HIGH_BIT_DEPTH
- -cglobal pixel_var_16x16_sse2, 2,3,8
- +%macro VAR 1
- +cglobal pixel_var_16x16_%1, 2,3,8
- VAR_START 1
- mov r2d, 8
- .loop:
- @@ -673,7 +682,7 @@ cglobal pixel_var_16x16_sse2, 2,3,8
- jg .loop
- VAR_END 16, 16
- -cglobal pixel_var_8x8_sse2, 2,4,8
- +cglobal pixel_var_8x8_%1, 2,4,8
- VAR_START 1
- mov r2d, 2
- lea r3, [r1*3]
- @@ -688,6 +697,12 @@ cglobal pixel_var_8x8_sse2, 2,4,8
- dec r2d
- jg .loop
- VAR_END 8, 8
- +%endmacro ; VAR
- +
- +INIT_XMM
- +VAR sse2
- +INIT_AVX
- +VAR avx
- %endif ; !HIGH_BIT_DEPTH
- %macro VAR2_END 0
- @@ -1144,7 +1159,6 @@ cglobal pixel_satd_4x4_mmxext, 4,6
- ; int pixel_satd_8x4( uint8_t *, int, uint8_t *, int )
- ;-----------------------------------------------------------------------------
- %macro SATDS_SSE2 1
- -INIT_XMM
- %ifnidn %1, sse2
- cglobal pixel_satd_4x4_%1, 4, 6, 6
- SATD_START_MMX
- @@ -1520,7 +1534,6 @@ cglobal pixel_sa8d_16x16_%1, 4,7
- %macro INTRA_SA8D_SSE2 1
- %ifdef ARCH_X86_64
- -INIT_XMM
- ;-----------------------------------------------------------------------------
- ; void intra_sa8d_x3_8x8_core( uint8_t *fenc, int16_t edges[2][8], int *res )
- ;-----------------------------------------------------------------------------
- @@ -1576,7 +1589,6 @@ cglobal intra_sa8d_x3_8x8_core_%1, 3,3,16
- paddusw m8, m9
- paddusw m15, m10
- paddusw m15, m8
- - movdqa m14, m15 ; 7x8 sum
- movdqa m8, [r1+0] ; left edge
- movd m9, r0d
- @@ -1585,7 +1597,7 @@ cglobal intra_sa8d_x3_8x8_core_%1, 3,3,16
- psubw m9, m0
- ABS1 m8, m10
- ABS1 m9, m11 ; 1x8 sum
- - paddusw m14, m8
- + paddusw m14, m15, m8
- paddusw m15, m9
- punpcklwd m0, m1
- punpcklwd m2, m3
- @@ -1595,9 +1607,8 @@ cglobal intra_sa8d_x3_8x8_core_%1, 3,3,16
- punpckldq m4, m6
- punpcklqdq m0, m4 ; transpose
- movdqa m1, [r1+16] ; top edge
- - movdqa m2, m15
- psllw m1, 3
- - psrldq m2, 2 ; 8x7 sum
- + psrldq m2, m15, 2 ; 8x7 sum
- psubw m0, m1 ; 8x1 sum
- ABS1 m0, m1
- paddusw m2, m0
- @@ -1607,15 +1618,13 @@ cglobal intra_sa8d_x3_8x8_core_%1, 3,3,16
- pmaddwd m2, m7
- pmaddwd m14, m7
- pmaddwd m15, m7
- - movdqa m3, m2
- + punpckhdq m3, m2, m14
- punpckldq m2, m14
- - punpckhdq m3, m14
- pshufd m5, m15, 0xf5
- paddd m2, m3
- paddd m5, m15
- - movdqa m3, m2
- + punpckhqdq m3, m2, m5
- punpcklqdq m2, m5
- - punpckhqdq m3, m5
- pavgw m3, m2
- pxor m0, m0
- pavgw m3, m0
- @@ -1757,7 +1766,6 @@ cglobal hadamard_load
- %endmacro
- %macro INTRA_SATDS_MMX 1
- -INIT_MMX
- ;-----------------------------------------------------------------------------
- ; void intra_satd_x3_4x4( uint8_t *fenc, uint8_t *fdec, int *res )
- ;-----------------------------------------------------------------------------
- @@ -2241,7 +2249,6 @@ HADAMARD_AC_WXH_MMX 8, 8
- %endmacro
- %macro HADAMARD_AC_SSE2 1
- -INIT_XMM
- ; in: r0=pix, r1=stride, r2=stride*3
- ; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4
- cglobal hadamard_ac_8x8_%1
- @@ -2455,8 +2462,10 @@ SA8D sse2
- SATDS_SSE2 sse2
- INTRA_SA8D_SSE2 sse2
- %ifndef HIGH_BIT_DEPTH
- +INIT_MMX
- INTRA_SATDS_MMX mmxext
- %endif
- +INIT_XMM
- HADAMARD_AC_SSE2 sse2
- %define ABS1 ABS1_SSSE3
- @@ -2470,6 +2479,7 @@ HADAMARD_AC_SSE2 sse2
- %define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3
- %define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3
- %endif
- +INIT_XMM
- SATDS_SSE2 ssse3
- SA8D ssse3
- HADAMARD_AC_SSE2 ssse3
- @@ -2477,14 +2487,21 @@ HADAMARD_AC_SSE2 ssse3
- %undef movdqu ; movups
- %undef punpcklqdq ; or movlhps
- INTRA_SA8D_SSE2 ssse3
- +INIT_MMX
- INTRA_SATDS_MMX ssse3
- %define TRANS TRANS_SSE4
- %define JDUP JDUP_PENRYN
- %define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN
- +INIT_XMM
- SATDS_SSE2 sse4
- SA8D sse4
- HADAMARD_AC_SSE2 sse4
- +INIT_AVX
- +SATDS_SSE2 avx
- +SA8D avx
- +INTRA_SA8D_SSE2 avx
- +HADAMARD_AC_SSE2 avx
- ;=============================================================================
- ; SSIM
- @@ -2494,7 +2511,7 @@ HADAMARD_AC_SSE2 sse4
- ; void pixel_ssim_4x4x2_core( const uint8_t *pix1, int stride1,
- ; const uint8_t *pix2, int stride2, int sums[2][4] )
- ;-----------------------------------------------------------------------------
- -
- +INIT_XMM
- %macro SSIM_ITER 1
- %ifdef HIGH_BIT_DEPTH
- movdqu m5, [r0+(%1&1)*r1]
- @@ -2516,9 +2533,8 @@ HADAMARD_AC_SSE2 sse4
- paddw m1, m5
- paddw m2, m6
- %endif
- - movdqa m7, m5
- + pmaddwd m7, m5, m6
- pmaddwd m5, m5
- - pmaddwd m7, m6
- pmaddwd m6, m6
- %if %1==0
- SWAP m3, m5
- @@ -2549,9 +2565,8 @@ cglobal pixel_ssim_4x4x2_core_sse2, 4,4,8
- pshufd m1, m1, 0xd8
- paddd m4, m6
- pmaddwd m1, m7
- - movdqa m5, m3
- + punpckhdq m5, m3, m4
- punpckldq m3, m4
- - punpckhdq m5, m4
- %ifdef UNIX64
- %define t0 r4
- @@ -2569,7 +2584,8 @@ cglobal pixel_ssim_4x4x2_core_sse2, 4,4,8
- ;-----------------------------------------------------------------------------
- ; float pixel_ssim_end( int sum0[5][4], int sum1[5][4], int width )
- ;-----------------------------------------------------------------------------
- -cglobal pixel_ssim_end4_sse2, 3,3,7
- +%macro SSIM_END4 1
- +cglobal pixel_ssim_end4_%1, 3,3,7
- movdqa m0, [r0+ 0]
- movdqa m1, [r0+16]
- movdqa m2, [r0+32]
- @@ -2609,9 +2625,8 @@ cglobal pixel_ssim_end4_sse2, 3,3,7
- addps m2, m6 ; vars + ssim_c2
- addps m3, m6 ; covar*2 + ssim_c2
- %else
- - movdqa m4, m1
- + pmaddwd m4, m1, m0 ; s1*s2
- pslld m1, 16
- - pmaddwd m4, m0 ; s1*s2
- por m0, m1
- pmaddwd m0, m0 ; s1*s1 + s2*s2
- pslld m4, 1
- @@ -2652,8 +2667,12 @@ cglobal pixel_ssim_end4_sse2, 3,3,7
- fld dword r0m
- %endif
- RET
- +%endmacro ; SSIM_END4
- -
- +INIT_XMM
- +SSIM_END4 sse2
- +INIT_AVX
- +SSIM_END4 avx
- ;=============================================================================
- ; Successive Elimination ADS
- @@ -2776,27 +2795,22 @@ cglobal pixel_ads4_%1, 6,7,12
- movdqu xmm10, [r1]
- movdqu xmm11, [r1+r2]
- .loop:
- - movdqa xmm0, xmm10
- - movdqu xmm1, [r1+16]
- - movdqa xmm10, xmm1
- - psubw xmm0, xmm7
- - psubw xmm1, xmm6
- + psubw xmm0, xmm10, xmm7
- + movdqu xmm10, [r1+16]
- + psubw xmm1, xmm10, xmm6
- ABS1 xmm0, xmm2
- ABS1 xmm1, xmm3
- - movdqa xmm2, xmm11
- - movdqu xmm3, [r1+r2+16]
- - movdqa xmm11, xmm3
- - psubw xmm2, xmm5
- - psubw xmm3, xmm4
- + psubw xmm2, xmm11, xmm5
- + movdqu xmm11, [r1+r2+16]
- paddw xmm0, xmm1
- + psubw xmm3, xmm11, xmm4
- movdqu xmm9, [r3]
- ABS1 xmm2, xmm1
- ABS1 xmm3, xmm1
- paddw xmm0, xmm2
- paddw xmm0, xmm3
- paddusw xmm0, xmm9
- - movdqa xmm1, xmm8
- - psubusw xmm1, xmm0
- + psubusw xmm1, xmm8, xmm0
- packsswb xmm1, xmm1
- movq [r6], xmm1
- %else
- @@ -2848,8 +2862,7 @@ cglobal pixel_ads2_%1, 6,7,8
- ABS1 xmm1, xmm3
- paddw xmm0, xmm1
- paddusw xmm0, xmm4
- - movdqa xmm1, xmm5
- - psubusw xmm1, xmm0
- + psubusw xmm1, xmm5, xmm0
- packsswb xmm1, xmm1
- movq [r6], xmm1
- ADS_END 2
- @@ -2873,18 +2886,19 @@ cglobal pixel_ads1_%1, 6,7,8
- ABS1 xmm1, xmm5
- paddusw xmm0, xmm2
- paddusw xmm1, xmm3
- - movdqa xmm4, xmm6
- - movdqa xmm5, xmm6
- - psubusw xmm4, xmm0
- - psubusw xmm5, xmm1
- + psubusw xmm4, xmm6, xmm0
- + psubusw xmm5, xmm6, xmm1
- packsswb xmm4, xmm5
- movdqa [r6], xmm4
- ADS_END 4
- %endmacro
- +INIT_XMM
- ADS_SSE2 sse2
- %define ABS1 ABS1_SSSE3
- ADS_SSE2 ssse3
- +INIT_AVX
- +ADS_SSE2 avx
- ; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
- ; {
- diff --git a/common/x86/pixel.h b/common/x86/pixel.h
- index f0901e9..8a7dc74 100644
- --- a/common/x86/pixel.h
- +++ b/common/x86/pixel.h
- @@ -60,14 +60,17 @@ DECL_X1( ssd, mmxext )
- DECL_X1( ssd, sse2slow )
- DECL_X1( ssd, sse2 )
- DECL_X1( ssd, ssse3 )
- +DECL_X1( ssd, avx )
- DECL_X1( satd, mmxext )
- DECL_X1( satd, sse2 )
- DECL_X1( satd, ssse3 )
- DECL_X1( satd, sse4 )
- +DECL_X1( satd, avx )
- DECL_X1( sa8d, mmxext )
- DECL_X1( sa8d, sse2 )
- DECL_X1( sa8d, ssse3 )
- -DECL_X1( sa8d, sse4)
- +DECL_X1( sa8d, sse4 )
- +DECL_X1( sa8d, avx )
- DECL_X1( sad, cache32_mmxext );
- DECL_X1( sad, cache64_mmxext );
- DECL_X1( sad, cache64_sse2 );
- @@ -79,10 +82,12 @@ DECL_X4( sad, cache64_ssse3 );
- DECL_PIXELS( uint64_t, var, mmxext, ( pixel *pix, int i_stride ))
- DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, int i_stride ))
- +DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, int i_stride ))
- DECL_PIXELS( uint64_t, hadamard_ac, mmxext, ( pixel *pix, int i_stride ))
- DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( pixel *pix, int i_stride ))
- DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, int i_stride ))
- DECL_PIXELS( uint64_t, hadamard_ac, sse4, ( pixel *pix, int i_stride ))
- +DECL_PIXELS( uint64_t, hadamard_ac, avx, ( pixel *pix, int i_stride ))
- void x264_intra_satd_x3_4x4_mmxext ( pixel *, pixel *, int * );
- @@ -102,12 +107,14 @@ void x264_intra_sad_x3_16x16_ssse3 ( pixel *, pixel *, int * );
- void x264_intra_sa8d_x3_8x8_mmxext ( uint8_t *, uint8_t *, int * );
- void x264_intra_sa8d_x3_8x8_sse2 ( pixel *, pixel *, int * );
- void x264_intra_sa8d_x3_8x8_ssse3 ( uint8_t *, uint8_t *, int * );
- +void x264_intra_sa8d_x3_8x8_avx ( uint8_t *, uint8_t *, int * );
- void x264_intra_sad_x3_8x8_mmxext ( pixel *, pixel *, int * );
- void x264_intra_sad_x3_8x8_sse2 ( pixel *, pixel *, int * );
- void x264_intra_sad_x3_8x8_ssse3 ( pixel *, pixel *, int * );
- void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *, int16_t [2][8], int * );
- void x264_intra_sa8d_x3_8x8_core_sse2 ( uint8_t *, int16_t [2][8], int * );
- void x264_intra_sa8d_x3_8x8_core_ssse3 ( uint8_t *, int16_t [2][8], int * );
- +void x264_intra_sa8d_x3_8x8_core_avx ( uint8_t *, int16_t [2][8], int * );
- void x264_pixel_ssd_nv12_core_mmxext( pixel *pixuv1, int stride1,
- pixel *pixuv2, int stride2, int width,
- @@ -115,11 +122,15 @@ void x264_pixel_ssd_nv12_core_mmxext( pixel *pixuv1, int stride1,
- void x264_pixel_ssd_nv12_core_sse2( pixel *pixuv1, int stride1,
- pixel *pixuv2, int stride2, int width,
- int height, uint64_t *ssd_u, uint64_t *ssd_v );
- +void x264_pixel_ssd_nv12_core_avx( pixel *pixuv1, int stride1,
- + pixel *pixuv2, int stride2, int width,
- + int height, uint64_t *ssd_u, uint64_t *ssd_v );
- void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
- const uint8_t *pix2, int stride2, int sums[2][4] );
- void x264_pixel_ssim_4x4x2_core_sse2( const pixel *pix1, int stride1,
- const pixel *pix2, int stride2, int sums[2][4] );
- float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width );
- +float x264_pixel_ssim_end4_avx( int sum0[5][4], int sum1[5][4], int width );
- int x264_pixel_var2_8x8_mmxext( pixel *, int, pixel *, int, int * );
- int x264_pixel_var2_8x8_sse2( pixel *, int, pixel *, int, int * );
- int x264_pixel_var2_8x8_ssse3( uint8_t *, int, uint8_t *, int, int * );
- @@ -136,6 +147,9 @@ DECL_ADS( 1, sse2 )
- DECL_ADS( 4, ssse3 )
- DECL_ADS( 2, ssse3 )
- DECL_ADS( 1, ssse3 )
- +DECL_ADS( 4, avx )
- +DECL_ADS( 2, avx )
- +DECL_ADS( 1, avx )
- #undef DECL_PIXELS
- #undef DECL_X1
- diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
- index aefc638..669873e 100644
- --- a/common/x86/predict-a.asm
- +++ b/common/x86/predict-a.asm
- @@ -134,18 +134,16 @@ cextern pw_pixel_max
- ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
- %macro PRED8x8_LOWPASS 5-6
- %ifidn %1, w
- - mova %2, %5
- paddw %3, %4
- psrlw %3, 1
- - pavgw %2, %3
- + pavgw %2, %5, %3
- %else
- mova %6, %3
- pavgb %3, %4
- pxor %4, %6
- - mova %2, %5
- pand %4, [pb_1]
- psubusb %3, %4
- - pavgb %2, %3
- + pavgb %2, %5, %3
- %endif
- %endmacro
- @@ -170,14 +168,13 @@ cextern pw_pixel_max
- %macro PREDICT_4x4_DDL 4
- cglobal predict_4x4_ddl_%1, 1,1
- movu m1, [r0-FDEC_STRIDEB]
- - mova m2, m1
- + psll%2 m2, m1, %3
- mova m3, m1
- mova m4, m1
- - psll%2 m1, %3
- - pxor m2, m1
- - psrl%2 m2, %3
- - pxor m3, m2
- - PRED8x8_LOWPASS %4, m0, m1, m3, m4, m5
- + pxor m1, m2
- + psrl%2 m1, %3
- + pxor m3, m1
- + PRED8x8_LOWPASS %4, m0, m2, m3, m4, m5
- %assign Y 0
- %rep 4
- @@ -192,6 +189,8 @@ cglobal predict_4x4_ddl_%1, 1,1
- %ifdef HIGH_BIT_DEPTH
- INIT_XMM
- PREDICT_4x4_DDL sse2, dq, 2, w
- +INIT_AVX
- +PREDICT_4x4_DDL avx , dq, 2, w
- INIT_MMX
- %define PALIGNR PALIGNR_MMX
- cglobal predict_4x4_ddl_mmxext, 1,2
- @@ -284,9 +283,8 @@ cglobal predict_4x4_vr_%1, 1,1,6*(mmsize/16)
- PALIGNR m0, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m3 ; t3t2t1t0ltl0l1l2
- %endif
- PRED8x8_LOWPASS %5, m3, m1, m0, m2, m4
- - mova m1, m3
- + psll%4 m1, m3, %7*6
- psrl%4 m3, %7*2
- - psll%4 m1, %7*6
- movh [r0+0*FDEC_STRIDEB], m5
- movh [r0+1*FDEC_STRIDEB], m3
- PALIGNR m5, m1, 7*SIZEOF_PIXEL, m2
- @@ -318,12 +316,9 @@ cglobal predict_4x4_hd_%1, 1,1,6*(mmsize/16)
- %endif
- punpckh%3 m1, m2 ; l0 l1 l2 l3
- punpckh%6 m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
- - mova m0, m1
- - mova m2, m1
- - mova m5, m1
- - psrl%4 m0, %7*2 ; .. .. t2 t1 t0 lt l0 l1
- - psrl%4 m2, %7 ; .. t2 t1 t0 lt l0 l1 l2
- - pavg%5 m5, m2
- + psrl%4 m2, m1, %7 ; .. t2 t1 t0 lt l0 l1 l2
- + psrl%4 m0, m1, %7*2 ; .. .. t2 t1 t0 lt l0 l1
- + pavg%5 m5, m1, m2
- PRED8x8_LOWPASS %5, m3, m1, m0, m2, m4
- punpckl%2 m5, m3
- psrl%4 m3, %7*4
- @@ -419,6 +414,8 @@ INIT_XMM
- PREDICT_4x4 sse2 , wd, dq, dq, w, qdq, 2
- %define PALIGNR PALIGNR_SSSE3
- PREDICT_4x4 ssse3 , wd, dq, dq, w, qdq, 2
- +INIT_AVX
- +PREDICT_4x4 avx , wd, dq, dq, w, qdq, 2
- %else
- INIT_MMX
- %define PALIGNR PALIGNR_MMX
- @@ -496,12 +493,9 @@ cglobal predict_4x4_hu_mmxext, 1,1
- %macro PREDICT_4x4_V1 4
- cglobal predict_4x4_vl_%1, 1,1,6*(mmsize/16)
- movu m1, [r0-FDEC_STRIDEB]
- - mova m3, m1
- - mova m2, m1
- - psrl%2 m3, %3
- - psrl%2 m2, %3*2
- - mova m4, m3
- - pavg%4 m4, m1
- + psrl%2 m3, m1, %3
- + psrl%2 m2, m1, %3*2
- + pavg%4 m4, m3, m1
- PRED8x8_LOWPASS %4, m0, m1, m2, m3, m5
- movh [r0+0*FDEC_STRIDEB], m4
- @@ -516,6 +510,8 @@ cglobal predict_4x4_vl_%1, 1,1,6*(mmsize/16)
- %ifdef HIGH_BIT_DEPTH
- INIT_XMM
- PREDICT_4x4_V1 sse2, dq, 2, w
- +INIT_AVX
- +PREDICT_4x4_V1 avx , dq, 2, w
- INIT_MMX
- %define PALIGNR PALIGNR_MMX
- @@ -664,10 +660,9 @@ cglobal predict_8x8_filter_%1, 4,5,7*(mmsize/16)
- test r2b, 0x04
- je .fix_tr_2
- mova m0, [src-1*FDEC_STRIDEB+8*SIZEOF_PIXEL]
- - mova m5, m0
- mova m2, m0
- mova m4, m0
- - psrl%5 m5, 7*%6
- + psrl%5 m5, m0, 7*%6
- PALIGNR m2, m3, 7*SIZEOF_PIXEL, m3
- PALIGNR m5, m4, 1*SIZEOF_PIXEL, m4
- PRED8x8_LOWPASS %2, m1, m2, m5, m0, m4
- @@ -683,23 +678,20 @@ cglobal predict_8x8_filter_%1, 4,5,7*(mmsize/16)
- .done:
- REP_RET
- .fix_lt_1:
- - mova m5, m3
- - pxor m5, m4
- + pxor m5, m3, m4
- psrl%5 m5, 7*%6
- psll%5 m5, 6*%6
- pxor m1, m5
- jmp .do_left
- .fix_lt_2:
- - mova m5, m3
- - pxor m5, m2
- + pxor m5, m3, m2
- psll%5 m5, 7*%6
- psrl%5 m5, 7*%6
- pxor m2, m5
- test r2b, 0x04
- jne .do_top
- .fix_tr_1:
- - mova m5, m3
- - pxor m5, m1
- + pxor m5, m3, m1
- psrl%5 m5, 7*%6
- psll%5 m5, 7*%6
- pxor m1, m5
- @@ -712,6 +704,8 @@ INIT_XMM
- PREDICT_FILTER sse2 , w, d, q, dq, 2
- %define PALIGNR PALIGNR_SSSE3
- PREDICT_FILTER ssse3 , w, d, q, dq, 2
- +INIT_AVX
- +PREDICT_FILTER avx , w, d, q, dq, 2
- %else
- INIT_MMX
- %define PALIGNR PALIGNR_MMX
- @@ -745,9 +739,8 @@ PREDICT_8x8_V mmxext
- cglobal predict_8x8_h_%1, 2,2
- movu m1, [r1+7*SIZEOF_PIXEL]
- add r0, 4*FDEC_STRIDEB
- - mova m2, m1
- + punpckl%2 m2, m1, m1
- punpckh%2 m1, m1
- - punpckl%2 m2, m2
- %assign n 0
- %rep 8
- %assign i 1+n/4
- @@ -844,17 +837,21 @@ cglobal predict_8x8_ddl_%1, 2,2,8*(mmsize/16)
- movu m2, [r1+17*SIZEOF_PIXEL]
- movu m3, [r1+23*SIZEOF_PIXEL]
- movu m4, [r1+25*SIZEOF_PIXEL]
- - mova m1, m5
- - psll%3 m1, %4
- + psll%3 m1, m5, %4
- add r0, FDEC_STRIDEB*4
- PRED8x8_LOWPASS %2, m0, m1, m2, m5, m7
- +%if avx_enabled == 1
- + INIT_XMM ; NOTE(review): presumably expands the next call in 2-operand SSE form because its 5th argument is a memory operand ([r1+24*SIZEOF_PIXEL]) that the macro's 3-operand pavg would use as first source - confirm
- PRED8x8_LOWPASS %2, m1, m3, m4, [r1+24*SIZEOF_PIXEL], m6
- + INIT_AVX ; switch back to the AVX setup for the remainder of the function
- +%else
- + PRED8x8_LOWPASS %2, m1, m3, m4, [r1+24*SIZEOF_PIXEL], m6
- +%endif
- %assign Y 3
- %rep 6
- mova [r0+Y*FDEC_STRIDEB], m1
- - mova m2, m0
- psll%3 m1, %4
- - psrl%3 m2, 7*%4
- + psrl%3 m2, m0, 7*%4
- psll%3 m0, %4
- por m1, m2
- %assign Y (Y-1)
- @@ -870,6 +867,7 @@ cglobal predict_8x8_ddl_%1, 2,2,8*(mmsize/16)
- ;-----------------------------------------------------------------------------
- ; void predict_8x8_ddr( pixel *src, pixel *edge )
- ;-----------------------------------------------------------------------------
- +%if avx_enabled == 0
- cglobal predict_8x8_ddr_%1, 2,2,7*(mmsize/16)
- movu m1, [r1+ 7*SIZEOF_PIXEL]
- movu m2, [r1+ 9*SIZEOF_PIXEL]
- @@ -881,9 +879,8 @@ cglobal predict_8x8_ddr_%1, 2,2,7*(mmsize/16)
- %assign Y 3
- %rep 6
- mova [r0+Y*FDEC_STRIDEB], m0
- - mova m2, m1
- psrl%3 m0, %4
- - psll%3 m2, 7*%4
- + psll%3 m2, m1, 7*%4
- psrl%3 m1, %4
- por m0, m2
- %assign Y (Y-1)
- @@ -895,11 +892,14 @@ cglobal predict_8x8_ddr_%1, 2,2,7*(mmsize/16)
- %assign Y (Y-1)
- mova [r0+Y*FDEC_STRIDEB], m0
- RET
- +%endif ; avx_enabled == 0 (predict_8x8_ddr is not emitted for the AVX instantiation of this macro)
- %endmacro ; PREDICT_8x8
- %ifdef HIGH_BIT_DEPTH
- INIT_XMM
- PREDICT_8x8 sse2 , w, dq, 2
- +INIT_AVX
- +PREDICT_8x8 avx , w, dq, 2
- %elifndef ARCH_X86_64
- INIT_MMX
- PREDICT_8x8 mmxext, b, q , 8
- @@ -918,19 +918,17 @@ cglobal predict_8x8_hu_%1, 2,2,8*(mmsize/16)
- psll%4 m0, 8*SIZEOF_PIXEL
- psrl%4 m2, 8*SIZEOF_PIXEL
- por m2, m0 ; l7 l6 l5 l4 l3 l2 l1 l0
- - mova m3, m2
- mova m4, m2
- mova m5, m2
- + psrl%3 m3, m2, 2*%6
- psrl%3 m2, %6
- - psrl%3 m3, 2*%6
- por m2, m1 ; l7 l7 l6 l5 l4 l3 l2 l1
- punpckh%5 m1, m1
- por m3, m1 ; l7 l7 l7 l6 l5 l4 l3 l2
- pavg%2 m4, m2
- PRED8x8_LOWPASS %2, m1, m3, m5, m2, m6
- - mova m5, m4
- + punpckh%5 m5, m4, m1 ; p8 p7 p6 p5
- punpckl%5 m4, m1 ; p4 p3 p2 p1
- - punpckh%5 m5, m1 ; p8 p7 p6 p5
- mova m6, m5
- mova m7, m5
- mova m0, m5
- @@ -957,6 +955,8 @@ INIT_XMM
- PREDICT_8x8_HU sse2 , w, dq, d, wd, 2
- %define PALIGNR PALIGNR_SSSE3
- PREDICT_8x8_HU ssse3 , w, dq, d, wd, 2
- +INIT_AVX
- +PREDICT_8x8_HU avx , w, dq, d, wd, 2
- %elifndef ARCH_X86_64
- INIT_MMX
- %define PALIGNR PALIGNR_MMX
- @@ -971,14 +971,13 @@ cglobal predict_8x8_vr_%1, 2,3,7*(mmsize/16)
- mova m2, [r1+16*SIZEOF_PIXEL]
- movu m3, [r1+15*SIZEOF_PIXEL]
- movu m1, [r1+14*SIZEOF_PIXEL]
- - mova m4, m3
- - pavg%2 m3, m2
- + pavg%2 m4, m3, m2
- add r0, FDEC_STRIDEB*4
- - PRED8x8_LOWPASS %2, m0, m1, m2, m4, m5
- - mova [r0-4*FDEC_STRIDEB], m3
- + PRED8x8_LOWPASS %2, m0, m1, m2, m3, m5
- + mova [r0-4*FDEC_STRIDEB], m4
- mova [r0-3*FDEC_STRIDEB], m0
- mova m5, m0
- - mova m6, m3
- + mova m6, m4
- mova m1, [r1+8*SIZEOF_PIXEL]
- mova m2, m1
- psll%3 m2, %4
- @@ -1005,6 +1004,8 @@ INIT_XMM
- PREDICT_8x8_VR sse2 , w, dq, 2
- %define PALIGNR PALIGNR_SSSE3
- PREDICT_8x8_VR ssse3 , w, dq, 2
- +INIT_AVX
- +PREDICT_8x8_VR avx , w, dq, 2
- %else
- INIT_MMX
- %define PALIGNR PALIGNR_MMX
- @@ -1042,8 +1043,8 @@ ALIGN 4
- REP_RET
- %endif ; !ARCH_X86_64
- -INIT_XMM
- -cglobal predict_8x8c_p_core_sse2, 1,1
- +%macro PREDICT_8x8C_P 1
- +cglobal predict_8x8c_p_core_%1, 1,1
- movd m0, r1m
- movd m2, r2m
- movd m4, r3m
- @@ -1058,8 +1059,7 @@ cglobal predict_8x8c_p_core_sse2, 1,1
- %ifdef HIGH_BIT_DEPTH
- mov r1d, 8
- .loop:
- - mova m5, m0
- - paddsw m5, m2
- + paddsw m5, m0, m2
- psraw m5, 5
- CLIPW m5, m1, m3
- mova [r0], m5
- @@ -1069,32 +1069,31 @@ cglobal predict_8x8c_p_core_sse2, 1,1
- jg .loop
- %else ;!HIGH_BIT_DEPTH
- paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
- - mova m3, m0
- - paddsw m3, m4
- + paddsw m3, m0, m4
- paddsw m4, m4
- call .loop
- add r0, FDEC_STRIDE*4
- .loop:
- - mova m5, m0
- - mova m1, m3
- - psraw m0, 5
- + paddsw m1, m3, m4
- + paddsw m5, m0, m4
- psraw m3, 5
- + psraw m0, 5
- packuswb m0, m3
- movq [r0+FDEC_STRIDE*0], m0
- movhps [r0+FDEC_STRIDE*1], m0
- - paddsw m5, m4
- - paddsw m1, m4
- - mova m0, m5
- - mova m3, m1
- + paddsw m0, m5, m4
- + paddsw m3, m1, m4
- psraw m5, 5
- psraw m1, 5
- packuswb m5, m1
- movq [r0+FDEC_STRIDE*2], m5
- movhps [r0+FDEC_STRIDE*3], m5
- - paddsw m0, m4
- - paddsw m3, m4
- %endif ;!HIGH_BIT_DEPTH
- RET
- +%endmacro ; PREDICT_8x8C_P
- +
- +INIT_XMM
- +PREDICT_8x8C_P sse2
- ;-----------------------------------------------------------------------------
- ; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
- @@ -1140,8 +1139,8 @@ ALIGN 4
- REP_RET
- %endif ; !ARCH_X86_64
- -INIT_XMM
- -cglobal predict_16x16_p_core_sse2, 1,2,8
- +%macro PREDICT_16x16_P 1
- +cglobal predict_16x16_p_core_%1, 1,2,8
- movd m0, r1m
- movd m1, r2m
- movd m2, r3m
- @@ -1152,19 +1151,15 @@ cglobal predict_16x16_p_core_sse2, 1,2,8
- SPLATW m0, m0, 0
- SPLATW m1, m1, 0
- SPLATW m2, m2, 0
- - mova m3, m1
- - pmullw m3, [pw_76543210]
- + pmullw m3, m1, [pw_76543210]
- psllw m1, 3
- %ifdef HIGH_BIT_DEPTH
- mov r1d, 16
- .loop:
- - mova m4, m0
- - mova m5, m0
- - mova m7, m3
- - paddsw m7, m6
- - paddsw m4, m7
- + paddsw m7, m3, m6
- + paddsw m4, m0, m7
- paddsw m7, m1
- - paddsw m5, m7
- + paddsw m5, m0, m7
- psraw m4, 5
- psraw m5, 5
- CLIPW m4, [pb_0], [pw_pixel_max]
- @@ -1178,19 +1173,16 @@ cglobal predict_16x16_p_core_sse2, 1,2,8
- %else ;!HIGH_BIT_DEPTH
- paddsw m0, m3 ; m0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
- paddsw m1, m0 ; m1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
- - mova m7, m2
- - paddsw m7, m7
- + paddsw m7, m2, m2 ; m7 = m2+m2, the same value the removed mova/paddsw pair produced (m7 is not a valid source here)
- mov r1d, 8
- ALIGN 4
- .loop:
- mova m3, m0
- mova m4, m1
- - mova m5, m0
- - mova m6, m1
- psraw m3, 5
- psraw m4, 5
- - paddsw m5, m2
- - paddsw m6, m2
- + paddsw m5, m0, m2
- + paddsw m6, m1, m2
- psraw m5, 5
- psraw m6, 5
- packuswb m3, m4
- @@ -1204,17 +1196,22 @@ ALIGN 4
- jg .loop
- %endif ;!HIGH_BIT_DEPTH
- REP_RET
- +%endmacro ; PREDICT_16x16_P
- -%ifndef HIGH_BIT_DEPTH
- INIT_XMM
- +PREDICT_16x16_P sse2
- +INIT_AVX
- +PREDICT_16x16_P avx
- +
- +%ifndef HIGH_BIT_DEPTH
- +%macro PREDICT_8x8 1
- ;-----------------------------------------------------------------------------
- ; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
- ;-----------------------------------------------------------------------------
- -cglobal predict_8x8_ddl_sse2, 2,2
- +cglobal predict_8x8_ddl_%1, 2,2
- movdqa xmm3, [r1+16]
- movdqu xmm2, [r1+17]
- - movdqa xmm1, xmm3
- - pslldq xmm1, 1
- + pslldq xmm1, xmm3, 1
- add r0, FDEC_STRIDE*4
- PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm3, xmm4
- @@ -1229,16 +1226,14 @@ cglobal predict_8x8_ddl_sse2, 2,2
- ;-----------------------------------------------------------------------------
- ; void predict_8x8_ddr( uint8_t *src, uint8_t *edge )
- ;-----------------------------------------------------------------------------
- -cglobal predict_8x8_ddr_sse2, 2,2
- +cglobal predict_8x8_ddr_%1, 2,2
- movdqu xmm3, [r1+8]
- movdqu xmm1, [r1+7]
- - movdqa xmm2, xmm3
- - psrldq xmm2, 1
- + psrldq xmm2, xmm3, 1
- add r0, FDEC_STRIDE*4
- PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm3, xmm4
- - movdqa xmm1, xmm0
- - psrldq xmm1, 1
- + psrldq xmm1, xmm0, 1
- %assign Y 3
- %rep 3
- movq [r0+Y*FDEC_STRIDE], xmm0
- @@ -1250,19 +1245,15 @@ cglobal predict_8x8_ddr_sse2, 2,2
- movq [r0-3*FDEC_STRIDE], xmm0
- movq [r0-4*FDEC_STRIDE], xmm1
- RET
- -%endif ; !HIGH_BIT_DEPTH
- ;-----------------------------------------------------------------------------
- ; void predict_8x8_vl( uint8_t *src, uint8_t *edge )
- ;-----------------------------------------------------------------------------
- -cglobal predict_8x8_vl_sse2, 2,2
- +cglobal predict_8x8_vl_%1, 2,2
- movdqa xmm4, [r1+16]
- - movdqa xmm2, xmm4
- - movdqa xmm1, xmm4
- - movdqa xmm3, xmm4
- - psrldq xmm2, 1
- - pslldq xmm1, 1
- - pavgb xmm3, xmm2
- + pslldq xmm1, xmm4, 1
- + psrldq xmm2, xmm4, 1
- + pavgb xmm3, xmm4, xmm2
- add r0, FDEC_STRIDE*4
- PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm4, xmm5
- ; xmm0: (t0 + 2*t1 + t2 + 2) >> 2
- @@ -1282,19 +1273,17 @@ cglobal predict_8x8_vl_sse2, 2,2
- RET
- -%ifndef HIGH_BIT_DEPTH
- ;-----------------------------------------------------------------------------
- ; void predict_8x8_vr( uint8_t *src, uint8_t *edge )
- ;-----------------------------------------------------------------------------
- -cglobal predict_8x8_vr_sse2, 2,2,7
- +cglobal predict_8x8_vr_%1, 2,2,7
- movdqu xmm0, [r1+8]
- movdqa xmm6, [pw_ff00]
- add r0, 4*FDEC_STRIDE
- - movdqa xmm1, xmm0
- movdqa xmm2, xmm0
- movdqa xmm3, xmm0
- + pslldq xmm1, xmm0, 2
- pslldq xmm0, 1
- - pslldq xmm1, 2
- pavgb xmm2, xmm0
- PRED8x8_LOWPASS b, xmm4, xmm3, xmm1, xmm0, xmm5
- pandn xmm6, xmm4
- @@ -1317,7 +1306,14 @@ cglobal predict_8x8_vr_sse2, 2,2,7
- %assign Y (Y-2)
- %endrep
- RET
- -%endif
- +%endmacro ; PREDICT_8x8
- +
- +INIT_XMM
- +PREDICT_8x8 sse2
- +INIT_AVX
- +PREDICT_8x8 avx
- +
- +%endif ; !HIGH_BIT_DEPTH
- ;-----------------------------------------------------------------------------
- ; void predict_8x8_hd( pixel *src, pixel *edge )
- @@ -1336,15 +1332,12 @@ cglobal predict_8x8_hd_%1, 2,2,8*(mmsize/16)
- mova m5, m3
- pavg%2 m3, m1
- PRED8x8_LOWPASS %2, m0, m4, m1, m5, m7
- - mova m4, m2
- - mova m1, m2 ; t6 t5 t4 t3 t2 t1 t0 lt
- - psrl%3 m4, 2*%5 ; .. .. t6 t5 t4 t3 t2 t1
- - psrl%3 m1, %5 ; .. t6 t5 t4 t3 t2 t1 t0
- + psrl%3 m4, m2, 2*%5 ; .. .. t6 t5 t4 t3 t2 t1
- + psrl%3 m1, m2, %5 ; .. t6 t5 t4 t3 t2 t1 t0
- PRED8x8_LOWPASS %2, m6, m4, m2, m1, m5
- ; .. p11 p10 p9
- - mova m7, m3
- + punpckh%4 m7, m3, m0 ; p8 p7 p6 p5
- punpckl%4 m3, m0 ; p4 p3 p2 p1
- - punpckh%4 m7, m0 ; p8 p7 p6 p5
- mova m1, m7
- mova m0, m7
- mova m4, m7
- @@ -1373,6 +1366,8 @@ INIT_XMM
- PREDICT_8x8_HD sse2 , w, dq, wd, 2
- %define PALIGNR PALIGNR_SSSE3
- PREDICT_8x8_HD ssse3 , w, dq, wd, 2
- +INIT_AVX
- +PREDICT_8x8_HD avx , w, dq, wd, 2
- %else
- INIT_MMX
- %define PALIGNR PALIGNR_MMX
- @@ -1391,8 +1386,7 @@ cglobal predict_8x8_hd_%1, 2,2
- PALIGNR xmm1, xmm0, 7, xmm4
- PALIGNR xmm2, xmm0, 9, xmm5
- PALIGNR xmm3, xmm0, 8, xmm0
- - movdqa xmm4, xmm1
- - pavgb xmm4, xmm3
- + pavgb xmm4, xmm1, xmm3
- PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm3, xmm5
- punpcklbw xmm4, xmm0
- movhlps xmm0, xmm4
- @@ -1414,6 +1408,8 @@ INIT_XMM
- PREDICT_8x8_HD sse2
- %define PALIGNR PALIGNR_SSSE3
- PREDICT_8x8_HD ssse3
- +INIT_AVX
- +PREDICT_8x8_HD avx
- INIT_MMX
- %define PALIGNR PALIGNR_MMX
- %endif ; HIGH_BIT_DEPTH
- diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
- index b83a0a5..9a8d99d 100644
- --- a/common/x86/predict-c.c
- +++ b/common/x86/predict-c.c
- @@ -43,6 +43,7 @@
- void x264_predict_16x16_dc_top_ssse3( uint16_t *src );
- void x264_predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c );
- void x264_predict_16x16_p_core_sse2( pixel *src, int i00, int b, int c );
- + void x264_predict_16x16_p_core_avx( pixel *src, int i00, int b, int c );
- void x264_predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c );
- void x264_predict_8x8c_p_core_sse2( pixel *src, int i00, int b, int c );
- void x264_predict_8x8c_dc_mmxext( pixel *src );
- @@ -70,31 +71,43 @@
- void x264_predict_8x8_ddr_mmxext( uint8_t *src, uint8_t edge[33] );
- void x264_predict_8x8_ddl_sse2( pixel *src, pixel edge[33] );
- void x264_predict_8x8_ddr_sse2( pixel *src, pixel edge[33] );
- + void x264_predict_8x8_ddl_avx( pixel *src, pixel edge[33] );
- + void x264_predict_8x8_ddr_avx( pixel *src, pixel edge[33] );
- void x264_predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[33] );
- + void x264_predict_8x8_vl_avx( uint8_t *src, uint8_t edge[33] );
- void x264_predict_8x8_vr_mmxext( uint8_t *src, uint8_t edge[33] );
- void x264_predict_8x8_vr_sse2( pixel *src, pixel edge[33] );
- void x264_predict_8x8_vr_ssse3( uint16_t *src, uint16_t edge[33] );
- + void x264_predict_8x8_vr_avx( pixel *src, pixel edge[33] );
- void x264_predict_8x8_hu_sse2( pixel *src, pixel edge[33] );
- void x264_predict_8x8_hu_ssse3( pixel *src, pixel edge[33] );
- + void x264_predict_8x8_hu_avx( pixel *src, pixel edge[33] );
- void x264_predict_8x8_hd_sse2( pixel *src, pixel edge[33] );
- void x264_predict_8x8_hd_ssse3( pixel *src, pixel edge[33] );
- + void x264_predict_8x8_hd_avx( pixel *src, pixel edge[33] );
- void x264_predict_8x8_filter_mmxext( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
- void x264_predict_8x8_filter_sse2( uint16_t *src, uint16_t edge[33], int i_neighbor, int i_filters );
- void x264_predict_8x8_filter_ssse3( pixel *src, pixel edge[33], int i_neighbor, int i_filters );
- + void x264_predict_8x8_filter_avx( uint16_t *src, uint16_t edge[33], int i_neighbor, int i_filters );
- void x264_predict_4x4_ddl_mmxext( pixel *src );
- void x264_predict_4x4_ddl_sse2( uint16_t *src );
- + void x264_predict_4x4_ddl_avx( uint16_t *src );
- void x264_predict_4x4_ddr_mmxext( pixel *src );
- void x264_predict_4x4_vl_mmxext( pixel *src );
- void x264_predict_4x4_vl_sse2( uint16_t *src );
- + void x264_predict_4x4_vl_avx( uint16_t *src );
- void x264_predict_4x4_vr_mmxext( uint8_t *src );
- void x264_predict_4x4_vr_sse2( uint16_t *src );
- void x264_predict_4x4_vr_ssse3( pixel *src );
- + void x264_predict_4x4_vr_avx( uint16_t *src );
- void x264_predict_4x4_hd_mmxext( pixel *src );
- void x264_predict_4x4_hd_sse2( uint16_t *src );
- void x264_predict_4x4_hd_ssse3( pixel *src );
- + void x264_predict_4x4_hd_avx( uint16_t *src );
- void x264_predict_4x4_dc_mmxext( pixel *src );
- void x264_predict_4x4_ddr_sse2( uint16_t *src );
- void x264_predict_4x4_ddr_ssse3( pixel *src );
- + void x264_predict_4x4_ddr_avx( uint16_t *src );
- void x264_predict_4x4_hu_mmxext( pixel *src );
- #define PREDICT_16x16_DC(name)\
- @@ -164,6 +177,7 @@ static void x264_predict_16x16_p_##name( pixel *src )\
- PREDICT_16x16_P( mmxext )
- #endif
- PREDICT_16x16_P( sse2 )
- +PREDICT_16x16_P( avx )
- #endif //!HIGH_BIT_DEPTH
- #ifdef __GNUC__
- @@ -387,6 +401,7 @@ void x264_intra_sa8d_x3_8x8_##cpu( uint8_t *fenc, uint8_t edge[33], int res[3] )
- #if ARCH_X86_64
- INTRA_SA8D_X3(sse2)
- INTRA_SA8D_X3(ssse3)
- +INTRA_SA8D_X3(avx)
- #else
- INTRA_SA8D_X3(mmxext)
- #endif
- @@ -432,6 +447,9 @@ void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] )
- if( !(cpu&X264_CPU_SSSE3) )
- return;
- pf[I_PRED_16x16_H] = x264_predict_16x16_h_ssse3;
- #ifdef __GNUC__
- pf[I_PRED_16x16_P] = x264_predict_16x16_p_ssse3;
- #endif
- + if( !(cpu&X264_CPU_AVX) )
- + return;
- + pf[I_PRED_16x16_P] = x264_predict_16x16_p_avx; /* placed after the SSSE3 block: SSSE3-only CPUs still get p_ssse3, and AVX is not overwritten by it */
- @@ -503,6 +521,13 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_
- pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_ssse3;
- pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_ssse3;
- *predict_8x8_filter = x264_predict_8x8_filter_ssse3;
- + if( !(cpu&X264_CPU_AVX) )
- + return;
- + pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_avx;
- + pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_avx;
- + pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_avx;
- + pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_avx;
- + *predict_8x8_filter = x264_predict_8x8_filter_avx;
- #else
- pf[I_PRED_8x8_V] = x264_predict_8x8_v_mmxext;
- pf[I_PRED_8x8_H] = x264_predict_8x8_h_mmxext;
- @@ -530,6 +555,12 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_
- pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_ssse3;
- pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_ssse3;
- *predict_8x8_filter = x264_predict_8x8_filter_ssse3;
- + if( !(cpu&X264_CPU_AVX) )
- + return;
- + pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_avx;
- + pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_avx;
- + pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_avx;
- + pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_avx;
- #endif // HIGH_BIT_DEPTH
- }
- @@ -551,6 +582,15 @@ void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] )
- pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_sse2;
- pf[I_PRED_4x4_VL] = x264_predict_4x4_vl_sse2;
- pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_sse2;
- + if( !(cpu&X264_CPU_AVX) )
- + return;
- + pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_avx;
- + pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_avx;
- + pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_avx;
- +#if ARCH_X86_64
- + pf[I_PRED_4x4_VL] = x264_predict_4x4_vl_avx;
- +#endif
- + pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_avx;
- #else
- pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_mmxext;
- #endif // HIGH_BIT_DEPTH
- diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
- index 96ba712..5bc9b6e 100644
- --- a/common/x86/quant-a.asm
- +++ b/common/x86/quant-a.asm
- @@ -492,9 +492,8 @@ QUANT_AC quant_8x8_sse4, 8
- paddd m0, m3
- psrad m0, m2
- %else
- - mova m1, m0
- + punpckhwd m1, m0, m4
- punpcklwd m0, m4
- - punpckhwd m1, m4
- pmaddwd m0, %2
- pmaddwd m1, %3
- paddd m0, m3
- @@ -586,6 +585,7 @@ cglobal dequant_%2x%2_%1, 0,3,6*(mmsize/16)
- psrld m3, 1
- DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4
- +%ifnidn %1, avx
- cglobal dequant_%2x%2_flat16_%1, 0,3
- movifnidn t2d, r2m
- %if %2 == 8
- @@ -625,6 +625,7 @@ cglobal dequant_%2x%2_flat16_%1, 0,3
- DEQUANT16_FLAT [r1+32], 32, 96
- %endif
- RET
- +%endif ; !AVX
- %endmacro ; DEQUANT
- %ifdef HIGH_BIT_DEPTH
- @@ -642,6 +643,9 @@ DEQUANT mmx, 8, 6, 1
- INIT_XMM
- DEQUANT sse2, 4, 4, 2
- DEQUANT sse2, 8, 6, 2
- +INIT_AVX
- +DEQUANT avx, 4, 4, 2
- +DEQUANT avx, 8, 6, 2
- %endif
- %macro DEQUANT_DC 2
- @@ -714,9 +718,8 @@ cglobal dequant_4x4dc_%1, 0,3,6*(mmsize/16)
- punpcklwd m2, m4
- %rep SIZEOF_PIXEL*32/mmsize
- mova m0, [r0+x]
- - mova m1, m0
- + punpckhwd m1, m0, m5
- punpcklwd m0, m5
- - punpckhwd m1, m5
- pmaddwd m0, m2
- pmaddwd m1, m2
- psrad m0, m3
- @@ -733,11 +736,15 @@ cglobal dequant_4x4dc_%1, 0,3,6*(mmsize/16)
- INIT_XMM
- DEQUANT_DC sse2 , d
- DEQUANT_DC sse4 , d
- +INIT_AVX
- +DEQUANT_DC avx , d
- %else
- INIT_MMX
- DEQUANT_DC mmxext, w
- INIT_XMM
- DEQUANT_DC sse2 , w
- +INIT_AVX
- +DEQUANT_DC avx , w
- %endif
- %ifdef HIGH_BIT_DEPTH
- @@ -757,11 +764,9 @@ cglobal denoise_dct_%1, 4,4,%2
- mova m5, m1
- psubd m0, [r2+r3*4+0*mmsize]
- psubd m1, [r2+r3*4+1*mmsize]
- - mova m7, m0
- - pcmpgtd m7, m6
- + pcmpgtd m7, m0, m6
- pand m0, m7
- - mova m7, m1
- - pcmpgtd m7, m6
- + pcmpgtd m7, m1, m6
- pand m1, m7
- PSIGND m0, m2
- PSIGND m1, m3
- @@ -786,6 +791,8 @@ DENOISE_DCT sse2, 8
- %define PABSD PABSD_SSSE3
- %define PSIGND PSIGND_SSSE3
- DENOISE_DCT ssse3, 8
- +INIT_AVX
- +DENOISE_DCT avx , 8
- %else ; !HIGH_BIT_DEPTH
- diff --git a/common/x86/quant.h b/common/x86/quant.h
- index de2b10b..d8f0edd 100644
- --- a/common/x86/quant.h
- +++ b/common/x86/quant.h
- @@ -50,6 +50,9 @@ void x264_dequant_8x8_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
- void x264_dequant_4x4_sse2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
- void x264_dequant_4x4dc_sse2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
- void x264_dequant_8x8_sse2( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
- +void x264_dequant_4x4_avx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
- +void x264_dequant_4x4dc_avx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
- +void x264_dequant_8x8_avx( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
- void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
- void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
- void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
- diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
- index 4d23b82..36455cc 100644
- --- a/common/x86/sad-a.asm
- +++ b/common/x86/sad-a.asm
- @@ -326,11 +326,10 @@ cglobal intra_sad_x3_4x4_sse4, 3,3
- pinsrd xmm4, [r1+FDEC_STRIDE*3-4], 3
- movd xmm2, [r1-FDEC_STRIDE]
- pxor xmm3, xmm3
- - movdqa xmm5, xmm4
- - pshufb xmm4, [h4x4_pred_shuf2] ; EFGH
- - pshufb xmm5, [h4x4_pred_shuf] ; EEEEFFFFGGGGHHHH
- - pshufd xmm0, xmm2, 0 ; ABCDABCDABCDABCD
- - punpckldq xmm2, xmm4 ; ABCDEFGH
- + pshufb xmm5, xmm4, [h4x4_pred_shuf] ; EEEEFFFFGGGGHHHH
- + pshufb xmm4, [h4x4_pred_shuf2] ; EFGH
- + pshufd xmm0, xmm2, 0 ; ABCDABCDABCDABCD
- + punpckldq xmm2, xmm4 ; ABCDEFGH
- psadbw xmm2, xmm3
- movd xmm1, [r0+FENC_STRIDE*0]
- pinsrd xmm1, [r0+FENC_STRIDE*1], 1
- @@ -341,9 +340,8 @@ cglobal intra_sad_x3_4x4_sse4, 3,3
- psraw xmm2, 2
- pavgw xmm2, xmm3
- pshufb xmm2, xmm3 ; DC prediction
- - movdqa xmm3, xmm0
- + punpckhqdq xmm3, xmm0, xmm5
- punpcklqdq xmm0, xmm5
- - punpckhqdq xmm3, xmm5
- psadbw xmm2, xmm1
- paddw xmm0, xmm3
- movhlps xmm4, xmm2
- @@ -446,12 +444,10 @@ cglobal intra_sad_x3_8x8_ssse3, 3,4,9
- .loop:
- movq m6, [r0+FENC_STRIDE*0]
- movhps m6, [r0+FENC_STRIDE*1]
- - movdqa m7, m0
- - pshufb m7, [shuf+r3*8] ; H prediction
- + pshufb m7, m0, [shuf+r3*8] ; H prediction
- %ifdef ARCH_X86_64
- - movdqa m8, m1
- psadbw m7, m6
- - psadbw m8, m6
- + psadbw m8, m1, m6
- psadbw m6, m2
- paddw m4, m7
- paddw m3, m8
- @@ -459,8 +455,7 @@ cglobal intra_sad_x3_8x8_ssse3, 3,4,9
- %else
- psadbw m7, m6
- paddw m4, m7
- - movdqa m7, m1
- - psadbw m7, m6
- + psadbw m7, m1, m6
- psadbw m6, m2
- paddw m3, m7
- paddw m5, m6
- diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
- index 7ddc1c2..d56c312 100644
- --- a/common/x86/x86inc.asm
- +++ b/common/x86/x86inc.asm
- @@ -547,6 +547,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
- %macro INIT_AVX 0
- INIT_XMM
- %assign avx_enabled 1
- + %define PALIGNR PALIGNR_SSSE3 ; AVX builds reuse the SSSE3 PALIGNR wrapper (commit message lists an AVX palignr as a later TODO)
- %define RESET_MM_PERMUTATION INIT_AVX
- %endmacro
- @@ -870,6 +871,7 @@ AVX_INSTR punpcklwd, 0, 0
- AVX_INSTR punpckldq, 0, 0
- AVX_INSTR punpcklqdq, 0, 0
- AVX_INSTR pxor, 0, 0
- +AVX_INSTR shufps, 0, 1
- AVX_INSTR subpd, 1, 0
- AVX_INSTR subps, 1, 0
- AVX_INSTR subsd, 1, 0
- diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm
- index 4e25448..34177bd 100644
- --- a/common/x86/x86util.asm
- +++ b/common/x86/x86util.asm
- @@ -42,16 +42,20 @@
- %assign PIXEL_MAX ((1 << BIT_DEPTH)-1)
- %macro SBUTTERFLY 4
- +%if avx_enabled == 0
- mova m%4, m%2
- punpckl%1 m%2, m%3
- punpckh%1 m%4, m%3
- +%else ; avx_enabled: non-destructive three-operand form, so no mova copy is needed
- + punpckh%1 m%4, m%2, m%3 ; high interleave of args 2/3 goes straight into the scratch register (arg 4)
- + punpckl%1 m%2, m%3 ; low interleave then overwrites arg 2 in place
- +%endif ; avx_enabled
- SWAP %3, %4
- %endmacro
- %macro SBUTTERFLY2 4
- - mova m%4, m%2
- - punpckh%1 m%2, m%3
- - punpckl%1 m%4, m%3
- + punpckl%1 m%4, m%2, m%3 ; low interleave into scratch (arg 4); arg 2 is still unmodified afterwards
- + punpckh%1 m%2, m%2, m%3 ; high interleave in place. NOTE(review): no avx_enabled guard here, unlike SBUTTERFLY - presumably relies on the AVX_INSTR wrappers in x86inc.asm when AVX is off; confirm
- SWAP %2, %4, %3
- %endmacro
- @@ -229,16 +233,15 @@
- %macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from
- %ifnum %5
- - mova m%1, m%5
- - mova m%3, m%5
- + pand m%3, m%5, m%4 ; src .. y6 .. y4 (mask register AND reg2, written to the second mask register)
- + pand m%1, m%5, m%2 ; dst .. y6 .. y4 (mask register AND reg1, written to the first mask register)
- %else
- mova m%1, %5
- - mova m%3, m%1
- + pand m%3, m%1, m%4 ; src .. y6 .. y4 (must come first: the next line overwrites the mask loaded into the first mask register)
- + pand m%1, m%1, m%2 ; dst .. y6 .. y4 (in place; the loaded mask is no longer needed after this)
- %endif
- - pand m%1, m%2 ; dst .. y6 .. y4
- - pand m%3, m%4 ; src .. y6 .. y4
- - psrlw m%2, 8 ; dst .. y7 .. y5
- - psrlw m%4, 8 ; src .. y7 .. y5
- + psrlw m%2, 8 ; dst .. y7 .. y5
- + psrlw m%4, 8 ; src .. y7 .. y5
- %endmacro
- %macro SUMSUB_BA 3-4
- @@ -317,23 +320,31 @@
- %macro TRANS_SSE4 5-6 ; see above
- %ifidn %1, d
- - mova m%5, m%3
- %ifidn %2, ord
- - psrl%1 m%3, 16
- -%endif
- + psrl%1 m%5, m%3, 16 ; shifted copy goes to the scratch register (arg 5); arg 3 itself is left untouched
- + SWAP %3, %5 ; rename: arg 3 now names the shifted copy, arg 5 the unshifted original
- pblendw m%3, m%4, 10101010b
- - psll%1 m%4, 16
- -%ifidn %2, ord
- + psll%1 m%4, 16
- pblendw m%4, m%5, 01010101b
- %else
- - psrl%1 m%5, 16
- - por m%4, m%5
- +%if avx_enabled == 0
- + mova m%5, m%3 ; keep a copy of arg 3 for the shift/or below
- + pblendw m%3, m%4, 10101010b
- + psll%1 m%4, 16
- + psrl%1 m%5, 16
- + por m%4, m%5 ; arg 4 = (arg 4 << 16) | (copy of original arg 3 >> 16)
- +%else ; avx_enabled: blend into the scratch register so arg 3 stays intact and no mova is needed
- + pblendw m%5, m%3, m%4, 10101010b ; blend result lands in scratch (arg 5) instead of arg 3
- + psll%1 m%4, 16
- + psrl%1 m%3, 16 ; arg 3 still holds its original value here, so it can be shifted in place
- + por m%4, m%3
- + SWAP %3, %5 ; rename so the blend result is called arg 3, matching the non-AVX path
- +%endif ; avx_enabled
- %endif
- %elifidn %1, q
- - mova m%5, m%3
- + shufps m%5, m%3, m%4, 11011101b ; computed before the next line overwrites arg 3
- shufps m%3, m%4, 10001000b
- - shufps m%5, m%4, 11011101b
- - SWAP %4, %5
- + SWAP %4, %5
- %endif
- %endmacro
- @@ -427,19 +438,34 @@
- %endmacro
- %macro SUMSUB2_AB 4
- +%if avx_enabled == 0
- mova %4, %2
- padd%1 %2, %2
- padd%1 %2, %3
- psub%1 %4, %3
- psub%1 %4, %3
- +%else ; avx_enabled: form the difference first, while arg 2 still holds its input value
- + psub%1 %4, %2, %3 ; arg 4 = arg 2 - arg 3 (three-operand form replaces the mova copy)
- + psub%1 %4, %3 ; arg 4 = arg 2 - 2*arg 3
- + padd%1 %2, %2 ; arg 2 doubled in place
- + padd%1 %2, %3 ; arg 2 = 2*arg 2 + arg 3
- +%endif ; avx_enabled
- %endmacro
- %macro SUMSUB2_BA 4
- +%if avx_enabled == 0
- mova m%4, m%2
- padd%1 m%2, m%3
- padd%1 m%2, m%3
- psub%1 m%3, m%4
- psub%1 m%3, m%4
- +%else ; avx_enabled: build the sum in the scratch register so arg 2 stays intact for the subtraction
- + padd%1 m%4, m%2, m%3 ; scratch (arg 4) = arg 2 + arg 3
- + padd%1 m%4, m%3 ; scratch = arg 2 + 2*arg 3
- + psub%1 m%3, m%2 ; arg 2 is still unmodified here
- + psub%1 m%3, m%2 ; arg 3 = arg 3 - 2*arg 2
- + SWAP m%2, m%4 ; rename so the sum is called arg 2, as in the non-AVX path. NOTE(review): other macros in this file pass bare indices to SWAP (e.g. SWAP with args 3, 4 in SBUTTERFLY) - confirm the m-prefixed form is intended
- +%endif ; avx_enabled
- %endmacro
- %macro SUMSUBD2_AB 5
- @@ -616,9 +642,8 @@
- %endmacro
- %macro HADDUW 2
- - mova %2, %1
- + psrld %2, %1, 16
- pslld %1, 16
- - psrld %2, 16
- psrld %1, 16
- paddd %1, %2
- HADDD %1, %2
- --
- 1.7.2.3
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement