Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- From 6e9f6f7c55bd4fe95adc69b1ec7a7ff335f46fe7 Mon Sep 17 00:00:00 2001
- From: Daniel Kang <daniel.d.kang@gmail.com>
- Date: Tue, 7 Dec 2010 15:19:46 -0500
- Subject: [PATCH 1/5] fixes
- ---
- common/x86/predict-a.asm | 48 ++++++++++++++++++++++++++++++++++++++++++---
- common/x86/predict-c.c | 10 ++++----
- 2 files changed, 49 insertions(+), 9 deletions(-)
- diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
- index c908c3a..7af6024 100644
- --- a/common/x86/predict-a.asm
- +++ b/common/x86/predict-a.asm
- @@ -168,7 +168,7 @@ cextern pb_reverse
- ;-----------------------------------------------------------------------------
- %macro PREDICT_4x4_DDL 4
- cglobal predict_4x4_ddl_%1, 1,1
- - mova m1, [r0-FDEC_STRIDEB]
- + movu m1, [r0-FDEC_STRIDEB]
- mova m2, m1
- mova m3, m1
- mova m4, m1
- @@ -224,8 +224,21 @@ PREDICT_4x4_DDL mmxext, q , 8, b
- ;-----------------------------------------------------------------------------
- %macro PREDICT_4x4 7
- cglobal predict_4x4_ddr_%1, 1,1
- - mova m1, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- - mova m2, [r0+0*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- + movu m1, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- + movu m2, [r0+0*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- +%ifdef HIGH_BIT_DEPTH
- + movu m4, [r0-1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- + punpckh%2 m2, m4
- + movh m3, [r0-1*FDEC_STRIDEB]
- + punpckh%3 m1, m2
- + PALIGNR m3, m1, 5*SIZEOF_PIXEL, m1
- + mova m1, m3
- + movu m4, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- + PALIGNR m3, m4, 7*SIZEOF_PIXEL, m4
- + mova m2, m3
- + movu m4, [r0+3*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- + PALIGNR m3, m4, 7*SIZEOF_PIXEL, m4
- +%else
- punpckh%2 m2, [r0-1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- movh m3, [r0-1*FDEC_STRIDEB]
- punpckh%3 m1, m2
- @@ -234,6 +247,7 @@ cglobal predict_4x4_ddr_%1, 1,1
- PALIGNR m3, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m4
- mova m2, m3
- PALIGNR m3, [r0+3*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m4
- +%endif
- PRED8x8_LOWPASS %5, m0, m3, m1, m2, m4
- %assign Y 3
- movh [r0+Y*FDEC_STRIDEB], m0
- @@ -247,6 +261,19 @@ cglobal predict_4x4_ddr_%1, 1,1
- cglobal predict_4x4_vr_%1, 1,1,6*(mmsize/16)
- movh m0, [r0-1*FDEC_STRIDEB] ; ........t3t2t1t0
- mova m5, m0
- +%ifdef HIGH_BIT_DEPTH
- + movu m1, [r0-1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- + PALIGNR m0, m1, 7*SIZEOF_PIXEL, m1 ; ......t3t2t1t0lt
- + pavg%5 m5, m0
- + movu m1, [r0+0*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- + PALIGNR m0, m1, 7*SIZEOF_PIXEL, m1 ; ....t3t2t1t0ltl0
- + mova m1, m0
- + movu m2, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- + PALIGNR m0, m2, 7*SIZEOF_PIXEL, m2 ; ..t3t2t1t0ltl0l1
- + mova m2, m0
- + movu m3, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- + PALIGNR m0, m3, 7*SIZEOF_PIXEL, m3 ; t3t2t1t0ltl0l1l2
- +%else
- PALIGNR m0, [r0-1*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m1 ; ......t3t2t1t0lt
- pavg%5 m5, m0
- PALIGNR m0, [r0+0*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m1 ; ....t3t2t1t0ltl0
- @@ -254,6 +281,7 @@ cglobal predict_4x4_vr_%1, 1,1,6*(mmsize/16)
- PALIGNR m0, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m2 ; ..t3t2t1t0ltl0l1
- mova m2, m0
- PALIGNR m0, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m3 ; t3t2t1t0ltl0l1l2
- +%endif
- PRED8x8_LOWPASS %5, m3, m1, m0, m2, m4
- mova m1, m3
- psrl%4 m3, %7*2
- @@ -269,12 +297,24 @@ cglobal predict_4x4_vr_%1, 1,1,6*(mmsize/16)
- cglobal predict_4x4_hd_%1, 1,1,6*(mmsize/16)
- movh m0, [r0-1*FDEC_STRIDEB-4*SIZEOF_PIXEL] ; lt ..
- +%ifdef HIGH_BIT_DEPTH
- + movu m1, [r0-1*FDEC_STRIDEB]
- + punpckl%6 m0, m1 ; t3 t2 t1 t0 lt .. .. ..
- + psll%4 m0, %7 ; t2 t1 t0 lt .. .. .. ..
- + movu m1, [r0+3*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l3
- + movu m2, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- + punpckh%2 m1, m2 ; l2 l3
- + movu m2, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l1
- + movu m3, [r0+0*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- + punpckh%2 m2, m3 ; l0 l1
- +%else
- punpckl%6 m0, [r0-1*FDEC_STRIDEB] ; t3 t2 t1 t0 lt .. .. ..
- psll%4 m0, %7 ; t2 t1 t0 lt .. .. .. ..
- mova m1, [r0+3*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l3
- punpckh%2 m1, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l2 l3
- mova m2, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l1
- punpckh%2 m2, [r0+0*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l0 l1
- +%endif
- punpckh%3 m1, m2 ; l0 l1 l2 l3
- punpckh%6 m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
- mova m0, m1
- @@ -378,7 +418,7 @@ cglobal predict_4x4_hu_mmxext, 1,1
- ;-----------------------------------------------------------------------------
- %macro PREDICT_4x4_V1 4
- cglobal predict_4x4_vl_%1, 1,1,6*(mmsize/16)
- - mova m1, [r0-FDEC_STRIDEB]
- + movu m1, [r0-FDEC_STRIDEB]
- mova m3, m1
- mova m2, m1
- psrl%2 m3, %3
- diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
- index 994e05f..829a191 100644
- --- a/common/x86/predict-c.c
- +++ b/common/x86/predict-c.c
- @@ -505,14 +505,14 @@ void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] )
- #if HIGH_BIT_DEPTH
- if( !(cpu&X264_CPU_SSE2) )
- return;
- -// pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_sse2;
- + pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_sse2;
- pf[I_PRED_4x4_HU] = x264_predict_4x4_hu_sse2;
- -// pf[I_PRED_4x4_VL] = x264_predict_4x4_vl_sse2;
- + pf[I_PRED_4x4_VL] = x264_predict_4x4_vl_sse2;
- if( !(cpu&X264_CPU_SSSE3) )
- return;
- -// pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_ssse3;
- -// pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_ssse3;
- -// pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_ssse3;
- + pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_ssse3;
- + pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_ssse3;
- + pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_ssse3;
- #else
- pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_mmxext;
- pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_mmxext;
- --
- 1.7.2.3
- From 590562302162f9f150b36a2ab813e190d70abd2a Mon Sep 17 00:00:00 2001
- From: Daniel Kang <daniel.d.kang@gmail.com>
- Date: Tue, 7 Dec 2010 22:48:15 -0500
- Subject: [PATCH 2/5] zigzag_scan_4x4_frame, zigzag_scan_4x4_field, zigzag_scan_8x8_frame, zigzag_scan_8x8_field
- ---
- common/dct.c | 27 +++-
- common/x86/dct-a.asm | 440 ++++++++++++++++++++++++++++----------------------
- common/x86/dct.h | 5 +-
- 3 files changed, 272 insertions(+), 200 deletions(-)
- diff --git a/common/dct.c b/common/dct.c
- index 788452b..25c53d9 100644
- --- a/common/dct.c
- +++ b/common/dct.c
- @@ -732,7 +732,18 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
- pf->sub_8x8 = zigzag_sub_8x8_field;
- pf->sub_4x4 = zigzag_sub_4x4_field;
- pf->sub_4x4ac = zigzag_sub_4x4ac_field;
- -#if !HIGH_BIT_DEPTH
- +#if HIGH_BIT_DEPTH
- +#if HAVE_MMX // the _sse2/_sse4 scans are x86 asm symbols; guard them like the 8-bit path below
- + if( cpu&X264_CPU_SSE2 )
- + {
- + pf->scan_4x4 = x264_zigzag_scan_4x4_field_sse2;
- + }
- + if( cpu&X264_CPU_SSE4 )
- + {
- + pf->scan_8x8 = x264_zigzag_scan_8x8_field_sse4;
- + }
- +#endif // HAVE_MMX
- +#else
- #if HAVE_MMX
- if( cpu&X264_CPU_MMXEXT )
- {
- @@ -750,7 +761,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
- if( cpu&X264_CPU_ALTIVEC )
- pf->scan_4x4 = x264_zigzag_scan_4x4_field_altivec;
- #endif
- -#endif // !HIGH_BIT_DEPTH
- +#endif // HIGH_BIT_DEPTH
- }
- else
- {
- @@ -759,7 +770,15 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
- pf->sub_8x8 = zigzag_sub_8x8_frame;
- pf->sub_4x4 = zigzag_sub_4x4_frame;
- pf->sub_4x4ac = zigzag_sub_4x4ac_frame;
- -#if !HIGH_BIT_DEPTH
- +#if HIGH_BIT_DEPTH
- +#if HAVE_MMX // without x86 asm the C frame scans installed earlier stay in place
- + if( cpu&X264_CPU_SSE2 )
- + {
- + pf->scan_4x4 = x264_zigzag_scan_4x4_frame_sse2;
- + pf->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
- + }
- +#endif // HAVE_MMX
- +#else
- #if HAVE_MMX
- if( cpu&X264_CPU_MMX )
- pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
- @@ -785,7 +804,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
- if( cpu&X264_CPU_NEON )
- pf->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
- #endif
- -#endif // !HIGH_BIT_DEPTH
- +#endif // HIGH_BIT_DEPTH
- }
- pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
- pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
- diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
- index 50f806a..8500e03 100644
- --- a/common/x86/dct-a.asm
- +++ b/common/x86/dct-a.asm
- @@ -891,136 +891,158 @@ cglobal zigzag_scan_8x8_frame_%1, 2,2,8
- RET
- %endmacro
- +%ifndef HIGH_BIT_DEPTH
- INIT_XMM
- %define PALIGNR PALIGNR_MMX
- SCAN_8x8 sse2
- %define PALIGNR PALIGNR_SSSE3
- SCAN_8x8 ssse3
- +%endif
- ;-----------------------------------------------------------------------------
- -; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
- +; void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[8][8] )
- ;-----------------------------------------------------------------------------
- -cglobal zigzag_scan_8x8_frame_mmxext, 2,2
- - movq mm0, [r1]
- - movq mm1, [r1+2*8]
- - movq mm2, [r1+2*14]
- - movq mm3, [r1+2*21]
- - movq mm4, [r1+2*28]
- - movq mm5, mm0
- - movq mm6, mm1
- - psrlq mm0, 16
- - punpckldq mm1, mm1
- - punpcklwd mm5, mm6
- - punpckhwd mm1, mm3
- - punpckhwd mm6, mm0
- - punpckldq mm5, mm0
- - movq mm7, [r1+2*52]
- - movq mm0, [r1+2*60]
- - punpckhwd mm1, mm2
- - punpcklwd mm2, mm4
- - punpckhwd mm4, mm3
- - punpckldq mm3, mm3
- - punpckhwd mm3, mm2
- - movq [r0], mm5
- - movq [r0+2*4], mm1
- - movq [r0+2*8], mm6
- - punpcklwd mm6, mm0
- - punpcklwd mm6, mm7
- - movq mm1, [r1+2*32]
- - movq mm5, [r1+2*39]
- - movq mm2, [r1+2*46]
- - movq [r0+2*35], mm3
- - movq [r0+2*47], mm4
- - punpckhwd mm7, mm0
- - psllq mm0, 16
- - movq mm3, mm5
- - punpcklwd mm5, mm1
- - punpckhwd mm1, mm2
- - punpckhdq mm3, mm3
- - movq [r0+2*52], mm6
- - movq [r0+2*13], mm5
- - movq mm4, [r1+2*11]
- - movq mm6, [r1+2*25]
- - punpcklwd mm5, mm7
- - punpcklwd mm1, mm3
- - punpckhdq mm0, mm7
- - movq mm3, [r1+2*4]
- - movq mm7, [r1+2*18]
- - punpcklwd mm2, mm5
- - movq [r0+2*25], mm1
- - movq mm1, mm4
- - movq mm5, mm6
- - punpcklwd mm4, mm3
- - punpcklwd mm6, mm7
- - punpckhwd mm1, mm3
- - punpckhwd mm5, mm7
- - movq mm3, mm6
- - movq mm7, mm5
- - punpckldq mm6, mm4
- - punpckldq mm5, mm1
- - punpckhdq mm3, mm4
- - punpckhdq mm7, mm1
- - movq mm4, [r1+2*35]
- - movq mm1, [r1+2*49]
- - pshufw mm6, mm6, 0x1b
- - pshufw mm5, mm5, 0x1b
- - movq [r0+2*60], mm0
- - movq [r0+2*56], mm2
- - movq mm0, [r1+2*42]
- - movq mm2, [r1+2*56]
- - movq [r0+2*17], mm3
- - movq [r0+2*32], mm7
- - movq [r0+2*10], mm6
- - movq [r0+2*21], mm5
- - movq mm3, mm0
- - movq mm7, mm2
- - punpcklwd mm0, mm4
- - punpcklwd mm2, mm1
- - punpckhwd mm3, mm4
- - punpckhwd mm7, mm1
- - movq mm4, mm2
- - movq mm1, mm7
- - punpckhdq mm2, mm0
- - punpckhdq mm7, mm3
- - punpckldq mm4, mm0
- - punpckldq mm1, mm3
- - pshufw mm2, mm2, 0x1b
- - pshufw mm7, mm7, 0x1b
- - movq [r0+2*28], mm4
- - movq [r0+2*43], mm1
- - movq [r0+2*39], mm2
- - movq [r0+2*50], mm7
- +%macro SCAN_8x8_FRAME 6 ; %1=cpu suffix, %2=shift count for one coef (psrl/psll%3), %3=shift suffix, %4=punpck suffix moving coef pairs, %5=punpck suffix interleaving single coefs, %6=pshuf suffix
- +cglobal zigzag_scan_8x8_frame_%1, 2,2,8*(mmsize/16) ; r0 = level (out), r1 = dct (in); lane notes below list dct coef indices, highest lane first, 4 coefs per register
- + mova m0, [r1] ; 03 02 01 00
- + mova m1, [r1+SIZEOF_PIXEL*2* 8] ; 11 10 09 08
- + movu m2, [r1+SIZEOF_PIXEL*2*14] ; 17 16 15 14
- + movu m3, [r1+SIZEOF_PIXEL*2*21] ; 24 23 22 21
- + mova m4, [r1+SIZEOF_PIXEL*2*28] ; 31 30 29 28
- + mova m5, m0 ; 03 02 01 00
- + mova m6, m1 ; 11 10 09 08
- + psrl%3 m0, %2 ; XX 03 02 01
- + punpckl%4 m1, m1 ; 09 08 09 08
- + punpckl%5 m5, m6 ; 09 01 08 00
- + punpckh%5 m1, m3 ; 24 09 23 08
- + punpckh%5 m6, m0 ; XX 11 03 10
- + punpckl%4 m5, m0 ; 02 01 08 00
- + mova m7, [r1+SIZEOF_PIXEL*2*52] ; 55 54 53 52
- + mova m0, [r1+SIZEOF_PIXEL*2*60] ; 63 62 61 60
- + punpckh%5 m1, m2 ; 17 24 16 09
- + punpckl%5 m2, m4 ; 29 15 28 14
- + punpckh%5 m4, m3 ; 24 31 23 30
- + punpckl%4 m3, m3 ; 22 21 22 21
- + punpckh%5 m3, m2 ; 29 22 15 21
- + mova [r0], m5 ; level[0..3]
- + mova [r0+SIZEOF_PIXEL*2*4], m1 ; level[4..7]
- + mova [r0+SIZEOF_PIXEL*2*8], m6 ; level[8..9] final; lanes 10..11 are rewritten by the later store at 10
- + punpckl%5 m6, m0 ; 61 03 60 10
- + punpckl%5 m6, m7 ; 53 60 52 10
- + mova m1, [r1+SIZEOF_PIXEL*2*32] ; 35 34 33 32
- + movu m5, [r1+SIZEOF_PIXEL*2*39] ; 42 41 40 39
- + movu m2, [r1+SIZEOF_PIXEL*2*46] ; 49 48 47 46
- + movu [r0+SIZEOF_PIXEL*2*35], m3 ; level[36..38] final; lane 35 is rewritten by the later store at 32
- + movu [r0+SIZEOF_PIXEL*2*47], m4 ; level[47..49] final; lane 50 is rewritten by the later store at 50
- + punpckh%5 m7, m0 ; 63 55 62 54
- + psll%3 m0, %2 ; 62 61 60 XX
- + mova m3, m5 ; 42 41 40 39
- + punpckl%5 m5, m1 ; 33 40 32 39
- + punpckh%5 m1, m2 ; 49 35 48 34
- + punpckh%4 m3, m3 ; 42 41 42 41
- + mova [r0+SIZEOF_PIXEL*2*52], m6 ; level[54..55] final; lanes 52..53 are rewritten by the later store at 50
- + movu [r0+SIZEOF_PIXEL*2*13], m5 ; level[14..16] final; lane 13 is rewritten by the later store at 10
- + movu m4, [r1+SIZEOF_PIXEL*2*11] ; 14 13 12 11
- + movu m6, [r1+SIZEOF_PIXEL*2*25] ; 28 27 26 25
- + punpckl%5 m5, m7 ; 62 32 54 39
- + punpckl%5 m1, m3 ; 42 48 41 34
- + punpckh%4 m0, m7 ; 63 55 62 61
- + mova m3, [r1+SIZEOF_PIXEL*2* 4] ; 07 06 05 04
- + movu m7, [r1+SIZEOF_PIXEL*2*18] ; 21 20 19 18
- + punpckl%5 m2, m5 ; 54 47 39 46
- + movu [r0+SIZEOF_PIXEL*2*25], m1 ; level[25..27] final; lane 28 is rewritten by the later store at 28
- + mova m1, m4 ; 14 13 12 11
- + mova m5, m6 ; 28 27 26 25
- + punpckl%5 m4, m3 ; 05 12 04 11
- + punpckl%5 m6, m7 ; 19 26 18 25
- + punpckh%5 m1, m3 ; 07 14 06 13
- + punpckh%5 m5, m7 ; 21 28 20 27
- + mova m3, m6 ; 19 26 18 25
- + mova m7, m5 ; 21 28 20 27
- + punpckl%4 m6, m4 ; 04 11 18 25
- + punpckl%4 m5, m1 ; 06 13 20 27
- + punpckh%4 m3, m4 ; 05 12 19 26
- + punpckh%4 m7, m1 ; 07 14 21 28
- + movu m4, [r1+SIZEOF_PIXEL*2*35] ; 38 37 36 35
- + movu m1, [r1+SIZEOF_PIXEL*2*49] ; 52 51 50 49
- + pshuf%6 m6, m6, 0x1b ; 25 18 11 04 (0x1b reverses the four lanes)
- + pshuf%6 m5, m5, 0x1b ; 27 20 13 06
- + mova [r0+SIZEOF_PIXEL*2*60], m0 ; level[60..63]
- + mova [r0+SIZEOF_PIXEL*2*56], m2 ; level[56..59]
- + movu m0, [r1+SIZEOF_PIXEL*2*42] ; 45 44 43 42
- + mova m2, [r1+SIZEOF_PIXEL*2*56] ; 59 58 57 56
- + movu [r0+SIZEOF_PIXEL*2*17], m3 ; level[17..20]
- + mova [r0+SIZEOF_PIXEL*2*32], m7 ; level[32..35]
- + movu [r0+SIZEOF_PIXEL*2*10], m6 ; level[10..13]
- + movu [r0+SIZEOF_PIXEL*2*21], m5 ; level[21..24]
- + mova m3, m0 ; 45 44 43 42
- + mova m7, m2 ; 59 58 57 56
- + punpckl%5 m0, m4 ; 36 43 35 42
- + punpckl%5 m2, m1 ; 50 57 49 56
- + punpckh%5 m3, m4 ; 38 45 37 44
- + punpckh%5 m7, m1 ; 52 59 51 58
- + mova m4, m2 ; 50 57 49 56
- + mova m1, m7 ; 52 59 51 58
- + punpckh%4 m2, m0 ; 36 43 50 57
- + punpckh%4 m7, m3 ; 38 45 52 59
- + punpckl%4 m4, m0 ; 35 42 49 56
- + punpckl%4 m1, m3 ; 37 44 51 58
- + pshuf%6 m2, m2, 0x1b ; 57 50 43 36
- + pshuf%6 m7, m7, 0x1b ; 59 52 45 38
- + mova [r0+SIZEOF_PIXEL*2*28], m4 ; level[28..31]
- + movu [r0+SIZEOF_PIXEL*2*43], m1 ; level[43..46]
- + movu [r0+SIZEOF_PIXEL*2*39], m2 ; level[39..42]
- + movu [r0+SIZEOF_PIXEL*2*50], m7 ; level[50..53]
- RET
- +%endmacro
- +
- +%ifdef HIGH_BIT_DEPTH
- +INIT_XMM
- +SCAN_8x8_FRAME sse2 , 4 , dq, qdq, dq, d
- +%else
- +INIT_MMX
- +SCAN_8x8_FRAME mmxext, 16, q , dq , wd, w
- +%endif
- ;-----------------------------------------------------------------------------
- -; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
- +; void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[4][4] )
- ;-----------------------------------------------------------------------------
- -cglobal zigzag_scan_4x4_frame_mmx, 2,2
- - movq mm0, [r1]
- - movq mm1, [r1+8]
- - movq mm2, [r1+16]
- - movq mm3, [r1+24]
- - movq mm4, mm0
- - movq mm5, mm1
- - movq mm6, mm2
- - movq mm7, mm3
- - psllq mm3, 16
- - psrlq mm0, 16
- - punpckldq mm2, mm2
- - punpckhdq mm1, mm1
- - punpcklwd mm4, mm5
- - punpcklwd mm5, mm3
- - punpckldq mm4, mm0
- - punpckhwd mm5, mm2
- - punpckhwd mm0, mm6
- - punpckhwd mm6, mm7
- - punpcklwd mm1, mm0
- - punpckhdq mm3, mm6
- - movq [r0], mm4
- - movq [r0+8], mm5
- - movq [r0+16], mm1
- - movq [r0+24], mm3
- +%macro SCAN_4x4 5 ; %1=cpu suffix, %2=shift count for one coef (psll/psrl%3), %3=shift suffix, %4=punpck suffix moving coef pairs, %5=punpck suffix interleaving single coefs
- +cglobal zigzag_scan_4x4_frame_%1, 2,2,8*(mmsize)/16 ; r0 = level (out), r1 = dct (in). NOTE(review): sibling macros write 8*(mmsize/16); this form evaluates to 4 rather than 0 when mmsize is 8 -- confirm intent
- + mova m0, [r1] ; 03 02 01 00 (dct coef indices, highest lane first)
- + mova m1, [r1+SIZEOF_PIXEL* 8] ; 07 06 05 04
- + mova m2, [r1+SIZEOF_PIXEL*16] ; 11 10 09 08
- + mova m3, [r1+SIZEOF_PIXEL*24] ; 15 14 13 12
- + mova m4, m0 ; 03 02 01 00
- + mova m5, m1 ; 07 06 05 04
- + mova m6, m2 ; 11 10 09 08
- + mova m7, m3 ; 15 14 13 12
- + psll%3 m3, %2 ; 14 13 12 XX
- + psrl%3 m0, %2 ; XX 03 02 01
- + punpckl%4 m2, m2 ; 09 08 09 08
- + punpckh%4 m1, m1 ; 07 06 07 06
- + punpckl%5 m4, m5 ; 05 01 04 00
- + punpckl%5 m5, m3 ; 12 05 XX 04
- + punpckl%4 m4, m0 ; 02 01 04 00
- + punpckh%5 m5, m2 ; 09 12 08 05
- + punpckh%5 m0, m6 ; 11 XX 10 03
- + punpckh%5 m6, m7 ; 15 11 14 10
- + punpckl%5 m1, m0 ; 10 07 03 06
- + punpckh%4 m3, m6 ; 15 11 14 13
- + mova [r0], m4 ; level[0..3]
- + mova [r0+SIZEOF_PIXEL* 8], m5 ; level[4..7]
- + mova [r0+SIZEOF_PIXEL*16], m1 ; level[8..11]
- + mova [r0+SIZEOF_PIXEL*24], m3 ; level[12..15]
- RET
- +%endmacro
- +
- +%ifdef HIGH_BIT_DEPTH
- +INIT_XMM
- +SCAN_4x4 sse2, 4 , dq, qdq, dq
- +%else
- +INIT_MMX
- +SCAN_4x4 mmx , 16, q , dq , wd
- +%endif
- ;-----------------------------------------------------------------------------
- ; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
- @@ -1039,6 +1061,25 @@ cglobal zigzag_scan_4x4_frame_ssse3, 2,2
- movdqa [r0+16], xmm1
- RET
- +%ifdef HIGH_BIT_DEPTH
- +INIT_XMM
- +;-----------------------------------------------------------------------------
- +; void zigzag_scan_4x4_field( int32_t level[16], int32_t dct[4][4] )
- +;-----------------------------------------------------------------------------
- +cglobal zigzag_scan_4x4_field_sse2, 2,3 ; r0 = level (out), r1 = dct (in), 32-bit coefs. NOTE(review): 3 registers requested but r2 is never referenced in this body -- confirm
- + movu m4, [r1+8] ; 05 04 03 02 (highest lane first; offset 8 is not 16-byte aligned, hence movu)
- + pshufd m0, m4, 0xd2 ; 05 03 02 04
- + mova m1, [r1+32] ; 11 10 09 08
- + mova m2, [r1+48] ; 15 14 13 12
- + movu [r0+8], m0 ; level[2..5]
- + mova [r0+32], m1 ; level[8..11], copied unchanged
- + mova [r0+48], m2 ; level[12..15], copied unchanged
- + movq mm0, [r1] ; 01 00, 64-bit copy through MMX register mm0
- + movq [r0], mm0 ; level[0..1]
- + movq mm0, [r1+24] ; 07 06
- + movq [r0+24], mm0 ; level[6..7]
- + RET
- +%else
- ;-----------------------------------------------------------------------------
- ; void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
- ;-----------------------------------------------------------------------------
- @@ -1055,11 +1096,11 @@ cglobal zigzag_scan_4x4_field_mmxext, 2,3
- mov r2d, [r1+12]
- mov [r0+12], r2d
- RET
- +%endif ; HIGH_BIT_DEPTH
- ;-----------------------------------------------------------------------------
- ; void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
- ;-----------------------------------------------------------------------------
- -
- ; Output order:
- ; 0 1 2 8 9 3 4 10
- ; 16 11 5 6 7 12 17 24
- @@ -1069,84 +1110,93 @@ cglobal zigzag_scan_4x4_field_mmxext, 2,3
- ; 36 37 38 39 43 49 50 44
- ; 45 46 47 51 56 57 52 53
- ; 54 55 58 59 60 61 62 63
- -
- -cglobal zigzag_scan_8x8_field_mmxext, 2,3
- - movq mm0, [r1+2*0] ; 03 02 01 00
- - movq mm1, [r1+2*4] ; 07 06 05 04
- - movq mm2, [r1+2*8] ; 11 10 09 08
- - pshufw mm3, mm0, 011111111b ; 03 03 03 03
- - movd r2, mm2 ; 09 08
- - pshufw mm2, mm2, 000111001b ; 08 11 10 09
- - punpcklwd mm3, mm1 ; 05 03 04 03
- - pinsrw mm0, r2, 3 ; 08 02 01 00
- - movq mm4, mm2
- - punpcklwd mm2, mm3 ; 04 10 03 09
- - pshufw mm2, mm2, 010110100b ; 10 04 03 09
- - movq [r0+2*0], mm0 ; 08 02 01 00
- - movq [r0+2*4], mm2 ; 10 04 03 09
- - movq mm3, [r1+2*12] ; 15 14 13 12
- - movq mm5, [r1+2*16] ; 19 18 17 16
- - punpckldq mm6, mm5 ; 17 16 XX XX
- - psrlq mm1, 16 ; XX 07 06 05
- - punpckhwd mm6, mm4 ; 08 17 11 16
- - punpckldq mm6, mm1 ; 06 05 11 16
- - movq [r0+2*8], mm6 ; 06 05 11 16
- - psrlq mm1, 16 ; XX XX 07 06
- - punpcklwd mm1, mm5 ; 17 07 16 06
- - movq mm0, [r1+2*20] ; 23 22 21 20
- - movq mm2, [r1+2*24] ; 27 26 25 24
- - movq mm6, mm3
- - punpckhdq mm1, mm1 ; 17 07 17 07
- - punpcklwd mm6, mm2 ; 25 13 24 12
- - pextrw r2, mm5, 2
- - movq [r0+2*24], mm0 ; 23 22 21 20
- - punpcklwd mm1, mm6 ; 24 17 12 07
- - movq [r0+2*12], mm1
- - pinsrw mm3, r2, 0 ; 15 14 13 18
- - movq [r0+2*16], mm3 ; 15 14 13 18
- - movq mm7, [r1+2*28]
- - movq mm0, [r1+2*32] ; 35 34 33 32
- - psrlq mm5, 48 ; XX XX XX 19
- - pshufw mm1, mm2, 011111001b ; 27 27 26 25
- - punpcklwd mm5, mm0 ; 33 XX 32 19
- - psrlq mm2, 48 ; XX XX XX 27
- - punpcklwd mm5, mm1 ; 26 32 25 19
- - movq [r0+2*32], mm7
- - movq [r0+2*20], mm5 ; 26 32 25 19
- - movq mm7, [r1+2*36]
- - movq mm1, [r1+2*40] ; 43 42 41 40
- - pshufw mm3, mm0, 011111001b ; 35 35 34 33
- - punpcklwd mm2, mm1 ; 41 XX 40 27
- - movq [r0+2*40], mm7
- - punpcklwd mm2, mm3 ; 34 40 33 27
- - movq [r0+2*28], mm2
- - movq mm7, [r1+2*44] ; 47 46 45 44
- - movq mm2, [r1+2*48] ; 51 50 49 48
- - psrlq mm0, 48 ; XX XX XX 35
- - punpcklwd mm0, mm2 ; 49 XX 48 35
- - pshufw mm3, mm1, 011111001b ; 43 43 42 41
- - punpcklwd mm0, mm3 ; 42 48 41 35
- - movq [r0+2*36], mm0
- - pextrw r2, mm2, 3 ; 51
- - psrlq mm1, 48 ; XX XX XX 43
- - punpcklwd mm1, mm7 ; 45 XX 44 43
- - psrlq mm2, 16 ; XX 51 50 49
- - punpcklwd mm1, mm2 ; 50 44 49 43
- - pshufw mm1, mm1, 010110100b ; 44 50 49 43
- - movq [r0+2*44], mm1
- - psrlq mm7, 16 ; XX 47 46 45
- - pinsrw mm7, r2, 3 ; 51 47 46 45
- - movq [r0+2*48], mm7
- - movq mm0, [r1+2*56] ; 59 58 57 56
- - movq mm1, [r1+2*52] ; 55 54 53 52
- - movq mm2, mm0
- - movq mm7, [r1+2*60]
- - punpckldq mm2, mm1 ; 53 52 57 56
- - punpckhdq mm1, mm0 ; 59 58 55 54
- - movq [r0+2*52], mm2
- - movq [r0+2*56], mm1
- - movq [r0+2*60], mm7
- +%undef SCAN_8x8
- +%macro SCAN_8x8 6 ; %1=cpu suffix, %2=coef suffix for pshuf/pinsr/pextr, %3=punpck suffix interleaving single coefs, %4=punpck suffix moving coef pairs, %5=psrl suffix, %6=psrl count for one coef
- +cglobal zigzag_scan_8x8_field_%1, 2,3,8*(mmsize/16) ; r0 = level (out), r1 = dct (in), r2 = scratch for single coefs
- + mova m0, [r1+SIZEOF_PIXEL*2*0] ; 03 02 01 00
- + mova m1, [r1+SIZEOF_PIXEL*2*4] ; 07 06 05 04
- + mova m2, [r1+SIZEOF_PIXEL*2*8] ; 11 10 09 08
- + pshuf%2 m3, m0, 011111111b ; 03 03 03 03
- + movd r2, m2 ; 09 08
- + pshuf%2 m2, m2, 000111001b ; 08 11 10 09
- + punpckl%3 m3, m1 ; 05 03 04 03
- + pinsr%2 m0, r2d, 3 ; 08 02 01 00
- + mova m4, m2 ; 08 11 10 09
- + punpckl%3 m2, m3 ; 04 10 03 09
- + pshuf%2 m2, m2, 010110100b ; 10 04 03 09
- + mova [r0+SIZEOF_PIXEL*2*0], m0 ; 08 02 01 00
- + mova [r0+SIZEOF_PIXEL*2*4], m2 ; 10 04 03 09
- + mova m3, [r1+SIZEOF_PIXEL*2*12] ; 15 14 13 12
- + mova m5, [r1+SIZEOF_PIXEL*2*16] ; 19 18 17 16
- + punpckl%4 m6, m5 ; 17 16 XX XX
- + psrl%5 m1, %6 ; XX 07 06 05
- + punpckh%3 m6, m4 ; 08 17 11 16
- + punpckl%4 m6, m1 ; 06 05 11 16
- + mova [r0+SIZEOF_PIXEL*2*8], m6 ; 06 05 11 16
- + psrl%5 m1, %6 ; XX XX 07 06
- + punpckl%3 m1, m5 ; 17 07 16 06
- + mova m0, [r1+SIZEOF_PIXEL*2*20] ; 23 22 21 20
- + mova m2, [r1+SIZEOF_PIXEL*2*24] ; 27 26 25 24
- + mova m6, m3 ; 15 14 13 12
- + punpckh%4 m1, m1 ; 17 07 17 07
- + punpckl%3 m6, m2 ; 25 13 24 12
- + pextr%2 r2d, m5, 2 ; 18
- + mova [r0+SIZEOF_PIXEL*2*24], m0 ; 23 22 21 20
- + punpckl%3 m1, m6 ; 24 17 12 07
- + mova [r0+SIZEOF_PIXEL*2*12], m1 ; 24 17 12 07
- + pinsr%2 m3, r2d, 0 ; 15 14 13 18
- + mova [r0+SIZEOF_PIXEL*2*16], m3 ; 15 14 13 18
- + mova m7, [r1+SIZEOF_PIXEL*2*28] ; 31 30 29 28
- + mova m0, [r1+SIZEOF_PIXEL*2*32] ; 35 34 33 32
- + psrl%5 m5, %6*3 ; XX XX XX 19
- + pshuf%2 m1, m2, 011111001b ; 27 27 26 25
- + punpckl%3 m5, m0 ; 33 XX 32 19
- + psrl%5 m2, %6*3 ; XX XX XX 27
- + punpckl%3 m5, m1 ; 26 32 25 19
- + mova [r0+SIZEOF_PIXEL*2*32], m7 ; 31 30 29 28
- + mova [r0+SIZEOF_PIXEL*2*20], m5 ; 26 32 25 19
- + mova m7, [r1+SIZEOF_PIXEL*2*36] ; 39 38 37 36
- + mova m1, [r1+SIZEOF_PIXEL*2*40] ; 43 42 41 40
- + pshuf%2 m3, m0, 011111001b ; 35 35 34 33
- + punpckl%3 m2, m1 ; 41 XX 40 27
- + mova [r0+SIZEOF_PIXEL*2*40], m7 ; 39 38 37 36
- + punpckl%3 m2, m3 ; 34 40 33 27
- + mova [r0+SIZEOF_PIXEL*2*28], m2 ; 34 40 33 27
- + mova m7, [r1+SIZEOF_PIXEL*2*44] ; 47 46 45 44
- + mova m2, [r1+SIZEOF_PIXEL*2*48] ; 51 50 49 48
- + psrl%5 m0, %6*3 ; XX XX XX 35
- + punpckl%3 m0, m2 ; 49 XX 48 35
- + pshuf%2 m3, m1, 011111001b ; 43 43 42 41
- + punpckl%3 m0, m3 ; 42 48 41 35
- + mova [r0+SIZEOF_PIXEL*2*36], m0 ; 42 48 41 35
- + pextr%2 r2d, m2, 3 ; 51
- + psrl%5 m1, %6*3 ; XX XX XX 43
- + punpckl%3 m1, m7 ; 45 XX 44 43
- + psrl%5 m2, %6 ; XX 51 50 49
- + punpckl%3 m1, m2 ; 50 44 49 43
- + pshuf%2 m1, m1, 010110100b ; 44 50 49 43
- + mova [r0+SIZEOF_PIXEL*2*44], m1 ; 44 50 49 43
- + psrl%5 m7, %6 ; XX 47 46 45
- + pinsr%2 m7, r2d, 3 ; 51 47 46 45
- + mova [r0+SIZEOF_PIXEL*2*48], m7 ; 51 47 46 45
- + mova m0, [r1+SIZEOF_PIXEL*2*56] ; 59 58 57 56
- + mova m1, [r1+SIZEOF_PIXEL*2*52] ; 55 54 53 52
- + mova m2, m0 ; 59 58 57 56
- + mova m7, [r1+SIZEOF_PIXEL*2*60] ; 63 62 61 60
- + punpckl%4 m2, m1 ; 53 52 57 56
- + punpckh%4 m1, m0 ; 59 58 55 54
- + mova [r0+SIZEOF_PIXEL*2*52], m2 ; 53 52 57 56
- + mova [r0+SIZEOF_PIXEL*2*56], m1 ; 59 58 55 54
- + mova [r0+SIZEOF_PIXEL*2*60], m7 ; 63 62 61 60
- RET
- +%endmacro
- +%ifdef HIGH_BIT_DEPTH
- +INIT_XMM
- +SCAN_8x8 sse4 , d, dq, qdq, dq, 4
- +%else
- +INIT_MMX
- +SCAN_8x8 mmxext, w, wd, dq , q , 16
- +%endif
- ;-----------------------------------------------------------------------------
- ; void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *src, uint8_t *dst )
- diff --git a/common/x86/dct.h b/common/x86/dct.h
- index bb8c250..54a6e44 100644
- --- a/common/x86/dct.h
- +++ b/common/x86/dct.h
- @@ -72,11 +72,14 @@ void x264_add8x8_idct8_sse2 ( uint8_t *dst, int16_t dct [64] );
- void x264_add16x16_idct8_sse2( uint8_t *dst, int16_t dct[4][64] );
- void x264_zigzag_scan_8x8_frame_ssse3 ( int16_t level[64], int16_t dct[64] );
- -void x264_zigzag_scan_8x8_frame_sse2 ( int16_t level[64], int16_t dct[64] );
- +void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] );
- void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[64] );
- void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[16] );
- +void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] );
- void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] );
- +void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] );
- void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[16] );
- +void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] );
- void x264_zigzag_scan_8x8_field_mmxext( int16_t level[64], int16_t dct[64] );
- int x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
- int x264_zigzag_sub_4x4ac_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
- --
- 1.7.2.3
- From 433adb378e46005187df9867e1e0e0c41df8dbc3 Mon Sep 17 00:00:00 2001
- From: Daniel Kang <daniel.d.kang@gmail.com>
- Date: Wed, 8 Dec 2010 17:56:22 -0500
- Subject: [PATCH 3/5] dequant4x4, dequant8x8, dequant_4x4dc sse2, sse4
- ---
- common/quant.c | 6 ++
- common/x86/const-a.asm | 1 +
- common/x86/quant-a.asm | 157 ++++++++++++++++++++++++++++++++++++++----------
- common/x86/quant.h | 9 ++-
- tools/checkasm.c | 6 +-
- 5 files changed, 141 insertions(+), 38 deletions(-)
- diff --git a/common/quant.c b/common/quant.c
- index 816e60a..b8a707d 100644
- --- a/common/quant.c
- +++ b/common/quant.c
- @@ -322,6 +322,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
- pf->quant_8x8 = x264_quant_8x8_sse2;
- pf->quant_2x2_dc = x264_quant_2x2_dc_sse2;
- pf->quant_4x4_dc = x264_quant_4x4_dc_sse2;
- + pf->dequant_4x4 = x264_dequant_4x4_sse2;
- + pf->dequant_8x8 = x264_dequant_8x8_sse2;
- + pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2;
- pf->denoise_dct = x264_denoise_dct_sse2;
- pf->decimate_score15 = x264_decimate_score15_sse2;
- pf->decimate_score16 = x264_decimate_score16_sse2;
- @@ -367,6 +370,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
- pf->quant_4x4_dc = x264_quant_4x4_dc_sse4;
- pf->quant_4x4 = x264_quant_4x4_sse4;
- pf->quant_8x8 = x264_quant_8x8_sse4;
- + pf->dequant_4x4 = x264_dequant_4x4_sse4;
- + pf->dequant_8x8 = x264_dequant_8x8_sse4;
- + pf->dequant_4x4_dc = x264_dequant_4x4dc_sse4;
- }
- #endif // HAVE_MMX
- #else // !HIGH_BIT_DEPTH
- diff --git a/common/x86/const-a.asm b/common/x86/const-a.asm
- index d6e621e..8e92f52 100644
- --- a/common/x86/const-a.asm
- +++ b/common/x86/const-a.asm
- @@ -52,6 +52,7 @@ const pw_pixel_max,times 8 dw ((1 << BIT_DEPTH)-1)
- const pd_1, times 4 dd 1
- const pd_32, times 4 dd 32
- const pd_128, times 4 dd 128
- +const pd_ffff, times 4 dd 0xffff
- const pw_00ff, times 8 dw 0x00ff
- const pw_ff00, times 8 dw 0xff00
- diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
- index 5d7a15e..fc52ace 100644
- --- a/common/x86/quant-a.asm
- +++ b/common/x86/quant-a.asm
- @@ -75,6 +75,7 @@ cextern pb_1
- cextern pw_1
- cextern pd_1
- cextern pb_01
- +cextern pd_ffff
- %macro QUANT_DC_START_MMX 0
- movd m6, r1m ; mf
- @@ -469,26 +470,52 @@ QUANT_AC quant_8x8_sse4, 8
- ; dequant
- ;=============================================================================
- -%macro DEQUANT16_L 3
- +%macro DEQUANT16_L 5
- ;;; %1 dct[y][x]
- ;;; %2,%3 dequant_mf[i_mf][y][x]
- ;;; m2 i_qbits
- -
- mova m0, %2
- +%ifdef HIGH_BIT_DEPTH
- +%ifidn %5,sse4
- + pmulld m0, %1
- +%else
- + mova m4, %1
- + mova m3, m4
- + pand m4, [pd_ffff]
- + pslldq m4, 2
- + por m3, m4
- + pmaddwd m0, m3
- +%endif
- +%else
- packssdw m0, %3
- pmullw m0, %1
- - psllw m0, m2
- +%endif
- + psll%4 m0, m2
- mova %1, m0
- %endmacro
- -%macro DEQUANT32_R 3
- +%macro DEQUANT32_R 5
- ;;; %1 dct[y][x]
- ;;; %2,%3 dequant_mf[i_mf][y][x]
- ;;; m2 -i_qbits
- ;;; m3 f
- ;;; m4 0
- -
- mova m0, %1
- +%ifdef HIGH_BIT_DEPTH
- +%ifidn %5,sse4
- + pmulld m0, %2
- + paddd m0, m3
- + psrad m0, m2
- +%else
- + mova m4, m0
- + pand m0, [pd_ffff]
- + pslldq m0, 2
- + por m0, m4
- + pmaddwd m0, %2
- + paddd m0, m3
- + psrad m0, m2
- +%endif
- +%else
- mova m1, m0
- punpcklwd m0, m4
- punpckhwd m1, m4
- @@ -499,21 +526,22 @@ QUANT_AC quant_8x8_sse4, 8
- psrad m0, m2
- psrad m1, m2
- packssdw m0, m1
- +%endif
- mova %1, m0
- %endmacro
- -%macro DEQUANT_LOOP 3
- +%macro DEQUANT_LOOP 5
- %if 8*(%2-2*%3)
- mov t0d, 8*(%2-2*%3)
- %%loop:
- - %1 [r0+t0+8*%3], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
- - %1 [r0+t0 ], [r1+t0*2 ], [r1+t0*2+ 8*%3]
- + %1 [r0+(t0 )*SIZEOF_PIXEL], [r1+t0*2 ], [r1+t0*2+ 8*%3], %4, %5
- + %1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3], %4, %5
- sub t0d, 16*%3
- jge %%loop
- REP_RET
- %else
- - %1 [r0+8*%3], [r1+16*%3], [r1+24*%3]
- - %1 [r0 ], [r1 ], [r1+ 8*%3]
- + %1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3], %4, %5
- + %1 [r0+(0 )*SIZEOF_PIXEL], [r1+0 ], [r1+ 8*%3], %4, %5
- RET
- %endif
- %endmacro
- @@ -562,16 +590,16 @@ QUANT_AC quant_8x8_sse4, 8
- %endmacro
- ;-----------------------------------------------------------------------------
- -; void dequant_4x4( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
- +; void dequant_4x4( dctcoef dct[4][4], int dequant_mf[6][4][4], int i_qp )
- ;-----------------------------------------------------------------------------
- -%macro DEQUANT 4
- -cglobal dequant_%2x%2_%1, 0,3
- +%macro DEQUANT 5
- +cglobal dequant_%2x%2_%1, 0,3,6*(mmsize/16)
- .skip_prologue:
- DEQUANT_START %3+2, %3
- .lshift:
- movd m2, t0d
- - DEQUANT_LOOP DEQUANT16_L, %2*%2/4, %4
- + DEQUANT_LOOP DEQUANT16_L, %2*%2/4, %4, %5, %1
- .rshift32:
- neg t0d
- @@ -580,7 +608,7 @@ cglobal dequant_%2x%2_%1, 0,3
- pxor m4, m4
- pslld m3, m2
- psrld m3, 1
- - DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4
- + DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4, %5, %1
- cglobal dequant_%2x%2_flat16_%1, 0,3
- movifnidn t2d, r2m
- @@ -623,23 +651,60 @@ cglobal dequant_%2x%2_flat16_%1, 0,3
- RET
- %endmacro ; DEQUANT
- +%ifdef HIGH_BIT_DEPTH
- +INIT_XMM
- +DEQUANT sse2, 4, 4, 1, d
- +DEQUANT sse4, 4, 4, 1, d
- +DEQUANT sse2, 8, 6, 1, d
- +DEQUANT sse4, 8, 6, 1, d
- +%else
- %ifndef ARCH_X86_64
- INIT_MMX
- -DEQUANT mmx, 4, 4, 1
- -DEQUANT mmx, 8, 6, 1
- +DEQUANT mmx, 4, 4, 1, w
- +DEQUANT mmx, 8, 6, 1, w
- %endif
- INIT_XMM
- -DEQUANT sse2, 4, 4, 2
- -DEQUANT sse2, 8, 6, 2
- +DEQUANT sse2, 4, 4, 2, w
- +DEQUANT sse2, 8, 6, 2, w
- +%endif
- -%macro DEQUANT_DC 1
- -cglobal dequant_4x4dc_%1, 0,3
- +%macro DEQUANT_DC 2
- +cglobal dequant_4x4dc_%1, 0,3,6*(mmsize/16)
- DEQUANT_START 6, 6
- .lshift:
- - movd m3, [r1]
- - movd m2, t0d
- - pslld m3, m2
- + movd m3, [r1]
- + movd m2, t0d
- + pslld m3, m2
- +%ifdef HIGH_BIT_DEPTH
- + pshufd m3, m3, 0
- + mova m5, [pd_ffff]
- +%assign x 0
- +%rep SIZEOF_PIXEL*16/mmsize
- + mova m0, [r0+mmsize*0+x]
- + mova m1, [r0+mmsize*1+x]
- +%ifidn %1,sse4
- + pmull%2 m0, m3
- + pmull%2 m1, m3
- +%else
- + mova m2, m0
- + pand m2, m5
- + pslldq m2, 2
- + por m0, m2
- + pmaddwd m0, m3
- +
- + mova m4, m1
- + pand m4, m5
- + pslldq m4, 2
- + por m1, m4
- + pmaddwd m1, m3
- +%endif
- + mova [r0+mmsize*0+x], m0
- + mova [r0+mmsize*1+x], m1
- +%assign x x+mmsize*2
- +%endrep
- +
- +%else
- %if mmsize==16
- pshuflw m3, m3, 0
- punpcklqdq m3, m3
- @@ -647,33 +712,54 @@ cglobal dequant_4x4dc_%1, 0,3
- pshufw m3, m3, 0
- %endif
- %assign x 0
- -%rep 16/mmsize
- +%rep SIZEOF_PIXEL*16/mmsize
- mova m0, [r0+mmsize*0+x]
- mova m1, [r0+mmsize*1+x]
- - pmullw m0, m3
- - pmullw m1, m3
- + pmull%2 m0, m3
- + pmull%2 m1, m3
- mova [r0+mmsize*0+x], m0
- mova [r0+mmsize*1+x], m1
- %assign x x+mmsize*2
- %endrep
- +%endif ; HIGH_BIT_DEPTH
- RET
- .rshift32:
- neg t0d
- movd m3, t0d
- - mova m4, [pw_1]
- + mova m4, [p%2_1]
- mova m5, m4
- pslld m4, m3
- psrld m4, 1
- movd m2, [r1]
- +%assign x 0
- +%ifdef HIGH_BIT_DEPTH
- + pshufd m2, m2, 0
- +%rep SIZEOF_PIXEL*32/mmsize
- + mova m0, [r0+x]
- +%ifidn %1,sse4
- + pmulld m0, m2
- +%else
- + mova m1, m0
- + pand m1, [pd_ffff]
- + pslldq m1, 2
- + por m0, m1
- + pmaddwd m0, m2
- +%endif
- + paddd m0, m4
- + psrad m0, m3
- + mova [r0+x], m0
- +%assign x x+mmsize
- +%endrep
- +
- +%else
- %if mmsize==8
- punpcklwd m2, m2
- %else
- pshuflw m2, m2, 0
- %endif
- punpcklwd m2, m4
- -%assign x 0
- -%rep 32/mmsize
- +%rep SIZEOF_PIXEL*32/mmsize
- mova m0, [r0+x]
- mova m1, m0
- punpcklwd m0, m5
- @@ -686,13 +772,20 @@ cglobal dequant_4x4dc_%1, 0,3
- mova [r0+x], m0
- %assign x x+mmsize
- %endrep
- +%endif
- RET
- %endmacro
- +%ifdef HIGH_BIT_DEPTH
- +INIT_XMM
- +DEQUANT_DC sse2 , d
- +DEQUANT_DC sse4 , d
- +%else
- INIT_MMX
- -DEQUANT_DC mmxext
- +DEQUANT_DC mmxext, w
- INIT_XMM
- -DEQUANT_DC sse2
- +DEQUANT_DC sse2 , w
- +%endif
- %ifdef HIGH_BIT_DEPTH
- ;-----------------------------------------------------------------------------
- diff --git a/common/x86/quant.h b/common/x86/quant.h
- index a28099c..56e9847 100644
- --- a/common/x86/quant.h
- +++ b/common/x86/quant.h
- @@ -47,9 +47,9 @@ int x264_quant_8x8_sse4( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
- void x264_dequant_4x4_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
- void x264_dequant_4x4dc_mmxext( int16_t dct[16], int dequant_mf[6][16], int i_qp );
- void x264_dequant_8x8_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
- -void x264_dequant_4x4_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
- -void x264_dequant_4x4dc_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
- -void x264_dequant_8x8_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
- +void x264_dequant_4x4_sse2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
- +void x264_dequant_4x4dc_sse2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
- +void x264_dequant_8x8_sse2( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
- void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
- void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
- void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
- @@ -57,6 +57,9 @@ void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i
- void x264_denoise_dct_mmx( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
- void x264_denoise_dct_sse2( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
- void x264_denoise_dct_ssse3( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
- +void x264_dequant_4x4_sse4( int32_t dct[16], int dequant_mf[6][16], int i_qp );
- +void x264_dequant_4x4dc_sse4( int32_t dct[16], int dequant_mf[6][16], int i_qp );
- +void x264_dequant_8x8_sse4( int32_t dct[64], int dequant_mf[6][64], int i_qp );
- int x264_decimate_score15_mmxext( dctcoef *dct );
- int x264_decimate_score15_sse2 ( dctcoef *dct );
- int x264_decimate_score15_ssse3 ( dctcoef *dct );
- diff --git a/tools/checkasm.c b/tools/checkasm.c
- index 020bcab..4a05d2b 100644
- --- a/tools/checkasm.c
- +++ b/tools/checkasm.c
- @@ -677,8 +677,8 @@ static int check_dct( int cpu_ref, int cpu_new )
- for( int i = 0; i < 16 && ok; i++ )\
- {\
- for( int j = 0; j < 16; j++ )\
- - dct1[0][j] = !i ? (j^j>>1^j>>2^j>>3)&1 ? 4080 : -4080 /* max dc */\
- - : i<8 ? (*p++)&1 ? 4080 : -4080 /* max elements */\
- + dct1[0][j] = !i ? (j^j>>1^j>>2^j>>3)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max dc */\
- + : i<8 ? (*p++)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max elements */\
- : ((*p++)&0x1fff)-0x1000; /* general case */\
- memcpy( dct2, dct1, 16 * sizeof(dctcoef) );\
- call_c1( dct_c.name, dct1[0] );\
- @@ -1533,7 +1533,7 @@ static int check_quant( int cpu_ref, int cpu_new )
- for( int qp = QP_MAX; qp > 0; qp-- ) \
- { \
- for( int i = 0; i < 16; i++ ) \
- - dct1[i] = rand(); \
- + dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16; \
- call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \
- memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \
- call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
- --
- 1.7.2.3
- From 8cbd80ecb90a46fbfcc2e8a6ba2602c358f07306 Mon Sep 17 00:00:00 2001
- From: Daniel Kang <daniel.d.kang@gmail.com>
- Date: Fri, 10 Dec 2010 19:45:09 -0500
- Subject: [PATCH 4/5] x264_weight_cache_mmxext
- ---
- common/x86/mc-a.asm | 12 ++----------
- common/x86/mc-c.c | 18 +++++++++++++++++-
- 2 files changed, 19 insertions(+), 11 deletions(-)
- diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
- index 22fb872..e436148 100644
- --- a/common/x86/mc-a.asm
- +++ b/common/x86/mc-a.asm
- @@ -188,18 +188,10 @@ AVG_WEIGHT ssse3, 16, 7
- %ifdef HIGH_BIT_DEPTH
- %macro WEIGHT_START 1 ; (width)
- + mova m0, [r4+ 0] ; 1<<denom
- + mova m3, [r4+16]
- movd m2, [r4+32] ; denom
- - movd m3, [r4+36] ; scale
- - mov TMP_REG, [r4+40] ; offset
- - mova m0, [pw_1]
- - shl TMP_REG, BIT_DEPTH-7
- mova m4, [pw_pixel_max]
- - add TMP_REG, 1
- - psllw m0, m2 ; 1<<denom
- - movd m1, TMP_REG ; 1+(offset<<(BIT_DEPTH-8+1))
- - psllw m3, 1 ; scale<<1
- - punpcklwd m3, m1
- - SPLATD m3, m3
- paddw m2, [sq_1] ; denom+1
- %endmacro
- diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
- index 6859e3c..47b3cf9 100644
- --- a/common/x86/mc-c.c
- +++ b/common/x86/mc-c.c
- @@ -221,6 +221,21 @@ MC_COPY_WTAB(sse2,mmx,mmx,sse2)
- #if HIGH_BIT_DEPTH
- MC_WEIGHT_WTAB(weight,mmxext,mmxext,mmxext,12)
- MC_WEIGHT_WTAB(weight,sse2,mmxext,sse2,12)
- +static void x264_weight_cache_mmxext( x264_t *h, x264_weight_t *w )
- +{
- + int i;
- + int16_t den1, den2, den3;
- +
- + w->weightfn = h->mc.weight;
- + den1 = 1<<w->i_denom;
- + den2 = w->i_scale<<1;
- + den3 = 1+(w->i_offset<<(BIT_DEPTH-8+1));
- + for( i = 0; i < 8; i++ )
- + {
- + w->cachea[i] = den1;
- + w->cacheb[i] = i&1 ? den3 : den2;
- + }
- +}
- #else
- MC_WEIGHT_WTAB(weight,mmxext,mmxext,mmxext,12)
- MC_WEIGHT_WTAB(offsetadd,mmxext,mmxext,mmxext,12)
- @@ -268,7 +283,7 @@ static void x264_weight_cache_ssse3( x264_t *h, x264_weight_t *w )
- }
- w->weightfn = h->mc.weight;
- den1 = w->i_scale << (8 - w->i_denom);
- - for(i = 0;i<8;i++)
- + for( i = 0; i < 8; i++ )
- {
- w->cachea[i] = den1;
- w->cacheb[i] = w->i_offset;
- @@ -459,6 +474,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
- pf->weight = x264_mc_weight_wtab_mmxext;
- #if HIGH_BIT_DEPTH
- + pf->weight_cache = x264_weight_cache_mmxext;
- if( !(cpu&X264_CPU_SSE2) )
- return;
- --
- 1.7.2.3
- From 42081042ed16d57e47d23ccdc7ae904e295794bc Mon Sep 17 00:00:00 2001
- From: Daniel Kang <daniel.d.kang@gmail.com>
- Date: Sat, 11 Dec 2010 15:30:35 -0500
- Subject: [PATCH 5/5] offset(add/sub), cosmetics
- ---
- common/x86/mc-a.asm | 32 +++++++++++++++++++++++++-------
- common/x86/mc-c.c | 32 +++++++++++++++++++++++++-------
- 2 files changed, 50 insertions(+), 14 deletions(-)
- diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
- index e436148..bb600de 100644
- --- a/common/x86/mc-a.asm
- +++ b/common/x86/mc-a.asm
- @@ -346,7 +346,7 @@ AVG_WEIGHT ssse3, 16, 7
- %endif ; HIGH_BIT_DEPTH
- ;-----------------------------------------------------------------------------
- -;void mc_weight_wX( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, weight_t *weight, int h )
- +;void mc_weight_wX( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, weight_t *weight, int h )
- ;-----------------------------------------------------------------------------
- %ifdef ARCH_X86_64
- @@ -407,34 +407,49 @@ WEIGHTER 20, ssse3
- %macro OFFSET_OP 7
- mov%6 m0, [%1]
- mov%6 m1, [%2]
- +%ifdef HIGH_BIT_DEPTH
- + p%5usw m0, m2
- + p%5usw m1, m2
- + pminsw m0, m3
- + pminsw m1, m3
- +%else
- p%5usb m0, m2
- p%5usb m1, m2
- - mov%7 [%3], m0
- - mov%7 [%4], m1
- +%endif
- + mov%7 [%3], m0
- + mov%7 [%4], m1
- %endmacro
- %macro OFFSET_TWO_ROW 4
- %assign x 0
- %rep %3
- -%if (%3-x) >= mmsize
- +%if (%3-x/SIZEOF_PIXEL) >= mmsize/SIZEOF_PIXEL
- OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a
- %assign x (x+mmsize)
- %else
- - OFFSET_OP (%1+x),(%1+x+r3), (%2+x), (%2+x+r1), %4, d, d
- +%ifdef HIGH_BIT_DEPTH
- + OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, h, h
- +%else
- + OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, d, d
- +%endif
- %exitrep
- %endif
- -%if x >= %3
- +%if x/SIZEOF_PIXEL >= %3
- %exitrep
- %endif
- %endrep
- %endmacro
- ;-----------------------------------------------------------------------------
- -;void mc_offset_wX( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, weight_t *w, int h )
- +;void mc_offset_wX( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride, weight_t *w, int h )
- ;-----------------------------------------------------------------------------
- %macro OFFSET 3
- cglobal mc_offset%3_w%1_%2, NUMREGS, NUMREGS
- + FIX_STRIDES r1, r3
- mova m2, [r4]
- +%ifdef HIGH_BIT_DEPTH
- + mova m3, [pw_pixel_max]
- +%endif
- LOAD_HEIGHT
- .loop:
- OFFSET_TWO_ROW r2, r0, %1, %3
- @@ -459,6 +474,9 @@ INIT_XMM
- OFFSETPN 12, sse2
- OFFSETPN 16, sse2
- OFFSETPN 20, sse2
- +%ifdef HIGH_BIT_DEPTH
- +OFFSETPN 8, sse2
- +%endif
- %undef LOAD_HEIGHT
- %undef HEIGHT_REG
- %undef NUMREGS
- diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
- index 47b3cf9..1d34ed2 100644
- --- a/common/x86/mc-c.c
- +++ b/common/x86/mc-c.c
- @@ -50,8 +50,8 @@ DECL_SUF( x264_pixel_avg_4x2, ( uint8_t *, int, uint8_t *, int, uint8_t *, int
- void x264_mc_weight_w##w##_##type( pixel *,int, pixel *,int, const x264_weight_t *,int );
- #define MC_WEIGHT_OFFSET(w,type) \
- - void x264_mc_offsetadd_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
- - void x264_mc_offsetsub_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
- + void x264_mc_offsetadd_w##w##_##type( pixel *,int, pixel *,int, const x264_weight_t *,int ); \
- + void x264_mc_offsetsub_w##w##_##type( pixel *,int, pixel *,int, const x264_weight_t *,int ); \
- MC_WEIGHT(w,type)
- MC_WEIGHT_OFFSET( 4, mmxext )
- @@ -62,6 +62,9 @@ MC_WEIGHT_OFFSET( 20, mmxext )
- MC_WEIGHT_OFFSET( 12, sse2 )
- MC_WEIGHT_OFFSET( 16, sse2 )
- MC_WEIGHT_OFFSET( 20, sse2 )
- +#if HIGH_BIT_DEPTH
- +MC_WEIGHT_OFFSET( 8, sse2 )
- +#endif
- MC_WEIGHT( 8, sse2 )
- MC_WEIGHT( 4, ssse3 )
- MC_WEIGHT( 8, ssse3 )
- @@ -220,12 +223,27 @@ MC_COPY_WTAB(sse2,mmx,mmx,sse2)
- #if HIGH_BIT_DEPTH
- MC_WEIGHT_WTAB(weight,mmxext,mmxext,mmxext,12)
- +MC_WEIGHT_WTAB(offsetadd,mmxext,mmxext,mmxext,12)
- +MC_WEIGHT_WTAB(offsetsub,mmxext,mmxext,mmxext,12)
- MC_WEIGHT_WTAB(weight,sse2,mmxext,sse2,12)
- +MC_WEIGHT_WTAB(offsetadd,sse2,mmxext,sse2,16)
- +MC_WEIGHT_WTAB(offsetsub,sse2,mmxext,sse2,16)
- +
- static void x264_weight_cache_mmxext( x264_t *h, x264_weight_t *w )
- {
- int i;
- int16_t den1, den2, den3;
- + if( w->i_scale == 1<<w->i_denom )
- + {
- + if( w->i_offset < 0 )
- + w->weightfn = h->mc.offsetsub;
- + else
- + w->weightfn = h->mc.offsetadd;
- + for( i = 0; i < 8; i++ )
- + w->cachea[i] = abs(w->i_offset<<(BIT_DEPTH-8));
- + return;
- + }
- w->weightfn = h->mc.weight;
- den1 = 1<<w->i_denom;
- den2 = w->i_scale<<1;
- @@ -472,9 +490,11 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
- pf->mc_chroma = x264_mc_chroma_mmxext;
- pf->hpel_filter = x264_hpel_filter_mmxext;
- pf->weight = x264_mc_weight_wtab_mmxext;
- + pf->weight_cache = x264_weight_cache_mmxext;
- + pf->offsetadd = x264_mc_offsetadd_wtab_mmxext;
- + pf->offsetsub = x264_mc_offsetsub_wtab_mmxext;
- #if HIGH_BIT_DEPTH
- - pf->weight_cache = x264_weight_cache_mmxext;
- if( !(cpu&X264_CPU_SSE2) )
- return;
- @@ -490,6 +510,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
- pf->integral_init4v = x264_integral_init4v_sse2;
- pf->integral_init8v = x264_integral_init8v_sse2;
- pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
- + pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
- + pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
- if( cpu&X264_CPU_SSE2_IS_SLOW )
- return;
- @@ -506,10 +528,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
- if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) )
- pf->integral_init4v = x264_integral_init4v_ssse3;
- #else // !HIGH_BIT_DEPTH
- - pf->offsetadd = x264_mc_offsetadd_wtab_mmxext;
- - pf->offsetsub = x264_mc_offsetsub_wtab_mmxext;
- - pf->weight_cache = x264_weight_cache_mmxext;
- -
- pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmxext;
- pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_mmxext;
- pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_mmxext;
- --
- 1.7.2.3
Advertisement
Add Comment
Please sign in to add a comment.
Advertisement