- diff --git a/common/dct.c b/common/dct.c
- index 669e24f..87b096a 100644
- --- a/common/dct.c
- +++ b/common/dct.c
- -230,6 +230,23 @@ static void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
- }
- }
- +/* Add one DC coefficient to every pixel of a 4x4 block, in place.
- + * p_dst points at the top-left pixel; rows are FDEC_STRIDE bytes apart.
- + * dc is rounded and scaled with (dc + 32) >> 6 before being added, and each
- + * sum goes through x264_clip_uint8() (defined elsewhere; by its name it
- + * clamps to 0..255) so the result matches the saturating MMX version. */
- +static void add4x4_idct_dc( uint8_t *p_dst, int16_t dc )
- +{
- + int i,j;
- + dc = (dc + 32) >> 6;
- + for( i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
- + for( j = 0; j < 4; j++)
- + /* Assign, don't accumulate: the clipped sum already includes p_dst[j]. */
- + p_dst[j] = x264_clip_uint8(p_dst[j]+dc);
- +}
- +
- +/* Add four DC coefficients to an 8x8 pixel area, one per 4x4 quadrant.
- + * dct2x2[0..3] are applied to the top-left, top-right, bottom-left and
- + * bottom-right 4x4 blocks, in that order. */
- +static void add8x8_idct_dc( uint8_t *p_dst, int16_t *dct2x2 )
- +{
- + int blk;
- + for( blk = 0; blk < 4; blk++ )
- + {
- + /* bit 1 of blk picks the row of blocks (0 or 4 lines down),
- + * bit 0 picks the column of blocks (0 or 4 pixels right) */
- + uint8_t *p_blk = p_dst + (blk>>1)*4*FDEC_STRIDE + (blk&1)*4;
- + add4x4_idct_dc( p_blk, dct2x2[blk] );
- + }
- +}
- +
- static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][4][4] )
- {
- add4x4_idct( &p_dst[0], dct[0] );
- -380,6 +397,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
- dctf->sub8x8_dct8 = sub8x8_dct8;
- dctf->add8x8_idct8 = add8x8_idct8;
- + dctf->add8x8_idct_dc = add8x8_idct_dc;
- dctf->sub16x16_dct8 = sub16x16_dct8;
- dctf->add16x16_idct8 = add16x16_idct8;
- -399,6 +417,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
- dctf->add4x4_idct = x264_add4x4_idct_mmx;
- dctf->add8x8_idct = x264_add8x8_idct_mmx;
- + dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx;
- dctf->add16x16_idct = x264_add16x16_idct_mmx;
- dctf->dct4x4dc = x264_dct4x4dc_mmx;
- diff --git a/common/dct.h b/common/dct.h
- index ee9d7d1..3ce4370 100644
- --- a/common/dct.h
- +++ b/common/dct.h
- -94,6 +94,8 @@ typedef struct
- void (*sub8x8_dct8) ( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 );
- void (*add8x8_idct8) ( uint8_t *p_dst, int16_t dct[8][8] );
- +
- + void (*add8x8_idct_dc) ( uint8_t *p_dst, int16_t *dct2x2 );
- void (*sub16x16_dct8) ( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
- void (*add16x16_idct8)( uint8_t *p_dst, int16_t dct[4][8][8] );
- diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
- index 77badda..d6579b6 100644
- --- a/common/x86/dct-a.asm
- +++ b/common/x86/dct-a.asm
- -306,7 +306,48 @@ cextern x264_add8x8_idct8_sse2
- SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2, 128, 8, 0, 0
- ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 0
- +;-----------------------------------------------------------------------------
- +; void add8x8_idct_dc( uint8_t *p_dst, int16_t *dct2x2 )
- +;-----------------------------------------------------------------------------
- +%macro ADD_DC 3 ; %1 = bytes to add, %2 = bytes to subtract, %3 = address of the first of 4 rows (8 pixels each)
- + movq mm4, [%3+FDEC_STRIDE*0] ; load 8 bytes from each of 4 rows, FDEC_STRIDE apart
- + movq mm5, [%3+FDEC_STRIDE*1]
- + movq mm6, [%3+FDEC_STRIDE*2]
- + movq mm7, [%3+FDEC_STRIDE*3]
- + paddusb mm4, %1 ; unsigned saturating add: results cap at 255
- + paddusb mm5, %1
- + paddusb mm6, %1
- + paddusb mm7, %1
- + psubusb mm4, %2 ; unsigned saturating subtract: results floor at 0
- + psubusb mm5, %2
- + psubusb mm6, %2
- + psubusb mm7, %2
- + movq [%3+FDEC_STRIDE*0], mm4 ; write the 4 rows back in place
- + movq [%3+FDEC_STRIDE*1], mm5
- + movq [%3+FDEC_STRIDE*2], mm6
- + movq [%3+FDEC_STRIDE*3], mm7
- +%endmacro
- +
- +cglobal x264_add8x8_idct_dc_mmx, 2,2,1 ; NOTE(review): r0 = p_dst, r1 = dct2x2 assumed from the C prototype -- confirm against x86inc's cglobal
- + movq mm0, [r1] ; mm0 = four 16-bit DC coefficients
- + pxor mm1, mm1 ; mm1 = 0
- + paddw mm0, [pw_32 GLOBAL] ; rounding term; pw_32 is presumably 32 in every word lane -- confirm
- + psraw mm0, 6 ; arithmetic shift: with the add above this mirrors (dc + 32) >> 6 in the C version
- + psubw mm1, mm0 ; mm1 = 0 - mm0 = negated DC values
- + packuswb mm0, mm0 ; unsigned-saturating pack: negative words become 0, so only positive DCs survive (max 255)
- + packuswb mm1, mm1 ; likewise only the magnitudes of negative DCs survive here
- + punpcklbw mm0, mm0 ; double each byte: dc0 dc0 dc1 dc1 dc2 dc2 dc3 dc3
- + punpckhbw mm1, mm1 ; same layout for the negated values (both halves are equal after packing a register with itself)
- + movq mm2, mm0 ; copies, expanded below for the second ADD_DC
- + movq mm3, mm1
- + punpcklbw mm0, mm0 ; low half doubled again: dc0 x4, dc1 x4
- + punpcklbw mm1, mm1
- + punpckhbw mm2, mm2 ; high half doubled again: dc2 x4, dc3 x4
- + punpckhbw mm3, mm3
- + ADD_DC mm0, mm1, r0 ; rows 0-3: add positive part, subtract negative part (one of the two is zero per pixel)
- + ADD_DC mm2, mm3, r0+FDEC_STRIDE*4 ; rows 4-7
- + ret
- ;-----------------------------------------------------------------------------
- ; void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] )
- diff --git a/common/x86/dct.h b/common/x86/dct.h
- index 859937c..785496b 100644
- --- a/common/x86/dct.h
- +++ b/common/x86/dct.h
- -31,6 +31,7 @@ void x264_sub16x16_dct_sse2( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix
- void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] );
- void x264_add8x8_idct_mmx( uint8_t *p_dst, int16_t dct[4][4][4] );
- +void x264_add8x8_idct_dc_mmx( uint8_t *p_dst, int16_t *dct2x2 );
- void x264_add16x16_idct_mmx( uint8_t *p_dst, int16_t dct[16][4][4] );
- void x264_add8x8_idct_sse2( uint8_t *p_dst, int16_t dct[4][4][4] );
- void x264_add16x16_idct_sse2( uint8_t *p_dst, int16_t dct[16][4][4] );
- diff --git a/encoder/macroblock.c b/encoder/macroblock.c
- index de94536..6f78c8d 100644
- --- a/encoder/macroblock.c
- +++ b/encoder/macroblock.c
- -64,8 +64,10 @@ static int x264_mb_decimate_score( int16_t *dct, int i_max )
- {
- int i_run;
- - if( abs( dct[idx--] ) > 1 )
- - return 9;
- + if( (unsigned)(dct[idx--] + 1) > 2 )
- + {
- + return 7;
- + }
- i_run = 0;
- while( idx >= 0 && dct[idx] == 0 )
- -188,6 +190,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
- {
- int i, ch;
- int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate);
- + int ac_nnz[2] = {1,1};
- for( ch = 0; ch < 2; ch++ )
- {
- -239,10 +243,10 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
- if( b_decimate && i_decimate_score < 7 )
- {
- /* Near null chroma 8x8 block so make it null (bits saving) */
- - memset( &h->dct.luma4x4[16+ch*4], 0, 4 * sizeof( *h->dct.luma4x4 ) );
- - if( !array_non_zero( dct2x2 ) )
- - continue;
- - memset( dct4x4, 0, sizeof( dct4x4 ) );
- + ac_nnz[ch] = 0;
- + if( array_non_zero( dct2x2 ) )
- + h->dctf.add8x8_idct_dc(p_dst, dct2x2);
- + continue;
- }
- else
- {
- -257,11 +261,24 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
- /* coded block pattern */
- h->mb.i_cbp_chroma = 0;
- - for( i = 0; i < 8; i++ )
- + for( ch = 0; ch < 2; ch++ )
- {
- - int nz = array_non_zero( h->dct.luma4x4[16+i] );
- - h->mb.cache.non_zero_count[x264_scan8[16+i]] = nz;
- - h->mb.i_cbp_chroma |= nz;
- + if(ac_nnz[ch])
- + for( i = 0; i < 4; i++ )
- + {
- + int nz = array_non_zero( h->dct.luma4x4[16+i+ch*4] );
- + h->mb.cache.non_zero_count[x264_scan8[16+i+ch*4]] = nz;
- + h->mb.i_cbp_chroma |= nz;
- + }
- + else
- + {
- + /* We can't avoid this memset because entropy coding uses the cbp to decide whether to write AC residual */
- + /* and there's only one cbp, not one per chroma channel. */
- + if(ac_nnz[!ch])
- + memset( &h->dct.luma4x4[16+ch*4], 0, 4 * sizeof( *h->dct.luma4x4 ) );
- + for( i = 0; i < 4; i++ )
- + h->mb.cache.non_zero_count[x264_scan8[16+i+ch*4]] = 0;
- + }
- }
- if( h->mb.i_cbp_chroma )
- h->mb.i_cbp_chroma = 2; /* dc+ac (we can't do only ac) */
- -499,7 +515,7 @@ void x264_macroblock_encode( x264_t *h )
- h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] );
- - if( b_decimate )
- + if( b_decimate && i_decimate_8x8 <= 6 )
- i_decimate_8x8 += x264_mb_decimate_score( h->dct.luma4x4[idx], 16 );
- }
