Share Pastebin
Guest
Public paste!

Dark Shikari

By: a guest | May 6th, 2008 | Syntax: None | Size: 7.38 KB | Hits: 406 | Expires: Never
This paste has a previous version, view the difference. Copy text to clipboard
  1. diff --git a/common/dct.c b/common/dct.c
  2. index 669e24f..87b096a 100644
  3. --- a/common/dct.c
  4. +++ b/common/dct.c
  5. @@ -230,6 +230,23 @@ static void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
  6.      }
  7.  }
  8.  
  9. +static void add4x4_idct_dc( uint8_t *p_dst, int16_t dc ) /* add one rounded DC value to every pixel of a 4x4 block at p_dst */
  10. +{
  11. +    int i,j;
  12. +    dc = (dc + 32) >> 6; /* round and scale the DC term once, outside the pixel loops */
  13. +    for( i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
  14. +        for( j = 0; j < 4; j++)
  15. +            p_dst[j] = x264_clip_uint8(p_dst[j]+dc); /* store the clipped sum; '+=' would add the pixel in a second time */
  16. +}
  17. +
  18. +static void add8x8_idct_dc( uint8_t *p_dst, int16_t *dct2x2 ) /* DC-only add for an 8x8 block: one DC value per 4x4 quadrant */
  19. +{
  20. +    add4x4_idct_dc( &p_dst[0],               dct2x2[0] ); /* rows 0-3, columns 0-3 */
  21. +    add4x4_idct_dc( &p_dst[4],               dct2x2[1] ); /* rows 0-3, columns 4-7 */
  22. +    add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct2x2[2] ); /* rows 4-7, columns 0-3 */
  23. +    add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct2x2[3] ); /* rows 4-7, columns 4-7 */
  24. +}
  25. +
  26.  static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][4][4] )
  27.  {
  28.      add4x4_idct( &p_dst[0],               dct[0] );
  29. @@ -380,6 +397,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
  30.  
  31.      dctf->sub8x8_dct8   = sub8x8_dct8;
  32.      dctf->add8x8_idct8  = add8x8_idct8;
  33. +    dctf->add8x8_idct_dc = add8x8_idct_dc;
  34.  
  35.      dctf->sub16x16_dct8  = sub16x16_dct8;
  36.      dctf->add16x16_idct8 = add16x16_idct8;
  37. @@ -399,6 +417,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
  38.  
  39.          dctf->add4x4_idct   = x264_add4x4_idct_mmx;
  40.          dctf->add8x8_idct   = x264_add8x8_idct_mmx;
  41. +        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx;
  42.          dctf->add16x16_idct = x264_add16x16_idct_mmx;
  43.  
  44.          dctf->dct4x4dc      = x264_dct4x4dc_mmx;
  45. diff --git a/common/dct.h b/common/dct.h
  46. index ee9d7d1..3ce4370 100644
  47. --- a/common/dct.h
  48. +++ b/common/dct.h
  49. @@ -94,6 +94,8 @@ typedef struct
  50.  
  51.      void (*sub8x8_dct8)  ( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 );
  52.      void (*add8x8_idct8) ( uint8_t *p_dst, int16_t dct[8][8] );
  53. +    
  54. +    void (*add8x8_idct_dc) ( uint8_t *p_dst, int16_t *dct2x2 );
  55.  
  56.      void (*sub16x16_dct8) ( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
  57.      void (*add16x16_idct8)( uint8_t *p_dst, int16_t dct[4][8][8] );
  58. diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
  59. index 77badda..d6579b6 100644
  60. --- a/common/x86/dct-a.asm
  61. +++ b/common/x86/dct-a.asm
  62. @@ -306,7 +306,48 @@ cextern x264_add8x8_idct8_sse2
  63.  SUB_NxN_DCT  x264_sub16x16_dct8_sse2,  x264_sub8x8_dct8_sse2,  128, 8, 0, 0
  64.  ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 0
  65.  
  66. +;-----------------------------------------------------------------------------
  67. +; void add8x8_idct_dc( uint8_t *p_dst, int16_t *dct2x2 )
  68. +;-----------------------------------------------------------------------------
  69.  
  70. +%macro ADD_DC 3
  71. +    movq      mm4, [%3+FDEC_STRIDE*0] ; load four consecutive 8-byte rows starting at address %3
  72. +    movq      mm5, [%3+FDEC_STRIDE*1]
  73. +    movq      mm6, [%3+FDEC_STRIDE*2]
  74. +    movq      mm7, [%3+FDEC_STRIDE*3]
  75. +    paddusb   mm4, %1 ; add the bytes in %1 to each row with unsigned saturation
  76. +    paddusb   mm5, %1
  77. +    paddusb   mm6, %1
  78. +    paddusb   mm7, %1
  79. +    psubusb   mm4, %2 ; then subtract the bytes in %2 with unsigned saturation
  80. +    psubusb   mm5, %2
  81. +    psubusb   mm6, %2
  82. +    psubusb   mm7, %2
  83. +    movq      [%3+FDEC_STRIDE*0], mm4 ; write the four updated rows back in place
  84. +    movq      [%3+FDEC_STRIDE*1], mm5
  85. +    movq      [%3+FDEC_STRIDE*2], mm6
  86. +    movq      [%3+FDEC_STRIDE*3], mm7
  87. +%endmacro
  88. +
  89. +cglobal x264_add8x8_idct_dc_mmx, 2,2,1
  90. +    movq      mm0, [r1] ; load four 16-bit values; r1 is presumably the dct2x2 argument - confirm against cglobal's argument mapping
  91. +    pxor      mm1, mm1 ; mm1 = 0
  92. +    paddw     mm0, [pw_32 GLOBAL] ; rounding term; assumes pw_32 holds 32 in every word - confirm
  93. +    psraw     mm0, 6 ; arithmetic shift right by 6, same scaling as the C add4x4_idct_dc
  94. +    psubw     mm1, mm0 ; mm1 = -dc for each word
  95. +    packuswb  mm0, mm0 ; unsigned-saturating pack: mm0 keeps positive DCs, negative ones become 0
  96. +    packuswb  mm1, mm1 ; mm1 keeps the magnitudes of negative DCs, positive ones become 0
  97. +    punpcklbw mm0, mm0 ; duplicate each byte: d0 d0 d1 d1 d2 d2 d3 d3
  98. +    punpckhbw mm1, mm1 ; both halves of mm1 are equal after the pack, so the high unpack yields the same layout
  99. +    movq      mm2, mm0
  100. +    movq      mm3, mm1
  101. +    punpcklbw mm0, mm0 ; mm0/mm1: first two DCs, each repeated over 4 bytes
  102. +    punpcklbw mm1, mm1
  103. +    punpckhbw mm2, mm2 ; mm2/mm3: last two DCs, each repeated over 4 bytes
  104. +    punpckhbw mm3, mm3
  105. +    ADD_DC mm0, mm1, r0 ; rows 0-3: add the positive part, subtract the negative part
  106. +    ADD_DC mm2, mm3, r0+FDEC_STRIDE*4 ; rows 4-7
  107. +    ret
  108.  
  109.  ;-----------------------------------------------------------------------------
  110.  ; void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] )
  111. diff --git a/common/x86/dct.h b/common/x86/dct.h
  112. index 859937c..785496b 100644
  113. --- a/common/x86/dct.h
  114. +++ b/common/x86/dct.h
  115. @@ -31,6 +31,7 @@ void x264_sub16x16_dct_sse2( int16_t dct[16][4][4],  uint8_t *pix1, uint8_t *pix
  116.  
  117.  void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] );
  118.  void x264_add8x8_idct_mmx( uint8_t *p_dst, int16_t dct[4][4][4] );
  119. +void x264_add8x8_idct_dc_mmx( uint8_t *p_dst, int16_t *dct2x2 );
  120.  void x264_add16x16_idct_mmx( uint8_t *p_dst, int16_t dct[16][4][4] );
  121.  void x264_add8x8_idct_sse2( uint8_t *p_dst, int16_t dct[4][4][4] );
  122.  void x264_add16x16_idct_sse2( uint8_t *p_dst, int16_t dct[16][4][4] );
  123. diff --git a/encoder/macroblock.c b/encoder/macroblock.c
  124. index de94536..6f78c8d 100644
  125. --- a/encoder/macroblock.c
  126. +++ b/encoder/macroblock.c
  127. @@ -64,8 +64,10 @@ static int x264_mb_decimate_score( int16_t *dct, int i_max )
  128.      {
  129.          int i_run;
  130.  
  131. -        if( abs( dct[idx--] ) > 1 )
  132. -            return 9;
  133. +        if( (unsigned)(dct[idx--] + 1) > 2 )
  134. +        {
  135. +            return 7;
  136. +        }
  137.  
  138.          i_run = 0;
  139.          while( idx >= 0 && dct[idx] == 0 )
  140. @@ -188,6 +190,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
  141.  {
  142.      int i, ch;
  143.      int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate);
  144. +    int ac_nnz[2] = {1,1};
  145.  
  146.      for( ch = 0; ch < 2; ch++ )
  147.      {
  148. @@ -239,10 +243,10 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
  149.          if( b_decimate && i_decimate_score < 7 )
  150.          {
  151.              /* Near null chroma 8x8 block so make it null (bits saving) */
  152. -            memset( &h->dct.luma4x4[16+ch*4], 0, 4 * sizeof( *h->dct.luma4x4 ) );
  153. -            if( !array_non_zero( dct2x2 ) )
  154. -                continue;
  155. -            memset( dct4x4, 0, sizeof( dct4x4 ) );
  156. +            ac_nnz[ch] = 0;
  157. +            if( array_non_zero( dct2x2 ) )
  158. +                h->dctf.add8x8_idct_dc(p_dst, dct2x2);
  159. +            continue;
  160.          }
  161.          else
  162.          {
  163. @@ -257,11 +261,24 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
  164.  
  165.      /* coded block pattern */
  166.      h->mb.i_cbp_chroma = 0;
  167. -    for( i = 0; i < 8; i++ )
  168. +    for( ch = 0; ch < 2; ch++ )
  169.      {
  170. -        int nz = array_non_zero( h->dct.luma4x4[16+i] );
  171. -        h->mb.cache.non_zero_count[x264_scan8[16+i]] = nz;
  172. -        h->mb.i_cbp_chroma |= nz;
  173. +        if(ac_nnz[ch])
  174. +            for( i = 0; i < 4; i++ )
  175. +            {
  176. +                int nz = array_non_zero( h->dct.luma4x4[16+i+ch*4] );
  177. +                h->mb.cache.non_zero_count[x264_scan8[16+i+ch*4]] = nz;
  178. +                h->mb.i_cbp_chroma |= nz;
  179. +            }
  180. +        else
  181. +        {
  182. +            /* We can't avoid this memset because entropy coding uses the cbp to decide whether to write AC residual */
  183. +            /* and there's only one cbp, not one per chroma channel. */
  184. +            if(ac_nnz[!ch])
  185. +                memset( &h->dct.luma4x4[16+ch*4], 0, 4 * sizeof( *h->dct.luma4x4 ) );
  186. +            for( i = 0; i < 4; i++ )
  187. +                h->mb.cache.non_zero_count[x264_scan8[16+i+ch*4]] = 0;
  188. +        }
  189.      }
  190.      if( h->mb.i_cbp_chroma )
  191.          h->mb.i_cbp_chroma = 2;    /* dc+ac (we can't do only ac) */
  192. @@ -499,7 +515,7 @@ void x264_macroblock_encode( x264_t *h )
  193.  
  194.                      h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] );
  195.                      
  196. -                    if( b_decimate )
  197. +                    if( b_decimate && i_decimate_8x8 <= 6 )
  198.                          i_decimate_8x8 += x264_mb_decimate_score( h->dct.luma4x4[idx], 16 );
  199.                  }