Advertisement
Guest User

Untitled

a guest
May 25th, 2017
561
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Diff 72.63 KB | None | 0 0
  1. From 09b5a943c62212447a0151dfd5324f8e36715272 Mon Sep 17 00:00:00 2001
  2. From: Anton Mitrofanov <BugMaster@narod.ru>
  3. Date: Thu, 6 May 2010 10:03:31 -0700
  4. Subject: [PATCH 1/9] More cosmetics
  5.  
  6. ---
  7. common/cpu.c           |    4 +-
  8.  common/macroblock.c    |    6 +++-
  9.  common/mc.c            |    4 +-
  10.  common/mvpred.c        |   12 ++++----
  11.  common/ppc/dct.c       |    2 +-
  12.  common/ppc/mc.c        |   12 ++++----
  13.  common/ppc/ppccommon.h |    8 +++---
  14.  common/ppc/quant.c     |    6 ++--
  15.  common/predict.c       |    2 +-
  16.  common/x86/const-a.asm |    2 +-
  17.  common/x86/mc-c.c      |    2 +-
  18.  common/x86/predict-c.c |    2 +-
  19.  encoder/cabac.c        |    8 +++---
  20.  encoder/me.c           |   18 ++++++------
  21.  input/avs.c            |    2 +-
  22.  tools/checkasm.c       |   66 ++++++++++++++++++++++++------------------------
  23.  16 files changed, 79 insertions(+), 77 deletions(-)
  24.  
  25. diff --git a/common/cpu.c b/common/cpu.c
  26. index 904eedc..933a754 100644
  27. --- a/common/cpu.c
  28. +++ b/common/cpu.c
  29. @@ -87,8 +87,8 @@ static void sigill_handler( int sig )
  30.  #endif
  31.  
  32.  #ifdef HAVE_MMX
  33. -extern int  x264_cpu_cpuid_test( void );
  34. -extern uint32_t  x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
  35. +int x264_cpu_cpuid_test( void );
  36. +uint32_t x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
  37.  
  38.  uint32_t x264_cpu_detect( void )
  39.  {
  40. diff --git a/common/macroblock.c b/common/macroblock.c
  41. index f402588..110c3a5 100644
  42. --- a/common/macroblock.c
  43. +++ b/common/macroblock.c
  44. @@ -295,7 +295,8 @@ int x264_macroblock_cache_allocate( x264_t *h )
  45.      }
  46.  
  47.      return 0;
  48. -fail: return -1;
  49. +fail:
  50. +    return -1;
  51.  }
  52.  void x264_macroblock_cache_free( x264_t *h )
  53.  {
  54. @@ -348,7 +349,8 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
  55.      CHECKED_MALLOC( h->scratch_buffer, scratch_size );
  56.  
  57.      return 0;
  58. -fail: return -1;
  59. +fail:
  60. +    return -1;
  61.  }
  62.  
  63.  void x264_macroblock_thread_free( x264_t *h, int b_lookahead )
  64. diff --git a/common/mc.c b/common/mc.c
  65. index ad7fe79..ada8bdc 100644
  66. --- a/common/mc.c
  67. +++ b/common/mc.c
  68. @@ -97,9 +97,9 @@ static void name( uint8_t *pix1, int i_stride_pix1, \
  69.                    uint8_t *pix2, int i_stride_pix2, \
  70.                    uint8_t *pix3, int i_stride_pix3, int weight ) \
  71.  { \
  72. -    if( weight == 32 )\
  73. +    if( weight == 32 ) \
  74.          pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height ); \
  75. -    else\
  76. +    else \
  77.          pixel_avg_weight_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height, weight ); \
  78.  }
  79.  PIXEL_AVG_C( pixel_avg_16x16, 16, 16 )
  80. diff --git a/common/mvpred.c b/common/mvpred.c
  81. index de91826..54b4d5a 100755
  82. --- a/common/mvpred.c
  83. +++ b/common/mvpred.c
  84. @@ -394,7 +394,7 @@ void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[
  85.      int16_t (*mvr)[2] = h->mb.mvr[i_list][i_ref];
  86.      int i = 0;
  87.  
  88. -#define SET_MVP(mvp)\
  89. +#define SET_MVP(mvp) \
  90.      { \
  91.          CP32( mvc[i], mvp ); \
  92.          i++; \
  93. @@ -445,13 +445,13 @@ void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[
  94.          if( h->sh.b_mbaff && field^(i_ref&1) )
  95.              refpoc += h->sh.i_delta_poc_bottom;
  96.  
  97. -#define SET_TMVP( dx, dy )\
  98. +#define SET_TMVP( dx, dy ) \
  99.          { \
  100.              int mb_index = h->mb.i_mb_xy + dx + dy*h->mb.i_mb_stride; \
  101. -            int scale = (curpoc - refpoc) * l0->inv_ref_poc[h->mb.b_interlaced&field];\
  102. -            mvc[i][0] = (l0->mv16x16[mb_index][0]*scale + 128) >> 8;\
  103. -            mvc[i][1] = (l0->mv16x16[mb_index][1]*scale + 128) >> 8;\
  104. -            i++;\
  105. +            int scale = (curpoc - refpoc) * l0->inv_ref_poc[h->mb.b_interlaced&field]; \
  106. +            mvc[i][0] = (l0->mv16x16[mb_index][0]*scale + 128) >> 8; \
  107. +            mvc[i][1] = (l0->mv16x16[mb_index][1]*scale + 128) >> 8; \
  108. +            i++; \
  109.          }
  110.  
  111.          SET_TMVP(0,0);
  112. diff --git a/common/ppc/dct.c b/common/ppc/dct.c
  113. index fdadf53..eb223ae 100644
  114. --- a/common/ppc/dct.c
  115. +++ b/common/ppc/dct.c
  116. @@ -205,7 +205,7 @@ void x264_sub8x8_dct8_altivec( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
  117.      vec_st( dct_tr1v, 16,  (signed short *)dct );
  118.      vec_st( dct_tr2v, 32,  (signed short *)dct );
  119.      vec_st( dct_tr3v, 48,  (signed short *)dct );
  120. -    
  121. +
  122.      vec_st( dct_tr4v, 64,  (signed short *)dct );
  123.      vec_st( dct_tr5v, 80,  (signed short *)dct );
  124.      vec_st( dct_tr6v, 96,  (signed short *)dct );
  125. diff --git a/common/ppc/mc.c b/common/ppc/mc.c
  126. index dfe250a..26b81f8 100644
  127. --- a/common/ppc/mc.c
  128. +++ b/common/ppc/mc.c
  129. @@ -291,8 +291,8 @@ static void mc_chroma_2xh( uint8_t *dst, int i_dst_stride,
  130.   }
  131.  
  132.  
  133. -#define DO_PROCESS_W4( a )  \
  134. -    dstv_16A = vec_mladd( src##a##v_16A, coeff##a##v, dstv_16A );   \
  135. +#define DO_PROCESS_W4( a ) \
  136. +    dstv_16A = vec_mladd( src##a##v_16A, coeff##a##v, dstv_16A ); \
  137.      dstv_16B = vec_mladd( src##a##v_16B, coeff##a##v, dstv_16B )
  138.  
  139.  static void mc_chroma_altivec_4xh( uint8_t *dst, int i_dst_stride,
  140. @@ -369,10 +369,10 @@ static void mc_chroma_altivec_4xh( uint8_t *dst, int i_dst_stride,
  141.      }
  142.  }
  143.  
  144. -#define DO_PROCESS_W8( a )  \
  145. -    src##a##v_16A = vec_u8_to_u16( src##a##v_8A );  \
  146. -    src##a##v_16B = vec_u8_to_u16( src##a##v_8B );  \
  147. -    dstv_16A = vec_mladd( src##a##v_16A, coeff##a##v, dstv_16A );   \
  148. +#define DO_PROCESS_W8( a ) \
  149. +    src##a##v_16A = vec_u8_to_u16( src##a##v_8A ); \
  150. +    src##a##v_16B = vec_u8_to_u16( src##a##v_8B ); \
  151. +    dstv_16A = vec_mladd( src##a##v_16A, coeff##a##v, dstv_16A ); \
  152.      dstv_16B = vec_mladd( src##a##v_16B, coeff##a##v, dstv_16B )
  153.  
  154.  static void mc_chroma_altivec_8xh( uint8_t *dst, int i_dst_stride,
  155. diff --git a/common/ppc/ppccommon.h b/common/ppc/ppccommon.h
  156. index 510ab26..e61afaa 100644
  157. --- a/common/ppc/ppccommon.h
  158. +++ b/common/ppc/ppccommon.h
  159. @@ -113,13 +113,13 @@ typedef union {
  160.      vec_u8_t _hv, _lv
  161.  
  162.  #define PREP_LOAD_SRC( src )              \
  163. -    vec_u8_t _##src##_ = vec_lvsl(0, src)
  164. +    vec_u8_t _##src##_ = vec_lvsl(0, src)
  165.  
  166.  #define VEC_LOAD_G( p, v, n, t )                 \
  167.      _hv = vec_ld( 0, p );                        \
  168.      v   = (t) vec_lvsl( 0, p );                  \
  169.      _lv = vec_ld( n - 1, p );                    \
  170. -    v   = (t) vec_perm( _hv, _lv, (vec_u8_t) v )
  171. +    v   = (t) vec_perm( _hv, _lv, (vec_u8_t) v )
  172.  
  173.  #define VEC_LOAD( p, v, n, t, g )                   \
  174.      _hv = vec_ld( 0, p );                           \
  175. @@ -134,7 +134,7 @@ typedef union {
  176.  #define VEC_LOAD_PARTIAL( p, v, n, t, g)               \
  177.      _hv = vec_ld( 0, p);                               \
  178.      v   = (t) vec_perm( _hv, _hv, (vec_u8_t) _##g##_ )
  179. -    
  180. +
  181.  
  182.  /***********************************************************************
  183.   * PREP_STORE##n: declares required vectors to store n bytes to a
  184. @@ -155,7 +155,7 @@ typedef union {
  185.      _lv    = vec_perm( (vec_u8_t) v, _tmp1v, _##o##r_ ); \
  186.      vec_st( _lv, 15, (uint8_t *) p );                    \
  187.      _hv    = vec_perm( _tmp1v, (vec_u8_t) v, _##o##r_ ); \
  188. -    vec_st( _hv, 0, (uint8_t *) p )
  189. +    vec_st( _hv, 0, (uint8_t *) p )
  190.  
  191.  
  192.  #define PREP_STORE8 \
  193. diff --git a/common/ppc/quant.c b/common/ppc/quant.c
  194. index 4b2825c..6f41a06 100644
  195. --- a/common/ppc/quant.c
  196. +++ b/common/ppc/quant.c
  197. @@ -20,7 +20,7 @@
  198.  
  199.  #include "common/common.h"
  200.  #include "ppccommon.h"
  201. -#include "quant.h"            
  202. +#include "quant.h"
  203.  
  204.  // quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
  205.  #define QUANT_16_U( idx0, idx1 )                                    \
  206. @@ -55,7 +55,7 @@
  207.      nz = vec_or(nz, vec_or(temp1v, temp2v));                        \
  208.      vec_st(temp2v, (idx1), (int16_t*)dct);                          \
  209.  }
  210. -                
  211. +
  212.  int x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
  213.  {
  214.      LOAD_ZERO;
  215. @@ -220,7 +220,7 @@ int x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64
  216.      vec_u16_t biasvB;
  217.  
  218.      vec_s16_t temp1v, temp2v;
  219. -    
  220. +
  221.      vec_u32_u qbits_u;
  222.      qbits_u.s[0]=16;
  223.      i_qbitsv = vec_splat(qbits_u.v, 0);
  224. diff --git a/common/predict.c b/common/predict.c
  225. index 783cc9b..f120ca7 100644
  226. --- a/common/predict.c
  227. +++ b/common/predict.c
  228. @@ -41,7 +41,7 @@
  229.   * 16x16 prediction for intra luma block
  230.   ****************************************************************************/
  231.  
  232. -#define PREDICT_16x16_DC(v) \
  233. +#define PREDICT_16x16_DC(v)\
  234.      for( int i = 0; i < 16; i++ )\
  235.      {\
  236.          M32( src+ 0 ) = v;\
  237. diff --git a/common/x86/const-a.asm b/common/x86/const-a.asm
  238. index 79bbf1b..99a34be 100755
  239. --- a/common/x86/const-a.asm
  240. +++ b/common/x86/const-a.asm
  241. @@ -43,7 +43,7 @@ const pw_64,       times 8 dw 64
  242.  const pw_32_0,     times 4 dw 32,
  243.                     times 4 dw 0
  244.  const pw_8000,     times 8 dw 0x8000
  245. -const pw_3fff,   times 8 dw 0x3fff
  246. +const pw_3fff,     times 8 dw 0x3fff
  247.  
  248.  const pd_1,        times 4 dd 1
  249.  const pd_128,      times 4 dd 128
  250. diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
  251. index fb73562..6d386f6 100644
  252. --- a/common/x86/mc-c.c
  253. +++ b/common/x86/mc-c.c
  254. @@ -103,7 +103,7 @@ void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
  255.  void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride );
  256.  void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
  257.                                        uint16_t *inter_costs, uint16_t *inv_qscales, int len );
  258. -#define LOWRES(cpu) \
  259. +#define LOWRES(cpu)\
  260.  void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
  261.                                          int src_stride, int dst_stride, int width, int height );
  262.  LOWRES(mmxext)
  263. diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
  264. index 6fa7e3b..0e3e1c7 100644
  265. --- a/common/x86/predict-c.c
  266. +++ b/common/x86/predict-c.c
  267. @@ -326,7 +326,7 @@ static void x264_predict_8x8_vr_mmxext( uint8_t *src, uint8_t edge[33] )
  268.      t=e; e+=f; f-=t;\
  269.      t=g; g+=h; h-=t;
  270.  
  271. -#define INTRA_SA8D_X3(cpu) \
  272. +#define INTRA_SA8D_X3(cpu)\
  273.  void x264_intra_sa8d_x3_8x8_##cpu( uint8_t *fenc, uint8_t edge[33], int res[3] )\
  274.  {\
  275.      PREDICT_8x8_LOAD_TOP\
  276. diff --git a/encoder/cabac.c b/encoder/cabac.c
  277. index 1086447..bc76fc8 100644
  278. --- a/encoder/cabac.c
  279. +++ b/encoder/cabac.c
  280. @@ -736,13 +736,13 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
  281.  }
  282.  #endif
  283.  
  284. -#define block_residual_write_cabac_cbf( h, cb, i_ctxBlockCat, i_idx, l, b_intra ) \
  285. -{ \
  286. -    int ctxidxinc = x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx, b_intra ); \
  287. +#define block_residual_write_cabac_cbf( h, cb, i_ctxBlockCat, i_idx, l, b_intra )\
  288. +{\
  289. +    int ctxidxinc = x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx, b_intra );\
  290.      if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )\
  291.      {\
  292.          x264_cabac_encode_decision( cb, ctxidxinc, 1 );\
  293. -        block_residual_write_cabac( h, cb, i_ctxBlockCat, l ); \
  294. +        block_residual_write_cabac( h, cb, i_ctxBlockCat, l );\
  295.      }\
  296.      else\
  297.          x264_cabac_encode_decision( cb, ctxidxinc, 0 );\
  298. diff --git a/encoder/me.c b/encoder/me.c
  299. index d7b2928..5e113f0 100644
  300. --- a/encoder/me.c
  301. +++ b/encoder/me.c
  302. @@ -914,14 +914,14 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
  303.      m->cost_mv = p_cost_mvx[bmx] + p_cost_mvy[bmy];
  304.  }
  305.  
  306. -#define BIME_CACHE( dx, dy, list ) \
  307. -{ \
  308. +#define BIME_CACHE( dx, dy, list )\
  309. +{\
  310.      x264_me_t *m = m##list;\
  311. -    int i = 4 + 3*dx + dy; \
  312. +    int i = 4 + 3*dx + dy;\
  313.      int mvx = bm##list##x+dx;\
  314.      int mvy = bm##list##y+dy;\
  315.      stride[list][i] = bw;\
  316. -    src[list][i] = h->mc.get_ref( pixy_buf[list][i], &stride[list][i], m->p_fref, m->i_stride[0], mvx, mvy, bw, bh, weight_none ); \
  317. +    src[list][i] = h->mc.get_ref( pixy_buf[list][i], &stride[list][i], m->p_fref, m->i_stride[0], mvx, mvy, bw, bh, weight_none );\
  318.      if( rd )\
  319.      {\
  320.          h->mc.mc_chroma( pixu_buf[list][i], 8, m->p_fref[4], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\
  321. @@ -1107,11 +1107,11 @@ void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_wei
  322.      { \
  323.          uint64_t cost; \
  324.          M32( cache_mv ) = pack16to32_mask(mx,my); \
  325. -        if( m->i_pixel <= PIXEL_8x8 )\
  326. -        {\
  327. -            h->mc.mc_chroma( pixu, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 );\
  328. -            h->mc.mc_chroma( pixv, FDEC_STRIDE, m->p_fref[5], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 );\
  329. -        }\
  330. +        if( m->i_pixel <= PIXEL_8x8 ) \
  331. +        { \
  332. +            h->mc.mc_chroma( pixu, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \
  333. +            h->mc.mc_chroma( pixv, FDEC_STRIDE, m->p_fref[5], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \
  334. +        } \
  335.          cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \
  336.          COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \
  337.      } \
  338. diff --git a/input/avs.c b/input/avs.c
  339. index 9e3aa55..5489a5e 100644
  340. --- a/input/avs.c
  341. +++ b/input/avs.c
  342. @@ -45,7 +45,7 @@
  343.  /* maximum size of the sequence of filters to try on non script files */
  344.  #define AVS_MAX_SEQUENCE 5
  345.  
  346. -#define LOAD_AVS_FUNC(name, continue_on_fail) \
  347. +#define LOAD_AVS_FUNC(name, continue_on_fail)\
  348.  {\
  349.      h->func.name = (void*)GetProcAddress( h->library, #name );\
  350.      if( !continue_on_fail && !h->func.name )\
  351. diff --git a/tools/checkasm.c b/tools/checkasm.c
  352. index 228b75f..2008d2f 100644
  353. --- a/tools/checkasm.c
  354. +++ b/tools/checkasm.c
  355. @@ -265,7 +265,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
  356.          buf3[i] = ~(buf4[i] = -(buf1[i&~0x88]&1));
  357.  
  358.  #define TEST_PIXEL( name, align ) \
  359. -    ok = 1, used_asm = 0;\
  360. +    ok = 1, used_asm = 0; \
  361.      for( int i = 0; i < 7; i++ ) \
  362.      { \
  363.          int res_c, res_asm; \
  364. @@ -305,7 +305,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
  365.      TEST_PIXEL( sa8d, 1 );
  366.  
  367.  #define TEST_PIXEL_X( N ) \
  368. -    ok = 1; used_asm = 0;\
  369. +    ok = 1; used_asm = 0; \
  370.      for( int i = 0; i < 7; i++ ) \
  371.      { \
  372.          int res_c[4]={0}, res_asm[4]={0}; \
  373. @@ -350,7 +350,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
  374.      { \
  375.          set_func_name( "%s_%s", "var", pixel_names[i] ); \
  376.          used_asm = 1; \
  377. -        /* abi-check wrapper can't return uint64_t, so separate it from return value check */\
  378. +        /* abi-check wrapper can't return uint64_t, so separate it from return value check */ \
  379.          call_c1( pixel_c.var[i], buf1, 16 ); \
  380.          call_a1( pixel_asm.var[i], buf1, 16 ); \
  381.          uint64_t res_c   = pixel_c.var[i]( buf1, 16 ); \
  382. @@ -415,7 +415,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
  383.      if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
  384.      { \
  385.          int res_c[3], res_asm[3]; \
  386. -        set_func_name( #name );\
  387. +        set_func_name( #name ); \
  388.          used_asm = 1; \
  389.          memcpy( buf3, buf2, 1024 ); \
  390.          for( int i = 0; i < 3; i++ ) \
  391. @@ -538,7 +538,7 @@ static int check_dct( int cpu_ref, int cpu_new )
  392.  #define TEST_DCT( name, t1, t2, size ) \
  393.      if( dct_asm.name != dct_ref.name ) \
  394.      { \
  395. -        set_func_name( #name );\
  396. +        set_func_name( #name ); \
  397.          used_asm = 1; \
  398.          call_c( dct_c.name, t1, buf1, buf2 ); \
  399.          call_a( dct_asm.name, t2, buf1, buf2 ); \
  400. @@ -579,7 +579,7 @@ static int check_dct( int cpu_ref, int cpu_new )
  401.  #define TEST_IDCT( name, src ) \
  402.      if( dct_asm.name != dct_ref.name ) \
  403.      { \
  404. -        set_func_name( #name );\
  405. +        set_func_name( #name ); \
  406.          used_asm = 1; \
  407.          memcpy( buf3, buf1, 32*32 ); \
  408.          memcpy( buf4, buf1, 32*32 ); \
  409. @@ -644,12 +644,12 @@ static int check_dct( int cpu_ref, int cpu_new )
  410.      ALIGNED_16( int16_t level1[64] );
  411.      ALIGNED_16( int16_t level2[64] );
  412.  
  413. -#define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size )   \
  414. +#define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \
  415.      if( zigzag_asm.name != zigzag_ref.name ) \
  416.      { \
  417. -        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
  418. +        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
  419.          used_asm = 1; \
  420. -        memcpy(dct, buf1, size*sizeof(int16_t));\
  421. +        memcpy(dct, buf1, size*sizeof(int16_t)); \
  422.          call_c( zigzag_c.name, t1, dct ); \
  423.          call_a( zigzag_asm.name, t2, dct ); \
  424.          if( memcmp( t1, t2, size*sizeof(int16_t) ) ) \
  425. @@ -663,18 +663,18 @@ static int check_dct( int cpu_ref, int cpu_new )
  426.      if( zigzag_asm.name != zigzag_ref.name ) \
  427.      { \
  428.          int nz_a, nz_c; \
  429. -        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
  430. +        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
  431.          used_asm = 1; \
  432.          memcpy( buf3, buf1, 16*FDEC_STRIDE ); \
  433.          memcpy( buf4, buf1, 16*FDEC_STRIDE ); \
  434. -        nz_c = call_c1( zigzag_c.name, t1, buf2, buf3 );  \
  435. +        nz_c = call_c1( zigzag_c.name, t1, buf2, buf3 ); \
  436.          nz_a = call_a1( zigzag_asm.name, t2, buf2, buf4 ); \
  437. -        if( memcmp( t1, t2, size*sizeof(int16_t) )|| memcmp( buf3, buf4, 16*FDEC_STRIDE ) || nz_c != nz_a )  \
  438. +        if( memcmp( t1, t2, size*sizeof(int16_t) )|| memcmp( buf3, buf4, 16*FDEC_STRIDE ) || nz_c != nz_a ) \
  439.          { \
  440.              ok = 0; \
  441.              fprintf( stderr, #name " [FAILED]\n" ); \
  442.          } \
  443. -        call_c2( zigzag_c.name, t1, buf2, buf3 );  \
  444. +        call_c2( zigzag_c.name, t1, buf2, buf3 ); \
  445.          call_a2( zigzag_asm.name, t2, buf2, buf4 ); \
  446.      }
  447.  
  448. @@ -683,7 +683,7 @@ static int check_dct( int cpu_ref, int cpu_new )
  449.      { \
  450.          int nz_a, nz_c; \
  451.          int16_t dc_a, dc_c; \
  452. -        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
  453. +        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
  454.          used_asm = 1; \
  455.          for( int i = 0; i < 2; i++ ) \
  456.          { \
  457. @@ -694,27 +694,27 @@ static int check_dct( int cpu_ref, int cpu_new )
  458.                  memcpy( buf3 + j*FDEC_STRIDE, (i?buf1:buf2) + j*FENC_STRIDE, 4 ); \
  459.                  memcpy( buf4 + j*FDEC_STRIDE, (i?buf1:buf2) + j*FENC_STRIDE, 4 ); \
  460.              } \
  461. -            nz_c = call_c1( zigzag_c.name, t1, buf2, buf3, &dc_c );  \
  462. +            nz_c = call_c1( zigzag_c.name, t1, buf2, buf3, &dc_c ); \
  463.              nz_a = call_a1( zigzag_asm.name, t2, buf2, buf4, &dc_a ); \
  464. -            if( memcmp( t1+1, t2+1, 15*sizeof(int16_t) ) || memcmp( buf3, buf4, 16*FDEC_STRIDE ) || nz_c != nz_a || dc_c != dc_a )  \
  465. +            if( memcmp( t1+1, t2+1, 15*sizeof(int16_t) ) || memcmp( buf3, buf4, 16*FDEC_STRIDE ) || nz_c != nz_a || dc_c != dc_a ) \
  466.              { \
  467.                  ok = 0; \
  468.                  fprintf( stderr, #name " [FAILED]\n" ); \
  469.                  break; \
  470.              } \
  471.          } \
  472. -        call_c2( zigzag_c.name, t1, buf2, buf3, &dc_c );  \
  473. +        call_c2( zigzag_c.name, t1, buf2, buf3, &dc_c ); \
  474.          call_a2( zigzag_asm.name, t2, buf2, buf4, &dc_a ); \
  475.      }
  476.  
  477. -#define TEST_INTERLEAVE( name, t1, t2, dct, size )   \
  478. +#define TEST_INTERLEAVE( name, t1, t2, dct, size ) \
  479.      if( zigzag_asm.name != zigzag_ref.name ) \
  480.      { \
  481.          for( int j = 0; j < 100; j++ ) \
  482.          { \
  483. -            set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
  484. +            set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
  485.              used_asm = 1; \
  486. -            memcpy(dct, buf1, size*sizeof(int16_t));\
  487. +            memcpy(dct, buf1, size*sizeof(int16_t)); \
  488.              for( int i = 0; i < size; i++ ) \
  489.                  dct[i] = rand()&0x1F ? 0 : dct[i]; \
  490.              memcpy(buf3, buf4, 10*sizeof(uint8_t)); \
  491. @@ -784,7 +784,7 @@ static int check_mc( int cpu_ref, int cpu_new )
  492.          if( mc_a.mc_luma != mc_ref.mc_luma && !(w&(w-1)) && h<=16 ) \
  493.          { \
  494.              const x264_weight_t *weight = weight_none; \
  495. -            set_func_name( "mc_luma_%dx%d", w, h );\
  496. +            set_func_name( "mc_luma_%dx%d", w, h ); \
  497.              used_asm = 1; \
  498.              memset( buf3, 0xCD, 1024 ); \
  499.              memset( buf4, 0xCD, 1024 ); \
  500. @@ -801,7 +801,7 @@ static int check_mc( int cpu_ref, int cpu_new )
  501.              uint8_t *ref = dst2; \
  502.              int ref_stride = 32; \
  503.              const x264_weight_t *weight = weight_none; \
  504. -            set_func_name( "get_ref_%dx%d", w, h );\
  505. +            set_func_name( "get_ref_%dx%d", w, h ); \
  506.              used_asm = 1; \
  507.              memset( buf3, 0xCD, 1024 ); \
  508.              memset( buf4, 0xCD, 1024 ); \
  509. @@ -819,13 +819,13 @@ static int check_mc( int cpu_ref, int cpu_new )
  510.  #define MC_TEST_CHROMA( w, h ) \
  511.          if( mc_a.mc_chroma != mc_ref.mc_chroma ) \
  512.          { \
  513. -            set_func_name( "mc_chroma_%dx%d", w, h );\
  514. +            set_func_name( "mc_chroma_%dx%d", w, h ); \
  515.              used_asm = 1; \
  516.              memset( buf3, 0xCD, 1024 ); \
  517.              memset( buf4, 0xCD, 1024 ); \
  518.              call_c( mc_c.mc_chroma, dst1, 16, src, 64, dx, dy, w, h ); \
  519.              call_a( mc_a.mc_chroma, dst2, 16, src, 64, dx, dy, w, h ); \
  520. -            /* mc_chroma width=2 may write garbage to the right of dst. ignore that. */\
  521. +            /* mc_chroma width=2 may write garbage to the right of dst. ignore that. */ \
  522.              for( int j = 0; j < h; j++ ) \
  523.                  for( int i = w; i < 4; i++ ) \
  524.                      dst2[i+j*16] = dst1[i+j*16]; \
  525. @@ -878,7 +878,7 @@ static int check_mc( int cpu_ref, int cpu_new )
  526.          memcpy( buf4, buf1+320, 320 ); \
  527.          if( mc_a.name[i] != mc_ref.name[i] ) \
  528.          { \
  529. -            set_func_name( "%s_%s", #name, pixel_names[i] );\
  530. +            set_func_name( "%s_%s", #name, pixel_names[i] ); \
  531.              used_asm = 1; \
  532.              call_c1( mc_c.name[i], buf3, 16, buf2+1, 16, buf1+18, 16, weight ); \
  533.              call_a1( mc_a.name[i], buf4, 16, buf2+1, 16, buf1+18, 16, weight ); \
  534. @@ -899,7 +899,7 @@ static int check_mc( int cpu_ref, int cpu_new )
  535.  
  536.  #define MC_TEST_WEIGHT( name, weight, aligned ) \
  537.      int align_off = (aligned ? 0 : rand()%16); \
  538. -    ok = 1, used_asm = 0;\
  539. +    ok = 1, used_asm = 0; \
  540.      for( int i = 1; i <= 5; i++ ) \
  541.      { \
  542.          ALIGNED_16( uint8_t buffC[640] ); \
  543. @@ -1115,14 +1115,14 @@ static int check_deblock( int cpu_ref, int cpu_new )
  544.  #define TEST_DEBLOCK( name, align, ... ) \
  545.      for( int i = 0; i < 36; i++ ) \
  546.      { \
  547. -        int off = 8*32 + (i&15)*4*!align; /* benchmark various alignments of h filter */\
  548. +        int off = 8*32 + (i&15)*4*!align; /* benchmark various alignments of h filter */ \
  549.          for( int j = 0; j < 1024; j++ ) \
  550. -            /* two distributions of random to excersize different failure modes */\
  551. +            /* two distributions of random to exercise different failure modes */ \
  552.              buf3[j] = rand() & (i&1 ? 0xf : 0xff ); \
  553.          memcpy( buf4, buf3, 1024 ); \
  554.          if( db_a.name != db_ref.name ) \
  555.          { \
  556. -            set_func_name( #name );\
  557. +            set_func_name( #name ); \
  558.              used_asm = 1; \
  559.              call_c1( db_c.name, buf3+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
  560.              call_a1( db_a.name, buf4+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
  561. @@ -1236,7 +1236,7 @@ static int check_quant( int cpu_ref, int cpu_new )
  562.                          dct1[i] = dct2[i] = j ? (rand() & 0x1fff) - 0xfff : 0; \
  563.                      result_c = call_c1( qf_c.name, dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
  564.                      result_a = call_a1( qf_a.name, dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
  565. -                    if( memcmp( dct1, dct2, 16*2 ) || result_c != result_a )       \
  566. +                    if( memcmp( dct1, dct2, 16*2 ) || result_c != result_a ) \
  567.                      { \
  568.                          oks[0] = 0; \
  569.                          fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \
  570. @@ -1491,11 +1491,11 @@ static int check_intra( int cpu_ref, int cpu_new )
  571.  
  572.      ip_c.predict_8x8_filter( buf1+48, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
  573.  
  574. -#define INTRA_TEST( name, dir, w, ... ) \
  575. +#define INTRA_TEST( name, dir, w, ... )\
  576.      if( ip_a.name[dir] != ip_ref.name[dir] )\
  577. -    { \
  578. +    {\
  579.          set_func_name( "intra_%s_%s", #name, intra_##name##_names[dir] );\
  580. -        used_asm = 1; \
  581. +        used_asm = 1;\
  582.          memcpy( buf3, buf1, 32*20 );\
  583.          memcpy( buf4, buf1, 32*20 );\
  584.          call_c( ip_c.name[dir], buf3+48, ##__VA_ARGS__ );\
  585. --
  586. 1.7.0.4
  587.  
  588.  
  589. From 29b379cc3499541e72007131909d45a8c472f2b5 Mon Sep 17 00:00:00 2001
  590. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  591. Date: Sat, 8 May 2010 11:58:22 -0700
  592. Subject: [PATCH 2/9] Fix intra refresh behavior with I-frames
  593.  Intra refresh still allows I-frames (for scenecuts/etc).
  594.  Now I-frames count as a full refresh, as opposed to instantly triggering a refresh.
  595.  
  596. ---
  597. common/frame.h    |    1 +
  598.  encoder/encoder.c |   28 +++++++++++++++++-----------
  599.  2 files changed, 18 insertions(+), 11 deletions(-)
  600.  
  601. diff --git a/common/frame.h b/common/frame.h
  602. index 357929e..e2766ad 100644
  603. --- a/common/frame.h
  604. +++ b/common/frame.h
  605. @@ -142,6 +142,7 @@ typedef struct x264_frame
  606.      float   f_pir_position;
  607.      int     i_pir_start_col;
  608.      int     i_pir_end_col;
  609. +    int     i_frames_since_pir;
  610.  } x264_frame_t;
  611.  
  612.  /* synchronized frame list */
  613. diff --git a/encoder/encoder.c b/encoder/encoder.c
  614. index 7ad4295..7c5a64f 100644
  615. --- a/encoder/encoder.c
  616. +++ b/encoder/encoder.c
  617. @@ -2375,25 +2375,31 @@ int     x264_encoder_encode( x264_t *h,
  618.      h->i_nal_type = i_nal_type;
  619.      h->i_nal_ref_idc = i_nal_ref_idc;
  620.  
  621. -    if( h->param.b_intra_refresh && h->fenc->i_type == X264_TYPE_P )
  622. +    if( h->param.b_intra_refresh )
  623.      {
  624. -        int pocdiff = (h->fdec->i_poc - h->fref0[0]->i_poc)/2;
  625. -        float increment = X264_MAX( ((float)h->sps->i_mb_width-1) / h->param.i_keyint_max, 1 );
  626. -        int max_position = (int)(increment * h->param.i_keyint_max);
  627. -        if( IS_X264_TYPE_I( h->fref0[0]->i_type ) )
  628. -            h->fdec->f_pir_position = 0;
  629. -        else
  630. +        if( IS_X264_TYPE_I( h->fenc->i_type ) )
  631. +        {
  632. +            h->fdec->i_frames_since_pir = 0;
  633. +            /* PIR is currently only supported with ref == 1, so any intra frame effectively refreshes
  634. +             * the whole frame and counts as an intra refresh. */
  635. +            h->fdec->f_pir_position = h->sps->i_mb_width;
  636. +        }
  637. +        else if( h->fenc->i_type == X264_TYPE_P )
  638.          {
  639. +            int pocdiff = (h->fdec->i_poc - h->fref0[0]->i_poc)/2;
  640. +            float increment = X264_MAX( ((float)h->sps->i_mb_width-1) / h->param.i_keyint_max, 1 );
  641.              h->fdec->f_pir_position = h->fref0[0]->f_pir_position;
  642. -            if( h->fdec->f_pir_position+0.5 >= max_position )
  643. +            h->fdec->i_frames_since_pir = h->fref0[0]->i_frames_since_pir + pocdiff;
  644. +            if( h->fdec->i_frames_since_pir >= h->param.i_keyint_max )
  645.              {
  646.                  h->fdec->f_pir_position = 0;
  647. +                h->fdec->i_frames_since_pir = 0;
  648.                  h->fenc->b_keyframe = 1;
  649.              }
  650. +            h->fdec->i_pir_start_col = h->fdec->f_pir_position+0.5;
  651. +            h->fdec->f_pir_position += increment * pocdiff;
  652. +            h->fdec->i_pir_end_col = h->fdec->f_pir_position+0.5;
  653.          }
  654. -        h->fdec->i_pir_start_col = h->fdec->f_pir_position+0.5;
  655. -        h->fdec->f_pir_position += increment * pocdiff;
  656. -        h->fdec->i_pir_end_col = h->fdec->f_pir_position+0.5;
  657.      }
  658.  
  659.      if( h->fenc->b_keyframe )
  660. --
  661. 1.7.0.4
  662.  
  663.  
  664. From 47b30702e9e8b0f9ff6f87a52e0bbc0755a1dbd9 Mon Sep 17 00:00:00 2001
  665. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  666. Date: Sat, 8 May 2010 12:07:13 -0700
  667. Subject: [PATCH 3/9] Add API function to trigger intra refresh
  668.  Useful for interactive applications where the encoder knows that packet loss has occurred on the client.
  669.  Full documentation is in x264.h.
  670.  
  671. ---
  672. common/common.h   |    2 ++
  673.  encoder/encoder.c |   11 ++++++++++-
  674.  x264.h            |   10 +++++++++-
  675.  3 files changed, 21 insertions(+), 2 deletions(-)
  676.  
  677. diff --git a/common/common.h b/common/common.h
  678. index 91d5030..f673648 100644
  679. --- a/common/common.h
  680. +++ b/common/common.h
  681. @@ -408,6 +408,8 @@ struct x264_t
  682.      int             i_coded_fields_lookahead; /* Use separate counters for lookahead */
  683.      int             i_cpb_delay_lookahead;
  684.  
  685. +    int             b_queued_intra_refresh;
  686. +
  687.      /* We use only one SPS and one PPS */
  688.      x264_sps_t      sps_array[1];
  689.      x264_sps_t      *sps;
  690. diff --git a/encoder/encoder.c b/encoder/encoder.c
  691. index 7c5a64f..42d49bf 100644
  692. --- a/encoder/encoder.c
  693. +++ b/encoder/encoder.c
  694. @@ -2131,6 +2131,12 @@ static int x264_threaded_slices_write( x264_t *h )
  695.      return 0;
  696.  }
  697.  
  698. +void x264_encoder_intra_refresh( x264_t *h )
  699. +{
  700. +    h = h->thread[h->thread[0]->i_thread_phase];
  701. +    h->b_queued_intra_refresh = 1;
  702. +}
  703. +
  704.  /****************************************************************************
  705.   * x264_encoder_encode:
  706.   *  XXX: i_poc   : is the poc of the current given picture
  707. @@ -2380,6 +2386,7 @@ int     x264_encoder_encode( x264_t *h,
  708.          if( IS_X264_TYPE_I( h->fenc->i_type ) )
  709.          {
  710.              h->fdec->i_frames_since_pir = 0;
  711. +            h->b_queued_intra_refresh = 0;
  712.              /* PIR is currently only supported with ref == 1, so any intra frame effectively refreshes
  713.               * the whole frame and counts as an intra refresh. */
  714.              h->fdec->f_pir_position = h->sps->i_mb_width;
  715. @@ -2390,10 +2397,12 @@ int     x264_encoder_encode( x264_t *h,
  716.              float increment = X264_MAX( ((float)h->sps->i_mb_width-1) / h->param.i_keyint_max, 1 );
  717.              h->fdec->f_pir_position = h->fref0[0]->f_pir_position;
  718.              h->fdec->i_frames_since_pir = h->fref0[0]->i_frames_since_pir + pocdiff;
  719. -            if( h->fdec->i_frames_since_pir >= h->param.i_keyint_max )
  720. +            if( h->fdec->i_frames_since_pir >= h->param.i_keyint_max ||
  721. +                (h->b_queued_intra_refresh && h->fdec->f_pir_position + 0.5 >= h->sps->i_mb_width) )
  722.              {
  723.                  h->fdec->f_pir_position = 0;
  724.                  h->fdec->i_frames_since_pir = 0;
  725. +                h->b_queued_intra_refresh = 0;
  726.                  h->fenc->b_keyframe = 1;
  727.              }
  728.              h->fdec->i_pir_start_col = h->fdec->f_pir_position+0.5;
  729. diff --git a/x264.h b/x264.h
  730. index 83f087e..f568dc5 100644
  731. --- a/x264.h
  732. +++ b/x264.h
  733. @@ -35,7 +35,7 @@
  734.  
  735.  #include <stdarg.h>
  736.  
  737. -#define X264_BUILD 94
  738. +#define X264_BUILD 95
  739.  
  740.  /* x264_t:
  741.   *      opaque handler for encoder */
  742. @@ -639,5 +639,13 @@ void    x264_encoder_close  ( x264_t * );
  743.   *      return the number of currently delayed (buffered) frames
  744.   *      this should be used at the end of the stream, to know when you have all the encoded frames. */
  745.  int     x264_encoder_delayed_frames( x264_t * );
  746. +/* x264_encoder_intra_refresh:
  747. + *      If an intra refresh is not in progress, begin one with the next P-frame.
  748. + *      If an intra refresh is in progress, begin one as soon as the current one finishes.
  749. + *      Requires that b_intra_refresh be set.
  750. + *      Useful for interactive streaming where the client can tell the server that packet loss has
  751. + *      occurred.  In this case, keyint can be set to an extremely high value so that intra refreshes
  752. + *      only occur when calling x264_encoder_intra_refresh. */
  753. +void    x264_encoder_intra_refresh( x264_t * );
  754.  
  755.  #endif
  756. --
  757. 1.7.0.4
  758.  
  759.  
  760. From 548ea47cb5484a3754a1217e30b7640a12d061b5 Mon Sep 17 00:00:00 2001
  761. From: Henrik Gramner <hengar-6@student.ltu.se>
  762. Date: Mon, 10 May 2010 23:27:36 +0200
  763. Subject: [PATCH 4/9] Shrink even more constant arrays
  764.  
  765. ---
  766. common/arm/mc-c.c |    4 ++--
  767.  common/mc.c       |    4 ++--
  768.  common/ppc/mc.c   |    4 ++--
  769.  common/set.c      |   10 +++++-----
  770.  common/x86/mc-c.c |    4 ++--
  771.  encoder/encoder.c |    4 ++--
  772.  encoder/me.c      |    2 +-
  773.  encoder/set.c     |   14 +++++---------
  774.  8 files changed, 21 insertions(+), 25 deletions(-)
  775.  
  776. diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
  777. index 0a7b734..d294eff 100644
  778. --- a/common/arm/mc-c.c
  779. +++ b/common/arm/mc-c.c
  780. @@ -112,8 +112,8 @@ static void (* const x264_mc_copy_wtab_neon[5])( uint8_t *, int, uint8_t *, int,
  781.      x264_mc_copy_w16_neon,
  782.  };
  783.  
  784. -static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
  785. -static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
  786. +static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
  787. +static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
  788.  
  789.  static void mc_luma_neon( uint8_t *dst,    int i_dst_stride,
  790.                            uint8_t *src[4], int i_src_stride,
  791. diff --git a/common/mc.c b/common/mc.c
  792. index ada8bdc..e0dc659 100644
  793. --- a/common/mc.c
  794. +++ b/common/mc.c
  795. @@ -203,8 +203,8 @@ static void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *s
  796.      }
  797.  }
  798.  
  799. -static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
  800. -static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
  801. +static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
  802. +static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
  803.  
  804.  static void mc_luma( uint8_t *dst,    int i_dst_stride,
  805.                       uint8_t *src[4], int i_src_stride,
  806. diff --git a/common/ppc/mc.c b/common/ppc/mc.c
  807. index 26b81f8..83c60b1 100644
  808. --- a/common/ppc/mc.c
  809. +++ b/common/ppc/mc.c
  810. @@ -37,8 +37,8 @@ typedef void (*pf_mc_t)( uint8_t *src, int i_src,
  811.                           uint8_t *dst, int i_dst, int i_height );
  812.  
  813.  
  814. -static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
  815. -static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
  816. +static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
  817. +static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
  818.  
  819.  
  820.  static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
  821. diff --git a/common/set.c b/common/set.c
  822. index 50d4213..16cff8e 100644
  823. --- a/common/set.c
  824. +++ b/common/set.c
  825. @@ -23,7 +23,7 @@
  826.  #define SHIFT(x,s) ((s)<=0 ? (x)<<-(s) : ((x)+(1<<((s)-1)))>>(s))
  827.  #define DIV(n,d) (((n) + ((d)>>1)) / (d))
  828.  
  829. -static const int dequant4_scale[6][3] =
  830. +static const uint8_t dequant4_scale[6][3] =
  831.  {
  832.      { 10, 13, 16 },
  833.      { 11, 14, 18 },
  834. @@ -32,7 +32,7 @@ static const int dequant4_scale[6][3] =
  835.      { 16, 20, 25 },
  836.      { 18, 23, 29 }
  837.  };
  838. -static const int quant4_scale[6][3] =
  839. +static const uint16_t quant4_scale[6][3] =
  840.  {
  841.      { 13107, 8066, 5243 },
  842.      { 11916, 7490, 4660 },
  843. @@ -42,11 +42,11 @@ static const int quant4_scale[6][3] =
  844.      {  7282, 4559, 2893 },
  845.  };
  846.  
  847. -static const int quant8_scan[16] =
  848. +static const uint8_t quant8_scan[16] =
  849.  {
  850.      0,3,4,3, 3,1,5,1, 4,5,2,5, 3,1,5,1
  851.  };
  852. -static const int dequant8_scale[6][6] =
  853. +static const uint8_t dequant8_scale[6][6] =
  854.  {
  855.      { 20, 18, 32, 19, 25, 24 },
  856.      { 22, 19, 35, 21, 28, 26 },
  857. @@ -55,7 +55,7 @@ static const int dequant8_scale[6][6] =
  858.      { 32, 28, 51, 30, 40, 38 },
  859.      { 36, 32, 58, 34, 46, 43 },
  860.  };
  861. -static const int quant8_scale[6][6] =
  862. +static const uint16_t quant8_scale[6][6] =
  863.  {
  864.      { 13107, 11428, 20972, 12222, 16777, 15481 },
  865.      { 11916, 10826, 19174, 11058, 14980, 14290 },
  866. diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
  867. index 6d386f6..f641cff 100644
  868. --- a/common/x86/mc-c.c
  869. +++ b/common/x86/mc-c.c
  870. @@ -228,8 +228,8 @@ static void x264_weight_cache_ssse3( x264_t *h, x264_weight_t *w )
  871.      }
  872.  }
  873.  
  874. -static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
  875. -static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
  876. +static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
  877. +static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
  878.  
  879.  #define MC_LUMA(name,instr1,instr2)\
  880.  static void mc_luma_##name( uint8_t *dst,    int i_dst_stride,\
  881. diff --git a/encoder/encoder.c b/encoder/encoder.c
  882. index 42d49bf..e082024 100644
  883. --- a/encoder/encoder.c
  884. +++ b/encoder/encoder.c
  885. @@ -2816,8 +2816,8 @@ void    x264_encoder_close  ( x264_t *h )
  886.      /* Slices used and PSNR */
  887.      for( int i = 0; i < 5; i++ )
  888.      {
  889. -        static const int slice_order[] = { SLICE_TYPE_I, SLICE_TYPE_SI, SLICE_TYPE_P, SLICE_TYPE_SP, SLICE_TYPE_B };
  890. -        static const char *slice_name[] = { "P", "B", "I", "SP", "SI" };
  891. +        static const uint8_t slice_order[] = { SLICE_TYPE_I, SLICE_TYPE_SI, SLICE_TYPE_P, SLICE_TYPE_SP, SLICE_TYPE_B };
  892. +        static const char * const slice_name[] = { "P", "B", "I", "SP", "SI" };
  893.          int i_slice = slice_order[i];
  894.  
  895.          if( h->stat.i_frame_count[i_slice] > 0 )
  896. diff --git a/encoder/me.c b/encoder/me.c
  897. index 5e113f0..a35da53 100644
  898. --- a/encoder/me.c
  899. +++ b/encoder/me.c
  900. @@ -484,7 +484,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
  901.              int i = 1;
  902.              do
  903.              {
  904. -                static const int hex4[16][2] = {
  905. +                static const int8_t hex4[16][2] = {
  906.                      { 0,-4}, { 0, 4}, {-2,-3}, { 2,-3},
  907.                      {-4,-2}, { 4,-2}, {-4,-1}, { 4,-1},
  908.                      {-4, 0}, { 4, 0}, {-4, 1}, { 4, 1},
  909. diff --git a/encoder/set.c b/encoder/set.c
  910. index e3a071c..ce52a4b 100644
  911. --- a/encoder/set.c
  912. +++ b/encoder/set.c
  913. @@ -315,26 +315,22 @@ void x264_sps_write( bs_t *s, x264_sps_t *sps )
  914.          if( sps->vui.b_aspect_ratio_info_present )
  915.          {
  916.              int i;
  917. -            static const struct { int w, h; int sar; } sar[] =
  918. +            static const struct { uint8_t w, h, sar; } sar[] =
  919.              {
  920.                  { 1,   1, 1 }, { 12, 11, 2 }, { 10, 11, 3 }, { 16, 11, 4 },
  921.                  { 40, 33, 5 }, { 24, 11, 6 }, { 20, 11, 7 }, { 32, 11, 8 },
  922.                  { 80, 33, 9 }, { 18, 11, 10}, { 15, 11, 11}, { 64, 33, 12},
  923. -                { 160,99, 13}, { 0, 0, -1 }
  924. +                { 160,99, 13}, { 0, 0, 255 }
  925.              };
  926. -            for( i = 0; sar[i].sar != -1; i++ )
  927. +            for( i = 0; sar[i].sar != 255; i++ )
  928.              {
  929.                  if( sar[i].w == sps->vui.i_sar_width &&
  930.                      sar[i].h == sps->vui.i_sar_height )
  931.                      break;
  932.              }
  933. -            if( sar[i].sar != -1 )
  934. +            bs_write( s, 8, sar[i].sar );
  935. +            if( sar[i].sar == 255 ) /* aspect_ratio_idc (extended) */
  936.              {
  937. -                bs_write( s, 8, sar[i].sar );
  938. -            }
  939. -            else
  940. -            {
  941. -                bs_write( s, 8, 255);   /* aspect_ratio_idc (extended) */
  942.                  bs_write( s, 16, sps->vui.i_sar_width );
  943.                  bs_write( s, 16, sps->vui.i_sar_height );
  944.              }
  945. --
  946. 1.7.0.4
  947.  
  948.  
  949. From 5d1dd185510c753033ed841e55425eded293a10b Mon Sep 17 00:00:00 2001
  950. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  951. Date: Mon, 10 May 2010 22:59:12 -0700
  952. Subject: [PATCH 5/9] Fix condition for printing rc=cbr in options SEI
  953.  Also fix crf-max formatting.
  954.  
  955. ---
  956. common/common.c |    4 ++--
  957.  1 files changed, 2 insertions(+), 2 deletions(-)
  958.  
  959. diff --git a/common/common.c b/common/common.c
  960. index 848c6de..ad7cf98 100644
  961. --- a/common/common.c
  962. +++ b/common/common.c
  963. @@ -1237,7 +1237,7 @@ char *x264_param2string( x264_param_t *p, int b_res )
  964.          s += sprintf( s, " rc_lookahead=%d", p->rc.i_lookahead );
  965.  
  966.      s += sprintf( s, " rc=%s mbtree=%d", p->rc.i_rc_method == X264_RC_ABR ?
  967. -                               ( p->rc.b_stat_read ? "2pass" : p->rc.i_vbv_buffer_size == p->rc.i_bitrate ? "cbr" : "abr" )
  968. +                               ( p->rc.b_stat_read ? "2pass" : p->rc.i_vbv_max_bitrate == p->rc.i_bitrate ? "cbr" : "abr" )
  969.                                 : p->rc.i_rc_method == X264_RC_CRF ? "crf" : "cqp", p->rc.b_mb_tree );
  970.      if( p->rc.i_rc_method == X264_RC_ABR || p->rc.i_rc_method == X264_RC_CRF )
  971.      {
  972. @@ -1256,7 +1256,7 @@ char *x264_param2string( x264_param_t *p, int b_res )
  973.              s += sprintf( s, " vbv_maxrate=%d vbv_bufsize=%d",
  974.                            p->rc.i_vbv_max_bitrate, p->rc.i_vbv_buffer_size );
  975.              if( p->rc.i_rc_method == X264_RC_CRF )
  976. -                s += sprintf( s, " crf-max=%.1f", p->rc.f_rf_constant_max );
  977. +                s += sprintf( s, " crf_max=%.1f", p->rc.f_rf_constant_max );
  978.          }
  979.      }
  980.      else if( p->rc.i_rc_method == X264_RC_CQP )
  981. --
  982. 1.7.0.4
  983.  
  984.  
  985. From ffaf1e14b54d791f369fc51a534111ddd839c55d Mon Sep 17 00:00:00 2001
  986. From: Anton Mitrofanov <BugMaster@narod.ru>
  987. Date: Wed, 12 May 2010 01:57:38 +0400
  988. Subject: [PATCH 6/9] Fix crash with sliced-threads on Phenom
  989.  
  990. ---
  991. encoder/encoder.c |    4 ++++
  992.  1 files changed, 4 insertions(+), 0 deletions(-)
  993.  
  994. diff --git a/encoder/encoder.c b/encoder/encoder.c
  995. index e082024..3a5520f 100644
  996. --- a/encoder/encoder.c
  997. +++ b/encoder/encoder.c
  998. @@ -2066,6 +2066,10 @@ static void *x264_slices_write( x264_t *h )
  999.  static int x264_threaded_slices_write( x264_t *h )
  1000.  {
  1001.      void *ret = NULL;
  1002. +#ifdef HAVE_MMX
  1003. +    if( h->param.cpu&X264_CPU_SSE_MISALIGN )
  1004. +        x264_cpu_mask_misalign_sse();
  1005. +#endif
  1006.      /* set first/last mb and sync contexts */
  1007.      for( int i = 0; i < h->param.i_threads; i++ )
  1008.      {
  1009. --
  1010. 1.7.0.4
  1011.  
  1012.  
  1013. From ec937b4219673bdea810f00bd9cc91f5d174302b Mon Sep 17 00:00:00 2001
  1014. From: Anton Mitrofanov <BugMaster@narod.ru>
  1015. Date: Wed, 12 May 2010 22:05:34 +0400
  1016. Subject: [PATCH 7/9] Fix bitrate calculation in progress status
  1017.  Was slightly incorrect due to using pts, which is out of order.
  1018.  
  1019. ---
  1020. x264.c |   34 +++++++++++++++++++++++++---------
  1021.  1 files changed, 25 insertions(+), 9 deletions(-)
  1022.  
  1023. diff --git a/x264.c b/x264.c
  1024. index 8f4e372..1a85c74 100644
  1025. --- a/x264.c
  1026. +++ b/x264.c
  1027. @@ -1312,7 +1312,7 @@ static void parse_qpfile( cli_opt_t *opt, x264_picture_t *pic, int i_frame )
  1028.   * Encode:
  1029.   *****************************************************************************/
  1030.  
  1031. -static int  Encode_frame( x264_t *h, hnd_t hout, x264_picture_t *pic, int64_t *last_pts )
  1032. +static int  Encode_frame( x264_t *h, hnd_t hout, x264_picture_t *pic, int64_t *last_dts )
  1033.  {
  1034.      x264_picture_t pic_out;
  1035.      x264_nal_t *nal;
  1036. @@ -1330,18 +1330,22 @@ static int  Encode_frame( x264_t *h, hnd_t hout, x264_picture_t *pic, int64_t *l
  1037.      if( i_frame_size )
  1038.      {
  1039.          i_frame_size = output.write_frame( hout, nal[0].p_payload, i_frame_size, &pic_out );
  1040. -        *last_pts = pic_out.i_pts;
  1041. +        *last_dts = pic_out.i_dts;
  1042.      }
  1043.  
  1044.      return i_frame_size;
  1045.  }
  1046.  
  1047. -static void Print_status( int64_t i_start, int i_frame, int i_frame_total, int64_t i_file, x264_param_t *param, int64_t last_pts )
  1048. +static void Print_status( int64_t i_start, int i_frame, int i_frame_total, int64_t i_file, x264_param_t *param, int64_t last_ts )
  1049.  {
  1050.      char    buf[200];
  1051.      int64_t i_elapsed = x264_mdate() - i_start;
  1052.      double fps = i_elapsed > 0 ? i_frame * 1000000. / i_elapsed : 0;
  1053. -    double bitrate = (double) i_file * 8 / ( (double) last_pts * 1000 * param->i_timebase_num / param->i_timebase_den );
  1054. +    double bitrate;
  1055. +    if( last_ts )
  1056. +        bitrate = (double) i_file * 8 / ( (double) last_ts * 1000 * param->i_timebase_num / param->i_timebase_den );
  1057. +    else
  1058. +        bitrate = (double) i_file * 8 / ( (double) 1000 * param->i_fps_den / param->i_fps_num );
  1059.      if( i_frame_total )
  1060.      {
  1061.          int eta = i_elapsed * (i_frame_total - i_frame) / ((int64_t)i_frame * 1000000);
  1062. @@ -1369,7 +1373,9 @@ static int  Encode( x264_param_t *param, cli_opt_t *opt )
  1063.      int64_t i_file = 0;
  1064.      int     i_frame_size;
  1065.      int     i_update_interval;
  1066. -    int64_t last_pts = 0;
  1067. +    int64_t last_dts = 0;
  1068. +    int64_t prev_dts = 0;
  1069. +    int64_t first_dts = 0;
  1070.  #   define  MAX_PTS_WARNING 3 /* arbitrary */
  1071.      int     pts_warning_cnt = 0;
  1072.      int64_t largest_pts = -1;
  1073. @@ -1506,12 +1512,17 @@ static int  Encode( x264_param_t *param, cli_opt_t *opt )
  1074.              pic.i_qpplus1 = 0;
  1075.          }
  1076.  
  1077. -        i_frame_size = Encode_frame( h, opt->hout, &pic, &last_pts );
  1078. +        prev_dts = last_dts;
  1079. +        i_frame_size = Encode_frame( h, opt->hout, &pic, &last_dts );
  1080.          if( i_frame_size < 0 )
  1081.              return -1;
  1082.          i_file += i_frame_size;
  1083.          if( i_frame_size )
  1084. +        {
  1085.              i_frame_output++;
  1086. +            if( i_frame_output == 1 )
  1087. +                first_dts = prev_dts = last_dts;
  1088. +        }
  1089.  
  1090.          i_frame++;
  1091.  
  1092. @@ -1520,19 +1531,24 @@ static int  Encode( x264_param_t *param, cli_opt_t *opt )
  1093.  
  1094.          /* update status line (up to 1000 times per input file) */
  1095.          if( opt->b_progress && i_frame_output % i_update_interval == 0 && i_frame_output )
  1096. -            Print_status( i_start, i_frame_output, i_frame_total, i_file, param, last_pts );
  1097. +            Print_status( i_start, i_frame_output, i_frame_total, i_file, param, 2 * last_dts - prev_dts - first_dts );
  1098.      }
  1099.      /* Flush delayed frames */
  1100.      while( !b_ctrl_c && x264_encoder_delayed_frames( h ) )
  1101.      {
  1102. -        i_frame_size = Encode_frame( h, opt->hout, NULL, &last_pts );
  1103. +        prev_dts = last_dts;
  1104. +        i_frame_size = Encode_frame( h, opt->hout, NULL, &last_dts );
  1105.          if( i_frame_size < 0 )
  1106.              return -1;
  1107.          i_file += i_frame_size;
  1108.          if( i_frame_size )
  1109. +        {
  1110.              i_frame_output++;
  1111. +            if( i_frame_output == 1 )
  1112. +                first_dts = prev_dts = last_dts;
  1113. +        }
  1114.          if( opt->b_progress && i_frame_output % i_update_interval == 0 && i_frame_output )
  1115. -            Print_status( i_start, i_frame_output, i_frame_total, i_file, param, last_pts );
  1116. +            Print_status( i_start, i_frame_output, i_frame_total, i_file, param, 2 * last_dts - prev_dts - first_dts );
  1117.      }
  1118.      if( pts_warning_cnt >= MAX_PTS_WARNING && param->i_log_level < X264_LOG_DEBUG )
  1119.          fprintf( stderr, "x264 [warning]: %d suppressed nonmonotonic pts warnings\n", pts_warning_cnt-MAX_PTS_WARNING );
  1120. --
  1121. 1.7.0.4
  1122.  
  1123.  
  1124. From d1d7484aba046614add62e2bdc4da23e570525c3 Mon Sep 17 00:00:00 2001
  1125. From: Kieran Kunhya <kieran@kunhya.com>
  1126. Date: Thu, 13 May 2010 19:13:35 +0100
  1127. Subject: [PATCH 8/9] Fix typo in pulldown
  1128.  
  1129. ---
  1130. x264.c |    2 +-
  1131.  1 files changed, 1 insertions(+), 1 deletions(-)
  1132.  
  1133. diff --git a/x264.c b/x264.c
  1134. index 1a85c74..862aabb 100644
  1135. --- a/x264.c
  1136. +++ b/x264.c
  1137. @@ -120,7 +120,7 @@ enum pulldown_type_e
  1138.  
  1139.  static const cli_pulldown_t pulldown_values[] =
  1140.  {
  1141. -    [X264_PULLDOWN_22]     = {1,  {TB},                                   2.0},
  1142. +    [X264_PULLDOWN_22]     = {1,  {TB},                                   1.0},
  1143.      [X264_PULLDOWN_32]     = {4,  {TBT, BT, BTB, TB},                     1.25},
  1144.      [X264_PULLDOWN_64]     = {2,  {PIC_STRUCT_DOUBLE, PIC_STRUCT_TRIPLE}, 1.0},
  1145.      [X264_PULLDOWN_DOUBLE] = {1,  {PIC_STRUCT_DOUBLE},                    2.0},
  1146. --
  1147. 1.7.0.4
  1148.  
  1149.  
  1150. From a21e7bd854c8c441a081c4a353b02bf41454bb95 Mon Sep 17 00:00:00 2001
  1151. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  1152. Date: Sat, 15 May 2010 14:48:58 -0700
  1153. Subject: [PATCH 9/9] Overhaul CABAC: faster, less cache usage
  1154.  Horribly munge up the CABAC tables to allow deduplication of some data.
  1155.  Saves 256 bytes of L1d cache in non-RD, 512 bytes in RD.
  1156.  Add asm versions of bypass and terminal; save L1i cache by re-using putbyte code.
  1157.  Further optimize encode_decision.
  1158.  All 3 primary CABAC functions fit in under 256 bytes of code total on x86_64.
  1159.  
  1160. ---
  1161. common/cabac.c         |  185 ++++++++++++++++++++----------------------------
  1162.  common/cabac.h         |   22 ++++--
  1163.  common/x86/cabac-a.asm |   76 ++++++++++++++++----
  1164.  common/x86/x86inc.asm  |    2 +-
  1165.  encoder/rdo.c          |    2 +
  1166.  tools/checkasm.c       |   46 ++++++++++--
  1167.  6 files changed, 195 insertions(+), 138 deletions(-)
  1168.  
  1169. diff --git a/common/cabac.c b/common/cabac.c
  1170. index f50aef6..11988a1 100644
  1171. --- a/common/cabac.c
  1172. +++ b/common/cabac.c
  1173. @@ -664,75 +664,44 @@ static const int8_t x264_cabac_context_init_PB[3][460][2] =
  1174.      }
  1175.  };
  1176.  
  1177. -/* FIXME could avoid this duplication by reversing the order of states
  1178. - * with MPS=0, but that would uglify the other tables */
  1179. -const uint8_t x264_cabac_range_lps[128][4] =
  1180. +const uint8_t x264_cabac_range_lps[64][4] =
  1181.  {
  1182. -    {   2,   2,   2,   2 },
  1183. -    {   6,   7,   8,   9 }, {   6,   7,   9,  10 }, {   6,   8,   9,  11 },
  1184. -    {   7,   8,  10,  11 }, {   7,   9,  10,  12 }, {   7,   9,  11,  12 },
  1185. -    {   8,   9,  11,  13 }, {   8,  10,  12,  14 }, {   9,  11,  12,  14 },
  1186. -    {   9,  11,  13,  15 }, {  10,  12,  14,  16 }, {  10,  12,  15,  17 },
  1187. -    {  11,  13,  15,  18 }, {  11,  14,  16,  19 }, {  12,  14,  17,  20 },
  1188. -    {  12,  15,  18,  21 }, {  13,  16,  19,  22 }, {  14,  17,  20,  23 },
  1189. -    {  14,  18,  21,  24 }, {  15,  19,  22,  25 }, {  16,  20,  23,  27 },
  1190. -    {  17,  21,  25,  28 }, {  18,  22,  26,  30 }, {  19,  23,  27,  31 },
  1191. -    {  20,  24,  29,  33 }, {  21,  26,  30,  35 }, {  22,  27,  32,  37 },
  1192. -    {  23,  28,  33,  39 }, {  24,  30,  35,  41 }, {  26,  31,  37,  43 },
  1193. -    {  27,  33,  39,  45 }, {  29,  35,  41,  48 }, {  30,  37,  43,  50 },
  1194. -    {  32,  39,  46,  53 }, {  33,  41,  48,  56 }, {  35,  43,  51,  59 },
  1195. -    {  37,  45,  54,  62 }, {  39,  48,  56,  65 }, {  41,  50,  59,  69 },
  1196. -    {  43,  53,  63,  72 }, {  46,  56,  66,  76 }, {  48,  59,  69,  80 },
  1197. -    {  51,  62,  73,  85 }, {  53,  65,  77,  89 }, {  56,  69,  81,  94 },
  1198. -    {  59,  72,  86,  99 }, {  62,  76,  90, 104 }, {  66,  80,  95, 110 },
  1199. -    {  69,  85, 100, 116 }, {  73,  89, 105, 122 }, {  77,  94, 111, 128 },
  1200. -    {  81,  99, 117, 135 }, {  85, 104, 123, 142 }, {  90, 110, 130, 150 },
  1201. -    {  95, 116, 137, 158 }, { 100, 122, 144, 166 }, { 105, 128, 152, 175 },
  1202. -    { 111, 135, 160, 185 }, { 116, 142, 169, 195 }, { 123, 150, 178, 205 },
  1203. -    { 128, 158, 187, 216 }, { 128, 167, 197, 227 }, { 128, 176, 208, 240 },
  1204. -
  1205. -    { 128, 176, 208, 240 }, { 128, 167, 197, 227 }, { 128, 158, 187, 216 },
  1206. -    { 123, 150, 178, 205 }, { 116, 142, 169, 195 }, { 111, 135, 160, 185 },
  1207. -    { 105, 128, 152, 175 }, { 100, 122, 144, 166 }, {  95, 116, 137, 158 },
  1208. -    {  90, 110, 130, 150 }, {  85, 104, 123, 142 }, {  81,  99, 117, 135 },
  1209. -    {  77,  94, 111, 128 }, {  73,  89, 105, 122 }, {  69,  85, 100, 116 },
  1210. -    {  66,  80,  95, 110 }, {  62,  76,  90, 104 }, {  59,  72,  86,  99 },
  1211. -    {  56,  69,  81,  94 }, {  53,  65,  77,  89 }, {  51,  62,  73,  85 },
  1212. -    {  48,  59,  69,  80 }, {  46,  56,  66,  76 }, {  43,  53,  63,  72 },
  1213. -    {  41,  50,  59,  69 }, {  39,  48,  56,  65 }, {  37,  45,  54,  62 },
  1214. -    {  35,  43,  51,  59 }, {  33,  41,  48,  56 }, {  32,  39,  46,  53 },
  1215. -    {  30,  37,  43,  50 }, {  29,  35,  41,  48 }, {  27,  33,  39,  45 },
  1216. -    {  26,  31,  37,  43 }, {  24,  30,  35,  41 }, {  23,  28,  33,  39 },
  1217. -    {  22,  27,  32,  37 }, {  21,  26,  30,  35 }, {  20,  24,  29,  33 },
  1218. -    {  19,  23,  27,  31 }, {  18,  22,  26,  30 }, {  17,  21,  25,  28 },
  1219. -    {  16,  20,  23,  27 }, {  15,  19,  22,  25 }, {  14,  18,  21,  24 },
  1220. -    {  14,  17,  20,  23 }, {  13,  16,  19,  22 }, {  12,  15,  18,  21 },
  1221. -    {  12,  14,  17,  20 }, {  11,  14,  16,  19 }, {  11,  13,  15,  18 },
  1222. -    {  10,  12,  15,  17 }, {  10,  12,  14,  16 }, {   9,  11,  13,  15 },
  1223. -    {   9,  11,  12,  14 }, {   8,  10,  12,  14 }, {   8,   9,  11,  13 },
  1224. -    {   7,   9,  11,  12 }, {   7,   9,  10,  12 }, {   7,   8,  10,  11 },
  1225. -    {   6,   8,   9,  11 }, {   6,   7,   9,  10 }, {   6,   7,   8,   9 },
  1226. -    {   2,   2,   2,   2 },
  1227. +    {  2,   2,   2,   2}, {  6,   7,   8,   9}, {  6,   7,   9,  10}, {  6,   8,   9,  11},
  1228. +    {  7,   8,  10,  11}, {  7,   9,  10,  12}, {  7,   9,  11,  12}, {  8,   9,  11,  13},
  1229. +    {  8,  10,  12,  14}, {  9,  11,  12,  14}, {  9,  11,  13,  15}, { 10,  12,  14,  16},
  1230. +    { 10,  12,  15,  17}, { 11,  13,  15,  18}, { 11,  14,  16,  19}, { 12,  14,  17,  20},
  1231. +    { 12,  15,  18,  21}, { 13,  16,  19,  22}, { 14,  17,  20,  23}, { 14,  18,  21,  24},
  1232. +    { 15,  19,  22,  25}, { 16,  20,  23,  27}, { 17,  21,  25,  28}, { 18,  22,  26,  30},
  1233. +    { 19,  23,  27,  31}, { 20,  24,  29,  33}, { 21,  26,  30,  35}, { 22,  27,  32,  37},
  1234. +    { 23,  28,  33,  39}, { 24,  30,  35,  41}, { 26,  31,  37,  43}, { 27,  33,  39,  45},
  1235. +    { 29,  35,  41,  48}, { 30,  37,  43,  50}, { 32,  39,  46,  53}, { 33,  41,  48,  56},
  1236. +    { 35,  43,  51,  59}, { 37,  45,  54,  62}, { 39,  48,  56,  65}, { 41,  50,  59,  69},
  1237. +    { 43,  53,  63,  72}, { 46,  56,  66,  76}, { 48,  59,  69,  80}, { 51,  62,  73,  85},
  1238. +    { 53,  65,  77,  89}, { 56,  69,  81,  94}, { 59,  72,  86,  99}, { 62,  76,  90, 104},
  1239. +    { 66,  80,  95, 110}, { 69,  85, 100, 116}, { 73,  89, 105, 122}, { 77,  94, 111, 128},
  1240. +    { 81,  99, 117, 135}, { 85, 104, 123, 142}, { 90, 110, 130, 150}, { 95, 116, 137, 158},
  1241. +    {100, 122, 144, 166}, {105, 128, 152, 175}, {111, 135, 160, 185}, {116, 142, 169, 195},
  1242. +    {123, 150, 178, 205}, {128, 158, 187, 216}, {128, 167, 197, 227}, {128, 176, 208, 240}
  1243.  };
  1244.  
  1245.  const uint8_t x264_cabac_transition[128][2] =
  1246.  {
  1247. -    {  0,  0}, {  1, 25}, {  1, 25}, {  2, 26}, {  3, 26}, {  4, 26}, {  5, 27}, {  6, 27},
  1248. -    {  7, 27}, {  8, 28}, {  9, 28}, { 10, 28}, { 11, 29}, { 12, 29}, { 13, 30}, { 14, 30},
  1249. -    { 15, 30}, { 16, 31}, { 17, 31}, { 18, 32}, { 19, 33}, { 20, 33}, { 21, 33}, { 22, 34},
  1250. -    { 23, 34}, { 24, 35}, { 25, 36}, { 26, 36}, { 27, 37}, { 28, 37}, { 29, 38}, { 30, 39},
  1251. -    { 31, 39}, { 32, 40}, { 33, 41}, { 34, 41}, { 35, 42}, { 36, 42}, { 37, 44}, { 38, 44},
  1252. -    { 39, 45}, { 40, 45}, { 41, 47}, { 42, 47}, { 43, 48}, { 44, 48}, { 45, 50}, { 46, 50},
  1253. -    { 47, 51}, { 48, 52}, { 49, 52}, { 50, 54}, { 51, 54}, { 52, 55}, { 53, 56}, { 54, 57},
  1254. -    { 55, 58}, { 56, 59}, { 57, 59}, { 58, 61}, { 59, 61}, { 60, 62}, { 61, 63}, { 62, 64},
  1255. -    { 63, 65}, { 64, 66}, { 65, 67}, { 66, 68}, { 66, 69}, { 68, 70}, { 68, 71}, { 69, 72},
  1256. -    { 70, 73}, { 71, 74}, { 72, 75}, { 73, 76}, { 73, 77}, { 75, 78}, { 75, 79}, { 76, 80},
  1257. -    { 77, 81}, { 77, 82}, { 79, 83}, { 79, 84}, { 80, 85}, { 80, 86}, { 82, 87}, { 82, 88},
  1258. -    { 83, 89}, { 83, 90}, { 85, 91}, { 85, 92}, { 86, 93}, { 86, 94}, { 87, 95}, { 88, 96},
  1259. -    { 88, 97}, { 89, 98}, { 90, 99}, { 90,100}, { 91,101}, { 91,102}, { 92,103}, { 93,104},
  1260. -    { 93,105}, { 94,106}, { 94,107}, { 94,108}, { 95,109}, { 96,110}, { 96,111}, { 97,112},
  1261. -    { 97,113}, { 97,114}, { 98,115}, { 98,116}, { 99,117}, { 99,118}, { 99,119}, {100,120},
  1262. -    {100,121}, {100,122}, {101,123}, {101,124}, {101,125}, {102,126}, {102,126}, {127,127},
  1263. +    {  0,   0}, {  1,   1}, {  2,  50}, { 51,   3}, {  2,  50}, { 51,   3}, {  4,  52}, { 53,   5},
  1264. +    {  6,  52}, { 53,   7}, {  8,  52}, { 53,   9}, { 10,  54}, { 55,  11}, { 12,  54}, { 55,  13},
  1265. +    { 14,  54}, { 55,  15}, { 16,  56}, { 57,  17}, { 18,  56}, { 57,  19}, { 20,  56}, { 57,  21},
  1266. +    { 22,  58}, { 59,  23}, { 24,  58}, { 59,  25}, { 26,  60}, { 61,  27}, { 28,  60}, { 61,  29},
  1267. +    { 30,  60}, { 61,  31}, { 32,  62}, { 63,  33}, { 34,  62}, { 63,  35}, { 36,  64}, { 65,  37},
  1268. +    { 38,  66}, { 67,  39}, { 40,  66}, { 67,  41}, { 42,  66}, { 67,  43}, { 44,  68}, { 69,  45},
  1269. +    { 46,  68}, { 69,  47}, { 48,  70}, { 71,  49}, { 50,  72}, { 73,  51}, { 52,  72}, { 73,  53},
  1270. +    { 54,  74}, { 75,  55}, { 56,  74}, { 75,  57}, { 58,  76}, { 77,  59}, { 60,  78}, { 79,  61},
  1271. +    { 62,  78}, { 79,  63}, { 64,  80}, { 81,  65}, { 66,  82}, { 83,  67}, { 68,  82}, { 83,  69},
  1272. +    { 70,  84}, { 85,  71}, { 72,  84}, { 85,  73}, { 74,  88}, { 89,  75}, { 76,  88}, { 89,  77},
  1273. +    { 78,  90}, { 91,  79}, { 80,  90}, { 91,  81}, { 82,  94}, { 95,  83}, { 84,  94}, { 95,  85},
  1274. +    { 86,  96}, { 97,  87}, { 88,  96}, { 97,  89}, { 90, 100}, {101,  91}, { 92, 100}, {101,  93},
  1275. +    { 94, 102}, {103,  95}, { 96, 104}, {105,  97}, { 98, 104}, {105,  99}, {100, 108}, {109, 101},
  1276. +    {102, 108}, {109, 103}, {104, 110}, {111, 105}, {106, 112}, {113, 107}, {108, 114}, {115, 109},
  1277. +    {110, 116}, {117, 111}, {112, 118}, {119, 113}, {114, 118}, {119, 115}, {116, 122}, {123, 117},
  1278. +    {118, 122}, {123, 119}, {120, 124}, {125, 121}, {122, 126}, {127, 123}, {124, 127}, {126, 125}
  1279.  };
  1280.  
  1281.  const uint8_t x264_cabac_renorm_shift[64]= {
  1282. @@ -743,41 +712,40 @@ const uint8_t x264_cabac_renorm_shift[64]= {
  1283.  };
  1284.  
  1285.  /* -ln2(probability) */
  1286. -#define F(a,b) {FIX8(a),FIX8(b)}
  1287. -const uint16_t x264_cabac_entropy[128][2] =
  1288. +const uint16_t x264_cabac_entropy[128] = /* looked up as [state ^ bit]: even entries hold the cost of coding the state's MPS (bit 0 of the state), odd entries the cost of the LPS */
  1289.  {
  1290. -    F(0.0273,5.7370), F(0.0288,5.6618), F(0.0303,5.5866), F(0.0320,5.5114),
  1291. -    F(0.0337,5.4362), F(0.0355,5.3610), F(0.0375,5.2859), F(0.0395,5.2106),
  1292. -    F(0.0416,5.1354), F(0.0439,5.0602), F(0.0463,4.9851), F(0.0488,4.9099),
  1293. -    F(0.0515,4.8347), F(0.0543,4.7595), F(0.0572,4.6843), F(0.0604,4.6091),
  1294. -    F(0.0637,4.5339), F(0.0671,4.4588), F(0.0708,4.3836), F(0.0747,4.3083),
  1295. -    F(0.0788,4.2332), F(0.0832,4.1580), F(0.0878,4.0828), F(0.0926,4.0076),
  1296. -    F(0.0977,3.9324), F(0.1032,3.8572), F(0.1089,3.7820), F(0.1149,3.7068),
  1297. -    F(0.1214,3.6316), F(0.1282,3.5565), F(0.1353,3.4813), F(0.1429,3.4061),
  1298. -    F(0.1510,3.3309), F(0.1596,3.2557), F(0.1686,3.1805), F(0.1782,3.1053),
  1299. -    F(0.1884,3.0301), F(0.1992,2.9549), F(0.2107,2.8797), F(0.2229,2.8046),
  1300. -    F(0.2358,2.7294), F(0.2496,2.6542), F(0.2642,2.5790), F(0.2798,2.5038),
  1301. -    F(0.2964,2.4286), F(0.3142,2.3534), F(0.3331,2.2782), F(0.3532,2.2030),
  1302. -    F(0.3748,2.1278), F(0.3979,2.0527), F(0.4226,1.9775), F(0.4491,1.9023),
  1303. -    F(0.4776,1.8271), F(0.5082,1.7519), F(0.5412,1.6767), F(0.5768,1.6015),
  1304. -    F(0.6152,1.5263), F(0.6568,1.4511), F(0.7020,1.3759), F(0.7513,1.3008),
  1305. -    F(0.8050,1.2256), F(0.8638,1.1504), F(0.9285,1.0752), F(1.0000,1.0000),
  1306. -    F(1.0000,1.0000), F(1.0752,0.9285), F(1.1504,0.8638), F(1.2256,0.8050),
  1307. -    F(1.3008,0.7513), F(1.3759,0.7020), F(1.4511,0.6568), F(1.5263,0.6152),
  1308. -    F(1.6015,0.5768), F(1.6767,0.5412), F(1.7519,0.5082), F(1.8271,0.4776),
  1309. -    F(1.9023,0.4491), F(1.9775,0.4226), F(2.0527,0.3979), F(2.1278,0.3748),
  1310. -    F(2.2030,0.3532), F(2.2782,0.3331), F(2.3534,0.3142), F(2.4286,0.2964),
  1311. -    F(2.5038,0.2798), F(2.5790,0.2642), F(2.6542,0.2496), F(2.7294,0.2358),
  1312. -    F(2.8046,0.2229), F(2.8797,0.2107), F(2.9549,0.1992), F(3.0301,0.1884),
  1313. -    F(3.1053,0.1782), F(3.1805,0.1686), F(3.2557,0.1596), F(3.3309,0.1510),
  1314. -    F(3.4061,0.1429), F(3.4813,0.1353), F(3.5565,0.1282), F(3.6316,0.1214),
  1315. -    F(3.7068,0.1149), F(3.7820,0.1089), F(3.8572,0.1032), F(3.9324,0.0977),
  1316. -    F(4.0076,0.0926), F(4.0828,0.0878), F(4.1580,0.0832), F(4.2332,0.0788),
  1317. -    F(4.3083,0.0747), F(4.3836,0.0708), F(4.4588,0.0671), F(4.5339,0.0637),
  1318. -    F(4.6091,0.0604), F(4.6843,0.0572), F(4.7595,0.0543), F(4.8347,0.0515),
  1319. -    F(4.9099,0.0488), F(4.9851,0.0463), F(5.0602,0.0439), F(5.1354,0.0416),
  1320. -    F(5.2106,0.0395), F(5.2859,0.0375), F(5.3610,0.0355), F(5.4362,0.0337),
  1321. -    F(5.5114,0.0320), F(5.5866,0.0303), F(5.6618,0.0288), F(5.7370,0.0273),
  1322. +    FIX8(0.0273), FIX8(5.7370), FIX8(0.0288), FIX8(5.6618),
  1323. +    FIX8(0.0303), FIX8(5.5866), FIX8(0.0320), FIX8(5.5114),
  1324. +    FIX8(0.0337), FIX8(5.4362), FIX8(0.0355), FIX8(5.3610),
  1325. +    FIX8(0.0375), FIX8(5.2859), FIX8(0.0395), FIX8(5.2106),
  1326. +    FIX8(0.0416), FIX8(5.1354), FIX8(0.0439), FIX8(5.0602),
  1327. +    FIX8(0.0463), FIX8(4.9851), FIX8(0.0488), FIX8(4.9099),
  1328. +    FIX8(0.0515), FIX8(4.8347), FIX8(0.0543), FIX8(4.7595),
  1329. +    FIX8(0.0572), FIX8(4.6843), FIX8(0.0604), FIX8(4.6091),
  1330. +    FIX8(0.0637), FIX8(4.5339), FIX8(0.0671), FIX8(4.4588),
  1331. +    FIX8(0.0708), FIX8(4.3836), FIX8(0.0747), FIX8(4.3083),
  1332. +    FIX8(0.0788), FIX8(4.2332), FIX8(0.0832), FIX8(4.1580),
  1333. +    FIX8(0.0878), FIX8(4.0828), FIX8(0.0926), FIX8(4.0076),
  1334. +    FIX8(0.0977), FIX8(3.9324), FIX8(0.1032), FIX8(3.8572),
  1335. +    FIX8(0.1089), FIX8(3.7820), FIX8(0.1149), FIX8(3.7068),
  1336. +    FIX8(0.1214), FIX8(3.6316), FIX8(0.1282), FIX8(3.5565),
  1337. +    FIX8(0.1353), FIX8(3.4813), FIX8(0.1429), FIX8(3.4061),
  1338. +    FIX8(0.1510), FIX8(3.3309), FIX8(0.1596), FIX8(3.2557),
  1339. +    FIX8(0.1686), FIX8(3.1805), FIX8(0.1782), FIX8(3.1053),
  1340. +    FIX8(0.1884), FIX8(3.0301), FIX8(0.1992), FIX8(2.9549),
  1341. +    FIX8(0.2107), FIX8(2.8797), FIX8(0.2229), FIX8(2.8046),
  1342. +    FIX8(0.2358), FIX8(2.7294), FIX8(0.2496), FIX8(2.6542),
  1343. +    FIX8(0.2642), FIX8(2.5790), FIX8(0.2798), FIX8(2.5038),
  1344. +    FIX8(0.2964), FIX8(2.4286), FIX8(0.3142), FIX8(2.3534),
  1345. +    FIX8(0.3331), FIX8(2.2782), FIX8(0.3532), FIX8(2.2030),
  1346. +    FIX8(0.3748), FIX8(2.1278), FIX8(0.3979), FIX8(2.0527),
  1347. +    FIX8(0.4226), FIX8(1.9775), FIX8(0.4491), FIX8(1.9023),
  1348. +    FIX8(0.4776), FIX8(1.8271), FIX8(0.5082), FIX8(1.7519),
  1349. +    FIX8(0.5412), FIX8(1.6767), FIX8(0.5768), FIX8(1.6015),
  1350. +    FIX8(0.6152), FIX8(1.5263), FIX8(0.6568), FIX8(1.4511),
  1351. +    FIX8(0.7020), FIX8(1.3759), FIX8(0.7513), FIX8(1.3008),
  1352. +    FIX8(0.8050), FIX8(1.2256), FIX8(0.8638), FIX8(1.1504),
  1353. +    FIX8(0.9285), FIX8(1.0752), FIX8(1.0000), FIX8(1.0000)
  1354.  };
  1355.  
  1356.  
  1357. @@ -794,14 +762,17 @@ void x264_cabac_context_init( x264_cabac_t *cb, int i_slice_type, int i_qp, int
  1358.          cabac_context_init = &x264_cabac_context_init_PB[i_model];
  1359.  
  1360.      for( int i = 0; i < 460; i++ )
  1361. -        cb->state[i] = x264_clip3( (((*cabac_context_init)[i][0] * i_qp) >> 4) + (*cabac_context_init)[i][1], 1, 126 );
  1362. +    {
  1363. +        int state = x264_clip3( (((*cabac_context_init)[i][0] * i_qp) >> 4) + (*cabac_context_init)[i][1], 1, 126 );
  1364. +        cb->state[i] = (X264_MIN( state, 127-state ) << 1) | (state >> 6);
  1365. +    }
  1366.  }
  1367.  
  1368.  void x264_cabac_encode_init( x264_cabac_t *cb, uint8_t *p_data, uint8_t *p_end )
  1369.  {
  1370.      cb->i_low   = 0;
  1371.      cb->i_range = 0x01FE;
  1372. -    cb->i_queue = -1; // the first bit will be shifted away and not written
  1373. +    cb->i_queue = -9; // -1 biased by the -8 queue offset: the first bit will be shifted away and not written
  1374.      cb->i_bytes_outstanding = 0;
  1375.      cb->p_start = p_data;
  1376.      cb->p       = p_data;
  1377. @@ -810,10 +781,10 @@ void x264_cabac_encode_init( x264_cabac_t *cb, uint8_t *p_data, uint8_t *p_end )
  1378.  
  1379.  static inline void x264_cabac_putbyte( x264_cabac_t *cb )
  1380.  {
  1381. -    if( cb->i_queue >= 8 )
  1382. +    if( cb->i_queue >= 0 )
  1383.      {
  1384. -        int out = cb->i_low >> (cb->i_queue+2);
  1385. -        cb->i_low &= (4<<cb->i_queue)-1;
  1386. +        int out = cb->i_low >> (cb->i_queue+10);
  1387. +        cb->i_low &= (0x400<<cb->i_queue)-1;
  1388.          cb->i_queue -= 8;
  1389.  
  1390.          if( (out & 0xff) == 0xff )
  1391. @@ -855,9 +826,9 @@ static inline void x264_cabac_encode_renorm( x264_cabac_t *cb )
  1392.  void x264_cabac_encode_decision_c( x264_cabac_t *cb, int i_ctx, int b )
  1393.  {
  1394.      int i_state = cb->state[i_ctx];
  1395. -    int i_range_lps = x264_cabac_range_lps[i_state][(cb->i_range>>6)-4];
  1396. +    int i_range_lps = x264_cabac_range_lps[i_state>>1][(cb->i_range>>6)-4];
  1397.      cb->i_range -= i_range_lps;
  1398. -    if( b != (i_state >> 6) )
  1399. +    if( b != (i_state & 1) )
  1400.      {
  1401.          cb->i_low += cb->i_range;
  1402.          cb->i_range = i_range_lps;
  1403. @@ -866,7 +837,7 @@ void x264_cabac_encode_decision_c( x264_cabac_t *cb, int i_ctx, int b )
  1404.      x264_cabac_encode_renorm( cb );
  1405.  }
  1406.  
  1407. -void x264_cabac_encode_bypass( x264_cabac_t *cb, int b )
  1408. +void x264_cabac_encode_bypass_c( x264_cabac_t *cb, int b )
  1409.  {
  1410.      cb->i_low <<= 1;
  1411.      cb->i_low += -b & cb->i_range;
  1412. @@ -892,7 +863,7 @@ void x264_cabac_encode_ue_bypass( x264_cabac_t *cb, int exp_bits, int val )
  1413.      } while( k > 0 );
  1414.  }
  1415.  
  1416. -void x264_cabac_encode_terminal( x264_cabac_t *cb )
  1417. +void x264_cabac_encode_terminal_c( x264_cabac_t *cb )
  1418.  {
  1419.      cb->i_range -= 2;
  1420.      x264_cabac_encode_renorm( cb );
  1421. diff --git a/common/cabac.h b/common/cabac.h
  1422. index ef68fe6..9fc3007 100644
  1423. --- a/common/cabac.h
  1424. +++ b/common/cabac.h
  1425. @@ -31,7 +31,7 @@ typedef struct
  1426.      int i_range;
  1427.  
  1428.      /* bit stream */
  1429. -    int i_queue;
  1430. +    int i_queue; // stored with an offset of -8 for faster asm: "a byte is ready" becomes a sign test (queue >= 0) instead of a compare against 8
  1431.      int i_bytes_outstanding;
  1432.  
  1433.      uint8_t *p_start;
  1434. @@ -46,7 +46,7 @@ typedef struct
  1435.  } x264_cabac_t;
  1436.  
  1437.  extern const uint8_t x264_cabac_transition[128][2];
  1438. -extern const uint16_t x264_cabac_entropy[128][2];
  1439. +extern const uint16_t x264_cabac_entropy[128];
  1440.  
  1441.  /* init the contexts given i_slice_type, the quantif and the model */
  1442.  void x264_cabac_context_init( x264_cabac_t *cb, int i_slice_type, int i_qp, int i_model );
  1443. @@ -55,15 +55,21 @@ void x264_cabac_context_init( x264_cabac_t *cb, int i_slice_type, int i_qp, int
  1444.  void x264_cabac_encode_init ( x264_cabac_t *cb, uint8_t *p_data, uint8_t *p_end );
  1445.  void x264_cabac_encode_decision_c( x264_cabac_t *cb, int i_ctx, int b );
  1446.  void x264_cabac_encode_decision_asm( x264_cabac_t *cb, int i_ctx, int b );
  1447. -void x264_cabac_encode_bypass( x264_cabac_t *cb, int b );
  1448. +void x264_cabac_encode_bypass_c( x264_cabac_t *cb, int b );
  1449. +void x264_cabac_encode_bypass_asm( x264_cabac_t *cb, int b );
  1450. +void x264_cabac_encode_terminal_c( x264_cabac_t *cb );
  1451. +void x264_cabac_encode_terminal_asm( x264_cabac_t *cb );
  1452.  void x264_cabac_encode_ue_bypass( x264_cabac_t *cb, int exp_bits, int val );
  1453. -void x264_cabac_encode_terminal( x264_cabac_t *cb );
  1454.  void x264_cabac_encode_flush( x264_t *h, x264_cabac_t *cb );
  1455.  
  1456.  #ifdef HAVE_MMX
  1457.  #define x264_cabac_encode_decision x264_cabac_encode_decision_asm
  1458. +#define x264_cabac_encode_bypass x264_cabac_encode_bypass_asm
  1459. +#define x264_cabac_encode_terminal x264_cabac_encode_terminal_asm
  1460.  #else
  1461.  #define x264_cabac_encode_decision x264_cabac_encode_decision_c
  1462. +#define x264_cabac_encode_bypass x264_cabac_encode_bypass_c
  1463. +#define x264_cabac_encode_terminal x264_cabac_encode_terminal_c
  1464.  #endif
  1465.  #define x264_cabac_encode_decision_noup x264_cabac_encode_decision
  1466.  
  1467. @@ -78,25 +84,25 @@ static ALWAYS_INLINE void x264_cabac_size_decision( x264_cabac_t *cb, long i_ctx
  1468.  {
  1469.      int i_state = cb->state[i_ctx];
  1470.      cb->state[i_ctx] = x264_cabac_transition[i_state][b];
  1471. -    cb->f8_bits_encoded += x264_cabac_entropy[i_state][b];
  1472. +    cb->f8_bits_encoded += x264_cabac_entropy[i_state^b];
  1473.  }
  1474.  
  1475.  static ALWAYS_INLINE int x264_cabac_size_decision2( uint8_t *state, long b )
  1476.  {
  1477.      int i_state = *state;
  1478.      *state = x264_cabac_transition[i_state][b];
  1479. -    return x264_cabac_entropy[i_state][b];
  1480. +    return x264_cabac_entropy[i_state^b]; /* cost of coding b in the pre-update state; bit 0 of the state is its MPS, so for b in {0,1} the index is even when b is the MPS and odd when it is the LPS */
  1481.  }
  1482.  
  1483.  static ALWAYS_INLINE void x264_cabac_size_decision_noup( x264_cabac_t *cb, long i_ctx, long b )
  1484.  {
  1485.      int i_state = cb->state[i_ctx];
  1486. -    cb->f8_bits_encoded += x264_cabac_entropy[i_state][b];
  1487. +    cb->f8_bits_encoded += x264_cabac_entropy[i_state^b]; /* accumulate the cost of b without touching cb->state; state^b selects the MPS (even) or LPS (odd) entry */
  1488.  }
  1489.  
  1490.  static ALWAYS_INLINE int x264_cabac_size_decision_noup2( uint8_t *state, long b )
  1491.  {
  1492. -    return x264_cabac_entropy[*state][b];
  1493. +    return x264_cabac_entropy[*state^b]; /* read-only cost lookup: *state^b is even when b equals the state's MPS bit, odd otherwise */
  1494.  }
  1495.  
  1496.  #endif
  1497. diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm
  1498. index 2af98c7..8621c5b 100644
  1499. --- a/common/x86/cabac-a.asm
  1500. +++ b/common/x86/cabac-a.asm
  1501. @@ -32,13 +32,13 @@ cextern cabac_renorm_shift
  1502.  
  1503.  ; t3 must be ecx, since it's used for shift.
  1504.  %ifdef WIN64
  1505. -    DECLARE_REG_TMP 3,1,2,0,4,5,6,10
  1506. +    DECLARE_REG_TMP 3,1,2,0,4,5,6,10,2
  1507.      %define pointer resq
  1508.  %elifdef ARCH_X86_64
  1509. -    DECLARE_REG_TMP 0,1,2,3,4,5,6,10
  1510. +    DECLARE_REG_TMP 0,1,2,3,4,5,6,10,6
  1511.      %define pointer resq
  1512.  %else
  1513. -    DECLARE_REG_TMP 0,4,2,1,3,5,6,2
  1514. +    DECLARE_REG_TMP 0,4,2,1,3,5,6,2,2
  1515.      %define pointer resd
  1516.  %endif
  1517.  
  1518. @@ -72,13 +72,15 @@ cglobal cabac_encode_decision_asm, 0,7
  1519.      movifnidn t0,  r0mp
  1520.      movifnidn t1d, r1m
  1521.      mov   t5d, [t0+cb.range]
  1522. -    movzx t6d, byte [t0+cb.state+t1]
  1523. +    movzx t4d, byte [t0+cb.state+t1]
  1524.      mov   t3d, t5d
  1525. +    mov   t6d, t4d
  1526.      shr   t5d, 6
  1527. +    shr   t4d, 1
  1528.      movifnidn t2d, r2m
  1529. -    LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t6*4
  1530. +    LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*4
  1531.      LOAD_GLOBAL t4d, cabac_transition, t2, t6*2
  1532. -    shr   t6d, 6
  1533. +    and   t6d, 1
  1534.      sub   t3d, t5d
  1535.      cmp   t6d, t2d
  1536.      mov   t6d, [t0+cb.low]
  1537. @@ -94,20 +96,66 @@ cglobal cabac_encode_decision_asm, 0,7
  1538.      shl   t6d, t3b
  1539.      add   t3d, [t0+cb.queue]
  1540.      mov   [t0+cb.range], t4d
  1541. -    cmp   t3d, 8
  1542. -    jl .update_queue_low
  1543. -;cabac_putbyte
  1544. +    jge cabac_putbyte
  1545. +.update_queue_low:
  1546. +    mov   [t0+cb.low], t6d
  1547. +    mov   [t0+cb.queue], t3d
  1548. +    RET
  1549. +
  1550. +cglobal cabac_encode_bypass_asm, 0,3 ; asm counterpart of x264_cabac_encode_bypass_c( cb, b )
  1551. +    movifnidn  t0, r0mp ; t0 = cb
  1552. +    movifnidn t3d, r1m ; t3 = b
  1553. +    neg       t3d ; t3 = -b
  1554. +    mov       t8d, [t0+cb.low]
  1555. +    and       t3d, [t0+cb.range] ; t3 = -b & range
  1556. +    lea       t8d, [t8*2+t3] ; low = (low<<1) + (-b & range), the same arithmetic as the C version
  1557. +    mov       t3d, [t0+cb.queue]
  1558. +    inc       t3d ; queue+1; the flags set here are what the jge below tests
  1559. +%ifdef UNIX64 ; .putbyte compiles to nothing but a jmp
  1560. +    jge cabac_putbyte ; queue >= 0: a byte is ready; t8 and t6 map to the same register in the x86-64 non-WIN64 DECLARE_REG_TMP list
  1561. +%else
  1562. +    jge .putbyte
  1563. +%endif
  1564. +    mov   [t0+cb.low], t8d ; no byte ready yet: just store the updated low and queue
  1565. +    mov   [t0+cb.queue], t3d
  1566. +    RET
  1567. +.putbyte:
  1568. +    PROLOGUE 0,7 ; NOTE(review): presumably switches to the 0,7 register setup of cabac_encode_decision_asm, whose RET cabac_putbyte ends up using -- confirm against x86inc.asm
  1569. +    movifnidn t6d, t8d ; cabac_putbyte expects low in t6
  1570. +    jmp cabac_putbyte
  1571. +
  1572. +cglobal cabac_encode_terminal_asm, 0,3 ; asm counterpart of x264_cabac_encode_terminal_c( cb )
  1573. +    movifnidn  t0, r0mp ; t0 = cb
  1574. +    sub  dword [t0+cb.range], 2 ; range -= 2, as in the C version
  1575. +; shortcut: the renormalization shift in terminal
  1576. +; can only be 0 or 1 and is zero over 99% of the time.
  1577. +    test dword [t0+cb.range], 0x100 ; bit 8 still set: shift is 0, nothing more to do
  1578. +    je .renorm
  1579. +    REP_RET
  1580. +.renorm:
  1581. +    shl  dword [t0+cb.low], 1 ; shift is 1: double low and range in place
  1582. +    shl  dword [t0+cb.range], 1
  1583. +    inc  dword [t0+cb.queue] ; one more bit queued; the jge below is taken once queue reaches 0
  1584. +    jge .putbyte
  1585. +    REP_RET
  1586. +.putbyte:
  1587. +    PROLOGUE 0,7 ; NOTE(review): presumably switches to the 0,7 register setup of cabac_encode_decision_asm, whose RET cabac_putbyte ends up using -- confirm against x86inc.asm
  1588. +    mov t3d, [t0+cb.queue] ; cabac_putbyte expects queue in t3 ...
  1589. +    mov t6d, [t0+cb.low] ; ... and low in t6
  1590. +    jmp cabac_putbyte
  1591. +
  1592. +cabac_putbyte:
  1593.      ; alive: t0=cb t3=queue t6=low
  1594.  %ifdef WIN64
  1595.      DECLARE_REG_TMP 3,4,1,0,2,5,6,10
  1596.  %endif
  1597.      mov   t1d, -1
  1598. -    add   t3d, 2
  1599. +    add   t3d, 10
  1600.      mov   t2d, t6d
  1601.      shl   t1d, t3b
  1602.      shr   t2d, t3b ; out
  1603.      not   t1d
  1604. -    sub   t3d, 10
  1605. +    sub   t3d, 18
  1606.      and   t6d, t1d
  1607.      mov   t5d, [t0+cb.bytes_outstanding]
  1608.      cmp   t2b, 0xff ; FIXME is a 32bit op faster?
  1609. @@ -125,8 +173,4 @@ cglobal cabac_encode_decision_asm, 0,7
  1610.  .postpone:
  1611.      inc   t5d
  1612.      mov   [t0+cb.bytes_outstanding], t5d
  1613. -.update_queue_low:
  1614. -    mov   [t0+cb.low], t6d
  1615. -    mov   [t0+cb.queue], t3d
  1616. -    RET
  1617. -
  1618. +    jmp mangle(x264_cabac_encode_decision_asm.update_queue_low)
  1619. diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
  1620. index 9d23640..f006f37 100644
  1621. --- a/common/x86/x86inc.asm
  1622. +++ b/common/x86/x86inc.asm
  1623. @@ -171,7 +171,7 @@ DECLARE_REG_SIZE bp, bpl
  1624.      %endrep
  1625.  %endmacro
  1626.  
  1627. -DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7
  1628. +DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9
  1629.  
  1630.  %ifdef ARCH_X86_64
  1631.      %define gprsize 8
  1632. diff --git a/encoder/rdo.c b/encoder/rdo.c
  1633. index 4d83b6a..574a484 100644
  1634. --- a/encoder/rdo.c
  1635. +++ b/encoder/rdo.c
  1636. @@ -50,6 +50,8 @@ static uint16_t cabac_size_5ones[128];
  1637.   * fractional bits, but only finite precision. */
  1638.  #undef  x264_cabac_encode_decision
  1639.  #undef  x264_cabac_encode_decision_noup
  1640. +#undef  x264_cabac_encode_bypass
  1641. +#undef  x264_cabac_encode_terminal
  1642.  #define x264_cabac_encode_decision(c,x,v) x264_cabac_size_decision(c,x,v)
  1643.  #define x264_cabac_encode_decision_noup(c,x,v) x264_cabac_size_decision_noup(c,x,v)
  1644.  #define x264_cabac_encode_terminal(c)     ((c)->f8_bits_encoded += 7)
  1645. diff --git a/tools/checkasm.c b/tools/checkasm.c
  1646. index 2008d2f..9bc15c8 100644
  1647. --- a/tools/checkasm.c
  1648. +++ b/tools/checkasm.c
  1649. @@ -1556,32 +1556,66 @@ static int check_intra( int cpu_ref, int cpu_new )
  1650.  }
  1651.  
  1652.  #define DECL_CABAC(cpu) \
  1653. -static void run_cabac_##cpu( uint8_t *dst )\
  1654. +static void run_cabac_decision_##cpu( uint8_t *dst )\
  1655.  {\
  1656.      x264_cabac_t cb;\
  1657.      x264_cabac_context_init( &cb, SLICE_TYPE_P, 26, 0 );\
  1658.      x264_cabac_encode_init( &cb, dst, dst+0xff0 );\
  1659.      for( int i = 0; i < 0x1000; i++ )\
  1660.          x264_cabac_encode_decision_##cpu( &cb, buf1[i]>>1, buf1[i]&1 );\
  1661. +}\
  1662. +static void run_cabac_bypass_##cpu( uint8_t *dst )\
  1663. +{\
  1664. +    x264_cabac_t cb;\
  1665. +    x264_cabac_context_init( &cb, SLICE_TYPE_P, 26, 0 );\
  1666. +    x264_cabac_encode_init( &cb, dst, dst+0xff0 );\
  1667. +    for( int i = 0; i < 0x1000; i++ )\
  1668. +        x264_cabac_encode_bypass_##cpu( &cb, buf1[i]&1 );\
  1669. +}\
  1670. +static void run_cabac_terminal_##cpu( uint8_t *dst )\
  1671. +{\
  1672. +    x264_cabac_t cb;\
  1673. +    x264_cabac_context_init( &cb, SLICE_TYPE_P, 26, 0 );\
  1674. +    x264_cabac_encode_init( &cb, dst, dst+0xff0 );\
  1675. +    for( int i = 0; i < 0x1000; i++ )\
  1676. +        x264_cabac_encode_terminal_##cpu( &cb );\
  1677.  }
  1678.  DECL_CABAC(c)
  1679.  #ifdef HAVE_MMX
  1680.  DECL_CABAC(asm)
  1681.  #else
  1682. -#define run_cabac_asm run_cabac_c
  1683. +#define run_cabac_decision_asm run_cabac_decision_c
  1684. +#define run_cabac_bypass_asm run_cabac_bypass_c
  1685. +#define run_cabac_terminal_asm run_cabac_terminal_c
  1686.  #endif
  1687.  
  1688.  static int check_cabac( int cpu_ref, int cpu_new )
  1689.  {
  1690.      int ret = 0, ok, used_asm = 1;
  1691. -    if( cpu_ref || run_cabac_c == run_cabac_asm)
  1692. +    if( cpu_ref || run_cabac_decision_c == run_cabac_decision_asm ) /* skip when cpu_ref is non-zero or when the asm runner is just the C one (the run_cabac_*_asm #defines used without HAVE_MMX) */
  1693.          return 0;
  1694. +
  1695.      set_func_name( "cabac_encode_decision" );
  1696.      memcpy( buf4, buf3, 0x1000 );
  1697. -    call_c( run_cabac_c, buf3 );
  1698. -    call_a( run_cabac_asm, buf4 );
  1699. +    call_c( run_cabac_decision_c, buf3 ); /* C version writes into buf3 */
  1700. +    call_a( run_cabac_decision_asm, buf4 ); /* asm version writes into buf4 */
  1701. +    ok = !memcmp( buf3, buf4, 0x1000 ); /* ok only if the first 0x1000 bytes of both outputs are identical */
  1702. +    report( "cabac decision:" );
  1703. +
  1704. +    set_func_name( "cabac_encode_bypass" );
  1705. +    memcpy( buf4, buf3, 0x1000 ); /* give buf4 the same starting contents as buf3 before the two runs */
  1706. +    call_c( run_cabac_bypass_c, buf3 );
  1707. +    call_a( run_cabac_bypass_asm, buf4 );
  1708.      ok = !memcmp( buf3, buf4, 0x1000 );
  1709. -    report( "cabac :" );
  1710. +    report( "cabac bypass:" );
  1711. +
  1712. +    set_func_name( "cabac_encode_terminal" );
  1713. +    memcpy( buf4, buf3, 0x1000 ); /* give buf4 the same starting contents as buf3 before the two runs */
  1714. +    call_c( run_cabac_terminal_c, buf3 );
  1715. +    call_a( run_cabac_terminal_asm, buf4 );
  1716. +    ok = !memcmp( buf3, buf4, 0x1000 ); /* ok only if the first 0x1000 bytes of both outputs are identical */
  1717. +    report( "cabac terminal:" );
  1718. +
  1719.      return ret;
  1720.  }
  1721.  
  1722. --
  1723. 1.7.0.4
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement