Untitled

From 09b5a943c62212447a0151dfd5324f8e36715272 Mon Sep 17 00:00:00 2001
From: Anton Mitrofanov <BugMaster@narod.ru>
Date: Thu, 6 May 2010 10:03:31 -0700
Subject: [PATCH 1/9] More cosmetics

---
 common/cpu.c           |    4 +-
 common/macroblock.c    |    6 +++-
 common/mc.c            |    4 +-
 common/mvpred.c        |   12 ++++----
 common/ppc/dct.c       |    2 +-
 common/ppc/mc.c        |   12 ++++----
 common/ppc/ppccommon.h |    8 +++---
 common/ppc/quant.c     |    6 ++--
 common/predict.c       |    2 +-
 common/x86/const-a.asm |    2 +-
 common/x86/mc-c.c      |    2 +-
 common/x86/predict-c.c |    2 +-
 encoder/cabac.c        |    8 +++---
 encoder/me.c           |   18 ++++++------
 input/avs.c            |    2 +-
 tools/checkasm.c       |   66 ++++++++++++++++++++++++------------------------
 16 files changed, 79 insertions(+), 77 deletions(-)

diff --git a/common/cpu.c b/common/cpu.c
index 904eedc..933a754 100644
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -87,8 +87,8 @@ static void sigill_handler( int sig )
 #endif

 #ifdef HAVE_MMX
-extern int  x264_cpu_cpuid_test( void );
-extern uint32_t  x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
+int x264_cpu_cpuid_test( void );
+uint32_t x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );

 uint32_t x264_cpu_detect( void )
 {
diff --git a/common/macroblock.c b/common/macroblock.c
index f402588..110c3a5 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -295,7 +295,8 @@ int x264_macroblock_cache_allocate( x264_t *h )
     }

     return 0;
-fail: return -1;
+fail:
+    return -1;
 }
 void x264_macroblock_cache_free( x264_t *h )
 {
@@ -348,7 +349,8 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
     CHECKED_MALLOC( h->scratch_buffer, scratch_size );

     return 0;
-fail: return -1;
+fail:
+    return -1;
 }

 void x264_macroblock_thread_free( x264_t *h, int b_lookahead )
diff --git a/common/mc.c b/common/mc.c
index ad7fe79..ada8bdc 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -97,9 +97,9 @@ static void name( uint8_t *pix1, int i_stride_pix1, \
                   uint8_t *pix2, int i_stride_pix2, \
                   uint8_t *pix3, int i_stride_pix3, int weight ) \
 { \
-    if( weight == 32 )\
+    if( weight == 32 ) \
         pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height ); \
-    else\
+    else \
         pixel_avg_weight_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height, weight ); \
 }
 PIXEL_AVG_C( pixel_avg_16x16, 16, 16 )
diff --git a/common/mvpred.c b/common/mvpred.c
index de91826..54b4d5a 100755
--- a/common/mvpred.c
+++ b/common/mvpred.c
@@ -394,7 +394,7 @@ void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[
     int16_t (*mvr)[2] = h->mb.mvr[i_list][i_ref];
     int i = 0;

-#define SET_MVP(mvp)\
+#define SET_MVP(mvp) \
     { \
         CP32( mvc[i], mvp ); \
         i++; \
@@ -445,13 +445,13 @@ void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[
         if( h->sh.b_mbaff && field^(i_ref&1) )
             refpoc += h->sh.i_delta_poc_bottom;

-#define SET_TMVP( dx, dy )\
+#define SET_TMVP( dx, dy ) \
         { \
             int mb_index = h->mb.i_mb_xy + dx + dy*h->mb.i_mb_stride; \
-            int scale = (curpoc - refpoc) * l0->inv_ref_poc[h->mb.b_interlaced&field];\
-            mvc[i][0] = (l0->mv16x16[mb_index][0]*scale + 128) >> 8;\
-            mvc[i][1] = (l0->mv16x16[mb_index][1]*scale + 128) >> 8;\
-            i++;\
+            int scale = (curpoc - refpoc) * l0->inv_ref_poc[h->mb.b_interlaced&field]; \
+            mvc[i][0] = (l0->mv16x16[mb_index][0]*scale + 128) >> 8; \
+            mvc[i][1] = (l0->mv16x16[mb_index][1]*scale + 128) >> 8; \
+            i++; \
         }

         SET_TMVP(0,0);
diff --git a/common/ppc/dct.c b/common/ppc/dct.c
index fdadf53..eb223ae 100644
--- a/common/ppc/dct.c
+++ b/common/ppc/dct.c
@@ -205,7 +205,7 @@ void x264_sub8x8_dct8_altivec( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
     vec_st( dct_tr1v, 16,  (signed short *)dct );
     vec_st( dct_tr2v, 32,  (signed short *)dct );
     vec_st( dct_tr3v, 48,  (signed short *)dct );
-
+
     vec_st( dct_tr4v, 64,  (signed short *)dct );
     vec_st( dct_tr5v, 80,  (signed short *)dct );
     vec_st( dct_tr6v, 96,  (signed short *)dct );
diff --git a/common/ppc/mc.c b/common/ppc/mc.c
index dfe250a..26b81f8 100644
--- a/common/ppc/mc.c
+++ b/common/ppc/mc.c
@@ -291,8 +291,8 @@ static void mc_chroma_2xh( uint8_t *dst, int i_dst_stride,
  }


-#define DO_PROCESS_W4( a )  \
-    dstv_16A = vec_mladd( src##a##v_16A, coeff##a##v, dstv_16A );   \
+#define DO_PROCESS_W4( a ) \
+    dstv_16A = vec_mladd( src##a##v_16A, coeff##a##v, dstv_16A ); \
     dstv_16B = vec_mladd( src##a##v_16B, coeff##a##v, dstv_16B )

 static void mc_chroma_altivec_4xh( uint8_t *dst, int i_dst_stride,
@@ -369,10 +369,10 @@ static void mc_chroma_altivec_4xh( uint8_t *dst, int i_dst_stride,
     }
 }

-#define DO_PROCESS_W8( a )  \
-    src##a##v_16A = vec_u8_to_u16( src##a##v_8A );  \
-    src##a##v_16B = vec_u8_to_u16( src##a##v_8B );  \
-    dstv_16A = vec_mladd( src##a##v_16A, coeff##a##v, dstv_16A );   \
+#define DO_PROCESS_W8( a ) \
+    src##a##v_16A = vec_u8_to_u16( src##a##v_8A ); \
+    src##a##v_16B = vec_u8_to_u16( src##a##v_8B ); \
+    dstv_16A = vec_mladd( src##a##v_16A, coeff##a##v, dstv_16A ); \
     dstv_16B = vec_mladd( src##a##v_16B, coeff##a##v, dstv_16B )

 static void mc_chroma_altivec_8xh( uint8_t *dst, int i_dst_stride,
diff --git a/common/ppc/ppccommon.h b/common/ppc/ppccommon.h
index 510ab26..e61afaa 100644
--- a/common/ppc/ppccommon.h
+++ b/common/ppc/ppccommon.h
@@ -113,13 +113,13 @@ typedef union {
     vec_u8_t _hv, _lv

 #define PREP_LOAD_SRC( src )              \
-    vec_u8_t _##src##_ = vec_lvsl(0, src)
+    vec_u8_t _##src##_ = vec_lvsl(0, src)

 #define VEC_LOAD_G( p, v, n, t )                 \
     _hv = vec_ld( 0, p );                        \
     v   = (t) vec_lvsl( 0, p );                  \
     _lv = vec_ld( n - 1, p );                    \
-    v   = (t) vec_perm( _hv, _lv, (vec_u8_t) v )
+    v   = (t) vec_perm( _hv, _lv, (vec_u8_t) v )

 #define VEC_LOAD( p, v, n, t, g )                   \
     _hv = vec_ld( 0, p );                           \
@@ -134,7 +134,7 @@ typedef union {
 #define VEC_LOAD_PARTIAL( p, v, n, t, g)               \
     _hv = vec_ld( 0, p);                               \
     v   = (t) vec_perm( _hv, _hv, (vec_u8_t) _##g##_ )
-
+

 /***********************************************************************
  * PREP_STORE##n: declares required vectors to store n bytes to a
@@ -155,7 +155,7 @@ typedef union {
     _lv    = vec_perm( (vec_u8_t) v, _tmp1v, _##o##r_ ); \
     vec_st( _lv, 15, (uint8_t *) p );                    \
     _hv    = vec_perm( _tmp1v, (vec_u8_t) v, _##o##r_ ); \
-    vec_st( _hv, 0, (uint8_t *) p )
+    vec_st( _hv, 0, (uint8_t *) p )


 #define PREP_STORE8 \
diff --git a/common/ppc/quant.c b/common/ppc/quant.c
index 4b2825c..6f41a06 100644
--- a/common/ppc/quant.c
+++ b/common/ppc/quant.c
@@ -20,7 +20,7 @@

 #include "common/common.h"
 #include "ppccommon.h"
-#include "quant.h"
+#include "quant.h"

 // quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
 #define QUANT_16_U( idx0, idx1 )                                    \
@@ -55,7 +55,7 @@
     nz = vec_or(nz, vec_or(temp1v, temp2v));                        \
     vec_st(temp2v, (idx1), (int16_t*)dct);                          \
 }
-
+
 int x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
 {
     LOAD_ZERO;
@@ -220,7 +220,7 @@ int x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64
     vec_u16_t biasvB;

     vec_s16_t temp1v, temp2v;
-
+
     vec_u32_u qbits_u;
     qbits_u.s[0]=16;
     i_qbitsv = vec_splat(qbits_u.v, 0);
diff --git a/common/predict.c b/common/predict.c
index 783cc9b..f120ca7 100644
--- a/common/predict.c
+++ b/common/predict.c
@@ -41,7 +41,7 @@
  * 16x16 prediction for intra luma block
  ****************************************************************************/

-#define PREDICT_16x16_DC(v) \
+#define PREDICT_16x16_DC(v)\
     for( int i = 0; i < 16; i++ )\
     {\
         M32( src+ 0 ) = v;\
diff --git a/common/x86/const-a.asm b/common/x86/const-a.asm
index 79bbf1b..99a34be 100755
--- a/common/x86/const-a.asm
+++ b/common/x86/const-a.asm
@@ -43,7 +43,7 @@ const pw_64,       times 8 dw 64
 const pw_32_0,     times 4 dw 32,
                    times 4 dw 0
 const pw_8000,     times 8 dw 0x8000
-const pw_3fff,   times 8 dw 0x3fff
+const pw_3fff,     times 8 dw 0x3fff

 const pd_1,        times 4 dd 1
 const pd_128,      times 4 dd 128
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index fb73562..6d386f6 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -103,7 +103,7 @@ void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
 void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride );
 void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                       uint16_t *inter_costs, uint16_t *inv_qscales, int len );
-#define LOWRES(cpu) \
+#define LOWRES(cpu)\
 void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
                                         int src_stride, int dst_stride, int width, int height );
 LOWRES(mmxext)
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
index 6fa7e3b..0e3e1c7 100644
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -326,7 +326,7 @@ static void x264_predict_8x8_vr_mmxext( uint8_t *src, uint8_t edge[33] )
     t=e; e+=f; f-=t;\
     t=g; g+=h; h-=t;

-#define INTRA_SA8D_X3(cpu) \
+#define INTRA_SA8D_X3(cpu)\
 void x264_intra_sa8d_x3_8x8_##cpu( uint8_t *fenc, uint8_t edge[33], int res[3] )\
 {\
     PREDICT_8x8_LOAD_TOP\
diff --git a/encoder/cabac.c b/encoder/cabac.c
index 1086447..bc76fc8 100644
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
@@ -736,13 +736,13 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
 }
 #endif

-#define block_residual_write_cabac_cbf( h, cb, i_ctxBlockCat, i_idx, l, b_intra ) \
-{ \
-    int ctxidxinc = x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx, b_intra ); \
+#define block_residual_write_cabac_cbf( h, cb, i_ctxBlockCat, i_idx, l, b_intra )\
+{\
+    int ctxidxinc = x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx, b_intra );\
     if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )\
     {\
         x264_cabac_encode_decision( cb, ctxidxinc, 1 );\
-        block_residual_write_cabac( h, cb, i_ctxBlockCat, l ); \
+        block_residual_write_cabac( h, cb, i_ctxBlockCat, l );\
     }\
     else\
         x264_cabac_encode_decision( cb, ctxidxinc, 0 );\
diff --git a/encoder/me.c b/encoder/me.c
index d7b2928..5e113f0 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -914,14 +914,14 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
     m->cost_mv = p_cost_mvx[bmx] + p_cost_mvy[bmy];
 }

-#define BIME_CACHE( dx, dy, list ) \
-{ \
+#define BIME_CACHE( dx, dy, list )\
+{\
     x264_me_t *m = m##list;\
-    int i = 4 + 3*dx + dy; \
+    int i = 4 + 3*dx + dy;\
     int mvx = bm##list##x+dx;\
     int mvy = bm##list##y+dy;\
     stride[list][i] = bw;\
-    src[list][i] = h->mc.get_ref( pixy_buf[list][i], &stride[list][i], m->p_fref, m->i_stride[0], mvx, mvy, bw, bh, weight_none ); \
+    src[list][i] = h->mc.get_ref( pixy_buf[list][i], &stride[list][i], m->p_fref, m->i_stride[0], mvx, mvy, bw, bh, weight_none );\
     if( rd )\
     {\
         h->mc.mc_chroma( pixu_buf[list][i], 8, m->p_fref[4], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\
@@ -1107,11 +1107,11 @@ void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_wei
     { \
         uint64_t cost; \
         M32( cache_mv ) = pack16to32_mask(mx,my); \
-        if( m->i_pixel <= PIXEL_8x8 )\
-        {\
-            h->mc.mc_chroma( pixu, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 );\
-            h->mc.mc_chroma( pixv, FDEC_STRIDE, m->p_fref[5], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 );\
-        }\
+        if( m->i_pixel <= PIXEL_8x8 ) \
+        { \
+            h->mc.mc_chroma( pixu, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \
+            h->mc.mc_chroma( pixv, FDEC_STRIDE, m->p_fref[5], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \
+        } \
         cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \
         COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \
     } \
diff --git a/input/avs.c b/input/avs.c
index 9e3aa55..5489a5e 100644
--- a/input/avs.c
+++ b/input/avs.c
@@ -45,7 +45,7 @@
 /* maximum size of the sequence of filters to try on non script files */
 #define AVS_MAX_SEQUENCE 5

-#define LOAD_AVS_FUNC(name, continue_on_fail) \
+#define LOAD_AVS_FUNC(name, continue_on_fail)\
 {\
     h->func.name = (void*)GetProcAddress( h->library, #name );\
     if( !continue_on_fail && !h->func.name )\
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 228b75f..2008d2f 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -265,7 +265,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
         buf3[i] = ~(buf4[i] = -(buf1[i&~0x88]&1));

 #define TEST_PIXEL( name, align ) \
-    ok = 1, used_asm = 0;\
+    ok = 1, used_asm = 0; \
     for( int i = 0; i < 7; i++ ) \
     { \
         int res_c, res_asm; \
@@ -305,7 +305,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
     TEST_PIXEL( sa8d, 1 );

 #define TEST_PIXEL_X( N ) \
-    ok = 1; used_asm = 0;\
+    ok = 1; used_asm = 0; \
     for( int i = 0; i < 7; i++ ) \
     { \
         int res_c[4]={0}, res_asm[4]={0}; \
@@ -350,7 +350,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
     { \
         set_func_name( "%s_%s", "var", pixel_names[i] ); \
         used_asm = 1; \
-        /* abi-check wrapper can't return uint64_t, so separate it from return value check */\
+        /* abi-check wrapper can't return uint64_t, so separate it from return value check */ \
         call_c1( pixel_c.var[i], buf1, 16 ); \
         call_a1( pixel_asm.var[i], buf1, 16 ); \
         uint64_t res_c   = pixel_c.var[i]( buf1, 16 ); \
@@ -415,7 +415,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
     if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
     { \
         int res_c[3], res_asm[3]; \
-        set_func_name( #name );\
+        set_func_name( #name ); \
         used_asm = 1; \
         memcpy( buf3, buf2, 1024 ); \
         for( int i = 0; i < 3; i++ ) \
@@ -538,7 +538,7 @@ static int check_dct( int cpu_ref, int cpu_new )
 #define TEST_DCT( name, t1, t2, size ) \
     if( dct_asm.name != dct_ref.name ) \
     { \
-        set_func_name( #name );\
+        set_func_name( #name ); \
         used_asm = 1; \
         call_c( dct_c.name, t1, buf1, buf2 ); \
         call_a( dct_asm.name, t2, buf1, buf2 ); \
@@ -579,7 +579,7 @@ static int check_dct( int cpu_ref, int cpu_new )
 #define TEST_IDCT( name, src ) \
     if( dct_asm.name != dct_ref.name ) \
     { \
-        set_func_name( #name );\
+        set_func_name( #name ); \
         used_asm = 1; \
         memcpy( buf3, buf1, 32*32 ); \
         memcpy( buf4, buf1, 32*32 ); \
@@ -644,12 +644,12 @@ static int check_dct( int cpu_ref, int cpu_new )
     ALIGNED_16( int16_t level1[64] );
     ALIGNED_16( int16_t level2[64] );

-#define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size )   \
+#define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \
     if( zigzag_asm.name != zigzag_ref.name ) \
     { \
-        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
+        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
         used_asm = 1; \
-        memcpy(dct, buf1, size*sizeof(int16_t));\
+        memcpy(dct, buf1, size*sizeof(int16_t)); \
         call_c( zigzag_c.name, t1, dct ); \
         call_a( zigzag_asm.name, t2, dct ); \
         if( memcmp( t1, t2, size*sizeof(int16_t) ) ) \
@@ -663,18 +663,18 @@ static int check_dct( int cpu_ref, int cpu_new )
     if( zigzag_asm.name != zigzag_ref.name ) \
     { \
         int nz_a, nz_c; \
-        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
+        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
         used_asm = 1; \
         memcpy( buf3, buf1, 16*FDEC_STRIDE ); \
         memcpy( buf4, buf1, 16*FDEC_STRIDE ); \
-        nz_c = call_c1( zigzag_c.name, t1, buf2, buf3 );  \
+        nz_c = call_c1( zigzag_c.name, t1, buf2, buf3 ); \
         nz_a = call_a1( zigzag_asm.name, t2, buf2, buf4 ); \
-        if( memcmp( t1, t2, size*sizeof(int16_t) )|| memcmp( buf3, buf4, 16*FDEC_STRIDE ) || nz_c != nz_a )  \
+        if( memcmp( t1, t2, size*sizeof(int16_t) )|| memcmp( buf3, buf4, 16*FDEC_STRIDE ) || nz_c != nz_a ) \
         { \
             ok = 0; \
             fprintf( stderr, #name " [FAILED]\n" ); \
         } \
-        call_c2( zigzag_c.name, t1, buf2, buf3 );  \
+        call_c2( zigzag_c.name, t1, buf2, buf3 ); \
         call_a2( zigzag_asm.name, t2, buf2, buf4 ); \
     }

@@ -683,7 +683,7 @@ static int check_dct( int cpu_ref, int cpu_new )
     { \
         int nz_a, nz_c; \
         int16_t dc_a, dc_c; \
-        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
+        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
         used_asm = 1; \
         for( int i = 0; i < 2; i++ ) \
         { \
@@ -694,27 +694,27 @@ static int check_dct( int cpu_ref, int cpu_new )
                 memcpy( buf3 + j*FDEC_STRIDE, (i?buf1:buf2) + j*FENC_STRIDE, 4 ); \
                 memcpy( buf4 + j*FDEC_STRIDE, (i?buf1:buf2) + j*FENC_STRIDE, 4 ); \
             } \
-            nz_c = call_c1( zigzag_c.name, t1, buf2, buf3, &dc_c );  \
+            nz_c = call_c1( zigzag_c.name, t1, buf2, buf3, &dc_c ); \
             nz_a = call_a1( zigzag_asm.name, t2, buf2, buf4, &dc_a ); \
-            if( memcmp( t1+1, t2+1, 15*sizeof(int16_t) ) || memcmp( buf3, buf4, 16*FDEC_STRIDE ) || nz_c != nz_a || dc_c != dc_a )  \
+            if( memcmp( t1+1, t2+1, 15*sizeof(int16_t) ) || memcmp( buf3, buf4, 16*FDEC_STRIDE ) || nz_c != nz_a || dc_c != dc_a ) \
             { \
                 ok = 0; \
                 fprintf( stderr, #name " [FAILED]\n" ); \
                 break; \
             } \
         } \
-        call_c2( zigzag_c.name, t1, buf2, buf3, &dc_c );  \
+        call_c2( zigzag_c.name, t1, buf2, buf3, &dc_c ); \
         call_a2( zigzag_asm.name, t2, buf2, buf4, &dc_a ); \
     }

-#define TEST_INTERLEAVE( name, t1, t2, dct, size )   \
+#define TEST_INTERLEAVE( name, t1, t2, dct, size ) \
     if( zigzag_asm.name != zigzag_ref.name ) \
     { \
         for( int j = 0; j < 100; j++ ) \
         { \
-            set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
+            set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
             used_asm = 1; \
-            memcpy(dct, buf1, size*sizeof(int16_t));\
+            memcpy(dct, buf1, size*sizeof(int16_t)); \
             for( int i = 0; i < size; i++ ) \
                 dct[i] = rand()&0x1F ? 0 : dct[i]; \
             memcpy(buf3, buf4, 10*sizeof(uint8_t)); \
@@ -784,7 +784,7 @@ static int check_mc( int cpu_ref, int cpu_new )
         if( mc_a.mc_luma != mc_ref.mc_luma && !(w&(w-1)) && h<=16 ) \
         { \
             const x264_weight_t *weight = weight_none; \
-            set_func_name( "mc_luma_%dx%d", w, h );\
+            set_func_name( "mc_luma_%dx%d", w, h ); \
             used_asm = 1; \
             memset( buf3, 0xCD, 1024 ); \
             memset( buf4, 0xCD, 1024 ); \
@@ -801,7 +801,7 @@ static int check_mc( int cpu_ref, int cpu_new )
             uint8_t *ref = dst2; \
             int ref_stride = 32; \
             const x264_weight_t *weight = weight_none; \
-            set_func_name( "get_ref_%dx%d", w, h );\
+            set_func_name( "get_ref_%dx%d", w, h ); \
             used_asm = 1; \
             memset( buf3, 0xCD, 1024 ); \
             memset( buf4, 0xCD, 1024 ); \
@@ -819,13 +819,13 @@ static int check_mc( int cpu_ref, int cpu_new )
 #define MC_TEST_CHROMA( w, h ) \
         if( mc_a.mc_chroma != mc_ref.mc_chroma ) \
         { \
-            set_func_name( "mc_chroma_%dx%d", w, h );\
+            set_func_name( "mc_chroma_%dx%d", w, h ); \
             used_asm = 1; \
             memset( buf3, 0xCD, 1024 ); \
             memset( buf4, 0xCD, 1024 ); \
             call_c( mc_c.mc_chroma, dst1, 16, src, 64, dx, dy, w, h ); \
             call_a( mc_a.mc_chroma, dst2, 16, src, 64, dx, dy, w, h ); \
-            /* mc_chroma width=2 may write garbage to the right of dst. ignore that. */\
+            /* mc_chroma width=2 may write garbage to the right of dst. ignore that. */ \
             for( int j = 0; j < h; j++ ) \
                 for( int i = w; i < 4; i++ ) \
                     dst2[i+j*16] = dst1[i+j*16]; \
@@ -878,7 +878,7 @@ static int check_mc( int cpu_ref, int cpu_new )
         memcpy( buf4, buf1+320, 320 ); \
         if( mc_a.name[i] != mc_ref.name[i] ) \
         { \
-            set_func_name( "%s_%s", #name, pixel_names[i] );\
+            set_func_name( "%s_%s", #name, pixel_names[i] ); \
             used_asm = 1; \
             call_c1( mc_c.name[i], buf3, 16, buf2+1, 16, buf1+18, 16, weight ); \
             call_a1( mc_a.name[i], buf4, 16, buf2+1, 16, buf1+18, 16, weight ); \
@@ -899,7 +899,7 @@ static int check_mc( int cpu_ref, int cpu_new )

 #define MC_TEST_WEIGHT( name, weight, aligned ) \
     int align_off = (aligned ? 0 : rand()%16); \
-    ok = 1, used_asm = 0;\
+    ok = 1, used_asm = 0; \
     for( int i = 1; i <= 5; i++ ) \
     { \
         ALIGNED_16( uint8_t buffC[640] ); \
@@ -1115,14 +1115,14 @@ static int check_deblock( int cpu_ref, int cpu_new )
 #define TEST_DEBLOCK( name, align, ... ) \
     for( int i = 0; i < 36; i++ ) \
     { \
-        int off = 8*32 + (i&15)*4*!align; /* benchmark various alignments of h filter */\
+        int off = 8*32 + (i&15)*4*!align; /* benchmark various alignments of h filter */ \
         for( int j = 0; j < 1024; j++ ) \
-            /* two distributions of random to excersize different failure modes */\
+            /* two distributions of random to excersize different failure modes */ \
             buf3[j] = rand() & (i&1 ? 0xf : 0xff ); \
         memcpy( buf4, buf3, 1024 ); \
         if( db_a.name != db_ref.name ) \
         { \
-            set_func_name( #name );\
+            set_func_name( #name ); \
             used_asm = 1; \
             call_c1( db_c.name, buf3+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
             call_a1( db_a.name, buf4+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
@@ -1236,7 +1236,7 @@ static int check_quant( int cpu_ref, int cpu_new )
                         dct1[i] = dct2[i] = j ? (rand() & 0x1fff) - 0xfff : 0; \
                     result_c = call_c1( qf_c.name, dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
                     result_a = call_a1( qf_a.name, dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
-                    if( memcmp( dct1, dct2, 16*2 ) || result_c != result_a )       \
+                    if( memcmp( dct1, dct2, 16*2 ) || result_c != result_a ) \
                     { \
                         oks[0] = 0; \
                         fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \
@@ -1491,11 +1491,11 @@ static int check_intra( int cpu_ref, int cpu_new )

     ip_c.predict_8x8_filter( buf1+48, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );

-#define INTRA_TEST( name, dir, w, ... ) \
+#define INTRA_TEST( name, dir, w, ... )\
     if( ip_a.name[dir] != ip_ref.name[dir] )\
-    { \
+    {\
         set_func_name( "intra_%s_%s", #name, intra_##name##_names[dir] );\
-        used_asm = 1; \
+        used_asm = 1;\
         memcpy( buf3, buf1, 32*20 );\
         memcpy( buf4, buf1, 32*20 );\
         call_c( ip_c.name[dir], buf3+48, ##__VA_ARGS__ );\
--
1.7.0.4


From 29b379cc3499541e72007131909d45a8c472f2b5 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Sat, 8 May 2010 11:58:22 -0700
Subject: [PATCH 2/9] Fix intra refresh behavior with I-frames

Intra refresh still allows I-frames (for scenecuts/etc).
Now I-frames count as a full refresh, as opposed to instantly triggering a refresh.

---
 common/frame.h    |    1 +
 encoder/encoder.c |   28 +++++++++++++++++-----------
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/common/frame.h b/common/frame.h
index 357929e..e2766ad 100644
--- a/common/frame.h
+++ b/common/frame.h
@@ -142,6 +142,7 @@ typedef struct x264_frame
     float   f_pir_position;
     int     i_pir_start_col;
     int     i_pir_end_col;
+    int     i_frames_since_pir;
 } x264_frame_t;

 /* synchronized frame list */
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 7ad4295..7c5a64f 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -2375,25 +2375,31 @@ int     x264_encoder_encode( x264_t *h,
     h->i_nal_type = i_nal_type;
     h->i_nal_ref_idc = i_nal_ref_idc;

-    if( h->param.b_intra_refresh && h->fenc->i_type == X264_TYPE_P )
+    if( h->param.b_intra_refresh )
     {
-        int pocdiff = (h->fdec->i_poc - h->fref0[0]->i_poc)/2;
-        float increment = X264_MAX( ((float)h->sps->i_mb_width-1) / h->param.i_keyint_max, 1 );
-        int max_position = (int)(increment * h->param.i_keyint_max);
-        if( IS_X264_TYPE_I( h->fref0[0]->i_type ) )
-            h->fdec->f_pir_position = 0;
-        else
+        if( IS_X264_TYPE_I( h->fenc->i_type ) )
+        {
+            h->fdec->i_frames_since_pir = 0;
+            /* PIR is currently only supported with ref == 1, so any intra frame effectively refreshes
+             * the whole frame and counts as an intra refresh. */
+            h->fdec->f_pir_position = h->sps->i_mb_width;
+        }
+        else if( h->fenc->i_type == X264_TYPE_P )
         {
+            int pocdiff = (h->fdec->i_poc - h->fref0[0]->i_poc)/2;
+            float increment = X264_MAX( ((float)h->sps->i_mb_width-1) / h->param.i_keyint_max, 1 );
             h->fdec->f_pir_position = h->fref0[0]->f_pir_position;
-            if( h->fdec->f_pir_position+0.5 >= max_position )
+            h->fdec->i_frames_since_pir = h->fref0[0]->i_frames_since_pir + pocdiff;
+            if( h->fdec->i_frames_since_pir >= h->param.i_keyint_max )
             {
                 h->fdec->f_pir_position = 0;
+                h->fdec->i_frames_since_pir = 0;
                 h->fenc->b_keyframe = 1;
             }
+            h->fdec->i_pir_start_col = h->fdec->f_pir_position+0.5;
+            h->fdec->f_pir_position += increment * pocdiff;
+            h->fdec->i_pir_end_col = h->fdec->f_pir_position+0.5;
         }
-        h->fdec->i_pir_start_col = h->fdec->f_pir_position+0.5;
-        h->fdec->f_pir_position += increment * pocdiff;
-        h->fdec->i_pir_end_col = h->fdec->f_pir_position+0.5;
     }

     if( h->fenc->b_keyframe )
--
1.7.0.4


From 47b30702e9e8b0f9ff6f87a52e0bbc0755a1dbd9 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Sat, 8 May 2010 12:07:13 -0700
Subject: [PATCH 3/9] Add API function to trigger intra refresh

Useful for interactive applications where the encoder knows that packet loss has occurred on the client.
Full documentation is in x264.h.

---
 common/common.h   |    2 ++
 encoder/encoder.c |   11 ++++++++++-
 x264.h            |   10 +++++++++-
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/common/common.h b/common/common.h
index 91d5030..f673648 100644
--- a/common/common.h
+++ b/common/common.h
@@ -408,6 +408,8 @@ struct x264_t
     int             i_coded_fields_lookahead; /* Use separate counters for lookahead */
     int             i_cpb_delay_lookahead;

+    int             b_queued_intra_refresh;
+
     /* We use only one SPS and one PPS */
     x264_sps_t      sps_array[1];
     x264_sps_t      *sps;
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 7c5a64f..42d49bf 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -2131,6 +2131,12 @@ static int x264_threaded_slices_write( x264_t *h )
     return 0;
 }

+void x264_encoder_intra_refresh( x264_t *h )
+{
+    h = h->thread[h->thread[0]->i_thread_phase];
+    h->b_queued_intra_refresh = 1;
+}
+
 /****************************************************************************
  * x264_encoder_encode:
  *  XXX: i_poc   : is the poc of the current given picture
@@ -2380,6 +2386,7 @@ int     x264_encoder_encode( x264_t *h,
         if( IS_X264_TYPE_I( h->fenc->i_type ) )
         {
             h->fdec->i_frames_since_pir = 0;
+            h->b_queued_intra_refresh = 0;
             /* PIR is currently only supported with ref == 1, so any intra frame effectively refreshes
              * the whole frame and counts as an intra refresh. */
             h->fdec->f_pir_position = h->sps->i_mb_width;
@@ -2390,10 +2397,12 @@ int     x264_encoder_encode( x264_t *h,
             float increment = X264_MAX( ((float)h->sps->i_mb_width-1) / h->param.i_keyint_max, 1 );
             h->fdec->f_pir_position = h->fref0[0]->f_pir_position;
             h->fdec->i_frames_since_pir = h->fref0[0]->i_frames_since_pir + pocdiff;
-            if( h->fdec->i_frames_since_pir >= h->param.i_keyint_max )
+            if( h->fdec->i_frames_since_pir >= h->param.i_keyint_max ||
+                (h->b_queued_intra_refresh && h->fdec->f_pir_position + 0.5 >= h->sps->i_mb_width) )
             {
                 h->fdec->f_pir_position = 0;
                 h->fdec->i_frames_since_pir = 0;
+                h->b_queued_intra_refresh = 0;
                 h->fenc->b_keyframe = 1;
             }
             h->fdec->i_pir_start_col = h->fdec->f_pir_position+0.5;
diff --git a/x264.h b/x264.h
index 83f087e..f568dc5 100644
--- a/x264.h
+++ b/x264.h
@@ -35,7 +35,7 @@

 #include <stdarg.h>

-#define X264_BUILD 94
+#define X264_BUILD 95

 /* x264_t:
  *      opaque handler for encoder */
@@ -639,5 +639,13 @@ void    x264_encoder_close  ( x264_t * );
  *      return the number of currently delayed (buffered) frames
  *      this should be used at the end of the stream, to know when you have all the encoded frames. */
 int     x264_encoder_delayed_frames( x264_t * );
+/* x264_encoder_intra_refresh:
+ *      If an intra refresh is not in progress, begin one with the next P-frame.
+ *      If an intra refresh is in progress, begin one as soon as the current one finishes.
+ *      Requires that b_intra_refresh be set.
+ *      Useful for interactive streaming where the client can tell the server that packet loss has
+ *      occurred.  In this case, keyint can be set to an extremely high value so that intra refreshes
+ *      only occur when calling x264_encoder_intra_refresh. */
+void    x264_encoder_intra_refresh( x264_t * );

 #endif
--
1.7.0.4


From 548ea47cb5484a3754a1217e30b7640a12d061b5 Mon Sep 17 00:00:00 2001
From: Henrik Gramner <hengar-6@student.ltu.se>
Date: Mon, 10 May 2010 23:27:36 +0200
Subject: [PATCH 4/9] Shrink even more constant arrays

---
 common/arm/mc-c.c |    4 ++--
 common/mc.c       |    4 ++--
 common/ppc/mc.c   |    4 ++--
 common/set.c      |   10 +++++-----
 common/x86/mc-c.c |    4 ++--
 encoder/encoder.c |    4 ++--
 encoder/me.c      |    2 +-
 encoder/set.c     |   14 +++++---------
 8 files changed, 21 insertions(+), 25 deletions(-)

diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
index 0a7b734..d294eff 100644
--- a/common/arm/mc-c.c
+++ b/common/arm/mc-c.c
@@ -112,8 +112,8 @@ static void (* const x264_mc_copy_wtab_neon[5])( uint8_t *, int, uint8_t *, int,
     x264_mc_copy_w16_neon,
 };

-static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
-static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
+static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
+static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};

 static void mc_luma_neon( uint8_t *dst,    int i_dst_stride,
                           uint8_t *src[4], int i_src_stride,
diff --git a/common/mc.c b/common/mc.c
index ada8bdc..e0dc659 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -203,8 +203,8 @@ static void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *s
     }
 }

-static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
-static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
+static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
+static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};

 static void mc_luma( uint8_t *dst,    int i_dst_stride,
                      uint8_t *src[4], int i_src_stride,
diff --git a/common/ppc/mc.c b/common/ppc/mc.c
index 26b81f8..83c60b1 100644
--- a/common/ppc/mc.c
+++ b/common/ppc/mc.c
@@ -37,8 +37,8 @@ typedef void (*pf_mc_t)( uint8_t *src, int i_src,
                          uint8_t *dst, int i_dst, int i_height );


-static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
-static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
+static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
+static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};


 static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
diff --git a/common/set.c b/common/set.c
index 50d4213..16cff8e 100644
--- a/common/set.c
+++ b/common/set.c
@@ -23,7 +23,7 @@
 #define SHIFT(x,s) ((s)<=0 ? (x)<<-(s) : ((x)+(1<<((s)-1)))>>(s))
 #define DIV(n,d) (((n) + ((d)>>1)) / (d))

-static const int dequant4_scale[6][3] =
+static const uint8_t dequant4_scale[6][3] =
 {
     { 10, 13, 16 },
     { 11, 14, 18 },
@@ -32,7 +32,7 @@ static const int dequant4_scale[6][3] =
     { 16, 20, 25 },
     { 18, 23, 29 }
 };
-static const int quant4_scale[6][3] =
+static const uint16_t quant4_scale[6][3] =
 {
     { 13107, 8066, 5243 },
     { 11916, 7490, 4660 },
@@ -42,11 +42,11 @@ static const int quant4_scale[6][3] =
     {  7282, 4559, 2893 },
 };

-static const int quant8_scan[16] =
+static const uint8_t quant8_scan[16] =
 {
     0,3,4,3, 3,1,5,1, 4,5,2,5, 3,1,5,1
 };
-static const int dequant8_scale[6][6] =
+static const uint8_t dequant8_scale[6][6] =
 {
     { 20, 18, 32, 19, 25, 24 },
     { 22, 19, 35, 21, 28, 26 },
@@ -55,7 +55,7 @@ static const int dequant8_scale[6][6] =
     { 32, 28, 51, 30, 40, 38 },
     { 36, 32, 58, 34, 46, 43 },
 };
-static const int quant8_scale[6][6] =
+static const uint16_t quant8_scale[6][6] =
 {
     { 13107, 11428, 20972, 12222, 16777, 15481 },
     { 11916, 10826, 19174, 11058, 14980, 14290 },
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 6d386f6..f641cff 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -228,8 +228,8 @@ static void x264_weight_cache_ssse3( x264_t *h, x264_weight_t *w )
     }
 }

-static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
-static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
+static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
+static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};

 #define MC_LUMA(name,instr1,instr2)\
 static void mc_luma_##name( uint8_t *dst,    int i_dst_stride,\
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 42d49bf..e082024 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -2816,8 +2816,8 @@ void    x264_encoder_close  ( x264_t *h )
     /* Slices used and PSNR */
     for( int i = 0; i < 5; i++ )
     {
-        static const int slice_order[] = { SLICE_TYPE_I, SLICE_TYPE_SI, SLICE_TYPE_P, SLICE_TYPE_SP, SLICE_TYPE_B };
-        static const char *slice_name[] = { "P", "B", "I", "SP", "SI" };
+        static const uint8_t slice_order[] = { SLICE_TYPE_I, SLICE_TYPE_SI, SLICE_TYPE_P, SLICE_TYPE_SP, SLICE_TYPE_B };
+        static const char * const slice_name[] = { "P", "B", "I", "SP", "SI" };
         int i_slice = slice_order[i];

         if( h->stat.i_frame_count[i_slice] > 0 )
diff --git a/encoder/me.c b/encoder/me.c
index 5e113f0..a35da53 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -484,7 +484,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
             int i = 1;
             do
             {
-                static const int hex4[16][2] = {
+                static const int8_t hex4[16][2] = {
                     { 0,-4}, { 0, 4}, {-2,-3}, { 2,-3},
                     {-4,-2}, { 4,-2}, {-4,-1}, { 4,-1},
                     {-4, 0}, { 4, 0}, {-4, 1}, { 4, 1},
diff --git a/encoder/set.c b/encoder/set.c
index e3a071c..ce52a4b 100644
--- a/encoder/set.c
+++ b/encoder/set.c
@@ -315,26 +315,22 @@ void x264_sps_write( bs_t *s, x264_sps_t *sps )
         if( sps->vui.b_aspect_ratio_info_present )
         {
             int i;
-            static const struct { int w, h; int sar; } sar[] =
+            static const struct { uint8_t w, h, sar; } sar[] =
             {
                 { 1,   1, 1 }, { 12, 11, 2 }, { 10, 11, 3 }, { 16, 11, 4 },
                 { 40, 33, 5 }, { 24, 11, 6 }, { 20, 11, 7 }, { 32, 11, 8 },
                 { 80, 33, 9 }, { 18, 11, 10}, { 15, 11, 11}, { 64, 33, 12},
-                { 160,99, 13}, { 0, 0, -1 }
+                { 160,99, 13}, { 0, 0, 255 }
             };
-            for( i = 0; sar[i].sar != -1; i++ )
+            for( i = 0; sar[i].sar != 255; i++ )
             {
                 if( sar[i].w == sps->vui.i_sar_width &&
                     sar[i].h == sps->vui.i_sar_height )
                     break;
             }
-            if( sar[i].sar != -1 )
+            bs_write( s, 8, sar[i].sar );
+            if( sar[i].sar == 255 ) /* aspect_ratio_idc (extended) */
             {
-                bs_write( s, 8, sar[i].sar );
-            }
-            else
-            {
-                bs_write( s, 8, 255);   /* aspect_ratio_idc (extended) */
                 bs_write( s, 16, sps->vui.i_sar_width );
                 bs_write( s, 16, sps->vui.i_sar_height );
             }
--
1.7.0.4


From 5d1dd185510c753033ed841e55425eded293a10b Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Mon, 10 May 2010 22:59:12 -0700
Subject: [PATCH 5/9] Fix condition for printing rc=cbr in options SEI
 Also fix crf-max formatting.

---
 common/common.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/common/common.c b/common/common.c
index 848c6de..ad7cf98 100644
--- a/common/common.c
+++ b/common/common.c
@@ -1237,7 +1237,7 @@ char *x264_param2string( x264_param_t *p, int b_res )
         s += sprintf( s, " rc_lookahead=%d", p->rc.i_lookahead );

     s += sprintf( s, " rc=%s mbtree=%d", p->rc.i_rc_method == X264_RC_ABR ?
-                               ( p->rc.b_stat_read ? "2pass" : p->rc.i_vbv_buffer_size == p->rc.i_bitrate ? "cbr" : "abr" )
+                               ( p->rc.b_stat_read ? "2pass" : p->rc.i_vbv_max_bitrate == p->rc.i_bitrate ? "cbr" : "abr" )
                                : p->rc.i_rc_method == X264_RC_CRF ? "crf" : "cqp", p->rc.b_mb_tree );
     if( p->rc.i_rc_method == X264_RC_ABR || p->rc.i_rc_method == X264_RC_CRF )
     {
@@ -1256,7 +1256,7 @@ char *x264_param2string( x264_param_t *p, int b_res )
             s += sprintf( s, " vbv_maxrate=%d vbv_bufsize=%d",
                           p->rc.i_vbv_max_bitrate, p->rc.i_vbv_buffer_size );
             if( p->rc.i_rc_method == X264_RC_CRF )
-                s += sprintf( s, " crf-max=%.1f", p->rc.f_rf_constant_max );
+                s += sprintf( s, " crf_max=%.1f", p->rc.f_rf_constant_max );
         }
     }
     else if( p->rc.i_rc_method == X264_RC_CQP )
--
1.7.0.4


From ffaf1e14b54d791f369fc51a534111ddd839c55d Mon Sep 17 00:00:00 2001
From: Anton Mitrofanov <BugMaster@narod.ru>
Date: Wed, 12 May 2010 01:57:38 +0400
Subject: [PATCH 6/9] Fix crash with sliced-threads on Phenom

---
 encoder/encoder.c |    4 ++++
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/encoder/encoder.c b/encoder/encoder.c
index e082024..3a5520f 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -2066,6 +2066,10 @@ static void *x264_slices_write( x264_t *h )
 static int x264_threaded_slices_write( x264_t *h )
 {
     void *ret = NULL;
+#ifdef HAVE_MMX
+    if( h->param.cpu&X264_CPU_SSE_MISALIGN )
+        x264_cpu_mask_misalign_sse();
+#endif
     /* set first/last mb and sync contexts */
     for( int i = 0; i < h->param.i_threads; i++ )
     {
--
1.7.0.4


From ec937b4219673bdea810f00bd9cc91f5d174302b Mon Sep 17 00:00:00 2001
From: Anton Mitrofanov <BugMaster@narod.ru>
Date: Wed, 12 May 2010 22:05:34 +0400
Subject: [PATCH 7/9] Fix bitrate calculation in progress status
 The bitrate was slightly incorrect because it used pts, which is out of order; use dts instead.

---
 x264.c |   34 +++++++++++++++++++++++++---------
 1 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/x264.c b/x264.c
index 8f4e372..1a85c74 100644
--- a/x264.c
+++ b/x264.c
@@ -1312,7 +1312,7 @@ static void parse_qpfile( cli_opt_t *opt, x264_picture_t *pic, int i_frame )
  * Encode:
  *****************************************************************************/

-static int  Encode_frame( x264_t *h, hnd_t hout, x264_picture_t *pic, int64_t *last_pts )
+static int  Encode_frame( x264_t *h, hnd_t hout, x264_picture_t *pic, int64_t *last_dts )
 {
     x264_picture_t pic_out;
     x264_nal_t *nal;
@@ -1330,18 +1330,22 @@ static int  Encode_frame( x264_t *h, hnd_t hout, x264_picture_t *pic, int64_t *l
     if( i_frame_size )
     {
         i_frame_size = output.write_frame( hout, nal[0].p_payload, i_frame_size, &pic_out );
-        *last_pts = pic_out.i_pts;
+        *last_dts = pic_out.i_dts;
     }

     return i_frame_size;
 }

-static void Print_status( int64_t i_start, int i_frame, int i_frame_total, int64_t i_file, x264_param_t *param, int64_t last_pts )
+static void Print_status( int64_t i_start, int i_frame, int i_frame_total, int64_t i_file, x264_param_t *param, int64_t last_ts )
 {
     char    buf[200];
     int64_t i_elapsed = x264_mdate() - i_start;
     double fps = i_elapsed > 0 ? i_frame * 1000000. / i_elapsed : 0;
-    double bitrate = (double) i_file * 8 / ( (double) last_pts * 1000 * param->i_timebase_num / param->i_timebase_den );
+    double bitrate;
+    if( last_ts )
+        bitrate = (double) i_file * 8 / ( (double) last_ts * 1000 * param->i_timebase_num / param->i_timebase_den );
+    else
+        bitrate = (double) i_file * 8 / ( (double) 1000 * param->i_fps_den / param->i_fps_num );
     if( i_frame_total )
     {
         int eta = i_elapsed * (i_frame_total - i_frame) / ((int64_t)i_frame * 1000000);
@@ -1369,7 +1373,9 @@ static int  Encode( x264_param_t *param, cli_opt_t *opt )
     int64_t i_file = 0;
     int     i_frame_size;
     int     i_update_interval;
-    int64_t last_pts = 0;
+    int64_t last_dts = 0;
+    int64_t prev_dts = 0;
+    int64_t first_dts = 0;
 #   define  MAX_PTS_WARNING 3 /* arbitrary */
     int     pts_warning_cnt = 0;
     int64_t largest_pts = -1;
@@ -1506,12 +1512,17 @@ static int  Encode( x264_param_t *param, cli_opt_t *opt )
             pic.i_qpplus1 = 0;
         }

-        i_frame_size = Encode_frame( h, opt->hout, &pic, &last_pts );
+        prev_dts = last_dts;
+        i_frame_size = Encode_frame( h, opt->hout, &pic, &last_dts );
         if( i_frame_size < 0 )
             return -1;
         i_file += i_frame_size;
         if( i_frame_size )
+        {
             i_frame_output++;
+            if( i_frame_output == 1 )
+                first_dts = prev_dts = last_dts;
+        }

         i_frame++;

@@ -1520,19 +1531,24 @@ static int  Encode( x264_param_t *param, cli_opt_t *opt )

         /* update status line (up to 1000 times per input file) */
         if( opt->b_progress && i_frame_output % i_update_interval == 0 && i_frame_output )
-            Print_status( i_start, i_frame_output, i_frame_total, i_file, param, last_pts );
+            Print_status( i_start, i_frame_output, i_frame_total, i_file, param, 2 * last_dts - prev_dts - first_dts );
     }
     /* Flush delayed frames */
     while( !b_ctrl_c && x264_encoder_delayed_frames( h ) )
     {
-        i_frame_size = Encode_frame( h, opt->hout, NULL, &last_pts );
+        prev_dts = last_dts;
+        i_frame_size = Encode_frame( h, opt->hout, NULL, &last_dts );
         if( i_frame_size < 0 )
             return -1;
         i_file += i_frame_size;
         if( i_frame_size )
+        {
             i_frame_output++;
+            if( i_frame_output == 1 )
+                first_dts = prev_dts = last_dts;
+        }
         if( opt->b_progress && i_frame_output % i_update_interval == 0 && i_frame_output )
-            Print_status( i_start, i_frame_output, i_frame_total, i_file, param, last_pts );
+            Print_status( i_start, i_frame_output, i_frame_total, i_file, param, 2 * last_dts - prev_dts - first_dts );
     }
     if( pts_warning_cnt >= MAX_PTS_WARNING && param->i_log_level < X264_LOG_DEBUG )
         fprintf( stderr, "x264 [warning]: %d suppressed nonmonotonic pts warnings\n", pts_warning_cnt-MAX_PTS_WARNING );
--
1.7.0.4


From d1d7484aba046614add62e2bdc4da23e570525c3 Mon Sep 17 00:00:00 2001
From: Kieran Kunhya <kieran@kunhya.com>
Date: Thu, 13 May 2010 19:13:35 +0100
Subject: [PATCH 8/9] Fix typo in pulldown

---
 x264.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/x264.c b/x264.c
index 1a85c74..862aabb 100644
--- a/x264.c
+++ b/x264.c
@@ -120,7 +120,7 @@ enum pulldown_type_e

 static const cli_pulldown_t pulldown_values[] =
 {
-    [X264_PULLDOWN_22]     = {1,  {TB},                                   2.0},
+    [X264_PULLDOWN_22]     = {1,  {TB},                                   1.0},
     [X264_PULLDOWN_32]     = {4,  {TBT, BT, BTB, TB},                     1.25},
     [X264_PULLDOWN_64]     = {2,  {PIC_STRUCT_DOUBLE, PIC_STRUCT_TRIPLE}, 1.0},
     [X264_PULLDOWN_DOUBLE] = {1,  {PIC_STRUCT_DOUBLE},                    2.0},
--
1.7.0.4


From a21e7bd854c8c441a081c4a353b02bf41454bb95 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Sat, 15 May 2010 14:48:58 -0700
Subject: [PATCH 9/9] Overhaul CABAC: faster, less cache usage
 Horribly munge up the CABAC tables to allow deduplication of some data.
 Saves 256 bytes of L1d cache in non-RD, 512 bytes in RD.
 Add asm versions of bypass and terminal; save L1i cache by re-using putbyte code.
 Further optimize encode_decision.
 All 3 primary CABAC functions fit in under 256 bytes of code total on x86_64.

---
 common/cabac.c         |  185 ++++++++++++++++++++----------------------------
 common/cabac.h         |   22 ++++--
 common/x86/cabac-a.asm |   76 ++++++++++++++++----
 common/x86/x86inc.asm  |    2 +-
 encoder/rdo.c          |    2 +
 tools/checkasm.c       |   46 ++++++++++--
 6 files changed, 195 insertions(+), 138 deletions(-)

diff --git a/common/cabac.c b/common/cabac.c
index f50aef6..11988a1 100644
--- a/common/cabac.c
+++ b/common/cabac.c
@@ -664,75 +664,44 @@ static const int8_t x264_cabac_context_init_PB[3][460][2] =
     }
 };

-/* FIXME could avoid this duplication by reversing the order of states
- * with MPS=0, but that would uglify the other tables */
-const uint8_t x264_cabac_range_lps[128][4] =
+const uint8_t x264_cabac_range_lps[64][4] =
 {
-    {   2,   2,   2,   2 },
-    {   6,   7,   8,   9 }, {   6,   7,   9,  10 }, {   6,   8,   9,  11 },
-    {   7,   8,  10,  11 }, {   7,   9,  10,  12 }, {   7,   9,  11,  12 },
-    {   8,   9,  11,  13 }, {   8,  10,  12,  14 }, {   9,  11,  12,  14 },
-    {   9,  11,  13,  15 }, {  10,  12,  14,  16 }, {  10,  12,  15,  17 },
-    {  11,  13,  15,  18 }, {  11,  14,  16,  19 }, {  12,  14,  17,  20 },
-    {  12,  15,  18,  21 }, {  13,  16,  19,  22 }, {  14,  17,  20,  23 },
-    {  14,  18,  21,  24 }, {  15,  19,  22,  25 }, {  16,  20,  23,  27 },
-    {  17,  21,  25,  28 }, {  18,  22,  26,  30 }, {  19,  23,  27,  31 },
-    {  20,  24,  29,  33 }, {  21,  26,  30,  35 }, {  22,  27,  32,  37 },
-    {  23,  28,  33,  39 }, {  24,  30,  35,  41 }, {  26,  31,  37,  43 },
-    {  27,  33,  39,  45 }, {  29,  35,  41,  48 }, {  30,  37,  43,  50 },
-    {  32,  39,  46,  53 }, {  33,  41,  48,  56 }, {  35,  43,  51,  59 },
-    {  37,  45,  54,  62 }, {  39,  48,  56,  65 }, {  41,  50,  59,  69 },
-    {  43,  53,  63,  72 }, {  46,  56,  66,  76 }, {  48,  59,  69,  80 },
-    {  51,  62,  73,  85 }, {  53,  65,  77,  89 }, {  56,  69,  81,  94 },
-    {  59,  72,  86,  99 }, {  62,  76,  90, 104 }, {  66,  80,  95, 110 },
-    {  69,  85, 100, 116 }, {  73,  89, 105, 122 }, {  77,  94, 111, 128 },
-    {  81,  99, 117, 135 }, {  85, 104, 123, 142 }, {  90, 110, 130, 150 },
-    {  95, 116, 137, 158 }, { 100, 122, 144, 166 }, { 105, 128, 152, 175 },
-    { 111, 135, 160, 185 }, { 116, 142, 169, 195 }, { 123, 150, 178, 205 },
-    { 128, 158, 187, 216 }, { 128, 167, 197, 227 }, { 128, 176, 208, 240 },
-
-    { 128, 176, 208, 240 }, { 128, 167, 197, 227 }, { 128, 158, 187, 216 },
-    { 123, 150, 178, 205 }, { 116, 142, 169, 195 }, { 111, 135, 160, 185 },
-    { 105, 128, 152, 175 }, { 100, 122, 144, 166 }, {  95, 116, 137, 158 },
-    {  90, 110, 130, 150 }, {  85, 104, 123, 142 }, {  81,  99, 117, 135 },
-    {  77,  94, 111, 128 }, {  73,  89, 105, 122 }, {  69,  85, 100, 116 },
-    {  66,  80,  95, 110 }, {  62,  76,  90, 104 }, {  59,  72,  86,  99 },
-    {  56,  69,  81,  94 }, {  53,  65,  77,  89 }, {  51,  62,  73,  85 },
-    {  48,  59,  69,  80 }, {  46,  56,  66,  76 }, {  43,  53,  63,  72 },
-    {  41,  50,  59,  69 }, {  39,  48,  56,  65 }, {  37,  45,  54,  62 },
-    {  35,  43,  51,  59 }, {  33,  41,  48,  56 }, {  32,  39,  46,  53 },
-    {  30,  37,  43,  50 }, {  29,  35,  41,  48 }, {  27,  33,  39,  45 },
-    {  26,  31,  37,  43 }, {  24,  30,  35,  41 }, {  23,  28,  33,  39 },
-    {  22,  27,  32,  37 }, {  21,  26,  30,  35 }, {  20,  24,  29,  33 },
-    {  19,  23,  27,  31 }, {  18,  22,  26,  30 }, {  17,  21,  25,  28 },
-    {  16,  20,  23,  27 }, {  15,  19,  22,  25 }, {  14,  18,  21,  24 },
-    {  14,  17,  20,  23 }, {  13,  16,  19,  22 }, {  12,  15,  18,  21 },
-    {  12,  14,  17,  20 }, {  11,  14,  16,  19 }, {  11,  13,  15,  18 },
-    {  10,  12,  15,  17 }, {  10,  12,  14,  16 }, {   9,  11,  13,  15 },
-    {   9,  11,  12,  14 }, {   8,  10,  12,  14 }, {   8,   9,  11,  13 },
-    {   7,   9,  11,  12 }, {   7,   9,  10,  12 }, {   7,   8,  10,  11 },
-    {   6,   8,   9,  11 }, {   6,   7,   9,  10 }, {   6,   7,   8,   9 },
-    {   2,   2,   2,   2 },
+    {  2,   2,   2,   2}, {  6,   7,   8,   9}, {  6,   7,   9,  10}, {  6,   8,   9,  11},
+    {  7,   8,  10,  11}, {  7,   9,  10,  12}, {  7,   9,  11,  12}, {  8,   9,  11,  13},
+    {  8,  10,  12,  14}, {  9,  11,  12,  14}, {  9,  11,  13,  15}, { 10,  12,  14,  16},
+    { 10,  12,  15,  17}, { 11,  13,  15,  18}, { 11,  14,  16,  19}, { 12,  14,  17,  20},
+    { 12,  15,  18,  21}, { 13,  16,  19,  22}, { 14,  17,  20,  23}, { 14,  18,  21,  24},
+    { 15,  19,  22,  25}, { 16,  20,  23,  27}, { 17,  21,  25,  28}, { 18,  22,  26,  30},
+    { 19,  23,  27,  31}, { 20,  24,  29,  33}, { 21,  26,  30,  35}, { 22,  27,  32,  37},
+    { 23,  28,  33,  39}, { 24,  30,  35,  41}, { 26,  31,  37,  43}, { 27,  33,  39,  45},
+    { 29,  35,  41,  48}, { 30,  37,  43,  50}, { 32,  39,  46,  53}, { 33,  41,  48,  56},
+    { 35,  43,  51,  59}, { 37,  45,  54,  62}, { 39,  48,  56,  65}, { 41,  50,  59,  69},
+    { 43,  53,  63,  72}, { 46,  56,  66,  76}, { 48,  59,  69,  80}, { 51,  62,  73,  85},
+    { 53,  65,  77,  89}, { 56,  69,  81,  94}, { 59,  72,  86,  99}, { 62,  76,  90, 104},
+    { 66,  80,  95, 110}, { 69,  85, 100, 116}, { 73,  89, 105, 122}, { 77,  94, 111, 128},
+    { 81,  99, 117, 135}, { 85, 104, 123, 142}, { 90, 110, 130, 150}, { 95, 116, 137, 158},
+    {100, 122, 144, 166}, {105, 128, 152, 175}, {111, 135, 160, 185}, {116, 142, 169, 195},
+    {123, 150, 178, 205}, {128, 158, 187, 216}, {128, 167, 197, 227}, {128, 176, 208, 240}
 };

 const uint8_t x264_cabac_transition[128][2] =
 {
-    {  0,  0}, {  1, 25}, {  1, 25}, {  2, 26}, {  3, 26}, {  4, 26}, {  5, 27}, {  6, 27},
-    {  7, 27}, {  8, 28}, {  9, 28}, { 10, 28}, { 11, 29}, { 12, 29}, { 13, 30}, { 14, 30},
-    { 15, 30}, { 16, 31}, { 17, 31}, { 18, 32}, { 19, 33}, { 20, 33}, { 21, 33}, { 22, 34},
-    { 23, 34}, { 24, 35}, { 25, 36}, { 26, 36}, { 27, 37}, { 28, 37}, { 29, 38}, { 30, 39},
-    { 31, 39}, { 32, 40}, { 33, 41}, { 34, 41}, { 35, 42}, { 36, 42}, { 37, 44}, { 38, 44},
-    { 39, 45}, { 40, 45}, { 41, 47}, { 42, 47}, { 43, 48}, { 44, 48}, { 45, 50}, { 46, 50},
-    { 47, 51}, { 48, 52}, { 49, 52}, { 50, 54}, { 51, 54}, { 52, 55}, { 53, 56}, { 54, 57},
-    { 55, 58}, { 56, 59}, { 57, 59}, { 58, 61}, { 59, 61}, { 60, 62}, { 61, 63}, { 62, 64},
-    { 63, 65}, { 64, 66}, { 65, 67}, { 66, 68}, { 66, 69}, { 68, 70}, { 68, 71}, { 69, 72},
-    { 70, 73}, { 71, 74}, { 72, 75}, { 73, 76}, { 73, 77}, { 75, 78}, { 75, 79}, { 76, 80},
-    { 77, 81}, { 77, 82}, { 79, 83}, { 79, 84}, { 80, 85}, { 80, 86}, { 82, 87}, { 82, 88},
-    { 83, 89}, { 83, 90}, { 85, 91}, { 85, 92}, { 86, 93}, { 86, 94}, { 87, 95}, { 88, 96},
-    { 88, 97}, { 89, 98}, { 90, 99}, { 90,100}, { 91,101}, { 91,102}, { 92,103}, { 93,104},
-    { 93,105}, { 94,106}, { 94,107}, { 94,108}, { 95,109}, { 96,110}, { 96,111}, { 97,112},
-    { 97,113}, { 97,114}, { 98,115}, { 98,116}, { 99,117}, { 99,118}, { 99,119}, {100,120},
-    {100,121}, {100,122}, {101,123}, {101,124}, {101,125}, {102,126}, {102,126}, {127,127},
+    {  0,   0}, {  1,   1}, {  2,  50}, { 51,   3}, {  2,  50}, { 51,   3}, {  4,  52}, { 53,   5},
+    {  6,  52}, { 53,   7}, {  8,  52}, { 53,   9}, { 10,  54}, { 55,  11}, { 12,  54}, { 55,  13},
+    { 14,  54}, { 55,  15}, { 16,  56}, { 57,  17}, { 18,  56}, { 57,  19}, { 20,  56}, { 57,  21},
+    { 22,  58}, { 59,  23}, { 24,  58}, { 59,  25}, { 26,  60}, { 61,  27}, { 28,  60}, { 61,  29},
+    { 30,  60}, { 61,  31}, { 32,  62}, { 63,  33}, { 34,  62}, { 63,  35}, { 36,  64}, { 65,  37},
+    { 38,  66}, { 67,  39}, { 40,  66}, { 67,  41}, { 42,  66}, { 67,  43}, { 44,  68}, { 69,  45},
+    { 46,  68}, { 69,  47}, { 48,  70}, { 71,  49}, { 50,  72}, { 73,  51}, { 52,  72}, { 73,  53},
+    { 54,  74}, { 75,  55}, { 56,  74}, { 75,  57}, { 58,  76}, { 77,  59}, { 60,  78}, { 79,  61},
+    { 62,  78}, { 79,  63}, { 64,  80}, { 81,  65}, { 66,  82}, { 83,  67}, { 68,  82}, { 83,  69},
+    { 70,  84}, { 85,  71}, { 72,  84}, { 85,  73}, { 74,  88}, { 89,  75}, { 76,  88}, { 89,  77},
+    { 78,  90}, { 91,  79}, { 80,  90}, { 91,  81}, { 82,  94}, { 95,  83}, { 84,  94}, { 95,  85},
+    { 86,  96}, { 97,  87}, { 88,  96}, { 97,  89}, { 90, 100}, {101,  91}, { 92, 100}, {101,  93},
+    { 94, 102}, {103,  95}, { 96, 104}, {105,  97}, { 98, 104}, {105,  99}, {100, 108}, {109, 101},
+    {102, 108}, {109, 103}, {104, 110}, {111, 105}, {106, 112}, {113, 107}, {108, 114}, {115, 109},
+    {110, 116}, {117, 111}, {112, 118}, {119, 113}, {114, 118}, {119, 115}, {116, 122}, {123, 117},
+    {118, 122}, {123, 119}, {120, 124}, {125, 121}, {122, 126}, {127, 123}, {124, 127}, {126, 125}
 };

 const uint8_t x264_cabac_renorm_shift[64]= {
@@ -743,41 +712,40 @@ const uint8_t x264_cabac_renorm_shift[64]= {
 };

 /* -ln2(probability) */
-#define F(a,b) {FIX8(a),FIX8(b)}
-const uint16_t x264_cabac_entropy[128][2] =
+const uint16_t x264_cabac_entropy[128] =
 {
-    F(0.0273,5.7370), F(0.0288,5.6618), F(0.0303,5.5866), F(0.0320,5.5114),
-    F(0.0337,5.4362), F(0.0355,5.3610), F(0.0375,5.2859), F(0.0395,5.2106),
-    F(0.0416,5.1354), F(0.0439,5.0602), F(0.0463,4.9851), F(0.0488,4.9099),
-    F(0.0515,4.8347), F(0.0543,4.7595), F(0.0572,4.6843), F(0.0604,4.6091),
-    F(0.0637,4.5339), F(0.0671,4.4588), F(0.0708,4.3836), F(0.0747,4.3083),
-    F(0.0788,4.2332), F(0.0832,4.1580), F(0.0878,4.0828), F(0.0926,4.0076),
-    F(0.0977,3.9324), F(0.1032,3.8572), F(0.1089,3.7820), F(0.1149,3.7068),
-    F(0.1214,3.6316), F(0.1282,3.5565), F(0.1353,3.4813), F(0.1429,3.4061),
-    F(0.1510,3.3309), F(0.1596,3.2557), F(0.1686,3.1805), F(0.1782,3.1053),
-    F(0.1884,3.0301), F(0.1992,2.9549), F(0.2107,2.8797), F(0.2229,2.8046),
-    F(0.2358,2.7294), F(0.2496,2.6542), F(0.2642,2.5790), F(0.2798,2.5038),
-    F(0.2964,2.4286), F(0.3142,2.3534), F(0.3331,2.2782), F(0.3532,2.2030),
-    F(0.3748,2.1278), F(0.3979,2.0527), F(0.4226,1.9775), F(0.4491,1.9023),
-    F(0.4776,1.8271), F(0.5082,1.7519), F(0.5412,1.6767), F(0.5768,1.6015),
-    F(0.6152,1.5263), F(0.6568,1.4511), F(0.7020,1.3759), F(0.7513,1.3008),
-    F(0.8050,1.2256), F(0.8638,1.1504), F(0.9285,1.0752), F(1.0000,1.0000),
-    F(1.0000,1.0000), F(1.0752,0.9285), F(1.1504,0.8638), F(1.2256,0.8050),
-    F(1.3008,0.7513), F(1.3759,0.7020), F(1.4511,0.6568), F(1.5263,0.6152),
-    F(1.6015,0.5768), F(1.6767,0.5412), F(1.7519,0.5082), F(1.8271,0.4776),
-    F(1.9023,0.4491), F(1.9775,0.4226), F(2.0527,0.3979), F(2.1278,0.3748),
-    F(2.2030,0.3532), F(2.2782,0.3331), F(2.3534,0.3142), F(2.4286,0.2964),
-    F(2.5038,0.2798), F(2.5790,0.2642), F(2.6542,0.2496), F(2.7294,0.2358),
-    F(2.8046,0.2229), F(2.8797,0.2107), F(2.9549,0.1992), F(3.0301,0.1884),
-    F(3.1053,0.1782), F(3.1805,0.1686), F(3.2557,0.1596), F(3.3309,0.1510),
-    F(3.4061,0.1429), F(3.4813,0.1353), F(3.5565,0.1282), F(3.6316,0.1214),
-    F(3.7068,0.1149), F(3.7820,0.1089), F(3.8572,0.1032), F(3.9324,0.0977),
-    F(4.0076,0.0926), F(4.0828,0.0878), F(4.1580,0.0832), F(4.2332,0.0788),
-    F(4.3083,0.0747), F(4.3836,0.0708), F(4.4588,0.0671), F(4.5339,0.0637),
-    F(4.6091,0.0604), F(4.6843,0.0572), F(4.7595,0.0543), F(4.8347,0.0515),
-    F(4.9099,0.0488), F(4.9851,0.0463), F(5.0602,0.0439), F(5.1354,0.0416),
-    F(5.2106,0.0395), F(5.2859,0.0375), F(5.3610,0.0355), F(5.4362,0.0337),
-    F(5.5114,0.0320), F(5.5866,0.0303), F(5.6618,0.0288), F(5.7370,0.0273),
+    FIX8(0.0273), FIX8(5.7370), FIX8(0.0288), FIX8(5.6618),
+    FIX8(0.0303), FIX8(5.5866), FIX8(0.0320), FIX8(5.5114),
+    FIX8(0.0337), FIX8(5.4362), FIX8(0.0355), FIX8(5.3610),
+    FIX8(0.0375), FIX8(5.2859), FIX8(0.0395), FIX8(5.2106),
+    FIX8(0.0416), FIX8(5.1354), FIX8(0.0439), FIX8(5.0602),
+    FIX8(0.0463), FIX8(4.9851), FIX8(0.0488), FIX8(4.9099),
+    FIX8(0.0515), FIX8(4.8347), FIX8(0.0543), FIX8(4.7595),
+    FIX8(0.0572), FIX8(4.6843), FIX8(0.0604), FIX8(4.6091),
+    FIX8(0.0637), FIX8(4.5339), FIX8(0.0671), FIX8(4.4588),
+    FIX8(0.0708), FIX8(4.3836), FIX8(0.0747), FIX8(4.3083),
+    FIX8(0.0788), FIX8(4.2332), FIX8(0.0832), FIX8(4.1580),
+    FIX8(0.0878), FIX8(4.0828), FIX8(0.0926), FIX8(4.0076),
+    FIX8(0.0977), FIX8(3.9324), FIX8(0.1032), FIX8(3.8572),
+    FIX8(0.1089), FIX8(3.7820), FIX8(0.1149), FIX8(3.7068),
+    FIX8(0.1214), FIX8(3.6316), FIX8(0.1282), FIX8(3.5565),
+    FIX8(0.1353), FIX8(3.4813), FIX8(0.1429), FIX8(3.4061),
+    FIX8(0.1510), FIX8(3.3309), FIX8(0.1596), FIX8(3.2557),
+    FIX8(0.1686), FIX8(3.1805), FIX8(0.1782), FIX8(3.1053),
+    FIX8(0.1884), FIX8(3.0301), FIX8(0.1992), FIX8(2.9549),
+    FIX8(0.2107), FIX8(2.8797), FIX8(0.2229), FIX8(2.8046),
+    FIX8(0.2358), FIX8(2.7294), FIX8(0.2496), FIX8(2.6542),
+    FIX8(0.2642), FIX8(2.5790), FIX8(0.2798), FIX8(2.5038),
+    FIX8(0.2964), FIX8(2.4286), FIX8(0.3142), FIX8(2.3534),
+    FIX8(0.3331), FIX8(2.2782), FIX8(0.3532), FIX8(2.2030),
+    FIX8(0.3748), FIX8(2.1278), FIX8(0.3979), FIX8(2.0527),
+    FIX8(0.4226), FIX8(1.9775), FIX8(0.4491), FIX8(1.9023),
+    FIX8(0.4776), FIX8(1.8271), FIX8(0.5082), FIX8(1.7519),
+    FIX8(0.5412), FIX8(1.6767), FIX8(0.5768), FIX8(1.6015),
+    FIX8(0.6152), FIX8(1.5263), FIX8(0.6568), FIX8(1.4511),
+    FIX8(0.7020), FIX8(1.3759), FIX8(0.7513), FIX8(1.3008),
+    FIX8(0.8050), FIX8(1.2256), FIX8(0.8638), FIX8(1.1504),
+    FIX8(0.9285), FIX8(1.0752), FIX8(1.0000), FIX8(1.0000)
 };


@@ -794,14 +762,17 @@ void x264_cabac_context_init( x264_cabac_t *cb, int i_slice_type, int i_qp, int
         cabac_context_init = &x264_cabac_context_init_PB[i_model];

     for( int i = 0; i < 460; i++ )
-        cb->state[i] = x264_clip3( (((*cabac_context_init)[i][0] * i_qp) >> 4) + (*cabac_context_init)[i][1], 1, 126 );
+    {
+        int state = x264_clip3( (((*cabac_context_init)[i][0] * i_qp) >> 4) + (*cabac_context_init)[i][1], 1, 126 );
+        cb->state[i] = (X264_MIN( state, 127-state ) << 1) | (state >> 6);
+    }
 }

 void x264_cabac_encode_init( x264_cabac_t *cb, uint8_t *p_data, uint8_t *p_end )
 {
     cb->i_low   = 0;
     cb->i_range = 0x01FE;
-    cb->i_queue = -1; // the first bit will be shifted away and not written
+    cb->i_queue = -9; // the first bit will be shifted away and not written
     cb->i_bytes_outstanding = 0;
     cb->p_start = p_data;
     cb->p       = p_data;
@@ -810,10 +781,10 @@ void x264_cabac_encode_init( x264_cabac_t *cb, uint8_t *p_data, uint8_t *p_end )

 static inline void x264_cabac_putbyte( x264_cabac_t *cb )
 {
-    if( cb->i_queue >= 8 )
+    if( cb->i_queue >= 0 )
     {
-        int out = cb->i_low >> (cb->i_queue+2);
-        cb->i_low &= (4<<cb->i_queue)-1;
+        int out = cb->i_low >> (cb->i_queue+10);
+        cb->i_low &= (0x400<<cb->i_queue)-1;
         cb->i_queue -= 8;

         if( (out & 0xff) == 0xff )
@@ -855,9 +826,9 @@ static inline void x264_cabac_encode_renorm( x264_cabac_t *cb )
 void x264_cabac_encode_decision_c( x264_cabac_t *cb, int i_ctx, int b )
 {
     int i_state = cb->state[i_ctx];
-    int i_range_lps = x264_cabac_range_lps[i_state][(cb->i_range>>6)-4];
+    int i_range_lps = x264_cabac_range_lps[i_state>>1][(cb->i_range>>6)-4];
     cb->i_range -= i_range_lps;
-    if( b != (i_state >> 6) )
+    if( b != (i_state & 1) )
     {
         cb->i_low += cb->i_range;
         cb->i_range = i_range_lps;
@@ -866,7 +837,7 @@ void x264_cabac_encode_decision_c( x264_cabac_t *cb, int i_ctx, int b )
     x264_cabac_encode_renorm( cb );
 }

-void x264_cabac_encode_bypass( x264_cabac_t *cb, int b )
+void x264_cabac_encode_bypass_c( x264_cabac_t *cb, int b )
 {
     cb->i_low <<= 1;
     cb->i_low += -b & cb->i_range;
@@ -892,7 +863,7 @@ void x264_cabac_encode_ue_bypass( x264_cabac_t *cb, int exp_bits, int val )
     } while( k > 0 );
 }

-void x264_cabac_encode_terminal( x264_cabac_t *cb )
+void x264_cabac_encode_terminal_c( x264_cabac_t *cb )
 {
     cb->i_range -= 2;
     x264_cabac_encode_renorm( cb );
diff --git a/common/cabac.h b/common/cabac.h
index ef68fe6..9fc3007 100644
--- a/common/cabac.h
+++ b/common/cabac.h
@@ -31,7 +31,7 @@ typedef struct
     int i_range;

     /* bit stream */
-    int i_queue;
+    int i_queue; //stored with an offset of -8 for faster asm
     int i_bytes_outstanding;

     uint8_t *p_start;
@@ -46,7 +46,7 @@ typedef struct
 } x264_cabac_t;

 extern const uint8_t x264_cabac_transition[128][2];
-extern const uint16_t x264_cabac_entropy[128][2];
+extern const uint16_t x264_cabac_entropy[128];

 /* init the contexts given i_slice_type, the quantif and the model */
 void x264_cabac_context_init( x264_cabac_t *cb, int i_slice_type, int i_qp, int i_model );
@@ -55,15 +55,21 @@ void x264_cabac_context_init( x264_cabac_t *cb, int i_slice_type, int i_qp, int
 void x264_cabac_encode_init ( x264_cabac_t *cb, uint8_t *p_data, uint8_t *p_end );
 void x264_cabac_encode_decision_c( x264_cabac_t *cb, int i_ctx, int b );
 void x264_cabac_encode_decision_asm( x264_cabac_t *cb, int i_ctx, int b );
-void x264_cabac_encode_bypass( x264_cabac_t *cb, int b );
+void x264_cabac_encode_bypass_c( x264_cabac_t *cb, int b );
+void x264_cabac_encode_bypass_asm( x264_cabac_t *cb, int b );
+void x264_cabac_encode_terminal_c( x264_cabac_t *cb );
+void x264_cabac_encode_terminal_asm( x264_cabac_t *cb );
 void x264_cabac_encode_ue_bypass( x264_cabac_t *cb, int exp_bits, int val );
-void x264_cabac_encode_terminal( x264_cabac_t *cb );
 void x264_cabac_encode_flush( x264_t *h, x264_cabac_t *cb );

 #ifdef HAVE_MMX
 #define x264_cabac_encode_decision x264_cabac_encode_decision_asm
+#define x264_cabac_encode_bypass x264_cabac_encode_bypass_asm
+#define x264_cabac_encode_terminal x264_cabac_encode_terminal_asm
 #else
 #define x264_cabac_encode_decision x264_cabac_encode_decision_c
+#define x264_cabac_encode_bypass x264_cabac_encode_bypass_c
+#define x264_cabac_encode_terminal x264_cabac_encode_terminal_c
 #endif
 #define x264_cabac_encode_decision_noup x264_cabac_encode_decision

@@ -78,25 +84,25 @@ static ALWAYS_INLINE void x264_cabac_size_decision( x264_cabac_t *cb, long i_ctx
 {
     int i_state = cb->state[i_ctx];
     cb->state[i_ctx] = x264_cabac_transition[i_state][b];
-    cb->f8_bits_encoded += x264_cabac_entropy[i_state][b];
+    cb->f8_bits_encoded += x264_cabac_entropy[i_state^b];
 }

 static ALWAYS_INLINE int x264_cabac_size_decision2( uint8_t *state, long b )
 {
     int i_state = *state;
     *state = x264_cabac_transition[i_state][b];
-    return x264_cabac_entropy[i_state][b];
+    return x264_cabac_entropy[i_state^b];
 }

 static ALWAYS_INLINE void x264_cabac_size_decision_noup( x264_cabac_t *cb, long i_ctx, long b )
 {
     int i_state = cb->state[i_ctx];
-    cb->f8_bits_encoded += x264_cabac_entropy[i_state][b];
+    cb->f8_bits_encoded += x264_cabac_entropy[i_state^b];
 }

 static ALWAYS_INLINE int x264_cabac_size_decision_noup2( uint8_t *state, long b )
 {
-    return x264_cabac_entropy[*state][b];
+    return x264_cabac_entropy[*state^b];
 }

 #endif
diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm
index 2af98c7..8621c5b 100644
--- a/common/x86/cabac-a.asm
+++ b/common/x86/cabac-a.asm
@@ -32,13 +32,13 @@ cextern cabac_renorm_shift

 ; t3 must be ecx, since it's used for shift.
 %ifdef WIN64
-    DECLARE_REG_TMP 3,1,2,0,4,5,6,10
+    DECLARE_REG_TMP 3,1,2,0,4,5,6,10,2
     %define pointer resq
 %elifdef ARCH_X86_64
-    DECLARE_REG_TMP 0,1,2,3,4,5,6,10
+    DECLARE_REG_TMP 0,1,2,3,4,5,6,10,6
     %define pointer resq
 %else
-    DECLARE_REG_TMP 0,4,2,1,3,5,6,2
+    DECLARE_REG_TMP 0,4,2,1,3,5,6,2,2
     %define pointer resd
 %endif

@@ -72,13 +72,15 @@ cglobal cabac_encode_decision_asm, 0,7
     movifnidn t0,  r0mp
     movifnidn t1d, r1m
     mov   t5d, [t0+cb.range]
-    movzx t6d, byte [t0+cb.state+t1]
+    movzx t4d, byte [t0+cb.state+t1]
     mov   t3d, t5d
+    mov   t6d, t4d
     shr   t5d, 6
+    shr   t4d, 1
     movifnidn t2d, r2m
-    LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t6*4
+    LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*4
     LOAD_GLOBAL t4d, cabac_transition, t2, t6*2
-    shr   t6d, 6
+    and   t6d, 1
     sub   t3d, t5d
     cmp   t6d, t2d
     mov   t6d, [t0+cb.low]
@@ -94,20 +96,66 @@ cglobal cabac_encode_decision_asm, 0,7
     shl   t6d, t3b
     add   t3d, [t0+cb.queue]
     mov   [t0+cb.range], t4d
-    cmp   t3d, 8
-    jl .update_queue_low
-;cabac_putbyte
+    jge cabac_putbyte
+.update_queue_low:
+    mov   [t0+cb.low], t6d
+    mov   [t0+cb.queue], t3d
+    RET
+
+cglobal cabac_encode_bypass_asm, 0,3
+    movifnidn  t0, r0mp
+    movifnidn t3d, r1m
+    neg       t3d
+    mov       t8d, [t0+cb.low]
+    and       t3d, [t0+cb.range]
+    lea       t8d, [t8*2+t3]
+    mov       t3d, [t0+cb.queue]
+    inc       t3d
+%ifdef UNIX64 ; .putbyte compiles to nothing but a jmp
+    jge cabac_putbyte
+%else
+    jge .putbyte
+%endif
+    mov   [t0+cb.low], t8d
+    mov   [t0+cb.queue], t3d
+    RET
+.putbyte:
+    PROLOGUE 0,7
+    movifnidn t6d, t8d
+    jmp cabac_putbyte
+
+cglobal cabac_encode_terminal_asm, 0,3
+    movifnidn  t0, r0mp
+    sub  dword [t0+cb.range], 2
+; shortcut: after subtracting 2 from range, the renormalization shift in terminal
+; can only be 0 or 1 (1 when bit 8 of range is clear) and is zero over 99% of the time.
+    test dword [t0+cb.range], 0x100
+    je .renorm
+    REP_RET
+.renorm:
+    shl  dword [t0+cb.low], 1
+    shl  dword [t0+cb.range], 1
+    inc  dword [t0+cb.queue]
+    jge .putbyte
+    REP_RET
+.putbyte:
+    PROLOGUE 0,7
+    mov t3d, [t0+cb.queue]
+    mov t6d, [t0+cb.low]
+    jmp cabac_putbyte
+
+cabac_putbyte:
     ; alive: t0=cb t3=queue t6=low
 %ifdef WIN64
     DECLARE_REG_TMP 3,4,1,0,2,5,6,10
 %endif
     mov   t1d, -1
-    add   t3d, 2
+    add   t3d, 10
     mov   t2d, t6d
     shl   t1d, t3b
     shr   t2d, t3b ; out
     not   t1d
-    sub   t3d, 10
+    sub   t3d, 18
     and   t6d, t1d
     mov   t5d, [t0+cb.bytes_outstanding]
     cmp   t2b, 0xff ; FIXME is a 32bit op faster?
@@ -125,8 +173,4 @@ cglobal cabac_encode_decision_asm, 0,7
 .postpone:
     inc   t5d
     mov   [t0+cb.bytes_outstanding], t5d
-.update_queue_low:
-    mov   [t0+cb.low], t6d
-    mov   [t0+cb.queue], t3d
-    RET
-
+    jmp mangle(x264_cabac_encode_decision_asm.update_queue_low)
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index 9d23640..f006f37 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -171,7 +171,7 @@ DECLARE_REG_SIZE bp, bpl
     %endrep
 %endmacro

-DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7
+DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9

 %ifdef ARCH_X86_64
     %define gprsize 8
diff --git a/encoder/rdo.c b/encoder/rdo.c
index 4d83b6a..574a484 100644
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -50,6 +50,8 @@ static uint16_t cabac_size_5ones[128];
  * fractional bits, but only finite precision. */
 #undef  x264_cabac_encode_decision
 #undef  x264_cabac_encode_decision_noup
+#undef  x264_cabac_encode_bypass
+#undef  x264_cabac_encode_terminal
 #define x264_cabac_encode_decision(c,x,v) x264_cabac_size_decision(c,x,v)
 #define x264_cabac_encode_decision_noup(c,x,v) x264_cabac_size_decision_noup(c,x,v)
 #define x264_cabac_encode_terminal(c)     ((c)->f8_bits_encoded += 7)
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 2008d2f..9bc15c8 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1556,32 +1556,66 @@ static int check_intra( int cpu_ref, int cpu_new )
 }

 #define DECL_CABAC(cpu) \
-static void run_cabac_##cpu( uint8_t *dst )\
+static void run_cabac_decision_##cpu( uint8_t *dst )\
 {\
     x264_cabac_t cb;\
     x264_cabac_context_init( &cb, SLICE_TYPE_P, 26, 0 );\
     x264_cabac_encode_init( &cb, dst, dst+0xff0 );\
     for( int i = 0; i < 0x1000; i++ )\
         x264_cabac_encode_decision_##cpu( &cb, buf1[i]>>1, buf1[i]&1 );\
+}\
+static void run_cabac_bypass_##cpu( uint8_t *dst )\
+{\
+    x264_cabac_t cb;\
+    x264_cabac_context_init( &cb, SLICE_TYPE_P, 26, 0 );\
+    x264_cabac_encode_init( &cb, dst, dst+0xff0 );\
+    for( int i = 0; i < 0x1000; i++ )\
+        x264_cabac_encode_bypass_##cpu( &cb, buf1[i]&1 );\
+}\
+static void run_cabac_terminal_##cpu( uint8_t *dst )\
+{\
+    x264_cabac_t cb;\
+    x264_cabac_context_init( &cb, SLICE_TYPE_P, 26, 0 );\
+    x264_cabac_encode_init( &cb, dst, dst+0xff0 );\
+    for( int i = 0; i < 0x1000; i++ )\
+        x264_cabac_encode_terminal_##cpu( &cb );\
 }
 DECL_CABAC(c)
 #ifdef HAVE_MMX
 DECL_CABAC(asm)
 #else
-#define run_cabac_asm run_cabac_c
+#define run_cabac_decision_asm run_cabac_decision_c
+#define run_cabac_bypass_asm run_cabac_bypass_c
+#define run_cabac_terminal_asm run_cabac_terminal_c
 #endif

 static int check_cabac( int cpu_ref, int cpu_new )
 {
     int ret = 0, ok, used_asm = 1;
-    if( cpu_ref || run_cabac_c == run_cabac_asm)
+    if( cpu_ref || run_cabac_decision_c == run_cabac_decision_asm )
         return 0;
+
     set_func_name( "cabac_encode_decision" );
     memcpy( buf4, buf3, 0x1000 );
-    call_c( run_cabac_c, buf3 );
-    call_a( run_cabac_asm, buf4 );
+    call_c( run_cabac_decision_c, buf3 );
+    call_a( run_cabac_decision_asm, buf4 );
+    ok = !memcmp( buf3, buf4, 0x1000 );
+    report( "cabac decision:" );
+
+    set_func_name( "cabac_encode_bypass" );
+    memcpy( buf4, buf3, 0x1000 );
+    call_c( run_cabac_bypass_c, buf3 );
+    call_a( run_cabac_bypass_asm, buf4 );
     ok = !memcmp( buf3, buf4, 0x1000 );
-    report( "cabac :" );
+    report( "cabac bypass:" );
+
+    set_func_name( "cabac_encode_terminal" );
+    memcpy( buf4, buf3, 0x1000 );
+    call_c( run_cabac_terminal_c, buf3 );
+    call_a( run_cabac_terminal_asm, buf4 );
+    ok = !memcmp( buf3, buf4, 0x1000 );
+    report( "cabac terminal:" );
+
     return ret;
 }

--
1.7.0.4