Untitled

From 4784723450ae1dd28ede1ff04a93f1849d6444e5 Mon Sep 17 00:00:00 2001
From: Anton Mitrofanov <BugMaster@narod.ru>
Date: Tue, 16 Feb 2010 09:41:55 -0800
Subject: [PATCH 01/16] Fix I and B-frame QPs with threads
 Rounding errors resulted in slightly wrong QPs with threads enabled.

---
 encoder/ratecontrol.c |    6 +++---
 1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index 8c61582..3d86aaa 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -1077,15 +1077,15 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp, int overhead )

     rc->qpa_rc =
     rc->qpa_aq = 0;
-    h->fdec->f_qp_avg_rc =
-    h->fdec->f_qp_avg_aq =
     rc->qpm =
     rc->qp = x264_clip3( (int)(q + 0.5), 0, 51 );
+    h->fdec->f_qp_avg_rc =
+    h->fdec->f_qp_avg_aq =
     rc->f_qpm = q;
     if( rce )
         rce->new_qp = rc->qp;

-    accum_p_qp_update( h, rc->qp );
+    accum_p_qp_update( h, rc->f_qpm );

     if( h->sh.i_type != SLICE_TYPE_B )
         rc->last_non_b_pict_type = h->sh.i_type;
--
1.6.1.2


From 28e6eb67ffaa002469f60c40e2b5d58b2a758f9c Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Mon, 22 Feb 2010 11:21:51 -0800
Subject: [PATCH 02/16] Fix integer overflow in chroma SSD check
 Could cause bad skips at very high quantizers on extreme inputs.

---
 encoder/rdo.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/encoder/rdo.c b/encoder/rdo.c
index 3ed4a47..e15f47d 100644
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -131,7 +131,7 @@ static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
 static inline int ssd_mb( x264_t *h )
 {
     int chromassd = ssd_plane(h, PIXEL_8x8, 1, 0, 0) + ssd_plane(h, PIXEL_8x8, 2, 0, 0);
-    chromassd = (chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
+    chromassd = ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
     return ssd_plane(h, PIXEL_16x16, 0, 0, 0) + chromassd;
 }

@@ -223,7 +223,7 @@ uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel )

     chromassd = ssd_plane( h, i_pixel+3, 1, (i8&1)*4, (i8>>1)*4 )
               + ssd_plane( h, i_pixel+3, 2, (i8&1)*4, (i8>>1)*4 );
-    chromassd = (chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
+    chromassd = ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
     i_ssd = ssd_plane( h, i_pixel,   0, (i8&1)*8, (i8>>1)*8 ) + chromassd;

     if( h->param.b_cabac )
--
1.6.1.2


From f0da96145cb068ade0f0232d0682137c9065929f Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Mon, 22 Feb 2010 13:04:47 -0800
Subject: [PATCH 03/16] Fix overread of scratch buffer
 Could cause crashes on non-mod16 frames.

---
 encoder/encoder.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/encoder/encoder.c b/encoder/encoder.c
index df62389..89bf457 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -1055,7 +1055,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
     /* Allocate scratch buffer */
     for( i = 0; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
     {
-        int buf_hpel = (h->param.i_width+48) * sizeof(int16_t);
+        int buf_hpel = (h->fdec->i_width[0]+48) * sizeof(int16_t);
         int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
         int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
         int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
--
1.6.1.2


From 25292b825a42b577bd121c48d2508f3b4aa7a9eb Mon Sep 17 00:00:00 2001
From: Yusuke Nakamura <muken.the.vfrmaniac@gmail.com>
Date: Tue, 16 Feb 2010 11:05:21 -0800
Subject: [PATCH 04/16] Add GPAC version check

---
 configure |    8 +++++++-
 1 files changed, 7 insertions(+), 1 deletions(-)

diff --git a/configure b/configure
index 25f5458..d0ff43a 100755
--- a/configure
+++ b/configure
@@ -584,7 +584,13 @@ if [ $SYS = MINGW ]; then
 fi
 if [ "$mp4_output" = "auto" ] ; then
     mp4_output="no"
-    cc_check gpac/isomedia.h "$MP4_LDFLAGS" && mp4_output="yes"
+    if cc_check gpac/isomedia.h "$MP4_LDFLAGS" ; then
+        if cc_check gpac/isomedia.h "$MP4_LDFLAGS" "gf_isom_set_pixel_aspect_ratio(0,0,0,0,0);" ; then
+            mp4_output="yes"
+        else
+            echo "Warning: gpac is too old, update to 2007-06-21 UTC or later"
+        fi
+    fi
 fi
 if [ "$mp4_output" = "yes" ] ; then
     define MP4_OUTPUT
--
1.6.1.2


From 5234f855a23607ae0dbfce9eeb0c69007e9d69e4 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Sun, 21 Feb 2010 14:21:26 -0800
Subject: [PATCH 05/16] SimpleBlock requires Matroska Doctype v2

---
 output/matroska_ebml.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/output/matroska_ebml.c b/output/matroska_ebml.c
index 7265909..89790b7 100644
--- a/output/matroska_ebml.c
+++ b/output/matroska_ebml.c
@@ -338,8 +338,8 @@ int mk_writeHeader( mk_writer *w, const char *writing_app,
     CHECK( mk_write_uint( c, 0x42f2, 4 ) ); // EBMLMaxIDLength
     CHECK( mk_write_uint( c, 0x42f3, 8 ) ); // EBMLMaxSizeLength
     CHECK( mk_write_string( c, 0x4282, "matroska") ); // DocType
-    CHECK( mk_write_uint( c, 0x4287, 1 ) ); // DocTypeVersion
-    CHECK( mk_write_uint( c, 0x4285, 1 ) ); // DocTypeReadversion
+    CHECK( mk_write_uint( c, 0x4287, 2 ) ); // DocTypeVersion
+    CHECK( mk_write_uint( c, 0x4285, 2 ) ); // DocTypeReadVersion
     CHECK( mk_close_context( c, 0 ) );

     if( !(c = mk_create_context( w, w->root, 0x18538067 )) ) // Segment
--
1.6.1.2


From fff9312827eb936da8da24a426e167494208d195 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Tue, 16 Feb 2010 10:13:33 -0800
Subject: [PATCH 06/16] Much faster and simpler direct spatial calculation

---
 common/macroblock.c |  130 ++++++++++++++++++++++++--------------------------
 1 files changed, 62 insertions(+), 68 deletions(-)

diff --git a/common/macroblock.c b/common/macroblock.c
index 278659c..19cd371 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -36,8 +36,6 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mv
     int     i_refc = h->mb.cache.ref[i_list][i8 - 8 + i_width];
     int16_t *mv_c  = h->mb.cache.mv[i_list][i8 - 8 + i_width];

-    int i_count = 0;
-
     if( (idx&3) >= 2 + (i_width&1) || i_refc == -2 )
     {
         i_refc = h->mb.cache.ref[i_list][i8 - 8 - 1];
@@ -83,9 +81,7 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mv
         }
     }

-    if( i_refa == i_ref ) i_count++;
-    if( i_refb == i_ref ) i_count++;
-    if( i_refc == i_ref ) i_count++;
+    int i_count = (i_refa == i_ref) + (i_refb == i_ref) + (i_refc == i_ref);

     if( i_count > 1 )
     {
@@ -115,18 +111,13 @@ void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int16_t mvp[2]
     int16_t *mv_b  = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8];
     int     i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 + 4];
     int16_t *mv_c  = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 + 4];
-
-    int i_count = 0;
-
     if( i_refc == -2 )
     {
         i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 - 1];
         mv_c   = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 - 1];
     }

-    if( i_refa == i_ref ) i_count++;
-    if( i_refb == i_ref ) i_count++;
-    if( i_refc == i_ref ) i_count++;
+    int i_count = (i_refa == i_ref) + (i_refb == i_ref) + (i_refc == i_ref);

     if( i_count > 1 )
     {
@@ -196,7 +187,7 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
         if( i_ref >= 0 )
         {
             const int dist_scale_factor = h->mb.dist_scale_factor[i_ref][0];
-            const int16_t *mv_col = h->fref1[0]->mv[0][ i_mb_4x4 + 3*x8 + 3*y8 * h->mb.i_b4_stride];
+            const int16_t *mv_col = h->fref1[0]->mv[0][i_mb_4x4 + 3*x8 + 3*y8 * h->mb.i_b4_stride];
             const int l0x = ( dist_scale_factor * mv_col[0] + 128 ) >> 8;
             const int l0y = ( dist_scale_factor * mv_col[1] + 128 ) >> 8;
             if( h->param.i_threads > 1 && (l0y > h->mb.mv_max_spel[1] || l0y-mv_col[1] > h->mb.mv_max_spel[1]) )
@@ -221,58 +212,67 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )

 static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
 {
-    int ref[2];
+    int8_t ref[2];
     ALIGNED_ARRAY_8( int16_t, mv,[2],[2] );
-    int i_list;
-    int i8;
-    const int8_t *l1ref0 = &h->fref1[0]->ref[0][ h->mb.i_b8_xy ];
-    const int8_t *l1ref1 = &h->fref1[0]->ref[1][ h->mb.i_b8_xy ];
-    const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->fref1[0]->mv[0][ h->mb.i_b4_xy ];
-    const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->fref1[0]->mv[1][ h->mb.i_b4_xy ];
-    const int type_col = h->fref1[0]->mb_type[ h->mb.i_mb_xy ];
+    int i_list, i8, i_ref;
+    const int8_t *l1ref0 = &h->fref1[0]->ref[0][h->mb.i_b8_xy];
+    const int8_t *l1ref1 = &h->fref1[0]->ref[1][h->mb.i_b8_xy];
+    const int16_t (*l1mv[2])[2] = { (const int16_t (*)[2]) &h->fref1[0]->mv[0][h->mb.i_b4_xy],
+                                    (const int16_t (*)[2]) &h->fref1[0]->mv[1][h->mb.i_b4_xy] };
+    const int type_col = h->fref1[0]->mb_type[h->mb.i_mb_xy];

-    for( i_list=0; i_list<2; i_list++ )
+    for( i_list = 0; i_list < 2; i_list++ )
     {
-        int i_refa = h->mb.cache.ref[i_list][X264_SCAN8_0 - 1];
-        int i_refb = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8];
-        int i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 + 4];
+        int     i_refa = h->mb.cache.ref[i_list][X264_SCAN8_0 - 1];
+        int16_t *mv_a  = h->mb.cache.mv[i_list][X264_SCAN8_0 - 1];
+        int     i_refb = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8];
+        int16_t *mv_b  = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8];
+        int     i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 + 4];
+        int16_t *mv_c  = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 + 4];
         if( i_refc == -2 )
+        {
             i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 - 1];
+            mv_c   = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 - 1];
+        }
+
+        i_ref = X264_MIN3( (unsigned)i_refa, (unsigned)i_refb, (unsigned)i_refc );
+        if( i_ref < 0 )
+        {
+            i_ref = -1;
+            M32( mv[i_list] ) = 0;
+        }
+        else
+        {
+            /* Same as x264_mb_predict_mv_16x16, but simplified to eliminate cases
+             * not relevant to spatial direct. */
+            int i_count = (i_refa == i_ref) + (i_refb == i_ref) + (i_refc == i_ref);
+
+            if( i_count > 1 )
+                x264_median_mv( mv[i_list], mv_a, mv_b, mv_c );
+            else
+            {
+                if( i_refa == i_ref )
+                    CP32( mv[i_list], mv_a );
+                else if( i_refb == i_ref )
+                    CP32( mv[i_list], mv_b );
+                else
+                    CP32( mv[i_list], mv_c );
+            }
+        }

-        ref[i_list] = i_refa;
-        if( ref[i_list] < 0 || ( i_refb < ref[i_list] && i_refb >= 0 ))
-            ref[i_list] = i_refb;
-        if( ref[i_list] < 0 || ( i_refc < ref[i_list] && i_refc >= 0 ))
-            ref[i_list] = i_refc;
-        if( ref[i_list] < 0 )
-            ref[i_list] = -1;
+        x264_macroblock_cache_ref( h, 0, 0, 4, 4, i_list, i_ref );
+        x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, i_list, mv[i_list] );
+        ref[i_list] = i_ref;
     }

-    if( ref[0] < 0 && ref[1] < 0 )
+    if( (M16( ref ) & 0x8080) == 0x8080 ) /* if( ref[0] < 0 && ref[1] < 0 ) */
     {
         x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
         x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, 0 );
-        x264_macroblock_cache_mv( h, 0, 0, 4, 4, 0, 0 );
-        x264_macroblock_cache_mv( h, 0, 0, 4, 4, 1, 0 );
         return 1;
     }

-    if( ref[0] >= 0 )
-        x264_mb_predict_mv_16x16( h, 0, ref[0], mv[0] );
-    else
-        M32( mv[0] ) = 0;
-
-    if( ref[1] >= 0 )
-        x264_mb_predict_mv_16x16( h, 1, ref[1], mv[1] );
-    else
-        M32( mv[1] ) = 0;
-
-    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, ref[0] );
-    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, ref[1] );
-    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, mv[0] );
-    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, mv[1] );
-
-    if( !M64( mv ) )
+    if( !M64( mv ) || IS_INTRA( type_col ) || (ref[0]&&ref[1]) )
         return 1;

     if( h->param.i_threads > 1
@@ -287,31 +287,25 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
         return 0;
     }

-    if( IS_INTRA( type_col ) || (ref[0]&&ref[1]) )
-        return 1;
-
     /* col_zero_flag */
-    for( i8=0; i8<4; i8++ )
+    for( i8 = 0; i8 < 4; i8++ )
     {
-        const int x8 = i8%2;
-        const int y8 = i8/2;
+        const int x8 = i8&1;
+        const int y8 = i8>>1;
         const int o8 = x8 + y8 * h->mb.i_b8_stride;
         const int o4 = 3*(x8 + y8 * h->mb.i_b4_stride);
+        int idx;
         if( l1ref0[o8] == 0 )
-        {
-            if( abs( l1mv0[o4][0] ) <= 1 && abs( l1mv0[o4][1] ) <= 1 )
-            {
-                if( ref[0] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, 0 );
-                if( ref[1] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 1, 0 );
-            }
-        }
+            idx = 0;
         else if( l1ref0[o8] < 0 && l1ref1[o8] == 0 )
+            idx = 1;
+        else
+            continue;
+
+        if( abs( l1mv[idx][o4][0] ) <= 1 && abs( l1mv[idx][o4][1] ) <= 1 )
         {
-            if( abs( l1mv1[o4][0] ) <= 1 && abs( l1mv1[o4][1] ) <= 1 )
-            {
-                if( ref[0] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, 0 );
-                if( ref[1] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 1, 0 );
-            }
+            if( ref[0] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, 0 );
+            if( ref[1] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 1, 0 );
         }
     }

--
1.6.1.2


From 4a1303d128a4f7a9df81321940f789022695a9ad Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Wed, 17 Feb 2010 22:41:16 -0800
Subject: [PATCH 07/16] Keep track of macroblock partitions
 Allows vastly simpler motion compensation and direct MV calculation.

---
 common/common.h     |    2 +
 common/frame.c      |    1 +
 common/frame.h      |    1 +
 common/macroblock.c |  233 +++++++++++++++++++++++++--------------------------
 encoder/analyse.c   |    1 +
 5 files changed, 121 insertions(+), 117 deletions(-)

diff --git a/common/common.h b/common/common.h
index e2e8fac..68f79ba 100644
--- a/common/common.h
+++ b/common/common.h
@@ -519,6 +519,7 @@ struct x264_t

         /* mb table */
         int8_t  *type;                      /* mb type */
+        uint8_t *partition;                 /* mb partition */
         int8_t  *qp;                        /* mb qp */
         int16_t *cbp;                       /* mb cbp: 0x0?: luma, 0x?0: chroma, 0x100: luma dc, 0x0200 and 0x0400: chroma dc  (all set for PCM)*/
         int8_t  (*intra4x4_pred_mode)[8];   /* intra4x4 pred mode. for non I4x4 set to I_PRED_4x4_DC(2) */
@@ -627,6 +628,7 @@ struct x264_t

             ALIGNED_4( int16_t direct_mv[2][4][2] );
             ALIGNED_4( int8_t  direct_ref[2][4] );
+            int     direct_partition;
             ALIGNED_4( int16_t pskip_mv[2] );

             /* number of neighbors (top and left) that used 8x8 dct */
diff --git a/common/frame.c b/common/frame.c
index d89f5ab..2798f25 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -95,6 +95,7 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
     if( b_fdec ) /* fdec frame */
     {
         CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
+        CHECKED_MALLOC( frame->mb_partition, i_mb_count * sizeof(uint8_t));
         CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
         CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
         if( h->param.i_bframe )
diff --git a/common/frame.h b/common/frame.h
index 7c8e2ff..6e7de50 100644
--- a/common/frame.h
+++ b/common/frame.h
@@ -75,6 +75,7 @@ typedef struct x264_frame

     /* motion data */
     int8_t  *mb_type;
+    uint8_t *mb_partition;
     int16_t (*mv[2])[2];
     int16_t (*lowres_mvs[2][X264_BFRAME_MAX+1])[2];
     uint16_t (*lowres_costs[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2]);
diff --git a/common/macroblock.c b/common/macroblock.c
index 19cd371..2573415 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -165,9 +165,12 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
     int i_mb_8x8 =  4 * h->mb.i_mb_stride * h->mb.i_mb_y + 2 * h->mb.i_mb_x;
     int i8;
     const int type_col = h->fref1[0]->mb_type[h->mb.i_mb_xy];
+    const int partition_col = h->fref1[0]->mb_partition[h->mb.i_mb_xy];

     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, 0 );

+    h->mb.i_partition = partition_col;
+
     if( IS_INTRA( type_col ) )
     {
         x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
@@ -176,7 +179,15 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
         return 1;
     }

-    for( i8 = 0; i8 < 4; i8++ )
+    /* Don't do any checks other than the ones we have to, based
+     * on the size of the colocated partitions.
+     * Depends on the enum order: D_8x8, D_16x8, D_8x16, D_16x16 */
+    int max_i8 = (D_16x16 - partition_col) + 1;
+    int step = (partition_col == D_16x8) + 1;
+    int width = 4 >> ((D_16x16 - partition_col)&1);
+    int height = 4 >> ((D_16x16 - partition_col)>>1);
+
+    for( i8 = 0; i8 < max_i8; i8 += step )
     {
         const int x8 = i8%2;
         const int y8 = i8/2;
@@ -192,9 +203,9 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
             const int l0y = ( dist_scale_factor * mv_col[1] + 128 ) >> 8;
             if( h->param.i_threads > 1 && (l0y > h->mb.mv_max_spel[1] || l0y-mv_col[1] > h->mb.mv_max_spel[1]) )
                 return 0;
-            x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
-            x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, pack16to32_mask(l0x, l0y) );
-            x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 1, pack16to32_mask(l0x-mv_col[0], l0y-mv_col[1]) );
+            x264_macroblock_cache_ref( h, 2*x8, 2*y8, width, height, 0, i_ref );
+            x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 0, pack16to32_mask(l0x, l0y) );
+            x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 1, pack16to32_mask(l0x-mv_col[0], l0y-mv_col[1]) );
         }
         else
         {
@@ -220,6 +231,9 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
     const int16_t (*l1mv[2])[2] = { (const int16_t (*)[2]) &h->fref1[0]->mv[0][h->mb.i_b4_xy],
                                     (const int16_t (*)[2]) &h->fref1[0]->mv[1][h->mb.i_b4_xy] };
     const int type_col = h->fref1[0]->mb_type[h->mb.i_mb_xy];
+    const int partition_col = h->fref1[0]->mb_partition[h->mb.i_mb_xy];
+
+    h->mb.i_partition = partition_col;

     for( i_list = 0; i_list < 2; i_list++ )
     {
@@ -287,8 +301,16 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
         return 0;
     }

+    /* Don't do any checks other than the ones we have to, based
+     * on the size of the colocated partitions.
+     * Depends on the enum order: D_8x8, D_16x8, D_8x16, D_16x16 */
+    int max_i8 = (D_16x16 - partition_col) + 1;
+    int step = (partition_col == D_16x8) + 1;
+    int width = 4 >> ((D_16x16 - partition_col)&1);
+    int height = 4 >> ((D_16x16 - partition_col)>>1);
+
     /* col_zero_flag */
-    for( i8 = 0; i8 < 4; i8++ )
+    for( i8 = 0; i8 < max_i8; i8 += step )
     {
         const int x8 = i8&1;
         const int y8 = i8>>1;
@@ -304,8 +326,8 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )

         if( abs( l1mv[idx][o4][0] ) <= 1 && abs( l1mv[idx][o4][1] ) <= 1 )
         {
-            if( ref[0] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, 0 );
-            if( ref[1] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 1, 0 );
+            if( ref[0] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 0, 0 );
+            if( ref[1] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 1, 0 );
         }
     }

@@ -324,32 +346,29 @@ int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed )

     if( b_changed != NULL && b_available )
     {
-        int type_col = h->fref1[0]->mb_type[h->mb.i_mb_xy];
-        int changed = 0;
+        int changed;

-        if( IS_INTRA( type_col ) || type_col == P_SKIP )
+        changed  = M32( h->mb.cache.direct_mv[0][0] ) ^ M32( h->mb.cache.mv[0][x264_scan8[0]] );
+        changed |= M32( h->mb.cache.direct_mv[1][0] ) ^ M32( h->mb.cache.mv[1][x264_scan8[0]] );
+        changed |= h->mb.cache.direct_ref[0][0] ^ h->mb.cache.ref[0][x264_scan8[0]];
+        changed |= h->mb.cache.direct_ref[1][0] ^ h->mb.cache.ref[1][x264_scan8[0]];
+        if( !changed && h->mb.i_partition != D_16x16 )
         {
-            changed |= M32( h->mb.cache.direct_mv[0][0] ) ^ M32( h->mb.cache.mv[0][X264_SCAN8_0] );
-            changed |= M32( h->mb.cache.direct_mv[1][0] ) ^ M32( h->mb.cache.mv[1][X264_SCAN8_0] );
-            changed |= h->mb.cache.direct_ref[0][0] ^ h->mb.cache.ref[0][X264_SCAN8_0];
-            changed |= h->mb.cache.direct_ref[1][0] ^ h->mb.cache.ref[1][X264_SCAN8_0];
+            changed |= M32( h->mb.cache.direct_mv[0][3] ) ^ M32( h->mb.cache.mv[0][x264_scan8[12]] );
+            changed |= M32( h->mb.cache.direct_mv[1][3] ) ^ M32( h->mb.cache.mv[1][x264_scan8[12]] );
+            changed |= h->mb.cache.direct_ref[0][3] ^ h->mb.cache.ref[0][x264_scan8[12]];
+            changed |= h->mb.cache.direct_ref[1][3] ^ h->mb.cache.ref[1][x264_scan8[12]];
         }
-        else
+        if( !changed && h->mb.i_partition == D_8x8 )
         {
-            int l;
-            for( l = 0; l < 2; l++ )
-            {
-                changed |= M32( h->mb.cache.direct_mv[l][0] ) ^ M32( h->mb.cache.mv[l][x264_scan8[ 0]] );
-                if( changed ) break;
-                changed |= M32( h->mb.cache.direct_mv[l][1] ) ^ M32( h->mb.cache.mv[l][x264_scan8[ 4]] );
-                changed |= M32( h->mb.cache.direct_mv[l][2] ) ^ M32( h->mb.cache.mv[l][x264_scan8[ 8]] );
-                changed |= M32( h->mb.cache.direct_mv[l][3] ) ^ M32( h->mb.cache.mv[l][x264_scan8[12]] );
-                if( changed ) break;
-                changed |= h->mb.cache.direct_ref[l][0] ^ h->mb.cache.ref[l][x264_scan8[ 0]];
-                changed |= h->mb.cache.direct_ref[l][1] ^ h->mb.cache.ref[l][x264_scan8[ 4]];
-                changed |= h->mb.cache.direct_ref[l][2] ^ h->mb.cache.ref[l][x264_scan8[ 8]];
-                changed |= h->mb.cache.direct_ref[l][3] ^ h->mb.cache.ref[l][x264_scan8[12]];
-            }
+            changed |= M32( h->mb.cache.direct_mv[0][1] ) ^ M32( h->mb.cache.mv[0][x264_scan8[4]] );
+            changed |= M32( h->mb.cache.direct_mv[1][1] ) ^ M32( h->mb.cache.mv[1][x264_scan8[4]] );
+            changed |= M32( h->mb.cache.direct_mv[0][2] ) ^ M32( h->mb.cache.mv[0][x264_scan8[8]] );
+            changed |= M32( h->mb.cache.direct_mv[1][2] ) ^ M32( h->mb.cache.mv[1][x264_scan8[8]] );
+            changed |= h->mb.cache.direct_ref[0][1] ^ h->mb.cache.ref[0][x264_scan8[4]];
+            changed |= h->mb.cache.direct_ref[1][1] ^ h->mb.cache.ref[1][x264_scan8[4]];
+            changed |= h->mb.cache.direct_ref[0][2] ^ h->mb.cache.ref[0][x264_scan8[8]];
+            changed |= h->mb.cache.direct_ref[1][2] ^ h->mb.cache.ref[1][x264_scan8[8]];
         }
         *b_changed = changed;
         if( !changed )
@@ -370,6 +389,7 @@ int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed )
             h->mb.cache.direct_ref[l][1] = h->mb.cache.ref[l][x264_scan8[ 4]];
             h->mb.cache.direct_ref[l][2] = h->mb.cache.ref[l][x264_scan8[ 8]];
             h->mb.cache.direct_ref[l][3] = h->mb.cache.ref[l][x264_scan8[12]];
+            h->mb.cache.direct_partition = h->mb.i_partition;
         }
     }

@@ -564,116 +584,93 @@ static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int
     h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
 }

-static void x264_mb_mc_direct8x8( x264_t *h, int x, int y )
-{
-    const int i8 = x264_scan8[0] + x + 8*y;
-
-    if( h->mb.cache.ref[0][i8] >= 0 )
-        if( h->mb.cache.ref[1][i8] >= 0 )
-            x264_mb_mc_01xywh( h, x, y, 2, 2 );
-        else
-            x264_mb_mc_0xywh( h, x, y, 2, 2 );
-    else
-        x264_mb_mc_1xywh( h, x, y, 2, 2 );
-}
-
 void x264_mb_mc_8x8( x264_t *h, int i8 )
 {
     const int x = 2*(i8&1);
     const int y = 2*(i8>>1);
-    switch( h->mb.i_sub_partition[i8] )
+
+    if( h->sh.i_type == SLICE_TYPE_P )
+    {
+        switch( h->mb.i_sub_partition[i8] )
+        {
+            case D_L0_8x8:
+                x264_mb_mc_0xywh( h, x, y, 2, 2 );
+                break;
+            case D_L0_8x4:
+                x264_mb_mc_0xywh( h, x, y+0, 2, 1 );
+                x264_mb_mc_0xywh( h, x, y+1, 2, 1 );
+                break;
+            case D_L0_4x8:
+                x264_mb_mc_0xywh( h, x+0, y, 1, 2 );
+                x264_mb_mc_0xywh( h, x+1, y, 1, 2 );
+                break;
+            case D_L0_4x4:
+                x264_mb_mc_0xywh( h, x+0, y+0, 1, 1 );
+                x264_mb_mc_0xywh( h, x+1, y+0, 1, 1 );
+                x264_mb_mc_0xywh( h, x+0, y+1, 1, 1 );
+                x264_mb_mc_0xywh( h, x+1, y+1, 1, 1 );
+                break;
+        }
+    }
+    else
     {
-        case D_L0_8x8:
-            x264_mb_mc_0xywh( h, x, y, 2, 2 );
-            break;
-        case D_L0_8x4:
-            x264_mb_mc_0xywh( h, x, y+0, 2, 1 );
-            x264_mb_mc_0xywh( h, x, y+1, 2, 1 );
-            break;
-        case D_L0_4x8:
-            x264_mb_mc_0xywh( h, x+0, y, 1, 2 );
-            x264_mb_mc_0xywh( h, x+1, y, 1, 2 );
-            break;
-        case D_L0_4x4:
-            x264_mb_mc_0xywh( h, x+0, y+0, 1, 1 );
-            x264_mb_mc_0xywh( h, x+1, y+0, 1, 1 );
-            x264_mb_mc_0xywh( h, x+0, y+1, 1, 1 );
-            x264_mb_mc_0xywh( h, x+1, y+1, 1, 1 );
-            break;
-        case D_L1_8x8:
+        const int i8 = x264_scan8[0] + x + 8*y;
+
+        if( h->mb.cache.ref[0][i8] >= 0 )
+            if( h->mb.cache.ref[1][i8] >= 0 )
+                x264_mb_mc_01xywh( h, x, y, 2, 2 );
+            else
+                x264_mb_mc_0xywh( h, x, y, 2, 2 );
+        else
             x264_mb_mc_1xywh( h, x, y, 2, 2 );
-            break;
-        case D_BI_8x8:
-            x264_mb_mc_01xywh( h, x, y, 2, 2 );
-            break;
-        case D_DIRECT_8x8:
-            x264_mb_mc_direct8x8( h, x, y );
-            break;
     }
 }

 void x264_mb_mc( x264_t *h )
 {
-    if( h->mb.i_type == P_L0 )
-    {
-        if( h->mb.i_partition == D_16x16 )
-        {
-            x264_mb_mc_0xywh( h, 0, 0, 4, 4 );
-        }
-        else if( h->mb.i_partition == D_16x8 )
-        {
-            x264_mb_mc_0xywh( h, 0, 0, 4, 2 );
-            x264_mb_mc_0xywh( h, 0, 2, 4, 2 );
-        }
-        else if( h->mb.i_partition == D_8x16 )
-        {
-            x264_mb_mc_0xywh( h, 0, 0, 2, 4 );
-            x264_mb_mc_0xywh( h, 2, 0, 2, 4 );
-        }
-    }
-    else if( h->mb.i_type == P_8x8 || h->mb.i_type == B_8x8 )
+    if( h->mb.i_partition == D_8x8 )
     {
         int i;
         for( i = 0; i < 4; i++ )
             x264_mb_mc_8x8( h, i );
     }
-    else if( h->mb.i_type == B_SKIP || h->mb.i_type == B_DIRECT )
-    {
-        x264_mb_mc_direct8x8( h, 0, 0 );
-        x264_mb_mc_direct8x8( h, 2, 0 );
-        x264_mb_mc_direct8x8( h, 0, 2 );
-        x264_mb_mc_direct8x8( h, 2, 2 );
-    }
-    else    /* B_*x* */
+    else
     {
-        const uint8_t *b_list0 = x264_mb_type_list_table[h->mb.i_type][0];
-        const uint8_t *b_list1 = x264_mb_type_list_table[h->mb.i_type][1];
+        const int ref0a = h->mb.cache.ref[0][x264_scan8[ 0]];
+        const int ref0b = h->mb.cache.ref[0][x264_scan8[12]];
+        const int ref1a = h->mb.cache.ref[1][x264_scan8[ 0]];
+        const int ref1b = h->mb.cache.ref[1][x264_scan8[12]];

         if( h->mb.i_partition == D_16x16 )
         {
-            if( b_list0[0] && b_list1[0] ) x264_mb_mc_01xywh( h, 0, 0, 4, 4 );
-            else if( b_list0[0] )          x264_mb_mc_0xywh ( h, 0, 0, 4, 4 );
-            else if( b_list1[0] )          x264_mb_mc_1xywh ( h, 0, 0, 4, 4 );
+            if( ref0a >= 0 )
+                if( ref1a >= 0 ) x264_mb_mc_01xywh( h, 0, 0, 4, 4 );
+                else             x264_mb_mc_0xywh ( h, 0, 0, 4, 4 );
+            else                 x264_mb_mc_1xywh ( h, 0, 0, 4, 4 );
         }
         else if( h->mb.i_partition == D_16x8 )
         {
-            if( b_list0[0] && b_list1[0] ) x264_mb_mc_01xywh( h, 0, 0, 4, 2 );
-            else if( b_list0[0] )          x264_mb_mc_0xywh ( h, 0, 0, 4, 2 );
-            else if( b_list1[0] )          x264_mb_mc_1xywh ( h, 0, 0, 4, 2 );
-
-            if( b_list0[1] && b_list1[1] ) x264_mb_mc_01xywh( h, 0, 2, 4, 2 );
-            else if( b_list0[1] )          x264_mb_mc_0xywh ( h, 0, 2, 4, 2 );
-            else if( b_list1[1] )          x264_mb_mc_1xywh ( h, 0, 2, 4, 2 );
+            if( ref0a >= 0 )
+                if( ref1a >= 0 ) x264_mb_mc_01xywh( h, 0, 0, 4, 2 );
+                else             x264_mb_mc_0xywh ( h, 0, 0, 4, 2 );
+            else                 x264_mb_mc_1xywh ( h, 0, 0, 4, 2 );
+
+            if( ref0b >= 0 )
+                if( ref1b >= 0 ) x264_mb_mc_01xywh( h, 0, 2, 4, 2 );
+                else             x264_mb_mc_0xywh ( h, 0, 2, 4, 2 );
+            else                 x264_mb_mc_1xywh ( h, 0, 2, 4, 2 );
         }
         else if( h->mb.i_partition == D_8x16 )
         {
-            if( b_list0[0] && b_list1[0] ) x264_mb_mc_01xywh( h, 0, 0, 2, 4 );
-            else if( b_list0[0] )          x264_mb_mc_0xywh ( h, 0, 0, 2, 4 );
-            else if( b_list1[0] )          x264_mb_mc_1xywh ( h, 0, 0, 2, 4 );
-
-            if( b_list0[1] && b_list1[1] ) x264_mb_mc_01xywh( h, 2, 0, 2, 4 );
-            else if( b_list0[1] )          x264_mb_mc_0xywh ( h, 2, 0, 2, 4 );
-            else if( b_list1[1] )          x264_mb_mc_1xywh ( h, 2, 0, 2, 4 );
+            if( ref0a >= 0 )
+                if( ref1a >= 0 ) x264_mb_mc_01xywh( h, 0, 0, 2, 4 );
+                else             x264_mb_mc_0xywh ( h, 0, 0, 2, 4 );
+            else                 x264_mb_mc_1xywh ( h, 0, 0, 2, 4 );
+
+            if( ref0b >= 0 )
+                if( ref1b >= 0 ) x264_mb_mc_01xywh( h, 2, 0, 2, 4 );
+                else             x264_mb_mc_0xywh ( h, 2, 0, 2, 4 );
+            else                 x264_mb_mc_1xywh ( h, 2, 0, 2, 4 );
         }
     }
 }
@@ -767,10 +764,6 @@ int x264_macroblock_cache_init( x264_t *h )
             h->mb.intra_border_backup[i][j] += 8;
         }

-    /* init with not available (for top right idx=7,15) */
-    memset( h->mb.cache.ref[0], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
-    memset( h->mb.cache.ref[1], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
-
     return 0;
 fail: return -1;
 }
@@ -808,6 +801,7 @@ void x264_macroblock_slice_init( x264_t *h )
     h->mb.ref[0] = h->fdec->ref[0];
     h->mb.ref[1] = h->fdec->ref[1];
     h->mb.type = h->fdec->mb_type;
+    h->mb.partition = h->fdec->mb_partition;

     h->fdec->i_ref[0] = h->i_ref0;
     h->fdec->i_ref[1] = h->i_ref1;
@@ -835,6 +829,10 @@ void x264_macroblock_slice_init( x264_t *h )
     if( h->sh.i_type == SLICE_TYPE_P )
         memset( h->mb.cache.skip, 0, X264_SCAN8_SIZE * sizeof( int8_t ) );

+    /* init with not available (for top right idx=7,15) */
+    memset( h->mb.cache.ref[0], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
+    memset( h->mb.cache.ref[1], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
+
     setup_inverse_delta_pocs( h );

     h->mb.i_neighbour4[6] =
@@ -1304,6 +1302,7 @@ void x264_macroblock_cache_save( x264_t *h )
     x264_prefetch_fenc( h, h->fdec, h->mb.i_mb_x, h->mb.i_mb_y );

     h->mb.type[i_mb_xy] = i_mb_type;
+    h->mb.partition[i_mb_xy] = IS_INTRA( i_mb_type ) ? D_16x16 : h->mb.i_partition;
     h->mb.i_mb_prev_xy = i_mb_xy;

     /* save intra4x4 */
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 1d48b7d..6ee5f8e 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -3149,6 +3149,7 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a  )

         case B_SKIP:
         case B_DIRECT:
+            h->mb.i_partition = h->mb.cache.direct_partition;
             x264_mb_load_mv_direct8x8( h, 0 );
             x264_mb_load_mv_direct8x8( h, 1 );
             x264_mb_load_mv_direct8x8( h, 2 );
--
1.6.1.2


From d0be7257766d40b39dd453ebe8a266b64d653f71 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Thu, 18 Feb 2010 10:37:57 -0800
Subject: [PATCH 08/16] Add temporal predictor support to interlaced encoding
 0.5-1% better compression in interlaced mode

---
 common/frame.h      |    2 +-
 common/macroblock.c |   26 +++++++++++++++++++-------
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/common/frame.h b/common/frame.h
index 6e7de50..0566b1e 100644
--- a/common/frame.h
+++ b/common/frame.h
@@ -85,7 +85,7 @@ typedef struct x264_frame
     int8_t  *ref[2];
     int     i_ref[2];
     int     ref_poc[2][16];
-    int     inv_ref_poc[16]; // inverse values (list0 only) to avoid divisions in MB encoding
+    int16_t inv_ref_poc[2][32]; // inverse values (list0 only) to avoid divisions in MB encoding

     /* for adaptive B-frame decision.
      * contains the SATD cost of the lowres frame encoded in various modes
diff --git a/common/macroblock.c b/common/macroblock.c
index 2573415..68c7e06 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -447,10 +447,14 @@ void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[
 #undef SET_MVP

     /* temporal predictors */
-    /* FIXME temporal scaling w/ interlace */
-    if( h->fref0[0]->i_ref[0] > 0 && !h->sh.b_mbaff )
+    if( h->fref0[0]->i_ref[0] > 0 )
     {
         x264_frame_t *l0 = h->fref0[0];
+        int field = h->mb.i_mb_y&1;
+        int curpoc = h->fdec->i_poc + field*h->sh.i_delta_poc_bottom;
+        int refpoc = h->fref0[i_ref>>h->sh.b_mbaff]->i_poc;
+        if( h->sh.b_mbaff && field^(i_ref&1) )
+            refpoc += h->sh.i_delta_poc_bottom;

 #define SET_TMVP(dx, dy) { \
             int i_b4 = h->mb.i_b4_xy + dx*4 + dy*4*h->mb.i_b4_stride; \
@@ -458,7 +462,7 @@ void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[
             int ref_col = l0->ref[0][i_b8]; \
             if( ref_col >= 0 ) \
             { \
-                int scale = (h->fdec->i_poc - h->fdec->ref_poc[0][i_ref]) * l0->inv_ref_poc[ref_col];\
+                int scale = (curpoc - refpoc) * l0->inv_ref_poc[h->mb.b_interlaced&field][ref_col];\
                 mvc[i][0] = (l0->mv[0][i_b4][0]*scale + 128) >> 8;\
                 mvc[i][1] = (l0->mv[0][i_b4][1]*scale + 128) >> 8;\
                 i++; \
@@ -479,11 +483,19 @@ void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[
 /* Set up a lookup table for delta pocs to reduce an IDIV to an IMUL */
 static void setup_inverse_delta_pocs( x264_t *h )
 {
-    int i;
-    for( i = 0; i < h->i_ref0; i++ )
+    int i, field;
+    for( field = 0; field <= h->sh.b_mbaff; field++ ) /* one pass when b_mbaff is 0, two (field = 0, 1) when it is 1 */
     {
-        int delta = h->fdec->i_poc - h->fref0[i]->i_poc;
-        h->fdec->inv_ref_poc[i] = (256 + delta/2) / delta;
+        int curpoc = h->fdec->i_poc + field*h->sh.i_delta_poc_bottom; /* the field = 1 pass adds i_delta_poc_bottom to the current POC */
+        for( i = 0; i < (h->i_ref0<<h->sh.b_mbaff); i++ ) /* with b_mbaff set there are two table entries per fref0 frame */
+        {
+            int refpoc = h->fref0[i>>h->sh.b_mbaff]->i_poc;
+            if( h->sh.b_mbaff && field^(i&1) ) /* low bit of i differs from field: offset the reference POC as well */
+                refpoc += h->sh.i_delta_poc_bottom;
+            int delta = curpoc - refpoc;
+
+            h->fdec->inv_ref_poc[field][i] = (256 + delta/2) / delta; /* roughly 256/delta, rounded; NOTE(review): divides by zero if delta == 0 -- confirm callers rule that out */
+        }
     }
 }

--
1.6.1.2


From da810dcc80ef85239a7c641b8af5c00f88aba1eb Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Thu, 18 Feb 2010 17:01:38 -0800
Subject: [PATCH 09/16] Much faster and more efficient MVD handling
 Store MV deltas as clipped absolute values.
 This means CABAC no longer has to calculate absolute values in MV context selection.
 This also lets us cut the memory spent on MVDs by a factor of 2, speeding up cache_mvd and reducing memory usage by 32*threads*(num macroblocks) bytes.
 On a Core i7 encoding 1080p, this is about 3 megabytes saved.

---
 common/common.h     |    8 ++++----
 common/macroblock.c |   47 +++++++++++++----------------------------------
 common/macroblock.h |   31 +++++++++++++++++++++++++++++--
 common/x86/util.h   |   40 +++++++++++++++++-----------------------
 encoder/cabac.c     |   20 +++++++++++---------
 encoder/me.c        |    3 ++-
 6 files changed, 76 insertions(+), 73 deletions(-)

diff --git a/common/common.h b/common/common.h
index 68f79ba..ab54508 100644
--- a/common/common.h
+++ b/common/common.h
@@ -171,13 +171,13 @@ static inline int x264_predictor_difference( int16_t (*mvc)[2], intptr_t i_mvc )
     return sum;
 }

-static inline uint32_t x264_cabac_amvd_sum( int16_t *mvdleft, int16_t *mvdtop )
+static inline uint16_t x264_cabac_mvd_sum( uint8_t *mvdleft, uint8_t *mvdtop )
 {
     int amvd0 = abs(mvdleft[0]) + abs(mvdtop[0]);
     int amvd1 = abs(mvdleft[1]) + abs(mvdtop[1]);
     amvd0 = (amvd0 > 2) + (amvd0 > 32);
     amvd1 = (amvd1 > 2) + (amvd1 > 32);
-    return amvd0 + (amvd1<<16);
+    return amvd0 + (amvd1<<8);
 }

 extern const uint8_t x264_exp2_lut[64];
@@ -527,7 +527,7 @@ struct x264_t
         uint8_t (*non_zero_count)[16+4+4];  /* nzc. for I_PCM set to 16 */
         int8_t  *chroma_pred_mode;          /* chroma_pred_mode. cabac only. for non intra I_PRED_CHROMA_DC(0) */
         int16_t (*mv[2])[2];                /* mb mv. set to 0 for intra mb */
-        int16_t (*mvd[2])[2];               /* mb mv difference with predict. set to 0 if intra. cabac only */
+        uint8_t (*mvd[2])[2];               /* absolute value of mb mv difference with predict, clipped to [0,33]. set to 0 if intra. cabac only */
         int8_t   *ref[2];                   /* mb ref. set to -1 if non used (intra or Lx only) */
         int16_t (*mvr[2][32])[2];           /* 16x16 mv for each possible ref */
         int8_t  *skipbp;                    /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */
@@ -621,7 +621,7 @@ struct x264_t

             /* 0 if not available */
             ALIGNED_16( int16_t mv[2][X264_SCAN8_SIZE][2] );
-            ALIGNED_8( int16_t mvd[2][X264_SCAN8_SIZE][2] );
+            ALIGNED_8( uint8_t mvd[2][X264_SCAN8_SIZE][2] );

             /* 1 if SKIP or DIRECT. set only for B-frames + CABAC */
             ALIGNED_4( int8_t skip[X264_SCAN8_SIZE] );
diff --git a/common/macroblock.c b/common/macroblock.c
index 68c7e06..8a4f095 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -712,8 +712,8 @@ int x264_macroblock_cache_init( x264_t *h )
     if( h->param.b_cabac )
     {
         CHECKED_MALLOC( h->mb.chroma_pred_mode, i_mb_count * sizeof(int8_t) );
-        CHECKED_MALLOC( h->mb.mvd[0], 2*16 * i_mb_count * sizeof(int16_t) );
-        CHECKED_MALLOC( h->mb.mvd[1], 2*16 * i_mb_count * sizeof(int16_t) );
+        CHECKED_MALLOC( h->mb.mvd[0], 2*16 * i_mb_count * sizeof(uint8_t) );
+        CHECKED_MALLOC( h->mb.mvd[1], 2*16 * i_mb_count * sizeof(uint8_t) );
     }

     for( i=0; i<2; i++ )
@@ -1211,33 +1211,24 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
             if( h->param.b_cabac )
             {
                 if( i_top_type >= 0 )
-                {
-                    const int i8 = x264_scan8[0] - 8;
-                    const int iv = i_top_4x4;
-                    CP64( h->mb.cache.mvd[i_list][i8+0], h->mb.mvd[i_list][iv+0] );
-                    CP64( h->mb.cache.mvd[i_list][i8+2], h->mb.mvd[i_list][iv+2] );
-                }
+                    CP64( h->mb.cache.mvd[i_list][x264_scan8[0] - 8], h->mb.mvd[i_list][i_top_4x4] );
                 else
-                {
-                    const int i8 = x264_scan8[0] - 8;
-                    M64( h->mb.cache.mvd[i_list][i8+0] ) = 0;
-                    M64( h->mb.cache.mvd[i_list][i8+2] ) = 0;
-                }
+                    M64( h->mb.cache.mvd[i_list][x264_scan8[0] - 8] ) = 0;

                 if( i_left_type >= 0 )
                 {
                     const int i8 = x264_scan8[0] - 1;
                     const int iv = i_mb_4x4 - 1;
-                    CP32( h->mb.cache.mvd[i_list][i8+0*8], h->mb.mvd[i_list][iv + 0*s4x4] );
-                    CP32( h->mb.cache.mvd[i_list][i8+1*8], h->mb.mvd[i_list][iv + 1*s4x4] );
-                    CP32( h->mb.cache.mvd[i_list][i8+2*8], h->mb.mvd[i_list][iv + 2*s4x4] );
-                    CP32( h->mb.cache.mvd[i_list][i8+3*8], h->mb.mvd[i_list][iv + 3*s4x4] );
+                    CP16( h->mb.cache.mvd[i_list][i8+0*8], h->mb.mvd[i_list][iv + 0*s4x4] );
+                    CP16( h->mb.cache.mvd[i_list][i8+1*8], h->mb.mvd[i_list][iv + 1*s4x4] );
+                    CP16( h->mb.cache.mvd[i_list][i8+2*8], h->mb.mvd[i_list][iv + 2*s4x4] );
+                    CP16( h->mb.cache.mvd[i_list][i8+3*8], h->mb.mvd[i_list][iv + 3*s4x4] );
                 }
                 else
                 {
                     const int i8 = x264_scan8[0] - 1;
                     for( i = 0; i < 4; i++ )
-                        M32( h->mb.cache.mvd[i_list][i8+i*8] ) = 0;
+                        M16( h->mb.cache.mvd[i_list][i8+i*8] ) = 0;
                 }
             }
         }
@@ -1416,30 +1407,18 @@ void x264_macroblock_cache_save( x264_t *h )
         if( !IS_INTRA( i_mb_type ) && !IS_SKIP( i_mb_type ) && !IS_DIRECT( i_mb_type ) )
         {
             for( y = 0; y < 4; y++ )
-            {
-                CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4+0], h->mb.cache.mvd[0][x264_scan8[0]+8*y+0] );
-                CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4+2], h->mb.cache.mvd[0][x264_scan8[0]+8*y+2] );
-            }
+                CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4], h->mb.cache.mvd[0][x264_scan8[0]+8*y] );
             if( h->sh.i_type == SLICE_TYPE_B )
                 for( y = 0; y < 4; y++ )
-                {
-                    CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4+0], h->mb.cache.mvd[1][x264_scan8[0]+8*y+0] );
-                    CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4+2], h->mb.cache.mvd[1][x264_scan8[0]+8*y+2] );
-                }
+                    CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4], h->mb.cache.mvd[1][x264_scan8[0]+8*y] );
         }
         else
         {
             for( y = 0; y < 4; y++ )
-            {
-                M64( h->mb.mvd[0][i_mb_4x4+y*s4x4+0] ) = 0;
-                M64( h->mb.mvd[0][i_mb_4x4+y*s4x4+2] ) = 0;
-            }
+                M64( h->mb.mvd[0][i_mb_4x4+y*s4x4] ) = 0;
             if( h->sh.i_type == SLICE_TYPE_B )
                 for( y = 0; y < 4; y++ )
-                {
-                    M64( h->mb.mvd[1][i_mb_4x4+y*s4x4+0] ) = 0;
-                    M64( h->mb.mvd[1][i_mb_4x4+y*s4x4+2] ) = 0;
-                }
+                    M64( h->mb.mvd[1][i_mb_4x4+y*s4x4] ) = 0;
         }

         if( h->sh.i_type == SLICE_TYPE_B )
diff --git a/common/macroblock.h b/common/macroblock.h
index 48f3105..eb903d2 100644
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -353,6 +353,33 @@ static ALWAYS_INLINE void x264_macroblock_cache_rect1( void *dst, int width, int
         if( height == 4 ) M16( d+6 ) = val2;
     }
 }
+static ALWAYS_INLINE void x264_macroblock_cache_rect2( void *dst, int width, int height, uint16_t val ) /* fill a width x height rectangle of 16-bit cache entries with val; rows are 8 entries apart */
+{
+    uint16_t *d = dst;
+    uint32_t val32 = val * 0x10001u; /* val in both halves; unsigned multiply avoids the signed-shift UB of val<<16 when val >= 0x8000 */
+    uint64_t val64 = val32 + ((uint64_t)val32<<32);
+    if( width == 4 )
+    {
+                          M64( d+ 0 ) = val64;
+        if( height >= 2 ) M64( d+ 8 ) = val64;
+        if( height == 4 ) M64( d+16 ) = val64; /* rows 2 and 3 are written only for height == 4, so height 3 behaves like 2 */
+        if( height == 4 ) M64( d+24 ) = val64;
+    }
+    else if( width == 2 )
+    {
+                          M32( d+ 0 ) = val32;
+        if( height >= 2 ) M32( d+ 8 ) = val32;
+        if( height == 4 ) M32( d+16 ) = val32;
+        if( height == 4 ) M32( d+24 ) = val32;
+    }
+    else //if( width == 1 )
+    {
+                          M16( d+ 0 ) = val;
+        if( height >= 2 ) M16( d+ 8 ) = val;
+        if( height == 4 ) M16( d+16 ) = val;
+        if( height == 4 ) M16( d+24 ) = val;
+    }
+}
 static ALWAYS_INLINE void x264_macroblock_cache_rect4( void *dst, int width, int height, uint32_t val )
 {
     int dy;
@@ -383,9 +410,9 @@ static ALWAYS_INLINE void x264_macroblock_cache_mv( x264_t *h, int x, int y, int
 {
     x264_macroblock_cache_rect4( &h->mb.cache.mv[i_list][X264_SCAN8_0+x+8*y], width, height, mv );
 }
-static ALWAYS_INLINE void x264_macroblock_cache_mvd( x264_t *h, int x, int y, int width, int height, int i_list, uint32_t mv )
+static ALWAYS_INLINE void x264_macroblock_cache_mvd( x264_t *h, int x, int y, int width, int height, int i_list, uint16_t mv )
 {
-    x264_macroblock_cache_rect4( &h->mb.cache.mvd[i_list][X264_SCAN8_0+x+8*y], width, height, mv );
+    x264_macroblock_cache_rect2( &h->mb.cache.mvd[i_list][X264_SCAN8_0+x+8*y], width, height, mv );
 }
 static ALWAYS_INLINE void x264_macroblock_cache_ref( x264_t *h, int x, int y, int width, int height, int i_list, uint8_t ref )
 {
diff --git a/common/x86/util.h b/common/x86/util.h
index c8bcf4b..0674323 100644
--- a/common/x86/util.h
+++ b/common/x86/util.h
@@ -77,32 +77,26 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t
     );
     return sum;
 }
-#define x264_cabac_amvd_sum x264_cabac_amvd_sum_mmxext
-static ALWAYS_INLINE uint32_t x264_cabac_amvd_sum_mmxext(int16_t *mvdleft, int16_t *mvdtop)
+#define x264_cabac_mvd_sum x264_cabac_mvd_sum_mmxext
+static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmxext(uint8_t *mvdleft, uint8_t *mvdtop) /* per component N: byte N of the result is (s>2)+(s>32) with s = mvdleft[N]+mvdtop[N] */
 {
-    static const uint64_t pw_2    = 0x0002000200020002ULL;
-    static const uint64_t pw_28   = 0x001C001C001C001CULL;
-    static const uint64_t pw_2184 = 0x0888088808880888ULL;
-    /* MIN(((x+28)*2184)>>16,2) = (x>2) + (x>32) */
-    /* 2184 = fix16(1/30) */
-    uint32_t amvd;
+    static const uint64_t pb_2    = 0x0202020202020202ULL;
+    static const uint64_t pb_32   = 0x2020202020202020ULL;
+    int amvd;
     asm(
-        "movd      %1, %%mm0 \n"
-        "movd      %2, %%mm1 \n"
-        "pxor   %%mm2, %%mm2 \n"
-        "pxor   %%mm3, %%mm3 \n"
-        "psubw  %%mm0, %%mm2 \n"
-        "psubw  %%mm1, %%mm3 \n"
-        "pmaxsw %%mm2, %%mm0 \n"
-        "pmaxsw %%mm3, %%mm1 \n"
-        "paddw     %3, %%mm0 \n"
-        "paddw  %%mm1, %%mm0 \n"
-        "pmulhuw   %4, %%mm0 \n"
-        "pminsw    %5, %%mm0 \n"
-        "movd   %%mm0, %0    \n"
+        "movd         %1, %%mm0 \n"
+        "movd         %2, %%mm1 \n"
+        "paddb     %%mm1, %%mm0 \n" /* per-byte left + top */
+        "pxor      %%mm2, %%mm2 \n" /* mm2 = 0, accumulates the result */
+        "movq      %%mm0, %%mm1 \n"
+        "pcmpgtb      %3, %%mm0 \n" /* 0xFF in each byte where sum > 2 (signed byte compare) */
+        "pcmpgtb      %4, %%mm1 \n" /* 0xFF in each byte where sum > 32 */
+        "psubb     %%mm0, %%mm2 \n" /* subtracting 0xFF adds 1 for each true compare */
+        "psubb     %%mm1, %%mm2 \n"
+        "movd      %%mm2, %0    \n" /* 32 bits are moved out; only the low 16 survive the uint16_t return */
         :"=r"(amvd)
-        :"m"(M32( mvdleft )),"m"(M32( mvdtop )),
-         "m"(pw_28),"m"(pw_2184),"m"(pw_2)
+        :"m"(M16( mvdleft )),"m"(M16( mvdtop )), /* NOTE(review): movd loads 32 bits from each pointer although the operand is typed as 16 -- confirm the over-read is always in bounds */
+         "m"(pb_2),"m"(pb_32)
     );
     return amvd;
 }
diff --git a/encoder/cabac.c b/encoder/cabac.c
index 271f527..083b783 100644
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
@@ -349,7 +349,7 @@ static void x264_cabac_mb_ref( x264_t *h, x264_cabac_t *cb, int i_list, int idx
     x264_cabac_encode_decision( cb, 54 + ctx, 0 );
 }

-static inline void x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int l, int mvd, int ctx )
+static inline int x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int l, int mvd, int ctx )
 {
     const int i_abs = abs( mvd );
     const int ctxbase = l ? 47 : 40;
@@ -408,32 +408,34 @@ static inline void x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_lis
         x264_cabac_encode_bypass( cb, mvd < 0 );
     }
 #endif
+    /* Since we don't need to keep track of MVDs larger than 33, just cap the value.
+     * This lets us store MVDs as 8-bit values instead of 16-bit. */
+    return X264_MIN( i_abs, 33 );
 }

-static NOINLINE uint32_t x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width )
+static NOINLINE uint16_t x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width )
 {
     ALIGNED_4( int16_t mvp[2] );
-    uint32_t amvd;
     int mdx, mdy;

     /* Calculate mvd */
     x264_mb_predict_mv( h, i_list, idx, width, mvp );
     mdx = h->mb.cache.mv[i_list][x264_scan8[idx]][0] - mvp[0];
     mdy = h->mb.cache.mv[i_list][x264_scan8[idx]][1] - mvp[1];
-    amvd = x264_cabac_amvd_sum(h->mb.cache.mvd[i_list][x264_scan8[idx] - 1],
-                               h->mb.cache.mvd[i_list][x264_scan8[idx] - 8]);
+    uint16_t amvd = x264_cabac_mvd_sum(h->mb.cache.mvd[i_list][x264_scan8[idx] - 1],
+                                       h->mb.cache.mvd[i_list][x264_scan8[idx] - 8]);

     /* encode */
-    x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 0, mdx, amvd&0xFFFF );
-    x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 1, mdy, amvd>>16 );
+    mdx = x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 0, mdx, amvd&0xFF );
+    mdy = x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 1, mdy, amvd>>8 );

-    return pack16to32_mask(mdx,mdy);
+    return pack8to16(mdx,mdy);
 }

 #define x264_cabac_mb_mvd(h,cb,i_list,idx,width,height)\
 do\
 {\
-    uint32_t mvd = x264_cabac_mb_mvd(h,cb,i_list,idx,width);\
+    uint16_t mvd = x264_cabac_mb_mvd(h,cb,i_list,idx,width);\
     x264_macroblock_cache_mvd( h, block_idx_x[idx], block_idx_y[idx], width, height, i_list, mvd );\
 } while(0)

diff --git a/encoder/me.c b/encoder/me.c
index f58a6a8..44f6c7d 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -1174,6 +1174,7 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int
     m->mv[0] = bmx;
     m->mv[1] = bmy;
     x264_macroblock_cache_mv ( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, pack16to32_mask(bmx, bmy) );
-    x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, pack16to32_mask(bmx - m->mvp[0], bmy - m->mvp[1]) );
+    uint16_t amvd = pack8to16(X264_MIN(abs(bmx - m->mvp[0]),33), X264_MIN(abs(bmy - m->mvp[1]),33));
+    x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, amvd );
     h->mb.b_skip_mc = 0;
 }
--
1.6.1.2


From 54d1bed32086228ce2de06a5207501bdf258d9a9 Mon Sep 17 00:00:00 2001
From: Anton Mitrofanov <BugMaster@narod.ru>
Date: Fri, 19 Feb 2010 10:45:22 -0800
Subject: [PATCH 10/16] Faster, more accurate psy-RD caching
 Keep more variants of cached Hadamard scores and only calculate them when necessary.
 Results in more calculation, but simpler lookups.
 Slightly more accurate due to internal rounding in SATD and SA8D functions.

---
 common/common.h      |    8 ++---
 common/x86/mc-a2.asm |    6 +++-
 encoder/analyse.c    |   39 ++++++---------------------
 encoder/rdo.c        |   69 ++++++++++++++++++++++++++++---------------------
 4 files changed, 55 insertions(+), 67 deletions(-)

diff --git a/common/common.h b/common/common.h
index ab54508..413b82f 100644
--- a/common/common.h
+++ b/common/common.h
@@ -583,11 +583,9 @@ struct x264_t
             ALIGNED_16( int16_t fenc_dct8[4][64] );
             ALIGNED_16( int16_t fenc_dct4[16][16] );

-            /* Psy RD SATD scores */
-            int fenc_satd[4][4];
-            int fenc_satd_sum;
-            int fenc_sa8d[2][2];
-            int fenc_sa8d_sum;
+            /* Psy RD SATD/SA8D scores cache */
+            ALIGNED_16( uint64_t fenc_hadamard_cache[9] );
+            ALIGNED_16( uint32_t fenc_satd_cache[32] );

             /* pointer over mb of the frame to be compressed */
             uint8_t *p_fenc[3];
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index f2e69c0..d86d6ef 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -731,15 +731,17 @@ cglobal x264_memcpy_aligned_sse2, 3,3
 ;-----------------------------------------------------------------------------
 %macro MEMZERO 1
 cglobal x264_memzero_aligned_%1, 2,2
+    add  r0, r1         ; r0 = one past the end of the buffer
+    neg  r1             ; r1 = -n, stepped up towards zero
     pxor m0, m0
 .loop:
-    sub r1d, mmsize*8
 %assign i 0
 %rep 8
     mova [r0 + r1 + i], m0
 %assign i i+mmsize
 %endrep
-    jg .loop
+    add r1, mmsize*8    ; full-width add: writing r1d would zero the upper half of the negative offset on x86_64
+    jl .loop
     REP_RET
 %endmacro

diff --git a/encoder/analyse.c b/encoder/analyse.c
index 6ee5f8e..02fbf7c 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -578,34 +578,13 @@ static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
         h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
 }

-/* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */
-static inline void x264_mb_cache_fenc_satd( x264_t *h )
+/* Reset fenc satd scores cache for psy RD */
+static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
 {
-    ALIGNED_16( static uint8_t zero[16] ) = {0};
-    uint8_t *fenc;
-    int x, y, satd_sum = 0, sa8d_sum = 0;
-    if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
-        x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
-    if( !h->mb.i_psy_rd )
-        return;
-    for( y = 0; y < 4; y++ )
-        for( x = 0; x < 4; x++ )
-        {
-            fenc = h->mb.pic.p_fenc[0]+x*4+y*4*FENC_STRIDE;
-            h->mb.pic.fenc_satd[y][x] = h->pixf.satd[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )
-                                      - (h->pixf.sad[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )>>1);
-            satd_sum += h->mb.pic.fenc_satd[y][x];
-        }
-    for( y = 0; y < 2; y++ )
-        for( x = 0; x < 2; x++ )
-        {
-            fenc = h->mb.pic.p_fenc[0]+x*8+y*8*FENC_STRIDE;
-            h->mb.pic.fenc_sa8d[y][x] = h->pixf.sa8d[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )
-                                      - (h->pixf.sad[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )>>2);
-            sa8d_sum += h->mb.pic.fenc_sa8d[y][x];
-        }
-    h->mb.pic.fenc_satd_sum = satd_sum;
-    h->mb.pic.fenc_sa8d_sum = sa8d_sum;
+    /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
+    h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
+    if( b_satd )
+        h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
 }

 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
@@ -1193,7 +1172,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
     h->mb.i_type = P_L0;
     if( a->i_mbrd )
     {
-        x264_mb_cache_fenc_satd( h );
+        x264_mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 );
         if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
         {
             h->mb.i_partition = D_16x16;
@@ -2432,7 +2411,7 @@ void x264_macroblock_analyse( x264_t *h )
     {
 intra_analysis:
         if( analysis.i_mbrd )
-            x264_mb_cache_fenc_satd( h );
+            x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
         x264_mb_analyse_intra( h, &analysis, COST_MAX );
         if( analysis.i_mbrd )
             x264_intra_rd( h, &analysis, COST_MAX );
@@ -2749,7 +2728,7 @@ intra_analysis:
         int b_skip = 0;

         if( analysis.i_mbrd )
-            x264_mb_cache_fenc_satd( h );
+            x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );

         h->mb.i_type = B_SKIP;
         if( h->mb.b_direct_auto_write )
diff --git a/encoder/rdo.c b/encoder/rdo.c
index e15f47d..fed2a28 100644
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -61,36 +61,44 @@ static uint16_t cabac_size_5ones[128];
 #define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \
         sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) )

-
-/* Sum the cached SATDs to avoid repeating them. */
-static inline int sum_satd( x264_t *h, int pixel, int x, int y )
+static inline uint64_t cached_hadamard( x264_t *h, int pixel, int x, int y ) /* hadamard_ac score of the fenc block at (x,y), computed on first use and cached afterwards */
 {
-    int satd = 0;
-    int min_x = x>>2;
-    int min_y = y>>2;
-    int max_x = (x>>2) + (x264_pixel_size[pixel].w>>2);
-    int max_y = (y>>2) + (x264_pixel_size[pixel].h>>2);
-    if( pixel == PIXEL_16x16 )
-        return h->mb.pic.fenc_satd_sum;
-    for( y = min_y; y < max_y; y++ )
-        for( x = min_x; x < max_x; x++ )
-            satd += h->mb.pic.fenc_satd[y][x];
-    return satd;
+    static const uint8_t hadamard_shift_x[4] = {4,   4,   3,   3}; /* tables are indexed by pixel; presumably PIXEL_16x16..PIXEL_8x8 -- TODO confirm enum order */
+    static const uint8_t hadamard_shift_y[4] = {4-0, 3-0, 4-1, 3-1};
+    static const uint8_t  hadamard_offset[4] = {0,   1,   3,   5}; /* first cache slot used by each block size */
+    int cache_index = (x >> hadamard_shift_x[pixel]) + (y >> hadamard_shift_y[pixel])
+                    + hadamard_offset[pixel];
+    uint64_t res = h->mb.pic.fenc_hadamard_cache[cache_index];
+    if( res ) /* slots hold score+1, so 0 means "not computed yet" */
+        return res - 1;
+    else
+    {
+        uint8_t *fenc = h->mb.pic.p_fenc[0] + x + y*FENC_STRIDE;
+        res = h->pixf.hadamard_ac[pixel]( fenc, FENC_STRIDE );
+        h->mb.pic.fenc_hadamard_cache[cache_index] = res + 1; /* +1 bias keeps a genuine score of 0 distinguishable from an empty slot */
+        return res;
+    }
 }

-static inline int sum_sa8d( x264_t *h, int pixel, int x, int y )
+static inline int cached_satd( x264_t *h, int pixel, int x, int y ) /* SATD minus (SAD>>1) of the fenc block at (x,y) against zeros, computed on first use and cached afterwards */
 {
-    int sa8d = 0;
-    int min_x = x>>3;
-    int min_y = y>>3;
-    int max_x = (x>>3) + (x264_pixel_size[pixel].w>>3);
-    int max_y = (y>>3) + (x264_pixel_size[pixel].h>>3);
-    if( pixel == PIXEL_16x16 )
-        return h->mb.pic.fenc_sa8d_sum;
-    for( y = min_y; y < max_y; y++ )
-        for( x = min_x; x < max_x; x++ )
-            sa8d += h->mb.pic.fenc_sa8d[y][x];
-    return sa8d;
+    static const uint8_t satd_shift_x[3] = {3,   2,   2}; /* tables are indexed by pixel - PIXEL_8x4, i.e. three block sizes starting at PIXEL_8x4 */
+    static const uint8_t satd_shift_y[3] = {2-1, 3-2, 2-2};
+    static const uint8_t  satd_offset[3] = {0,   8,   16}; /* first cache slot used by each block size */
+    ALIGNED_16( static uint8_t zero[16] ); /* static storage, hence all zeros; read with stride 0 below */
+    int cache_index = (x >> satd_shift_x[pixel - PIXEL_8x4]) + (y >> satd_shift_y[pixel - PIXEL_8x4])
+                    + satd_offset[pixel - PIXEL_8x4];
+    int res = h->mb.pic.fenc_satd_cache[cache_index];
+    if( res ) /* slots hold score+1, so 0 means "not computed yet" */
+        return res - 1;
+    else
+    {
+        uint8_t *fenc = h->mb.pic.p_fenc[0] + x + y*FENC_STRIDE;
+        int dc = h->pixf.sad[pixel]( fenc, FENC_STRIDE, zero, 0 ) >> 1; /* half the SAD against zeros, subtracted from the SATD below */
+        res = h->pixf.satd[pixel]( fenc, FENC_STRIDE, zero, 0 ) - dc;
+        h->mb.pic.fenc_satd_cache[cache_index] = res + 1; /* +1 bias keeps a genuine score of 0 distinguishable from an empty slot */
+        return res;
+    }
 }

 /* Psy RD distortion metric: SSD plus "Absolute Difference of Complexities" */
@@ -113,15 +121,16 @@ static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
         /* If the plane is smaller than 8x8, we can't do an SA8D; this probably isn't a big problem. */
         if( size <= PIXEL_8x8 )
         {
-            uint64_t acs = h->pixf.hadamard_ac[size]( fdec, FDEC_STRIDE );
-            satd = abs((int32_t)acs - sum_satd( h, size, x, y ))
-                 + abs((int32_t)(acs>>32) - sum_sa8d( h, size, x, y ));
+            uint64_t fdec_acs = h->pixf.hadamard_ac[size]( fdec, FDEC_STRIDE );
+            uint64_t fenc_acs = cached_hadamard( h, size, x, y );
+            satd = abs((int32_t)fdec_acs - (int32_t)fenc_acs)
+                 + abs((int32_t)(fdec_acs>>32) - (int32_t)(fenc_acs>>32));
             satd >>= 1;
         }
         else
         {
             int dc = h->pixf.sad[size]( fdec, FDEC_STRIDE, zero, 0 ) >> 1;
-            satd = abs(h->pixf.satd[size]( fdec, FDEC_STRIDE, zero, 0 ) - dc - sum_satd( h, size, x, y ));
+            satd = abs(h->pixf.satd[size]( fdec, FDEC_STRIDE, zero, 0 ) - dc - cached_satd( h, size, x, y ));
         }
         satd = (satd * h->mb.i_psy_rd * h->mb.i_psy_rd_lambda + 128) >> 8;
     }
--
1.6.1.2


From c45278a7107934fdad77c0cac14a924b97a6272e Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Sun, 21 Feb 2010 01:56:12 -0800
Subject: [PATCH 11/16] Move presets, tunings, and profiles into libx264
 Now any application calling libx264 can use them.
 Full documentation and guidelines for usage are included in x264.h.

---
 common/common.c |  266 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 x264.c          |  267 +++----------------------------------------------------
 x264.h          |   96 +++++++++++++++++---
 3 files changed, 357 insertions(+), 272 deletions(-)

diff --git a/common/common.c b/common/common.c
index 0dd7af5..a99b65b 100644
--- a/common/common.c
+++ b/common/common.c
@@ -36,7 +36,7 @@ static void x264_log_default( void *, int, const char *, va_list );
 /****************************************************************************
  * x264_param_default:
  ****************************************************************************/
-void    x264_param_default( x264_param_t *param )
+void x264_param_default( x264_param_t *param )
 {
     /* */
     memset( param, 0, sizeof( x264_param_t ) );
@@ -160,6 +160,270 @@ void    x264_param_default( x264_param_t *param )
     param->b_dts_compress = 0;
 }

+static int x264_param_apply_preset( x264_param_t *param, const char *preset )
+{   /* Overrides fields of *param for the named preset; returns 0, or -1 if the name is unknown. */
+    if( !strcasecmp( preset, "ultrafast" ) ) /* preset names are matched case-insensitively */
+    {
+        param->i_frame_reference = 1;
+        param->i_scenecut_threshold = 0;
+        param->b_deblocking_filter = 0;
+        param->b_cabac = 0;
+        param->i_bframe = 0;
+        param->analyse.intra = 0;
+        param->analyse.inter = 0;
+        param->analyse.b_transform_8x8 = 0;
+        param->analyse.i_me_method = X264_ME_DIA;
+        param->analyse.i_subpel_refine = 0;
+        param->rc.i_aq_mode = 0;
+        param->analyse.b_mixed_references = 0;
+        param->analyse.i_trellis = 0;
+        param->i_bframe_adaptive = X264_B_ADAPT_NONE;
+        param->rc.b_mb_tree = 0;
+        param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
+    }
+    else if( !strcasecmp( preset, "veryfast" ) )
+    {
+        param->analyse.inter = X264_ANALYSE_I8x8|X264_ANALYSE_I4x4;
+        param->analyse.i_me_method = X264_ME_DIA;
+        param->analyse.i_subpel_refine = 1;
+        param->i_frame_reference = 1;
+        param->analyse.b_mixed_references = 0;
+        param->analyse.i_trellis = 0;
+        param->rc.b_mb_tree = 0;
+        param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
+    }
+    else if( !strcasecmp( preset, "faster" ) )
+    {
+        param->analyse.b_mixed_references = 0;
+        param->i_frame_reference = 2;
+        param->analyse.i_subpel_refine = 4;
+        param->rc.b_mb_tree = 0;
+        param->analyse.i_weighted_pred = X264_WEIGHTP_BLIND;
+    }
+    else if( !strcasecmp( preset, "fast" ) )
+    {
+        param->i_frame_reference = 2;
+        param->analyse.i_subpel_refine = 6;
+        param->rc.i_lookahead = 30;
+    }
+    else if( !strcasecmp( preset, "medium" ) )
+    {
+        /* Default is medium: accepted as a valid name, but nothing is overridden */
+    }
+    else if( !strcasecmp( preset, "slow" ) )
+    {
+        param->analyse.i_me_method = X264_ME_UMH;
+        param->analyse.i_subpel_refine = 8;
+        param->i_frame_reference = 5;
+        param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
+        param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
+        param->rc.i_lookahead = 50;
+    }
+    else if( !strcasecmp( preset, "slower" ) )
+    {
+        param->analyse.i_me_method = X264_ME_UMH;
+        param->analyse.i_subpel_refine = 9;
+        param->i_frame_reference = 8;
+        param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
+        param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
+        param->analyse.inter |= X264_ANALYSE_PSUB8x8; /* added on top of whatever inter flags are already set */
+        param->analyse.i_trellis = 2;
+        param->rc.i_lookahead = 60;
+    }
+    else if( !strcasecmp( preset, "veryslow" ) )
+    {
+        param->analyse.i_me_method = X264_ME_UMH;
+        param->analyse.i_subpel_refine = 10;
+        param->analyse.i_me_range = 24;
+        param->i_frame_reference = 16;
+        param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
+        param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
+        param->analyse.inter |= X264_ANALYSE_PSUB8x8;
+        param->analyse.i_trellis = 2;
+        param->i_bframe = 8;
+        param->rc.i_lookahead = 60;
+    }
+    else if( !strcasecmp( preset, "placebo" ) )
+    {
+        param->analyse.i_me_method = X264_ME_TESA;
+        param->analyse.i_subpel_refine = 10;
+        param->analyse.i_me_range = 24;
+        param->i_frame_reference = 16;
+        param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
+        param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
+        param->analyse.inter |= X264_ANALYSE_PSUB8x8;
+        param->analyse.b_fast_pskip = 0;
+        param->analyse.i_trellis = 2;
+        param->i_bframe = 16;
+        param->rc.i_lookahead = 60;
+    }
+    else
+    {
+        /* Unknown name: *param has not been modified at this point. */
+        fprintf( stderr, "x264 [error]: invalid preset '%s'\n", preset );
+        return -1;
+    }
+    return 0;
+}
+
+/* Applies each tuning named in the ",./-+"-delimited list; returns 0 on success, -1 on allocation failure or unknown name. */
+static int x264_param_apply_tune( x264_param_t *param, const char *tune )
+{
+    char *s, *tmp = x264_malloc( strlen( tune ) + 1 ); /* +1: room for the terminating NUL copied by strcpy */
+    if( !tmp )
+        return -1;
+    int psy_tuning_used = 0;
+    for( s = strtok( strcpy( tmp, tune ), ",./-+" ); s; s = strtok( NULL, ",./-+" ) ) /* strtok writes into its input: tokenize the private copy */
+    {
+        if( !strncasecmp( s, "film", 4 ) )
+        {
+            if( psy_tuning_used++ ) goto psy_failure;
+            param->i_deblocking_filter_alphac0 = -1;
+            param->i_deblocking_filter_beta = -1;
+            param->analyse.f_psy_trellis = 0.15;
+        }
+        else if( !strncasecmp( s, "animation", 9 ) )
+        {
+            if( psy_tuning_used++ ) goto psy_failure;
+            param->i_frame_reference = param->i_frame_reference > 1 ? param->i_frame_reference*2 : 1;
+            param->i_deblocking_filter_alphac0 = 1;
+            param->i_deblocking_filter_beta = 1;
+            param->analyse.f_psy_rd = 0.4;
+            param->rc.f_aq_strength = 0.6;
+            param->i_bframe += 2;
+        }
+        else if( !strncasecmp( s, "grain", 5 ) )
+        {
+            if( psy_tuning_used++ ) goto psy_failure;
+            param->i_deblocking_filter_alphac0 = -2;
+            param->i_deblocking_filter_beta = -2;
+            param->analyse.f_psy_trellis = 0.25;
+            param->analyse.b_dct_decimate = 0;
+            param->rc.f_pb_factor = 1.1;
+            param->rc.f_ip_factor = 1.1;
+            param->rc.f_aq_strength = 0.5;
+            param->analyse.i_luma_deadzone[0] = 6;
+            param->analyse.i_luma_deadzone[1] = 6;
+            param->rc.f_qcompress = 0.8;
+        }
+        else if( !strncasecmp( s, "psnr", 4 ) )
+        {
+            if( psy_tuning_used++ ) goto psy_failure;
+            param->rc.i_aq_mode = X264_AQ_NONE;
+            param->analyse.b_psy = 0;
+        }
+        else if( !strncasecmp( s, "ssim", 4 ) )
+        {
+            if( psy_tuning_used++ ) goto psy_failure;
+            param->rc.i_aq_mode = X264_AQ_AUTOVARIANCE;
+            param->analyse.b_psy = 0;
+        }
+        else if( !strncasecmp( s, "fastdecode", 10 ) )
+        {
+            param->b_deblocking_filter = 0;
+            param->b_cabac = 0;
+            param->analyse.b_weighted_bipred = 0;
+            param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
+        }
+        else if( !strncasecmp( s, "zerolatency", 11 ) )
+        {
+            param->rc.i_lookahead = 0;
+            param->i_sync_lookahead = 0;
+            param->i_bframe = 0;
+            param->b_sliced_threads = 1;
+        }
+        else if( !strncasecmp( s, "touhou", 6 ) )
+        {
+            if( psy_tuning_used++ ) goto psy_failure;
+            param->i_frame_reference = param->i_frame_reference > 1 ? param->i_frame_reference*2 : 1;
+            param->i_deblocking_filter_alphac0 = -1;
+            param->i_deblocking_filter_beta = -1;
+            param->analyse.f_psy_trellis = 0.2;
+            param->rc.f_aq_strength = 1.3;
+            if( param->analyse.inter & X264_ANALYSE_PSUB16x16 )
+                param->analyse.inter |= X264_ANALYSE_PSUB8x8;
+        }
+        else
+        {
+            fprintf( stderr, "x264 [error]: invalid tune '%s'\n", s ); /* s points into tmp: print before freeing */
+            x264_free( tmp );
+            return -1;
+        }
+        if( 0 )
+        {
+    psy_failure: /* reached only via goto: warn, then fall through to the next token */
+            fprintf( stderr, "x264 [warning]: only 1 psy tuning can be used: ignoring tune %s\n", s );
+        }
+    }
+    x264_free( tmp );
+    return 0;
+}
+
+int x264_param_default_preset( x264_param_t *param, const char *preset, const char *tune )
+{
+    x264_param_default( param ); /* always reset to the stock defaults first; preset and tune only override them */
+
+    if( preset && x264_param_apply_preset( param, preset ) < 0 ) /* NULL preset: step skipped */
+        return -1;
+    if( tune && x264_param_apply_tune( param, tune ) < 0 ) /* NULL tune: step skipped; applied after the preset */
+        return -1;
+    return 0;
+}
+
+void x264_param_apply_fastfirstpass( x264_param_t *param )
+{
+    /* A first pass writes the stats file without reading one; only then switch to the faster settings. */
+    if( param->rc.b_stat_write && !param->rc.b_stat_read )
+    {
+        param->i_frame_reference = 1;
+        param->analyse.b_transform_8x8 = 0;
+        param->analyse.inter = 0;
+        param->analyse.i_me_method = X264_ME_DIA;
+        param->analyse.i_subpel_refine = X264_MIN( 2, param->analyse.i_subpel_refine ); /* only ever lowered, never raised */
+        param->analyse.i_trellis = 0;
+    }
+}
+
+int x264_param_apply_profile( x264_param_t *param, const char *profile )
+{
+    if( !profile )
+        return 0; /* NULL profile: nothing to restrict */
+
+    if( !strcasecmp( profile, "baseline" ) ) /* profile names are matched case-insensitively */
+    {
+        param->analyse.b_transform_8x8 = 0;
+        param->b_cabac = 0;
+        param->i_cqm_preset = X264_CQM_FLAT;
+        param->i_bframe = 0;
+        param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
+        if( param->b_interlaced ) /* checked after the fields above have already been overwritten */
+        {
+            fprintf( stderr, "x264 [error]: baseline profile doesn't support interlacing\n" );
+            return -1;
+        }
+    }
+    else if( !strcasecmp( profile, "main" ) )
+    {
+        param->analyse.b_transform_8x8 = 0;
+        param->i_cqm_preset = X264_CQM_FLAT;
+    }
+    else if( !strcasecmp( profile, "high" ) )
+    {
+        /* Default: accepted as a valid name, but no field is changed */
+    }
+    else
+    {
+        fprintf( stderr, "x264 [error]: invalid profile: %s\n", profile );
+        return -1;
+    }
+    if( (param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant == 0) || /* rejected for every profile accepted above */
+        (param->rc.i_rc_method == X264_RC_CRF && param->rc.f_rf_constant == 0) )
+    {
+        fprintf( stderr, "x264 [error]: %s profile doesn't support lossless\n", profile );
+        return -1;
+    }
+    return 0;
+}
+
 static int parse_enum( const char *arg, const char * const *names, int *dst )
 {
     int i;
diff --git a/x264.c b/x264.c
index 959626a..2875dd1 100644
--- a/x264.c
+++ b/x264.c
@@ -115,8 +115,6 @@ int main( int argc, char **argv )
     _setmode(_fileno(stdout), _O_BINARY);
 #endif

-    x264_param_default( &param );
-
     /* Parse command line */
     if( Parse( argc, argv, &param, &opt ) < 0 )
         return -1;
@@ -799,12 +797,13 @@ static int Parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt )
     char *profile = NULL;
     int b_thread_input = 0;
     int b_turbo = 1;
-    int b_pass1 = 0;
     int b_user_ref = 0;
     int b_user_fps = 0;
     int b_user_interlaced = 0;
     int i;
     cli_input_opt_t input_opt;
+    char *preset = NULL;
+    char *tune = NULL;

     memset( opt, 0, sizeof(cli_opt_t) );
     memset( &input_opt, 0, sizeof(cli_input_opt_t) );
@@ -816,219 +815,20 @@ static int Parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt )
         int c = getopt_long( argc, argv, short_options, long_options, NULL );
         if( c == -1 )
             break;
-
         if( c == OPT_PRESET )
         {
-            if( !strcasecmp( optarg, "ultrafast" ) )
-            {
-                param->i_frame_reference = 1;
-                param->i_scenecut_threshold = 0;
-                param->b_deblocking_filter = 0;
-                param->b_cabac = 0;
-                param->i_bframe = 0;
-                param->analyse.intra = 0;
-                param->analyse.inter = 0;
-                param->analyse.b_transform_8x8 = 0;
-                param->analyse.i_me_method = X264_ME_DIA;
-                param->analyse.i_subpel_refine = 0;
-                param->rc.i_aq_mode = 0;
-                param->analyse.b_mixed_references = 0;
-                param->analyse.i_trellis = 0;
-                param->i_bframe_adaptive = X264_B_ADAPT_NONE;
-                param->rc.b_mb_tree = 0;
-                param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
-            }
-            else if( !strcasecmp( optarg, "veryfast" ) )
-            {
-                param->analyse.inter = X264_ANALYSE_I8x8|X264_ANALYSE_I4x4;
-                param->analyse.i_me_method = X264_ME_DIA;
-                param->analyse.i_subpel_refine = 1;
-                param->i_frame_reference = 1;
-                param->analyse.b_mixed_references = 0;
-                param->analyse.i_trellis = 0;
-                param->rc.b_mb_tree = 0;
-                param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
-            }
-            else if( !strcasecmp( optarg, "faster" ) )
-            {
-                param->analyse.b_mixed_references = 0;
-                param->i_frame_reference = 2;
-                param->analyse.i_subpel_refine = 4;
-                param->rc.b_mb_tree = 0;
-                param->analyse.i_weighted_pred = X264_WEIGHTP_BLIND;
-            }
-            else if( !strcasecmp( optarg, "fast" ) )
-            {
-                param->i_frame_reference = 2;
-                param->analyse.i_subpel_refine = 6;
-                param->rc.i_lookahead = 30;
-            }
-            else if( !strcasecmp( optarg, "medium" ) )
-            {
-                /* Default is medium */
-            }
-            else if( !strcasecmp( optarg, "slow" ) )
-            {
-                param->analyse.i_me_method = X264_ME_UMH;
-                param->analyse.i_subpel_refine = 8;
-                param->i_frame_reference = 5;
-                param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
-                param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
-                param->rc.i_lookahead = 50;
-            }
-            else if( !strcasecmp( optarg, "slower" ) )
-            {
-                param->analyse.i_me_method = X264_ME_UMH;
-                param->analyse.i_subpel_refine = 9;
-                param->i_frame_reference = 8;
-                param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
-                param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
-                param->analyse.inter |= X264_ANALYSE_PSUB8x8;
-                param->analyse.i_trellis = 2;
-                param->rc.i_lookahead = 60;
-            }
-            else if( !strcasecmp( optarg, "veryslow" ) )
-            {
-                param->analyse.i_me_method = X264_ME_UMH;
-                param->analyse.i_subpel_refine = 10;
-                param->analyse.i_me_range = 24;
-                param->i_frame_reference = 16;
-                param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
-                param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
-                param->analyse.inter |= X264_ANALYSE_PSUB8x8;
-                param->analyse.i_trellis = 2;
-                param->i_bframe = 8;
-                param->rc.i_lookahead = 60;
-            }
-            else if( !strcasecmp( optarg, "placebo" ) )
-            {
-                param->analyse.i_me_method = X264_ME_TESA;
-                param->analyse.i_subpel_refine = 10;
-                param->analyse.i_me_range = 24;
-                param->i_frame_reference = 16;
-                param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
-                param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
-                param->analyse.inter |= X264_ANALYSE_PSUB8x8;
-                param->analyse.b_fast_pskip = 0;
-                param->analyse.i_trellis = 2;
-                param->i_bframe = 16;
-                param->rc.i_lookahead = 60;
+            preset = optarg;
+            if( !strcasecmp( preset, "placebo" ) ) /* case-insensitive, like the preset lookup itself */
                 b_turbo = 0;
-            }
-            else
-            {
-                fprintf( stderr, "x264 [error]: invalid preset '%s'\n", optarg );
-                return -1;
-            }
         }
-        else if( c == '?' )
-            return -1;
-    }
-
-    /* Tunings are applied next. */
-    for( optind = 0;; )
-    {
-        int c = getopt_long( argc, argv, short_options, long_options, NULL );
-        if( c == -1 )
-            break;
-
         if( c == OPT_TUNE )
-        {
-            char *s = strtok( optarg, ",./-+" );
-            int psy_tuning_used = 0;
-            while( s )
-            {
-                if( !strncasecmp( s, "film", 4 ) )
-                {
-                    if( psy_tuning_used ) goto psy_failure;
-                    param->i_deblocking_filter_alphac0 = -1;
-                    param->i_deblocking_filter_beta = -1;
-                    param->analyse.f_psy_trellis = 0.15;
-                    psy_tuning_used = 1;
-                }
-                else if( !strncasecmp( s, "animation", 9 ) )
-                {
-                    if( psy_tuning_used ) goto psy_failure;
-                    param->i_frame_reference = param->i_frame_reference > 1 ? param->i_frame_reference*2 : 1;
-                    param->i_deblocking_filter_alphac0 = 1;
-                    param->i_deblocking_filter_beta = 1;
-                    param->analyse.f_psy_rd = 0.4;
-                    param->rc.f_aq_strength = 0.6;
-                    param->i_bframe += 2;
-                    psy_tuning_used = 1;
-                }
-                else if( !strncasecmp( s, "grain", 5 ) )
-                {
-                    if( psy_tuning_used ) goto psy_failure;
-                    param->i_deblocking_filter_alphac0 = -2;
-                    param->i_deblocking_filter_beta = -2;
-                    param->analyse.f_psy_trellis = 0.25;
-                    param->analyse.b_dct_decimate = 0;
-                    param->rc.f_pb_factor = 1.1;
-                    param->rc.f_ip_factor = 1.1;
-                    param->rc.f_aq_strength = 0.5;
-                    param->analyse.i_luma_deadzone[0] = 6;
-                    param->analyse.i_luma_deadzone[1] = 6;
-                    param->rc.f_qcompress = 0.8;
-                    psy_tuning_used = 1;
-                }
-                else if( !strncasecmp( s, "psnr", 4 ) )
-                {
-                    if( psy_tuning_used ) goto psy_failure;
-                    param->rc.i_aq_mode = X264_AQ_NONE;
-                    param->analyse.b_psy = 0;
-                    psy_tuning_used = 1;
-                }
-                else if( !strncasecmp( s, "ssim", 4 ) )
-                {
-                    if( psy_tuning_used ) goto psy_failure;
-                    param->rc.i_aq_mode = X264_AQ_AUTOVARIANCE;
-                    param->analyse.b_psy = 0;
-                    psy_tuning_used = 1;
-                }
-                else if( !strncasecmp( s, "fastdecode", 10 ) )
-                {
-                    param->b_deblocking_filter = 0;
-                    param->b_cabac = 0;
-                    param->analyse.b_weighted_bipred = 0;
-                    param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
-                }
-                else if( !strncasecmp( s, "zerolatency", 11 ) )
-                {
-                    param->rc.i_lookahead = 0;
-                    param->i_sync_lookahead = 0;
-                    param->i_bframe = 0;
-                    param->b_sliced_threads = 1;
-                }
-                else if( !strncasecmp( s, "touhou", 6 ) )
-                {
-                    if( psy_tuning_used ) goto psy_failure;
-                    param->i_frame_reference = param->i_frame_reference > 1 ? param->i_frame_reference*2 : 1;
-                    param->i_deblocking_filter_alphac0 = -1;
-                    param->i_deblocking_filter_beta = -1;
-                    param->analyse.f_psy_trellis = 0.2;
-                    param->rc.f_aq_strength = 1.3;
-                    if( param->analyse.inter & X264_ANALYSE_PSUB16x16 )
-                        param->analyse.inter |= X264_ANALYSE_PSUB8x8;
-                    psy_tuning_used = 1;
-                }
-                else
-                {
-                    fprintf( stderr, "x264 [error]: invalid tune '%s'\n", s );
-                    return -1;
-                }
-                if( 0 )
-                {
-psy_failure:
-                    fprintf( stderr, "x264 [warning]: only 1 psy tuning can be used: ignoring tune %s\n", s );
-                }
-                s = strtok( NULL, ",./-+" );
-            }
-        }
+            tune = optarg;
         else if( c == '?' )
             return -1;
     }

+    if( x264_param_default_preset( param, preset, tune ) < 0 ) return -1; /* invalid preset/tune: already reported on stderr */
+
     /* Parse command line options */
     for( optind = 0;; )
     {
@@ -1144,9 +944,6 @@ psy_failure:
             case 'r':
                 b_user_ref = 1;
                 goto generic_option;
-            case 'p':
-                b_pass1 = atoi( optarg ) == 1;
-                goto generic_option;
             case OPT_FPS:
                 b_user_fps = 1;
                 param->b_vfr_input = 0;
@@ -1185,54 +982,12 @@ generic_option:
         }
     }

-    /* Set faster options in case of turbo firstpass. */
-    if( b_turbo && b_pass1 )
-    {
-        param->i_frame_reference = 1;
-        param->analyse.b_transform_8x8 = 0;
-        param->analyse.inter = 0;
-        param->analyse.i_me_method = X264_ME_DIA;
-        param->analyse.i_subpel_refine = X264_MIN( 2, param->analyse.i_subpel_refine );
-        param->analyse.i_trellis = 0;
-    }
+    /* If first pass mode is used, apply faster settings. */
+    if( b_turbo )
+        x264_param_apply_fastfirstpass( param );

     /* Apply profile restrictions. */
-    if( profile )
-    {
-        if( !strcasecmp( profile, "baseline" ) )
-        {
-            param->analyse.b_transform_8x8 = 0;
-            param->b_cabac = 0;
-            param->i_cqm_preset = X264_CQM_FLAT;
-            param->i_bframe = 0;
-            param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
-            if( param->b_interlaced )
-            {
-                fprintf( stderr, "x264 [error]: baseline profile doesn't support interlacing\n" );
-                return -1;
-            }
-        }
-        else if( !strcasecmp( profile, "main" ) )
-        {
-            param->analyse.b_transform_8x8 = 0;
-            param->i_cqm_preset = X264_CQM_FLAT;
-        }
-        else if( !strcasecmp( profile, "high" ) )
-        {
-            /* Default */
-        }
-        else
-        {
-            fprintf( stderr, "x264 [error]: invalid profile: %s\n", profile );
-            return -1;
-        }
-        if( (param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant == 0) ||
-            (param->rc.i_rc_method == X264_RC_CRF && param->rc.f_rf_constant == 0) )
-        {
-            fprintf( stderr, "x264 [error]: %s profile doesn't support lossless\n", profile );
-            return -1;
-        }
-    }
+    if( x264_param_apply_profile( param, profile ) < 0 ) return -1; /* invalid or unsatisfiable profile: already reported on stderr */

     /* Get the file name */
     if( optind > argc - 1 || !output_filename )
diff --git a/x264.h b/x264.h
index e7d19b7..f317e98 100644
--- a/x264.h
+++ b/x264.h
@@ -35,14 +35,14 @@

 #include <stdarg.h>

-#define X264_BUILD 85
+#define X264_BUILD 86

 /* x264_t:
  *      opaque handler for encoder */
 typedef struct x264_t x264_t;

 /****************************************************************************
- * Initialisation structure and function.
+ * Encoder parameters
  ****************************************************************************/
 /* CPU flags
  */
@@ -332,6 +332,10 @@ typedef struct x264_param_t
     void (*param_free)( void* );
 } x264_param_t;

+/****************************************************************************
+ * H.264 level restriction information
+ ****************************************************************************/
+
 typedef struct {
     int level_idc;
     int mbps;        /* max macroblock processing rate (macroblocks/sec) */
@@ -350,6 +354,10 @@ typedef struct {
 /* all of the levels defined in the standard, terminated by .level_idc=0 */
 extern const x264_level_t x264_levels[];

+/****************************************************************************
+ * Basic parameter handling functions
+ ****************************************************************************/
+
 /* x264_param_default:
  *      fill x264_param_t with default values and do CPU detection */
 void    x264_param_default( x264_param_t * );
@@ -366,15 +374,73 @@ void    x264_param_default( x264_param_t * );
 int x264_param_parse( x264_param_t *, const char *name, const char *value );

 /****************************************************************************
- * Picture structures and functions.
+ * Advanced parameter handling functions
+ ****************************************************************************/
+
+/* These functions expose the full power of x264's preset-tune-profile system for
+ * easy adjustment of large numbers of internal parameters.
+ *
+ * In order to replicate x264CLI's option handling, these functions MUST be called
+ * in the following order:
+ * 1) x264_param_default_preset
+ * 2) Custom user options (via param_parse or directly assigned variables)
+ * 3) x264_param_apply_fastfirstpass
+ * 4) x264_param_apply_profile
+ *
+ * Additionally, x264CLI does not apply step 3 if the preset chosen is "placebo"
+ * or --slow-firstpass is set. */
+
+/* x264_param_default_preset:
+ *      The same as x264_param_default, but also use the passed preset and tune
+ *      to modify the default settings.
+ *      (either can be NULL, which implies no preset or no tune, respectively)
+ *
+ *      Currently available presets are, ordered from fastest to slowest: */
+static const char * const x264_preset_names[] = { "ultrafast", "veryfast", "faster", "fast", "medium", "slow", "slower", "veryslow", "placebo", 0 };
+
+/*      Warning: the speed of these presets scales dramatically.  Ultrafast is a full
+ *      100 times faster than placebo!
+ *
+ *      Currently available tunings are: */
+static const char * const x264_tune_names[] = { "film", "animation", "grain", "psnr", "ssim", "fastdecode", "zerolatency", 0 };
+
+/*      Multiple tunings can be used if separated by a delimiter in ",./-+",
+ *      however multiple psy tunings cannot be used.
+ *      film, animation, grain, psnr, and ssim are psy tunings.
+ *
+ *      returns 0 on success, negative on failure (e.g. invalid preset/tune name). */
+int     x264_param_default_preset( x264_param_t *, const char *preset, const char *tune );
+
+/* x264_param_apply_fastfirstpass:
+ *      If first-pass mode is set (rc.b_stat_read == 0, rc.b_stat_write == 1),
+ *      modify the encoder settings to disable options generally not useful on
+ *      the first pass. */
+void    x264_param_apply_fastfirstpass( x264_param_t * );
+
+/* x264_param_apply_profile:
+ *      Applies the restrictions of the given profile.
+ *      Currently available profiles are, from most to least restrictive: */
+static const char * const x264_profile_names[] = { "baseline", "main", "high", 0 };
+
+/*      (can be NULL, in which case the function will do nothing)
+ *
+ *      Does NOT guarantee that the given profile will be used: if the restrictions
+ *      of "High" are applied to settings that are already Baseline-compatible, the
+ *      stream will remain baseline.  In short, it does not increase settings, only
+ *      decrease them.
+ *
+ *      returns 0 on success, negative on failure (e.g. invalid profile name). */
+int     x264_param_apply_profile( x264_param_t *, const char *profile );
+
+/****************************************************************************
+ * Picture structures and functions
  ****************************************************************************/
 typedef struct
 {
-    int     i_csp;
-
-    int     i_plane;
-    int     i_stride[4];
-    uint8_t *plane[4];
+    int     i_csp;       /* Colorspace */
+    int     i_plane;     /* Number of image planes */
+    int     i_stride[4]; /* Strides for each plane */
+    uint8_t *plane[4];   /* Pointers to each plane */
 } x264_image_t;

 typedef struct
@@ -421,9 +487,9 @@ int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_heigh
 void x264_picture_clean( x264_picture_t *pic );

 /****************************************************************************
- * NAL structure and functions:
+ * NAL structure and functions
  ****************************************************************************/
-/* nal */
+
 enum nal_unit_type_e
 {
     NAL_UNKNOWN = 0,
@@ -465,7 +531,7 @@ typedef struct
 } x264_nal_t;

 /****************************************************************************
- * Encoder functions:
+ * Encoder functions
  ****************************************************************************/

 /* Force a link error in the case of linking against an incompatible API version.
@@ -497,16 +563,16 @@ int     x264_encoder_reconfig( x264_t *, x264_param_t * );
 void    x264_encoder_parameters( x264_t *, x264_param_t * );
 /* x264_encoder_headers:
  *      return the SPS and PPS that will be used for the whole stream.
- *      if i_nal > 0, returns the total size of all NAL payloads.
+ *      *pi_nal is the number of NAL units outputted in pp_nal.
  *      returns negative on error.
  *      the payloads of all output NALs are guaranteed to be sequential in memory. */
-int     x264_encoder_headers( x264_t *, x264_nal_t **, int * );
+int     x264_encoder_headers( x264_t *, x264_nal_t **pp_nal, int *pi_nal );
 /* x264_encoder_encode:
  *      encode one picture.
- *      if i_nal > 0, returns the total size of all NAL payloads.
+ *      *pi_nal is the number of NAL units outputted in pp_nal.
  *      returns negative on error, zero if no NAL units returned.
  *      the payloads of all output NALs are guaranteed to be sequential in memory. */
-int     x264_encoder_encode ( x264_t *, x264_nal_t **, int *, x264_picture_t *, x264_picture_t * );
+int     x264_encoder_encode( x264_t *, x264_nal_t **pp_nal, int *pi_nal, x264_picture_t *pic_in, x264_picture_t *pic_out );
 /* x264_encoder_close:
  *      close an encoder handler */
 void    x264_encoder_close  ( x264_t * );
--
1.6.1.2


From cb7143299578377dbe1e11a93c074d0890d487e0 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Sun, 21 Feb 2010 03:56:06 -0800
Subject: [PATCH 12/16] Make b-pyramid normal the default
 Now that b-pyramid works with MB-tree and is spec compliant, there's no real reason not to make it default.
 Improves compression 0-5% depending on the video.
 Also allow 0/1/2 to be used as aliases for none/strict/normal (for conciseness).

---
 common/common.c |    9 ++++++++-
 x264.h          |    2 +-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/common/common.c b/common/common.c
index a99b65b..2faf139 100644
--- a/common/common.c
+++ b/common/common.c
@@ -75,7 +75,7 @@ void x264_param_default( x264_param_t *param )
     param->i_scenecut_threshold = 40;
     param->i_bframe_adaptive = X264_B_ADAPT_FAST;
     param->i_bframe_bias = 0;
-    param->i_bframe_pyramid = 0;
+    param->i_bframe_pyramid = X264_B_PYRAMID_NORMAL;
     param->b_interlaced = 0;
     param->b_constrained_intra = 0;

@@ -637,7 +637,14 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
     OPT("b-bias")
         p->i_bframe_bias = atoi(value);
     OPT("b-pyramid")
+    {
         b_error |= parse_enum( value, x264_b_pyramid_names, &p->i_bframe_pyramid );
+        if( b_error )
+        {
+            b_error = 0;
+            p->i_bframe_pyramid = atoi(value);
+        }
+    }
     OPT("nf")
         p->b_deblocking_filter = !atobool(value);
     OPT2("filter", "deblock")
diff --git a/x264.h b/x264.h
index f317e98..dec296c 100644
--- a/x264.h
+++ b/x264.h
@@ -35,7 +35,7 @@

 #include <stdarg.h>

-#define X264_BUILD 86
+#define X264_BUILD 87

 /* x264_t:
  *      opaque handler for encoder */
--
1.6.1.2


From edebcf0074105c058c60e33b5bf7323743eb19e6 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Sun, 21 Feb 2010 13:20:19 -0800
Subject: [PATCH 13/16] Abide by the MinCR level limit
 Some Blu-ray analyzers were complaining about this.

---
 encoder/ratecontrol.c |   29 +++++++++++++++++++++++++++--
 encoder/set.c         |   32 ++++++++++++++++----------------
 x264.h                |    3 ++-
 3 files changed, 45 insertions(+), 19 deletions(-)

diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index 3d86aaa..d0fdb50 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -136,6 +136,7 @@ struct x264_ratecontrol_t
     /* MBRC stuff */
     float frame_size_estimated; /* Access to this variable must be atomic: double is
                                  * not atomic on all arches we care about */
+    double frame_size_maximum;  /* Maximum frame size due to MinCR */
     double frame_size_planned;
     double slice_size_planned;
     double max_frame_error;
@@ -1039,6 +1040,24 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp, int overhead )
         memset( h->fdec->i_row_bits, 0, h->sps->i_mb_height * sizeof(int) );
         rc->row_pred = &rc->row_preds[h->sh.i_type];
         update_vbv_plan( h, overhead );
+
+        const x264_level_t *l = x264_levels;
+        while( l->level_idc != 0 && l->level_idc != h->param.i_level_idc )
+            l++;
+
+        /* The spec has a bizarre special case for the first frame. */
+        if( h->i_frame == 0 )
+        {
+            //384 * ( Max( PicSizeInMbs, fR * MaxMBPS ) + MaxMBPS * ( tr( 0 ) - tr,n( 0 ) ) ) / MinCR
+            double fr = 1. / 172;
+            int pic_size_in_mbs = h->sps->i_mb_width * h->sps->i_mb_height;
+            rc->frame_size_maximum = 384 * 8 * X264_MAX( pic_size_in_mbs, fr*l->mbps ) / l->mincr;
+        }
+        else
+        {
+            //384 * MaxMBPS * ( tr( n ) - tr( n - 1 ) ) / MinCR
+            rc->frame_size_maximum = 384 * 8 * (1 / rc->fps) * l->mbps / l->mincr;
+        }
     }

     if( h->sh.i_type != SLICE_TYPE_B )
@@ -1220,9 +1239,10 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
             b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
         }

-        /* avoid VBV underflow */
+        /* avoid VBV underflow or MinCR violation */
         while( (rc->qpm < h->param.rc.i_qp_max)
-               && (rc->buffer_fill - b1 < rc->buffer_rate * rc->max_frame_error) )
+               && ((rc->buffer_fill - b1 < rc->buffer_rate * rc->max_frame_error) ||
+                   (rc->frame_size_maximum - b1 < rc->frame_size_maximum * rc->max_frame_error)))
         {
             rc->qpm ++;
             b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
@@ -1677,6 +1697,11 @@ static double clip_qscale( x264_t *h, int pict_type, double q )
             q = X264_MAX( q0, q );
         }

+        /* Apply MinCR restrictions */
+        double bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
+        if( bits > rcc->frame_size_maximum )
+            q *= bits / rcc->frame_size_maximum;
+
         /* Check B-frame complexity, and use up any bits that would
          * overflow before the next P-frame. */
         if( h->sh.i_type == SLICE_TYPE_P && !rcc->single_frame_vbv )
diff --git a/encoder/set.c b/encoder/set.c
index f79919b..03a6dee 100644
--- a/encoder/set.c
+++ b/encoder/set.c
@@ -536,22 +536,22 @@ fail:

 const x264_level_t x264_levels[] =
 {
-    { 10,   1485,    99,   152064,     64,    175,  64, 64,  0, 0, 0, 1 },
-//  {"1b",  1485,    99,   152064,    128,    350,  64, 64,  0, 0, 0, 1 },
-    { 11,   3000,   396,   345600,    192,    500, 128, 64,  0, 0, 0, 1 },
-    { 12,   6000,   396,   912384,    384,   1000, 128, 64,  0, 0, 0, 1 },
-    { 13,  11880,   396,   912384,    768,   2000, 128, 64,  0, 0, 0, 1 },
-    { 20,  11880,   396,   912384,   2000,   2000, 128, 64,  0, 0, 0, 1 },
-    { 21,  19800,   792,  1824768,   4000,   4000, 256, 64,  0, 0, 0, 0 },
-    { 22,  20250,  1620,  3110400,   4000,   4000, 256, 64,  0, 0, 0, 0 },
-    { 30,  40500,  1620,  3110400,  10000,  10000, 256, 32, 22, 0, 1, 0 },
-    { 31, 108000,  3600,  6912000,  14000,  14000, 512, 16, 60, 1, 1, 0 },
-    { 32, 216000,  5120,  7864320,  20000,  20000, 512, 16, 60, 1, 1, 0 },
-    { 40, 245760,  8192, 12582912,  20000,  25000, 512, 16, 60, 1, 1, 0 },
-    { 41, 245760,  8192, 12582912,  50000,  62500, 512, 16, 24, 1, 1, 0 },
-    { 42, 522240,  8704, 13369344,  50000,  62500, 512, 16, 24, 1, 1, 1 },
-    { 50, 589824, 22080, 42393600, 135000, 135000, 512, 16, 24, 1, 1, 1 },
-    { 51, 983040, 36864, 70778880, 240000, 240000, 512, 16, 24, 1, 1, 1 },
+    { 10,   1485,    99,   152064,     64,    175,  64, 64,  0, 2, 0, 0, 1 },
+//  {"1b",  1485,    99,   152064,    128,    350,  64, 64,  0, 2, 0, 0, 1 },
+    { 11,   3000,   396,   345600,    192,    500, 128, 64,  0, 2, 0, 0, 1 },
+    { 12,   6000,   396,   912384,    384,   1000, 128, 64,  0, 2, 0, 0, 1 },
+    { 13,  11880,   396,   912384,    768,   2000, 128, 64,  0, 2, 0, 0, 1 },
+    { 20,  11880,   396,   912384,   2000,   2000, 128, 64,  0, 2, 0, 0, 1 },
+    { 21,  19800,   792,  1824768,   4000,   4000, 256, 64,  0, 2, 0, 0, 0 },
+    { 22,  20250,  1620,  3110400,   4000,   4000, 256, 64,  0, 2, 0, 0, 0 },
+    { 30,  40500,  1620,  3110400,  10000,  10000, 256, 32, 22, 2, 0, 1, 0 },
+    { 31, 108000,  3600,  6912000,  14000,  14000, 512, 16, 60, 4, 1, 1, 0 },
+    { 32, 216000,  5120,  7864320,  20000,  20000, 512, 16, 60, 4, 1, 1, 0 },
+    { 40, 245760,  8192, 12582912,  20000,  25000, 512, 16, 60, 4, 1, 1, 0 },
+    { 41, 245760,  8192, 12582912,  50000,  62500, 512, 16, 24, 2, 1, 1, 0 },
+    { 42, 522240,  8704, 13369344,  50000,  62500, 512, 16, 24, 2, 1, 1, 1 },
+    { 50, 589824, 22080, 42393600, 135000, 135000, 512, 16, 24, 2, 1, 1, 1 },
+    { 51, 983040, 36864, 70778880, 240000, 240000, 512, 16, 24, 2, 1, 1, 1 },
     { 0 }
 };

diff --git a/x264.h b/x264.h
index dec296c..7474a50 100644
--- a/x264.h
+++ b/x264.h
@@ -35,7 +35,7 @@

 #include <stdarg.h>

-#define X264_BUILD 87
+#define X264_BUILD 88

 /* x264_t:
  *      opaque handler for encoder */
@@ -346,6 +346,7 @@ typedef struct {
     int mv_range;    /* max vertical mv component range (pixels) */
     int mvs_per_2mb; /* max mvs per 2 consecutive mbs. */
     int slice_rate;  /* ?? */
+    int mincr;       /* min compression ratio */
     int bipred8x8;   /* limit bipred to >=8x8 */
     int direct8x8;   /* limit b_direct to >=8x8 */
     int frame_only;  /* forbid interlacing */
--
1.6.1.2


From 1df2cf28b68242423638468f94ed742105f40d28 Mon Sep 17 00:00:00 2001
From: Anton Mitrofanov <BugMaster@narod.ru>
Date: Sun, 21 Feb 2010 13:21:11 -0800
Subject: [PATCH 14/16] New algorithm for AQ mode 2
 Combines the auto-ness of AQ2 with a new var^0.25 formula in place of the old log(var) one.
 Works better with MB-tree than the old AQ mode 2 and should give higher SSIM.

---
 encoder/ratecontrol.c |    9 ++++++---
 1 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index d0fdb50..8b47e29 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -246,17 +246,20 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )

     if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
     {
+        float avg_adj_pow2 = 0.f;
         for( mb_y = 0; mb_y < h->sps->i_mb_height; mb_y++ )
             for( mb_x = 0; mb_x < h->sps->i_mb_width; mb_x++ )
             {
                 uint32_t energy = ac_energy_mb( h, mb_x, mb_y, frame );
-                float qp_adj = x264_log2( energy + 2 );
-                qp_adj *= qp_adj;
+                float qp_adj = powf( energy + 1, 0.125f );
                 frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
                 avg_adj += qp_adj;
+                avg_adj_pow2 += qp_adj * qp_adj;
             }
         avg_adj /= h->mb.i_mb_count;
-        strength = h->param.rc.f_aq_strength * avg_adj * (1.f / 6000.f);
+        avg_adj_pow2 /= h->mb.i_mb_count;
+        strength = h->param.rc.f_aq_strength * avg_adj;
+        avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - 14.f) / avg_adj;
     }
     else
         strength = h->param.rc.f_aq_strength * 1.0397f;
--
1.6.1.2


From b487fb0af745cdc276e059d58fb2b2590203fe85 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Sun, 21 Feb 2010 17:30:52 -0800
Subject: [PATCH 15/16] Use short startcodes whenever possible
 Saves one byte for every slice beyond the first in each frame.
 Only applies to Annex-B output mode.

---
 common/common.c   |    6 +++---
 common/common.h   |    2 +-
 encoder/encoder.c |   12 +++++++++---
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/common/common.c b/common/common.c
index 2faf139..0410588 100644
--- a/common/common.c
+++ b/common/common.c
@@ -985,17 +985,17 @@ void x264_picture_clean( x264_picture_t *pic )
 /****************************************************************************
  * x264_nal_encode:
  ****************************************************************************/
-int x264_nal_encode( uint8_t *dst, int b_annexb, x264_nal_t *nal )
+int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_startcode )
 {
     uint8_t *src = nal->p_payload;
     uint8_t *end = nal->p_payload + nal->i_payload;
     uint8_t *orig_dst = dst;
     int i_count = 0, size;

-    /* long nal start code (we always use long ones) */
     if( b_annexb )
     {
-        *dst++ = 0x00;
+        if( b_long_startcode )
+            *dst++ = 0x00;
         *dst++ = 0x00;
         *dst++ = 0x00;
         *dst++ = 0x01;
diff --git a/common/common.h b/common/common.h
index 413b82f..d2b53b0 100644
--- a/common/common.h
+++ b/common/common.h
@@ -121,7 +121,7 @@ int64_t x264_mdate( void );
  * the encoding options */
 char *x264_param2string( x264_param_t *p, int b_res );

-int x264_nal_encode( uint8_t *dst, int b_annexb, x264_nal_t *nal );
+int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_startcode );

 /* log */
 void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 89bf457..c76938c 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -1228,10 +1228,14 @@ static int x264_encoder_encapsulate_nals( x264_t *h )
     }

     uint8_t *nal_buffer = h->nal_buffer;
+    int long_startcode = 1;

     for( i = 0; i < h->out.i_nal; i++ )
     {
-        int size = x264_nal_encode( nal_buffer, h->param.b_annexb, &h->out.nal[i] );
+        int size = x264_nal_encode( nal_buffer, &h->out.nal[i], h->param.b_annexb, long_startcode );
+        /* Don't use long startcodes for any slice beyond the first. */
+        if( h->out.nal[i].i_type >= NAL_SLICE && h->out.nal[i].i_type <= NAL_SLICE_IDR )
+            long_startcode = 0;
         h->out.nal[i].i_payload = size;
         h->out.nal[i].p_payload = nal_buffer;
         nal_buffer += size;
@@ -1715,8 +1719,10 @@ static int x264_slice_write( x264_t *h )
     bs_t bs_bak;
     x264_cabac_t cabac_bak;
     uint8_t cabac_prevbyte_bak = 0; /* Shut up GCC. */
-    /* Assume no more than 3 bytes of NALU escaping. */
-    int slice_max_size = h->param.i_slice_max_size > 0 ? (h->param.i_slice_max_size-3-NALU_OVERHEAD)*8 : INT_MAX;
+    /* Assume no more than 3 bytes of NALU escaping.
+     * Slices other than the first use a 3-byte startcode. */
+    int overhead_guess = (NALU_OVERHEAD - (h->param.b_annexb && h->sh.i_first_mb)) + 3;
+    int slice_max_size = h->param.i_slice_max_size > 0 ? (h->param.i_slice_max_size-overhead_guess)*8 : INT_MAX;
     int starting_bits = bs_pos(&h->out.bs);
     bs_realign( &h->out.bs );

--
1.6.1.2


From 81c1ae7de624e837cb3cc058ea0d8e8d3dccbeb3 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Mon, 22 Feb 2010 17:33:17 -0800
Subject: [PATCH 16/16] Faster probe_skip, 2x2 DC transform handling
 Move the 2x2 DC DCT into the dct_dc asm function to avoid some store-to-load forwarding penalties and extra register loads.
 Use dct_dc as part of the early termination in probe_skip.
 x86 asm partially by Holger Lubitz.
 ARM NEON asm by David Conrad.

---
 common/arm/dct-a.S   |   14 +++++++++++---
 common/dct.c         |   11 +++++++++++
 common/x86/dct-a.asm |   50 ++++++++++++++++++++++++++++++++++----------------
 encoder/macroblock.c |   13 +++++++++----
 4 files changed, 65 insertions(+), 23 deletions(-)

diff --git a/common/arm/dct-a.S b/common/arm/dct-a.S
index 0ed7238..3b9fab9 100644
--- a/common/arm/dct-a.S
+++ b/common/arm/dct-a.S
@@ -639,12 +639,20 @@ function x264_sub8x8_dct_dc_neon
     vld1.64         {d30}, [r1,:64], r3
     vadd.s16        q1,  q12, q13
     vld1.64         {d31}, [r2,:64], ip
-    vpadd.s16       d0,  d0,  d1
-    vadd.s16        q1,  q1,  q14
     vsubl.u8        q15, d30, d31
+    vadd.s16        q1,  q1,  q14
+
+    vadd.s16        d4,  d0,  d1
     vadd.s16        q1,  q1,  q15
-    vpadd.s16       d2,  d2,  d3
+    vsub.s16        d5,  d0,  d1
+    vadd.s16        d6,  d2,  d3
+    vsub.s16        d7,  d2,  d3
+    vadd.s16        q0,  q2,  q3
+    vsub.s16        q1,  q2,  q3
+
     vpadd.s16       d0,  d0,  d2
+    vpadd.s16       d1,  d1,  d3
+    vpadd.s16       d0,  d0,  d1
     vst1.64         {d0}, [r0,:64]
     bx              lr
 .endfunc
diff --git a/common/dct.c b/common/dct.c
index aa83ef4..55f78a5 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -184,10 +184,21 @@ static int sub4x4_dct_dc( uint8_t *pix1, uint8_t *pix2 )

 static void sub8x8_dct_dc( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 )
 {
+    int d0, d1, d2, d3;
     dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
     dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
     dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
     dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
+
+    /* 2x2 DC transform */
+    d0 = dct[0] + dct[1];
+    d1 = dct[2] + dct[3];
+    d2 = dct[0] - dct[1];
+    d3 = dct[2] - dct[3];
+    dct[0] = d0 + d1;
+    dct[2] = d2 + d3;
+    dct[1] = d0 - d1;
+    dct[3] = d2 - d3;
 }

 static void add4x4_idct( uint8_t *p_dst, int16_t dct[16] )
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 618433c..5dd51e5 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -509,28 +509,43 @@ cglobal x264_add16x16_idct_dc_ssse3, 2,2,8
     movq      m1, m2
     punpckldq m2, m3
     punpckhdq m1, m3
-    psadbw    %1, m7
-    psadbw    %2, m7
-    psadbw    m2, m7
-    psadbw    m1, m7
+    pxor      m3, m3
+    psadbw    %1, m3
+    psadbw    %2, m3
+    psadbw    m2, m3
+    psadbw    m1, m3
     psubw     %1, m2
     psubw     %2, m1
 %endmacro

+%macro DCT2x2 2 ; reg s1/s0 (!=m1), reg s3/s2
+    pshufw    mm1, %1, 10100000b  ;  s1  s1  s0  s0
+    pshufw    mm0, %2, 10110001b  ;  s3  __  s2  __
+    paddw     mm1, %2             ;  s1 s13  s0 s02
+    psubw     mm1, mm0            ; d13 s13 d02 s02
+    pshufw    mm0, mm1, 01000100b ; d02 s02 d02 s02
+    psrlq     mm1, 32             ;  __  __ d13 s13
+    paddw     mm0, mm1            ; d02 s02 d02+d13 s02+s13
+    psllq     mm1, 32             ; d13 s13
+    psubw     mm0, mm1            ; d02-d13 s02-s13 d02+d13 s02+s13
+%endmacro
+
 INIT_MMX
 cglobal x264_sub8x8_dct_dc_mmxext, 3,3
-    pxor      m7, m7
-    call .loop
-    add       r1, FENC_STRIDE*4
-    add       r2, FDEC_STRIDE*4
-    add       r0, 4
-.loop:
     DCTDC_2ROW_MMX m0, m4, 0
     DCTDC_2ROW_MMX m5, m6, 2
     paddw     m0, m5
     paddw     m4, m6
-    punpcklwd m0, m4
-    movd    [r0], m0
+    punpckldq m0, m4
+    add       r1, FENC_STRIDE*4
+    add       r2, FDEC_STRIDE*4
+    DCTDC_2ROW_MMX m7, m4, 0
+    DCTDC_2ROW_MMX m5, m6, 2
+    paddw     m7, m5
+    paddw     m4, m6
+    punpckldq m7, m4
+    DCT2x2    m0, m7
+    movq    [r0], m0
     ret

 INIT_XMM
@@ -558,13 +573,16 @@ cglobal x264_sub8x8_dct_dc_sse2, 3,3,8
     DCTDC_2ROW_SSE2 2, 1, m4
     add      r1, FENC_STRIDE*4
     add      r2, FDEC_STRIDE*4
-    psubq    m4, m6
+    psubd    m4, m6
     DCTDC_2ROW_SSE2 0, 0, m5
     DCTDC_2ROW_SSE2 2, 1, m5
-    psubq    m5, m6
+    psubd    m5, m6
     packssdw m4, m5
-    packssdw m4, m4
-    movq   [r0], m4
+    movhlps  m5, m4
+    movdq2q mm0, m4
+    movdq2q mm7, m5
+    DCT2x2  mm0, mm7
+    movq   [r0], mm0
     RET

 ;-----------------------------------------------------------------------------
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index f67a898..0be6201 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -365,7 +365,6 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
                 if( ssd[ch] > thresh )
                 {
                     h->dctf.sub8x8_dct_dc( dct2x2, h->mb.pic.p_fenc[1+ch], h->mb.pic.p_fdec[1+ch] );
-                    dct2x2dc_dconly( dct2x2 );
                     if( h->mb.b_trellis )
                         nz_dc = x264_quant_dc_trellis( h, dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter, 1 );
                     else
@@ -980,10 +979,10 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
         if( ssd < thresh )
             continue;

-        h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
+        /* The vast majority of chroma checks will terminate during the DC check or the higher
+         * threshold check, so we can save time by doing a DC-only DCT. */
+        h->dctf.sub8x8_dct_dc( dct2x2, p_src, p_dst );

-        /* calculate dct DC */
-        dct2x2dc( dct2x2, dct4x4 );
         if( h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4PC][i_qp][0]>>1, h->quant4_bias[CQM_4PC][i_qp][0]<<1 ) )
             return 0;

@@ -991,9 +990,15 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
         if( ssd < thresh*4 )
             continue;

+        h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
+
         /* calculate dct coeffs */
         for( i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
         {
+            /* We don't need to zero the DC coefficient before quantization because we already
+             * checked that all the DCs were zero above at twice the precision that quant4x4
+             * uses.  This applies even though the DC here is being quantized before the 2x2
+             * transform. */
             if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ) )
                 continue;
             h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
--
1.6.1.2