Dark Shikari

From f21e71a04ba65aff9b5a4bfa8a73fd86c463f4ee Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <[email protected]>
Date: Mon, 3 Aug 2009 20:52:30 -0700
Subject: [PATCH 1/2] Various 1-pass VBV tweaks
 Make predictors have an offset in addition to a multiplier.
 This primarily fixes issues in sources with lots of extremely static scenes, such as anime and CGI.
 We tried linear regressions, but they were very unreliable as predictors.
 Also allow VBV to be slightly more aggressive in raising QPs, to avoid running out of bits in some situations.
 Up to 1 dB improvement on some clips.

---
 encoder/ratecontrol.c |   32 +++++++++++++++++++++-----------
 1 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index 2f88708..087e658 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -58,6 +58,7 @@ typedef struct
     double coeff;
     double count;
     double decay;
+    double offset;
 } predictor_t;

 struct x264_ratecontrol_t
@@ -409,9 +410,11 @@ int x264_ratecontrol_new( x264_t *h )
         rc->pred[i].coeff= 2.0;
         rc->pred[i].count= 1.0;
         rc->pred[i].decay= 0.5;
+        rc->pred[i].offset= 0.0;
         rc->row_preds[i].coeff= .25;
         rc->row_preds[i].count= 1.0;
         rc->row_preds[i].decay= 0.5;
+        rc->row_preds[i].offset= 0.0;
     }
     *rc->pred_b_from_p = rc->pred[0];

@@ -953,7 +956,7 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
         if( y < h->sps->i_mb_height-1 )
         {
             int i_estimated;
-            int avg_qp = X264_MAX(h->fref0[0]->i_row_qp[y+1], h->fref1[0]->i_row_qp[y+1])
+            int avg_qp = X264_MIN(h->fref0[0]->i_row_qp[y+1], h->fref1[0]->i_row_qp[y+1])
                        + rc->pb_offset * ((h->fenc->i_type == X264_TYPE_BREF) ? 0.5 : 1);
             rc->qpm = X264_MIN(X264_MAX( rc->qp, avg_qp), 51); //avg_qp could go higher than 51 due to pb_offset
             i_estimated = row_bits_so_far(h, y); //FIXME: compute full estimated size
@@ -1153,10 +1156,6 @@ void x264_ratecontrol_end( x264_t *h, int bits )
             {
                 update_predictor( rc->pred_b_from_p, qp2qscale(rc->qpa_rc),
                                   h->fref1[h->i_ref1-1]->i_satd, rc->bframe_bits / rc->bframes );
-                /* In some cases, such as completely blank scenes, pred_b_from_p can go nuts */
-                /* Hackily cap the predictor coeff in case this happens. */
-                /* FIXME FIXME FIXME */
-                rc->pred_b_from_p->coeff = X264_MIN( rc->pred_b_from_p->coeff, 10. );
                 rc->bframe_bits = 0;
             }
         }
@@ -1270,17 +1269,28 @@ static double get_diff_limited_q(x264_t *h, ratecontrol_entry_t *rce, double q)

 static double predict_size( predictor_t *p, double q, double var )
 {
-     return p->coeff*var / (q*p->count);
+     return (p->coeff*var + p->offset) / (q*p->count);
 }

 static void update_predictor( predictor_t *p, double q, double var, double bits )
 {
+    const double range = 1.5;
     if( var < 10 )
         return;
-    p->count *= p->decay;
-    p->coeff *= p->decay;
-    p->count ++;
-    p->coeff += bits*q / var;
+    double old_coeff = p->coeff / p->count;
+    double new_coeff = bits*q / var;
+    double new_coeff_clipped = x264_clip3f( new_coeff, old_coeff/range, old_coeff*range );
+    double new_offset = bits*q - new_coeff_clipped * var;
+    if( new_offset >= 0 )
+        new_coeff = new_coeff_clipped;
+    else
+        new_offset = 0;
+    p->count  *= p->decay;
+    p->coeff  *= p->decay;
+    p->offset *= p->decay;
+    p->count  ++;
+    p->coeff  += new_coeff;
+    p->offset += new_offset;
 }

 // update VBV after encoding a frame
@@ -1350,7 +1360,7 @@ static double clip_qscale( x264_t *h, int pict_type, double q )
         double bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
         double qf = 1.0;
         if( bits > rcc->buffer_fill/2 )
-            qf = x264_clip3f( rcc->buffer_fill/(2*bits), 0.2, 1.0 );
+            qf = rcc->buffer_fill/(2*bits);
         q /= qf;
         bits *= qf;
         if( bits < rcc->buffer_rate/2 )
--
1.6.1.2


From 5ed78fa5b21d686682b8779cc114844b4b204f4d Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <[email protected]>
Date: Tue, 4 Aug 2009 17:46:33 -0700
Subject: [PATCH 2/2] Macroblock-tree ratecontrol
 On by default; can be turned off with --no-mbtree.
 Uses a large lookahead to track temporal propagation of data and weight quality accordingly.
 Requires a very large separate statsfile (4 bytes per macroblock) in multi-pass mode.
 Doesn't work with b-pyramid yet.
 Note that MB-tree inherently measures quality differently from the standard qcomp method, so bitrates produced by CRF may change somewhat.
 This makes the "medium" preset a bit slower.  Accordingly, make "fast" slower as well, and introduce a new preset "faster" between "fast" and "veryfast".
 Add a new option, --lookahead, to control the distance MB tree looks ahead to perform propagation analysis.
 Default is 50; larger values will be slower and require more memory but give more accurate results.
 Add a new option, --no-psy, to disable all psy optimizations that don't improve PSNR or SSIM.
 This disables psy-RD/trellis, but also other more subtle internal psy optimizations that can't be controlled directly via external parameters.
 Quality improvement from MB-tree is about 2-70% depending on content.
 Strength of MB-tree adjustments can be tweaked using qcompress; higher values mean lower MB-tree strength.

---
 common/common.c       |   22 ++-
 common/common.h       |   25 +++-
 common/frame.c        |   10 +-
 common/frame.h        |    3 +
 encoder/analyse.c     |    4 +-
 encoder/encoder.c     |   39 +++++-
 encoder/ratecontrol.c |  136 +++++++++++++---
 encoder/ratecontrol.h |    1 +
 encoder/slicetype.c   |  422 ++++++++++++++++++++++++++++++++++++++-----------
 x264.c                |   28 +++-
 x264.h                |    5 +-
 11 files changed, 556 insertions(+), 139 deletions(-)

diff --git a/common/common.c b/common/common.c
index 9260c64..8513217 100644
--- a/common/common.c
+++ b/common/common.c
@@ -72,6 +72,7 @@ void    x264_param_default( x264_param_t *param )
     param->i_bframe_adaptive = X264_B_ADAPT_FAST;
     param->i_bframe_bias = 0;
     param->b_bframe_pyramid = 0;
+    param->i_lookahead = 50;

     param->b_deblocking_filter = 1;
     param->i_deblocking_filter_alphac0 = 0;
@@ -104,6 +105,7 @@ void    x264_param_default( x264_param_t *param )
     param->rc.f_qblur = 0.5;
     param->rc.f_complexity_blur = 20;
     param->rc.i_zones = 0;
+    param->rc.b_mb_tree = 1;

     /* Log */
     param->pf_log = x264_log_default;
@@ -117,6 +119,7 @@ void    x264_param_default( x264_param_t *param )
     param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL;
     param->analyse.i_me_method = X264_ME_HEX;
     param->analyse.f_psy_rd = 1.0;
+    param->analyse.b_psy = 1;
     param->analyse.f_psy_trellis = 0;
     param->analyse.i_me_range = 16;
     param->analyse.i_subpel_refine = 7;
@@ -337,6 +340,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
     }
     OPT("bframes")
         p->i_bframe = atoi(value);
+    OPT("lookahead")
+        p->i_lookahead = atoi(value);
     OPT("b-adapt")
     {
         p->i_bframe_adaptive = atobool(value);
@@ -493,6 +498,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
             p->analyse.f_psy_trellis = 0;
         }
     }
+    OPT("psy")
+        p->analyse.b_psy = atobool(value);
     OPT("chroma-me")
         p->analyse.b_chroma_me = atobool(value);
     OPT("mixed-refs")
@@ -559,6 +566,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
     }
     OPT("qcomp")
         p->rc.f_qcompress = atof(value);
+    OPT("mbtree")
+        p->rc.b_mb_tree = atobool(value);
     OPT("qblur")
         p->rc.f_qblur = atof(value);
     OPT2("cplxblur", "cplx-blur")
@@ -843,7 +852,9 @@ char *x264_param2string( x264_param_t *p, int b_res )
     s += sprintf( s, " analyse=%#x:%#x", p->analyse.intra, p->analyse.inter );
     s += sprintf( s, " me=%s", x264_motion_est_names[ p->analyse.i_me_method ] );
     s += sprintf( s, " subme=%d", p->analyse.i_subpel_refine );
-    s += sprintf( s, " psy_rd=%.1f:%.1f", p->analyse.f_psy_rd, p->analyse.f_psy_trellis );
+    s += sprintf( s, " psy=%d", p->analyse.b_psy );
+    if( p->analyse.b_psy )
+        s += sprintf( s, " psy_rd=%.1f:%.1f", p->analyse.f_psy_rd, p->analyse.f_psy_trellis );
     s += sprintf( s, " mixed_ref=%d", p->analyse.b_mixed_references );
     s += sprintf( s, " me_range=%d", p->analyse.i_me_range );
     s += sprintf( s, " chroma_me=%d", p->analyse.b_chroma_me );
@@ -868,9 +879,12 @@ char *x264_param2string( x264_param_t *p, int b_res )
     s += sprintf( s, " keyint=%d keyint_min=%d scenecut=%d",
                   p->i_keyint_max, p->i_keyint_min, p->i_scenecut_threshold );

-    s += sprintf( s, " rc=%s", p->rc.i_rc_method == X264_RC_ABR ?
+    if( p->i_bframe_adaptive == X264_B_ADAPT_TRELLIS )
+        s += sprintf( s, " lookahead=%d", p->i_lookahead );
+
+    s += sprintf( s, " rc=%s mbtree=%d", p->rc.i_rc_method == X264_RC_ABR ?
                                ( p->rc.b_stat_read ? "2pass" : p->rc.i_vbv_buffer_size ? "cbr" : "abr" )
-                               : p->rc.i_rc_method == X264_RC_CRF ? "crf" : "cqp" );
+                               : p->rc.i_rc_method == X264_RC_CRF ? "crf" : "cqp", p->rc.b_mb_tree );
     if( p->rc.i_rc_method == X264_RC_ABR || p->rc.i_rc_method == X264_RC_CRF )
     {
         if( p->rc.i_rc_method == X264_RC_CRF )
@@ -892,7 +906,7 @@ char *x264_param2string( x264_param_t *p, int b_res )
     if( !(p->rc.i_rc_method == X264_RC_CQP && p->rc.i_qp_constant == 0) )
     {
         s += sprintf( s, " ip_ratio=%.2f", p->rc.f_ip_factor );
-        if( p->i_bframe )
+        if( p->i_bframe && !p->rc.b_mb_tree )
             s += sprintf( s, " pb_ratio=%.2f", p->rc.f_pb_factor );
         s += sprintf( s, " aq=%d", p->rc.i_aq_mode );
         if( p->rc.i_aq_mode )
diff --git a/common/common.h b/common/common.h
index 8a25a13..5f9284e 100644
--- a/common/common.h
+++ b/common/common.h
@@ -51,6 +51,7 @@
 #define X264_SLICE_MAX 4
 #define X264_NAL_MAX (4 + X264_SLICE_MAX)
 #define X264_PCM_COST (386*8)
+#define X264_LOOKAHEAD_MAX 250

 // number of pixels (per thread) in progress at any given time.
 // 16 for the macroblock in progress + 3 for deblocking + 3 for motion compensation filter + 2 for extra safety
@@ -152,6 +153,24 @@ static inline uint32_t x264_cabac_amvd_sum( int16_t *mvdleft, int16_t *mvdtop )
     return amvd0 + (amvd1<<16);
 }

+static const uint8_t exp2_lut[64] = {
+      1,   4,   7,  10,  13,  16,  19,  22,  25,  28,  31,  34,  37,  40,  44,  47,
+     50,  53,  57,  60,  64,  67,  71,  74,  78,  81,  85,  89,  93,  96, 100, 104,
+    108, 112, 116, 120, 124, 128, 132, 137, 141, 145, 150, 154, 159, 163, 168, 172,
+    177, 182, 186, 191, 196, 201, 206, 211, 216, 221, 226, 232, 237, 242, 248, 253,
+};
+
+static ALWAYS_INLINE int x264_exp2fix8( float x )
+{
+    int i, f;
+    x += 8;
+    if( x <= 0 ) return 0;
+    if( x >= 16 ) return 0xffff;
+    i = x;
+    f = (x-i)*64;
+    return (exp2_lut[f]+256) << i >> 8;
+}
+
 /****************************************************************************
  *
  ****************************************************************************/
@@ -327,11 +346,11 @@ struct x264_t
     struct
     {
         /* Frames to be encoded (whose types have been decided) */
-        x264_frame_t *current[X264_BFRAME_MAX*4+3];
+        x264_frame_t *current[X264_LOOKAHEAD_MAX+3];
         /* Temporary buffer (frames types not yet decided) */
-        x264_frame_t *next[X264_BFRAME_MAX*4+3];
+        x264_frame_t *next[X264_LOOKAHEAD_MAX+3];
         /* Unused frames */
-        x264_frame_t *unused[X264_BFRAME_MAX*4 + X264_THREAD_MAX*2 + 16+4];
+        x264_frame_t *unused[X264_LOOKAHEAD_MAX + X264_THREAD_MAX*2 + 16+4];
         /* For adaptive B decision */
         x264_frame_t *last_nonb;

diff --git a/common/frame.c b/common/frame.c
index 23e6824..98c1e2c 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -96,6 +96,15 @@ x264_frame_t *x264_frame_new( x264_t *h )
                 memset( frame->lowres_mvs[j][i], 0, 2*h->mb.i_mb_count*sizeof(int16_t) );
                 CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
             }
+        CHECKED_MALLOC( frame->i_intra_cost, i_mb_count * sizeof(uint16_t) );
+        memset( frame->i_intra_cost, -1, i_mb_count * sizeof(int16_t) );
+        CHECKED_MALLOC( frame->i_propagate_cost, i_mb_count * sizeof(uint32_t) );
+        for( j = 0; j <= h->param.i_bframe+1; j++ )
+            for( i = 0; i <= h->param.i_bframe+1; i++ )
+            {
+                CHECKED_MALLOC( frame->lowres_costs[j][i], i_mb_count * sizeof(uint16_t) );
+                CHECKED_MALLOC( frame->lowres_inter_types[j][i], i_mb_count * sizeof(uint8_t) );
+            }
     }

     if( h->param.analyse.i_me_method >= X264_ME_ESA )
@@ -116,7 +125,6 @@ x264_frame_t *x264_frame_new( x264_t *h )
     CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
     CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
     CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
-    CHECKED_MALLOC( frame->i_intra_cost, i_mb_count * sizeof(uint16_t) );
     if( h->param.i_bframe )
     {
         CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
diff --git a/common/frame.h b/common/frame.h
index aad77f5..a3da4e4 100644
--- a/common/frame.h
+++ b/common/frame.h
@@ -63,6 +63,8 @@ typedef struct
     int8_t  *mb_type;
     int16_t (*mv[2])[2];
     int16_t (*lowres_mvs[2][X264_BFRAME_MAX+1])[2];
+    uint16_t (*lowres_costs[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2]);
+    uint8_t  (*lowres_inter_types[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2]);
     int     *lowres_mv_costs[2][X264_BFRAME_MAX+1];
     int8_t  *ref[2];
     int     i_ref[2];
@@ -83,6 +85,7 @@ typedef struct
     float   *f_qp_offset;
     int     b_intra_calculated;
     uint16_t *i_intra_cost;
+    uint32_t *i_propagate_cost;
     uint16_t *i_inv_qscale_factor;

     /* threading */
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 4a36fcd..38b9976 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -276,8 +276,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
         h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
     }
     h->mb.i_psy_rd_lambda = a->i_lambda;
-    /* Adjusting chroma lambda based on QP offset hurts PSNR, so we'll leave it as part of psy-RD. */
-    h->mb.i_chroma_lambda2_offset = h->mb.i_psy_rd ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;
+    /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
+    h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;

     h->mb.i_me_method = h->param.analyse.i_me_method;
     h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 0f1ccc8..bce1f1a 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -441,6 +441,7 @@ static int x264_validate_parameters( x264_t *h )
         h->param.rc.i_qp_min = x264_clip3( (int)(X264_MIN3( qp_p, qp_i, qp_b )), 0, 51 );
         h->param.rc.i_qp_max = x264_clip3( (int)(X264_MAX3( qp_p, qp_i, qp_b ) + .999), 0, 51 );
         h->param.rc.i_aq_mode = 0;
+        h->param.rc.b_mb_tree = 0;
     }
     h->param.rc.i_qp_max = x264_clip3( h->param.rc.i_qp_max, 0, 51 );
     h->param.rc.i_qp_min = x264_clip3( h->param.rc.i_qp_min, 0, h->param.rc.i_qp_max );
@@ -473,6 +474,15 @@ static int x264_validate_parameters( x264_t *h )
     if( !h->param.i_bframe )
         h->param.i_bframe_adaptive = X264_B_ADAPT_NONE;
     h->param.analyse.b_weighted_bipred = h->param.analyse.b_weighted_bipred && h->param.i_bframe > 0;
+    h->param.i_lookahead = X264_MIN( h->param.i_lookahead, X264_LOOKAHEAD_MAX );
+    h->param.i_lookahead = X264_MIN( h->param.i_lookahead, h->param.i_keyint_max );
+    if( h->param.rc.b_stat_read )
+        h->param.i_lookahead = 0;
+    else if( !h->param.i_lookahead )
+        h->param.rc.b_mb_tree = 0;
+    if( h->param.rc.f_qcompress == 1 )
+        h->param.rc.b_mb_tree = 0;
+
     h->mb.b_direct_auto_write = h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO
                                 && h->param.i_bframe
                                 && ( h->param.rc.b_stat_write || !h->param.rc.b_stat_read );
@@ -513,6 +523,11 @@ static int x264_validate_parameters( x264_t *h )
     if( !h->param.b_cabac )
         h->param.analyse.i_trellis = 0;
     h->param.analyse.i_trellis = x264_clip3( h->param.analyse.i_trellis, 0, 2 );
+    if( !h->param.analyse.b_psy )
+    {
+        h->param.analyse.f_psy_rd = 0;
+        h->param.analyse.f_psy_trellis = 0;
+    }
     if( !h->param.analyse.i_trellis )
         h->param.analyse.f_psy_trellis = 0;
     h->param.analyse.f_psy_rd = x264_clip3f( h->param.analyse.f_psy_rd, 0, 10 );
@@ -537,6 +552,17 @@ static int x264_validate_parameters( x264_t *h )
     h->param.rc.f_aq_strength = x264_clip3f( h->param.rc.f_aq_strength, 0, 3 );
     if( h->param.rc.f_aq_strength == 0 )
         h->param.rc.i_aq_mode = 0;
+    /* MB-tree requires AQ to be on, even if the strength is zero. */
+    if( !h->param.rc.i_aq_mode && h->param.rc.b_mb_tree )
+    {
+        h->param.rc.i_aq_mode = 1;
+        h->param.rc.f_aq_strength = 0;
+        if( h->param.b_bframe_pyramid )
+        {
+            x264_log( h, X264_LOG_WARNING, "b-pyramid + mb-tree is not supported\n" );
+            h->param.b_bframe_pyramid = 0;
+        }
+    }
     h->param.analyse.i_noise_reduction = x264_clip3( h->param.analyse.i_noise_reduction, 0, 1<<16 );
     if( h->param.analyse.i_subpel_refine == 10 && (h->param.analyse.i_trellis != 2 || !h->param.rc.i_aq_mode) )
         h->param.analyse.i_subpel_refine = 9;
@@ -723,6 +749,9 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
         h->frames.i_delay = X264_MAX(h->param.i_bframe,3)*4 + h->param.i_threads - 1;
     else
         h->frames.i_delay = h->param.i_bframe + h->param.i_threads - 1;
+    if( h->param.rc.b_mb_tree )
+        h->frames.i_delay = X264_MAX( h->frames.i_delay, h->param.i_lookahead );
+
     h->frames.i_max_ref0 = h->param.i_frame_reference;
     h->frames.i_max_ref1 = h->sps->vui.i_num_reorder_frames;
     h->frames.i_max_dpb  = h->sps->vui.i_max_dec_frame_buffering;
@@ -730,7 +759,8 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
         && ( h->param.rc.i_rc_method == X264_RC_ABR
           || h->param.rc.i_rc_method == X264_RC_CRF
           || h->param.i_bframe_adaptive
-          || h->param.i_scenecut_threshold );
+          || h->param.i_scenecut_threshold
+          || h->param.rc.b_mb_tree );
     h->frames.b_have_lowres |= (h->param.rc.b_stat_read && h->param.rc.i_vbv_buffer_size > 0);
     h->frames.b_have_sub8x8_esa = !!(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);

@@ -1443,7 +1473,12 @@ int     x264_encoder_encode( x264_t *h,
         if( h->frames.b_have_lowres )
             x264_frame_init_lowres( h, fenc );

-        if( h->param.rc.i_aq_mode )
+        if( h->param.rc.b_mb_tree && h->param.rc.b_stat_read )
+        {
+            if( x264_macroblock_tree_read( h, fenc ) )
+                return -1;
+        }
+        else if( h->param.rc.i_aq_mode )
             x264_adaptive_quant_frame( h, fenc );

         if( h->frames.i_input <= h->frames.i_delay + 1 - h->param.i_threads )
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index 087e658..2a85a52 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -71,6 +71,7 @@ struct x264_ratecontrol_t
     double fps;
     double bitrate;
     double rate_tolerance;
+    double qcompress;
     int nmb;                    /* number of macroblocks in a frame */
     int qp_constant[5];

@@ -106,6 +107,10 @@ struct x264_ratecontrol_t
     /* 2pass stuff */
     FILE *p_stat_file_out;
     char *psz_stat_file_tmpname;
+    FILE *p_mbtree_stat_file_out;
+    char *psz_mbtree_stat_file_tmpname;
+    char *psz_mbtree_stat_file_name;
+    FILE *p_mbtree_stat_file_in;

     int num_entries;            /* number of ratecontrol_entry_ts */
     ratecontrol_entry_t *entry; /* FIXME: copy needed data and free this once init is done */
@@ -210,30 +215,12 @@ static const float log2_lut[128] = {
     0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
 };

-static const uint8_t exp2_lut[64] = {
-      1,   4,   7,  10,  13,  16,  19,  22,  25,  28,  31,  34,  37,  40,  44,  47,
-     50,  53,  57,  60,  64,  67,  71,  74,  78,  81,  85,  89,  93,  96, 100, 104,
-    108, 112, 116, 120, 124, 128, 132, 137, 141, 145, 150, 154, 159, 163, 168, 172,
-    177, 182, 186, 191, 196, 201, 206, 211, 216, 221, 226, 232, 237, 242, 248, 253,
-};
-
 static ALWAYS_INLINE float x264_log2( uint32_t x )
 {
     int lz = x264_clz( x );
     return log2_lut[(x<<lz>>24)&0x7f] + (31 - lz);
 }

-static ALWAYS_INLINE int x264_exp2fix8( float x )
-{
-    int i, f;
-    x += 8;
-    if( x <= 0 ) return 0;
-    if( x >= 16 ) return 0xffff;
-    i = x;
-    f = (x-i)*64;
-    return (exp2_lut[f]+256) << i >> 8;
-}
-
 void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
 {
     /* constants chosen to result in approximately the same overall bitrate as without AQ.
@@ -241,6 +228,17 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
     int mb_x, mb_y;
     float strength;
     float avg_adj = 0.f;
+    /* Need to init it anyways for MB tree. */
+    if( h->param.rc.f_aq_strength == 0 )
+    {
+        int mb_xy;
+        memset( frame->f_qp_offset, 0, sizeof(float) * h->mb.i_mb_count );
+        if( h->frames.b_have_lowres )
+            for( mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
+                frame->i_inv_qscale_factor[mb_xy] = 256;
+        return;
+    }
+
     if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
     {
         for( mb_y = 0; mb_y < h->sps->i_mb_height; mb_y++ )
@@ -257,6 +255,7 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
     }
     else
         strength = h->param.rc.f_aq_strength * 1.0397f;
+
     for( mb_y = 0; mb_y < h->sps->i_mb_height; mb_y++ )
         for( mb_x = 0; mb_x < h->sps->i_mb_width; mb_x++ )
         {
@@ -291,6 +290,34 @@ void x264_adaptive_quant( x264_t *h )
     h->mb.i_qp = x264_clip3( h->rc->f_qpm + h->fenc->f_qp_offset[h->mb.i_mb_xy] + .5, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
 }

+int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame )
+{
+    x264_ratecontrol_t *rc = h->rc;
+    uint8_t i_type_actual = rc->entry[frame->i_frame].pict_type;
+
+    if( i_type_actual != SLICE_TYPE_B )
+    {
+        uint8_t i_type;
+        if( !fread( &i_type, 1, 1, rc->p_mbtree_stat_file_in ) )
+            goto fail;
+
+        if( i_type != i_type_actual )
+        {
+            x264_log(h, X264_LOG_ERROR, "MB-tree frametype %d doesn't match actual frametype %d.\n", i_type,i_type_actual);
+            return -1;
+        }
+
+        if( fread( frame->f_qp_offset, sizeof(float), h->mb.i_mb_count, rc->p_mbtree_stat_file_in ) != h->mb.i_mb_count )
+            goto fail;
+    }
+    else
+        x264_adaptive_quant_frame( h, frame );
+    return 0;
+fail:
+    x264_log(h, X264_LOG_ERROR, "Incomplete MB-tree stats file.\n");
+    return -1;
+}
+
 int x264_ratecontrol_new( x264_t *h )
 {
     x264_ratecontrol_t *rc;
@@ -310,6 +337,14 @@ int x264_ratecontrol_new( x264_t *h )
     else
         rc->fps = 25.0;

+    if( h->param.rc.b_mb_tree )
+    {
+        h->param.rc.f_pb_factor = 1;
+        rc->qcompress = 1;
+    }
+    else
+        rc->qcompress = h->param.rc.f_qcompress;
+
     rc->bitrate = h->param.rc.i_bitrate * 1000.;
     rc->rate_tolerance = h->param.rc.f_rate_tolerance;
     rc->nmb = h->mb.i_mb_count;
@@ -379,17 +414,18 @@ int x264_ratecontrol_new( x264_t *h )
         rc->accum_p_norm = .01;
         rc->accum_p_qp = ABR_INIT_QP * rc->accum_p_norm;
         /* estimated ratio that produces a reasonable QP for the first I-frame */
-        rc->cplxr_sum = .01 * pow( 7.0e5, h->param.rc.f_qcompress ) * pow( h->mb.i_mb_count, 0.5 );
+        rc->cplxr_sum = .01 * pow( 7.0e5, rc->qcompress ) * pow( h->mb.i_mb_count, 0.5 );
         rc->wanted_bits_window = 1.0 * rc->bitrate / rc->fps;
         rc->last_non_b_pict_type = SLICE_TYPE_I;
     }

     if( h->param.rc.i_rc_method == X264_RC_CRF )
     {
-        /* arbitrary rescaling to make CRF somewhat similar to QP */
+        /* Arbitrary rescaling to make CRF somewhat similar to QP.
+         * Try to compensate for MB-tree's effects as well. */
         double base_cplx = h->mb.i_mb_count * (h->param.i_bframe ? 120 : 80);
-        rc->rate_factor_constant = pow( base_cplx, 1 - h->param.rc.f_qcompress )
-                                 / qp2qscale( h->param.rc.f_rf_constant );
+        rc->rate_factor_constant = pow( base_cplx, 1 - rc->qcompress )
+                                 / qp2qscale( h->param.rc.f_rf_constant + (h->param.rc.b_mb_tree?5:0) );
     }

     rc->ip_offset = 6.0 * log(h->param.rc.f_ip_factor) / log(2.0);
@@ -437,6 +473,19 @@ int x264_ratecontrol_new( x264_t *h )
             x264_log(h, X264_LOG_ERROR, "ratecontrol_init: can't open stats file\n");
             return -1;
         }
+        if( h->param.rc.b_mb_tree )
+        {
+            char *mbtree_stats_in = x264_malloc( strlen(h->param.rc.psz_stat_in) + 8 );
+            strcpy( mbtree_stats_in, h->param.rc.psz_stat_in );
+            strcat( mbtree_stats_in, ".mbtree" );
+            rc->p_mbtree_stat_file_in = fopen( mbtree_stats_in, "rb" );
+            x264_free( mbtree_stats_in );
+            if( !rc->p_mbtree_stat_file_in )
+            {
+                x264_log(h, X264_LOG_ERROR, "ratecontrol_init: can't open mbtree stats file\n");
+                return -1;
+            }
+        }

         /* check whether 1st pass options were compatible with current options */
         if( !strncmp( stats_buf, "#options:", 9 ) )
@@ -600,6 +649,22 @@ int x264_ratecontrol_new( x264_t *h )
         p = x264_param2string( &h->param, 1 );
         fprintf( rc->p_stat_file_out, "#options: %s\n", p );
         x264_free( p );
+        if( h->param.rc.b_mb_tree && !h->param.rc.b_stat_read )
+        {
+            rc->psz_mbtree_stat_file_tmpname = x264_malloc( strlen(h->param.rc.psz_stat_out) + 13 );
+            strcpy( rc->psz_mbtree_stat_file_tmpname, h->param.rc.psz_stat_out );
+            strcat( rc->psz_mbtree_stat_file_tmpname, ".mbtree.temp" );
+            rc->psz_mbtree_stat_file_name = x264_malloc( strlen(h->param.rc.psz_stat_out) + 8 );
+            strcpy( rc->psz_mbtree_stat_file_name, h->param.rc.psz_stat_out );
+            strcat( rc->psz_mbtree_stat_file_name, ".mbtree" );
+
+            rc->p_mbtree_stat_file_out = fopen( rc->psz_mbtree_stat_file_tmpname, "wb" );
+            if( rc->p_mbtree_stat_file_out == NULL )
+            {
+                x264_log(h, X264_LOG_ERROR, "ratecontrol_init: can't open mbtree stats file\n");
+                return -1;
+            }
+        }
     }

     for( i=0; i<h->param.i_threads; i++ )
@@ -739,8 +804,8 @@ void x264_ratecontrol_summary( x264_t *h )
     {
         double base_cplx = h->mb.i_mb_count * (h->param.i_bframe ? 120 : 80);
         x264_log( h, X264_LOG_INFO, "final ratefactor: %.2f\n",
-                  qscale2qp( pow( base_cplx, 1 - h->param.rc.f_qcompress )
-                             * rc->cplxr_sum / rc->wanted_bits_window ) );
+                  qscale2qp( pow( base_cplx, 1 - rc->qcompress )
+                             * rc->cplxr_sum / rc->wanted_bits_window ) - (h->param.rc.b_mb_tree?5:0) );
     }
 }

@@ -760,6 +825,18 @@ void x264_ratecontrol_delete( x264_t *h )
             }
         x264_free( rc->psz_stat_file_tmpname );
     }
+    if( rc->p_mbtree_stat_file_out )
+    {
+        fclose( rc->p_mbtree_stat_file_out );
+        if( h->i_frame >= rc->num_entries )
+            if( rename( rc->psz_mbtree_stat_file_tmpname, rc->psz_mbtree_stat_file_name ) != 0 )
+            {
+                x264_log( h, X264_LOG_ERROR, "failed to rename \"%s\" to \"%s\"\n",
+                          rc->psz_mbtree_stat_file_tmpname, rc->psz_mbtree_stat_file_name );
+            }
+        x264_free( rc->psz_mbtree_stat_file_tmpname );
+        x264_free( rc->psz_mbtree_stat_file_name );
+    }
     x264_free( rc->pred );
     x264_free( rc->pred_b_from_p );
     x264_free( rc->entry );
@@ -1125,6 +1202,15 @@ void x264_ratecontrol_end( x264_t *h, int bits )
                  h->stat.frame.i_mb_count_p,
                  h->stat.frame.i_mb_count_skip,
                  c_direct);
+
+        /* TODO: deal with endianness.
+         * Don't re-write the data in multi-pass mode. */
+        if( h->param.rc.b_mb_tree && h->fenc->b_kept_as_ref && !h->param.rc.b_stat_read )
+        {
+            uint8_t i_type = h->sh.i_type;
+            fwrite( &i_type, 1, 1, rc->p_mbtree_stat_file_out );
+            fwrite( h->fenc->f_qp_offset, sizeof(float), h->mb.i_mb_count, rc->p_mbtree_stat_file_out );
+        }
     }

     if( rc->b_abr )
@@ -1177,7 +1263,7 @@ static double get_qscale(x264_t *h, ratecontrol_entry_t *rce, double rate_factor
     double q;
     x264_zone_t *zone = get_zone( h, frame_num );

-    q = pow( rce->blurred_complexity, 1 - h->param.rc.f_qcompress );
+    q = pow( rce->blurred_complexity, 1 - rcc->qcompress );

     // avoid NaN's in the rc_eq
     if(!isfinite(q) || rce->tex_bits + rce->mv_bits == 0)
diff --git a/encoder/ratecontrol.h b/encoder/ratecontrol.h
index 3310d3c..a0b62b2 100644
--- a/encoder/ratecontrol.h
+++ b/encoder/ratecontrol.h
@@ -29,6 +29,7 @@ void x264_ratecontrol_delete( x264_t * );

 void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame );
 void x264_adaptive_quant( x264_t * );
+int  x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame );
 void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next );
 void x264_ratecontrol_start( x264_t *, int i_force_qp );
 int  x264_ratecontrol_slice_type( x264_t *, int i_frame );
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index 2c16429..7b15781 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -63,6 +63,7 @@ static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
     x264_me_t m[2];
     int i_bcost = COST_MAX;
     int l, i;
+    int list_used = 0;

     h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
     h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fenc[0], FENC_STRIDE, &fenc->lowres[0][i_pel_offset], i_stride, 8 );
@@ -107,8 +108,7 @@ static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
         h->mc.avg[PIXEL_8x8]( pix1, 16, src1, stride1, src2, stride2, i_bipred_weight ); \
         i_cost = penalty + h->pixf.mbcmp[PIXEL_8x8]( \
                            m[0].p_fenc[0], FENC_STRIDE, pix1, 16 ); \
-        if( i_bcost > i_cost ) \
-            i_bcost = i_cost; \
+        COPY2_IF_LT( i_bcost, i_cost, list_used, 3 ); \
     }

     m[0].i_pixel = PIXEL_8x8;
@@ -138,8 +138,7 @@ static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
             int i_cost;
             h->mc.avg[PIXEL_8x8]( pix1, 16, m[0].p_fref[0], m[0].i_stride[0], m[1].p_fref[0], m[1].i_stride[0], i_bipred_weight );
             i_cost = h->pixf.mbcmp[PIXEL_8x8]( m[0].p_fenc[0], FENC_STRIDE, pix1, 16 );
-            if( i_bcost > i_cost )
-                i_bcost = i_cost;
+            COPY2_IF_LT( i_bcost, i_cost, list_used, 3 );
         }
     }

@@ -181,16 +180,18 @@ static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
             *(uint32_t*)m[l].mv = *(uint32_t*)fenc_mvs[l];
             m[l].cost = *fenc_costs[l];
         }
-        i_bcost = X264_MIN( i_bcost, m[l].cost );
+        COPY2_IF_LT( i_bcost, m[l].cost, list_used, l+1 );
     }

     if( b_bidir && ( *(uint32_t*)m[0].mv || *(uint32_t*)m[1].mv ) )
         TRY_BIDIR( m[0].mv, m[1].mv, 5 );

+    frames[b]->lowres_inter_types[b-p0][p1-b][i_mb_xy] = list_used;
+
 lowres_intra_mb:
     /* forbid intra-mbs in B-frames, because it's rare and not worth checking */
     /* FIXME: Should we still forbid them now that we cache intra scores? */
-    if( !b_bidir )
+    if( !b_bidir || h->param.rc.b_mb_tree )
     {
         int i_icost, b_intra;
         if( !fenc->b_intra_calculated )
@@ -237,18 +238,23 @@ lowres_intra_mb:
         }
         else
             i_icost = fenc->i_intra_cost[i_mb_xy];
-        b_intra = i_icost < i_bcost;
-        if( b_intra )
-            i_bcost = i_icost;
-        if(   (i_mb_x > 0 && i_mb_x < h->sps->i_mb_width - 1
-            && i_mb_y > 0 && i_mb_y < h->sps->i_mb_height - 1)
-            || h->sps->i_mb_width <= 2 || h->sps->i_mb_height <= 2 )
+        if( !b_bidir )
         {
-            fenc->i_intra_mbs[b-p0] += b_intra;
-            fenc->i_cost_est[0][0] += i_icost;
+            b_intra = i_icost < i_bcost;
+            if( b_intra )
+                i_bcost = i_icost;
+            if(   (i_mb_x > 0 && i_mb_x < h->sps->i_mb_width - 1
+                && i_mb_y > 0 && i_mb_y < h->sps->i_mb_height - 1)
+                || h->sps->i_mb_width <= 2 || h->sps->i_mb_height <= 2 )
+            {
+                fenc->i_intra_mbs[b-p0] += b_intra;
+                fenc->i_cost_est[0][0] += i_icost;
+            }
         }
     }

+    frames[b]->lowres_costs[b-p0][p1-b][i_mb_xy] = i_bcost;
+
     return i_bcost;
 }
 #undef TRY_BIDIR
@@ -262,6 +268,7 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
                                x264_frame_t **frames, int p0, int p1, int b,
                                int b_intra_penalty )
 {
+
     int i_score = 0;
     /* Don't use the AQ'd scores for slicetype decision. */
     int i_score_aq = 0;
@@ -299,7 +306,8 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,

         /* the edge mbs seem to reduce the predictive quality of the
          * whole frame's score, but are needed for a spatial distribution. */
-        if( h->param.rc.i_vbv_buffer_size || h->sps->i_mb_width <= 2 || h->sps->i_mb_height <= 2 )
+        if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size ||
+            h->sps->i_mb_width <= 2 || h->sps->i_mb_height <= 2 )
         {
             for( h->mb.i_mb_y = h->sps->i_mb_height - 1; h->mb.i_mb_y >= 0; h->mb.i_mb_y-- )
             {
@@ -355,7 +363,170 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
     return i_score;
 }

-#define MAX_LENGTH (X264_BFRAME_MAX*4)
+/* MB-tree may change the per-macroblock quantizer offsets after lookahead has run, so
+ * rebuild the row SATDs and the frame score from the cached lowres costs instead. */
+static int x264_slicetype_frame_cost_recalculate( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int b )
+{
+    x264_frame_t *frame = frames[b];
+    int *row_satd = frame->i_row_satds[b-p0][p1-b];
+    int frame_score = 0;
+    x264_emms();
+    for( h->mb.i_mb_y = h->sps->i_mb_height - 1; h->mb.i_mb_y >= 0; h->mb.i_mb_y-- )
+    {
+        int row_score = 0;
+        for( h->mb.i_mb_x = h->sps->i_mb_width - 1; h->mb.i_mb_x >= 0; h->mb.i_mb_x-- )
+        {
+            int xy = h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride;
+            float qp_adj = frame->f_qp_offset[xy];
+            int cost = frame->lowres_costs[b-p0][p1-b][xy];
+            int b_interior = h->mb.i_mb_y > 0 && h->mb.i_mb_y < h->sps->i_mb_height - 1 &&
+                             h->mb.i_mb_x > 0 && h->mb.i_mb_x < h->sps->i_mb_width - 1;
+            cost = (cost * x264_exp2fix8(qp_adj*(-1.f/6.f)) + 128) >> 8;
+            row_score += cost;
+            /* Edge MBs enter the frame score only when the frame is at most 2 MBs wide or tall. */
+            if( b_interior || h->sps->i_mb_width <= 2 || h->sps->i_mb_height <= 2 )
+                frame_score += cost;
+        }
+        row_satd[ h->mb.i_mb_y ] = row_score;
+    }
+    return frame_score;
+}
+
+/* Propagate frame b's costs into the i_propagate_cost of frames[p0] (list 0) and
+ * frames[p1] (list 1).  For each MB whose inter cost beats its intra cost, the share of
+ * (intra cost + cost already propagated into b) saved by inter prediction is spread,
+ * bilinearly weighted, over the up to four reference MBs the lowres MV points at. */
+static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, int p0, int p1, int b )
+{
+    int refs[2] = {p0,p1};
+    /* Scale by temporal distance only when the references differ; p1 == p0 would divide
+     * by zero, so fall back to 128 there, which yields the equal bipred weight of 32. */
+    int dist_scale_factor = p1 == p0 ? 128 : ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
+    int i_bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor>>2) : 32;
+
+    for( h->mb.i_mb_y = 0; h->mb.i_mb_y < h->sps->i_mb_height; h->mb.i_mb_y++ )
+    {
+        for( h->mb.i_mb_x = 0; h->mb.i_mb_x < h->sps->i_mb_width; h->mb.i_mb_x++ )
+        {
+            int mb_index = h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride;
+            int inter_cost = frames[b]->lowres_costs[b-p0][p1-b][mb_index];
+            int intra_cost = (frames[b]->i_intra_cost[mb_index] * frames[b]->i_inv_qscale_factor[mb_index]+128)>>8;
+            int lists_used = frames[b]->lowres_inter_types[b-p0][p1-b][mb_index];
+
+            /* Don't propagate for an intra block.  Testing before dividing also keeps a
+             * zero intra_cost (and a negative cost difference) away from the division. */
+            if( intra_cost > 0 && inter_cost < intra_cost )
+            {
+                int list;
+                /* The approximate amount of data that this block contains. */
+                int propagate_amount = intra_cost + frames[b]->i_propagate_cost[mb_index];
+                /* Keep the share saved by inter prediction.  Divide by 64 for per-pixel summing. */
+                propagate_amount = (((uint64_t)propagate_amount*(intra_cost-inter_cost)) / intra_cost + 32) >> 6;
+
+                /* Follow the MVs to the previous frame(s). */
+                for( list = 0; list < 2; list++ )
+                    if( (lists_used >> list)&1 )
+                    {
+                        /* List 0 MVs are stored under index b-p0-1, list 1 MVs under p1-b-1. */
+                        int x = frames[b]->lowres_mvs[list][(list ? p1-b : b-p0)-1][mb_index][0];
+                        int y = frames[b]->lowres_mvs[list][(list ? p1-b : b-p0)-1][mb_index][1];
+                        int listamount = propagate_amount;
+                        int mbx = (x>>5)+h->mb.i_mb_x;
+                        int mby = ((y>>5)+h->mb.i_mb_y);
+                        int idx0 = mbx + mby*h->mb.i_mb_stride;
+                        int idx1 = idx0 + 1;
+                        int idx2 = idx0 + h->mb.i_mb_stride;
+                        int idx3 = idx0 + h->mb.i_mb_stride + 1;
+                        int idx0weight = (32-(y&31))*(32-(x&31));
+                        int idx1weight = (32-(y&31))*(x&31);
+                        int idx2weight = (y&31)*(32-(x&31));
+                        int idx3weight = (y&31)*(x&31);
+
+                        /* Apply bipred weighting. */
+                        if( lists_used == 3 )
+                            listamount = (listamount * (list?(64-i_bipred_weight):i_bipred_weight) + 32) >> 6;
+
+                        /* We could just clip the MVs, but pixels that lie outside the frame probably shouldn't
+                         * be counted. */
+                        if( mbx < h->sps->i_mb_width-1 && mby < h->sps->i_mb_height-1 && mbx >= 0 && mby >= 0 )
+                        {
+                            frames[refs[list]]->i_propagate_cost[idx0] += (listamount*idx0weight+8)>>4;
+                            frames[refs[list]]->i_propagate_cost[idx1] += (listamount*idx1weight+8)>>4;
+                            frames[refs[list]]->i_propagate_cost[idx2] += (listamount*idx2weight+8)>>4;
+                            frames[refs[list]]->i_propagate_cost[idx3] += (listamount*idx3weight+8)>>4;
+                        }
+                        else /* Check offsets individually */
+                        {
+                            if( mbx < h->sps->i_mb_width && mby < h->sps->i_mb_height && mbx >= 0 && mby >= 0 )
+                                frames[refs[list]]->i_propagate_cost[idx0] += (listamount*idx0weight+8)>>4;
+                            if( mbx+1 < h->sps->i_mb_width && mby < h->sps->i_mb_height && mbx+1 >= 0 && mby >= 0 )
+                                frames[refs[list]]->i_propagate_cost[idx1] += (listamount*idx1weight+8)>>4;
+                            if( mbx < h->sps->i_mb_width && mby+1 < h->sps->i_mb_height && mbx >= 0 && mby+1 >= 0 )
+                                frames[refs[list]]->i_propagate_cost[idx2] += (listamount*idx2weight+8)>>4;
+                            if( mbx+1 < h->sps->i_mb_width && mby+1 < h->sps->i_mb_height && mbx+1 >= 0 && mby+1 >= 0 )
+                                frames[refs[list]]->i_propagate_cost[idx3] += (listamount*idx3weight+8)>>4;
+                        }
+                    }
+            }
+        }
+    }
+}
+
+/* Walk backwards from the last non-B frame, propagating MB costs into every earlier
+ * reference, then turn what reached frames[last_nonb] into per-MB QP offsets. */
+static void x264_macroblock_tree( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int num_frames, int b_intra )
+{
+    int first_ref = !b_intra; /* lowest index the backward walk may use as a reference */
+    int i, last_nonb, cur_nonb;
+    x264_frame_t *frame;
+    if( b_intra )
+        x264_slicetype_frame_cost( h, a, frames, 0, 0, 0, 0 );
+
+    /* Find the last non-B frame; with no frames at all the index stays negative. */
+    last_nonb = num_frames-1;
+    while( last_nonb > 0 && frames[last_nonb]->i_type == X264_TYPE_B )
+        last_nonb--;
+    if( last_nonb < 0 )
+        return;
+
+    memset( frames[last_nonb]->i_propagate_cost, 0, h->mb.i_mb_count * sizeof(uint32_t) );
+    i = last_nonb;
+    while( i-- > first_ref )
+    {
+        cur_nonb = i;
+        while( frames[cur_nonb]->i_type == X264_TYPE_B && cur_nonb > 0 )
+            cur_nonb--;
+        if( cur_nonb < first_ref )
+            break;
+        x264_slicetype_frame_cost( h, a, frames, cur_nonb, last_nonb, last_nonb, 0 );
+        memset( frames[cur_nonb]->i_propagate_cost, 0, h->mb.i_mb_count * sizeof(uint32_t) );
+        x264_macroblock_tree_propagate( h, frames, cur_nonb, last_nonb, last_nonb );
+        /* Then the B-frames between the two references, latest first. */
+        for( ; frames[i]->i_type == X264_TYPE_B && i > 0; i-- )
+        {
+            x264_slicetype_frame_cost( h, a, frames, cur_nonb, last_nonb, i, 0 );
+            memset( frames[i]->i_propagate_cost, 0, h->mb.i_mb_count * sizeof(uint32_t) );
+            x264_macroblock_tree_propagate( h, frames, cur_nonb, last_nonb, i );
+        }
+        last_nonb = cur_nonb;
+    }
+    x264_emms();
+
+    frame = frames[last_nonb];
+    for( h->mb.i_mb_y = 0; h->mb.i_mb_y < h->sps->i_mb_height; h->mb.i_mb_y++ )
+        for( h->mb.i_mb_x = 0; h->mb.i_mb_x < h->sps->i_mb_width; h->mb.i_mb_x++ )
+        {
+            int mb_index = h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride;
+            float intra_cost = (frame->i_intra_cost[mb_index] * frame->i_inv_qscale_factor[mb_index]+128)>>8;
+            if( intra_cost )
+            {
+                float propagate_cost = frame->i_propagate_cost[mb_index];
+                float ratio = (intra_cost + propagate_cost) / (intra_cost);
+                /* The strength scales with (1 - qcompress), since the two concepts are very similar. */
+                frame->f_qp_offset[mb_index] -= 5.0 * (1.0 - h->param.rc.f_qcompress) * log2f(ratio);
+            }
+        }
+}

 static int x264_slicetype_path_cost( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, char *path, int threshold )
 {
@@ -393,14 +564,14 @@ static int x264_slicetype_path_cost( x264_t *h, x264_mb_analysis_t *a, x264_fram
 /* Uses strings due to the fact that the speed of the control functions is
    negligable compared to the cost of running slicetype_frame_cost, and because
    it makes debugging easier. */
-static void x264_slicetype_path( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int length, int max_bframes, int buffer_size, char (*best_paths)[MAX_LENGTH] )
+static void x264_slicetype_path( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int length, int max_bframes, int buffer_size, char (*best_paths)[X264_LOOKAHEAD_MAX] )
 {
-    char paths[X264_BFRAME_MAX+2][MAX_LENGTH] = {{0}};
+    char paths[X264_BFRAME_MAX+2][X264_LOOKAHEAD_MAX] = {{0}};
     int num_paths = X264_MIN(max_bframes+1, length);
     int suffix_size, loc, path;
     int best_cost = COST_MAX;
     int best_path_index = 0;
-    length = X264_MIN(length,MAX_LENGTH);
+    length = X264_MIN(length,X264_LOOKAHEAD_MAX);

     /* Iterate over all currently possible paths and add suffixes to each one */
     for( suffix_size = 0; suffix_size < num_paths; suffix_size++ )
@@ -426,15 +597,6 @@ static void x264_slicetype_path( x264_t *h, x264_mb_analysis_t *a, x264_frame_t
     memcpy( best_paths[length], paths[best_path_index], length );
 }

-static int x264_slicetype_path_search( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int length, int bframes, int buffer )
-{
-    char best_paths[MAX_LENGTH][MAX_LENGTH] = {"","P"};
-    int n;
-    for( n = 2; n < length-1; n++ )
-        x264_slicetype_path( h, a, frames, n, bframes, buffer, best_paths );
-    return strspn( best_paths[length-2], "B" );
-}
-
 static int scenecut( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1 )
 {
     x264_frame_t *frame = frames[p1];
@@ -477,13 +639,13 @@ static int scenecut( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, in
     return res;
 }

-static void x264_slicetype_analyse( x264_t *h )
+static void x264_slicetype_analyse( x264_t *h, int keyframe )
 {
     x264_mb_analysis_t a;
-    x264_frame_t *frames[X264_BFRAME_MAX*4+3] = { NULL, };
+    x264_frame_t *frames[X264_LOOKAHEAD_MAX+3] = { NULL, };
     int num_frames;
     int keyint_limit;
-    int j;
+    int i,j;
     int i_mb_count = NUM_MBS;
     int cost1p0, cost2p0, cost1b1, cost2p1;
     int idr_frame_type;
@@ -497,96 +659,150 @@ static void x264_slicetype_analyse( x264_t *h )
         frames[j+1] = h->frames.next[j];
     keyint_limit = h->param.i_keyint_max - frames[0]->i_frame + h->frames.i_last_idr - 1;
     num_frames = X264_MIN( j, keyint_limit );
-    if( num_frames == 0 )
+
+    if( num_frames == 0 && (!j || !h->param.rc.b_mb_tree) )
         return;

     x264_lowres_context_init( h, &a );
     idr_frame_type = frames[1]->i_frame - h->frames.i_last_idr >= h->param.i_keyint_min ? X264_TYPE_IDR : X264_TYPE_I;

-    if( num_frames == 1 )
+    if( num_frames == 1 && !h->param.rc.b_mb_tree )
     {
-no_b_frames:
         frames[1]->i_type = X264_TYPE_P;
         if( h->param.i_scenecut_threshold && scenecut( h, &a, frames, 0, 1 ) )
             frames[1]->i_type = idr_frame_type;
         return;
     }

-    if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS )
+    /* This is important psy-wise: if we have a non-scenecut keyframe,
+     * there will be significant visual artifacts if the frames just before
+     * go down in quality due to being referenced less, despite it being
+     * more RD-optimal. */
+    if( h->param.analyse.b_psy && h->param.rc.b_mb_tree )
+        num_frames = j;
+
+    char best_paths[X264_LOOKAHEAD_MAX][X264_LOOKAHEAD_MAX] = {"","P"};
+    int n;
+    int num_bframes = 0;
+    int max_bframes = X264_MIN(num_frames-1, h->param.i_bframe);
+    int num_analysed_frames = num_frames;
+    int reset_start;
+    if( h->param.i_scenecut_threshold && scenecut( h, &a, frames, 0, 1 ) )
     {
-        int num_bframes;
-        int max_bframes = X264_MIN(num_frames-1, h->param.i_bframe);
-        if( h->param.i_scenecut_threshold && scenecut( h, &a, frames, 0, 1 ) )
+        frames[1]->i_type = idr_frame_type;
+        return;
+    }
+
+    if( h->param.i_bframe )
+    {
+        if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS )
         {
-            frames[1]->i_type = idr_frame_type;
-            return;
+            /* Perform the frametype analysis. */
+            for( n = 2; n < num_frames-1; n++ )
+                x264_slicetype_path( h, &a, frames, n, max_bframes, num_frames-max_bframes, best_paths );
+            num_bframes = strspn( best_paths[num_frames-2], "B" );
+            /* Load the results of the analysis into the frame types. */
+            for( j = 1; j < num_frames; j++ )
+                frames[j]->i_type = best_paths[num_frames-2][j-1] == 'B' ? X264_TYPE_B : X264_TYPE_P;
+            frames[num_frames]->i_type = X264_TYPE_P;
         }
-        num_bframes = x264_slicetype_path_search( h, &a, frames, num_frames, max_bframes, num_frames-max_bframes );
-        assert(num_bframes < num_frames);
-
-        for( j = 1; j < num_bframes+1; j++ )
+        else if( h->param.i_bframe_adaptive == X264_B_ADAPT_FAST )
         {
-            if( h->param.i_scenecut_threshold && scenecut( h, &a, frames, j, j+1 ) )
+            for( i = 0; i < num_frames-(2-!i); )
             {
-                frames[j]->i_type = X264_TYPE_P;
-                return;
-            }
-            frames[j]->i_type = X264_TYPE_B;
-        }
-        frames[num_bframes+1]->i_type = X264_TYPE_P;
-    }
-    else if( h->param.i_bframe_adaptive == X264_B_ADAPT_FAST )
-    {
-        cost2p1 = x264_slicetype_frame_cost( h, &a, frames, 0, 2, 2, 1 );
-        if( frames[2]->i_intra_mbs[2] > i_mb_count / 2 )
-            goto no_b_frames;
+                cost2p1 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+2, i+2, 1 );
+                if( frames[i+2]->i_intra_mbs[2] > i_mb_count / 2 )
+                {
+                    frames[i+1]->i_type = X264_TYPE_P;
+                    frames[i+2]->i_type = X264_TYPE_P;
+                    i += 2;
+                    continue;
+                }

-        cost1b1 = x264_slicetype_frame_cost( h, &a, frames, 0, 2, 1, 0 );
-        cost1p0 = x264_slicetype_frame_cost( h, &a, frames, 0, 1, 1, 0 );
-        cost2p0 = x264_slicetype_frame_cost( h, &a, frames, 1, 2, 2, 0 );
+                cost1b1 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+2, i+1, 0 );
+                cost1p0 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+1, i+1, 0 );
+                cost2p0 = x264_slicetype_frame_cost( h, &a, frames, i+1, i+2, i+2, 0 );

-        if( cost1p0 + cost2p0 < cost1b1 + cost2p1 )
-            goto no_b_frames;
+                if( cost1p0 + cost2p0 < cost1b1 + cost2p1 )
+                {
+                    frames[i+1]->i_type = X264_TYPE_P;
+                    frames[i+2]->i_type = X264_TYPE_P;
+                    i += 2;
+                    continue;
+                }

-        // arbitrary and untuned
-        #define INTER_THRESH 300
-        #define P_SENS_BIAS (50 - h->param.i_bframe_bias)
-        frames[1]->i_type = X264_TYPE_B;
+                // arbitrary and untuned
+                #define INTER_THRESH 300
+                #define P_SENS_BIAS (50 - h->param.i_bframe_bias)
+                frames[i+1]->i_type = X264_TYPE_B;
+                frames[i+2]->i_type = X264_TYPE_P;

-        for( j = 2; j <= X264_MIN( h->param.i_bframe, num_frames-1 ); j++ )
+                for( j = i+2; j <= X264_MIN( h->param.i_bframe, num_frames-2 ); j++ )
+                {
+                    int pthresh = X264_MAX(INTER_THRESH - P_SENS_BIAS * (j-i-1), INTER_THRESH/10);
+                    int pcost = x264_slicetype_frame_cost( h, &a, frames, i+0, j+1, j+1, 1 );
+
+                    if( pcost > pthresh*i_mb_count || frames[j+1]->i_intra_mbs[j-i+1] > i_mb_count/3 )
+                    {
+                        frames[j]->i_type = X264_TYPE_P;
+                        break;
+                    }
+                    else
+                        frames[j]->i_type = X264_TYPE_B;
+                }
+                i = j;
+            }
+            frames[i+!i]->i_type = X264_TYPE_P;
+            num_bframes = 0;
+            while( num_bframes < num_frames && frames[num_bframes+1]->i_type == X264_TYPE_B )
+                num_bframes++;
+        }
+        else
         {
-            int pthresh = X264_MAX(INTER_THRESH - P_SENS_BIAS * (j-1), INTER_THRESH/10);
-            int pcost = x264_slicetype_frame_cost( h, &a, frames, 0, j+1, j+1, 1 );
+            num_bframes = X264_MIN(num_frames-1, h->param.i_bframe);
+            for( j = 1; j < num_frames; j++ )
+                frames[j]->i_type = (j%(num_bframes+1)) ? X264_TYPE_B : X264_TYPE_P;
+            frames[num_frames]->i_type = X264_TYPE_P;
+        }

-            if( pcost > pthresh*i_mb_count || frames[j+1]->i_intra_mbs[j+1] > i_mb_count/3 )
+        /* Check scenecut on the first minigop. */
+        for( j = 1; j < num_bframes+1; j++ )
+            if( h->param.i_scenecut_threshold && scenecut( h, &a, frames, j, j+1 ) )
             {
                 frames[j]->i_type = X264_TYPE_P;
+                num_analysed_frames = j;
                 break;
             }
-            else
-                frames[j]->i_type = X264_TYPE_B;
-        }
+
+        reset_start = keyframe ? 1 : X264_MIN( num_bframes+2, num_analysed_frames+1 );
     }
     else
     {
-        int max_bframes = X264_MIN(num_frames-1, h->param.i_bframe);
-        if( h->param.i_scenecut_threshold && scenecut( h, &a, frames, 0, 1 ) )
-        {
-            frames[1]->i_type = idr_frame_type;
-            return;
-        }
+        for( j = 1; j < num_frames; j++ )
+            frames[j]->i_type = X264_TYPE_P;
+        reset_start = !keyframe + 1;
+    }

-        for( j = 1; j < max_bframes+1; j++ )
-        {
-            if( h->param.i_scenecut_threshold && scenecut( h, &a, frames, j, j+1 ) )
+    /* Perform the actual macroblock tree analysis.
+     * Don't go farther than the lookahead parameter; this helps in short GOPs. */
+    if( h->param.rc.b_mb_tree )
+        x264_macroblock_tree( h, &a, frames, X264_MIN(num_analysed_frames, h->param.i_lookahead), keyframe );
+
+    /* Enforce keyframe limit. */
+    if( h->param.i_bframe )
+        for( j = 0; j <= num_bframes; j++ )
+            if( j+1 > keyint_limit )
             {
-                frames[j]->i_type = X264_TYPE_P;
-                return;
+                if( j )
+                    frames[j]->i_type = X264_TYPE_P;
+                frames[j+1]->i_type = idr_frame_type;
+                reset_start = j+2;
+                break;
             }
-            frames[j]->i_type = X264_TYPE_B;
-        }
-        frames[max_bframes+1]->i_type = X264_TYPE_P;
-    }
+
+    /* Restore frametypes for all frames that haven't actually been decided yet. */
+    for( j = reset_start; j <= num_frames; j++ )
+        frames[j]->i_type = X264_TYPE_AUTO;
 }

 void x264_slicetype_decide( x264_t *h )
@@ -606,8 +822,9 @@ void x264_slicetype_decide( x264_t *h )
                 x264_ratecontrol_slice_type( h, h->frames.next[i]->i_frame );
     }
     else if( (h->param.i_bframe && h->param.i_bframe_adaptive)
-             || h->param.i_scenecut_threshold )
-        x264_slicetype_analyse( h );
+             || h->param.i_scenecut_threshold
+             || h->param.rc.b_mb_tree )
+        x264_slicetype_analyse( h, 0 );

     for( bframes = 0;; bframes++ )
     {
@@ -645,7 +862,9 @@ void x264_slicetype_decide( x264_t *h )
                 frm->i_type = X264_TYPE_P;
         }

-        if( frm->i_type == X264_TYPE_AUTO ) frm->i_type = X264_TYPE_B;
+        if( frm->i_type == X264_TYPE_AUTO )
+            frm->i_type = X264_TYPE_B;
+
         else if( !IS_X264_TYPE_B( frm->i_type ) ) break;
     }
 }
@@ -653,7 +872,7 @@ void x264_slicetype_decide( x264_t *h )
 int x264_rc_analyse_slice( x264_t *h )
 {
     x264_mb_analysis_t a;
-    x264_frame_t *frames[X264_BFRAME_MAX*4+2] = { NULL, };
+    x264_frame_t *frames[X264_LOOKAHEAD_MAX+2] = { NULL, };
     int p0=0, p1, b;
     int cost;

@@ -662,6 +881,12 @@ int x264_rc_analyse_slice( x264_t *h )
     if( IS_X264_TYPE_I(h->fenc->i_type) )
     {
         p1 = b = 0;
+        /* For MB-tree, we have to perform propagation analysis on I-frames too. */
+        if( h->param.rc.b_mb_tree )
+        {
+            h->frames.last_nonb = h->fenc;
+            x264_slicetype_analyse( h, 1 );
+        }
     }
     else if( X264_TYPE_P == h->fenc->i_type )
     {
@@ -680,11 +905,16 @@ int x264_rc_analyse_slice( x264_t *h )
     frames[p0] = h->fref0[0];
     frames[b] = h->fenc;

-    cost = x264_slicetype_frame_cost( h, &a, frames, p0, p1, b, 0 );
+    if( h->param.rc.b_mb_tree )
+        cost = x264_slicetype_frame_cost_recalculate( h, &a, frames, p0, p1, b );
+    else
+    {
+        cost = x264_slicetype_frame_cost( h, &a, frames, p0, p1, b, 0 );

-    /* In AQ, use the weighted score instead. */
-    if( h->param.rc.i_aq_mode )
-        cost = frames[b]->i_cost_est[b-p0][p1-b];
+        /* In AQ, use the weighted score instead. */
+        if( h->param.rc.i_aq_mode )
+            cost = frames[b]->i_cost_est[b-p0][p1-b];
+    }

     h->fenc->i_row_satd = h->fenc->i_row_satds[b-p0][p1-b];
     h->fdec->i_row_satd = h->fdec->i_row_satds[b-p0][p1-b];
diff --git a/x264.c b/x264.c
index c3b4f29..5452dba 100644
--- a/x264.c
+++ b/x264.c
@@ -168,9 +168,8 @@ static void Help( x264_param_t *defaults, int b_longhelp )
     H0( "                                  - baseline,main,high\n" );
     H0( "      --preset                Use a preset to select encoding settings [medium]\n" );
     H0( "                                  Overridden by user settings\n");
-    H1( "                                  - ultrafast,veryfast,fast,medium\n"
-        "                                  - slow,slower,placebo\n" );
-    else H0( "                                  - ultrafast,veryfast,fast,medium,slow,slower\n" );
+    H0( "                                  - ultrafast,veryfast,faster,fast\n"
+        "                                  - medium,slow,slower,placebo\n" );
     H0( "      --tune                  Tune the settings for a particular type of source\n" );
     H0( "                                  Overridden by user settings\n");
     H1( "                                  - film,animation,grain,psnr,ssim\n"
@@ -184,6 +183,7 @@ static void Help( x264_param_t *defaults, int b_longhelp )
     H1( "  -i, --min-keyint <integer>  Minimum GOP size [%d]\n", defaults->i_keyint_min );
     H1( "      --no-scenecut           Disable adaptive I-frame decision\n" );
     H1( "      --scenecut <integer>    How aggressively to insert extra I-frames [%d]\n", defaults->i_scenecut_threshold );
+    H0( "      --lookahead <integer>   Number of frames for frametype lookahead [%d]\n", defaults->i_lookahead );
     H0( "  -b, --bframes <integer>     Number of B-frames between I and P [%d]\n", defaults->i_bframe );
     H1( "      --b-adapt               Adaptive B-frame decision method [%d]\n"
         "                                  Higher values may lower threading efficiency.\n"
@@ -228,6 +228,7 @@ static void Help( x264_param_t *defaults, int b_longhelp )
         "                                  - 2: Last pass, does not overwrite stats file\n"
         "                                  - 3: Nth pass, overwrites stats file\n" );
     H0( "      --stats <string>        Filename for 2 pass stats [\"%s\"]\n", defaults->rc.psz_stat_out );
+    H0( "      --no-mbtree                Disable mb-tree ratecontrol.\n");
     H0( "      --qcomp <float>         QP curve compression: 0.0 => CBR, 1.0 => CQP [%.2f]\n", defaults->rc.f_qcompress );
     H1( "      --cplxblur <float>      Reduce fluctuations in QP (before curve compression) [%.1f]\n", defaults->rc.f_complexity_blur );
     H1( "      --qblur <float>         Reduce fluctuations in QP (after curve compression) [%.1f]\n", defaults->rc.f_qblur );
@@ -277,6 +278,8 @@ static void Help( x264_param_t *defaults, int b_longhelp )
         "                                  #1: RD (requires subme>=6)\n"
         "                                  #2: Trellis (requires trellis, experimental)\n",
                                        defaults->analyse.f_psy_rd, defaults->analyse.f_psy_trellis );
+    H1( "      --no-psy                Disable all visual optimizations that worsen\n"
+        "                              both PSNR and SSIM.\n" );
     H0( "      --no-mixed-refs         Don't decide references on a per partition basis\n" );
     H1( "      --no-chroma-me          Ignore chroma in motion estimation\n" );
     H0( "      --no-8x8dct             Disable adaptive spatial transform size\n" );
@@ -383,6 +386,7 @@ static struct option long_options[] =
     { "slow-firstpass",    no_argument, NULL, OPT_SLOWFIRSTPASS },
     { "bitrate",     required_argument, NULL, 'B' },
     { "bframes",     required_argument, NULL, 'b' },
+    { "lookahead",   required_argument, NULL, 0 },
     { "b-adapt",     required_argument, NULL, 0 },
     { "no-b-adapt",        no_argument, NULL, 0 },
     { "b-bias",      required_argument, NULL, 0 },
@@ -422,6 +426,7 @@ static struct option long_options[] =
     { "mvrange-thread", required_argument, NULL, 0 },
     { "subme",       required_argument, NULL, 'm' },
     { "psy-rd",      required_argument, NULL, 0 },
+    { "no-psy",            no_argument, NULL, 0 },
     { "mixed-refs",        no_argument, NULL, 0 },
     { "no-mixed-refs",     no_argument, NULL, 0 },
     { "no-chroma-me",      no_argument, NULL, 0 },
@@ -446,6 +451,8 @@ static struct option long_options[] =
     { "pass",        required_argument, NULL, 'p' },
     { "stats",       required_argument, NULL, 0 },
     { "qcomp",       required_argument, NULL, 0 },
+    { "mbtree",            no_argument, NULL, 0 },
+    { "no-mbtree",         no_argument, NULL, 0 },
     { "qblur",       required_argument, NULL, 0 },
     { "cplxblur",    required_argument, NULL, 0 },
     { "zones",       required_argument, NULL, 0 },
@@ -542,6 +549,8 @@ static int  Parse( int argc, char **argv,
                 param->rc.i_aq_mode = 0;
                 param->analyse.b_mixed_references = 0;
                 param->analyse.i_trellis = 0;
+                param->i_bframe_adaptive = X264_B_ADAPT_NONE;
+                param->rc.b_mb_tree = 0;
             }
             else if( !strcasecmp( optarg, "veryfast" ) )
             {
@@ -551,12 +560,20 @@ static int  Parse( int argc, char **argv,
                 param->i_frame_reference = 1;
                 param->analyse.b_mixed_references = 0;
                 param->analyse.i_trellis = 0;
+                param->rc.b_mb_tree = 0;
             }
-            else if( !strcasecmp( optarg, "fast" ) )
+            else if( !strcasecmp( optarg, "faster" ) )
             {
                 param->analyse.b_mixed_references = 0;
                 param->i_frame_reference = 2;
                 param->analyse.i_subpel_refine = 4;
+                param->i_lookahead = 30;
+            }
+            else if( !strcasecmp( optarg, "fast" ) )
+            {
+                param->i_frame_reference = 2;
+                param->analyse.i_subpel_refine = 6;
+                param->i_lookahead = 40;
             }
             else if( !strcasecmp( optarg, "medium" ) )
             {
@@ -644,11 +661,13 @@ static int  Parse( int argc, char **argv,
             {
                 param->analyse.f_psy_rd = 0;
                 param->rc.i_aq_mode = X264_AQ_NONE;
+                param->analyse.b_psy = 0;
             }
             else if( !strcasecmp( optarg, "ssim" ) )
             {
                 param->analyse.f_psy_rd = 0;
                 param->rc.i_aq_mode = X264_AQ_AUTOVARIANCE;
+                param->analyse.b_psy = 0;
             }
             else if( !strcasecmp( optarg, "fastdecode" ) )
             {
@@ -662,7 +681,6 @@ static int  Parse( int argc, char **argv,
                 param->i_deblocking_filter_alphac0 = -1;
                 param->i_deblocking_filter_beta = -1;
                 param->analyse.f_psy_trellis = 0.2;
-                param->rc.f_ip_factor = 2.1;
                 param->rc.f_aq_strength = 1.3;
                 if( param->analyse.inter & X264_ANALYSE_PSUB16x16 )
                     param->analyse.inter |= X264_ANALYSE_PSUB8x8;
diff --git a/x264.h b/x264.h
index 2dfcc8d..9ea74af 100644
--- a/x264.h
+++ b/x264.h
@@ -35,7 +35,7 @@

 #include <stdarg.h>

-#define X264_BUILD 68
+#define X264_BUILD 69

 /* x264_t:
  *      opaque handler for encoder */
@@ -188,6 +188,7 @@ typedef struct x264_param_t
     int         i_keyint_max;       /* Force an IDR keyframe at this interval */
     int         i_keyint_min;       /* Scenecuts closer together than this are coded as I, not IDR. */
     int         i_scenecut_threshold; /* how aggressively to insert extra I frames */
+    int         i_lookahead;
     int         i_bframe;   /* how many b-frame between 2 references pictures */
     int         i_bframe_adaptive;
     int         i_bframe_bias;
@@ -242,6 +243,7 @@ typedef struct x264_param_t
         int          i_noise_reduction; /* adaptive pseudo-deadzone */
         float        f_psy_rd; /* Psy RD strength */
         float        f_psy_trellis; /* Psy trellis strength */
+        int          b_psy; /* Toggle all psy optimizations */

         /* the deadzone size that will be used in luma quantization */
         int          i_luma_deadzone[2]; /* {inter, intra} */
@@ -271,6 +273,7 @@ typedef struct x264_param_t

         int         i_aq_mode;      /* psy adaptive QP. (X264_AQ_*) */
         float       f_aq_strength;
+        int         b_mb_tree;      /* Macroblock-tree ratecontrol. */

         /* 2pass */
         int         b_stat_write;   /* Enable stat writing in psz_stat_out */
--
1.6.1.2