Untitled

From 636d85b07cab192f796485969bc5e7a5538b8372 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Sat, 21 Aug 2010 16:51:39 -0500
Subject: [PATCH 1/9] Add global #define for maximum reference count
 This should make it easier to play around with reference frame counts that exceed the spec maximum.

---
 common/common.h     |   35 ++++++++++++++++++-----------------
 common/frame.h      |    6 +++---
 common/macroblock.c |   10 +++++-----
 encoder/encoder.c   |   16 ++++++++--------
 encoder/set.c       |    2 +-
 5 files changed, 35 insertions(+), 34 deletions(-)

diff --git a/common/common.h b/common/common.h
index 72fc1d8..670fd12 100644
--- a/common/common.h
+++ b/common/common.h
@@ -51,6 +51,7 @@ do {\
 } while( 0 )

 #define X264_BFRAME_MAX 16
+#define X264_REF_MAX 16
 #define X264_THREAD_MAX 128
 #define X264_PCM_COST (384*BIT_DEPTH+16)
 #define X264_LOOKAHEAD_MAX 250
@@ -340,10 +341,10 @@ typedef struct
     {
         int idc;
         int arg;
-    } ref_pic_list_order[2][16];
+    } ref_pic_list_order[2][X264_REF_MAX];

     /* P-frame weighting */
-    x264_weight_t weight[32][3];
+    x264_weight_t weight[X264_REF_MAX*2][3];

     int i_mmco_remove_from_end;
     int i_mmco_command_count;
@@ -351,7 +352,7 @@ typedef struct
     {
         int i_difference_of_pic_nums;
         int i_poc;
-    } mmco[16];
+    } mmco[X264_REF_MAX];

     int i_cabac_init_idc;

@@ -479,7 +480,7 @@ struct x264_t
         x264_frame_t **blank_unused;

         /* frames used for reference + sentinels */
-        x264_frame_t *reference[16+2];
+        x264_frame_t *reference[X264_REF_MAX+2];

         int i_last_keyframe;       /* Frame number of the last keyframe */
         int i_last_idr;            /* Frame number of the last IDR (not RP)*/
@@ -511,9 +512,9 @@ struct x264_t

     /* references lists */
     int             i_ref0;
-    x264_frame_t    *fref0[16+3];     /* ref list 0 */
+    x264_frame_t    *fref0[X264_REF_MAX+3];     /* ref list 0 */
     int             i_ref1;
-    x264_frame_t    *fref1[16+3];     /* ref list 1 */
+    x264_frame_t    *fref1[X264_REF_MAX+3];     /* ref list 1 */
     int             b_ref_reorder[2];

     /* hrd */
@@ -605,14 +606,14 @@ struct x264_t
         int16_t (*mv[2])[2];                /* mb mv. set to 0 for intra mb */
         uint8_t (*mvd[2])[8][2];            /* absolute value of mb mv difference with predict, clipped to [0,33]. set to 0 if intra. cabac only */
         int8_t   *ref[2];                   /* mb ref. set to -1 if non used (intra or Lx only) */
-        int16_t (*mvr[2][32])[2];           /* 16x16 mv for each possible ref */
+        int16_t (*mvr[2][X264_REF_MAX*2])[2];/* 16x16 mv for each possible ref */
         int8_t  *skipbp;                    /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */
         int8_t  *mb_transform_size;         /* transform_size_8x8_flag of each mb */
         uint16_t *slice_table;              /* sh->first_mb of the slice that the indexed mb is part of
                                              * NOTE: this will fail on resolutions above 2^16 MBs... */

          /* buffer for weighted versions of the reference frames */
-        pixel *p_weight_buf[16];
+        pixel *p_weight_buf[X264_REF_MAX];

         /* current value */
         int     i_type;
@@ -675,9 +676,9 @@ struct x264_t

             /* pointer over mb of the references */
             int i_fref[2];
-            pixel *p_fref[2][32][4+1]; /* last: yN, yH, yV, yHV, uv */
-            pixel *p_fref_w[32];  /* weighted fullpel luma */
-            uint16_t *p_integral[2][16];
+            pixel *p_fref[2][X264_REF_MAX*2][4+1]; /* last: yN, yH, yV, yHV, uv */
+            pixel *p_fref_w[X264_REF_MAX*2];  /* weighted fullpel luma */
+            uint16_t *p_integral[2][X264_REF_MAX];

             /* fref stride */
             int     i_stride[3];
@@ -732,15 +733,15 @@ struct x264_t
         int     i_chroma_lambda2_offset;

         /* B_direct and weighted prediction */
-        int16_t dist_scale_factor_buf[2][32][4];
+        int16_t dist_scale_factor_buf[2][X264_REF_MAX*2][4];
         int16_t (*dist_scale_factor)[4];
-        int8_t bipred_weight_buf[2][32][4];
+        int8_t bipred_weight_buf[2][X264_REF_MAX*2][4];
         int8_t (*bipred_weight)[4];
         /* maps fref1[0]'s ref indices into the current list0 */
 #define map_col_to_list0(col) h->mb.map_col_to_list0[(col)+2]
-        int8_t  map_col_to_list0[18];
+        int8_t  map_col_to_list0[X264_REF_MAX+2];
         int ref_blind_dupe; /* The index of the blind reference frame duplicate. */
-        int8_t deblock_ref_table[32+2];
+        int8_t deblock_ref_table[X264_REF_MAX*2+2];
 #define deblock_ref_table(x) h->mb.deblock_ref_table[(x)+2]
     } mb;

@@ -765,7 +766,7 @@ struct x264_t
             int i_mb_count_p;
             int i_mb_count_skip;
             int i_mb_count_8x8dct[2];
-            int i_mb_count_ref[2][32];
+            int i_mb_count_ref[2][X264_REF_MAX*2];
             int i_mb_partition[17];
             int i_mb_cbp[6];
             int i_mb_pred_mode[4][13];
@@ -794,7 +795,7 @@ struct x264_t
         int64_t i_mb_count[5][19];
         int64_t i_mb_partition[2][17];
         int64_t i_mb_count_8x8dct[2];
-        int64_t i_mb_count_ref[2][2][32];
+        int64_t i_mb_count_ref[2][2][X264_REF_MAX*2];
         int64_t i_mb_cbp[6];
         int64_t i_mb_pred_mode[4][13];
         /* */
diff --git a/common/frame.h b/common/frame.h
index fcc28d7..3e0a3f5 100644
--- a/common/frame.h
+++ b/common/frame.h
@@ -75,8 +75,8 @@ typedef struct x264_frame
     pixel *buffer[4];
     pixel *buffer_lowres[4];

-    x264_weight_t weight[16][3]; /* [ref_index][plane] */
-    pixel *weighted[16]; /* plane[0] weighted of the reference frames */
+    x264_weight_t weight[X264_REF_MAX][3]; /* [ref_index][plane] */
+    pixel *weighted[X264_REF_MAX]; /* plane[0] weighted of the reference frames */
     int b_duplicate;
     struct x264_frame *orig;

@@ -97,7 +97,7 @@ typedef struct x264_frame
     int     *lowres_mv_costs[2][X264_BFRAME_MAX+1];
     int8_t  *ref[2];
     int     i_ref[2];
-    int     ref_poc[2][16];
+    int     ref_poc[2][X264_REF_MAX];
     int16_t inv_ref_poc[2]; // inverse values of ref0 poc to avoid divisions in temporal MV prediction

     /* for adaptive B-frame decision.
diff --git a/common/macroblock.c b/common/macroblock.c
index 7347645..6efd7e6 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -233,11 +233,11 @@ int x264_macroblock_cache_allocate( x264_t *h )

     for( int i = 0; i < 2; i++ )
     {
-        int i_refs = X264_MIN(16, (i ? 1 + !!h->param.i_bframe_pyramid : h->param.i_frame_reference) ) << h->param.b_interlaced;
+        int i_refs = X264_MIN(X264_REF_MAX, (i ? 1 + !!h->param.i_bframe_pyramid : h->param.i_frame_reference) ) << h->param.b_interlaced;
         if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
-            i_refs = X264_MIN(16, i_refs + 2); //smart weights add two duplicate frames
+            i_refs = X264_MIN(X264_REF_MAX, i_refs + 2); //smart weights add two duplicate frames
         else if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_BLIND )
-            i_refs = X264_MIN(16, i_refs + 1); //blind weights add one duplicate frame
+            i_refs = X264_MIN(X264_REF_MAX, i_refs + 1); //blind weights add one duplicate frame

         for( int j = !i; j < i_refs; j++ )
         {
@@ -289,10 +289,10 @@ fail:
 void x264_macroblock_cache_free( x264_t *h )
 {
     for( int i = 0; i < 2; i++ )
-        for( int j = !i; j < 32; j++ )
+        for( int j = !i; j < X264_REF_MAX*2; j++ )
             if( h->mb.mvr[i][j] )
                 x264_free( h->mb.mvr[i][j]-1 );
-    for( int i = 0; i < 16; i++ )
+    for( int i = 0; i < X264_REF_MAX; i++ )
         x264_free( h->mb.p_weight_buf[i] );

     if( h->param.b_cabac )
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 0b65d51..f6d9965 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -571,8 +571,8 @@ static int x264_validate_parameters( x264_t *h )
             h->param.i_slice_count = 0;
     }

-    h->param.i_frame_reference = x264_clip3( h->param.i_frame_reference, 1, 16 );
-    h->param.i_dpb_size = x264_clip3( h->param.i_dpb_size, 1, 16 );
+    h->param.i_frame_reference = x264_clip3( h->param.i_frame_reference, 1, X264_REF_MAX );
+    h->param.i_dpb_size = x264_clip3( h->param.i_dpb_size, 1, X264_REF_MAX );
     h->param.i_keyint_max = x264_clip3( h->param.i_keyint_max, 1, X264_KEYINT_MAX_INFINITE );
     if( h->param.i_scenecut_threshold < 0 )
         h->param.i_scenecut_threshold = 0;
@@ -1005,7 +1005,7 @@ x264_t *x264_encoder_open( x264_param_t *param )

     CHECKED_MALLOCZERO( h->frames.unused[0], (h->frames.i_delay + 3) * sizeof(x264_frame_t *) );
     /* Allocate room for max refs plus a few extra just in case. */
-    CHECKED_MALLOCZERO( h->frames.unused[1], (h->i_thread_frames + 20) * sizeof(x264_frame_t *) );
+    CHECKED_MALLOCZERO( h->frames.unused[1], (h->i_thread_frames + X264_REF_MAX + 4) * sizeof(x264_frame_t *) );
     CHECKED_MALLOCZERO( h->frames.current, (h->param.i_sync_lookahead + h->param.i_bframe
                         + h->i_thread_frames + 3) * sizeof(x264_frame_t *) );
     if( h->param.analyse.i_weighted_pred > 0 )
@@ -1434,9 +1434,9 @@ int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t

     /* shift the frames to make space for the dupe. */
     h->b_ref_reorder[0] = 1;
-    if( h->i_ref0 < 16 )
+    if( h->i_ref0 < X264_REF_MAX )
         ++h->i_ref0;
-    h->fref0[15] = NULL;
+    h->fref0[X264_REF_MAX-1] = NULL;
     x264_frame_unshift( &h->fref0[j], newframe );

     return j;
@@ -1616,7 +1616,7 @@ static inline void x264_reference_build_list( x264_t *h, int i_poc )
         h->mb.ref_blind_dupe = idx;
     }

-    assert( h->i_ref0 + h->i_ref1 <= 16 );
+    assert( h->i_ref0 + h->i_ref1 <= X264_REF_MAX );
     h->mb.pic.i_fref[0] = h->i_ref0;
     h->mb.pic.i_fref[1] = h->i_ref1;
 }
@@ -2801,7 +2801,7 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
             h->stat.i_mb_pred_mode[i][j] += h->stat.frame.i_mb_pred_mode[i][j];
     if( h->sh.i_type != SLICE_TYPE_I )
         for( int i_list = 0; i_list < 2; i_list++ )
-            for( int i = 0; i < 32; i++ )
+            for( int i = 0; i < X264_REF_MAX*2; i++ )
                 h->stat.i_mb_count_ref[h->sh.i_type][i_list][i] += h->stat.frame.i_mb_count_ref[i_list][i];
     if( h->sh.i_type == SLICE_TYPE_P )
     {
@@ -3169,7 +3169,7 @@ void    x264_encoder_close  ( x264_t *h )
                 char *p = buf;
                 int64_t i_den = 0;
                 int i_max = 0;
-                for( int i = 0; i < 32; i++ )
+                for( int i = 0; i < X264_REF_MAX*2; i++ )
                     if( h->stat.i_mb_count_ref[i_slice][i_list][i] )
                     {
                         i_den += h->stat.i_mb_count_ref[i_slice][i_list][i];
diff --git a/encoder/set.c b/encoder/set.c
index a520b8a..2c93618 100644
--- a/encoder/set.c
+++ b/encoder/set.c
@@ -125,7 +125,7 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
     /* extra slot with pyramid so that we don't have to override the
      * order of forgetting old pictures */
     sps->vui.i_max_dec_frame_buffering =
-    sps->i_num_ref_frames = X264_MIN(16, X264_MAX4(param->i_frame_reference, 1 + sps->vui.i_num_reorder_frames,
+    sps->i_num_ref_frames = X264_MIN(X264_REF_MAX, X264_MAX4(param->i_frame_reference, 1 + sps->vui.i_num_reorder_frames,
                             param->i_bframe_pyramid ? 4 : 1, param->i_dpb_size));
     sps->i_num_ref_frames -= param->i_bframe_pyramid == X264_B_PYRAMID_STRICT;

--
1.7.1


From cd21d0551318972a58a7e497e0321e373f0d1237 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Sat, 21 Aug 2010 00:15:53 -0700
Subject: [PATCH 2/9] CAVLC "trellis"
 ~3-10% improved compression with CAVLC.
 --trellis is now a valid option with CAVLC.
 Perhaps more importantly, this means psy-trellis now works with CAVLC.

This isn't a real trellis; it's actually just a simplified QNS.
But it takes enough shortcuts that it's still roughly as fast as a trellis; just not quite optimal.
Thus the name is a bit of a misnomer, but we're reusing the option name because it does the same thing.
A real trellis would be better, but CAVLC is much harder to trellis than CABAC.
I'm not aware of any published polynomial-time solutions that are significantly close to optimal.
---
 encoder/cavlc.c      |    6 +-
 encoder/encoder.c    |    2 -
 encoder/macroblock.c |    2 +-
 encoder/rdo.c        |  263 ++++++++++++++++++++++++++++++++++++++++++++++---
 x264.c               |    2 +-
 5 files changed, 251 insertions(+), 24 deletions(-)

diff --git a/encoder/cavlc.c b/encoder/cavlc.c
index 6f0b60f..2f7cde9 100644
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -95,7 +95,7 @@ static inline int block_residual_write_cavlc_escape( x264_t *h, int i_suffix_len
             {
 #if RDO_SKIP_BS
                 /* Weight highly against overflows. */
-                s->i_bits_encoded += 1000000;
+                s->i_bits_encoded += 2000;
 #else
                 x264_log(h, X264_LOG_WARNING, "OVERFLOW levelcode=%d is only allowed in High Profile\n", i_level_code );
                 /* clip level, preserving sign */
@@ -113,7 +113,7 @@ static inline int block_residual_write_cavlc_escape( x264_t *h, int i_suffix_len
     return i_suffix_length;
 }

-static int block_residual_write_cavlc( x264_t *h, int i_ctxBlockCat, dctcoef *l, int nC )
+static int block_residual_write_cavlc_internal( x264_t *h, int i_ctxBlockCat, dctcoef *l, int nC )
 {
     bs_t *s = &h->out.bs;
     static const uint8_t ctz_index[8] = {3,0,1,0,2,0,1,0};
@@ -199,7 +199,7 @@ static const uint8_t ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3};
     if( !*nnz )\
         bs_write_vlc( &h->out.bs, x264_coeff0_token[nC] );\
     else\
-        *nnz = block_residual_write_cavlc(h,cat,l,nC);\
+        *nnz = block_residual_write_cavlc_internal(h,cat,l,nC);\
 }

 static void cavlc_qp_delta( x264_t *h )
diff --git a/encoder/encoder.c b/encoder/encoder.c
index f6d9965..f5fe2c5 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -683,8 +683,6 @@ static int x264_validate_parameters( x264_t *h )
         h->param.analyse.intra &= ~X264_ANALYSE_I8x8;
     }
     h->param.analyse.i_chroma_qp_offset = x264_clip3(h->param.analyse.i_chroma_qp_offset, -12, 12);
-    if( !h->param.b_cabac )
-        h->param.analyse.i_trellis = 0;
     h->param.analyse.i_trellis = x264_clip3( h->param.analyse.i_trellis, 0, 2 );
     if( !h->param.analyse.b_psy )
     {
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index 99cb433..4297cfb 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -739,7 +739,7 @@ void x264_macroblock_encode( x264_t *h )
         else if( h->mb.b_transform_8x8 )
         {
             ALIGNED_ARRAY_16( dctcoef, dct8x8,[4],[64] );
-            b_decimate &= !h->mb.b_trellis; // 8x8 trellis is inherently optimal decimation
+            b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC
             h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
             h->nr_count[1] += h->mb.b_noise_reduction * 4;

diff --git a/encoder/rdo.c b/encoder/rdo.c
index d4e6b0c..36ba677 100644
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -410,10 +410,12 @@ typedef struct {
 // comparable to the input. so unquant is the direct inverse of quant,
 // and uses the dct scaling factors, not the idct ones.

-static ALWAYS_INLINE int quant_trellis_cabac( x264_t *h, dctcoef *dct,
-                                 const uint16_t *quant_mf, const int *unquant_mf,
-                                 const int *coef_weight, const uint8_t *zigzag,
-                                 int i_ctxBlockCat, int i_lambda2, int b_ac, int dc, int i_coefs, int idx )
+static ALWAYS_INLINE
+int quant_trellis_cabac( x264_t *h, dctcoef *dct,
+                         const uint16_t *quant_mf, const int *unquant_mf,
+                         const int *coef_weight, const uint8_t *zigzag,
+                         int i_ctxBlockCat, int i_lambda2, int b_ac,
+                         int dc, int i_coefs, int idx )
 {
     int abs_coefs[64], signs[64];
     trellis_node_t nodes[2][8];
@@ -629,35 +631,262 @@ static ALWAYS_INLINE int quant_trellis_cabac( x264_t *h, dctcoef *dct,
     return 1;
 }

+/* FIXME: This is a gigantic hack.  See below.
+ *
+ * CAVLC is much more difficult to trellis than CABAC.
+ *
+ * CABAC has only three states to track: significance map, last, and the
+ * level state machine.
+ * CAVLC, by comparison, has five: coeff_token (trailing ones and total coeffs,
+ * counted separately), total_zeros, run_before, and the level state machine.
+ *
+ * I know of no paper that has managed to design a close-to-optimal trellis
+ * that covers all five of these and isn't exponential-time.  As a result, this
+ * "trellis" isn't: it's just a QNS search.  Patches welcome for something better.
+ * It's actually surprisingly fast, albeit not quite optimal.  It's pretty close
+ * though; since a 16-coefficient block has only 2^16 possible rounding choices
+ * (assuming two roundings per coefficient), a bruteforce search is feasible.
+ * Testing shows that this QNS is reasonably close to optimal in compression.
+ *
+ * TODO:
+ *  Don't bother changing large coefficients when it wouldn't affect bit cost
+ *  (e.g. only affecting bypassed suffix bits).
+ *  Don't re-run all parts of CAVLC bit cost calculation when not necessary.
+ *  e.g. when changing a coefficient from one non-zero value to another in
+ *  such a way that trailing ones and suffix length aren't affected. */
+static ALWAYS_INLINE
+int quant_trellis_cavlc( x264_t *h, dctcoef *dct,
+                         const uint16_t *quant_mf, const int *unquant_mf,
+                         const int *coef_weight, const uint8_t *zigzag,
+                         int i_ctxBlockCat, int i_lambda2, int b_ac,
+                         int dc, int i_coefs, int idx, int b_8x8 )
+{
+    ALIGNED_16( dctcoef quant_coefs[2][16] );
+    ALIGNED_16( dctcoef coefs[16] ) = {0};
+    int delta_distortion[16];
+    int64_t score = 1ULL<<62;
+    int i, j;
+    const int f = 1<<15;
+    int nC = i_ctxBlockCat == DCT_CHROMA_DC ? 4 : ct_index[x264_mb_predict_non_zero_code( h, i_ctxBlockCat == DCT_LUMA_DC ? 0 : idx )];
+
+    /* Code for handling 8x8dct -> 4x4dct CAVLC munging.  Input/output use a different
+     * step/start/end than internal processing. */
+    int step = 1;
+    int start = b_ac;
+    int end = i_coefs - 1;
+    if( b_8x8 )
+    {
+        start = idx&3;
+        end = 60 + start;
+        step = 4;
+    }
+
+    i_lambda2 <<= LAMBDA_BITS;
+
+    /* Find last non-zero coefficient. */
+    for( i = end; i >= start; i -= step )
+        if( (unsigned)(dct[zigzag[i]] * (dc?quant_mf[0]>>1:quant_mf[zigzag[i]]) + f-1) >= 2*f )
+            break;
+
+    if( i < start )
+        goto zeroblock;
+
+    /* Prepare for QNS search: calculate distortion caused by each DCT coefficient
+     * rounding to be searched.
+     *
+     * We only search two roundings (nearest and nearest-1) like in CABAC trellis,
+     * so we just store the difference in distortion between them. */
+    int i_last_nnz = b_8x8 ? i >> 2 : i;
+    int coef_mask = 0;
+    int round_mask = 0;
+    for( i = b_ac, j = start; i <= i_last_nnz; i++, j += step )
+    {
+        int coef = dct[zigzag[j]];
+        int abs_coef = abs(coef);
+        int sign = coef < 0 ? -1 : 1;
+        int nearest_quant = ( f + abs_coef * (dc?quant_mf[0]>>1:quant_mf[zigzag[j]]) ) >> 16;
+        quant_coefs[1][i] = quant_coefs[0][i] = sign * nearest_quant;
+        coefs[i] = quant_coefs[1][i];
+        if( nearest_quant )
+        {
+            /* We initialize the trellis with a deadzone halfway between nearest rounding
+             * and always-round-down.  This gives much better results than initializing to either
+             * extreme.
+             * FIXME: should we initialize to the deadzones used by deadzone quant? */
+            int deadzone_quant = ( f/2 + abs_coef * (dc?quant_mf[0]>>1:quant_mf[zigzag[j]]) ) >> 16;
+            int unquant1 = (((dc?unquant_mf[0]<<1:unquant_mf[zigzag[j]]) * (nearest_quant-0) + 128) >> 8);
+            int unquant0 = (((dc?unquant_mf[0]<<1:unquant_mf[zigzag[j]]) * (nearest_quant-1) + 128) >> 8);
+            int d1 = abs_coef - unquant1;
+            int d0 = abs_coef - unquant0;
+            delta_distortion[i] = (d0*d0 - d1*d1) * (dc?256:coef_weight[j]);
+
+            /* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. */
+            if( h->mb.i_psy_trellis && j && !dc && i_ctxBlockCat != DCT_CHROMA_AC )
+            {
+                int orig_coef = b_8x8 ? h->mb.pic.fenc_dct8[idx>>2][zigzag[j]] : h->mb.pic.fenc_dct4[idx][zigzag[j]];
+                int predicted_coef = orig_coef - coef;
+                int psy_weight = b_8x8 ? x264_dct8_weight_tab[zigzag[j]] : x264_dct4_weight_tab[zigzag[j]];
+                int psy_value0 = h->mb.i_psy_trellis * abs(predicted_coef + unquant0 * sign);
+                int psy_value1 = h->mb.i_psy_trellis * abs(predicted_coef + unquant1 * sign);
+                delta_distortion[i] += (psy_value0 - psy_value1) * psy_weight;
+            }
+
+            quant_coefs[0][i] = sign * (nearest_quant-1);
+            if( deadzone_quant != nearest_quant )
+                coefs[i] = quant_coefs[0][i];
+            else
+                round_mask |= 1 << i;
+        }
+        else
+            delta_distortion[i] = 0;
+        coef_mask |= (!!coefs[i]) << i;
+    }
+
+    /* Calculate the cost of the starting state. */
+    h->out.bs.i_bits_encoded = 0;
+    if( !coef_mask )
+        bs_write_vlc( &h->out.bs, x264_coeff0_token[nC] );
+    else
+        block_residual_write_cavlc_internal( h, i_ctxBlockCat, coefs + b_ac, nC );
+    score = (int64_t)h->out.bs.i_bits_encoded * i_lambda2;
+
+    /* QNS loop: pick the change that improves RD the most, apply it, repeat.
+     * coef_mask and round_mask are used to simplify tracking of nonzeroness
+     * and rounding modes chosen. */
+    while( 1 )
+    {
+        int64_t iter_score = score;
+        int iter_distortion_delta = 0;
+        int iter_coef = -1;
+        int iter_mask = coef_mask;
+        int iter_round = round_mask;
+        for( i = b_ac; i <= i_last_nnz; i++ )
+        {
+            if( !delta_distortion[i] )
+                continue;
+
+            /* Set up all the variables for this iteration. */
+            int cur_round = round_mask ^ (1 << i);
+            int round_change = (cur_round >> i)&1;
+            int old_coef = coefs[i];
+            int new_coef = quant_coefs[round_change][i];
+            int cur_mask = (coef_mask&~(1 << i))|(!!new_coef << i);
+            int cur_distortion_delta = delta_distortion[i] * (round_change ? -1 : 1);
+            int64_t cur_score = cur_distortion_delta;
+            coefs[i] = new_coef;
+
+            /* Count up bits. */
+            h->out.bs.i_bits_encoded = 0;
+            if( !cur_mask )
+                bs_write_vlc( &h->out.bs, x264_coeff0_token[nC] );
+            else
+                block_residual_write_cavlc_internal( h, i_ctxBlockCat, coefs + b_ac, nC );
+            cur_score += (int64_t)h->out.bs.i_bits_encoded * i_lambda2;
+
+            coefs[i] = old_coef;
+            if( cur_score < iter_score )
+            {
+                iter_score = cur_score;
+                iter_coef = i;
+                iter_mask = cur_mask;
+                iter_round = cur_round;
+                iter_distortion_delta = cur_distortion_delta;
+            }
+        }
+        if( iter_coef >= 0 )
+        {
+            score = iter_score - iter_distortion_delta;
+            coef_mask = iter_mask;
+            round_mask = iter_round;
+            coefs[iter_coef] = quant_coefs[((round_mask >> iter_coef)&1)][iter_coef];
+            /* Don't try adjusting coefficients we've already adjusted.
+             * Testing suggests this doesn't hurt results -- and sometimes actually helps. */
+            delta_distortion[iter_coef] = 0;
+        }
+        else
+            break;
+    }
+
+    if( coef_mask )
+    {
+        for( i = b_ac, j = start; i <= i_last_nnz; i++, j += step )
+            dct[zigzag[j]] = coefs[i];
+        for( ; j <= end; j += step )
+            dct[zigzag[j]] = 0;
+        return 1;
+    }
+
+zeroblock:
+    if( !dc )
+    {
+        if( b_8x8 )
+            for( i = start; i <= end; i+=step )
+                dct[zigzag[i]] = 0;
+        else
+            memset( dct, 0, 16*sizeof(dctcoef) );
+    }
+    return 0;
+}
+
 const static uint8_t x264_zigzag_scan2[4] = {0,1,2,3};

 int x264_quant_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
                            int i_qp, int i_ctxBlockCat, int b_intra, int b_chroma )
 {
-    return quant_trellis_cabac( h, dct,
+    if( h->param.b_cabac )
+        return quant_trellis_cabac( h, dct,
+            h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
+            NULL, i_ctxBlockCat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[h->mb.b_interlaced],
+            i_ctxBlockCat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, 1, i_ctxBlockCat==DCT_CHROMA_DC ? 4 : 16, 0 );
+
+    return quant_trellis_cavlc( h, dct,
         h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
         NULL, i_ctxBlockCat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[h->mb.b_interlaced],
-        i_ctxBlockCat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, 1, i_ctxBlockCat==DCT_CHROMA_DC ? 4 : 16, 0 );
+        i_ctxBlockCat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, 1, i_ctxBlockCat==DCT_CHROMA_DC ? 4 : 16, 0, 0 );
 }

 int x264_quant_4x4_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
                             int i_qp, int i_ctxBlockCat, int b_intra, int b_chroma, int idx )
 {
     int b_ac = (i_ctxBlockCat == DCT_LUMA_AC || i_ctxBlockCat == DCT_CHROMA_AC);
-    return quant_trellis_cabac( h, dct,
-        h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
-        x264_dct4_weight2_zigzag[h->mb.b_interlaced],
-        x264_zigzag_scan4[h->mb.b_interlaced],
-        i_ctxBlockCat, h->mb.i_trellis_lambda2[b_chroma][b_intra], b_ac, 0, 16, idx );
+    if( h->param.b_cabac )
+        return quant_trellis_cabac( h, dct,
+            h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
+            x264_dct4_weight2_zigzag[h->mb.b_interlaced],
+            x264_zigzag_scan4[h->mb.b_interlaced],
+            i_ctxBlockCat, h->mb.i_trellis_lambda2[b_chroma][b_intra], b_ac, 0, 16, idx );
+
+    return quant_trellis_cavlc( h, dct,
+            h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
+            x264_dct4_weight2_zigzag[h->mb.b_interlaced],
+            x264_zigzag_scan4[h->mb.b_interlaced],
+            i_ctxBlockCat, h->mb.i_trellis_lambda2[b_chroma][b_intra], b_ac, 0, 16, idx, 0 );
 }

 int x264_quant_8x8_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
                             int i_qp, int b_intra, int idx )
 {
-    return quant_trellis_cabac( h, dct,
-        h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
-        x264_dct8_weight2_zigzag[h->mb.b_interlaced],
-        x264_zigzag_scan8[h->mb.b_interlaced],
-        DCT_LUMA_8x8, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 64, idx );
-}
+    if( h->param.b_cabac )
+    {
+        return quant_trellis_cabac( h, dct,
+            h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
+            x264_dct8_weight2_zigzag[h->mb.b_interlaced],
+            x264_zigzag_scan8[h->mb.b_interlaced],
+            DCT_LUMA_8x8, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 64, idx );
+    }

+    /* 8x8 CAVLC is split into 4 4x4 blocks */
+    int nzaccum = 0;
+    for( int i = 0; i < 4; i++ )
+    {
+        int nz = quant_trellis_cavlc( h, dct,
+            h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
+            x264_dct8_weight2_zigzag[h->mb.b_interlaced],
+            x264_zigzag_scan8[h->mb.b_interlaced],
+            DCT_LUMA_4x4, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 16, idx*4+i, 1 );
+        /* Set up nonzero count for future calls */
+        h->mb.cache.non_zero_count[x264_scan8[idx*4+i]] = nz;
+        nzaccum |= nz;
+    }
+    return nzaccum;
+}
diff --git a/x264.c b/x264.c
index 9c3ce5e..7d98518 100644
--- a/x264.c
+++ b/x264.c
@@ -595,7 +595,7 @@ static void Help( x264_param_t *defaults, int longhelp )
     H2( "      --no-mixed-refs         Don't decide references on a per partition basis\n" );
     H2( "      --no-chroma-me          Ignore chroma in motion estimation\n" );
     H1( "      --no-8x8dct             Disable adaptive spatial transform size\n" );
-    H1( "  -t, --trellis <integer>     Trellis RD quantization. Requires CABAC. [%d]\n"
+    H1( "  -t, --trellis <integer>     Trellis RD quantization. [%d]\n"
         "                                  - 0: disabled\n"
         "                                  - 1: enabled only on the final encode of a MB\n"
         "                                  - 2: enabled on all mode decisions\n", defaults->analyse.i_trellis );
--
1.7.1


From 5b8f40714b10df5a5bf24ebb6be530a8458e2fdf Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Thu, 26 Aug 2010 09:12:01 -0400
Subject: [PATCH 3/9] Don't do deblock-aware RD if deblocking is off

---
 encoder/analyse.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/encoder/analyse.c b/encoder/analyse.c
index fdc2498..3ddd3f0 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -357,7 +357,7 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
     /* mbrd == 2 -> RD refinement */
     /* mbrd == 3 -> QPRD */
     a->i_mbrd = (subme>=6) + (subme>=8) + (h->param.analyse.i_subpel_refine>=10);
-    h->mb.b_deblock_rdo = h->param.analyse.i_subpel_refine >= 9;
+    h->mb.b_deblock_rdo = h->param.analyse.i_subpel_refine >= 9 && h->sh.i_disable_deblocking_filter_idc != 1;

     x264_mb_analyse_init_qp( h, a, i_qp );

--
1.7.1


From 5978cbc53dec1e7023b2ba9c9f9ce6ed24ffc68b Mon Sep 17 00:00:00 2001
From: Anton Mitrofanov <BugMaster@narod.ru>
Date: Sun, 29 Aug 2010 16:35:32 +0400
Subject: [PATCH 4/9] Fix bug in 2pass if the first P-frames are all skip
 last_qscale_for was read before being initialized in this case, resulting
 in the value from the previous iteration being used instead.

---
 encoder/ratecontrol.c |    5 +++++
 1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index d9d118a..cf51f37 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -2518,6 +2518,7 @@ static int init_pass2( x264_t *h )
     const int filter_size = (int)(qblur*4) | 1;
     double expected_bits;
     double *qscale, *blurred_qscale;
+    double base_cplx = h->mb.i_mb_count * (h->param.i_bframe ? 120 : 80);

     /* find total/average complexity & const_bits */
     for( int i = 0; i < rcc->num_entries; i++ )
@@ -2602,6 +2603,10 @@ static int init_pass2( x264_t *h )
         rcc->last_accum_p_norm = 1;
         rcc->accum_p_norm = 0;

+        rcc->last_qscale_for[0] =
+        rcc->last_qscale_for[1] =
+        rcc->last_qscale_for[2] = pow( base_cplx, 1 - rcc->qcompress ) / rate_factor;
+
         /* find qscale */
         for( int i = 0; i < rcc->num_entries; i++ )
         {
--
1.7.1


From 26f9e9417034eaccccc7ec0bc225eaef3f0f4de0 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Sun, 29 Aug 2010 22:18:07 -0700
Subject: [PATCH 5/9] Faster cabac_encode_ue_bypass
 Use CLZ + a lut instead of a loop.

---
 common/cabac.c |   15 ++++++++++-----
 1 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/common/cabac.c b/common/cabac.c
index d0888d0..cd57d90 100644
--- a/common/cabac.c
+++ b/common/cabac.c
@@ -850,14 +850,19 @@ void x264_cabac_encode_bypass_c( x264_cabac_t *cb, int b )
     x264_cabac_putbyte( cb );
 }

+static const int bypass_lut[16] =
+{
+    -1,      0x2,     0x14,     0x68,     0x1d0,     0x7a0,     0x1f40,     0x7e80,
+    0x1fd00, 0x7fa00, 0x1ff400, 0x7fe800, 0x1ffd000, 0x7ffa000, 0x1fff4000, 0x7ffe8000
+};
+
 void x264_cabac_encode_ue_bypass( x264_cabac_t *cb, int exp_bits, int val )
 {
-    int k, i;
-    for( k = exp_bits; val >= (1<<k); k++ )
-        val -= 1<<k;
-    uint32_t x = (((1<<(k-exp_bits))-1)<<(k+1))+val;
+    uint32_t v = val + (1<<exp_bits);
+    int k = 31 - x264_clz( v );
+    uint32_t x = (bypass_lut[k-exp_bits]<<exp_bits) + v;
     k = 2*k+1-exp_bits;
-    i = ((k-1)&7)+1;
+    int i = ((k-1)&7)+1;
     do {
         k -= i;
         cb->i_low <<= i;
--
1.7.1


From 4c6ed36e092bb4fd3fb86668c34a07a9abfc170d Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Mon, 30 Aug 2010 12:32:31 -0700
Subject: [PATCH 6/9] Use POC type 2 for streams with no B-frames
 Saves a few bits per slice header.

---
 encoder/set.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/encoder/set.c b/encoder/set.c
index 2c93618..2b3bbce 100644
--- a/encoder/set.c
+++ b/encoder/set.c
@@ -135,7 +135,7 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
     while( (1 << sps->i_log2_max_frame_num) <= max_frame_num )
         sps->i_log2_max_frame_num++;

-    sps->i_poc_type = 0;
+    sps->i_poc_type = param->i_bframe ? 0 : 2;
     if( sps->i_poc_type == 0 )
     {
         int max_delta_poc = (param->i_bframe + 2) * (!!param->i_bframe_pyramid + 1) * 2;
--
1.7.1


From 59557dc35a4d70a8ebaec969f83f4de043c58b31 Mon Sep 17 00:00:00 2001
From: Takashi Hirata <silverfilain@gmail.com>
Date: Mon, 30 Aug 2010 18:13:49 +0900
Subject: [PATCH 7/9] Add support for level 1b
 This level is a stupid hack in the H.264 spec, so it's a stupid hack in x264 too.
 Since level is an integer, calling applications need to set level_idc=9 to use it.
 String-based option handling will accept "1b" just fine though, so CLI users don't have to worry.

---
 common/common.c   |    4 +++-
 common/set.h      |    1 +
 encoder/encoder.c |   12 ++++++++----
 encoder/set.c     |   17 ++++++++++++++---
 4 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/common/common.c b/common/common.c
index 47fcaa2..b0bb4e7 100644
--- a/common/common.c
+++ b/common/common.c
@@ -603,7 +603,9 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
         p->b_deterministic = atobool(value);
     OPT2("level", "level-idc")
     {
-        if( atof(value) < 6 )
+        if( !strcmp(value, "1b") )
+            p->i_level_idc = 9;
+        else if( atof(value) < 6 )
             p->i_level_idc = (int)(10*atof(value)+.5);
         else
             p->i_level_idc = atoi(value);
diff --git a/common/set.h b/common/set.h
index ee27d74..6625ae4 100644
--- a/common/set.h
+++ b/common/set.h
@@ -59,6 +59,7 @@ typedef struct
     int b_constraint_set0;
     int b_constraint_set1;
     int b_constraint_set2;
+    int b_constraint_set3;

     int i_log2_max_frame_num;

diff --git a/encoder/encoder.c b/encoder/encoder.c
index f5fe2c5..2f8626c 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -1157,16 +1157,20 @@ x264_t *x264_encoder_open( x264_param_t *param )
                           h->sps->i_profile_idc == PROFILE_HIGH ? "High" :
                           h->sps->i_profile_idc == PROFILE_HIGH10 ? "High 10" :
                           "High 4:4:4 Predictive";
+    char level[4];
+    snprintf( level, sizeof(level), "%d.%d", h->sps->i_level_idc/10, h->sps->i_level_idc%10 );
+    if( h->sps->i_level_idc == 9 || ( h->sps->i_level_idc == 11 && h->sps->b_constraint_set3 ) )
+        strcpy( level, "1b" );

     if( h->sps->i_profile_idc < PROFILE_HIGH10 )
     {
-        x264_log( h, X264_LOG_INFO, "profile %s, level %d.%d\n",
-            profile, h->sps->i_level_idc/10, h->sps->i_level_idc%10 );
+        x264_log( h, X264_LOG_INFO, "profile %s, level %s\n",
+            profile, level );
     }
     else
     {
-        x264_log( h, X264_LOG_INFO, "profile %s, level %d.%d, bit depth %d\n",
-            profile, h->sps->i_level_idc/10, h->sps->i_level_idc%10, BIT_DEPTH );
+        x264_log( h, X264_LOG_INFO, "profile %s, level %s, bit depth %d\n",
+            profile, level, BIT_DEPTH );
     }

     return h;
diff --git a/encoder/set.c b/encoder/set.c
index 2b3bbce..3dee484 100644
--- a/encoder/set.c
+++ b/encoder/set.c
@@ -112,7 +112,6 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
         sps->i_profile_idc  = PROFILE_MAIN;
     else
         sps->i_profile_idc  = PROFILE_BASELINE;
-    sps->i_level_idc = param->i_level_idc;

     sps->b_constraint_set0  = sps->i_profile_idc == PROFILE_BASELINE;
     /* x264 doesn't support the features that are in Baseline and not in Main,
@@ -121,6 +120,17 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
     /* Never set constraint_set2, it is not necessary and not used in real world. */
     sps->b_constraint_set2  = 0;

+    if( param->i_level_idc == 9 && ( sps->i_profile_idc >= PROFILE_BASELINE && sps->i_profile_idc <= PROFILE_EXTENDED ) )
+    {
+        sps->b_constraint_set3 = 1; /* level 1b with Baseline, Main or Extended profile is signalled via constraint_set3 */
+        sps->i_level_idc      = 11;
+    }
+    else
+    {
+        sps->b_constraint_set3 = 0;
+        sps->i_level_idc = param->i_level_idc;
+    }
+
     sps->vui.i_num_reorder_frames = param->i_bframe_pyramid ? 2 : param->i_bframe ? 1 : 0;
     /* extra slot with pyramid so that we don't have to override the
      * order of forgetting old pictures */
@@ -252,8 +262,9 @@ void x264_sps_write( bs_t *s, x264_sps_t *sps )
     bs_write( s, 1, sps->b_constraint_set0 );
     bs_write( s, 1, sps->b_constraint_set1 );
     bs_write( s, 1, sps->b_constraint_set2 );
+    bs_write( s, 1, sps->b_constraint_set3 );

-    bs_write( s, 5, 0 );    /* reserved */
+    bs_write( s, 4, 0 );    /* reserved */

     bs_write( s, 8, sps->i_level_idc );

@@ -640,7 +651,7 @@ void x264_filler_write( x264_t *h, bs_t *s, int filler )
 const x264_level_t x264_levels[] =
 {
     { 10,   1485,    99,   152064,     64,    175,  64, 64,  0, 2, 0, 0, 1 },
-//  {"1b",  1485,    99,   152064,    128,    350,  64, 64,  0, 2, 0, 0, 1 },
+    {  9,   1485,    99,   152064,    128,    350,  64, 64,  0, 2, 0, 0, 1 }, /* "1b" */
     { 11,   3000,   396,   345600,    192,    500, 128, 64,  0, 2, 0, 0, 1 },
     { 12,   6000,   396,   912384,    384,   1000, 128, 64,  0, 2, 0, 0, 1 },
     { 13,  11880,   396,   912384,    768,   2000, 128, 64,  0, 2, 0, 0, 1 },
--
1.7.1


From 1572fda2ac8080f615e7bce85f8b556c292afdf9 Mon Sep 17 00:00:00 2001
From: Anton Mitrofanov <BugMaster@narod.ru>
Date: Tue, 31 Aug 2010 08:45:22 -0700
Subject: [PATCH 8/9] Allow --demuxer forcing with known extensions

---
 x264.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/x264.c b/x264.c
index 7d98518..bf2b3ee 100644
--- a/x264.c
+++ b/x264.c
@@ -933,9 +933,9 @@ static int select_output( const char *muxer, char *filename, x264_param_t *param
 static int select_input( const char *demuxer, char *used_demuxer, char *filename,
                          hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
 {
-    const char *ext = get_filename_extension( filename );
-    int b_regular = strcmp( filename, "-" );
     int b_auto = !strcasecmp( demuxer, "auto" );
+    const char *ext = b_auto ? get_filename_extension( filename ) : "";
+    int b_regular = strcmp( filename, "-" );
     if( !b_regular && b_auto )
         ext = "raw";
     b_regular = b_regular && x264_is_regular_file_path( filename );
--
1.7.1


From bcb3c527f918864b26094732de05f466b91633f8 Mon Sep 17 00:00:00 2001
From: Henrik Gramner <hengar-6@student.ltu.se>
Date: Wed, 1 Sep 2010 00:53:42 +0200
Subject: [PATCH 9/9] Faster nal_escape asm

---
 common/x86/bitstream-a.asm |   77 ++++++++++++++++++++++++++-----------------
 1 files changed, 46 insertions(+), 31 deletions(-)

diff --git a/common/x86/bitstream-a.asm b/common/x86/bitstream-a.asm
index 69a47a7..25b426a 100644
--- a/common/x86/bitstream-a.asm
+++ b/common/x86/bitstream-a.asm
@@ -30,74 +30,89 @@ SECTION .text
 ;-----------------------------------------------------------------------------

 %macro NAL_LOOP 2
+%1_escape:
+    ; Detect false positive to avoid unnecessary escape loop
+    xor      r3d, r3d
+    cmp byte [r0+r1-1], 0
+    setnz    r3b
+    xor      r3d, r4d
+    jnz .escape
+    jmp %1_continue
 ALIGN 16
 %1:
-    mova      m0, [r1+r2]
-    mova      m1, m0
-%if mmsize == 8
-    psllq     m0, 8
-%else
-    pslldq    m0, 1
-%endif
-    %2   [r0+r1], m1
-    por       m1, m0
-    pcmpeqb   m1, m2
+    mova      m3, m1
+    mova      m2, m0
+    pcmpeqb   m1, m4
+    pcmpeqb   m0, m4
     pmovmskb r3d, m1
-    test     r3d, r3d
-    jnz .escape
-    add       r1, mmsize
+    %2   [r0+r1], m2
+    pmovmskb r4d, m0
+    shl      r3d, mmsize
+    mova      m0, [r1+r2+2*mmsize]
+    or       r4d, r3d
+    mova      m1, [r1+r2+3*mmsize]
+    lea      r3d, [r4+r4+1]
+    %2 [r0+r1+mmsize], m3
+    and      r4d, r3d
+    jnz %1_escape
+%1_continue:
+    add       r1, 2*mmsize
     jl %1
 %endmacro

 %macro NAL_ESCAPE 1

 cglobal nal_escape_%1, 3,5
-    pxor      m2, m2
+    mov      r3w, [r1]
     sub       r1, r2 ; r1 = offset of current src pointer from end of src
+    pxor      m4, m4
     sub       r0, r1 ; r0 = projected end of dst, assuming no more escapes
-
-    mov      r3b, [r1+r2]
-    mov  [r0+r1], r3b
-    inc       r1
+    mov  [r0+r1], r3w
+    add       r1, 2
     jge .ret

     ; Start off by jumping into the escape loop in
     ; case there's an escape at the start.
     ; And do a few more in scalar until src is aligned again.
-    lea      r4d, [r1+r2]
-    or       r4d, -mmsize
-    neg      r4d
     jmp .first_escape

     NAL_LOOP .loop_aligned, mova
 %if mmsize==16
+    jmp .ret
     NAL_LOOP .loop_unaligned, movu
 %endif
-
 .ret:
     movifnidn rax, r0
     RET
+
 ALIGN 16
 .escape:
-    mov      r4d, mmsize
-.first_escape:
-    mov      r3b, [r1+r2]
+    ; Skip bytes that are known to be valid
+    and      r4d, r3d
+    bsf      r3d, r4d
+    add       r1, r3
 .escape_loop:
-    mov  [r0+r1], r3b
-    inc      r1
+    inc       r1
     jge .ret
-    mov      r3b, [r1+r2]
-    cmp      r3b, 3
+.first_escape:
+    movzx    r3d, byte [r1+r2]
+    lea       r4, [r1+r2]
+    cmp      r3d, 3
     jna .escape_check
 .no_escape:
-    dec      r4d
-    jg .escape_loop
+    mov  [r0+r1], r3b
+    test     r4d, mmsize-1 ; Do SIMD when src is aligned
+    jnz .escape_loop
+    mova      m0, [r4]
+    mova      m1, [r4+mmsize]
 %if mmsize==16
     lea      r4d, [r0+r1]
     test     r4d, mmsize-1
     jnz .loop_unaligned
 %endif
     jmp .loop_aligned
+
+ALIGN 16
 .escape_check:
     cmp word [r0+r1-2], 0
     jnz .no_escape
--
1.7.1