Untitled

From 167c2760cbc36911302ace046db5eef6fe1ea54a Mon Sep 17 00:00:00 2001
From: Simon Horlick <simonhorlick@gmail.com>
Date: Tue, 11 Jan 2011 20:05:54 +0000
Subject: [PATCH 01/25] Save interlace decision for all macroblocks

---
 common/common.h     |    1 +
 common/frame.c      |    3 +++
 common/frame.h      |    1 +
 common/macroblock.c |    1 +
 encoder/encoder.c   |    8 ++++++++
 5 files changed, 14 insertions(+), 0 deletions(-)

diff --git a/common/common.h b/common/common.h
index 868f526..231254f 100644
--- a/common/common.h
+++ b/common/common.h
@@ -617,6 +617,7 @@ struct x264_t
         int8_t  *mb_transform_size;         /* transform_size_8x8_flag of each mb */
         uint16_t *slice_table;              /* sh->first_mb of the slice that the indexed mb is part of
                                              * NOTE: this will fail on resolutions above 2^16 MBs... */
+        uint8_t *field;

          /* buffer for weighted versions of the reference frames */
         pixel *p_weight_buf[X264_REF_MAX];
diff --git a/common/frame.c b/common/frame.c
index ca90539..eff8ca5 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -145,6 +145,8 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
                             frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
             frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
         }
+        if( h->param.b_interlaced )
+            CHECKED_MALLOC( frame->field, i_mb_count * sizeof(uint8_t) );
     }
     else /* fenc frame */
     {
@@ -219,6 +221,7 @@ void x264_frame_delete( x264_frame_t *frame )
         x264_free( frame->i_inv_qscale_factor );
         x264_free( frame->i_row_bits );
         x264_free( frame->f_row_qp );
+        x264_free( frame->field );
         x264_free( frame->mb_type );
         x264_free( frame->mb_partition );
         x264_free( frame->mv[0] );
diff --git a/common/frame.h b/common/frame.h
index 38d0bf2..0e0ab3d 100644
--- a/common/frame.h
+++ b/common/frame.h
@@ -92,6 +92,7 @@ typedef struct x264_frame
     int16_t (*mv[2])[2];
     int16_t (*mv16x16)[2];
     int16_t (*lowres_mvs[2][X264_BFRAME_MAX+1])[2];
+    uint8_t *field;

     /* Stored as (lists_used << LOWRES_COST_SHIFT) + (cost).
      * Doesn't need special addressing for intra cost because
diff --git a/common/macroblock.c b/common/macroblock.c
index 24c2af9..569d544 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -371,6 +371,7 @@ void x264_macroblock_slice_init( x264_t *h )
     h->mb.ref[1] = h->fdec->ref[1];
     h->mb.type = h->fdec->mb_type;
     h->mb.partition = h->fdec->mb_partition;
+    h->mb.field = h->fdec->field;

     h->fdec->i_ref[0] = h->i_ref[0];
     h->fdec->i_ref[1] = h->i_ref[1];
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 67646df..af1342b 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -1956,6 +1956,14 @@ static int x264_slice_write( x264_t *h )
             }
         }

+        if( h->param.b_interlaced )
+        {
+            if( !(i_mb_y&1) )
+                h->mb.b_interlaced = 1;
+            x264_zigzag_init( h->param.cpu, &h->zigzagf, h->mb.b_interlaced );
+            h->mb.field[mb_xy] = h->mb.b_interlaced;
+        }
+
         if( i_mb_x == 0 && !h->mb.b_reencode_mb )
             x264_fdec_filter_row( h, i_mb_y, 1 );

--
1.7.4


From 2847b697f1fdbee1b8c3128895f2a50c1cba606e Mon Sep 17 00:00:00 2001
From: Simon Horlick <simonhorlick@gmail.com>
Date: Tue, 11 Jan 2011 20:09:00 +0000
Subject: [PATCH 02/25] Disable adaptive mbaff when subme 0 is used

---
 common/common.h   |    1 +
 encoder/encoder.c |   13 ++++++++++---
 x264.h            |    1 +
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/common/common.h b/common/common.h
index 231254f..75c4d59 100644
--- a/common/common.h
+++ b/common/common.h
@@ -568,6 +568,7 @@ struct x264_t
         int     i_psy_trellis; /* Psy trellis strength--fixed point value*/

         int     b_interlaced;
+        int     b_adaptive_mbaff;

         /* Allowed qpel MV range to stay within the picture + emulated edge pixels */
         int     mv_min[2];
diff --git a/encoder/encoder.c b/encoder/encoder.c
index af1342b..9f294d1 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -606,6 +606,10 @@ static int x264_validate_parameters( x264_t *h )
         x264_log( h, X264_LOG_WARNING, "subme=0 + direct=temporal is not supported\n" );
         h->param.analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL;
     }
+    /* Adaptive MBAFF and subme 0 are not supported as motion vectors between
+     * field macroblocks and frame macroblocks require halving and hpel pixels.
+     * The chosen solution is to make MBAFF non-adaptive in this case. */
+    h->mb.b_adaptive_mbaff = !(h->param.b_interlaced && !h->param.analyse.i_subpel_refine);
     h->param.i_bframe = x264_clip3( h->param.i_bframe, 0, X264_MIN( X264_BFRAME_MAX, h->param.i_keyint_max-1 ) );
     h->param.i_open_gop = x264_clip3( h->param.i_open_gop, X264_OPEN_GOP_NONE, X264_OPEN_GOP_BLURAY );
     h->param.i_bframe_bias = x264_clip3( h->param.i_bframe_bias, -90, 100 );
@@ -1958,9 +1962,12 @@ static int x264_slice_write( x264_t *h )

         if( h->param.b_interlaced )
         {
-            if( !(i_mb_y&1) )
-                h->mb.b_interlaced = 1;
-            x264_zigzag_init( h->param.cpu, &h->zigzagf, h->mb.b_interlaced );
+            if( h->mb.b_adaptive_mbaff )
+            {
+                if( !(i_mb_y&1) )
+                    h->mb.b_interlaced = 1;
+                x264_zigzag_init( h->param.cpu, &h->zigzagf, h->mb.b_interlaced );
+            }
             h->mb.field[mb_xy] = h->mb.b_interlaced;
         }

diff --git a/x264.h b/x264.h
index 24c3792..da8746b 100644
--- a/x264.h
+++ b/x264.h
@@ -291,6 +291,7 @@ typedef struct x264_param_t
     int         i_cabac_init_idc;

     int         b_interlaced;
+    int         b_adaptive_mbaff; /* MBAFF+subme 0 require non-adaptive MBAFF i.e. all field mbs */
     int         b_constrained_intra;

     int         i_cqm_preset;
--
1.7.4


From eb50f5f3757d825b2664e7991b89c7647605dd28 Mon Sep 17 00:00:00 2001
From: Simon Horlick <simonhorlick@gmail.com>
Date: Tue, 11 Jan 2011 20:16:18 +0000
Subject: [PATCH 03/25] Store left references in a table

---
 common/common.h     |    1 +
 common/macroblock.c |   56 ++++++++++++++++++++++++++++++++------------------
 2 files changed, 37 insertions(+), 20 deletions(-)

diff --git a/common/common.h b/common/common.h
index 75c4d59..992ba9c 100644
--- a/common/common.h
+++ b/common/common.h
@@ -596,6 +596,7 @@ struct x264_t
         int     i_mb_top_xy;
         int     i_mb_topleft_xy;
         int     i_mb_topright_xy;
+        int     *left_index_table;

         /**** thread synchronization ends here ****/
         /* subsequent variables are either thread-local or constant,
diff --git a/common/macroblock.c b/common/macroblock.c
index 569d544..d10b19f 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -550,6 +550,18 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x
         }
 }

+static const int left_indices[5][22] = {
+/*    intra modes     nnz                                          mv          ref    real indices */
+    /* Current is progressive. */
+    { 4, 4, 5, 5,     3,  3,  7,  7, 16+1, 16+1, 16+4+1, 16+4+1,   0, 0, 1, 1, 0, 0,  0, 0, 1, 1 },
+    { 6, 6, 3, 3,    11, 11, 15, 15, 16+3, 16+3, 16+4+3, 16+4+3,   2, 2, 3, 3, 1, 1,  2, 2, 3, 3 },
+    /* Current is interlaced.*/
+    { 4, 6, 4, 6,     3, 11,  3, 11, 16+1, 16+1, 16+4+1, 16+4+1,   0, 2, 0, 2, 0, 0,  0, 2, 0, 2 },
+    { 4, 6, 4, 6,     3, 11,  3, 11, 16+1, 16+1, 16+4+1, 16+4+1,   0, 2, 0, 2, 0, 0,  0, 2, 0, 2 },
+    /*Both same.*/
+    { 4, 5, 6, 3,     3,  7, 11, 15, 16+1, 16+3, 16+4+1, 16+4+3,   0, 1, 2, 3, 0, 1,  0, 1, 2, 3 },
+};
+
 static void inline x264_macroblock_cache_load_neighbours( x264_t *h, int mb_x, int mb_y )
 {
     int top = (mb_y - (1 << h->mb.b_interlaced)) * h->mb.i_mb_stride + mb_x;
@@ -570,6 +582,7 @@ static void inline x264_macroblock_cache_load_neighbours( x264_t *h, int mb_x, i
     h->mb.i_mb_type_left = -1;
     h->mb.i_mb_type_topleft = -1;
     h->mb.i_mb_type_topright = -1;
+    h->mb.left_index_table = left_indices[4];

     if( mb_x > 0 )
     {
@@ -661,6 +674,8 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
     uint8_t (*nnz)[24] = h->mb.non_zero_count;
     int16_t *cbp = h->mb.cbp;

+    int *left_index_table = h->mb.left_index_table;
+
     /* load cache */
     if( h->mb.i_neighbour & MB_TOP )
     {
@@ -703,22 +718,22 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
         h->mb.cache.i_cbp_left = cbp[left];

         /* load intra4x4 */
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] = i4x4[left][4];
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[2 ] - 1] = i4x4[left][5];
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[8 ] - 1] = i4x4[left][6];
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = i4x4[left][3];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] = i4x4[left][left_index_table[0]];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[2 ] - 1] = i4x4[left][left_index_table[1]];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[8 ] - 1] = i4x4[left][left_index_table[2]];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = i4x4[left][left_index_table[3]];

         /* load non_zero_count */
-        h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left][3];
-        h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left][7];
-        h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left][11];
-        h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left][15];
+        h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left][left_index_table[4+0]];
+        h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left][left_index_table[4+1]];
+        h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left][left_index_table[4+2]];
+        h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left][left_index_table[4+3]];

-        h->mb.cache.non_zero_count[x264_scan8[16+0] - 1] = nnz[left][16+1];
-        h->mb.cache.non_zero_count[x264_scan8[16+2] - 1] = nnz[left][16+3];
+        h->mb.cache.non_zero_count[x264_scan8[16+0] - 1] = nnz[left][left_index_table[4+4]];
+        h->mb.cache.non_zero_count[x264_scan8[16+2] - 1] = nnz[left][left_index_table[4+5]];

-        h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] = nnz[left][16+4+1];
-        h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = nnz[left][16+4+3];
+        h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] = nnz[left][left_index_table[4+6]];
+        h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = nnz[left][left_index_table[4+7]];
     }
     else
     {
@@ -857,10 +872,10 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )

             if( h->mb.i_neighbour & MB_LEFT )
             {
-                CP16( h->mb.cache.mvd[l][x264_scan8[0 ] - 1], mvd[left][4] );
-                CP16( h->mb.cache.mvd[l][x264_scan8[2 ] - 1], mvd[left][5] );
-                CP16( h->mb.cache.mvd[l][x264_scan8[8 ] - 1], mvd[left][6] );
-                CP16( h->mb.cache.mvd[l][x264_scan8[10] - 1], mvd[left][3] );
+                CP16( h->mb.cache.mvd[l][x264_scan8[0 ] - 1], mvd[left][left_index_table[0]] );
+                CP16( h->mb.cache.mvd[l][x264_scan8[2 ] - 1], mvd[left][left_index_table[1]] );
+                CP16( h->mb.cache.mvd[l][x264_scan8[8 ] - 1], mvd[left][left_index_table[2]] );
+                CP16( h->mb.cache.mvd[l][x264_scan8[10] - 1], mvd[left][left_index_table[3]] );
             }
             else
                 for( int i = 0; i < 4; i++ )
@@ -949,6 +964,7 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
             int s4x4 = h->mb.i_b4_stride;

             uint8_t (*nnz)[24] = h->mb.non_zero_count;
+            int *left_index_table = h->mb.left_index_table;

             if( h->mb.i_neighbour & MB_TOP )
                 CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[h->mb.i_mb_top_xy][12] );
@@ -956,10 +972,10 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
             if( h->mb.i_neighbour & MB_LEFT )
             {
                 int left = h->mb.i_mb_left_xy;
-                h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left][3];
-                h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left][7];
-                h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left][11];
-                h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left][15];
+                h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left][left_index_table[4+0]];
+                h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left][left_index_table[4+1]];
+                h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left][left_index_table[4+2]];
+                h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left][left_index_table[4+3]];
             }

             for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
--
1.7.4


From c83e9ad2f2cb3a7f5d2ae6c4fd4b5a8cc04f894a Mon Sep 17 00:00:00 2001
From: Simon Horlick <simonhorlick@gmail.com>
Date: Tue, 11 Jan 2011 20:21:26 +0000
Subject: [PATCH 04/25] Store references to the two left macroblocks

Fix compiler warnings about discarding const qualifiers
---
 common/common.h     |    6 +++---
 common/deblock.c    |    4 ++--
 common/macroblock.c |   24 ++++++++++++------------
 common/mvpred.c     |    2 +-
 encoder/analyse.c   |    8 ++++----
 encoder/cabac.c     |    8 ++++----
 6 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/common/common.h b/common/common.h
index 992ba9c..f839e7e 100644
--- a/common/common.h
+++ b/common/common.h
@@ -588,15 +588,15 @@ struct x264_t
         unsigned int i_neighbour_intra;     /* for constrained intra pred */
         unsigned int i_neighbour_frame;     /* ignoring slice boundaries */
         int     i_mb_type_top;
-        int     i_mb_type_left;
+        int     i_mb_type_left[2];
         int     i_mb_type_topleft;
         int     i_mb_type_topright;
         int     i_mb_prev_xy;
-        int     i_mb_left_xy;
+        int     i_mb_left_xy[2];
         int     i_mb_top_xy;
         int     i_mb_topleft_xy;
         int     i_mb_topright_xy;
-        int     *left_index_table;
+        const int *left_index_table;

         /**** thread synchronization ends here ****/
         /* subsequent variables are either thread-local or constant,
diff --git a/common/deblock.c b/common/deblock.c
index 1b6448f..0800461 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -347,10 +347,10 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )

         if( h->mb.i_neighbour & MB_LEFT )
         {
-            int qpl = h->mb.qp[h->mb.i_mb_left_xy];
+            int qpl = h->mb.qp[h->mb.i_mb_left_xy[0]];
             int qp_left = (qp + qpl + 1) >> 1;
             int qpc_left = (h->chroma_qp_table[qp] + h->chroma_qp_table[qpl] + 1) >> 1;
-            int intra_left = IS_INTRA( h->mb.type[h->mb.i_mb_left_xy] );
+            int intra_left = IS_INTRA( h->mb.type[h->mb.i_mb_left_xy[0]] );
             if( intra_cur || intra_left )
                 FILTER( _intra, 0, 0, qp_left, qpc_left );
             else
diff --git a/common/macroblock.c b/common/macroblock.c
index d10b19f..aa194a7 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -575,11 +575,11 @@ static void inline x264_macroblock_cache_load_neighbours( x264_t *h, int mb_x, i
     h->mb.i_neighbour_intra = 0;
     h->mb.i_neighbour_frame = 0;
     h->mb.i_mb_top_xy = -1;
-    h->mb.i_mb_left_xy = -1;
+    h->mb.i_mb_left_xy[0] = h->mb.i_mb_left_xy[1] = -1;
     h->mb.i_mb_topleft_xy = -1;
     h->mb.i_mb_topright_xy = -1;
     h->mb.i_mb_type_top = -1;
-    h->mb.i_mb_type_left = -1;
+    h->mb.i_mb_type_left[0] = h->mb.i_mb_type_left[1] = -1;
     h->mb.i_mb_type_topleft = -1;
     h->mb.i_mb_type_topright = -1;
     h->mb.left_index_table = left_indices[4];
@@ -587,13 +587,13 @@ static void inline x264_macroblock_cache_load_neighbours( x264_t *h, int mb_x, i
     if( mb_x > 0 )
     {
         h->mb.i_neighbour_frame |= MB_LEFT;
-        h->mb.i_mb_left_xy = h->mb.i_mb_xy - 1;
-        h->mb.i_mb_type_left = h->mb.type[h->mb.i_mb_left_xy];
+        h->mb.i_mb_left_xy[0] = h->mb.i_mb_xy - 1;
+        h->mb.i_mb_type_left[0] = h->mb.type[h->mb.i_mb_left_xy[0]];
         if( h->mb.i_mb_xy > h->sh.i_first_mb )
         {
             h->mb.i_neighbour |= MB_LEFT;

-            if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_left ) )
+            if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_left[0] ) )
                 h->mb.i_neighbour_intra |= MB_LEFT;
         }
     }
@@ -659,7 +659,7 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
 {
     x264_macroblock_cache_load_neighbours( h, mb_x, mb_y );

-    int left = h->mb.i_mb_left_xy;
+    int left = h->mb.i_mb_left_xy[0];
     int top  = h->mb.i_mb_top_xy;
     int top_y = mb_y - (1 << h->mb.b_interlaced);
     int s8x8 = h->mb.i_b8_stride;
@@ -674,7 +674,7 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
     uint8_t (*nnz)[24] = h->mb.non_zero_count;
     int16_t *cbp = h->mb.cbp;

-    int *left_index_table = h->mb.left_index_table;
+    const int *left_index_table = h->mb.left_index_table;

     /* load cache */
     if( h->mb.i_neighbour & MB_TOP )
@@ -927,8 +927,8 @@ void x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_

     if( mb_x > 0 )
     {
-        h->mb.i_mb_left_xy = h->mb.i_mb_xy - 1;
-        if( deblock_on_slice_edges || h->mb.slice_table[h->mb.i_mb_left_xy] == h->mb.slice_table[h->mb.i_mb_xy] )
+        h->mb.i_mb_left_xy[0] = h->mb.i_mb_xy - 1;
+        if( deblock_on_slice_edges || h->mb.slice_table[h->mb.i_mb_left_xy[0]] == h->mb.slice_table[h->mb.i_mb_xy] )
             h->mb.i_neighbour |= MB_LEFT;
     }

@@ -964,14 +964,14 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
             int s4x4 = h->mb.i_b4_stride;

             uint8_t (*nnz)[24] = h->mb.non_zero_count;
-            int *left_index_table = h->mb.left_index_table;
+            const int *left_index_table = h->mb.left_index_table;

             if( h->mb.i_neighbour & MB_TOP )
                 CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[h->mb.i_mb_top_xy][12] );

             if( h->mb.i_neighbour & MB_LEFT )
             {
-                int left = h->mb.i_mb_left_xy;
+                int left = h->mb.i_mb_left_xy[0];
                 h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left][left_index_table[4+0]];
                 h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left][left_index_table[4+1]];
                 h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left][left_index_table[4+2]];
@@ -1046,7 +1046,7 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
     {
         uint8_t (*nnz)[24] = h->mb.non_zero_count;
         int top = h->mb.i_mb_top_xy;
-        int left = h->mb.i_mb_left_xy;
+        int left = h->mb.i_mb_left_xy[0];

         if( (h->mb.i_neighbour & MB_TOP) && h->mb.mb_transform_size[top] )
         {
diff --git a/common/mvpred.c b/common/mvpred.c
index a24dde8..c8efe1f 100644
--- a/common/mvpred.c
+++ b/common/mvpred.c
@@ -426,7 +426,7 @@ void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[
     }

     /* spatial predictors */
-    SET_MVP( mvr[h->mb.i_mb_left_xy] );
+    SET_MVP( mvr[h->mb.i_mb_left_xy[0]] );
     SET_MVP( mvr[h->mb.i_mb_top_xy] );
     SET_MVP( mvr[h->mb.i_mb_topleft_xy] );
     SET_MVP( mvr[h->mb.i_mb_topright_xy] );
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 5419bd1..87125c1 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -516,7 +516,7 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
         {
             /* Always run in fast-intra mode for subme < 3 */
             if( h->mb.i_subpel_refine > 2 &&
-              ( IS_INTRA( h->mb.i_mb_type_left ) ||
+              ( IS_INTRA( h->mb.i_mb_type_left[0] ) ||
                 IS_INTRA( h->mb.i_mb_type_top ) ||
                 IS_INTRA( h->mb.i_mb_type_topleft ) ||
                 IS_INTRA( h->mb.i_mb_type_topright ) ||
@@ -1296,7 +1296,7 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
     /* early termination: if 16x16 chose ref 0, then evalute no refs older
      * than those used by the neighbors */
     if( i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
-        h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left > 0 )
+        h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0 )
     {
         i_maxref = 0;
         CHECK_NEIGHBOUR(  -8 - 1 );
@@ -2063,7 +2063,7 @@ static void x264_mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
     {
         x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
         if( i_maxref[l] > 0 && lX->me16x16.i_ref == 0 &&
-            h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left > 0 )
+            h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0 )
         {
             i_maxref[l] = 0;
             CHECK_NEIGHBOUR(  -8 - 1 );
@@ -2817,7 +2817,7 @@ intra_analysis:
                     {}
                 else if( h->param.analyse.i_subpel_refine >= 3 )
                     analysis.b_try_skip = 1;
-                else if( h->mb.i_mb_type_left == P_SKIP ||
+                else if( h->mb.i_mb_type_left[0] == P_SKIP ||
                          h->mb.i_mb_type_top == P_SKIP ||
                          h->mb.i_mb_type_topleft == P_SKIP ||
                          h->mb.i_mb_type_topright == P_SKIP )
diff --git a/encoder/cabac.c b/encoder/cabac.c
index 6333737..334318d 100644
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
@@ -79,7 +79,7 @@ static void x264_cabac_mb_type( x264_t *h, x264_cabac_t *cb )
     if( h->sh.i_type == SLICE_TYPE_I )
     {
         int ctx = 0;
-        if( (h->mb.i_neighbour & MB_LEFT) && h->mb.i_mb_type_left != I_4x4 )
+        if( (h->mb.i_neighbour & MB_LEFT) && h->mb.i_mb_type_left[0] != I_4x4 )
             ctx++;
         if( (h->mb.i_neighbour & MB_TOP) && h->mb.i_mb_type_top != I_4x4 )
             ctx++;
@@ -113,7 +113,7 @@ static void x264_cabac_mb_type( x264_t *h, x264_cabac_t *cb )
     else //if( h->sh.i_type == SLICE_TYPE_B )
     {
         int ctx = 0;
-        if( (h->mb.i_neighbour & MB_LEFT) && h->mb.i_mb_type_left != B_SKIP && h->mb.i_mb_type_left != B_DIRECT )
+        if( (h->mb.i_neighbour & MB_LEFT) && h->mb.i_mb_type_left[0] != B_SKIP && h->mb.i_mb_type_left[0] != B_DIRECT )
             ctx++;
         if( (h->mb.i_neighbour & MB_TOP) && h->mb.i_mb_type_top != B_SKIP && h->mb.i_mb_type_top != B_DIRECT )
             ctx++;
@@ -198,7 +198,7 @@ static void x264_cabac_mb_intra_chroma_pred_mode( x264_t *h, x264_cabac_t *cb )
     int       ctx = 0;

     /* No need to test for I4x4 or I_16x16 as cache_save handle that */
-    if( (h->mb.i_neighbour & MB_LEFT) && h->mb.chroma_pred_mode[h->mb.i_mb_left_xy] != 0 )
+    if( (h->mb.i_neighbour & MB_LEFT) && h->mb.chroma_pred_mode[h->mb.i_mb_left_xy[0]] != 0 )
         ctx++;
     if( (h->mb.i_neighbour & MB_TOP) && h->mb.chroma_pred_mode[h->mb.i_mb_top_xy] != 0 )
         ctx++;
@@ -280,7 +280,7 @@ static void x264_cabac_mb_qp_delta( x264_t *h, x264_cabac_t *cb )
 #if !RDO_SKIP_BS
 void x264_cabac_mb_skip( x264_t *h, int b_skip )
 {
-    int ctx = ((h->mb.i_neighbour & MB_LEFT) && !IS_SKIP( h->mb.i_mb_type_left ))
+    int ctx = ((h->mb.i_neighbour & MB_LEFT) && !IS_SKIP( h->mb.i_mb_type_left[0] ))
             + ((h->mb.i_neighbour & MB_TOP) && !IS_SKIP( h->mb.i_mb_type_top ))
             + (h->sh.i_type == SLICE_TYPE_P ? 11 : 24);
     x264_cabac_encode_decision( &h->cabac, ctx, b_skip );
--
1.7.4


From 135b93d39e3d8d8540a41bd66d90aa42f7a73ba4 Mon Sep 17 00:00:00 2001
From: Simon Horlick <simonhorlick@gmail.com>
Date: Fri, 14 Jan 2011 21:18:14 +0000
Subject: [PATCH 05/25] Neighbour calculation for mbaff

Back up intra borders correctly and make neighbour calculation several times longer.
---
 common/common.h     |    9 ++-
 common/macroblock.c |  286 +++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 249 insertions(+), 46 deletions(-)

diff --git a/common/common.h b/common/common.h
index f839e7e..c993857 100644
--- a/common/common.h
+++ b/common/common.h
@@ -549,6 +549,8 @@ struct x264_t
         int     i_mb_stride;
         int     i_b8_stride;
         int     i_b4_stride;
+        int     left_b8[2];
+        int     left_b4[2];

         /* Current index */
         int     i_mb_x;
@@ -597,6 +599,10 @@ struct x264_t
         int     i_mb_topleft_xy;
         int     i_mb_topright_xy;
         const int *left_index_table;
+        int     topleft_partition;
+        int     intra_border_index;
+        int     topleft_border_index;
+        int     topright_border_index;

         /**** thread synchronization ends here ****/
         /* subsequent variables are either thread-local or constant,
@@ -827,7 +833,8 @@ struct x264_t

     /* Buffers that are allocated per-thread even in sliced threads. */
     void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
-    pixel *intra_border_backup[2][2]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
+    pixel *intra_border_backup[3][2]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
+    pixel *intra_diagonal_backup[5][3];
     uint8_t (*deblock_strength[2])[2][4][4];

     /* CPU functions dependents */
diff --git a/common/macroblock.c b/common/macroblock.c
index aa194a7..60275ae 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -314,18 +314,35 @@ void x264_macroblock_cache_free( x264_t *h )
 int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
 {
     if( !b_lookahead )
-        for( int i = 0; i <= h->param.b_interlaced; i++ )
+    {
+        for( int i = 0; i <= 2*h->param.b_interlaced; i++ )
         {
             for( int j = 0; j < 2; j++ )
             {
                 /* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */
                 CHECKED_MALLOCZERO( h->intra_border_backup[i][j], (h->sps->i_mb_width*16+32) * sizeof(pixel) );
                 h->intra_border_backup[i][j] += 16;
-                h->intra_border_backup[1][j] = h->intra_border_backup[i][j];
+                if( !h->param.b_interlaced )
+                    h->intra_border_backup[1][j] = h->intra_border_backup[i][j];
+            }
+        }
+        for( int i = 0; i < 4*h->param.b_interlaced; i++ ) /* same bound as x264_macroblock_thread_free */
+        {
+            for( int j = 0; j < 3; j++ )
+            {
+                const int width = 1 + 8; // top left pixel + eight top right pixels (for luma)
+                CHECKED_MALLOCZERO( h->intra_diagonal_backup[i][j], (h->sps->i_mb_width*width+32) * sizeof(pixel) );
+                h->intra_diagonal_backup[i][j] += 16;
+                if( !h->param.b_interlaced )
+                    h->intra_diagonal_backup[1][j] = h->intra_diagonal_backup[i][j];
+            }
+        }
+        for( int i = 0; i <= h->param.b_interlaced; i++ )
+        {
             CHECKED_MALLOC( h->deblock_strength[i], sizeof(**h->deblock_strength) * h->mb.i_mb_width );
             h->deblock_strength[1] = h->deblock_strength[i];
         }
+    }

     /* Allocate scratch buffer */
     int scratch_size = 0;
@@ -353,12 +370,20 @@ fail:
 void x264_macroblock_thread_free( x264_t *h, int b_lookahead )
 {
     if( !b_lookahead )
+    {
         for( int i = 0; i <= h->param.b_interlaced; i++ )
-        {
             x264_free( h->deblock_strength[i] );
+        for( int i = 0; i <= 2*h->param.b_interlaced; i++ )
+        {
             for( int j = 0; j < 2; j++ )
                 x264_free( h->intra_border_backup[i][j] - 16 );
         }
+        for( int i = 0; i < 4*h->param.b_interlaced; i++ )
+        {
+            for( int j = 0; j < 3; j++ )
+                x264_free( h->intra_diagonal_backup[i][j] - 16 );
+        }
+    }
     x264_free( h->scratch_buffer );
 }

@@ -494,14 +519,15 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x
 {
     int w = (i ? 8 : 16);
     int i_stride = h->fdec->i_stride[i];
-    int i_stride2 = i_stride << b_interlaced;
-    int i_pix_offset = b_interlaced
+    int i_stride2 = i_stride << h->mb.b_interlaced;
+    int i_pix_offset = h->mb.b_interlaced
                      ? 16 * mb_x + w * (mb_y&~1) * i_stride + (mb_y&1) * i_stride
                      : 16 * mb_x + w * mb_y * i_stride;
     pixel *plane_fdec = &h->fdec->plane[i][i_pix_offset];
-    pixel *intra_fdec = &h->intra_border_backup[mb_y&1][i][mb_x*16];
+    pixel *intra_fdec = &h->intra_border_backup[h->mb.intra_border_index][i][mb_x*16];
     int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
-    if( b_interlaced )
+    /* ref_pix_offset[0] references the current field and [1] the opposite field. */
+    if( h->mb.b_interlaced )
         ref_pix_offset[1] += (1-2*(mb_y&1)) * i_stride;
     h->mb.pic.i_stride[i] = i_stride2;
     h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset];
@@ -510,11 +536,28 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x
         h->mc.load_deinterleave_8x8x2_fenc( h->mb.pic.p_fenc[1], h->mb.pic.p_fenc_plane[1], i_stride2 );
         memcpy( h->mb.pic.p_fdec[1]-FDEC_STRIDE, intra_fdec, 8*sizeof(pixel) );
         memcpy( h->mb.pic.p_fdec[2]-FDEC_STRIDE, intra_fdec+8, 8*sizeof(pixel) );
+        if( h->sh.b_mbaff )
+        {
+            // Top left samples.
+            h->mb.pic.p_fdec[1][-FDEC_STRIDE-1] = h->intra_diagonal_backup[h->mb.topleft_border_index][1][mb_x*9];
+            h->mb.pic.p_fdec[2][-FDEC_STRIDE-1] = h->intra_diagonal_backup[h->mb.topleft_border_index][2][mb_x*9];
+            // Top right samples.
+            CP32( &h->mb.pic.p_fdec[1][-FDEC_STRIDE+8], &h->intra_diagonal_backup[h->mb.topright_border_index][1][mb_x*9+1] );
+            CP32( &h->mb.pic.p_fdec[2][-FDEC_STRIDE+8], &h->intra_diagonal_backup[h->mb.topright_border_index][2][mb_x*9+1] );
+        }
+
     }
     else
     {
         h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fenc_plane[0], i_stride2, 16 );
-        memcpy( h->mb.pic.p_fdec[0]-FDEC_STRIDE, intra_fdec, 24*sizeof(pixel) );
+        if( h->sh.b_mbaff )
+        {
+            memcpy( h->mb.pic.p_fdec[0]-FDEC_STRIDE, intra_fdec, 16*sizeof(pixel) );
+            h->mb.pic.p_fdec[0][-FDEC_STRIDE-1] = h->intra_diagonal_backup[h->mb.topleft_border_index][0][mb_x*9];
+            CP64( &h->mb.pic.p_fdec[0][-FDEC_STRIDE+16], &h->intra_diagonal_backup[h->mb.topright_border_index][0][mb_x*9+1] );
+        }
+        else
+            memcpy( h->mb.pic.p_fdec[0]-FDEC_STRIDE, intra_fdec, 24*sizeof(pixel) );
     }
     if( b_interlaced )
     {
@@ -571,6 +614,10 @@ static void inline x264_macroblock_cache_load_neighbours( x264_t *h, int mb_x, i
     h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
     h->mb.i_b8_xy = 2*(mb_y * h->mb.i_b8_stride + mb_x);
     h->mb.i_b4_xy = 4*(mb_y * h->mb.i_b4_stride + mb_x);
+    h->mb.left_b8[0] =
+    h->mb.left_b8[1] = -1;
+    h->mb.left_b4[0] =
+    h->mb.left_b4[1] = -1;
     h->mb.i_neighbour = 0;
     h->mb.i_neighbour_intra = 0;
     h->mb.i_neighbour_frame = 0;
@@ -583,16 +630,105 @@ static void inline x264_macroblock_cache_load_neighbours( x264_t *h, int mb_x, i
     h->mb.i_mb_type_topleft = -1;
     h->mb.i_mb_type_topright = -1;
     h->mb.left_index_table = left_indices[4];
+    h->mb.topleft_partition = 0;
+    h->mb.topright_border_index =
+    h->mb.topleft_border_index = !(mb_y&1);
+    h->mb.intra_border_index = mb_y&1;
+
+    int topleft = top - 1;
+    int topright = top + 1;
+    int left[2];
+
+    left[0] = left[1] = h->mb.i_mb_xy - 1;
+    h->mb.left_b8[0] = h->mb.left_b8[1] = h->mb.i_b8_xy - 2;
+    h->mb.left_b4[0] = h->mb.left_b4[1] = h->mb.i_b4_xy - 4;
+
+    if( h->sh.b_mbaff )
+    {
+        if( mb_y&1 )
+        {
+            if( mb_x && h->mb.b_interlaced != h->mb.field[h->mb.i_mb_xy-1] )
+            {
+                left[0] = left[1] = h->mb.i_mb_xy - 1 - h->mb.i_mb_stride;
+                h->mb.left_b8[0] = h->mb.left_b8[1] = h->mb.i_b8_xy - 2 - 2*h->mb.i_b8_stride;
+                h->mb.left_b4[0] = h->mb.left_b4[1] = h->mb.i_b4_xy - 4 - 4*h->mb.i_b4_stride;
+
+                if( h->mb.b_interlaced )
+                {
+                    h->mb.left_index_table = left_indices[3];
+                    left[1] += h->mb.i_mb_stride;
+                    h->mb.left_b8[1] += 2*h->mb.i_b8_stride;
+                    h->mb.left_b4[1] += 4*h->mb.i_b4_stride;
+                }
+                else
+                {
+                    h->mb.left_index_table = left_indices[1];
+                    topleft += h->mb.i_mb_stride;
+                    h->mb.topleft_partition = 1;
+                    h->mb.topleft_border_index = 3;
+                }
+            }
+            if( h->mb.b_interlaced )
+                h->mb.topleft_border_index = 1;
+            else
+            {
+                topright = -1;
+                h->mb.intra_border_index = 0;
+            }
+            h->mb.topright_border_index = 1;
+        }
+        else
+        {
+            if( h->mb.b_interlaced && top >= 0 )
+            {
+                if( !h->mb.field[top] )
+                {
+                    top += h->mb.i_mb_stride;
+                    h->mb.intra_border_index = 2;
+                }
+                if( mb_x )
+                    topleft += h->mb.i_mb_stride*(!h->mb.field[topleft]);
+                if( mb_x < h->mb.i_mb_width-1 )
+                    topright += h->mb.i_mb_stride*(!h->mb.field[topright]);
+
+                if( topright >=0 && h->mb.field[topright] )
+                    h->mb.topright_border_index = 0;
+                else
+                    h->mb.topright_border_index = 2;
+                if( topleft >=0 && h->mb.field[topleft] )
+                    h->mb.topleft_border_index = 0;
+                else
+                    h->mb.topleft_border_index = 2;
+            }
+            else
+                h->mb.intra_border_index = 1;
+            if( mb_x && h->mb.b_interlaced != h->mb.field[h->mb.i_mb_xy-1] )
+            {
+                if( h->mb.b_interlaced )
+                {
+                    h->mb.left_index_table = left_indices[2];
+                    left[1] += h->mb.i_mb_stride;
+                    h->mb.left_b8[1] += 2*h->mb.i_b8_stride;
+                    h->mb.left_b4[1] += 4*h->mb.i_b4_stride;
+                }
+                else
+                    h->mb.left_index_table = left_indices[0];
+            }
+        }
+    }

     if( mb_x > 0 )
     {
         h->mb.i_neighbour_frame |= MB_LEFT;
-        h->mb.i_mb_left_xy[0] = h->mb.i_mb_xy - 1;
+        h->mb.i_mb_left_xy[0] = left[0];
+        h->mb.i_mb_left_xy[1] = left[1];
         h->mb.i_mb_type_left[0] = h->mb.type[h->mb.i_mb_left_xy[0]];
+        h->mb.i_mb_type_left[1] = h->mb.type[h->mb.i_mb_left_xy[1]];
         if( h->mb.i_mb_xy > h->sh.i_first_mb )
         {
             h->mb.i_neighbour |= MB_LEFT;

+            // FIXME: We don't currently support constrained intra + mbaff.
             if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_left[0] ) )
                 h->mb.i_neighbour_intra |= MB_LEFT;
         }
@@ -625,12 +761,12 @@ static void inline x264_macroblock_cache_load_neighbours( x264_t *h, int mb_x, i
             }
         }

-        if( mb_x > 0 && top - 1 >= 0  )
+        if( mb_x > 0 && topleft >= 0  )
         {
             h->mb.i_neighbour_frame |= MB_TOPLEFT;
-            h->mb.i_mb_topleft_xy = top - 1;
+            h->mb.i_mb_topleft_xy = topleft;
             h->mb.i_mb_type_topleft = h->mb.type[h->mb.i_mb_topleft_xy];
-            if( top - 1 >= h->sh.i_first_mb )
+            if( topleft >= h->sh.i_first_mb )
             {
                 h->mb.i_neighbour |= MB_TOPLEFT;

@@ -639,12 +775,12 @@ static void inline x264_macroblock_cache_load_neighbours( x264_t *h, int mb_x, i
             }
         }

-        if( mb_x < h->mb.i_mb_width - 1 && top + 1 >= 0 )
+        if( mb_x < h->mb.i_mb_width - 1 && topright >= 0 )
         {
             h->mb.i_neighbour_frame |= MB_TOPRIGHT;
-            h->mb.i_mb_topright_xy = top + 1;
+            h->mb.i_mb_topright_xy = topright;
             h->mb.i_mb_type_topright = h->mb.type[h->mb.i_mb_topright_xy];
-            if( top + 1 >= h->sh.i_first_mb )
+            if( topright >= h->sh.i_first_mb )
             {
                 h->mb.i_neighbour |= MB_TOPRIGHT;

@@ -659,9 +795,9 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
 {
     x264_macroblock_cache_load_neighbours( h, mb_x, mb_y );

-    int left = h->mb.i_mb_left_xy[0];
+    int *left = h->mb.i_mb_left_xy;
     int top  = h->mb.i_mb_top_xy;
-    int top_y = mb_y - (1 << h->mb.b_interlaced);
+    int top_y = top / h->mb.i_mb_stride;
     int s8x8 = h->mb.i_b8_stride;
     int s4x4 = h->mb.i_b4_stride;
     int top_8x8 = (2*top_y+1) * s8x8 + 2*mb_x;
@@ -715,25 +851,25 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )

     if( h->mb.i_neighbour & MB_LEFT )
     {
-        h->mb.cache.i_cbp_left = cbp[left];
+        h->mb.cache.i_cbp_left = cbp[left[0]];

         /* load intra4x4 */
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] = i4x4[left][left_index_table[0]];
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[2 ] - 1] = i4x4[left][left_index_table[1]];
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[8 ] - 1] = i4x4[left][left_index_table[2]];
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = i4x4[left][left_index_table[3]];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] = i4x4[left[0]][left_index_table[0]];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[2 ] - 1] = i4x4[left[0]][left_index_table[1]];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[8 ] - 1] = i4x4[left[1]][left_index_table[2]];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = i4x4[left[1]][left_index_table[3]];

         /* load non_zero_count */
-        h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left][left_index_table[4+0]];
-        h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left][left_index_table[4+1]];
-        h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left][left_index_table[4+2]];
-        h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left][left_index_table[4+3]];
+        h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left[0]][left_index_table[4+0]];
+        h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left[0]][left_index_table[4+1]];
+        h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left[1]][left_index_table[4+2]];
+        h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left[1]][left_index_table[4+3]];

-        h->mb.cache.non_zero_count[x264_scan8[16+0] - 1] = nnz[left][left_index_table[4+4]];
-        h->mb.cache.non_zero_count[x264_scan8[16+2] - 1] = nnz[left][left_index_table[4+5]];
+        h->mb.cache.non_zero_count[x264_scan8[16+0] - 1] = nnz[left[0]][left_index_table[4+4]];
+        h->mb.cache.non_zero_count[x264_scan8[16+2] - 1] = nnz[left[1]][left_index_table[4+5]];

-        h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] = nnz[left][left_index_table[4+6]];
-        h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = nnz[left][left_index_table[4+7]];
+        h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] = nnz[left[0]][left_index_table[4+6]];
+        h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = nnz[left[1]][left_index_table[4+7]];
     }
     else
     {
@@ -758,7 +894,7 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
     if( h->pps->b_transform_8x8_mode )
     {
         h->mb.cache.i_neighbour_transform_size =
-            ( (h->mb.i_neighbour & MB_LEFT) && h->mb.mb_transform_size[left] )
+            ( (h->mb.i_neighbour & MB_LEFT) && h->mb.mb_transform_size[left[0]] )
           + ( (h->mb.i_neighbour & MB_TOP) && h->mb.mb_transform_size[top]  );
     }

@@ -771,7 +907,7 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
           + !!(h->mb.i_neighbour & MB_TOP);
     }

-    if( !h->mb.b_interlaced )
+    if( !h->sh.b_mbaff )
     {
         x264_copy_column8( h->mb.pic.p_fdec[0]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+ 4*FDEC_STRIDE );
         x264_copy_column8( h->mb.pic.p_fdec[0]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+12*FDEC_STRIDE );
@@ -872,10 +1008,10 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )

             if( h->mb.i_neighbour & MB_LEFT )
             {
-                CP16( h->mb.cache.mvd[l][x264_scan8[0 ] - 1], mvd[left][left_index_table[0]] );
-                CP16( h->mb.cache.mvd[l][x264_scan8[2 ] - 1], mvd[left][left_index_table[1]] );
-                CP16( h->mb.cache.mvd[l][x264_scan8[8 ] - 1], mvd[left][left_index_table[2]] );
-                CP16( h->mb.cache.mvd[l][x264_scan8[10] - 1], mvd[left][left_index_table[3]] );
+                CP16( h->mb.cache.mvd[l][x264_scan8[0 ] - 1], mvd[left[0]][left_index_table[0]] );
+                CP16( h->mb.cache.mvd[l][x264_scan8[2 ] - 1], mvd[left[0]][left_index_table[1]] );
+                CP16( h->mb.cache.mvd[l][x264_scan8[8 ] - 1], mvd[left[1]][left_index_table[2]] );
+                CP16( h->mb.cache.mvd[l][x264_scan8[10] - 1], mvd[left[1]][left_index_table[3]] );
             }
             else
                 for( int i = 0; i < 4; i++ )
@@ -892,7 +1028,7 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
         {
             uint8_t skipbp;
             x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
-            skipbp = (h->mb.i_neighbour & MB_LEFT) ? h->mb.skipbp[left] : 0;
+            skipbp = (h->mb.i_neighbour & MB_LEFT) ? h->mb.skipbp[left[0]] : 0;
             h->mb.cache.skip[x264_scan8[0] - 1] = skipbp & 0x2;
             h->mb.cache.skip[x264_scan8[8] - 1] = skipbp & 0x8;
             skipbp = (h->mb.i_neighbour & MB_TOP) ? h->mb.skipbp[top] : 0;
@@ -971,11 +1107,11 @@ void x264_macroblock_cache_load_deblock( x264_t *h )

             if( h->mb.i_neighbour & MB_LEFT )
             {
-                int left = h->mb.i_mb_left_xy[0];
-                h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left][left_index_table[4+0]];
-                h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left][left_index_table[4+1]];
-                h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left][left_index_table[4+2]];
-                h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left][left_index_table[4+3]];
+                int *left = h->mb.i_mb_left_xy;
+                h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left[0]][left_index_table[4+0]];
+                h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left[0]][left_index_table[4+1]];
+                h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left[1]][left_index_table[4+2]];
+                h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left[1]][left_index_table[4+3]];
             }

             for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
@@ -1106,20 +1242,80 @@ static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int mb_x, int mb
     int i_pix_offset = b_interlaced
                      ? 16 * mb_x + w * (mb_y&~1) * i_stride + (mb_y&1) * i_stride
                      : 16 * mb_x + w * mb_y * i_stride;
+    const int intra_diag_width = 8+1; // One top left sample, then eight top right samples.
     pixel *intra_fdec = &h->intra_border_backup[mb_y&1][i][mb_x*16];
     if( i )
     {
+        if( h->sh.b_mbaff )
+        {
+            /* Frame macroblocks use the macroblock directly above for intra
+             * prediction. Field macroblock pairs predict from fields of the same
+             * parity. However field macroblock pairs predicting from frame pairs
+             * use the bottom two rows of the frame for prediction, the penultimate
+             * row is stored in intra_border_backup[2]. */
+            if( mb_y&1 )
+            {
+                if( mb_x )
+                {
+                    // Store top left.
+                    h->intra_diagonal_backup[1][1][mb_x*intra_diag_width] = h->intra_border_backup[1][1][(mb_x-1)*16+7];
+                    h->intra_diagonal_backup[1][2][mb_x*intra_diag_width] = h->intra_border_backup[1][1][(mb_x-1)*16+8+7];
+                    h->intra_diagonal_backup[2][1][mb_x*intra_diag_width] = h->intra_border_backup[2][1][(mb_x-1)*16+7];
+                    h->intra_diagonal_backup[2][2][mb_x*intra_diag_width] = h->intra_border_backup[2][1][(mb_x-1)*16+8+7];
+                    // Store top right for the previous MB: U/V rows 7 and 6 (as in the even-row branch), not luma.
+                    CP32( &h->intra_diagonal_backup[1][1][(mb_x-1)*intra_diag_width+1], h->mb.pic.p_fdec[1]+FDEC_STRIDE*7 );
+                    CP32( &h->intra_diagonal_backup[1][2][(mb_x-1)*intra_diag_width+1], h->mb.pic.p_fdec[2]+FDEC_STRIDE*7 );
+                    CP32( &h->intra_diagonal_backup[2][1][(mb_x-1)*intra_diag_width+1], h->mb.pic.p_fdec[1]+FDEC_STRIDE*6 );
+                    CP32( &h->intra_diagonal_backup[2][2][(mb_x-1)*intra_diag_width+1], h->mb.pic.p_fdec[2]+FDEC_STRIDE*6 );
+                }
+                memcpy( &h->intra_border_backup[2][i][mb_x*16],   h->mb.pic.p_fdec[1]+FDEC_STRIDE*6, 8*sizeof(pixel) );
+                memcpy( &h->intra_border_backup[2][i][mb_x*16]+8, h->mb.pic.p_fdec[2]+FDEC_STRIDE*6, 8*sizeof(pixel) );
+            }
+            else
+            {
+                if( mb_x )
+                {
+                    h->intra_diagonal_backup[0][1][mb_x*intra_diag_width] = h->intra_border_backup[0][1][(mb_x-1)*16+7];
+                    h->intra_diagonal_backup[0][2][mb_x*intra_diag_width] = h->intra_border_backup[0][1][(mb_x-1)*16+8+7];
+                    CP32( &h->intra_diagonal_backup[0][1][(mb_x-1)*intra_diag_width+1], h->mb.pic.p_fdec[1]+FDEC_STRIDE*7 );
+                    CP32( &h->intra_diagonal_backup[0][2][(mb_x-1)*intra_diag_width+1], h->mb.pic.p_fdec[2]+FDEC_STRIDE*7 );
+                }
+                // Sometimes needed for bottom macroblock of this pair.
+                h->intra_diagonal_backup[3][1][mb_x*intra_diag_width] = h->mb.pic.p_fdec[1][-1+7*FDEC_STRIDE];
+                h->intra_diagonal_backup[3][2][mb_x*intra_diag_width] = h->mb.pic.p_fdec[2][-1+7*FDEC_STRIDE];
+            }
+        }
         h->mc.store_interleave_8x8x2( &h->fdec->plane[1][i_pix_offset], i_stride2, h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2] );
         memcpy( intra_fdec,   h->mb.pic.p_fdec[1]+FDEC_STRIDE*7, 8*sizeof(pixel) );
         memcpy( intra_fdec+8, h->mb.pic.p_fdec[2]+FDEC_STRIDE*7, 8*sizeof(pixel) );
-        twiddle_topleft_pixel( h->mb.pic.p_fdec[1]-FDEC_STRIDE-1, h->mb.pic.p_fdec[1]-FDEC_STRIDE+7, b_interlaced );
-        twiddle_topleft_pixel( h->mb.pic.p_fdec[2]-FDEC_STRIDE-1, h->mb.pic.p_fdec[2]-FDEC_STRIDE+7, b_interlaced );
+        twiddle_topleft_pixel( h->mb.pic.p_fdec[1]-FDEC_STRIDE-1, h->mb.pic.p_fdec[1]-FDEC_STRIDE+7, h->sh.b_mbaff );
+        twiddle_topleft_pixel( h->mb.pic.p_fdec[2]-FDEC_STRIDE-1, h->mb.pic.p_fdec[2]-FDEC_STRIDE+7, h->sh.b_mbaff );
     }
     else
     {
+        if( h->sh.b_mbaff )
+        {
+            if( mb_y&1 )
+            {
+                if( mb_x )
+                {
+                    h->intra_diagonal_backup[1][0][mb_x*intra_diag_width] = h->intra_border_backup[1][0][(mb_x-1)*16+15];
+                    h->intra_diagonal_backup[2][0][mb_x*intra_diag_width] = h->intra_border_backup[2][0][(mb_x-1)*16+15];
+                    CP64( &h->intra_diagonal_backup[1][0][(mb_x-1)*intra_diag_width+1], h->mb.pic.p_fdec[0]+FDEC_STRIDE*15 );
+                    CP64( &h->intra_diagonal_backup[2][0][(mb_x-1)*intra_diag_width+1], h->mb.pic.p_fdec[0]+FDEC_STRIDE*14 );
+                }
+                memcpy( &h->intra_border_backup[2][i][mb_x*16], h->mb.pic.p_fdec[0]+FDEC_STRIDE*14, 16*sizeof(pixel) );
+            }
+            else
+            {
+                h->intra_diagonal_backup[0][0][mb_x*intra_diag_width] = h->intra_border_backup[0][0][(mb_x-1)*16+15];
+                CP64( &h->intra_diagonal_backup[0][0][(mb_x-1)*intra_diag_width+1], h->mb.pic.p_fdec[0]+FDEC_STRIDE*15 );
+                h->intra_diagonal_backup[3][0][mb_x*intra_diag_width] = h->mb.pic.p_fdec[0][-1+15*FDEC_STRIDE];
+            }
+        }
         h->mc.copy[PIXEL_16x16]( &h->fdec->plane[0][i_pix_offset], i_stride2, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
         memcpy( intra_fdec, h->mb.pic.p_fdec[0]+FDEC_STRIDE*15, 16*sizeof(pixel) );
-        twiddle_topleft_pixel( h->mb.pic.p_fdec[0]-FDEC_STRIDE-1, h->mb.pic.p_fdec[0]-FDEC_STRIDE+15, b_interlaced );
+        twiddle_topleft_pixel( h->mb.pic.p_fdec[0]-FDEC_STRIDE-1, h->mb.pic.p_fdec[0]-FDEC_STRIDE+15, h->sh.b_mbaff );
     }
 }

--
1.7.4


From db33884079bf79074a67fac2851d8c9425c45bfa Mon Sep 17 00:00:00 2001
From: Simon Horlick <simonhorlick@gmail.com>
Date: Thu, 17 Feb 2011 00:56:59 +0000
Subject: [PATCH 06/25] Change b_interlaced in store_pic back to its original meaning

---
 common/macroblock.c |   10 +++++-----
 1 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/common/macroblock.c b/common/macroblock.c
index 60275ae..63a8933 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -1238,15 +1238,15 @@ static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int mb_x, int mb
 {
     int w = i ? 8 : 16;
     int i_stride = h->fdec->i_stride[i];
-    int i_stride2 = i_stride << b_interlaced;
-    int i_pix_offset = b_interlaced
+    int i_stride2 = i_stride << (b_interlaced && h->mb.b_interlaced);
+    int i_pix_offset = (b_interlaced && h->mb.b_interlaced)
                      ? 16 * mb_x + w * (mb_y&~1) * i_stride + (mb_y&1) * i_stride
                      : 16 * mb_x + w * mb_y * i_stride;
     const int intra_diag_width = 8+1; // One top left sample, then eight top right samples.
     pixel *intra_fdec = &h->intra_border_backup[mb_y&1][i][mb_x*16];
     if( i )
     {
-        if( h->sh.b_mbaff )
+        if( b_interlaced )
         {
             /* Frame macroblocks use the macroblock directly above for intra
              * prediction. Field macroblock pairs predict from fields of the same
@@ -1293,7 +1293,7 @@ static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int mb_x, int mb
     }
     else
     {
-        if( h->sh.b_mbaff )
+        if( b_interlaced )
         {
             if( mb_y&1 )
             {
@@ -1333,7 +1333,7 @@ void x264_macroblock_cache_save( x264_t *h )
     int8_t *i4x4 = h->mb.intra4x4_pred_mode[i_mb_xy];
     uint8_t *nnz = h->mb.non_zero_count[i_mb_xy];

-    if( h->mb.b_interlaced )
+    if( h->sh.b_mbaff )
     {
         x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 0, 1 );
         x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1, 1 );
--
1.7.4


From b816ff5ba65edb03226237e2fc6fff06d7d9e60a Mon Sep 17 00:00:00 2001
From: Simon Horlick <simonhorlick@gmail.com>
Date: Fri, 18 Feb 2011 18:57:15 +0000
Subject: [PATCH 07/25] Only enable twiddle_topleft in progressive

---
 common/macroblock.c |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/common/macroblock.c b/common/macroblock.c
index 63a8933..f775030 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -1313,9 +1313,10 @@ static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int mb_x, int mb
                 h->intra_diagonal_backup[3][0][mb_x*intra_diag_width] = h->mb.pic.p_fdec[0][-1+15*FDEC_STRIDE];
             }
         }
+        else
+            twiddle_topleft_pixel( h->mb.pic.p_fdec[0]-FDEC_STRIDE-1, h->mb.pic.p_fdec[0]-FDEC_STRIDE+15, h->sh.b_mbaff );
         h->mc.copy[PIXEL_16x16]( &h->fdec->plane[0][i_pix_offset], i_stride2, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
         memcpy( intra_fdec, h->mb.pic.p_fdec[0]+FDEC_STRIDE*15, 16*sizeof(pixel) );
-        twiddle_topleft_pixel( h->mb.pic.p_fdec[0]-FDEC_STRIDE-1, h->mb.pic.p_fdec[0]-FDEC_STRIDE+15, h->sh.b_mbaff );
     }
 }

--
1.7.4


From 2b1351a44177a7dc289c9bfe38c7ddc7b0f9a3a2 Mon Sep 17 00:00:00 2001
From: Simon Horlick <simonhorlick@gmail.com>
Date: Wed, 16 Mar 2011 21:34:28 +0000
Subject: [PATCH 08/25] Initial inter support

---
 common/common.h      |    5 +
 common/frame.c       |   33 +++++++-
 common/frame.h       |    3 +
 common/macroblock.c  |  217 +++++++++++++++++++++++++++++++++++++++++++-------
 common/mc.c          |   37 +++++++--
 common/mvpred.c      |   23 +++++-
 common/x86/util.h    |   24 ------
 encoder/macroblock.c |    4 +-
 8 files changed, 277 insertions(+), 69 deletions(-)

diff --git a/common/common.h b/common/common.h
index c993857..ef9b35a 100644
--- a/common/common.h
+++ b/common/common.h
@@ -600,6 +600,7 @@ struct x264_t
         int     i_mb_topright_xy;
         const int *left_index_table;
         int     topleft_partition;
+        int     allow_skip;
         int     intra_border_index;
         int     topleft_border_index;
         int     topright_border_index;
@@ -730,6 +731,10 @@ struct x264_t
             /* neighbor CBPs */
             int     i_cbp_top;
             int     i_cbp_left;
+
+            /* extra data required for mbaff in mv prediction */
+            int16_t topright_mv[2][3][2];
+            int8_t  topright_ref[2][3];
         } cache;

         /* */
diff --git a/common/frame.c b/common/frame.c
index eff8ca5..d04f047 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -48,7 +48,7 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )

     int i_mb_count = h->mb.i_mb_count;
     int i_stride, i_width, i_lines;
-    int i_padv = PADV << h->param.b_interlaced;
+    int i_padv = PADV << 2*h->param.b_interlaced;
     int luma_plane_size, chroma_plane_size;
     int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
     int disalign = h->param.cpu&X264_CPU_ALTIVEC ? 1<<9 : 1<<10;
@@ -99,21 +99,30 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
     chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + i_padv));

     CHECKED_MALLOC( frame->buffer[1], chroma_plane_size * sizeof(pixel) );
+    CHECKED_MALLOC( frame->buffer_fld[1], chroma_plane_size * sizeof(pixel) );
     frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * i_padv/2 + PADH;
+    frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * i_padv/2 + PADH;

     /* all 4 luma planes allocated together, since the cacheline split code
      * requires them to be in-phase wrt cacheline alignment. */
     if( h->param.analyse.i_subpel_refine && b_fdec )
     {
         CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size * sizeof(pixel) );
+        CHECKED_MALLOC( frame->buffer_fld[0], 4*luma_plane_size * sizeof(pixel) );
         for( int i = 0; i < 4; i++ )
+        {
             frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
+            frame->filtered_fld[i] = frame->buffer_fld[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
+        }
         frame->plane[0] = frame->filtered[0];
+        frame->plane_fld[0] = frame->filtered_fld[0];
     }
     else
     {
         CHECKED_MALLOC( frame->buffer[0], luma_plane_size * sizeof(pixel) );
+        CHECKED_MALLOC( frame->buffer_fld[0], luma_plane_size * sizeof(pixel) );
         frame->filtered[0] = frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
+        frame->filtered_fld[0] = frame->plane_fld[0] = frame->buffer_fld[0] + frame->i_stride[0] * i_padv + PADH;
     }

     frame->b_duplicate = 0;
@@ -200,7 +209,10 @@ void x264_frame_delete( x264_frame_t *frame )
     if( !frame->b_duplicate )
     {
         for( int i = 0; i < 4; i++ )
+        {
             x264_free( frame->buffer[i] );
+            x264_free( frame->buffer_fld[i] );
+        }
         for( int i = 0; i < 4; i++ )
             x264_free( frame->buffer_lowres[i] );
         for( int i = 0; i < X264_BFRAME_MAX+2; i++ )
@@ -363,16 +375,25 @@ void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_e
         int padh = PADH;
         int padv = PADV >> !!i;
         // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
-        pixel *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
         if( b_end && !b_start )
             height += 4 >> (!!i + h->sh.b_mbaff);
+        pixel *pix;
         if( h->sh.b_mbaff )
         {
+            // border samples for each field are extended separately
+            pix = frame->plane_fld[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
             plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, i );
             plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, i );
+
+            height = (b_end ? 16*(h->mb.i_mb_height - mb_y) : 32) >> !!i;
+            if( b_end && !b_start )
+                height += 4 >> (!!i);
+            pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
+            plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, i );
         }
         else
         {
+            pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
             plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, i );
         }
     }
@@ -392,14 +413,16 @@ void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y
     for( int i = 1; i < 4; i++ )
     {
         // buffer: 8 luma, to match the hpel filter
-        pixel *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4;
+        pixel *pix;
         if( h->sh.b_mbaff )
         {
+            pix = frame->filtered_fld[i] + (16*mb_y - 16) * stride - 4;
             plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, 0 );
             plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, 0 );
         }
-        else
-            plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, 0 );
+
+        pix = frame->filtered[i] + (16*mb_y - 8) * stride - 4;
+        plane_expand_border( pix, stride, width, height << h->sh.b_mbaff, padh, padv, b_start, b_end, 0 );
     }
 }

diff --git a/common/frame.h b/common/frame.h
index 0e0ab3d..8fe0627 100644
--- a/common/frame.h
+++ b/common/frame.h
@@ -72,13 +72,16 @@ typedef struct x264_frame
     int     i_width_lowres;
     int     i_lines_lowres;
     pixel *plane[2];
+    pixel *plane_fld[2];
     pixel *filtered[4]; /* plane[0], H, V, HV */
+    pixel *filtered_fld[4];
     pixel *lowres[4]; /* half-size copy of input frame: Orig, H, V, HV */
     uint16_t *integral;

     /* for unrestricted mv we allocate more data than needed
      * allocated data are stored in buffer */
     pixel *buffer[4];
+    pixel *buffer_fld[4];
     pixel *buffer_lowres[4];

     x264_weight_t weight[X264_REF_MAX][3]; /* [ref_index][plane] */
diff --git a/common/macroblock.c b/common/macroblock.c
index f775030..a441981 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -523,7 +523,7 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x
     int i_pix_offset = h->mb.b_interlaced
                      ? 16 * mb_x + w * (mb_y&~1) * i_stride + (mb_y&1) * i_stride
                      : 16 * mb_x + w * mb_y * i_stride;
-    pixel *plane_fdec = &h->fdec->plane[i][i_pix_offset];
+    pixel *plane_fdec = h->mb.b_interlaced ? &h->fdec->plane_fld[i][i_pix_offset] : &h->fdec->plane[i][i_pix_offset];
     pixel *intra_fdec = &h->intra_border_backup[h->mb.intra_border_index][i][mb_x*16];
     int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
     /* ref_pix_offset[0] references the current field and [1] the opposite field. */
@@ -570,15 +570,28 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x
             else
                 h->mb.pic.p_fdec[0][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
     }
+    pixel *plane_src, **filtered_src;
     for( int j = 0; j < h->mb.pic.i_fref[0]; j++ )
     {
-        h->mb.pic.p_fref[0][j][i?4:0] = &h->fref[0][j >> b_interlaced]->plane[i][ref_pix_offset[j&1]];
+        // Interpolate between pixels in same field.
+        if( h->mb.b_interlaced )
+        {
+            plane_src = h->fref[0][j>>1]->plane_fld[i];
+            filtered_src = h->fref[0][j>>1]->filtered_fld;
+        }
+        else
+        {
+            plane_src = h->fref[0][j]->plane[i];
+            filtered_src = h->fref[0][j]->filtered;
+        }
+        h->mb.pic.p_fref[0][j][i?4:0] = plane_src + ref_pix_offset[j&1];
+
         if( !i )
         {
             for( int k = 1; k < 4; k++ )
-                h->mb.pic.p_fref[0][j][k] = &h->fref[0][j >> b_interlaced]->filtered[k][ref_pix_offset[j&1]];
+                h->mb.pic.p_fref[0][j][k] = filtered_src[k] + ref_pix_offset[j&1];
             if( h->sh.weight[j][0].weightfn )
-                h->mb.pic.p_fref_w[j] = &h->fenc->weighted[j >> b_interlaced][ref_pix_offset[j&1]];
+                h->mb.pic.p_fref_w[j] = &h->fenc->weighted[j >> h->mb.b_interlaced][ref_pix_offset[j&1]];
             else
                 h->mb.pic.p_fref_w[j] = h->mb.pic.p_fref[0][j][0];
         }
@@ -586,10 +599,21 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x
     if( h->sh.i_type == SLICE_TYPE_B )
         for( int j = 0; j < h->mb.pic.i_fref[1]; j++ )
         {
-            h->mb.pic.p_fref[1][j][i?4:0] = &h->fref[1][j >> b_interlaced]->plane[i][ref_pix_offset[j&1]];
+             if( h->mb.b_interlaced )
+             {
+                 plane_src = h->fref[1][j>>1]->plane_fld[i];
+                 filtered_src = h->fref[1][j>>1]->filtered_fld;
+             }
+             else
+             {
+                 plane_src = h->fref[1][j]->plane[i];
+                 filtered_src = h->fref[1][j]->filtered;
+             }
+             h->mb.pic.p_fref[1][j][i?4:0] = plane_src + ref_pix_offset[j&1];
+
             if( !i )
                 for( int k = 1; k < 4; k++ )
-                    h->mb.pic.p_fref[1][j][k] = &h->fref[1][j >> b_interlaced]->filtered[k][ref_pix_offset[j&1]];
+                    h->mb.pic.p_fref[1][j][k] = filtered_src[k] + ref_pix_offset[j&1];
         }
 }

@@ -851,7 +875,9 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )

     if( h->mb.i_neighbour & MB_LEFT )
     {
-        h->mb.cache.i_cbp_left = cbp[left[0]];
+        const int16_t top_luma = (cbp[left[0]] >> (left_index_table[18+0]&(~1))) & 2;
+        const int16_t bot_luma = (cbp[left[1]] >> (left_index_table[18+2]&(~1))) & 2;
+        h->mb.cache.i_cbp_left = (cbp[left[0]] & 0xfff0) | (bot_luma<<2) | top_luma;

         /* load intra4x4 */
         h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] = i4x4[left[0]][left_index_table[0]];
@@ -903,8 +929,8 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
         h->mb.pic.i_fref[0] = h->i_ref[0] << h->mb.b_interlaced;
         h->mb.pic.i_fref[1] = h->i_ref[1] << h->mb.b_interlaced;
         h->mb.cache.i_neighbour_interlaced =
-            !!(h->mb.i_neighbour & MB_LEFT)
-          + !!(h->mb.i_neighbour & MB_TOP);
+            !!(h->mb.i_neighbour & MB_LEFT && h->mb.field[left[0]])
+          + !!(h->mb.i_neighbour & MB_TOP && h->mb.field[top]);
     }

     if( !h->sh.b_mbaff )
@@ -941,8 +967,18 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
         int i8 = x264_scan8[0] - 1 - 1*8;
         if( h->mb.i_neighbour & MB_TOPLEFT )
         {
-            h->mb.cache.ref[l][i8] = ref[top_8x8 - 1];
-            CP32( h->mb.cache.mv[l][i8], mv[top_4x4 - 1] );
+            int y = h->mb.i_mb_topleft_xy / h->mb.i_mb_stride;
+            int ir = 2*(s8x8*y + mb_x-1)+1+s8x8;
+            int iv = 4*(s4x4*y + mb_x-1)+3+3*s4x4;
+            if( h->mb.topleft_partition )
+            {
+                /* Take motion vector from the middle of macroblock instead of
+                 * the bottom right as usual. */
+                iv -= 2*s4x4;
+                ir -= s8x8;
+            }
+            h->mb.cache.ref[l][i8] = ref[ir];
+            CP32( h->mb.cache.mv[l][i8], mv[iv] );
         }
         else
         {
@@ -968,8 +1004,9 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
         i8 = x264_scan8[0] + 4 - 1*8;
         if( h->mb.i_neighbour & MB_TOPRIGHT )
         {
-            h->mb.cache.ref[l][i8] = ref[top_8x8 + 2];
-            CP32( h->mb.cache.mv[l][i8], mv[top_4x4 + 4] );
+            int y = h->mb.i_mb_topright_xy / h->mb.i_mb_stride;
+            h->mb.cache.ref[l][i8] = ref[2*(s8x8*y + (mb_x+1))+s8x8];
+            CP32( h->mb.cache.mv[l][i8], mv[4*(s4x4*y + (mb_x+1))+3*s4x4] );
         }
         else
              h->mb.cache.ref[l][i8] = -2;
@@ -977,17 +1014,15 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
         i8 = x264_scan8[0] - 1;
         if( h->mb.i_neighbour & MB_LEFT )
         {
-            const int ir = h->mb.i_b8_xy - 1;
-            const int iv = h->mb.i_b4_xy - 1;
-            h->mb.cache.ref[l][i8+0*8] =
-            h->mb.cache.ref[l][i8+1*8] = ref[ir + 0*s8x8];
-            h->mb.cache.ref[l][i8+2*8] =
-            h->mb.cache.ref[l][i8+3*8] = ref[ir + 1*s8x8];
-
-            CP32( h->mb.cache.mv[l][i8+0*8], mv[iv + 0*s4x4] );
-            CP32( h->mb.cache.mv[l][i8+1*8], mv[iv + 1*s4x4] );
-            CP32( h->mb.cache.mv[l][i8+2*8], mv[iv + 2*s4x4] );
-            CP32( h->mb.cache.mv[l][i8+3*8], mv[iv + 3*s4x4] );
+            h->mb.cache.ref[l][i8+0*8] = ref[h->mb.left_b8[0] + 1 + s8x8*((left_index_table[12+0]&~1)>>1)];
+            h->mb.cache.ref[l][i8+1*8] = ref[h->mb.left_b8[0] + 1 + s8x8*((left_index_table[12+1]&~1)>>1)];
+            h->mb.cache.ref[l][i8+2*8] = ref[h->mb.left_b8[1] + 1 + s8x8*((left_index_table[12+2]&~1)>>1)];
+            h->mb.cache.ref[l][i8+3*8] = ref[h->mb.left_b8[1] + 1 + s8x8*((left_index_table[12+3]&~1)>>1)];
+
+            CP32( h->mb.cache.mv[l][i8+0*8], mv[h->mb.left_b4[0] + 3 + s4x4*left_index_table[12+0]] );
+            CP32( h->mb.cache.mv[l][i8+1*8], mv[h->mb.left_b4[0] + 3 + s4x4*left_index_table[12+1]] );
+            CP32( h->mb.cache.mv[l][i8+2*8], mv[h->mb.left_b4[1] + 3 + s4x4*left_index_table[12+2]] );
+            CP32( h->mb.cache.mv[l][i8+3*8], mv[h->mb.left_b4[1] + 3 + s4x4*left_index_table[12+3]] );
         }
         else
         {
@@ -998,6 +1033,42 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
             }
         }

+        /* Extra logic for top right mv in mbaff.
+         * . . . d  . . a .
+         * . . . e  . . . .
+         * . . . f  b . c .
+         * . . . .  . . . .
+         *
+         * If the top right of the 4x4 partitions labeled a, b and c in the
+         * above diagram do not exist, but the entries d, e and f exist (in
+         * the macroblock to the left) then use those instead.
+         */
+        if( h->param.b_interlaced )
+        {
+            if( h->mb.i_neighbour & MB_LEFT )
+            {
+                if( h->mb.b_interlaced && !h->mb.field[h->mb.i_mb_xy-1] )
+                {
+                    h->mb.cache.topright_ref[l][0] = ref[h->mb.left_b8[0] + 1 + s8x8*0];
+                    h->mb.cache.topright_ref[l][1] = ref[h->mb.left_b8[0] + 1 + s8x8*1];
+                    h->mb.cache.topright_ref[l][2] = ref[h->mb.left_b8[1] + 1 + s8x8*0];
+                    CP32( h->mb.cache.topright_mv[l][0], mv[h->mb.left_b4[0] + 3 + s4x4*(left_index_table[12+0]+1)] );
+                    CP32( h->mb.cache.topright_mv[l][1], mv[h->mb.left_b4[0] + 3 + s4x4*(left_index_table[12+1]+1)] );
+                    CP32( h->mb.cache.topright_mv[l][2], mv[h->mb.left_b4[1] + 3 + s4x4*(left_index_table[12+2]+1)] );
+                }
+                else if( !h->mb.b_interlaced && h->mb.field[h->mb.i_mb_xy-1] )
+                {
+                    // Looking at the bottom field so always take the bottom macroblock of the pair.
+                    h->mb.cache.topright_ref[l][0] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table[12+4]];
+                    h->mb.cache.topright_ref[l][1] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table[12+4]];
+                    h->mb.cache.topright_ref[l][2] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table[12+5]];
+                    CP32( h->mb.cache.topright_mv[l][0], mv[h->mb.left_b4[0] + 3 + s4x4*4 + s4x4*left_index_table[12+0]] );
+                    CP32( h->mb.cache.topright_mv[l][1], mv[h->mb.left_b4[0] + 3 + s4x4*4 + s4x4*left_index_table[12+1]] );
+                    CP32( h->mb.cache.topright_mv[l][2], mv[h->mb.left_b4[0] + 3 + s4x4*4 + s4x4*left_index_table[12+2]] );
+                }
+            }
+        }
+
         if( h->param.b_cabac )
         {
             uint8_t (*mvd)[8][2] = h->mb.mvd[l];
@@ -1006,16 +1077,103 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
             else
                 M64( h->mb.cache.mvd[l][x264_scan8[0] - 8] ) = 0;

-            if( h->mb.i_neighbour & MB_LEFT )
+            if( h->mb.cache.ref[l][x264_scan8[0]-1] >= 0 )
             {
                 CP16( h->mb.cache.mvd[l][x264_scan8[0 ] - 1], mvd[left[0]][left_index_table[0]] );
                 CP16( h->mb.cache.mvd[l][x264_scan8[2 ] - 1], mvd[left[0]][left_index_table[1]] );
+            }
+            else
+            {
+                M16( h->mb.cache.mvd[l][x264_scan8[0]-1+0*8] ) = 0;
+                M16( h->mb.cache.mvd[l][x264_scan8[0]-1+1*8] ) = 0;
+            }
+            if( h->mb.cache.ref[l][x264_scan8[0]-1+2*8] >=0 )
+            {
                 CP16( h->mb.cache.mvd[l][x264_scan8[8 ] - 1], mvd[left[1]][left_index_table[2]] );
                 CP16( h->mb.cache.mvd[l][x264_scan8[10] - 1], mvd[left[1]][left_index_table[3]] );
             }
             else
-                for( int i = 0; i < 4; i++ )
-                    M16( h->mb.cache.mvd[l][x264_scan8[0]-1+i*8] ) = 0;
+            {
+                M16( h->mb.cache.mvd[l][x264_scan8[0]-1+2*8] ) = 0;
+                M16( h->mb.cache.mvd[l][x264_scan8[0]-1+3*8] ) = 0;
+            }
+        }
+
+        /* If motion vectors are cached from frame macroblocks but this
+         * macroblock is a field macroblock then the motion vector must be
+         * halved. Similarly, motion vectors from field macroblocks are doubled. */
+        if( h->sh.b_mbaff )
+        {
+#define MAP_MVS\
+                MAP_F2F(mv, ref, x264_scan8[0] - 1 - 1*8, h->mb.i_mb_topleft_xy)\
+                MAP_F2F(mv, ref, x264_scan8[0] + 0 - 1*8, top)\
+                MAP_F2F(mv, ref, x264_scan8[0] + 1 - 1*8, top)\
+                MAP_F2F(mv, ref, x264_scan8[0] + 2 - 1*8, top)\
+                MAP_F2F(mv, ref, x264_scan8[0] + 3 - 1*8, top)\
+                MAP_F2F(mv, ref, x264_scan8[0] + 4 - 1*8, h->mb.i_mb_topright_xy)\
+                MAP_F2F(mv, ref, x264_scan8[0] - 1 + 0*8, left[0])\
+                MAP_F2F(mv, ref, x264_scan8[0] - 1 + 1*8, left[0])\
+                MAP_F2F(mv, ref, x264_scan8[0] - 1 + 2*8, left[1])\
+                MAP_F2F(mv, ref, x264_scan8[0] - 1 + 3*8, left[1])\
+                MAP_F2F(topright_mv, topright_ref, 0, left[0])\
+                MAP_F2F(topright_mv, topright_ref, 1, left[0])\
+                MAP_F2F(topright_mv, topright_ref, 2, left[1])
+
+            if( h->mb.b_interlaced )
+            {
+#define MAP_F2F(varmv, varref, index, macroblock)\
+                if( h->mb.cache.varref[l][index] >= 0 && macroblock >= 0 && !h->mb.field[macroblock] )\
+                {\
+                    h->mb.cache.varref[l][index] <<= 1;\
+                    h->mb.cache.varmv[l][index][1] /= 2;\
+                    h->mb.cache.mvd[l][index][1] >>= 1;\
+                }
+                MAP_MVS
+#undef MAP_F2F
+            }
+            else
+            {
+#define MAP_F2F(varmv, varref, index, macroblock)\
+                if( h->mb.cache.varref[l][index] >= 0 && macroblock >= 0 && h->mb.field[macroblock] )\
+                {\
+                    h->mb.cache.varref[l][index] >>= 1;\
+                    h->mb.cache.varmv[l][index][1] <<= 1;\
+                    h->mb.cache.mvd[l][index][1] <<= 1;\
+                }
+                MAP_MVS
+#undef MAP_F2F
+            }
+        }
+    }
+
+    /* Check whether skip here would cause decoder to predict interlace mode incorrectly. */
+    h->mb.allow_skip = 1;
+    if( h->sh.b_mbaff && (mb_y&1) && IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride]) )
+    {
+        if( h->mb.i_neighbour & MB_LEFT )
+        {
+            if( h->mb.field[h->mb.i_mb_xy - 1] != h->mb.b_interlaced )
+                h->mb.allow_skip = 0;
+        }
+        else if( h->mb.i_neighbour & MB_TOP )
+        {
+            if( h->mb.field[h->mb.i_mb_top_xy] != h->mb.b_interlaced )
+                h->mb.allow_skip = 0;
+        }
+        else // Frame mb pair is predicted
+        {
+            if( h->mb.b_interlaced )
+                h->mb.allow_skip = 0;
+        }
+        if( !h->mb.allow_skip )
+        {
+            if( IS_SKIP(h->mb.i_type) )
+            {
+                if( h->mb.i_type == P_SKIP )
+                    h->mb.i_type = P_L0;
+                else if( h->mb.i_type == B_SKIP )
+                    h->mb.i_type = B_DIRECT;
+            }
         }
     }

@@ -1286,6 +1444,7 @@ static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int mb_x, int mb
             }
         }
         h->mc.store_interleave_8x8x2( &h->fdec->plane[1][i_pix_offset], i_stride2, h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2] );
+        h->mc.store_interleave_8x8x2( &h->fdec->plane_fld[1][i_pix_offset], i_stride2, h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2] );
         memcpy( intra_fdec,   h->mb.pic.p_fdec[1]+FDEC_STRIDE*7, 8*sizeof(pixel) );
         memcpy( intra_fdec+8, h->mb.pic.p_fdec[2]+FDEC_STRIDE*7, 8*sizeof(pixel) );
         twiddle_topleft_pixel( h->mb.pic.p_fdec[1]-FDEC_STRIDE-1, h->mb.pic.p_fdec[1]-FDEC_STRIDE+7, h->sh.b_mbaff );
@@ -1299,12 +1458,13 @@ static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int mb_x, int mb
             {
                 if( mb_x )
                 {
+                    // Take rightmost sample from top border of left mb to use as topleft here.
                     h->intra_diagonal_backup[1][0][mb_x*intra_diag_width] = h->intra_border_backup[1][0][(mb_x-1)*16+15];
                     h->intra_diagonal_backup[2][0][mb_x*intra_diag_width] = h->intra_border_backup[2][0][(mb_x-1)*16+15];
                     CP64( &h->intra_diagonal_backup[1][0][(mb_x-1)*intra_diag_width+1], h->mb.pic.p_fdec[0]+FDEC_STRIDE*15 );
                     CP64( &h->intra_diagonal_backup[2][0][(mb_x-1)*intra_diag_width+1], h->mb.pic.p_fdec[0]+FDEC_STRIDE*14 );
                 }
-                memcpy( &h->intra_border_backup[2][i][mb_x*16], h->mb.pic.p_fdec[0]+FDEC_STRIDE*14, 16*sizeof(pixel) );
+                memcpy( &h->intra_border_backup[2][0][mb_x*16], h->mb.pic.p_fdec[0]+FDEC_STRIDE*14, 16*sizeof(pixel) );
             }
             else
             {
@@ -1316,6 +1476,7 @@ static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int mb_x, int mb
         else
             twiddle_topleft_pixel( h->mb.pic.p_fdec[0]-FDEC_STRIDE-1, h->mb.pic.p_fdec[0]-FDEC_STRIDE+15, h->sh.b_mbaff );
         h->mc.copy[PIXEL_16x16]( &h->fdec->plane[0][i_pix_offset], i_stride2, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
+        h->mc.copy[PIXEL_16x16]( &h->fdec->plane_fld[0][i_pix_offset], i_stride2, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
         memcpy( intra_fdec, h->mb.pic.p_fdec[0]+FDEC_STRIDE*15, 16*sizeof(pixel) );
     }
 }
diff --git a/common/mc.c b/common/mc.c
index 76061c3..e594785 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -512,22 +512,43 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
 void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
 {
     const int b_interlaced = h->sh.b_mbaff;
-    const int stride = frame->i_stride[0] << b_interlaced;
+    int stride = frame->i_stride[0];
     const int width = frame->i_width[0];
-    int start = (mb_y*16 >> b_interlaced) - 8; // buffer = 4 for deblock + 3 for 6tap, rounded to 8
-    int height = ((b_end ? frame->i_lines[0] : mb_y*16) >> b_interlaced) + 8;
+    int start = mb_y*16 - 8; // buffer = 4 for deblock + 3 for 6tap, rounded to 8
+    int height = (b_end ? frame->i_lines[0] + 16 : (mb_y+b_interlaced)*16) + 8;
     int offs = start*stride - 8; // buffer = 3 for 6tap, aligned to 8 for simd

     if( mb_y & b_interlaced )
         return;

-    for( int y = 0; y <= b_interlaced; y++, offs += frame->i_stride[0] )
+    h->mc.hpel_filter(
+        frame->filtered[1] + offs,
+        frame->filtered[2] + offs,
+        frame->filtered[3] + offs,
+        frame->plane[0] + offs,
+        stride, width + 16, height - start,
+        h->scratch_buffer );
+
+    if( b_interlaced )
     {
+        /* MC must happen between pixels in the same field. */
+        stride = frame->i_stride[0] << 1;
+        start = (mb_y*16 >> 1) - 8;
+        height = ((b_end ? frame->i_lines[0] : mb_y*16) >> 1) + 8;
+        offs = start*stride - 8;
+        h->mc.hpel_filter(
+            frame->filtered_fld[1] + offs,
+            frame->filtered_fld[2] + offs,
+            frame->filtered_fld[3] + offs,
+            frame->plane_fld[0] + offs,
+            stride, width + 16, height - start,
+            h->scratch_buffer );
+        offs += frame->i_stride[0];
         h->mc.hpel_filter(
-            frame->filtered[1] + offs,
-            frame->filtered[2] + offs,
-            frame->filtered[3] + offs,
-            frame->plane[0] + offs,
+            frame->filtered_fld[1] + offs,
+            frame->filtered_fld[2] + offs,
+            frame->filtered_fld[3] + offs,
+            frame->plane_fld[0] + offs,
             stride, width + 16, height - start,
             h->scratch_buffer );
     }
diff --git a/common/mvpred.c b/common/mvpred.c
index c8efe1f..278e0ac 100644
--- a/common/mvpred.c
+++ b/common/mvpred.c
@@ -38,12 +38,33 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mv
     int     i_refc = h->mb.cache.ref[i_list][i8 - 8 + i_width];
     int16_t *mv_c  = h->mb.cache.mv[i_list][i8 - 8 + i_width];

+    // Partitions not yet reached in scan order are unavailable.
     if( (idx&3) >= 2 + (i_width&1) || i_refc == -2 )
     {
         i_refc = h->mb.cache.ref[i_list][i8 - 8 - 1];
         mv_c   = h->mb.cache.mv[i_list][i8 - 8 - 1];
-    }

+        if( h->param.b_interlaced
+            && h->mb.cache.ref[i_list][x264_scan8[0]-1] != -2
+            && h->mb.b_interlaced != h->mb.field[h->mb.i_mb_left_xy[0]] )
+        {
+            if( idx == 2 )
+            {
+                mv_c = h->mb.cache.topright_mv[i_list][0];
+                i_refc = h->mb.cache.topright_ref[i_list][0];
+            }
+            else if( idx == 8 )
+            {
+                mv_c = h->mb.cache.topright_mv[i_list][1];
+                i_refc = h->mb.cache.topright_ref[i_list][1];
+            }
+            else if( idx == 10 )
+            {
+                mv_c = h->mb.cache.topright_mv[i_list][2];
+                i_refc = h->mb.cache.topright_ref[i_list][2];
+            }
+        }
+    }
     if( h->mb.i_partition == D_16x8 )
     {
         if( idx == 0 )
diff --git a/common/x86/util.h b/common/x86/util.h
index 6544207..01e54f9 100644
--- a/common/x86/util.h
+++ b/common/x86/util.h
@@ -87,30 +87,6 @@ static ALWAYS_INLINE int x264_predictor_difference_mmxext( int16_t (*mvc)[2], in
     return sum;
 }

-#define x264_cabac_mvd_sum x264_cabac_mvd_sum_mmxext
-static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmxext(uint8_t *mvdleft, uint8_t *mvdtop)
-{
-    static const uint64_t pb_2    = 0x0202020202020202ULL;
-    static const uint64_t pb_32   = 0x2020202020202020ULL;
-    int amvd;
-    asm(
-        "movd         %1, %%mm0 \n"
-        "movd         %2, %%mm1 \n"
-        "paddb     %%mm1, %%mm0 \n"
-        "pxor      %%mm2, %%mm2 \n"
-        "movq      %%mm0, %%mm1 \n"
-        "pcmpgtb      %3, %%mm0 \n"
-        "pcmpgtb      %4, %%mm1 \n"
-        "psubb     %%mm0, %%mm2 \n"
-        "psubb     %%mm1, %%mm2 \n"
-        "movd      %%mm2, %0    \n"
-        :"=r"(amvd)
-        :"m"(M16( mvdleft )),"m"(M16( mvdtop )),
-         "m"(pb_2),"m"(pb_32)
-    );
-    return amvd;
-}
-
 #define x264_predictor_roundclip x264_predictor_roundclip_mmxext
 static void ALWAYS_INLINE x264_predictor_roundclip_mmxext( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
 {
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index a3fcd61..28609d8 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -609,9 +609,7 @@ void x264_macroblock_encode( x264_t *h )
         return;
     }

-    if( h->sh.b_mbaff
-        && h->mb.i_mb_xy == h->sh.i_first_mb + h->mb.i_mb_stride
-        && IS_SKIP(h->mb.type[h->sh.i_first_mb]) )
+    if( !h->mb.allow_skip )
     {
         /* The first skip is predicted to be a frame mb pair.
          * We don't yet support the aff part of mbaff, so force it to non-skip
--
1.7.4


From 88aa754aa9a2e3f2907f656e21439e7b93b8cfab Mon Sep 17 00:00:00 2001
From: Simon Horlick <simonhorlick@gmail.com>
Date: Wed, 16 Mar 2011 21:34:51 +0000
Subject: [PATCH 09/25] Copy deblocked pixels to other plane

---
 common/deblock.c |   15 +++++++++++++++
 1 files changed, 15 insertions(+), 0 deletions(-)

diff --git a/common/deblock.c b/common/deblock.c
index 0800461..52d410d 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -389,6 +389,21 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )

         #undef FILTER
     }
+
+    // FIXME: Don't copy the whole frame around.
+    int y = mb_y*16;
+    int start = mb_y == h->i_threadslice_start;
+    int last = mb_y == h->i_threadslice_end - (1 << h->sh.b_mbaff);
+    int height = last ? 32+4 : 32;
+    if( !start ) y -= 4; // Make sure to copy the above four rows of deblocked pixels.
+    for( int i = y; i < y+height; i++ )
+        memcpy( h->fdec->plane_fld[0] + i*stridey, h->fdec->plane[0] + i*stridey, h->mb.i_mb_width*16*sizeof(pixel) );
+
+    y = mb_y*8;
+    height = last ? 16+2 : 16;
+    if( !start ) y -=2;
+    for( int i = y; i < y+height; i++ )
+        memcpy( h->fdec->plane_fld[1] + i*strideuv, h->fdec->plane[1] + i*strideuv, h->mb.i_mb_width*16*sizeof(pixel) );
 }

 /* For deblock-aware RD.
--
1.7.4


From d42239c146bd2fc1417987ce1794fbf049796112 Mon Sep 17 00:00:00 2001
From: Simon Horlick <simonhorlick@gmail.com>
Date: Sun, 6 Feb 2011 22:58:39 +0000
Subject: [PATCH 10/25] Fix thread max mv check

---
 encoder/analyse.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/encoder/analyse.c b/encoder/analyse.c
index 87125c1..4f439d4 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -460,7 +460,7 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )

                 if( h->param.b_deterministic )
                     thread_mvy_range = h->param.analyse.i_mv_range_thread;
-                if( h->mb.b_interlaced )
+                if( h->sh.b_mbaff )
                     thread_mvy_range >>= 1;

                 x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
--
1.7.4


From 0ce16f54027e00ee0a9070e807eed3ca570325ad Mon Sep 17 00:00:00 2001
From: Simon Horlick <simonhorlick@gmail.com>
Date: Sun, 20 Feb 2011 15:31:55 +0000
Subject: [PATCH 11/25] Track what interlace decision the decoder is using

---
 common/common.h   |    1 +
 encoder/cabac.c   |   17 ++++++++++++++++-
 encoder/encoder.c |   11 +++++++++++
 3 files changed, 28 insertions(+), 1 deletions(-)

diff --git a/common/common.h b/common/common.h
index ef9b35a..bc14c10 100644
--- a/common/common.h
+++ b/common/common.h
@@ -604,6 +604,7 @@ struct x264_t
         int     intra_border_index;
         int     topleft_border_index;
         int     topright_border_index;
+        int     field_decoding_flag;

         /**** thread synchronization ends here ****/
         /* subsequent variables are either thread-local or constant,
diff --git a/encoder/cabac.c b/encoder/cabac.c
index 334318d..6138d06 100644
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
@@ -66,6 +66,21 @@ static inline void x264_cabac_mb_type_intra( x264_t *h, x264_cabac_t *cb, int i_
     }
 }

+static void x264_cabac_field_decoding_flag( x264_t *h, x264_cabac_t *cb ) /* Code h->mb.b_interlaced as one CABAC decision in context 70 + ctx, ctx in 0..2. */
+{
+    const int top = h->mb.i_mb_xy - 2*h->mb.i_mb_stride; /* index of the MB two rows above the current one */
+    int ctx = 0; /* context increment: one point per neighbour term below */
+    ctx += h->mb.field_decoding_flag & !!h->mb.i_mb_x; /* tracked flag counts only when the MB is not in column 0 */
+    ctx += (top >= 0 /* the MB two rows up must lie inside the frame, ... */
+            && h->mb.slice_table[top] == h->sh.i_first_mb /* ... belong to the current slice, ... */
+            && h->mb.field[top]); /* ... and have been stored as a field MB */
+
+    x264_cabac_encode_decision_noup( cb, 70 + ctx, h->mb.b_interlaced ); /* NOTE(review): "_noup" presumably skips the context-state update -- confirm in the cabac header */
+#if !RDO_SKIP_BS
+    h->mb.field_decoding_flag = h->mb.b_interlaced; /* remember the value just coded; compiled out when RDO_SKIP_BS is set */
+#endif
+}
+
 static void x264_cabac_mb_type( x264_t *h, x264_cabac_t *cb )
 {
     const int i_mb_type = h->mb.i_type;
@@ -73,7 +88,7 @@ static void x264_cabac_mb_type( x264_t *h, x264_cabac_t *cb )
     if( h->sh.b_mbaff &&
         (!(h->mb.i_mb_y & 1) || IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride])) )
     {
-        x264_cabac_encode_decision_noup( cb, 70 + h->mb.cache.i_neighbour_interlaced, h->mb.b_interlaced );
+        x264_cabac_field_decoding_flag( h, cb );
     }

     if( h->sh.i_type == SLICE_TYPE_I )
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 9f294d1..2525fec 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -1932,6 +1932,8 @@ static int x264_slice_write( x264_t *h )
     i_mb_x = h->sh.i_first_mb % h->mb.i_mb_width;
     i_skip = 0;

+    h->mb.field_decoding_flag = 0;
+
     while( (mb_xy = i_mb_x + i_mb_y * h->mb.i_mb_width) <= h->sh.i_last_mb )
     {
         int mb_spos = bs_pos(&h->out.bs) + x264_cabac_pos(&h->cabac);
@@ -1988,7 +1990,12 @@ static int x264_slice_write( x264_t *h )
                 x264_cabac_encode_terminal( &h->cabac );

             if( IS_SKIP( h->mb.i_type ) )
+            {
+                // FIXME: It might be better to change the interlace type
+                // rather than forcing a skip to be non-skip, but this would
+                // require modifying the already saved image data.
                 x264_cabac_mb_skip( h, 1 );
+            }
             else
             {
                 if( h->sh.i_type != SLICE_TYPE_I )
@@ -2145,6 +2152,10 @@ static int x264_slice_write( x264_t *h )
         {
             i_mb_y++;
             i_mb_x = 0;
+            if( h->sh.b_mbaff && i_mb_y > 0 )
+                h->mb.field_decoding_flag = h->mb.field[i_mb_x+(i_mb_y-1)*h->mb.i_mb_stride];
+            else
+                h->mb.field_decoding_flag = 0;
         }
     }
     h->out.nal[h->out.i_nal].i_last_mb = h->sh.i_last_mb;
--
1.7.4


From 02af876c83b5a6fc69a20c1928ad604069e533a7 Mon Sep 17 00:00:00 2001
From: Simon Horlick <simonhorlick@gmail.com>
Date: Sun, 20 Feb 2011 15:35:44 +0000
Subject: [PATCH 12/25] Disallow skip where interlace would be wrong

---
 common/macroblock.c |   27 +++++++++++++++++++--------
 1 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/common/macroblock.c b/common/macroblock.c
index a441981..4fe1f82 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -1148,6 +1148,17 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )

     /* Check whether skip here would cause decoder to predict interlace mode incorrectly. */
     h->mb.allow_skip = 1;
+    int prevmb = mb_x + h->mb.i_mb_stride*mb_y; /* ends up as the index of the MB visited just before this one */
+    if( mb_y&1 )
+        prevmb -= h->mb.i_mb_stride; /* odd row: the MB directly above */
+    else if( mb_x )
+        prevmb = (mb_y+1)*h->mb.i_mb_stride + mb_x - 1; /* even row: the MB one row down in the column to the left */
+    else
+        prevmb = (mb_y-1)*h->mb.i_mb_stride + h->mb.i_mb_width - 1; /* column 0: last column of the row above, not one past it */
+    if( prevmb >= 0 && h->mb.b_interlaced != h->mb.field_decoding_flag && IS_SKIP(h->mb.type[prevmb]) ) /* prevmb < 0: nothing precedes the first MB, so do not index type[] */
+    {
+        h->mb.allow_skip = 0;
+    }
     if( h->sh.b_mbaff && (mb_y&1) && IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride]) )
     {
         if( h->mb.i_neighbour & MB_LEFT )
@@ -1165,15 +1176,15 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
             if( h->mb.b_interlaced )
                 h->mb.allow_skip = 0;
         }
-        if( !h->mb.allow_skip )
+    }
+    if( !h->mb.allow_skip )
+    {
+        if( IS_SKIP(h->mb.i_type) )
         {
-            if( IS_SKIP(h->mb.i_type) )
-            {
-                if( h->mb.i_type == P_SKIP )
-                    h->mb.i_type = P_L0;
-                else if( h->mb.i_type == B_SKIP )
-                    h->mb.i_type = B_DIRECT;
-            }
+            if( h->mb.i_type == P_SKIP )
+                h->mb.i_type = P_L0;
+            else if( h->mb.i_type == B_SKIP )
+                h->mb.i_type = B_DIRECT;
         }
     }

--
1.7.4


From 2a15d908619a780aa5a0f8bd9c6b61ade5d52237 Mon Sep 17 00:00:00 2001
From: Simon Horlick <simonhorlick@gmail.com>
Date: Sun, 20 Feb 2011 15:36:29 +0000
Subject: [PATCH 13/25] CABAC encoding of skips

---
 common/common.h     |    2 ++
 common/macroblock.c |   29 +++++++++++++++++++++++++++++
 encoder/cabac.c     |   14 +++++++-------
 3 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/common/common.h b/common/common.h
index bc14c10..6d81496 100644
--- a/common/common.h
+++ b/common/common.h
@@ -605,6 +605,8 @@ struct x264_t
         int     topleft_border_index;
         int     topright_border_index;
         int     field_decoding_flag;
+        int     left_skip;
+        int     top_skip;

         /**** thread synchronization ends here ****/
         /* subsequent variables are either thread-local or constant,
diff --git a/common/macroblock.c b/common/macroblock.c
index 4fe1f82..c01320c 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -1188,6 +1188,35 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
         }
     }

+    if( h->param.b_cabac )
+    {
+        /* Neighbours here are calculated based on field_decoding_flag */
+        int left_xy, top_xy;
+        if( h->sh.b_mbaff )
+        {
+            int mb_xy = mb_x + (h->mb.i_mb_y&~1)*h->mb.i_mb_stride;
+            left_xy = mb_xy - 1;
+            if( (mb_y&1) && mb_x > 0 && h->mb.field_decoding_flag == h->mb.field[left_xy] )
+                left_xy += h->mb.i_mb_stride;
+            if( h->mb.field_decoding_flag )
+            {
+                top_xy = mb_xy - h->mb.i_mb_stride;
+                if( !(mb_y&1) && top_xy >= 0 && h->mb.slice_table[top_xy] == h->sh.i_first_mb && h->mb.field[top_xy] )
+                    top_xy -= h->mb.i_mb_stride;
+            }
+            else
+                top_xy = mb_x + (mb_y-1)*h->mb.i_mb_stride;
+        }
+        else
+        {
+            left_xy = h->mb.i_mb_xy - 1;
+            top_xy = h->mb.i_mb_xy - h->mb.i_mb_stride;
+        }
+
+        h->mb.left_skip = mb_x > 0 && h->mb.slice_table[left_xy] == h->sh.i_first_mb && !IS_SKIP( h->mb.type[left_xy] );
+        h->mb.top_skip = top_xy >= 0 && (h->mb.slice_table[top_xy] == h->sh.i_first_mb) && !IS_SKIP( h->mb.type[top_xy] );
+    }
+
     /* load skip */
     if( h->sh.i_type == SLICE_TYPE_B )
     {
diff --git a/encoder/cabac.c b/encoder/cabac.c
index 6138d06..3435048 100644
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
@@ -295,10 +295,10 @@ static void x264_cabac_mb_qp_delta( x264_t *h, x264_cabac_t *cb )
 #if !RDO_SKIP_BS
 void x264_cabac_mb_skip( x264_t *h, int b_skip )
 {
-    int ctx = ((h->mb.i_neighbour & MB_LEFT) && !IS_SKIP( h->mb.i_mb_type_left[0] ))
-            + ((h->mb.i_neighbour & MB_TOP) && !IS_SKIP( h->mb.i_mb_type_top ))
-            + (h->sh.i_type == SLICE_TYPE_P ? 11 : 24);
-    x264_cabac_encode_decision( &h->cabac, ctx, b_skip );
+    int ctx = h->mb.left_skip + h->mb.top_skip;
+    if( h->sh.i_type != SLICE_TYPE_P )
+       ctx += 13;
+    x264_cabac_encode_decision( &h->cabac, 11+ctx, b_skip );
 }
 #endif

@@ -350,7 +350,7 @@ static void x264_cabac_mb_ref( x264_t *h, x264_cabac_t *cb, int i_list, int idx
     const int i8 = x264_scan8[idx];
     const int i_refa = h->mb.cache.ref[i_list][i8 - 1];
     const int i_refb = h->mb.cache.ref[i_list][i8 - 8];
-    int ctx  = 0;
+    int ctx = 0;

     if( i_refa > 0 && !h->mb.cache.skip[i8 - 1] )
         ctx++;
@@ -423,9 +423,9 @@ static ALWAYS_INLINE int x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int
         x264_cabac_encode_bypass( cb, mvd < 0 );
     }
 #endif
-    /* Since we don't need to keep track of MVDs larger than 33, just cap the value.
+    /* Since we don't need to keep track of MVDs larger than 70, just cap the value.
      * This lets us store MVDs as 8-bit values instead of 16-bit. */
-    return X264_MIN( i_abs, 33 );
+    return X264_MIN( i_abs, 70 );
 }

 static NOINLINE uint16_t x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width )
--
1.7.4


From b49fa962fcd25984e5f65a36a2784fbe702bdb4c Mon Sep 17 00:00:00 2001
From: Simon Horlick <simonhorlick@gmail.com>
Date: Wed, 16 Mar 2011 21:18:59 +0000
Subject: [PATCH 14/25] Add mbaff deblock strength calculation

---
 common/deblock.c  |   38 ++++++++++++++++++++++++++++++++++++++
 common/frame.h    |    3 +++
 encoder/encoder.c |   12 +++++++++---
 3 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/common/deblock.c b/common/deblock.c
index 52d410d..55a0154 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -269,6 +269,44 @@ static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264
             }
     }
 }
+void deblock_strength_mbaff( x264_t *h, uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], /* Fill bs[dir][edge][i] with strength 0, 1 or 2 for the current MB. */
+                             int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4], int mvy_limit, int bframe ) /* nnz/ref/mv are indexed from X264_SCAN8_0; bframe enables the list-1 tests. */
+{
+    const int vertical = 0, horizontal = 1; /* names for the two values of dir used below */
+    int neighbour_field[2]; /* per dir: 1 if the neighbouring MB across edge 0 was stored as a field MB */
+    int current_field = h->mb.b_interlaced;
+    neighbour_field[vertical] = h->mb.i_mb_left_xy[0] >= 0 && h->mb.field[h->mb.i_mb_left_xy[0]]; /* dir 0 pairs with the left MB; a negative index counts as "not field" */
+    neighbour_field[horizontal] = h->mb.i_mb_top_xy >= 0 && h->mb.field[h->mb.i_mb_top_xy]; /* dir 1 pairs with the top MB */
+
+    for( int dir = 0; dir < 2; dir++ )
+    {
+        int edge_stride = dir ? 8 : 1; /* cache-index step from one edge to the next, and from q back to p */
+        int part_stride = dir ? 1 : 8; /* cache-index step between the four entries along one edge */
+        for( int edge = 0; edge < 4; edge++ )
+        {
+            for( int i = 0, q = X264_SCAN8_0+edge*edge_stride; i < 4; i++, q += part_stride )
+            {
+                int p = q - edge_stride; /* the entry on the other side of this edge */
+                if( nnz[q] || nnz[p] ) /* either side has a non-zero nnz entry */
+                {
+                    bs[dir][edge][i] = 2;
+                }
+                else if( (edge == 0 && current_field != neighbour_field[dir]) || /* edge 0 only: field/frame mode differs from the neighbour MB */
+                         ref[0][q] != ref[0][p] || /* list-0 reference differs */
+                         abs( mv[0][q][0] - mv[0][p][0] ) >= 4 || /* list-0 mv x differs by 4 or more */
+                         abs( mv[0][q][1] - mv[0][p][1] ) >= mvy_limit || /* list-0 mv y differs by the caller-supplied limit or more */
+                        (bframe && (ref[1][q] != ref[1][p] || /* the same three tests on list 1, only when bframe is set */
+                         abs( mv[1][q][0] - mv[1][p][0] ) >= 4 ||
+                         abs( mv[1][q][1] - mv[1][p][1] ) >= mvy_limit )) )
+                {
+                    bs[dir][edge][i] = 1;
+                }
+                else
+                    bs[dir][edge][i] = 0; /* no test fired */
+            }
+        }
+    }
+}

 static inline void deblock_edge( x264_t *h, pixel *pix, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
 {
diff --git a/common/frame.h b/common/frame.h
index 8fe0627..3296a2c 100644
--- a/common/frame.h
+++ b/common/frame.h
@@ -186,6 +186,9 @@ typedef struct
                                int bframe );
 } x264_deblock_function_t;

+void deblock_strength_mbaff( x264_t *h, uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+                             int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4], int mvy_limit, int bframe );
+
 x264_frame_t *x264_frame_new( x264_t *h, int b_fdec );
 void          x264_frame_delete( x264_frame_t *frame );

diff --git a/encoder/encoder.c b/encoder/encoder.c
index 2525fec..0319126 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -2129,14 +2129,20 @@ static int x264_slice_write( x264_t *h )
         /* calculate deblock strength values (actual deblocking is done per-row along with hpel) */
         if( b_deblock )
         {
-            int mvy_limit = 4 >> h->sh.b_mbaff;
+            int mvy_limit = 4 >> h->mb.b_interlaced;
             uint8_t (*bs)[4][4] = h->deblock_strength[h->mb.i_mb_y&1][h->mb.i_mb_x];
             x264_macroblock_cache_load_deblock( h );
             if( IS_INTRA( h->mb.type[h->mb.i_mb_xy] ) )
                 memset( bs, 3, 2*4*4*sizeof(uint8_t) );
             else
-                h->loopf.deblock_strength( h->mb.cache.non_zero_count, h->mb.cache.ref, h->mb.cache.mv,
-                                           bs, mvy_limit, h->sh.i_type == SLICE_TYPE_B );
+            {
+                if( h->sh.b_mbaff )
+                    deblock_strength_mbaff( h, h->mb.cache.non_zero_count, h->mb.cache.ref, h->mb.cache.mv,
+                                            bs, mvy_limit, h->sh.i_type == SLICE_TYPE_B );
+                else
+                    h->loopf.deblock_strength( h->mb.cache.non_zero_count, h->mb.cache.ref, h->mb.cache.mv,
+                                               bs, mvy_limit, h->sh.i_type == SLICE_TYPE_B );
+            }
         }

         x264_ratecontrol_mb( h, mb_size );
--
1.7.4


From 55bb59fb940208de76113875c0744694043a5f4d Mon Sep 17 00:00:00 2001
From: Simon Horlick <simonhorlick@gmail.com>
Date: Wed, 16 Mar 2011 21:27:07 +0000
Subject: [PATCH 15/25] Initial deblocking support

---
 common/deblock.c    |   13 +++++++--
 common/macroblock.c |   67 ++++++++++++++++++++++++++++++++++----------------
 2 files changed, 55 insertions(+), 25 deletions(-)

diff --git a/common/deblock.c b/common/deblock.c
index 55a0154..48788ae 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -345,13 +345,17 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
     int b_interlaced = h->sh.b_mbaff;
     int qp_thresh = 15 - X264_MIN( h->sh.i_alpha_c0_offset, h->sh.i_beta_offset ) - X264_MAX( 0, h->param.analyse.i_chroma_qp_offset );
     int stridey   = h->fdec->i_stride[0];
-    int stride2y  = stridey << b_interlaced;
     int strideuv  = h->fdec->i_stride[1];
-    int stride2uv = strideuv << b_interlaced;
+
+    // Backup mb.b_interlaced because it will be changed in x264_macroblock_cache_load_neighbours_deblock.
+    int interlaced_backup = h->mb.b_interlaced;

     for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
     {
         x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
+        h->mb.i_mb_x = mb_x;
+        h->mb.i_mb_y = mb_y;
+        h->mb.i_mb_xy = mb_x + h->mb.i_mb_stride*mb_y;
         x264_macroblock_cache_load_neighbours_deblock( h, mb_x, mb_y );

         int mb_xy = h->mb.i_mb_xy;
@@ -361,12 +365,14 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )

         pixel *pixy = h->fdec->plane[0] + 16*mb_y*stridey  + 16*mb_x;
         pixel *pixuv = h->fdec->plane[1] + 8*mb_y*strideuv + 16*mb_x;
-        if( mb_y & b_interlaced )
+        if( mb_y & h->mb.b_interlaced )
         {
             pixy -= 15*stridey;
             pixuv -= 7*strideuv;
         }

+        int stride2y  = stridey << h->mb.b_interlaced;
+        int stride2uv = strideuv << h->mb.b_interlaced;
         int qp = h->mb.qp[mb_xy];
         int qpc = h->chroma_qp_table[qp];
         int first_edge_only = h->mb.type[mb_xy] == P_SKIP || qp <= qp_thresh;
@@ -427,6 +433,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )

         #undef FILTER
     }
+    h->mb.b_interlaced = interlaced_backup;

     // FIXME: Don't copy the whole frame around.
     int y = mb_y*16;
diff --git a/common/macroblock.c b/common/macroblock.c
index c01320c..47bb2ff 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -1257,18 +1257,36 @@ void x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_
     int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2;

     h->mb.i_neighbour = 0;
-    h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
+
+    if( h->sh.b_mbaff )
+        h->mb.b_interlaced = h->mb.field[h->mb.i_mb_xy];
+
+    h->mb.i_mb_top_xy = h->mb.i_mb_xy - (h->mb.i_mb_stride << h->mb.b_interlaced);
+    h->mb.i_mb_left_xy[1] =
+    h->mb.i_mb_left_xy[0] = h->mb.i_mb_xy - 1;
+    if( h->sh.b_mbaff )
+    {
+        if( mb_y&1 )
+        {
+            if( h->mb.field[h->mb.i_mb_xy - 1] != h->mb.b_interlaced )
+                h->mb.i_mb_left_xy[0] -= h->mb.i_mb_stride;
+        }
+        else
+        {
+            if( h->mb.b_interlaced && !h->mb.field[h->mb.i_mb_top_xy] )
+                h->mb.i_mb_top_xy += h->mb.i_mb_stride;
+            if( h->mb.field[h->mb.i_mb_xy - 1] != h->mb.b_interlaced )
+                h->mb.i_mb_left_xy[1] += h->mb.i_mb_stride;
+        }
+    }

     if( mb_x > 0 )
     {
-        h->mb.i_mb_left_xy[0] = h->mb.i_mb_xy - 1;
         if( deblock_on_slice_edges || h->mb.slice_table[h->mb.i_mb_left_xy[0]] == h->mb.slice_table[h->mb.i_mb_xy] )
             h->mb.i_neighbour |= MB_LEFT;
     }
-
     if( mb_y > h->mb.b_interlaced )
     {
-        h->mb.i_mb_top_xy = h->mb.i_mb_xy - (h->mb.i_mb_stride << h->mb.b_interlaced);
         if( deblock_on_slice_edges || h->mb.slice_table[h->mb.i_mb_top_xy] == h->mb.slice_table[h->mb.i_mb_xy] )
             h->mb.i_neighbour |= MB_TOP;
     }
@@ -1276,7 +1294,7 @@ void x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_

 void x264_macroblock_cache_load_deblock( x264_t *h )
 {
-    if( IS_INTRA( h->mb.type[h->mb.i_mb_xy] ) )
+    if( !h->sh.b_mbaff && IS_INTRA( h->mb.type[h->mb.i_mb_xy] ) )
         return;

     /* If we have multiple slices and we're deblocking on slice edges, we
@@ -1291,7 +1309,7 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
         h->mb.i_neighbour &= ~old_neighbour;
         if( h->mb.i_neighbour )
         {
-            int top_y = mb_y - (1 << h->mb.b_interlaced);
+            int top_y = h->mb.i_mb_top_xy / h->mb.i_mb_stride;
             int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
             int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;
             int s8x8 = h->mb.i_b8_stride;
@@ -1330,17 +1348,15 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
                 i8 = x264_scan8[0] - 1;
                 if( h->mb.i_neighbour & MB_LEFT )
                 {
-                    int ir = h->mb.i_b8_xy - 1;
-                    int iv = h->mb.i_b4_xy - 1;
                     h->mb.cache.ref[l][i8+0*8] =
-                    h->mb.cache.ref[l][i8+1*8] = ref[ir + 0*s8x8];
+                    h->mb.cache.ref[l][i8+1*8] = ref[h->mb.left_b8[0] + 1 + s8x8*left_index_table[12+4]];
                     h->mb.cache.ref[l][i8+2*8] =
-                    h->mb.cache.ref[l][i8+3*8] = ref[ir + 1*s8x8];
+                    h->mb.cache.ref[l][i8+3*8] = ref[h->mb.left_b8[1] + 1 + s8x8*left_index_table[12+5]];

-                    CP32( h->mb.cache.mv[l][i8+0*8], mv[iv + 0*s4x4] );
-                    CP32( h->mb.cache.mv[l][i8+1*8], mv[iv + 1*s4x4] );
-                    CP32( h->mb.cache.mv[l][i8+2*8], mv[iv + 2*s4x4] );
-                    CP32( h->mb.cache.mv[l][i8+3*8], mv[iv + 3*s4x4] );
+                    CP32( h->mb.cache.mv[l][i8+0*8], mv[h->mb.left_b4[0] + 3 + s4x4*left_index_table[12+0]] );
+                    CP32( h->mb.cache.mv[l][i8+1*8], mv[h->mb.left_b4[0] + 3 + s4x4*left_index_table[12+1]] );
+                    CP32( h->mb.cache.mv[l][i8+2*8], mv[h->mb.left_b4[1] + 3 + s4x4*left_index_table[12+2]] );
+                    CP32( h->mb.cache.mv[l][i8+3*8], mv[h->mb.left_b4[1] + 3 + s4x4*left_index_table[12+3]] );
                 }
             }
         }
@@ -1380,7 +1396,7 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
     {
         uint8_t (*nnz)[24] = h->mb.non_zero_count;
         int top = h->mb.i_mb_top_xy;
-        int left = h->mb.i_mb_left_xy[0];
+        int *left = h->mb.i_mb_left_xy;

         if( (h->mb.i_neighbour & MB_TOP) && h->mb.mb_transform_size[top] )
         {
@@ -1391,15 +1407,22 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
             M16( &h->mb.cache.non_zero_count[i8+2] ) = nnz_top1 ? 0x0101 : 0;
         }

-        if( (h->mb.i_neighbour & MB_LEFT) && h->mb.mb_transform_size[left] )
+        if( h->mb.i_neighbour & MB_LEFT )
         {
+            // TODO: Merge code in deblock
             int i8 = x264_scan8[0] - 1;
-            int nnz_left0 = M16( &nnz[left][2] ) | M16( &nnz[left][6] );
-            int nnz_left1 = M16( &nnz[left][10] ) | M16( &nnz[left][14] );
-            h->mb.cache.non_zero_count[i8+8*0] = !!nnz_left0;
-            h->mb.cache.non_zero_count[i8+8*1] = !!nnz_left0;
-            h->mb.cache.non_zero_count[i8+8*2] = !!nnz_left1;
-            h->mb.cache.non_zero_count[i8+8*3] = !!nnz_left1;
+            if( h->mb.mb_transform_size[left[0]] )
+            {
+                int nnz_left0 = M16( &nnz[left[0]][2] ) | M16( &nnz[left[0]][6] );
+                h->mb.cache.non_zero_count[i8+8*0] = !!nnz_left0;
+                h->mb.cache.non_zero_count[i8+8*1] = !!nnz_left0;
+            }
+            if( h->mb.mb_transform_size[left[1]] )
+            {
+                int nnz_left1 = M16( &nnz[left[1]][10] ) | M16( &nnz[left[1]][14] );
+                h->mb.cache.non_zero_count[i8+8*2] = !!nnz_left1;
+                h->mb.cache.non_zero_count[i8+8*3] = !!nnz_left1;
+            }
         }

         if( h->mb.mb_transform_size[h->mb.i_mb_xy] )
--
1.7.4


From c1420786e639c96f7fd0b11bac69952edeabbafc Mon Sep 17 00:00:00 2001
From: Simon Horlick <simonhorlick@gmail.com>
Date: Thu, 17 Mar 2011 18:15:06 +0000
Subject: [PATCH 16/25] Calculate deblock strength for mbaff

---
 common/common.h     |    1 +
 common/deblock.c    |    1 +
 common/macroblock.c |    5 +++++
 3 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/common/common.h b/common/common.h
index 6d81496..7990cf0 100644
--- a/common/common.h
+++ b/common/common.h
@@ -844,6 +844,7 @@ struct x264_t
     pixel *intra_border_backup[3][2]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
     pixel *intra_diagonal_backup[5][3];
     uint8_t (*deblock_strength[2])[2][4][4];
+    uint8_t (*deblock_strength_mbaff[2])[2][8]; /* store [field][mb_x][dir][partition], there can be 8 different block strengths in mbaff left/top edges */

     /* CPU functions dependents */
     x264_predict_t      predict_16x16[4+3];
diff --git a/common/deblock.c b/common/deblock.c
index 48788ae..c1bb10a 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -362,6 +362,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
         int transform_8x8 = h->mb.mb_transform_size[h->mb.i_mb_xy];
         int intra_cur = IS_INTRA( h->mb.type[mb_xy] );
         uint8_t (*bs)[4][4] = h->deblock_strength[mb_y&1][mb_x];
+        uint8_t (*bs_mbaff)[8] = h->deblock_strength_mbaff[mb_y&1][mb_x];

         pixel *pixy = h->fdec->plane[0] + 16*mb_y*stridey  + 16*mb_x;
         pixel *pixuv = h->fdec->plane[1] + 8*mb_y*strideuv + 16*mb_x;
diff --git a/common/macroblock.c b/common/macroblock.c
index 47bb2ff..3a4c873 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -340,7 +340,9 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
         for( int i = 0; i <= h->param.b_interlaced; i++ )
         {
             CHECKED_MALLOC( h->deblock_strength[i], sizeof(**h->deblock_strength) * h->mb.i_mb_width );
+            CHECKED_MALLOC( h->deblock_strength_mbaff[i], sizeof(**h->deblock_strength_mbaff) * h->mb.i_mb_width );
             h->deblock_strength[1] = h->deblock_strength[i];
+            h->deblock_strength_mbaff[1] = h->deblock_strength_mbaff[i];
         }
     }

@@ -372,7 +374,10 @@ void x264_macroblock_thread_free( x264_t *h, int b_lookahead )
     if( !b_lookahead )
     {
         for( int i = 0; i <= h->param.b_interlaced; i++ )
+        {
             x264_free( h->deblock_strength[i] );
+            x264_free( h->deblock_strength_mbaff[i] );
+        }
         for( int i = 0; i <= 2*h->param.b_interlaced; i++ )
         {
             for( int j = 0; j < 2; j++ )
--
1.7.4


From 6cda0dc785db073de2c496c93e412bdb54c37a2d Mon Sep 17 00:00:00 2001
From: Simon Horlick <simonhorlick@gmail.com>
Date: Wed, 16 Mar 2011 22:06:27 +0000
Subject: [PATCH 17/25] Left edge deblocking

---
 common/deblock.c    |  181 ++++++++++++++++++++++++++++++++++++++++++++++++--
 common/macroblock.c |   63 ++++++++++++++++++
 2 files changed, 236 insertions(+), 8 deletions(-)

diff --git a/common/deblock.c b/common/deblock.c
index c1bb10a..be96fc8 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -118,6 +118,128 @@ static inline void deblock_luma_c( pixel *pix, int xstride, int ystride, int alp
         }
     }
 }
+
+static inline void deblock_v_luma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
+{
+    /* Filters 8 rows, one per `stride` step; tc0[d>>1] supplies one clip value per pair of rows. */
+    for( int d = 0; d < 8; d++, pix += stride )
+    {
+        int p2 = pix[-3];
+        int p1 = pix[-2];
+        int p0 = pix[-1];
+        int q0 = pix[ 0];
+        int q1 = pix[ 1];
+        int q2 = pix[ 2];
+        /* tc0 < 0 means "do not filter" (bS == 0); without this check the clips below run with min > max. */
+        if( tc0[d>>1] >= 0 && abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
+        {
+            int tc = tc0[d>>1];
+            int delta;
+            if( abs( p2 - p0 ) < beta )
+            {
+                if( tc0[d>>1] )
+                    pix[-2] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[d>>1], tc0[d>>1] );
+                tc++;
+            }
+            if( abs( q2 - q0 ) < beta )
+            {
+                if( tc0[d>>1] )
+                    pix[ 1] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[d>>1], tc0[d>>1] );
+                tc++;
+            }
+
+            delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
+            pix[-1] = x264_clip_pixel( p0 + delta );    /* p0' */
+            pix[ 0] = x264_clip_pixel( q0 - delta );    /* q0' */
+        }
+    }
+}
+
+static inline void deblock_v_luma_intra_mbaff_c( pixel *pix, int stride, int alpha, int beta )
+{
+    /* Filters 8 rows, one per `stride` step; the edge lies between pix[-1] (p0) and pix[0] (q0). */
+    for( int d = 0; d < 8; d++, pix += stride )
+    {
+        /* Load three samples on each side of the edge before any of them is modified. */
+        int p2 = pix[-3];
+        int p1 = pix[-2];
+        int p0 = pix[-1];
+        int q0 = pix[ 0];
+        int q1 = pix[ 1];
+        int q2 = pix[ 2];
+
+        if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
+        {
+            if(abs( p0 - q0 ) < ((alpha >> 2) + 2) ) /* small step across the edge: up to 3 samples per side may change */
+            {
+                if( abs( p2 - p0 ) < beta ) /* p0', p1', p2' */
+                {
+                    const int p3 = pix[-4];
+                    pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
+                    pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
+                    pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
+                }
+                else /* p0' */
+                    pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+                if( abs( q2 - q0 ) < beta ) /* q0', q1', q2' */
+                {
+                    const int q3 = pix[3];
+                    pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
+                    pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
+                    pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
+                }
+                else /* q0' */
+                    pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+            }
+            else /* p0', q0' only: the same 3-tap form as the single-sample branches above */
+            {
+                pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+                pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+            }
+        }
+    }
+}
+
+static inline void deblock_v_chroma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
+{   /* Filters p0/q0 on 4 rows (one per `stride` step), using one tc0 entry per row. */
+    const int xstride = 2; /* neighbouring samples are 2 apart; presumably interleaved U/V, since callers pass pixuv and pixuv + 1 */
+    for( int i = 0; i < 4; i++, pix += stride )
+    {
+        /* We don't worry about p2 or q2 */
+        int tc = tc0[i];
+        int p1 = pix[xstride*-2];
+        int p0 = pix[xstride*-1];
+        int q0 = pix[xstride* 0];
+        int q1 = pix[xstride* 1];
+
+        if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
+        {
+            int delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); /* tc == 0 leaves both samples unchanged */
+            pix[xstride*-1] = x264_clip_pixel( p0 + delta );    /* p0' */
+            pix[xstride* 0] = x264_clip_pixel( q0 - delta );    /* q0' */
+        }
+    }
+}
+
+static inline void deblock_v_chroma_intra_mbaff_c( pixel *pix, int stride, int alpha, int beta )
+{   /* Intra variant: replaces p0/q0 on 4 rows (one per `stride` step) with a fixed 3-tap average, no tc clipping. */
+    const int xstride = 2; /* neighbouring samples are 2 apart; presumably interleaved U/V, since callers pass pixuv and pixuv + 1 */
+    for( int i = 0; i < 4; i++, pix += stride )
+    {
+        int p1 = pix[xstride*-2];
+        int p0 = pix[xstride*-1];
+        int q0 = pix[xstride* 0];
+        int q1 = pix[xstride* 1];
+
+        if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
+        {
+            /* p0', q0' */
+            pix[xstride*-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+            pix[xstride* 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+        }
+    }
+}
+
 static void deblock_v_luma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
 {
     deblock_luma_c( pix, stride, 1, alpha, beta, tc0 );
@@ -392,16 +514,59 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )

         if( h->mb.i_neighbour & MB_LEFT )
         {
-            int qpl = h->mb.qp[h->mb.i_mb_left_xy[0]];
-            int qp_left = (qp + qpl + 1) >> 1;
-            int qpc_left = (h->chroma_qp_table[qp] + h->chroma_qp_table[qpl] + 1) >> 1;
-            int intra_left = IS_INTRA( h->mb.type[h->mb.i_mb_left_xy[0]] );
-            if( intra_cur || intra_left )
-                FILTER( _intra, 0, 0, qp_left, qpc_left );
+            if( b_interlaced && h->mb.field[h->mb.i_mb_left_xy[0]] != h->mb.b_interlaced )
+            {
+                int luma_qp[2];
+                int chroma_qp[2];
+                int left_qp[2];
+                int current_qp = h->mb.qp[mb_xy];
+                left_qp[0] = h->mb.qp[h->mb.i_mb_left_xy[0]];
+                luma_qp[0] = (current_qp + left_qp[0] + 1) >> 1;
+                chroma_qp[0] = (h->chroma_qp_table[current_qp] + h->chroma_qp_table[left_qp[0]] + 1) >> 1;
+                if( bs_mbaff[0][0] == 4)
+                {
+                    deblock_edge_intra( h, pixy, 2*stridey, bs_mbaff[0], luma_qp[0], 0, deblock_v_luma_intra_mbaff_c );
+                    deblock_edge_intra( h, pixuv, 2*strideuv, bs_mbaff[0], chroma_qp[0], 1, deblock_v_chroma_intra_mbaff_c );
+                    deblock_edge_intra( h, pixuv + 1, 2*strideuv, bs_mbaff[0], chroma_qp[0], 1, deblock_v_chroma_intra_mbaff_c );
+                }
+                else
+                {
+                    deblock_edge( h, pixy, 2*stridey, bs_mbaff[0], luma_qp[0], 0, deblock_v_luma_mbaff_c );
+                    deblock_edge( h, pixuv, 2*strideuv, bs_mbaff[0], chroma_qp[0], 1, deblock_v_chroma_mbaff_c );
+                    deblock_edge( h, pixuv + 1, 2*strideuv, bs_mbaff[0], chroma_qp[0], 1, deblock_v_chroma_mbaff_c );
+                }
+
+                int offy = h->mb.b_interlaced ? 4 : 0;
+                int offuv = h->mb.b_interlaced ? 3 : 0;
+                left_qp[1] = h->mb.qp[h->mb.i_mb_left_xy[1]];
+                luma_qp[1] = (current_qp + left_qp[1] + 1) >> 1;
+                chroma_qp[1] = (h->chroma_qp_table[current_qp] + h->chroma_qp_table[left_qp[1]] + 1) >> 1;
+                if( bs_mbaff[0][4] == 4)
+                {
+                    deblock_edge_intra( h, pixy + (stridey<<offy), 2*stridey, bs_mbaff[0]+4, luma_qp[1], 0, deblock_v_luma_intra_mbaff_c );
+                    deblock_edge_intra( h, pixuv + (strideuv<<offuv), 2*strideuv, bs_mbaff[0]+4, chroma_qp[1], 1, deblock_v_chroma_intra_mbaff_c );
+                    deblock_edge_intra( h, pixuv + 1 + (strideuv<<offuv), 2*strideuv, bs_mbaff[0]+4, chroma_qp[1], 1, deblock_v_chroma_intra_mbaff_c );
+                }
+                else
+                {
+                    deblock_edge( h, pixy + (stridey<<offy), 2*stridey, bs_mbaff[0]+4, luma_qp[1], 0, deblock_v_luma_mbaff_c );
+                    deblock_edge( h, pixuv + (strideuv<<offuv), 2*strideuv, bs_mbaff[0]+4, chroma_qp[1], 1, deblock_v_chroma_mbaff_c );
+                    deblock_edge( h, pixuv + 1 + (strideuv<<offuv), 2*strideuv, bs_mbaff[0]+4, chroma_qp[1], 1, deblock_v_chroma_mbaff_c );
+                }
+            }
             else
-                FILTER(       , 0, 0, qp_left, qpc_left );
-        }
+            {
+                int qpl = h->mb.qp[h->mb.i_mb_xy-1];
+                int qp_left = (qp + qpl + 1) >> 1;
+                int qpc_left = (h->chroma_qp_table[qp] + h->chroma_qp_table[qpl] + 1) >> 1;
+                int intra_left = IS_INTRA( h->mb.type[h->mb.i_mb_xy-1] );

+                if( intra_cur || intra_left ) // bs=4
+                    FILTER( _intra, 0, 0, qp_left, qpc_left );
+                else
+                    FILTER(       , 0, 0, qp_left, qpc_left );
+            }
+        }
         if( !first_edge_only )
         {
             if( !transform_8x8 ) FILTER( , 0, 1, qp, qpc );
diff --git a/common/macroblock.c b/common/macroblock.c
index 3a4c873..1d72fe8 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -1445,6 +1445,69 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
             M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*3] ) = nnzbot;
         }
     }
+
+    int mb_x = h->mb.i_mb_x;
+    int mb_y = h->mb.i_mb_y;
+    int mb_xy = h->mb.i_mb_xy;
+
+    // left is wrong without this here
+    x264_macroblock_cache_load_neighbours_deblock( h, mb_x, mb_y );
+
+    uint8_t (*bs_mbaff)[8] = h->deblock_strength_mbaff[h->mb.i_mb_y&1][h->mb.i_mb_x];
+
+    int intra_cur = IS_INTRA( h->mb.type[mb_xy] );
+
+    if( h->mb.i_neighbour & MB_LEFT )
+    {
+        if( h->sh.b_mbaff && h->mb.field[h->mb.i_mb_left_xy[0]] != h->mb.b_interlaced )
+        {
+            static const uint8_t offset[2][2][8] = {
+                {   { 0, 0, 0, 0, 1, 1, 1, 1 },
+                    { 2, 2, 2, 2, 3, 3, 3, 3 }, },
+                {   { 0, 1, 2, 3, 0, 1, 2, 3 },
+                    { 0, 1, 2, 3, 0, 1, 2, 3 }, }
+            };
+            uint8_t bS[8];
+
+            if( intra_cur )
+                bS[0] = bS[1] = bS[2] = bS[3] =
+                bS[4] = bS[5] = bS[6] = bS[7] = 4;
+            else
+            {
+                const uint8_t *off = offset[h->mb.b_interlaced][mb_y&1];
+                uint8_t (*nnz)[24] = h->mb.non_zero_count;
+
+                for( int i=0; i<8; i++ )
+                {
+                    int left = h->mb.i_mb_left_xy[h->mb.b_interlaced ? i>>2 : i&1];
+                    int nnz_this = h->mb.cache.non_zero_count[x264_scan8[0]+8*(i>>1)];
+                    int nnz_left = nnz[left][3 + 4*off[i]];
+                    if( !h->param.b_cabac && h->pps->b_transform_8x8_mode )
+                    {
+                        int j = off[i]&~1;
+                        if( h->mb.mb_transform_size[left] )
+                            nnz_left = !!(M16( &nnz[left][2+4*j] ) | M16( &nnz[left][2+4*(1+j)] ));
+                    }
+                    if( IS_INTRA( h->mb.type[left] ) )
+                        bS[i] = 4;
+                    else if( nnz_left || nnz_this )
+                        bS[i] = 2;
+                    else // As left is different interlaced.
+                        bS[i] = 1;
+                }
+            }
+
+            if( h->mb.b_interlaced )
+            {
+                for( int i=0; i<8; i++ ) bs_mbaff[0][i] = bS[i];
+            }
+            else
+            {
+                for( int i=0; i<4; i++ ) bs_mbaff[0][i]   = bS[2*i];
+                for( int i=0; i<4; i++ ) bs_mbaff[0][i+4] = bS[1+2*i];
+            }
+        }
+    }
 }

 static void ALWAYS_INLINE twiddle_topleft_pixel( pixel *dst, pixel *src, int b_interlaced )
--
1.7.4


From 374cba76623f48c2084b89c471e5fdf7951dfc76 Mon Sep 17 00:00:00 2001
From: Simon Horlick <simonhorlick@gmail.com>
Date: Wed, 16 Mar 2011 21:24:42 +0000
Subject: [PATCH 18/25] Top edge deblocking

---
 common/deblock.c    |   41 ++++++++++++++++++++++++++-------
 common/macroblock.c |   63 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 95 insertions(+), 9 deletions(-)

diff --git a/common/deblock.c b/common/deblock.c
index be96fc8..de9d9fb 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -576,17 +576,40 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )

         if( h->mb.i_neighbour & MB_TOP )
         {
-            int qpt = h->mb.qp[h->mb.i_mb_top_xy];
-            int qp_top = (qp + qpt + 1) >> 1;
-            int qpc_top = (h->chroma_qp_table[qp] + h->chroma_qp_table[qpt] + 1) >> 1;
-            int intra_top = IS_INTRA( h->mb.type[h->mb.i_mb_top_xy] );
-            if( ~b_interlaced & (intra_cur | intra_top) )
-                FILTER( _intra, 1, 0, qp_top, qpc_top );
+            if( b_interlaced && !(mb_y&1) && !h->mb.b_interlaced && h->mb.field[h->mb.i_mb_top_xy] )
+            {
+                /* Need to filter both fields (even for frame macroblocks) */
+                /* Filter top two rows using the top and then bottom macroblocks of the above pair. */
+                int mbn_xy = mb_xy - 2 * h->mb.i_mb_stride;
+
+                for(int j=0; j<2; j++, mbn_xy += h->mb.i_mb_stride)
+                {
+                    int qpt = h->mb.qp[mbn_xy];
+                    int qp_top = (qp + qpt + 1) >> 1;
+                    int qpc_top = (h->chroma_qp_table[qp] + h->chroma_qp_table[qpt] + 1) >> 1;
+
+                    deblock_edge( h, pixy      + j*stridey,  2* stridey, bs_mbaff[1]+4*j, qp_top,  0, deblock_v_luma_c );
+                    deblock_edge( h, pixuv     + j*strideuv, 2*strideuv, bs_mbaff[1]+4*j, qpc_top, 1, deblock_v_chroma_c );
+                }
+            }
             else
             {
-                if( intra_top )
-                    M32( bs[1][0] ) = 0x03030303;
-                FILTER(       , 1, 0, qp_top, qpc_top );
+                int qpt = h->mb.qp[h->mb.i_mb_top_xy];
+                int qp_top = (qp + qpt + 1) >> 1;
+                int qpc_top = (h->chroma_qp_table[qp] + h->chroma_qp_table[qpt] + 1) >> 1;
+                int intra_top = IS_INTRA( h->mb.type[h->mb.i_mb_top_xy] );
+
+                if( (!b_interlaced || (!h->mb.b_interlaced && !h->mb.field[h->mb.i_mb_top_xy]))
+                    && (intra_cur || intra_top) )
+                {
+                    FILTER( _intra, 1, 0, qp_top, qpc_top );
+                }
+                else
+                {
+                    if( intra_top )
+                        M32( bs[1][0] ) = 0x03030303;
+                    FILTER(       , 1, 0, qp_top, qpc_top );
+                }
             }
         }

diff --git a/common/macroblock.c b/common/macroblock.c
index 1d72fe8..0da5958 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -1508,6 +1508,69 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
             }
         }
     }
+
+    if( h->mb.i_neighbour & MB_TOP )
+    {
+        if( h->sh.b_mbaff && !(mb_y&1) && !h->mb.b_interlaced && h->mb.field[h->mb.i_mb_top_xy] )
+        {
+            /* Need to filter both fields (even for frame macroblocks) */
+            /* Filter top two rows using the top and then bottom macroblocks of the above pair. */
+            int mbn_xy = mb_xy - 2 * h->mb.i_mb_stride;
+
+            for(int j=0; j<2; j++, mbn_xy += h->mb.i_mb_stride)
+            {
+                int mbn_intra = IS_INTRA( h->mb.type[mbn_xy] );
+                uint8_t (*nnz)[24] = h->mb.non_zero_count;
+
+                uint32_t nnz_top[4];
+                uint32_t nnz_cur[4];
+                nnz_top[0] = nnz[mbn_xy][3*4+0];
+                nnz_top[1] = nnz[mbn_xy][3*4+1];
+                nnz_top[2] = nnz[mbn_xy][3*4+2];
+                nnz_top[3] = nnz[mbn_xy][3*4+3];
+                nnz_cur[0] = h->mb.cache.non_zero_count[x264_scan8[0]+0];
+                nnz_cur[1] = h->mb.cache.non_zero_count[x264_scan8[0]+1];
+                nnz_cur[2] = h->mb.cache.non_zero_count[x264_scan8[0]+2];
+                nnz_cur[3] = h->mb.cache.non_zero_count[x264_scan8[0]+3];
+
+                /* Munge NNZ for cavlc + 8x8dct */
+                if( !h->param.b_cabac && h->pps->b_transform_8x8_mode )
+                {
+                    int top = mbn_xy;
+                    if( (h->mb.i_neighbour & MB_TOP) && h->mb.mb_transform_size[top] )
+                    {
+                        int nnz_top0 = M16( &nnz[top][8] ) | M16( &nnz[top][12] );
+                        int nnz_top1 = M16( &nnz[top][10] ) | M16( &nnz[top][14] );
+                        nnz_top[0] = nnz_top[1] = nnz_top0 ? 0x0101 : 0;
+                        nnz_top[2] = nnz_top[3] = nnz_top1 ? 0x0101 : 0;
+                    }
+                    if( h->mb.mb_transform_size[h->mb.i_mb_xy] )
+                    {
+                        int nnz0 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
+                        int nnz1 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 4]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 6]] );
+                        nnz_cur[0] = nnz_cur[1] = !!nnz0;
+                        nnz_cur[2] = nnz_cur[3] = !!nnz1;
+                    }
+                }
+
+                uint8_t bS[4];
+                if( intra_cur || mbn_intra )
+                    bS[0] = bS[1] = bS[2] = bS[3] = 3;
+                else
+                {
+                    for( int i = 0; i < 4; i++ )
+                    {
+                        if( nnz_cur[i] || nnz_top[i] )
+                            bS[i] = 2;
+                        else
+                            bS[i] = 1;
+                    }
+                }
+                for( int i=0; i<4; i++ )
+                    bs_mbaff[1][i+4*j] = bS[i];
+            }
+        }
+    }
 }

 static void ALWAYS_INLINE twiddle_topleft_pixel( pixel *dst, pixel *src, int b_interlaced )
--
1.7.4


From 388145f81f75bfcd549335774457ad7d7fcd475f Mon Sep 17 00:00:00 2001
From: Simon Horlick <simonhorlick@gmail.com>
Date: Tue, 15 Mar 2011 01:39:49 +0000
Subject: [PATCH 19/25] Use both left macroblocks for ref_idx calculation

---
 common/macroblock.c |   58 ++++++++++++++++++++++++++++++++++++++++++++------
 1 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/common/macroblock.c b/common/macroblock.c
index 0da5958..5b42966 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -1229,14 +1229,58 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
         h->mb.dist_scale_factor = h->mb.dist_scale_factor_buf[h->mb.b_interlaced&(mb_y&1)];
         if( h->param.b_cabac )
         {
-            uint8_t skipbp;
             x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
-            skipbp = (h->mb.i_neighbour & MB_LEFT) ? h->mb.skipbp[left[0]] : 0;
-            h->mb.cache.skip[x264_scan8[0] - 1] = skipbp & 0x2;
-            h->mb.cache.skip[x264_scan8[8] - 1] = skipbp & 0x8;
-            skipbp = (h->mb.i_neighbour & MB_TOP) ? h->mb.skipbp[top] : 0;
-            h->mb.cache.skip[x264_scan8[0] - 8] = skipbp & 0x4;
-            h->mb.cache.skip[x264_scan8[4] - 8] = skipbp & 0x8;
+
+            if( h->mb.i_neighbour & MB_LEFT )
+            {
+                if( h->mb.skipbp[left[0]] == 0xf )
+                    h->mb.cache.skip[x264_scan8[0] - 1] = 1;
+                else if( h->mb.partition[left[0]] == D_8x8 )
+                {
+                    int off = 1+(left_index_table[18]&~1);
+                    h->mb.cache.skip[x264_scan8[0] - 1] = (h->mb.skipbp[left[0]] >> off) & 1;
+                }
+                else
+                    h->mb.cache.skip[x264_scan8[0] - 1] = h->mb.skipbp[left[0]] & 0x2;
+
+                if( h->mb.skipbp[left[1]] == 0xf )
+                    h->mb.cache.skip[x264_scan8[8] - 1] = 1;
+                else if( h->mb.partition[left[1]] == D_8x8 )
+                {
+                    int off = 1+(left_index_table[20]&~1);
+                    h->mb.cache.skip[x264_scan8[8] - 1] = (h->mb.skipbp[left[1]] >> off) & 1;
+                }
+                else
+                    h->mb.cache.skip[x264_scan8[8] - 1] = h->mb.skipbp[left[1]] & 0x8;
+            }
+            else
+            {
+                h->mb.cache.skip[x264_scan8[0] - 1 + 0*8] = 0;
+                h->mb.cache.skip[x264_scan8[0] - 1 + 1*8] = 0;
+                h->mb.cache.skip[x264_scan8[0] - 1 + 2*8] = 0;
+                h->mb.cache.skip[x264_scan8[0] - 1 + 3*8] = 0;
+            }
+
+            if( h->mb.i_neighbour & MB_TOP )
+            {
+                if( h->mb.skipbp[top] == 0xf )
+                {
+                    h->mb.cache.skip[x264_scan8[0] - 8] = 1;
+                    h->mb.cache.skip[x264_scan8[4] - 8] = 1;
+                }
+                else if( h->mb.partition[top] == D_8x8 )
+                {
+                    h->mb.cache.skip[x264_scan8[0] - 8] = h->mb.skipbp[top] & 0x4;
+                    h->mb.cache.skip[x264_scan8[4] - 8] = h->mb.skipbp[top] & 0x8;
+                }
+                else
+                {
+                    h->mb.cache.skip[x264_scan8[0] - 8] = 0;
+                    h->mb.cache.skip[x264_scan8[4] - 8] = 0;
+                }
+            }
+            else
+                M32( &h->mb.cache.skip[x264_scan8[0] - 8] ) = 0;
         }
     }

--
1.7.4


From e30f270b1830c3fc2b12014aa85d6dd959e3c4d3 Mon Sep 17 00:00:00 2001
From: Simon Horlick <simonhorlick@gmail.com>
Date: Tue, 15 Mar 2011 01:14:16 +0000
Subject: [PATCH 20/25] Fix min/max mv calculation

---
 common/macroblock.c |    5 +++++
 encoder/analyse.c   |   12 ++++++++----
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/common/macroblock.c b/common/macroblock.c
index 5b42966..00e9403 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -841,6 +841,11 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )

     const int *left_index_table = h->mb.left_index_table;

+    int my = h->mb.i_mb_y >> h->mb.b_interlaced;
+    int mb_height = h->mb.i_mb_height >> h->mb.b_interlaced;
+    h->mb.mv_min[1] = 4*( -16*my - 24 );
+    h->mb.mv_max[1] = 4*( 16*( mb_height - my - 1 ) + 24 );
+
     /* load cache */
     if( h->mb.i_neighbour & MB_TOP )
     {
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 4f439d4..d004c66 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -443,13 +443,11 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
         h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
         if( h->mb.i_mb_x == 0 )
         {
-            int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
-            int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
             int thread_mvy_range = i_fmv_range;

             if( h->i_thread_frames > 1 )
             {
-                int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
+                int pix_y = (h->mb.i_mb_y | h->sh.b_mbaff) * 16;
                 int thresh = pix_y + h->param.analyse.i_mv_range_thread;
                 for( int i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
                     for( int j = 0; j < h->i_ref[i]; j++ )
@@ -465,7 +463,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )

                 x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
             }
-
+            int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
+            int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
             h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
             h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
             h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
@@ -473,6 +472,11 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
             h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
             h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
             h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
+
+            mb_y = h->mb.i_mb_y >> h->mb.b_interlaced;
+            mb_height = h->sps->i_mb_height >> h->mb.b_interlaced;
+            h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
+            h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
         }
 #undef CLIP_FMV

--
1.7.4


From 962a6ce9a008d7239c11920635e570903bed7a85 Mon Sep 17 00:00:00 2001
From: Simon Horlick <simonhorlick@gmail.com>
Date: Tue, 15 Mar 2011 01:15:06 +0000
Subject: [PATCH 21/25] Calculate bipred POCs correctly

---
 common/common.h     |    4 +-
 common/macroblock.c |   67 ++++++++++++++++++++++++++-------------------------
 2 files changed, 36 insertions(+), 35 deletions(-)

diff --git a/common/common.h b/common/common.h
index 7990cf0..c7670e7 100644
--- a/common/common.h
+++ b/common/common.h
@@ -756,9 +756,9 @@ struct x264_t
         int     i_chroma_lambda2_offset;

         /* B_direct and weighted prediction */
-        int16_t dist_scale_factor_buf[2][X264_REF_MAX*2][4];
+        int16_t dist_scale_factor_buf[2][2][X264_REF_MAX*2][4];
         int16_t (*dist_scale_factor)[4];
-        int8_t bipred_weight_buf[2][X264_REF_MAX*2][4];
+        int8_t bipred_weight_buf[2][2][X264_REF_MAX*2][4];
         int8_t (*bipred_weight)[4];
         /* maps fref1[0]'s ref indices into the current list0 */
 #define map_col_to_list0(col) h->mb.map_col_to_list0[(col)+2]
diff --git a/common/macroblock.c b/common/macroblock.c
index 00e9403..f6406fe 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -1230,8 +1230,8 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
     /* load skip */
     if( h->sh.i_type == SLICE_TYPE_B )
     {
-        h->mb.bipred_weight = h->mb.bipred_weight_buf[h->mb.b_interlaced&(mb_y&1)];
-        h->mb.dist_scale_factor = h->mb.dist_scale_factor_buf[h->mb.b_interlaced&(mb_y&1)];
+        h->mb.bipred_weight = h->mb.bipred_weight_buf[h->mb.b_interlaced][h->mb.b_interlaced&(mb_y&1)];
+        h->mb.dist_scale_factor = h->mb.dist_scale_factor_buf[h->mb.b_interlaced][h->mb.b_interlaced&(mb_y&1)];
         if( h->param.b_cabac )
         {
             x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
@@ -1903,42 +1903,43 @@ void x264_macroblock_cache_save( x264_t *h )

 void x264_macroblock_bipred_init( x264_t *h )
 {
-    for( int field = 0; field <= h->sh.b_mbaff; field++ )
-        for( int i_ref0 = 0; i_ref0 < (h->i_ref[0]<<h->sh.b_mbaff); i_ref0++ )
-        {
-            x264_frame_t *l0 = h->fref[0][i_ref0>>h->sh.b_mbaff];
-            int poc0 = l0->i_poc + l0->i_delta_poc[field^(i_ref0&1)];
-            for( int i_ref1 = 0; i_ref1 < (h->i_ref[1]<<h->sh.b_mbaff); i_ref1++ )
+    for( int mbfield = 0; mbfield <= h->sh.b_mbaff; mbfield++ ) /* first table index: 0 = frame MBs, 1 = field MBs */
+        for( int field = 0; field <= h->sh.b_mbaff; field++ ) /* second table index; also selects the i_delta_poc[] entry */
+            for( int i_ref0 = 0; i_ref0 < (h->i_ref[0]<<mbfield); i_ref0++ ) /* field MBs see twice as many list0 refs */
             {
-                int dist_scale_factor;
-                x264_frame_t *l1 = h->fref[1][i_ref1>>h->sh.b_mbaff];
-                int poc1 = l1->i_poc + l1->i_delta_poc[field^(i_ref1&1)];
-                int cur_poc = h->fdec->i_poc + h->fdec->i_delta_poc[field];
-                int td = x264_clip3( poc1 - poc0, -128, 127 );
-                if( td == 0 /* || pic0 is a long-term ref */ )
-                    dist_scale_factor = 256;
-                else
+                x264_frame_t *l0 = h->fref[0][i_ref0>>mbfield];
+                int poc0 = l0->i_poc + mbfield*l0->i_delta_poc[field^(i_ref0&1)]; /* field delta applies to field MBs only */
+                for( int i_ref1 = 0; i_ref1 < (h->i_ref[1]<<mbfield); i_ref1++ )
                 {
-                    int tb = x264_clip3( cur_poc - poc0, -128, 127 );
-                    int tx = (16384 + (abs(td) >> 1)) / td;
-                    dist_scale_factor = x264_clip3( (tb * tx + 32) >> 6, -1024, 1023 );
-                }
+                    int dist_scale_factor;
+                    x264_frame_t *l1 = h->fref[1][i_ref1>>mbfield];
+                    int cur_poc = h->fdec->i_poc + mbfield*h->fdec->i_delta_poc[field]; /* scaled by mbfield like poc0/poc1, so frame MBs compare frame POCs throughout */
+                    int poc1 = l1->i_poc + mbfield*l1->i_delta_poc[field^(i_ref1&1)];
+                    int td = x264_clip3( poc1 - poc0, -128, 127 );
+                    if( td == 0 /* || pic0 is a long-term ref */ )
+                        dist_scale_factor = 256;
+                    else
+                    {
+                        int tb = x264_clip3( cur_poc - poc0, -128, 127 );
+                        int tx = (16384 + (abs(td) >> 1)) / td;
+                        dist_scale_factor = x264_clip3( (tb * tx + 32) >> 6, -1024, 1023 );
+                    }

-                h->mb.dist_scale_factor_buf[field][i_ref0][i_ref1] = dist_scale_factor;
+                    h->mb.dist_scale_factor_buf[mbfield][field][i_ref0][i_ref1] = dist_scale_factor;

-                dist_scale_factor >>= 2;
-                if( h->param.analyse.b_weighted_bipred
-                      && dist_scale_factor >= -64
-                      && dist_scale_factor <= 128 )
-                {
-                    h->mb.bipred_weight_buf[field][i_ref0][i_ref1] = 64 - dist_scale_factor;
-                    // ssse3 implementation of biweight doesn't support the extrema.
-                    // if we ever generate them, we'll have to drop that optimization.
-                    assert( dist_scale_factor >= -63 && dist_scale_factor <= 127 );
+                    dist_scale_factor >>= 2;
+                    if( h->param.analyse.b_weighted_bipred
+                          && dist_scale_factor >= -64
+                          && dist_scale_factor <= 128 )
+                    {
+                        h->mb.bipred_weight_buf[mbfield][field][i_ref0][i_ref1] = 64 - dist_scale_factor;
+                        // ssse3 implementation of biweight doesn't support the extrema.
+                        // if we ever generate them, we'll have to drop that optimization.
+                        assert( dist_scale_factor >= -63 && dist_scale_factor <= 127 );
+                    }
+                    else
+                        h->mb.bipred_weight_buf[mbfield][field][i_ref0][i_ref1] = 32; /* fallback weight when weighting is off or the factor is out of range */
                 }
-                else
-                    h->mb.bipred_weight_buf[field][i_ref0][i_ref1] = 32;
             }
-        }
 }

--
1.7.4


From a9f5b05e3dc0c7482e6661a1ee8d457f1e35ee75 Mon Sep 17 00:00:00 2001
From: Simon Horlick <simonhorlick@gmail.com>
Date: Tue, 15 Mar 2011 01:16:20 +0000
Subject: [PATCH 22/25] Direct temporal

---
 common/mvpred.c |  100 +++++++++++++++++++++++++++++++++++++++++--------------
 1 files changed, 75 insertions(+), 25 deletions(-)

diff --git a/common/mvpred.c b/common/mvpred.c
index 278e0ac..28eabca 100644
--- a/common/mvpred.c
+++ b/common/mvpred.c
@@ -182,50 +182,100 @@ void x264_mb_predict_mv_pskip( x264_t *h, int16_t mv[2] )

 static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
 {
-    int i_mb_4x4 = 16 * h->mb.i_mb_stride * h->mb.i_mb_y + 4 * h->mb.i_mb_x;
-    int i_mb_8x8 =  4 * h->mb.i_mb_stride * h->mb.i_mb_y + 2 * h->mb.i_mb_x;
-    const int type_col = h->fref[1][0]->mb_type[h->mb.i_mb_xy];
-    const int partition_col = h->fref[1][0]->mb_partition[h->mb.i_mb_xy];
-
-    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, 0 );
-
-    h->mb.i_partition = partition_col;
-
-    if( IS_INTRA( type_col ) )
+    int mb_x = h->mb.i_mb_x;
+    int mb_y = h->mb.i_mb_y;
+    int mb_xy = mb_x + h->mb.i_mb_stride * mb_y;
+    int type_col[2] = { h->fref[1][0]->mb_type[mb_xy], h->fref[1][0]->mb_type[mb_xy] };
+    int partition_col[2] = { h->fref[1][0]->mb_partition[mb_xy], h->fref[1][0]->mb_partition[mb_xy] };
+    int col_parity = abs(h->fref[1][0]->i_poc - h->fdec->i_poc)
+                  >= abs(h->fref[1][0]->i_poc + h->sh.i_delta_poc_bottom - h->fdec->i_poc);
+    int preshift = h->mb.b_interlaced;
+    int postshift = h->mb.b_interlaced;
+    int offset = 1;
+    int yshift = 1;
+    h->mb.i_partition = partition_col[0];
+    if( h->param.b_interlaced && h->fref[1][0]->field[mb_xy] != h->mb.b_interlaced )
     {
-        x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
-        x264_macroblock_cache_mv(  h, 0, 0, 4, 4, 0, 0 );
-        x264_macroblock_cache_mv(  h, 0, 0, 4, 4, 1, 0 );
-        return 1;
+        if( h->mb.b_interlaced )
+        {
+            mb_y = h->mb.i_mb_y&~1;
+            mb_xy = mb_x + h->mb.i_mb_stride * mb_y;
+            type_col[0] = h->fref[1][0]->mb_type[mb_xy];
+            type_col[1] = h->fref[1][0]->mb_type[mb_xy + h->mb.i_mb_stride];
+            partition_col[0] = h->fref[1][0]->mb_partition[mb_xy];
+            partition_col[1] = h->fref[1][0]->mb_partition[mb_xy + h->mb.i_mb_stride];
+            preshift = 0;
+            yshift = 0;
+
+            if( (IS_INTRA(type_col[0]) || partition_col[0] == D_16x16)
+                && (IS_INTRA(type_col[1]) || partition_col[1] == D_16x16)
+                && partition_col[0] != D_8x8 )
+                h->mb.i_partition = D_16x8;
+            else
+                h->mb.i_partition = D_8x8;
+        }
+        else
+        {
+            mb_y = (h->mb.i_mb_y&~1) + col_parity;
+            mb_xy = mb_x + h->mb.i_mb_stride * mb_y;
+            type_col[0] = type_col[1] = h->fref[1][0]->mb_type[mb_xy];
+            partition_col[0] = partition_col[1] = h->fref[1][0]->mb_partition[mb_xy];
+            preshift = 1;
+            yshift = 2;
+            h->mb.i_partition = partition_col[0];
+        }
+        offset = 0;
     }
+    int i_mb_4x4 = 16 * h->mb.i_mb_stride * mb_y + 4 * mb_x;
+    int i_mb_8x8 =  4 * h->mb.i_mb_stride * mb_y + 2 * mb_x;
+
+    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, 0 );

     /* Don't do any checks other than the ones we have to, based
      * on the size of the colocated partitions.
      * Depends on the enum order: D_8x8, D_16x8, D_8x16, D_16x16 */
-    int max_i8 = (D_16x16 - partition_col) + 1;
-    int step = (partition_col == D_16x8) + 1;
-    int width = 4 >> ((D_16x16 - partition_col)&1);
-    int height = 4 >> ((D_16x16 - partition_col)>>1);
-
+    int max_i8 = (D_16x16 - h->mb.i_partition) + 1;
+    int step = (h->mb.i_partition == D_16x8) + 1;
+    int width = 4 >> ((D_16x16 - h->mb.i_partition)&1);
+    int height = 4 >> ((D_16x16 - h->mb.i_partition)>>1);
     for( int i8 = 0; i8 < max_i8; i8 += step )
     {
         int x8 = i8&1;
         int y8 = i8>>1;
-        int i_part_8x8 = i_mb_8x8 + x8 + y8 * h->mb.i_b8_stride;
+
+        if( IS_INTRA( type_col[y8] ) )
+        {
+            x264_macroblock_cache_ref( h, 2*x8, 2*y8, width, height, 0, 0 );
+            x264_macroblock_cache_mv(  h, 2*x8, 2*y8, width, height, 0, 0 );
+            x264_macroblock_cache_mv(  h, 2*x8, 2*y8, width, height, 1, 0 );
+            continue;
+        }
+
+        int yM = 3*y8;
+        if( h->param.b_interlaced && h->fref[1][0]->field[mb_xy] != h->mb.b_interlaced )
+        {
+            if( h->mb.b_interlaced )
+                yM = y8*6;
+            else
+                yM = 2*(h->mb.i_mb_y&1) + y8;
+        }
+
+        int i_part_8x8 = i_mb_8x8 + x8 + (yM>>1) * h->mb.i_b8_stride;
         int i_ref1_ref = h->fref[1][0]->ref[0][i_part_8x8];
-        int i_ref = (map_col_to_list0(i_ref1_ref>>h->sh.b_mbaff) << h->sh.b_mbaff) + (i_ref1_ref&h->sh.b_mbaff);
+        int i_ref = (map_col_to_list0(i_ref1_ref>>preshift) << postshift) + (offset&i_ref1_ref&h->mb.b_interlaced);

         if( i_ref >= 0 )
         {
             int dist_scale_factor = h->mb.dist_scale_factor[i_ref][0];
-            int16_t *mv_col = h->fref[1][0]->mv[0][i_mb_4x4 + 3*x8 + 3*y8 * h->mb.i_b4_stride];
+            int16_t *mv_col = h->fref[1][0]->mv[0][i_mb_4x4 + 3*x8 + yM * h->mb.i_b4_stride];
+            int16_t mv_y = (mv_col[1]<<yshift)/2;
             int l0x = ( dist_scale_factor * mv_col[0] + 128 ) >> 8;
-            int l0y = ( dist_scale_factor * mv_col[1] + 128 ) >> 8;
-            if( h->param.i_threads > 1 && (l0y > h->mb.mv_max_spel[1] || l0y-mv_col[1] > h->mb.mv_max_spel[1]) )
+            int l0y = ( dist_scale_factor * mv_y + 128 ) >> 8;
+            if( h->param.i_threads > 1 && (l0y > h->mb.mv_max_spel[1] || l0y-mv_y > h->mb.mv_max_spel[1]) )
                 return 0;
             x264_macroblock_cache_ref( h, 2*x8, 2*y8, width, height, 0, i_ref );
             x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 0, pack16to32_mask(l0x, l0y) );
-            x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 1, pack16to32_mask(l0x-mv_col[0], l0y-mv_col[1]) );
+            x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 1, pack16to32_mask(l0x-mv_col[0], l0y-mv_y) );
         }
         else
         {
--
1.7.4


From fe257a3f3ad1b5121c52999f1db6727aa50082c5 Mon Sep 17 00:00:00 2001
From: Simon Horlick <simonhorlick@gmail.com>
Date: Tue, 15 Mar 2011 01:17:01 +0000
Subject: [PATCH 23/25] Direct spatial

---
 common/mvpred.c |   78 +++++++++++++++++++++++++++++++++++++++++++-----------
 1 files changed, 62 insertions(+), 16 deletions(-)

diff --git a/common/mvpred.c b/common/mvpred.c
index 28eabca..f25fa03 100644
--- a/common/mvpred.c
+++ b/common/mvpred.c
@@ -295,15 +295,6 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
 {
     int8_t ref[2];
     ALIGNED_ARRAY_8( int16_t, mv,[2],[2] );
-    const int8_t *l1ref0 = &h->fref[1][0]->ref[0][h->mb.i_b8_xy];
-    const int8_t *l1ref1 = &h->fref[1][0]->ref[1][h->mb.i_b8_xy];
-    const int16_t (*l1mv[2])[2] = { (const int16_t (*)[2]) &h->fref[1][0]->mv[0][h->mb.i_b4_xy],
-                                    (const int16_t (*)[2]) &h->fref[1][0]->mv[1][h->mb.i_b4_xy] };
-    const int type_col = h->fref[1][0]->mb_type[h->mb.i_mb_xy];
-    const int partition_col = h->fref[1][0]->mb_partition[h->mb.i_mb_xy];
-
-    h->mb.i_partition = partition_col;
-
     for( int i_list = 0; i_list < 2; i_list++ )
     {
         int     i_refa = h->mb.cache.ref[i_list][X264_SCAN8_0 - 1];
@@ -348,6 +339,49 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
         ref[i_list] = i_ref;
     }

+    int mb_x = h->mb.i_mb_x;
+    int mb_y = h->mb.i_mb_y;
+    int mb_xy = mb_x + h->mb.i_mb_stride * mb_y;
+    int type_col[2] = { h->fref[1][0]->mb_type[mb_xy], h->fref[1][0]->mb_type[mb_xy] };
+    int partition_col[2] = { h->fref[1][0]->mb_partition[mb_xy], h->fref[1][0]->mb_partition[mb_xy] };
+    int col_parity = abs(h->fref[1][0]->i_poc - h->fdec->i_poc)
+                  >= abs(h->fref[1][0]->i_poc + h->sh.i_delta_poc_bottom - h->fdec->i_poc);
+    h->mb.i_partition = partition_col[0];
+    if( h->param.b_interlaced && h->fref[1][0]->field[mb_xy] != h->mb.b_interlaced )
+    {
+        if( h->mb.b_interlaced )
+        {
+            mb_y = h->mb.i_mb_y&~1;
+            mb_xy = mb_x + h->mb.i_mb_stride * mb_y;
+            type_col[0] = h->fref[1][0]->mb_type[mb_xy];
+            type_col[1] = h->fref[1][0]->mb_type[mb_xy + h->mb.i_mb_stride];
+            partition_col[0] = h->fref[1][0]->mb_partition[mb_xy];
+            partition_col[1] = h->fref[1][0]->mb_partition[mb_xy + h->mb.i_mb_stride];
+
+            if( (IS_INTRA(type_col[0]) || partition_col[0] == D_16x16)
+                && (IS_INTRA(type_col[1]) || partition_col[1] == D_16x16)
+                && partition_col[0] != D_8x8 )
+                h->mb.i_partition = D_16x8;
+            else
+                h->mb.i_partition = D_8x8;
+        }
+        else
+        {
+            mb_y = (h->mb.i_mb_y&~1) + col_parity;
+            mb_xy = mb_x + h->mb.i_mb_stride * mb_y;
+            type_col[0] = type_col[1] = h->fref[1][0]->mb_type[mb_xy];
+            partition_col[0] = partition_col[1] = h->fref[1][0]->mb_partition[mb_xy];
+            h->mb.i_partition = partition_col[0];
+        }
+    }
+    int i_mb_4x4 = 16 * h->mb.i_mb_stride * mb_y + 4 * mb_x;
+    int i_mb_8x8 =  4 * h->mb.i_mb_stride * mb_y + 2 * mb_x;
+
+    int8_t *l1ref0 = &h->fref[1][0]->ref[0][i_mb_8x8];
+    int8_t *l1ref1 = &h->fref[1][0]->ref[1][i_mb_8x8];
+    int16_t (*l1mv[2])[2] = { (int16_t (*)[2]) &h->fref[1][0]->mv[0][i_mb_4x4],
+                              (int16_t (*)[2]) &h->fref[1][0]->mv[1][i_mb_4x4] };
+
     if( (M16( ref ) & 0x8080) == 0x8080 ) /* if( ref[0] < 0 && ref[1] < 0 ) */
     {
         x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
@@ -367,24 +401,33 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
         return 0;
     }

-    if( !M64( mv ) || IS_INTRA( type_col ) || (ref[0]&&ref[1]) )
+    if( !M64( mv ) || (ref[0]&&ref[1]) )
         return 1;

     /* Don't do any checks other than the ones we have to, based
      * on the size of the colocated partitions.
      * Depends on the enum order: D_8x8, D_16x8, D_8x16, D_16x16 */
-    int max_i8 = (D_16x16 - partition_col) + 1;
-    int step = (partition_col == D_16x8) + 1;
-    int width = 4 >> ((D_16x16 - partition_col)&1);
-    int height = 4 >> ((D_16x16 - partition_col)>>1);
+    int max_i8 = (D_16x16 - h->mb.i_partition) + 1;
+    int step = (h->mb.i_partition == D_16x8) + 1;
+    int width = 4 >> ((D_16x16 - h->mb.i_partition)&1);
+    int height = 4 >> ((D_16x16 - h->mb.i_partition)>>1);

     /* col_zero_flag */
     for( int i8 = 0; i8 < max_i8; i8 += step )
     {
         const int x8 = i8&1;
         const int y8 = i8>>1;
-        const int o8 = x8 + y8 * h->mb.i_b8_stride;
-        const int o4 = 3*(x8 + y8 * h->mb.i_b4_stride);
+        int yM = 3*y8;
+        if( h->param.b_interlaced && h->fref[1][0]->field[mb_xy] != h->mb.b_interlaced )
+        {
+            if( h->mb.b_interlaced )
+                yM = y8*6;
+            else
+                yM = 2*(h->mb.i_mb_y&1) + y8;
+        }
+        int o8 = x8 + (yM>>1) * h->mb.i_b8_stride;
+        int o4 = 3*x8 + yM * h->mb.i_b4_stride;
+
         int idx;
         if( l1ref0[o8] == 0 )
             idx = 0;
@@ -393,6 +436,9 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
         else
             continue;

+        if( IS_INTRA( type_col[y8] ) )
+            continue;
+
         if( abs( l1mv[idx][o4][0] ) <= 1 && abs( l1mv[idx][o4][1] ) <= 1 )
         {
             if( ref[0] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 0, 0 );
--
1.7.4


From 53c6284008aa1471a62de4f6da6d587698a817c2 Mon Sep 17 00:00:00 2001
From: Simon Horlick <simonhorlick@gmail.com>
Date: Thu, 17 Mar 2011 17:39:18 +0000
Subject: [PATCH 24/25] Fix non-determinism with AQ

---
 encoder/ratecontrol.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index e1a673f..bcbcb02 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -219,10 +219,10 @@ static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x2
 {
     int w = i ? 8 : 16;
     int stride = frame->i_stride[i];
-    int offset = h->mb.b_interlaced
+    int offset = h->sh.b_mbaff
         ? 16 * mb_x + w * (mb_y&~1) * stride + (mb_y&1) * stride
         : 16 * mb_x + w * mb_y * stride;
-    stride <<= h->mb.b_interlaced;
+    stride <<= h->sh.b_mbaff;
     if( i )
     {
         ALIGNED_ARRAY_16( pixel, pix,[FENC_STRIDE*8] );
--
1.7.4


From 7e2d83f1ef2dc762ffe6880ee54686088aeff660 Mon Sep 17 00:00:00 2001
From: Simon Horlick <simonhorlick@gmail.com>
Date: Mon, 14 Mar 2011 02:54:30 +0000
Subject: [PATCH 25/25] Adaptive mbaff with vsad decision

---
 encoder/encoder.c |   21 ++++++++++++++++++++-
 1 files changed, 20 insertions(+), 1 deletions(-)

diff --git a/encoder/encoder.c b/encoder/encoder.c
index 0319126..d885f31 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -1873,6 +1873,25 @@ static inline void x264_slice_init( x264_t *h, int i_nal_type, int i_global_qp )
     x264_macroblock_slice_init( h );
 }

+static int field_vsad( x264_t *h, int mb_x, int mb_y ) /* nonzero when same-parity rows differ less than adjacent rows */
+{
+    int score_field = 0; /* sum of |row i - row i-2| over both row parities */
+    int score_frame = 0; /* sum of |row i - row i-1| over adjacent rows */
+    int stride = h->fenc->i_stride[0];
+    pixel *fenc = h->fenc->plane[0] + 16 * (mb_x + mb_y * stride); /* first luma sample of the 16x16 MB; pixel keeps this valid for any sample width */
+
+    for( int i = 1; i < 16; i++ ) /* 15 adjacent row pairs */
+        for( int j = 0; j < 16; j++ )
+            score_frame += abs(fenc[i*stride+j] - fenc[(i-1)*stride+j]);
+    for( int i = 2; i < 16; i+=2 ) /* 7 row pairs on even rows */
+        for( int j = 0; j < 16; j++ )
+            score_field += abs(fenc[i*stride+j] - fenc[(i-2)*stride+j]);
+    for( int i = 3; i < 16; i+=2 ) /* 7 row pairs on odd rows */
+        for( int j = 0; j < 16; j++ )
+            score_field += abs(fenc[i*stride+j] - fenc[(i-2)*stride+j]);
+    return (score_field < score_frame); /* note: 14 field row pairs are weighed against 15 frame row pairs */
+}
+
 static int x264_slice_write( x264_t *h )
 {
     int i_skip;
@@ -1967,7 +1986,7 @@ static int x264_slice_write( x264_t *h )
             if( h->mb.b_adaptive_mbaff )
             {
                 if( !(i_mb_y&1) )
-                    h->mb.b_interlaced = 1;
+                    h->mb.b_interlaced = field_vsad( h, i_mb_x, i_mb_y );
                 x264_zigzag_init( h->param.cpu, &h->zigzagf, h->mb.b_interlaced );
             }
             h->mb.field[mb_xy] = h->mb.b_interlaced;
--
1.7.4