Untitled

From 5a463b2ff722915b2f27a8aeb4d1eaaa49de28f3 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Tue, 13 Apr 2010 01:08:29 -0700
Subject: [PATCH 1/6] Add CP128/M128 macros using SSE, fix some aliasing
 Significantly improve the speed of cache_load and cache_save functions.
 Also fix a ton of pessimization in cache_save and cache_load due to aliasing.

---
 common/common.h     |    5 +
 common/macroblock.c |  203 +++++++++++++++++++++++++++------------------------
 common/x86/util.h   |    8 ++
 3 files changed, 120 insertions(+), 96 deletions(-)

diff --git a/common/common.h b/common/common.h
index b8c6dfd..38e9b74 100644
--- a/common/common.h
+++ b/common/common.h
@@ -88,12 +88,17 @@ do {\
 typedef union { uint16_t i; uint8_t  c[2]; } MAY_ALIAS x264_union16_t;
 typedef union { uint32_t i; uint16_t b[2]; uint8_t  c[4]; } MAY_ALIAS x264_union32_t;
 typedef union { uint64_t i; uint32_t a[2]; uint16_t b[4]; uint8_t c[8]; } MAY_ALIAS x264_union64_t;
+typedef struct { uint64_t i[2]; } x264_uint128_t;
+typedef union { x264_uint128_t i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS x264_union128_t;
 #define M16(src) (((x264_union16_t*)(src))->i)
 #define M32(src) (((x264_union32_t*)(src))->i)
 #define M64(src) (((x264_union64_t*)(src))->i)
+#define M128(src) (((x264_union128_t*)(src))->i)
+#define M128_CONST(x) ((x264_uint128_t){{x,x}})
 #define CP16(dst,src) M16(dst) = M16(src)
 #define CP32(dst,src) M32(dst) = M32(src)
 #define CP64(dst,src) M64(dst) = M64(src)
+#define CP128(dst,src) M128(dst) = M128(src)

 #include "x264.h"
 #include "bs.h"
diff --git a/common/macroblock.c b/common/macroblock.c
index 0b9b903..fb4c1a5 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -1026,19 +1026,23 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
     int left = h->mb.i_mb_left_xy;
     int top  = h->mb.i_mb_top_xy;

+    /* GCC pessimizes direct loads from heap-allocated arrays due to aliasing.*/
+    /* By only dereferencing them once, we avoid this issue. */
+    int8_t (*i4x4)[8] = h->mb.intra4x4_pred_mode;
+    uint8_t (*nnz)[24] = h->mb.non_zero_count;
+
     /* load cache */
     if( h->mb.i_neighbour & MB_TOP )
     {
         h->mb.cache.i_cbp_top = h->mb.cbp[top];
-
         /* load intra4x4 */
-        CP32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8], &h->mb.intra4x4_pred_mode[top][0] );
+        CP32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8], &i4x4[top][0] );

         /* load non_zero_count */
-        CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &h->mb.non_zero_count[top][12] );
+        CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[top][12] );
         /* shift because x264_scan8[16] is misaligned */
-        M32( &h->mb.cache.non_zero_count[x264_scan8[16+0] - 9] ) = M16( &h->mb.non_zero_count[top][18] ) << 8;
-        M32( &h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] ) = M16( &h->mb.non_zero_count[top][22] ) << 8;
+        M32( &h->mb.cache.non_zero_count[x264_scan8[16+0] - 9] ) = M16( &nnz[top][18] ) << 8;
+        M32( &h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] ) = M16( &nnz[top][22] ) << 8;
     }
     else
     {
@@ -1058,22 +1062,22 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
         h->mb.cache.i_cbp_left = h->mb.cbp[left];

         /* load intra4x4 */
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] = h->mb.intra4x4_pred_mode[left][4];
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[2 ] - 1] = h->mb.intra4x4_pred_mode[left][5];
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[8 ] - 1] = h->mb.intra4x4_pred_mode[left][6];
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = h->mb.intra4x4_pred_mode[left][3];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] = i4x4[left][4];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[2 ] - 1] = i4x4[left][5];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[8 ] - 1] = i4x4[left][6];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = i4x4[left][3];

         /* load non_zero_count */
-        h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = h->mb.non_zero_count[left][3];
-        h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = h->mb.non_zero_count[left][7];
-        h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = h->mb.non_zero_count[left][11];
-        h->mb.cache.non_zero_count[x264_scan8[10] - 1] = h->mb.non_zero_count[left][15];
+        h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left][3];
+        h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left][7];
+        h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left][11];
+        h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left][15];

-        h->mb.cache.non_zero_count[x264_scan8[16+0] - 1] = h->mb.non_zero_count[left][16+1];
-        h->mb.cache.non_zero_count[x264_scan8[16+2] - 1] = h->mb.non_zero_count[left][16+3];
+        h->mb.cache.non_zero_count[x264_scan8[16+0] - 1] = nnz[left][16+1];
+        h->mb.cache.non_zero_count[x264_scan8[16+2] - 1] = nnz[left][16+3];

-        h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] = h->mb.non_zero_count[left][16+4+1];
-        h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = h->mb.non_zero_count[left][16+4+3];
+        h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] = nnz[left][16+4+1];
+        h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = nnz[left][16+4+3];
     }
     else
     {
@@ -1146,11 +1150,14 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )

         for( int l = 0; l < (h->sh.i_type == SLICE_TYPE_B) + 1; l++ )
         {
+            int16_t (*mv)[2] = h->mb.mv[l];
+            int8_t *ref = h->mb.ref[l];
+
             int i8 = x264_scan8[0] - 1 - 1*8;
             if( h->mb.i_neighbour & MB_TOPLEFT )
             {
-                h->mb.cache.ref[l][i8] = h->mb.ref[l][top_8x8 - 1];
-                CP32( h->mb.cache.mv[l][i8], h->mb.mv[l][top_4x4 - 1] );
+                h->mb.cache.ref[l][i8] = ref[top_8x8 - 1];
+                CP32( h->mb.cache.mv[l][i8], mv[top_4x4 - 1] );
             }
             else
             {
@@ -1162,24 +1169,22 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
             if( h->mb.i_neighbour & MB_TOP )
             {
                 h->mb.cache.ref[l][i8+0] =
-                h->mb.cache.ref[l][i8+1] = h->mb.ref[l][top_8x8 + 0];
+                h->mb.cache.ref[l][i8+1] = ref[top_8x8 + 0];
                 h->mb.cache.ref[l][i8+2] =
-                h->mb.cache.ref[l][i8+3] = h->mb.ref[l][top_8x8 + 1];
-                CP64( h->mb.cache.mv[l][i8+0], h->mb.mv[l][top_4x4+0] );
-                CP64( h->mb.cache.mv[l][i8+2], h->mb.mv[l][top_4x4+2] );
+                h->mb.cache.ref[l][i8+3] = ref[top_8x8 + 1];
+                CP128( h->mb.cache.mv[l][i8], mv[top_4x4] );
             }
             else
             {
-                M64( h->mb.cache.mv[l][i8+0] ) = 0;
-                M64( h->mb.cache.mv[l][i8+2] ) = 0;
+                M128( h->mb.cache.mv[l][i8] ) = M128_CONST( 0 );
                 M32( &h->mb.cache.ref[l][i8] ) = (uint8_t)(-2) * 0x01010101U;
             }

             i8 = x264_scan8[0] + 4 - 1*8;
             if( h->mb.i_neighbour & MB_TOPRIGHT )
             {
-                h->mb.cache.ref[l][i8] = h->mb.ref[l][top_8x8 + 2];
-                CP32( h->mb.cache.mv[l][i8], h->mb.mv[l][top_4x4 + 4] );
+                h->mb.cache.ref[l][i8] = ref[top_8x8 + 2];
+                CP32( h->mb.cache.mv[l][i8], mv[top_4x4 + 4] );
             }
             else
                  h->mb.cache.ref[l][i8] = -2;
@@ -1190,14 +1195,14 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
                 const int ir = h->mb.i_b8_xy - 1;
                 const int iv = h->mb.i_b4_xy - 1;
                 h->mb.cache.ref[l][i8+0*8] =
-                h->mb.cache.ref[l][i8+1*8] = h->mb.ref[l][ir + 0*s8x8];
+                h->mb.cache.ref[l][i8+1*8] = ref[ir + 0*s8x8];
                 h->mb.cache.ref[l][i8+2*8] =
-                h->mb.cache.ref[l][i8+3*8] = h->mb.ref[l][ir + 1*s8x8];
+                h->mb.cache.ref[l][i8+3*8] = ref[ir + 1*s8x8];

-                CP32( h->mb.cache.mv[l][i8+0*8], h->mb.mv[l][iv + 0*s4x4] );
-                CP32( h->mb.cache.mv[l][i8+1*8], h->mb.mv[l][iv + 1*s4x4] );
-                CP32( h->mb.cache.mv[l][i8+2*8], h->mb.mv[l][iv + 2*s4x4] );
-                CP32( h->mb.cache.mv[l][i8+3*8], h->mb.mv[l][iv + 3*s4x4] );
+                CP32( h->mb.cache.mv[l][i8+0*8], mv[iv + 0*s4x4] );
+                CP32( h->mb.cache.mv[l][i8+1*8], mv[iv + 1*s4x4] );
+                CP32( h->mb.cache.mv[l][i8+2*8], mv[iv + 2*s4x4] );
+                CP32( h->mb.cache.mv[l][i8+3*8], mv[iv + 3*s4x4] );
             }
             else
             {
@@ -1210,17 +1215,18 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )

             if( h->param.b_cabac )
             {
+                uint8_t (*mvd)[8][2] = h->mb.mvd[l];
                 if( h->mb.i_neighbour & MB_TOP )
-                    CP64( h->mb.cache.mvd[l][x264_scan8[0] - 8], h->mb.mvd[l][top][0] );
+                    CP64( h->mb.cache.mvd[l][x264_scan8[0] - 8], mvd[top][0] );
                 else
                     M64( h->mb.cache.mvd[l][x264_scan8[0] - 8] ) = 0;

                 if( h->mb.i_neighbour & MB_LEFT )
                 {
-                    CP16( h->mb.cache.mvd[l][x264_scan8[0 ] - 1], h->mb.mvd[l][left][4] );
-                    CP16( h->mb.cache.mvd[l][x264_scan8[2 ] - 1], h->mb.mvd[l][left][5] );
-                    CP16( h->mb.cache.mvd[l][x264_scan8[8 ] - 1], h->mb.mvd[l][left][6] );
-                    CP16( h->mb.cache.mvd[l][x264_scan8[10] - 1], h->mb.mvd[l][left][3] );
+                    CP16( h->mb.cache.mvd[l][x264_scan8[0 ] - 1], mvd[left][4] );
+                    CP16( h->mb.cache.mvd[l][x264_scan8[2 ] - 1], mvd[left][5] );
+                    CP16( h->mb.cache.mvd[l][x264_scan8[8 ] - 1], mvd[left][6] );
+                    CP16( h->mb.cache.mvd[l][x264_scan8[10] - 1], mvd[left][3] );
                 }
                 else
                     for( int i = 0; i < 4; i++ )
@@ -1285,10 +1291,10 @@ void x264_macroblock_cache_save( x264_t *h )
     const int i_mb_4x4 = h->mb.i_b4_xy;
     const int i_mb_8x8 = h->mb.i_b8_xy;

-    /* GCC pessimizes direct stores to heap-allocated 8-bit arrays due to aliasing.*/
+    /* GCC pessimizes direct stores to heap-allocated arrays due to aliasing.*/
     /* By only dereferencing them once, we avoid this issue. */
-    int8_t *intra4x4_pred_mode = h->mb.intra4x4_pred_mode[i_mb_xy];
-    uint8_t *non_zero_count = h->mb.non_zero_count[i_mb_xy];
+    int8_t *i4x4 = h->mb.intra4x4_pred_mode[i_mb_xy];
+    uint8_t *nnz = h->mb.non_zero_count[i_mb_xy];

     x264_macroblock_store_pic( h, 0 );
     x264_macroblock_store_pic( h, 1 );
@@ -1303,15 +1309,15 @@ void x264_macroblock_cache_save( x264_t *h )
     /* save intra4x4 */
     if( i_mb_type == I_4x4 )
     {
-        CP32( &intra4x4_pred_mode[0], &h->mb.cache.intra4x4_pred_mode[x264_scan8[10]] );
-        M32( &intra4x4_pred_mode[4] ) = pack8to32(h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ],
-                                                  h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ],
-                                                  h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ], 0);
+        CP32( &i4x4[0], &h->mb.cache.intra4x4_pred_mode[x264_scan8[10]] );
+        M32( &i4x4[4] ) = pack8to32( h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ],
+                                     h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ],
+                                     h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ], 0);
     }
     else if( !h->param.b_constrained_intra || IS_INTRA(i_mb_type) )
-        M64( intra4x4_pred_mode ) = I_PRED_4x4_DC * 0x0101010101010101ULL;
+        M64( i4x4 ) = I_PRED_4x4_DC * 0x0101010101010101ULL;
     else
-        M64( intra4x4_pred_mode ) = (uint8_t)(-1) * 0x0101010101010101ULL;
+        M64( i4x4 ) = (uint8_t)(-1) * 0x0101010101010101ULL;


     if( i_mb_type == I_PCM )
@@ -1322,19 +1328,19 @@ void x264_macroblock_cache_save( x264_t *h )
         h->mb.i_cbp_luma = 0xf;
         h->mb.cbp[i_mb_xy] = 0x72f;   /* all set */
         h->mb.b_transform_8x8 = 0;
-        memset( non_zero_count, 16, sizeof( *h->mb.non_zero_count ) );
+        memset( nnz, 16, sizeof( *h->mb.non_zero_count ) );
     }
     else
     {
         /* save non zero count */
-        CP32( &non_zero_count[0*4], &h->mb.cache.non_zero_count[x264_scan8[0]+0*8] );
-        CP32( &non_zero_count[1*4], &h->mb.cache.non_zero_count[x264_scan8[0]+1*8] );
-        CP32( &non_zero_count[2*4], &h->mb.cache.non_zero_count[x264_scan8[0]+2*8] );
-        CP32( &non_zero_count[3*4], &h->mb.cache.non_zero_count[x264_scan8[0]+3*8] );
-        M16( &non_zero_count[16+0*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+0*2]-1] ) >> 8;
-        M16( &non_zero_count[16+1*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+1*2]-1] ) >> 8;
-        M16( &non_zero_count[16+2*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+2*2]-1] ) >> 8;
-        M16( &non_zero_count[16+3*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+3*2]-1] ) >> 8;
+        CP32( &nnz[0*4], &h->mb.cache.non_zero_count[x264_scan8[0]+0*8] );
+        CP32( &nnz[1*4], &h->mb.cache.non_zero_count[x264_scan8[0]+1*8] );
+        CP32( &nnz[2*4], &h->mb.cache.non_zero_count[x264_scan8[0]+2*8] );
+        CP32( &nnz[3*4], &h->mb.cache.non_zero_count[x264_scan8[0]+3*8] );
+        M16( &nnz[16+0*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+0*2]-1] ) >> 8;
+        M16( &nnz[16+1*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+1*2]-1] ) >> 8;
+        M16( &nnz[16+2*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+2*2]-1] ) >> 8;
+        M16( &nnz[16+3*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+3*2]-1] ) >> 8;

         if( h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0 )
             h->mb.i_qp = h->mb.i_last_qp;
@@ -1349,47 +1355,56 @@ void x264_macroblock_cache_save( x264_t *h )

     if( h->sh.i_type != SLICE_TYPE_I )
     {
+        int16_t (*mv0)[2] = &h->mb.mv[0][i_mb_4x4];
+        int16_t (*mv1)[2] = &h->mb.mv[1][i_mb_4x4];
+        int8_t *ref0 = &h->mb.ref[0][i_mb_8x8];
+        int8_t *ref1 = &h->mb.ref[1][i_mb_8x8];
         if( !IS_INTRA( i_mb_type ) )
         {
-            h->mb.ref[0][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[0][x264_scan8[0]];
-            h->mb.ref[0][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[0][x264_scan8[4]];
-            h->mb.ref[0][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[0][x264_scan8[8]];
-            h->mb.ref[0][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[0][x264_scan8[12]];
-            for( int y = 0; y < 4; y++ )
-            {
-                CP64( h->mb.mv[0][i_mb_4x4+y*s4x4+0], h->mb.cache.mv[0][x264_scan8[0]+8*y+0] );
-                CP64( h->mb.mv[0][i_mb_4x4+y*s4x4+2], h->mb.cache.mv[0][x264_scan8[0]+8*y+2] );
-            }
+            ref0[0+0*s8x8] = h->mb.cache.ref[0][x264_scan8[0]];
+            ref0[1+0*s8x8] = h->mb.cache.ref[0][x264_scan8[4]];
+            ref0[0+1*s8x8] = h->mb.cache.ref[0][x264_scan8[8]];
+            ref0[1+1*s8x8] = h->mb.cache.ref[0][x264_scan8[12]];
+            CP128( &mv0[0*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*0] );
+            CP128( &mv0[1*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*1] );
+            CP128( &mv0[2*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*2] );
+            CP128( &mv0[3*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*3] );
             if( h->sh.i_type == SLICE_TYPE_B )
             {
-                h->mb.ref[1][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[1][x264_scan8[0]];
-                h->mb.ref[1][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[1][x264_scan8[4]];
-                h->mb.ref[1][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[1][x264_scan8[8]];
-                h->mb.ref[1][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[1][x264_scan8[12]];
-                for( int y = 0; y < 4; y++ )
-                {
-                    CP64( h->mb.mv[1][i_mb_4x4+y*s4x4+0], h->mb.cache.mv[1][x264_scan8[0]+8*y+0] );
-                    CP64( h->mb.mv[1][i_mb_4x4+y*s4x4+2], h->mb.cache.mv[1][x264_scan8[0]+8*y+2] );
-                }
+                ref1[0+0*s8x8] = h->mb.cache.ref[1][x264_scan8[0]];
+                ref1[1+0*s8x8] = h->mb.cache.ref[1][x264_scan8[4]];
+                ref1[0+1*s8x8] = h->mb.cache.ref[1][x264_scan8[8]];
+                ref1[1+1*s8x8] = h->mb.cache.ref[1][x264_scan8[12]];
+                CP128( &mv1[0*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*0] );
+                CP128( &mv1[1*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*1] );
+                CP128( &mv1[2*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*2] );
+                CP128( &mv1[3*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*3] );
             }
         }
         else
         {
-            for( int i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_B ? 2  : 1 ); i_list++ )
+            M16( ref0+0*s8x8 ) = (uint8_t)(-1) * 0x0101;
+            M16( ref0+1*s8x8 ) = (uint8_t)(-1) * 0x0101;
+            M128( &mv0[0*s4x4] ) = M128_CONST( 0 );
+            M128( &mv0[1*s4x4] ) = M128_CONST( 0 );
+            M128( &mv0[2*s4x4] ) = M128_CONST( 0 );
+            M128( &mv0[3*s4x4] ) = M128_CONST( 0 );
+            if( h->sh.i_type == SLICE_TYPE_B )
             {
-                M16( &h->mb.ref[i_list][i_mb_8x8+0*s8x8] ) = (uint8_t)(-1) * 0x0101;
-                M16( &h->mb.ref[i_list][i_mb_8x8+1*s8x8] ) = (uint8_t)(-1) * 0x0101;
-                for( int y = 0; y < 4; y++ )
-                {
-                    M64( h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] ) = 0;
-                    M64( h->mb.mv[i_list][i_mb_4x4+y*s4x4+2] ) = 0;
-                }
+                M16( ref1+0*s8x8 ) = (uint8_t)(-1) * 0x0101;
+                M16( ref1+1*s8x8 ) = (uint8_t)(-1) * 0x0101;
+                M128( &mv1[0*s4x4] ) = M128_CONST( 0 );
+                M128( &mv1[1*s4x4] ) = M128_CONST( 0 );
+                M128( &mv1[2*s4x4] ) = M128_CONST( 0 );
+                M128( &mv1[3*s4x4] ) = M128_CONST( 0 );
             }
         }
     }

     if( h->param.b_cabac )
     {
+        uint8_t (*mvd0)[2] = h->mb.mvd[0][i_mb_xy];
+        uint8_t (*mvd1)[2] = h->mb.mvd[1][i_mb_xy];
         if( IS_INTRA(i_mb_type) && i_mb_type != I_PCM )
             h->mb.chroma_pred_mode[i_mb_xy] = x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ];
         else
@@ -1397,27 +1412,23 @@ void x264_macroblock_cache_save( x264_t *h )

         if( !IS_INTRA( i_mb_type ) && !IS_SKIP( i_mb_type ) && !IS_DIRECT( i_mb_type ) )
         {
-            CP64( h->mb.mvd[0][i_mb_xy][0], h->mb.cache.mvd[0][x264_scan8[10]] );
-            CP16( h->mb.mvd[0][i_mb_xy][4], h->mb.cache.mvd[0][x264_scan8[5 ]] );
-            CP16( h->mb.mvd[0][i_mb_xy][5], h->mb.cache.mvd[0][x264_scan8[7 ]] );
-            CP16( h->mb.mvd[0][i_mb_xy][6], h->mb.cache.mvd[0][x264_scan8[13]] );
+            CP64( mvd0[0], h->mb.cache.mvd[0][x264_scan8[10]] );
+            CP16( mvd0[4], h->mb.cache.mvd[0][x264_scan8[5 ]] );
+            CP16( mvd0[5], h->mb.cache.mvd[0][x264_scan8[7 ]] );
+            CP16( mvd0[6], h->mb.cache.mvd[0][x264_scan8[13]] );
             if( h->sh.i_type == SLICE_TYPE_B )
             {
-                CP64( h->mb.mvd[1][i_mb_xy][0], h->mb.cache.mvd[1][x264_scan8[10]] );
-                CP16( h->mb.mvd[1][i_mb_xy][4], h->mb.cache.mvd[1][x264_scan8[5 ]] );
-                CP16( h->mb.mvd[1][i_mb_xy][5], h->mb.cache.mvd[1][x264_scan8[7 ]] );
-                CP16( h->mb.mvd[1][i_mb_xy][6], h->mb.cache.mvd[1][x264_scan8[13]] );
+                CP64( mvd1[0], h->mb.cache.mvd[1][x264_scan8[10]] );
+                CP16( mvd1[4], h->mb.cache.mvd[1][x264_scan8[5 ]] );
+                CP16( mvd1[5], h->mb.cache.mvd[1][x264_scan8[7 ]] );
+                CP16( mvd1[6], h->mb.cache.mvd[1][x264_scan8[13]] );
             }
         }
         else
         {
-            M64( h->mb.mvd[0][i_mb_xy][0] ) = 0;
-            M64( h->mb.mvd[0][i_mb_xy][4] ) = 0;
+            M128( mvd0[0] ) = M128_CONST( 0 );
             if( h->sh.i_type == SLICE_TYPE_B )
-            {
-                M64( h->mb.mvd[1][i_mb_xy][0] ) = 0;
-                M64( h->mb.mvd[1][i_mb_xy][4] ) = 0;
-            }
+                M128( mvd1[0] ) = M128_CONST( 0 );
         }

         if( h->sh.i_type == SLICE_TYPE_B )
diff --git a/common/x86/util.h b/common/x86/util.h
index ccc0733..e094309 100644
--- a/common/x86/util.h
+++ b/common/x86/util.h
@@ -25,6 +25,9 @@
 #define X264_X86_UTIL_H

 #ifdef __GNUC__
+
+#include <xmmintrin.h>
+
 #define x264_median_mv x264_median_mv_mmxext
 static ALWAYS_INLINE void x264_median_mv_mmxext( int16_t *dst, int16_t *a, int16_t *b, int16_t *c )
 {
@@ -100,6 +103,11 @@ static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmxext(uint8_t *mvdleft, uint8_
     );
     return amvd;
 }
+#undef M128_CONST
+#define M128_CONST(x) ((__m128){x,x,x,x})
+#define x264_union128_t x264_union128_sse_t
+typedef union { __m128 i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS x264_union128_sse_t;
+
 #endif

 #endif
--
1.7.0.4


From 064db2907f52c95a7254f313edba9788dc6d9c03 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Wed, 14 Apr 2010 14:43:25 -0700
Subject: [PATCH 2/6] Prefetch MB data in cache_load
 Dramatically reduces L1 cache misses.
 ~10% faster cache_load.

---
 common/macroblock.c |   38 +++++++++++++++++++++++++++++++-------
 common/osdep.h      |   13 +++++++++++++
 2 files changed, 44 insertions(+), 7 deletions(-)

diff --git a/common/macroblock.c b/common/macroblock.c
index fb4c1a5..5c9734f 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -941,6 +941,7 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x
 static void inline x264_macroblock_cache_load_neighbours( x264_t *h, int mb_x, int mb_y )
 {
     int top = (mb_y - (1 << h->mb.b_interlaced)) * h->mb.i_mb_stride + mb_x;
+
     h->mb.i_mb_x = mb_x;
     h->mb.i_mb_y = mb_y;
     h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
@@ -986,6 +987,16 @@ static void inline x264_macroblock_cache_load_neighbours( x264_t *h, int mb_x, i

                 if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_top ) )
                     h->mb.i_neighbour_intra |= MB_TOP;
+
+                /* We only need to prefetch the top blocks because the left was just written
+                 * to as part of the previous cache_save.  Since most target CPUs use write-allocate
+                 * caches, left blocks are near-guaranteed to be in L1 cache.  Top--not so much. */
+                x264_prefetch( &h->mb.cbp[top] );
+                x264_prefetch( h->mb.intra4x4_pred_mode[top] );
+                x264_prefetch( &h->mb.non_zero_count[top][12] );
+                /* These aren't always allocated, but a prefetch of an invalid address never faults, so it's harmless. */
+                x264_prefetch( &h->mb.mb_transform_size[top] );
+                x264_prefetch( &h->mb.skipbp[top] );
             }
         }

@@ -1025,16 +1036,20 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )

     int left = h->mb.i_mb_left_xy;
     int top  = h->mb.i_mb_top_xy;
+    int top_y = mb_y - (1 << h->mb.b_interlaced);
+    int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
+    int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;

     /* GCC pessimizes direct loads from heap-allocated arrays due to aliasing.*/
     /* By only dereferencing them once, we avoid this issue. */
     int8_t (*i4x4)[8] = h->mb.intra4x4_pred_mode;
     uint8_t (*nnz)[24] = h->mb.non_zero_count;
+    int16_t *cbp = h->mb.cbp;

     /* load cache */
     if( h->mb.i_neighbour & MB_TOP )
     {
-        h->mb.cache.i_cbp_top = h->mb.cbp[top];
+        h->mb.cache.i_cbp_top = cbp[top];
         /* load intra4x4 */
         CP32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8], &i4x4[top][0] );

@@ -1059,7 +1074,7 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )

     if( h->mb.i_neighbour & MB_LEFT )
     {
-        h->mb.cache.i_cbp_left = h->mb.cbp[left];
+        h->mb.cache.i_cbp_left = cbp[left];

         /* load intra4x4 */
         h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] = i4x4[left][4];
@@ -1078,6 +1093,18 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )

         h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] = nnz[left][16+4+1];
         h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = nnz[left][16+4+3];
+
+        /* Finish the prefetching */
+        if( h->sh.i_type != SLICE_TYPE_I )
+            for( int l = 0; l < (h->sh.i_type == SLICE_TYPE_B) + 1; l++ )
+            {
+                x264_prefetch( &h->mb.mv[l][top_4x4-1] );
+                /* Top right being not in the same cacheline as top left will happen
+                 * once every 4 MBs, so one extra prefetch is worthwhile */
+                x264_prefetch( &h->mb.mv[l][top_4x4+4] );
+                x264_prefetch( &h->mb.ref[l][top_8x8-1] );
+                x264_prefetch( &h->mb.mvd[l][top] );
+            }
     }
     else
     {
@@ -1142,11 +1169,8 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
     /* load ref/mv/mvd */
     if( h->sh.i_type != SLICE_TYPE_I )
     {
-        const int s8x8 = h->mb.i_b8_stride;
-        const int s4x4 = h->mb.i_b4_stride;
-        const int top_y = mb_y - (1 << h->mb.b_interlaced);
-        const int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
-        const int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;
+        int s8x8 = h->mb.i_b8_stride;
+        int s4x4 = h->mb.i_b4_stride;

         for( int l = 0; l < (h->sh.i_type == SLICE_TYPE_B) + 1; l++ )
         {
diff --git a/common/osdep.h b/common/osdep.h
index f97547f..35772f7 100644
--- a/common/osdep.h
+++ b/common/osdep.h
@@ -251,6 +251,19 @@ static int ALWAYS_INLINE x264_ctz( uint32_t x )
 }
 #endif

+#if defined(__GNUC__) && defined(HAVE_MMX)
+/* Cache prefetch hint for the data at p.  Don't use __builtin_prefetch here; even as recent as
+ * 4.3.4, GCC seems incapable of using complex address modes properly unless we use inline asm. */
+static ALWAYS_INLINE void x264_prefetch( void *p )
+{
+    asm volatile( "prefetcht0 %0"::"m"(*(uint8_t*)p) ); /* "m" operand: GCC chooses the addressing mode */
+}
+#elif defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 1)
+#define x264_prefetch(x) __builtin_prefetch(x) /* GCC >= 3.2 without HAVE_MMX: use the compiler builtin */
+#else
+#define x264_prefetch(x) /* no prefetch support: expands to nothing */
+#endif
+
 #ifdef USE_REAL_PTHREAD
 #ifdef SYS_MINGW
 #define x264_lower_thread_priority(p)\
--
1.7.0.4


From 8891a9dc2c2602e09c1fc1636b3e3da584cadee2 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Thu, 15 Apr 2010 16:32:31 -0700
Subject: [PATCH 3/6] Move deblocking/hpel into sliced threads
 Instead of doing both as a separate pass, do them during the main encode.
 This requires disabling deblocking between slices (disable_deblock_idc == 2).
 Overall performance gain is about 11% on --preset superfast with sliced threads.
 Doesn't reduce the amount of actual computation done: only better parallelizes it.

---
 common/common.h     |    5 ++-
 common/frame.c      |   12 ++++-
 common/macroblock.c |   68 ++++++++++++++++++-------
 common/macroblock.h |    9 +++-
 encoder/encoder.c   |  136 ++++++++++++++++++++++++++-------------------------
 encoder/lookahead.c |    9 ++-
 6 files changed, 146 insertions(+), 93 deletions(-)

diff --git a/common/common.h b/common/common.h
index 38e9b74..37f309d 100644
--- a/common/common.h
+++ b/common/common.h
@@ -566,7 +566,8 @@ struct x264_t
         int16_t (*mvr[2][32])[2];           /* 16x16 mv for each possible ref */
         int8_t  *skipbp;                    /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */
         int8_t  *mb_transform_size;         /* transform_size_8x8_flag of each mb */
-        uint8_t *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
+        uint16_t *slice_table;              /* sh->first_mb of the slice that the indexed mb is part of
+                                             * NOTE: this will fail on frames with more than 2^16 macroblocks... */

          /* buffer for weighted versions of the reference frames */
         uint8_t *p_weight_buf[16];
@@ -763,7 +764,9 @@ struct x264_t
     ALIGNED_16( uint16_t nr_offset[2][64] );
     uint32_t        nr_count[2];

+    /* Buffers that are allocated per-thread even in sliced threads. */
     void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
+    uint8_t *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */

     /* CPU functions dependents */
     x264_predict_t      predict_16x16[4+3];
diff --git a/common/frame.c b/common/frame.c
index abcfd14..872e067 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -658,6 +658,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
     int stride2y  = stridey << b_interlaced;
     int strideuv  = h->fdec->i_stride[1];
     int stride2uv = strideuv << b_interlaced;
+    int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2;
     uint8_t (*nnz_backup)[16] = h->scratch_buffer;

     if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
@@ -778,9 +779,18 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
          * i_dir == 1 -> horizontal edge */
         #define DEBLOCK_DIR(i_dir)\
         {\
-            int i_edge = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
+            int i_edge = 0;\
             int i_qpn, mbn_xy, mbn_8x8, mbn_4x4;\
             ALIGNED_4( uint8_t bS[4] );  /* filtering strength */\
+            /* We don't have to consider the MBAFF case of a slice breaking in the middle\
+             * of a row because x264 doesn't support that case.  If we add support for that,\
+             * this will have to become significantly more complex. */\
+            if( i_dir == 0 && (mb_x == 0 || (!deblock_on_slice_edges &&\
+                h->mb.slice_table[mb_xy] != h->mb.slice_table[mb_xy-1])) )\
+                i_edge++;\
+            if( i_dir == 1 && (mb_y <= b_interlaced || (!deblock_on_slice_edges &&\
+                h->mb.slice_table[mb_xy] != h->mb.slice_table[mb_xy-(h->mb.i_mb_stride<<b_interlaced)])) )\
+                i_edge++;\
             if( i_edge )\
                 i_edge+= b_8x8_transform;\
             else\
diff --git a/common/macroblock.c b/common/macroblock.c
index 5c9734f..4ef959f 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -675,7 +675,7 @@ void x264_mb_mc( x264_t *h )
     }
 }

-int x264_macroblock_cache_init( x264_t *h )
+int x264_macroblock_cache_allocate( x264_t *h )
 {
     int i_mb_count = h->mb.i_mb_count;

@@ -689,6 +689,8 @@ int x264_macroblock_cache_init( x264_t *h )
     CHECKED_MALLOC( h->mb.cbp, i_mb_count * sizeof(int16_t) );
     CHECKED_MALLOC( h->mb.skipbp, i_mb_count * sizeof(int8_t) );
     CHECKED_MALLOC( h->mb.mb_transform_size, i_mb_count * sizeof(int8_t) );
+    CHECKED_MALLOC( h->mb.slice_table, i_mb_count * sizeof(uint16_t) );
+    memset( h->mb.slice_table, -1, i_mb_count * sizeof(uint16_t) );

     /* 0 -> 3 top(4), 4 -> 6 : left(3) */
     CHECKED_MALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 8 * sizeof(int8_t) );
@@ -755,22 +757,11 @@ int x264_macroblock_cache_init( x264_t *h )
 #undef ALIGN
     }

-    for( int i = 0; i <= h->param.b_interlaced; i++ )
-        for( int j = 0; j < 3; j++ )
-        {
-            /* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */
-            CHECKED_MALLOCZERO( h->mb.intra_border_backup[i][j], (h->sps->i_mb_width*16+32)>>!!j );
-            h->mb.intra_border_backup[i][j] += 8;
-        }
-
     return 0;
 fail: return -1;
 }
-void x264_macroblock_cache_end( x264_t *h )
+void x264_macroblock_cache_free( x264_t *h )
 {
-    for( int i = 0; i <= h->param.b_interlaced; i++ )
-        for( int j = 0; j < 3; j++ )
-            x264_free( h->mb.intra_border_backup[i][j] - 8 );
     for( int i = 0; i < 2; i++ )
         for( int j = 0; j < 32; j++ )
             x264_free( h->mb.mvr[i][j] );
@@ -783,6 +774,7 @@ void x264_macroblock_cache_end( x264_t *h )
         x264_free( h->mb.mvd[0] );
         x264_free( h->mb.mvd[1] );
     }
+    x264_free( h->mb.slice_table );
     x264_free( h->mb.intra4x4_pred_mode );
     x264_free( h->mb.non_zero_count );
     x264_free( h->mb.mb_transform_size );
@@ -790,6 +782,47 @@ void x264_macroblock_cache_end( x264_t *h )
     x264_free( h->mb.cbp );
     x264_free( h->mb.qp );
 }
+
+int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
+{
+    if( !b_lookahead ) /* the intra border backups are skipped when allocating for the lookahead */
+        for( int i = 0; i <= h->param.b_interlaced; i++ )
+            for( int j = 0; j < 3; j++ )
+            {
+                /* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */
+                CHECKED_MALLOCZERO( h->intra_border_backup[i][j], (h->sps->i_mb_width*16+32)>>!!j );
+                h->intra_border_backup[i][j] += 8; /* keep 8 bytes in front of the row; x264_macroblock_thread_free subtracts them again */
+            }
+
+    /* Allocate scratch buffer: one allocation sized for the largest of the users computed below */
+    int scratch_size = 0;
+    if( !b_lookahead )
+    {
+        int buf_hpel = (h->thread[0]->fdec->i_width[0]+48) * sizeof(int16_t);
+        int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
+        int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
+        int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
+            ((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
+        int buf_nnz = !h->param.b_cabac * h->pps->b_transform_8x8_mode * (h->sps->i_mb_width * 4 * 16 * sizeof(uint8_t));
+        scratch_size = X264_MAX4( buf_hpel, buf_ssim, buf_tesa, buf_nnz );
+    }
+    int buf_mbtree = h->param.rc.b_mb_tree * ((h->sps->i_mb_width+3)&~3) * sizeof(int); /* counted whether or not b_lookahead is set */
+    scratch_size = X264_MAX( scratch_size, buf_mbtree );
+    CHECKED_MALLOC( h->scratch_buffer, scratch_size );
+
+    return 0; /* success */
+fail: return -1; /* presumably the jump target of CHECKED_MALLOC* on allocation failure - confirm against the macro */
+}
+
+void x264_macroblock_thread_free( x264_t *h, int b_lookahead )
+{
+    if( !b_lookahead ) /* mirrors x264_macroblock_thread_allocate: border backups exist only when b_lookahead is 0 */
+        for( int i = 0; i <= h->param.b_interlaced; i++ )
+            for( int j = 0; j < 3; j++ )
+                x264_free( h->intra_border_backup[i][j] - 8 ); /* undo the +8 applied after allocation */
+    x264_free( h->scratch_buffer ); /* allocated for every caller, lookahead included */
+}
+
 void x264_macroblock_slice_init( x264_t *h )
 {
     h->mb.mv[0] = h->fdec->mv[0];
@@ -898,8 +931,7 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x
                            ? w * (mb_x + (mb_y&~1) * i_stride) + (mb_y&1) * i_stride
                            : w * (mb_x + mb_y * i_stride);
     const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset];
-    const uint8_t *intra_fdec = h->param.b_sliced_threads ? plane_fdec-i_stride2 :
-                                &h->mb.intra_border_backup[mb_y & h->sh.b_mbaff][i][mb_x*16>>!!i];
+    const uint8_t *intra_fdec = &h->intra_border_backup[mb_y & h->sh.b_mbaff][i][mb_x*16>>!!i];
     int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
     x264_frame_t **fref[2] = { h->fref0, h->fref1 };
     if( h->mb.b_interlaced )
@@ -908,10 +940,7 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x
     h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset];
     h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE,
         h->mb.pic.p_fenc_plane[i], i_stride2, w );
-    if( mb_y > 0 )
-        memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
-    else
-        memset( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], 0, w*3/2+1 );
+    memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
     if( h->mb.b_interlaced )
         for( int j = 0; j < w; j++ )
             h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
@@ -1327,6 +1356,7 @@ void x264_macroblock_cache_save( x264_t *h )
     x264_prefetch_fenc( h, h->fdec, h->mb.i_mb_x, h->mb.i_mb_y );

     h->mb.type[i_mb_xy] = i_mb_type;
+    h->mb.slice_table[i_mb_xy] = h->sh.i_first_mb;
     h->mb.partition[i_mb_xy] = IS_INTRA( i_mb_type ) ? D_16x16 : h->mb.i_partition;
     h->mb.i_mb_prev_xy = i_mb_xy;

diff --git a/common/macroblock.h b/common/macroblock.h
index 5ef1498..ee8c113 100644
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -260,13 +260,18 @@ enum cabac_ctx_block_cat_e
     DCT_LUMA_8x8  = 5,
 };

+/* Per-frame allocation: is allocated per-thread only in frame-threads mode. */
+int  x264_macroblock_cache_allocate( x264_t *h );
+void x264_macroblock_cache_free( x264_t *h );
+
+/* Per-thread allocation: is allocated per-thread even in sliced-threads mode. */
+int  x264_macroblock_thread_allocate( x264_t *h, int b_lookahead );
+void x264_macroblock_thread_free( x264_t *h, int b_lookahead );

-int  x264_macroblock_cache_init( x264_t *h );
 void x264_macroblock_slice_init( x264_t *h );
 void x264_macroblock_thread_init( x264_t *h );
 void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y );
 void x264_macroblock_cache_save( x264_t *h );
-void x264_macroblock_cache_end( x264_t *h );

 void x264_macroblock_bipred_init( x264_t *h );

diff --git a/encoder/encoder.c b/encoder/encoder.c
index 300041e..a07f0ea 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -158,7 +158,7 @@ static void x264_slice_header_init( x264_t *h, x264_slice_header_t *sh,
     int deblock_thresh = i_qp + 2 * X264_MIN(param->i_deblocking_filter_alphac0, param->i_deblocking_filter_beta);
     /* If effective qp <= 15, deblocking would have no effect anyway */
     if( param->b_deblocking_filter && (h->mb.b_variable_qp || 15 < deblock_thresh ) )
-        sh->i_disable_deblocking_filter_idc = 0;
+        sh->i_disable_deblocking_filter_idc = param->b_sliced_threads ? 2 : 0;
     else
         sh->i_disable_deblocking_filter_idc = 1;
     sh->i_alpha_c0_offset = param->i_deblocking_filter_alphac0 << 1;
@@ -519,6 +519,16 @@ static int x264_validate_parameters( x264_t *h )
         h->param.rc.i_vbv_max_bitrate = 0;
     }

+    if( h->param.b_interlaced && h->param.i_slice_max_size )
+    {
+        x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-size is not implemented\n" );
+        h->param.i_slice_max_size = 0;
+    }
+    if( h->param.b_interlaced && h->param.i_slice_max_mbs )
+    {
+        x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-mbs is not implemented\n" );
+        h->param.i_slice_max_mbs = 0;
+    }
     int max_slices = (h->param.i_height+((16<<h->param.b_interlaced)-1))/(16<<h->param.b_interlaced);
     if( h->param.b_sliced_threads )
         h->param.i_slice_count = x264_clip3( h->param.i_threads, 0, max_slices );
@@ -527,16 +537,6 @@ static int x264_validate_parameters( x264_t *h )
         h->param.i_slice_count = x264_clip3( h->param.i_slice_count, 0, max_slices );
         h->param.i_slice_max_size = X264_MAX( h->param.i_slice_max_size, 0 );
         h->param.i_slice_max_mbs = X264_MAX( h->param.i_slice_max_mbs, 0 );
-        if( h->param.b_interlaced && h->param.i_slice_max_size )
-        {
-            x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-size is not implemented\n" );
-            h->param.i_slice_max_size = 0;
-        }
-        if( h->param.b_interlaced && h->param.i_slice_max_mbs )
-        {
-            x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-mbs is not implemented\n" );
-            h->param.i_slice_max_mbs = 0;
-        }
         if( h->param.i_slice_max_mbs || h->param.i_slice_max_size )
             h->param.i_slice_count = 0;
     }
@@ -1059,23 +1059,13 @@ x264_t *x264_encoder_open( x264_param_t *param )
         CHECKED_MALLOC( h->thread[i]->out.nal, init_nal_count*sizeof(x264_nal_t) );
         h->thread[i]->out.i_nals_allocated = init_nal_count;

-        if( allocate_threadlocal_data && x264_macroblock_cache_init( h->thread[i] ) < 0 )
+        if( allocate_threadlocal_data && x264_macroblock_cache_allocate( h->thread[i] ) < 0 )
             goto fail;
     }

-    /* Allocate scratch buffer */
-    for( int i = 0; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
-    {
-        int buf_hpel = (h->fdec->i_width[0]+48) * sizeof(int16_t);
-        int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
-        int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
-        int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
-            ((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
-        int buf_mbtree = h->param.rc.b_mb_tree * ((h->sps->i_mb_width+3)&~3) * sizeof(int);
-        int buf_nnz = !h->param.b_cabac * h->pps->b_transform_8x8_mode * (h->sps->i_mb_width * 4 * 16 * sizeof(uint8_t));
-        int scratch_size = X264_MAX4( buf_hpel, buf_ssim, buf_tesa, X264_MAX( buf_mbtree, buf_nnz ) );
-        CHECKED_MALLOC( h->thread[i]->scratch_buffer, scratch_size );
-    }
+    for( int i = 0; i < h->param.i_threads; i++ )
+        if( x264_macroblock_thread_allocate( h->thread[i], 0 ) < 0 )
+            goto fail;

     if( x264_ratecontrol_new( h ) < 0 )
         goto fail;
@@ -1552,25 +1542,32 @@ static inline void x264_reference_build_list( x264_t *h, int i_poc )
     h->mb.pic.i_fref[1] = h->i_ref1;
 }

-static void x264_fdec_filter_row( x264_t *h, int mb_y )
+static void x264_fdec_filter_row( x264_t *h, int mb_y, int b_inloop )
 {
     /* mb_y is the mb to be encoded next, not the mb to be filtered here */
     int b_hpel = h->fdec->b_kept_as_ref;
-    int b_deblock = !h->sh.i_disable_deblocking_filter_idc;
-    int b_end = mb_y == h->sps->i_mb_height;
+    int b_deblock = h->sh.i_disable_deblocking_filter_idc != 1;
+    int b_end = mb_y == h->i_threadslice_end;
+    int b_measure_quality = 1;
     int min_y = mb_y - (1 << h->sh.b_mbaff);
-    int max_y = b_end ? h->sps->i_mb_height : mb_y;
+    int b_start = min_y == h->i_threadslice_start;
+    int max_y = b_end ? h->i_threadslice_end : mb_y;
     b_deblock &= b_hpel || h->param.psz_dump_yuv;
+    if( h->param.b_sliced_threads && b_start && min_y && !b_inloop )
+    {
+        b_deblock = 0;         /* We already deblocked on the inloop pass. */
+        b_measure_quality = 0; /* We already measured quality on the inloop pass. */
+    }
     if( mb_y & h->sh.b_mbaff )
         return;
-    if( min_y < 0 )
+    if( min_y < h->i_threadslice_start )
         return;

-    if( !b_end && !h->param.b_sliced_threads )
+    if( !b_end && b_inloop )
         for( int j = 0; j <= h->sh.b_mbaff; j++ )
             for( int i = 0; i < 3; i++ )
             {
-                memcpy( h->mb.intra_border_backup[j][i],
+                memcpy( h->intra_border_backup[j][i],
                         h->fdec->plane[i] + ((mb_y*16 >> !!i) + j - 1 - h->sh.b_mbaff) * h->fdec->i_stride[i],
                         h->sps->i_mb_width*16 >> !!i );
             }
@@ -1581,39 +1578,43 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )

     if( b_hpel )
     {
-        x264_frame_expand_border( h, h->fdec, min_y, b_end );
+        int end = mb_y == h->sps->i_mb_height;
+        x264_frame_expand_border( h, h->fdec, min_y, end );
         if( h->param.analyse.i_subpel_refine )
         {
-            x264_frame_filter( h, h->fdec, min_y, b_end );
-            x264_frame_expand_border_filtered( h, h->fdec, min_y, b_end );
+            x264_frame_filter( h, h->fdec, min_y, end );
+            x264_frame_expand_border_filtered( h, h->fdec, min_y, end );
         }
     }

     if( h->i_thread_frames > 1 && h->fdec->b_kept_as_ref )
         x264_frame_cond_broadcast( h->fdec, mb_y*16 + (b_end ? 10000 : -(X264_THREAD_HEIGHT << h->sh.b_mbaff)) );

-    min_y = X264_MAX( min_y*16-8, 0 );
-    max_y = b_end ? h->param.i_height : mb_y*16-8;
-
-    if( h->param.analyse.b_psnr )
-        for( int i = 0; i < 3; i++ )
-            h->stat.frame.i_ssd[i] +=
-                x264_pixel_ssd_wxh( &h->pixf,
-                    h->fdec->plane[i] + (min_y>>!!i) * h->fdec->i_stride[i], h->fdec->i_stride[i],
-                    h->fenc->plane[i] + (min_y>>!!i) * h->fenc->i_stride[i], h->fenc->i_stride[i],
-                    h->param.i_width >> !!i, (max_y-min_y) >> !!i );
+    min_y = min_y*16 - 8 * !b_start;
+    max_y = b_end ? X264_MIN( h->i_threadslice_end*16 , h->param.i_height ) : mb_y*16 - 8;

-    if( h->param.analyse.b_ssim )
+    if( b_measure_quality )
     {
-        x264_emms();
-        /* offset by 2 pixels to avoid alignment of ssim blocks with dct blocks,
-         * and overlap by 4 */
-        min_y += min_y == 0 ? 2 : -6;
-        h->stat.frame.f_ssim +=
-            x264_pixel_ssim_wxh( &h->pixf,
-                h->fdec->plane[0] + 2+min_y*h->fdec->i_stride[0], h->fdec->i_stride[0],
-                h->fenc->plane[0] + 2+min_y*h->fenc->i_stride[0], h->fenc->i_stride[0],
-                h->param.i_width-2, max_y-min_y, h->scratch_buffer );
+        if( h->param.analyse.b_psnr )
+            for( int i = 0; i < 3; i++ )
+                h->stat.frame.i_ssd[i] +=
+                    x264_pixel_ssd_wxh( &h->pixf,
+                        h->fdec->plane[i] + (min_y>>!!i) * h->fdec->i_stride[i], h->fdec->i_stride[i],
+                        h->fenc->plane[i] + (min_y>>!!i) * h->fenc->i_stride[i], h->fenc->i_stride[i],
+                        h->param.i_width >> !!i, (max_y-min_y) >> !!i );
+
+        if( h->param.analyse.b_ssim )
+        {
+            x264_emms();
+            /* offset by 2 pixels to avoid alignment of ssim blocks with dct blocks,
+             * and overlap by 4 */
+            min_y += b_start ? 2 : -6;
+            h->stat.frame.f_ssim +=
+                x264_pixel_ssim_wxh( &h->pixf,
+                    h->fdec->plane[0] + 2+min_y*h->fdec->i_stride[0], h->fdec->i_stride[0],
+                    h->fenc->plane[0] + 2+min_y*h->fenc->i_stride[0], h->fenc->i_stride[0],
+                    h->param.i_width-2, max_y-min_y, h->scratch_buffer );
+        }
     }
 }

@@ -1808,8 +1809,8 @@ static int x264_slice_write( x264_t *h )
             }
         }

-        if( i_mb_x == 0 && !h->mb.b_reencode_mb && !h->param.b_sliced_threads )
-            x264_fdec_filter_row( h, i_mb_y );
+        if( i_mb_x == 0 && !h->mb.b_reencode_mb )
+            x264_fdec_filter_row( h, i_mb_y, 1 );

         /* load cache */
         x264_macroblock_cache_load( h, i_mb_x, i_mb_y );
@@ -1971,14 +1972,13 @@ static int x264_slice_write( x264_t *h )
     if( x264_nal_end( h ) )
         return -1;

-    if( h->sh.i_last_mb == h->mb.i_mb_count-1 )
+    if( h->sh.i_last_mb == (h->i_threadslice_end * h->sps->i_mb_width - 1) )
     {
         h->stat.frame.i_misc_bits = bs_pos( &h->out.bs )
                                   + (h->out.i_nal*NALU_OVERHEAD * 8)
                                   - h->stat.frame.i_tex_bits
                                   - h->stat.frame.i_mv_bits;
-        if( !h->param.b_sliced_threads )
-            x264_fdec_filter_row( h, h->sps->i_mb_height );
+        x264_fdec_filter_row( h, h->i_threadslice_end, 1 );
     }

     return 0;
@@ -2099,9 +2099,9 @@ static int x264_threaded_slices_write( x264_t *h )
             return (intptr_t)ret;
     }

-    /* deblocking and hpel filtering */
-    for( int i = 0; i <= h->sps->i_mb_height; i++ )
-        x264_stack_align( x264_fdec_filter_row, h, i );
+    /* Go back and fix up the hpel on the borders between slices. */
+    for( int i = 1; i < h->param.i_threads; i++ )
+        x264_fdec_filter_row( h->thread[i], h->thread[i]->i_threadslice_start + 1, 0 );

     x264_threads_merge_ratecontrol( h );

@@ -2114,10 +2114,12 @@ static int x264_threaded_slices_write( x264_t *h )
             h->out.i_nal++;
             x264_nal_check_buffer( h );
         }
-        /* All entries in stat.frame are ints except for ssd/ssim,
-         * which are only calculated in the main thread. */
+        /* All entries in stat.frame are ints except for ssd/ssim. */
         for( int j = 0; j < (offsetof(x264_t,stat.frame.i_ssd) - offsetof(x264_t,stat.frame.i_mv_bits)) / sizeof(int); j++ )
             ((int*)&h->stat.frame)[j] += ((int*)&t->stat.frame)[j];
+        for( int j = 0; j < 3; j++ )
+            h->stat.frame.i_ssd[j] += t->stat.frame.i_ssd[j];
+        h->stat.frame.f_ssim += t->stat.frame.f_ssim;
     }

     return 0;
@@ -3072,9 +3074,9 @@ void    x264_encoder_close  ( x264_t *h )
             (*frame)->i_reference_count--;
             if( (*frame)->i_reference_count == 0 )
                 x264_frame_delete( *frame );
-            x264_macroblock_cache_end( h->thread[i] );
+            x264_macroblock_cache_free( h->thread[i] );
         }
-        x264_free( h->thread[i]->scratch_buffer );
+        x264_macroblock_thread_free( h->thread[i], 0 );
         x264_free( h->thread[i]->out.p_bitstream );
         x264_free( h->thread[i]->out.nal);
         x264_free( h->thread[i] );
diff --git a/encoder/lookahead.c b/encoder/lookahead.c
index 7a0c6d3..5e29fb5 100644
--- a/encoder/lookahead.c
+++ b/encoder/lookahead.c
@@ -148,7 +148,10 @@ int x264_lookahead_init( x264_t *h, int i_slicetype_length )

     x264_t *look_h = h->thread[h->param.i_threads];
     *look_h = *h;
-    if( x264_macroblock_cache_init( look_h ) )
+    if( x264_macroblock_cache_allocate( look_h ) )
+        goto fail;
+
+    if( x264_macroblock_thread_allocate( look_h, 1 ) < 0 )
         goto fail;

     if( x264_pthread_create( &look_h->thread_handle, NULL, (void *)x264_lookahead_thread, look_h ) )
@@ -170,8 +173,8 @@ void x264_lookahead_delete( x264_t *h )
         x264_pthread_cond_broadcast( &h->lookahead->ifbuf.cv_fill );
         x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
         x264_pthread_join( h->thread[h->param.i_threads]->thread_handle, NULL );
-        x264_macroblock_cache_end( h->thread[h->param.i_threads] );
-        x264_free( h->thread[h->param.i_threads]->scratch_buffer );
+        x264_macroblock_cache_free( h->thread[h->param.i_threads] );
+        x264_macroblock_thread_free( h->thread[h->param.i_threads], 1 );
         x264_free( h->thread[h->param.i_threads] );
     }
     x264_synch_frame_list_delete( &h->lookahead->ifbuf );
--
1.7.0.4


From cd9762c72e81e036b8eda7d6559d0a867f187c9e Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Fri, 16 Apr 2010 03:06:46 -0700
Subject: [PATCH 4/6] Fix four minor bugs found by Clang

---
 encoder/analyse.c |    2 +-
 encoder/encoder.c |    2 +-
 input/timecode.c  |   17 ++++++++++-------
 output/matroska.c |    2 ++
 4 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/encoder/analyse.c b/encoder/analyse.c
index 2ece9dc..74672d1 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -1480,7 +1480,7 @@ static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a,
         weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
     h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
     if( weight[2].weightfn ) \
-        weight[1].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
+        weight[2].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );


     if( pixel == PIXEL_4x4 )
diff --git a/encoder/encoder.c b/encoder/encoder.c
index a07f0ea..1438ec0 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -1338,7 +1338,7 @@ int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t
         if( h->fref0[i_ref]->i_frame != h->fref0[j]->i_frame )
         {
             /* found a place, after j, make sure there is not already a duplicate there */
-            if( j == i-1 || ( h->fref0[j+1] && h->fref0[i_ref]->i_frame != h->fref0[j+1]->i_frame ) )
+            if( j == i-1 || ( h->fref0[i_ref]->i_frame != h->fref0[j+1]->i_frame ) )
                 break;
         }

diff --git a/input/timecode.c b/input/timecode.c
index 4a369ee..5fabe61 100644
--- a/input/timecode.c
+++ b/input/timecode.c
@@ -194,15 +194,18 @@ static int parse_tcfile( FILE *tcfile_in, timecode_hnd_t *h, video_info_t *info
             ret = sscanf( buff, "%d,%d,%lf", &start, &end, &seq_fps );
             if( ret != 3 )
                 start = end = timecodes_num - 1;
-            if( h->auto_timebase_den || h->auto_timebase_num )
-                fpss[seq_num++] = seq_fps;
-            seq_fps = correct_fps( seq_fps, h );
-            if( seq_fps < 0 )
-                goto fail;
             for( ; num < start && num < timecodes_num - 1; num++ )
                 timecodes[num + 1] = timecodes[num] + 1 / assume_fps;
-            for( num = start; num <= end && num < timecodes_num - 1; num++ )
-                timecodes[num + 1] = timecodes[num] + 1 / seq_fps;
+            if( num < timecodes_num - 1 )
+            {
+                if( h->auto_timebase_den || h->auto_timebase_num )
+                    fpss[seq_num++] = seq_fps;
+                seq_fps = correct_fps( seq_fps, h );
+                if( seq_fps < 0 )
+                    goto fail;
+                for( num = start; num <= end && num < timecodes_num - 1; num++ )
+                    timecodes[num + 1] = timecodes[num] + 1 / seq_fps;
+            }
         }
         if( h->auto_timebase_den || h->auto_timebase_num )
             fpss[seq_num] = h->assume_fps;
diff --git a/output/matroska.c b/output/matroska.c
index 25e91d5..47753d7 100644
--- a/output/matroska.c
+++ b/output/matroska.c
@@ -150,6 +150,12 @@ static int write_headers( hnd_t handle, x264_nal_t *p_nal )
                           avcC, avcC_len, p_mkv->frame_duration, 50000,
                           p_mkv->width, p_mkv->height,
                           p_mkv->d_width, p_mkv->d_height );
+    if( ret < 0 )
+    {
+        /* avcC is only released further down; free it on this early exit too */
+        free( avcC );
+        return ret;
+    }

     free( avcC );

--
1.7.0.4


From 217f4f314a13ae21b4ef559ddfa7cb1ce6b740f8 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Fri, 16 Apr 2010 12:06:07 -0700
Subject: [PATCH 5/6] MMX code for predictor rounding/clipping
 Faster predictor checking at subme < 3.

---
 common/common.h   |   11 +++++++++++
 common/x86/util.h |   41 +++++++++++++++++++++++++++++++++++++++++
 encoder/me.c      |   11 ++++++-----
 3 files changed, 58 insertions(+), 5 deletions(-)

diff --git a/common/common.h b/common/common.h
index 37f309d..ce2e7af 100644
--- a/common/common.h
+++ b/common/common.h
@@ -188,6 +188,22 @@ static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum( uint8_t *mvdleft, uint8_t *mvd
     return amvd0 + (amvd1<<8);
 }
 
+/* Round every candidate in mvc[0..i_mvc-1] with (v+2)>>2 and clamp the result,
+ * in place: component [0] to [mv_x_min,mv_x_max] and component [1] to
+ * [mv_y_min,mv_y_max].  This is the plain C version; common/x86/util.h maps
+ * the same name to an MMX implementation. */
+static void ALWAYS_INLINE x264_predictor_roundclip( int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
+{
+    for( int i = 0; i < i_mvc; i++ )
+    {
+        int mx = (mvc[i][0] + 2) >> 2;
+        int my = (mvc[i][1] + 2) >> 2;
+        mvc[i][0] = x264_clip3( mx, mv_x_min, mv_x_max );
+        /* y goes to [1]; storing it in [0] would overwrite x and leave y unrounded */
+        mvc[i][1] = x264_clip3( my, mv_y_min, mv_y_max );
+    }
+}
+
 extern const uint8_t x264_exp2_lut[64];
 extern const float x264_log2_lut[128];
 extern const float x264_log2_lz_lut[32];
diff --git a/common/x86/util.h b/common/x86/util.h
index e094309..1a5ed32 100644
--- a/common/x86/util.h
+++ b/common/x86/util.h
@@ -45,6 +45,7 @@ static ALWAYS_INLINE void x264_median_mv_mmxext( int16_t *dst, int16_t *a, int16
         :"m"(M32( a )), "m"(M32( b )), "m"(M32( c ))
     );
 }
+
 #define x264_predictor_difference x264_predictor_difference_mmxext
 static ALWAYS_INLINE int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t i_mvc )
 {
@@ -80,6 +81,7 @@ static ALWAYS_INLINE int x264_predictor_difference_mmxext( int16_t (*mvc)[2], in
     );
     return sum;
 }
+
 #define x264_cabac_mvd_sum x264_cabac_mvd_sum_mmxext
 static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmxext(uint8_t *mvdleft, uint8_t *mvdtop)
 {
@@ -103,6 +105,45 @@ static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmxext(uint8_t *mvdleft, uint8_
     );
     return amvd;
 }
+
+#define x264_predictor_roundclip x264_predictor_roundclip_mmxext
+static void ALWAYS_INLINE x264_predictor_roundclip_mmxext( int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
+{ /* In-place (v+2)>>2 rounding and min/max clamping of i_mvc candidate vectors, one or two vectors per step. */
+    uint32_t mv_min = pack16to32_mask( mv_x_min, mv_y_min ); /* x/y lower bounds packed into one 32-bit word */
+    uint32_t mv_max = pack16to32_mask( mv_x_max, mv_y_max ); /* x/y upper bounds packed into one 32-bit word */
+    static const uint64_t pw_2 = 0x0002000200020002ULL; /* the value 2 in each 16-bit lane: the rounding term */
+    intptr_t i = i_mvc; /* vectors left to process; i_mvc == 0 is not handled below (the caller in encoder/me.c checks it first) */
+    asm(
+        "movd    %2, %%mm5       \n" /* mm5 = mv_min */
+        "movd    %3, %%mm6       \n" /* mm6 = mv_max */
+        "movq    %4, %%mm7       \n" /* mm7 = pw_2 */
+        "punpckldq %%mm5, %%mm5  \n" /* copy the bounds into both 32-bit halves */
+        "punpckldq %%mm6, %%mm6  \n"
+        "test $1, %0             \n" /* odd count? */
+        "jz 1f                   \n" /* even: go straight to the two-at-a-time loop */
+        "movd -4(%5,%0,4), %%mm0 \n" /* load the last vector, mvc[i-1] */
+        "paddw %%mm7, %%mm0      \n" /* +2 */
+        "psraw $2, %%mm0         \n" /* >>2 (arithmetic) */
+        "pmaxsw %%mm5, %%mm0     \n" /* clamp from below */
+        "pminsw %%mm6, %%mm0     \n" /* clamp from above */
+        "movd %%mm0, -4(%5,%0,4) \n" /* store it back */
+        "dec %0                  \n"
+        "jz 2f                   \n" /* that was the only vector */
+        "1:                      \n" /* loop: two vectors (8 bytes) per iteration, walking backwards */
+        "movq -8(%5,%0,4), %%mm0 \n" /* load mvc[i-2] and mvc[i-1] */
+        "paddw %%mm7, %%mm0      \n"
+        "psraw $2, %%mm0         \n"
+        "pmaxsw %%mm5, %%mm0     \n"
+        "pminsw %%mm6, %%mm0     \n"
+        "movq %%mm0, -8(%5,%0,4) \n"
+        "sub $2, %0              \n"
+        "jnz 1b                  \n"
+        "2:                      \n"
+        :"+r"(i), "+m"(M64( mvc )) /* operands %0 (counter) and %1 */
+        :"g"(mv_min), "g"(mv_max), "m"(pw_2), "r"(mvc) /* operands %2, %3, %4, %5 */
+    );
+}
+
 #undef M128_CONST
 #define M128_CONST(x) ((__m128){x,x,x,x})
 #define x264_union128_t x264_union128_sse_t
diff --git a/encoder/me.c b/encoder/me.c
index 6788022..0b519ea 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -241,14 +241,15 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
          * sensible to omit the cost of the MV from the rounded MVP to avoid unfairly
          * biasing against use of the predicted motion vector. */
         bcost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[bmy*stride+bmx], stride );
+        uint32_t bmv = pack16to32_mask( bmx, bmy );
+        if( i_mvc )
+            x264_predictor_roundclip( mvc, i_mvc, mv_x_min, mv_x_max, mv_y_min, mv_y_max );
         for( int i = 0; i < i_mvc; i++ )
         {
-            int mx = (mvc[i][0] + 2) >> 2;
-            int my = (mvc[i][1] + 2) >> 2;
-            if( (mx | my) && ((mx-bmx) | (my-bmy)) )
+            if( M32( mvc[i] ) && (bmv - M32( mvc[i] )) )
             {
-                mx = x264_clip3( mx, mv_x_min, mv_x_max );
-                my = x264_clip3( my, mv_y_min, mv_y_max );
+                int mx = mvc[i][0];
+                int my = mvc[i][1];
                 COST_MV( mx, my );
             }
         }
--
1.7.0.4


From 292fc5e6a7c842e70e752eea9d758ad857ac7873 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Fri, 16 Apr 2010 11:36:43 -0700
Subject: [PATCH 6/6] Fix issues with extremely large timebases
 With timebase denominators >= 2^30, x264 would silently overflow and cause odd issues.
 Now x264 will explicitly fail with timebase denominators >= 2^31 and work with timebase denominators 2^31 > x >= 2^30.

---
 common/common.c       |   14 +++++++-------
 common/common.h       |    2 +-
 common/set.h          |    4 ++--
 encoder/encoder.c     |   22 +++++++++++++++-------
 encoder/ratecontrol.c |    4 ++--
 input/input.h         |   12 ++++++------
 input/timecode.c      |   32 +++++++++++++++++---------------
 input/y4m.c           |    3 ++-
 output/flv.c          |    4 ++--
 output/matroska.c     |    4 ++--
 output/mp4.c          |    2 +-
 x264.c                |    8 ++++----
 x264.h                |   11 +++++------
 13 files changed, 66 insertions(+), 56 deletions(-)

diff --git a/common/common.c b/common/common.c
index 924323a..6471c07 100644
--- a/common/common.c
+++ b/common/common.c
@@ -614,7 +614,7 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
     }
     OPT("fps")
     {
-        if( sscanf( value, "%d/%d", &p->i_fps_num, &p->i_fps_den ) == 2 )
+        if( sscanf( value, "%u/%u", &p->i_fps_num, &p->i_fps_den ) == 2 )
             ;
         else
         {
@@ -1119,11 +1119,11 @@ void x264_free( void *p )
 /****************************************************************************
  * x264_reduce_fraction:
  ****************************************************************************/
-void x264_reduce_fraction( int *n, int *d )
+void x264_reduce_fraction( uint32_t *n, uint32_t *d )
 {
-    int a = *n;
-    int b = *d;
-    int c;
+    uint32_t a = *n;
+    uint32_t b = *d;
+    uint32_t c;
     if( !a || !b )
         return;
     c = a % b;
@@ -1185,8 +1185,8 @@ char *x264_param2string( x264_param_t *p, int b_res )
     if( b_res )
     {
         s += sprintf( s, "%dx%d ", p->i_width, p->i_height );
-        s += sprintf( s, "fps=%d/%d ", p->i_fps_num, p->i_fps_den );
-        s += sprintf( s, "timebase=%d/%d ", p->i_timebase_num, p->i_timebase_den );
+        s += sprintf( s, "fps=%u/%u ", p->i_fps_num, p->i_fps_den );
+        s += sprintf( s, "timebase=%u/%u ", p->i_timebase_num, p->i_timebase_den );
     }

     s += sprintf( s, "cabac=%d", p->b_cabac );
diff --git a/common/common.h b/common/common.h
index ce2e7af..f4bd5dc 100644
--- a/common/common.h
+++ b/common/common.h
@@ -134,7 +134,7 @@ int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_sta
 /* log */
 void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );

-void x264_reduce_fraction( int *n, int *d );
+void x264_reduce_fraction( uint32_t *n, uint32_t *d );
 void x264_init_vlc_tables();

 static ALWAYS_INLINE uint8_t x264_clip_uint8( int x )
diff --git a/common/set.h b/common/set.h
index 9783118..ee27d74 100644
--- a/common/set.h
+++ b/common/set.h
@@ -112,8 +112,8 @@ typedef struct
         int i_chroma_loc_bottom;

         int b_timing_info_present;
-        int i_num_units_in_tick;
-        int i_time_scale;
+        uint32_t i_num_units_in_tick;
+        uint32_t i_time_scale;
         int b_fixed_frame_rate;

         int b_nal_hrd_parameters_present;
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 1438ec0..9b21d92 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -817,10 +817,10 @@ static void x264_set_aspect_ratio( x264_t *h, x264_param_t *param, int initial )
     /* VUI */
     if( param->vui.i_sar_width > 0 && param->vui.i_sar_height > 0 )
     {
-        int i_w = param->vui.i_sar_width;
-        int i_h = param->vui.i_sar_height;
-        int old_w = h->param.vui.i_sar_width;
-        int old_h = h->param.vui.i_sar_height;
+        uint32_t i_w = param->vui.i_sar_width;
+        uint32_t i_h = param->vui.i_sar_height;
+        uint32_t old_w = h->param.vui.i_sar_width;
+        uint32_t old_h = h->param.vui.i_sar_height;

         x264_reduce_fraction( &i_w, &i_h );

@@ -886,21 +886,29 @@ x264_t *x264_encoder_open( x264_param_t *param )
     h->i_frame = -1;
     h->i_frame_num = 0;
     h->i_idr_pic_id = 0;
+    uint64_t new_timebase_den = h->param.i_timebase_den; /* 64-bit so the scaled denominator can be range-checked before it is stored back */
     if( h->param.b_dts_compress )
     {
         /* h->i_dts_compress_multiplier == h->frames.i_bframe_delay + 1 */
         h->i_dts_compress_multiplier = h->param.i_bframe ? (h->param.i_bframe_pyramid ? 3 : 2) : 1;
         if( h->i_dts_compress_multiplier != 1 )
         {
-            x264_log( h, X264_LOG_DEBUG, "DTS compresion changed timebase: %d/%d -> %d/%d\n",
+            new_timebase_den *= h->i_dts_compress_multiplier; /* multiply the 64-bit copy: a uint32_t product would wrap before the range check */
+            x264_log( h, X264_LOG_DEBUG, "DTS compresion changed timebase: %u/%u -> %u/%"PRIu64"\n",
                       h->param.i_timebase_num, h->param.i_timebase_den,
-                      h->param.i_timebase_num, h->param.i_timebase_den * h->i_dts_compress_multiplier );
-            h->param.i_timebase_den *= h->i_dts_compress_multiplier;
+                      h->param.i_timebase_num, new_timebase_den );
         }
     }
     else
         h->i_dts_compress_multiplier = 1;

+    if( new_timebase_den * 2 >= (1ULL << 32) )
+    {
+        x264_log( h, X264_LOG_ERROR, "Effective timebase denominator %"PRIu64" exceeds H.264 maximum\n", new_timebase_den );
+        goto fail;
+    }
+    h->param.i_timebase_den = new_timebase_den;
+
     h->sps = &h->sps_array[0];
     x264_sps_init( h->sps, h->param.i_sps_id, &h->param );

diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index b51dbf7..8dd38f1 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -657,14 +657,14 @@ int x264_ratecontrol_new( x264_t *h )
                 return -1;
             }

-            if( ( p = strstr( opts, "timebase=" ) ) && sscanf( p, "timebase=%d/%d", &i, &j ) != 2 )
+            if( ( p = strstr( opts, "timebase=" ) ) && sscanf( p, "timebase=%u/%u", &i, &j ) != 2 )
             {
                 x264_log( h, X264_LOG_ERROR, "timebase specified in stats file not valid\n" );
                 return -1;
             }
             if( i != h->param.i_timebase_num || j != h->param.i_timebase_den )
             {
-                x264_log( h, X264_LOG_ERROR, "timebase mismatch with 1st pass (%d/%d vs %d/%d)\n",
+                x264_log( h, X264_LOG_ERROR, "timebase mismatch with 1st pass (%u/%u vs %u/%u)\n",
                           h->param.i_timebase_num, h->param.i_timebase_den, i, j );
                 return -1;
             }
diff --git a/input/input.h b/input/input.h
index b6cd218..eb62fdd 100644
--- a/input/input.h
+++ b/input/input.h
@@ -38,15 +38,15 @@ typedef struct
 typedef struct
 {
     int csp; /* X264_CSP_YV12 or X264_CSP_I420 */
-    int fps_num;
-    int fps_den;
+    uint32_t fps_num;
+    uint32_t fps_den;
     int height;
     int interlaced;
-    int sar_width;
-    int sar_height;
+    uint32_t sar_width;
+    uint32_t sar_height;
     int tff;
-    int timebase_num;
-    int timebase_den;
+    uint32_t timebase_num;
+    uint32_t timebase_den;
     int vfr;
     int width;
 } video_info_t;
diff --git a/input/timecode.c b/input/timecode.c
index 5fabe61..008cb19 100644
--- a/input/timecode.c
+++ b/input/timecode.c
@@ -32,8 +32,8 @@ typedef struct
     int frame_total;
     int auto_timebase_num;
     int auto_timebase_den;
-    int timebase_num;
-    int timebase_den;
+    int64_t timebase_num;
+    int64_t timebase_den;
     int seek;
     int stored_pts_num;
     int64_t *pts;
@@ -53,7 +53,7 @@ static inline double sigexp10( double value, double *exponent )

 static double correct_fps( double fps, timecode_hnd_t *h )
 {
-    int64_t i = 1;
+    int i = 1;
     int64_t fps_num, fps_den;
     double exponent;
     double fps_sig = sigexp10( fps, &exponent );
@@ -61,7 +61,7 @@ static double correct_fps( double fps, timecode_hnd_t *h )
     {
         fps_den = i * h->timebase_num;
         fps_num = round( fps_den * fps_sig ) * exponent;
-        if( fps_num < 0 )
+        if( fps_num > UINT_MAX )
         {
             fprintf( stderr, "timecode [error]: tcfile fps correction failed.\n"
                              "                  Specify an appropriate timebase manually or remake tcfile.\n" );
@@ -74,7 +74,7 @@ static double correct_fps( double fps, timecode_hnd_t *h )
     if( h->auto_timebase_den )
     {
         h->timebase_den = h->timebase_den ? lcm( h->timebase_den, fps_num ) : fps_num;
-        if( h->timebase_den < 0 )
+        if( h->timebase_den > UINT_MAX )
             h->auto_timebase_den = 0;
     }
     return (double)fps_num / fps_den;
@@ -86,12 +86,12 @@ static int try_mkv_timebase_den( double *fpss, timecode_hnd_t *h, int loop_num )
     h->timebase_den = MKV_TIMEBASE_DEN;
     for( int num = 0; num < loop_num; num++ )
     {
-        int fps_den;
+        int64_t fps_den;
         double exponent;
         double fps_sig = sigexp10( fpss[num], &exponent );
         fps_den = round( MKV_TIMEBASE_DEN / fps_sig ) / exponent;
         h->timebase_num = fps_den > 0 && h->timebase_num ? gcd( h->timebase_num, fps_den ) : fps_den;
-        if( h->timebase_num <= 0 )
+        if( h->timebase_num > UINT_MAX || !h->timebase_num )
         {
             fprintf( stderr, "timecode [error]: automatic timebase generation failed.\n"
                              "                  Specify timebase manually.\n" );
@@ -305,19 +305,19 @@ static int parse_tcfile( FILE *tcfile_in, timecode_hnd_t *h, video_info_t *info
                 if( h->timebase_den >= 0 )
                 {
                     int i = 1;
-                    int fps_num, fps_den;
+                    int64_t fps_num, fps_den;
                     double exponent;
                     double fps_sig = sigexp10( fpss[num], &exponent );
                     while( 1 )
                     {
                         fps_den = i * h->timebase_num;
                         fps_num = round( fps_den * fps_sig ) * exponent;
-                        if( fps_num < 0 || fabs( ((double)fps_num / fps_den) / exponent - fps_sig ) < DOUBLE_EPSILON )
+                        if( fps_num > UINT_MAX || fabs( ((double)fps_num / fps_den) / exponent - fps_sig ) < DOUBLE_EPSILON )
                             break;
                         ++i;
                     }
                     h->timebase_den = fps_num > 0 && h->timebase_den ? lcm( h->timebase_den, fps_num ) : fps_num;
-                    if( h->timebase_den < 0 )
+                    if( h->timebase_den > UINT_MAX )
                     {
                         h->auto_timebase_den = 0;
                         continue;
@@ -339,10 +339,12 @@ static int parse_tcfile( FILE *tcfile_in, timecode_hnd_t *h, video_info_t *info

     if( h->auto_timebase_den || h->auto_timebase_num )
     {
-        x264_reduce_fraction( &h->timebase_num, &h->timebase_den );
-        fprintf( stderr, "timecode [info]: automatic timebase generation %d/%d\n", h->timebase_num, h->timebase_den );
+        int64_t i = gcd( h->timebase_num, h->timebase_den );
+        h->timebase_num /= i;
+        h->timebase_den /= i;
+        fprintf( stderr, "timecode [info]: automatic timebase generation %"PRId64"/%"PRId64"\n", h->timebase_num, h->timebase_den );
     }
-    else if( h->timebase_den <= 0 )
+    else if( h->timebase_den > UINT_MAX || !h->timebase_den )
     {
         fprintf( stderr, "timecode [error]: automatic timebase generation failed.\n"
                          "                  Specify an appropriate timebase manually.\n" );
@@ -394,9 +396,9 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c
     h->frame_total = input.get_frame_total( h->p_handle );
     h->seek = opt->seek;
     if( opt->timebase )
-        ret = sscanf( opt->timebase, "%d/%d", &h->timebase_num, &h->timebase_den );
+        ret = sscanf( opt->timebase, "%"PRId64"/%"PRId64"", &h->timebase_num, &h->timebase_den );
     if( ret == 1 )
-        h->timebase_num = atoi( opt->timebase );
+        h->timebase_num = strtoul( opt->timebase, NULL, 10 );
     h->auto_timebase_num = !ret;
     h->auto_timebase_den = ret < 2;
     if( h->auto_timebase_num )
diff --git a/input/y4m.c b/input/y4m.c
index c34f264..842b986 100644
--- a/input/y4m.c
+++ b/input/y4m.c
@@ -40,7 +40,8 @@ typedef struct
 static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
 {
     y4m_hnd_t *h = malloc( sizeof(y4m_hnd_t) );
-    int  i, n, d;
+    int i;
+    uint32_t n, d;
     char header[MAX_YUV4_HEADER+10];
     char *tokend, *header_end;
     int colorspace = X264_CSP_NONE;
diff --git a/output/flv.c b/output/flv.c
index 04f4428..e441b6d 100644
--- a/output/flv.c
+++ b/output/flv.c
@@ -47,8 +47,8 @@ typedef struct
     int64_t i_prev_dts;
     int64_t i_prev_pts;

-    int i_timebase_num;
-    int i_timebase_den;
+    uint32_t i_timebase_num;
+    uint32_t i_timebase_den;
     int b_vfr_input;

     unsigned start;
diff --git a/output/matroska.c b/output/matroska.c
index 47753d7..0304c84 100644
--- a/output/matroska.c
+++ b/output/matroska.c
@@ -30,8 +30,8 @@ typedef struct
     int64_t frame_duration;

     char b_writing_frame;
-    int i_timebase_num;
-    int i_timebase_den;
+    uint32_t i_timebase_num;
+    uint32_t i_timebase_den;

 } mkv_hnd_t;

diff --git a/output/mp4.c b/output/mp4.c
index cbe9f5c..f76541e 100644
--- a/output/mp4.c
+++ b/output/mp4.c
@@ -38,7 +38,7 @@ typedef struct
     GF_ISOSample *p_sample;
     int i_track;
     uint32_t i_descidx;
-    int i_time_res;
+    uint32_t i_time_res;
     int64_t i_time_inc;
     int i_numframe;
     int i_delay_time;
diff --git a/x264.c b/x264.c
index 3f46fd9..cabdb1d 100644
--- a/x264.c
+++ b/x264.c
@@ -1205,9 +1205,9 @@ generic_option:
     }
     if( !tcfile_name && input_opt.timebase )
     {
-        int i_user_timebase_num;
-        int i_user_timebase_den;
-        int ret = sscanf( input_opt.timebase, "%d/%d", &i_user_timebase_num, &i_user_timebase_den );
+        uint32_t i_user_timebase_num;
+        uint32_t i_user_timebase_den;
+        int ret = sscanf( input_opt.timebase, "%u/%u", &i_user_timebase_num, &i_user_timebase_den );
         if( !ret )
         {
             fprintf( stderr, "x264 [error]: invalid argument: timebase = %s\n", input_opt.timebase );
@@ -1216,7 +1216,7 @@ generic_option:
         else if( ret == 1 )
         {
             i_user_timebase_num = param->i_timebase_num;
-            i_user_timebase_den = atoi( input_opt.timebase );
+            i_user_timebase_den = strtoul( input_opt.timebase, NULL, 10 );
         }
         opt->timebase_convert_multiplier = ((double)i_user_timebase_den / param->i_timebase_den)
                                          * ((double)param->i_timebase_num / i_user_timebase_num);
diff --git a/x264.h b/x264.h
index d30effe..83f087e 100644
--- a/x264.h
+++ b/x264.h
@@ -35,7 +35,7 @@

 #include <stdarg.h>

-#define X264_BUILD 93
+#define X264_BUILD 94

 /* x264_t:
  *      opaque handler for encoder */
@@ -208,9 +208,6 @@ typedef struct x264_param_t
         int         i_chroma_loc;    /* both top & bottom */
     } vui;

-    int         i_fps_num;
-    int         i_fps_den;
-
     /* Bitstream parameters */
     int         i_frame_reference;  /* Maximum number of reference frames */
     int         i_keyint_max;       /* Force an IDR keyframe at this interval */
@@ -330,8 +327,10 @@ typedef struct x264_param_t
                                  * otherwise place size (4 bytes) before NAL units. */
     int i_sps_id;               /* SPS and PPS id number */
     int b_vfr_input;            /* VFR input */
-    int i_timebase_num;         /* Timebase numerator */
-    int i_timebase_den;         /* Timebase denominator */
+    uint32_t i_fps_num;         /* Frame rate numerator */
+    uint32_t i_fps_den;         /* Frame rate denominator */
+    uint32_t i_timebase_num;    /* Timebase numerator */
+    uint32_t i_timebase_den;    /* Timebase denominator */
     int b_dts_compress;         /* DTS compression: this algorithm eliminates negative DTS
                                  * by compressing them to be less than the second PTS.
                                  * Warning: this will change the timebase! */
--
1.7.0.4