SHARE
TWEET

Untitled

Pasted by a guest on May 19th, 2017 — 164 views — expiry: Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. From 5a463b2ff722915b2f27a8aeb4d1eaaa49de28f3 Mon Sep 17 00:00:00 2001
  2. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  3. Date: Tue, 13 Apr 2010 01:08:29 -0700
  4. Subject: [PATCH 1/6] Add CP128/M128 macros using SSE, fix some aliasing
  5.  Significantly improve the speed of cache_load and cache_save functions.
  6.  Also fix a ton of pessimization in cache_save and cache_load due to aliasing.
  7.  
  8. ---
  9. common/common.h     |    5 +
  10.  common/macroblock.c |  203 +++++++++++++++++++++++++++------------------------
  11.  common/x86/util.h   |    8 ++
  12.  3 files changed, 120 insertions(+), 96 deletions(-)
  13.  
  14. diff --git a/common/common.h b/common/common.h
  15. index b8c6dfd..38e9b74 100644
  16. --- a/common/common.h
  17. +++ b/common/common.h
  18. @@ -88,12 +88,17 @@ do {\
  19.  typedef union { uint16_t i; uint8_t  c[2]; } MAY_ALIAS x264_union16_t;
  20.  typedef union { uint32_t i; uint16_t b[2]; uint8_t  c[4]; } MAY_ALIAS x264_union32_t;
  21.  typedef union { uint64_t i; uint32_t a[2]; uint16_t b[4]; uint8_t c[8]; } MAY_ALIAS x264_union64_t;
  22. +typedef struct { uint64_t i[2]; } x264_uint128_t;
  23. +typedef union { x264_uint128_t i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS x264_union128_t;
  24.  #define M16(src) (((x264_union16_t*)(src))->i)
  25.  #define M32(src) (((x264_union32_t*)(src))->i)
  26.  #define M64(src) (((x264_union64_t*)(src))->i)
  27. +#define M128(src) (((x264_union128_t*)(src))->i)
  28. +#define M128_CONST(x) ((x264_uint128_t){{x,x}})
  29.  #define CP16(dst,src) M16(dst) = M16(src)
  30.  #define CP32(dst,src) M32(dst) = M32(src)
  31.  #define CP64(dst,src) M64(dst) = M64(src)
  32. +#define CP128(dst,src) M128(dst) = M128(src)
  33.  
  34.  #include "x264.h"
  35.  #include "bs.h"
  36. diff --git a/common/macroblock.c b/common/macroblock.c
  37. index 0b9b903..fb4c1a5 100644
  38. --- a/common/macroblock.c
  39. +++ b/common/macroblock.c
  40. @@ -1026,19 +1026,23 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
  41.      int left = h->mb.i_mb_left_xy;
  42.      int top  = h->mb.i_mb_top_xy;
  43.  
  44. +    /* GCC pessimizes direct loads from heap-allocated arrays due to aliasing.*/
  45. +    /* By only dereferencing them once, we avoid this issue. */
  46. +    int8_t (*i4x4)[8] = h->mb.intra4x4_pred_mode;
  47. +    uint8_t (*nnz)[24] = h->mb.non_zero_count;
  48. +
  49.      /* load cache */
  50.      if( h->mb.i_neighbour & MB_TOP )
  51.      {
  52.          h->mb.cache.i_cbp_top = h->mb.cbp[top];
  53. -
  54.          /* load intra4x4 */
  55. -        CP32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8], &h->mb.intra4x4_pred_mode[top][0] );
  56. +        CP32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8], &i4x4[top][0] );
  57.  
  58.          /* load non_zero_count */
  59. -        CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &h->mb.non_zero_count[top][12] );
  60. +        CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[top][12] );
  61.          /* shift because x264_scan8[16] is misaligned */
  62. -        M32( &h->mb.cache.non_zero_count[x264_scan8[16+0] - 9] ) = M16( &h->mb.non_zero_count[top][18] ) << 8;
  63. -        M32( &h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] ) = M16( &h->mb.non_zero_count[top][22] ) << 8;
  64. +        M32( &h->mb.cache.non_zero_count[x264_scan8[16+0] - 9] ) = M16( &nnz[top][18] ) << 8;
  65. +        M32( &h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] ) = M16( &nnz[top][22] ) << 8;
  66.      }
  67.      else
  68.      {
  69. @@ -1058,22 +1062,22 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
  70.          h->mb.cache.i_cbp_left = h->mb.cbp[left];
  71.  
  72.          /* load intra4x4 */
  73. -        h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] = h->mb.intra4x4_pred_mode[left][4];
  74. -        h->mb.cache.intra4x4_pred_mode[x264_scan8[2 ] - 1] = h->mb.intra4x4_pred_mode[left][5];
  75. -        h->mb.cache.intra4x4_pred_mode[x264_scan8[8 ] - 1] = h->mb.intra4x4_pred_mode[left][6];
  76. -        h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = h->mb.intra4x4_pred_mode[left][3];
  77. +        h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] = i4x4[left][4];
  78. +        h->mb.cache.intra4x4_pred_mode[x264_scan8[2 ] - 1] = i4x4[left][5];
  79. +        h->mb.cache.intra4x4_pred_mode[x264_scan8[8 ] - 1] = i4x4[left][6];
  80. +        h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = i4x4[left][3];
  81.  
  82.          /* load non_zero_count */
  83. -        h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = h->mb.non_zero_count[left][3];
  84. -        h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = h->mb.non_zero_count[left][7];
  85. -        h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = h->mb.non_zero_count[left][11];
  86. -        h->mb.cache.non_zero_count[x264_scan8[10] - 1] = h->mb.non_zero_count[left][15];
  87. +        h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left][3];
  88. +        h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left][7];
  89. +        h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left][11];
  90. +        h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left][15];
  91.  
  92. -        h->mb.cache.non_zero_count[x264_scan8[16+0] - 1] = h->mb.non_zero_count[left][16+1];
  93. -        h->mb.cache.non_zero_count[x264_scan8[16+2] - 1] = h->mb.non_zero_count[left][16+3];
  94. +        h->mb.cache.non_zero_count[x264_scan8[16+0] - 1] = nnz[left][16+1];
  95. +        h->mb.cache.non_zero_count[x264_scan8[16+2] - 1] = nnz[left][16+3];
  96.  
  97. -        h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] = h->mb.non_zero_count[left][16+4+1];
  98. -        h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = h->mb.non_zero_count[left][16+4+3];
  99. +        h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] = nnz[left][16+4+1];
  100. +        h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = nnz[left][16+4+3];
  101.      }
  102.      else
  103.      {
  104. @@ -1146,11 +1150,14 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
  105.  
  106.          for( int l = 0; l < (h->sh.i_type == SLICE_TYPE_B) + 1; l++ )
  107.          {
  108. +            int16_t (*mv)[2] = h->mb.mv[l];
  109. +            int8_t *ref = h->mb.ref[l];
  110. +
  111.              int i8 = x264_scan8[0] - 1 - 1*8;
  112.              if( h->mb.i_neighbour & MB_TOPLEFT )
  113.              {
  114. -                h->mb.cache.ref[l][i8] = h->mb.ref[l][top_8x8 - 1];
  115. -                CP32( h->mb.cache.mv[l][i8], h->mb.mv[l][top_4x4 - 1] );
  116. +                h->mb.cache.ref[l][i8] = ref[top_8x8 - 1];
  117. +                CP32( h->mb.cache.mv[l][i8], mv[top_4x4 - 1] );
  118.              }
  119.              else
  120.              {
  121. @@ -1162,24 +1169,22 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
  122.              if( h->mb.i_neighbour & MB_TOP )
  123.              {
  124.                  h->mb.cache.ref[l][i8+0] =
  125. -                h->mb.cache.ref[l][i8+1] = h->mb.ref[l][top_8x8 + 0];
  126. +                h->mb.cache.ref[l][i8+1] = ref[top_8x8 + 0];
  127.                  h->mb.cache.ref[l][i8+2] =
  128. -                h->mb.cache.ref[l][i8+3] = h->mb.ref[l][top_8x8 + 1];
  129. -                CP64( h->mb.cache.mv[l][i8+0], h->mb.mv[l][top_4x4+0] );
  130. -                CP64( h->mb.cache.mv[l][i8+2], h->mb.mv[l][top_4x4+2] );
  131. +                h->mb.cache.ref[l][i8+3] = ref[top_8x8 + 1];
  132. +                CP128( h->mb.cache.mv[l][i8], mv[top_4x4] );
  133.              }
  134.              else
  135.              {
  136. -                M64( h->mb.cache.mv[l][i8+0] ) = 0;
  137. -                M64( h->mb.cache.mv[l][i8+2] ) = 0;
  138. +                M128( h->mb.cache.mv[l][i8] ) = M128_CONST( 0 );
  139.                  M32( &h->mb.cache.ref[l][i8] ) = (uint8_t)(-2) * 0x01010101U;
  140.              }
  141.  
  142.              i8 = x264_scan8[0] + 4 - 1*8;
  143.              if( h->mb.i_neighbour & MB_TOPRIGHT )
  144.              {
  145. -                h->mb.cache.ref[l][i8] = h->mb.ref[l][top_8x8 + 2];
  146. -                CP32( h->mb.cache.mv[l][i8], h->mb.mv[l][top_4x4 + 4] );
  147. +                h->mb.cache.ref[l][i8] = ref[top_8x8 + 2];
  148. +                CP32( h->mb.cache.mv[l][i8], mv[top_4x4 + 4] );
  149.              }
  150.              else
  151.                   h->mb.cache.ref[l][i8] = -2;
  152. @@ -1190,14 +1195,14 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
  153.                  const int ir = h->mb.i_b8_xy - 1;
  154.                  const int iv = h->mb.i_b4_xy - 1;
  155.                  h->mb.cache.ref[l][i8+0*8] =
  156. -                h->mb.cache.ref[l][i8+1*8] = h->mb.ref[l][ir + 0*s8x8];
  157. +                h->mb.cache.ref[l][i8+1*8] = ref[ir + 0*s8x8];
  158.                  h->mb.cache.ref[l][i8+2*8] =
  159. -                h->mb.cache.ref[l][i8+3*8] = h->mb.ref[l][ir + 1*s8x8];
  160. +                h->mb.cache.ref[l][i8+3*8] = ref[ir + 1*s8x8];
  161.  
  162. -                CP32( h->mb.cache.mv[l][i8+0*8], h->mb.mv[l][iv + 0*s4x4] );
  163. -                CP32( h->mb.cache.mv[l][i8+1*8], h->mb.mv[l][iv + 1*s4x4] );
  164. -                CP32( h->mb.cache.mv[l][i8+2*8], h->mb.mv[l][iv + 2*s4x4] );
  165. -                CP32( h->mb.cache.mv[l][i8+3*8], h->mb.mv[l][iv + 3*s4x4] );
  166. +                CP32( h->mb.cache.mv[l][i8+0*8], mv[iv + 0*s4x4] );
  167. +                CP32( h->mb.cache.mv[l][i8+1*8], mv[iv + 1*s4x4] );
  168. +                CP32( h->mb.cache.mv[l][i8+2*8], mv[iv + 2*s4x4] );
  169. +                CP32( h->mb.cache.mv[l][i8+3*8], mv[iv + 3*s4x4] );
  170.              }
  171.              else
  172.              {
  173. @@ -1210,17 +1215,18 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
  174.  
  175.              if( h->param.b_cabac )
  176.              {
  177. +                uint8_t (*mvd)[8][2] = h->mb.mvd[l];
  178.                  if( h->mb.i_neighbour & MB_TOP )
  179. -                    CP64( h->mb.cache.mvd[l][x264_scan8[0] - 8], h->mb.mvd[l][top][0] );
  180. +                    CP64( h->mb.cache.mvd[l][x264_scan8[0] - 8], mvd[top][0] );
  181.                  else
  182.                      M64( h->mb.cache.mvd[l][x264_scan8[0] - 8] ) = 0;
  183.  
  184.                  if( h->mb.i_neighbour & MB_LEFT )
  185.                  {
  186. -                    CP16( h->mb.cache.mvd[l][x264_scan8[0 ] - 1], h->mb.mvd[l][left][4] );
  187. -                    CP16( h->mb.cache.mvd[l][x264_scan8[2 ] - 1], h->mb.mvd[l][left][5] );
  188. -                    CP16( h->mb.cache.mvd[l][x264_scan8[8 ] - 1], h->mb.mvd[l][left][6] );
  189. -                    CP16( h->mb.cache.mvd[l][x264_scan8[10] - 1], h->mb.mvd[l][left][3] );
  190. +                    CP16( h->mb.cache.mvd[l][x264_scan8[0 ] - 1], mvd[left][4] );
  191. +                    CP16( h->mb.cache.mvd[l][x264_scan8[2 ] - 1], mvd[left][5] );
  192. +                    CP16( h->mb.cache.mvd[l][x264_scan8[8 ] - 1], mvd[left][6] );
  193. +                    CP16( h->mb.cache.mvd[l][x264_scan8[10] - 1], mvd[left][3] );
  194.                  }
  195.                  else
  196.                      for( int i = 0; i < 4; i++ )
  197. @@ -1285,10 +1291,10 @@ void x264_macroblock_cache_save( x264_t *h )
  198.      const int i_mb_4x4 = h->mb.i_b4_xy;
  199.      const int i_mb_8x8 = h->mb.i_b8_xy;
  200.  
  201. -    /* GCC pessimizes direct stores to heap-allocated 8-bit arrays due to aliasing.*/
  202. +    /* GCC pessimizes direct stores to heap-allocated arrays due to aliasing.*/
  203.      /* By only dereferencing them once, we avoid this issue. */
  204. -    int8_t *intra4x4_pred_mode = h->mb.intra4x4_pred_mode[i_mb_xy];
  205. -    uint8_t *non_zero_count = h->mb.non_zero_count[i_mb_xy];
  206. +    int8_t *i4x4 = h->mb.intra4x4_pred_mode[i_mb_xy];
  207. +    uint8_t *nnz = h->mb.non_zero_count[i_mb_xy];
  208.  
  209.      x264_macroblock_store_pic( h, 0 );
  210.      x264_macroblock_store_pic( h, 1 );
  211. @@ -1303,15 +1309,15 @@ void x264_macroblock_cache_save( x264_t *h )
  212.      /* save intra4x4 */
  213.      if( i_mb_type == I_4x4 )
  214.      {
  215. -        CP32( &intra4x4_pred_mode[0], &h->mb.cache.intra4x4_pred_mode[x264_scan8[10]] );
  216. -        M32( &intra4x4_pred_mode[4] ) = pack8to32(h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ],
  217. -                                                  h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ],
  218. -                                                  h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ], 0);
  219. +        CP32( &i4x4[0], &h->mb.cache.intra4x4_pred_mode[x264_scan8[10]] );
  220. +        M32( &i4x4[4] ) = pack8to32( h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ],
  221. +                                     h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ],
  222. +                                     h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ], 0);
  223.      }
  224.      else if( !h->param.b_constrained_intra || IS_INTRA(i_mb_type) )
  225. -        M64( intra4x4_pred_mode ) = I_PRED_4x4_DC * 0x0101010101010101ULL;
  226. +        M64( i4x4 ) = I_PRED_4x4_DC * 0x0101010101010101ULL;
  227.      else
  228. -        M64( intra4x4_pred_mode ) = (uint8_t)(-1) * 0x0101010101010101ULL;
  229. +        M64( i4x4 ) = (uint8_t)(-1) * 0x0101010101010101ULL;
  230.  
  231.  
  232.      if( i_mb_type == I_PCM )
  233. @@ -1322,19 +1328,19 @@ void x264_macroblock_cache_save( x264_t *h )
  234.          h->mb.i_cbp_luma = 0xf;
  235.          h->mb.cbp[i_mb_xy] = 0x72f;   /* all set */
  236.          h->mb.b_transform_8x8 = 0;
  237. -        memset( non_zero_count, 16, sizeof( *h->mb.non_zero_count ) );
  238. +        memset( nnz, 16, sizeof( *h->mb.non_zero_count ) );
  239.      }
  240.      else
  241.      {
  242.          /* save non zero count */
  243. -        CP32( &non_zero_count[0*4], &h->mb.cache.non_zero_count[x264_scan8[0]+0*8] );
  244. -        CP32( &non_zero_count[1*4], &h->mb.cache.non_zero_count[x264_scan8[0]+1*8] );
  245. -        CP32( &non_zero_count[2*4], &h->mb.cache.non_zero_count[x264_scan8[0]+2*8] );
  246. -        CP32( &non_zero_count[3*4], &h->mb.cache.non_zero_count[x264_scan8[0]+3*8] );
  247. -        M16( &non_zero_count[16+0*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+0*2]-1] ) >> 8;
  248. -        M16( &non_zero_count[16+1*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+1*2]-1] ) >> 8;
  249. -        M16( &non_zero_count[16+2*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+2*2]-1] ) >> 8;
  250. -        M16( &non_zero_count[16+3*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+3*2]-1] ) >> 8;
  251. +        CP32( &nnz[0*4], &h->mb.cache.non_zero_count[x264_scan8[0]+0*8] );
  252. +        CP32( &nnz[1*4], &h->mb.cache.non_zero_count[x264_scan8[0]+1*8] );
  253. +        CP32( &nnz[2*4], &h->mb.cache.non_zero_count[x264_scan8[0]+2*8] );
  254. +        CP32( &nnz[3*4], &h->mb.cache.non_zero_count[x264_scan8[0]+3*8] );
  255. +        M16( &nnz[16+0*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+0*2]-1] ) >> 8;
  256. +        M16( &nnz[16+1*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+1*2]-1] ) >> 8;
  257. +        M16( &nnz[16+2*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+2*2]-1] ) >> 8;
  258. +        M16( &nnz[16+3*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+3*2]-1] ) >> 8;
  259.  
  260.          if( h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0 )
  261.              h->mb.i_qp = h->mb.i_last_qp;
  262. @@ -1349,47 +1355,56 @@ void x264_macroblock_cache_save( x264_t *h )
  263.  
  264.      if( h->sh.i_type != SLICE_TYPE_I )
  265.      {
  266. +        int16_t (*mv0)[2] = &h->mb.mv[0][i_mb_4x4];
  267. +        int16_t (*mv1)[2] = &h->mb.mv[1][i_mb_4x4];
  268. +        int8_t *ref0 = &h->mb.ref[0][i_mb_8x8];
  269. +        int8_t *ref1 = &h->mb.ref[1][i_mb_8x8];
  270.          if( !IS_INTRA( i_mb_type ) )
  271.          {
  272. -            h->mb.ref[0][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[0][x264_scan8[0]];
  273. -            h->mb.ref[0][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[0][x264_scan8[4]];
  274. -            h->mb.ref[0][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[0][x264_scan8[8]];
  275. -            h->mb.ref[0][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[0][x264_scan8[12]];
  276. -            for( int y = 0; y < 4; y++ )
  277. -            {
  278. -                CP64( h->mb.mv[0][i_mb_4x4+y*s4x4+0], h->mb.cache.mv[0][x264_scan8[0]+8*y+0] );
  279. -                CP64( h->mb.mv[0][i_mb_4x4+y*s4x4+2], h->mb.cache.mv[0][x264_scan8[0]+8*y+2] );
  280. -            }
  281. +            ref0[0+0*s8x8] = h->mb.cache.ref[0][x264_scan8[0]];
  282. +            ref0[1+0*s8x8] = h->mb.cache.ref[0][x264_scan8[4]];
  283. +            ref0[0+1*s8x8] = h->mb.cache.ref[0][x264_scan8[8]];
  284. +            ref0[1+1*s8x8] = h->mb.cache.ref[0][x264_scan8[12]];
  285. +            CP128( &mv0[0*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*0] );
  286. +            CP128( &mv0[1*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*1] );
  287. +            CP128( &mv0[2*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*2] );
  288. +            CP128( &mv0[3*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*3] );
  289.              if( h->sh.i_type == SLICE_TYPE_B )
  290.              {
  291. -                h->mb.ref[1][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[1][x264_scan8[0]];
  292. -                h->mb.ref[1][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[1][x264_scan8[4]];
  293. -                h->mb.ref[1][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[1][x264_scan8[8]];
  294. -                h->mb.ref[1][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[1][x264_scan8[12]];
  295. -                for( int y = 0; y < 4; y++ )
  296. -                {
  297. -                    CP64( h->mb.mv[1][i_mb_4x4+y*s4x4+0], h->mb.cache.mv[1][x264_scan8[0]+8*y+0] );
  298. -                    CP64( h->mb.mv[1][i_mb_4x4+y*s4x4+2], h->mb.cache.mv[1][x264_scan8[0]+8*y+2] );
  299. -                }
  300. +                ref1[0+0*s8x8] = h->mb.cache.ref[1][x264_scan8[0]];
  301. +                ref1[1+0*s8x8] = h->mb.cache.ref[1][x264_scan8[4]];
  302. +                ref1[0+1*s8x8] = h->mb.cache.ref[1][x264_scan8[8]];
  303. +                ref1[1+1*s8x8] = h->mb.cache.ref[1][x264_scan8[12]];
  304. +                CP128( &mv1[0*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*0] );
  305. +                CP128( &mv1[1*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*1] );
  306. +                CP128( &mv1[2*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*2] );
  307. +                CP128( &mv1[3*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*3] );
  308.              }
  309.          }
  310.          else
  311.          {
  312. -            for( int i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_B ? 2  : 1 ); i_list++ )
  313. +            M16( ref0+0*s8x8 ) = (uint8_t)(-1) * 0x0101;
  314. +            M16( ref0+1*s8x8 ) = (uint8_t)(-1) * 0x0101;
  315. +            M128( &mv0[0*s4x4] ) = M128_CONST( 0 );
  316. +            M128( &mv0[1*s4x4] ) = M128_CONST( 0 );
  317. +            M128( &mv0[2*s4x4] ) = M128_CONST( 0 );
  318. +            M128( &mv0[3*s4x4] ) = M128_CONST( 0 );
  319. +            if( h->sh.i_type == SLICE_TYPE_B )
  320.              {
  321. -                M16( &h->mb.ref[i_list][i_mb_8x8+0*s8x8] ) = (uint8_t)(-1) * 0x0101;
  322. -                M16( &h->mb.ref[i_list][i_mb_8x8+1*s8x8] ) = (uint8_t)(-1) * 0x0101;
  323. -                for( int y = 0; y < 4; y++ )
  324. -                {
  325. -                    M64( h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] ) = 0;
  326. -                    M64( h->mb.mv[i_list][i_mb_4x4+y*s4x4+2] ) = 0;
  327. -                }
  328. +                M16( ref1+0*s8x8 ) = (uint8_t)(-1) * 0x0101;
  329. +                M16( ref1+1*s8x8 ) = (uint8_t)(-1) * 0x0101;
  330. +                M128( &mv1[0*s4x4] ) = M128_CONST( 0 );
  331. +                M128( &mv1[1*s4x4] ) = M128_CONST( 0 );
  332. +                M128( &mv1[2*s4x4] ) = M128_CONST( 0 );
  333. +                M128( &mv1[3*s4x4] ) = M128_CONST( 0 );
  334.              }
  335.          }
  336.      }
  337.  
  338.      if( h->param.b_cabac )
  339.      {
  340. +        uint8_t (*mvd0)[2] = h->mb.mvd[0][i_mb_xy];
  341. +        uint8_t (*mvd1)[2] = h->mb.mvd[1][i_mb_xy];
  342.          if( IS_INTRA(i_mb_type) && i_mb_type != I_PCM )
  343.              h->mb.chroma_pred_mode[i_mb_xy] = x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ];
  344.          else
  345. @@ -1397,27 +1412,23 @@ void x264_macroblock_cache_save( x264_t *h )
  346.  
  347.          if( !IS_INTRA( i_mb_type ) && !IS_SKIP( i_mb_type ) && !IS_DIRECT( i_mb_type ) )
  348.          {
  349. -            CP64( h->mb.mvd[0][i_mb_xy][0], h->mb.cache.mvd[0][x264_scan8[10]] );
  350. -            CP16( h->mb.mvd[0][i_mb_xy][4], h->mb.cache.mvd[0][x264_scan8[5 ]] );
  351. -            CP16( h->mb.mvd[0][i_mb_xy][5], h->mb.cache.mvd[0][x264_scan8[7 ]] );
  352. -            CP16( h->mb.mvd[0][i_mb_xy][6], h->mb.cache.mvd[0][x264_scan8[13]] );
  353. +            CP64( mvd0[0], h->mb.cache.mvd[0][x264_scan8[10]] );
  354. +            CP16( mvd0[4], h->mb.cache.mvd[0][x264_scan8[5 ]] );
  355. +            CP16( mvd0[5], h->mb.cache.mvd[0][x264_scan8[7 ]] );
  356. +            CP16( mvd0[6], h->mb.cache.mvd[0][x264_scan8[13]] );
  357.              if( h->sh.i_type == SLICE_TYPE_B )
  358.              {
  359. -                CP64( h->mb.mvd[1][i_mb_xy][0], h->mb.cache.mvd[1][x264_scan8[10]] );
  360. -                CP16( h->mb.mvd[1][i_mb_xy][4], h->mb.cache.mvd[1][x264_scan8[5 ]] );
  361. -                CP16( h->mb.mvd[1][i_mb_xy][5], h->mb.cache.mvd[1][x264_scan8[7 ]] );
  362. -                CP16( h->mb.mvd[1][i_mb_xy][6], h->mb.cache.mvd[1][x264_scan8[13]] );
  363. +                CP64( mvd1[0], h->mb.cache.mvd[1][x264_scan8[10]] );
  364. +                CP16( mvd1[4], h->mb.cache.mvd[1][x264_scan8[5 ]] );
  365. +                CP16( mvd1[5], h->mb.cache.mvd[1][x264_scan8[7 ]] );
  366. +                CP16( mvd1[6], h->mb.cache.mvd[1][x264_scan8[13]] );
  367.              }
  368.          }
  369.          else
  370.          {
  371. -            M64( h->mb.mvd[0][i_mb_xy][0] ) = 0;
  372. -            M64( h->mb.mvd[0][i_mb_xy][4] ) = 0;
  373. +            M128( mvd0[0] ) = M128_CONST( 0 );
  374.              if( h->sh.i_type == SLICE_TYPE_B )
  375. -            {
  376. -                M64( h->mb.mvd[1][i_mb_xy][0] ) = 0;
  377. -                M64( h->mb.mvd[1][i_mb_xy][4] ) = 0;
  378. -            }
  379. +                M128( mvd1[0] ) = M128_CONST( 0 );
  380.          }
  381.  
  382.          if( h->sh.i_type == SLICE_TYPE_B )
  383. diff --git a/common/x86/util.h b/common/x86/util.h
  384. index ccc0733..e094309 100644
  385. --- a/common/x86/util.h
  386. +++ b/common/x86/util.h
  387. @@ -25,6 +25,9 @@
  388.  #define X264_X86_UTIL_H
  389.  
  390.  #ifdef __GNUC__
  391. +
  392. +#include <xmmintrin.h>
  393. +
  394.  #define x264_median_mv x264_median_mv_mmxext
  395.  static ALWAYS_INLINE void x264_median_mv_mmxext( int16_t *dst, int16_t *a, int16_t *b, int16_t *c )
  396.  {
  397. @@ -100,6 +103,11 @@ static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmxext(uint8_t *mvdleft, uint8_
  398.      );
  399.      return amvd;
  400.  }
  401. +#undef M128_CONST
  402. +#define M128_CONST(x) ((__m128){x,x,x,x})
  403. +#define x264_union128_t x264_union128_sse_t
  404. +typedef union { __m128 i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS x264_union128_sse_t;
  405. +
  406.  #endif
  407.  
  408.  #endif
  409. --
  410. 1.7.0.4
  411.  
  412.  
  413. From 064db2907f52c95a7254f313edba9788dc6d9c03 Mon Sep 17 00:00:00 2001
  414. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  415. Date: Wed, 14 Apr 2010 14:43:25 -0700
  416. Subject: [PATCH 2/6] Prefetch MB data in cache_load
  417.  Dramatically reduces L1 cache misses.
  418.  ~10% faster cache_load.
  419.  
  420. ---
  421. common/macroblock.c |   38 +++++++++++++++++++++++++++++++-------
  422.  common/osdep.h      |   13 +++++++++++++
  423.  2 files changed, 44 insertions(+), 7 deletions(-)
  424.  
  425. diff --git a/common/macroblock.c b/common/macroblock.c
  426. index fb4c1a5..5c9734f 100644
  427. --- a/common/macroblock.c
  428. +++ b/common/macroblock.c
  429. @@ -941,6 +941,7 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x
  430.  static void inline x264_macroblock_cache_load_neighbours( x264_t *h, int mb_x, int mb_y )
  431.  {
  432.      int top = (mb_y - (1 << h->mb.b_interlaced)) * h->mb.i_mb_stride + mb_x;
  433. +
  434.      h->mb.i_mb_x = mb_x;
  435.      h->mb.i_mb_y = mb_y;
  436.      h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
  437. @@ -986,6 +987,16 @@ static void inline x264_macroblock_cache_load_neighbours( x264_t *h, int mb_x, i
  438.  
  439.                  if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_top ) )
  440.                      h->mb.i_neighbour_intra |= MB_TOP;
  441. +
  442. +                /* We only need to prefetch the top blocks because the left was just written
  443. +                 * to as part of the previous cache_save.  Since most target CPUs use write-allocate
  444. +                 * caches, left blocks are near-guaranteed to be in L1 cache.  Top--not so much. */
  445. +                x264_prefetch( &h->mb.cbp[top] );
  446. +                x264_prefetch( h->mb.intra4x4_pred_mode[top] );
  447. +                x264_prefetch( &h->mb.non_zero_count[top][12] );
  448. +                /* These aren't always allocated, but prefetching an invalid address can't hurt. */
  449. +                x264_prefetch( &h->mb.mb_transform_size[top] );
  450. +                x264_prefetch( &h->mb.skipbp[top] );
  451.              }
  452.          }
  453.  
  454. @@ -1025,16 +1036,20 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
  455.  
  456.      int left = h->mb.i_mb_left_xy;
  457.      int top  = h->mb.i_mb_top_xy;
  458. +    int top_y = mb_y - (1 << h->mb.b_interlaced);
  459. +    int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
  460. +    int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;
  461.  
  462.      /* GCC pessimizes direct loads from heap-allocated arrays due to aliasing.*/
  463.      /* By only dereferencing them once, we avoid this issue. */
  464.      int8_t (*i4x4)[8] = h->mb.intra4x4_pred_mode;
  465.      uint8_t (*nnz)[24] = h->mb.non_zero_count;
  466. +    int16_t *cbp = h->mb.cbp;
  467.  
  468.      /* load cache */
  469.      if( h->mb.i_neighbour & MB_TOP )
  470.      {
  471. -        h->mb.cache.i_cbp_top = h->mb.cbp[top];
  472. +        h->mb.cache.i_cbp_top = cbp[top];
  473.          /* load intra4x4 */
  474.          CP32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8], &i4x4[top][0] );
  475.  
  476. @@ -1059,7 +1074,7 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
  477.  
  478.      if( h->mb.i_neighbour & MB_LEFT )
  479.      {
  480. -        h->mb.cache.i_cbp_left = h->mb.cbp[left];
  481. +        h->mb.cache.i_cbp_left = cbp[left];
  482.  
  483.          /* load intra4x4 */
  484.          h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] = i4x4[left][4];
  485. @@ -1078,6 +1093,18 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
  486.  
  487.          h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] = nnz[left][16+4+1];
  488.          h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = nnz[left][16+4+3];
  489. +
  490. +        /* Finish the prefetching */
  491. +        if( h->sh.i_type != SLICE_TYPE_I )
  492. +            for( int l = 0; l < (h->sh.i_type == SLICE_TYPE_B) + 1; l++ )
  493. +            {
  494. +                x264_prefetch( &h->mb.mv[l][top_4x4-1] );
  495. +                /* Top right being not in the same cacheline as top left will happen
  496. +                 * once every 4 MBs, so one extra prefetch is worthwhile */
  497. +                x264_prefetch( &h->mb.mv[l][top_4x4+4] );
  498. +                x264_prefetch( &h->mb.ref[l][top_8x8-1] );
  499. +                x264_prefetch( &h->mb.mvd[l][top] );
  500. +            }
  501.      }
  502.      else
  503.      {
  504. @@ -1142,11 +1169,8 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
  505.      /* load ref/mv/mvd */
  506.      if( h->sh.i_type != SLICE_TYPE_I )
  507.      {
  508. -        const int s8x8 = h->mb.i_b8_stride;
  509. -        const int s4x4 = h->mb.i_b4_stride;
  510. -        const int top_y = mb_y - (1 << h->mb.b_interlaced);
  511. -        const int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
  512. -        const int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;
  513. +        int s8x8 = h->mb.i_b8_stride;
  514. +        int s4x4 = h->mb.i_b4_stride;
  515.  
  516.          for( int l = 0; l < (h->sh.i_type == SLICE_TYPE_B) + 1; l++ )
  517.          {
  518. diff --git a/common/osdep.h b/common/osdep.h
  519. index f97547f..35772f7 100644
  520. --- a/common/osdep.h
  521. +++ b/common/osdep.h
  522. @@ -251,6 +251,19 @@ static int ALWAYS_INLINE x264_ctz( uint32_t x )
  523.  }
  524.  #endif
  525.  
  526. +#if defined(__GNUC__) && defined(HAVE_MMX)
  527. +/* Don't use __builtin_prefetch; even as recent as 4.3.4, GCC seems incapable of
  528. + * using complex address modes properly unless we use inline asm. */
  529. +static ALWAYS_INLINE void x264_prefetch( void *p )
  530. +{
  531. +    asm volatile( "prefetcht0 %0"::"m"(*(uint8_t*)p) );
  532. +}
  533. +#elif defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 1)
  534. +#define x264_prefetch(x) __builtin_prefetch(x)
  535. +#else
  536. +#define x264_prefetch(x)
  537. +#endif
  538. +
  539.  #ifdef USE_REAL_PTHREAD
  540.  #ifdef SYS_MINGW
  541.  #define x264_lower_thread_priority(p)\
  542. --
  543. 1.7.0.4
  544.  
  545.  
  546. From 8891a9dc2c2602e09c1fc1636b3e3da584cadee2 Mon Sep 17 00:00:00 2001
  547. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  548. Date: Thu, 15 Apr 2010 16:32:31 -0700
  549. Subject: [PATCH 3/6] Move deblocking/hpel into sliced threads
  550.  Instead of doing both as a separate pass, do them during the main encode.
  551.  This requires disabling deblocking between slices (disable_deblock_idc == 2).
  552.  Overall performance gain is about 11% on --preset superfast with sliced threads.
  553.  Doesn't reduce the amount of actual computation done: only better parallelizes it.
  554.  
  555. ---
  556. common/common.h     |    5 ++-
  557.  common/frame.c      |   12 ++++-
  558.  common/macroblock.c |   68 ++++++++++++++++++-------
  559.  common/macroblock.h |    9 +++-
  560.  encoder/encoder.c   |  136 ++++++++++++++++++++++++++-------------------------
  561.  encoder/lookahead.c |    9 ++-
  562.  6 files changed, 146 insertions(+), 93 deletions(-)
  563.  
  564. diff --git a/common/common.h b/common/common.h
  565. index 38e9b74..37f309d 100644
  566. --- a/common/common.h
  567. +++ b/common/common.h
  568. @@ -566,7 +566,8 @@ struct x264_t
  569.          int16_t (*mvr[2][32])[2];           /* 16x16 mv for each possible ref */
  570.          int8_t  *skipbp;                    /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */
  571.          int8_t  *mb_transform_size;         /* transform_size_8x8_flag of each mb */
  572. -        uint8_t *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
  573. +        uint16_t *slice_table;              /* sh->first_mb of the slice that the indexed mb is part of
  574. +                                             * NOTE: this will fail on resolutions above 2^16 pixels... */
  575.  
  576.           /* buffer for weighted versions of the reference frames */
  577.          uint8_t *p_weight_buf[16];
  578. @@ -763,7 +764,9 @@ struct x264_t
  579.      ALIGNED_16( uint16_t nr_offset[2][64] );
  580.      uint32_t        nr_count[2];
  581.  
  582. +    /* Buffers that are allocated per-thread even in sliced threads. */
  583.      void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
  584. +    uint8_t *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
  585.  
  586.      /* CPU functions dependents */
  587.      x264_predict_t      predict_16x16[4+3];
  588. diff --git a/common/frame.c b/common/frame.c
  589. index abcfd14..872e067 100644
  590. --- a/common/frame.c
  591. +++ b/common/frame.c
  592. @@ -658,6 +658,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
  593.      int stride2y  = stridey << b_interlaced;
  594.      int strideuv  = h->fdec->i_stride[1];
  595.      int stride2uv = strideuv << b_interlaced;
  596. +    int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2;
  597.      uint8_t (*nnz_backup)[16] = h->scratch_buffer;
  598.  
  599.      if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
  600. @@ -778,9 +779,18 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
  601.           * i_dir == 1 -> horizontal edge */
  602.          #define DEBLOCK_DIR(i_dir)\
  603.          {\
  604. -            int i_edge = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
  605. +            int i_edge = 0;\
  606.              int i_qpn, mbn_xy, mbn_8x8, mbn_4x4;\
  607.              ALIGNED_4( uint8_t bS[4] );  /* filtering strength */\
  608. +            /* We don't have to consider the MBAFF case of a slice breaking in the middle\
  609. +             * of a row because x264 doesn't support that case.  If we add support for that,\
  610. +             * this will have to become significantly more complex. */\
  611. +            if( i_dir == 0 && (mb_x == 0 || (!deblock_on_slice_edges &&\
  612. +                h->mb.slice_table[mb_xy] != h->mb.slice_table[mb_xy-1])) )\
  613. +                i_edge++;\
  614. +            if( i_dir == 1 && (mb_y <= b_interlaced || (!deblock_on_slice_edges &&\
  615. +                h->mb.slice_table[mb_xy] != h->mb.slice_table[mb_xy-(h->mb.i_mb_stride<<b_interlaced)])) )\
  616. +                i_edge++;\
  617.              if( i_edge )\
  618.                  i_edge+= b_8x8_transform;\
  619.              else\
  620. diff --git a/common/macroblock.c b/common/macroblock.c
  621. index 5c9734f..4ef959f 100644
  622. --- a/common/macroblock.c
  623. +++ b/common/macroblock.c
  624. @@ -675,7 +675,7 @@ void x264_mb_mc( x264_t *h )
  625.      }
  626.  }
  627.  
  628. -int x264_macroblock_cache_init( x264_t *h )
  629. +int x264_macroblock_cache_allocate( x264_t *h )
  630.  {
  631.      int i_mb_count = h->mb.i_mb_count;
  632.  
  633. @@ -689,6 +689,8 @@ int x264_macroblock_cache_init( x264_t *h )
  634.      CHECKED_MALLOC( h->mb.cbp, i_mb_count * sizeof(int16_t) );
  635.      CHECKED_MALLOC( h->mb.skipbp, i_mb_count * sizeof(int8_t) );
  636.      CHECKED_MALLOC( h->mb.mb_transform_size, i_mb_count * sizeof(int8_t) );
  637. +    CHECKED_MALLOC( h->mb.slice_table, i_mb_count * sizeof(uint16_t) );
  638. +    memset( h->mb.slice_table, -1, i_mb_count * sizeof(uint16_t) );
  639.  
  640.      /* 0 -> 3 top(4), 4 -> 6 : left(3) */
  641.      CHECKED_MALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 8 * sizeof(int8_t) );
  642. @@ -755,22 +757,11 @@ int x264_macroblock_cache_init( x264_t *h )
  643.  #undef ALIGN
  644.      }
  645.  
  646. -    for( int i = 0; i <= h->param.b_interlaced; i++ )
  647. -        for( int j = 0; j < 3; j++ )
  648. -        {
  649. -            /* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */
  650. -            CHECKED_MALLOCZERO( h->mb.intra_border_backup[i][j], (h->sps->i_mb_width*16+32)>>!!j );
  651. -            h->mb.intra_border_backup[i][j] += 8;
  652. -        }
  653. -
  654.      return 0;
  655.  fail: return -1;
  656.  }
  657. -void x264_macroblock_cache_end( x264_t *h )
  658. +void x264_macroblock_cache_free( x264_t *h )
  659.  {
  660. -    for( int i = 0; i <= h->param.b_interlaced; i++ )
  661. -        for( int j = 0; j < 3; j++ )
  662. -            x264_free( h->mb.intra_border_backup[i][j] - 8 );
  663.      for( int i = 0; i < 2; i++ )
  664.          for( int j = 0; j < 32; j++ )
  665.              x264_free( h->mb.mvr[i][j] );
  666. @@ -783,6 +774,7 @@ void x264_macroblock_cache_end( x264_t *h )
  667.          x264_free( h->mb.mvd[0] );
  668.          x264_free( h->mb.mvd[1] );
  669.      }
  670. +    x264_free( h->mb.slice_table );
  671.      x264_free( h->mb.intra4x4_pred_mode );
  672.      x264_free( h->mb.non_zero_count );
  673.      x264_free( h->mb.mb_transform_size );
  674. @@ -790,6 +782,47 @@ void x264_macroblock_cache_end( x264_t *h )
  675.      x264_free( h->mb.cbp );
  676.      x264_free( h->mb.qp );
  677.  }
  678. +
  679. +int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
  680. +{
  681. +    if( !b_lookahead )
  682. +        for( int i = 0; i <= h->param.b_interlaced; i++ )
  683. +            for( int j = 0; j < 3; j++ )
  684. +            {
  685. +                /* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */
  686. +                CHECKED_MALLOCZERO( h->intra_border_backup[i][j], (h->sps->i_mb_width*16+32)>>!!j );
  687. +                h->intra_border_backup[i][j] += 8;
  688. +            }
  689. +
  690. +    /* Allocate scratch buffer */
  691. +    int scratch_size = 0;
  692. +    if( !b_lookahead )
  693. +    {
  694. +        int buf_hpel = (h->thread[0]->fdec->i_width[0]+48) * sizeof(int16_t);
  695. +        int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
  696. +        int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
  697. +        int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
  698. +            ((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
  699. +        int buf_nnz = !h->param.b_cabac * h->pps->b_transform_8x8_mode * (h->sps->i_mb_width * 4 * 16 * sizeof(uint8_t));
  700. +        scratch_size = X264_MAX4( buf_hpel, buf_ssim, buf_tesa, buf_nnz );
  701. +    }
  702. +    int buf_mbtree = h->param.rc.b_mb_tree * ((h->sps->i_mb_width+3)&~3) * sizeof(int);
  703. +    scratch_size = X264_MAX( scratch_size, buf_mbtree );
  704. +    CHECKED_MALLOC( h->scratch_buffer, scratch_size );
  705. +
  706. +    return 0;
  707. +fail: return -1;
  708. +}
  709. +
  710. +void x264_macroblock_thread_free( x264_t *h, int b_lookahead )
  711. +{
  712. +    if( !b_lookahead )
  713. +        for( int i = 0; i <= h->param.b_interlaced; i++ )
  714. +            for( int j = 0; j < 3; j++ )
  715. +                x264_free( h->intra_border_backup[i][j] - 8 );
  716. +    x264_free( h->scratch_buffer );
  717. +}
  718. +
  719.  void x264_macroblock_slice_init( x264_t *h )
  720.  {
  721.      h->mb.mv[0] = h->fdec->mv[0];
  722. @@ -898,8 +931,7 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x
  723.                             ? w * (mb_x + (mb_y&~1) * i_stride) + (mb_y&1) * i_stride
  724.                             : w * (mb_x + mb_y * i_stride);
  725.      const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset];
  726. -    const uint8_t *intra_fdec = h->param.b_sliced_threads ? plane_fdec-i_stride2 :
  727. -                                &h->mb.intra_border_backup[mb_y & h->sh.b_mbaff][i][mb_x*16>>!!i];
  728. +    const uint8_t *intra_fdec = &h->intra_border_backup[mb_y & h->sh.b_mbaff][i][mb_x*16>>!!i];
  729.      int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
  730.      x264_frame_t **fref[2] = { h->fref0, h->fref1 };
  731.      if( h->mb.b_interlaced )
  732. @@ -908,10 +940,7 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x
  733.      h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset];
  734.      h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE,
  735.          h->mb.pic.p_fenc_plane[i], i_stride2, w );
  736. -    if( mb_y > 0 )
  737. -        memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
  738. -    else
  739. -        memset( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], 0, w*3/2+1 );
  740. +    memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
  741.      if( h->mb.b_interlaced )
  742.          for( int j = 0; j < w; j++ )
  743.              h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
  744. @@ -1327,6 +1356,7 @@ void x264_macroblock_cache_save( x264_t *h )
  745.      x264_prefetch_fenc( h, h->fdec, h->mb.i_mb_x, h->mb.i_mb_y );
  746.  
  747.      h->mb.type[i_mb_xy] = i_mb_type;
  748. +    h->mb.slice_table[i_mb_xy] = h->sh.i_first_mb;
  749.      h->mb.partition[i_mb_xy] = IS_INTRA( i_mb_type ) ? D_16x16 : h->mb.i_partition;
  750.      h->mb.i_mb_prev_xy = i_mb_xy;
  751.  
  752. diff --git a/common/macroblock.h b/common/macroblock.h
  753. index 5ef1498..ee8c113 100644
  754. --- a/common/macroblock.h
  755. +++ b/common/macroblock.h
  756. @@ -260,13 +260,18 @@ enum cabac_ctx_block_cat_e
  757.      DCT_LUMA_8x8  = 5,
  758.  };
  759.  
  760. +/* Per-frame allocation: is allocated per-thread only in frame-threads mode. */
  761. +int  x264_macroblock_cache_allocate( x264_t *h );
  762. +void x264_macroblock_cache_free( x264_t *h );
  763. +
  764. +/* Per-thread allocation: is allocated per-thread even in sliced-threads mode. */
  765. +int  x264_macroblock_thread_allocate( x264_t *h, int b_lookahead );
  766. +void x264_macroblock_thread_free( x264_t *h, int b_lookahead );
  767.  
  768. -int  x264_macroblock_cache_init( x264_t *h );
  769.  void x264_macroblock_slice_init( x264_t *h );
  770.  void x264_macroblock_thread_init( x264_t *h );
  771.  void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y );
  772.  void x264_macroblock_cache_save( x264_t *h );
  773. -void x264_macroblock_cache_end( x264_t *h );
  774.  
  775.  void x264_macroblock_bipred_init( x264_t *h );
  776.  
  777. diff --git a/encoder/encoder.c b/encoder/encoder.c
  778. index 300041e..a07f0ea 100644
  779. --- a/encoder/encoder.c
  780. +++ b/encoder/encoder.c
  781. @@ -158,7 +158,7 @@ static void x264_slice_header_init( x264_t *h, x264_slice_header_t *sh,
  782.      int deblock_thresh = i_qp + 2 * X264_MIN(param->i_deblocking_filter_alphac0, param->i_deblocking_filter_beta);
  783.      /* If effective qp <= 15, deblocking would have no effect anyway */
  784.      if( param->b_deblocking_filter && (h->mb.b_variable_qp || 15 < deblock_thresh ) )
  785. -        sh->i_disable_deblocking_filter_idc = 0;
  786. +        sh->i_disable_deblocking_filter_idc = param->b_sliced_threads ? 2 : 0;
  787.      else
  788.          sh->i_disable_deblocking_filter_idc = 1;
  789.      sh->i_alpha_c0_offset = param->i_deblocking_filter_alphac0 << 1;
  790. @@ -519,6 +519,16 @@ static int x264_validate_parameters( x264_t *h )
  791.          h->param.rc.i_vbv_max_bitrate = 0;
  792.      }
  793.  
  794. +    if( h->param.b_interlaced && h->param.i_slice_max_size )
  795. +    {
  796. +        x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-size is not implemented\n" );
  797. +        h->param.i_slice_max_size = 0;
  798. +    }
  799. +    if( h->param.b_interlaced && h->param.i_slice_max_mbs )
  800. +    {
  801. +        x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-mbs is not implemented\n" );
  802. +        h->param.i_slice_max_mbs = 0;
  803. +    }
  804.      int max_slices = (h->param.i_height+((16<<h->param.b_interlaced)-1))/(16<<h->param.b_interlaced);
  805.      if( h->param.b_sliced_threads )
  806.          h->param.i_slice_count = x264_clip3( h->param.i_threads, 0, max_slices );
  807. @@ -527,16 +537,6 @@ static int x264_validate_parameters( x264_t *h )
  808.          h->param.i_slice_count = x264_clip3( h->param.i_slice_count, 0, max_slices );
  809.          h->param.i_slice_max_size = X264_MAX( h->param.i_slice_max_size, 0 );
  810.          h->param.i_slice_max_mbs = X264_MAX( h->param.i_slice_max_mbs, 0 );
  811. -        if( h->param.b_interlaced && h->param.i_slice_max_size )
  812. -        {
  813. -            x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-size is not implemented\n" );
  814. -            h->param.i_slice_max_size = 0;
  815. -        }
  816. -        if( h->param.b_interlaced && h->param.i_slice_max_mbs )
  817. -        {
  818. -            x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-mbs is not implemented\n" );
  819. -            h->param.i_slice_max_mbs = 0;
  820. -        }
  821.          if( h->param.i_slice_max_mbs || h->param.i_slice_max_size )
  822.              h->param.i_slice_count = 0;
  823.      }
  824. @@ -1059,23 +1059,13 @@ x264_t *x264_encoder_open( x264_param_t *param )
  825.          CHECKED_MALLOC( h->thread[i]->out.nal, init_nal_count*sizeof(x264_nal_t) );
  826.          h->thread[i]->out.i_nals_allocated = init_nal_count;
  827.  
  828. -        if( allocate_threadlocal_data && x264_macroblock_cache_init( h->thread[i] ) < 0 )
  829. +        if( allocate_threadlocal_data && x264_macroblock_cache_allocate( h->thread[i] ) < 0 )
  830.              goto fail;
  831.      }
  832.  
  833. -    /* Allocate scratch buffer */
  834. -    for( int i = 0; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
  835. -    {
  836. -        int buf_hpel = (h->fdec->i_width[0]+48) * sizeof(int16_t);
  837. -        int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
  838. -        int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
  839. -        int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
  840. -            ((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
  841. -        int buf_mbtree = h->param.rc.b_mb_tree * ((h->sps->i_mb_width+3)&~3) * sizeof(int);
  842. -        int buf_nnz = !h->param.b_cabac * h->pps->b_transform_8x8_mode * (h->sps->i_mb_width * 4 * 16 * sizeof(uint8_t));
  843. -        int scratch_size = X264_MAX4( buf_hpel, buf_ssim, buf_tesa, X264_MAX( buf_mbtree, buf_nnz ) );
  844. -        CHECKED_MALLOC( h->thread[i]->scratch_buffer, scratch_size );
  845. -    }
  846. +    for( int i = 0; i < h->param.i_threads; i++ )
  847. +        if( x264_macroblock_thread_allocate( h->thread[i], 0 ) < 0 )
  848. +            goto fail;
  849.  
  850.      if( x264_ratecontrol_new( h ) < 0 )
  851.          goto fail;
  852. @@ -1552,25 +1542,32 @@ static inline void x264_reference_build_list( x264_t *h, int i_poc )
  853.      h->mb.pic.i_fref[1] = h->i_ref1;
  854.  }
  855.  
  856. -static void x264_fdec_filter_row( x264_t *h, int mb_y )
  857. +static void x264_fdec_filter_row( x264_t *h, int mb_y, int b_inloop )
  858.  {
  859.      /* mb_y is the mb to be encoded next, not the mb to be filtered here */
  860.      int b_hpel = h->fdec->b_kept_as_ref;
  861. -    int b_deblock = !h->sh.i_disable_deblocking_filter_idc;
  862. -    int b_end = mb_y == h->sps->i_mb_height;
  863. +    int b_deblock = h->sh.i_disable_deblocking_filter_idc != 1;
  864. +    int b_end = mb_y == h->i_threadslice_end;
  865. +    int b_measure_quality = 1;
  866.      int min_y = mb_y - (1 << h->sh.b_mbaff);
  867. -    int max_y = b_end ? h->sps->i_mb_height : mb_y;
  868. +    int b_start = min_y == h->i_threadslice_start;
  869. +    int max_y = b_end ? h->i_threadslice_end : mb_y;
  870.      b_deblock &= b_hpel || h->param.psz_dump_yuv;
  871. +    if( h->param.b_sliced_threads && b_start && min_y && !b_inloop )
  872. +    {
  873. +        b_deblock = 0;         /* We already deblocked on the inloop pass. */
  874. +        b_measure_quality = 0; /* We already measured quality on the inloop pass. */
  875. +    }
  876.      if( mb_y & h->sh.b_mbaff )
  877.          return;
  878. -    if( min_y < 0 )
  879. +    if( min_y < h->i_threadslice_start )
  880.          return;
  881.  
  882. -    if( !b_end && !h->param.b_sliced_threads )
  883. +    if( !b_end && b_inloop )
  884.          for( int j = 0; j <= h->sh.b_mbaff; j++ )
  885.              for( int i = 0; i < 3; i++ )
  886.              {
  887. -                memcpy( h->mb.intra_border_backup[j][i],
  888. +                memcpy( h->intra_border_backup[j][i],
  889.                          h->fdec->plane[i] + ((mb_y*16 >> !!i) + j - 1 - h->sh.b_mbaff) * h->fdec->i_stride[i],
  890.                          h->sps->i_mb_width*16 >> !!i );
  891.              }
  892. @@ -1581,39 +1578,43 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )
  893.  
  894.      if( b_hpel )
  895.      {
  896. -        x264_frame_expand_border( h, h->fdec, min_y, b_end );
  897. +        int end = mb_y == h->sps->i_mb_height;
  898. +        x264_frame_expand_border( h, h->fdec, min_y, end );
  899.          if( h->param.analyse.i_subpel_refine )
  900.          {
  901. -            x264_frame_filter( h, h->fdec, min_y, b_end );
  902. -            x264_frame_expand_border_filtered( h, h->fdec, min_y, b_end );
  903. +            x264_frame_filter( h, h->fdec, min_y, end );
  904. +            x264_frame_expand_border_filtered( h, h->fdec, min_y, end );
  905.          }
  906.      }
  907.  
  908.      if( h->i_thread_frames > 1 && h->fdec->b_kept_as_ref )
  909.          x264_frame_cond_broadcast( h->fdec, mb_y*16 + (b_end ? 10000 : -(X264_THREAD_HEIGHT << h->sh.b_mbaff)) );
  910.  
  911. -    min_y = X264_MAX( min_y*16-8, 0 );
  912. -    max_y = b_end ? h->param.i_height : mb_y*16-8;
  913. -
  914. -    if( h->param.analyse.b_psnr )
  915. -        for( int i = 0; i < 3; i++ )
  916. -            h->stat.frame.i_ssd[i] +=
  917. -                x264_pixel_ssd_wxh( &h->pixf,
  918. -                    h->fdec->plane[i] + (min_y>>!!i) * h->fdec->i_stride[i], h->fdec->i_stride[i],
  919. -                    h->fenc->plane[i] + (min_y>>!!i) * h->fenc->i_stride[i], h->fenc->i_stride[i],
  920. -                    h->param.i_width >> !!i, (max_y-min_y) >> !!i );
  921. +    min_y = min_y*16 - 8 * !b_start;
  922. +    max_y = b_end ? X264_MIN( h->i_threadslice_end*16 , h->param.i_height ) : mb_y*16 - 8;
  923.  
  924. -    if( h->param.analyse.b_ssim )
  925. +    if( b_measure_quality )
  926.      {
  927. -        x264_emms();
  928. -        /* offset by 2 pixels to avoid alignment of ssim blocks with dct blocks,
  929. -         * and overlap by 4 */
  930. -        min_y += min_y == 0 ? 2 : -6;
  931. -        h->stat.frame.f_ssim +=
  932. -            x264_pixel_ssim_wxh( &h->pixf,
  933. -                h->fdec->plane[0] + 2+min_y*h->fdec->i_stride[0], h->fdec->i_stride[0],
  934. -                h->fenc->plane[0] + 2+min_y*h->fenc->i_stride[0], h->fenc->i_stride[0],
  935. -                h->param.i_width-2, max_y-min_y, h->scratch_buffer );
  936. +        if( h->param.analyse.b_psnr )
  937. +            for( int i = 0; i < 3; i++ )
  938. +                h->stat.frame.i_ssd[i] +=
  939. +                    x264_pixel_ssd_wxh( &h->pixf,
  940. +                        h->fdec->plane[i] + (min_y>>!!i) * h->fdec->i_stride[i], h->fdec->i_stride[i],
  941. +                        h->fenc->plane[i] + (min_y>>!!i) * h->fenc->i_stride[i], h->fenc->i_stride[i],
  942. +                        h->param.i_width >> !!i, (max_y-min_y) >> !!i );
  943. +
  944. +        if( h->param.analyse.b_ssim )
  945. +        {
  946. +            x264_emms();
  947. +            /* offset by 2 pixels to avoid alignment of ssim blocks with dct blocks,
  948. +             * and overlap by 4 */
  949. +            min_y += b_start ? 2 : -6;
  950. +            h->stat.frame.f_ssim +=
  951. +                x264_pixel_ssim_wxh( &h->pixf,
  952. +                    h->fdec->plane[0] + 2+min_y*h->fdec->i_stride[0], h->fdec->i_stride[0],
  953. +                    h->fenc->plane[0] + 2+min_y*h->fenc->i_stride[0], h->fenc->i_stride[0],
  954. +                    h->param.i_width-2, max_y-min_y, h->scratch_buffer );
  955. +        }
  956.      }
  957.  }
  958.  
  959. @@ -1808,8 +1809,8 @@ static int x264_slice_write( x264_t *h )
  960.              }
  961.          }
  962.  
  963. -        if( i_mb_x == 0 && !h->mb.b_reencode_mb && !h->param.b_sliced_threads )
  964. -            x264_fdec_filter_row( h, i_mb_y );
  965. +        if( i_mb_x == 0 && !h->mb.b_reencode_mb )
  966. +            x264_fdec_filter_row( h, i_mb_y, 1 );
  967.  
  968.          /* load cache */
  969.          x264_macroblock_cache_load( h, i_mb_x, i_mb_y );
  970. @@ -1971,14 +1972,13 @@ static int x264_slice_write( x264_t *h )
  971.      if( x264_nal_end( h ) )
  972.          return -1;
  973.  
  974. -    if( h->sh.i_last_mb == h->mb.i_mb_count-1 )
  975. +    if( h->sh.i_last_mb == (h->i_threadslice_end * h->sps->i_mb_width - 1) )
  976.      {
  977.          h->stat.frame.i_misc_bits = bs_pos( &h->out.bs )
  978.                                    + (h->out.i_nal*NALU_OVERHEAD * 8)
  979.                                    - h->stat.frame.i_tex_bits
  980.                                    - h->stat.frame.i_mv_bits;
  981. -        if( !h->param.b_sliced_threads )
  982. -            x264_fdec_filter_row( h, h->sps->i_mb_height );
  983. +        x264_fdec_filter_row( h, h->i_threadslice_end, 1 );
  984.      }
  985.  
  986.      return 0;
  987. @@ -2099,9 +2099,9 @@ static int x264_threaded_slices_write( x264_t *h )
  988.              return (intptr_t)ret;
  989.      }
  990.  
  991. -    /* deblocking and hpel filtering */
  992. -    for( int i = 0; i <= h->sps->i_mb_height; i++ )
  993. -        x264_stack_align( x264_fdec_filter_row, h, i );
  994. +    /* Go back and fix up the hpel on the borders between slices. */
  995. +    for( int i = 1; i < h->param.i_threads; i++ )
  996. +        x264_fdec_filter_row( h->thread[i], h->thread[i]->i_threadslice_start + 1, 0 );
  997.  
  998.      x264_threads_merge_ratecontrol( h );
  999.  
  1000. @@ -2114,10 +2114,12 @@ static int x264_threaded_slices_write( x264_t *h )
  1001.              h->out.i_nal++;
  1002.              x264_nal_check_buffer( h );
  1003.          }
  1004. -        /* All entries in stat.frame are ints except for ssd/ssim,
  1005. -         * which are only calculated in the main thread. */
  1006. +        /* All entries in stat.frame are ints except for ssd/ssim. */
  1007.          for( int j = 0; j < (offsetof(x264_t,stat.frame.i_ssd) - offsetof(x264_t,stat.frame.i_mv_bits)) / sizeof(int); j++ )
  1008.              ((int*)&h->stat.frame)[j] += ((int*)&t->stat.frame)[j];
  1009. +        for( int j = 0; j < 3; j++ )
  1010. +            h->stat.frame.i_ssd[j] += t->stat.frame.i_ssd[j];
  1011. +        h->stat.frame.f_ssim += t->stat.frame.f_ssim;
  1012.      }
  1013.  
  1014.      return 0;
  1015. @@ -3072,9 +3074,9 @@ void    x264_encoder_close  ( x264_t *h )
  1016.              (*frame)->i_reference_count--;
  1017.              if( (*frame)->i_reference_count == 0 )
  1018.                  x264_frame_delete( *frame );
  1019. -            x264_macroblock_cache_end( h->thread[i] );
  1020. +            x264_macroblock_cache_free( h->thread[i] );
  1021.          }
  1022. -        x264_free( h->thread[i]->scratch_buffer );
  1023. +        x264_macroblock_thread_free( h->thread[i], 0 );
  1024.          x264_free( h->thread[i]->out.p_bitstream );
  1025.          x264_free( h->thread[i]->out.nal);
  1026.          x264_free( h->thread[i] );
  1027. diff --git a/encoder/lookahead.c b/encoder/lookahead.c
  1028. index 7a0c6d3..5e29fb5 100644
  1029. --- a/encoder/lookahead.c
  1030. +++ b/encoder/lookahead.c
  1031. @@ -148,7 +148,10 @@ int x264_lookahead_init( x264_t *h, int i_slicetype_length )
  1032.  
  1033.      x264_t *look_h = h->thread[h->param.i_threads];
  1034.      *look_h = *h;
  1035. -    if( x264_macroblock_cache_init( look_h ) )
  1036. +    if( x264_macroblock_cache_allocate( look_h ) )
  1037. +        goto fail;
  1038. +
  1039. +    if( x264_macroblock_thread_allocate( look_h, 1 ) < 0 )
  1040.          goto fail;
  1041.  
  1042.      if( x264_pthread_create( &look_h->thread_handle, NULL, (void *)x264_lookahead_thread, look_h ) )
  1043. @@ -170,8 +173,8 @@ void x264_lookahead_delete( x264_t *h )
  1044.          x264_pthread_cond_broadcast( &h->lookahead->ifbuf.cv_fill );
  1045.          x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
  1046.          x264_pthread_join( h->thread[h->param.i_threads]->thread_handle, NULL );
  1047. -        x264_macroblock_cache_end( h->thread[h->param.i_threads] );
  1048. -        x264_free( h->thread[h->param.i_threads]->scratch_buffer );
  1049. +        x264_macroblock_cache_free( h->thread[h->param.i_threads] );
  1050. +        x264_macroblock_thread_free( h->thread[h->param.i_threads], 1 );
  1051.          x264_free( h->thread[h->param.i_threads] );
  1052.      }
  1053.      x264_synch_frame_list_delete( &h->lookahead->ifbuf );
  1054. --
  1055. 1.7.0.4
  1056.  
  1057.  
  1058. From 5d4a3f077e7add2f93ee7cd7772f64589eb36f5d Mon Sep 17 00:00:00 2001
  1059. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  1060. Date: Fri, 16 Apr 2010 03:06:46 -0700
  1061. Subject: [PATCH 4/6] Fix three minor bugs found by Clang
  1062.  
  1063. ---
  1064. encoder/analyse.c |    2 +-
  1065.  encoder/encoder.c |    2 +-
  1066.  output/matroska.c |    2 ++
  1067.  3 files changed, 4 insertions(+), 2 deletions(-)
  1068.  
  1069. diff --git a/encoder/analyse.c b/encoder/analyse.c
  1070. index 2ece9dc..74672d1 100644
  1071. --- a/encoder/analyse.c
  1072. +++ b/encoder/analyse.c
  1073. @@ -1480,7 +1480,7 @@ static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a,
  1074.          weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
  1075.      h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
  1076.      if( weight[2].weightfn ) \
  1077. -        weight[1].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
  1078. +        weight[2].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
  1079.  
  1080.  
  1081.      if( pixel == PIXEL_4x4 )
  1082. diff --git a/encoder/encoder.c b/encoder/encoder.c
  1083. index a07f0ea..1438ec0 100644
  1084. --- a/encoder/encoder.c
  1085. +++ b/encoder/encoder.c
  1086. @@ -1338,7 +1338,7 @@ int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t
  1087.          if( h->fref0[i_ref]->i_frame != h->fref0[j]->i_frame )
  1088.          {
  1089.              /* found a place, after j, make sure there is not already a duplicate there */
  1090. -            if( j == i-1 || ( h->fref0[j+1] && h->fref0[i_ref]->i_frame != h->fref0[j+1]->i_frame ) )
  1091. +            if( j == i-1 || ( h->fref0[i_ref]->i_frame != h->fref0[j+1]->i_frame ) )
  1092.                  break;
  1093.          }
  1094.  
  1095. diff --git a/output/matroska.c b/output/matroska.c
  1096. index 25e91d5..47753d7 100644
  1097. --- a/output/matroska.c
  1098. +++ b/output/matroska.c
  1099. @@ -150,6 +150,8 @@ static int write_headers( hnd_t handle, x264_nal_t *p_nal )
  1100.                            avcC, avcC_len, p_mkv->frame_duration, 50000,
  1101.                            p_mkv->width, p_mkv->height,
  1102.                            p_mkv->d_width, p_mkv->d_height );
  1103. +    if( ret < 0 )
  1104. +        return ret;
  1105.  
  1106.      free( avcC );
  1107.  
  1108. --
  1109. 1.7.0.4
  1110.  
  1111.  
  1112. From 822d21d2fd1d116cbeaac0be676edabeeca026cb Mon Sep 17 00:00:00 2001
  1113. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  1114. Date: Fri, 16 Apr 2010 11:36:43 -0700
  1115. Subject: [PATCH 5/6] Fix issues with extremely large timebases
  1116.  With timebase denominators >= 2^30, x264 would silently overflow and cause odd issues.
  1117.  Now x264 will explicitly fail with timebase denominators >= 2^31 and work with timebase denominators 2^31 > x >= 2^30.
  1118.  
  1119. ---
  1120. common/common.c       |   14 +++++++-------
  1121.  common/common.h       |    2 +-
  1122.  common/set.h          |    4 ++--
  1123.  encoder/encoder.c     |   22 +++++++++++++++-------
  1124.  encoder/ratecontrol.c |    4 ++--
  1125.  input/input.h         |   12 ++++++------
  1126.  input/timecode.c      |    8 ++++----
  1127.  input/y4m.c           |    3 ++-
  1128.  output/flv.c          |    4 ++--
  1129.  output/matroska.c     |    4 ++--
  1130.  output/mp4.c          |    2 +-
  1131.  x264.c                |    8 ++++----
  1132.  x264.h                |   11 +++++------
  1133.  13 files changed, 53 insertions(+), 45 deletions(-)
  1134.  
  1135. diff --git a/common/common.c b/common/common.c
  1136. index 924323a..6471c07 100644
  1137. --- a/common/common.c
  1138. +++ b/common/common.c
  1139. @@ -614,7 +614,7 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
  1140.      }
  1141.      OPT("fps")
  1142.      {
  1143. -        if( sscanf( value, "%d/%d", &p->i_fps_num, &p->i_fps_den ) == 2 )
  1144. +        if( sscanf( value, "%u/%u", &p->i_fps_num, &p->i_fps_den ) == 2 )
  1145.              ;
  1146.          else
  1147.          {
  1148. @@ -1119,11 +1119,11 @@ void x264_free( void *p )
  1149.  /****************************************************************************
  1150.   * x264_reduce_fraction:
  1151.   ****************************************************************************/
  1152. -void x264_reduce_fraction( int *n, int *d )
  1153. +void x264_reduce_fraction( uint32_t *n, uint32_t *d )
  1154.  {
  1155. -    int a = *n;
  1156. -    int b = *d;
  1157. -    int c;
  1158. +    uint32_t a = *n;
  1159. +    uint32_t b = *d;
  1160. +    uint32_t c;
  1161.      if( !a || !b )
  1162.          return;
  1163.      c = a % b;
  1164. @@ -1185,8 +1185,8 @@ char *x264_param2string( x264_param_t *p, int b_res )
  1165.      if( b_res )
  1166.      {
  1167.          s += sprintf( s, "%dx%d ", p->i_width, p->i_height );
  1168. -        s += sprintf( s, "fps=%d/%d ", p->i_fps_num, p->i_fps_den );
  1169. -        s += sprintf( s, "timebase=%d/%d ", p->i_timebase_num, p->i_timebase_den );
  1170. +        s += sprintf( s, "fps=%u/%u ", p->i_fps_num, p->i_fps_den );
  1171. +        s += sprintf( s, "timebase=%u/%u ", p->i_timebase_num, p->i_timebase_den );
  1172.      }
  1173.  
  1174.      s += sprintf( s, "cabac=%d", p->b_cabac );
  1175. diff --git a/common/common.h b/common/common.h
  1176. index 37f309d..311decb 100644
  1177. --- a/common/common.h
  1178. +++ b/common/common.h
  1179. @@ -134,7 +134,7 @@ int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_sta
  1180.  /* log */
  1181.  void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
  1182.  
  1183. -void x264_reduce_fraction( int *n, int *d );
  1184. +void x264_reduce_fraction( uint32_t *n, uint32_t *d );
  1185.  void x264_init_vlc_tables();
  1186.  
  1187.  static ALWAYS_INLINE uint8_t x264_clip_uint8( int x )
  1188. diff --git a/common/set.h b/common/set.h
  1189. index 9783118..ee27d74 100644
  1190. --- a/common/set.h
  1191. +++ b/common/set.h
  1192. @@ -112,8 +112,8 @@ typedef struct
  1193.          int i_chroma_loc_bottom;
  1194.  
  1195.          int b_timing_info_present;
  1196. -        int i_num_units_in_tick;
  1197. -        int i_time_scale;
  1198. +        uint32_t i_num_units_in_tick;
  1199. +        uint32_t i_time_scale;
  1200.          int b_fixed_frame_rate;
  1201.  
  1202.          int b_nal_hrd_parameters_present;
  1203. diff --git a/encoder/encoder.c b/encoder/encoder.c
  1204. index 1438ec0..9b21d92 100644
  1205. --- a/encoder/encoder.c
  1206. +++ b/encoder/encoder.c
  1207. @@ -817,10 +817,10 @@ static void x264_set_aspect_ratio( x264_t *h, x264_param_t *param, int initial )
  1208.      /* VUI */
  1209.      if( param->vui.i_sar_width > 0 && param->vui.i_sar_height > 0 )
  1210.      {
  1211. -        int i_w = param->vui.i_sar_width;
  1212. -        int i_h = param->vui.i_sar_height;
  1213. -        int old_w = h->param.vui.i_sar_width;
  1214. -        int old_h = h->param.vui.i_sar_height;
  1215. +        uint32_t i_w = param->vui.i_sar_width;
  1216. +        uint32_t i_h = param->vui.i_sar_height;
  1217. +        uint32_t old_w = h->param.vui.i_sar_width;
  1218. +        uint32_t old_h = h->param.vui.i_sar_height;
  1219.  
  1220.          x264_reduce_fraction( &i_w, &i_h );
  1221.  
  1222. @@ -886,21 +886,29 @@ x264_t *x264_encoder_open( x264_param_t *param )
  1223.      h->i_frame = -1;
  1224.      h->i_frame_num = 0;
  1225.      h->i_idr_pic_id = 0;
  1226. +    uint64_t new_timebase_den = h->param.i_timebase_den;
  1227.      if( h->param.b_dts_compress )
  1228.      {
  1229.          /* h->i_dts_compress_multiplier == h->frames.i_bframe_delay + 1 */
  1230.          h->i_dts_compress_multiplier = h->param.i_bframe ? (h->param.i_bframe_pyramid ? 3 : 2) : 1;
  1231.          if( h->i_dts_compress_multiplier != 1 )
  1232.          {
  1233. -            x264_log( h, X264_LOG_DEBUG, "DTS compresion changed timebase: %d/%d -> %d/%d\n",
  1234. +            new_timebase_den = (uint64_t)h->param.i_timebase_den * h->i_dts_compress_multiplier;
  1235. +            x264_log( h, X264_LOG_DEBUG, "DTS compression changed timebase: %u/%u -> %u/%"PRIu64"\n",
  1236.                        h->param.i_timebase_num, h->param.i_timebase_den,
  1237. -                      h->param.i_timebase_num, h->param.i_timebase_den * h->i_dts_compress_multiplier );
  1238. -            h->param.i_timebase_den *= h->i_dts_compress_multiplier;
  1239. +                      h->param.i_timebase_num, new_timebase_den );
  1240.          }
  1241.      }
  1242.      else
  1243.          h->i_dts_compress_multiplier = 1;
  1244.  
  1245. +    if( new_timebase_den * 2 >= (1ULL << 32) )
  1246. +    {
  1247. +        x264_log( h, X264_LOG_ERROR, "Effective timebase denominator %"PRIu64" exceeds H.264 maximum\n", new_timebase_den );
  1248. +        goto fail;
  1249. +    }
  1250. +    h->param.i_timebase_den = new_timebase_den;
  1251. +
  1252.      h->sps = &h->sps_array[0];
  1253.      x264_sps_init( h->sps, h->param.i_sps_id, &h->param );
  1254.  
  1255. diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
  1256. index b51dbf7..8dd38f1 100644
  1257. --- a/encoder/ratecontrol.c
  1258. +++ b/encoder/ratecontrol.c
  1259. @@ -657,14 +657,14 @@ int x264_ratecontrol_new( x264_t *h )
  1260.                  return -1;
  1261.              }
  1262.  
  1263. -            if( ( p = strstr( opts, "timebase=" ) ) && sscanf( p, "timebase=%d/%d", &i, &j ) != 2 )
  1264. +            if( ( p = strstr( opts, "timebase=" ) ) && sscanf( p, "timebase=%u/%u", &i, &j ) != 2 )
  1265.              {
  1266.                  x264_log( h, X264_LOG_ERROR, "timebase specified in stats file not valid\n" );
  1267.                  return -1;
  1268.              }
  1269.              if( i != h->param.i_timebase_num || j != h->param.i_timebase_den )
  1270.              {
  1271. -                x264_log( h, X264_LOG_ERROR, "timebase mismatch with 1st pass (%d/%d vs %d/%d)\n",
  1272. +                x264_log( h, X264_LOG_ERROR, "timebase mismatch with 1st pass (%u/%u vs %u/%u)\n",
  1273.                            h->param.i_timebase_num, h->param.i_timebase_den, i, j );
  1274.                  return -1;
  1275.              }
  1276. diff --git a/input/input.h b/input/input.h
  1277. index b6cd218..eb62fdd 100644
  1278. --- a/input/input.h
  1279. +++ b/input/input.h
  1280. @@ -38,15 +38,15 @@ typedef struct
  1281.  typedef struct
  1282.  {
  1283.      int csp; /* X264_CSP_YV12 or X264_CSP_I420 */
  1284. -    int fps_num;
  1285. -    int fps_den;
  1286. +    uint32_t fps_num;
  1287. +    uint32_t fps_den;
  1288.      int height;
  1289.      int interlaced;
  1290. -    int sar_width;
  1291. -    int sar_height;
  1292. +    uint32_t sar_width;
  1293. +    uint32_t sar_height;
  1294.      int tff;
  1295. -    int timebase_num;
  1296. -    int timebase_den;
  1297. +    uint32_t timebase_num;
  1298. +    uint32_t timebase_den;
  1299.      int vfr;
  1300.      int width;
  1301.  } video_info_t;
  1302. diff --git a/input/timecode.c b/input/timecode.c
  1303. index 4a369ee..1fc2eab 100644
  1304. --- a/input/timecode.c
  1305. +++ b/input/timecode.c
  1306. @@ -30,10 +30,10 @@ typedef struct
  1307.      cli_input_t input;
  1308.      hnd_t p_handle;
  1309.      int frame_total;
  1310. -    int auto_timebase_num;
  1311. -    int auto_timebase_den;
  1312. -    int timebase_num;
  1313. -    int timebase_den;
  1314. +    uint32_t auto_timebase_num;
  1315. +    uint32_t auto_timebase_den;
  1316. +    uint32_t timebase_num;
  1317. +    uint32_t timebase_den;
  1318.      int seek;
  1319.      int stored_pts_num;
  1320.      int64_t *pts;
  1321. diff --git a/input/y4m.c b/input/y4m.c
  1322. index c34f264..842b986 100644
  1323. --- a/input/y4m.c
  1324. +++ b/input/y4m.c
  1325. @@ -40,7 +40,8 @@ typedef struct
  1326.  static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
  1327.  {
  1328.      y4m_hnd_t *h = malloc( sizeof(y4m_hnd_t) );
  1329. -    int  i, n, d;
  1330. +    int i;
  1331. +    uint32_t n, d;
  1332.      char header[MAX_YUV4_HEADER+10];
  1333.      char *tokend, *header_end;
  1334.      int colorspace = X264_CSP_NONE;
  1335. diff --git a/output/flv.c b/output/flv.c
  1336. index 04f4428..e441b6d 100644
  1337. --- a/output/flv.c
  1338. +++ b/output/flv.c
  1339. @@ -47,8 +47,8 @@ typedef struct
  1340.      int64_t i_prev_dts;
  1341.      int64_t i_prev_pts;
  1342.  
  1343. -    int i_timebase_num;
  1344. -    int i_timebase_den;
  1345. +    uint32_t i_timebase_num;
  1346. +    uint32_t i_timebase_den;
  1347.      int b_vfr_input;
  1348.  
  1349.      unsigned start;
  1350. diff --git a/output/matroska.c b/output/matroska.c
  1351. index 47753d7..0304c84 100644
  1352. --- a/output/matroska.c
  1353. +++ b/output/matroska.c
  1354. @@ -30,8 +30,8 @@ typedef struct
  1355.      int64_t frame_duration;
  1356.  
  1357.      char b_writing_frame;
  1358. -    int i_timebase_num;
  1359. -    int i_timebase_den;
  1360. +    uint32_t i_timebase_num;
  1361. +    uint32_t i_timebase_den;
  1362.  
  1363.  } mkv_hnd_t;
  1364.  
  1365. diff --git a/output/mp4.c b/output/mp4.c
  1366. index cbe9f5c..f76541e 100644
  1367. --- a/output/mp4.c
  1368. +++ b/output/mp4.c
  1369. @@ -38,7 +38,7 @@ typedef struct
  1370.      GF_ISOSample *p_sample;
  1371.      int i_track;
  1372.      uint32_t i_descidx;
  1373. -    int i_time_res;
  1374. +    uint32_t i_time_res;
  1375.      int64_t i_time_inc;
  1376.      int i_numframe;
  1377.      int i_delay_time;
  1378. diff --git a/x264.c b/x264.c
  1379. index 3f46fd9..cabdb1d 100644
  1380. --- a/x264.c
  1381. +++ b/x264.c
  1382. @@ -1205,9 +1205,9 @@ generic_option:
  1383.      }
  1384.      if( !tcfile_name && input_opt.timebase )
  1385.      {
  1386. -        int i_user_timebase_num;
  1387. -        int i_user_timebase_den;
  1388. -        int ret = sscanf( input_opt.timebase, "%d/%d", &i_user_timebase_num, &i_user_timebase_den );
  1389. +        uint32_t i_user_timebase_num;
  1390. +        uint32_t i_user_timebase_den;
  1391. +        int ret = sscanf( input_opt.timebase, "%u/%u", &i_user_timebase_num, &i_user_timebase_den );
  1392.          if( !ret )
  1393.          {
  1394.              fprintf( stderr, "x264 [error]: invalid argument: timebase = %s\n", input_opt.timebase );
  1395. @@ -1216,7 +1216,7 @@ generic_option:
  1396.          else if( ret == 1 )
  1397.          {
  1398.              i_user_timebase_num = param->i_timebase_num;
  1399. -            i_user_timebase_den = atoi( input_opt.timebase );
  1400. +            i_user_timebase_den = strtoul( input_opt.timebase, NULL, 10 );
  1401.          }
  1402.          opt->timebase_convert_multiplier = ((double)i_user_timebase_den / param->i_timebase_den)
  1403.                                           * ((double)param->i_timebase_num / i_user_timebase_num);
  1404. diff --git a/x264.h b/x264.h
  1405. index d30effe..83f087e 100644
  1406. --- a/x264.h
  1407. +++ b/x264.h
  1408. @@ -35,7 +35,7 @@
  1409.  
  1410.  #include <stdarg.h>
  1411.  
  1412. -#define X264_BUILD 93
  1413. +#define X264_BUILD 94
  1414.  
  1415.  /* x264_t:
  1416.   *      opaque handler for encoder */
  1417. @@ -208,9 +208,6 @@ typedef struct x264_param_t
  1418.          int         i_chroma_loc;    /* both top & bottom */
  1419.      } vui;
  1420.  
  1421. -    int         i_fps_num;
  1422. -    int         i_fps_den;
  1423. -
  1424.      /* Bitstream parameters */
  1425.      int         i_frame_reference;  /* Maximum number of reference frames */
  1426.      int         i_keyint_max;       /* Force an IDR keyframe at this interval */
  1427. @@ -330,8 +327,10 @@ typedef struct x264_param_t
  1428.                                   * otherwise place size (4 bytes) before NAL units. */
  1429.      int i_sps_id;               /* SPS and PPS id number */
  1430.      int b_vfr_input;            /* VFR input */
  1431. -    int i_timebase_num;         /* Timebase numerator */
  1432. -    int i_timebase_den;         /* Timebase denominator */
  1433. +    uint32_t i_fps_num;
  1434. +    uint32_t i_fps_den;
  1435. +    uint32_t i_timebase_num;    /* Timebase numerator */
  1436. +    uint32_t i_timebase_den;    /* Timebase denominator */
  1437.      int b_dts_compress;         /* DTS compression: this algorithm eliminates negative DTS
  1438.                                   * by compressing them to be less than the second PTS.
  1439.                                   * Warning: this will change the timebase! */
  1440. --
  1441. 1.7.0.4
  1442.  
  1443.  
  1444. From a146be13483474c117bc0ce4638d4a444b37c85b Mon Sep 17 00:00:00 2001
  1445. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  1446. Date: Fri, 16 Apr 2010 12:06:07 -0700
  1447. Subject: [PATCH 6/6] MMX code for predictor rounding/clipping
  1448.  Faster predictor checking at subme < 3.
  1449.  
  1450. ---
  1451. common/common.h   |   11 +++++++++++
  1452.  common/x86/util.h |   41 +++++++++++++++++++++++++++++++++++++++++
  1453.  encoder/me.c      |   11 ++++++-----
  1454.  3 files changed, 58 insertions(+), 5 deletions(-)
  1455.  
  1456. diff --git a/common/common.h b/common/common.h
  1457. index 311decb..f4bd5dc 100644
  1458. --- a/common/common.h
  1459. +++ b/common/common.h
  1460. @@ -188,6 +188,17 @@ static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum( uint8_t *mvdleft, uint8_t *mvd
  1461.      return amvd0 + (amvd1<<8);
  1462.  }
  1463.  
  1464. +static void ALWAYS_INLINE x264_predictor_roundclip( int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
  1465. +{
  1466. +    for( int i = 0; i < i_mvc; i++ )
  1467. +    {
  1468. +        int mx = (mvc[i][0] + 2) >> 2;
  1469. +        int my = (mvc[i][1] + 2) >> 2;
  1470. +        mvc[i][0] = x264_clip3( mx, mv_x_min, mv_x_max );
  1471. +        mvc[i][1] = x264_clip3( my, mv_y_min, mv_y_max );
  1472. +    }
  1473. +}
  1474. +
  1475.  extern const uint8_t x264_exp2_lut[64];
  1476.  extern const float x264_log2_lut[128];
  1477.  extern const float x264_log2_lz_lut[32];
  1478. diff --git a/common/x86/util.h b/common/x86/util.h
  1479. index e094309..1a5ed32 100644
  1480. --- a/common/x86/util.h
  1481. +++ b/common/x86/util.h
  1482. @@ -45,6 +45,7 @@ static ALWAYS_INLINE void x264_median_mv_mmxext( int16_t *dst, int16_t *a, int16
  1483.          :"m"(M32( a )), "m"(M32( b )), "m"(M32( c ))
  1484.      );
  1485.  }
  1486. +
  1487.  #define x264_predictor_difference x264_predictor_difference_mmxext
  1488.  static ALWAYS_INLINE int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t i_mvc )
  1489.  {
  1490. @@ -80,6 +81,7 @@ static ALWAYS_INLINE int x264_predictor_difference_mmxext( int16_t (*mvc)[2], in
  1491.      );
  1492.      return sum;
  1493.  }
  1494. +
  1495.  #define x264_cabac_mvd_sum x264_cabac_mvd_sum_mmxext
  1496.  static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmxext(uint8_t *mvdleft, uint8_t *mvdtop)
  1497.  {
  1498. @@ -103,6 +105,45 @@ static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmxext(uint8_t *mvdleft, uint8_
  1499.      );
  1500.      return amvd;
  1501.  }
  1502. +
  1503. +#define x264_predictor_roundclip x264_predictor_roundclip_mmxext
  1504. +static void ALWAYS_INLINE x264_predictor_roundclip_mmxext( int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
  1505. +{
  1506. +    uint32_t mv_min = pack16to32_mask( mv_x_min, mv_y_min );
  1507. +    uint32_t mv_max = pack16to32_mask( mv_x_max, mv_y_max );
  1508. +    static const uint64_t pw_2 = 0x0002000200020002ULL;
  1509. +    intptr_t i = i_mvc;
  1510. +    asm(
  1511. +        "movd    %2, %%mm5       \n"
  1512. +        "movd    %3, %%mm6       \n"
  1513. +        "movq    %4, %%mm7       \n"
  1514. +        "punpckldq %%mm5, %%mm5  \n"
  1515. +        "punpckldq %%mm6, %%mm6  \n"
  1516. +        "test $1, %0             \n"
  1517. +        "jz 1f                   \n"
  1518. +        "movd -4(%5,%0,4), %%mm0 \n"
  1519. +        "paddw %%mm7, %%mm0      \n"
  1520. +        "psraw $2, %%mm0         \n"
  1521. +        "pmaxsw %%mm5, %%mm0     \n"
  1522. +        "pminsw %%mm6, %%mm0     \n"
  1523. +        "movd %%mm0, -4(%5,%0,4) \n"
  1524. +        "dec %0                  \n"
  1525. +        "jz 2f                   \n"
  1526. +        "1:                      \n"
  1527. +        "movq -8(%5,%0,4), %%mm0 \n"
  1528. +        "paddw %%mm7, %%mm0      \n"
  1529. +        "psraw $2, %%mm0         \n"
  1530. +        "pmaxsw %%mm5, %%mm0     \n"
  1531. +        "pminsw %%mm6, %%mm0     \n"
  1532. +        "movq %%mm0, -8(%5,%0,4) \n"
  1533. +        "sub $2, %0              \n"
  1534. +        "jnz 1b                  \n"
  1535. +        "2:                      \n"
  1536. +        :"+r"(i), "+m"(M64( mvc ))
  1537. +        :"g"(mv_min), "g"(mv_max), "m"(pw_2), "r"(mvc)
  1538. +    );
  1539. +}
  1540. +
  1541.  #undef M128_CONST
  1542.  #define M128_CONST(x) ((__m128){x,x,x,x})
  1543.  #define x264_union128_t x264_union128_sse_t
  1544. diff --git a/encoder/me.c b/encoder/me.c
  1545. index 6788022..0b519ea 100644
  1546. --- a/encoder/me.c
  1547. +++ b/encoder/me.c
  1548. @@ -241,14 +241,15 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
  1549.           * sensible to omit the cost of the MV from the rounded MVP to avoid unfairly
  1550.           * biasing against use of the predicted motion vector. */
  1551.          bcost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[bmy*stride+bmx], stride );
  1552. +        uint32_t bmv = pack16to32_mask( bmx, bmy );
  1553. +        if( i_mvc )
  1554. +            x264_predictor_roundclip( mvc, i_mvc, mv_x_min, mv_x_max, mv_y_min, mv_y_max );
  1555.          for( int i = 0; i < i_mvc; i++ )
  1556.          {
  1557. -            int mx = (mvc[i][0] + 2) >> 2;
  1558. -            int my = (mvc[i][1] + 2) >> 2;
  1559. -            if( (mx | my) && ((mx-bmx) | (my-bmy)) )
  1560. +            if( M32( mvc[i] ) && (bmv - M32( mvc[i] )) )
  1561.              {
  1562. -                mx = x264_clip3( mx, mv_x_min, mv_x_max );
  1563. -                my = x264_clip3( my, mv_y_min, mv_y_max );
  1564. +                int mx = mvc[i][0];
  1565. +                int my = mvc[i][1];
  1566.                  COST_MV( mx, my );
  1567.              }
  1568.          }
  1569. --
  1570. 1.7.0.4
RAW Paste Data
Top