SHARE
TWEET

Untitled

Pasted by a guest on May 19th, 2017 — 164 views — expiration: Never
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
  1. From 5a463b2ff722915b2f27a8aeb4d1eaaa49de28f3 Mon Sep 17 00:00:00 2001
  2. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  3. Date: Tue, 13 Apr 2010 01:08:29 -0700
  4. Subject: [PATCH 1/6] Add CP128/M128 macros using SSE, fix some aliasing
  5.  Significantly improve the speed of cache_load and cache_save functions.
  6.  Also fix a ton of pessimization in cache_save and cache_load due to aliasing.
  7.  
  8. ---
  9. common/common.h     |    5 +
  10.  common/macroblock.c |  203 +++++++++++++++++++++++++++------------------------
  11.  common/x86/util.h   |    8 ++
  12.  3 files changed, 120 insertions(+), 96 deletions(-)
  13.  
  14. diff --git a/common/common.h b/common/common.h
  15. index b8c6dfd..38e9b74 100644
  16. --- a/common/common.h
  17. +++ b/common/common.h
  18. @@ -88,12 +88,17 @@ do {\
  19.  typedef union { uint16_t i; uint8_t  c[2]; } MAY_ALIAS x264_union16_t;
  20.  typedef union { uint32_t i; uint16_t b[2]; uint8_t  c[4]; } MAY_ALIAS x264_union32_t;
  21.  typedef union { uint64_t i; uint32_t a[2]; uint16_t b[4]; uint8_t c[8]; } MAY_ALIAS x264_union64_t;
  22. +typedef struct { uint64_t i[2]; } x264_uint128_t;
  23. +typedef union { x264_uint128_t i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS x264_union128_t;
  24.  #define M16(src) (((x264_union16_t*)(src))->i)
  25.  #define M32(src) (((x264_union32_t*)(src))->i)
  26.  #define M64(src) (((x264_union64_t*)(src))->i)
  27. +#define M128(src) (((x264_union128_t*)(src))->i)
  28. +#define M128_CONST(x) ((x264_uint128_t){{x,x}})
  29.  #define CP16(dst,src) M16(dst) = M16(src)
  30.  #define CP32(dst,src) M32(dst) = M32(src)
  31.  #define CP64(dst,src) M64(dst) = M64(src)
  32. +#define CP128(dst,src) M128(dst) = M128(src)
  33.  
  34.  #include "x264.h"
  35.  #include "bs.h"
  36. diff --git a/common/macroblock.c b/common/macroblock.c
  37. index 0b9b903..fb4c1a5 100644
  38. --- a/common/macroblock.c
  39. +++ b/common/macroblock.c
  40. @@ -1026,19 +1026,23 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
  41.      int left = h->mb.i_mb_left_xy;
  42.      int top  = h->mb.i_mb_top_xy;
  43.  
  44. +    /* GCC pessimizes direct loads from heap-allocated arrays due to aliasing.*/
  45. +    /* By only dereferencing them once, we avoid this issue. */
  46. +    int8_t (*i4x4)[8] = h->mb.intra4x4_pred_mode;
  47. +    uint8_t (*nnz)[24] = h->mb.non_zero_count;
  48. +
  49.      /* load cache */
  50.      if( h->mb.i_neighbour & MB_TOP )
  51.      {
  52.          h->mb.cache.i_cbp_top = h->mb.cbp[top];
  53. -
  54.          /* load intra4x4 */
  55. -        CP32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8], &h->mb.intra4x4_pred_mode[top][0] );
  56. +        CP32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8], &i4x4[top][0] );
  57.  
  58.          /* load non_zero_count */
  59. -        CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &h->mb.non_zero_count[top][12] );
  60. +        CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[top][12] );
  61.          /* shift because x264_scan8[16] is misaligned */
  62. -        M32( &h->mb.cache.non_zero_count[x264_scan8[16+0] - 9] ) = M16( &h->mb.non_zero_count[top][18] ) << 8;
  63. -        M32( &h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] ) = M16( &h->mb.non_zero_count[top][22] ) << 8;
  64. +        M32( &h->mb.cache.non_zero_count[x264_scan8[16+0] - 9] ) = M16( &nnz[top][18] ) << 8;
  65. +        M32( &h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] ) = M16( &nnz[top][22] ) << 8;
  66.      }
  67.      else
  68.      {
  69. @@ -1058,22 +1062,22 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
  70.          h->mb.cache.i_cbp_left = h->mb.cbp[left];
  71.  
  72.          /* load intra4x4 */
  73. -        h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] = h->mb.intra4x4_pred_mode[left][4];
  74. -        h->mb.cache.intra4x4_pred_mode[x264_scan8[2 ] - 1] = h->mb.intra4x4_pred_mode[left][5];
  75. -        h->mb.cache.intra4x4_pred_mode[x264_scan8[8 ] - 1] = h->mb.intra4x4_pred_mode[left][6];
  76. -        h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = h->mb.intra4x4_pred_mode[left][3];
  77. +        h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] = i4x4[left][4];
  78. +        h->mb.cache.intra4x4_pred_mode[x264_scan8[2 ] - 1] = i4x4[left][5];
  79. +        h->mb.cache.intra4x4_pred_mode[x264_scan8[8 ] - 1] = i4x4[left][6];
  80. +        h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = i4x4[left][3];
  81.  
  82.          /* load non_zero_count */
  83. -        h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = h->mb.non_zero_count[left][3];
  84. -        h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = h->mb.non_zero_count[left][7];
  85. -        h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = h->mb.non_zero_count[left][11];
  86. -        h->mb.cache.non_zero_count[x264_scan8[10] - 1] = h->mb.non_zero_count[left][15];
  87. +        h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left][3];
  88. +        h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left][7];
  89. +        h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left][11];
  90. +        h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left][15];
  91.  
  92. -        h->mb.cache.non_zero_count[x264_scan8[16+0] - 1] = h->mb.non_zero_count[left][16+1];
  93. -        h->mb.cache.non_zero_count[x264_scan8[16+2] - 1] = h->mb.non_zero_count[left][16+3];
  94. +        h->mb.cache.non_zero_count[x264_scan8[16+0] - 1] = nnz[left][16+1];
  95. +        h->mb.cache.non_zero_count[x264_scan8[16+2] - 1] = nnz[left][16+3];
  96.  
  97. -        h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] = h->mb.non_zero_count[left][16+4+1];
  98. -        h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = h->mb.non_zero_count[left][16+4+3];
  99. +        h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] = nnz[left][16+4+1];
  100. +        h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = nnz[left][16+4+3];
  101.      }
  102.      else
  103.      {
  104. @@ -1146,11 +1150,14 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
  105.  
  106.          for( int l = 0; l < (h->sh.i_type == SLICE_TYPE_B) + 1; l++ )
  107.          {
  108. +            int16_t (*mv)[2] = h->mb.mv[l];
  109. +            int8_t *ref = h->mb.ref[l];
  110. +
  111.              int i8 = x264_scan8[0] - 1 - 1*8;
  112.              if( h->mb.i_neighbour & MB_TOPLEFT )
  113.              {
  114. -                h->mb.cache.ref[l][i8] = h->mb.ref[l][top_8x8 - 1];
  115. -                CP32( h->mb.cache.mv[l][i8], h->mb.mv[l][top_4x4 - 1] );
  116. +                h->mb.cache.ref[l][i8] = ref[top_8x8 - 1];
  117. +                CP32( h->mb.cache.mv[l][i8], mv[top_4x4 - 1] );
  118.              }
  119.              else
  120.              {
  121. @@ -1162,24 +1169,22 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
  122.              if( h->mb.i_neighbour & MB_TOP )
  123.              {
  124.                  h->mb.cache.ref[l][i8+0] =
  125. -                h->mb.cache.ref[l][i8+1] = h->mb.ref[l][top_8x8 + 0];
  126. +                h->mb.cache.ref[l][i8+1] = ref[top_8x8 + 0];
  127.                  h->mb.cache.ref[l][i8+2] =
  128. -                h->mb.cache.ref[l][i8+3] = h->mb.ref[l][top_8x8 + 1];
  129. -                CP64( h->mb.cache.mv[l][i8+0], h->mb.mv[l][top_4x4+0] );
  130. -                CP64( h->mb.cache.mv[l][i8+2], h->mb.mv[l][top_4x4+2] );
  131. +                h->mb.cache.ref[l][i8+3] = ref[top_8x8 + 1];
  132. +                CP128( h->mb.cache.mv[l][i8], mv[top_4x4] );
  133.              }
  134.              else
  135.              {
  136. -                M64( h->mb.cache.mv[l][i8+0] ) = 0;
  137. -                M64( h->mb.cache.mv[l][i8+2] ) = 0;
  138. +                M128( h->mb.cache.mv[l][i8] ) = M128_CONST( 0 );
  139.                  M32( &h->mb.cache.ref[l][i8] ) = (uint8_t)(-2) * 0x01010101U;
  140.              }
  141.  
  142.              i8 = x264_scan8[0] + 4 - 1*8;
  143.              if( h->mb.i_neighbour & MB_TOPRIGHT )
  144.              {
  145. -                h->mb.cache.ref[l][i8] = h->mb.ref[l][top_8x8 + 2];
  146. -                CP32( h->mb.cache.mv[l][i8], h->mb.mv[l][top_4x4 + 4] );
  147. +                h->mb.cache.ref[l][i8] = ref[top_8x8 + 2];
  148. +                CP32( h->mb.cache.mv[l][i8], mv[top_4x4 + 4] );
  149.              }
  150.              else
  151.                   h->mb.cache.ref[l][i8] = -2;
  152. @@ -1190,14 +1195,14 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
  153.                  const int ir = h->mb.i_b8_xy - 1;
  154.                  const int iv = h->mb.i_b4_xy - 1;
  155.                  h->mb.cache.ref[l][i8+0*8] =
  156. -                h->mb.cache.ref[l][i8+1*8] = h->mb.ref[l][ir + 0*s8x8];
  157. +                h->mb.cache.ref[l][i8+1*8] = ref[ir + 0*s8x8];
  158.                  h->mb.cache.ref[l][i8+2*8] =
  159. -                h->mb.cache.ref[l][i8+3*8] = h->mb.ref[l][ir + 1*s8x8];
  160. +                h->mb.cache.ref[l][i8+3*8] = ref[ir + 1*s8x8];
  161.  
  162. -                CP32( h->mb.cache.mv[l][i8+0*8], h->mb.mv[l][iv + 0*s4x4] );
  163. -                CP32( h->mb.cache.mv[l][i8+1*8], h->mb.mv[l][iv + 1*s4x4] );
  164. -                CP32( h->mb.cache.mv[l][i8+2*8], h->mb.mv[l][iv + 2*s4x4] );
  165. -                CP32( h->mb.cache.mv[l][i8+3*8], h->mb.mv[l][iv + 3*s4x4] );
  166. +                CP32( h->mb.cache.mv[l][i8+0*8], mv[iv + 0*s4x4] );
  167. +                CP32( h->mb.cache.mv[l][i8+1*8], mv[iv + 1*s4x4] );
  168. +                CP32( h->mb.cache.mv[l][i8+2*8], mv[iv + 2*s4x4] );
  169. +                CP32( h->mb.cache.mv[l][i8+3*8], mv[iv + 3*s4x4] );
  170.              }
  171.              else
  172.              {
  173. @@ -1210,17 +1215,18 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
  174.  
  175.              if( h->param.b_cabac )
  176.              {
  177. +                uint8_t (*mvd)[8][2] = h->mb.mvd[l];
  178.                  if( h->mb.i_neighbour & MB_TOP )
  179. -                    CP64( h->mb.cache.mvd[l][x264_scan8[0] - 8], h->mb.mvd[l][top][0] );
  180. +                    CP64( h->mb.cache.mvd[l][x264_scan8[0] - 8], mvd[top][0] );
  181.                  else
  182.                      M64( h->mb.cache.mvd[l][x264_scan8[0] - 8] ) = 0;
  183.  
  184.                  if( h->mb.i_neighbour & MB_LEFT )
  185.                  {
  186. -                    CP16( h->mb.cache.mvd[l][x264_scan8[0 ] - 1], h->mb.mvd[l][left][4] );
  187. -                    CP16( h->mb.cache.mvd[l][x264_scan8[2 ] - 1], h->mb.mvd[l][left][5] );
  188. -                    CP16( h->mb.cache.mvd[l][x264_scan8[8 ] - 1], h->mb.mvd[l][left][6] );
  189. -                    CP16( h->mb.cache.mvd[l][x264_scan8[10] - 1], h->mb.mvd[l][left][3] );
  190. +                    CP16( h->mb.cache.mvd[l][x264_scan8[0 ] - 1], mvd[left][4] );
  191. +                    CP16( h->mb.cache.mvd[l][x264_scan8[2 ] - 1], mvd[left][5] );
  192. +                    CP16( h->mb.cache.mvd[l][x264_scan8[8 ] - 1], mvd[left][6] );
  193. +                    CP16( h->mb.cache.mvd[l][x264_scan8[10] - 1], mvd[left][3] );
  194.                  }
  195.                  else
  196.                      for( int i = 0; i < 4; i++ )
  197. @@ -1285,10 +1291,10 @@ void x264_macroblock_cache_save( x264_t *h )
  198.      const int i_mb_4x4 = h->mb.i_b4_xy;
  199.      const int i_mb_8x8 = h->mb.i_b8_xy;
  200.  
  201. -    /* GCC pessimizes direct stores to heap-allocated 8-bit arrays due to aliasing.*/
  202. +    /* GCC pessimizes direct stores to heap-allocated arrays due to aliasing.*/
  203.      /* By only dereferencing them once, we avoid this issue. */
  204. -    int8_t *intra4x4_pred_mode = h->mb.intra4x4_pred_mode[i_mb_xy];
  205. -    uint8_t *non_zero_count = h->mb.non_zero_count[i_mb_xy];
  206. +    int8_t *i4x4 = h->mb.intra4x4_pred_mode[i_mb_xy];
  207. +    uint8_t *nnz = h->mb.non_zero_count[i_mb_xy];
  208.  
  209.      x264_macroblock_store_pic( h, 0 );
  210.      x264_macroblock_store_pic( h, 1 );
  211. @@ -1303,15 +1309,15 @@ void x264_macroblock_cache_save( x264_t *h )
  212.      /* save intra4x4 */
  213.      if( i_mb_type == I_4x4 )
  214.      {
  215. -        CP32( &intra4x4_pred_mode[0], &h->mb.cache.intra4x4_pred_mode[x264_scan8[10]] );
  216. -        M32( &intra4x4_pred_mode[4] ) = pack8to32(h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ],
  217. -                                                  h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ],
  218. -                                                  h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ], 0);
  219. +        CP32( &i4x4[0], &h->mb.cache.intra4x4_pred_mode[x264_scan8[10]] );
  220. +        M32( &i4x4[4] ) = pack8to32( h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ],
  221. +                                     h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ],
  222. +                                     h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ], 0);
  223.      }
  224.      else if( !h->param.b_constrained_intra || IS_INTRA(i_mb_type) )
  225. -        M64( intra4x4_pred_mode ) = I_PRED_4x4_DC * 0x0101010101010101ULL;
  226. +        M64( i4x4 ) = I_PRED_4x4_DC * 0x0101010101010101ULL;
  227.      else
  228. -        M64( intra4x4_pred_mode ) = (uint8_t)(-1) * 0x0101010101010101ULL;
  229. +        M64( i4x4 ) = (uint8_t)(-1) * 0x0101010101010101ULL;
  230.  
  231.  
  232.      if( i_mb_type == I_PCM )
  233. @@ -1322,19 +1328,19 @@ void x264_macroblock_cache_save( x264_t *h )
  234.          h->mb.i_cbp_luma = 0xf;
  235.          h->mb.cbp[i_mb_xy] = 0x72f;   /* all set */
  236.          h->mb.b_transform_8x8 = 0;
  237. -        memset( non_zero_count, 16, sizeof( *h->mb.non_zero_count ) );
  238. +        memset( nnz, 16, sizeof( *h->mb.non_zero_count ) );
  239.      }
  240.      else
  241.      {
  242.          /* save non zero count */
  243. -        CP32( &non_zero_count[0*4], &h->mb.cache.non_zero_count[x264_scan8[0]+0*8] );
  244. -        CP32( &non_zero_count[1*4], &h->mb.cache.non_zero_count[x264_scan8[0]+1*8] );
  245. -        CP32( &non_zero_count[2*4], &h->mb.cache.non_zero_count[x264_scan8[0]+2*8] );
  246. -        CP32( &non_zero_count[3*4], &h->mb.cache.non_zero_count[x264_scan8[0]+3*8] );
  247. -        M16( &non_zero_count[16+0*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+0*2]-1] ) >> 8;
  248. -        M16( &non_zero_count[16+1*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+1*2]-1] ) >> 8;
  249. -        M16( &non_zero_count[16+2*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+2*2]-1] ) >> 8;
  250. -        M16( &non_zero_count[16+3*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+3*2]-1] ) >> 8;
  251. +        CP32( &nnz[0*4], &h->mb.cache.non_zero_count[x264_scan8[0]+0*8] );
  252. +        CP32( &nnz[1*4], &h->mb.cache.non_zero_count[x264_scan8[0]+1*8] );
  253. +        CP32( &nnz[2*4], &h->mb.cache.non_zero_count[x264_scan8[0]+2*8] );
  254. +        CP32( &nnz[3*4], &h->mb.cache.non_zero_count[x264_scan8[0]+3*8] );
  255. +        M16( &nnz[16+0*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+0*2]-1] ) >> 8;
  256. +        M16( &nnz[16+1*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+1*2]-1] ) >> 8;
  257. +        M16( &nnz[16+2*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+2*2]-1] ) >> 8;
  258. +        M16( &nnz[16+3*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+3*2]-1] ) >> 8;
  259.  
  260.          if( h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0 )
  261.              h->mb.i_qp = h->mb.i_last_qp;
  262. @@ -1349,47 +1355,56 @@ void x264_macroblock_cache_save( x264_t *h )
  263.  
  264.      if( h->sh.i_type != SLICE_TYPE_I )
  265.      {
  266. +        int16_t (*mv0)[2] = &h->mb.mv[0][i_mb_4x4];
  267. +        int16_t (*mv1)[2] = &h->mb.mv[1][i_mb_4x4];
  268. +        int8_t *ref0 = &h->mb.ref[0][i_mb_8x8];
  269. +        int8_t *ref1 = &h->mb.ref[1][i_mb_8x8];
  270.          if( !IS_INTRA( i_mb_type ) )
  271.          {
  272. -            h->mb.ref[0][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[0][x264_scan8[0]];
  273. -            h->mb.ref[0][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[0][x264_scan8[4]];
  274. -            h->mb.ref[0][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[0][x264_scan8[8]];
  275. -            h->mb.ref[0][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[0][x264_scan8[12]];
  276. -            for( int y = 0; y < 4; y++ )
  277. -            {
  278. -                CP64( h->mb.mv[0][i_mb_4x4+y*s4x4+0], h->mb.cache.mv[0][x264_scan8[0]+8*y+0] );
  279. -                CP64( h->mb.mv[0][i_mb_4x4+y*s4x4+2], h->mb.cache.mv[0][x264_scan8[0]+8*y+2] );
  280. -            }
  281. +            ref0[0+0*s8x8] = h->mb.cache.ref[0][x264_scan8[0]];
  282. +            ref0[1+0*s8x8] = h->mb.cache.ref[0][x264_scan8[4]];
  283. +            ref0[0+1*s8x8] = h->mb.cache.ref[0][x264_scan8[8]];
  284. +            ref0[1+1*s8x8] = h->mb.cache.ref[0][x264_scan8[12]];
  285. +            CP128( &mv0[0*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*0] );
  286. +            CP128( &mv0[1*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*1] );
  287. +            CP128( &mv0[2*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*2] );
  288. +            CP128( &mv0[3*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*3] );
  289.              if( h->sh.i_type == SLICE_TYPE_B )
  290.              {
  291. -                h->mb.ref[1][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[1][x264_scan8[0]];
  292. -                h->mb.ref[1][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[1][x264_scan8[4]];
  293. -                h->mb.ref[1][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[1][x264_scan8[8]];
  294. -                h->mb.ref[1][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[1][x264_scan8[12]];
  295. -                for( int y = 0; y < 4; y++ )
  296. -                {
  297. -                    CP64( h->mb.mv[1][i_mb_4x4+y*s4x4+0], h->mb.cache.mv[1][x264_scan8[0]+8*y+0] );
  298. -                    CP64( h->mb.mv[1][i_mb_4x4+y*s4x4+2], h->mb.cache.mv[1][x264_scan8[0]+8*y+2] );
  299. -                }
  300. +                ref1[0+0*s8x8] = h->mb.cache.ref[1][x264_scan8[0]];
  301. +                ref1[1+0*s8x8] = h->mb.cache.ref[1][x264_scan8[4]];
  302. +                ref1[0+1*s8x8] = h->mb.cache.ref[1][x264_scan8[8]];
  303. +                ref1[1+1*s8x8] = h->mb.cache.ref[1][x264_scan8[12]];
  304. +                CP128( &mv1[0*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*0] );
  305. +                CP128( &mv1[1*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*1] );
  306. +                CP128( &mv1[2*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*2] );
  307. +                CP128( &mv1[3*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*3] );
  308.              }
  309.          }
  310.          else
  311.          {
  312. -            for( int i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_B ? 2  : 1 ); i_list++ )
  313. +            M16( ref0+0*s8x8 ) = (uint8_t)(-1) * 0x0101;
  314. +            M16( ref0+1*s8x8 ) = (uint8_t)(-1) * 0x0101;
  315. +            M128( &mv0[0*s4x4] ) = M128_CONST( 0 );
  316. +            M128( &mv0[1*s4x4] ) = M128_CONST( 0 );
  317. +            M128( &mv0[2*s4x4] ) = M128_CONST( 0 );
  318. +            M128( &mv0[3*s4x4] ) = M128_CONST( 0 );
  319. +            if( h->sh.i_type == SLICE_TYPE_B )
  320.              {
  321. -                M16( &h->mb.ref[i_list][i_mb_8x8+0*s8x8] ) = (uint8_t)(-1) * 0x0101;
  322. -                M16( &h->mb.ref[i_list][i_mb_8x8+1*s8x8] ) = (uint8_t)(-1) * 0x0101;
  323. -                for( int y = 0; y < 4; y++ )
  324. -                {
  325. -                    M64( h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] ) = 0;
  326. -                    M64( h->mb.mv[i_list][i_mb_4x4+y*s4x4+2] ) = 0;
  327. -                }
  328. +                M16( ref1+0*s8x8 ) = (uint8_t)(-1) * 0x0101;
  329. +                M16( ref1+1*s8x8 ) = (uint8_t)(-1) * 0x0101;
  330. +                M128( &mv1[0*s4x4] ) = M128_CONST( 0 );
  331. +                M128( &mv1[1*s4x4] ) = M128_CONST( 0 );
  332. +                M128( &mv1[2*s4x4] ) = M128_CONST( 0 );
  333. +                M128( &mv1[3*s4x4] ) = M128_CONST( 0 );
  334.              }
  335.          }
  336.      }
  337.  
  338.      if( h->param.b_cabac )
  339.      {
  340. +        uint8_t (*mvd0)[2] = h->mb.mvd[0][i_mb_xy];
  341. +        uint8_t (*mvd1)[2] = h->mb.mvd[1][i_mb_xy];
  342.          if( IS_INTRA(i_mb_type) && i_mb_type != I_PCM )
  343.              h->mb.chroma_pred_mode[i_mb_xy] = x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ];
  344.          else
  345. @@ -1397,27 +1412,23 @@ void x264_macroblock_cache_save( x264_t *h )
  346.  
  347.          if( !IS_INTRA( i_mb_type ) && !IS_SKIP( i_mb_type ) && !IS_DIRECT( i_mb_type ) )
  348.          {
  349. -            CP64( h->mb.mvd[0][i_mb_xy][0], h->mb.cache.mvd[0][x264_scan8[10]] );
  350. -            CP16( h->mb.mvd[0][i_mb_xy][4], h->mb.cache.mvd[0][x264_scan8[5 ]] );
  351. -            CP16( h->mb.mvd[0][i_mb_xy][5], h->mb.cache.mvd[0][x264_scan8[7 ]] );
  352. -            CP16( h->mb.mvd[0][i_mb_xy][6], h->mb.cache.mvd[0][x264_scan8[13]] );
  353. +            CP64( mvd0[0], h->mb.cache.mvd[0][x264_scan8[10]] );
  354. +            CP16( mvd0[4], h->mb.cache.mvd[0][x264_scan8[5 ]] );
  355. +            CP16( mvd0[5], h->mb.cache.mvd[0][x264_scan8[7 ]] );
  356. +            CP16( mvd0[6], h->mb.cache.mvd[0][x264_scan8[13]] );
  357.              if( h->sh.i_type == SLICE_TYPE_B )
  358.              {
  359. -                CP64( h->mb.mvd[1][i_mb_xy][0], h->mb.cache.mvd[1][x264_scan8[10]] );
  360. -                CP16( h->mb.mvd[1][i_mb_xy][4], h->mb.cache.mvd[1][x264_scan8[5 ]] );
  361. -                CP16( h->mb.mvd[1][i_mb_xy][5], h->mb.cache.mvd[1][x264_scan8[7 ]] );
  362. -                CP16( h->mb.mvd[1][i_mb_xy][6], h->mb.cache.mvd[1][x264_scan8[13]] );
  363. +                CP64( mvd1[0], h->mb.cache.mvd[1][x264_scan8[10]] );
  364. +                CP16( mvd1[4], h->mb.cache.mvd[1][x264_scan8[5 ]] );
  365. +                CP16( mvd1[5], h->mb.cache.mvd[1][x264_scan8[7 ]] );
  366. +                CP16( mvd1[6], h->mb.cache.mvd[1][x264_scan8[13]] );
  367.              }
  368.          }
  369.          else
  370.          {
  371. -            M64( h->mb.mvd[0][i_mb_xy][0] ) = 0;
  372. -            M64( h->mb.mvd[0][i_mb_xy][4] ) = 0;
  373. +            M128( mvd0[0] ) = M128_CONST( 0 );
  374.              if( h->sh.i_type == SLICE_TYPE_B )
  375. -            {
  376. -                M64( h->mb.mvd[1][i_mb_xy][0] ) = 0;
  377. -                M64( h->mb.mvd[1][i_mb_xy][4] ) = 0;
  378. -            }
  379. +                M128( mvd1[0] ) = M128_CONST( 0 );
  380.          }
  381.  
  382.          if( h->sh.i_type == SLICE_TYPE_B )
  383. diff --git a/common/x86/util.h b/common/x86/util.h
  384. index ccc0733..e094309 100644
  385. --- a/common/x86/util.h
  386. +++ b/common/x86/util.h
  387. @@ -25,6 +25,9 @@
  388.  #define X264_X86_UTIL_H
  389.  
  390.  #ifdef __GNUC__
  391. +
  392. +#include <xmmintrin.h>
  393. +
  394.  #define x264_median_mv x264_median_mv_mmxext
  395.  static ALWAYS_INLINE void x264_median_mv_mmxext( int16_t *dst, int16_t *a, int16_t *b, int16_t *c )
  396.  {
  397. @@ -100,6 +103,11 @@ static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmxext(uint8_t *mvdleft, uint8_
  398.      );
  399.      return amvd;
  400.  }
  401. +#undef M128_CONST
  402. +#define M128_CONST(x) ((__m128){x,x,x,x})
  403. +#define x264_union128_t x264_union128_sse_t
  404. +typedef union { __m128 i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS x264_union128_sse_t;
  405. +
  406.  #endif
  407.  
  408.  #endif
  409. --
  410. 1.7.0.4
  411.  
  412.  
  413. From 064db2907f52c95a7254f313edba9788dc6d9c03 Mon Sep 17 00:00:00 2001
  414. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  415. Date: Wed, 14 Apr 2010 14:43:25 -0700
  416. Subject: [PATCH 2/6] Prefetch MB data in cache_load
  417.  Dramatically reduces L1 cache misses.
  418.  ~10% faster cache_load.
  419.  
  420. ---
  421. common/macroblock.c |   38 +++++++++++++++++++++++++++++++-------
  422.  common/osdep.h      |   13 +++++++++++++
  423.  2 files changed, 44 insertions(+), 7 deletions(-)
  424.  
  425. diff --git a/common/macroblock.c b/common/macroblock.c
  426. index fb4c1a5..5c9734f 100644
  427. --- a/common/macroblock.c
  428. +++ b/common/macroblock.c
  429. @@ -941,6 +941,7 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x
  430.  static void inline x264_macroblock_cache_load_neighbours( x264_t *h, int mb_x, int mb_y )
  431.  {
  432.      int top = (mb_y - (1 << h->mb.b_interlaced)) * h->mb.i_mb_stride + mb_x;
  433. +
  434.      h->mb.i_mb_x = mb_x;
  435.      h->mb.i_mb_y = mb_y;
  436.      h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
  437. @@ -986,6 +987,16 @@ static void inline x264_macroblock_cache_load_neighbours( x264_t *h, int mb_x, i
  438.  
  439.                  if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_top ) )
  440.                      h->mb.i_neighbour_intra |= MB_TOP;
  441. +
  442. +                /* We only need to prefetch the top blocks because the left was just written
  443. +                 * to as part of the previous cache_save.  Since most target CPUs use write-allocate
  444. +                 * caches, left blocks are near-guaranteed to be in L1 cache.  Top--not so much. */
  445. +                x264_prefetch( &h->mb.cbp[top] );
  446. +                x264_prefetch( h->mb.intra4x4_pred_mode[top] );
  447. +                x264_prefetch( &h->mb.non_zero_count[top][12] );
  448. +                /* These aren't always allocated, but prefetching an invalid address can't hurt. */
  449. +                x264_prefetch( &h->mb.mb_transform_size[top] );
  450. +                x264_prefetch( &h->mb.skipbp[top] );
  451.              }
  452.          }
  453.  
  454. @@ -1025,16 +1036,20 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
  455.  
  456.      int left = h->mb.i_mb_left_xy;
  457.      int top  = h->mb.i_mb_top_xy;
  458. +    int top_y = mb_y - (1 << h->mb.b_interlaced);
  459. +    int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
  460. +    int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;
  461.  
  462.      /* GCC pessimizes direct loads from heap-allocated arrays due to aliasing.*/
  463.      /* By only dereferencing them once, we avoid this issue. */
  464.      int8_t (*i4x4)[8] = h->mb.intra4x4_pred_mode;
  465.      uint8_t (*nnz)[24] = h->mb.non_zero_count;
  466. +    int16_t *cbp = h->mb.cbp;
  467.  
  468.      /* load cache */
  469.      if( h->mb.i_neighbour & MB_TOP )
  470.      {
  471. -        h->mb.cache.i_cbp_top = h->mb.cbp[top];
  472. +        h->mb.cache.i_cbp_top = cbp[top];
  473.          /* load intra4x4 */
  474.          CP32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8], &i4x4[top][0] );
  475.  
  476. @@ -1059,7 +1074,7 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
  477.  
  478.      if( h->mb.i_neighbour & MB_LEFT )
  479.      {
  480. -        h->mb.cache.i_cbp_left = h->mb.cbp[left];
  481. +        h->mb.cache.i_cbp_left = cbp[left];
  482.  
  483.          /* load intra4x4 */
  484.          h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] = i4x4[left][4];
  485. @@ -1078,6 +1093,18 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
  486.  
  487.          h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] = nnz[left][16+4+1];
  488.          h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = nnz[left][16+4+3];
  489. +
  490. +        /* Finish the prefetching */
  491. +        if( h->sh.i_type != SLICE_TYPE_I )
  492. +            for( int l = 0; l < (h->sh.i_type == SLICE_TYPE_B) + 1; l++ )
  493. +            {
  494. +                x264_prefetch( &h->mb.mv[l][top_4x4-1] );
  495. +                /* Top right being not in the same cacheline as top left will happen
  496. +                 * once every 4 MBs, so one extra prefetch is worthwhile */
  497. +                x264_prefetch( &h->mb.mv[l][top_4x4+4] );
  498. +                x264_prefetch( &h->mb.ref[l][top_8x8-1] );
  499. +                x264_prefetch( &h->mb.mvd[l][top] );
  500. +            }
  501.      }
  502.      else
  503.      {
  504. @@ -1142,11 +1169,8 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
  505.      /* load ref/mv/mvd */
  506.      if( h->sh.i_type != SLICE_TYPE_I )
  507.      {
  508. -        const int s8x8 = h->mb.i_b8_stride;
  509. -        const int s4x4 = h->mb.i_b4_stride;
  510. -        const int top_y = mb_y - (1 << h->mb.b_interlaced);
  511. -        const int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
  512. -        const int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;
  513. +        int s8x8 = h->mb.i_b8_stride;
  514. +        int s4x4 = h->mb.i_b4_stride;
  515.  
  516.          for( int l = 0; l < (h->sh.i_type == SLICE_TYPE_B) + 1; l++ )
  517.          {
  518. diff --git a/common/osdep.h b/common/osdep.h
  519. index f97547f..35772f7 100644
  520. --- a/common/osdep.h
  521. +++ b/common/osdep.h
  522. @@ -251,6 +251,19 @@ static int ALWAYS_INLINE x264_ctz( uint32_t x )
  523.  }
  524.  #endif
  525.  
  526. +#if defined(__GNUC__) && defined(HAVE_MMX)
  527. +/* Don't use __builtin_prefetch; even as recent as 4.3.4, GCC seems incapable of
  528. + * using complex address modes properly unless we use inline asm. */
  529. +static ALWAYS_INLINE void x264_prefetch( void *p )
  530. +{
  531. +    asm volatile( "prefetcht0 %0"::"m"(*(uint8_t*)p) );
  532. +}
  533. +#elif defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 1)
  534. +#define x264_prefetch(x) __builtin_prefetch(x)
  535. +#else
  536. +#define x264_prefetch(x)
  537. +#endif
  538. +
  539.  #ifdef USE_REAL_PTHREAD
  540.  #ifdef SYS_MINGW
  541.  #define x264_lower_thread_priority(p)\
  542. --
  543. 1.7.0.4
  544.  
  545.  
  546. From 8891a9dc2c2602e09c1fc1636b3e3da584cadee2 Mon Sep 17 00:00:00 2001
  547. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  548. Date: Thu, 15 Apr 2010 16:32:31 -0700
  549. Subject: [PATCH 3/6] Move deblocking/hpel into sliced threads
  550.  Instead of doing both as a separate pass, do them during the main encode.
  551.  This requires disabling deblocking between slices (disable_deblock_idc == 2).
  552.  Overall performance gain is about 11% on --preset superfast with sliced threads.
  553.  Doesn't reduce the amount of actual computation done: only better parallelizes it.
  554.  
  555. ---
  556. common/common.h     |    5 ++-
  557.  common/frame.c      |   12 ++++-
  558.  common/macroblock.c |   68 ++++++++++++++++++-------
  559.  common/macroblock.h |    9 +++-
  560.  encoder/encoder.c   |  136 ++++++++++++++++++++++++++-------------------------
  561.  encoder/lookahead.c |    9 ++-
  562.  6 files changed, 146 insertions(+), 93 deletions(-)
  563.  
  564. diff --git a/common/common.h b/common/common.h
  565. index 38e9b74..37f309d 100644
  566. --- a/common/common.h
  567. +++ b/common/common.h
  568. @@ -566,7 +566,8 @@ struct x264_t
  569.          int16_t (*mvr[2][32])[2];           /* 16x16 mv for each possible ref */
  570.          int8_t  *skipbp;                    /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */
  571.          int8_t  *mb_transform_size;         /* transform_size_8x8_flag of each mb */
  572. -        uint8_t *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
  573. +        uint16_t *slice_table;              /* sh->first_mb of the slice that the indexed mb is part of
  574. +                                             * NOTE: this will fail on resolutions above 2^16 pixels... */
  575.  
  576.           /* buffer for weighted versions of the reference frames */
  577.          uint8_t *p_weight_buf[16];
  578. @@ -763,7 +764,9 @@ struct x264_t
  579.      ALIGNED_16( uint16_t nr_offset[2][64] );
  580.      uint32_t        nr_count[2];
  581.  
  582. +    /* Buffers that are allocated per-thread even in sliced threads. */
  583.      void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
  584. +    uint8_t *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
  585.  
  586.      /* CPU functions dependents */
  587.      x264_predict_t      predict_16x16[4+3];
  588. diff --git a/common/frame.c b/common/frame.c
  589. index abcfd14..872e067 100644
  590. --- a/common/frame.c
  591. +++ b/common/frame.c
  592. @@ -658,6 +658,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
  593.      int stride2y  = stridey << b_interlaced;
  594.      int strideuv  = h->fdec->i_stride[1];
  595.      int stride2uv = strideuv << b_interlaced;
  596. +    int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2;
  597.      uint8_t (*nnz_backup)[16] = h->scratch_buffer;
  598.  
  599.      if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
  600. @@ -778,9 +779,18 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
  601.           * i_dir == 1 -> horizontal edge */
  602.          #define DEBLOCK_DIR(i_dir)\
  603.          {\
  604. -            int i_edge = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
  605. +            int i_edge = 0;\
  606.              int i_qpn, mbn_xy, mbn_8x8, mbn_4x4;\
  607.              ALIGNED_4( uint8_t bS[4] );  /* filtering strength */\
  608. +            /* We don't have to consider the MBAFF case of a slice breaking in the middle\
  609. +             * of a row because x264 doesn't support that case.  If we add support for that,\
  610. +             * this will have to become significantly more complex. */\
  611. +            if( i_dir == 0 && (mb_x == 0 || (!deblock_on_slice_edges &&\
  612. +                h->mb.slice_table[mb_xy] != h->mb.slice_table[mb_xy-1])) )\
  613. +                i_edge++;\
  614. +            if( i_dir == 1 && (mb_y <= b_interlaced || (!deblock_on_slice_edges &&\
  615. +                h->mb.slice_table[mb_xy] != h->mb.slice_table[mb_xy-(h->mb.i_mb_stride<<b_interlaced)])) )\
  616. +                i_edge++;\
  617.              if( i_edge )\
  618.                  i_edge+= b_8x8_transform;\
  619.              else\
  620. diff --git a/common/macroblock.c b/common/macroblock.c
  621. index 5c9734f..4ef959f 100644
  622. --- a/common/macroblock.c
  623. +++ b/common/macroblock.c
  624. @@ -675,7 +675,7 @@ void x264_mb_mc( x264_t *h )
  625.      }
  626.  }
  627.  
  628. -int x264_macroblock_cache_init( x264_t *h )
  629. +int x264_macroblock_cache_allocate( x264_t *h )
  630.  {
  631.      int i_mb_count = h->mb.i_mb_count;
  632.  
  633. @@ -689,6 +689,8 @@ int x264_macroblock_cache_init( x264_t *h )
  634.      CHECKED_MALLOC( h->mb.cbp, i_mb_count * sizeof(int16_t) );
  635.      CHECKED_MALLOC( h->mb.skipbp, i_mb_count * sizeof(int8_t) );
  636.      CHECKED_MALLOC( h->mb.mb_transform_size, i_mb_count * sizeof(int8_t) );
  637. +    CHECKED_MALLOC( h->mb.slice_table, i_mb_count * sizeof(uint16_t) );
  638. +    memset( h->mb.slice_table, -1, i_mb_count * sizeof(uint16_t) );
  639.  
  640.      /* 0 -> 3 top(4), 4 -> 6 : left(3) */
  641.      CHECKED_MALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 8 * sizeof(int8_t) );
  642. @@ -755,22 +757,11 @@ int x264_macroblock_cache_init( x264_t *h )
  643.  #undef ALIGN
  644.      }
  645.  
  646. -    for( int i = 0; i <= h->param.b_interlaced; i++ )
  647. -        for( int j = 0; j < 3; j++ )
  648. -        {
  649. -            /* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */
  650. -            CHECKED_MALLOCZERO( h->mb.intra_border_backup[i][j], (h->sps->i_mb_width*16+32)>>!!j );
  651. -            h->mb.intra_border_backup[i][j] += 8;
  652. -        }
  653. -
  654.      return 0;
  655.  fail: return -1;
  656.  }
  657. -void x264_macroblock_cache_end( x264_t *h )
  658. +void x264_macroblock_cache_free( x264_t *h )
  659.  {
  660. -    for( int i = 0; i <= h->param.b_interlaced; i++ )
  661. -        for( int j = 0; j < 3; j++ )
  662. -            x264_free( h->mb.intra_border_backup[i][j] - 8 );
  663.      for( int i = 0; i < 2; i++ )
  664.          for( int j = 0; j < 32; j++ )
  665.              x264_free( h->mb.mvr[i][j] );
  666. @@ -783,6 +774,7 @@ void x264_macroblock_cache_end( x264_t *h )
  667.          x264_free( h->mb.mvd[0] );
  668.          x264_free( h->mb.mvd[1] );
  669.      }
  670. +    x264_free( h->mb.slice_table );
  671.      x264_free( h->mb.intra4x4_pred_mode );
  672.      x264_free( h->mb.non_zero_count );
  673.      x264_free( h->mb.mb_transform_size );
  674. @@ -790,6 +782,47 @@ void x264_macroblock_cache_end( x264_t *h )
  675.      x264_free( h->mb.cbp );
  676.      x264_free( h->mb.qp );
  677.  }
  678. +
  679. +int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
  680. +{
  681. +    if( !b_lookahead )
  682. +        for( int i = 0; i <= h->param.b_interlaced; i++ )
  683. +            for( int j = 0; j < 3; j++ )
  684. +            {
  685. +                /* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */
  686. +                CHECKED_MALLOCZERO( h->intra_border_backup[i][j], (h->sps->i_mb_width*16+32)>>!!j );
  687. +                h->intra_border_backup[i][j] += 8;
  688. +            }
  689. +
  690. +    /* Allocate scratch buffer */
  691. +    int scratch_size = 0;
  692. +    if( !b_lookahead )
  693. +    {
  694. +        int buf_hpel = (h->thread[0]->fdec->i_width[0]+48) * sizeof(int16_t);
  695. +        int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
  696. +        int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
  697. +        int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
  698. +            ((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
  699. +        int buf_nnz = !h->param.b_cabac * h->pps->b_transform_8x8_mode * (h->sps->i_mb_width * 4 * 16 * sizeof(uint8_t));
  700. +        scratch_size = X264_MAX4( buf_hpel, buf_ssim, buf_tesa, buf_nnz );
  701. +    }
  702. +    int buf_mbtree = h->param.rc.b_mb_tree * ((h->sps->i_mb_width+3)&~3) * sizeof(int);
  703. +    scratch_size = X264_MAX( scratch_size, buf_mbtree );
  704. +    CHECKED_MALLOC( h->scratch_buffer, scratch_size );
  705. +
  706. +    return 0;
  707. +fail: return -1;
  708. +}
  709. +
  710. +void x264_macroblock_thread_free( x264_t *h, int b_lookahead )
  711. +{
  712. +    if( !b_lookahead )
  713. +        for( int i = 0; i <= h->param.b_interlaced; i++ )
  714. +            for( int j = 0; j < 3; j++ )
  715. +                x264_free( h->intra_border_backup[i][j] - 8 );
  716. +    x264_free( h->scratch_buffer );
  717. +}
  718. +
  719.  void x264_macroblock_slice_init( x264_t *h )
  720.  {
  721.      h->mb.mv[0] = h->fdec->mv[0];
  722. @@ -898,8 +931,7 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x
  723.                             ? w * (mb_x + (mb_y&~1) * i_stride) + (mb_y&1) * i_stride
  724.                             : w * (mb_x + mb_y * i_stride);
  725.      const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset];
  726. -    const uint8_t *intra_fdec = h->param.b_sliced_threads ? plane_fdec-i_stride2 :
  727. -                                &h->mb.intra_border_backup[mb_y & h->sh.b_mbaff][i][mb_x*16>>!!i];
  728. +    const uint8_t *intra_fdec = &h->intra_border_backup[mb_y & h->sh.b_mbaff][i][mb_x*16>>!!i];
  729.      int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
  730.      x264_frame_t **fref[2] = { h->fref0, h->fref1 };
  731.      if( h->mb.b_interlaced )
  732. @@ -908,10 +940,7 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x
  733.      h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset];
  734.      h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE,
  735.          h->mb.pic.p_fenc_plane[i], i_stride2, w );
  736. -    if( mb_y > 0 )
  737. -        memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
  738. -    else
  739. -        memset( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], 0, w*3/2+1 );
  740. +    memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
  741.      if( h->mb.b_interlaced )
  742.          for( int j = 0; j < w; j++ )
  743.              h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
  744. @@ -1327,6 +1356,7 @@ void x264_macroblock_cache_save( x264_t *h )
  745.      x264_prefetch_fenc( h, h->fdec, h->mb.i_mb_x, h->mb.i_mb_y );
  746.  
  747.      h->mb.type[i_mb_xy] = i_mb_type;
  748. +    h->mb.slice_table[i_mb_xy] = h->sh.i_first_mb;
  749.      h->mb.partition[i_mb_xy] = IS_INTRA( i_mb_type ) ? D_16x16 : h->mb.i_partition;
  750.      h->mb.i_mb_prev_xy = i_mb_xy;
  751.  
  752. diff --git a/common/macroblock.h b/common/macroblock.h
  753. index 5ef1498..ee8c113 100644
  754. --- a/common/macroblock.h
  755. +++ b/common/macroblock.h
  756. @@ -260,13 +260,18 @@ enum cabac_ctx_block_cat_e
  757.      DCT_LUMA_8x8  = 5,
  758.  };
  759.  
  760. +/* Per-frame allocation: is allocated per-thread only in frame-threads mode. */
  761. +int  x264_macroblock_cache_allocate( x264_t *h );
  762. +void x264_macroblock_cache_free( x264_t *h );
  763. +
  764. +/* Per-thread allocation: is allocated per-thread even in sliced-threads mode. */
  765. +int  x264_macroblock_thread_allocate( x264_t *h, int b_lookahead );
  766. +void x264_macroblock_thread_free( x264_t *h, int b_lookahead );
  767.  
  768. -int  x264_macroblock_cache_init( x264_t *h );
  769.  void x264_macroblock_slice_init( x264_t *h );
  770.  void x264_macroblock_thread_init( x264_t *h );
  771.  void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y );
  772.  void x264_macroblock_cache_save( x264_t *h );
  773. -void x264_macroblock_cache_end( x264_t *h );
  774.  
  775.  void x264_macroblock_bipred_init( x264_t *h );
  776.  
  777. diff --git a/encoder/encoder.c b/encoder/encoder.c
  778. index 300041e..a07f0ea 100644
  779. --- a/encoder/encoder.c
  780. +++ b/encoder/encoder.c
  781. @@ -158,7 +158,7 @@ static void x264_slice_header_init( x264_t *h, x264_slice_header_t *sh,
  782.      int deblock_thresh = i_qp + 2 * X264_MIN(param->i_deblocking_filter_alphac0, param->i_deblocking_filter_beta);
  783.      /* If effective qp <= 15, deblocking would have no effect anyway */
  784.      if( param->b_deblocking_filter && (h->mb.b_variable_qp || 15 < deblock_thresh ) )
  785. -        sh->i_disable_deblocking_filter_idc = 0;
  786. +        sh->i_disable_deblocking_filter_idc = param->b_sliced_threads ? 2 : 0;
  787.      else
  788.          sh->i_disable_deblocking_filter_idc = 1;
  789.      sh->i_alpha_c0_offset = param->i_deblocking_filter_alphac0 << 1;
  790. @@ -519,6 +519,16 @@ static int x264_validate_parameters( x264_t *h )
  791.          h->param.rc.i_vbv_max_bitrate = 0;
  792.      }
  793.  
  794. +    if( h->param.b_interlaced && h->param.i_slice_max_size )
  795. +    {
  796. +        x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-size is not implemented\n" );
  797. +        h->param.i_slice_max_size = 0;
  798. +    }
  799. +    if( h->param.b_interlaced && h->param.i_slice_max_mbs )
  800. +    {
  801. +        x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-mbs is not implemented\n" );
  802. +        h->param.i_slice_max_mbs = 0;
  803. +    }
  804.      int max_slices = (h->param.i_height+((16<<h->param.b_interlaced)-1))/(16<<h->param.b_interlaced);
  805.      if( h->param.b_sliced_threads )
  806.          h->param.i_slice_count = x264_clip3( h->param.i_threads, 0, max_slices );
  807. @@ -527,16 +537,6 @@ static int x264_validate_parameters( x264_t *h )
  808.          h->param.i_slice_count = x264_clip3( h->param.i_slice_count, 0, max_slices );
  809.          h->param.i_slice_max_size = X264_MAX( h->param.i_slice_max_size, 0 );
  810.          h->param.i_slice_max_mbs = X264_MAX( h->param.i_slice_max_mbs, 0 );
  811. -        if( h->param.b_interlaced && h->param.i_slice_max_size )
  812. -        {
  813. -            x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-size is not implemented\n" );
  814. -            h->param.i_slice_max_size = 0;
  815. -        }
  816. -        if( h->param.b_interlaced && h->param.i_slice_max_mbs )
  817. -        {
  818. -            x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-mbs is not implemented\n" );
  819. -            h->param.i_slice_max_mbs = 0;
  820. -        }
  821.          if( h->param.i_slice_max_mbs || h->param.i_slice_max_size )
  822.              h->param.i_slice_count = 0;
  823.      }
  824. @@ -1059,23 +1059,13 @@ x264_t *x264_encoder_open( x264_param_t *param )
  825.          CHECKED_MALLOC( h->thread[i]->out.nal, init_nal_count*sizeof(x264_nal_t) );
  826.          h->thread[i]->out.i_nals_allocated = init_nal_count;
  827.  
  828. -        if( allocate_threadlocal_data && x264_macroblock_cache_init( h->thread[i] ) < 0 )
  829. +        if( allocate_threadlocal_data && x264_macroblock_cache_allocate( h->thread[i] ) < 0 )
  830.              goto fail;
  831.      }
  832.  
  833. -    /* Allocate scratch buffer */
  834. -    for( int i = 0; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
  835. -    {
  836. -        int buf_hpel = (h->fdec->i_width[0]+48) * sizeof(int16_t);
  837. -        int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
  838. -        int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
  839. -        int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
  840. -            ((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
  841. -        int buf_mbtree = h->param.rc.b_mb_tree * ((h->sps->i_mb_width+3)&~3) * sizeof(int);
  842. -        int buf_nnz = !h->param.b_cabac * h->pps->b_transform_8x8_mode * (h->sps->i_mb_width * 4 * 16 * sizeof(uint8_t));
  843. -        int scratch_size = X264_MAX4( buf_hpel, buf_ssim, buf_tesa, X264_MAX( buf_mbtree, buf_nnz ) );
  844. -        CHECKED_MALLOC( h->thread[i]->scratch_buffer, scratch_size );
  845. -    }
  846. +    for( int i = 0; i < h->param.i_threads; i++ )
  847. +        if( x264_macroblock_thread_allocate( h->thread[i], 0 ) < 0 )
  848. +            goto fail;
  849.  
  850.      if( x264_ratecontrol_new( h ) < 0 )
  851.          goto fail;
  852. @@ -1552,25 +1542,32 @@ static inline void x264_reference_build_list( x264_t *h, int i_poc )
  853.      h->mb.pic.i_fref[1] = h->i_ref1;
  854.  }
  855.  
  856. -static void x264_fdec_filter_row( x264_t *h, int mb_y )
  857. +static void x264_fdec_filter_row( x264_t *h, int mb_y, int b_inloop )
  858.  {
  859.      /* mb_y is the mb to be encoded next, not the mb to be filtered here */
  860.      int b_hpel = h->fdec->b_kept_as_ref;
  861. -    int b_deblock = !h->sh.i_disable_deblocking_filter_idc;
  862. -    int b_end = mb_y == h->sps->i_mb_height;
  863. +    int b_deblock = h->sh.i_disable_deblocking_filter_idc != 1;
  864. +    int b_end = mb_y == h->i_threadslice_end;
  865. +    int b_measure_quality = 1;
  866.      int min_y = mb_y - (1 << h->sh.b_mbaff);
  867. -    int max_y = b_end ? h->sps->i_mb_height : mb_y;
  868. +    int b_start = min_y == h->i_threadslice_start;
  869. +    int max_y = b_end ? h->i_threadslice_end : mb_y;
  870.      b_deblock &= b_hpel || h->param.psz_dump_yuv;
  871. +    if( h->param.b_sliced_threads && b_start && min_y && !b_inloop )
  872. +    {
  873. +        b_deblock = 0;         /* We already deblocked on the inloop pass. */
  874. +        b_measure_quality = 0; /* We already measured quality on the inloop pass. */
  875. +    }
  876.      if( mb_y & h->sh.b_mbaff )
  877.          return;
  878. -    if( min_y < 0 )
  879. +    if( min_y < h->i_threadslice_start )
  880.          return;
  881.  
  882. -    if( !b_end && !h->param.b_sliced_threads )
  883. +    if( !b_end && b_inloop )
  884.          for( int j = 0; j <= h->sh.b_mbaff; j++ )
  885.              for( int i = 0; i < 3; i++ )
  886.              {
  887. -                memcpy( h->mb.intra_border_backup[j][i],
  888. +                memcpy( h->intra_border_backup[j][i],
  889.                          h->fdec->plane[i] + ((mb_y*16 >> !!i) + j - 1 - h->sh.b_mbaff) * h->fdec->i_stride[i],
  890.                          h->sps->i_mb_width*16 >> !!i );
  891.              }
  892. @@ -1581,39 +1578,43 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )
  893.  
  894.      if( b_hpel )
  895.      {
  896. -        x264_frame_expand_border( h, h->fdec, min_y, b_end );
  897. +        int end = mb_y == h->sps->i_mb_height;
  898. +        x264_frame_expand_border( h, h->fdec, min_y, end );
  899.          if( h->param.analyse.i_subpel_refine )
  900.          {
  901. -            x264_frame_filter( h, h->fdec, min_y, b_end );
  902. -            x264_frame_expand_border_filtered( h, h->fdec, min_y, b_end );
  903. +            x264_frame_filter( h, h->fdec, min_y, end );
  904. +            x264_frame_expand_border_filtered( h, h->fdec, min_y, end );
  905.          }
  906.      }
  907.  
  908.      if( h->i_thread_frames > 1 && h->fdec->b_kept_as_ref )
  909.          x264_frame_cond_broadcast( h->fdec, mb_y*16 + (b_end ? 10000 : -(X264_THREAD_HEIGHT << h->sh.b_mbaff)) );
  910.  
  911. -    min_y = X264_MAX( min_y*16-8, 0 );
  912. -    max_y = b_end ? h->param.i_height : mb_y*16-8;
  913. -
  914. -    if( h->param.analyse.b_psnr )
  915. -        for( int i = 0; i < 3; i++ )
  916. -            h->stat.frame.i_ssd[i] +=
  917. -                x264_pixel_ssd_wxh( &h->pixf,
  918. -                    h->fdec->plane[i] + (min_y>>!!i) * h->fdec->i_stride[i], h->fdec->i_stride[i],
  919. -                    h->fenc->plane[i] + (min_y>>!!i) * h->fenc->i_stride[i], h->fenc->i_stride[i],
  920. -                    h->param.i_width >> !!i, (max_y-min_y) >> !!i );
  921. +    min_y = min_y*16 - 8 * !b_start;
  922. +    max_y = b_end ? X264_MIN( h->i_threadslice_end*16 , h->param.i_height ) : mb_y*16 - 8;
  923.  
  924. -    if( h->param.analyse.b_ssim )
  925. +    if( b_measure_quality )
  926.      {
  927. -        x264_emms();
  928. -        /* offset by 2 pixels to avoid alignment of ssim blocks with dct blocks,
  929. -         * and overlap by 4 */
  930. -        min_y += min_y == 0 ? 2 : -6;
  931. -        h->stat.frame.f_ssim +=
  932. -            x264_pixel_ssim_wxh( &h->pixf,
  933. -                h->fdec->plane[0] + 2+min_y*h->fdec->i_stride[0], h->fdec->i_stride[0],
  934. -                h->fenc->plane[0] + 2+min_y*h->fenc->i_stride[0], h->fenc->i_stride[0],
  935. -                h->param.i_width-2, max_y-min_y, h->scratch_buffer );
  936. +        if( h->param.analyse.b_psnr )
  937. +            for( int i = 0; i < 3; i++ )
  938. +                h->stat.frame.i_ssd[i] +=
  939. +                    x264_pixel_ssd_wxh( &h->pixf,
  940. +                        h->fdec->plane[i] + (min_y>>!!i) * h->fdec->i_stride[i], h->fdec->i_stride[i],
  941. +                        h->fenc->plane[i] + (min_y>>!!i) * h->fenc->i_stride[i], h->fenc->i_stride[i],
  942. +                        h->param.i_width >> !!i, (max_y-min_y) >> !!i );
  943. +
  944. +        if( h->param.analyse.b_ssim )
  945. +        {
  946. +            x264_emms();
  947. +            /* offset by 2 pixels to avoid alignment of ssim blocks with dct blocks,
  948. +             * and overlap by 4 */
  949. +            min_y += b_start ? 2 : -6;
  950. +            h->stat.frame.f_ssim +=
  951. +                x264_pixel_ssim_wxh( &h->pixf,
  952. +                    h->fdec->plane[0] + 2+min_y*h->fdec->i_stride[0], h->fdec->i_stride[0],
  953. +                    h->fenc->plane[0] + 2+min_y*h->fenc->i_stride[0], h->fenc->i_stride[0],
  954. +                    h->param.i_width-2, max_y-min_y, h->scratch_buffer );
  955. +        }
  956.      }
  957.  }
  958.  
  959. @@ -1808,8 +1809,8 @@ static int x264_slice_write( x264_t *h )
  960.              }
  961.          }
  962.  
  963. -        if( i_mb_x == 0 && !h->mb.b_reencode_mb && !h->param.b_sliced_threads )
  964. -            x264_fdec_filter_row( h, i_mb_y );
  965. +        if( i_mb_x == 0 && !h->mb.b_reencode_mb )
  966. +            x264_fdec_filter_row( h, i_mb_y, 1 );
  967.  
  968.          /* load cache */
  969.          x264_macroblock_cache_load( h, i_mb_x, i_mb_y );
  970. @@ -1971,14 +1972,13 @@ static int x264_slice_write( x264_t *h )
  971.      if( x264_nal_end( h ) )
  972.          return -1;
  973.  
  974. -    if( h->sh.i_last_mb == h->mb.i_mb_count-1 )
  975. +    if( h->sh.i_last_mb == (h->i_threadslice_end * h->sps->i_mb_width - 1) )
  976.      {
  977.          h->stat.frame.i_misc_bits = bs_pos( &h->out.bs )
  978.                                    + (h->out.i_nal*NALU_OVERHEAD * 8)
  979.                                    - h->stat.frame.i_tex_bits
  980.                                    - h->stat.frame.i_mv_bits;
  981. -        if( !h->param.b_sliced_threads )
  982. -            x264_fdec_filter_row( h, h->sps->i_mb_height );
  983. +        x264_fdec_filter_row( h, h->i_threadslice_end, 1 );
  984.      }
  985.  
  986.      return 0;
  987. @@ -2099,9 +2099,9 @@ static int x264_threaded_slices_write( x264_t *h )
  988.              return (intptr_t)ret;
  989.      }
  990.  
  991. -    /* deblocking and hpel filtering */
  992. -    for( int i = 0; i <= h->sps->i_mb_height; i++ )
  993. -        x264_stack_align( x264_fdec_filter_row, h, i );
  994. +    /* Go back and fix up the hpel on the borders between slices. */
  995. +    for( int i = 1; i < h->param.i_threads; i++ )
  996. +        x264_fdec_filter_row( h->thread[i], h->thread[i]->i_threadslice_start + 1, 0 );
  997.  
  998.      x264_threads_merge_ratecontrol( h );
  999.  
  1000. @@ -2114,10 +2114,12 @@ static int x264_threaded_slices_write( x264_t *h )
  1001.              h->out.i_nal++;
  1002.              x264_nal_check_buffer( h );
  1003.          }
  1004. -        /* All entries in stat.frame are ints except for ssd/ssim,
  1005. -         * which are only calculated in the main thread. */
  1006. +        /* All entries in stat.frame are ints except for ssd/ssim. */
  1007.          for( int j = 0; j < (offsetof(x264_t,stat.frame.i_ssd) - offsetof(x264_t,stat.frame.i_mv_bits)) / sizeof(int); j++ )
  1008.              ((int*)&h->stat.frame)[j] += ((int*)&t->stat.frame)[j];
  1009. +        for( int j = 0; j < 3; j++ )
  1010. +            h->stat.frame.i_ssd[j] += t->stat.frame.i_ssd[j];
  1011. +        h->stat.frame.f_ssim += t->stat.frame.f_ssim;
  1012.      }
  1013.  
  1014.      return 0;
  1015. @@ -3072,9 +3074,9 @@ void    x264_encoder_close  ( x264_t *h )
  1016.              (*frame)->i_reference_count--;
  1017.              if( (*frame)->i_reference_count == 0 )
  1018.                  x264_frame_delete( *frame );
  1019. -            x264_macroblock_cache_end( h->thread[i] );
  1020. +            x264_macroblock_cache_free( h->thread[i] );
  1021.          }
  1022. -        x264_free( h->thread[i]->scratch_buffer );
  1023. +        x264_macroblock_thread_free( h->thread[i], 0 );
  1024.          x264_free( h->thread[i]->out.p_bitstream );
  1025.          x264_free( h->thread[i]->out.nal);
  1026.          x264_free( h->thread[i] );
  1027. diff --git a/encoder/lookahead.c b/encoder/lookahead.c
  1028. index 7a0c6d3..5e29fb5 100644
  1029. --- a/encoder/lookahead.c
  1030. +++ b/encoder/lookahead.c
  1031. @@ -148,7 +148,10 @@ int x264_lookahead_init( x264_t *h, int i_slicetype_length )
  1032.  
  1033.      x264_t *look_h = h->thread[h->param.i_threads];
  1034.      *look_h = *h;
  1035. -    if( x264_macroblock_cache_init( look_h ) )
  1036. +    if( x264_macroblock_cache_allocate( look_h ) )
  1037. +        goto fail;
  1038. +
  1039. +    if( x264_macroblock_thread_allocate( look_h, 1 ) < 0 )
  1040.          goto fail;
  1041.  
  1042.      if( x264_pthread_create( &look_h->thread_handle, NULL, (void *)x264_lookahead_thread, look_h ) )
  1043. @@ -170,8 +173,8 @@ void x264_lookahead_delete( x264_t *h )
  1044.          x264_pthread_cond_broadcast( &h->lookahead->ifbuf.cv_fill );
  1045.          x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
  1046.          x264_pthread_join( h->thread[h->param.i_threads]->thread_handle, NULL );
  1047. -        x264_macroblock_cache_end( h->thread[h->param.i_threads] );
  1048. -        x264_free( h->thread[h->param.i_threads]->scratch_buffer );
  1049. +        x264_macroblock_cache_free( h->thread[h->param.i_threads] );
  1050. +        x264_macroblock_thread_free( h->thread[h->param.i_threads], 1 );
  1051.          x264_free( h->thread[h->param.i_threads] );
  1052.      }
  1053.      x264_synch_frame_list_delete( &h->lookahead->ifbuf );
  1054. --
  1055. 1.7.0.4
  1056.  
  1057.  
  1058. From cd9762c72e81e036b8eda7d6559d0a867f187c9e Mon Sep 17 00:00:00 2001
  1059. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  1060. Date: Fri, 16 Apr 2010 03:06:46 -0700
  1061. Subject: [PATCH 4/6] Fix four minor bugs found by Clang
  1062.  
  1063. ---
  1064. encoder/analyse.c |    2 +-
  1065.  encoder/encoder.c |    2 +-
  1066.  input/timecode.c  |   17 ++++++++++-------
  1067.  output/matroska.c |    2 ++
  1068.  4 files changed, 14 insertions(+), 9 deletions(-)
  1069.  
  1070. diff --git a/encoder/analyse.c b/encoder/analyse.c
  1071. index 2ece9dc..74672d1 100644
  1072. --- a/encoder/analyse.c
  1073. +++ b/encoder/analyse.c
  1074. @@ -1480,7 +1480,7 @@ static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a,
  1075.          weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
  1076.      h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
  1077.      if( weight[2].weightfn ) \
  1078. -        weight[1].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
  1079. +        weight[2].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
  1080.  
  1081.  
  1082.      if( pixel == PIXEL_4x4 )
  1083. diff --git a/encoder/encoder.c b/encoder/encoder.c
  1084. index a07f0ea..1438ec0 100644
  1085. --- a/encoder/encoder.c
  1086. +++ b/encoder/encoder.c
  1087. @@ -1338,7 +1338,7 @@ int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t
  1088.          if( h->fref0[i_ref]->i_frame != h->fref0[j]->i_frame )
  1089.          {
  1090.              /* found a place, after j, make sure there is not already a duplicate there */
  1091. -            if( j == i-1 || ( h->fref0[j+1] && h->fref0[i_ref]->i_frame != h->fref0[j+1]->i_frame ) )
  1092. +            if( j == i-1 || ( h->fref0[i_ref]->i_frame != h->fref0[j+1]->i_frame ) )
  1093.                  break;
  1094.          }
  1095.  
  1096. diff --git a/input/timecode.c b/input/timecode.c
  1097. index 4a369ee..5fabe61 100644
  1098. --- a/input/timecode.c
  1099. +++ b/input/timecode.c
  1100. @@ -194,15 +194,18 @@ static int parse_tcfile( FILE *tcfile_in, timecode_hnd_t *h, video_info_t *info
  1101.              ret = sscanf( buff, "%d,%d,%lf", &start, &end, &seq_fps );
  1102.              if( ret != 3 )
  1103.                  start = end = timecodes_num - 1;
  1104. -            if( h->auto_timebase_den || h->auto_timebase_num )
  1105. -                fpss[seq_num++] = seq_fps;
  1106. -            seq_fps = correct_fps( seq_fps, h );
  1107. -            if( seq_fps < 0 )
  1108. -                goto fail;
  1109.              for( ; num < start && num < timecodes_num - 1; num++ )
  1110.                  timecodes[num + 1] = timecodes[num] + 1 / assume_fps;
  1111. -            for( num = start; num <= end && num < timecodes_num - 1; num++ )
  1112. -                timecodes[num + 1] = timecodes[num] + 1 / seq_fps;
  1113. +            if( num < timecodes_num - 1 )
  1114. +            {
  1115. +                if( h->auto_timebase_den || h->auto_timebase_num )
  1116. +                    fpss[seq_num++] = seq_fps;
  1117. +                seq_fps = correct_fps( seq_fps, h );
  1118. +                if( seq_fps < 0 )
  1119. +                    goto fail;
  1120. +                for( num = start; num <= end && num < timecodes_num - 1; num++ )
  1121. +                    timecodes[num + 1] = timecodes[num] + 1 / seq_fps;
  1122. +            }
  1123.          }
  1124.          if( h->auto_timebase_den || h->auto_timebase_num )
  1125.              fpss[seq_num] = h->assume_fps;
  1126. diff --git a/output/matroska.c b/output/matroska.c
  1127. index 25e91d5..47753d7 100644
  1128. --- a/output/matroska.c
  1129. +++ b/output/matroska.c
  1130. @@ -150,6 +150,8 @@ static int write_headers( hnd_t handle, x264_nal_t *p_nal )
  1131.                            avcC, avcC_len, p_mkv->frame_duration, 50000,
  1132.                            p_mkv->width, p_mkv->height,
  1133.                            p_mkv->d_width, p_mkv->d_height );
  1134. +    if( ret < 0 )
  1135. +        return ret;
  1136.  
  1137.      free( avcC );
  1138.  
  1139. --
  1140. 1.7.0.4
  1141.  
  1142.  
  1143. From 217f4f314a13ae21b4ef559ddfa7cb1ce6b740f8 Mon Sep 17 00:00:00 2001
  1144. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  1145. Date: Fri, 16 Apr 2010 12:06:07 -0700
  1146. Subject: [PATCH 5/6] MMX code for predictor rounding/clipping
  1147.  Faster predictor checking at subme < 3.
  1148.  
  1149. ---
  1150. common/common.h   |   11 +++++++++++
  1151.  common/x86/util.h |   41 +++++++++++++++++++++++++++++++++++++++++
  1152.  encoder/me.c      |   11 ++++++-----
  1153.  3 files changed, 58 insertions(+), 5 deletions(-)
  1154.  
  1155. diff --git a/common/common.h b/common/common.h
  1156. index 37f309d..ce2e7af 100644
  1157. --- a/common/common.h
  1158. +++ b/common/common.h
  1159. @@ -188,6 +188,17 @@ static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum( uint8_t *mvdleft, uint8_t *mvd
  1160.      return amvd0 + (amvd1<<8);
  1161.  }
  1162.  
  1163. +static void ALWAYS_INLINE x264_predictor_roundclip( int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
  1164. +{
  1165. +    for( int i = 0; i < i_mvc; i++ )
  1166. +    {
  1167. +        int mx = (mvc[i][0] + 2) >> 2;
  1168. +        int my = (mvc[i][1] + 2) >> 2;
  1169. +        mvc[i][0] = x264_clip3( mx, mv_x_min, mv_x_max );
  1170. +        mvc[i][1] = x264_clip3( my, mv_y_min, mv_y_max );
  1171. +    }
  1172. +}
  1173. +
  1174.  extern const uint8_t x264_exp2_lut[64];
  1175.  extern const float x264_log2_lut[128];
  1176.  extern const float x264_log2_lz_lut[32];
  1177. diff --git a/common/x86/util.h b/common/x86/util.h
  1178. index e094309..1a5ed32 100644
  1179. --- a/common/x86/util.h
  1180. +++ b/common/x86/util.h
  1181. @@ -45,6 +45,7 @@ static ALWAYS_INLINE void x264_median_mv_mmxext( int16_t *dst, int16_t *a, int16
  1182.          :"m"(M32( a )), "m"(M32( b )), "m"(M32( c ))
  1183.      );
  1184.  }
  1185. +
  1186.  #define x264_predictor_difference x264_predictor_difference_mmxext
  1187.  static ALWAYS_INLINE int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t i_mvc )
  1188.  {
  1189. @@ -80,6 +81,7 @@ static ALWAYS_INLINE int x264_predictor_difference_mmxext( int16_t (*mvc)[2], in
  1190.      );
  1191.      return sum;
  1192.  }
  1193. +
  1194.  #define x264_cabac_mvd_sum x264_cabac_mvd_sum_mmxext
  1195.  static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmxext(uint8_t *mvdleft, uint8_t *mvdtop)
  1196.  {
  1197. @@ -103,6 +105,45 @@ static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmxext(uint8_t *mvdleft, uint8_
  1198.      );
  1199.      return amvd;
  1200.  }
  1201. +
  1202. +#define x264_predictor_roundclip x264_predictor_roundclip_mmxext
  1203. +static void ALWAYS_INLINE x264_predictor_roundclip_mmxext( int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
  1204. +{
  1205. +    uint32_t mv_min = pack16to32_mask( mv_x_min, mv_y_min );
  1206. +    uint32_t mv_max = pack16to32_mask( mv_x_max, mv_y_max );
  1207. +    static const uint64_t pw_2 = 0x0002000200020002ULL;
  1208. +    intptr_t i = i_mvc;
  1209. +    asm(
  1210. +        "movd    %2, %%mm5       \n"
  1211. +        "movd    %3, %%mm6       \n"
  1212. +        "movq    %4, %%mm7       \n"
  1213. +        "punpckldq %%mm5, %%mm5  \n"
  1214. +        "punpckldq %%mm6, %%mm6  \n"
  1215. +        "test $1, %0             \n"
  1216. +        "jz 1f                   \n"
  1217. +        "movd -4(%5,%0,4), %%mm0 \n"
  1218. +        "paddw %%mm7, %%mm0      \n"
  1219. +        "psraw $2, %%mm0         \n"
  1220. +        "pmaxsw %%mm5, %%mm0     \n"
  1221. +        "pminsw %%mm6, %%mm0     \n"
  1222. +        "movd %%mm0, -4(%5,%0,4) \n"
  1223. +        "dec %0                  \n"
  1224. +        "jz 2f                   \n"
  1225. +        "1:                      \n"
  1226. +        "movq -8(%5,%0,4), %%mm0 \n"
  1227. +        "paddw %%mm7, %%mm0      \n"
  1228. +        "psraw $2, %%mm0         \n"
  1229. +        "pmaxsw %%mm5, %%mm0     \n"
  1230. +        "pminsw %%mm6, %%mm0     \n"
  1231. +        "movq %%mm0, -8(%5,%0,4) \n"
  1232. +        "sub $2, %0              \n"
  1233. +        "jnz 1b                  \n"
  1234. +        "2:                      \n"
  1235. +        :"+r"(i), "+m"(M64( mvc ))
  1236. +        :"g"(mv_min), "g"(mv_max), "m"(pw_2), "r"(mvc)
  1237. +    );
  1238. +}
  1239. +
  1240.  #undef M128_CONST
  1241.  #define M128_CONST(x) ((__m128){x,x,x,x})
  1242.  #define x264_union128_t x264_union128_sse_t
  1243. diff --git a/encoder/me.c b/encoder/me.c
  1244. index 6788022..0b519ea 100644
  1245. --- a/encoder/me.c
  1246. +++ b/encoder/me.c
  1247. @@ -241,14 +241,15 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
  1248.           * sensible to omit the cost of the MV from the rounded MVP to avoid unfairly
  1249.           * biasing against use of the predicted motion vector. */
  1250.          bcost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[bmy*stride+bmx], stride );
  1251. +        uint32_t bmv = pack16to32_mask( bmx, bmy );
  1252. +        if( i_mvc )
  1253. +            x264_predictor_roundclip( mvc, i_mvc, mv_x_min, mv_x_max, mv_y_min, mv_y_max );
  1254.          for( int i = 0; i < i_mvc; i++ )
  1255.          {
  1256. -            int mx = (mvc[i][0] + 2) >> 2;
  1257. -            int my = (mvc[i][1] + 2) >> 2;
  1258. -            if( (mx | my) && ((mx-bmx) | (my-bmy)) )
  1259. +            if( M32( mvc[i] ) && (bmv - M32( mvc[i] )) )
  1260.              {
  1261. -                mx = x264_clip3( mx, mv_x_min, mv_x_max );
  1262. -                my = x264_clip3( my, mv_y_min, mv_y_max );
  1263. +                int mx = mvc[i][0];
  1264. +                int my = mvc[i][1];
  1265.                  COST_MV( mx, my );
  1266.              }
  1267.          }
  1268. --
  1269. 1.7.0.4
  1270.  
  1271.  
  1272. From 292fc5e6a7c842e70e752eea9d758ad857ac7873 Mon Sep 17 00:00:00 2001
  1273. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  1274. Date: Fri, 16 Apr 2010 11:36:43 -0700
  1275. Subject: [PATCH 6/6] Fix issues with extremely large timebases
  1276.  With timebase denominators >= 2^30, x264 would silently overflow and cause odd issues.
  1277.  Now x264 will explicitly fail with timebase denominators >= 2^31 and work with timebase denominators 2^31 > x >= 2^30.
  1278.  
  1279. ---
  1280. common/common.c       |   14 +++++++-------
  1281.  common/common.h       |    2 +-
  1282.  common/set.h          |    4 ++--
  1283.  encoder/encoder.c     |   22 +++++++++++++++-------
  1284.  encoder/ratecontrol.c |    4 ++--
  1285.  input/input.h         |   12 ++++++------
  1286.  input/timecode.c      |   32 +++++++++++++++++---------------
  1287.  input/y4m.c           |    3 ++-
  1288.  output/flv.c          |    4 ++--
  1289.  output/matroska.c     |    4 ++--
  1290.  output/mp4.c          |    2 +-
  1291.  x264.c                |    8 ++++----
  1292.  x264.h                |   11 +++++------
  1293.  13 files changed, 66 insertions(+), 56 deletions(-)
  1294.  
  1295. diff --git a/common/common.c b/common/common.c
  1296. index 924323a..6471c07 100644
  1297. --- a/common/common.c
  1298. +++ b/common/common.c
  1299. @@ -614,7 +614,7 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
  1300.      }
  1301.      OPT("fps")
  1302.      {
  1303. -        if( sscanf( value, "%d/%d", &p->i_fps_num, &p->i_fps_den ) == 2 )
  1304. +        if( sscanf( value, "%u/%u", &p->i_fps_num, &p->i_fps_den ) == 2 )
  1305.              ;
  1306.          else
  1307.          {
  1308. @@ -1119,11 +1119,11 @@ void x264_free( void *p )
  1309.  /****************************************************************************
  1310.   * x264_reduce_fraction:
  1311.   ****************************************************************************/
  1312. -void x264_reduce_fraction( int *n, int *d )
  1313. +void x264_reduce_fraction( uint32_t *n, uint32_t *d )
  1314.  {
  1315. -    int a = *n;
  1316. -    int b = *d;
  1317. -    int c;
  1318. +    uint32_t a = *n;
  1319. +    uint32_t b = *d;
  1320. +    uint32_t c;
  1321.      if( !a || !b )
  1322.          return;
  1323.      c = a % b;
  1324. @@ -1185,8 +1185,8 @@ char *x264_param2string( x264_param_t *p, int b_res )
  1325.      if( b_res )
  1326.      {
  1327.          s += sprintf( s, "%dx%d ", p->i_width, p->i_height );
  1328. -        s += sprintf( s, "fps=%d/%d ", p->i_fps_num, p->i_fps_den );
  1329. -        s += sprintf( s, "timebase=%d/%d ", p->i_timebase_num, p->i_timebase_den );
  1330. +        s += sprintf( s, "fps=%u/%u ", p->i_fps_num, p->i_fps_den );
  1331. +        s += sprintf( s, "timebase=%u/%u ", p->i_timebase_num, p->i_timebase_den );
  1332.      }
  1333.  
  1334.      s += sprintf( s, "cabac=%d", p->b_cabac );
  1335. diff --git a/common/common.h b/common/common.h
  1336. index ce2e7af..f4bd5dc 100644
  1337. --- a/common/common.h
  1338. +++ b/common/common.h
  1339. @@ -134,7 +134,7 @@ int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_sta
  1340.  /* log */
  1341.  void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
  1342.  
  1343. -void x264_reduce_fraction( int *n, int *d );
  1344. +void x264_reduce_fraction( uint32_t *n, uint32_t *d );
  1345.  void x264_init_vlc_tables();
  1346.  
  1347.  static ALWAYS_INLINE uint8_t x264_clip_uint8( int x )
  1348. diff --git a/common/set.h b/common/set.h
  1349. index 9783118..ee27d74 100644
  1350. --- a/common/set.h
  1351. +++ b/common/set.h
  1352. @@ -112,8 +112,8 @@ typedef struct
  1353.          int i_chroma_loc_bottom;
  1354.  
  1355.          int b_timing_info_present;
  1356. -        int i_num_units_in_tick;
  1357. -        int i_time_scale;
  1358. +        uint32_t i_num_units_in_tick;
  1359. +        uint32_t i_time_scale;
  1360.          int b_fixed_frame_rate;
  1361.  
  1362.          int b_nal_hrd_parameters_present;
  1363. diff --git a/encoder/encoder.c b/encoder/encoder.c
  1364. index 1438ec0..9b21d92 100644
  1365. --- a/encoder/encoder.c
  1366. +++ b/encoder/encoder.c
  1367. @@ -817,10 +817,10 @@ static void x264_set_aspect_ratio( x264_t *h, x264_param_t *param, int initial )
  1368.      /* VUI */
  1369.      if( param->vui.i_sar_width > 0 && param->vui.i_sar_height > 0 )
  1370.      {
  1371. -        int i_w = param->vui.i_sar_width;
  1372. -        int i_h = param->vui.i_sar_height;
  1373. -        int old_w = h->param.vui.i_sar_width;
  1374. -        int old_h = h->param.vui.i_sar_height;
  1375. +        uint32_t i_w = param->vui.i_sar_width;
  1376. +        uint32_t i_h = param->vui.i_sar_height;
  1377. +        uint32_t old_w = h->param.vui.i_sar_width;
  1378. +        uint32_t old_h = h->param.vui.i_sar_height;
  1379.  
  1380.          x264_reduce_fraction( &i_w, &i_h );
  1381.  
  1382. @@ -886,21 +886,29 @@ x264_t *x264_encoder_open( x264_param_t *param )
  1383.      h->i_frame = -1;
  1384.      h->i_frame_num = 0;
  1385.      h->i_idr_pic_id = 0;
  1386. +    uint64_t new_timebase_den = h->param.i_timebase_den;
  1387.      if( h->param.b_dts_compress )
  1388.      {
  1389.          /* h->i_dts_compress_multiplier == h->frames.i_bframe_delay + 1 */
  1390.          h->i_dts_compress_multiplier = h->param.i_bframe ? (h->param.i_bframe_pyramid ? 3 : 2) : 1;
  1391.          if( h->i_dts_compress_multiplier != 1 )
  1392.          {
  1393. -            x264_log( h, X264_LOG_DEBUG, "DTS compresion changed timebase: %d/%d -> %d/%d\n",
  1394. +            new_timebase_den = h->param.i_timebase_den * h->i_dts_compress_multiplier;
  1395. +            x264_log( h, X264_LOG_DEBUG, "DTS compression changed timebase: %u/%u -> %u/%"PRIu64"\n",
  1396.                        h->param.i_timebase_num, h->param.i_timebase_den,
  1397. -                      h->param.i_timebase_num, h->param.i_timebase_den * h->i_dts_compress_multiplier );
  1398. -            h->param.i_timebase_den *= h->i_dts_compress_multiplier;
  1399. +                      h->param.i_timebase_num, new_timebase_den );
  1400.          }
  1401.      }
  1402.      else
  1403.          h->i_dts_compress_multiplier = 1;
  1404.  
  1405. +    if( new_timebase_den * 2 >= (1ULL << 32) )
  1406. +    {
  1407. +        x264_log( h, X264_LOG_ERROR, "Effective timebase denominator %"PRIu64" exceeds H.264 maximum\n", new_timebase_den );
  1408. +        goto fail;
  1409. +    }
  1410. +    h->param.i_timebase_den = new_timebase_den;
  1411. +
  1412.      h->sps = &h->sps_array[0];
  1413.      x264_sps_init( h->sps, h->param.i_sps_id, &h->param );
  1414.  
  1415. diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
  1416. index b51dbf7..8dd38f1 100644
  1417. --- a/encoder/ratecontrol.c
  1418. +++ b/encoder/ratecontrol.c
  1419. @@ -657,14 +657,14 @@ int x264_ratecontrol_new( x264_t *h )
  1420.                  return -1;
  1421.              }
  1422.  
  1423. -            if( ( p = strstr( opts, "timebase=" ) ) && sscanf( p, "timebase=%d/%d", &i, &j ) != 2 )
  1424. +            if( ( p = strstr( opts, "timebase=" ) ) && sscanf( p, "timebase=%u/%u", &i, &j ) != 2 )
  1425.              {
  1426.                  x264_log( h, X264_LOG_ERROR, "timebase specified in stats file not valid\n" );
  1427.                  return -1;
  1428.              }
  1429.              if( i != h->param.i_timebase_num || j != h->param.i_timebase_den )
  1430.              {
  1431. -                x264_log( h, X264_LOG_ERROR, "timebase mismatch with 1st pass (%d/%d vs %d/%d)\n",
  1432. +                x264_log( h, X264_LOG_ERROR, "timebase mismatch with 1st pass (%u/%u vs %u/%u)\n",
  1433.                            h->param.i_timebase_num, h->param.i_timebase_den, i, j );
  1434.                  return -1;
  1435.              }
  1436. diff --git a/input/input.h b/input/input.h
  1437. index b6cd218..eb62fdd 100644
  1438. --- a/input/input.h
  1439. +++ b/input/input.h
  1440. @@ -38,15 +38,15 @@ typedef struct
  1441.  typedef struct
  1442.  {
  1443.      int csp; /* X264_CSP_YV12 or X264_CSP_I420 */
  1444. -    int fps_num;
  1445. -    int fps_den;
  1446. +    uint32_t fps_num;
  1447. +    uint32_t fps_den;
  1448.      int height;
  1449.      int interlaced;
  1450. -    int sar_width;
  1451. -    int sar_height;
  1452. +    uint32_t sar_width;
  1453. +    uint32_t sar_height;
  1454.      int tff;
  1455. -    int timebase_num;
  1456. -    int timebase_den;
  1457. +    uint32_t timebase_num;
  1458. +    uint32_t timebase_den;
  1459.      int vfr;
  1460.      int width;
  1461.  } video_info_t;
  1462. diff --git a/input/timecode.c b/input/timecode.c
  1463. index 5fabe61..008cb19 100644
  1464. --- a/input/timecode.c
  1465. +++ b/input/timecode.c
  1466. @@ -32,8 +32,8 @@ typedef struct
  1467.      int frame_total;
  1468.      int auto_timebase_num;
  1469.      int auto_timebase_den;
  1470. -    int timebase_num;
  1471. -    int timebase_den;
  1472. +    int64_t timebase_num;
  1473. +    int64_t timebase_den;
  1474.      int seek;
  1475.      int stored_pts_num;
  1476.      int64_t *pts;
  1477. @@ -53,7 +53,7 @@ static inline double sigexp10( double value, double *exponent )
  1478.  
  1479.  static double correct_fps( double fps, timecode_hnd_t *h )
  1480.  {
  1481. -    int64_t i = 1;
  1482. +    int i = 1;
  1483.      int64_t fps_num, fps_den;
  1484.      double exponent;
  1485.      double fps_sig = sigexp10( fps, &exponent );
  1486. @@ -61,7 +61,7 @@ static double correct_fps( double fps, timecode_hnd_t *h )
  1487.      {
  1488.          fps_den = i * h->timebase_num;
  1489.          fps_num = round( fps_den * fps_sig ) * exponent;
  1490. -        if( fps_num < 0 )
  1491. +        if( fps_num > UINT_MAX )
  1492.          {
  1493.              fprintf( stderr, "timecode [error]: tcfile fps correction failed.\n"
  1494.                               "                  Specify an appropriate timebase manually or remake tcfile.\n" );
  1495. @@ -74,7 +74,7 @@ static double correct_fps( double fps, timecode_hnd_t *h )
  1496.      if( h->auto_timebase_den )
  1497.      {
  1498.          h->timebase_den = h->timebase_den ? lcm( h->timebase_den, fps_num ) : fps_num;
  1499. -        if( h->timebase_den < 0 )
  1500. +        if( h->timebase_den > UINT_MAX )
  1501.              h->auto_timebase_den = 0;
  1502.      }
  1503.      return (double)fps_num / fps_den;
  1504. @@ -86,12 +86,12 @@ static int try_mkv_timebase_den( double *fpss, timecode_hnd_t *h, int loop_num )
  1505.      h->timebase_den = MKV_TIMEBASE_DEN;
  1506.      for( int num = 0; num < loop_num; num++ )
  1507.      {
  1508. -        int fps_den;
  1509. +        int64_t fps_den;
  1510.          double exponent;
  1511.          double fps_sig = sigexp10( fpss[num], &exponent );
  1512.          fps_den = round( MKV_TIMEBASE_DEN / fps_sig ) / exponent;
  1513.          h->timebase_num = fps_den > 0 && h->timebase_num ? gcd( h->timebase_num, fps_den ) : fps_den;
  1514. -        if( h->timebase_num <= 0 )
  1515. +        if( h->timebase_num > UINT_MAX || !h->timebase_num )
  1516.          {
  1517.              fprintf( stderr, "timecode [error]: automatic timebase generation failed.\n"
  1518.                               "                  Specify timebase manually.\n" );
  1519. @@ -305,19 +305,19 @@ static int parse_tcfile( FILE *tcfile_in, timecode_hnd_t *h, video_info_t *info
  1520.                  if( h->timebase_den >= 0 )
  1521.                  {
  1522.                      int i = 1;
  1523. -                    int fps_num, fps_den;
  1524. +                    int64_t fps_num, fps_den;
  1525.                      double exponent;
  1526.                      double fps_sig = sigexp10( fpss[num], &exponent );
  1527.                      while( 1 )
  1528.                      {
  1529.                          fps_den = i * h->timebase_num;
  1530.                          fps_num = round( fps_den * fps_sig ) * exponent;
  1531. -                        if( fps_num < 0 || fabs( ((double)fps_num / fps_den) / exponent - fps_sig ) < DOUBLE_EPSILON )
  1532. +                        if( fps_num > UINT_MAX || fabs( ((double)fps_num / fps_den) / exponent - fps_sig ) < DOUBLE_EPSILON )
  1533.                              break;
  1534.                          ++i;
  1535.                      }
  1536.                      h->timebase_den = fps_num > 0 && h->timebase_den ? lcm( h->timebase_den, fps_num ) : fps_num;
  1537. -                    if( h->timebase_den < 0 )
  1538. +                    if( h->timebase_den > UINT_MAX )
  1539.                      {
  1540.                          h->auto_timebase_den = 0;
  1541.                          continue;
  1542. @@ -339,10 +339,12 @@ static int parse_tcfile( FILE *tcfile_in, timecode_hnd_t *h, video_info_t *info
  1543.  
  1544.      if( h->auto_timebase_den || h->auto_timebase_num )
  1545.      {
  1546. -        x264_reduce_fraction( &h->timebase_num, &h->timebase_den );
  1547. -        fprintf( stderr, "timecode [info]: automatic timebase generation %d/%d\n", h->timebase_num, h->timebase_den );
  1548. +        int64_t i = gcd( h->timebase_num, h->timebase_den );
  1549. +        h->timebase_num /= i;
  1550. +        h->timebase_den /= i;
  1551. +        fprintf( stderr, "timecode [info]: automatic timebase generation %"PRId64"/%"PRId64"\n", h->timebase_num, h->timebase_den );
  1552.      }
  1553. -    else if( h->timebase_den <= 0 )
  1554. +    else if( h->timebase_den > UINT_MAX || !h->timebase_den )
  1555.      {
  1556.          fprintf( stderr, "timecode [error]: automatic timebase generation failed.\n"
  1557.                           "                  Specify an appropriate timebase manually.\n" );
  1558. @@ -394,9 +396,9 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c
  1559.      h->frame_total = input.get_frame_total( h->p_handle );
  1560.      h->seek = opt->seek;
  1561.      if( opt->timebase )
  1562. -        ret = sscanf( opt->timebase, "%d/%d", &h->timebase_num, &h->timebase_den );
  1563. +        ret = sscanf( opt->timebase, "%"SCNd64"/%"SCNd64, &h->timebase_num, &h->timebase_den );
  1564.      if( ret == 1 )
  1565. -        h->timebase_num = atoi( opt->timebase );
  1566. +        h->timebase_num = strtoul( opt->timebase, NULL, 10 );
  1567.      h->auto_timebase_num = !ret;
  1568.      h->auto_timebase_den = ret < 2;
  1569.      if( h->auto_timebase_num )
  1570. diff --git a/input/y4m.c b/input/y4m.c
  1571. index c34f264..842b986 100644
  1572. --- a/input/y4m.c
  1573. +++ b/input/y4m.c
  1574. @@ -40,7 +40,8 @@ typedef struct
  1575.  static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
  1576.  {
  1577.      y4m_hnd_t *h = malloc( sizeof(y4m_hnd_t) );
  1578. -    int  i, n, d;
  1579. +    int i;
  1580. +    uint32_t n, d;
  1581.      char header[MAX_YUV4_HEADER+10];
  1582.      char *tokend, *header_end;
  1583.      int colorspace = X264_CSP_NONE;
  1584. diff --git a/output/flv.c b/output/flv.c
  1585. index 04f4428..e441b6d 100644
  1586. --- a/output/flv.c
  1587. +++ b/output/flv.c
  1588. @@ -47,8 +47,8 @@ typedef struct
  1589.      int64_t i_prev_dts;
  1590.      int64_t i_prev_pts;
  1591.  
  1592. -    int i_timebase_num;
  1593. -    int i_timebase_den;
  1594. +    uint32_t i_timebase_num;
  1595. +    uint32_t i_timebase_den;
  1596.      int b_vfr_input;
  1597.  
  1598.      unsigned start;
  1599. diff --git a/output/matroska.c b/output/matroska.c
  1600. index 47753d7..0304c84 100644
  1601. --- a/output/matroska.c
  1602. +++ b/output/matroska.c
  1603. @@ -30,8 +30,8 @@ typedef struct
  1604.      int64_t frame_duration;
  1605.  
  1606.      char b_writing_frame;
  1607. -    int i_timebase_num;
  1608. -    int i_timebase_den;
  1609. +    uint32_t i_timebase_num;
  1610. +    uint32_t i_timebase_den;
  1611.  
  1612.  } mkv_hnd_t;
  1613.  
  1614. diff --git a/output/mp4.c b/output/mp4.c
  1615. index cbe9f5c..f76541e 100644
  1616. --- a/output/mp4.c
  1617. +++ b/output/mp4.c
  1618. @@ -38,7 +38,7 @@ typedef struct
  1619.      GF_ISOSample *p_sample;
  1620.      int i_track;
  1621.      uint32_t i_descidx;
  1622. -    int i_time_res;
  1623. +    uint32_t i_time_res;
  1624.      int64_t i_time_inc;
  1625.      int i_numframe;
  1626.      int i_delay_time;
  1627. diff --git a/x264.c b/x264.c
  1628. index 3f46fd9..cabdb1d 100644
  1629. --- a/x264.c
  1630. +++ b/x264.c
  1631. @@ -1205,9 +1205,9 @@ generic_option:
  1632.      }
  1633.      if( !tcfile_name && input_opt.timebase )
  1634.      {
  1635. -        int i_user_timebase_num;
  1636. -        int i_user_timebase_den;
  1637. -        int ret = sscanf( input_opt.timebase, "%d/%d", &i_user_timebase_num, &i_user_timebase_den );
  1638. +        uint32_t i_user_timebase_num;
  1639. +        uint32_t i_user_timebase_den;
  1640. +        int ret = sscanf( input_opt.timebase, "%u/%u", &i_user_timebase_num, &i_user_timebase_den );
  1641.          if( !ret )
  1642.          {
  1643.              fprintf( stderr, "x264 [error]: invalid argument: timebase = %s\n", input_opt.timebase );
  1644. @@ -1216,7 +1216,7 @@ generic_option:
  1645.          else if( ret == 1 )
  1646.          {
  1647.              i_user_timebase_num = param->i_timebase_num;
  1648. -            i_user_timebase_den = atoi( input_opt.timebase );
  1649. +            i_user_timebase_den = strtoul( input_opt.timebase, NULL, 10 );
  1650.          }
  1651.          opt->timebase_convert_multiplier = ((double)i_user_timebase_den / param->i_timebase_den)
  1652.                                           * ((double)param->i_timebase_num / i_user_timebase_num);
  1653. diff --git a/x264.h b/x264.h
  1654. index d30effe..83f087e 100644
  1655. --- a/x264.h
  1656. +++ b/x264.h
  1657. @@ -35,7 +35,7 @@
  1658.  
  1659.  #include <stdarg.h>
  1660.  
  1661. -#define X264_BUILD 93
  1662. +#define X264_BUILD 94
  1663.  
  1664.  /* x264_t:
  1665.   *      opaque handler for encoder */
  1666. @@ -208,9 +208,6 @@ typedef struct x264_param_t
  1667.          int         i_chroma_loc;    /* both top & bottom */
  1668.      } vui;
  1669.  
  1670. -    int         i_fps_num;
  1671. -    int         i_fps_den;
  1672. -
  1673.      /* Bitstream parameters */
  1674.      int         i_frame_reference;  /* Maximum number of reference frames */
  1675.      int         i_keyint_max;       /* Force an IDR keyframe at this interval */
  1676. @@ -330,8 +327,10 @@ typedef struct x264_param_t
  1677.                                   * otherwise place size (4 bytes) before NAL units. */
  1678.      int i_sps_id;               /* SPS and PPS id number */
  1679.      int b_vfr_input;            /* VFR input */
  1680. -    int i_timebase_num;         /* Timebase numerator */
  1681. -    int i_timebase_den;         /* Timebase denominator */
  1682. +    uint32_t i_fps_num;
  1683. +    uint32_t i_fps_den;
  1684. +    uint32_t i_timebase_num;    /* Timebase numerator */
  1685. +    uint32_t i_timebase_den;    /* Timebase denominator */
  1686.      int b_dts_compress;         /* DTS compression: this algorithm eliminates negative DTS
  1687.                                   * by compressing them to be less than the second PTS.
  1688.                                   * Warning: this will change the timebase! */
  1689. --
  1690. 1.7.0.4
RAW Paste Data
Top