Advertisement
Guest User

Untitled

a guest
May 11th, 2017
603
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 111.98 KB | None | 0 0
  1. From 4784723450ae1dd28ede1ff04a93f1849d6444e5 Mon Sep 17 00:00:00 2001
  2. From: Anton Mitrofanov <BugMaster@narod.ru>
  3. Date: Tue, 16 Feb 2010 09:41:55 -0800
  4. Subject: [PATCH 01/16] Fix I and B-frame QPs with threads
  5. Rounding errors resulted in slightly wrong QPs with threads enabled.
  6.  
  7. ---
  8. encoder/ratecontrol.c | 6 +++---
  9. 1 files changed, 3 insertions(+), 3 deletions(-)
  10.  
  11. diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
  12. index 8c61582..3d86aaa 100644
  13. --- a/encoder/ratecontrol.c
  14. +++ b/encoder/ratecontrol.c
  15. @@ -1077,15 +1077,15 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp, int overhead )
  16.  
  17. rc->qpa_rc =
  18. rc->qpa_aq = 0;
  19. - h->fdec->f_qp_avg_rc =
  20. - h->fdec->f_qp_avg_aq =
  21. rc->qpm =
  22. rc->qp = x264_clip3( (int)(q + 0.5), 0, 51 );
  23. + h->fdec->f_qp_avg_rc =
  24. + h->fdec->f_qp_avg_aq =
  25. rc->f_qpm = q;
  26. if( rce )
  27. rce->new_qp = rc->qp;
  28.  
  29. - accum_p_qp_update( h, rc->qp );
  30. + accum_p_qp_update( h, rc->f_qpm );
  31.  
  32. if( h->sh.i_type != SLICE_TYPE_B )
  33. rc->last_non_b_pict_type = h->sh.i_type;
  34. --
  35. 1.6.1.2
  36.  
  37.  
  38. From 28e6eb67ffaa002469f60c40e2b5d58b2a758f9c Mon Sep 17 00:00:00 2001
  39. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  40. Date: Mon, 22 Feb 2010 11:21:51 -0800
  41. Subject: [PATCH 02/16] Fix integer overflow in chroma SSD check
  42. Could cause bad skips at very high quantizers on extreme inputs.
  43.  
  44. ---
  45. encoder/rdo.c | 4 ++--
  46. 1 files changed, 2 insertions(+), 2 deletions(-)
  47.  
  48. diff --git a/encoder/rdo.c b/encoder/rdo.c
  49. index 3ed4a47..e15f47d 100644
  50. --- a/encoder/rdo.c
  51. +++ b/encoder/rdo.c
  52. @@ -131,7 +131,7 @@ static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
  53. static inline int ssd_mb( x264_t *h )
  54. {
  55. int chromassd = ssd_plane(h, PIXEL_8x8, 1, 0, 0) + ssd_plane(h, PIXEL_8x8, 2, 0, 0);
  56. - chromassd = (chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
  57. + chromassd = ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
  58. return ssd_plane(h, PIXEL_16x16, 0, 0, 0) + chromassd;
  59. }
  60.  
  61. @@ -223,7 +223,7 @@ uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel )
  62.  
  63. chromassd = ssd_plane( h, i_pixel+3, 1, (i8&1)*4, (i8>>1)*4 )
  64. + ssd_plane( h, i_pixel+3, 2, (i8&1)*4, (i8>>1)*4 );
  65. - chromassd = (chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
  66. + chromassd = ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
  67. i_ssd = ssd_plane( h, i_pixel, 0, (i8&1)*8, (i8>>1)*8 ) + chromassd;
  68.  
  69. if( h->param.b_cabac )
  70. --
  71. 1.6.1.2
  72.  
  73.  
  74. From f0da96145cb068ade0f0232d0682137c9065929f Mon Sep 17 00:00:00 2001
  75. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  76. Date: Mon, 22 Feb 2010 13:04:47 -0800
  77. Subject: [PATCH 03/16] Fix overread of scratch buffer
  78. Could cause crashes on non-mod16 frames.
  79.  
  80. ---
  81. encoder/encoder.c | 2 +-
  82. 1 files changed, 1 insertions(+), 1 deletions(-)
  83.  
  84. diff --git a/encoder/encoder.c b/encoder/encoder.c
  85. index df62389..89bf457 100644
  86. --- a/encoder/encoder.c
  87. +++ b/encoder/encoder.c
  88. @@ -1055,7 +1055,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
  89. /* Allocate scratch buffer */
  90. for( i = 0; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
  91. {
  92. - int buf_hpel = (h->param.i_width+48) * sizeof(int16_t);
  93. + int buf_hpel = (h->fdec->i_width[0]+48) * sizeof(int16_t);
  94. int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
  95. int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
  96. int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
  97. --
  98. 1.6.1.2
  99.  
  100.  
  101. From 25292b825a42b577bd121c48d2508f3b4aa7a9eb Mon Sep 17 00:00:00 2001
  102. From: Yusuke Nakamura <muken.the.vfrmaniac@gmail.com>
  103. Date: Tue, 16 Feb 2010 11:05:21 -0800
  104. Subject: [PATCH 04/16] Add GPAC version check
  105.  
  106. ---
  107. configure | 8 +++++++-
  108. 1 files changed, 7 insertions(+), 1 deletions(-)
  109.  
  110. diff --git a/configure b/configure
  111. index 25f5458..d0ff43a 100755
  112. --- a/configure
  113. +++ b/configure
  114. @@ -584,7 +584,13 @@ if [ $SYS = MINGW ]; then
  115. fi
  116. if [ "$mp4_output" = "auto" ] ; then
  117. mp4_output="no"
  118. - cc_check gpac/isomedia.h "$MP4_LDFLAGS" && mp4_output="yes"
  119. + if cc_check gpac/isomedia.h "$MP4_LDFLAGS" ; then
  120. + if cc_check gpac/isomedia.h "$MP4_LDFLAGS" "gf_isom_set_pixel_aspect_ratio(0,0,0,0,0);" ; then
  121. + mp4_output="yes"
  122. + else
  123. + echo "Warning: gpac is too old, update to 2007-06-21 UTC or later"
  124. + fi
  125. + fi
  126. fi
  127. if [ "$mp4_output" = "yes" ] ; then
  128. define MP4_OUTPUT
  129. --
  130. 1.6.1.2
  131.  
  132.  
  133. From 5234f855a23607ae0dbfce9eeb0c69007e9d69e4 Mon Sep 17 00:00:00 2001
  134. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  135. Date: Sun, 21 Feb 2010 14:21:26 -0800
  136. Subject: [PATCH 05/16] SimpleBlock requires Matroska Doctype v2
  137.  
  138. ---
  139. output/matroska_ebml.c | 4 ++--
  140. 1 files changed, 2 insertions(+), 2 deletions(-)
  141.  
  142. diff --git a/output/matroska_ebml.c b/output/matroska_ebml.c
  143. index 7265909..89790b7 100644
  144. --- a/output/matroska_ebml.c
  145. +++ b/output/matroska_ebml.c
  146. @@ -338,8 +338,8 @@ int mk_writeHeader( mk_writer *w, const char *writing_app,
  147. CHECK( mk_write_uint( c, 0x42f2, 4 ) ); // EBMLMaxIDLength
  148. CHECK( mk_write_uint( c, 0x42f3, 8 ) ); // EBMLMaxSizeLength
  149. CHECK( mk_write_string( c, 0x4282, "matroska") ); // DocType
  150. - CHECK( mk_write_uint( c, 0x4287, 1 ) ); // DocTypeVersion
  151. - CHECK( mk_write_uint( c, 0x4285, 1 ) ); // DocTypeReadversion
  152. + CHECK( mk_write_uint( c, 0x4287, 2 ) ); // DocTypeVersion
  153. + CHECK( mk_write_uint( c, 0x4285, 2 ) ); // DocTypeReadversion
  154. CHECK( mk_close_context( c, 0 ) );
  155.  
  156. if( !(c = mk_create_context( w, w->root, 0x18538067 )) ) // Segment
  157. --
  158. 1.6.1.2
  159.  
  160.  
  161. From fff9312827eb936da8da24a426e167494208d195 Mon Sep 17 00:00:00 2001
  162. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  163. Date: Tue, 16 Feb 2010 10:13:33 -0800
  164. Subject: [PATCH 06/16] Much faster and simpler direct spatial calculation
  165.  
  166. ---
  167. common/macroblock.c | 130 ++++++++++++++++++++++++--------------------------
  168. 1 files changed, 62 insertions(+), 68 deletions(-)
  169.  
  170. diff --git a/common/macroblock.c b/common/macroblock.c
  171. index 278659c..19cd371 100644
  172. --- a/common/macroblock.c
  173. +++ b/common/macroblock.c
  174. @@ -36,8 +36,6 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mv
  175. int i_refc = h->mb.cache.ref[i_list][i8 - 8 + i_width];
  176. int16_t *mv_c = h->mb.cache.mv[i_list][i8 - 8 + i_width];
  177.  
  178. - int i_count = 0;
  179. -
  180. if( (idx&3) >= 2 + (i_width&1) || i_refc == -2 )
  181. {
  182. i_refc = h->mb.cache.ref[i_list][i8 - 8 - 1];
  183. @@ -83,9 +81,7 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mv
  184. }
  185. }
  186.  
  187. - if( i_refa == i_ref ) i_count++;
  188. - if( i_refb == i_ref ) i_count++;
  189. - if( i_refc == i_ref ) i_count++;
  190. + int i_count = (i_refa == i_ref) + (i_refb == i_ref) + (i_refc == i_ref);
  191.  
  192. if( i_count > 1 )
  193. {
  194. @@ -115,18 +111,13 @@ void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int16_t mvp[2]
  195. int16_t *mv_b = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8];
  196. int i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 + 4];
  197. int16_t *mv_c = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 + 4];
  198. -
  199. - int i_count = 0;
  200. -
  201. if( i_refc == -2 )
  202. {
  203. i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 - 1];
  204. mv_c = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 - 1];
  205. }
  206.  
  207. - if( i_refa == i_ref ) i_count++;
  208. - if( i_refb == i_ref ) i_count++;
  209. - if( i_refc == i_ref ) i_count++;
  210. + int i_count = (i_refa == i_ref) + (i_refb == i_ref) + (i_refc == i_ref);
  211.  
  212. if( i_count > 1 )
  213. {
  214. @@ -196,7 +187,7 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
  215. if( i_ref >= 0 )
  216. {
  217. const int dist_scale_factor = h->mb.dist_scale_factor[i_ref][0];
  218. - const int16_t *mv_col = h->fref1[0]->mv[0][ i_mb_4x4 + 3*x8 + 3*y8 * h->mb.i_b4_stride];
  219. + const int16_t *mv_col = h->fref1[0]->mv[0][i_mb_4x4 + 3*x8 + 3*y8 * h->mb.i_b4_stride];
  220. const int l0x = ( dist_scale_factor * mv_col[0] + 128 ) >> 8;
  221. const int l0y = ( dist_scale_factor * mv_col[1] + 128 ) >> 8;
  222. if( h->param.i_threads > 1 && (l0y > h->mb.mv_max_spel[1] || l0y-mv_col[1] > h->mb.mv_max_spel[1]) )
  223. @@ -221,58 +212,67 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
  224.  
  225. static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
  226. {
  227. - int ref[2];
  228. + int8_t ref[2];
  229. ALIGNED_ARRAY_8( int16_t, mv,[2],[2] );
  230. - int i_list;
  231. - int i8;
  232. - const int8_t *l1ref0 = &h->fref1[0]->ref[0][ h->mb.i_b8_xy ];
  233. - const int8_t *l1ref1 = &h->fref1[0]->ref[1][ h->mb.i_b8_xy ];
  234. - const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->fref1[0]->mv[0][ h->mb.i_b4_xy ];
  235. - const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->fref1[0]->mv[1][ h->mb.i_b4_xy ];
  236. - const int type_col = h->fref1[0]->mb_type[ h->mb.i_mb_xy ];
  237. + int i_list, i8, i_ref;
  238. + const int8_t *l1ref0 = &h->fref1[0]->ref[0][h->mb.i_b8_xy];
  239. + const int8_t *l1ref1 = &h->fref1[0]->ref[1][h->mb.i_b8_xy];
  240. + const int16_t (*l1mv[2])[2] = { (const int16_t (*)[2]) &h->fref1[0]->mv[0][h->mb.i_b4_xy],
  241. + (const int16_t (*)[2]) &h->fref1[0]->mv[1][h->mb.i_b4_xy] };
  242. + const int type_col = h->fref1[0]->mb_type[h->mb.i_mb_xy];
  243.  
  244. - for( i_list=0; i_list<2; i_list++ )
  245. + for( i_list = 0; i_list < 2; i_list++ )
  246. {
  247. - int i_refa = h->mb.cache.ref[i_list][X264_SCAN8_0 - 1];
  248. - int i_refb = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8];
  249. - int i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 + 4];
  250. + int i_refa = h->mb.cache.ref[i_list][X264_SCAN8_0 - 1];
  251. + int16_t *mv_a = h->mb.cache.mv[i_list][X264_SCAN8_0 - 1];
  252. + int i_refb = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8];
  253. + int16_t *mv_b = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8];
  254. + int i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 + 4];
  255. + int16_t *mv_c = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 + 4];
  256. if( i_refc == -2 )
  257. + {
  258. i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 - 1];
  259. + mv_c = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 - 1];
  260. + }
  261. +
  262. + i_ref = X264_MIN3( (unsigned)i_refa, (unsigned)i_refb, (unsigned)i_refc );
  263. + if( i_ref < 0 )
  264. + {
  265. + i_ref = -1;
  266. + M32( mv[i_list] ) = 0;
  267. + }
  268. + else
  269. + {
  270. + /* Same as x264_mb_predict_mv_16x16, but simplified to eliminate cases
  271. + * not relevant to spatial direct. */
  272. + int i_count = (i_refa == i_ref) + (i_refb == i_ref) + (i_refc == i_ref);
  273. +
  274. + if( i_count > 1 )
  275. + x264_median_mv( mv[i_list], mv_a, mv_b, mv_c );
  276. + else
  277. + {
  278. + if( i_refa == i_ref )
  279. + CP32( mv[i_list], mv_a );
  280. + else if( i_refb == i_ref )
  281. + CP32( mv[i_list], mv_b );
  282. + else
  283. + CP32( mv[i_list], mv_c );
  284. + }
  285. + }
  286.  
  287. - ref[i_list] = i_refa;
  288. - if( ref[i_list] < 0 || ( i_refb < ref[i_list] && i_refb >= 0 ))
  289. - ref[i_list] = i_refb;
  290. - if( ref[i_list] < 0 || ( i_refc < ref[i_list] && i_refc >= 0 ))
  291. - ref[i_list] = i_refc;
  292. - if( ref[i_list] < 0 )
  293. - ref[i_list] = -1;
  294. + x264_macroblock_cache_ref( h, 0, 0, 4, 4, i_list, i_ref );
  295. + x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, i_list, mv[i_list] );
  296. + ref[i_list] = i_ref;
  297. }
  298.  
  299. - if( ref[0] < 0 && ref[1] < 0 )
  300. + if( (M16( ref ) & 0x8080) == 0x8080 ) /* if( ref[0] < 0 && ref[1] < 0 ) */
  301. {
  302. x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
  303. x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, 0 );
  304. - x264_macroblock_cache_mv( h, 0, 0, 4, 4, 0, 0 );
  305. - x264_macroblock_cache_mv( h, 0, 0, 4, 4, 1, 0 );
  306. return 1;
  307. }
  308.  
  309. - if( ref[0] >= 0 )
  310. - x264_mb_predict_mv_16x16( h, 0, ref[0], mv[0] );
  311. - else
  312. - M32( mv[0] ) = 0;
  313. -
  314. - if( ref[1] >= 0 )
  315. - x264_mb_predict_mv_16x16( h, 1, ref[1], mv[1] );
  316. - else
  317. - M32( mv[1] ) = 0;
  318. -
  319. - x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, ref[0] );
  320. - x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, ref[1] );
  321. - x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, mv[0] );
  322. - x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, mv[1] );
  323. -
  324. - if( !M64( mv ) )
  325. + if( !M64( mv ) || IS_INTRA( type_col ) || (ref[0]&&ref[1]) )
  326. return 1;
  327.  
  328. if( h->param.i_threads > 1
  329. @@ -287,31 +287,25 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
  330. return 0;
  331. }
  332.  
  333. - if( IS_INTRA( type_col ) || (ref[0]&&ref[1]) )
  334. - return 1;
  335. -
  336. /* col_zero_flag */
  337. - for( i8=0; i8<4; i8++ )
  338. + for( i8 = 0; i8 < 4; i8++ )
  339. {
  340. - const int x8 = i8%2;
  341. - const int y8 = i8/2;
  342. + const int x8 = i8&1;
  343. + const int y8 = i8>>1;
  344. const int o8 = x8 + y8 * h->mb.i_b8_stride;
  345. const int o4 = 3*(x8 + y8 * h->mb.i_b4_stride);
  346. + int idx;
  347. if( l1ref0[o8] == 0 )
  348. - {
  349. - if( abs( l1mv0[o4][0] ) <= 1 && abs( l1mv0[o4][1] ) <= 1 )
  350. - {
  351. - if( ref[0] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, 0 );
  352. - if( ref[1] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 1, 0 );
  353. - }
  354. - }
  355. + idx = 0;
  356. else if( l1ref0[o8] < 0 && l1ref1[o8] == 0 )
  357. + idx = 1;
  358. + else
  359. + continue;
  360. +
  361. + if( abs( l1mv[idx][o4][0] ) <= 1 && abs( l1mv[idx][o4][1] ) <= 1 )
  362. {
  363. - if( abs( l1mv1[o4][0] ) <= 1 && abs( l1mv1[o4][1] ) <= 1 )
  364. - {
  365. - if( ref[0] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, 0 );
  366. - if( ref[1] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 1, 0 );
  367. - }
  368. + if( ref[0] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, 0 );
  369. + if( ref[1] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 1, 0 );
  370. }
  371. }
  372.  
  373. --
  374. 1.6.1.2
  375.  
  376.  
  377. From 4a1303d128a4f7a9df81321940f789022695a9ad Mon Sep 17 00:00:00 2001
  378. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  379. Date: Wed, 17 Feb 2010 22:41:16 -0800
  380. Subject: [PATCH 07/16] Keep track of macroblock partitions
  381. Allows vastly simpler motion compensation and direct MV calculation.
  382.  
  383. ---
  384. common/common.h | 2 +
  385. common/frame.c | 1 +
  386. common/frame.h | 1 +
  387. common/macroblock.c | 233 +++++++++++++++++++++++++--------------------------
  388. encoder/analyse.c | 1 +
  389. 5 files changed, 121 insertions(+), 117 deletions(-)
  390.  
  391. diff --git a/common/common.h b/common/common.h
  392. index e2e8fac..68f79ba 100644
  393. --- a/common/common.h
  394. +++ b/common/common.h
  395. @@ -519,6 +519,7 @@ struct x264_t
  396.  
  397. /* mb table */
  398. int8_t *type; /* mb type */
  399. + uint8_t *partition; /* mb partition */
  400. int8_t *qp; /* mb qp */
  401. int16_t *cbp; /* mb cbp: 0x0?: luma, 0x?0: chroma, 0x100: luma dc, 0x0200 and 0x0400: chroma dc (all set for PCM)*/
  402. int8_t (*intra4x4_pred_mode)[8]; /* intra4x4 pred mode. for non I4x4 set to I_PRED_4x4_DC(2) */
  403. @@ -627,6 +628,7 @@ struct x264_t
  404.  
  405. ALIGNED_4( int16_t direct_mv[2][4][2] );
  406. ALIGNED_4( int8_t direct_ref[2][4] );
  407. + int direct_partition;
  408. ALIGNED_4( int16_t pskip_mv[2] );
  409.  
  410. /* number of neighbors (top and left) that used 8x8 dct */
  411. diff --git a/common/frame.c b/common/frame.c
  412. index d89f5ab..2798f25 100644
  413. --- a/common/frame.c
  414. +++ b/common/frame.c
  415. @@ -95,6 +95,7 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
  416. if( b_fdec ) /* fdec frame */
  417. {
  418. CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
  419. + CHECKED_MALLOC( frame->mb_partition, i_mb_count * sizeof(uint8_t));
  420. CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
  421. CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
  422. if( h->param.i_bframe )
  423. diff --git a/common/frame.h b/common/frame.h
  424. index 7c8e2ff..6e7de50 100644
  425. --- a/common/frame.h
  426. +++ b/common/frame.h
  427. @@ -75,6 +75,7 @@ typedef struct x264_frame
  428.  
  429. /* motion data */
  430. int8_t *mb_type;
  431. + uint8_t *mb_partition;
  432. int16_t (*mv[2])[2];
  433. int16_t (*lowres_mvs[2][X264_BFRAME_MAX+1])[2];
  434. uint16_t (*lowres_costs[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2]);
  435. diff --git a/common/macroblock.c b/common/macroblock.c
  436. index 19cd371..2573415 100644
  437. --- a/common/macroblock.c
  438. +++ b/common/macroblock.c
  439. @@ -165,9 +165,12 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
  440. int i_mb_8x8 = 4 * h->mb.i_mb_stride * h->mb.i_mb_y + 2 * h->mb.i_mb_x;
  441. int i8;
  442. const int type_col = h->fref1[0]->mb_type[h->mb.i_mb_xy];
  443. + const int partition_col = h->fref1[0]->mb_partition[h->mb.i_mb_xy];
  444.  
  445. x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, 0 );
  446.  
  447. + h->mb.i_partition = partition_col;
  448. +
  449. if( IS_INTRA( type_col ) )
  450. {
  451. x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
  452. @@ -176,7 +179,15 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
  453. return 1;
  454. }
  455.  
  456. - for( i8 = 0; i8 < 4; i8++ )
  457. + /* Don't do any checks other than the ones we have to, based
  458. + * on the size of the colocated partitions.
  459. + * Depends on the enum order: D_8x8, D_16x8, D_8x16, D_16x16 */
  460. + int max_i8 = (D_16x16 - partition_col) + 1;
  461. + int step = (partition_col == D_16x8) + 1;
  462. + int width = 4 >> ((D_16x16 - partition_col)&1);
  463. + int height = 4 >> ((D_16x16 - partition_col)>>1);
  464. +
  465. + for( i8 = 0; i8 < max_i8; i8 += step )
  466. {
  467. const int x8 = i8%2;
  468. const int y8 = i8/2;
  469. @@ -192,9 +203,9 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
  470. const int l0y = ( dist_scale_factor * mv_col[1] + 128 ) >> 8;
  471. if( h->param.i_threads > 1 && (l0y > h->mb.mv_max_spel[1] || l0y-mv_col[1] > h->mb.mv_max_spel[1]) )
  472. return 0;
  473. - x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
  474. - x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, pack16to32_mask(l0x, l0y) );
  475. - x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 1, pack16to32_mask(l0x-mv_col[0], l0y-mv_col[1]) );
  476. + x264_macroblock_cache_ref( h, 2*x8, 2*y8, width, height, 0, i_ref );
  477. + x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 0, pack16to32_mask(l0x, l0y) );
  478. + x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 1, pack16to32_mask(l0x-mv_col[0], l0y-mv_col[1]) );
  479. }
  480. else
  481. {
  482. @@ -220,6 +231,9 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
  483. const int16_t (*l1mv[2])[2] = { (const int16_t (*)[2]) &h->fref1[0]->mv[0][h->mb.i_b4_xy],
  484. (const int16_t (*)[2]) &h->fref1[0]->mv[1][h->mb.i_b4_xy] };
  485. const int type_col = h->fref1[0]->mb_type[h->mb.i_mb_xy];
  486. + const int partition_col = h->fref1[0]->mb_partition[h->mb.i_mb_xy];
  487. +
  488. + h->mb.i_partition = partition_col;
  489.  
  490. for( i_list = 0; i_list < 2; i_list++ )
  491. {
  492. @@ -287,8 +301,16 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
  493. return 0;
  494. }
  495.  
  496. + /* Don't do any checks other than the ones we have to, based
  497. + * on the size of the colocated partitions.
  498. + * Depends on the enum order: D_8x8, D_16x8, D_8x16, D_16x16 */
  499. + int max_i8 = (D_16x16 - partition_col) + 1;
  500. + int step = (partition_col == D_16x8) + 1;
  501. + int width = 4 >> ((D_16x16 - partition_col)&1);
  502. + int height = 4 >> ((D_16x16 - partition_col)>>1);
  503. +
  504. /* col_zero_flag */
  505. - for( i8 = 0; i8 < 4; i8++ )
  506. + for( i8 = 0; i8 < max_i8; i8 += step )
  507. {
  508. const int x8 = i8&1;
  509. const int y8 = i8>>1;
  510. @@ -304,8 +326,8 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
  511.  
  512. if( abs( l1mv[idx][o4][0] ) <= 1 && abs( l1mv[idx][o4][1] ) <= 1 )
  513. {
  514. - if( ref[0] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, 0 );
  515. - if( ref[1] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 1, 0 );
  516. + if( ref[0] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 0, 0 );
  517. + if( ref[1] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 1, 0 );
  518. }
  519. }
  520.  
  521. @@ -324,32 +346,29 @@ int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed )
  522.  
  523. if( b_changed != NULL && b_available )
  524. {
  525. - int type_col = h->fref1[0]->mb_type[h->mb.i_mb_xy];
  526. - int changed = 0;
  527. + int changed;
  528.  
  529. - if( IS_INTRA( type_col ) || type_col == P_SKIP )
  530. + changed = M32( h->mb.cache.direct_mv[0][0] ) ^ M32( h->mb.cache.mv[0][x264_scan8[0]] );
  531. + changed |= M32( h->mb.cache.direct_mv[1][0] ) ^ M32( h->mb.cache.mv[1][x264_scan8[0]] );
  532. + changed |= h->mb.cache.direct_ref[0][0] ^ h->mb.cache.ref[0][x264_scan8[0]];
  533. + changed |= h->mb.cache.direct_ref[1][0] ^ h->mb.cache.ref[1][x264_scan8[0]];
  534. + if( !changed && h->mb.i_partition != D_16x16 )
  535. {
  536. - changed |= M32( h->mb.cache.direct_mv[0][0] ) ^ M32( h->mb.cache.mv[0][X264_SCAN8_0] );
  537. - changed |= M32( h->mb.cache.direct_mv[1][0] ) ^ M32( h->mb.cache.mv[1][X264_SCAN8_0] );
  538. - changed |= h->mb.cache.direct_ref[0][0] ^ h->mb.cache.ref[0][X264_SCAN8_0];
  539. - changed |= h->mb.cache.direct_ref[1][0] ^ h->mb.cache.ref[1][X264_SCAN8_0];
  540. + changed |= M32( h->mb.cache.direct_mv[0][3] ) ^ M32( h->mb.cache.mv[0][x264_scan8[12]] );
  541. + changed |= M32( h->mb.cache.direct_mv[1][3] ) ^ M32( h->mb.cache.mv[1][x264_scan8[12]] );
  542. + changed |= h->mb.cache.direct_ref[0][3] ^ h->mb.cache.ref[0][x264_scan8[12]];
  543. + changed |= h->mb.cache.direct_ref[1][3] ^ h->mb.cache.ref[1][x264_scan8[12]];
  544. }
  545. - else
  546. + if( !changed && h->mb.i_partition == D_8x8 )
  547. {
  548. - int l;
  549. - for( l = 0; l < 2; l++ )
  550. - {
  551. - changed |= M32( h->mb.cache.direct_mv[l][0] ) ^ M32( h->mb.cache.mv[l][x264_scan8[ 0]] );
  552. - if( changed ) break;
  553. - changed |= M32( h->mb.cache.direct_mv[l][1] ) ^ M32( h->mb.cache.mv[l][x264_scan8[ 4]] );
  554. - changed |= M32( h->mb.cache.direct_mv[l][2] ) ^ M32( h->mb.cache.mv[l][x264_scan8[ 8]] );
  555. - changed |= M32( h->mb.cache.direct_mv[l][3] ) ^ M32( h->mb.cache.mv[l][x264_scan8[12]] );
  556. - if( changed ) break;
  557. - changed |= h->mb.cache.direct_ref[l][0] ^ h->mb.cache.ref[l][x264_scan8[ 0]];
  558. - changed |= h->mb.cache.direct_ref[l][1] ^ h->mb.cache.ref[l][x264_scan8[ 4]];
  559. - changed |= h->mb.cache.direct_ref[l][2] ^ h->mb.cache.ref[l][x264_scan8[ 8]];
  560. - changed |= h->mb.cache.direct_ref[l][3] ^ h->mb.cache.ref[l][x264_scan8[12]];
  561. - }
  562. + changed |= M32( h->mb.cache.direct_mv[0][1] ) ^ M32( h->mb.cache.mv[0][x264_scan8[4]] );
  563. + changed |= M32( h->mb.cache.direct_mv[1][1] ) ^ M32( h->mb.cache.mv[1][x264_scan8[4]] );
  564. + changed |= M32( h->mb.cache.direct_mv[0][2] ) ^ M32( h->mb.cache.mv[0][x264_scan8[8]] );
  565. + changed |= M32( h->mb.cache.direct_mv[1][2] ) ^ M32( h->mb.cache.mv[1][x264_scan8[8]] );
  566. + changed |= h->mb.cache.direct_ref[0][1] ^ h->mb.cache.ref[0][x264_scan8[4]];
  567. + changed |= h->mb.cache.direct_ref[1][1] ^ h->mb.cache.ref[1][x264_scan8[4]];
  568. + changed |= h->mb.cache.direct_ref[0][2] ^ h->mb.cache.ref[0][x264_scan8[8]];
  569. + changed |= h->mb.cache.direct_ref[1][2] ^ h->mb.cache.ref[1][x264_scan8[8]];
  570. }
  571. *b_changed = changed;
  572. if( !changed )
  573. @@ -370,6 +389,7 @@ int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed )
  574. h->mb.cache.direct_ref[l][1] = h->mb.cache.ref[l][x264_scan8[ 4]];
  575. h->mb.cache.direct_ref[l][2] = h->mb.cache.ref[l][x264_scan8[ 8]];
  576. h->mb.cache.direct_ref[l][3] = h->mb.cache.ref[l][x264_scan8[12]];
  577. + h->mb.cache.direct_partition = h->mb.i_partition;
  578. }
  579. }
  580.  
  581. @@ -564,116 +584,93 @@ static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int
  582. h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
  583. }
  584.  
  585. -static void x264_mb_mc_direct8x8( x264_t *h, int x, int y )
  586. -{
  587. - const int i8 = x264_scan8[0] + x + 8*y;
  588. -
  589. - if( h->mb.cache.ref[0][i8] >= 0 )
  590. - if( h->mb.cache.ref[1][i8] >= 0 )
  591. - x264_mb_mc_01xywh( h, x, y, 2, 2 );
  592. - else
  593. - x264_mb_mc_0xywh( h, x, y, 2, 2 );
  594. - else
  595. - x264_mb_mc_1xywh( h, x, y, 2, 2 );
  596. -}
  597. -
  598. void x264_mb_mc_8x8( x264_t *h, int i8 )
  599. {
  600. const int x = 2*(i8&1);
  601. const int y = 2*(i8>>1);
  602. - switch( h->mb.i_sub_partition[i8] )
  603. +
  604. + if( h->sh.i_type == SLICE_TYPE_P )
  605. + {
  606. + switch( h->mb.i_sub_partition[i8] )
  607. + {
  608. + case D_L0_8x8:
  609. + x264_mb_mc_0xywh( h, x, y, 2, 2 );
  610. + break;
  611. + case D_L0_8x4:
  612. + x264_mb_mc_0xywh( h, x, y+0, 2, 1 );
  613. + x264_mb_mc_0xywh( h, x, y+1, 2, 1 );
  614. + break;
  615. + case D_L0_4x8:
  616. + x264_mb_mc_0xywh( h, x+0, y, 1, 2 );
  617. + x264_mb_mc_0xywh( h, x+1, y, 1, 2 );
  618. + break;
  619. + case D_L0_4x4:
  620. + x264_mb_mc_0xywh( h, x+0, y+0, 1, 1 );
  621. + x264_mb_mc_0xywh( h, x+1, y+0, 1, 1 );
  622. + x264_mb_mc_0xywh( h, x+0, y+1, 1, 1 );
  623. + x264_mb_mc_0xywh( h, x+1, y+1, 1, 1 );
  624. + break;
  625. + }
  626. + }
  627. + else
  628. {
  629. - case D_L0_8x8:
  630. - x264_mb_mc_0xywh( h, x, y, 2, 2 );
  631. - break;
  632. - case D_L0_8x4:
  633. - x264_mb_mc_0xywh( h, x, y+0, 2, 1 );
  634. - x264_mb_mc_0xywh( h, x, y+1, 2, 1 );
  635. - break;
  636. - case D_L0_4x8:
  637. - x264_mb_mc_0xywh( h, x+0, y, 1, 2 );
  638. - x264_mb_mc_0xywh( h, x+1, y, 1, 2 );
  639. - break;
  640. - case D_L0_4x4:
  641. - x264_mb_mc_0xywh( h, x+0, y+0, 1, 1 );
  642. - x264_mb_mc_0xywh( h, x+1, y+0, 1, 1 );
  643. - x264_mb_mc_0xywh( h, x+0, y+1, 1, 1 );
  644. - x264_mb_mc_0xywh( h, x+1, y+1, 1, 1 );
  645. - break;
  646. - case D_L1_8x8:
  647. + const int i8 = x264_scan8[0] + x + 8*y;
  648. +
  649. + if( h->mb.cache.ref[0][i8] >= 0 )
  650. + if( h->mb.cache.ref[1][i8] >= 0 )
  651. + x264_mb_mc_01xywh( h, x, y, 2, 2 );
  652. + else
  653. + x264_mb_mc_0xywh( h, x, y, 2, 2 );
  654. + else
  655. x264_mb_mc_1xywh( h, x, y, 2, 2 );
  656. - break;
  657. - case D_BI_8x8:
  658. - x264_mb_mc_01xywh( h, x, y, 2, 2 );
  659. - break;
  660. - case D_DIRECT_8x8:
  661. - x264_mb_mc_direct8x8( h, x, y );
  662. - break;
  663. }
  664. }
  665.  
  666. void x264_mb_mc( x264_t *h )
  667. {
  668. - if( h->mb.i_type == P_L0 )
  669. - {
  670. - if( h->mb.i_partition == D_16x16 )
  671. - {
  672. - x264_mb_mc_0xywh( h, 0, 0, 4, 4 );
  673. - }
  674. - else if( h->mb.i_partition == D_16x8 )
  675. - {
  676. - x264_mb_mc_0xywh( h, 0, 0, 4, 2 );
  677. - x264_mb_mc_0xywh( h, 0, 2, 4, 2 );
  678. - }
  679. - else if( h->mb.i_partition == D_8x16 )
  680. - {
  681. - x264_mb_mc_0xywh( h, 0, 0, 2, 4 );
  682. - x264_mb_mc_0xywh( h, 2, 0, 2, 4 );
  683. - }
  684. - }
  685. - else if( h->mb.i_type == P_8x8 || h->mb.i_type == B_8x8 )
  686. + if( h->mb.i_partition == D_8x8 )
  687. {
  688. int i;
  689. for( i = 0; i < 4; i++ )
  690. x264_mb_mc_8x8( h, i );
  691. }
  692. - else if( h->mb.i_type == B_SKIP || h->mb.i_type == B_DIRECT )
  693. - {
  694. - x264_mb_mc_direct8x8( h, 0, 0 );
  695. - x264_mb_mc_direct8x8( h, 2, 0 );
  696. - x264_mb_mc_direct8x8( h, 0, 2 );
  697. - x264_mb_mc_direct8x8( h, 2, 2 );
  698. - }
  699. - else /* B_*x* */
  700. + else
  701. {
  702. - const uint8_t *b_list0 = x264_mb_type_list_table[h->mb.i_type][0];
  703. - const uint8_t *b_list1 = x264_mb_type_list_table[h->mb.i_type][1];
  704. + const int ref0a = h->mb.cache.ref[0][x264_scan8[ 0]];
  705. + const int ref0b = h->mb.cache.ref[0][x264_scan8[12]];
  706. + const int ref1a = h->mb.cache.ref[1][x264_scan8[ 0]];
  707. + const int ref1b = h->mb.cache.ref[1][x264_scan8[12]];
  708.  
  709. if( h->mb.i_partition == D_16x16 )
  710. {
  711. - if( b_list0[0] && b_list1[0] ) x264_mb_mc_01xywh( h, 0, 0, 4, 4 );
  712. - else if( b_list0[0] ) x264_mb_mc_0xywh ( h, 0, 0, 4, 4 );
  713. - else if( b_list1[0] ) x264_mb_mc_1xywh ( h, 0, 0, 4, 4 );
  714. + if( ref0a >= 0 )
  715. + if( ref1a >= 0 ) x264_mb_mc_01xywh( h, 0, 0, 4, 4 );
  716. + else x264_mb_mc_0xywh ( h, 0, 0, 4, 4 );
  717. + else x264_mb_mc_1xywh ( h, 0, 0, 4, 4 );
  718. }
  719. else if( h->mb.i_partition == D_16x8 )
  720. {
  721. - if( b_list0[0] && b_list1[0] ) x264_mb_mc_01xywh( h, 0, 0, 4, 2 );
  722. - else if( b_list0[0] ) x264_mb_mc_0xywh ( h, 0, 0, 4, 2 );
  723. - else if( b_list1[0] ) x264_mb_mc_1xywh ( h, 0, 0, 4, 2 );
  724. -
  725. - if( b_list0[1] && b_list1[1] ) x264_mb_mc_01xywh( h, 0, 2, 4, 2 );
  726. - else if( b_list0[1] ) x264_mb_mc_0xywh ( h, 0, 2, 4, 2 );
  727. - else if( b_list1[1] ) x264_mb_mc_1xywh ( h, 0, 2, 4, 2 );
  728. + if( ref0a >= 0 )
  729. + if( ref1a >= 0 ) x264_mb_mc_01xywh( h, 0, 0, 4, 2 );
  730. + else x264_mb_mc_0xywh ( h, 0, 0, 4, 2 );
  731. + else x264_mb_mc_1xywh ( h, 0, 0, 4, 2 );
  732. +
  733. + if( ref0b >= 0 )
  734. + if( ref1b >= 0 ) x264_mb_mc_01xywh( h, 0, 2, 4, 2 );
  735. + else x264_mb_mc_0xywh ( h, 0, 2, 4, 2 );
  736. + else x264_mb_mc_1xywh ( h, 0, 2, 4, 2 );
  737. }
  738. else if( h->mb.i_partition == D_8x16 )
  739. {
  740. - if( b_list0[0] && b_list1[0] ) x264_mb_mc_01xywh( h, 0, 0, 2, 4 );
  741. - else if( b_list0[0] ) x264_mb_mc_0xywh ( h, 0, 0, 2, 4 );
  742. - else if( b_list1[0] ) x264_mb_mc_1xywh ( h, 0, 0, 2, 4 );
  743. -
  744. - if( b_list0[1] && b_list1[1] ) x264_mb_mc_01xywh( h, 2, 0, 2, 4 );
  745. - else if( b_list0[1] ) x264_mb_mc_0xywh ( h, 2, 0, 2, 4 );
  746. - else if( b_list1[1] ) x264_mb_mc_1xywh ( h, 2, 0, 2, 4 );
  747. + if( ref0a >= 0 )
  748. + if( ref1a >= 0 ) x264_mb_mc_01xywh( h, 0, 0, 2, 4 );
  749. + else x264_mb_mc_0xywh ( h, 0, 0, 2, 4 );
  750. + else x264_mb_mc_1xywh ( h, 0, 0, 2, 4 );
  751. +
  752. + if( ref0b >= 0 )
  753. + if( ref1b >= 0 ) x264_mb_mc_01xywh( h, 2, 0, 2, 4 );
  754. + else x264_mb_mc_0xywh ( h, 2, 0, 2, 4 );
  755. + else x264_mb_mc_1xywh ( h, 2, 0, 2, 4 );
  756. }
  757. }
  758. }
  759. @@ -767,10 +764,6 @@ int x264_macroblock_cache_init( x264_t *h )
  760. h->mb.intra_border_backup[i][j] += 8;
  761. }
  762.  
  763. - /* init with not available (for top right idx=7,15) */
  764. - memset( h->mb.cache.ref[0], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
  765. - memset( h->mb.cache.ref[1], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
  766. -
  767. return 0;
  768. fail: return -1;
  769. }
  770. @@ -808,6 +801,7 @@ void x264_macroblock_slice_init( x264_t *h )
  771. h->mb.ref[0] = h->fdec->ref[0];
  772. h->mb.ref[1] = h->fdec->ref[1];
  773. h->mb.type = h->fdec->mb_type;
  774. + h->mb.partition = h->fdec->mb_partition;
  775.  
  776. h->fdec->i_ref[0] = h->i_ref0;
  777. h->fdec->i_ref[1] = h->i_ref1;
  778. @@ -835,6 +829,10 @@ void x264_macroblock_slice_init( x264_t *h )
  779. if( h->sh.i_type == SLICE_TYPE_P )
  780. memset( h->mb.cache.skip, 0, X264_SCAN8_SIZE * sizeof( int8_t ) );
  781.  
  782. + /* init with not available (for top right idx=7,15) */
  783. + memset( h->mb.cache.ref[0], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
  784. + memset( h->mb.cache.ref[1], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
  785. +
  786. setup_inverse_delta_pocs( h );
  787.  
  788. h->mb.i_neighbour4[6] =
  789. @@ -1304,6 +1302,7 @@ void x264_macroblock_cache_save( x264_t *h )
  790. x264_prefetch_fenc( h, h->fdec, h->mb.i_mb_x, h->mb.i_mb_y );
  791.  
  792. h->mb.type[i_mb_xy] = i_mb_type;
  793. + h->mb.partition[i_mb_xy] = IS_INTRA( i_mb_type ) ? D_16x16 : h->mb.i_partition;
  794. h->mb.i_mb_prev_xy = i_mb_xy;
  795.  
  796. /* save intra4x4 */
  797. diff --git a/encoder/analyse.c b/encoder/analyse.c
  798. index 1d48b7d..6ee5f8e 100644
  799. --- a/encoder/analyse.c
  800. +++ b/encoder/analyse.c
  801. @@ -3149,6 +3149,7 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
  802.  
  803. case B_SKIP:
  804. case B_DIRECT:
  805. + h->mb.i_partition = h->mb.cache.direct_partition;
  806. x264_mb_load_mv_direct8x8( h, 0 );
  807. x264_mb_load_mv_direct8x8( h, 1 );
  808. x264_mb_load_mv_direct8x8( h, 2 );
  809. --
  810. 1.6.1.2
  811.  
  812.  
  813. From d0be7257766d40b39dd453ebe8a266b64d653f71 Mon Sep 17 00:00:00 2001
  814. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  815. Date: Thu, 18 Feb 2010 10:37:57 -0800
  816. Subject: [PATCH 08/16] Add temporal predictor support to interlaced encoding
  817. 0.5-1% better compression in interlaced mode
  818.  
  819. ---
  820. common/frame.h | 2 +-
  821. common/macroblock.c | 26 +++++++++++++++++++-------
  822. 2 files changed, 20 insertions(+), 8 deletions(-)
  823.  
  824. diff --git a/common/frame.h b/common/frame.h
  825. index 6e7de50..0566b1e 100644
  826. --- a/common/frame.h
  827. +++ b/common/frame.h
  828. @@ -85,7 +85,7 @@ typedef struct x264_frame
  829. int8_t *ref[2];
  830. int i_ref[2];
  831. int ref_poc[2][16];
  832. - int inv_ref_poc[16]; // inverse values (list0 only) to avoid divisions in MB encoding
  833. + int16_t inv_ref_poc[2][32]; // inverse delta-POC values (list0 only), indexed [field][ref], to avoid divisions in MB encoding
  834.  
  835. /* for adaptive B-frame decision.
  836. * contains the SATD cost of the lowres frame encoded in various modes
  837. diff --git a/common/macroblock.c b/common/macroblock.c
  838. index 2573415..68c7e06 100644
  839. --- a/common/macroblock.c
  840. +++ b/common/macroblock.c
  841. @@ -447,10 +447,14 @@ void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[
  842. #undef SET_MVP
  843.  
  844. /* temporal predictors */
  845. - /* FIXME temporal scaling w/ interlace */
  846. - if( h->fref0[0]->i_ref[0] > 0 && !h->sh.b_mbaff )
  847. + if( h->fref0[0]->i_ref[0] > 0 )
  848. {
  849. x264_frame_t *l0 = h->fref0[0];
  850. + int field = h->mb.i_mb_y&1;
  851. + int curpoc = h->fdec->i_poc + field*h->sh.i_delta_poc_bottom;
  852. + int refpoc = h->fref0[i_ref>>h->sh.b_mbaff]->i_poc;
  853. + if( h->sh.b_mbaff && field^(i_ref&1) )
  854. + refpoc += h->sh.i_delta_poc_bottom;
  855.  
  856. #define SET_TMVP(dx, dy) { \
  857. int i_b4 = h->mb.i_b4_xy + dx*4 + dy*4*h->mb.i_b4_stride; \
  858. @@ -458,7 +462,7 @@ void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[
  859. int ref_col = l0->ref[0][i_b8]; \
  860. if( ref_col >= 0 ) \
  861. { \
  862. - int scale = (h->fdec->i_poc - h->fdec->ref_poc[0][i_ref]) * l0->inv_ref_poc[ref_col];\
  863. + int scale = (curpoc - refpoc) * l0->inv_ref_poc[h->mb.b_interlaced&field][ref_col];\
  864. mvc[i][0] = (l0->mv[0][i_b4][0]*scale + 128) >> 8;\
  865. mvc[i][1] = (l0->mv[0][i_b4][1]*scale + 128) >> 8;\
  866. i++; \
  867. @@ -479,11 +483,19 @@ void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[
  868. /* Set up a lookup table for delta pocs to reduce an IDIV to an IMUL */
  869. static void setup_inverse_delta_pocs( x264_t *h )
  870. {
  871. - int i;
  872. - for( i = 0; i < h->i_ref0; i++ )
  873. + int i, field;
  874. + for( field = 0; field <= h->sh.b_mbaff; field++ )
  875. {
  876. - int delta = h->fdec->i_poc - h->fref0[i]->i_poc;
  877. - h->fdec->inv_ref_poc[i] = (256 + delta/2) / delta;
  878. + int curpoc = h->fdec->i_poc + field*h->sh.i_delta_poc_bottom;
  879. + for( i = 0; i < (h->i_ref0<<h->sh.b_mbaff); i++ )
  880. + {
  881. + int refpoc = h->fref0[i>>h->sh.b_mbaff]->i_poc;
  882. + if( h->sh.b_mbaff && field^(i&1) )
  883. + refpoc += h->sh.i_delta_poc_bottom;
  884. + int delta = curpoc - refpoc;
  885. +
  886. + h->fdec->inv_ref_poc[field][i] = (256 + delta/2) / delta;
  887. + }
  888. }
  889. }
  890.  
  891. --
  892. 1.6.1.2
  893.  
  894.  
  895. From da810dcc80ef85239a7c641b8af5c00f88aba1eb Mon Sep 17 00:00:00 2001
  896. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  897. Date: Thu, 18 Feb 2010 17:01:38 -0800
  898. Subject: [PATCH 09/16] Much faster and more efficient MVD handling
  899. Store MV deltas as clipped absolute values.
  900. This means CABAC no longer has to calculate absolute values in MV context selection.
  901. This also lets us cut the memory spent on MVDs by a factor of 2, speeding up cache_mvd and reducing memory usage by 32*threads*(num macroblocks) bytes.
  902. On a Core i7 encoding 1080p, this is about 3 megabytes saved.
  903.  
  904. ---
  905. common/common.h | 8 ++++----
  906. common/macroblock.c | 47 +++++++++++++----------------------------------
  907. common/macroblock.h | 31 +++++++++++++++++++++++++++++--
  908. common/x86/util.h | 40 +++++++++++++++++-----------------------
  909. encoder/cabac.c | 20 +++++++++++---------
  910. encoder/me.c | 3 ++-
  911. 6 files changed, 76 insertions(+), 73 deletions(-)
  912.  
  913. diff --git a/common/common.h b/common/common.h
  914. index 68f79ba..ab54508 100644
  915. --- a/common/common.h
  916. +++ b/common/common.h
  917. @@ -171,13 +171,13 @@ static inline int x264_predictor_difference( int16_t (*mvc)[2], intptr_t i_mvc )
  918. return sum;
  919. }
  920.  
  921. -static inline uint32_t x264_cabac_amvd_sum( int16_t *mvdleft, int16_t *mvdtop )
  922. +static inline uint16_t x264_cabac_mvd_sum( uint8_t *mvdleft, uint8_t *mvdtop )
  923. {
  924. int amvd0 = abs(mvdleft[0]) + abs(mvdtop[0]);
  925. int amvd1 = abs(mvdleft[1]) + abs(mvdtop[1]);
  926. amvd0 = (amvd0 > 2) + (amvd0 > 32);
  927. amvd1 = (amvd1 > 2) + (amvd1 > 32);
  928. - return amvd0 + (amvd1<<16);
  929. + return amvd0 + (amvd1<<8);
  930. }
  931.  
  932. extern const uint8_t x264_exp2_lut[64];
  933. @@ -527,7 +527,7 @@ struct x264_t
  934. uint8_t (*non_zero_count)[16+4+4]; /* nzc. for I_PCM set to 16 */
  935. int8_t *chroma_pred_mode; /* chroma_pred_mode. cabac only. for non intra I_PRED_CHROMA_DC(0) */
  936. int16_t (*mv[2])[2]; /* mb mv. set to 0 for intra mb */
  937. - int16_t (*mvd[2])[2]; /* mb mv difference with predict. set to 0 if intra. cabac only */
  938. + uint8_t (*mvd[2])[2]; /* absolute value of mb mv difference with predict, clipped to [0,33]. set to 0 if intra. cabac only */
  939. int8_t *ref[2]; /* mb ref. set to -1 if non used (intra or Lx only) */
  940. int16_t (*mvr[2][32])[2]; /* 16x16 mv for each possible ref */
  941. int8_t *skipbp; /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */
  942. @@ -621,7 +621,7 @@ struct x264_t
  943.  
  944. /* 0 if not available */
  945. ALIGNED_16( int16_t mv[2][X264_SCAN8_SIZE][2] );
  946. - ALIGNED_8( int16_t mvd[2][X264_SCAN8_SIZE][2] );
  947. + ALIGNED_8( uint8_t mvd[2][X264_SCAN8_SIZE][2] );
  948.  
  949. /* 1 if SKIP or DIRECT. set only for B-frames + CABAC */
  950. ALIGNED_4( int8_t skip[X264_SCAN8_SIZE] );
  951. diff --git a/common/macroblock.c b/common/macroblock.c
  952. index 68c7e06..8a4f095 100644
  953. --- a/common/macroblock.c
  954. +++ b/common/macroblock.c
  955. @@ -712,8 +712,8 @@ int x264_macroblock_cache_init( x264_t *h )
  956. if( h->param.b_cabac )
  957. {
  958. CHECKED_MALLOC( h->mb.chroma_pred_mode, i_mb_count * sizeof(int8_t) );
  959. - CHECKED_MALLOC( h->mb.mvd[0], 2*16 * i_mb_count * sizeof(int16_t) );
  960. - CHECKED_MALLOC( h->mb.mvd[1], 2*16 * i_mb_count * sizeof(int16_t) );
  961. + CHECKED_MALLOC( h->mb.mvd[0], 2*16 * i_mb_count * sizeof(uint8_t) );
  962. + CHECKED_MALLOC( h->mb.mvd[1], 2*16 * i_mb_count * sizeof(uint8_t) );
  963. }
  964.  
  965. for( i=0; i<2; i++ )
  966. @@ -1211,33 +1211,24 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
  967. if( h->param.b_cabac )
  968. {
  969. if( i_top_type >= 0 )
  970. - {
  971. - const int i8 = x264_scan8[0] - 8;
  972. - const int iv = i_top_4x4;
  973. - CP64( h->mb.cache.mvd[i_list][i8+0], h->mb.mvd[i_list][iv+0] );
  974. - CP64( h->mb.cache.mvd[i_list][i8+2], h->mb.mvd[i_list][iv+2] );
  975. - }
  976. + CP64( h->mb.cache.mvd[i_list][x264_scan8[0] - 8], h->mb.mvd[i_list][i_top_4x4] );
  977. else
  978. - {
  979. - const int i8 = x264_scan8[0] - 8;
  980. - M64( h->mb.cache.mvd[i_list][i8+0] ) = 0;
  981. - M64( h->mb.cache.mvd[i_list][i8+2] ) = 0;
  982. - }
  983. + M64( h->mb.cache.mvd[i_list][x264_scan8[0] - 8] ) = 0;
  984.  
  985. if( i_left_type >= 0 )
  986. {
  987. const int i8 = x264_scan8[0] - 1;
  988. const int iv = i_mb_4x4 - 1;
  989. - CP32( h->mb.cache.mvd[i_list][i8+0*8], h->mb.mvd[i_list][iv + 0*s4x4] );
  990. - CP32( h->mb.cache.mvd[i_list][i8+1*8], h->mb.mvd[i_list][iv + 1*s4x4] );
  991. - CP32( h->mb.cache.mvd[i_list][i8+2*8], h->mb.mvd[i_list][iv + 2*s4x4] );
  992. - CP32( h->mb.cache.mvd[i_list][i8+3*8], h->mb.mvd[i_list][iv + 3*s4x4] );
  993. + CP16( h->mb.cache.mvd[i_list][i8+0*8], h->mb.mvd[i_list][iv + 0*s4x4] );
  994. + CP16( h->mb.cache.mvd[i_list][i8+1*8], h->mb.mvd[i_list][iv + 1*s4x4] );
  995. + CP16( h->mb.cache.mvd[i_list][i8+2*8], h->mb.mvd[i_list][iv + 2*s4x4] );
  996. + CP16( h->mb.cache.mvd[i_list][i8+3*8], h->mb.mvd[i_list][iv + 3*s4x4] );
  997. }
  998. else
  999. {
  1000. const int i8 = x264_scan8[0] - 1;
  1001. for( i = 0; i < 4; i++ )
  1002. - M32( h->mb.cache.mvd[i_list][i8+i*8] ) = 0;
  1003. + M16( h->mb.cache.mvd[i_list][i8+i*8] ) = 0;
  1004. }
  1005. }
  1006. }
  1007. @@ -1416,30 +1407,18 @@ void x264_macroblock_cache_save( x264_t *h )
  1008. if( !IS_INTRA( i_mb_type ) && !IS_SKIP( i_mb_type ) && !IS_DIRECT( i_mb_type ) )
  1009. {
  1010. for( y = 0; y < 4; y++ )
  1011. - {
  1012. - CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4+0], h->mb.cache.mvd[0][x264_scan8[0]+8*y+0] );
  1013. - CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4+2], h->mb.cache.mvd[0][x264_scan8[0]+8*y+2] );
  1014. - }
  1015. + CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4], h->mb.cache.mvd[0][x264_scan8[0]+8*y] );
  1016. if( h->sh.i_type == SLICE_TYPE_B )
  1017. for( y = 0; y < 4; y++ )
  1018. - {
  1019. - CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4+0], h->mb.cache.mvd[1][x264_scan8[0]+8*y+0] );
  1020. - CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4+2], h->mb.cache.mvd[1][x264_scan8[0]+8*y+2] );
  1021. - }
  1022. + CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4], h->mb.cache.mvd[1][x264_scan8[0]+8*y] );
  1023. }
  1024. else
  1025. {
  1026. for( y = 0; y < 4; y++ )
  1027. - {
  1028. - M64( h->mb.mvd[0][i_mb_4x4+y*s4x4+0] ) = 0;
  1029. - M64( h->mb.mvd[0][i_mb_4x4+y*s4x4+2] ) = 0;
  1030. - }
  1031. + M64( h->mb.mvd[0][i_mb_4x4+y*s4x4] ) = 0;
  1032. if( h->sh.i_type == SLICE_TYPE_B )
  1033. for( y = 0; y < 4; y++ )
  1034. - {
  1035. - M64( h->mb.mvd[1][i_mb_4x4+y*s4x4+0] ) = 0;
  1036. - M64( h->mb.mvd[1][i_mb_4x4+y*s4x4+2] ) = 0;
  1037. - }
  1038. + M64( h->mb.mvd[1][i_mb_4x4+y*s4x4] ) = 0;
  1039. }
  1040.  
  1041. if( h->sh.i_type == SLICE_TYPE_B )
  1042. diff --git a/common/macroblock.h b/common/macroblock.h
  1043. index 48f3105..eb903d2 100644
  1044. --- a/common/macroblock.h
  1045. +++ b/common/macroblock.h
  1046. @@ -353,6 +353,33 @@ static ALWAYS_INLINE void x264_macroblock_cache_rect1( void *dst, int width, int
  1047. if( height == 4 ) M16( d+6 ) = val2;
  1048. }
  1049. }
  1050. +static ALWAYS_INLINE void x264_macroblock_cache_rect2( void *dst, int width, int height, uint16_t val )
  1051. +{
  1052. + uint16_t *d = dst;
  1053. + uint32_t val32 = val + (val<<16);
  1054. + uint64_t val64 = val32 + ((uint64_t)val32<<32);
  1055. + if( width == 4 )
  1056. + {
  1057. + M64( d+ 0 ) = val64;
  1058. + if( height >= 2 ) M64( d+ 8 ) = val64;
  1059. + if( height == 4 ) M64( d+16 ) = val64;
  1060. + if( height == 4 ) M64( d+24 ) = val64;
  1061. + }
  1062. + else if( width == 2 )
  1063. + {
  1064. + M32( d+ 0 ) = val32;
  1065. + if( height >= 2 ) M32( d+ 8 ) = val32;
  1066. + if( height == 4 ) M32( d+16 ) = val32;
  1067. + if( height == 4 ) M32( d+24 ) = val32;
  1068. + }
  1069. + else //if( width == 1 )
  1070. + {
  1071. + M16( d+ 0 ) = val;
  1072. + if( height >= 2 ) M16( d+ 8 ) = val;
  1073. + if( height == 4 ) M16( d+16 ) = val;
  1074. + if( height == 4 ) M16( d+24 ) = val;
  1075. + }
  1076. +}
  1077. static ALWAYS_INLINE void x264_macroblock_cache_rect4( void *dst, int width, int height, uint32_t val )
  1078. {
  1079. int dy;
  1080. @@ -383,9 +410,9 @@ static ALWAYS_INLINE void x264_macroblock_cache_mv( x264_t *h, int x, int y, int
  1081. {
  1082. x264_macroblock_cache_rect4( &h->mb.cache.mv[i_list][X264_SCAN8_0+x+8*y], width, height, mv );
  1083. }
  1084. -static ALWAYS_INLINE void x264_macroblock_cache_mvd( x264_t *h, int x, int y, int width, int height, int i_list, uint32_t mv )
  1085. +static ALWAYS_INLINE void x264_macroblock_cache_mvd( x264_t *h, int x, int y, int width, int height, int i_list, uint16_t mv )
  1086. {
  1087. - x264_macroblock_cache_rect4( &h->mb.cache.mvd[i_list][X264_SCAN8_0+x+8*y], width, height, mv );
  1088. + x264_macroblock_cache_rect2( &h->mb.cache.mvd[i_list][X264_SCAN8_0+x+8*y], width, height, mv );
  1089. }
  1090. static ALWAYS_INLINE void x264_macroblock_cache_ref( x264_t *h, int x, int y, int width, int height, int i_list, uint8_t ref )
  1091. {
  1092. diff --git a/common/x86/util.h b/common/x86/util.h
  1093. index c8bcf4b..0674323 100644
  1094. --- a/common/x86/util.h
  1095. +++ b/common/x86/util.h
  1096. @@ -77,32 +77,26 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t
  1097. );
  1098. return sum;
  1099. }
  1100. -#define x264_cabac_amvd_sum x264_cabac_amvd_sum_mmxext
  1101. -static ALWAYS_INLINE uint32_t x264_cabac_amvd_sum_mmxext(int16_t *mvdleft, int16_t *mvdtop)
  1102. +#define x264_cabac_mvd_sum x264_cabac_mvd_sum_mmxext
  1103. +static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmxext(uint8_t *mvdleft, uint8_t *mvdtop)
  1104. {
  1105. - static const uint64_t pw_2 = 0x0002000200020002ULL;
  1106. - static const uint64_t pw_28 = 0x001C001C001C001CULL;
  1107. - static const uint64_t pw_2184 = 0x0888088808880888ULL;
  1108. - /* MIN(((x+28)*2184)>>16,2) = (x>2) + (x>32) */
  1109. - /* 2184 = fix16(1/30) */
  1110. - uint32_t amvd;
  1111. + static const uint64_t pb_2 = 0x0202020202020202ULL;
  1112. + static const uint64_t pb_32 = 0x2020202020202020ULL;
  1113. + int amvd;
  1114. asm(
  1115. - "movd %1, %%mm0 \n"
  1116. - "movd %2, %%mm1 \n"
  1117. - "pxor %%mm2, %%mm2 \n"
  1118. - "pxor %%mm3, %%mm3 \n"
  1119. - "psubw %%mm0, %%mm2 \n"
  1120. - "psubw %%mm1, %%mm3 \n"
  1121. - "pmaxsw %%mm2, %%mm0 \n"
  1122. - "pmaxsw %%mm3, %%mm1 \n"
  1123. - "paddw %3, %%mm0 \n"
  1124. - "paddw %%mm1, %%mm0 \n"
  1125. - "pmulhuw %4, %%mm0 \n"
  1126. - "pminsw %5, %%mm0 \n"
  1127. - "movd %%mm0, %0 \n"
  1128. + "movd %1, %%mm0 \n"
  1129. + "movd %2, %%mm1 \n"
  1130. + "paddb %%mm1, %%mm0 \n"
  1131. + "pxor %%mm2, %%mm2 \n"
  1132. + "movq %%mm0, %%mm1 \n"
  1133. + "pcmpgtb %3, %%mm0 \n"
  1134. + "pcmpgtb %4, %%mm1 \n"
  1135. + "psubb %%mm0, %%mm2 \n"
  1136. + "psubb %%mm1, %%mm2 \n"
  1137. + "movd %%mm2, %0 \n"
  1138. :"=r"(amvd)
  1139. - :"m"(M32( mvdleft )),"m"(M32( mvdtop )),
  1140. - "m"(pw_28),"m"(pw_2184),"m"(pw_2)
  1141. + :"m"(M16( mvdleft )),"m"(M16( mvdtop )),
  1142. + "m"(pb_2),"m"(pb_32)
  1143. );
  1144. return amvd;
  1145. }
  1146. diff --git a/encoder/cabac.c b/encoder/cabac.c
  1147. index 271f527..083b783 100644
  1148. --- a/encoder/cabac.c
  1149. +++ b/encoder/cabac.c
  1150. @@ -349,7 +349,7 @@ static void x264_cabac_mb_ref( x264_t *h, x264_cabac_t *cb, int i_list, int idx
  1151. x264_cabac_encode_decision( cb, 54 + ctx, 0 );
  1152. }
  1153.  
  1154. -static inline void x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int l, int mvd, int ctx )
  1155. +static inline int x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int l, int mvd, int ctx )
  1156. {
  1157. const int i_abs = abs( mvd );
  1158. const int ctxbase = l ? 47 : 40;
  1159. @@ -408,32 +408,34 @@ static inline void x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_lis
  1160. x264_cabac_encode_bypass( cb, mvd < 0 );
  1161. }
  1162. #endif
  1163. + /* Since we don't need to keep track of MVDs larger than 33, just cap the value.
  1164. + * This lets us store MVDs as 8-bit values instead of 16-bit. */
  1165. + return X264_MIN( i_abs, 33 );
  1166. }
  1167.  
  1168. -static NOINLINE uint32_t x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width )
  1169. +static NOINLINE uint16_t x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width )
  1170. {
  1171. ALIGNED_4( int16_t mvp[2] );
  1172. - uint32_t amvd;
  1173. int mdx, mdy;
  1174.  
  1175. /* Calculate mvd */
  1176. x264_mb_predict_mv( h, i_list, idx, width, mvp );
  1177. mdx = h->mb.cache.mv[i_list][x264_scan8[idx]][0] - mvp[0];
  1178. mdy = h->mb.cache.mv[i_list][x264_scan8[idx]][1] - mvp[1];
  1179. - amvd = x264_cabac_amvd_sum(h->mb.cache.mvd[i_list][x264_scan8[idx] - 1],
  1180. - h->mb.cache.mvd[i_list][x264_scan8[idx] - 8]);
  1181. + uint16_t amvd = x264_cabac_mvd_sum(h->mb.cache.mvd[i_list][x264_scan8[idx] - 1],
  1182. + h->mb.cache.mvd[i_list][x264_scan8[idx] - 8]);
  1183.  
  1184. /* encode */
  1185. - x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 0, mdx, amvd&0xFFFF );
  1186. - x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 1, mdy, amvd>>16 );
  1187. + mdx = x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 0, mdx, amvd&0xFF );
  1188. + mdy = x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 1, mdy, amvd>>8 );
  1189.  
  1190. - return pack16to32_mask(mdx,mdy);
  1191. + return pack8to16(mdx,mdy);
  1192. }
  1193.  
  1194. #define x264_cabac_mb_mvd(h,cb,i_list,idx,width,height)\
  1195. do\
  1196. {\
  1197. - uint32_t mvd = x264_cabac_mb_mvd(h,cb,i_list,idx,width);\
  1198. + uint16_t mvd = x264_cabac_mb_mvd(h,cb,i_list,idx,width);\
  1199. x264_macroblock_cache_mvd( h, block_idx_x[idx], block_idx_y[idx], width, height, i_list, mvd );\
  1200. } while(0)
  1201.  
  1202. diff --git a/encoder/me.c b/encoder/me.c
  1203. index f58a6a8..44f6c7d 100644
  1204. --- a/encoder/me.c
  1205. +++ b/encoder/me.c
  1206. @@ -1174,6 +1174,7 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int
  1207. m->mv[0] = bmx;
  1208. m->mv[1] = bmy;
  1209. x264_macroblock_cache_mv ( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, pack16to32_mask(bmx, bmy) );
  1210. - x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, pack16to32_mask(bmx - m->mvp[0], bmy - m->mvp[1]) );
  1211. + uint16_t amvd = pack8to16(X264_MIN(abs(bmx - m->mvp[0]),33), X264_MIN(abs(bmy - m->mvp[1]),33));
  1212. + x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, amvd );
  1213. h->mb.b_skip_mc = 0;
  1214. }
  1215. --
  1216. 1.6.1.2
  1217.  
  1218.  
  1219. From 54d1bed32086228ce2de06a5207501bdf258d9a9 Mon Sep 17 00:00:00 2001
  1220. From: Anton Mitrofanov <BugMaster@narod.ru>
  1221. Date: Fri, 19 Feb 2010 10:45:22 -0800
  1222. Subject: [PATCH 10/16] Faster, more accurate psy-RD caching
  1223. Keep more variants of cached Hadamard scores and only calculate them when necessary.
  1224. Results in more calculation, but simpler lookups.
  1225. Slightly more accurate due to internal rounding in SATD and SA8D functions.
  1226.  
  1227. ---
  1228. common/common.h | 8 ++---
  1229. common/x86/mc-a2.asm | 6 +++-
  1230. encoder/analyse.c | 39 ++++++---------------------
  1231. encoder/rdo.c | 69 ++++++++++++++++++++++++++++---------------------
  1232. 4 files changed, 55 insertions(+), 67 deletions(-)
  1233.  
  1234. diff --git a/common/common.h b/common/common.h
  1235. index ab54508..413b82f 100644
  1236. --- a/common/common.h
  1237. +++ b/common/common.h
  1238. @@ -583,11 +583,9 @@ struct x264_t
  1239. ALIGNED_16( int16_t fenc_dct8[4][64] );
  1240. ALIGNED_16( int16_t fenc_dct4[16][16] );
  1241.  
  1242. - /* Psy RD SATD scores */
  1243. - int fenc_satd[4][4];
  1244. - int fenc_satd_sum;
  1245. - int fenc_sa8d[2][2];
  1246. - int fenc_sa8d_sum;
  1247. + /* Psy RD SATD/SA8D scores cache */
  1248. + ALIGNED_16( uint64_t fenc_hadamard_cache[9] );
  1249. + ALIGNED_16( uint32_t fenc_satd_cache[32] );
  1250.  
  1251. /* pointer over mb of the frame to be compressed */
  1252. uint8_t *p_fenc[3];
  1253. diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
  1254. index f2e69c0..d86d6ef 100644
  1255. --- a/common/x86/mc-a2.asm
  1256. +++ b/common/x86/mc-a2.asm
  1257. @@ -731,15 +731,17 @@ cglobal x264_memcpy_aligned_sse2, 3,3
  1258. ;-----------------------------------------------------------------------------
  1259. %macro MEMZERO 1
  1260. cglobal x264_memzero_aligned_%1, 2,2
  1261. + add r0, r1
  1262. + neg r1
  1263. pxor m0, m0
  1264. .loop:
  1265. - sub r1d, mmsize*8
  1266. %assign i 0
  1267. %rep 8
  1268. mova [r0 + r1 + i], m0
  1269. %assign i i+mmsize
  1270. %endrep
  1271. - jg .loop
  1272. + add r1d, mmsize*8
  1273. + jl .loop
  1274. REP_RET
  1275. %endmacro
  1276.  
  1277. diff --git a/encoder/analyse.c b/encoder/analyse.c
  1278. index 6ee5f8e..02fbf7c 100644
  1279. --- a/encoder/analyse.c
  1280. +++ b/encoder/analyse.c
  1281. @@ -578,34 +578,13 @@ static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
  1282. h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
  1283. }
  1284.  
  1285. -/* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */
  1286. -static inline void x264_mb_cache_fenc_satd( x264_t *h )
  1287. +/* Reset fenc satd scores cache for psy RD */
  1288. +static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
  1289. {
  1290. - ALIGNED_16( static uint8_t zero[16] ) = {0};
  1291. - uint8_t *fenc;
  1292. - int x, y, satd_sum = 0, sa8d_sum = 0;
  1293. - if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
  1294. - x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
  1295. - if( !h->mb.i_psy_rd )
  1296. - return;
  1297. - for( y = 0; y < 4; y++ )
  1298. - for( x = 0; x < 4; x++ )
  1299. - {
  1300. - fenc = h->mb.pic.p_fenc[0]+x*4+y*4*FENC_STRIDE;
  1301. - h->mb.pic.fenc_satd[y][x] = h->pixf.satd[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )
  1302. - - (h->pixf.sad[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )>>1);
  1303. - satd_sum += h->mb.pic.fenc_satd[y][x];
  1304. - }
  1305. - for( y = 0; y < 2; y++ )
  1306. - for( x = 0; x < 2; x++ )
  1307. - {
  1308. - fenc = h->mb.pic.p_fenc[0]+x*8+y*8*FENC_STRIDE;
  1309. - h->mb.pic.fenc_sa8d[y][x] = h->pixf.sa8d[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )
  1310. - - (h->pixf.sad[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )>>2);
  1311. - sa8d_sum += h->mb.pic.fenc_sa8d[y][x];
  1312. - }
  1313. - h->mb.pic.fenc_satd_sum = satd_sum;
  1314. - h->mb.pic.fenc_sa8d_sum = sa8d_sum;
  1315. + /* memzero_aligned clears in whole multi-register chunks, so this writes beyond the end of fenc_hadamard_cache; not a problem since fenc_satd_cache is right after. */
  1316. + h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
  1317. + if( b_satd )
  1318. + h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
  1319. }
  1320.  
  1321. static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
  1322. @@ -1193,7 +1172,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
  1323. h->mb.i_type = P_L0;
  1324. if( a->i_mbrd )
  1325. {
  1326. - x264_mb_cache_fenc_satd( h );
  1327. + x264_mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 );
  1328. if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
  1329. {
  1330. h->mb.i_partition = D_16x16;
  1331. @@ -2432,7 +2411,7 @@ void x264_macroblock_analyse( x264_t *h )
  1332. {
  1333. intra_analysis:
  1334. if( analysis.i_mbrd )
  1335. - x264_mb_cache_fenc_satd( h );
  1336. + x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
  1337. x264_mb_analyse_intra( h, &analysis, COST_MAX );
  1338. if( analysis.i_mbrd )
  1339. x264_intra_rd( h, &analysis, COST_MAX );
  1340. @@ -2749,7 +2728,7 @@ intra_analysis:
  1341. int b_skip = 0;
  1342.  
  1343. if( analysis.i_mbrd )
  1344. - x264_mb_cache_fenc_satd( h );
  1345. + x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
  1346.  
  1347. h->mb.i_type = B_SKIP;
  1348. if( h->mb.b_direct_auto_write )
  1349. diff --git a/encoder/rdo.c b/encoder/rdo.c
  1350. index e15f47d..fed2a28 100644
  1351. --- a/encoder/rdo.c
  1352. +++ b/encoder/rdo.c
  1353. @@ -61,36 +61,44 @@ static uint16_t cabac_size_5ones[128];
  1354. #define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \
  1355. sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) )
  1356.  
  1357. -
  1358. -/* Sum the cached SATDs to avoid repeating them. */
  1359. -static inline int sum_satd( x264_t *h, int pixel, int x, int y )
  1360. +static inline uint64_t cached_hadamard( x264_t *h, int pixel, int x, int y )
  1361. {
  1362. - int satd = 0;
  1363. - int min_x = x>>2;
  1364. - int min_y = y>>2;
  1365. - int max_x = (x>>2) + (x264_pixel_size[pixel].w>>2);
  1366. - int max_y = (y>>2) + (x264_pixel_size[pixel].h>>2);
  1367. - if( pixel == PIXEL_16x16 )
  1368. - return h->mb.pic.fenc_satd_sum;
  1369. - for( y = min_y; y < max_y; y++ )
  1370. - for( x = min_x; x < max_x; x++ )
  1371. - satd += h->mb.pic.fenc_satd[y][x];
  1372. - return satd;
  1373. + static const uint8_t hadamard_shift_x[4] = {4, 4, 3, 3};
  1374. + static const uint8_t hadamard_shift_y[4] = {4-0, 3-0, 4-1, 3-1};
  1375. + static const uint8_t hadamard_offset[4] = {0, 1, 3, 5};
  1376. + int cache_index = (x >> hadamard_shift_x[pixel]) + (y >> hadamard_shift_y[pixel])
  1377. + + hadamard_offset[pixel];
  1378. + uint64_t res = h->mb.pic.fenc_hadamard_cache[cache_index];
  1379. + if( res )
  1380. + return res - 1;
  1381. + else
  1382. + {
  1383. + uint8_t *fenc = h->mb.pic.p_fenc[0] + x + y*FENC_STRIDE;
  1384. + res = h->pixf.hadamard_ac[pixel]( fenc, FENC_STRIDE );
  1385. + h->mb.pic.fenc_hadamard_cache[cache_index] = res + 1;
  1386. + return res;
  1387. + }
  1388. }
  1389.  
  1390. -static inline int sum_sa8d( x264_t *h, int pixel, int x, int y )
  1391. +static inline int cached_satd( x264_t *h, int pixel, int x, int y )
  1392. {
  1393. - int sa8d = 0;
  1394. - int min_x = x>>3;
  1395. - int min_y = y>>3;
  1396. - int max_x = (x>>3) + (x264_pixel_size[pixel].w>>3);
  1397. - int max_y = (y>>3) + (x264_pixel_size[pixel].h>>3);
  1398. - if( pixel == PIXEL_16x16 )
  1399. - return h->mb.pic.fenc_sa8d_sum;
  1400. - for( y = min_y; y < max_y; y++ )
  1401. - for( x = min_x; x < max_x; x++ )
  1402. - sa8d += h->mb.pic.fenc_sa8d[y][x];
  1403. - return sa8d;
  1404. + static const uint8_t satd_shift_x[3] = {3, 2, 2};
  1405. + static const uint8_t satd_shift_y[3] = {2-1, 3-2, 2-2};
  1406. + static const uint8_t satd_offset[3] = {0, 8, 16};
  1407. + ALIGNED_16( static uint8_t zero[16] );
  1408. + int cache_index = (x >> satd_shift_x[pixel - PIXEL_8x4]) + (y >> satd_shift_y[pixel - PIXEL_8x4])
  1409. + + satd_offset[pixel - PIXEL_8x4];
  1410. + int res = h->mb.pic.fenc_satd_cache[cache_index];
  1411. + if( res )
  1412. + return res - 1;
  1413. + else
  1414. + {
  1415. + uint8_t *fenc = h->mb.pic.p_fenc[0] + x + y*FENC_STRIDE;
  1416. + int dc = h->pixf.sad[pixel]( fenc, FENC_STRIDE, zero, 0 ) >> 1;
  1417. + res = h->pixf.satd[pixel]( fenc, FENC_STRIDE, zero, 0 ) - dc;
  1418. + h->mb.pic.fenc_satd_cache[cache_index] = res + 1;
  1419. + return res;
  1420. + }
  1421. }
  1422.  
  1423. /* Psy RD distortion metric: SSD plus "Absolute Difference of Complexities" */
  1424. @@ -113,15 +121,16 @@ static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
  1425. /* If the plane is smaller than 8x8, we can't do an SA8D; this probably isn't a big problem. */
  1426. if( size <= PIXEL_8x8 )
  1427. {
  1428. - uint64_t acs = h->pixf.hadamard_ac[size]( fdec, FDEC_STRIDE );
  1429. - satd = abs((int32_t)acs - sum_satd( h, size, x, y ))
  1430. - + abs((int32_t)(acs>>32) - sum_sa8d( h, size, x, y ));
  1431. + uint64_t fdec_acs = h->pixf.hadamard_ac[size]( fdec, FDEC_STRIDE );
  1432. + uint64_t fenc_acs = cached_hadamard( h, size, x, y );
  1433. + satd = abs((int32_t)fdec_acs - (int32_t)fenc_acs)
  1434. + + abs((int32_t)(fdec_acs>>32) - (int32_t)(fenc_acs>>32));
  1435. satd >>= 1;
  1436. }
  1437. else
  1438. {
  1439. int dc = h->pixf.sad[size]( fdec, FDEC_STRIDE, zero, 0 ) >> 1;
  1440. - satd = abs(h->pixf.satd[size]( fdec, FDEC_STRIDE, zero, 0 ) - dc - sum_satd( h, size, x, y ));
  1441. + satd = abs(h->pixf.satd[size]( fdec, FDEC_STRIDE, zero, 0 ) - dc - cached_satd( h, size, x, y ));
  1442. }
  1443. satd = (satd * h->mb.i_psy_rd * h->mb.i_psy_rd_lambda + 128) >> 8;
  1444. }
  1445. --
  1446. 1.6.1.2
  1447.  
  1448.  
  1449. From c45278a7107934fdad77c0cac14a924b97a6272e Mon Sep 17 00:00:00 2001
  1450. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  1451. Date: Sun, 21 Feb 2010 01:56:12 -0800
  1452. Subject: [PATCH 11/16] Move presets, tunings, and profiles into libx264
  1453. Now any application calling libx264 can use them.
  1454. Full documentation and guidelines for usage are included in x264.h.
  1455.  
  1456. ---
  1457. common/common.c | 266 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
  1458. x264.c | 267 +++----------------------------------------------------
  1459. x264.h | 96 +++++++++++++++++---
  1460. 3 files changed, 357 insertions(+), 272 deletions(-)
  1461.  
  1462. diff --git a/common/common.c b/common/common.c
  1463. index 0dd7af5..a99b65b 100644
  1464. --- a/common/common.c
  1465. +++ b/common/common.c
  1466. @@ -36,7 +36,7 @@ static void x264_log_default( void *, int, const char *, va_list );
  1467. /****************************************************************************
  1468. * x264_param_default:
  1469. ****************************************************************************/
  1470. -void x264_param_default( x264_param_t *param )
  1471. +void x264_param_default( x264_param_t *param )
  1472. {
  1473. /* */
  1474. memset( param, 0, sizeof( x264_param_t ) );
  1475. @@ -160,6 +160,270 @@ void x264_param_default( x264_param_t *param )
  1476. param->b_dts_compress = 0;
  1477. }
  1478.  
  1479. +static int x264_param_apply_preset( x264_param_t *param, const char *preset )
  1480. +{
  1481. + if( !strcasecmp( preset, "ultrafast" ) )
  1482. + {
  1483. + param->i_frame_reference = 1;
  1484. + param->i_scenecut_threshold = 0;
  1485. + param->b_deblocking_filter = 0;
  1486. + param->b_cabac = 0;
  1487. + param->i_bframe = 0;
  1488. + param->analyse.intra = 0;
  1489. + param->analyse.inter = 0;
  1490. + param->analyse.b_transform_8x8 = 0;
  1491. + param->analyse.i_me_method = X264_ME_DIA;
  1492. + param->analyse.i_subpel_refine = 0;
  1493. + param->rc.i_aq_mode = 0;
  1494. + param->analyse.b_mixed_references = 0;
  1495. + param->analyse.i_trellis = 0;
  1496. + param->i_bframe_adaptive = X264_B_ADAPT_NONE;
  1497. + param->rc.b_mb_tree = 0;
  1498. + param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
  1499. + }
  1500. + else if( !strcasecmp( preset, "veryfast" ) )
  1501. + {
  1502. + param->analyse.inter = X264_ANALYSE_I8x8|X264_ANALYSE_I4x4;
  1503. + param->analyse.i_me_method = X264_ME_DIA;
  1504. + param->analyse.i_subpel_refine = 1;
  1505. + param->i_frame_reference = 1;
  1506. + param->analyse.b_mixed_references = 0;
  1507. + param->analyse.i_trellis = 0;
  1508. + param->rc.b_mb_tree = 0;
  1509. + param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
  1510. + }
  1511. + else if( !strcasecmp( preset, "faster" ) )
  1512. + {
  1513. + param->analyse.b_mixed_references = 0;
  1514. + param->i_frame_reference = 2;
  1515. + param->analyse.i_subpel_refine = 4;
  1516. + param->rc.b_mb_tree = 0;
  1517. + param->analyse.i_weighted_pred = X264_WEIGHTP_BLIND;
  1518. + }
  1519. + else if( !strcasecmp( preset, "fast" ) )
  1520. + {
  1521. + param->i_frame_reference = 2;
  1522. + param->analyse.i_subpel_refine = 6;
  1523. + param->rc.i_lookahead = 30;
  1524. + }
  1525. + else if( !strcasecmp( preset, "medium" ) )
  1526. + {
  1527. + /* Default is medium */
  1528. + }
  1529. + else if( !strcasecmp( preset, "slow" ) )
  1530. + {
  1531. + param->analyse.i_me_method = X264_ME_UMH;
  1532. + param->analyse.i_subpel_refine = 8;
  1533. + param->i_frame_reference = 5;
  1534. + param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
  1535. + param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
  1536. + param->rc.i_lookahead = 50;
  1537. + }
  1538. + else if( !strcasecmp( preset, "slower" ) )
  1539. + {
  1540. + param->analyse.i_me_method = X264_ME_UMH;
  1541. + param->analyse.i_subpel_refine = 9;
  1542. + param->i_frame_reference = 8;
  1543. + param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
  1544. + param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
  1545. + param->analyse.inter |= X264_ANALYSE_PSUB8x8;
  1546. + param->analyse.i_trellis = 2;
  1547. + param->rc.i_lookahead = 60;
  1548. + }
  1549. + else if( !strcasecmp( preset, "veryslow" ) )
  1550. + {
  1551. + param->analyse.i_me_method = X264_ME_UMH;
  1552. + param->analyse.i_subpel_refine = 10;
  1553. + param->analyse.i_me_range = 24;
  1554. + param->i_frame_reference = 16;
  1555. + param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
  1556. + param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
  1557. + param->analyse.inter |= X264_ANALYSE_PSUB8x8;
  1558. + param->analyse.i_trellis = 2;
  1559. + param->i_bframe = 8;
  1560. + param->rc.i_lookahead = 60;
  1561. + }
  1562. + else if( !strcasecmp( preset, "placebo" ) )
  1563. + {
  1564. + param->analyse.i_me_method = X264_ME_TESA;
  1565. + param->analyse.i_subpel_refine = 10;
  1566. + param->analyse.i_me_range = 24;
  1567. + param->i_frame_reference = 16;
  1568. + param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
  1569. + param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
  1570. + param->analyse.inter |= X264_ANALYSE_PSUB8x8;
  1571. + param->analyse.b_fast_pskip = 0;
  1572. + param->analyse.i_trellis = 2;
  1573. + param->i_bframe = 16;
  1574. + param->rc.i_lookahead = 60;
  1575. + }
  1576. + else
  1577. + {
  1578. + fprintf( stderr, "x264 [error]: invalid preset '%s'\n", preset );
  1579. + return -1;
  1580. + }
  1581. + return 0;
  1582. +}
  1583. +
  1584. +static int x264_param_apply_tune( x264_param_t *param, const char *tune )
  1585. +{
  1586. + char *tmp = x264_malloc( strlen( tune ) + 1 ); /* writable copy for strtok(); +1 leaves room for the NUL */
  1587. + if( !tmp ) return -1;
  1588. + char *s = strtok( strcpy( tmp, tune ), ",./-+" );
  1589. + int psy_tuning_used = 0;
  1590. + while( s )
  1591. + {
  1592. + if( !strncasecmp( s, "film", 4 ) )
  1593. + {
  1594. + if( psy_tuning_used++ ) goto psy_failure;
  1595. + param->i_deblocking_filter_alphac0 = -1;
  1596. + param->i_deblocking_filter_beta = -1;
  1597. + param->analyse.f_psy_trellis = 0.15;
  1598. + }
  1599. + else if( !strncasecmp( s, "animation", 9 ) )
  1600. + {
  1601. + if( psy_tuning_used++ ) goto psy_failure;
  1602. + param->i_frame_reference = param->i_frame_reference > 1 ? param->i_frame_reference*2 : 1;
  1603. + param->i_deblocking_filter_alphac0 = 1;
  1604. + param->i_deblocking_filter_beta = 1;
  1605. + param->analyse.f_psy_rd = 0.4;
  1606. + param->rc.f_aq_strength = 0.6;
  1607. + param->i_bframe += 2;
  1608. + }
  1609. + else if( !strncasecmp( s, "grain", 5 ) )
  1610. + {
  1611. + if( psy_tuning_used++ ) goto psy_failure;
  1612. + param->i_deblocking_filter_alphac0 = -2;
  1613. + param->i_deblocking_filter_beta = -2;
  1614. + param->analyse.f_psy_trellis = 0.25;
  1615. + param->analyse.b_dct_decimate = 0;
  1616. + param->rc.f_pb_factor = 1.1;
  1617. + param->rc.f_ip_factor = 1.1;
  1618. + param->rc.f_aq_strength = 0.5;
  1619. + param->analyse.i_luma_deadzone[0] = 6;
  1620. + param->analyse.i_luma_deadzone[1] = 6;
  1621. + param->rc.f_qcompress = 0.8;
  1622. + }
  1623. + else if( !strncasecmp( s, "psnr", 4 ) )
  1624. + {
  1625. + if( psy_tuning_used++ ) goto psy_failure;
  1626. + param->rc.i_aq_mode = X264_AQ_NONE;
  1627. + param->analyse.b_psy = 0;
  1628. + }
  1629. + else if( !strncasecmp( s, "ssim", 4 ) )
  1630. + {
  1631. + if( psy_tuning_used++ ) goto psy_failure;
  1632. + param->rc.i_aq_mode = X264_AQ_AUTOVARIANCE;
  1633. + param->analyse.b_psy = 0;
  1634. + }
  1635. + else if( !strncasecmp( s, "fastdecode", 10 ) )
  1636. + {
  1637. + param->b_deblocking_filter = 0;
  1638. + param->b_cabac = 0;
  1639. + param->analyse.b_weighted_bipred = 0;
  1640. + param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
  1641. + }
  1642. + else if( !strncasecmp( s, "zerolatency", 11 ) )
  1643. + {
  1644. + param->rc.i_lookahead = 0;
  1645. + param->i_sync_lookahead = 0;
  1646. + param->i_bframe = 0;
  1647. + param->b_sliced_threads = 1;
  1648. + }
  1649. + else if( !strncasecmp( s, "touhou", 6 ) )
  1650. + {
  1651. + if( psy_tuning_used++ ) goto psy_failure;
  1652. + param->i_frame_reference = param->i_frame_reference > 1 ? param->i_frame_reference*2 : 1;
  1653. + param->i_deblocking_filter_alphac0 = -1;
  1654. + param->i_deblocking_filter_beta = -1;
  1655. + param->analyse.f_psy_trellis = 0.2;
  1656. + param->rc.f_aq_strength = 1.3;
  1657. + if( param->analyse.inter & X264_ANALYSE_PSUB16x16 )
  1658. + param->analyse.inter |= X264_ANALYSE_PSUB8x8;
  1659. + }
  1660. + else
  1661. + {
  1662. + fprintf( stderr, "x264 [error]: invalid tune '%s'\n", s ); /* s points into tmp: print before freeing */
  1663. + x264_free( tmp );
  1664. + return -1;
  1665. + }
  1666. + if( 0 ) /* body is only reachable through goto psy_failure */
  1667. + {
  1668. + psy_failure:
  1669. + fprintf( stderr, "x264 [warning]: only 1 psy tuning can be used: ignoring tune %s\n", s );
  1670. + }
  1671. + s = strtok( NULL, ",./-+" );
  1672. + }
  1673. + x264_free( tmp );
  1674. + return 0;
  1675. +}
  1676. +
  1677. +int x264_param_default_preset( x264_param_t *param, const char *preset, const char *tune )
  1678. +{
  1679. + x264_param_default( param );
  1680. +
  1681. + if( preset && x264_param_apply_preset( param, preset ) < 0 )
  1682. + return -1;
  1683. + if( tune && x264_param_apply_tune( param, tune ) < 0 )
  1684. + return -1;
  1685. + return 0;
  1686. +}
  1687. +
  1688. +void x264_param_apply_fastfirstpass( x264_param_t *param )
  1689. +{
  1690. + /* Set faster options in case of turbo firstpass: a first pass writes the stats file and does not read one. */
  1691. + if( param->rc.b_stat_write && !param->rc.b_stat_read )
  1692. + {
  1693. + param->i_frame_reference = 1;
  1694. + param->analyse.b_transform_8x8 = 0;
  1695. + param->analyse.inter = 0;
  1696. + param->analyse.i_me_method = X264_ME_DIA;
  1697. + param->analyse.i_subpel_refine = X264_MIN( 2, param->analyse.i_subpel_refine );
  1698. + param->analyse.i_trellis = 0;
  1699. + }
  1700. +}
  1701. +
  1702. +int x264_param_apply_profile( x264_param_t *param, const char *profile )
  1703. +{
  1704. + if( !profile )
  1705. + return 0;
  1706. +
  1707. + if( !strcasecmp( profile, "baseline" ) )
  1708. + {
  1709. + param->analyse.b_transform_8x8 = 0;
  1710. + param->b_cabac = 0;
  1711. + param->i_cqm_preset = X264_CQM_FLAT;
  1712. + param->i_bframe = 0;
  1713. + param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
  1714. + if( param->b_interlaced )
  1715. + {
  1716. + fprintf( stderr, "x264 [error]: baseline profile doesn't support interlacing\n" );
  1717. + return -1;
  1718. + }
  1719. + }
  1720. + else if( !strcasecmp( profile, "main" ) )
  1721. + {
  1722. + param->analyse.b_transform_8x8 = 0;
  1723. + param->i_cqm_preset = X264_CQM_FLAT;
  1724. + }
  1725. + else if( !strcasecmp( profile, "high" ) )
  1726. + {
  1727. + /* Default */
  1728. + }
  1729. + else
  1730. + {
  1731. + fprintf( stderr, "x264 [error]: invalid profile: %s\n", profile );
  1732. + return -1;
  1733. + }
  1734. + if( (param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant == 0) ||
  1735. + (param->rc.i_rc_method == X264_RC_CRF && param->rc.f_rf_constant == 0) )
  1736. + {
  1737. + fprintf( stderr, "x264 [error]: %s profile doesn't support lossless\n", profile );
  1738. + return -1;
  1739. + }
  1740. + return 0;
  1741. +}
  1742. +
  1743. static int parse_enum( const char *arg, const char * const *names, int *dst )
  1744. {
  1745. int i;
  1746. diff --git a/x264.c b/x264.c
  1747. index 959626a..2875dd1 100644
  1748. --- a/x264.c
  1749. +++ b/x264.c
  1750. @@ -115,8 +115,6 @@ int main( int argc, char **argv )
  1751. _setmode(_fileno(stdout), _O_BINARY);
  1752. #endif
  1753.  
  1754. - x264_param_default( &param );
  1755. -
  1756. /* Parse command line */
  1757. if( Parse( argc, argv, &param, &opt ) < 0 )
  1758. return -1;
  1759. @@ -799,12 +797,13 @@ static int Parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt )
  1760. char *profile = NULL;
  1761. int b_thread_input = 0;
  1762. int b_turbo = 1;
  1763. - int b_pass1 = 0;
  1764. int b_user_ref = 0;
  1765. int b_user_fps = 0;
  1766. int b_user_interlaced = 0;
  1767. int i;
  1768. cli_input_opt_t input_opt;
  1769. + char *preset = NULL;
  1770. + char *tune = NULL;
  1771.  
  1772. memset( opt, 0, sizeof(cli_opt_t) );
  1773. memset( &input_opt, 0, sizeof(cli_input_opt_t) );
  1774. @@ -816,219 +815,20 @@ static int Parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt )
  1775. int c = getopt_long( argc, argv, short_options, long_options, NULL );
  1776. if( c == -1 )
  1777. break;
  1778. -
  1779. if( c == OPT_PRESET )
  1780. {
  1781. - if( !strcasecmp( optarg, "ultrafast" ) )
  1782. - {
  1783. - param->i_frame_reference = 1;
  1784. - param->i_scenecut_threshold = 0;
  1785. - param->b_deblocking_filter = 0;
  1786. - param->b_cabac = 0;
  1787. - param->i_bframe = 0;
  1788. - param->analyse.intra = 0;
  1789. - param->analyse.inter = 0;
  1790. - param->analyse.b_transform_8x8 = 0;
  1791. - param->analyse.i_me_method = X264_ME_DIA;
  1792. - param->analyse.i_subpel_refine = 0;
  1793. - param->rc.i_aq_mode = 0;
  1794. - param->analyse.b_mixed_references = 0;
  1795. - param->analyse.i_trellis = 0;
  1796. - param->i_bframe_adaptive = X264_B_ADAPT_NONE;
  1797. - param->rc.b_mb_tree = 0;
  1798. - param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
  1799. - }
  1800. - else if( !strcasecmp( optarg, "veryfast" ) )
  1801. - {
  1802. - param->analyse.inter = X264_ANALYSE_I8x8|X264_ANALYSE_I4x4;
  1803. - param->analyse.i_me_method = X264_ME_DIA;
  1804. - param->analyse.i_subpel_refine = 1;
  1805. - param->i_frame_reference = 1;
  1806. - param->analyse.b_mixed_references = 0;
  1807. - param->analyse.i_trellis = 0;
  1808. - param->rc.b_mb_tree = 0;
  1809. - param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
  1810. - }
  1811. - else if( !strcasecmp( optarg, "faster" ) )
  1812. - {
  1813. - param->analyse.b_mixed_references = 0;
  1814. - param->i_frame_reference = 2;
  1815. - param->analyse.i_subpel_refine = 4;
  1816. - param->rc.b_mb_tree = 0;
  1817. - param->analyse.i_weighted_pred = X264_WEIGHTP_BLIND;
  1818. - }
  1819. - else if( !strcasecmp( optarg, "fast" ) )
  1820. - {
  1821. - param->i_frame_reference = 2;
  1822. - param->analyse.i_subpel_refine = 6;
  1823. - param->rc.i_lookahead = 30;
  1824. - }
  1825. - else if( !strcasecmp( optarg, "medium" ) )
  1826. - {
  1827. - /* Default is medium */
  1828. - }
  1829. - else if( !strcasecmp( optarg, "slow" ) )
  1830. - {
  1831. - param->analyse.i_me_method = X264_ME_UMH;
  1832. - param->analyse.i_subpel_refine = 8;
  1833. - param->i_frame_reference = 5;
  1834. - param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
  1835. - param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
  1836. - param->rc.i_lookahead = 50;
  1837. - }
  1838. - else if( !strcasecmp( optarg, "slower" ) )
  1839. - {
  1840. - param->analyse.i_me_method = X264_ME_UMH;
  1841. - param->analyse.i_subpel_refine = 9;
  1842. - param->i_frame_reference = 8;
  1843. - param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
  1844. - param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
  1845. - param->analyse.inter |= X264_ANALYSE_PSUB8x8;
  1846. - param->analyse.i_trellis = 2;
  1847. - param->rc.i_lookahead = 60;
  1848. - }
  1849. - else if( !strcasecmp( optarg, "veryslow" ) )
  1850. - {
  1851. - param->analyse.i_me_method = X264_ME_UMH;
  1852. - param->analyse.i_subpel_refine = 10;
  1853. - param->analyse.i_me_range = 24;
  1854. - param->i_frame_reference = 16;
  1855. - param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
  1856. - param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
  1857. - param->analyse.inter |= X264_ANALYSE_PSUB8x8;
  1858. - param->analyse.i_trellis = 2;
  1859. - param->i_bframe = 8;
  1860. - param->rc.i_lookahead = 60;
  1861. - }
  1862. - else if( !strcasecmp( optarg, "placebo" ) )
  1863. - {
  1864. - param->analyse.i_me_method = X264_ME_TESA;
  1865. - param->analyse.i_subpel_refine = 10;
  1866. - param->analyse.i_me_range = 24;
  1867. - param->i_frame_reference = 16;
  1868. - param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
  1869. - param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
  1870. - param->analyse.inter |= X264_ANALYSE_PSUB8x8;
  1871. - param->analyse.b_fast_pskip = 0;
  1872. - param->analyse.i_trellis = 2;
  1873. - param->i_bframe = 16;
  1874. - param->rc.i_lookahead = 60;
  1875. + preset = optarg;
  1876. + if( !strcasecmp( preset, "placebo" ) ) /* preset names are matched case-insensitively by libx264 */
  1877. b_turbo = 0;
  1878. - }
  1879. - else
  1880. - {
  1881. - fprintf( stderr, "x264 [error]: invalid preset '%s'\n", optarg );
  1882. - return -1;
  1883. - }
  1884. }
  1885. - else if( c == '?' )
  1886. - return -1;
  1887. - }
  1888. -
  1889. - /* Tunings are applied next. */
  1890. - for( optind = 0;; )
  1891. - {
  1892. - int c = getopt_long( argc, argv, short_options, long_options, NULL );
  1893. - if( c == -1 )
  1894. - break;
  1895. -
  1896. if( c == OPT_TUNE )
  1897. - {
  1898. - char *s = strtok( optarg, ",./-+" );
  1899. - int psy_tuning_used = 0;
  1900. - while( s )
  1901. - {
  1902. - if( !strncasecmp( s, "film", 4 ) )
  1903. - {
  1904. - if( psy_tuning_used ) goto psy_failure;
  1905. - param->i_deblocking_filter_alphac0 = -1;
  1906. - param->i_deblocking_filter_beta = -1;
  1907. - param->analyse.f_psy_trellis = 0.15;
  1908. - psy_tuning_used = 1;
  1909. - }
  1910. - else if( !strncasecmp( s, "animation", 9 ) )
  1911. - {
  1912. - if( psy_tuning_used ) goto psy_failure;
  1913. - param->i_frame_reference = param->i_frame_reference > 1 ? param->i_frame_reference*2 : 1;
  1914. - param->i_deblocking_filter_alphac0 = 1;
  1915. - param->i_deblocking_filter_beta = 1;
  1916. - param->analyse.f_psy_rd = 0.4;
  1917. - param->rc.f_aq_strength = 0.6;
  1918. - param->i_bframe += 2;
  1919. - psy_tuning_used = 1;
  1920. - }
  1921. - else if( !strncasecmp( s, "grain", 5 ) )
  1922. - {
  1923. - if( psy_tuning_used ) goto psy_failure;
  1924. - param->i_deblocking_filter_alphac0 = -2;
  1925. - param->i_deblocking_filter_beta = -2;
  1926. - param->analyse.f_psy_trellis = 0.25;
  1927. - param->analyse.b_dct_decimate = 0;
  1928. - param->rc.f_pb_factor = 1.1;
  1929. - param->rc.f_ip_factor = 1.1;
  1930. - param->rc.f_aq_strength = 0.5;
  1931. - param->analyse.i_luma_deadzone[0] = 6;
  1932. - param->analyse.i_luma_deadzone[1] = 6;
  1933. - param->rc.f_qcompress = 0.8;
  1934. - psy_tuning_used = 1;
  1935. - }
  1936. - else if( !strncasecmp( s, "psnr", 4 ) )
  1937. - {
  1938. - if( psy_tuning_used ) goto psy_failure;
  1939. - param->rc.i_aq_mode = X264_AQ_NONE;
  1940. - param->analyse.b_psy = 0;
  1941. - psy_tuning_used = 1;
  1942. - }
  1943. - else if( !strncasecmp( s, "ssim", 4 ) )
  1944. - {
  1945. - if( psy_tuning_used ) goto psy_failure;
  1946. - param->rc.i_aq_mode = X264_AQ_AUTOVARIANCE;
  1947. - param->analyse.b_psy = 0;
  1948. - psy_tuning_used = 1;
  1949. - }
  1950. - else if( !strncasecmp( s, "fastdecode", 10 ) )
  1951. - {
  1952. - param->b_deblocking_filter = 0;
  1953. - param->b_cabac = 0;
  1954. - param->analyse.b_weighted_bipred = 0;
  1955. - param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
  1956. - }
  1957. - else if( !strncasecmp( s, "zerolatency", 11 ) )
  1958. - {
  1959. - param->rc.i_lookahead = 0;
  1960. - param->i_sync_lookahead = 0;
  1961. - param->i_bframe = 0;
  1962. - param->b_sliced_threads = 1;
  1963. - }
  1964. - else if( !strncasecmp( s, "touhou", 6 ) )
  1965. - {
  1966. - if( psy_tuning_used ) goto psy_failure;
  1967. - param->i_frame_reference = param->i_frame_reference > 1 ? param->i_frame_reference*2 : 1;
  1968. - param->i_deblocking_filter_alphac0 = -1;
  1969. - param->i_deblocking_filter_beta = -1;
  1970. - param->analyse.f_psy_trellis = 0.2;
  1971. - param->rc.f_aq_strength = 1.3;
  1972. - if( param->analyse.inter & X264_ANALYSE_PSUB16x16 )
  1973. - param->analyse.inter |= X264_ANALYSE_PSUB8x8;
  1974. - psy_tuning_used = 1;
  1975. - }
  1976. - else
  1977. - {
  1978. - fprintf( stderr, "x264 [error]: invalid tune '%s'\n", s );
  1979. - return -1;
  1980. - }
  1981. - if( 0 )
  1982. - {
  1983. -psy_failure:
  1984. - fprintf( stderr, "x264 [warning]: only 1 psy tuning can be used: ignoring tune %s\n", s );
  1985. - }
  1986. - s = strtok( NULL, ",./-+" );
  1987. - }
  1988. - }
  1989. + tune = optarg;
  1990. else if( c == '?' )
  1991. return -1;
  1992. }
  1993.  
  1994. + if( x264_param_default_preset( param, preset, tune ) < 0 ) return -1; /* invalid preset/tune must abort, as before */
  1995. +
  1996. /* Parse command line options */
  1997. for( optind = 0;; )
  1998. {
  1999. @@ -1144,9 +944,6 @@ psy_failure:
  2000. case 'r':
  2001. b_user_ref = 1;
  2002. goto generic_option;
  2003. - case 'p':
  2004. - b_pass1 = atoi( optarg ) == 1;
  2005. - goto generic_option;
  2006. case OPT_FPS:
  2007. b_user_fps = 1;
  2008. param->b_vfr_input = 0;
  2009. @@ -1185,54 +982,12 @@ generic_option:
  2010. }
  2011. }
  2012.  
  2013. - /* Set faster options in case of turbo firstpass. */
  2014. - if( b_turbo && b_pass1 )
  2015. - {
  2016. - param->i_frame_reference = 1;
  2017. - param->analyse.b_transform_8x8 = 0;
  2018. - param->analyse.inter = 0;
  2019. - param->analyse.i_me_method = X264_ME_DIA;
  2020. - param->analyse.i_subpel_refine = X264_MIN( 2, param->analyse.i_subpel_refine );
  2021. - param->analyse.i_trellis = 0;
  2022. - }
  2023. + /* If first pass mode is used, apply faster settings. */
  2024. + if( b_turbo )
  2025. + x264_param_apply_fastfirstpass( param );
  2026.  
  2027. /* Apply profile restrictions. */
  2028. - if( profile )
  2029. - {
  2030. - if( !strcasecmp( profile, "baseline" ) )
  2031. - {
  2032. - param->analyse.b_transform_8x8 = 0;
  2033. - param->b_cabac = 0;
  2034. - param->i_cqm_preset = X264_CQM_FLAT;
  2035. - param->i_bframe = 0;
  2036. - param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
  2037. - if( param->b_interlaced )
  2038. - {
  2039. - fprintf( stderr, "x264 [error]: baseline profile doesn't support interlacing\n" );
  2040. - return -1;
  2041. - }
  2042. - }
  2043. - else if( !strcasecmp( profile, "main" ) )
  2044. - {
  2045. - param->analyse.b_transform_8x8 = 0;
  2046. - param->i_cqm_preset = X264_CQM_FLAT;
  2047. - }
  2048. - else if( !strcasecmp( profile, "high" ) )
  2049. - {
  2050. - /* Default */
  2051. - }
  2052. - else
  2053. - {
  2054. - fprintf( stderr, "x264 [error]: invalid profile: %s\n", profile );
  2055. - return -1;
  2056. - }
  2057. - if( (param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant == 0) ||
  2058. - (param->rc.i_rc_method == X264_RC_CRF && param->rc.f_rf_constant == 0) )
  2059. - {
  2060. - fprintf( stderr, "x264 [error]: %s profile doesn't support lossless\n", profile );
  2061. - return -1;
  2062. - }
  2063. - }
  2064. + if( x264_param_apply_profile( param, profile ) < 0 ) return -1; /* invalid/incompatible profile must abort, as before */
  2065.  
  2066. /* Get the file name */
  2067. if( optind > argc - 1 || !output_filename )
  2068. diff --git a/x264.h b/x264.h
  2069. index e7d19b7..f317e98 100644
  2070. --- a/x264.h
  2071. +++ b/x264.h
  2072. @@ -35,14 +35,14 @@
  2073.  
  2074. #include <stdarg.h>
  2075.  
  2076. -#define X264_BUILD 85
  2077. +#define X264_BUILD 86
  2078.  
  2079. /* x264_t:
  2080. * opaque handler for encoder */
  2081. typedef struct x264_t x264_t;
  2082.  
  2083. /****************************************************************************
  2084. - * Initialisation structure and function.
  2085. + * Encoder parameters
  2086. ****************************************************************************/
  2087. /* CPU flags
  2088. */
  2089. @@ -332,6 +332,10 @@ typedef struct x264_param_t
  2090. void (*param_free)( void* );
  2091. } x264_param_t;
  2092.  
  2093. +/****************************************************************************
  2094. + * H.264 level restriction information
  2095. + ****************************************************************************/
  2096. +
  2097. typedef struct {
  2098. int level_idc;
  2099. int mbps; /* max macroblock processing rate (macroblocks/sec) */
  2100. @@ -350,6 +354,10 @@ typedef struct {
  2101. /* all of the levels defined in the standard, terminated by .level_idc=0 */
  2102. extern const x264_level_t x264_levels[];
  2103.  
  2104. +/****************************************************************************
  2105. + * Basic parameter handling functions
  2106. + ****************************************************************************/
  2107. +
  2108. /* x264_param_default:
  2109. * fill x264_param_t with default values and do CPU detection */
  2110. void x264_param_default( x264_param_t * );
  2111. @@ -366,15 +374,73 @@ void x264_param_default( x264_param_t * );
  2112. int x264_param_parse( x264_param_t *, const char *name, const char *value );
  2113.  
  2114. /****************************************************************************
  2115. - * Picture structures and functions.
  2116. + * Advanced parameter handling functions
  2117. + ****************************************************************************/
  2118. +
  2119. +/* These functions expose the full power of x264's preset-tune-profile system for
  2120. + * easy adjustment of large numbers of internal parameters.
  2121. + *
  2122. + * In order to replicate x264CLI's option handling, these functions MUST be called
  2123. + * in the following order:
  2124. + * 1) x264_param_default_preset
  2125. + * 2) Custom user options (via param_parse or directly assigned variables)
  2126. + * 3) x264_param_apply_fastfirstpass
  2127. + * 4) x264_param_apply_profile
  2128. + *
  2129. + * Additionally, x264CLI does not apply step 3 if the preset chosen is "placebo"
  2130. + * or --slow-firstpass is set. */
  2131. +
  2132. +/* x264_param_default_preset:
  2133. + * The same as x264_param_default, but also use the passed preset and tune
  2134. + * to modify the default settings.
  2135. + * (either can be NULL, which implies no preset or no tune, respectively)
  2136. + *
  2137. + * Currently available presets are, ordered from fastest to slowest: */
  2138. +static const char * const x264_preset_names[] = { "ultrafast", "veryfast", "faster", "fast", "medium", "slow", "slower", "veryslow", "placebo", 0 };
  2139. +
  2140. +/* Warning: the speed of these presets scales dramatically. Ultrafast is a full
  2141. + * 100 times faster than placebo!
  2142. + *
  2143. + * Currently available tunings are: */
  2144. +static const char * const x264_tune_names[] = { "film", "animation", "grain", "psnr", "ssim", "fastdecode", "zerolatency", 0 };
  2145. +
  2146. +/* Multiple tunings can be used if separated by a delimiter in ",./-+",
  2147. + * however multiple psy tunings cannot be used.
  2148. + * film, animation, grain, psnr, and ssim are psy tunings.
  2149. + *
  2150. + * returns 0 on success, negative on failure (e.g. invalid preset/tune name). */
  2151. +int x264_param_default_preset( x264_param_t *, const char *preset, const char *tune );
  2152. +
  2153. +/* x264_param_apply_fastfirstpass:
  2154. + * If first-pass mode is set (rc.b_stat_read == 0, rc.b_stat_write == 1),
  2155. + * modify the encoder settings to disable options generally not useful on
  2156. + * the first pass. */
  2157. +void x264_param_apply_fastfirstpass( x264_param_t * );
  2158. +
  2159. +/* x264_param_apply_profile:
  2160. + * Applies the restrictions of the given profile.
  2161. + * Currently available profiles are, from most to least restrictive: */
  2162. +static const char * const x264_profile_names[] = { "baseline", "main", "high", 0 };
  2163. +
  2164. +/* (can be NULL, in which case the function will do nothing)
  2165. + *
  2166. + * Does NOT guarantee that the given profile will be used: if the restrictions
  2167. + * of "High" are applied to settings that are already Baseline-compatible, the
  2168. + * stream will remain baseline. In short, it does not increase settings, only
  2169. + * decrease them.
  2170. + *
  2171. + * returns 0 on success, negative on failure (e.g. invalid profile name). */
  2172. +int x264_param_apply_profile( x264_param_t *, const char *profile );
  2173. +
  2174. +/****************************************************************************
  2175. + * Picture structures and functions
  2176. ****************************************************************************/
  2177. typedef struct
  2178. {
  2179. - int i_csp;
  2180. -
  2181. - int i_plane;
  2182. - int i_stride[4];
  2183. - uint8_t *plane[4];
  2184. + int i_csp; /* Colorspace */
  2185. + int i_plane; /* Number of image planes */
  2186. + int i_stride[4]; /* Strides for each plane */
  2187. + uint8_t *plane[4]; /* Pointers to each plane */
  2188. } x264_image_t;
  2189.  
  2190. typedef struct
  2191. @@ -421,9 +487,9 @@ int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_heigh
  2192. void x264_picture_clean( x264_picture_t *pic );
  2193.  
  2194. /****************************************************************************
  2195. - * NAL structure and functions:
  2196. + * NAL structure and functions
  2197. ****************************************************************************/
  2198. -/* nal */
  2199. +
  2200. enum nal_unit_type_e
  2201. {
  2202. NAL_UNKNOWN = 0,
  2203. @@ -465,7 +531,7 @@ typedef struct
  2204. } x264_nal_t;
  2205.  
  2206. /****************************************************************************
  2207. - * Encoder functions:
  2208. + * Encoder functions
  2209. ****************************************************************************/
  2210.  
  2211. /* Force a link error in the case of linking against an incompatible API version.
  2212. @@ -497,16 +563,16 @@ int x264_encoder_reconfig( x264_t *, x264_param_t * );
  2213. void x264_encoder_parameters( x264_t *, x264_param_t * );
  2214. /* x264_encoder_headers:
  2215. * return the SPS and PPS that will be used for the whole stream.
  2216. - * if i_nal > 0, returns the total size of all NAL payloads.
  2217. + * *pi_nal is the number of NAL units outputted in pp_nal.
  2218. * returns negative on error.
  2219. * the payloads of all output NALs are guaranteed to be sequential in memory. */
  2220. -int x264_encoder_headers( x264_t *, x264_nal_t **, int * );
  2221. +int x264_encoder_headers( x264_t *, x264_nal_t **pp_nal, int *pi_nal );
  2222. /* x264_encoder_encode:
  2223. * encode one picture.
  2224. - * if i_nal > 0, returns the total size of all NAL payloads.
  2225. + * *pi_nal is the number of NAL units outputted in pp_nal.
  2226. * returns negative on error, zero if no NAL units returned.
  2227. * the payloads of all output NALs are guaranteed to be sequential in memory. */
  2228. -int x264_encoder_encode ( x264_t *, x264_nal_t **, int *, x264_picture_t *, x264_picture_t * );
  2229. +int x264_encoder_encode( x264_t *, x264_nal_t **pp_nal, int *pi_nal, x264_picture_t *pic_in, x264_picture_t *pic_out );
  2230. /* x264_encoder_close:
  2231. * close an encoder handler */
  2232. void x264_encoder_close ( x264_t * );
  2233. --
  2234. 1.6.1.2
  2235.  
  2236.  
  2237. From cb7143299578377dbe1e11a93c074d0890d487e0 Mon Sep 17 00:00:00 2001
  2238. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  2239. Date: Sun, 21 Feb 2010 03:56:06 -0800
  2240. Subject: [PATCH 12/16] Make b-pyramid normal the default
  2241. Now that b-pyramid works with MB-tree and is spec compliant, there's no real reason not to make it default.
  2242. Improves compression 0-5% depending on the video.
  2243. Also allow 0/1/2 to be used as aliases for none/strict/normal (for conciseness).
  2244.  
  2245. ---
  2246. common/common.c | 9 ++++++++-
  2247. x264.h | 2 +-
  2248. 2 files changed, 9 insertions(+), 2 deletions(-)
  2249.  
  2250. diff --git a/common/common.c b/common/common.c
  2251. index a99b65b..2faf139 100644
  2252. --- a/common/common.c
  2253. +++ b/common/common.c
  2254. @@ -75,7 +75,7 @@ void x264_param_default( x264_param_t *param )
  2255. param->i_scenecut_threshold = 40;
  2256. param->i_bframe_adaptive = X264_B_ADAPT_FAST;
  2257. param->i_bframe_bias = 0;
  2258. - param->i_bframe_pyramid = 0;
  2259. + param->i_bframe_pyramid = X264_B_PYRAMID_NORMAL;
  2260. param->b_interlaced = 0;
  2261. param->b_constrained_intra = 0;
  2262.  
  2263. @@ -637,7 +637,14 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
  2264. OPT("b-bias")
  2265. p->i_bframe_bias = atoi(value);
  2266. OPT("b-pyramid")
  2267. + {
  2268. b_error |= parse_enum( value, x264_b_pyramid_names, &p->i_bframe_pyramid );
  2269. + if( b_error )
  2270. + {
  2271. + b_error = 0;
  2272. + p->i_bframe_pyramid = atoi(value);
  2273. + }
  2274. + }
  2275. OPT("nf")
  2276. p->b_deblocking_filter = !atobool(value);
  2277. OPT2("filter", "deblock")
  2278. diff --git a/x264.h b/x264.h
  2279. index f317e98..dec296c 100644
  2280. --- a/x264.h
  2281. +++ b/x264.h
  2282. @@ -35,7 +35,7 @@
  2283.  
  2284. #include <stdarg.h>
  2285.  
  2286. -#define X264_BUILD 86
  2287. +#define X264_BUILD 87
  2288.  
  2289. /* x264_t:
  2290. * opaque handler for encoder */
  2291. --
  2292. 1.6.1.2
  2293.  
  2294.  
  2295. From edebcf0074105c058c60e33b5bf7323743eb19e6 Mon Sep 17 00:00:00 2001
  2296. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  2297. Date: Sun, 21 Feb 2010 13:20:19 -0800
  2298. Subject: [PATCH 13/16] Abide by the MinCR level limit
  2299. Some Blu-ray analyzers were complaining about this.
  2300.  
  2301. ---
  2302. encoder/ratecontrol.c | 29 +++++++++++++++++++++++++++--
  2303. encoder/set.c | 32 ++++++++++++++++----------------
  2304. x264.h | 3 ++-
  2305. 3 files changed, 45 insertions(+), 19 deletions(-)
  2306.  
  2307. diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
  2308. index 3d86aaa..d0fdb50 100644
  2309. --- a/encoder/ratecontrol.c
  2310. +++ b/encoder/ratecontrol.c
  2311. @@ -136,6 +136,7 @@ struct x264_ratecontrol_t
  2312. /* MBRC stuff */
  2313. float frame_size_estimated; /* Access to this variable must be atomic: double is
  2314. * not atomic on all arches we care about */
  2315. + double frame_size_maximum; /* Maximum frame size due to MinCR */
  2316. double frame_size_planned;
  2317. double slice_size_planned;
  2318. double max_frame_error;
  2319. @@ -1039,6 +1040,24 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp, int overhead )
  2320. memset( h->fdec->i_row_bits, 0, h->sps->i_mb_height * sizeof(int) );
  2321. rc->row_pred = &rc->row_preds[h->sh.i_type];
  2322. update_vbv_plan( h, overhead );
  2323. +
  2324. + const x264_level_t *l = x264_levels;
  2325. + while( l->level_idc != 0 && l->level_idc != h->param.i_level_idc )
  2326. + l++;
  2327. +
  2328. + /* The spec has a bizarre special case for the first frame. */
  2329. + if( h->i_frame == 0 )
  2330. + {
  2331. + //384 * ( Max( PicSizeInMbs, fR * MaxMBPS ) + MaxMBPS * ( tr( 0 ) - tr,n( 0 ) ) ) / MinCR
  2332. + double fr = 1. / 172;
  2333. + int pic_size_in_mbs = h->sps->i_mb_width * h->sps->i_mb_height;
  2334. + rc->frame_size_maximum = 384 * 8 * X264_MAX( pic_size_in_mbs, fr*l->mbps ) / l->mincr;
  2335. + }
  2336. + else
  2337. + {
  2338. + //384 * MaxMBPS * ( tr( n ) - tr( n - 1 ) ) / MinCR
  2339. + rc->frame_size_maximum = 384 * 8 * (1 / rc->fps) * l->mbps / l->mincr;
  2340. + }
  2341. }
  2342.  
  2343. if( h->sh.i_type != SLICE_TYPE_B )
  2344. @@ -1220,9 +1239,10 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
  2345. b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
  2346. }
  2347.  
  2348. - /* avoid VBV underflow */
  2349. + /* avoid VBV underflow or MinCR violation */
  2350. while( (rc->qpm < h->param.rc.i_qp_max)
  2351. - && (rc->buffer_fill - b1 < rc->buffer_rate * rc->max_frame_error) )
  2352. + && ((rc->buffer_fill - b1 < rc->buffer_rate * rc->max_frame_error) ||
  2353. + (rc->frame_size_maximum - b1 < rc->frame_size_maximum * rc->max_frame_error)))
  2354. {
  2355. rc->qpm ++;
  2356. b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
  2357. @@ -1677,6 +1697,11 @@ static double clip_qscale( x264_t *h, int pict_type, double q )
  2358. q = X264_MAX( q0, q );
  2359. }
  2360.  
  2361. + /* Apply MinCR restrictions */
  2362. + double bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
  2363. + if( bits > rcc->frame_size_maximum )
  2364. + q *= bits / rcc->frame_size_maximum;
  2365. +
  2366. /* Check B-frame complexity, and use up any bits that would
  2367. * overflow before the next P-frame. */
  2368. if( h->sh.i_type == SLICE_TYPE_P && !rcc->single_frame_vbv )
  2369. diff --git a/encoder/set.c b/encoder/set.c
  2370. index f79919b..03a6dee 100644
  2371. --- a/encoder/set.c
  2372. +++ b/encoder/set.c
  2373. @@ -536,22 +536,22 @@ fail:
  2374.  
  2375. const x264_level_t x264_levels[] =
  2376. {
  2377. - { 10, 1485, 99, 152064, 64, 175, 64, 64, 0, 0, 0, 1 },
  2378. -// {"1b", 1485, 99, 152064, 128, 350, 64, 64, 0, 0, 0, 1 },
  2379. - { 11, 3000, 396, 345600, 192, 500, 128, 64, 0, 0, 0, 1 },
  2380. - { 12, 6000, 396, 912384, 384, 1000, 128, 64, 0, 0, 0, 1 },
  2381. - { 13, 11880, 396, 912384, 768, 2000, 128, 64, 0, 0, 0, 1 },
  2382. - { 20, 11880, 396, 912384, 2000, 2000, 128, 64, 0, 0, 0, 1 },
  2383. - { 21, 19800, 792, 1824768, 4000, 4000, 256, 64, 0, 0, 0, 0 },
  2384. - { 22, 20250, 1620, 3110400, 4000, 4000, 256, 64, 0, 0, 0, 0 },
  2385. - { 30, 40500, 1620, 3110400, 10000, 10000, 256, 32, 22, 0, 1, 0 },
  2386. - { 31, 108000, 3600, 6912000, 14000, 14000, 512, 16, 60, 1, 1, 0 },
  2387. - { 32, 216000, 5120, 7864320, 20000, 20000, 512, 16, 60, 1, 1, 0 },
  2388. - { 40, 245760, 8192, 12582912, 20000, 25000, 512, 16, 60, 1, 1, 0 },
  2389. - { 41, 245760, 8192, 12582912, 50000, 62500, 512, 16, 24, 1, 1, 0 },
  2390. - { 42, 522240, 8704, 13369344, 50000, 62500, 512, 16, 24, 1, 1, 1 },
  2391. - { 50, 589824, 22080, 42393600, 135000, 135000, 512, 16, 24, 1, 1, 1 },
  2392. - { 51, 983040, 36864, 70778880, 240000, 240000, 512, 16, 24, 1, 1, 1 },
  2393. + { 10, 1485, 99, 152064, 64, 175, 64, 64, 0, 2, 0, 0, 1 },
  2394. +// {"1b", 1485, 99, 152064, 128, 350, 64, 64, 0, 2, 0, 0, 1 },
  2395. + { 11, 3000, 396, 345600, 192, 500, 128, 64, 0, 2, 0, 0, 1 },
  2396. + { 12, 6000, 396, 912384, 384, 1000, 128, 64, 0, 2, 0, 0, 1 },
  2397. + { 13, 11880, 396, 912384, 768, 2000, 128, 64, 0, 2, 0, 0, 1 },
  2398. + { 20, 11880, 396, 912384, 2000, 2000, 128, 64, 0, 2, 0, 0, 1 },
  2399. + { 21, 19800, 792, 1824768, 4000, 4000, 256, 64, 0, 2, 0, 0, 0 },
  2400. + { 22, 20250, 1620, 3110400, 4000, 4000, 256, 64, 0, 2, 0, 0, 0 },
  2401. + { 30, 40500, 1620, 3110400, 10000, 10000, 256, 32, 22, 2, 0, 1, 0 },
  2402. + { 31, 108000, 3600, 6912000, 14000, 14000, 512, 16, 60, 4, 1, 1, 0 },
  2403. + { 32, 216000, 5120, 7864320, 20000, 20000, 512, 16, 60, 4, 1, 1, 0 },
  2404. + { 40, 245760, 8192, 12582912, 20000, 25000, 512, 16, 60, 4, 1, 1, 0 },
  2405. + { 41, 245760, 8192, 12582912, 50000, 62500, 512, 16, 24, 2, 1, 1, 0 },
  2406. + { 42, 522240, 8704, 13369344, 50000, 62500, 512, 16, 24, 2, 1, 1, 1 },
  2407. + { 50, 589824, 22080, 42393600, 135000, 135000, 512, 16, 24, 2, 1, 1, 1 },
  2408. + { 51, 983040, 36864, 70778880, 240000, 240000, 512, 16, 24, 2, 1, 1, 1 },
  2409. { 0 }
  2410. };
  2411.  
  2412. diff --git a/x264.h b/x264.h
  2413. index dec296c..7474a50 100644
  2414. --- a/x264.h
  2415. +++ b/x264.h
  2416. @@ -35,7 +35,7 @@
  2417.  
  2418. #include <stdarg.h>
  2419.  
  2420. -#define X264_BUILD 87
  2421. +#define X264_BUILD 88
  2422.  
  2423. /* x264_t:
  2424. * opaque handler for encoder */
  2425. @@ -346,6 +346,7 @@ typedef struct {
  2426. int mv_range; /* max vertical mv component range (pixels) */
  2427. int mvs_per_2mb; /* max mvs per 2 consecutive mbs. */
  2428. int slice_rate; /* ?? */
  2429. + int mincr; /* min compression ratio */
  2430. int bipred8x8; /* limit bipred to >=8x8 */
  2431. int direct8x8; /* limit b_direct to >=8x8 */
  2432. int frame_only; /* forbid interlacing */
  2433. --
  2434. 1.6.1.2
  2435.  
  2436.  
  2437. From 1df2cf28b68242423638468f94ed742105f40d28 Mon Sep 17 00:00:00 2001
  2438. From: Anton Mitrofanov <BugMaster@narod.ru>
  2439. Date: Sun, 21 Feb 2010 13:21:11 -0800
  2440. Subject: [PATCH 14/16] New algorithm for AQ mode 2
  2441. Combines the auto-variance behavior of AQ mode 2 with a new var^0.25 formula in place of the old log(var) one.
  2442. Works better with MB-tree than the old AQ mode 2 and should give higher SSIM.
  2443.  
  2444. ---
  2445. encoder/ratecontrol.c | 9 ++++++---
  2446. 1 files changed, 6 insertions(+), 3 deletions(-)
  2447.  
  2448. diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
  2449. index d0fdb50..8b47e29 100644
  2450. --- a/encoder/ratecontrol.c
  2451. +++ b/encoder/ratecontrol.c
  2452. @@ -246,17 +246,20 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
  2453.  
  2454. if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
  2455. {
  2456. + float avg_adj_pow2 = 0.f;
  2457. for( mb_y = 0; mb_y < h->sps->i_mb_height; mb_y++ )
  2458. for( mb_x = 0; mb_x < h->sps->i_mb_width; mb_x++ )
  2459. {
  2460. uint32_t energy = ac_energy_mb( h, mb_x, mb_y, frame );
  2461. - float qp_adj = x264_log2( energy + 2 );
  2462. - qp_adj *= qp_adj;
  2463. + float qp_adj = powf( energy + 1, 0.125f );
  2464. frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
  2465. avg_adj += qp_adj;
  2466. + avg_adj_pow2 += qp_adj * qp_adj;
  2467. }
  2468. avg_adj /= h->mb.i_mb_count;
  2469. - strength = h->param.rc.f_aq_strength * avg_adj * (1.f / 6000.f);
  2470. + avg_adj_pow2 /= h->mb.i_mb_count;
  2471. + strength = h->param.rc.f_aq_strength * avg_adj;
  2472. + avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - 14.f) / avg_adj;
  2473. }
  2474. else
  2475. strength = h->param.rc.f_aq_strength * 1.0397f;
  2476. --
  2477. 1.6.1.2
  2478.  
  2479.  
  2480. From b487fb0af745cdc276e059d58fb2b2590203fe85 Mon Sep 17 00:00:00 2001
  2481. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  2482. Date: Sun, 21 Feb 2010 17:30:52 -0800
  2483. Subject: [PATCH 15/16] Use short startcodes whenever possible
  2484. Saves one byte for every slice beyond the first in each frame.
  2485. Only applies to Annex-B output mode.
  2486.  
  2487. ---
  2488. common/common.c | 6 +++---
  2489. common/common.h | 2 +-
  2490. encoder/encoder.c | 12 +++++++++---
  2491. 3 files changed, 13 insertions(+), 7 deletions(-)
  2492.  
  2493. diff --git a/common/common.c b/common/common.c
  2494. index 2faf139..0410588 100644
  2495. --- a/common/common.c
  2496. +++ b/common/common.c
  2497. @@ -985,17 +985,17 @@ void x264_picture_clean( x264_picture_t *pic )
  2498. /****************************************************************************
  2499. * x264_nal_encode:
  2500. ****************************************************************************/
  2501. -int x264_nal_encode( uint8_t *dst, int b_annexb, x264_nal_t *nal )
  2502. +int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_startcode )
  2503. {
  2504. uint8_t *src = nal->p_payload;
  2505. uint8_t *end = nal->p_payload + nal->i_payload;
  2506. uint8_t *orig_dst = dst;
  2507. int i_count = 0, size;
  2508.  
  2509. - /* long nal start code (we always use long ones) */
  2510. if( b_annexb )
  2511. {
  2512. - *dst++ = 0x00;
  2513. + if( b_long_startcode )
  2514. + *dst++ = 0x00;
  2515. *dst++ = 0x00;
  2516. *dst++ = 0x00;
  2517. *dst++ = 0x01;
  2518. diff --git a/common/common.h b/common/common.h
  2519. index 413b82f..d2b53b0 100644
  2520. --- a/common/common.h
  2521. +++ b/common/common.h
  2522. @@ -121,7 +121,7 @@ int64_t x264_mdate( void );
  2523. * the encoding options */
  2524. char *x264_param2string( x264_param_t *p, int b_res );
  2525.  
  2526. -int x264_nal_encode( uint8_t *dst, int b_annexb, x264_nal_t *nal );
  2527. +int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_startcode );
  2528.  
  2529. /* log */
  2530. void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
  2531. diff --git a/encoder/encoder.c b/encoder/encoder.c
  2532. index 89bf457..c76938c 100644
  2533. --- a/encoder/encoder.c
  2534. +++ b/encoder/encoder.c
  2535. @@ -1228,10 +1228,14 @@ static int x264_encoder_encapsulate_nals( x264_t *h )
  2536. }
  2537.  
  2538. uint8_t *nal_buffer = h->nal_buffer;
  2539. + int long_startcode = 1;
  2540.  
  2541. for( i = 0; i < h->out.i_nal; i++ )
  2542. {
  2543. - int size = x264_nal_encode( nal_buffer, h->param.b_annexb, &h->out.nal[i] );
  2544. + int size = x264_nal_encode( nal_buffer, &h->out.nal[i], h->param.b_annexb, long_startcode );
  2545. + /* Don't use long startcodes for any slice beyond the first. */
  2546. + if( h->out.nal[i].i_type >= NAL_SLICE && h->out.nal[i].i_type <= NAL_SLICE_IDR )
  2547. + long_startcode = 0;
  2548. h->out.nal[i].i_payload = size;
  2549. h->out.nal[i].p_payload = nal_buffer;
  2550. nal_buffer += size;
  2551. @@ -1715,8 +1719,10 @@ static int x264_slice_write( x264_t *h )
  2552. bs_t bs_bak;
  2553. x264_cabac_t cabac_bak;
  2554. uint8_t cabac_prevbyte_bak = 0; /* Shut up GCC. */
  2555. - /* Assume no more than 3 bytes of NALU escaping. */
  2556. - int slice_max_size = h->param.i_slice_max_size > 0 ? (h->param.i_slice_max_size-3-NALU_OVERHEAD)*8 : INT_MAX;
  2557. + /* Assume no more than 3 bytes of NALU escaping.
  2558. + * Slices other than the first use a 3-byte startcode. */
  2559. + int overhead_guess = (NALU_OVERHEAD - (h->param.b_annexb && h->sh.i_first_mb)) + 3;
  2560. + int slice_max_size = h->param.i_slice_max_size > 0 ? (h->param.i_slice_max_size-overhead_guess)*8 : INT_MAX;
  2561. int starting_bits = bs_pos(&h->out.bs);
  2562. bs_realign( &h->out.bs );
  2563.  
  2564. --
  2565. 1.6.1.2
  2566.  
  2567.  
  2568. From 81c1ae7de624e837cb3cc058ea0d8e8d3dccbeb3 Mon Sep 17 00:00:00 2001
  2569. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  2570. Date: Mon, 22 Feb 2010 17:33:17 -0800
  2571. Subject: [PATCH 16/16] Faster probe_skip, 2x2 DC transform handling
  2572. Move the 2x2 DC DCT into the dct_dc asm function to avoid some store-to-load forwarding penalties and extra register loads.
  2573. Use dct_dc as part of the early termination in probe_skip.
  2574. x86 asm partially by Holger Lubitz.
  2575. ARM NEON asm by David Conrad.
  2576.  
  2577. ---
  2578. common/arm/dct-a.S | 14 +++++++++++---
  2579. common/dct.c | 11 +++++++++++
  2580. common/x86/dct-a.asm | 50 ++++++++++++++++++++++++++++++++++----------------
  2581. encoder/macroblock.c | 13 +++++++++----
  2582. 4 files changed, 65 insertions(+), 23 deletions(-)
  2583.  
  2584. diff --git a/common/arm/dct-a.S b/common/arm/dct-a.S
  2585. index 0ed7238..3b9fab9 100644
  2586. --- a/common/arm/dct-a.S
  2587. +++ b/common/arm/dct-a.S
  2588. @@ -639,12 +639,20 @@ function x264_sub8x8_dct_dc_neon
  2589. vld1.64 {d30}, [r1,:64], r3
  2590. vadd.s16 q1, q12, q13
  2591. vld1.64 {d31}, [r2,:64], ip
  2592. - vpadd.s16 d0, d0, d1
  2593. - vadd.s16 q1, q1, q14
  2594. vsubl.u8 q15, d30, d31
  2595. + vadd.s16 q1, q1, q14
  2596. +
  2597. + vadd.s16 d4, d0, d1
  2598. vadd.s16 q1, q1, q15
  2599. - vpadd.s16 d2, d2, d3
  2600. + vsub.s16 d5, d0, d1
  2601. + vadd.s16 d6, d2, d3
  2602. + vsub.s16 d7, d2, d3
  2603. + vadd.s16 q0, q2, q3
  2604. + vsub.s16 q1, q2, q3
  2605. +
  2606. vpadd.s16 d0, d0, d2
  2607. + vpadd.s16 d1, d1, d3
  2608. + vpadd.s16 d0, d0, d1
  2609. vst1.64 {d0}, [r0,:64]
  2610. bx lr
  2611. .endfunc
  2612. diff --git a/common/dct.c b/common/dct.c
  2613. index aa83ef4..55f78a5 100644
  2614. --- a/common/dct.c
  2615. +++ b/common/dct.c
  2616. @@ -184,10 +184,21 @@ static int sub4x4_dct_dc( uint8_t *pix1, uint8_t *pix2 )
  2617.  
  2618. static void sub8x8_dct_dc( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 )
  2619. {
  2620. + int d0, d1, d2, d3;
  2621. dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
  2622. dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
  2623. dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
  2624. dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
  2625. +
  2626. + /* 2x2 DC transform */
  2627. + d0 = dct[0] + dct[1];
  2628. + d1 = dct[2] + dct[3];
  2629. + d2 = dct[0] - dct[1];
  2630. + d3 = dct[2] - dct[3];
  2631. + dct[0] = d0 + d1;
  2632. + dct[2] = d2 + d3;
  2633. + dct[1] = d0 - d1;
  2634. + dct[3] = d2 - d3;
  2635. }
  2636.  
  2637. static void add4x4_idct( uint8_t *p_dst, int16_t dct[16] )
  2638. diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
  2639. index 618433c..5dd51e5 100644
  2640. --- a/common/x86/dct-a.asm
  2641. +++ b/common/x86/dct-a.asm
  2642. @@ -509,28 +509,43 @@ cglobal x264_add16x16_idct_dc_ssse3, 2,2,8
  2643. movq m1, m2
  2644. punpckldq m2, m3
  2645. punpckhdq m1, m3
  2646. - psadbw %1, m7
  2647. - psadbw %2, m7
  2648. - psadbw m2, m7
  2649. - psadbw m1, m7
  2650. + pxor m3, m3
  2651. + psadbw %1, m3
  2652. + psadbw %2, m3
  2653. + psadbw m2, m3
  2654. + psadbw m1, m3
  2655. psubw %1, m2
  2656. psubw %2, m1
  2657. %endmacro
  2658.  
  2659. +%macro DCT2x2 2 ; reg s1/s0 (!=m1), reg s3/s2
  2660. + pshufw mm1, %1, 10100000b ; s1 s1 s0 s0
  2661. + pshufw mm0, %2, 10110001b ; s3 __ s2 __
  2662. + paddw mm1, %2 ; s1 s13 s0 s02
  2663. + psubw mm1, mm0 ; d13 s13 d02 s02
  2664. + pshufw mm0, mm1, 01000100b ; d02 s02 d02 s02
  2665. + psrlq mm1, 32 ; __ __ d13 s13
  2666. + paddw mm0, mm1 ; d02 s02 d02+d13 s02+s13
  2667. + psllq mm1, 32 ; d13 s13
  2668. + psubw mm0, mm1 ; d02-d13 s02-s13 d02+d13 s02+s13
  2669. +%endmacro
  2670. +
  2671. INIT_MMX
  2672. cglobal x264_sub8x8_dct_dc_mmxext, 3,3
  2673. - pxor m7, m7
  2674. - call .loop
  2675. - add r1, FENC_STRIDE*4
  2676. - add r2, FDEC_STRIDE*4
  2677. - add r0, 4
  2678. -.loop:
  2679. DCTDC_2ROW_MMX m0, m4, 0
  2680. DCTDC_2ROW_MMX m5, m6, 2
  2681. paddw m0, m5
  2682. paddw m4, m6
  2683. - punpcklwd m0, m4
  2684. - movd [r0], m0
  2685. + punpckldq m0, m4
  2686. + add r1, FENC_STRIDE*4
  2687. + add r2, FDEC_STRIDE*4
  2688. + DCTDC_2ROW_MMX m7, m4, 0
  2689. + DCTDC_2ROW_MMX m5, m6, 2
  2690. + paddw m7, m5
  2691. + paddw m4, m6
  2692. + punpckldq m7, m4
  2693. + DCT2x2 m0, m7
  2694. + movq [r0], m0
  2695. ret
  2696.  
  2697. INIT_XMM
  2698. @@ -558,13 +573,16 @@ cglobal x264_sub8x8_dct_dc_sse2, 3,3,8
  2699. DCTDC_2ROW_SSE2 2, 1, m4
  2700. add r1, FENC_STRIDE*4
  2701. add r2, FDEC_STRIDE*4
  2702. - psubq m4, m6
  2703. + psubd m4, m6
  2704. DCTDC_2ROW_SSE2 0, 0, m5
  2705. DCTDC_2ROW_SSE2 2, 1, m5
  2706. - psubq m5, m6
  2707. + psubd m5, m6
  2708. packssdw m4, m5
  2709. - packssdw m4, m4
  2710. - movq [r0], m4
  2711. + movhlps m5, m4
  2712. + movdq2q mm0, m4
  2713. + movdq2q mm7, m5
  2714. + DCT2x2 mm0, mm7
  2715. + movq [r0], mm0
  2716. RET
  2717.  
  2718. ;-----------------------------------------------------------------------------
  2719. diff --git a/encoder/macroblock.c b/encoder/macroblock.c
  2720. index f67a898..0be6201 100644
  2721. --- a/encoder/macroblock.c
  2722. +++ b/encoder/macroblock.c
  2723. @@ -365,7 +365,6 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
  2724. if( ssd[ch] > thresh )
  2725. {
  2726. h->dctf.sub8x8_dct_dc( dct2x2, h->mb.pic.p_fenc[1+ch], h->mb.pic.p_fdec[1+ch] );
  2727. - dct2x2dc_dconly( dct2x2 );
  2728. if( h->mb.b_trellis )
  2729. nz_dc = x264_quant_dc_trellis( h, dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter, 1 );
  2730. else
  2731. @@ -980,10 +979,10 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
  2732. if( ssd < thresh )
  2733. continue;
  2734.  
  2735. - h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
  2736. + /* The vast majority of chroma checks will terminate during the DC check or the higher
  2737. + * threshold check, so we can save time by doing a DC-only DCT. */
  2738. + h->dctf.sub8x8_dct_dc( dct2x2, p_src, p_dst );
  2739.  
  2740. - /* calculate dct DC */
  2741. - dct2x2dc( dct2x2, dct4x4 );
  2742. if( h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4PC][i_qp][0]>>1, h->quant4_bias[CQM_4PC][i_qp][0]<<1 ) )
  2743. return 0;
  2744.  
  2745. @@ -991,9 +990,15 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
  2746. if( ssd < thresh*4 )
  2747. continue;
  2748.  
  2749. + h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
  2750. +
  2751. /* calculate dct coeffs */
  2752. for( i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
  2753. {
  2754. + /* We don't need to zero the DC coefficient before quantization because we already
  2755. + * checked that all the DCs were zero above at twice the precision that quant4x4
  2756. + * uses. This applies even though the DC here is being quantized before the 2x2
  2757. + * transform. */
  2758. if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ) )
  2759. continue;
  2760. h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
  2761. --
  2762. 1.6.1.2
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement