Advertisement
Guest User

Untitled

a guest
May 4th, 2017
598
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 212.65 KB | None | 0 0
  1. From f555cf3758f46f3c4f7a2f05094b16f8f3c25a27 Mon Sep 17 00:00:00 2001
  2. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  3. Date: Fri, 29 Jan 2010 02:40:41 -0800
  4. Subject: [PATCH 01/26] Add ability to adjust ratecontrol parameters on the fly
  5. encoder_reconfig and x264_picture_t->param can now be used to change ratecontrol parameters.
  6. This is extraordinarily useful in certain streaming situations where the encoder needs to adapt the bitrate to network circumstances.
  7.  
  8. What can be changed:
  9. 1) CRF can be adjusted if in CRF mode.
  10. 2) VBV maxrate and bufsize can be adjusted if in VBV mode.
  11. 3) Bitrate can be adjusted if in CBR mode.
  12. However, x264 cannot switch between modes and cannot change bitrate in ABR mode.
  13.  
  14. Also fix a bug where the x264_picture_t->param reconfig method would not always be frame-exact.
  15.  
  16. Commit sponsored by SayMama video calling.
  17. ---
  18. encoder/encoder.c | 56 +++++++++++++++++++-
  19. encoder/ratecontrol.c | 137 +++++++++++++++++++++++-------------------------
  20. encoder/ratecontrol.h | 2 +
  21. x264.h | 7 ++-
  22. 4 files changed, 126 insertions(+), 76 deletions(-)
  23.  
  24. diff --git a/encoder/encoder.c b/encoder/encoder.c
  25. index d873cd0..008d0f2 100644
  26. --- a/encoder/encoder.c
  27. +++ b/encoder/encoder.c
  28. @@ -507,6 +507,39 @@ static int x264_validate_parameters( x264_t *h )
  29. }
  30. h->param.rc.i_qp_max = x264_clip3( h->param.rc.i_qp_max, 0, 51 );
  31. h->param.rc.i_qp_min = x264_clip3( h->param.rc.i_qp_min, 0, h->param.rc.i_qp_max );
  32. + if( h->param.rc.i_vbv_buffer_size )
  33. + {
  34. + if( h->param.rc.i_rc_method == X264_RC_CQP )
  35. + {
  36. + x264_log( h, X264_LOG_WARNING, "VBV is incompatible with constant QP, ignored.\n" );
  37. + h->param.rc.i_vbv_max_bitrate = 0;
  38. + h->param.rc.i_vbv_buffer_size = 0;
  39. + }
  40. + else if( h->param.rc.i_vbv_max_bitrate == 0 )
  41. + {
  42. + if( h->param.rc.i_rc_method == X264_RC_ABR )
  43. + {
  44. + x264_log( h, X264_LOG_WARNING, "VBV maxrate unspecified, assuming CBR\n" );
  45. + h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
  46. + }
  47. + else
  48. + {
  49. + x264_log( h, X264_LOG_WARNING, "VBV bufsize set but maxrate unspecified, ignored\n" );
  50. + h->param.rc.i_vbv_buffer_size = 0;
  51. + }
  52. + }
  53. + else if( h->param.rc.i_vbv_max_bitrate < h->param.rc.i_bitrate &&
  54. + h->param.rc.i_rc_method == X264_RC_ABR )
  55. + {
  56. + x264_log( h, X264_LOG_WARNING, "max bitrate less than average bitrate, assuming CBR\n" );
  57. + h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
  58. + }
  59. + }
  60. + else if( h->param.rc.i_vbv_max_bitrate )
  61. + {
  62. + x264_log( h, X264_LOG_WARNING, "VBV maxrate specified, but no bufsize, ignored\n" );
  63. + h->param.rc.i_vbv_max_bitrate = 0;
  64. + }
  65.  
  66. int max_slices = (h->param.i_height+((16<<h->param.b_interlaced)-1))/(16<<h->param.b_interlaced);
  67. if( h->param.b_sliced_threads )
  68. @@ -1071,7 +1104,7 @@ fail:
  69. ****************************************************************************/
  70. int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
  71. {
  72. - h = h->thread[h->i_thread_phase];
  73. + h = h->thread[h->thread[0]->i_thread_phase];
  74. x264_set_aspect_ratio( h, param, 0 );
  75. #define COPY(var) h->param.var = param->var
  76. COPY( i_frame_reference ); // but never uses more refs than initially specified
  77. @@ -1110,11 +1143,30 @@ int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
  78. COPY( i_slice_max_size );
  79. COPY( i_slice_max_mbs );
  80. COPY( i_slice_count );
  81. + /* VBV can't be turned on if it wasn't on to begin with */
  82. + if( h->param.rc.i_vbv_max_bitrate > 0 && h->param.rc.i_vbv_buffer_size > 0 &&
  83. + param->rc.i_vbv_max_bitrate > 0 && param->rc.i_vbv_buffer_size > 0 )
  84. + {
  85. + COPY( rc.i_vbv_max_bitrate );
  86. + COPY( rc.i_vbv_buffer_size );
  87. + COPY( rc.i_bitrate );
  88. + }
  89. + COPY( rc.f_rf_constant );
  90. #undef COPY
  91.  
  92. mbcmp_init( h );
  93.  
  94. - return x264_validate_parameters( h );
  95. + int ret = x264_validate_parameters( h );
  96. +
  97. + /* Supported reconfiguration options (1-pass only):
  98. + * vbv-maxrate
  99. + * vbv-bufsize
  100. + * crf
  101. + * bitrate (CBR only) */
  102. + if( !ret )
  103. + x264_ratecontrol_init_reconfigurable( h, 0 );
  104. +
  105. + return ret;
  106. }
  107.  
  108. /****************************************************************************
  109. diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
  110. index 63b3be6..52196e7 100644
  111. --- a/encoder/ratecontrol.c
  112. +++ b/encoder/ratecontrol.c
  113. @@ -388,6 +388,53 @@ static char *x264_strcat_filename( char *input, char *suffix )
  114. return output;
  115. }
  116.  
  117. +void x264_ratecontrol_init_reconfigurable( x264_t *h, int b_init )
  118. +{
  119. + x264_ratecontrol_t *rc = h->rc;
  120. + if( !b_init && rc->b_2pass )
  121. + return;
  122. +
  123. + if( h->param.rc.i_vbv_max_bitrate > 0 && h->param.rc.i_vbv_buffer_size > 0 )
  124. + {
  125. + if( h->param.rc.i_vbv_buffer_size < (int)(h->param.rc.i_vbv_max_bitrate / rc->fps) )
  126. + {
  127. + h->param.rc.i_vbv_buffer_size = h->param.rc.i_vbv_max_bitrate / rc->fps;
  128. + x264_log( h, X264_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
  129. + h->param.rc.i_vbv_buffer_size );
  130. + }
  131. +
  132. + /* We don't support changing the ABR bitrate right now,
  133. + so if the stream starts as CBR, keep it CBR. */
  134. + if( rc->b_vbv_min_rate )
  135. + h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
  136. + rc->buffer_rate = h->param.rc.i_vbv_max_bitrate * 1000. / rc->fps;
  137. + rc->buffer_size = h->param.rc.i_vbv_buffer_size * 1000.;
  138. + rc->single_frame_vbv = rc->buffer_rate * 1.1 > rc->buffer_size;
  139. + rc->cbr_decay = 1.0 - rc->buffer_rate / rc->buffer_size
  140. + * 0.5 * X264_MAX(0, 1.5 - rc->buffer_rate * rc->fps / rc->bitrate);
  141. + if( b_init )
  142. + {
  143. + if( h->param.rc.f_vbv_buffer_init > 1. )
  144. + h->param.rc.f_vbv_buffer_init = x264_clip3f( h->param.rc.f_vbv_buffer_init / h->param.rc.i_vbv_buffer_size, 0, 1 );
  145. + h->param.rc.f_vbv_buffer_init = X264_MAX( h->param.rc.f_vbv_buffer_init, rc->buffer_rate / rc->buffer_size );
  146. + rc->buffer_fill_final = rc->buffer_size * h->param.rc.f_vbv_buffer_init;
  147. + rc->b_vbv = 1;
  148. + rc->b_vbv_min_rate = !rc->b_2pass
  149. + && h->param.rc.i_rc_method == X264_RC_ABR
  150. + && h->param.rc.i_vbv_max_bitrate <= h->param.rc.i_bitrate;
  151. + }
  152. + }
  153. + if( h->param.rc.i_rc_method == X264_RC_CRF )
  154. + {
  155. + /* Arbitrary rescaling to make CRF somewhat similar to QP.
  156. + * Try to compensate for MB-tree's effects as well. */
  157. + double base_cplx = h->mb.i_mb_count * (h->param.i_bframe ? 120 : 80);
  158. + double mbtree_offset = h->param.rc.b_mb_tree ? (1.0-h->param.rc.f_qcompress)*13.5 : 0;
  159. + rc->rate_factor_constant = pow( base_cplx, 1 - rc->qcompress )
  160. + / qp2qscale( h->param.rc.f_rf_constant + mbtree_offset );
  161. + }
  162. +}
  163. +
  164. int x264_ratecontrol_new( x264_t *h )
  165. {
  166. x264_ratecontrol_t *rc;
  167. @@ -426,60 +473,10 @@ int x264_ratecontrol_new( x264_t *h )
  168. x264_log(h, X264_LOG_ERROR, "constant rate-factor is incompatible with 2pass.\n");
  169. return -1;
  170. }
  171. - if( h->param.rc.i_vbv_buffer_size )
  172. - {
  173. - if( h->param.rc.i_rc_method == X264_RC_CQP )
  174. - {
  175. - x264_log(h, X264_LOG_WARNING, "VBV is incompatible with constant QP, ignored.\n");
  176. - h->param.rc.i_vbv_max_bitrate = 0;
  177. - h->param.rc.i_vbv_buffer_size = 0;
  178. - }
  179. - else if( h->param.rc.i_vbv_max_bitrate == 0 )
  180. - {
  181. - if( h->param.rc.i_rc_method == X264_RC_ABR )
  182. - {
  183. - x264_log( h, X264_LOG_INFO, "VBV maxrate unspecified, assuming CBR\n" );
  184. - h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
  185. - }
  186. - else
  187. - {
  188. - x264_log( h, X264_LOG_INFO, "VBV bufsize set but maxrate unspecified, ignored\n" );
  189. - h->param.rc.i_vbv_buffer_size = 0;
  190. - }
  191. - }
  192. - }
  193. - if( h->param.rc.i_vbv_max_bitrate < h->param.rc.i_bitrate &&
  194. - h->param.rc.i_vbv_max_bitrate > 0)
  195. - x264_log(h, X264_LOG_WARNING, "max bitrate less than average bitrate, ignored.\n");
  196. - else if( h->param.rc.i_vbv_max_bitrate > 0 &&
  197. - h->param.rc.i_vbv_buffer_size > 0 )
  198. - {
  199. - if( h->param.rc.i_vbv_buffer_size < (int)(h->param.rc.i_vbv_max_bitrate / rc->fps) )
  200. - {
  201. - h->param.rc.i_vbv_buffer_size = h->param.rc.i_vbv_max_bitrate / rc->fps;
  202. - x264_log( h, X264_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
  203. - h->param.rc.i_vbv_buffer_size );
  204. - }
  205. - if( h->param.rc.f_vbv_buffer_init > 1. )
  206. - h->param.rc.f_vbv_buffer_init = x264_clip3f( h->param.rc.f_vbv_buffer_init / h->param.rc.i_vbv_buffer_size, 0, 1 );
  207. - rc->buffer_rate = h->param.rc.i_vbv_max_bitrate * 1000. / rc->fps;
  208. - rc->buffer_size = h->param.rc.i_vbv_buffer_size * 1000.;
  209. - rc->single_frame_vbv = rc->buffer_rate * 1.1 > rc->buffer_size;
  210. - h->param.rc.f_vbv_buffer_init = X264_MAX( h->param.rc.f_vbv_buffer_init, rc->buffer_rate / rc->buffer_size );
  211. - rc->buffer_fill_final = rc->buffer_size * h->param.rc.f_vbv_buffer_init;
  212. - rc->cbr_decay = 1.0 - rc->buffer_rate / rc->buffer_size
  213. - * 0.5 * X264_MAX(0, 1.5 - rc->buffer_rate * rc->fps / rc->bitrate);
  214. - rc->b_vbv = 1;
  215. - rc->b_vbv_min_rate = !rc->b_2pass
  216. - && h->param.rc.i_rc_method == X264_RC_ABR
  217. - && h->param.rc.i_vbv_max_bitrate <= h->param.rc.i_bitrate;
  218. - }
  219. - else if( h->param.rc.i_vbv_max_bitrate )
  220. - {
  221. - x264_log(h, X264_LOG_WARNING, "VBV maxrate specified, but no bufsize.\n");
  222. - h->param.rc.i_vbv_max_bitrate = 0;
  223. - }
  224. - if(rc->rate_tolerance < 0.01)
  225. +
  226. + x264_ratecontrol_init_reconfigurable( h, 1 );
  227. +
  228. + if( rc->rate_tolerance < 0.01 )
  229. {
  230. x264_log(h, X264_LOG_WARNING, "bitrate tolerance too small, using .01\n");
  231. rc->rate_tolerance = 0.01;
  232. @@ -499,16 +496,6 @@ int x264_ratecontrol_new( x264_t *h )
  233. rc->last_non_b_pict_type = SLICE_TYPE_I;
  234. }
  235.  
  236. - if( h->param.rc.i_rc_method == X264_RC_CRF )
  237. - {
  238. - /* Arbitrary rescaling to make CRF somewhat similar to QP.
  239. - * Try to compensate for MB-tree's effects as well. */
  240. - double base_cplx = h->mb.i_mb_count * (h->param.i_bframe ? 120 : 80);
  241. - double mbtree_offset = h->param.rc.b_mb_tree ? (1.0-h->param.rc.f_qcompress)*13.5 : 0;
  242. - rc->rate_factor_constant = pow( base_cplx, 1 - rc->qcompress )
  243. - / qp2qscale( h->param.rc.f_rf_constant + mbtree_offset );
  244. - }
  245. -
  246. rc->ip_offset = 6.0 * log(h->param.rc.f_ip_factor) / log(2.0);
  247. rc->pb_offset = 6.0 * log(h->param.rc.f_pb_factor) / log(2.0);
  248. rc->qp_constant[SLICE_TYPE_P] = h->param.rc.i_qp_constant;
  249. @@ -1577,15 +1564,15 @@ static void update_vbv( x264_t *h, int bits )
  250. if( rct->buffer_fill_final < 0 )
  251. x264_log( h, X264_LOG_WARNING, "VBV underflow (frame %d, %.0f bits)\n", h->i_frame, rct->buffer_fill_final );
  252. rct->buffer_fill_final = X264_MAX( rct->buffer_fill_final, 0 );
  253. - rct->buffer_fill_final += rct->buffer_rate;
  254. - rct->buffer_fill_final = X264_MIN( rct->buffer_fill_final, rct->buffer_size );
  255. + rct->buffer_fill_final += rcc->buffer_rate;
  256. + rct->buffer_fill_final = X264_MIN( rct->buffer_fill_final, rcc->buffer_size );
  257. }
  258.  
  259. // provisionally update VBV according to the planned size of all frames currently in progress
  260. static void update_vbv_plan( x264_t *h, int overhead )
  261. {
  262. x264_ratecontrol_t *rcc = h->rc;
  263. - rcc->buffer_fill = h->thread[0]->rc->buffer_fill_final - overhead;
  264. + rcc->buffer_fill = h->thread[0]->rc->buffer_fill_final;
  265. if( h->i_thread_frames > 1 )
  266. {
  267. int j = h->rc - h->thread[0]->rc;
  268. @@ -1603,6 +1590,8 @@ static void update_vbv_plan( x264_t *h, int overhead )
  269. rcc->buffer_fill = X264_MIN( rcc->buffer_fill, rcc->buffer_size );
  270. }
  271. }
  272. + rcc->buffer_fill = X264_MIN( rcc->buffer_fill, rcc->buffer_size );
  273. + rcc->buffer_fill -= overhead;
  274. }
  275.  
  276. // apply VBV constraints and clip qscale to between lmin and lmax
  277. @@ -2027,8 +2016,7 @@ void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next )
  278. #define COPY(var) memcpy(&cur->rc->var, &prev->rc->var, sizeof(cur->rc->var))
  279. /* these vars are updated in x264_ratecontrol_start()
  280. * so copy them from the context that most recently started (prev)
  281. - * to the context that's about to start (cur).
  282. - */
  283. + * to the context that's about to start (cur). */
  284. COPY(accum_p_qp);
  285. COPY(accum_p_norm);
  286. COPY(last_satd);
  287. @@ -2040,6 +2028,14 @@ void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next )
  288. COPY(bframes);
  289. COPY(prev_zone);
  290. COPY(qpbuf_pos);
  291. + /* these vars can be updated by x264_ratecontrol_init_reconfigurable */
  292. + COPY(buffer_rate);
  293. + COPY(buffer_size);
  294. + COPY(single_frame_vbv);
  295. + COPY(cbr_decay);
  296. + COPY(b_vbv_min_rate);
  297. + COPY(rate_factor_constant);
  298. + COPY(bitrate);
  299. #undef COPY
  300. }
  301. if( cur != next )
  302. @@ -2047,8 +2043,7 @@ void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next )
  303. #define COPY(var) next->rc->var = cur->rc->var
  304. /* these vars are updated in x264_ratecontrol_end()
  305. * so copy them from the context that most recently ended (cur)
  306. - * to the context that's about to end (next)
  307. - */
  308. + * to the context that's about to end (next) */
  309. COPY(cplxr_sum);
  310. COPY(expected_bits_sum);
  311. COPY(wanted_bits_window);
  312. diff --git a/encoder/ratecontrol.h b/encoder/ratecontrol.h
  313. index 5a8d088..2767866 100644
  314. --- a/encoder/ratecontrol.h
  315. +++ b/encoder/ratecontrol.h
  316. @@ -27,6 +27,8 @@
  317. int x264_ratecontrol_new ( x264_t * );
  318. void x264_ratecontrol_delete( x264_t * );
  319.  
  320. +void x264_ratecontrol_init_reconfigurable( x264_t *h, int b_init );
  321. +
  322. void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame );
  323. void x264_adaptive_quant( x264_t * );
  324. int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame );
  325. diff --git a/x264.h b/x264.h
  326. index 2550864..e7d19b7 100644
  327. --- a/x264.h
  328. +++ b/x264.h
  329. @@ -35,7 +35,7 @@
  330.  
  331. #include <stdarg.h>
  332.  
  333. -#define X264_BUILD 84
  334. +#define X264_BUILD 85
  335.  
  336. /* x264_t:
  337. * opaque handler for encoder */
  338. @@ -480,11 +480,12 @@ typedef struct
  339. x264_t *x264_encoder_open( x264_param_t * );
  340.  
  341. /* x264_encoder_reconfig:
  342. - * analysis-related parameters from x264_param_t are copied.
  343. + * various parameters from x264_param_t are copied.
  344. * this takes effect immediately, on whichever frame is encoded next;
  345. * due to delay, this may not be the next frame passed to encoder_encode.
  346. * if the change should apply to some particular frame, use x264_picture_t->param instead.
  347. - * returns 0 on success, negative on parameter validation error. */
  348. + * returns 0 on success, negative on parameter validation error.
  349. + * not all parameters can be changed; see the actual function for a detailed breakdown. */
  350. int x264_encoder_reconfig( x264_t *, x264_param_t * );
  351. /* x264_encoder_parameters:
  352. * copies the current internal set of parameters to the pointer provided
  353. --
  354. 1.6.1.2
  355.  
  356.  
  357. From 08d4a999b0300e50196afb3ee0e310834028b537 Mon Sep 17 00:00:00 2001
  358. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  359. Date: Mon, 1 Feb 2010 13:04:47 -0800
  360. Subject: [PATCH 02/26] Slightly faster predictor_difference_mmxext
  361.  
  362. ---
  363. common/x86/util.h | 17 ++++++++++-------
  364. 1 files changed, 10 insertions(+), 7 deletions(-)
  365.  
  366. diff --git a/common/x86/util.h b/common/x86/util.h
  367. index efc700a..c8bcf4b 100644
  368. --- a/common/x86/util.h
  369. +++ b/common/x86/util.h
  370. @@ -45,8 +45,9 @@ static inline void x264_median_mv_mmxext( int16_t *dst, int16_t *a, int16_t *b,
  371. #define x264_predictor_difference x264_predictor_difference_mmxext
  372. static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t i_mvc )
  373. {
  374. - int sum = 0;
  375. - uint16_t output[4];
  376. + int sum;
  377. + static const uint64_t pw_1 = 0x0001000100010001ULL;
  378. +
  379. asm(
  380. "pxor %%mm4, %%mm4 \n"
  381. "test $1, %1 \n"
  382. @@ -56,7 +57,7 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t
  383. "psubw %%mm3, %%mm0 \n"
  384. "jmp 2f \n"
  385. "3: \n"
  386. - "sub $1, %1 \n"
  387. + "dec %1 \n"
  388. "1: \n"
  389. "movq -8(%2,%1,4), %%mm0 \n"
  390. "psubw -4(%2,%1,4), %%mm0 \n"
  391. @@ -67,11 +68,13 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t
  392. "pmaxsw %%mm2, %%mm0 \n"
  393. "paddusw %%mm0, %%mm4 \n"
  394. "jg 1b \n"
  395. - "movq %%mm4, %0 \n"
  396. - :"=m"(output), "+r"(i_mvc)
  397. - :"r"(mvc), "m"(M64( mvc ))
  398. + "pmaddwd %4, %%mm4 \n"
  399. + "pshufw $14, %%mm4, %%mm0 \n"
  400. + "paddd %%mm0, %%mm4 \n"
  401. + "movd %%mm4, %0 \n"
  402. + :"=r"(sum), "+r"(i_mvc)
  403. + :"r"(mvc), "m"(M64( mvc )), "m"(pw_1)
  404. );
  405. - sum += output[0] + output[1] + output[2] + output[3];
  406. return sum;
  407. }
  408. #define x264_cabac_amvd_sum x264_cabac_amvd_sum_mmxext
  409. --
  410. 1.6.1.2
  411.  
  412.  
  413. From 1ec82b87c875c5fa6e66e9cbedb4ec04ac6c058c Mon Sep 17 00:00:00 2001
  414. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  415. Date: Tue, 2 Feb 2010 03:15:18 -0800
  416. Subject: [PATCH 03/26] Improve bidir search, fix some artifacts in fades
  417. Modify analysis to allow bidir to use different motion vectors than L0/L1.
  418. Always try the <0,0,0,0> motion vector for bidir.
  419. Eliminates almost all errant motion vectors in fades.
  420. Slightly improves PSNR as well (~0.015 dB).
  421.  
  422. ---
  423. encoder/analyse.c | 50 ++++++++++++++++++++++++++++++++++++++------------
  424. 1 files changed, 38 insertions(+), 12 deletions(-)
  425.  
  426. diff --git a/encoder/analyse.c b/encoder/analyse.c
  427. index 666596b..1fb2206 100644
  428. --- a/encoder/analyse.c
  429. +++ b/encoder/analyse.c
  430. @@ -40,6 +40,7 @@ typedef struct
  431. int i_ref;
  432. int i_rd16x16;
  433. x264_me_t me16x16;
  434. + x264_me_t bi16x16; /* for b16x16 BI mode, since MVs can differ from l0/l1 */
  435.  
  436. /* 8x8 */
  437. int i_cost8x8;
  438. @@ -1722,20 +1723,45 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
  439. a->l1.me16x16.i_ref = a->l1.i_ref;
  440.  
  441. /* get cost of BI mode */
  442. + int ref_costs = REF_COST( 0, a->l0.i_ref ) + REF_COST( 1, a->l1.i_ref );
  443. + h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
  444. + h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
  445. src0 = h->mc.get_ref( pix0, &stride0,
  446. h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
  447. - a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16, weight_none );
  448. + a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, weight_none );
  449. src1 = h->mc.get_ref( pix1, &stride1,
  450. h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
  451. - a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16, weight_none );
  452. + a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, weight_none );
  453.  
  454. h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
  455.  
  456. a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
  457. - + REF_COST( 0, a->l0.i_ref )
  458. - + REF_COST( 1, a->l1.i_ref )
  459. - + a->l0.me16x16.cost_mv
  460. - + a->l1.me16x16.cost_mv;
  461. + + ref_costs
  462. + + a->l0.bi16x16.cost_mv
  463. + + a->l1.bi16x16.cost_mv;
  464. +
  465. +
  466. + /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
  467. + if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
  468. + {
  469. + int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
  470. + + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
  471. + int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
  472. + + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
  473. + h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.i_ref][0], h->mb.pic.i_stride[0],
  474. + h->mb.pic.p_fref[1][a->l1.i_ref][0], h->mb.pic.i_stride[0],
  475. + h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
  476. + int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
  477. + + ref_costs + l0_mv_cost + l1_mv_cost;
  478. + if( cost00 < a->i_cost16x16bi )
  479. + {
  480. + M32( a->l0.bi16x16.mv ) = 0;
  481. + M32( a->l1.bi16x16.mv ) = 0;
  482. + a->l0.bi16x16.cost_mv = l0_mv_cost;
  483. + a->l1.bi16x16.cost_mv = l1_mv_cost;
  484. + a->i_cost16x16bi = cost00;
  485. + }
  486. + }
  487.  
  488. /* mb type cost */
  489. a->i_cost16x16bi += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
  490. @@ -2205,7 +2231,7 @@ static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
  491. {
  492. case D_16x16:
  493. if( h->mb.i_type == B_BI_BI )
  494. - x264_me_refine_bidir_satd( h, &a->l0.me16x16, &a->l1.me16x16, i_biweight );
  495. + x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
  496. break;
  497. case D_16x8:
  498. for( i=0; i<2; i++ )
  499. @@ -2819,8 +2845,8 @@ intra_analysis:
  500. }
  501. else if( i_type == B_BI_BI )
  502. {
  503. - x264_me_refine_qpel( h, &analysis.l0.me16x16 );
  504. - x264_me_refine_qpel( h, &analysis.l1.me16x16 );
  505. + x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
  506. + x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
  507. }
  508. }
  509. else if( i_partition == D_16x8 )
  510. @@ -2938,7 +2964,7 @@ intra_analysis:
  511. x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
  512. }
  513. else if( i_type == B_BI_BI )
  514. - x264_me_refine_bidir_rd( h, &analysis.l0.me16x16, &analysis.l1.me16x16, i_biweight, 0, analysis.i_lambda2 );
  515. + x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
  516. }
  517. else if( i_partition == D_16x8 )
  518. {
  519. @@ -3121,10 +3147,10 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
  520. break;
  521. case B_BI_BI:
  522. x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
  523. - x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
  524. + x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );
  525.  
  526. x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
  527. - x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
  528. + x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
  529. break;
  530. }
  531. break;
  532. --
  533. 1.6.1.2
  534.  
  535.  
  536. From dd349567b662bb4c2d629cf0967c87843b9bb3a3 Mon Sep 17 00:00:00 2001
  537. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  538. Date: Wed, 3 Feb 2010 14:22:05 -0800
  539. Subject: [PATCH 04/26] Faster CABAC MB header writing
  540. Reorganize the header writing to merge mb type and mb mode info (mv, pred, etc.).
  541. Reduces redundant branches and better splits the code between frame types (for better code cache usage).
  542. Also slightly simplify qp delta calculation.
  543. Also make CAVLC and CABAC a bit more consistent in structure and function names.
  544.  
  545. ---
  546. encoder/cabac.c | 573 ++++++++++++++++++++++++++-----------------------------
  547. encoder/cavlc.c | 118 ++++++------
  548. 2 files changed, 334 insertions(+), 357 deletions(-)
  549.  
  550. diff --git a/encoder/cabac.c b/encoder/cabac.c
  551. index 271f527..6ff2aed 100644
  552. --- a/encoder/cabac.c
  553. +++ b/encoder/cabac.c
  554. @@ -29,151 +29,6 @@
  555. #define RDO_SKIP_BS 0
  556. #endif
  557.  
  558. -static inline void x264_cabac_mb_type_intra( x264_t *h, x264_cabac_t *cb, int i_mb_type,
  559. - int ctx0, int ctx1, int ctx2, int ctx3, int ctx4, int ctx5 )
  560. -{
  561. - if( i_mb_type == I_4x4 || i_mb_type == I_8x8 )
  562. - {
  563. - x264_cabac_encode_decision_noup( cb, ctx0, 0 );
  564. - }
  565. -#if !RDO_SKIP_BS
  566. - else if( i_mb_type == I_PCM )
  567. - {
  568. - x264_cabac_encode_decision_noup( cb, ctx0, 1 );
  569. - x264_cabac_encode_flush( h, cb );
  570. - }
  571. -#endif
  572. - else
  573. - {
  574. - int i_pred = x264_mb_pred_mode16x16_fix[h->mb.i_intra16x16_pred_mode];
  575. -
  576. - x264_cabac_encode_decision_noup( cb, ctx0, 1 );
  577. - x264_cabac_encode_terminal( cb );
  578. -
  579. - x264_cabac_encode_decision_noup( cb, ctx1, !!h->mb.i_cbp_luma );
  580. - if( h->mb.i_cbp_chroma == 0 )
  581. - x264_cabac_encode_decision_noup( cb, ctx2, 0 );
  582. - else
  583. - {
  584. - x264_cabac_encode_decision( cb, ctx2, 1 );
  585. - x264_cabac_encode_decision_noup( cb, ctx3, h->mb.i_cbp_chroma>>1 );
  586. - }
  587. - x264_cabac_encode_decision( cb, ctx4, i_pred>>1 );
  588. - x264_cabac_encode_decision_noup( cb, ctx5, i_pred&1 );
  589. - }
  590. -}
  591. -
  592. -static void x264_cabac_mb_type( x264_t *h, x264_cabac_t *cb )
  593. -{
  594. - const int i_mb_type = h->mb.i_type;
  595. -
  596. - if( h->sh.b_mbaff &&
  597. - (!(h->mb.i_mb_y & 1) || IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride])) )
  598. - {
  599. - x264_cabac_encode_decision_noup( cb, 70 + h->mb.cache.i_neighbour_interlaced, h->mb.b_interlaced );
  600. - }
  601. -
  602. - if( h->sh.i_type == SLICE_TYPE_I )
  603. - {
  604. - int ctx = 0;
  605. - if( h->mb.i_mb_type_left >= 0 && h->mb.i_mb_type_left != I_4x4 )
  606. - ctx++;
  607. - if( h->mb.i_mb_type_top >= 0 && h->mb.i_mb_type_top != I_4x4 )
  608. - ctx++;
  609. -
  610. - x264_cabac_mb_type_intra( h, cb, i_mb_type, 3+ctx, 3+3, 3+4, 3+5, 3+6, 3+7 );
  611. - }
  612. - else if( h->sh.i_type == SLICE_TYPE_P )
  613. - {
  614. - /* prefix: 14, suffix: 17 */
  615. - if( i_mb_type == P_L0 )
  616. - {
  617. - x264_cabac_encode_decision_noup( cb, 14, 0 );
  618. - x264_cabac_encode_decision_noup( cb, 15, h->mb.i_partition != D_16x16 );
  619. - x264_cabac_encode_decision_noup( cb, 17-(h->mb.i_partition == D_16x16), h->mb.i_partition == D_16x8 );
  620. - }
  621. - else if( i_mb_type == P_8x8 )
  622. - {
  623. - x264_cabac_encode_decision_noup( cb, 14, 0 );
  624. - x264_cabac_encode_decision_noup( cb, 15, 0 );
  625. - x264_cabac_encode_decision_noup( cb, 16, 1 );
  626. - }
  627. - else /* intra */
  628. - {
  629. - /* prefix */
  630. - x264_cabac_encode_decision_noup( cb, 14, 1 );
  631. -
  632. - /* suffix */
  633. - x264_cabac_mb_type_intra( h, cb, i_mb_type, 17+0, 17+1, 17+2, 17+2, 17+3, 17+3 );
  634. - }
  635. - }
  636. - else //if( h->sh.i_type == SLICE_TYPE_B )
  637. - {
  638. - int ctx = 0;
  639. - if( h->mb.i_mb_type_left >= 0 && h->mb.i_mb_type_left != B_SKIP && h->mb.i_mb_type_left != B_DIRECT )
  640. - ctx++;
  641. - if( h->mb.i_mb_type_top >= 0 && h->mb.i_mb_type_top != B_SKIP && h->mb.i_mb_type_top != B_DIRECT )
  642. - ctx++;
  643. -
  644. - if( i_mb_type == B_DIRECT )
  645. - {
  646. - x264_cabac_encode_decision_noup( cb, 27+ctx, 0 );
  647. - return;
  648. - }
  649. - x264_cabac_encode_decision_noup( cb, 27+ctx, 1 );
  650. -
  651. - if( i_mb_type == B_8x8 )
  652. - {
  653. - x264_cabac_encode_decision_noup( cb, 27+3, 1 );
  654. - x264_cabac_encode_decision_noup( cb, 27+4, 1 );
  655. - x264_cabac_encode_decision( cb, 27+5, 1 );
  656. - x264_cabac_encode_decision( cb, 27+5, 1 );
  657. - x264_cabac_encode_decision_noup( cb, 27+5, 1 );
  658. - }
  659. - else if( IS_INTRA( i_mb_type ) )
  660. - {
  661. - /* prefix */
  662. - x264_cabac_encode_decision_noup( cb, 27+3, 1 );
  663. - x264_cabac_encode_decision_noup( cb, 27+4, 1 );
  664. - x264_cabac_encode_decision( cb, 27+5, 1 );
  665. - x264_cabac_encode_decision( cb, 27+5, 0 );
  666. - x264_cabac_encode_decision( cb, 27+5, 1 );
  667. -
  668. - /* suffix */
  669. - x264_cabac_mb_type_intra( h, cb, i_mb_type, 32+0, 32+1, 32+2, 32+2, 32+3, 32+3 );
  670. - }
  671. - else
  672. - {
  673. - static const uint8_t i_mb_bits[9*3] =
  674. - {
  675. - 0x31, 0x29, 0x4, /* L0 L0 */
  676. - 0x35, 0x2d, 0, /* L0 L1 */
  677. - 0x43, 0x63, 0, /* L0 BI */
  678. - 0x3d, 0x2f, 0, /* L1 L0 */
  679. - 0x39, 0x25, 0x6, /* L1 L1 */
  680. - 0x53, 0x73, 0, /* L1 BI */
  681. - 0x4b, 0x6b, 0, /* BI L0 */
  682. - 0x5b, 0x7b, 0, /* BI L1 */
  683. - 0x47, 0x67, 0x21 /* BI BI */
  684. - };
  685. -
  686. - const int idx = (i_mb_type - B_L0_L0) * 3 + (h->mb.i_partition - D_16x8);
  687. - int bits = i_mb_bits[idx];
  688. -
  689. - x264_cabac_encode_decision_noup( cb, 27+3, bits&1 );
  690. - x264_cabac_encode_decision( cb, 27+5-(bits&1), (bits>>1)&1 ); bits >>= 2;
  691. - if( bits != 1 )
  692. - {
  693. - x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
  694. - x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
  695. - x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
  696. - if( bits != 1 )
  697. - x264_cabac_encode_decision_noup( cb, 27+5, bits&1 );
  698. - }
  699. - }
  700. - }
  701. -}
  702. -
  703. static void x264_cabac_mb_intra4x4_pred_mode( x264_cabac_t *cb, int i_pred, int i_mode )
  704. {
  705. if( i_pred == i_mode )
  706. @@ -209,6 +64,12 @@ static void x264_cabac_mb_intra_chroma_pred_mode( x264_t *h, x264_cabac_t *cb )
  707. }
  708. }
  709.  
  710. +static inline void x264_cabac_mb_transform_size( x264_t *h, x264_cabac_t *cb )
  711. +{
  712. + int ctx = 399 + h->mb.cache.i_neighbour_transform_size;
  713. + x264_cabac_encode_decision_noup( cb, ctx, h->mb.b_transform_8x8 );
  714. +}
  715. +
  716. static void x264_cabac_mb_cbp_luma( x264_t *h, x264_cabac_t *cb )
  717. {
  718. int cbp = h->mb.i_cbp_luma;
  719. @@ -244,7 +105,6 @@ static void x264_cabac_mb_cbp_chroma( x264_t *h, x264_cabac_t *cb )
  720. static void x264_cabac_mb_qp_delta( x264_t *h, x264_cabac_t *cb )
  721. {
  722. int i_dqp = h->mb.i_qp - h->mb.i_last_qp;
  723. - int ctx;
  724.  
  725. /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely flat background area */
  726. if( h->mb.i_type == I_16x16 && !h->mb.cbp[h->mb.i_mb_xy] )
  727. @@ -257,7 +117,7 @@ static void x264_cabac_mb_qp_delta( x264_t *h, x264_cabac_t *cb )
  728.  
  729. /* Since, per the above, empty-CBP I16x16 blocks never have delta quants,
  730. * we don't have to check for them. */
  731. - ctx = h->mb.i_last_dqp && h->mb.cbp[h->mb.i_mb_prev_xy];
  732. + int ctx = !!h->mb.i_last_dqp;
  733.  
  734. if( i_dqp != 0 )
  735. {
  736. @@ -321,12 +181,6 @@ static inline void x264_cabac_mb_sub_b_partition( x264_cabac_t *cb, int i_sub )
  737. x264_cabac_encode_decision( cb, 39, i_sub == D_L1_8x8 );
  738. }
  739.  
  740. -static inline void x264_cabac_mb_transform_size( x264_t *h, x264_cabac_t *cb )
  741. -{
  742. - int ctx = 399 + h->mb.cache.i_neighbour_transform_size;
  743. - x264_cabac_encode_decision_noup( cb, ctx, h->mb.b_transform_8x8 );
  744. -}
  745. -
  746. static void x264_cabac_mb_ref( x264_t *h, x264_cabac_t *cb, int i_list, int idx )
  747. {
  748. const int i8 = x264_scan8[idx];
  749. @@ -463,6 +317,267 @@ static inline void x264_cabac_mb8x8_mvd( x264_t *h, x264_cabac_t *cb, int i )
  750. }
  751. }
  752.  
  753. +static void x264_cabac_mb_header_intra( x264_t *h, x264_cabac_t *cb, int i_mb_type,
  754. + int ctx0, int ctx1, int ctx2, int ctx3, int ctx4, int ctx5 )
  755. +{
  756. + if( i_mb_type == I_4x4 || i_mb_type == I_8x8 )
  757. + {
  758. + int i, di = h->mb.b_transform_8x8 ? 4 : 1;
  759. + x264_cabac_encode_decision_noup( cb, ctx0, 0 );
  760. +
  761. + if( h->pps->b_transform_8x8_mode )
  762. + x264_cabac_mb_transform_size( h, cb );
  763. +
  764. + for( i = 0; i < 16; i += di )
  765. + {
  766. + const int i_pred = x264_mb_predict_intra4x4_mode( h, i );
  767. + const int i_mode = x264_mb_pred_mode4x4_fix( h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] );
  768. + x264_cabac_mb_intra4x4_pred_mode( cb, i_pred, i_mode );
  769. + }
  770. + }
  771. +#if !RDO_SKIP_BS
  772. + else if( i_mb_type == I_PCM )
  773. + {
  774. + x264_cabac_encode_decision_noup( cb, ctx0, 1 );
  775. + x264_cabac_encode_flush( h, cb );
  776. + return;
  777. + }
  778. +#endif
  779. + else
  780. + {
  781. + int i_pred = x264_mb_pred_mode16x16_fix[h->mb.i_intra16x16_pred_mode];
  782. +
  783. + x264_cabac_encode_decision_noup( cb, ctx0, 1 );
  784. + x264_cabac_encode_terminal( cb );
  785. +
  786. + x264_cabac_encode_decision_noup( cb, ctx1, !!h->mb.i_cbp_luma );
  787. + if( h->mb.i_cbp_chroma == 0 )
  788. + x264_cabac_encode_decision_noup( cb, ctx2, 0 );
  789. + else
  790. + {
  791. + x264_cabac_encode_decision( cb, ctx2, 1 );
  792. + x264_cabac_encode_decision_noup( cb, ctx3, h->mb.i_cbp_chroma>>1 );
  793. + }
  794. + x264_cabac_encode_decision( cb, ctx4, i_pred>>1 );
  795. + x264_cabac_encode_decision_noup( cb, ctx5, i_pred&1 );
  796. + }
  797. + x264_cabac_mb_intra_chroma_pred_mode( h, cb );
  798. +}
  799. +
  800. +static inline void x264_cabac_mb_header( x264_t *h, x264_cabac_t *cb )
  801. +{
  802. + const int i_mb_type = h->mb.i_type;
  803. + int i_list, i;
  804. +
  805. + if( h->sh.b_mbaff &&
  806. + (!(h->mb.i_mb_y & 1) || IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride])) )
  807. + {
  808. + x264_cabac_encode_decision_noup( cb, 70 + h->mb.cache.i_neighbour_interlaced, h->mb.b_interlaced );
  809. + }
  810. +
  811. + if( h->sh.i_type == SLICE_TYPE_I )
  812. + {
  813. + int ctx = 0;
  814. + if( h->mb.i_mb_type_left >= 0 && h->mb.i_mb_type_left != I_4x4 )
  815. + ctx++;
  816. + if( h->mb.i_mb_type_top >= 0 && h->mb.i_mb_type_top != I_4x4 )
  817. + ctx++;
  818. +
  819. + x264_cabac_mb_header_intra( h, cb, i_mb_type, 3+ctx, 3+3, 3+4, 3+5, 3+6, 3+7 );
  820. + }
  821. + else if( h->sh.i_type == SLICE_TYPE_P )
  822. + {
  823. + /* prefix: 14, suffix: 17 */
  824. + if( i_mb_type == P_L0 )
  825. + {
  826. + x264_cabac_encode_decision_noup( cb, 14, 0 );
  827. + if( h->mb.i_partition == D_16x16 )
  828. + {
  829. + x264_cabac_encode_decision_noup( cb, 15, 0 );
  830. + x264_cabac_encode_decision_noup( cb, 16, 0 );
  831. + if( h->mb.pic.i_fref[0] > 1 )
  832. + x264_cabac_mb_ref( h, cb, 0, 0 );
  833. + x264_cabac_mb_mvd( h, cb, 0, 0, 4, 4 );
  834. + }
  835. + else if( h->mb.i_partition == D_16x8 )
  836. + {
  837. + x264_cabac_encode_decision_noup( cb, 15, 1 );
  838. + x264_cabac_encode_decision_noup( cb, 17, 1 );
  839. + if( h->mb.pic.i_fref[0] > 1 )
  840. + {
  841. + x264_cabac_mb_ref( h, cb, 0, 0 );
  842. + x264_cabac_mb_ref( h, cb, 0, 8 );
  843. + }
  844. + x264_cabac_mb_mvd( h, cb, 0, 0, 4, 2 );
  845. + x264_cabac_mb_mvd( h, cb, 0, 8, 4, 2 );
  846. + }
  847. + else //if( h->mb.i_partition == D_8x16 )
  848. + {
  849. + x264_cabac_encode_decision_noup( cb, 15, 1 );
  850. + x264_cabac_encode_decision_noup( cb, 17, 0 );
  851. + if( h->mb.pic.i_fref[0] > 1 )
  852. + {
  853. + x264_cabac_mb_ref( h, cb, 0, 0 );
  854. + x264_cabac_mb_ref( h, cb, 0, 4 );
  855. + }
  856. + x264_cabac_mb_mvd( h, cb, 0, 0, 2, 4 );
  857. + x264_cabac_mb_mvd( h, cb, 0, 4, 2, 4 );
  858. + }
  859. + }
  860. + else if( i_mb_type == P_8x8 )
  861. + {
  862. + x264_cabac_encode_decision_noup( cb, 14, 0 );
  863. + x264_cabac_encode_decision_noup( cb, 15, 0 );
  864. + x264_cabac_encode_decision_noup( cb, 16, 1 );
  865. +
  866. + /* sub mb type */
  867. + for( i = 0; i < 4; i++ )
  868. + x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[i] );
  869. +
  870. + /* ref 0 */
  871. + if( h->mb.pic.i_fref[0] > 1 )
  872. + {
  873. + x264_cabac_mb_ref( h, cb, 0, 0 );
  874. + x264_cabac_mb_ref( h, cb, 0, 4 );
  875. + x264_cabac_mb_ref( h, cb, 0, 8 );
  876. + x264_cabac_mb_ref( h, cb, 0, 12 );
  877. + }
  878. +
  879. + for( i = 0; i < 4; i++ )
  880. + x264_cabac_mb8x8_mvd( h, cb, i );
  881. + }
  882. + else /* intra */
  883. + {
  884. + /* prefix */
  885. + x264_cabac_encode_decision_noup( cb, 14, 1 );
  886. +
  887. + /* suffix */
  888. + x264_cabac_mb_header_intra( h, cb, i_mb_type, 17+0, 17+1, 17+2, 17+2, 17+3, 17+3 );
  889. + }
  890. + }
  891. + else //if( h->sh.i_type == SLICE_TYPE_B )
  892. + {
  893. + int ctx = 0;
  894. + if( h->mb.i_mb_type_left >= 0 && h->mb.i_mb_type_left != B_SKIP && h->mb.i_mb_type_left != B_DIRECT )
  895. + ctx++;
  896. + if( h->mb.i_mb_type_top >= 0 && h->mb.i_mb_type_top != B_SKIP && h->mb.i_mb_type_top != B_DIRECT )
  897. + ctx++;
  898. +
  899. + if( i_mb_type == B_DIRECT )
  900. + {
  901. + x264_cabac_encode_decision_noup( cb, 27+ctx, 0 );
  902. + return;
  903. + }
  904. + x264_cabac_encode_decision_noup( cb, 27+ctx, 1 );
  905. +
  906. + if( i_mb_type == B_8x8 )
  907. + {
  908. + x264_cabac_encode_decision_noup( cb, 27+3, 1 );
  909. + x264_cabac_encode_decision_noup( cb, 27+4, 1 );
  910. + x264_cabac_encode_decision ( cb, 27+5, 1 );
  911. + x264_cabac_encode_decision ( cb, 27+5, 1 );
  912. + x264_cabac_encode_decision_noup( cb, 27+5, 1 );
  913. +
  914. + /* sub mb type */
  915. + for( i = 0; i < 4; i++ )
  916. + x264_cabac_mb_sub_b_partition( cb, h->mb.i_sub_partition[i] );
  917. +
  918. + /* ref */
  919. + if( h->mb.pic.i_fref[0] > 1 )
  920. + for( i = 0; i < 4; i++ )
  921. + if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
  922. + x264_cabac_mb_ref( h, cb, 0, 4*i );
  923. +
  924. + if( h->mb.pic.i_fref[1] > 1 )
  925. + for( i = 0; i < 4; i++ )
  926. + if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
  927. + x264_cabac_mb_ref( h, cb, 1, 4*i );
  928. +
  929. + for( i = 0; i < 4; i++ )
  930. + if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
  931. + x264_cabac_mb_mvd( h, cb, 0, 4*i, 2, 2 );
  932. +
  933. + for( i = 0; i < 4; i++ )
  934. + if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
  935. + x264_cabac_mb_mvd( h, cb, 1, 4*i, 2, 2 );
  936. + }
  937. + else if( IS_INTRA( i_mb_type ) )
  938. + {
  939. + /* prefix */
  940. + x264_cabac_encode_decision_noup( cb, 27+3, 1 );
  941. + x264_cabac_encode_decision_noup( cb, 27+4, 1 );
  942. + x264_cabac_encode_decision ( cb, 27+5, 1 );
  943. + x264_cabac_encode_decision ( cb, 27+5, 0 );
  944. + x264_cabac_encode_decision ( cb, 27+5, 1 );
  945. +
  946. + /* suffix */
  947. + x264_cabac_mb_header_intra( h, cb, i_mb_type, 32+0, 32+1, 32+2, 32+2, 32+3, 32+3 );
  948. + }
  949. + else
  950. + {
  951. + static const uint8_t i_mb_bits[9*3] =
  952. + {
  953. + 0x31, 0x29, 0x4, /* L0 L0 */
  954. + 0x35, 0x2d, 0, /* L0 L1 */
  955. + 0x43, 0x63, 0, /* L0 BI */
  956. + 0x3d, 0x2f, 0, /* L1 L0 */
  957. + 0x39, 0x25, 0x6, /* L1 L1 */
  958. + 0x53, 0x73, 0, /* L1 BI */
  959. + 0x4b, 0x6b, 0, /* BI L0 */
  960. + 0x5b, 0x7b, 0, /* BI L1 */
  961. + 0x47, 0x67, 0x21 /* BI BI */
  962. + };
  963. +
  964. + const int idx = (i_mb_type - B_L0_L0) * 3 + (h->mb.i_partition - D_16x8);
  965. + int bits = i_mb_bits[idx];
  966. +
  967. + x264_cabac_encode_decision_noup( cb, 27+3, bits&1 );
  968. + x264_cabac_encode_decision( cb, 27+5-(bits&1), (bits>>1)&1 ); bits >>= 2;
  969. + if( bits != 1 )
  970. + {
  971. + x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
  972. + x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
  973. + x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
  974. + if( bits != 1 )
  975. + x264_cabac_encode_decision_noup( cb, 27+5, bits&1 );
  976. + }
  977. +
  978. + const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type];
  979. + if( h->mb.pic.i_fref[0] > 1 )
  980. + {
  981. + if( b_list[0][0] )
  982. + x264_cabac_mb_ref( h, cb, 0, 0 );
  983. + if( b_list[0][1] && h->mb.i_partition != D_16x16 )
  984. + x264_cabac_mb_ref( h, cb, 0, 8 >> (h->mb.i_partition == D_8x16) );
  985. + }
  986. + if( h->mb.pic.i_fref[1] > 1 )
  987. + {
  988. + if( b_list[1][0] )
  989. + x264_cabac_mb_ref( h, cb, 1, 0 );
  990. + if( b_list[1][1] && h->mb.i_partition != D_16x16 )
  991. + x264_cabac_mb_ref( h, cb, 1, 8 >> (h->mb.i_partition == D_8x16) );
  992. + }
  993. + for( i_list = 0; i_list < 2; i_list++ )
  994. + {
  995. + if( h->mb.i_partition == D_16x16 )
  996. + {
  997. + if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, cb, i_list, 0, 4, 4 );
  998. + }
  999. + else if( h->mb.i_partition == D_16x8 )
  1000. + {
  1001. + if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, cb, i_list, 0, 4, 2 );
  1002. + if( b_list[i_list][1] ) x264_cabac_mb_mvd( h, cb, i_list, 8, 4, 2 );
  1003. + }
  1004. + else //if( h->mb.i_partition == D_8x16 )
  1005. + {
  1006. + if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, cb, i_list, 0, 2, 4 );
  1007. + if( b_list[i_list][1] ) x264_cabac_mb_mvd( h, cb, i_list, 4, 2, 4 );
  1008. + }
  1009. + }
  1010. + }
  1011. + }
  1012. +}
  1013. +
  1014. /* i_ctxBlockCat: 0-> DC 16x16 i_idx = 0
  1015. * 1-> AC 16x16 i_idx = luma4x4idx
  1016. * 2-> Luma4x4 i_idx = luma4x4idx
  1017. @@ -752,7 +867,6 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
  1018. void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
  1019. {
  1020. const int i_mb_type = h->mb.i_type;
  1021. - int i_list;
  1022. int i;
  1023.  
  1024. #if !RDO_SKIP_BS
  1025. @@ -760,15 +874,14 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
  1026. int i_mb_pos_tex;
  1027. #endif
  1028.  
  1029. - /* Write the MB type */
  1030. - x264_cabac_mb_type( h, cb );
  1031. + x264_cabac_mb_header( h, cb );
  1032.  
  1033. #if !RDO_SKIP_BS
  1034. + i_mb_pos_tex = x264_cabac_pos( cb );
  1035. + h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
  1036. +
  1037. if( i_mb_type == I_PCM )
  1038. {
  1039. - i_mb_pos_tex = x264_cabac_pos( cb );
  1040. - h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
  1041. -
  1042. memcpy( cb->p, h->mb.pic.p_fenc[0], 256 );
  1043. cb->p += 256;
  1044. for( i = 0; i < 8; i++ )
  1045. @@ -793,140 +906,6 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
  1046. }
  1047. #endif
  1048.  
  1049. - if( IS_INTRA( i_mb_type ) )
  1050. - {
  1051. - if( h->pps->b_transform_8x8_mode && i_mb_type != I_16x16 )
  1052. - x264_cabac_mb_transform_size( h, cb );
  1053. -
  1054. - if( i_mb_type != I_16x16 )
  1055. - {
  1056. - int di = h->mb.b_transform_8x8 ? 4 : 1;
  1057. - for( i = 0; i < 16; i += di )
  1058. - {
  1059. - const int i_pred = x264_mb_predict_intra4x4_mode( h, i );
  1060. - const int i_mode = x264_mb_pred_mode4x4_fix( h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] );
  1061. - x264_cabac_mb_intra4x4_pred_mode( cb, i_pred, i_mode );
  1062. - }
  1063. - }
  1064. -
  1065. - x264_cabac_mb_intra_chroma_pred_mode( h, cb );
  1066. - }
  1067. - else if( i_mb_type == P_L0 )
  1068. - {
  1069. - if( h->mb.i_partition == D_16x16 )
  1070. - {
  1071. - if( h->mb.pic.i_fref[0] > 1 )
  1072. - {
  1073. - x264_cabac_mb_ref( h, cb, 0, 0 );
  1074. - }
  1075. - x264_cabac_mb_mvd( h, cb, 0, 0, 4, 4 );
  1076. - }
  1077. - else if( h->mb.i_partition == D_16x8 )
  1078. - {
  1079. - if( h->mb.pic.i_fref[0] > 1 )
  1080. - {
  1081. - x264_cabac_mb_ref( h, cb, 0, 0 );
  1082. - x264_cabac_mb_ref( h, cb, 0, 8 );
  1083. - }
  1084. - x264_cabac_mb_mvd( h, cb, 0, 0, 4, 2 );
  1085. - x264_cabac_mb_mvd( h, cb, 0, 8, 4, 2 );
  1086. - }
  1087. - else //if( h->mb.i_partition == D_8x16 )
  1088. - {
  1089. - if( h->mb.pic.i_fref[0] > 1 )
  1090. - {
  1091. - x264_cabac_mb_ref( h, cb, 0, 0 );
  1092. - x264_cabac_mb_ref( h, cb, 0, 4 );
  1093. - }
  1094. - x264_cabac_mb_mvd( h, cb, 0, 0, 2, 4 );
  1095. - x264_cabac_mb_mvd( h, cb, 0, 4, 2, 4 );
  1096. - }
  1097. - }
  1098. - else if( i_mb_type == P_8x8 )
  1099. - {
  1100. - /* sub mb type */
  1101. - for( i = 0; i < 4; i++ )
  1102. - x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[i] );
  1103. -
  1104. - /* ref 0 */
  1105. - if( h->mb.pic.i_fref[0] > 1 )
  1106. - {
  1107. - x264_cabac_mb_ref( h, cb, 0, 0 );
  1108. - x264_cabac_mb_ref( h, cb, 0, 4 );
  1109. - x264_cabac_mb_ref( h, cb, 0, 8 );
  1110. - x264_cabac_mb_ref( h, cb, 0, 12 );
  1111. - }
  1112. -
  1113. - for( i = 0; i < 4; i++ )
  1114. - x264_cabac_mb8x8_mvd( h, cb, i );
  1115. - }
  1116. - else if( i_mb_type == B_8x8 )
  1117. - {
  1118. - /* sub mb type */
  1119. - for( i = 0; i < 4; i++ )
  1120. - x264_cabac_mb_sub_b_partition( cb, h->mb.i_sub_partition[i] );
  1121. -
  1122. - /* ref */
  1123. - if( h->mb.pic.i_fref[0] > 1 )
  1124. - for( i = 0; i < 4; i++ )
  1125. - if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
  1126. - x264_cabac_mb_ref( h, cb, 0, 4*i );
  1127. -
  1128. - if( h->mb.pic.i_fref[1] > 1 )
  1129. - for( i = 0; i < 4; i++ )
  1130. - if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
  1131. - x264_cabac_mb_ref( h, cb, 1, 4*i );
  1132. -
  1133. - for( i = 0; i < 4; i++ )
  1134. - if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
  1135. - x264_cabac_mb_mvd( h, cb, 0, 4*i, 2, 2 );
  1136. -
  1137. - for( i = 0; i < 4; i++ )
  1138. - if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
  1139. - x264_cabac_mb_mvd( h, cb, 1, 4*i, 2, 2 );
  1140. - }
  1141. - else if( i_mb_type != B_DIRECT )
  1142. - {
  1143. - /* All B mode */
  1144. - const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type];
  1145. - if( h->mb.pic.i_fref[0] > 1 )
  1146. - {
  1147. - if( b_list[0][0] )
  1148. - x264_cabac_mb_ref( h, cb, 0, 0 );
  1149. - if( b_list[0][1] && h->mb.i_partition != D_16x16 )
  1150. - x264_cabac_mb_ref( h, cb, 0, 8 >> (h->mb.i_partition == D_8x16) );
  1151. - }
  1152. - if( h->mb.pic.i_fref[1] > 1 )
  1153. - {
  1154. - if( b_list[1][0] )
  1155. - x264_cabac_mb_ref( h, cb, 1, 0 );
  1156. - if( b_list[1][1] && h->mb.i_partition != D_16x16 )
  1157. - x264_cabac_mb_ref( h, cb, 1, 8 >> (h->mb.i_partition == D_8x16) );
  1158. - }
  1159. - for( i_list = 0; i_list < 2; i_list++ )
  1160. - {
  1161. - if( h->mb.i_partition == D_16x16 )
  1162. - {
  1163. - if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, cb, i_list, 0, 4, 4 );
  1164. - }
  1165. - else if( h->mb.i_partition == D_16x8 )
  1166. - {
  1167. - if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, cb, i_list, 0, 4, 2 );
  1168. - if( b_list[i_list][1] ) x264_cabac_mb_mvd( h, cb, i_list, 8, 4, 2 );
  1169. - }
  1170. - else //if( h->mb.i_partition == D_8x16 )
  1171. - {
  1172. - if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, cb, i_list, 0, 2, 4 );
  1173. - if( b_list[i_list][1] ) x264_cabac_mb_mvd( h, cb, i_list, 4, 2, 4 );
  1174. - }
  1175. - }
  1176. - }
  1177. -
  1178. -#if !RDO_SKIP_BS
  1179. - i_mb_pos_tex = x264_cabac_pos( cb );
  1180. - h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
  1181. -#endif
  1182. -
  1183. if( i_mb_type != I_16x16 )
  1184. {
  1185. x264_cabac_mb_cbp_luma( h, cb );
  1186. @@ -934,11 +913,9 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
  1187. }
  1188.  
  1189. if( x264_mb_transform_8x8_allowed( h ) && h->mb.i_cbp_luma )
  1190. - {
  1191. x264_cabac_mb_transform_size( h, cb );
  1192. - }
  1193.  
  1194. - if( h->mb.i_cbp_luma > 0 || h->mb.i_cbp_chroma > 0 || i_mb_type == I_16x16 )
  1195. + if( h->mb.i_cbp_luma || h->mb.i_cbp_chroma || i_mb_type == I_16x16 )
  1196. {
  1197. const int b_intra = IS_INTRA( i_mb_type );
  1198. x264_cabac_mb_qp_delta( h, cb );
  1199. @@ -950,7 +927,7 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
  1200. block_residual_write_cabac_cbf( h, cb, DCT_LUMA_DC, 24, h->dct.luma16x16_dc, 1 );
  1201.  
  1202. /* AC Luma */
  1203. - if( h->mb.i_cbp_luma != 0 )
  1204. + if( h->mb.i_cbp_luma )
  1205. for( i = 0; i < 16; i++ )
  1206. block_residual_write_cabac_cbf( h, cb, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1, 1 );
  1207. }
  1208. @@ -967,7 +944,7 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
  1209. block_residual_write_cabac_cbf( h, cb, DCT_LUMA_4x4, i, h->dct.luma4x4[i], b_intra );
  1210. }
  1211.  
  1212. - if( h->mb.i_cbp_chroma&0x03 ) /* Chroma DC residual present */
  1213. + if( h->mb.i_cbp_chroma ) /* Chroma DC residual present */
  1214. {
  1215. block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], b_intra );
  1216. block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], b_intra );
  1217. diff --git a/encoder/cavlc.c b/encoder/cavlc.c
  1218. index c65c9bd..d18408b 100644
  1219. --- a/encoder/cavlc.c
  1220. +++ b/encoder/cavlc.c
  1221. @@ -203,7 +203,7 @@ static const uint8_t ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3};
  1222. *nnz = block_residual_write_cavlc(h,cat,l,nC);\
  1223. }
  1224.  
  1225. -static void cavlc_qp_delta( x264_t *h )
  1226. +static void x264_cavlc_mb_qp_delta( x264_t *h )
  1227. {
  1228. bs_t *s = &h->out.bs;
  1229. int i_dqp = h->mb.i_qp - h->mb.i_last_qp;
  1230. @@ -228,7 +228,7 @@ static void cavlc_qp_delta( x264_t *h )
  1231. bs_write_se( s, i_dqp );
  1232. }
  1233.  
  1234. -static void cavlc_mb_mvd( x264_t *h, int i_list, int idx, int width )
  1235. +static void x264_cavlc_mb_mvd( x264_t *h, int i_list, int idx, int width )
  1236. {
  1237. bs_t *s = &h->out.bs;
  1238. ALIGNED_4( int16_t mvp[2] );
  1239. @@ -237,26 +237,26 @@ static void cavlc_mb_mvd( x264_t *h, int i_list, int idx, int width )
  1240. bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[idx]][1] - mvp[1] );
  1241. }
  1242.  
  1243. -static inline void cavlc_mb8x8_mvd( x264_t *h, int i )
  1244. +static inline void x264_cavlc_mb8x8_mvd( x264_t *h, int i )
  1245. {
  1246. switch( h->mb.i_sub_partition[i] )
  1247. {
  1248. case D_L0_8x8:
  1249. - cavlc_mb_mvd( h, 0, 4*i, 2 );
  1250. + x264_cavlc_mb_mvd( h, 0, 4*i, 2 );
  1251. break;
  1252. case D_L0_8x4:
  1253. - cavlc_mb_mvd( h, 0, 4*i+0, 2 );
  1254. - cavlc_mb_mvd( h, 0, 4*i+2, 2 );
  1255. + x264_cavlc_mb_mvd( h, 0, 4*i+0, 2 );
  1256. + x264_cavlc_mb_mvd( h, 0, 4*i+2, 2 );
  1257. break;
  1258. case D_L0_4x8:
  1259. - cavlc_mb_mvd( h, 0, 4*i+0, 1 );
  1260. - cavlc_mb_mvd( h, 0, 4*i+1, 1 );
  1261. + x264_cavlc_mb_mvd( h, 0, 4*i+0, 1 );
  1262. + x264_cavlc_mb_mvd( h, 0, 4*i+1, 1 );
  1263. break;
  1264. case D_L0_4x4:
  1265. - cavlc_mb_mvd( h, 0, 4*i+0, 1 );
  1266. - cavlc_mb_mvd( h, 0, 4*i+1, 1 );
  1267. - cavlc_mb_mvd( h, 0, 4*i+2, 1 );
  1268. - cavlc_mb_mvd( h, 0, 4*i+3, 1 );
  1269. + x264_cavlc_mb_mvd( h, 0, 4*i+0, 1 );
  1270. + x264_cavlc_mb_mvd( h, 0, 4*i+1, 1 );
  1271. + x264_cavlc_mb_mvd( h, 0, 4*i+2, 1 );
  1272. + x264_cavlc_mb_mvd( h, 0, 4*i+3, 1 );
  1273. break;
  1274. }
  1275. }
  1276. @@ -372,7 +372,7 @@ void x264_macroblock_write_cavlc( x264_t *h )
  1277.  
  1278. if( h->mb.pic.i_fref[0] > 1 )
  1279. bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
  1280. - cavlc_mb_mvd( h, 0, 0, 4 );
  1281. + x264_cavlc_mb_mvd( h, 0, 0, 4 );
  1282. }
  1283. else if( h->mb.i_partition == D_16x8 )
  1284. {
  1285. @@ -382,8 +382,8 @@ void x264_macroblock_write_cavlc( x264_t *h )
  1286. bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
  1287. bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] );
  1288. }
  1289. - cavlc_mb_mvd( h, 0, 0, 4 );
  1290. - cavlc_mb_mvd( h, 0, 8, 4 );
  1291. + x264_cavlc_mb_mvd( h, 0, 0, 4 );
  1292. + x264_cavlc_mb_mvd( h, 0, 8, 4 );
  1293. }
  1294. else if( h->mb.i_partition == D_8x16 )
  1295. {
  1296. @@ -393,8 +393,8 @@ void x264_macroblock_write_cavlc( x264_t *h )
  1297. bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
  1298. bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[4]] );
  1299. }
  1300. - cavlc_mb_mvd( h, 0, 0, 2 );
  1301. - cavlc_mb_mvd( h, 0, 4, 2 );
  1302. + x264_cavlc_mb_mvd( h, 0, 0, 2 );
  1303. + x264_cavlc_mb_mvd( h, 0, 4, 2 );
  1304. }
  1305. }
  1306. else if( i_mb_type == P_8x8 )
  1307. @@ -429,7 +429,7 @@ void x264_macroblock_write_cavlc( x264_t *h )
  1308. }
  1309.  
  1310. for( i = 0; i < 4; i++ )
  1311. - cavlc_mb8x8_mvd( h, i );
  1312. + x264_cavlc_mb8x8_mvd( h, i );
  1313. }
  1314. else if( i_mb_type == B_8x8 )
  1315. {
  1316. @@ -452,10 +452,10 @@ void x264_macroblock_write_cavlc( x264_t *h )
  1317. /* mvd */
  1318. for( i = 0; i < 4; i++ )
  1319. if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
  1320. - cavlc_mb_mvd( h, 0, 4*i, 2 );
  1321. + x264_cavlc_mb_mvd( h, 0, 4*i, 2 );
  1322. for( i = 0; i < 4; i++ )
  1323. if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
  1324. - cavlc_mb_mvd( h, 1, 4*i, 2 );
  1325. + x264_cavlc_mb_mvd( h, 1, 4*i, 2 );
  1326. }
  1327. else if( i_mb_type != B_DIRECT )
  1328. {
  1329. @@ -470,8 +470,8 @@ void x264_macroblock_write_cavlc( x264_t *h )
  1330. {
  1331. if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[0]] );
  1332. if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[0]] );
  1333. - if( b_list[0][0] ) cavlc_mb_mvd( h, 0, 0, 4 );
  1334. - if( b_list[1][0] ) cavlc_mb_mvd( h, 1, 0, 4 );
  1335. + if( b_list[0][0] ) x264_cavlc_mb_mvd( h, 0, 0, 4 );
  1336. + if( b_list[1][0] ) x264_cavlc_mb_mvd( h, 1, 0, 4 );
  1337. }
  1338. else
  1339. {
  1340. @@ -481,17 +481,17 @@ void x264_macroblock_write_cavlc( x264_t *h )
  1341. if( i_ref1_max && b_list[1][1] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[12]] );
  1342. if( h->mb.i_partition == D_16x8 )
  1343. {
  1344. - if( b_list[0][0] ) cavlc_mb_mvd( h, 0, 0, 4 );
  1345. - if( b_list[0][1] ) cavlc_mb_mvd( h, 0, 8, 4 );
  1346. - if( b_list[1][0] ) cavlc_mb_mvd( h, 1, 0, 4 );
  1347. - if( b_list[1][1] ) cavlc_mb_mvd( h, 1, 8, 4 );
  1348. + if( b_list[0][0] ) x264_cavlc_mb_mvd( h, 0, 0, 4 );
  1349. + if( b_list[0][1] ) x264_cavlc_mb_mvd( h, 0, 8, 4 );
  1350. + if( b_list[1][0] ) x264_cavlc_mb_mvd( h, 1, 0, 4 );
  1351. + if( b_list[1][1] ) x264_cavlc_mb_mvd( h, 1, 8, 4 );
  1352. }
  1353. else //if( h->mb.i_partition == D_8x16 )
  1354. {
  1355. - if( b_list[0][0] ) cavlc_mb_mvd( h, 0, 0, 2 );
  1356. - if( b_list[0][1] ) cavlc_mb_mvd( h, 0, 4, 2 );
  1357. - if( b_list[1][0] ) cavlc_mb_mvd( h, 1, 0, 2 );
  1358. - if( b_list[1][1] ) cavlc_mb_mvd( h, 1, 4, 2 );
  1359. + if( b_list[0][0] ) x264_cavlc_mb_mvd( h, 0, 0, 2 );
  1360. + if( b_list[0][1] ) x264_cavlc_mb_mvd( h, 0, 4, 2 );
  1361. + if( b_list[1][0] ) x264_cavlc_mb_mvd( h, 1, 0, 2 );
  1362. + if( b_list[1][1] ) x264_cavlc_mb_mvd( h, 1, 4, 2 );
  1363. }
  1364. }
  1365. }
  1366. @@ -514,31 +514,31 @@ void x264_macroblock_write_cavlc( x264_t *h )
  1367. bs_write1( s, h->mb.b_transform_8x8 );
  1368.  
  1369. /* write residual */
  1370. - if( i_mb_type == I_16x16 )
  1371. + if( h->mb.i_cbp_luma || h->mb.i_cbp_chroma || i_mb_type == I_16x16 )
  1372. {
  1373. - cavlc_qp_delta( h );
  1374. + x264_cavlc_mb_qp_delta( h );
  1375.  
  1376. - /* DC Luma */
  1377. - block_residual_write_cavlc( h, DCT_LUMA_DC, 24 , h->dct.luma16x16_dc );
  1378. + if( i_mb_type == I_16x16 )
  1379. + {
  1380. + /* DC Luma */
  1381. + block_residual_write_cavlc( h, DCT_LUMA_DC, 24 , h->dct.luma16x16_dc );
  1382.  
  1383. - /* AC Luma */
  1384. - if( h->mb.i_cbp_luma )
  1385. - for( i = 0; i < 16; i++ )
  1386. - block_residual_write_cavlc( h, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1 );
  1387. - }
  1388. - else if( h->mb.i_cbp_luma | h->mb.i_cbp_chroma )
  1389. - {
  1390. - cavlc_qp_delta( h );
  1391. - x264_macroblock_luma_write_cavlc( h, 0, 3 );
  1392. - }
  1393. - if( h->mb.i_cbp_chroma )
  1394. - {
  1395. - /* Chroma DC residual present */
  1396. - block_residual_write_cavlc( h, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0] );
  1397. - block_residual_write_cavlc( h, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1] );
  1398. - if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
  1399. - for( i = 16; i < 24; i++ )
  1400. - block_residual_write_cavlc( h, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 );
  1401. + /* AC Luma */
  1402. + if( h->mb.i_cbp_luma )
  1403. + for( i = 0; i < 16; i++ )
  1404. + block_residual_write_cavlc( h, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1 );
  1405. + }
  1406. + else
  1407. + x264_macroblock_luma_write_cavlc( h, 0, 3 );
  1408. +
  1409. + if( h->mb.i_cbp_chroma ) /* Chroma DC residual present */
  1410. + {
  1411. + block_residual_write_cavlc( h, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0] );
  1412. + block_residual_write_cavlc( h, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1] );
  1413. + if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
  1414. + for( i = 16; i < 24; i++ )
  1415. + block_residual_write_cavlc( h, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 );
  1416. + }
  1417. }
  1418.  
  1419. #if !RDO_SKIP_BS
  1420. @@ -563,22 +563,22 @@ static int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel )
  1421.  
  1422. if( i_mb_type == P_8x8 )
  1423. {
  1424. - cavlc_mb8x8_mvd( h, i8 );
  1425. + x264_cavlc_mb8x8_mvd( h, i8 );
  1426. bs_write_ue( s, sub_mb_type_p_to_golomb[ h->mb.i_sub_partition[i8] ] );
  1427. }
  1428. else if( i_mb_type == P_L0 )
  1429. - cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 );
  1430. + x264_cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 );
  1431. else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 )
  1432. {
  1433. - if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 );
  1434. - if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) cavlc_mb_mvd( h, 1, 4*i8, 4>>b_8x16 );
  1435. + if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) x264_cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 );
  1436. + if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) x264_cavlc_mb_mvd( h, 1, 4*i8, 4>>b_8x16 );
  1437. }
  1438. else //if( i_mb_type == B_8x8 )
  1439. {
  1440. if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] )
  1441. - cavlc_mb_mvd( h, 0, 4*i8, 2 );
  1442. + x264_cavlc_mb_mvd( h, 0, 4*i8, 2 );
  1443. if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] )
  1444. - cavlc_mb_mvd( h, 1, 4*i8, 2 );
  1445. + x264_cavlc_mb_mvd( h, 1, 4*i8, 2 );
  1446. }
  1447.  
  1448. for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- )
  1449. @@ -596,7 +596,7 @@ static int x264_subpartition_size_cavlc( x264_t *h, int i4, int i_pixel )
  1450. {
  1451. int b_8x4 = i_pixel == PIXEL_8x4;
  1452. h->out.bs.i_bits_encoded = 0;
  1453. - cavlc_mb_mvd( h, 0, i4, 1+b_8x4 );
  1454. + x264_cavlc_mb_mvd( h, 0, i4, 1+b_8x4 );
  1455. block_residual_write_cavlc( h, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4] );
  1456. if( i_pixel != PIXEL_4x4 )
  1457. {
  1458. --
  1459. 1.6.1.2
  1460.  
  1461.  
  1462. From 8b3167396b9f48eefe4f6d1c7fda24d3f8e91dfc Mon Sep 17 00:00:00 2001
  1463. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  1464. Date: Wed, 3 Feb 2010 18:19:29 -0800
  1465. Subject: [PATCH 05/26] Simplify decimate checks in macroblock_encode
  1466. Also fix a misleading comment on x264_macroblock_probe_skip.
  1467.  
  1468. ---
  1469. common/common.h | 1 +
  1470. encoder/analyse.c | 2 ++
  1471. encoder/macroblock.c | 12 +++++-------
  1472. 3 files changed, 8 insertions(+), 7 deletions(-)
  1473.  
  1474. diff --git a/common/common.h b/common/common.h
  1475. index 950f48f..8b1b05a 100644
  1476. --- a/common/common.h
  1477. +++ b/common/common.h
  1478. @@ -484,6 +484,7 @@ struct x264_t
  1479. int b_chroma_me;
  1480. int b_trellis;
  1481. int b_noise_reduction;
  1482. + int b_dct_decimate;
  1483. int i_psy_rd; /* Psy RD strength--fixed point value*/
  1484. int i_psy_trellis; /* Psy trellis strength--fixed point value*/
  1485.  
  1486. diff --git a/encoder/analyse.c b/encoder/analyse.c
  1487. index 1fb2206..92d6584 100644
  1488. --- a/encoder/analyse.c
  1489. +++ b/encoder/analyse.c
  1490. @@ -364,6 +364,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
  1491. h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
  1492. h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
  1493. && h->mb.i_subpel_refine >= 5;
  1494. + h->mb.b_dct_decimate = h->sh.i_type == SLICE_TYPE_B ||
  1495. + (h->param.analyse.b_dct_decimate && h->sh.i_type != SLICE_TYPE_I);
  1496.  
  1497. h->mb.b_transform_8x8 = 0;
  1498. h->mb.b_noise_reduction = 0;
  1499. diff --git a/encoder/macroblock.c b/encoder/macroblock.c
  1500. index e4edb8a..fa7942d 100644
  1501. --- a/encoder/macroblock.c
  1502. +++ b/encoder/macroblock.c
  1503. @@ -208,8 +208,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
  1504. ALIGNED_ARRAY_16( int16_t, dct_dc4x4,[16] );
  1505.  
  1506. int i, nz;
  1507. - int b_decimate = h->sh.i_type == SLICE_TYPE_B || (h->param.analyse.b_dct_decimate && h->sh.i_type == SLICE_TYPE_P);
  1508. - int decimate_score = b_decimate ? 0 : 9;
  1509. + int decimate_score = h->mb.b_dct_decimate ? 0 : 9;
  1510.  
  1511. if( h->mb.b_lossless )
  1512. {
  1513. @@ -342,7 +341,7 @@ static inline int x264_mb_optimize_chroma_dc( x264_t *h, int b_inter, int i_qp,
  1514. void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
  1515. {
  1516. int i, ch, nz, nz_dc;
  1517. - int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate);
  1518. + int b_decimate = b_inter && h->mb.b_dct_decimate;
  1519. ALIGNED_ARRAY_16( int16_t, dct2x2,[4] );
  1520. h->mb.i_cbp_chroma = 0;
  1521.  
  1522. @@ -607,7 +606,7 @@ void x264_macroblock_encode( x264_t *h )
  1523. {
  1524. int i_cbp_dc = 0;
  1525. int i_qp = h->mb.i_qp;
  1526. - int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate;
  1527. + int b_decimate = h->mb.b_dct_decimate;
  1528. int b_force_no_skip = 0;
  1529. int i,idx,nz;
  1530. h->mb.i_cbp_luma = 0;
  1531. @@ -914,8 +913,7 @@ void x264_macroblock_encode( x264_t *h )
  1532.  
  1533. /*****************************************************************************
  1534. * x264_macroblock_probe_skip:
  1535. - * Check if the current MB could be encoded as a [PB]_SKIP (it supposes you use
  1536. - * the previous QP
  1537. + * Check if the current MB could be encoded as a [PB]_SKIP
  1538. *****************************************************************************/
  1539. int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
  1540. {
  1541. @@ -1052,7 +1050,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
  1542. int i_qp = h->mb.i_qp;
  1543. uint8_t *p_fenc = h->mb.pic.p_fenc[0] + (i8&1)*8 + (i8>>1)*8*FENC_STRIDE;
  1544. uint8_t *p_fdec = h->mb.pic.p_fdec[0] + (i8&1)*8 + (i8>>1)*8*FDEC_STRIDE;
  1545. - int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate;
  1546. + int b_decimate = h->mb.b_dct_decimate;
  1547. int nnz8x8 = 0;
  1548. int ch, nz;
  1549.  
  1550. --
  1551. 1.6.1.2
  1552.  
  1553.  
  1554. From ea1bb5fb815d19ade6ace7482094bc8bb8b276c5 Mon Sep 17 00:00:00 2001
  1555. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  1556. Date: Wed, 3 Feb 2010 18:36:44 -0800
  1557. Subject: [PATCH 06/26] Fix subpel iteration counts with B-frame analysis and subme 6/8
  1558. Since subme 6 means "like subme 5, except RD on P-frames", B-frame analysis
  1559. shouldn't use the RD subpel counts at subme 6. Similarly with subme 8.
  1560. Slightly faster encoding (and very marginally worse compression) at subme 6 and 8.
  1561.  
  1562. ---
  1563. encoder/analyse.c | 2 ++
  1564. 1 files changed, 2 insertions(+), 0 deletions(-)
  1565.  
  1566. diff --git a/encoder/analyse.c b/encoder/analyse.c
  1567. index 92d6584..c15bf8f 100644
  1568. --- a/encoder/analyse.c
  1569. +++ b/encoder/analyse.c
  1570. @@ -362,6 +362,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
  1571.  
  1572. h->mb.i_me_method = h->param.analyse.i_me_method;
  1573. h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
  1574. + if( h->sh.i_type == SLICE_TYPE_B && (h->mb.i_subpel_refine == 6 || h->mb.i_subpel_refine == 8) )
  1575. + h->mb.i_subpel_refine--;
  1576. h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
  1577. && h->mb.i_subpel_refine >= 5;
  1578. h->mb.b_dct_decimate = h->sh.i_type == SLICE_TYPE_B ||
  1579. --
  1580. 1.6.1.2
  1581.  
  1582.  
  1583. From 51f1ee4cfc93870c89c8708bcc79d83236c07f7e Mon Sep 17 00:00:00 2001
  1584. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  1585. Date: Wed, 3 Feb 2010 20:01:16 -0800
  1586. Subject: [PATCH 07/26] Smarter QPRD
  1587. Catch some cases in which RD checks can be avoided; reduces QPRD RD calls by 10-20%.
  1588.  
  1589. ---
  1590. encoder/analyse.c | 42 ++++++++++++++++++++++++++++++++++++++----
  1591. 1 files changed, 38 insertions(+), 4 deletions(-)
  1592.  
  1593. diff --git a/encoder/analyse.c b/encoder/analyse.c
  1594. index c15bf8f..1d48b7d 100644
  1595. --- a/encoder/analyse.c
  1596. +++ b/encoder/analyse.c
  1597. @@ -2307,9 +2307,10 @@ static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
  1598. int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
  1599. int last_qp_tried = 0;
  1600. origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
  1601. + int origcbp = h->mb.cbp[h->mb.i_mb_xy];
  1602.  
  1603. /* If CBP is already zero, don't raise the quantizer any higher. */
  1604. - for( direction = h->mb.cbp[h->mb.i_mb_xy] ? 1 : -1; direction >= -1; direction-=2 )
  1605. + for( direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
  1606. {
  1607. /* Without psy-RD, require monotonicity when moving quant away from previous
  1608. * macroblock's quant; allow 1 failure when moving quant towards previous quant.
  1609. @@ -2324,14 +2325,47 @@ static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
  1610. h->mb.i_qp = orig_qp;
  1611. failures = 0;
  1612. prevcost = origcost;
  1613. +
  1614. + /* If the current QP results in an empty CBP, it's highly likely that lower QPs
  1615. + * (up to a point) will too. So, jump down to where the threshold will kick in
  1616. + * and check the QP there. If the CBP is still empty, skip the main loop.
  1617. + * If it isn't empty, we would have ended up having to check this QP anyways,
  1618. + * so as long as we store it for later lookup, we lose nothing. */
  1619. + int already_checked_qp = -1;
  1620. + int already_checked_cost = COST_MAX;
  1621. + if( direction == -1 )
  1622. + {
  1623. + if( !origcbp )
  1624. + {
  1625. + h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, h->param.rc.i_qp_min );
  1626. + h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
  1627. + already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
  1628. + if( !h->mb.cbp[h->mb.i_mb_xy] )
  1629. + {
  1630. + /* If our empty-CBP block is lower QP than the last QP,
  1631. + * the last QP almost surely doesn't have a CBP either. */
  1632. + if( h->mb.i_last_qp > h->mb.i_qp )
  1633. + last_qp_tried = 1;
  1634. + break;
  1635. + }
  1636. + already_checked_qp = h->mb.i_qp;
  1637. + h->mb.i_qp = orig_qp;
  1638. + }
  1639. + }
  1640. +
  1641. h->mb.i_qp += direction;
  1642. while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
  1643. {
  1644. if( h->mb.i_last_qp == h->mb.i_qp )
  1645. last_qp_tried = 1;
  1646. - h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
  1647. - cost = x264_rd_cost_mb( h, a->i_lambda2 );
  1648. - COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
  1649. + if( h->mb.i_qp == already_checked_qp )
  1650. + cost = already_checked_cost;
  1651. + else
  1652. + {
  1653. + h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
  1654. + cost = x264_rd_cost_mb( h, a->i_lambda2 );
  1655. + COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
  1656. + }
  1657.  
  1658. /* We can't assume that the costs are monotonic over QPs.
  1659. * Tie case-as-failure seems to give better results. */
  1660. --
  1661. 1.6.1.2
  1662.  
  1663.  
  1664. From 029e2dfc709039b56ec0cd195a0803c160ed73d9 Mon Sep 17 00:00:00 2001
  1665. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  1666. Date: Wed, 3 Feb 2010 20:27:57 -0800
  1667. Subject: [PATCH 08/26] Fix 2-pass ratecontrol continuation in case of missing statsfile
  1668. Continuation didn't work properly if MB-tree was enabled; MB-tree is now disabled together with stats reading.
  1669.  
  1670. ---
  1671. encoder/ratecontrol.c | 1 +
  1672. 1 files changed, 1 insertions(+), 0 deletions(-)
  1673.  
  1674. diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
  1675. index 52196e7..e314ba2 100644
  1676. --- a/encoder/ratecontrol.c
  1677. +++ b/encoder/ratecontrol.c
  1678. @@ -1280,6 +1280,7 @@ int x264_ratecontrol_slice_type( x264_t *h, int frame_num )
  1679. h->thread[i]->param.rc.b_stat_read = 0;
  1680. h->thread[i]->param.i_bframe_adaptive = 0;
  1681. h->thread[i]->param.i_scenecut_threshold = 0;
  1682. + h->thread[i]->param.rc.b_mb_tree = 0;
  1683. if( h->thread[i]->param.i_bframe > 1 )
  1684. h->thread[i]->param.i_bframe = 1;
  1685. }
  1686. --
  1687. 1.6.1.2
  1688.  
  1689.  
  1690. From de673993912a20ca9616f8733dbfbaf5c2d144f2 Mon Sep 17 00:00:00 2001
  1691. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  1692. Date: Fri, 5 Feb 2010 16:15:23 -0800
  1693. Subject: [PATCH 09/26] Various CABAC/CAVLC cleanups/speedups
  1694. Make some if/else chains into switch statements.
  1695. Store CBP data in x264_t and only move it to frame storage later.
  1696. This saves a wasted cache line and some unnecessary dereferences in RDO.
  1697.  
  1698. ---
  1699. common/common.h | 1 +
  1700. common/macroblock.c | 3 +-
  1701. encoder/analyse.c | 8 +-
  1702. encoder/cabac.c | 40 +++---
  1703. encoder/cavlc.c | 365 ++++++++++++++++++++++++++------------------------
  1704. encoder/macroblock.c | 19 +--
  1705. 6 files changed, 219 insertions(+), 217 deletions(-)
  1706.  
  1707. diff --git a/common/common.h b/common/common.h
  1708. index 8b1b05a..d4a8dd9 100644
  1709. --- a/common/common.h
  1710. +++ b/common/common.h
  1711. @@ -542,6 +542,7 @@ struct x264_t
  1712. ALIGNED_4( uint8_t i_sub_partition[4] );
  1713. int b_transform_8x8;
  1714.  
  1715. + int i_cbp_combined;
  1716. int i_cbp_luma;
  1717. int i_cbp_chroma;
  1718.  
  1719. diff --git a/common/macroblock.c b/common/macroblock.c
  1720. index 10f09ac..d86f3af 100644
  1721. --- a/common/macroblock.c
  1722. +++ b/common/macroblock.c
  1723. @@ -1343,11 +1343,12 @@ void x264_macroblock_cache_save( x264_t *h )
  1724. M16( &non_zero_count[16+2*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+2*2]-1] ) >> 8;
  1725. M16( &non_zero_count[16+3*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+3*2]-1] ) >> 8;
  1726.  
  1727. - if( h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0 )
  1728. + if( h->mb.i_type != I_16x16 && !h->mb.i_cbp_combined )
  1729. h->mb.i_qp = h->mb.i_last_qp;
  1730. h->mb.qp[i_mb_xy] = h->mb.i_qp;
  1731. h->mb.i_last_dqp = h->mb.i_qp - h->mb.i_last_qp;
  1732. h->mb.i_last_qp = h->mb.i_qp;
  1733. + h->mb.cbp[i_mb_xy] = h->mb.i_cbp_combined;
  1734. }
  1735.  
  1736. if( h->mb.i_cbp_luma == 0 && h->mb.i_type != I_8x8 )
  1737. diff --git a/encoder/analyse.c b/encoder/analyse.c
  1738. index 1d48b7d..63db36a 100644
  1739. --- a/encoder/analyse.c
  1740. +++ b/encoder/analyse.c
  1741. @@ -1199,7 +1199,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
  1742. h->mb.i_partition = D_16x16;
  1743. x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
  1744. a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
  1745. - if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
  1746. + if( !h->mb.i_cbp_combined )
  1747. h->mb.i_type = P_SKIP;
  1748. }
  1749. }
  1750. @@ -2307,7 +2307,7 @@ static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
  1751. int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
  1752. int last_qp_tried = 0;
  1753. origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
  1754. - int origcbp = h->mb.cbp[h->mb.i_mb_xy];
  1755. + int origcbp = h->mb.i_cbp_combined;
  1756.  
  1757. /* If CBP is already zero, don't raise the quantizer any higher. */
  1758. for( direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
  1759. @@ -2340,7 +2340,7 @@ static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
  1760. h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, h->param.rc.i_qp_min );
  1761. h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
  1762. already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
  1763. - if( !h->mb.cbp[h->mb.i_mb_xy] )
  1764. + if( !h->mb.i_cbp_combined )
  1765. {
  1766. /* If our empty-CBP block is lower QP than the last QP,
  1767. * the last QP almost surely doesn't have a CBP either. */
  1768. @@ -2377,7 +2377,7 @@ static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
  1769.  
  1770. if( failures > threshold )
  1771. break;
  1772. - if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
  1773. + if( direction == 1 && !h->mb.i_cbp_combined )
  1774. break;
  1775. h->mb.i_qp += direction;
  1776. }
  1777. diff --git a/encoder/cabac.c b/encoder/cabac.c
  1778. index 6ff2aed..6c14722 100644
  1779. --- a/encoder/cabac.c
  1780. +++ b/encoder/cabac.c
  1781. @@ -107,7 +107,7 @@ static void x264_cabac_mb_qp_delta( x264_t *h, x264_cabac_t *cb )
  1782. int i_dqp = h->mb.i_qp - h->mb.i_last_qp;
  1783.  
  1784. /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely flat background area */
  1785. - if( h->mb.i_type == I_16x16 && !h->mb.cbp[h->mb.i_mb_xy] )
  1786. + if( h->mb.i_type == I_16x16 && !h->mb.i_cbp_combined )
  1787. {
  1788. #if !RDO_SKIP_BS
  1789. h->mb.i_qp = h->mb.i_last_qp;
  1790. @@ -915,7 +915,7 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
  1791. if( x264_mb_transform_8x8_allowed( h ) && h->mb.i_cbp_luma )
  1792. x264_cabac_mb_transform_size( h, cb );
  1793.  
  1794. - if( h->mb.i_cbp_luma || h->mb.i_cbp_chroma || i_mb_type == I_16x16 )
  1795. + if( h->mb.i_cbp_combined || i_mb_type == I_16x16 )
  1796. {
  1797. const int b_intra = IS_INTRA( i_mb_type );
  1798. x264_cabac_mb_qp_delta( h, cb );
  1799. @@ -973,24 +973,24 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
  1800. int b_8x16 = h->mb.i_partition == D_8x16;
  1801. int j;
  1802.  
  1803. - if( i_mb_type == P_8x8 )
  1804. + switch( i_mb_type )
  1805. {
  1806. - x264_cabac_mb8x8_mvd( h, cb, i8 );
  1807. - x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[i8] );
  1808. - }
  1809. - else if( i_mb_type == P_L0 )
  1810. - x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 );
  1811. - else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 )
  1812. - {
  1813. - if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 );
  1814. - if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) x264_cabac_mb_mvd( h, cb, 1, 4*i8, 4>>b_8x16, 2<<b_8x16 );
  1815. - }
  1816. - else //if( i_mb_type == B_8x8 )
  1817. - {
  1818. - if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] )
  1819. - x264_cabac_mb_mvd( h, cb, 0, 4*i8, 2, 2 );
  1820. - if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] )
  1821. - x264_cabac_mb_mvd( h, cb, 1, 4*i8, 2, 2 );
  1822. + case P_L0:
  1823. + x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 );
  1824. + break;
  1825. + case P_8x8:
  1826. + x264_cabac_mb8x8_mvd( h, cb, i8 );
  1827. + x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[i8] );
  1828. + break;
  1829. + case B_8x8:
  1830. + if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] )
  1831. + x264_cabac_mb_mvd( h, cb, 0, 4*i8, 2, 2 );
  1832. + if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] )
  1833. + x264_cabac_mb_mvd( h, cb, 1, 4*i8, 2, 2 );
  1834. + break;
  1835. + default: /* Rest of the B types */
  1836. + if( x264_mb_type_list_table[i_mb_type][0][!!i8] ) x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 );
  1837. + if( x264_mb_type_list_table[i_mb_type][1][!!i8] ) x264_cabac_mb_mvd( h, cb, 1, 4*i8, 4>>b_8x16, 2<<b_8x16 );
  1838. }
  1839.  
  1840. for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- )
  1841. @@ -1019,9 +1019,7 @@ static void x264_subpartition_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, i
  1842. int b_8x4 = i_pixel == PIXEL_8x4;
  1843. block_residual_write_cabac_cbf( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 0 );
  1844. if( i_pixel == PIXEL_4x4 )
  1845. - {
  1846. x264_cabac_mb_mvd( h, cb, 0, i4, 1, 1 );
  1847. - }
  1848. else
  1849. {
  1850. x264_cabac_mb_mvd( h, cb, 0, i4, 1+b_8x4, 2-b_8x4 );
  1851. diff --git a/encoder/cavlc.c b/encoder/cavlc.c
  1852. index d18408b..45b55fe 100644
  1853. --- a/encoder/cavlc.c
  1854. +++ b/encoder/cavlc.c
  1855. @@ -209,8 +209,7 @@ static void x264_cavlc_mb_qp_delta( x264_t *h )
  1856. int i_dqp = h->mb.i_qp - h->mb.i_last_qp;
  1857.  
  1858. /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely flat background area */
  1859. - if( h->mb.i_type == I_16x16 && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma)
  1860. - && !h->mb.cache.non_zero_count[x264_scan8[24]] )
  1861. + if( h->mb.i_type == I_16x16 && !h->mb.i_cbp_combined )
  1862. {
  1863. #if !RDO_SKIP_BS
  1864. h->mb.i_qp = h->mb.i_last_qp;
  1865. @@ -302,201 +301,209 @@ void x264_macroblock_write_cavlc( x264_t *h )
  1866. bs_write1( s, h->mb.b_interlaced );
  1867. }
  1868.  
  1869. -#if !RDO_SKIP_BS
  1870. - if( i_mb_type == I_PCM )
  1871. - {
  1872. - uint8_t *p_start = s->p_start;
  1873. - bs_write_ue( s, i_mb_i_offset + 25 );
  1874. - i_mb_pos_tex = bs_pos( s );
  1875. - h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
  1876. -
  1877. - bs_align_0( s );
  1878. -
  1879. - memcpy( s->p, h->mb.pic.p_fenc[0], 256 );
  1880. - s->p += 256;
  1881. - for( i = 0; i < 8; i++ )
  1882. - memcpy( s->p + i*8, h->mb.pic.p_fenc[1] + i*FENC_STRIDE, 8 );
  1883. - s->p += 64;
  1884. - for( i = 0; i < 8; i++ )
  1885. - memcpy( s->p + i*8, h->mb.pic.p_fenc[2] + i*FENC_STRIDE, 8 );
  1886. - s->p += 64;
  1887. -
  1888. - bs_init( s, s->p, s->p_end - s->p );
  1889. - s->p_start = p_start;
  1890. -
  1891. - /* if PCM is chosen, we need to store reconstructed frame data */
  1892. - h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE, 16 );
  1893. - h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, 8 );
  1894. - h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, 8 );
  1895. -
  1896. - h->stat.frame.i_tex_bits += bs_pos(s) - i_mb_pos_tex;
  1897. - return;
  1898. - }
  1899. -#endif
  1900. -
  1901. /* Write:
  1902. - type
  1903. - prediction
  1904. - mv */
  1905. - if( i_mb_type == I_4x4 || i_mb_type == I_8x8 )
  1906. + switch( i_mb_type )
  1907. {
  1908. - int di = i_mb_type == I_8x8 ? 4 : 1;
  1909. - bs_write_ue( s, i_mb_i_offset + 0 );
  1910. - if( h->pps->b_transform_8x8_mode )
  1911. - bs_write1( s, h->mb.b_transform_8x8 );
  1912. -
  1913. - /* Prediction: Luma */
  1914. - for( i = 0; i < 16; i += di )
  1915. + case I_4x4:
  1916. + case I_8x8:
  1917. {
  1918. - int i_pred = x264_mb_predict_intra4x4_mode( h, i );
  1919. - int i_mode = x264_mb_pred_mode4x4_fix( h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] );
  1920. + int di = i_mb_type == I_8x8 ? 4 : 1;
  1921. + bs_write_ue( s, i_mb_i_offset + 0 );
  1922. + if( h->pps->b_transform_8x8_mode )
  1923. + bs_write1( s, h->mb.b_transform_8x8 );
  1924.  
  1925. - if( i_pred == i_mode )
  1926. - bs_write1( s, 1 ); /* b_prev_intra4x4_pred_mode */
  1927. - else
  1928. - bs_write( s, 4, i_mode - (i_mode > i_pred) );
  1929. + /* Prediction: Luma */
  1930. + for( i = 0; i < 16; i += di )
  1931. + {
  1932. + int i_pred = x264_mb_predict_intra4x4_mode( h, i );
  1933. + int i_mode = x264_mb_pred_mode4x4_fix( h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] );
  1934. +
  1935. + if( i_pred == i_mode )
  1936. + bs_write1( s, 1 ); /* b_prev_intra4x4_pred_mode */
  1937. + else
  1938. + bs_write( s, 4, i_mode - (i_mode > i_pred) );
  1939. + }
  1940. + bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
  1941. + break;
  1942. + case I_16x16:
  1943. + bs_write_ue( s, i_mb_i_offset + 1 + x264_mb_pred_mode16x16_fix[h->mb.i_intra16x16_pred_mode] +
  1944. + h->mb.i_cbp_chroma * 4 + ( h->mb.i_cbp_luma == 0 ? 0 : 12 ) );
  1945. + bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
  1946. + break;
  1947. }
  1948. - bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
  1949. - }
  1950. - else if( i_mb_type == I_16x16 )
  1951. - {
  1952. - bs_write_ue( s, i_mb_i_offset + 1 + x264_mb_pred_mode16x16_fix[h->mb.i_intra16x16_pred_mode] +
  1953. - h->mb.i_cbp_chroma * 4 + ( h->mb.i_cbp_luma == 0 ? 0 : 12 ) );
  1954. - bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
  1955. - }
  1956. - else if( i_mb_type == P_L0 )
  1957. - {
  1958. - if( h->mb.i_partition == D_16x16 )
  1959. +#if !RDO_SKIP_BS
  1960. + case I_PCM:
  1961. {
  1962. - bs_write1( s, 1 );
  1963. -
  1964. - if( h->mb.pic.i_fref[0] > 1 )
  1965. - bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
  1966. - x264_cavlc_mb_mvd( h, 0, 0, 4 );
  1967. + uint8_t *p_start = s->p_start;
  1968. + bs_write_ue( s, i_mb_i_offset + 25 );
  1969. + i_mb_pos_tex = bs_pos( s );
  1970. + h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
  1971. +
  1972. + bs_align_0( s );
  1973. +
  1974. + memcpy( s->p, h->mb.pic.p_fenc[0], 256 );
  1975. + s->p += 256;
  1976. + for( i = 0; i < 8; i++ )
  1977. + memcpy( s->p + i*8, h->mb.pic.p_fenc[1] + i*FENC_STRIDE, 8 );
  1978. + s->p += 64;
  1979. + for( i = 0; i < 8; i++ )
  1980. + memcpy( s->p + i*8, h->mb.pic.p_fenc[2] + i*FENC_STRIDE, 8 );
  1981. + s->p += 64;
  1982. +
  1983. + bs_init( s, s->p, s->p_end - s->p );
  1984. + s->p_start = p_start;
  1985. +
  1986. + /* if PCM is chosen, we need to store reconstructed frame data */
  1987. + h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE, 16 );
  1988. + h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, 8 );
  1989. + h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, 8 );
  1990. +
  1991. + h->stat.frame.i_tex_bits += bs_pos(s) - i_mb_pos_tex;
  1992. + return;
  1993. }
  1994. - else if( h->mb.i_partition == D_16x8 )
  1995. +#endif
  1996. + case P_L0:
  1997. {
  1998. - bs_write_ue( s, 1 );
  1999. - if( h->mb.pic.i_fref[0] > 1 )
  2000. + if( h->mb.i_partition == D_16x16 )
  2001. {
  2002. - bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
  2003. - bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] );
  2004. + bs_write1( s, 1 );
  2005. +
  2006. + if( h->mb.pic.i_fref[0] > 1 )
  2007. + bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
  2008. + x264_cavlc_mb_mvd( h, 0, 0, 4 );
  2009. }
  2010. - x264_cavlc_mb_mvd( h, 0, 0, 4 );
  2011. - x264_cavlc_mb_mvd( h, 0, 8, 4 );
  2012. + else if( h->mb.i_partition == D_16x8 )
  2013. + {
  2014. + bs_write_ue( s, 1 );
  2015. + if( h->mb.pic.i_fref[0] > 1 )
  2016. + {
  2017. + bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
  2018. + bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] );
  2019. + }
  2020. + x264_cavlc_mb_mvd( h, 0, 0, 4 );
  2021. + x264_cavlc_mb_mvd( h, 0, 8, 4 );
  2022. + }
  2023. + else if( h->mb.i_partition == D_8x16 )
  2024. + {
  2025. + bs_write_ue( s, 2 );
  2026. + if( h->mb.pic.i_fref[0] > 1 )
  2027. + {
  2028. + bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
  2029. + bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[4]] );
  2030. + }
  2031. + x264_cavlc_mb_mvd( h, 0, 0, 2 );
  2032. + x264_cavlc_mb_mvd( h, 0, 4, 2 );
  2033. + }
  2034. + break;
  2035. }
  2036. - else if( h->mb.i_partition == D_8x16 )
  2037. + case P_8x8:
  2038. {
  2039. - bs_write_ue( s, 2 );
  2040. - if( h->mb.pic.i_fref[0] > 1 )
  2041. + int b_sub_ref;
  2042. + if( (h->mb.cache.ref[0][x264_scan8[0]] | h->mb.cache.ref[0][x264_scan8[ 4]] |
  2043. + h->mb.cache.ref[0][x264_scan8[8]] | h->mb.cache.ref[0][x264_scan8[12]]) == 0 )
  2044. + {
  2045. + bs_write_ue( s, 4 );
  2046. + b_sub_ref = 0;
  2047. + }
  2048. + else
  2049. + {
  2050. + bs_write_ue( s, 3 );
  2051. + b_sub_ref = 1;
  2052. + }
  2053. +
  2054. + /* sub mb type */
  2055. + if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
  2056. + for( i = 0; i < 4; i++ )
  2057. + bs_write_ue( s, sub_mb_type_p_to_golomb[ h->mb.i_sub_partition[i] ] );
  2058. + else
  2059. + bs_write( s, 4, 0xf );
  2060. +
  2061. + /* ref0 */
  2062. + if( b_sub_ref )
  2063. {
  2064. bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
  2065. bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[4]] );
  2066. + bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] );
  2067. + bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[12]] );
  2068. }
  2069. - x264_cavlc_mb_mvd( h, 0, 0, 2 );
  2070. - x264_cavlc_mb_mvd( h, 0, 4, 2 );
  2071. - }
  2072. - }
  2073. - else if( i_mb_type == P_8x8 )
  2074. - {
  2075. - int b_sub_ref;
  2076. - if( (h->mb.cache.ref[0][x264_scan8[0]] | h->mb.cache.ref[0][x264_scan8[ 4]] |
  2077. - h->mb.cache.ref[0][x264_scan8[8]] | h->mb.cache.ref[0][x264_scan8[12]]) == 0 )
  2078. - {
  2079. - bs_write_ue( s, 4 );
  2080. - b_sub_ref = 0;
  2081. - }
  2082. - else
  2083. - {
  2084. - bs_write_ue( s, 3 );
  2085. - b_sub_ref = 1;
  2086. - }
  2087.  
  2088. - /* sub mb type */
  2089. - if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
  2090. for( i = 0; i < 4; i++ )
  2091. - bs_write_ue( s, sub_mb_type_p_to_golomb[ h->mb.i_sub_partition[i] ] );
  2092. - else
  2093. - bs_write( s, 4, 0xf );
  2094. -
  2095. - /* ref0 */
  2096. - if( b_sub_ref )
  2097. - {
  2098. - bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
  2099. - bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[4]] );
  2100. - bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] );
  2101. - bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[12]] );
  2102. + x264_cavlc_mb8x8_mvd( h, i );
  2103. + break;
  2104. }
  2105. + case B_8x8:
  2106. + {
  2107. + bs_write_ue( s, 22 );
  2108.  
  2109. - for( i = 0; i < 4; i++ )
  2110. - x264_cavlc_mb8x8_mvd( h, i );
  2111. - }
  2112. - else if( i_mb_type == B_8x8 )
  2113. - {
  2114. - bs_write_ue( s, 22 );
  2115. -
  2116. - /* sub mb type */
  2117. - for( i = 0; i < 4; i++ )
  2118. - bs_write_ue( s, sub_mb_type_b_to_golomb[ h->mb.i_sub_partition[i] ] );
  2119. + /* sub mb type */
  2120. + for( i = 0; i < 4; i++ )
  2121. + bs_write_ue( s, sub_mb_type_b_to_golomb[ h->mb.i_sub_partition[i] ] );
  2122.  
  2123. - /* ref */
  2124. - if( h->mb.pic.i_fref[0] > 1 )
  2125. + /* ref */
  2126. + if( h->mb.pic.i_fref[0] > 1 )
  2127. + for( i = 0; i < 4; i++ )
  2128. + if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
  2129. + bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[i*4]] );
  2130. + if( h->mb.pic.i_fref[1] > 1 )
  2131. + for( i = 0; i < 4; i++ )
  2132. + if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
  2133. + bs_write_te( s, h->mb.pic.i_fref[1] - 1, h->mb.cache.ref[1][x264_scan8[i*4]] );
  2134. +
  2135. + /* mvd */
  2136. for( i = 0; i < 4; i++ )
  2137. if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
  2138. - bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[i*4]] );
  2139. - if( h->mb.pic.i_fref[1] > 1 )
  2140. + x264_cavlc_mb_mvd( h, 0, 4*i, 2 );
  2141. for( i = 0; i < 4; i++ )
  2142. if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
  2143. - bs_write_te( s, h->mb.pic.i_fref[1] - 1, h->mb.cache.ref[1][x264_scan8[i*4]] );
  2144. -
  2145. - /* mvd */
  2146. - for( i = 0; i < 4; i++ )
  2147. - if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
  2148. - x264_cavlc_mb_mvd( h, 0, 4*i, 2 );
  2149. - for( i = 0; i < 4; i++ )
  2150. - if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
  2151. - x264_cavlc_mb_mvd( h, 1, 4*i, 2 );
  2152. - }
  2153. - else if( i_mb_type != B_DIRECT )
  2154. - {
  2155. - /* All B mode */
  2156. - /* Motion Vector */
  2157. - const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type];
  2158. - const int i_ref0_max = h->mb.pic.i_fref[0] - 1;
  2159. - const int i_ref1_max = h->mb.pic.i_fref[1] - 1;
  2160. -
  2161. - bs_write_ue( s, mb_type_b_to_golomb[ h->mb.i_partition - D_16x8 ][ i_mb_type - B_L0_L0 ] );
  2162. - if( h->mb.i_partition == D_16x16 )
  2163. + x264_cavlc_mb_mvd( h, 1, 4*i, 2 );
  2164. + break;
  2165. + }
  2166. + case B_DIRECT:
  2167. {
  2168. - if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[0]] );
  2169. - if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[0]] );
  2170. - if( b_list[0][0] ) x264_cavlc_mb_mvd( h, 0, 0, 4 );
  2171. - if( b_list[1][0] ) x264_cavlc_mb_mvd( h, 1, 0, 4 );
  2172. + bs_write1( s, 1 );
  2173. + break;
  2174. }
  2175. - else
  2176. + default: /* Rest of the B types */
  2177. {
  2178. - if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[ 0]] );
  2179. - if( i_ref0_max && b_list[0][1] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[12]] );
  2180. - if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[ 0]] );
  2181. - if( i_ref1_max && b_list[1][1] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[12]] );
  2182. - if( h->mb.i_partition == D_16x8 )
  2183. + const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type];
  2184. + const int i_ref0_max = h->mb.pic.i_fref[0] - 1;
  2185. + const int i_ref1_max = h->mb.pic.i_fref[1] - 1;
  2186. +
  2187. + bs_write_ue( s, mb_type_b_to_golomb[ h->mb.i_partition - D_16x8 ][ i_mb_type - B_L0_L0 ] );
  2188. + if( h->mb.i_partition == D_16x16 )
  2189. {
  2190. + if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[0]] );
  2191. + if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[0]] );
  2192. if( b_list[0][0] ) x264_cavlc_mb_mvd( h, 0, 0, 4 );
  2193. - if( b_list[0][1] ) x264_cavlc_mb_mvd( h, 0, 8, 4 );
  2194. if( b_list[1][0] ) x264_cavlc_mb_mvd( h, 1, 0, 4 );
  2195. - if( b_list[1][1] ) x264_cavlc_mb_mvd( h, 1, 8, 4 );
  2196. }
  2197. - else //if( h->mb.i_partition == D_8x16 )
  2198. + else
  2199. {
  2200. - if( b_list[0][0] ) x264_cavlc_mb_mvd( h, 0, 0, 2 );
  2201. - if( b_list[0][1] ) x264_cavlc_mb_mvd( h, 0, 4, 2 );
  2202. - if( b_list[1][0] ) x264_cavlc_mb_mvd( h, 1, 0, 2 );
  2203. - if( b_list[1][1] ) x264_cavlc_mb_mvd( h, 1, 4, 2 );
  2204. + if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[ 0]] );
  2205. + if( i_ref0_max && b_list[0][1] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[12]] );
  2206. + if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[ 0]] );
  2207. + if( i_ref1_max && b_list[1][1] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[12]] );
  2208. + if( h->mb.i_partition == D_16x8 )
  2209. + {
  2210. + if( b_list[0][0] ) x264_cavlc_mb_mvd( h, 0, 0, 4 );
  2211. + if( b_list[0][1] ) x264_cavlc_mb_mvd( h, 0, 8, 4 );
  2212. + if( b_list[1][0] ) x264_cavlc_mb_mvd( h, 1, 0, 4 );
  2213. + if( b_list[1][1] ) x264_cavlc_mb_mvd( h, 1, 8, 4 );
  2214. + }
  2215. + else //if( h->mb.i_partition == D_8x16 )
  2216. + {
  2217. + if( b_list[0][0] ) x264_cavlc_mb_mvd( h, 0, 0, 2 );
  2218. + if( b_list[0][1] ) x264_cavlc_mb_mvd( h, 0, 4, 2 );
  2219. + if( b_list[1][0] ) x264_cavlc_mb_mvd( h, 1, 0, 2 );
  2220. + if( b_list[1][1] ) x264_cavlc_mb_mvd( h, 1, 4, 2 );
  2221. + }
  2222. }
  2223. + break;
  2224. }
  2225. }
  2226. - else //if( i_mb_type == B_DIRECT )
  2227. - bs_write1( s, 1 );
  2228.  
  2229. #if !RDO_SKIP_BS
  2230. i_mb_pos_tex = bs_pos( s );
  2231. @@ -505,16 +512,16 @@ void x264_macroblock_write_cavlc( x264_t *h )
  2232.  
  2233. /* Coded block patern */
  2234. if( i_mb_type == I_4x4 || i_mb_type == I_8x8 )
  2235. - bs_write_ue( s, intra4x4_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] );
  2236. + bs_write_ue( s, intra4x4_cbp_to_golomb[h->mb.i_cbp_combined&0x3f] );
  2237. else if( i_mb_type != I_16x16 )
  2238. - bs_write_ue( s, inter_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] );
  2239. + bs_write_ue( s, inter_cbp_to_golomb[h->mb.i_cbp_combined&0x3f] );
  2240.  
  2241. /* transform size 8x8 flag */
  2242. if( x264_mb_transform_8x8_allowed( h ) && h->mb.i_cbp_luma )
  2243. bs_write1( s, h->mb.b_transform_8x8 );
  2244.  
  2245. /* write residual */
  2246. - if( h->mb.i_cbp_luma || h->mb.i_cbp_chroma || i_mb_type == I_16x16 )
  2247. + if( h->mb.i_cbp_combined&0x3f || i_mb_type == I_16x16 )
  2248. {
  2249. x264_cavlc_mb_qp_delta( h );
  2250.  
  2251. @@ -561,24 +568,24 @@ static int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel )
  2252. int b_8x16 = h->mb.i_partition == D_8x16;
  2253. int j;
  2254.  
  2255. - if( i_mb_type == P_8x8 )
  2256. - {
  2257. - x264_cavlc_mb8x8_mvd( h, i8 );
  2258. - bs_write_ue( s, sub_mb_type_p_to_golomb[ h->mb.i_sub_partition[i8] ] );
  2259. - }
  2260. - else if( i_mb_type == P_L0 )
  2261. - x264_cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 );
  2262. - else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 )
  2263. + switch( i_mb_type )
  2264. {
  2265. - if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) x264_cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 );
  2266. - if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) x264_cavlc_mb_mvd( h, 1, 4*i8, 4>>b_8x16 );
  2267. - }
  2268. - else //if( i_mb_type == B_8x8 )
  2269. - {
  2270. - if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] )
  2271. - x264_cavlc_mb_mvd( h, 0, 4*i8, 2 );
  2272. - if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] )
  2273. - x264_cavlc_mb_mvd( h, 1, 4*i8, 2 );
  2274. + case P_L0:
  2275. + x264_cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 );
  2276. + break;
  2277. + case P_8x8:
  2278. + x264_cavlc_mb8x8_mvd( h, i8 );
  2279. + bs_write_ue( s, sub_mb_type_p_to_golomb[ h->mb.i_sub_partition[i8] ] );
  2280. + break;
  2281. + case B_8x8:
  2282. + if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] )
  2283. + x264_cavlc_mb_mvd( h, 0, 4*i8, 2 );
  2284. + if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] )
  2285. + x264_cavlc_mb_mvd( h, 1, 4*i8, 2 );
  2286. + break;
  2287. + default: /* Rest of the B types */
  2288. + if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) x264_cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 );
  2289. + if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) x264_cavlc_mb_mvd( h, 1, 4*i8, 4>>b_8x16 );
  2290. }
  2291.  
  2292. for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- )
  2293. @@ -618,6 +625,8 @@ static int cavlc_intra4x4_pred_size( x264_t *h, int i4, int i_mode )
  2294. static int x264_partition_i8x8_size_cavlc( x264_t *h, int i8, int i_mode )
  2295. {
  2296. h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, 4*i8, i_mode );
  2297. + /* We can't use h->mb.i_cbp_combined here because it's only calculated at the end of
  2298. + * x264_macroblock_encode(), which hasn't been called at this point. */
  2299. bs_write_ue( &h->out.bs, intra4x4_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] );
  2300. x264_macroblock_luma_write_cavlc( h, i8, i8 );
  2301. return h->out.bs.i_bits_encoded;
  2302. diff --git a/encoder/macroblock.c b/encoder/macroblock.c
  2303. index fa7942d..f5f6267 100644
  2304. --- a/encoder/macroblock.c
  2305. +++ b/encoder/macroblock.c
  2306. @@ -488,7 +488,7 @@ static void x264_macroblock_encode_skip( x264_t *h )
  2307. h->mb.i_cbp_chroma = 0x00;
  2308. memset( h->mb.cache.non_zero_count, 0, X264_SCAN8_SIZE );
  2309. /* store cbp */
  2310. - h->mb.cbp[h->mb.i_mb_xy] = 0;
  2311. + h->mb.i_cbp_combined = 0;
  2312. }
  2313.  
  2314. /*****************************************************************************
  2315. @@ -604,7 +604,6 @@ void x264_predict_lossless_16x16( x264_t *h, int i_mode )
  2316. *****************************************************************************/
  2317. void x264_macroblock_encode( x264_t *h )
  2318. {
  2319. - int i_cbp_dc = 0;
  2320. int i_qp = h->mb.i_qp;
  2321. int b_decimate = h->mb.b_dct_decimate;
  2322. int b_force_no_skip = 0;
  2323. @@ -880,34 +879,28 @@ void x264_macroblock_encode( x264_t *h )
  2324. /* encode the 8x8 blocks */
  2325. x264_mb_encode_8x8_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp );
  2326.  
  2327. - if( h->param.b_cabac )
  2328. - {
  2329. - i_cbp_dc = h->mb.cache.non_zero_count[x264_scan8[24]]
  2330. + int i_cbp_dc = h->mb.cache.non_zero_count[x264_scan8[24]]
  2331. | h->mb.cache.non_zero_count[x264_scan8[25]] << 1
  2332. | h->mb.cache.non_zero_count[x264_scan8[26]] << 2;
  2333. - }
  2334.  
  2335. /* store cbp */
  2336. - h->mb.cbp[h->mb.i_mb_xy] = (i_cbp_dc << 8) | (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma;
  2337. + h->mb.i_cbp_combined = (i_cbp_dc << 8) | (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma;
  2338.  
  2339. /* Check for P_SKIP
  2340. * XXX: in the me perhaps we should take x264_mb_predict_mv_pskip into account
  2341. * (if multiple mv give same result)*/
  2342. if( !b_force_no_skip )
  2343. {
  2344. - if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
  2345. - !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) &&
  2346. - M32( h->mb.cache.mv[0][x264_scan8[0]] ) == M32( h->mb.cache.pskip_mv )
  2347. + if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 && !h->mb.i_cbp_combined
  2348. + && M32( h->mb.cache.mv[0][x264_scan8[0]] ) == M32( h->mb.cache.pskip_mv )
  2349. && h->mb.cache.ref[0][x264_scan8[0]] == 0 )
  2350. {
  2351. h->mb.i_type = P_SKIP;
  2352. }
  2353.  
  2354. /* Check for B_SKIP */
  2355. - if( h->mb.i_type == B_DIRECT && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) )
  2356. - {
  2357. + if( h->mb.i_type == B_DIRECT && !h->mb.i_cbp_combined )
  2358. h->mb.i_type = B_SKIP;
  2359. - }
  2360. }
  2361. }
  2362.  
  2363. --
  2364. 1.6.1.2
  2365.  
  2366.  
  2367. From 2e760d47c213cdfe77c652b9d03518043e831615 Mon Sep 17 00:00:00 2001
  2368. From: Yusuke Nakamura <muken.the.vfrmaniac@gmail.com>
  2369. Date: Mon, 8 Feb 2010 01:48:38 -0800
  2370. Subject: [PATCH 10/26] Write PASP atom in mp4 muxing
  2371. Adds container-level aspect ratio support for mp4.
  2372.  
  2373. ---
  2374. output/mp4.c | 3 ++-
  2375. 1 files changed, 2 insertions(+), 1 deletions(-)
  2376.  
  2377. diff --git a/output/mp4.c b/output/mp4.c
  2378. index e3ad9c6..b817c82 100644
  2379. --- a/output/mp4.c
  2380. +++ b/output/mp4.c
  2381. @@ -121,7 +121,7 @@ static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest
  2382. if( mdhd_duration != total_duration )
  2383. {
  2384. uint64_t last_dts = gf_isom_get_sample_dts( p_mp4->p_file, p_mp4->i_track, p_mp4->i_numframe );
  2385. - uint32_t last_duration = (uint32_t)( mdhd_duration > last_dts ? mdhd_duration - last_dts : (largest_pts - second_largest_pts) * p_mp4->i_time_inc );
  2386. + uint32_t last_duration = (uint32_t)( mdhd_duration > last_dts ? mdhd_duration - last_dts : (largest_pts - second_largest_pts) * p_mp4->i_time_inc );
  2387. gf_isom_set_last_sample_duration( p_mp4->p_file, p_mp4->i_track, last_duration );
  2388. total_duration = gf_isom_get_media_duration( p_mp4->p_file, p_mp4->i_track );
  2389. }
  2390. @@ -212,6 +212,7 @@ static int set_param( hnd_t handle, x264_param_t *p_param )
  2391. dw *= sar ;
  2392. else
  2393. dh /= sar;
  2394. + gf_isom_set_pixel_aspect_ratio( p_mp4->p_file, p_mp4->i_track, p_mp4->i_descidx, p_param->vui.i_sar_width, p_param->vui.i_sar_height );
  2395. gf_isom_set_track_layout_info( p_mp4->p_file, p_mp4->i_track, dw, dh, 0, 0, 0 );
  2396. }
  2397.  
  2398. --
  2399. 1.6.1.2
  2400.  
  2401.  
  2402. From 0c1c12eaab8ac05af4962c0e4ebdd24407cf9a13 Mon Sep 17 00:00:00 2001
  2403. From: Henrik Gramner <hengar-6@student.ltu.se>
  2404. Date: Mon, 8 Feb 2010 15:53:52 -0800
  2405. Subject: [PATCH 11/26] Faster 2x2 chroma DC dequant
  2406.  
  2407. ---
  2408. doc/standards.txt | 1 +
  2409. encoder/macroblock.c | 24 +++++++++---------------
  2410. 2 files changed, 10 insertions(+), 15 deletions(-)
  2411.  
  2412. diff --git a/doc/standards.txt b/doc/standards.txt
  2413. index db9a691..7474d8f 100644
  2414. --- a/doc/standards.txt
  2415. +++ b/doc/standards.txt
  2416. @@ -4,6 +4,7 @@ checkasm is written in gcc, with no attempt at compatibility with anything else.
  2417. We make the following additional assumptions which are true of real systems but not guaranteed by C99:
  2418. * Two's complement.
  2419. * Signed right-shifts are sign-extended.
  2420. +* int is 32-bit or larger.
  2421.  
  2422. x86-specific assumptions:
  2423. * The stack is 16-byte aligned. We align it on entry to libx264 and on entry to any thread, but the compiler must preserve alignment after that.
  2424. diff --git a/encoder/macroblock.c b/encoder/macroblock.c
  2425. index f5f6267..3d859de 100644
  2426. --- a/encoder/macroblock.c
  2427. +++ b/encoder/macroblock.c
  2428. @@ -42,30 +42,24 @@ static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[4] )
  2429. int d1 = dct[2] + dct[3]; \
  2430. int d2 = dct[0] - dct[1]; \
  2431. int d3 = dct[2] - dct[3]; \
  2432. - int dmf = dequant_mf[i_qp%6][0]; \
  2433. - int qbits = i_qp/6 - 5; \
  2434. - if( qbits > 0 ) \
  2435. - { \
  2436. - dmf <<= qbits; \
  2437. - qbits = 0; \
  2438. - }
  2439. + int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
  2440.  
  2441. static inline void idct_dequant_2x2_dc( int16_t dct[4], int16_t dct4x4[4][16], int dequant_mf[6][16], int i_qp )
  2442. {
  2443. IDCT_DEQUANT_START
  2444. - dct4x4[0][0] = (d0 + d1) * dmf >> -qbits;
  2445. - dct4x4[1][0] = (d0 - d1) * dmf >> -qbits;
  2446. - dct4x4[2][0] = (d2 + d3) * dmf >> -qbits;
  2447. - dct4x4[3][0] = (d2 - d3) * dmf >> -qbits;
  2448. + dct4x4[0][0] = (d0 + d1) * dmf >> 5;
  2449. + dct4x4[1][0] = (d0 - d1) * dmf >> 5;
  2450. + dct4x4[2][0] = (d2 + d3) * dmf >> 5;
  2451. + dct4x4[3][0] = (d2 - d3) * dmf >> 5;
  2452. }
  2453.  
  2454. static inline void idct_dequant_2x2_dconly( int16_t out[4], int16_t dct[4], int dequant_mf[6][16], int i_qp )
  2455. {
  2456. IDCT_DEQUANT_START
  2457. - out[0] = (d0 + d1) * dmf >> -qbits;
  2458. - out[1] = (d0 - d1) * dmf >> -qbits;
  2459. - out[2] = (d2 + d3) * dmf >> -qbits;
  2460. - out[3] = (d2 - d3) * dmf >> -qbits;
  2461. + out[0] = (d0 + d1) * dmf >> 5;
  2462. + out[1] = (d0 - d1) * dmf >> 5;
  2463. + out[2] = (d2 + d3) * dmf >> 5;
  2464. + out[3] = (d2 - d3) * dmf >> 5;
  2465. }
  2466.  
  2467. static inline void dct2x2dc( int16_t d[4], int16_t dct4x4[4][16] )
  2468. --
  2469. 1.6.1.2
  2470.  
  2471.  
  2472. From d944b740aaa9e07434ff6b022b86460dc27d4b63 Mon Sep 17 00:00:00 2001
  2473. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  2474. Date: Tue, 9 Feb 2010 15:08:31 -0800
  2475. Subject: [PATCH 12/26] Print psy-(rd|trellis) with more precision in userdata SEI
  2476.  
  2477. ---
  2478. common/common.c | 2 +-
  2479. 1 files changed, 1 insertions(+), 1 deletions(-)
  2480.  
  2481. diff --git a/common/common.c b/common/common.c
  2482. index 6d1d7f0..aaccdf2 100644
  2483. --- a/common/common.c
  2484. +++ b/common/common.c
  2485. @@ -886,7 +886,7 @@ char *x264_param2string( x264_param_t *p, int b_res )
  2486. s += sprintf( s, " subme=%d", p->analyse.i_subpel_refine );
  2487. s += sprintf( s, " psy=%d", p->analyse.b_psy );
  2488. if( p->analyse.b_psy )
  2489. - s += sprintf( s, " psy_rd=%.1f:%.1f", p->analyse.f_psy_rd, p->analyse.f_psy_trellis );
  2490. + s += sprintf( s, " psy_rd=%.2f:%.2f", p->analyse.f_psy_rd, p->analyse.f_psy_trellis );
  2491. s += sprintf( s, " mixed_ref=%d", p->analyse.b_mixed_references );
  2492. s += sprintf( s, " me_range=%d", p->analyse.i_me_range );
  2493. s += sprintf( s, " chroma_me=%d", p->analyse.b_chroma_me );
  2494. --
  2495. 1.6.1.2
  2496.  
  2497.  
  2498. From eb0d5bd9a8f5bbd0da6fbc7baf214f78de8b26d7 Mon Sep 17 00:00:00 2001
  2499. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  2500. Date: Wed, 10 Feb 2010 12:12:29 -0800
  2501. Subject: [PATCH 13/26] Overhaul sliced-threads VBV
  2502. Make predictors thread-local and allow each thread to poll the others to get their predicted sizes.
  2503. Many, many other tweaks to improve quality with small VBV and sliced threads.
  2504. Note this may somewhat increase the risk of a VBV underflow in such extreme situations (single-frame VBV).
  2505. This is tolerable, as most relevant use-cases are better off with a few rare underflows (even if they have to drop a slice) than consistent low quality.
  2506.  
  2507. ---
  2508. encoder/encoder.c | 4 +-
  2509. encoder/ratecontrol.c | 163 ++++++++++++++++++++++++++++--------------------
  2510. encoder/slicetype.c | 4 +-
  2511. 3 files changed, 99 insertions(+), 72 deletions(-)
  2512.  
  2513. diff --git a/encoder/encoder.c b/encoder/encoder.c
  2514. index 008d0f2..0ca6694 100644
  2515. --- a/encoder/encoder.c
  2516. +++ b/encoder/encoder.c
  2517. @@ -2062,6 +2062,8 @@ static int x264_threaded_slices_write( x264_t *h )
  2518. for( i = 0; i <= h->sps->i_mb_height; i++ )
  2519. x264_fdec_filter_row( h, i );
  2520.  
  2521. + x264_threads_merge_ratecontrol( h );
  2522. +
  2523. for( i = 1; i < h->param.i_threads; i++ )
  2524. {
  2525. x264_t *t = h->thread[i];
  2526. @@ -2077,8 +2079,6 @@ static int x264_threaded_slices_write( x264_t *h )
  2527. ((int*)&h->stat.frame)[j] += ((int*)&t->stat.frame)[j];
  2528. }
  2529.  
  2530. - x264_threads_merge_ratecontrol( h );
  2531. -
  2532. return 0;
  2533. }
  2534.  
  2535. diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
  2536. index e314ba2..0c946ba 100644
  2537. --- a/encoder/ratecontrol.c
  2538. +++ b/encoder/ratecontrol.c
  2539. @@ -134,9 +134,11 @@ struct x264_ratecontrol_t
  2540. * This value is the current position (0 or 1). */
  2541.  
  2542. /* MBRC stuff */
  2543. - double frame_size_estimated;
  2544. + float frame_size_estimated; /* Access to this variable must be atomic: double is
  2545. + * not atomic on all arches we care about */
  2546. double frame_size_planned;
  2547. double slice_size_planned;
  2548. + double max_frame_error;
  2549. predictor_t (*row_pred)[2];
  2550. predictor_t row_preds[5][2];
  2551. predictor_t *pred_b_from_p; /* predict B-frame size from P-frame satd */
  2552. @@ -505,17 +507,21 @@ int x264_ratecontrol_new( x264_t *h )
  2553.  
  2554. rc->lstep = pow( 2, h->param.rc.i_qp_step / 6.0 );
  2555. rc->last_qscale = qp2qscale(26);
  2556. - CHECKED_MALLOC( rc->pred, 5*sizeof(predictor_t) );
  2557. + int num_preds = h->param.b_sliced_threads * h->param.i_threads + 1;
  2558. + CHECKED_MALLOC( rc->pred, 5 * sizeof(predictor_t) * num_preds );
  2559. CHECKED_MALLOC( rc->pred_b_from_p, sizeof(predictor_t) );
  2560. for( i = 0; i < 5; i++ )
  2561. {
  2562. rc->last_qscale_for[i] = qp2qscale( ABR_INIT_QP );
  2563. rc->lmin[i] = qp2qscale( h->param.rc.i_qp_min );
  2564. rc->lmax[i] = qp2qscale( h->param.rc.i_qp_max );
  2565. - rc->pred[i].coeff= 2.0;
  2566. - rc->pred[i].count= 1.0;
  2567. - rc->pred[i].decay= 0.5;
  2568. - rc->pred[i].offset= 0.0;
  2569. + for( j = 0; j < num_preds; j++ )
  2570. + {
  2571. + rc->pred[i+j*5].coeff= 2.0;
  2572. + rc->pred[i+j*5].count= 1.0;
  2573. + rc->pred[i+j*5].decay= 0.5;
  2574. + rc->pred[i+j*5].offset= 0.0;
  2575. + }
  2576. for( j = 0; j < 2; j++ )
  2577. {
  2578. rc->row_preds[i][j].coeff= .25;
  2579. @@ -986,22 +992,6 @@ void x264_ratecontrol_delete( x264_t *h )
  2580. x264_free( rc );
  2581. }
  2582.  
  2583. -void x264_ratecontrol_set_estimated_size( x264_t *h, int bits )
  2584. -{
  2585. - x264_pthread_mutex_lock( &h->fenc->mutex );
  2586. - h->rc->frame_size_estimated = bits;
  2587. - x264_pthread_mutex_unlock( &h->fenc->mutex );
  2588. -}
  2589. -
  2590. -int x264_ratecontrol_get_estimated_size( x264_t const *h)
  2591. -{
  2592. - int size;
  2593. - x264_pthread_mutex_lock( &h->fenc->mutex );
  2594. - size = h->rc->frame_size_estimated;
  2595. - x264_pthread_mutex_unlock( &h->fenc->mutex );
  2596. - return size;
  2597. -}
  2598. -
  2599. static void accum_p_qp_update( x264_t *h, float qp )
  2600. {
  2601. x264_ratecontrol_t *rc = h->rc;
  2602. @@ -1173,6 +1163,7 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
  2603. /* tweak quality based on difference from predicted size */
  2604. if( y < h->i_threadslice_end-1 )
  2605. {
  2606. + int i;
  2607. int prev_row_qp = h->fdec->i_row_qp[y];
  2608. int i_qp_max = X264_MIN( prev_row_qp + h->param.rc.i_qp_step, h->param.rc.i_qp_max );
  2609. int i_qp_min = X264_MAX( prev_row_qp - h->param.rc.i_qp_step, h->param.rc.i_qp_min );
  2610. @@ -1186,19 +1177,23 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
  2611.  
  2612. float buffer_left_planned = rc->buffer_fill - rc->frame_size_planned;
  2613. float slice_size_planned = h->param.b_sliced_threads ? rc->slice_size_planned : rc->frame_size_planned;
  2614. - float size_of_other_slices = rc->frame_size_planned - slice_size_planned;
  2615. + float size_of_other_slices = 0;
  2616. + if( h->param.b_sliced_threads )
  2617. + {
  2618. + for( i = 0; i < h->param.i_threads; i++ )
  2619. + if( h != h->thread[i] )
  2620. + size_of_other_slices += h->thread[i]->rc->frame_size_estimated;
  2621. + }
  2622. + else
  2623. + rc->max_frame_error = X264_MAX( 0.05, 1.0 / (h->sps->i_mb_width) );
  2624. +
  2625. /* More threads means we have to be more cautious in letting ratecontrol use up extra bits. */
  2626. float rc_tol = buffer_left_planned / h->param.i_threads * rc->rate_tolerance;
  2627. - float max_frame_error = X264_MAX( 0.05, 1.0 / h->sps->i_mb_height );
  2628. - int b1 = predict_row_size_sum( h, y, rc->qpm );
  2629. -
  2630. - /* Assume that if this slice has become larger than expected,
  2631. - * the other slices will have gotten equally larger. */
  2632. - b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
  2633. + int b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
  2634.  
  2635. /* Don't modify the row QPs until a sufficent amount of the bits of the frame have been processed, in case a flat */
  2636. /* area at the top of the frame was measured inaccurately. */
  2637. - if( row_bits_so_far(h,y) < 0.05 * (rc->frame_size_planned-size_of_other_slices) )
  2638. + if( row_bits_so_far( h, y ) < 0.05 * slice_size_planned )
  2639. return;
  2640.  
  2641. if( h->sh.i_type != SLICE_TYPE_I )
  2642. @@ -1213,8 +1208,7 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
  2643. (b1 > rc->frame_size_planned && rc->qpm < rc->qp_novbv)) )
  2644. {
  2645. rc->qpm ++;
  2646. - b1 = predict_row_size_sum( h, y, rc->qpm );
  2647. - b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
  2648. + b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
  2649. }
  2650.  
  2651. while( rc->qpm > i_qp_min
  2652. @@ -1223,20 +1217,18 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
  2653. || b1 < (rc->buffer_fill - rc->buffer_size + rc->buffer_rate) * 1.1) )
  2654. {
  2655. rc->qpm --;
  2656. - b1 = predict_row_size_sum( h, y, rc->qpm );
  2657. - b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
  2658. + b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
  2659. }
  2660.  
  2661. /* avoid VBV underflow */
  2662. while( (rc->qpm < h->param.rc.i_qp_max)
  2663. - && (rc->buffer_fill - b1 < rc->buffer_rate * max_frame_error) )
  2664. + && (rc->buffer_fill - b1 < rc->buffer_rate * rc->max_frame_error) )
  2665. {
  2666. rc->qpm ++;
  2667. - b1 = predict_row_size_sum( h, y, rc->qpm );
  2668. - b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
  2669. + b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
  2670. }
  2671.  
  2672. - x264_ratecontrol_set_estimated_size(h, b1);
  2673. + h->rc->frame_size_estimated = predict_row_size_sum( h, y, rc->qpm );
  2674. }
  2675.  
  2676. /* loses the fractional part of the frame-wise qp */
  2677. @@ -1584,7 +1576,7 @@ static void update_vbv_plan( x264_t *h, int overhead )
  2678. double bits = t->rc->frame_size_planned;
  2679. if( !t->b_thread_active )
  2680. continue;
  2681. - bits = X264_MAX(bits, x264_ratecontrol_get_estimated_size(t));
  2682. + bits = X264_MAX(bits, t->rc->frame_size_estimated);
  2683. rcc->buffer_fill -= bits;
  2684. rcc->buffer_fill = X264_MAX( rcc->buffer_fill, 0 );
  2685. rcc->buffer_fill += rcc->buffer_rate;
  2686. @@ -1783,7 +1775,7 @@ static float rate_estimate_qscale( x264_t *h )
  2687. rcc->frame_size_planned = qscale2bits( &rce, q );
  2688. else
  2689. rcc->frame_size_planned = predict_size( rcc->pred_b_from_p, q, h->fref1[h->i_ref1-1]->i_satd );
  2690. - x264_ratecontrol_set_estimated_size(h, rcc->frame_size_planned);
  2691. + h->rc->frame_size_estimated = rcc->frame_size_planned;
  2692.  
  2693. /* For row SATDs */
  2694. if( rcc->b_vbv )
  2695. @@ -1812,7 +1804,7 @@ static float rate_estimate_qscale( x264_t *h )
  2696. double bits = t->rc->frame_size_planned;
  2697. if( !t->b_thread_active )
  2698. continue;
  2699. - bits = X264_MAX(bits, x264_ratecontrol_get_estimated_size(t));
  2700. + bits = X264_MAX(bits, t->rc->frame_size_estimated);
  2701. predicted_bits += (int64_t)bits;
  2702. }
  2703. }
  2704. @@ -1953,61 +1945,96 @@ static float rate_estimate_qscale( x264_t *h )
  2705. /* Always use up the whole VBV in this case. */
  2706. if( rcc->single_frame_vbv )
  2707. rcc->frame_size_planned = rcc->buffer_rate;
  2708. - x264_ratecontrol_set_estimated_size(h, rcc->frame_size_planned);
  2709. + h->rc->frame_size_estimated = rcc->frame_size_planned;
  2710. return q;
  2711. }
  2712. }
  2713.  
  2714. +void x264_threads_normalize_predictors( x264_t *h )
  2715. +{
  2716. + int i;
  2717. + double totalsize = 0;
  2718. + for( i = 0; i < h->param.i_threads; i++ )
  2719. + totalsize += h->thread[i]->rc->slice_size_planned;
  2720. + double factor = h->rc->frame_size_planned / totalsize;
  2721. + for( i = 0; i < h->param.i_threads; i++ )
  2722. + h->thread[i]->rc->slice_size_planned *= factor;
  2723. +}
  2724. +
  2725. void x264_threads_distribute_ratecontrol( x264_t *h )
  2726. {
  2727. - int i, row, totalsize = 0;
  2728. - if( h->rc->b_vbv )
  2729. - for( row = 0; row < h->sps->i_mb_height; row++ )
  2730. - totalsize += h->fdec->i_row_satd[row];
  2731. + int i, row;
  2732. + x264_ratecontrol_t *rc = h->rc;
  2733. +
  2734. + /* Initialize row predictors */
  2735. + if( h->i_frame == 0 )
  2736. + for( i = 0; i < h->param.i_threads; i++ )
  2737. + {
  2738. + x264_ratecontrol_t *t = h->thread[i]->rc;
  2739. + memcpy( t->row_preds, rc->row_preds, sizeof(rc->row_preds) );
  2740. + }
  2741. +
  2742. for( i = 0; i < h->param.i_threads; i++ )
  2743. {
  2744. x264_t *t = h->thread[i];
  2745. - x264_ratecontrol_t *rc = h->rc;
  2746. - memcpy( t->rc, rc, sizeof(x264_ratecontrol_t) );
  2747. + memcpy( t->rc, rc, offsetof(x264_ratecontrol_t, row_pred) );
  2748. + t->rc->row_pred = &t->rc->row_preds[h->sh.i_type];
  2749. /* Calculate the planned slice size. */
  2750. - if( h->rc->b_vbv && rc->frame_size_planned )
  2751. + if( rc->b_vbv && rc->frame_size_planned )
  2752. {
  2753. int size = 0;
  2754. for( row = t->i_threadslice_start; row < t->i_threadslice_end; row++ )
  2755. size += h->fdec->i_row_satd[row];
  2756. - t->rc->slice_size_planned = size * rc->frame_size_planned / totalsize;
  2757. + t->rc->slice_size_planned = predict_size( &rc->pred[h->sh.i_type + (i+1)*5], rc->qpm, size );
  2758. }
  2759. else
  2760. t->rc->slice_size_planned = 0;
  2761. }
  2762. + if( rc->b_vbv && rc->frame_size_planned )
  2763. + {
  2764. + x264_threads_normalize_predictors( h );
  2765. +
  2766. + if( rc->single_frame_vbv )
  2767. + {
  2768. + /* Compensate for our max frame error threshold: give more bits (proportionally) to smaller slices. */
  2769. + for( i = 0; i < h->param.i_threads; i++ )
  2770. + {
  2771. + x264_t *t = h->thread[i];
  2772. + t->rc->max_frame_error = X264_MAX( 0.05, 1.0 / (t->i_threadslice_end - t->i_threadslice_start) );
  2773. + t->rc->slice_size_planned += 2 * t->rc->max_frame_error * rc->frame_size_planned;
  2774. + }
  2775. + x264_threads_normalize_predictors( h );
  2776. + }
  2777. +
  2778. + for( i = 0; i < h->param.i_threads; i++ )
  2779. + h->thread[i]->rc->frame_size_estimated = h->thread[i]->rc->slice_size_planned;
  2780. + }
  2781. }
  2782.  
  2783. void x264_threads_merge_ratecontrol( x264_t *h )
  2784. {
  2785. - int i, j, k;
  2786. + int i, row;
  2787. x264_ratecontrol_t *rc = h->rc;
  2788. x264_emms();
  2789.  
  2790. - for( i = 1; i < h->param.i_threads; i++ )
  2791. + for( i = 0; i < h->param.i_threads; i++ )
  2792. {
  2793. - x264_ratecontrol_t *t = h->thread[i]->rc;
  2794. - rc->qpa_rc += t->qpa_rc;
  2795. - rc->qpa_aq += t->qpa_aq;
  2796. - for( j = 0; j < 5; j++ )
  2797. - for( k = 0; k < 2; k++ )
  2798. - {
  2799. - rc->row_preds[j][k].coeff += t->row_preds[j][k].coeff;
  2800. - rc->row_preds[j][k].offset += t->row_preds[j][k].offset;
  2801. - rc->row_preds[j][k].count += t->row_preds[j][k].count;
  2802. - }
  2803. - }
  2804. - for( j = 0; j < 5; j++ )
  2805. - for( k = 0; k < 2; k++ )
  2806. + x264_t *t = h->thread[i];
  2807. + x264_ratecontrol_t *rct = h->thread[i]->rc;
  2808. + if( h->param.rc.i_vbv_buffer_size )
  2809. {
  2810. - rc->row_preds[j][k].coeff /= h->param.i_threads;
  2811. - rc->row_preds[j][k].offset /= h->param.i_threads;
  2812. - rc->row_preds[j][k].count /= h->param.i_threads;
  2813. + int size = 0;
  2814. + for( row = t->i_threadslice_start; row < t->i_threadslice_end; row++ )
  2815. + size += h->fdec->i_row_satd[row];
  2816. + int bits = t->stat.frame.i_mv_bits + t->stat.frame.i_tex_bits + t->stat.frame.i_misc_bits;
  2817. + int mb_count = (t->i_threadslice_end - t->i_threadslice_start) * h->sps->i_mb_width;
  2818. + update_predictor( &rc->pred[h->sh.i_type+5*i], qp2qscale(rct->qpa_rc/mb_count), size, bits );
  2819. }
  2820. + if( !i )
  2821. + continue;
  2822. + rc->qpa_rc += rct->qpa_rc;
  2823. + rc->qpa_aq += rct->qpa_aq;
  2824. + }
  2825. }
  2826.  
  2827. void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next )
  2828. diff --git a/encoder/slicetype.c b/encoder/slicetype.c
  2829. index 057f6a6..bb2ed64 100644
  2830. --- a/encoder/slicetype.c
  2831. +++ b/encoder/slicetype.c
  2832. @@ -1394,10 +1394,10 @@ int x264_rc_analyse_slice( x264_t *h )
  2833. int mb_xy = y * h->mb.i_mb_stride;
  2834. for( x = h->fdec->i_pir_start_col; x <= h->fdec->i_pir_end_col; x++, mb_xy++ )
  2835. {
  2836. - int intra_cost = (h->fenc->i_intra_cost[mb_xy] * ip_factor) >> 8;
  2837. + int intra_cost = (h->fenc->i_intra_cost[mb_xy] * ip_factor + 128) >> 8;
  2838. int inter_cost = h->fenc->lowres_costs[b-p0][p1-b][mb_xy];
  2839. int diff = intra_cost - inter_cost;
  2840. - h->fdec->i_row_satd[y] += diff;
  2841. + h->fdec->i_row_satd[y] += (diff * frames[b]->i_inv_qscale_factor[mb_xy] + 128) >> 8;
  2842. cost += diff;
  2843. }
  2844. }
  2845. --
  2846. 1.6.1.2
  2847.  
  2848.  
  2849. From 55cd605a06a1f09925d2707351774f34263ebe3f Mon Sep 17 00:00:00 2001
  2850. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  2851. Date: Wed, 10 Feb 2010 13:44:28 -0800
  2852. Subject: [PATCH 14/26] Allow longer keyints with intra refresh
  2853. If a long keyint is specified (longer than macroblock width-1), a refresh will simply not be in progress on every frame.
  2854. In other words, a refresh will take place, and then x264 will wait until keyint is over to start another refresh.
  2855.  
  2856. ---
  2857. encoder/encoder.c | 15 +++++++--------
  2858. 1 files changed, 7 insertions(+), 8 deletions(-)
  2859.  
  2860. diff --git a/encoder/encoder.c b/encoder/encoder.c
  2861. index 0ca6694..d43a758 100644
  2862. --- a/encoder/encoder.c
  2863. +++ b/encoder/encoder.c
  2864. @@ -599,8 +599,6 @@ static int x264_validate_parameters( x264_t *h )
  2865. x264_log( h, X264_LOG_WARNING, "ref > 1 + intra-refresh is not supported\n" );
  2866. h->param.i_frame_reference = 1;
  2867. }
  2868. - if( h->param.b_intra_refresh )
  2869. - h->param.i_keyint_max = X264_MIN( h->param.i_keyint_max, (h->param.i_width+15)/16 - 1 );
  2870. h->param.i_keyint_min = x264_clip3( h->param.i_keyint_min, 1, h->param.i_keyint_max/2+1 );
  2871. h->param.rc.i_lookahead = x264_clip3( h->param.rc.i_lookahead, 0, X264_LOOKAHEAD_MAX );
  2872. {
  2873. @@ -2307,22 +2305,22 @@ int x264_encoder_encode( x264_t *h,
  2874. if( h->param.b_intra_refresh && h->fenc->i_type == X264_TYPE_P )
  2875. {
  2876. int pocdiff = (h->fdec->i_poc - h->fref0[0]->i_poc)/2;
  2877. - float increment = ((float)h->sps->i_mb_width-1) / h->param.i_keyint_max;
  2878. + float increment = X264_MAX( ((float)h->sps->i_mb_width-1) / h->param.i_keyint_max, 1 );
  2879. + int max_position = (int)(increment * h->param.i_keyint_max);
  2880. if( IS_X264_TYPE_I( h->fref0[0]->i_type ) )
  2881. h->fdec->f_pir_position = 0;
  2882. else
  2883. {
  2884. - if( h->fref0[0]->i_pir_end_col == h->sps->i_mb_width - 1 )
  2885. + h->fdec->f_pir_position = h->fref0[0]->f_pir_position;
  2886. + if( h->fdec->f_pir_position+0.5 >= max_position )
  2887. {
  2888. h->fdec->f_pir_position = 0;
  2889. h->fenc->b_keyframe = 1;
  2890. }
  2891. - else
  2892. - h->fdec->f_pir_position = h->fref0[0]->f_pir_position;
  2893. }
  2894. h->fdec->i_pir_start_col = h->fdec->f_pir_position+0.5;
  2895. h->fdec->f_pir_position += increment * pocdiff;
  2896. - h->fdec->i_pir_end_col = X264_MIN( h->fdec->f_pir_position+0.5, h->sps->i_mb_width-1 );
  2897. + h->fdec->i_pir_end_col = h->fdec->f_pir_position+0.5;
  2898. }
  2899.  
  2900. /* Write SPS and PPS */
  2901. @@ -2358,8 +2356,9 @@ int x264_encoder_encode( x264_t *h,
  2902.  
  2903. if( h->fenc->i_type != X264_TYPE_IDR )
  2904. {
  2905. + int time_to_recovery = X264_MIN( h->sps->i_mb_width - 1, h->param.i_keyint_max ) + h->param.i_bframe;
  2906. x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
  2907. - x264_sei_recovery_point_write( h, &h->out.bs, h->param.i_keyint_max );
  2908. + x264_sei_recovery_point_write( h, &h->out.bs, time_to_recovery );
  2909. x264_nal_end( h );
  2910. overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD;
  2911. }
  2912. --
  2913. 1.6.1.2
  2914.  
  2915.  
  2916. From 2684c8486c7365db25188a70810f663de10428fa Mon Sep 17 00:00:00 2001
  2917. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  2918. Date: Fri, 12 Feb 2010 03:33:54 -0800
  2919. Subject: [PATCH 15/26] Implement direct temporal + interlaced
  2920. This was much easier than I expected.
  2921. It will also be basically useless until TFF/BFF support gets in, since it requires delta_poc_bottom to be set correctly to work well.
  2922.  
  2923. ---
  2924. common/common.h | 5 +++--
  2925. common/macroblock.c | 8 ++++----
  2926. encoder/encoder.c | 5 -----
  2927. 3 files changed, 7 insertions(+), 11 deletions(-)
  2928.  
  2929. diff --git a/common/common.h b/common/common.h
  2930. index d4a8dd9..6da462f 100644
  2931. --- a/common/common.h
  2932. +++ b/common/common.h
  2933. @@ -655,11 +655,12 @@ struct x264_t
  2934. int i_chroma_lambda2_offset;
  2935.  
  2936. /* B_direct and weighted prediction */
  2937. - int16_t dist_scale_factor[16][2];
  2938. + int16_t dist_scale_factor_buf[2][16][2];
  2939. + int16_t (*dist_scale_factor)[2];
  2940. int8_t bipred_weight_buf[2][32][4];
  2941. int8_t (*bipred_weight)[4];
  2942. /* maps fref1[0]'s ref indices into the current list0 */
  2943. -#define map_col_to_list0(col) h->mb.map_col_to_list0[col+2]
  2944. +#define map_col_to_list0(col) h->mb.map_col_to_list0[(col)+2]
  2945. int8_t map_col_to_list0[18];
  2946. int ref_blind_dupe; /* The index of the blind reference frame duplicate. */
  2947. } mb;
  2948. diff --git a/common/macroblock.c b/common/macroblock.c
  2949. index d86f3af..e676b8b 100644
  2950. --- a/common/macroblock.c
  2951. +++ b/common/macroblock.c
  2952. @@ -190,7 +190,8 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
  2953. const int x8 = i8%2;
  2954. const int y8 = i8/2;
  2955. const int i_part_8x8 = i_mb_8x8 + x8 + y8 * h->mb.i_b8_stride;
  2956. - const int i_ref = map_col_to_list0(h->fref1[0]->ref[0][i_part_8x8]);
  2957. + const int i_ref1_ref = h->fref1[0]->ref[0][i_part_8x8];
  2958. + const int i_ref = (map_col_to_list0(i_ref1_ref>>h->sh.b_mbaff) << h->sh.b_mbaff) + (i_ref1_ref&h->sh.b_mbaff);
  2959.  
  2960. if( i_ref >= 0 )
  2961. {
  2962. @@ -1238,6 +1239,7 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
  2963. if( h->sh.i_type == SLICE_TYPE_B )
  2964. {
  2965. h->mb.bipred_weight = h->mb.bipred_weight_buf[h->mb.b_interlaced&(i_mb_y&1)];
  2966. + h->mb.dist_scale_factor = h->mb.dist_scale_factor_buf[h->mb.b_interlaced&(i_mb_y&1)];
  2967. if( h->param.b_cabac )
  2968. {
  2969. uint8_t skipbp;
  2970. @@ -1478,9 +1480,7 @@ void x264_macroblock_bipred_init( x264_t *h )
  2971. dist_scale_factor = x264_clip3( (tb * tx + 32) >> 6, -1024, 1023 );
  2972. }
  2973.  
  2974. - // FIXME: will need this if we ever do temporal MV pred with interlaced
  2975. - if( !h->sh.b_mbaff )
  2976. - h->mb.dist_scale_factor[i_ref0][i_ref1] = dist_scale_factor;
  2977. + h->mb.dist_scale_factor_buf[field][i_ref0][i_ref1] = dist_scale_factor;
  2978.  
  2979. dist_scale_factor >>= 2;
  2980. if( h->param.analyse.b_weighted_bipred
  2981. diff --git a/encoder/encoder.c b/encoder/encoder.c
  2982. index d43a758..9efe88a 100644
  2983. --- a/encoder/encoder.c
  2984. +++ b/encoder/encoder.c
  2985. @@ -430,11 +430,6 @@ static int x264_validate_parameters( x264_t *h )
  2986. x264_log( h, X264_LOG_WARNING, "interlace + me=esa is not implemented\n" );
  2987. h->param.analyse.i_me_method = X264_ME_UMH;
  2988. }
  2989. - if( h->param.analyse.i_direct_mv_pred > X264_DIRECT_PRED_SPATIAL )
  2990. - {
  2991. - x264_log( h, X264_LOG_WARNING, "interlace + direct=temporal is not implemented\n" );
  2992. - h->param.analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL;
  2993. - }
  2994. if( h->param.analyse.i_weighted_pred > 0 )
  2995. {
  2996. x264_log( h, X264_LOG_WARNING, "interlace + weightp is not implemented\n" );
  2997. --
  2998. 1.6.1.2
  2999.  
  3000.  
  3001. From 436109f0c9cba043559f360cc69bae22d4b188f7 Mon Sep 17 00:00:00 2001
  3002. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  3003. Date: Fri, 12 Feb 2010 21:15:12 -0800
  3004. Subject: [PATCH 16/26] Backport various speed tweak ideas from ffmpeg
  3005. Add mv0 early termination to spatial direct calculation
  3006. Up to twice as fast direct mv calculation on near-motionless video.
  3007.  
  3008. Branchless CAVLC level code adjustment based on trailing ones.
  3009. A few clocks faster.
  3010.  
  3011. Check tc value before clipping in C version of deblock functions.
  3012. Much faster, but nobody uses those anyways.
  3013.  
  3014. Thanks to Michael Niedermayer for the ideas.
  3015. ---
  3016. common/frame.c | 6 ++++--
  3017. common/macroblock.c | 3 +++
  3018. encoder/cavlc.c | 7 +++----
  3019. 3 files changed, 10 insertions(+), 6 deletions(-)
  3020.  
  3021. diff --git a/common/frame.c b/common/frame.c
  3022. index 40cc78f..d89f5ab 100644
  3023. --- a/common/frame.c
  3024. +++ b/common/frame.c
  3025. @@ -472,12 +472,14 @@ static inline void deblock_luma_c( uint8_t *pix, int xstride, int ystride, int a
  3026. int delta;
  3027. if( abs( p2 - p0 ) < beta )
  3028. {
  3029. - pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] );
  3030. + if( tc0[i] )
  3031. + pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] );
  3032. tc++;
  3033. }
  3034. if( abs( q2 - q0 ) < beta )
  3035. {
  3036. - pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[i], tc0[i] );
  3037. + if( tc0[i] )
  3038. + pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[i], tc0[i] );
  3039. tc++;
  3040. }
  3041.  
  3042. diff --git a/common/macroblock.c b/common/macroblock.c
  3043. index e676b8b..c9ce597 100644
  3044. --- a/common/macroblock.c
  3045. +++ b/common/macroblock.c
  3046. @@ -272,6 +272,9 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
  3047. x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, mv[0] );
  3048. x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, mv[1] );
  3049.  
  3050. + if( !M64( mv ) )
  3051. + return 1;
  3052. +
  3053. if( h->param.i_threads > 1
  3054. && ( mv[0][1] > h->mb.mv_max_spel[1]
  3055. || mv[1][1] > h->mb.mv_max_spel[1] ) )
  3056. diff --git a/encoder/cavlc.c b/encoder/cavlc.c
  3057. index 45b55fe..12806ae 100644
  3058. --- a/encoder/cavlc.c
  3059. +++ b/encoder/cavlc.c
  3060. @@ -147,10 +147,9 @@ static int block_residual_write_cavlc( x264_t *h, int i_ctxBlockCat, int16_t *l,
  3061.  
  3062. if( i_trailing < i_total )
  3063. {
  3064. - int16_t val = runlevel.level[i_trailing];
  3065. - int16_t val_original = runlevel.level[i_trailing]+LEVEL_TABLE_SIZE/2;
  3066. - if( i_trailing < 3 )
  3067. - val -= (val>>15)|1; /* as runlevel.level[i] can't be 1 for the first one if i_trailing < 3 */
  3068. + int val = runlevel.level[i_trailing];
  3069. + int val_original = runlevel.level[i_trailing]+LEVEL_TABLE_SIZE/2;
  3070. + val -= ((val>>31)|1) & -(i_trailing < 3); /* as runlevel.level[i] can't be 1 for the first one if i_trailing < 3 */
  3071. val += LEVEL_TABLE_SIZE/2;
  3072.  
  3073. if( (unsigned)val_original < LEVEL_TABLE_SIZE )
  3074. --
  3075. 1.6.1.2
  3076.  
  3077.  
  3078. From 88a2153ed4519582b61cc516ca59b2d9559e6725 Mon Sep 17 00:00:00 2001
  3079. From: Alexander Strange <astrange@ithinksw.com>
  3080. Date: Mon, 10 Nov 2008 00:55:20 -0500
  3081. Subject: [PATCH 17/26] Allow | as a separator between psy-rd and psy-trellis values.
  3082. [,:/] are all taken when setting psy-trellis in a zone in an mencoder option.
  3083.  
  3084. Also fix a comment typo and remove a useless line of code.
  3085. ---
  3086. common/common.c | 3 ++-
  3087. encoder/encoder.c | 4 +---
  3088. 2 files changed, 3 insertions(+), 4 deletions(-)
  3089.  
  3090. diff --git a/common/common.c b/common/common.c
  3091. index aaccdf2..0dd7af5 100644
  3092. --- a/common/common.c
  3093. +++ b/common/common.c
  3094. @@ -515,7 +515,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
  3095. OPT("psy-rd")
  3096. {
  3097. if( 2 == sscanf( value, "%f:%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ) ||
  3098. - 2 == sscanf( value, "%f,%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ) )
  3099. + 2 == sscanf( value, "%f,%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ) ||
  3100. + 2 == sscanf( value, "%f|%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ))
  3101. { }
  3102. else if( sscanf( value, "%f", &p->analyse.f_psy_rd ) )
  3103. {
  3104. diff --git a/encoder/encoder.c b/encoder/encoder.c
  3105. index 9efe88a..cca9c45 100644
  3106. --- a/encoder/encoder.c
  3107. +++ b/encoder/encoder.c
  3108. @@ -84,7 +84,7 @@ static void x264_slice_header_init( x264_t *h, x264_slice_header_t *sh,
  3109. x264_param_t *param = &h->param;
  3110. int i;
  3111.  
  3112. - /* First we fill all field */
  3113. + /* First we fill all fields */
  3114. sh->sps = sps;
  3115. sh->pps = pps;
  3116.  
  3117. @@ -685,8 +685,6 @@ static int x264_validate_parameters( x264_t *h )
  3118. /* Psy trellis has a similar effect. */
  3119. if( h->mb.i_psy_trellis )
  3120. h->param.analyse.i_chroma_qp_offset -= h->param.analyse.f_psy_trellis < 0.25 ? 1 : 2;
  3121. - else
  3122. - h->mb.i_psy_trellis = 0;
  3123. h->param.analyse.i_chroma_qp_offset = x264_clip3(h->param.analyse.i_chroma_qp_offset, -12, 12);
  3124. h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 2 );
  3125. h->param.rc.f_aq_strength = x264_clip3f( h->param.rc.f_aq_strength, 0, 3 );
  3126. --
  3127. 1.6.1.2
  3128.  
  3129.  
  3130. From ed7f1d8c708dad428a2706254016095ebf755e8b Mon Sep 17 00:00:00 2001
  3131. From: Alexander Strange <astrange@ithinksw.com>
  3132. Date: Sat, 13 Feb 2010 01:41:41 -0500
  3133. Subject: [PATCH 18/26] mkv: Write SimpleBlock instead of Block for frame headers
  3134.  
  3135. mkvtoolnix writes these by default since 2009/04/13.
  3136. Slightly simplifies muxer and allows 'mkvinfo -s' to show B-frames
  3137. as 'B' (but not B-ref frames).
  3138. ---
  3139. output/matroska.c | 2 +-
  3140. output/matroska_ebml.c | 80 ++++++++----------------------------------------
  3141. output/matroska_ebml.h | 2 +-
  3142. 3 files changed, 15 insertions(+), 69 deletions(-)
  3143.  
  3144. diff --git a/output/matroska.c b/output/matroska.c
  3145. index 8e84f52..db7639c 100644
  3146. --- a/output/matroska.c
  3147. +++ b/output/matroska.c
  3148. @@ -185,7 +185,7 @@ static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_
  3149.  
  3150. p_mkv->b_writing_frame = 0;
  3151.  
  3152. - if( mk_set_frame_flags( p_mkv->w, i_stamp, p_picture->b_keyframe ) < 0 )
  3153. + if( mk_set_frame_flags( p_mkv->w, i_stamp, p_picture->b_keyframe, p_picture->i_type == X264_TYPE_B ) < 0 )
  3154. return -1;
  3155.  
  3156. return i_size;
  3157. diff --git a/output/matroska_ebml.c b/output/matroska_ebml.c
  3158. index d1c6e13..7265909 100644
  3159. --- a/output/matroska_ebml.c
  3160. +++ b/output/matroska_ebml.c
  3161. @@ -53,9 +53,9 @@ struct mk_writer
  3162. int64_t def_duration;
  3163. int64_t timescale;
  3164. int64_t cluster_tc_scaled;
  3165. - int64_t frame_tc, prev_frame_tc_scaled, max_frame_tc;
  3166. + int64_t frame_tc, max_frame_tc;
  3167.  
  3168. - char wrote_header, in_frame, keyframe;
  3169. + char wrote_header, in_frame, keyframe, skippable;
  3170. };
  3171.  
  3172. static mk_context *mk_create_context( mk_writer *w, mk_context *parent, unsigned id )
  3173. @@ -258,23 +258,6 @@ static int mk_write_uint( mk_context *c, unsigned id, int64_t ui )
  3174. return 0;
  3175. }
  3176.  
  3177. -static int mk_write_sint( mk_context *c, unsigned id, int64_t si )
  3178. -{
  3179. - unsigned char c_si[8] = { si >> 56, si >> 48, si >> 40, si >> 32, si >> 24, si >> 16, si >> 8, si };
  3180. - unsigned i = 0;
  3181. -
  3182. - CHECK( mk_write_id( c, id ) );
  3183. - if( si < 0 )
  3184. - while( i < 7 && c_si[i] == 0xff && c_si[i+1] & 0x80 )
  3185. - ++i;
  3186. - else
  3187. - while( i < 7 && c_si[i] == 0 && !(c_si[i+1] & 0x80 ) )
  3188. - ++i;
  3189. - CHECK( mk_write_size( c, 8 - i ) );
  3190. - CHECK( mk_append_context_data( c, c_si+i, 8 - i ) );
  3191. - return 0;
  3192. -}
  3193. -
  3194. static int mk_write_float_raw( mk_context *c, float f )
  3195. {
  3196. union
  3197. @@ -301,34 +284,6 @@ static int mk_write_float( mk_context *c, unsigned id, float f )
  3198. return 0;
  3199. }
  3200.  
  3201. -static unsigned mk_ebml_size_size( unsigned s )
  3202. -{
  3203. - if( s < 0x7f )
  3204. - return 1;
  3205. - if( s < 0x3fff )
  3206. - return 2;
  3207. - if( s < 0x1fffff )
  3208. - return 3;
  3209. - if( s < 0x0fffffff )
  3210. - return 4;
  3211. - return 5;
  3212. -}
  3213. -
  3214. -static unsigned mk_ebml_sint_size( int64_t si )
  3215. -{
  3216. - unsigned char c_si[8] = { si >> 56, si >> 48, si >> 40, si >> 32, si >> 24, si >> 16, si >> 8, si };
  3217. - unsigned i = 0;
  3218. -
  3219. - if( si < 0 )
  3220. - while( i < 7 && c_si[i] == 0xff && c_si[i+1] & 0x80 )
  3221. - ++i;
  3222. - else
  3223. - while( i < 7 && c_si[i] == 0 && !(c_si[i+1] & 0x80) )
  3224. - ++i;
  3225. -
  3226. - return 8 - i;
  3227. -}
  3228. -
  3229. mk_writer *mk_create_writer( const char *filename )
  3230. {
  3231. mk_writer *w = malloc( sizeof(*w) );
  3232. @@ -446,8 +401,8 @@ static int mk_close_cluster( mk_writer *w )
  3233.  
  3234. static int mk_flush_frame( mk_writer *w )
  3235. {
  3236. - int64_t delta, ref = 0;
  3237. - unsigned fsize, bgsize;
  3238. + int64_t delta;
  3239. + unsigned fsize;
  3240. unsigned char c_delta_flags[3];
  3241.  
  3242. if( !w->in_frame )
  3243. @@ -470,33 +425,22 @@ static int mk_flush_frame( mk_writer *w )
  3244. }
  3245.  
  3246. fsize = w->frame ? w->frame->d_cur : 0;
  3247. - bgsize = fsize + 4 + mk_ebml_size_size( fsize + 4 ) + 1;
  3248. - if( !w->keyframe )
  3249. - {
  3250. - ref = w->prev_frame_tc_scaled - w->cluster_tc_scaled - delta;
  3251. - bgsize += 1 + 1 + mk_ebml_sint_size( ref );
  3252. - }
  3253.  
  3254. - CHECK( mk_write_id( w->cluster, 0xa0 ) ); // BlockGroup
  3255. - CHECK( mk_write_size( w->cluster, bgsize ) );
  3256. - CHECK( mk_write_id( w->cluster, 0xa1 ) ); // Block
  3257. + CHECK( mk_write_id( w->cluster, 0xa3 ) ); // SimpleBlock
  3258. CHECK( mk_write_size( w->cluster, fsize + 4 ) );
  3259. CHECK( mk_write_size( w->cluster, 1 ) ); // track number
  3260.  
  3261. c_delta_flags[0] = delta >> 8;
  3262. c_delta_flags[1] = delta;
  3263. - c_delta_flags[2] = 0;
  3264. + c_delta_flags[2] = (w->keyframe << 7) | w->skippable;
  3265. CHECK( mk_append_context_data( w->cluster, c_delta_flags, 3 ) );
  3266. if( w->frame )
  3267. {
  3268. CHECK( mk_append_context_data( w->cluster, w->frame->data, w->frame->d_cur ) );
  3269. w->frame->d_cur = 0;
  3270. }
  3271. - if( !w->keyframe )
  3272. - CHECK( mk_write_sint( w->cluster, 0xfb, ref ) ); // ReferenceBlock
  3273.  
  3274. w->in_frame = 0;
  3275. - w->prev_frame_tc_scaled = w->cluster_tc_scaled + delta;
  3276.  
  3277. if( w->cluster->d_cur > CLSIZE )
  3278. CHECK( mk_close_cluster( w ) );
  3279. @@ -509,19 +453,21 @@ int mk_start_frame( mk_writer *w )
  3280. if( mk_flush_frame( w ) < 0 )
  3281. return -1;
  3282.  
  3283. - w->in_frame = 1;
  3284. - w->keyframe = 0;
  3285. + w->in_frame = 1;
  3286. + w->keyframe = 0;
  3287. + w->skippable = 0;
  3288.  
  3289. return 0;
  3290. }
  3291.  
  3292. -int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe )
  3293. +int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe, int skippable )
  3294. {
  3295. if( !w->in_frame )
  3296. return -1;
  3297.  
  3298. - w->frame_tc = timestamp;
  3299. - w->keyframe = keyframe != 0;
  3300. + w->frame_tc = timestamp;
  3301. + w->keyframe = keyframe != 0;
  3302. + w->skippable = skippable != 0;
  3303.  
  3304. if( w->max_frame_tc < timestamp )
  3305. w->max_frame_tc = timestamp;
  3306. diff --git a/output/matroska_ebml.h b/output/matroska_ebml.h
  3307. index 252e781..56eb8cc 100644
  3308. --- a/output/matroska_ebml.h
  3309. +++ b/output/matroska_ebml.h
  3310. @@ -35,7 +35,7 @@ int mk_writeHeader( mk_writer *w, const char *writing_app,
  3311.  
  3312. int mk_start_frame( mk_writer *w );
  3313. int mk_add_frame_data( mk_writer *w, const void *data, unsigned size );
  3314. -int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe );
  3315. +int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe, int skippable );
  3316. int mk_close( mk_writer *w, int64_t last_delta );
  3317.  
  3318. #endif
  3319. --
  3320. 1.6.1.2
  3321.  
  3322.  
  3323. From 6c236ef44883e926c00f75f961a7423a8aa56036 Mon Sep 17 00:00:00 2001
  3324. From: Alexander Strange <astrange@ithinksw.com>
  3325. Date: Sat, 13 Feb 2010 02:00:57 -0500
  3326. Subject: [PATCH 19/26] mkv: Write the x264 version into the file header
  3327.  
  3328. This only updates the "writing application"; matroska_ebml.c is the
  3329. "muxing application", but the version string for that is still hardcoded.
  3330. ---
  3331. output/matroska.c | 2 +-
  3332. 1 files changed, 1 insertions(+), 1 deletions(-)
  3333.  
  3334. diff --git a/output/matroska.c b/output/matroska.c
  3335. index db7639c..b1805e4 100644
  3336. --- a/output/matroska.c
  3337. +++ b/output/matroska.c
  3338. @@ -146,7 +146,7 @@ static int write_headers( hnd_t handle, x264_nal_t *p_nal )
  3339.  
  3340. memcpy( avcC+11+sps_size, pps, pps_size );
  3341.  
  3342. - ret = mk_writeHeader( p_mkv->w, "x264", "V_MPEG4/ISO/AVC",
  3343. + ret = mk_writeHeader( p_mkv->w, "x264" X264_VERSION, "V_MPEG4/ISO/AVC",
  3344. avcC, avcC_len, p_mkv->frame_duration, 50000,
  3345. p_mkv->width, p_mkv->height,
  3346. p_mkv->d_width, p_mkv->d_height );
  3347. --
  3348. 1.6.1.2
  3349.  
  3350.  
  3351. From 04b8ec5fa470d1132114ffcc09494050c6c5751e Mon Sep 17 00:00:00 2001
  3352. From: Alexander Strange <astrange@ithinksw.com>
  3353. Date: Sat, 13 Feb 2010 02:22:04 -0500
  3354. Subject: [PATCH 20/26] Mark cli_input/output_t variables as const when possible
  3355.  
  3356. ---
  3357. input/avs.c | 2 +-
  3358. input/ffms.c | 2 +-
  3359. input/input.h | 10 +++++-----
  3360. input/lavf.c | 2 +-
  3361. input/y4m.c | 2 +-
  3362. input/yuv.c | 2 +-
  3363. output/flv.c | 2 +-
  3364. output/matroska.c | 2 +-
  3365. output/mp4.c | 2 +-
  3366. output/output.h | 8 ++++----
  3367. output/raw.c | 2 +-
  3368. 11 files changed, 18 insertions(+), 18 deletions(-)
  3369.  
  3370. diff --git a/input/avs.c b/input/avs.c
  3371. index 522f8fe..79b5c80 100644
  3372. --- a/input/avs.c
  3373. +++ b/input/avs.c
  3374. @@ -313,4 +313,4 @@ static int close_file( hnd_t handle )
  3375. return 0;
  3376. }
  3377.  
  3378. -cli_input_t avs_input = { open_file, get_frame_total, picture_alloc, read_frame, release_frame, picture_clean, close_file };
  3379. +const cli_input_t avs_input = { open_file, get_frame_total, picture_alloc, read_frame, release_frame, picture_clean, close_file };
  3380. diff --git a/input/ffms.c b/input/ffms.c
  3381. index b680967..14962c7 100644
  3382. --- a/input/ffms.c
  3383. +++ b/input/ffms.c
  3384. @@ -244,4 +244,4 @@ static int close_file( hnd_t handle )
  3385. return 0;
  3386. }
  3387.  
  3388. -cli_input_t ffms_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
  3389. +const cli_input_t ffms_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
  3390. diff --git a/input/input.h b/input/input.h
  3391. index 9fb425c..6e386f4 100644
  3392. --- a/input/input.h
  3393. +++ b/input/input.h
  3394. @@ -60,11 +60,11 @@ typedef struct
  3395. int (*close_file)( hnd_t handle );
  3396. } cli_input_t;
  3397.  
  3398. -extern cli_input_t yuv_input;
  3399. -extern cli_input_t y4m_input;
  3400. -extern cli_input_t avs_input;
  3401. +extern const cli_input_t yuv_input;
  3402. +extern const cli_input_t y4m_input;
  3403. +extern const cli_input_t avs_input;
  3404. extern cli_input_t thread_input;
  3405. -extern cli_input_t lavf_input;
  3406. -extern cli_input_t ffms_input;
  3407. +extern const cli_input_t lavf_input;
  3408. +extern const cli_input_t ffms_input;
  3409.  
  3410. #endif
  3411. diff --git a/input/lavf.c b/input/lavf.c
  3412. index 180e509..6ecc6b0 100644
  3413. --- a/input/lavf.c
  3414. +++ b/input/lavf.c
  3415. @@ -269,4 +269,4 @@ static int close_file( hnd_t handle )
  3416. return 0;
  3417. }
  3418.  
  3419. -cli_input_t lavf_input = { open_file, get_frame_total, picture_alloc, read_frame, NULL, picture_clean, close_file };
  3420. +const cli_input_t lavf_input = { open_file, get_frame_total, picture_alloc, read_frame, NULL, picture_clean, close_file };
  3421. diff --git a/input/y4m.c b/input/y4m.c
  3422. index 1619f74..8645ff7 100644
  3423. --- a/input/y4m.c
  3424. +++ b/input/y4m.c
  3425. @@ -242,4 +242,4 @@ static int close_file( hnd_t handle )
  3426. return 0;
  3427. }
  3428.  
  3429. -cli_input_t y4m_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
  3430. +const cli_input_t y4m_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
  3431. diff --git a/input/yuv.c b/input/yuv.c
  3432. index dbd0317..3e39e07 100644
  3433. --- a/input/yuv.c
  3434. +++ b/input/yuv.c
  3435. @@ -125,4 +125,4 @@ static int close_file( hnd_t handle )
  3436. return 0;
  3437. }
  3438.  
  3439. -cli_input_t yuv_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
  3440. +const cli_input_t yuv_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
  3441. diff --git a/output/flv.c b/output/flv.c
  3442. index b3e5d16..2e0a0e4 100644
  3443. --- a/output/flv.c
  3444. +++ b/output/flv.c
  3445. @@ -305,4 +305,4 @@ static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest
  3446. return 0;
  3447. }
  3448.  
  3449. -cli_output_t flv_output = { open_file, set_param, write_headers, write_frame, close_file };
  3450. +const cli_output_t flv_output = { open_file, set_param, write_headers, write_frame, close_file };
  3451. diff --git a/output/matroska.c b/output/matroska.c
  3452. index b1805e4..fb39ced 100644
  3453. --- a/output/matroska.c
  3454. +++ b/output/matroska.c
  3455. @@ -206,4 +206,4 @@ static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest
  3456. return ret;
  3457. }
  3458.  
  3459. -cli_output_t mkv_output = { open_file, set_param, write_headers, write_frame, close_file };
  3460. +const cli_output_t mkv_output = { open_file, set_param, write_headers, write_frame, close_file };
  3461. diff --git a/output/mp4.c b/output/mp4.c
  3462. index b817c82..b99eaed 100644
  3463. --- a/output/mp4.c
  3464. +++ b/output/mp4.c
  3465. @@ -298,4 +298,4 @@ static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_
  3466. return i_size;
  3467. }
  3468.  
  3469. -cli_output_t mp4_output = { open_file, set_param, write_headers, write_frame, close_file };
  3470. +const cli_output_t mp4_output = { open_file, set_param, write_headers, write_frame, close_file };
  3471. diff --git a/output/output.h b/output/output.h
  3472. index 851b819..c79b48e 100644
  3473. --- a/output/output.h
  3474. +++ b/output/output.h
  3475. @@ -33,9 +33,9 @@ typedef struct
  3476. int (*close_file)( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts );
  3477. } cli_output_t;
  3478.  
  3479. -extern cli_output_t raw_output;
  3480. -extern cli_output_t mkv_output;
  3481. -extern cli_output_t mp4_output;
  3482. -extern cli_output_t flv_output;
  3483. +extern const cli_output_t raw_output;
  3484. +extern const cli_output_t mkv_output;
  3485. +extern const cli_output_t mp4_output;
  3486. +extern const cli_output_t flv_output;
  3487.  
  3488. #endif
  3489. diff --git a/output/raw.c b/output/raw.c
  3490. index a4d1175..02e4c56 100644
  3491. --- a/output/raw.c
  3492. +++ b/output/raw.c
  3493. @@ -62,5 +62,5 @@ static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest
  3494. return fclose( (FILE*)handle );
  3495. }
  3496.  
  3497. -cli_output_t raw_output = { open_file, set_param, write_headers, write_frame, close_file };
  3498. +const cli_output_t raw_output = { open_file, set_param, write_headers, write_frame, close_file };
  3499.  
  3500. --
  3501. 1.6.1.2
  3502.  
  3503.  
  3504. From f3dad80b901593c9d504930cd610650c8d8ff104 Mon Sep 17 00:00:00 2001
  3505. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  3506. Date: Sat, 13 Feb 2010 00:52:31 -0800
  3507. Subject: [PATCH 21/26] Make the ABR buffer consider the distance to the end of the video
  3508. Should improve bitrate accuracy in 2-pass mode.
  3509. May also slightly improve quality by allowing more variation earlier on in a file.
  3510.  
  3511. Also fix abr_buffer with 1-pass: there it does something very different from what it does in 2-pass.
  3512. Thus, the earlier change that increased it based on threads caused 1-pass ABR to be somewhat less accurate.
  3513. ---
  3514. encoder/ratecontrol.c | 6 ++++--
  3515. 1 files changed, 4 insertions(+), 2 deletions(-)
  3516.  
  3517. diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
  3518. index 0c946ba..8c61582 100644
  3519. --- a/encoder/ratecontrol.c
  3520. +++ b/encoder/ratecontrol.c
  3521. @@ -1784,13 +1784,15 @@ static float rate_estimate_qscale( x264_t *h )
  3522. }
  3523. else
  3524. {
  3525. - double abr_buffer = 2 * rcc->rate_tolerance * rcc->bitrate * h->i_thread_frames;
  3526. + double abr_buffer = 2 * rcc->rate_tolerance * rcc->bitrate;
  3527.  
  3528. if( rcc->b_2pass )
  3529. {
  3530. - //FIXME adjust abr_buffer based on distance to the end of the video
  3531. int64_t diff;
  3532. int64_t predicted_bits = total_bits;
  3533. + /* Adjust ABR buffer based on distance to the end of the video. */
  3534. + if( rcc->num_entries > h->fenc->i_frame )
  3535. + abr_buffer *= 0.5 * sqrt( rcc->num_entries - h->fenc->i_frame );
  3536.  
  3537. if( rcc->b_vbv )
  3538. {
  3539. --
  3540. 1.6.1.2
  3541.  
  3542.  
  3543. From 2fd2dfe22704ce1de0cb8811484de6c2c2c7ea64 Mon Sep 17 00:00:00 2001
  3544. From: David Conrad <lessen42@gmail.com>
  3545. Date: Sat, 13 Feb 2010 01:25:56 -0800
  3546. Subject: [PATCH 22/26] Use #ifdef instead of #if in checkasm
  3547.  
  3548. ---
  3549. tools/checkasm.c | 4 ++--
  3550. 1 files changed, 2 insertions(+), 2 deletions(-)
  3551.  
  3552. diff --git a/tools/checkasm.c b/tools/checkasm.c
  3553. index 0bedc5b..595bd9e 100644
  3554. --- a/tools/checkasm.c
  3555. +++ b/tools/checkasm.c
  3556. @@ -1662,13 +1662,13 @@ static int check_all_flags( void )
  3557. cpu1 &= ~X264_CPU_CACHELINE_64;
  3558. ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
  3559. }
  3560. -#elif ARCH_PPC
  3561. +#elif defined(ARCH_PPC)
  3562. if( x264_cpu_detect() & X264_CPU_ALTIVEC )
  3563. {
  3564. fprintf( stderr, "x264: ALTIVEC against C\n" );
  3565. ret = check_all_funcs( 0, X264_CPU_ALTIVEC );
  3566. }
  3567. -#elif ARCH_ARM
  3568. +#elif defined(ARCH_ARM)
  3569. if( x264_cpu_detect() & X264_CPU_ARMV6 )
  3570. ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV6, "ARMv6" );
  3571. if( x264_cpu_detect() & X264_CPU_NEON )
  3572. --
  3573. 1.6.1.2
  3574.  
  3575.  
  3576. From 15132ee0c913cdca90598c14f8a7532579603721 Mon Sep 17 00:00:00 2001
  3577. From: David Conrad <lessen42@gmail.com>
  3578. Date: Fri, 8 Jan 2010 22:40:09 -0500
  3579. Subject: [PATCH 23/26] ARM NEON versions of weightp functions
  3580.  
  3581. ---
  3582. common/arm/mc-a.S | 305 +++++++++++++++++++++++++++++++++++++++++++++++++++++
  3583. common/arm/mc-c.c | 47 ++++++++
  3584. 2 files changed, 352 insertions(+), 0 deletions(-)
  3585.  
  3586. diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
  3587. index a62af39..e1db404 100644
  3588. --- a/common/arm/mc-a.S
  3589. +++ b/common/arm/mc-a.S
  3590. @@ -432,6 +432,311 @@ avg2_w20_loop:
  3591. .endfunc
  3592.  
  3593.  
  3594. +.macro weight_prologue type
  3595. + push {r4-r5,lr}
  3596. + ldr r4, [sp, #4*3] // weight_t
  3597. + ldr ip, [sp, #4*3+4] // h
  3598. +.ifc \type, full
  3599. + ldr lr, [r4, #32] // denom
  3600. +.endif
  3601. + ldrd r4, [r4, #32+4] // scale, offset
  3602. + vdup.16 q0, r4
  3603. + vdup.16 q1, r5
  3604. +.ifc \type, full
  3605. + rsb lr, lr, #0
  3606. + vdup.16 q2, lr
  3607. +.endif
  3608. +.endm
  3609. +
  3610. +// void mc_weight( uint8_t *dst, int dst_stride, uint8_t *src, int src_stride,
  3611. +// const x264_weight_t *weight, int height )
  3612. +function x264_mc_weight_w20_neon
  3613. + weight_prologue full
  3614. + sub r1, #16
  3615. +weight20_loop:
  3616. + subs ip, #2
  3617. + vld1.8 {d17-d19}, [r2], r3
  3618. + vmovl.u8 q10, d17
  3619. + vmovl.u8 q11, d18
  3620. + vmovl.u8 q14, d19
  3621. + vld1.8 {d16-d18}, [r2], r3
  3622. + vmovl.u8 q12, d16
  3623. + vmovl.u8 q13, d17
  3624. + vmovl.u8 q15, d18
  3625. + vmul.s16 q10, q10, q0
  3626. + vmul.s16 q11, q11, q0
  3627. + vmul.s16 q12, q12, q0
  3628. + vmul.s16 q13, q13, q0
  3629. + vmul.s16 d28, d28, d0
  3630. + vmul.s16 d29, d30, d0
  3631. + vrshl.s16 q10, q10, q2
  3632. + vrshl.s16 q11, q11, q2
  3633. + vrshl.s16 q12, q12, q2
  3634. + vrshl.s16 q13, q13, q2
  3635. + vrshl.s16 q14, q14, q2
  3636. + vadd.s16 q10, q10, q1
  3637. + vadd.s16 q11, q11, q1
  3638. + vadd.s16 q12, q12, q1
  3639. + vadd.s16 q13, q13, q1
  3640. + vadd.s16 q14, q14, q1
  3641. + vqmovun.s16 d16, q10
  3642. + vqmovun.s16 d17, q11
  3643. + vqmovun.s16 d18, q12
  3644. + vqmovun.s16 d19, q13
  3645. + vqmovun.s16 d20, q14
  3646. + vst1.8 {d16-d17}, [r0,:128]!
  3647. + vst1.32 {d20[0]}, [r0,:32], r1
  3648. + vst1.8 {d18-d19}, [r0,:128]!
  3649. + vst1.32 {d20[1]}, [r0,:32], r1
  3650. + bgt weight20_loop
  3651. + pop {r4-r5,pc}
  3652. +.endfunc
  3653. +
  3654. +function x264_mc_weight_w16_neon
  3655. + weight_prologue full
  3656. +weight16_loop:
  3657. + subs ip, #2
  3658. + vld1.8 {d16-d17}, [r2], r3
  3659. + vld1.8 {d18-d19}, [r2], r3
  3660. + vmovl.u8 q10, d16
  3661. + vmovl.u8 q11, d17
  3662. + vmovl.u8 q12, d18
  3663. + vmovl.u8 q13, d19
  3664. + vmul.s16 q10, q10, q0
  3665. + vmul.s16 q11, q11, q0
  3666. + vmul.s16 q12, q12, q0
  3667. + vmul.s16 q13, q13, q0
  3668. + vrshl.s16 q10, q10, q2
  3669. + vrshl.s16 q11, q11, q2
  3670. + vrshl.s16 q12, q12, q2
  3671. + vrshl.s16 q13, q13, q2
  3672. + vadd.s16 q10, q10, q1
  3673. + vadd.s16 q11, q11, q1
  3674. + vadd.s16 q12, q12, q1
  3675. + vadd.s16 q13, q13, q1
  3676. + vqmovun.s16 d16, q10
  3677. + vqmovun.s16 d17, q11
  3678. + vqmovun.s16 d18, q12
  3679. + vqmovun.s16 d19, q13
  3680. + vst1.8 {d16-d17}, [r0,:128], r1
  3681. + vst1.8 {d18-d19}, [r0,:128], r1
  3682. + bgt weight16_loop
  3683. + pop {r4-r5,pc}
  3684. +.endfunc
  3685. +
  3686. +function x264_mc_weight_w8_neon
  3687. + weight_prologue full
  3688. +weight8_loop:
  3689. + subs ip, #2
  3690. + vld1.8 {d16}, [r2], r3
  3691. + vld1.8 {d18}, [r2], r3
  3692. + vmovl.u8 q8, d16
  3693. + vmovl.u8 q9, d18
  3694. + vmul.s16 q8, q8, q0
  3695. + vmul.s16 q9, q9, q0
  3696. + vrshl.s16 q8, q8, q2
  3697. + vrshl.s16 q9, q9, q2
  3698. + vadd.s16 q8, q8, q1
  3699. + vadd.s16 q9, q9, q1
  3700. + vqmovun.s16 d16, q8
  3701. + vqmovun.s16 d18, q9
  3702. + vst1.8 {d16}, [r0,:64], r1
  3703. + vst1.8 {d18}, [r0,:64], r1
  3704. + bgt weight8_loop
  3705. + pop {r4-r5,pc}
  3706. +.endfunc
  3707. +
  3708. +function x264_mc_weight_w4_neon
  3709. + weight_prologue full
  3710. +weight4_loop:
  3711. + subs ip, #2
  3712. + vld1.32 {d16[]}, [r2], r3
  3713. + vld1.32 {d18[]}, [r2], r3
  3714. + vmovl.u8 q8, d16
  3715. + vmovl.u8 q9, d18
  3716. + vmul.s16 d16, d16, d0
  3717. + vmul.s16 d17, d18, d0
  3718. + vrshl.s16 q8, q8, q2
  3719. + vadd.s16 q8, q8, q1
  3720. + vqmovun.s16 d16, q8
  3721. + vst1.32 {d16[0]}, [r0,:32], r1
  3722. + vst1.32 {d16[1]}, [r0,:32], r1
  3723. + bgt weight4_loop
  3724. + pop {r4-r5,pc}
  3725. +.endfunc
  3726. +
  3727. +function x264_mc_weight_w20_nodenom_neon
  3728. + weight_prologue nodenom
  3729. + sub r1, #16
  3730. +weight20_nodenom_loop:
  3731. + subs ip, #2
  3732. + vld1.8 {d17-d19}, [r2], r3
  3733. + vmovl.u8 q10, d17
  3734. + vmovl.u8 q11, d18
  3735. + vmovl.u8 q14, d19
  3736. + vld1.8 {d16-d18}, [r2], r3
  3737. + vmovl.u8 q12, d16
  3738. + vmovl.u8 q13, d17
  3739. + vmovl.u8 q15, d18
  3740. + vmov q8, q1
  3741. + vmov q9, q1
  3742. + vmla.s16 q8, q10, q0
  3743. + vmla.s16 q9, q11, q0
  3744. + vmov q10, q1
  3745. + vmov q11, q1
  3746. + vmla.s16 q10, q12, q0
  3747. + vmla.s16 q11, q13, q0
  3748. + vmov q12, q1
  3749. + vmla.s16 d24, d28, d0
  3750. + vmla.s16 d25, d30, d0
  3751. + vqmovun.s16 d16, q8
  3752. + vqmovun.s16 d17, q9
  3753. + vqmovun.s16 d18, q10
  3754. + vqmovun.s16 d19, q11
  3755. + vqmovun.s16 d20, q12
  3756. + vst1.8 {d16-d17}, [r0,:128]!
  3757. + vst1.32 {d20[0]}, [r0,:32], r1
  3758. + vst1.8 {d18-d19}, [r0,:128]!
  3759. + vst1.32 {d20[1]}, [r0,:32], r1
  3760. + bgt weight20_nodenom_loop
  3761. + pop {r4-r5,pc}
  3762. +.endfunc
  3763. +
  3764. +function x264_mc_weight_w16_nodenom_neon
  3765. + weight_prologue nodenom
  3766. +weight16_nodenom_loop:
  3767. + subs ip, #2
  3768. + vld1.8 {d16-d17}, [r2], r3
  3769. + vld1.8 {d18-d19}, [r2], r3
  3770. + vmovl.u8 q12, d16
  3771. + vmovl.u8 q13, d17
  3772. + vmovl.u8 q14, d18
  3773. + vmovl.u8 q15, d19
  3774. + vmov q8, q1
  3775. + vmov q9, q1
  3776. + vmov q10, q1
  3777. + vmov q11, q1
  3778. + vmla.s16 q8, q12, q0
  3779. + vmla.s16 q9, q13, q0
  3780. + vmla.s16 q10, q14, q0
  3781. + vmla.s16 q11, q15, q0
  3782. + vqmovun.s16 d16, q8
  3783. + vqmovun.s16 d17, q9
  3784. + vqmovun.s16 d18, q10
  3785. + vqmovun.s16 d19, q11
  3786. + vst1.8 {d16-d17}, [r0,:128], r1
  3787. + vst1.8 {d18-d19}, [r0,:128], r1
  3788. + bgt weight16_nodenom_loop
  3789. + pop {r4-r5,pc}
  3790. +.endfunc
  3791. +
  3792. +function x264_mc_weight_w8_nodenom_neon
  3793. + weight_prologue nodenom
  3794. +weight8_nodenom_loop:
  3795. + subs ip, #2
  3796. + vld1.8 {d16}, [r2], r3
  3797. + vld1.8 {d18}, [r2], r3
  3798. + vmovl.u8 q8, d16
  3799. + vmovl.u8 q9, d18
  3800. + vmov q10, q1
  3801. + vmov q11, q1
  3802. + vmla.s16 q10, q8, q0
  3803. + vmla.s16 q11, q9, q0
  3804. + vqmovun.s16 d16, q10
  3805. + vqmovun.s16 d17, q11
  3806. + vst1.8 {d16}, [r0,:64], r1
  3807. + vst1.8 {d17}, [r0,:64], r1
  3808. + bgt weight8_nodenom_loop
  3809. + pop {r4-r5,pc}
  3810. +.endfunc
  3811. +
  3812. +function x264_mc_weight_w4_nodenom_neon // 4-byte-wide rows; listed in x264_mc_nodenom_wtab_neon
  3813. + weight_prologue nodenom // macro defined outside this excerpt; presumably sets q0 = scale, q1 = offset, ip = row count - TODO confirm
  3814. +weight4_nodenom_loop:
  3815. + subs ip, #2 // two rows per iteration; sets the flags tested by bgt below
  3816. + vld1.32 {d16[]}, [r2], r3 // row 0: load 4 bytes and replicate them into both 32-bit lanes of d16, then r2 += r3
  3817. + vld1.32 {d18[]}, [r2], r3 // row 1: same into d18
  3818. + vmovl.u8 q8, d16 // zero-extend; d16 now holds the 4 pixels of row 0 as 16-bit lanes
  3819. + vmovl.u8 q9, d18 // zero-extend; d18 now holds the 4 pixels of row 1 as 16-bit lanes
  3820. + vmov q10, q1 // one accumulator covers both rows: d20 for row 0, d21 for row 1
  3821. + vmla.s16 d20, d16, d0 // row 0: accumulator += pixels * d0
  3822. + vmla.s16 d21, d18, d0 // row 1: accumulator += pixels * d0
  3823. + vqmovun.s16 d16, q10 // narrow with unsigned saturation: d16[0] = row 0, d16[1] = row 1
  3824. + vst1.32 {d16[0]}, [r0,:32], r1 // store row 0 (4 bytes), then r0 += r1
  3825. + vst1.32 {d16[1]}, [r0,:32], r1 // store row 1 (4 bytes), then r0 += r1
  3826. + bgt weight4_nodenom_loop // repeat while the counter left by subs is above zero
  3827. + pop {r4-r5,pc} // restore r4-r5 and return
  3828. +.endfunc
  3829. +
  3830. +.macro weight_simple_prologue // shared entry code for the offset-only kernels: leaves the row count in ip and the offset byte in every lane of q1
  3831. + push {lr} // lr is reused as a scratch register below; the kernels return with pop {pc}
  3832. + ldr lr, [sp, #4] // weight_t: fifth argument (first stack slot, now 4 bytes above sp because of the push)
  3833. + ldr ip, [sp, #8] // h: sixth argument, used by the kernels as the loop counter
  3834. + ldr lr, [lr] // offset: first 32-bit word of the weight struct (presumably cachea[0] as written by x264_weight_cache_neon - TODO confirm layout)
  3835. + vdup.8 q1, lr // broadcast the low byte of lr to all 16 byte lanes of q1
  3836. +.endm
  3837. +
  3838. +.macro weight_simple name op // emits the w20/w16/w8/w4 offset-only kernels; op is the byte-wise instruction applied between each pixel and q1
  3839. +function x264_mc_weight_w20_\name\()_neon
  3840. + weight_simple_prologue // ip = row count, q1 = offset byte in every lane
  3841. +weight20_\name\()_loop:
  3842. + subs ip, #2 // two rows per iteration; sets the flags tested by bgt below
  3843. + vld1.8 {d16-d18}, [r2], r3 // row 0: load 24 bytes (more than the nominal 20), then r2 += r3
  3844. + vld1.8 {d19-d21}, [r2], r3 // row 1: load 24 bytes, then r2 += r3
  3845. + \op q8, q8, q1 // d16-d17: row 0, bytes 0-15
  3846. + \op q9, q9, q1 // d18: row 0, bytes 16-23; d19: row 1, bytes 0-7
  3847. + \op q10, q10, q1 // d20-d21: row 1, bytes 8-23
  3848. + vst1.8 {d16-d18}, [r0,:64], r1 // row 0: store 24 bytes (address declared 64-bit aligned), then r0 += r1
  3849. + vst1.8 {d19-d21}, [r0,:64], r1 // row 1: same
  3850. + bgt weight20_\name\()_loop // repeat while the counter left by subs is above zero
  3851. + pop {pc} // return through the lr value pushed by the prologue
  3852. +.endfunc
  3853. +
  3854. +function x264_mc_weight_w16_\name\()_neon
  3855. + weight_simple_prologue // ip = row count, q1 = offset byte in every lane
  3856. +weight16_\name\()_loop:
  3857. + subs ip, #2 // two rows per iteration
  3858. + vld1.8 {d16-d17}, [r2], r3 // row 0: load 16 bytes, then r2 += r3
  3859. + vld1.8 {d18-d19}, [r2], r3 // row 1: load 16 bytes, then r2 += r3
  3860. + \op q8, q8, q1 // row 0
  3861. + \op q9, q9, q1 // row 1
  3862. + vst1.8 {d16-d17}, [r0,:128], r1 // row 0: store 16 bytes (address declared 128-bit aligned), then r0 += r1
  3863. + vst1.8 {d18-d19}, [r0,:128], r1 // row 1: same
  3864. + bgt weight16_\name\()_loop
  3865. + pop {pc}
  3866. +.endfunc
  3867. +
  3868. +function x264_mc_weight_w8_\name\()_neon
  3869. + weight_simple_prologue // ip = row count, q1 = offset byte in every lane
  3870. +weight8_\name\()_loop:
  3871. + subs ip, #2 // two rows per iteration
  3872. + vld1.8 {d16}, [r2], r3 // row 0: load 8 bytes, then r2 += r3
  3873. + vld1.8 {d17}, [r2], r3 // row 1: load 8 bytes, then r2 += r3
  3874. + \op q8, q8, q1 // both rows at once (q8 = d16:d17)
  3875. + vst1.8 {d16}, [r0,:64], r1 // row 0: store 8 bytes, then r0 += r1
  3876. + vst1.8 {d17}, [r0,:64], r1 // row 1: store 8 bytes, then r0 += r1
  3877. + bgt weight8_\name\()_loop
  3878. + pop {pc}
  3879. +.endfunc
  3880. +
  3881. +function x264_mc_weight_w4_\name\()_neon
  3882. + weight_simple_prologue // ip = row count, q1 = offset byte in every lane
  3883. +weight4_\name\()_loop:
  3884. + subs ip, #2 // two rows per iteration
  3885. + vld1.32 {d16[]}, [r2], r3 // row 0: load 4 bytes replicated into both 32-bit lanes of d16, then r2 += r3
  3886. + vld1.32 {d17[]}, [r2], r3 // row 1: same into d17
  3887. + \op q8, q8, q1 // both rows at once (q8 = d16:d17)
  3888. + vst1.32 {d16[0]}, [r0,:32], r1 // row 0: store lane 0 (4 bytes), then r0 += r1
  3889. + vst1.32 {d17[0]}, [r0,:32], r1 // row 1: store lane 0 (4 bytes), then r0 += r1
  3890. + bgt weight4_\name\()_loop
  3891. + pop {pc}
  3892. +.endfunc
  3893. +.endm
  3894. +
  3895. +weight_simple offsetadd, vqadd.u8 // x264_mc_weight_wN_offsetadd_neon: saturating unsigned add of the offset byte
  3896. +weight_simple offsetsub, vqsub.u8 // x264_mc_weight_wN_offsetsub_neon: saturating unsigned subtract of the offset byte
  3897. +
  3898. +
  3899. // void mc_copy( uint8_t *dst, int dst_stride, uint8_t *src, int src_stride, int height )
  3900. function x264_mc_copy_w4_neon
  3901. ldr ip, [sp]
  3902. diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
  3903. index 20cf151..0a7b734 100644
  3904. --- a/common/arm/mc-c.c
  3905. +++ b/common/arm/mc-c.c
  3906. @@ -43,6 +43,48 @@ void x264_pixel_avg2_w8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
  3907. void x264_pixel_avg2_w16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
  3908. void x264_pixel_avg2_w20_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
  3909.  
  3910. +#define MC_WEIGHT(func) /* declares the four NEON kernels for one variant suffix and builds that variant's 6-entry dispatch table */\
  3911. +void x264_mc_weight_w20##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
  3912. +void x264_mc_weight_w16##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
  3913. +void x264_mc_weight_w8##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
  3914. +void x264_mc_weight_w4##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
  3915. +\
  3916. +static void (* const x264_mc##func##_wtab_neon[6])( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int ) =\
  3917. +{\
  3918. + x264_mc_weight_w4##func##_neon, /* slots 0 and 1 share the w4 kernel; slots are presumably indexed by block width - TODO confirm against the caller */\
  3919. + x264_mc_weight_w4##func##_neon,\
  3920. + x264_mc_weight_w8##func##_neon,\
  3921. + x264_mc_weight_w16##func##_neon, /* slots 3 and 4 share the w16 kernel */\
  3922. + x264_mc_weight_w16##func##_neon,\
  3923. + x264_mc_weight_w20##func##_neon,\
  3924. +};
  3925. +
  3926. +MC_WEIGHT() /* empty suffix: x264_mc_weight_wN_neon and x264_mc_wtab_neon */
  3927. +MC_WEIGHT(_nodenom) /* x264_mc_weight_wN_nodenom_neon and x264_mc_nodenom_wtab_neon */
  3928. +MC_WEIGHT(_offsetadd) /* x264_mc_weight_wN_offsetadd_neon and x264_mc_offsetadd_wtab_neon */
  3929. +MC_WEIGHT(_offsetsub) /* x264_mc_weight_wN_offsetsub_neon and x264_mc_offsetsub_wtab_neon */
  3930. +
  3931. +static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w ) // selects the NEON kernel table for weight w; h is not read here
  3932. +{
  3933. + if( w->i_scale == 1<<w->i_denom ) // scale equals 2^denom, so only the offset changes pixel values
  3934. + {
  3935. + if( w->i_offset < 0 )
  3936. + {
  3937. + w->weightfn = x264_mc_offsetsub_wtab_neon; // negative offset: kernels built with vqsub.u8
  3938. + w->cachea[0] = -w->i_offset; // cache the magnitude; the subtract kernels take a non-negative amount
  3939. + }
  3940. + else
  3941. + {
  3942. + w->weightfn = x264_mc_offsetadd_wtab_neon; // zero or positive offset: kernels built with vqadd.u8
  3943. + w->cachea[0] = w->i_offset;
  3944. + }
  3945. + }
  3946. + else if( !w->i_denom ) // real scaling but no denominator: the nodenom kernels apply
  3947. + w->weightfn = x264_mc_nodenom_wtab_neon;
  3948. + else // general case
  3949. + w->weightfn = x264_mc_wtab_neon;
  3950. +}
  3951. +
  3952. void x264_mc_copy_w4_neon( uint8_t *, int, uint8_t *, int, int );
  3953. void x264_mc_copy_w8_neon( uint8_t *, int, uint8_t *, int, int );
  3954. void x264_mc_copy_w16_neon( uint8_t *, int, uint8_t *, int, int );
  3955. @@ -182,6 +224,11 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
  3956. pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_neon;
  3957. pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_neon;
  3958.  
  3959. + pf->weight = x264_mc_wtab_neon;
  3960. + pf->offsetadd = x264_mc_offsetadd_wtab_neon;
  3961. + pf->offsetsub = x264_mc_offsetsub_wtab_neon;
  3962. + pf->weight_cache = x264_weight_cache_neon;
  3963. +
  3964. // Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
  3965. #ifndef SYS_MACOSX
  3966. pf->memcpy_aligned = x264_memcpy_aligned_neon;
  3967. --
  3968. 1.6.1.2
  3969.  
  3970.  
  3971. From 44057dac8a3c3a1fb359035895b9126a52f75993 Mon Sep 17 00:00:00 2001
  3972. From: David Conrad <lessen42@gmail.com>
  3973. Date: Sun, 4 Oct 2009 07:24:42 -0400
  3974. Subject: [PATCH 24/26] iPhone compilation support
  3975. Also add --sysroot to configure options
  3976.  
  3977. To build for iPhone 3gs / iPod touch 3g:
  3978. CC=/Developer/Platforms/iPhoneOS.platform/Developer/usr/bin/gcc ./configure --host=arm-apple-darwin --sysroot=/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS3.0.sdk
  3979.  
  3980. For older devices, add
  3981. --extra-cflags='-arch armv6 -mcpu=arm1176jzf-s' --extra-ldflags='-arch armv6' --disable-asm
  3982. ---
  3983. common/arm/asm.S | 9 ++-
  3984. common/arm/pixel-a.S | 13 ++-
  3985. configure | 17 +++-
  3986. extras/gas-preprocessor.pl | 256 ++++++++++++++++++++++++++++++++++++++++++++
  3987. 4 files changed, 287 insertions(+), 8 deletions(-)
  3988. create mode 100755 extras/gas-preprocessor.pl
  3989.  
  3990. diff --git a/common/arm/asm.S b/common/arm/asm.S
  3991. index d163165..395267f 100644
  3992. --- a/common/arm/asm.S
  3993. +++ b/common/arm/asm.S
  3994. @@ -20,6 +20,12 @@
  3995.  
  3996. #include "config.h"
  3997.  
  3998. +#ifdef PREFIX
  3999. +# define EXTERN_ASM _
  4000. +#else
  4001. +# define EXTERN_ASM
  4002. +#endif
  4003. +
  4004. #ifdef __ELF__
  4005. # define ELF
  4006. #else
  4007. @@ -35,7 +41,8 @@ ELF .eabi_attribute 25, \val
  4008. .endm
  4009.  
  4010. .macro function name
  4011. - .global \name
  4012. + .global EXTERN_ASM\name
  4013. +EXTERN_ASM\name:
  4014. ELF .hidden \name
  4015. ELF .type \name, %function
  4016. .func \name
  4017. diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S
  4018. index 4dd65ed..d8533e5 100644
  4019. --- a/common/arm/pixel-a.S
  4020. +++ b/common/arm/pixel-a.S
  4021. @@ -110,16 +110,17 @@ SAD4_ARMV6 8
  4022.  
  4023. .macro SAD_FUNC w, h, name, align:vararg
  4024. function x264_pixel_sad\name\()_\w\()x\h\()_neon
  4025. + SAD_START_\w \align
  4026. +
  4027. .if \w == 16
  4028. - .set r, \h / 2 - 1
  4029. +.rept \h / 2 - 1
  4030. + SAD_\w \align
  4031. +.endr
  4032. .else
  4033. - .set r, \h - 1
  4034. -.endif
  4035. -
  4036. - SAD_START_\w \align
  4037. -.rept r
  4038. +.rept \h - 1
  4039. SAD_\w \align
  4040. .endr
  4041. +.endif
  4042.  
  4043. .if \w > 8
  4044. vabal.u8 q8, d4, d6
  4045. diff --git a/configure b/configure
  4046. index b254383..5288351 100755
  4047. --- a/configure
  4048. +++ b/configure
  4049. @@ -23,6 +23,7 @@ echo " --extra-cflags=ECFLAGS add ECFLAGS to CFLAGS"
  4050. echo " --extra-ldflags=ELDFLAGS add ELDFLAGS to LDFLAGS"
  4051. echo " --host=HOST build programs to run on HOST"
  4052. echo " --cross-prefix=PREFIX use PREFIX for compilation tools"
  4053. +echo " --sysroot=SYSROOT root of cross-build tree"
  4054. echo ""
  4055. exit 1
  4056. fi
  4057. @@ -223,6 +224,10 @@ for opt do
  4058. --cross-prefix=*)
  4059. cross_prefix="${opt#--cross-prefix=}"
  4060. ;;
  4061. + --sysroot=*)
  4062. + CFLAGS="$CFLAGS --sysroot=${opt#--sysroot=}"
  4063. + LDFLAGS="$LDFLAGS --sysroot=${opt#--sysroot=}"
  4064. + ;;
  4065. *)
  4066. echo "Unknown option $opt, ignored"
  4067. ;;
  4068. @@ -367,7 +372,17 @@ case $host_cpu in
  4069. ;;
  4070. arm*)
  4071. ARCH="ARM"
  4072. - AS="${AS-${cross_prefix}gcc}"
  4073. + if [ "$SYS" = MACOSX ] ; then
  4074. + AS="${AS-extras/gas-preprocessor.pl $CC}"
  4075. + ASFLAGS="$ASFLAGS -DPREFIX -DPIC" # apple's ld doesn't support movw/movt relocations at all
  4076. + # build for armv7 by default
  4077. + if ! echo $CFLAGS | grep -Eq '\-arch' ; then
  4078. + CFLAGS="$CFLAGS -arch armv7"
  4079. + LDFLAGS="$LDFLAGS -arch armv7"
  4080. + fi
  4081. + else
  4082. + AS="${AS-${cross_prefix}gcc}"
  4083. + fi
  4084. ;;
  4085. s390|s390x)
  4086. ARCH="S390"
  4087. diff --git a/extras/gas-preprocessor.pl b/extras/gas-preprocessor.pl
  4088. new file mode 100755
  4089. index 0000000..d60893c
  4090. --- /dev/null
  4091. +++ b/extras/gas-preprocessor.pl
  4092. @@ -0,0 +1,256 @@
  4093. +#!/usr/bin/env perl
  4094. +# by David Conrad
  4095. +# This code is licensed under GPLv2 or later; go to gnu.org to read it
  4096. +# (not that it much matters for an asm preprocessor)
  4097. +# usage: set your assembler to be something like "perl gas-preprocessor.pl gcc"
  4098. +use strict;
  4099. +
  4100. +# Apple's gas is ancient and doesn't support modern preprocessing features like
  4101. +# .rept and has ugly macro syntax, among other things. Thus, this script
  4102. +# implements the subset of the gas preprocessor used by x264 and ffmpeg
  4103. +# that isn't supported by Apple's gas.
  4104. +
  4105. +# FIXME: doesn't work if the path has spaces, but oh well...
  4106. +my $gcc_cmd = join(' ', @ARGV); # the whole compiler command line as one string; reused below as the final assembler command
  4107. +my $preprocess_c_cmd; # command whose stdout is the text this script rewrites
  4108. +
  4109. +if ($gcc_cmd =~ /\S+\.c/) {
  4110. + # C file (inline asm?) - compile
  4111. + $preprocess_c_cmd = "$gcc_cmd -S";
  4112. + $gcc_cmd =~ s/\S+\.c/-x assembler -/g; # final command reads assembly from stdin instead of the .c file
  4113. +} elsif ($gcc_cmd =~ /\S+\.S/) {
  4114. + # asm file, just do C preprocessor
  4115. + $preprocess_c_cmd = "$gcc_cmd -E";
  4116. + $gcc_cmd =~ s/\S+\.S/-x assembler -/g; # final command reads assembly from stdin instead of the .S file
  4117. +} else {
  4118. + die "Unrecognized input filetype";
  4119. +}
  4120. +
  4121. +$preprocess_c_cmd =~ s/\S+\.o/-/g; # replace any *.o word (presumably the -o target) with "-" so the output goes to stdout
  4122. +
  4123. +open(ASMFILE, "-|", $preprocess_c_cmd) || die "Error running preprocessor"; # read the first stage through a pipe
  4124. +
  4125. +my $current_macro = '';
  4126. +my %macro_lines;
  4127. +my %macro_args;
  4128. +my %macro_args_default;
  4129. +
  4130. +my @pass1_lines;
  4131. +
  4132. +# pass 1: parse .macro
  4133. +# note that the handling of arguments is probably overly permissive vs. gas
  4134. +# but it should be the same for valid cases
  4135. +while (<ASMFILE>) {
  4136. + # comment out unsupported directives
  4137. + s/\.type/@.type/x; # a leading @ turns the directive into an ARM assembler comment
  4138. + s/\.func/@.func/x;
  4139. + s/\.endfunc/@.endfunc/x;
  4140. + s/\.ltorg/@.ltorg/x; # still recognised in pass 2, which appends the literal pool after it
  4141. + s/\.size/@.size/x;
  4142. + s/\.fpu/@.fpu/x;
  4143. +
  4144. + # the syntax for these is a little different
  4145. + s/\.global/.globl/x;
  4146. + # also catch .section .rodata since the equivalent to .const_data is .section __DATA,__const
  4147. + s/(.*)\.rodata/.const_data/x;
  4148. + s/\.int/.long/x;
  4149. + s/\.float/.single/x;
  4150. +
  4151. + # catch unknown section names that aren't mach-o style (with a comma)
  4152. + if (/\.section ([^,]*)$/) { # literal dot: an unescaped "." also matched text such as ".subsection 1"
  4153. + die ".section $1 unsupported; figure out the mach-o section name and add it";
  4154. + }
  4155. +
  4156. + # macros creating macros is not handled (is that valid?)
  4157. + if (/\.macro\s+([\d\w\.]+)\s*(.*)/) {
  4158. + $current_macro = $1; # lines that follow are recorded under this name until .endm
  4159. +
  4160. + # commas in the argument list are optional, so only use whitespace as the separator
  4161. + my $arglist = $2;
  4162. + $arglist =~ s/,/ /g;
  4163. +
  4164. + my @args = split(/\s+/, $arglist);
  4165. + foreach my $i (0 .. $#args) {
  4166. + my @argpair = split(/=/, $args[$i]); # "name=default" gives (name, default)
  4167. + $macro_args{$current_macro}[$i] = $argpair[0]; # positional name, stored with any :vararg suffix intact
  4168. + $argpair[0] =~ s/:vararg$//;
  4169. + $macro_args_default{$current_macro}{$argpair[0]} = $argpair[1]; # undef when no default was given
  4170. + }
  4171. + # ensure %macro_lines has the macro name added as a key
  4172. + $macro_lines{$current_macro} = [];
  4173. + } elsif (/\.endm/) {
  4174. + if (!$current_macro) {
  4175. + die "ERROR: .endm without .macro";
  4176. + }
  4177. + $current_macro = '';
  4178. + } elsif ($current_macro) {
  4179. + push(@{$macro_lines{$current_macro}}, $_); # inside a definition: store the line unexpanded
  4180. + } else {
  4181. + expand_macros($_); # outside any definition: expand known macros into @pass1_lines
  4182. + }
  4183. +}
  4184. +
  4185. +sub expand_macros { # appends one input line to @pass1_lines, recursively expanding it if it invokes a recorded macro
  4186. + my $line = $_[0]; # scalar element access; @_[0] is a one-element slice and triggers a Perl warning
  4187. + if ($line =~ /(\S+:|)\s*([\w\d\.]+)\s*(.*)/ && exists $macro_lines{$2}) {
  4188. + push(@pass1_lines, $1); # keep an optional leading label; it is pushed without a newline
  4189. + my $macro = $2;
  4190. +
  4191. + # commas are optional here too, but are syntactically important because
  4192. + # parameters can be blank
  4193. + my @arglist = split(/,/, $3);
  4194. + my @args;
  4195. + foreach (@arglist) {
  4196. + my @whitespace_split = split(/\s+/, $_);
  4197. + if (!@whitespace_split) {
  4198. + push(@args, ''); # an empty field between commas is a blank argument
  4199. + } else {
  4200. + foreach (@whitespace_split) {
  4201. + if (length($_)) {
  4202. + push(@args, $_);
  4203. + }
  4204. + }
  4205. + }
  4206. + }
  4207. +
  4208. + my %replacements;
  4209. + if ($macro_args_default{$macro}){
  4210. + %replacements = %{$macro_args_default{$macro}}; # start from the declared defaults
  4211. + }
  4212. +
  4213. + # construct hashtable of text to replace
  4214. + foreach my $i (0 .. $#args) {
  4215. + my $argname = $macro_args{$macro}[$i];
  4216. +
  4217. + if ($args[$i] =~ m/=/) {
  4218. + # arg=val references the argument name
  4219. + # XXX: I'm not sure what the expected behaviour if a lot of
  4220. + # these are mixed with unnamed args
  4221. + my @named_arg = split(/=/, $args[$i]);
  4222. + $replacements{$named_arg[0]} = $named_arg[1];
  4223. + } elsif ($i > $#{$macro_args{$macro}}) {
  4224. + # more args given than the macro has named args
  4225. + # XXX: is vararg allowed on arguments before the last?
  4226. + $argname = $macro_args{$macro}[-1];
  4227. + if ($argname =~ s/:vararg$//) {
  4228. + $replacements{$argname} .= ", $args[$i]"; # extra values are appended to the trailing vararg parameter
  4229. + } else {
  4230. + die "Too many arguments to macro $macro";
  4231. + }
  4232. + } else {
  4233. + $argname =~ s/:vararg$//;
  4234. + $replacements{$argname} = $args[$i];
  4235. + }
  4236. + }
  4237. +
  4238. + # apply replacements as regex
  4239. + foreach (@{$macro_lines{$macro}}) {
  4240. + my $macro_line = $_; # work on a copy so the stored macro body stays untouched
  4241. + # do replacements by longest first, this avoids wrong replacement
  4242. + # when argument names are subsets of each other
  4243. + foreach (reverse sort {length $a <=> length $b} keys %replacements) {
  4244. + $macro_line =~ s/\\$_/$replacements{$_}/g;
  4245. + }
  4246. + $macro_line =~ s/\\\(\)//g; # remove \()
  4247. + expand_macros($macro_line); # the body may itself invoke other macros
  4248. + }
  4249. + } else {
  4250. + push(@pass1_lines, $line); # not a macro invocation: pass the line through unchanged
  4251. + }
  4252. +}
  4253. +
  4254. +close(ASMFILE) or exit 1;
  4255. +open(ASMFILE, "|-", $gcc_cmd) or die "Error running assembler";
  4256. +
  4257. +my @sections;
  4258. +my $num_repts;
  4259. +my $rept_lines;
  4260. +
  4261. +my %literal_labels; # for ldr <reg>, =<expr>
  4262. +my $literal_num = 0;
  4263. +
  4264. +# pass 2: parse .rept and .if variants
  4265. +# NOTE: since we don't implement a proper parser, using .rept with a
  4266. +# variable assigned from .set is not supported
  4267. +foreach my $line (@pass1_lines) {
  4268. + # textual comparison .if
  4269. + # this assumes nothing else on the same line
  4270. + if ($line =~ /\.ifnb\s+(.*)/) {
  4271. + if (length($1)) { # test for emptiness, not truthiness: an operand of "0" is not blank
  4272. + $line = ".if 1\n";
  4273. + } else {
  4274. + $line = ".if 0\n";
  4275. + }
  4276. + } elsif ($line =~ /\.ifb\s+(.*)/) {
  4277. + if (length($1)) { # same emptiness test as .ifnb, with the result inverted
  4278. + $line = ".if 0\n";
  4279. + } else {
  4280. + $line = ".if 1\n";
  4281. + }
  4282. + } elsif ($line =~ /\.ifc\s+(.*)\s*,\s*(.*)/) {
  4283. + if ($1 eq $2) {
  4284. + $line = ".if 1\n";
  4285. + } else {
  4286. + $line = ".if 0\n";
  4287. + }
  4288. + }
  4289. +
  4290. + # handle .previous (only with regard to .section not .subsection)
  4291. + if ($line =~ /\.(section|text|const_data)/) {
  4292. + push(@sections, $line); # history of section switches, newest last
  4293. + } elsif ($line =~ /\.previous/) {
  4294. + if (!$sections[-2]) {
  4295. + die ".previous without a previous section";
  4296. + }
  4297. + $line = $sections[-2]; # re-emit the section directive that was active before the current one
  4298. + push(@sections, $line);
  4299. + }
  4300. +
  4301. + # handle ldr <reg>, =<expr>
  4302. + if ($line =~ /(.*)\s*ldr([\w\s\d]+)\s*,\s*=(.*)/) {
  4303. + my $label = $literal_labels{$3}; # one label per distinct expression since the last pool flush
  4304. + if (!$label) {
  4305. + $label = ".Literal_$literal_num";
  4306. + $literal_num++;
  4307. + $literal_labels{$3} = $label;
  4308. + }
  4309. + $line = "$1 ldr$2, $label\n"; # load from the label instead of using the =expr form
  4310. + } elsif ($line =~ /\.ltorg/) {
  4311. + foreach my $literal (keys %literal_labels) {
  4312. + $line .= "$literal_labels{$literal}:\n .word $literal\n"; # emit the pending literal pool right after the directive
  4313. + }
  4314. + %literal_labels = ();
  4315. + }
  4316. +
  4317. + # @l -> lo16() @ha -> ha16()
  4318. + $line =~ s/,\s+([^,]+)\@l(\s)/, lo16($1)$2/g;
  4319. + $line =~ s/,\s+([^,]+)\@ha(\s)/, ha16($1)$2/g;
  4320. +
  4321. + if ($line =~ /\.rept\s+(.*)/) {
  4322. + $num_repts = $1;
  4323. + $rept_lines = "\n"; # non-empty from here on, which marks that a .rept body is being collected
  4324. +
  4325. + # handle the possibility of repeating another directive on the same line
  4326. + # .endr on the same line is not valid, I don't know if a non-directive is
  4327. + if ($num_repts =~ s/(\.\w+.*)//) {
  4328. + $rept_lines .= "$1\n";
  4329. + }
  4330. + $num_repts = eval($num_repts); # the count is evaluated as a Perl expression
  4331. + } elsif ($line =~ /\.endr/) {
  4332. + for (1 .. $num_repts) {
  4333. + print ASMFILE $rept_lines; # write the collected body once per repetition
  4334. + }
  4335. + $rept_lines = ''; # back to pass-through mode
  4336. + } elsif ($rept_lines) {
  4337. + $rept_lines .= $line; # inside .rept: collect instead of printing
  4338. + } else {
  4339. + print ASMFILE $line;
  4340. + }
  4341. +}
  4342. +
  4343. +print ASMFILE ".text\n"; # literals still pending at end of input are emitted in the text section
  4344. +foreach my $literal (keys %literal_labels) {
  4345. + print ASMFILE "$literal_labels{$literal}:\n .word $literal\n";
  4346. +}
  4347. +
  4348. +close(ASMFILE) or exit 1; # closing the pipe waits for the assembler; exit 1 if it failed
  4349. --
  4350. 1.6.1.2
  4351.  
  4352.  
  4353. From d3dfd8704d23ae2c723263478e40326b51a2ceaf Mon Sep 17 00:00:00 2001
  4354. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  4355. Date: Sat, 13 Feb 2010 11:19:38 -0800
  4356. Subject: [PATCH 25/26] Don't even try direct temporal when it would give junk MVs
  4357. In PbBbP pyramid structure, the last "b" cannot use temporal because L0Ref0(L1Ref0) != L0Ref0.
  4358. Don't even bother analyzing it, just use spatial.
  4359. Should improve speed and direct auto effectiveness in CRF and 1-pass modes when b-pyramid is used.
  4360. Also makes --direct temporal useful with --b-pyramid, since it will fall back to spatial for frames where temporal is broken.
  4361.  
  4362. ---
  4363. common/frame.h | 1 +
  4364. encoder/encoder.c | 30 +++++++++++++++++++++---------
  4365. 2 files changed, 22 insertions(+), 9 deletions(-)
  4366.  
  4367. diff --git a/common/frame.h b/common/frame.h
  4368. index b1852b3..7c8e2ff 100644
  4369. --- a/common/frame.h
  4370. +++ b/common/frame.h
  4371. @@ -48,6 +48,7 @@ typedef struct x264_frame
  4372. uint8_t i_bframes; /* number of bframes following this nonb in coded order */
  4373. float f_qp_avg_rc; /* QPs as decided by ratecontrol */
  4374. float f_qp_avg_aq; /* QPs as decided by AQ in addition to ratecontrol */
  4375. + int i_poc_l0ref0; /* poc of first refframe in L0, used to check if direct temporal is possible */
  4376.  
  4377. /* YUV buffer */
  4378. int i_plane;
  4379. diff --git a/encoder/encoder.c b/encoder/encoder.c
  4380. index cca9c45..df62389 100644
  4381. --- a/encoder/encoder.c
  4382. +++ b/encoder/encoder.c
  4383. @@ -108,12 +108,24 @@ static void x264_slice_header_init( x264_t *h, x264_slice_header_t *sh,
  4384.  
  4385. sh->i_redundant_pic_cnt = 0;
  4386.  
  4387. - if( !h->mb.b_direct_auto_read )
  4388. + h->mb.b_direct_auto_write = h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO
  4389. + && h->param.i_bframe
  4390. + && ( h->param.rc.b_stat_write || !h->param.rc.b_stat_read );
  4391. +
  4392. + if( !h->mb.b_direct_auto_read && sh->i_type == SLICE_TYPE_B )
  4393. {
  4394. - if( h->mb.b_direct_auto_write )
  4395. - sh->b_direct_spatial_mv_pred = ( h->stat.i_direct_score[1] > h->stat.i_direct_score[0] );
  4396. + if( h->fref1[0]->i_poc_l0ref0 == h->fref0[0]->i_poc )
  4397. + {
  4398. + if( h->mb.b_direct_auto_write )
  4399. + sh->b_direct_spatial_mv_pred = ( h->stat.i_direct_score[1] > h->stat.i_direct_score[0] );
  4400. + else
  4401. + sh->b_direct_spatial_mv_pred = ( param->analyse.i_direct_mv_pred == X264_DIRECT_PRED_SPATIAL );
  4402. + }
  4403. else
  4404. - sh->b_direct_spatial_mv_pred = ( param->analyse.i_direct_mv_pred == X264_DIRECT_PRED_SPATIAL );
  4405. + {
  4406. + h->mb.b_direct_auto_write = 0;
  4407. + sh->b_direct_spatial_mv_pred = 1;
  4408. + }
  4409. }
  4410. /* else b_direct_spatial_mv_pred was read from the 2pass statsfile */
  4411.  
  4412. @@ -623,10 +635,6 @@ static int x264_validate_parameters( x264_t *h )
  4413. h->param.i_sync_lookahead = 0;
  4414. #endif
  4415.  
  4416. - h->mb.b_direct_auto_write = h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO
  4417. - && h->param.i_bframe
  4418. - && ( h->param.rc.b_stat_write || !h->param.rc.b_stat_read );
  4419. -
  4420. h->param.i_deblocking_filter_alphac0 = x264_clip3( h->param.i_deblocking_filter_alphac0, -6, 6 );
  4421. h->param.i_deblocking_filter_beta = x264_clip3( h->param.i_deblocking_filter_beta, -6, 6 );
  4422. h->param.analyse.i_luma_deadzone[0] = x264_clip3( h->param.analyse.i_luma_deadzone[0], 0, 32 );
  4423. @@ -2371,6 +2379,9 @@ int x264_encoder_encode( x264_t *h,
  4424. x264_reference_check_reorder( h );
  4425. }
  4426.  
  4427. + if( h->i_ref0 )
  4428. + h->fdec->i_poc_l0ref0 = h->fref0[0]->i_poc;
  4429. +
  4430. if( h->sh.i_type == SLICE_TYPE_B )
  4431. x264_macroblock_bipred_init( h );
  4432.  
  4433. @@ -2806,7 +2817,8 @@ void x264_encoder_close ( x264_t *h )
  4434. x264_log( h, X264_LOG_INFO, "8x8 transform intra:%.1f%%%s\n", 100. * i_i8x8 / i_intra, buf );
  4435. }
  4436.  
  4437. - if( h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO
  4438. + if( (h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO ||
  4439. + (h->stat.i_direct_frames[0] && h->stat.i_direct_frames[1]))
  4440. && h->stat.i_frame_count[SLICE_TYPE_B] )
  4441. {
  4442. x264_log( h, X264_LOG_INFO, "direct mvs spatial:%.1f%% temporal:%.1f%%\n",
  4443. --
  4444. 1.6.1.2
  4445.  
  4446.  
  4447. From dcf583527aea433e7a3972cd7597d167bb8f3fe5 Mon Sep 17 00:00:00 2001
  4448. From: Loren Merritt <pengvado@akuvian.org>
  4449. Date: Thu, 28 Jan 2010 18:09:07 +0000
  4450. Subject: [PATCH 26/26] Remove unnecessary PIC support macros
  4451. yasm has a directive to enable PIC globally
  4452.  
  4453. ---
  4454. common/x86/cabac-a.asm | 2 +-
  4455. common/x86/dct-32.asm | 10 ++++----
  4456. common/x86/dct-64.asm | 10 ++++----
  4457. common/x86/dct-a.asm | 42 +++++++++++++++++++-------------------
  4458. common/x86/deblock-a.asm | 34 +++++++++++++++---------------
  4459. common/x86/mc-a.asm | 40 ++++++++++++++++++------------------
  4460. common/x86/mc-a2.asm | 30 +++++++++++++-------------
  4461. common/x86/pixel-a.asm | 50 +++++++++++++++++++++++-----------------------
  4462. common/x86/predict-a.asm | 28 ++++++++++++------------
  4463. common/x86/quant-a.asm | 22 ++++++++++----------
  4464. common/x86/sad-a.asm | 14 ++++++------
  4465. common/x86/x86inc.asm | 20 +++--------------
  4466. common/x86/x86util.asm | 4 +-
  4467. tools/checkasm-a.asm | 16 +++++++-------
  4468. 14 files changed, 155 insertions(+), 167 deletions(-)
  4469.  
  4470. diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm
  4471. index 29e05f1..62e281a 100644
  4472. --- a/common/x86/cabac-a.asm
  4473. +++ b/common/x86/cabac-a.asm
  4474. @@ -59,7 +59,7 @@ endstruc
  4475. %macro LOAD_GLOBAL 4
  4476. %ifdef PIC
  4477. ; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
  4478. - lea r11, [%2 GLOBAL]
  4479. + lea r11, [%2]
  4480. %ifnidn %3, 0
  4481. add r11, %3
  4482. %endif
  4483. diff --git a/common/x86/dct-32.asm b/common/x86/dct-32.asm
  4484. index a713dd6..3350e40 100644
  4485. --- a/common/x86/dct-32.asm
  4486. +++ b/common/x86/dct-32.asm
  4487. @@ -349,7 +349,7 @@ cglobal x264_sub8x8_dct_%1, 3,3
  4488. global x264_sub8x8_dct_%1.skip_prologue
  4489. .skip_prologue:
  4490. %ifnidn %1, sse2
  4491. - mova m7, [hsub_mul GLOBAL]
  4492. + mova m7, [hsub_mul]
  4493. %endif
  4494. LOAD_DIFF8x4 0, 1, 2, 3, 6, 7, r1, r2-4*FDEC_STRIDE
  4495. SPILL r0, 1,2
  4496. @@ -393,7 +393,7 @@ global x264_sub8x8_dct8_%1.skip_prologue
  4497. LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+3*FDEC_STRIDE]
  4498. UNSPILL r0, 0
  4499. %else
  4500. - mova m7, [hsub_mul GLOBAL]
  4501. + mova m7, [hsub_mul]
  4502. LOAD_DIFF8x4 0, 1, 2, 3, 4, 7, r1, r2-4*FDEC_STRIDE
  4503. SPILL r0, 0,1
  4504. SWAP 1, 7
  4505. @@ -441,9 +441,9 @@ global x264_add8x8_idct_sse2.skip_prologue
  4506. SPILL r1, 0
  4507. TRANSPOSE2x4x4W 4,5,6,7,0
  4508. UNSPILL r1, 0
  4509. - paddw m0, [pw_32 GLOBAL]
  4510. + paddw m0, [pw_32]
  4511. IDCT4_1D 0,1,2,3,r1
  4512. - paddw m4, [pw_32 GLOBAL]
  4513. + paddw m4, [pw_32]
  4514. IDCT4_1D 4,5,6,7,r1
  4515. SPILL r1, 6,7
  4516. pxor m7, m7
  4517. @@ -466,7 +466,7 @@ global x264_add8x8_idct8_sse2.skip_prologue
  4518. IDCT8_1D 0,1,2,3,4,5,6,7,r1
  4519. SPILL r1, 6
  4520. TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r1+0x60],[r1+0x40],1
  4521. - paddw m0, [pw_32 GLOBAL]
  4522. + paddw m0, [pw_32]
  4523. SPILL r1, 0
  4524. IDCT8_1D 0,1,2,3,4,5,6,7,r1
  4525. SPILL r1, 6,7
  4526. diff --git a/common/x86/dct-64.asm b/common/x86/dct-64.asm
  4527. index 9915789..ba7741e 100644
  4528. --- a/common/x86/dct-64.asm
  4529. +++ b/common/x86/dct-64.asm
  4530. @@ -143,7 +143,7 @@ INIT_XMM
  4531. cglobal x264_sub8x8_dct_%1, 3,3,11
  4532. add r2, 4*FDEC_STRIDE
  4533. %ifnidn %1, sse2
  4534. - mova m7, [hsub_mul GLOBAL]
  4535. + mova m7, [hsub_mul]
  4536. %endif
  4537. %ifdef WIN64
  4538. call .skip_prologue
  4539. @@ -170,7 +170,7 @@ global x264_sub8x8_dct_%1.skip_prologue
  4540. cglobal x264_sub8x8_dct8_%1, 3,3,11
  4541. add r2, 4*FDEC_STRIDE
  4542. %ifnidn %1, sse2
  4543. - mova m7, [hsub_mul GLOBAL]
  4544. + mova m7, [hsub_mul]
  4545. %endif
  4546. %ifdef WIN64
  4547. call .skip_prologue
  4548. @@ -227,7 +227,7 @@ global x264_add8x8_idct8_sse2.skip_prologue
  4549. movdqa m7, [r1+0x70]
  4550. IDCT8_1D 0,1,2,3,4,5,6,7,8,10
  4551. TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
  4552. - paddw m0, [pw_32 GLOBAL] ; rounding for the >>6 at the end
  4553. + paddw m0, [pw_32] ; rounding for the >>6 at the end
  4554. IDCT8_1D 0,1,2,3,4,5,6,7,8,10
  4555. DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
  4556. DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
  4557. @@ -265,9 +265,9 @@ global x264_add8x8_idct_sse2.skip_prologue
  4558. TRANSPOSE2x4x4W 0,1,2,3,8
  4559. IDCT4_1D 4,5,6,7,8,10
  4560. TRANSPOSE2x4x4W 4,5,6,7,8
  4561. - paddw m0, [pw_32 GLOBAL]
  4562. + paddw m0, [pw_32]
  4563. IDCT4_1D 0,1,2,3,8,10
  4564. - paddw m4, [pw_32 GLOBAL]
  4565. + paddw m4, [pw_32]
  4566. IDCT4_1D 4,5,6,7,8,10
  4567. DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
  4568. DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
  4569. diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
  4570. index d4a0cae..618433c 100644
  4571. --- a/common/x86/dct-a.asm
  4572. +++ b/common/x86/dct-a.asm
  4573. @@ -80,7 +80,7 @@ cglobal x264_dct4x4dc_mmx, 1,1
  4574. movq m2, [r0+16]
  4575. movq m1, [r0+ 8]
  4576. movq m0, [r0+ 0]
  4577. - movq m7, [pw_8000 GLOBAL] ; convert to unsigned and back, so that pavgw works
  4578. + movq m7, [pw_8000] ; convert to unsigned and back, so that pavgw works
  4579. WALSH4_1D 0,1,2,3,4
  4580. TRANSPOSE4x4W 0,1,2,3,4
  4581. SUMSUB_BADC m1, m0, m3, m2, m4
  4582. @@ -123,7 +123,7 @@ cglobal x264_sub4x4_dct_%1, 3,3
  4583. LOAD_DIFF m1, m4, m5, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
  4584. LOAD_DIFF m2, m4, m5, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
  4585. %else
  4586. - mova m5, [hsub_mul GLOBAL]
  4587. + mova m5, [hsub_mul]
  4588. LOAD_DIFF8x4_SSSE3 0, 3, 1, 2, 4, 5, r1, r2
  4589. %endif
  4590. DCT4_1D 0,1,2,3,4
  4591. @@ -151,7 +151,7 @@ cglobal x264_add4x4_idct_mmx, 2,2
  4592. movq m0, [r1+ 0]
  4593. IDCT4_1D 0,1,2,3,4,5
  4594. TRANSPOSE4x4W 0,1,2,3,4
  4595. - paddw m0, [pw_32 GLOBAL]
  4596. + paddw m0, [pw_32]
  4597. IDCT4_1D 0,1,2,3,4,5
  4598. STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE]
  4599. STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE]
  4600. @@ -179,7 +179,7 @@ cglobal x264_add4x4_idct_sse4, 2,2,6
  4601. punpckhdq m2, m0
  4602. SWAP 0, 1
  4603.  
  4604. - mova m1, [pw_32_0 GLOBAL]
  4605. + mova m1, [pw_32_0]
  4606. paddw m1, m0 ; row1/row0 corrected
  4607. psraw m0, 1 ; row1>>1/...
  4608. mova m3, m2 ; row3/row2
  4609. @@ -221,7 +221,7 @@ cglobal %1, 3,3,11
  4610. pxor m7, m7
  4611. %else
  4612. add r2, 4*FDEC_STRIDE
  4613. - mova m7, [hsub_mul GLOBAL]
  4614. + mova m7, [hsub_mul]
  4615. %endif
  4616. .skip_prologue:
  4617. %ifdef WIN64
  4618. @@ -335,7 +335,7 @@ cglobal x264_add8x8_idct_dc_mmx, 2,2
  4619. movq mm0, [r1]
  4620. pxor mm1, mm1
  4621. add r0, FDEC_STRIDE*4
  4622. - paddw mm0, [pw_32 GLOBAL]
  4623. + paddw mm0, [pw_32]
  4624. psraw mm0, 6
  4625. psubw mm1, mm0
  4626. packuswb mm0, mm0
  4627. @@ -354,10 +354,10 @@ cglobal x264_add8x8_idct_dc_ssse3, 2,2
  4628. movq xmm0, [r1]
  4629. pxor xmm1, xmm1
  4630. add r0, FDEC_STRIDE*4
  4631. - paddw xmm0, [pw_32 GLOBAL]
  4632. + paddw xmm0, [pw_32]
  4633. psraw xmm0, 6
  4634. psubw xmm1, xmm0
  4635. - movdqa xmm5, [pb_idctdc_unpack GLOBAL]
  4636. + movdqa xmm5, [pb_idctdc_unpack]
  4637. packuswb xmm0, xmm0
  4638. packuswb xmm1, xmm1
  4639. pshufb xmm0, xmm5
  4640. @@ -393,7 +393,7 @@ cglobal x264_add16x16_idct_dc_mmx, 2,3
  4641. .loop:
  4642. movq mm0, [r1]
  4643. pxor mm1, mm1
  4644. - paddw mm0, [pw_32 GLOBAL]
  4645. + paddw mm0, [pw_32]
  4646. psraw mm0, 6
  4647. psubw mm1, mm0
  4648. packuswb mm0, mm0
  4649. @@ -447,8 +447,8 @@ cglobal x264_add16x16_idct_dc_sse2, 2,2,8
  4650. punpcklwd xmm2, xmm2
  4651. pxor xmm1, xmm1
  4652. pxor xmm3, xmm3
  4653. - paddw xmm0, [pw_32 GLOBAL]
  4654. - paddw xmm2, [pw_32 GLOBAL]
  4655. + paddw xmm0, [pw_32]
  4656. + paddw xmm2, [pw_32]
  4657. psraw xmm0, 6
  4658. psraw xmm2, 6
  4659. psubw xmm1, xmm0
  4660. @@ -477,11 +477,11 @@ cglobal x264_add16x16_idct_dc_ssse3, 2,2,8
  4661. movdqa xmm0, [r1]
  4662. add r1, 16
  4663. pxor xmm1, xmm1
  4664. - paddw xmm0, [pw_32 GLOBAL]
  4665. + paddw xmm0, [pw_32]
  4666. psraw xmm0, 6
  4667. psubw xmm1, xmm0
  4668. - movdqa xmm5, [ pb_idctdc_unpack GLOBAL]
  4669. - movdqa xmm6, [pb_idctdc_unpack2 GLOBAL]
  4670. + movdqa xmm5, [ pb_idctdc_unpack]
  4671. + movdqa xmm6, [pb_idctdc_unpack2]
  4672. packuswb xmm0, xmm0
  4673. packuswb xmm1, xmm1
  4674. movdqa xmm2, xmm0
  4675. @@ -815,8 +815,8 @@ cglobal x264_zigzag_scan_4x4_frame_mmx, 2,2
  4676. cglobal x264_zigzag_scan_4x4_frame_ssse3, 2,2
  4677. movdqa xmm1, [r1+16]
  4678. movdqa xmm0, [r1]
  4679. - pshufb xmm1, [pb_scan4frameb GLOBAL]
  4680. - pshufb xmm0, [pb_scan4framea GLOBAL]
  4681. + pshufb xmm1, [pb_scan4frameb]
  4682. + pshufb xmm0, [pb_scan4framea]
  4683. movdqa xmm2, xmm1
  4684. psrldq xmm1, 6
  4685. palignr xmm2, xmm0, 6
  4686. @@ -963,9 +963,9 @@ cglobal x264_zigzag_sub_4x4%1_%2_ssse3, 3,3,8
  4687. punpcklqdq xmm0, xmm2
  4688. punpcklqdq xmm4, xmm6
  4689. %ifidn %2, frame
  4690. - movdqa xmm7, [pb_sub4frame GLOBAL]
  4691. + movdqa xmm7, [pb_sub4frame]
  4692. %else
  4693. - movdqa xmm7, [pb_sub4field GLOBAL]
  4694. + movdqa xmm7, [pb_sub4field]
  4695. %endif
  4696. pshufb xmm0, xmm7
  4697. pshufb xmm4, xmm7
  4698. @@ -980,7 +980,7 @@ cglobal x264_zigzag_sub_4x4%1_%2_ssse3, 3,3,8
  4699. psubw xmm1, xmm5
  4700. %ifidn %1, ac
  4701. movd r2d, xmm0
  4702. - pand xmm0, [pb_subacmask GLOBAL]
  4703. + pand xmm0, [pb_subacmask]
  4704. %endif
  4705. movdqa [r0], xmm0
  4706. pxor xmm2, xmm2
  4707. @@ -1039,7 +1039,7 @@ cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 3,3
  4708. packsswb m5, m5
  4709. pxor m0, m0
  4710. pcmpeqb m5, m0
  4711. - paddb m5, [pb_1 GLOBAL]
  4712. + paddb m5, [pb_1]
  4713. movd r0d, m5
  4714. mov [r2+0], r0w
  4715. shr r0d, 16
  4716. @@ -1085,7 +1085,7 @@ cglobal x264_zigzag_interleave_8x8_cavlc_sse2, 3,3,8
  4717. packsswb m2, m2
  4718. packsswb m2, m2
  4719. pcmpeqb m5, m2
  4720. - paddb m5, [pb_1 GLOBAL]
  4721. + paddb m5, [pb_1]
  4722. movd r0d, m5
  4723. mov [r2+0], r0w
  4724. shr r0d, 16
  4725. diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
  4726. index 75b308f..00d0418 100644
  4727. --- a/common/x86/deblock-a.asm
  4728. +++ b/common/x86/deblock-a.asm
  4729. @@ -233,19 +233,19 @@ SECTION .text
  4730. ; clobbers: m0,3-6
  4731. %macro DEBLOCK_P0_Q0 0
  4732. mova m5, m1
  4733. - pxor m5, m2 ; p0^q0
  4734. - pand m5, [pb_01 GLOBAL] ; (p0^q0)&1
  4735. + pxor m5, m2 ; p0^q0
  4736. + pand m5, [pb_01] ; (p0^q0)&1
  4737. pcmpeqb m4, m4
  4738. pxor m3, m4
  4739. - pavgb m3, m0 ; (p1 - q1 + 256)>>1
  4740. - pavgb m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
  4741. + pavgb m3, m0 ; (p1 - q1 + 256)>>1
  4742. + pavgb m3, [pb_03] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
  4743. pxor m4, m1
  4744. - pavgb m4, m2 ; (q0 - p0 + 256)>>1
  4745. + pavgb m4, m2 ; (q0 - p0 + 256)>>1
  4746. pavgb m3, m5
  4747. - paddusb m3, m4 ; d+128+33
  4748. - mova m6, [pb_a1 GLOBAL]
  4749. + paddusb m3, m4 ; d+128+33
  4750. + mova m6, [pb_a1]
  4751. psubusb m6, m3
  4752. - psubusb m3, [pb_a1 GLOBAL]
  4753. + psubusb m3, [pb_a1]
  4754. pminub m6, m7
  4755. pminub m3, m7
  4756. psubusb m1, m6
  4757. @@ -261,10 +261,10 @@ SECTION .text
  4758. %macro LUMA_Q1 6
  4759. mova %6, m1
  4760. pavgb %6, m2
  4761. - pavgb %2, %6 ; avg(p2,avg(p0,q0))
  4762. + pavgb %2, %6 ; avg(p2,avg(p0,q0))
  4763. pxor %6, %3
  4764. - pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
  4765. - psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
  4766. + pand %6, [pb_01] ; (p2^avg(p0,q0))&1
  4767. + psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
  4768. mova %6, %1
  4769. psubusb %6, %5
  4770. paddusb %5, %1
  4771. @@ -614,8 +614,8 @@ DEBLOCK_LUMA sse2, v, 16
  4772. %define mask0 spill(2)
  4773. %define mask1p spill(3)
  4774. %define mask1q spill(4)
  4775. - %define mpb_00 [pb_00 GLOBAL]
  4776. - %define mpb_01 [pb_01 GLOBAL]
  4777. + %define mpb_00 [pb_00]
  4778. + %define mpb_01 [pb_01]
  4779. %endif
  4780.  
  4781. ;-----------------------------------------------------------------------------
  4782. @@ -639,7 +639,7 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
  4783. mova q1, [r0+r1]
  4784. %ifdef ARCH_X86_64
  4785. pxor mpb_00, mpb_00
  4786. - mova mpb_01, [pb_01 GLOBAL]
  4787. + mova mpb_01, [pb_01]
  4788. LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
  4789. SWAP 7, 12 ; m12=mask0
  4790. pavgb t5, mpb_00
  4791. @@ -658,8 +658,8 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
  4792. LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
  4793. mova m4, t5
  4794. mova mask0, m7
  4795. - pavgb m4, [pb_00 GLOBAL]
  4796. - pavgb m4, [pb_01 GLOBAL] ; alpha/4+1
  4797. + pavgb m4, [pb_00]
  4798. + pavgb m4, [pb_01] ; alpha/4+1
  4799. DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
  4800. pand m6, mask0
  4801. DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
  4802. @@ -835,7 +835,7 @@ chroma_inter_body_mmxext:
  4803. %macro CHROMA_INTRA_P0 3
  4804. movq m4, %1
  4805. pxor m4, %3
  4806. - pand m4, [pb_01 GLOBAL] ; m4 = (p0^q1)&1
  4807. + pand m4, [pb_01] ; m4 = (p0^q1)&1
  4808. pavgb %1, %3
  4809. psubusb %1, m4
  4810. pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
  4811. diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
  4812. index f486a8d..9783066 100644
  4813. --- a/common/x86/mc-a.asm
  4814. +++ b/common/x86/mc-a.asm
  4815. @@ -89,9 +89,9 @@ SECTION .text
  4816. %macro BIWEIGHT_START_MMX 0
  4817. movd m2, r6m
  4818. SPLATW m2, m2 ; weight_dst
  4819. - mova m3, [pw_64 GLOBAL]
  4820. + mova m3, [pw_64]
  4821. psubw m3, m2 ; weight_src
  4822. - mova m4, [pw_32 GLOBAL] ; rounding
  4823. + mova m4, [pw_32] ; rounding
  4824. pxor m5, m5
  4825. %endmacro
  4826.  
  4827. @@ -111,7 +111,7 @@ SECTION .text
  4828. shl t7d, 8
  4829. add t6d, t7d
  4830. movd m3, t6d
  4831. - mova m4, [pw_32 GLOBAL]
  4832. + mova m4, [pw_32]
  4833. SPLATW m3, m3 ; weight_dst,src
  4834. %endmacro
  4835.  
  4836. @@ -641,7 +641,7 @@ AVG2_W20 sse2_misalign
  4837. %macro INIT_SHIFT 2
  4838. and eax, 7
  4839. shl eax, 3
  4840. - movd %1, [sw_64 GLOBAL]
  4841. + movd %1, [sw_64]
  4842. movd %2, eax
  4843. psubw %1, %2
  4844. %endmacro
  4845. @@ -778,10 +778,10 @@ cglobal x264_pixel_avg2_w16_cache64_ssse3
  4846. shl r6, 4 ;jump = (offset + align*2)*48
  4847. %define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
  4848. %ifdef PIC
  4849. - lea r11, [avg_w16_addr GLOBAL]
  4850. + lea r11, [avg_w16_addr]
  4851. add r6, r11
  4852. %else
  4853. - lea r6, [avg_w16_addr + r6 GLOBAL]
  4854. + lea r6, [avg_w16_addr + r6]
  4855. %endif
  4856. %ifdef UNIX64
  4857. jmp r6
  4858. @@ -1007,7 +1007,7 @@ cglobal x264_mc_chroma_%1
  4859. SPLATW m5, m5 ; m5 = dx
  4860. SPLATW m6, m6 ; m6 = dy
  4861.  
  4862. - mova m4, [pw_8 GLOBAL]
  4863. + mova m4, [pw_8]
  4864. mova m0, m4
  4865. psubw m4, m5 ; m4 = 8-dx
  4866. psubw m0, m6 ; m0 = 8-dy
  4867. @@ -1042,7 +1042,7 @@ cglobal x264_mc_chroma_%1
  4868. punpcklbw m2, m3
  4869. punpcklbw m1, m3
  4870.  
  4871. - paddw m0, [pw_32 GLOBAL]
  4872. + paddw m0, [pw_32]
  4873.  
  4874. pmullw m2, m5 ; line * cB
  4875. pmullw m1, m7 ; line * cD
  4876. @@ -1084,9 +1084,9 @@ cglobal x264_mc_chroma_%1
  4877. movd m6, r4d
  4878. mov r5d, 1
  4879. .mc1d:
  4880. - mova m5, [pw_8 GLOBAL]
  4881. + mova m5, [pw_8]
  4882. SPLATW m6, m6
  4883. - mova m7, [pw_4 GLOBAL]
  4884. + mova m7, [pw_4]
  4885. psubw m5, m6
  4886. movifnidn r0, r0mp
  4887. movifnidn r1d, r1m
  4888. @@ -1166,7 +1166,7 @@ cglobal x264_mc_chroma_ssse3%1, 0,6,%2
  4889. imul r4d, t0d ; (x*255+8)*(8-y)
  4890. cmp dword r6m, 4
  4891. jg .width8
  4892. - mova m5, [pw_32 GLOBAL]
  4893. + mova m5, [pw_32]
  4894. movd m6, r5d
  4895. movd m7, r4d
  4896. movifnidn r0, r0mp
  4897. @@ -1178,10 +1178,10 @@ cglobal x264_mc_chroma_ssse3%1, 0,6,%2
  4898. and r2, ~3
  4899. and r5, 3
  4900. %ifdef PIC
  4901. - lea r11, [ch_shuffle GLOBAL]
  4902. + lea r11, [ch_shuffle]
  4903. movu m5, [r11 + r5*2]
  4904. %else
  4905. - movu m5, [ch_shuffle + r5*2 GLOBAL]
  4906. + movu m5, [ch_shuffle + r5*2]
  4907. %endif
  4908. movu m0, [r2]
  4909. pshufb m0, m5
  4910. @@ -1197,8 +1197,8 @@ cglobal x264_mc_chroma_ssse3%1, 0,6,%2
  4911. pmaddubsw m1, m6
  4912. pmaddubsw m2, m7
  4913. pmaddubsw m3, m6
  4914. - paddw m0, [pw_32 GLOBAL]
  4915. - paddw m2, [pw_32 GLOBAL]
  4916. + paddw m0, [pw_32]
  4917. + paddw m2, [pw_32]
  4918. paddw m1, m0
  4919. paddw m3, m2
  4920. mova m0, m4
  4921. @@ -1228,7 +1228,7 @@ INIT_XMM
  4922. cmp r5, 0x38
  4923. jge .split
  4924. %endif
  4925. - mova m5, [pw_32 GLOBAL]
  4926. + mova m5, [pw_32]
  4927. movh m0, [r2]
  4928. movh m1, [r2+1]
  4929. punpcklbw m0, m1
  4930. @@ -1265,18 +1265,18 @@ INIT_XMM
  4931. and r2, ~7
  4932. and r5, 7
  4933. %ifdef PIC
  4934. - lea r11, [ch_shuffle GLOBAL]
  4935. + lea r11, [ch_shuffle]
  4936. movu m5, [r11 + r5*2]
  4937. %else
  4938. - movu m5, [ch_shuffle + r5*2 GLOBAL]
  4939. + movu m5, [ch_shuffle + r5*2]
  4940. %endif
  4941. movu m0, [r2]
  4942. pshufb m0, m5
  4943. %ifdef ARCH_X86_64
  4944. - mova m8, [pw_32 GLOBAL]
  4945. + mova m8, [pw_32]
  4946. %define round m8
  4947. %else
  4948. - %define round [pw_32 GLOBAL]
  4949. + %define round [pw_32]
  4950. %endif
  4951. .splitloop8:
  4952. movu m1, [r2+r3]
  4953. diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
  4954. index 245c09f..f2e69c0 100644
  4955. --- a/common/x86/mc-a2.asm
  4956. +++ b/common/x86/mc-a2.asm
  4957. @@ -125,7 +125,7 @@ cglobal x264_hpel_filter_v_%1, 5,6,%2
  4958. %ifnidn %1, ssse3
  4959. pxor m0, m0
  4960. %else
  4961. - mova m0, [filt_mul51 GLOBAL]
  4962. + mova m0, [filt_mul51]
  4963. %endif
  4964. .loop:
  4965. %ifidn %1, ssse3
  4966. @@ -142,8 +142,8 @@ cglobal x264_hpel_filter_v_%1, 5,6,%2
  4967. pmaddubsw m4, m0
  4968. pmaddubsw m2, m0
  4969. pmaddubsw m5, m0
  4970. - pmaddubsw m3, [filt_mul20 GLOBAL]
  4971. - pmaddubsw m6, [filt_mul20 GLOBAL]
  4972. + pmaddubsw m3, [filt_mul20]
  4973. + pmaddubsw m6, [filt_mul20]
  4974. paddw m1, m2
  4975. paddw m4, m5
  4976. paddw m1, m3
  4977. @@ -155,7 +155,7 @@ cglobal x264_hpel_filter_v_%1, 5,6,%2
  4978. LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
  4979. FILT_V2
  4980. %endif
  4981. - mova m7, [pw_16 GLOBAL]
  4982. + mova m7, [pw_16]
  4983. mova [r2+r4*2], m1
  4984. mova [r2+r4*2+mmsize], m4
  4985. paddw m1, m7
  4986. @@ -180,7 +180,7 @@ cglobal x264_hpel_filter_c_mmxext, 3,3
  4987. lea r1, [r1+r2*2]
  4988. neg r2
  4989. %define src r1+r2*2
  4990. - movq m7, [pw_32 GLOBAL]
  4991. + movq m7, [pw_32]
  4992. .loop:
  4993. movq m1, [src-4]
  4994. movq m2, [src-2]
  4995. @@ -237,7 +237,7 @@ cglobal x264_hpel_filter_h_mmxext, 3,3
  4996. punpcklbw m7, m0
  4997. punpcklbw m6, m0
  4998. paddw m6, m7 ; a1
  4999. - movq m7, [pw_1 GLOBAL]
  5000. + movq m7, [pw_1]
  5001. FILT_H2 m1, m2, m3, m4, m5, m6
  5002. FILT_PACK m1, m4, 1
  5003. movntq [r0+r2], m1
  5004. @@ -257,13 +257,13 @@ cglobal x264_hpel_filter_c_%1, 3,3,9
  5005. neg r2
  5006. %define src r1+r2*2
  5007. %ifidn %1, ssse3
  5008. - mova m7, [pw_32 GLOBAL]
  5009. + mova m7, [pw_32]
  5010. %define tpw_32 m7
  5011. %elifdef ARCH_X86_64
  5012. - mova m8, [pw_32 GLOBAL]
  5013. + mova m8, [pw_32]
  5014. %define tpw_32 m8
  5015. %else
  5016. - %define tpw_32 [pw_32 GLOBAL]
  5017. + %define tpw_32 [pw_32]
  5018. %endif
  5019. .loop:
  5020. %ifidn %1,sse2_misalign
  5021. @@ -340,7 +340,7 @@ cglobal x264_hpel_filter_h_sse2, 3,3,8
  5022. punpcklbw m6, m0
  5023. punpcklbw m7, m0
  5024. paddw m6, m7 ; c1
  5025. - mova m7, [pw_1 GLOBAL] ; FIXME xmm8
  5026. + mova m7, [pw_1] ; FIXME xmm8
  5027. FILT_H2 m1, m2, m3, m4, m5, m6
  5028. FILT_PACK m1, m4, 1
  5029. movntdq [r0+r2], m1
  5030. @@ -362,7 +362,7 @@ cglobal x264_hpel_filter_h_ssse3, 3,3
  5031. punpcklbw m1, m0 ; 00 -1 00 -2 00 -3 00 -4 00 -5 00 -6 00 -7 00 -8
  5032. movh m2, [src]
  5033. punpcklbw m2, m0
  5034. - mova m7, [pw_1 GLOBAL]
  5035. + mova m7, [pw_1]
  5036. .loop:
  5037. movh m3, [src+8]
  5038. punpcklbw m3, m0
  5039. @@ -436,7 +436,7 @@ HPEL_V ssse3
  5040. mova m3, [r1]
  5041. mova %4, [r1+r2]
  5042. mova m0, [r1+r2*2]
  5043. - mova %2, [filt_mul51 GLOBAL]
  5044. + mova %2, [filt_mul51]
  5045. mova m4, m1
  5046. punpcklbw m1, m2
  5047. punpckhbw m4, m2
  5048. @@ -452,8 +452,8 @@ HPEL_V ssse3
  5049. pmaddubsw m4, %2
  5050. pmaddubsw m0, %2
  5051. pmaddubsw m2, %2
  5052. - pmaddubsw m3, [filt_mul20 GLOBAL]
  5053. - pmaddubsw %1, [filt_mul20 GLOBAL]
  5054. + pmaddubsw m3, [filt_mul20]
  5055. + pmaddubsw %1, [filt_mul20]
  5056. psrlw %3, 8
  5057. psrlw %4, 8
  5058. paddw m1, m0
  5059. @@ -1096,7 +1096,7 @@ cglobal x264_mbtree_propagate_cost_sse2, 6,6
  5060. add r4, r5
  5061. neg r5
  5062. pxor xmm5, xmm5
  5063. - movdqa xmm4, [pd_128 GLOBAL]
  5064. + movdqa xmm4, [pd_128]
  5065. .loop:
  5066. movq xmm2, [r2+r5] ; intra
  5067. movq xmm0, [r4+r5] ; invq
  5068. diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
  5069. index d94daaf..46b4557 100644
  5070. --- a/common/x86/pixel-a.asm
  5071. +++ b/common/x86/pixel-a.asm
  5072. @@ -59,7 +59,7 @@ SECTION .text
  5073. %endmacro
  5074.  
  5075. %macro HADDW 2
  5076. - pmaddwd %1, [pw_1 GLOBAL]
  5077. + pmaddwd %1, [pw_1]
  5078. HADDD %1, %2
  5079. %endmacro
  5080.  
  5081. @@ -244,9 +244,9 @@ cglobal x264_pixel_ssd_%1x%2_%3, 0,0,0
  5082. %endif
  5083.  
  5084. %ifidn %3, ssse3
  5085. - mova m7, [hsub_mul GLOBAL]
  5086. + mova m7, [hsub_mul]
  5087. %elifidn %3, sse2
  5088. - mova m7, [pw_00ff GLOBAL]
  5089. + mova m7, [pw_00ff]
  5090. %elif %1 >= mmsize
  5091. pxor m7, m7
  5092. %endif
  5093. @@ -310,7 +310,7 @@ SSD 4, 8, ssse3
  5094. pxor m5, m5 ; sum
  5095. pxor m6, m6 ; sum squared
  5096. %if %1
  5097. - mova m7, [pw_00ff GLOBAL]
  5098. + mova m7, [pw_00ff]
  5099. %else
  5100. pxor m7, m7 ; zero
  5101. %endif
  5102. @@ -482,7 +482,7 @@ cglobal x264_pixel_var2_8x8_sse2, 5,6,8
  5103. cglobal x264_pixel_var2_8x8_ssse3, 5,6,8
  5104. pxor m5, m5 ; sum
  5105. pxor m6, m6 ; sum squared
  5106. - mova m7, [hsub_mul GLOBAL]
  5107. + mova m7, [hsub_mul]
  5108. mov r5d, 2
  5109. .loop:
  5110. movq m0, [r0]
  5111. @@ -775,7 +775,7 @@ cglobal x264_pixel_satd_4x4_mmxext, 4,6
  5112.  
  5113. %macro SATD_START_SSE2 3
  5114. %ifnidn %1, sse2
  5115. - mova %3, [hmul_8p GLOBAL]
  5116. + mova %3, [hmul_8p]
  5117. %endif
  5118. lea r4, [3*r1]
  5119. lea r5, [3*r3]
  5120. @@ -815,7 +815,7 @@ INIT_XMM
  5121. %ifnidn %1, sse2
  5122. cglobal x264_pixel_satd_4x4_%1, 4, 6, 6
  5123. SATD_START_MMX
  5124. - mova m4, [hmul_4p GLOBAL]
  5125. + mova m4, [hmul_4p]
  5126. LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
  5127. LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
  5128. LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
  5129. @@ -832,7 +832,7 @@ cglobal x264_pixel_satd_4x4_%1, 4, 6, 6
  5130. cglobal x264_pixel_satd_4x8_%1, 4, 6, 8
  5131. SATD_START_MMX
  5132. %ifnidn %1, sse2
  5133. - mova m7, [hmul_4p GLOBAL]
  5134. + mova m7, [hmul_4p]
  5135. %endif
  5136. movd m4, [r2]
  5137. movd m5, [r2+r3]
  5138. @@ -889,14 +889,14 @@ cglobal x264_pixel_satd_16x4_internal_%1
  5139. cglobal x264_pixel_satd_16x8_%1, 4,6,12
  5140. SATD_START_SSE2 %1, m10, m7
  5141. %ifidn %1, sse2
  5142. - mova m7, [pw_00ff GLOBAL]
  5143. + mova m7, [pw_00ff]
  5144. %endif
  5145. jmp x264_pixel_satd_16x8_internal_%1
  5146.  
  5147. cglobal x264_pixel_satd_16x16_%1, 4,6,12
  5148. SATD_START_SSE2 %1, m10, m7
  5149. %ifidn %1, sse2
  5150. - mova m7, [pw_00ff GLOBAL]
  5151. + mova m7, [pw_00ff]
  5152. %endif
  5153. call x264_pixel_satd_16x4_internal_%1
  5154. call x264_pixel_satd_16x4_internal_%1
  5155. @@ -977,7 +977,7 @@ cglobal x264_pixel_sa8d_8x8_%1, 4,6,12
  5156. lea r4, [3*r1]
  5157. lea r5, [3*r3]
  5158. %ifnidn %1, sse2
  5159. - mova m7, [hmul_8p GLOBAL]
  5160. + mova m7, [hmul_8p]
  5161. %endif
  5162. call x264_pixel_sa8d_8x8_internal_%1
  5163. HADDW m0, m1
  5164. @@ -990,7 +990,7 @@ cglobal x264_pixel_sa8d_16x16_%1, 4,6,12
  5165. lea r4, [3*r1]
  5166. lea r5, [3*r3]
  5167. %ifnidn %1, sse2
  5168. - mova m7, [hmul_8p GLOBAL]
  5169. + mova m7, [hmul_8p]
  5170. %endif
  5171. call x264_pixel_sa8d_8x8_internal_%1 ; pix[0]
  5172. add r2, 8
  5173. @@ -1029,7 +1029,7 @@ cglobal x264_pixel_sa8d_8x8_internal_%1
  5174. paddw m0, m1
  5175. HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
  5176. %else ; non-sse2
  5177. - mova m7, [hmul_8p GLOBAL]
  5178. + mova m7, [hmul_8p]
  5179. LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1
  5180. ; could do first HADAMARD4_V here to save spilling later
  5181. ; surprisingly, not a win on conroe or even p4
  5182. @@ -1221,7 +1221,7 @@ cglobal x264_intra_sa8d_x3_8x8_core_%1, 3,3,16
  5183. paddusw m2, m0
  5184.  
  5185. ; 3x HADDW
  5186. - movdqa m7, [pw_1 GLOBAL]
  5187. + movdqa m7, [pw_1]
  5188. pmaddwd m2, m7
  5189. pmaddwd m14, m7
  5190. pmaddwd m15, m7
  5191. @@ -1650,7 +1650,7 @@ cglobal x264_hadamard_ac_2x2max_mmxext
  5192. ret
  5193.  
  5194. cglobal x264_hadamard_ac_8x8_mmxext
  5195. - mova m6, [mask_ac4 GLOBAL]
  5196. + mova m6, [mask_ac4]
  5197. pxor m7, m7
  5198. call x264_hadamard_ac_4x4_mmxext
  5199. add r0, 4
  5200. @@ -1727,7 +1727,7 @@ cglobal x264_pixel_hadamard_ac_%1x%2_mmxext, 2,4
  5201. mova m3, m0
  5202. paddusw m1, [rsp+0x38]
  5203. pxor m3, m2
  5204. - pand m3, [pw_1 GLOBAL]
  5205. + pand m3, [pw_1]
  5206. pavgw m0, m2
  5207. psubusw m0, m3
  5208. HADDUW m0, m2
  5209. @@ -1791,7 +1791,7 @@ cglobal x264_hadamard_ac_8x8_%1
  5210. %endif
  5211. %ifnidn %1, sse2
  5212. ;LOAD_INC loads sumsubs
  5213. - mova m7, [hmul_8p GLOBAL]
  5214. + mova m7, [hmul_8p]
  5215. %else
  5216. ;LOAD_INC only unpacks to words
  5217. pxor m7, m7
  5218. @@ -1834,9 +1834,9 @@ cglobal x264_hadamard_ac_8x8_%1
  5219. paddw m1, m2
  5220. SUMSUB_BA m0, m4; m2
  5221. %ifnidn %1, sse2
  5222. - pand m1, [mask_ac4b GLOBAL]
  5223. + pand m1, [mask_ac4b]
  5224. %else
  5225. - pand m1, [mask_ac4 GLOBAL]
  5226. + pand m1, [mask_ac4]
  5227. %endif
  5228. ABS_MOV m2, spill0
  5229. paddw m1, m3
  5230. @@ -1878,7 +1878,7 @@ cglobal x264_hadamard_ac_8x8_%1
  5231. paddw m2, m1
  5232. paddw m2, m2
  5233. ABS1 m4, m7
  5234. - pand m0, [mask_ac8 GLOBAL]
  5235. + pand m0, [mask_ac8]
  5236. ABS1 m0, m7
  5237. paddw m2, m4
  5238. paddw m0, m2
  5239. @@ -2041,7 +2041,7 @@ cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4,8
  5240. SSIM_ITER 3
  5241. ; PHADDW m1, m2
  5242. ; PHADDD m3, m4
  5243. - movdqa m7, [pw_1 GLOBAL]
  5244. + movdqa m7, [pw_1]
  5245. pshufd m5, m3, 0xb1
  5246. pmaddwd m1, m7
  5247. pmaddwd m2, m7
  5248. @@ -2086,8 +2086,8 @@ cglobal x264_pixel_ssim_end4_sse2, 3,3,7
  5249. paddd m1, m2
  5250. paddd m2, m3
  5251. paddd m3, m4
  5252. - movdqa m5, [ssim_c1 GLOBAL]
  5253. - movdqa m6, [ssim_c2 GLOBAL]
  5254. + movdqa m5, [ssim_c1]
  5255. + movdqa m6, [ssim_c2]
  5256. TRANSPOSE4x4D 0, 1, 2, 3, 4
  5257.  
  5258. ; s1=m0, s2=m1, ss=m2, s12=m3
  5259. @@ -2117,10 +2117,10 @@ cglobal x264_pixel_ssim_end4_sse2, 3,3,7
  5260. je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
  5261. neg r2
  5262. %ifdef PIC
  5263. - lea r3, [mask_ff + 16 GLOBAL]
  5264. + lea r3, [mask_ff + 16]
  5265. movdqu m1, [r3 + r2*4]
  5266. %else
  5267. - movdqu m1, [mask_ff + r2*4 + 16 GLOBAL]
  5268. + movdqu m1, [mask_ff + r2*4 + 16]
  5269. %endif
  5270. pand m4, m1
  5271. .skip:
  5272. diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
  5273. index 808aa31..4d03f8f 100644
  5274. --- a/common/x86/predict-a.asm
  5275. +++ b/common/x86/predict-a.asm
  5276. @@ -99,7 +99,7 @@ SECTION .text
  5277. pavgb %2, %3
  5278. pxor %3, %5
  5279. mov%6 %1, %4
  5280. - pand %3, [pb_1 GLOBAL]
  5281. + pand %3, [pb_1]
  5282. psubusb %2, %3
  5283. pavgb %1, %2
  5284. %endmacro
  5285. @@ -466,7 +466,7 @@ cglobal predict_8x8_dc_mmxext, 2,2
  5286. pxor mm1, mm1
  5287. psadbw mm0, [r1+7]
  5288. psadbw mm1, [r1+16]
  5289. - paddw mm0, [pw_8 GLOBAL]
  5290. + paddw mm0, [pw_8]
  5291. paddw mm0, mm1
  5292. psrlw mm0, 4
  5293. pshufw mm0, mm0, 0
  5294. @@ -481,7 +481,7 @@ cglobal predict_8x8_dc_mmxext, 2,2
  5295. cglobal %1, 2,2
  5296. pxor mm0, mm0
  5297. psadbw mm0, [r1+%2]
  5298. - paddw mm0, [pw_4 GLOBAL]
  5299. + paddw mm0, [pw_4]
  5300. psrlw mm0, 3
  5301. pshufw mm0, mm0, 0
  5302. packuswb mm0, mm0
  5303. @@ -643,7 +643,7 @@ cglobal predict_8x8_vr_core_mmxext, 2,2
  5304. cglobal predict_8x8c_p_core_mmxext, 1,2
  5305. LOAD_PLANE_ARGS
  5306. movq mm1, mm2
  5307. - pmullw mm2, [pw_3210 GLOBAL]
  5308. + pmullw mm2, [pw_3210]
  5309. psllw mm1, 2
  5310. paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
  5311. paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}
  5312. @@ -672,7 +672,7 @@ cglobal predict_16x16_p_core_mmxext, 1,2
  5313. LOAD_PLANE_ARGS
  5314. movq mm5, mm2
  5315. movq mm1, mm2
  5316. - pmullw mm5, [pw_3210 GLOBAL]
  5317. + pmullw mm5, [pw_3210]
  5318. psllw mm2, 3
  5319. psllw mm1, 2
  5320. movq mm3, mm2
  5321. @@ -786,7 +786,7 @@ cglobal predict_8x8_vl_sse2, 2,2
  5322. ;-----------------------------------------------------------------------------
  5323. cglobal predict_8x8_vr_sse2, 2,2,7
  5324. movdqu xmm0, [r1+8]
  5325. - movdqa xmm6, [pw_ff00 GLOBAL]
  5326. + movdqa xmm6, [pw_ff00]
  5327. add r0, 4*FDEC_STRIDE
  5328. movdqa xmm1, xmm0
  5329. movdqa xmm2, xmm0
  5330. @@ -910,7 +910,7 @@ cglobal predict_8x8_hu_%1, 2,2
  5331. add r0, 4*FDEC_STRIDE
  5332. %ifidn %1, ssse3
  5333. movq mm5, [r1+7]
  5334. - movq mm6, [pb_reverse GLOBAL]
  5335. + movq mm6, [pb_reverse]
  5336. movq mm1, mm5
  5337. movq mm2, mm5
  5338. movq mm3, mm5
  5339. @@ -979,7 +979,7 @@ cglobal predict_8x8c_v_mmx, 1,1
  5340. %macro PRED_8x8C_H 1
  5341. cglobal predict_8x8c_h_%1, 1,1
  5342. %ifidn %1, ssse3
  5343. - mova m1, [pb_3 GLOBAL]
  5344. + mova m1, [pb_3]
  5345. %endif
  5346. %assign n 0
  5347. %rep 8
  5348. @@ -1018,7 +1018,7 @@ cglobal predict_8x8c_dc_core_mmxext, 1,1
  5349. pshufw mm2, r2m, 0
  5350. %endif
  5351. psrlw mm0, 3
  5352. - paddw mm1, [pw_2 GLOBAL]
  5353. + paddw mm1, [pw_2]
  5354. movq mm3, mm2
  5355. pshufw mm1, mm1, 0
  5356. pshufw mm0, mm0, 0 ; dc0 (w)
  5357. @@ -1065,7 +1065,7 @@ cglobal predict_8x8c_p_core_sse2, 1,1
  5358. punpcklqdq xmm0, xmm0
  5359. punpcklqdq xmm2, xmm2
  5360. punpcklqdq xmm4, xmm4
  5361. - pmullw xmm2, [pw_76543210 GLOBAL]
  5362. + pmullw xmm2, [pw_76543210]
  5363. paddsw xmm0, xmm2 ; xmm0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
  5364. movdqa xmm3, xmm0
  5365. paddsw xmm3, xmm4
  5366. @@ -1107,7 +1107,7 @@ cglobal predict_16x16_p_core_sse2, 1,2,8
  5367. punpcklqdq xmm1, xmm1
  5368. punpcklqdq xmm2, xmm2
  5369. movdqa xmm3, xmm1
  5370. - pmullw xmm3, [pw_76543210 GLOBAL]
  5371. + pmullw xmm3, [pw_76543210]
  5372. psllw xmm1, 3
  5373. paddsw xmm0, xmm3 ; xmm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
  5374. paddsw xmm1, xmm0 ; xmm1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
  5375. @@ -1162,7 +1162,7 @@ cglobal predict_16x16_v_sse2, 1,1
  5376. cglobal predict_16x16_h_%1, 1,2
  5377. mov r1, FDEC_STRIDE*12
  5378. %ifidn %1, ssse3
  5379. - mova m1, [pb_3 GLOBAL]
  5380. + mova m1, [pb_3]
  5381. %endif
  5382. .vloop:
  5383. %assign n 0
  5384. @@ -1214,7 +1214,7 @@ cglobal predict_16x16_dc_core_mmxext, 1,2
  5385. REP_RET
  5386.  
  5387. cglobal predict_16x16_dc_top_mmxext, 1,2
  5388. - PRED16x16_DC [pw_8 GLOBAL], 4
  5389. + PRED16x16_DC [pw_8], 4
  5390. REP_RET
  5391.  
  5392. cglobal predict_16x16_dc_left_core_mmxext, 1,1
  5393. @@ -1247,7 +1247,7 @@ cglobal predict_16x16_dc_core_sse2, 1,1
  5394. RET
  5395.  
  5396. cglobal predict_16x16_dc_top_sse2, 1,1
  5397. - PRED16x16_DC_SSE2 [pw_8 GLOBAL], 4
  5398. + PRED16x16_DC_SSE2 [pw_8], 4
  5399. RET
  5400.  
  5401. cglobal predict_16x16_dc_left_core_sse2, 1,1
  5402. diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
  5403. index 52e121a..3edd244 100644
  5404. --- a/common/x86/quant-a.asm
  5405. +++ b/common/x86/quant-a.asm
  5406. @@ -86,7 +86,7 @@ SECTION .text
  5407. %endmacro
  5408.  
  5409. %macro QUANT_DC_START_SSSE3 0
  5410. - movdqa m5, [pb_01 GLOBAL]
  5411. + movdqa m5, [pb_01]
  5412. movd m6, r1m ; mf
  5413. movd m7, r2m ; bias
  5414. pshufb m6, m5
  5415. @@ -361,7 +361,7 @@ cglobal x264_dequant_%2x%2_%1, 0,3
  5416. .rshift32:
  5417. neg t0d
  5418. movd m2, t0d
  5419. - mova m3, [pd_1 GLOBAL]
  5420. + mova m3, [pd_1]
  5421. pxor m4, m4
  5422. pslld m3, m2
  5423. psrld m3, 1
  5424. @@ -381,10 +381,10 @@ cglobal x264_dequant_%2x%2_flat16_%1, 0,3
  5425. sub t2d, t1d ; i_mf = i_qp % 6
  5426. shl t2d, %3
  5427. %ifdef PIC
  5428. - lea r1, [dequant%2_scale GLOBAL]
  5429. + lea r1, [dequant%2_scale]
  5430. add r1, t2
  5431. %else
  5432. - lea r1, [dequant%2_scale + t2 GLOBAL]
  5433. + lea r1, [dequant%2_scale + t2]
  5434. %endif
  5435. movifnidn r0, r0mp
  5436. movd m4, t0d
  5437. @@ -446,7 +446,7 @@ cglobal x264_dequant_4x4dc_%1, 0,3
  5438. .rshift32:
  5439. neg t0d
  5440. movd m3, t0d
  5441. - mova m4, [pw_1 GLOBAL]
  5442. + mova m4, [pw_1]
  5443. mova m5, m4
  5444. pslld m4, m3
  5445. psrld m4, 1
  5446. @@ -588,15 +588,15 @@ cextern x264_decimate_table8
  5447. ;This is not true for score64.
  5448. cglobal x264_decimate_score%1_%2, 1,3
  5449. %ifdef PIC
  5450. - lea r10, [x264_decimate_table4 GLOBAL]
  5451. - lea r11, [decimate_mask_table4 GLOBAL]
  5452. + lea r10, [x264_decimate_table4]
  5453. + lea r11, [decimate_mask_table4]
  5454. %define table r10
  5455. %define mask_table r11
  5456. %else
  5457. %define table x264_decimate_table4
  5458. %define mask_table decimate_mask_table4
  5459. %endif
  5460. - DECIMATE_MASK edx, eax, r0, [pb_1 GLOBAL], %2, ecx
  5461. + DECIMATE_MASK edx, eax, r0, [pb_1], %2, ecx
  5462. xor edx, 0xffff
  5463. je .ret
  5464. test eax, eax
  5465. @@ -640,12 +640,12 @@ DECIMATE4x4 16, ssse3
  5466. %ifdef ARCH_X86_64
  5467. cglobal x264_decimate_score64_%1, 1,4
  5468. %ifdef PIC
  5469. - lea r10, [x264_decimate_table8 GLOBAL]
  5470. + lea r10, [x264_decimate_table8]
  5471. %define table r10
  5472. %else
  5473. %define table x264_decimate_table8
  5474. %endif
  5475. - mova m5, [pb_1 GLOBAL]
  5476. + mova m5, [pb_1]
  5477. DECIMATE_MASK r1d, eax, r0, m5, %1, null
  5478. test eax, eax
  5479. jne .ret9
  5480. @@ -681,7 +681,7 @@ cglobal x264_decimate_score64_%1, 1,6
  5481. %else
  5482. cglobal x264_decimate_score64_%1, 1,5
  5483. %endif
  5484. - mova m7, [pb_1 GLOBAL]
  5485. + mova m7, [pb_1]
  5486. DECIMATE_MASK r3, r2, r0, m7, %1, r5
  5487. test r2, r2
  5488. jne .ret9
  5489. diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
  5490. index 342a984..6db8abf 100644
  5491. --- a/common/x86/sad-a.asm
  5492. +++ b/common/x86/sad-a.asm
  5493. @@ -351,7 +351,7 @@ cglobal x264_intra_sad_x3_8x8_mmxext, 3,3
  5494. psadbw m0, m7
  5495. psadbw m1, m6
  5496. paddw m0, m1
  5497. - paddw m0, [pw_8 GLOBAL]
  5498. + paddw m0, [pw_8]
  5499. psrlw m0, 4
  5500. punpcklbw m0, m0
  5501. pshufw m0, m0, 0x0 ;DC prediction
  5502. @@ -411,7 +411,7 @@ cglobal x264_intra_sad_x3_8x8c_%1, 3,3
  5503. movq m6, [r1 - FDEC_STRIDE]
  5504. add r1, FDEC_STRIDE*4
  5505. %ifidn %1,ssse3
  5506. - movq m7, [pb_3 GLOBAL]
  5507. + movq m7, [pb_3]
  5508. %endif
  5509. INTRA_SAD_HV_ITER 0, %1
  5510. INTRA_SAD_HV_ITER 2, %1
  5511. @@ -450,7 +450,7 @@ cglobal x264_intra_sad_x3_8x8c_%1, 3,3
  5512. pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
  5513. %ifidn %1, ssse3
  5514. movq2dq xmm0, m0
  5515. - pshufb xmm0, [pb_shuf8x8c GLOBAL]
  5516. + pshufb xmm0, [pb_shuf8x8c]
  5517. movq xmm1, [r0+FENC_STRIDE*0]
  5518. movq xmm2, [r0+FENC_STRIDE*1]
  5519. movq xmm3, [r0+FENC_STRIDE*2]
  5520. @@ -522,7 +522,7 @@ cglobal x264_intra_sad_x3_16x16_%1,3,5,%2
  5521. paddw mm0, mm1
  5522. movd r3d, mm0
  5523. %ifidn %1, ssse3
  5524. - mova m1, [pb_3 GLOBAL]
  5525. + mova m1, [pb_3]
  5526. %endif
  5527. %assign x 0
  5528. %rep 16
  5529. @@ -1301,10 +1301,10 @@ cglobal x264_pixel_sad_16x%2_cache64_%1
  5530. %endif
  5531. %define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
  5532. %ifdef PIC
  5533. - lea r5, [sad_w16_addr GLOBAL]
  5534. + lea r5, [sad_w16_addr]
  5535. add r5, r4
  5536. %else
  5537. - lea r5, [sad_w16_addr + r4 GLOBAL]
  5538. + lea r5, [sad_w16_addr + r4]
  5539. %endif
  5540. and r2, ~15
  5541. mov r4d, %2/2
  5542. @@ -1323,7 +1323,7 @@ cglobal x264_pixel_sad_16x%2_cache64_%1
  5543. jle x264_pixel_sad_%1x%2_mmxext
  5544. and eax, 7
  5545. shl eax, 3
  5546. - movd mm6, [sw_64 GLOBAL]
  5547. + movd mm6, [sw_64]
  5548. movd mm7, eax
  5549. psubw mm6, mm7
  5550. PROLOGUE 4,5
  5551. diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
  5552. index 2a91084..ee3eca9 100644
  5553. --- a/common/x86/x86inc.asm
  5554. +++ b/common/x86/x86inc.asm
  5555. @@ -65,28 +65,16 @@
  5556. %endif
  5557. %endmacro
  5558.  
  5559. -; PIC support macros.
  5560. -; x86_64 can't fit 64bit address literals in most instruction types,
  5561. -; so shared objects (under the assumption that they might be anywhere
  5562. -; in memory) must use an address mode that does fit.
  5563. -; So all accesses to global variables must use this macro, e.g.
  5564. -; mov eax, [foo GLOBAL]
  5565. -; instead of
  5566. -; mov eax, [foo]
  5567. -;
  5568. -; x86_32 doesn't require PIC.
  5569. -; Some distros prefer shared objects to be PIC, but nothing breaks if
  5570. -; the code contains a few textrels, so we'll skip that complexity.
  5571. -
  5572. %ifdef WIN64
  5573. %define PIC
  5574. %elifndef ARCH_X86_64
  5575. +; x86_32 doesn't require PIC.
  5576. +; Some distros prefer shared objects to be PIC, but nothing breaks if
  5577. +; the code contains a few textrels, so we'll skip that complexity.
  5578. %undef PIC
  5579. %endif
  5580. %ifdef PIC
  5581. - %define GLOBAL wrt rip
  5582. -%else
  5583. - %define GLOBAL
  5584. + default rel
  5585. %endif
  5586.  
  5587. ; Macros to eliminate most code duplication between x86_32 and x86_64:
  5588. diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm
  5589. index b822688..d70bb0e 100644
  5590. --- a/common/x86/x86util.asm
  5591. +++ b/common/x86/x86util.asm
  5592. @@ -239,10 +239,10 @@
  5593. ; %3/%4: source regs
  5594. ; %5/%6: tmp regs
  5595. %ifidn %1, d
  5596. -%define mask [mask_10 GLOBAL]
  5597. +%define mask [mask_10]
  5598. %define shift 16
  5599. %elifidn %1, q
  5600. -%define mask [mask_1100 GLOBAL]
  5601. +%define mask [mask_1100]
  5602. %define shift 32
  5603. %endif
  5604. %if %0==6 ; less dependency if we have two tmp
  5605. diff --git a/tools/checkasm-a.asm b/tools/checkasm-a.asm
  5606. index 966615b..1970cb9 100644
  5607. --- a/tools/checkasm-a.asm
  5608. +++ b/tools/checkasm-a.asm
  5609. @@ -71,19 +71,19 @@ cglobal x264_checkasm_call, 4,7,16
  5610. %endrep
  5611. %assign i 6
  5612. %rep 16-6
  5613. - movdqa xmm %+ i, [x %+ i GLOBAL]
  5614. + movdqa xmm %+ i, [x %+ i]
  5615. %assign i i+1
  5616. %endrep
  5617. - mov r4, [n4 GLOBAL]
  5618. - mov r5, [n5 GLOBAL]
  5619. + mov r4, [n4]
  5620. + mov r5, [n5]
  5621. call r6
  5622. - xor r4, [n4 GLOBAL]
  5623. - xor r5, [n5 GLOBAL]
  5624. + xor r4, [n4]
  5625. + xor r5, [n5]
  5626. or r4, r5
  5627. pxor xmm5, xmm5
  5628. %assign i 6
  5629. %rep 16-6
  5630. - pxor xmm %+ i, [x %+ i GLOBAL]
  5631. + pxor xmm %+ i, [x %+ i]
  5632. por xmm5, xmm %+ i
  5633. %assign i i+1
  5634. %endrep
  5635. @@ -92,7 +92,7 @@ cglobal x264_checkasm_call, 4,7,16
  5636. or r4, r5
  5637. jz .ok
  5638. mov r4, rax
  5639. - lea r0, [error_message GLOBAL]
  5640. + lea r0, [error_message]
  5641. call puts
  5642. mov r1, [rsp+stack_offset+16]
  5643. mov dword [r1], 0
  5644. @@ -132,7 +132,7 @@ cglobal x264_checkasm_call, 1,7
  5645. or r3, r5
  5646. jz .ok
  5647. mov r3, eax
  5648. - lea r1, [error_message GLOBAL]
  5649. + lea r1, [error_message]
  5650. push r1
  5651. call puts
  5652. add esp, 4
  5653. --
  5654. 1.6.1.2
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement