Advertisement
Guest User

Untitled

a guest
May 3rd, 2017
104
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 169.13 KB | None | 0 0
  1. From 6cbaa59fc4c4a0af3b67bd1a776a2e63a2b11746 Mon Sep 17 00:00:00 2001
  2. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  3. Date: Fri, 29 Jan 2010 02:40:41 -0800
  4. Subject: [PATCH 01/24] Add ability to adjust ratecontrol parameters on the fly
  5. x264_encoder_reconfig and x264_picture_t->param can now be used to change ratecontrol parameters.
  6. This is extraordinarily useful in certain streaming situations where the encoder needs to adapt the bitrate to network circumstances.
  7.  
  8. What can be changed:
  9. 1) CRF can be adjusted if in CRF mode.
  10. 2) VBV maxrate and bufsize can be adjusted if in VBV mode.
  11. 3) Bitrate can be adjusted if in CBR mode.
  12. However, x264 cannot switch between modes and cannot change bitrate in ABR mode.
  13.  
  14. Also fix a bug where the x264_picture_t->param reconfig method would not always be frame-exact.
  15.  
  16. Commit sponsored by SayMama video calling.
  17. ---
  18. encoder/encoder.c | 55 +++++++++++++++++++-
  19. encoder/ratecontrol.c | 137 +++++++++++++++++++++++-------------------------
  20. encoder/ratecontrol.h | 2 +
  21. x264.h | 7 ++-
  22. 4 files changed, 125 insertions(+), 76 deletions(-)
  23.  
  24. diff --git a/encoder/encoder.c b/encoder/encoder.c
  25. index d873cd0..e266a1a 100644
  26. --- a/encoder/encoder.c
  27. +++ b/encoder/encoder.c
  28. @@ -507,6 +507,39 @@ static int x264_validate_parameters( x264_t *h )
  29. }
  30. h->param.rc.i_qp_max = x264_clip3( h->param.rc.i_qp_max, 0, 51 );
  31. h->param.rc.i_qp_min = x264_clip3( h->param.rc.i_qp_min, 0, h->param.rc.i_qp_max );
  32. + if( h->param.rc.i_vbv_buffer_size )
  33. + {
  34. + if( h->param.rc.i_rc_method == X264_RC_CQP )
  35. + {
  36. + x264_log( h, X264_LOG_WARNING, "VBV is incompatible with constant QP, ignored.\n" );
  37. + h->param.rc.i_vbv_max_bitrate = 0;
  38. + h->param.rc.i_vbv_buffer_size = 0;
  39. + }
  40. + else if( h->param.rc.i_vbv_max_bitrate == 0 )
  41. + {
  42. + if( h->param.rc.i_rc_method == X264_RC_ABR )
  43. + {
  44. + x264_log( h, X264_LOG_INFO, "VBV maxrate unspecified, assuming CBR\n" );
  45. + h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
  46. + }
  47. + else
  48. + {
  49. + x264_log( h, X264_LOG_INFO, "VBV bufsize set but maxrate unspecified, ignored\n" );
  50. + h->param.rc.i_vbv_buffer_size = 0;
  51. + }
  52. + }
  53. + else if( h->param.rc.i_vbv_max_bitrate < h->param.rc.i_bitrate &&
  54. + h->param.rc.i_rc_method == X264_RC_ABR )
  55. + {
  56. + x264_log( h, X264_LOG_WARNING, "max bitrate less than average bitrate, assuming CBR.\n" );
  57. + h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
  58. + }
  59. + }
  60. + else if( h->param.rc.i_vbv_max_bitrate )
  61. + {
  62. + x264_log( h, X264_LOG_WARNING, "VBV maxrate specified, but no bufsize.\n" );
  63. + h->param.rc.i_vbv_max_bitrate = 0;
  64. + }
  65.  
  66. int max_slices = (h->param.i_height+((16<<h->param.b_interlaced)-1))/(16<<h->param.b_interlaced);
  67. if( h->param.b_sliced_threads )
  68. @@ -1071,7 +1104,7 @@ fail:
  69. ****************************************************************************/
  70. int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
  71. {
  72. - h = h->thread[h->i_thread_phase];
  73. + h = h->thread[h->thread[0]->i_thread_phase];
  74. x264_set_aspect_ratio( h, param, 0 );
  75. #define COPY(var) h->param.var = param->var
  76. COPY( i_frame_reference ); // but never uses more refs than initially specified
  77. @@ -1110,11 +1143,29 @@ int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
  78. COPY( i_slice_max_size );
  79. COPY( i_slice_max_mbs );
  80. COPY( i_slice_count );
  81. + /* VBV can't be turned on if it wasn't on to begin with */
  82. + if( h->param.rc.i_vbv_max_bitrate > 0 && h->param.rc.i_vbv_buffer_size > 0 )
  83. + {
  84. + COPY( rc.i_vbv_max_bitrate );
  85. + COPY( rc.i_vbv_buffer_size );
  86. + COPY( rc.i_bitrate );
  87. + }
  88. + COPY( rc.f_rf_constant );
  89. #undef COPY
  90.  
  91. mbcmp_init( h );
  92.  
  93. - return x264_validate_parameters( h );
  94. + int failure = x264_validate_parameters( h );
  95. +
  96. + /* Supported reconfiguration options (1-pass only):
  97. + * vbv-maxrate
  98. + * vbv-bufsize
  99. + * crf
  100. + * bitrate (CBR only) */
  101. + if( !failure )
  102. + x264_ratecontrol_init_reconfigurable( h, 0 );
  103. +
  104. + return failure;
  105. }
  106.  
  107. /****************************************************************************
  108. diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
  109. index 63b3be6..52196e7 100644
  110. --- a/encoder/ratecontrol.c
  111. +++ b/encoder/ratecontrol.c
  112. @@ -388,6 +388,53 @@ static char *x264_strcat_filename( char *input, char *suffix )
  113. return output;
  114. }
  115.  
  116. +void x264_ratecontrol_init_reconfigurable( x264_t *h, int b_init )
  117. +{
  118. + x264_ratecontrol_t *rc = h->rc;
  119. + if( !b_init && rc->b_2pass )
  120. + return;
  121. +
  122. + if( h->param.rc.i_vbv_max_bitrate > 0 && h->param.rc.i_vbv_buffer_size > 0 )
  123. + {
  124. + if( h->param.rc.i_vbv_buffer_size < (int)(h->param.rc.i_vbv_max_bitrate / rc->fps) )
  125. + {
  126. + h->param.rc.i_vbv_buffer_size = h->param.rc.i_vbv_max_bitrate / rc->fps;
  127. + x264_log( h, X264_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
  128. + h->param.rc.i_vbv_buffer_size );
  129. + }
  130. +
  131. + /* We don't support changing the ABR bitrate right now,
  132. + so if the stream starts as CBR, keep it CBR. */
  133. + if( rc->b_vbv_min_rate )
  134. + h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
  135. + rc->buffer_rate = h->param.rc.i_vbv_max_bitrate * 1000. / rc->fps;
  136. + rc->buffer_size = h->param.rc.i_vbv_buffer_size * 1000.;
  137. + rc->single_frame_vbv = rc->buffer_rate * 1.1 > rc->buffer_size;
  138. + rc->cbr_decay = 1.0 - rc->buffer_rate / rc->buffer_size
  139. + * 0.5 * X264_MAX(0, 1.5 - rc->buffer_rate * rc->fps / rc->bitrate);
  140. + if( b_init )
  141. + {
  142. + if( h->param.rc.f_vbv_buffer_init > 1. )
  143. + h->param.rc.f_vbv_buffer_init = x264_clip3f( h->param.rc.f_vbv_buffer_init / h->param.rc.i_vbv_buffer_size, 0, 1 );
  144. + h->param.rc.f_vbv_buffer_init = X264_MAX( h->param.rc.f_vbv_buffer_init, rc->buffer_rate / rc->buffer_size );
  145. + rc->buffer_fill_final = rc->buffer_size * h->param.rc.f_vbv_buffer_init;
  146. + rc->b_vbv = 1;
  147. + rc->b_vbv_min_rate = !rc->b_2pass
  148. + && h->param.rc.i_rc_method == X264_RC_ABR
  149. + && h->param.rc.i_vbv_max_bitrate <= h->param.rc.i_bitrate;
  150. + }
  151. + }
  152. + if( h->param.rc.i_rc_method == X264_RC_CRF )
  153. + {
  154. + /* Arbitrary rescaling to make CRF somewhat similar to QP.
  155. + * Try to compensate for MB-tree's effects as well. */
  156. + double base_cplx = h->mb.i_mb_count * (h->param.i_bframe ? 120 : 80);
  157. + double mbtree_offset = h->param.rc.b_mb_tree ? (1.0-h->param.rc.f_qcompress)*13.5 : 0;
  158. + rc->rate_factor_constant = pow( base_cplx, 1 - rc->qcompress )
  159. + / qp2qscale( h->param.rc.f_rf_constant + mbtree_offset );
  160. + }
  161. +}
  162. +
  163. int x264_ratecontrol_new( x264_t *h )
  164. {
  165. x264_ratecontrol_t *rc;
  166. @@ -426,60 +473,10 @@ int x264_ratecontrol_new( x264_t *h )
  167. x264_log(h, X264_LOG_ERROR, "constant rate-factor is incompatible with 2pass.\n");
  168. return -1;
  169. }
  170. - if( h->param.rc.i_vbv_buffer_size )
  171. - {
  172. - if( h->param.rc.i_rc_method == X264_RC_CQP )
  173. - {
  174. - x264_log(h, X264_LOG_WARNING, "VBV is incompatible with constant QP, ignored.\n");
  175. - h->param.rc.i_vbv_max_bitrate = 0;
  176. - h->param.rc.i_vbv_buffer_size = 0;
  177. - }
  178. - else if( h->param.rc.i_vbv_max_bitrate == 0 )
  179. - {
  180. - if( h->param.rc.i_rc_method == X264_RC_ABR )
  181. - {
  182. - x264_log( h, X264_LOG_INFO, "VBV maxrate unspecified, assuming CBR\n" );
  183. - h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
  184. - }
  185. - else
  186. - {
  187. - x264_log( h, X264_LOG_INFO, "VBV bufsize set but maxrate unspecified, ignored\n" );
  188. - h->param.rc.i_vbv_buffer_size = 0;
  189. - }
  190. - }
  191. - }
  192. - if( h->param.rc.i_vbv_max_bitrate < h->param.rc.i_bitrate &&
  193. - h->param.rc.i_vbv_max_bitrate > 0)
  194. - x264_log(h, X264_LOG_WARNING, "max bitrate less than average bitrate, ignored.\n");
  195. - else if( h->param.rc.i_vbv_max_bitrate > 0 &&
  196. - h->param.rc.i_vbv_buffer_size > 0 )
  197. - {
  198. - if( h->param.rc.i_vbv_buffer_size < (int)(h->param.rc.i_vbv_max_bitrate / rc->fps) )
  199. - {
  200. - h->param.rc.i_vbv_buffer_size = h->param.rc.i_vbv_max_bitrate / rc->fps;
  201. - x264_log( h, X264_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
  202. - h->param.rc.i_vbv_buffer_size );
  203. - }
  204. - if( h->param.rc.f_vbv_buffer_init > 1. )
  205. - h->param.rc.f_vbv_buffer_init = x264_clip3f( h->param.rc.f_vbv_buffer_init / h->param.rc.i_vbv_buffer_size, 0, 1 );
  206. - rc->buffer_rate = h->param.rc.i_vbv_max_bitrate * 1000. / rc->fps;
  207. - rc->buffer_size = h->param.rc.i_vbv_buffer_size * 1000.;
  208. - rc->single_frame_vbv = rc->buffer_rate * 1.1 > rc->buffer_size;
  209. - h->param.rc.f_vbv_buffer_init = X264_MAX( h->param.rc.f_vbv_buffer_init, rc->buffer_rate / rc->buffer_size );
  210. - rc->buffer_fill_final = rc->buffer_size * h->param.rc.f_vbv_buffer_init;
  211. - rc->cbr_decay = 1.0 - rc->buffer_rate / rc->buffer_size
  212. - * 0.5 * X264_MAX(0, 1.5 - rc->buffer_rate * rc->fps / rc->bitrate);
  213. - rc->b_vbv = 1;
  214. - rc->b_vbv_min_rate = !rc->b_2pass
  215. - && h->param.rc.i_rc_method == X264_RC_ABR
  216. - && h->param.rc.i_vbv_max_bitrate <= h->param.rc.i_bitrate;
  217. - }
  218. - else if( h->param.rc.i_vbv_max_bitrate )
  219. - {
  220. - x264_log(h, X264_LOG_WARNING, "VBV maxrate specified, but no bufsize.\n");
  221. - h->param.rc.i_vbv_max_bitrate = 0;
  222. - }
  223. - if(rc->rate_tolerance < 0.01)
  224. +
  225. + x264_ratecontrol_init_reconfigurable( h, 1 );
  226. +
  227. + if( rc->rate_tolerance < 0.01 )
  228. {
  229. x264_log(h, X264_LOG_WARNING, "bitrate tolerance too small, using .01\n");
  230. rc->rate_tolerance = 0.01;
  231. @@ -499,16 +496,6 @@ int x264_ratecontrol_new( x264_t *h )
  232. rc->last_non_b_pict_type = SLICE_TYPE_I;
  233. }
  234.  
  235. - if( h->param.rc.i_rc_method == X264_RC_CRF )
  236. - {
  237. - /* Arbitrary rescaling to make CRF somewhat similar to QP.
  238. - * Try to compensate for MB-tree's effects as well. */
  239. - double base_cplx = h->mb.i_mb_count * (h->param.i_bframe ? 120 : 80);
  240. - double mbtree_offset = h->param.rc.b_mb_tree ? (1.0-h->param.rc.f_qcompress)*13.5 : 0;
  241. - rc->rate_factor_constant = pow( base_cplx, 1 - rc->qcompress )
  242. - / qp2qscale( h->param.rc.f_rf_constant + mbtree_offset );
  243. - }
  244. -
  245. rc->ip_offset = 6.0 * log(h->param.rc.f_ip_factor) / log(2.0);
  246. rc->pb_offset = 6.0 * log(h->param.rc.f_pb_factor) / log(2.0);
  247. rc->qp_constant[SLICE_TYPE_P] = h->param.rc.i_qp_constant;
  248. @@ -1577,15 +1564,15 @@ static void update_vbv( x264_t *h, int bits )
  249. if( rct->buffer_fill_final < 0 )
  250. x264_log( h, X264_LOG_WARNING, "VBV underflow (frame %d, %.0f bits)\n", h->i_frame, rct->buffer_fill_final );
  251. rct->buffer_fill_final = X264_MAX( rct->buffer_fill_final, 0 );
  252. - rct->buffer_fill_final += rct->buffer_rate;
  253. - rct->buffer_fill_final = X264_MIN( rct->buffer_fill_final, rct->buffer_size );
  254. + rct->buffer_fill_final += rcc->buffer_rate;
  255. + rct->buffer_fill_final = X264_MIN( rct->buffer_fill_final, rcc->buffer_size );
  256. }
  257.  
  258. // provisionally update VBV according to the planned size of all frames currently in progress
  259. static void update_vbv_plan( x264_t *h, int overhead )
  260. {
  261. x264_ratecontrol_t *rcc = h->rc;
  262. - rcc->buffer_fill = h->thread[0]->rc->buffer_fill_final - overhead;
  263. + rcc->buffer_fill = h->thread[0]->rc->buffer_fill_final;
  264. if( h->i_thread_frames > 1 )
  265. {
  266. int j = h->rc - h->thread[0]->rc;
  267. @@ -1603,6 +1590,8 @@ static void update_vbv_plan( x264_t *h, int overhead )
  268. rcc->buffer_fill = X264_MIN( rcc->buffer_fill, rcc->buffer_size );
  269. }
  270. }
  271. + rcc->buffer_fill = X264_MIN( rcc->buffer_fill, rcc->buffer_size );
  272. + rcc->buffer_fill -= overhead;
  273. }
  274.  
  275. // apply VBV constraints and clip qscale to between lmin and lmax
  276. @@ -2027,8 +2016,7 @@ void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next )
  277. #define COPY(var) memcpy(&cur->rc->var, &prev->rc->var, sizeof(cur->rc->var))
  278. /* these vars are updated in x264_ratecontrol_start()
  279. * so copy them from the context that most recently started (prev)
  280. - * to the context that's about to start (cur).
  281. - */
  282. + * to the context that's about to start (cur). */
  283. COPY(accum_p_qp);
  284. COPY(accum_p_norm);
  285. COPY(last_satd);
  286. @@ -2040,6 +2028,14 @@ void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next )
  287. COPY(bframes);
  288. COPY(prev_zone);
  289. COPY(qpbuf_pos);
  290. + /* these vars can be updated by x264_ratecontrol_init_reconfigurable */
  291. + COPY(buffer_rate);
  292. + COPY(buffer_size);
  293. + COPY(single_frame_vbv);
  294. + COPY(cbr_decay);
  295. + COPY(b_vbv_min_rate);
  296. + COPY(rate_factor_constant);
  297. + COPY(bitrate);
  298. #undef COPY
  299. }
  300. if( cur != next )
  301. @@ -2047,8 +2043,7 @@ void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next )
  302. #define COPY(var) next->rc->var = cur->rc->var
  303. /* these vars are updated in x264_ratecontrol_end()
  304. * so copy them from the context that most recently ended (cur)
  305. - * to the context that's about to end (next)
  306. - */
  307. + * to the context that's about to end (next) */
  308. COPY(cplxr_sum);
  309. COPY(expected_bits_sum);
  310. COPY(wanted_bits_window);
  311. diff --git a/encoder/ratecontrol.h b/encoder/ratecontrol.h
  312. index 5a8d088..2767866 100644
  313. --- a/encoder/ratecontrol.h
  314. +++ b/encoder/ratecontrol.h
  315. @@ -27,6 +27,8 @@
  316. int x264_ratecontrol_new ( x264_t * );
  317. void x264_ratecontrol_delete( x264_t * );
  318.  
  319. +void x264_ratecontrol_init_reconfigurable( x264_t *h, int b_init );
  320. +
  321. void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame );
  322. void x264_adaptive_quant( x264_t * );
  323. int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame );
  324. diff --git a/x264.h b/x264.h
  325. index 2550864..e7d19b7 100644
  326. --- a/x264.h
  327. +++ b/x264.h
  328. @@ -35,7 +35,7 @@
  329.  
  330. #include <stdarg.h>
  331.  
  332. -#define X264_BUILD 84
  333. +#define X264_BUILD 85
  334.  
  335. /* x264_t:
  336. * opaque handler for encoder */
  337. @@ -480,11 +480,12 @@ typedef struct
  338. x264_t *x264_encoder_open( x264_param_t * );
  339.  
  340. /* x264_encoder_reconfig:
  341. - * analysis-related parameters from x264_param_t are copied.
  342. + * various parameters from x264_param_t are copied.
  343. * this takes effect immediately, on whichever frame is encoded next;
  344. * due to delay, this may not be the next frame passed to encoder_encode.
  345. * if the change should apply to some particular frame, use x264_picture_t->param instead.
  346. - * returns 0 on success, negative on parameter validation error. */
  347. + * returns 0 on success, negative on parameter validation error.
  348. + * not all parameters can be changed; see the actual function for a detailed breakdown. */
  349. int x264_encoder_reconfig( x264_t *, x264_param_t * );
  350. /* x264_encoder_parameters:
  351. * copies the current internal set of parameters to the pointer provided
  352. --
  353. 1.6.1.2
  354.  
  355.  
  356. From 6c5c82b796b48f7005426cc3f55a90b3e1f582fd Mon Sep 17 00:00:00 2001
  357. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  358. Date: Mon, 1 Feb 2010 13:04:47 -0800
  359. Subject: [PATCH 02/24] Slightly faster predictor_difference_mmxext
  360.  
  361. ---
  362. common/x86/util.h | 17 ++++++++++-------
  363. 1 files changed, 10 insertions(+), 7 deletions(-)
  364.  
  365. diff --git a/common/x86/util.h b/common/x86/util.h
  366. index efc700a..c8bcf4b 100644
  367. --- a/common/x86/util.h
  368. +++ b/common/x86/util.h
  369. @@ -45,8 +45,9 @@ static inline void x264_median_mv_mmxext( int16_t *dst, int16_t *a, int16_t *b,
  370. #define x264_predictor_difference x264_predictor_difference_mmxext
  371. static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t i_mvc )
  372. {
  373. - int sum = 0;
  374. - uint16_t output[4];
  375. + int sum;
  376. + static const uint64_t pw_1 = 0x0001000100010001ULL;
  377. +
  378. asm(
  379. "pxor %%mm4, %%mm4 \n"
  380. "test $1, %1 \n"
  381. @@ -56,7 +57,7 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t
  382. "psubw %%mm3, %%mm0 \n"
  383. "jmp 2f \n"
  384. "3: \n"
  385. - "sub $1, %1 \n"
  386. + "dec %1 \n"
  387. "1: \n"
  388. "movq -8(%2,%1,4), %%mm0 \n"
  389. "psubw -4(%2,%1,4), %%mm0 \n"
  390. @@ -67,11 +68,13 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t
  391. "pmaxsw %%mm2, %%mm0 \n"
  392. "paddusw %%mm0, %%mm4 \n"
  393. "jg 1b \n"
  394. - "movq %%mm4, %0 \n"
  395. - :"=m"(output), "+r"(i_mvc)
  396. - :"r"(mvc), "m"(M64( mvc ))
  397. + "pmaddwd %4, %%mm4 \n"
  398. + "pshufw $14, %%mm4, %%mm0 \n"
  399. + "paddd %%mm0, %%mm4 \n"
  400. + "movd %%mm4, %0 \n"
  401. + :"=r"(sum), "+r"(i_mvc)
  402. + :"r"(mvc), "m"(M64( mvc )), "m"(pw_1)
  403. );
  404. - sum += output[0] + output[1] + output[2] + output[3];
  405. return sum;
  406. }
  407. #define x264_cabac_amvd_sum x264_cabac_amvd_sum_mmxext
  408. --
  409. 1.6.1.2
  410.  
  411.  
  412. From fabe7c83223feb254c8ff956ec934020bd2d0964 Mon Sep 17 00:00:00 2001
  413. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  414. Date: Tue, 2 Feb 2010 03:15:18 -0800
  415. Subject: [PATCH 03/24] Improve bidir search, fix some artifacts in fades
  416. Modify analysis to allow bidir to use different motion vectors than L0/L1.
  417. Always try the <0,0,0,0> motion vector for bidir.
  418. Eliminates almost all errant motion vectors in fades.
  419. Slightly improves PSNR as well (~0.015 dB).
  420.  
  421. ---
  422. encoder/analyse.c | 50 ++++++++++++++++++++++++++++++++++++++------------
  423. 1 files changed, 38 insertions(+), 12 deletions(-)
  424.  
  425. diff --git a/encoder/analyse.c b/encoder/analyse.c
  426. index 666596b..1fb2206 100644
  427. --- a/encoder/analyse.c
  428. +++ b/encoder/analyse.c
  429. @@ -40,6 +40,7 @@ typedef struct
  430. int i_ref;
  431. int i_rd16x16;
  432. x264_me_t me16x16;
  433. + x264_me_t bi16x16; /* for b16x16 BI mode, since MVs can differ from l0/l1 */
  434.  
  435. /* 8x8 */
  436. int i_cost8x8;
  437. @@ -1722,20 +1723,45 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
  438. a->l1.me16x16.i_ref = a->l1.i_ref;
  439.  
  440. /* get cost of BI mode */
  441. + int ref_costs = REF_COST( 0, a->l0.i_ref ) + REF_COST( 1, a->l1.i_ref );
  442. + h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
  443. + h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
  444. src0 = h->mc.get_ref( pix0, &stride0,
  445. h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
  446. - a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16, weight_none );
  447. + a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, weight_none );
  448. src1 = h->mc.get_ref( pix1, &stride1,
  449. h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
  450. - a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16, weight_none );
  451. + a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, weight_none );
  452.  
  453. h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
  454.  
  455. a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
  456. - + REF_COST( 0, a->l0.i_ref )
  457. - + REF_COST( 1, a->l1.i_ref )
  458. - + a->l0.me16x16.cost_mv
  459. - + a->l1.me16x16.cost_mv;
  460. + + ref_costs
  461. + + a->l0.bi16x16.cost_mv
  462. + + a->l1.bi16x16.cost_mv;
  463. +
  464. +
  465. + /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
  466. + if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
  467. + {
  468. + int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
  469. + + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
  470. + int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
  471. + + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
  472. + h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.i_ref][0], h->mb.pic.i_stride[0],
  473. + h->mb.pic.p_fref[1][a->l1.i_ref][0], h->mb.pic.i_stride[0],
  474. + h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
  475. + int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
  476. + + ref_costs + l0_mv_cost + l1_mv_cost;
  477. + if( cost00 < a->i_cost16x16bi )
  478. + {
  479. + M32( a->l0.bi16x16.mv ) = 0;
  480. + M32( a->l1.bi16x16.mv ) = 0;
  481. + a->l0.bi16x16.cost_mv = l0_mv_cost;
  482. + a->l1.bi16x16.cost_mv = l1_mv_cost;
  483. + a->i_cost16x16bi = cost00;
  484. + }
  485. + }
  486.  
  487. /* mb type cost */
  488. a->i_cost16x16bi += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
  489. @@ -2205,7 +2231,7 @@ static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
  490. {
  491. case D_16x16:
  492. if( h->mb.i_type == B_BI_BI )
  493. - x264_me_refine_bidir_satd( h, &a->l0.me16x16, &a->l1.me16x16, i_biweight );
  494. + x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
  495. break;
  496. case D_16x8:
  497. for( i=0; i<2; i++ )
  498. @@ -2819,8 +2845,8 @@ intra_analysis:
  499. }
  500. else if( i_type == B_BI_BI )
  501. {
  502. - x264_me_refine_qpel( h, &analysis.l0.me16x16 );
  503. - x264_me_refine_qpel( h, &analysis.l1.me16x16 );
  504. + x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
  505. + x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
  506. }
  507. }
  508. else if( i_partition == D_16x8 )
  509. @@ -2938,7 +2964,7 @@ intra_analysis:
  510. x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
  511. }
  512. else if( i_type == B_BI_BI )
  513. - x264_me_refine_bidir_rd( h, &analysis.l0.me16x16, &analysis.l1.me16x16, i_biweight, 0, analysis.i_lambda2 );
  514. + x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
  515. }
  516. else if( i_partition == D_16x8 )
  517. {
  518. @@ -3121,10 +3147,10 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
  519. break;
  520. case B_BI_BI:
  521. x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
  522. - x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
  523. + x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );
  524.  
  525. x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
  526. - x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
  527. + x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
  528. break;
  529. }
  530. break;
  531. --
  532. 1.6.1.2
  533.  
  534.  
  535. From 652a7dff1d179c8bad98657bccdb42d5b2c25b81 Mon Sep 17 00:00:00 2001
  536. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  537. Date: Wed, 3 Feb 2010 14:22:05 -0800
  538. Subject: [PATCH 04/24] Faster CABAC MB header writing
  539. Reorganize the header writing to merge mb type and mb mode info (mv, pred, etc.).
  540. Reduces redundant branches and better splits the code between frame types (for better code cache usage).
  541. Also slightly simplify qp delta calculation.
  542. Also make CAVLC and CABAC a bit more consistent in structure and function names.
  543.  
  544. ---
  545. encoder/cabac.c | 573 ++++++++++++++++++++++++++-----------------------------
  546. encoder/cavlc.c | 118 ++++++------
  547. 2 files changed, 334 insertions(+), 357 deletions(-)
  548.  
  549. diff --git a/encoder/cabac.c b/encoder/cabac.c
  550. index 271f527..6ff2aed 100644
  551. --- a/encoder/cabac.c
  552. +++ b/encoder/cabac.c
  553. @@ -29,151 +29,6 @@
  554. #define RDO_SKIP_BS 0
  555. #endif
  556.  
  557. -static inline void x264_cabac_mb_type_intra( x264_t *h, x264_cabac_t *cb, int i_mb_type,
  558. - int ctx0, int ctx1, int ctx2, int ctx3, int ctx4, int ctx5 )
  559. -{
  560. - if( i_mb_type == I_4x4 || i_mb_type == I_8x8 )
  561. - {
  562. - x264_cabac_encode_decision_noup( cb, ctx0, 0 );
  563. - }
  564. -#if !RDO_SKIP_BS
  565. - else if( i_mb_type == I_PCM )
  566. - {
  567. - x264_cabac_encode_decision_noup( cb, ctx0, 1 );
  568. - x264_cabac_encode_flush( h, cb );
  569. - }
  570. -#endif
  571. - else
  572. - {
  573. - int i_pred = x264_mb_pred_mode16x16_fix[h->mb.i_intra16x16_pred_mode];
  574. -
  575. - x264_cabac_encode_decision_noup( cb, ctx0, 1 );
  576. - x264_cabac_encode_terminal( cb );
  577. -
  578. - x264_cabac_encode_decision_noup( cb, ctx1, !!h->mb.i_cbp_luma );
  579. - if( h->mb.i_cbp_chroma == 0 )
  580. - x264_cabac_encode_decision_noup( cb, ctx2, 0 );
  581. - else
  582. - {
  583. - x264_cabac_encode_decision( cb, ctx2, 1 );
  584. - x264_cabac_encode_decision_noup( cb, ctx3, h->mb.i_cbp_chroma>>1 );
  585. - }
  586. - x264_cabac_encode_decision( cb, ctx4, i_pred>>1 );
  587. - x264_cabac_encode_decision_noup( cb, ctx5, i_pred&1 );
  588. - }
  589. -}
  590. -
  591. -static void x264_cabac_mb_type( x264_t *h, x264_cabac_t *cb )
  592. -{
  593. - const int i_mb_type = h->mb.i_type;
  594. -
  595. - if( h->sh.b_mbaff &&
  596. - (!(h->mb.i_mb_y & 1) || IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride])) )
  597. - {
  598. - x264_cabac_encode_decision_noup( cb, 70 + h->mb.cache.i_neighbour_interlaced, h->mb.b_interlaced );
  599. - }
  600. -
  601. - if( h->sh.i_type == SLICE_TYPE_I )
  602. - {
  603. - int ctx = 0;
  604. - if( h->mb.i_mb_type_left >= 0 && h->mb.i_mb_type_left != I_4x4 )
  605. - ctx++;
  606. - if( h->mb.i_mb_type_top >= 0 && h->mb.i_mb_type_top != I_4x4 )
  607. - ctx++;
  608. -
  609. - x264_cabac_mb_type_intra( h, cb, i_mb_type, 3+ctx, 3+3, 3+4, 3+5, 3+6, 3+7 );
  610. - }
  611. - else if( h->sh.i_type == SLICE_TYPE_P )
  612. - {
  613. - /* prefix: 14, suffix: 17 */
  614. - if( i_mb_type == P_L0 )
  615. - {
  616. - x264_cabac_encode_decision_noup( cb, 14, 0 );
  617. - x264_cabac_encode_decision_noup( cb, 15, h->mb.i_partition != D_16x16 );
  618. - x264_cabac_encode_decision_noup( cb, 17-(h->mb.i_partition == D_16x16), h->mb.i_partition == D_16x8 );
  619. - }
  620. - else if( i_mb_type == P_8x8 )
  621. - {
  622. - x264_cabac_encode_decision_noup( cb, 14, 0 );
  623. - x264_cabac_encode_decision_noup( cb, 15, 0 );
  624. - x264_cabac_encode_decision_noup( cb, 16, 1 );
  625. - }
  626. - else /* intra */
  627. - {
  628. - /* prefix */
  629. - x264_cabac_encode_decision_noup( cb, 14, 1 );
  630. -
  631. - /* suffix */
  632. - x264_cabac_mb_type_intra( h, cb, i_mb_type, 17+0, 17+1, 17+2, 17+2, 17+3, 17+3 );
  633. - }
  634. - }
  635. - else //if( h->sh.i_type == SLICE_TYPE_B )
  636. - {
  637. - int ctx = 0;
  638. - if( h->mb.i_mb_type_left >= 0 && h->mb.i_mb_type_left != B_SKIP && h->mb.i_mb_type_left != B_DIRECT )
  639. - ctx++;
  640. - if( h->mb.i_mb_type_top >= 0 && h->mb.i_mb_type_top != B_SKIP && h->mb.i_mb_type_top != B_DIRECT )
  641. - ctx++;
  642. -
  643. - if( i_mb_type == B_DIRECT )
  644. - {
  645. - x264_cabac_encode_decision_noup( cb, 27+ctx, 0 );
  646. - return;
  647. - }
  648. - x264_cabac_encode_decision_noup( cb, 27+ctx, 1 );
  649. -
  650. - if( i_mb_type == B_8x8 )
  651. - {
  652. - x264_cabac_encode_decision_noup( cb, 27+3, 1 );
  653. - x264_cabac_encode_decision_noup( cb, 27+4, 1 );
  654. - x264_cabac_encode_decision( cb, 27+5, 1 );
  655. - x264_cabac_encode_decision( cb, 27+5, 1 );
  656. - x264_cabac_encode_decision_noup( cb, 27+5, 1 );
  657. - }
  658. - else if( IS_INTRA( i_mb_type ) )
  659. - {
  660. - /* prefix */
  661. - x264_cabac_encode_decision_noup( cb, 27+3, 1 );
  662. - x264_cabac_encode_decision_noup( cb, 27+4, 1 );
  663. - x264_cabac_encode_decision( cb, 27+5, 1 );
  664. - x264_cabac_encode_decision( cb, 27+5, 0 );
  665. - x264_cabac_encode_decision( cb, 27+5, 1 );
  666. -
  667. - /* suffix */
  668. - x264_cabac_mb_type_intra( h, cb, i_mb_type, 32+0, 32+1, 32+2, 32+2, 32+3, 32+3 );
  669. - }
  670. - else
  671. - {
  672. - static const uint8_t i_mb_bits[9*3] =
  673. - {
  674. - 0x31, 0x29, 0x4, /* L0 L0 */
  675. - 0x35, 0x2d, 0, /* L0 L1 */
  676. - 0x43, 0x63, 0, /* L0 BI */
  677. - 0x3d, 0x2f, 0, /* L1 L0 */
  678. - 0x39, 0x25, 0x6, /* L1 L1 */
  679. - 0x53, 0x73, 0, /* L1 BI */
  680. - 0x4b, 0x6b, 0, /* BI L0 */
  681. - 0x5b, 0x7b, 0, /* BI L1 */
  682. - 0x47, 0x67, 0x21 /* BI BI */
  683. - };
  684. -
  685. - const int idx = (i_mb_type - B_L0_L0) * 3 + (h->mb.i_partition - D_16x8);
  686. - int bits = i_mb_bits[idx];
  687. -
  688. - x264_cabac_encode_decision_noup( cb, 27+3, bits&1 );
  689. - x264_cabac_encode_decision( cb, 27+5-(bits&1), (bits>>1)&1 ); bits >>= 2;
  690. - if( bits != 1 )
  691. - {
  692. - x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
  693. - x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
  694. - x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
  695. - if( bits != 1 )
  696. - x264_cabac_encode_decision_noup( cb, 27+5, bits&1 );
  697. - }
  698. - }
  699. - }
  700. -}
  701. -
  702. static void x264_cabac_mb_intra4x4_pred_mode( x264_cabac_t *cb, int i_pred, int i_mode )
  703. {
  704. if( i_pred == i_mode )
  705. @@ -209,6 +64,12 @@ static void x264_cabac_mb_intra_chroma_pred_mode( x264_t *h, x264_cabac_t *cb )
  706. }
  707. }
  708.  
  709. +static inline void x264_cabac_mb_transform_size( x264_t *h, x264_cabac_t *cb )
  710. +{
  711. + int ctx = 399 + h->mb.cache.i_neighbour_transform_size;
  712. + x264_cabac_encode_decision_noup( cb, ctx, h->mb.b_transform_8x8 );
  713. +}
  714. +
  715. static void x264_cabac_mb_cbp_luma( x264_t *h, x264_cabac_t *cb )
  716. {
  717. int cbp = h->mb.i_cbp_luma;
  718. @@ -244,7 +105,6 @@ static void x264_cabac_mb_cbp_chroma( x264_t *h, x264_cabac_t *cb )
  719. static void x264_cabac_mb_qp_delta( x264_t *h, x264_cabac_t *cb )
  720. {
  721. int i_dqp = h->mb.i_qp - h->mb.i_last_qp;
  722. - int ctx;
  723.  
  724. /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely flat background area */
  725. if( h->mb.i_type == I_16x16 && !h->mb.cbp[h->mb.i_mb_xy] )
  726. @@ -257,7 +117,7 @@ static void x264_cabac_mb_qp_delta( x264_t *h, x264_cabac_t *cb )
  727.  
  728. /* Since, per the above, empty-CBP I16x16 blocks never have delta quants,
  729. * we don't have to check for them. */
  730. - ctx = h->mb.i_last_dqp && h->mb.cbp[h->mb.i_mb_prev_xy];
  731. + int ctx = !!h->mb.i_last_dqp;
  732.  
  733. if( i_dqp != 0 )
  734. {
  735. @@ -321,12 +181,6 @@ static inline void x264_cabac_mb_sub_b_partition( x264_cabac_t *cb, int i_sub )
  736. x264_cabac_encode_decision( cb, 39, i_sub == D_L1_8x8 );
  737. }
  738.  
  739. -static inline void x264_cabac_mb_transform_size( x264_t *h, x264_cabac_t *cb )
  740. -{
  741. - int ctx = 399 + h->mb.cache.i_neighbour_transform_size;
  742. - x264_cabac_encode_decision_noup( cb, ctx, h->mb.b_transform_8x8 );
  743. -}
  744. -
  745. static void x264_cabac_mb_ref( x264_t *h, x264_cabac_t *cb, int i_list, int idx )
  746. {
  747. const int i8 = x264_scan8[idx];
  748. @@ -463,6 +317,267 @@ static inline void x264_cabac_mb8x8_mvd( x264_t *h, x264_cabac_t *cb, int i )
  749. }
  750. }
  751.  
  752. +static void x264_cabac_mb_header_intra( x264_t *h, x264_cabac_t *cb, int i_mb_type,
  753. + int ctx0, int ctx1, int ctx2, int ctx3, int ctx4, int ctx5 )
  754. +{
  755. + if( i_mb_type == I_4x4 || i_mb_type == I_8x8 )
  756. + {
  757. + int i, di = h->mb.b_transform_8x8 ? 4 : 1;
  758. + x264_cabac_encode_decision_noup( cb, ctx0, 0 );
  759. +
  760. + if( h->pps->b_transform_8x8_mode )
  761. + x264_cabac_mb_transform_size( h, cb );
  762. +
  763. + for( i = 0; i < 16; i += di )
  764. + {
  765. + const int i_pred = x264_mb_predict_intra4x4_mode( h, i );
  766. + const int i_mode = x264_mb_pred_mode4x4_fix( h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] );
  767. + x264_cabac_mb_intra4x4_pred_mode( cb, i_pred, i_mode );
  768. + }
  769. + }
  770. +#if !RDO_SKIP_BS
  771. + else if( i_mb_type == I_PCM )
  772. + {
  773. + x264_cabac_encode_decision_noup( cb, ctx0, 1 );
  774. + x264_cabac_encode_flush( h, cb );
  775. + return;
  776. + }
  777. +#endif
  778. + else
  779. + {
  780. + int i_pred = x264_mb_pred_mode16x16_fix[h->mb.i_intra16x16_pred_mode];
  781. +
  782. + x264_cabac_encode_decision_noup( cb, ctx0, 1 );
  783. + x264_cabac_encode_terminal( cb );
  784. +
  785. + x264_cabac_encode_decision_noup( cb, ctx1, !!h->mb.i_cbp_luma );
  786. + if( h->mb.i_cbp_chroma == 0 )
  787. + x264_cabac_encode_decision_noup( cb, ctx2, 0 );
  788. + else
  789. + {
  790. + x264_cabac_encode_decision( cb, ctx2, 1 );
  791. + x264_cabac_encode_decision_noup( cb, ctx3, h->mb.i_cbp_chroma>>1 );
  792. + }
  793. + x264_cabac_encode_decision( cb, ctx4, i_pred>>1 );
  794. + x264_cabac_encode_decision_noup( cb, ctx5, i_pred&1 );
  795. + }
  796. + x264_cabac_mb_intra_chroma_pred_mode( h, cb );
  797. +}
  798. +
  799. +static inline void x264_cabac_mb_header( x264_t *h, x264_cabac_t *cb )
  800. +{
  801. + const int i_mb_type = h->mb.i_type;
  802. + int i_list, i;
  803. +
  804. + if( h->sh.b_mbaff &&
  805. + (!(h->mb.i_mb_y & 1) || IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride])) )
  806. + {
  807. + x264_cabac_encode_decision_noup( cb, 70 + h->mb.cache.i_neighbour_interlaced, h->mb.b_interlaced );
  808. + }
  809. +
  810. + if( h->sh.i_type == SLICE_TYPE_I )
  811. + {
  812. + int ctx = 0;
  813. + if( h->mb.i_mb_type_left >= 0 && h->mb.i_mb_type_left != I_4x4 )
  814. + ctx++;
  815. + if( h->mb.i_mb_type_top >= 0 && h->mb.i_mb_type_top != I_4x4 )
  816. + ctx++;
  817. +
  818. + x264_cabac_mb_header_intra( h, cb, i_mb_type, 3+ctx, 3+3, 3+4, 3+5, 3+6, 3+7 );
  819. + }
  820. + else if( h->sh.i_type == SLICE_TYPE_P )
  821. + {
  822. + /* prefix: 14, suffix: 17 */
  823. + if( i_mb_type == P_L0 )
  824. + {
  825. + x264_cabac_encode_decision_noup( cb, 14, 0 );
  826. + if( h->mb.i_partition == D_16x16 )
  827. + {
  828. + x264_cabac_encode_decision_noup( cb, 15, 0 );
  829. + x264_cabac_encode_decision_noup( cb, 16, 0 );
  830. + if( h->mb.pic.i_fref[0] > 1 )
  831. + x264_cabac_mb_ref( h, cb, 0, 0 );
  832. + x264_cabac_mb_mvd( h, cb, 0, 0, 4, 4 );
  833. + }
  834. + else if( h->mb.i_partition == D_16x8 )
  835. + {
  836. + x264_cabac_encode_decision_noup( cb, 15, 1 );
  837. + x264_cabac_encode_decision_noup( cb, 17, 1 );
  838. + if( h->mb.pic.i_fref[0] > 1 )
  839. + {
  840. + x264_cabac_mb_ref( h, cb, 0, 0 );
  841. + x264_cabac_mb_ref( h, cb, 0, 8 );
  842. + }
  843. + x264_cabac_mb_mvd( h, cb, 0, 0, 4, 2 );
  844. + x264_cabac_mb_mvd( h, cb, 0, 8, 4, 2 );
  845. + }
  846. + else //if( h->mb.i_partition == D_8x16 )
  847. + {
  848. + x264_cabac_encode_decision_noup( cb, 15, 1 );
  849. + x264_cabac_encode_decision_noup( cb, 17, 0 );
  850. + if( h->mb.pic.i_fref[0] > 1 )
  851. + {
  852. + x264_cabac_mb_ref( h, cb, 0, 0 );
  853. + x264_cabac_mb_ref( h, cb, 0, 4 );
  854. + }
  855. + x264_cabac_mb_mvd( h, cb, 0, 0, 2, 4 );
  856. + x264_cabac_mb_mvd( h, cb, 0, 4, 2, 4 );
  857. + }
  858. + }
  859. + else if( i_mb_type == P_8x8 )
  860. + {
  861. + x264_cabac_encode_decision_noup( cb, 14, 0 );
  862. + x264_cabac_encode_decision_noup( cb, 15, 0 );
  863. + x264_cabac_encode_decision_noup( cb, 16, 1 );
  864. +
  865. + /* sub mb type */
  866. + for( i = 0; i < 4; i++ )
  867. + x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[i] );
  868. +
  869. + /* ref 0 */
  870. + if( h->mb.pic.i_fref[0] > 1 )
  871. + {
  872. + x264_cabac_mb_ref( h, cb, 0, 0 );
  873. + x264_cabac_mb_ref( h, cb, 0, 4 );
  874. + x264_cabac_mb_ref( h, cb, 0, 8 );
  875. + x264_cabac_mb_ref( h, cb, 0, 12 );
  876. + }
  877. +
  878. + for( i = 0; i < 4; i++ )
  879. + x264_cabac_mb8x8_mvd( h, cb, i );
  880. + }
  881. + else /* intra */
  882. + {
  883. + /* prefix */
  884. + x264_cabac_encode_decision_noup( cb, 14, 1 );
  885. +
  886. + /* suffix */
  887. + x264_cabac_mb_header_intra( h, cb, i_mb_type, 17+0, 17+1, 17+2, 17+2, 17+3, 17+3 );
  888. + }
  889. + }
  890. + else //if( h->sh.i_type == SLICE_TYPE_B )
  891. + {
  892. + int ctx = 0;
  893. + if( h->mb.i_mb_type_left >= 0 && h->mb.i_mb_type_left != B_SKIP && h->mb.i_mb_type_left != B_DIRECT )
  894. + ctx++;
  895. + if( h->mb.i_mb_type_top >= 0 && h->mb.i_mb_type_top != B_SKIP && h->mb.i_mb_type_top != B_DIRECT )
  896. + ctx++;
  897. +
  898. + if( i_mb_type == B_DIRECT )
  899. + {
  900. + x264_cabac_encode_decision_noup( cb, 27+ctx, 0 );
  901. + return;
  902. + }
  903. + x264_cabac_encode_decision_noup( cb, 27+ctx, 1 );
  904. +
  905. + if( i_mb_type == B_8x8 )
  906. + {
  907. + x264_cabac_encode_decision_noup( cb, 27+3, 1 );
  908. + x264_cabac_encode_decision_noup( cb, 27+4, 1 );
  909. + x264_cabac_encode_decision ( cb, 27+5, 1 );
  910. + x264_cabac_encode_decision ( cb, 27+5, 1 );
  911. + x264_cabac_encode_decision_noup( cb, 27+5, 1 );
  912. +
  913. + /* sub mb type */
  914. + for( i = 0; i < 4; i++ )
  915. + x264_cabac_mb_sub_b_partition( cb, h->mb.i_sub_partition[i] );
  916. +
  917. + /* ref */
  918. + if( h->mb.pic.i_fref[0] > 1 )
  919. + for( i = 0; i < 4; i++ )
  920. + if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
  921. + x264_cabac_mb_ref( h, cb, 0, 4*i );
  922. +
  923. + if( h->mb.pic.i_fref[1] > 1 )
  924. + for( i = 0; i < 4; i++ )
  925. + if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
  926. + x264_cabac_mb_ref( h, cb, 1, 4*i );
  927. +
  928. + for( i = 0; i < 4; i++ )
  929. + if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
  930. + x264_cabac_mb_mvd( h, cb, 0, 4*i, 2, 2 );
  931. +
  932. + for( i = 0; i < 4; i++ )
  933. + if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
  934. + x264_cabac_mb_mvd( h, cb, 1, 4*i, 2, 2 );
  935. + }
  936. + else if( IS_INTRA( i_mb_type ) )
  937. + {
  938. + /* prefix */
  939. + x264_cabac_encode_decision_noup( cb, 27+3, 1 );
  940. + x264_cabac_encode_decision_noup( cb, 27+4, 1 );
  941. + x264_cabac_encode_decision ( cb, 27+5, 1 );
  942. + x264_cabac_encode_decision ( cb, 27+5, 0 );
  943. + x264_cabac_encode_decision ( cb, 27+5, 1 );
  944. +
  945. + /* suffix */
  946. + x264_cabac_mb_header_intra( h, cb, i_mb_type, 32+0, 32+1, 32+2, 32+2, 32+3, 32+3 );
  947. + }
  948. + else
  949. + {
  950. + static const uint8_t i_mb_bits[9*3] =
  951. + {
  952. + 0x31, 0x29, 0x4, /* L0 L0 */
  953. + 0x35, 0x2d, 0, /* L0 L1 */
  954. + 0x43, 0x63, 0, /* L0 BI */
  955. + 0x3d, 0x2f, 0, /* L1 L0 */
  956. + 0x39, 0x25, 0x6, /* L1 L1 */
  957. + 0x53, 0x73, 0, /* L1 BI */
  958. + 0x4b, 0x6b, 0, /* BI L0 */
  959. + 0x5b, 0x7b, 0, /* BI L1 */
  960. + 0x47, 0x67, 0x21 /* BI BI */
  961. + };
  962. +
  963. + const int idx = (i_mb_type - B_L0_L0) * 3 + (h->mb.i_partition - D_16x8);
  964. + int bits = i_mb_bits[idx];
  965. +
  966. + x264_cabac_encode_decision_noup( cb, 27+3, bits&1 );
  967. + x264_cabac_encode_decision( cb, 27+5-(bits&1), (bits>>1)&1 ); bits >>= 2;
  968. + if( bits != 1 )
  969. + {
  970. + x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
  971. + x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
  972. + x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
  973. + if( bits != 1 )
  974. + x264_cabac_encode_decision_noup( cb, 27+5, bits&1 );
  975. + }
  976. +
  977. + const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type];
  978. + if( h->mb.pic.i_fref[0] > 1 )
  979. + {
  980. + if( b_list[0][0] )
  981. + x264_cabac_mb_ref( h, cb, 0, 0 );
  982. + if( b_list[0][1] && h->mb.i_partition != D_16x16 )
  983. + x264_cabac_mb_ref( h, cb, 0, 8 >> (h->mb.i_partition == D_8x16) );
  984. + }
  985. + if( h->mb.pic.i_fref[1] > 1 )
  986. + {
  987. + if( b_list[1][0] )
  988. + x264_cabac_mb_ref( h, cb, 1, 0 );
  989. + if( b_list[1][1] && h->mb.i_partition != D_16x16 )
  990. + x264_cabac_mb_ref( h, cb, 1, 8 >> (h->mb.i_partition == D_8x16) );
  991. + }
  992. + for( i_list = 0; i_list < 2; i_list++ )
  993. + {
  994. + if( h->mb.i_partition == D_16x16 )
  995. + {
  996. + if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, cb, i_list, 0, 4, 4 );
  997. + }
  998. + else if( h->mb.i_partition == D_16x8 )
  999. + {
  1000. + if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, cb, i_list, 0, 4, 2 );
  1001. + if( b_list[i_list][1] ) x264_cabac_mb_mvd( h, cb, i_list, 8, 4, 2 );
  1002. + }
  1003. + else //if( h->mb.i_partition == D_8x16 )
  1004. + {
  1005. + if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, cb, i_list, 0, 2, 4 );
  1006. + if( b_list[i_list][1] ) x264_cabac_mb_mvd( h, cb, i_list, 4, 2, 4 );
  1007. + }
  1008. + }
  1009. + }
  1010. + }
  1011. +}
  1012. +
  1013. /* i_ctxBlockCat: 0-> DC 16x16 i_idx = 0
  1014. * 1-> AC 16x16 i_idx = luma4x4idx
  1015. * 2-> Luma4x4 i_idx = luma4x4idx
  1016. @@ -752,7 +867,6 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
  1017. void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
  1018. {
  1019. const int i_mb_type = h->mb.i_type;
  1020. - int i_list;
  1021. int i;
  1022.  
  1023. #if !RDO_SKIP_BS
  1024. @@ -760,15 +874,14 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
  1025. int i_mb_pos_tex;
  1026. #endif
  1027.  
  1028. - /* Write the MB type */
  1029. - x264_cabac_mb_type( h, cb );
  1030. + x264_cabac_mb_header( h, cb );
  1031.  
  1032. #if !RDO_SKIP_BS
  1033. + i_mb_pos_tex = x264_cabac_pos( cb );
  1034. + h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
  1035. +
  1036. if( i_mb_type == I_PCM )
  1037. {
  1038. - i_mb_pos_tex = x264_cabac_pos( cb );
  1039. - h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
  1040. -
  1041. memcpy( cb->p, h->mb.pic.p_fenc[0], 256 );
  1042. cb->p += 256;
  1043. for( i = 0; i < 8; i++ )
  1044. @@ -793,140 +906,6 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
  1045. }
  1046. #endif
  1047.  
  1048. - if( IS_INTRA( i_mb_type ) )
  1049. - {
  1050. - if( h->pps->b_transform_8x8_mode && i_mb_type != I_16x16 )
  1051. - x264_cabac_mb_transform_size( h, cb );
  1052. -
  1053. - if( i_mb_type != I_16x16 )
  1054. - {
  1055. - int di = h->mb.b_transform_8x8 ? 4 : 1;
  1056. - for( i = 0; i < 16; i += di )
  1057. - {
  1058. - const int i_pred = x264_mb_predict_intra4x4_mode( h, i );
  1059. - const int i_mode = x264_mb_pred_mode4x4_fix( h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] );
  1060. - x264_cabac_mb_intra4x4_pred_mode( cb, i_pred, i_mode );
  1061. - }
  1062. - }
  1063. -
  1064. - x264_cabac_mb_intra_chroma_pred_mode( h, cb );
  1065. - }
  1066. - else if( i_mb_type == P_L0 )
  1067. - {
  1068. - if( h->mb.i_partition == D_16x16 )
  1069. - {
  1070. - if( h->mb.pic.i_fref[0] > 1 )
  1071. - {
  1072. - x264_cabac_mb_ref( h, cb, 0, 0 );
  1073. - }
  1074. - x264_cabac_mb_mvd( h, cb, 0, 0, 4, 4 );
  1075. - }
  1076. - else if( h->mb.i_partition == D_16x8 )
  1077. - {
  1078. - if( h->mb.pic.i_fref[0] > 1 )
  1079. - {
  1080. - x264_cabac_mb_ref( h, cb, 0, 0 );
  1081. - x264_cabac_mb_ref( h, cb, 0, 8 );
  1082. - }
  1083. - x264_cabac_mb_mvd( h, cb, 0, 0, 4, 2 );
  1084. - x264_cabac_mb_mvd( h, cb, 0, 8, 4, 2 );
  1085. - }
  1086. - else //if( h->mb.i_partition == D_8x16 )
  1087. - {
  1088. - if( h->mb.pic.i_fref[0] > 1 )
  1089. - {
  1090. - x264_cabac_mb_ref( h, cb, 0, 0 );
  1091. - x264_cabac_mb_ref( h, cb, 0, 4 );
  1092. - }
  1093. - x264_cabac_mb_mvd( h, cb, 0, 0, 2, 4 );
  1094. - x264_cabac_mb_mvd( h, cb, 0, 4, 2, 4 );
  1095. - }
  1096. - }
  1097. - else if( i_mb_type == P_8x8 )
  1098. - {
  1099. - /* sub mb type */
  1100. - for( i = 0; i < 4; i++ )
  1101. - x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[i] );
  1102. -
  1103. - /* ref 0 */
  1104. - if( h->mb.pic.i_fref[0] > 1 )
  1105. - {
  1106. - x264_cabac_mb_ref( h, cb, 0, 0 );
  1107. - x264_cabac_mb_ref( h, cb, 0, 4 );
  1108. - x264_cabac_mb_ref( h, cb, 0, 8 );
  1109. - x264_cabac_mb_ref( h, cb, 0, 12 );
  1110. - }
  1111. -
  1112. - for( i = 0; i < 4; i++ )
  1113. - x264_cabac_mb8x8_mvd( h, cb, i );
  1114. - }
  1115. - else if( i_mb_type == B_8x8 )
  1116. - {
  1117. - /* sub mb type */
  1118. - for( i = 0; i < 4; i++ )
  1119. - x264_cabac_mb_sub_b_partition( cb, h->mb.i_sub_partition[i] );
  1120. -
  1121. - /* ref */
  1122. - if( h->mb.pic.i_fref[0] > 1 )
  1123. - for( i = 0; i < 4; i++ )
  1124. - if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
  1125. - x264_cabac_mb_ref( h, cb, 0, 4*i );
  1126. -
  1127. - if( h->mb.pic.i_fref[1] > 1 )
  1128. - for( i = 0; i < 4; i++ )
  1129. - if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
  1130. - x264_cabac_mb_ref( h, cb, 1, 4*i );
  1131. -
  1132. - for( i = 0; i < 4; i++ )
  1133. - if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
  1134. - x264_cabac_mb_mvd( h, cb, 0, 4*i, 2, 2 );
  1135. -
  1136. - for( i = 0; i < 4; i++ )
  1137. - if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
  1138. - x264_cabac_mb_mvd( h, cb, 1, 4*i, 2, 2 );
  1139. - }
  1140. - else if( i_mb_type != B_DIRECT )
  1141. - {
  1142. - /* All B mode */
  1143. - const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type];
  1144. - if( h->mb.pic.i_fref[0] > 1 )
  1145. - {
  1146. - if( b_list[0][0] )
  1147. - x264_cabac_mb_ref( h, cb, 0, 0 );
  1148. - if( b_list[0][1] && h->mb.i_partition != D_16x16 )
  1149. - x264_cabac_mb_ref( h, cb, 0, 8 >> (h->mb.i_partition == D_8x16) );
  1150. - }
  1151. - if( h->mb.pic.i_fref[1] > 1 )
  1152. - {
  1153. - if( b_list[1][0] )
  1154. - x264_cabac_mb_ref( h, cb, 1, 0 );
  1155. - if( b_list[1][1] && h->mb.i_partition != D_16x16 )
  1156. - x264_cabac_mb_ref( h, cb, 1, 8 >> (h->mb.i_partition == D_8x16) );
  1157. - }
  1158. - for( i_list = 0; i_list < 2; i_list++ )
  1159. - {
  1160. - if( h->mb.i_partition == D_16x16 )
  1161. - {
  1162. - if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, cb, i_list, 0, 4, 4 );
  1163. - }
  1164. - else if( h->mb.i_partition == D_16x8 )
  1165. - {
  1166. - if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, cb, i_list, 0, 4, 2 );
  1167. - if( b_list[i_list][1] ) x264_cabac_mb_mvd( h, cb, i_list, 8, 4, 2 );
  1168. - }
  1169. - else //if( h->mb.i_partition == D_8x16 )
  1170. - {
  1171. - if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, cb, i_list, 0, 2, 4 );
  1172. - if( b_list[i_list][1] ) x264_cabac_mb_mvd( h, cb, i_list, 4, 2, 4 );
  1173. - }
  1174. - }
  1175. - }
  1176. -
  1177. -#if !RDO_SKIP_BS
  1178. - i_mb_pos_tex = x264_cabac_pos( cb );
  1179. - h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
  1180. -#endif
  1181. -
  1182. if( i_mb_type != I_16x16 )
  1183. {
  1184. x264_cabac_mb_cbp_luma( h, cb );
  1185. @@ -934,11 +913,9 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
  1186. }
  1187.  
  1188. if( x264_mb_transform_8x8_allowed( h ) && h->mb.i_cbp_luma )
  1189. - {
  1190. x264_cabac_mb_transform_size( h, cb );
  1191. - }
  1192.  
  1193. - if( h->mb.i_cbp_luma > 0 || h->mb.i_cbp_chroma > 0 || i_mb_type == I_16x16 )
  1194. + if( h->mb.i_cbp_luma || h->mb.i_cbp_chroma || i_mb_type == I_16x16 )
  1195. {
  1196. const int b_intra = IS_INTRA( i_mb_type );
  1197. x264_cabac_mb_qp_delta( h, cb );
  1198. @@ -950,7 +927,7 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
  1199. block_residual_write_cabac_cbf( h, cb, DCT_LUMA_DC, 24, h->dct.luma16x16_dc, 1 );
  1200.  
  1201. /* AC Luma */
  1202. - if( h->mb.i_cbp_luma != 0 )
  1203. + if( h->mb.i_cbp_luma )
  1204. for( i = 0; i < 16; i++ )
  1205. block_residual_write_cabac_cbf( h, cb, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1, 1 );
  1206. }
  1207. @@ -967,7 +944,7 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
  1208. block_residual_write_cabac_cbf( h, cb, DCT_LUMA_4x4, i, h->dct.luma4x4[i], b_intra );
  1209. }
  1210.  
  1211. - if( h->mb.i_cbp_chroma&0x03 ) /* Chroma DC residual present */
  1212. + if( h->mb.i_cbp_chroma ) /* Chroma DC residual present */
  1213. {
  1214. block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], b_intra );
  1215. block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], b_intra );
  1216. diff --git a/encoder/cavlc.c b/encoder/cavlc.c
  1217. index c65c9bd..d18408b 100644
  1218. --- a/encoder/cavlc.c
  1219. +++ b/encoder/cavlc.c
  1220. @@ -203,7 +203,7 @@ static const uint8_t ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3};
  1221. *nnz = block_residual_write_cavlc(h,cat,l,nC);\
  1222. }
  1223.  
  1224. -static void cavlc_qp_delta( x264_t *h )
  1225. +static void x264_cavlc_mb_qp_delta( x264_t *h )
  1226. {
  1227. bs_t *s = &h->out.bs;
  1228. int i_dqp = h->mb.i_qp - h->mb.i_last_qp;
  1229. @@ -228,7 +228,7 @@ static void cavlc_qp_delta( x264_t *h )
  1230. bs_write_se( s, i_dqp );
  1231. }
  1232.  
  1233. -static void cavlc_mb_mvd( x264_t *h, int i_list, int idx, int width )
  1234. +static void x264_cavlc_mb_mvd( x264_t *h, int i_list, int idx, int width )
  1235. {
  1236. bs_t *s = &h->out.bs;
  1237. ALIGNED_4( int16_t mvp[2] );
  1238. @@ -237,26 +237,26 @@ static void cavlc_mb_mvd( x264_t *h, int i_list, int idx, int width )
  1239. bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[idx]][1] - mvp[1] );
  1240. }
  1241.  
  1242. -static inline void cavlc_mb8x8_mvd( x264_t *h, int i )
  1243. +static inline void x264_cavlc_mb8x8_mvd( x264_t *h, int i )
  1244. {
  1245. switch( h->mb.i_sub_partition[i] )
  1246. {
  1247. case D_L0_8x8:
  1248. - cavlc_mb_mvd( h, 0, 4*i, 2 );
  1249. + x264_cavlc_mb_mvd( h, 0, 4*i, 2 );
  1250. break;
  1251. case D_L0_8x4:
  1252. - cavlc_mb_mvd( h, 0, 4*i+0, 2 );
  1253. - cavlc_mb_mvd( h, 0, 4*i+2, 2 );
  1254. + x264_cavlc_mb_mvd( h, 0, 4*i+0, 2 );
  1255. + x264_cavlc_mb_mvd( h, 0, 4*i+2, 2 );
  1256. break;
  1257. case D_L0_4x8:
  1258. - cavlc_mb_mvd( h, 0, 4*i+0, 1 );
  1259. - cavlc_mb_mvd( h, 0, 4*i+1, 1 );
  1260. + x264_cavlc_mb_mvd( h, 0, 4*i+0, 1 );
  1261. + x264_cavlc_mb_mvd( h, 0, 4*i+1, 1 );
  1262. break;
  1263. case D_L0_4x4:
  1264. - cavlc_mb_mvd( h, 0, 4*i+0, 1 );
  1265. - cavlc_mb_mvd( h, 0, 4*i+1, 1 );
  1266. - cavlc_mb_mvd( h, 0, 4*i+2, 1 );
  1267. - cavlc_mb_mvd( h, 0, 4*i+3, 1 );
  1268. + x264_cavlc_mb_mvd( h, 0, 4*i+0, 1 );
  1269. + x264_cavlc_mb_mvd( h, 0, 4*i+1, 1 );
  1270. + x264_cavlc_mb_mvd( h, 0, 4*i+2, 1 );
  1271. + x264_cavlc_mb_mvd( h, 0, 4*i+3, 1 );
  1272. break;
  1273. }
  1274. }
  1275. @@ -372,7 +372,7 @@ void x264_macroblock_write_cavlc( x264_t *h )
  1276.  
  1277. if( h->mb.pic.i_fref[0] > 1 )
  1278. bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
  1279. - cavlc_mb_mvd( h, 0, 0, 4 );
  1280. + x264_cavlc_mb_mvd( h, 0, 0, 4 );
  1281. }
  1282. else if( h->mb.i_partition == D_16x8 )
  1283. {
  1284. @@ -382,8 +382,8 @@ void x264_macroblock_write_cavlc( x264_t *h )
  1285. bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
  1286. bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] );
  1287. }
  1288. - cavlc_mb_mvd( h, 0, 0, 4 );
  1289. - cavlc_mb_mvd( h, 0, 8, 4 );
  1290. + x264_cavlc_mb_mvd( h, 0, 0, 4 );
  1291. + x264_cavlc_mb_mvd( h, 0, 8, 4 );
  1292. }
  1293. else if( h->mb.i_partition == D_8x16 )
  1294. {
  1295. @@ -393,8 +393,8 @@ void x264_macroblock_write_cavlc( x264_t *h )
  1296. bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
  1297. bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[4]] );
  1298. }
  1299. - cavlc_mb_mvd( h, 0, 0, 2 );
  1300. - cavlc_mb_mvd( h, 0, 4, 2 );
  1301. + x264_cavlc_mb_mvd( h, 0, 0, 2 );
  1302. + x264_cavlc_mb_mvd( h, 0, 4, 2 );
  1303. }
  1304. }
  1305. else if( i_mb_type == P_8x8 )
  1306. @@ -429,7 +429,7 @@ void x264_macroblock_write_cavlc( x264_t *h )
  1307. }
  1308.  
  1309. for( i = 0; i < 4; i++ )
  1310. - cavlc_mb8x8_mvd( h, i );
  1311. + x264_cavlc_mb8x8_mvd( h, i );
  1312. }
  1313. else if( i_mb_type == B_8x8 )
  1314. {
  1315. @@ -452,10 +452,10 @@ void x264_macroblock_write_cavlc( x264_t *h )
  1316. /* mvd */
  1317. for( i = 0; i < 4; i++ )
  1318. if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
  1319. - cavlc_mb_mvd( h, 0, 4*i, 2 );
  1320. + x264_cavlc_mb_mvd( h, 0, 4*i, 2 );
  1321. for( i = 0; i < 4; i++ )
  1322. if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
  1323. - cavlc_mb_mvd( h, 1, 4*i, 2 );
  1324. + x264_cavlc_mb_mvd( h, 1, 4*i, 2 );
  1325. }
  1326. else if( i_mb_type != B_DIRECT )
  1327. {
  1328. @@ -470,8 +470,8 @@ void x264_macroblock_write_cavlc( x264_t *h )
  1329. {
  1330. if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[0]] );
  1331. if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[0]] );
  1332. - if( b_list[0][0] ) cavlc_mb_mvd( h, 0, 0, 4 );
  1333. - if( b_list[1][0] ) cavlc_mb_mvd( h, 1, 0, 4 );
  1334. + if( b_list[0][0] ) x264_cavlc_mb_mvd( h, 0, 0, 4 );
  1335. + if( b_list[1][0] ) x264_cavlc_mb_mvd( h, 1, 0, 4 );
  1336. }
  1337. else
  1338. {
  1339. @@ -481,17 +481,17 @@ void x264_macroblock_write_cavlc( x264_t *h )
  1340. if( i_ref1_max && b_list[1][1] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[12]] );
  1341. if( h->mb.i_partition == D_16x8 )
  1342. {
  1343. - if( b_list[0][0] ) cavlc_mb_mvd( h, 0, 0, 4 );
  1344. - if( b_list[0][1] ) cavlc_mb_mvd( h, 0, 8, 4 );
  1345. - if( b_list[1][0] ) cavlc_mb_mvd( h, 1, 0, 4 );
  1346. - if( b_list[1][1] ) cavlc_mb_mvd( h, 1, 8, 4 );
  1347. + if( b_list[0][0] ) x264_cavlc_mb_mvd( h, 0, 0, 4 );
  1348. + if( b_list[0][1] ) x264_cavlc_mb_mvd( h, 0, 8, 4 );
  1349. + if( b_list[1][0] ) x264_cavlc_mb_mvd( h, 1, 0, 4 );
  1350. + if( b_list[1][1] ) x264_cavlc_mb_mvd( h, 1, 8, 4 );
  1351. }
  1352. else //if( h->mb.i_partition == D_8x16 )
  1353. {
  1354. - if( b_list[0][0] ) cavlc_mb_mvd( h, 0, 0, 2 );
  1355. - if( b_list[0][1] ) cavlc_mb_mvd( h, 0, 4, 2 );
  1356. - if( b_list[1][0] ) cavlc_mb_mvd( h, 1, 0, 2 );
  1357. - if( b_list[1][1] ) cavlc_mb_mvd( h, 1, 4, 2 );
  1358. + if( b_list[0][0] ) x264_cavlc_mb_mvd( h, 0, 0, 2 );
  1359. + if( b_list[0][1] ) x264_cavlc_mb_mvd( h, 0, 4, 2 );
  1360. + if( b_list[1][0] ) x264_cavlc_mb_mvd( h, 1, 0, 2 );
  1361. + if( b_list[1][1] ) x264_cavlc_mb_mvd( h, 1, 4, 2 );
  1362. }
  1363. }
  1364. }
  1365. @@ -514,31 +514,31 @@ void x264_macroblock_write_cavlc( x264_t *h )
  1366. bs_write1( s, h->mb.b_transform_8x8 );
  1367.  
  1368. /* write residual */
  1369. - if( i_mb_type == I_16x16 )
  1370. + if( h->mb.i_cbp_luma || h->mb.i_cbp_chroma || i_mb_type == I_16x16 )
  1371. {
  1372. - cavlc_qp_delta( h );
  1373. + x264_cavlc_mb_qp_delta( h );
  1374.  
  1375. - /* DC Luma */
  1376. - block_residual_write_cavlc( h, DCT_LUMA_DC, 24 , h->dct.luma16x16_dc );
  1377. + if( i_mb_type == I_16x16 )
  1378. + {
  1379. + /* DC Luma */
  1380. + block_residual_write_cavlc( h, DCT_LUMA_DC, 24 , h->dct.luma16x16_dc );
  1381.  
  1382. - /* AC Luma */
  1383. - if( h->mb.i_cbp_luma )
  1384. - for( i = 0; i < 16; i++ )
  1385. - block_residual_write_cavlc( h, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1 );
  1386. - }
  1387. - else if( h->mb.i_cbp_luma | h->mb.i_cbp_chroma )
  1388. - {
  1389. - cavlc_qp_delta( h );
  1390. - x264_macroblock_luma_write_cavlc( h, 0, 3 );
  1391. - }
  1392. - if( h->mb.i_cbp_chroma )
  1393. - {
  1394. - /* Chroma DC residual present */
  1395. - block_residual_write_cavlc( h, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0] );
  1396. - block_residual_write_cavlc( h, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1] );
  1397. - if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
  1398. - for( i = 16; i < 24; i++ )
  1399. - block_residual_write_cavlc( h, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 );
  1400. + /* AC Luma */
  1401. + if( h->mb.i_cbp_luma )
  1402. + for( i = 0; i < 16; i++ )
  1403. + block_residual_write_cavlc( h, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1 );
  1404. + }
  1405. + else
  1406. + x264_macroblock_luma_write_cavlc( h, 0, 3 );
  1407. +
  1408. + if( h->mb.i_cbp_chroma ) /* Chroma DC residual present */
  1409. + {
  1410. + block_residual_write_cavlc( h, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0] );
  1411. + block_residual_write_cavlc( h, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1] );
  1412. + if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
  1413. + for( i = 16; i < 24; i++ )
  1414. + block_residual_write_cavlc( h, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 );
  1415. + }
  1416. }
  1417.  
  1418. #if !RDO_SKIP_BS
  1419. @@ -563,22 +563,22 @@ static int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel )
  1420.  
  1421. if( i_mb_type == P_8x8 )
  1422. {
  1423. - cavlc_mb8x8_mvd( h, i8 );
  1424. + x264_cavlc_mb8x8_mvd( h, i8 );
  1425. bs_write_ue( s, sub_mb_type_p_to_golomb[ h->mb.i_sub_partition[i8] ] );
  1426. }
  1427. else if( i_mb_type == P_L0 )
  1428. - cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 );
  1429. + x264_cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 );
  1430. else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 )
  1431. {
  1432. - if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 );
  1433. - if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) cavlc_mb_mvd( h, 1, 4*i8, 4>>b_8x16 );
  1434. + if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) x264_cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 );
  1435. + if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) x264_cavlc_mb_mvd( h, 1, 4*i8, 4>>b_8x16 );
  1436. }
  1437. else //if( i_mb_type == B_8x8 )
  1438. {
  1439. if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] )
  1440. - cavlc_mb_mvd( h, 0, 4*i8, 2 );
  1441. + x264_cavlc_mb_mvd( h, 0, 4*i8, 2 );
  1442. if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] )
  1443. - cavlc_mb_mvd( h, 1, 4*i8, 2 );
  1444. + x264_cavlc_mb_mvd( h, 1, 4*i8, 2 );
  1445. }
  1446.  
  1447. for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- )
  1448. @@ -596,7 +596,7 @@ static int x264_subpartition_size_cavlc( x264_t *h, int i4, int i_pixel )
  1449. {
  1450. int b_8x4 = i_pixel == PIXEL_8x4;
  1451. h->out.bs.i_bits_encoded = 0;
  1452. - cavlc_mb_mvd( h, 0, i4, 1+b_8x4 );
  1453. + x264_cavlc_mb_mvd( h, 0, i4, 1+b_8x4 );
  1454. block_residual_write_cavlc( h, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4] );
  1455. if( i_pixel != PIXEL_4x4 )
  1456. {
  1457. --
  1458. 1.6.1.2
  1459.  
  1460.  
  1461. From e494167d136a8a8cd044c5a555ecc1311c90effc Mon Sep 17 00:00:00 2001
  1462. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  1463. Date: Wed, 3 Feb 2010 18:19:29 -0800
  1464. Subject: [PATCH 05/24] Simplify decimate checks in macroblock_encode
  1465. Also fix a misleading comment.
  1466.  
  1467. ---
  1468. common/common.h | 1 +
  1469. encoder/analyse.c | 2 ++
  1470. encoder/macroblock.c | 12 +++++-------
  1471. 3 files changed, 8 insertions(+), 7 deletions(-)
  1472.  
  1473. diff --git a/common/common.h b/common/common.h
  1474. index 950f48f..8b1b05a 100644
  1475. --- a/common/common.h
  1476. +++ b/common/common.h
  1477. @@ -484,6 +484,7 @@ struct x264_t
  1478. int b_chroma_me;
  1479. int b_trellis;
  1480. int b_noise_reduction;
  1481. + int b_dct_decimate;
  1482. int i_psy_rd; /* Psy RD strength--fixed point value*/
  1483. int i_psy_trellis; /* Psy trellis strength--fixed point value*/
  1484.  
  1485. diff --git a/encoder/analyse.c b/encoder/analyse.c
  1486. index 1fb2206..92d6584 100644
  1487. --- a/encoder/analyse.c
  1488. +++ b/encoder/analyse.c
  1489. @@ -364,6 +364,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
  1490. h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
  1491. h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
  1492. && h->mb.i_subpel_refine >= 5;
  1493. + h->mb.b_dct_decimate = h->sh.i_type == SLICE_TYPE_B ||
  1494. + (h->param.analyse.b_dct_decimate && h->sh.i_type != SLICE_TYPE_I);
  1495.  
  1496. h->mb.b_transform_8x8 = 0;
  1497. h->mb.b_noise_reduction = 0;
  1498. diff --git a/encoder/macroblock.c b/encoder/macroblock.c
  1499. index e4edb8a..fa7942d 100644
  1500. --- a/encoder/macroblock.c
  1501. +++ b/encoder/macroblock.c
  1502. @@ -208,8 +208,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
  1503. ALIGNED_ARRAY_16( int16_t, dct_dc4x4,[16] );
  1504.  
  1505. int i, nz;
  1506. - int b_decimate = h->sh.i_type == SLICE_TYPE_B || (h->param.analyse.b_dct_decimate && h->sh.i_type == SLICE_TYPE_P);
  1507. - int decimate_score = b_decimate ? 0 : 9;
  1508. + int decimate_score = h->mb.b_dct_decimate ? 0 : 9;
  1509.  
  1510. if( h->mb.b_lossless )
  1511. {
  1512. @@ -342,7 +341,7 @@ static inline int x264_mb_optimize_chroma_dc( x264_t *h, int b_inter, int i_qp,
  1513. void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
  1514. {
  1515. int i, ch, nz, nz_dc;
  1516. - int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate);
  1517. + int b_decimate = b_inter && h->mb.b_dct_decimate;
  1518. ALIGNED_ARRAY_16( int16_t, dct2x2,[4] );
  1519. h->mb.i_cbp_chroma = 0;
  1520.  
  1521. @@ -607,7 +606,7 @@ void x264_macroblock_encode( x264_t *h )
  1522. {
  1523. int i_cbp_dc = 0;
  1524. int i_qp = h->mb.i_qp;
  1525. - int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate;
  1526. + int b_decimate = h->mb.b_dct_decimate;
  1527. int b_force_no_skip = 0;
  1528. int i,idx,nz;
  1529. h->mb.i_cbp_luma = 0;
  1530. @@ -914,8 +913,7 @@ void x264_macroblock_encode( x264_t *h )
  1531.  
  1532. /*****************************************************************************
  1533. * x264_macroblock_probe_skip:
  1534. - * Check if the current MB could be encoded as a [PB]_SKIP (it supposes you use
  1535. - * the previous QP
  1536. + * Check if the current MB could be encoded as a [PB]_SKIP
  1537. *****************************************************************************/
  1538. int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
  1539. {
  1540. @@ -1052,7 +1050,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
  1541. int i_qp = h->mb.i_qp;
  1542. uint8_t *p_fenc = h->mb.pic.p_fenc[0] + (i8&1)*8 + (i8>>1)*8*FENC_STRIDE;
  1543. uint8_t *p_fdec = h->mb.pic.p_fdec[0] + (i8&1)*8 + (i8>>1)*8*FDEC_STRIDE;
  1544. - int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate;
  1545. + int b_decimate = h->mb.b_dct_decimate;
  1546. int nnz8x8 = 0;
  1547. int ch, nz;
  1548.  
  1549. --
  1550. 1.6.1.2
  1551.  
  1552.  
  1553. From d80fa482a6f99b0d0bb59fdace6ef5cdbd67b98e Mon Sep 17 00:00:00 2001
  1554. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  1555. Date: Wed, 3 Feb 2010 18:36:44 -0800
  1556. Subject: [PATCH 06/24] Fix subpel iteration counts with B-frame analysis and subme 6/8
  1557. Since subme 6 means "like subme 5, except RD on P-frames", B-frame analysis
  1558. shouldn't use the RD subpel counts at subme 6. Similarly with subme 8.
  1559. Slightly faster (and very marginally worse) compression at subme 6 and 8.
  1560.  
  1561. ---
  1562. encoder/analyse.c | 2 ++
  1563. 1 files changed, 2 insertions(+), 0 deletions(-)
  1564.  
  1565. diff --git a/encoder/analyse.c b/encoder/analyse.c
  1566. index 92d6584..c15bf8f 100644
  1567. --- a/encoder/analyse.c
  1568. +++ b/encoder/analyse.c
  1569. @@ -362,6 +362,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
  1570.  
  1571. h->mb.i_me_method = h->param.analyse.i_me_method;
  1572. h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
  1573. + if( h->sh.i_type == SLICE_TYPE_B && (h->mb.i_subpel_refine == 6 || h->mb.i_subpel_refine == 8) )
  1574. + h->mb.i_subpel_refine--;
  1575. h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
  1576. && h->mb.i_subpel_refine >= 5;
  1577. h->mb.b_dct_decimate = h->sh.i_type == SLICE_TYPE_B ||
  1578. --
  1579. 1.6.1.2
  1580.  
  1581.  
  1582. From 34b59c92d298b0fb58130d8601d053bfea1c870a Mon Sep 17 00:00:00 2001
  1583. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  1584. Date: Wed, 3 Feb 2010 20:01:16 -0800
  1585. Subject: [PATCH 07/24] Smarter QPRD
  1586. Catch some cases in which RD checks can be avoided; reduces QPRD RD calls by 10-20%.
  1587.  
  1588. ---
  1589. encoder/analyse.c | 42 ++++++++++++++++++++++++++++++++++++++----
  1590. 1 files changed, 38 insertions(+), 4 deletions(-)
  1591.  
  1592. diff --git a/encoder/analyse.c b/encoder/analyse.c
  1593. index c15bf8f..53ca025 100644
  1594. --- a/encoder/analyse.c
  1595. +++ b/encoder/analyse.c
  1596. @@ -2307,9 +2307,10 @@ static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
  1597. int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
  1598. int last_qp_tried = 0;
  1599. origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
  1600. + int origcbp = h->mb.cbp[h->mb.i_mb_xy];
  1601.  
  1602. /* If CBP is already zero, don't raise the quantizer any higher. */
  1603. - for( direction = h->mb.cbp[h->mb.i_mb_xy] ? 1 : -1; direction >= -1; direction-=2 )
  1604. + for( direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
  1605. {
  1606. /* Without psy-RD, require monotonicity when moving quant away from previous
  1607. * macroblock's quant; allow 1 failure when moving quant towards previous quant.
  1608. @@ -2324,14 +2325,47 @@ static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
  1609. h->mb.i_qp = orig_qp;
  1610. failures = 0;
  1611. prevcost = origcost;
  1612. +
  1613. + /* If the current QP results in an empty CBP, it's highly likely that lower QPs
  1614. + * (up to a point) will too. So, jump down to where the threshold will kick in
  1615. + * and check the QP there. If the CBP is still empty, skip the main loop.
  1616. + * If it isn't empty, we would have ended up having to check this QP anyways,
  1617. + * so as long as we store it for later lookup, we lose nothing. */
  1618. + int already_checked_qp = -1;
  1619. + int already_checked_cost = COST_MAX;
  1620. + if( direction == -1 )
  1621. + {
  1622. + if( !origcbp )
  1623. + {
  1624. + h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, h->param.rc.i_qp_min );
  1625. + h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
  1626. + already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
  1627. + if( !h->mb.cbp[h->mb.i_mb_xy] )
  1628. + {
  1629. + /* If our empty-CBP block is lower QP than the last QP,
  1630. + * the last QP cannot possibly have a CBP either. */
  1631. + if( h->mb.i_last_qp > h->mb.i_qp )
  1632. + last_qp_tried = 1;
  1633. + break;
  1634. + }
  1635. + already_checked_qp = h->mb.i_qp;
  1636. + h->mb.i_qp = orig_qp;
  1637. + }
  1638. + }
  1639. +
  1640. h->mb.i_qp += direction;
  1641. while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
  1642. {
  1643. if( h->mb.i_last_qp == h->mb.i_qp )
  1644. last_qp_tried = 1;
  1645. - h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
  1646. - cost = x264_rd_cost_mb( h, a->i_lambda2 );
  1647. - COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
  1648. + if( h->mb.i_qp == already_checked_qp )
  1649. + cost = already_checked_cost;
  1650. + else
  1651. + {
  1652. + h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
  1653. + cost = x264_rd_cost_mb( h, a->i_lambda2 );
  1654. + COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
  1655. + }
  1656.  
  1657. /* We can't assume that the costs are monotonic over QPs.
  1658. * Tie case-as-failure seems to give better results. */
  1659. --
  1660. 1.6.1.2
  1661.  
  1662.  
  1663. From aa56cb41947ed2e737090b7f22ab54a323b6fa0b Mon Sep 17 00:00:00 2001
  1664. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  1665. Date: Wed, 3 Feb 2010 20:27:57 -0800
  1666. Subject: [PATCH 08/24] Fix 2-pass ratecontrol continuation in case of missing statsfile
  1667. Didn't work properly if MB-tree was enabled.
  1668.  
  1669. ---
  1670. encoder/ratecontrol.c | 1 +
  1671. 1 files changed, 1 insertions(+), 0 deletions(-)
  1672.  
  1673. diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
  1674. index 52196e7..e314ba2 100644
  1675. --- a/encoder/ratecontrol.c
  1676. +++ b/encoder/ratecontrol.c
  1677. @@ -1280,6 +1280,7 @@ int x264_ratecontrol_slice_type( x264_t *h, int frame_num )
  1678. h->thread[i]->param.rc.b_stat_read = 0;
  1679. h->thread[i]->param.i_bframe_adaptive = 0;
  1680. h->thread[i]->param.i_scenecut_threshold = 0;
  1681. + h->thread[i]->param.rc.b_mb_tree = 0;
  1682. if( h->thread[i]->param.i_bframe > 1 )
  1683. h->thread[i]->param.i_bframe = 1;
  1684. }
  1685. --
  1686. 1.6.1.2
  1687.  
  1688.  
  1689. From 479d312c2512244bf81a82b90815087a7e694f5e Mon Sep 17 00:00:00 2001
  1690. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  1691. Date: Fri, 5 Feb 2010 16:15:23 -0800
  1692. Subject: [PATCH 09/24] Various CABAC/CAVLC cleanups/speedups
  1693. Make some if/else chains into switch statements.
  1694. Store CBP data in x264_t and only move it to frame storage later.
  1695. This saves a wasted cache line and some unnecessary dereferences in RDO.
  1696.  
  1697. ---
  1698. common/common.h | 1 +
  1699. common/macroblock.c | 3 +-
  1700. encoder/analyse.c | 8 +-
  1701. encoder/cabac.c | 40 +++---
  1702. encoder/cavlc.c | 365 ++++++++++++++++++++++++++------------------------
  1703. encoder/macroblock.c | 19 +--
  1704. 6 files changed, 219 insertions(+), 217 deletions(-)
  1705.  
  1706. diff --git a/common/common.h b/common/common.h
  1707. index 8b1b05a..d4a8dd9 100644
  1708. --- a/common/common.h
  1709. +++ b/common/common.h
  1710. @@ -542,6 +542,7 @@ struct x264_t
  1711. ALIGNED_4( uint8_t i_sub_partition[4] );
  1712. int b_transform_8x8;
  1713.  
  1714. + int i_cbp_combined;
  1715. int i_cbp_luma;
  1716. int i_cbp_chroma;
  1717.  
  1718. diff --git a/common/macroblock.c b/common/macroblock.c
  1719. index 10f09ac..d86f3af 100644
  1720. --- a/common/macroblock.c
  1721. +++ b/common/macroblock.c
  1722. @@ -1343,11 +1343,12 @@ void x264_macroblock_cache_save( x264_t *h )
  1723. M16( &non_zero_count[16+2*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+2*2]-1] ) >> 8;
  1724. M16( &non_zero_count[16+3*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+3*2]-1] ) >> 8;
  1725.  
  1726. - if( h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0 )
  1727. + if( h->mb.i_type != I_16x16 && !h->mb.i_cbp_combined )
  1728. h->mb.i_qp = h->mb.i_last_qp;
  1729. h->mb.qp[i_mb_xy] = h->mb.i_qp;
  1730. h->mb.i_last_dqp = h->mb.i_qp - h->mb.i_last_qp;
  1731. h->mb.i_last_qp = h->mb.i_qp;
  1732. + h->mb.cbp[i_mb_xy] = h->mb.i_cbp_combined;
  1733. }
  1734.  
  1735. if( h->mb.i_cbp_luma == 0 && h->mb.i_type != I_8x8 )
  1736. diff --git a/encoder/analyse.c b/encoder/analyse.c
  1737. index 53ca025..4f3f35f 100644
  1738. --- a/encoder/analyse.c
  1739. +++ b/encoder/analyse.c
  1740. @@ -1199,7 +1199,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
  1741. h->mb.i_partition = D_16x16;
  1742. x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
  1743. a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
  1744. - if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
  1745. + if( !h->mb.i_cbp_combined )
  1746. h->mb.i_type = P_SKIP;
  1747. }
  1748. }
  1749. @@ -2307,7 +2307,7 @@ static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
  1750. int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
  1751. int last_qp_tried = 0;
  1752. origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
  1753. - int origcbp = h->mb.cbp[h->mb.i_mb_xy];
  1754. + int origcbp = h->mb.i_cbp_combined;
  1755.  
  1756. /* If CBP is already zero, don't raise the quantizer any higher. */
  1757. for( direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
  1758. @@ -2340,7 +2340,7 @@ static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
  1759. h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, h->param.rc.i_qp_min );
  1760. h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
  1761. already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
  1762. - if( !h->mb.cbp[h->mb.i_mb_xy] )
  1763. + if( !h->mb.i_cbp_combined )
  1764. {
  1765. /* If our empty-CBP block is lower QP than the last QP,
  1766. * the last QP cannot possibly have a CBP either. */
  1767. @@ -2377,7 +2377,7 @@ static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
  1768.  
  1769. if( failures > threshold )
  1770. break;
  1771. - if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
  1772. + if( direction == 1 && !h->mb.i_cbp_combined )
  1773. break;
  1774. h->mb.i_qp += direction;
  1775. }
  1776. diff --git a/encoder/cabac.c b/encoder/cabac.c
  1777. index 6ff2aed..6c14722 100644
  1778. --- a/encoder/cabac.c
  1779. +++ b/encoder/cabac.c
  1780. @@ -107,7 +107,7 @@ static void x264_cabac_mb_qp_delta( x264_t *h, x264_cabac_t *cb )
  1781. int i_dqp = h->mb.i_qp - h->mb.i_last_qp;
  1782.  
  1783. /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely flat background area */
  1784. - if( h->mb.i_type == I_16x16 && !h->mb.cbp[h->mb.i_mb_xy] )
  1785. + if( h->mb.i_type == I_16x16 && !h->mb.i_cbp_combined )
  1786. {
  1787. #if !RDO_SKIP_BS
  1788. h->mb.i_qp = h->mb.i_last_qp;
  1789. @@ -915,7 +915,7 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
  1790. if( x264_mb_transform_8x8_allowed( h ) && h->mb.i_cbp_luma )
  1791. x264_cabac_mb_transform_size( h, cb );
  1792.  
  1793. - if( h->mb.i_cbp_luma || h->mb.i_cbp_chroma || i_mb_type == I_16x16 )
  1794. + if( h->mb.i_cbp_combined || i_mb_type == I_16x16 )
  1795. {
  1796. const int b_intra = IS_INTRA( i_mb_type );
  1797. x264_cabac_mb_qp_delta( h, cb );
  1798. @@ -973,24 +973,24 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
  1799. int b_8x16 = h->mb.i_partition == D_8x16;
  1800. int j;
  1801.  
  1802. - if( i_mb_type == P_8x8 )
  1803. + switch( i_mb_type )
  1804. {
  1805. - x264_cabac_mb8x8_mvd( h, cb, i8 );
  1806. - x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[i8] );
  1807. - }
  1808. - else if( i_mb_type == P_L0 )
  1809. - x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 );
  1810. - else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 )
  1811. - {
  1812. - if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 );
  1813. - if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) x264_cabac_mb_mvd( h, cb, 1, 4*i8, 4>>b_8x16, 2<<b_8x16 );
  1814. - }
  1815. - else //if( i_mb_type == B_8x8 )
  1816. - {
  1817. - if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] )
  1818. - x264_cabac_mb_mvd( h, cb, 0, 4*i8, 2, 2 );
  1819. - if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] )
  1820. - x264_cabac_mb_mvd( h, cb, 1, 4*i8, 2, 2 );
  1821. + case P_L0:
  1822. + x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 );
  1823. + break;
  1824. + case P_8x8:
  1825. + x264_cabac_mb8x8_mvd( h, cb, i8 );
  1826. + x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[i8] );
  1827. + break;
  1828. + case B_8x8:
  1829. + if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] )
  1830. + x264_cabac_mb_mvd( h, cb, 0, 4*i8, 2, 2 );
  1831. + if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] )
  1832. + x264_cabac_mb_mvd( h, cb, 1, 4*i8, 2, 2 );
  1833. + break;
  1834. + default: /* Rest of the B types */
  1835. + if( x264_mb_type_list_table[i_mb_type][0][!!i8] ) x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 );
  1836. + if( x264_mb_type_list_table[i_mb_type][1][!!i8] ) x264_cabac_mb_mvd( h, cb, 1, 4*i8, 4>>b_8x16, 2<<b_8x16 );
  1837. }
  1838.  
  1839. for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- )
  1840. @@ -1019,9 +1019,7 @@ static void x264_subpartition_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, i
  1841. int b_8x4 = i_pixel == PIXEL_8x4;
  1842. block_residual_write_cabac_cbf( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 0 );
  1843. if( i_pixel == PIXEL_4x4 )
  1844. - {
  1845. x264_cabac_mb_mvd( h, cb, 0, i4, 1, 1 );
  1846. - }
  1847. else
  1848. {
  1849. x264_cabac_mb_mvd( h, cb, 0, i4, 1+b_8x4, 2-b_8x4 );
  1850. diff --git a/encoder/cavlc.c b/encoder/cavlc.c
  1851. index d18408b..45b55fe 100644
  1852. --- a/encoder/cavlc.c
  1853. +++ b/encoder/cavlc.c
  1854. @@ -209,8 +209,7 @@ static void x264_cavlc_mb_qp_delta( x264_t *h )
  1855. int i_dqp = h->mb.i_qp - h->mb.i_last_qp;
  1856.  
  1857. /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely flat background area */
  1858. - if( h->mb.i_type == I_16x16 && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma)
  1859. - && !h->mb.cache.non_zero_count[x264_scan8[24]] )
  1860. + if( h->mb.i_type == I_16x16 && !h->mb.i_cbp_combined )
  1861. {
  1862. #if !RDO_SKIP_BS
  1863. h->mb.i_qp = h->mb.i_last_qp;
  1864. @@ -302,201 +301,209 @@ void x264_macroblock_write_cavlc( x264_t *h )
  1865. bs_write1( s, h->mb.b_interlaced );
  1866. }
  1867.  
  1868. -#if !RDO_SKIP_BS
  1869. - if( i_mb_type == I_PCM )
  1870. - {
  1871. - uint8_t *p_start = s->p_start;
  1872. - bs_write_ue( s, i_mb_i_offset + 25 );
  1873. - i_mb_pos_tex = bs_pos( s );
  1874. - h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
  1875. -
  1876. - bs_align_0( s );
  1877. -
  1878. - memcpy( s->p, h->mb.pic.p_fenc[0], 256 );
  1879. - s->p += 256;
  1880. - for( i = 0; i < 8; i++ )
  1881. - memcpy( s->p + i*8, h->mb.pic.p_fenc[1] + i*FENC_STRIDE, 8 );
  1882. - s->p += 64;
  1883. - for( i = 0; i < 8; i++ )
  1884. - memcpy( s->p + i*8, h->mb.pic.p_fenc[2] + i*FENC_STRIDE, 8 );
  1885. - s->p += 64;
  1886. -
  1887. - bs_init( s, s->p, s->p_end - s->p );
  1888. - s->p_start = p_start;
  1889. -
  1890. - /* if PCM is chosen, we need to store reconstructed frame data */
  1891. - h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE, 16 );
  1892. - h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, 8 );
  1893. - h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, 8 );
  1894. -
  1895. - h->stat.frame.i_tex_bits += bs_pos(s) - i_mb_pos_tex;
  1896. - return;
  1897. - }
  1898. -#endif
  1899. -
  1900. /* Write:
  1901. - type
  1902. - prediction
  1903. - mv */
  1904. - if( i_mb_type == I_4x4 || i_mb_type == I_8x8 )
  1905. + switch( i_mb_type )
  1906. {
  1907. - int di = i_mb_type == I_8x8 ? 4 : 1;
  1908. - bs_write_ue( s, i_mb_i_offset + 0 );
  1909. - if( h->pps->b_transform_8x8_mode )
  1910. - bs_write1( s, h->mb.b_transform_8x8 );
  1911. -
  1912. - /* Prediction: Luma */
  1913. - for( i = 0; i < 16; i += di )
  1914. + case I_4x4:
  1915. + case I_8x8:
  1916. {
  1917. - int i_pred = x264_mb_predict_intra4x4_mode( h, i );
  1918. - int i_mode = x264_mb_pred_mode4x4_fix( h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] );
  1919. + int di = i_mb_type == I_8x8 ? 4 : 1;
  1920. + bs_write_ue( s, i_mb_i_offset + 0 );
  1921. + if( h->pps->b_transform_8x8_mode )
  1922. + bs_write1( s, h->mb.b_transform_8x8 );
  1923.  
  1924. - if( i_pred == i_mode )
  1925. - bs_write1( s, 1 ); /* b_prev_intra4x4_pred_mode */
  1926. - else
  1927. - bs_write( s, 4, i_mode - (i_mode > i_pred) );
  1928. + /* Prediction: Luma */
  1929. + for( i = 0; i < 16; i += di )
  1930. + {
  1931. + int i_pred = x264_mb_predict_intra4x4_mode( h, i );
  1932. + int i_mode = x264_mb_pred_mode4x4_fix( h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] );
  1933. +
  1934. + if( i_pred == i_mode )
  1935. + bs_write1( s, 1 ); /* b_prev_intra4x4_pred_mode */
  1936. + else
  1937. + bs_write( s, 4, i_mode - (i_mode > i_pred) );
  1938. + }
  1939. + bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
  1940. + break;
  1941. + case I_16x16:
  1942. + bs_write_ue( s, i_mb_i_offset + 1 + x264_mb_pred_mode16x16_fix[h->mb.i_intra16x16_pred_mode] +
  1943. + h->mb.i_cbp_chroma * 4 + ( h->mb.i_cbp_luma == 0 ? 0 : 12 ) );
  1944. + bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
  1945. + break;
  1946. }
  1947. - bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
  1948. - }
  1949. - else if( i_mb_type == I_16x16 )
  1950. - {
  1951. - bs_write_ue( s, i_mb_i_offset + 1 + x264_mb_pred_mode16x16_fix[h->mb.i_intra16x16_pred_mode] +
  1952. - h->mb.i_cbp_chroma * 4 + ( h->mb.i_cbp_luma == 0 ? 0 : 12 ) );
  1953. - bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
  1954. - }
  1955. - else if( i_mb_type == P_L0 )
  1956. - {
  1957. - if( h->mb.i_partition == D_16x16 )
  1958. +#if !RDO_SKIP_BS
  1959. + case I_PCM:
  1960. {
  1961. - bs_write1( s, 1 );
  1962. -
  1963. - if( h->mb.pic.i_fref[0] > 1 )
  1964. - bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
  1965. - x264_cavlc_mb_mvd( h, 0, 0, 4 );
  1966. + uint8_t *p_start = s->p_start;
  1967. + bs_write_ue( s, i_mb_i_offset + 25 );
  1968. + i_mb_pos_tex = bs_pos( s );
  1969. + h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
  1970. +
  1971. + bs_align_0( s );
  1972. +
  1973. + memcpy( s->p, h->mb.pic.p_fenc[0], 256 );
  1974. + s->p += 256;
  1975. + for( i = 0; i < 8; i++ )
  1976. + memcpy( s->p + i*8, h->mb.pic.p_fenc[1] + i*FENC_STRIDE, 8 );
  1977. + s->p += 64;
  1978. + for( i = 0; i < 8; i++ )
  1979. + memcpy( s->p + i*8, h->mb.pic.p_fenc[2] + i*FENC_STRIDE, 8 );
  1980. + s->p += 64;
  1981. +
  1982. + bs_init( s, s->p, s->p_end - s->p );
  1983. + s->p_start = p_start;
  1984. +
  1985. + /* if PCM is chosen, we need to store reconstructed frame data */
  1986. + h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE, 16 );
  1987. + h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, 8 );
  1988. + h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, 8 );
  1989. +
  1990. + h->stat.frame.i_tex_bits += bs_pos(s) - i_mb_pos_tex;
  1991. + return;
  1992. }
  1993. - else if( h->mb.i_partition == D_16x8 )
  1994. +#endif
  1995. + case P_L0:
  1996. {
  1997. - bs_write_ue( s, 1 );
  1998. - if( h->mb.pic.i_fref[0] > 1 )
  1999. + if( h->mb.i_partition == D_16x16 )
  2000. {
  2001. - bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
  2002. - bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] );
  2003. + bs_write1( s, 1 );
  2004. +
  2005. + if( h->mb.pic.i_fref[0] > 1 )
  2006. + bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
  2007. + x264_cavlc_mb_mvd( h, 0, 0, 4 );
  2008. }
  2009. - x264_cavlc_mb_mvd( h, 0, 0, 4 );
  2010. - x264_cavlc_mb_mvd( h, 0, 8, 4 );
  2011. + else if( h->mb.i_partition == D_16x8 )
  2012. + {
  2013. + bs_write_ue( s, 1 );
  2014. + if( h->mb.pic.i_fref[0] > 1 )
  2015. + {
  2016. + bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
  2017. + bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] );
  2018. + }
  2019. + x264_cavlc_mb_mvd( h, 0, 0, 4 );
  2020. + x264_cavlc_mb_mvd( h, 0, 8, 4 );
  2021. + }
  2022. + else if( h->mb.i_partition == D_8x16 )
  2023. + {
  2024. + bs_write_ue( s, 2 );
  2025. + if( h->mb.pic.i_fref[0] > 1 )
  2026. + {
  2027. + bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
  2028. + bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[4]] );
  2029. + }
  2030. + x264_cavlc_mb_mvd( h, 0, 0, 2 );
  2031. + x264_cavlc_mb_mvd( h, 0, 4, 2 );
  2032. + }
  2033. + break;
  2034. }
  2035. - else if( h->mb.i_partition == D_8x16 )
  2036. + case P_8x8:
  2037. {
  2038. - bs_write_ue( s, 2 );
  2039. - if( h->mb.pic.i_fref[0] > 1 )
  2040. + int b_sub_ref;
  2041. + if( (h->mb.cache.ref[0][x264_scan8[0]] | h->mb.cache.ref[0][x264_scan8[ 4]] |
  2042. + h->mb.cache.ref[0][x264_scan8[8]] | h->mb.cache.ref[0][x264_scan8[12]]) == 0 )
  2043. + {
  2044. + bs_write_ue( s, 4 );
  2045. + b_sub_ref = 0;
  2046. + }
  2047. + else
  2048. + {
  2049. + bs_write_ue( s, 3 );
  2050. + b_sub_ref = 1;
  2051. + }
  2052. +
  2053. + /* sub mb type */
  2054. + if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
  2055. + for( i = 0; i < 4; i++ )
  2056. + bs_write_ue( s, sub_mb_type_p_to_golomb[ h->mb.i_sub_partition[i] ] );
  2057. + else
  2058. + bs_write( s, 4, 0xf );
  2059. +
  2060. + /* ref0 */
  2061. + if( b_sub_ref )
  2062. {
  2063. bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
  2064. bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[4]] );
  2065. + bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] );
  2066. + bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[12]] );
  2067. }
  2068. - x264_cavlc_mb_mvd( h, 0, 0, 2 );
  2069. - x264_cavlc_mb_mvd( h, 0, 4, 2 );
  2070. - }
  2071. - }
  2072. - else if( i_mb_type == P_8x8 )
  2073. - {
  2074. - int b_sub_ref;
  2075. - if( (h->mb.cache.ref[0][x264_scan8[0]] | h->mb.cache.ref[0][x264_scan8[ 4]] |
  2076. - h->mb.cache.ref[0][x264_scan8[8]] | h->mb.cache.ref[0][x264_scan8[12]]) == 0 )
  2077. - {
  2078. - bs_write_ue( s, 4 );
  2079. - b_sub_ref = 0;
  2080. - }
  2081. - else
  2082. - {
  2083. - bs_write_ue( s, 3 );
  2084. - b_sub_ref = 1;
  2085. - }
  2086.  
  2087. - /* sub mb type */
  2088. - if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
  2089. for( i = 0; i < 4; i++ )
  2090. - bs_write_ue( s, sub_mb_type_p_to_golomb[ h->mb.i_sub_partition[i] ] );
  2091. - else
  2092. - bs_write( s, 4, 0xf );
  2093. -
  2094. - /* ref0 */
  2095. - if( b_sub_ref )
  2096. - {
  2097. - bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
  2098. - bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[4]] );
  2099. - bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] );
  2100. - bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[12]] );
  2101. + x264_cavlc_mb8x8_mvd( h, i );
  2102. + break;
  2103. }
  2104. + case B_8x8:
  2105. + {
  2106. + bs_write_ue( s, 22 );
  2107.  
  2108. - for( i = 0; i < 4; i++ )
  2109. - x264_cavlc_mb8x8_mvd( h, i );
  2110. - }
  2111. - else if( i_mb_type == B_8x8 )
  2112. - {
  2113. - bs_write_ue( s, 22 );
  2114. -
  2115. - /* sub mb type */
  2116. - for( i = 0; i < 4; i++ )
  2117. - bs_write_ue( s, sub_mb_type_b_to_golomb[ h->mb.i_sub_partition[i] ] );
  2118. + /* sub mb type */
  2119. + for( i = 0; i < 4; i++ )
  2120. + bs_write_ue( s, sub_mb_type_b_to_golomb[ h->mb.i_sub_partition[i] ] );
  2121.  
  2122. - /* ref */
  2123. - if( h->mb.pic.i_fref[0] > 1 )
  2124. + /* ref */
  2125. + if( h->mb.pic.i_fref[0] > 1 )
  2126. + for( i = 0; i < 4; i++ )
  2127. + if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
  2128. + bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[i*4]] );
  2129. + if( h->mb.pic.i_fref[1] > 1 )
  2130. + for( i = 0; i < 4; i++ )
  2131. + if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
  2132. + bs_write_te( s, h->mb.pic.i_fref[1] - 1, h->mb.cache.ref[1][x264_scan8[i*4]] );
  2133. +
  2134. + /* mvd */
  2135. for( i = 0; i < 4; i++ )
  2136. if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
  2137. - bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[i*4]] );
  2138. - if( h->mb.pic.i_fref[1] > 1 )
  2139. + x264_cavlc_mb_mvd( h, 0, 4*i, 2 );
  2140. for( i = 0; i < 4; i++ )
  2141. if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
  2142. - bs_write_te( s, h->mb.pic.i_fref[1] - 1, h->mb.cache.ref[1][x264_scan8[i*4]] );
  2143. -
  2144. - /* mvd */
  2145. - for( i = 0; i < 4; i++ )
  2146. - if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
  2147. - x264_cavlc_mb_mvd( h, 0, 4*i, 2 );
  2148. - for( i = 0; i < 4; i++ )
  2149. - if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
  2150. - x264_cavlc_mb_mvd( h, 1, 4*i, 2 );
  2151. - }
  2152. - else if( i_mb_type != B_DIRECT )
  2153. - {
  2154. - /* All B mode */
  2155. - /* Motion Vector */
  2156. - const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type];
  2157. - const int i_ref0_max = h->mb.pic.i_fref[0] - 1;
  2158. - const int i_ref1_max = h->mb.pic.i_fref[1] - 1;
  2159. -
  2160. - bs_write_ue( s, mb_type_b_to_golomb[ h->mb.i_partition - D_16x8 ][ i_mb_type - B_L0_L0 ] );
  2161. - if( h->mb.i_partition == D_16x16 )
  2162. + x264_cavlc_mb_mvd( h, 1, 4*i, 2 );
  2163. + break;
  2164. + }
  2165. + case B_DIRECT:
  2166. {
  2167. - if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[0]] );
  2168. - if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[0]] );
  2169. - if( b_list[0][0] ) x264_cavlc_mb_mvd( h, 0, 0, 4 );
  2170. - if( b_list[1][0] ) x264_cavlc_mb_mvd( h, 1, 0, 4 );
  2171. + bs_write1( s, 1 );
  2172. + break;
  2173. }
  2174. - else
  2175. + default: /* Rest of the B types */
  2176. {
  2177. - if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[ 0]] );
  2178. - if( i_ref0_max && b_list[0][1] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[12]] );
  2179. - if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[ 0]] );
  2180. - if( i_ref1_max && b_list[1][1] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[12]] );
  2181. - if( h->mb.i_partition == D_16x8 )
  2182. + const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type];
  2183. + const int i_ref0_max = h->mb.pic.i_fref[0] - 1;
  2184. + const int i_ref1_max = h->mb.pic.i_fref[1] - 1;
  2185. +
  2186. + bs_write_ue( s, mb_type_b_to_golomb[ h->mb.i_partition - D_16x8 ][ i_mb_type - B_L0_L0 ] );
  2187. + if( h->mb.i_partition == D_16x16 )
  2188. {
  2189. + if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[0]] );
  2190. + if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[0]] );
  2191. if( b_list[0][0] ) x264_cavlc_mb_mvd( h, 0, 0, 4 );
  2192. - if( b_list[0][1] ) x264_cavlc_mb_mvd( h, 0, 8, 4 );
  2193. if( b_list[1][0] ) x264_cavlc_mb_mvd( h, 1, 0, 4 );
  2194. - if( b_list[1][1] ) x264_cavlc_mb_mvd( h, 1, 8, 4 );
  2195. }
  2196. - else //if( h->mb.i_partition == D_8x16 )
  2197. + else
  2198. {
  2199. - if( b_list[0][0] ) x264_cavlc_mb_mvd( h, 0, 0, 2 );
  2200. - if( b_list[0][1] ) x264_cavlc_mb_mvd( h, 0, 4, 2 );
  2201. - if( b_list[1][0] ) x264_cavlc_mb_mvd( h, 1, 0, 2 );
  2202. - if( b_list[1][1] ) x264_cavlc_mb_mvd( h, 1, 4, 2 );
  2203. + if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[ 0]] );
  2204. + if( i_ref0_max && b_list[0][1] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[12]] );
  2205. + if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[ 0]] );
  2206. + if( i_ref1_max && b_list[1][1] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[12]] );
  2207. + if( h->mb.i_partition == D_16x8 )
  2208. + {
  2209. + if( b_list[0][0] ) x264_cavlc_mb_mvd( h, 0, 0, 4 );
  2210. + if( b_list[0][1] ) x264_cavlc_mb_mvd( h, 0, 8, 4 );
  2211. + if( b_list[1][0] ) x264_cavlc_mb_mvd( h, 1, 0, 4 );
  2212. + if( b_list[1][1] ) x264_cavlc_mb_mvd( h, 1, 8, 4 );
  2213. + }
  2214. + else //if( h->mb.i_partition == D_8x16 )
  2215. + {
  2216. + if( b_list[0][0] ) x264_cavlc_mb_mvd( h, 0, 0, 2 );
  2217. + if( b_list[0][1] ) x264_cavlc_mb_mvd( h, 0, 4, 2 );
  2218. + if( b_list[1][0] ) x264_cavlc_mb_mvd( h, 1, 0, 2 );
  2219. + if( b_list[1][1] ) x264_cavlc_mb_mvd( h, 1, 4, 2 );
  2220. + }
  2221. }
  2222. + break;
  2223. }
  2224. }
  2225. - else //if( i_mb_type == B_DIRECT )
  2226. - bs_write1( s, 1 );
  2227.  
  2228. #if !RDO_SKIP_BS
  2229. i_mb_pos_tex = bs_pos( s );
  2230. @@ -505,16 +512,16 @@ void x264_macroblock_write_cavlc( x264_t *h )
  2231.  
  2232. /* Coded block patern */
  2233. if( i_mb_type == I_4x4 || i_mb_type == I_8x8 )
  2234. - bs_write_ue( s, intra4x4_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] );
  2235. + bs_write_ue( s, intra4x4_cbp_to_golomb[h->mb.i_cbp_combined&0x3f] );
  2236. else if( i_mb_type != I_16x16 )
  2237. - bs_write_ue( s, inter_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] );
  2238. + bs_write_ue( s, inter_cbp_to_golomb[h->mb.i_cbp_combined&0x3f] );
  2239.  
  2240. /* transform size 8x8 flag */
  2241. if( x264_mb_transform_8x8_allowed( h ) && h->mb.i_cbp_luma )
  2242. bs_write1( s, h->mb.b_transform_8x8 );
  2243.  
  2244. /* write residual */
  2245. - if( h->mb.i_cbp_luma || h->mb.i_cbp_chroma || i_mb_type == I_16x16 )
  2246. + if( h->mb.i_cbp_combined&0x3f || i_mb_type == I_16x16 )
  2247. {
  2248. x264_cavlc_mb_qp_delta( h );
  2249.  
  2250. @@ -561,24 +568,24 @@ static int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel )
  2251. int b_8x16 = h->mb.i_partition == D_8x16;
  2252. int j;
  2253.  
  2254. - if( i_mb_type == P_8x8 )
  2255. - {
  2256. - x264_cavlc_mb8x8_mvd( h, i8 );
  2257. - bs_write_ue( s, sub_mb_type_p_to_golomb[ h->mb.i_sub_partition[i8] ] );
  2258. - }
  2259. - else if( i_mb_type == P_L0 )
  2260. - x264_cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 );
  2261. - else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 )
  2262. + switch( i_mb_type )
  2263. {
  2264. - if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) x264_cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 );
  2265. - if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) x264_cavlc_mb_mvd( h, 1, 4*i8, 4>>b_8x16 );
  2266. - }
  2267. - else //if( i_mb_type == B_8x8 )
  2268. - {
  2269. - if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] )
  2270. - x264_cavlc_mb_mvd( h, 0, 4*i8, 2 );
  2271. - if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] )
  2272. - x264_cavlc_mb_mvd( h, 1, 4*i8, 2 );
  2273. + case P_L0:
  2274. + x264_cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 );
  2275. + break;
  2276. + case P_8x8:
  2277. + x264_cavlc_mb8x8_mvd( h, i8 );
  2278. + bs_write_ue( s, sub_mb_type_p_to_golomb[ h->mb.i_sub_partition[i8] ] );
  2279. + break;
  2280. + case B_8x8:
  2281. + if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] )
  2282. + x264_cavlc_mb_mvd( h, 0, 4*i8, 2 );
  2283. + if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] )
  2284. + x264_cavlc_mb_mvd( h, 1, 4*i8, 2 );
  2285. + break;
  2286. + default: /* Rest of the B types */
  2287. + if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) x264_cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 );
  2288. + if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) x264_cavlc_mb_mvd( h, 1, 4*i8, 4>>b_8x16 );
  2289. }
  2290.  
  2291. for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- )
  2292. @@ -618,6 +625,8 @@ static int cavlc_intra4x4_pred_size( x264_t *h, int i4, int i_mode )
  2293. static int x264_partition_i8x8_size_cavlc( x264_t *h, int i8, int i_mode )
  2294. {
  2295. h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, 4*i8, i_mode );
  2296. + /* We can't use h->mb.i_cbp_combined here because it's only calculated at the end of
  2297. + * x264_macroblock_encode(), which hasn't been called at this point. */
  2298. bs_write_ue( &h->out.bs, intra4x4_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] );
  2299. x264_macroblock_luma_write_cavlc( h, i8, i8 );
  2300. return h->out.bs.i_bits_encoded;
  2301. diff --git a/encoder/macroblock.c b/encoder/macroblock.c
  2302. index fa7942d..f5f6267 100644
  2303. --- a/encoder/macroblock.c
  2304. +++ b/encoder/macroblock.c
  2305. @@ -488,7 +488,7 @@ static void x264_macroblock_encode_skip( x264_t *h )
  2306. h->mb.i_cbp_chroma = 0x00;
  2307. memset( h->mb.cache.non_zero_count, 0, X264_SCAN8_SIZE );
  2308. /* store cbp */
  2309. - h->mb.cbp[h->mb.i_mb_xy] = 0;
  2310. + h->mb.i_cbp_combined = 0;
  2311. }
  2312.  
  2313. /*****************************************************************************
  2314. @@ -604,7 +604,6 @@ void x264_predict_lossless_16x16( x264_t *h, int i_mode )
  2315. *****************************************************************************/
  2316. void x264_macroblock_encode( x264_t *h )
  2317. {
  2318. - int i_cbp_dc = 0;
  2319. int i_qp = h->mb.i_qp;
  2320. int b_decimate = h->mb.b_dct_decimate;
  2321. int b_force_no_skip = 0;
  2322. @@ -880,34 +879,28 @@ void x264_macroblock_encode( x264_t *h )
  2323. /* encode the 8x8 blocks */
  2324. x264_mb_encode_8x8_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp );
  2325.  
  2326. - if( h->param.b_cabac )
  2327. - {
  2328. - i_cbp_dc = h->mb.cache.non_zero_count[x264_scan8[24]]
  2329. + int i_cbp_dc = h->mb.cache.non_zero_count[x264_scan8[24]]
  2330. | h->mb.cache.non_zero_count[x264_scan8[25]] << 1
  2331. | h->mb.cache.non_zero_count[x264_scan8[26]] << 2;
  2332. - }
  2333.  
  2334. /* store cbp */
  2335. - h->mb.cbp[h->mb.i_mb_xy] = (i_cbp_dc << 8) | (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma;
  2336. + h->mb.i_cbp_combined = (i_cbp_dc << 8) | (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma;
  2337.  
  2338. /* Check for P_SKIP
  2339. * XXX: in the me perhaps we should take x264_mb_predict_mv_pskip into account
  2340. * (if multiple mv give same result)*/
  2341. if( !b_force_no_skip )
  2342. {
  2343. - if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
  2344. - !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) &&
  2345. - M32( h->mb.cache.mv[0][x264_scan8[0]] ) == M32( h->mb.cache.pskip_mv )
  2346. + if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 && !h->mb.i_cbp_combined
  2347. + && M32( h->mb.cache.mv[0][x264_scan8[0]] ) == M32( h->mb.cache.pskip_mv )
  2348. && h->mb.cache.ref[0][x264_scan8[0]] == 0 )
  2349. {
  2350. h->mb.i_type = P_SKIP;
  2351. }
  2352.  
  2353. /* Check for B_SKIP */
  2354. - if( h->mb.i_type == B_DIRECT && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) )
  2355. - {
  2356. + if( h->mb.i_type == B_DIRECT && !h->mb.i_cbp_combined )
  2357. h->mb.i_type = B_SKIP;
  2358. - }
  2359. }
  2360. }
  2361.  
  2362. --
  2363. 1.6.1.2
  2364.  
  2365.  
  2366. From 9fb95fea1304984d7d90b1670dcb7c5a4e261697 Mon Sep 17 00:00:00 2001
  2367. From: Yusuke Nakamura <muken.the.vfrmaniac@gmail.com>
  2368. Date: Mon, 8 Feb 2010 01:48:38 -0800
  2369. Subject: [PATCH 10/24] Write PASP atom in mp4 muxing
  2370. Adds container-level aspect ratio support for mp4.
  2371.  
  2372. ---
  2373. output/mp4.c | 3 ++-
  2374. 1 files changed, 2 insertions(+), 1 deletions(-)
  2375.  
  2376. diff --git a/output/mp4.c b/output/mp4.c
  2377. index e3ad9c6..b817c82 100644
  2378. --- a/output/mp4.c
  2379. +++ b/output/mp4.c
  2380. @@ -121,7 +121,7 @@ static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest
  2381. if( mdhd_duration != total_duration )
  2382. {
  2383. uint64_t last_dts = gf_isom_get_sample_dts( p_mp4->p_file, p_mp4->i_track, p_mp4->i_numframe );
  2384. - uint32_t last_duration = (uint32_t)( mdhd_duration > last_dts ? mdhd_duration - last_dts : (largest_pts - second_largest_pts) * p_mp4->i_time_inc );
  2385. + uint32_t last_duration = (uint32_t)( mdhd_duration > last_dts ? mdhd_duration - last_dts : (largest_pts - second_largest_pts) * p_mp4->i_time_inc );
  2386. gf_isom_set_last_sample_duration( p_mp4->p_file, p_mp4->i_track, last_duration );
  2387. total_duration = gf_isom_get_media_duration( p_mp4->p_file, p_mp4->i_track );
  2388. }
  2389. @@ -212,6 +212,7 @@ static int set_param( hnd_t handle, x264_param_t *p_param )
  2390. dw *= sar ;
  2391. else
  2392. dh /= sar;
  2393. + gf_isom_set_pixel_aspect_ratio( p_mp4->p_file, p_mp4->i_track, p_mp4->i_descidx, p_param->vui.i_sar_width, p_param->vui.i_sar_height );
  2394. gf_isom_set_track_layout_info( p_mp4->p_file, p_mp4->i_track, dw, dh, 0, 0, 0 );
  2395. }
  2396.  
  2397. --
  2398. 1.6.1.2
  2399.  
  2400.  
  2401. From d0af4adb40429a5335a8b81c67f97bdf75e8dfa0 Mon Sep 17 00:00:00 2001
  2402. From: Henrik Gramner <hengar-6@student.ltu.se>
  2403. Date: Mon, 8 Feb 2010 15:53:52 -0800
  2404. Subject: [PATCH 11/24] Faster 2x2 chroma DC dequant
  2405.  
  2406. ---
  2407. doc/standards.txt | 1 +
  2408. encoder/macroblock.c | 24 +++++++++---------------
  2409. 2 files changed, 10 insertions(+), 15 deletions(-)
  2410.  
  2411. diff --git a/doc/standards.txt b/doc/standards.txt
  2412. index db9a691..7474d8f 100644
  2413. --- a/doc/standards.txt
  2414. +++ b/doc/standards.txt
  2415. @@ -4,6 +4,7 @@ checkasm is written in gcc, with no attempt at compatibility with anything else.
  2416. We make the following additional assumptions which are true of real systems but not guaranteed by C99:
  2417. * Two's complement.
  2418. * Signed right-shifts are sign-extended.
  2419. +* int is 32-bit or larger.
  2420.  
  2421. x86-specific assumptions:
  2422. * The stack is 16-byte aligned. We align it on entry to libx264 and on entry to any thread, but the compiler must preserve alignment after that.
  2423. diff --git a/encoder/macroblock.c b/encoder/macroblock.c
  2424. index f5f6267..3d859de 100644
  2425. --- a/encoder/macroblock.c
  2426. +++ b/encoder/macroblock.c
  2427. @@ -42,30 +42,24 @@ static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[4] )
  2428. int d1 = dct[2] + dct[3]; \
  2429. int d2 = dct[0] - dct[1]; \
  2430. int d3 = dct[2] - dct[3]; \
  2431. - int dmf = dequant_mf[i_qp%6][0]; \
  2432. - int qbits = i_qp/6 - 5; \
  2433. - if( qbits > 0 ) \
  2434. - { \
  2435. - dmf <<= qbits; \
  2436. - qbits = 0; \
  2437. - }
  2438. + int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
  2439.  
  2440. static inline void idct_dequant_2x2_dc( int16_t dct[4], int16_t dct4x4[4][16], int dequant_mf[6][16], int i_qp )
  2441. {
  2442. IDCT_DEQUANT_START
  2443. - dct4x4[0][0] = (d0 + d1) * dmf >> -qbits;
  2444. - dct4x4[1][0] = (d0 - d1) * dmf >> -qbits;
  2445. - dct4x4[2][0] = (d2 + d3) * dmf >> -qbits;
  2446. - dct4x4[3][0] = (d2 - d3) * dmf >> -qbits;
  2447. + dct4x4[0][0] = (d0 + d1) * dmf >> 5;
  2448. + dct4x4[1][0] = (d0 - d1) * dmf >> 5;
  2449. + dct4x4[2][0] = (d2 + d3) * dmf >> 5;
  2450. + dct4x4[3][0] = (d2 - d3) * dmf >> 5;
  2451. }
  2452.  
  2453. static inline void idct_dequant_2x2_dconly( int16_t out[4], int16_t dct[4], int dequant_mf[6][16], int i_qp )
  2454. {
  2455. IDCT_DEQUANT_START
  2456. - out[0] = (d0 + d1) * dmf >> -qbits;
  2457. - out[1] = (d0 - d1) * dmf >> -qbits;
  2458. - out[2] = (d2 + d3) * dmf >> -qbits;
  2459. - out[3] = (d2 - d3) * dmf >> -qbits;
  2460. + out[0] = (d0 + d1) * dmf >> 5;
  2461. + out[1] = (d0 - d1) * dmf >> 5;
  2462. + out[2] = (d2 + d3) * dmf >> 5;
  2463. + out[3] = (d2 - d3) * dmf >> 5;
  2464. }
  2465.  
  2466. static inline void dct2x2dc( int16_t d[4], int16_t dct4x4[4][16] )
  2467. --
  2468. 1.6.1.2
  2469.  
  2470.  
  2471. From c2c3d4558253b8f4969c35be9442489363ed8902 Mon Sep 17 00:00:00 2001
  2472. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  2473. Date: Tue, 9 Feb 2010 15:08:31 -0800
  2474. Subject: [PATCH 12/24] Make psy-(rd|trellis) use more precision in userdata SEI
  2475.  
  2476. ---
  2477. common/common.c | 2 +-
  2478. 1 files changed, 1 insertions(+), 1 deletions(-)
  2479.  
  2480. diff --git a/common/common.c b/common/common.c
  2481. index 6d1d7f0..aaccdf2 100644
  2482. --- a/common/common.c
  2483. +++ b/common/common.c
  2484. @@ -886,7 +886,7 @@ char *x264_param2string( x264_param_t *p, int b_res )
  2485. s += sprintf( s, " subme=%d", p->analyse.i_subpel_refine );
  2486. s += sprintf( s, " psy=%d", p->analyse.b_psy );
  2487. if( p->analyse.b_psy )
  2488. - s += sprintf( s, " psy_rd=%.1f:%.1f", p->analyse.f_psy_rd, p->analyse.f_psy_trellis );
  2489. + s += sprintf( s, " psy_rd=%.2f:%.2f", p->analyse.f_psy_rd, p->analyse.f_psy_trellis );
  2490. s += sprintf( s, " mixed_ref=%d", p->analyse.b_mixed_references );
  2491. s += sprintf( s, " me_range=%d", p->analyse.i_me_range );
  2492. s += sprintf( s, " chroma_me=%d", p->analyse.b_chroma_me );
  2493. --
  2494. 1.6.1.2
  2495.  
  2496.  
  2497. From d5cc99ce2f0ddbb9b27fe14526bb06b6745de0fd Mon Sep 17 00:00:00 2001
  2498. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  2499. Date: Wed, 10 Feb 2010 12:12:29 -0800
  2500. Subject: [PATCH 13/24] Overhaul sliced-threads VBV
  2501. Make predictors thread-local and allow each thread to poll the others to get their predicted sizes.
  2502. Many, many other tweaks to improve quality with small VBV and sliced threads.
  2503. Note this may somewhat increase the risk of a VBV underflow in such extreme situations (single-frame VBV).
  2504. This is tolerable, as most relevant use-cases are better off with a few rare underflows (even if they have to drop a slice) than consistent low quality.
  2505.  
  2506. ---
  2507. encoder/encoder.c | 4 +-
  2508. encoder/ratecontrol.c | 150 ++++++++++++++++++++++++++++++-------------------
  2509. encoder/slicetype.c | 4 +-
  2510. 3 files changed, 97 insertions(+), 61 deletions(-)
  2511.  
  2512. diff --git a/encoder/encoder.c b/encoder/encoder.c
  2513. index e266a1a..b977ec6 100644
  2514. --- a/encoder/encoder.c
  2515. +++ b/encoder/encoder.c
  2516. @@ -2061,6 +2061,8 @@ static int x264_threaded_slices_write( x264_t *h )
  2517. for( i = 0; i <= h->sps->i_mb_height; i++ )
  2518. x264_fdec_filter_row( h, i );
  2519.  
  2520. + x264_threads_merge_ratecontrol( h );
  2521. +
  2522. for( i = 1; i < h->param.i_threads; i++ )
  2523. {
  2524. x264_t *t = h->thread[i];
  2525. @@ -2076,8 +2078,6 @@ static int x264_threaded_slices_write( x264_t *h )
  2526. ((int*)&h->stat.frame)[j] += ((int*)&t->stat.frame)[j];
  2527. }
  2528.  
  2529. - x264_threads_merge_ratecontrol( h );
  2530. -
  2531. return 0;
  2532. }
  2533.  
  2534. diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
  2535. index e314ba2..b2cbb26 100644
  2536. --- a/encoder/ratecontrol.c
  2537. +++ b/encoder/ratecontrol.c
  2538. @@ -134,9 +134,11 @@ struct x264_ratecontrol_t
  2539. * This value is the current position (0 or 1). */
  2540.  
  2541. /* MBRC stuff */
  2542. - double frame_size_estimated;
  2543. + float frame_size_estimated; /* Access to this variable must be atomic: double is
  2544. + * not atomic on all arches we care about */
  2545. double frame_size_planned;
  2546. double slice_size_planned;
  2547. + double max_frame_error;
  2548. predictor_t (*row_pred)[2];
  2549. predictor_t row_preds[5][2];
  2550. predictor_t *pred_b_from_p; /* predict B-frame size from P-frame satd */
  2551. @@ -505,17 +507,21 @@ int x264_ratecontrol_new( x264_t *h )
  2552.  
  2553. rc->lstep = pow( 2, h->param.rc.i_qp_step / 6.0 );
  2554. rc->last_qscale = qp2qscale(26);
  2555. - CHECKED_MALLOC( rc->pred, 5*sizeof(predictor_t) );
  2556. + int num_preds = h->param.b_sliced_threads * h->param.i_threads + 1;
  2557. + CHECKED_MALLOC( rc->pred, 5 * sizeof(predictor_t) * num_preds );
  2558. CHECKED_MALLOC( rc->pred_b_from_p, sizeof(predictor_t) );
  2559. for( i = 0; i < 5; i++ )
  2560. {
  2561. rc->last_qscale_for[i] = qp2qscale( ABR_INIT_QP );
  2562. rc->lmin[i] = qp2qscale( h->param.rc.i_qp_min );
  2563. rc->lmax[i] = qp2qscale( h->param.rc.i_qp_max );
  2564. - rc->pred[i].coeff= 2.0;
  2565. - rc->pred[i].count= 1.0;
  2566. - rc->pred[i].decay= 0.5;
  2567. - rc->pred[i].offset= 0.0;
  2568. + for( j = 0; j < num_preds; j++ )
  2569. + {
  2570. + rc->pred[i+j*5].coeff= 2.0;
  2571. + rc->pred[i+j*5].count= 1.0;
  2572. + rc->pred[i+j*5].decay= 0.5;
  2573. + rc->pred[i+j*5].offset= 0.0;
  2574. + }
  2575. for( j = 0; j < 2; j++ )
  2576. {
  2577. rc->row_preds[i][j].coeff= .25;
  2578. @@ -986,20 +992,16 @@ void x264_ratecontrol_delete( x264_t *h )
  2579. x264_free( rc );
  2580. }
  2581.  
  2582. +/* We don't actually need mutexes here: the access orders aren't deterministic
  2583. + * to begin with, plus all operations are atomic. */
  2584. void x264_ratecontrol_set_estimated_size( x264_t *h, int bits )
  2585. {
  2586. - x264_pthread_mutex_lock( &h->fenc->mutex );
  2587. h->rc->frame_size_estimated = bits;
  2588. - x264_pthread_mutex_unlock( &h->fenc->mutex );
  2589. }
  2590.  
  2591. -int x264_ratecontrol_get_estimated_size( x264_t const *h)
  2592. +int x264_ratecontrol_get_estimated_size( x264_t const *h )
  2593. {
  2594. - int size;
  2595. - x264_pthread_mutex_lock( &h->fenc->mutex );
  2596. - size = h->rc->frame_size_estimated;
  2597. - x264_pthread_mutex_unlock( &h->fenc->mutex );
  2598. - return size;
  2599. + return h->rc->frame_size_estimated;
  2600. }
  2601.  
  2602. static void accum_p_qp_update( x264_t *h, float qp )
  2603. @@ -1173,6 +1175,7 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
  2604. /* tweak quality based on difference from predicted size */
  2605. if( y < h->i_threadslice_end-1 )
  2606. {
  2607. + int i;
  2608. int prev_row_qp = h->fdec->i_row_qp[y];
  2609. int i_qp_max = X264_MIN( prev_row_qp + h->param.rc.i_qp_step, h->param.rc.i_qp_max );
  2610. int i_qp_min = X264_MAX( prev_row_qp - h->param.rc.i_qp_step, h->param.rc.i_qp_min );
  2611. @@ -1186,19 +1189,23 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
  2612.  
  2613. float buffer_left_planned = rc->buffer_fill - rc->frame_size_planned;
  2614. float slice_size_planned = h->param.b_sliced_threads ? rc->slice_size_planned : rc->frame_size_planned;
  2615. - float size_of_other_slices = rc->frame_size_planned - slice_size_planned;
  2616. + float size_of_other_slices = 0;
  2617. + if( h->param.b_sliced_threads )
  2618. + {
  2619. + for( i = 0; i < h->param.i_threads; i++ )
  2620. + if( h != h->thread[i] )
  2621. + size_of_other_slices += x264_ratecontrol_get_estimated_size( h->thread[i] );
  2622. + }
  2623. + else
  2624. + rc->max_frame_error = X264_MAX( 0.05, 1.0 / (h->sps->i_mb_width) );
  2625. +
  2626. /* More threads means we have to be more cautious in letting ratecontrol use up extra bits. */
  2627. float rc_tol = buffer_left_planned / h->param.i_threads * rc->rate_tolerance;
  2628. - float max_frame_error = X264_MAX( 0.05, 1.0 / h->sps->i_mb_height );
  2629. - int b1 = predict_row_size_sum( h, y, rc->qpm );
  2630. -
  2631. - /* Assume that if this slice has become larger than expected,
  2632. - * the other slices will have gotten equally larger. */
  2633. - b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
  2634. + int b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
  2635.  
  2636. /* Don't modify the row QPs until a sufficent amount of the bits of the frame have been processed, in case a flat */
  2637. /* area at the top of the frame was measured inaccurately. */
  2638. - if( row_bits_so_far(h,y) < 0.05 * (rc->frame_size_planned-size_of_other_slices) )
  2639. + if( row_bits_so_far( h, y ) < 0.05 * slice_size_planned )
  2640. return;
  2641.  
  2642. if( h->sh.i_type != SLICE_TYPE_I )
  2643. @@ -1213,8 +1220,7 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
  2644. (b1 > rc->frame_size_planned && rc->qpm < rc->qp_novbv)) )
  2645. {
  2646. rc->qpm ++;
  2647. - b1 = predict_row_size_sum( h, y, rc->qpm );
  2648. - b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
  2649. + b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
  2650. }
  2651.  
  2652. while( rc->qpm > i_qp_min
  2653. @@ -1223,20 +1229,18 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
  2654. || b1 < (rc->buffer_fill - rc->buffer_size + rc->buffer_rate) * 1.1) )
  2655. {
  2656. rc->qpm --;
  2657. - b1 = predict_row_size_sum( h, y, rc->qpm );
  2658. - b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
  2659. + b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
  2660. }
  2661.  
  2662. /* avoid VBV underflow */
  2663. while( (rc->qpm < h->param.rc.i_qp_max)
  2664. - && (rc->buffer_fill - b1 < rc->buffer_rate * max_frame_error) )
  2665. + && (rc->buffer_fill - b1 < rc->buffer_rate * rc->max_frame_error) )
  2666. {
  2667. rc->qpm ++;
  2668. - b1 = predict_row_size_sum( h, y, rc->qpm );
  2669. - b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
  2670. + b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
  2671. }
  2672.  
  2673. - x264_ratecontrol_set_estimated_size(h, b1);
  2674. + x264_ratecontrol_set_estimated_size( h, predict_row_size_sum( h, y, rc->qpm ) );
  2675. }
  2676.  
  2677. /* loses the fractional part of the frame-wise qp */
  2678. @@ -1958,56 +1962,88 @@ static float rate_estimate_qscale( x264_t *h )
  2679. }
  2680. }
  2681.  
  2682. +void x264_threads_normalize_predictors( x264_t *h )
  2683. +{
  2684. + int i;
  2685. + double totalsize = 0;
  2686. + for( i = 0; i < h->param.i_threads; i++ )
  2687. + totalsize += h->thread[i]->rc->slice_size_planned;
  2688. + double factor = h->rc->frame_size_planned / totalsize;
  2689. + for( i = 0; i < h->param.i_threads; i++ )
  2690. + h->thread[i]->rc->slice_size_planned *= factor;
  2691. +}
  2692. +
  2693. void x264_threads_distribute_ratecontrol( x264_t *h )
  2694. {
  2695. - int i, row, totalsize = 0;
  2696. - if( h->rc->b_vbv )
  2697. - for( row = 0; row < h->sps->i_mb_height; row++ )
  2698. - totalsize += h->fdec->i_row_satd[row];
  2699. + int i, row;
  2700. + x264_ratecontrol_t *rc = h->rc;
  2701. +
  2702. + /* Initialize row predictors */
  2703. + if( h->i_frame == 0 )
  2704. + for( i = 0; i < h->param.i_threads; i++ )
  2705. + {
  2706. + x264_ratecontrol_t *t = h->thread[i]->rc;
  2707. + memcpy( t->row_preds, rc->row_preds, sizeof(rc->row_preds) );
  2708. + }
  2709. +
  2710. for( i = 0; i < h->param.i_threads; i++ )
  2711. {
  2712. x264_t *t = h->thread[i];
  2713. - x264_ratecontrol_t *rc = h->rc;
  2714. - memcpy( t->rc, rc, sizeof(x264_ratecontrol_t) );
  2715. + memcpy( t->rc, rc, offsetof(x264_ratecontrol_t, row_pred) );
  2716. + t->rc->row_pred = &t->rc->row_preds[h->sh.i_type];
  2717. /* Calculate the planned slice size. */
  2718. - if( h->rc->b_vbv && rc->frame_size_planned )
  2719. + if( rc->b_vbv && rc->frame_size_planned )
  2720. {
  2721. int size = 0;
  2722. for( row = t->i_threadslice_start; row < t->i_threadslice_end; row++ )
  2723. size += h->fdec->i_row_satd[row];
  2724. - t->rc->slice_size_planned = size * rc->frame_size_planned / totalsize;
  2725. + t->rc->slice_size_planned = predict_size( &rc->pred[h->sh.i_type + (i+1)*5], rc->qpm, size );
  2726. }
  2727. else
  2728. t->rc->slice_size_planned = 0;
  2729. }
  2730. + if( rc->b_vbv && rc->frame_size_planned )
  2731. + {
  2732. + x264_threads_normalize_predictors( h );
  2733. +
  2734. + if( rc->single_frame_vbv )
  2735. + {
  2736. + /* Compensate for our max frame error threshold: give more bits (proportionally) to smaller slices. */
  2737. + for( i = 0; i < h->param.i_threads; i++ )
  2738. + {
  2739. + x264_t *t = h->thread[i];
  2740. + t->rc->max_frame_error = X264_MAX( 0.05, 1.0 / (t->i_threadslice_end - t->i_threadslice_start) );
  2741. + t->rc->slice_size_planned += 2 * t->rc->max_frame_error * rc->frame_size_planned;
  2742. + }
  2743. + x264_threads_normalize_predictors( h );
  2744. + }
  2745. +
  2746. + for( i = 0; i < h->param.i_threads; i++ )
  2747. + x264_ratecontrol_set_estimated_size( h->thread[i], h->thread[i]->rc->slice_size_planned );
  2748. + }
  2749. }
  2750.  
  2751. void x264_threads_merge_ratecontrol( x264_t *h )
  2752. {
  2753. - int i, j, k;
  2754. + int i, row;
  2755. x264_ratecontrol_t *rc = h->rc;
  2756. x264_emms();
  2757.  
  2758. - for( i = 1; i < h->param.i_threads; i++ )
  2759. + for( i = 0; i < h->param.i_threads; i++ )
  2760. {
  2761. - x264_ratecontrol_t *t = h->thread[i]->rc;
  2762. - rc->qpa_rc += t->qpa_rc;
  2763. - rc->qpa_aq += t->qpa_aq;
  2764. - for( j = 0; j < 5; j++ )
  2765. - for( k = 0; k < 2; k++ )
  2766. - {
  2767. - rc->row_preds[j][k].coeff += t->row_preds[j][k].coeff;
  2768. - rc->row_preds[j][k].offset += t->row_preds[j][k].offset;
  2769. - rc->row_preds[j][k].count += t->row_preds[j][k].count;
  2770. - }
  2771. + x264_t *t = h->thread[i];
  2772. + x264_ratecontrol_t *rct = h->thread[i]->rc;
  2773. + int size = 0;
  2774. + for( row = t->i_threadslice_start; row < t->i_threadslice_end; row++ )
  2775. + size += h->fdec->i_row_satd[row];
  2776. + int bits = t->stat.frame.i_mv_bits + t->stat.frame.i_tex_bits + t->stat.frame.i_misc_bits;
  2777. + int mb_count = (t->i_threadslice_end - t->i_threadslice_start) * h->sps->i_mb_width;
  2778. + update_predictor( &rc->pred[h->sh.i_type+5*i], qp2qscale(rct->qpa_rc/mb_count), size, bits );
  2779. + if( !i )
  2780. + continue;
  2781. + rc->qpa_rc += rct->qpa_rc;
  2782. + rc->qpa_aq += rct->qpa_aq;
  2783. }
  2784. - for( j = 0; j < 5; j++ )
  2785. - for( k = 0; k < 2; k++ )
  2786. - {
  2787. - rc->row_preds[j][k].coeff /= h->param.i_threads;
  2788. - rc->row_preds[j][k].offset /= h->param.i_threads;
  2789. - rc->row_preds[j][k].count /= h->param.i_threads;
  2790. - }
  2791. }
  2792.  
  2793. void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next )
  2794. diff --git a/encoder/slicetype.c b/encoder/slicetype.c
  2795. index 057f6a6..bb2ed64 100644
  2796. --- a/encoder/slicetype.c
  2797. +++ b/encoder/slicetype.c
  2798. @@ -1394,10 +1394,10 @@ int x264_rc_analyse_slice( x264_t *h )
  2799. int mb_xy = y * h->mb.i_mb_stride;
  2800. for( x = h->fdec->i_pir_start_col; x <= h->fdec->i_pir_end_col; x++, mb_xy++ )
  2801. {
  2802. - int intra_cost = (h->fenc->i_intra_cost[mb_xy] * ip_factor) >> 8;
  2803. + int intra_cost = (h->fenc->i_intra_cost[mb_xy] * ip_factor + 128) >> 8;
  2804. int inter_cost = h->fenc->lowres_costs[b-p0][p1-b][mb_xy];
  2805. int diff = intra_cost - inter_cost;
  2806. - h->fdec->i_row_satd[y] += diff;
  2807. + h->fdec->i_row_satd[y] += (diff * frames[b]->i_inv_qscale_factor[mb_xy] + 128) >> 8;
  2808. cost += diff;
  2809. }
  2810. }
  2811. --
  2812. 1.6.1.2
  2813.  
  2814.  
  2815. From f9012469506ff28ed869cc3518ff1ed5f252cf48 Mon Sep 17 00:00:00 2001
  2816. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  2817. Date: Wed, 10 Feb 2010 13:44:28 -0800
  2818. Subject: [PATCH 14/24] Allow longer keyints with intra refresh
  2819. If a long keyint is specified (longer than macroblock width-1), the refresh will simply not occur all the time.
  2820. In other words, a refresh will take place, and then x264 will wait until keyint is over to start another refresh.
  2821.  
  2822. ---
  2823. encoder/encoder.c | 9 ++++-----
  2824. 1 files changed, 4 insertions(+), 5 deletions(-)
  2825.  
  2826. diff --git a/encoder/encoder.c b/encoder/encoder.c
  2827. index b977ec6..6ad67d5 100644
  2828. --- a/encoder/encoder.c
  2829. +++ b/encoder/encoder.c
  2830. @@ -599,8 +599,6 @@ static int x264_validate_parameters( x264_t *h )
  2831. x264_log( h, X264_LOG_WARNING, "ref > 1 + intra-refresh is not supported\n" );
  2832. h->param.i_frame_reference = 1;
  2833. }
  2834. - if( h->param.b_intra_refresh )
  2835. - h->param.i_keyint_max = X264_MIN( h->param.i_keyint_max, (h->param.i_width+15)/16 - 1 );
  2836. h->param.i_keyint_min = x264_clip3( h->param.i_keyint_min, 1, h->param.i_keyint_max/2+1 );
  2837. h->param.rc.i_lookahead = x264_clip3( h->param.rc.i_lookahead, 0, X264_LOOKAHEAD_MAX );
  2838. {
  2839. @@ -2306,12 +2304,12 @@ int x264_encoder_encode( x264_t *h,
  2840. if( h->param.b_intra_refresh && h->fenc->i_type == X264_TYPE_P )
  2841. {
  2842. int pocdiff = (h->fdec->i_poc - h->fref0[0]->i_poc)/2;
  2843. - float increment = ((float)h->sps->i_mb_width-1) / h->param.i_keyint_max;
  2844. + float increment = X264_MAX( ((float)h->sps->i_mb_width-1) / h->param.i_keyint_max, 1 );
  2845. if( IS_X264_TYPE_I( h->fref0[0]->i_type ) )
  2846. h->fdec->f_pir_position = 0;
  2847. else
  2848. {
  2849. - if( h->fref0[0]->i_pir_end_col == h->sps->i_mb_width - 1 )
  2850. + if( h->fdec->f_pir_position >= h->param.i_keyint_max )
  2851. {
  2852. h->fdec->f_pir_position = 0;
  2853. h->fenc->b_keyframe = 1;
  2854. @@ -2357,8 +2355,9 @@ int x264_encoder_encode( x264_t *h,
  2855.  
  2856. if( h->fenc->i_type != X264_TYPE_IDR )
  2857. {
  2858. + int time_to_recovery = X264_MIN( h->sps->i_mb_width - 1, h->param.i_keyint_max ) + h->param.i_bframe;
  2859. x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
  2860. - x264_sei_recovery_point_write( h, &h->out.bs, h->param.i_keyint_max );
  2861. + x264_sei_recovery_point_write( h, &h->out.bs, time_to_recovery );
  2862. x264_nal_end( h );
  2863. overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD;
  2864. }
  2865. --
  2866. 1.6.1.2
  2867.  
  2868.  
  2869. From 6ca08ce108471bd04a199e71571f18619988d3f4 Mon Sep 17 00:00:00 2001
  2870. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  2871. Date: Fri, 12 Feb 2010 03:33:54 -0800
  2872. Subject: [PATCH 15/24] Implement direct temporal + interlaced
  2873. This was much easier than I expected.
  2874. It will also be basically useless until TFF/BFF support gets in, since it requires delta_poc_bottom to be set correctly to work well.
  2875.  
  2876. ---
  2877. common/common.h | 5 +++--
  2878. common/macroblock.c | 8 ++++----
  2879. encoder/encoder.c | 5 -----
  2880. 3 files changed, 7 insertions(+), 11 deletions(-)
  2881.  
  2882. diff --git a/common/common.h b/common/common.h
  2883. index d4a8dd9..6da462f 100644
  2884. --- a/common/common.h
  2885. +++ b/common/common.h
  2886. @@ -655,11 +655,12 @@ struct x264_t
  2887. int i_chroma_lambda2_offset;
  2888.  
  2889. /* B_direct and weighted prediction */
  2890. - int16_t dist_scale_factor[16][2];
  2891. + int16_t dist_scale_factor_buf[2][16][2];
  2892. + int16_t (*dist_scale_factor)[2];
  2893. int8_t bipred_weight_buf[2][32][4];
  2894. int8_t (*bipred_weight)[4];
  2895. /* maps fref1[0]'s ref indices into the current list0 */
  2896. -#define map_col_to_list0(col) h->mb.map_col_to_list0[col+2]
  2897. +#define map_col_to_list0(col) h->mb.map_col_to_list0[(col)+2]
  2898. int8_t map_col_to_list0[18];
  2899. int ref_blind_dupe; /* The index of the blind reference frame duplicate. */
  2900. } mb;
  2901. diff --git a/common/macroblock.c b/common/macroblock.c
  2902. index d86f3af..e676b8b 100644
  2903. --- a/common/macroblock.c
  2904. +++ b/common/macroblock.c
  2905. @@ -190,7 +190,8 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
  2906. const int x8 = i8%2;
  2907. const int y8 = i8/2;
  2908. const int i_part_8x8 = i_mb_8x8 + x8 + y8 * h->mb.i_b8_stride;
  2909. - const int i_ref = map_col_to_list0(h->fref1[0]->ref[0][i_part_8x8]);
  2910. + const int i_ref1_ref = h->fref1[0]->ref[0][i_part_8x8];
  2911. + const int i_ref = (map_col_to_list0(i_ref1_ref>>h->sh.b_mbaff) << h->sh.b_mbaff) + (i_ref1_ref&h->sh.b_mbaff);
  2912.  
  2913. if( i_ref >= 0 )
  2914. {
  2915. @@ -1238,6 +1239,7 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
  2916. if( h->sh.i_type == SLICE_TYPE_B )
  2917. {
  2918. h->mb.bipred_weight = h->mb.bipred_weight_buf[h->mb.b_interlaced&(i_mb_y&1)];
  2919. + h->mb.dist_scale_factor = h->mb.dist_scale_factor_buf[h->mb.b_interlaced&(i_mb_y&1)];
  2920. if( h->param.b_cabac )
  2921. {
  2922. uint8_t skipbp;
  2923. @@ -1478,9 +1480,7 @@ void x264_macroblock_bipred_init( x264_t *h )
  2924. dist_scale_factor = x264_clip3( (tb * tx + 32) >> 6, -1024, 1023 );
  2925. }
  2926.  
  2927. - // FIXME: will need this if we ever do temporal MV pred with interlaced
  2928. - if( !h->sh.b_mbaff )
  2929. - h->mb.dist_scale_factor[i_ref0][i_ref1] = dist_scale_factor;
  2930. + h->mb.dist_scale_factor_buf[field][i_ref0][i_ref1] = dist_scale_factor;
  2931.  
  2932. dist_scale_factor >>= 2;
  2933. if( h->param.analyse.b_weighted_bipred
  2934. diff --git a/encoder/encoder.c b/encoder/encoder.c
  2935. index 6ad67d5..25c4ae4 100644
  2936. --- a/encoder/encoder.c
  2937. +++ b/encoder/encoder.c
  2938. @@ -430,11 +430,6 @@ static int x264_validate_parameters( x264_t *h )
  2939. x264_log( h, X264_LOG_WARNING, "interlace + me=esa is not implemented\n" );
  2940. h->param.analyse.i_me_method = X264_ME_UMH;
  2941. }
  2942. - if( h->param.analyse.i_direct_mv_pred > X264_DIRECT_PRED_SPATIAL )
  2943. - {
  2944. - x264_log( h, X264_LOG_WARNING, "interlace + direct=temporal is not implemented\n" );
  2945. - h->param.analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL;
  2946. - }
  2947. if( h->param.analyse.i_weighted_pred > 0 )
  2948. {
  2949. x264_log( h, X264_LOG_WARNING, "interlace + weightp is not implemented\n" );
  2950. --
  2951. 1.6.1.2
  2952.  
  2953.  
  2954. From f431bbc62a793e70865bbe83d94860739579f362 Mon Sep 17 00:00:00 2001
  2955. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  2956. Date: Fri, 12 Feb 2010 21:15:12 -0800
  2957. Subject: [PATCH 16/24] Backport various speed tweak ideas from ffmpeg
  2958. Add mv0 early termination to spatial direct calculation
  2959. Up to twice as fast direct mv calculation on near-motionless video.
  2960.  
  2961. Branchless CAVLC level code adjustment based on trailing ones.
  2962. A few clocks faster.
  2963.  
  2964. Check tc value before clipping in C version of deblock functions.
  2965. Much faster, but nobody uses those anyways.
  2966.  
  2967. Thanks to Michael Niedermayer for the ideas.
  2968. ---
  2969. common/frame.c | 6 ++++--
  2970. common/macroblock.c | 3 +++
  2971. encoder/cavlc.c | 7 +++----
  2972. 3 files changed, 10 insertions(+), 6 deletions(-)
  2973.  
  2974. diff --git a/common/frame.c b/common/frame.c
  2975. index 40cc78f..d89f5ab 100644
  2976. --- a/common/frame.c
  2977. +++ b/common/frame.c
  2978. @@ -472,12 +472,14 @@ static inline void deblock_luma_c( uint8_t *pix, int xstride, int ystride, int a
  2979. int delta;
  2980. if( abs( p2 - p0 ) < beta )
  2981. {
  2982. - pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] );
  2983. + if( tc0[i] )
  2984. + pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] );
  2985. tc++;
  2986. }
  2987. if( abs( q2 - q0 ) < beta )
  2988. {
  2989. - pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[i], tc0[i] );
  2990. + if( tc0[i] )
  2991. + pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[i], tc0[i] );
  2992. tc++;
  2993. }
  2994.  
  2995. diff --git a/common/macroblock.c b/common/macroblock.c
  2996. index e676b8b..c9ce597 100644
  2997. --- a/common/macroblock.c
  2998. +++ b/common/macroblock.c
  2999. @@ -272,6 +272,9 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
  3000. x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, mv[0] );
  3001. x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, mv[1] );
  3002.  
  3003. + if( !M64( mv ) )
  3004. + return 1;
  3005. +
  3006. if( h->param.i_threads > 1
  3007. && ( mv[0][1] > h->mb.mv_max_spel[1]
  3008. || mv[1][1] > h->mb.mv_max_spel[1] ) )
  3009. diff --git a/encoder/cavlc.c b/encoder/cavlc.c
  3010. index 45b55fe..12806ae 100644
  3011. --- a/encoder/cavlc.c
  3012. +++ b/encoder/cavlc.c
  3013. @@ -147,10 +147,9 @@ static int block_residual_write_cavlc( x264_t *h, int i_ctxBlockCat, int16_t *l,
  3014.  
  3015. if( i_trailing < i_total )
  3016. {
  3017. - int16_t val = runlevel.level[i_trailing];
  3018. - int16_t val_original = runlevel.level[i_trailing]+LEVEL_TABLE_SIZE/2;
  3019. - if( i_trailing < 3 )
  3020. - val -= (val>>15)|1; /* as runlevel.level[i] can't be 1 for the first one if i_trailing < 3 */
  3021. + int val = runlevel.level[i_trailing];
  3022. + int val_original = runlevel.level[i_trailing]+LEVEL_TABLE_SIZE/2;
  3023. + val -= ((val>>31)|1) & -(i_trailing < 3); /* as runlevel.level[i] can't be 1 for the first one if i_trailing < 3 */
  3024. val += LEVEL_TABLE_SIZE/2;
  3025.  
  3026. if( (unsigned)val_original < LEVEL_TABLE_SIZE )
  3027. --
  3028. 1.6.1.2
  3029.  
  3030.  
  3031. From f1194492a77e4bcc115be7a6dfc129b0ae9b835b Mon Sep 17 00:00:00 2001
  3032. From: Alexander Strange <astrange@ithinksw.com>
  3033. Date: Mon, 10 Nov 2008 00:55:20 -0500
  3034. Subject: [PATCH 17/24] Allow | as a separator between psy-rd and psy-trellis values.
  3035. [,:/] are all taken when setting psy-trellis in a zone in an mencoder option.
  3036.  
  3037. Also fix a comment typo and remove a useless line of code.
  3038. ---
  3039. common/common.c | 3 ++-
  3040. encoder/encoder.c | 4 +---
  3041. 2 files changed, 3 insertions(+), 4 deletions(-)
  3042.  
  3043. diff --git a/common/common.c b/common/common.c
  3044. index aaccdf2..0dd7af5 100644
  3045. --- a/common/common.c
  3046. +++ b/common/common.c
  3047. @@ -515,7 +515,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
  3048. OPT("psy-rd")
  3049. {
  3050. if( 2 == sscanf( value, "%f:%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ) ||
  3051. - 2 == sscanf( value, "%f,%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ) )
  3052. + 2 == sscanf( value, "%f,%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ) ||
  3053. + 2 == sscanf( value, "%f|%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ))
  3054. { }
  3055. else if( sscanf( value, "%f", &p->analyse.f_psy_rd ) )
  3056. {
  3057. diff --git a/encoder/encoder.c b/encoder/encoder.c
  3058. index 25c4ae4..fb916b5 100644
  3059. --- a/encoder/encoder.c
  3060. +++ b/encoder/encoder.c
  3061. @@ -84,7 +84,7 @@ static void x264_slice_header_init( x264_t *h, x264_slice_header_t *sh,
  3062. x264_param_t *param = &h->param;
  3063. int i;
  3064.  
  3065. - /* First we fill all field */
  3066. + /* First we fill all fields */
  3067. sh->sps = sps;
  3068. sh->pps = pps;
  3069.  
  3070. @@ -685,8 +685,6 @@ static int x264_validate_parameters( x264_t *h )
  3071. /* Psy trellis has a similar effect. */
  3072. if( h->mb.i_psy_trellis )
  3073. h->param.analyse.i_chroma_qp_offset -= h->param.analyse.f_psy_trellis < 0.25 ? 1 : 2;
  3074. - else
  3075. - h->mb.i_psy_trellis = 0;
  3076. h->param.analyse.i_chroma_qp_offset = x264_clip3(h->param.analyse.i_chroma_qp_offset, -12, 12);
  3077. h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 2 );
  3078. h->param.rc.f_aq_strength = x264_clip3f( h->param.rc.f_aq_strength, 0, 3 );
  3079. --
  3080. 1.6.1.2
  3081.  
  3082.  
  3083. From 7b1d4aaf2dc8da4c53ec03adcff1c54ff94051fa Mon Sep 17 00:00:00 2001
  3084. From: Alexander Strange <astrange@ithinksw.com>
  3085. Date: Sat, 13 Feb 2010 01:41:41 -0500
  3086. Subject: [PATCH 18/24] mkv: Write SimpleBlock instead of Block for frame headers
  3087.  
  3088. mkvtoolnix writes these by default since 2009/04/13.
  3089. Slightly simplifies muxer and allows 'mkvinfo -s' to show B-frames
  3090. as 'B' (but not B-ref frames).
  3091. ---
  3092. output/matroska.c | 2 +-
  3093. output/matroska_ebml.c | 80 ++++++++----------------------------------------
  3094. output/matroska_ebml.h | 2 +-
  3095. 3 files changed, 15 insertions(+), 69 deletions(-)
  3096.  
  3097. diff --git a/output/matroska.c b/output/matroska.c
  3098. index 8e84f52..db7639c 100644
  3099. --- a/output/matroska.c
  3100. +++ b/output/matroska.c
  3101. @@ -185,7 +185,7 @@ static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_
  3102.  
  3103. p_mkv->b_writing_frame = 0;
  3104.  
  3105. - if( mk_set_frame_flags( p_mkv->w, i_stamp, p_picture->b_keyframe ) < 0 )
  3106. + if( mk_set_frame_flags( p_mkv->w, i_stamp, p_picture->b_keyframe, p_picture->i_type == X264_TYPE_B ) < 0 )
  3107. return -1;
  3108.  
  3109. return i_size;
  3110. diff --git a/output/matroska_ebml.c b/output/matroska_ebml.c
  3111. index d1c6e13..7265909 100644
  3112. --- a/output/matroska_ebml.c
  3113. +++ b/output/matroska_ebml.c
  3114. @@ -53,9 +53,9 @@ struct mk_writer
  3115. int64_t def_duration;
  3116. int64_t timescale;
  3117. int64_t cluster_tc_scaled;
  3118. - int64_t frame_tc, prev_frame_tc_scaled, max_frame_tc;
  3119. + int64_t frame_tc, max_frame_tc;
  3120.  
  3121. - char wrote_header, in_frame, keyframe;
  3122. + char wrote_header, in_frame, keyframe, skippable;
  3123. };
  3124.  
  3125. static mk_context *mk_create_context( mk_writer *w, mk_context *parent, unsigned id )
  3126. @@ -258,23 +258,6 @@ static int mk_write_uint( mk_context *c, unsigned id, int64_t ui )
  3127. return 0;
  3128. }
  3129.  
  3130. -static int mk_write_sint( mk_context *c, unsigned id, int64_t si )
  3131. -{
  3132. - unsigned char c_si[8] = { si >> 56, si >> 48, si >> 40, si >> 32, si >> 24, si >> 16, si >> 8, si };
  3133. - unsigned i = 0;
  3134. -
  3135. - CHECK( mk_write_id( c, id ) );
  3136. - if( si < 0 )
  3137. - while( i < 7 && c_si[i] == 0xff && c_si[i+1] & 0x80 )
  3138. - ++i;
  3139. - else
  3140. - while( i < 7 && c_si[i] == 0 && !(c_si[i+1] & 0x80 ) )
  3141. - ++i;
  3142. - CHECK( mk_write_size( c, 8 - i ) );
  3143. - CHECK( mk_append_context_data( c, c_si+i, 8 - i ) );
  3144. - return 0;
  3145. -}
  3146. -
  3147. static int mk_write_float_raw( mk_context *c, float f )
  3148. {
  3149. union
  3150. @@ -301,34 +284,6 @@ static int mk_write_float( mk_context *c, unsigned id, float f )
  3151. return 0;
  3152. }
  3153.  
  3154. -static unsigned mk_ebml_size_size( unsigned s )
  3155. -{
  3156. - if( s < 0x7f )
  3157. - return 1;
  3158. - if( s < 0x3fff )
  3159. - return 2;
  3160. - if( s < 0x1fffff )
  3161. - return 3;
  3162. - if( s < 0x0fffffff )
  3163. - return 4;
  3164. - return 5;
  3165. -}
  3166. -
  3167. -static unsigned mk_ebml_sint_size( int64_t si )
  3168. -{
  3169. - unsigned char c_si[8] = { si >> 56, si >> 48, si >> 40, si >> 32, si >> 24, si >> 16, si >> 8, si };
  3170. - unsigned i = 0;
  3171. -
  3172. - if( si < 0 )
  3173. - while( i < 7 && c_si[i] == 0xff && c_si[i+1] & 0x80 )
  3174. - ++i;
  3175. - else
  3176. - while( i < 7 && c_si[i] == 0 && !(c_si[i+1] & 0x80) )
  3177. - ++i;
  3178. -
  3179. - return 8 - i;
  3180. -}
  3181. -
  3182. mk_writer *mk_create_writer( const char *filename )
  3183. {
  3184. mk_writer *w = malloc( sizeof(*w) );
  3185. @@ -446,8 +401,8 @@ static int mk_close_cluster( mk_writer *w )
  3186.  
  3187. static int mk_flush_frame( mk_writer *w )
  3188. {
  3189. - int64_t delta, ref = 0;
  3190. - unsigned fsize, bgsize;
  3191. + int64_t delta;
  3192. + unsigned fsize;
  3193. unsigned char c_delta_flags[3];
  3194.  
  3195. if( !w->in_frame )
  3196. @@ -470,33 +425,22 @@ static int mk_flush_frame( mk_writer *w )
  3197. }
  3198.  
  3199. fsize = w->frame ? w->frame->d_cur : 0;
  3200. - bgsize = fsize + 4 + mk_ebml_size_size( fsize + 4 ) + 1;
  3201. - if( !w->keyframe )
  3202. - {
  3203. - ref = w->prev_frame_tc_scaled - w->cluster_tc_scaled - delta;
  3204. - bgsize += 1 + 1 + mk_ebml_sint_size( ref );
  3205. - }
  3206.  
  3207. - CHECK( mk_write_id( w->cluster, 0xa0 ) ); // BlockGroup
  3208. - CHECK( mk_write_size( w->cluster, bgsize ) );
  3209. - CHECK( mk_write_id( w->cluster, 0xa1 ) ); // Block
  3210. + CHECK( mk_write_id( w->cluster, 0xa3 ) ); // SimpleBlock
  3211. CHECK( mk_write_size( w->cluster, fsize + 4 ) );
  3212. CHECK( mk_write_size( w->cluster, 1 ) ); // track number
  3213.  
  3214. c_delta_flags[0] = delta >> 8;
  3215. c_delta_flags[1] = delta;
  3216. - c_delta_flags[2] = 0;
  3217. + c_delta_flags[2] = (w->keyframe << 7) | w->skippable;
  3218. CHECK( mk_append_context_data( w->cluster, c_delta_flags, 3 ) );
  3219. if( w->frame )
  3220. {
  3221. CHECK( mk_append_context_data( w->cluster, w->frame->data, w->frame->d_cur ) );
  3222. w->frame->d_cur = 0;
  3223. }
  3224. - if( !w->keyframe )
  3225. - CHECK( mk_write_sint( w->cluster, 0xfb, ref ) ); // ReferenceBlock
  3226.  
  3227. w->in_frame = 0;
  3228. - w->prev_frame_tc_scaled = w->cluster_tc_scaled + delta;
  3229.  
  3230. if( w->cluster->d_cur > CLSIZE )
  3231. CHECK( mk_close_cluster( w ) );
  3232. @@ -509,19 +453,21 @@ int mk_start_frame( mk_writer *w )
  3233. if( mk_flush_frame( w ) < 0 )
  3234. return -1;
  3235.  
  3236. - w->in_frame = 1;
  3237. - w->keyframe = 0;
  3238. + w->in_frame = 1;
  3239. + w->keyframe = 0;
  3240. + w->skippable = 0;
  3241.  
  3242. return 0;
  3243. }
  3244.  
  3245. -int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe )
  3246. +int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe, int skippable )
  3247. {
  3248. if( !w->in_frame )
  3249. return -1;
  3250.  
  3251. - w->frame_tc = timestamp;
  3252. - w->keyframe = keyframe != 0;
  3253. + w->frame_tc = timestamp;
  3254. + w->keyframe = keyframe != 0;
  3255. + w->skippable = skippable != 0;
  3256.  
  3257. if( w->max_frame_tc < timestamp )
  3258. w->max_frame_tc = timestamp;
  3259. diff --git a/output/matroska_ebml.h b/output/matroska_ebml.h
  3260. index 252e781..56eb8cc 100644
  3261. --- a/output/matroska_ebml.h
  3262. +++ b/output/matroska_ebml.h
  3263. @@ -35,7 +35,7 @@ int mk_writeHeader( mk_writer *w, const char *writing_app,
  3264.  
  3265. int mk_start_frame( mk_writer *w );
  3266. int mk_add_frame_data( mk_writer *w, const void *data, unsigned size );
  3267. -int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe );
  3268. +int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe, int skippable );
  3269. int mk_close( mk_writer *w, int64_t last_delta );
  3270.  
  3271. #endif
  3272. --
  3273. 1.6.1.2
  3274.  
  3275.  
  3276. From b3076b3d1dbdbe2efb29d23c52c98a6596a02687 Mon Sep 17 00:00:00 2001
  3277. From: Alexander Strange <astrange@ithinksw.com>
  3278. Date: Sat, 13 Feb 2010 02:00:57 -0500
  3279. Subject: [PATCH 19/24] mkv: Write the x264 version into the file header
  3280.  
  3281. This only updates the "writing application"; matroska_ebml.c is the
  3282. "muxing application", but the version string for that is still hardcoded.
  3283. ---
  3284. output/matroska.c | 2 +-
  3285. 1 files changed, 1 insertions(+), 1 deletions(-)
  3286.  
  3287. diff --git a/output/matroska.c b/output/matroska.c
  3288. index db7639c..b1805e4 100644
  3289. --- a/output/matroska.c
  3290. +++ b/output/matroska.c
  3291. @@ -146,7 +146,7 @@ static int write_headers( hnd_t handle, x264_nal_t *p_nal )
  3292.  
  3293. memcpy( avcC+11+sps_size, pps, pps_size );
  3294.  
  3295. - ret = mk_writeHeader( p_mkv->w, "x264", "V_MPEG4/ISO/AVC",
  3296. + ret = mk_writeHeader( p_mkv->w, "x264" X264_VERSION, "V_MPEG4/ISO/AVC",
  3297. avcC, avcC_len, p_mkv->frame_duration, 50000,
  3298. p_mkv->width, p_mkv->height,
  3299. p_mkv->d_width, p_mkv->d_height );
  3300. --
  3301. 1.6.1.2
  3302.  
  3303.  
  3304. From 96f261c48ebe4108cd2e0f8a94d012a01f3f7235 Mon Sep 17 00:00:00 2001
  3305. From: Alexander Strange <astrange@ithinksw.com>
  3306. Date: Sat, 13 Feb 2010 02:22:04 -0500
  3307. Subject: [PATCH 20/24] Mark cli_input/output_t variables as const when possible
  3308.  
  3309. ---
  3310. input/avs.c | 2 +-
  3311. input/ffms.c | 2 +-
  3312. input/input.h | 10 +++++-----
  3313. input/lavf.c | 2 +-
  3314. input/y4m.c | 2 +-
  3315. input/yuv.c | 2 +-
  3316. output/flv.c | 2 +-
  3317. output/matroska.c | 2 +-
  3318. output/mp4.c | 2 +-
  3319. output/output.h | 8 ++++----
  3320. output/raw.c | 2 +-
  3321. 11 files changed, 18 insertions(+), 18 deletions(-)
  3322.  
  3323. diff --git a/input/avs.c b/input/avs.c
  3324. index 522f8fe..79b5c80 100644
  3325. --- a/input/avs.c
  3326. +++ b/input/avs.c
  3327. @@ -313,4 +313,4 @@ static int close_file( hnd_t handle )
  3328. return 0;
  3329. }
  3330.  
  3331. -cli_input_t avs_input = { open_file, get_frame_total, picture_alloc, read_frame, release_frame, picture_clean, close_file };
  3332. +const cli_input_t avs_input = { open_file, get_frame_total, picture_alloc, read_frame, release_frame, picture_clean, close_file };
  3333. diff --git a/input/ffms.c b/input/ffms.c
  3334. index b680967..14962c7 100644
  3335. --- a/input/ffms.c
  3336. +++ b/input/ffms.c
  3337. @@ -244,4 +244,4 @@ static int close_file( hnd_t handle )
  3338. return 0;
  3339. }
  3340.  
  3341. -cli_input_t ffms_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
  3342. +const cli_input_t ffms_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
  3343. diff --git a/input/input.h b/input/input.h
  3344. index 9fb425c..6e386f4 100644
  3345. --- a/input/input.h
  3346. +++ b/input/input.h
  3347. @@ -60,11 +60,11 @@ typedef struct
  3348. int (*close_file)( hnd_t handle );
  3349. } cli_input_t;
  3350.  
  3351. -extern cli_input_t yuv_input;
  3352. -extern cli_input_t y4m_input;
  3353. -extern cli_input_t avs_input;
  3354. +extern const cli_input_t yuv_input;
  3355. +extern const cli_input_t y4m_input;
  3356. +extern const cli_input_t avs_input;
  3357. extern cli_input_t thread_input;
  3358. -extern cli_input_t lavf_input;
  3359. -extern cli_input_t ffms_input;
  3360. +extern const cli_input_t lavf_input;
  3361. +extern const cli_input_t ffms_input;
  3362.  
  3363. #endif
  3364. diff --git a/input/lavf.c b/input/lavf.c
  3365. index 180e509..6ecc6b0 100644
  3366. --- a/input/lavf.c
  3367. +++ b/input/lavf.c
  3368. @@ -269,4 +269,4 @@ static int close_file( hnd_t handle )
  3369. return 0;
  3370. }
  3371.  
  3372. -cli_input_t lavf_input = { open_file, get_frame_total, picture_alloc, read_frame, NULL, picture_clean, close_file };
  3373. +const cli_input_t lavf_input = { open_file, get_frame_total, picture_alloc, read_frame, NULL, picture_clean, close_file };
  3374. diff --git a/input/y4m.c b/input/y4m.c
  3375. index 1619f74..8645ff7 100644
  3376. --- a/input/y4m.c
  3377. +++ b/input/y4m.c
  3378. @@ -242,4 +242,4 @@ static int close_file( hnd_t handle )
  3379. return 0;
  3380. }
  3381.  
  3382. -cli_input_t y4m_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
  3383. +const cli_input_t y4m_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
  3384. diff --git a/input/yuv.c b/input/yuv.c
  3385. index dbd0317..3e39e07 100644
  3386. --- a/input/yuv.c
  3387. +++ b/input/yuv.c
  3388. @@ -125,4 +125,4 @@ static int close_file( hnd_t handle )
  3389. return 0;
  3390. }
  3391.  
  3392. -cli_input_t yuv_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
  3393. +const cli_input_t yuv_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
  3394. diff --git a/output/flv.c b/output/flv.c
  3395. index b3e5d16..2e0a0e4 100644
  3396. --- a/output/flv.c
  3397. +++ b/output/flv.c
  3398. @@ -305,4 +305,4 @@ static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest
  3399. return 0;
  3400. }
  3401.  
  3402. -cli_output_t flv_output = { open_file, set_param, write_headers, write_frame, close_file };
  3403. +const cli_output_t flv_output = { open_file, set_param, write_headers, write_frame, close_file };
  3404. diff --git a/output/matroska.c b/output/matroska.c
  3405. index b1805e4..fb39ced 100644
  3406. --- a/output/matroska.c
  3407. +++ b/output/matroska.c
  3408. @@ -206,4 +206,4 @@ static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest
  3409. return ret;
  3410. }
  3411.  
  3412. -cli_output_t mkv_output = { open_file, set_param, write_headers, write_frame, close_file };
  3413. +const cli_output_t mkv_output = { open_file, set_param, write_headers, write_frame, close_file };
  3414. diff --git a/output/mp4.c b/output/mp4.c
  3415. index b817c82..b99eaed 100644
  3416. --- a/output/mp4.c
  3417. +++ b/output/mp4.c
  3418. @@ -298,4 +298,4 @@ static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_
  3419. return i_size;
  3420. }
  3421.  
  3422. -cli_output_t mp4_output = { open_file, set_param, write_headers, write_frame, close_file };
  3423. +const cli_output_t mp4_output = { open_file, set_param, write_headers, write_frame, close_file };
  3424. diff --git a/output/output.h b/output/output.h
  3425. index 851b819..c79b48e 100644
  3426. --- a/output/output.h
  3427. +++ b/output/output.h
  3428. @@ -33,9 +33,9 @@ typedef struct
  3429. int (*close_file)( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts );
  3430. } cli_output_t;
  3431.  
  3432. -extern cli_output_t raw_output;
  3433. -extern cli_output_t mkv_output;
  3434. -extern cli_output_t mp4_output;
  3435. -extern cli_output_t flv_output;
  3436. +extern const cli_output_t raw_output;
  3437. +extern const cli_output_t mkv_output;
  3438. +extern const cli_output_t mp4_output;
  3439. +extern const cli_output_t flv_output;
  3440.  
  3441. #endif
  3442. diff --git a/output/raw.c b/output/raw.c
  3443. index a4d1175..02e4c56 100644
  3444. --- a/output/raw.c
  3445. +++ b/output/raw.c
  3446. @@ -62,5 +62,5 @@ static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest
  3447. return fclose( (FILE*)handle );
  3448. }
  3449.  
  3450. -cli_output_t raw_output = { open_file, set_param, write_headers, write_frame, close_file };
  3451. +const cli_output_t raw_output = { open_file, set_param, write_headers, write_frame, close_file };
  3452.  
  3453. --
  3454. 1.6.1.2
  3455.  
  3456.  
  3457. From b1939fa0adbff86f35960c14619211d1ab51e174 Mon Sep 17 00:00:00 2001
  3458. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  3459. Date: Sat, 13 Feb 2010 00:52:31 -0800
  3460. Subject: [PATCH 21/24] Make the ABR buffer consider the distance to the end of the video
  3461. Should improve bitrate accuracy in 2-pass mode.
  3462. May also slightly improve quality by allowing more variation earlier on in a file.
  3463.  
  3464. Also fix abr_buffer with 1-pass: it does something very different than what it does for 2-pass.
  3465. Thus, the earlier change that increased it based on threads caused 1-pass ABR to be somewhat less accurate.
  3466. ---
  3467. encoder/ratecontrol.c | 6 ++++--
  3468. 1 files changed, 4 insertions(+), 2 deletions(-)
  3469.  
  3470. diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
  3471. index b2cbb26..0b809c5 100644
  3472. --- a/encoder/ratecontrol.c
  3473. +++ b/encoder/ratecontrol.c
  3474. @@ -1796,13 +1796,15 @@ static float rate_estimate_qscale( x264_t *h )
  3475. }
  3476. else
  3477. {
  3478. - double abr_buffer = 2 * rcc->rate_tolerance * rcc->bitrate * h->i_thread_frames;
  3479. + double abr_buffer = 2 * rcc->rate_tolerance * rcc->bitrate;
  3480.  
  3481. if( rcc->b_2pass )
  3482. {
  3483. - //FIXME adjust abr_buffer based on distance to the end of the video
  3484. int64_t diff;
  3485. int64_t predicted_bits = total_bits;
  3486. + /* Adjust ABR buffer based on distance to the end of the video. */
  3487. + if( rcc->num_entries > h->fenc->i_frame )
  3488. + abr_buffer *= X264_MAX( log( rcc->num_entries - h->fenc->i_frame ), 1 );
  3489.  
  3490. if( rcc->b_vbv )
  3491. {
  3492. --
  3493. 1.6.1.2
  3494.  
  3495.  
  3496. From 3779fc91240b422201395174e1610b2dd93334a9 Mon Sep 17 00:00:00 2001
  3497. From: David Conrad <lessen42@gmail.com>
  3498. Date: Sat, 13 Feb 2010 01:25:56 -0800
  3499. Subject: [PATCH 22/24] Use #ifdef instead of #if in checkasm
  3500.  
  3501. ---
  3502. tools/checkasm.c | 4 ++--
  3503. 1 files changed, 2 insertions(+), 2 deletions(-)
  3504.  
  3505. diff --git a/tools/checkasm.c b/tools/checkasm.c
  3506. index 0bedc5b..595bd9e 100644
  3507. --- a/tools/checkasm.c
  3508. +++ b/tools/checkasm.c
  3509. @@ -1662,13 +1662,13 @@ static int check_all_flags( void )
  3510. cpu1 &= ~X264_CPU_CACHELINE_64;
  3511. ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
  3512. }
  3513. -#elif ARCH_PPC
  3514. +#elif defined(ARCH_PPC)
  3515. if( x264_cpu_detect() & X264_CPU_ALTIVEC )
  3516. {
  3517. fprintf( stderr, "x264: ALTIVEC against C\n" );
  3518. ret = check_all_funcs( 0, X264_CPU_ALTIVEC );
  3519. }
  3520. -#elif ARCH_ARM
  3521. +#elif defined(ARCH_ARM)
  3522. if( x264_cpu_detect() & X264_CPU_ARMV6 )
  3523. ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV6, "ARMv6" );
  3524. if( x264_cpu_detect() & X264_CPU_NEON )
  3525. --
  3526. 1.6.1.2
  3527.  
  3528.  
  3529. From 2ac5d04b4d20142fc2c277fe5ef8dbe41c73fcdb Mon Sep 17 00:00:00 2001
  3530. From: David Conrad <lessen42@gmail.com>
  3531. Date: Fri, 8 Jan 2010 22:40:09 -0500
  3532. Subject: [PATCH 23/24] ARM NEON versions of weightp functions
  3533.  
  3534. ---
  3535. common/arm/mc-a.S | 305 +++++++++++++++++++++++++++++++++++++++++++++++++++++
  3536. common/arm/mc-c.c | 47 ++++++++
  3537. 2 files changed, 352 insertions(+), 0 deletions(-)
  3538.  
  3539. diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
  3540. index a62af39..e1db404 100644
  3541. --- a/common/arm/mc-a.S
  3542. +++ b/common/arm/mc-a.S
  3543. @@ -432,6 +432,311 @@ avg2_w20_loop:
  3544. .endfunc
  3545.  
  3546.  
  3547. +.macro weight_prologue type
  3548. + push {r4-r5,lr}
  3549. + ldr r4, [sp, #4*3] // weight_t
  3550. + ldr ip, [sp, #4*3+4] // h
  3551. +.ifc \type, full
  3552. + ldr lr, [r4, #32] // denom
  3553. +.endif
  3554. + ldrd r4, [r4, #32+4] // scale, offset
  3555. + vdup.16 q0, r4
  3556. + vdup.16 q1, r5
  3557. +.ifc \type, full
  3558. + rsb lr, lr, #0
  3559. + vdup.16 q2, lr
  3560. +.endif
  3561. +.endm
  3562. +
  3563. +// void mc_weight( uint8_t *dst, int dst_stride, uint8_t *src, int src_stride,
  3564. +// const x264_weight_t *weight, int height )
  3565. +function x264_mc_weight_w20_neon
  3566. + weight_prologue full
  3567. + sub r1, #16
  3568. +weight20_loop:
  3569. + subs ip, #2
  3570. + vld1.8 {d17-d19}, [r2], r3
  3571. + vmovl.u8 q10, d17
  3572. + vmovl.u8 q11, d18
  3573. + vmovl.u8 q14, d19
  3574. + vld1.8 {d16-d18}, [r2], r3
  3575. + vmovl.u8 q12, d16
  3576. + vmovl.u8 q13, d17
  3577. + vmovl.u8 q15, d18
  3578. + vmul.s16 q10, q10, q0
  3579. + vmul.s16 q11, q11, q0
  3580. + vmul.s16 q12, q12, q0
  3581. + vmul.s16 q13, q13, q0
  3582. + vmul.s16 d28, d28, d0
  3583. + vmul.s16 d29, d30, d0
  3584. + vrshl.s16 q10, q10, q2
  3585. + vrshl.s16 q11, q11, q2
  3586. + vrshl.s16 q12, q12, q2
  3587. + vrshl.s16 q13, q13, q2
  3588. + vrshl.s16 q14, q14, q2
  3589. + vadd.s16 q10, q10, q1
  3590. + vadd.s16 q11, q11, q1
  3591. + vadd.s16 q12, q12, q1
  3592. + vadd.s16 q13, q13, q1
  3593. + vadd.s16 q14, q14, q1
  3594. + vqmovun.s16 d16, q10
  3595. + vqmovun.s16 d17, q11
  3596. + vqmovun.s16 d18, q12
  3597. + vqmovun.s16 d19, q13
  3598. + vqmovun.s16 d20, q14
  3599. + vst1.8 {d16-d17}, [r0,:128]!
  3600. + vst1.32 {d20[0]}, [r0,:32], r1
  3601. + vst1.8 {d18-d19}, [r0,:128]!
  3602. + vst1.32 {d20[1]}, [r0,:32], r1
  3603. + bgt weight20_loop
  3604. + pop {r4-r5,pc}
  3605. +.endfunc
  3606. +
  3607. +function x264_mc_weight_w16_neon
  3608. + weight_prologue full
  3609. +weight16_loop:
  3610. + subs ip, #2
  3611. + vld1.8 {d16-d17}, [r2], r3
  3612. + vld1.8 {d18-d19}, [r2], r3
  3613. + vmovl.u8 q10, d16
  3614. + vmovl.u8 q11, d17
  3615. + vmovl.u8 q12, d18
  3616. + vmovl.u8 q13, d19
  3617. + vmul.s16 q10, q10, q0
  3618. + vmul.s16 q11, q11, q0
  3619. + vmul.s16 q12, q12, q0
  3620. + vmul.s16 q13, q13, q0
  3621. + vrshl.s16 q10, q10, q2
  3622. + vrshl.s16 q11, q11, q2
  3623. + vrshl.s16 q12, q12, q2
  3624. + vrshl.s16 q13, q13, q2
  3625. + vadd.s16 q10, q10, q1
  3626. + vadd.s16 q11, q11, q1
  3627. + vadd.s16 q12, q12, q1
  3628. + vadd.s16 q13, q13, q1
  3629. + vqmovun.s16 d16, q10
  3630. + vqmovun.s16 d17, q11
  3631. + vqmovun.s16 d18, q12
  3632. + vqmovun.s16 d19, q13
  3633. + vst1.8 {d16-d17}, [r0,:128], r1
  3634. + vst1.8 {d18-d19}, [r0,:128], r1
  3635. + bgt weight16_loop
  3636. + pop {r4-r5,pc}
  3637. +.endfunc
  3638. +
  3639. +function x264_mc_weight_w8_neon
  3640. + weight_prologue full
  3641. +weight8_loop:
  3642. + subs ip, #2
  3643. + vld1.8 {d16}, [r2], r3
  3644. + vld1.8 {d18}, [r2], r3
  3645. + vmovl.u8 q8, d16
  3646. + vmovl.u8 q9, d18
  3647. + vmul.s16 q8, q8, q0
  3648. + vmul.s16 q9, q9, q0
  3649. + vrshl.s16 q8, q8, q2
  3650. + vrshl.s16 q9, q9, q2
  3651. + vadd.s16 q8, q8, q1
  3652. + vadd.s16 q9, q9, q1
  3653. + vqmovun.s16 d16, q8
  3654. + vqmovun.s16 d18, q9
  3655. + vst1.8 {d16}, [r0,:64], r1
  3656. + vst1.8 {d18}, [r0,:64], r1
  3657. + bgt weight8_loop
  3658. + pop {r4-r5,pc}
  3659. +.endfunc
  3660. +
  3661. +function x264_mc_weight_w4_neon
  3662. + weight_prologue full
  3663. +weight4_loop:
  3664. + subs ip, #2
  3665. + vld1.32 {d16[]}, [r2], r3
  3666. + vld1.32 {d18[]}, [r2], r3
  3667. + vmovl.u8 q8, d16
  3668. + vmovl.u8 q9, d18
  3669. + vmul.s16 d16, d16, d0
  3670. + vmul.s16 d17, d18, d0
  3671. + vrshl.s16 q8, q8, q2
  3672. + vadd.s16 q8, q8, q1
  3673. + vqmovun.s16 d16, q8
  3674. + vst1.32 {d16[0]}, [r0,:32], r1
  3675. + vst1.32 {d16[1]}, [r0,:32], r1
  3676. + bgt weight4_loop
  3677. + pop {r4-r5,pc}
  3678. +.endfunc
  3679. +
  3680. +function x264_mc_weight_w20_nodenom_neon
  3681. + weight_prologue nodenom
  3682. + sub r1, #16
  3683. +weight20_nodenom_loop:
  3684. + subs ip, #2
  3685. + vld1.8 {d17-d19}, [r2], r3
  3686. + vmovl.u8 q10, d17
  3687. + vmovl.u8 q11, d18
  3688. + vmovl.u8 q14, d19
  3689. + vld1.8 {d16-d18}, [r2], r3
  3690. + vmovl.u8 q12, d16
  3691. + vmovl.u8 q13, d17
  3692. + vmovl.u8 q15, d18
  3693. + vmov q8, q1
  3694. + vmov q9, q1
  3695. + vmla.s16 q8, q10, q0
  3696. + vmla.s16 q9, q11, q0
  3697. + vmov q10, q1
  3698. + vmov q11, q1
  3699. + vmla.s16 q10, q12, q0
  3700. + vmla.s16 q11, q13, q0
  3701. + vmov q12, q1
  3702. + vmla.s16 d24, d28, d0
  3703. + vmla.s16 d25, d30, d0
  3704. + vqmovun.s16 d16, q8
  3705. + vqmovun.s16 d17, q9
  3706. + vqmovun.s16 d18, q10
  3707. + vqmovun.s16 d19, q11
  3708. + vqmovun.s16 d20, q12
  3709. + vst1.8 {d16-d17}, [r0,:128]!
  3710. + vst1.32 {d20[0]}, [r0,:32], r1
  3711. + vst1.8 {d18-d19}, [r0,:128]!
  3712. + vst1.32 {d20[1]}, [r0,:32], r1
  3713. + bgt weight20_nodenom_loop
  3714. + pop {r4-r5,pc}
  3715. +.endfunc
  3716. +
  3717. +function x264_mc_weight_w16_nodenom_neon
  3718. + weight_prologue nodenom
  3719. +weight16_nodenom_loop:
  3720. + subs ip, #2
  3721. + vld1.8 {d16-d17}, [r2], r3
  3722. + vld1.8 {d18-d19}, [r2], r3
  3723. + vmovl.u8 q12, d16
  3724. + vmovl.u8 q13, d17
  3725. + vmovl.u8 q14, d18
  3726. + vmovl.u8 q15, d19
  3727. + vmov q8, q1
  3728. + vmov q9, q1
  3729. + vmov q10, q1
  3730. + vmov q11, q1
  3731. + vmla.s16 q8, q12, q0
  3732. + vmla.s16 q9, q13, q0
  3733. + vmla.s16 q10, q14, q0
  3734. + vmla.s16 q11, q15, q0
  3735. + vqmovun.s16 d16, q8
  3736. + vqmovun.s16 d17, q9
  3737. + vqmovun.s16 d18, q10
  3738. + vqmovun.s16 d19, q11
  3739. + vst1.8 {d16-d17}, [r0,:128], r1
  3740. + vst1.8 {d18-d19}, [r0,:128], r1
  3741. + bgt weight16_nodenom_loop
  3742. + pop {r4-r5,pc}
  3743. +.endfunc
  3744. +
  3745. +function x264_mc_weight_w8_nodenom_neon
  3746. + weight_prologue nodenom
  3747. +weight8_nodenom_loop:
  3748. + subs ip, #2
  3749. + vld1.8 {d16}, [r2], r3
  3750. + vld1.8 {d18}, [r2], r3
  3751. + vmovl.u8 q8, d16
  3752. + vmovl.u8 q9, d18
  3753. + vmov q10, q1
  3754. + vmov q11, q1
  3755. + vmla.s16 q10, q8, q0
  3756. + vmla.s16 q11, q9, q0
  3757. + vqmovun.s16 d16, q10
  3758. + vqmovun.s16 d17, q11
  3759. + vst1.8 {d16}, [r0,:64], r1
  3760. + vst1.8 {d17}, [r0,:64], r1
  3761. + bgt weight8_nodenom_loop
  3762. + pop {r4-r5,pc}
  3763. +.endfunc
  3764. +
  3765. +function x264_mc_weight_w4_nodenom_neon
  3766. + weight_prologue nodenom
  3767. +weight4_nodenom_loop:
  3768. + subs ip, #2
  3769. + vld1.32 {d16[]}, [r2], r3
  3770. + vld1.32 {d18[]}, [r2], r3
  3771. + vmovl.u8 q8, d16
  3772. + vmovl.u8 q9, d18
  3773. + vmov q10, q1
  3774. + vmla.s16 d20, d16, d0
  3775. + vmla.s16 d21, d18, d0
  3776. + vqmovun.s16 d16, q10
  3777. + vst1.32 {d16[0]}, [r0,:32], r1
  3778. + vst1.32 {d16[1]}, [r0,:32], r1
  3779. + bgt weight4_nodenom_loop
  3780. + pop {r4-r5,pc}
  3781. +.endfunc
  3782. +
  3783. +.macro weight_simple_prologue
  3784. + push {lr}
  3785. + ldr lr, [sp, #4] // weight_t
  3786. + ldr ip, [sp, #8] // h
  3787. + ldr lr, [lr] // offset
  3788. + vdup.8 q1, lr
  3789. +.endm
  3790. +
  3791. +.macro weight_simple name op // name: function-name suffix, op: NEON instruction applied to the pixels
  3792. +function x264_mc_weight_w20_\name\()_neon
  3793. + weight_simple_prologue
  3794. +weight20_\name\()_loop:
  3795. + subs ip, #2 // two rows are handled per iteration
  3796. + vld1.8 {d16-d18}, [r2], r3 // 24 source bytes of one row, then step r2 by r3
  3797. + vld1.8 {d19-d21}, [r2], r3 // same for the following row
  3798. + \op q8, q8, q1 // combine each byte with the offset bytes held in q1
  3799. + \op q9, q9, q1
  3800. + \op q10, q10, q1
  3801. + vst1.8 {d16-d18}, [r0,:64], r1 // write the first row to r0, then step r0 by r1
  3802. + vst1.8 {d19-d21}, [r0,:64], r1
  3803. + bgt weight20_\name\()_loop // flags were set by the subs at the top of the loop
  3804. + pop {pc}
  3805. +.endfunc
  3806. +
  3807. +function x264_mc_weight_w16_\name\()_neon
  3808. + weight_simple_prologue
  3809. +weight16_\name\()_loop:
  3810. + subs ip, #2 // two rows are handled per iteration
  3811. + vld1.8 {d16-d17}, [r2], r3 // 16 source bytes per row
  3812. + vld1.8 {d18-d19}, [r2], r3
  3813. + \op q8, q8, q1 // first row
  3814. + \op q9, q9, q1 // second row
  3815. + vst1.8 {d16-d17}, [r0,:128], r1
  3816. + vst1.8 {d18-d19}, [r0,:128], r1
  3817. + bgt weight16_\name\()_loop
  3818. + pop {pc}
  3819. +.endfunc
  3820. +
  3821. +function x264_mc_weight_w8_\name\()_neon
  3822. + weight_simple_prologue
  3823. +weight8_\name\()_loop:
  3824. + subs ip, #2 // two rows are handled per iteration
  3825. + vld1.8 {d16}, [r2], r3 // 8 source bytes per row
  3826. + vld1.8 {d17}, [r2], r3
  3827. + \op q8, q8, q1 // both rows at once: d16 and d17 together form q8
  3828. + vst1.8 {d16}, [r0,:64], r1
  3829. + vst1.8 {d17}, [r0,:64], r1
  3830. + bgt weight8_\name\()_loop
  3831. + pop {pc}
  3832. +.endfunc
  3833. +
  3834. +function x264_mc_weight_w4_\name\()_neon
  3835. + weight_simple_prologue
  3836. +weight4_\name\()_loop:
  3837. + subs ip, #2 // two rows are handled per iteration
  3838. + vld1.32 {d16[]}, [r2], r3 // load one 32-bit word (4 bytes) into every lane of d16
  3839. + vld1.32 {d17[]}, [r2], r3
  3840. + \op q8, q8, q1 // both rows at once: d16 and d17 together form q8
  3841. + vst1.32 {d16[0]}, [r0,:32], r1 // only lane 0 (4 bytes) is written back per row
  3842. + vst1.32 {d17[0]}, [r0,:32], r1
  3843. + bgt weight4_\name\()_loop
  3844. + pop {pc}
  3845. +.endfunc
  3846. +.endm
  3847. +
  3848. +weight_simple offsetadd, vqadd.u8
  3849. +weight_simple offsetsub, vqsub.u8
  3850. +
  3851. +
  3852. // void mc_copy( uint8_t *dst, int dst_stride, uint8_t *src, int src_stride, int height )
  3853. function x264_mc_copy_w4_neon
  3854. ldr ip, [sp]
  3855. diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
  3856. index 20cf151..0a7b734 100644
  3857. --- a/common/arm/mc-c.c
  3858. +++ b/common/arm/mc-c.c
  3859. @@ -43,6 +43,48 @@ void x264_pixel_avg2_w8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
  3860. void x264_pixel_avg2_w16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
  3861. void x264_pixel_avg2_w20_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
  3862.  
  3863. +#define MC_WEIGHT(func) /* declare the four NEON weight routines named with suffix func and build a table of them */\
  3864. +void x264_mc_weight_w20##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
  3865. +void x264_mc_weight_w16##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
  3866. +void x264_mc_weight_w8##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
  3867. +void x264_mc_weight_w4##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
  3868. +/* 6-entry table; w4 and w16 each fill two slots. NOTE(review): presumably indexed by width/4 - confirm against callers */\
  3869. +static void (* const x264_mc##func##_wtab_neon[6])( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int ) =\
  3870. +{\
  3871. + x264_mc_weight_w4##func##_neon,\
  3872. + x264_mc_weight_w4##func##_neon,\
  3873. + x264_mc_weight_w8##func##_neon,\
  3874. + x264_mc_weight_w16##func##_neon,\
  3875. + x264_mc_weight_w16##func##_neon,\
  3876. + x264_mc_weight_w20##func##_neon,\
  3877. +};
  3878. +
  3879. +MC_WEIGHT()
  3880. +MC_WEIGHT(_nodenom)
  3881. +MC_WEIGHT(_offsetadd)
  3882. +MC_WEIGHT(_offsetsub)
  3883. +
  3884. +static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
  3885. +{
  3886. + /* Choose the NEON weight table for w; h is not referenced here. */
  3887. + if( w->i_scale != 1<<w->i_denom )
  3888. + {
  3889. + w->weightfn = w->i_denom ? x264_mc_wtab_neon : x264_mc_nodenom_wtab_neon;
  3890. + return;
  3891. + }
  3892. + /* Scale equals 1<<denom: only the offset applies, so cache its magnitude. */
  3893. + if( w->i_offset < 0 )
  3894. + {
  3895. + w->weightfn = x264_mc_offsetsub_wtab_neon;
  3896. + w->cachea[0] = -w->i_offset;
  3897. + }
  3898. + else
  3899. + {
  3900. + w->weightfn = x264_mc_offsetadd_wtab_neon;
  3901. + w->cachea[0] = w->i_offset;
  3902. + }
  3903. +}
  3904. +
  3905. void x264_mc_copy_w4_neon( uint8_t *, int, uint8_t *, int, int );
  3906. void x264_mc_copy_w8_neon( uint8_t *, int, uint8_t *, int, int );
  3907. void x264_mc_copy_w16_neon( uint8_t *, int, uint8_t *, int, int );
  3908. @@ -182,6 +224,11 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
  3909. pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_neon;
  3910. pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_neon;
  3911.  
  3912. + pf->weight = x264_mc_wtab_neon;
  3913. + pf->offsetadd = x264_mc_offsetadd_wtab_neon;
  3914. + pf->offsetsub = x264_mc_offsetsub_wtab_neon;
  3915. + pf->weight_cache = x264_weight_cache_neon;
  3916. +
  3917. // Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
  3918. #ifndef SYS_MACOSX
  3919. pf->memcpy_aligned = x264_memcpy_aligned_neon;
  3920. --
  3921. 1.6.1.2
  3922.  
  3923.  
  3924. From af5f0a12d0d3b74d0d0e50b1f4eee409c376a989 Mon Sep 17 00:00:00 2001
  3925. From: David Conrad <lessen42@gmail.com>
  3926. Date: Sun, 4 Oct 2009 07:24:42 -0400
  3927. Subject: [PATCH 24/24] iPhone compilation support
  3928. Also add --sysroot to configure options
  3929.  
  3930. To build for iPhone 3gs / iPod touch 3g:
  3931. CC=/Developer/Platforms/iPhoneOS.platform/Developer/usr/bin/gcc ./configure --host=arm-apple-darwin --sysroot=/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS3.0.sdk
  3932.  
  3933. For older devices, add
  3934. --extra-cflags='-arch armv6 -mcpu=arm1176jzf-s' --extra-ldflags='-arch armv6' --disable-asm
  3935. ---
  3936. common/arm/asm.S | 9 ++-
  3937. common/arm/pixel-a.S | 13 ++-
  3938. configure | 17 +++-
  3939. extras/gas-preprocessor.pl | 256 ++++++++++++++++++++++++++++++++++++++++++++
  3940. 4 files changed, 287 insertions(+), 8 deletions(-)
  3941. create mode 100755 extras/gas-preprocessor.pl
  3942.  
  3943. diff --git a/common/arm/asm.S b/common/arm/asm.S
  3944. index d163165..395267f 100644
  3945. --- a/common/arm/asm.S
  3946. +++ b/common/arm/asm.S
  3947. @@ -20,6 +20,12 @@
  3948.  
  3949. #include "config.h"
  3950.  
  3951. +#ifdef PREFIX
  3952. +# define EXTERN_ASM _
  3953. +#else
  3954. +# define EXTERN_ASM
  3955. +#endif
  3956. +
  3957. #ifdef __ELF__
  3958. # define ELF
  3959. #else
  3960. @@ -35,7 +41,8 @@ ELF .eabi_attribute 25, \val
  3961. .endm
  3962.  
  3963. .macro function name
  3964. - .global \name
  3965. + .global EXTERN_ASM\name
  3966. +EXTERN_ASM\name:
  3967. ELF .hidden \name
  3968. ELF .type \name, %function
  3969. .func \name
  3970. diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S
  3971. index 4dd65ed..d8533e5 100644
  3972. --- a/common/arm/pixel-a.S
  3973. +++ b/common/arm/pixel-a.S
  3974. @@ -110,16 +110,17 @@ SAD4_ARMV6 8
  3975.  
  3976. .macro SAD_FUNC w, h, name, align:vararg
  3977. function x264_pixel_sad\name\()_\w\()x\h\()_neon
  3978. + SAD_START_\w \align
  3979. +
  3980. .if \w == 16
  3981. - .set r, \h / 2 - 1
  3982. +.rept \h / 2 - 1
  3983. + SAD_\w \align
  3984. +.endr
  3985. .else
  3986. - .set r, \h - 1
  3987. -.endif
  3988. -
  3989. - SAD_START_\w \align
  3990. -.rept r
  3991. +.rept \h - 1
  3992. SAD_\w \align
  3993. .endr
  3994. +.endif
  3995.  
  3996. .if \w > 8
  3997. vabal.u8 q8, d4, d6
  3998. diff --git a/configure b/configure
  3999. index b254383..b25cd36 100755
  4000. --- a/configure
  4001. +++ b/configure
  4002. @@ -23,6 +23,7 @@ echo " --extra-cflags=ECFLAGS add ECFLAGS to CFLAGS"
  4003. echo " --extra-ldflags=ELDFLAGS add ELDFLAGS to LDFLAGS"
  4004. echo " --host=HOST build programs to run on HOST"
  4005. echo " --cross-prefix=PREFIX use PREFIX for compilation tools"
  4006. +echo " --sysroot=SYSROOT root of cross-build tree"
  4007. echo ""
  4008. exit 1
  4009. fi
  4010. @@ -223,6 +224,10 @@ for opt do
  4011. --cross-prefix=*)
  4012. cross_prefix="${opt#--cross-prefix=}"
  4013. ;;
  4014. + --sysroot=*)
  4015. + CFLAGS="$CFLAGS --sysroot=${opt#--sysroot=}"
  4016. + LDFLAGS="$LDFLAGS --sysroot=${opt#--sysroot=}"
  4017. + ;;
  4018. *)
  4019. echo "Unknown option $opt, ignored"
  4020. ;;
  4021. @@ -367,7 +372,17 @@ case $host_cpu in
  4022. ;;
  4023. arm*)
  4024. ARCH="ARM"
  4025. - AS="${AS-${cross_prefix}gcc}"
  4026. + if [ "$SYS" = MACOSX ] ; then
  4027. + AS="${AS-./extras/gas-preprocessor.pl $CC}"
  4028. + ASFLAGS="$ASFLAGS -DPREFIX -DPIC" # apple's ld doesn't support movw/movt relocations at all
  4029. + # build for armv7 by default
  4030. + if ! echo $CFLAGS | grep -Eq '\-arch' ; then
  4031. + CFLAGS="$CFLAGS -arch armv7"
  4032. + LDFLAGS="$LDFLAGS -arch armv7"
  4033. + fi
  4034. + else
  4035. + AS="${AS-${cross_prefix}gcc}"
  4036. + fi
  4037. ;;
  4038. s390|s390x)
  4039. ARCH="S390"
  4040. diff --git a/extras/gas-preprocessor.pl b/extras/gas-preprocessor.pl
  4041. new file mode 100755
  4042. index 0000000..d60893c
  4043. --- /dev/null
  4044. +++ b/extras/gas-preprocessor.pl
  4045. @@ -0,0 +1,256 @@
  4046. +#!/usr/bin/env perl
  4047. +# by David Conrad
  4048. +# This code is licensed under GPLv2 or later; go to gnu.org to read it
  4049. +# (not that it much matters for an asm preprocessor)
  4050. +# usage: set your assembler to be something like "perl gas-preprocessor.pl gcc"
  4051. +use strict;
  4052. +
  4053. +# Apple's gas is ancient and doesn't support modern preprocessing features like
  4054. +# .rept and has ugly macro syntax, among other things. Thus, this script
  4055. +# implements the subset of the gas preprocessor used by x264 and ffmpeg
  4056. +# that isn't supported by Apple's gas.
  4057. +
  4058. +# FIXME: doesn't work if the path has spaces, but oh well...
  4059. +my $gcc_cmd = join(' ', @ARGV); # the whole compiler command line as one string
  4060. +my $preprocess_c_cmd; # command whose stdout is read by pass 1
  4061. +
  4062. +if ($gcc_cmd =~ /\S+\.c/) {
  4063. + # C file (inline asm?) - compile to assembly with -S; the final command then assembles stdin
  4064. + $preprocess_c_cmd = "$gcc_cmd -S";
  4065. + $gcc_cmd =~ s/\S+\.c/-x assembler -/g;
  4066. +} elsif ($gcc_cmd =~ /\S+\.S/) {
  4067. + # asm file, just run the C preprocessor (-E); the final command then assembles stdin
  4068. + $preprocess_c_cmd = "$gcc_cmd -E";
  4069. + $gcc_cmd =~ s/\S+\.S/-x assembler -/g;
  4070. +} else {
  4071. + die "Unrecognized input filetype";
  4072. +}
  4073. +
  4074. +$preprocess_c_cmd =~ s/\S+\.o/-/g; # every *.o word becomes "-" (presumably the -o target, so output goes to stdout)
  4075. +
  4076. +open(ASMFILE, "-|", $preprocess_c_cmd) || die "Error running preprocessor";
  4077. +
  4078. +my $current_macro = '';
  4079. +my %macro_lines;
  4080. +my %macro_args;
  4081. +my %macro_args_default;
  4082. +
  4083. +my @pass1_lines;
  4084. +
  4085. +# pass 1: parse .macro
  4086. +# note that the handling of arguments is probably overly permissive vs. gas
  4087. +# but it should be the same for valid cases
  4088. +while (<ASMFILE>) {
  4089. + # comment out unsupported directives (prefixing them with @)
  4090. + s/\.type/@.type/x;
  4091. + s/\.func/@.func/x;
  4092. + s/\.endfunc/@.endfunc/x;
  4093. + s/\.ltorg/@.ltorg/x;
  4094. + s/\.size/@.size/x;
  4095. + s/\.fpu/@.fpu/x;
  4096. +
  4097. + # the syntax for these is a little different
  4098. + s/\.global/.globl/x;
  4099. + # also catch .section .rodata since the equivalent to .const_data is .section __DATA,__const
  4100. + s/(.*)\.rodata/.const_data/x;
  4101. + s/\.int/.long/x;
  4102. + s/\.float/.single/x;
  4103. +
  4104. + # catch unknown section names that aren't mach-o style (with a comma); the dot is literal so .subsection is not caught
  4105. + if (/\.section ([^,]*)$/) {
  4106. + die ".section $1 unsupported; figure out the mach-o section name and add it";
  4107. + }
  4108. +
  4109. + # macros creating macros is not handled (is that valid?)
  4110. + if (/\.macro\s+([\d\w\.]+)\s*(.*)/) {
  4111. + $current_macro = $1;
  4112. +
  4113. + # commas in the argument list are optional, so only use whitespace as the separator
  4114. + my $arglist = $2;
  4115. + $arglist =~ s/,/ /g;
  4116. +
  4117. + my @args = split(/\s+/, $arglist);
  4118. + foreach my $i (0 .. $#args) {
  4119. + my @argpair = split(/=/, $args[$i]); # name[=default]
  4120. + $macro_args{$current_macro}[$i] = $argpair[0]; # positional name, :vararg suffix kept
  4121. + $argpair[0] =~ s/:vararg$//;
  4122. + $macro_args_default{$current_macro}{$argpair[0]} = $argpair[1]; # undef when no default was given
  4123. + }
  4124. + # ensure %macro_lines has the macro name added as a key
  4125. + $macro_lines{$current_macro} = [];
  4126. + } elsif (/\.endm/) {
  4127. + if (!$current_macro) {
  4128. + die "ERROR: .endm without .macro";
  4129. + }
  4130. + $current_macro = '';
  4131. + } elsif ($current_macro) {
  4132. + push(@{$macro_lines{$current_macro}}, $_); # inside a macro body: record the line unexpanded
  4133. + } else {
  4134. + expand_macros($_); # ordinary line: expand macro invocations into @pass1_lines
  4135. + }
  4136. +}
  4137. +
  4138. +sub expand_macros { # append one line to @pass1_lines, recursively expanding a macro invocation if it is one
  4139. + my $line = $_[0];
  4140. + if ($line =~ /(\S+:|)\s*([\w\d\.]+)\s*(.*)/ && exists $macro_lines{$2}) {
  4141. + push(@pass1_lines, $1); # keep an optional leading label
  4142. + my $macro = $2;
  4143. +
  4144. + # commas are optional here too, but are syntactically important because
  4145. + # parameters can be blank
  4146. + my @arglist = split(/,/, $3);
  4147. + my @args;
  4148. + foreach (@arglist) {
  4149. + my @whitespace_split = split(/\s+/, $_);
  4150. + if (!@whitespace_split) {
  4151. + push(@args, ''); # nothing between two commas: an explicitly blank argument
  4152. + } else {
  4153. + foreach (@whitespace_split) {
  4154. + if (length($_)) {
  4155. + push(@args, $_);
  4156. + }
  4157. + }
  4158. + }
  4159. + }
  4160. +
  4161. + my %replacements; # argument name -> replacement text, seeded with the defaults
  4162. + if ($macro_args_default{$macro}){
  4163. + %replacements = %{$macro_args_default{$macro}};
  4164. + }
  4165. +
  4166. + # construct hashtable of text to replace
  4167. + foreach my $i (0 .. $#args) {
  4168. + my $argname = $macro_args{$macro}[$i];
  4169. +
  4170. + if ($args[$i] =~ m/=/) {
  4171. + # arg=val references the argument name
  4172. + # XXX: I'm not sure what the expected behaviour if a lot of
  4173. + # these are mixed with unnamed args
  4174. + my @named_arg = split(/=/, $args[$i]);
  4175. + $replacements{$named_arg[0]} = $named_arg[1];
  4176. + } elsif ($i > $#{$macro_args{$macro}}) {
  4177. + # more args given than the macro has named args
  4178. + # XXX: is vararg allowed on arguments before the last?
  4179. + $argname = $macro_args{$macro}[-1];
  4180. + if ($argname =~ s/:vararg$//) {
  4181. + $replacements{$argname} .= ", $args[$i]"; # extra arguments are appended to the vararg
  4182. + } else {
  4183. + die "Too many arguments to macro $macro";
  4184. + }
  4185. + } else {
  4186. + $argname =~ s/:vararg$//;
  4187. + $replacements{$argname} = $args[$i];
  4188. + }
  4189. + }
  4190. +
  4191. + # apply replacements as regex
  4192. + foreach (@{$macro_lines{$macro}}) {
  4193. + my $macro_line = $_;
  4194. + # do replacements by longest first, this avoids wrong replacement
  4195. + # when argument names are subsets of each other
  4196. + foreach (reverse sort {length $a <=> length $b} keys %replacements) {
  4197. + $macro_line =~ s/\\$_/$replacements{$_}/g;
  4198. + }
  4199. + $macro_line =~ s/\\\(\)//g; # remove \()
  4200. + expand_macros($macro_line); # the expanded line may itself invoke another macro
  4201. + }
  4202. + } else {
  4203. + push(@pass1_lines, $line);
  4204. + }
  4205. +}
  4206. +
  4207. +close(ASMFILE) or exit 1;
  4208. +open(ASMFILE, "|-", $gcc_cmd) or die "Error running assembler";
  4209. +
  4210. +my @sections;
  4211. +my $num_repts;
  4212. +my $rept_lines;
  4213. +
  4214. +my %literal_labels; # for ldr <reg>, =<expr>
  4215. +my $literal_num = 0;
  4216. +
  4217. +# pass 2: parse .rept and .if variants
  4218. +# NOTE: since we don't implement a proper parser, using .rept with a
  4219. +# variable assigned from .set is not supported
  4220. +foreach my $line (@pass1_lines) {
  4221. + # textual comparison .if (blankness is tested with length so an operand of "0" counts as non-blank)
  4222. + # this assumes nothing else on the same line
  4223. + if ($line =~ /\.ifnb\s+(.*)/) {
  4224. + if (length($1)) {
  4225. + $line = ".if 1\n";
  4226. + } else {
  4227. + $line = ".if 0\n";
  4228. + }
  4229. + } elsif ($line =~ /\.ifb\s+(.*)/) {
  4230. + if (length($1)) {
  4231. + $line = ".if 0\n";
  4232. + } else {
  4233. + $line = ".if 1\n";
  4234. + }
  4235. + } elsif ($line =~ /\.ifc\s+(.*)\s*,\s*(.*)/) {
  4236. + if ($1 eq $2) {
  4237. + $line = ".if 1\n";
  4238. + } else {
  4239. + $line = ".if 0\n";
  4240. + }
  4241. + }
  4242. +
  4243. + # handle .previous (only with regard to .section not .subsection)
  4244. + if ($line =~ /\.(section|text|const_data)/) {
  4245. + push(@sections, $line);
  4246. + } elsif ($line =~ /\.previous/) {
  4247. + if (!$sections[-2]) {
  4248. + die ".previous without a previous section";
  4249. + }
  4250. + $line = $sections[-2]; # re-emit the section directive that preceded the current one
  4251. + push(@sections, $line);
  4252. + }
  4253. +
  4254. + # handle ldr <reg>, =<expr> by loading from a generated label; one label per distinct expression
  4255. + if ($line =~ /(.*)\s*ldr([\w\s\d]+)\s*,\s*=(.*)/) {
  4256. + my $label = $literal_labels{$3};
  4257. + if (!$label) {
  4258. + $label = ".Literal_$literal_num";
  4259. + $literal_num++;
  4260. + $literal_labels{$3} = $label;
  4261. + }
  4262. + $line = "$1 ldr$2, $label\n";
  4263. + } elsif ($line =~ /\.ltorg/) {
  4264. + foreach my $literal (keys %literal_labels) {
  4265. + $line .= "$literal_labels{$literal}:\n .word $literal\n";
  4266. + }
  4267. + %literal_labels = (); # pending literals have been emitted here
  4268. + }
  4269. +
  4270. + # @l -> lo16() @ha -> ha16()
  4271. + $line =~ s/,\s+([^,]+)\@l(\s)/, lo16($1)$2/g;
  4272. + $line =~ s/,\s+([^,]+)\@ha(\s)/, ha16($1)$2/g;
  4273. +
  4274. + if ($line =~ /\.rept\s+(.*)/) {
  4275. + $num_repts = $1;
  4276. + $rept_lines = "\n"; # non-empty on purpose: its truthiness marks "inside a .rept block" below
  4277. +
  4278. + # handle the possibility of repeating another directive on the same line
  4279. + # .endr on the same line is not valid, I don't know if a non-directive is
  4280. + if ($num_repts =~ s/(\.\w+.*)//) {
  4281. + $rept_lines .= "$1\n";
  4282. + }
  4283. + $num_repts = eval($num_repts); # the count expression is evaluated as Perl code
  4284. + } elsif ($line =~ /\.endr/) {
  4285. + for (1 .. $num_repts) {
  4286. + print ASMFILE $rept_lines;
  4287. + }
  4288. + $rept_lines = '';
  4289. + } elsif ($rept_lines) {
  4290. + $rept_lines .= $line; # buffer the body until .endr
  4291. + } else {
  4292. + print ASMFILE $line;
  4293. + }
  4294. +}
  4295. +
  4296. +print ASMFILE ".text\n";
  4297. +foreach my $literal (keys %literal_labels) {
  4298. + print ASMFILE "$literal_labels{$literal}:\n .word $literal\n";
  4299. +}
  4300. +
  4301. +close(ASMFILE) or exit 1;
  4302. --
  4303. 1.6.1.2
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement