Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- From f555cf3758f46f3c4f7a2f05094b16f8f3c25a27 Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Fri, 29 Jan 2010 02:40:41 -0800
- Subject: [PATCH 01/26] Add ability to adjust ratecontrol parameters on the fly
- x264_encoder_reconfig and x264_picture_t->param can now be used to change ratecontrol parameters.
- This is extraordinarily useful in certain streaming situations where the encoder needs to adapt the bitrate to network circumstances.
- What can be changed:
- 1) CRF can be adjusted if in CRF mode.
- 2) VBV maxrate and bufsize can be adjusted if in VBV mode.
- 3) Bitrate can be adjusted if in CBR mode.
- However, x264 cannot switch between modes and cannot change bitrate in ABR mode.
- Also fix a bug where the x264_picture_t->param reconfig method would not always be frame-exact.
- Commit sponsored by SayMama video calling.
- ---
- encoder/encoder.c | 56 +++++++++++++++++++-
- encoder/ratecontrol.c | 137 +++++++++++++++++++++++-------------------------
- encoder/ratecontrol.h | 2 +
- x264.h | 7 ++-
- 4 files changed, 126 insertions(+), 76 deletions(-)
- diff --git a/encoder/encoder.c b/encoder/encoder.c
- index d873cd0..008d0f2 100644
- --- a/encoder/encoder.c
- +++ b/encoder/encoder.c
- @@ -507,6 +507,39 @@ static int x264_validate_parameters( x264_t *h )
- }
- h->param.rc.i_qp_max = x264_clip3( h->param.rc.i_qp_max, 0, 51 );
- h->param.rc.i_qp_min = x264_clip3( h->param.rc.i_qp_min, 0, h->param.rc.i_qp_max );
- + if( h->param.rc.i_vbv_buffer_size )
- + {
- + if( h->param.rc.i_rc_method == X264_RC_CQP )
- + {
- + x264_log( h, X264_LOG_WARNING, "VBV is incompatible with constant QP, ignored.\n" );
- + h->param.rc.i_vbv_max_bitrate = 0;
- + h->param.rc.i_vbv_buffer_size = 0;
- + }
- + else if( h->param.rc.i_vbv_max_bitrate == 0 )
- + {
- + if( h->param.rc.i_rc_method == X264_RC_ABR )
- + {
- + x264_log( h, X264_LOG_WARNING, "VBV maxrate unspecified, assuming CBR\n" );
- + h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
- + }
- + else
- + {
- + x264_log( h, X264_LOG_WARNING, "VBV bufsize set but maxrate unspecified, ignored\n" );
- + h->param.rc.i_vbv_buffer_size = 0;
- + }
- + }
- + else if( h->param.rc.i_vbv_max_bitrate < h->param.rc.i_bitrate &&
- + h->param.rc.i_rc_method == X264_RC_ABR )
- + {
- + x264_log( h, X264_LOG_WARNING, "max bitrate less than average bitrate, assuming CBR\n" );
- + h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
- + }
- + }
- + else if( h->param.rc.i_vbv_max_bitrate )
- + {
- + x264_log( h, X264_LOG_WARNING, "VBV maxrate specified, but no bufsize, ignored\n" );
- + h->param.rc.i_vbv_max_bitrate = 0;
- + }
- int max_slices = (h->param.i_height+((16<<h->param.b_interlaced)-1))/(16<<h->param.b_interlaced);
- if( h->param.b_sliced_threads )
- @@ -1071,7 +1104,7 @@ fail:
- ****************************************************************************/
- int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
- {
- - h = h->thread[h->i_thread_phase];
- + h = h->thread[h->thread[0]->i_thread_phase];
- x264_set_aspect_ratio( h, param, 0 );
- #define COPY(var) h->param.var = param->var
- COPY( i_frame_reference ); // but never uses more refs than initially specified
- @@ -1110,11 +1143,30 @@ int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
- COPY( i_slice_max_size );
- COPY( i_slice_max_mbs );
- COPY( i_slice_count );
- + /* VBV can't be turned on if it wasn't on to begin with */
- + if( h->param.rc.i_vbv_max_bitrate > 0 && h->param.rc.i_vbv_buffer_size > 0 &&
- + param->rc.i_vbv_max_bitrate > 0 && param->rc.i_vbv_buffer_size > 0 )
- + {
- + COPY( rc.i_vbv_max_bitrate );
- + COPY( rc.i_vbv_buffer_size );
- + COPY( rc.i_bitrate );
- + }
- + COPY( rc.f_rf_constant );
- #undef COPY
- mbcmp_init( h );
- - return x264_validate_parameters( h );
- + int ret = x264_validate_parameters( h );
- +
- + /* Supported reconfiguration options (1-pass only):
- + * vbv-maxrate
- + * vbv-bufsize
- + * crf
- + * bitrate (CBR only) */
- + if( !ret )
- + x264_ratecontrol_init_reconfigurable( h, 0 );
- +
- + return ret;
- }
- /****************************************************************************
- diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
- index 63b3be6..52196e7 100644
- --- a/encoder/ratecontrol.c
- +++ b/encoder/ratecontrol.c
- @@ -388,6 +388,53 @@ static char *x264_strcat_filename( char *input, char *suffix )
- return output;
- }
- +void x264_ratecontrol_init_reconfigurable( x264_t *h, int b_init )
- +{
- + x264_ratecontrol_t *rc = h->rc;
- + if( !b_init && rc->b_2pass )
- + return;
- +
- + if( h->param.rc.i_vbv_max_bitrate > 0 && h->param.rc.i_vbv_buffer_size > 0 )
- + {
- + if( h->param.rc.i_vbv_buffer_size < (int)(h->param.rc.i_vbv_max_bitrate / rc->fps) )
- + {
- + h->param.rc.i_vbv_buffer_size = h->param.rc.i_vbv_max_bitrate / rc->fps;
- + x264_log( h, X264_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
- + h->param.rc.i_vbv_buffer_size );
- + }
- +
- + /* We don't support changing the ABR bitrate right now,
- + so if the stream starts as CBR, keep it CBR. */
- + if( rc->b_vbv_min_rate )
- + h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
- + rc->buffer_rate = h->param.rc.i_vbv_max_bitrate * 1000. / rc->fps;
- + rc->buffer_size = h->param.rc.i_vbv_buffer_size * 1000.;
- + rc->single_frame_vbv = rc->buffer_rate * 1.1 > rc->buffer_size;
- + rc->cbr_decay = 1.0 - rc->buffer_rate / rc->buffer_size
- + * 0.5 * X264_MAX(0, 1.5 - rc->buffer_rate * rc->fps / rc->bitrate);
- + if( b_init )
- + {
- + if( h->param.rc.f_vbv_buffer_init > 1. )
- + h->param.rc.f_vbv_buffer_init = x264_clip3f( h->param.rc.f_vbv_buffer_init / h->param.rc.i_vbv_buffer_size, 0, 1 );
- + h->param.rc.f_vbv_buffer_init = X264_MAX( h->param.rc.f_vbv_buffer_init, rc->buffer_rate / rc->buffer_size );
- + rc->buffer_fill_final = rc->buffer_size * h->param.rc.f_vbv_buffer_init;
- + rc->b_vbv = 1;
- + rc->b_vbv_min_rate = !rc->b_2pass
- + && h->param.rc.i_rc_method == X264_RC_ABR
- + && h->param.rc.i_vbv_max_bitrate <= h->param.rc.i_bitrate;
- + }
- + }
- + if( h->param.rc.i_rc_method == X264_RC_CRF )
- + {
- + /* Arbitrary rescaling to make CRF somewhat similar to QP.
- + * Try to compensate for MB-tree's effects as well. */
- + double base_cplx = h->mb.i_mb_count * (h->param.i_bframe ? 120 : 80);
- + double mbtree_offset = h->param.rc.b_mb_tree ? (1.0-h->param.rc.f_qcompress)*13.5 : 0;
- + rc->rate_factor_constant = pow( base_cplx, 1 - rc->qcompress )
- + / qp2qscale( h->param.rc.f_rf_constant + mbtree_offset );
- + }
- +}
- +
- int x264_ratecontrol_new( x264_t *h )
- {
- x264_ratecontrol_t *rc;
- @@ -426,60 +473,10 @@ int x264_ratecontrol_new( x264_t *h )
- x264_log(h, X264_LOG_ERROR, "constant rate-factor is incompatible with 2pass.\n");
- return -1;
- }
- - if( h->param.rc.i_vbv_buffer_size )
- - {
- - if( h->param.rc.i_rc_method == X264_RC_CQP )
- - {
- - x264_log(h, X264_LOG_WARNING, "VBV is incompatible with constant QP, ignored.\n");
- - h->param.rc.i_vbv_max_bitrate = 0;
- - h->param.rc.i_vbv_buffer_size = 0;
- - }
- - else if( h->param.rc.i_vbv_max_bitrate == 0 )
- - {
- - if( h->param.rc.i_rc_method == X264_RC_ABR )
- - {
- - x264_log( h, X264_LOG_INFO, "VBV maxrate unspecified, assuming CBR\n" );
- - h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
- - }
- - else
- - {
- - x264_log( h, X264_LOG_INFO, "VBV bufsize set but maxrate unspecified, ignored\n" );
- - h->param.rc.i_vbv_buffer_size = 0;
- - }
- - }
- - }
- - if( h->param.rc.i_vbv_max_bitrate < h->param.rc.i_bitrate &&
- - h->param.rc.i_vbv_max_bitrate > 0)
- - x264_log(h, X264_LOG_WARNING, "max bitrate less than average bitrate, ignored.\n");
- - else if( h->param.rc.i_vbv_max_bitrate > 0 &&
- - h->param.rc.i_vbv_buffer_size > 0 )
- - {
- - if( h->param.rc.i_vbv_buffer_size < (int)(h->param.rc.i_vbv_max_bitrate / rc->fps) )
- - {
- - h->param.rc.i_vbv_buffer_size = h->param.rc.i_vbv_max_bitrate / rc->fps;
- - x264_log( h, X264_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
- - h->param.rc.i_vbv_buffer_size );
- - }
- - if( h->param.rc.f_vbv_buffer_init > 1. )
- - h->param.rc.f_vbv_buffer_init = x264_clip3f( h->param.rc.f_vbv_buffer_init / h->param.rc.i_vbv_buffer_size, 0, 1 );
- - rc->buffer_rate = h->param.rc.i_vbv_max_bitrate * 1000. / rc->fps;
- - rc->buffer_size = h->param.rc.i_vbv_buffer_size * 1000.;
- - rc->single_frame_vbv = rc->buffer_rate * 1.1 > rc->buffer_size;
- - h->param.rc.f_vbv_buffer_init = X264_MAX( h->param.rc.f_vbv_buffer_init, rc->buffer_rate / rc->buffer_size );
- - rc->buffer_fill_final = rc->buffer_size * h->param.rc.f_vbv_buffer_init;
- - rc->cbr_decay = 1.0 - rc->buffer_rate / rc->buffer_size
- - * 0.5 * X264_MAX(0, 1.5 - rc->buffer_rate * rc->fps / rc->bitrate);
- - rc->b_vbv = 1;
- - rc->b_vbv_min_rate = !rc->b_2pass
- - && h->param.rc.i_rc_method == X264_RC_ABR
- - && h->param.rc.i_vbv_max_bitrate <= h->param.rc.i_bitrate;
- - }
- - else if( h->param.rc.i_vbv_max_bitrate )
- - {
- - x264_log(h, X264_LOG_WARNING, "VBV maxrate specified, but no bufsize.\n");
- - h->param.rc.i_vbv_max_bitrate = 0;
- - }
- - if(rc->rate_tolerance < 0.01)
- +
- + x264_ratecontrol_init_reconfigurable( h, 1 );
- +
- + if( rc->rate_tolerance < 0.01 )
- {
- x264_log(h, X264_LOG_WARNING, "bitrate tolerance too small, using .01\n");
- rc->rate_tolerance = 0.01;
- @@ -499,16 +496,6 @@ int x264_ratecontrol_new( x264_t *h )
- rc->last_non_b_pict_type = SLICE_TYPE_I;
- }
- - if( h->param.rc.i_rc_method == X264_RC_CRF )
- - {
- - /* Arbitrary rescaling to make CRF somewhat similar to QP.
- - * Try to compensate for MB-tree's effects as well. */
- - double base_cplx = h->mb.i_mb_count * (h->param.i_bframe ? 120 : 80);
- - double mbtree_offset = h->param.rc.b_mb_tree ? (1.0-h->param.rc.f_qcompress)*13.5 : 0;
- - rc->rate_factor_constant = pow( base_cplx, 1 - rc->qcompress )
- - / qp2qscale( h->param.rc.f_rf_constant + mbtree_offset );
- - }
- -
- rc->ip_offset = 6.0 * log(h->param.rc.f_ip_factor) / log(2.0);
- rc->pb_offset = 6.0 * log(h->param.rc.f_pb_factor) / log(2.0);
- rc->qp_constant[SLICE_TYPE_P] = h->param.rc.i_qp_constant;
- @@ -1577,15 +1564,15 @@ static void update_vbv( x264_t *h, int bits )
- if( rct->buffer_fill_final < 0 )
- x264_log( h, X264_LOG_WARNING, "VBV underflow (frame %d, %.0f bits)\n", h->i_frame, rct->buffer_fill_final );
- rct->buffer_fill_final = X264_MAX( rct->buffer_fill_final, 0 );
- - rct->buffer_fill_final += rct->buffer_rate;
- - rct->buffer_fill_final = X264_MIN( rct->buffer_fill_final, rct->buffer_size );
- + rct->buffer_fill_final += rcc->buffer_rate;
- + rct->buffer_fill_final = X264_MIN( rct->buffer_fill_final, rcc->buffer_size );
- }
- // provisionally update VBV according to the planned size of all frames currently in progress
- static void update_vbv_plan( x264_t *h, int overhead )
- {
- x264_ratecontrol_t *rcc = h->rc;
- - rcc->buffer_fill = h->thread[0]->rc->buffer_fill_final - overhead;
- + rcc->buffer_fill = h->thread[0]->rc->buffer_fill_final;
- if( h->i_thread_frames > 1 )
- {
- int j = h->rc - h->thread[0]->rc;
- @@ -1603,6 +1590,8 @@ static void update_vbv_plan( x264_t *h, int overhead )
- rcc->buffer_fill = X264_MIN( rcc->buffer_fill, rcc->buffer_size );
- }
- }
- + rcc->buffer_fill = X264_MIN( rcc->buffer_fill, rcc->buffer_size );
- + rcc->buffer_fill -= overhead;
- }
- // apply VBV constraints and clip qscale to between lmin and lmax
- @@ -2027,8 +2016,7 @@ void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next )
- #define COPY(var) memcpy(&cur->rc->var, &prev->rc->var, sizeof(cur->rc->var))
- /* these vars are updated in x264_ratecontrol_start()
- * so copy them from the context that most recently started (prev)
- - * to the context that's about to start (cur).
- - */
- + * to the context that's about to start (cur). */
- COPY(accum_p_qp);
- COPY(accum_p_norm);
- COPY(last_satd);
- @@ -2040,6 +2028,14 @@ void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next )
- COPY(bframes);
- COPY(prev_zone);
- COPY(qpbuf_pos);
- + /* these vars can be updated by x264_ratecontrol_init_reconfigurable */
- + COPY(buffer_rate);
- + COPY(buffer_size);
- + COPY(single_frame_vbv);
- + COPY(cbr_decay);
- + COPY(b_vbv_min_rate);
- + COPY(rate_factor_constant);
- + COPY(bitrate);
- #undef COPY
- }
- if( cur != next )
- @@ -2047,8 +2043,7 @@ void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next )
- #define COPY(var) next->rc->var = cur->rc->var
- /* these vars are updated in x264_ratecontrol_end()
- * so copy them from the context that most recently ended (cur)
- - * to the context that's about to end (next)
- - */
- + * to the context that's about to end (next) */
- COPY(cplxr_sum);
- COPY(expected_bits_sum);
- COPY(wanted_bits_window);
- diff --git a/encoder/ratecontrol.h b/encoder/ratecontrol.h
- index 5a8d088..2767866 100644
- --- a/encoder/ratecontrol.h
- +++ b/encoder/ratecontrol.h
- @@ -27,6 +27,8 @@
- int x264_ratecontrol_new ( x264_t * );
- void x264_ratecontrol_delete( x264_t * );
- +void x264_ratecontrol_init_reconfigurable( x264_t *h, int b_init );
- +
- void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame );
- void x264_adaptive_quant( x264_t * );
- int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame );
- diff --git a/x264.h b/x264.h
- index 2550864..e7d19b7 100644
- --- a/x264.h
- +++ b/x264.h
- @@ -35,7 +35,7 @@
- #include <stdarg.h>
- -#define X264_BUILD 84
- +#define X264_BUILD 85
- /* x264_t:
- * opaque handler for encoder */
- @@ -480,11 +480,12 @@ typedef struct
- x264_t *x264_encoder_open( x264_param_t * );
- /* x264_encoder_reconfig:
- - * analysis-related parameters from x264_param_t are copied.
- + * various parameters from x264_param_t are copied.
- * this takes effect immediately, on whichever frame is encoded next;
- * due to delay, this may not be the next frame passed to encoder_encode.
- * if the change should apply to some particular frame, use x264_picture_t->param instead.
- - * returns 0 on success, negative on parameter validation error. */
- + * returns 0 on success, negative on parameter validation error.
- + * not all parameters can be changed; see the actual function for a detailed breakdown. */
- int x264_encoder_reconfig( x264_t *, x264_param_t * );
- /* x264_encoder_parameters:
- * copies the current internal set of parameters to the pointer provided
- --
- 1.6.1.2
- From 08d4a999b0300e50196afb3ee0e310834028b537 Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Mon, 1 Feb 2010 13:04:47 -0800
- Subject: [PATCH 02/26] Slightly faster predictor_difference_mmxext
- ---
- common/x86/util.h | 17 ++++++++++-------
- 1 files changed, 10 insertions(+), 7 deletions(-)
- diff --git a/common/x86/util.h b/common/x86/util.h
- index efc700a..c8bcf4b 100644
- --- a/common/x86/util.h
- +++ b/common/x86/util.h
- @@ -45,8 +45,9 @@ static inline void x264_median_mv_mmxext( int16_t *dst, int16_t *a, int16_t *b,
- #define x264_predictor_difference x264_predictor_difference_mmxext
- static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t i_mvc )
- {
- - int sum = 0;
- - uint16_t output[4];
- + int sum;
- + static const uint64_t pw_1 = 0x0001000100010001ULL;
- +
- asm(
- "pxor %%mm4, %%mm4 \n"
- "test $1, %1 \n"
- @@ -56,7 +57,7 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t
- "psubw %%mm3, %%mm0 \n"
- "jmp 2f \n"
- "3: \n"
- - "sub $1, %1 \n"
- + "dec %1 \n"
- "1: \n"
- "movq -8(%2,%1,4), %%mm0 \n"
- "psubw -4(%2,%1,4), %%mm0 \n"
- @@ -67,11 +68,13 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t
- "pmaxsw %%mm2, %%mm0 \n"
- "paddusw %%mm0, %%mm4 \n"
- "jg 1b \n"
- - "movq %%mm4, %0 \n"
- - :"=m"(output), "+r"(i_mvc)
- - :"r"(mvc), "m"(M64( mvc ))
- + "pmaddwd %4, %%mm4 \n"
- + "pshufw $14, %%mm4, %%mm0 \n"
- + "paddd %%mm0, %%mm4 \n"
- + "movd %%mm4, %0 \n"
- + :"=r"(sum), "+r"(i_mvc)
- + :"r"(mvc), "m"(M64( mvc )), "m"(pw_1)
- );
- - sum += output[0] + output[1] + output[2] + output[3];
- return sum;
- }
- #define x264_cabac_amvd_sum x264_cabac_amvd_sum_mmxext
- --
- 1.6.1.2
- From 1ec82b87c875c5fa6e66e9cbedb4ec04ac6c058c Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Tue, 2 Feb 2010 03:15:18 -0800
- Subject: [PATCH 03/26] Improve bidir search, fix some artifacts in fades
- Modify analysis to allow bidir to use different motion vectors than L0/L1.
- Always try the <0,0,0,0> motion vector for bidir.
- Eliminates almost all errant motion vectors in fades.
- Slightly improves PSNR as well (~0.015 dB).
- ---
- encoder/analyse.c | 50 ++++++++++++++++++++++++++++++++++++++------------
- 1 files changed, 38 insertions(+), 12 deletions(-)
- diff --git a/encoder/analyse.c b/encoder/analyse.c
- index 666596b..1fb2206 100644
- --- a/encoder/analyse.c
- +++ b/encoder/analyse.c
- @@ -40,6 +40,7 @@ typedef struct
- int i_ref;
- int i_rd16x16;
- x264_me_t me16x16;
- + x264_me_t bi16x16; /* for b16x16 BI mode, since MVs can differ from l0/l1 */
- /* 8x8 */
- int i_cost8x8;
- @@ -1722,20 +1723,45 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
- a->l1.me16x16.i_ref = a->l1.i_ref;
- /* get cost of BI mode */
- + int ref_costs = REF_COST( 0, a->l0.i_ref ) + REF_COST( 1, a->l1.i_ref );
- + h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
- + h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
- src0 = h->mc.get_ref( pix0, &stride0,
- h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
- - a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16, weight_none );
- + a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, weight_none );
- src1 = h->mc.get_ref( pix1, &stride1,
- h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
- - a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16, weight_none );
- + a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, weight_none );
- h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
- a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
- - + REF_COST( 0, a->l0.i_ref )
- - + REF_COST( 1, a->l1.i_ref )
- - + a->l0.me16x16.cost_mv
- - + a->l1.me16x16.cost_mv;
- + + ref_costs
- + + a->l0.bi16x16.cost_mv
- + + a->l1.bi16x16.cost_mv;
- +
- +
- + /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
- + if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
- + {
- + int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
- + + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
- + int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
- + + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
- + h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.i_ref][0], h->mb.pic.i_stride[0],
- + h->mb.pic.p_fref[1][a->l1.i_ref][0], h->mb.pic.i_stride[0],
- + h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
- + int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
- + + ref_costs + l0_mv_cost + l1_mv_cost;
- + if( cost00 < a->i_cost16x16bi )
- + {
- + M32( a->l0.bi16x16.mv ) = 0;
- + M32( a->l1.bi16x16.mv ) = 0;
- + a->l0.bi16x16.cost_mv = l0_mv_cost;
- + a->l1.bi16x16.cost_mv = l1_mv_cost;
- + a->i_cost16x16bi = cost00;
- + }
- + }
- /* mb type cost */
- a->i_cost16x16bi += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
- @@ -2205,7 +2231,7 @@ static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
- {
- case D_16x16:
- if( h->mb.i_type == B_BI_BI )
- - x264_me_refine_bidir_satd( h, &a->l0.me16x16, &a->l1.me16x16, i_biweight );
- + x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
- break;
- case D_16x8:
- for( i=0; i<2; i++ )
- @@ -2819,8 +2845,8 @@ intra_analysis:
- }
- else if( i_type == B_BI_BI )
- {
- - x264_me_refine_qpel( h, &analysis.l0.me16x16 );
- - x264_me_refine_qpel( h, &analysis.l1.me16x16 );
- + x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
- + x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
- }
- }
- else if( i_partition == D_16x8 )
- @@ -2938,7 +2964,7 @@ intra_analysis:
- x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
- }
- else if( i_type == B_BI_BI )
- - x264_me_refine_bidir_rd( h, &analysis.l0.me16x16, &analysis.l1.me16x16, i_biweight, 0, analysis.i_lambda2 );
- + x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
- }
- else if( i_partition == D_16x8 )
- {
- @@ -3121,10 +3147,10 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
- break;
- case B_BI_BI:
- x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
- - x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
- + x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );
- x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
- - x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
- + x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
- break;
- }
- break;
- --
- 1.6.1.2
- From dd349567b662bb4c2d629cf0967c87843b9bb3a3 Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Wed, 3 Feb 2010 14:22:05 -0800
- Subject: [PATCH 04/26] Faster CABAC MB header writing
- Reorganize the header writing to merge mb type and mb mode info (mv, pred, etc)
- Reduces redundant branches and better splits the code between frame types (for better code cache usage).
- Also slightly simplify qp delta calculation.
- Also make CAVLC and CABAC a bit more consistent in structure and function names.
- ---
- encoder/cabac.c | 573 ++++++++++++++++++++++++++-----------------------------
- encoder/cavlc.c | 118 ++++++------
- 2 files changed, 334 insertions(+), 357 deletions(-)
- diff --git a/encoder/cabac.c b/encoder/cabac.c
- index 271f527..6ff2aed 100644
- --- a/encoder/cabac.c
- +++ b/encoder/cabac.c
- @@ -29,151 +29,6 @@
- #define RDO_SKIP_BS 0
- #endif
- -static inline void x264_cabac_mb_type_intra( x264_t *h, x264_cabac_t *cb, int i_mb_type,
- - int ctx0, int ctx1, int ctx2, int ctx3, int ctx4, int ctx5 )
- -{
- - if( i_mb_type == I_4x4 || i_mb_type == I_8x8 )
- - {
- - x264_cabac_encode_decision_noup( cb, ctx0, 0 );
- - }
- -#if !RDO_SKIP_BS
- - else if( i_mb_type == I_PCM )
- - {
- - x264_cabac_encode_decision_noup( cb, ctx0, 1 );
- - x264_cabac_encode_flush( h, cb );
- - }
- -#endif
- - else
- - {
- - int i_pred = x264_mb_pred_mode16x16_fix[h->mb.i_intra16x16_pred_mode];
- -
- - x264_cabac_encode_decision_noup( cb, ctx0, 1 );
- - x264_cabac_encode_terminal( cb );
- -
- - x264_cabac_encode_decision_noup( cb, ctx1, !!h->mb.i_cbp_luma );
- - if( h->mb.i_cbp_chroma == 0 )
- - x264_cabac_encode_decision_noup( cb, ctx2, 0 );
- - else
- - {
- - x264_cabac_encode_decision( cb, ctx2, 1 );
- - x264_cabac_encode_decision_noup( cb, ctx3, h->mb.i_cbp_chroma>>1 );
- - }
- - x264_cabac_encode_decision( cb, ctx4, i_pred>>1 );
- - x264_cabac_encode_decision_noup( cb, ctx5, i_pred&1 );
- - }
- -}
- -
- -static void x264_cabac_mb_type( x264_t *h, x264_cabac_t *cb )
- -{
- - const int i_mb_type = h->mb.i_type;
- -
- - if( h->sh.b_mbaff &&
- - (!(h->mb.i_mb_y & 1) || IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride])) )
- - {
- - x264_cabac_encode_decision_noup( cb, 70 + h->mb.cache.i_neighbour_interlaced, h->mb.b_interlaced );
- - }
- -
- - if( h->sh.i_type == SLICE_TYPE_I )
- - {
- - int ctx = 0;
- - if( h->mb.i_mb_type_left >= 0 && h->mb.i_mb_type_left != I_4x4 )
- - ctx++;
- - if( h->mb.i_mb_type_top >= 0 && h->mb.i_mb_type_top != I_4x4 )
- - ctx++;
- -
- - x264_cabac_mb_type_intra( h, cb, i_mb_type, 3+ctx, 3+3, 3+4, 3+5, 3+6, 3+7 );
- - }
- - else if( h->sh.i_type == SLICE_TYPE_P )
- - {
- - /* prefix: 14, suffix: 17 */
- - if( i_mb_type == P_L0 )
- - {
- - x264_cabac_encode_decision_noup( cb, 14, 0 );
- - x264_cabac_encode_decision_noup( cb, 15, h->mb.i_partition != D_16x16 );
- - x264_cabac_encode_decision_noup( cb, 17-(h->mb.i_partition == D_16x16), h->mb.i_partition == D_16x8 );
- - }
- - else if( i_mb_type == P_8x8 )
- - {
- - x264_cabac_encode_decision_noup( cb, 14, 0 );
- - x264_cabac_encode_decision_noup( cb, 15, 0 );
- - x264_cabac_encode_decision_noup( cb, 16, 1 );
- - }
- - else /* intra */
- - {
- - /* prefix */
- - x264_cabac_encode_decision_noup( cb, 14, 1 );
- -
- - /* suffix */
- - x264_cabac_mb_type_intra( h, cb, i_mb_type, 17+0, 17+1, 17+2, 17+2, 17+3, 17+3 );
- - }
- - }
- - else //if( h->sh.i_type == SLICE_TYPE_B )
- - {
- - int ctx = 0;
- - if( h->mb.i_mb_type_left >= 0 && h->mb.i_mb_type_left != B_SKIP && h->mb.i_mb_type_left != B_DIRECT )
- - ctx++;
- - if( h->mb.i_mb_type_top >= 0 && h->mb.i_mb_type_top != B_SKIP && h->mb.i_mb_type_top != B_DIRECT )
- - ctx++;
- -
- - if( i_mb_type == B_DIRECT )
- - {
- - x264_cabac_encode_decision_noup( cb, 27+ctx, 0 );
- - return;
- - }
- - x264_cabac_encode_decision_noup( cb, 27+ctx, 1 );
- -
- - if( i_mb_type == B_8x8 )
- - {
- - x264_cabac_encode_decision_noup( cb, 27+3, 1 );
- - x264_cabac_encode_decision_noup( cb, 27+4, 1 );
- - x264_cabac_encode_decision( cb, 27+5, 1 );
- - x264_cabac_encode_decision( cb, 27+5, 1 );
- - x264_cabac_encode_decision_noup( cb, 27+5, 1 );
- - }
- - else if( IS_INTRA( i_mb_type ) )
- - {
- - /* prefix */
- - x264_cabac_encode_decision_noup( cb, 27+3, 1 );
- - x264_cabac_encode_decision_noup( cb, 27+4, 1 );
- - x264_cabac_encode_decision( cb, 27+5, 1 );
- - x264_cabac_encode_decision( cb, 27+5, 0 );
- - x264_cabac_encode_decision( cb, 27+5, 1 );
- -
- - /* suffix */
- - x264_cabac_mb_type_intra( h, cb, i_mb_type, 32+0, 32+1, 32+2, 32+2, 32+3, 32+3 );
- - }
- - else
- - {
- - static const uint8_t i_mb_bits[9*3] =
- - {
- - 0x31, 0x29, 0x4, /* L0 L0 */
- - 0x35, 0x2d, 0, /* L0 L1 */
- - 0x43, 0x63, 0, /* L0 BI */
- - 0x3d, 0x2f, 0, /* L1 L0 */
- - 0x39, 0x25, 0x6, /* L1 L1 */
- - 0x53, 0x73, 0, /* L1 BI */
- - 0x4b, 0x6b, 0, /* BI L0 */
- - 0x5b, 0x7b, 0, /* BI L1 */
- - 0x47, 0x67, 0x21 /* BI BI */
- - };
- -
- - const int idx = (i_mb_type - B_L0_L0) * 3 + (h->mb.i_partition - D_16x8);
- - int bits = i_mb_bits[idx];
- -
- - x264_cabac_encode_decision_noup( cb, 27+3, bits&1 );
- - x264_cabac_encode_decision( cb, 27+5-(bits&1), (bits>>1)&1 ); bits >>= 2;
- - if( bits != 1 )
- - {
- - x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
- - x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
- - x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
- - if( bits != 1 )
- - x264_cabac_encode_decision_noup( cb, 27+5, bits&1 );
- - }
- - }
- - }
- -}
- -
- static void x264_cabac_mb_intra4x4_pred_mode( x264_cabac_t *cb, int i_pred, int i_mode )
- {
- if( i_pred == i_mode )
- @@ -209,6 +64,12 @@ static void x264_cabac_mb_intra_chroma_pred_mode( x264_t *h, x264_cabac_t *cb )
- }
- }
- +static inline void x264_cabac_mb_transform_size( x264_t *h, x264_cabac_t *cb )
- +{
- + int ctx = 399 + h->mb.cache.i_neighbour_transform_size;
- + x264_cabac_encode_decision_noup( cb, ctx, h->mb.b_transform_8x8 );
- +}
- +
- static void x264_cabac_mb_cbp_luma( x264_t *h, x264_cabac_t *cb )
- {
- int cbp = h->mb.i_cbp_luma;
- @@ -244,7 +105,6 @@ static void x264_cabac_mb_cbp_chroma( x264_t *h, x264_cabac_t *cb )
- static void x264_cabac_mb_qp_delta( x264_t *h, x264_cabac_t *cb )
- {
- int i_dqp = h->mb.i_qp - h->mb.i_last_qp;
- - int ctx;
- /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely flat background area */
- if( h->mb.i_type == I_16x16 && !h->mb.cbp[h->mb.i_mb_xy] )
- @@ -257,7 +117,7 @@ static void x264_cabac_mb_qp_delta( x264_t *h, x264_cabac_t *cb )
- /* Since, per the above, empty-CBP I16x16 blocks never have delta quants,
- * we don't have to check for them. */
- - ctx = h->mb.i_last_dqp && h->mb.cbp[h->mb.i_mb_prev_xy];
- + int ctx = !!h->mb.i_last_dqp;
- if( i_dqp != 0 )
- {
- @@ -321,12 +181,6 @@ static inline void x264_cabac_mb_sub_b_partition( x264_cabac_t *cb, int i_sub )
- x264_cabac_encode_decision( cb, 39, i_sub == D_L1_8x8 );
- }
- -static inline void x264_cabac_mb_transform_size( x264_t *h, x264_cabac_t *cb )
- -{
- - int ctx = 399 + h->mb.cache.i_neighbour_transform_size;
- - x264_cabac_encode_decision_noup( cb, ctx, h->mb.b_transform_8x8 );
- -}
- -
- static void x264_cabac_mb_ref( x264_t *h, x264_cabac_t *cb, int i_list, int idx )
- {
- const int i8 = x264_scan8[idx];
- @@ -463,6 +317,267 @@ static inline void x264_cabac_mb8x8_mvd( x264_t *h, x264_cabac_t *cb, int i )
- }
- }
- +static void x264_cabac_mb_header_intra( x264_t *h, x264_cabac_t *cb, int i_mb_type,
- + int ctx0, int ctx1, int ctx2, int ctx3, int ctx4, int ctx5 )
- +{
- + if( i_mb_type == I_4x4 || i_mb_type == I_8x8 )
- + {
- + int i, di = h->mb.b_transform_8x8 ? 4 : 1;
- + x264_cabac_encode_decision_noup( cb, ctx0, 0 );
- +
- + if( h->pps->b_transform_8x8_mode )
- + x264_cabac_mb_transform_size( h, cb );
- +
- + for( i = 0; i < 16; i += di )
- + {
- + const int i_pred = x264_mb_predict_intra4x4_mode( h, i );
- + const int i_mode = x264_mb_pred_mode4x4_fix( h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] );
- + x264_cabac_mb_intra4x4_pred_mode( cb, i_pred, i_mode );
- + }
- + }
- +#if !RDO_SKIP_BS
- + else if( i_mb_type == I_PCM )
- + {
- + x264_cabac_encode_decision_noup( cb, ctx0, 1 );
- + x264_cabac_encode_flush( h, cb );
- + return;
- + }
- +#endif
- + else
- + {
- + int i_pred = x264_mb_pred_mode16x16_fix[h->mb.i_intra16x16_pred_mode];
- +
- + x264_cabac_encode_decision_noup( cb, ctx0, 1 );
- + x264_cabac_encode_terminal( cb );
- +
- + x264_cabac_encode_decision_noup( cb, ctx1, !!h->mb.i_cbp_luma );
- + if( h->mb.i_cbp_chroma == 0 )
- + x264_cabac_encode_decision_noup( cb, ctx2, 0 );
- + else
- + {
- + x264_cabac_encode_decision( cb, ctx2, 1 );
- + x264_cabac_encode_decision_noup( cb, ctx3, h->mb.i_cbp_chroma>>1 );
- + }
- + x264_cabac_encode_decision( cb, ctx4, i_pred>>1 );
- + x264_cabac_encode_decision_noup( cb, ctx5, i_pred&1 );
- + }
- + x264_cabac_mb_intra_chroma_pred_mode( h, cb );
- +}
- + /* Emit the CABAC macroblock header: MBAFF field flag (if any), mb_type bins, then ref/mvd syntax; intra types are delegated to x264_cabac_mb_header_intra. */
- +static inline void x264_cabac_mb_header( x264_t *h, x264_cabac_t *cb )
- +{
- + const int i_mb_type = h->mb.i_type;
- + int i_list, i;
- + /* With MBAFF, code b_interlaced on even MB rows, or when the MB one row up (i_mb_xy - i_mb_stride) is a skip. */
- + if( h->sh.b_mbaff &&
- + (!(h->mb.i_mb_y & 1) || IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride])) )
- + {
- + x264_cabac_encode_decision_noup( cb, 70 + h->mb.cache.i_neighbour_interlaced, h->mb.b_interlaced );
- + }
- +
- + if( h->sh.i_type == SLICE_TYPE_I )
- + {
- + int ctx = 0; /* number of left/top neighbours with a valid (>= 0) type other than I_4x4 */
- + if( h->mb.i_mb_type_left >= 0 && h->mb.i_mb_type_left != I_4x4 )
- + ctx++;
- + if( h->mb.i_mb_type_top >= 0 && h->mb.i_mb_type_top != I_4x4 )
- + ctx++;
- +
- + x264_cabac_mb_header_intra( h, cb, i_mb_type, 3+ctx, 3+3, 3+4, 3+5, 3+6, 3+7 );
- + }
- + else if( h->sh.i_type == SLICE_TYPE_P )
- + {
- + /* context indices used in this branch: mb_type prefix bins start at 14, the intra suffix at 17 */
- + if( i_mb_type == P_L0 )
- + {
- + x264_cabac_encode_decision_noup( cb, 14, 0 ); /* bin 0 on ctx 14 = inter type (the intra branch below writes 1) */
- + if( h->mb.i_partition == D_16x16 )
- + {
- + x264_cabac_encode_decision_noup( cb, 15, 0 );
- + x264_cabac_encode_decision_noup( cb, 16, 0 );
- + if( h->mb.pic.i_fref[0] > 1 ) /* a ref index is only written when list 0 holds more than one reference */
- + x264_cabac_mb_ref( h, cb, 0, 0 );
- + x264_cabac_mb_mvd( h, cb, 0, 0, 4, 4 );
- + }
- + else if( h->mb.i_partition == D_16x8 )
- + {
- + x264_cabac_encode_decision_noup( cb, 15, 1 );
- + x264_cabac_encode_decision_noup( cb, 17, 1 );
- + if( h->mb.pic.i_fref[0] > 1 )
- + {
- + x264_cabac_mb_ref( h, cb, 0, 0 );
- + x264_cabac_mb_ref( h, cb, 0, 8 );
- + }
- + x264_cabac_mb_mvd( h, cb, 0, 0, 4, 2 );
- + x264_cabac_mb_mvd( h, cb, 0, 8, 4, 2 );
- + }
- + else //if( h->mb.i_partition == D_8x16 )
- + {
- + x264_cabac_encode_decision_noup( cb, 15, 1 );
- + x264_cabac_encode_decision_noup( cb, 17, 0 );
- + if( h->mb.pic.i_fref[0] > 1 )
- + {
- + x264_cabac_mb_ref( h, cb, 0, 0 );
- + x264_cabac_mb_ref( h, cb, 0, 4 );
- + }
- + x264_cabac_mb_mvd( h, cb, 0, 0, 2, 4 );
- + x264_cabac_mb_mvd( h, cb, 0, 4, 2, 4 );
- + }
- + }
- + else if( i_mb_type == P_8x8 )
- + {
- + x264_cabac_encode_decision_noup( cb, 14, 0 );
- + x264_cabac_encode_decision_noup( cb, 15, 0 );
- + x264_cabac_encode_decision_noup( cb, 16, 1 );
- +
- + /* sub mb type */
- + for( i = 0; i < 4; i++ )
- + x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[i] );
- +
- + /* ref 0 */
- + if( h->mb.pic.i_fref[0] > 1 )
- + {
- + x264_cabac_mb_ref( h, cb, 0, 0 );
- + x264_cabac_mb_ref( h, cb, 0, 4 );
- + x264_cabac_mb_ref( h, cb, 0, 8 );
- + x264_cabac_mb_ref( h, cb, 0, 12 );
- + }
- +
- + for( i = 0; i < 4; i++ )
- + x264_cabac_mb8x8_mvd( h, cb, i );
- + }
- + else /* intra */
- + {
- + /* prefix */
- + x264_cabac_encode_decision_noup( cb, 14, 1 );
- +
- + /* suffix */
- + x264_cabac_mb_header_intra( h, cb, i_mb_type, 17+0, 17+1, 17+2, 17+2, 17+3, 17+3 );
- + }
- + }
- + else //if( h->sh.i_type == SLICE_TYPE_B )
- + {
- + int ctx = 0; /* number of left/top neighbours with a valid (>= 0) type that is neither B_SKIP nor B_DIRECT */
- + if( h->mb.i_mb_type_left >= 0 && h->mb.i_mb_type_left != B_SKIP && h->mb.i_mb_type_left != B_DIRECT )
- + ctx++;
- + if( h->mb.i_mb_type_top >= 0 && h->mb.i_mb_type_top != B_SKIP && h->mb.i_mb_type_top != B_DIRECT )
- + ctx++;
- +
- + if( i_mb_type == B_DIRECT )
- + {
- + x264_cabac_encode_decision_noup( cb, 27+ctx, 0 );
- + return; /* B_DIRECT: the single 0 bin is all this function writes */
- + }
- + x264_cabac_encode_decision_noup( cb, 27+ctx, 1 ); /* every other B type starts with a 1 bin */
- +
- + if( i_mb_type == B_8x8 )
- + {
- + x264_cabac_encode_decision_noup( cb, 27+3, 1 );
- + x264_cabac_encode_decision_noup( cb, 27+4, 1 );
- + x264_cabac_encode_decision ( cb, 27+5, 1 );
- + x264_cabac_encode_decision ( cb, 27+5, 1 );
- + x264_cabac_encode_decision_noup( cb, 27+5, 1 );
- +
- + /* sub mb type */
- + for( i = 0; i < 4; i++ )
- + x264_cabac_mb_sub_b_partition( cb, h->mb.i_sub_partition[i] );
- +
- + /* ref */
- + if( h->mb.pic.i_fref[0] > 1 )
- + for( i = 0; i < 4; i++ )
- + if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
- + x264_cabac_mb_ref( h, cb, 0, 4*i );
- +
- + if( h->mb.pic.i_fref[1] > 1 )
- + for( i = 0; i < 4; i++ )
- + if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
- + x264_cabac_mb_ref( h, cb, 1, 4*i );
- + /* mvds: every list-0 sub-block first, then every list-1 sub-block */
- + for( i = 0; i < 4; i++ )
- + if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
- + x264_cabac_mb_mvd( h, cb, 0, 4*i, 2, 2 );
- +
- + for( i = 0; i < 4; i++ )
- + if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
- + x264_cabac_mb_mvd( h, cb, 1, 4*i, 2, 2 );
- + }
- + else if( IS_INTRA( i_mb_type ) )
- + {
- + /* prefix */
- + x264_cabac_encode_decision_noup( cb, 27+3, 1 );
- + x264_cabac_encode_decision_noup( cb, 27+4, 1 );
- + x264_cabac_encode_decision ( cb, 27+5, 1 );
- + x264_cabac_encode_decision ( cb, 27+5, 0 );
- + x264_cabac_encode_decision ( cb, 27+5, 1 );
- +
- + /* suffix */
- + x264_cabac_mb_header_intra( h, cb, i_mb_type, 32+0, 32+1, 32+2, 32+2, 32+3, 32+3 );
- + }
- + else
- + {
- + static const uint8_t i_mb_bits[9*3] =
- + {
- + 0x31, 0x29, 0x4, /* L0 L0 */
- + 0x35, 0x2d, 0, /* L0 L1 */
- + 0x43, 0x63, 0, /* L0 BI */
- + 0x3d, 0x2f, 0, /* L1 L0 */
- + 0x39, 0x25, 0x6, /* L1 L1 */
- + 0x53, 0x73, 0, /* L1 BI */
- + 0x4b, 0x6b, 0, /* BI L0 */
- + 0x5b, 0x7b, 0, /* BI L1 */
- + 0x47, 0x67, 0x21 /* BI BI */
- + };
- + /* Table row = i_mb_type - B_L0_L0, column = i_partition - D_16x8; each byte packs the remaining bins LSB-first and is consumed by shifting below. */
- + const int idx = (i_mb_type - B_L0_L0) * 3 + (h->mb.i_partition - D_16x8);
- + int bits = i_mb_bits[idx];
- +
- + x264_cabac_encode_decision_noup( cb, 27+3, bits&1 );
- + x264_cabac_encode_decision( cb, 27+5-(bits&1), (bits>>1)&1 ); bits >>= 2; /* second bin uses ctx 27+4 when the first bin was 1, else 27+5 */
- + if( bits != 1 ) /* a remaining value of 1 marks the end of the bin string */
- + {
- + x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
- + x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
- + x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
- + if( bits != 1 )
- + x264_cabac_encode_decision_noup( cb, 27+5, bits&1 );
- + }
- + /* b_list[list][part] gates the ref/mvd writes below; presumably non-zero when partition `part` predicts from reference list `list` (table defined elsewhere). */
- + const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type];
- + if( h->mb.pic.i_fref[0] > 1 )
- + {
- + if( b_list[0][0] )
- + x264_cabac_mb_ref( h, cb, 0, 0 );
- + if( b_list[0][1] && h->mb.i_partition != D_16x16 )
- + x264_cabac_mb_ref( h, cb, 0, 8 >> (h->mb.i_partition == D_8x16) ); /* second partition: index 4 for 8x16, otherwise 8 */
- + }
- + if( h->mb.pic.i_fref[1] > 1 )
- + {
- + if( b_list[1][0] )
- + x264_cabac_mb_ref( h, cb, 1, 0 );
- + if( b_list[1][1] && h->mb.i_partition != D_16x16 )
- + x264_cabac_mb_ref( h, cb, 1, 8 >> (h->mb.i_partition == D_8x16) ); /* second partition: index 4 for 8x16, otherwise 8 */
- + }
- + for( i_list = 0; i_list < 2; i_list++ ) /* all list-0 mvds are written before any list-1 mvd */
- + {
- + if( h->mb.i_partition == D_16x16 )
- + {
- + if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, cb, i_list, 0, 4, 4 );
- + }
- + else if( h->mb.i_partition == D_16x8 )
- + {
- + if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, cb, i_list, 0, 4, 2 );
- + if( b_list[i_list][1] ) x264_cabac_mb_mvd( h, cb, i_list, 8, 4, 2 );
- + }
- + else //if( h->mb.i_partition == D_8x16 )
- + {
- + if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, cb, i_list, 0, 2, 4 );
- + if( b_list[i_list][1] ) x264_cabac_mb_mvd( h, cb, i_list, 4, 2, 4 );
- + }
- + }
- + }
- + }
- +}
- +
- /* i_ctxBlockCat: 0-> DC 16x16 i_idx = 0
- * 1-> AC 16x16 i_idx = luma4x4idx
- * 2-> Luma4x4 i_idx = luma4x4idx
- @@ -752,7 +867,6 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
- void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
- {
- const int i_mb_type = h->mb.i_type;
- - int i_list;
- int i;
- #if !RDO_SKIP_BS
- @@ -760,15 +874,14 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
- int i_mb_pos_tex;
- #endif
- - /* Write the MB type */
- - x264_cabac_mb_type( h, cb );
- + x264_cabac_mb_header( h, cb );
- #if !RDO_SKIP_BS
- + i_mb_pos_tex = x264_cabac_pos( cb );
- + h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
- +
- if( i_mb_type == I_PCM )
- {
- - i_mb_pos_tex = x264_cabac_pos( cb );
- - h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
- -
- memcpy( cb->p, h->mb.pic.p_fenc[0], 256 );
- cb->p += 256;
- for( i = 0; i < 8; i++ )
- @@ -793,140 +906,6 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
- }
- #endif
- - if( IS_INTRA( i_mb_type ) )
- - {
- - if( h->pps->b_transform_8x8_mode && i_mb_type != I_16x16 )
- - x264_cabac_mb_transform_size( h, cb );
- -
- - if( i_mb_type != I_16x16 )
- - {
- - int di = h->mb.b_transform_8x8 ? 4 : 1;
- - for( i = 0; i < 16; i += di )
- - {
- - const int i_pred = x264_mb_predict_intra4x4_mode( h, i );
- - const int i_mode = x264_mb_pred_mode4x4_fix( h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] );
- - x264_cabac_mb_intra4x4_pred_mode( cb, i_pred, i_mode );
- - }
- - }
- -
- - x264_cabac_mb_intra_chroma_pred_mode( h, cb );
- - }
- - else if( i_mb_type == P_L0 )
- - {
- - if( h->mb.i_partition == D_16x16 )
- - {
- - if( h->mb.pic.i_fref[0] > 1 )
- - {
- - x264_cabac_mb_ref( h, cb, 0, 0 );
- - }
- - x264_cabac_mb_mvd( h, cb, 0, 0, 4, 4 );
- - }
- - else if( h->mb.i_partition == D_16x8 )
- - {
- - if( h->mb.pic.i_fref[0] > 1 )
- - {
- - x264_cabac_mb_ref( h, cb, 0, 0 );
- - x264_cabac_mb_ref( h, cb, 0, 8 );
- - }
- - x264_cabac_mb_mvd( h, cb, 0, 0, 4, 2 );
- - x264_cabac_mb_mvd( h, cb, 0, 8, 4, 2 );
- - }
- - else //if( h->mb.i_partition == D_8x16 )
- - {
- - if( h->mb.pic.i_fref[0] > 1 )
- - {
- - x264_cabac_mb_ref( h, cb, 0, 0 );
- - x264_cabac_mb_ref( h, cb, 0, 4 );
- - }
- - x264_cabac_mb_mvd( h, cb, 0, 0, 2, 4 );
- - x264_cabac_mb_mvd( h, cb, 0, 4, 2, 4 );
- - }
- - }
- - else if( i_mb_type == P_8x8 )
- - {
- - /* sub mb type */
- - for( i = 0; i < 4; i++ )
- - x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[i] );
- -
- - /* ref 0 */
- - if( h->mb.pic.i_fref[0] > 1 )
- - {
- - x264_cabac_mb_ref( h, cb, 0, 0 );
- - x264_cabac_mb_ref( h, cb, 0, 4 );
- - x264_cabac_mb_ref( h, cb, 0, 8 );
- - x264_cabac_mb_ref( h, cb, 0, 12 );
- - }
- -
- - for( i = 0; i < 4; i++ )
- - x264_cabac_mb8x8_mvd( h, cb, i );
- - }
- - else if( i_mb_type == B_8x8 )
- - {
- - /* sub mb type */
- - for( i = 0; i < 4; i++ )
- - x264_cabac_mb_sub_b_partition( cb, h->mb.i_sub_partition[i] );
- -
- - /* ref */
- - if( h->mb.pic.i_fref[0] > 1 )
- - for( i = 0; i < 4; i++ )
- - if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
- - x264_cabac_mb_ref( h, cb, 0, 4*i );
- -
- - if( h->mb.pic.i_fref[1] > 1 )
- - for( i = 0; i < 4; i++ )
- - if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
- - x264_cabac_mb_ref( h, cb, 1, 4*i );
- -
- - for( i = 0; i < 4; i++ )
- - if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
- - x264_cabac_mb_mvd( h, cb, 0, 4*i, 2, 2 );
- -
- - for( i = 0; i < 4; i++ )
- - if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
- - x264_cabac_mb_mvd( h, cb, 1, 4*i, 2, 2 );
- - }
- - else if( i_mb_type != B_DIRECT )
- - {
- - /* All B mode */
- - const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type];
- - if( h->mb.pic.i_fref[0] > 1 )
- - {
- - if( b_list[0][0] )
- - x264_cabac_mb_ref( h, cb, 0, 0 );
- - if( b_list[0][1] && h->mb.i_partition != D_16x16 )
- - x264_cabac_mb_ref( h, cb, 0, 8 >> (h->mb.i_partition == D_8x16) );
- - }
- - if( h->mb.pic.i_fref[1] > 1 )
- - {
- - if( b_list[1][0] )
- - x264_cabac_mb_ref( h, cb, 1, 0 );
- - if( b_list[1][1] && h->mb.i_partition != D_16x16 )
- - x264_cabac_mb_ref( h, cb, 1, 8 >> (h->mb.i_partition == D_8x16) );
- - }
- - for( i_list = 0; i_list < 2; i_list++ )
- - {
- - if( h->mb.i_partition == D_16x16 )
- - {
- - if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, cb, i_list, 0, 4, 4 );
- - }
- - else if( h->mb.i_partition == D_16x8 )
- - {
- - if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, cb, i_list, 0, 4, 2 );
- - if( b_list[i_list][1] ) x264_cabac_mb_mvd( h, cb, i_list, 8, 4, 2 );
- - }
- - else //if( h->mb.i_partition == D_8x16 )
- - {
- - if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, cb, i_list, 0, 2, 4 );
- - if( b_list[i_list][1] ) x264_cabac_mb_mvd( h, cb, i_list, 4, 2, 4 );
- - }
- - }
- - }
- -
- -#if !RDO_SKIP_BS
- - i_mb_pos_tex = x264_cabac_pos( cb );
- - h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
- -#endif
- -
- if( i_mb_type != I_16x16 )
- {
- x264_cabac_mb_cbp_luma( h, cb );
- @@ -934,11 +913,9 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
- }
- if( x264_mb_transform_8x8_allowed( h ) && h->mb.i_cbp_luma )
- - {
- x264_cabac_mb_transform_size( h, cb );
- - }
- - if( h->mb.i_cbp_luma > 0 || h->mb.i_cbp_chroma > 0 || i_mb_type == I_16x16 )
- + if( h->mb.i_cbp_luma || h->mb.i_cbp_chroma || i_mb_type == I_16x16 )
- {
- const int b_intra = IS_INTRA( i_mb_type );
- x264_cabac_mb_qp_delta( h, cb );
- @@ -950,7 +927,7 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
- block_residual_write_cabac_cbf( h, cb, DCT_LUMA_DC, 24, h->dct.luma16x16_dc, 1 );
- /* AC Luma */
- - if( h->mb.i_cbp_luma != 0 )
- + if( h->mb.i_cbp_luma )
- for( i = 0; i < 16; i++ )
- block_residual_write_cabac_cbf( h, cb, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1, 1 );
- }
- @@ -967,7 +944,7 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
- block_residual_write_cabac_cbf( h, cb, DCT_LUMA_4x4, i, h->dct.luma4x4[i], b_intra );
- }
- - if( h->mb.i_cbp_chroma&0x03 ) /* Chroma DC residual present */
- + if( h->mb.i_cbp_chroma ) /* Chroma DC residual present */
- {
- block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], b_intra );
- block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], b_intra );
- diff --git a/encoder/cavlc.c b/encoder/cavlc.c
- index c65c9bd..d18408b 100644
- --- a/encoder/cavlc.c
- +++ b/encoder/cavlc.c
- @@ -203,7 +203,7 @@ static const uint8_t ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3};
- *nnz = block_residual_write_cavlc(h,cat,l,nC);\
- }
- -static void cavlc_qp_delta( x264_t *h )
- +static void x264_cavlc_mb_qp_delta( x264_t *h )
- {
- bs_t *s = &h->out.bs;
- int i_dqp = h->mb.i_qp - h->mb.i_last_qp;
- @@ -228,7 +228,7 @@ static void cavlc_qp_delta( x264_t *h )
- bs_write_se( s, i_dqp );
- }
- -static void cavlc_mb_mvd( x264_t *h, int i_list, int idx, int width )
- +static void x264_cavlc_mb_mvd( x264_t *h, int i_list, int idx, int width )
- {
- bs_t *s = &h->out.bs;
- ALIGNED_4( int16_t mvp[2] );
- @@ -237,26 +237,26 @@ static void cavlc_mb_mvd( x264_t *h, int i_list, int idx, int width )
- bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[idx]][1] - mvp[1] );
- }
- -static inline void cavlc_mb8x8_mvd( x264_t *h, int i )
- +static inline void x264_cavlc_mb8x8_mvd( x264_t *h, int i )
- {
- switch( h->mb.i_sub_partition[i] )
- {
- case D_L0_8x8:
- - cavlc_mb_mvd( h, 0, 4*i, 2 );
- + x264_cavlc_mb_mvd( h, 0, 4*i, 2 );
- break;
- case D_L0_8x4:
- - cavlc_mb_mvd( h, 0, 4*i+0, 2 );
- - cavlc_mb_mvd( h, 0, 4*i+2, 2 );
- + x264_cavlc_mb_mvd( h, 0, 4*i+0, 2 );
- + x264_cavlc_mb_mvd( h, 0, 4*i+2, 2 );
- break;
- case D_L0_4x8:
- - cavlc_mb_mvd( h, 0, 4*i+0, 1 );
- - cavlc_mb_mvd( h, 0, 4*i+1, 1 );
- + x264_cavlc_mb_mvd( h, 0, 4*i+0, 1 );
- + x264_cavlc_mb_mvd( h, 0, 4*i+1, 1 );
- break;
- case D_L0_4x4:
- - cavlc_mb_mvd( h, 0, 4*i+0, 1 );
- - cavlc_mb_mvd( h, 0, 4*i+1, 1 );
- - cavlc_mb_mvd( h, 0, 4*i+2, 1 );
- - cavlc_mb_mvd( h, 0, 4*i+3, 1 );
- + x264_cavlc_mb_mvd( h, 0, 4*i+0, 1 );
- + x264_cavlc_mb_mvd( h, 0, 4*i+1, 1 );
- + x264_cavlc_mb_mvd( h, 0, 4*i+2, 1 );
- + x264_cavlc_mb_mvd( h, 0, 4*i+3, 1 );
- break;
- }
- }
- @@ -372,7 +372,7 @@ void x264_macroblock_write_cavlc( x264_t *h )
- if( h->mb.pic.i_fref[0] > 1 )
- bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
- - cavlc_mb_mvd( h, 0, 0, 4 );
- + x264_cavlc_mb_mvd( h, 0, 0, 4 );
- }
- else if( h->mb.i_partition == D_16x8 )
- {
- @@ -382,8 +382,8 @@ void x264_macroblock_write_cavlc( x264_t *h )
- bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
- bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] );
- }
- - cavlc_mb_mvd( h, 0, 0, 4 );
- - cavlc_mb_mvd( h, 0, 8, 4 );
- + x264_cavlc_mb_mvd( h, 0, 0, 4 );
- + x264_cavlc_mb_mvd( h, 0, 8, 4 );
- }
- else if( h->mb.i_partition == D_8x16 )
- {
- @@ -393,8 +393,8 @@ void x264_macroblock_write_cavlc( x264_t *h )
- bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
- bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[4]] );
- }
- - cavlc_mb_mvd( h, 0, 0, 2 );
- - cavlc_mb_mvd( h, 0, 4, 2 );
- + x264_cavlc_mb_mvd( h, 0, 0, 2 );
- + x264_cavlc_mb_mvd( h, 0, 4, 2 );
- }
- }
- else if( i_mb_type == P_8x8 )
- @@ -429,7 +429,7 @@ void x264_macroblock_write_cavlc( x264_t *h )
- }
- for( i = 0; i < 4; i++ )
- - cavlc_mb8x8_mvd( h, i );
- + x264_cavlc_mb8x8_mvd( h, i );
- }
- else if( i_mb_type == B_8x8 )
- {
- @@ -452,10 +452,10 @@ void x264_macroblock_write_cavlc( x264_t *h )
- /* mvd */
- for( i = 0; i < 4; i++ )
- if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
- - cavlc_mb_mvd( h, 0, 4*i, 2 );
- + x264_cavlc_mb_mvd( h, 0, 4*i, 2 );
- for( i = 0; i < 4; i++ )
- if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
- - cavlc_mb_mvd( h, 1, 4*i, 2 );
- + x264_cavlc_mb_mvd( h, 1, 4*i, 2 );
- }
- else if( i_mb_type != B_DIRECT )
- {
- @@ -470,8 +470,8 @@ void x264_macroblock_write_cavlc( x264_t *h )
- {
- if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[0]] );
- if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[0]] );
- - if( b_list[0][0] ) cavlc_mb_mvd( h, 0, 0, 4 );
- - if( b_list[1][0] ) cavlc_mb_mvd( h, 1, 0, 4 );
- + if( b_list[0][0] ) x264_cavlc_mb_mvd( h, 0, 0, 4 );
- + if( b_list[1][0] ) x264_cavlc_mb_mvd( h, 1, 0, 4 );
- }
- else
- {
- @@ -481,17 +481,17 @@ void x264_macroblock_write_cavlc( x264_t *h )
- if( i_ref1_max && b_list[1][1] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[12]] );
- if( h->mb.i_partition == D_16x8 )
- {
- - if( b_list[0][0] ) cavlc_mb_mvd( h, 0, 0, 4 );
- - if( b_list[0][1] ) cavlc_mb_mvd( h, 0, 8, 4 );
- - if( b_list[1][0] ) cavlc_mb_mvd( h, 1, 0, 4 );
- - if( b_list[1][1] ) cavlc_mb_mvd( h, 1, 8, 4 );
- + if( b_list[0][0] ) x264_cavlc_mb_mvd( h, 0, 0, 4 );
- + if( b_list[0][1] ) x264_cavlc_mb_mvd( h, 0, 8, 4 );
- + if( b_list[1][0] ) x264_cavlc_mb_mvd( h, 1, 0, 4 );
- + if( b_list[1][1] ) x264_cavlc_mb_mvd( h, 1, 8, 4 );
- }
- else //if( h->mb.i_partition == D_8x16 )
- {
- - if( b_list[0][0] ) cavlc_mb_mvd( h, 0, 0, 2 );
- - if( b_list[0][1] ) cavlc_mb_mvd( h, 0, 4, 2 );
- - if( b_list[1][0] ) cavlc_mb_mvd( h, 1, 0, 2 );
- - if( b_list[1][1] ) cavlc_mb_mvd( h, 1, 4, 2 );
- + if( b_list[0][0] ) x264_cavlc_mb_mvd( h, 0, 0, 2 );
- + if( b_list[0][1] ) x264_cavlc_mb_mvd( h, 0, 4, 2 );
- + if( b_list[1][0] ) x264_cavlc_mb_mvd( h, 1, 0, 2 );
- + if( b_list[1][1] ) x264_cavlc_mb_mvd( h, 1, 4, 2 );
- }
- }
- }
- @@ -514,31 +514,31 @@ void x264_macroblock_write_cavlc( x264_t *h )
- bs_write1( s, h->mb.b_transform_8x8 );
- /* write residual */
- - if( i_mb_type == I_16x16 )
- + if( h->mb.i_cbp_luma || h->mb.i_cbp_chroma || i_mb_type == I_16x16 )
- {
- - cavlc_qp_delta( h );
- + x264_cavlc_mb_qp_delta( h );
- - /* DC Luma */
- - block_residual_write_cavlc( h, DCT_LUMA_DC, 24 , h->dct.luma16x16_dc );
- + if( i_mb_type == I_16x16 )
- + {
- + /* DC Luma */
- + block_residual_write_cavlc( h, DCT_LUMA_DC, 24 , h->dct.luma16x16_dc );
- - /* AC Luma */
- - if( h->mb.i_cbp_luma )
- - for( i = 0; i < 16; i++ )
- - block_residual_write_cavlc( h, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1 );
- - }
- - else if( h->mb.i_cbp_luma | h->mb.i_cbp_chroma )
- - {
- - cavlc_qp_delta( h );
- - x264_macroblock_luma_write_cavlc( h, 0, 3 );
- - }
- - if( h->mb.i_cbp_chroma )
- - {
- - /* Chroma DC residual present */
- - block_residual_write_cavlc( h, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0] );
- - block_residual_write_cavlc( h, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1] );
- - if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
- - for( i = 16; i < 24; i++ )
- - block_residual_write_cavlc( h, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 );
- + /* AC Luma */
- + if( h->mb.i_cbp_luma )
- + for( i = 0; i < 16; i++ )
- + block_residual_write_cavlc( h, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1 );
- + }
- + else
- + x264_macroblock_luma_write_cavlc( h, 0, 3 );
- +
- + if( h->mb.i_cbp_chroma ) /* Chroma DC residual present */
- + {
- + block_residual_write_cavlc( h, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0] );
- + block_residual_write_cavlc( h, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1] );
- + if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
- + for( i = 16; i < 24; i++ )
- + block_residual_write_cavlc( h, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 );
- + }
- }
- #if !RDO_SKIP_BS
- @@ -563,22 +563,22 @@ static int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel )
- if( i_mb_type == P_8x8 )
- {
- - cavlc_mb8x8_mvd( h, i8 );
- + x264_cavlc_mb8x8_mvd( h, i8 );
- bs_write_ue( s, sub_mb_type_p_to_golomb[ h->mb.i_sub_partition[i8] ] );
- }
- else if( i_mb_type == P_L0 )
- - cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 );
- + x264_cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 );
- else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 )
- {
- - if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 );
- - if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) cavlc_mb_mvd( h, 1, 4*i8, 4>>b_8x16 );
- + if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) x264_cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 );
- + if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) x264_cavlc_mb_mvd( h, 1, 4*i8, 4>>b_8x16 );
- }
- else //if( i_mb_type == B_8x8 )
- {
- if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] )
- - cavlc_mb_mvd( h, 0, 4*i8, 2 );
- + x264_cavlc_mb_mvd( h, 0, 4*i8, 2 );
- if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] )
- - cavlc_mb_mvd( h, 1, 4*i8, 2 );
- + x264_cavlc_mb_mvd( h, 1, 4*i8, 2 );
- }
- for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- )
- @@ -596,7 +596,7 @@ static int x264_subpartition_size_cavlc( x264_t *h, int i4, int i_pixel )
- {
- int b_8x4 = i_pixel == PIXEL_8x4;
- h->out.bs.i_bits_encoded = 0;
- - cavlc_mb_mvd( h, 0, i4, 1+b_8x4 );
- + x264_cavlc_mb_mvd( h, 0, i4, 1+b_8x4 );
- block_residual_write_cavlc( h, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4] );
- if( i_pixel != PIXEL_4x4 )
- {
- --
- 1.6.1.2
- From 8b3167396b9f48eefe4f6d1c7fda24d3f8e91dfc Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Wed, 3 Feb 2010 18:19:29 -0800
- Subject: [PATCH 05/26] Simplify decimate checks in macroblock_encode
- Also fix a misleading comment.
- ---
- common/common.h | 1 +
- encoder/analyse.c | 2 ++
- encoder/macroblock.c | 12 +++++-------
- 3 files changed, 8 insertions(+), 7 deletions(-)
- diff --git a/common/common.h b/common/common.h
- index 950f48f..8b1b05a 100644
- --- a/common/common.h
- +++ b/common/common.h
- @@ -484,6 +484,7 @@ struct x264_t
- int b_chroma_me;
- int b_trellis;
- int b_noise_reduction;
- + int b_dct_decimate;
- int i_psy_rd; /* Psy RD strength--fixed point value*/
- int i_psy_trellis; /* Psy trellis strength--fixed point value*/
- diff --git a/encoder/analyse.c b/encoder/analyse.c
- index 1fb2206..92d6584 100644
- --- a/encoder/analyse.c
- +++ b/encoder/analyse.c
- @@ -364,6 +364,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
- h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
- h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
- && h->mb.i_subpel_refine >= 5;
- + h->mb.b_dct_decimate = h->sh.i_type == SLICE_TYPE_B ||
- + (h->param.analyse.b_dct_decimate && h->sh.i_type != SLICE_TYPE_I);
- h->mb.b_transform_8x8 = 0;
- h->mb.b_noise_reduction = 0;
- diff --git a/encoder/macroblock.c b/encoder/macroblock.c
- index e4edb8a..fa7942d 100644
- --- a/encoder/macroblock.c
- +++ b/encoder/macroblock.c
- @@ -208,8 +208,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
- ALIGNED_ARRAY_16( int16_t, dct_dc4x4,[16] );
- int i, nz;
- - int b_decimate = h->sh.i_type == SLICE_TYPE_B || (h->param.analyse.b_dct_decimate && h->sh.i_type == SLICE_TYPE_P);
- - int decimate_score = b_decimate ? 0 : 9;
- + int decimate_score = h->mb.b_dct_decimate ? 0 : 9;
- if( h->mb.b_lossless )
- {
- @@ -342,7 +341,7 @@ static inline int x264_mb_optimize_chroma_dc( x264_t *h, int b_inter, int i_qp,
- void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
- {
- int i, ch, nz, nz_dc;
- - int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate);
- + int b_decimate = b_inter && h->mb.b_dct_decimate;
- ALIGNED_ARRAY_16( int16_t, dct2x2,[4] );
- h->mb.i_cbp_chroma = 0;
- @@ -607,7 +606,7 @@ void x264_macroblock_encode( x264_t *h )
- {
- int i_cbp_dc = 0;
- int i_qp = h->mb.i_qp;
- - int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate;
- + int b_decimate = h->mb.b_dct_decimate;
- int b_force_no_skip = 0;
- int i,idx,nz;
- h->mb.i_cbp_luma = 0;
- @@ -914,8 +913,7 @@ void x264_macroblock_encode( x264_t *h )
- /*****************************************************************************
- * x264_macroblock_probe_skip:
- - * Check if the current MB could be encoded as a [PB]_SKIP (it supposes you use
- - * the previous QP
- + * Check if the current MB could be encoded as a [PB]_SKIP
- *****************************************************************************/
- int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
- {
- @@ -1052,7 +1050,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
- int i_qp = h->mb.i_qp;
- uint8_t *p_fenc = h->mb.pic.p_fenc[0] + (i8&1)*8 + (i8>>1)*8*FENC_STRIDE;
- uint8_t *p_fdec = h->mb.pic.p_fdec[0] + (i8&1)*8 + (i8>>1)*8*FDEC_STRIDE;
- - int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate;
- + int b_decimate = h->mb.b_dct_decimate;
- int nnz8x8 = 0;
- int ch, nz;
- --
- 1.6.1.2
- From ea1bb5fb815d19ade6ace7482094bc8bb8b276c5 Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Wed, 3 Feb 2010 18:36:44 -0800
- Subject: [PATCH 06/26] Fix subpel iteration counts with B-frame analysis and subme 6/8
- Since subme 6 means "like subme 5, except RD on P-frames", B-frame analysis
- shouldn't use the RD subpel counts at subme 6. Similarly with subme 8.
- Slightly faster (and very marginally worse) compression at subme 6 and 8.
- ---
- encoder/analyse.c | 2 ++
- 1 files changed, 2 insertions(+), 0 deletions(-)
- diff --git a/encoder/analyse.c b/encoder/analyse.c
- index 92d6584..c15bf8f 100644
- --- a/encoder/analyse.c
- +++ b/encoder/analyse.c
- @@ -362,6 +362,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
- h->mb.i_me_method = h->param.analyse.i_me_method;
- h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
- + if( h->sh.i_type == SLICE_TYPE_B && (h->mb.i_subpel_refine == 6 || h->mb.i_subpel_refine == 8) )
- + h->mb.i_subpel_refine--;
- h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
- && h->mb.i_subpel_refine >= 5;
- h->mb.b_dct_decimate = h->sh.i_type == SLICE_TYPE_B ||
- --
- 1.6.1.2
- From 51f1ee4cfc93870c89c8708bcc79d83236c07f7e Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Wed, 3 Feb 2010 20:01:16 -0800
- Subject: [PATCH 07/26] Smarter QPRD
- Catch some cases in which RD checks can be avoided; reduces QPRD RD calls by 10-20%.
- ---
- encoder/analyse.c | 42 ++++++++++++++++++++++++++++++++++++++----
- 1 files changed, 38 insertions(+), 4 deletions(-)
- diff --git a/encoder/analyse.c b/encoder/analyse.c
- index c15bf8f..1d48b7d 100644
- --- a/encoder/analyse.c
- +++ b/encoder/analyse.c
- @@ -2307,9 +2307,10 @@ static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
- int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
- int last_qp_tried = 0;
- origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
- + int origcbp = h->mb.cbp[h->mb.i_mb_xy];
- /* If CBP is already zero, don't raise the quantizer any higher. */
- - for( direction = h->mb.cbp[h->mb.i_mb_xy] ? 1 : -1; direction >= -1; direction-=2 )
- + for( direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
- {
- /* Without psy-RD, require monotonicity when moving quant away from previous
- * macroblock's quant; allow 1 failure when moving quant towards previous quant.
- @@ -2324,14 +2325,47 @@ static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
- h->mb.i_qp = orig_qp;
- failures = 0;
- prevcost = origcost;
- +
- + /* If the current QP results in an empty CBP, it's highly likely that lower QPs
- + * (up to a point) will too. So, jump down to where the threshold will kick in
- + * and check the QP there. If the CBP is still empty, skip the main loop.
- + * If it isn't empty, we would have ended up having to check this QP anyways,
- + * so as long as we store it for later lookup, we lose nothing. */
- + int already_checked_qp = -1;
- + int already_checked_cost = COST_MAX;
- + if( direction == -1 )
- + {
- + if( !origcbp )
- + {
- + h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, h->param.rc.i_qp_min );
- + h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
- + already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
- + if( !h->mb.cbp[h->mb.i_mb_xy] )
- + {
- + /* If our empty-CBP block is lower QP than the last QP,
- + * the last QP almost surely doesn't have a CBP either. */
- + if( h->mb.i_last_qp > h->mb.i_qp )
- + last_qp_tried = 1;
- + break;
- + }
- + already_checked_qp = h->mb.i_qp;
- + h->mb.i_qp = orig_qp;
- + }
- + }
- +
- h->mb.i_qp += direction;
- while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
- {
- if( h->mb.i_last_qp == h->mb.i_qp )
- last_qp_tried = 1;
- - h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
- - cost = x264_rd_cost_mb( h, a->i_lambda2 );
- - COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
- + if( h->mb.i_qp == already_checked_qp )
- + cost = already_checked_cost;
- + else
- + {
- + h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
- + cost = x264_rd_cost_mb( h, a->i_lambda2 );
- + COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
- + }
- /* We can't assume that the costs are monotonic over QPs.
- * Tie case-as-failure seems to give better results. */
- --
- 1.6.1.2
- From 029e2dfc709039b56ec0cd195a0803c160ed73d9 Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Wed, 3 Feb 2010 20:27:57 -0800
- Subject: [PATCH 08/26] Fix 2-pass ratecontrol continuation in case of missing statsfile
- Didn't work properly if MB-tree was enabled.
- ---
- encoder/ratecontrol.c | 1 +
- 1 files changed, 1 insertions(+), 0 deletions(-)
- diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
- index 52196e7..e314ba2 100644
- --- a/encoder/ratecontrol.c
- +++ b/encoder/ratecontrol.c
- @@ -1280,6 +1280,7 @@ int x264_ratecontrol_slice_type( x264_t *h, int frame_num )
- h->thread[i]->param.rc.b_stat_read = 0;
- h->thread[i]->param.i_bframe_adaptive = 0;
- h->thread[i]->param.i_scenecut_threshold = 0;
- + h->thread[i]->param.rc.b_mb_tree = 0;
- if( h->thread[i]->param.i_bframe > 1 )
- h->thread[i]->param.i_bframe = 1;
- }
- --
- 1.6.1.2
- From de673993912a20ca9616f8733dbfbaf5c2d144f2 Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Fri, 5 Feb 2010 16:15:23 -0800
- Subject: [PATCH 09/26] Various CABAC/CAVLC cleanups/speedups
- Make some if/else chains into switch statements.
- Store CBP data in x264_t and only move it to frame storage later.
- This saves a wasted cache line and some unnecessary dereferences in RDO.
- ---
- common/common.h | 1 +
- common/macroblock.c | 3 +-
- encoder/analyse.c | 8 +-
- encoder/cabac.c | 40 +++---
- encoder/cavlc.c | 365 ++++++++++++++++++++++++++------------------------
- encoder/macroblock.c | 19 +--
- 6 files changed, 219 insertions(+), 217 deletions(-)
- diff --git a/common/common.h b/common/common.h
- index 8b1b05a..d4a8dd9 100644
- --- a/common/common.h
- +++ b/common/common.h
- @@ -542,6 +542,7 @@ struct x264_t
- ALIGNED_4( uint8_t i_sub_partition[4] );
- int b_transform_8x8;
- + int i_cbp_combined;
- int i_cbp_luma;
- int i_cbp_chroma;
- diff --git a/common/macroblock.c b/common/macroblock.c
- index 10f09ac..d86f3af 100644
- --- a/common/macroblock.c
- +++ b/common/macroblock.c
- @@ -1343,11 +1343,12 @@ void x264_macroblock_cache_save( x264_t *h )
- M16( &non_zero_count[16+2*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+2*2]-1] ) >> 8;
- M16( &non_zero_count[16+3*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+3*2]-1] ) >> 8;
- - if( h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0 )
- + if( h->mb.i_type != I_16x16 && !h->mb.i_cbp_combined )
- h->mb.i_qp = h->mb.i_last_qp;
- h->mb.qp[i_mb_xy] = h->mb.i_qp;
- h->mb.i_last_dqp = h->mb.i_qp - h->mb.i_last_qp;
- h->mb.i_last_qp = h->mb.i_qp;
- + h->mb.cbp[i_mb_xy] = h->mb.i_cbp_combined;
- }
- if( h->mb.i_cbp_luma == 0 && h->mb.i_type != I_8x8 )
- diff --git a/encoder/analyse.c b/encoder/analyse.c
- index 1d48b7d..63db36a 100644
- --- a/encoder/analyse.c
- +++ b/encoder/analyse.c
- @@ -1199,7 +1199,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
- h->mb.i_partition = D_16x16;
- x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
- a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
- - if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
- + if( !h->mb.i_cbp_combined )
- h->mb.i_type = P_SKIP;
- }
- }
- @@ -2307,7 +2307,7 @@ static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
- int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
- int last_qp_tried = 0;
- origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
- - int origcbp = h->mb.cbp[h->mb.i_mb_xy];
- + int origcbp = h->mb.i_cbp_combined;
- /* If CBP is already zero, don't raise the quantizer any higher. */
- for( direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
- @@ -2340,7 +2340,7 @@ static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
- h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, h->param.rc.i_qp_min );
- h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
- already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
- - if( !h->mb.cbp[h->mb.i_mb_xy] )
- + if( !h->mb.i_cbp_combined )
- {
- /* If our empty-CBP block is lower QP than the last QP,
- * the last QP almost surely doesn't have a CBP either. */
- @@ -2377,7 +2377,7 @@ static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
- if( failures > threshold )
- break;
- - if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
- + if( direction == 1 && !h->mb.i_cbp_combined )
- break;
- h->mb.i_qp += direction;
- }
- diff --git a/encoder/cabac.c b/encoder/cabac.c
- index 6ff2aed..6c14722 100644
- --- a/encoder/cabac.c
- +++ b/encoder/cabac.c
- @@ -107,7 +107,7 @@ static void x264_cabac_mb_qp_delta( x264_t *h, x264_cabac_t *cb )
- int i_dqp = h->mb.i_qp - h->mb.i_last_qp;
- /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely flat background area */
- - if( h->mb.i_type == I_16x16 && !h->mb.cbp[h->mb.i_mb_xy] )
- + if( h->mb.i_type == I_16x16 && !h->mb.i_cbp_combined )
- {
- #if !RDO_SKIP_BS
- h->mb.i_qp = h->mb.i_last_qp;
- @@ -915,7 +915,7 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
- if( x264_mb_transform_8x8_allowed( h ) && h->mb.i_cbp_luma )
- x264_cabac_mb_transform_size( h, cb );
- - if( h->mb.i_cbp_luma || h->mb.i_cbp_chroma || i_mb_type == I_16x16 )
- + if( h->mb.i_cbp_combined || i_mb_type == I_16x16 )
- {
- const int b_intra = IS_INTRA( i_mb_type );
- x264_cabac_mb_qp_delta( h, cb );
- @@ -973,24 +973,24 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
- int b_8x16 = h->mb.i_partition == D_8x16;
- int j;
- - if( i_mb_type == P_8x8 )
- + switch( i_mb_type )
- {
- - x264_cabac_mb8x8_mvd( h, cb, i8 );
- - x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[i8] );
- - }
- - else if( i_mb_type == P_L0 )
- - x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 );
- - else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 )
- - {
- - if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 );
- - if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) x264_cabac_mb_mvd( h, cb, 1, 4*i8, 4>>b_8x16, 2<<b_8x16 );
- - }
- - else //if( i_mb_type == B_8x8 )
- - {
- - if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] )
- - x264_cabac_mb_mvd( h, cb, 0, 4*i8, 2, 2 );
- - if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] )
- - x264_cabac_mb_mvd( h, cb, 1, 4*i8, 2, 2 );
- + case P_L0:
- + x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 );
- + break;
- + case P_8x8:
- + x264_cabac_mb8x8_mvd( h, cb, i8 );
- + x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[i8] );
- + break;
- + case B_8x8:
- + if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] )
- + x264_cabac_mb_mvd( h, cb, 0, 4*i8, 2, 2 );
- + if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] )
- + x264_cabac_mb_mvd( h, cb, 1, 4*i8, 2, 2 );
- + break;
- + default: /* Rest of the B types */
- + if( x264_mb_type_list_table[i_mb_type][0][!!i8] ) x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 );
- + if( x264_mb_type_list_table[i_mb_type][1][!!i8] ) x264_cabac_mb_mvd( h, cb, 1, 4*i8, 4>>b_8x16, 2<<b_8x16 );
- }
- for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- )
- @@ -1019,9 +1019,7 @@ static void x264_subpartition_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, i
- int b_8x4 = i_pixel == PIXEL_8x4;
- block_residual_write_cabac_cbf( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 0 );
- if( i_pixel == PIXEL_4x4 )
- - {
- x264_cabac_mb_mvd( h, cb, 0, i4, 1, 1 );
- - }
- else
- {
- x264_cabac_mb_mvd( h, cb, 0, i4, 1+b_8x4, 2-b_8x4 );
- diff --git a/encoder/cavlc.c b/encoder/cavlc.c
- index d18408b..45b55fe 100644
- --- a/encoder/cavlc.c
- +++ b/encoder/cavlc.c
- @@ -209,8 +209,7 @@ static void x264_cavlc_mb_qp_delta( x264_t *h )
- int i_dqp = h->mb.i_qp - h->mb.i_last_qp;
- /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely flat background area */
- - if( h->mb.i_type == I_16x16 && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma)
- - && !h->mb.cache.non_zero_count[x264_scan8[24]] )
- + if( h->mb.i_type == I_16x16 && !h->mb.i_cbp_combined )
- {
- #if !RDO_SKIP_BS
- h->mb.i_qp = h->mb.i_last_qp;
- @@ -302,201 +301,209 @@ void x264_macroblock_write_cavlc( x264_t *h )
- bs_write1( s, h->mb.b_interlaced );
- }
- -#if !RDO_SKIP_BS
- - if( i_mb_type == I_PCM )
- - {
- - uint8_t *p_start = s->p_start;
- - bs_write_ue( s, i_mb_i_offset + 25 );
- - i_mb_pos_tex = bs_pos( s );
- - h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
- -
- - bs_align_0( s );
- -
- - memcpy( s->p, h->mb.pic.p_fenc[0], 256 );
- - s->p += 256;
- - for( i = 0; i < 8; i++ )
- - memcpy( s->p + i*8, h->mb.pic.p_fenc[1] + i*FENC_STRIDE, 8 );
- - s->p += 64;
- - for( i = 0; i < 8; i++ )
- - memcpy( s->p + i*8, h->mb.pic.p_fenc[2] + i*FENC_STRIDE, 8 );
- - s->p += 64;
- -
- - bs_init( s, s->p, s->p_end - s->p );
- - s->p_start = p_start;
- -
- - /* if PCM is chosen, we need to store reconstructed frame data */
- - h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE, 16 );
- - h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, 8 );
- - h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, 8 );
- -
- - h->stat.frame.i_tex_bits += bs_pos(s) - i_mb_pos_tex;
- - return;
- - }
- -#endif
- -
- /* Write:
- - type
- - prediction
- - mv */
- - if( i_mb_type == I_4x4 || i_mb_type == I_8x8 )
- + switch( i_mb_type )
- {
- - int di = i_mb_type == I_8x8 ? 4 : 1;
- - bs_write_ue( s, i_mb_i_offset + 0 );
- - if( h->pps->b_transform_8x8_mode )
- - bs_write1( s, h->mb.b_transform_8x8 );
- -
- - /* Prediction: Luma */
- - for( i = 0; i < 16; i += di )
- + case I_4x4:
- + case I_8x8:
- {
- - int i_pred = x264_mb_predict_intra4x4_mode( h, i );
- - int i_mode = x264_mb_pred_mode4x4_fix( h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] );
- + int di = i_mb_type == I_8x8 ? 4 : 1;
- + bs_write_ue( s, i_mb_i_offset + 0 );
- + if( h->pps->b_transform_8x8_mode )
- + bs_write1( s, h->mb.b_transform_8x8 );
- - if( i_pred == i_mode )
- - bs_write1( s, 1 ); /* b_prev_intra4x4_pred_mode */
- - else
- - bs_write( s, 4, i_mode - (i_mode > i_pred) );
- + /* Prediction: Luma */
- + for( i = 0; i < 16; i += di )
- + {
- + int i_pred = x264_mb_predict_intra4x4_mode( h, i );
- + int i_mode = x264_mb_pred_mode4x4_fix( h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] );
- +
- + if( i_pred == i_mode )
- + bs_write1( s, 1 ); /* b_prev_intra4x4_pred_mode */
- + else
- + bs_write( s, 4, i_mode - (i_mode > i_pred) );
- + }
- + bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
- + break;
- + case I_16x16:
- + bs_write_ue( s, i_mb_i_offset + 1 + x264_mb_pred_mode16x16_fix[h->mb.i_intra16x16_pred_mode] +
- + h->mb.i_cbp_chroma * 4 + ( h->mb.i_cbp_luma == 0 ? 0 : 12 ) );
- + bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
- + break;
- }
- - bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
- - }
- - else if( i_mb_type == I_16x16 )
- - {
- - bs_write_ue( s, i_mb_i_offset + 1 + x264_mb_pred_mode16x16_fix[h->mb.i_intra16x16_pred_mode] +
- - h->mb.i_cbp_chroma * 4 + ( h->mb.i_cbp_luma == 0 ? 0 : 12 ) );
- - bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
- - }
- - else if( i_mb_type == P_L0 )
- - {
- - if( h->mb.i_partition == D_16x16 )
- +#if !RDO_SKIP_BS
- + case I_PCM:
- {
- - bs_write1( s, 1 );
- -
- - if( h->mb.pic.i_fref[0] > 1 )
- - bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
- - x264_cavlc_mb_mvd( h, 0, 0, 4 );
- + uint8_t *p_start = s->p_start;
- + bs_write_ue( s, i_mb_i_offset + 25 );
- + i_mb_pos_tex = bs_pos( s );
- + h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
- +
- + bs_align_0( s );
- +
- + memcpy( s->p, h->mb.pic.p_fenc[0], 256 );
- + s->p += 256;
- + for( i = 0; i < 8; i++ )
- + memcpy( s->p + i*8, h->mb.pic.p_fenc[1] + i*FENC_STRIDE, 8 );
- + s->p += 64;
- + for( i = 0; i < 8; i++ )
- + memcpy( s->p + i*8, h->mb.pic.p_fenc[2] + i*FENC_STRIDE, 8 );
- + s->p += 64;
- +
- + bs_init( s, s->p, s->p_end - s->p );
- + s->p_start = p_start;
- +
- + /* if PCM is chosen, we need to store reconstructed frame data */
- + h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE, 16 );
- + h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, 8 );
- + h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, 8 );
- +
- + h->stat.frame.i_tex_bits += bs_pos(s) - i_mb_pos_tex;
- + return;
- }
- - else if( h->mb.i_partition == D_16x8 )
- +#endif
- + case P_L0:
- {
- - bs_write_ue( s, 1 );
- - if( h->mb.pic.i_fref[0] > 1 )
- + if( h->mb.i_partition == D_16x16 )
- {
- - bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
- - bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] );
- + bs_write1( s, 1 );
- +
- + if( h->mb.pic.i_fref[0] > 1 )
- + bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
- + x264_cavlc_mb_mvd( h, 0, 0, 4 );
- }
- - x264_cavlc_mb_mvd( h, 0, 0, 4 );
- - x264_cavlc_mb_mvd( h, 0, 8, 4 );
- + else if( h->mb.i_partition == D_16x8 )
- + {
- + bs_write_ue( s, 1 );
- + if( h->mb.pic.i_fref[0] > 1 )
- + {
- + bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
- + bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] );
- + }
- + x264_cavlc_mb_mvd( h, 0, 0, 4 );
- + x264_cavlc_mb_mvd( h, 0, 8, 4 );
- + }
- + else if( h->mb.i_partition == D_8x16 )
- + {
- + bs_write_ue( s, 2 );
- + if( h->mb.pic.i_fref[0] > 1 )
- + {
- + bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
- + bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[4]] );
- + }
- + x264_cavlc_mb_mvd( h, 0, 0, 2 );
- + x264_cavlc_mb_mvd( h, 0, 4, 2 );
- + }
- + break;
- }
- - else if( h->mb.i_partition == D_8x16 )
- + case P_8x8:
- {
- - bs_write_ue( s, 2 );
- - if( h->mb.pic.i_fref[0] > 1 )
- + int b_sub_ref;
- + if( (h->mb.cache.ref[0][x264_scan8[0]] | h->mb.cache.ref[0][x264_scan8[ 4]] |
- + h->mb.cache.ref[0][x264_scan8[8]] | h->mb.cache.ref[0][x264_scan8[12]]) == 0 )
- + {
- + bs_write_ue( s, 4 );
- + b_sub_ref = 0;
- + }
- + else
- + {
- + bs_write_ue( s, 3 );
- + b_sub_ref = 1;
- + }
- +
- + /* sub mb type */
- + if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
- + for( i = 0; i < 4; i++ )
- + bs_write_ue( s, sub_mb_type_p_to_golomb[ h->mb.i_sub_partition[i] ] );
- + else
- + bs_write( s, 4, 0xf );
- +
- + /* ref0 */
- + if( b_sub_ref )
- {
- bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
- bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[4]] );
- + bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] );
- + bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[12]] );
- }
- - x264_cavlc_mb_mvd( h, 0, 0, 2 );
- - x264_cavlc_mb_mvd( h, 0, 4, 2 );
- - }
- - }
- - else if( i_mb_type == P_8x8 )
- - {
- - int b_sub_ref;
- - if( (h->mb.cache.ref[0][x264_scan8[0]] | h->mb.cache.ref[0][x264_scan8[ 4]] |
- - h->mb.cache.ref[0][x264_scan8[8]] | h->mb.cache.ref[0][x264_scan8[12]]) == 0 )
- - {
- - bs_write_ue( s, 4 );
- - b_sub_ref = 0;
- - }
- - else
- - {
- - bs_write_ue( s, 3 );
- - b_sub_ref = 1;
- - }
- - /* sub mb type */
- - if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
- for( i = 0; i < 4; i++ )
- - bs_write_ue( s, sub_mb_type_p_to_golomb[ h->mb.i_sub_partition[i] ] );
- - else
- - bs_write( s, 4, 0xf );
- -
- - /* ref0 */
- - if( b_sub_ref )
- - {
- - bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
- - bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[4]] );
- - bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] );
- - bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[12]] );
- + x264_cavlc_mb8x8_mvd( h, i );
- + break;
- }
- + case B_8x8:
- + {
- + bs_write_ue( s, 22 );
- - for( i = 0; i < 4; i++ )
- - x264_cavlc_mb8x8_mvd( h, i );
- - }
- - else if( i_mb_type == B_8x8 )
- - {
- - bs_write_ue( s, 22 );
- -
- - /* sub mb type */
- - for( i = 0; i < 4; i++ )
- - bs_write_ue( s, sub_mb_type_b_to_golomb[ h->mb.i_sub_partition[i] ] );
- + /* sub mb type */
- + for( i = 0; i < 4; i++ )
- + bs_write_ue( s, sub_mb_type_b_to_golomb[ h->mb.i_sub_partition[i] ] );
- - /* ref */
- - if( h->mb.pic.i_fref[0] > 1 )
- + /* ref */
- + if( h->mb.pic.i_fref[0] > 1 )
- + for( i = 0; i < 4; i++ )
- + if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
- + bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[i*4]] );
- + if( h->mb.pic.i_fref[1] > 1 )
- + for( i = 0; i < 4; i++ )
- + if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
- + bs_write_te( s, h->mb.pic.i_fref[1] - 1, h->mb.cache.ref[1][x264_scan8[i*4]] );
- +
- + /* mvd */
- for( i = 0; i < 4; i++ )
- if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
- - bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[i*4]] );
- - if( h->mb.pic.i_fref[1] > 1 )
- + x264_cavlc_mb_mvd( h, 0, 4*i, 2 );
- for( i = 0; i < 4; i++ )
- if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
- - bs_write_te( s, h->mb.pic.i_fref[1] - 1, h->mb.cache.ref[1][x264_scan8[i*4]] );
- -
- - /* mvd */
- - for( i = 0; i < 4; i++ )
- - if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
- - x264_cavlc_mb_mvd( h, 0, 4*i, 2 );
- - for( i = 0; i < 4; i++ )
- - if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
- - x264_cavlc_mb_mvd( h, 1, 4*i, 2 );
- - }
- - else if( i_mb_type != B_DIRECT )
- - {
- - /* All B mode */
- - /* Motion Vector */
- - const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type];
- - const int i_ref0_max = h->mb.pic.i_fref[0] - 1;
- - const int i_ref1_max = h->mb.pic.i_fref[1] - 1;
- -
- - bs_write_ue( s, mb_type_b_to_golomb[ h->mb.i_partition - D_16x8 ][ i_mb_type - B_L0_L0 ] );
- - if( h->mb.i_partition == D_16x16 )
- + x264_cavlc_mb_mvd( h, 1, 4*i, 2 );
- + break;
- + }
- + case B_DIRECT:
- {
- - if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[0]] );
- - if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[0]] );
- - if( b_list[0][0] ) x264_cavlc_mb_mvd( h, 0, 0, 4 );
- - if( b_list[1][0] ) x264_cavlc_mb_mvd( h, 1, 0, 4 );
- + bs_write1( s, 1 );
- + break;
- }
- - else
- + default: /* Rest of the B types */
- {
- - if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[ 0]] );
- - if( i_ref0_max && b_list[0][1] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[12]] );
- - if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[ 0]] );
- - if( i_ref1_max && b_list[1][1] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[12]] );
- - if( h->mb.i_partition == D_16x8 )
- + const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type];
- + const int i_ref0_max = h->mb.pic.i_fref[0] - 1;
- + const int i_ref1_max = h->mb.pic.i_fref[1] - 1;
- +
- + bs_write_ue( s, mb_type_b_to_golomb[ h->mb.i_partition - D_16x8 ][ i_mb_type - B_L0_L0 ] );
- + if( h->mb.i_partition == D_16x16 )
- {
- + if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[0]] );
- + if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[0]] );
- if( b_list[0][0] ) x264_cavlc_mb_mvd( h, 0, 0, 4 );
- - if( b_list[0][1] ) x264_cavlc_mb_mvd( h, 0, 8, 4 );
- if( b_list[1][0] ) x264_cavlc_mb_mvd( h, 1, 0, 4 );
- - if( b_list[1][1] ) x264_cavlc_mb_mvd( h, 1, 8, 4 );
- }
- - else //if( h->mb.i_partition == D_8x16 )
- + else
- {
- - if( b_list[0][0] ) x264_cavlc_mb_mvd( h, 0, 0, 2 );
- - if( b_list[0][1] ) x264_cavlc_mb_mvd( h, 0, 4, 2 );
- - if( b_list[1][0] ) x264_cavlc_mb_mvd( h, 1, 0, 2 );
- - if( b_list[1][1] ) x264_cavlc_mb_mvd( h, 1, 4, 2 );
- + if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[ 0]] );
- + if( i_ref0_max && b_list[0][1] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[12]] );
- + if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[ 0]] );
- + if( i_ref1_max && b_list[1][1] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[12]] );
- + if( h->mb.i_partition == D_16x8 )
- + {
- + if( b_list[0][0] ) x264_cavlc_mb_mvd( h, 0, 0, 4 );
- + if( b_list[0][1] ) x264_cavlc_mb_mvd( h, 0, 8, 4 );
- + if( b_list[1][0] ) x264_cavlc_mb_mvd( h, 1, 0, 4 );
- + if( b_list[1][1] ) x264_cavlc_mb_mvd( h, 1, 8, 4 );
- + }
- + else //if( h->mb.i_partition == D_8x16 )
- + {
- + if( b_list[0][0] ) x264_cavlc_mb_mvd( h, 0, 0, 2 );
- + if( b_list[0][1] ) x264_cavlc_mb_mvd( h, 0, 4, 2 );
- + if( b_list[1][0] ) x264_cavlc_mb_mvd( h, 1, 0, 2 );
- + if( b_list[1][1] ) x264_cavlc_mb_mvd( h, 1, 4, 2 );
- + }
- }
- + break;
- }
- }
- - else //if( i_mb_type == B_DIRECT )
- - bs_write1( s, 1 );
- #if !RDO_SKIP_BS
- i_mb_pos_tex = bs_pos( s );
- @@ -505,16 +512,16 @@ void x264_macroblock_write_cavlc( x264_t *h )
- /* Coded block patern */
- if( i_mb_type == I_4x4 || i_mb_type == I_8x8 )
- - bs_write_ue( s, intra4x4_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] );
- + bs_write_ue( s, intra4x4_cbp_to_golomb[h->mb.i_cbp_combined&0x3f] );
- else if( i_mb_type != I_16x16 )
- - bs_write_ue( s, inter_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] );
- + bs_write_ue( s, inter_cbp_to_golomb[h->mb.i_cbp_combined&0x3f] );
- /* transform size 8x8 flag */
- if( x264_mb_transform_8x8_allowed( h ) && h->mb.i_cbp_luma )
- bs_write1( s, h->mb.b_transform_8x8 );
- /* write residual */
- - if( h->mb.i_cbp_luma || h->mb.i_cbp_chroma || i_mb_type == I_16x16 )
- + if( h->mb.i_cbp_combined&0x3f || i_mb_type == I_16x16 )
- {
- x264_cavlc_mb_qp_delta( h );
- @@ -561,24 +568,24 @@ static int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel )
- int b_8x16 = h->mb.i_partition == D_8x16;
- int j;
- - if( i_mb_type == P_8x8 )
- - {
- - x264_cavlc_mb8x8_mvd( h, i8 );
- - bs_write_ue( s, sub_mb_type_p_to_golomb[ h->mb.i_sub_partition[i8] ] );
- - }
- - else if( i_mb_type == P_L0 )
- - x264_cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 );
- - else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 )
- + switch( i_mb_type )
- {
- - if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) x264_cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 );
- - if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) x264_cavlc_mb_mvd( h, 1, 4*i8, 4>>b_8x16 );
- - }
- - else //if( i_mb_type == B_8x8 )
- - {
- - if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] )
- - x264_cavlc_mb_mvd( h, 0, 4*i8, 2 );
- - if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] )
- - x264_cavlc_mb_mvd( h, 1, 4*i8, 2 );
- + case P_L0:
- + x264_cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 );
- + break;
- + case P_8x8:
- + x264_cavlc_mb8x8_mvd( h, i8 );
- + bs_write_ue( s, sub_mb_type_p_to_golomb[ h->mb.i_sub_partition[i8] ] );
- + break;
- + case B_8x8:
- + if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] )
- + x264_cavlc_mb_mvd( h, 0, 4*i8, 2 );
- + if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] )
- + x264_cavlc_mb_mvd( h, 1, 4*i8, 2 );
- + break;
- + default: /* Rest of the B types */
- + if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) x264_cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 );
- + if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) x264_cavlc_mb_mvd( h, 1, 4*i8, 4>>b_8x16 );
- }
- for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- )
- @@ -618,6 +625,8 @@ static int cavlc_intra4x4_pred_size( x264_t *h, int i4, int i_mode )
- static int x264_partition_i8x8_size_cavlc( x264_t *h, int i8, int i_mode )
- {
- h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, 4*i8, i_mode );
- + /* We can't use h->mb.i_cbp_combined here because it's only calculated at the end of
- + * x264_macroblock_encode(), which hasn't been called at this point. */
- bs_write_ue( &h->out.bs, intra4x4_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] );
- x264_macroblock_luma_write_cavlc( h, i8, i8 );
- return h->out.bs.i_bits_encoded;
- diff --git a/encoder/macroblock.c b/encoder/macroblock.c
- index fa7942d..f5f6267 100644
- --- a/encoder/macroblock.c
- +++ b/encoder/macroblock.c
- @@ -488,7 +488,7 @@ static void x264_macroblock_encode_skip( x264_t *h )
- h->mb.i_cbp_chroma = 0x00;
- memset( h->mb.cache.non_zero_count, 0, X264_SCAN8_SIZE );
- /* store cbp */
- - h->mb.cbp[h->mb.i_mb_xy] = 0;
- + h->mb.i_cbp_combined = 0;
- }
- /*****************************************************************************
- @@ -604,7 +604,6 @@ void x264_predict_lossless_16x16( x264_t *h, int i_mode )
- *****************************************************************************/
- void x264_macroblock_encode( x264_t *h )
- {
- - int i_cbp_dc = 0;
- int i_qp = h->mb.i_qp;
- int b_decimate = h->mb.b_dct_decimate;
- int b_force_no_skip = 0;
- @@ -880,34 +879,28 @@ void x264_macroblock_encode( x264_t *h )
- /* encode the 8x8 blocks */
- x264_mb_encode_8x8_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp );
- - if( h->param.b_cabac )
- - {
- - i_cbp_dc = h->mb.cache.non_zero_count[x264_scan8[24]]
- + int i_cbp_dc = h->mb.cache.non_zero_count[x264_scan8[24]]
- | h->mb.cache.non_zero_count[x264_scan8[25]] << 1
- | h->mb.cache.non_zero_count[x264_scan8[26]] << 2;
- - }
- /* store cbp */
- - h->mb.cbp[h->mb.i_mb_xy] = (i_cbp_dc << 8) | (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma;
- + h->mb.i_cbp_combined = (i_cbp_dc << 8) | (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma;
- /* Check for P_SKIP
- * XXX: in the me perhaps we should take x264_mb_predict_mv_pskip into account
- * (if multiple mv give same result)*/
- if( !b_force_no_skip )
- {
- - if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
- - !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) &&
- - M32( h->mb.cache.mv[0][x264_scan8[0]] ) == M32( h->mb.cache.pskip_mv )
- + if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 && !h->mb.i_cbp_combined
- + && M32( h->mb.cache.mv[0][x264_scan8[0]] ) == M32( h->mb.cache.pskip_mv )
- && h->mb.cache.ref[0][x264_scan8[0]] == 0 )
- {
- h->mb.i_type = P_SKIP;
- }
- /* Check for B_SKIP */
- - if( h->mb.i_type == B_DIRECT && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) )
- - {
- + if( h->mb.i_type == B_DIRECT && !h->mb.i_cbp_combined )
- h->mb.i_type = B_SKIP;
- - }
- }
- }
- --
- 1.6.1.2
- From 2e760d47c213cdfe77c652b9d03518043e831615 Mon Sep 17 00:00:00 2001
- From: Yusuke Nakamura <muken.the.vfrmaniac@gmail.com>
- Date: Mon, 8 Feb 2010 01:48:38 -0800
- Subject: [PATCH 10/26] Write PASP atom in mp4 muxing
- Adds container-level aspect ratio support for mp4.
- ---
- output/mp4.c | 3 ++-
- 1 files changed, 2 insertions(+), 1 deletions(-)
- diff --git a/output/mp4.c b/output/mp4.c
- index e3ad9c6..b817c82 100644
- --- a/output/mp4.c
- +++ b/output/mp4.c
- @@ -121,7 +121,7 @@ static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest
- if( mdhd_duration != total_duration )
- {
- uint64_t last_dts = gf_isom_get_sample_dts( p_mp4->p_file, p_mp4->i_track, p_mp4->i_numframe );
- - uint32_t last_duration = (uint32_t)( mdhd_duration > last_dts ? mdhd_duration - last_dts : (largest_pts - second_largest_pts) * p_mp4->i_time_inc );
- + uint32_t last_duration = (uint32_t)( mdhd_duration > last_dts ? mdhd_duration - last_dts : (largest_pts - second_largest_pts) * p_mp4->i_time_inc );
- gf_isom_set_last_sample_duration( p_mp4->p_file, p_mp4->i_track, last_duration );
- total_duration = gf_isom_get_media_duration( p_mp4->p_file, p_mp4->i_track );
- }
- @@ -212,6 +212,7 @@ static int set_param( hnd_t handle, x264_param_t *p_param )
- dw *= sar ;
- else
- dh /= sar;
- + gf_isom_set_pixel_aspect_ratio( p_mp4->p_file, p_mp4->i_track, p_mp4->i_descidx, p_param->vui.i_sar_width, p_param->vui.i_sar_height );
- gf_isom_set_track_layout_info( p_mp4->p_file, p_mp4->i_track, dw, dh, 0, 0, 0 );
- }
- --
- 1.6.1.2
- From 0c1c12eaab8ac05af4962c0e4ebdd24407cf9a13 Mon Sep 17 00:00:00 2001
- From: Henrik Gramner <hengar-6@student.ltu.se>
- Date: Mon, 8 Feb 2010 15:53:52 -0800
- Subject: [PATCH 11/26] Faster 2x2 chroma DC dequant
- ---
- doc/standards.txt | 1 +
- encoder/macroblock.c | 24 +++++++++---------------
- 2 files changed, 10 insertions(+), 15 deletions(-)
- diff --git a/doc/standards.txt b/doc/standards.txt
- index db9a691..7474d8f 100644
- --- a/doc/standards.txt
- +++ b/doc/standards.txt
- @@ -4,6 +4,7 @@ checkasm is written in gcc, with no attempt at compatibility with anything else.
- We make the following additional assumptions which are true of real systems but not guaranteed by C99:
- * Two's complement.
- * Signed right-shifts are sign-extended.
- +* int is 32-bit or larger.
- x86-specific assumptions:
- * The stack is 16-byte aligned. We align it on entry to libx264 and on entry to any thread, but the compiler must preserve alignment after that.
- diff --git a/encoder/macroblock.c b/encoder/macroblock.c
- index f5f6267..3d859de 100644
- --- a/encoder/macroblock.c
- +++ b/encoder/macroblock.c
- @@ -42,30 +42,24 @@ static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[4] )
- int d1 = dct[2] + dct[3]; \
- int d2 = dct[0] - dct[1]; \
- int d3 = dct[2] - dct[3]; \
- - int dmf = dequant_mf[i_qp%6][0]; \
- - int qbits = i_qp/6 - 5; \
- - if( qbits > 0 ) \
- - { \
- - dmf <<= qbits; \
- - qbits = 0; \
- - }
- + int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
- static inline void idct_dequant_2x2_dc( int16_t dct[4], int16_t dct4x4[4][16], int dequant_mf[6][16], int i_qp )
- {
- IDCT_DEQUANT_START
- - dct4x4[0][0] = (d0 + d1) * dmf >> -qbits;
- - dct4x4[1][0] = (d0 - d1) * dmf >> -qbits;
- - dct4x4[2][0] = (d2 + d3) * dmf >> -qbits;
- - dct4x4[3][0] = (d2 - d3) * dmf >> -qbits;
- + dct4x4[0][0] = (d0 + d1) * dmf >> 5;
- + dct4x4[1][0] = (d0 - d1) * dmf >> 5;
- + dct4x4[2][0] = (d2 + d3) * dmf >> 5;
- + dct4x4[3][0] = (d2 - d3) * dmf >> 5;
- }
- static inline void idct_dequant_2x2_dconly( int16_t out[4], int16_t dct[4], int dequant_mf[6][16], int i_qp )
- {
- IDCT_DEQUANT_START
- - out[0] = (d0 + d1) * dmf >> -qbits;
- - out[1] = (d0 - d1) * dmf >> -qbits;
- - out[2] = (d2 + d3) * dmf >> -qbits;
- - out[3] = (d2 - d3) * dmf >> -qbits;
- + out[0] = (d0 + d1) * dmf >> 5;
- + out[1] = (d0 - d1) * dmf >> 5;
- + out[2] = (d2 + d3) * dmf >> 5;
- + out[3] = (d2 - d3) * dmf >> 5;
- }
- static inline void dct2x2dc( int16_t d[4], int16_t dct4x4[4][16] )
- --
- 1.6.1.2
- From d944b740aaa9e07434ff6b022b86460dc27d4b63 Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Tue, 9 Feb 2010 15:08:31 -0800
- Subject: [PATCH 12/26] Print psy-(rd|trellis) with more precision in userdata SEI
- ---
- common/common.c | 2 +-
- 1 files changed, 1 insertions(+), 1 deletions(-)
- diff --git a/common/common.c b/common/common.c
- index 6d1d7f0..aaccdf2 100644
- --- a/common/common.c
- +++ b/common/common.c
- @@ -886,7 +886,7 @@ char *x264_param2string( x264_param_t *p, int b_res )
- s += sprintf( s, " subme=%d", p->analyse.i_subpel_refine );
- s += sprintf( s, " psy=%d", p->analyse.b_psy );
- if( p->analyse.b_psy )
- - s += sprintf( s, " psy_rd=%.1f:%.1f", p->analyse.f_psy_rd, p->analyse.f_psy_trellis );
- + s += sprintf( s, " psy_rd=%.2f:%.2f", p->analyse.f_psy_rd, p->analyse.f_psy_trellis );
- s += sprintf( s, " mixed_ref=%d", p->analyse.b_mixed_references );
- s += sprintf( s, " me_range=%d", p->analyse.i_me_range );
- s += sprintf( s, " chroma_me=%d", p->analyse.b_chroma_me );
- --
- 1.6.1.2
- From eb0d5bd9a8f5bbd0da6fbc7baf214f78de8b26d7 Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Wed, 10 Feb 2010 12:12:29 -0800
- Subject: [PATCH 13/26] Overhaul sliced-threads VBV
- Make predictors thread-local and allow each thread to poll the others to get their predicted sizes.
- Many, many other tweaks to improve quality with small VBV and sliced threads.
- Note that this may somewhat increase the risk of a VBV underflow in extreme situations such as single-frame VBV.
- This is tolerable, as most relevant use-cases are better off with a few rare underflows (even if they have to drop a slice) than consistent low quality.
- ---
- encoder/encoder.c | 4 +-
- encoder/ratecontrol.c | 163 ++++++++++++++++++++++++++++--------------------
- encoder/slicetype.c | 4 +-
- 3 files changed, 99 insertions(+), 72 deletions(-)
- diff --git a/encoder/encoder.c b/encoder/encoder.c
- index 008d0f2..0ca6694 100644
- --- a/encoder/encoder.c
- +++ b/encoder/encoder.c
- @@ -2062,6 +2062,8 @@ static int x264_threaded_slices_write( x264_t *h )
- for( i = 0; i <= h->sps->i_mb_height; i++ )
- x264_fdec_filter_row( h, i );
- + x264_threads_merge_ratecontrol( h );
- +
- for( i = 1; i < h->param.i_threads; i++ )
- {
- x264_t *t = h->thread[i];
- @@ -2077,8 +2079,6 @@ static int x264_threaded_slices_write( x264_t *h )
- ((int*)&h->stat.frame)[j] += ((int*)&t->stat.frame)[j];
- }
- - x264_threads_merge_ratecontrol( h );
- -
- return 0;
- }
- diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
- index e314ba2..0c946ba 100644
- --- a/encoder/ratecontrol.c
- +++ b/encoder/ratecontrol.c
- @@ -134,9 +134,11 @@ struct x264_ratecontrol_t
- * This value is the current position (0 or 1). */
- /* MBRC stuff */
- - double frame_size_estimated;
- + float frame_size_estimated; /* Access to this variable must be atomic: double is
- + * not atomic on all arches we care about */
- double frame_size_planned;
- double slice_size_planned;
- + double max_frame_error;
- predictor_t (*row_pred)[2];
- predictor_t row_preds[5][2];
- predictor_t *pred_b_from_p; /* predict B-frame size from P-frame satd */
- @@ -505,17 +507,21 @@ int x264_ratecontrol_new( x264_t *h )
- rc->lstep = pow( 2, h->param.rc.i_qp_step / 6.0 );
- rc->last_qscale = qp2qscale(26);
- - CHECKED_MALLOC( rc->pred, 5*sizeof(predictor_t) );
- + int num_preds = h->param.b_sliced_threads * h->param.i_threads + 1;
- + CHECKED_MALLOC( rc->pred, 5 * sizeof(predictor_t) * num_preds );
- CHECKED_MALLOC( rc->pred_b_from_p, sizeof(predictor_t) );
- for( i = 0; i < 5; i++ )
- {
- rc->last_qscale_for[i] = qp2qscale( ABR_INIT_QP );
- rc->lmin[i] = qp2qscale( h->param.rc.i_qp_min );
- rc->lmax[i] = qp2qscale( h->param.rc.i_qp_max );
- - rc->pred[i].coeff= 2.0;
- - rc->pred[i].count= 1.0;
- - rc->pred[i].decay= 0.5;
- - rc->pred[i].offset= 0.0;
- + for( j = 0; j < num_preds; j++ )
- + {
- + rc->pred[i+j*5].coeff= 2.0;
- + rc->pred[i+j*5].count= 1.0;
- + rc->pred[i+j*5].decay= 0.5;
- + rc->pred[i+j*5].offset= 0.0;
- + }
- for( j = 0; j < 2; j++ )
- {
- rc->row_preds[i][j].coeff= .25;
- @@ -986,22 +992,6 @@ void x264_ratecontrol_delete( x264_t *h )
- x264_free( rc );
- }
- -void x264_ratecontrol_set_estimated_size( x264_t *h, int bits )
- -{
- - x264_pthread_mutex_lock( &h->fenc->mutex );
- - h->rc->frame_size_estimated = bits;
- - x264_pthread_mutex_unlock( &h->fenc->mutex );
- -}
- -
- -int x264_ratecontrol_get_estimated_size( x264_t const *h)
- -{
- - int size;
- - x264_pthread_mutex_lock( &h->fenc->mutex );
- - size = h->rc->frame_size_estimated;
- - x264_pthread_mutex_unlock( &h->fenc->mutex );
- - return size;
- -}
- -
- static void accum_p_qp_update( x264_t *h, float qp )
- {
- x264_ratecontrol_t *rc = h->rc;
- @@ -1173,6 +1163,7 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
- /* tweak quality based on difference from predicted size */
- if( y < h->i_threadslice_end-1 )
- {
- + int i;
- int prev_row_qp = h->fdec->i_row_qp[y];
- int i_qp_max = X264_MIN( prev_row_qp + h->param.rc.i_qp_step, h->param.rc.i_qp_max );
- int i_qp_min = X264_MAX( prev_row_qp - h->param.rc.i_qp_step, h->param.rc.i_qp_min );
- @@ -1186,19 +1177,23 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
- float buffer_left_planned = rc->buffer_fill - rc->frame_size_planned;
- float slice_size_planned = h->param.b_sliced_threads ? rc->slice_size_planned : rc->frame_size_planned;
- - float size_of_other_slices = rc->frame_size_planned - slice_size_planned;
- + float size_of_other_slices = 0;
- + if( h->param.b_sliced_threads )
- + {
- + for( i = 0; i < h->param.i_threads; i++ )
- + if( h != h->thread[i] )
- + size_of_other_slices += h->thread[i]->rc->frame_size_estimated;
- + }
- + else
- + rc->max_frame_error = X264_MAX( 0.05, 1.0 / (h->sps->i_mb_width) );
- +
- /* More threads means we have to be more cautious in letting ratecontrol use up extra bits. */
- float rc_tol = buffer_left_planned / h->param.i_threads * rc->rate_tolerance;
- - float max_frame_error = X264_MAX( 0.05, 1.0 / h->sps->i_mb_height );
- - int b1 = predict_row_size_sum( h, y, rc->qpm );
- -
- - /* Assume that if this slice has become larger than expected,
- - * the other slices will have gotten equally larger. */
- - b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
- + int b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
- /* Don't modify the row QPs until a sufficent amount of the bits of the frame have been processed, in case a flat */
- /* area at the top of the frame was measured inaccurately. */
- - if( row_bits_so_far(h,y) < 0.05 * (rc->frame_size_planned-size_of_other_slices) )
- + if( row_bits_so_far( h, y ) < 0.05 * slice_size_planned )
- return;
- if( h->sh.i_type != SLICE_TYPE_I )
- @@ -1213,8 +1208,7 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
- (b1 > rc->frame_size_planned && rc->qpm < rc->qp_novbv)) )
- {
- rc->qpm ++;
- - b1 = predict_row_size_sum( h, y, rc->qpm );
- - b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
- + b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
- }
- while( rc->qpm > i_qp_min
- @@ -1223,20 +1217,18 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
- || b1 < (rc->buffer_fill - rc->buffer_size + rc->buffer_rate) * 1.1) )
- {
- rc->qpm --;
- - b1 = predict_row_size_sum( h, y, rc->qpm );
- - b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
- + b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
- }
- /* avoid VBV underflow */
- while( (rc->qpm < h->param.rc.i_qp_max)
- - && (rc->buffer_fill - b1 < rc->buffer_rate * max_frame_error) )
- + && (rc->buffer_fill - b1 < rc->buffer_rate * rc->max_frame_error) )
- {
- rc->qpm ++;
- - b1 = predict_row_size_sum( h, y, rc->qpm );
- - b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
- + b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
- }
- - x264_ratecontrol_set_estimated_size(h, b1);
- + h->rc->frame_size_estimated = predict_row_size_sum( h, y, rc->qpm );
- }
- /* loses the fractional part of the frame-wise qp */
- @@ -1584,7 +1576,7 @@ static void update_vbv_plan( x264_t *h, int overhead )
- double bits = t->rc->frame_size_planned;
- if( !t->b_thread_active )
- continue;
- - bits = X264_MAX(bits, x264_ratecontrol_get_estimated_size(t));
- + bits = X264_MAX(bits, t->rc->frame_size_estimated);
- rcc->buffer_fill -= bits;
- rcc->buffer_fill = X264_MAX( rcc->buffer_fill, 0 );
- rcc->buffer_fill += rcc->buffer_rate;
- @@ -1783,7 +1775,7 @@ static float rate_estimate_qscale( x264_t *h )
- rcc->frame_size_planned = qscale2bits( &rce, q );
- else
- rcc->frame_size_planned = predict_size( rcc->pred_b_from_p, q, h->fref1[h->i_ref1-1]->i_satd );
- - x264_ratecontrol_set_estimated_size(h, rcc->frame_size_planned);
- + h->rc->frame_size_estimated = rcc->frame_size_planned;
- /* For row SATDs */
- if( rcc->b_vbv )
- @@ -1812,7 +1804,7 @@ static float rate_estimate_qscale( x264_t *h )
- double bits = t->rc->frame_size_planned;
- if( !t->b_thread_active )
- continue;
- - bits = X264_MAX(bits, x264_ratecontrol_get_estimated_size(t));
- + bits = X264_MAX(bits, t->rc->frame_size_estimated);
- predicted_bits += (int64_t)bits;
- }
- }
- @@ -1953,61 +1945,96 @@ static float rate_estimate_qscale( x264_t *h )
- /* Always use up the whole VBV in this case. */
- if( rcc->single_frame_vbv )
- rcc->frame_size_planned = rcc->buffer_rate;
- - x264_ratecontrol_set_estimated_size(h, rcc->frame_size_planned);
- + h->rc->frame_size_estimated = rcc->frame_size_planned;
- return q;
- }
- }
- +void x264_threads_normalize_predictors( x264_t *h )
- +{
- + int i;
- + double totalsize = 0;
- + for( i = 0; i < h->param.i_threads; i++ )
- + totalsize += h->thread[i]->rc->slice_size_planned;
- + double factor = h->rc->frame_size_planned / totalsize;
- + for( i = 0; i < h->param.i_threads; i++ )
- + h->thread[i]->rc->slice_size_planned *= factor;
- +}
- +
- void x264_threads_distribute_ratecontrol( x264_t *h )
- {
- - int i, row, totalsize = 0;
- - if( h->rc->b_vbv )
- - for( row = 0; row < h->sps->i_mb_height; row++ )
- - totalsize += h->fdec->i_row_satd[row];
- + int i, row;
- + x264_ratecontrol_t *rc = h->rc;
- +
- + /* Initialize row predictors */
- + if( h->i_frame == 0 )
- + for( i = 0; i < h->param.i_threads; i++ )
- + {
- + x264_ratecontrol_t *t = h->thread[i]->rc;
- + memcpy( t->row_preds, rc->row_preds, sizeof(rc->row_preds) );
- + }
- +
- for( i = 0; i < h->param.i_threads; i++ )
- {
- x264_t *t = h->thread[i];
- - x264_ratecontrol_t *rc = h->rc;
- - memcpy( t->rc, rc, sizeof(x264_ratecontrol_t) );
- + memcpy( t->rc, rc, offsetof(x264_ratecontrol_t, row_pred) );
- + t->rc->row_pred = &t->rc->row_preds[h->sh.i_type];
- /* Calculate the planned slice size. */
- - if( h->rc->b_vbv && rc->frame_size_planned )
- + if( rc->b_vbv && rc->frame_size_planned )
- {
- int size = 0;
- for( row = t->i_threadslice_start; row < t->i_threadslice_end; row++ )
- size += h->fdec->i_row_satd[row];
- - t->rc->slice_size_planned = size * rc->frame_size_planned / totalsize;
- + t->rc->slice_size_planned = predict_size( &rc->pred[h->sh.i_type + (i+1)*5], rc->qpm, size );
- }
- else
- t->rc->slice_size_planned = 0;
- }
- + if( rc->b_vbv && rc->frame_size_planned )
- + {
- + x264_threads_normalize_predictors( h );
- +
- + if( rc->single_frame_vbv )
- + {
- + /* Compensate for our max frame error threshold: give more bits (proportionally) to smaller slices. */
- + for( i = 0; i < h->param.i_threads; i++ )
- + {
- + x264_t *t = h->thread[i];
- + t->rc->max_frame_error = X264_MAX( 0.05, 1.0 / (t->i_threadslice_end - t->i_threadslice_start) );
- + t->rc->slice_size_planned += 2 * t->rc->max_frame_error * rc->frame_size_planned;
- + }
- + x264_threads_normalize_predictors( h );
- + }
- +
- + for( i = 0; i < h->param.i_threads; i++ )
- + h->thread[i]->rc->frame_size_estimated = h->thread[i]->rc->slice_size_planned;
- + }
- }
- void x264_threads_merge_ratecontrol( x264_t *h )
- {
- - int i, j, k;
- + int i, row;
- x264_ratecontrol_t *rc = h->rc;
- x264_emms();
- - for( i = 1; i < h->param.i_threads; i++ )
- + for( i = 0; i < h->param.i_threads; i++ )
- {
- - x264_ratecontrol_t *t = h->thread[i]->rc;
- - rc->qpa_rc += t->qpa_rc;
- - rc->qpa_aq += t->qpa_aq;
- - for( j = 0; j < 5; j++ )
- - for( k = 0; k < 2; k++ )
- - {
- - rc->row_preds[j][k].coeff += t->row_preds[j][k].coeff;
- - rc->row_preds[j][k].offset += t->row_preds[j][k].offset;
- - rc->row_preds[j][k].count += t->row_preds[j][k].count;
- - }
- - }
- - for( j = 0; j < 5; j++ )
- - for( k = 0; k < 2; k++ )
- + x264_t *t = h->thread[i];
- + x264_ratecontrol_t *rct = h->thread[i]->rc;
- + if( h->param.rc.i_vbv_buffer_size )
- {
- - rc->row_preds[j][k].coeff /= h->param.i_threads;
- - rc->row_preds[j][k].offset /= h->param.i_threads;
- - rc->row_preds[j][k].count /= h->param.i_threads;
- + int size = 0;
- + for( row = t->i_threadslice_start; row < t->i_threadslice_end; row++ )
- + size += h->fdec->i_row_satd[row];
- + int bits = t->stat.frame.i_mv_bits + t->stat.frame.i_tex_bits + t->stat.frame.i_misc_bits;
- + int mb_count = (t->i_threadslice_end - t->i_threadslice_start) * h->sps->i_mb_width;
- + update_predictor( &rc->pred[h->sh.i_type+5*i], qp2qscale(rct->qpa_rc/mb_count), size, bits );
- }
- + if( !i )
- + continue;
- + rc->qpa_rc += rct->qpa_rc;
- + rc->qpa_aq += rct->qpa_aq;
- + }
- }
- void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next )
- diff --git a/encoder/slicetype.c b/encoder/slicetype.c
- index 057f6a6..bb2ed64 100644
- --- a/encoder/slicetype.c
- +++ b/encoder/slicetype.c
- @@ -1394,10 +1394,10 @@ int x264_rc_analyse_slice( x264_t *h )
- int mb_xy = y * h->mb.i_mb_stride;
- for( x = h->fdec->i_pir_start_col; x <= h->fdec->i_pir_end_col; x++, mb_xy++ )
- {
- - int intra_cost = (h->fenc->i_intra_cost[mb_xy] * ip_factor) >> 8;
- + int intra_cost = (h->fenc->i_intra_cost[mb_xy] * ip_factor + 128) >> 8;
- int inter_cost = h->fenc->lowres_costs[b-p0][p1-b][mb_xy];
- int diff = intra_cost - inter_cost;
- - h->fdec->i_row_satd[y] += diff;
- + h->fdec->i_row_satd[y] += (diff * frames[b]->i_inv_qscale_factor[mb_xy] + 128) >> 8;
- cost += diff;
- }
- }
- --
- 1.6.1.2
- From 55cd605a06a1f09925d2707351774f34263ebe3f Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Wed, 10 Feb 2010 13:44:28 -0800
- Subject: [PATCH 14/26] Allow longer keyints with intra refresh
- If a long keyint is specified (longer than macroblock width-1), the refresh will simply not occur all the time.
- In other words, a refresh will take place, and then x264 will wait until keyint is over to start another refresh.
- ---
- encoder/encoder.c | 15 +++++++--------
- 1 files changed, 7 insertions(+), 8 deletions(-)
- diff --git a/encoder/encoder.c b/encoder/encoder.c
- index 0ca6694..d43a758 100644
- --- a/encoder/encoder.c
- +++ b/encoder/encoder.c
- @@ -599,8 +599,6 @@ static int x264_validate_parameters( x264_t *h )
- x264_log( h, X264_LOG_WARNING, "ref > 1 + intra-refresh is not supported\n" );
- h->param.i_frame_reference = 1;
- }
- - if( h->param.b_intra_refresh )
- - h->param.i_keyint_max = X264_MIN( h->param.i_keyint_max, (h->param.i_width+15)/16 - 1 );
- h->param.i_keyint_min = x264_clip3( h->param.i_keyint_min, 1, h->param.i_keyint_max/2+1 );
- h->param.rc.i_lookahead = x264_clip3( h->param.rc.i_lookahead, 0, X264_LOOKAHEAD_MAX );
- {
- @@ -2307,22 +2305,22 @@ int x264_encoder_encode( x264_t *h,
- if( h->param.b_intra_refresh && h->fenc->i_type == X264_TYPE_P )
- {
- int pocdiff = (h->fdec->i_poc - h->fref0[0]->i_poc)/2;
- - float increment = ((float)h->sps->i_mb_width-1) / h->param.i_keyint_max;
- + float increment = X264_MAX( ((float)h->sps->i_mb_width-1) / h->param.i_keyint_max, 1 );
- + int max_position = (int)(increment * h->param.i_keyint_max);
- if( IS_X264_TYPE_I( h->fref0[0]->i_type ) )
- h->fdec->f_pir_position = 0;
- else
- {
- - if( h->fref0[0]->i_pir_end_col == h->sps->i_mb_width - 1 )
- + h->fdec->f_pir_position = h->fref0[0]->f_pir_position;
- + if( h->fdec->f_pir_position+0.5 >= max_position )
- {
- h->fdec->f_pir_position = 0;
- h->fenc->b_keyframe = 1;
- }
- - else
- - h->fdec->f_pir_position = h->fref0[0]->f_pir_position;
- }
- h->fdec->i_pir_start_col = h->fdec->f_pir_position+0.5;
- h->fdec->f_pir_position += increment * pocdiff;
- - h->fdec->i_pir_end_col = X264_MIN( h->fdec->f_pir_position+0.5, h->sps->i_mb_width-1 );
- + h->fdec->i_pir_end_col = h->fdec->f_pir_position+0.5;
- }
- /* Write SPS and PPS */
- @@ -2358,8 +2356,9 @@ int x264_encoder_encode( x264_t *h,
- if( h->fenc->i_type != X264_TYPE_IDR )
- {
- + int time_to_recovery = X264_MIN( h->sps->i_mb_width - 1, h->param.i_keyint_max ) + h->param.i_bframe;
- x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
- - x264_sei_recovery_point_write( h, &h->out.bs, h->param.i_keyint_max );
- + x264_sei_recovery_point_write( h, &h->out.bs, time_to_recovery );
- x264_nal_end( h );
- overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD;
- }
- --
- 1.6.1.2
- From 2684c8486c7365db25188a70810f663de10428fa Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Fri, 12 Feb 2010 03:33:54 -0800
- Subject: [PATCH 15/26] Implement direct temporal + interlaced
- This was much easier than I expected.
- It will also be basically useless until TFF/BFF support gets in, since it requires delta_poc_bottom to be set correctly to work well.
- ---
- common/common.h | 5 +++--
- common/macroblock.c | 8 ++++----
- encoder/encoder.c | 5 -----
- 3 files changed, 7 insertions(+), 11 deletions(-)
- diff --git a/common/common.h b/common/common.h
- index d4a8dd9..6da462f 100644
- --- a/common/common.h
- +++ b/common/common.h
- @@ -655,11 +655,12 @@ struct x264_t
- int i_chroma_lambda2_offset;
- /* B_direct and weighted prediction */
- - int16_t dist_scale_factor[16][2];
- + int16_t dist_scale_factor_buf[2][16][2];
- + int16_t (*dist_scale_factor)[2];
- int8_t bipred_weight_buf[2][32][4];
- int8_t (*bipred_weight)[4];
- /* maps fref1[0]'s ref indices into the current list0 */
- -#define map_col_to_list0(col) h->mb.map_col_to_list0[col+2]
- +#define map_col_to_list0(col) h->mb.map_col_to_list0[(col)+2]
- int8_t map_col_to_list0[18];
- int ref_blind_dupe; /* The index of the blind reference frame duplicate. */
- } mb;
- diff --git a/common/macroblock.c b/common/macroblock.c
- index d86f3af..e676b8b 100644
- --- a/common/macroblock.c
- +++ b/common/macroblock.c
- @@ -190,7 +190,8 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
- const int x8 = i8%2;
- const int y8 = i8/2;
- const int i_part_8x8 = i_mb_8x8 + x8 + y8 * h->mb.i_b8_stride;
- - const int i_ref = map_col_to_list0(h->fref1[0]->ref[0][i_part_8x8]);
- + const int i_ref1_ref = h->fref1[0]->ref[0][i_part_8x8];
- + const int i_ref = (map_col_to_list0(i_ref1_ref>>h->sh.b_mbaff) << h->sh.b_mbaff) + (i_ref1_ref&h->sh.b_mbaff);
- if( i_ref >= 0 )
- {
- @@ -1238,6 +1239,7 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
- if( h->sh.i_type == SLICE_TYPE_B )
- {
- h->mb.bipred_weight = h->mb.bipred_weight_buf[h->mb.b_interlaced&(i_mb_y&1)];
- + h->mb.dist_scale_factor = h->mb.dist_scale_factor_buf[h->mb.b_interlaced&(i_mb_y&1)];
- if( h->param.b_cabac )
- {
- uint8_t skipbp;
- @@ -1478,9 +1480,7 @@ void x264_macroblock_bipred_init( x264_t *h )
- dist_scale_factor = x264_clip3( (tb * tx + 32) >> 6, -1024, 1023 );
- }
- - // FIXME: will need this if we ever do temporal MV pred with interlaced
- - if( !h->sh.b_mbaff )
- - h->mb.dist_scale_factor[i_ref0][i_ref1] = dist_scale_factor;
- + h->mb.dist_scale_factor_buf[field][i_ref0][i_ref1] = dist_scale_factor;
- dist_scale_factor >>= 2;
- if( h->param.analyse.b_weighted_bipred
- diff --git a/encoder/encoder.c b/encoder/encoder.c
- index d43a758..9efe88a 100644
- --- a/encoder/encoder.c
- +++ b/encoder/encoder.c
- @@ -430,11 +430,6 @@ static int x264_validate_parameters( x264_t *h )
- x264_log( h, X264_LOG_WARNING, "interlace + me=esa is not implemented\n" );
- h->param.analyse.i_me_method = X264_ME_UMH;
- }
- - if( h->param.analyse.i_direct_mv_pred > X264_DIRECT_PRED_SPATIAL )
- - {
- - x264_log( h, X264_LOG_WARNING, "interlace + direct=temporal is not implemented\n" );
- - h->param.analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL;
- - }
- if( h->param.analyse.i_weighted_pred > 0 )
- {
- x264_log( h, X264_LOG_WARNING, "interlace + weightp is not implemented\n" );
- --
- 1.6.1.2
- From 436109f0c9cba043559f360cc69bae22d4b188f7 Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Fri, 12 Feb 2010 21:15:12 -0800
- Subject: [PATCH 16/26] Backport various speed tweak ideas from ffmpeg
- Add mv0 early termination to spatial direct calculation
- Up to twice as fast direct mv calculation on near-motionless video.
- Branchless CAVLC level code adjustment based on trailing ones.
- A few clocks faster.
- Check tc value before clipping in C version of deblock functions.
- Much faster, but nobody uses those anyways.
- Thanks to Michael Niedermayer for the ideas.
- ---
- common/frame.c | 6 ++++--
- common/macroblock.c | 3 +++
- encoder/cavlc.c | 7 +++----
- 3 files changed, 10 insertions(+), 6 deletions(-)
- diff --git a/common/frame.c b/common/frame.c
- index 40cc78f..d89f5ab 100644
- --- a/common/frame.c
- +++ b/common/frame.c
- @@ -472,12 +472,14 @@ static inline void deblock_luma_c( uint8_t *pix, int xstride, int ystride, int a
- int delta;
- if( abs( p2 - p0 ) < beta )
- {
- - pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] );
- + if( tc0[i] )
- + pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] );
- tc++;
- }
- if( abs( q2 - q0 ) < beta )
- {
- - pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[i], tc0[i] );
- + if( tc0[i] )
- + pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[i], tc0[i] );
- tc++;
- }
- diff --git a/common/macroblock.c b/common/macroblock.c
- index e676b8b..c9ce597 100644
- --- a/common/macroblock.c
- +++ b/common/macroblock.c
- @@ -272,6 +272,9 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
- x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, mv[0] );
- x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, mv[1] );
- + if( !M64( mv ) )
- + return 1;
- +
- if( h->param.i_threads > 1
- && ( mv[0][1] > h->mb.mv_max_spel[1]
- || mv[1][1] > h->mb.mv_max_spel[1] ) )
- diff --git a/encoder/cavlc.c b/encoder/cavlc.c
- index 45b55fe..12806ae 100644
- --- a/encoder/cavlc.c
- +++ b/encoder/cavlc.c
- @@ -147,10 +147,9 @@ static int block_residual_write_cavlc( x264_t *h, int i_ctxBlockCat, int16_t *l,
- if( i_trailing < i_total )
- {
- - int16_t val = runlevel.level[i_trailing];
- - int16_t val_original = runlevel.level[i_trailing]+LEVEL_TABLE_SIZE/2;
- - if( i_trailing < 3 )
- - val -= (val>>15)|1; /* as runlevel.level[i] can't be 1 for the first one if i_trailing < 3 */
- + int val = runlevel.level[i_trailing];
- + int val_original = runlevel.level[i_trailing]+LEVEL_TABLE_SIZE/2;
- + val -= ((val>>31)|1) & -(i_trailing < 3); /* as runlevel.level[i] can't be 1 for the first one if i_trailing < 3 */
- val += LEVEL_TABLE_SIZE/2;
- if( (unsigned)val_original < LEVEL_TABLE_SIZE )
- --
- 1.6.1.2
- From 88a2153ed4519582b61cc516ca59b2d9559e6725 Mon Sep 17 00:00:00 2001
- From: Alexander Strange <astrange@ithinksw.com>
- Date: Mon, 10 Nov 2008 00:55:20 -0500
- Subject: [PATCH 17/26] Allow | as a separator between psy-rd and psy-trellis values.
- [,:/] are all taken when setting psy-trellis in a zone in an mencoder option.
- Also fix a comment typo and remove a useless line of code.
- ---
- common/common.c | 3 ++-
- encoder/encoder.c | 4 +---
- 2 files changed, 3 insertions(+), 4 deletions(-)
- diff --git a/common/common.c b/common/common.c
- index aaccdf2..0dd7af5 100644
- --- a/common/common.c
- +++ b/common/common.c
- @@ -515,7 +515,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
- OPT("psy-rd")
- {
- if( 2 == sscanf( value, "%f:%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ) ||
- - 2 == sscanf( value, "%f,%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ) )
- + 2 == sscanf( value, "%f,%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ) ||
- + 2 == sscanf( value, "%f|%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ))
- { }
- else if( sscanf( value, "%f", &p->analyse.f_psy_rd ) )
- {
- diff --git a/encoder/encoder.c b/encoder/encoder.c
- index 9efe88a..cca9c45 100644
- --- a/encoder/encoder.c
- +++ b/encoder/encoder.c
- @@ -84,7 +84,7 @@ static void x264_slice_header_init( x264_t *h, x264_slice_header_t *sh,
- x264_param_t *param = &h->param;
- int i;
- - /* First we fill all field */
- + /* First we fill all fields */
- sh->sps = sps;
- sh->pps = pps;
- @@ -685,8 +685,6 @@ static int x264_validate_parameters( x264_t *h )
- /* Psy trellis has a similar effect. */
- if( h->mb.i_psy_trellis )
- h->param.analyse.i_chroma_qp_offset -= h->param.analyse.f_psy_trellis < 0.25 ? 1 : 2;
- - else
- - h->mb.i_psy_trellis = 0;
- h->param.analyse.i_chroma_qp_offset = x264_clip3(h->param.analyse.i_chroma_qp_offset, -12, 12);
- h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 2 );
- h->param.rc.f_aq_strength = x264_clip3f( h->param.rc.f_aq_strength, 0, 3 );
- --
- 1.6.1.2
- From ed7f1d8c708dad428a2706254016095ebf755e8b Mon Sep 17 00:00:00 2001
- From: Alexander Strange <astrange@ithinksw.com>
- Date: Sat, 13 Feb 2010 01:41:41 -0500
- Subject: [PATCH 18/26] mkv: Write SimpleBlock instead of Block for frame headers
- mkvtoolnix writes these by default since 2009/04/13.
- Slightly simplifies muxer and allows 'mkvinfo -s' to show B-frames
- as 'B' (but not B-ref frames).
- ---
- output/matroska.c | 2 +-
- output/matroska_ebml.c | 80 ++++++++----------------------------------------
- output/matroska_ebml.h | 2 +-
- 3 files changed, 15 insertions(+), 69 deletions(-)
- diff --git a/output/matroska.c b/output/matroska.c
- index 8e84f52..db7639c 100644
- --- a/output/matroska.c
- +++ b/output/matroska.c
- @@ -185,7 +185,7 @@ static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_
- p_mkv->b_writing_frame = 0;
- - if( mk_set_frame_flags( p_mkv->w, i_stamp, p_picture->b_keyframe ) < 0 )
- + if( mk_set_frame_flags( p_mkv->w, i_stamp, p_picture->b_keyframe, p_picture->i_type == X264_TYPE_B ) < 0 )
- return -1;
- return i_size;
- diff --git a/output/matroska_ebml.c b/output/matroska_ebml.c
- index d1c6e13..7265909 100644
- --- a/output/matroska_ebml.c
- +++ b/output/matroska_ebml.c
- @@ -53,9 +53,9 @@ struct mk_writer
- int64_t def_duration;
- int64_t timescale;
- int64_t cluster_tc_scaled;
- - int64_t frame_tc, prev_frame_tc_scaled, max_frame_tc;
- + int64_t frame_tc, max_frame_tc;
- - char wrote_header, in_frame, keyframe;
- + char wrote_header, in_frame, keyframe, skippable;
- };
- static mk_context *mk_create_context( mk_writer *w, mk_context *parent, unsigned id )
- @@ -258,23 +258,6 @@ static int mk_write_uint( mk_context *c, unsigned id, int64_t ui )
- return 0;
- }
- -static int mk_write_sint( mk_context *c, unsigned id, int64_t si )
- -{
- - unsigned char c_si[8] = { si >> 56, si >> 48, si >> 40, si >> 32, si >> 24, si >> 16, si >> 8, si };
- - unsigned i = 0;
- -
- - CHECK( mk_write_id( c, id ) );
- - if( si < 0 )
- - while( i < 7 && c_si[i] == 0xff && c_si[i+1] & 0x80 )
- - ++i;
- - else
- - while( i < 7 && c_si[i] == 0 && !(c_si[i+1] & 0x80 ) )
- - ++i;
- - CHECK( mk_write_size( c, 8 - i ) );
- - CHECK( mk_append_context_data( c, c_si+i, 8 - i ) );
- - return 0;
- -}
- -
- static int mk_write_float_raw( mk_context *c, float f )
- {
- union
- @@ -301,34 +284,6 @@ static int mk_write_float( mk_context *c, unsigned id, float f )
- return 0;
- }
- -static unsigned mk_ebml_size_size( unsigned s )
- -{
- - if( s < 0x7f )
- - return 1;
- - if( s < 0x3fff )
- - return 2;
- - if( s < 0x1fffff )
- - return 3;
- - if( s < 0x0fffffff )
- - return 4;
- - return 5;
- -}
- -
- -static unsigned mk_ebml_sint_size( int64_t si )
- -{
- - unsigned char c_si[8] = { si >> 56, si >> 48, si >> 40, si >> 32, si >> 24, si >> 16, si >> 8, si };
- - unsigned i = 0;
- -
- - if( si < 0 )
- - while( i < 7 && c_si[i] == 0xff && c_si[i+1] & 0x80 )
- - ++i;
- - else
- - while( i < 7 && c_si[i] == 0 && !(c_si[i+1] & 0x80) )
- - ++i;
- -
- - return 8 - i;
- -}
- -
- mk_writer *mk_create_writer( const char *filename )
- {
- mk_writer *w = malloc( sizeof(*w) );
- @@ -446,8 +401,8 @@ static int mk_close_cluster( mk_writer *w )
- static int mk_flush_frame( mk_writer *w )
- {
- - int64_t delta, ref = 0;
- - unsigned fsize, bgsize;
- + int64_t delta;
- + unsigned fsize;
- unsigned char c_delta_flags[3];
- if( !w->in_frame )
- @@ -470,33 +425,22 @@ static int mk_flush_frame( mk_writer *w )
- }
- fsize = w->frame ? w->frame->d_cur : 0;
- - bgsize = fsize + 4 + mk_ebml_size_size( fsize + 4 ) + 1;
- - if( !w->keyframe )
- - {
- - ref = w->prev_frame_tc_scaled - w->cluster_tc_scaled - delta;
- - bgsize += 1 + 1 + mk_ebml_sint_size( ref );
- - }
- - CHECK( mk_write_id( w->cluster, 0xa0 ) ); // BlockGroup
- - CHECK( mk_write_size( w->cluster, bgsize ) );
- - CHECK( mk_write_id( w->cluster, 0xa1 ) ); // Block
- + CHECK( mk_write_id( w->cluster, 0xa3 ) ); // SimpleBlock
- CHECK( mk_write_size( w->cluster, fsize + 4 ) );
- CHECK( mk_write_size( w->cluster, 1 ) ); // track number
- c_delta_flags[0] = delta >> 8;
- c_delta_flags[1] = delta;
- - c_delta_flags[2] = 0;
- + c_delta_flags[2] = (w->keyframe << 7) | w->skippable;
- CHECK( mk_append_context_data( w->cluster, c_delta_flags, 3 ) );
- if( w->frame )
- {
- CHECK( mk_append_context_data( w->cluster, w->frame->data, w->frame->d_cur ) );
- w->frame->d_cur = 0;
- }
- - if( !w->keyframe )
- - CHECK( mk_write_sint( w->cluster, 0xfb, ref ) ); // ReferenceBlock
- w->in_frame = 0;
- - w->prev_frame_tc_scaled = w->cluster_tc_scaled + delta;
- if( w->cluster->d_cur > CLSIZE )
- CHECK( mk_close_cluster( w ) );
- @@ -509,19 +453,21 @@ int mk_start_frame( mk_writer *w )
- if( mk_flush_frame( w ) < 0 )
- return -1;
- - w->in_frame = 1;
- - w->keyframe = 0;
- + w->in_frame = 1;
- + w->keyframe = 0;
- + w->skippable = 0;
- return 0;
- }
- -int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe )
- +int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe, int skippable )
- {
- if( !w->in_frame )
- return -1;
- - w->frame_tc = timestamp;
- - w->keyframe = keyframe != 0;
- + w->frame_tc = timestamp;
- + w->keyframe = keyframe != 0;
- + w->skippable = skippable != 0;
- if( w->max_frame_tc < timestamp )
- w->max_frame_tc = timestamp;
- diff --git a/output/matroska_ebml.h b/output/matroska_ebml.h
- index 252e781..56eb8cc 100644
- --- a/output/matroska_ebml.h
- +++ b/output/matroska_ebml.h
- @@ -35,7 +35,7 @@ int mk_writeHeader( mk_writer *w, const char *writing_app,
- int mk_start_frame( mk_writer *w );
- int mk_add_frame_data( mk_writer *w, const void *data, unsigned size );
- -int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe );
- +int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe, int skippable );
- int mk_close( mk_writer *w, int64_t last_delta );
- #endif
- --
- 1.6.1.2
- From 6c236ef44883e926c00f75f961a7423a8aa56036 Mon Sep 17 00:00:00 2001
- From: Alexander Strange <astrange@ithinksw.com>
- Date: Sat, 13 Feb 2010 02:00:57 -0500
- Subject: [PATCH 19/26] mkv: Write the x264 version into the file header
- This only updates the "writing application"; matroska_ebml.c is the
- "muxing application", but the version string for that is still hardcoded.
- ---
- output/matroska.c | 2 +-
- 1 files changed, 1 insertions(+), 1 deletions(-)
- diff --git a/output/matroska.c b/output/matroska.c
- index db7639c..b1805e4 100644
- --- a/output/matroska.c
- +++ b/output/matroska.c
- @@ -146,7 +146,7 @@ static int write_headers( hnd_t handle, x264_nal_t *p_nal )
- memcpy( avcC+11+sps_size, pps, pps_size );
- - ret = mk_writeHeader( p_mkv->w, "x264", "V_MPEG4/ISO/AVC",
- + ret = mk_writeHeader( p_mkv->w, "x264" X264_VERSION, "V_MPEG4/ISO/AVC",
- avcC, avcC_len, p_mkv->frame_duration, 50000,
- p_mkv->width, p_mkv->height,
- p_mkv->d_width, p_mkv->d_height );
- --
- 1.6.1.2
- From 04b8ec5fa470d1132114ffcc09494050c6c5751e Mon Sep 17 00:00:00 2001
- From: Alexander Strange <astrange@ithinksw.com>
- Date: Sat, 13 Feb 2010 02:22:04 -0500
- Subject: [PATCH 20/26] Mark cli_input/output_t variables as const when possible
- ---
- input/avs.c | 2 +-
- input/ffms.c | 2 +-
- input/input.h | 10 +++++-----
- input/lavf.c | 2 +-
- input/y4m.c | 2 +-
- input/yuv.c | 2 +-
- output/flv.c | 2 +-
- output/matroska.c | 2 +-
- output/mp4.c | 2 +-
- output/output.h | 8 ++++----
- output/raw.c | 2 +-
- 11 files changed, 18 insertions(+), 18 deletions(-)
- diff --git a/input/avs.c b/input/avs.c
- index 522f8fe..79b5c80 100644
- --- a/input/avs.c
- +++ b/input/avs.c
- @@ -313,4 +313,4 @@ static int close_file( hnd_t handle )
- return 0;
- }
- -cli_input_t avs_input = { open_file, get_frame_total, picture_alloc, read_frame, release_frame, picture_clean, close_file };
- +const cli_input_t avs_input = { open_file, get_frame_total, picture_alloc, read_frame, release_frame, picture_clean, close_file };
- diff --git a/input/ffms.c b/input/ffms.c
- index b680967..14962c7 100644
- --- a/input/ffms.c
- +++ b/input/ffms.c
- @@ -244,4 +244,4 @@ static int close_file( hnd_t handle )
- return 0;
- }
- -cli_input_t ffms_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
- +const cli_input_t ffms_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
- diff --git a/input/input.h b/input/input.h
- index 9fb425c..6e386f4 100644
- --- a/input/input.h
- +++ b/input/input.h
- @@ -60,11 +60,11 @@ typedef struct
- int (*close_file)( hnd_t handle );
- } cli_input_t;
- -extern cli_input_t yuv_input;
- -extern cli_input_t y4m_input;
- -extern cli_input_t avs_input;
- +extern const cli_input_t yuv_input;
- +extern const cli_input_t y4m_input;
- +extern const cli_input_t avs_input;
- extern cli_input_t thread_input;
- -extern cli_input_t lavf_input;
- -extern cli_input_t ffms_input;
- +extern const cli_input_t lavf_input;
- +extern const cli_input_t ffms_input;
- #endif
- diff --git a/input/lavf.c b/input/lavf.c
- index 180e509..6ecc6b0 100644
- --- a/input/lavf.c
- +++ b/input/lavf.c
- @@ -269,4 +269,4 @@ static int close_file( hnd_t handle )
- return 0;
- }
- -cli_input_t lavf_input = { open_file, get_frame_total, picture_alloc, read_frame, NULL, picture_clean, close_file };
- +const cli_input_t lavf_input = { open_file, get_frame_total, picture_alloc, read_frame, NULL, picture_clean, close_file };
- diff --git a/input/y4m.c b/input/y4m.c
- index 1619f74..8645ff7 100644
- --- a/input/y4m.c
- +++ b/input/y4m.c
- @@ -242,4 +242,4 @@ static int close_file( hnd_t handle )
- return 0;
- }
- -cli_input_t y4m_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
- +const cli_input_t y4m_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
- diff --git a/input/yuv.c b/input/yuv.c
- index dbd0317..3e39e07 100644
- --- a/input/yuv.c
- +++ b/input/yuv.c
- @@ -125,4 +125,4 @@ static int close_file( hnd_t handle )
- return 0;
- }
- -cli_input_t yuv_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
- +const cli_input_t yuv_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
- diff --git a/output/flv.c b/output/flv.c
- index b3e5d16..2e0a0e4 100644
- --- a/output/flv.c
- +++ b/output/flv.c
- @@ -305,4 +305,4 @@ static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest
- return 0;
- }
- -cli_output_t flv_output = { open_file, set_param, write_headers, write_frame, close_file };
- +const cli_output_t flv_output = { open_file, set_param, write_headers, write_frame, close_file };
- diff --git a/output/matroska.c b/output/matroska.c
- index b1805e4..fb39ced 100644
- --- a/output/matroska.c
- +++ b/output/matroska.c
- @@ -206,4 +206,4 @@ static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest
- return ret;
- }
- -cli_output_t mkv_output = { open_file, set_param, write_headers, write_frame, close_file };
- +const cli_output_t mkv_output = { open_file, set_param, write_headers, write_frame, close_file };
- diff --git a/output/mp4.c b/output/mp4.c
- index b817c82..b99eaed 100644
- --- a/output/mp4.c
- +++ b/output/mp4.c
- @@ -298,4 +298,4 @@ static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_
- return i_size;
- }
- -cli_output_t mp4_output = { open_file, set_param, write_headers, write_frame, close_file };
- +const cli_output_t mp4_output = { open_file, set_param, write_headers, write_frame, close_file };
- diff --git a/output/output.h b/output/output.h
- index 851b819..c79b48e 100644
- --- a/output/output.h
- +++ b/output/output.h
- @@ -33,9 +33,9 @@ typedef struct
- int (*close_file)( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts );
- } cli_output_t;
- -extern cli_output_t raw_output;
- -extern cli_output_t mkv_output;
- -extern cli_output_t mp4_output;
- -extern cli_output_t flv_output;
- +extern const cli_output_t raw_output;
- +extern const cli_output_t mkv_output;
- +extern const cli_output_t mp4_output;
- +extern const cli_output_t flv_output;
- #endif
- diff --git a/output/raw.c b/output/raw.c
- index a4d1175..02e4c56 100644
- --- a/output/raw.c
- +++ b/output/raw.c
- @@ -62,5 +62,5 @@ static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest
- return fclose( (FILE*)handle );
- }
- -cli_output_t raw_output = { open_file, set_param, write_headers, write_frame, close_file };
- +const cli_output_t raw_output = { open_file, set_param, write_headers, write_frame, close_file };
- --
- 1.6.1.2
- From f3dad80b901593c9d504930cd610650c8d8ff104 Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Sat, 13 Feb 2010 00:52:31 -0800
- Subject: [PATCH 21/26] Make the ABR buffer consider the distance to the end of the video
- Should improve bitrate accuracy in 2-pass mode.
- May also slightly improve quality by allowing more variation earlier-on in a file.
- Also fix abr_buffer with 1-pass: it does something very different than what it does for 2-pass.
- Thus, the earlier change that increased it based on threads caused 1-pass ABR to be somewhat less accurate.
- ---
- encoder/ratecontrol.c | 6 ++++--
- 1 files changed, 4 insertions(+), 2 deletions(-)
- diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
- index 0c946ba..8c61582 100644
- --- a/encoder/ratecontrol.c
- +++ b/encoder/ratecontrol.c
- @@ -1784,13 +1784,15 @@ static float rate_estimate_qscale( x264_t *h )
- }
- else
- {
- - double abr_buffer = 2 * rcc->rate_tolerance * rcc->bitrate * h->i_thread_frames;
- + double abr_buffer = 2 * rcc->rate_tolerance * rcc->bitrate;
- if( rcc->b_2pass )
- {
- - //FIXME adjust abr_buffer based on distance to the end of the video
- int64_t diff;
- int64_t predicted_bits = total_bits;
- + /* Adjust ABR buffer based on distance to the end of the video. */
- + if( rcc->num_entries > h->fenc->i_frame )
- + abr_buffer *= 0.5 * sqrt( rcc->num_entries - h->fenc->i_frame );
- if( rcc->b_vbv )
- {
- --
- 1.6.1.2
- From 2fd2dfe22704ce1de0cb8811484de6c2c2c7ea64 Mon Sep 17 00:00:00 2001
- From: David Conrad <lessen42@gmail.com>
- Date: Sat, 13 Feb 2010 01:25:56 -0800
- Subject: [PATCH 22/26] Use #ifdef instead of #if in checkasm
- ---
- tools/checkasm.c | 4 ++--
- 1 files changed, 2 insertions(+), 2 deletions(-)
- diff --git a/tools/checkasm.c b/tools/checkasm.c
- index 0bedc5b..595bd9e 100644
- --- a/tools/checkasm.c
- +++ b/tools/checkasm.c
- @@ -1662,13 +1662,13 @@ static int check_all_flags( void )
- cpu1 &= ~X264_CPU_CACHELINE_64;
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
- }
- -#elif ARCH_PPC
- +#elif defined(ARCH_PPC)
- if( x264_cpu_detect() & X264_CPU_ALTIVEC )
- {
- fprintf( stderr, "x264: ALTIVEC against C\n" );
- ret = check_all_funcs( 0, X264_CPU_ALTIVEC );
- }
- -#elif ARCH_ARM
- +#elif defined(ARCH_ARM)
- if( x264_cpu_detect() & X264_CPU_ARMV6 )
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV6, "ARMv6" );
- if( x264_cpu_detect() & X264_CPU_NEON )
- --
- 1.6.1.2
- From 15132ee0c913cdca90598c14f8a7532579603721 Mon Sep 17 00:00:00 2001
- From: David Conrad <lessen42@gmail.com>
- Date: Fri, 8 Jan 2010 22:40:09 -0500
- Subject: [PATCH 23/26] ARM NEON versions of weightp functions
- ---
- common/arm/mc-a.S | 305 +++++++++++++++++++++++++++++++++++++++++++++++++++++
- common/arm/mc-c.c | 47 ++++++++
- 2 files changed, 352 insertions(+), 0 deletions(-)
- diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
- index a62af39..e1db404 100644
- --- a/common/arm/mc-a.S
- +++ b/common/arm/mc-a.S
- @@ -432,6 +432,311 @@ avg2_w20_loop:
- .endfunc
- +// Shared entry sequence for the scale/offset weight functions below.
- +// Saves r4-r5/lr (three words, hence the #4*3 stack offsets), then loads the
- +// two stack arguments and the weight parameters:
- +//   ip = h (used as the row counter), q0 = scale, q1 = offset, both
- +//   replicated into every 16-bit lane.
- +// With type == full, q2 is additionally set to -denom in every lane; the
- +// functions feed it to vrshl.s16.
- +.macro weight_prologue type
- + push {r4-r5,lr}
- + ldr r4, [sp, #4*3] // weight_t
- + ldr ip, [sp, #4*3+4] // h
- +.ifc \type, full
- + ldr lr, [r4, #32] // denom
- +.endif
- + ldrd r4, [r4, #32+4] // scale, offset
- + vdup.16 q0, r4
- + vdup.16 q1, r5
- +.ifc \type, full
- + rsb lr, lr, #0 // lr = -denom
- + vdup.16 q2, lr
- +.endif
- +.endm
- +
- +// void mc_weight( uint8_t *dst, int dst_stride, uint8_t *src, int src_stride,
- +// const x264_weight_t *weight, int height )
- +// Note: the loops below load pixels through r2 (stepping by r3) and store them
- +// through r0 (stepping by r1), i.e. the first pointer/stride pair is the output.
- +//
- +// 20-wide variant. Each iteration handles two rows: widen to 16 bits, multiply
- +// by scale (q0), vrshl.s16 by q2 (the negated denom), add offset (q1), then
- +// narrow with unsigned saturation.
- +function x264_mc_weight_w20_neon
- + weight_prologue full
- + sub r1, #16 // the 16-byte stores below already advance r0 by 16
- +weight20_loop:
- + subs ip, #2 // two rows per iteration
- + vld1.8 {d17-d19}, [r2], r3 // first row (24 bytes loaded)
- + vmovl.u8 q10, d17
- + vmovl.u8 q11, d18
- + vmovl.u8 q14, d19
- + vld1.8 {d16-d18}, [r2], r3 // second row
- + vmovl.u8 q12, d16
- + vmovl.u8 q13, d17
- + vmovl.u8 q15, d18
- + vmul.s16 q10, q10, q0
- + vmul.s16 q11, q11, q0
- + vmul.s16 q12, q12, q0
- + vmul.s16 q13, q13, q0
- + vmul.s16 d28, d28, d0 // pixels 16-19 of the first row stay in the low half of q14
- + vmul.s16 d29, d30, d0 // pixels 16-19 of the second row move into the high half of q14
- + vrshl.s16 q10, q10, q2
- + vrshl.s16 q11, q11, q2
- + vrshl.s16 q12, q12, q2
- + vrshl.s16 q13, q13, q2
- + vrshl.s16 q14, q14, q2
- + vadd.s16 q10, q10, q1
- + vadd.s16 q11, q11, q1
- + vadd.s16 q12, q12, q1
- + vadd.s16 q13, q13, q1
- + vadd.s16 q14, q14, q1
- + vqmovun.s16 d16, q10
- + vqmovun.s16 d17, q11
- + vqmovun.s16 d18, q12
- + vqmovun.s16 d19, q13
- + vqmovun.s16 d20, q14
- + vst1.8 {d16-d17}, [r0,:128]! // first row, pixels 0-15
- + vst1.32 {d20[0]}, [r0,:32], r1 // first row, pixels 16-19, then step to the next row
- + vst1.8 {d18-d19}, [r0,:128]! // second row, pixels 0-15
- + vst1.32 {d20[1]}, [r0,:32], r1 // second row, pixels 16-19
- + bgt weight20_loop
- + pop {r4-r5,pc}
- +.endfunc
- +
- +// 16-wide scale/denom/offset weighting, two rows per iteration.
- +// Reads via r2 (stride r3), writes via r0 (stride r1); ip counts rows.
- +function x264_mc_weight_w16_neon
- + weight_prologue full
- +weight16_loop:
- + subs ip, #2
- + vld1.8 {d16-d17}, [r2], r3
- + vld1.8 {d18-d19}, [r2], r3
- + vmovl.u8 q10, d16
- + vmovl.u8 q11, d17
- + vmovl.u8 q12, d18
- + vmovl.u8 q13, d19
- + vmul.s16 q10, q10, q0 // * scale
- + vmul.s16 q11, q11, q0
- + vmul.s16 q12, q12, q0
- + vmul.s16 q13, q13, q0
- + vrshl.s16 q10, q10, q2 // shift by q2 (negated denom)
- + vrshl.s16 q11, q11, q2
- + vrshl.s16 q12, q12, q2
- + vrshl.s16 q13, q13, q2
- + vadd.s16 q10, q10, q1 // + offset
- + vadd.s16 q11, q11, q1
- + vadd.s16 q12, q12, q1
- + vadd.s16 q13, q13, q1
- + vqmovun.s16 d16, q10
- + vqmovun.s16 d17, q11
- + vqmovun.s16 d18, q12
- + vqmovun.s16 d19, q13
- + vst1.8 {d16-d17}, [r0,:128], r1
- + vst1.8 {d18-d19}, [r0,:128], r1
- + bgt weight16_loop
- + pop {r4-r5,pc}
- +.endfunc
- +
- +// 8-wide scale/denom/offset weighting, two rows per iteration
- +// (one row in q8, the other in q9).
- +function x264_mc_weight_w8_neon
- + weight_prologue full
- +weight8_loop:
- + subs ip, #2
- + vld1.8 {d16}, [r2], r3
- + vld1.8 {d18}, [r2], r3
- + vmovl.u8 q8, d16
- + vmovl.u8 q9, d18
- + vmul.s16 q8, q8, q0
- + vmul.s16 q9, q9, q0
- + vrshl.s16 q8, q8, q2
- + vrshl.s16 q9, q9, q2
- + vadd.s16 q8, q8, q1
- + vadd.s16 q9, q9, q1
- + vqmovun.s16 d16, q8
- + vqmovun.s16 d18, q9
- + vst1.8 {d16}, [r0,:64], r1
- + vst1.8 {d18}, [r0,:64], r1
- + bgt weight8_loop
- + pop {r4-r5,pc}
- +.endfunc
- +
- +// 4-wide scale/denom/offset weighting, two rows per iteration.
- +// Both rows are packed into q8 (first row in d16, second in d17) so the
- +// shift/add/narrow steps run once for the pair.
- +function x264_mc_weight_w4_neon
- + weight_prologue full
- +weight4_loop:
- + subs ip, #2
- + vld1.32 {d16[]}, [r2], r3
- + vld1.32 {d18[]}, [r2], r3
- + vmovl.u8 q8, d16
- + vmovl.u8 q9, d18
- + vmul.s16 d16, d16, d0
- + vmul.s16 d17, d18, d0 // second row's four pixels into the high half of q8
- + vrshl.s16 q8, q8, q2
- + vadd.s16 q8, q8, q1
- + vqmovun.s16 d16, q8
- + vst1.32 {d16[0]}, [r0,:32], r1
- + vst1.32 {d16[1]}, [r0,:32], r1
- + bgt weight4_loop
- + pop {r4-r5,pc}
- +.endfunc
- +
- +// 20-wide weighting without the denom shift: each result is
- +// offset + pixel * scale, computed by vmla onto a copy of q1 (offset),
- +// then narrowed with unsigned saturation. Two rows per iteration.
- +function x264_mc_weight_w20_nodenom_neon
- + weight_prologue nodenom
- + sub r1, #16 // the 16-byte stores below already advance r0 by 16
- +weight20_nodenom_loop:
- + subs ip, #2
- + vld1.8 {d17-d19}, [r2], r3 // first row (24 bytes loaded)
- + vmovl.u8 q10, d17
- + vmovl.u8 q11, d18
- + vmovl.u8 q14, d19
- + vld1.8 {d16-d18}, [r2], r3 // second row
- + vmovl.u8 q12, d16
- + vmovl.u8 q13, d17
- + vmovl.u8 q15, d18
- + vmov q8, q1
- + vmov q9, q1
- + vmla.s16 q8, q10, q0
- + vmla.s16 q9, q11, q0
- + vmov q10, q1
- + vmov q11, q1
- + vmla.s16 q10, q12, q0
- + vmla.s16 q11, q13, q0
- + vmov q12, q1
- + vmla.s16 d24, d28, d0 // pixels 16-19 of the first row
- + vmla.s16 d25, d30, d0 // pixels 16-19 of the second row
- + vqmovun.s16 d16, q8
- + vqmovun.s16 d17, q9
- + vqmovun.s16 d18, q10
- + vqmovun.s16 d19, q11
- + vqmovun.s16 d20, q12
- + vst1.8 {d16-d17}, [r0,:128]!
- + vst1.32 {d20[0]}, [r0,:32], r1
- + vst1.8 {d18-d19}, [r0,:128]!
- + vst1.32 {d20[1]}, [r0,:32], r1
- + bgt weight20_nodenom_loop
- + pop {r4-r5,pc}
- +.endfunc
- +
- +// 16-wide weighting without the denom shift: offset + pixel * scale via vmla
- +// onto copies of q1, two rows per iteration.
- +function x264_mc_weight_w16_nodenom_neon
- + weight_prologue nodenom
- +weight16_nodenom_loop:
- + subs ip, #2
- + vld1.8 {d16-d17}, [r2], r3
- + vld1.8 {d18-d19}, [r2], r3
- + vmovl.u8 q12, d16
- + vmovl.u8 q13, d17
- + vmovl.u8 q14, d18
- + vmovl.u8 q15, d19
- + vmov q8, q1 // accumulators start at the offset
- + vmov q9, q1
- + vmov q10, q1
- + vmov q11, q1
- + vmla.s16 q8, q12, q0
- + vmla.s16 q9, q13, q0
- + vmla.s16 q10, q14, q0
- + vmla.s16 q11, q15, q0
- + vqmovun.s16 d16, q8
- + vqmovun.s16 d17, q9
- + vqmovun.s16 d18, q10
- + vqmovun.s16 d19, q11
- + vst1.8 {d16-d17}, [r0,:128], r1
- + vst1.8 {d18-d19}, [r0,:128], r1
- + bgt weight16_nodenom_loop
- + pop {r4-r5,pc}
- +.endfunc
- +
- +// 8-wide weighting without the denom shift: offset + pixel * scale via vmla,
- +// two rows per iteration.
- +function x264_mc_weight_w8_nodenom_neon
- + weight_prologue nodenom
- +weight8_nodenom_loop:
- + subs ip, #2
- + vld1.8 {d16}, [r2], r3
- + vld1.8 {d18}, [r2], r3
- + vmovl.u8 q8, d16
- + vmovl.u8 q9, d18
- + vmov q10, q1 // accumulators start at the offset
- + vmov q11, q1
- + vmla.s16 q10, q8, q0
- + vmla.s16 q11, q9, q0
- + vqmovun.s16 d16, q10
- + vqmovun.s16 d17, q11
- + vst1.8 {d16}, [r0,:64], r1
- + vst1.8 {d17}, [r0,:64], r1
- + bgt weight8_nodenom_loop
- + pop {r4-r5,pc}
- +.endfunc
- +
- +// 4-wide weighting without the denom shift. The two rows of an iteration
- +// share one accumulator, q10 (first row in d20, second in d21).
- +function x264_mc_weight_w4_nodenom_neon
- + weight_prologue nodenom
- +weight4_nodenom_loop:
- + subs ip, #2
- + vld1.32 {d16[]}, [r2], r3
- + vld1.32 {d18[]}, [r2], r3
- + vmovl.u8 q8, d16
- + vmovl.u8 q9, d18
- + vmov q10, q1 // accumulator starts at the offset
- + vmla.s16 d20, d16, d0
- + vmla.s16 d21, d18, d0
- + vqmovun.s16 d16, q10
- + vst1.32 {d16[0]}, [r0,:32], r1
- + vst1.32 {d16[1]}, [r0,:32], r1
- + bgt weight4_nodenom_loop
- + pop {r4-r5,pc}
- +.endfunc
- +
- +// Entry sequence for the offset-only functions generated by weight_simple.
- +// Only lr is saved, so the stack arguments sit at sp+4 and sp+8.
- +// Leaves ip = h (row counter) and q1 = a byte taken from the first word of
- +// *weight, replicated to all 16 lanes.
- +// NOTE(review): presumably that word is cachea[0] as written by
- +// x264_weight_cache_neon; confirm against the x264_weight_t layout.
- +.macro weight_simple_prologue
- + push {lr}
- + ldr lr, [sp, #4] // weight_t
- + ldr ip, [sp, #8] // h
- + ldr lr, [lr] // offset
- + vdup.8 q1, lr
- +.endm
- +
- +// Generates the four width variants (w20/w16/w8/w4) of an offset-only weight
- +// function named x264_mc_weight_w<N>_<name>_neon. Each applies \op with q1
- +// (the replicated offset byte) directly to the 8-bit pixels, two rows per
- +// iteration; it is instantiated below with vqadd.u8 and vqsub.u8.
- +.macro weight_simple name op
- +function x264_mc_weight_w20_\name\()_neon
- + weight_simple_prologue
- +weight20_\name\()_loop:
- + subs ip, #2
- + vld1.8 {d16-d18}, [r2], r3
- + vld1.8 {d19-d21}, [r2], r3
- + \op q8, q8, q1
- + \op q9, q9, q1
- + \op q10, q10, q1
- + // NOTE(review): three d registers (24 bytes) are stored per row here,
- + // unlike the 16+4 byte stores of the scaled w20 functions; confirm the
- + // destination always has room for the extra 4 bytes.
- + vst1.8 {d16-d18}, [r0,:64], r1
- + vst1.8 {d19-d21}, [r0,:64], r1
- + bgt weight20_\name\()_loop
- + pop {pc}
- +.endfunc
- +
- +function x264_mc_weight_w16_\name\()_neon
- + weight_simple_prologue
- +weight16_\name\()_loop:
- + subs ip, #2
- + vld1.8 {d16-d17}, [r2], r3
- + vld1.8 {d18-d19}, [r2], r3
- + \op q8, q8, q1
- + \op q9, q9, q1
- + vst1.8 {d16-d17}, [r0,:128], r1
- + vst1.8 {d18-d19}, [r0,:128], r1
- + bgt weight16_\name\()_loop
- + pop {pc}
- +.endfunc
- +
- +function x264_mc_weight_w8_\name\()_neon
- + weight_simple_prologue
- +weight8_\name\()_loop:
- + subs ip, #2
- + vld1.8 {d16}, [r2], r3
- + vld1.8 {d17}, [r2], r3
- + \op q8, q8, q1 // both rows at once (d16 and d17)
- + vst1.8 {d16}, [r0,:64], r1
- + vst1.8 {d17}, [r0,:64], r1
- + bgt weight8_\name\()_loop
- + pop {pc}
- +.endfunc
- +
- +function x264_mc_weight_w4_\name\()_neon
- + weight_simple_prologue
- +weight4_\name\()_loop:
- + subs ip, #2
- + vld1.32 {d16[]}, [r2], r3
- + vld1.32 {d17[]}, [r2], r3
- + \op q8, q8, q1 // both rows at once (d16 and d17)
- + vst1.32 {d16[0]}, [r0,:32], r1
- + vst1.32 {d17[0]}, [r0,:32], r1
- + bgt weight4_\name\()_loop
- + pop {pc}
- +.endfunc
- +.endm
- +
- +weight_simple offsetadd, vqadd.u8
- +weight_simple offsetsub, vqsub.u8
- +
- +
- // void mc_copy( uint8_t *dst, int dst_stride, uint8_t *src, int src_stride, int height )
- function x264_mc_copy_w4_neon
- ldr ip, [sp]
- diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
- index 20cf151..0a7b734 100644
- --- a/common/arm/mc-c.c
- +++ b/common/arm/mc-c.c
- @@ -43,6 +43,48 @@ void x264_pixel_avg2_w8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
- void x264_pixel_avg2_w16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
- void x264_pixel_avg2_w20_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
- +/* Declares the four width-specific NEON weight routines of one variant
- + * (func is the name infix: empty, _nodenom, _offsetadd or _offsetsub) and
- + * builds the six-entry table x264_mc<func>_wtab_neon from them.
- + * Slots 0-1 hold the w4 routine, slot 2 w8, slots 3-4 w16 and slot 5 w20. */
- +#define MC_WEIGHT(func)\
- +void x264_mc_weight_w20##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
- +void x264_mc_weight_w16##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
- +void x264_mc_weight_w8##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
- +void x264_mc_weight_w4##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
- +\
- +static void (* const x264_mc##func##_wtab_neon[6])( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int ) =\
- +{\
- + x264_mc_weight_w4##func##_neon,\
- + x264_mc_weight_w4##func##_neon,\
- + x264_mc_weight_w8##func##_neon,\
- + x264_mc_weight_w16##func##_neon,\
- + x264_mc_weight_w16##func##_neon,\
- + x264_mc_weight_w20##func##_neon,\
- +};
- +
- +/* One instantiation per assembly variant. */
- +MC_WEIGHT()
- +MC_WEIGHT(_nodenom)
- +MC_WEIGHT(_offsetadd)
- +MC_WEIGHT(_offsetsub)
- +
- +/* Pick the NEON weight table for w.
- + * When scale == 1<<denom, only the offset matters: the saturating add or
- + * subtract table is chosen by the sign of the offset, and the offset's
- + * magnitude is stored in cachea[0]. Otherwise the nodenom table is used when
- + * denom is 0 and the full table when it is not. h is not used. */
- +static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
- +{
- +    if( w->i_scale != 1<<w->i_denom )
- +    {
- +        /* Real scaling is needed; the shift can be skipped only for denom == 0. */
- +        w->weightfn = w->i_denom ? x264_mc_wtab_neon : x264_mc_nodenom_wtab_neon;
- +        return;
- +    }
- +
- +    if( w->i_offset >= 0 )
- +    {
- +        w->cachea[0] = w->i_offset;
- +        w->weightfn = x264_mc_offsetadd_wtab_neon;
- +    }
- +    else
- +    {
- +        w->cachea[0] = -w->i_offset;
- +        w->weightfn = x264_mc_offsetsub_wtab_neon;
- +    }
- +}
- +
- void x264_mc_copy_w4_neon( uint8_t *, int, uint8_t *, int, int );
- void x264_mc_copy_w8_neon( uint8_t *, int, uint8_t *, int, int );
- void x264_mc_copy_w16_neon( uint8_t *, int, uint8_t *, int, int );
- @@ -182,6 +224,11 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
- pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_neon;
- pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_neon;
- + pf->weight = x264_mc_wtab_neon;
- + pf->offsetadd = x264_mc_offsetadd_wtab_neon;
- + pf->offsetsub = x264_mc_offsetsub_wtab_neon;
- + pf->weight_cache = x264_weight_cache_neon;
- +
- // Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
- #ifndef SYS_MACOSX
- pf->memcpy_aligned = x264_memcpy_aligned_neon;
- --
- 1.6.1.2
- From 44057dac8a3c3a1fb359035895b9126a52f75993 Mon Sep 17 00:00:00 2001
- From: David Conrad <lessen42@gmail.com>
- Date: Sun, 4 Oct 2009 07:24:42 -0400
- Subject: [PATCH 24/26] iPhone compilation support
- Also add --sysroot to configure options
- To build for iPhone 3gs / iPod touch 3g:
- CC=/Developer/Platforms/iPhoneOS.platform/Developer/usr/bin/gcc ./configure --host=arm-apple-darwin --sysroot=/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS3.0.sdk
- For older devices, add
- --extra-cflags='-arch armv6 -mcpu=arm1176jzf-s' --extra-ldflags='-arch armv6' --disable-asm
- ---
- common/arm/asm.S | 9 ++-
- common/arm/pixel-a.S | 13 ++-
- configure | 17 +++-
- extras/gas-preprocessor.pl | 256 ++++++++++++++++++++++++++++++++++++++++++++
- 4 files changed, 287 insertions(+), 8 deletions(-)
- create mode 100755 extras/gas-preprocessor.pl
- diff --git a/common/arm/asm.S b/common/arm/asm.S
- index d163165..395267f 100644
- --- a/common/arm/asm.S
- +++ b/common/arm/asm.S
- @@ -20,6 +20,12 @@
- #include "config.h"
- +#ifdef PREFIX
- +# define EXTERN_ASM _
- +#else
- +# define EXTERN_ASM
- +#endif
- +
- #ifdef __ELF__
- # define ELF
- #else
- @@ -35,7 +41,8 @@ ELF .eabi_attribute 25, \val
- .endm
- .macro function name
- - .global \name
- + .global EXTERN_ASM\name
- +EXTERN_ASM\name:
- ELF .hidden \name
- ELF .type \name, %function
- .func \name
- diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S
- index 4dd65ed..d8533e5 100644
- --- a/common/arm/pixel-a.S
- +++ b/common/arm/pixel-a.S
- @@ -110,16 +110,17 @@ SAD4_ARMV6 8
- .macro SAD_FUNC w, h, name, align:vararg
- function x264_pixel_sad\name\()_\w\()x\h\()_neon
- + SAD_START_\w \align
- +
- .if \w == 16
- - .set r, \h / 2 - 1
- +.rept \h / 2 - 1
- + SAD_\w \align
- +.endr
- .else
- - .set r, \h - 1
- -.endif
- -
- - SAD_START_\w \align
- -.rept r
- +.rept \h - 1
- SAD_\w \align
- .endr
- +.endif
- .if \w > 8
- vabal.u8 q8, d4, d6
- diff --git a/configure b/configure
- index b254383..5288351 100755
- --- a/configure
- +++ b/configure
- @@ -23,6 +23,7 @@ echo " --extra-cflags=ECFLAGS add ECFLAGS to CFLAGS"
- echo " --extra-ldflags=ELDFLAGS add ELDFLAGS to LDFLAGS"
- echo " --host=HOST build programs to run on HOST"
- echo " --cross-prefix=PREFIX use PREFIX for compilation tools"
- +echo " --sysroot=SYSROOT root of cross-build tree"
- echo ""
- exit 1
- fi
- @@ -223,6 +224,10 @@ for opt do
- --cross-prefix=*)
- cross_prefix="${opt#--cross-prefix=}"
- ;;
- + --sysroot=*)
- + CFLAGS="$CFLAGS --sysroot=${opt#--sysroot=}"
- + LDFLAGS="$LDFLAGS --sysroot=${opt#--sysroot=}"
- + ;;
- *)
- echo "Unknown option $opt, ignored"
- ;;
- @@ -367,7 +372,17 @@ case $host_cpu in
- ;;
- arm*)
- ARCH="ARM"
- - AS="${AS-${cross_prefix}gcc}"
- + if [ "$SYS" = MACOSX ] ; then
- + AS="${AS-extras/gas-preprocessor.pl $CC}"
- + ASFLAGS="$ASFLAGS -DPREFIX -DPIC" # apple's ld doesn't support movw/movt relocations at all
- + # build for armv7 by default
- + if ! echo $CFLAGS | grep -Eq '\-arch' ; then
- + CFLAGS="$CFLAGS -arch armv7"
- + LDFLAGS="$LDFLAGS -arch armv7"
- + fi
- + else
- + AS="${AS-${cross_prefix}gcc}"
- + fi
- ;;
- s390|s390x)
- ARCH="S390"
- diff --git a/extras/gas-preprocessor.pl b/extras/gas-preprocessor.pl
- new file mode 100755
- index 0000000..d60893c
- --- /dev/null
- +++ b/extras/gas-preprocessor.pl
- @@ -0,0 +1,256 @@
- +#!/usr/bin/env perl
- +# by David Conrad
- +# This code is licensed under GPLv2 or later; go to gnu.org to read it
- +# (not that it much matters for an asm preprocessor)
- +# usage: set your assembler to be something like "perl gas-preprocessor.pl gcc"
- +use strict;
- +
- +# Apple's gas is ancient and doesn't support modern preprocessing features like
- +# .rept and has ugly macro syntax, among other things. Thus, this script
- +# implements the subset of the gas preprocessor used by x264 and ffmpeg
- +# that isn't supported by Apple's gas.
- +
- +# Build two shell commands from the arguments: $preprocess_c_cmd, whose output
- +# is read by pass 1, and $gcc_cmd, which later receives the rewritten assembly
- +# on stdin ("-x assembler -" replaces the input file name).
- +# FIXME: doesn't work if the path has spaces, but oh well...
- +my $gcc_cmd = join(' ', @ARGV);
- +my $preprocess_c_cmd;
- +
- +if ($gcc_cmd =~ /\S+\.c/) {
- + # C file (inline asm?) - compile
- + $preprocess_c_cmd = "$gcc_cmd -S";
- + $gcc_cmd =~ s/\S+\.c/-x assembler -/g;
- +} elsif ($gcc_cmd =~ /\S+\.S/) {
- + # asm file, just do C preprocessor
- + $preprocess_c_cmd = "$gcc_cmd -E";
- + $gcc_cmd =~ s/\S+\.S/-x assembler -/g;
- +} else {
- + die "Unrecognized input filetype";
- +}
- +
- +# every word ending in ".o" becomes "-" (presumably the -o target, so that the
- +# first stage writes to stdout)
- +$preprocess_c_cmd =~ s/\S+\.o/-/g;
- +
- +open(ASMFILE, "-|", $preprocess_c_cmd) || die "Error running preprocessor";
- +
- +# macro currently being recorded by pass 1 ('' when outside any .macro)
- +my $current_macro = '';
- +# macro name -> array of body lines
- +my %macro_lines;
- +# macro name -> array of argument names in declaration order (":vararg" kept)
- +my %macro_args;
- +# macro name -> { argument name => default value }
- +my %macro_args_default;
- +
- +# output of pass 1 (macros expanded), consumed by pass 2
- +my @pass1_lines;
- +
- +# pass 1: parse .macro
- +# note that the handling of arguments is probably overly permissive vs. gas
- +# but it should be the same for valid cases
- +#
- +# Every input line is first rewritten for Apple's assembler. Lines between
- +# .macro and .endm are recorded in %macro_lines; all other lines go through
- +# expand_macros(), which appends the result to @pass1_lines.
- +while (<ASMFILE>) {
- +    # comment out unsupported directives
- +    s/\.type/@.type/x;
- +    s/\.func/@.func/x;
- +    s/\.endfunc/@.endfunc/x;
- +    s/\.ltorg/@.ltorg/x;
- +    s/\.size/@.size/x;
- +    s/\.fpu/@.fpu/x;
- +
- +    # the syntax for these is a little different
- +    s/\.global/.globl/x;
- +    # also catch .section .rodata since the equivalent to .const_data is .section __DATA,__const
- +    s/(.*)\.rodata/.const_data/x;
- +    s/\.int/.long/x;
- +    s/\.float/.single/x;
- +
- +    # catch unknown section names that aren't mach-o style (with a comma)
- +    # The dot is escaped so that only a real ".section" directive matches;
- +    # unescaped, it also matched e.g. ".subsection 1" and aborted the build.
- +    if (/\.section ([^,]*)$/) {
- +        die ".section $1 unsupported; figure out the mach-o section name and add it";
- +    }
- +
- +    # macros creating macros is not handled (is that valid?)
- +    if (/\.macro\s+([\d\w\.]+)\s*(.*)/) {
- +        $current_macro = $1;
- +
- +        # commas in the argument list are optional, so only use whitespace as the separator
- +        my $arglist = $2;
- +        $arglist =~ s/,/ /g;
- +
- +        my @args = split(/\s+/, $arglist);
- +        foreach my $i (0 .. $#args) {
- +            # "name=default"; the default stays undef when none is given
- +            my @argpair = split(/=/, $args[$i]);
- +            $macro_args{$current_macro}[$i] = $argpair[0];
- +            $argpair[0] =~ s/:vararg$//;
- +            $macro_args_default{$current_macro}{$argpair[0]} = $argpair[1];
- +        }
- +        # ensure %macro_lines has the macro name added as a key
- +        $macro_lines{$current_macro} = [];
- +    } elsif (/\.endm/) {
- +        if (!$current_macro) {
- +            die "ERROR: .endm without .macro";
- +        }
- +        $current_macro = '';
- +    } elsif ($current_macro) {
- +        push(@{$macro_lines{$current_macro}}, $_);
- +    } else {
- +        expand_macros($_);
- +    }
- +}
- +
- +# Append one line to @pass1_lines, expanding it first if its mnemonic is the
- +# name of a macro recorded in %macro_lines. Expansion is recursive, so macro
- +# bodies may themselves invoke macros. An optional leading "label:" is pushed
- +# ahead of the expansion.
- +sub expand_macros {
- +    # element access, not the one-element slice @_[0]
- +    my $line = $_[0];
- +    if ($line =~ /(\S+:|)\s*([\w\d\.]+)\s*(.*)/ && exists $macro_lines{$2}) {
- +        push(@pass1_lines, $1);
- +        my $macro = $2;
- +
- +        # commas are optional here too, but are syntactically important because
- +        # parameters can be blank
- +        my @arglist = split(/,/, $3);
- +        my @args;
- +        foreach my $chunk (@arglist) {
- +            my @whitespace_split = split(/\s+/, $chunk);
- +            if (!@whitespace_split) {
- +                # nothing between two commas: an explicitly blank argument
- +                push(@args, '');
- +            } else {
- +                foreach my $word (@whitespace_split) {
- +                    if (length($word)) {
- +                        push(@args, $word);
- +                    }
- +                }
- +            }
- +        }
- +
- +        # start from the declared defaults, then overlay the supplied arguments
- +        my %replacements;
- +        if ($macro_args_default{$macro}){
- +            %replacements = %{$macro_args_default{$macro}};
- +        }
- +
- +        # construct hashtable of text to replace
- +        foreach my $i (0 .. $#args) {
- +            my $argname = $macro_args{$macro}[$i];
- +
- +            if ($args[$i] =~ m/=/) {
- +                # arg=val references the argument name
- +                # XXX: I'm not sure what the expected behaviour if a lot of
- +                # these are mixed with unnamed args
- +                my @named_arg = split(/=/, $args[$i]);
- +                $replacements{$named_arg[0]} = $named_arg[1];
- +            } elsif ($i > $#{$macro_args{$macro}}) {
- +                # more args given than the macro has named args
- +                # XXX: is vararg allowed on arguments before the last?
- +                $argname = $macro_args{$macro}[-1];
- +                if ($argname =~ s/:vararg$//) {
- +                    $replacements{$argname} .= ", $args[$i]";
- +                } else {
- +                    die "Too many arguments to macro $macro";
- +                }
- +            } else {
- +                $argname =~ s/:vararg$//;
- +                $replacements{$argname} = $args[$i];
- +            }
- +        }
- +
- +        # apply replacements as regex
- +        foreach my $body_line (@{$macro_lines{$macro}}) {
- +            my $macro_line = $body_line;
- +            # do replacements by longest first, this avoids wrong replacement
- +            # when argument names are subsets of each other
- +            foreach my $argname (reverse sort {length $a <=> length $b} keys %replacements) {
- +                # \Q..\E: the argument name is matched literally
- +                $macro_line =~ s/\\\Q$argname\E/$replacements{$argname}/g;
- +            }
- +            $macro_line =~ s/\\\(\)//g; # remove \()
- +            expand_macros($macro_line);
- +        }
- +    } else {
- +        push(@pass1_lines, $line);
- +    }
- +}
- +
- +# pass 1 is finished: close the preprocessor pipe (exit with status 1 if the
- +# close fails) and reopen the handle as a pipe into the assembler command
- +close(ASMFILE) or exit 1;
- +open(ASMFILE, "|-", $gcc_cmd) or die "Error running assembler";
- +
- +# section directives seen so far, used to emulate .previous
- +my @sections;
- +# repeat count and buffered body of the .rept block being collected
- +my $num_repts;
- +my $rept_lines;
- +
- +my %literal_labels; # for ldr <reg>, =<expr>
- +my $literal_num = 0;
- +
- +# pass 2: parse .rept and .if variants
- +# NOTE: since we don't implement a proper parser, using .rept with a
- +# variable assigned from .set is not supported
- +#
- +# Each pass-1 line is rewritten in turn: textual .if tests become ".if 0/1",
- +# .previous is replaced by the section directive seen before the current one,
- +# "ldr reg, =expr" is pointed at a generated literal label, @l/@ha become
- +# lo16()/ha16(), and .rept/.endr bodies are buffered and printed N times.
- +foreach my $line (@pass1_lines) {
- +    # textual comparison .if
- +    # this assumes nothing else on the same line
- +    # The operand is compared with the empty string rather than tested for
- +    # truth, so that a literal "0" operand counts as non-blank.
- +    if ($line =~ /\.ifnb\s+(.*)/) {
- +        if ($1 ne '') {
- +            $line = ".if 1\n";
- +        } else {
- +            $line = ".if 0\n";
- +        }
- +    } elsif ($line =~ /\.ifb\s+(.*)/) {
- +        if ($1 ne '') {
- +            $line = ".if 0\n";
- +        } else {
- +            $line = ".if 1\n";
- +        }
- +    } elsif ($line =~ /\.ifc\s+(.*)\s*,\s*(.*)/) {
- +        if ($1 eq $2) {
- +            $line = ".if 1\n";
- +        } else {
- +            $line = ".if 0\n";
- +        }
- +    }
- +
- +    # handle .previous (only with regard to .section not .subsection)
- +    if ($line =~ /\.(section|text|const_data)/) {
- +        push(@sections, $line);
- +    } elsif ($line =~ /\.previous/) {
- +        if (!$sections[-2]) {
- +            die ".previous without a previous section";
- +        }
- +        $line = $sections[-2];
- +        push(@sections, $line);
- +    }
- +
- +    # handle ldr <reg>, =<expr>
- +    if ($line =~ /(.*)\s*ldr([\w\s\d]+)\s*,\s*=(.*)/) {
- +        # one label per distinct expression, reused until the next .ltorg
- +        my $label = $literal_labels{$3};
- +        if (!$label) {
- +            $label = ".Literal_$literal_num";
- +            $literal_num++;
- +            $literal_labels{$3} = $label;
- +        }
- +        $line = "$1 ldr$2, $label\n";
- +    } elsif ($line =~ /\.ltorg/) {
- +        # emit the pending literals right after the .ltorg line
- +        foreach my $literal (keys %literal_labels) {
- +            $line .= "$literal_labels{$literal}:\n .word $literal\n";
- +        }
- +        %literal_labels = ();
- +    }
- +
- +    # @l -> lo16() @ha -> ha16()
- +    $line =~ s/,\s+([^,]+)\@l(\s)/, lo16($1)$2/g;
- +    $line =~ s/,\s+([^,]+)\@ha(\s)/, ha16($1)$2/g;
- +
- +    if ($line =~ /\.rept\s+(.*)/) {
- +        $num_repts = $1;
- +        # non-empty from the start, so the "elsif ($rept_lines)" branch below
- +        # begins collecting with the very next line
- +        $rept_lines = "\n";
- +
- +        # handle the possibility of repeating another directive on the same line
- +        # .endr on the same line is not valid, I don't know if a non-directive is
- +        if ($num_repts =~ s/(\.\w+.*)//) {
- +            $rept_lines .= "$1\n";
- +        }
- +        # the count may be an arithmetic expression such as "16 / 2 - 1"
- +        $num_repts = eval($num_repts);
- +    } elsif ($line =~ /\.endr/) {
- +        for (1 .. $num_repts) {
- +            print ASMFILE $rept_lines;
- +        }
- +        $rept_lines = '';
- +    } elsif ($rept_lines) {
- +        $rept_lines .= $line;
- +    } else {
- +        print ASMFILE $line;
- +    }
- +}
- +
- +# emit the literals still pending (those not already written out at a .ltorg)
- +# after a final .text directive
- +print ASMFILE ".text\n";
- +foreach my $literal (keys %literal_labels) {
- + print ASMFILE "$literal_labels{$literal}:\n .word $literal\n";
- +}
- +
- +# closing the pipe waits for the assembler; exit with status 1 if that fails
- +close(ASMFILE) or exit 1;
- --
- 1.6.1.2
- From d3dfd8704d23ae2c723263478e40326b51a2ceaf Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Sat, 13 Feb 2010 11:19:38 -0800
- Subject: [PATCH 25/26] Don't even try direct temporal when it would give junk MVs
- In PbBbP pyramid structure, the last "b" cannot use temporal because L0Ref0(L1Ref0) != L0Ref0.
- Don't even bother analyzing it, just use spatial.
- Should improve speed and direct auto effectiveness in CRF and 1-pass modes when b-pyramid is used.
- Also makes --direct temporal useful with --b-pyramid, since it will fall back to spatial for frames where temporal is broken.
- ---
- common/frame.h | 1 +
- encoder/encoder.c | 30 +++++++++++++++++++++---------
- 2 files changed, 22 insertions(+), 9 deletions(-)
- diff --git a/common/frame.h b/common/frame.h
- index b1852b3..7c8e2ff 100644
- --- a/common/frame.h
- +++ b/common/frame.h
- @@ -48,6 +48,7 @@ typedef struct x264_frame
- uint8_t i_bframes; /* number of bframes following this nonb in coded order */
- float f_qp_avg_rc; /* QPs as decided by ratecontrol */
- float f_qp_avg_aq; /* QPs as decided by AQ in addition to ratecontrol */
- + int i_poc_l0ref0; /* poc of first refframe in L0, used to check if direct temporal is possible */
- /* YUV buffer */
- int i_plane;
- diff --git a/encoder/encoder.c b/encoder/encoder.c
- index cca9c45..df62389 100644
- --- a/encoder/encoder.c
- +++ b/encoder/encoder.c
- @@ -108,12 +108,24 @@ static void x264_slice_header_init( x264_t *h, x264_slice_header_t *sh,
- sh->i_redundant_pic_cnt = 0;
- - if( !h->mb.b_direct_auto_read )
- + h->mb.b_direct_auto_write = h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO
- + && h->param.i_bframe
- + && ( h->param.rc.b_stat_write || !h->param.rc.b_stat_read );
- +
- + if( !h->mb.b_direct_auto_read && sh->i_type == SLICE_TYPE_B )
- {
- - if( h->mb.b_direct_auto_write )
- - sh->b_direct_spatial_mv_pred = ( h->stat.i_direct_score[1] > h->stat.i_direct_score[0] );
- + if( h->fref1[0]->i_poc_l0ref0 == h->fref0[0]->i_poc )
- + {
- + if( h->mb.b_direct_auto_write )
- + sh->b_direct_spatial_mv_pred = ( h->stat.i_direct_score[1] > h->stat.i_direct_score[0] );
- + else
- + sh->b_direct_spatial_mv_pred = ( param->analyse.i_direct_mv_pred == X264_DIRECT_PRED_SPATIAL );
- + }
- else
- - sh->b_direct_spatial_mv_pred = ( param->analyse.i_direct_mv_pred == X264_DIRECT_PRED_SPATIAL );
- + {
- + h->mb.b_direct_auto_write = 0;
- + sh->b_direct_spatial_mv_pred = 1;
- + }
- }
- /* else b_direct_spatial_mv_pred was read from the 2pass statsfile */
- @@ -623,10 +635,6 @@ static int x264_validate_parameters( x264_t *h )
- h->param.i_sync_lookahead = 0;
- #endif
- - h->mb.b_direct_auto_write = h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO
- - && h->param.i_bframe
- - && ( h->param.rc.b_stat_write || !h->param.rc.b_stat_read );
- -
- h->param.i_deblocking_filter_alphac0 = x264_clip3( h->param.i_deblocking_filter_alphac0, -6, 6 );
- h->param.i_deblocking_filter_beta = x264_clip3( h->param.i_deblocking_filter_beta, -6, 6 );
- h->param.analyse.i_luma_deadzone[0] = x264_clip3( h->param.analyse.i_luma_deadzone[0], 0, 32 );
- @@ -2371,6 +2379,9 @@ int x264_encoder_encode( x264_t *h,
- x264_reference_check_reorder( h );
- }
- + if( h->i_ref0 )
- + h->fdec->i_poc_l0ref0 = h->fref0[0]->i_poc;
- +
- if( h->sh.i_type == SLICE_TYPE_B )
- x264_macroblock_bipred_init( h );
- @@ -2806,7 +2817,8 @@ void x264_encoder_close ( x264_t *h )
- x264_log( h, X264_LOG_INFO, "8x8 transform intra:%.1f%%%s\n", 100. * i_i8x8 / i_intra, buf );
- }
- - if( h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO
- + if( (h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO ||
- + (h->stat.i_direct_frames[0] && h->stat.i_direct_frames[1]))
- && h->stat.i_frame_count[SLICE_TYPE_B] )
- {
- x264_log( h, X264_LOG_INFO, "direct mvs spatial:%.1f%% temporal:%.1f%%\n",
- --
- 1.6.1.2
- From dcf583527aea433e7a3972cd7597d167bb8f3fe5 Mon Sep 17 00:00:00 2001
- From: Loren Merritt <pengvado@akuvian.org>
- Date: Thu, 28 Jan 2010 18:09:07 +0000
- Subject: [PATCH 26/26] Remove unnecessary PIC support macros
- yasm has a directive to enable PIC globally
- ---
- common/x86/cabac-a.asm | 2 +-
- common/x86/dct-32.asm | 10 ++++----
- common/x86/dct-64.asm | 10 ++++----
- common/x86/dct-a.asm | 42 +++++++++++++++++++-------------------
- common/x86/deblock-a.asm | 34 +++++++++++++++---------------
- common/x86/mc-a.asm | 40 ++++++++++++++++++------------------
- common/x86/mc-a2.asm | 30 +++++++++++++-------------
- common/x86/pixel-a.asm | 50 +++++++++++++++++++++++-----------------------
- common/x86/predict-a.asm | 28 ++++++++++++------------
- common/x86/quant-a.asm | 22 ++++++++++----------
- common/x86/sad-a.asm | 14 ++++++------
- common/x86/x86inc.asm | 20 +++--------------
- common/x86/x86util.asm | 4 +-
- tools/checkasm-a.asm | 16 +++++++-------
- 14 files changed, 155 insertions(+), 167 deletions(-)
- diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm
- index 29e05f1..62e281a 100644
- --- a/common/x86/cabac-a.asm
- +++ b/common/x86/cabac-a.asm
- @@ -59,7 +59,7 @@ endstruc
- %macro LOAD_GLOBAL 4
- %ifdef PIC
- ; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
- - lea r11, [%2 GLOBAL]
- + lea r11, [%2]
- %ifnidn %3, 0
- add r11, %3
- %endif
- diff --git a/common/x86/dct-32.asm b/common/x86/dct-32.asm
- index a713dd6..3350e40 100644
- --- a/common/x86/dct-32.asm
- +++ b/common/x86/dct-32.asm
- @@ -349,7 +349,7 @@ cglobal x264_sub8x8_dct_%1, 3,3
- global x264_sub8x8_dct_%1.skip_prologue
- .skip_prologue:
- %ifnidn %1, sse2
- - mova m7, [hsub_mul GLOBAL]
- + mova m7, [hsub_mul]
- %endif
- LOAD_DIFF8x4 0, 1, 2, 3, 6, 7, r1, r2-4*FDEC_STRIDE
- SPILL r0, 1,2
- @@ -393,7 +393,7 @@ global x264_sub8x8_dct8_%1.skip_prologue
- LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+3*FDEC_STRIDE]
- UNSPILL r0, 0
- %else
- - mova m7, [hsub_mul GLOBAL]
- + mova m7, [hsub_mul]
- LOAD_DIFF8x4 0, 1, 2, 3, 4, 7, r1, r2-4*FDEC_STRIDE
- SPILL r0, 0,1
- SWAP 1, 7
- @@ -441,9 +441,9 @@ global x264_add8x8_idct_sse2.skip_prologue
- SPILL r1, 0
- TRANSPOSE2x4x4W 4,5,6,7,0
- UNSPILL r1, 0
- - paddw m0, [pw_32 GLOBAL]
- + paddw m0, [pw_32]
- IDCT4_1D 0,1,2,3,r1
- - paddw m4, [pw_32 GLOBAL]
- + paddw m4, [pw_32]
- IDCT4_1D 4,5,6,7,r1
- SPILL r1, 6,7
- pxor m7, m7
- @@ -466,7 +466,7 @@ global x264_add8x8_idct8_sse2.skip_prologue
- IDCT8_1D 0,1,2,3,4,5,6,7,r1
- SPILL r1, 6
- TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r1+0x60],[r1+0x40],1
- - paddw m0, [pw_32 GLOBAL]
- + paddw m0, [pw_32]
- SPILL r1, 0
- IDCT8_1D 0,1,2,3,4,5,6,7,r1
- SPILL r1, 6,7
- diff --git a/common/x86/dct-64.asm b/common/x86/dct-64.asm
- index 9915789..ba7741e 100644
- --- a/common/x86/dct-64.asm
- +++ b/common/x86/dct-64.asm
- @@ -143,7 +143,7 @@ INIT_XMM
- cglobal x264_sub8x8_dct_%1, 3,3,11
- add r2, 4*FDEC_STRIDE
- %ifnidn %1, sse2
- - mova m7, [hsub_mul GLOBAL]
- + mova m7, [hsub_mul]
- %endif
- %ifdef WIN64
- call .skip_prologue
- @@ -170,7 +170,7 @@ global x264_sub8x8_dct_%1.skip_prologue
- cglobal x264_sub8x8_dct8_%1, 3,3,11
- add r2, 4*FDEC_STRIDE
- %ifnidn %1, sse2
- - mova m7, [hsub_mul GLOBAL]
- + mova m7, [hsub_mul]
- %endif
- %ifdef WIN64
- call .skip_prologue
- @@ -227,7 +227,7 @@ global x264_add8x8_idct8_sse2.skip_prologue
- movdqa m7, [r1+0x70]
- IDCT8_1D 0,1,2,3,4,5,6,7,8,10
- TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
- - paddw m0, [pw_32 GLOBAL] ; rounding for the >>6 at the end
- + paddw m0, [pw_32] ; rounding for the >>6 at the end
- IDCT8_1D 0,1,2,3,4,5,6,7,8,10
- DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
- DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
- @@ -265,9 +265,9 @@ global x264_add8x8_idct_sse2.skip_prologue
- TRANSPOSE2x4x4W 0,1,2,3,8
- IDCT4_1D 4,5,6,7,8,10
- TRANSPOSE2x4x4W 4,5,6,7,8
- - paddw m0, [pw_32 GLOBAL]
- + paddw m0, [pw_32]
- IDCT4_1D 0,1,2,3,8,10
- - paddw m4, [pw_32 GLOBAL]
- + paddw m4, [pw_32]
- IDCT4_1D 4,5,6,7,8,10
- DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
- DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
- diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
- index d4a0cae..618433c 100644
- --- a/common/x86/dct-a.asm
- +++ b/common/x86/dct-a.asm
- @@ -80,7 +80,7 @@ cglobal x264_dct4x4dc_mmx, 1,1
- movq m2, [r0+16]
- movq m1, [r0+ 8]
- movq m0, [r0+ 0]
- - movq m7, [pw_8000 GLOBAL] ; convert to unsigned and back, so that pavgw works
- + movq m7, [pw_8000] ; convert to unsigned and back, so that pavgw works
- WALSH4_1D 0,1,2,3,4
- TRANSPOSE4x4W 0,1,2,3,4
- SUMSUB_BADC m1, m0, m3, m2, m4
- @@ -123,7 +123,7 @@ cglobal x264_sub4x4_dct_%1, 3,3
- LOAD_DIFF m1, m4, m5, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
- LOAD_DIFF m2, m4, m5, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
- %else
- - mova m5, [hsub_mul GLOBAL]
- + mova m5, [hsub_mul]
- LOAD_DIFF8x4_SSSE3 0, 3, 1, 2, 4, 5, r1, r2
- %endif
- DCT4_1D 0,1,2,3,4
- @@ -151,7 +151,7 @@ cglobal x264_add4x4_idct_mmx, 2,2
- movq m0, [r1+ 0]
- IDCT4_1D 0,1,2,3,4,5
- TRANSPOSE4x4W 0,1,2,3,4
- - paddw m0, [pw_32 GLOBAL]
- + paddw m0, [pw_32]
- IDCT4_1D 0,1,2,3,4,5
- STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE]
- STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE]
- @@ -179,7 +179,7 @@ cglobal x264_add4x4_idct_sse4, 2,2,6
- punpckhdq m2, m0
- SWAP 0, 1
- - mova m1, [pw_32_0 GLOBAL]
- + mova m1, [pw_32_0]
- paddw m1, m0 ; row1/row0 corrected
- psraw m0, 1 ; row1>>1/...
- mova m3, m2 ; row3/row2
- @@ -221,7 +221,7 @@ cglobal %1, 3,3,11
- pxor m7, m7
- %else
- add r2, 4*FDEC_STRIDE
- - mova m7, [hsub_mul GLOBAL]
- + mova m7, [hsub_mul]
- %endif
- .skip_prologue:
- %ifdef WIN64
- @@ -335,7 +335,7 @@ cglobal x264_add8x8_idct_dc_mmx, 2,2
- movq mm0, [r1]
- pxor mm1, mm1
- add r0, FDEC_STRIDE*4
- - paddw mm0, [pw_32 GLOBAL]
- + paddw mm0, [pw_32]
- psraw mm0, 6
- psubw mm1, mm0
- packuswb mm0, mm0
- @@ -354,10 +354,10 @@ cglobal x264_add8x8_idct_dc_ssse3, 2,2
- movq xmm0, [r1]
- pxor xmm1, xmm1
- add r0, FDEC_STRIDE*4
- - paddw xmm0, [pw_32 GLOBAL]
- + paddw xmm0, [pw_32]
- psraw xmm0, 6
- psubw xmm1, xmm0
- - movdqa xmm5, [pb_idctdc_unpack GLOBAL]
- + movdqa xmm5, [pb_idctdc_unpack]
- packuswb xmm0, xmm0
- packuswb xmm1, xmm1
- pshufb xmm0, xmm5
- @@ -393,7 +393,7 @@ cglobal x264_add16x16_idct_dc_mmx, 2,3
- .loop:
- movq mm0, [r1]
- pxor mm1, mm1
- - paddw mm0, [pw_32 GLOBAL]
- + paddw mm0, [pw_32]
- psraw mm0, 6
- psubw mm1, mm0
- packuswb mm0, mm0
- @@ -447,8 +447,8 @@ cglobal x264_add16x16_idct_dc_sse2, 2,2,8
- punpcklwd xmm2, xmm2
- pxor xmm1, xmm1
- pxor xmm3, xmm3
- - paddw xmm0, [pw_32 GLOBAL]
- - paddw xmm2, [pw_32 GLOBAL]
- + paddw xmm0, [pw_32]
- + paddw xmm2, [pw_32]
- psraw xmm0, 6
- psraw xmm2, 6
- psubw xmm1, xmm0
- @@ -477,11 +477,11 @@ cglobal x264_add16x16_idct_dc_ssse3, 2,2,8
- movdqa xmm0, [r1]
- add r1, 16
- pxor xmm1, xmm1
- - paddw xmm0, [pw_32 GLOBAL]
- + paddw xmm0, [pw_32]
- psraw xmm0, 6
- psubw xmm1, xmm0
- - movdqa xmm5, [ pb_idctdc_unpack GLOBAL]
- - movdqa xmm6, [pb_idctdc_unpack2 GLOBAL]
- + movdqa xmm5, [ pb_idctdc_unpack]
- + movdqa xmm6, [pb_idctdc_unpack2]
- packuswb xmm0, xmm0
- packuswb xmm1, xmm1
- movdqa xmm2, xmm0
- @@ -815,8 +815,8 @@ cglobal x264_zigzag_scan_4x4_frame_mmx, 2,2
- cglobal x264_zigzag_scan_4x4_frame_ssse3, 2,2
- movdqa xmm1, [r1+16]
- movdqa xmm0, [r1]
- - pshufb xmm1, [pb_scan4frameb GLOBAL]
- - pshufb xmm0, [pb_scan4framea GLOBAL]
- + pshufb xmm1, [pb_scan4frameb]
- + pshufb xmm0, [pb_scan4framea]
- movdqa xmm2, xmm1
- psrldq xmm1, 6
- palignr xmm2, xmm0, 6
- @@ -963,9 +963,9 @@ cglobal x264_zigzag_sub_4x4%1_%2_ssse3, 3,3,8
- punpcklqdq xmm0, xmm2
- punpcklqdq xmm4, xmm6
- %ifidn %2, frame
- - movdqa xmm7, [pb_sub4frame GLOBAL]
- + movdqa xmm7, [pb_sub4frame]
- %else
- - movdqa xmm7, [pb_sub4field GLOBAL]
- + movdqa xmm7, [pb_sub4field]
- %endif
- pshufb xmm0, xmm7
- pshufb xmm4, xmm7
- @@ -980,7 +980,7 @@ cglobal x264_zigzag_sub_4x4%1_%2_ssse3, 3,3,8
- psubw xmm1, xmm5
- %ifidn %1, ac
- movd r2d, xmm0
- - pand xmm0, [pb_subacmask GLOBAL]
- + pand xmm0, [pb_subacmask]
- %endif
- movdqa [r0], xmm0
- pxor xmm2, xmm2
- @@ -1039,7 +1039,7 @@ cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 3,3
- packsswb m5, m5
- pxor m0, m0
- pcmpeqb m5, m0
- - paddb m5, [pb_1 GLOBAL]
- + paddb m5, [pb_1]
- movd r0d, m5
- mov [r2+0], r0w
- shr r0d, 16
- @@ -1085,7 +1085,7 @@ cglobal x264_zigzag_interleave_8x8_cavlc_sse2, 3,3,8
- packsswb m2, m2
- packsswb m2, m2
- pcmpeqb m5, m2
- - paddb m5, [pb_1 GLOBAL]
- + paddb m5, [pb_1]
- movd r0d, m5
- mov [r2+0], r0w
- shr r0d, 16
- diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
- index 75b308f..00d0418 100644
- --- a/common/x86/deblock-a.asm
- +++ b/common/x86/deblock-a.asm
- @@ -233,19 +233,19 @@ SECTION .text
- ; clobbers: m0,3-6
- %macro DEBLOCK_P0_Q0 0
- mova m5, m1
- - pxor m5, m2 ; p0^q0
- - pand m5, [pb_01 GLOBAL] ; (p0^q0)&1
- + pxor m5, m2 ; p0^q0
- + pand m5, [pb_01] ; (p0^q0)&1
- pcmpeqb m4, m4
- pxor m3, m4
- - pavgb m3, m0 ; (p1 - q1 + 256)>>1
- - pavgb m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
- + pavgb m3, m0 ; (p1 - q1 + 256)>>1
- + pavgb m3, [pb_03] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
- pxor m4, m1
- - pavgb m4, m2 ; (q0 - p0 + 256)>>1
- + pavgb m4, m2 ; (q0 - p0 + 256)>>1
- pavgb m3, m5
- - paddusb m3, m4 ; d+128+33
- - mova m6, [pb_a1 GLOBAL]
- + paddusb m3, m4 ; d+128+33
- + mova m6, [pb_a1]
- psubusb m6, m3
- - psubusb m3, [pb_a1 GLOBAL]
- + psubusb m3, [pb_a1]
- pminub m6, m7
- pminub m3, m7
- psubusb m1, m6
- @@ -261,10 +261,10 @@ SECTION .text
- %macro LUMA_Q1 6
- mova %6, m1
- pavgb %6, m2
- - pavgb %2, %6 ; avg(p2,avg(p0,q0))
- + pavgb %2, %6 ; avg(p2,avg(p0,q0))
- pxor %6, %3
- - pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
- - psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
- + pand %6, [pb_01] ; (p2^avg(p0,q0))&1
- + psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
- mova %6, %1
- psubusb %6, %5
- paddusb %5, %1
- @@ -614,8 +614,8 @@ DEBLOCK_LUMA sse2, v, 16
- %define mask0 spill(2)
- %define mask1p spill(3)
- %define mask1q spill(4)
- - %define mpb_00 [pb_00 GLOBAL]
- - %define mpb_01 [pb_01 GLOBAL]
- + %define mpb_00 [pb_00]
- + %define mpb_01 [pb_01]
- %endif
- ;-----------------------------------------------------------------------------
- @@ -639,7 +639,7 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
- mova q1, [r0+r1]
- %ifdef ARCH_X86_64
- pxor mpb_00, mpb_00
- - mova mpb_01, [pb_01 GLOBAL]
- + mova mpb_01, [pb_01]
- LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
- SWAP 7, 12 ; m12=mask0
- pavgb t5, mpb_00
- @@ -658,8 +658,8 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
- LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
- mova m4, t5
- mova mask0, m7
- - pavgb m4, [pb_00 GLOBAL]
- - pavgb m4, [pb_01 GLOBAL] ; alpha/4+1
- + pavgb m4, [pb_00]
- + pavgb m4, [pb_01] ; alpha/4+1
- DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
- pand m6, mask0
- DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
- @@ -835,7 +835,7 @@ chroma_inter_body_mmxext:
- %macro CHROMA_INTRA_P0 3
- movq m4, %1
- pxor m4, %3
- - pand m4, [pb_01 GLOBAL] ; m4 = (p0^q1)&1
- + pand m4, [pb_01] ; m4 = (p0^q1)&1
- pavgb %1, %3
- psubusb %1, m4
- pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
- diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
- index f486a8d..9783066 100644
- --- a/common/x86/mc-a.asm
- +++ b/common/x86/mc-a.asm
- @@ -89,9 +89,9 @@ SECTION .text
- %macro BIWEIGHT_START_MMX 0
- movd m2, r6m
- SPLATW m2, m2 ; weight_dst
- - mova m3, [pw_64 GLOBAL]
- + mova m3, [pw_64]
- psubw m3, m2 ; weight_src
- - mova m4, [pw_32 GLOBAL] ; rounding
- + mova m4, [pw_32] ; rounding
- pxor m5, m5
- %endmacro
- @@ -111,7 +111,7 @@ SECTION .text
- shl t7d, 8
- add t6d, t7d
- movd m3, t6d
- - mova m4, [pw_32 GLOBAL]
- + mova m4, [pw_32]
- SPLATW m3, m3 ; weight_dst,src
- %endmacro
- @@ -641,7 +641,7 @@ AVG2_W20 sse2_misalign
- %macro INIT_SHIFT 2
- and eax, 7
- shl eax, 3
- - movd %1, [sw_64 GLOBAL]
- + movd %1, [sw_64]
- movd %2, eax
- psubw %1, %2
- %endmacro
- @@ -778,10 +778,10 @@ cglobal x264_pixel_avg2_w16_cache64_ssse3
- shl r6, 4 ;jump = (offset + align*2)*48
- %define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
- %ifdef PIC
- - lea r11, [avg_w16_addr GLOBAL]
- + lea r11, [avg_w16_addr]
- add r6, r11
- %else
- - lea r6, [avg_w16_addr + r6 GLOBAL]
- + lea r6, [avg_w16_addr + r6]
- %endif
- %ifdef UNIX64
- jmp r6
- @@ -1007,7 +1007,7 @@ cglobal x264_mc_chroma_%1
- SPLATW m5, m5 ; m5 = dx
- SPLATW m6, m6 ; m6 = dy
- - mova m4, [pw_8 GLOBAL]
- + mova m4, [pw_8]
- mova m0, m4
- psubw m4, m5 ; m4 = 8-dx
- psubw m0, m6 ; m0 = 8-dy
- @@ -1042,7 +1042,7 @@ cglobal x264_mc_chroma_%1
- punpcklbw m2, m3
- punpcklbw m1, m3
- - paddw m0, [pw_32 GLOBAL]
- + paddw m0, [pw_32]
- pmullw m2, m5 ; line * cB
- pmullw m1, m7 ; line * cD
- @@ -1084,9 +1084,9 @@ cglobal x264_mc_chroma_%1
- movd m6, r4d
- mov r5d, 1
- .mc1d:
- - mova m5, [pw_8 GLOBAL]
- + mova m5, [pw_8]
- SPLATW m6, m6
- - mova m7, [pw_4 GLOBAL]
- + mova m7, [pw_4]
- psubw m5, m6
- movifnidn r0, r0mp
- movifnidn r1d, r1m
- @@ -1166,7 +1166,7 @@ cglobal x264_mc_chroma_ssse3%1, 0,6,%2
- imul r4d, t0d ; (x*255+8)*(8-y)
- cmp dword r6m, 4
- jg .width8
- - mova m5, [pw_32 GLOBAL]
- + mova m5, [pw_32]
- movd m6, r5d
- movd m7, r4d
- movifnidn r0, r0mp
- @@ -1178,10 +1178,10 @@ cglobal x264_mc_chroma_ssse3%1, 0,6,%2
- and r2, ~3
- and r5, 3
- %ifdef PIC
- - lea r11, [ch_shuffle GLOBAL]
- + lea r11, [ch_shuffle]
- movu m5, [r11 + r5*2]
- %else
- - movu m5, [ch_shuffle + r5*2 GLOBAL]
- + movu m5, [ch_shuffle + r5*2]
- %endif
- movu m0, [r2]
- pshufb m0, m5
- @@ -1197,8 +1197,8 @@ cglobal x264_mc_chroma_ssse3%1, 0,6,%2
- pmaddubsw m1, m6
- pmaddubsw m2, m7
- pmaddubsw m3, m6
- - paddw m0, [pw_32 GLOBAL]
- - paddw m2, [pw_32 GLOBAL]
- + paddw m0, [pw_32]
- + paddw m2, [pw_32]
- paddw m1, m0
- paddw m3, m2
- mova m0, m4
- @@ -1228,7 +1228,7 @@ INIT_XMM
- cmp r5, 0x38
- jge .split
- %endif
- - mova m5, [pw_32 GLOBAL]
- + mova m5, [pw_32]
- movh m0, [r2]
- movh m1, [r2+1]
- punpcklbw m0, m1
- @@ -1265,18 +1265,18 @@ INIT_XMM
- and r2, ~7
- and r5, 7
- %ifdef PIC
- - lea r11, [ch_shuffle GLOBAL]
- + lea r11, [ch_shuffle]
- movu m5, [r11 + r5*2]
- %else
- - movu m5, [ch_shuffle + r5*2 GLOBAL]
- + movu m5, [ch_shuffle + r5*2]
- %endif
- movu m0, [r2]
- pshufb m0, m5
- %ifdef ARCH_X86_64
- - mova m8, [pw_32 GLOBAL]
- + mova m8, [pw_32]
- %define round m8
- %else
- - %define round [pw_32 GLOBAL]
- + %define round [pw_32]
- %endif
- .splitloop8:
- movu m1, [r2+r3]
- diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
- index 245c09f..f2e69c0 100644
- --- a/common/x86/mc-a2.asm
- +++ b/common/x86/mc-a2.asm
- @@ -125,7 +125,7 @@ cglobal x264_hpel_filter_v_%1, 5,6,%2
- %ifnidn %1, ssse3
- pxor m0, m0
- %else
- - mova m0, [filt_mul51 GLOBAL]
- + mova m0, [filt_mul51]
- %endif
- .loop:
- %ifidn %1, ssse3
- @@ -142,8 +142,8 @@ cglobal x264_hpel_filter_v_%1, 5,6,%2
- pmaddubsw m4, m0
- pmaddubsw m2, m0
- pmaddubsw m5, m0
- - pmaddubsw m3, [filt_mul20 GLOBAL]
- - pmaddubsw m6, [filt_mul20 GLOBAL]
- + pmaddubsw m3, [filt_mul20]
- + pmaddubsw m6, [filt_mul20]
- paddw m1, m2
- paddw m4, m5
- paddw m1, m3
- @@ -155,7 +155,7 @@ cglobal x264_hpel_filter_v_%1, 5,6,%2
- LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
- FILT_V2
- %endif
- - mova m7, [pw_16 GLOBAL]
- + mova m7, [pw_16]
- mova [r2+r4*2], m1
- mova [r2+r4*2+mmsize], m4
- paddw m1, m7
- @@ -180,7 +180,7 @@ cglobal x264_hpel_filter_c_mmxext, 3,3
- lea r1, [r1+r2*2]
- neg r2
- %define src r1+r2*2
- - movq m7, [pw_32 GLOBAL]
- + movq m7, [pw_32]
- .loop:
- movq m1, [src-4]
- movq m2, [src-2]
- @@ -237,7 +237,7 @@ cglobal x264_hpel_filter_h_mmxext, 3,3
- punpcklbw m7, m0
- punpcklbw m6, m0
- paddw m6, m7 ; a1
- - movq m7, [pw_1 GLOBAL]
- + movq m7, [pw_1]
- FILT_H2 m1, m2, m3, m4, m5, m6
- FILT_PACK m1, m4, 1
- movntq [r0+r2], m1
- @@ -257,13 +257,13 @@ cglobal x264_hpel_filter_c_%1, 3,3,9
- neg r2
- %define src r1+r2*2
- %ifidn %1, ssse3
- - mova m7, [pw_32 GLOBAL]
- + mova m7, [pw_32]
- %define tpw_32 m7
- %elifdef ARCH_X86_64
- - mova m8, [pw_32 GLOBAL]
- + mova m8, [pw_32]
- %define tpw_32 m8
- %else
- - %define tpw_32 [pw_32 GLOBAL]
- + %define tpw_32 [pw_32]
- %endif
- .loop:
- %ifidn %1,sse2_misalign
- @@ -340,7 +340,7 @@ cglobal x264_hpel_filter_h_sse2, 3,3,8
- punpcklbw m6, m0
- punpcklbw m7, m0
- paddw m6, m7 ; c1
- - mova m7, [pw_1 GLOBAL] ; FIXME xmm8
- + mova m7, [pw_1] ; FIXME xmm8
- FILT_H2 m1, m2, m3, m4, m5, m6
- FILT_PACK m1, m4, 1
- movntdq [r0+r2], m1
- @@ -362,7 +362,7 @@ cglobal x264_hpel_filter_h_ssse3, 3,3
- punpcklbw m1, m0 ; 00 -1 00 -2 00 -3 00 -4 00 -5 00 -6 00 -7 00 -8
- movh m2, [src]
- punpcklbw m2, m0
- - mova m7, [pw_1 GLOBAL]
- + mova m7, [pw_1]
- .loop:
- movh m3, [src+8]
- punpcklbw m3, m0
- @@ -436,7 +436,7 @@ HPEL_V ssse3
- mova m3, [r1]
- mova %4, [r1+r2]
- mova m0, [r1+r2*2]
- - mova %2, [filt_mul51 GLOBAL]
- + mova %2, [filt_mul51]
- mova m4, m1
- punpcklbw m1, m2
- punpckhbw m4, m2
- @@ -452,8 +452,8 @@ HPEL_V ssse3
- pmaddubsw m4, %2
- pmaddubsw m0, %2
- pmaddubsw m2, %2
- - pmaddubsw m3, [filt_mul20 GLOBAL]
- - pmaddubsw %1, [filt_mul20 GLOBAL]
- + pmaddubsw m3, [filt_mul20]
- + pmaddubsw %1, [filt_mul20]
- psrlw %3, 8
- psrlw %4, 8
- paddw m1, m0
- @@ -1096,7 +1096,7 @@ cglobal x264_mbtree_propagate_cost_sse2, 6,6
- add r4, r5
- neg r5
- pxor xmm5, xmm5
- - movdqa xmm4, [pd_128 GLOBAL]
- + movdqa xmm4, [pd_128]
- .loop:
- movq xmm2, [r2+r5] ; intra
- movq xmm0, [r4+r5] ; invq
- diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
- index d94daaf..46b4557 100644
- --- a/common/x86/pixel-a.asm
- +++ b/common/x86/pixel-a.asm
- @@ -59,7 +59,7 @@ SECTION .text
- %endmacro
- %macro HADDW 2
- - pmaddwd %1, [pw_1 GLOBAL]
- + pmaddwd %1, [pw_1]
- HADDD %1, %2
- %endmacro
- @@ -244,9 +244,9 @@ cglobal x264_pixel_ssd_%1x%2_%3, 0,0,0
- %endif
- %ifidn %3, ssse3
- - mova m7, [hsub_mul GLOBAL]
- + mova m7, [hsub_mul]
- %elifidn %3, sse2
- - mova m7, [pw_00ff GLOBAL]
- + mova m7, [pw_00ff]
- %elif %1 >= mmsize
- pxor m7, m7
- %endif
- @@ -310,7 +310,7 @@ SSD 4, 8, ssse3
- pxor m5, m5 ; sum
- pxor m6, m6 ; sum squared
- %if %1
- - mova m7, [pw_00ff GLOBAL]
- + mova m7, [pw_00ff]
- %else
- pxor m7, m7 ; zero
- %endif
- @@ -482,7 +482,7 @@ cglobal x264_pixel_var2_8x8_sse2, 5,6,8
- cglobal x264_pixel_var2_8x8_ssse3, 5,6,8
- pxor m5, m5 ; sum
- pxor m6, m6 ; sum squared
- - mova m7, [hsub_mul GLOBAL]
- + mova m7, [hsub_mul]
- mov r5d, 2
- .loop:
- movq m0, [r0]
- @@ -775,7 +775,7 @@ cglobal x264_pixel_satd_4x4_mmxext, 4,6
- %macro SATD_START_SSE2 3
- %ifnidn %1, sse2
- - mova %3, [hmul_8p GLOBAL]
- + mova %3, [hmul_8p]
- %endif
- lea r4, [3*r1]
- lea r5, [3*r3]
- @@ -815,7 +815,7 @@ INIT_XMM
- %ifnidn %1, sse2
- cglobal x264_pixel_satd_4x4_%1, 4, 6, 6
- SATD_START_MMX
- - mova m4, [hmul_4p GLOBAL]
- + mova m4, [hmul_4p]
- LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
- LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
- LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
- @@ -832,7 +832,7 @@ cglobal x264_pixel_satd_4x4_%1, 4, 6, 6
- cglobal x264_pixel_satd_4x8_%1, 4, 6, 8
- SATD_START_MMX
- %ifnidn %1, sse2
- - mova m7, [hmul_4p GLOBAL]
- + mova m7, [hmul_4p]
- %endif
- movd m4, [r2]
- movd m5, [r2+r3]
- @@ -889,14 +889,14 @@ cglobal x264_pixel_satd_16x4_internal_%1
- cglobal x264_pixel_satd_16x8_%1, 4,6,12
- SATD_START_SSE2 %1, m10, m7
- %ifidn %1, sse2
- - mova m7, [pw_00ff GLOBAL]
- + mova m7, [pw_00ff]
- %endif
- jmp x264_pixel_satd_16x8_internal_%1
- cglobal x264_pixel_satd_16x16_%1, 4,6,12
- SATD_START_SSE2 %1, m10, m7
- %ifidn %1, sse2
- - mova m7, [pw_00ff GLOBAL]
- + mova m7, [pw_00ff]
- %endif
- call x264_pixel_satd_16x4_internal_%1
- call x264_pixel_satd_16x4_internal_%1
- @@ -977,7 +977,7 @@ cglobal x264_pixel_sa8d_8x8_%1, 4,6,12
- lea r4, [3*r1]
- lea r5, [3*r3]
- %ifnidn %1, sse2
- - mova m7, [hmul_8p GLOBAL]
- + mova m7, [hmul_8p]
- %endif
- call x264_pixel_sa8d_8x8_internal_%1
- HADDW m0, m1
- @@ -990,7 +990,7 @@ cglobal x264_pixel_sa8d_16x16_%1, 4,6,12
- lea r4, [3*r1]
- lea r5, [3*r3]
- %ifnidn %1, sse2
- - mova m7, [hmul_8p GLOBAL]
- + mova m7, [hmul_8p]
- %endif
- call x264_pixel_sa8d_8x8_internal_%1 ; pix[0]
- add r2, 8
- @@ -1029,7 +1029,7 @@ cglobal x264_pixel_sa8d_8x8_internal_%1
- paddw m0, m1
- HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
- %else ; non-sse2
- - mova m7, [hmul_8p GLOBAL]
- + mova m7, [hmul_8p]
- LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1
- ; could do first HADAMARD4_V here to save spilling later
- ; surprisingly, not a win on conroe or even p4
- @@ -1221,7 +1221,7 @@ cglobal x264_intra_sa8d_x3_8x8_core_%1, 3,3,16
- paddusw m2, m0
- ; 3x HADDW
- - movdqa m7, [pw_1 GLOBAL]
- + movdqa m7, [pw_1]
- pmaddwd m2, m7
- pmaddwd m14, m7
- pmaddwd m15, m7
- @@ -1650,7 +1650,7 @@ cglobal x264_hadamard_ac_2x2max_mmxext
- ret
- cglobal x264_hadamard_ac_8x8_mmxext
- - mova m6, [mask_ac4 GLOBAL]
- + mova m6, [mask_ac4]
- pxor m7, m7
- call x264_hadamard_ac_4x4_mmxext
- add r0, 4
- @@ -1727,7 +1727,7 @@ cglobal x264_pixel_hadamard_ac_%1x%2_mmxext, 2,4
- mova m3, m0
- paddusw m1, [rsp+0x38]
- pxor m3, m2
- - pand m3, [pw_1 GLOBAL]
- + pand m3, [pw_1]
- pavgw m0, m2
- psubusw m0, m3
- HADDUW m0, m2
- @@ -1791,7 +1791,7 @@ cglobal x264_hadamard_ac_8x8_%1
- %endif
- %ifnidn %1, sse2
- ;LOAD_INC loads sumsubs
- - mova m7, [hmul_8p GLOBAL]
- + mova m7, [hmul_8p]
- %else
- ;LOAD_INC only unpacks to words
- pxor m7, m7
- @@ -1834,9 +1834,9 @@ cglobal x264_hadamard_ac_8x8_%1
- paddw m1, m2
- SUMSUB_BA m0, m4; m2
- %ifnidn %1, sse2
- - pand m1, [mask_ac4b GLOBAL]
- + pand m1, [mask_ac4b]
- %else
- - pand m1, [mask_ac4 GLOBAL]
- + pand m1, [mask_ac4]
- %endif
- ABS_MOV m2, spill0
- paddw m1, m3
- @@ -1878,7 +1878,7 @@ cglobal x264_hadamard_ac_8x8_%1
- paddw m2, m1
- paddw m2, m2
- ABS1 m4, m7
- - pand m0, [mask_ac8 GLOBAL]
- + pand m0, [mask_ac8]
- ABS1 m0, m7
- paddw m2, m4
- paddw m0, m2
- @@ -2041,7 +2041,7 @@ cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4,8
- SSIM_ITER 3
- ; PHADDW m1, m2
- ; PHADDD m3, m4
- - movdqa m7, [pw_1 GLOBAL]
- + movdqa m7, [pw_1]
- pshufd m5, m3, 0xb1
- pmaddwd m1, m7
- pmaddwd m2, m7
- @@ -2086,8 +2086,8 @@ cglobal x264_pixel_ssim_end4_sse2, 3,3,7
- paddd m1, m2
- paddd m2, m3
- paddd m3, m4
- - movdqa m5, [ssim_c1 GLOBAL]
- - movdqa m6, [ssim_c2 GLOBAL]
- + movdqa m5, [ssim_c1]
- + movdqa m6, [ssim_c2]
- TRANSPOSE4x4D 0, 1, 2, 3, 4
- ; s1=m0, s2=m1, ss=m2, s12=m3
- @@ -2117,10 +2117,10 @@ cglobal x264_pixel_ssim_end4_sse2, 3,3,7
- je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
- neg r2
- %ifdef PIC
- - lea r3, [mask_ff + 16 GLOBAL]
- + lea r3, [mask_ff + 16]
- movdqu m1, [r3 + r2*4]
- %else
- - movdqu m1, [mask_ff + r2*4 + 16 GLOBAL]
- + movdqu m1, [mask_ff + r2*4 + 16]
- %endif
- pand m4, m1
- .skip:
- diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
- index 808aa31..4d03f8f 100644
- --- a/common/x86/predict-a.asm
- +++ b/common/x86/predict-a.asm
- @@ -99,7 +99,7 @@ SECTION .text
- pavgb %2, %3
- pxor %3, %5
- mov%6 %1, %4
- - pand %3, [pb_1 GLOBAL]
- + pand %3, [pb_1]
- psubusb %2, %3
- pavgb %1, %2
- %endmacro
- @@ -466,7 +466,7 @@ cglobal predict_8x8_dc_mmxext, 2,2
- pxor mm1, mm1
- psadbw mm0, [r1+7]
- psadbw mm1, [r1+16]
- - paddw mm0, [pw_8 GLOBAL]
- + paddw mm0, [pw_8]
- paddw mm0, mm1
- psrlw mm0, 4
- pshufw mm0, mm0, 0
- @@ -481,7 +481,7 @@ cglobal predict_8x8_dc_mmxext, 2,2
- cglobal %1, 2,2
- pxor mm0, mm0
- psadbw mm0, [r1+%2]
- - paddw mm0, [pw_4 GLOBAL]
- + paddw mm0, [pw_4]
- psrlw mm0, 3
- pshufw mm0, mm0, 0
- packuswb mm0, mm0
- @@ -643,7 +643,7 @@ cglobal predict_8x8_vr_core_mmxext, 2,2
- cglobal predict_8x8c_p_core_mmxext, 1,2
- LOAD_PLANE_ARGS
- movq mm1, mm2
- - pmullw mm2, [pw_3210 GLOBAL]
- + pmullw mm2, [pw_3210]
- psllw mm1, 2
- paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
- paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}
- @@ -672,7 +672,7 @@ cglobal predict_16x16_p_core_mmxext, 1,2
- LOAD_PLANE_ARGS
- movq mm5, mm2
- movq mm1, mm2
- - pmullw mm5, [pw_3210 GLOBAL]
- + pmullw mm5, [pw_3210]
- psllw mm2, 3
- psllw mm1, 2
- movq mm3, mm2
- @@ -786,7 +786,7 @@ cglobal predict_8x8_vl_sse2, 2,2
- ;-----------------------------------------------------------------------------
- cglobal predict_8x8_vr_sse2, 2,2,7
- movdqu xmm0, [r1+8]
- - movdqa xmm6, [pw_ff00 GLOBAL]
- + movdqa xmm6, [pw_ff00]
- add r0, 4*FDEC_STRIDE
- movdqa xmm1, xmm0
- movdqa xmm2, xmm0
- @@ -910,7 +910,7 @@ cglobal predict_8x8_hu_%1, 2,2
- add r0, 4*FDEC_STRIDE
- %ifidn %1, ssse3
- movq mm5, [r1+7]
- - movq mm6, [pb_reverse GLOBAL]
- + movq mm6, [pb_reverse]
- movq mm1, mm5
- movq mm2, mm5
- movq mm3, mm5
- @@ -979,7 +979,7 @@ cglobal predict_8x8c_v_mmx, 1,1
- %macro PRED_8x8C_H 1
- cglobal predict_8x8c_h_%1, 1,1
- %ifidn %1, ssse3
- - mova m1, [pb_3 GLOBAL]
- + mova m1, [pb_3]
- %endif
- %assign n 0
- %rep 8
- @@ -1018,7 +1018,7 @@ cglobal predict_8x8c_dc_core_mmxext, 1,1
- pshufw mm2, r2m, 0
- %endif
- psrlw mm0, 3
- - paddw mm1, [pw_2 GLOBAL]
- + paddw mm1, [pw_2]
- movq mm3, mm2
- pshufw mm1, mm1, 0
- pshufw mm0, mm0, 0 ; dc0 (w)
- @@ -1065,7 +1065,7 @@ cglobal predict_8x8c_p_core_sse2, 1,1
- punpcklqdq xmm0, xmm0
- punpcklqdq xmm2, xmm2
- punpcklqdq xmm4, xmm4
- - pmullw xmm2, [pw_76543210 GLOBAL]
- + pmullw xmm2, [pw_76543210]
- paddsw xmm0, xmm2 ; xmm0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
- movdqa xmm3, xmm0
- paddsw xmm3, xmm4
- @@ -1107,7 +1107,7 @@ cglobal predict_16x16_p_core_sse2, 1,2,8
- punpcklqdq xmm1, xmm1
- punpcklqdq xmm2, xmm2
- movdqa xmm3, xmm1
- - pmullw xmm3, [pw_76543210 GLOBAL]
- + pmullw xmm3, [pw_76543210]
- psllw xmm1, 3
- paddsw xmm0, xmm3 ; xmm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
- paddsw xmm1, xmm0 ; xmm1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
- @@ -1162,7 +1162,7 @@ cglobal predict_16x16_v_sse2, 1,1
- cglobal predict_16x16_h_%1, 1,2
- mov r1, FDEC_STRIDE*12
- %ifidn %1, ssse3
- - mova m1, [pb_3 GLOBAL]
- + mova m1, [pb_3]
- %endif
- .vloop:
- %assign n 0
- @@ -1214,7 +1214,7 @@ cglobal predict_16x16_dc_core_mmxext, 1,2
- REP_RET
- cglobal predict_16x16_dc_top_mmxext, 1,2
- - PRED16x16_DC [pw_8 GLOBAL], 4
- + PRED16x16_DC [pw_8], 4
- REP_RET
- cglobal predict_16x16_dc_left_core_mmxext, 1,1
- @@ -1247,7 +1247,7 @@ cglobal predict_16x16_dc_core_sse2, 1,1
- RET
- cglobal predict_16x16_dc_top_sse2, 1,1
- - PRED16x16_DC_SSE2 [pw_8 GLOBAL], 4
- + PRED16x16_DC_SSE2 [pw_8], 4
- RET
- cglobal predict_16x16_dc_left_core_sse2, 1,1
- diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
- index 52e121a..3edd244 100644
- --- a/common/x86/quant-a.asm
- +++ b/common/x86/quant-a.asm
- @@ -86,7 +86,7 @@ SECTION .text
- %endmacro
- %macro QUANT_DC_START_SSSE3 0
- - movdqa m5, [pb_01 GLOBAL]
- + movdqa m5, [pb_01]
- movd m6, r1m ; mf
- movd m7, r2m ; bias
- pshufb m6, m5
- @@ -361,7 +361,7 @@ cglobal x264_dequant_%2x%2_%1, 0,3
- .rshift32:
- neg t0d
- movd m2, t0d
- - mova m3, [pd_1 GLOBAL]
- + mova m3, [pd_1]
- pxor m4, m4
- pslld m3, m2
- psrld m3, 1
- @@ -381,10 +381,10 @@ cglobal x264_dequant_%2x%2_flat16_%1, 0,3
- sub t2d, t1d ; i_mf = i_qp % 6
- shl t2d, %3
- %ifdef PIC
- - lea r1, [dequant%2_scale GLOBAL]
- + lea r1, [dequant%2_scale]
- add r1, t2
- %else
- - lea r1, [dequant%2_scale + t2 GLOBAL]
- + lea r1, [dequant%2_scale + t2]
- %endif
- movifnidn r0, r0mp
- movd m4, t0d
- @@ -446,7 +446,7 @@ cglobal x264_dequant_4x4dc_%1, 0,3
- .rshift32:
- neg t0d
- movd m3, t0d
- - mova m4, [pw_1 GLOBAL]
- + mova m4, [pw_1]
- mova m5, m4
- pslld m4, m3
- psrld m4, 1
- @@ -588,15 +588,15 @@ cextern x264_decimate_table8
- ;This is not true for score64.
- cglobal x264_decimate_score%1_%2, 1,3
- %ifdef PIC
- - lea r10, [x264_decimate_table4 GLOBAL]
- - lea r11, [decimate_mask_table4 GLOBAL]
- + lea r10, [x264_decimate_table4]
- + lea r11, [decimate_mask_table4]
- %define table r10
- %define mask_table r11
- %else
- %define table x264_decimate_table4
- %define mask_table decimate_mask_table4
- %endif
- - DECIMATE_MASK edx, eax, r0, [pb_1 GLOBAL], %2, ecx
- + DECIMATE_MASK edx, eax, r0, [pb_1], %2, ecx
- xor edx, 0xffff
- je .ret
- test eax, eax
- @@ -640,12 +640,12 @@ DECIMATE4x4 16, ssse3
- %ifdef ARCH_X86_64
- cglobal x264_decimate_score64_%1, 1,4
- %ifdef PIC
- - lea r10, [x264_decimate_table8 GLOBAL]
- + lea r10, [x264_decimate_table8]
- %define table r10
- %else
- %define table x264_decimate_table8
- %endif
- - mova m5, [pb_1 GLOBAL]
- + mova m5, [pb_1]
- DECIMATE_MASK r1d, eax, r0, m5, %1, null
- test eax, eax
- jne .ret9
- @@ -681,7 +681,7 @@ cglobal x264_decimate_score64_%1, 1,6
- %else
- cglobal x264_decimate_score64_%1, 1,5
- %endif
- - mova m7, [pb_1 GLOBAL]
- + mova m7, [pb_1]
- DECIMATE_MASK r3, r2, r0, m7, %1, r5
- test r2, r2
- jne .ret9
- diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
- index 342a984..6db8abf 100644
- --- a/common/x86/sad-a.asm
- +++ b/common/x86/sad-a.asm
- @@ -351,7 +351,7 @@ cglobal x264_intra_sad_x3_8x8_mmxext, 3,3
- psadbw m0, m7
- psadbw m1, m6
- paddw m0, m1
- - paddw m0, [pw_8 GLOBAL]
- + paddw m0, [pw_8]
- psrlw m0, 4
- punpcklbw m0, m0
- pshufw m0, m0, 0x0 ;DC prediction
- @@ -411,7 +411,7 @@ cglobal x264_intra_sad_x3_8x8c_%1, 3,3
- movq m6, [r1 - FDEC_STRIDE]
- add r1, FDEC_STRIDE*4
- %ifidn %1,ssse3
- - movq m7, [pb_3 GLOBAL]
- + movq m7, [pb_3]
- %endif
- INTRA_SAD_HV_ITER 0, %1
- INTRA_SAD_HV_ITER 2, %1
- @@ -450,7 +450,7 @@ cglobal x264_intra_sad_x3_8x8c_%1, 3,3
- pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
- %ifidn %1, ssse3
- movq2dq xmm0, m0
- - pshufb xmm0, [pb_shuf8x8c GLOBAL]
- + pshufb xmm0, [pb_shuf8x8c]
- movq xmm1, [r0+FENC_STRIDE*0]
- movq xmm2, [r0+FENC_STRIDE*1]
- movq xmm3, [r0+FENC_STRIDE*2]
- @@ -522,7 +522,7 @@ cglobal x264_intra_sad_x3_16x16_%1,3,5,%2
- paddw mm0, mm1
- movd r3d, mm0
- %ifidn %1, ssse3
- - mova m1, [pb_3 GLOBAL]
- + mova m1, [pb_3]
- %endif
- %assign x 0
- %rep 16
- @@ -1301,10 +1301,10 @@ cglobal x264_pixel_sad_16x%2_cache64_%1
- %endif
- %define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
- %ifdef PIC
- - lea r5, [sad_w16_addr GLOBAL]
- + lea r5, [sad_w16_addr]
- add r5, r4
- %else
- - lea r5, [sad_w16_addr + r4 GLOBAL]
- + lea r5, [sad_w16_addr + r4]
- %endif
- and r2, ~15
- mov r4d, %2/2
- @@ -1323,7 +1323,7 @@ cglobal x264_pixel_sad_16x%2_cache64_%1
- jle x264_pixel_sad_%1x%2_mmxext
- and eax, 7
- shl eax, 3
- - movd mm6, [sw_64 GLOBAL]
- + movd mm6, [sw_64]
- movd mm7, eax
- psubw mm6, mm7
- PROLOGUE 4,5
- diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
- index 2a91084..ee3eca9 100644
- --- a/common/x86/x86inc.asm
- +++ b/common/x86/x86inc.asm
- @@ -65,28 +65,16 @@
- %endif
- %endmacro
- -; PIC support macros.
- -; x86_64 can't fit 64bit address literals in most instruction types,
- -; so shared objects (under the assumption that they might be anywhere
- -; in memory) must use an address mode that does fit.
- -; So all accesses to global variables must use this macro, e.g.
- -; mov eax, [foo GLOBAL]
- -; instead of
- -; mov eax, [foo]
- -;
- -; x86_32 doesn't require PIC.
- -; Some distros prefer shared objects to be PIC, but nothing breaks if
- -; the code contains a few textrels, so we'll skip that complexity.
- -
- %ifdef WIN64
- %define PIC
- %elifndef ARCH_X86_64
- +; x86_32 doesn't require PIC.
- +; Some distros prefer shared objects to be PIC, but nothing breaks if
- +; the code contains a few textrels, so we'll skip that complexity.
- %undef PIC
- %endif
- %ifdef PIC
- - %define GLOBAL wrt rip
- -%else
- - %define GLOBAL
- + default rel
- %endif
- ; Macros to eliminate most code duplication between x86_32 and x86_64:
- diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm
- index b822688..d70bb0e 100644
- --- a/common/x86/x86util.asm
- +++ b/common/x86/x86util.asm
- @@ -239,10 +239,10 @@
- ; %3/%4: source regs
- ; %5/%6: tmp regs
- %ifidn %1, d
- -%define mask [mask_10 GLOBAL]
- +%define mask [mask_10]
- %define shift 16
- %elifidn %1, q
- -%define mask [mask_1100 GLOBAL]
- +%define mask [mask_1100]
- %define shift 32
- %endif
- %if %0==6 ; less dependency if we have two tmp
- diff --git a/tools/checkasm-a.asm b/tools/checkasm-a.asm
- index 966615b..1970cb9 100644
- --- a/tools/checkasm-a.asm
- +++ b/tools/checkasm-a.asm
- @@ -71,19 +71,19 @@ cglobal x264_checkasm_call, 4,7,16
- %endrep
- %assign i 6
- %rep 16-6
- - movdqa xmm %+ i, [x %+ i GLOBAL]
- + movdqa xmm %+ i, [x %+ i]
- %assign i i+1
- %endrep
- - mov r4, [n4 GLOBAL]
- - mov r5, [n5 GLOBAL]
- + mov r4, [n4]
- + mov r5, [n5]
- call r6
- - xor r4, [n4 GLOBAL]
- - xor r5, [n5 GLOBAL]
- + xor r4, [n4]
- + xor r5, [n5]
- or r4, r5
- pxor xmm5, xmm5
- %assign i 6
- %rep 16-6
- - pxor xmm %+ i, [x %+ i GLOBAL]
- + pxor xmm %+ i, [x %+ i]
- por xmm5, xmm %+ i
- %assign i i+1
- %endrep
- @@ -92,7 +92,7 @@ cglobal x264_checkasm_call, 4,7,16
- or r4, r5
- jz .ok
- mov r4, rax
- - lea r0, [error_message GLOBAL]
- + lea r0, [error_message]
- call puts
- mov r1, [rsp+stack_offset+16]
- mov dword [r1], 0
- @@ -132,7 +132,7 @@ cglobal x264_checkasm_call, 1,7
- or r3, r5
- jz .ok
- mov r3, eax
- - lea r1, [error_message GLOBAL]
- + lea r1, [error_message]
- push r1
- call puts
- add esp, 4
- --
- 1.6.1.2
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement