Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- From d7ea15bf5b3dd2ada6449facd52cf2e35db0fbe9 Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Fri, 21 May 2010 13:07:12 -0700
- Subject: [PATCH 01/10] Avoid a redundant qpel check in lookahead with subme <= 1.
- ---
- encoder/me.c | 2 +-
- 1 files changed, 1 insertions(+), 1 deletions(-)
- diff --git a/encoder/me.c b/encoder/me.c
- index a35da53..77073cc 100644
- --- a/encoder/me.c
- +++ b/encoder/me.c
- @@ -852,7 +852,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
- break;
- }
- - if( !b_refine_qpel )
- + if( !b_refine_qpel && (h->pixf.mbcmp_unaligned[0] != h->pixf.fpelcmp[0] || b_chroma_me) )
- {
- bcost = COST_MAX;
- COST_MV_SATD( bmx, bmy, -1 );
- --
- 1.7.0.4
- From 7fc5984e9ad11bafe20d4585848066554fb4a171 Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Fri, 21 May 2010 14:32:13 -0700
- Subject: [PATCH 02/10] Avoid an extra var2 in chroma encoding if possible
- Also remove a redundant if.
- ---
- encoder/analyse.c | 5 ++---
- encoder/macroblock.c | 3 ++-
- 2 files changed, 4 insertions(+), 4 deletions(-)
- diff --git a/encoder/analyse.c b/encoder/analyse.c
- index 8868012..a128a70 100644
- --- a/encoder/analyse.c
- +++ b/encoder/analyse.c
- @@ -2637,9 +2637,8 @@ intra_analysis:
- h->mb.i_partition = D_16x16;
- assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
- /* Set up MVs for future predictors */
- - if( b_skip )
- - for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
- - M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
- + for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
- + M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
- }
- else
- {
- diff --git a/encoder/macroblock.c b/encoder/macroblock.c
- index a961baf..199bb68 100644
- --- a/encoder/macroblock.c
- +++ b/encoder/macroblock.c
- @@ -331,7 +331,8 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
- {
- int thresh = (x264_lambda2_tab[i_qp] + 32) >> 6;
- int ssd[2];
- - int score = h->pixf.var2_8x8( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] );
- + int score = h->pixf.var2_8x8( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] );
- + if( score < thresh*4 )
- score += h->pixf.var2_8x8( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] );
- if( score < thresh*4 )
- {
- --
- 1.7.0.4
- From 038481fb5dd4144946824c7ecd94646d13db1710 Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Fri, 21 May 2010 15:39:38 -0700
- Subject: [PATCH 03/10] Faster deblock strength asm on conroe/penryn
- ---
- common/x86/deblock-a.asm | 24 +++++++++++++++++++++++-
- 1 files changed, 23 insertions(+), 1 deletions(-)
- diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
- index 628ee5d..f2f3e58 100644
- --- a/common/x86/deblock-a.asm
- +++ b/common/x86/deblock-a.asm
- @@ -1022,7 +1022,7 @@ cglobal deblock_strength_mmxext, 6,6
- RET
- %macro DEBLOCK_STRENGTH_XMM 1
- -cglobal deblock_strength_%1, 6,6,7
- +cglobal deblock_strength_%1, 6,6,8
- ; Prepare mv comparison register
- shl r4d, 8
- add r4d, 3 - (1<<8)
- @@ -1040,6 +1040,27 @@ cglobal deblock_strength_%1, 6,6,7
- por m5, m1
- ; Check mvs
- +%ifidn %1, ssse3
- + mova m3, [mv+4*8*0]
- + mova m2, [mv+4*8*1]
- + mova m0, m3
- + mova m1, m2
- + palignr m3, [mv+4*8*0-16], 12
- + palignr m2, [mv+4*8*1-16], 12
- + psubw m0, m3
- + psubw m1, m2
- + packsswb m0, m1
- +
- + mova m3, [mv+4*8*2]
- + mova m7, [mv+4*8*3]
- + mova m2, m3
- + mova m1, m7
- + palignr m3, [mv+4*8*2-16], 12
- + palignr m7, [mv+4*8*3-16], 12
- + psubw m2, m3
- + psubw m1, m7
- + packsswb m2, m1
- +%else
- movu m0, [mv-4+4*8*0]
- movu m1, [mv-4+4*8*1]
- movu m2, [mv-4+4*8*2]
- @@ -1050,6 +1071,7 @@ cglobal deblock_strength_%1, 6,6,7
- psubw m3, [mv+4*8*3]
- packsswb m0, m1
- packsswb m2, m3
- +%endif
- ABSB2 m0, m2, m1, m3
- psubusb m0, m6
- psubusb m2, m6
- --
- 1.7.0.4
- From 50fd9b03194695828b822020133e28430bce3d45 Mon Sep 17 00:00:00 2001
- From: Kieran Kunhya <kieran@kunhya.com>
- Date: Sat, 22 May 2010 14:32:53 +0100
- Subject: [PATCH 04/10] Fix typo in fake-interlaced documentation
- ---
- x264.h | 2 +-
- 1 files changed, 1 insertions(+), 1 deletions(-)
- diff --git a/x264.h b/x264.h
- index b11acf8..f714b72 100644
- --- a/x264.h
- +++ b/x264.h
- @@ -351,7 +351,7 @@ typedef struct x264_param_t
- /* Fake Interlaced.
- *
- - * Used only when b_interlaced=0. Setting this flag to zero makes it possible to flag the stream as PAFF interlaced yet
- + * Used only when b_interlaced=0. Setting this flag makes it possible to flag the stream as PAFF interlaced yet
- * encode all frames progessively. It is useful for encoding 25p and 30p Blu-Ray streams.
- */
- --
- 1.7.0.4
- From 23f7cfda89cd7e8c1632f86a9af887017a05594a Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Mon, 24 May 2010 11:13:22 -0700
- Subject: [PATCH 05/10] Slightly faster mbtree asm
- ---
- common/x86/mc-a2.asm | 5 +++--
- 1 files changed, 3 insertions(+), 2 deletions(-)
- diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
- index 8deb9e0..aee3f0a 100644
- --- a/common/x86/mc-a2.asm
- +++ b/common/x86/mc-a2.asm
- @@ -1111,7 +1111,7 @@ FRAME_INIT_LOWRES ssse3, 12
- ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
- ; uint16_t *inter_costs, uint16_t *inv_qscales, int len )
- ;-----------------------------------------------------------------------------
- -cglobal mbtree_propagate_cost_sse2, 6,6
- +cglobal mbtree_propagate_cost_sse2, 6,6,7
- shl r5d, 1
- lea r0, [r0+r5*2]
- add r1, r5
- @@ -1121,6 +1121,7 @@ cglobal mbtree_propagate_cost_sse2, 6,6
- neg r5
- pxor xmm5, xmm5
- movdqa xmm4, [pd_128]
- + movdqa xmm6, [pw_3fff]
- .loop:
- movq xmm2, [r2+r5] ; intra
- movq xmm0, [r4+r5] ; invq
- @@ -1131,7 +1132,7 @@ cglobal mbtree_propagate_cost_sse2, 6,6
- psrld xmm0, 8 ; intra*invq>>8
- movq xmm3, [r3+r5] ; inter
- movq xmm1, [r1+r5] ; prop
- - pand xmm3, [pw_3fff]
- + pand xmm3, xmm6
- punpcklwd xmm1, xmm5
- punpcklwd xmm3, xmm5
- paddd xmm0, xmm1 ; prop + (intra*invq>>8)
- --
- 1.7.0.4
- From 1d3e99cfff9c31a626a6720bc83f1fd25793d24f Mon Sep 17 00:00:00 2001
- From: Anton Mitrofanov <BugMaster@narod.ru>
- Date: Tue, 25 May 2010 18:45:16 +0400
- Subject: [PATCH 06/10] Fix calculation of total bitrate printed after stop by CTRL+C
- ---
- x264.c | 2 ++
- 1 files changed, 2 insertions(+), 0 deletions(-)
- diff --git a/x264.c b/x264.c
- index c4a7400..3a01854 100644
- --- a/x264.c
- +++ b/x264.c
- @@ -1560,6 +1560,8 @@ static int Encode( x264_param_t *param, cli_opt_t *opt )
- /* duration algorithm fails when only 1 frame is output */
- if( i_frame_output == 1 )
- duration = (double)param->i_fps_den / param->i_fps_num;
- + else if( b_ctrl_c )
- + duration = (double)(2 * last_dts - prev_dts - first_dts) * param->i_timebase_num / param->i_timebase_den;
- else
- duration = (double)(2 * largest_pts - second_largest_pts) * param->i_timebase_num / param->i_timebase_den;
- if( !(opt->i_pulldown && !param->b_vfr_input) )
- --
- 1.7.0.4
- From 9ac371a36e15b18991727d625887fad88154afd8 Mon Sep 17 00:00:00 2001
- From: Anton Mitrofanov <BugMaster@narod.ru>
- Date: Tue, 25 May 2010 19:11:42 +0400
- Subject: [PATCH 07/10] Fix ABR rate control calculations (incorrect use of h->fenc->i_frame instead of h->i_frame)
- ---
- common/common.h | 2 +-
- encoder/encoder.c | 4 ++--
- encoder/ratecontrol.c | 25 ++++++++++++-------------
- 3 files changed, 15 insertions(+), 16 deletions(-)
- diff --git a/common/common.h b/common/common.h
- index c564768..e1f4d0c 100644
- --- a/common/common.h
- +++ b/common/common.h
- @@ -491,7 +491,7 @@ struct x264_t
- /* hrd */
- int initial_cpb_removal_delay;
- int initial_cpb_removal_delay_offset;
- - int64_t first_pts;
- + int64_t i_reordered_pts_delay;
- /* Current MB DCT coeffs */
- struct
- diff --git a/encoder/encoder.c b/encoder/encoder.c
- index de06251..a7ccd3f 100644
- --- a/encoder/encoder.c
- +++ b/encoder/encoder.c
- @@ -2275,8 +2275,8 @@ int x264_encoder_encode( x264_t *h,
- /* ------------------- Get frame to be encoded ------------------------- */
- /* 4: get picture to encode */
- h->fenc = x264_frame_shift( h->frames.current );
- - if( h->i_frame == 0 )
- - h->first_pts = h->fenc->i_reordered_pts;
- + if( h->i_frame == h->i_thread_frames - 1 )
- + h->i_reordered_pts_delay = h->fenc->i_reordered_pts;
- if( h->fenc->param )
- {
- x264_encoder_reconfig( h, h->fenc->param );
- diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
- index efa872c..a725a24 100644
- --- a/encoder/ratecontrol.c
- +++ b/encoder/ratecontrol.c
- @@ -1966,8 +1966,8 @@ static float rate_estimate_qscale( x264_t *h )
- int64_t diff;
- int64_t predicted_bits = total_bits;
- /* Adjust ABR buffer based on distance to the end of the video. */
- - if( rcc->num_entries > h->fenc->i_frame )
- - abr_buffer *= 0.5 * sqrt( rcc->num_entries - h->fenc->i_frame );
- + if( rcc->num_entries > h->i_frame )
- + abr_buffer *= 0.5 * sqrt( rcc->num_entries - h->i_frame );
- if( rcc->b_vbv )
- {
- @@ -1987,8 +1987,8 @@ static float rate_estimate_qscale( x264_t *h )
- }
- else
- {
- - if( h->fenc->i_frame < h->i_thread_frames )
- - predicted_bits += (int64_t)h->fenc->i_frame * rcc->bitrate / rcc->fps;
- + if( h->i_frame < h->i_thread_frames )
- + predicted_bits += (int64_t)h->i_frame * rcc->bitrate / rcc->fps;
- else
- predicted_bits += (int64_t)(h->i_thread_frames - 1) * rcc->bitrate / rcc->fps;
- }
- @@ -1996,12 +1996,12 @@ static float rate_estimate_qscale( x264_t *h )
- diff = predicted_bits - (int64_t)rce.expected_bits;
- q = rce.new_qscale;
- q /= x264_clip3f((double)(abr_buffer - diff) / abr_buffer, .5, 2);
- - if( ((h->fenc->i_frame + 1 - h->i_thread_frames) >= rcc->fps) &&
- + if( ((h->i_frame + 1 - h->i_thread_frames) >= rcc->fps) &&
- (rcc->expected_bits_sum > 0))
- {
- /* Adjust quant based on the difference between
- * achieved and expected bitrate so far */
- - double cur_time = (double)h->fenc->i_frame / rcc->num_entries;
- + double cur_time = (double)h->i_frame / rcc->num_entries;
- double w = x264_clip3f( cur_time*100, 0.0, 1.0 );
- q *= pow( (double)total_bits / rcc->expected_bits_sum, w );
- }
- @@ -2063,11 +2063,6 @@ static float rate_estimate_qscale( x264_t *h )
- }
- else
- {
- - int i_frame_done = h->fenc->i_frame + 1 - h->i_thread_frames;
- - double i_time_done = i_frame_done / rcc->fps;
- - if( h->param.b_vfr_input )
- - i_time_done = ((double)(h->fenc->i_reordered_pts - h->first_pts)) * h->param.i_timebase_num / h->param.i_timebase_den;
- -
- q = get_qscale( h, &rce, rcc->wanted_bits_window / rcc->cplxr_sum, h->fenc->i_frame );
- /* ABR code can potentially be counterproductive in CBR, so just don't bother.
- @@ -2075,10 +2070,14 @@ static float rate_estimate_qscale( x264_t *h )
- if( !rcc->b_vbv_min_rate && rcc->last_satd )
- {
- // FIXME is it simpler to keep track of wanted_bits in ratecontrol_end?
- - wanted_bits = i_time_done * rcc->bitrate;
- + int i_frame_done = h->i_frame + 1 - h->i_thread_frames;
- + double time_done = i_frame_done / rcc->fps;
- + if( h->param.b_vfr_input && i_frame_done > 0 )
- + time_done = ((double)(h->fenc->i_reordered_pts - h->i_reordered_pts_delay)) * h->param.i_timebase_num / h->param.i_timebase_den;
- + wanted_bits = time_done * rcc->bitrate;
- if( wanted_bits > 0 )
- {
- - abr_buffer *= X264_MAX( 1, sqrt(i_time_done) );
- + abr_buffer *= X264_MAX( 1, sqrt( time_done ) );
- overflow = x264_clip3f( 1.0 + (total_bits - wanted_bits) / abr_buffer, .5, 2 );
- q *= overflow;
- }
- --
- 1.7.0.4
- From e45175154b20332a29137c093bdc9866015e10c8 Mon Sep 17 00:00:00 2001
- From: Anton Mitrofanov <BugMaster@narod.ru>
- Date: Tue, 25 May 2010 13:35:45 -0700
- Subject: [PATCH 08/10] Use a thread pool instead of constantly spawning threads
- Small performance increase; may be as high as 1-2% in some cases.
- Probably helps more on OSs where thread-spawning is expensive, like OS X.
- Also gets rid of "thread created" spam when debugging in gdb.
- ---
- common/common.h | 20 ++++--
- encoder/encoder.c | 198 +++++++++++++++++++++++++++++++++++++++------------
- encoder/lookahead.c | 4 +-
- 3 files changed, 168 insertions(+), 54 deletions(-)
- diff --git a/common/common.h b/common/common.h
- index e1f4d0c..98fcab5 100644
- --- a/common/common.h
- +++ b/common/common.h
- @@ -365,12 +365,20 @@ struct x264_t
- /* encoder parameters */
- x264_param_t param;
- - x264_t *thread[X264_THREAD_MAX+1];
- - x264_pthread_t thread_handle;
- - int b_thread_active;
- - int i_thread_phase; /* which thread to use for the next frame */
- - int i_threadslice_start; /* first row in this thread slice */
- - int i_threadslice_end; /* row after the end of this thread slice */
- + x264_t *thread[X264_THREAD_MAX+1]; /* contexts for each frame in progress + lookahead */
- + x264_pthread_t *thread_handle;
- + x264_pthread_cond_t thread_queue_cv;
- + x264_pthread_mutex_t thread_queue_mutex;
- + x264_t **thread_queue; /* frames that have been prepared but not yet claimed by a worker thread */
- + x264_pthread_cond_t thread_active_cv;
- + x264_pthread_mutex_t thread_active_mutex;
- + int thread_active;
- + int b_thread_active;
- + int i_thread_phase; /* which thread to use for the next frame */
- + int thread_exit;
- + int thread_error;
- + int i_threadslice_start; /* first row in this thread slice */
- + int i_threadslice_end; /* row after the end of this thread slice */
- /* bitstream output */
- struct
- diff --git a/encoder/encoder.c b/encoder/encoder.c
- index a7ccd3f..e839370 100644
- --- a/encoder/encoder.c
- +++ b/encoder/encoder.c
- @@ -44,6 +44,53 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
- x264_nal_t **pp_nal, int *pi_nal,
- x264_picture_t *pic_out );
- +/* threading */
- +
- +static void *x264_slices_write_thread( x264_t *h );
- +
- +#ifdef HAVE_PTHREAD
- +static void x264_int_cond_broadcast( x264_pthread_cond_t *cv, x264_pthread_mutex_t *mutex, int *var, int val )
- +{
- + x264_pthread_mutex_lock( mutex );
- + *var = val;
- + x264_pthread_cond_broadcast( cv );
- + x264_pthread_mutex_unlock( mutex );
- +}
- +
- +static void x264_int_cond_wait( x264_pthread_cond_t *cv, x264_pthread_mutex_t *mutex, int *var, int val )
- +{
- + x264_pthread_mutex_lock( mutex );
- + while( *var != val )
- + x264_pthread_cond_wait( cv, mutex );
- + x264_pthread_mutex_unlock( mutex );
- +}
- +
- +#else
- +static void x264_int_cond_broadcast( x264_pthread_cond_t *cv, x264_pthread_mutex_t *mutex, int *var, int val )
- +{}
- +static void x264_int_cond_wait( x264_pthread_cond_t *cv, x264_pthread_mutex_t *mutex, int *var, int val )
- +{}
- +#endif
- +
- +static void x264_thread_pool_push( x264_t *h )
- +{
- + assert( h->thread_active == 0 );
- + h->thread_active = 1;
- + assert( h->b_thread_active == 0 );
- + h->b_thread_active = 1;
- + x264_pthread_mutex_lock( &h->thread[0]->thread_queue_mutex );
- + x264_frame_push( (void*)h->thread_queue, (void*)h );
- + x264_pthread_cond_broadcast( &h->thread[0]->thread_queue_cv );
- + x264_pthread_mutex_unlock( &h->thread[0]->thread_queue_mutex );
- +}
- +
- +static int x264_thread_pool_wait( x264_t *h )
- +{
- + x264_int_cond_wait( &h->thread_active_cv, &h->thread_active_mutex, &h->thread_active, 0 );
- + h->b_thread_active = 0;
- + return h->thread_error;
- +}
- +
- /****************************************************************************
- *
- ******************************* x264 libs **********************************
- @@ -1047,6 +1094,16 @@ x264_t *x264_encoder_open( x264_param_t *param )
- for( int i = 1; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
- CHECKED_MALLOC( h->thread[i], sizeof(x264_t) );
- + if( h->param.i_threads > 1 )
- + {
- + CHECKED_MALLOCZERO( h->thread_handle, (h->param.i_threads + 1) * sizeof(x264_pthread_t) );
- + CHECKED_MALLOCZERO( h->thread_queue, (h->param.i_threads + 1) * sizeof(x264_t*) );
- + if( x264_pthread_cond_init( &h->thread_queue_cv, NULL ) )
- + goto fail;
- + if( x264_pthread_mutex_init( &h->thread_queue_mutex, NULL ) )
- + goto fail;
- + }
- +
- if( x264_lookahead_init( h, i_slicetype_length ) )
- goto fail;
- @@ -1071,6 +1128,20 @@ x264_t *x264_encoder_open( x264_param_t *param )
- CHECKED_MALLOC( h->thread[i]->out.nal, init_nal_count*sizeof(x264_nal_t) );
- h->thread[i]->out.i_nals_allocated = init_nal_count;
- + if( h->param.i_threads > 1 )
- + {
- + if( x264_pthread_cond_init( &h->thread[i]->thread_active_cv, NULL ) )
- + goto fail;
- + if( x264_pthread_mutex_init( &h->thread[i]->thread_active_mutex, NULL ) )
- + goto fail;
- + }
- +
- +#ifdef HAVE_VISUALIZE
- + if( h->param.b_visualize )
- + if( x264_visualize_init( h->thread[i] ) )
- + goto fail;
- +#endif
- +
- if( allocate_threadlocal_data && x264_macroblock_cache_allocate( h->thread[i] ) < 0 )
- goto fail;
- }
- @@ -1111,6 +1182,11 @@ x264_t *x264_encoder_open( x264_param_t *param )
- h->sps->i_profile_idc == PROFILE_HIGH ? "High" :
- "High 4:4:4 Predictive", h->sps->i_level_idc/10, h->sps->i_level_idc%10 );
- + if( h->param.i_threads > 1 )
- + for( int i = 0; i < h->param.i_threads; i++ )
- + if( x264_pthread_create( &h->thread_handle[i], NULL, (void*)x264_slices_write_thread, h ) )
- + return NULL;
- +
- return h;
- fail:
- x264_free( h );
- @@ -2013,24 +2089,10 @@ static void x264_thread_sync_stat( x264_t *dst, x264_t *src )
- memcpy( &dst->stat.i_frame_count, &src->stat.i_frame_count, sizeof(dst->stat) - sizeof(dst->stat.frame) );
- }
- -static void *x264_slices_write( x264_t *h )
- +static int x264_slices_write_internal( x264_t *h )
- {
- int i_slice_num = 0;
- int last_thread_mb = h->sh.i_last_mb;
- - if( h->param.i_sync_lookahead )
- - x264_lower_thread_priority( 10 );
- -
- -#ifdef HAVE_MMX
- - /* Misalign mask has to be set separately for each thread. */
- - if( h->param.cpu&X264_CPU_SSE_MISALIGN )
- - x264_cpu_mask_misalign_sse();
- -#endif
- -
- -#ifdef HAVE_VISUALIZE
- - if( h->param.b_visualize )
- - if( x264_visualize_init( h ) )
- - return (void *)-1;
- -#endif
- /* init stats */
- memset( &h->stat.frame, 0, sizeof(h->stat.frame) );
- @@ -2049,24 +2111,69 @@ static void *x264_slices_write( x264_t *h )
- }
- h->sh.i_last_mb = X264_MIN( h->sh.i_last_mb, last_thread_mb );
- if( x264_stack_align( x264_slice_write, h ) )
- - return (void *)-1;
- + return -1;
- h->sh.i_first_mb = h->sh.i_last_mb + 1;
- }
- #ifdef HAVE_VISUALIZE
- if( h->param.b_visualize )
- - {
- x264_visualize_show( h );
- - x264_visualize_close( h );
- - }
- #endif
- + return 0;
- +}
- +
- +static int x264_slices_write( x264_t *h )
- +{
- +#ifdef HAVE_MMX
- + /* Misalign mask has to be set separately for each thread. */
- + if( h->param.cpu&X264_CPU_SSE_MISALIGN )
- + x264_cpu_mask_misalign_sse();
- +#endif
- +
- + if( x264_slices_write_internal( h ) )
- + return -1;
- +
- + return 0;
- +}
- +
- +static void *x264_slices_write_thread( x264_t *h )
- +{
- + if( h->param.i_sync_lookahead )
- + x264_lower_thread_priority( 10 );
- +
- +#ifdef HAVE_MMX
- + /* Misalign mask has to be set separately for each thread. */
- + if( h->param.cpu&X264_CPU_SSE_MISALIGN )
- + x264_cpu_mask_misalign_sse();
- +#endif
- +
- + for(;;)
- + {
- + int b_exit;
- + x264_t *t = NULL;
- +
- + // get one frame from the queue
- + x264_pthread_mutex_lock( &h->thread_queue_mutex );
- + while( !h->thread_queue[0] && !h->thread_exit )
- + x264_pthread_cond_wait( &h->thread_queue_cv, &h->thread_queue_mutex );
- + b_exit = h->thread_exit;
- + if( !b_exit )
- + t = (void*)x264_frame_shift( (void*)h->thread_queue );
- + x264_pthread_mutex_unlock( &h->thread_queue_mutex );
- + if( b_exit )
- + break;
- +
- + t->thread_error = x264_slices_write_internal( t );
- +
- + x264_int_cond_broadcast( &t->thread_active_cv, &t->thread_active_mutex, &t->thread_active, 0 );
- + }
- +
- return (void *)0;
- }
- static int x264_threaded_slices_write( x264_t *h )
- {
- - void *ret = NULL;
- #ifdef HAVE_MMX
- if( h->param.cpu&X264_CPU_SSE_MISALIGN )
- x264_cpu_mask_misalign_sse();
- @@ -2093,18 +2200,10 @@ static int x264_threaded_slices_write( x264_t *h )
- /* dispatch */
- for( int i = 0; i < h->param.i_threads; i++ )
- - {
- - if( x264_pthread_create( &h->thread[i]->thread_handle, NULL, (void*)x264_slices_write, (void*)h->thread[i] ) )
- - return -1;
- - h->thread[i]->b_thread_active = 1;
- - }
- + x264_thread_pool_push( h->thread[i] );
- for( int i = 0; i < h->param.i_threads; i++ )
- - {
- - x264_pthread_join( h->thread[i]->thread_handle, &ret );
- - h->thread[i]->b_thread_active = 0;
- - if( (intptr_t)ret )
- - return (intptr_t)ret;
- - }
- + if( x264_thread_pool_wait( h->thread[i] ) )
- + return -1;
- /* Go back and fix up the hpel on the borders between slices. */
- for( int i = 1; i < h->param.i_threads; i++ )
- @@ -2502,18 +2601,14 @@ int x264_encoder_encode( x264_t *h,
- h->i_threadslice_start = 0;
- h->i_threadslice_end = h->sps->i_mb_height;
- if( h->i_thread_frames > 1 )
- - {
- - if( x264_pthread_create( &h->thread_handle, NULL, (void*)x264_slices_write, h ) )
- - return -1;
- - h->b_thread_active = 1;
- - }
- + x264_thread_pool_push( h );
- else if( h->param.b_sliced_threads )
- {
- if( x264_threaded_slices_write( h ) )
- return -1;
- }
- else
- - if( (intptr_t)x264_slices_write( h ) )
- + if( x264_slices_write( h ) )
- return -1;
- return x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out );
- @@ -2526,13 +2621,8 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
- char psz_message[80];
- if( h->b_thread_active )
- - {
- - void *ret = NULL;
- - x264_pthread_join( h->thread_handle, &ret );
- - h->b_thread_active = 0;
- - if( (intptr_t)ret )
- - return (intptr_t)ret;
- - }
- + if( x264_thread_pool_wait( h ) )
- + return -1;
- if( !h->out.i_nal )
- {
- pic_out->i_type = X264_TYPE_AUTO;
- @@ -2798,9 +2888,21 @@ void x264_encoder_close ( x264_t *h )
- if( h->param.i_threads > 1 )
- {
- // don't strictly have to wait for the other threads, but it's simpler than canceling them
- + x264_pthread_mutex_lock( &h->thread_queue_mutex );
- + h->thread_exit = 1;
- + x264_pthread_cond_broadcast( &h->thread_queue_cv );
- + x264_pthread_mutex_unlock( &h->thread_queue_mutex );
- for( int i = 0; i < h->param.i_threads; i++ )
- - if( h->thread[i]->b_thread_active )
- - x264_pthread_join( h->thread[i]->thread_handle, NULL );
- + x264_pthread_join( h->thread_handle[i], NULL );
- + for( int i = 0; i < h->param.i_threads; i++ )
- + {
- + x264_pthread_cond_destroy( &h->thread[i]->thread_active_cv );
- + x264_pthread_mutex_destroy( &h->thread[i]->thread_active_mutex );
- + }
- + x264_pthread_cond_destroy( &h->thread_queue_cv );
- + x264_pthread_mutex_destroy( &h->thread_queue_mutex );
- + x264_free( h->thread_handle );
- + x264_free( h->thread_queue );
- if( h->i_thread_frames > 1 )
- {
- for( int i = 0; i < h->i_thread_frames; i++ )
- @@ -3114,6 +3216,10 @@ void x264_encoder_close ( x264_t *h )
- x264_macroblock_cache_free( h->thread[i] );
- }
- x264_macroblock_thread_free( h->thread[i], 0 );
- +#ifdef HAVE_VISUALIZE
- + if( h->param.b_visualize )
- + x264_visualize_close( h->thread[i] );
- +#endif
- x264_free( h->thread[i]->out.p_bitstream );
- x264_free( h->thread[i]->out.nal);
- x264_free( h->thread[i] );
- diff --git a/encoder/lookahead.c b/encoder/lookahead.c
- index 942e952..1b56c16 100644
- --- a/encoder/lookahead.c
- +++ b/encoder/lookahead.c
- @@ -153,7 +153,7 @@ int x264_lookahead_init( x264_t *h, int i_slicetype_length )
- if( x264_macroblock_thread_allocate( look_h, 1 ) < 0 )
- goto fail;
- - if( x264_pthread_create( &look_h->thread_handle, NULL, (void *)x264_lookahead_thread, look_h ) )
- + if( x264_pthread_create( &h->thread_handle[h->param.i_threads], NULL, (void *)x264_lookahead_thread, look_h ) )
- goto fail;
- look->b_thread_active = 1;
- @@ -171,7 +171,7 @@ void x264_lookahead_delete( x264_t *h )
- h->lookahead->b_exit_thread = 1;
- x264_pthread_cond_broadcast( &h->lookahead->ifbuf.cv_fill );
- x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
- - x264_pthread_join( h->thread[h->param.i_threads]->thread_handle, NULL );
- + x264_pthread_join( h->thread_handle[h->param.i_threads], NULL );
- x264_macroblock_cache_free( h->thread[h->param.i_threads] );
- x264_macroblock_thread_free( h->thread[h->param.i_threads], 1 );
- x264_free( h->thread[h->param.i_threads] );
- --
- 1.7.0.4
- From 9572e5b2f839316f69c295d86dd4891f64308d4d Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Tue, 25 May 2010 12:42:44 -0700
- Subject: [PATCH 09/10] Overhaul deblocking again
- Move deblock strength calculation to immediately after encoding to take advantage of the data that's already in cache.
- Keep the deblocking itself as per-row.
- ---
- common/common.h | 3 +
- common/deblock.c | 44 +++---------
- common/frame.h | 2 +-
- common/macroblock.c | 172 ++++++++++++++++++++++++++++------------------
- common/macroblock.h | 4 +-
- common/x86/deblock-a.asm | 3 +-
- encoder/encoder.c | 17 +++++
- encoder/macroblock.c | 8 ++-
- tools/checkasm.c | 4 +-
- 9 files changed, 147 insertions(+), 110 deletions(-)
- diff --git a/common/common.h b/common/common.h
- index 98fcab5..d88d695 100644
- --- a/common/common.h
- +++ b/common/common.h
- @@ -588,6 +588,8 @@ struct x264_t
- int8_t *mb_transform_size; /* transform_size_8x8_flag of each mb */
- uint16_t *slice_table; /* sh->first_mb of the slice that the indexed mb is part of
- * NOTE: this will fail on resolutions above 2^16 MBs... */
- + int8_t deblock_ref_table[32+2];
- + #define deblock_ref_table(x) h->mb.deblock_ref_table[x+2]
- /* buffer for weighted versions of the reference frames */
- uint8_t *p_weight_buf[16];
- @@ -787,6 +789,7 @@ struct x264_t
- /* Buffers that are allocated per-thread even in sliced threads. */
- void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
- uint8_t *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
- + uint8_t (*deblock_strength[2])[2][4][4];
- /* CPU functions dependents */
- x264_predict_t predict_16x16[4+3];
- diff --git a/common/deblock.c b/common/deblock.c
- index 9450a8b..af59b18 100644
- --- a/common/deblock.c
- +++ b/common/deblock.c
- @@ -274,13 +274,15 @@ static void deblock_h_chroma_intra_c( uint8_t *pix, int stride, int alpha, int b
- deblock_chroma_intra_c( pix, 1, stride, alpha, beta );
- }
- -static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4], int mvy_limit, int bframe, int step, int first_edge_only )
- +static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
- + int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4], int mvy_limit,
- + int bframe )
- {
- for( int dir = 0; dir < 2; dir++ )
- {
- int s1 = dir ? 1 : 8;
- int s2 = dir ? 8 : 1;
- - for( int edge = 0; edge < (first_edge_only ? 1 : 4); edge += step )
- + for( int edge = 0; edge < 4; edge++ )
- for( int i = 0, loc = X264_SCAN8_0+edge*s2; i < 4; i++, loc += s1 )
- {
- int locn = loc - s2;
- @@ -337,46 +339,25 @@ static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2,
- void x264_frame_deblock_row( x264_t *h, int mb_y )
- {
- int b_interlaced = h->sh.b_mbaff;
- - int mvy_limit = 4 >> b_interlaced;
- int qp_thresh = 15 - X264_MIN(h->sh.i_alpha_c0_offset, h->sh.i_beta_offset) - X264_MAX(0, h->param.analyse.i_chroma_qp_offset);
- int stridey = h->fdec->i_stride[0];
- int stride2y = stridey << b_interlaced;
- int strideuv = h->fdec->i_stride[1];
- int stride2uv = strideuv << b_interlaced;
- - int deblock_ref_table[2][32+2];
- uint8_t (*nnz_backup)[16] = h->scratch_buffer;
- - for( int l = 0; l < 2; l++ )
- - {
- - int refs = (l ? h->i_ref1 : h->i_ref0) << h->sh.b_mbaff;
- - x264_frame_t **fref = l ? h->fref1 : h->fref0;
- - deblock_ref_table(l,-2) = -2;
- - deblock_ref_table(l,-1) = -1;
- - for( int i = 0; i < refs; i++ )
- - {
- - /* Mask off high bits to avoid frame num collisions with -1/-2.
- - * frame num values don't actually have to be correct, just unique.
- - * frame num values can't cover a range of more than 32. */
- - if( !h->mb.b_interlaced )
- - deblock_ref_table(l,i) = fref[i]->i_frame_num&63;
- - else
- - deblock_ref_table(l,i) = ((fref[i>>1]->i_frame_num&63)<<1) + (i&1);
- - }
- - }
- -
- if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
- munge_cavlc_nnz( h, mb_y, nnz_backup, munge_cavlc_nnz_row );
- for( int mb_x = 0; mb_x < h->sps->i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
- {
- - ALIGNED_ARRAY_16( uint8_t, bs, [2][4][4] );
- -
- x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
- - x264_macroblock_cache_load_deblock( h, mb_x, mb_y, deblock_ref_table );
- + x264_macroblock_cache_load_neighbours_deblock( h, mb_x, mb_y );
- int mb_xy = h->mb.i_mb_xy;
- - int transform_8x8 = h->mb.mb_transform_size[mb_xy];
- + int transform_8x8 = h->mb.mb_transform_size[h->mb.i_mb_xy];
- int intra_cur = IS_INTRA( h->mb.type[mb_xy] );
- + uint8_t (*bs)[4][4] = h->deblock_strength[mb_y&1&b_interlaced][mb_x];
- uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
- uint8_t *pixu = h->fdec->plane[1] + 8*mb_y*strideuv + 8*mb_x;
- @@ -404,11 +385,6 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
- h->loopf.deblock_chroma##intra[dir] );\
- } while(0)
- - if( intra_cur )
- - memset( bs, 3, sizeof(bs) );
- - else
- - h->loopf.deblock_strength( h->mb.cache.non_zero_count, h->mb.cache.ref, h->mb.cache.mv, bs, mvy_limit, h->sh.i_type == SLICE_TYPE_B, transform_8x8 + 1, first_edge_only );
- -
- if( h->mb.i_neighbour & MB_LEFT )
- {
- int qpl = h->mb.qp[h->mb.i_mb_left_xy];
- @@ -468,13 +444,13 @@ void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int be
- void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
- void x264_deblock_strength_mmxext( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
- int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4],
- - int mvy_limit, int bframe, int step, int first_edge_only );
- + int mvy_limit, int bframe );
- void x264_deblock_strength_sse2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
- int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4],
- - int mvy_limit, int bframe, int step, int first_edge_only );
- + int mvy_limit, int bframe );
- void x264_deblock_strength_ssse3 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
- int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4],
- - int mvy_limit, int bframe, int step, int first_edge_only );
- + int mvy_limit, int bframe );
- #ifdef ARCH_X86
- void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
- void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
- diff --git a/common/frame.h b/common/frame.h
- index adc707c..91d27b5 100644
- --- a/common/frame.h
- +++ b/common/frame.h
- @@ -166,7 +166,7 @@ typedef struct
- x264_deblock_intra_t deblock_chroma_intra[2];
- void (*deblock_strength) ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
- int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4], int mvy_limit,
- - int bframe, int step, int first_edge_only );
- + int bframe );
- } x264_deblock_function_t;
- x264_frame_t *x264_frame_new( x264_t *h, int b_fdec );
- diff --git a/common/macroblock.c b/common/macroblock.c
- index 1c0ff9b..fbd0307 100644
- --- a/common/macroblock.c
- +++ b/common/macroblock.c
- @@ -325,12 +325,15 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
- {
- if( !b_lookahead )
- for( int i = 0; i <= h->param.b_interlaced; i++ )
- + {
- for( int j = 0; j < 3; j++ )
- {
- /* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */
- CHECKED_MALLOCZERO( h->intra_border_backup[i][j], (h->sps->i_mb_width*16+32)>>!!j );
- h->intra_border_backup[i][j] += 8;
- }
- + CHECKED_MALLOC( h->deblock_strength[i], sizeof(**h->deblock_strength) * h->sps->i_mb_width );
- + }
- /* Allocate scratch buffer */
- int scratch_size = 0;
- @@ -357,8 +360,11 @@ void x264_macroblock_thread_free( x264_t *h, int b_lookahead )
- {
- if( !b_lookahead )
- for( int i = 0; i <= h->param.b_interlaced; i++ )
- + {
- + x264_free( h->deblock_strength[i] );
- for( int j = 0; j < 3; j++ )
- x264_free( h->intra_border_backup[i][j] - 8 );
- + }
- x264_free( h->scratch_buffer );
- }
- @@ -413,6 +419,19 @@ void x264_macroblock_slice_init( x264_t *h )
- h->fdec->inv_ref_poc[field] = (256 + delta/2) / delta;
- }
- + deblock_ref_table(-2) = -2;
- + deblock_ref_table(-1) = -1;
- + for( int i = 0; i < h->i_ref0 << h->sh.b_mbaff; i++ )
- + {
- + /* Mask off high bits to avoid frame num collisions with -1/-2.
- + * frame num values don't actually have to be correct, just unique.
- + * frame num values can't cover a range of more than 32, so the low 6 bits are enough to keep them unique. */
- + if( !h->mb.b_interlaced )
- + deblock_ref_table(i) = h->fref0[i]->i_frame_num&63;
- + else
- + deblock_ref_table(i) = ((h->fref0[i>>1]->i_frame_num&63)<<1) + (i&1);
- + }
- +
- h->mb.i_neighbour4[6] =
- h->mb.i_neighbour4[9] =
- h->mb.i_neighbour4[12] =
- @@ -873,15 +892,13 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
- | ((h->mb.i_neighbour_intra & MB_TOP) ? MB_TOP|MB_TOPLEFT : 0);
- }
- -static void inline x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_y )
- +void x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_y )
- {
- - int top = (mb_y - (1 << h->mb.b_interlaced)) * h->mb.i_mb_stride + mb_x;
- int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2;
- + int top = (mb_y - (1 << h->mb.b_interlaced)) * h->mb.i_mb_stride + mb_x;
- h->mb.i_neighbour = 0;
- h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
- - h->mb.i_b8_xy = 2*(mb_y * h->mb.i_b8_stride + mb_x);
- - h->mb.i_b4_xy = 4*(mb_y * h->mb.i_b4_stride + mb_x);
- if( mb_x > 0 )
- {
- @@ -898,86 +915,105 @@ static void inline x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int
- }
- }
- -void x264_macroblock_cache_load_deblock( x264_t *h, int mb_x, int mb_y, int deblock_ref_table[2][34] )
- +void x264_macroblock_cache_load_deblock( x264_t *h )
- {
- - x264_macroblock_cache_load_neighbours_deblock( h, mb_x, mb_y );
- + int mb_x = h->mb.i_mb_x;
- + int mb_y = h->mb.i_mb_y;
- + int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2;
- if( IS_INTRA( h->mb.type[h->mb.i_mb_xy] ) )
- return;
- - int cur = h->mb.i_mb_xy;
- - int left = h->mb.i_mb_left_xy;
- - int top = h->mb.i_mb_top_xy;
- - int top_y = mb_y - (1 << h->mb.b_interlaced);
- - int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
- - int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;
- - int s8x8 = h->mb.i_b8_stride;
- - int s4x4 = h->mb.i_b4_stride;
- + /* If we have multiple slices and we're deblocking on slice edges, we
- + * have to reload neighbour data. */
- + if( h->sh.i_first_mb && deblock_on_slice_edges )
- + {
- + int old_neighbour = h->mb.i_neighbour;
- + x264_macroblock_cache_load_neighbours_deblock( h, mb_x, mb_y );
- + h->mb.i_neighbour &= ~old_neighbour;
- + if( h->mb.i_neighbour )
- + {
- + int left = h->mb.i_mb_left_xy;
- + int top = h->mb.i_mb_top_xy;
- + int top_y = mb_y - (1 << h->mb.b_interlaced);
- + int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
- + int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;
- + int s8x8 = h->mb.i_b8_stride;
- + int s4x4 = h->mb.i_b4_stride;
- - uint8_t (*nnz)[24] = h->mb.non_zero_count;
- + uint8_t (*nnz)[24] = h->mb.non_zero_count;
- - if( h->mb.i_neighbour & MB_TOP )
- - CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[top][12] );
- + if( h->mb.i_neighbour & MB_TOP )
- + CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[top][12] );
- - if( h->mb.i_neighbour & MB_LEFT )
- - {
- - h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left][3];
- - h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left][7];
- - h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left][11];
- - h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left][15];
- - }
- + if( h->mb.i_neighbour & MB_LEFT )
- + {
- + h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left][3];
- + h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left][7];
- + h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left][11];
- + h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left][15];
- + }
- - CP32( &h->mb.cache.non_zero_count[x264_scan8[0]+0*8], &nnz[cur][0*4] );
- - CP32( &h->mb.cache.non_zero_count[x264_scan8[0]+1*8], &nnz[cur][1*4] );
- - CP32( &h->mb.cache.non_zero_count[x264_scan8[0]+2*8], &nnz[cur][2*4] );
- - CP32( &h->mb.cache.non_zero_count[x264_scan8[0]+3*8], &nnz[cur][3*4] );
- + for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
- + {
- + int16_t (*mv)[2] = h->mb.mv[l];
- + int8_t *ref = h->mb.ref[l];
- - for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
- - {
- - int16_t (*mv)[2] = h->mb.mv[l];
- - int8_t *ref = h->mb.ref[l];
- + int i8 = x264_scan8[0] - 8;
- + if( h->mb.i_neighbour & MB_TOP )
- + {
- + h->mb.cache.ref[l][i8+0] =
- + h->mb.cache.ref[l][i8+1] = ref[top_8x8 + 0];
- + h->mb.cache.ref[l][i8+2] =
- + h->mb.cache.ref[l][i8+3] = ref[top_8x8 + 1];
- + CP128( h->mb.cache.mv[l][i8], mv[top_4x4] );
- + }
- - int i8 = x264_scan8[0] - 8;
- - if( h->mb.i_neighbour & MB_TOP )
- - {
- - h->mb.cache.ref[l][i8+0] =
- - h->mb.cache.ref[l][i8+1] = deblock_ref_table(l,ref[top_8x8 + 0]);
- - h->mb.cache.ref[l][i8+2] =
- - h->mb.cache.ref[l][i8+3] = deblock_ref_table(l,ref[top_8x8 + 1]);
- - CP128( h->mb.cache.mv[l][i8], mv[top_4x4] );
- + i8 = x264_scan8[0] - 1;
- + if( h->mb.i_neighbour & MB_LEFT )
- + {
- + int ir = h->mb.i_b8_xy - 1;
- + int iv = h->mb.i_b4_xy - 1;
- + h->mb.cache.ref[l][i8+0*8] =
- + h->mb.cache.ref[l][i8+1*8] = ref[ir + 0*s8x8];
- + h->mb.cache.ref[l][i8+2*8] =
- + h->mb.cache.ref[l][i8+3*8] = ref[ir + 1*s8x8];
- +
- + CP32( h->mb.cache.mv[l][i8+0*8], mv[iv + 0*s4x4] );
- + CP32( h->mb.cache.mv[l][i8+1*8], mv[iv + 1*s4x4] );
- + CP32( h->mb.cache.mv[l][i8+2*8], mv[iv + 2*s4x4] );
- + CP32( h->mb.cache.mv[l][i8+3*8], mv[iv + 3*s4x4] );
- + }
- + }
- }
- + }
- - i8 = x264_scan8[0] - 1;
- - if( h->mb.i_neighbour & MB_LEFT )
- - {
- - int ir = h->mb.i_b8_xy - 1;
- - int iv = h->mb.i_b4_xy - 1;
- - h->mb.cache.ref[l][i8+0*8] =
- - h->mb.cache.ref[l][i8+1*8] = deblock_ref_table(l,ref[ir + 0*s8x8]);
- - h->mb.cache.ref[l][i8+2*8] =
- - h->mb.cache.ref[l][i8+3*8] = deblock_ref_table(l,ref[ir + 1*s8x8]);
- -
- - CP32( h->mb.cache.mv[l][i8+0*8], mv[iv + 0*s4x4] );
- - CP32( h->mb.cache.mv[l][i8+1*8], mv[iv + 1*s4x4] );
- - CP32( h->mb.cache.mv[l][i8+2*8], mv[iv + 2*s4x4] );
- - CP32( h->mb.cache.mv[l][i8+3*8], mv[iv + 3*s4x4] );
- - }
- + if( h->param.analyse.i_weighted_pred && h->sh.i_type == SLICE_TYPE_P )
- + {
- + /* Handle reference frame duplicates */
- + int i8 = x264_scan8[0] - 8;
- + h->mb.cache.ref[0][i8+0] =
- + h->mb.cache.ref[0][i8+1] = deblock_ref_table(h->mb.cache.ref[0][i8+0]);
- + h->mb.cache.ref[0][i8+2] =
- + h->mb.cache.ref[0][i8+3] = deblock_ref_table(h->mb.cache.ref[0][i8+2]);
- - int ref0 = deblock_ref_table(l,ref[h->mb.i_b8_xy+0+0*s8x8]);
- - int ref1 = deblock_ref_table(l,ref[h->mb.i_b8_xy+1+0*s8x8]);
- - int ref2 = deblock_ref_table(l,ref[h->mb.i_b8_xy+0+1*s8x8]);
- - int ref3 = deblock_ref_table(l,ref[h->mb.i_b8_xy+1+1*s8x8]);
- + i8 = x264_scan8[0] - 1;
- + h->mb.cache.ref[0][i8+0*8] =
- + h->mb.cache.ref[0][i8+1*8] = deblock_ref_table(h->mb.cache.ref[0][i8+0*8]);
- + h->mb.cache.ref[0][i8+2*8] =
- + h->mb.cache.ref[0][i8+3*8] = deblock_ref_table(h->mb.cache.ref[0][i8+2*8]);
- +
- + int ref0 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[ 0]]);
- + int ref1 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[ 4]]);
- + int ref2 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[ 8]]);
- + int ref3 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[12]]);
- uint32_t reftop = pack16to32( (uint8_t)ref0, (uint8_t)ref1 ) * 0x0101;
- uint32_t refbot = pack16to32( (uint8_t)ref2, (uint8_t)ref3 ) * 0x0101;
- - M32( &h->mb.cache.ref[l][x264_scan8[0]+8*0] ) = reftop;
- - M32( &h->mb.cache.ref[l][x264_scan8[0]+8*1] ) = reftop;
- - M32( &h->mb.cache.ref[l][x264_scan8[0]+8*2] ) = refbot;
- - M32( &h->mb.cache.ref[l][x264_scan8[0]+8*3] ) = refbot;
- - CP128( h->mb.cache.mv[l][x264_scan8[0]+8*0], mv[h->mb.i_b4_xy+0*s4x4] );
- - CP128( h->mb.cache.mv[l][x264_scan8[0]+8*1], mv[h->mb.i_b4_xy+1*s4x4] );
- - CP128( h->mb.cache.mv[l][x264_scan8[0]+8*2], mv[h->mb.i_b4_xy+2*s4x4] );
- - CP128( h->mb.cache.mv[l][x264_scan8[0]+8*3], mv[h->mb.i_b4_xy+3*s4x4] );
- + M32( &h->mb.cache.ref[0][x264_scan8[0]+8*0] ) = reftop;
- + M32( &h->mb.cache.ref[0][x264_scan8[0]+8*1] ) = reftop;
- + M32( &h->mb.cache.ref[0][x264_scan8[0]+8*2] ) = refbot;
- + M32( &h->mb.cache.ref[0][x264_scan8[0]+8*3] ) = refbot;
- }
- }
- @@ -1041,6 +1077,8 @@ void x264_macroblock_cache_save( x264_t *h )
- h->mb.cbp[i_mb_xy] = 0x72f; /* all set */
- h->mb.b_transform_8x8 = 0;
- memset( nnz, 16, sizeof( *h->mb.non_zero_count ) );
- + for( int i = 0; i < 24; i++ )
- + h->mb.cache.non_zero_count[x264_scan8[i]] = 16;
- }
- else
- {
- diff --git a/common/macroblock.h b/common/macroblock.h
- index 5fbbd16..8dc65b8 100644
- --- a/common/macroblock.h
- +++ b/common/macroblock.h
- @@ -271,8 +271,8 @@ void x264_macroblock_thread_free( x264_t *h, int b_lookahead );
- void x264_macroblock_slice_init( x264_t *h );
- void x264_macroblock_thread_init( x264_t *h );
- void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y );
- -void x264_macroblock_cache_load_deblock( x264_t *h, int mb_x, int mb_y, int deblock_ref_table[2][34] );
- -#define deblock_ref_table(l,x) deblock_ref_table[l][x+2]
- +void x264_macroblock_cache_load_deblock( x264_t *h );
- +void x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_y );
- void x264_macroblock_cache_save( x264_t *h );
- void x264_macroblock_bipred_init( x264_t *h );
- diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
- index f2f3e58..aedd688 100644
- --- a/common/x86/deblock-a.asm
- +++ b/common/x86/deblock-a.asm
- @@ -889,8 +889,7 @@ chroma_intra_body_mmxext:
- ;-----------------------------------------------------------------------------
- ; static void deblock_strength( uint8_t nnz[48], int8_t ref[2][40], int16_t mv[2][40][2],
- -; uint8_t bs[2][4][4], int mvy_limit, int bframe, int step,
- -; int first_edge_only )
- +; uint8_t bs[2][4][4], int mvy_limit, int bframe )
- ;-----------------------------------------------------------------------------
- %define scan8start (4+1*8)
- diff --git a/encoder/encoder.c b/encoder/encoder.c
- index e839370..7872013 100644
- --- a/encoder/encoder.c
- +++ b/encoder/encoder.c
- @@ -1828,6 +1828,9 @@ static int x264_slice_write( x264_t *h )
- int overhead_guess = (NALU_OVERHEAD - (h->param.b_annexb && h->out.i_nal)) + 3;
- int slice_max_size = h->param.i_slice_max_size > 0 ? (h->param.i_slice_max_size-overhead_guess)*8 : INT_MAX;
- int starting_bits = bs_pos(&h->out.bs);
- + int b_deblock = h->sh.i_disable_deblocking_filter_idc != 1;
- + int b_hpel = h->fdec->b_kept_as_ref;
- + b_deblock &= b_hpel || h->param.psz_dump_yuv;
- bs_realign( &h->out.bs );
- /* Slice */
- @@ -1966,6 +1969,20 @@ static int x264_slice_write( x264_t *h )
- /* save cache */
- x264_macroblock_cache_save( h );
- + /* calculate deblock strength values (actual deblocking is done per-row along with hpel) */
- + if( b_deblock )
- + {
- + int mvy_limit = 4 >> h->sh.b_mbaff;
- + int type = h->mb.type[h->mb.i_mb_xy];
- + uint8_t (*bs)[4][4] = h->deblock_strength[h->mb.i_mb_y&1&h->sh.b_mbaff][h->mb.i_mb_x];
- + x264_macroblock_cache_load_deblock( h );
- + if( IS_INTRA( type ) )
- + memset( bs, 3, sizeof(uint8_t)*2*4*4 );
- + else
- + h->loopf.deblock_strength( h->mb.cache.non_zero_count, h->mb.cache.ref, h->mb.cache.mv,
- + bs, mvy_limit, h->sh.i_type == SLICE_TYPE_B );
- + }
- +
- /* accumulate mb stats */
- h->stat.frame.i_mb_count[h->mb.i_type]++;
- diff --git a/encoder/macroblock.c b/encoder/macroblock.c
- index 199bb68..b7e5f34 100644
- --- a/encoder/macroblock.c
- +++ b/encoder/macroblock.c
- @@ -459,8 +459,12 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
- static void x264_macroblock_encode_skip( x264_t *h )
- {
- - for( int i = 0; i < sizeof( h->mb.cache.non_zero_count ); i += 16 )
- - M128( &h->mb.cache.non_zero_count[i] ) = M128_ZERO;
- + M32( &h->mb.cache.non_zero_count[x264_scan8[0]+0*8] ) = 0;
- + M32( &h->mb.cache.non_zero_count[x264_scan8[0]+1*8] ) = 0;
- + M32( &h->mb.cache.non_zero_count[x264_scan8[0]+2*8] ) = 0;
- + M32( &h->mb.cache.non_zero_count[x264_scan8[0]+3*8] ) = 0;
- + for( int i = 16; i < 24; i++ )
- + h->mb.cache.non_zero_count[x264_scan8[i]] = 0;
- h->mb.i_cbp_luma = 0;
- h->mb.i_cbp_chroma = 0;
- h->mb.cbp[h->mb.i_mb_xy] = 0;
- diff --git a/tools/checkasm.c b/tools/checkasm.c
- index 5dd360a..6469017 100644
- --- a/tools/checkasm.c
- +++ b/tools/checkasm.c
- @@ -1164,8 +1164,8 @@ static int check_deblock( int cpu_ref, int cpu_new )
- mv[j][k][l] = ((rand()&7) != 7) ? (rand()&7) - 3 : (rand()&1023) - 512;
- }
- set_func_name( "deblock_strength" );
- - call_c( db_c.deblock_strength, nnz, ref, mv, bs[0], 2<<(i&1), ((i>>1)&1), 1, 0 );
- - call_a( db_a.deblock_strength, nnz, ref, mv, bs[1], 2<<(i&1), ((i>>1)&1), 1, 0 );
- + call_c( db_c.deblock_strength, nnz, ref, mv, bs[0], 2<<(i&1), ((i>>1)&1) );
- + call_a( db_a.deblock_strength, nnz, ref, mv, bs[1], 2<<(i&1), ((i>>1)&1) );
- if( memcmp( bs[0], bs[1], sizeof(bs[0]) ) )
- {
- ok = 0;
- --
- 1.7.0.4
- From bd8642cef14e8dc13a0c87526b8f43e4436ab3a1 Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Tue, 25 May 2010 16:13:59 -0700
- Subject: [PATCH 10/10] Detect Atom CPU, enable appropriate asm functions
- I'm not going to actually optimize for this pile of garbage unless someone pays me.
- But it can't hurt to at least enable the correct functions based on benchmarks.
- Also save some cache on Intel CPUs that don't need the decimate LUT due to having fast bsr/bsf.
- ---
- common/cpu.c | 16 ++++++++++++----
- common/dct.c | 2 +-
- common/pixel.c | 17 ++++++++++-------
- common/quant.c | 15 +++++++++++++++
- common/x86/mc-c.c | 9 ++++++---
- common/x86/quant-a.asm | 32 ++++++++++++++++++++++++--------
- common/x86/quant.h | 6 ++++++
- encoder/macroblock.c | 5 +----
- tools/checkasm.c | 14 +++++++++++++-
- x264.h | 2 ++
- 10 files changed, 90 insertions(+), 28 deletions(-)
- diff --git a/common/cpu.c b/common/cpu.c
- index 933a754..10ac303 100644
- --- a/common/cpu.c
- +++ b/common/cpu.c
- @@ -64,6 +64,8 @@ const x264_cpu_name_t x264_cpu_names[] = {
- {"ARMv6", X264_CPU_ARMV6},
- {"NEON", X264_CPU_NEON},
- {"Fast_NEON_MRC", X264_CPU_FAST_NEON_MRC},
- + {"SlowCTZ", X264_CPU_SLOW_CTZ},
- + {"SlowAtom", X264_CPU_SLOW_ATOM},
- {"", 0},
- };
- @@ -135,6 +137,7 @@ uint32_t x264_cpu_detect( void )
- if( !strcmp((char*)vendor, "AuthenticAMD") && max_extended_cap >= 0x80000001 )
- {
- + cpu |= X264_CPU_SLOW_CTZ;
- x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
- if( edx&0x00400000 )
- cpu |= X264_CPU_MMXEXT;
- @@ -145,6 +148,7 @@ uint32_t x264_cpu_detect( void )
- cpu |= X264_CPU_SSE2_IS_FAST;
- cpu |= X264_CPU_LZCNT;
- cpu |= X264_CPU_SHUFFLE_IS_FAST;
- + cpu &= ~X264_CPU_SLOW_CTZ;
- }
- else
- cpu |= X264_CPU_SSE2_IS_SLOW;
- @@ -159,11 +163,9 @@ uint32_t x264_cpu_detect( void )
- if( !strcmp((char*)vendor, "GenuineIntel") )
- {
- - int family, model, stepping;
- x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
- - family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
- - model = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
- - stepping = eax&0xf;
- + int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
- + int model = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
- /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
- * theoretically support sse2, but it's significantly slower than mmx for
- * almost all of x264's functions, so let's just pretend they don't. */
- @@ -172,6 +174,12 @@ uint32_t x264_cpu_detect( void )
- cpu &= ~(X264_CPU_SSE2|X264_CPU_SSE3);
- assert(!(cpu&(X264_CPU_SSSE3|X264_CPU_SSE4)));
- }
- + /* Detect Atom CPU */
- + if( family == 6 && model == 28 )
- + {
- + cpu |= X264_CPU_SLOW_ATOM;
- + cpu |= X264_CPU_SLOW_CTZ;
- + }
- }
- if( (!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu&X264_CPU_SSE42))
- diff --git a/common/dct.c b/common/dct.c
- index 3917510..10fe2f7 100644
- --- a/common/dct.c
- +++ b/common/dct.c
- @@ -457,7 +457,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
- dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
- }
- - if( cpu&X264_CPU_SSSE3 )
- + if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SLOW_ATOM) )
- {
- dctf->sub4x4_dct = x264_sub4x4_dct_ssse3;
- dctf->sub8x8_dct = x264_sub8x8_dct_ssse3;
- diff --git a/common/pixel.c b/common/pixel.c
- index 20c5170..5759abf 100644
- --- a/common/pixel.c
- +++ b/common/pixel.c
- @@ -768,17 +768,20 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
- if( cpu&X264_CPU_SSSE3 )
- {
- - INIT7( ssd, _ssse3 );
- - INIT7( satd, _ssse3 );
- - INIT7( satd_x3, _ssse3 );
- - INIT7( satd_x4, _ssse3 );
- if( !(cpu&X264_CPU_STACK_MOD4) )
- {
- INIT4( hadamard_ac, _ssse3 );
- }
- INIT_ADS( _ssse3 );
- - pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
- - pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
- + if( !(cpu&X264_CPU_SLOW_ATOM) )
- + {
- + INIT7( ssd, _ssse3 );
- + pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
- + pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
- + INIT7( satd, _ssse3 );
- + INIT7( satd_x3, _ssse3 );
- + INIT7( satd_x4, _ssse3 );
- + }
- pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
- pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3;
- pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_ssse3;
- @@ -794,7 +797,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
- INIT2( sad_x3, _cache64_ssse3 );
- INIT2( sad_x4, _cache64_ssse3 );
- }
- - if( !(cpu&X264_CPU_SHUFFLE_IS_FAST) )
- + if( cpu&X264_CPU_SLOW_ATOM || !(cpu&X264_CPU_SHUFFLE_IS_FAST) )
- {
- INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */
- }
- diff --git a/common/quant.c b/common/quant.c
- index ce074e2..e62fa0f 100644
- --- a/common/quant.c
- +++ b/common/quant.c
- @@ -312,6 +312,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
- pf->quant_4x4_dc = x264_quant_4x4_dc_mmxext;
- pf->decimate_score15 = x264_decimate_score15_mmxext;
- pf->decimate_score16 = x264_decimate_score16_mmxext;
- + if( cpu&X264_CPU_SLOW_CTZ )
- + {
- + pf->decimate_score15 = x264_decimate_score15_mmxext_slowbsr;
- + pf->decimate_score16 = x264_decimate_score16_mmxext_slowbsr;
- + }
- pf->decimate_score64 = x264_decimate_score64_mmxext;
- pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmxext;
- pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmxext;
- @@ -345,6 +350,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
- pf->decimate_score15 = x264_decimate_score15_sse2;
- pf->decimate_score16 = x264_decimate_score16_sse2;
- pf->decimate_score64 = x264_decimate_score64_sse2;
- + if( cpu&X264_CPU_SLOW_CTZ )
- + {
- + pf->decimate_score15 = x264_decimate_score15_sse2_slowbsr;
- + pf->decimate_score16 = x264_decimate_score16_sse2_slowbsr;
- + }
- pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
- pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
- pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
- @@ -369,6 +379,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
- pf->denoise_dct = x264_denoise_dct_ssse3;
- pf->decimate_score15 = x264_decimate_score15_ssse3;
- pf->decimate_score16 = x264_decimate_score16_ssse3;
- + if( cpu&X264_CPU_SLOW_CTZ )
- + {
- + pf->decimate_score15 = x264_decimate_score15_ssse3_slowbsr;
- + pf->decimate_score16 = x264_decimate_score16_ssse3_slowbsr;
- + }
- pf->decimate_score64 = x264_decimate_score64_ssse3;
- }
- diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
- index f641cff..2171f89 100644
- --- a/common/x86/mc-c.c
- +++ b/common/x86/mc-c.c
- @@ -427,8 +427,11 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
- return;
- pf->weight = x264_mc_weight_wtab_sse2;
- - pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
- - pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
- + if( !(cpu&X264_CPU_SLOW_ATOM) )
- + {
- + pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
- + pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
- + }
- pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
- pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
- @@ -481,7 +484,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
- pf->weight = x264_mc_weight_wtab_ssse3;
- }
- - if( cpu&X264_CPU_SHUFFLE_IS_FAST )
- + if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) )
- pf->integral_init4v = x264_integral_init4v_ssse3;
- if( !(cpu&X264_CPU_SSE4) )
- diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
- index 3e520fa..b770adf 100644
- --- a/common/x86/quant-a.asm
- +++ b/common/x86/quant-a.asm
- @@ -583,7 +583,7 @@ DENOISE_DCT ssse3, 7
- cextern decimate_table4
- cextern decimate_table8
- -%macro DECIMATE4x4 2
- +%macro DECIMATE4x4 3
- ;A LUT is faster than bsf on AMD processors, and no slower on Intel
- ;This is not true for score64.
- @@ -605,6 +605,7 @@ cglobal decimate_score%1_%2, 1,3
- %if %1==15
- shr edx, 1
- %endif
- +%if %3==1
- movzx ecx, dl
- movzx eax, byte [mask_table + rcx]
- cmp edx, ecx
- @@ -617,8 +618,17 @@ cglobal decimate_score%1_%2, 1,3
- shr edx, cl
- add al, byte [table + rcx]
- add al, byte [mask_table + rdx]
- +%else
- +.loop:
- + bsf ecx, edx
- + shr edx, cl
- + movzx ecx, byte [table + rcx]
- + add eax, ecx
- + shr edx, 1
- + jne .loop
- +%endif
- .ret:
- - REP_RET
- + RET
- .ret9:
- mov eax, 9
- RET
- @@ -627,14 +637,20 @@ cglobal decimate_score%1_%2, 1,3
- %ifndef ARCH_X86_64
- %define DECIMATE_MASK DECIMATE_MASK_MMX
- -DECIMATE4x4 15, mmxext
- -DECIMATE4x4 16, mmxext
- +DECIMATE4x4 15, mmxext, 0
- +DECIMATE4x4 16, mmxext, 0
- +DECIMATE4x4 15, mmxext_slowbsr, 1
- +DECIMATE4x4 16, mmxext_slowbsr, 1
- %endif
- %define DECIMATE_MASK DECIMATE_MASK_SSE2
- -DECIMATE4x4 15, sse2
- -DECIMATE4x4 15, ssse3
- -DECIMATE4x4 16, sse2
- -DECIMATE4x4 16, ssse3
- +DECIMATE4x4 15, sse2, 0
- +DECIMATE4x4 16, sse2, 0
- +DECIMATE4x4 15, sse2_slowbsr, 1
- +DECIMATE4x4 16, sse2_slowbsr, 1
- +DECIMATE4x4 15, ssse3, 0
- +DECIMATE4x4 16, ssse3, 0
- +DECIMATE4x4 15, ssse3_slowbsr, 1
- +DECIMATE4x4 16, ssse3_slowbsr, 1
- %macro DECIMATE8x8 1
- diff --git a/common/x86/quant.h b/common/x86/quant.h
- index 4e42b81..4ffd684 100644
- --- a/common/x86/quant.h
- +++ b/common/x86/quant.h
- @@ -57,6 +57,12 @@ int x264_decimate_score15_ssse3 ( int16_t *dct );
- int x264_decimate_score16_mmxext( int16_t *dct );
- int x264_decimate_score16_sse2 ( int16_t *dct );
- int x264_decimate_score16_ssse3 ( int16_t *dct );
- +int x264_decimate_score15_mmxext_slowbsr( int16_t *dct );
- +int x264_decimate_score15_sse2_slowbsr ( int16_t *dct );
- +int x264_decimate_score15_ssse3_slowbsr ( int16_t *dct );
- +int x264_decimate_score16_mmxext_slowbsr( int16_t *dct );
- +int x264_decimate_score16_sse2_slowbsr ( int16_t *dct );
- +int x264_decimate_score16_ssse3_slowbsr ( int16_t *dct );
- int x264_decimate_score64_mmxext( int16_t *dct );
- int x264_decimate_score64_sse2 ( int16_t *dct );
- int x264_decimate_score64_ssse3 ( int16_t *dct );
- diff --git a/encoder/macroblock.c b/encoder/macroblock.c
- index b7e5f34..984f8a8 100644
- --- a/encoder/macroblock.c
- +++ b/encoder/macroblock.c
- @@ -997,10 +997,7 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
- /* calculate dct coeffs */
- for( int i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
- {
- - /* We don't need to zero the DC coefficient before quantization because we already
- - * checked that all the DCs were zero above at twice the precision that quant4x4
- - * uses. This applies even though the DC here is being quantized before the 2x2
- - * transform. */
- + dct4x4[i4x4][0] = 0;
- if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ) )
- continue;
- h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
- diff --git a/tools/checkasm.c b/tools/checkasm.c
- index 6469017..a0a9d54 100644
- --- a/tools/checkasm.c
- +++ b/tools/checkasm.c
- @@ -173,7 +173,9 @@ static void print_bench(void)
- b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
- b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
- b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
- - b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" : "",
- + b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" :
- + b->cpu&X264_CPU_SLOW_CTZ ? "_slow_ctz" :
- + b->cpu&X264_CPU_SLOW_ATOM ? "_slow_atom" : "",
- ((int64_t)10*b->cycles/b->den - nop_time)/4 );
- }
- }
- @@ -1700,6 +1702,8 @@ static int check_all_flags( void )
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX_LZCNT" );
- cpu1 &= ~X264_CPU_LZCNT;
- }
- + ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" );
- + cpu1 &= ~X264_CPU_SLOW_CTZ;
- }
- if( x264_cpu_detect() & X264_CPU_SSE2 )
- {
- @@ -1708,6 +1712,10 @@ static int check_all_flags( void )
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSE2 FastShuffle" );
- cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
- + ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" );
- + cpu1 &= ~X264_CPU_SLOW_CTZ;
- + ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSE2 SlowAtom" );
- + cpu1 &= ~X264_CPU_SLOW_ATOM;
- }
- if( x264_cpu_detect() & X264_CPU_SSE_MISALIGN )
- {
- @@ -1730,6 +1738,10 @@ static int check_all_flags( void )
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSSE3 FastShuffle" );
- cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
- + ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSSE3 SlowCTZ" );
- + cpu1 &= ~X264_CPU_SLOW_CTZ;
- + ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSSE3 SlowAtom" );
- + cpu1 &= ~X264_CPU_SLOW_ATOM;
- }
- if( x264_cpu_detect() & X264_CPU_SSE4 )
- {
- diff --git a/x264.h b/x264.h
- index f714b72..6d7b703 100644
- --- a/x264.h
- +++ b/x264.h
- @@ -66,6 +66,8 @@ typedef struct x264_t x264_t;
- #define X264_CPU_ARMV6 0x020000
- #define X264_CPU_NEON 0x040000 /* ARM NEON */
- #define X264_CPU_FAST_NEON_MRC 0x080000 /* Transfer from NEON to ARM register is fast (Cortex-A9) */
- +#define X264_CPU_SLOW_CTZ 0x100000 /* BSR/BSF x86 instructions are really slow on some CPUs */
- +#define X264_CPU_SLOW_ATOM 0x200000 /* Intel Atom: skip SSE2/SSSE3 asm functions that benchmark slower on it */
- /* Analyse flags
- */
- --
- 1.7.0.4
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement