Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- From f3677c61bc31dbe79d69dee092cba504c3f6f523 Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Mon, 31 May 2010 11:14:22 -0700
- Subject: [PATCH 01/10] Fix cavlc+deblock+8x8dct (regression in r1612)
- Add cavlc+8x8dct munging to new deblock system.
- May have caused minor visual artifacts.
- ---
- common/deblock.c | 47 -----------------------------------------------
- common/macroblock.c | 46 ++++++++++++++++++++++++++++++++++++++++++++--
- 2 files changed, 44 insertions(+), 49 deletions(-)
- diff --git a/common/deblock.c b/common/deblock.c
- index fc039c5..27c73ae 100644
- --- a/common/deblock.c
- +++ b/common/deblock.c
- @@ -24,46 +24,6 @@
- #include "common.h"
- -/* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of
- - * entropy coding, but per 64 coeffs for the purpose of deblocking */
- -static void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
- -{
- - uint32_t (*src)[6] = (uint32_t(*)[6])h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
- - int8_t *transform = h->mb.mb_transform_size + mb_y * h->sps->i_mb_width;
- - for( int x = 0; x<h->sps->i_mb_width; x++ )
- - {
- - memcpy( buf+x, src+x, 16 );
- - if( transform[x] )
- - {
- - int nnz = src[x][0] | src[x][1];
- - src[x][0] = src[x][1] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
- - nnz = src[x][2] | src[x][3];
- - src[x][2] = src[x][3] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
- - }
- - }
- -}
- -
- -static void restore_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
- -{
- - uint8_t (*dst)[24] = h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
- - for( int x = 0; x < h->sps->i_mb_width; x++ )
- - memcpy( dst+x, buf+x, 16 );
- -}
- -
- -static void munge_cavlc_nnz( x264_t *h, int mb_y, uint8_t (*buf)[16], void (*func)(x264_t*, int, uint8_t (*)[16]) )
- -{
- - func( h, mb_y, buf );
- - if( mb_y > 0 )
- - func( h, mb_y-1, buf + h->sps->i_mb_width );
- - if( h->sh.b_mbaff )
- - {
- - func( h, mb_y+1, buf + h->sps->i_mb_width * 2 );
- - if( mb_y > 0 )
- - func( h, mb_y-2, buf + h->sps->i_mb_width * 3 );
- - }
- -}
- -
- -
- /* Deblocking filter */
- static const uint8_t i_alpha_table[52+12*2] =
- {
- @@ -344,10 +304,6 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
- int stride2y = stridey << b_interlaced;
- int strideuv = h->fdec->i_stride[1];
- int stride2uv = strideuv << b_interlaced;
- - uint8_t (*nnz_backup)[16] = h->scratch_buffer;
- -
- - if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
- - munge_cavlc_nnz( h, mb_y, nnz_backup, munge_cavlc_nnz_row );
- for( int mb_x = 0; mb_x < h->sps->i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
- {
- @@ -427,9 +383,6 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
- if( !transform_8x8 ) FILTER( , 1, 3, qp, qpc );
- }
- }
- -
- - if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
- - munge_cavlc_nnz( h, mb_y, nnz_backup, restore_cavlc_nnz_row );
- }
- #ifdef HAVE_MMX
- diff --git a/common/macroblock.c b/common/macroblock.c
- index ce510e9..1b2d37b 100644
- --- a/common/macroblock.c
- +++ b/common/macroblock.c
- @@ -344,8 +344,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
- int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
- int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
- ((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
- - int buf_nnz = !h->param.b_cabac * h->pps->b_transform_8x8_mode * (h->sps->i_mb_width * 4 * 16 * sizeof(uint8_t));
- - scratch_size = X264_MAX4( buf_hpel, buf_ssim, buf_tesa, buf_nnz );
- + scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa );
- }
- int buf_mbtree = h->param.rc.b_mb_tree * ((h->sps->i_mb_width+3)&~3) * sizeof(int);
- scratch_size = X264_MAX( scratch_size, buf_mbtree );
- @@ -1013,6 +1012,49 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
- M32( &h->mb.cache.ref[0][x264_scan8[0]+8*2] ) = refbot;
- M32( &h->mb.cache.ref[0][x264_scan8[0]+8*3] ) = refbot;
- }
- +
- + /* Munge NNZ for cavlc + 8x8dct */
- + if( !h->param.b_cabac && h->pps->b_transform_8x8_mode )
- + {
- + uint8_t (*nnz)[24] = h->mb.non_zero_count;
- + int top = h->mb.i_mb_top_xy;
- + int left = h->mb.i_mb_left_xy;
- +
- + if( (h->mb.i_neighbour & MB_TOP) && h->mb.mb_transform_size[top] )
- + {
- + int i8 = x264_scan8[0] - 8;
- + int nnz_top0 = M16( &nnz[top][8] ) | M16( &nnz[top][12] );
- + int nnz_top1 = M16( &nnz[top][10] ) | M16( &nnz[top][14] );
- + M16( &h->mb.cache.non_zero_count[i8+0] ) = nnz_top0 ? 0x0101 : 0;
- + M16( &h->mb.cache.non_zero_count[i8+2] ) = nnz_top1 ? 0x0101 : 0;
- + }
- +
- + if( h->mb.i_neighbour & MB_LEFT && h->mb.mb_transform_size[left] )
- + {
- + int i8 = x264_scan8[0] - 1;
- + int nnz_left0 = M16( &nnz[left][2] ) | M16( &nnz[left][6] );
- + int nnz_left1 = M16( &nnz[left][10] ) | M16( &nnz[left][14] );
- + h->mb.cache.non_zero_count[i8+8*0] = !!nnz_left0;
- + h->mb.cache.non_zero_count[i8+8*1] = !!nnz_left0;
- + h->mb.cache.non_zero_count[i8+8*2] = !!nnz_left1;
- + h->mb.cache.non_zero_count[i8+8*3] = !!nnz_left1;
- + }
- +
- + if( h->mb.mb_transform_size[h->mb.i_mb_xy] )
- + {
- + int nnz0 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
- + int nnz1 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 4]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 6]] );
- + int nnz2 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[10]] );
- + int nnz3 = M16( &h->mb.cache.non_zero_count[x264_scan8[12]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[14]] );
- + uint32_t nnztop = pack16to32( !!nnz0, !!nnz1 ) * 0x0101;
- + uint32_t nnzbot = pack16to32( !!nnz2, !!nnz3 ) * 0x0101;
- +
- + M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*0] ) = nnztop;
- + M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*1] ) = nnztop;
- + M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*2] ) = nnzbot;
- + M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*3] ) = nnzbot;
- + }
- + }
- }
- static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int i )
- --
- 1.7.0.4
- From 925b5fd15ac24ccbce54f5e2ff6119f8f4f4710c Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Sun, 30 May 2010 09:42:53 -0700
- Subject: [PATCH 02/10] Fix ultrafast to actually turn off weightb
- ---
- common/common.c | 1 +
- 1 files changed, 1 insertions(+), 0 deletions(-)
- diff --git a/common/common.c b/common/common.c
- index 62bef99..fccf2b0 100644
- --- a/common/common.c
- +++ b/common/common.c
- @@ -183,6 +183,7 @@ static int x264_param_apply_preset( x264_param_t *param, const char *preset )
- param->i_bframe_adaptive = X264_B_ADAPT_NONE;
- param->rc.b_mb_tree = 0;
- param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
- + param->analyse.b_weighted_bipred = 0;
- }
- else if( !strcasecmp( preset, "superfast" ) )
- {
- --
- 1.7.0.4
- From 49a832188629fdea4269977a48102029a6300b8b Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Thu, 27 May 2010 12:31:41 -0700
- Subject: [PATCH 03/10] Fix omission in libx264 tuning documentation
- ---
- x264.h | 2 +-
- 1 files changed, 1 insertions(+), 1 deletions(-)
- diff --git a/x264.h b/x264.h
- index 6d7b703..95efd88 100644
- --- a/x264.h
- +++ b/x264.h
- @@ -446,7 +446,7 @@ static const char * const x264_tune_names[] = { "film", "animation", "grain", "s
- /* Multiple tunings can be used if separated by a delimiter in ",./-+",
- * however multiple psy tunings cannot be used.
- - * film, animation, grain, psnr, and ssim are psy tunings.
- + * film, animation, grain, stillimage, psnr, and ssim are psy tunings.
- *
- * returns 0 on success, negative on failure (e.g. invalid preset/tune name). */
- int x264_param_default_preset( x264_param_t *, const char *preset, const char *tune );
- --
- 1.7.0.4
- From 69cda7770f3851d2c5785af74b82ba583794c7a6 Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Wed, 26 May 2010 12:55:35 -0700
- Subject: [PATCH 04/10] Merge some of adaptive quant and weightp
- Eliminate redundant work; both of them were calculating variance of the frame.
- ---
- common/frame.h | 4 +-
- encoder/analyse.h | 1 -
- encoder/encoder.c | 12 ++---
- encoder/ratecontrol.c | 124 +++++++++++++++++++++++++++++++-----------------
- encoder/slicetype.c | 31 ++----------
- 5 files changed, 92 insertions(+), 80 deletions(-)
- diff --git a/common/frame.h b/common/frame.h
- index 91d27b5..ca5cb7a 100644
- --- a/common/frame.h
- +++ b/common/frame.h
- @@ -118,8 +118,8 @@ typedef struct x264_frame
- uint16_t *i_inv_qscale_factor;
- int b_scenecut; /* Set to zero if the frame cannot possibly be part of a real scenecut. */
- float f_weighted_cost_delta[X264_BFRAME_MAX+2];
- - uint32_t i_pixel_sum;
- - uint64_t i_pixel_ssd;
- + uint32_t i_pixel_sum[3];
- + uint64_t i_pixel_ssd[3];
- /* hrd */
- x264_hrd_t hrd_timing;
- diff --git a/encoder/analyse.h b/encoder/analyse.h
- index 7c2c22c..53e4c2e 100644
- --- a/encoder/analyse.h
- +++ b/encoder/analyse.h
- @@ -33,7 +33,6 @@ void x264_slicetype_decide( x264_t *h );
- void x264_slicetype_analyse( x264_t *h, int keyframe );
- int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w );
- -void x264_weight_plane_analyse( x264_t *h, x264_frame_t *frame );
- int x264_lookahead_init( x264_t *h, int i_slicetype_length );
- int x264_lookahead_is_empty( x264_t *h );
- diff --git a/encoder/encoder.c b/encoder/encoder.c
- index 52017ff..6e0dc54 100644
- --- a/encoder/encoder.c
- +++ b/encoder/encoder.c
- @@ -2246,21 +2246,17 @@ int x264_encoder_encode( x264_t *h,
- fenc->i_pic_struct = PIC_STRUCT_PROGRESSIVE;
- }
- - if( h->frames.b_have_lowres )
- - {
- - if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
- - x264_weight_plane_analyse( h, fenc );
- - x264_frame_init_lowres( h, fenc );
- - }
- -
- if( h->param.rc.b_mb_tree && h->param.rc.b_stat_read )
- {
- if( x264_macroblock_tree_read( h, fenc ) )
- return -1;
- }
- - else if( h->param.rc.i_aq_mode )
- + else
- x264_adaptive_quant_frame( h, fenc );
- + if( h->frames.b_have_lowres )
- + x264_frame_init_lowres( h, fenc );
- +
- /* 2: Place the frame into the queue for its slice type decision */
- x264_lookahead_put_frame( h, fenc );
- diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
- index a725a24..bf0a400 100644
- --- a/encoder/ratecontrol.c
- +++ b/encoder/ratecontrol.c
- @@ -215,12 +215,14 @@ static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x2
- stride <<= h->mb.b_interlaced;
- uint64_t res = h->pixf.var[pix]( frame->plane[i] + offset, stride );
- uint32_t sum = (uint32_t)res;
- - uint32_t sqr = res >> 32;
- - return sqr - (sum * sum >> shift);
- + uint32_t ssd = res >> 32;
- + frame->i_pixel_sum[i] += sum;
- + frame->i_pixel_ssd[i] += ssd;
- + return ssd - (sum * sum >> shift);
- }
- // Find the total AC energy of the block in all planes.
- -static NOINLINE uint32_t ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame )
- +static NOINLINE uint32_t x264_ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame )
- {
- /* This function contains annoying hacks because GCC has a habit of reordering emms
- * and putting it after floating point ops. As a result, we put the emms at the end of the
- @@ -239,56 +241,90 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
- * FIXME: while they're written in 5 significant digits, they're only tuned to 2. */
- float strength;
- float avg_adj = 0.f;
- - /* Need to init it anyways for MB tree. */
- - if( h->param.rc.f_aq_strength == 0 )
- - {
- - memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
- - memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
- - if( h->frames.b_have_lowres )
- - for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
- - frame->i_inv_qscale_factor[mb_xy] = 256;
- - return;
- + int width = h->sps->i_mb_width;
- + int height = h->sps->i_mb_height;
- + /* Initialize frame stats */
- + for( int i = 0; i < 3; i++ )
- + {
- + frame->i_pixel_sum[i] = 0;
- + frame->i_pixel_ssd[i] = 0;
- }
- - if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
- + /* Degenerate cases */
- + if( h->param.rc.i_aq_mode == X264_AQ_NONE || h->param.rc.f_aq_strength == 0 )
- {
- - float avg_adj_pow2 = 0.f;
- - for( int mb_y = 0; mb_y < h->sps->i_mb_height; mb_y++ )
- - for( int mb_x = 0; mb_x < h->sps->i_mb_width; mb_x++ )
- - {
- - uint32_t energy = ac_energy_mb( h, mb_x, mb_y, frame );
- - float qp_adj = powf( energy + 1, 0.125f );
- - frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
- - avg_adj += qp_adj;
- - avg_adj_pow2 += qp_adj * qp_adj;
- - }
- - avg_adj /= h->mb.i_mb_count;
- - avg_adj_pow2 /= h->mb.i_mb_count;
- - strength = h->param.rc.f_aq_strength * avg_adj;
- - avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - 14.f) / avg_adj;
- + /* Need to init it anyways for MB tree */
- + if( h->param.rc.f_aq_strength == 0 )
- + {
- + memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
- + memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
- + if( h->frames.b_have_lowres )
- + for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
- + frame->i_inv_qscale_factor[mb_xy] = 256;
- + }
- + /* Need variance data for weighted prediction */
- + if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
- + {
- + for( int mb_y = 0; mb_y < height; mb_y++ )
- + for( int mb_x = 0; mb_x < width; mb_x++ )
- + x264_ac_energy_mb( h, mb_x, mb_y, frame );
- + }
- + else
- + return;
- }
- + /* Actual adaptive quantization */
- else
- - strength = h->param.rc.f_aq_strength * 1.0397f;
- -
- - for( int mb_y = 0; mb_y < h->sps->i_mb_height; mb_y++ )
- - for( int mb_x = 0; mb_x < h->sps->i_mb_width; mb_x++ )
- + {
- + if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
- {
- - float qp_adj;
- - if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
- - {
- - qp_adj = frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride];
- - qp_adj = strength * (qp_adj - avg_adj);
- - }
- - else
- + float avg_adj_pow2 = 0.f;
- + for( int mb_y = 0; mb_y < height; mb_y++ )
- + for( int mb_x = 0; mb_x < width; mb_x++ )
- + {
- + uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
- + float qp_adj = powf( energy + 1, 0.125f );
- + frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
- + avg_adj += qp_adj;
- + avg_adj_pow2 += qp_adj * qp_adj;
- + }
- + avg_adj /= h->mb.i_mb_count;
- + avg_adj_pow2 /= h->mb.i_mb_count;
- + strength = h->param.rc.f_aq_strength * avg_adj;
- + avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - 14.f) / avg_adj;
- + }
- + else
- + strength = h->param.rc.f_aq_strength * 1.0397f;
- +
- + for( int mb_y = 0; mb_y < height; mb_y++ )
- + for( int mb_x = 0; mb_x < width; mb_x++ )
- {
- - uint32_t energy = ac_energy_mb( h, mb_x, mb_y, frame );
- - qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - 14.427f);
- + float qp_adj;
- + if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
- + {
- + qp_adj = frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride];
- + qp_adj = strength * (qp_adj - avg_adj);
- + }
- + else
- + {
- + uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
- + qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - 14.427f);
- + }
- + frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] =
- + frame->f_qp_offset_aq[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
- + if( h->frames.b_have_lowres )
- + frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj);
- }
- - frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] =
- - frame->f_qp_offset_aq[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
- - if( h->frames.b_have_lowres )
- - frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj);
- - }
- + }
- +
- + /* Remove mean from SSD calculation */
- + for( int i = 0; i < 3; i++ )
- + {
- + uint64_t ssd = frame->i_pixel_ssd[i];
- + uint64_t sum = frame->i_pixel_sum[i];
- + int w = width*16>>!!i;
- + int h = height*16>>!!i;
- + frame->i_pixel_ssd[i] = ssd - (sum * sum + w * h / 2) / (w * h);
- + }
- }
- int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame )
- diff --git a/encoder/slicetype.c b/encoder/slicetype.c
- index 9352367..e454e12 100644
- --- a/encoder/slicetype.c
- +++ b/encoder/slicetype.c
- @@ -67,25 +67,6 @@ static void x264_weight_get_h264( unsigned int weight_nonh264, int offset, x264_
- w->i_scale = X264_MIN( w->i_scale, 127 );
- }
- -void x264_weight_plane_analyse( x264_t *h, x264_frame_t *frame )
- -{
- - uint32_t sad = 0;
- - uint64_t ssd = 0;
- - uint8_t *p = frame->plane[0];
- - int stride = frame->i_stride[0];
- - int width = frame->i_width[0];
- - int height = frame->i_lines[0];
- - for( int y = 0; y < height>>4; y++, p += stride*16 )
- - for( int x = 0; x < width; x += 16 )
- - {
- - uint64_t res = h->pixf.var[PIXEL_16x16]( p + x, stride );
- - sad += (uint32_t)res;
- - ssd += res >> 32;
- - }
- - frame->i_pixel_sum = sad;
- - frame->i_pixel_ssd = ssd - ((uint64_t)sad * sad + width * height / 2) / (width * height);
- -}
- -
- static NOINLINE uint8_t *x264_weight_cost_init_luma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, uint8_t *dest )
- {
- int ref0_distance = fenc->i_frame - ref->i_frame - 1;
- @@ -167,10 +148,10 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int
- int found;
- x264_weight_t *weights = fenc->weight[0];
- - fenc_var = round( sqrt( fenc->i_pixel_ssd ) );
- - ref_var = round( sqrt( ref->i_pixel_ssd ) );
- - fenc_mean = (float)fenc->i_pixel_sum / (fenc->i_lines[0] * fenc->i_width[0]);
- - ref_mean = (float) ref->i_pixel_sum / (fenc->i_lines[0] * fenc->i_width[0]);
- + fenc_var = round( sqrt( fenc->i_pixel_ssd[0] ) );
- + ref_var = round( sqrt( ref->i_pixel_ssd[0] ) );
- + fenc_mean = (float)fenc->i_pixel_sum[0] / (fenc->i_lines[0] * fenc->i_width[0]);
- + ref_mean = (float) ref->i_pixel_sum[0] / (fenc->i_lines[0] * fenc->i_width[0]);
- //early termination
- if( fabs( ref_mean - fenc_mean ) < 0.5 && fabs( 1 - fenc_var / ref_var ) < epsilon )
- @@ -534,8 +515,8 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
- do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
- if( do_search[0] )
- {
- - if( ( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART
- - || h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE ) && b == p1 )
- + if( ( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART ||
- + h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE ) && b == p1 )
- {
- x264_emms();
- x264_weights_analyse( h, frames[b], frames[p0], 1 );
- --
- 1.7.0.4
- From 0bf2d9e3e55fa6b1cda4ca2b1066c3034c575225 Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Thu, 27 May 2010 10:42:15 -0700
- Subject: [PATCH 05/10] Add fast skip in lookahead motion search
- Helps speed very significantly on motionless blocks.
- ---
- encoder/slicetype.c | 16 +++++++++++++++-
- 1 files changed, 15 insertions(+), 1 deletions(-)
- diff --git a/encoder/slicetype.c b/encoder/slicetype.c
- index e454e12..d7cfe5c 100644
- --- a/encoder/slicetype.c
- +++ b/encoder/slicetype.c
- @@ -379,11 +379,25 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
- CP32( m[l].mvp, mvc[0] );
- else
- x264_median_mv( m[l].mvp, mvc[0], mvc[1], mvc[2] );
- - x264_me_search( h, &m[l], mvc, i_mvc );
- + /* Fast skip for cases of near-zero residual. Shortcut: don't bother except in the mv0 case,
- + * since anything else is likely to have enough residual to not trigger the skip. */
- + if( !M32( m[l].mvp ) )
- + {
- + m[l].cost = h->pixf.mbcmp[PIXEL_8x8]( m[l].p_fenc[0], FENC_STRIDE, m[l].p_fref[0], m[l].i_stride[0] );
- + if( m[l].cost < 64 )
- + {
- + M32( m[l].mv ) = 0;
- + goto skip_motionest;
- + }
- + }
- +
- + x264_me_search( h, &m[l], mvc, i_mvc );
- m[l].cost -= 2; // remove mvcost from skip mbs
- if( M32( m[l].mv ) )
- m[l].cost += 5;
- +
- +skip_motionest:
- CP32( fenc_mvs[l], m[l].mv );
- *fenc_costs[l] = m[l].cost;
- }
- --
- 1.7.0.4
- From f6abca2c4c0e582d522e135773b88f1ab3d459d2 Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Thu, 27 May 2010 14:27:32 -0700
- Subject: [PATCH 06/10] x86 assembly code for NAL escaping
- Up to ~10x faster than C depending on CPU.
- Helps the most at very high bitrates (e.g. lossless).
- Also make the C code faster and simpler.
- ---
- Makefile | 4 +-
- common/bitstream.c | 92 ++++++++++++++
- common/bitstream.h | 299 ++++++++++++++++++++++++++++++++++++++++++++
- common/bs.h | 291 ------------------------------------------
- common/common.c | 54 --------
- common/common.h | 5 +-
- common/x86/bitstream-a.asm | 112 +++++++++++++++++
- common/x86/deblock-a.asm | 1 +
- encoder/encoder.c | 3 +-
- tools/checkasm.c | 52 ++++++++-
- 10 files changed, 561 insertions(+), 352 deletions(-)
- create mode 100644 common/bitstream.c
- create mode 100644 common/bitstream.h
- delete mode 100644 common/bs.h
- create mode 100644 common/x86/bitstream-a.asm
- diff --git a/Makefile b/Makefile
- index 0b43a3e..519e181 100644
- --- a/Makefile
- +++ b/Makefile
- @@ -8,7 +8,7 @@ SRCS = common/mc.c common/predict.c common/pixel.c common/macroblock.c \
- common/frame.c common/dct.c common/cpu.c common/cabac.c \
- common/common.c common/mdate.c common/rectangle.c \
- common/set.c common/quant.c common/deblock.c common/vlc.c \
- - common/mvpred.c \
- + common/mvpred.c common/bitstream.c \
- encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
- encoder/set.c encoder/macroblock.c encoder/cabac.c \
- encoder/cavlc.c encoder/encoder.c encoder/lookahead.c
- @@ -52,7 +52,7 @@ endif
- ifneq ($(AS),)
- X86SRC0 = const-a.asm cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm \
- mc-a2.asm pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
- - cpu-a.asm dct-32.asm
- + cpu-a.asm dct-32.asm bitstream-a.asm
- X86SRC = $(X86SRC0:%=common/x86/%)
- ifeq ($(ARCH),X86)
- diff --git a/common/bitstream.c b/common/bitstream.c
- new file mode 100644
- index 0000000..0aaac21
- --- /dev/null
- +++ b/common/bitstream.c
- @@ -0,0 +1,92 @@
- +/*****************************************************************************
- + * bitstream.c: h264 encoder library
- + *****************************************************************************
- + * Copyright (C) 2010 x264 project
- + *
- + * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- + * Jason Garrett-Glaser <darkshikari@gmail.com>
- + *
- + * This program is free software; you can redistribute it and/or modify
- + * it under the terms of the GNU General Public License as published by
- + * the Free Software Foundation; either version 2 of the License, or
- + * (at your option) any later version.
- + *
- + * This program is distributed in the hope that it will be useful,
- + * but WITHOUT ANY WARRANTY; without even the implied warranty of
- + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- + * GNU General Public License for more details.
- + *
- + * You should have received a copy of the GNU General Public License
- + * along with this program; if not, write to the Free Software
- + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- + *****************************************************************************/
- +
- +#include "common.h"
- +
- +/* Copy the bytes in [src,end) to dst, inserting an extra 0x03 byte in front of
- + * every source byte <= 0x03 that would otherwise follow two zero bytes in the
- + * output. Returns a pointer one past the last byte written to dst. */
- +static uint8_t *x264_nal_escape_c( uint8_t *dst, uint8_t *src, uint8_t *end )
- +{
- +    /* The first two bytes have no two-byte history to inspect: copy them as-is. */
- +    for( int i = 0; i < 2 && src < end; i++ )
- +        *dst++ = *src++;
- +
- +    for( ; src < end; src++ )
- +    {
- +        int after_two_zeros = !dst[-2] && !dst[-1];
- +        if( after_two_zeros && *src <= 0x03 )
- +            *dst++ = 0x03;
- +        *dst++ = *src;
- +    }
- +    return dst;
- +}
- +
- +#ifdef HAVE_MMX
- +uint8_t *x264_nal_escape_mmxext( uint8_t *dst, uint8_t *src, uint8_t *end );
- +uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
- +#endif
- +
- +/****************************************************************************
- + * x264_nal_encode:
- + *   Serialize one NAL unit into dst: a 3- or 4-byte start code (when
- + *   h->param.b_annexb is set) or a 4-byte big-endian size field (otherwise),
- + *   then the one-byte NAL header, then the payload passed through
- + *   h->bsf.nal_escape.
- + *   Returns the total number of bytes written to dst.
- + ****************************************************************************/
- +int x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal, int b_long_startcode )
- +{
- + uint8_t *src = nal->p_payload;
- + uint8_t *end = nal->p_payload + nal->i_payload;
- + uint8_t *orig_dst = dst;
- +
- + if( h->param.b_annexb )
- + {
- + /* 00 00 01, with one more leading zero byte when a long start code is requested */
- + if( b_long_startcode )
- + *dst++ = 0x00;
- + *dst++ = 0x00;
- + *dst++ = 0x00;
- + *dst++ = 0x01;
- + }
- + else /* save room for size later */
- + dst += 4;
- +
- + /* nal header */
- + *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;
- +
- + dst = h->bsf.nal_escape( dst, src, end );
- + /* Bytes written minus 4. With a 3-byte start code this is one less than the
- +  * bytes following the prefix, but size is only stored in the non-annexb case. */
- + int size = (dst - orig_dst) - 4;
- +
- + /* Write the size header for mp4/etc */
- + if( !h->param.b_annexb )
- + {
- + /* Size doesn't include the size of the header we're writing now. */
- + orig_dst[0] = size>>24;
- + orig_dst[1] = size>>16;
- + orig_dst[2] = size>> 8;
- + orig_dst[3] = size>> 0;
- + }
- +
- + /* size+4 == dst - orig_dst, i.e. everything written above */
- + return size+4;
- +}
- +
- +/* Fill the bitstream function table pf for the given cpu flag mask.
- + * The C implementation is the default; when HAVE_MMX is defined it is replaced
- + * by the MMXEXT version, and that in turn by the SSE2 version if both
- + * X264_CPU_SSE2 and X264_CPU_SSE2_IS_FAST are set (later assignments win). */
- +void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
- +{
- + pf->nal_escape = x264_nal_escape_c;
- +#ifdef HAVE_MMX
- + if( cpu&X264_CPU_MMXEXT )
- + pf->nal_escape = x264_nal_escape_mmxext;
- + if( (cpu&X264_CPU_SSE2) && (cpu&X264_CPU_SSE2_IS_FAST) )
- + pf->nal_escape = x264_nal_escape_sse2;
- +#endif
- +}
- diff --git a/common/bitstream.h b/common/bitstream.h
- new file mode 100644
- index 0000000..d018c7d
- --- /dev/null
- +++ b/common/bitstream.h
- @@ -0,0 +1,299 @@
- +/*****************************************************************************
- + * bitstream.h: h264 encoder library
- + *****************************************************************************
- + * Copyright (C) 2003-2008 x264 project
- + *
- + * Authors: Loren Merritt <lorenm@u.washington.edu>
- + * Jason Garrett-Glaser <darkshikari@gmail.com>
- + * Laurent Aimar <fenrir@via.ecp.fr>
- + *
- + * This program is free software; you can redistribute it and/or modify
- + * it under the terms of the GNU General Public License as published by
- + * the Free Software Foundation; either version 2 of the License, or
- + * (at your option) any later version.
- + *
- + * This program is distributed in the hope that it will be useful,
- + * but WITHOUT ANY WARRANTY; without even the implied warranty of
- + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- + * GNU General Public License for more details.
- + *
- + * You should have received a copy of the GNU General Public License
- + * along with this program; if not, write to the Free Software
- + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- + *****************************************************************************/
- +
- +#ifndef X264_BS_H
- +#define X264_BS_H
- +
- +/* One entry of the small VLC tables declared below (x264_coeff_token etc.).
- + * NOTE(review): presumably i_bits is the code word and i_size its length in
- + * bits -- confirm against the table definitions. */
- +typedef struct
- +{
- + uint8_t i_bits;
- + uint8_t i_size;
- +} vlc_t;
- +
- +/* Entry type of x264_level_token; like vlc_t but with a 16-bit i_bits field
- + * and an index of the follow-up table. */
- +typedef struct
- +{
- + uint16_t i_bits;
- + uint8_t i_size;
- + /* Next level table to use */
- + uint8_t i_next;
- +} vlc_large_t;
- +
- +/* Bitstream writer state used by the bs_* functions below. */
- +typedef struct bs_s
- +{
- + uint8_t *p_start; /* start of the output, rounded down to a 4-byte boundary by bs_init */
- + uint8_t *p; /* where the next 32-bit word will be stored */
- + uint8_t *p_end; /* one past the end of the buffer handed to bs_init */
- +
- + intptr_t cur_bits; /* bits accumulated but not yet stored at p */
- + int i_left; /* number of bits still free in cur_bits */
- + int i_bits_encoded; /* RD only */
- +} bs_t;
- +
- +/* Run/level representation of up to 16 coefficients.
- + * NOTE(review): field semantics (index of the last nonzero coefficient, level
- + * values, zero-run lengths) are inferred from the names -- confirm at the users. */
- +typedef struct
- +{
- + int last;
- + int16_t level[16];
- + uint8_t run[16];
- +} x264_run_level_t;
- +
- +extern const vlc_t x264_coeff0_token[5];
- +extern const vlc_t x264_coeff_token[5][16][4];
- +extern const vlc_t x264_total_zeros[15][16];
- +extern const vlc_t x264_total_zeros_dc[3][4];
- +extern const vlc_t x264_run_before[7][16];
- +
- +/* Table of CPU-dispatched bitstream routines, filled in by x264_bitstream_init. */
- +typedef struct
- +{
- + /* Escapes [src,end) into dst and returns the end of the written output;
- +  * called by x264_nal_encode for the NAL payload. */
- + uint8_t *(*nal_escape) ( uint8_t *dst, uint8_t *src, uint8_t *end );
- +} x264_bitstream_function_t;
- +
- +int x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal, int b_long_startcode );
- +void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf );
- +
- +/* A larger level table size theoretically could help a bit at extremely
- + * high bitrates, but the cost in cache is usually too high for it to be
- + * useful.
- + * This size appears to be optimal for QP18 encoding on a Nehalem CPU.
- + * FIXME: Do further testing? */
- +#define LEVEL_TABLE_SIZE 128
- +extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
- +
- +/* Set up bitstream writer s over the i_data bytes starting at p_data.
- + * p_data need not be 4-byte aligned: the write pointer is rounded down to the
- + * previous 4-byte boundary and the bytes between that boundary and p_data are
- + * loaded into cur_bits, as bs_realign does. */
- +static inline void bs_init( bs_t *s, void *p_data, int i_data )
- +{
- +    int offset = ((intptr_t)p_data & 3);
- +    s->p = s->p_start = (uint8_t*)p_data - offset;
- +    s->p_end = (uint8_t*)p_data + i_data;
- +    s->i_left = (WORD_SIZE - offset)*8;
- +    if( offset )
- +    {
- +        s->cur_bits = endian_fix32( M32(s->p) );
- +        s->cur_bits >>= (4-offset)*8;
- +    }
- +    else
- +    {
- +        /* Nothing precedes p_data in this word. Loading and shifting by 32 would
- +         * be undefined where intptr_t is 32 bits wide, and would read 4 bytes
- +         * of the buffer for no purpose; the intended result is simply zero. */
- +        s->cur_bits = 0;
- +    }
- +}
- +/* Number of bits written so far, counted from s->p_start: 8 bits for every byte
- + * already stored plus the bits currently held in cur_bits. Because bs_init may
- + * round p_start down, the count includes any preloaded alignment bytes. */
- +static inline int bs_pos( bs_t *s )
- +{
- + return( 8 * (s->p - s->p_start) + (WORD_SIZE*8) - s->i_left );
- +}
- +
- +/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32-bit aligned. */
- +static inline void bs_flush( bs_t *s )
- +{
- + /* Shift the pending bits up to the top of a 32-bit word before storing it. */
- + M32( s->p ) = endian_fix32( s->cur_bits << (s->i_left&31) );
- + /* Advance by WORD_SIZE - i_left/8 bytes: the number of pending bytes when
- +  * i_left is a multiple of 8. */
- + s->p += WORD_SIZE - s->i_left / 8;
- + s->i_left = WORD_SIZE*8;
- +}
- +/* The inverse of bs_flush: prepare the bitstream to be written to again.
- + * If s->p is not 4-byte aligned, it is rounded down and the bytes already
- + * stored in that word are reloaded into cur_bits; an aligned s->p is left alone. */
- +static inline void bs_realign( bs_t *s )
- +{
- + int offset = ((intptr_t)s->p & 3);
- + if( offset )
- + {
- + s->p = (uint8_t*)s->p - offset;
- + s->i_left = (WORD_SIZE - offset)*8;
- + s->cur_bits = endian_fix32( M32(s->p) );
- + /* keep only the offset bytes that precede the old write position */
- + s->cur_bits >>= (4-offset)*8;
- + }
- +}
- +
- +/* Append i_count bits taken from i_bits to the bitstream.
- + * i_bits is ORed into the accumulator without masking.
- + * NOTE(review): callers are presumably expected to pass a value that fits in
- + * i_count bits (bs_write32 is a deliberate exception) -- confirm. */
- +static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
- +{
- + if( WORD_SIZE == 8 )
- + {
- + /* 64-bit accumulator: always append, then store a 32-bit word once at
- +  * most 32 free bits remain. */
- + s->cur_bits = (s->cur_bits << i_count) | i_bits;
- + s->i_left -= i_count;
- + if( s->i_left <= 32 )
- + {
- +#ifdef WORDS_BIGENDIAN
- + M32( s->p ) = s->cur_bits >> (32 - s->i_left);
- +#else
- + M32( s->p ) = endian_fix( s->cur_bits << s->i_left );
- +#endif
- + s->i_left += 32;
- + s->p += 4;
- + }
- + }
- + else
- + {
- + if( i_count < s->i_left )
- + {
- + /* fits with room to spare: just accumulate */
- + s->cur_bits = (s->cur_bits << i_count) | i_bits;
- + s->i_left -= i_count;
- + }
- + else
- + {
- + /* Fill the accumulator with the top i_left bits of the new value,
- +  * store the completed word, and keep the remaining i_count bits. */
- + i_count -= s->i_left;
- + s->cur_bits = (s->cur_bits << s->i_left) | (i_bits >> i_count);
- + M32( s->p ) = endian_fix( s->cur_bits );
- + s->p += 4;
- + s->cur_bits = i_bits;
- + s->i_left = 32 - i_count;
- + }
- + }
- +}
- +
- +/* Special case to eliminate branch in normal bs_write. */
- +/* Golomb never writes an even-size code, so this is only used in slice headers. */
- +/* Writes all 32 bits of i_bits as two 16-bit bs_write calls, high half first. */
- +static inline void bs_write32( bs_t *s, uint32_t i_bits )
- +{
- + bs_write( s, 16, i_bits >> 16 );
- + /* i_bits is passed unmasked here; bs_write does not mask its argument either */
- + bs_write( s, 16, i_bits );
- +}
- +
- +/* Append a single bit (i_bit is ORed in unmasked, so it should be 0 or 1).
- + * A 32-bit word is stored as soon as 32 bits have accumulated. */
- +static inline void bs_write1( bs_t *s, uint32_t i_bit )
- +{
- + s->cur_bits <<= 1;
- + s->cur_bits |= i_bit;
- + s->i_left--;
- + if( s->i_left == WORD_SIZE*8-32 )
- + {
- + M32( s->p ) = endian_fix32( s->cur_bits );
- + s->p += 4;
- + s->i_left = WORD_SIZE*8;
- + }
- +}
- +
- +/* Pad with zero bits up to the next byte boundary (i_left&7 bits), then flush. */
- +static inline void bs_align_0( bs_t *s )
- +{
- + bs_write( s, s->i_left&7, 0 );
- + bs_flush( s );
- +}
- +/* Pad with one bits up to the next byte boundary (i_left&7 bits), then flush. */
- +static inline void bs_align_1( bs_t *s )
- +{
- + bs_write( s, s->i_left&7, (1 << (s->i_left&7)) - 1 );
- + bs_flush( s );
- +}
- +/* If not already byte aligned, pad to the next byte boundary with a single one
- + * bit followed by zero bits. Unlike bs_align_0/bs_align_1 this does not flush. */
- +static inline void bs_align_10( bs_t *s )
- +{
- + if( s->i_left&7 )
- + bs_write( s, s->i_left&7, 1 << ( (s->i_left&7) - 1 ) );
- +}
- +
- +/* golomb functions */
- +
- +/* Code length lookup used by the bs_write_ue / bs_write_se / bs_size_* functions below.
- + * For v >= 1, entry v is 2*floor(log2(v))+1: the number of bits bs_write_ue
- + * emits for the value v-1. Entry 0 is 1. */
- +static const uint8_t x264_ue_size_tab[256] =
- +{
- + 1, 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7,
- + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- + 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
- + 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
- + 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
- + 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
- + 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
- + 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
- + 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
- + 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
- + 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
- + 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
- + 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
- + 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
- + 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
- + 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
- +};
- +
- +static inline void bs_write_ue_big( bs_t *s, unsigned int val ) /* Unsigned Exp-Golomb write for values beyond the direct range of the 256-entry size table. */
- +{
- + int size = 0;
- + int tmp = ++val; /* val itself is incremented: val+1 is what gets written below */
- + if( tmp >= 0x10000 ) /* reduce tmp to a table index in up to two steps, adding 2 to size per bit shifted out */
- + {
- + size = 32;
- + tmp >>= 16;
- + }
- + if( tmp >= 0x100 )
- + {
- + size += 16;
- + tmp >>= 8;
- + }
- + size += x264_ue_size_tab[tmp]; /* size is now the full code length in bits */
- + bs_write( s, size>>1, 0 ); /* leading zero bits */
- + bs_write( s, (size>>1)+1, val ); /* remaining bits carry the incremented value */
- +}
- +
- +/* Only works on values under 255. */
- +static inline void bs_write_ue( bs_t *s, int val ) /* Unsigned Exp-Golomb write done in a single bs_write call. */
- +{
- + bs_write( s, x264_ue_size_tab[val+1], val+1 ); /* val+1 written in the table's full code length, so the leading zeros come for free */
- +}
- +
- +static inline void bs_write_se( bs_t *s, int val ) /* Signed Exp-Golomb write: val is mapped to a positive integer, then written in one bs_write call. */
- +{
- + int size = 0;
- + /* Faster than (val <= 0 ? -val*2+1 : val*2) */
- + /* 4 instructions on x86, 3 on ARM */
- + int tmp = 1 - val*2;
- + if( tmp < 0 ) tmp = val*2;
- + val = tmp; /* val now holds the mapped value that is actually written */
- +
- + if( tmp >= 0x100 ) /* only one 8-bit reduction step: the mapped value must stay below 0x10000 for the table index to be in range */
- + {
- + size = 16;
- + tmp >>= 8;
- + }
- + size += x264_ue_size_tab[tmp];
- + bs_write( s, size, val );
- +}
- +
- +static inline void bs_write_te( bs_t *s, int x, int val ) /* Write val as one inverted bit when x == 1, otherwise via bs_write_ue. */
- +{
- + if( x == 1 )
- + bs_write1( s, 1^val ); /* val is expected to be 0 or 1 here; the bit written is its inverse */
- + else //if( x > 1 )
- + bs_write_ue( s, val );
- +}
- +
- +static inline void bs_rbsp_trailing( bs_t *s ) /* Write a single 1 bit followed by zero padding. */
- +{
- + bs_write1( s, 1 );
- + bs_write( s, s->i_left&7, 0 ); /* (i_left & 7) zero bits, evaluated after the 1 bit has been written */
- +}
- +
- +static ALWAYS_INLINE int bs_size_ue( unsigned int val ) /* Bit length of the unsigned Exp-Golomb code for val, without writing anything. */
- +{
- + return x264_ue_size_tab[val+1]; /* direct lookup: val+1 must be at most 255 to stay inside the table */
- +}
- +
- +static ALWAYS_INLINE int bs_size_ue_big( unsigned int val ) /* Like bs_size_ue, extended by one 8-bit reduction step for larger values. */
- +{
- + if( val < 255 )
- + return x264_ue_size_tab[val+1];
- + else
- + return x264_ue_size_tab[(val+1)>>8] + 16; /* 8 bits shifted out add 16 to the length; val+1 must stay below 0x10000 for the index to be in range */
- +}
- +
- +static ALWAYS_INLINE int bs_size_se( int val ) /* Bit length of the signed Exp-Golomb code for val, without writing anything. */
- +{
- + int tmp = 1 - val*2; /* same mapping as bs_write_se: 1-2*val for val <= 0 ... */
- + if( tmp < 0 ) tmp = val*2; /* ... and 2*val for val > 0 */
- + if( tmp < 256 )
- + return x264_ue_size_tab[tmp];
- + else
- + return x264_ue_size_tab[tmp>>8]+16; /* mapped value must stay below 0x10000 for the index to be in range */
- +}
- +
- +static ALWAYS_INLINE int bs_size_te( int x, int val ) /* Bit length matching bs_write_te: 1 bit when x == 1, otherwise the table length for val. */
- +{
- + if( x == 1 )
- + return 1;
- + else //if( x > 1 )
- + return x264_ue_size_tab[val+1]; /* val+1 must be at most 255 to stay inside the table */
- +}
- +
- +#endif
- diff --git a/common/bs.h b/common/bs.h
- deleted file mode 100644
- index 343a3c9..0000000
- --- a/common/bs.h
- +++ /dev/null
- @@ -1,291 +0,0 @@
- -/*****************************************************************************
- - * bs.h :
- - *****************************************************************************
- - * Copyright (C) 2003-2008 x264 project
- - *
- - * Authors: Loren Merritt <lorenm@u.washington.edu>
- - * Jason Garrett-Glaser <darkshikari@gmail.com>
- - * Laurent Aimar <fenrir@via.ecp.fr>
- - *
- - * This program is free software; you can redistribute it and/or modify
- - * it under the terms of the GNU General Public License as published by
- - * the Free Software Foundation; either version 2 of the License, or
- - * (at your option) any later version.
- - *
- - * This program is distributed in the hope that it will be useful,
- - * but WITHOUT ANY WARRANTY; without even the implied warranty of
- - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- - * GNU General Public License for more details.
- - *
- - * You should have received a copy of the GNU General Public License
- - * along with this program; if not, write to the Free Software
- - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- - *****************************************************************************/
- -
- -#ifndef X264_BS_H
- -#define X264_BS_H
- -
- -typedef struct
- -{
- - uint8_t i_bits;
- - uint8_t i_size;
- -} vlc_t;
- -
- -typedef struct
- -{
- - uint16_t i_bits;
- - uint8_t i_size;
- - /* Next level table to use */
- - uint8_t i_next;
- -} vlc_large_t;
- -
- -typedef struct bs_s
- -{
- - uint8_t *p_start;
- - uint8_t *p;
- - uint8_t *p_end;
- -
- - intptr_t cur_bits;
- - int i_left; /* i_count number of available bits */
- - int i_bits_encoded; /* RD only */
- -} bs_t;
- -
- -typedef struct
- -{
- - int last;
- - int16_t level[16];
- - uint8_t run[16];
- -} x264_run_level_t;
- -
- -extern const vlc_t x264_coeff0_token[5];
- -extern const vlc_t x264_coeff_token[5][16][4];
- -extern const vlc_t x264_total_zeros[15][16];
- -extern const vlc_t x264_total_zeros_dc[3][4];
- -extern const vlc_t x264_run_before[7][16];
- -
- -/* A larger level table size theoretically could help a bit at extremely
- - * high bitrates, but the cost in cache is usually too high for it to be
- - * useful.
- - * This size appears to be optimal for QP18 encoding on a Nehalem CPU.
- - * FIXME: Do further testing? */
- -#define LEVEL_TABLE_SIZE 128
- -extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
- -
- -static inline void bs_init( bs_t *s, void *p_data, int i_data )
- -{
- - int offset = ((intptr_t)p_data & 3);
- - s->p = s->p_start = (uint8_t*)p_data - offset;
- - s->p_end = (uint8_t*)p_data + i_data;
- - s->i_left = (WORD_SIZE - offset)*8;
- - s->cur_bits = endian_fix32( M32(s->p) );
- - s->cur_bits >>= (4-offset)*8;
- -}
- -static inline int bs_pos( bs_t *s )
- -{
- - return( 8 * (s->p - s->p_start) + (WORD_SIZE*8) - s->i_left );
- -}
- -
- -/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32-bit aligned. */
- -static inline void bs_flush( bs_t *s )
- -{
- - M32( s->p ) = endian_fix32( s->cur_bits << (s->i_left&31) );
- - s->p += WORD_SIZE - s->i_left / 8;
- - s->i_left = WORD_SIZE*8;
- -}
- -/* The inverse of bs_flush: prepare the bitstream to be written to again. */
- -static inline void bs_realign( bs_t *s )
- -{
- - int offset = ((intptr_t)s->p & 3);
- - if( offset )
- - {
- - s->p = (uint8_t*)s->p - offset;
- - s->i_left = (WORD_SIZE - offset)*8;
- - s->cur_bits = endian_fix32( M32(s->p) );
- - s->cur_bits >>= (4-offset)*8;
- - }
- -}
- -
- -static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
- -{
- - if( WORD_SIZE == 8 )
- - {
- - s->cur_bits = (s->cur_bits << i_count) | i_bits;
- - s->i_left -= i_count;
- - if( s->i_left <= 32 )
- - {
- -#ifdef WORDS_BIGENDIAN
- - M32( s->p ) = s->cur_bits >> (32 - s->i_left);
- -#else
- - M32( s->p ) = endian_fix( s->cur_bits << s->i_left );
- -#endif
- - s->i_left += 32;
- - s->p += 4;
- - }
- - }
- - else
- - {
- - if( i_count < s->i_left )
- - {
- - s->cur_bits = (s->cur_bits << i_count) | i_bits;
- - s->i_left -= i_count;
- - }
- - else
- - {
- - i_count -= s->i_left;
- - s->cur_bits = (s->cur_bits << s->i_left) | (i_bits >> i_count);
- - M32( s->p ) = endian_fix( s->cur_bits );
- - s->p += 4;
- - s->cur_bits = i_bits;
- - s->i_left = 32 - i_count;
- - }
- - }
- -}
- -
- -/* Special case to eliminate branch in normal bs_write. */
- -/* Golomb never writes an even-size code, so this is only used in slice headers. */
- -static inline void bs_write32( bs_t *s, uint32_t i_bits )
- -{
- - bs_write( s, 16, i_bits >> 16 );
- - bs_write( s, 16, i_bits );
- -}
- -
- -static inline void bs_write1( bs_t *s, uint32_t i_bit )
- -{
- - s->cur_bits <<= 1;
- - s->cur_bits |= i_bit;
- - s->i_left--;
- - if( s->i_left == WORD_SIZE*8-32 )
- - {
- - M32( s->p ) = endian_fix32( s->cur_bits );
- - s->p += 4;
- - s->i_left = WORD_SIZE*8;
- - }
- -}
- -
- -static inline void bs_align_0( bs_t *s )
- -{
- - bs_write( s, s->i_left&7, 0 );
- - bs_flush( s );
- -}
- -static inline void bs_align_1( bs_t *s )
- -{
- - bs_write( s, s->i_left&7, (1 << (s->i_left&7)) - 1 );
- - bs_flush( s );
- -}
- -static inline void bs_align_10( bs_t *s )
- -{
- - if( s->i_left&7 )
- - bs_write( s, s->i_left&7, 1 << ( (s->i_left&7) - 1 ) );
- -}
- -
- -/* golomb functions */
- -
- -static const uint8_t x264_ue_size_tab[256] =
- -{
- - 1, 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7,
- - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- - 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
- - 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
- - 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
- - 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
- - 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
- - 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
- - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
- - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
- - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
- - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
- - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
- - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
- - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
- - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
- -};
- -
- -static inline void bs_write_ue_big( bs_t *s, unsigned int val )
- -{
- - int size = 0;
- - int tmp = ++val;
- - if( tmp >= 0x10000 )
- - {
- - size = 32;
- - tmp >>= 16;
- - }
- - if( tmp >= 0x100 )
- - {
- - size += 16;
- - tmp >>= 8;
- - }
- - size += x264_ue_size_tab[tmp];
- - bs_write( s, size>>1, 0 );
- - bs_write( s, (size>>1)+1, val );
- -}
- -
- -/* Only works on values under 255. */
- -static inline void bs_write_ue( bs_t *s, int val )
- -{
- - bs_write( s, x264_ue_size_tab[val+1], val+1 );
- -}
- -
- -static inline void bs_write_se( bs_t *s, int val )
- -{
- - int size = 0;
- - /* Faster than (val <= 0 ? -val*2+1 : val*2) */
- - /* 4 instructions on x86, 3 on ARM */
- - int tmp = 1 - val*2;
- - if( tmp < 0 ) tmp = val*2;
- - val = tmp;
- -
- - if( tmp >= 0x100 )
- - {
- - size = 16;
- - tmp >>= 8;
- - }
- - size += x264_ue_size_tab[tmp];
- - bs_write( s, size, val );
- -}
- -
- -static inline void bs_write_te( bs_t *s, int x, int val )
- -{
- - if( x == 1 )
- - bs_write1( s, 1^val );
- - else //if( x > 1 )
- - bs_write_ue( s, val );
- -}
- -
- -static inline void bs_rbsp_trailing( bs_t *s )
- -{
- - bs_write1( s, 1 );
- - bs_write( s, s->i_left&7, 0 );
- -}
- -
- -static ALWAYS_INLINE int bs_size_ue( unsigned int val )
- -{
- - return x264_ue_size_tab[val+1];
- -}
- -
- -static ALWAYS_INLINE int bs_size_ue_big( unsigned int val )
- -{
- - if( val < 255 )
- - return x264_ue_size_tab[val+1];
- - else
- - return x264_ue_size_tab[(val+1)>>8] + 16;
- -}
- -
- -static ALWAYS_INLINE int bs_size_se( int val )
- -{
- - int tmp = 1 - val*2;
- - if( tmp < 0 ) tmp = val*2;
- - if( tmp < 256 )
- - return x264_ue_size_tab[tmp];
- - else
- - return x264_ue_size_tab[tmp>>8]+16;
- -}
- -
- -static ALWAYS_INLINE int bs_size_te( int x, int val )
- -{
- - if( x == 1 )
- - return 1;
- - else //if( x > 1 )
- - return x264_ue_size_tab[val+1];
- -}
- -
- -#endif
- diff --git a/common/common.c b/common/common.c
- index fccf2b0..2458f65 100644
- --- a/common/common.c
- +++ b/common/common.c
- @@ -1027,60 +1027,6 @@ void x264_picture_clean( x264_picture_t *pic )
- }
- /****************************************************************************
- - * x264_nal_encode:
- - ****************************************************************************/
- -int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_startcode )
- -{
- - uint8_t *src = nal->p_payload;
- - uint8_t *end = nal->p_payload + nal->i_payload;
- - uint8_t *orig_dst = dst;
- - int i_count = 0, size;
- -
- - if( b_annexb )
- - {
- - if( b_long_startcode )
- - *dst++ = 0x00;
- - *dst++ = 0x00;
- - *dst++ = 0x00;
- - *dst++ = 0x01;
- - }
- - else /* save room for size later */
- - dst += 4;
- -
- - /* nal header */
- - *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;
- -
- - while( src < end )
- - {
- - if( i_count == 2 && *src <= 0x03 )
- - {
- - *dst++ = 0x03;
- - i_count = 0;
- - }
- - if( *src == 0 )
- - i_count++;
- - else
- - i_count = 0;
- - *dst++ = *src++;
- - }
- - size = (dst - orig_dst) - 4;
- -
- - /* Write the size header for mp4/etc */
- - if( !b_annexb )
- - {
- - /* Size doesn't include the size of the header we're writing now. */
- - orig_dst[0] = size>>24;
- - orig_dst[1] = size>>16;
- - orig_dst[2] = size>> 8;
- - orig_dst[3] = size>> 0;
- - }
- -
- - return size+4;
- -}
- -
- -
- -
- -/****************************************************************************
- * x264_malloc:
- ****************************************************************************/
- void *x264_malloc( int i_size )
- diff --git a/common/common.h b/common/common.h
- index 539ea65..93712fe 100644
- --- a/common/common.h
- +++ b/common/common.h
- @@ -137,7 +137,7 @@ static const int x264_scan8[16+2*4+3] =
- */
- #include "x264.h"
- -#include "bs.h"
- +#include "bitstream.h"
- #include "set.h"
- #include "predict.h"
- #include "pixel.h"
- @@ -166,8 +166,6 @@ int64_t x264_mdate( void );
- * the encoding options */
- char *x264_param2string( x264_param_t *p, int b_res );
- -int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_startcode );
- -
- /* log */
- void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
- @@ -796,6 +794,7 @@ struct x264_t
- x264_zigzag_function_t zigzagf;
- x264_quant_function_t quantf;
- x264_deblock_function_t loopf;
- + x264_bitstream_function_t bsf;
- #ifdef HAVE_VISUALIZE
- struct visualize_t *visualize;
- diff --git a/common/x86/bitstream-a.asm b/common/x86/bitstream-a.asm
- new file mode 100644
- index 0000000..1fb4cea
- --- /dev/null
- +++ b/common/x86/bitstream-a.asm
- @@ -0,0 +1,112 @@
- +;*****************************************************************************
- +;* bitstream-a.asm: h264 encoder library
- +;*****************************************************************************
- +;* Copyright (C) 2010 x264 project
- +;*
- +;* Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
- +;*
- +;* This program is free software; you can redistribute it and/or modify
- +;* it under the terms of the GNU General Public License as published by
- +;* the Free Software Foundation; either version 2 of the License, or
- +;* (at your option) any later version.
- +;*
- +;* This program is distributed in the hope that it will be useful,
- +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
- +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- +;* GNU General Public License for more details.
- +;*
- +;* You should have received a copy of the GNU General Public License
- +;* along with this program; if not, write to the Free Software
- +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- +;*****************************************************************************
- +
- +%include "x86inc.asm"
- +%include "x86util.asm"
- +
- +SECTION .text
- +
- +;-----------------------------------------------------------------------------
- +; uint8_t *x264_nal_escape( uint8_t *dst, uint8_t *src, uint8_t *end )
- +;-----------------------------------------------------------------------------
- +
- +%macro NAL_LOOP 2
- +ALIGN 16
- +%1:
- + mova m0, [r1+r2] ; macro args: %1 = loop label, %2 = store instruction for dst; load mmsize source bytes
- + mova m1, m0
- +%if mmsize == 8
- + psrlq m0, 8 ; shift by one byte so each byte lines up with the one after it
- +%else
- + psrldq m0, 1
- +%endif
- + %2 [r0+r1], m1 ; copy the block to dst before it is checked
- + por m1, m0 ; result byte is zero only where a byte and the next are both zero (last byte pairs with the shifted-in zero)
- + pcmpeqb m1, m2 ; m2 is zeroed in NAL_ESCAPE (pxor m2, m2)
- + pmovmskb r3d, m1
- + test r3d, r3d
- + jnz .escape ; possible escape site: r1 is not advanced, so .escape redoes this block byte by byte
- + add r1, mmsize
- + jl %1 ; r1 is a negative offset from the end of src: loop while it is still below zero
- +%endmacro
- +
- +%macro NAL_ESCAPE 1
- +cglobal nal_escape_%1, 3,5
- + pxor m2, m2 ; m2 = all-zero bytes, the comparison operand in NAL_LOOP
- + sub r1, r2 ; r1 = offset of current src pointer from end of src
- + sub r0, r1 ; r0 = projected end of dst, assuming no more escapes
- +
- + mov r3w, [r1+r2] ; the first two bytes are copied unconditionally
- + mov [r0+r1], r3w
- + add r1, 2
- + jge .ret ; offset reached zero: nothing left to copy
- +
- + ; Start off by jumping into the escape loop in
- + ; case there's an escape at the start.
- + ; And do a few more in scalar until src is aligned again (r4d = bytes up to the next mmsize boundary of src).
- + lea r4d, [r1+r2]
- + or r4d, -mmsize
- + neg r4d
- + jmp .escapeloop
- +
- + NAL_LOOP .loop_aligned, mova
- +%if mmsize==16
- + jmp .ret ; aligned loop consumed all input: must not fall through into .loop_unaligned, which would read and write one block past the end
- + NAL_LOOP .loop_unaligned, movu
- +%endif
- +
- +.ret:
- + movifnidn rax, r0 ; return the end-of-dst pointer (r0 was bumped once per inserted escape byte)
- + RET
- +ALIGN 16
- +.escape:
- + mov r4d, mmsize ; handle one full block in scalar code
- +.escapeloop:
- + mov r3b, [r1+r2]
- + cmp r3b, 3
- + jna .escape_check ; only bytes <= 3 can need an escape in front of them
- +.copy:
- + mov [r0+r1], r3b
- + inc r1
- + jge .ret ; end of src reached
- + dec r4d
- + jg .escapeloop
- + cmp byte [r1+r2-1], 0 ; Don't go back to the main loop until we're out of a zero-run.
- + jz .escape
- +%if mmsize==16
- + lea r4d, [r0+r1]
- + test r4d, mmsize-1 ; pick the store variant according to the alignment of the current dst address
- + jnz .loop_unaligned
- +%endif
- + jmp .loop_aligned
- +.escape_check:
- + cmp word [r0+r1-2], 0 ; were the two bytes just written to dst both zero?
- + jnz .copy
- + mov byte [r0+r1], 3 ; insert the escape byte
- + inc r0 ; every later dst write (and the returned end pointer) shifts by one
- + jmp .copy
- +%endmacro
- +
- +INIT_MMX
- +NAL_ESCAPE mmxext
- +INIT_XMM
- +NAL_ESCAPE sse2
- diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
- index aedd688..3a31e26 100644
- --- a/common/x86/deblock-a.asm
- +++ b/common/x86/deblock-a.asm
- @@ -4,6 +4,7 @@
- ;* Copyright (C) 2005-2008 x264 project
- ;*
- ;* Authors: Loren Merritt <lorenm@u.washington.edu>
- +;* Jason Garrett-Glaser <darkshikari@gmail.com>
- ;*
- ;* This program is free software; you can redistribute it and/or modify
- ;* it under the terms of the GNU General Public License as published by
- diff --git a/encoder/encoder.c b/encoder/encoder.c
- index 6e0dc54..32db82a 100644
- --- a/encoder/encoder.c
- +++ b/encoder/encoder.c
- @@ -986,6 +986,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
- x264_mc_init( h->param.cpu, &h->mc );
- x264_quant_init( h, h->param.cpu, &h->quantf );
- x264_deblock_init( h->param.cpu, &h->loopf );
- + x264_bitstream_init( h->param.cpu, &h->bsf );
- x264_dct_init_weights();
- mbcmp_init( h );
- @@ -1272,7 +1273,7 @@ static int x264_encoder_encapsulate_nals( x264_t *h, int start )
- for( int i = start; i < h->out.i_nal; i++ )
- {
- int long_startcode = !i || h->out.nal[i].i_type == NAL_SPS || h->out.nal[i].i_type == NAL_PPS;
- - int size = x264_nal_encode( nal_buffer, &h->out.nal[i], h->param.b_annexb, long_startcode );
- + int size = x264_nal_encode( h, nal_buffer, &h->out.nal[i], long_startcode );
- h->out.nal[i].i_payload = size;
- h->out.nal[i].p_payload = nal_buffer;
- nal_buffer += size;
- diff --git a/tools/checkasm.c b/tools/checkasm.c
- index a0a9d54..ea6f209 100644
- --- a/tools/checkasm.c
- +++ b/tools/checkasm.c
- @@ -1661,6 +1661,55 @@ static int check_cabac( int cpu_ref, int cpu_new )
- return ret;
- }
- +static int check_bitstream( int cpu_ref, int cpu_new ) /* Compare the nal_escape chosen for cpu_new against the one chosen with cpu flags 0. */
- +{
- + x264_bitstream_function_t bs_c;
- + x264_bitstream_function_t bs_ref;
- + x264_bitstream_function_t bs_a;
- +
- + int ret = 0, ok = 1, used_asm = 0;
- +
- + x264_bitstream_init( 0, &bs_c );
- + x264_bitstream_init( cpu_ref, &bs_ref );
- + x264_bitstream_init( cpu_new, &bs_a );
- + if( bs_a.nal_escape != bs_ref.nal_escape ) /* only test when cpu_new selects a different function than cpu_ref */
- + {
- + int size = 0x4000;
- + uint8_t *input = malloc(size+100); /* NOTE(review): none of the three allocations is checked for NULL */
- + uint8_t *output1 = malloc(size*2);
- + uint8_t *output2 = malloc(size*2);
- + used_asm = 1;
- + set_func_name( "nal_escape" );
- + for( int i = 0; i < 100; i++ )
- + {
- + /* Test corner-case sizes */
- + int test_size = i < 10 ? i+1 : rand() & 0x3fff; /* sizes 1..10 first, then random sizes in 0..0x3fff */
- + for( int j = 0; j < test_size; j++ )
- + input[j] = (rand()&1) * rand(); /* about half of the bytes are forced to zero */
- + uint8_t *end_c = (uint8_t*)call_c1( bs_c.nal_escape, output1, input, input+test_size );
- + uint8_t *end_a = (uint8_t*)call_a1( bs_a.nal_escape, output2, input, input+test_size );
- + int size_c = end_c-output1; /* the returned pointer marks the end of the written output */
- + int size_a = end_a-output2;
- + if( size_c != size_a || memcmp( output1, output2, size_c ) ) /* both the length and the bytes must match */
- + {
- + fprintf( stderr, "nal_escape : [FAILED] %d %d\n", size_c, size_a );
- + ok = 0;
- + break;
- + }
- + }
- + for( int j = 0; j < size; j++ )
- + input[j] = rand();
- + call_c2( bs_c.nal_escape, output1, input, input+size ); /* NOTE(review): presumably the timing pass; results are not compared -- confirm against the call_*2 macros */
- + call_a2( bs_a.nal_escape, output2, input, input+size );
- + free(input);
- + free(output1);
- + free(output2);
- + }
- + report( "nal escape:" ); /* NOTE(review): ret, ok and used_asm are never read directly in this function; presumably this macro consumes them -- confirm */
- +
- + return ret;
- +}
- +
- static int check_all_funcs( int cpu_ref, int cpu_new )
- {
- return check_pixel( cpu_ref, cpu_new )
- @@ -1669,7 +1718,8 @@ static int check_all_funcs( int cpu_ref, int cpu_new )
- + check_intra( cpu_ref, cpu_new )
- + check_deblock( cpu_ref, cpu_new )
- + check_quant( cpu_ref, cpu_new )
- - + check_cabac( cpu_ref, cpu_new );
- + + check_cabac( cpu_ref, cpu_new )
- + + check_bitstream( cpu_ref, cpu_new );
- }
- static int add_flags( int *cpu_ref, int *cpu_new, int flags, const char *name )
- --
- 1.7.0.4
- From 790c0bcb4d96894969ab3dab6df670eafcbbcd85 Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Fri, 28 May 2010 14:30:07 -0700
- Subject: [PATCH 07/10] Re-enable i8x8 merged SATD
- Accidentally got disabled when intra_sad_x3 was added.
- ---
- encoder/encoder.c | 1 +
- 1 files changed, 1 insertions(+), 0 deletions(-)
- diff --git a/encoder/encoder.c b/encoder/encoder.c
- index 32db82a..2f9e7f6 100644
- --- a/encoder/encoder.c
- +++ b/encoder/encoder.c
- @@ -810,6 +810,7 @@ static void mbcmp_init( x264_t *h )
- memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) );
- h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16;
- h->pixf.intra_mbcmp_x3_8x8c = satd ? h->pixf.intra_satd_x3_8x8c : h->pixf.intra_sad_x3_8x8c;
- + h->pixf.intra_mbcmp_x3_8x8 = satd ? h->pixf.intra_sa8d_x3_8x8 : h->pixf.intra_sad_x3_8x8;
- h->pixf.intra_mbcmp_x3_4x4 = satd ? h->pixf.intra_satd_x3_4x4 : h->pixf.intra_sad_x3_4x4;
- satd &= h->param.analyse.i_me_method == X264_ME_TESA;
- memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) );
- --
- 1.7.0.4
- From 6e549ed124a0a84d77c51baa39984fb36ab49123 Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Fri, 28 May 2010 14:27:22 -0700
- Subject: [PATCH 08/10] Add API tool to apply arbitrary quantizer offsets
- The calling application can now pass a "map" of quantizer offsets to apply to each frame.
- An optional callback to free the map can also be included.
- This allows all kinds of flexible region-of-interest coding and similar.
- ---
- common/common.c | 2 +-
- encoder/encoder.c | 7 +++++--
- encoder/ratecontrol.c | 36 +++++++++++++++++++++++++-----------
- encoder/ratecontrol.h | 4 ++--
- x264.h | 20 +++++++++++++++++++-
- 5 files changed, 52 insertions(+), 17 deletions(-)
- diff --git a/common/common.c b/common/common.c
- index 2458f65..48e1bbc 100644
- --- a/common/common.c
- +++ b/common/common.c
- @@ -998,6 +998,7 @@ static void x264_log_default( void *p_unused, int i_level, const char *psz_fmt,
- ****************************************************************************/
- int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height )
- {
- + memset( pic, 0, sizeof( x264_picture_t ) );
- pic->i_type = X264_TYPE_AUTO;
- pic->i_qpplus1 = 0;
- pic->img.i_csp = i_csp;
- @@ -1010,7 +1011,6 @@ int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_heigh
- pic->img.i_stride[0] = i_width;
- pic->img.i_stride[1] = i_width / 2;
- pic->img.i_stride[2] = i_width / 2;
- - pic->param = NULL;
- pic->i_pic_struct = PIC_STRUCT_AUTO;
- return 0;
- }
- diff --git a/encoder/encoder.c b/encoder/encoder.c
- index 2f9e7f6..89107a3 100644
- --- a/encoder/encoder.c
- +++ b/encoder/encoder.c
- @@ -2250,11 +2250,14 @@ int x264_encoder_encode( x264_t *h,
- if( h->param.rc.b_mb_tree && h->param.rc.b_stat_read )
- {
- - if( x264_macroblock_tree_read( h, fenc ) )
- + if( x264_macroblock_tree_read( h, fenc, pic_in->prop.quant_offsets ) )
- return -1;
- }
- else
- - x264_adaptive_quant_frame( h, fenc );
- + x264_adaptive_quant_frame( h, fenc, pic_in->prop.quant_offsets );
- +
- + if( pic_in->prop.quant_offsets_free )
- + pic_in->prop.quant_offsets_free( pic_in->prop.quant_offsets );
- if( h->frames.b_have_lowres )
- x264_frame_init_lowres( h, fenc );
- diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
- index bf0a400..d09de98 100644
- --- a/encoder/ratecontrol.c
- +++ b/encoder/ratecontrol.c
- @@ -235,7 +235,7 @@ static NOINLINE uint32_t x264_ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_
- return var;
- }
- -void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
- +void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_offsets )
- {
- /* constants chosen to result in approximately the same overall bitrate as without AQ.
- * FIXME: while they're written in 5 significant digits, they're only tuned to 2. */
- @@ -256,11 +256,22 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
- /* Need to init it anyways for MB tree */
- if( h->param.rc.f_aq_strength == 0 )
- {
- - memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
- - memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
- - if( h->frames.b_have_lowres )
- + if( quant_offsets )
- + {
- for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
- - frame->i_inv_qscale_factor[mb_xy] = 256;
- + frame->f_qp_offset[mb_xy] = frame->f_qp_offset_aq[mb_xy] = quant_offsets[mb_xy];
- + if( h->frames.b_have_lowres )
- + for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
- + frame->i_inv_qscale_factor[mb_xy] = x264_exp2fix8( frame->f_qp_offset[mb_xy] );
- + }
- + else
- + {
- + memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
- + memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
- + if( h->frames.b_have_lowres )
- + for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
- + frame->i_inv_qscale_factor[mb_xy] = 256;
- + }
- }
- /* Need variance data for weighted prediction */
- if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
- @@ -299,9 +310,10 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
- for( int mb_x = 0; mb_x < width; mb_x++ )
- {
- float qp_adj;
- + int mb_xy = mb_x + mb_y*h->mb.i_mb_stride;
- if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
- {
- - qp_adj = frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride];
- + qp_adj = frame->f_qp_offset[mb_xy];
- qp_adj = strength * (qp_adj - avg_adj);
- }
- else
- @@ -309,10 +321,12 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
- uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
- qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - 14.427f);
- }
- - frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] =
- - frame->f_qp_offset_aq[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
- + if( quant_offsets )
- + qp_adj += quant_offsets[mb_xy];
- + frame->f_qp_offset[mb_xy] =
- + frame->f_qp_offset_aq[mb_xy] = qp_adj;
- if( h->frames.b_have_lowres )
- - frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj);
- + frame->i_inv_qscale_factor[mb_xy] = x264_exp2fix8(qp_adj);
- }
- }
- @@ -327,7 +341,7 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
- }
- }
- -int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame )
- +int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame, float *quant_offsets )
- {
- x264_ratecontrol_t *rc = h->rc;
- uint8_t i_type_actual = rc->entry[frame->i_frame].pict_type;
- @@ -363,7 +377,7 @@ int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame )
- rc->qpbuf_pos--;
- }
- else
- - x264_adaptive_quant_frame( h, frame );
- + x264_adaptive_quant_frame( h, frame, quant_offsets );
- return 0;
- fail:
- x264_log(h, X264_LOG_ERROR, "Incomplete MB-tree stats file.\n");
- diff --git a/encoder/ratecontrol.h b/encoder/ratecontrol.h
- index e052b2a..dd139eb 100644
- --- a/encoder/ratecontrol.h
- +++ b/encoder/ratecontrol.h
- @@ -29,8 +29,8 @@ void x264_ratecontrol_delete( x264_t * );
- void x264_ratecontrol_init_reconfigurable( x264_t *h, int b_init );
- -void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame );
- -int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame );
- +void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_offsets );
- +int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame, float *quant_offsets );
- int x264_reference_build_list_optimal( x264_t *h );
- void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next );
- void x264_ratecontrol_start( x264_t *, int i_force_qp, int overhead );
- diff --git a/x264.h b/x264.h
- index 95efd88..a4b3400 100644
- --- a/x264.h
- +++ b/x264.h
- @@ -35,7 +35,7 @@
- #include <stdarg.h>
- -#define X264_BUILD 96
- +#define X264_BUILD 97
- /* x264_t:
- * opaque handler for encoder */
- @@ -508,6 +508,22 @@ typedef struct
- typedef struct
- {
- + /* In: an array of quantizer offsets to be applied to this image during encoding.
- + * These are added on top of the decisions made by x264.
- + * Offsets can be fractional; they are added before QPs are rounded to integer.
- + * Adaptive quantization must be enabled to use this feature. Behavior if quant
- + * offsets differ between encoding passes is undefined.
- + *
- + * Array contains one offset per macroblock, in raster scan order. In interlaced
- + * mode, top-field MBs and bottom-field MBs are interleaved at the row level. */
- + float *quant_offsets;
- + /* In: optional callback to free quant_offsets when used.
- + * Useful if one wants to use a different quant_offset array for each frame. */
- + void (*quant_offsets_free)( void* );
- +} x264_image_properties_t;
- +
- +typedef struct
- +{
- /* In: force picture type (if not auto)
- * If x264 encoding parameters are violated in the forcing of picture types,
- * x264 will correct the input picture type and log a warning.
- @@ -537,6 +553,8 @@ typedef struct
- x264_param_t *param;
- /* In: raw data */
- x264_image_t img;
- + /* In: optional information to modify encoder decisions for this frame */
- + x264_image_properties_t prop;
- /* Out: HRD timing information. Output only when i_nal_hrd is set. */
- x264_hrd_t hrd_timing;
- /* private user data. libx264 doesn't touch this,
- --
- 1.7.0.4
- From ef05902684b7f2fdfcb07b900740b61248a097e1 Mon Sep 17 00:00:00 2001
- From: Henrik Gramner <hengar-6@student.ltu.se>
- Date: Thu, 27 May 2010 22:18:38 +0200
- Subject: [PATCH 09/10] Optimize out some x264_scan8 reads
- ---
- encoder/analyse.c | 15 ++++-----
- encoder/macroblock.c | 82 ++++++++++++++++++++++++++++++--------------------
- encoder/me.c | 25 ++++++++-------
- 3 files changed, 70 insertions(+), 52 deletions(-)
- diff --git a/encoder/analyse.c b/encoder/analyse.c
- index a128a70..9e85e89 100644
- --- a/encoder/analyse.c
- +++ b/encoder/analyse.c
- @@ -907,8 +907,6 @@ static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
- static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
- {
- uint8_t *p_dst = h->mb.pic.p_fdec[0];
- -
- - int x, y;
- uint64_t i_satd, i_best;
- h->mb.i_skip_intra = 0;
- @@ -1031,8 +1029,9 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
- int i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
- i_best = COST_MAX64;
- - x = idx&1;
- - y = idx>>1;
- + int x = idx&1;
- + int y = idx>>1;
- + int s8 = X264_SCAN8_0 + 2*x + 16*y;
- p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
- predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
- @@ -1061,8 +1060,8 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
- if( !(idx&1) )
- for( int j = 0; j < 7; j++ )
- pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
- - i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
- - i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
- + i_nnz[0] = M16( &h->mb.cache.non_zero_count[s8 + 0*8] );
- + i_nnz[1] = M16( &h->mb.cache.non_zero_count[s8 + 1*8] );
- }
- }
- a->i_cbp_i8x8_luma = cbp_luma_new;
- @@ -1070,8 +1069,8 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
- if( !(idx&1) )
- for( int j = 0; j < 7; j++ )
- p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
- - M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
- - M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];
- + M16( &h->mb.cache.non_zero_count[s8 + 0*8] ) = i_nnz[0];
- + M16( &h->mb.cache.non_zero_count[s8 + 1*8] ) = i_nnz[1];
- x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
- }
- diff --git a/encoder/macroblock.c b/encoder/macroblock.c
- index 984f8a8..cdc4563 100644
- --- a/encoder/macroblock.c
- +++ b/encoder/macroblock.c
- @@ -135,11 +135,12 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
- }
- }
- -#define STORE_8x8_NNZ(idx,nz)\
- +#define STORE_8x8_NNZ( s8, nz )\
- +do\
- {\
- - M16( &h->mb.cache.non_zero_count[x264_scan8[idx*4+0]] ) = nz * 0x0101;\
- - M16( &h->mb.cache.non_zero_count[x264_scan8[idx*4+2]] ) = nz * 0x0101;\
- -}
- + M16( &h->mb.cache.non_zero_count[(s8) + 0*8] ) = (nz) * 0x0101;\
- + M16( &h->mb.cache.non_zero_count[(s8) + 1*8] ) = (nz) * 0x0101;\
- +} while(0)
- #define CLEAR_16x16_NNZ \
- {\
- @@ -151,17 +152,18 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
- void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
- {
- - int x = 8 * (idx&1);
- - int y = 8 * (idx>>1);
- + int x = idx&1;
- + int y = idx>>1;
- + int s8 = X264_SCAN8_0 + 2*x + 16*y;
- int nz;
- - uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
- - uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
- + uint8_t *p_src = &h->mb.pic.p_fenc[0][8*x + 8*y*FENC_STRIDE];
- + uint8_t *p_dst = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
- ALIGNED_ARRAY_16( int16_t, dct8x8,[64] );
- if( h->mb.b_lossless )
- {
- nz = h->zigzagf.sub_8x8( h->dct.luma8x8[idx], p_src, p_dst );
- - STORE_8x8_NNZ(idx,nz);
- + STORE_8x8_NNZ( s8, nz );
- h->mb.i_cbp_luma |= nz<<idx;
- return;
- }
- @@ -175,10 +177,10 @@ void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
- h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8 );
- h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qp );
- h->dctf.add8x8_idct8( p_dst, dct8x8 );
- - STORE_8x8_NNZ(idx,1);
- + STORE_8x8_NNZ( s8, 1 );
- }
- else
- - STORE_8x8_NNZ(idx,0);
- + STORE_8x8_NNZ( s8, 0 );
- }
- static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
- @@ -728,12 +730,13 @@ void x264_macroblock_encode( x264_t *h )
- if( h->mb.b_transform_8x8 )
- for( int i8x8 = 0; i8x8 < 4; i8x8++ )
- {
- - int x = 8*(i8x8&1);
- - int y = 8*(i8x8>>1);
- - nz = h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8],
- - h->mb.pic.p_fenc[0]+x+y*FENC_STRIDE,
- - h->mb.pic.p_fdec[0]+x+y*FDEC_STRIDE );
- - STORE_8x8_NNZ(i8x8,nz);
- + int x = i8x8&1;
- + int y = i8x8>>1;
- + int s8 = X264_SCAN8_0 + 2*x + 16*y;
- +
- + nz = h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8], h->mb.pic.p_fenc[0] + 8*x + 8*y*FENC_STRIDE,
- + h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE );
- + STORE_8x8_NNZ( s8, nz );
- h->mb.i_cbp_luma |= nz << i8x8;
- }
- else
- @@ -783,14 +786,18 @@ void x264_macroblock_encode( x264_t *h )
- {
- for( int idx = 0; idx < 4; idx++ )
- {
- + int x = idx&1;
- + int y = idx>>1;
- + int s8 = X264_SCAN8_0 + 2*x + 16*y;
- +
- if( h->mb.i_cbp_luma&(1<<idx) )
- {
- h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp );
- - h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][(idx&1)*8 + (idx>>1)*8*FDEC_STRIDE], dct8x8[idx] );
- - STORE_8x8_NNZ(idx,1);
- + h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE], dct8x8[idx] );
- + STORE_8x8_NNZ( s8, 1 );
- }
- else
- - STORE_8x8_NNZ(idx,0);
- + STORE_8x8_NNZ( s8, 0 );
- }
- }
- }
- @@ -825,18 +832,24 @@ void x264_macroblock_encode( x264_t *h )
- }
- }
- + int x = i8x8&1;
- + int y = i8x8>>1;
- +
- /* decimate this 8x8 block */
- i_decimate_mb += i_decimate_8x8;
- if( b_decimate )
- {
- if( i_decimate_8x8 < 4 )
- - STORE_8x8_NNZ(i8x8,0)
- + {
- + int s8 = X264_SCAN8_0 + 2*x + 16*y;
- + STORE_8x8_NNZ( s8, 0 );
- + }
- else
- h->mb.i_cbp_luma |= 1<<i8x8;
- }
- else if( cbp )
- {
- - h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
- + h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE], &dct4x4[i8x8*4] );
- h->mb.i_cbp_luma |= 1<<i8x8;
- }
- }
- @@ -1045,8 +1058,11 @@ void x264_noise_reduction_update( x264_t *h )
- void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
- {
- int i_qp = h->mb.i_qp;
- - uint8_t *p_fenc = h->mb.pic.p_fenc[0] + (i8&1)*8 + (i8>>1)*8*FENC_STRIDE;
- - uint8_t *p_fdec = h->mb.pic.p_fdec[0] + (i8&1)*8 + (i8>>1)*8*FDEC_STRIDE;
- + int x = i8&1;
- + int y = i8>>1;
- + int s8 = X264_SCAN8_0 + 2*x + 16*y;
- + uint8_t *p_fenc = h->mb.pic.p_fenc[0] + 8*x + 8*y*FENC_STRIDE;
- + uint8_t *p_fdec = h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE;
- int b_decimate = h->mb.b_dct_decimate;
- int nnz8x8 = 0;
- int nz;
- @@ -1059,7 +1075,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
- if( h->mb.b_transform_8x8 )
- {
- nnz8x8 = h->zigzagf.sub_8x8( h->dct.luma8x8[i8], p_fenc, p_fdec );
- - STORE_8x8_NNZ(i8,nnz8x8);
- + STORE_8x8_NNZ( s8, nnz8x8 );
- }
- else
- {
- @@ -1075,8 +1091,8 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
- for( int ch = 0; ch < 2; ch++ )
- {
- int16_t dc;
- - p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
- - p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
- + p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE;
- + p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE;
- nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i8+ch*4], p_fenc, p_fdec, &dc );
- h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = nz;
- }
- @@ -1099,13 +1115,13 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
- {
- h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp );
- h->dctf.add8x8_idct8( p_fdec, dct8x8 );
- - STORE_8x8_NNZ(i8,1);
- + STORE_8x8_NNZ( s8, 1 );
- }
- else
- - STORE_8x8_NNZ(i8,0);
- + STORE_8x8_NNZ( s8, 0 );
- }
- else
- - STORE_8x8_NNZ(i8,0);
- + STORE_8x8_NNZ( s8, 0 );
- }
- else
- {
- @@ -1132,7 +1148,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
- if( nnz8x8 )
- h->dctf.add8x8_idct( p_fdec, dct4x4 );
- else
- - STORE_8x8_NNZ(i8,0);
- + STORE_8x8_NNZ( s8, 0 );
- }
- i_qp = h->mb.i_chroma_qp;
- @@ -1140,8 +1156,8 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
- for( int ch = 0; ch < 2; ch++ )
- {
- ALIGNED_ARRAY_16( int16_t, dct4x4,[16] );
- - p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
- - p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
- + p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE;
- + p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE;
- h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
- dct4x4[0] = 0;
- diff --git a/encoder/me.c b/encoder/me.c
- index 77073cc..40d0650 100644
- --- a/encoder/me.c
- +++ b/encoder/me.c
- @@ -937,8 +937,11 @@ int x264_iter_kludge = 0;
- static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2, int rd )
- {
- - int16_t *cache0_mv = h->mb.cache.mv[0][x264_scan8[i8*4]];
- - int16_t *cache1_mv = h->mb.cache.mv[1][x264_scan8[i8*4]];
- + int x = i8&1;
- + int y = i8>>1;
- + int s8 = X264_SCAN8_0 + 2*x + 16*y;
- + int16_t *cache0_mv = h->mb.cache.mv[0][s8];
- + int16_t *cache1_mv = h->mb.cache.mv[1][s8];
- const int i_pixel = m0->i_pixel;
- const int bw = x264_pixel_size[i_pixel].w;
- const int bh = x264_pixel_size[i_pixel].h;
- @@ -946,11 +949,11 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
- ALIGNED_ARRAY_8( uint8_t, pixu_buf,[2],[9][8*8] );
- ALIGNED_ARRAY_8( uint8_t, pixv_buf,[2],[9][8*8] );
- uint8_t *src[2][9];
- - uint8_t *pix = &h->mb.pic.p_fdec[0][(i8>>1)*8*FDEC_STRIDE+(i8&1)*8];
- - uint8_t *pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
- - uint8_t *pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
- - const int ref0 = h->mb.cache.ref[0][x264_scan8[i8*4]];
- - const int ref1 = h->mb.cache.ref[1][x264_scan8[i8*4]];
- + uint8_t *pix = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
- + uint8_t *pixu = &h->mb.pic.p_fdec[1][4*x + 4*y*FDEC_STRIDE];
- + uint8_t *pixv = &h->mb.pic.p_fdec[2][4*x + 4*y*FDEC_STRIDE];
- + int ref0 = h->mb.cache.ref[0][s8];
- + int ref1 = h->mb.cache.ref[1][s8];
- const int mv0y_offset = h->mb.b_interlaced & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
- const int mv1y_offset = h->mb.b_interlaced & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
- int stride[2][9];
- @@ -1058,13 +1061,13 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
- if( rd )
- {
- - x264_macroblock_cache_mv ( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 0, pack16to32_mask(bm0x, bm0y) );
- + x264_macroblock_cache_mv ( h, 2*x, 2*y, bw>>2, bh>>2, 0, pack16to32_mask(bm0x, bm0y) );
- amvd = pack8to16( X264_MIN(abs(bm0x - m0->mvp[0]),33), X264_MIN(abs(bm0y - m0->mvp[1]),33) );
- - x264_macroblock_cache_mvd( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 0, amvd );
- + x264_macroblock_cache_mvd( h, 2*x, 2*y, bw>>2, bh>>2, 0, amvd );
- - x264_macroblock_cache_mv ( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 1, pack16to32_mask(bm1x, bm1y) );
- + x264_macroblock_cache_mv ( h, 2*x, 2*y, bw>>2, bh>>2, 1, pack16to32_mask(bm1x, bm1y) );
- amvd = pack8to16( X264_MIN(abs(bm1x - m1->mvp[0]),33), X264_MIN(abs(bm1y - m1->mvp[1]),33) );
- - x264_macroblock_cache_mvd( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 1, amvd );
- + x264_macroblock_cache_mvd( h, 2*x, 2*y, bw>>2, bh>>2, 1, amvd );
- }
- m0->mv[0] = bm0x;
- --
- 1.7.0.4
- From c949405e834a2cbe35f3fb460eae061447dc386b Mon Sep 17 00:00:00 2001
- From: Henrik Gramner <hengar-6@student.ltu.se>
- Date: Sun, 30 May 2010 22:45:14 +0200
- Subject: [PATCH 10/10] Some deblocking-related optimizations
- ---
- common/deblock.c | 8 ++++----
- common/macroblock.c | 43 +++++++++++++++++++++++--------------------
- 2 files changed, 27 insertions(+), 24 deletions(-)
- diff --git a/common/deblock.c b/common/deblock.c
- index 27c73ae..3296dbf 100644
- --- a/common/deblock.c
- +++ b/common/deblock.c
- @@ -299,7 +299,7 @@ static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2,
- void x264_frame_deblock_row( x264_t *h, int mb_y )
- {
- int b_interlaced = h->sh.b_mbaff;
- - int qp_thresh = 15 - X264_MIN(h->sh.i_alpha_c0_offset, h->sh.i_beta_offset) - X264_MAX(0, h->param.analyse.i_chroma_qp_offset);
- + int qp_thresh = 15 - X264_MIN( h->sh.i_alpha_c0_offset, h->sh.i_beta_offset ) - X264_MAX( 0, h->param.analyse.i_chroma_qp_offset );
- int stridey = h->fdec->i_stride[0];
- int stride2y = stridey << b_interlaced;
- int strideuv = h->fdec->i_stride[1];
- @@ -318,7 +318,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
- uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
- uint8_t *pixu = h->fdec->plane[1] + 8*mb_y*strideuv + 8*mb_x;
- uint8_t *pixv = h->fdec->plane[2] + 8*mb_y*strideuv + 8*mb_x;
- - if( b_interlaced && (mb_y&1) )
- + if( mb_y & b_interlaced )
- {
- pixy -= 15*stridey;
- pixu -= 7*strideuv;
- @@ -366,12 +366,12 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
- int qp_top = (qp + qpt + 1) >> 1;
- int qpc_top = (h->chroma_qp_table[qp] + h->chroma_qp_table[qpt] + 1) >> 1;
- int intra_top = IS_INTRA( h->mb.type[h->mb.i_mb_top_xy] );
- - if( !b_interlaced && (intra_cur || intra_top) )
- + if( ~b_interlaced & (intra_cur | intra_top) )
- FILTER( _intra, 1, 0, qp_top, qpc_top );
- else
- {
- if( intra_top )
- - memset( bs[1][0], 3, sizeof(bs[1][0]) );
- + M32( bs[1][0] ) = 0x03030303;
- FILTER( , 1, 0, qp_top, qpc_top );
- }
- }
- diff --git a/common/macroblock.c b/common/macroblock.c
- index 1b2d37b..7180e8f 100644
- --- a/common/macroblock.c
- +++ b/common/macroblock.c
- @@ -400,9 +400,27 @@ void x264_macroblock_slice_init( x264_t *h )
- }
- }
- }
- - if( h->sh.i_type == SLICE_TYPE_P )
- + else if( h->sh.i_type == SLICE_TYPE_P )
- + {
- memset( h->mb.cache.skip, 0, sizeof( h->mb.cache.skip ) );
- + if( h->sh.i_disable_deblocking_filter_idc != 1 && h->param.analyse.i_weighted_pred )
- + {
- + deblock_ref_table(-2) = -2;
- + deblock_ref_table(-1) = -1;
- + for( int i = 0; i < h->i_ref0 << h->sh.b_mbaff; i++ )
- + {
- + /* Mask off high bits to avoid frame num collisions with -1/-2.
- + * In current x264, frame num values don't cover a range of more
- + * than 32, so 6 bits is enough for uniqueness. */
- + if( !h->mb.b_interlaced )
- + deblock_ref_table(i) = h->fref0[i]->i_frame_num&63;
- + else
- + deblock_ref_table(i) = ((h->fref0[i>>1]->i_frame_num&63)<<1) + (i&1);
- + }
- + }
- + }
- +
- /* init with not available (for top right idx=7,15) */
- memset( h->mb.cache.ref, -2, sizeof( h->mb.cache.ref ) );
- @@ -418,19 +436,6 @@ void x264_macroblock_slice_init( x264_t *h )
- h->fdec->inv_ref_poc[field] = (256 + delta/2) / delta;
- }
- - deblock_ref_table(-2) = -2;
- - deblock_ref_table(-1) = -1;
- - for( int i = 0; i < h->i_ref0 << h->sh.b_mbaff; i++ )
- - {
- - /* Mask off high bits to avoid frame num collisions with -1/-2.
- - * In current x264 frame num values don't cover a range of more
- - * than 32, so 6 bits is enough for uniqueness. */
- - if( !h->mb.b_interlaced )
- - deblock_ref_table(i) = h->fref0[i]->i_frame_num&63;
- - else
- - deblock_ref_table(i) = ((h->fref0[i>>1]->i_frame_num&63)<<1) + (i&1);
- - }
- -
- h->mb.i_neighbour4[6] =
- h->mb.i_neighbour4[9] =
- h->mb.i_neighbour4[12] =
- @@ -894,7 +899,6 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
- void x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_y )
- {
- int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2;
- - int top = (mb_y - (1 << h->mb.b_interlaced)) * h->mb.i_mb_stride + mb_x;
- h->mb.i_neighbour = 0;
- h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
- @@ -906,9 +910,9 @@ void x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_
- h->mb.i_neighbour |= MB_LEFT;
- }
- - if( top >= 0 )
- + if( mb_y > h->mb.b_interlaced )
- {
- - h->mb.i_mb_top_xy = top;
- + h->mb.i_mb_top_xy = h->mb.i_mb_xy - (h->mb.i_mb_stride << h->mb.b_interlaced);
- if( deblock_on_slice_edges || h->mb.slice_table[h->mb.i_mb_top_xy] == h->mb.slice_table[h->mb.i_mb_xy] )
- h->mb.i_neighbour |= MB_TOP;
- }
- @@ -930,8 +934,6 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
- h->mb.i_neighbour &= ~old_neighbour;
- if( h->mb.i_neighbour )
- {
- - int left = h->mb.i_mb_left_xy;
- - int top = h->mb.i_mb_top_xy;
- int top_y = mb_y - (1 << h->mb.b_interlaced);
- int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
- int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;
- @@ -941,10 +943,11 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
- uint8_t (*nnz)[24] = h->mb.non_zero_count;
- if( h->mb.i_neighbour & MB_TOP )
- - CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[top][12] );
- + CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[h->mb.i_mb_top_xy][12] );
- if( h->mb.i_neighbour & MB_LEFT )
- {
- + int left = h->mb.i_mb_left_xy;
- h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left][3];
- h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left][7];
- h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left][11];
- --
- 1.7.0.4
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement