Advertisement
Guest User

Untitled

a guest
May 28th, 2017
599
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Diff 86.55 KB | None | 0 0
  1. From 2bcbac357b714f468e0138f022e584ffdb42f6d2 Mon Sep 17 00:00:00 2001
  2. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  3. Date: Mon, 31 May 2010 11:14:22 -0700
  4. Subject: [PATCH 01/11] Fix cavlc+deblock+8x8dct (regression in r1612)
  5.  Add cavlc+8x8dct munging to new deblock system.
  6.  May have caused minor visual artifacts.
  7.  
  8. ---
  9. common/deblock.c    |   47 -----------------------------------------------
  10.  common/macroblock.c |   46 ++++++++++++++++++++++++++++++++++++++++++++--
  11.  2 files changed, 44 insertions(+), 49 deletions(-)
  12.  
  13. diff --git a/common/deblock.c b/common/deblock.c
  14. index fc039c5..27c73ae 100644
  15. --- a/common/deblock.c
  16. +++ b/common/deblock.c
  17. @@ -24,46 +24,6 @@
  18.  
  19.  #include "common.h"
  20.  
  21. -/* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of
  22. - * entropy coding, but per 64 coeffs for the purpose of deblocking */
  23. -static void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
  24. -{
  25. -    uint32_t (*src)[6] = (uint32_t(*)[6])h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
  26. -    int8_t *transform = h->mb.mb_transform_size + mb_y * h->sps->i_mb_width;
  27. -    for( int x = 0; x<h->sps->i_mb_width; x++ )
  28. -    {
  29. -        memcpy( buf+x, src+x, 16 );
  30. -        if( transform[x] )
  31. -        {
  32. -            int nnz = src[x][0] | src[x][1];
  33. -            src[x][0] = src[x][1] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
  34. -            nnz = src[x][2] | src[x][3];
  35. -            src[x][2] = src[x][3] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
  36. -        }
  37. -    }
  38. -}
  39. -
  40. -static void restore_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
  41. -{
  42. -    uint8_t (*dst)[24] = h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
  43. -    for( int x = 0; x < h->sps->i_mb_width; x++ )
  44. -        memcpy( dst+x, buf+x, 16 );
  45. -}
  46. -
  47. -static void munge_cavlc_nnz( x264_t *h, int mb_y, uint8_t (*buf)[16], void (*func)(x264_t*, int, uint8_t (*)[16]) )
  48. -{
  49. -    func( h, mb_y, buf );
  50. -    if( mb_y > 0 )
  51. -        func( h, mb_y-1, buf + h->sps->i_mb_width );
  52. -    if( h->sh.b_mbaff )
  53. -    {
  54. -        func( h, mb_y+1, buf + h->sps->i_mb_width * 2 );
  55. -        if( mb_y > 0 )
  56. -            func( h, mb_y-2, buf + h->sps->i_mb_width * 3 );
  57. -    }
  58. -}
  59. -
  60. -
  61.  /* Deblocking filter */
  62.  static const uint8_t i_alpha_table[52+12*2] =
  63.  {
  64. @@ -344,10 +304,6 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
  65.      int stride2y  = stridey << b_interlaced;
  66.      int strideuv  = h->fdec->i_stride[1];
  67.      int stride2uv = strideuv << b_interlaced;
  68. -    uint8_t (*nnz_backup)[16] = h->scratch_buffer;
  69. -
  70. -    if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
  71. -        munge_cavlc_nnz( h, mb_y, nnz_backup, munge_cavlc_nnz_row );
  72.  
  73.      for( int mb_x = 0; mb_x < h->sps->i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
  74.      {
  75. @@ -427,9 +383,6 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
  76.              if( !transform_8x8 ) FILTER( , 1, 3, qp, qpc );
  77.          }
  78.      }
  79. -
  80. -    if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
  81. -        munge_cavlc_nnz( h, mb_y, nnz_backup, restore_cavlc_nnz_row );
  82.  }
  83.  
  84.  #ifdef HAVE_MMX
  85. diff --git a/common/macroblock.c b/common/macroblock.c
  86. index ce510e9..01c90d2 100644
  87. --- a/common/macroblock.c
  88. +++ b/common/macroblock.c
  89. @@ -344,8 +344,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
  90.          int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
  91.          int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
  92.              ((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
  93. -        int buf_nnz = !h->param.b_cabac * h->pps->b_transform_8x8_mode * (h->sps->i_mb_width * 4 * 16 * sizeof(uint8_t));
  94. -        scratch_size = X264_MAX4( buf_hpel, buf_ssim, buf_tesa, buf_nnz );
  95. +        scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa );
  96.      }
  97.      int buf_mbtree = h->param.rc.b_mb_tree * ((h->sps->i_mb_width+3)&~3) * sizeof(int);
  98.      scratch_size = X264_MAX( scratch_size, buf_mbtree );
  99. @@ -1013,6 +1012,49 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
  100.          M32( &h->mb.cache.ref[0][x264_scan8[0]+8*2] ) = refbot;
  101.          M32( &h->mb.cache.ref[0][x264_scan8[0]+8*3] ) = refbot;
  102.      }
  103. +
  104. +    /* Munge NNZ for cavlc + 8x8dct */
  105. +    if( !h->param.b_cabac && h->pps->b_transform_8x8_mode )
  106. +    {
  107. +        uint8_t (*nnz)[24] = h->mb.non_zero_count;
  108. +        int top = h->mb.i_mb_top_xy;
  109. +        int left = h->mb.i_mb_left_xy;
  110. +
  111. +        if( (h->mb.i_neighbour & MB_TOP) && h->mb.mb_transform_size[top] )
  112. +        {
  113. +            int i8 = x264_scan8[0] - 8;
  114. +            int nnz_top0 = M16( &nnz[top][8] ) | M16( &nnz[top][12] );
  115. +            int nnz_top1 = M16( &nnz[top][10] ) | M16( &nnz[top][14] );
  116. +            M16( &h->mb.cache.non_zero_count[i8+0] ) = nnz_top0 ? 0x0101 : 0;
  117. +            M16( &h->mb.cache.non_zero_count[i8+2] ) = nnz_top1 ? 0x0101 : 0;
  118. +        }
  119. +
  120. +        if( (h->mb.i_neighbour & MB_LEFT) && h->mb.mb_transform_size[left] )
  121. +        {
  122. +            int i8 = x264_scan8[0] - 1;
  123. +            int nnz_left0 = M16( &nnz[left][2] ) | M16( &nnz[left][6] );
  124. +            int nnz_left1 = M16( &nnz[left][10] ) | M16( &nnz[left][14] );
  125. +            h->mb.cache.non_zero_count[i8+8*0] = !!nnz_left0;
  126. +            h->mb.cache.non_zero_count[i8+8*1] = !!nnz_left0;
  127. +            h->mb.cache.non_zero_count[i8+8*2] = !!nnz_left1;
  128. +            h->mb.cache.non_zero_count[i8+8*3] = !!nnz_left1;
  129. +        }
  130. +
  131. +        if( h->mb.mb_transform_size[h->mb.i_mb_xy] )
  132. +        {
  133. +            int nnz0 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
  134. +            int nnz1 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 4]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 6]] );
  135. +            int nnz2 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[10]] );
  136. +            int nnz3 = M16( &h->mb.cache.non_zero_count[x264_scan8[12]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[14]] );
  137. +            uint32_t nnztop = pack16to32( !!nnz0, !!nnz1 ) * 0x0101;
  138. +            uint32_t nnzbot = pack16to32( !!nnz2, !!nnz3 ) * 0x0101;
  139. +
  140. +            M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*0] ) = nnztop;
  141. +            M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*1] ) = nnztop;
  142. +            M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*2] ) = nnzbot;
  143. +            M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*3] ) = nnzbot;
  144. +        }
  145. +    }
  146.  }
  147.  
  148.  static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int i )
  149. --
  150. 1.7.0.4
  151.  
  152.  
  153. From d51fde592507649e22757a23f0ea0252ec35b5b6 Mon Sep 17 00:00:00 2001
  154. From: Anton Mitrofanov <BugMaster@narod.ru>
  155. Date: Mon, 31 May 2010 22:36:50 +0400
  156. Subject: [PATCH 02/11] Fix crash with MP4-muxing if zero frames were encoded
  157.  
  158. ---
  159. output/mp4.c |    3 ++-
  160.  1 files changed, 2 insertions(+), 1 deletions(-)
  161.  
  162. diff --git a/output/mp4.c b/output/mp4.c
  163. index f76541e..0aa5070 100644
  164. --- a/output/mp4.c
  165. +++ b/output/mp4.c
  166. @@ -112,6 +112,7 @@ static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest
  167.          if( p_mp4->p_sample->data )
  168.              free( p_mp4->p_sample->data );
  169.  
  170. +        p_mp4->p_sample->dataLength = 0;
  171.          gf_isom_sample_del( &p_mp4->p_sample );
  172.      }
  173.  
  174. @@ -135,7 +136,7 @@ static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest
  175.           * The reason is that an Edit Box maps the presentation time-line to the media time-line.
  176.           * Any demuxers should follow the Edit Box if it exists. */
  177.          GF_ISOSample *sample = gf_isom_get_sample_info( p_mp4->p_file, p_mp4->i_track, 1, NULL, NULL );
  178. -        if( sample->CTS_Offset > 0 )
  179. +        if( sample && sample->CTS_Offset > 0 )
  180.          {
  181.              uint32_t mvhd_timescale = gf_isom_get_timescale( p_mp4->p_file );
  182.              uint64_t tkhd_duration = (uint64_t)( mdhd_duration * ( (double)mvhd_timescale / p_mp4->i_time_res ) );
  183. --
  184. 1.7.0.4
  185.  
  186.  
  187. From 8098997dcba2602b22b43fdf26621d08d3f81333 Mon Sep 17 00:00:00 2001
  188. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  189. Date: Sun, 30 May 2010 09:42:53 -0700
  190. Subject: [PATCH 03/11] Fix ultrafast to actually turn off weightb
  191.  
  192. ---
  193. common/common.c |    1 +
  194.  1 files changed, 1 insertions(+), 0 deletions(-)
  195.  
  196. diff --git a/common/common.c b/common/common.c
  197. index 62bef99..fccf2b0 100644
  198. --- a/common/common.c
  199. +++ b/common/common.c
  200. @@ -183,6 +183,7 @@ static int x264_param_apply_preset( x264_param_t *param, const char *preset )
  201.          param->i_bframe_adaptive = X264_B_ADAPT_NONE;
  202.          param->rc.b_mb_tree = 0;
  203.          param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
  204. +        param->analyse.b_weighted_bipred = 0;
  205.      }
  206.      else if( !strcasecmp( preset, "superfast" ) )
  207.      {
  208. --
  209. 1.7.0.4
  210.  
  211.  
  212. From a7f870990af39a11f3bb883b9335baad91909ccb Mon Sep 17 00:00:00 2001
  213. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  214. Date: Thu, 27 May 2010 12:31:41 -0700
  215. Subject: [PATCH 04/11] Fix omission in libx264 tuning documentation
  216.  
  217. ---
  218. x264.h |    2 +-
  219.  1 files changed, 1 insertions(+), 1 deletions(-)
  220.  
  221. diff --git a/x264.h b/x264.h
  222. index 6d7b703..95efd88 100644
  223. --- a/x264.h
  224. +++ b/x264.h
  225. @@ -446,7 +446,7 @@ static const char * const x264_tune_names[] = { "film", "animation", "grain", "s
  226.  
  227.  /*      Multiple tunings can be used if separated by a delimiter in ",./-+",
  228.   *      however multiple psy tunings cannot be used.
  229. - *      film, animation, grain, psnr, and ssim are psy tunings.
  230. + *      film, animation, grain, stillimage, psnr, and ssim are psy tunings.
  231.   *
  232.   *      returns 0 on success, negative on failure (e.g. invalid preset/tune name). */
  233.  int     x264_param_default_preset( x264_param_t *, const char *preset, const char *tune );
  234. --
  235. 1.7.0.4
  236.  
  237.  
  238. From 5832bdfaed3bcce1b2823b6594386e0357d8ff31 Mon Sep 17 00:00:00 2001
  239. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  240. Date: Wed, 26 May 2010 12:55:35 -0700
  241. Subject: [PATCH 05/11] Merge some of adaptive quant and weightp
  242.  Eliminate redundant work; both of them were calculating variance of the frame.
  243.  
  244. ---
  245. common/frame.h        |    4 +-
  246.  encoder/analyse.h     |    1 -
  247.  encoder/encoder.c     |   12 ++---
  248.  encoder/ratecontrol.c |  124 +++++++++++++++++++++++++++++++-----------------
  249.  encoder/slicetype.c   |   31 ++----------
  250.  5 files changed, 92 insertions(+), 80 deletions(-)
  251.  
  252. diff --git a/common/frame.h b/common/frame.h
  253. index 91d27b5..ca5cb7a 100644
  254. --- a/common/frame.h
  255. +++ b/common/frame.h
  256. @@ -118,8 +118,8 @@ typedef struct x264_frame
  257.      uint16_t *i_inv_qscale_factor;
  258.      int     b_scenecut; /* Set to zero if the frame cannot possibly be part of a real scenecut. */
  259.      float   f_weighted_cost_delta[X264_BFRAME_MAX+2];
  260. -    uint32_t i_pixel_sum;
  261. -    uint64_t i_pixel_ssd;
  262. +    uint32_t i_pixel_sum[3];
  263. +    uint64_t i_pixel_ssd[3];
  264.  
  265.      /* hrd */
  266.      x264_hrd_t hrd_timing;
  267. diff --git a/encoder/analyse.h b/encoder/analyse.h
  268. index 7c2c22c..53e4c2e 100644
  269. --- a/encoder/analyse.h
  270. +++ b/encoder/analyse.h
  271. @@ -33,7 +33,6 @@ void x264_slicetype_decide( x264_t *h );
  272.  void x264_slicetype_analyse( x264_t *h, int keyframe );
  273.  
  274.  int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w );
  275. -void x264_weight_plane_analyse( x264_t *h, x264_frame_t *frame );
  276.  
  277.  int  x264_lookahead_init( x264_t *h, int i_slicetype_length );
  278.  int  x264_lookahead_is_empty( x264_t *h );
  279. diff --git a/encoder/encoder.c b/encoder/encoder.c
  280. index 52017ff..6e0dc54 100644
  281. --- a/encoder/encoder.c
  282. +++ b/encoder/encoder.c
  283. @@ -2246,21 +2246,17 @@ int     x264_encoder_encode( x264_t *h,
  284.                  fenc->i_pic_struct = PIC_STRUCT_PROGRESSIVE;
  285.          }
  286.  
  287. -        if( h->frames.b_have_lowres )
  288. -        {
  289. -            if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
  290. -                x264_weight_plane_analyse( h, fenc );
  291. -            x264_frame_init_lowres( h, fenc );
  292. -        }
  293. -
  294.          if( h->param.rc.b_mb_tree && h->param.rc.b_stat_read )
  295.          {
  296.              if( x264_macroblock_tree_read( h, fenc ) )
  297.                  return -1;
  298.          }
  299. -        else if( h->param.rc.i_aq_mode )
  300. +        else
  301.              x264_adaptive_quant_frame( h, fenc );
  302.  
  303. +        if( h->frames.b_have_lowres )
  304. +            x264_frame_init_lowres( h, fenc );
  305. +
  306.          /* 2: Place the frame into the queue for its slice type decision */
  307.          x264_lookahead_put_frame( h, fenc );
  308.  
  309. diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
  310. index a725a24..bf0a400 100644
  311. --- a/encoder/ratecontrol.c
  312. +++ b/encoder/ratecontrol.c
  313. @@ -215,12 +215,14 @@ static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x2
  314.      stride <<= h->mb.b_interlaced;
  315.      uint64_t res = h->pixf.var[pix]( frame->plane[i] + offset, stride );
  316.      uint32_t sum = (uint32_t)res;
  317. -    uint32_t sqr = res >> 32;
  318. -    return sqr - (sum * sum >> shift);
  319. +    uint32_t ssd = res >> 32;
  320. +    frame->i_pixel_sum[i] += sum;
  321. +    frame->i_pixel_ssd[i] += ssd;
  322. +    return ssd - (sum * sum >> shift);
  323.  }
  324.  
  325.  // Find the total AC energy of the block in all planes.
  326. -static NOINLINE uint32_t ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame )
  327. +static NOINLINE uint32_t x264_ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame )
  328.  {
  329.      /* This function contains annoying hacks because GCC has a habit of reordering emms
  330.       * and putting it after floating point ops.  As a result, we put the emms at the end of the
  331. @@ -239,56 +241,90 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
  332.       * FIXME: while they're written in 5 significant digits, they're only tuned to 2. */
  333.      float strength;
  334.      float avg_adj = 0.f;
  335. -    /* Need to init it anyways for MB tree. */
  336. -    if( h->param.rc.f_aq_strength == 0 )
  337. -    {
  338. -        memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
  339. -        memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
  340. -        if( h->frames.b_have_lowres )
  341. -            for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
  342. -                frame->i_inv_qscale_factor[mb_xy] = 256;
  343. -        return;
  344. +    int width = h->sps->i_mb_width;
  345. +    int height = h->sps->i_mb_height;
  346. +    /* Initialize frame stats */
  347. +    for( int i = 0; i < 3; i++ )
  348. +    {
  349. +        frame->i_pixel_sum[i] = 0;
  350. +        frame->i_pixel_ssd[i] = 0;
  351.      }
  352.  
  353. -    if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
  354. +    /* Degenerate cases */
  355. +    if( h->param.rc.i_aq_mode == X264_AQ_NONE || h->param.rc.f_aq_strength == 0 )
  356.      {
  357. -        float avg_adj_pow2 = 0.f;
  358. -        for( int mb_y = 0; mb_y < h->sps->i_mb_height; mb_y++ )
  359. -            for( int mb_x = 0; mb_x < h->sps->i_mb_width; mb_x++ )
  360. -            {
  361. -                uint32_t energy = ac_energy_mb( h, mb_x, mb_y, frame );
  362. -                float qp_adj = powf( energy + 1, 0.125f );
  363. -                frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
  364. -                avg_adj += qp_adj;
  365. -                avg_adj_pow2 += qp_adj * qp_adj;
  366. -            }
  367. -        avg_adj /= h->mb.i_mb_count;
  368. -        avg_adj_pow2 /= h->mb.i_mb_count;
  369. -        strength = h->param.rc.f_aq_strength * avg_adj;
  370. -        avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - 14.f) / avg_adj;
  371. +        /* Need to init it anyways for MB tree */
  372. +        if( h->param.rc.f_aq_strength == 0 )
  373. +        {
  374. +            memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
  375. +            memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
  376. +            if( h->frames.b_have_lowres )
  377. +                for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
  378. +                    frame->i_inv_qscale_factor[mb_xy] = 256;
  379. +        }
  380. +        /* Need variance data for weighted prediction */
  381. +        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
  382. +        {
  383. +            for( int mb_y = 0; mb_y < height; mb_y++ )
  384. +                for( int mb_x = 0; mb_x < width; mb_x++ )
  385. +                    x264_ac_energy_mb( h, mb_x, mb_y, frame );
  386. +        }
  387. +        else
  388. +            return;
  389.      }
  390. +    /* Actual adaptive quantization */
  391.      else
  392. -        strength = h->param.rc.f_aq_strength * 1.0397f;
  393. -
  394. -    for( int mb_y = 0; mb_y < h->sps->i_mb_height; mb_y++ )
  395. -        for( int mb_x = 0; mb_x < h->sps->i_mb_width; mb_x++ )
  396. +    {
  397. +        if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
  398.          {
  399. -            float qp_adj;
  400. -            if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
  401. -            {
  402. -                qp_adj = frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride];
  403. -                qp_adj = strength * (qp_adj - avg_adj);
  404. -            }
  405. -            else
  406. +            float avg_adj_pow2 = 0.f;
  407. +            for( int mb_y = 0; mb_y < height; mb_y++ )
  408. +                for( int mb_x = 0; mb_x < width; mb_x++ )
  409. +                {
  410. +                    uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
  411. +                    float qp_adj = powf( energy + 1, 0.125f );
  412. +                    frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
  413. +                    avg_adj += qp_adj;
  414. +                    avg_adj_pow2 += qp_adj * qp_adj;
  415. +                }
  416. +            avg_adj /= h->mb.i_mb_count;
  417. +            avg_adj_pow2 /= h->mb.i_mb_count;
  418. +            strength = h->param.rc.f_aq_strength * avg_adj;
  419. +            avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - 14.f) / avg_adj;
  420. +        }
  421. +        else
  422. +            strength = h->param.rc.f_aq_strength * 1.0397f;
  423. +
  424. +        for( int mb_y = 0; mb_y < height; mb_y++ )
  425. +            for( int mb_x = 0; mb_x < width; mb_x++ )
  426.              {
  427. -                uint32_t energy = ac_energy_mb( h, mb_x, mb_y, frame );
  428. -                qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - 14.427f);
  429. +                float qp_adj;
  430. +                if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
  431. +                {
  432. +                    qp_adj = frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride];
  433. +                    qp_adj = strength * (qp_adj - avg_adj);
  434. +                }
  435. +                else
  436. +                {
  437. +                    uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
  438. +                    qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - 14.427f);
  439. +                }
  440. +                frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] =
  441. +                frame->f_qp_offset_aq[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
  442. +                if( h->frames.b_have_lowres )
  443. +                    frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj);
  444.              }
  445. -            frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] =
  446. -            frame->f_qp_offset_aq[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
  447. -            if( h->frames.b_have_lowres )
  448. -                frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj);
  449. -        }
  450. +    }
  451. +
  452. +    /* Remove mean from SSD calculation */
  453. +    for( int i = 0; i < 3; i++ )
  454. +    {
  455. +        uint64_t ssd = frame->i_pixel_ssd[i];
  456. +        uint64_t sum = frame->i_pixel_sum[i];
  457. +        int w = width*16>>!!i;
  458. +        int h = height*16>>!!i;
  459. +        frame->i_pixel_ssd[i] = ssd - (sum * sum + w * h / 2) / (w * h);
  460. +    }
  461.  }
  462.  
  463.  int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame )
  464. diff --git a/encoder/slicetype.c b/encoder/slicetype.c
  465. index 9352367..e454e12 100644
  466. --- a/encoder/slicetype.c
  467. +++ b/encoder/slicetype.c
  468. @@ -67,25 +67,6 @@ static void x264_weight_get_h264( unsigned int weight_nonh264, int offset, x264_
  469.      w->i_scale = X264_MIN( w->i_scale, 127 );
  470.  }
  471.  
  472. -void x264_weight_plane_analyse( x264_t *h, x264_frame_t *frame )
  473. -{
  474. -    uint32_t sad = 0;
  475. -    uint64_t ssd = 0;
  476. -    uint8_t *p = frame->plane[0];
  477. -    int stride = frame->i_stride[0];
  478. -    int width = frame->i_width[0];
  479. -    int height = frame->i_lines[0];
  480. -    for( int y = 0; y < height>>4; y++, p += stride*16 )
  481. -        for( int x = 0; x < width; x += 16 )
  482. -        {
  483. -            uint64_t res = h->pixf.var[PIXEL_16x16]( p + x, stride );
  484. -            sad += (uint32_t)res;
  485. -            ssd += res >> 32;
  486. -        }
  487. -    frame->i_pixel_sum = sad;
  488. -    frame->i_pixel_ssd = ssd - ((uint64_t)sad * sad + width * height / 2) / (width * height);
  489. -}
  490. -
  491.  static NOINLINE uint8_t *x264_weight_cost_init_luma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, uint8_t *dest )
  492.  {
  493.      int ref0_distance = fenc->i_frame - ref->i_frame - 1;
  494. @@ -167,10 +148,10 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int
  495.      int found;
  496.      x264_weight_t *weights = fenc->weight[0];
  497.  
  498. -    fenc_var = round( sqrt( fenc->i_pixel_ssd ) );
  499. -    ref_var  = round( sqrt(  ref->i_pixel_ssd ) );
  500. -    fenc_mean = (float)fenc->i_pixel_sum / (fenc->i_lines[0] * fenc->i_width[0]);
  501. -    ref_mean  = (float) ref->i_pixel_sum / (fenc->i_lines[0] * fenc->i_width[0]);
  502. +    fenc_var = round( sqrt( fenc->i_pixel_ssd[0] ) );
  503. +    ref_var  = round( sqrt(  ref->i_pixel_ssd[0] ) );
  504. +    fenc_mean = (float)fenc->i_pixel_sum[0] / (fenc->i_lines[0] * fenc->i_width[0]);
  505. +    ref_mean  = (float) ref->i_pixel_sum[0] / (fenc->i_lines[0] * fenc->i_width[0]);
  506.  
  507.      //early termination
  508.      if( fabs( ref_mean - fenc_mean ) < 0.5 && fabs( 1 - fenc_var / ref_var ) < epsilon )
  509. @@ -534,8 +515,8 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
  510.          do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
  511.          if( do_search[0] )
  512.          {
  513. -            if( ( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART
  514. -                  || h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE ) && b == p1 )
  515. +            if( ( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART ||
  516. +                  h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE ) && b == p1 )
  517.              {
  518.                  x264_emms();
  519.                  x264_weights_analyse( h, frames[b], frames[p0], 1 );
  520. --
  521. 1.7.0.4
  522.  
  523.  
  524. From 794713a35eadcd999d5aab4a50274ca43f29be93 Mon Sep 17 00:00:00 2001
  525. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  526. Date: Thu, 27 May 2010 10:42:15 -0700
  527. Subject: [PATCH 06/11] Add fast skip in lookahead motion search
  528.  Helps speed very significantly on motionless blocks.
  529.  
  530. ---
  531. encoder/slicetype.c |   16 +++++++++++++++-
  532.  1 files changed, 15 insertions(+), 1 deletions(-)
  533.  
  534. diff --git a/encoder/slicetype.c b/encoder/slicetype.c
  535. index e454e12..d7cfe5c 100644
  536. --- a/encoder/slicetype.c
  537. +++ b/encoder/slicetype.c
  538. @@ -379,11 +379,25 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
  539.                  CP32( m[l].mvp, mvc[0] );
  540.              else
  541.                  x264_median_mv( m[l].mvp, mvc[0], mvc[1], mvc[2] );
  542. -            x264_me_search( h, &m[l], mvc, i_mvc );
  543.  
  544. +            /* Fast skip for cases of near-zero residual.  Shortcut: don't bother except in the mv0 case,
  545. +             * since anything else is likely to have enough residual to not trigger the skip. */
  546. +            if( !M32( m[l].mvp ) )
  547. +            {
  548. +                m[l].cost = h->pixf.mbcmp[PIXEL_8x8]( m[l].p_fenc[0], FENC_STRIDE, m[l].p_fref[0], m[l].i_stride[0] );
  549. +                if( m[l].cost < 64 )
  550. +                {
  551. +                    M32( m[l].mv ) = 0;
  552. +                    goto skip_motionest;
  553. +                }
  554. +            }
  555. +
  556. +            x264_me_search( h, &m[l], mvc, i_mvc );
  557.              m[l].cost -= 2; // remove mvcost from skip mbs
  558.              if( M32( m[l].mv ) )
  559.                  m[l].cost += 5;
  560. +
  561. +skip_motionest:
  562.              CP32( fenc_mvs[l], m[l].mv );
  563.              *fenc_costs[l] = m[l].cost;
  564.          }
  565. --
  566. 1.7.0.4
  567.  
  568.  
  569. From 77b568b22d42baa344dad050aef420de3b22e126 Mon Sep 17 00:00:00 2001
  570. From: Henrik Gramner <hengar-6@student.ltu.se>
  571. Date: Thu, 27 May 2010 22:18:38 +0200
  572. Subject: [PATCH 07/11] Optimize out some x264_scan8 reads
  573.  
  574. ---
  575. encoder/analyse.c    |   15 ++++-----
  576.  encoder/macroblock.c |   82 ++++++++++++++++++++++++++++++--------------------
  577.  encoder/me.c         |   25 ++++++++-------
  578.  3 files changed, 70 insertions(+), 52 deletions(-)
  579.  
  580. diff --git a/encoder/analyse.c b/encoder/analyse.c
  581. index a128a70..9e85e89 100644
  582. --- a/encoder/analyse.c
  583. +++ b/encoder/analyse.c
  584. @@ -907,8 +907,6 @@ static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
  585.  static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
  586.  {
  587.      uint8_t  *p_dst = h->mb.pic.p_fdec[0];
  588. -
  589. -    int x, y;
  590.      uint64_t i_satd, i_best;
  591.      h->mb.i_skip_intra = 0;
  592.  
  593. @@ -1031,8 +1029,9 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
  594.              int i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
  595.  
  596.              i_best = COST_MAX64;
  597. -            x = idx&1;
  598. -            y = idx>>1;
  599. +            int x = idx&1;
  600. +            int y = idx>>1;
  601. +            int s8 = X264_SCAN8_0 + 2*x + 16*y;
  602.  
  603.              p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
  604.              predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
  605. @@ -1061,8 +1060,8 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
  606.                      if( !(idx&1) )
  607.                          for( int j = 0; j < 7; j++ )
  608.                              pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
  609. -                    i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
  610. -                    i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
  611. +                    i_nnz[0] = M16( &h->mb.cache.non_zero_count[s8 + 0*8] );
  612. +                    i_nnz[1] = M16( &h->mb.cache.non_zero_count[s8 + 1*8] );
  613.                  }
  614.              }
  615.              a->i_cbp_i8x8_luma = cbp_luma_new;
  616. @@ -1070,8 +1069,8 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
  617.              if( !(idx&1) )
  618.                  for( int j = 0; j < 7; j++ )
  619.                      p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
  620. -            M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
  621. -            M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];
  622. +            M16( &h->mb.cache.non_zero_count[s8 + 0*8] ) = i_nnz[0];
  623. +            M16( &h->mb.cache.non_zero_count[s8 + 1*8] ) = i_nnz[1];
  624.  
  625.              x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
  626.          }
  627. diff --git a/encoder/macroblock.c b/encoder/macroblock.c
  628. index 984f8a8..cdc4563 100644
  629. --- a/encoder/macroblock.c
  630. +++ b/encoder/macroblock.c
  631. @@ -135,11 +135,12 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
  632.      }
  633.  }
  634.  
  635. -#define STORE_8x8_NNZ(idx,nz)\
  636. +#define STORE_8x8_NNZ( s8, nz )\
  637. +do\
  638.  {\
  639. -    M16( &h->mb.cache.non_zero_count[x264_scan8[idx*4+0]] ) = nz * 0x0101;\
  640. -    M16( &h->mb.cache.non_zero_count[x264_scan8[idx*4+2]] ) = nz * 0x0101;\
  641. -}
  642. +    M16( &h->mb.cache.non_zero_count[(s8) + 0*8] ) = (nz) * 0x0101;\
  643. +    M16( &h->mb.cache.non_zero_count[(s8) + 1*8] ) = (nz) * 0x0101;\
  644. +} while(0)
  645.  
  646.  #define CLEAR_16x16_NNZ \
  647.  {\
  648. @@ -151,17 +152,18 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
  649.  
  650.  void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
  651.  {
  652. -    int x = 8 * (idx&1);
  653. -    int y = 8 * (idx>>1);
  654. +    int x = idx&1;
  655. +    int y = idx>>1;
  656. +    int s8 = X264_SCAN8_0 + 2*x + 16*y;
  657.      int nz;
  658. -    uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
  659. -    uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
  660. +    uint8_t *p_src = &h->mb.pic.p_fenc[0][8*x + 8*y*FENC_STRIDE];
  661. +    uint8_t *p_dst = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
  662.      ALIGNED_ARRAY_16( int16_t, dct8x8,[64] );
  663.  
  664.      if( h->mb.b_lossless )
  665.      {
  666.          nz = h->zigzagf.sub_8x8( h->dct.luma8x8[idx], p_src, p_dst );
  667. -        STORE_8x8_NNZ(idx,nz);
  668. +        STORE_8x8_NNZ( s8, nz );
  669.          h->mb.i_cbp_luma |= nz<<idx;
  670.          return;
  671.      }
  672. @@ -175,10 +177,10 @@ void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
  673.          h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8 );
  674.          h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qp );
  675.          h->dctf.add8x8_idct8( p_dst, dct8x8 );
  676. -        STORE_8x8_NNZ(idx,1);
  677. +        STORE_8x8_NNZ( s8, 1 );
  678.      }
  679.      else
  680. -        STORE_8x8_NNZ(idx,0);
  681. +        STORE_8x8_NNZ( s8, 0 );
  682.  }
  683.  
  684.  static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
  685. @@ -728,12 +730,13 @@ void x264_macroblock_encode( x264_t *h )
  686.              if( h->mb.b_transform_8x8 )
  687.                  for( int i8x8 = 0; i8x8 < 4; i8x8++ )
  688.                  {
  689. -                    int x = 8*(i8x8&1);
  690. -                    int y = 8*(i8x8>>1);
  691. -                    nz = h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8],
  692. -                                        h->mb.pic.p_fenc[0]+x+y*FENC_STRIDE,
  693. -                                        h->mb.pic.p_fdec[0]+x+y*FDEC_STRIDE );
  694. -                    STORE_8x8_NNZ(i8x8,nz);
  695. +                    int x = i8x8&1;
  696. +                    int y = i8x8>>1;
  697. +                    int s8 = X264_SCAN8_0 + 2*x + 16*y;
  698. +
  699. +                    nz = h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8], h->mb.pic.p_fenc[0] + 8*x + 8*y*FENC_STRIDE,
  700. +                                                                   h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE );
  701. +                    STORE_8x8_NNZ( s8, nz );
  702.                      h->mb.i_cbp_luma |= nz << i8x8;
  703.                  }
  704.              else
  705. @@ -783,14 +786,18 @@ void x264_macroblock_encode( x264_t *h )
  706.              {
  707.                  for( int idx = 0; idx < 4; idx++ )
  708.                  {
  709. +                    int x = idx&1;
  710. +                    int y = idx>>1;
  711. +                    int s8 = X264_SCAN8_0 + 2*x + 16*y;
  712. +
  713.                      if( h->mb.i_cbp_luma&(1<<idx) )
  714.                      {
  715.                          h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp );
  716. -                        h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][(idx&1)*8 + (idx>>1)*8*FDEC_STRIDE], dct8x8[idx] );
  717. -                        STORE_8x8_NNZ(idx,1);
  718. +                        h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE], dct8x8[idx] );
  719. +                        STORE_8x8_NNZ( s8, 1 );
  720.                      }
  721.                      else
  722. -                        STORE_8x8_NNZ(idx,0);
  723. +                        STORE_8x8_NNZ( s8, 0 );
  724.                  }
  725.              }
  726.          }
  727. @@ -825,18 +832,24 @@ void x264_macroblock_encode( x264_t *h )
  728.                      }
  729.                  }
  730.  
  731. +                int x = i8x8&1;
  732. +                int y = i8x8>>1;
  733. +
  734.                  /* decimate this 8x8 block */
  735.                  i_decimate_mb += i_decimate_8x8;
  736.                  if( b_decimate )
  737.                  {
  738.                      if( i_decimate_8x8 < 4 )
  739. -                        STORE_8x8_NNZ(i8x8,0)
  740. +                    {
  741. +                        int s8 = X264_SCAN8_0 + 2*x + 16*y;
  742. +                        STORE_8x8_NNZ( s8, 0 );
  743. +                    }
  744.                      else
  745.                          h->mb.i_cbp_luma |= 1<<i8x8;
  746.                  }
  747.                  else if( cbp )
  748.                  {
  749. -                    h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
  750. +                    h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE], &dct4x4[i8x8*4] );
  751.                      h->mb.i_cbp_luma |= 1<<i8x8;
  752.                  }
  753.              }
  754. @@ -1045,8 +1058,11 @@ void x264_noise_reduction_update( x264_t *h )
  755.  void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
  756.  {
  757.      int i_qp = h->mb.i_qp;
  758. -    uint8_t *p_fenc = h->mb.pic.p_fenc[0] + (i8&1)*8 + (i8>>1)*8*FENC_STRIDE;
  759. -    uint8_t *p_fdec = h->mb.pic.p_fdec[0] + (i8&1)*8 + (i8>>1)*8*FDEC_STRIDE;
  760. +    int x = i8&1;
  761. +    int y = i8>>1;
  762. +    int s8 = X264_SCAN8_0 + 2*x + 16*y;
  763. +    uint8_t *p_fenc = h->mb.pic.p_fenc[0] + 8*x + 8*y*FENC_STRIDE;
  764. +    uint8_t *p_fdec = h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE;
  765.      int b_decimate = h->mb.b_dct_decimate;
  766.      int nnz8x8 = 0;
  767.      int nz;
  768. @@ -1059,7 +1075,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
  769.          if( h->mb.b_transform_8x8 )
  770.          {
  771.              nnz8x8 = h->zigzagf.sub_8x8( h->dct.luma8x8[i8], p_fenc, p_fdec );
  772. -            STORE_8x8_NNZ(i8,nnz8x8);
  773. +            STORE_8x8_NNZ( s8, nnz8x8 );
  774.          }
  775.          else
  776.          {
  777. @@ -1075,8 +1091,8 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
  778.          for( int ch = 0; ch < 2; ch++ )
  779.          {
  780.              int16_t dc;
  781. -            p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
  782. -            p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
  783. +            p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE;
  784. +            p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE;
  785.              nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i8+ch*4], p_fenc, p_fdec, &dc );
  786.              h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = nz;
  787.          }
  788. @@ -1099,13 +1115,13 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
  789.                  {
  790.                      h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp );
  791.                      h->dctf.add8x8_idct8( p_fdec, dct8x8 );
  792. -                    STORE_8x8_NNZ(i8,1);
  793. +                    STORE_8x8_NNZ( s8, 1 );
  794.                  }
  795.                  else
  796. -                    STORE_8x8_NNZ(i8,0);
  797. +                    STORE_8x8_NNZ( s8, 0 );
  798.              }
  799.              else
  800. -                STORE_8x8_NNZ(i8,0);
  801. +                STORE_8x8_NNZ( s8, 0 );
  802.          }
  803.          else
  804.          {
  805. @@ -1132,7 +1148,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
  806.              if( nnz8x8 )
  807.                  h->dctf.add8x8_idct( p_fdec, dct4x4 );
  808.              else
  809. -                STORE_8x8_NNZ(i8,0);
  810. +                STORE_8x8_NNZ( s8, 0 );
  811.          }
  812.  
  813.          i_qp = h->mb.i_chroma_qp;
  814. @@ -1140,8 +1156,8 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
  815.          for( int ch = 0; ch < 2; ch++ )
  816.          {
  817.              ALIGNED_ARRAY_16( int16_t, dct4x4,[16] );
  818. -            p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
  819. -            p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
  820. +            p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE;
  821. +            p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE;
  822.  
  823.              h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
  824.              dct4x4[0] = 0;
  825. diff --git a/encoder/me.c b/encoder/me.c
  826. index 77073cc..40d0650 100644
  827. --- a/encoder/me.c
  828. +++ b/encoder/me.c
  829. @@ -937,8 +937,11 @@ int x264_iter_kludge = 0;
  830.  
  831.  static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2, int rd )
  832.  {
  833. -    int16_t *cache0_mv = h->mb.cache.mv[0][x264_scan8[i8*4]];
  834. -    int16_t *cache1_mv = h->mb.cache.mv[1][x264_scan8[i8*4]];
  835. +    int x = i8&1;
  836. +    int y = i8>>1;
  837. +    int s8 = X264_SCAN8_0 + 2*x + 16*y;
  838. +    int16_t *cache0_mv = h->mb.cache.mv[0][s8];
  839. +    int16_t *cache1_mv = h->mb.cache.mv[1][s8];
  840.      const int i_pixel = m0->i_pixel;
  841.      const int bw = x264_pixel_size[i_pixel].w;
  842.      const int bh = x264_pixel_size[i_pixel].h;
  843. @@ -946,11 +949,11 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
  844.      ALIGNED_ARRAY_8( uint8_t, pixu_buf,[2],[9][8*8] );
  845.      ALIGNED_ARRAY_8( uint8_t, pixv_buf,[2],[9][8*8] );
  846.      uint8_t *src[2][9];
  847. -    uint8_t *pix  = &h->mb.pic.p_fdec[0][(i8>>1)*8*FDEC_STRIDE+(i8&1)*8];
  848. -    uint8_t *pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
  849. -    uint8_t *pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
  850. -    const int ref0 = h->mb.cache.ref[0][x264_scan8[i8*4]];
  851. -    const int ref1 = h->mb.cache.ref[1][x264_scan8[i8*4]];
  852. +    uint8_t *pix  = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
  853. +    uint8_t *pixu = &h->mb.pic.p_fdec[1][4*x + 4*y*FDEC_STRIDE];
  854. +    uint8_t *pixv = &h->mb.pic.p_fdec[2][4*x + 4*y*FDEC_STRIDE];
  855. +    int ref0 = h->mb.cache.ref[0][s8];
  856. +    int ref1 = h->mb.cache.ref[1][s8];
  857.      const int mv0y_offset = h->mb.b_interlaced & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
  858.      const int mv1y_offset = h->mb.b_interlaced & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
  859.      int stride[2][9];
  860. @@ -1058,13 +1061,13 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
  861.  
  862.      if( rd )
  863.      {
  864. -        x264_macroblock_cache_mv ( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 0, pack16to32_mask(bm0x, bm0y) );
  865. +        x264_macroblock_cache_mv ( h, 2*x, 2*y, bw>>2, bh>>2, 0, pack16to32_mask(bm0x, bm0y) );
  866.          amvd = pack8to16( X264_MIN(abs(bm0x - m0->mvp[0]),33), X264_MIN(abs(bm0y - m0->mvp[1]),33) );
  867. -        x264_macroblock_cache_mvd( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 0, amvd );
  868. +        x264_macroblock_cache_mvd( h, 2*x, 2*y, bw>>2, bh>>2, 0, amvd );
  869.  
  870. -        x264_macroblock_cache_mv ( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 1, pack16to32_mask(bm1x, bm1y) );
  871. +        x264_macroblock_cache_mv ( h, 2*x, 2*y, bw>>2, bh>>2, 1, pack16to32_mask(bm1x, bm1y) );
  872.          amvd = pack8to16( X264_MIN(abs(bm1x - m1->mvp[0]),33), X264_MIN(abs(bm1y - m1->mvp[1]),33) );
  873. -        x264_macroblock_cache_mvd( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 1, amvd );
  874. +        x264_macroblock_cache_mvd( h, 2*x, 2*y, bw>>2, bh>>2, 1, amvd );
  875.      }
  876.  
  877.      m0->mv[0] = bm0x;
  878. --
  879. 1.7.0.4
  880.  
  881.  
  882. From 0c7cf0bfb1d30ee8e7f1b355fef5aa9e2db929d2 Mon Sep 17 00:00:00 2001
  883. From: Henrik Gramner <hengar-6@student.ltu.se>
  884. Date: Sun, 30 May 2010 22:45:14 +0200
  885. Subject: [PATCH 08/11] Some deblocking-related optimizations
  886.  
  887. ---
  888. common/deblock.c    |    8 ++++----
  889.  common/macroblock.c |   43 +++++++++++++++++++++++--------------------
  890.  2 files changed, 27 insertions(+), 24 deletions(-)
  891.  
  892. diff --git a/common/deblock.c b/common/deblock.c
  893. index 27c73ae..3296dbf 100644
  894. --- a/common/deblock.c
  895. +++ b/common/deblock.c
  896. @@ -299,7 +299,7 @@ static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2,
  897.  void x264_frame_deblock_row( x264_t *h, int mb_y )
  898.  {
  899.      int b_interlaced = h->sh.b_mbaff;
  900. -    int qp_thresh = 15 - X264_MIN(h->sh.i_alpha_c0_offset, h->sh.i_beta_offset) - X264_MAX(0, h->param.analyse.i_chroma_qp_offset);
  901. +    int qp_thresh = 15 - X264_MIN( h->sh.i_alpha_c0_offset, h->sh.i_beta_offset ) - X264_MAX( 0, h->param.analyse.i_chroma_qp_offset );
  902.      int stridey   = h->fdec->i_stride[0];
  903.      int stride2y  = stridey << b_interlaced;
  904.      int strideuv  = h->fdec->i_stride[1];
  905. @@ -318,7 +318,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
  906.          uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey  + 16*mb_x;
  907.          uint8_t *pixu = h->fdec->plane[1] +  8*mb_y*strideuv +  8*mb_x;
  908.          uint8_t *pixv = h->fdec->plane[2] +  8*mb_y*strideuv +  8*mb_x;
  909. -        if( b_interlaced && (mb_y&1) )
  910. +        if( mb_y & b_interlaced )
  911.          {
  912.              pixy -= 15*stridey;
  913.              pixu -=  7*strideuv;
  914. @@ -366,12 +366,12 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
  915.              int qp_top = (qp + qpt + 1) >> 1;
  916.              int qpc_top = (h->chroma_qp_table[qp] + h->chroma_qp_table[qpt] + 1) >> 1;
  917.              int intra_top = IS_INTRA( h->mb.type[h->mb.i_mb_top_xy] );
  918. -            if( !b_interlaced && (intra_cur || intra_top) )
  919. +            if( ~b_interlaced & (intra_cur | intra_top) )
  920.                  FILTER( _intra, 1, 0, qp_top, qpc_top );
  921.              else
  922.              {
  923.                  if( intra_top )
  924. -                    memset( bs[1][0], 3, sizeof(bs[1][0]) );
  925. +                    M32( bs[1][0] ) = 0x03030303;
  926.                  FILTER(       , 1, 0, qp_top, qpc_top );
  927.              }
  928.          }
  929. diff --git a/common/macroblock.c b/common/macroblock.c
  930. index 01c90d2..26f63f5 100644
  931. --- a/common/macroblock.c
  932. +++ b/common/macroblock.c
  933. @@ -400,9 +400,27 @@ void x264_macroblock_slice_init( x264_t *h )
  934.                  }
  935.          }
  936.      }
  937. -    if( h->sh.i_type == SLICE_TYPE_P )
  938. +    else if( h->sh.i_type == SLICE_TYPE_P )
  939. +    {
  940.          memset( h->mb.cache.skip, 0, sizeof( h->mb.cache.skip ) );
  941.  
  942. +        if( h->sh.i_disable_deblocking_filter_idc != 1 && h->param.analyse.i_weighted_pred )
  943. +        {
  944. +            deblock_ref_table(-2) = -2;
  945. +            deblock_ref_table(-1) = -1;
  946. +            for( int i = 0; i < h->i_ref0 << h->sh.b_mbaff; i++ )
  947. +            {
  948. +                /* Mask off high bits to avoid frame num collisions with -1/-2.
  949. +                 * In current x264 frame num values don't cover a range of more
  950. +                 * than 32, so 6 bits is enough for uniqueness. */
  951. +                if( !h->mb.b_interlaced )
  952. +                    deblock_ref_table(i) = h->fref0[i]->i_frame_num&63;
  953. +                else
  954. +                    deblock_ref_table(i) = ((h->fref0[i>>1]->i_frame_num&63)<<1) + (i&1);
  955. +            }
  956. +        }
  957. +    }
  958. +
  959.      /* init with not available (for top right idx=7,15) */
  960.      memset( h->mb.cache.ref, -2, sizeof( h->mb.cache.ref ) );
  961.  
  962. @@ -418,19 +436,6 @@ void x264_macroblock_slice_init( x264_t *h )
  963.              h->fdec->inv_ref_poc[field] = (256 + delta/2) / delta;
  964.          }
  965.  
  966. -    deblock_ref_table(-2) = -2;
  967. -    deblock_ref_table(-1) = -1;
  968. -    for( int i = 0; i < h->i_ref0 << h->sh.b_mbaff; i++ )
  969. -    {
  970. -        /* Mask off high bits to avoid frame num collisions with -1/-2.
  971. -         * In current x264 frame num values don't cover a range of more
  972. -         * than 32, so 6 bits is enough for uniqueness. */
  973. -        if( !h->mb.b_interlaced )
  974. -            deblock_ref_table(i) = h->fref0[i]->i_frame_num&63;
  975. -        else
  976. -            deblock_ref_table(i) = ((h->fref0[i>>1]->i_frame_num&63)<<1) + (i&1);
  977. -    }
  978. -
  979.      h->mb.i_neighbour4[6] =
  980.      h->mb.i_neighbour4[9] =
  981.      h->mb.i_neighbour4[12] =
  982. @@ -894,7 +899,6 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
  983.  void x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_y )
  984.  {
  985.      int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2;
  986. -    int top = (mb_y - (1 << h->mb.b_interlaced)) * h->mb.i_mb_stride + mb_x;
  987.  
  988.      h->mb.i_neighbour = 0;
  989.      h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
  990. @@ -906,9 +910,9 @@ void x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_
  991.              h->mb.i_neighbour |= MB_LEFT;
  992.      }
  993.  
  994. -    if( top >= 0 )
  995. +    if( mb_y > h->mb.b_interlaced )
  996.      {
  997. -        h->mb.i_mb_top_xy = top;
  998. +        h->mb.i_mb_top_xy = h->mb.i_mb_xy - (h->mb.i_mb_stride << h->mb.b_interlaced);
  999.          if( deblock_on_slice_edges || h->mb.slice_table[h->mb.i_mb_top_xy] == h->mb.slice_table[h->mb.i_mb_xy] )
  1000.              h->mb.i_neighbour |= MB_TOP;
  1001.      }
  1002. @@ -930,8 +934,6 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
  1003.          h->mb.i_neighbour &= ~old_neighbour;
  1004.          if( h->mb.i_neighbour )
  1005.          {
  1006. -            int left = h->mb.i_mb_left_xy;
  1007. -            int top  = h->mb.i_mb_top_xy;
  1008.              int top_y = mb_y - (1 << h->mb.b_interlaced);
  1009.              int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
  1010.              int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;
  1011. @@ -941,10 +943,11 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
  1012.              uint8_t (*nnz)[24] = h->mb.non_zero_count;
  1013.  
  1014.              if( h->mb.i_neighbour & MB_TOP )
  1015. -                CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[top][12] );
  1016. +                CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[h->mb.i_mb_top_xy][12] );
  1017.  
  1018.              if( h->mb.i_neighbour & MB_LEFT )
  1019.              {
  1020. +                int left = h->mb.i_mb_left_xy;
  1021.                  h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left][3];
  1022.                  h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left][7];
  1023.                  h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left][11];
  1024. --
  1025. 1.7.0.4
  1026.  
  1027.  
  1028. From bdc68d651db64045aecb28f27e0e05e027ab48eb Mon Sep 17 00:00:00 2001
  1029. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  1030. Date: Fri, 28 May 2010 14:30:07 -0700
  1031. Subject: [PATCH 09/11] Re-enable i8x8 merged SATD
  1032.  Accidentally got disabled when intra_sad_x3 was added.
  1033.  
  1034. ---
  1035. encoder/encoder.c |    1 +
  1036.  1 files changed, 1 insertions(+), 0 deletions(-)
  1037.  
  1038. diff --git a/encoder/encoder.c b/encoder/encoder.c
  1039. index 6e0dc54..7717ea8 100644
  1040. --- a/encoder/encoder.c
  1041. +++ b/encoder/encoder.c
  1042. @@ -810,6 +810,7 @@ static void mbcmp_init( x264_t *h )
  1043.      memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) );
  1044.      h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16;
  1045.      h->pixf.intra_mbcmp_x3_8x8c = satd ? h->pixf.intra_satd_x3_8x8c : h->pixf.intra_sad_x3_8x8c;
  1046. +    h->pixf.intra_mbcmp_x3_8x8 = satd ? h->pixf.intra_sa8d_x3_8x8 : h->pixf.intra_sad_x3_8x8;
  1047.      h->pixf.intra_mbcmp_x3_4x4 = satd ? h->pixf.intra_satd_x3_4x4 : h->pixf.intra_sad_x3_4x4;
  1048.      satd &= h->param.analyse.i_me_method == X264_ME_TESA;
  1049.      memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) );
  1050. --
  1051. 1.7.0.4
  1052.  
  1053.  
  1054. From c211bfffa59599e6a90df2e0fd00f4ae9e01ada0 Mon Sep 17 00:00:00 2001
  1055. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  1056. Date: Thu, 27 May 2010 14:27:32 -0700
  1057. Subject: [PATCH 10/11] x86 assembly code for NAL escaping
  1058.  Up to ~10x faster than C depending on CPU.
  1059.  Helps the most at very high bitrates (e.g. lossless).
  1060.  Also make the C code faster and simpler.
  1061.  
  1062. ---
  1063. Makefile                   |    4 +-
  1064.  common/bitstream.c         |   92 ++++++++++++++
  1065.  common/bitstream.h         |  299 ++++++++++++++++++++++++++++++++++++++++++++
  1066.  common/bs.h                |  291 ------------------------------------------
  1067.  common/common.c            |   54 --------
  1068.  common/common.h            |    5 +-
  1069.  common/x86/bitstream-a.asm |  112 +++++++++++++++++
  1070.  common/x86/deblock-a.asm   |    1 +
  1071.  encoder/encoder.c          |    3 +-
  1072.  tools/checkasm.c           |   52 ++++++++-
  1073.  10 files changed, 561 insertions(+), 352 deletions(-)
  1074.  create mode 100644 common/bitstream.c
  1075.  create mode 100644 common/bitstream.h
  1076.  delete mode 100644 common/bs.h
  1077.  create mode 100644 common/x86/bitstream-a.asm
  1078.  
  1079. diff --git a/Makefile b/Makefile
  1080. index 0b43a3e..519e181 100644
  1081. --- a/Makefile
  1082. +++ b/Makefile
  1083. @@ -8,7 +8,7 @@ SRCS = common/mc.c common/predict.c common/pixel.c common/macroblock.c \
  1084.         common/frame.c common/dct.c common/cpu.c common/cabac.c \
  1085.         common/common.c common/mdate.c common/rectangle.c \
  1086.         common/set.c common/quant.c common/deblock.c common/vlc.c \
  1087. -       common/mvpred.c \
  1088. +       common/mvpred.c common/bitstream.c \
  1089.         encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
  1090.         encoder/set.c encoder/macroblock.c encoder/cabac.c \
  1091.         encoder/cavlc.c encoder/encoder.c encoder/lookahead.c
  1092. @@ -52,7 +52,7 @@ endif
  1093.  ifneq ($(AS),)
  1094.  X86SRC0 = const-a.asm cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm \
  1095.            mc-a2.asm pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
  1096. -          cpu-a.asm dct-32.asm
  1097. +          cpu-a.asm dct-32.asm bitstream-a.asm
  1098.  X86SRC = $(X86SRC0:%=common/x86/%)
  1099.  
  1100.  ifeq ($(ARCH),X86)
  1101. diff --git a/common/bitstream.c b/common/bitstream.c
  1102. new file mode 100644
  1103. index 0000000..0aaac21
  1104. --- /dev/null
  1105. +++ b/common/bitstream.c
  1106. @@ -0,0 +1,92 @@
  1107. +/*****************************************************************************
  1108. + * bitstream.c: h264 encoder library
  1109. + *****************************************************************************
  1110. + * Copyright (C) 2010 x264 project
  1111. + *
  1112. + * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  1113. + *          Jason Garrett-Glaser <darkshikari@gmail.com>
  1114. + *
  1115. + * This program is free software; you can redistribute it and/or modify
  1116. + * it under the terms of the GNU General Public License as published by
  1117. + * the Free Software Foundation; either version 2 of the License, or
  1118. + * (at your option) any later version.
  1119. + *
  1120. + * This program is distributed in the hope that it will be useful,
  1121. + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  1122. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  1123. + * GNU General Public License for more details.
  1124. + *
  1125. + * You should have received a copy of the GNU General Public License
  1126. + * along with this program; if not, write to the Free Software
  1127. + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  1128. + *****************************************************************************/
  1129. +
  1130. +#include "common.h"
  1131. +
  1132. +static uint8_t *x264_nal_escape_c( uint8_t *dst, uint8_t *src, uint8_t *end )
  1133. +{
  1134. +    if( src < end ) *dst++ = *src++;
  1135. +    if( src < end ) *dst++ = *src++;
  1136. +    while( src < end )
  1137. +    {
  1138. +        if( src[0] <= 0x03 && !dst[-2] && !dst[-1] )
  1139. +            *dst++ = 0x03;
  1140. +        *dst++ = *src++;
  1141. +    }
  1142. +    return dst;
  1143. +}
  1144. +
  1145. +#ifdef HAVE_MMX
  1146. +uint8_t *x264_nal_escape_mmxext( uint8_t *dst, uint8_t *src, uint8_t *end );
  1147. +uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
  1148. +#endif
  1149. +
  1150. +/****************************************************************************
  1151. + * x264_nal_encode:
  1152. + ****************************************************************************/
  1153. +int x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal, int b_long_startcode )
  1154. +{
  1155. +    uint8_t *src = nal->p_payload;
  1156. +    uint8_t *end = nal->p_payload + nal->i_payload;
  1157. +    uint8_t *orig_dst = dst;
  1158. +
  1159. +    if( h->param.b_annexb )
  1160. +    {
  1161. +        if( b_long_startcode )
  1162. +            *dst++ = 0x00;
  1163. +        *dst++ = 0x00;
  1164. +        *dst++ = 0x00;
  1165. +        *dst++ = 0x01;
  1166. +    }
  1167. +    else /* save room for size later */
  1168. +        dst += 4;
  1169. +
  1170. +    /* nal header */
  1171. +    *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;
  1172. +
  1173. +    dst = h->bsf.nal_escape( dst, src, end );
  1174. +    int size = (dst - orig_dst) - 4;
  1175. +
  1176. +    /* Write the size header for mp4/etc */
  1177. +    if( !h->param.b_annexb )
  1178. +    {
  1179. +        /* Size doesn't include the size of the header we're writing now. */
  1180. +        orig_dst[0] = size>>24;
  1181. +        orig_dst[1] = size>>16;
  1182. +        orig_dst[2] = size>> 8;
  1183. +        orig_dst[3] = size>> 0;
  1184. +    }
  1185. +
  1186. +    return size+4;
  1187. +}
  1188. +
  1189. +void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
  1190. +{
  1191. +    pf->nal_escape = x264_nal_escape_c;
  1192. +#ifdef HAVE_MMX
  1193. +    if( cpu&X264_CPU_MMXEXT )
  1194. +        pf->nal_escape = x264_nal_escape_mmxext;
  1195. +    if( (cpu&X264_CPU_SSE2) && (cpu&X264_CPU_SSE2_IS_FAST) )
  1196. +        pf->nal_escape = x264_nal_escape_sse2;
  1197. +#endif
  1198. +}
  1199. diff --git a/common/bitstream.h b/common/bitstream.h
  1200. new file mode 100644
  1201. index 0000000..d018c7d
  1202. --- /dev/null
  1203. +++ b/common/bitstream.h
  1204. @@ -0,0 +1,299 @@
  1205. +/*****************************************************************************
  1206. + * bitstream.h: h264 encoder library
  1207. + *****************************************************************************
  1208. + * Copyright (C) 2003-2008 x264 project
  1209. + *
  1210. + * Authors: Loren Merritt <lorenm@u.washington.edu>
  1211. + *          Jason Garrett-Glaser <darkshikari@gmail.com>
  1212. + *          Laurent Aimar <fenrir@via.ecp.fr>
  1213. + *
  1214. + * This program is free software; you can redistribute it and/or modify
  1215. + * it under the terms of the GNU General Public License as published by
  1216. + * the Free Software Foundation; either version 2 of the License, or
  1217. + * (at your option) any later version.
  1218. + *
  1219. + * This program is distributed in the hope that it will be useful,
  1220. + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  1221. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  1222. + * GNU General Public License for more details.
  1223. + *
  1224. + * You should have received a copy of the GNU General Public License
  1225. + * along with this program; if not, write to the Free Software
  1226. + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  1227. + *****************************************************************************/
  1228. +
  1229. +#ifndef X264_BS_H
  1230. +#define X264_BS_H
  1231. +
  1232. +typedef struct
  1233. +{
  1234. +    uint8_t i_bits;
  1235. +    uint8_t i_size;
  1236. +} vlc_t;
  1237. +
  1238. +typedef struct
  1239. +{
  1240. +    uint16_t i_bits;
  1241. +    uint8_t  i_size;
  1242. +    /* Next level table to use */
  1243. +    uint8_t  i_next;
  1244. +} vlc_large_t;
  1245. +
  1246. +typedef struct bs_s
  1247. +{
  1248. +    uint8_t *p_start;
  1249. +    uint8_t *p;
  1250. +    uint8_t *p_end;
  1251. +
  1252. +    intptr_t cur_bits;
  1253. +    int     i_left;    /* number of bits still free in cur_bits */
  1254. +    int     i_bits_encoded; /* RD only */
  1255. +} bs_t;
  1256. +
  1257. +typedef struct
  1258. +{
  1259. +    int     last;
  1260. +    int16_t level[16];
  1261. +    uint8_t run[16];
  1262. +} x264_run_level_t;
  1263. +
  1264. +extern const vlc_t x264_coeff0_token[5];
  1265. +extern const vlc_t x264_coeff_token[5][16][4];
  1266. +extern const vlc_t x264_total_zeros[15][16];
  1267. +extern const vlc_t x264_total_zeros_dc[3][4];
  1268. +extern const vlc_t x264_run_before[7][16];
  1269. +
  1270. +typedef struct
  1271. +{
  1272. +    uint8_t *(*nal_escape) ( uint8_t *dst, uint8_t *src, uint8_t *end );
  1273. +} x264_bitstream_function_t;
  1274. +
  1275. +int x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal, int b_long_startcode );
  1276. +void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf );
  1277. +
  1278. +/* A larger level table size theoretically could help a bit at extremely
  1279. + * high bitrates, but the cost in cache is usually too high for it to be
  1280. + * useful.
  1281. + * This size appears to be optimal for QP18 encoding on a Nehalem CPU.
  1282. + * FIXME: Do further testing? */
  1283. +#define LEVEL_TABLE_SIZE 128
  1284. +extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
  1285. +
  1286. +static inline void bs_init( bs_t *s, void *p_data, int i_data )
  1287. +{
  1288. +    int offset = ((intptr_t)p_data & 3);
  1289. +    s->p       = s->p_start = (uint8_t*)p_data - offset;
  1290. +    s->p_end   = (uint8_t*)p_data + i_data;
  1291. +    s->i_left  = (WORD_SIZE - offset)*8;
  1292. +    s->cur_bits = endian_fix32( M32(s->p) );
  1293. +    s->cur_bits >>= (4-offset)*8;
  1294. +}
  1295. +static inline int bs_pos( bs_t *s )
  1296. +{
  1297. +    return( 8 * (s->p - s->p_start) + (WORD_SIZE*8) - s->i_left );
  1298. +}
  1299. +
  1300. +/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32-bit aligned. */
  1301. +static inline void bs_flush( bs_t *s )
  1302. +{
  1303. +    M32( s->p ) = endian_fix32( s->cur_bits << (s->i_left&31) );
  1304. +    s->p += WORD_SIZE - s->i_left / 8;
  1305. +    s->i_left = WORD_SIZE*8;
  1306. +}
  1307. +/* The inverse of bs_flush: prepare the bitstream to be written to again. */
  1308. +static inline void bs_realign( bs_t *s )
  1309. +{
  1310. +    int offset = ((intptr_t)s->p & 3);
  1311. +    if( offset )
  1312. +    {
  1313. +        s->p       = (uint8_t*)s->p - offset;
  1314. +        s->i_left  = (WORD_SIZE - offset)*8;
  1315. +        s->cur_bits = endian_fix32( M32(s->p) );
  1316. +        s->cur_bits >>= (4-offset)*8;
  1317. +    }
  1318. +}
  1319. +
  1320. +static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
  1321. +{ /* Append i_count bits taken from i_bits; i_bits is OR'd into the accumulator unmasked. */
  1322. +    if( WORD_SIZE == 8 ) /* 64-bit accumulator */
  1323. +    {
  1324. +        s->cur_bits = (s->cur_bits << i_count) | i_bits;
  1325. +        s->i_left -= i_count;
  1326. +        if( s->i_left <= 32 ) /* at least 32 bits are pending: store the oldest 32 */
  1327. +        {
  1328. +#ifdef WORDS_BIGENDIAN
  1329. +            M32( s->p ) = s->cur_bits >> (32 - s->i_left);
  1330. +#else
  1331. +            M32( s->p ) = endian_fix( s->cur_bits << s->i_left );
  1332. +#endif
  1333. +            s->i_left += 32; /* the stored 32 bits are free again */
  1334. +            s->p += 4;
  1335. +        }
  1336. +    }
  1337. +    else /* 32-bit accumulator */
  1338. +    {
  1339. +        if( i_count < s->i_left ) /* fits with room to spare: just append */
  1340. +        {
  1341. +            s->cur_bits = (s->cur_bits << i_count) | i_bits;
  1342. +            s->i_left -= i_count;
  1343. +        }
  1344. +        else /* fills the word: complete it, store it, keep the remainder */
  1345. +        {
  1346. +            i_count -= s->i_left; /* i_count = number of bits that do not fit */
  1347. +            s->cur_bits = (s->cur_bits << s->i_left) | (i_bits >> i_count);
  1348. +            M32( s->p ) = endian_fix( s->cur_bits );
  1349. +            s->p += 4;
  1350. +            s->cur_bits = i_bits; /* only its low i_count bits are still pending */
  1351. +            s->i_left = 32 - i_count;
  1352. +        }
  1353. +    }
  1354. +}
  1355. +
  1356. +/* Special case to eliminate branch in normal bs_write. */
  1357. +/* Golomb never writes an even-size code, so this is only used in slice headers. */
  1358. +static inline void bs_write32( bs_t *s, uint32_t i_bits )
  1359. +{
  1360. +    bs_write( s, 16, i_bits >> 16 ); /* high 16 bits first */
  1361. +    bs_write( s, 16, i_bits ); /* then the low 16; NOTE(review): the upper 16 bits are passed unmasked -- confirm bs_write tolerates that */
  1362. +}
  1363. +
  1364. +static inline void bs_write1( bs_t *s, uint32_t i_bit )
  1365. +{ /* Append a single bit; i_bit is OR'd in unmasked. */
  1366. +    s->cur_bits <<= 1;
  1367. +    s->cur_bits |= i_bit;
  1368. +    s->i_left--;
  1369. +    if( s->i_left == WORD_SIZE*8-32 ) /* exactly 32 bits are pending */
  1370. +    {
  1371. +        M32( s->p ) = endian_fix32( s->cur_bits ); /* store them as one 32-bit word */
  1372. +        s->p += 4;
  1373. +        s->i_left = WORD_SIZE*8; /* accumulator is treated as empty again */
  1374. +    }
  1375. +}
  1376. +
  1377. +static inline void bs_align_0( bs_t *s )
  1378. +{ /* Pad with 0 bits up to the next byte boundary, then flush. */
  1379. +    bs_write( s, s->i_left&7, 0 ); /* i_left&7 = bits remaining in the current byte */
  1380. +    bs_flush( s );
  1381. +}
  1382. +static inline void bs_align_1( bs_t *s )
  1383. +{ /* Pad with 1 bits up to the next byte boundary, then flush. */
  1384. +    bs_write( s, s->i_left&7, (1 << (s->i_left&7)) - 1 ); /* (1<<k)-1 is k one-bits */
  1385. +    bs_flush( s );
  1386. +}
  1387. +static inline void bs_align_10( bs_t *s )
  1388. +{ /* Pad to the next byte boundary with a 1 bit followed by 0 bits; does not flush. */
  1389. +    if( s->i_left&7 ) /* writes nothing when already byte-aligned */
  1390. +        bs_write( s, s->i_left&7, 1 << ( (s->i_left&7) - 1 ) ); /* only the top bit of the padding is set */
  1391. +}
  1392. +
  1393. +/* golomb functions */
  1394. +
  1395. +static const uint8_t x264_ue_size_tab[256] = /* tab[v] = 2*floor(log2(v))+1 for v >= 1, and tab[0] = 1; indexed with value+1 by the bs_write_ue / bs_size_ue helpers */
  1396. +{
  1397. +     1, 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7,
  1398. +     9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
  1399. +    11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
  1400. +    11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
  1401. +    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
  1402. +    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
  1403. +    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
  1404. +    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
  1405. +    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  1406. +    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  1407. +    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  1408. +    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  1409. +    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  1410. +    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  1411. +    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  1412. +    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  1413. +};
  1414. +
  1415. +static inline void bs_write_ue_big( bs_t *s, unsigned int val )
  1416. +{ /* Unsigned Exp-Golomb write for values whose val+1 may exceed the 256-entry size table. */
  1417. +    int size = 0;
  1418. +    int tmp = ++val; /* the code is built from val+1 */
  1419. +    if( tmp >= 0x10000 )
  1420. +    {
  1421. +        size = 32; /* 16 bits shifted away below add 2*16 to the code length */
  1422. +        tmp >>= 16;
  1423. +    }
  1424. +    if( tmp >= 0x100 )
  1425. +    {
  1426. +        size += 16; /* likewise 2*8 for the 8 bits shifted away */
  1427. +        tmp >>= 8;
  1428. +    }
  1429. +    size += x264_ue_size_tab[tmp]; /* tmp is now below 256, within the table */
  1430. +    bs_write( s, size>>1, 0 ); /* zero prefix */
  1431. +    bs_write( s, (size>>1)+1, val ); /* val+1 in the remaining bits */
  1432. +}
  1433. +
  1434. +/* Only works on values under 255. */
  1435. +static inline void bs_write_ue( bs_t *s, int val )
  1436. +{
  1437. +    bs_write( s, x264_ue_size_tab[val+1], val+1 ); /* one write: the zero prefix is the high zero bits of val+1 */
  1438. +}
  1439. +
  1440. +static inline void bs_write_se( bs_t *s, int val )
  1441. +{ /* Signed Exp-Golomb write: maps val to a positive code value, then emits it like bs_write_ue. */
  1442. +    int size = 0;
  1443. +    /* Faster than (val <= 0 ? -val*2+1 : val*2) */
  1444. +    /* 4 instructions on x86, 3 on ARM */
  1445. +    int tmp = 1 - val*2; /* val <= 0 maps to 1-2*val */
  1446. +    if( tmp < 0 ) tmp = val*2; /* val > 0 maps to 2*val */
  1447. +    val = tmp; /* val now holds the value to emit */
  1448. +
  1449. +    if( tmp >= 0x100 ) /* too large for the table: account for 8 bits separately */
  1450. +    {
  1451. +        size = 16;
  1452. +        tmp >>= 8;
  1453. +    }
  1454. +    size += x264_ue_size_tab[tmp]; /* NOTE(review): tmp must be below 256 here -- confirm callers bound val */
  1455. +    bs_write( s, size, val );
  1456. +}
  1457. +
  1458. +static inline void bs_write_te( bs_t *s, int x, int val )
  1459. +{ /* When x == 1 write one bit; otherwise write val with bs_write_ue. */
  1460. +    if( x == 1 )
  1461. +        bs_write1( s, 1^val ); /* the bit written is the inverse of val's low bit */
  1462. +    else //if( x > 1 )
  1463. +        bs_write_ue( s, val );
  1464. +}
  1465. +
  1466. +static inline void bs_rbsp_trailing( bs_t *s )
  1467. +{ /* Terminate the payload: a single 1 bit, then 0 bits up to the next byte boundary. */
  1468. +    bs_write1( s, 1 ); /* stop bit */
  1469. +    bs_write( s, s->i_left&7, 0  ); /* zero padding; i_left&7 = bits remaining in the current byte */
  1470. +}
  1471. +
  1472. +static ALWAYS_INLINE int bs_size_ue( unsigned int val )
  1473. +{ /* Number of bits bs_write_ue emits for val; val+1 must be a valid index into the 256-entry table. */
  1474. +    return x264_ue_size_tab[val+1];
  1475. +}
  1476. +
  1477. +static ALWAYS_INLINE int bs_size_ue_big( unsigned int val )
  1478. +{ /* Bit length of the unsigned Exp-Golomb code for val, for values beyond the table's direct range. */
  1479. +    if( val < 255 )
  1480. +        return x264_ue_size_tab[val+1];
  1481. +    else
  1482. +        return x264_ue_size_tab[(val+1)>>8] + 16; /* NOTE(review): (val+1)>>8 must stay below 256 -- confirm callers keep val under 65535 */
  1483. +}
  1484. +
  1485. +static ALWAYS_INLINE int bs_size_se( int val )
  1486. +{ /* Number of bits bs_write_se emits for val. */
  1487. +    int tmp = 1 - val*2; /* val <= 0 maps to 1-2*val */
  1488. +    if( tmp < 0 ) tmp = val*2; /* val > 0 maps to 2*val */
  1489. +    if( tmp < 256 )
  1490. +        return x264_ue_size_tab[tmp];
  1491. +    else
  1492. +        return x264_ue_size_tab[tmp>>8]+16; /* 8 bits shifted away add 2*8 to the length */
  1493. +}
  1494. +
  1495. +static ALWAYS_INLINE int bs_size_te( int x, int val )
  1496. +{ /* Number of bits bs_write_te emits for (x, val). */
  1497. +    if( x == 1 )
  1498. +        return 1; /* single-bit form */
  1499. +    else //if( x > 1 )
  1500. +        return x264_ue_size_tab[val+1];
  1501. +}
  1502. +
  1503. +#endif
  1504. diff --git a/common/bs.h b/common/bs.h
  1505. deleted file mode 100644
  1506. index 343a3c9..0000000
  1507. --- a/common/bs.h
  1508. +++ /dev/null
  1509. @@ -1,291 +0,0 @@
  1510. -/*****************************************************************************
  1511. - * bs.h :
  1512. - *****************************************************************************
  1513. - * Copyright (C) 2003-2008 x264 project
  1514. - *
  1515. - * Authors: Loren Merritt <lorenm@u.washington.edu>
  1516. - *          Jason Garrett-Glaser <darkshikari@gmail.com>
  1517. - *          Laurent Aimar <fenrir@via.ecp.fr>
  1518. - *
  1519. - * This program is free software; you can redistribute it and/or modify
  1520. - * it under the terms of the GNU General Public License as published by
  1521. - * the Free Software Foundation; either version 2 of the License, or
  1522. - * (at your option) any later version.
  1523. - *
  1524. - * This program is distributed in the hope that it will be useful,
  1525. - * but WITHOUT ANY WARRANTY; without even the implied warranty of
  1526. - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  1527. - * GNU General Public License for more details.
  1528. - *
  1529. - * You should have received a copy of the GNU General Public License
  1530. - * along with this program; if not, write to the Free Software
  1531. - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  1532. - *****************************************************************************/
  1533. -
  1534. -#ifndef X264_BS_H
  1535. -#define X264_BS_H
  1536. -
  1537. -typedef struct
  1538. -{
  1539. -    uint8_t i_bits;
  1540. -    uint8_t i_size;
  1541. -} vlc_t;
  1542. -
  1543. -typedef struct
  1544. -{
  1545. -    uint16_t i_bits;
  1546. -    uint8_t  i_size;
  1547. -    /* Next level table to use */
  1548. -    uint8_t  i_next;
  1549. -} vlc_large_t;
  1550. -
  1551. -typedef struct bs_s
  1552. -{
  1553. -    uint8_t *p_start;
  1554. -    uint8_t *p;
  1555. -    uint8_t *p_end;
  1556. -
  1557. -    intptr_t cur_bits;
  1558. -    int     i_left;    /* i_count number of available bits */
  1559. -    int     i_bits_encoded; /* RD only */
  1560. -} bs_t;
  1561. -
  1562. -typedef struct
  1563. -{
  1564. -    int     last;
  1565. -    int16_t level[16];
  1566. -    uint8_t run[16];
  1567. -} x264_run_level_t;
  1568. -
  1569. -extern const vlc_t x264_coeff0_token[5];
  1570. -extern const vlc_t x264_coeff_token[5][16][4];
  1571. -extern const vlc_t x264_total_zeros[15][16];
  1572. -extern const vlc_t x264_total_zeros_dc[3][4];
  1573. -extern const vlc_t x264_run_before[7][16];
  1574. -
  1575. -/* A larger level table size theoretically could help a bit at extremely
  1576. - * high bitrates, but the cost in cache is usually too high for it to be
  1577. - * useful.
  1578. - * This size appears to be optimal for QP18 encoding on a Nehalem CPU.
  1579. - * FIXME: Do further testing? */
  1580. -#define LEVEL_TABLE_SIZE 128
  1581. -extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
  1582. -
  1583. -static inline void bs_init( bs_t *s, void *p_data, int i_data )
  1584. -{
  1585. -    int offset = ((intptr_t)p_data & 3);
  1586. -    s->p       = s->p_start = (uint8_t*)p_data - offset;
  1587. -    s->p_end   = (uint8_t*)p_data + i_data;
  1588. -    s->i_left  = (WORD_SIZE - offset)*8;
  1589. -    s->cur_bits = endian_fix32( M32(s->p) );
  1590. -    s->cur_bits >>= (4-offset)*8;
  1591. -}
  1592. -static inline int bs_pos( bs_t *s )
  1593. -{
  1594. -    return( 8 * (s->p - s->p_start) + (WORD_SIZE*8) - s->i_left );
  1595. -}
  1596. -
  1597. -/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32-bit aligned. */
  1598. -static inline void bs_flush( bs_t *s )
  1599. -{
  1600. -    M32( s->p ) = endian_fix32( s->cur_bits << (s->i_left&31) );
  1601. -    s->p += WORD_SIZE - s->i_left / 8;
  1602. -    s->i_left = WORD_SIZE*8;
  1603. -}
  1604. -/* The inverse of bs_flush: prepare the bitstream to be written to again. */
  1605. -static inline void bs_realign( bs_t *s )
  1606. -{
  1607. -    int offset = ((intptr_t)s->p & 3);
  1608. -    if( offset )
  1609. -    {
  1610. -        s->p       = (uint8_t*)s->p - offset;
  1611. -        s->i_left  = (WORD_SIZE - offset)*8;
  1612. -        s->cur_bits = endian_fix32( M32(s->p) );
  1613. -        s->cur_bits >>= (4-offset)*8;
  1614. -    }
  1615. -}
  1616. -
  1617. -static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
  1618. -{
  1619. -    if( WORD_SIZE == 8 )
  1620. -    {
  1621. -        s->cur_bits = (s->cur_bits << i_count) | i_bits;
  1622. -        s->i_left -= i_count;
  1623. -        if( s->i_left <= 32 )
  1624. -        {
  1625. -#ifdef WORDS_BIGENDIAN
  1626. -            M32( s->p ) = s->cur_bits >> (32 - s->i_left);
  1627. -#else
  1628. -            M32( s->p ) = endian_fix( s->cur_bits << s->i_left );
  1629. -#endif
  1630. -            s->i_left += 32;
  1631. -            s->p += 4;
  1632. -        }
  1633. -    }
  1634. -    else
  1635. -    {
  1636. -        if( i_count < s->i_left )
  1637. -        {
  1638. -            s->cur_bits = (s->cur_bits << i_count) | i_bits;
  1639. -            s->i_left -= i_count;
  1640. -        }
  1641. -        else
  1642. -        {
  1643. -            i_count -= s->i_left;
  1644. -            s->cur_bits = (s->cur_bits << s->i_left) | (i_bits >> i_count);
  1645. -            M32( s->p ) = endian_fix( s->cur_bits );
  1646. -            s->p += 4;
  1647. -            s->cur_bits = i_bits;
  1648. -            s->i_left = 32 - i_count;
  1649. -        }
  1650. -    }
  1651. -}
  1652. -
  1653. -/* Special case to eliminate branch in normal bs_write. */
  1654. -/* Golomb never writes an even-size code, so this is only used in slice headers. */
  1655. -static inline void bs_write32( bs_t *s, uint32_t i_bits )
  1656. -{
  1657. -    bs_write( s, 16, i_bits >> 16 );
  1658. -    bs_write( s, 16, i_bits );
  1659. -}
  1660. -
  1661. -static inline void bs_write1( bs_t *s, uint32_t i_bit )
  1662. -{
  1663. -    s->cur_bits <<= 1;
  1664. -    s->cur_bits |= i_bit;
  1665. -    s->i_left--;
  1666. -    if( s->i_left == WORD_SIZE*8-32 )
  1667. -    {
  1668. -        M32( s->p ) = endian_fix32( s->cur_bits );
  1669. -        s->p += 4;
  1670. -        s->i_left = WORD_SIZE*8;
  1671. -    }
  1672. -}
  1673. -
  1674. -static inline void bs_align_0( bs_t *s )
  1675. -{
  1676. -    bs_write( s, s->i_left&7, 0 );
  1677. -    bs_flush( s );
  1678. -}
  1679. -static inline void bs_align_1( bs_t *s )
  1680. -{
  1681. -    bs_write( s, s->i_left&7, (1 << (s->i_left&7)) - 1 );
  1682. -    bs_flush( s );
  1683. -}
  1684. -static inline void bs_align_10( bs_t *s )
  1685. -{
  1686. -    if( s->i_left&7 )
  1687. -        bs_write( s, s->i_left&7, 1 << ( (s->i_left&7) - 1 ) );
  1688. -}
  1689. -
  1690. -/* golomb functions */
  1691. -
  1692. -static const uint8_t x264_ue_size_tab[256] =
  1693. -{
  1694. -     1, 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7,
  1695. -     9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
  1696. -    11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
  1697. -    11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
  1698. -    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
  1699. -    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
  1700. -    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
  1701. -    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
  1702. -    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  1703. -    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  1704. -    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  1705. -    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  1706. -    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  1707. -    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  1708. -    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  1709. -    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  1710. -};
  1711. -
  1712. -static inline void bs_write_ue_big( bs_t *s, unsigned int val )
  1713. -{
  1714. -    int size = 0;
  1715. -    int tmp = ++val;
  1716. -    if( tmp >= 0x10000 )
  1717. -    {
  1718. -        size = 32;
  1719. -        tmp >>= 16;
  1720. -    }
  1721. -    if( tmp >= 0x100 )
  1722. -    {
  1723. -        size += 16;
  1724. -        tmp >>= 8;
  1725. -    }
  1726. -    size += x264_ue_size_tab[tmp];
  1727. -    bs_write( s, size>>1, 0 );
  1728. -    bs_write( s, (size>>1)+1, val );
  1729. -}
  1730. -
  1731. -/* Only works on values under 255. */
  1732. -static inline void bs_write_ue( bs_t *s, int val )
  1733. -{
  1734. -    bs_write( s, x264_ue_size_tab[val+1], val+1 );
  1735. -}
  1736. -
  1737. -static inline void bs_write_se( bs_t *s, int val )
  1738. -{
  1739. -    int size = 0;
  1740. -    /* Faster than (val <= 0 ? -val*2+1 : val*2) */
  1741. -    /* 4 instructions on x86, 3 on ARM */
  1742. -    int tmp = 1 - val*2;
  1743. -    if( tmp < 0 ) tmp = val*2;
  1744. -    val = tmp;
  1745. -
  1746. -    if( tmp >= 0x100 )
  1747. -    {
  1748. -        size = 16;
  1749. -        tmp >>= 8;
  1750. -    }
  1751. -    size += x264_ue_size_tab[tmp];
  1752. -    bs_write( s, size, val );
  1753. -}
  1754. -
  1755. -static inline void bs_write_te( bs_t *s, int x, int val )
  1756. -{
  1757. -    if( x == 1 )
  1758. -        bs_write1( s, 1^val );
  1759. -    else //if( x > 1 )
  1760. -        bs_write_ue( s, val );
  1761. -}
  1762. -
  1763. -static inline void bs_rbsp_trailing( bs_t *s )
  1764. -{
  1765. -    bs_write1( s, 1 );
  1766. -    bs_write( s, s->i_left&7, 0  );
  1767. -}
  1768. -
  1769. -static ALWAYS_INLINE int bs_size_ue( unsigned int val )
  1770. -{
  1771. -    return x264_ue_size_tab[val+1];
  1772. -}
  1773. -
  1774. -static ALWAYS_INLINE int bs_size_ue_big( unsigned int val )
  1775. -{
  1776. -    if( val < 255 )
  1777. -        return x264_ue_size_tab[val+1];
  1778. -    else
  1779. -        return x264_ue_size_tab[(val+1)>>8] + 16;
  1780. -}
  1781. -
  1782. -static ALWAYS_INLINE int bs_size_se( int val )
  1783. -{
  1784. -    int tmp = 1 - val*2;
  1785. -    if( tmp < 0 ) tmp = val*2;
  1786. -    if( tmp < 256 )
  1787. -        return x264_ue_size_tab[tmp];
  1788. -    else
  1789. -        return x264_ue_size_tab[tmp>>8]+16;
  1790. -}
  1791. -
  1792. -static ALWAYS_INLINE int bs_size_te( int x, int val )
  1793. -{
  1794. -    if( x == 1 )
  1795. -        return 1;
  1796. -    else //if( x > 1 )
  1797. -        return x264_ue_size_tab[val+1];
  1798. -}
  1799. -
  1800. -#endif
  1801. diff --git a/common/common.c b/common/common.c
  1802. index fccf2b0..2458f65 100644
  1803. --- a/common/common.c
  1804. +++ b/common/common.c
  1805. @@ -1027,60 +1027,6 @@ void x264_picture_clean( x264_picture_t *pic )
  1806.  }
  1807.  
  1808.  /****************************************************************************
  1809. - * x264_nal_encode:
  1810. - ****************************************************************************/
  1811. -int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_startcode )
  1812. -{
  1813. -    uint8_t *src = nal->p_payload;
  1814. -    uint8_t *end = nal->p_payload + nal->i_payload;
  1815. -    uint8_t *orig_dst = dst;
  1816. -    int i_count = 0, size;
  1817. -
  1818. -    if( b_annexb )
  1819. -    {
  1820. -        if( b_long_startcode )
  1821. -            *dst++ = 0x00;
  1822. -        *dst++ = 0x00;
  1823. -        *dst++ = 0x00;
  1824. -        *dst++ = 0x01;
  1825. -    }
  1826. -    else /* save room for size later */
  1827. -        dst += 4;
  1828. -
  1829. -    /* nal header */
  1830. -    *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;
  1831. -
  1832. -    while( src < end )
  1833. -    {
  1834. -        if( i_count == 2 && *src <= 0x03 )
  1835. -        {
  1836. -            *dst++ = 0x03;
  1837. -            i_count = 0;
  1838. -        }
  1839. -        if( *src == 0 )
  1840. -            i_count++;
  1841. -        else
  1842. -            i_count = 0;
  1843. -        *dst++ = *src++;
  1844. -    }
  1845. -    size = (dst - orig_dst) - 4;
  1846. -
  1847. -    /* Write the size header for mp4/etc */
  1848. -    if( !b_annexb )
  1849. -    {
  1850. -        /* Size doesn't include the size of the header we're writing now. */
  1851. -        orig_dst[0] = size>>24;
  1852. -        orig_dst[1] = size>>16;
  1853. -        orig_dst[2] = size>> 8;
  1854. -        orig_dst[3] = size>> 0;
  1855. -    }
  1856. -
  1857. -    return size+4;
  1858. -}
  1859. -
  1860. -
  1861. -
  1862. -/****************************************************************************
  1863.   * x264_malloc:
  1864.   ****************************************************************************/
  1865.  void *x264_malloc( int i_size )
  1866. diff --git a/common/common.h b/common/common.h
  1867. index 539ea65..93712fe 100644
  1868. --- a/common/common.h
  1869. +++ b/common/common.h
  1870. @@ -137,7 +137,7 @@ static const int x264_scan8[16+2*4+3] =
  1871.  */
  1872.  
  1873.  #include "x264.h"
  1874. -#include "bs.h"
  1875. +#include "bitstream.h"
  1876.  #include "set.h"
  1877.  #include "predict.h"
  1878.  #include "pixel.h"
  1879. @@ -166,8 +166,6 @@ int64_t x264_mdate( void );
  1880.   * the encoding options */
  1881.  char *x264_param2string( x264_param_t *p, int b_res );
  1882.  
  1883. -int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_startcode );
  1884. -
  1885.  /* log */
  1886.  void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
  1887.  
  1888. @@ -796,6 +794,7 @@ struct x264_t
  1889.      x264_zigzag_function_t zigzagf;
  1890.      x264_quant_function_t quantf;
  1891.      x264_deblock_function_t loopf;
  1892. +    x264_bitstream_function_t bsf;
  1893.  
  1894.  #ifdef HAVE_VISUALIZE
  1895.      struct visualize_t *visualize;
  1896. diff --git a/common/x86/bitstream-a.asm b/common/x86/bitstream-a.asm
  1897. new file mode 100644
  1898. index 0000000..1fb4cea
  1899. --- /dev/null
  1900. +++ b/common/x86/bitstream-a.asm
  1901. @@ -0,0 +1,112 @@
  1902. +;*****************************************************************************
  1903. +;* bitstream-a.asm: h264 encoder library
  1904. +;*****************************************************************************
  1905. +;* Copyright (C) 2010 x264 project
  1906. +;*
  1907. +;* Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
  1908. +;*
  1909. +;* This program is free software; you can redistribute it and/or modify
  1910. +;* it under the terms of the GNU General Public License as published by
  1911. +;* the Free Software Foundation; either version 2 of the License, or
  1912. +;* (at your option) any later version.
  1913. +;*
  1914. +;* This program is distributed in the hope that it will be useful,
  1915. +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  1916. +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  1917. +;* GNU General Public License for more details.
  1918. +;*
  1919. +;* You should have received a copy of the GNU General Public License
  1920. +;* along with this program; if not, write to the Free Software
  1921. +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  1922. +;*****************************************************************************
  1923. +
  1924. +%include "x86inc.asm"
  1925. +%include "x86util.asm"
  1926. +
  1927. +SECTION .text
  1928. +
  1929. +;-----------------------------------------------------------------------------
  1930. +; uint8_t *x264_nal_escape( uint8_t *dst, uint8_t *src, uint8_t *end )
  1931. +;-----------------------------------------------------------------------------
  1932. +
  1933. +%macro NAL_LOOP 2
  1934. +ALIGN 16
  1935. +%1:
  1936. +    mova      m0, [r1+r2] ; load mmsize source bytes (r1 = negative offset from end, r2 = end)
  1937. +    mova      m1, m0
  1938. +%if mmsize == 8
  1939. +    psrlq     m0, 8 ; m0 = the same bytes shifted down by one byte position
  1940. +%else
  1941. +    psrldq    m0, 1
  1942. +%endif
  1943. +    %2   [r0+r1], m1 ; copy the bytes to dst unchanged, using the store instruction given as argument 2
  1944. +    por       m1, m0 ; lane i = src[i] | src[i+1] (last lane is src alone, since zero is shifted in)
  1945. +    pcmpeqb   m1, m2 ; compare with m2, which NAL_ESCAPE zeroes with pxor
  1946. +    pmovmskb r3d, m1
  1947. +    test     r3d, r3d
  1948. +    jnz .escape ; some lane was zero: redo this span in the scalar escape loop
  1949. +    add       r1, mmsize
  1950. +    jl %1 ; continue while the offset is still negative (src not exhausted)
  1951. +%endmacro
  1952. +
  1953. +%macro NAL_ESCAPE 1
  1954. +
  1955. +cglobal nal_escape_%1, 3,5
  1956. +    pxor      m2, m2 ; all-zero comparand used by NAL_LOOP's pcmpeqb
  1957. +    sub       r1, r2 ; r1 = offset of current src pointer from end of src
  1958. +    sub       r0, r1 ; r0 = projected end of dst, assuming no more escapes
  1959. +
  1960. +    mov      r3w, [r1+r2] ; copy the first two bytes unconditionally
  1961. +    mov  [r0+r1], r3w
  1962. +    add       r1, 2
  1963. +    jge .ret ; input was at most two bytes long
  1964. +
  1965. +    ; Start off by jumping into the escape loop in
  1966. +    ; case there's an escape at the start.
  1967. +    ; And do a few more in scalar until src is aligned again.
  1968. +    lea      r4d, [r1+r2] ; low bits of the current src address
  1969. +    or       r4d, -mmsize
  1970. +    neg      r4d ; r4d = bytes up to the next mmsize boundary of src (1..mmsize)
  1971. +    jmp .escapeloop
  1972. +
  1973. +    NAL_LOOP .loop_aligned, mova
  1974. +%if mmsize==16
  1975. +    NAL_LOOP .loop_unaligned, movu
  1976. +%endif
  1977. +
  1978. +.ret:
  1979. +    movifnidn rax, r0 ; return value is r0, the end of the written dst
  1980. +    RET
  1981. +ALIGN 16
  1982. +.escape:
  1983. +    mov      r4d, mmsize ; handle the next mmsize bytes one at a time
  1984. +.escapeloop:
  1985. +    mov      r3b, [r1+r2]
  1986. +    cmp      r3b, 3
  1987. +    jna .escape_check ; byte values 0..3 may need an escape in front of them
  1988. +.copy:
  1989. +    mov  [r0+r1], r3b
  1990. +    inc      r1
  1991. +    jge .ret ; reached the end of src
  1992. +    dec      r4d
  1993. +    jg .escapeloop
  1994. +    cmp byte [r1+r2-1], 0 ; Don't go back to the main loop until we're out of a zero-run.
  1995. +    jz .escape
  1996. +%if mmsize==16
  1997. +    lea      r4d, [r0+r1] ; current dst address
  1998. +    test     r4d, mmsize-1
  1999. +    jnz .loop_unaligned ; dst is not mmsize-aligned: use the movu-store loop
  2000. +%endif
  2001. +    jmp .loop_aligned
  2002. +.escape_check:
  2003. +    cmp word [r0+r1-2], 0 ; were the last two output bytes both zero?
  2004. +    jnz .copy
  2005. +    mov byte [r0+r1], 3 ; yes: insert the 0x03 escape byte first
  2006. +    inc      r0 ; every later output byte lands one position further along
  2007. +    jmp .copy
  2008. +%endmacro
  2009. +
  2010. +INIT_MMX
  2011. +NAL_ESCAPE mmxext
  2012. +INIT_XMM
  2013. +NAL_ESCAPE sse2
  2014. diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
  2015. index aedd688..3a31e26 100644
  2016. --- a/common/x86/deblock-a.asm
  2017. +++ b/common/x86/deblock-a.asm
  2018. @@ -4,6 +4,7 @@
  2019.  ;* Copyright (C) 2005-2008 x264 project
  2020.  ;*
  2021.  ;* Authors: Loren Merritt <lorenm@u.washington.edu>
  2022. +;*          Jason Garrett-Glaser <darkshikari@gmail.com>
  2023.  ;*
  2024.  ;* This program is free software; you can redistribute it and/or modify
  2025.  ;* it under the terms of the GNU General Public License as published by
  2026. diff --git a/encoder/encoder.c b/encoder/encoder.c
  2027. index 7717ea8..2f9e7f6 100644
  2028. --- a/encoder/encoder.c
  2029. +++ b/encoder/encoder.c
  2030. @@ -987,6 +987,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
  2031.      x264_mc_init( h->param.cpu, &h->mc );
  2032.      x264_quant_init( h, h->param.cpu, &h->quantf );
  2033.      x264_deblock_init( h->param.cpu, &h->loopf );
  2034. +    x264_bitstream_init( h->param.cpu, &h->bsf );
  2035.      x264_dct_init_weights();
  2036.  
  2037.      mbcmp_init( h );
  2038. @@ -1273,7 +1274,7 @@ static int x264_encoder_encapsulate_nals( x264_t *h, int start )
  2039.      for( int i = start; i < h->out.i_nal; i++ )
  2040.      {
  2041.          int long_startcode = !i || h->out.nal[i].i_type == NAL_SPS || h->out.nal[i].i_type == NAL_PPS;
  2042. -        int size = x264_nal_encode( nal_buffer, &h->out.nal[i], h->param.b_annexb, long_startcode );
  2043. +        int size = x264_nal_encode( h, nal_buffer, &h->out.nal[i], long_startcode );
  2044.          h->out.nal[i].i_payload = size;
  2045.          h->out.nal[i].p_payload = nal_buffer;
  2046.          nal_buffer += size;
  2047. diff --git a/tools/checkasm.c b/tools/checkasm.c
  2048. index a0a9d54..ea6f209 100644
  2049. --- a/tools/checkasm.c
  2050. +++ b/tools/checkasm.c
  2051. @@ -1661,6 +1661,55 @@ static int check_cabac( int cpu_ref, int cpu_new )
  2052.      return ret;
  2053.  }
  2054.  
  2055. +static int check_bitstream( int cpu_ref, int cpu_new )
  2056. +{ /* Compare the nal_escape selected for cpu_new against the one selected with cpu flags 0, on random input. */
  2057. +    x264_bitstream_function_t bs_c;
  2058. +    x264_bitstream_function_t bs_ref;
  2059. +    x264_bitstream_function_t bs_a;
  2060. +
  2061. +    int ret = 0, ok = 1, used_asm = 0; /* ok and used_asm are presumably read by the report() macro -- TODO confirm */
  2062. +
  2063. +    x264_bitstream_init( 0, &bs_c );
  2064. +    x264_bitstream_init( cpu_ref, &bs_ref );
  2065. +    x264_bitstream_init( cpu_new, &bs_a );
  2066. +    if( bs_a.nal_escape != bs_ref.nal_escape ) /* only test when cpu_new selects a different function than cpu_ref */
  2067. +    {
  2068. +        int size = 0x4000;
  2069. +        uint8_t *input = malloc(size+100); /* NOTE(review): the three malloc results are not checked */
  2070. +        uint8_t *output1 = malloc(size*2); /* outputs are sized at twice the input */
  2071. +        uint8_t *output2 = malloc(size*2);
  2072. +        used_asm = 1;
  2073. +        set_func_name( "nal_escape" );
  2074. +        for( int i = 0; i < 100; i++ )
  2075. +        {
  2076. +            /* Test corner-case sizes */
  2077. +            int test_size = i < 10 ? i+1 : rand() & 0x3fff; /* sizes 1..10 first, then random sizes in 0..0x3fff */
  2078. +            for( int j = 0; j < test_size; j++ )
  2079. +                input[j] = (rand()&1) * rand(); /* byte is zero whenever rand()&1 is 0, so zero runs are common */
  2080. +            uint8_t *end_c = (uint8_t*)call_c1( bs_c.nal_escape, output1, input, input+test_size );
  2081. +            uint8_t *end_a = (uint8_t*)call_a1( bs_a.nal_escape, output2, input, input+test_size );
  2082. +            int size_c = end_c-output1; /* each implementation returns the end of what it wrote */
  2083. +            int size_a = end_a-output2;
  2084. +            if( size_c != size_a || memcmp( output1, output2, size_c ) ) /* lengths and contents must both match */
  2085. +            {
  2086. +                fprintf( stderr, "nal_escape :  [FAILED] %d %d\n", size_c, size_a );
  2087. +                ok = 0;
  2088. +                break;
  2089. +            }
  2090. +        }
  2091. +        for( int j = 0; j < size; j++ )
  2092. +            input[j] = rand();
  2093. +        call_c2( bs_c.nal_escape, output1, input, input+size ); /* results unused here; presumably timing runs -- TODO confirm */
  2094. +        call_a2( bs_a.nal_escape, output2, input, input+size );
  2095. +        free(input);
  2096. +        free(output1);
  2097. +        free(output2);
  2098. +    }
  2099. +    report( "nal escape:" );
  2100. +
  2101. +    return ret;
  2102. +}
  2103. +
  2104.  static int check_all_funcs( int cpu_ref, int cpu_new )
  2105.  {
  2106.      return check_pixel( cpu_ref, cpu_new )
  2107. @@ -1669,7 +1718,8 @@ static int check_all_funcs( int cpu_ref, int cpu_new )
  2108.           + check_intra( cpu_ref, cpu_new )
  2109.           + check_deblock( cpu_ref, cpu_new )
  2110.           + check_quant( cpu_ref, cpu_new )
  2111. -         + check_cabac( cpu_ref, cpu_new );
  2112. +         + check_cabac( cpu_ref, cpu_new )
  2113. +         + check_bitstream( cpu_ref, cpu_new );
  2114.  }
  2115.  
  2116.  static int add_flags( int *cpu_ref, int *cpu_new, int flags, const char *name )
  2117. --
  2118. 1.7.0.4
  2119.  
  2120.  
  2121. From 9efc381b344f784285e10cf6a836f9efdf1035b8 Mon Sep 17 00:00:00 2001
  2122. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  2123. Date: Fri, 28 May 2010 14:27:22 -0700
  2124. Subject: [PATCH 11/11] Add API tool to apply arbitrary quantizer offsets
  2125.  The calling application can now pass a "map" of quantizer offsets to apply to each frame.
  2126.  An optional callback to free the map can also be included.
  2127.  This allows all kinds of flexible region-of-interest coding and similar.
  2128.  
  2129. ---
  2130. common/common.c       |    2 +-
  2131.  encoder/encoder.c     |    7 +++++--
  2132.  encoder/ratecontrol.c |   36 +++++++++++++++++++++++++-----------
  2133.  encoder/ratecontrol.h |    4 ++--
  2134.  x264.h                |   20 +++++++++++++++++++-
  2135.  5 files changed, 52 insertions(+), 17 deletions(-)
  2136.  
  2137. diff --git a/common/common.c b/common/common.c
  2138. index 2458f65..48e1bbc 100644
  2139. --- a/common/common.c
  2140. +++ b/common/common.c
  2141. @@ -998,6 +998,7 @@ static void x264_log_default( void *p_unused, int i_level, const char *psz_fmt,
  2142.   ****************************************************************************/
  2143.  int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height )
  2144.  {
  2145. +    memset( pic, 0, sizeof( x264_picture_t ) );
  2146.      pic->i_type = X264_TYPE_AUTO;
  2147.      pic->i_qpplus1 = 0;
  2148.      pic->img.i_csp = i_csp;
  2149. @@ -1010,7 +1011,6 @@ int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_heigh
  2150.      pic->img.i_stride[0] = i_width;
  2151.      pic->img.i_stride[1] = i_width / 2;
  2152.      pic->img.i_stride[2] = i_width / 2;
  2153. -    pic->param = NULL;
  2154.      pic->i_pic_struct = PIC_STRUCT_AUTO;
  2155.      return 0;
  2156.  }
  2157. diff --git a/encoder/encoder.c b/encoder/encoder.c
  2158. index 2f9e7f6..89107a3 100644
  2159. --- a/encoder/encoder.c
  2160. +++ b/encoder/encoder.c
  2161. @@ -2250,11 +2250,14 @@ int     x264_encoder_encode( x264_t *h,
  2162.  
  2163.          if( h->param.rc.b_mb_tree && h->param.rc.b_stat_read )
  2164.          {
  2165. -            if( x264_macroblock_tree_read( h, fenc ) )
  2166. +            if( x264_macroblock_tree_read( h, fenc, pic_in->prop.quant_offsets ) )
  2167.                  return -1;
  2168.          }
  2169.          else
  2170. -            x264_adaptive_quant_frame( h, fenc );
  2171. +            x264_adaptive_quant_frame( h, fenc, pic_in->prop.quant_offsets );
  2172. +
  2173. +        if( pic_in->prop.quant_offsets_free )
  2174. +            pic_in->prop.quant_offsets_free( pic_in->prop.quant_offsets );
  2175.  
  2176.          if( h->frames.b_have_lowres )
  2177.              x264_frame_init_lowres( h, fenc );
  2178. diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
  2179. index bf0a400..d09de98 100644
  2180. --- a/encoder/ratecontrol.c
  2181. +++ b/encoder/ratecontrol.c
  2182. @@ -235,7 +235,7 @@ static NOINLINE uint32_t x264_ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_
  2183.      return var;
  2184.  }
  2185.  
  2186. -void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
  2187. +void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_offsets )
  2188.  {
  2189.      /* constants chosen to result in approximately the same overall bitrate as without AQ.
  2190.       * FIXME: while they're written in 5 significant digits, they're only tuned to 2. */
  2191. @@ -256,11 +256,22 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
  2192.          /* Need to init it anyways for MB tree */
  2193.          if( h->param.rc.f_aq_strength == 0 )
  2194.          {
  2195. -            memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
  2196. -            memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
  2197. -            if( h->frames.b_have_lowres )
  2198. +            if( quant_offsets )
  2199. +            {
  2200.                  for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
  2201. -                    frame->i_inv_qscale_factor[mb_xy] = 256;
  2202. +                    frame->f_qp_offset[mb_xy] = frame->f_qp_offset_aq[mb_xy] = quant_offsets[mb_xy];
  2203. +                if( h->frames.b_have_lowres )
  2204. +                    for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
  2205. +                        frame->i_inv_qscale_factor[mb_xy] = x264_exp2fix8( frame->f_qp_offset[mb_xy] );
  2206. +            }
  2207. +            else
  2208. +            {
  2209. +                memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
  2210. +                memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
  2211. +                if( h->frames.b_have_lowres )
  2212. +                    for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
  2213. +                        frame->i_inv_qscale_factor[mb_xy] = 256;
  2214. +            }
  2215.          }
  2216.          /* Need variance data for weighted prediction */
  2217.          if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
  2218. @@ -299,9 +310,10 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
  2219.              for( int mb_x = 0; mb_x < width; mb_x++ )
  2220.              {
  2221.                  float qp_adj;
  2222. +                int mb_xy = mb_x + mb_y*h->mb.i_mb_stride;
  2223.                  if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
  2224.                  {
  2225. -                    qp_adj = frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride];
  2226. +                    qp_adj = frame->f_qp_offset[mb_xy];
  2227.                      qp_adj = strength * (qp_adj - avg_adj);
  2228.                  }
  2229.                  else
  2230. @@ -309,10 +321,12 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
  2231.                      uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
  2232.                      qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - 14.427f);
  2233.                  }
  2234. -                frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] =
  2235. -                frame->f_qp_offset_aq[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
  2236. +                if( quant_offsets )
  2237. +                    qp_adj += quant_offsets[mb_xy];
  2238. +                frame->f_qp_offset[mb_xy] =
  2239. +                frame->f_qp_offset_aq[mb_xy] = qp_adj;
  2240.                  if( h->frames.b_have_lowres )
  2241. -                    frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj);
  2242. +                    frame->i_inv_qscale_factor[mb_xy] = x264_exp2fix8(qp_adj);
  2243.              }
  2244.      }
  2245.  
  2246. @@ -327,7 +341,7 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
  2247.      }
  2248.  }
  2249.  
  2250. -int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame )
  2251. +int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame, float *quant_offsets )
  2252.  {
  2253.      x264_ratecontrol_t *rc = h->rc;
  2254.      uint8_t i_type_actual = rc->entry[frame->i_frame].pict_type;
  2255. @@ -363,7 +377,7 @@ int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame )
  2256.          rc->qpbuf_pos--;
  2257.      }
  2258.      else
  2259. -        x264_adaptive_quant_frame( h, frame );
  2260. +        x264_adaptive_quant_frame( h, frame, quant_offsets );
  2261.      return 0;
  2262.  fail:
  2263.      x264_log(h, X264_LOG_ERROR, "Incomplete MB-tree stats file.\n");
  2264. diff --git a/encoder/ratecontrol.h b/encoder/ratecontrol.h
  2265. index e052b2a..dd139eb 100644
  2266. --- a/encoder/ratecontrol.h
  2267. +++ b/encoder/ratecontrol.h
  2268. @@ -29,8 +29,8 @@ void x264_ratecontrol_delete( x264_t * );
  2269.  
  2270.  void x264_ratecontrol_init_reconfigurable( x264_t *h, int b_init );
  2271.  
  2272. -void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame );
  2273. -int  x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame );
  2274. +void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_offsets );
  2275. +int  x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame, float *quant_offsets );
  2276.  int  x264_reference_build_list_optimal( x264_t *h );
  2277.  void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next );
  2278.  void x264_ratecontrol_start( x264_t *, int i_force_qp, int overhead );
  2279. diff --git a/x264.h b/x264.h
  2280. index 95efd88..a4b3400 100644
  2281. --- a/x264.h
  2282. +++ b/x264.h
  2283. @@ -35,7 +35,7 @@
  2284.  
  2285.  #include <stdarg.h>
  2286.  
  2287. -#define X264_BUILD 96
  2288. +#define X264_BUILD 97
  2289.  
  2290.  /* x264_t:
  2291.   *      opaque handler for encoder */
  2292. @@ -508,6 +508,22 @@ typedef struct
  2293.  
  2294.  typedef struct
  2295.  {
  2296. +    /* In: an array of quantizer offsets to be applied to this image during encoding.
  2297. +     *     These are added on top of the decisions made by x264.
  2298. +     *     Offsets can be fractional; they are added before QPs are rounded to integer.
  2299. +     *     Adaptive quantization must be enabled to use this feature.  Behavior if quant
  2300. +     *     offsets differ between encoding passes is undefined.
  2301. +     *
  2302. +     *     Array contains one offset per macroblock, in raster scan order.  In interlaced
  2303. +     *     mode, top-field MBs and bottom-field MBs are interleaved at the row level. */
  2304. +    float *quant_offsets;
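  2305. +    /* In: optional callback to free quant_offsets.  If set, the encoder calls it with quant_offsets
  2306. +     *     as its argument after it has consumed the offsets for this frame.  Useful if one wants to use a different quant_offsets array for each frame. */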
  2307. +    void (*quant_offsets_free)( void* );
  2308. +} x264_image_properties_t;
  2309. +
  2310. +typedef struct
  2311. +{
  2312.      /* In: force picture type (if not auto)
  2313.       *     If x264 encoding parameters are violated in the forcing of picture types,
  2314.       *     x264 will correct the input picture type and log a warning.
  2315. @@ -537,6 +553,8 @@ typedef struct
  2316.      x264_param_t *param;
  2317.      /* In: raw data */
  2318.      x264_image_t img;
  2319. +    /* In: optional information to modify encoder decisions for this frame */
  2320. +    x264_image_properties_t prop;
  2321.      /* Out: HRD timing information. Output only when i_nal_hrd is set. */
  2322.      x264_hrd_t hrd_timing;
  2323.      /* private user data. libx264 doesn't touch this,
  2324. --
  2325. 1.7.0.4
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement