  1. From 6ec28a907abee4ebb86c68e404cfe20483e1a128 Mon Sep 17 00:00:00 2001
  2. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  3. Date: Wed, 26 May 2010 12:55:35 -0700
  4. Subject: [PATCH 1/8] Merge some of adaptive quant and weightp
  5.  Eliminate redundant work; both of them were calculating variance of the frame.
  6.  
  7. ---
  8. common/frame.h        |    4 +-
  9.  encoder/analyse.h     |    1 -
  10.  encoder/encoder.c     |   12 ++---
  11.  encoder/ratecontrol.c |  124 +++++++++++++++++++++++++++++++-----------------
  12.  encoder/slicetype.c   |   31 ++----------
  13.  5 files changed, 92 insertions(+), 80 deletions(-)
  14.  
  15. diff --git a/common/frame.h b/common/frame.h
  16. index 91d27b5..ca5cb7a 100644
  17. --- a/common/frame.h
  18. +++ b/common/frame.h
  19. @@ -118,8 +118,8 @@ typedef struct x264_frame
  20.      uint16_t *i_inv_qscale_factor;
  21.      int     b_scenecut; /* Set to zero if the frame cannot possibly be part of a real scenecut. */
  22.      float   f_weighted_cost_delta[X264_BFRAME_MAX+2];
  23. -    uint32_t i_pixel_sum;
  24. -    uint64_t i_pixel_ssd;
  25. +    uint32_t i_pixel_sum[3];
  26. +    uint64_t i_pixel_ssd[3];
  27.  
  28.      /* hrd */
  29.      x264_hrd_t hrd_timing;
  30. diff --git a/encoder/analyse.h b/encoder/analyse.h
  31. index 7c2c22c..53e4c2e 100644
  32. --- a/encoder/analyse.h
  33. +++ b/encoder/analyse.h
  34. @@ -33,7 +33,6 @@ void x264_slicetype_decide( x264_t *h );
  35.  void x264_slicetype_analyse( x264_t *h, int keyframe );
  36.  
  37.  int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w );
  38. -void x264_weight_plane_analyse( x264_t *h, x264_frame_t *frame );
  39.  
  40.  int  x264_lookahead_init( x264_t *h, int i_slicetype_length );
  41.  int  x264_lookahead_is_empty( x264_t *h );
  42. diff --git a/encoder/encoder.c b/encoder/encoder.c
  43. index 52017ff..6e0dc54 100644
  44. --- a/encoder/encoder.c
  45. +++ b/encoder/encoder.c
  46. @@ -2246,21 +2246,17 @@ int     x264_encoder_encode( x264_t *h,
  47.                  fenc->i_pic_struct = PIC_STRUCT_PROGRESSIVE;
  48.          }
  49.  
  50. -        if( h->frames.b_have_lowres )
  51. -        {
  52. -            if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
  53. -                x264_weight_plane_analyse( h, fenc );
  54. -            x264_frame_init_lowres( h, fenc );
  55. -        }
  56. -
  57.          if( h->param.rc.b_mb_tree && h->param.rc.b_stat_read )
  58.          {
  59.              if( x264_macroblock_tree_read( h, fenc ) )
  60.                  return -1;
  61.          }
  62. -        else if( h->param.rc.i_aq_mode )
  63. +        else
  64.              x264_adaptive_quant_frame( h, fenc );
  65.  
  66. +        if( h->frames.b_have_lowres )
  67. +            x264_frame_init_lowres( h, fenc );
  68. +
  69.          /* 2: Place the frame into the queue for its slice type decision */
  70.          x264_lookahead_put_frame( h, fenc );
  71.  
  72. diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
  73. index a725a24..bf0a400 100644
  74. --- a/encoder/ratecontrol.c
  75. +++ b/encoder/ratecontrol.c
  76. @@ -215,12 +215,14 @@ static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x2
  77.      stride <<= h->mb.b_interlaced;
  78.      uint64_t res = h->pixf.var[pix]( frame->plane[i] + offset, stride );
  79.      uint32_t sum = (uint32_t)res;
  80. -    uint32_t sqr = res >> 32;
  81. -    return sqr - (sum * sum >> shift);
  82. +    uint32_t ssd = res >> 32;
  83. +    frame->i_pixel_sum[i] += sum;
  84. +    frame->i_pixel_ssd[i] += ssd;
  85. +    return ssd - (sum * sum >> shift);
  86.  }
  87.  
  88.  // Find the total AC energy of the block in all planes.
  89. -static NOINLINE uint32_t ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame )
  90. +static NOINLINE uint32_t x264_ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame )
  91.  {
  92.      /* This function contains annoying hacks because GCC has a habit of reordering emms
  93.       * and putting it after floating point ops.  As a result, we put the emms at the end of the
  94. @@ -239,56 +241,90 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
  95.       * FIXME: while they're written in 5 significant digits, they're only tuned to 2. */
  96.      float strength;
  97.      float avg_adj = 0.f;
  98. -    /* Need to init it anyways for MB tree. */
  99. -    if( h->param.rc.f_aq_strength == 0 )
  100. -    {
  101. -        memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
  102. -        memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
  103. -        if( h->frames.b_have_lowres )
  104. -            for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
  105. -                frame->i_inv_qscale_factor[mb_xy] = 256;
  106. -        return;
  107. +    int width = h->sps->i_mb_width;
  108. +    int height = h->sps->i_mb_height;
  109. +    /* Initialize frame stats */
  110. +    for( int i = 0; i < 3; i++ )
  111. +    {
  112. +        frame->i_pixel_sum[i] = 0;
  113. +        frame->i_pixel_ssd[i] = 0;
  114.      }
  115.  
  116. -    if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
  117. +    /* Degenerate cases */
  118. +    if( h->param.rc.i_aq_mode == X264_AQ_NONE || h->param.rc.f_aq_strength == 0 )
  119.      {
  120. -        float avg_adj_pow2 = 0.f;
  121. -        for( int mb_y = 0; mb_y < h->sps->i_mb_height; mb_y++ )
  122. -            for( int mb_x = 0; mb_x < h->sps->i_mb_width; mb_x++ )
  123. -            {
  124. -                uint32_t energy = ac_energy_mb( h, mb_x, mb_y, frame );
  125. -                float qp_adj = powf( energy + 1, 0.125f );
  126. -                frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
  127. -                avg_adj += qp_adj;
  128. -                avg_adj_pow2 += qp_adj * qp_adj;
  129. -            }
  130. -        avg_adj /= h->mb.i_mb_count;
  131. -        avg_adj_pow2 /= h->mb.i_mb_count;
  132. -        strength = h->param.rc.f_aq_strength * avg_adj;
  133. -        avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - 14.f) / avg_adj;
  134. +        /* Need to init it anyways for MB tree */
  135. +        if( h->param.rc.f_aq_strength == 0 )
  136. +        {
  137. +            memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
  138. +            memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
  139. +            if( h->frames.b_have_lowres )
  140. +                for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
  141. +                    frame->i_inv_qscale_factor[mb_xy] = 256;
  142. +        }
  143. +        /* Need variance data for weighted prediction */
  144. +        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
  145. +        {
  146. +            for( int mb_y = 0; mb_y < height; mb_y++ )
  147. +                for( int mb_x = 0; mb_x < width; mb_x++ )
  148. +                    x264_ac_energy_mb( h, mb_x, mb_y, frame );
  149. +        }
  150. +        else
  151. +            return;
  152.      }
  153. +    /* Actual adaptive quantization */
  154.      else
  155. -        strength = h->param.rc.f_aq_strength * 1.0397f;
  156. -
  157. -    for( int mb_y = 0; mb_y < h->sps->i_mb_height; mb_y++ )
  158. -        for( int mb_x = 0; mb_x < h->sps->i_mb_width; mb_x++ )
  159. +    {
  160. +        if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
  161.          {
  162. -            float qp_adj;
  163. -            if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
  164. -            {
  165. -                qp_adj = frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride];
  166. -                qp_adj = strength * (qp_adj - avg_adj);
  167. -            }
  168. -            else
  169. +            float avg_adj_pow2 = 0.f;
  170. +            for( int mb_y = 0; mb_y < height; mb_y++ )
  171. +                for( int mb_x = 0; mb_x < width; mb_x++ )
  172. +                {
  173. +                    uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
  174. +                    float qp_adj = powf( energy + 1, 0.125f );
  175. +                    frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
  176. +                    avg_adj += qp_adj;
  177. +                    avg_adj_pow2 += qp_adj * qp_adj;
  178. +                }
  179. +            avg_adj /= h->mb.i_mb_count;
  180. +            avg_adj_pow2 /= h->mb.i_mb_count;
  181. +            strength = h->param.rc.f_aq_strength * avg_adj;
  182. +            avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - 14.f) / avg_adj;
  183. +        }
  184. +        else
  185. +            strength = h->param.rc.f_aq_strength * 1.0397f;
  186. +
  187. +        for( int mb_y = 0; mb_y < height; mb_y++ )
  188. +            for( int mb_x = 0; mb_x < width; mb_x++ )
  189.              {
  190. -                uint32_t energy = ac_energy_mb( h, mb_x, mb_y, frame );
  191. -                qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - 14.427f);
  192. +                float qp_adj;
  193. +                if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
  194. +                {
  195. +                    qp_adj = frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride];
  196. +                    qp_adj = strength * (qp_adj - avg_adj);
  197. +                }
  198. +                else
  199. +                {
  200. +                    uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
  201. +                    qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - 14.427f);
  202. +                }
  203. +                frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] =
  204. +                frame->f_qp_offset_aq[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
  205. +                if( h->frames.b_have_lowres )
  206. +                    frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj);
  207.              }
  208. -            frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] =
  209. -            frame->f_qp_offset_aq[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
  210. -            if( h->frames.b_have_lowres )
  211. -                frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj);
  212. -        }
  213. +    }
  214. +
  215. +    /* Remove mean from SSD calculation */
  216. +    for( int i = 0; i < 3; i++ )
  217. +    {
  218. +        uint64_t ssd = frame->i_pixel_ssd[i];
  219. +        uint64_t sum = frame->i_pixel_sum[i];
  220. +        int w = width*16>>!!i;
  221. +        int h = height*16>>!!i;
  222. +        frame->i_pixel_ssd[i] = ssd - (sum * sum + w * h / 2) / (w * h);
  223. +    }
  224.  }
  225.  
  226.  int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame )
  227. diff --git a/encoder/slicetype.c b/encoder/slicetype.c
  228. index 9352367..e454e12 100644
  229. --- a/encoder/slicetype.c
  230. +++ b/encoder/slicetype.c
  231. @@ -67,25 +67,6 @@ static void x264_weight_get_h264( unsigned int weight_nonh264, int offset, x264_
  232.      w->i_scale = X264_MIN( w->i_scale, 127 );
  233.  }
  234.  
  235. -void x264_weight_plane_analyse( x264_t *h, x264_frame_t *frame )
  236. -{
  237. -    uint32_t sad = 0;
  238. -    uint64_t ssd = 0;
  239. -    uint8_t *p = frame->plane[0];
  240. -    int stride = frame->i_stride[0];
  241. -    int width = frame->i_width[0];
  242. -    int height = frame->i_lines[0];
  243. -    for( int y = 0; y < height>>4; y++, p += stride*16 )
  244. -        for( int x = 0; x < width; x += 16 )
  245. -        {
  246. -            uint64_t res = h->pixf.var[PIXEL_16x16]( p + x, stride );
  247. -            sad += (uint32_t)res;
  248. -            ssd += res >> 32;
  249. -        }
  250. -    frame->i_pixel_sum = sad;
  251. -    frame->i_pixel_ssd = ssd - ((uint64_t)sad * sad + width * height / 2) / (width * height);
  252. -}
  253. -
  254.  static NOINLINE uint8_t *x264_weight_cost_init_luma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, uint8_t *dest )
  255.  {
  256.      int ref0_distance = fenc->i_frame - ref->i_frame - 1;
  257. @@ -167,10 +148,10 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int
  258.      int found;
  259.      x264_weight_t *weights = fenc->weight[0];
  260.  
  261. -    fenc_var = round( sqrt( fenc->i_pixel_ssd ) );
  262. -    ref_var  = round( sqrt(  ref->i_pixel_ssd ) );
  263. -    fenc_mean = (float)fenc->i_pixel_sum / (fenc->i_lines[0] * fenc->i_width[0]);
  264. -    ref_mean  = (float) ref->i_pixel_sum / (fenc->i_lines[0] * fenc->i_width[0]);
  265. +    fenc_var = round( sqrt( fenc->i_pixel_ssd[0] ) );
  266. +    ref_var  = round( sqrt(  ref->i_pixel_ssd[0] ) );
  267. +    fenc_mean = (float)fenc->i_pixel_sum[0] / (fenc->i_lines[0] * fenc->i_width[0]);
  268. +    ref_mean  = (float) ref->i_pixel_sum[0] / (fenc->i_lines[0] * fenc->i_width[0]);
  269.  
  270.      //early termination
  271.      if( fabs( ref_mean - fenc_mean ) < 0.5 && fabs( 1 - fenc_var / ref_var ) < epsilon )
  272. @@ -534,8 +515,8 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
  273.          do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
  274.          if( do_search[0] )
  275.          {
  276. -            if( ( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART
  277. -                  || h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE ) && b == p1 )
  278. +            if( ( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART ||
  279. +                  h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE ) && b == p1 )
  280.              {
  281.                  x264_emms();
  282.                  x264_weights_analyse( h, frames[b], frames[p0], 1 );
  283. --
  284. 1.7.0.4
  285.  
  286.  
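A minimal standalone sketch of the bookkeeping the patch above merges, under the assumption that a var-style primitive returns the block sum in the low 32 bits and the sum of squares in the high 32 bits (as h->pixf.var does in the patch). Only the accumulation and the mean-removal step mirror the new code in x264_adaptive_quant_frame; var_16x16, the test pattern and main() are illustrative stand-ins.

#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for h->pixf.var[PIXEL_16x16]:
 * low 32 bits = sum of pixels, high 32 bits = sum of squares. */
static uint64_t var_16x16( const uint8_t *p, int stride )
{
    uint32_t sum = 0, sqr = 0;
    for( int y = 0; y < 16; y++, p += stride )
        for( int x = 0; x < 16; x++ )
        {
            sum += p[x];
            sqr += p[x] * p[x];
        }
    return sum + ((uint64_t)sqr << 32);
}

int main( void )
{
    enum { W = 64, H = 64 };
    static uint8_t plane[W*H];
    for( int i = 0; i < W*H; i++ )
        plane[i] = (uint8_t)(i * 37 + 11);      /* arbitrary test pattern */

    /* Accumulate per-block results the way ac_energy_plane now does. */
    uint64_t sum = 0, ssd = 0;
    for( int y = 0; y < H; y += 16 )
        for( int x = 0; x < W; x += 16 )
        {
            uint64_t res = var_16x16( plane + y*W + x, W );
            sum += (uint32_t)res;
            ssd += res >> 32;
        }

    /* Remove the mean afterwards, as the end of x264_adaptive_quant_frame
     * does (rounded integer division), so weightp can treat i_pixel_ssd
     * as N*variance and i_pixel_sum/N as the plane mean. */
    uint64_t n_var = ssd - (sum * sum + W*H/2) / (W*H);
    printf( "mean = %.2f, sqrt(i_pixel_ssd) = %.2f\n",
            (double)sum / (W*H), sqrt( (double)n_var ) );
    return 0;
}

This is exactly the redundancy the commit message refers to: adaptive quant already touches every macroblock with the var primitive, so weightp can reuse those sums instead of walking the plane a second time.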
  287. From 3b40a04ffdd7b6d5a69b3c5dc29f1e727f314496 Mon Sep 17 00:00:00 2001
  288. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  289. Date: Thu, 27 May 2010 10:42:15 -0700
  290. Subject: [PATCH 2/8] Add fast skip in lookahead motion search
  291.  Helps speed very significantly on motionless blocks.
  292.  
  293. ---
  294. encoder/slicetype.c |   16 +++++++++++++++-
  295.  1 files changed, 15 insertions(+), 1 deletions(-)
  296.  
  297. diff --git a/encoder/slicetype.c b/encoder/slicetype.c
  298. index e454e12..d7cfe5c 100644
  299. --- a/encoder/slicetype.c
  300. +++ b/encoder/slicetype.c
  301. @@ -379,11 +379,25 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
  302.                  CP32( m[l].mvp, mvc[0] );
  303.              else
  304.                  x264_median_mv( m[l].mvp, mvc[0], mvc[1], mvc[2] );
  305. -            x264_me_search( h, &m[l], mvc, i_mvc );
  306.  
  307. +            /* Fast skip for cases of near-zero residual.  Shortcut: don't bother except in the mv0 case,
  308. +             * since anything else is likely to have enough residual to not trigger the skip. */
  309. +            if( !M32( m[l].mvp ) )
  310. +            {
  311. +                m[l].cost = h->pixf.mbcmp[PIXEL_8x8]( m[l].p_fenc[0], FENC_STRIDE, m[l].p_fref[0], m[l].i_stride[0] );
  312. +                if( m[l].cost < 64 )
  313. +                {
  314. +                    M32( m[l].mv ) = 0;
  315. +                    goto skip_motionest;
  316. +                }
  317. +            }
  318. +
  319. +            x264_me_search( h, &m[l], mvc, i_mvc );
  320.              m[l].cost -= 2; // remove mvcost from skip mbs
  321.              if( M32( m[l].mv ) )
  322.                  m[l].cost += 5;
  323. +
  324. +skip_motionest:
  325.              CP32( fenc_mvs[l], m[l].mv );
  326.              *fenc_costs[l] = m[l].cost;
  327.          }
  328. --
  329. 1.7.0.4
  330.  
  331.  
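A hedged C sketch of the control flow this patch adds. sad_8x8, lowres_search and the buffers in main() are made-up stand-ins; only the "(0,0) predictor" condition and the cost < 64 threshold come from the diff.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct { int16_t mv[2]; int cost; } lowres_me_t;

/* Stand-in for h->pixf.mbcmp[PIXEL_8x8] on the half-resolution planes. */
static int sad_8x8( const uint8_t *fenc, int fenc_stride,
                    const uint8_t *fref, int fref_stride )
{
    int sad = 0;
    for( int y = 0; y < 8; y++ )
        for( int x = 0; x < 8; x++ )
            sad += abs( fenc[y*fenc_stride+x] - fref[y*fref_stride+x] );
    return sad;
}

/* Shape of the shortcut: only when the MV predictor is (0,0) is the
 * zero-MV cost checked; below the threshold the patch uses (64), the
 * zero MV is kept and the full motion search is skipped entirely. */
static int lowres_search( lowres_me_t *m, const int16_t mvp[2],
                          const uint8_t *fenc, int fenc_stride,
                          const uint8_t *fref, int fref_stride )
{
    if( !mvp[0] && !mvp[1] )
    {
        m->cost = sad_8x8( fenc, fenc_stride, fref, fref_stride );
        if( m->cost < 64 )
        {
            m->mv[0] = m->mv[1] = 0;
            return 1;                 /* skipped the motion search */
        }
    }
    /* ...here the real code falls through to x264_me_search()... */
    return 0;
}

int main( void )
{
    uint8_t fenc[8*8], fref[8*8];
    memset( fenc, 128, sizeof(fenc) );
    memset( fref, 128, sizeof(fref) );  /* motionless, identical block */
    lowres_me_t m = {{0,0}, 0};
    int16_t mvp[2] = {0,0};
    printf( "skipped=%d cost=%d\n",
            lowres_search( &m, mvp, fenc, 8, fref, 8 ), m.cost );
    return 0;
}

The design choice is to gate the shortcut on the mv0 predictor only: any block whose predictor is nonzero almost certainly has enough residual that the extra comparison would be wasted.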
  332. From 77ec5d11f0b22035f836f8451d568ecb3e1236e6 Mon Sep 17 00:00:00 2001
  333. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  334. Date: Thu, 27 May 2010 12:31:41 -0700
  335. Subject: [PATCH 3/8] Fix omission in libx264 tuning documentation
  336.  
  337. ---
  338. x264.h |    2 +-
  339.  1 files changed, 1 insertions(+), 1 deletions(-)
  340.  
  341. diff --git a/x264.h b/x264.h
  342. index 6d7b703..95efd88 100644
  343. --- a/x264.h
  344. +++ b/x264.h
  345. @@ -446,7 +446,7 @@ static const char * const x264_tune_names[] = { "film", "animation", "grain", "s
  346.  
  347.  /*      Multiple tunings can be used if separated by a delimiter in ",./-+",
  348.   *      however multiple psy tunings cannot be used.
  349. - *      film, animation, grain, psnr, and ssim are psy tunings.
  350. + *      film, animation, grain, stillimage, psnr, and ssim are psy tunings.
  351.   *
  352.   *      returns 0 on success, negative on failure (e.g. invalid preset/tune name). */
  353.  int     x264_param_default_preset( x264_param_t *, const char *preset, const char *tune );
  354. --
  355. 1.7.0.4
  356.  
  357.  
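For context, a short usage sketch of the function whose documentation this patch corrects. The combination below assumes "zerolatency" remains a non-psy tune (so it can legally be paired with the psy tune "film"); error handling is minimal.

#include <stdio.h>
#include "x264.h"

int main( void )
{
    x264_param_t param;
    /* One psy tune ("film") plus one non-psy tune ("zerolatency"),
     * separated by one of the documented delimiters ",./-+". */
    if( x264_param_default_preset( &param, "medium", "film,zerolatency" ) < 0 )
    {
        fprintf( stderr, "invalid preset/tune name\n" );
        return 1;
    }
    return 0;
}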
  358. From bec048110f55c197aeaa6aa506952ef071a2558d Mon Sep 17 00:00:00 2001
  359. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  360. Date: Thu, 27 May 2010 14:27:32 -0700
  361. Subject: [PATCH 4/8] x86 assembly code for NAL escaping
  362.  Up to ~10x faster than C depending on CPU.
  363.  Helps the most at very high bitrates (e.g. lossless).
  364.  Also make the C code faster and simpler.
  365.  
  366. ---
  367. Makefile                   |    4 +-
  368.  common/bitstream.c         |   92 ++++++++++++++
  369.  common/bitstream.h         |  299 ++++++++++++++++++++++++++++++++++++++++++++
  370.  common/bs.h                |  291 ------------------------------------------
  371.  common/common.c            |   54 --------
  372.  common/common.h            |    5 +-
  373.  common/x86/bitstream-a.asm |  112 +++++++++++++++++
  374.  common/x86/deblock-a.asm   |    1 +
  375.  encoder/encoder.c          |    3 +-
  376.  tools/checkasm.c           |   52 ++++++++-
  377.  10 files changed, 561 insertions(+), 352 deletions(-)
  378.  create mode 100644 common/bitstream.c
  379.  create mode 100644 common/bitstream.h
  380.  delete mode 100644 common/bs.h
  381.  create mode 100644 common/x86/bitstream-a.asm
  382.  
  383. diff --git a/Makefile b/Makefile
  384. index 0b43a3e..519e181 100644
  385. --- a/Makefile
  386. +++ b/Makefile
  387. @@ -8,7 +8,7 @@ SRCS = common/mc.c common/predict.c common/pixel.c common/macroblock.c \
  388.         common/frame.c common/dct.c common/cpu.c common/cabac.c \
  389.         common/common.c common/mdate.c common/rectangle.c \
  390.         common/set.c common/quant.c common/deblock.c common/vlc.c \
  391. -       common/mvpred.c \
  392. +       common/mvpred.c common/bitstream.c \
  393.         encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
  394.         encoder/set.c encoder/macroblock.c encoder/cabac.c \
  395.         encoder/cavlc.c encoder/encoder.c encoder/lookahead.c
  396. @@ -52,7 +52,7 @@ endif
  397.  ifneq ($(AS),)
  398.  X86SRC0 = const-a.asm cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm \
  399.            mc-a2.asm pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
  400. -          cpu-a.asm dct-32.asm
  401. +          cpu-a.asm dct-32.asm bitstream-a.asm
  402.  X86SRC = $(X86SRC0:%=common/x86/%)
  403.  
  404.  ifeq ($(ARCH),X86)
  405. diff --git a/common/bitstream.c b/common/bitstream.c
  406. new file mode 100644
  407. index 0000000..0aaac21
  408. --- /dev/null
  409. +++ b/common/bitstream.c
  410. @@ -0,0 +1,92 @@
  411. +/*****************************************************************************
  412. + * bitstream.c: h264 encoder library
  413. + *****************************************************************************
  414. + * Copyright (C) 2010 x264 project
  415. + *
  416. + * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  417. + *          Jason Garrett-Glaser <darkshikari@gmail.com>
  418. + *
  419. + * This program is free software; you can redistribute it and/or modify
  420. + * it under the terms of the GNU General Public License as published by
  421. + * the Free Software Foundation; either version 2 of the License, or
  422. + * (at your option) any later version.
  423. + *
  424. + * This program is distributed in the hope that it will be useful,
  425. + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  426. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  427. + * GNU General Public License for more details.
  428. + *
  429. + * You should have received a copy of the GNU General Public License
  430. + * along with this program; if not, write to the Free Software
  431. + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  432. + *****************************************************************************/
  433. +
  434. +#include "common.h"
  435. +
  436. +static uint8_t *x264_nal_escape_c( uint8_t *dst, uint8_t *src, uint8_t *end )
  437. +{
  438. +    if( src < end ) *dst++ = *src++;
  439. +    if( src < end ) *dst++ = *src++;
  440. +    while( src < end )
  441. +    {
  442. +        if( src[0] <= 0x03 && !dst[-2] && !dst[-1] )
  443. +            *dst++ = 0x03;
  444. +        *dst++ = *src++;
  445. +    }
  446. +    return dst;
  447. +}
  448. +
  449. +#ifdef HAVE_MMX
  450. +uint8_t *x264_nal_escape_mmxext( uint8_t *dst, uint8_t *src, uint8_t *end );
  451. +uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
  452. +#endif
  453. +
  454. +/****************************************************************************
  455. + * x264_nal_encode:
  456. + ****************************************************************************/
  457. +int x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal, int b_long_startcode )
  458. +{
  459. +    uint8_t *src = nal->p_payload;
  460. +    uint8_t *end = nal->p_payload + nal->i_payload;
  461. +    uint8_t *orig_dst = dst;
  462. +
  463. +    if( h->param.b_annexb )
  464. +    {
  465. +        if( b_long_startcode )
  466. +            *dst++ = 0x00;
  467. +        *dst++ = 0x00;
  468. +        *dst++ = 0x00;
  469. +        *dst++ = 0x01;
  470. +    }
  471. +    else /* save room for size later */
  472. +        dst += 4;
  473. +
  474. +    /* nal header */
  475. +    *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;
  476. +
  477. +    dst = h->bsf.nal_escape( dst, src, end );
  478. +    int size = (dst - orig_dst) - 4;
  479. +
  480. +    /* Write the size header for mp4/etc */
  481. +    if( !h->param.b_annexb )
  482. +    {
  483. +        /* Size doesn't include the size of the header we're writing now. */
  484. +        orig_dst[0] = size>>24;
  485. +        orig_dst[1] = size>>16;
  486. +        orig_dst[2] = size>> 8;
  487. +        orig_dst[3] = size>> 0;
  488. +    }
  489. +
  490. +    return size+4;
  491. +}
  492. +
  493. +void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
  494. +{
  495. +    pf->nal_escape = x264_nal_escape_c;
  496. +#ifdef HAVE_MMX
  497. +    if( cpu&X264_CPU_MMXEXT )
  498. +        pf->nal_escape = x264_nal_escape_mmxext;
  499. +    if( (cpu&X264_CPU_SSE2) && (cpu&X264_CPU_SSE2_IS_FAST) )
  500. +        pf->nal_escape = x264_nal_escape_sse2;
  501. +#endif
  502. +}
  503. diff --git a/common/bitstream.h b/common/bitstream.h
  504. new file mode 100644
  505. index 0000000..d018c7d
  506. --- /dev/null
  507. +++ b/common/bitstream.h
  508. @@ -0,0 +1,299 @@
  509. +/*****************************************************************************
  510. + * bitstream.h: h264 encoder library
  511. + *****************************************************************************
  512. + * Copyright (C) 2003-2008 x264 project
  513. + *
  514. + * Authors: Loren Merritt <lorenm@u.washington.edu>
  515. + *          Jason Garrett-Glaser <darkshikari@gmail.com>
  516. + *          Laurent Aimar <fenrir@via.ecp.fr>
  517. + *
  518. + * This program is free software; you can redistribute it and/or modify
  519. + * it under the terms of the GNU General Public License as published by
  520. + * the Free Software Foundation; either version 2 of the License, or
  521. + * (at your option) any later version.
  522. + *
  523. + * This program is distributed in the hope that it will be useful,
  524. + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  525. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  526. + * GNU General Public License for more details.
  527. + *
  528. + * You should have received a copy of the GNU General Public License
  529. + * along with this program; if not, write to the Free Software
  530. + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  531. + *****************************************************************************/
  532. +
  533. +#ifndef X264_BS_H
  534. +#define X264_BS_H
  535. +
  536. +typedef struct
  537. +{
  538. +    uint8_t i_bits;
  539. +    uint8_t i_size;
  540. +} vlc_t;
  541. +
  542. +typedef struct
  543. +{
  544. +    uint16_t i_bits;
  545. +    uint8_t  i_size;
  546. +    /* Next level table to use */
  547. +    uint8_t  i_next;
  548. +} vlc_large_t;
  549. +
  550. +typedef struct bs_s
  551. +{
  552. +    uint8_t *p_start;
  553. +    uint8_t *p;
  554. +    uint8_t *p_end;
  555. +
  556. +    intptr_t cur_bits;
  557. +    int     i_left;    /* i_count number of available bits */
  558. +    int     i_bits_encoded; /* RD only */
  559. +} bs_t;
  560. +
  561. +typedef struct
  562. +{
  563. +    int     last;
  564. +    int16_t level[16];
  565. +    uint8_t run[16];
  566. +} x264_run_level_t;
  567. +
  568. +extern const vlc_t x264_coeff0_token[5];
  569. +extern const vlc_t x264_coeff_token[5][16][4];
  570. +extern const vlc_t x264_total_zeros[15][16];
  571. +extern const vlc_t x264_total_zeros_dc[3][4];
  572. +extern const vlc_t x264_run_before[7][16];
  573. +
  574. +typedef struct
  575. +{
  576. +    uint8_t *(*nal_escape) ( uint8_t *dst, uint8_t *src, uint8_t *end );
  577. +} x264_bitstream_function_t;
  578. +
  579. +int x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal, int b_long_startcode );
  580. +void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf );
  581. +
  582. +/* A larger level table size theoretically could help a bit at extremely
  583. + * high bitrates, but the cost in cache is usually too high for it to be
  584. + * useful.
  585. + * This size appears to be optimal for QP18 encoding on a Nehalem CPU.
  586. + * FIXME: Do further testing? */
  587. +#define LEVEL_TABLE_SIZE 128
  588. +extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
  589. +
  590. +static inline void bs_init( bs_t *s, void *p_data, int i_data )
  591. +{
  592. +    int offset = ((intptr_t)p_data & 3);
  593. +    s->p       = s->p_start = (uint8_t*)p_data - offset;
  594. +    s->p_end   = (uint8_t*)p_data + i_data;
  595. +    s->i_left  = (WORD_SIZE - offset)*8;
  596. +    s->cur_bits = endian_fix32( M32(s->p) );
  597. +    s->cur_bits >>= (4-offset)*8;
  598. +}
  599. +static inline int bs_pos( bs_t *s )
  600. +{
  601. +    return( 8 * (s->p - s->p_start) + (WORD_SIZE*8) - s->i_left );
  602. +}
  603. +
  604. +/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32-bit aligned. */
  605. +static inline void bs_flush( bs_t *s )
  606. +{
  607. +    M32( s->p ) = endian_fix32( s->cur_bits << (s->i_left&31) );
  608. +    s->p += WORD_SIZE - s->i_left / 8;
  609. +    s->i_left = WORD_SIZE*8;
  610. +}
  611. +/* The inverse of bs_flush: prepare the bitstream to be written to again. */
  612. +static inline void bs_realign( bs_t *s )
  613. +{
  614. +    int offset = ((intptr_t)s->p & 3);
  615. +    if( offset )
  616. +    {
  617. +        s->p       = (uint8_t*)s->p - offset;
  618. +        s->i_left  = (WORD_SIZE - offset)*8;
  619. +        s->cur_bits = endian_fix32( M32(s->p) );
  620. +        s->cur_bits >>= (4-offset)*8;
  621. +    }
  622. +}
  623. +
  624. +static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
  625. +{
  626. +    if( WORD_SIZE == 8 )
  627. +    {
  628. +        s->cur_bits = (s->cur_bits << i_count) | i_bits;
  629. +        s->i_left -= i_count;
  630. +        if( s->i_left <= 32 )
  631. +        {
  632. +#ifdef WORDS_BIGENDIAN
  633. +            M32( s->p ) = s->cur_bits >> (32 - s->i_left);
  634. +#else
  635. +            M32( s->p ) = endian_fix( s->cur_bits << s->i_left );
  636. +#endif
  637. +            s->i_left += 32;
  638. +            s->p += 4;
  639. +        }
  640. +    }
  641. +    else
  642. +    {
  643. +        if( i_count < s->i_left )
  644. +        {
  645. +            s->cur_bits = (s->cur_bits << i_count) | i_bits;
  646. +            s->i_left -= i_count;
  647. +        }
  648. +        else
  649. +        {
  650. +            i_count -= s->i_left;
  651. +            s->cur_bits = (s->cur_bits << s->i_left) | (i_bits >> i_count);
  652. +            M32( s->p ) = endian_fix( s->cur_bits );
  653. +            s->p += 4;
  654. +            s->cur_bits = i_bits;
  655. +            s->i_left = 32 - i_count;
  656. +        }
  657. +    }
  658. +}
  659. +
  660. +/* Special case to eliminate branch in normal bs_write. */
  661. +/* Golomb never writes an even-size code, so this is only used in slice headers. */
  662. +static inline void bs_write32( bs_t *s, uint32_t i_bits )
  663. +{
  664. +    bs_write( s, 16, i_bits >> 16 );
  665. +    bs_write( s, 16, i_bits );
  666. +}
  667. +
  668. +static inline void bs_write1( bs_t *s, uint32_t i_bit )
  669. +{
  670. +    s->cur_bits <<= 1;
  671. +    s->cur_bits |= i_bit;
  672. +    s->i_left--;
  673. +    if( s->i_left == WORD_SIZE*8-32 )
  674. +    {
  675. +        M32( s->p ) = endian_fix32( s->cur_bits );
  676. +        s->p += 4;
  677. +        s->i_left = WORD_SIZE*8;
  678. +    }
  679. +}
  680. +
  681. +static inline void bs_align_0( bs_t *s )
  682. +{
  683. +    bs_write( s, s->i_left&7, 0 );
  684. +    bs_flush( s );
  685. +}
  686. +static inline void bs_align_1( bs_t *s )
  687. +{
  688. +    bs_write( s, s->i_left&7, (1 << (s->i_left&7)) - 1 );
  689. +    bs_flush( s );
  690. +}
  691. +static inline void bs_align_10( bs_t *s )
  692. +{
  693. +    if( s->i_left&7 )
  694. +        bs_write( s, s->i_left&7, 1 << ( (s->i_left&7) - 1 ) );
  695. +}
  696. +
  697. +/* golomb functions */
  698. +
  699. +static const uint8_t x264_ue_size_tab[256] =
  700. +{
  701. +     1, 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7,
  702. +     9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
  703. +    11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
  704. +    11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
  705. +    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
  706. +    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
  707. +    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
  708. +    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
  709. +    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  710. +    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  711. +    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  712. +    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  713. +    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  714. +    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  715. +    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  716. +    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  717. +};
  718. +
  719. +static inline void bs_write_ue_big( bs_t *s, unsigned int val )
  720. +{
  721. +    int size = 0;
  722. +    int tmp = ++val;
  723. +    if( tmp >= 0x10000 )
  724. +    {
  725. +        size = 32;
  726. +        tmp >>= 16;
  727. +    }
  728. +    if( tmp >= 0x100 )
  729. +    {
  730. +        size += 16;
  731. +        tmp >>= 8;
  732. +    }
  733. +    size += x264_ue_size_tab[tmp];
  734. +    bs_write( s, size>>1, 0 );
  735. +    bs_write( s, (size>>1)+1, val );
  736. +}
  737. +
  738. +/* Only works on values under 255. */
  739. +static inline void bs_write_ue( bs_t *s, int val )
  740. +{
  741. +    bs_write( s, x264_ue_size_tab[val+1], val+1 );
  742. +}
  743. +
  744. +static inline void bs_write_se( bs_t *s, int val )
  745. +{
  746. +    int size = 0;
  747. +    /* Faster than (val <= 0 ? -val*2+1 : val*2) */
  748. +    /* 4 instructions on x86, 3 on ARM */
  749. +    int tmp = 1 - val*2;
  750. +    if( tmp < 0 ) tmp = val*2;
  751. +    val = tmp;
  752. +
  753. +    if( tmp >= 0x100 )
  754. +    {
  755. +        size = 16;
  756. +        tmp >>= 8;
  757. +    }
  758. +    size += x264_ue_size_tab[tmp];
  759. +    bs_write( s, size, val );
  760. +}
  761. +
  762. +static inline void bs_write_te( bs_t *s, int x, int val )
  763. +{
  764. +    if( x == 1 )
  765. +        bs_write1( s, 1^val );
  766. +    else //if( x > 1 )
  767. +        bs_write_ue( s, val );
  768. +}
  769. +
  770. +static inline void bs_rbsp_trailing( bs_t *s )
  771. +{
  772. +    bs_write1( s, 1 );
  773. +    bs_write( s, s->i_left&7, 0  );
  774. +}
  775. +
  776. +static ALWAYS_INLINE int bs_size_ue( unsigned int val )
  777. +{
  778. +    return x264_ue_size_tab[val+1];
  779. +}
  780. +
  781. +static ALWAYS_INLINE int bs_size_ue_big( unsigned int val )
  782. +{
  783. +    if( val < 255 )
  784. +        return x264_ue_size_tab[val+1];
  785. +    else
  786. +        return x264_ue_size_tab[(val+1)>>8] + 16;
  787. +}
  788. +
  789. +static ALWAYS_INLINE int bs_size_se( int val )
  790. +{
  791. +    int tmp = 1 - val*2;
  792. +    if( tmp < 0 ) tmp = val*2;
  793. +    if( tmp < 256 )
  794. +        return x264_ue_size_tab[tmp];
  795. +    else
  796. +        return x264_ue_size_tab[tmp>>8]+16;
  797. +}
  798. +
  799. +static ALWAYS_INLINE int bs_size_te( int x, int val )
  800. +{
  801. +    if( x == 1 )
  802. +        return 1;
  803. +    else //if( x > 1 )
  804. +        return x264_ue_size_tab[val+1];
  805. +}
  806. +
  807. +#endif
  808. diff --git a/common/bs.h b/common/bs.h
  809. deleted file mode 100644
  810. index 343a3c9..0000000
  811. --- a/common/bs.h
  812. +++ /dev/null
  813. @@ -1,291 +0,0 @@
  814. -/*****************************************************************************
  815. - * bs.h :
  816. - *****************************************************************************
  817. - * Copyright (C) 2003-2008 x264 project
  818. - *
  819. - * Authors: Loren Merritt <lorenm@u.washington.edu>
  820. - *          Jason Garrett-Glaser <darkshikari@gmail.com>
  821. - *          Laurent Aimar <fenrir@via.ecp.fr>
  822. - *
  823. - * This program is free software; you can redistribute it and/or modify
  824. - * it under the terms of the GNU General Public License as published by
  825. - * the Free Software Foundation; either version 2 of the License, or
  826. - * (at your option) any later version.
  827. - *
  828. - * This program is distributed in the hope that it will be useful,
  829. - * but WITHOUT ANY WARRANTY; without even the implied warranty of
  830. - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  831. - * GNU General Public License for more details.
  832. - *
  833. - * You should have received a copy of the GNU General Public License
  834. - * along with this program; if not, write to the Free Software
  835. - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  836. - *****************************************************************************/
  837. -
  838. -#ifndef X264_BS_H
  839. -#define X264_BS_H
  840. -
  841. -typedef struct
  842. -{
  843. -    uint8_t i_bits;
  844. -    uint8_t i_size;
  845. -} vlc_t;
  846. -
  847. -typedef struct
  848. -{
  849. -    uint16_t i_bits;
  850. -    uint8_t  i_size;
  851. -    /* Next level table to use */
  852. -    uint8_t  i_next;
  853. -} vlc_large_t;
  854. -
  855. -typedef struct bs_s
  856. -{
  857. -    uint8_t *p_start;
  858. -    uint8_t *p;
  859. -    uint8_t *p_end;
  860. -
  861. -    intptr_t cur_bits;
  862. -    int     i_left;    /* i_count number of available bits */
  863. -    int     i_bits_encoded; /* RD only */
  864. -} bs_t;
  865. -
  866. -typedef struct
  867. -{
  868. -    int     last;
  869. -    int16_t level[16];
  870. -    uint8_t run[16];
  871. -} x264_run_level_t;
  872. -
  873. -extern const vlc_t x264_coeff0_token[5];
  874. -extern const vlc_t x264_coeff_token[5][16][4];
  875. -extern const vlc_t x264_total_zeros[15][16];
  876. -extern const vlc_t x264_total_zeros_dc[3][4];
  877. -extern const vlc_t x264_run_before[7][16];
  878. -
  879. -/* A larger level table size theoretically could help a bit at extremely
  880. - * high bitrates, but the cost in cache is usually too high for it to be
  881. - * useful.
  882. - * This size appears to be optimal for QP18 encoding on a Nehalem CPU.
  883. - * FIXME: Do further testing? */
  884. -#define LEVEL_TABLE_SIZE 128
  885. -extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
  886. -
  887. -static inline void bs_init( bs_t *s, void *p_data, int i_data )
  888. -{
  889. -    int offset = ((intptr_t)p_data & 3);
  890. -    s->p       = s->p_start = (uint8_t*)p_data - offset;
  891. -    s->p_end   = (uint8_t*)p_data + i_data;
  892. -    s->i_left  = (WORD_SIZE - offset)*8;
  893. -    s->cur_bits = endian_fix32( M32(s->p) );
  894. -    s->cur_bits >>= (4-offset)*8;
  895. -}
  896. -static inline int bs_pos( bs_t *s )
  897. -{
  898. -    return( 8 * (s->p - s->p_start) + (WORD_SIZE*8) - s->i_left );
  899. -}
  900. -
  901. -/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32-bit aligned. */
  902. -static inline void bs_flush( bs_t *s )
  903. -{
  904. -    M32( s->p ) = endian_fix32( s->cur_bits << (s->i_left&31) );
  905. -    s->p += WORD_SIZE - s->i_left / 8;
  906. -    s->i_left = WORD_SIZE*8;
  907. -}
  908. -/* The inverse of bs_flush: prepare the bitstream to be written to again. */
  909. -static inline void bs_realign( bs_t *s )
  910. -{
  911. -    int offset = ((intptr_t)s->p & 3);
  912. -    if( offset )
  913. -    {
  914. -        s->p       = (uint8_t*)s->p - offset;
  915. -        s->i_left  = (WORD_SIZE - offset)*8;
  916. -        s->cur_bits = endian_fix32( M32(s->p) );
  917. -        s->cur_bits >>= (4-offset)*8;
  918. -    }
  919. -}
  920. -
  921. -static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
  922. -{
  923. -    if( WORD_SIZE == 8 )
  924. -    {
  925. -        s->cur_bits = (s->cur_bits << i_count) | i_bits;
  926. -        s->i_left -= i_count;
  927. -        if( s->i_left <= 32 )
  928. -        {
  929. -#ifdef WORDS_BIGENDIAN
  930. -            M32( s->p ) = s->cur_bits >> (32 - s->i_left);
  931. -#else
  932. -            M32( s->p ) = endian_fix( s->cur_bits << s->i_left );
  933. -#endif
  934. -            s->i_left += 32;
  935. -            s->p += 4;
  936. -        }
  937. -    }
  938. -    else
  939. -    {
  940. -        if( i_count < s->i_left )
  941. -        {
  942. -            s->cur_bits = (s->cur_bits << i_count) | i_bits;
  943. -            s->i_left -= i_count;
  944. -        }
  945. -        else
  946. -        {
  947. -            i_count -= s->i_left;
  948. -            s->cur_bits = (s->cur_bits << s->i_left) | (i_bits >> i_count);
  949. -            M32( s->p ) = endian_fix( s->cur_bits );
  950. -            s->p += 4;
  951. -            s->cur_bits = i_bits;
  952. -            s->i_left = 32 - i_count;
  953. -        }
  954. -    }
  955. -}
  956. -
  957. -/* Special case to eliminate branch in normal bs_write. */
  958. -/* Golomb never writes an even-size code, so this is only used in slice headers. */
  959. -static inline void bs_write32( bs_t *s, uint32_t i_bits )
  960. -{
  961. -    bs_write( s, 16, i_bits >> 16 );
  962. -    bs_write( s, 16, i_bits );
  963. -}
  964. -
  965. -static inline void bs_write1( bs_t *s, uint32_t i_bit )
  966. -{
  967. -    s->cur_bits <<= 1;
  968. -    s->cur_bits |= i_bit;
  969. -    s->i_left--;
  970. -    if( s->i_left == WORD_SIZE*8-32 )
  971. -    {
  972. -        M32( s->p ) = endian_fix32( s->cur_bits );
  973. -        s->p += 4;
  974. -        s->i_left = WORD_SIZE*8;
  975. -    }
  976. -}
  977. -
  978. -static inline void bs_align_0( bs_t *s )
  979. -{
  980. -    bs_write( s, s->i_left&7, 0 );
  981. -    bs_flush( s );
  982. -}
  983. -static inline void bs_align_1( bs_t *s )
  984. -{
  985. -    bs_write( s, s->i_left&7, (1 << (s->i_left&7)) - 1 );
  986. -    bs_flush( s );
  987. -}
  988. -static inline void bs_align_10( bs_t *s )
  989. -{
  990. -    if( s->i_left&7 )
  991. -        bs_write( s, s->i_left&7, 1 << ( (s->i_left&7) - 1 ) );
  992. -}
  993. -
  994. -/* golomb functions */
  995. -
  996. -static const uint8_t x264_ue_size_tab[256] =
  997. -{
  998. -     1, 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7,
  999. -     9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
  1000. -    11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
  1001. -    11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
  1002. -    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
  1003. -    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
  1004. -    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
  1005. -    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
  1006. -    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  1007. -    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  1008. -    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  1009. -    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  1010. -    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  1011. -    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  1012. -    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  1013. -    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  1014. -};
  1015. -
  1016. -static inline void bs_write_ue_big( bs_t *s, unsigned int val )
  1017. -{
  1018. -    int size = 0;
  1019. -    int tmp = ++val;
  1020. -    if( tmp >= 0x10000 )
  1021. -    {
  1022. -        size = 32;
  1023. -        tmp >>= 16;
  1024. -    }
  1025. -    if( tmp >= 0x100 )
  1026. -    {
  1027. -        size += 16;
  1028. -        tmp >>= 8;
  1029. -    }
  1030. -    size += x264_ue_size_tab[tmp];
  1031. -    bs_write( s, size>>1, 0 );
  1032. -    bs_write( s, (size>>1)+1, val );
  1033. -}
  1034. -
  1035. -/* Only works on values under 255. */
  1036. -static inline void bs_write_ue( bs_t *s, int val )
  1037. -{
  1038. -    bs_write( s, x264_ue_size_tab[val+1], val+1 );
  1039. -}
  1040. -
  1041. -static inline void bs_write_se( bs_t *s, int val )
  1042. -{
  1043. -    int size = 0;
  1044. -    /* Faster than (val <= 0 ? -val*2+1 : val*2) */
  1045. -    /* 4 instructions on x86, 3 on ARM */
  1046. -    int tmp = 1 - val*2;
  1047. -    if( tmp < 0 ) tmp = val*2;
  1048. -    val = tmp;
  1049. -
  1050. -    if( tmp >= 0x100 )
  1051. -    {
  1052. -        size = 16;
  1053. -        tmp >>= 8;
  1054. -    }
  1055. -    size += x264_ue_size_tab[tmp];
  1056. -    bs_write( s, size, val );
  1057. -}
  1058. -
  1059. -static inline void bs_write_te( bs_t *s, int x, int val )
  1060. -{
  1061. -    if( x == 1 )
  1062. -        bs_write1( s, 1^val );
  1063. -    else //if( x > 1 )
  1064. -        bs_write_ue( s, val );
  1065. -}
  1066. -
  1067. -static inline void bs_rbsp_trailing( bs_t *s )
  1068. -{
  1069. -    bs_write1( s, 1 );
  1070. -    bs_write( s, s->i_left&7, 0  );
  1071. -}
  1072. -
  1073. -static ALWAYS_INLINE int bs_size_ue( unsigned int val )
  1074. -{
  1075. -    return x264_ue_size_tab[val+1];
  1076. -}
  1077. -
  1078. -static ALWAYS_INLINE int bs_size_ue_big( unsigned int val )
  1079. -{
  1080. -    if( val < 255 )
  1081. -        return x264_ue_size_tab[val+1];
  1082. -    else
  1083. -        return x264_ue_size_tab[(val+1)>>8] + 16;
  1084. -}
  1085. -
  1086. -static ALWAYS_INLINE int bs_size_se( int val )
  1087. -{
  1088. -    int tmp = 1 - val*2;
  1089. -    if( tmp < 0 ) tmp = val*2;
  1090. -    if( tmp < 256 )
  1091. -        return x264_ue_size_tab[tmp];
  1092. -    else
  1093. -        return x264_ue_size_tab[tmp>>8]+16;
  1094. -}
  1095. -
  1096. -static ALWAYS_INLINE int bs_size_te( int x, int val )
  1097. -{
  1098. -    if( x == 1 )
  1099. -        return 1;
  1100. -    else //if( x > 1 )
  1101. -        return x264_ue_size_tab[val+1];
  1102. -}
  1103. -
  1104. -#endif
  1105. diff --git a/common/common.c b/common/common.c
  1106. index 62bef99..f1e8758 100644
  1107. --- a/common/common.c
  1108. +++ b/common/common.c
  1109. @@ -1026,60 +1026,6 @@ void x264_picture_clean( x264_picture_t *pic )
  1110.  }
  1111.  
  1112.  /****************************************************************************
  1113. - * x264_nal_encode:
  1114. - ****************************************************************************/
  1115. -int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_startcode )
  1116. -{
  1117. -    uint8_t *src = nal->p_payload;
  1118. -    uint8_t *end = nal->p_payload + nal->i_payload;
  1119. -    uint8_t *orig_dst = dst;
  1120. -    int i_count = 0, size;
  1121. -
  1122. -    if( b_annexb )
  1123. -    {
  1124. -        if( b_long_startcode )
  1125. -            *dst++ = 0x00;
  1126. -        *dst++ = 0x00;
  1127. -        *dst++ = 0x00;
  1128. -        *dst++ = 0x01;
  1129. -    }
  1130. -    else /* save room for size later */
  1131. -        dst += 4;
  1132. -
  1133. -    /* nal header */
  1134. -    *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;
  1135. -
  1136. -    while( src < end )
  1137. -    {
  1138. -        if( i_count == 2 && *src <= 0x03 )
  1139. -        {
  1140. -            *dst++ = 0x03;
  1141. -            i_count = 0;
  1142. -        }
  1143. -        if( *src == 0 )
  1144. -            i_count++;
  1145. -        else
  1146. -            i_count = 0;
  1147. -        *dst++ = *src++;
  1148. -    }
  1149. -    size = (dst - orig_dst) - 4;
  1150. -
  1151. -    /* Write the size header for mp4/etc */
  1152. -    if( !b_annexb )
  1153. -    {
  1154. -        /* Size doesn't include the size of the header we're writing now. */
  1155. -        orig_dst[0] = size>>24;
  1156. -        orig_dst[1] = size>>16;
  1157. -        orig_dst[2] = size>> 8;
  1158. -        orig_dst[3] = size>> 0;
  1159. -    }
  1160. -
  1161. -    return size+4;
  1162. -}
  1163. -
  1164. -
  1165. -
  1166. -/****************************************************************************
  1167.   * x264_malloc:
  1168.   ****************************************************************************/
  1169.  void *x264_malloc( int i_size )
  1170. diff --git a/common/common.h b/common/common.h
  1171. index 539ea65..93712fe 100644
  1172. --- a/common/common.h
  1173. +++ b/common/common.h
  1174. @@ -137,7 +137,7 @@ static const int x264_scan8[16+2*4+3] =
  1175.  */
  1176.  
  1177.  #include "x264.h"
  1178. -#include "bs.h"
  1179. +#include "bitstream.h"
  1180.  #include "set.h"
  1181.  #include "predict.h"
  1182.  #include "pixel.h"
  1183. @@ -166,8 +166,6 @@ int64_t x264_mdate( void );
  1184.   * the encoding options */
  1185.  char *x264_param2string( x264_param_t *p, int b_res );
  1186.  
  1187. -int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_startcode );
  1188. -
  1189.  /* log */
  1190.  void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
  1191.  
  1192. @@ -796,6 +794,7 @@ struct x264_t
  1193.      x264_zigzag_function_t zigzagf;
  1194.      x264_quant_function_t quantf;
  1195.      x264_deblock_function_t loopf;
  1196. +    x264_bitstream_function_t bsf;
  1197.  
  1198.  #ifdef HAVE_VISUALIZE
  1199.      struct visualize_t *visualize;
  1200. diff --git a/common/x86/bitstream-a.asm b/common/x86/bitstream-a.asm
  1201. new file mode 100644
  1202. index 0000000..1fb4cea
  1203. --- /dev/null
  1204. +++ b/common/x86/bitstream-a.asm
  1205. @@ -0,0 +1,112 @@
  1206. +;*****************************************************************************
  1207. +;* bitstream-a.asm: h264 encoder library
  1208. +;*****************************************************************************
  1209. +;* Copyright (C) 2010 x264 project
  1210. +;*
  1211. +;* Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
  1212. +;*
  1213. +;* This program is free software; you can redistribute it and/or modify
  1214. +;* it under the terms of the GNU General Public License as published by
  1215. +;* the Free Software Foundation; either version 2 of the License, or
  1216. +;* (at your option) any later version.
  1217. +;*
  1218. +;* This program is distributed in the hope that it will be useful,
  1219. +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  1220. +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  1221. +;* GNU General Public License for more details.
  1222. +;*
  1223. +;* You should have received a copy of the GNU General Public License
  1224. +;* along with this program; if not, write to the Free Software
  1225. +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  1226. +;*****************************************************************************
  1227. +
  1228. +%include "x86inc.asm"
  1229. +%include "x86util.asm"
  1230. +
  1231. +SECTION .text
  1232. +
  1233. +;-----------------------------------------------------------------------------
  1234. +; uint8_t *x264_nal_escape( uint8_t *dst, uint8_t *src, uint8_t *end )
  1235. +;-----------------------------------------------------------------------------
  1236. +
  1237. +%macro NAL_LOOP 2
  1238. +ALIGN 16
  1239. +%1:
  1240. +    mova      m0, [r1+r2]
  1241. +    mova      m1, m0
  1242. +%if mmsize == 8
  1243. +    psrlq     m0, 8
  1244. +%else
  1245. +    psrldq    m0, 1
  1246. +%endif
  1247. +    %2   [r0+r1], m1
  1248. +    por       m1, m0
  1249. +    pcmpeqb   m1, m2
  1250. +    pmovmskb r3d, m1
  1251. +    test     r3d, r3d
  1252. +    jnz .escape
  1253. +    add       r1, mmsize
  1254. +    jl %1
  1255. +%endmacro
  1256. +
  1257. +%macro NAL_ESCAPE 1
  1258. +
  1259. +cglobal nal_escape_%1, 3,5
  1260. +    pxor      m2, m2
  1261. +    sub       r1, r2 ; r1 = offset of current src pointer from end of src
  1262. +    sub       r0, r1 ; r0 = projected end of dst, assuming no more escapes
  1263. +
  1264. +    mov      r3w, [r1+r2]
  1265. +    mov  [r0+r1], r3w
  1266. +    add       r1, 2
  1267. +    jge .ret
  1268. +
  1269. +    ; Start off by jumping into the escape loop in
  1270. +    ; case there's an escape at the start.
  1271. +    ; And do a few more in scalar until src is aligned again.
  1272. +    lea      r4d, [r1+r2]
  1273. +    or       r4d, -mmsize
  1274. +    neg      r4d
  1275. +    jmp .escapeloop
  1276. +
  1277. +    NAL_LOOP .loop_aligned, mova
  1278. +%if mmsize==16
  1279. +    NAL_LOOP .loop_unaligned, movu
  1280. +%endif
  1281. +
  1282. +.ret:
  1283. +    movifnidn rax, r0
  1284. +    RET
  1285. +ALIGN 16
  1286. +.escape:
  1287. +    mov      r4d, mmsize
  1288. +.escapeloop:
  1289. +    mov      r3b, [r1+r2]
  1290. +    cmp      r3b, 3
  1291. +    jna .escape_check
  1292. +.copy:
  1293. +    mov  [r0+r1], r3b
  1294. +    inc      r1
  1295. +    jge .ret
  1296. +    dec      r4d
  1297. +    jg .escapeloop
  1298. +    cmp byte [r1+r2-1], 0 ; Don't go back to the main loop until we're out of a zero-run.
  1299. +    jz .escape
  1300. +%if mmsize==16
  1301. +    lea      r4d, [r0+r1]
  1302. +    test     r4d, mmsize-1
  1303. +    jnz .loop_unaligned
  1304. +%endif
  1305. +    jmp .loop_aligned
  1306. +.escape_check:
  1307. +    cmp word [r0+r1-2], 0
  1308. +    jnz .copy
  1309. +    mov byte [r0+r1], 3
  1310. +    inc      r0
  1311. +    jmp .copy
  1312. +%endmacro
  1313. +
  1314. +INIT_MMX
  1315. +NAL_ESCAPE mmxext
  1316. +INIT_XMM
  1317. +NAL_ESCAPE sse2
  1318. diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
  1319. index aedd688..3a31e26 100644
  1320. --- a/common/x86/deblock-a.asm
  1321. +++ b/common/x86/deblock-a.asm
  1322. @@ -4,6 +4,7 @@
  1323.  ;* Copyright (C) 2005-2008 x264 project
  1324.  ;*
  1325.  ;* Authors: Loren Merritt <lorenm@u.washington.edu>
  1326. +;*          Jason Garrett-Glaser <darkshikari@gmail.com>
  1327.  ;*
  1328.  ;* This program is free software; you can redistribute it and/or modify
  1329.  ;* it under the terms of the GNU General Public License as published by
  1330. diff --git a/encoder/encoder.c b/encoder/encoder.c
  1331. index 6e0dc54..32db82a 100644
  1332. --- a/encoder/encoder.c
  1333. +++ b/encoder/encoder.c
  1334. @@ -986,6 +986,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
  1335.      x264_mc_init( h->param.cpu, &h->mc );
  1336.      x264_quant_init( h, h->param.cpu, &h->quantf );
  1337.      x264_deblock_init( h->param.cpu, &h->loopf );
  1338. +    x264_bitstream_init( h->param.cpu, &h->bsf );
  1339.      x264_dct_init_weights();
  1340.  
  1341.      mbcmp_init( h );
  1342. @@ -1272,7 +1273,7 @@ static int x264_encoder_encapsulate_nals( x264_t *h, int start )
  1343.      for( int i = start; i < h->out.i_nal; i++ )
  1344.      {
  1345.          int long_startcode = !i || h->out.nal[i].i_type == NAL_SPS || h->out.nal[i].i_type == NAL_PPS;
  1346. -        int size = x264_nal_encode( nal_buffer, &h->out.nal[i], h->param.b_annexb, long_startcode );
  1347. +        int size = x264_nal_encode( h, nal_buffer, &h->out.nal[i], long_startcode );
  1348.          h->out.nal[i].i_payload = size;
  1349.          h->out.nal[i].p_payload = nal_buffer;
  1350.          nal_buffer += size;
  1351. diff --git a/tools/checkasm.c b/tools/checkasm.c
  1352. index a0a9d54..ea6f209 100644
  1353. --- a/tools/checkasm.c
  1354. +++ b/tools/checkasm.c
  1355. @@ -1661,6 +1661,55 @@ static int check_cabac( int cpu_ref, int cpu_new )
  1356.      return ret;
  1357.  }
  1358.  
  1359. +static int check_bitstream( int cpu_ref, int cpu_new )
  1360. +{
  1361. +    x264_bitstream_function_t bs_c;
  1362. +    x264_bitstream_function_t bs_ref;
  1363. +    x264_bitstream_function_t bs_a;
  1364. +
  1365. +    int ret = 0, ok = 1, used_asm = 0;
  1366. +
  1367. +    x264_bitstream_init( 0, &bs_c );
  1368. +    x264_bitstream_init( cpu_ref, &bs_ref );
  1369. +    x264_bitstream_init( cpu_new, &bs_a );
  1370. +    if( bs_a.nal_escape != bs_ref.nal_escape )
  1371. +    {
  1372. +        int size = 0x4000;
  1373. +        uint8_t *input = malloc(size+100);
  1374. +        uint8_t *output1 = malloc(size*2);
  1375. +        uint8_t *output2 = malloc(size*2);
  1376. +        used_asm = 1;
  1377. +        set_func_name( "nal_escape" );
  1378. +        for( int i = 0; i < 100; i++ )
  1379. +        {
  1380. +            /* Test corner-case sizes */
  1381. +            int test_size = i < 10 ? i+1 : rand() & 0x3fff;
  1382. +            for( int j = 0; j < test_size; j++ )
  1383. +                input[j] = (rand()&1) * rand();
  1384. +            uint8_t *end_c = (uint8_t*)call_c1( bs_c.nal_escape, output1, input, input+test_size );
  1385. +            uint8_t *end_a = (uint8_t*)call_a1( bs_a.nal_escape, output2, input, input+test_size );
  1386. +            int size_c = end_c-output1;
  1387. +            int size_a = end_a-output2;
  1388. +            if( size_c != size_a || memcmp( output1, output2, size_c ) )
  1389. +            {
  1390. +                fprintf( stderr, "nal_escape :  [FAILED] %d %d\n", size_c, size_a );
  1391. +                ok = 0;
  1392. +                break;
  1393. +            }
  1394. +        }
  1395. +        for( int j = 0; j < size; j++ )
  1396. +            input[j] = rand();
  1397. +        call_c2( bs_c.nal_escape, output1, input, input+size );
  1398. +        call_a2( bs_a.nal_escape, output2, input, input+size );
  1399. +        free(input);
  1400. +        free(output1);
  1401. +        free(output2);
  1402. +    }
  1403. +    report( "nal escape:" );
  1404. +
  1405. +    return ret;
  1406. +}
  1407. +
  1408.  static int check_all_funcs( int cpu_ref, int cpu_new )
  1409.  {
  1410.      return check_pixel( cpu_ref, cpu_new )
  1411. @@ -1669,7 +1718,8 @@ static int check_all_funcs( int cpu_ref, int cpu_new )
  1412.           + check_intra( cpu_ref, cpu_new )
  1413.           + check_deblock( cpu_ref, cpu_new )
  1414.           + check_quant( cpu_ref, cpu_new )
  1415. -         + check_cabac( cpu_ref, cpu_new );
  1416. +         + check_cabac( cpu_ref, cpu_new )
  1417. +         + check_bitstream( cpu_ref, cpu_new );
  1418.  }
  1419.  
  1420.  static int add_flags( int *cpu_ref, int *cpu_new, int flags, const char *name )
  1421. --
  1422. 1.7.0.4
  1423.  
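For reference, the nal_escape function exercised by the checkasm test above performs H.264 start-code emulation prevention: whenever two consecutive zero bytes in the payload would be followed by a byte in the range 0x00-0x03, a 0x03 byte is inserted so the stream can never imitate an Annex B start code. The sketch below is a minimal C rendering of that rule, matching the (dst, src, end) calling convention used in check_bitstream; the function and variable names are illustrative and this is not x264's actual implementation.

#include <stdint.h>

/* Copy src..end to dst, inserting an emulation_prevention_three_byte (0x03)
 * whenever two consecutive zero bytes would otherwise be followed by a byte
 * in the range 0x00-0x03. Returns one past the last byte written, which is
 * what the checkasm test compares for the C and asm versions. */
static uint8_t *nal_escape_ref( uint8_t *dst, uint8_t *src, uint8_t *end )
{
    int zero_run = 0;
    while( src < end )
    {
        if( zero_run >= 2 && *src <= 0x03 )
        {
            *dst++ = 0x03;
            zero_run = 0;
        }
        zero_run = (*src == 0x00) ? zero_run + 1 : 0;
        *dst++ = *src++;
    }
    return dst;
}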
  1424.  
  1425. From 92e968cda1b4306ae0d99024114adcd17c617637 Mon Sep 17 00:00:00 2001
  1426. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  1427. Date: Fri, 28 May 2010 14:30:07 -0700
  1428. Subject: [PATCH 5/8] Re-enable i8x8 merged SATD
  1429.  Accidentally got disabled when intra_sad_x3 was added.
  1430.  
  1431. ---
  1432. encoder/encoder.c |    1 +
  1433.  1 files changed, 1 insertions(+), 0 deletions(-)
  1434.  
  1435. diff --git a/encoder/encoder.c b/encoder/encoder.c
  1436. index 32db82a..2f9e7f6 100644
  1437. --- a/encoder/encoder.c
  1438. +++ b/encoder/encoder.c
  1439. @@ -810,6 +810,7 @@ static void mbcmp_init( x264_t *h )
  1440.      memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) );
  1441.      h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16;
  1442.      h->pixf.intra_mbcmp_x3_8x8c = satd ? h->pixf.intra_satd_x3_8x8c : h->pixf.intra_sad_x3_8x8c;
  1443. +    h->pixf.intra_mbcmp_x3_8x8 = satd ? h->pixf.intra_sa8d_x3_8x8 : h->pixf.intra_sad_x3_8x8;
  1444.      h->pixf.intra_mbcmp_x3_4x4 = satd ? h->pixf.intra_satd_x3_4x4 : h->pixf.intra_sad_x3_4x4;
  1445.      satd &= h->param.analyse.i_me_method == X264_ME_TESA;
  1446.      memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) );
  1447. --
  1448. 1.7.0.4
  1449.  
  1450.  
  1451. From 02fa45a7a2e26a885bcf6e996bec2a7ee6c242bf Mon Sep 17 00:00:00 2001
  1452. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  1453. Date: Fri, 28 May 2010 14:27:22 -0700
  1454. Subject: [PATCH 6/8] Add API tool to apply arbitrary quantizer offsets
  1455.  The calling application can now pass a "map" of quantizer offsets to apply to each frame.
  1456.  An optional callback to free the map can also be included.
  1457.  This allows all kinds of flexible region-of-interest coding and similar.
  1458.  
  1459. ---
  1460. common/common.c       |    2 +-
  1461.  encoder/encoder.c     |    7 +++++--
  1462.  encoder/ratecontrol.c |   36 +++++++++++++++++++++++++-----------
  1463.  encoder/ratecontrol.h |    4 ++--
  1464.  x264.h                |   20 +++++++++++++++++++-
  1465.  5 files changed, 52 insertions(+), 17 deletions(-)
  1466.  
  1467. diff --git a/common/common.c b/common/common.c
  1468. index f1e8758..c092c01 100644
  1469. --- a/common/common.c
  1470. +++ b/common/common.c
  1471. @@ -997,6 +997,7 @@ static void x264_log_default( void *p_unused, int i_level, const char *psz_fmt,
  1472.   ****************************************************************************/
  1473.  int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height )
  1474.  {
  1475. +    memset( pic, 0, sizeof( x264_picture_t ) );
  1476.      pic->i_type = X264_TYPE_AUTO;
  1477.      pic->i_qpplus1 = 0;
  1478.      pic->img.i_csp = i_csp;
  1479. @@ -1009,7 +1010,6 @@ int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_heigh
  1480.      pic->img.i_stride[0] = i_width;
  1481.      pic->img.i_stride[1] = i_width / 2;
  1482.      pic->img.i_stride[2] = i_width / 2;
  1483. -    pic->param = NULL;
  1484.      pic->i_pic_struct = PIC_STRUCT_AUTO;
  1485.      return 0;
  1486.  }
  1487. diff --git a/encoder/encoder.c b/encoder/encoder.c
  1488. index 2f9e7f6..89107a3 100644
  1489. --- a/encoder/encoder.c
  1490. +++ b/encoder/encoder.c
  1491. @@ -2250,11 +2250,14 @@ int     x264_encoder_encode( x264_t *h,
  1492.  
  1493.          if( h->param.rc.b_mb_tree && h->param.rc.b_stat_read )
  1494.          {
  1495. -            if( x264_macroblock_tree_read( h, fenc ) )
  1496. +            if( x264_macroblock_tree_read( h, fenc, pic_in->prop.quant_offsets ) )
  1497.                  return -1;
  1498.          }
  1499.          else
  1500. -            x264_adaptive_quant_frame( h, fenc );
  1501. +            x264_adaptive_quant_frame( h, fenc, pic_in->prop.quant_offsets );
  1502. +
  1503. +        if( pic_in->prop.quant_offsets_free )
  1504. +            pic_in->prop.quant_offsets_free( pic_in->prop.quant_offsets );
  1505.  
  1506.          if( h->frames.b_have_lowres )
  1507.              x264_frame_init_lowres( h, fenc );
  1508. diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
  1509. index bf0a400..d09de98 100644
  1510. --- a/encoder/ratecontrol.c
  1511. +++ b/encoder/ratecontrol.c
  1512. @@ -235,7 +235,7 @@ static NOINLINE uint32_t x264_ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_
  1513.      return var;
  1514.  }
  1515.  
  1516. -void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
  1517. +void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_offsets )
  1518.  {
  1519.      /* constants chosen to result in approximately the same overall bitrate as without AQ.
  1520.       * FIXME: while they're written in 5 significant digits, they're only tuned to 2. */
  1521. @@ -256,11 +256,22 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
  1522.          /* Need to init it anyways for MB tree */
  1523.          if( h->param.rc.f_aq_strength == 0 )
  1524.          {
  1525. -            memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
  1526. -            memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
  1527. -            if( h->frames.b_have_lowres )
  1528. +            if( quant_offsets )
  1529. +            {
  1530.                  for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
  1531. -                    frame->i_inv_qscale_factor[mb_xy] = 256;
  1532. +                    frame->f_qp_offset[mb_xy] = frame->f_qp_offset_aq[mb_xy] = quant_offsets[mb_xy];
  1533. +                if( h->frames.b_have_lowres )
  1534. +                    for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
  1535. +                        frame->i_inv_qscale_factor[mb_xy] = x264_exp2fix8( frame->f_qp_offset[mb_xy] );
  1536. +            }
  1537. +            else
  1538. +            {
  1539. +                memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
  1540. +                memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
  1541. +                if( h->frames.b_have_lowres )
  1542. +                    for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
  1543. +                        frame->i_inv_qscale_factor[mb_xy] = 256;
  1544. +            }
  1545.          }
  1546.          /* Need variance data for weighted prediction */
  1547.          if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
  1548. @@ -299,9 +310,10 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
  1549.              for( int mb_x = 0; mb_x < width; mb_x++ )
  1550.              {
  1551.                  float qp_adj;
  1552. +                int mb_xy = mb_x + mb_y*h->mb.i_mb_stride;
  1553.                  if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
  1554.                  {
  1555. -                    qp_adj = frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride];
  1556. +                    qp_adj = frame->f_qp_offset[mb_xy];
  1557.                      qp_adj = strength * (qp_adj - avg_adj);
  1558.                  }
  1559.                  else
  1560. @@ -309,10 +321,12 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
  1561.                      uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
  1562.                      qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - 14.427f);
  1563.                  }
  1564. -                frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] =
  1565. -                frame->f_qp_offset_aq[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
  1566. +                if( quant_offsets )
  1567. +                    qp_adj += quant_offsets[mb_xy];
  1568. +                frame->f_qp_offset[mb_xy] =
  1569. +                frame->f_qp_offset_aq[mb_xy] = qp_adj;
  1570.                  if( h->frames.b_have_lowres )
  1571. -                    frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj);
  1572. +                    frame->i_inv_qscale_factor[mb_xy] = x264_exp2fix8(qp_adj);
  1573.              }
  1574.      }
  1575.  
  1576. @@ -327,7 +341,7 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
  1577.      }
  1578.  }
  1579.  
  1580. -int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame )
  1581. +int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame, float *quant_offsets )
  1582.  {
  1583.      x264_ratecontrol_t *rc = h->rc;
  1584.      uint8_t i_type_actual = rc->entry[frame->i_frame].pict_type;
  1585. @@ -363,7 +377,7 @@ int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame )
  1586.          rc->qpbuf_pos--;
  1587.      }
  1588.      else
  1589. -        x264_adaptive_quant_frame( h, frame );
  1590. +        x264_adaptive_quant_frame( h, frame, quant_offsets );
  1591.      return 0;
  1592.  fail:
  1593.      x264_log(h, X264_LOG_ERROR, "Incomplete MB-tree stats file.\n");
  1594. diff --git a/encoder/ratecontrol.h b/encoder/ratecontrol.h
  1595. index e052b2a..dd139eb 100644
  1596. --- a/encoder/ratecontrol.h
  1597. +++ b/encoder/ratecontrol.h
  1598. @@ -29,8 +29,8 @@ void x264_ratecontrol_delete( x264_t * );
  1599.  
  1600.  void x264_ratecontrol_init_reconfigurable( x264_t *h, int b_init );
  1601.  
  1602. -void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame );
  1603. -int  x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame );
  1604. +void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_offsets );
  1605. +int  x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame, float *quant_offsets );
  1606.  int  x264_reference_build_list_optimal( x264_t *h );
  1607.  void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next );
  1608.  void x264_ratecontrol_start( x264_t *, int i_force_qp, int overhead );
  1609. diff --git a/x264.h b/x264.h
  1610. index 95efd88..a4b3400 100644
  1611. --- a/x264.h
  1612. +++ b/x264.h
  1613. @@ -35,7 +35,7 @@
  1614.  
  1615.  #include <stdarg.h>
  1616.  
  1617. -#define X264_BUILD 96
  1618. +#define X264_BUILD 97
  1619.  
  1620.  /* x264_t:
  1621.   *      opaque handler for encoder */
  1622. @@ -508,6 +508,22 @@ typedef struct
  1623.  
  1624.  typedef struct
  1625.  {
  1626. +    /* In: an array of quantizer offsets to be applied to this image during encoding.
  1627. +     *     These are added on top of the decisions made by x264.
  1628. +     *     Offsets can be fractional; they are added before QPs are rounded to integer.
  1629. +     *     Adaptive quantization must be enabled to use this feature.  Behavior if quant
  1630. +     *     offsets differ between encoding passes is undefined.
  1631. +     *
  1632. +     *     Array contains one offset per macroblock, in raster scan order.  In interlaced
  1633. +     *     mode, top-field MBs and bottom-field MBs are interleaved at the row level. */
  1634. +    float *quant_offsets;
  1635. +    /* In: optional callback to free quant_offsets when used.
  1636. +     *     Useful if one wants to use a different quant_offset array for each frame. */
  1637. +    void (*quant_offsets_free)( void* );
  1638. +} x264_image_properties_t;
  1639. +
  1640. +typedef struct
  1641. +{
  1642.      /* In: force picture type (if not auto)
  1643.       *     If x264 encoding parameters are violated in the forcing of picture types,
  1644.       *     x264 will correct the input picture type and log a warning.
  1645. @@ -537,6 +553,8 @@ typedef struct
  1646.      x264_param_t *param;
  1647.      /* In: raw data */
  1648.      x264_image_t img;
  1649. +    /* In: optional information to modify encoder decisions for this frame */
  1650. +    x264_image_properties_t prop;
  1651.      /* Out: HRD timing information. Output only when i_nal_hrd is set. */
  1652.      x264_hrd_t hrd_timing;
  1653.      /* private user data. libx264 doesn't touch this,
  1654. --
  1655. 1.7.0.4
  1656.  
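The x264.h hunk in the patch above documents the new per-frame quant_offsets map: one float per macroblock in raster scan order, added on top of x264's own decisions, with adaptive quantization enabled. The sketch below shows one way a calling application might use it for region-of-interest coding; the helper name, the macroblock-unit ROI bounds, and the -3.0 offset are illustrative assumptions, not part of the API. Passing free as quant_offsets_free lets the caller hand libx264 a freshly allocated map each frame, which the encoder releases once it has consumed it.

#include <stdlib.h>
#include <x264.h>

/* Bias a rectangular region of interest toward higher quality by giving its
 * macroblocks a negative QP offset. mb_w and mb_h are the frame dimensions
 * in macroblocks, i.e. (width+15)/16 and (height+15)/16. */
static int encode_frame_with_roi( x264_t *h, x264_picture_t *pic,
                                  int mb_w, int mb_h,
                                  int rx0, int ry0, int rx1, int ry1 )
{
    x264_picture_t pic_out;
    x264_nal_t *nal;
    int i_nal;

    float *offsets = calloc( mb_w * mb_h, sizeof(float) );
    if( !offsets )
        return -1;
    for( int y = ry0; y < ry1; y++ )
        for( int x = rx0; x < rx1; x++ )
            offsets[x + y*mb_w] = -3.0f;   /* lower QP inside the ROI */

    pic->prop.quant_offsets = offsets;
    pic->prop.quant_offsets_free = free;   /* called by libx264 after the map is consumed */

    return x264_encoder_encode( h, &nal, &i_nal, pic, &pic_out );
}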
  1657.  
  1658. From 1edf08c06c9f07fc8bb56879033dbc59e86ef7ac Mon Sep 17 00:00:00 2001
  1659. From: Henrik Gramner <hengar-6@student.ltu.se>
  1660. Date: Thu, 27 May 2010 22:18:38 +0200
  1661. Subject: [PATCH 7/8] Optimize out some x264_scan8 reads
  1662.  
  1663. ---
  1664. encoder/analyse.c    |   15 ++++-----
  1665.  encoder/macroblock.c |   82 ++++++++++++++++++++++++++++++--------------------
  1666.  encoder/me.c         |   25 ++++++++-------
  1667.  3 files changed, 70 insertions(+), 52 deletions(-)
  1668.  
  1669. diff --git a/encoder/analyse.c b/encoder/analyse.c
  1670. index a128a70..9e85e89 100644
  1671. --- a/encoder/analyse.c
  1672. +++ b/encoder/analyse.c
  1673. @@ -907,8 +907,6 @@ static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
  1674.  static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
  1675.  {
  1676.      uint8_t  *p_dst = h->mb.pic.p_fdec[0];
  1677. -
  1678. -    int x, y;
  1679.      uint64_t i_satd, i_best;
  1680.      h->mb.i_skip_intra = 0;
  1681.  
  1682. @@ -1031,8 +1029,9 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
  1683.              int i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
  1684.  
  1685.              i_best = COST_MAX64;
  1686. -            x = idx&1;
  1687. -            y = idx>>1;
  1688. +            int x = idx&1;
  1689. +            int y = idx>>1;
  1690. +            int s8 = X264_SCAN8_0 + 2*x + 16*y;
  1691.  
  1692.              p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
  1693.              predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
  1694. @@ -1061,8 +1060,8 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
  1695.                      if( !(idx&1) )
  1696.                          for( int j = 0; j < 7; j++ )
  1697.                              pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
  1698. -                    i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
  1699. -                    i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
  1700. +                    i_nnz[0] = M16( &h->mb.cache.non_zero_count[s8 + 0*8] );
  1701. +                    i_nnz[1] = M16( &h->mb.cache.non_zero_count[s8 + 1*8] );
  1702.                  }
  1703.              }
  1704.              a->i_cbp_i8x8_luma = cbp_luma_new;
  1705. @@ -1070,8 +1069,8 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
  1706.              if( !(idx&1) )
  1707.                  for( int j = 0; j < 7; j++ )
  1708.                      p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
  1709. -            M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
  1710. -            M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];
  1711. +            M16( &h->mb.cache.non_zero_count[s8 + 0*8] ) = i_nnz[0];
  1712. +            M16( &h->mb.cache.non_zero_count[s8 + 1*8] ) = i_nnz[1];
  1713.  
  1714.              x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
  1715.          }
  1716. diff --git a/encoder/macroblock.c b/encoder/macroblock.c
  1717. index 984f8a8..cdc4563 100644
  1718. --- a/encoder/macroblock.c
  1719. +++ b/encoder/macroblock.c
  1720. @@ -135,11 +135,12 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
  1721.      }
  1722.  }
  1723.  
  1724. -#define STORE_8x8_NNZ(idx,nz)\
  1725. +#define STORE_8x8_NNZ( s8, nz )\
  1726. +do\
  1727.  {\
  1728. -    M16( &h->mb.cache.non_zero_count[x264_scan8[idx*4+0]] ) = nz * 0x0101;\
  1729. -    M16( &h->mb.cache.non_zero_count[x264_scan8[idx*4+2]] ) = nz * 0x0101;\
  1730. -}
  1731. +    M16( &h->mb.cache.non_zero_count[(s8) + 0*8] ) = (nz) * 0x0101;\
  1732. +    M16( &h->mb.cache.non_zero_count[(s8) + 1*8] ) = (nz) * 0x0101;\
  1733. +} while(0)
  1734.  
  1735.  #define CLEAR_16x16_NNZ \
  1736.  {\
  1737. @@ -151,17 +152,18 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
  1738.  
  1739.  void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
  1740.  {
  1741. -    int x = 8 * (idx&1);
  1742. -    int y = 8 * (idx>>1);
  1743. +    int x = idx&1;
  1744. +    int y = idx>>1;
  1745. +    int s8 = X264_SCAN8_0 + 2*x + 16*y;
  1746.      int nz;
  1747. -    uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
  1748. -    uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
  1749. +    uint8_t *p_src = &h->mb.pic.p_fenc[0][8*x + 8*y*FENC_STRIDE];
  1750. +    uint8_t *p_dst = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
  1751.      ALIGNED_ARRAY_16( int16_t, dct8x8,[64] );
  1752.  
  1753.      if( h->mb.b_lossless )
  1754.      {
  1755.          nz = h->zigzagf.sub_8x8( h->dct.luma8x8[idx], p_src, p_dst );
  1756. -        STORE_8x8_NNZ(idx,nz);
  1757. +        STORE_8x8_NNZ( s8, nz );
  1758.          h->mb.i_cbp_luma |= nz<<idx;
  1759.          return;
  1760.      }
  1761. @@ -175,10 +177,10 @@ void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
  1762.          h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8 );
  1763.          h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qp );
  1764.          h->dctf.add8x8_idct8( p_dst, dct8x8 );
  1765. -        STORE_8x8_NNZ(idx,1);
  1766. +        STORE_8x8_NNZ( s8, 1 );
  1767.      }
  1768.      else
  1769. -        STORE_8x8_NNZ(idx,0);
  1770. +        STORE_8x8_NNZ( s8, 0 );
  1771.  }
  1772.  
  1773.  static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
  1774. @@ -728,12 +730,13 @@ void x264_macroblock_encode( x264_t *h )
  1775.              if( h->mb.b_transform_8x8 )
  1776.                  for( int i8x8 = 0; i8x8 < 4; i8x8++ )
  1777.                  {
  1778. -                    int x = 8*(i8x8&1);
  1779. -                    int y = 8*(i8x8>>1);
  1780. -                    nz = h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8],
  1781. -                                        h->mb.pic.p_fenc[0]+x+y*FENC_STRIDE,
  1782. -                                        h->mb.pic.p_fdec[0]+x+y*FDEC_STRIDE );
  1783. -                    STORE_8x8_NNZ(i8x8,nz);
  1784. +                    int x = i8x8&1;
  1785. +                    int y = i8x8>>1;
  1786. +                    int s8 = X264_SCAN8_0 + 2*x + 16*y;
  1787. +
  1788. +                    nz = h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8], h->mb.pic.p_fenc[0] + 8*x + 8*y*FENC_STRIDE,
  1789. +                                                                   h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE );
  1790. +                    STORE_8x8_NNZ( s8, nz );
  1791.                      h->mb.i_cbp_luma |= nz << i8x8;
  1792.                  }
  1793.              else
  1794. @@ -783,14 +786,18 @@ void x264_macroblock_encode( x264_t *h )
  1795.              {
  1796.                  for( int idx = 0; idx < 4; idx++ )
  1797.                  {
  1798. +                    int x = idx&1;
  1799. +                    int y = idx>>1;
  1800. +                    int s8 = X264_SCAN8_0 + 2*x + 16*y;
  1801. +
  1802.                      if( h->mb.i_cbp_luma&(1<<idx) )
  1803.                      {
  1804.                          h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp );
  1805. -                        h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][(idx&1)*8 + (idx>>1)*8*FDEC_STRIDE], dct8x8[idx] );
  1806. -                        STORE_8x8_NNZ(idx,1);
  1807. +                        h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE], dct8x8[idx] );
  1808. +                        STORE_8x8_NNZ( s8, 1 );
  1809.                      }
  1810.                      else
  1811. -                        STORE_8x8_NNZ(idx,0);
  1812. +                        STORE_8x8_NNZ( s8, 0 );
  1813.                  }
  1814.              }
  1815.          }
  1816. @@ -825,18 +832,24 @@ void x264_macroblock_encode( x264_t *h )
  1817.                      }
  1818.                  }
  1819.  
  1820. +                int x = i8x8&1;
  1821. +                int y = i8x8>>1;
  1822. +
  1823.                  /* decimate this 8x8 block */
  1824.                  i_decimate_mb += i_decimate_8x8;
  1825.                  if( b_decimate )
  1826.                  {
  1827.                      if( i_decimate_8x8 < 4 )
  1828. -                        STORE_8x8_NNZ(i8x8,0)
  1829. +                    {
  1830. +                        int s8 = X264_SCAN8_0 + 2*x + 16*y;
  1831. +                        STORE_8x8_NNZ( s8, 0 );
  1832. +                    }
  1833.                      else
  1834.                          h->mb.i_cbp_luma |= 1<<i8x8;
  1835.                  }
  1836.                  else if( cbp )
  1837.                  {
  1838. -                    h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
  1839. +                    h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE], &dct4x4[i8x8*4] );
  1840.                      h->mb.i_cbp_luma |= 1<<i8x8;
  1841.                  }
  1842.              }
  1843. @@ -1045,8 +1058,11 @@ void x264_noise_reduction_update( x264_t *h )
  1844.  void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
  1845.  {
  1846.      int i_qp = h->mb.i_qp;
  1847. -    uint8_t *p_fenc = h->mb.pic.p_fenc[0] + (i8&1)*8 + (i8>>1)*8*FENC_STRIDE;
  1848. -    uint8_t *p_fdec = h->mb.pic.p_fdec[0] + (i8&1)*8 + (i8>>1)*8*FDEC_STRIDE;
  1849. +    int x = i8&1;
  1850. +    int y = i8>>1;
  1851. +    int s8 = X264_SCAN8_0 + 2*x + 16*y;
  1852. +    uint8_t *p_fenc = h->mb.pic.p_fenc[0] + 8*x + 8*y*FENC_STRIDE;
  1853. +    uint8_t *p_fdec = h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE;
  1854.      int b_decimate = h->mb.b_dct_decimate;
  1855.      int nnz8x8 = 0;
  1856.      int nz;
  1857. @@ -1059,7 +1075,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
  1858.          if( h->mb.b_transform_8x8 )
  1859.          {
  1860.              nnz8x8 = h->zigzagf.sub_8x8( h->dct.luma8x8[i8], p_fenc, p_fdec );
  1861. -            STORE_8x8_NNZ(i8,nnz8x8);
  1862. +            STORE_8x8_NNZ( s8, nnz8x8 );
  1863.          }
  1864.          else
  1865.          {
  1866. @@ -1075,8 +1091,8 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
  1867.          for( int ch = 0; ch < 2; ch++ )
  1868.          {
  1869.              int16_t dc;
  1870. -            p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
  1871. -            p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
  1872. +            p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE;
  1873. +            p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE;
  1874.              nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i8+ch*4], p_fenc, p_fdec, &dc );
  1875.              h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = nz;
  1876.          }
  1877. @@ -1099,13 +1115,13 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
  1878.                  {
  1879.                      h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp );
  1880.                      h->dctf.add8x8_idct8( p_fdec, dct8x8 );
  1881. -                    STORE_8x8_NNZ(i8,1);
  1882. +                    STORE_8x8_NNZ( s8, 1 );
  1883.                  }
  1884.                  else
  1885. -                    STORE_8x8_NNZ(i8,0);
  1886. +                    STORE_8x8_NNZ( s8, 0 );
  1887.              }
  1888.              else
  1889. -                STORE_8x8_NNZ(i8,0);
  1890. +                STORE_8x8_NNZ( s8, 0 );
  1891.          }
  1892.          else
  1893.          {
  1894. @@ -1132,7 +1148,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
  1895.              if( nnz8x8 )
  1896.                  h->dctf.add8x8_idct( p_fdec, dct4x4 );
  1897.              else
  1898. -                STORE_8x8_NNZ(i8,0);
  1899. +                STORE_8x8_NNZ( s8, 0 );
  1900.          }
  1901.  
  1902.          i_qp = h->mb.i_chroma_qp;
  1903. @@ -1140,8 +1156,8 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
  1904.          for( int ch = 0; ch < 2; ch++ )
  1905.          {
  1906.              ALIGNED_ARRAY_16( int16_t, dct4x4,[16] );
  1907. -            p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
  1908. -            p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
  1909. +            p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE;
  1910. +            p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE;
  1911.  
  1912.              h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
  1913.              dct4x4[0] = 0;
  1914. diff --git a/encoder/me.c b/encoder/me.c
  1915. index 77073cc..40d0650 100644
  1916. --- a/encoder/me.c
  1917. +++ b/encoder/me.c
  1918. @@ -937,8 +937,11 @@ int x264_iter_kludge = 0;
  1919.  
  1920.  static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2, int rd )
  1921.  {
  1922. -    int16_t *cache0_mv = h->mb.cache.mv[0][x264_scan8[i8*4]];
  1923. -    int16_t *cache1_mv = h->mb.cache.mv[1][x264_scan8[i8*4]];
  1924. +    int x = i8&1;
  1925. +    int y = i8>>1;
  1926. +    int s8 = X264_SCAN8_0 + 2*x + 16*y;
  1927. +    int16_t *cache0_mv = h->mb.cache.mv[0][s8];
  1928. +    int16_t *cache1_mv = h->mb.cache.mv[1][s8];
  1929.      const int i_pixel = m0->i_pixel;
  1930.      const int bw = x264_pixel_size[i_pixel].w;
  1931.      const int bh = x264_pixel_size[i_pixel].h;
  1932. @@ -946,11 +949,11 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
  1933.      ALIGNED_ARRAY_8( uint8_t, pixu_buf,[2],[9][8*8] );
  1934.      ALIGNED_ARRAY_8( uint8_t, pixv_buf,[2],[9][8*8] );
  1935.      uint8_t *src[2][9];
  1936. -    uint8_t *pix  = &h->mb.pic.p_fdec[0][(i8>>1)*8*FDEC_STRIDE+(i8&1)*8];
  1937. -    uint8_t *pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
  1938. -    uint8_t *pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
  1939. -    const int ref0 = h->mb.cache.ref[0][x264_scan8[i8*4]];
  1940. -    const int ref1 = h->mb.cache.ref[1][x264_scan8[i8*4]];
  1941. +    uint8_t *pix  = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
  1942. +    uint8_t *pixu = &h->mb.pic.p_fdec[1][4*x + 4*y*FDEC_STRIDE];
  1943. +    uint8_t *pixv = &h->mb.pic.p_fdec[2][4*x + 4*y*FDEC_STRIDE];
  1944. +    int ref0 = h->mb.cache.ref[0][s8];
  1945. +    int ref1 = h->mb.cache.ref[1][s8];
  1946.      const int mv0y_offset = h->mb.b_interlaced & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
  1947.      const int mv1y_offset = h->mb.b_interlaced & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
  1948.      int stride[2][9];
  1949. @@ -1058,13 +1061,13 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
  1950.  
  1951.      if( rd )
  1952.      {
  1953. -        x264_macroblock_cache_mv ( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 0, pack16to32_mask(bm0x, bm0y) );
  1954. +        x264_macroblock_cache_mv ( h, 2*x, 2*y, bw>>2, bh>>2, 0, pack16to32_mask(bm0x, bm0y) );
  1955.          amvd = pack8to16( X264_MIN(abs(bm0x - m0->mvp[0]),33), X264_MIN(abs(bm0y - m0->mvp[1]),33) );
  1956. -        x264_macroblock_cache_mvd( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 0, amvd );
  1957. +        x264_macroblock_cache_mvd( h, 2*x, 2*y, bw>>2, bh>>2, 0, amvd );
  1958.  
  1959. -        x264_macroblock_cache_mv ( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 1, pack16to32_mask(bm1x, bm1y) );
  1960. +        x264_macroblock_cache_mv ( h, 2*x, 2*y, bw>>2, bh>>2, 1, pack16to32_mask(bm1x, bm1y) );
  1961.          amvd = pack8to16( X264_MIN(abs(bm1x - m1->mvp[0]),33), X264_MIN(abs(bm1y - m1->mvp[1]),33) );
  1962. -        x264_macroblock_cache_mvd( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 1, amvd );
  1963. +        x264_macroblock_cache_mvd( h, 2*x, 2*y, bw>>2, bh>>2, 1, amvd );
  1964.      }
  1965.  
  1966.      m0->mv[0] = bm0x;
  1967. --
  1968. 1.7.0.4
  1969.  
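The rewrite in the patch above relies on the scan8 cache being laid out 8 entries per row, with each 4x4 block one entry to the right of its left neighbour and 8 entries below the block above it. An 8x8 block therefore spans 2 entries horizontally and 16 vertically, so for 8x8 block index i8 the table read x264_scan8[4*i8] collapses to X264_SCAN8_0 + 2*(i8&1) + 16*(i8>>1), which is exactly the s8 expression the patch substitutes. A minimal sketch of that identity, restating the 2010-era value of X264_SCAN8_0 purely for illustration:

/* Cache position of the top-left 4x4 block of 8x8 block i8 (0..3).
 * Equivalent to x264_scan8[4*i8] under the 8-wide scan8 layout. */
#define X264_SCAN8_0 (4+1*8)

static inline int scan8_from_i8( int i8 )
{
    int x = i8 & 1;   /* 8x8 block column */
    int y = i8 >> 1;  /* 8x8 block row */
    return X264_SCAN8_0 + 2*x + 16*y;
}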
  1970.  
  1971. From cb8b597efd407a6deecee00b81483d82c77abadc Mon Sep 17 00:00:00 2001
  1972. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  1973. Date: Sun, 30 May 2010 09:42:53 -0700
  1974. Subject: [PATCH 8/8] Fix ultrafast to actually turn off weightb
  1975.  
  1976. ---
  1977. common/common.c |    1 +
  1978.  1 files changed, 1 insertions(+), 0 deletions(-)
  1979.  
  1980. diff --git a/common/common.c b/common/common.c
  1981. index c092c01..48e1bbc 100644
  1982. --- a/common/common.c
  1983. +++ b/common/common.c
  1984. @@ -183,6 +183,7 @@ static int x264_param_apply_preset( x264_param_t *param, const char *preset )
  1985.          param->i_bframe_adaptive = X264_B_ADAPT_NONE;
  1986.          param->rc.b_mb_tree = 0;
  1987.          param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
  1988. +        param->analyse.b_weighted_bipred = 0;
  1989.      }
  1990.      else if( !strcasecmp( preset, "superfast" ) )
  1991.      {
  1992. --
  1993. 1.7.0.4
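
For completeness, the preset this patch fixes is normally applied through the public preset API rather than by setting fields by hand; a minimal sketch follows (the helper name is illustrative; the preset call itself returns a negative value on failure):

#include <x264.h>

/* After this patch, applying the ultrafast preset clears
 * param->analyse.b_weighted_bipred as well, so B-frames no longer use
 * weighted bi-prediction on the fastest setting. */
static int setup_ultrafast( x264_param_t *param )
{
    return x264_param_default_preset( param, "ultrafast", NULL );
}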