Advertisement
Guest User

Untitled

a guest
Jun 27th, 2017
500
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Diff 27.55 KB | None | 0 0
  1. From c40e95310f84738f7bdee83a23d66518d6dd6a64 Mon Sep 17 00:00:00 2001
  2. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  3. Date: Sun, 14 Nov 2010 03:34:26 -0800
  4. Subject: [PATCH 3/3] Chroma weighted prediction
  5.  Like luma weighted prediction, dramatically improves compression in fades.
  6.  Up to 4-8 dB chroma PSNR gain in extreme cases (short, perfect fade-outs).
  7.  On actual videos, helps up to ~1% overall.
  8.  One example video with a decent number of fades (ef OP): 0.8% bitrate reduction overall, 7% bitrate reduction just counting chroma.
  9.  Fixes a lot of artifacts in fades at lower bitrates.
  10.  
  11. Original patch by Dylan Yudaken <dyudaken@gmail.com>.
  12. ---
  13. common/common.h       |    2 +-
  14.  encoder/encoder.c     |   94 ++++++++++--------
  15.  encoder/me.c          |    8 ++
  16.  encoder/ratecontrol.c |   47 +++++++--
  17.  encoder/slicetype.c   |  262 +++++++++++++++++++++++++++++++++++++------------
  18.  5 files changed, 299 insertions(+), 114 deletions(-)
  19.  
  20. diff --git a/common/common.h b/common/common.h
  21. index 7d57119..1434e13 100644
  22. --- a/common/common.h
  23. +++ b/common/common.h
  24. @@ -805,7 +805,7 @@ struct x264_t
  25.          int     i_direct_score[2];
  26.          int     i_direct_frames[2];
  27.          /* num p-frames weighted */
  28. -        int     i_wpred[3];
  29. +        int     i_wpred[2];
  30.  
  31.      } stat;
  32.  
  33. diff --git a/encoder/encoder.c b/encoder/encoder.c
  34. index ede1c28..58331b9 100644
  35. --- a/encoder/encoder.c
  36. +++ b/encoder/encoder.c
  37. @@ -1468,49 +1468,67 @@ static void x264_weighted_pred_init( x264_t *h )
  38.  
  39.      int i_padv = PADV << h->param.b_interlaced;
  40.      int denom = -1;
  41. -    int weightluma = 0;
  42. +    int weightplane[2] = { 0, 0 };
  43.      int buffer_next = 0;
  44. -    //FIXME: when chroma support is added, move this into loop
  45. -    h->sh.weight[0][1].weightfn = h->sh.weight[0][2].weightfn = NULL;
  46. -    h->sh.weight[0][1].i_denom = h->sh.weight[0][2].i_denom = 0;
  47. -    for( int j = 0; j < h->i_ref0; j++ )
  48. +    for( int i = 0; i < 3; i++ )
  49.      {
  50. -        if( h->fenc->weight[j][0].weightfn )
  51. +        for( int j = 0; j < h->i_ref0; j++ )
  52.          {
  53. -            h->sh.weight[j][0] = h->fenc->weight[j][0];
  54. -            // if weight is useless, don't write it to stream
  55. -            if( h->sh.weight[j][0].i_scale == 1<<h->sh.weight[j][0].i_denom && h->sh.weight[j][0].i_offset == 0 )
  56. -                h->sh.weight[j][0].weightfn = NULL;
  57. -            else
  58. +            if( h->fenc->weight[j][i].weightfn )
  59.              {
  60. -                if( !weightluma )
  61. +                h->sh.weight[j][i] = h->fenc->weight[j][i];
  62. +                // if weight is useless, don't write it to stream
  63. +                if( h->sh.weight[j][i].i_scale == 1<<h->sh.weight[j][i].i_denom && h->sh.weight[j][i].i_offset == 0 )
  64. +                    h->sh.weight[j][i].weightfn = NULL;
  65. +                else
  66.                  {
  67. -                    weightluma = 1;
  68. -                    h->sh.weight[0][0].i_denom = denom = h->sh.weight[j][0].i_denom;
  69. -                    assert( x264_clip3( denom, 0, 7 ) == denom );
  70. +                    if( !weightplane[!!i] )
  71. +                    {
  72. +                        weightplane[!!i] = 1;
  73. +                        h->sh.weight[0][!!i].i_denom = denom = h->sh.weight[j][i].i_denom;
  74. +                        assert( x264_clip3( denom, 0, 7 ) == denom );
  75. +                    }
  76. +
  77. +                    assert( h->sh.weight[j][i].i_denom == denom );
  78. +                    if( !i )
  79. +                    {
  80. +                        h->fenc->weighted[j] = h->mb.p_weight_buf[buffer_next++] + h->fenc->i_stride[0] * i_padv + PADH;
  81. +                        //scale full resolution frame
  82. +                        if( h->param.i_threads == 1 )
  83. +                        {
  84. +                            pixel *src = h->fref0[j]->filtered[0] - h->fref0[j]->i_stride[0]*i_padv - PADH;
  85. +                            pixel *dst = h->fenc->weighted[j] - h->fenc->i_stride[0]*i_padv - PADH;
  86. +                            int stride = h->fenc->i_stride[0];
  87. +                            int width = h->fenc->i_width[0] + PADH*2;
  88. +                            int height = h->fenc->i_lines[0] + i_padv*2;
  89. +                            x264_weight_scale_plane( h, dst, stride, src, stride, width, height, &h->sh.weight[j][0] );
  90. +                            h->fenc->i_lines_weighted = height;
  91. +                        }
  92. +                    }
  93.                  }
  94. -                assert( h->sh.weight[j][0].i_denom == denom );
  95. -                assert( x264_clip3( h->sh.weight[j][0].i_scale, 0, 127 ) == h->sh.weight[j][0].i_scale );
  96. -                assert( x264_clip3( h->sh.weight[j][0].i_offset, -128, 127 ) == h->sh.weight[j][0].i_offset );
  97. -                h->fenc->weighted[j] = h->mb.p_weight_buf[buffer_next++] +
  98. -                    h->fenc->i_stride[0] * i_padv + PADH;
  99.              }
  100.          }
  101. +    }
  102.  
  103. -        //scale full resolution frame
  104. -        if( h->sh.weight[j][0].weightfn && h->param.i_threads == 1 )
  105. +    if( weightplane[1] )
  106. +        for( int i = 0; i < h->i_ref0; i++ )
  107.          {
  108. -            pixel *src = h->fref0[j]->filtered[0] - h->fref0[j]->i_stride[0]*i_padv - PADH;
  109. -            pixel *dst = h->fenc->weighted[j] - h->fenc->i_stride[0]*i_padv - PADH;
  110. -            int stride = h->fenc->i_stride[0];
  111. -            int width = h->fenc->i_width[0] + PADH*2;
  112. -            int height = h->fenc->i_lines[0] + i_padv*2;
  113. -            x264_weight_scale_plane( h, dst, stride, src, stride, width, height, &h->sh.weight[j][0] );
  114. -            h->fenc->i_lines_weighted = height;
  115. +            if( h->sh.weight[i][1].weightfn && !h->sh.weight[i][2].weightfn )
  116. +            {
  117. +                h->sh.weight[i][2].i_scale = 1 << h->sh.weight[0][1].i_denom;
  118. +                h->sh.weight[i][2].i_offset = 0;
  119. +            }
  120. +            else if( h->sh.weight[i][2].weightfn && !h->sh.weight[i][1].weightfn )
  121. +            {
  122. +                h->sh.weight[i][1].i_scale = 1 << h->sh.weight[0][1].i_denom;
  123. +                h->sh.weight[i][1].i_offset = 0;
  124. +            }
  125.          }
  126. -    }
  127. -    if( !weightluma )
  128. +
  129. +    if( !weightplane[0] )
  130.          h->sh.weight[0][0].i_denom = 0;
  131. +    if( !weightplane[1] )
  132. +        h->sh.weight[0][1].i_denom = h->sh.weight[0][2].i_denom = 0;
  133.  }
  134.  
  135.  static inline void x264_reference_build_list( x264_t *h, int i_poc )
  136. @@ -2849,13 +2867,8 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
  137.      {
  138.          h->stat.i_consecutive_bframes[h->fdec->i_frame - h->fref0[0]->i_frame - 1]++;
  139.          if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
  140. -            for( int i = 0; i < 3; i++ )
  141. -                for( int j = 0; j < h->i_ref0; j++ )
  142. -                    if( h->sh.weight[0][i].i_denom != 0 )
  143. -                    {
  144. -                        h->stat.i_wpred[i]++;
  145. -                        break;
  146. -                    }
  147. +            for( int i = 0; i < 2; i++ )
  148. +                h->stat.i_wpred[i] += !!h->sh.weight[0][i].i_denom;
  149.      }
  150.      if( h->sh.i_type == SLICE_TYPE_B )
  151.      {
  152. @@ -3201,8 +3214,9 @@ void    x264_encoder_close  ( x264_t *h )
  153.                        fixed_pred_modes[3][3] * 100.0 / sum_pred_modes[3] );
  154.  
  155.          if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART && h->stat.i_frame_count[SLICE_TYPE_P] > 0 )
  156. -            x264_log( h, X264_LOG_INFO, "Weighted P-Frames: Y:%.1f%%\n",
  157. -                      h->stat.i_wpred[0] * 100.0 / h->stat.i_frame_count[SLICE_TYPE_P] );
  158. +            x264_log( h, X264_LOG_INFO, "Weighted P-Frames: Y:%.1f%% UV:%.1f%%\n",
  159. +                      h->stat.i_wpred[0] * 100.0 / h->stat.i_frame_count[SLICE_TYPE_P],
  160. +                      h->stat.i_wpred[1] * 100.0 / h->stat.i_frame_count[SLICE_TYPE_P] );
  161.  
  162.          for( int i_list = 0; i_list < 2; i_list++ )
  163.              for( int i_slice = 0; i_slice < 2; i_slice++ )
  164. diff --git a/encoder/me.c b/encoder/me.c
  165. index 3f8d8e5..90f7dfd 100644
  166. --- a/encoder/me.c
  167. +++ b/encoder/me.c
  168. @@ -1110,7 +1110,15 @@ void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_wei
  169.          uint64_t cost; \
  170.          M32( cache_mv ) = pack16to32_mask(mx,my); \
  171.          if( m->i_pixel <= PIXEL_8x8 ) \
  172. +        { \
  173.              h->mc.mc_chroma( pixu, pixv, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \
  174. +            if( m->weight[1].weightfn ) \
  175. +                m->weight[1].weightfn[x264_pixel_size[i_pixel].w>>3]( pixu, FDEC_STRIDE, pixu, FDEC_STRIDE, \
  176. +                                                                      &m->weight[1], x264_pixel_size[i_pixel].h>>1 ); \
  177. +            if( m->weight[2].weightfn ) \
  178. +                m->weight[2].weightfn[x264_pixel_size[i_pixel].w>>3]( pixv, FDEC_STRIDE, pixv, FDEC_STRIDE, \
  179. +                                                                      &m->weight[2], x264_pixel_size[i_pixel].h>>1 ); \
  180. +        } \
  181.          cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \
  182.          COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \
  183.      } \
  184. diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
  185. index 34879b7..212b474 100644
  186. --- a/encoder/ratecontrol.c
  187. +++ b/encoder/ratecontrol.c
  188. @@ -53,8 +53,8 @@ typedef struct
  189.      int s_count;
  190.      float blurred_complexity;
  191.      char direct_mode;
  192. -    int16_t weight[2];
  193. -    int16_t i_weight_denom;
  194. +    int16_t weight[3][2];
  195. +    int16_t i_weight_denom[2];
  196.      int refcount[16];
  197.      int refs;
  198.      int i_duration;
  199. @@ -227,8 +227,8 @@ static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x2
  200.      {
  201.          ALIGNED_ARRAY_16( pixel, pix,[FENC_STRIDE*8] );
  202.          h->mc.load_deinterleave_8x8x2_fenc( pix, frame->plane[1] + offset, stride );
  203. -        return ac_energy_var( h->pixf.var[PIXEL_8x8]( pix, FENC_STRIDE ), 6, frame, i )
  204. -             + ac_energy_var( h->pixf.var[PIXEL_8x8]( pix+FENC_STRIDE/2, FENC_STRIDE ), 6, frame, i );
  205. +        return ac_energy_var( h->pixf.var[PIXEL_8x8]( pix, FENC_STRIDE ), 6, frame, 1 )
  206. +             + ac_energy_var( h->pixf.var[PIXEL_8x8]( pix+FENC_STRIDE/2, FENC_STRIDE ), 6, frame, 2 );
  207.      }
  208.      else
  209.          return ac_energy_var( h->pixf.var[PIXEL_16x16]( frame->plane[0] + offset, stride ), 8, frame, i );
  210. @@ -854,11 +854,19 @@ int x264_ratecontrol_new( x264_t *h )
  211.              rce->refs = ref;
  212.  
  213.              /* find weights */
  214. -            rce->i_weight_denom = -1;
  215. +            rce->i_weight_denom[0] = rce->i_weight_denom[1] = -1;
  216.              char *w = strchr( p, 'w' );
  217.              if( w )
  218. -                if( sscanf( w, "w:%hd,%hd,%hd", &rce->i_weight_denom, &rce->weight[0], &rce->weight[1] ) != 3 )
  219. -                    rce->i_weight_denom = -1;
  220. +            {
  221. +                int count = sscanf( w, "w:%hd,%hd,%hd,%hd,%hd,%hd,%hd,%hd",
  222. +                                    &rce->i_weight_denom[0], &rce->weight[0][0], &rce->weight[0][1],
  223. +                                    &rce->i_weight_denom[1], &rce->weight[1][0], &rce->weight[1][1],
  224. +                                    &rce->weight[2][0], &rce->weight[2][1] );
  225. +                if( count == 3 )
  226. +                    rce->i_weight_denom[1] = -1;
  227. +                else if ( count != 8 )
  228. +                    rce->i_weight_denom[0] = rce->i_weight_denom[1] = -1;
  229. +            }
  230.  
  231.              if( pict_type != 'b' )
  232.                  rce->kept_as_ref = 1;
  233. @@ -1485,8 +1493,15 @@ void x264_ratecontrol_set_weights( x264_t *h, x264_frame_t *frm )
  234.      ratecontrol_entry_t *rce = &h->rc->entry[frm->i_frame];
  235.      if( h->param.analyse.i_weighted_pred <= 0 )
  236.          return;
  237. -    if( rce->i_weight_denom >= 0 )
  238. -        SET_WEIGHT( frm->weight[0][0], 1, rce->weight[0], rce->i_weight_denom, rce->weight[1] );
  239. +
  240. +    if( rce->i_weight_denom[0] >= 0 )
  241. +        SET_WEIGHT( frm->weight[0][0], 1, rce->weight[0][0], rce->i_weight_denom[0], rce->weight[0][1] );
  242. +
  243. +    if( rce->i_weight_denom[1] >= 0 )
  244. +    {
  245. +        SET_WEIGHT( frm->weight[0][1], 1, rce->weight[1][0], rce->i_weight_denom[1], rce->weight[1][1] );
  246. +        SET_WEIGHT( frm->weight[0][2], 1, rce->weight[2][0], rce->i_weight_denom[1], rce->weight[2][1] );
  247. +    }
  248.  }
  249.  
  250.  /* After encoding one frame, save stats and update ratecontrol state */
  251. @@ -1543,9 +1558,19 @@ int x264_ratecontrol_end( x264_t *h, int bits, int *filler )
  252.                  goto fail;
  253.          }
  254.  
  255. -        if( h->sh.weight[0][0].weightfn )
  256. +        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART && h->sh.weight[0][0].weightfn )
  257.          {
  258. -            if( fprintf( rc->p_stat_file_out, "w:%"PRId32",%"PRId32",%"PRId32, h->sh.weight[0][0].i_denom, h->sh.weight[0][0].i_scale, h->sh.weight[0][0].i_offset ) < 0 )
  259. +            if( fprintf( rc->p_stat_file_out, "w:%"PRId32",%"PRId32",%"PRId32,
  260. +                         h->sh.weight[0][0].i_denom, h->sh.weight[0][0].i_scale, h->sh.weight[0][0].i_offset ) < 0 )
  261. +                goto fail;
  262. +            if( h->sh.weight[0][1].weightfn || h->sh.weight[0][2].weightfn )
  263. +            {
  264. +                if( fprintf( rc->p_stat_file_out, ",%"PRId32",%"PRId32",%"PRId32",%"PRId32",%"PRId32"\n",
  265. +                             h->sh.weight[0][1].i_denom, h->sh.weight[0][1].i_scale, h->sh.weight[0][1].i_offset,
  266. +                             h->sh.weight[0][2].i_scale, h->sh.weight[0][2].i_offset ) < 0 )
  267. +                    goto fail;
  268. +            }
  269. +            else if( fprintf( rc->p_stat_file_out, "\n" ) < 0 )
  270.                  goto fail;
  271.          }
  272.  
  273. diff --git a/encoder/slicetype.c b/encoder/slicetype.c
  274. index dc02fbd..5921541 100644
  275. --- a/encoder/slicetype.c
  276. +++ b/encoder/slicetype.c
  277. @@ -98,7 +98,73 @@ static NOINLINE pixel *x264_weight_cost_init_luma( x264_t *h, x264_frame_t *fenc
  278.      return ref->lowres[0];
  279.  }
  280.  
  281. -static NOINLINE unsigned int x264_weight_cost( x264_t *h, x264_frame_t *fenc, pixel *src, x264_weight_t *w )
  282. +/* How data is organized for chroma weightp:
  283. +   [U: ref] [U: fenc]
  284. +   [V: ref] [V: fenc]
  285. +   fenc = ref + offset
  286. +   v = u + stride * chroma height
  287. + * We'll need more room if we do 4:2:2 or 4:4:4. */
  288. +
  289. +static NOINLINE void x264_weight_cost_init_chroma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, pixel *dstu, pixel *dstv )
  290. +{
  291. +    int ref0_distance = fenc->i_frame - ref->i_frame - 1;
  292. +    int i_stride = fenc->i_stride[1];
  293. +    int i_offset = i_stride / 2;
  294. +    int i_lines = fenc->i_lines[1];
  295. +    int i_width = fenc->i_width[1];
  296. +    int i_mb_xy = 0;
  297. +
  298. +    if( fenc->lowres_mvs[0][ref0_distance][0][0] != 0x7FFF )
  299. +    {
  300. +        for( int y = 0, pel_offset_y = 0; y < i_lines; y += 8, pel_offset_y = y*i_stride )
  301. +            for( int x = 0, pel_offset_x = 0; x < i_width; x += 8, i_mb_xy++, pel_offset_x += 8 )
  302. +            {
  303. +                /* XXX: The stride for our dst is twice what it needs to be, but we have plenty of
  304. +                 * memory (the same data is used for luma as well), so it's not a problem, at least
  305. +                 * with 4:2:0. */
  306. +                pixel *pixu = dstu + pel_offset_y + pel_offset_x;
  307. +                pixel *pixv = dstv + pel_offset_y + pel_offset_x;
  308. +                pixel *src1 =  ref->plane[1] + pel_offset_y + pel_offset_x*2; /* NV12 */
  309. +                pixel *src2 = fenc->plane[1] + pel_offset_y + pel_offset_x*2;
  310. +                int mvx = fenc->lowres_mvs[0][ref0_distance][i_mb_xy][0];
  311. +                int mvy = fenc->lowres_mvs[0][ref0_distance][i_mb_xy][1];
  312. +                h->mc.mc_chroma( pixu         , pixv         , i_stride, src1, i_stride, mvx, mvy, 8, 8 );
  313. +                h->mc.mc_chroma( pixu+i_offset, pixv+i_offset, i_stride, src2, i_stride, 0, 0, 8, 8 );
  314. +            }
  315. +    }
  316. +    else
  317. +    {
  318. +        for( int y = 0, pel_offset_y = 0; y < i_lines; y += 8, pel_offset_y = y*i_stride )
  319. +            for( int x = 0, pel_offset_x = 0; x < i_width; x += 8, i_mb_xy++, pel_offset_x += 8 )
  320. +            {
  321. +                pixel *pixu = dstu + pel_offset_y + pel_offset_x;
  322. +                pixel *pixv = dstv + pel_offset_y + pel_offset_x;
  323. +                pixel *src1 =  ref->plane[1] + pel_offset_y + pel_offset_x*2;
  324. +                pixel *src2 = fenc->plane[1] + pel_offset_y + pel_offset_x*2;
  325. +                h->mc.mc_chroma( pixu         , pixv         , i_stride, src1, i_stride, 0, 0, 8, 8 );
  326. +                h->mc.mc_chroma( pixu+i_offset, pixv+i_offset, i_stride, src2, i_stride, 0, 0, 8, 8 );
  327. +            }
  328. +    }
  329. +    x264_emms();
  330. +}
  331. +
  332. +static int x264_weight_slice_header_cost( x264_t *h, x264_weight_t *w )
  333. +{
  334. +    /* Add cost of weights in the slice header. */
  335. +    int numslices;
  336. +    if( h->param.i_slice_count )
  337. +        numslices = h->param.i_slice_count;
  338. +    else if( h->param.i_slice_max_mbs )
  339. +        numslices = (h->mb.i_mb_width * h->mb.i_mb_height + h->param.i_slice_max_mbs-1) / h->param.i_slice_max_mbs;
  340. +    else
  341. +        numslices = 1;
  342. +    /* FIXME: find a way to account for --slice-max-size?
  343. +     * Multiply by 2 as there will be a duplicate. 10 bits added as if there is a weighted frame, then an additional duplicate is used.
  344. +     * Since using lowres frames, assume lambda = 1. */
  345. +    return numslices * ( 10 + 2 * ( bs_size_ue( w[0].i_denom ) + bs_size_se( w[0].i_scale ) + bs_size_se( w[0].i_offset ) ) );
  346. +}
  347. +
  348. +static NOINLINE unsigned int x264_weight_cost_luma( x264_t *h, x264_frame_t *fenc, pixel *src, x264_weight_t *w )
  349.  {
  350.      unsigned int cost = 0;
  351.      int i_stride = fenc->i_stride_lowres;
  352. @@ -117,18 +183,7 @@ static NOINLINE unsigned int x264_weight_cost( x264_t *h, x264_frame_t *fenc, pi
  353.                  w->weightfn[8>>2]( buf, 8, &src[pixoff], i_stride, w, 8 );
  354.                  cost += X264_MIN( h->pixf.mbcmp[PIXEL_8x8]( buf, 8, &fenc_plane[pixoff], i_stride ), fenc->i_intra_cost[i_mb] );
  355.              }
  356. -        /* Add cost of weights in the slice header. */
  357. -        int numslices;
  358. -        if( h->param.i_slice_count )
  359. -            numslices = h->param.i_slice_count;
  360. -        else if( h->param.i_slice_max_mbs )
  361. -            numslices = (h->mb.i_mb_width * h->mb.i_mb_height + h->param.i_slice_max_mbs-1) / h->param.i_slice_max_mbs;
  362. -        else
  363. -            numslices = 1;
  364. -        /* FIXME: find a way to account for --slice-max-size?
  365. -         * Multiply by 2 as there will be a duplicate. 10 bits added as if there is a weighted frame, then an additional duplicate is used.
  366. -         * Since using lowres frames, assume lambda = 1. */
  367. -        cost += numslices * ( 10 + 2 * ( bs_size_ue( w[0].i_denom ) + bs_size_se( w[0].i_scale ) + bs_size_se( w[0].i_offset ) ) );
  368. +        cost += x264_weight_slice_header_cost( h, w );
  369.      }
  370.      else
  371.          for( int y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
  372. @@ -138,6 +193,46 @@ static NOINLINE unsigned int x264_weight_cost( x264_t *h, x264_frame_t *fenc, pi
  373.      return cost;
  374.  }
  375.  
  376. +static NOINLINE unsigned int x264_weight_cost_chroma( x264_t *h, x264_frame_t *fenc, pixel *ref, x264_weight_t *w )
  377. +{
  378. +    int x, y;
  379. +    unsigned int cost = 0;
  380. +    int i_stride = fenc->i_stride[1];
  381. +    int i_offset = i_stride / 2;
  382. +    int i_lines = fenc->i_lines[1];
  383. +    int i_width = fenc->i_width[1];
  384. +    pixel *src = ref + i_offset;
  385. +    ALIGNED_ARRAY_8( pixel, buf, [8*8] );
  386. +    int pixoff = 0;
  387. +    int i_mb = 0;
  388. +    ALIGNED_8( pixel flat[8] ) = {0};
  389. +    if( w )
  390. +    {
  391. +        for( y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
  392. +            for( x = 0; x < i_width; x += 8, i_mb++, pixoff += 8 )
  393. +            {
  394. +                w->weightfn[8>>2]( buf, 8, &ref[pixoff], i_stride, w, 8 );
  395. +                /* The naive and seemingly sensible algorithm is to use mbcmp as in luma.
  396. +                 * But testing shows that for chroma the DC coefficient is by far the most
  397. +                 * important part of the coding cost.  Thus a more useful chroma weight is
  398. +                 * obtained by comparing each block's DC coefficient instead of the actual
  399. +                 * pixels.
  400. +                 *
  401. +                 * FIXME: add a (faster) asm sum function to replace sad. */
  402. +                cost += abs( h->pixf.sad_aligned[PIXEL_8x8](          buf,        8, flat, 0 ) -
  403. +                             h->pixf.sad_aligned[PIXEL_8x8]( &src[pixoff], i_stride, flat, 0 ) );
  404. +            }
  405. +        cost += x264_weight_slice_header_cost( h, w );
  406. +    }
  407. +    else
  408. +        for( y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
  409. +            for( x = 0; x < i_width; x += 8, i_mb++, pixoff += 8 )
  410. +                cost += abs( h->pixf.sad_aligned[PIXEL_8x8]( &ref[pixoff], i_stride, flat, 0 ) -
  411. +                             h->pixf.sad_aligned[PIXEL_8x8]( &src[pixoff], i_stride, flat, 0 ) );
  412. +    x264_emms();
  413. +    return cost;
  414. +}
  415. +
  416.  void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead )
  417.  {
  418.      float fenc_mean, ref_mean, fenc_var, ref_var;
  419. @@ -150,66 +245,109 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int
  420.      float guess_scale;
  421.      int found;
  422.      x264_weight_t *weights = fenc->weight[0];
  423. +    SET_WEIGHT(weights[1], 0, 1, 0, 0 );
  424. +    SET_WEIGHT(weights[2], 0, 1, 0, 0 );
  425. +    /* Don't check chroma in lookahead, or if there wasn't a luma weight. */
  426. +    for( int plane = 0; plane <= 2  && !( plane && ( !weights[0].weightfn || b_lookahead ) ); plane++ )
  427. +    {
  428. +        fenc_var = round( sqrt( fenc->i_pixel_ssd[plane] ) );
  429. +        ref_var  = round( sqrt(  ref->i_pixel_ssd[plane] ) );
  430. +        fenc_mean = (float)fenc->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]);
  431. +        ref_mean  = (float) ref->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]);
  432. +
  433. +        //early termination
  434. +        if( fabs( ref_mean - fenc_mean ) < 0.5 && fabsf( 1 - (float)fenc_var / ref_var ) < epsilon )
  435. +        {
  436. +            SET_WEIGHT( weights[plane], 0, 1, 0, 0 );
  437. +            continue;
  438. +        }
  439.  
  440. -    fenc_var = round( sqrt( fenc->i_pixel_ssd[0] ) );
  441. -    ref_var  = round( sqrt(  ref->i_pixel_ssd[0] ) );
  442. -    fenc_mean = (float)fenc->i_pixel_sum[0] / (fenc->i_lines[0] * fenc->i_width[0]);
  443. -    ref_mean  = (float) ref->i_pixel_sum[0] / (fenc->i_lines[0] * fenc->i_width[0]);
  444. +        guess_scale = ref_var ? (float)fenc_var/ref_var : 0;
  445.  
  446. -    //early termination
  447. -    if( fabs( ref_mean - fenc_mean ) < 0.5 && fabs( 1 - fenc_var / ref_var ) < epsilon )
  448. -    {
  449. -        SET_WEIGHT( weights[0], 0, 1, 0, 0 );
  450. -        return;
  451. -    }
  452. +        if( plane )
  453. +        {
  454. +            weights[plane].i_denom = 6;
  455. +            weights[plane].i_scale = x264_clip3( round(guess_scale * 64.0), 0, 255 );
  456. +            if( weights[plane].i_scale > 127 )
  457. +            {
  458. +                weights[1].weightfn = weights[2].weightfn = 0;
  459. +                break;
  460. +            }
  461. +        }
  462. +        else
  463. +            x264_weight_get_h264( round( guess_scale * 128 ), 0, &weights[plane] );
  464.  
  465. -    guess_scale = ref_var ? fenc_var/ref_var : 0;
  466. -    x264_weight_get_h264( round( guess_scale * 128 ), 0, &weights[0] );
  467. +        found = 0;
  468. +        mindenom = weights[plane].i_denom;
  469. +        minscale = weights[plane].i_scale;
  470. +        minoff = 0;
  471.  
  472. -    found = 0;
  473. -    mindenom = weights[0].i_denom;
  474. -    minscale = weights[0].i_scale;
  475. -    minoff = 0;
  476. -    offset_search = x264_clip3( floor( fenc_mean - ref_mean * minscale / (1 << mindenom) + 0.5f*b_lookahead ), -128, 126 );
  477. +        if( !plane && !fenc->b_intra_calculated )
  478. +        {
  479. +            x264_mb_analysis_t a;
  480. +            x264_lowres_context_init( h, &a );
  481. +            x264_slicetype_frame_cost( h, &a, &fenc, 0, 0, 0, 0 );
  482. +        }
  483.  
  484. -    if( !fenc->b_intra_calculated )
  485. -    {
  486. -        x264_mb_analysis_t a;
  487. -        x264_lowres_context_init( h, &a );
  488. -        x264_slicetype_frame_cost( h, &a, &fenc, 0, 0, 0, 0 );
  489. -    }
  490. -    pixel *mcbuf = x264_weight_cost_init_luma( h, fenc, ref, h->mb.p_weight_buf[0] );
  491. -    origscore = minscore = x264_weight_cost( h, fenc, mcbuf, 0 );
  492. +        pixel *mcbuf;
  493. +        if( !plane )
  494. +        {
  495. +            mcbuf = x264_weight_cost_init_luma( h, fenc, ref, h->mb.p_weight_buf[0] );
  496. +            origscore = minscore = x264_weight_cost_luma( h, fenc, mcbuf, 0 );
  497. +        }
  498. +        else if( plane )
  499. +        {
  500. +            pixel *dstu = h->mb.p_weight_buf[0];
  501. +            pixel *dstv = h->mb.p_weight_buf[0]+fenc->i_stride[1]*fenc->i_lines[1];
  502. +            x264_weight_cost_init_chroma( h, fenc, ref, dstu, dstv );
  503. +            mcbuf = plane == 1 ? dstu : dstv;
  504. +            origscore = minscore = x264_weight_cost_chroma( h, fenc, mcbuf, 0 );
  505. +        }
  506.  
  507. -    if( !minscore )
  508. -    {
  509. -        SET_WEIGHT( weights[0], 0, 1, 0, 0 );
  510. -        return;
  511. -    }
  512. +        if( !minscore )
  513. +            continue;
  514.  
  515. -    // This gives a slight improvement due to rounding errors but only tests
  516. -    // one offset on lookahead.
  517. -    // TODO: currently searches only offset +1. try other offsets/multipliers/combinations thereof?
  518. -    for( int i_off = offset_search; i_off <= offset_search+!b_lookahead; i_off++ )
  519. -    {
  520. -        SET_WEIGHT( weights[0], 1, minscale, mindenom, i_off );
  521. -        unsigned int s = x264_weight_cost( h, fenc, mcbuf, &weights[0] );
  522. -        COPY3_IF_LT( minscore, s, minoff, i_off, found, 1 );
  523. +        // This gives a slight improvement due to rounding errors but only tests
  524. +        // one offset on lookahead.
  525. +        // TODO: currently searches only offset +1. try other offsets/multipliers/combinations thereof?
  526. +        offset_search = x264_clip3( floor( fenc_mean - ref_mean * minscale / (1 << mindenom) + 0.5f*b_lookahead ), -128, 126 );
  527. +        for( int i_off = offset_search; i_off <= offset_search+!b_lookahead; i_off++ )
  528. +        {
  529. +            SET_WEIGHT( weights[plane], 1, minscale, mindenom, i_off );
  530. +            unsigned int s;
  531. +            if( plane )
  532. +                s = x264_weight_cost_chroma( h, fenc, mcbuf, &weights[plane] );
  533. +            else
  534. +                s = x264_weight_cost_luma( h, fenc, mcbuf, &weights[plane] );
  535. +            COPY3_IF_LT( minscore, s, minoff, i_off, found, 1 );
  536. +        }
  537. +        x264_emms();
  538. +
  539. +        /* FIXME: More analysis can be done here on SAD vs. SATD termination. */
  540. +        /* 0.2% termination derived experimentally to avoid weird weights in frames that are mostly intra. */
  541. +        if( !found || (minscale == 1<<mindenom && minoff == 0) || (float)minscore / origscore > 0.998 )
  542. +        {
  543. +            SET_WEIGHT( weights[plane], 0, 1, 0, 0 );
  544. +            continue;
  545. +        }
  546. +        else
  547. +            SET_WEIGHT( weights[plane], 1, minscale, mindenom, minoff );
  548. +
  549. +        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE && weights[0].weightfn && !plane )
  550. +            fenc->f_weighted_cost_delta[i_delta_index] = (float)minscore / origscore;
  551.      }
  552. -    x264_emms();
  553.  
  554. -    /* FIXME: More analysis can be done here on SAD vs. SATD termination. */
  555. -    /* 0.2% termination derived experimentally to avoid weird weights in frames that are mostly intra. */
  556. -    if( !found || (minscale == 1<<mindenom && minoff == 0) || (float)minscore / origscore > 0.998 )
  557. +    //FIXME, what is the correct way to deal with this?
  558. +    if( weights[1].weightfn && weights[2].weightfn && weights[1].i_denom != weights[2].i_denom )
  559.      {
  560. -        SET_WEIGHT( weights[0], 0, 1, 0, 0 );
  561. -        return;
  562. +        int denom = X264_MIN( weights[1].i_denom, weights[2].i_denom );
  563. +        int i;
  564. +        for( i = 1; i <= 2; i++ )
  565. +        {
  566. +            weights[i].i_scale = x264_clip3( weights[i].i_scale >> ( weights[i].i_denom - denom ), 0, 255 );
  567. +            weights[i].i_denom = denom;
  568. +        }
  569.      }
  570. -    else
  571. -        SET_WEIGHT( weights[0], 1, minscale, mindenom, minoff );
  572. -
  573. -    if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE && weights[0].weightfn )
  574. -        fenc->f_weighted_cost_delta[i_delta_index] = (float)minscore / origscore;
  575.  
  576.      if( weights[0].weightfn && b_lookahead )
  577.      {
  578. --
  579. 1.7.3.2.146.gca209
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement