Untitled

From c40e95310f84738f7bdee83a23d66518d6dd6a64 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Sun, 14 Nov 2010 03:34:26 -0800
Subject: [PATCH 3/3] Chroma weighted prediction
 Like luma weighted prediction, dramatically improves compression in fades.
 Up to 4-8 dB chroma PSNR gain in extreme cases (short, perfect fade-outs).
 On actual videos, helps up to ~1% overall.
 One example video with a decent number of fades (ef OP): 0.8% bitrate reduction overall, 7% bitrate reduction counting chroma alone.
 Fixes a lot of artifacts in fades at lower bitrates.

Original patch by Dylan Yudaken <dyudaken@gmail.com>.
---
 common/common.h       |    2 +-
 encoder/encoder.c     |   94 ++++++++++--------
 encoder/me.c          |    8 ++
 encoder/ratecontrol.c |   47 +++++++--
 encoder/slicetype.c   |  262 +++++++++++++++++++++++++++++++++++++------------
 5 files changed, 299 insertions(+), 114 deletions(-)

diff --git a/common/common.h b/common/common.h
index 7d57119..1434e13 100644
--- a/common/common.h
+++ b/common/common.h
@@ -805,7 +805,7 @@ struct x264_t
         int     i_direct_score[2];
         int     i_direct_frames[2];
         /* num p-frames weighted */
-        int     i_wpred[3];
+        int     i_wpred[2];

     } stat;

diff --git a/encoder/encoder.c b/encoder/encoder.c
index ede1c28..58331b9 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -1468,49 +1468,67 @@ static void x264_weighted_pred_init( x264_t *h )

     int i_padv = PADV << h->param.b_interlaced;
     int denom = -1;
-    int weightluma = 0;
+    int weightplane[2] = { 0, 0 };
     int buffer_next = 0;
-    //FIXME: when chroma support is added, move this into loop
-    h->sh.weight[0][1].weightfn = h->sh.weight[0][2].weightfn = NULL;
-    h->sh.weight[0][1].i_denom = h->sh.weight[0][2].i_denom = 0;
-    for( int j = 0; j < h->i_ref0; j++ )
+    for( int i = 0; i < 3; i++ )
     {
-        if( h->fenc->weight[j][0].weightfn )
+        for( int j = 0; j < h->i_ref0; j++ )
         {
-            h->sh.weight[j][0] = h->fenc->weight[j][0];
-            // if weight is useless, don't write it to stream
-            if( h->sh.weight[j][0].i_scale == 1<<h->sh.weight[j][0].i_denom && h->sh.weight[j][0].i_offset == 0 )
-                h->sh.weight[j][0].weightfn = NULL;
-            else
+            if( h->fenc->weight[j][i].weightfn )
             {
-                if( !weightluma )
+                h->sh.weight[j][i] = h->fenc->weight[j][i];
+                // if weight is useless, don't write it to stream
+                if( h->sh.weight[j][i].i_scale == 1<<h->sh.weight[j][i].i_denom && h->sh.weight[j][i].i_offset == 0 )
+                    h->sh.weight[j][i].weightfn = NULL;
+                else
                 {
-                    weightluma = 1;
-                    h->sh.weight[0][0].i_denom = denom = h->sh.weight[j][0].i_denom;
-                    assert( x264_clip3( denom, 0, 7 ) == denom );
+                    if( !weightplane[!!i] )
+                    {
+                        weightplane[!!i] = 1;
+                        h->sh.weight[0][!!i].i_denom = denom = h->sh.weight[j][i].i_denom;
+                        assert( x264_clip3( denom, 0, 7 ) == denom );
+                    }
+
+                    assert( h->sh.weight[j][i].i_denom == denom );
+                    if( !i )
+                    {
+                        h->fenc->weighted[j] = h->mb.p_weight_buf[buffer_next++] + h->fenc->i_stride[0] * i_padv + PADH;
+                        //scale full resolution frame
+                        if( h->param.i_threads == 1 )
+                        {
+                            pixel *src = h->fref0[j]->filtered[0] - h->fref0[j]->i_stride[0]*i_padv - PADH;
+                            pixel *dst = h->fenc->weighted[j] - h->fenc->i_stride[0]*i_padv - PADH;
+                            int stride = h->fenc->i_stride[0];
+                            int width = h->fenc->i_width[0] + PADH*2;
+                            int height = h->fenc->i_lines[0] + i_padv*2;
+                            x264_weight_scale_plane( h, dst, stride, src, stride, width, height, &h->sh.weight[j][0] );
+                            h->fenc->i_lines_weighted = height;
+                        }
+                    }
                 }
-                assert( h->sh.weight[j][0].i_denom == denom );
-                assert( x264_clip3( h->sh.weight[j][0].i_scale, 0, 127 ) == h->sh.weight[j][0].i_scale );
-                assert( x264_clip3( h->sh.weight[j][0].i_offset, -128, 127 ) == h->sh.weight[j][0].i_offset );
-                h->fenc->weighted[j] = h->mb.p_weight_buf[buffer_next++] +
-                    h->fenc->i_stride[0] * i_padv + PADH;
             }
         }
+    }

-        //scale full resolution frame
-        if( h->sh.weight[j][0].weightfn && h->param.i_threads == 1 )
+    if( weightplane[1] )
+        for( int i = 0; i < h->i_ref0; i++ )
         {
-            pixel *src = h->fref0[j]->filtered[0] - h->fref0[j]->i_stride[0]*i_padv - PADH;
-            pixel *dst = h->fenc->weighted[j] - h->fenc->i_stride[0]*i_padv - PADH;
-            int stride = h->fenc->i_stride[0];
-            int width = h->fenc->i_width[0] + PADH*2;
-            int height = h->fenc->i_lines[0] + i_padv*2;
-            x264_weight_scale_plane( h, dst, stride, src, stride, width, height, &h->sh.weight[j][0] );
-            h->fenc->i_lines_weighted = height;
+            if( h->sh.weight[i][1].weightfn && !h->sh.weight[i][2].weightfn )
+            {
+                h->sh.weight[i][2].i_scale = 1 << h->sh.weight[0][1].i_denom;
+                h->sh.weight[i][2].i_offset = 0;
+            }
+            else if( h->sh.weight[i][2].weightfn && !h->sh.weight[i][1].weightfn )
+            {
+                h->sh.weight[i][1].i_scale = 1 << h->sh.weight[0][1].i_denom;
+                h->sh.weight[i][1].i_offset = 0;
+            }
         }
-    }
-    if( !weightluma )
+
+    if( !weightplane[0] )
         h->sh.weight[0][0].i_denom = 0;
+    if( !weightplane[1] )
+        h->sh.weight[0][1].i_denom = h->sh.weight[0][2].i_denom = 0;
 }

 static inline void x264_reference_build_list( x264_t *h, int i_poc )
@@ -2849,13 +2867,8 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
     {
         h->stat.i_consecutive_bframes[h->fdec->i_frame - h->fref0[0]->i_frame - 1]++;
         if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
-            for( int i = 0; i < 3; i++ )
-                for( int j = 0; j < h->i_ref0; j++ )
-                    if( h->sh.weight[0][i].i_denom != 0 )
-                    {
-                        h->stat.i_wpred[i]++;
-                        break;
-                    }
+            for( int i = 0; i < 2; i++ )
+                h->stat.i_wpred[i] += !!h->sh.weight[0][i].i_denom;
     }
     if( h->sh.i_type == SLICE_TYPE_B )
     {
@@ -3201,8 +3214,9 @@ void    x264_encoder_close  ( x264_t *h )
                       fixed_pred_modes[3][3] * 100.0 / sum_pred_modes[3] );

         if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART && h->stat.i_frame_count[SLICE_TYPE_P] > 0 )
-            x264_log( h, X264_LOG_INFO, "Weighted P-Frames: Y:%.1f%%\n",
-                      h->stat.i_wpred[0] * 100.0 / h->stat.i_frame_count[SLICE_TYPE_P] );
+            x264_log( h, X264_LOG_INFO, "Weighted P-Frames: Y:%.1f%% UV:%.1f%%\n",
+                      h->stat.i_wpred[0] * 100.0 / h->stat.i_frame_count[SLICE_TYPE_P],
+                      h->stat.i_wpred[1] * 100.0 / h->stat.i_frame_count[SLICE_TYPE_P] );

         for( int i_list = 0; i_list < 2; i_list++ )
             for( int i_slice = 0; i_slice < 2; i_slice++ )
diff --git a/encoder/me.c b/encoder/me.c
index 3f8d8e5..90f7dfd 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -1110,7 +1110,15 @@ void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_wei
         uint64_t cost; \
         M32( cache_mv ) = pack16to32_mask(mx,my); \
         if( m->i_pixel <= PIXEL_8x8 ) \
+        { \
             h->mc.mc_chroma( pixu, pixv, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \
+            if( m->weight[1].weightfn ) \
+                m->weight[1].weightfn[x264_pixel_size[i_pixel].w>>3]( pixu, FDEC_STRIDE, pixu, FDEC_STRIDE, \
+                                                                      &m->weight[1], x264_pixel_size[i_pixel].h>>1 ); \
+            if( m->weight[2].weightfn ) \
+                m->weight[2].weightfn[x264_pixel_size[i_pixel].w>>3]( pixv, FDEC_STRIDE, pixv, FDEC_STRIDE, \
+                                                                      &m->weight[2], x264_pixel_size[i_pixel].h>>1 ); \
+        } \
         cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \
         COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \
     } \
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index 34879b7..212b474 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -53,8 +53,8 @@ typedef struct
     int s_count;
     float blurred_complexity;
     char direct_mode;
-    int16_t weight[2];
-    int16_t i_weight_denom;
+    int16_t weight[3][2];
+    int16_t i_weight_denom[2];
     int refcount[16];
     int refs;
     int i_duration;
@@ -227,8 +227,8 @@ static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x2
     {
         ALIGNED_ARRAY_16( pixel, pix,[FENC_STRIDE*8] );
         h->mc.load_deinterleave_8x8x2_fenc( pix, frame->plane[1] + offset, stride );
-        return ac_energy_var( h->pixf.var[PIXEL_8x8]( pix, FENC_STRIDE ), 6, frame, i )
-             + ac_energy_var( h->pixf.var[PIXEL_8x8]( pix+FENC_STRIDE/2, FENC_STRIDE ), 6, frame, i );
+        return ac_energy_var( h->pixf.var[PIXEL_8x8]( pix, FENC_STRIDE ), 6, frame, 1 )
+             + ac_energy_var( h->pixf.var[PIXEL_8x8]( pix+FENC_STRIDE/2, FENC_STRIDE ), 6, frame, 2 );
     }
     else
         return ac_energy_var( h->pixf.var[PIXEL_16x16]( frame->plane[0] + offset, stride ), 8, frame, i );
@@ -854,11 +854,19 @@ int x264_ratecontrol_new( x264_t *h )
             rce->refs = ref;

             /* find weights */
-            rce->i_weight_denom = -1;
+            rce->i_weight_denom[0] = rce->i_weight_denom[1] = -1;
             char *w = strchr( p, 'w' );
             if( w )
-                if( sscanf( w, "w:%hd,%hd,%hd", &rce->i_weight_denom, &rce->weight[0], &rce->weight[1] ) != 3 )
-                    rce->i_weight_denom = -1;
+            {
+                int count = sscanf( w, "w:%hd,%hd,%hd,%hd,%hd,%hd,%hd,%hd",
+                                    &rce->i_weight_denom[0], &rce->weight[0][0], &rce->weight[0][1],
+                                    &rce->i_weight_denom[1], &rce->weight[1][0], &rce->weight[1][1],
+                                    &rce->weight[2][0], &rce->weight[2][1] );
+                if( count == 3 )
+                    rce->i_weight_denom[1] = -1;
+                else if ( count != 8 )
+                    rce->i_weight_denom[0] = rce->i_weight_denom[1] = -1;
+            }

             if( pict_type != 'b' )
                 rce->kept_as_ref = 1;
@@ -1485,8 +1493,15 @@ void x264_ratecontrol_set_weights( x264_t *h, x264_frame_t *frm )
     ratecontrol_entry_t *rce = &h->rc->entry[frm->i_frame];
     if( h->param.analyse.i_weighted_pred <= 0 )
         return;
-    if( rce->i_weight_denom >= 0 )
-        SET_WEIGHT( frm->weight[0][0], 1, rce->weight[0], rce->i_weight_denom, rce->weight[1] );
+
+    if( rce->i_weight_denom[0] >= 0 )
+        SET_WEIGHT( frm->weight[0][0], 1, rce->weight[0][0], rce->i_weight_denom[0], rce->weight[0][1] );
+
+    if( rce->i_weight_denom[1] >= 0 )
+    {
+        SET_WEIGHT( frm->weight[0][1], 1, rce->weight[1][0], rce->i_weight_denom[1], rce->weight[1][1] );
+        SET_WEIGHT( frm->weight[0][2], 1, rce->weight[2][0], rce->i_weight_denom[1], rce->weight[2][1] );
+    }
 }

 /* After encoding one frame, save stats and update ratecontrol state */
@@ -1543,9 +1558,19 @@ int x264_ratecontrol_end( x264_t *h, int bits, int *filler )
                 goto fail;
         }

-        if( h->sh.weight[0][0].weightfn )
+        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART && h->sh.weight[0][0].weightfn )
         {
-            if( fprintf( rc->p_stat_file_out, "w:%"PRId32",%"PRId32",%"PRId32, h->sh.weight[0][0].i_denom, h->sh.weight[0][0].i_scale, h->sh.weight[0][0].i_offset ) < 0 )
+            if( fprintf( rc->p_stat_file_out, "w:%"PRId32",%"PRId32",%"PRId32,
+                         h->sh.weight[0][0].i_denom, h->sh.weight[0][0].i_scale, h->sh.weight[0][0].i_offset ) < 0 )
+                goto fail;
+            if( h->sh.weight[0][1].weightfn || h->sh.weight[0][2].weightfn )
+            {
+                if( fprintf( rc->p_stat_file_out, ",%"PRId32",%"PRId32",%"PRId32",%"PRId32",%"PRId32"\n",
+                             h->sh.weight[0][1].i_denom, h->sh.weight[0][1].i_scale, h->sh.weight[0][1].i_offset,
+                             h->sh.weight[0][2].i_scale, h->sh.weight[0][2].i_offset ) < 0 )
+                    goto fail;
+            }
+            else if( fprintf( rc->p_stat_file_out, "\n" ) < 0 )
                 goto fail;
         }

diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index dc02fbd..5921541 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -98,7 +98,73 @@ static NOINLINE pixel *x264_weight_cost_init_luma( x264_t *h, x264_frame_t *fenc
     return ref->lowres[0];
 }

-static NOINLINE unsigned int x264_weight_cost( x264_t *h, x264_frame_t *fenc, pixel *src, x264_weight_t *w )
+/* How data is organized for chroma weightp:
+   [U: ref] [U: fenc]
+   [V: ref] [V: fenc]
+   fenc = ref + offset
+   v = u + stride * chroma height
+ * We'll need more room if we do 4:2:2 or 4:4:4. */
+
+static NOINLINE void x264_weight_cost_init_chroma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, pixel *dstu, pixel *dstv )
+{
+    int ref0_distance = fenc->i_frame - ref->i_frame - 1;
+    int i_stride = fenc->i_stride[1];
+    int i_offset = i_stride / 2;
+    int i_lines = fenc->i_lines[1];
+    int i_width = fenc->i_width[1];
+    int i_mb_xy = 0;
+
+    if( fenc->lowres_mvs[0][ref0_distance][0][0] != 0x7FFF )
+    {
+        for( int y = 0, pel_offset_y = 0; y < i_lines; y += 8, pel_offset_y = y*i_stride )
+            for( int x = 0, pel_offset_x = 0; x < i_width; x += 8, i_mb_xy++, pel_offset_x += 8 )
+            {
+                /* XXX: The stride for our dst is twice what it needs to be, but we have plenty of
+                 * memory (the same data is used for luma as well), so it's not a problem, at least
+                 * with 4:2:0. */
+                pixel *pixu = dstu + pel_offset_y + pel_offset_x;
+                pixel *pixv = dstv + pel_offset_y + pel_offset_x;
+                pixel *src1 =  ref->plane[1] + pel_offset_y + pel_offset_x*2; /* NV12 */
+                pixel *src2 = fenc->plane[1] + pel_offset_y + pel_offset_x*2;
+                int mvx = fenc->lowres_mvs[0][ref0_distance][i_mb_xy][0];
+                int mvy = fenc->lowres_mvs[0][ref0_distance][i_mb_xy][1];
+                h->mc.mc_chroma( pixu         , pixv         , i_stride, src1, i_stride, mvx, mvy, 8, 8 );
+                h->mc.mc_chroma( pixu+i_offset, pixv+i_offset, i_stride, src2, i_stride, 0, 0, 8, 8 );
+            }
+    }
+    else
+    {
+        for( int y = 0, pel_offset_y = 0; y < i_lines; y += 8, pel_offset_y = y*i_stride )
+            for( int x = 0, pel_offset_x = 0; x < i_width; x += 8, i_mb_xy++, pel_offset_x += 8 )
+            {
+                pixel *pixu = dstu + pel_offset_y + pel_offset_x;
+                pixel *pixv = dstv + pel_offset_y + pel_offset_x;
+                pixel *src1 =  ref->plane[1] + pel_offset_y + pel_offset_x*2;
+                pixel *src2 = fenc->plane[1] + pel_offset_y + pel_offset_x*2;
+                h->mc.mc_chroma( pixu         , pixv         , i_stride, src1, i_stride, 0, 0, 8, 8 );
+                h->mc.mc_chroma( pixu+i_offset, pixv+i_offset, i_stride, src2, i_stride, 0, 0, 8, 8 );
+            }
+    }
+    x264_emms();
+}
+
+static int x264_weight_slice_header_cost( x264_t *h, x264_weight_t *w )
+{
+    /* Add cost of weights in the slice header. */
+    int numslices;
+    if( h->param.i_slice_count )
+        numslices = h->param.i_slice_count;
+    else if( h->param.i_slice_max_mbs )
+        numslices = (h->mb.i_mb_width * h->mb.i_mb_height + h->param.i_slice_max_mbs-1) / h->param.i_slice_max_mbs;
+    else
+        numslices = 1;
+    /* FIXME: find a way to account for --slice-max-size?
+     * Multiply by 2 as there will be a duplicate. 10 bits added as if there is a weighted frame, then an additional duplicate is used.
+     * Since using lowres frames, assume lambda = 1. */
+    return numslices * ( 10 + 2 * ( bs_size_ue( w[0].i_denom ) + bs_size_se( w[0].i_scale ) + bs_size_se( w[0].i_offset ) ) );
+}
+
+static NOINLINE unsigned int x264_weight_cost_luma( x264_t *h, x264_frame_t *fenc, pixel *src, x264_weight_t *w )
 {
     unsigned int cost = 0;
     int i_stride = fenc->i_stride_lowres;
@@ -117,18 +183,7 @@ static NOINLINE unsigned int x264_weight_cost( x264_t *h, x264_frame_t *fenc, pi
                 w->weightfn[8>>2]( buf, 8, &src[pixoff], i_stride, w, 8 );
                 cost += X264_MIN( h->pixf.mbcmp[PIXEL_8x8]( buf, 8, &fenc_plane[pixoff], i_stride ), fenc->i_intra_cost[i_mb] );
             }
-        /* Add cost of weights in the slice header. */
-        int numslices;
-        if( h->param.i_slice_count )
-            numslices = h->param.i_slice_count;
-        else if( h->param.i_slice_max_mbs )
-            numslices = (h->mb.i_mb_width * h->mb.i_mb_height + h->param.i_slice_max_mbs-1) / h->param.i_slice_max_mbs;
-        else
-            numslices = 1;
-        /* FIXME: find a way to account for --slice-max-size?
-         * Multiply by 2 as there will be a duplicate. 10 bits added as if there is a weighted frame, then an additional duplicate is used.
-         * Since using lowres frames, assume lambda = 1. */
-        cost += numslices * ( 10 + 2 * ( bs_size_ue( w[0].i_denom ) + bs_size_se( w[0].i_scale ) + bs_size_se( w[0].i_offset ) ) );
+        cost += x264_weight_slice_header_cost( h, w );
     }
     else
         for( int y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
@@ -138,6 +193,46 @@ static NOINLINE unsigned int x264_weight_cost( x264_t *h, x264_frame_t *fenc, pi
     return cost;
 }

+static NOINLINE unsigned int x264_weight_cost_chroma( x264_t *h, x264_frame_t *fenc, pixel *ref, x264_weight_t *w )
+{
+    int x, y;
+    unsigned int cost = 0;
+    int i_stride = fenc->i_stride[1];
+    int i_offset = i_stride / 2;
+    int i_lines = fenc->i_lines[1];
+    int i_width = fenc->i_width[1];
+    pixel *src = ref + i_offset;
+    ALIGNED_ARRAY_8( pixel, buf, [8*8] );
+    int pixoff = 0;
+    int i_mb = 0;
+    ALIGNED_8( pixel flat[8] ) = {0};
+    if( w )
+    {
+        for( y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
+            for( x = 0; x < i_width; x += 8, i_mb++, pixoff += 8 )
+            {
+                w->weightfn[8>>2]( buf, 8, &ref[pixoff], i_stride, w, 8 );
+                /* The naive and seemingly sensible algorithm is to use mbcmp as in luma.
+                 * But testing shows that for chroma the DC coefficient is by far the most
+                 * important part of the coding cost.  Thus a more useful chroma weight is
+                 * obtained by comparing each block's DC coefficient instead of the actual
+                 * pixels.
+                 *
+                 * FIXME: add a (faster) asm sum function to replace sad. */
+                cost += abs( h->pixf.sad_aligned[PIXEL_8x8](          buf,        8, flat, 0 ) -
+                             h->pixf.sad_aligned[PIXEL_8x8]( &src[pixoff], i_stride, flat, 0 ) );
+            }
+        cost += x264_weight_slice_header_cost( h, w );
+    }
+    else
+        for( y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
+            for( x = 0; x < i_width; x += 8, i_mb++, pixoff += 8 )
+                cost += abs( h->pixf.sad_aligned[PIXEL_8x8]( &ref[pixoff], i_stride, flat, 0 ) -
+                             h->pixf.sad_aligned[PIXEL_8x8]( &src[pixoff], i_stride, flat, 0 ) );
+    x264_emms();
+    return cost;
+}
+
 void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead )
 {
     float fenc_mean, ref_mean, fenc_var, ref_var;
@@ -150,66 +245,109 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int
     float guess_scale;
     int found;
     x264_weight_t *weights = fenc->weight[0];
+    SET_WEIGHT(weights[1], 0, 1, 0, 0 );
+    SET_WEIGHT(weights[2], 0, 1, 0, 0 );
+    /* Don't check chroma in lookahead, or if there wasn't a luma weight. */
+    for( int plane = 0; plane <= 2  && !( plane && ( !weights[0].weightfn || b_lookahead ) ); plane++ )
+    {
+        fenc_var = round( sqrt( fenc->i_pixel_ssd[plane] ) );
+        ref_var  = round( sqrt(  ref->i_pixel_ssd[plane] ) );
+        fenc_mean = (float)fenc->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]);
+        ref_mean  = (float) ref->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]);
+
+        //early termination
+        if( fabs( ref_mean - fenc_mean ) < 0.5 && fabsf( 1 - (float)fenc_var / ref_var ) < epsilon )
+        {
+            SET_WEIGHT( weights[plane], 0, 1, 0, 0 );
+            continue;
+        }

-    fenc_var = round( sqrt( fenc->i_pixel_ssd[0] ) );
-    ref_var  = round( sqrt(  ref->i_pixel_ssd[0] ) );
-    fenc_mean = (float)fenc->i_pixel_sum[0] / (fenc->i_lines[0] * fenc->i_width[0]);
-    ref_mean  = (float) ref->i_pixel_sum[0] / (fenc->i_lines[0] * fenc->i_width[0]);
+        guess_scale = ref_var ? (float)fenc_var/ref_var : 0;

-    //early termination
-    if( fabs( ref_mean - fenc_mean ) < 0.5 && fabs( 1 - fenc_var / ref_var ) < epsilon )
-    {
-        SET_WEIGHT( weights[0], 0, 1, 0, 0 );
-        return;
-    }
+        if( plane )
+        {
+            weights[plane].i_denom = 6;
+            weights[plane].i_scale = x264_clip3( round(guess_scale * 64.0), 0, 255 );
+            if( weights[plane].i_scale > 127 )
+            {
+                weights[1].weightfn = weights[2].weightfn = 0;
+                break;
+            }
+        }
+        else
+            x264_weight_get_h264( round( guess_scale * 128 ), 0, &weights[plane] );

-    guess_scale = ref_var ? fenc_var/ref_var : 0;
-    x264_weight_get_h264( round( guess_scale * 128 ), 0, &weights[0] );
+        found = 0;
+        mindenom = weights[plane].i_denom;
+        minscale = weights[plane].i_scale;
+        minoff = 0;

-    found = 0;
-    mindenom = weights[0].i_denom;
-    minscale = weights[0].i_scale;
-    minoff = 0;
-    offset_search = x264_clip3( floor( fenc_mean - ref_mean * minscale / (1 << mindenom) + 0.5f*b_lookahead ), -128, 126 );
+        if( !plane && !fenc->b_intra_calculated )
+        {
+            x264_mb_analysis_t a;
+            x264_lowres_context_init( h, &a );
+            x264_slicetype_frame_cost( h, &a, &fenc, 0, 0, 0, 0 );
+        }

-    if( !fenc->b_intra_calculated )
-    {
-        x264_mb_analysis_t a;
-        x264_lowres_context_init( h, &a );
-        x264_slicetype_frame_cost( h, &a, &fenc, 0, 0, 0, 0 );
-    }
-    pixel *mcbuf = x264_weight_cost_init_luma( h, fenc, ref, h->mb.p_weight_buf[0] );
-    origscore = minscore = x264_weight_cost( h, fenc, mcbuf, 0 );
+        pixel *mcbuf;
+        if( !plane )
+        {
+            mcbuf = x264_weight_cost_init_luma( h, fenc, ref, h->mb.p_weight_buf[0] );
+            origscore = minscore = x264_weight_cost_luma( h, fenc, mcbuf, 0 );
+        }
+        else if( plane )
+        {
+            pixel *dstu = h->mb.p_weight_buf[0];
+            pixel *dstv = h->mb.p_weight_buf[0]+fenc->i_stride[1]*fenc->i_lines[1];
+            x264_weight_cost_init_chroma( h, fenc, ref, dstu, dstv );
+            mcbuf = plane == 1 ? dstu : dstv;
+            origscore = minscore = x264_weight_cost_chroma( h, fenc, mcbuf, 0 );
+        }

-    if( !minscore )
-    {
-        SET_WEIGHT( weights[0], 0, 1, 0, 0 );
-        return;
-    }
+        if( !minscore )
+            continue;

-    // This gives a slight improvement due to rounding errors but only tests
-    // one offset on lookahead.
-    // TODO: currently searches only offset +1. try other offsets/multipliers/combinations thereof?
-    for( int i_off = offset_search; i_off <= offset_search+!b_lookahead; i_off++ )
-    {
-        SET_WEIGHT( weights[0], 1, minscale, mindenom, i_off );
-        unsigned int s = x264_weight_cost( h, fenc, mcbuf, &weights[0] );
-        COPY3_IF_LT( minscore, s, minoff, i_off, found, 1 );
+        // This gives a slight improvement due to rounding errors but only tests
+        // one offset on lookahead.
+        // TODO: currently searches only offset +1. try other offsets/multipliers/combinations thereof?
+        offset_search = x264_clip3( floor( fenc_mean - ref_mean * minscale / (1 << mindenom) + 0.5f*b_lookahead ), -128, 126 );
+        for( int i_off = offset_search; i_off <= offset_search+!b_lookahead; i_off++ )
+        {
+            SET_WEIGHT( weights[plane], 1, minscale, mindenom, i_off );
+            unsigned int s;
+            if( plane )
+                s = x264_weight_cost_chroma( h, fenc, mcbuf, &weights[plane] );
+            else
+                s = x264_weight_cost_luma( h, fenc, mcbuf, &weights[plane] );
+            COPY3_IF_LT( minscore, s, minoff, i_off, found, 1 );
+        }
+        x264_emms();
+
+        /* FIXME: More analysis can be done here on SAD vs. SATD termination. */
+        /* 0.2% termination derived experimentally to avoid weird weights in frames that are mostly intra. */
+        if( !found || (minscale == 1<<mindenom && minoff == 0) || (float)minscore / origscore > 0.998 )
+        {
+            SET_WEIGHT( weights[plane], 0, 1, 0, 0 );
+            continue;
+        }
+        else
+            SET_WEIGHT( weights[plane], 1, minscale, mindenom, minoff );
+
+        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE && weights[0].weightfn && !plane )
+            fenc->f_weighted_cost_delta[i_delta_index] = (float)minscore / origscore;
     }
-    x264_emms();

-    /* FIXME: More analysis can be done here on SAD vs. SATD termination. */
-    /* 0.2% termination derived experimentally to avoid weird weights in frames that are mostly intra. */
-    if( !found || (minscale == 1<<mindenom && minoff == 0) || (float)minscore / origscore > 0.998 )
+    //FIXME, what is the correct way to deal with this?
+    if( weights[1].weightfn && weights[2].weightfn && weights[1].i_denom != weights[2].i_denom )
     {
-        SET_WEIGHT( weights[0], 0, 1, 0, 0 );
-        return;
+        int denom = X264_MIN( weights[1].i_denom, weights[2].i_denom );
+        int i;
+        for( i = 1; i <= 2; i++ )
+        {
+            weights[i].i_scale = x264_clip3( weights[i].i_scale >> ( weights[i].i_denom - denom ), 0, 255 );
+            weights[i].i_denom = denom;
+        }
     }
-    else
-        SET_WEIGHT( weights[0], 1, minscale, mindenom, minoff );
-
-    if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE && weights[0].weightfn )
-        fenc->f_weighted_cost_delta[i_delta_index] = (float)minscore / origscore;

     if( weights[0].weightfn && b_lookahead )
     {
--
1.7.3.2.146.gca209