techouse

diff --git a/common/common.c b/common/common.c
index ff8ce77..8a3a593 100644
--- a/common/common.c
+++ b/common/common.c
@@ -95,6 +95,7 @@ void    x264_param_default( x264_param_t *param )
     param->rc.f_pb_factor = 1.3;
     param->rc.i_aq_mode = X264_AQ_GLOBAL;
     param->rc.f_aq_strength = 1.0;
+    param->analyse.i_fgo = 0;

     param->rc.b_stat_write = 0;
     param->rc.psz_stat_out = "x264_2pass.log";
@@ -519,6 +520,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
         p->rc.i_aq_mode = atoi(value);
     OPT("aq-strength")
         p->rc.f_aq_strength = atof(value);
+    OPT("fgo")
+        p->analyse.i_fgo = atoi(value);
     OPT("pass")
     {
         int i = x264_clip3( atoi(value), 0, 3 );
@@ -867,6 +870,7 @@ char *x264_param2string( x264_param_t *p, int b_res )
     s += sprintf( s, " nr=%d", p->analyse.i_noise_reduction );
     s += sprintf( s, " decimate=%d", p->analyse.b_dct_decimate );
     s += sprintf( s, " mbaff=%d", p->b_interlaced );
+    s += sprintf( s, " fgo=%d", p->analyse.i_fgo );

     s += sprintf( s, " bframes=%d", p->i_bframe );
     if( p->i_bframe )
diff --git a/common/pixel.c b/common/pixel.c
index 1d5567b..71fc811 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -70,7 +70,7 @@ PIXEL_SAD_C( x264_pixel_sad_4x4,    4,  4 )
  ****************************************************************************/
 #define PIXEL_SSD_C( name, lx, ly ) \
 static int name( uint8_t *pix1, int i_stride_pix1,  \
-                 uint8_t *pix2, int i_stride_pix2 ) \
+                 uint8_t *pix2, int i_stride_pix2, int weight ) \
 {                                                   \
     int i_sum = 0;                                  \
     int x, y;                                       \
@@ -95,6 +95,61 @@ PIXEL_SSD_C( x264_pixel_ssd_8x4,    8,  4 )
 PIXEL_SSD_C( x264_pixel_ssd_4x8,    4,  8 )
 PIXEL_SSD_C( x264_pixel_ssd_4x4,    4,  4 )

+#define PIXEL_NOISE_C( lx, ly) /* emits x264_pixel_noise_<lx>x<ly>(): C "noise" score of one lx-by-ly block */ \
+static int x264_pixel_noise_##lx##x##ly( uint8_t *pix, int i_stride ) \
+{\
+    int score = 0; /* running sum of absolute 2x2 cross differences */ \
+    int x,y;\
+    for(y=0; y<ly; y++){\
+        if(y+1<ly){ /* the last row has no row below it, so it adds nothing */ \
+            for(x=0; x<lx-1; x++) /* likewise the last column has no right-hand neighbour */ \
+                score += abs(  pix[x]   - pix[x+i_stride]\
+                             - pix[x+1] + pix[x+1+i_stride]); /* |(a-c)-(b-d)| for the 2x2 patch a b / c d */ \
+        }\
+        pix += i_stride; /* step to the next row */ \
+    }\
+    return score;\
+}
+
+PIXEL_NOISE_C( 16, 16 )
+PIXEL_NOISE_C( 16,  8 )
+PIXEL_NOISE_C(  8, 16 )
+PIXEL_NOISE_C(  8,  8 )
+PIXEL_NOISE_C(  8,  4 )
+PIXEL_NOISE_C(  4,  8 )
+PIXEL_NOISE_C(  4,  4 )
+
+#define PIXEL_NSSD( nssdname, lx, ly, ssdname, noisename) /* emits x264_pixel_nssd_<lx>x<ly><nssdname>() built from the ssd/noise helpers carrying the given suffixes */ \
+static int x264_pixel_nssd_##lx##x##ly##nssdname( uint8_t *pix1,\
+int i_stride_pix1, uint8_t *pix2, int i_stride_pix2, int weight ) \
+{\
+    int ssd = x264_pixel_ssd_##lx##x##ly##ssdname( pix1, i_stride_pix1, \
+                 pix2, i_stride_pix2, weight );\
+    int noise1 = x264_pixel_noise_##lx##x##ly##noisename( pix1, i_stride_pix1 ); /* noise score of each block on its own */ \
+    int noise2 = x264_pixel_noise_##lx##x##ly##noisename( pix2, i_stride_pix2 );\
+    return ssd + abs(noise1 - noise2) * weight; /* SSD plus the weighted difference of the two noise scores */ \
+}
+
+PIXEL_NSSD(     , 16, 16,      , ) /* unsuffixed variants: C ssd + C noise */
+PIXEL_NSSD(     , 16,  8,      , )
+PIXEL_NSSD(     ,  8, 16,      , )
+PIXEL_NSSD(     ,  8,  8,      , )
+PIXEL_NSSD(     ,  8,  4,      , )
+PIXEL_NSSD(     ,  4,  8,      , )
+PIXEL_NSSD(     ,  4,  4,      , )
+PIXEL_NSSD( _mmxext, 16, 16,  _mmx, _mmxext) /* NOTE(review): these reference x264_pixel_ssd_*_mmx / x264_pixel_noise_*_mmxext with no preprocessor guard visible here - confirm they are only compiled when the x86 asm is built */
+PIXEL_NSSD( _mmxext, 16,  8,  _mmx, _mmxext)
+PIXEL_NSSD( _mmxext,  8, 16,  _mmx, _mmxext)
+PIXEL_NSSD( _mmxext,  8,  8,  _mmx, _mmxext)
+PIXEL_NSSD( _mmxext,  8,  4,  _mmx, _mmxext)
+PIXEL_NSSD( _mmxext,  4,  8,  _mmx, ) /* empty noise suffix: the 4-wide variants use the C noise function */
+PIXEL_NSSD( _mmxext,  4,  4,  _mmx, )
+PIXEL_NSSD(_sse2, 16, 16, _sse2, _mmxext) /* sse2 ssd paired with the mmxext noise routines */
+PIXEL_NSSD(_sse2, 16,  8, _sse2, _mmxext)
+PIXEL_NSSD(_sse2, 8, 16, _sse2, _mmxext)
+PIXEL_NSSD(_sse2, 8,  8, _sse2, _mmxext)
+PIXEL_NSSD(_sse2, 8,  4, _sse2, _mmxext)
+
 int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height )
 {
     int64_t i_ssd = 0;
@@ -102,7 +157,7 @@ int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1
     int align = !(((long)pix1 | (long)pix2 | i_pix1 | i_pix2) & 15);

 #define SSD(size) i_ssd += pf->ssd[size]( pix1 + y*i_pix1 + x, i_pix1, \
-                                          pix2 + y*i_pix2 + x, i_pix2 );
+                                          pix2 + y*i_pix2 + x, i_pix2, 0 );
     for( y = 0; y < i_height-15; y += 16 )
     {
         x = 0;
@@ -527,6 +582,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     INIT7( sad_x3, );
     INIT7( sad_x4, );
     INIT7( ssd, );
+    INIT7( nssd, );
     INIT7( satd, );
     INIT7( satd_x3, );
     INIT7( satd_x4, );
@@ -550,6 +606,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT7( satd, _mmxext );
         INIT7( satd_x3, _mmxext );
         INIT7( satd_x4, _mmxext );
+        INIT7( nssd, _mmxext );
         INIT_ADS( _mmxext );

 #ifdef ARCH_X86
@@ -605,6 +662,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     if( cpu&X264_CPU_SSE2 )
     {
         INIT5( ssd, _sse2 );
+        INIT5( nssd, _sse2);
         INIT5( satd, _sse2 );
         INIT5( satd_x3, _sse2 );
         INIT5( satd_x4, _sse2 );
diff --git a/common/pixel.h b/common/pixel.h
index c95a304..c22de07 100644
--- a/common/pixel.h
+++ b/common/pixel.h
@@ -27,6 +27,7 @@
 // SSD assumes all args aligned
 // other cmp functions assume first arg aligned
 typedef int  (*x264_pixel_cmp_t) ( uint8_t *, int, uint8_t *, int );
+typedef int  (*x264_pixel_cmp_weight_t) ( uint8_t *, int, uint8_t *, int, int );
 typedef void (*x264_pixel_cmp_x3_t) ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int[3] );
 typedef void (*x264_pixel_cmp_x4_t) ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int[4] );

@@ -66,10 +67,12 @@ static const uint8_t x264_size2pixel[5][5] = {
 typedef struct
 {
     x264_pixel_cmp_t  sad[7];
-    x264_pixel_cmp_t  ssd[7];
+    x264_pixel_cmp_weight_t ssd[7];
+    x264_pixel_cmp_weight_t nssd[7];
     x264_pixel_cmp_t satd[7];
     x264_pixel_cmp_t ssim[7];
     x264_pixel_cmp_t sa8d[4];
+    x264_pixel_cmp_weight_t rdcmp[7]; /* either ssd or nssd for mode decision */
     x264_pixel_cmp_t mbcmp[7]; /* either satd or sad for subpel refine and mode decision */
     x264_pixel_cmp_t fpelcmp[7]; /* either satd or sad for fullpel motion search */
     x264_pixel_cmp_x3_t fpelcmp_x3[7];
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index b4d0656..4efcd04 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -1285,6 +1285,92 @@ SATD_W4 ssse3 ; mmx, but uses pabsw from ssse3.


 ;=============================================================================
+; NSSD
+;=============================================================================
+
+%macro NOISE_CORE_LOAD_FIRST 3
+    mova      %1, [r0+%3]      ; %1 = row at r0+%3
+    movu      %2, [r0+%3+1]    ; %2 = same row read one byte further on (unaligned load)
+%endmacro
+
+%macro NOISE_CORE_LOAD_LAST 3
+    mova      %1, [r0+%3]      ; %1 = row at r0+%3
+    mova      %2, %1
+    psllq     %1, 8            ; push the top byte out of %1 ...
+    psrlq     %2, 8            ; %2 = row shifted right by one byte, so its top byte becomes zero
+    psrlq     %1, 8            ; ... and shift back: %1 keeps its other bytes but has a zero top byte
+%endmacro
+
+%macro NOISE_CORE_START 5
+    NOISE_CORE_LOAD %1, %2, %5 ; FIRST or LAST variant, selected by %define before each NOISE instantiation
+    mova      %3, %1
+    mova      %4, %2
+    punpcklbw %1, m7           ; widen bytes to words (NOISE zeroes m7 before using this macro)
+    punpcklbw %2, m7
+    punpckhbw %3, m7
+    punpckhbw %4, m7
+    psubw     %1, %2           ; %1 = low half of (row - shifted row)
+    psubw     %3, %4           ; %3 = high half of (row - shifted row)
+%endmacro
+
+%macro NOISE_CORE 7
+    NOISE_CORE_START %1, %2, %3, %4, %7 ; horizontal differences of the newly loaded row -> %1/%3
+    psubw     %5, %1           ; %5/%6 arrive holding the differences of the previously loaded row
+    psubw     %6, %3
+    ABS2      %5, %6, %4, %2   ; presumably |%5|, |%6| with %4/%2 as scratch - confirm against ABS2_MMX
+    paddw     %6, %5
+    paddw     m6, %6           ; accumulate into the running word sums in m6
+%endmacro
+
+;arguments: src (r0), stride (r1); the score is returned in eax
+;macro arguments: width, height, name (suffix of the emitted x264_pixel_noise_<w>x<h>_<name> symbol)
+%macro NOISE 3
+%if %1 == 16
+cglobal x264_pixel_noise_%1x%2_%3, 2,3
+    mov       r2, r0           ; remember the block start: r0 is advanced below, r2 locates the right half later
+%else
+cglobal x264_pixel_noise_%1x%2_%3, 2,2
+x264_pixel_noise_%1x%2_%3 %+ .skip_prologue
+%endif
+    pxor      m7, m7           ; m7 = 0, used when unpacking bytes to words
+    pxor      m6, m6           ; m6 = accumulator of absolute differences
+    NOISE_CORE_START m0, m1, m2, m3, 0
+    NOISE_CORE m4, m1, m5, m3, m0, m2, r1
+    lea r0, [r0+r1*2]          ; advance two rows
+%rep (%2 - 2) / 2
+    NOISE_CORE m0, m1, m2, m3, m4, m5, 0
+    NOISE_CORE m4, m1, m5, m3, m0, m2, r1
+    lea r0, [r0+r1*2]
+%endrep
+    mova      m0, m6           ; horizontal reduction of the word sums held in m6
+    punpcklwd m0, m7
+    punpckhwd m6, m7
+    paddd     m6, m0
+    mova      m0, m6
+    psrlq     m6, 32
+    paddd     m0, m6
+%if %1 == 16
+    lea       r0, [r2+8]       ; point at column 8 of the original block
+    movd      r2d, m0          ; stash the sum computed so far
+    call      x264_pixel_noise_8x%2_%3 %+ .skip_prologue
+    add       eax, r2d         ; total = value returned by the 8-wide routine + stashed sum
+%else
+    movd      eax, m0
+%endif
+    RET
+%endmacro
+
+INIT_MMX
+%define ABS2 ABS2_MMX
+%define NOISE_CORE_LOAD NOISE_CORE_LOAD_LAST
+NOISE  8, 16, mmxext           ; 8-wide versions are emitted first: the 16-wide ones call into them
+NOISE  8,  8, mmxext
+NOISE  8,  4, mmxext
+%define NOISE_CORE_LOAD NOISE_CORE_LOAD_FIRST
+NOISE  16, 16, mmxext
+NOISE  16, 8, mmxext
+
+;=============================================================================
 ; SSIM
 ;=============================================================================

diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index fcacaf2..2f80e82 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -46,8 +46,9 @@ DECL_X1( sad, sse3 )
 DECL_X4( sad, mmxext )
 DECL_X4( sad, sse2 )
 DECL_X4( sad, sse3 )
-DECL_X1( ssd, mmx )
-DECL_X1( ssd, sse2 )
+DECL_PIXELS( int, ssd, mmx, ( uint8_t *, int, uint8_t *, int, int ) )
+DECL_PIXELS( int, ssd, sse2, ( uint8_t *, int, uint8_t *, int, int ) )
+DECL_PIXELS( int, noise, mmxext, ( uint8_t *, int ) )
 DECL_X1( satd, mmxext )
 DECL_X1( satd, sse2 )
 DECL_X1( satd, ssse3 )
diff --git a/encoder/analyse.c b/encoder/analyse.c
index de3cf57..fd7e478 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -765,7 +765,8 @@ static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
     else
         a->i_satd_i4x4 = COST_MAX;

-    if( a->i_satd_i8x8 <= i_satd_thresh && a->i_satd_i8x8 < COST_MAX )
+    if( (a->i_satd_i8x8 <= i_satd_thresh || h->param.analyse.i_fgo)
+        && a->i_satd_i8x8 < COST_MAX )
     {
         h->mb.i_type = I_8x8;
         x264_analyse_update_cache( h, a );
@@ -1928,7 +1929,7 @@ static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_i
     //FIXME not all the update_cache calls are needed
     h->mb.i_partition = D_16x16;
     /* L0 */
-    if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX )
+    if( (a->l0.me16x16.cost <= thresh || h->param.analyse.i_fgo) && a->l0.i_rd16x16 == COST_MAX )
     {
         h->mb.i_type = B_L0_L0;
         x264_analyse_update_cache( h, a );
@@ -1936,7 +1937,7 @@ static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_i
     }

     /* L1 */
-    if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
+    if( (a->l1.me16x16.cost <= thresh || h->param.analyse.i_fgo) && a->l1.i_rd16x16 == COST_MAX )
     {
         h->mb.i_type = B_L1_L1;
         x264_analyse_update_cache( h, a );
@@ -2277,7 +2278,8 @@ void x264_macroblock_analyse( x264_t *h )

             if( analysis.b_mbrd )
             {
-                x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
+                x264_mb_analyse_p_rd( h, &analysis, h->param.analyse.i_fgo ?
+                i_satd_inter : X264_MIN(i_satd_inter, i_satd_intra) );
                 i_type = P_L0;
                 i_partition = D_16x16;
                 i_cost = analysis.l0.me16x16.cost;
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 2b81e64..83f0141 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -402,6 +402,7 @@ static int x264_validate_parameters( x264_t *h )
         h->param.analyse.i_noise_reduction = 0;
         h->param.analyse.i_subpel_refine = x264_clip3( h->param.analyse.i_subpel_refine, 1, 6 );
         h->param.rc.i_aq_mode = 0;
+        h->param.analyse.i_fgo = 0;
     }
     if( h->param.rc.i_rc_method == X264_RC_CQP )
     {
@@ -500,6 +501,29 @@ static int x264_validate_parameters( x264_t *h )
         if( h->param.analyse.i_direct_8x8_inference < 0 )
             h->param.analyse.i_direct_8x8_inference = l->direct8x8;
     }
+
+    /* Clamp first (upper bound is arbitrary): a negative fgo is non-zero but must disable the feature, not reach pow() as 0. */
+    h->param.analyse.i_fgo = x264_clip3( h->param.analyse.i_fgo, 0, 50 );
+    if( h->param.analyse.i_fgo )
+    {
+        if(h->param.analyse.i_subpel_refine < 6 ||
+        (!h->param.analyse.b_bframe_rdo && h->param.i_bframe) )
+        {
+            if(h->param.i_bframe)
+                x264_log( h, X264_LOG_WARNING, "fgo requires b-rdo and subme >= 6\n" );
+            else
+                x264_log( h, X264_LOG_WARNING, "fgo requires subme >= 6\n" );
+            h->param.analyse.i_fgo = 0;
+        }
+        else
+        {
+            /* P-skip's threshold isn't necessarily accurate when using NSSD/FGO */
+            h->param.analyse.b_fast_pskip = 0;
+            /* B-frame QPs need to be lower to retain grain */
+            /* Arbitrary formula to scale pbratio based on fgo strength; i_fgo is in [1,50] here, so the divisor is >= 1. */
+            h->param.rc.f_pb_factor = 1 + (h->param.rc.f_pb_factor - 1) / pow(h->param.analyse.i_fgo,0.3);
+        }
+    }

     if( h->param.i_threads > 1 )
     {
@@ -562,6 +586,7 @@ static void mbcmp_init( x264_t *h )
     memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) );
     memcpy( h->pixf.fpelcmp_x3, satd ? h->pixf.satd_x3 : h->pixf.sad_x3, sizeof(h->pixf.fpelcmp_x3) );
     memcpy( h->pixf.fpelcmp_x4, satd ? h->pixf.satd_x4 : h->pixf.sad_x4, sizeof(h->pixf.fpelcmp_x4) );
+    memcpy( h->pixf.rdcmp, h->param.analyse.i_fgo ? h->pixf.nssd : h->pixf.ssd, sizeof(h->pixf.rdcmp) );
 }

 /****************************************************************************
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index 11790ea..52b436f 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -194,7 +194,7 @@ static NOINLINE int ac_energy_mb( x264_t *h, int mb_x, int mb_y, int *satd )
             int pix = i ? PIXEL_8x8 : PIXEL_16x16;
             stride <<= h->mb.b_interlaced;
             sad = h->pixf.sad[pix]( flat, 0, h->fenc->plane[i]+offset, stride );
-            ssd = h->pixf.ssd[pix]( flat, 0, h->fenc->plane[i]+offset, stride );
+            ssd = h->pixf.ssd[pix]( flat, 0, h->fenc->plane[i]+offset, stride, 0 );
             var += ssd - (sad * sad >> (i?6:8));
             // SATD to represent the block's overall complexity (bit cost) for intra encoding.
             // exclude the DC coef, because nothing short of an actual intra prediction will estimate DC cost.
@@ -269,7 +269,7 @@ int x264_ratecontrol_new( x264_t *h )

     rc->b_abr = h->param.rc.i_rc_method != X264_RC_CQP && !h->param.rc.b_stat_read;
     rc->b_2pass = h->param.rc.i_rc_method == X264_RC_ABR && h->param.rc.b_stat_read;
-
+
     /* FIXME: use integers */
     if(h->param.i_fps_num > 0 && h->param.i_fps_den > 0)
         rc->fps = (float) h->param.i_fps_num / h->param.i_fps_den;
@@ -679,7 +679,7 @@ void x264_ratecontrol_summary( x264_t *h )
     if( rc->b_abr && h->param.rc.i_rc_method == X264_RC_ABR && rc->cbr_decay > .9999 )
     {
         double base_cplx = h->mb.i_mb_count * (h->param.i_bframe ? 120 : 80);
-        x264_log( h, X264_LOG_INFO, "final ratefactor: %.2f\n",
+        x264_log( h, X264_LOG_INFO, "final ratefactor: %.2f\n",
                   qscale2qp( pow( base_cplx, 1 - h->param.rc.f_qcompress )
                              * rc->cplxr_sum / rc->wanted_bits_window ) );
     }
@@ -838,7 +838,7 @@ double predict_row_size( x264_t *h, int y, int qp )
     x264_ratecontrol_t *rc = h->rc;
     double pred_s = predict_size( rc->row_pred, qp2qscale(qp), h->fdec->i_row_satd[y] );
     double pred_t = 0;
-    if( h->sh.i_type != SLICE_TYPE_I
+    if( h->sh.i_type != SLICE_TYPE_I
         && h->fref0[0]->i_type == h->fdec->i_type
         && h->fref0[0]->i_row_satd[y] > 0 )
     {
@@ -1007,7 +1007,7 @@ void x264_ratecontrol_end( x264_t *h, int bits )
         int dir_frame = h->stat.frame.i_direct_score[1] - h->stat.frame.i_direct_score[0];
         int dir_avg = h->stat.i_direct_score[1] - h->stat.i_direct_score[0];
         char c_direct = h->mb.b_direct_auto_write ?
-                        ( dir_frame>0 ? 's' : dir_frame<0 ? 't' :
+                        ( dir_frame>0 ? 's' : dir_frame<0 ? 't' :
                           dir_avg>0 ? 's' : dir_avg<0 ? 't' : '-' )
                         : '-';
         fprintf( rc->p_stat_file_out,
diff --git a/encoder/rdo.c b/encoder/rdo.c
index 8607e07..ec956fe 100644
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -55,18 +55,22 @@ static uint16_t cabac_prefix_size[15][128];

 static int ssd_mb( x264_t *h )
 {
-    return h->pixf.ssd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
-                                     h->mb.pic.p_fdec[0], FDEC_STRIDE )
-         + h->pixf.ssd[PIXEL_8x8](   h->mb.pic.p_fenc[1], FENC_STRIDE,
-                                     h->mb.pic.p_fdec[1], FDEC_STRIDE )
-         + h->pixf.ssd[PIXEL_8x8](   h->mb.pic.p_fenc[2], FENC_STRIDE,
-                                     h->mb.pic.p_fdec[2], FDEC_STRIDE );
+    return h->pixf.rdcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, /* rdcmp[] is a copy of nssd[] when fgo is non-zero, of ssd[] otherwise (see mbcmp_init) */
+                                       h->mb.pic.p_fdec[0], FDEC_STRIDE,
+                                       h->param.analyse.i_fgo ) /* fgo strength is passed as the trailing int (weight) argument */
+         + h->pixf.rdcmp[PIXEL_8x8]  ( h->mb.pic.p_fenc[1], FENC_STRIDE, /* planes 1 and 2 are compared as 8x8 blocks */
+                                       h->mb.pic.p_fdec[1], FDEC_STRIDE,
+                                       h->param.analyse.i_fgo )
+         + h->pixf.rdcmp[PIXEL_8x8]  ( h->mb.pic.p_fenc[2], FENC_STRIDE,
+                                       h->mb.pic.p_fdec[2], FDEC_STRIDE,
+                                       h->param.analyse.i_fgo );
 }

 static int ssd_plane( x264_t *h, int size, int p, int x, int y )
 {
-    return h->pixf.ssd[size]( h->mb.pic.p_fenc[p] + x+y*FENC_STRIDE, FENC_STRIDE,
-                              h->mb.pic.p_fdec[p] + x+y*FDEC_STRIDE, FDEC_STRIDE );
+    return h->pixf.rdcmp[size]( h->mb.pic.p_fenc[p] + x+y*FENC_STRIDE, FENC_STRIDE, /* block of the given size at (x,y) inside plane p */
+                               h->mb.pic.p_fdec[p] + x+y*FDEC_STRIDE, FDEC_STRIDE,
+                               h->param.analyse.i_fgo ); /* fgo strength is passed as the trailing int (weight) argument */
 }

 static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 73faf12..ecfb540 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -69,9 +69,32 @@ static int check_pixel( int cpu_ref, int cpu_new )
         } \
     } \
     report( "pixel " #name " :" );
+
+#define TEST_PIXEL_WEIGHT( name, align ) /* compares pixel_c.name[] against pixel_asm.name[] for the 5-argument (weighted) compare functions */ \
+    for( i = 0, ok = 1, used_asm = 0; i < 7; i++ ) \
+    { \
+        int res_c, res_asm; \
+        if( pixel_asm.name[i] != pixel_ref.name[i] ) /* only entries that differ from the reference table are tested */ \
+        { \
+            for( j=0; j<64; j++ ) /* j is the weight argument and, when align is 0, also the byte offset into buf2 */ \
+            { \
+                used_asm = 1; \
+                res_c   = call_c( pixel_c.name[i], buf1, 32, buf2+j*!align, 16, j ); \
+                res_asm = call_a( pixel_asm.name[i], buf1, 32, buf2+j*!align, 16, j ); \
+                if( res_c != res_asm ) \
+                { \
+                    ok = 0; \
+                    fprintf( stderr, #name "[%d]: %d != %d [FAILED]\n", i, res_c, res_asm ); \
+                    break; /* leaves the j loop only; the remaining i entries are still checked */ \
+                } \
+            } \
+        } \
+    } \
+    report( "pixel " #name " :" );

     TEST_PIXEL( sad, 0 );
-    TEST_PIXEL( ssd, 1 );
+    TEST_PIXEL_WEIGHT( ssd, 1 );
+    TEST_PIXEL_WEIGHT( nssd, 1 );
     TEST_PIXEL( satd, 0 );
     TEST_PIXEL( sa8d, 0 );

diff --git a/x264.c b/x264.c
index 70adb71..88c8c78 100644
--- a/x264.c
+++ b/x264.c
@@ -196,6 +196,9 @@ static void Help( x264_param_t *defaults, int b_longhelp )
         "                              textured areas. [%.1f]\n"
         "                                  - 0.5: weak AQ\n"
         "                                  - 1.5: strong AQ\n", defaults->rc.f_aq_strength );
+    H0( "      --fgo <int>             Activates Film Grain Optimization.[%d]\n"
+        "                                  - 5: weak FGO\n"
+        "                                  - 15: strong FGO\n", defaults->analyse.i_fgo);
     H0( "\n" );
     H0( "  -p, --pass <1|2|3>          Enable multipass ratecontrol\n"
         "                                  - 1: First pass, creates stats file\n"
@@ -420,6 +423,7 @@ static int  Parse( int argc, char **argv,
             { "no-dct-decimate", no_argument, NULL, 0 },
             { "aq-strength", required_argument, NULL, 0 },
             { "aq-mode", required_argument, NULL, 0 },
+            { "fgo", required_argument, NULL, 0 },
             { "deadzone-inter", required_argument, NULL, '0' },
             { "deadzone-intra", required_argument, NULL, '0' },
             { "level",   required_argument, NULL, 0 },
diff --git a/x264.h b/x264.h
index d2c6510..c0156ea 100644
--- a/x264.h
+++ b/x264.h
@@ -236,6 +236,7 @@ typedef struct x264_param_t
         int          b_fast_pskip; /* early SKIP detection on P-frames */
         int          b_dct_decimate; /* transform coefficient thresholding on P-frames */
         int          i_noise_reduction; /* adaptive pseudo-deadzone */
+        int          i_fgo; /* psy film grain optimization */

         /* the deadzone size that will be used in luma quantization */
         int          i_luma_deadzone[2]; /* {inter, intra} */