Untitled

From d7ea15bf5b3dd2ada6449facd52cf2e35db0fbe9 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Fri, 21 May 2010 13:07:12 -0700
Subject: [PATCH 01/10] Avoid a redundant qpel check in lookahead with subme <= 1.

---
 encoder/me.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/encoder/me.c b/encoder/me.c
index a35da53..77073cc 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -852,7 +852,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
             break;
     }

-    if( !b_refine_qpel )
+    if( !b_refine_qpel && (h->pixf.mbcmp_unaligned[0] != h->pixf.fpelcmp[0] || b_chroma_me) )
     {
         bcost = COST_MAX;
         COST_MV_SATD( bmx, bmy, -1 );
--
1.7.0.4


From 7fc5984e9ad11bafe20d4585848066554fb4a171 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Fri, 21 May 2010 14:32:13 -0700
Subject: [PATCH 02/10] Avoid an extra var2 in chroma encoding if possible
 Also remove a redundant if.

---
 encoder/analyse.c    |    5 ++---
 encoder/macroblock.c |    3 ++-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/encoder/analyse.c b/encoder/analyse.c
index 8868012..a128a70 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -2637,9 +2637,8 @@ intra_analysis:
             h->mb.i_partition = D_16x16;
             assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
             /* Set up MVs for future predictors */
-            if( b_skip )
-                for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
-                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
+            for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
+                M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
         }
         else
         {
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index a961baf..199bb68 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -331,7 +331,8 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
     {
         int thresh = (x264_lambda2_tab[i_qp] + 32) >> 6;
         int ssd[2];
-        int score  = h->pixf.var2_8x8( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] );
+        int score = h->pixf.var2_8x8( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] );
+        if( score < thresh*4 )
             score += h->pixf.var2_8x8( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] );
         if( score < thresh*4 )
         {
--
1.7.0.4


From 038481fb5dd4144946824c7ecd94646d13db1710 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Fri, 21 May 2010 15:39:38 -0700
Subject: [PATCH 03/10] Faster deblock strength asm on conroe/penryn

---
 common/x86/deblock-a.asm |   24 +++++++++++++++++++++++-
 1 files changed, 23 insertions(+), 1 deletions(-)

diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
index 628ee5d..f2f3e58 100644
--- a/common/x86/deblock-a.asm
+++ b/common/x86/deblock-a.asm
@@ -1022,7 +1022,7 @@ cglobal deblock_strength_mmxext, 6,6
     RET

 %macro DEBLOCK_STRENGTH_XMM 1
-cglobal deblock_strength_%1, 6,6,7
+cglobal deblock_strength_%1, 6,6,8
     ; Prepare mv comparison register
     shl      r4d, 8
     add      r4d, 3 - (1<<8)
@@ -1040,6 +1040,27 @@ cglobal deblock_strength_%1, 6,6,7
     por       m5, m1

     ; Check mvs
+%ifidn %1, ssse3
+    mova      m3, [mv+4*8*0]
+    mova      m2, [mv+4*8*1]
+    mova      m0, m3
+    mova      m1, m2
+    palignr   m3, [mv+4*8*0-16], 12
+    palignr   m2, [mv+4*8*1-16], 12
+    psubw     m0, m3
+    psubw     m1, m2
+    packsswb  m0, m1
+
+    mova      m3, [mv+4*8*2]
+    mova      m7, [mv+4*8*3]
+    mova      m2, m3
+    mova      m1, m7
+    palignr   m3, [mv+4*8*2-16], 12
+    palignr   m7, [mv+4*8*3-16], 12
+    psubw     m2, m3
+    psubw     m1, m7
+    packsswb  m2, m1
+%else
     movu      m0, [mv-4+4*8*0]
     movu      m1, [mv-4+4*8*1]
     movu      m2, [mv-4+4*8*2]
@@ -1050,6 +1071,7 @@ cglobal deblock_strength_%1, 6,6,7
     psubw     m3, [mv+4*8*3]
     packsswb  m0, m1
     packsswb  m2, m3
+%endif
     ABSB2     m0, m2, m1, m3
     psubusb   m0, m6
     psubusb   m2, m6
--
1.7.0.4


From 50fd9b03194695828b822020133e28430bce3d45 Mon Sep 17 00:00:00 2001
From: Kieran Kunhya <kieran@kunhya.com>
Date: Sat, 22 May 2010 14:32:53 +0100
Subject: [PATCH 04/10] Fix typo in fake-interlaced documentation

---
 x264.h |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/x264.h b/x264.h
index b11acf8..f714b72 100644
--- a/x264.h
+++ b/x264.h
@@ -351,7 +351,7 @@ typedef struct x264_param_t

     /* Fake Interlaced.
      *
-     * Used only when b_interlaced=0. Setting this flag to zero makes it possible to flag the stream as PAFF interlaced yet
+     * Used only when b_interlaced=0. Setting this flag makes it possible to flag the stream as PAFF interlaced yet
      * encode all frames progessively. It is useful for encoding 25p and 30p Blu-Ray streams.
      */

--
1.7.0.4


From 23f7cfda89cd7e8c1632f86a9af887017a05594a Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Mon, 24 May 2010 11:13:22 -0700
Subject: [PATCH 05/10] Slightly faster mbtree asm

---
 common/x86/mc-a2.asm |    5 +++--
 1 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index 8deb9e0..aee3f0a 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -1111,7 +1111,7 @@ FRAME_INIT_LOWRES ssse3, 12
 ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
 ;                             uint16_t *inter_costs, uint16_t *inv_qscales, int len )
 ;-----------------------------------------------------------------------------
-cglobal mbtree_propagate_cost_sse2, 6,6
+cglobal mbtree_propagate_cost_sse2, 6,6,7
     shl r5d, 1
     lea r0, [r0+r5*2]
     add r1, r5
@@ -1121,6 +1121,7 @@ cglobal mbtree_propagate_cost_sse2, 6,6
     neg r5
     pxor      xmm5, xmm5
     movdqa    xmm4, [pd_128]
+    movdqa    xmm6, [pw_3fff]
 .loop:
     movq      xmm2, [r2+r5] ; intra
     movq      xmm0, [r4+r5] ; invq
@@ -1131,7 +1132,7 @@ cglobal mbtree_propagate_cost_sse2, 6,6
     psrld     xmm0, 8       ; intra*invq>>8
     movq      xmm3, [r3+r5] ; inter
     movq      xmm1, [r1+r5] ; prop
-    pand      xmm3, [pw_3fff]
+    pand      xmm3, xmm6
     punpcklwd xmm1, xmm5
     punpcklwd xmm3, xmm5
     paddd     xmm0, xmm1    ; prop + (intra*invq>>8)
--
1.7.0.4


From 1d3e99cfff9c31a626a6720bc83f1fd25793d24f Mon Sep 17 00:00:00 2001
From: Anton Mitrofanov <BugMaster@narod.ru>
Date: Tue, 25 May 2010 18:45:16 +0400
Subject: [PATCH 06/10] Fix calculation of total bitrate printed after stop by CTRL+C

---
 x264.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/x264.c b/x264.c
index c4a7400..3a01854 100644
--- a/x264.c
+++ b/x264.c
@@ -1560,6 +1560,8 @@ static int  Encode( x264_param_t *param, cli_opt_t *opt )
     /* duration algorithm fails when only 1 frame is output */
     if( i_frame_output == 1 )
         duration = (double)param->i_fps_den / param->i_fps_num;
+    else if( b_ctrl_c )
+        duration = (double)(2 * last_dts - prev_dts - first_dts) * param->i_timebase_num / param->i_timebase_den;
     else
         duration = (double)(2 * largest_pts - second_largest_pts) * param->i_timebase_num / param->i_timebase_den;
     if( !(opt->i_pulldown && !param->b_vfr_input) )
--
1.7.0.4


From 9ac371a36e15b18991727d625887fad88154afd8 Mon Sep 17 00:00:00 2001
From: Anton Mitrofanov <BugMaster@narod.ru>
Date: Tue, 25 May 2010 19:11:42 +0400
Subject: [PATCH 07/10] Fix ABR rate control calculations (incorrect use of h->fenc->i_frame instead of h->i_frame)

---
 common/common.h       |    2 +-
 encoder/encoder.c     |    4 ++--
 encoder/ratecontrol.c |   25 ++++++++++++-------------
 3 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/common/common.h b/common/common.h
index c564768..e1f4d0c 100644
--- a/common/common.h
+++ b/common/common.h
@@ -491,7 +491,7 @@ struct x264_t
     /* hrd */
     int initial_cpb_removal_delay;
     int initial_cpb_removal_delay_offset;
-    int64_t first_pts;
+    int64_t i_reordered_pts_delay;

     /* Current MB DCT coeffs */
     struct
diff --git a/encoder/encoder.c b/encoder/encoder.c
index de06251..a7ccd3f 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -2275,8 +2275,8 @@ int     x264_encoder_encode( x264_t *h,
     /* ------------------- Get frame to be encoded ------------------------- */
     /* 4: get picture to encode */
     h->fenc = x264_frame_shift( h->frames.current );
-    if( h->i_frame == 0 )
-        h->first_pts = h->fenc->i_reordered_pts;
+    if( h->i_frame == h->i_thread_frames - 1 )
+        h->i_reordered_pts_delay = h->fenc->i_reordered_pts;
     if( h->fenc->param )
     {
         x264_encoder_reconfig( h, h->fenc->param );
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index efa872c..a725a24 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -1966,8 +1966,8 @@ static float rate_estimate_qscale( x264_t *h )
             int64_t diff;
             int64_t predicted_bits = total_bits;
             /* Adjust ABR buffer based on distance to the end of the video. */
-            if( rcc->num_entries > h->fenc->i_frame )
-                abr_buffer *= 0.5 * sqrt( rcc->num_entries - h->fenc->i_frame );
+            if( rcc->num_entries > h->i_frame )
+                abr_buffer *= 0.5 * sqrt( rcc->num_entries - h->i_frame );

             if( rcc->b_vbv )
             {
@@ -1987,8 +1987,8 @@ static float rate_estimate_qscale( x264_t *h )
             }
             else
             {
-                if( h->fenc->i_frame < h->i_thread_frames )
-                    predicted_bits += (int64_t)h->fenc->i_frame * rcc->bitrate / rcc->fps;
+                if( h->i_frame < h->i_thread_frames )
+                    predicted_bits += (int64_t)h->i_frame * rcc->bitrate / rcc->fps;
                 else
                     predicted_bits += (int64_t)(h->i_thread_frames - 1) * rcc->bitrate / rcc->fps;
             }
@@ -1996,12 +1996,12 @@ static float rate_estimate_qscale( x264_t *h )
             diff = predicted_bits - (int64_t)rce.expected_bits;
             q = rce.new_qscale;
             q /= x264_clip3f((double)(abr_buffer - diff) / abr_buffer, .5, 2);
-            if( ((h->fenc->i_frame + 1 - h->i_thread_frames) >= rcc->fps) &&
+            if( ((h->i_frame + 1 - h->i_thread_frames) >= rcc->fps) &&
                 (rcc->expected_bits_sum > 0))
             {
                 /* Adjust quant based on the difference between
                  * achieved and expected bitrate so far */
-                double cur_time = (double)h->fenc->i_frame / rcc->num_entries;
+                double cur_time = (double)h->i_frame / rcc->num_entries;
                 double w = x264_clip3f( cur_time*100, 0.0, 1.0 );
                 q *= pow( (double)total_bits / rcc->expected_bits_sum, w );
             }
@@ -2063,11 +2063,6 @@ static float rate_estimate_qscale( x264_t *h )
             }
             else
             {
-                int i_frame_done = h->fenc->i_frame + 1 - h->i_thread_frames;
-                double i_time_done = i_frame_done / rcc->fps;
-                if( h->param.b_vfr_input )
-                    i_time_done = ((double)(h->fenc->i_reordered_pts - h->first_pts)) * h->param.i_timebase_num / h->param.i_timebase_den;
-
                 q = get_qscale( h, &rce, rcc->wanted_bits_window / rcc->cplxr_sum, h->fenc->i_frame );

                 /* ABR code can potentially be counterproductive in CBR, so just don't bother.
@@ -2075,10 +2070,14 @@ static float rate_estimate_qscale( x264_t *h )
                 if( !rcc->b_vbv_min_rate && rcc->last_satd )
                 {
                     // FIXME is it simpler to keep track of wanted_bits in ratecontrol_end?
-                    wanted_bits = i_time_done * rcc->bitrate;
+                    int i_frame_done = h->i_frame + 1 - h->i_thread_frames;
+                    double time_done = i_frame_done / rcc->fps;
+                    if( h->param.b_vfr_input && i_frame_done > 0 )
+                        time_done = ((double)(h->fenc->i_reordered_pts - h->i_reordered_pts_delay)) * h->param.i_timebase_num / h->param.i_timebase_den;
+                    wanted_bits = time_done * rcc->bitrate;
                     if( wanted_bits > 0 )
                     {
-                        abr_buffer *= X264_MAX( 1, sqrt(i_time_done) );
+                        abr_buffer *= X264_MAX( 1, sqrt( time_done ) );
                         overflow = x264_clip3f( 1.0 + (total_bits - wanted_bits) / abr_buffer, .5, 2 );
                         q *= overflow;
                     }
--
1.7.0.4


From e45175154b20332a29137c093bdc9866015e10c8 Mon Sep 17 00:00:00 2001
From: Anton Mitrofanov <BugMaster@narod.ru>
Date: Tue, 25 May 2010 13:35:45 -0700
Subject: [PATCH 08/10] Use a thread pool instead of constantly spawning threads
 Small performance increase; may be as high as 1-2% in some cases.
 Probably helps more on OSs where thread-spawning is expensive, like OS X.
 Also gets rid of "thread created" spam when debugging in gdb.

---
 common/common.h     |   20 ++++--
 encoder/encoder.c   |  198 +++++++++++++++++++++++++++++++++++++++------------
 encoder/lookahead.c |    4 +-
 3 files changed, 168 insertions(+), 54 deletions(-)

diff --git a/common/common.h b/common/common.h
index e1f4d0c..98fcab5 100644
--- a/common/common.h
+++ b/common/common.h
@@ -365,12 +365,20 @@ struct x264_t
     /* encoder parameters */
     x264_param_t    param;

-    x264_t          *thread[X264_THREAD_MAX+1];
-    x264_pthread_t  thread_handle;
-    int             b_thread_active;
-    int             i_thread_phase; /* which thread to use for the next frame */
-    int             i_threadslice_start; /* first row in this thread slice */
-    int             i_threadslice_end; /* row after the end of this thread slice */
+    x264_t               *thread[X264_THREAD_MAX+1]; /* contexts for each frame in progress + lookahead */
+    x264_pthread_t       *thread_handle;
+    x264_pthread_cond_t  thread_queue_cv;
+    x264_pthread_mutex_t thread_queue_mutex;
+    x264_t               **thread_queue; /* frames that have been prepared but not yet claimed by a worker thread */
+    x264_pthread_cond_t  thread_active_cv;
+    x264_pthread_mutex_t thread_active_mutex;
+    int                  thread_active;
+    int                  b_thread_active;
+    int                  i_thread_phase; /* which thread to use for the next frame */
+    int                  thread_exit;
+    int                  thread_error;
+    int                  i_threadslice_start; /* first row in this thread slice */
+    int                  i_threadslice_end; /* row after the end of this thread slice */

     /* bitstream output */
     struct
diff --git a/encoder/encoder.c b/encoder/encoder.c
index a7ccd3f..e839370 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -44,6 +44,53 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
                                    x264_nal_t **pp_nal, int *pi_nal,
                                    x264_picture_t *pic_out );

+/* threading */
+
+static void *x264_slices_write_thread( x264_t *h );
+
+#ifdef HAVE_PTHREAD
+static void x264_int_cond_broadcast( x264_pthread_cond_t *cv, x264_pthread_mutex_t *mutex, int *var, int val )
+{ /* Store val into *var while holding mutex, then wake every thread waiting on cv. */
+    x264_pthread_mutex_lock( mutex );
+    *var = val; /* updated under the lock, mirroring the locked check in x264_int_cond_wait() */
+    x264_pthread_cond_broadcast( cv );
+    x264_pthread_mutex_unlock( mutex );
+}
+
+static void x264_int_cond_wait( x264_pthread_cond_t *cv, x264_pthread_mutex_t *mutex, int *var, int val )
+{ /* Block the caller until *var equals val; returns with mutex released. */
+    x264_pthread_mutex_lock( mutex );
+    while( *var != val ) /* re-test after every wakeup instead of trusting a single signal */
+        x264_pthread_cond_wait( cv, mutex );
+    x264_pthread_mutex_unlock( mutex );
+}
+
+#else
+static void x264_int_cond_broadcast( x264_pthread_cond_t *cv, x264_pthread_mutex_t *mutex, int *var, int val )
+{} /* HAVE_PTHREAD not defined: empty stub, *var is left untouched */
+static void x264_int_cond_wait( x264_pthread_cond_t *cv, x264_pthread_mutex_t *mutex, int *var, int val )
+{} /* HAVE_PTHREAD not defined: empty stub, returns immediately without inspecting *var */
+#endif
+
+static void x264_thread_pool_push( x264_t *h )
+{ /* Mark context h as busy and append it to the job queue that the worker threads pull from. */
+    assert( h->thread_active == 0 );
+    h->thread_active = 1; /* reset to 0 by the worker in x264_slices_write_thread() once the job is done */
+    assert( h->b_thread_active == 0 );
+    h->b_thread_active = 1; /* reset to 0 by x264_thread_pool_wait() */
+    x264_pthread_mutex_lock( &h->thread[0]->thread_queue_mutex ); /* queue mutex and cv are taken from thread[0] */
+    x264_frame_push( (void*)h->thread_queue, (void*)h ); /* frame-list helper reused for contexts, hence the casts; NOTE(review): assumes h->thread_queue is the same list the workers read -- confirm */
+    x264_pthread_cond_broadcast( &h->thread[0]->thread_queue_cv );
+    x264_pthread_mutex_unlock( &h->thread[0]->thread_queue_mutex );
+}
+
+static int x264_thread_pool_wait( x264_t *h )
+{ /* Wait until h->thread_active is 0 again, then return the h->thread_error recorded for the job. */
+    x264_int_cond_wait( &h->thread_active_cv, &h->thread_active_mutex, &h->thread_active, 0 ); /* the worker broadcasts thread_active = 0 when it finishes */
+    h->b_thread_active = 0;
+    return h->thread_error; /* written by the worker from x264_slices_write_internal()'s return value */
+}
+
 /****************************************************************************
  *
  ******************************* x264 libs **********************************
@@ -1047,6 +1094,16 @@ x264_t *x264_encoder_open( x264_param_t *param )
     for( int i = 1; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
         CHECKED_MALLOC( h->thread[i], sizeof(x264_t) );

+    if( h->param.i_threads > 1 )
+    {
+        CHECKED_MALLOCZERO( h->thread_handle, (h->param.i_threads + 1) * sizeof(x264_pthread_t) );
+        CHECKED_MALLOCZERO( h->thread_queue, (h->param.i_threads + 1) * sizeof(x264_t*) );
+        if( x264_pthread_cond_init( &h->thread_queue_cv, NULL ) )
+            goto fail;
+        if( x264_pthread_mutex_init( &h->thread_queue_mutex, NULL ) )
+            goto fail;
+    }
+
     if( x264_lookahead_init( h, i_slicetype_length ) )
         goto fail;

@@ -1071,6 +1128,20 @@ x264_t *x264_encoder_open( x264_param_t *param )
         CHECKED_MALLOC( h->thread[i]->out.nal, init_nal_count*sizeof(x264_nal_t) );
         h->thread[i]->out.i_nals_allocated = init_nal_count;

+        if( h->param.i_threads > 1 )
+        {
+            if( x264_pthread_cond_init( &h->thread[i]->thread_active_cv, NULL ) )
+                goto fail;
+            if( x264_pthread_mutex_init( &h->thread[i]->thread_active_mutex, NULL ) )
+                goto fail;
+        }
+
+#ifdef HAVE_VISUALIZE
+        if( h->param.b_visualize )
+            if( x264_visualize_init( h->thread[i] ) )
+                goto fail;
+#endif
+
         if( allocate_threadlocal_data && x264_macroblock_cache_allocate( h->thread[i] ) < 0 )
             goto fail;
     }
@@ -1111,6 +1182,11 @@ x264_t *x264_encoder_open( x264_param_t *param )
         h->sps->i_profile_idc == PROFILE_HIGH ? "High" :
         "High 4:4:4 Predictive", h->sps->i_level_idc/10, h->sps->i_level_idc%10 );

+    if( h->param.i_threads > 1 )
+        for( int i = 0; i < h->param.i_threads; i++ )
+            if( x264_pthread_create( &h->thread_handle[i], NULL, (void*)x264_slices_write_thread, h ) )
+                return NULL;
+
     return h;
 fail:
     x264_free( h );
@@ -2013,24 +2089,10 @@ static void x264_thread_sync_stat( x264_t *dst, x264_t *src )
     memcpy( &dst->stat.i_frame_count, &src->stat.i_frame_count, sizeof(dst->stat) - sizeof(dst->stat.frame) );
 }

-static void *x264_slices_write( x264_t *h )
+static int x264_slices_write_internal( x264_t *h )
 {
     int i_slice_num = 0;
     int last_thread_mb = h->sh.i_last_mb;
-    if( h->param.i_sync_lookahead )
-        x264_lower_thread_priority( 10 );
-
-#ifdef HAVE_MMX
-    /* Misalign mask has to be set separately for each thread. */
-    if( h->param.cpu&X264_CPU_SSE_MISALIGN )
-        x264_cpu_mask_misalign_sse();
-#endif
-
-#ifdef HAVE_VISUALIZE
-    if( h->param.b_visualize )
-        if( x264_visualize_init( h ) )
-            return (void *)-1;
-#endif

     /* init stats */
     memset( &h->stat.frame, 0, sizeof(h->stat.frame) );
@@ -2049,24 +2111,69 @@ static void *x264_slices_write( x264_t *h )
         }
         h->sh.i_last_mb = X264_MIN( h->sh.i_last_mb, last_thread_mb );
         if( x264_stack_align( x264_slice_write, h ) )
-            return (void *)-1;
+            return -1;
         h->sh.i_first_mb = h->sh.i_last_mb + 1;
     }

 #ifdef HAVE_VISUALIZE
     if( h->param.b_visualize )
-    {
         x264_visualize_show( h );
-        x264_visualize_close( h );
-    }
 #endif

+    return 0;
+}
+
+static int x264_slices_write( x264_t *h )
+{ /* Direct (non-pool) entry point: runs x264_slices_write_internal() in the caller; returns 0 or -1. */
+#ifdef HAVE_MMX
+    /* Misalign mask has to be set separately for each thread. */
+    if( h->param.cpu&X264_CPU_SSE_MISALIGN )
+        x264_cpu_mask_misalign_sse();
+#endif
+
+    if( x264_slices_write_internal( h ) )
+        return -1; /* any nonzero result is reported as -1 */
+
+    return 0;
+}
+
+static void *x264_slices_write_thread( x264_t *h )
+{ /* Worker main loop: pop queued contexts, encode each one, signal completion; leaves once h->thread_exit is set. */
+    if( h->param.i_sync_lookahead )
+        x264_lower_thread_priority( 10 );
+
+#ifdef HAVE_MMX
+    /* Misalign mask has to be set separately for each thread. */
+    if( h->param.cpu&X264_CPU_SSE_MISALIGN )
+        x264_cpu_mask_misalign_sse();
+#endif
+
+    for(;;)
+    {
+        int b_exit;
+        x264_t *t = NULL;
+
+        // take one queued context, sleeping while the queue is empty and no exit has been requested
+        x264_pthread_mutex_lock( &h->thread_queue_mutex );
+        while( !h->thread_queue[0] && !h->thread_exit )
+            x264_pthread_cond_wait( &h->thread_queue_cv, &h->thread_queue_mutex );
+        b_exit = h->thread_exit; /* sampled while the queue mutex is still held */
+        if( !b_exit ) /* exit wins: a context still sitting in the queue is not popped */
+            t = (void*)x264_frame_shift( (void*)h->thread_queue );
+        x264_pthread_mutex_unlock( &h->thread_queue_mutex );
+        if( b_exit )
+            break;
+
+        t->thread_error = x264_slices_write_internal( t ); /* result is read back by x264_thread_pool_wait() */
+
+        x264_int_cond_broadcast( &t->thread_active_cv, &t->thread_active_mutex, &t->thread_active, 0 ); /* mark t idle and wake its waiter */
+    }
+
     return (void *)0;
 }

 static int x264_threaded_slices_write( x264_t *h )
 {
-    void *ret = NULL;
 #ifdef HAVE_MMX
     if( h->param.cpu&X264_CPU_SSE_MISALIGN )
         x264_cpu_mask_misalign_sse();
@@ -2093,18 +2200,10 @@ static int x264_threaded_slices_write( x264_t *h )

     /* dispatch */
     for( int i = 0; i < h->param.i_threads; i++ )
-    {
-        if( x264_pthread_create( &h->thread[i]->thread_handle, NULL, (void*)x264_slices_write, (void*)h->thread[i] ) )
-            return -1;
-        h->thread[i]->b_thread_active = 1;
-    }
+        x264_thread_pool_push( h->thread[i] );
     for( int i = 0; i < h->param.i_threads; i++ )
-    {
-        x264_pthread_join( h->thread[i]->thread_handle, &ret );
-        h->thread[i]->b_thread_active = 0;
-        if( (intptr_t)ret )
-            return (intptr_t)ret;
-    }
+        if( x264_thread_pool_wait( h->thread[i] ) )
+            return -1;

     /* Go back and fix up the hpel on the borders between slices. */
     for( int i = 1; i < h->param.i_threads; i++ )
@@ -2502,18 +2601,14 @@ int     x264_encoder_encode( x264_t *h,
     h->i_threadslice_start = 0;
     h->i_threadslice_end = h->sps->i_mb_height;
     if( h->i_thread_frames > 1 )
-    {
-        if( x264_pthread_create( &h->thread_handle, NULL, (void*)x264_slices_write, h ) )
-            return -1;
-        h->b_thread_active = 1;
-    }
+        x264_thread_pool_push( h );
     else if( h->param.b_sliced_threads )
     {
         if( x264_threaded_slices_write( h ) )
             return -1;
     }
     else
-        if( (intptr_t)x264_slices_write( h ) )
+        if( x264_slices_write( h ) )
             return -1;

     return x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out );
@@ -2526,13 +2621,8 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
     char psz_message[80];

     if( h->b_thread_active )
-    {
-        void *ret = NULL;
-        x264_pthread_join( h->thread_handle, &ret );
-        h->b_thread_active = 0;
-        if( (intptr_t)ret )
-            return (intptr_t)ret;
-    }
+        if( x264_thread_pool_wait( h ) )
+            return -1;
     if( !h->out.i_nal )
     {
         pic_out->i_type = X264_TYPE_AUTO;
@@ -2798,9 +2888,21 @@ void    x264_encoder_close  ( x264_t *h )
     if( h->param.i_threads > 1 )
     {
         // don't strictly have to wait for the other threads, but it's simpler than canceling them
+        x264_pthread_mutex_lock( &h->thread_queue_mutex );
+        h->thread_exit = 1;
+        x264_pthread_cond_broadcast( &h->thread_queue_cv );
+        x264_pthread_mutex_unlock( &h->thread_queue_mutex );
         for( int i = 0; i < h->param.i_threads; i++ )
-            if( h->thread[i]->b_thread_active )
-                x264_pthread_join( h->thread[i]->thread_handle, NULL );
+            x264_pthread_join( h->thread_handle[i], NULL );
+        for( int i = 0; i < h->param.i_threads; i++ )
+        {
+            x264_pthread_cond_destroy( &h->thread[i]->thread_active_cv );
+            x264_pthread_mutex_destroy( &h->thread[i]->thread_active_mutex );
+        }
+        x264_pthread_cond_destroy( &h->thread_queue_cv );
+        x264_pthread_mutex_destroy( &h->thread_queue_mutex );
+        x264_free( h->thread_handle );
+        x264_free( h->thread_queue );
         if( h->i_thread_frames > 1 )
         {
             for( int i = 0; i < h->i_thread_frames; i++ )
@@ -3114,6 +3216,10 @@ void    x264_encoder_close  ( x264_t *h )
             x264_macroblock_cache_free( h->thread[i] );
         }
         x264_macroblock_thread_free( h->thread[i], 0 );
+#ifdef HAVE_VISUALIZE
+        if( h->param.b_visualize )
+            x264_visualize_close( h->thread[i] );
+#endif
         x264_free( h->thread[i]->out.p_bitstream );
         x264_free( h->thread[i]->out.nal);
         x264_free( h->thread[i] );
diff --git a/encoder/lookahead.c b/encoder/lookahead.c
index 942e952..1b56c16 100644
--- a/encoder/lookahead.c
+++ b/encoder/lookahead.c
@@ -153,7 +153,7 @@ int x264_lookahead_init( x264_t *h, int i_slicetype_length )
     if( x264_macroblock_thread_allocate( look_h, 1 ) < 0 )
         goto fail;

-    if( x264_pthread_create( &look_h->thread_handle, NULL, (void *)x264_lookahead_thread, look_h ) )
+    if( x264_pthread_create( &h->thread_handle[h->param.i_threads], NULL, (void *)x264_lookahead_thread, look_h ) )
         goto fail;
     look->b_thread_active = 1;

@@ -171,7 +171,7 @@ void x264_lookahead_delete( x264_t *h )
         h->lookahead->b_exit_thread = 1;
         x264_pthread_cond_broadcast( &h->lookahead->ifbuf.cv_fill );
         x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
-        x264_pthread_join( h->thread[h->param.i_threads]->thread_handle, NULL );
+        x264_pthread_join( h->thread_handle[h->param.i_threads], NULL );
         x264_macroblock_cache_free( h->thread[h->param.i_threads] );
         x264_macroblock_thread_free( h->thread[h->param.i_threads], 1 );
         x264_free( h->thread[h->param.i_threads] );
--
1.7.0.4


From 9572e5b2f839316f69c295d86dd4891f64308d4d Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Tue, 25 May 2010 12:42:44 -0700
Subject: [PATCH 09/10] Overhaul deblocking again
 Move deblock strength calculation to immediately after encoding to take advantage of the data that's already in cache.
 Keep the deblocking itself as per-row.

---
 common/common.h          |    3 +
 common/deblock.c         |   44 +++---------
 common/frame.h           |    2 +-
 common/macroblock.c      |  172 ++++++++++++++++++++++++++++------------------
 common/macroblock.h      |    4 +-
 common/x86/deblock-a.asm |    3 +-
 encoder/encoder.c        |   17 +++++
 encoder/macroblock.c     |    8 ++-
 tools/checkasm.c         |    4 +-
 9 files changed, 147 insertions(+), 110 deletions(-)

diff --git a/common/common.h b/common/common.h
index 98fcab5..d88d695 100644
--- a/common/common.h
+++ b/common/common.h
@@ -588,6 +588,8 @@ struct x264_t
         int8_t  *mb_transform_size;         /* transform_size_8x8_flag of each mb */
         uint16_t *slice_table;              /* sh->first_mb of the slice that the indexed mb is part of
                                              * NOTE: this will fail on resolutions above 2^16 MBs... */
+        int8_t deblock_ref_table[32+2];
+        #define deblock_ref_table(x) h->mb.deblock_ref_table[x+2]

          /* buffer for weighted versions of the reference frames */
         uint8_t *p_weight_buf[16];
@@ -787,6 +789,7 @@ struct x264_t
     /* Buffers that are allocated per-thread even in sliced threads. */
     void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
     uint8_t *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
+    uint8_t (*deblock_strength[2])[2][4][4];

     /* CPU functions dependents */
     x264_predict_t      predict_16x16[4+3];
diff --git a/common/deblock.c b/common/deblock.c
index 9450a8b..af59b18 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -274,13 +274,15 @@ static void deblock_h_chroma_intra_c( uint8_t *pix, int stride, int alpha, int b
     deblock_chroma_intra_c( pix, 1, stride, alpha, beta );
 }

-static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4], int mvy_limit, int bframe, int step, int first_edge_only )
+static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+                                int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4], int mvy_limit,
+                                int bframe )
 {
     for( int dir = 0; dir < 2; dir++ )
     {
         int s1 = dir ? 1 : 8;
         int s2 = dir ? 8 : 1;
-        for( int edge = 0; edge < (first_edge_only ? 1 : 4); edge += step )
+        for( int edge = 0; edge < 4; edge++ )
             for( int i = 0, loc = X264_SCAN8_0+edge*s2; i < 4; i++, loc += s1 )
             {
                 int locn = loc - s2;
@@ -337,46 +339,25 @@ static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2,
 void x264_frame_deblock_row( x264_t *h, int mb_y )
 {
     int b_interlaced = h->sh.b_mbaff;
-    int mvy_limit = 4 >> b_interlaced;
     int qp_thresh = 15 - X264_MIN(h->sh.i_alpha_c0_offset, h->sh.i_beta_offset) - X264_MAX(0, h->param.analyse.i_chroma_qp_offset);
     int stridey   = h->fdec->i_stride[0];
     int stride2y  = stridey << b_interlaced;
     int strideuv  = h->fdec->i_stride[1];
     int stride2uv = strideuv << b_interlaced;
-    int deblock_ref_table[2][32+2];
     uint8_t (*nnz_backup)[16] = h->scratch_buffer;

-    for( int l = 0; l < 2; l++ )
-    {
-        int refs = (l ? h->i_ref1 : h->i_ref0) << h->sh.b_mbaff;
-        x264_frame_t **fref = l ? h->fref1 : h->fref0;
-        deblock_ref_table(l,-2) = -2;
-        deblock_ref_table(l,-1) = -1;
-        for( int i = 0; i < refs; i++ )
-        {
-            /* Mask off high bits to avoid frame num collisions with -1/-2.
-             * frame num values don't actually have to be correct, just unique.
-             * frame num values can't cover a range of more than 32. */
-            if( !h->mb.b_interlaced )
-                deblock_ref_table(l,i) = fref[i]->i_frame_num&63;
-            else
-                deblock_ref_table(l,i) = ((fref[i>>1]->i_frame_num&63)<<1) + (i&1);
-        }
-    }
-
     if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
         munge_cavlc_nnz( h, mb_y, nnz_backup, munge_cavlc_nnz_row );

     for( int mb_x = 0; mb_x < h->sps->i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
     {
-        ALIGNED_ARRAY_16( uint8_t, bs, [2][4][4] );
-
         x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
-        x264_macroblock_cache_load_deblock( h, mb_x, mb_y, deblock_ref_table );
+        x264_macroblock_cache_load_neighbours_deblock( h, mb_x, mb_y );

         int mb_xy = h->mb.i_mb_xy;
-        int transform_8x8 = h->mb.mb_transform_size[mb_xy];
+        int transform_8x8 = h->mb.mb_transform_size[h->mb.i_mb_xy];
         int intra_cur = IS_INTRA( h->mb.type[mb_xy] );
+        uint8_t (*bs)[4][4] = h->deblock_strength[mb_y&1&b_interlaced][mb_x];

         uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey  + 16*mb_x;
         uint8_t *pixu = h->fdec->plane[1] +  8*mb_y*strideuv +  8*mb_x;
@@ -404,11 +385,6 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
                                      h->loopf.deblock_chroma##intra[dir] );\
         } while(0)

-        if( intra_cur )
-            memset( bs, 3, sizeof(bs) );
-        else
-            h->loopf.deblock_strength( h->mb.cache.non_zero_count, h->mb.cache.ref, h->mb.cache.mv, bs, mvy_limit, h->sh.i_type == SLICE_TYPE_B, transform_8x8 + 1, first_edge_only );
-
         if( h->mb.i_neighbour & MB_LEFT )
         {
             int qpl = h->mb.qp[h->mb.i_mb_left_xy];
@@ -468,13 +444,13 @@ void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int be
 void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
 void x264_deblock_strength_mmxext( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                                    int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4],
-                                   int mvy_limit, int bframe, int step, int first_edge_only );
+                                   int mvy_limit, int bframe );
 void x264_deblock_strength_sse2  ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                                    int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4],
-                                   int mvy_limit, int bframe, int step, int first_edge_only );
+                                   int mvy_limit, int bframe );
 void x264_deblock_strength_ssse3 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                                    int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4],
-                                   int mvy_limit, int bframe, int step, int first_edge_only );
+                                   int mvy_limit, int bframe );
 #ifdef ARCH_X86
 void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
diff --git a/common/frame.h b/common/frame.h
index adc707c..91d27b5 100644
--- a/common/frame.h
+++ b/common/frame.h
@@ -166,7 +166,7 @@ typedef struct
     x264_deblock_intra_t deblock_chroma_intra[2];
     void (*deblock_strength) ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                                int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4], int mvy_limit,
-                               int bframe, int step, int first_edge_only );
+                               int bframe );
 } x264_deblock_function_t;

 x264_frame_t *x264_frame_new( x264_t *h, int b_fdec );
diff --git a/common/macroblock.c b/common/macroblock.c
index 1c0ff9b..fbd0307 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -325,12 +325,15 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
 {
     if( !b_lookahead )
         for( int i = 0; i <= h->param.b_interlaced; i++ )
+        {
             for( int j = 0; j < 3; j++ )
             {
                 /* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */
                 CHECKED_MALLOCZERO( h->intra_border_backup[i][j], (h->sps->i_mb_width*16+32)>>!!j );
                 h->intra_border_backup[i][j] += 8;
             }
+            CHECKED_MALLOC( h->deblock_strength[i], sizeof(**h->deblock_strength) * h->sps->i_mb_width );
+        }

     /* Allocate scratch buffer */
     int scratch_size = 0;
@@ -357,8 +360,11 @@ void x264_macroblock_thread_free( x264_t *h, int b_lookahead )
 {
     if( !b_lookahead )
         for( int i = 0; i <= h->param.b_interlaced; i++ )
+        {
+            x264_free( h->deblock_strength[i] );
             for( int j = 0; j < 3; j++ )
                 x264_free( h->intra_border_backup[i][j] - 8 );
+        }
     x264_free( h->scratch_buffer );
 }

@@ -413,6 +419,19 @@ void x264_macroblock_slice_init( x264_t *h )
             h->fdec->inv_ref_poc[field] = (256 + delta/2) / delta;
         }

+    deblock_ref_table(-2) = -2;
+    deblock_ref_table(-1) = -1;
+    for( int i = 0; i < h->i_ref0 << h->sh.b_mbaff; i++ )
+    {
+        /* Mask off high bits to avoid frame num collisions with -1/-2.
+         * frame num values don't actually have to be correct, just unique.
+         * frame num values can't cover a range of more than 32. */
+        if( !h->mb.b_interlaced )
+            deblock_ref_table(i) = h->fref0[i]->i_frame_num&63;
+        else
+            deblock_ref_table(i) = ((h->fref0[i>>1]->i_frame_num&63)<<1) + (i&1);
+    }
+
     h->mb.i_neighbour4[6] =
     h->mb.i_neighbour4[9] =
     h->mb.i_neighbour4[12] =
@@ -873,15 +892,13 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
                             | ((h->mb.i_neighbour_intra & MB_TOP) ? MB_TOP|MB_TOPLEFT : 0);
 }

-static void inline x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_y )
+void x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_y )
 {
-    int top = (mb_y - (1 << h->mb.b_interlaced)) * h->mb.i_mb_stride + mb_x;
     int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2;
+    int top = (mb_y - (1 << h->mb.b_interlaced)) * h->mb.i_mb_stride + mb_x;

     h->mb.i_neighbour = 0;
     h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
-    h->mb.i_b8_xy = 2*(mb_y * h->mb.i_b8_stride + mb_x);
-    h->mb.i_b4_xy = 4*(mb_y * h->mb.i_b4_stride + mb_x);

     if( mb_x > 0 )
     {
@@ -898,86 +915,105 @@ static void inline x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int
     }
 }

-void x264_macroblock_cache_load_deblock( x264_t *h, int mb_x, int mb_y, int deblock_ref_table[2][34] )
+void x264_macroblock_cache_load_deblock( x264_t *h )
 {
-    x264_macroblock_cache_load_neighbours_deblock( h, mb_x, mb_y );
+    int mb_x = h->mb.i_mb_x;
+    int mb_y = h->mb.i_mb_y;
+    int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2;

     if( IS_INTRA( h->mb.type[h->mb.i_mb_xy] ) )
         return;

-    int cur  = h->mb.i_mb_xy;
-    int left = h->mb.i_mb_left_xy;
-    int top  = h->mb.i_mb_top_xy;
-    int top_y = mb_y - (1 << h->mb.b_interlaced);
-    int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
-    int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;
-    int s8x8 = h->mb.i_b8_stride;
-    int s4x4 = h->mb.i_b4_stride;
+    /* If we have multiple slices and we're deblocking on slice edges, we
+     * have to reload neighbour data. */
+    if( h->sh.i_first_mb && deblock_on_slice_edges )
+    {
+        int old_neighbour = h->mb.i_neighbour;
+        x264_macroblock_cache_load_neighbours_deblock( h, mb_x, mb_y );
+        h->mb.i_neighbour &= ~old_neighbour;
+        if( h->mb.i_neighbour )
+        {
+            int left = h->mb.i_mb_left_xy;
+            int top  = h->mb.i_mb_top_xy;
+            int top_y = mb_y - (1 << h->mb.b_interlaced);
+            int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
+            int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;
+            int s8x8 = h->mb.i_b8_stride;
+            int s4x4 = h->mb.i_b4_stride;

-    uint8_t (*nnz)[24] = h->mb.non_zero_count;
+            uint8_t (*nnz)[24] = h->mb.non_zero_count;

-    if( h->mb.i_neighbour & MB_TOP )
-        CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[top][12] );
+            if( h->mb.i_neighbour & MB_TOP )
+                CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[top][12] );

-    if( h->mb.i_neighbour & MB_LEFT )
-    {
-        h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left][3];
-        h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left][7];
-        h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left][11];
-        h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left][15];
-    }
+            if( h->mb.i_neighbour & MB_LEFT )
+            {
+                h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left][3];
+                h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left][7];
+                h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left][11];
+                h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left][15];
+            }

-    CP32( &h->mb.cache.non_zero_count[x264_scan8[0]+0*8], &nnz[cur][0*4] );
-    CP32( &h->mb.cache.non_zero_count[x264_scan8[0]+1*8], &nnz[cur][1*4] );
-    CP32( &h->mb.cache.non_zero_count[x264_scan8[0]+2*8], &nnz[cur][2*4] );
-    CP32( &h->mb.cache.non_zero_count[x264_scan8[0]+3*8], &nnz[cur][3*4] );
+            for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
+            {
+                int16_t (*mv)[2] = h->mb.mv[l];
+                int8_t *ref = h->mb.ref[l];

-    for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
-    {
-        int16_t (*mv)[2] = h->mb.mv[l];
-        int8_t *ref = h->mb.ref[l];
+                int i8 = x264_scan8[0] - 8;
+                if( h->mb.i_neighbour & MB_TOP )
+                {
+                    h->mb.cache.ref[l][i8+0] =
+                    h->mb.cache.ref[l][i8+1] = ref[top_8x8 + 0];
+                    h->mb.cache.ref[l][i8+2] =
+                    h->mb.cache.ref[l][i8+3] = ref[top_8x8 + 1];
+                    CP128( h->mb.cache.mv[l][i8], mv[top_4x4] );
+                }

-        int i8 = x264_scan8[0] - 8;
-        if( h->mb.i_neighbour & MB_TOP )
-        {
-            h->mb.cache.ref[l][i8+0] =
-            h->mb.cache.ref[l][i8+1] = deblock_ref_table(l,ref[top_8x8 + 0]);
-            h->mb.cache.ref[l][i8+2] =
-            h->mb.cache.ref[l][i8+3] = deblock_ref_table(l,ref[top_8x8 + 1]);
-            CP128( h->mb.cache.mv[l][i8], mv[top_4x4] );
+                i8 = x264_scan8[0] - 1;
+                if( h->mb.i_neighbour & MB_LEFT )
+                {
+                    int ir = h->mb.i_b8_xy - 1;
+                    int iv = h->mb.i_b4_xy - 1;
+                    h->mb.cache.ref[l][i8+0*8] =
+                    h->mb.cache.ref[l][i8+1*8] = ref[ir + 0*s8x8];
+                    h->mb.cache.ref[l][i8+2*8] =
+                    h->mb.cache.ref[l][i8+3*8] = ref[ir + 1*s8x8];
+
+                    CP32( h->mb.cache.mv[l][i8+0*8], mv[iv + 0*s4x4] );
+                    CP32( h->mb.cache.mv[l][i8+1*8], mv[iv + 1*s4x4] );
+                    CP32( h->mb.cache.mv[l][i8+2*8], mv[iv + 2*s4x4] );
+                    CP32( h->mb.cache.mv[l][i8+3*8], mv[iv + 3*s4x4] );
+                }
+            }
         }
+    }

-        i8 = x264_scan8[0] - 1;
-        if( h->mb.i_neighbour & MB_LEFT )
-        {
-            int ir = h->mb.i_b8_xy - 1;
-            int iv = h->mb.i_b4_xy - 1;
-            h->mb.cache.ref[l][i8+0*8] =
-            h->mb.cache.ref[l][i8+1*8] = deblock_ref_table(l,ref[ir + 0*s8x8]);
-            h->mb.cache.ref[l][i8+2*8] =
-            h->mb.cache.ref[l][i8+3*8] = deblock_ref_table(l,ref[ir + 1*s8x8]);
-
-            CP32( h->mb.cache.mv[l][i8+0*8], mv[iv + 0*s4x4] );
-            CP32( h->mb.cache.mv[l][i8+1*8], mv[iv + 1*s4x4] );
-            CP32( h->mb.cache.mv[l][i8+2*8], mv[iv + 2*s4x4] );
-            CP32( h->mb.cache.mv[l][i8+3*8], mv[iv + 3*s4x4] );
-        }
+    if( h->param.analyse.i_weighted_pred && h->sh.i_type == SLICE_TYPE_P )
+    {
+        /* Handle reference frame duplicates */
+        int i8 = x264_scan8[0] - 8;
+        h->mb.cache.ref[0][i8+0] =
+        h->mb.cache.ref[0][i8+1] = deblock_ref_table(h->mb.cache.ref[0][i8+0]);
+        h->mb.cache.ref[0][i8+2] =
+        h->mb.cache.ref[0][i8+3] = deblock_ref_table(h->mb.cache.ref[0][i8+2]);

-        int ref0 = deblock_ref_table(l,ref[h->mb.i_b8_xy+0+0*s8x8]);
-        int ref1 = deblock_ref_table(l,ref[h->mb.i_b8_xy+1+0*s8x8]);
-        int ref2 = deblock_ref_table(l,ref[h->mb.i_b8_xy+0+1*s8x8]);
-        int ref3 = deblock_ref_table(l,ref[h->mb.i_b8_xy+1+1*s8x8]);
+        i8 = x264_scan8[0] - 1;
+        h->mb.cache.ref[0][i8+0*8] =
+        h->mb.cache.ref[0][i8+1*8] = deblock_ref_table(h->mb.cache.ref[0][i8+0*8]);
+        h->mb.cache.ref[0][i8+2*8] =
+        h->mb.cache.ref[0][i8+3*8] = deblock_ref_table(h->mb.cache.ref[0][i8+2*8]);
+
+        int ref0 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[ 0]]);
+        int ref1 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[ 4]]);
+        int ref2 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[ 8]]);
+        int ref3 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[12]]);
         uint32_t reftop = pack16to32( (uint8_t)ref0, (uint8_t)ref1 ) * 0x0101;
         uint32_t refbot = pack16to32( (uint8_t)ref2, (uint8_t)ref3 ) * 0x0101;

-        M32( &h->mb.cache.ref[l][x264_scan8[0]+8*0] ) = reftop;
-        M32( &h->mb.cache.ref[l][x264_scan8[0]+8*1] ) = reftop;
-        M32( &h->mb.cache.ref[l][x264_scan8[0]+8*2] ) = refbot;
-        M32( &h->mb.cache.ref[l][x264_scan8[0]+8*3] ) = refbot;
-        CP128( h->mb.cache.mv[l][x264_scan8[0]+8*0], mv[h->mb.i_b4_xy+0*s4x4] );
-        CP128( h->mb.cache.mv[l][x264_scan8[0]+8*1], mv[h->mb.i_b4_xy+1*s4x4] );
-        CP128( h->mb.cache.mv[l][x264_scan8[0]+8*2], mv[h->mb.i_b4_xy+2*s4x4] );
-        CP128( h->mb.cache.mv[l][x264_scan8[0]+8*3], mv[h->mb.i_b4_xy+3*s4x4] );
+        M32( &h->mb.cache.ref[0][x264_scan8[0]+8*0] ) = reftop;
+        M32( &h->mb.cache.ref[0][x264_scan8[0]+8*1] ) = reftop;
+        M32( &h->mb.cache.ref[0][x264_scan8[0]+8*2] ) = refbot;
+        M32( &h->mb.cache.ref[0][x264_scan8[0]+8*3] ) = refbot;
     }
 }

@@ -1041,6 +1077,8 @@ void x264_macroblock_cache_save( x264_t *h )
         h->mb.cbp[i_mb_xy] = 0x72f;   /* all set */
         h->mb.b_transform_8x8 = 0;
         memset( nnz, 16, sizeof( *h->mb.non_zero_count ) );
+        for( int i = 0; i < 24; i++ )
+            h->mb.cache.non_zero_count[x264_scan8[i]] = 16;
     }
     else
     {
diff --git a/common/macroblock.h b/common/macroblock.h
index 5fbbd16..8dc65b8 100644
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -271,8 +271,8 @@ void x264_macroblock_thread_free( x264_t *h, int b_lookahead );
 void x264_macroblock_slice_init( x264_t *h );
 void x264_macroblock_thread_init( x264_t *h );
 void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y );
-void x264_macroblock_cache_load_deblock( x264_t *h, int mb_x, int mb_y, int deblock_ref_table[2][34] );
-#define deblock_ref_table(l,x) deblock_ref_table[l][x+2]
+void x264_macroblock_cache_load_deblock( x264_t *h );
+void x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_y );
 void x264_macroblock_cache_save( x264_t *h );

 void x264_macroblock_bipred_init( x264_t *h );
diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
index f2f3e58..aedd688 100644
--- a/common/x86/deblock-a.asm
+++ b/common/x86/deblock-a.asm
@@ -889,8 +889,7 @@ chroma_intra_body_mmxext:

 ;-----------------------------------------------------------------------------
 ; static void deblock_strength( uint8_t nnz[48], int8_t ref[2][40], int16_t mv[2][40][2],
-;                               uint8_t bs[2][4][4], int mvy_limit, int bframe, int step,
-;                               int first_edge_only )
+;                               uint8_t bs[2][4][4], int mvy_limit, int bframe )
 ;-----------------------------------------------------------------------------

 %define scan8start (4+1*8)
diff --git a/encoder/encoder.c b/encoder/encoder.c
index e839370..7872013 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -1828,6 +1828,9 @@ static int x264_slice_write( x264_t *h )
     int overhead_guess = (NALU_OVERHEAD - (h->param.b_annexb && h->out.i_nal)) + 3;
     int slice_max_size = h->param.i_slice_max_size > 0 ? (h->param.i_slice_max_size-overhead_guess)*8 : INT_MAX;
     int starting_bits = bs_pos(&h->out.bs);
+    int b_deblock = h->sh.i_disable_deblocking_filter_idc != 1;
+    int b_hpel = h->fdec->b_kept_as_ref;
+    b_deblock &= b_hpel || h->param.psz_dump_yuv;
     bs_realign( &h->out.bs );

     /* Slice */
@@ -1966,6 +1969,20 @@ static int x264_slice_write( x264_t *h )
         /* save cache */
         x264_macroblock_cache_save( h );

+        /* calculate deblock strength values (actual deblocking is done per-row along with hpel) */
+        if( b_deblock )
+        {
+            int mvy_limit = 4 >> h->sh.b_mbaff;
+            int type = h->mb.type[h->mb.i_mb_xy];
+            uint8_t (*bs)[4][4] = h->deblock_strength[h->mb.i_mb_y&1&h->sh.b_mbaff][h->mb.i_mb_x];
+            x264_macroblock_cache_load_deblock( h );
+            if( IS_INTRA( type ) )
+                memset( bs, 3, sizeof(uint8_t)*2*4*4 );
+            else
+                h->loopf.deblock_strength( h->mb.cache.non_zero_count, h->mb.cache.ref, h->mb.cache.mv,
+                                           bs, mvy_limit, h->sh.i_type == SLICE_TYPE_B );
+        }
+
         /* accumulate mb stats */
         h->stat.frame.i_mb_count[h->mb.i_type]++;

diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index 199bb68..b7e5f34 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -459,8 +459,12 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )

 static void x264_macroblock_encode_skip( x264_t *h )
 {
-    for( int i = 0; i < sizeof( h->mb.cache.non_zero_count ); i += 16 )
-        M128( &h->mb.cache.non_zero_count[i] ) = M128_ZERO;
+    M32( &h->mb.cache.non_zero_count[x264_scan8[0]+0*8] ) = 0;
+    M32( &h->mb.cache.non_zero_count[x264_scan8[0]+1*8] ) = 0;
+    M32( &h->mb.cache.non_zero_count[x264_scan8[0]+2*8] ) = 0;
+    M32( &h->mb.cache.non_zero_count[x264_scan8[0]+3*8] ) = 0;
+    for( int i = 16; i < 24; i++ )
+        h->mb.cache.non_zero_count[x264_scan8[i]] = 0;
     h->mb.i_cbp_luma = 0;
     h->mb.i_cbp_chroma = 0;
     h->mb.cbp[h->mb.i_mb_xy] = 0;
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 5dd360a..6469017 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1164,8 +1164,8 @@ static int check_deblock( int cpu_ref, int cpu_new )
                         mv[j][k][l] = ((rand()&7) != 7) ? (rand()&7) - 3 : (rand()&1023) - 512;
                 }
             set_func_name( "deblock_strength" );
-            call_c( db_c.deblock_strength, nnz, ref, mv, bs[0], 2<<(i&1), ((i>>1)&1), 1, 0 );
-            call_a( db_a.deblock_strength, nnz, ref, mv, bs[1], 2<<(i&1), ((i>>1)&1), 1, 0 );
+            call_c( db_c.deblock_strength, nnz, ref, mv, bs[0], 2<<(i&1), ((i>>1)&1) );
+            call_a( db_a.deblock_strength, nnz, ref, mv, bs[1], 2<<(i&1), ((i>>1)&1) );
             if( memcmp( bs[0], bs[1], sizeof(bs[0]) ) )
             {
                 ok = 0;
--
1.7.0.4


From bd8642cef14e8dc13a0c87526b8f43e4436ab3a1 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Tue, 25 May 2010 16:13:59 -0700
Subject: [PATCH 10/10] Detect Atom CPU, enable appropriate asm functions
 I'm not going to actually optimize for this pile of garbage unless someone pays me.
 But it can't hurt to at least enable the correct functions based on benchmarks.

Also save some cache on Intel CPUs that don't need the decimate LUT due to having fast bsr/bsf.
---
 common/cpu.c           |   16 ++++++++++++----
 common/dct.c           |    2 +-
 common/pixel.c         |   17 ++++++++++-------
 common/quant.c         |   15 +++++++++++++++
 common/x86/mc-c.c      |    9 ++++++---
 common/x86/quant-a.asm |   32 ++++++++++++++++++++++++--------
 common/x86/quant.h     |    6 ++++++
 encoder/macroblock.c   |    5 +----
 tools/checkasm.c       |   14 +++++++++++++-
 x264.h                 |    2 ++
 10 files changed, 90 insertions(+), 28 deletions(-)

diff --git a/common/cpu.c b/common/cpu.c
index 933a754..10ac303 100644
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -64,6 +64,8 @@ const x264_cpu_name_t x264_cpu_names[] = {
     {"ARMv6", X264_CPU_ARMV6},
     {"NEON",  X264_CPU_NEON},
     {"Fast_NEON_MRC",  X264_CPU_FAST_NEON_MRC},
+    {"SlowCTZ", X264_CPU_SLOW_CTZ},
+    {"SlowAtom", X264_CPU_SLOW_ATOM},
     {"", 0},
 };

@@ -135,6 +137,7 @@ uint32_t x264_cpu_detect( void )

     if( !strcmp((char*)vendor, "AuthenticAMD") && max_extended_cap >= 0x80000001 )
     {
+        cpu |= X264_CPU_SLOW_CTZ;
         x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
         if( edx&0x00400000 )
             cpu |= X264_CPU_MMXEXT;
@@ -145,6 +148,7 @@ uint32_t x264_cpu_detect( void )
                 cpu |= X264_CPU_SSE2_IS_FAST;
                 cpu |= X264_CPU_LZCNT;
                 cpu |= X264_CPU_SHUFFLE_IS_FAST;
+                cpu &= ~X264_CPU_SLOW_CTZ;
             }
             else
                 cpu |= X264_CPU_SSE2_IS_SLOW;
@@ -159,11 +163,9 @@ uint32_t x264_cpu_detect( void )

     if( !strcmp((char*)vendor, "GenuineIntel") )
     {
-        int family, model, stepping;
         x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
-        family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
-        model  = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
-        stepping = eax&0xf;
+        int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
+        int model  = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
         /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
          * theoretically support sse2, but it's significantly slower than mmx for
          * almost all of x264's functions, so let's just pretend they don't. */
@@ -172,6 +174,12 @@ uint32_t x264_cpu_detect( void )
             cpu &= ~(X264_CPU_SSE2|X264_CPU_SSE3);
             assert(!(cpu&(X264_CPU_SSSE3|X264_CPU_SSE4)));
         }
+        /* Detect Atom CPU */
+        if( family == 6 && model == 28 )
+        {
+            cpu |= X264_CPU_SLOW_ATOM;
+            cpu |= X264_CPU_SLOW_CTZ;
+        }
     }

     if( (!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu&X264_CPU_SSE42))
diff --git a/common/dct.c b/common/dct.c
index 3917510..10fe2f7 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -457,7 +457,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
     }

-    if( cpu&X264_CPU_SSSE3 )
+    if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SLOW_ATOM) )
     {
         dctf->sub4x4_dct    = x264_sub4x4_dct_ssse3;
         dctf->sub8x8_dct    = x264_sub8x8_dct_ssse3;
diff --git a/common/pixel.c b/common/pixel.c
index 20c5170..5759abf 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -768,17 +768,20 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )

     if( cpu&X264_CPU_SSSE3 )
     {
-        INIT7( ssd, _ssse3 );
-        INIT7( satd, _ssse3 );
-        INIT7( satd_x3, _ssse3 );
-        INIT7( satd_x4, _ssse3 );
         if( !(cpu&X264_CPU_STACK_MOD4) )
         {
             INIT4( hadamard_ac, _ssse3 );
         }
         INIT_ADS( _ssse3 );
-        pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
-        pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_ssse3;
+        if( !(cpu&X264_CPU_SLOW_ATOM) )
+        {
+            INIT7( ssd, _ssse3 );
+            pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
+            pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_ssse3;
+            INIT7( satd, _ssse3 );
+            INIT7( satd_x3, _ssse3 );
+            INIT7( satd_x4, _ssse3 );
+        }
         pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
         pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_ssse3;
         pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_ssse3;
@@ -794,7 +797,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
             INIT2( sad_x3, _cache64_ssse3 );
             INIT2( sad_x4, _cache64_ssse3 );
         }
-        if( !(cpu&X264_CPU_SHUFFLE_IS_FAST) )
+        if( cpu&X264_CPU_SLOW_ATOM || !(cpu&X264_CPU_SHUFFLE_IS_FAST) )
         {
             INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */
         }
diff --git a/common/quant.c b/common/quant.c
index ce074e2..e62fa0f 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -312,6 +312,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->quant_4x4_dc = x264_quant_4x4_dc_mmxext;
         pf->decimate_score15 = x264_decimate_score15_mmxext;
         pf->decimate_score16 = x264_decimate_score16_mmxext;
+        if( cpu&X264_CPU_SLOW_CTZ )
+        {
+            pf->decimate_score15 = x264_decimate_score15_mmxext_slowbsr;
+            pf->decimate_score16 = x264_decimate_score16_mmxext_slowbsr;
+        }
         pf->decimate_score64 = x264_decimate_score64_mmxext;
         pf->coeff_last[  DCT_LUMA_AC] = x264_coeff_last15_mmxext;
         pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmxext;
@@ -345,6 +350,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->decimate_score15 = x264_decimate_score15_sse2;
         pf->decimate_score16 = x264_decimate_score16_sse2;
         pf->decimate_score64 = x264_decimate_score64_sse2;
+        if( cpu&X264_CPU_SLOW_CTZ )
+        {
+            pf->decimate_score15 = x264_decimate_score15_sse2_slowbsr;
+            pf->decimate_score16 = x264_decimate_score16_sse2_slowbsr;
+        }
         pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
         pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
         pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
@@ -369,6 +379,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->denoise_dct = x264_denoise_dct_ssse3;
         pf->decimate_score15 = x264_decimate_score15_ssse3;
         pf->decimate_score16 = x264_decimate_score16_ssse3;
+        if( cpu&X264_CPU_SLOW_CTZ )
+        {
+            pf->decimate_score15 = x264_decimate_score15_ssse3_slowbsr;
+            pf->decimate_score16 = x264_decimate_score16_ssse3_slowbsr;
+        }
         pf->decimate_score64 = x264_decimate_score64_ssse3;
     }

diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index f641cff..2171f89 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -427,8 +427,11 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         return;

     pf->weight = x264_mc_weight_wtab_sse2;
-    pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
-    pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
+    if( !(cpu&X264_CPU_SLOW_ATOM) )
+    {
+        pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
+        pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
+    }

     pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
     pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
@@ -481,7 +484,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         pf->weight = x264_mc_weight_wtab_ssse3;
     }

-    if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+    if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) )
         pf->integral_init4v = x264_integral_init4v_ssse3;

     if( !(cpu&X264_CPU_SSE4) )
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 3e520fa..b770adf 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -583,7 +583,7 @@ DENOISE_DCT ssse3, 7
 cextern decimate_table4
 cextern decimate_table8

-%macro DECIMATE4x4 2
+%macro DECIMATE4x4 3

 ;A LUT is faster than bsf on AMD processors, and no slower on Intel
 ;This is not true for score64.
@@ -605,6 +605,7 @@ cglobal decimate_score%1_%2, 1,3
 %if %1==15
     shr   edx, 1
 %endif
+%if %3==1
     movzx ecx, dl
     movzx eax, byte [mask_table + rcx]
     cmp   edx, ecx
@@ -617,8 +618,17 @@ cglobal decimate_score%1_%2, 1,3
     shr   edx, cl
     add    al, byte [table + rcx]
     add    al, byte [mask_table + rdx]
+%else
+.loop:
+    bsf   ecx, edx
+    shr   edx, cl
+    movzx ecx, byte [table + rcx]
+    add   eax, ecx
+    shr   edx, 1
+    jne  .loop
+%endif
 .ret:
-    REP_RET
+    RET
 .ret9:
     mov   eax, 9
     RET
@@ -627,14 +637,20 @@ cglobal decimate_score%1_%2, 1,3

 %ifndef ARCH_X86_64
 %define DECIMATE_MASK DECIMATE_MASK_MMX
-DECIMATE4x4 15, mmxext
-DECIMATE4x4 16, mmxext
+DECIMATE4x4 15, mmxext, 0
+DECIMATE4x4 16, mmxext, 0
+DECIMATE4x4 15, mmxext_slowbsr, 1
+DECIMATE4x4 16, mmxext_slowbsr, 1
 %endif
 %define DECIMATE_MASK DECIMATE_MASK_SSE2
-DECIMATE4x4 15, sse2
-DECIMATE4x4 15, ssse3
-DECIMATE4x4 16, sse2
-DECIMATE4x4 16, ssse3
+DECIMATE4x4 15, sse2, 0
+DECIMATE4x4 16, sse2, 0
+DECIMATE4x4 15, sse2_slowbsr, 1
+DECIMATE4x4 16, sse2_slowbsr, 1
+DECIMATE4x4 15, ssse3, 0
+DECIMATE4x4 16, ssse3, 0
+DECIMATE4x4 15, ssse3_slowbsr, 1
+DECIMATE4x4 16, ssse3_slowbsr, 1

 %macro DECIMATE8x8 1

diff --git a/common/x86/quant.h b/common/x86/quant.h
index 4e42b81..4ffd684 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -57,6 +57,12 @@ int x264_decimate_score15_ssse3 ( int16_t *dct );
 int x264_decimate_score16_mmxext( int16_t *dct );
 int x264_decimate_score16_sse2  ( int16_t *dct );
 int x264_decimate_score16_ssse3 ( int16_t *dct );
+int x264_decimate_score15_mmxext_slowbsr( int16_t *dct );
+int x264_decimate_score15_sse2_slowbsr  ( int16_t *dct );
+int x264_decimate_score15_ssse3_slowbsr ( int16_t *dct );
+int x264_decimate_score16_mmxext_slowbsr( int16_t *dct );
+int x264_decimate_score16_sse2_slowbsr  ( int16_t *dct );
+int x264_decimate_score16_ssse3_slowbsr ( int16_t *dct );
 int x264_decimate_score64_mmxext( int16_t *dct );
 int x264_decimate_score64_sse2  ( int16_t *dct );
 int x264_decimate_score64_ssse3 ( int16_t *dct );
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index b7e5f34..984f8a8 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -997,10 +997,7 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
         /* calculate dct coeffs */
         for( int i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
         {
-            /* We don't need to zero the DC coefficient before quantization because we already
-             * checked that all the DCs were zero above at twice the precision that quant4x4
-             * uses.  This applies even though the DC here is being quantized before the 2x2
-             * transform. */
+            dct4x4[i4x4][0] = 0;
             if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ) )
                 continue;
             h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 6469017..a0a9d54 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -173,7 +173,9 @@ static void print_bench(void)
                     b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
                     b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
                     b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
-                    b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" : "",
+                    b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" :
+                    b->cpu&X264_CPU_SLOW_CTZ ? "_slow_ctz" :
+                    b->cpu&X264_CPU_SLOW_ATOM ? "_slow_atom" : "",
                     ((int64_t)10*b->cycles/b->den - nop_time)/4 );
         }
 }
@@ -1700,6 +1702,8 @@ static int check_all_flags( void )
             ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX_LZCNT" );
             cpu1 &= ~X264_CPU_LZCNT;
         }
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" );
+        cpu1 &= ~X264_CPU_SLOW_CTZ;
     }
     if( x264_cpu_detect() & X264_CPU_SSE2 )
     {
@@ -1708,6 +1712,10 @@ static int check_all_flags( void )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSE2 FastShuffle" );
         cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" );
+        cpu1 &= ~X264_CPU_SLOW_CTZ;
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSE2 SlowAtom" );
+        cpu1 &= ~X264_CPU_SLOW_ATOM;
     }
     if( x264_cpu_detect() & X264_CPU_SSE_MISALIGN )
     {
@@ -1730,6 +1738,10 @@ static int check_all_flags( void )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSSE3 FastShuffle" );
         cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSSE3 SlowCTZ" );
+        cpu1 &= ~X264_CPU_SLOW_CTZ;
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSSE3 SlowAtom" );
+        cpu1 &= ~X264_CPU_SLOW_ATOM;
     }
     if( x264_cpu_detect() & X264_CPU_SSE4 )
     {
diff --git a/x264.h b/x264.h
index f714b72..6d7b703 100644
--- a/x264.h
+++ b/x264.h
@@ -66,6 +66,8 @@ typedef struct x264_t x264_t;
 #define X264_CPU_ARMV6          0x020000
 #define X264_CPU_NEON           0x040000  /* ARM NEON */
 #define X264_CPU_FAST_NEON_MRC  0x080000  /* Transfer from NEON to ARM register is fast (Cortex-A9) */
+#define X264_CPU_SLOW_CTZ       0x100000  /* BSR/BSF x86 instructions are really slow on some CPUs */
+#define X264_CPU_SLOW_ATOM      0x200000  /* Intel Atom: some SSE2/SSSE3 asm functions benchmark slower on it and are not selected */

 /* Analyse flags
  */
--
1.7.0.4