  1. From d7ea15bf5b3dd2ada6449facd52cf2e35db0fbe9 Mon Sep 17 00:00:00 2001
  2. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  3. Date: Fri, 21 May 2010 13:07:12 -0700
  4. Subject: [PATCH 01/10] Avoid a redundant qpel check in lookahead with subme <= 1.
  5.  
  6. ---
  7. encoder/me.c |    2 +-
  8.  1 files changed, 1 insertions(+), 1 deletions(-)
  9.  
  10. diff --git a/encoder/me.c b/encoder/me.c
  11. index a35da53..77073cc 100644
  12. --- a/encoder/me.c
  13. +++ b/encoder/me.c
  14. @@ -852,7 +852,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
  15.              break;
  16.      }
  17.  
  18. -    if( !b_refine_qpel )
  19. +    if( !b_refine_qpel && (h->pixf.mbcmp_unaligned[0] != h->pixf.fpelcmp[0] || b_chroma_me) )
  20.      {
  21.          bcost = COST_MAX;
  22.          COST_MV_SATD( bmx, bmy, -1 );
  23. --
  24. 1.7.0.4
  25.  
  26.  
  27. From 7fc5984e9ad11bafe20d4585848066554fb4a171 Mon Sep 17 00:00:00 2001
  28. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  29. Date: Fri, 21 May 2010 14:32:13 -0700
  30. Subject: [PATCH 02/10] Avoid an extra var2 in chroma encoding if possible
  31.  Also remove a redundant if.
  32.  
  33. ---
  34. encoder/analyse.c    |    5 ++---
  35.  encoder/macroblock.c |    3 ++-
  36.  2 files changed, 4 insertions(+), 4 deletions(-)
  37.  
  38. diff --git a/encoder/analyse.c b/encoder/analyse.c
  39. index 8868012..a128a70 100644
  40. --- a/encoder/analyse.c
  41. +++ b/encoder/analyse.c
  42. @@ -2637,9 +2637,8 @@ intra_analysis:
  43.              h->mb.i_partition = D_16x16;
  44.              assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
  45.              /* Set up MVs for future predictors */
  46. -            if( b_skip )
  47. -                for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
  48. -                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
  49. +            for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
  50. +                M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
  51.          }
  52.          else
  53.          {
  54. diff --git a/encoder/macroblock.c b/encoder/macroblock.c
  55. index a961baf..199bb68 100644
  56. --- a/encoder/macroblock.c
  57. +++ b/encoder/macroblock.c
  58. @@ -331,7 +331,8 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
  59.      {
  60.          int thresh = (x264_lambda2_tab[i_qp] + 32) >> 6;
  61.          int ssd[2];
  62. -        int score  = h->pixf.var2_8x8( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] );
  63. +        int score = h->pixf.var2_8x8( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] );
  64. +        if( score < thresh*4 )
  65.              score += h->pixf.var2_8x8( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] );
  66.          if( score < thresh*4 )
  67.          {
  68. --
  69. 1.7.0.4
  70.  
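A minimal, self-contained sketch of the early exit added in PATCH 02 above (illustrative names only: plane_variance_u/v and thresh stand in for the h->pixf.var2_8x8 calls and the lambda-derived threshold; this is not x264 code). The point is that the second variance call is only worth making while the U-plane score alone is still under the skip threshold, since once it is over, the final check fails regardless.

    #include <stdio.h>

    /* Hypothetical stand-ins for the per-plane SSD/variance routines. */
    static int plane_variance_u( void ) { return 40; }
    static int plane_variance_v( void ) { return 30; }

    int main( void )
    {
        int thresh = 16;                    /* plays the role of (lambda2+32)>>6 */
        int score  = plane_variance_u();    /* U plane is always measured */
        if( score < thresh*4 )              /* V only matters if U alone hasn't */
            score += plane_variance_v();    /* already exceeded the budget */
        if( score < thresh*4 )
            printf( "score %d: try the chroma-skip path\n", score );
        else
            printf( "score %d: no chroma skip\n", score );
        return 0;
    }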
  71.  
  72. From 038481fb5dd4144946824c7ecd94646d13db1710 Mon Sep 17 00:00:00 2001
  73. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  74. Date: Fri, 21 May 2010 15:39:38 -0700
  75. Subject: [PATCH 03/10] Faster deblock strength asm on conroe/penryn
  76.  
  77. ---
  78. common/x86/deblock-a.asm |   24 +++++++++++++++++++++++-
  79.  1 files changed, 23 insertions(+), 1 deletions(-)
  80.  
  81. diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
  82. index 628ee5d..f2f3e58 100644
  83. --- a/common/x86/deblock-a.asm
  84. +++ b/common/x86/deblock-a.asm
  85. @@ -1022,7 +1022,7 @@ cglobal deblock_strength_mmxext, 6,6
  86.      RET
  87.  
  88.  %macro DEBLOCK_STRENGTH_XMM 1
  89. -cglobal deblock_strength_%1, 6,6,7
  90. +cglobal deblock_strength_%1, 6,6,8
  91.      ; Prepare mv comparison register
  92.      shl      r4d, 8
  93.      add      r4d, 3 - (1<<8)
  94. @@ -1040,6 +1040,27 @@ cglobal deblock_strength_%1, 6,6,7
  95.      por       m5, m1
  96.  
  97.      ; Check mvs
  98. +%ifidn %1, ssse3
  99. +    mova      m3, [mv+4*8*0]
  100. +    mova      m2, [mv+4*8*1]
  101. +    mova      m0, m3
  102. +    mova      m1, m2
  103. +    palignr   m3, [mv+4*8*0-16], 12
  104. +    palignr   m2, [mv+4*8*1-16], 12
  105. +    psubw     m0, m3
  106. +    psubw     m1, m2
  107. +    packsswb  m0, m1
  108. +
  109. +    mova      m3, [mv+4*8*2]
  110. +    mova      m7, [mv+4*8*3]
  111. +    mova      m2, m3
  112. +    mova      m1, m7
  113. +    palignr   m3, [mv+4*8*2-16], 12
  114. +    palignr   m7, [mv+4*8*3-16], 12
  115. +    psubw     m2, m3
  116. +    psubw     m1, m7
  117. +    packsswb  m2, m1
  118. +%else
  119.      movu      m0, [mv-4+4*8*0]
  120.      movu      m1, [mv-4+4*8*1]
  121.      movu      m2, [mv-4+4*8*2]
  122. @@ -1050,6 +1071,7 @@ cglobal deblock_strength_%1, 6,6,7
  123.      psubw     m3, [mv+4*8*3]
  124.      packsswb  m0, m1
  125.      packsswb  m2, m3
  126. +%endif
  127.      ABSB2     m0, m2, m1, m3
  128.      psubusb   m0, m6
  129.      psubusb   m2, m6
  130. --
  131. 1.7.0.4
  132.  
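PATCH 03 above leans on the fact that Conroe/Penryn handle unaligned 128-bit loads (movu) slowly, so the SSSE3 path builds the "shifted back by one mv" vector from two aligned loads stitched together with palignr instead. A small standalone check of that equivalence using intrinsics (not x264 code; assumes gcc/clang with -mssse3):

    #include <stdio.h>
    #include <string.h>
    #include <tmmintrin.h>   /* SSSE3: _mm_alignr_epi8 */

    int main( void )
    {
        /* Motion-vector-like data, 16-byte aligned. */
        __attribute__((aligned(16))) short mv[24];
        for( int i = 0; i < 24; i++ )
            mv[i] = (short)(3*i - 10);

        /* Generic SSE2 path: one unaligned load starting 4 bytes (one mv)
         * before the aligned row pointer, like "movu m0, [mv-4+4*8*0]". */
        __m128i unaligned = _mm_loadu_si128( (const __m128i *)((char *)(mv + 8) - 4) );

        /* SSSE3 path from the patch: two aligned loads plus palignr.
         * _mm_alignr_epi8(cur, prev, 12) keeps the top 4 bytes of prev and
         * the bottom 12 bytes of cur, i.e. the same 16 bytes as above. */
        __m128i prev = _mm_load_si128( (const __m128i *)(mv + 8) - 1 );
        __m128i cur  = _mm_load_si128( (const __m128i *)(mv + 8) );
        __m128i stitched = _mm_alignr_epi8( cur, prev, 12 );

        printf( "same bytes: %s\n",
                memcmp( &unaligned, &stitched, 16 ) ? "no" : "yes" );
        return 0;
    }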
  133.  
  134. From 50fd9b03194695828b822020133e28430bce3d45 Mon Sep 17 00:00:00 2001
  135. From: Kieran Kunhya <kieran@kunhya.com>
  136. Date: Sat, 22 May 2010 14:32:53 +0100
  137. Subject: [PATCH 04/10] Fix typo in fake-interlaced documentation
  138.  
  139. ---
  140. x264.h |    2 +-
  141.  1 files changed, 1 insertions(+), 1 deletions(-)
  142.  
  143. diff --git a/x264.h b/x264.h
  144. index b11acf8..f714b72 100644
  145. --- a/x264.h
  146. +++ b/x264.h
  147. @@ -351,7 +351,7 @@ typedef struct x264_param_t
  148.  
  149.      /* Fake Interlaced.
  150.       *
  151. -     * Used only when b_interlaced=0. Setting this flag to zero makes it possible to flag the stream as PAFF interlaced yet
  152. +     * Used only when b_interlaced=0. Setting this flag makes it possible to flag the stream as PAFF interlaced yet
  153.       * encode all frames progessively. It is useful for encoding 25p and 30p Blu-Ray streams.
  154.       */
  155.  
  156. --
  157. 1.7.0.4
  158.  
  159.  
  160. From 23f7cfda89cd7e8c1632f86a9af887017a05594a Mon Sep 17 00:00:00 2001
  161. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  162. Date: Mon, 24 May 2010 11:13:22 -0700
  163. Subject: [PATCH 05/10] Slightly faster mbtree asm
  164.  
  165. ---
  166. common/x86/mc-a2.asm |    5 +++--
  167.  1 files changed, 3 insertions(+), 2 deletions(-)
  168.  
  169. diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
  170. index 8deb9e0..aee3f0a 100644
  171. --- a/common/x86/mc-a2.asm
  172. +++ b/common/x86/mc-a2.asm
  173. @@ -1111,7 +1111,7 @@ FRAME_INIT_LOWRES ssse3, 12
  174.  ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
  175.  ;                             uint16_t *inter_costs, uint16_t *inv_qscales, int len )
  176.  ;-----------------------------------------------------------------------------
  177. -cglobal mbtree_propagate_cost_sse2, 6,6
  178. +cglobal mbtree_propagate_cost_sse2, 6,6,7
  179.      shl r5d, 1
  180.      lea r0, [r0+r5*2]
  181.      add r1, r5
  182. @@ -1121,6 +1121,7 @@ cglobal mbtree_propagate_cost_sse2, 6,6
  183.      neg r5
  184.      pxor      xmm5, xmm5
  185.      movdqa    xmm4, [pd_128]
  186. +    movdqa    xmm6, [pw_3fff]
  187.  .loop:
  188.      movq      xmm2, [r2+r5] ; intra
  189.      movq      xmm0, [r4+r5] ; invq
  190. @@ -1131,7 +1132,7 @@ cglobal mbtree_propagate_cost_sse2, 6,6
  191.      psrld     xmm0, 8       ; intra*invq>>8
  192.      movq      xmm3, [r3+r5] ; inter
  193.      movq      xmm1, [r1+r5] ; prop
  194. -    pand      xmm3, [pw_3fff]
  195. +    pand      xmm3, xmm6
  196.      punpcklwd xmm1, xmm5
  197.      punpcklwd xmm3, xmm5
  198.      paddd     xmm0, xmm1    ; prop + (intra*invq>>8)
  199. --
  200. 1.7.0.4
  201.  
  202.  
  203. From 1d3e99cfff9c31a626a6720bc83f1fd25793d24f Mon Sep 17 00:00:00 2001
  204. From: Anton Mitrofanov <BugMaster@narod.ru>
  205. Date: Tue, 25 May 2010 18:45:16 +0400
  206. Subject: [PATCH 06/10] Fix calculation of total bitrate printed after stop by CTRL+C
  207.  
  208. ---
  209. x264.c |    2 ++
  210.  1 files changed, 2 insertions(+), 0 deletions(-)
  211.  
  212. diff --git a/x264.c b/x264.c
  213. index c4a7400..3a01854 100644
  214. --- a/x264.c
  215. +++ b/x264.c
  216. @@ -1560,6 +1560,8 @@ static int  Encode( x264_param_t *param, cli_opt_t *opt )
  217.      /* duration algorithm fails when only 1 frame is output */
  218.      if( i_frame_output == 1 )
  219.          duration = (double)param->i_fps_den / param->i_fps_num;
  220. +    else if( b_ctrl_c )
  221. +        duration = (double)(2 * last_dts - prev_dts - first_dts) * param->i_timebase_num / param->i_timebase_den;
  222.      else
  223.          duration = (double)(2 * largest_pts - second_largest_pts) * param->i_timebase_num / param->i_timebase_den;
  224.      if( !(opt->i_pulldown && !param->b_vfr_input) )
  225. --
  226. 1.7.0.4
  227.  
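The CTRL+C branch added in PATCH 06 above estimates total duration from the DTS of the frames actually written: the span already covered (last_dts - first_dts) plus one more frame whose length is guessed from the last DTS step (last_dts - prev_dts), which collapses to 2*last_dts - prev_dts - first_dts. A tiny standalone illustration of that arithmetic (invented helper name, not x264 code):

    #include <stdio.h>
    #include <stdint.h>

    static double duration_on_interrupt( int64_t first_dts, int64_t prev_dts,
                                         int64_t last_dts,
                                         int timebase_num, int timebase_den )
    {
        /* (last_dts - first_dts) + (last_dts - prev_dts), in seconds */
        return (double)(2 * last_dts - prev_dts - first_dts)
               * timebase_num / timebase_den;
    }

    int main( void )
    {
        /* 25 fps in a 1/1000 timebase: DTS 0, 40, 80, interrupted after 3 frames */
        printf( "%.3f s\n", duration_on_interrupt( 0, 40, 80, 1, 1000 ) ); /* 0.120 */
        return 0;
    }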
  228.  
  229. From 9ac371a36e15b18991727d625887fad88154afd8 Mon Sep 17 00:00:00 2001
  230. From: Anton Mitrofanov <BugMaster@narod.ru>
  231. Date: Tue, 25 May 2010 19:11:42 +0400
  232. Subject: [PATCH 07/10] Fix ABR rate control calculations (incorrect use of h->fenc->i_frame instead of h->i_frame)
  233.  
  234. ---
  235. common/common.h       |    2 +-
  236.  encoder/encoder.c     |    4 ++--
  237.  encoder/ratecontrol.c |   25 ++++++++++++-------------
  238.  3 files changed, 15 insertions(+), 16 deletions(-)
  239.  
  240. diff --git a/common/common.h b/common/common.h
  241. index c564768..e1f4d0c 100644
  242. --- a/common/common.h
  243. +++ b/common/common.h
  244. @@ -491,7 +491,7 @@ struct x264_t
  245.      /* hrd */
  246.      int initial_cpb_removal_delay;
  247.      int initial_cpb_removal_delay_offset;
  248. -    int64_t first_pts;
  249. +    int64_t i_reordered_pts_delay;
  250.  
  251.      /* Current MB DCT coeffs */
  252.      struct
  253. diff --git a/encoder/encoder.c b/encoder/encoder.c
  254. index de06251..a7ccd3f 100644
  255. --- a/encoder/encoder.c
  256. +++ b/encoder/encoder.c
  257. @@ -2275,8 +2275,8 @@ int     x264_encoder_encode( x264_t *h,
  258.      /* ------------------- Get frame to be encoded ------------------------- */
  259.      /* 4: get picture to encode */
  260.      h->fenc = x264_frame_shift( h->frames.current );
  261. -    if( h->i_frame == 0 )
  262. -        h->first_pts = h->fenc->i_reordered_pts;
  263. +    if( h->i_frame == h->i_thread_frames - 1 )
  264. +        h->i_reordered_pts_delay = h->fenc->i_reordered_pts;
  265.      if( h->fenc->param )
  266.      {
  267.          x264_encoder_reconfig( h, h->fenc->param );
  268. diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
  269. index efa872c..a725a24 100644
  270. --- a/encoder/ratecontrol.c
  271. +++ b/encoder/ratecontrol.c
  272. @@ -1966,8 +1966,8 @@ static float rate_estimate_qscale( x264_t *h )
  273.              int64_t diff;
  274.              int64_t predicted_bits = total_bits;
  275.              /* Adjust ABR buffer based on distance to the end of the video. */
  276. -            if( rcc->num_entries > h->fenc->i_frame )
  277. -                abr_buffer *= 0.5 * sqrt( rcc->num_entries - h->fenc->i_frame );
  278. +            if( rcc->num_entries > h->i_frame )
  279. +                abr_buffer *= 0.5 * sqrt( rcc->num_entries - h->i_frame );
  280.  
  281.              if( rcc->b_vbv )
  282.              {
  283. @@ -1987,8 +1987,8 @@ static float rate_estimate_qscale( x264_t *h )
  284.              }
  285.              else
  286.              {
  287. -                if( h->fenc->i_frame < h->i_thread_frames )
  288. -                    predicted_bits += (int64_t)h->fenc->i_frame * rcc->bitrate / rcc->fps;
  289. +                if( h->i_frame < h->i_thread_frames )
  290. +                    predicted_bits += (int64_t)h->i_frame * rcc->bitrate / rcc->fps;
  291.                  else
  292.                      predicted_bits += (int64_t)(h->i_thread_frames - 1) * rcc->bitrate / rcc->fps;
  293.              }
  294. @@ -1996,12 +1996,12 @@ static float rate_estimate_qscale( x264_t *h )
  295.              diff = predicted_bits - (int64_t)rce.expected_bits;
  296.              q = rce.new_qscale;
  297.              q /= x264_clip3f((double)(abr_buffer - diff) / abr_buffer, .5, 2);
  298. -            if( ((h->fenc->i_frame + 1 - h->i_thread_frames) >= rcc->fps) &&
  299. +            if( ((h->i_frame + 1 - h->i_thread_frames) >= rcc->fps) &&
  300.                  (rcc->expected_bits_sum > 0))
  301.              {
  302.                  /* Adjust quant based on the difference between
  303.                   * achieved and expected bitrate so far */
  304. -                double cur_time = (double)h->fenc->i_frame / rcc->num_entries;
  305. +                double cur_time = (double)h->i_frame / rcc->num_entries;
  306.                  double w = x264_clip3f( cur_time*100, 0.0, 1.0 );
  307.                  q *= pow( (double)total_bits / rcc->expected_bits_sum, w );
  308.              }
  309. @@ -2063,11 +2063,6 @@ static float rate_estimate_qscale( x264_t *h )
  310.              }
  311.              else
  312.              {
  313. -                int i_frame_done = h->fenc->i_frame + 1 - h->i_thread_frames;
  314. -                double i_time_done = i_frame_done / rcc->fps;
  315. -                if( h->param.b_vfr_input )
  316. -                    i_time_done = ((double)(h->fenc->i_reordered_pts - h->first_pts)) * h->param.i_timebase_num / h->param.i_timebase_den;
  317. -
  318.                  q = get_qscale( h, &rce, rcc->wanted_bits_window / rcc->cplxr_sum, h->fenc->i_frame );
  319.  
  320.                  /* ABR code can potentially be counterproductive in CBR, so just don't bother.
  321. @@ -2075,10 +2070,14 @@ static float rate_estimate_qscale( x264_t *h )
  322.                  if( !rcc->b_vbv_min_rate && rcc->last_satd )
  323.                  {
  324.                      // FIXME is it simpler to keep track of wanted_bits in ratecontrol_end?
  325. -                    wanted_bits = i_time_done * rcc->bitrate;
  326. +                    int i_frame_done = h->i_frame + 1 - h->i_thread_frames;
  327. +                    double time_done = i_frame_done / rcc->fps;
  328. +                    if( h->param.b_vfr_input && i_frame_done > 0 )
  329. +                        time_done = ((double)(h->fenc->i_reordered_pts - h->i_reordered_pts_delay)) * h->param.i_timebase_num / h->param.i_timebase_den;
  330. +                    wanted_bits = time_done * rcc->bitrate;
  331.                      if( wanted_bits > 0 )
  332.                      {
  333. -                        abr_buffer *= X264_MAX( 1, sqrt(i_time_done) );
  334. +                        abr_buffer *= X264_MAX( 1, sqrt( time_done ) );
  335.                          overflow = x264_clip3f( 1.0 + (total_bits - wanted_bits) / abr_buffer, .5, 2 );
  336.                          q *= overflow;
  337.                      }
  338. --
  339. 1.7.0.4
  340.  
  341.  
  342. From e45175154b20332a29137c093bdc9866015e10c8 Mon Sep 17 00:00:00 2001
  343. From: Anton Mitrofanov <BugMaster@narod.ru>
  344. Date: Tue, 25 May 2010 13:35:45 -0700
  345. Subject: [PATCH 08/10] Use a thread pool instead of constantly spawning threads
  346.  Small performance increase; may be as high as 1-2% in some cases.
  347.  Probably helps more on OSs where thread-spawning is expensive, like OS X.
  348.  Also gets rid of "thread created" spam when debugging in gdb.
  349.  
  350. ---
  351. common/common.h     |   20 ++++--
  352.  encoder/encoder.c   |  198 +++++++++++++++++++++++++++++++++++++++------------
  353.  encoder/lookahead.c |    4 +-
  354.  3 files changed, 168 insertions(+), 54 deletions(-)
  355.  
  356. diff --git a/common/common.h b/common/common.h
  357. index e1f4d0c..98fcab5 100644
  358. --- a/common/common.h
  359. +++ b/common/common.h
  360. @@ -365,12 +365,20 @@ struct x264_t
  361.      /* encoder parameters */
  362.      x264_param_t    param;
  363.  
  364. -    x264_t          *thread[X264_THREAD_MAX+1];
  365. -    x264_pthread_t  thread_handle;
  366. -    int             b_thread_active;
  367. -    int             i_thread_phase; /* which thread to use for the next frame */
  368. -    int             i_threadslice_start; /* first row in this thread slice */
  369. -    int             i_threadslice_end; /* row after the end of this thread slice */
  370. +    x264_t               *thread[X264_THREAD_MAX+1]; /* contexts for each frame in progress + lookahead */
  371. +    x264_pthread_t       *thread_handle;
  372. +    x264_pthread_cond_t  thread_queue_cv;
  373. +    x264_pthread_mutex_t thread_queue_mutex;
  374. +    x264_t               **thread_queue; /* frames that have been prepared but not yet claimed by a worker thread */
  375. +    x264_pthread_cond_t  thread_active_cv;
  376. +    x264_pthread_mutex_t thread_active_mutex;
  377. +    int                  thread_active;
  378. +    int                  b_thread_active;
  379. +    int                  i_thread_phase; /* which thread to use for the next frame */
  380. +    int                  thread_exit;
  381. +    int                  thread_error;
  382. +    int                  i_threadslice_start; /* first row in this thread slice */
  383. +    int                  i_threadslice_end; /* row after the end of this thread slice */
  384.  
  385.      /* bitstream output */
  386.      struct
  387. diff --git a/encoder/encoder.c b/encoder/encoder.c
  388. index a7ccd3f..e839370 100644
  389. --- a/encoder/encoder.c
  390. +++ b/encoder/encoder.c
  391. @@ -44,6 +44,53 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
  392.                                     x264_nal_t **pp_nal, int *pi_nal,
  393.                                     x264_picture_t *pic_out );
  394.  
  395. +/* threading */
  396. +
  397. +static void *x264_slices_write_thread( x264_t *h );
  398. +
  399. +#ifdef HAVE_PTHREAD
  400. +static void x264_int_cond_broadcast( x264_pthread_cond_t *cv, x264_pthread_mutex_t *mutex, int *var, int val )
  401. +{
  402. +    x264_pthread_mutex_lock( mutex );
  403. +    *var = val;
  404. +    x264_pthread_cond_broadcast( cv );
  405. +    x264_pthread_mutex_unlock( mutex );
  406. +}
  407. +
  408. +static void x264_int_cond_wait( x264_pthread_cond_t *cv, x264_pthread_mutex_t *mutex, int *var, int val )
  409. +{
  410. +    x264_pthread_mutex_lock( mutex );
  411. +    while( *var != val )
  412. +        x264_pthread_cond_wait( cv, mutex );
  413. +    x264_pthread_mutex_unlock( mutex );
  414. +}
  415. +
  416. +#else
  417. +static void x264_int_cond_broadcast( x264_pthread_cond_t *cv, x264_pthread_mutex_t *mutex, int *var, int val )
  418. +{}
  419. +static void x264_int_cond_wait( x264_pthread_cond_t *cv, x264_pthread_mutex_t *mutex, int *var, int val )
  420. +{}
  421. +#endif
  422. +
  423. +static void x264_thread_pool_push( x264_t *h )
  424. +{
  425. +    assert( h->thread_active == 0 );
  426. +    h->thread_active = 1;
  427. +    assert( h->b_thread_active == 0 );
  428. +    h->b_thread_active = 1;
  429. +    x264_pthread_mutex_lock( &h->thread[0]->thread_queue_mutex );
  430. +    x264_frame_push( (void*)h->thread_queue, (void*)h );
  431. +    x264_pthread_cond_broadcast( &h->thread[0]->thread_queue_cv );
  432. +    x264_pthread_mutex_unlock( &h->thread[0]->thread_queue_mutex );
  433. +}
  434. +
  435. +static int x264_thread_pool_wait( x264_t *h )
  436. +{
  437. +    x264_int_cond_wait( &h->thread_active_cv, &h->thread_active_mutex, &h->thread_active, 0 );
  438. +    h->b_thread_active = 0;
  439. +    return h->thread_error;
  440. +}
  441. +
  442.  /****************************************************************************
  443.   *
  444.   ******************************* x264 libs **********************************
  445. @@ -1047,6 +1094,16 @@ x264_t *x264_encoder_open( x264_param_t *param )
  446.      for( int i = 1; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
  447.          CHECKED_MALLOC( h->thread[i], sizeof(x264_t) );
  448.  
  449. +    if( h->param.i_threads > 1 )
  450. +    {
  451. +        CHECKED_MALLOCZERO( h->thread_handle, (h->param.i_threads + 1) * sizeof(x264_pthread_t) );
  452. +        CHECKED_MALLOCZERO( h->thread_queue, (h->param.i_threads + 1) * sizeof(x264_t*) );
  453. +        if( x264_pthread_cond_init( &h->thread_queue_cv, NULL ) )
  454. +            goto fail;
  455. +        if( x264_pthread_mutex_init( &h->thread_queue_mutex, NULL ) )
  456. +            goto fail;
  457. +    }
  458. +
  459.      if( x264_lookahead_init( h, i_slicetype_length ) )
  460.          goto fail;
  461.  
  462. @@ -1071,6 +1128,20 @@ x264_t *x264_encoder_open( x264_param_t *param )
  463.          CHECKED_MALLOC( h->thread[i]->out.nal, init_nal_count*sizeof(x264_nal_t) );
  464.          h->thread[i]->out.i_nals_allocated = init_nal_count;
  465.  
  466. +        if( h->param.i_threads > 1 )
  467. +        {
  468. +            if( x264_pthread_cond_init( &h->thread[i]->thread_active_cv, NULL ) )
  469. +                goto fail;
  470. +            if( x264_pthread_mutex_init( &h->thread[i]->thread_active_mutex, NULL ) )
  471. +                goto fail;
  472. +        }
  473. +
  474. +#ifdef HAVE_VISUALIZE
  475. +        if( h->param.b_visualize )
  476. +            if( x264_visualize_init( h->thread[i] ) )
  477. +                goto fail;
  478. +#endif
  479. +
  480.          if( allocate_threadlocal_data && x264_macroblock_cache_allocate( h->thread[i] ) < 0 )
  481.              goto fail;
  482.      }
  483. @@ -1111,6 +1182,11 @@ x264_t *x264_encoder_open( x264_param_t *param )
  484.          h->sps->i_profile_idc == PROFILE_HIGH ? "High" :
  485.          "High 4:4:4 Predictive", h->sps->i_level_idc/10, h->sps->i_level_idc%10 );
  486.  
  487. +    if( h->param.i_threads > 1 )
  488. +        for( int i = 0; i < h->param.i_threads; i++ )
  489. +            if( x264_pthread_create( &h->thread_handle[i], NULL, (void*)x264_slices_write_thread, h ) )
  490. +                return NULL;
  491. +
  492.      return h;
  493.  fail:
  494.      x264_free( h );
  495. @@ -2013,24 +2089,10 @@ static void x264_thread_sync_stat( x264_t *dst, x264_t *src )
  496.      memcpy( &dst->stat.i_frame_count, &src->stat.i_frame_count, sizeof(dst->stat) - sizeof(dst->stat.frame) );
  497.  }
  498.  
  499. -static void *x264_slices_write( x264_t *h )
  500. +static int x264_slices_write_internal( x264_t *h )
  501.  {
  502.      int i_slice_num = 0;
  503.      int last_thread_mb = h->sh.i_last_mb;
  504. -    if( h->param.i_sync_lookahead )
  505. -        x264_lower_thread_priority( 10 );
  506. -
  507. -#ifdef HAVE_MMX
  508. -    /* Misalign mask has to be set separately for each thread. */
  509. -    if( h->param.cpu&X264_CPU_SSE_MISALIGN )
  510. -        x264_cpu_mask_misalign_sse();
  511. -#endif
  512. -
  513. -#ifdef HAVE_VISUALIZE
  514. -    if( h->param.b_visualize )
  515. -        if( x264_visualize_init( h ) )
  516. -            return (void *)-1;
  517. -#endif
  518.  
  519.      /* init stats */
  520.      memset( &h->stat.frame, 0, sizeof(h->stat.frame) );
  521. @@ -2049,24 +2111,69 @@ static void *x264_slices_write( x264_t *h )
  522.          }
  523.          h->sh.i_last_mb = X264_MIN( h->sh.i_last_mb, last_thread_mb );
  524.          if( x264_stack_align( x264_slice_write, h ) )
  525. -            return (void *)-1;
  526. +            return -1;
  527.          h->sh.i_first_mb = h->sh.i_last_mb + 1;
  528.      }
  529.  
  530.  #ifdef HAVE_VISUALIZE
  531.      if( h->param.b_visualize )
  532. -    {
  533.          x264_visualize_show( h );
  534. -        x264_visualize_close( h );
  535. -    }
  536.  #endif
  537.  
  538. +    return 0;
  539. +}
  540. +
  541. +static int x264_slices_write( x264_t *h )
  542. +{
  543. +#ifdef HAVE_MMX
  544. +    /* Misalign mask has to be set separately for each thread. */
  545. +    if( h->param.cpu&X264_CPU_SSE_MISALIGN )
  546. +        x264_cpu_mask_misalign_sse();
  547. +#endif
  548. +
  549. +    if( x264_slices_write_internal( h ) )
  550. +        return -1;
  551. +
  552. +    return 0;
  553. +}
  554. +
  555. +static void *x264_slices_write_thread( x264_t *h )
  556. +{
  557. +    if( h->param.i_sync_lookahead )
  558. +        x264_lower_thread_priority( 10 );
  559. +
  560. +#ifdef HAVE_MMX
  561. +    /* Misalign mask has to be set separately for each thread. */
  562. +    if( h->param.cpu&X264_CPU_SSE_MISALIGN )
  563. +        x264_cpu_mask_misalign_sse();
  564. +#endif
  565. +
  566. +    for(;;)
  567. +    {
  568. +        int b_exit;
  569. +        x264_t *t = NULL;
  570. +
  571. +        // get one frame from the queue
  572. +        x264_pthread_mutex_lock( &h->thread_queue_mutex );
  573. +        while( !h->thread_queue[0] && !h->thread_exit )
  574. +            x264_pthread_cond_wait( &h->thread_queue_cv, &h->thread_queue_mutex );
  575. +        b_exit = h->thread_exit;
  576. +        if( !b_exit )
  577. +            t = (void*)x264_frame_shift( (void*)h->thread_queue );
  578. +        x264_pthread_mutex_unlock( &h->thread_queue_mutex );
  579. +        if( b_exit )
  580. +            break;
  581. +
  582. +        t->thread_error = x264_slices_write_internal( t );
  583. +
  584. +        x264_int_cond_broadcast( &t->thread_active_cv, &t->thread_active_mutex, &t->thread_active, 0 );
  585. +    }
  586. +
  587.      return (void *)0;
  588.  }
  589.  
  590.  static int x264_threaded_slices_write( x264_t *h )
  591.  {
  592. -    void *ret = NULL;
  593.  #ifdef HAVE_MMX
  594.      if( h->param.cpu&X264_CPU_SSE_MISALIGN )
  595.          x264_cpu_mask_misalign_sse();
  596. @@ -2093,18 +2200,10 @@ static int x264_threaded_slices_write( x264_t *h )
  597.  
  598.      /* dispatch */
  599.      for( int i = 0; i < h->param.i_threads; i++ )
  600. -    {
  601. -        if( x264_pthread_create( &h->thread[i]->thread_handle, NULL, (void*)x264_slices_write, (void*)h->thread[i] ) )
  602. -            return -1;
  603. -        h->thread[i]->b_thread_active = 1;
  604. -    }
  605. +        x264_thread_pool_push( h->thread[i] );
  606.      for( int i = 0; i < h->param.i_threads; i++ )
  607. -    {
  608. -        x264_pthread_join( h->thread[i]->thread_handle, &ret );
  609. -        h->thread[i]->b_thread_active = 0;
  610. -        if( (intptr_t)ret )
  611. -            return (intptr_t)ret;
  612. -    }
  613. +        if( x264_thread_pool_wait( h->thread[i] ) )
  614. +            return -1;
  615.  
  616.      /* Go back and fix up the hpel on the borders between slices. */
  617.      for( int i = 1; i < h->param.i_threads; i++ )
  618. @@ -2502,18 +2601,14 @@ int     x264_encoder_encode( x264_t *h,
  619.      h->i_threadslice_start = 0;
  620.      h->i_threadslice_end = h->sps->i_mb_height;
  621.      if( h->i_thread_frames > 1 )
  622. -    {
  623. -        if( x264_pthread_create( &h->thread_handle, NULL, (void*)x264_slices_write, h ) )
  624. -            return -1;
  625. -        h->b_thread_active = 1;
  626. -    }
  627. +        x264_thread_pool_push( h );
  628.      else if( h->param.b_sliced_threads )
  629.      {
  630.          if( x264_threaded_slices_write( h ) )
  631.              return -1;
  632.      }
  633.      else
  634. -        if( (intptr_t)x264_slices_write( h ) )
  635. +        if( x264_slices_write( h ) )
  636.              return -1;
  637.  
  638.      return x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out );
  639. @@ -2526,13 +2621,8 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
  640.      char psz_message[80];
  641.  
  642.      if( h->b_thread_active )
  643. -    {
  644. -        void *ret = NULL;
  645. -        x264_pthread_join( h->thread_handle, &ret );
  646. -        h->b_thread_active = 0;
  647. -        if( (intptr_t)ret )
  648. -            return (intptr_t)ret;
  649. -    }
  650. +        if( x264_thread_pool_wait( h ) )
  651. +            return -1;
  652.      if( !h->out.i_nal )
  653.      {
  654.          pic_out->i_type = X264_TYPE_AUTO;
  655. @@ -2798,9 +2888,21 @@ void    x264_encoder_close  ( x264_t *h )
  656.      if( h->param.i_threads > 1 )
  657.      {
  658.          // don't strictly have to wait for the other threads, but it's simpler than canceling them
  659. +        x264_pthread_mutex_lock( &h->thread_queue_mutex );
  660. +        h->thread_exit = 1;
  661. +        x264_pthread_cond_broadcast( &h->thread_queue_cv );
  662. +        x264_pthread_mutex_unlock( &h->thread_queue_mutex );
  663.          for( int i = 0; i < h->param.i_threads; i++ )
  664. -            if( h->thread[i]->b_thread_active )
  665. -                x264_pthread_join( h->thread[i]->thread_handle, NULL );
  666. +            x264_pthread_join( h->thread_handle[i], NULL );
  667. +        for( int i = 0; i < h->param.i_threads; i++ )
  668. +        {
  669. +            x264_pthread_cond_destroy( &h->thread[i]->thread_active_cv );
  670. +            x264_pthread_mutex_destroy( &h->thread[i]->thread_active_mutex );
  671. +        }
  672. +        x264_pthread_cond_destroy( &h->thread_queue_cv );
  673. +        x264_pthread_mutex_destroy( &h->thread_queue_mutex );
  674. +        x264_free( h->thread_handle );
  675. +        x264_free( h->thread_queue );
  676.          if( h->i_thread_frames > 1 )
  677.          {
  678.              for( int i = 0; i < h->i_thread_frames; i++ )
  679. @@ -3114,6 +3216,10 @@ void    x264_encoder_close  ( x264_t *h )
  680.              x264_macroblock_cache_free( h->thread[i] );
  681.          }
  682.          x264_macroblock_thread_free( h->thread[i], 0 );
  683. +#ifdef HAVE_VISUALIZE
  684. +        if( h->param.b_visualize )
  685. +            x264_visualize_close( h->thread[i] );
  686. +#endif
  687.          x264_free( h->thread[i]->out.p_bitstream );
  688.          x264_free( h->thread[i]->out.nal);
  689.          x264_free( h->thread[i] );
  690. diff --git a/encoder/lookahead.c b/encoder/lookahead.c
  691. index 942e952..1b56c16 100644
  692. --- a/encoder/lookahead.c
  693. +++ b/encoder/lookahead.c
  694. @@ -153,7 +153,7 @@ int x264_lookahead_init( x264_t *h, int i_slicetype_length )
  695.      if( x264_macroblock_thread_allocate( look_h, 1 ) < 0 )
  696.          goto fail;
  697.  
  698. -    if( x264_pthread_create( &look_h->thread_handle, NULL, (void *)x264_lookahead_thread, look_h ) )
  699. +    if( x264_pthread_create( &h->thread_handle[h->param.i_threads], NULL, (void *)x264_lookahead_thread, look_h ) )
  700.          goto fail;
  701.      look->b_thread_active = 1;
  702.  
  703. @@ -171,7 +171,7 @@ void x264_lookahead_delete( x264_t *h )
  704.          h->lookahead->b_exit_thread = 1;
  705.          x264_pthread_cond_broadcast( &h->lookahead->ifbuf.cv_fill );
  706.          x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
  707. -        x264_pthread_join( h->thread[h->param.i_threads]->thread_handle, NULL );
  708. +        x264_pthread_join( h->thread_handle[h->param.i_threads], NULL );
  709.          x264_macroblock_cache_free( h->thread[h->param.i_threads] );
  710.          x264_macroblock_thread_free( h->thread[h->param.i_threads], 1 );
  711.          x264_free( h->thread[h->param.i_threads] );
  712. --
  713. 1.7.0.4
  714.  
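The thread-pool change in PATCH 08 above comes down to one persistent worker loop per thread that blocks on a mutex/condition-variable job queue, plus a per-job completion flag the submitter waits on (x264_thread_pool_push / x264_thread_pool_wait in the diff). A minimal standalone sketch of the same pattern, with invented names and none of x264's wrappers or types (assumes POSIX threads; compile with -pthread):

    #include <pthread.h>
    #include <stdio.h>

    #define NJOBS 4
    #define NWORKERS 2

    typedef struct {
        pthread_mutex_t done_mutex;
        pthread_cond_t  done_cv;
        int             done;
        int             id;
    } job_t;

    static pthread_mutex_t queue_mutex = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  queue_cv    = PTHREAD_COND_INITIALIZER;
    static job_t *queue[NJOBS];
    static int    queue_len, exit_flag;

    /* Worker: same shape as x264_slices_write_thread -- sleep until a job or
     * the exit flag shows up, run the job, then broadcast its done flag. */
    static void *worker( void *arg )
    {
        (void)arg;
        for( ;; )
        {
            job_t *job = NULL;
            pthread_mutex_lock( &queue_mutex );
            while( !queue_len && !exit_flag )
                pthread_cond_wait( &queue_cv, &queue_mutex );
            if( !exit_flag )
                job = queue[--queue_len];
            pthread_mutex_unlock( &queue_mutex );
            if( !job )
                break;

            printf( "worker ran job %d\n", job->id );   /* stands in for encoding a frame */

            pthread_mutex_lock( &job->done_mutex );
            job->done = 1;
            pthread_cond_broadcast( &job->done_cv );
            pthread_mutex_unlock( &job->done_mutex );
        }
        return NULL;
    }

    int main( void )
    {
        pthread_t threads[NWORKERS];
        job_t jobs[NJOBS];

        for( int i = 0; i < NWORKERS; i++ )
            pthread_create( &threads[i], NULL, worker, NULL );

        for( int i = 0; i < NJOBS; i++ )
        {
            jobs[i].id = i;
            jobs[i].done = 0;
            pthread_mutex_init( &jobs[i].done_mutex, NULL );
            pthread_cond_init( &jobs[i].done_cv, NULL );

            pthread_mutex_lock( &queue_mutex );          /* like x264_thread_pool_push */
            queue[queue_len++] = &jobs[i];
            pthread_cond_broadcast( &queue_cv );
            pthread_mutex_unlock( &queue_mutex );
        }

        for( int i = 0; i < NJOBS; i++ )                 /* like x264_thread_pool_wait */
        {
            pthread_mutex_lock( &jobs[i].done_mutex );
            while( !jobs[i].done )
                pthread_cond_wait( &jobs[i].done_cv, &jobs[i].done_mutex );
            pthread_mutex_unlock( &jobs[i].done_mutex );
        }

        pthread_mutex_lock( &queue_mutex );              /* shutdown, as in x264_encoder_close */
        exit_flag = 1;
        pthread_cond_broadcast( &queue_cv );
        pthread_mutex_unlock( &queue_mutex );
        for( int i = 0; i < NWORKERS; i++ )
            pthread_join( threads[i], NULL );
        return 0;
    }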
  715.  
  716. From 9572e5b2f839316f69c295d86dd4891f64308d4d Mon Sep 17 00:00:00 2001
  717. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  718. Date: Tue, 25 May 2010 12:42:44 -0700
  719. Subject: [PATCH 09/10] Overhaul deblocking again
  720.  Move deblock strength calculation to immediately after encoding to take advantage of the data that's already in cache.
  721.  Keep the deblocking itself as per-row.
  722.  
  723. ---
  724. common/common.h          |    3 +
  725.  common/deblock.c         |   44 +++---------
  726.  common/frame.h           |    2 +-
  727.  common/macroblock.c      |  172 ++++++++++++++++++++++++++++------------------
  728.  common/macroblock.h      |    4 +-
  729.  common/x86/deblock-a.asm |    3 +-
  730.  encoder/encoder.c        |   17 +++++
  731.  encoder/macroblock.c     |    8 ++-
  732.  tools/checkasm.c         |    4 +-
  733.  9 files changed, 147 insertions(+), 110 deletions(-)
  734.  
  735. diff --git a/common/common.h b/common/common.h
  736. index 98fcab5..d88d695 100644
  737. --- a/common/common.h
  738. +++ b/common/common.h
  739. @@ -588,6 +588,8 @@ struct x264_t
  740.          int8_t  *mb_transform_size;         /* transform_size_8x8_flag of each mb */
  741.          uint16_t *slice_table;              /* sh->first_mb of the slice that the indexed mb is part of
  742.                                               * NOTE: this will fail on resolutions above 2^16 MBs... */
  743. +        int8_t deblock_ref_table[32+2];
  744. +        #define deblock_ref_table(x) h->mb.deblock_ref_table[x+2]
  745.  
  746.           /* buffer for weighted versions of the reference frames */
  747.          uint8_t *p_weight_buf[16];
  748. @@ -787,6 +789,7 @@ struct x264_t
  749.      /* Buffers that are allocated per-thread even in sliced threads. */
  750.      void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
  751.      uint8_t *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
  752. +    uint8_t (*deblock_strength[2])[2][4][4];
  753.  
  754.      /* CPU functions dependents */
  755.      x264_predict_t      predict_16x16[4+3];
  756. diff --git a/common/deblock.c b/common/deblock.c
  757. index 9450a8b..af59b18 100644
  758. --- a/common/deblock.c
  759. +++ b/common/deblock.c
  760. @@ -274,13 +274,15 @@ static void deblock_h_chroma_intra_c( uint8_t *pix, int stride, int alpha, int b
  761.      deblock_chroma_intra_c( pix, 1, stride, alpha, beta );
  762.  }
  763.  
  764. -static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4], int mvy_limit, int bframe, int step, int first_edge_only )
  765. +static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
  766. +                                int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4], int mvy_limit,
  767. +                                int bframe )
  768.  {
  769.      for( int dir = 0; dir < 2; dir++ )
  770.      {
  771.          int s1 = dir ? 1 : 8;
  772.          int s2 = dir ? 8 : 1;
  773. -        for( int edge = 0; edge < (first_edge_only ? 1 : 4); edge += step )
  774. +        for( int edge = 0; edge < 4; edge++ )
  775.              for( int i = 0, loc = X264_SCAN8_0+edge*s2; i < 4; i++, loc += s1 )
  776.              {
  777.                  int locn = loc - s2;
  778. @@ -337,46 +339,25 @@ static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2,
  779.  void x264_frame_deblock_row( x264_t *h, int mb_y )
  780.  {
  781.      int b_interlaced = h->sh.b_mbaff;
  782. -    int mvy_limit = 4 >> b_interlaced;
  783.      int qp_thresh = 15 - X264_MIN(h->sh.i_alpha_c0_offset, h->sh.i_beta_offset) - X264_MAX(0, h->param.analyse.i_chroma_qp_offset);
  784.      int stridey   = h->fdec->i_stride[0];
  785.      int stride2y  = stridey << b_interlaced;
  786.      int strideuv  = h->fdec->i_stride[1];
  787.      int stride2uv = strideuv << b_interlaced;
  788. -    int deblock_ref_table[2][32+2];
  789.      uint8_t (*nnz_backup)[16] = h->scratch_buffer;
  790.  
  791. -    for( int l = 0; l < 2; l++ )
  792. -    {
  793. -        int refs = (l ? h->i_ref1 : h->i_ref0) << h->sh.b_mbaff;
  794. -        x264_frame_t **fref = l ? h->fref1 : h->fref0;
  795. -        deblock_ref_table(l,-2) = -2;
  796. -        deblock_ref_table(l,-1) = -1;
  797. -        for( int i = 0; i < refs; i++ )
  798. -        {
  799. -            /* Mask off high bits to avoid frame num collisions with -1/-2.
  800. -             * frame num values don't actually have to be correct, just unique.
  801. -             * frame num values can't cover a range of more than 32. */
  802. -            if( !h->mb.b_interlaced )
  803. -                deblock_ref_table(l,i) = fref[i]->i_frame_num&63;
  804. -            else
  805. -                deblock_ref_table(l,i) = ((fref[i>>1]->i_frame_num&63)<<1) + (i&1);
  806. -        }
  807. -    }
  808. -
  809.      if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
  810.          munge_cavlc_nnz( h, mb_y, nnz_backup, munge_cavlc_nnz_row );
  811.  
  812.      for( int mb_x = 0; mb_x < h->sps->i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
  813.      {
  814. -        ALIGNED_ARRAY_16( uint8_t, bs, [2][4][4] );
  815. -
  816.          x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
  817. -        x264_macroblock_cache_load_deblock( h, mb_x, mb_y, deblock_ref_table );
  818. +        x264_macroblock_cache_load_neighbours_deblock( h, mb_x, mb_y );
  819.  
  820.          int mb_xy = h->mb.i_mb_xy;
  821. -        int transform_8x8 = h->mb.mb_transform_size[mb_xy];
  822. +        int transform_8x8 = h->mb.mb_transform_size[h->mb.i_mb_xy];
  823.          int intra_cur = IS_INTRA( h->mb.type[mb_xy] );
  824. +        uint8_t (*bs)[4][4] = h->deblock_strength[mb_y&1&b_interlaced][mb_x];
  825.  
  826.          uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey  + 16*mb_x;
  827.          uint8_t *pixu = h->fdec->plane[1] +  8*mb_y*strideuv +  8*mb_x;
  828. @@ -404,11 +385,6 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
  829.                                       h->loopf.deblock_chroma##intra[dir] );\
  830.          } while(0)
  831.  
  832. -        if( intra_cur )
  833. -            memset( bs, 3, sizeof(bs) );
  834. -        else
  835. -            h->loopf.deblock_strength( h->mb.cache.non_zero_count, h->mb.cache.ref, h->mb.cache.mv, bs, mvy_limit, h->sh.i_type == SLICE_TYPE_B, transform_8x8 + 1, first_edge_only );
  836. -
  837.          if( h->mb.i_neighbour & MB_LEFT )
  838.          {
  839.              int qpl = h->mb.qp[h->mb.i_mb_left_xy];
  840. @@ -468,13 +444,13 @@ void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int be
  841.  void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
  842.  void x264_deblock_strength_mmxext( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
  843.                                     int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4],
  844. -                                   int mvy_limit, int bframe, int step, int first_edge_only );
  845. +                                   int mvy_limit, int bframe );
  846.  void x264_deblock_strength_sse2  ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
  847.                                     int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4],
  848. -                                   int mvy_limit, int bframe, int step, int first_edge_only );
  849. +                                   int mvy_limit, int bframe );
  850.  void x264_deblock_strength_ssse3 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
  851.                                     int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4],
  852. -                                   int mvy_limit, int bframe, int step, int first_edge_only );
  853. +                                   int mvy_limit, int bframe );
  854.  #ifdef ARCH_X86
  855.  void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
  856.  void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
  857. diff --git a/common/frame.h b/common/frame.h
  858. index adc707c..91d27b5 100644
  859. --- a/common/frame.h
  860. +++ b/common/frame.h
  861. @@ -166,7 +166,7 @@ typedef struct
  862.      x264_deblock_intra_t deblock_chroma_intra[2];
  863.      void (*deblock_strength) ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
  864.                                 int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4], int mvy_limit,
  865. -                               int bframe, int step, int first_edge_only );
  866. +                               int bframe );
  867.  } x264_deblock_function_t;
  868.  
  869.  x264_frame_t *x264_frame_new( x264_t *h, int b_fdec );
  870. diff --git a/common/macroblock.c b/common/macroblock.c
  871. index 1c0ff9b..fbd0307 100644
  872. --- a/common/macroblock.c
  873. +++ b/common/macroblock.c
  874. @@ -325,12 +325,15 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
  875.  {
  876.      if( !b_lookahead )
  877.          for( int i = 0; i <= h->param.b_interlaced; i++ )
  878. +        {
  879.              for( int j = 0; j < 3; j++ )
  880.              {
  881.                  /* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */
  882.                  CHECKED_MALLOCZERO( h->intra_border_backup[i][j], (h->sps->i_mb_width*16+32)>>!!j );
  883.                  h->intra_border_backup[i][j] += 8;
  884.              }
  885. +            CHECKED_MALLOC( h->deblock_strength[i], sizeof(**h->deblock_strength) * h->sps->i_mb_width );
  886. +        }
  887.  
  888.      /* Allocate scratch buffer */
  889.      int scratch_size = 0;
  890. @@ -357,8 +360,11 @@ void x264_macroblock_thread_free( x264_t *h, int b_lookahead )
  891.  {
  892.      if( !b_lookahead )
  893.          for( int i = 0; i <= h->param.b_interlaced; i++ )
  894. +        {
  895. +            x264_free( h->deblock_strength[i] );
  896.              for( int j = 0; j < 3; j++ )
  897.                  x264_free( h->intra_border_backup[i][j] - 8 );
  898. +        }
  899.      x264_free( h->scratch_buffer );
  900.  }
  901.  
  902. @@ -413,6 +419,19 @@ void x264_macroblock_slice_init( x264_t *h )
  903.              h->fdec->inv_ref_poc[field] = (256 + delta/2) / delta;
  904.          }
  905.  
  906. +    deblock_ref_table(-2) = -2;
  907. +    deblock_ref_table(-1) = -1;
  908. +    for( int i = 0; i < h->i_ref0 << h->sh.b_mbaff; i++ )
  909. +    {
  910. +        /* Mask off high bits to avoid frame num collisions with -1/-2.
  911. +         * frame num values don't actually have to be correct, just unique.
  912. +         * frame num values can't cover a range of more than 32. */
  913. +        if( !h->mb.b_interlaced )
  914. +            deblock_ref_table(i) = h->fref0[i]->i_frame_num&63;
  915. +        else
  916. +            deblock_ref_table(i) = ((h->fref0[i>>1]->i_frame_num&63)<<1) + (i&1);
  917. +    }
  918. +
  919.      h->mb.i_neighbour4[6] =
  920.      h->mb.i_neighbour4[9] =
  921.      h->mb.i_neighbour4[12] =
  922. @@ -873,15 +892,13 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
  923.                              | ((h->mb.i_neighbour_intra & MB_TOP) ? MB_TOP|MB_TOPLEFT : 0);
  924.  }
  925.  
  926. -static void inline x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_y )
  927. +void x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_y )
  928.  {
  929. -    int top = (mb_y - (1 << h->mb.b_interlaced)) * h->mb.i_mb_stride + mb_x;
  930.      int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2;
  931. +    int top = (mb_y - (1 << h->mb.b_interlaced)) * h->mb.i_mb_stride + mb_x;
  932.  
  933.      h->mb.i_neighbour = 0;
  934.      h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
  935. -    h->mb.i_b8_xy = 2*(mb_y * h->mb.i_b8_stride + mb_x);
  936. -    h->mb.i_b4_xy = 4*(mb_y * h->mb.i_b4_stride + mb_x);
  937.  
  938.      if( mb_x > 0 )
  939.      {
  940. @@ -898,86 +915,105 @@ static void inline x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int
  941.      }
  942.  }
  943.  
  944. -void x264_macroblock_cache_load_deblock( x264_t *h, int mb_x, int mb_y, int deblock_ref_table[2][34] )
  945. +void x264_macroblock_cache_load_deblock( x264_t *h )
  946.  {
  947. -    x264_macroblock_cache_load_neighbours_deblock( h, mb_x, mb_y );
  948. +    int mb_x = h->mb.i_mb_x;
  949. +    int mb_y = h->mb.i_mb_y;
  950. +    int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2;
  951.  
  952.      if( IS_INTRA( h->mb.type[h->mb.i_mb_xy] ) )
  953.          return;
  954.  
  955. -    int cur  = h->mb.i_mb_xy;
  956. -    int left = h->mb.i_mb_left_xy;
  957. -    int top  = h->mb.i_mb_top_xy;
  958. -    int top_y = mb_y - (1 << h->mb.b_interlaced);
  959. -    int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
  960. -    int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;
  961. -    int s8x8 = h->mb.i_b8_stride;
  962. -    int s4x4 = h->mb.i_b4_stride;
  963. +    /* If we have multiple slices and we're deblocking on slice edges, we
  964. +     * have to reload neighbour data. */
  965. +    if( h->sh.i_first_mb && deblock_on_slice_edges )
  966. +    {
  967. +        int old_neighbour = h->mb.i_neighbour;
  968. +        x264_macroblock_cache_load_neighbours_deblock( h, mb_x, mb_y );
  969. +        h->mb.i_neighbour &= ~old_neighbour;
  970. +        if( h->mb.i_neighbour )
  971. +        {
  972. +            int left = h->mb.i_mb_left_xy;
  973. +            int top  = h->mb.i_mb_top_xy;
  974. +            int top_y = mb_y - (1 << h->mb.b_interlaced);
  975. +            int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
  976. +            int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;
  977. +            int s8x8 = h->mb.i_b8_stride;
  978. +            int s4x4 = h->mb.i_b4_stride;
  979.  
  980. -    uint8_t (*nnz)[24] = h->mb.non_zero_count;
  981. +            uint8_t (*nnz)[24] = h->mb.non_zero_count;
  982.  
  983. -    if( h->mb.i_neighbour & MB_TOP )
  984. -        CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[top][12] );
  985. +            if( h->mb.i_neighbour & MB_TOP )
  986. +                CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[top][12] );
  987.  
  988. -    if( h->mb.i_neighbour & MB_LEFT )
  989. -    {
  990. -        h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left][3];
  991. -        h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left][7];
  992. -        h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left][11];
  993. -        h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left][15];
  994. -    }
  995. +            if( h->mb.i_neighbour & MB_LEFT )
  996. +            {
  997. +                h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left][3];
  998. +                h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left][7];
  999. +                h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left][11];
  1000. +                h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left][15];
  1001. +            }
  1002.  
  1003. -    CP32( &h->mb.cache.non_zero_count[x264_scan8[0]+0*8], &nnz[cur][0*4] );
  1004. -    CP32( &h->mb.cache.non_zero_count[x264_scan8[0]+1*8], &nnz[cur][1*4] );
  1005. -    CP32( &h->mb.cache.non_zero_count[x264_scan8[0]+2*8], &nnz[cur][2*4] );
  1006. -    CP32( &h->mb.cache.non_zero_count[x264_scan8[0]+3*8], &nnz[cur][3*4] );
  1007. +            for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
  1008. +            {
  1009. +                int16_t (*mv)[2] = h->mb.mv[l];
  1010. +                int8_t *ref = h->mb.ref[l];
  1011.  
  1012. -    for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
  1013. -    {
  1014. -        int16_t (*mv)[2] = h->mb.mv[l];
  1015. -        int8_t *ref = h->mb.ref[l];
  1016. +                int i8 = x264_scan8[0] - 8;
  1017. +                if( h->mb.i_neighbour & MB_TOP )
  1018. +                {
  1019. +                    h->mb.cache.ref[l][i8+0] =
  1020. +                    h->mb.cache.ref[l][i8+1] = ref[top_8x8 + 0];
  1021. +                    h->mb.cache.ref[l][i8+2] =
  1022. +                    h->mb.cache.ref[l][i8+3] = ref[top_8x8 + 1];
  1023. +                    CP128( h->mb.cache.mv[l][i8], mv[top_4x4] );
  1024. +                }
  1025.  
  1026. -        int i8 = x264_scan8[0] - 8;
  1027. -        if( h->mb.i_neighbour & MB_TOP )
  1028. -        {
  1029. -            h->mb.cache.ref[l][i8+0] =
  1030. -            h->mb.cache.ref[l][i8+1] = deblock_ref_table(l,ref[top_8x8 + 0]);
  1031. -            h->mb.cache.ref[l][i8+2] =
  1032. -            h->mb.cache.ref[l][i8+3] = deblock_ref_table(l,ref[top_8x8 + 1]);
  1033. -            CP128( h->mb.cache.mv[l][i8], mv[top_4x4] );
  1034. +                i8 = x264_scan8[0] - 1;
  1035. +                if( h->mb.i_neighbour & MB_LEFT )
  1036. +                {
  1037. +                    int ir = h->mb.i_b8_xy - 1;
  1038. +                    int iv = h->mb.i_b4_xy - 1;
  1039. +                    h->mb.cache.ref[l][i8+0*8] =
  1040. +                    h->mb.cache.ref[l][i8+1*8] = ref[ir + 0*s8x8];
  1041. +                    h->mb.cache.ref[l][i8+2*8] =
  1042. +                    h->mb.cache.ref[l][i8+3*8] = ref[ir + 1*s8x8];
  1043. +
  1044. +                    CP32( h->mb.cache.mv[l][i8+0*8], mv[iv + 0*s4x4] );
  1045. +                    CP32( h->mb.cache.mv[l][i8+1*8], mv[iv + 1*s4x4] );
  1046. +                    CP32( h->mb.cache.mv[l][i8+2*8], mv[iv + 2*s4x4] );
  1047. +                    CP32( h->mb.cache.mv[l][i8+3*8], mv[iv + 3*s4x4] );
  1048. +                }
  1049. +            }
  1050.          }
  1051. +    }
  1052.  
  1053. -        i8 = x264_scan8[0] - 1;
  1054. -        if( h->mb.i_neighbour & MB_LEFT )
  1055. -        {
  1056. -            int ir = h->mb.i_b8_xy - 1;
  1057. -            int iv = h->mb.i_b4_xy - 1;
  1058. -            h->mb.cache.ref[l][i8+0*8] =
  1059. -            h->mb.cache.ref[l][i8+1*8] = deblock_ref_table(l,ref[ir + 0*s8x8]);
  1060. -            h->mb.cache.ref[l][i8+2*8] =
  1061. -            h->mb.cache.ref[l][i8+3*8] = deblock_ref_table(l,ref[ir + 1*s8x8]);
  1062. -
  1063. -            CP32( h->mb.cache.mv[l][i8+0*8], mv[iv + 0*s4x4] );
  1064. -            CP32( h->mb.cache.mv[l][i8+1*8], mv[iv + 1*s4x4] );
  1065. -            CP32( h->mb.cache.mv[l][i8+2*8], mv[iv + 2*s4x4] );
  1066. -            CP32( h->mb.cache.mv[l][i8+3*8], mv[iv + 3*s4x4] );
  1067. -        }
  1068. +    if( h->param.analyse.i_weighted_pred && h->sh.i_type == SLICE_TYPE_P )
  1069. +    {
  1070. +        /* Handle reference frame duplicates */
  1071. +        int i8 = x264_scan8[0] - 8;
  1072. +        h->mb.cache.ref[0][i8+0] =
  1073. +        h->mb.cache.ref[0][i8+1] = deblock_ref_table(h->mb.cache.ref[0][i8+0]);
  1074. +        h->mb.cache.ref[0][i8+2] =
  1075. +        h->mb.cache.ref[0][i8+3] = deblock_ref_table(h->mb.cache.ref[0][i8+2]);
  1076.  
  1077. -        int ref0 = deblock_ref_table(l,ref[h->mb.i_b8_xy+0+0*s8x8]);
  1078. -        int ref1 = deblock_ref_table(l,ref[h->mb.i_b8_xy+1+0*s8x8]);
  1079. -        int ref2 = deblock_ref_table(l,ref[h->mb.i_b8_xy+0+1*s8x8]);
  1080. -        int ref3 = deblock_ref_table(l,ref[h->mb.i_b8_xy+1+1*s8x8]);
  1081. +        i8 = x264_scan8[0] - 1;
  1082. +        h->mb.cache.ref[0][i8+0*8] =
  1083. +        h->mb.cache.ref[0][i8+1*8] = deblock_ref_table(h->mb.cache.ref[0][i8+0*8]);
  1084. +        h->mb.cache.ref[0][i8+2*8] =
  1085. +        h->mb.cache.ref[0][i8+3*8] = deblock_ref_table(h->mb.cache.ref[0][i8+2*8]);
  1086. +
  1087. +        int ref0 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[ 0]]);
  1088. +        int ref1 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[ 4]]);
  1089. +        int ref2 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[ 8]]);
  1090. +        int ref3 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[12]]);
  1091.          uint32_t reftop = pack16to32( (uint8_t)ref0, (uint8_t)ref1 ) * 0x0101;
  1092.          uint32_t refbot = pack16to32( (uint8_t)ref2, (uint8_t)ref3 ) * 0x0101;
  1093.  
  1094. -        M32( &h->mb.cache.ref[l][x264_scan8[0]+8*0] ) = reftop;
  1095. -        M32( &h->mb.cache.ref[l][x264_scan8[0]+8*1] ) = reftop;
  1096. -        M32( &h->mb.cache.ref[l][x264_scan8[0]+8*2] ) = refbot;
  1097. -        M32( &h->mb.cache.ref[l][x264_scan8[0]+8*3] ) = refbot;
  1098. -        CP128( h->mb.cache.mv[l][x264_scan8[0]+8*0], mv[h->mb.i_b4_xy+0*s4x4] );
  1099. -        CP128( h->mb.cache.mv[l][x264_scan8[0]+8*1], mv[h->mb.i_b4_xy+1*s4x4] );
  1100. -        CP128( h->mb.cache.mv[l][x264_scan8[0]+8*2], mv[h->mb.i_b4_xy+2*s4x4] );
  1101. -        CP128( h->mb.cache.mv[l][x264_scan8[0]+8*3], mv[h->mb.i_b4_xy+3*s4x4] );
  1102. +        M32( &h->mb.cache.ref[0][x264_scan8[0]+8*0] ) = reftop;
  1103. +        M32( &h->mb.cache.ref[0][x264_scan8[0]+8*1] ) = reftop;
  1104. +        M32( &h->mb.cache.ref[0][x264_scan8[0]+8*2] ) = refbot;
  1105. +        M32( &h->mb.cache.ref[0][x264_scan8[0]+8*3] ) = refbot;
  1106.      }
  1107.  }
  1108.  
  1109. @@ -1041,6 +1077,8 @@ void x264_macroblock_cache_save( x264_t *h )
  1110.          h->mb.cbp[i_mb_xy] = 0x72f;   /* all set */
  1111.          h->mb.b_transform_8x8 = 0;
  1112.          memset( nnz, 16, sizeof( *h->mb.non_zero_count ) );
  1113. +        for( int i = 0; i < 24; i++ )
  1114. +            h->mb.cache.non_zero_count[x264_scan8[i]] = 16;
  1115.      }
  1116.      else
  1117.      {
  1118. diff --git a/common/macroblock.h b/common/macroblock.h
  1119. index 5fbbd16..8dc65b8 100644
  1120. --- a/common/macroblock.h
  1121. +++ b/common/macroblock.h
  1122. @@ -271,8 +271,8 @@ void x264_macroblock_thread_free( x264_t *h, int b_lookahead );
  1123.  void x264_macroblock_slice_init( x264_t *h );
  1124.  void x264_macroblock_thread_init( x264_t *h );
  1125.  void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y );
  1126. -void x264_macroblock_cache_load_deblock( x264_t *h, int mb_x, int mb_y, int deblock_ref_table[2][34] );
  1127. -#define deblock_ref_table(l,x) deblock_ref_table[l][x+2]
  1128. +void x264_macroblock_cache_load_deblock( x264_t *h );
  1129. +void x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_y );
  1130.  void x264_macroblock_cache_save( x264_t *h );
  1131.  
  1132.  void x264_macroblock_bipred_init( x264_t *h );
  1133. diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
  1134. index f2f3e58..aedd688 100644
  1135. --- a/common/x86/deblock-a.asm
  1136. +++ b/common/x86/deblock-a.asm
  1137. @@ -889,8 +889,7 @@ chroma_intra_body_mmxext:
  1138.  
  1139.  ;-----------------------------------------------------------------------------
  1140.  ; static void deblock_strength( uint8_t nnz[48], int8_t ref[2][40], int16_t mv[2][40][2],
  1141. -;                               uint8_t bs[2][4][4], int mvy_limit, int bframe, int step,
  1142. -;                               int first_edge_only )
  1143. +;                               uint8_t bs[2][4][4], int mvy_limit, int bframe )
  1144.  ;-----------------------------------------------------------------------------
  1145.  
  1146.  %define scan8start (4+1*8)
  1147. diff --git a/encoder/encoder.c b/encoder/encoder.c
  1148. index e839370..7872013 100644
  1149. --- a/encoder/encoder.c
  1150. +++ b/encoder/encoder.c
  1151. @@ -1828,6 +1828,9 @@ static int x264_slice_write( x264_t *h )
  1152.      int overhead_guess = (NALU_OVERHEAD - (h->param.b_annexb && h->out.i_nal)) + 3;
  1153.      int slice_max_size = h->param.i_slice_max_size > 0 ? (h->param.i_slice_max_size-overhead_guess)*8 : INT_MAX;
  1154.      int starting_bits = bs_pos(&h->out.bs);
  1155. +    int b_deblock = h->sh.i_disable_deblocking_filter_idc != 1;
  1156. +    int b_hpel = h->fdec->b_kept_as_ref;
  1157. +    b_deblock &= b_hpel || h->param.psz_dump_yuv;
  1158.      bs_realign( &h->out.bs );
  1159.  
  1160.      /* Slice */
  1161. @@ -1966,6 +1969,20 @@ static int x264_slice_write( x264_t *h )
  1162.          /* save cache */
  1163.          x264_macroblock_cache_save( h );
  1164.  
  1165. +        /* calculate deblock strength values (actual deblocking is done per-row along with hpel) */
  1166. +        if( b_deblock )
  1167. +        {
  1168. +            int mvy_limit = 4 >> h->sh.b_mbaff;
  1169. +            int type = h->mb.type[h->mb.i_mb_xy];
  1170. +            uint8_t (*bs)[4][4] = h->deblock_strength[h->mb.i_mb_y&1&h->sh.b_mbaff][h->mb.i_mb_x];
  1171. +            x264_macroblock_cache_load_deblock( h );
  1172. +            if( IS_INTRA( type ) )
  1173. +                memset( bs, 3, sizeof(uint8_t)*2*4*4 );
  1174. +            else
  1175. +                h->loopf.deblock_strength( h->mb.cache.non_zero_count, h->mb.cache.ref, h->mb.cache.mv,
  1176. +                                           bs, mvy_limit, h->sh.i_type == SLICE_TYPE_B );
  1177. +        }
  1178. +
  1179.          /* accumulate mb stats */
  1180.          h->stat.frame.i_mb_count[h->mb.i_type]++;
  1181.  
  1182. diff --git a/encoder/macroblock.c b/encoder/macroblock.c
  1183. index 199bb68..b7e5f34 100644
  1184. --- a/encoder/macroblock.c
  1185. +++ b/encoder/macroblock.c
  1186. @@ -459,8 +459,12 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
  1187.  
  1188.  static void x264_macroblock_encode_skip( x264_t *h )
  1189.  {
  1190. -    for( int i = 0; i < sizeof( h->mb.cache.non_zero_count ); i += 16 )
  1191. -        M128( &h->mb.cache.non_zero_count[i] ) = M128_ZERO;
  1192. +    M32( &h->mb.cache.non_zero_count[x264_scan8[0]+0*8] ) = 0;
  1193. +    M32( &h->mb.cache.non_zero_count[x264_scan8[0]+1*8] ) = 0;
  1194. +    M32( &h->mb.cache.non_zero_count[x264_scan8[0]+2*8] ) = 0;
  1195. +    M32( &h->mb.cache.non_zero_count[x264_scan8[0]+3*8] ) = 0;
  1196. +    for( int i = 16; i < 24; i++ )
  1197. +        h->mb.cache.non_zero_count[x264_scan8[i]] = 0;
  1198.      h->mb.i_cbp_luma = 0;
  1199.      h->mb.i_cbp_chroma = 0;
  1200.      h->mb.cbp[h->mb.i_mb_xy] = 0;
  1201. diff --git a/tools/checkasm.c b/tools/checkasm.c
  1202. index 5dd360a..6469017 100644
  1203. --- a/tools/checkasm.c
  1204. +++ b/tools/checkasm.c
  1205. @@ -1164,8 +1164,8 @@ static int check_deblock( int cpu_ref, int cpu_new )
  1206.                          mv[j][k][l] = ((rand()&7) != 7) ? (rand()&7) - 3 : (rand()&1023) - 512;
  1207.                  }
  1208.              set_func_name( "deblock_strength" );
  1209. -            call_c( db_c.deblock_strength, nnz, ref, mv, bs[0], 2<<(i&1), ((i>>1)&1), 1, 0 );
  1210. -            call_a( db_a.deblock_strength, nnz, ref, mv, bs[1], 2<<(i&1), ((i>>1)&1), 1, 0 );
  1211. +            call_c( db_c.deblock_strength, nnz, ref, mv, bs[0], 2<<(i&1), ((i>>1)&1) );
  1212. +            call_a( db_a.deblock_strength, nnz, ref, mv, bs[1], 2<<(i&1), ((i>>1)&1) );
  1213.              if( memcmp( bs[0], bs[1], sizeof(bs[0]) ) )
  1214.              {
  1215.                  ok = 0;
  1216. --
  1217. 1.7.0.4
  1218.  
  1219.  
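[Editor's note, between patches] For readers unfamiliar with what deblock_strength in patch 09 actually computes per edge: the rule below is a simplified, single-reference, non-MBAFF sketch of the H.264 boundary-strength decision, written as standalone C for illustration. The helper name edge_strength and its argument layout are invented for this note; they are not x264 code, and the B-slice reference-permutation handling (the bframe argument in the patch) is omitted.

#include <stdint.h>
#include <stdlib.h>

/* Rough per-4x4-edge rule, assuming both sides are inter blocks.
 * Intra macroblocks are handled by the caller in the patch above
 * (bs memset to 3); bs 2 means "either side has residual", bs 1
 * means "motion discontinuity", bs 0 means "no filtering". */
static int edge_strength( int nnz_a, int nnz_b,
                          int ref_a, int ref_b,
                          const int16_t mv_a[2], const int16_t mv_b[2],
                          int mvy_limit )
{
    if( nnz_a || nnz_b )
        return 2;
    if( ref_a != ref_b ||
        abs( mv_a[0] - mv_b[0] ) >= 4 ||
        abs( mv_a[1] - mv_b[1] ) >= mvy_limit )
        return 1;
    return 0;
}

This is only meant to show why the strengths can be computed per macroblock at encode time, as the slice_write hunk does, while the actual filtering is deferred to the per-row hpel pass.
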
  1220. From bd8642cef14e8dc13a0c87526b8f43e4436ab3a1 Mon Sep 17 00:00:00 2001
  1221. From: Jason Garrett-Glaser <darkshikari@gmail.com>
  1222. Date: Tue, 25 May 2010 16:13:59 -0700
  1223. Subject: [PATCH 10/10] Detect Atom CPU, enable appropriate asm functions
  1224.  I'm not going to actually optimize for this pile of garbage unless someone pays me.
  1225.  But it can't hurt to at least enable the correct functions based on benchmarks.
  1226.  
  1227. Also save some cache on Intel CPUs that don't need the decimate LUT due to having fast bsr/bsf.
  1228. ---
  1229. common/cpu.c           |   16 ++++++++++++----
  1230.  common/dct.c           |    2 +-
  1231.  common/pixel.c         |   17 ++++++++++-------
  1232.  common/quant.c         |   15 +++++++++++++++
  1233.  common/x86/mc-c.c      |    9 ++++++---
  1234.  common/x86/quant-a.asm |   32 ++++++++++++++++++++++++--------
  1235.  common/x86/quant.h     |    6 ++++++
  1236.  encoder/macroblock.c   |    5 +----
  1237.  tools/checkasm.c       |   14 +++++++++++++-
  1238.  x264.h                 |    2 ++
  1239.  10 files changed, 90 insertions(+), 28 deletions(-)
  1240.  
  1241. diff --git a/common/cpu.c b/common/cpu.c
  1242. index 933a754..10ac303 100644
  1243. --- a/common/cpu.c
  1244. +++ b/common/cpu.c
  1245. @@ -64,6 +64,8 @@ const x264_cpu_name_t x264_cpu_names[] = {
  1246.      {"ARMv6", X264_CPU_ARMV6},
  1247.      {"NEON",  X264_CPU_NEON},
  1248.      {"Fast_NEON_MRC",  X264_CPU_FAST_NEON_MRC},
  1249. +    {"SlowCTZ", X264_CPU_SLOW_CTZ},
  1250. +    {"SlowAtom", X264_CPU_SLOW_ATOM},
  1251.      {"", 0},
  1252.  };
  1253.  
  1254. @@ -135,6 +137,7 @@ uint32_t x264_cpu_detect( void )
  1255.  
  1256.      if( !strcmp((char*)vendor, "AuthenticAMD") && max_extended_cap >= 0x80000001 )
  1257.      {
  1258. +        cpu |= X264_CPU_SLOW_CTZ;
  1259.          x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
  1260.          if( edx&0x00400000 )
  1261.              cpu |= X264_CPU_MMXEXT;
  1262. @@ -145,6 +148,7 @@ uint32_t x264_cpu_detect( void )
  1263.                  cpu |= X264_CPU_SSE2_IS_FAST;
  1264.                  cpu |= X264_CPU_LZCNT;
  1265.                  cpu |= X264_CPU_SHUFFLE_IS_FAST;
  1266. +                cpu &= ~X264_CPU_SLOW_CTZ;
  1267.              }
  1268.              else
  1269.                  cpu |= X264_CPU_SSE2_IS_SLOW;
  1270. @@ -159,11 +163,9 @@ uint32_t x264_cpu_detect( void )
  1271.  
  1272.      if( !strcmp((char*)vendor, "GenuineIntel") )
  1273.      {
  1274. -        int family, model, stepping;
  1275.          x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
  1276. -        family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
  1277. -        model  = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
  1278. -        stepping = eax&0xf;
  1279. +        int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
  1280. +        int model  = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
  1281.          /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
  1282.           * theoretically support sse2, but it's significantly slower than mmx for
  1283.           * almost all of x264's functions, so let's just pretend they don't. */
  1284. @@ -172,6 +174,12 @@ uint32_t x264_cpu_detect( void )
  1285.              cpu &= ~(X264_CPU_SSE2|X264_CPU_SSE3);
  1286.              assert(!(cpu&(X264_CPU_SSSE3|X264_CPU_SSE4)));
  1287.          }
  1288. +        /* Detect Atom CPU */
  1289. +        if( family == 6 && model == 28 )
  1290. +        {
  1291. +            cpu |= X264_CPU_SLOW_ATOM;
  1292. +            cpu |= X264_CPU_SLOW_CTZ;
  1293. +        }
  1294.      }
  1295.  
  1296.      if( (!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu&X264_CPU_SSE42))
  1297. diff --git a/common/dct.c b/common/dct.c
  1298. index 3917510..10fe2f7 100644
  1299. --- a/common/dct.c
  1300. +++ b/common/dct.c
  1301. @@ -457,7 +457,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
  1302.          dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
  1303.      }
  1304.  
  1305. -    if( cpu&X264_CPU_SSSE3 )
  1306. +    if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SLOW_ATOM) )
  1307.      {
  1308.          dctf->sub4x4_dct    = x264_sub4x4_dct_ssse3;
  1309.          dctf->sub8x8_dct    = x264_sub8x8_dct_ssse3;
  1310. diff --git a/common/pixel.c b/common/pixel.c
  1311. index 20c5170..5759abf 100644
  1312. --- a/common/pixel.c
  1313. +++ b/common/pixel.c
  1314. @@ -768,17 +768,20 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
  1315.  
  1316.      if( cpu&X264_CPU_SSSE3 )
  1317.      {
  1318. -        INIT7( ssd, _ssse3 );
  1319. -        INIT7( satd, _ssse3 );
  1320. -        INIT7( satd_x3, _ssse3 );
  1321. -        INIT7( satd_x4, _ssse3 );
  1322.          if( !(cpu&X264_CPU_STACK_MOD4) )
  1323.          {
  1324.              INIT4( hadamard_ac, _ssse3 );
  1325.          }
  1326.          INIT_ADS( _ssse3 );
  1327. -        pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
  1328. -        pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_ssse3;
  1329. +        if( !(cpu&X264_CPU_SLOW_ATOM) )
  1330. +        {
  1331. +            INIT7( ssd, _ssse3 );
  1332. +            pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
  1333. +            pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_ssse3;
  1334. +            INIT7( satd, _ssse3 );
  1335. +            INIT7( satd_x3, _ssse3 );
  1336. +            INIT7( satd_x4, _ssse3 );
  1337. +        }
  1338.          pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
  1339.          pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_ssse3;
  1340.          pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_ssse3;
  1341. @@ -794,7 +797,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
  1342.              INIT2( sad_x3, _cache64_ssse3 );
  1343.              INIT2( sad_x4, _cache64_ssse3 );
  1344.          }
  1345. -        if( !(cpu&X264_CPU_SHUFFLE_IS_FAST) )
  1346. +        if( cpu&X264_CPU_SLOW_ATOM || !(cpu&X264_CPU_SHUFFLE_IS_FAST) )
  1347.          {
  1348.              INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */
  1349.          }
  1350. diff --git a/common/quant.c b/common/quant.c
  1351. index ce074e2..e62fa0f 100644
  1352. --- a/common/quant.c
  1353. +++ b/common/quant.c
  1354. @@ -312,6 +312,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
  1355.          pf->quant_4x4_dc = x264_quant_4x4_dc_mmxext;
  1356.          pf->decimate_score15 = x264_decimate_score15_mmxext;
  1357.          pf->decimate_score16 = x264_decimate_score16_mmxext;
  1358. +        if( cpu&X264_CPU_SLOW_CTZ )
  1359. +        {
  1360. +            pf->decimate_score15 = x264_decimate_score15_mmxext_slowbsr;
  1361. +            pf->decimate_score16 = x264_decimate_score16_mmxext_slowbsr;
  1362. +        }
  1363.          pf->decimate_score64 = x264_decimate_score64_mmxext;
  1364.          pf->coeff_last[  DCT_LUMA_AC] = x264_coeff_last15_mmxext;
  1365.          pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmxext;
  1366. @@ -345,6 +350,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
  1367.          pf->decimate_score15 = x264_decimate_score15_sse2;
  1368.          pf->decimate_score16 = x264_decimate_score16_sse2;
  1369.          pf->decimate_score64 = x264_decimate_score64_sse2;
  1370. +        if( cpu&X264_CPU_SLOW_CTZ )
  1371. +        {
  1372. +            pf->decimate_score15 = x264_decimate_score15_sse2_slowbsr;
  1373. +            pf->decimate_score16 = x264_decimate_score16_sse2_slowbsr;
  1374. +        }
  1375.          pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
  1376.          pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
  1377.          pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
  1378. @@ -369,6 +379,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
  1379.          pf->denoise_dct = x264_denoise_dct_ssse3;
  1380.          pf->decimate_score15 = x264_decimate_score15_ssse3;
  1381.          pf->decimate_score16 = x264_decimate_score16_ssse3;
  1382. +        if( cpu&X264_CPU_SLOW_CTZ )
  1383. +        {
  1384. +            pf->decimate_score15 = x264_decimate_score15_ssse3_slowbsr;
  1385. +            pf->decimate_score16 = x264_decimate_score16_ssse3_slowbsr;
  1386. +        }
  1387.          pf->decimate_score64 = x264_decimate_score64_ssse3;
  1388.      }
  1389.  
  1390. diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
  1391. index f641cff..2171f89 100644
  1392. --- a/common/x86/mc-c.c
  1393. +++ b/common/x86/mc-c.c
  1394. @@ -427,8 +427,11 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
  1395.          return;
  1396.  
  1397.      pf->weight = x264_mc_weight_wtab_sse2;
  1398. -    pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
  1399. -    pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
  1400. +    if( !(cpu&X264_CPU_SLOW_ATOM) )
  1401. +    {
  1402. +        pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
  1403. +        pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
  1404. +    }
  1405.  
  1406.      pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
  1407.      pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
  1408. @@ -481,7 +484,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
  1409.          pf->weight = x264_mc_weight_wtab_ssse3;
  1410.      }
  1411.  
  1412. -    if( cpu&X264_CPU_SHUFFLE_IS_FAST )
  1413. +    if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) )
  1414.          pf->integral_init4v = x264_integral_init4v_ssse3;
  1415.  
  1416.      if( !(cpu&X264_CPU_SSE4) )
  1417. diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
  1418. index 3e520fa..b770adf 100644
  1419. --- a/common/x86/quant-a.asm
  1420. +++ b/common/x86/quant-a.asm
  1421. @@ -583,7 +583,7 @@ DENOISE_DCT ssse3, 7
  1422.  cextern decimate_table4
  1423.  cextern decimate_table8
  1424.  
  1425. -%macro DECIMATE4x4 2
  1426. +%macro DECIMATE4x4 3
  1427.  
  1428.  ;A LUT is faster than bsf on AMD processors, and no slower on Intel
  1429.  ;This is not true for score64.
  1430. @@ -605,6 +605,7 @@ cglobal decimate_score%1_%2, 1,3
  1431.  %if %1==15
  1432.      shr   edx, 1
  1433.  %endif
  1434. +%if %3==1
  1435.      movzx ecx, dl
  1436.      movzx eax, byte [mask_table + rcx]
  1437.      cmp   edx, ecx
  1438. @@ -617,8 +618,17 @@ cglobal decimate_score%1_%2, 1,3
  1439.      shr   edx, cl
  1440.      add    al, byte [table + rcx]
  1441.      add    al, byte [mask_table + rdx]
  1442. +%else
  1443. +.loop:
  1444. +    bsf   ecx, edx
  1445. +    shr   edx, cl
  1446. +    movzx ecx, byte [table + rcx]
  1447. +    add   eax, ecx
  1448. +    shr   edx, 1
  1449. +    jne  .loop
  1450. +%endif
  1451.  .ret:
  1452. -    REP_RET
  1453. +    RET
  1454.  .ret9:
  1455.      mov   eax, 9
  1456.      RET
  1457. @@ -627,14 +637,20 @@ cglobal decimate_score%1_%2, 1,3
  1458.  
  1459.  %ifndef ARCH_X86_64
  1460.  %define DECIMATE_MASK DECIMATE_MASK_MMX
  1461. -DECIMATE4x4 15, mmxext
  1462. -DECIMATE4x4 16, mmxext
  1463. +DECIMATE4x4 15, mmxext, 0
  1464. +DECIMATE4x4 16, mmxext, 0
  1465. +DECIMATE4x4 15, mmxext_slowbsr, 1
  1466. +DECIMATE4x4 16, mmxext_slowbsr, 1
  1467.  %endif
  1468.  %define DECIMATE_MASK DECIMATE_MASK_SSE2
  1469. -DECIMATE4x4 15, sse2
  1470. -DECIMATE4x4 15, ssse3
  1471. -DECIMATE4x4 16, sse2
  1472. -DECIMATE4x4 16, ssse3
  1473. +DECIMATE4x4 15, sse2, 0
  1474. +DECIMATE4x4 16, sse2, 0
  1475. +DECIMATE4x4 15, sse2_slowbsr, 1
  1476. +DECIMATE4x4 16, sse2_slowbsr, 1
  1477. +DECIMATE4x4 15, ssse3, 0
  1478. +DECIMATE4x4 16, ssse3, 0
  1479. +DECIMATE4x4 15, ssse3_slowbsr, 1
  1480. +DECIMATE4x4 16, ssse3_slowbsr, 1
  1481.  
  1482.  %macro DECIMATE8x8 1
  1483.  
  1484. diff --git a/common/x86/quant.h b/common/x86/quant.h
  1485. index 4e42b81..4ffd684 100644
  1486. --- a/common/x86/quant.h
  1487. +++ b/common/x86/quant.h
  1488. @@ -57,6 +57,12 @@ int x264_decimate_score15_ssse3 ( int16_t *dct );
  1489.  int x264_decimate_score16_mmxext( int16_t *dct );
  1490.  int x264_decimate_score16_sse2  ( int16_t *dct );
  1491.  int x264_decimate_score16_ssse3 ( int16_t *dct );
  1492. +int x264_decimate_score15_mmxext_slowbsr( int16_t *dct );
  1493. +int x264_decimate_score15_sse2_slowbsr  ( int16_t *dct );
  1494. +int x264_decimate_score15_ssse3_slowbsr ( int16_t *dct );
  1495. +int x264_decimate_score16_mmxext_slowbsr( int16_t *dct );
  1496. +int x264_decimate_score16_sse2_slowbsr  ( int16_t *dct );
  1497. +int x264_decimate_score16_ssse3_slowbsr ( int16_t *dct );
  1498.  int x264_decimate_score64_mmxext( int16_t *dct );
  1499.  int x264_decimate_score64_sse2  ( int16_t *dct );
  1500.  int x264_decimate_score64_ssse3 ( int16_t *dct );
  1501. diff --git a/encoder/macroblock.c b/encoder/macroblock.c
  1502. index b7e5f34..984f8a8 100644
  1503. --- a/encoder/macroblock.c
  1504. +++ b/encoder/macroblock.c
  1505. @@ -997,10 +997,7 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
  1506.          /* calculate dct coeffs */
  1507.          for( int i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
  1508.          {
  1509. -            /* We don't need to zero the DC coefficient before quantization because we already
  1510. -             * checked that all the DCs were zero above at twice the precision that quant4x4
  1511. -             * uses.  This applies even though the DC here is being quantized before the 2x2
  1512. -             * transform. */
  1513. +            dct4x4[i4x4][0] = 0;
  1514.              if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ) )
  1515.                  continue;
  1516.              h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
  1517. diff --git a/tools/checkasm.c b/tools/checkasm.c
  1518. index 6469017..a0a9d54 100644
  1519. --- a/tools/checkasm.c
  1520. +++ b/tools/checkasm.c
  1521. @@ -173,7 +173,9 @@ static void print_bench(void)
  1522.                      b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
  1523.                      b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
  1524.                      b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
  1525. -                    b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" : "",
  1526. +                    b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" :
  1527. +                    b->cpu&X264_CPU_SLOW_CTZ ? "_slow_ctz" :
  1528. +                    b->cpu&X264_CPU_SLOW_ATOM ? "_slow_atom" : "",
  1529.                      ((int64_t)10*b->cycles/b->den - nop_time)/4 );
  1530.          }
  1531.  }
  1532. @@ -1700,6 +1702,8 @@ static int check_all_flags( void )
  1533.              ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX_LZCNT" );
  1534.              cpu1 &= ~X264_CPU_LZCNT;
  1535.          }
  1536. +        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" );
  1537. +        cpu1 &= ~X264_CPU_SLOW_CTZ;
  1538.      }
  1539.      if( x264_cpu_detect() & X264_CPU_SSE2 )
  1540.      {
  1541. @@ -1708,6 +1712,10 @@ static int check_all_flags( void )
  1542.          ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
  1543.          ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSE2 FastShuffle" );
  1544.          cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
  1545. +        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" );
  1546. +        cpu1 &= ~X264_CPU_SLOW_CTZ;
  1547. +        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSE2 SlowAtom" );
  1548. +        cpu1 &= ~X264_CPU_SLOW_ATOM;
  1549.      }
  1550.      if( x264_cpu_detect() & X264_CPU_SSE_MISALIGN )
  1551.      {
  1552. @@ -1730,6 +1738,10 @@ static int check_all_flags( void )
  1553.          ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
  1554.          ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSSE3 FastShuffle" );
  1555.          cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
  1556. +        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSSE3 SlowCTZ" );
  1557. +        cpu1 &= ~X264_CPU_SLOW_CTZ;
  1558. +        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSSE3 SlowAtom" );
  1559. +        cpu1 &= ~X264_CPU_SLOW_ATOM;
  1560.      }
  1561.      if( x264_cpu_detect() & X264_CPU_SSE4 )
  1562.      {
  1563. diff --git a/x264.h b/x264.h
  1564. index f714b72..6d7b703 100644
  1565. --- a/x264.h
  1566. +++ b/x264.h
  1567. @@ -66,6 +66,8 @@ typedef struct x264_t x264_t;
  1568.  #define X264_CPU_ARMV6          0x020000
  1569.  #define X264_CPU_NEON           0x040000  /* ARM NEON */
  1570.  #define X264_CPU_FAST_NEON_MRC  0x080000  /* Transfer from NEON to ARM register is fast (Cortex-A9) */
  1571. +#define X264_CPU_SLOW_CTZ       0x100000  /* BSR/BSF x86 instructions are really slow on some CPUs */
  1572. +#define X264_CPU_SLOW_ATOM      0x200000  /* The Atom just sucks */
  1573.  
  1574.  /* Analyse flags
  1575.   */
  1576. --
  1577. 1.7.0.4
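
[Editor's note] The Atom check in the cpu.c hunk of patch 10 is just CPUID leaf 1 family/model decoding (family 6, model 28 = first-generation Atom). The snippet below reproduces that arithmetic as a standalone program so it can be verified outside the encoder; it uses GCC's <cpuid.h> rather than x264's own x264_cpu_cpuid() wrapper, so treat it as an illustration of the detection logic, not the encoder's code path.

#include <stdio.h>
#include <cpuid.h>

int main( void )
{
    unsigned eax, ebx, ecx, edx;
    if( !__get_cpuid( 1, &eax, &ebx, &ecx, &edx ) )
        return 1;
    /* Extended family/model decoding, same arithmetic as the patch. */
    int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
    int model  = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
    int slow_atom = (family == 6 && model == 28);
    printf( "family %d model %d%s\n", family, model,
            slow_atom ? " (Atom: would set SLOW_ATOM + SLOW_CTZ)" : "" );
    return 0;
}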