diff --git a/common/common.h b/common/common.h
index 5e34212..cf5c1e4 100644
--- a/common/common.h
+++ b/common/common.h
@@ -56,6 +56,7 @@ do {\
#define X264_BFRAME_MAX 16
#define X264_REF_MAX 16
#define X264_THREAD_MAX 128
+#define X264_LOOKAHEAD_THREAD_MAX 16 /* upper bound that param.i_lookahead_threads is clipped to; also sizes the per-slice arrays in x264_slicetype_frame_cost() */
#define X264_PCM_COST (FRAME_SIZE(256*BIT_DEPTH)+16)
#define X264_LOOKAHEAD_MAX 250
#define QP_BD_OFFSET (6*(BIT_DEPTH-8))
diff --git a/common/macroblock.c b/common/macroblock.c
index 11c3e75..f175fef 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -396,6 +396,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
}
int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+7)&~7) * sizeof(int);
scratch_size = X264_MAX( scratch_size, buf_mbtree );
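+ /* TODO(lookahead threads): unfinished scratch sizing, not yet folded into scratch_size. Draft expression was: (h->mb.i_mb_height + (h->param.i_lookahead_threads-1) / h->param.i_lookahead_threads) * sizeof(int) -- if a per-thread row count (ceil division) was intended, the parentheses are misplaced; confirm before enabling. */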
if( scratch_size )
CHECKED_MALLOC( h->scratch_buffer, scratch_size );
else
diff --git a/common/threadpool.c b/common/threadpool.c
index f7a95fc..61e5b15 100644
--- a/common/threadpool.c
+++ b/common/threadpool.c
@@ -83,7 +83,7 @@ int x264_threadpool_init( x264_threadpool_t **p_pool, int threads,
pool->init_func = init_func;
pool->init_arg = init_arg;
- pool->threads = X264_MIN( threads, X264_THREAD_MAX );
+ pool->threads = threads;
CHECKED_MALLOC( pool->thread_handle, pool->threads * sizeof(x264_pthread_t) );
diff --git a/encoder/encoder.c b/encoder/encoder.c
index b42d5dc..f0217fc 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -494,6 +494,8 @@ static int x264_validate_parameters( x264_t *h, int b_open )
if( h->param.i_threads == X264_THREADS_AUTO )
h->param.i_threads = x264_cpu_num_processors() * (h->param.b_sliced_threads?2:3)/2;
+ if( h->param.i_lookahead_threads == X264_THREADS_AUTO )
+ h->param.i_lookahead_threads = h->param.i_threads / (h->param.b_sliced_threads?1:4);
if( h->param.i_threads > 1 )
{
#if !HAVE_THREAD
@@ -509,8 +511,12 @@ static int x264_validate_parameters( x264_t *h, int b_open )
}
}
h->param.i_threads = x264_clip3( h->param.i_threads, 1, X264_THREAD_MAX );
+ h->param.i_lookahead_threads = x264_clip3( h->param.i_lookahead_threads, 1, X264_LOOKAHEAD_THREAD_MAX );
if( h->param.i_threads == 1 )
+ {
h->param.b_sliced_threads = 0;
+ h->param.i_lookahead_threads = 1;
+ }
h->i_thread_frames = h->param.b_sliced_threads ? 1 : h->param.i_threads;
if( h->i_thread_frames > 1 )
h->param.nalu_process = NULL;
@@ -1268,8 +1274,9 @@ x264_t *x264_encoder_open( x264_param_t *param )
h->nal_buffer_size = h->out.i_bitstream * 3/2 + 4;
CHECKED_MALLOC( h->nal_buffer, h->nal_buffer_size );
+ int total_threads = h->param.i_threads + h->param.i_lookahead_threads * (h->param.i_lookahead_threads > 1);
if( h->param.i_threads > 1 &&
- x264_threadpool_init( &h->threadpool, h->param.i_threads, (void*)x264_encoder_thread_init, h ) )
+ x264_threadpool_init( &h->threadpool, total_threads, (void*)x264_encoder_thread_init, h ) )
goto fail;
h->thread[0] = h;
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index 1aa4891..495d094 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -571,7 +571,7 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
#define MVC(mv) { CP32( mvc[i_mvc], mv ); i_mvc++; }
if( i_mb_x < h->mb.i_mb_width - 1 )
MVC( fenc_mv[1] );
- if( i_mb_y < h->mb.i_mb_height - 1 )
+ if( i_mb_y < h->i_threadslice_end - 1 )
{
MVC( fenc_mv[i_mb_stride] );
if( i_mb_x > 0 )
@@ -701,6 +701,55 @@ lowres_intra_mb:
(h->mb.i_mb_width - 2) * (h->mb.i_mb_height - 2) :\
h->mb.i_mb_width * h->mb.i_mb_height)
+typedef struct /* Argument bundle for x264_slicetype_slice_cost(): one instance describes one slice of MB rows. */
+{
+ x264_t *h; /* encoder context; its i_threadslice_start/i_threadslice_end select the rows to process */
+ x264_mb_analysis_t *a; /* forwarded unchanged to x264_slicetype_mb_cost() */
+ x264_frame_t **frames; /* frame array; frames[b] is the frame whose costs are computed */
+ int p0; /* with b, selects the i_row_satds[b-p0][...] slot; presumably the list0 reference index -- confirm */
+ int p1; /* with b, selects the i_row_satds[...][p1-b] slot; presumably the list1 reference index -- confirm */
+ int b; /* index into frames[] of the frame being analysed */
+ int dist_scale_factor; /* forwarded unchanged to x264_slicetype_mb_cost() */
+ int *do_search; /* forwarded to x264_slicetype_mb_cost(); the caller passes its do_search[] flags */
+ const x264_weight_t *w; /* forwarded unchanged to x264_slicetype_mb_cost() */
+} x264_slicetype_slice_t;
+
+static void x264_slicetype_slice_cost( x264_slicetype_slice_t *s ) /* Run x264_slicetype_mb_cost() over the MB rows [h->i_threadslice_start, h->i_threadslice_end) of frames[b]. */
+{
+ x264_frame_t **frames = s->frames;
+ x264_t *h = s->h;
+ int p0 = s->p0;
+ int p1 = s->p1;
+ int b = s->b;
+ /* Lowres lookahead goes backwards because the MVs are used as predictors in the main encode.
+ * This considerably improves MV prediction overall. */
+ /* Loop counters are stored in h->mb.i_mb_x/i_mb_y; presumably x264_slicetype_mb_cost() reads the position from there -- confirm. */
+ /* The edge mbs seem to reduce the predictive quality of the
+ * whole frame's score, but are needed for a spatial distribution. */
+ if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size ||
+ h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2 ) /* these cases take the path that also visits the border MBs */
+ {
+ int *row_satd = frames[b]->i_row_satds[b-p0][p1-b]; /* per-row array for this (b-p0, p1-b) pair, indexed by MB row */
+ int *row_satd_intra = frames[b]->i_row_satds[0][0]; /* the [0][0] slot, indexed by MB row */
+ for( h->mb.i_mb_y = h->i_threadslice_end - 1; h->mb.i_mb_y >= h->i_threadslice_start; h->mb.i_mb_y-- ) /* every row of the slice, last row first */
+ {
+ row_satd[h->mb.i_mb_y] = 0; /* cleared before the row is visited; presumably accumulated by x264_slicetype_mb_cost() -- confirm */
+ if( !frames[b]->b_intra_calculated ) /* leave the [0][0] row entry untouched once this flag is set */
+ row_satd_intra[h->mb.i_mb_y] = 0;
+ for( h->mb.i_mb_x = h->mb.i_mb_width - 1; h->mb.i_mb_x >= 0; h->mb.i_mb_x-- ) /* every column, right to left */
+ x264_slicetype_mb_cost( h, s->a, frames, p0, p1, b, s->dist_scale_factor, s->do_search, s->w );
+ }
+ }
+ else /* otherwise only interior MBs are visited */
+ {
+ int start_row = X264_MIN( h->i_threadslice_end - 1, h->mb.i_mb_height - 2 ); /* clamp so the frame's last row is skipped */
+ int end_row = X264_MAX( h->i_threadslice_start, 1 ); /* clamp so row 0 is skipped */
+ for( h->mb.i_mb_y = start_row; h->mb.i_mb_y >= end_row; h->mb.i_mb_y-- )
+ for( h->mb.i_mb_x = h->mb.i_mb_width - 2; h->mb.i_mb_x >= 1; h->mb.i_mb_x-- ) /* columns width-2 down to 1: first and last columns skipped */
+ x264_slicetype_mb_cost( h, s->a, frames, p0, p1, b, s->dist_scale_factor, s->do_search, s->w );
+ }
+}
+
static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
x264_frame_t **frames, int p0, int p1, int b,
int b_intra_penalty )
@@ -717,8 +766,6 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
else
{
int dist_scale_factor = 128;
- int *row_satd = frames[b]->i_row_satds[b-p0][p1-b];
- int *row_satd_intra = frames[b]->i_row_satds[0][0];
/* For each list, check to see whether we have lowres motion-searched this reference frame before. */
do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
@@ -748,28 +795,29 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
frames[b]->i_cost_est[b-p0][p1-b] = 0;
frames[b]->i_cost_est_aq[b-p0][p1-b] = 0;
- /* Lowres lookahead goes backwards because the MVs are used as predictors in the main encode.
- * This considerably improves MV prediction overall. */
-
- /* The edge mbs seem to reduce the predictive quality of the
- * whole frame's score, but are needed for a spatial distribution. */
- if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size ||
- h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2 )
+ if( h->param.i_lookahead_threads > 1 )
{
- for( h->mb.i_mb_y = h->mb.i_mb_height - 1; h->mb.i_mb_y >= 0; h->mb.i_mb_y-- )
+ x264_slicetype_slice_t s[X264_LOOKAHEAD_THREAD_MAX];
+ ALIGNED_16( x264_t temp_struct[X264_LOOKAHEAD_THREAD_MAX] );
+
+ for( int i = 0; i < h->param.i_lookahead_threads; i++ )
{
- row_satd[h->mb.i_mb_y] = 0;
- if( !frames[b]->b_intra_calculated )
- row_satd_intra[h->mb.i_mb_y] = 0;
- for( h->mb.i_mb_x = h->mb.i_mb_width - 1; h->mb.i_mb_x >= 0; h->mb.i_mb_x-- )
- x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
+ x264_t *t = &temp_struct[i];
+ memcpy( t, h, sizeof(x264_t) );
+ s[i] = (x264_slicetype_slice_t){ t, a, frames, p0, p1, b, dist_scale_factor, do_search, w };
+ t->i_threadslice_start = ((h->mb.i_mb_height * i + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
+ t->i_threadslice_end = ((h->mb.i_mb_height * (i+1) + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
+ x264_threadpool_run( h->threadpool, (void*)x264_slicetype_slice_cost, &s[i] );
}
+ for( int i = 0; i < h->param.i_lookahead_threads; i++ )
+ x264_threadpool_wait( h->threadpool, &s[i] );
}
else
{
- for( h->mb.i_mb_y = h->mb.i_mb_height - 2; h->mb.i_mb_y >= 1; h->mb.i_mb_y-- )
- for( h->mb.i_mb_x = h->mb.i_mb_width - 2; h->mb.i_mb_x >= 1; h->mb.i_mb_x-- )
- x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
+ h->i_threadslice_start = 0;
+ h->i_threadslice_end = h->mb.i_mb_height;
+ x264_slicetype_slice_t s = (x264_slicetype_slice_t){ h, a, frames, p0, p1, b, dist_scale_factor, do_search, w };
+ x264_slicetype_slice_cost( &s );
}
i_score = frames[b]->i_cost_est[b-p0][p1-b];
diff --git a/x264.h b/x264.h
index eb2b3b7..b6c258e 100644
--- a/x264.h
+++ b/x264.h
@@ -41,7 +41,7 @@
#include "x264_config.h"
-#define X264_BUILD 124
+#define X264_BUILD 125
/* Application developers planning to link against a shared library version of
* libx264 from a Microsoft Visual Studio or similar development environment
@@ -254,7 +254,8 @@ typedef struct x264_param_t
{
/* CPU flags */
unsigned int cpu;
- int i_threads; /* encode multiple frames in parallel */
+ int i_threads; /* encode multiple frames in parallel */
+ int i_lookahead_threads; /* multiple threads for lookahead analysis */
int b_sliced_threads; /* Whether to use slice-based threading. */
int b_deterministic; /* whether to allow non-deterministic optimizations when threaded */
int b_cpu_independent; /* force canonical behavior rather than cpu-dependent optimal algorithms */