Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- From ee75acd55e1a89eb9a8c2f7d14c923b57e08ceb3 Mon Sep 17 00:00:00 2001
- From: Sean McGovern <gseanmcg@gmail.com>
- Date: Sun, 10 Oct 2010 19:34:18 -0400
- Subject: [PATCH 1/9] Fix build on SPARC Solaris 10
- ---
- common/pixel.c | 6 +++---
- configure | 29 +++++++++++++++++------------
- 2 files changed, 20 insertions(+), 15 deletions(-)
- diff --git a/common/pixel.c b/common/pixel.c
- index 1e21550..7fa497c 100644
- --- a/common/pixel.c
- +++ b/common/pixel.c
- @@ -36,7 +36,7 @@
- #if ARCH_ARM
- # include "arm/pixel.h"
- #endif
- -#if ARCH_UltraSparc
- +#if ARCH_UltraSPARC
- # include "sparc/pixel.h"
- #endif
- @@ -443,7 +443,7 @@ SAD_X( 4x8 )
- SAD_X( 4x4 )
- #if !HIGH_BIT_DEPTH
- -#if ARCH_UltraSparc
- +#if ARCH_UltraSPARC
- SAD_X( 16x16_vis )
- SAD_X( 16x8_vis )
- SAD_X( 8x16_vis )
- @@ -1063,7 +1063,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
- }
- #endif
- #if !HIGH_BIT_DEPTH
- -#if ARCH_UltraSparc
- +#if ARCH_UltraSPARC
- INIT4( sad, _vis );
- INIT4( sad_x3, _vis );
- INIT4( sad_x4, _vis );
- diff --git a/configure b/configure
- index 2f38154..fd62337 100755
- --- a/configure
- +++ b/configure
- @@ -392,15 +392,20 @@ case $host_cpu in
- fi
- ;;
- sparc)
- - if [ $asm = auto ] && test "$(uname -m)" = "sun4u"; then
- - ARCH="UltraSparc"
- - CFLAGS="$CFLAGS -mcpu=ultrasparc"
- - LDFLAGS="$LDFLAGS -mcpu=ultrasparc"
- - AS="${AS-${cross_prefix}as}"
- - ASFLAGS="$ASFLAGS -xarch=v8plusa"
- - else
- - ARCH="Sparc"
- - fi
- + ARCH="SPARC"
- + case $(uname -m) in
- + sun4u|sun4v)
- + if [ $asm = auto ]; then
- + ARCH="UltraSPARC"
- + if ! echo $CFLAGS | grep -Eq '\-mcpu' ; then
- + CFLAGS="$CFLAGS -mcpu=ultrasparc"
- + LDFLAGS="$LDFLAGS -mcpu=ultrasparc"
- + fi
- + AS="${AS-${cross_prefix}as}"
- + ASFLAGS="$ASFLAGS -xarch=v8plusa"
- + fi
- + ;;
- + esac
- ;;
- mips|mipsel|mips64|mips64el)
- ARCH="MIPS"
- @@ -497,11 +502,11 @@ fi
- define ARCH_$ARCH
- define SYS_$SYS
- -echo "int i = 0x42494745; double f = 0x1.0656e6469616ep+102;" > conftest.c
- +echo "int i[2] = {0x42494745,0}; double f[2] = {0x1.0656e6469616ep+102,0};" > conftest.c
- $CC $CFLAGS conftest.c -c -o conftest.o 2>$DEVNULL || die "endian test failed"
- -if grep -q BIGE conftest.o && grep -q FPendian conftest.o ; then
- +if (strings -a conftest.o | grep -q BIGE) && (strings -a conftest.o | grep -q FPendian) ; then
- define WORDS_BIGENDIAN
- -elif !(grep -q EGIB conftest.o && grep -q naidnePF conftest.o) ; then
- +elif !(strings -a conftest.o | grep -q EGIB && strings -a conftest.o | grep -q naidnePF) ; then
- die "endian test failed"
- fi
- --
- 1.7.3.2.146.gca209
- From b38fd04c376bac31544782fabe03471567a8badf Mon Sep 17 00:00:00 2001
- From: Sean McGovern <gseanmcg@gmail.com>
- Date: Sun, 21 Nov 2010 01:59:33 -0500
- Subject: [PATCH 2/9] Only build SPARC VIS asm if high bit-depth is disabled
- ---
- Makefile | 2 ++
- 1 files changed, 2 insertions(+), 0 deletions(-)
- diff --git a/Makefile b/Makefile
- index 8a3a327..21f57e7 100644
- --- a/Makefile
- +++ b/Makefile
- @@ -110,9 +110,11 @@ endif
- # VIS optims
- ifeq ($(ARCH),UltraSparc)
- +ifeq ($(findstring HIGH_BIT_DEPTH, $(CONFIG)),)
- ASMSRC += common/sparc/pixel.asm
- OBJASM = $(ASMSRC:%.asm=%.o)
- endif
- +endif
- ifneq ($(HAVE_GETOPT_LONG),1)
- SRCCLI += extras/getopt.c
- --
- 1.7.3.2.146.gca209
- From bb45211ef98ae8b382a15d4b03c2dc5c8d8f67e0 Mon Sep 17 00:00:00 2001
- From: Steven Walters <kemuri9@gmail.com>
- Date: Mon, 22 Nov 2010 10:31:05 +0900
- Subject: [PATCH 3/9] Fix configure so that boolean configuration options are 1/0
- There are many cases of 1/undef, not 1/0.
- ---
- Makefile | 12 ++++++------
- common/arm/asm.S | 2 +-
- common/bitstream.c | 4 ++--
- common/macroblock.h | 2 +-
- configure | 15 +++++++++------
- 5 files changed, 19 insertions(+), 16 deletions(-)
- diff --git a/Makefile b/Makefile
- index 21f57e7..82f0bbc 100644
- --- a/Makefile
- +++ b/Makefile
- @@ -25,29 +25,29 @@ SRCSO =
- CONFIG := $(shell cat config.h)
- # GPL-only files
- -ifeq ($(GPL),yes)
- +ifneq ($(findstring HAVE_GPL 1, $(CONFIG)),)
- SRCCLI +=
- endif
- # Optional module sources
- -ifneq ($(findstring HAVE_AVS, $(CONFIG)),)
- +ifneq ($(findstring HAVE_AVS 1, $(CONFIG)),)
- SRCCLI += input/avs.c
- endif
- -ifneq ($(findstring HAVE_PTHREAD, $(CONFIG)),)
- +ifneq ($(findstring HAVE_PTHREAD 1, $(CONFIG)),)
- SRCCLI += input/thread.c
- SRCS += common/threadpool.c
- endif
- -ifneq ($(findstring HAVE_LAVF, $(CONFIG)),)
- +ifneq ($(findstring HAVE_LAVF 1, $(CONFIG)),)
- SRCCLI += input/lavf.c
- endif
- -ifneq ($(findstring HAVE_FFMS, $(CONFIG)),)
- +ifneq ($(findstring HAVE_FFMS 1, $(CONFIG)),)
- SRCCLI += input/ffms.c
- endif
- -ifneq ($(findstring HAVE_GPAC, $(CONFIG)),)
- +ifneq ($(findstring HAVE_GPAC 1, $(CONFIG)),)
- SRCCLI += output/mp4.c
- endif
- diff --git a/common/arm/asm.S b/common/arm/asm.S
- index 7434262..92e3b14 100644
- --- a/common/arm/asm.S
- +++ b/common/arm/asm.S
- @@ -65,7 +65,7 @@ ELF .type \name, %function
- .endm
- .macro movconst rd, val
- -#ifdef HAVE_ARMV6T2
- +#if HAVE_ARMV6T2
- movw \rd, #:lower16:\val
- .if \val >> 16
- movt \rd, #:upper16:\val
- diff --git a/common/bitstream.c b/common/bitstream.c
- index 8350fb3..0f2bc9f 100644
- --- a/common/bitstream.c
- +++ b/common/bitstream.c
- @@ -39,7 +39,7 @@ static uint8_t *x264_nal_escape_c( uint8_t *dst, uint8_t *src, uint8_t *end )
- return dst;
- }
- -#ifdef HAVE_MMX
- +#if HAVE_MMX
- uint8_t *x264_nal_escape_mmxext( uint8_t *dst, uint8_t *src, uint8_t *end );
- uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
- #endif
- @@ -88,7 +88,7 @@ void x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal )
- void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
- {
- pf->nal_escape = x264_nal_escape_c;
- -#ifdef HAVE_MMX
- +#if HAVE_MMX
- if( cpu&X264_CPU_MMXEXT )
- pf->nal_escape = x264_nal_escape_mmxext;
- if( (cpu&X264_CPU_SSE2) && (cpu&X264_CPU_SSE2_IS_FAST) )
- diff --git a/common/macroblock.h b/common/macroblock.h
- index 7562948..ce4ead9 100644
- --- a/common/macroblock.h
- +++ b/common/macroblock.h
- @@ -364,7 +364,7 @@ static ALWAYS_INLINE uint32_t pack16to32_mask( int a, int b )
- }
- static ALWAYS_INLINE uint64_t pack32to64( uint32_t a, uint32_t b )
- {
- -#ifdef WORDS_BIGENDIAN
- +#if WORDS_BIGENDIAN
- return b + ((uint64_t)a<<32);
- #else
- return a + ((uint64_t)b<<32);
- diff --git a/configure b/configure
- index fd62337..ef33323 100755
- --- a/configure
- +++ b/configure
- @@ -171,6 +171,9 @@ cross_prefix=""
- EXE=""
- +# list of all preprocessor HAVE values we can define
- +CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON PTHREAD LOG2F VISUALIZE SWSCALE LAVF FFMS GPAC GF_MALLOC AVS GPL"
- +
- # parse options
- for opt do
- @@ -735,11 +738,12 @@ fi
- define BIT_DEPTH $bit_depth
- ASFLAGS="$ASFLAGS -DBIT_DEPTH=$bit_depth"
- -if [ $gpl = yes ]; then
- - define HAVE_GPL 1
- -else
- - define HAVE_GPL 0
- -fi
- +[ $gpl = yes ] && define HAVE_GPL
- +
- +# Define every HAVE_* value that was not already set to 1 as 0
- +for var in $CONFIG_HAVE; do
- + grep -q "HAVE_$var 1" config.h || define HAVE_$var 0
- +done
- rm -f conftest*
- @@ -766,7 +770,6 @@ EXE=$EXE
- VIS=$vis
- HAVE_GETOPT_LONG=$HAVE_GETOPT_LONG
- DEVNULL=$DEVNULL
- -GPL=$gpl
- EOF
- if [ "$shared" = "yes" ]; then
- --
- 1.7.3.2.146.gca209
- From 8363900cb951db4acc1ed82fac2ede3996c4703f Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Sat, 20 Nov 2010 23:30:42 -0800
- Subject: [PATCH 4/9] Change qpmin default to 0
- There's probably no real reason to keep it at 10 anymore, and lowering it allows AQ to pick lower quantizers in really flat areas.
- Might help on gradients at high quality levels.
- The previous value of 10 was arbitrary anyway.
- ---
- common/common.c | 2 +-
- 1 files changed, 1 insertions(+), 1 deletions(-)
- diff --git a/common/common.c b/common/common.c
- index 6c88556..1f99e9e 100644
- --- a/common/common.c
- +++ b/common/common.c
- @@ -98,7 +98,7 @@ void x264_param_default( x264_param_t *param )
- param->rc.f_vbv_buffer_init = 0.9;
- param->rc.i_qp_constant = 23 + QP_BD_OFFSET;
- param->rc.f_rf_constant = 23;
- - param->rc.i_qp_min = 10;
- + param->rc.i_qp_min = 0;
- param->rc.i_qp_max = QP_MAX;
- param->rc.i_qp_step = 4;
- param->rc.f_ip_factor = 1.4;
- --
- 1.7.3.2.146.gca209
- From 07c6b3fd5028057ad8f6511e0773506a2bba69b9 Mon Sep 17 00:00:00 2001
- From: Yasuhiro Ikeda <wipple625@gmail.com>
- Date: Mon, 22 Nov 2010 11:01:57 +0900
- Subject: [PATCH 5/9] Add some more info to `x264 --version`
- ---
- x264.c | 23 ++++++++++++++++++++---
- 1 files changed, 20 insertions(+), 3 deletions(-)
- diff --git a/x264.c b/x264.c
- index bba17b8..f9a1c6a 100644
- --- a/x264.c
- +++ b/x264.c
- @@ -55,6 +55,14 @@
- #include <libavutil/pixdesc.h>
- #endif
- +#if HAVE_SWSCALE
- +#include <libswscale/swscale.h>
- +#endif
- +
- +#if HAVE_FFMS
- +#include <ffms.h>
- +#endif
- +
- /* Ctrl-C handler */
- static volatile int b_ctrl_c = 0;
- static int b_exit_on_ctrl_c = 0;
- @@ -208,6 +216,15 @@ static void print_version_info()
- #else
- printf( "x264 0.%d.X\n", X264_BUILD );
- #endif
- +#if HAVE_SWSCALE
- + printf( "(libswscale %d.%d.%d)\n", LIBSWSCALE_VERSION_MAJOR, LIBSWSCALE_VERSION_MINOR, LIBSWSCALE_VERSION_MICRO );
- +#endif
- +#if HAVE_LAVF
- + printf( "(libavformat %d.%d.%d)\n", LIBAVFORMAT_VERSION_MAJOR, LIBAVFORMAT_VERSION_MINOR, LIBAVFORMAT_VERSION_MICRO );
- +#endif
- +#if HAVE_FFMS
- + printf( "(ffmpegsource %d.%d.%d.%d)\n", FFMS_VERSION >> 24, (FFMS_VERSION & 0xff0000) >> 16, (FFMS_VERSION & 0xff00) >> 8, FFMS_VERSION & 0xff );
- +#endif
- printf( "built on " __DATE__ ", " );
- #ifdef __GNUC__
- printf( "gcc: " __VERSION__ "\n" );
- @@ -221,9 +238,9 @@ static void print_version_info()
- #else
- printf( "Non-GPL commercial\n" );
- #endif
- -#if HAVE_LAVF
- - const char *license = avformat_license();
- - printf( "libavformat license: %s\n", license );
- +#if HAVE_SWSCALE
- + const char *license = swscale_license();
- + printf( "libswscale%s%s license: %s\n",HAVE_LAVF ? "/libavformat" : "", HAVE_FFMS ? "/ffmpegsource" : "" , license );
- if( !strcmp( license, "nonfree and unredistributable" ) ||
- (!HAVE_GPL && (!strcmp( license, "GPL version 2 or later" )
- || !strcmp( license, "GPL version 3 or later" ))))
- --
- 1.7.3.2.146.gca209
- From b4650b65511b1f5c348d298abda02db99e7fcd16 Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Fri, 19 Nov 2010 16:58:38 -0800
- Subject: [PATCH 6/9] Add API function to return max number of delayed frames
- ---
- encoder/encoder.c | 8 +++++---
- x264.h | 6 +++++-
- 2 files changed, 10 insertions(+), 4 deletions(-)
- diff --git a/encoder/encoder.c b/encoder/encoder.c
- index 8b14b41..2d5c778 100644
- --- a/encoder/encoder.c
- +++ b/encoder/encoder.c
- @@ -3320,9 +3320,6 @@ void x264_encoder_close ( x264_t *h )
- }
- }
- -/****************************************************************************
- - * x264_encoder_delayed_frames:
- - ****************************************************************************/
- int x264_encoder_delayed_frames( x264_t *h )
- {
- int delayed_frames = 0;
- @@ -3343,3 +3340,8 @@ int x264_encoder_delayed_frames( x264_t *h )
- x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
- return delayed_frames;
- }
- +
- +int x264_encoder_maximum_delayed_frames( x264_t *h )
- +{
- + return h->frames.i_delay;
- +}
- diff --git a/x264.h b/x264.h
- index ce79d40..e144e51 100644
- --- a/x264.h
- +++ b/x264.h
- @@ -39,7 +39,7 @@
- #include <stdarg.h>
- -#define X264_BUILD 108
- +#define X264_BUILD 109
- /* x264_t:
- * opaque handler for encoder */
- @@ -766,6 +766,10 @@ void x264_encoder_close ( x264_t * );
- * return the number of currently delayed (buffered) frames
- * this should be used at the end of the stream, to know when you have all the encoded frames. */
- int x264_encoder_delayed_frames( x264_t * );
- +/* x264_encoder_maximum_delayed_frames( x264_t *h ):
- + * return the maximum number of delayed (buffered) frames that can occur with the current
- + * parameters. */
- +int x264_encoder_maximum_delayed_frames( x264_t *h );
- /* x264_encoder_intra_refresh:
- * If an intra refresh is not in progress, begin one with the next P-frame.
- * If an intra refresh is in progress, begin one as soon as the current one finishes.
- --
- 1.7.3.2.146.gca209
- From 3797b68a8201c74a321883f90536ce30a251e187 Mon Sep 17 00:00:00 2001
- From: Anton Mitrofanov <BugMaster@narod.ru>
- Date: Tue, 23 Nov 2010 23:06:51 +0300
- Subject: [PATCH 7/9] Clean up of weights analyse function
- ---
- encoder/slicetype.c | 42 +++++++++++++++++++-----------------------
- 1 files changed, 19 insertions(+), 23 deletions(-)
- diff --git a/encoder/slicetype.c b/encoder/slicetype.c
- index edf74c4..dd6c360 100644
- --- a/encoder/slicetype.c
- +++ b/encoder/slicetype.c
- @@ -57,7 +57,7 @@ static void x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a )
- }
- /* makes a non-h264 weight (i.e. fix7), into an h264 weight */
- -static void x264_weight_get_h264( unsigned int weight_nonh264, int offset, x264_weight_t *w )
- +static void x264_weight_get_h264( int weight_nonh264, int offset, x264_weight_t *w )
- {
- w->i_offset = offset;
- w->i_denom = 7;
- @@ -221,39 +221,37 @@ static NOINLINE unsigned int x264_weight_cost_chroma( x264_t *h, x264_frame_t *f
- void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead )
- {
- - float fenc_mean, ref_mean, fenc_var, ref_var;
- - int offset_search;
- - int minoff, minscale, mindenom;
- - unsigned int minscore, origscore;
- int i_delta_index = fenc->i_frame - ref->i_frame - 1;
- /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
- - const float epsilon = 1.0/128.0;
- - float guess_scale;
- - int found;
- + const float epsilon = 1.f/128.f;
- x264_weight_t *weights = fenc->weight[0];
- + SET_WEIGHT( weights[0], 0, 1, 0, 0 );
- SET_WEIGHT( weights[1], 0, 1, 0, 0 );
- SET_WEIGHT( weights[2], 0, 1, 0, 0 );
- /* Don't check chroma in lookahead, or if there wasn't a luma weight. */
- for( int plane = 0; plane <= 2 && !( plane && ( !weights[0].weightfn || b_lookahead ) ); plane++ )
- {
- - fenc_var = round( sqrt( fenc->i_pixel_ssd[plane] ) );
- - ref_var = round( sqrt( ref->i_pixel_ssd[plane] ) );
- - fenc_mean = (float)fenc->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]);
- - ref_mean = (float) ref->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]);
- + int offset_search;
- + int minoff, minscale, mindenom;
- + unsigned int minscore, origscore;
- + int found;
- + float fenc_var = fenc->i_pixel_ssd[plane] + !ref->i_pixel_ssd[plane];
- + float ref_var = ref->i_pixel_ssd[plane] + !ref->i_pixel_ssd[plane];
- + float guess_scale = sqrtf( fenc_var / ref_var );
- + float fenc_mean = (float)fenc->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]);
- + float ref_mean = (float) ref->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]);
- //early termination
- - if( fabs( ref_mean - fenc_mean ) < 0.5 && fabsf( 1 - (float)fenc_var / ref_var ) < epsilon )
- + if( fabsf( ref_mean - fenc_mean ) < 0.5f && fabsf( 1.f - guess_scale ) < epsilon )
- {
- SET_WEIGHT( weights[plane], 0, 1, 0, 0 );
- continue;
- }
- - guess_scale = ref_var ? (float)fenc_var/ref_var : 0;
- -
- if( plane )
- {
- weights[plane].i_denom = 6;
- - weights[plane].i_scale = x264_clip3( round(guess_scale * 64.0), 0, 255 );
- + weights[plane].i_scale = x264_clip3( (int)(guess_scale * 64.f + 0.5f), 0, 255 );
- if( weights[plane].i_scale > 127 )
- {
- weights[1].weightfn = weights[2].weightfn = NULL;
- @@ -261,9 +259,7 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int
- }
- }
- else
- - x264_weight_get_h264( round( guess_scale * 128 ), 0, &weights[plane] );
- - if( weights[plane].weightfn )
- - h->mc.weight_cache( h, &weights[plane] );
- + x264_weight_get_h264( (int)(guess_scale * 128.f + 0.5f), 0, &weights[plane] );
- found = 0;
- mindenom = weights[plane].i_denom;
- @@ -280,7 +276,7 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int
- x264_slicetype_frame_cost( h, &a, &fenc, 0, 0, 0, 0 );
- }
- mcbuf = x264_weight_cost_init_luma( h, fenc, ref, h->mb.p_weight_buf[0] );
- - origscore = minscore = x264_weight_cost_luma( h, fenc, mcbuf, 0 );
- + origscore = minscore = x264_weight_cost_luma( h, fenc, mcbuf, NULL );
- }
- else
- {
- @@ -290,7 +286,7 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int
- if( plane == 1 )
- x264_weight_cost_init_chroma( h, fenc, ref, dstu, dstv );
- mcbuf = plane == 1 ? dstu : dstv;
- - origscore = minscore = x264_weight_cost_chroma( h, fenc, mcbuf, 0 );
- + origscore = minscore = x264_weight_cost_chroma( h, fenc, mcbuf, NULL );
- }
- if( !minscore )
- @@ -299,7 +295,7 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int
- // This gives a slight improvement due to rounding errors but only tests
- // one offset on lookahead.
- // TODO: currently searches only offset +1. try other offsets/multipliers/combinations thereof?
- - offset_search = x264_clip3( floor( fenc_mean - ref_mean * minscale / (1 << mindenom) + 0.5f*b_lookahead ), -128, 126 );
- + offset_search = x264_clip3( (int)(fenc_mean - ref_mean * minscale / (1 << mindenom) + 0.5f * b_lookahead), -128, 126 );
- for( int i_off = offset_search; i_off <= offset_search+!b_lookahead; i_off++ )
- {
- SET_WEIGHT( weights[plane], 1, minscale, mindenom, i_off );
- @@ -314,7 +310,7 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int
- /* FIXME: More analysis can be done here on SAD vs. SATD termination. */
- /* 0.2% termination derived experimentally to avoid weird weights in frames that are mostly intra. */
- - if( !found || (minscale == 1<<mindenom && minoff == 0) || (float)minscore / origscore > 0.998 )
- + if( !found || (minscale == 1 << mindenom && minoff == 0) || (float)minscore / origscore > 0.998f )
- {
- SET_WEIGHT( weights[plane], 0, 1, 0, 0 );
- continue;
- --
- 1.7.3.2.146.gca209
- From a72ed07c56c9a456c9fa38db0722a37d0eaac101 Mon Sep 17 00:00:00 2001
- From: Daniel Kang <daniel.d.kang@gmail.com>
- Date: Tue, 23 Nov 2010 20:29:37 -0500
- Subject: [PATCH 8/9] SSE version of high-bit-depth add4x4_idct_sse2
- ~6.3x faster than C.
- Our first Google Code-In patch!
- ---
- common/dct.c | 4 +
- common/x86/const-a.asm | 1 +
- common/x86/dct-32.asm | 36 ++++++------
- common/x86/dct-64.asm | 38 ++++++------
- common/x86/dct-a.asm | 47 +++++++++++++--
- common/x86/dct.h | 1 +
- common/x86/pixel-a.asm | 16 +++---
- common/x86/x86util.asm | 147 +++++++++++++++++++++++++-----------------------
- 8 files changed, 167 insertions(+), 123 deletions(-)
- diff --git a/common/dct.c b/common/dct.c
- index 975afef..1b3d87b 100644
- --- a/common/dct.c
- +++ b/common/dct.c
- @@ -429,6 +429,10 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
- dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
- dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
- }
- + if( cpu&X264_CPU_SSE2 )
- + {
- + dctf->add4x4_idct = x264_add4x4_idct_sse2;
- + }
- #endif // HAVE_MMX
- #else // !HIGH_BIT_DEPTH
- #if HAVE_MMX
- diff --git a/common/x86/const-a.asm b/common/x86/const-a.asm
- index 32579e3..d6e621e 100644
- --- a/common/x86/const-a.asm
- +++ b/common/x86/const-a.asm
- @@ -50,6 +50,7 @@ const pw_3fff, times 8 dw 0x3fff
- const pw_pixel_max,times 8 dw ((1 << BIT_DEPTH)-1)
- const pd_1, times 4 dd 1
- +const pd_32, times 4 dd 32
- const pd_128, times 4 dd 128
- const pw_00ff, times 8 dw 0x00ff
- const pw_ff00, times 8 dw 0xff00
- diff --git a/common/x86/dct-32.asm b/common/x86/dct-32.asm
- index 24e6efc..21e70c1 100644
- --- a/common/x86/dct-32.asm
- +++ b/common/x86/dct-32.asm
- @@ -38,13 +38,13 @@ cextern hsub_mul
- ; in: m0..m7
- ; out: 0,4,6 in mem, rest in regs
- %macro DCT8_1D 9
- - SUMSUB_BA m%8, m%1 ; %8 = s07, %1 = d07
- - SUMSUB_BA m%7, m%2 ; %7 = s16, %2 = d16
- - SUMSUB_BA m%6, m%3 ; %6 = s25, %3 = d25
- - SUMSUB_BA m%5, m%4 ; %5 = s34, %4 = d34
- - SUMSUB_BA m%5, m%8 ; %5 = a0, %8 = a2
- - SUMSUB_BA m%6, m%7 ; %6 = a1, %7 = a3
- - SUMSUB_BA m%6, m%5 ; %6 = dst0, %5 = dst4
- + SUMSUB_BA w, m%8, m%1 ; %8 = s07, %1 = d07
- + SUMSUB_BA w, m%7, m%2 ; %7 = s16, %2 = d16
- + SUMSUB_BA w, m%6, m%3 ; %6 = s25, %3 = d25
- + SUMSUB_BA w, m%5, m%4 ; %5 = s34, %4 = d34
- + SUMSUB_BA w, m%5, m%8 ; %5 = a0, %8 = a2
- + SUMSUB_BA w, m%6, m%7 ; %6 = a1, %7 = a3
- + SUMSUB_BA w, m%6, m%5 ; %6 = dst0, %5 = dst4
- mova [%9+0x00], m%6
- mova [%9+0x40], m%5
- mova m%6, m%7 ; a3
- @@ -127,13 +127,13 @@ cextern hsub_mul
- psubw m%2, m%1
- mova m%1, [%9+0x00]
- mova m%6, [%9+0x40]
- - SUMSUB_BA m%6, m%1
- - SUMSUB_BA m%7, m%6
- - SUMSUB_BA m%3, m%1
- - SUMSUB_BA m%5, m%7
- - SUMSUB_BA m%2, m%3
- - SUMSUB_BA m%8, m%1
- - SUMSUB_BA m%4, m%6
- + SUMSUB_BA w, m%6, m%1
- + SUMSUB_BA w, m%7, m%6
- + SUMSUB_BA w, m%3, m%1
- + SUMSUB_BA w, m%5, m%7
- + SUMSUB_BA w, m%2, m%3
- + SUMSUB_BA w, m%8, m%1
- + SUMSUB_BA w, m%4, m%6
- SWAP %1, %5, %6
- SWAP %3, %8, %7
- %endmacro
- @@ -434,18 +434,18 @@ global add8x8_idct_sse2.skip_prologue
- SBUTTERFLY qdq, 4, 5, 0
- SBUTTERFLY qdq, 6, 7, 0
- UNSPILL r1,0
- - IDCT4_1D 0,1,2,3,r1
- + IDCT4_1D w,0,1,2,3,r1
- SPILL r1, 4
- TRANSPOSE2x4x4W 0,1,2,3,4
- UNSPILL r1, 4
- - IDCT4_1D 4,5,6,7,r1
- + IDCT4_1D w,4,5,6,7,r1
- SPILL r1, 0
- TRANSPOSE2x4x4W 4,5,6,7,0
- UNSPILL r1, 0
- paddw m0, [pw_32]
- - IDCT4_1D 0,1,2,3,r1
- + IDCT4_1D w,0,1,2,3,r1
- paddw m4, [pw_32]
- - IDCT4_1D 4,5,6,7,r1
- + IDCT4_1D w,4,5,6,7,r1
- SPILL r1, 6,7
- pxor m7, m7
- DIFFx2 m0, m1, m6, m7, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]; m5
- diff --git a/common/x86/dct-64.asm b/common/x86/dct-64.asm
- index 5e43b9c..70edcbd 100644
- --- a/common/x86/dct-64.asm
- +++ b/common/x86/dct-64.asm
- @@ -36,13 +36,13 @@ cextern hsub_mul
- INIT_XMM
- %macro DCT8_1D 10
- - SUMSUB_BA m%5, m%4 ; %5=s34, %4=d34
- - SUMSUB_BA m%6, m%3 ; %6=s25, %3=d25
- - SUMSUB_BA m%7, m%2 ; %7=s16, %2=d16
- - SUMSUB_BA m%8, m%1 ; %8=s07, %1=d07
- + SUMSUB_BA w, m%5, m%4 ; %5=s34, %4=d34
- + SUMSUB_BA w, m%6, m%3 ; %6=s25, %3=d25
- + SUMSUB_BA w, m%7, m%2 ; %7=s16, %2=d16
- + SUMSUB_BA w, m%8, m%1 ; %8=s07, %1=d07
- - SUMSUB_BA m%6, m%7, m%10 ; %6=a1, %7=a3
- - SUMSUB_BA m%5, m%8, m%10 ; %5=a0, %8=a2
- + SUMSUB_BA w, m%6, m%7, m%10 ; %6=a1, %7=a3
- + SUMSUB_BA w, m%5, m%8, m%10 ; %5=a0, %8=a2
- movdqa m%9, m%1
- psraw m%9, 1
- @@ -56,7 +56,7 @@ INIT_XMM
- paddw m%10, m%2
- psubw m%10, m%3 ; %10=a7
- - SUMSUB_BA m%4, m%1
- + SUMSUB_BA w, m%4, m%1
- psubw m%1, m%3
- psubw m%4, m%2
- psraw m%3, 1
- @@ -70,7 +70,7 @@ INIT_XMM
- psraw m%9, 2
- psubw m%9, m%10 ; %9=b7
- - SUMSUB_BA m%6, m%5, m%10 ; %6=b0, %5=b4
- + SUMSUB_BA w, m%6, m%5, m%10 ; %6=b0, %5=b4
- movdqa m%3, m%7
- psraw m%3, 1
- @@ -88,7 +88,7 @@ INIT_XMM
- %endmacro
- %macro IDCT8_1D 10
- - SUMSUB_BA m%5, m%1, m%9 ; %5=a0, %1=a2
- + SUMSUB_BA w, m%5, m%1, m%9 ; %5=a0, %1=a2
- movdqa m%9, m%2
- psraw m%9, 1
- @@ -123,8 +123,8 @@ INIT_XMM
- psraw m%6, 2
- psubw m%9, m%6 ; %9=b7
- - SUMSUB_BA m%7, m%5, m%6 ; %7=b0, %5=b6
- - SUMSUB_BA m%3, m%1, m%6; %3=b2, %1=b4
- + SUMSUB_BA w, m%7, m%5, m%6 ; %7=b0, %5=b6
- + SUMSUB_BA w, m%3, m%1, m%6; %3=b2, %1=b4
- movdqa m%8, m%10
- psraw m%8, 2
- @@ -132,10 +132,10 @@ INIT_XMM
- psraw m%2, 2
- psubw m%2, m%10 ; %2=b5
- - SUMSUB_BA m%9, m%7, m%6 ; %9=c0, %7=c7
- - SUMSUB_BA m%2, m%3, m%6 ; %2=c1, %3=c6
- - SUMSUB_BA m%8, m%1, m%6 ; %8=c2, %1=c5
- - SUMSUB_BA m%4, m%5, m%6 ; %4=c3, %5=c4
- + SUMSUB_BA w, m%9, m%7, m%6 ; %9=c0, %7=c7
- + SUMSUB_BA w, m%2, m%3, m%6 ; %2=c1, %3=c6
- + SUMSUB_BA w, m%8, m%1, m%6 ; %8=c2, %1=c5
- + SUMSUB_BA w, m%4, m%5, m%6 ; %4=c3, %5=c4
- SWAP %1, %9, %6
- SWAP %3, %8, %7
- @@ -263,14 +263,14 @@ global add8x8_idct_sse2.skip_prologue
- mova m7, [r1+112]
- SBUTTERFLY qdq, 4, 5, 8
- SBUTTERFLY qdq, 6, 7, 8
- - IDCT4_1D 0,1,2,3,8,10
- + IDCT4_1D w,0,1,2,3,8,10
- TRANSPOSE2x4x4W 0,1,2,3,8
- - IDCT4_1D 4,5,6,7,8,10
- + IDCT4_1D w,4,5,6,7,8,10
- TRANSPOSE2x4x4W 4,5,6,7,8
- paddw m0, [pw_32]
- - IDCT4_1D 0,1,2,3,8,10
- + IDCT4_1D w,0,1,2,3,8,10
- paddw m4, [pw_32]
- - IDCT4_1D 4,5,6,7,8,10
- + IDCT4_1D w,4,5,6,7,8,10
- DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
- DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
- DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]
- diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
- index 67fa34a..0e4b514 100644
- --- a/common/x86/dct-a.asm
- +++ b/common/x86/dct-a.asm
- @@ -52,13 +52,15 @@ SECTION .text
- cextern pw_32_0
- cextern pw_32
- cextern pw_8000
- +cextern pw_pixel_max
- cextern hsub_mul
- cextern pb_1
- cextern pw_1
- +cextern pd_32
- %macro WALSH4_1D 5
- - SUMSUB_BADC m%4, m%3, m%2, m%1, m%5
- - SUMSUB_BADC m%4, m%2, m%3, m%1, m%5
- + SUMSUB_BADC w, m%4, m%3, m%2, m%1, m%5
- + SUMSUB_BADC w, m%4, m%2, m%3, m%1, m%5
- SWAP %1, %4, %3
- %endmacro
- @@ -86,7 +88,7 @@ cglobal dct4x4dc_mmx, 1,1
- movq m7, [pw_8000] ; convert to unsigned and back, so that pavgw works
- WALSH4_1D 0,1,2,3,4
- TRANSPOSE4x4W 0,1,2,3,4
- - SUMSUB_BADC m1, m0, m3, m2, m4
- + SUMSUB_BADC w, m1, m0, m3, m2, m4
- SWAP 0, 1
- SWAP 2, 3
- SUMSUB_17BIT 0,2,4,7
- @@ -175,10 +177,10 @@ cglobal add4x4_idct_mmx, 2,2
- movq m3, [r1+24]
- movq m2, [r1+16]
- movq m0, [r1+ 0]
- - IDCT4_1D 0,1,2,3,4,5
- + IDCT4_1D w,0,1,2,3,4,5
- TRANSPOSE4x4W 0,1,2,3,4
- paddw m0, [pw_32]
- - IDCT4_1D 0,1,2,3,4,5
- + IDCT4_1D w,0,1,2,3,4,5
- STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE]
- STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE]
- STORE_DIFF m2, m4, m7, [r0+2*FDEC_STRIDE]
- @@ -198,7 +200,7 @@ cglobal add4x4_idct_sse4, 2,2,6
- psubw m0, m3 ; row1>>1-row3/row0-2
- paddw m2, m1 ; row3>>1+row1/row0+2
- SBUTTERFLY2 wd, 0, 2, 1
- - SUMSUB_BA m2, m0, m1
- + SUMSUB_BA w, m2, m0, m1
- pshuflw m1, m2, 10110001b
- pshufhw m2, m2, 10110001b
- punpckldq m1, m0
- @@ -215,7 +217,7 @@ cglobal add4x4_idct_sse4, 2,2,6
- psubw m0, m3 ; row1>>1-row3/row0-2
- paddw m2, m1 ; row3>>1+row1/row0+2
- SBUTTERFLY2 qdq, 0, 2, 1
- - SUMSUB_BA m2, m0, m1
- + SUMSUB_BA w, m2, m0, m1
- movd m4, [r0+FDEC_STRIDE*0]
- movd m1, [r0+FDEC_STRIDE*1]
- @@ -236,6 +238,37 @@ cglobal add4x4_idct_sse4, 2,2,6
- movd [r0+FDEC_STRIDE*2], m0
- pextrd [r0+FDEC_STRIDE*3], m0, 1
- RET
- +
- +%else
- +
- +%macro STORE_DIFFx2 6
- + psrad %1, 6
- + psrad %2, 6
- + packssdw %1, %2
- + movq %3, %5
- + movhps %3, %6
- + paddsw %1, %3
- + pxor %4, %4
- + CLIPW %1, %4, [pw_pixel_max]
- + movq %5, %1
- + movhps %6, %1
- +%endmacro
- +
- +INIT_XMM
- +cglobal add4x4_idct_sse2, 2,2,7
- + pxor m6, m6
- +.skip_prologue:
- + mova m1, [r1+16]
- + mova m3, [r1+48]
- + mova m2, [r1+32]
- + mova m0, [r1+ 0]
- + IDCT4_1D d,0,1,2,3,4,5
- + TRANSPOSE4x4D 0,1,2,3,4
- + paddd m0, [pd_32]
- + IDCT4_1D d,0,1,2,3,4,5
- + STORE_DIFFx2 m0, m1, m4, m6, [r0+0*FDEC_STRIDE], [r0+2*FDEC_STRIDE]
- + STORE_DIFFx2 m2, m3, m4, m6, [r0+4*FDEC_STRIDE], [r0+6*FDEC_STRIDE]
- + RET
- %endif ; !HIGH_BIT_DEPTH
- INIT_MMX
- diff --git a/common/x86/dct.h b/common/x86/dct.h
- index 58b9d17..ec6ec25 100644
- --- a/common/x86/dct.h
- +++ b/common/x86/dct.h
- @@ -40,6 +40,7 @@ void x264_sub8x8_dct_dc_mmxext( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix
- void x264_sub8x8_dct_dc_sse2 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
- void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct [16] );
- +void x264_add4x4_idct_sse2 ( uint16_t *p_dst, int32_t dct [16] );
- void x264_add4x4_idct_sse4 ( uint8_t *p_dst, int16_t dct [16] );
- void x264_add8x8_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][16] );
- void x264_add8x8_idct_dc_mmx ( uint8_t *p_dst, int16_t dct [ 4] );
- diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
- index acb3612..6cd79e1 100644
- --- a/common/x86/pixel-a.asm
- +++ b/common/x86/pixel-a.asm
- @@ -881,7 +881,7 @@ cglobal pixel_var2_8x8_ssse3, 5,6,8
- DEINTB %1, %2, %3, %4, %5
- psubw m%1, m%3
- psubw m%2, m%4
- - SUMSUB_BA m%1, m%2, m%3
- + SUMSUB_BA w, m%1, m%2, m%3
- %endmacro
- %macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none
- @@ -1278,10 +1278,10 @@ cglobal pixel_sa8d_8x8_internal_%1
- %else ; non-sse2
- HADAMARD4_V m0, m1, m2, m8, m6
- HADAMARD4_V m4, m5, m3, m9, m6
- - SUMSUB_BADC m0, m4, m1, m5, m6
- + SUMSUB_BADC w, m0, m4, m1, m5, m6
- HADAMARD 2, sumsub, 0, 4, 6, 11
- HADAMARD 2, sumsub, 1, 5, 6, 11
- - SUMSUB_BADC m2, m3, m8, m9, m6
- + SUMSUB_BADC w, m2, m3, m8, m9, m6
- HADAMARD 2, sumsub, 2, 3, 6, 11
- HADAMARD 2, sumsub, 8, 9, 6, 11
- HADAMARD 1, amax, 0, 4, 6, 11
- @@ -1379,7 +1379,7 @@ cglobal pixel_sa8d_8x8_internal_%1
- mova spill0, m6
- mova spill1, m7
- HADAMARD4_V m0, m1, m2, m3, m7
- - SUMSUB_BADC m0, m4, m1, m5, m7
- + SUMSUB_BADC w, m0, m4, m1, m5, m7
- HADAMARD 2, sumsub, 0, 4, 7, 6
- HADAMARD 2, sumsub, 1, 5, 7, 6
- HADAMARD 1, amax, 0, 4, 7, 6
- @@ -1387,7 +1387,7 @@ cglobal pixel_sa8d_8x8_internal_%1
- mova m6, spill0
- mova m7, spill1
- paddw m0, m1
- - SUMSUB_BADC m2, m6, m3, m7, m4
- + SUMSUB_BADC w, m2, m6, m3, m7, m4
- HADAMARD 2, sumsub, 2, 6, 4, 5
- HADAMARD 2, sumsub, 3, 7, 4, 5
- HADAMARD 1, amax, 2, 6, 4, 5
- @@ -1994,7 +1994,7 @@ cglobal hadamard_ac_2x2max_mmxext
- mova m2, [r3+0x40]
- mova m3, [r3+0x60]
- sub r3, 8
- - SUMSUB_BADC m0, m1, m2, m3, m4
- + SUMSUB_BADC w, m0, m1, m2, m3, m4
- ABS4 m0, m2, m1, m3, m4, m5
- HADAMARD 0, max, 0, 2, 4, 5
- HADAMARD 0, max, 1, 3, 4, 5
- @@ -2059,7 +2059,7 @@ cglobal hadamard_ac_8x8_mmxext
- mova m1, [r3+0x20]
- mova m2, [r3+0x40]
- mova m3, [r3+0x60]
- - SUMSUB_BADC m0, m1, m2, m3, m4
- + SUMSUB_BADC w, m0, m1, m2, m3, m4
- HADAMARD 0, sumsub, 0, 2, 4, 5
- ABS4 m1, m3, m0, m2, m4, m5
- HADAMARD 0, max, 1, 3, 4, 5
- @@ -2266,7 +2266,7 @@ cglobal hadamard_ac_8x8_%1
- ABS_MOV m2, m4
- ABS_MOV m3, m5
- paddw m1, m2
- - SUMSUB_BA m0, m4; m2
- + SUMSUB_BA w, m0, m4; m2
- %if vertical
- pand m1, [mask_ac4]
- %else
- diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm
- index 9acaa3d..d16a237 100644
- --- a/common/x86/x86util.asm
- +++ b/common/x86/x86util.asm
- @@ -241,44 +241,44 @@
- psrlw m%4, 8 ; src .. y7 .. y5
- %endmacro
- -%macro SUMSUB_BA 2-3
- -%if %0==2
- - paddw %1, %2
- - paddw %2, %2
- - psubw %2, %1
- +%macro SUMSUB_BA 3-4
- +%if %0==3
- + padd%1 %2, %3
- + padd%1 %3, %3
- + psub%1 %3, %2
- %else
- - mova %3, %1
- - paddw %1, %2
- - psubw %2, %3
- + mova %4, %2
- + padd%1 %2, %3
- + psub%1 %3, %4
- %endif
- %endmacro
- -%macro SUMSUB_BADC 4-5
- -%if %0==5
- - SUMSUB_BA %1, %2, %5
- - SUMSUB_BA %3, %4, %5
- +%macro SUMSUB_BADC 5-6
- +%if %0==6
- + SUMSUB_BA %1, %2, %3, %6
- + SUMSUB_BA %1, %4, %5, %6
- %else
- - paddw %1, %2
- - paddw %3, %4
- - paddw %2, %2
- - paddw %4, %4
- - psubw %2, %1
- - psubw %4, %3
- + padd%1 %2, %3
- + padd%1 %4, %5
- + padd%1 %3, %3
- + padd%1 %5, %5
- + psub%1 %3, %2
- + psub%1 %5, %4
- %endif
- %endmacro
- %macro HADAMARD4_V 4+
- - SUMSUB_BADC %1, %2, %3, %4
- - SUMSUB_BADC %1, %3, %2, %4
- + SUMSUB_BADC w, %1, %2, %3, %4
- + SUMSUB_BADC w, %1, %3, %2, %4
- %endmacro
- %macro HADAMARD8_V 8+
- - SUMSUB_BADC %1, %2, %3, %4
- - SUMSUB_BADC %5, %6, %7, %8
- - SUMSUB_BADC %1, %3, %2, %4
- - SUMSUB_BADC %5, %7, %6, %8
- - SUMSUB_BADC %1, %5, %2, %6
- - SUMSUB_BADC %3, %7, %4, %8
- + SUMSUB_BADC w, %1, %2, %3, %4
- + SUMSUB_BADC w, %5, %6, %7, %8
- + SUMSUB_BADC w, %1, %3, %2, %4
- + SUMSUB_BADC w, %5, %7, %6, %8
- + SUMSUB_BADC w, %1, %5, %2, %6
- + SUMSUB_BADC w, %3, %7, %4, %8
- %endmacro
- %macro TRANS_SSE2 5-6
- @@ -363,7 +363,7 @@
- %endif
- %endif
- %ifidn %2, sumsub
- - SUMSUB_BA m%3, m%4, m%5
- + SUMSUB_BA w, m%3, m%4, m%5
- %else
- %ifidn %2, amax
- %if %0==6
- @@ -426,67 +426,72 @@
- %endif
- %endmacro
- -%macro SUMSUB2_AB 3
- - mova %3, %1
- - paddw %1, %1
- - paddw %1, %2
- - psubw %3, %2
- - psubw %3, %2
- +%macro SUMSUB2_AB 4
- + mova %4, %2
- + padd%1 %2, %2
- + padd%1 %2, %3
- + psub%1 %4, %3
- + psub%1 %4, %3
- %endmacro
- -%macro SUMSUB2_BA 3
- - mova m%3, m%1
- - paddw m%1, m%2
- - paddw m%1, m%2
- - psubw m%2, m%3
- - psubw m%2, m%3
- +%macro SUMSUB2_BA 4
- + mova m%4, m%2
- + padd%1 m%2, m%3
- + padd%1 m%2, m%3
- + psub%1 m%3, m%4
- + psub%1 m%3, m%4
- %endmacro
- -%macro SUMSUBD2_AB 4
- - mova %4, %1
- - mova %3, %2
- - psraw %2, 1 ; %2: %2>>1
- - psraw %1, 1 ; %1: %1>>1
- - paddw %2, %4 ; %2: %2>>1+%1
- - psubw %1, %3 ; %1: %1>>1-%2
- +%macro SUMSUBD2_AB 5
- + mova %5, %2
- + mova %4, %3
- + psra%1 %3, 1 ; %3: %3>>1
- + psra%1 %2, 1 ; %2: %2>>1
- + padd%1 %3, %5 ; %3: %3>>1+%2
- + psub%1 %2, %4 ; %2: %2>>1-%3
- %endmacro
- %macro DCT4_1D 5
- %ifnum %5
- - SUMSUB_BADC m%4, m%1, m%3, m%2; m%5
- - SUMSUB_BA m%3, m%4, m%5
- - SUMSUB2_AB m%1, m%2, m%5
- + SUMSUB_BADC w, m%4, m%1, m%3, m%2; m%5
- + SUMSUB_BA w, m%3, m%4, m%5
- + SUMSUB2_AB w, m%1, m%2, m%5
- SWAP %1, %3, %4, %5, %2
- %else
- - SUMSUB_BADC m%4, m%1, m%3, m%2
- - SUMSUB_BA m%3, m%4
- + SUMSUB_BADC w, m%4, m%1, m%3, m%2
- + SUMSUB_BA w, m%3, m%4
- mova [%5], m%2
- - SUMSUB2_AB m%1, [%5], m%2
- + SUMSUB2_AB w, m%1, [%5], m%2
- SWAP %1, %3, %4, %2
- %endif
- %endmacro
- -%macro IDCT4_1D 5-6
- -%ifnum %5
- - SUMSUBD2_AB m%2, m%4, m%6, m%5
- - ; %2: %2>>1-%4 %4: %2+%4>>1
- - SUMSUB_BA m%3, m%1, m%6
- - ; %3: %1+%3 %1: %1-%3
- - SUMSUB_BADC m%4, m%3, m%2, m%1, m%6
- - ; %4: %1+%3 + (%2+%4>>1)
- - ; %3: %1+%3 - (%2+%4>>1)
- - ; %2: %1-%3 + (%2>>1-%4)
- - ; %1: %1-%3 - (%2>>1-%4)
- +%macro IDCT4_1D 6-7
- +%ifnum %6
- + SUMSUBD2_AB %1, m%3, m%5, m%7, m%6
- + ; %3: %3>>1-%5 %5: %3+%5>>1
- + SUMSUB_BA %1, m%4, m%2, m%7
- + ; %4: %2+%4 %2: %2-%4
- + SUMSUB_BADC %1, m%5, m%4, m%3, m%2, m%7
- + ; %5: %2+%4 + (%3+%5>>1)
- + ; %4: %2+%4 - (%3+%5>>1)
- + ; %3: %2-%4 + (%3>>1-%5)
- + ; %2: %2-%4 - (%3>>1-%5)
- %else
- - SUMSUBD2_AB m%2, m%4, [%5], [%5+16]
- - SUMSUB_BA m%3, m%1
- - SUMSUB_BADC m%4, m%3, m%2, m%1
- +%ifidn %1,w
- + SUMSUBD2_AB %1, m%3, m%5, [%6], [%6+16]
- +%endif
- +%ifidn %1,d
- + SUMSUBD2_AB %1, m%3, m%5, [%6], [%6+32]
- +%endif
- + SUMSUB_BA %1, m%4, m%2
- + SUMSUB_BADC %1, m%5, m%4, m%3, m%2
- %endif
- - SWAP %1, %4, %3
- - ; %1: %1+%3 + (%2+%4>>1) row0
- - ; %2: %1-%3 + (%2>>1-%4) row1
- - ; %3: %1-%3 - (%2>>1-%4) row2
- - ; %4: %1+%3 - (%2+%4>>1) row3
- + SWAP %2, %5, %4
- + ; %2: %2+%4 + (%3+%5>>1) row0
- + ; %3: %2-%4 + (%3>>1-%5) row1
- + ; %4: %2-%4 - (%3>>1-%5) row2
- + ; %5: %2+%4 - (%3+%5>>1) row3
- %endmacro
- --
- 1.7.3.2.146.gca209
- From a989eef327f86107f565e448a17ba07a06546d8d Mon Sep 17 00:00:00 2001
- From: Alex Wright <alexw0885@gmail.com>
- Date: Wed, 24 Nov 2010 02:19:51 -0800
- Subject: [PATCH 9/9] Make --weightp 1 a better speed tradeoff
- Since fade analysis is now so fast, weightp 1 now does fade analysis but no reference duplication.
- This is the opposite of what it used to do (reference duplication but no fade analysis).
- This also gives weightp's better fade quality to faster presets (up to superfast).
- ---
- common/common.c | 7 ++++---
- common/macroblock.c | 8 +++-----
- encoder/encoder.c | 23 ++++++++---------------
- encoder/ratecontrol.c | 4 ++--
- encoder/slicetype.c | 5 ++---
- x264.c | 4 ++--
- x264.h | 4 ++--
- 7 files changed, 23 insertions(+), 32 deletions(-)
- diff --git a/common/common.c b/common/common.c
- index 1f99e9e..1845e3b 100644
- --- a/common/common.c
- +++ b/common/common.c
- @@ -204,7 +204,7 @@ static int x264_param_apply_preset( x264_param_t *param, const char *preset )
- param->analyse.b_mixed_references = 0;
- param->analyse.i_trellis = 0;
- param->rc.b_mb_tree = 0;
- - param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
- + param->analyse.i_weighted_pred = X264_WEIGHTP_SIMPLE;
- param->rc.i_lookahead = 0;
- }
- else if( !strcasecmp( preset, "veryfast" ) )
- @@ -214,7 +214,7 @@ static int x264_param_apply_preset( x264_param_t *param, const char *preset )
- param->i_frame_reference = 1;
- param->analyse.b_mixed_references = 0;
- param->analyse.i_trellis = 0;
- - param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
- + param->analyse.i_weighted_pred = X264_WEIGHTP_SIMPLE;
- param->rc.i_lookahead = 10;
- }
- else if( !strcasecmp( preset, "faster" ) )
- @@ -222,13 +222,14 @@ static int x264_param_apply_preset( x264_param_t *param, const char *preset )
- param->analyse.b_mixed_references = 0;
- param->i_frame_reference = 2;
- param->analyse.i_subpel_refine = 4;
- - param->analyse.i_weighted_pred = X264_WEIGHTP_BLIND;
- + param->analyse.i_weighted_pred = X264_WEIGHTP_SIMPLE;
- param->rc.i_lookahead = 20;
- }
- else if( !strcasecmp( preset, "fast" ) )
- {
- param->i_frame_reference = 2;
- param->analyse.i_subpel_refine = 6;
- + param->analyse.i_weighted_pred = X264_WEIGHTP_SIMPLE;
- param->rc.i_lookahead = 30;
- }
- else if( !strcasecmp( preset, "medium" ) )
- diff --git a/common/macroblock.c b/common/macroblock.c
- index 5c76d3f..9075efc9 100644
- --- a/common/macroblock.c
- +++ b/common/macroblock.c
- @@ -239,8 +239,6 @@ int x264_macroblock_cache_allocate( x264_t *h )
- int i_refs = X264_MIN(X264_REF_MAX, (i ? 1 + !!h->param.i_bframe_pyramid : h->param.i_frame_reference) ) << h->param.b_interlaced;
- if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
- i_refs = X264_MIN(X264_REF_MAX, i_refs + 1 + (BIT_DEPTH == 8)); //smart weights add two duplicate frames, one in >8-bit
- - else if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_BLIND )
- - i_refs = X264_MIN(X264_REF_MAX, i_refs + 1); //blind weights add one duplicate frame
- for( int j = !i; j < i_refs; j++ )
- {
- @@ -277,7 +275,7 @@ int x264_macroblock_cache_allocate( x264_t *h )
- //SMART can weight one ref and one offset -1
- numweightbuf = 2;
- else
- - //blind only has one weighted copy (offset -1)
- + //simple only has one weighted ref
- numweightbuf = 1;
- }
- @@ -398,7 +396,7 @@ void x264_macroblock_slice_init( x264_t *h )
- {
- memset( h->mb.cache.skip, 0, sizeof( h->mb.cache.skip ) );
- - if( h->sh.i_disable_deblocking_filter_idc != 1 && h->param.analyse.i_weighted_pred )
- + if( h->sh.i_disable_deblocking_filter_idc != 1 && h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
- {
- deblock_ref_table(-2) = -2;
- deblock_ref_table(-1) = -1;
- @@ -999,7 +997,7 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
- h->mb.i_neighbour = new_neighbour;
- }
- - if( h->param.analyse.i_weighted_pred && h->sh.i_type == SLICE_TYPE_P )
- + if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART && h->sh.i_type == SLICE_TYPE_P )
- {
- /* Handle reference frame duplicates */
- int i8 = x264_scan8[0] - 8;
- diff --git a/encoder/encoder.c b/encoder/encoder.c
- index 2d5c778..dbbe9a0 100644
- --- a/encoder/encoder.c
- +++ b/encoder/encoder.c
- @@ -772,8 +772,6 @@ static int x264_validate_parameters( x264_t *h )
- h->param.analyse.i_weighted_pred = x264_clip3( h->param.analyse.i_weighted_pred, X264_WEIGHTP_NONE, X264_WEIGHTP_SMART );
- if( !h->param.analyse.i_weighted_pred && h->param.rc.b_mb_tree && h->param.analyse.b_psy && !h->param.b_interlaced )
- h->param.analyse.i_weighted_pred = X264_WEIGHTP_FAKE;
- - if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_BLIND && BIT_DEPTH > 8 )
- - h->param.analyse.i_weighted_pred = X264_WEIGHTP_NONE;
- if( h->i_thread_frames > 1 )
- {
- @@ -996,7 +994,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
- || h->param.i_bframe_adaptive
- || h->param.i_scenecut_threshold
- || h->param.rc.b_mb_tree
- - || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART );
- + || h->param.analyse.i_weighted_pred );
- h->frames.b_have_lowres |= h->param.rc.b_stat_read && h->param.rc.i_vbv_buffer_size > 0;
- h->frames.b_have_sub8x8_esa = !!(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);
- @@ -1432,6 +1430,10 @@ int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t
- if( i <= 1 ) /* empty list, definitely can't duplicate frame */
- return -1;
- + //Duplication isn't used for X264_WEIGHTP_SIMPLE
- + if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SIMPLE )
- + return -1;
- +
- /* Duplication is a hack to compensate for crappy rounding in motion compensation.
- * With high bit depth, it's not worth doing, so turn it off except in the case of
- * unweighted dupes. */
- @@ -1609,7 +1611,7 @@ static inline void x264_reference_build_list( x264_t *h, int i_poc )
- if( h->fenc->i_type == X264_TYPE_P )
- {
- int idx = -1;
- - if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
- + if( h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE )
- {
- x264_weight_t w[3];
- w[1].weightfn = w[2].weightfn = NULL;
- @@ -1638,15 +1640,6 @@ static inline void x264_reference_build_list( x264_t *h, int i_poc )
- }
- }
- }
- - else if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_BLIND )
- - {
- - //weighted offset=-1
- - x264_weight_t w[3];
- - SET_WEIGHT( w[0], 1, 1, 0, -1 );
- - h->fenc->weight[0][0].i_denom = 0;
- - w[1].weightfn = w[2].weightfn = NULL;
- - idx = x264_weighted_reference_duplicate( h, 0, w );
- - }
- h->mb.ref_blind_dupe = idx;
- }
- @@ -2876,7 +2869,7 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
- if( h->sh.i_type == SLICE_TYPE_P )
- {
- h->stat.i_consecutive_bframes[h->fdec->i_frame - h->fref0[0]->i_frame - 1]++;
- - if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
- + if( h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE )
- {
- h->stat.i_wpred[0] += !!h->sh.weight[0][0].weightfn;
- h->stat.i_wpred[1] += !!h->sh.weight[0][1].weightfn || !!h->sh.weight[0][2].weightfn;
- @@ -3225,7 +3218,7 @@ void x264_encoder_close ( x264_t *h )
- fixed_pred_modes[3][2] * 100.0 / sum_pred_modes[3],
- fixed_pred_modes[3][3] * 100.0 / sum_pred_modes[3] );
- - if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART && h->stat.i_frame_count[SLICE_TYPE_P] > 0 )
- + if( h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE && h->stat.i_frame_count[SLICE_TYPE_P] > 0 )
- x264_log( h, X264_LOG_INFO, "Weighted P-Frames: Y:%.1f%% UV:%.1f%%\n",
- h->stat.i_wpred[0] * 100.0 / h->stat.i_frame_count[SLICE_TYPE_P],
- h->stat.i_wpred[1] * 100.0 / h->stat.i_frame_count[SLICE_TYPE_P] );
- diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
- index e949e24..780c0e1 100644
- --- a/encoder/ratecontrol.c
- +++ b/encoder/ratecontrol.c
- @@ -284,7 +284,7 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_off
- }
- }
- /* Need variance data for weighted prediction */
- - if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
- + if( h->param.analyse.i_weighted_pred )
- {
- for( int mb_y = 0; mb_y < h->mb.i_mb_height; mb_y++ )
- for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x++ )
- @@ -1558,7 +1558,7 @@ int x264_ratecontrol_end( x264_t *h, int bits, int *filler )
- goto fail;
- }
- - if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART && h->sh.weight[0][0].weightfn )
- + if( h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE && h->sh.weight[0][0].weightfn )
- {
- if( fprintf( rc->p_stat_file_out, "w:%d,%d,%d",
- h->sh.weight[0][0].i_denom, h->sh.weight[0][0].i_scale, h->sh.weight[0][0].i_offset ) < 0 )
- diff --git a/encoder/slicetype.c b/encoder/slicetype.c
- index dd6c360..4f47710 100644
- --- a/encoder/slicetype.c
- +++ b/encoder/slicetype.c
- @@ -647,8 +647,7 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
- do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
- if( do_search[0] )
- {
- - if( ( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART ||
- - h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE ) && b == p1 )
- + if( h->param.analyse.i_weighted_pred && b == p1 )
- {
- x264_emms();
- x264_weights_analyse( h, frames[b], frames[p0], 1 );
- @@ -1549,7 +1548,7 @@ void x264_slicetype_decide( x264_t *h )
- /* Analyse for weighted P frames */
- if( !h->param.rc.b_stat_read && h->lookahead->next.list[bframes]->i_type == X264_TYPE_P
- - && h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
- + && h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE )
- {
- x264_emms();
- x264_weights_analyse( h, h->lookahead->next.list[bframes], h->lookahead->last_nonb, 0 );
- diff --git a/x264.c b/x264.c
- index f9a1c6a..b4530f8 100644
- --- a/x264.c
- +++ b/x264.c
- @@ -609,8 +609,8 @@ static void help( x264_param_t *defaults, int longhelp )
- H2( " --no-weightb Disable weighted prediction for B-frames\n" );
- H1( " --weightp <integer> Weighted prediction for P-frames [%d]\n"
- " - 0: Disabled\n"
- - " - 1: Blind offset\n"
- - " - 2: Smart analysis\n", defaults->analyse.i_weighted_pred );
- + " - 1: Weighted refs\n"
- + " - 2: Weighted refs + Duplicates\n", defaults->analyse.i_weighted_pred );
- H1( " --me <string> Integer pixel motion estimation method [\"%s\"]\n",
- strtable_lookup( x264_motion_est_names, defaults->analyse.i_me_method ) );
- H2( " - dia: diamond search, radius 1 (fast)\n"
- diff --git a/x264.h b/x264.h
- index e144e51..bfe478b 100644
- --- a/x264.h
- +++ b/x264.h
- @@ -39,7 +39,7 @@
- #include <stdarg.h>
- -#define X264_BUILD 109
- +#define X264_BUILD 110
- /* x264_t:
- * opaque handler for encoder */
- @@ -151,7 +151,7 @@ typedef struct
- #define X264_B_ADAPT_FAST 1
- #define X264_B_ADAPT_TRELLIS 2
- #define X264_WEIGHTP_NONE 0
- -#define X264_WEIGHTP_BLIND 1
- +#define X264_WEIGHTP_SIMPLE 1
- #define X264_WEIGHTP_SMART 2
- #define X264_B_PYRAMID_NONE 0
- #define X264_B_PYRAMID_STRICT 1
- --
- 1.7.3.2.146.gca209
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement