Untitled

From ee75acd55e1a89eb9a8c2f7d14c923b57e08ceb3 Mon Sep 17 00:00:00 2001
From: Sean McGovern <gseanmcg@gmail.com>
Date: Sun, 10 Oct 2010 19:34:18 -0400
Subject: [PATCH 1/9] Fix build on SPARC Solaris 10

---
 common/pixel.c |    6 +++---
 configure      |   29 +++++++++++++++++------------
 2 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/common/pixel.c b/common/pixel.c
index 1e21550..7fa497c 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -36,7 +36,7 @@
 #if ARCH_ARM
 #   include "arm/pixel.h"
 #endif
-#if ARCH_UltraSparc
+#if ARCH_UltraSPARC
 #   include "sparc/pixel.h"
 #endif

@@ -443,7 +443,7 @@ SAD_X( 4x8 )
 SAD_X( 4x4 )

 #if !HIGH_BIT_DEPTH
-#if ARCH_UltraSparc
+#if ARCH_UltraSPARC
 SAD_X( 16x16_vis )
 SAD_X( 16x8_vis )
 SAD_X( 8x16_vis )
@@ -1063,7 +1063,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     }
 #endif
 #if !HIGH_BIT_DEPTH
-#if ARCH_UltraSparc
+#if ARCH_UltraSPARC
     INIT4( sad, _vis );
     INIT4( sad_x3, _vis );
     INIT4( sad_x4, _vis );
diff --git a/configure b/configure
index 2f38154..fd62337 100755
--- a/configure
+++ b/configure
@@ -392,15 +392,20 @@ case $host_cpu in
     fi
     ;;
   sparc)
-    if [ $asm = auto ] && test "$(uname -m)" = "sun4u"; then
-      ARCH="UltraSparc"
-      CFLAGS="$CFLAGS -mcpu=ultrasparc"
-      LDFLAGS="$LDFLAGS -mcpu=ultrasparc"
-      AS="${AS-${cross_prefix}as}"
-      ASFLAGS="$ASFLAGS -xarch=v8plusa"
-    else
-      ARCH="Sparc"
-    fi
+    ARCH="SPARC"
+    case $(uname -m) in
+      sun4u|sun4v)
+        if [ $asm = auto ]; then
+          ARCH="UltraSPARC"
+          if ! echo $CFLAGS | grep -Eq '\-mcpu' ; then
+            CFLAGS="$CFLAGS -mcpu=ultrasparc"
+            LDFLAGS="$LDFLAGS -mcpu=ultrasparc"
+          fi
+          AS="${AS-${cross_prefix}as}"
+          ASFLAGS="$ASFLAGS -xarch=v8plusa"
+        fi
+        ;;
+    esac
     ;;
   mips|mipsel|mips64|mips64el)
     ARCH="MIPS"
@@ -497,11 +502,11 @@ fi
 define ARCH_$ARCH
 define SYS_$SYS

-echo "int i = 0x42494745; double f = 0x1.0656e6469616ep+102;" > conftest.c
+echo "int i[2] = {0x42494745,0}; double f[2] = {0x1.0656e6469616ep+102,0};" > conftest.c
 $CC $CFLAGS conftest.c -c -o conftest.o 2>$DEVNULL || die "endian test failed"
-if grep -q BIGE conftest.o && grep -q FPendian conftest.o ; then
+if (strings -a conftest.o | grep -q BIGE) && (strings -a conftest.o | grep -q FPendian) ; then
     define WORDS_BIGENDIAN
-elif !(grep -q EGIB conftest.o && grep -q naidnePF conftest.o) ; then
+elif !(strings -a conftest.o | grep -q EGIB && strings -a conftest.o | grep -q naidnePF) ; then
     die "endian test failed"
 fi

--
1.7.3.2.146.gca209


From b38fd04c376bac31544782fabe03471567a8badf Mon Sep 17 00:00:00 2001
From: Sean McGovern <gseanmcg@gmail.com>
Date: Sun, 21 Nov 2010 01:59:33 -0500
Subject: [PATCH 2/9] Only build SPARC VIS asm if high bit-depth is disabled

---
 Makefile |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/Makefile b/Makefile
index 8a3a327..21f57e7 100644
--- a/Makefile
+++ b/Makefile
@@ -110,9 +110,11 @@ endif

 # VIS optims
 ifeq ($(ARCH),UltraSparc)
+ifeq ($(findstring HIGH_BIT_DEPTH, $(CONFIG)),)
 ASMSRC += common/sparc/pixel.asm
 OBJASM  = $(ASMSRC:%.asm=%.o)
 endif
+endif

 ifneq ($(HAVE_GETOPT_LONG),1)
 SRCCLI += extras/getopt.c
--
1.7.3.2.146.gca209


From bb45211ef98ae8b382a15d4b03c2dc5c8d8f67e0 Mon Sep 17 00:00:00 2001
From: Steven Walters <kemuri9@gmail.com>
Date: Mon, 22 Nov 2010 10:31:05 +0900
Subject: [PATCH 3/9] Fix configure so that boolean configuration options are 1/0

Previously, many boolean options were either defined as 1 or left undefined (1/undef) instead of always being defined as 1 or 0.
---
 Makefile            |   12 ++++++------
 common/arm/asm.S    |    2 +-
 common/bitstream.c  |    4 ++--
 common/macroblock.h |    2 +-
 configure           |   15 +++++++++------
 5 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/Makefile b/Makefile
index 21f57e7..82f0bbc 100644
--- a/Makefile
+++ b/Makefile
@@ -25,29 +25,29 @@ SRCSO =
 CONFIG := $(shell cat config.h)

 # GPL-only files
-ifeq ($(GPL),yes)
+ifneq ($(findstring HAVE_GPL 1, $(CONFIG)),)
 SRCCLI +=
 endif

 # Optional module sources
-ifneq ($(findstring HAVE_AVS, $(CONFIG)),)
+ifneq ($(findstring HAVE_AVS 1, $(CONFIG)),)
 SRCCLI += input/avs.c
 endif

-ifneq ($(findstring HAVE_PTHREAD, $(CONFIG)),)
+ifneq ($(findstring HAVE_PTHREAD 1, $(CONFIG)),)
 SRCCLI += input/thread.c
 SRCS   += common/threadpool.c
 endif

-ifneq ($(findstring HAVE_LAVF, $(CONFIG)),)
+ifneq ($(findstring HAVE_LAVF 1, $(CONFIG)),)
 SRCCLI += input/lavf.c
 endif

-ifneq ($(findstring HAVE_FFMS, $(CONFIG)),)
+ifneq ($(findstring HAVE_FFMS 1, $(CONFIG)),)
 SRCCLI += input/ffms.c
 endif

-ifneq ($(findstring HAVE_GPAC, $(CONFIG)),)
+ifneq ($(findstring HAVE_GPAC 1, $(CONFIG)),)
 SRCCLI += output/mp4.c
 endif

diff --git a/common/arm/asm.S b/common/arm/asm.S
index 7434262..92e3b14 100644
--- a/common/arm/asm.S
+++ b/common/arm/asm.S
@@ -65,7 +65,7 @@ ELF     .type   \name, %function
         .endm

 .macro movconst rd, val
-#ifdef HAVE_ARMV6T2
+#if HAVE_ARMV6T2
     movw        \rd, #:lower16:\val
 .if \val >> 16
     movt        \rd, #:upper16:\val
diff --git a/common/bitstream.c b/common/bitstream.c
index 8350fb3..0f2bc9f 100644
--- a/common/bitstream.c
+++ b/common/bitstream.c
@@ -39,7 +39,7 @@ static uint8_t *x264_nal_escape_c( uint8_t *dst, uint8_t *src, uint8_t *end )
     return dst;
 }

-#ifdef HAVE_MMX
+#if HAVE_MMX
 uint8_t *x264_nal_escape_mmxext( uint8_t *dst, uint8_t *src, uint8_t *end );
 uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
 #endif
@@ -88,7 +88,7 @@ void x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal )
 void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
 {
     pf->nal_escape = x264_nal_escape_c;
-#ifdef HAVE_MMX
+#if HAVE_MMX
     if( cpu&X264_CPU_MMXEXT )
         pf->nal_escape = x264_nal_escape_mmxext;
     if( (cpu&X264_CPU_SSE2) && (cpu&X264_CPU_SSE2_IS_FAST) )
diff --git a/common/macroblock.h b/common/macroblock.h
index 7562948..ce4ead9 100644
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -364,7 +364,7 @@ static ALWAYS_INLINE uint32_t pack16to32_mask( int a, int b )
 }
 static ALWAYS_INLINE uint64_t pack32to64( uint32_t a, uint32_t b )
 {
-#ifdef WORDS_BIGENDIAN
+#if WORDS_BIGENDIAN
    return b + ((uint64_t)a<<32);
 #else
    return a + ((uint64_t)b<<32);
diff --git a/configure b/configure
index fd62337..ef33323 100755
--- a/configure
+++ b/configure
@@ -171,6 +171,9 @@ cross_prefix=""

 EXE=""

+# list of all preprocessor HAVE values we can define
+CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON PTHREAD LOG2F VISUALIZE SWSCALE LAVF FFMS GPAC GF_MALLOC AVS GPL"
+
 # parse options

 for opt do
@@ -735,11 +738,12 @@ fi
 define BIT_DEPTH $bit_depth
 ASFLAGS="$ASFLAGS -DBIT_DEPTH=$bit_depth"

-if [ $gpl = yes ]; then
-    define HAVE_GPL 1
-else
-    define HAVE_GPL 0
-fi
+[ $gpl = yes ] && define HAVE_GPL
+
+# Define every HAVE_* value that is not already set to 1 as 0
+for var in $CONFIG_HAVE; do
+    grep -q "HAVE_$var 1" config.h || define HAVE_$var 0
+done

 rm -f conftest*

@@ -766,7 +770,6 @@ EXE=$EXE
 VIS=$vis
 HAVE_GETOPT_LONG=$HAVE_GETOPT_LONG
 DEVNULL=$DEVNULL
-GPL=$gpl
 EOF

 if [ "$shared" = "yes" ]; then
--
1.7.3.2.146.gca209


From 8363900cb951db4acc1ed82fac2ede3996c4703f Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Sat, 20 Nov 2010 23:30:42 -0800
Subject: [PATCH 4/9] Change qpmin default to 0
 There's probably no real reason to keep it at 10 anymore, and lowering it allows AQ to pick lower quantizers in really flat areas.
 Might help on gradients at high quality levels.
 The previous value of 10 was arbitrary anyway.

---
 common/common.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/common/common.c b/common/common.c
index 6c88556..1f99e9e 100644
--- a/common/common.c
+++ b/common/common.c
@@ -98,7 +98,7 @@ void x264_param_default( x264_param_t *param )
     param->rc.f_vbv_buffer_init = 0.9;
     param->rc.i_qp_constant = 23 + QP_BD_OFFSET;
     param->rc.f_rf_constant = 23;
-    param->rc.i_qp_min = 10;
+    param->rc.i_qp_min = 0;
     param->rc.i_qp_max = QP_MAX;
     param->rc.i_qp_step = 4;
     param->rc.f_ip_factor = 1.4;
--
1.7.3.2.146.gca209


From 07c6b3fd5028057ad8f6511e0773506a2bba69b9 Mon Sep 17 00:00:00 2001
From: Yasuhiro Ikeda <wipple625@gmail.com>
Date: Mon, 22 Nov 2010 11:01:57 +0900
Subject: [PATCH 5/9] Add some more info to `x264 --version`

---
 x264.c |   23 ++++++++++++++++++++---
 1 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/x264.c b/x264.c
index bba17b8..f9a1c6a 100644
--- a/x264.c
+++ b/x264.c
@@ -55,6 +55,14 @@
 #include <libavutil/pixdesc.h>
 #endif

+#if HAVE_SWSCALE
+#include <libswscale/swscale.h>
+#endif
+
+#if HAVE_FFMS
+#include <ffms.h>
+#endif
+
 /* Ctrl-C handler */
 static volatile int b_ctrl_c = 0;
 static int          b_exit_on_ctrl_c = 0;
@@ -208,6 +216,15 @@ static void print_version_info()
 #else
     printf( "x264 0.%d.X\n", X264_BUILD );
 #endif
+#if HAVE_SWSCALE
+    printf( "(libswscale %d.%d.%d)\n", LIBSWSCALE_VERSION_MAJOR, LIBSWSCALE_VERSION_MINOR, LIBSWSCALE_VERSION_MICRO );
+#endif
+#if HAVE_LAVF
+    printf( "(libavformat %d.%d.%d)\n", LIBAVFORMAT_VERSION_MAJOR, LIBAVFORMAT_VERSION_MINOR, LIBAVFORMAT_VERSION_MICRO );
+#endif
+#if HAVE_FFMS
+    printf( "(ffmpegsource %d.%d.%d.%d)\n", FFMS_VERSION >> 24, (FFMS_VERSION & 0xff0000) >> 16, (FFMS_VERSION & 0xff00) >> 8, FFMS_VERSION & 0xff );
+#endif
     printf( "built on " __DATE__ ", " );
 #ifdef __GNUC__
     printf( "gcc: " __VERSION__ "\n" );
@@ -221,9 +238,9 @@ static void print_version_info()
 #else
     printf( "Non-GPL commercial\n" );
 #endif
-#if HAVE_LAVF
-    const char *license = avformat_license();
-    printf( "libavformat license: %s\n", license );
+#if HAVE_SWSCALE
+    const char *license = swscale_license();
+    printf( "libswscale%s%s license: %s\n",HAVE_LAVF ? "/libavformat" : "", HAVE_FFMS ? "/ffmpegsource" : "" , license );
     if( !strcmp( license, "nonfree and unredistributable" ) ||
        (!HAVE_GPL && (!strcmp( license, "GPL version 2 or later" )
                   ||  !strcmp( license, "GPL version 3 or later" ))))
--
1.7.3.2.146.gca209


From b4650b65511b1f5c348d298abda02db99e7fcd16 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Fri, 19 Nov 2010 16:58:38 -0800
Subject: [PATCH 6/9] Add API function to return max number of delayed frames

---
 encoder/encoder.c |    8 +++++---
 x264.h            |    6 +++++-
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/encoder/encoder.c b/encoder/encoder.c
index 8b14b41..2d5c778 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -3320,9 +3320,6 @@ void    x264_encoder_close  ( x264_t *h )
     }
 }

-/****************************************************************************
- * x264_encoder_delayed_frames:
- ****************************************************************************/
 int x264_encoder_delayed_frames( x264_t *h )
 {
     int delayed_frames = 0;
@@ -3343,3 +3340,8 @@ int x264_encoder_delayed_frames( x264_t *h )
     x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
     return delayed_frames;
 }
+
+int x264_encoder_maximum_delayed_frames( x264_t *h )
+{
+    return h->frames.i_delay;
+}
diff --git a/x264.h b/x264.h
index ce79d40..e144e51 100644
--- a/x264.h
+++ b/x264.h
@@ -39,7 +39,7 @@

 #include <stdarg.h>

-#define X264_BUILD 108
+#define X264_BUILD 109

 /* x264_t:
  *      opaque handler for encoder */
@@ -766,6 +766,10 @@ void    x264_encoder_close  ( x264_t * );
  *      return the number of currently delayed (buffered) frames
  *      this should be used at the end of the stream, to know when you have all the encoded frames. */
 int     x264_encoder_delayed_frames( x264_t * );
+/* x264_encoder_maximum_delayed_frames( x264_t *h ):
+ *      return the maximum number of delayed (buffered) frames that can occur with the current
+ *      parameters. */
+int     x264_encoder_maximum_delayed_frames( x264_t *h );
 /* x264_encoder_intra_refresh:
  *      If an intra refresh is not in progress, begin one with the next P-frame.
  *      If an intra refresh is in progress, begin one as soon as the current one finishes.
--
1.7.3.2.146.gca209


From 3797b68a8201c74a321883f90536ce30a251e187 Mon Sep 17 00:00:00 2001
From: Anton Mitrofanov <BugMaster@narod.ru>
Date: Tue, 23 Nov 2010 23:06:51 +0300
Subject: [PATCH 7/9] Clean up of weights analyse function

---
 encoder/slicetype.c |   42 +++++++++++++++++++-----------------------
 1 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index edf74c4..dd6c360 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -57,7 +57,7 @@ static void x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a )
 }

 /* makes a non-h264 weight (i.e. fix7), into an h264 weight */
-static void x264_weight_get_h264( unsigned int weight_nonh264, int offset, x264_weight_t *w )
+static void x264_weight_get_h264( int weight_nonh264, int offset, x264_weight_t *w )
 {
     w->i_offset = offset;
     w->i_denom = 7;
@@ -221,39 +221,37 @@ static NOINLINE unsigned int x264_weight_cost_chroma( x264_t *h, x264_frame_t *f

 void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead )
 {
-    float fenc_mean, ref_mean, fenc_var, ref_var;
-    int offset_search;
-    int minoff, minscale, mindenom;
-    unsigned int minscore, origscore;
     int i_delta_index = fenc->i_frame - ref->i_frame - 1;
     /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
-    const float epsilon = 1.0/128.0;
-    float guess_scale;
-    int found;
+    const float epsilon = 1.f/128.f;
     x264_weight_t *weights = fenc->weight[0];
+    SET_WEIGHT( weights[0], 0, 1, 0, 0 );
     SET_WEIGHT( weights[1], 0, 1, 0, 0 );
     SET_WEIGHT( weights[2], 0, 1, 0, 0 );
     /* Don't check chroma in lookahead, or if there wasn't a luma weight. */
     for( int plane = 0; plane <= 2  && !( plane && ( !weights[0].weightfn || b_lookahead ) ); plane++ )
     {
-        fenc_var = round( sqrt( fenc->i_pixel_ssd[plane] ) );
-        ref_var  = round( sqrt(  ref->i_pixel_ssd[plane] ) );
-        fenc_mean = (float)fenc->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]);
-        ref_mean  = (float) ref->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]);
+        int offset_search;
+        int minoff, minscale, mindenom;
+        unsigned int minscore, origscore;
+        int found;
+        float fenc_var = fenc->i_pixel_ssd[plane] + !ref->i_pixel_ssd[plane];
+        float ref_var  =  ref->i_pixel_ssd[plane] + !ref->i_pixel_ssd[plane];
+        float guess_scale = sqrtf( fenc_var / ref_var );
+        float fenc_mean = (float)fenc->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]);
+        float ref_mean  = (float) ref->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]);

         //early termination
-        if( fabs( ref_mean - fenc_mean ) < 0.5 && fabsf( 1 - (float)fenc_var / ref_var ) < epsilon )
+        if( fabsf( ref_mean - fenc_mean ) < 0.5f && fabsf( 1.f - guess_scale ) < epsilon )
         {
             SET_WEIGHT( weights[plane], 0, 1, 0, 0 );
             continue;
         }

-        guess_scale = ref_var ? (float)fenc_var/ref_var : 0;
-
         if( plane )
         {
             weights[plane].i_denom = 6;
-            weights[plane].i_scale = x264_clip3( round(guess_scale * 64.0), 0, 255 );
+            weights[plane].i_scale = x264_clip3( (int)(guess_scale * 64.f + 0.5f), 0, 255 );
             if( weights[plane].i_scale > 127 )
             {
                 weights[1].weightfn = weights[2].weightfn = NULL;
@@ -261,9 +259,7 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int
             }
         }
         else
-            x264_weight_get_h264( round( guess_scale * 128 ), 0, &weights[plane] );
-        if( weights[plane].weightfn )
-            h->mc.weight_cache( h, &weights[plane] );
+            x264_weight_get_h264( (int)(guess_scale * 128.f + 0.5f), 0, &weights[plane] );

         found = 0;
         mindenom = weights[plane].i_denom;
@@ -280,7 +276,7 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int
                 x264_slicetype_frame_cost( h, &a, &fenc, 0, 0, 0, 0 );
             }
             mcbuf = x264_weight_cost_init_luma( h, fenc, ref, h->mb.p_weight_buf[0] );
-            origscore = minscore = x264_weight_cost_luma( h, fenc, mcbuf, 0 );
+            origscore = minscore = x264_weight_cost_luma( h, fenc, mcbuf, NULL );
         }
         else
         {
@@ -290,7 +286,7 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int
             if( plane == 1 )
                 x264_weight_cost_init_chroma( h, fenc, ref, dstu, dstv );
             mcbuf = plane == 1 ? dstu : dstv;
-            origscore = minscore = x264_weight_cost_chroma( h, fenc, mcbuf, 0 );
+            origscore = minscore = x264_weight_cost_chroma( h, fenc, mcbuf, NULL );
         }

         if( !minscore )
@@ -299,7 +295,7 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int
         // This gives a slight improvement due to rounding errors but only tests
         // one offset on lookahead.
         // TODO: currently searches only offset +1. try other offsets/multipliers/combinations thereof?
-        offset_search = x264_clip3( floor( fenc_mean - ref_mean * minscale / (1 << mindenom) + 0.5f*b_lookahead ), -128, 126 );
+        offset_search = x264_clip3( (int)(fenc_mean - ref_mean * minscale / (1 << mindenom) + 0.5f * b_lookahead), -128, 126 );
         for( int i_off = offset_search; i_off <= offset_search+!b_lookahead; i_off++ )
         {
             SET_WEIGHT( weights[plane], 1, minscale, mindenom, i_off );
@@ -314,7 +310,7 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int

         /* FIXME: More analysis can be done here on SAD vs. SATD termination. */
         /* 0.2% termination derived experimentally to avoid weird weights in frames that are mostly intra. */
-        if( !found || (minscale == 1<<mindenom && minoff == 0) || (float)minscore / origscore > 0.998 )
+        if( !found || (minscale == 1 << mindenom && minoff == 0) || (float)minscore / origscore > 0.998f )
         {
             SET_WEIGHT( weights[plane], 0, 1, 0, 0 );
             continue;
--
1.7.3.2.146.gca209


From a72ed07c56c9a456c9fa38db0722a37d0eaac101 Mon Sep 17 00:00:00 2001
From: Daniel Kang <daniel.d.kang@gmail.com>
Date: Tue, 23 Nov 2010 20:29:37 -0500
Subject: [PATCH 8/9] SSE2 version of high-bit-depth add4x4_idct
 ~6.3x faster than C.
 Our first Google Code-In patch!

---
 common/dct.c           |    4 +
 common/x86/const-a.asm |    1 +
 common/x86/dct-32.asm  |   36 ++++++------
 common/x86/dct-64.asm  |   38 ++++++------
 common/x86/dct-a.asm   |   47 +++++++++++++--
 common/x86/dct.h       |    1 +
 common/x86/pixel-a.asm |   16 +++---
 common/x86/x86util.asm |  147 +++++++++++++++++++++++++-----------------------
 8 files changed, 167 insertions(+), 123 deletions(-)

diff --git a/common/dct.c b/common/dct.c
index 975afef..1b3d87b 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -429,6 +429,10 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
         dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
     }
+    if( cpu&X264_CPU_SSE2 )
+    {
+        dctf->add4x4_idct   = x264_add4x4_idct_sse2;
+    }
 #endif // HAVE_MMX
 #else // !HIGH_BIT_DEPTH
 #if HAVE_MMX
diff --git a/common/x86/const-a.asm b/common/x86/const-a.asm
index 32579e3..d6e621e 100644
--- a/common/x86/const-a.asm
+++ b/common/x86/const-a.asm
@@ -50,6 +50,7 @@ const pw_3fff,     times 8 dw 0x3fff
 const pw_pixel_max,times 8 dw ((1 << BIT_DEPTH)-1)

 const pd_1,        times 4 dd 1
+const pd_32,       times 4 dd 32
 const pd_128,      times 4 dd 128
 const pw_00ff,     times 8 dw 0x00ff
 const pw_ff00,     times 8 dw 0xff00
diff --git a/common/x86/dct-32.asm b/common/x86/dct-32.asm
index 24e6efc..21e70c1 100644
--- a/common/x86/dct-32.asm
+++ b/common/x86/dct-32.asm
@@ -38,13 +38,13 @@ cextern hsub_mul
 ; in: m0..m7
 ; out: 0,4,6 in mem, rest in regs
 %macro DCT8_1D 9
-    SUMSUB_BA  m%8, m%1      ; %8 = s07, %1 = d07
-    SUMSUB_BA  m%7, m%2      ; %7 = s16, %2 = d16
-    SUMSUB_BA  m%6, m%3      ; %6 = s25, %3 = d25
-    SUMSUB_BA  m%5, m%4      ; %5 = s34, %4 = d34
-    SUMSUB_BA  m%5, m%8      ; %5 = a0,  %8 = a2
-    SUMSUB_BA  m%6, m%7      ; %6 = a1,  %7 = a3
-    SUMSUB_BA  m%6, m%5      ; %6 = dst0, %5 = dst4
+    SUMSUB_BA  w, m%8, m%1      ; %8 = s07, %1 = d07
+    SUMSUB_BA  w, m%7, m%2      ; %7 = s16, %2 = d16
+    SUMSUB_BA  w, m%6, m%3      ; %6 = s25, %3 = d25
+    SUMSUB_BA  w, m%5, m%4      ; %5 = s34, %4 = d34
+    SUMSUB_BA  w, m%5, m%8      ; %5 = a0,  %8 = a2
+    SUMSUB_BA  w, m%6, m%7      ; %6 = a1,  %7 = a3
+    SUMSUB_BA  w, m%6, m%5      ; %6 = dst0, %5 = dst4
     mova    [%9+0x00], m%6
     mova    [%9+0x40], m%5
     mova    m%6, m%7         ; a3
@@ -127,13 +127,13 @@ cextern hsub_mul
     psubw     m%2, m%1
     mova      m%1, [%9+0x00]
     mova      m%6, [%9+0x40]
-    SUMSUB_BA m%6, m%1
-    SUMSUB_BA m%7, m%6
-    SUMSUB_BA m%3, m%1
-    SUMSUB_BA m%5, m%7
-    SUMSUB_BA m%2, m%3
-    SUMSUB_BA m%8, m%1
-    SUMSUB_BA m%4, m%6
+    SUMSUB_BA w, m%6, m%1
+    SUMSUB_BA w, m%7, m%6
+    SUMSUB_BA w, m%3, m%1
+    SUMSUB_BA w, m%5, m%7
+    SUMSUB_BA w, m%2, m%3
+    SUMSUB_BA w, m%8, m%1
+    SUMSUB_BA w, m%4, m%6
     SWAP %1, %5, %6
     SWAP %3, %8, %7
 %endmacro
@@ -434,18 +434,18 @@ global add8x8_idct_sse2.skip_prologue
     SBUTTERFLY qdq, 4, 5, 0
     SBUTTERFLY qdq, 6, 7, 0
     UNSPILL r1,0
-    IDCT4_1D 0,1,2,3,r1
+    IDCT4_1D w,0,1,2,3,r1
     SPILL r1, 4
     TRANSPOSE2x4x4W 0,1,2,3,4
     UNSPILL r1, 4
-    IDCT4_1D 4,5,6,7,r1
+    IDCT4_1D w,4,5,6,7,r1
     SPILL r1, 0
     TRANSPOSE2x4x4W 4,5,6,7,0
     UNSPILL r1, 0
     paddw m0, [pw_32]
-    IDCT4_1D 0,1,2,3,r1
+    IDCT4_1D w,0,1,2,3,r1
     paddw m4, [pw_32]
-    IDCT4_1D 4,5,6,7,r1
+    IDCT4_1D w,4,5,6,7,r1
     SPILL r1, 6,7
     pxor m7, m7
     DIFFx2 m0, m1, m6, m7, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]; m5
diff --git a/common/x86/dct-64.asm b/common/x86/dct-64.asm
index 5e43b9c..70edcbd 100644
--- a/common/x86/dct-64.asm
+++ b/common/x86/dct-64.asm
@@ -36,13 +36,13 @@ cextern hsub_mul
 INIT_XMM

 %macro DCT8_1D 10
-    SUMSUB_BA  m%5, m%4 ; %5=s34, %4=d34
-    SUMSUB_BA  m%6, m%3 ; %6=s25, %3=d25
-    SUMSUB_BA  m%7, m%2 ; %7=s16, %2=d16
-    SUMSUB_BA  m%8, m%1 ; %8=s07, %1=d07
+    SUMSUB_BA  w, m%5, m%4 ; %5=s34, %4=d34
+    SUMSUB_BA  w, m%6, m%3 ; %6=s25, %3=d25
+    SUMSUB_BA  w, m%7, m%2 ; %7=s16, %2=d16
+    SUMSUB_BA  w, m%8, m%1 ; %8=s07, %1=d07

-    SUMSUB_BA  m%6, m%7, m%10 ; %6=a1, %7=a3
-    SUMSUB_BA  m%5, m%8, m%10 ; %5=a0, %8=a2
+    SUMSUB_BA  w, m%6, m%7, m%10 ; %6=a1, %7=a3
+    SUMSUB_BA  w, m%5, m%8, m%10 ; %5=a0, %8=a2

     movdqa  m%9, m%1
     psraw   m%9, 1
@@ -56,7 +56,7 @@ INIT_XMM
     paddw   m%10, m%2
     psubw   m%10, m%3 ; %10=a7

-    SUMSUB_BA  m%4, m%1
+    SUMSUB_BA  w, m%4, m%1
     psubw   m%1, m%3
     psubw   m%4, m%2
     psraw   m%3, 1
@@ -70,7 +70,7 @@ INIT_XMM
     psraw   m%9, 2
     psubw   m%9, m%10 ; %9=b7

-    SUMSUB_BA  m%6, m%5, m%10 ; %6=b0, %5=b4
+    SUMSUB_BA  w, m%6, m%5, m%10 ; %6=b0, %5=b4

     movdqa  m%3, m%7
     psraw   m%3, 1
@@ -88,7 +88,7 @@ INIT_XMM
 %endmacro

 %macro IDCT8_1D 10
-    SUMSUB_BA  m%5, m%1, m%9 ; %5=a0, %1=a2
+    SUMSUB_BA  w, m%5, m%1, m%9 ; %5=a0, %1=a2

     movdqa  m%9, m%2
     psraw   m%9, 1
@@ -123,8 +123,8 @@ INIT_XMM
     psraw   m%6, 2
     psubw   m%9, m%6 ; %9=b7

-    SUMSUB_BA m%7, m%5, m%6 ; %7=b0, %5=b6
-    SUMSUB_BA m%3, m%1, m%6; %3=b2, %1=b4
+    SUMSUB_BA w, m%7, m%5, m%6 ; %7=b0, %5=b6
+    SUMSUB_BA w, m%3, m%1, m%6; %3=b2, %1=b4

     movdqa  m%8, m%10
     psraw   m%8, 2
@@ -132,10 +132,10 @@ INIT_XMM
     psraw   m%2, 2
     psubw   m%2, m%10 ; %2=b5

-    SUMSUB_BA m%9, m%7, m%6 ; %9=c0, %7=c7
-    SUMSUB_BA m%2, m%3, m%6 ; %2=c1, %3=c6
-    SUMSUB_BA m%8, m%1, m%6 ; %8=c2, %1=c5
-    SUMSUB_BA m%4, m%5, m%6 ; %4=c3, %5=c4
+    SUMSUB_BA w, m%9, m%7, m%6 ; %9=c0, %7=c7
+    SUMSUB_BA w, m%2, m%3, m%6 ; %2=c1, %3=c6
+    SUMSUB_BA w, m%8, m%1, m%6 ; %8=c2, %1=c5
+    SUMSUB_BA w, m%4, m%5, m%6 ; %4=c3, %5=c4

     SWAP %1, %9, %6
     SWAP %3, %8, %7
@@ -263,14 +263,14 @@ global add8x8_idct_sse2.skip_prologue
     mova   m7, [r1+112]
     SBUTTERFLY qdq, 4, 5, 8
     SBUTTERFLY qdq, 6, 7, 8
-    IDCT4_1D 0,1,2,3,8,10
+    IDCT4_1D w,0,1,2,3,8,10
     TRANSPOSE2x4x4W 0,1,2,3,8
-    IDCT4_1D 4,5,6,7,8,10
+    IDCT4_1D w,4,5,6,7,8,10
     TRANSPOSE2x4x4W 4,5,6,7,8
     paddw m0, [pw_32]
-    IDCT4_1D 0,1,2,3,8,10
+    IDCT4_1D w,0,1,2,3,8,10
     paddw m4, [pw_32]
-    IDCT4_1D 4,5,6,7,8,10
+    IDCT4_1D w,4,5,6,7,8,10
     DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
     DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
     DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 67fa34a..0e4b514 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -52,13 +52,15 @@ SECTION .text
 cextern pw_32_0
 cextern pw_32
 cextern pw_8000
+cextern pw_pixel_max
 cextern hsub_mul
 cextern pb_1
 cextern pw_1
+cextern pd_32

 %macro WALSH4_1D 5
-    SUMSUB_BADC m%4, m%3, m%2, m%1, m%5
-    SUMSUB_BADC m%4, m%2, m%3, m%1, m%5
+    SUMSUB_BADC w, m%4, m%3, m%2, m%1, m%5
+    SUMSUB_BADC w, m%4, m%2, m%3, m%1, m%5
     SWAP %1, %4, %3
 %endmacro

@@ -86,7 +88,7 @@ cglobal dct4x4dc_mmx, 1,1
     movq   m7, [pw_8000] ; convert to unsigned and back, so that pavgw works
     WALSH4_1D  0,1,2,3,4
     TRANSPOSE4x4W 0,1,2,3,4
-    SUMSUB_BADC m1, m0, m3, m2, m4
+    SUMSUB_BADC w, m1, m0, m3, m2, m4
     SWAP 0, 1
     SWAP 2, 3
     SUMSUB_17BIT 0,2,4,7
@@ -175,10 +177,10 @@ cglobal add4x4_idct_mmx, 2,2
     movq  m3, [r1+24]
     movq  m2, [r1+16]
     movq  m0, [r1+ 0]
-    IDCT4_1D 0,1,2,3,4,5
+    IDCT4_1D w,0,1,2,3,4,5
     TRANSPOSE4x4W 0,1,2,3,4
     paddw m0, [pw_32]
-    IDCT4_1D 0,1,2,3,4,5
+    IDCT4_1D w,0,1,2,3,4,5
     STORE_DIFF  m0, m4, m7, [r0+0*FDEC_STRIDE]
     STORE_DIFF  m1, m4, m7, [r0+1*FDEC_STRIDE]
     STORE_DIFF  m2, m4, m7, [r0+2*FDEC_STRIDE]
@@ -198,7 +200,7 @@ cglobal add4x4_idct_sse4, 2,2,6
     psubw     m0, m3            ; row1>>1-row3/row0-2
     paddw     m2, m1            ; row3>>1+row1/row0+2
     SBUTTERFLY2 wd, 0, 2, 1
-    SUMSUB_BA m2, m0, m1
+    SUMSUB_BA w, m2, m0, m1
     pshuflw   m1, m2, 10110001b
     pshufhw   m2, m2, 10110001b
     punpckldq m1, m0
@@ -215,7 +217,7 @@ cglobal add4x4_idct_sse4, 2,2,6
     psubw     m0, m3            ; row1>>1-row3/row0-2
     paddw     m2, m1            ; row3>>1+row1/row0+2
     SBUTTERFLY2 qdq, 0, 2, 1
-    SUMSUB_BA m2, m0, m1
+    SUMSUB_BA w, m2, m0, m1

     movd      m4, [r0+FDEC_STRIDE*0]
     movd      m1, [r0+FDEC_STRIDE*1]
@@ -236,6 +238,37 @@ cglobal add4x4_idct_sse4, 2,2,6
     movd     [r0+FDEC_STRIDE*2], m0
     pextrd   [r0+FDEC_STRIDE*3], m0, 1
     RET
+
+%else
+
+%macro STORE_DIFFx2 6 ; %1,%2: dword inputs  %3,%4: scratch  %5,%6: memory operands (read, then written)
+    psrad     %1, 6                  ; arithmetic >>6 on every dword of %1
+    psrad     %2, 6                  ; same for %2
+    packssdw  %1, %2                 ; pack dwords to signed-saturated words: %1 -> low half, %2 -> high half
+    movq      %3, %5                 ; load 64 bits from %5 into the low half of %3
+    movhps    %3, %6                 ; load 64 bits from %6 into the high half of %3
+    paddsw    %1, %3                 ; add the loaded words, with signed saturation
+    pxor      %4, %4                 ; %4 = 0
+    CLIPW     %1, %4, [pw_pixel_max] ; presumably clamps each word to [0, pw_pixel_max] - CLIPW is defined outside this hunk, confirm
+    movq      %5, %1                 ; store the low half back to %5
+    movhps    %6, %1                 ; store the high half back to %6
+%endmacro
+
+INIT_XMM
+cglobal add4x4_idct_sse2, 2,2,7
+    pxor  m6, m6                ; m6 = 0
+.skip_prologue:
+    mova  m1, [r1+16]           ; 16 bytes at r1+16
+    mova  m3, [r1+48]           ; 16 bytes at r1+48
+    mova  m2, [r1+32]           ; 16 bytes at r1+32
+    mova  m0, [r1+ 0]           ; 16 bytes at r1+0 (the C prototype declares r1 as int32_t dct[16])
+    IDCT4_1D d,0,1,2,3,4,5      ; first 1-D pass, on dword lanes
+    TRANSPOSE4x4D 0,1,2,3,4
+    paddd m0, [pd_32]           ; add 32 to each dword of m0; looks like the rounding term for the >>6 in STORE_DIFFx2
+    IDCT4_1D d,0,1,2,3,4,5      ; second 1-D pass
+    STORE_DIFFx2 m0, m1, m4, m6, [r0+0*FDEC_STRIDE], [r0+2*FDEC_STRIDE] ; NOTE(review): assumes 2*FDEC_STRIDE bytes is one row of 16-bit pixels - confirm
+    STORE_DIFFx2 m2, m3, m4, m6, [r0+4*FDEC_STRIDE], [r0+6*FDEC_STRIDE] ; same layout assumption for the next two rows
+    RET
 %endif ; !HIGH_BIT_DEPTH

 INIT_MMX
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 58b9d17..ec6ec25 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -40,6 +40,7 @@ void x264_sub8x8_dct_dc_mmxext( int16_t dct    [ 4], uint8_t *pix1, uint8_t *pix
 void x264_sub8x8_dct_dc_sse2  ( int16_t dct    [ 4], uint8_t *pix1, uint8_t *pix2 );

 void x264_add4x4_idct_mmx       ( uint8_t *p_dst, int16_t dct    [16] );
+void x264_add4x4_idct_sse2     ( uint16_t *p_dst, int32_t dct    [16] );
 void x264_add4x4_idct_sse4      ( uint8_t *p_dst, int16_t dct    [16] );
 void x264_add8x8_idct_mmx       ( uint8_t *p_dst, int16_t dct[ 4][16] );
 void x264_add8x8_idct_dc_mmx    ( uint8_t *p_dst, int16_t dct    [ 4] );
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index acb3612..6cd79e1 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -881,7 +881,7 @@ cglobal pixel_var2_8x8_ssse3, 5,6,8
     DEINTB %1, %2, %3, %4, %5
     psubw m%1, m%3
     psubw m%2, m%4
-    SUMSUB_BA m%1, m%2, m%3
+    SUMSUB_BA w, m%1, m%2, m%3
 %endmacro

 %macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none
@@ -1278,10 +1278,10 @@ cglobal pixel_sa8d_8x8_internal_%1
 %else ; non-sse2
     HADAMARD4_V m0, m1, m2, m8, m6
     HADAMARD4_V m4, m5, m3, m9, m6
-    SUMSUB_BADC m0, m4, m1, m5, m6
+    SUMSUB_BADC w, m0, m4, m1, m5, m6
     HADAMARD 2, sumsub, 0, 4, 6, 11
     HADAMARD 2, sumsub, 1, 5, 6, 11
-    SUMSUB_BADC m2, m3, m8, m9, m6
+    SUMSUB_BADC w, m2, m3, m8, m9, m6
     HADAMARD 2, sumsub, 2, 3, 6, 11
     HADAMARD 2, sumsub, 8, 9, 6, 11
     HADAMARD 1, amax, 0, 4, 6, 11
@@ -1379,7 +1379,7 @@ cglobal pixel_sa8d_8x8_internal_%1
     mova spill0, m6
     mova spill1, m7
     HADAMARD4_V m0, m1, m2, m3, m7
-    SUMSUB_BADC m0, m4, m1, m5, m7
+    SUMSUB_BADC w, m0, m4, m1, m5, m7
     HADAMARD 2, sumsub, 0, 4, 7, 6
     HADAMARD 2, sumsub, 1, 5, 7, 6
     HADAMARD 1, amax, 0, 4, 7, 6
@@ -1387,7 +1387,7 @@ cglobal pixel_sa8d_8x8_internal_%1
     mova m6, spill0
     mova m7, spill1
     paddw m0, m1
-    SUMSUB_BADC m2, m6, m3, m7, m4
+    SUMSUB_BADC w, m2, m6, m3, m7, m4
     HADAMARD 2, sumsub, 2, 6, 4, 5
     HADAMARD 2, sumsub, 3, 7, 4, 5
     HADAMARD 1, amax, 2, 6, 4, 5
@@ -1994,7 +1994,7 @@ cglobal hadamard_ac_2x2max_mmxext
     mova      m2, [r3+0x40]
     mova      m3, [r3+0x60]
     sub       r3, 8
-    SUMSUB_BADC m0, m1, m2, m3, m4
+    SUMSUB_BADC w, m0, m1, m2, m3, m4
     ABS4 m0, m2, m1, m3, m4, m5
     HADAMARD 0, max, 0, 2, 4, 5
     HADAMARD 0, max, 1, 3, 4, 5
@@ -2059,7 +2059,7 @@ cglobal hadamard_ac_8x8_mmxext
     mova      m1, [r3+0x20]
     mova      m2, [r3+0x40]
     mova      m3, [r3+0x60]
-    SUMSUB_BADC m0, m1, m2, m3, m4
+    SUMSUB_BADC w, m0, m1, m2, m3, m4
     HADAMARD 0, sumsub, 0, 2, 4, 5
     ABS4 m1, m3, m0, m2, m4, m5
     HADAMARD 0, max, 1, 3, 4, 5
@@ -2266,7 +2266,7 @@ cglobal hadamard_ac_8x8_%1
     ABS_MOV   m2, m4
     ABS_MOV   m3, m5
     paddw     m1, m2
-    SUMSUB_BA m0, m4; m2
+    SUMSUB_BA w, m0, m4; m2
 %if vertical
     pand      m1, [mask_ac4]
 %else
diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm
index 9acaa3d..d16a237 100644
--- a/common/x86/x86util.asm
+++ b/common/x86/x86util.asm
@@ -241,44 +241,44 @@
     psrlw  m%4, 8   ; src .. y7 .. y5
 %endmacro

-%macro SUMSUB_BA 2-3
-%if %0==2
-    paddw   %1, %2
-    paddw   %2, %2
-    psubw   %2, %1
+%macro SUMSUB_BA 3-4
+%if %0==3
+    padd%1  %2, %3
+    padd%1  %3, %3
+    psub%1  %3, %2
 %else
-    mova    %3, %1
-    paddw   %1, %2
-    psubw   %2, %3
+    mova    %4, %2
+    padd%1  %2, %3
+    psub%1  %3, %4
 %endif
 %endmacro

-%macro SUMSUB_BADC 4-5
-%if %0==5
-    SUMSUB_BA %1, %2, %5
-    SUMSUB_BA %3, %4, %5
+%macro SUMSUB_BADC 5-6
+%if %0==6
+    SUMSUB_BA %1, %2, %3, %6
+    SUMSUB_BA %1, %4, %5, %6
 %else
-    paddw   %1, %2
-    paddw   %3, %4
-    paddw   %2, %2
-    paddw   %4, %4
-    psubw   %2, %1
-    psubw   %4, %3
+    padd%1  %2, %3
+    padd%1  %4, %5
+    padd%1  %3, %3
+    padd%1  %5, %5
+    psub%1  %3, %2
+    psub%1  %5, %4
 %endif
 %endmacro

 %macro HADAMARD4_V 4+
-    SUMSUB_BADC %1, %2, %3, %4
-    SUMSUB_BADC %1, %3, %2, %4
+    SUMSUB_BADC w, %1, %2, %3, %4
+    SUMSUB_BADC w, %1, %3, %2, %4
 %endmacro

 %macro HADAMARD8_V 8+
-    SUMSUB_BADC %1, %2, %3, %4
-    SUMSUB_BADC %5, %6, %7, %8
-    SUMSUB_BADC %1, %3, %2, %4
-    SUMSUB_BADC %5, %7, %6, %8
-    SUMSUB_BADC %1, %5, %2, %6
-    SUMSUB_BADC %3, %7, %4, %8
+    SUMSUB_BADC w, %1, %2, %3, %4
+    SUMSUB_BADC w, %5, %6, %7, %8
+    SUMSUB_BADC w, %1, %3, %2, %4
+    SUMSUB_BADC w, %5, %7, %6, %8
+    SUMSUB_BADC w, %1, %5, %2, %6
+    SUMSUB_BADC w, %3, %7, %4, %8
 %endmacro

 %macro TRANS_SSE2 5-6
@@ -363,7 +363,7 @@
     %endif
 %endif
 %ifidn %2, sumsub
-    SUMSUB_BA m%3, m%4, m%5
+    SUMSUB_BA w, m%3, m%4, m%5
 %else
     %ifidn %2, amax
         %if %0==6
@@ -426,67 +426,72 @@
 %endif
 %endmacro

-%macro SUMSUB2_AB 3
-    mova    %3, %1
-    paddw   %1, %1
-    paddw   %1, %2
-    psubw   %3, %2
-    psubw   %3, %2
+%macro SUMSUB2_AB 4 ; %1: element-size suffix for padd/psub, %2: a, %3: b, %4: scratch/second output
+    mova    %4, %2  ; %4: copy of a
+    padd%1  %2, %2  ; %2: 2*a
+    padd%1  %2, %3  ; %2: 2*a+b
+    psub%1  %4, %3  ; %4: a-b
+    psub%1  %4, %3  ; %4: a-2*b
 %endmacro

-%macro SUMSUB2_BA 3
-    mova    m%3, m%1
-    paddw   m%1, m%2
-    paddw   m%1, m%2
-    psubw   m%2, m%3
-    psubw   m%2, m%3
+%macro SUMSUB2_BA 4 ; %1: element-size suffix for padd/psub, %2-%4: register numbers (the m prefix is added here)
+    mova    m%4, m%2 ; m%4: copy of a (a = original m%2, b = original m%3)
+    padd%1  m%2, m%3 ; m%2: a+b
+    padd%1  m%2, m%3 ; m%2: a+2*b
+    psub%1  m%3, m%4 ; m%3: b-a
+    psub%1  m%3, m%4 ; m%3: b-2*a
 %endmacro

-%macro SUMSUBD2_AB 4
-    mova    %4, %1
-    mova    %3, %2
-    psraw   %2, 1  ; %2: %2>>1
-    psraw   %1, 1  ; %1: %1>>1
-    paddw   %2, %4 ; %2: %2>>1+%1
-    psubw   %1, %3 ; %1: %1>>1-%2
+%macro SUMSUBD2_AB 5 ; %1: element-size suffix for psra/padd/psub, %2/%3: in/out, %4/%5: scratch
+    mova    %5, %2 ; %5: copy of original %2
+    mova    %4, %3 ; %4: copy of original %3
+    psra%1  %3, 1  ; %3: %3>>1
+    psra%1  %2, 1  ; %2: %2>>1
+    padd%1  %3, %5 ; %3: %3>>1+%2
+    psub%1  %2, %4 ; %2: %2>>1-%3
 %endmacro

 %macro DCT4_1D 5
 %ifnum %5
-    SUMSUB_BADC m%4, m%1, m%3, m%2; m%5
-    SUMSUB_BA   m%3, m%4, m%5
-    SUMSUB2_AB  m%1, m%2, m%5
+    SUMSUB_BADC w, m%4, m%1, m%3, m%2; m%5
+    SUMSUB_BA   w, m%3, m%4, m%5
+    SUMSUB2_AB  w, m%1, m%2, m%5
     SWAP %1, %3, %4, %5, %2
 %else
-    SUMSUB_BADC m%4, m%1, m%3, m%2
-    SUMSUB_BA   m%3, m%4
+    SUMSUB_BADC w, m%4, m%1, m%3, m%2
+    SUMSUB_BA   w, m%3, m%4
     mova       [%5], m%2
-    SUMSUB2_AB m%1, [%5], m%2
+    SUMSUB2_AB w, m%1, [%5], m%2
     SWAP %1, %3, %4, %2
 %endif
 %endmacro

-%macro IDCT4_1D 5-6
-%ifnum %5
-    SUMSUBD2_AB m%2, m%4, m%6, m%5
-    ; %2: %2>>1-%4 %4: %2+%4>>1
-    SUMSUB_BA   m%3, m%1, m%6
-    ; %3: %1+%3 %1: %1-%3
-    SUMSUB_BADC m%4, m%3, m%2, m%1, m%6
-    ; %4: %1+%3 + (%2+%4>>1)
-    ; %3: %1+%3 - (%2+%4>>1)
-    ; %2: %1-%3 + (%2>>1-%4)
-    ; %1: %1-%3 - (%2>>1-%4)
+%macro IDCT4_1D 6-7
+%ifnum %6
+    SUMSUBD2_AB %1, m%3, m%5, m%7, m%6
+    ; %3: %3>>1-%5 %5: %3+%5>>1
+    SUMSUB_BA   %1, m%4, m%2, m%7
+    ; %4: %2+%4 %2: %2-%4
+    SUMSUB_BADC %1, m%5, m%4, m%3, m%2, m%7
+    ; %5: %2+%4 + (%3+%5>>1)
+    ; %4: %2+%4 - (%3+%5>>1)
+    ; %3: %2-%4 + (%3>>1-%5)
+    ; %2: %2-%4 - (%3>>1-%5)
 %else
-    SUMSUBD2_AB m%2, m%4, [%5], [%5+16]
-    SUMSUB_BA   m%3, m%1
-    SUMSUB_BADC m%4, m%3, m%2, m%1
+%ifidn %1,w
+    SUMSUBD2_AB %1, m%3, m%5, [%6], [%6+16]
+%endif
+%ifidn %1,d
+    SUMSUBD2_AB %1, m%3, m%5, [%6], [%6+32]
+%endif
+    SUMSUB_BA   %1, m%4, m%2
+    SUMSUB_BADC %1, m%5, m%4, m%3, m%2
 %endif
-    SWAP %1, %4, %3
-    ; %1: %1+%3 + (%2+%4>>1) row0
-    ; %2: %1-%3 + (%2>>1-%4) row1
-    ; %3: %1-%3 - (%2>>1-%4) row2
-    ; %4: %1+%3 - (%2+%4>>1) row3
+    SWAP %2, %5, %4
+    ; %2: %2+%4 + (%3+%5>>1) row0
+    ; %3: %2-%4 + (%3>>1-%5) row1
+    ; %4: %2-%4 - (%3>>1-%5) row2
+    ; %5: %2+%4 - (%3+%5>>1) row3
 %endmacro


--
1.7.3.2.146.gca209


From a989eef327f86107f565e448a17ba07a06546d8d Mon Sep 17 00:00:00 2001
From: Alex Wright <alexw0885@gmail.com>
Date: Wed, 24 Nov 2010 02:19:51 -0800
Subject: [PATCH 9/9] Make --weightp 1 a better speed tradeoff
 Since fade analysis is now so fast, weightp 1 now does fade analysis but no reference duplication.
 This is the opposite of what it used to do (reference duplication but no fade analysis).
 This also gives weightp's better fade quality to faster presets (up to superfast).

---
 common/common.c       |    7 ++++---
 common/macroblock.c   |    8 +++-----
 encoder/encoder.c     |   23 ++++++++---------------
 encoder/ratecontrol.c |    4 ++--
 encoder/slicetype.c   |    5 ++---
 x264.c                |    4 ++--
 x264.h                |    4 ++--
 7 files changed, 23 insertions(+), 32 deletions(-)

diff --git a/common/common.c b/common/common.c
index 1f99e9e..1845e3b 100644
--- a/common/common.c
+++ b/common/common.c
@@ -204,7 +204,7 @@ static int x264_param_apply_preset( x264_param_t *param, const char *preset )
         param->analyse.b_mixed_references = 0;
         param->analyse.i_trellis = 0;
         param->rc.b_mb_tree = 0;
-        param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
+        param->analyse.i_weighted_pred = X264_WEIGHTP_SIMPLE;
         param->rc.i_lookahead = 0;
     }
     else if( !strcasecmp( preset, "veryfast" ) )
@@ -214,7 +214,7 @@ static int x264_param_apply_preset( x264_param_t *param, const char *preset )
         param->i_frame_reference = 1;
         param->analyse.b_mixed_references = 0;
         param->analyse.i_trellis = 0;
-        param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
+        param->analyse.i_weighted_pred = X264_WEIGHTP_SIMPLE;
         param->rc.i_lookahead = 10;
     }
     else if( !strcasecmp( preset, "faster" ) )
@@ -222,13 +222,14 @@ static int x264_param_apply_preset( x264_param_t *param, const char *preset )
         param->analyse.b_mixed_references = 0;
         param->i_frame_reference = 2;
         param->analyse.i_subpel_refine = 4;
-        param->analyse.i_weighted_pred = X264_WEIGHTP_BLIND;
+        param->analyse.i_weighted_pred = X264_WEIGHTP_SIMPLE;
         param->rc.i_lookahead = 20;
     }
     else if( !strcasecmp( preset, "fast" ) )
     {
         param->i_frame_reference = 2;
         param->analyse.i_subpel_refine = 6;
+        param->analyse.i_weighted_pred = X264_WEIGHTP_SIMPLE;
         param->rc.i_lookahead = 30;
     }
     else if( !strcasecmp( preset, "medium" ) )
diff --git a/common/macroblock.c b/common/macroblock.c
index 5c76d3f..9075efc9 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -239,8 +239,6 @@ int x264_macroblock_cache_allocate( x264_t *h )
         int i_refs = X264_MIN(X264_REF_MAX, (i ? 1 + !!h->param.i_bframe_pyramid : h->param.i_frame_reference) ) << h->param.b_interlaced;
         if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
             i_refs = X264_MIN(X264_REF_MAX, i_refs + 1 + (BIT_DEPTH == 8)); //smart weights add two duplicate frames, one in >8-bit
-        else if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_BLIND )
-            i_refs = X264_MIN(X264_REF_MAX, i_refs + 1); //blind weights add one duplicate frame

         for( int j = !i; j < i_refs; j++ )
         {
@@ -277,7 +275,7 @@ int x264_macroblock_cache_allocate( x264_t *h )
                 //SMART can weight one ref and one offset -1
                 numweightbuf = 2;
             else
-                //blind only has one weighted copy (offset -1)
+                //simple only has one weighted ref
                 numweightbuf = 1;
         }

@@ -398,7 +396,7 @@ void x264_macroblock_slice_init( x264_t *h )
     {
         memset( h->mb.cache.skip, 0, sizeof( h->mb.cache.skip ) );

-        if( h->sh.i_disable_deblocking_filter_idc != 1 && h->param.analyse.i_weighted_pred )
+        if( h->sh.i_disable_deblocking_filter_idc != 1 && h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
         {
             deblock_ref_table(-2) = -2;
             deblock_ref_table(-1) = -1;
@@ -999,7 +997,7 @@ void x264_macroblock_cache_load_deblock( x264_t *h )
         h->mb.i_neighbour = new_neighbour;
     }

-    if( h->param.analyse.i_weighted_pred && h->sh.i_type == SLICE_TYPE_P )
+    if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART && h->sh.i_type == SLICE_TYPE_P )
     {
         /* Handle reference frame duplicates */
         int i8 = x264_scan8[0] - 8;
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 2d5c778..dbbe9a0 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -772,8 +772,6 @@ static int x264_validate_parameters( x264_t *h )
     h->param.analyse.i_weighted_pred = x264_clip3( h->param.analyse.i_weighted_pred, X264_WEIGHTP_NONE, X264_WEIGHTP_SMART );
     if( !h->param.analyse.i_weighted_pred && h->param.rc.b_mb_tree && h->param.analyse.b_psy && !h->param.b_interlaced )
         h->param.analyse.i_weighted_pred = X264_WEIGHTP_FAKE;
-    if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_BLIND && BIT_DEPTH > 8 )
-        h->param.analyse.i_weighted_pred = X264_WEIGHTP_NONE;

     if( h->i_thread_frames > 1 )
     {
@@ -996,7 +994,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
           || h->param.i_bframe_adaptive
           || h->param.i_scenecut_threshold
           || h->param.rc.b_mb_tree
-          || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART );
+          || h->param.analyse.i_weighted_pred );
     h->frames.b_have_lowres |= h->param.rc.b_stat_read && h->param.rc.i_vbv_buffer_size > 0;
     h->frames.b_have_sub8x8_esa = !!(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);

@@ -1432,6 +1430,10 @@ int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t
     if( i <= 1 ) /* empty list, definitely can't duplicate frame */
         return -1;

+    //Duplication isn't used for X264_WEIGHTP_SIMPLE
+    if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SIMPLE )
+        return -1;
+
     /* Duplication is a hack to compensate for crappy rounding in motion compensation.
      * With high bit depth, it's not worth doing, so turn it off except in the case of
      * unweighted dupes. */
@@ -1609,7 +1611,7 @@ static inline void x264_reference_build_list( x264_t *h, int i_poc )
     if( h->fenc->i_type == X264_TYPE_P )
     {
         int idx = -1;
-        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
+        if( h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE )
         {
             x264_weight_t w[3];
             w[1].weightfn = w[2].weightfn = NULL;
@@ -1638,15 +1640,6 @@ static inline void x264_reference_build_list( x264_t *h, int i_poc )
                 }
             }
         }
-        else if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_BLIND )
-        {
-            //weighted offset=-1
-            x264_weight_t w[3];
-            SET_WEIGHT( w[0], 1, 1, 0, -1 );
-            h->fenc->weight[0][0].i_denom = 0;
-            w[1].weightfn = w[2].weightfn = NULL;
-            idx = x264_weighted_reference_duplicate( h, 0, w );
-        }
         h->mb.ref_blind_dupe = idx;
     }

@@ -2876,7 +2869,7 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
     if( h->sh.i_type == SLICE_TYPE_P )
     {
         h->stat.i_consecutive_bframes[h->fdec->i_frame - h->fref0[0]->i_frame - 1]++;
-        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
+        if( h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE )
         {
             h->stat.i_wpred[0] += !!h->sh.weight[0][0].weightfn;
             h->stat.i_wpred[1] += !!h->sh.weight[0][1].weightfn || !!h->sh.weight[0][2].weightfn;
@@ -3225,7 +3218,7 @@ void    x264_encoder_close  ( x264_t *h )
                       fixed_pred_modes[3][2] * 100.0 / sum_pred_modes[3],
                       fixed_pred_modes[3][3] * 100.0 / sum_pred_modes[3] );

-        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART && h->stat.i_frame_count[SLICE_TYPE_P] > 0 )
+        if( h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE && h->stat.i_frame_count[SLICE_TYPE_P] > 0 )
             x264_log( h, X264_LOG_INFO, "Weighted P-Frames: Y:%.1f%% UV:%.1f%%\n",
                       h->stat.i_wpred[0] * 100.0 / h->stat.i_frame_count[SLICE_TYPE_P],
                       h->stat.i_wpred[1] * 100.0 / h->stat.i_frame_count[SLICE_TYPE_P] );
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index e949e24..780c0e1 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -284,7 +284,7 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_off
             }
         }
         /* Need variance data for weighted prediction */
-        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
+        if( h->param.analyse.i_weighted_pred )
         {
             for( int mb_y = 0; mb_y < h->mb.i_mb_height; mb_y++ )
                 for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x++ )
@@ -1558,7 +1558,7 @@ int x264_ratecontrol_end( x264_t *h, int bits, int *filler )
                 goto fail;
         }

-        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART && h->sh.weight[0][0].weightfn )
+        if( h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE && h->sh.weight[0][0].weightfn )
         {
             if( fprintf( rc->p_stat_file_out, "w:%d,%d,%d",
                          h->sh.weight[0][0].i_denom, h->sh.weight[0][0].i_scale, h->sh.weight[0][0].i_offset ) < 0 )
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index dd6c360..4f47710 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -647,8 +647,7 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
         do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
         if( do_search[0] )
         {
-            if( ( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART ||
-                  h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE ) && b == p1 )
+            if( h->param.analyse.i_weighted_pred && b == p1 )
             {
                 x264_emms();
                 x264_weights_analyse( h, frames[b], frames[p0], 1 );
@@ -1549,7 +1548,7 @@ void x264_slicetype_decide( x264_t *h )

     /* Analyse for weighted P frames */
     if( !h->param.rc.b_stat_read && h->lookahead->next.list[bframes]->i_type == X264_TYPE_P
-        && h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
+        && h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE )
     {
         x264_emms();
         x264_weights_analyse( h, h->lookahead->next.list[bframes], h->lookahead->last_nonb, 0 );
diff --git a/x264.c b/x264.c
index f9a1c6a..b4530f8 100644
--- a/x264.c
+++ b/x264.c
@@ -609,8 +609,8 @@ static void help( x264_param_t *defaults, int longhelp )
     H2( "      --no-weightb            Disable weighted prediction for B-frames\n" );
     H1( "      --weightp <integer>     Weighted prediction for P-frames [%d]\n"
         "                                  - 0: Disabled\n"
-        "                                  - 1: Blind offset\n"
-        "                                  - 2: Smart analysis\n", defaults->analyse.i_weighted_pred );
+        "                                  - 1: Weighted refs\n"
+        "                                  - 2: Weighted refs + Duplicates\n", defaults->analyse.i_weighted_pred );
     H1( "      --me <string>           Integer pixel motion estimation method [\"%s\"]\n",
                                        strtable_lookup( x264_motion_est_names, defaults->analyse.i_me_method ) );
     H2( "                                  - dia: diamond search, radius 1 (fast)\n"
diff --git a/x264.h b/x264.h
index e144e51..bfe478b 100644
--- a/x264.h
+++ b/x264.h
@@ -39,7 +39,7 @@

 #include <stdarg.h>

-#define X264_BUILD 109
+#define X264_BUILD 110

 /* x264_t:
  *      opaque handler for encoder */
@@ -151,7 +151,7 @@ typedef struct
 #define X264_B_ADAPT_FAST            1
 #define X264_B_ADAPT_TRELLIS         2
 #define X264_WEIGHTP_NONE            0
-#define X264_WEIGHTP_BLIND           1
+#define X264_WEIGHTP_SIMPLE          1
 #define X264_WEIGHTP_SMART           2
 #define X264_B_PYRAMID_NONE          0
 #define X264_B_PYRAMID_STRICT        1
--
1.7.3.2.146.gca209