Untitled

From: James Darnley <james.darnley@gmail.com>
Subject: [PATCH 1/3] Add hqdn3d filter

 Makefile               |    2 +-
 configure              |    2 +-
 filters/video/hqdn3d.c |  289 ++++++++++++++++++++++++++++++++++++++++++++++++
 filters/video/video.c  |    1 +
 4 files changed, 292 insertions(+), 2 deletions(-)
 create mode 100644 filters/video/hqdn3d.c

Index: Makefile
===================================================================
--- Makefile_orig
+++ Makefile
@@ -18,7 +18,7 @@ SRCCLI = x264.c input/input.c input/timecode.c input/raw.c input/y4m.c \
          output/flv.c output/flv_bytestream.c filters/filters.c \
          filters/video/video.c filters/video/source.c filters/video/internal.c \
          filters/video/resize.c filters/video/cache.c filters/video/fix_vfr_pts.c \
-         filters/video/select_every.c filters/video/crop.c
+         filters/video/select_every.c filters/video/crop.c filters/video/hqdn3d.c

 SRCSO =

Index: configure
===================================================================
--- configure_orig
+++ configure
@@ -770,7 +770,7 @@ Libs: $pclibs
 Cflags: -I$includedir
 EOF

-filters="crop select_every"
+filters="crop select_every hqdn3d"
 [ $swscale = yes ] && filters="resize $filters"

 cat > conftest.log <<EOF
Index: filters/video/hqdn3d.c
===================================================================
--- /dev/null
+++ filters/video/hqdn3d.c
@@ -0,0 +1,289 @@
+/*****************************************************************************
+ * hqdn3d.c: x264 hqdn3d filter
+ *****************************************************************************
+ * Copyright (C) 2003 Daniel Moreno <comac@comac.darktech.org>
+ * Avisynth port (C) 2005 Loren Merritt <lorenm@u.washington.edu>
+ * x264 port (C) 2010 James Darnley <james.darnley@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <math.h>
+#include "video.h"
+#define NAME "hqdn3d"
+#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, NAME, __VA_ARGS__ )
+
+#define PARAM1_DEFAULT 4.0
+#define PARAM2_DEFAULT 3.0
+#define PARAM3_DEFAULT 6.0
+
+cli_vid_filter_t hqdn3d_filter;
+
+typedef struct /* per-instance state of the hqdn3d filter */
+{
+    hnd_t prev_hnd;               /* handle of the filter this one was installed in front of */
+    cli_vid_filter_t prev_filter; /* that filter's function table; frames are pulled from it */
+    int coefs[4][512*16];         /* lookup tables: luma spatial, luma temporal, chroma spatial, chroma temporal */
+    unsigned int *line;           /* one-row scratch buffer shared by all three planes */
+    unsigned short *frame[3];     /* previous filtered output per plane, stored as pixel<<8 */
+    int w, h, cw, ch, first_frame; /* luma size, chroma size, and "history not yet seeded" flag */
+} hqdn3d_hnd_t;
+
+static void help( int longhelp ) /* prints the usage line, plus the option details when longhelp is non-zero */
+{
+    printf( "      "NAME":ls,cs,lt,ct\n" );
+    if(!longhelp)
+        return;
+    printf( "            Denoises the image using mplayer's hqdn3d filter\n"
+            "            The four arguments are floats and are optional\n"
+            "            If any options are omitted, they will assume a\n"
+            "            value based on previous options that you did specify\n"
+            "            - ls = luma spatial filter strength [%.1lf]\n"
+            "            - cs = chroma spatial filter strength [%.1lf]\n"
+            "            - lt = luma temporal filter strength [%.1lf]\n"
+            "            - ct = chroma temporal filter strength [%.1lf]\n",
+           PARAM1_DEFAULT, PARAM2_DEFAULT, PARAM3_DEFAULT,
+           PARAM3_DEFAULT * PARAM2_DEFAULT / PARAM1_DEFAULT); /* default ct = lt*cs/ls, the same formula init() applies */
+}
+
+#define ABS(A) ( (A) > 0 ? (A) : -(A) )
+
+static void precalc_coefs(int *ct, double dist25)
+{
+    // Fills ct[16..8191]: entry 16*256+i holds the rounded correction, scaled by 65536, for a difference of i/16.
+    double gamma_d, simil, c;
+    // gamma_d is chosen so that the weight pow(simil, gamma_d) is about 0.25 at a difference of dist25
+    gamma_d = log(0.25) / log(1.0 - dist25/255.0 - 0.00001); // the 0.00001 keeps the divisor non-zero when dist25 is 0
+    // i counts the difference in steps of 1/16
+    for (int i = -255*16; i < 256*16; i++)
+    {
+        simil = 1.0 - ABS(i) / (16*255.0);
+        c = pow(simil, gamma_d) * 65536.0 * (double)i / 16.0;
+        ct[16*256+i] = (int)((c<0) ? (c-0.5) : (c+0.5)); // round to nearest, halves away from zero
+    }
+}
+
+/* Parses the optional "ls,cs,lt,ct" strengths, allocates the scratch line and
+ * the per-plane history buffers, precomputes the four coefficient tables and
+ * installs this filter in front of the current *filter / *handle.  Returns 0 on
+ * success; fails through FAIL_IF_ERROR on an unsupported colourspace or on an
+ * allocation failure, releasing whatever had already been allocated. */
+static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info,
+                 x264_param_t *param, char *opt_string )
+{
+    double lum_spac, lum_tmp, chrom_spac, chrom_tmp;
+    double param1, param2, param3, param4;
+
+    FAIL_IF_ERROR( !(info->csp == X264_CSP_I420 || info->csp == X264_CSP_I422
+        || info->csp == X264_CSP_I444 || info->csp == X264_CSP_YV12 ),
+        "Only planar YUV images supported\n" )
+
+    hqdn3d_hnd_t *h = calloc( 1, sizeof(hqdn3d_hnd_t) );
+    FAIL_IF_ERROR( !h, "Memory allocation error (hqdn3d.c:%d)\n", __LINE__ )
+    h->w = info->width*x264_cli_csps[info->csp].width[0];
+    h->h = info->height*x264_cli_csps[info->csp].height[0];
+    h->cw = info->width*x264_cli_csps[info->csp].width[1];
+    h->ch = info->height*x264_cli_csps[info->csp].height[1];
+
+    h->line = calloc( 1, info->width*sizeof(int) );
+    h->frame[0] = malloc( h->w * h->h * sizeof(short) );
+    h->frame[1] = malloc( h->cw * h->ch * sizeof(short) );
+    h->frame[2] = malloc( h->cw * h->ch * sizeof(short) );
+    if( !h->line || !h->frame[0] || !h->frame[1] || !h->frame[2] )
+    {   /* free(NULL) is a no-op, so a partial allocation is released safely */
+        free( h->line );
+        for( int i = 0; i < 3; i++ )
+            free( h->frame[i] );
+        free( h );
+        FAIL_IF_ERROR( 1, "Memory allocation error(hqdn3d.c:%d)\n", __LINE__ )
+    }
+
+    switch( opt_string ? sscanf( opt_string, "%lf,%lf,%lf,%lf",
+                                 &param1, &param2, &param3, &param4 ) : 0 )
+    {
+    case 1:
+        lum_spac = param1;
+        lum_tmp = PARAM3_DEFAULT * param1 / PARAM1_DEFAULT;
+        chrom_spac = PARAM2_DEFAULT * param1 / PARAM1_DEFAULT;
+        chrom_tmp = lum_tmp * chrom_spac / lum_spac;
+        break;
+    case 2:
+        lum_spac = param1;
+        lum_tmp = PARAM3_DEFAULT * param1 / PARAM1_DEFAULT;
+        chrom_spac = param2;
+        chrom_tmp = lum_tmp * chrom_spac / lum_spac;
+        break;
+    case 3:
+        lum_spac = param1;
+        lum_tmp = param3;
+        chrom_spac = param2;
+        chrom_tmp = lum_tmp * chrom_spac / lum_spac;
+        break;
+    case 4:
+        lum_spac = param1;
+        lum_tmp = param3;
+        chrom_spac = param2;
+        chrom_tmp = param4;
+        break;
+    default: /* no option string, or nothing could be parsed: all defaults */
+        lum_spac = PARAM1_DEFAULT;
+        lum_tmp = PARAM3_DEFAULT;
+        chrom_spac = PARAM2_DEFAULT;
+        chrom_tmp = lum_tmp * chrom_spac / lum_spac;
+    }
+
+    precalc_coefs(h->coefs[0], lum_spac);
+    precalc_coefs(h->coefs[1], lum_tmp);
+    precalc_coefs(h->coefs[2], chrom_spac);
+    precalc_coefs(h->coefs[3], chrom_tmp);
+    x264_cli_log( NAME, X264_LOG_INFO,
+        "using strengths %.1lf,%.1lf,%.1lf,%.1lf\n",
+        lum_spac, chrom_spac, lum_tmp, chrom_tmp );
+
+    h->first_frame = 1;
+    h->prev_filter = *filter;
+    h->prev_hnd = *handle;
+    *handle = h;
+    *filter = hqdn3d_filter;
+    return 0;
+}
+
+static inline unsigned int low_pass_mul(unsigned int prev_mul, unsigned int curr_mul, int* coef)
+{
+    // Returns curr_mul plus the table correction for (prev_mul - curr_mul); callers pass values scaled by 65536.
+    int d_mul= prev_mul-curr_mul;
+    int d=((d_mul+0x10007FF)/(65536/16)); // 0x1000000/4096 = 16*256 re-centres the index; 0x7FF is the rounding term
+    return curr_mul + coef[d];
+}
+
+static void denoise(const unsigned char *frame,  // source plane
+                    unsigned char *frame_dest,    // destination plane (get_frame passes the same plane as frame)
+                    unsigned int *line_ant,       // scratch row, indexed 0..w-1
+                    unsigned short *frame_ant,    // previous output, w*h entries, updated in place
+                    int w, int h, int stride_src, int stride_dest,
+                    int *horizontal, int *vertical, int *temporal)
+{
+    // Low-passes each pixel against its left neighbour, the row above and the previous frame, where those exist.
+    int line_offs_src = 0, line_offs_dest = 0;
+    unsigned int pixel_ant;
+    int pixel_dst;
+
+    /* First pixel has neither a left nor a top neighbour. Only the previous frame */
+    line_ant[0] = pixel_ant = frame[0]<<16;
+    pixel_dst = low_pass_mul(frame_ant[0]<<8, pixel_ant, temporal);
+    frame_ant[0] = ((pixel_dst+0x1000007F)/256); // the 0x10000000 bias drops out when the quotient is narrowed
+    frame_dest[0]= ((pixel_dst+0x10007FFF)/65536);
+
+    /* First line has no top neighbour. Only the left one for each pixel and
+     * the previous frame */
+    for (int x = 1; x < w; x++){
+        line_ant[x] = pixel_ant = low_pass_mul(pixel_ant, frame[x]<<16, horizontal);
+        pixel_dst = low_pass_mul(frame_ant[x]<<8, pixel_ant, temporal);
+        frame_ant[x] = ((pixel_dst+0x1000007F)/256);
+        frame_dest[x]= ((pixel_dst+0x10007FFF)/65536);
+    }
+
+    for (int y = 1; y < h; y++){
+        // LinePrev is row y of the previous-frame history (packed, w entries per row)
+        unsigned short* LinePrev=&frame_ant[y*w];
+        line_offs_src += stride_src, line_offs_dest += stride_dest;
+        /* First pixel on each line doesn't have a previous pixel */
+        pixel_ant = frame[line_offs_src]<<16;
+        line_ant[0] = low_pass_mul(line_ant[0], pixel_ant, vertical);
+        pixel_dst = low_pass_mul(LinePrev[0]<<8, line_ant[0], temporal);
+        LinePrev[0] = ((pixel_dst+0x1000007F)/256);
+        frame_dest[line_offs_dest]= ((pixel_dst+0x10007FFF)/65536);
+
+        for (int x = 1; x < w; x++){
+            // horizontal, then vertical, then temporal low-pass
+            /* The rest are normal */
+            pixel_ant = low_pass_mul(pixel_ant, frame[line_offs_src+x]<<16, horizontal);
+            line_ant[x] = low_pass_mul(line_ant[x], pixel_ant, vertical);
+            pixel_dst = low_pass_mul(LinePrev[x]<<8, line_ant[x], temporal);
+            LinePrev[x] = ((pixel_dst+0x1000007F)/256);
+            frame_dest[line_offs_dest+x]= ((pixel_dst+0x10007FFF)/65536);
+        }
+    }
+}
+
+static int get_frame( hnd_t handle, cli_pic_t *output, int frame )
+{
+    hqdn3d_hnd_t *h = handle;
+    /* Pull the frame from the previous filter; it is then denoised in place, one plane at a time. */
+    if( h->prev_filter.get_frame( h->prev_hnd, output, frame ) )
+        return -1;
+
+    if( h->first_frame )
+    {   /* seed the temporal history of each plane with the first frame itself, stored as pixel<<8 */
+        int width = h->w;
+        int height = h->h;
+        int stride = output->img.stride[0];
+        for(int y = 0; y<height; y++)
+            for(int x = 0; x<width; x++)
+                h->frame[0][y*width+x] = output->img.plane[0][y*stride+x] << 8;
+
+        width = h->cw;
+        height = h->ch;
+        stride = output->img.stride[1];
+        for(int y = 0; y<height; y++)
+            for(int x = 0; x<width; x++)
+                h->frame[1][y*width+x] = output->img.plane[1][y*stride+x] << 8;
+
+        stride = output->img.stride[2];
+        for(int y = 0; y<height; y++)
+            for(int x = 0; x<width; x++)
+                h->frame[2][y*width+x] = output->img.plane[2][y*stride+x] << 8;
+
+        h->first_frame = 0;
+    }
+    /* The history in h->frame[] is updated by every call, so the output depends on the order of the calls. */
+    denoise(output->img.plane[0],
+            output->img.plane[0],
+            h->line, h->frame[0],
+            h->w, h->h,
+            output->img.stride[0], output->img.stride[0],
+            h->coefs[0], h->coefs[0], h->coefs[1]); /* luma: spatial table for both directions, then temporal */
+    denoise(output->img.plane[1],
+            output->img.plane[1],
+            h->line, h->frame[1],
+            h->cw, h->ch,
+            output->img.stride[1], output->img.stride[1],
+            h->coefs[2], h->coefs[2], h->coefs[3]); /* chroma tables, shared by planes 1 and 2 */
+    denoise(output->img.plane[2],
+            output->img.plane[2],
+            h->line, h->frame[2],
+            h->cw, h->ch,
+            output->img.stride[2], output->img.stride[2],
+            h->coefs[2], h->coefs[2], h->coefs[3]);
+    return 0;
+}
+
+static int release_frame( hnd_t handle, cli_pic_t *pic, int frame )
+{
+    hqdn3d_hnd_t *h = handle;
+    return h->prev_filter.release_frame( h->prev_hnd, pic, frame ); /* frames are filtered in place, so just forward the release */
+}
+
+static void free_filter( hnd_t handle )
+{
+    hqdn3d_hnd_t *h = handle;
+    h->prev_filter.free( h->prev_hnd ); /* tear down the previous filter first */
+    free( h->line );
+    for(int i = 0; i<3; i++)
+        free( h->frame[i] );
+    free( h ); /* finally the handle itself */
+}
+
+cli_vid_filter_t hqdn3d_filter = { NAME, help, init, get_frame, release_frame, free_filter, NULL };
Index: filters/video/video.c
===================================================================
--- filters/video/video_orig.c
+++ filters/video/video.c
@@ -46,6 +46,7 @@ void x264_register_vid_filters()
     REGISTER_VFILTER( fix_vfr_pts );
     REGISTER_VFILTER( resize );
     REGISTER_VFILTER( select_every );
+    REGISTER_VFILTER( hqdn3d );
 }

 int x264_init_vid_filter( const char *name, hnd_t *handle, cli_vid_filter_t *filter,

From: James Darnley <james.darnley@gmail.com>
Subject: [PATCH 2/3] Add yadif filter

 Makefile                              |    3 +-
 configure                             |    2 +-
 filters/video/avs_vf_yadif_template.h |  245 ++++++++++++++++++++++
 filters/video/video.c                 |    1 +
 filters/video/yadif.c                 |  230 +++++++++++++++++++++
 filters/video/yadif_filter_line.c     |  358 +++++++++++++++++++++++++++++++++
 filters/video/yadif_filter_line.h     |   27 +++
 7 files changed, 864 insertions(+), 2 deletions(-)
 create mode 100644 filters/video/avs_vf_yadif_template.h
 create mode 100644 filters/video/yadif.c
 create mode 100644 filters/video/yadif_filter_line.c
 create mode 100644 filters/video/yadif_filter_line.h

Index: Makefile
===================================================================
--- Makefile_orig
+++ Makefile
@@ -18,7 +18,8 @@ SRCCLI = x264.c input/input.c input/timecode.c input/raw.c input/y4m.c \
          output/flv.c output/flv_bytestream.c filters/filters.c \
          filters/video/video.c filters/video/source.c filters/video/internal.c \
          filters/video/resize.c filters/video/cache.c filters/video/fix_vfr_pts.c \
-         filters/video/select_every.c filters/video/crop.c filters/video/hqdn3d.c
+         filters/video/select_every.c filters/video/crop.c filters/video/hqdn3d.c \
+         filters/video/yadif.c filters/video/yadif_filter_line.c

 SRCSO =

Index: configure
===================================================================
--- configure_orig
+++ configure
@@ -770,7 +770,7 @@ Libs: $pclibs
 Cflags: -I$includedir
 EOF

-filters="crop select_every hqdn3d"
+filters="crop select_every hqdn3d yadif"
 [ $swscale = yes ] && filters="resize $filters"

 cat > conftest.log <<EOF
Index: filters/video/avs_vf_yadif_template.h
===================================================================
--- /dev/null
+++ filters/video/avs_vf_yadif_template.h
@@ -0,0 +1,245 @@
+/*
+ * Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * SSE2/SSSE3 version (custom optimization) by h.yamagata
+ *
+ * Small fix by Alexander Balakhnin (fizick@avisynth.org.ru)
+ *
+ * MPlayer is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * MPlayer is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with MPlayer; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#define LOAD8(mem,dst) /* load 8 bytes from mem and widen them to 8 words; expects xmm7 == 0 */ \
+            "movq      "mem", "#dst" \n\t"\
+            "punpcklbw %%xmm7, "#dst" \n\t"
+
+#define CHECK(pj,mj) /* evaluates one direction: leaves its score in xmm2 and its averaged prediction in xmm5 */ \
+            "movdqu "#pj"(%[cur],%[mrefs]), %%xmm2 \n\t" /* cur[x-refs-1+j] */\
+            "movdqu "#mj"(%[cur],%[prefs]), %%xmm3 \n\t" /* cur[x+refs-1-j] */\
+            "movdqa      %%xmm2, %%xmm4 \n\t"\
+            "movdqa      %%xmm2, %%xmm5 \n\t"\
+            "pxor        %%xmm3, %%xmm4 \n\t"\
+            "pavgb       %%xmm3, %%xmm5 \n\t"\
+            "pand        %[pb1], %%xmm4 \n\t"\
+            "psubusb     %%xmm4, %%xmm5 \n\t"\
+            "psrldq      $1,    %%xmm5 \n\t"\
+            "punpcklbw   %%xmm7, %%xmm5 \n\t" /* (cur[x-refs+j] + cur[x+refs-j])>>1 */\
+            "movdqa      %%xmm2, %%xmm4 \n\t"\
+            "psubusb     %%xmm3, %%xmm2 \n\t"\
+            "psubusb     %%xmm4, %%xmm3 \n\t"\
+            "pmaxub      %%xmm3, %%xmm2 \n\t"\
+            "movdqa      %%xmm2, %%xmm3 \n\t"\
+            "movdqa      %%xmm2, %%xmm4 \n\t" /* ABS(cur[x-refs-1+j] - cur[x+refs-1-j]) */\
+            "psrldq      $1,   %%xmm3 \n\t" /* ABS(cur[x-refs  +j] - cur[x+refs  -j]) */\
+            "psrldq      $2,   %%xmm4 \n\t" /* ABS(cur[x-refs+1+j] - cur[x+refs+1-j]) */\
+            "punpcklbw   %%xmm7, %%xmm2 \n\t"\
+            "punpcklbw   %%xmm7, %%xmm3 \n\t"\
+            "punpcklbw   %%xmm7, %%xmm4 \n\t"\
+            "paddw       %%xmm3, %%xmm2 \n\t"\
+            "paddw       %%xmm4, %%xmm2 \n\t" /* score */
+
+#define CHECK1 /* where xmm2 beats the best score in xmm0, take it and make xmm5 the prediction (xmm1); mask kept in xmm6 */ \
+            "movdqa      %%xmm0, %%xmm3 \n\t"\
+            "pcmpgtw     %%xmm2, %%xmm3 \n\t" /* if(score < spatial_score) */\
+            "pminsw      %%xmm2, %%xmm0 \n\t" /* spatial_score= score; */\
+            "movdqa      %%xmm3, %%xmm6 \n\t"\
+            "pand        %%xmm3, %%xmm5 \n\t"\
+            "pandn       %%xmm1, %%xmm3 \n\t"\
+            "por         %%xmm5, %%xmm3 \n\t"\
+            "movdqa      %%xmm3, %%xmm1 \n\t" /* spatial_pred= (cur[x-refs+j] + cur[x+refs-j])>>1; */
+
+#define CHECK2 /* pretend not to have checked dir=2 if dir=1 was bad.\
+                  hurts both quality and speed, but matches the C version. */\
+            "paddw       %[pw1], %%xmm6 \n\t"\
+            "psllw       $14,   %%xmm6 \n\t"\
+            "paddsw      %%xmm6, %%xmm2 \n\t"\
+            "movdqa      %%xmm0, %%xmm3 \n\t"\
+            "pcmpgtw     %%xmm2, %%xmm3 \n\t"\
+            "pminsw      %%xmm2, %%xmm0 \n\t"\
+            "pand        %%xmm3, %%xmm5 \n\t"\
+            "pandn       %%xmm1, %%xmm3 \n\t"\
+            "por         %%xmm5, %%xmm3 \n\t"\
+            "movdqa      %%xmm3, %%xmm1 \n\t"
+
+/* mode argument mod - Fizick: mode is passed directly instead of a YadifContext. */
+/* Filters one line 8 pixels at a time; refs is the byte distance between lines and parity picks prev2/next2.
+ * NOTE(review): every iteration stores 8 bytes, so a w that is not a multiple of 8 writes past dst+w -
+ * confirm callers pad their rows. */
+static attribute_align_arg void FILTER_LINE_FUNC_NAME(int mode, uint8_t *dst, const uint8_t *prev, const uint8_t *cur, const uint8_t *next, int w, int refs, int parity){
+    DECLARE_ALIGNED(16, uint8_t, tmp0[16]); /* spill slot: c */
+    DECLARE_ALIGNED(16, uint8_t, tmp1[16]); /* spill slot: d */
+    DECLARE_ALIGNED(16, uint8_t, tmp2[16]); /* spill slot: e */
+    DECLARE_ALIGNED(16, uint8_t, tmp3[16]); /* spill slot: diff */
+    int x;
+    static DECLARE_ALIGNED(16, const unsigned short, pw_1[]) = /* eight 16-bit ones */
+    {
+        0x0001,0x0001,0x0001,0x0001,0x0001,0x0001,0x0001,0x0001
+    };
+
+    static DECLARE_ALIGNED(16, const unsigned short, pb_1[]) = /* sixteen bytes of 1 */
+    {
+        0x0101,0x0101,0x0101,0x0101,0x0101,0x0101,0x0101,0x0101
+    };
+
+
+#define FILTER\
+    for(x=0; x<w; x+=8){\
+        __asm__ volatile(\
+            "pxor        %%xmm7, %%xmm7 \n\t"\
+            LOAD8("(%[cur],%[mrefs])", %%xmm0) /* c = cur[x-refs] */\
+            LOAD8("(%[cur],%[prefs])", %%xmm1) /* e = cur[x+refs] */\
+            LOAD8("(%["prev2"])", %%xmm2) /* prev2[x] */\
+            LOAD8("(%["next2"])", %%xmm3) /* next2[x] */\
+            "movdqa      %%xmm3, %%xmm4 \n\t"\
+            "paddw       %%xmm2, %%xmm3 \n\t"\
+            "psraw       $1,    %%xmm3 \n\t" /* d = (prev2[x] + next2[x])>>1 */\
+            "movdqa      %%xmm0, %[tmp0] \n\t" /* c */\
+            "movdqa      %%xmm3, %[tmp1] \n\t" /* d */\
+            "movdqa      %%xmm1, %[tmp2] \n\t" /* e */\
+            "psubw       %%xmm4, %%xmm2 \n\t"\
+            PABS(        %%xmm4, %%xmm2) /* temporal_diff0 */\
+            LOAD8("(%[prev],%[mrefs])", %%xmm3) /* prev[x-refs] */\
+            LOAD8("(%[prev],%[prefs])", %%xmm4) /* prev[x+refs] */\
+            "psubw       %%xmm0, %%xmm3 \n\t"\
+            "psubw       %%xmm1, %%xmm4 \n\t"\
+            PABS(        %%xmm5, %%xmm3)\
+            PABS(        %%xmm5, %%xmm4)\
+            "paddw       %%xmm4, %%xmm3 \n\t" /* temporal_diff1 */\
+            "psrlw       $1,    %%xmm2 \n\t"\
+            "psrlw       $1,    %%xmm3 \n\t"\
+            "pmaxsw      %%xmm3, %%xmm2 \n\t"\
+            LOAD8("(%[next],%[mrefs])", %%xmm3) /* next[x-refs] */\
+            LOAD8("(%[next],%[prefs])", %%xmm4) /* next[x+refs] */\
+            "psubw       %%xmm0, %%xmm3 \n\t"\
+            "psubw       %%xmm1, %%xmm4 \n\t"\
+            PABS(        %%xmm5, %%xmm3)\
+            PABS(        %%xmm5, %%xmm4)\
+            "paddw       %%xmm4, %%xmm3 \n\t" /* temporal_diff2 */\
+            "psrlw       $1,    %%xmm3 \n\t"\
+            "pmaxsw      %%xmm3, %%xmm2 \n\t"\
+            "movdqa      %%xmm2, %[tmp3] \n\t" /* diff */\
+\
+            "paddw       %%xmm0, %%xmm1 \n\t"\
+            "paddw       %%xmm0, %%xmm0 \n\t"\
+            "psubw       %%xmm1, %%xmm0 \n\t"\
+            "psrlw       $1,    %%xmm1 \n\t" /* spatial_pred */\
+            PABS(        %%xmm2, %%xmm0)      /* ABS(c-e) */\
+\
+            "movdqu      -1(%[cur],%[mrefs]), %%xmm2 \n\t" /* cur[x-refs-1] */\
+            "movdqu      -1(%[cur],%[prefs]), %%xmm3 \n\t" /* cur[x+refs-1] */\
+            "movdqa      %%xmm2, %%xmm4 \n\t"\
+            "psubusb     %%xmm3, %%xmm2 \n\t"\
+            "psubusb     %%xmm4, %%xmm3 \n\t"\
+            "pmaxub      %%xmm3, %%xmm2 \n\t"\
+            /*"pshuflw      $9,%%xmm2, %%xmm3 \n\t"*/\
+            /*"pshufhw      $9,%%xmm2, %%xmm3 \n\t"*/\
+            "movdqa %%xmm2, %%xmm3 \n\t" /* correct replacement (here)  */\
+            "psrldq $2, %%xmm3 \n\t"/* for "pshufw $9,%%mm2, %%mm3" - fix by Fizick */\
+            "punpcklbw   %%xmm7, %%xmm2 \n\t" /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\
+            "punpcklbw   %%xmm7, %%xmm3 \n\t" /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\
+            "paddw       %%xmm2, %%xmm0 \n\t"\
+            "paddw       %%xmm3, %%xmm0 \n\t"\
+            "psubw       %[pw1], %%xmm0 \n\t" /* spatial_score */\
+\
+            CHECK(-2,0)\
+            CHECK1\
+            CHECK(-3,1)\
+            CHECK2\
+            CHECK(0,-2)\
+            CHECK1\
+            CHECK(1,-3)\
+            CHECK2\
+\
+            /* if(mode<2) ... */\
+            "movdqa      %[tmp3], %%xmm6 \n\t" /* diff */\
+            "cmp         $2, %[mode] \n\t"\
+            "jge         1f \n\t"\
+            LOAD8("(%["prev2"],%[mrefs],2)", %%xmm2) /* prev2[x-2*refs] */\
+            LOAD8("(%["next2"],%[mrefs],2)", %%xmm4) /* next2[x-2*refs] */\
+            LOAD8("(%["prev2"],%[prefs],2)", %%xmm3) /* prev2[x+2*refs] */\
+            LOAD8("(%["next2"],%[prefs],2)", %%xmm5) /* next2[x+2*refs] */\
+            "paddw       %%xmm4, %%xmm2 \n\t"\
+            "paddw       %%xmm5, %%xmm3 \n\t"\
+            "psrlw       $1,    %%xmm2 \n\t" /* b */\
+            "psrlw       $1,    %%xmm3 \n\t" /* f */\
+            "movdqa      %[tmp0], %%xmm4 \n\t" /* c */\
+            "movdqa      %[tmp1], %%xmm5 \n\t" /* d */\
+            "movdqa      %[tmp2], %%xmm7 \n\t" /* e */\
+            "psubw       %%xmm4, %%xmm2 \n\t" /* b-c */\
+            "psubw       %%xmm7, %%xmm3 \n\t" /* f-e */\
+            "movdqa      %%xmm5, %%xmm0 \n\t"\
+            "psubw       %%xmm4, %%xmm5 \n\t" /* d-c */\
+            "psubw       %%xmm7, %%xmm0 \n\t" /* d-e */\
+            "movdqa      %%xmm2, %%xmm4 \n\t"\
+            "pminsw      %%xmm3, %%xmm2 \n\t"\
+            "pmaxsw      %%xmm4, %%xmm3 \n\t"\
+            "pmaxsw      %%xmm5, %%xmm2 \n\t"\
+            "pminsw      %%xmm5, %%xmm3 \n\t"\
+            "pmaxsw      %%xmm0, %%xmm2 \n\t" /* max */\
+            "pminsw      %%xmm0, %%xmm3 \n\t" /* min */\
+            "pxor        %%xmm4, %%xmm4 \n\t"\
+            "pmaxsw      %%xmm3, %%xmm6 \n\t"\
+            "psubw       %%xmm2, %%xmm4 \n\t" /* -max */\
+            "pmaxsw      %%xmm4, %%xmm6 \n\t" /* diff= MAX3(diff, min, -max); */\
+            "1: \n\t"\
+\
+            "movdqa      %[tmp1], %%xmm2 \n\t" /* d */\
+            "movdqa      %%xmm2, %%xmm3 \n\t"\
+            "psubw       %%xmm6, %%xmm2 \n\t" /* d-diff */\
+            "paddw       %%xmm6, %%xmm3 \n\t" /* d+diff */\
+            "pmaxsw      %%xmm2, %%xmm1 \n\t"\
+            "pminsw      %%xmm3, %%xmm1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\
+            "packuswb    %%xmm1, %%xmm1 \n\t"\
+\
+            :[tmp0]"=m"(tmp0),\
+             [tmp1]"=m"(tmp1),\
+             [tmp2]"=m"(tmp2),\
+             [tmp3]"=m"(tmp3)\
+            :[prev] "r"(prev),\
+             [cur]  "r"(cur),\
+             [next] "r"(next),\
+             [prefs]"r"((long)refs),\
+             [mrefs]"r"((long)-refs),\
+             [pw1]  "m"(*pw_1),\
+             [pb1]  "m"(*pb_1),\
+             [mode] "g"(mode)\
+        );\
+        __asm__ volatile("movq %%xmm1, %0" :"=m"(*dst)); /* NOTE(review): relies on xmm1 surviving from the previous asm statement; no clobbers are declared */\
+        dst += 8;\
+        prev+= 8;\
+        cur += 8;\
+        next+= 8;\
+    }
+
+    if(parity){ /* parity set: prev2/next2 are prev/cur; otherwise they are cur/next */
+#define prev2 "prev"
+#define next2 "cur"
+        FILTER
+#undef prev2
+#undef next2
+    }else{
+#define prev2 "cur"
+#define next2 "next"
+        FILTER
+#undef prev2
+#undef next2
+    }
+}
+#undef LOAD8
+#undef PABS
+#undef CHECK
+#undef CHECK1
+#undef CHECK2
+#undef FILTER
+#undef FILTER_LINE_FUNC_NAME
Index: filters/video/video.c
===================================================================
--- filters/video/video_orig.c
+++ filters/video/video.c
@@ -47,6 +47,7 @@ void x264_register_vid_filters()
     REGISTER_VFILTER( resize );
     REGISTER_VFILTER( select_every );
     REGISTER_VFILTER( hqdn3d );
+    REGISTER_VFILTER( yadif );
 }

 int x264_init_vid_filter( const char *name, hnd_t *handle, cli_vid_filter_t *filter,
Index: filters/video/yadif.c
===================================================================
--- /dev/null
+++ filters/video/yadif.c
@@ -0,0 +1,230 @@
+/*****************************************************************************
+ * yadif.c: x264 yadif filter
+ *****************************************************************************
+ * Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
+ * Avisynth port (C) 2007 Alexander G. Balakhnin aka Fizick  http://avisynth.org.ru
+ * x264 port (C) 2010 James Darnley <james.darnley@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <string.h>
+#include "video.h"
+#include "yadif_filter_line.h"
+#define NAME "yadif"
+#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, NAME, __VA_ARGS__ )
+
+cli_vid_filter_t yadif_filter;
+filter_line_func filter_line;
+
+typedef struct { /* per-instance state of the yadif filter */
+    hnd_t prev_handle;            /* handle of the filter frames are pulled from */
+    cli_vid_filter_t prev_filter; /* that filter's function table */
+    int mode;                     /* 0-3: bit 0 = double rate, bit 1 = skip the spatial interlacing check */
+    int tff;                      /* non-zero when the top field comes first */
+    cli_pic_t buffer;             /* output picture handed out by get_frame */
+} yadif_handle_t;
+
+/***********************
+*         Help         *
+***********************/
+
+static void help( int longhelp ) /* prints the usage line, plus the option details when longhelp is non-zero */
+{
+    printf( "      "NAME":[mode][,order]\n" );
+    if(!longhelp)
+        return;
+    printf(
+"            Deinterlaces the picture using mplayer's YADIF\n"
+"            mode: sets the deinterlacing mode\n"
+"            0 - single-rate deinterlacing (default)\n"
+"            1 - double-rate deinterlacing (bob)\n"
+"            2 - single-rate deinterlacing without spatial interlacing check\n"
+"            3 - double-rate deinterlacing without spatial interlacing check\n"
+"            order: forces the field order\n"
+"            tff - top-field first\n"
+"            bff - bottom-field first\n" );
+}
+
+/***********************
+*         Init         *
+***********************/
+
+/* Parses "mode"/"order", allocates the output picture, inserts a cache filter and installs
+ * yadif on top of it.  Returns 0 on success; every failure path frees what was allocated here. */
+static int yadif_init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string )
+{
+    FAIL_IF_ERROR( !(info->csp == X264_CSP_I420 || info->csp == X264_CSP_I422
+        || info->csp == X264_CSP_I444 || info->csp == X264_CSP_YV12 ),
+        "Only planar YUV images supported\n" )
+    yadif_handle_t *h = calloc( 1, sizeof(yadif_handle_t) );
+    if(!h)
+        return -1;
+    if(x264_cli_pic_alloc( &h->buffer, info->csp, info->width, info->height ))
+    {
+        free( h );
+        return -1;
+    }
+
+    char *mode, *order, *opt;
+    static const char *optlist[] = { "mode", "order", NULL };
+    char **opts = x264_split_options( opt_string, optlist );
+    opt = x264_get_option( "mode", opts );
+    mode = (opt) ? opt : "";
+    h->mode = x264_otoi(mode,0);
+    if(h->mode < 0 || h->mode > 3) {
+        x264_cli_log( NAME, X264_LOG_WARNING, "Invalid mode (%s), ignoring\n", mode);
+        h->mode = 0; /* really fall back to the default mode; the old code only nulled the local string */
+    }
+
+    opt = x264_get_option( "order", opts );
+    order = (opt) ? opt : "";
+    if (!strcmp(order, "top") || !strcmp(order, "tff"))
+        h->tff = 1;
+    else if (!strcmp(order, "bottom") || !strcmp(order, "bff"))
+        h->tff = 0;
+    else {
+        if(opt)
+            x264_cli_log( NAME, X264_LOG_WARNING, "Unknown order (%s), ignoring\n", order);
+        h->tff = info->tff; /* no usable option: keep the field order reported by the input */
+    }
+    x264_free_string_array(opts);
+
+    if (x264_init_vid_filter( "cache", handle, filter, info, param, (void*)3 ))
+    {
+        x264_cli_pic_clean( &h->buffer );
+        free( h );
+        return -1;
+    }
+    if(h->mode&1) { /* double-rate modes emit two frames per input frame */
+        info->num_frames *=2;
+        info->fps_num *=2;
+        info->timebase_den *=2;
+    }
+    info->interlaced = 0;
+    h->prev_filter = *filter;
+    h->prev_handle = *handle;
+    *handle = h;
+    *filter = yadif_filter;
+    filter_line = get_filter_func(param->cpu);
+    x264_cli_log( NAME, X264_LOG_INFO, "%s-rate deinterlacing "
+        "%s spatial interlacing check, %s-field first\n",
+        (h->mode&1) ? "double"  : "single",
+        (h->mode&2) ? "without" : "with",
+        (h->tff)    ? "top"     : "bottom" );
+    return 0;
+}
+
+/***********************
+*    Process Frames    *
+***********************/
+static void interpolate(uint8_t *dst, const uint8_t *cur0,  const uint8_t *cur2, int w)
+{
+    int x;
+    for (x=0; x<w; x++)
+        dst[x] = (cur0[x] + cur2[x] + 1)>>1; // per-pixel average of the two given lines, halves rounded up
+}
+
+/* Produces output frame frame_out in h->buffer.  Lines of the field selected by
+ * parity are copied from the current input frame; the other lines are rebuilt by
+ * filter_line, or by plain interpolation/duplication next to the top and bottom edge. */
+static int get_frame( hnd_t handle, cli_pic_t *output, int frame_out )
+{
+    yadif_handle_t *h = handle;
+    cli_pic_t prev, cur, next;
+    int tff = h->tff, ret = 0;
+    int parity = (h->mode & 1) ? (frame_out & 1) ^ (1^tff) : (tff ^ 1);
+    int frame_in = (h->mode&1) ? frame_out/2 : frame_out;
+
+    *output = h->buffer;
+    if (frame_in==0)
+    {   /* no earlier frame exists: the following frame stands in for "prev" as well */
+        ret |= h->prev_filter.get_frame( h->prev_handle, &prev, frame_in+1 );
+        ret |= h->prev_filter.get_frame( h->prev_handle, &cur, frame_in );
+        ret |= h->prev_filter.get_frame( h->prev_handle, &next, frame_in+1 );
+    }
+    else
+    {   /* if the following frame cannot be fetched, the current one stands in for "next" */
+        ret |= h->prev_filter.get_frame( h->prev_handle, &prev, frame_in-1 );
+        ret |= h->prev_filter.get_frame( h->prev_handle, &cur, frame_in );
+        if (h->prev_filter.get_frame( h->prev_handle, &next, frame_in+1 ))
+            ret |= h->prev_filter.get_frame( h->prev_handle, &next, frame_in );
+    }
+    if(ret)
+        return ret;
+
+    for (int i=0; i<3; i++)
+    {
+        int width = cur.img.width * x264_cli_csps[cur.img.csp].width[i];
+        int height = cur.img.height * x264_cli_csps[cur.img.csp].height[i];
+        int stride = cur.img.stride[i]; // also applied to the output plane - NOTE(review): assumes h->buffer has the same stride
+        int y=0;
+        if((y^parity)&1)
+            memcpy(output->img.plane[i], cur.img.plane[i]+stride, width);// duplicate 1
+        else
+            memcpy(output->img.plane[i], cur.img.plane[i], width);
+        y=1;
+        if((y^parity)&1)
+            interpolate(output->img.plane[i]+stride, cur.img.plane[i], cur.img.plane[i]+2*stride, width);   // interpolate 0 and 2
+        else
+            memcpy(output->img.plane[i]+stride, cur.img.plane[i]+stride, width); // copy original
+        for (y=2; y<height-2; y++)
+        {
+            if ((y ^ parity) & 1)
+                filter_line( h->mode,
+                             output->img.plane[i]+y*stride,
+                             prev.img.plane[i]+y*stride,
+                             cur.img.plane[i]+y*stride,
+                             next.img.plane[i]+y*stride,
+                             width, stride, parity^tff );
+            else
+                memcpy( output->img.plane[i]+y*stride,
+                        cur.img.plane[i]+y*stride,
+                        width );
+        }
+        y=height-2;
+        if((y^parity)&1)
+            interpolate(output->img.plane[i]+y*stride, cur.img.plane[i]+(y-1)*stride, cur.img.plane[i]+(y+1)*stride, width);   // interpolate h-3 and h-1
+        else
+            memcpy(output->img.plane[i]+y*stride, cur.img.plane[i]+y*stride, width); // copy original
+        y=height-1;
+        if((y^parity)&1)
+            memcpy(output->img.plane[i]+y*stride, cur.img.plane[i]+(y-1)*stride, width); // duplicate h-2
+        else
+            memcpy(output->img.plane[i]+y*stride, cur.img.plane[i]+y*stride, width); // copy original (row y itself, not y-1)
+    }
+    if(frame_out < 3 || !(frame_out&1))
+        return 0;
+    return h->prev_filter.release_frame( h->prev_handle, &prev, frame_in-1 );
+}
+
+/***********************
+*         Free         *
+***********************/
+
+static int release_frame( hnd_t handle, cli_pic_t *pic, int frame )
+{
+    return 0; /* nothing to release: output lives in h->buffer and get_frame releases input frames itself */
+}
+
+static void free_filter( hnd_t handle )
+{
+    yadif_handle_t *h = handle;
+    h->prev_filter.free( h->prev_handle ); /* tear down the previous filter first */
+    x264_cli_pic_clean( &h->buffer ); /* release the output picture allocated in yadif_init */
+    free( h );
+}
+
+cli_vid_filter_t yadif_filter = { NAME, help, yadif_init, get_frame, release_frame, free_filter, NULL };
Index: filters/video/yadif_filter_line.c
===================================================================
--- /dev/null
+++ filters/video/yadif_filter_line.c
@@ -0,0 +1,358 @@
+/*****************************************************************************
+ * yadif_filter_line.c: x264 yadif filter
+ *****************************************************************************
+ * Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
+ * Avisynth port (C) 2007 Alexander G. Balakhnin aka Fizick  http://avisynth.org.ru
+ * x264 port (C) 2010 James Darnley <james.darnley@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************
+ * Copied nearly verbatim from the Avisynth filter's yadif.c so that x264's
+ * yadif.c is cleaner
+ *****************************************************************************/
+
+#include "config.h"
+#include "yadif_filter_line.h"
+#include "x264.h"
+
+#if defined __GNUC__ && defined HAVE_MMX
+/* uint64_t comes from <inttypes.h> (included by yadif_filter_line.h); the former __int64 macro did not compile with non-Windows GCC. */
+#define LOAD4(mem,dst) /* load 4 bytes from mem and widen them to words in dst; mm7 must hold zero */ \
+            "movd      "mem", "#dst" \n\t"\
+            "punpcklbw %%mm7, "#dst" \n\t"
+/* PABS: dst = max(dst, 0 - dst) for four signed words, i.e. the absolute value; tmp is overwritten. */
+#define PABS(tmp,dst) \
+            "pxor     "#tmp", "#tmp" \n\t"\
+            "psubw    "#dst", "#tmp" \n\t"\
+            "pmaxsw   "#tmp", "#dst" \n\t"
+/* CHECK: pj is the byte offset applied on the cur-refs line, mj on the cur+refs line; leaves the averaged pixels in mm5 and the 3-tap score in mm2. */
+#define CHECK(pj,mj) \
+            "movq "#pj"(%[cur],%[mrefs]), %%mm2 \n\t" /* cur[x-refs-1+j] */\
+            "movq "#mj"(%[cur],%[prefs]), %%mm3 \n\t" /* cur[x+refs-1-j] */\
+            "movq      %%mm2, %%mm4 \n\t"\
+            "movq      %%mm2, %%mm5 \n\t"\
+            "pxor      %%mm3, %%mm4 \n\t"\
+            "pavgb     %%mm3, %%mm5 \n\t"\
+            "pand     %[pb1], %%mm4 \n\t"\
+            "psubusb   %%mm4, %%mm5 \n\t"\
+            "psrlq     $8,    %%mm5 \n\t"\
+            "punpcklbw %%mm7, %%mm5 \n\t" /* (cur[x-refs+j] + cur[x+refs-j])>>1 */\
+            "movq      %%mm2, %%mm4 \n\t"\
+            "psubusb   %%mm3, %%mm2 \n\t"\
+            "psubusb   %%mm4, %%mm3 \n\t"\
+            "pmaxub    %%mm3, %%mm2 \n\t"\
+            "movq      %%mm2, %%mm3 \n\t"\
+            "movq      %%mm2, %%mm4 \n\t" /* ABS(cur[x-refs-1+j] - cur[x+refs-1-j]) */\
+            "psrlq      $8,   %%mm3 \n\t" /* ABS(cur[x-refs  +j] - cur[x+refs  -j]) */\
+            "psrlq     $16,   %%mm4 \n\t" /* ABS(cur[x-refs+1+j] - cur[x+refs+1-j]) */\
+            "punpcklbw %%mm7, %%mm2 \n\t"\
+            "punpcklbw %%mm7, %%mm3 \n\t"\
+            "punpcklbw %%mm7, %%mm4 \n\t"\
+            "paddw     %%mm3, %%mm2 \n\t"\
+            "paddw     %%mm4, %%mm2 \n\t" /* score */
+/* CHECK1: in lanes where score (mm2) < spatial_score (mm0), takes the score and moves the mm5 prediction into mm1; the compare mask stays in mm6 for CHECK2. */
+#define CHECK1 \
+            "movq      %%mm0, %%mm3 \n\t"\
+            "pcmpgtw   %%mm2, %%mm3 \n\t" /* if(score < spatial_score) */\
+            "pminsw    %%mm2, %%mm0 \n\t" /* spatial_score= score; */\
+            "movq      %%mm3, %%mm6 \n\t"\
+            "pand      %%mm3, %%mm5 \n\t"\
+            "pandn     %%mm1, %%mm3 \n\t"\
+            "por       %%mm5, %%mm3 \n\t"\
+            "movq      %%mm3, %%mm1 \n\t" /* spatial_pred= (cur[x-refs+j] + cur[x+refs-j])>>1; */
+/* CHECK2: same update as CHECK1, after adding 0x4000 (saturating) to the score in lanes whose CHECK1 mask in mm6 was clear. */
+#define CHECK2 /* pretend not to have checked dir=2 if dir=1 was bad.\
+                  hurts both quality and speed, but matches the C version. */\
+            "paddw    %[pw1], %%mm6 \n\t"\
+            "psllw     $14,   %%mm6 \n\t"\
+            "paddsw    %%mm6, %%mm2 \n\t"\
+            "movq      %%mm0, %%mm3 \n\t"\
+            "pcmpgtw   %%mm2, %%mm3 \n\t"\
+            "pminsw    %%mm2, %%mm0 \n\t"\
+            "pand      %%mm3, %%mm5 \n\t"\
+            "pandn     %%mm1, %%mm3 \n\t"\
+            "por       %%mm5, %%mm3 \n\t"\
+            "movq      %%mm3, %%mm1 \n\t"
+
+static void filter_line_mmx2(int mode, uint8_t *dst, const uint8_t *prev, const uint8_t *cur, const uint8_t *next, int w, int refs, int parity){
+    static const uint64_t pw_1 = 0x0001000100010001ULL;
+    static const uint64_t pb_1 = 0x0101010101010101ULL;
+    /* MMX2 inline-asm line filter with the same signature as filter_line_c; mode is tested inside the asm (mode >= 2 jumps over the b/f check). */
+    uint64_t tmp0, tmp1, tmp2, tmp3;
+    int x;
+    /* FILTER handles 4 pixels per pass, so the last pass stores 4 bytes even if w is not a multiple of 4. NOTE(review): the asm declares no clobbers (mm0-mm7, flags set by cmp) and no emms follows -- confirm callers cope. */
+#define FILTER\
+    for(x=0; x<w; x+=4){\
+        asm volatile(\
+            "pxor      %%mm7, %%mm7 \n\t"\
+            LOAD4("(%[cur],%[mrefs])", %%mm0) /* c = cur[x-refs] */\
+            LOAD4("(%[cur],%[prefs])", %%mm1) /* e = cur[x+refs] */\
+            LOAD4("(%["prev2"])", %%mm2) /* prev2[x] */\
+            LOAD4("(%["next2"])", %%mm3) /* next2[x] */\
+            "movq      %%mm3, %%mm4 \n\t"\
+            "paddw     %%mm2, %%mm3 \n\t"\
+            "psraw     $1,    %%mm3 \n\t" /* d = (prev2[x] + next2[x])>>1 */\
+            "movq      %%mm0, %[tmp0] \n\t" /* c */\
+            "movq      %%mm3, %[tmp1] \n\t" /* d */\
+            "movq      %%mm1, %[tmp2] \n\t" /* e */\
+            "psubw     %%mm4, %%mm2 \n\t"\
+            PABS(      %%mm4, %%mm2) /* temporal_diff0 */\
+            LOAD4("(%[prev],%[mrefs])", %%mm3) /* prev[x-refs] */\
+            LOAD4("(%[prev],%[prefs])", %%mm4) /* prev[x+refs] */\
+            "psubw     %%mm0, %%mm3 \n\t"\
+            "psubw     %%mm1, %%mm4 \n\t"\
+            PABS(      %%mm5, %%mm3)\
+            PABS(      %%mm5, %%mm4)\
+            "paddw     %%mm4, %%mm3 \n\t" /* temporal_diff1 */\
+            "psrlw     $1,    %%mm2 \n\t"\
+            "psrlw     $1,    %%mm3 \n\t"\
+            "pmaxsw    %%mm3, %%mm2 \n\t"\
+            LOAD4("(%[next],%[mrefs])", %%mm3) /* next[x-refs] */\
+            LOAD4("(%[next],%[prefs])", %%mm4) /* next[x+refs] */\
+            "psubw     %%mm0, %%mm3 \n\t"\
+            "psubw     %%mm1, %%mm4 \n\t"\
+            PABS(      %%mm5, %%mm3)\
+            PABS(      %%mm5, %%mm4)\
+            "paddw     %%mm4, %%mm3 \n\t" /* temporal_diff2 */\
+            "psrlw     $1,    %%mm3 \n\t"\
+            "pmaxsw    %%mm3, %%mm2 \n\t"\
+            "movq      %%mm2, %[tmp3] \n\t" /* diff */\
+\
+            "paddw     %%mm0, %%mm1 \n\t"\
+            "paddw     %%mm0, %%mm0 \n\t"\
+            "psubw     %%mm1, %%mm0 \n\t"\
+            "psrlw     $1,    %%mm1 \n\t" /* spatial_pred */\
+            PABS(      %%mm2, %%mm0)      /* ABS(c-e) */\
+\
+            "movq -1(%[cur],%[mrefs]), %%mm2 \n\t" /* cur[x-refs-1] */\
+            "movq -1(%[cur],%[prefs]), %%mm3 \n\t" /* cur[x+refs-1] */\
+            "movq      %%mm2, %%mm4 \n\t"\
+            "psubusb   %%mm3, %%mm2 \n\t"\
+            "psubusb   %%mm4, %%mm3 \n\t"\
+            "pmaxub    %%mm3, %%mm2 \n\t"\
+            /*"pshufw $9,%%mm2, %%mm3 \n\t"*/\
+            "movq %%mm2, %%mm3 \n\t" /* replace for "pshufw $9,%%mm2, %%mm3" - Fizick */\
+            "psrlq $16, %%mm3 \n\t"/* replace for "pshufw $9,%%mm2, %%mm3" - Fizick*/\
+            "punpcklbw %%mm7, %%mm2 \n\t" /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\
+            "punpcklbw %%mm7, %%mm3 \n\t" /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\
+            "paddw     %%mm2, %%mm0 \n\t"\
+            "paddw     %%mm3, %%mm0 \n\t"\
+            "psubw    %[pw1], %%mm0 \n\t" /* spatial_score */\
+\
+            CHECK(-2,0)\
+            CHECK1\
+            CHECK(-3,1)\
+            CHECK2\
+            CHECK(0,-2)\
+            CHECK1\
+            CHECK(1,-3)\
+            CHECK2\
+\
+            /* if(p->mode<2) ... */\
+            "movq    %[tmp3], %%mm6 \n\t" /* diff */\
+            "cmp       $2, %[mode] \n\t"\
+            "jge       1f \n\t"\
+            LOAD4("(%["prev2"],%[mrefs],2)", %%mm2) /* prev2[x-2*refs] */\
+            LOAD4("(%["next2"],%[mrefs],2)", %%mm4) /* next2[x-2*refs] */\
+            LOAD4("(%["prev2"],%[prefs],2)", %%mm3) /* prev2[x+2*refs] */\
+            LOAD4("(%["next2"],%[prefs],2)", %%mm5) /* next2[x+2*refs] */\
+            "paddw     %%mm4, %%mm2 \n\t"\
+            "paddw     %%mm5, %%mm3 \n\t"\
+            "psrlw     $1,    %%mm2 \n\t" /* b */\
+            "psrlw     $1,    %%mm3 \n\t" /* f */\
+            "movq    %[tmp0], %%mm4 \n\t" /* c */\
+            "movq    %[tmp1], %%mm5 \n\t" /* d */\
+            "movq    %[tmp2], %%mm7 \n\t" /* e */\
+            "psubw     %%mm4, %%mm2 \n\t" /* b-c */\
+            "psubw     %%mm7, %%mm3 \n\t" /* f-e */\
+            "movq      %%mm5, %%mm0 \n\t"\
+            "psubw     %%mm4, %%mm5 \n\t" /* d-c */\
+            "psubw     %%mm7, %%mm0 \n\t" /* d-e */\
+            "movq      %%mm2, %%mm4 \n\t"\
+            "pminsw    %%mm3, %%mm2 \n\t"\
+            "pmaxsw    %%mm4, %%mm3 \n\t"\
+            "pmaxsw    %%mm5, %%mm2 \n\t"\
+            "pminsw    %%mm5, %%mm3 \n\t"\
+            "pmaxsw    %%mm0, %%mm2 \n\t" /* max */\
+            "pminsw    %%mm0, %%mm3 \n\t" /* min */\
+            "pxor      %%mm4, %%mm4 \n\t"\
+            "pmaxsw    %%mm3, %%mm6 \n\t"\
+            "psubw     %%mm2, %%mm4 \n\t" /* -max */\
+            "pmaxsw    %%mm4, %%mm6 \n\t" /* diff= MAX3(diff, min, -max); */\
+            "1: \n\t"\
+\
+            "movq    %[tmp1], %%mm2 \n\t" /* d */\
+            "movq      %%mm2, %%mm3 \n\t"\
+            "psubw     %%mm6, %%mm2 \n\t" /* d-diff */\
+            "paddw     %%mm6, %%mm3 \n\t" /* d+diff */\
+            "pmaxsw    %%mm2, %%mm1 \n\t"\
+            "pminsw    %%mm3, %%mm1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\
+            "packuswb  %%mm1, %%mm1 \n\t"\
+\
+            :[tmp0]"=m"(tmp0),\
+             [tmp1]"=m"(tmp1),\
+             [tmp2]"=m"(tmp2),\
+             [tmp3]"=m"(tmp3)\
+            :[prev] "r"(prev),\
+             [cur]  "r"(cur),\
+             [next] "r"(next),\
+             [prefs]"r"((long)refs),\
+             [mrefs]"r"((long)-refs),\
+             [pw1]  "m"(pw_1),\
+             [pb1]  "m"(pb_1),\
+             [mode] "g"(mode)\
+        );\
+        asm volatile("movd %%mm1, %0" :"=m"(*dst));\
+        dst += 4;\
+        prev+= 4;\
+        cur += 4;\
+        next+= 4;\
+    }
+    /* parity picks the two pictures used as prev2/next2: prev and cur when set, cur and next otherwise */
+    if(parity){
+#define prev2 "prev"
+#define next2 "cur"
+        FILTER
+#undef prev2
+#undef next2
+    }else{
+#define prev2 "cur"
+#define next2 "next"
+        FILTER
+#undef prev2
+#undef next2
+    }
+}
+#undef LOAD4
+#undef PABS
+#undef CHECK
+#undef CHECK1
+#undef CHECK2
+#undef FILTER
+
+#ifndef attribute_align_arg
+#if defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__>1)
+#    define attribute_align_arg __attribute__((force_align_arg_pointer))
+#else
+#    define attribute_align_arg
+#endif
+#endif
+
+// the SSE2/SSSE3 versions need GCC 4.2 or newer for proper alignment
+#if (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__>1)
+
+#ifndef DECLARE_ALIGNED
+#define DECLARE_ALIGNED(n,t,v)       t v __attribute__ ((aligned (n)))
+#endif
+
+// ================= SSE2 =================
+#define PABS(tmp,dst) \
+            "pxor     "#tmp", "#tmp" \n\t"\
+            "psubw    "#dst", "#tmp" \n\t"\
+            "pmaxsw   "#tmp", "#dst" \n\t"
+
+#define FILTER_LINE_FUNC_NAME filter_line_sse2
+#include "avs_vf_yadif_template.h"
+
+// ================ SSSE3 =================
+#define PABS(tmp,dst) \
+            "pabsw     "#dst", "#dst" \n\t"
+
+#define FILTER_LINE_FUNC_NAME filter_line_ssse3
+#include "avs_vf_yadif_template.h"
+
+#endif
+
+#endif
+
+#define MIN(a,b) ( (a)<(b) ? (a) : (b) )
+#define MAX(a,b) ( (a)>(b) ? (a) : (b) )
+#define MIN3(a,b,c) MIN((a),MIN((b),(c)))
+#define MAX3(a,b,c) MAX((a),MAX((b),(c)))
+#define ABS(a) ( (a) > 0 ? (a) : -(a) )
+
+/* Plain C line filter: writes w interpolated pixels to dst.
+ * prev, cur and next address the same line in three pictures and refs is the
+ * byte distance to the neighbouring line (cur[-refs] / cur[+refs]).
+ * parity chooses the temporal pair (prev+cur when set, cur+next otherwise);
+ * mode < 2 additionally consults the lines at -2*refs and +2*refs.
+ * Each pixel reads up to 3 bytes to either side of x on the adjacent lines. */
+static void filter_line_c(int mode, uint8_t *dst, const uint8_t *prev, const uint8_t *cur, const uint8_t *next, int w, int refs, int parity){
+    const uint8_t *before = parity ? prev : cur;
+    const uint8_t *after  = parity ? cur  : next;
+
+    for(int x=0; x<w; x++){
+        const uint8_t *up = cur + x - refs;
+        const uint8_t *dn = cur + x + refs;
+        int c = up[0];
+        int e = dn[0];
+        int d = (before[x] + after[x])>>1;               /* temporal prediction */
+
+        /* the largest temporal difference is the initial clamp radius */
+        int td_pair = ABS(before[x] - after[x])>>1;
+        int td_prev = ( ABS(prev[x-refs] - c) + ABS(prev[x+refs] - e) )>>1;
+        int td_next = ( ABS(next[x-refs] - c) + ABS(next[x+refs] - e) )>>1;
+        int diff = MAX3(td_pair, td_prev, td_next);
+
+        /* the vertical average and its score are the starting candidate */
+        int pred  = (c+e)>>1;
+        int score = ABS(up[-1] - dn[-1]) + ABS(c-e) + ABS(up[1] - dn[1]) - 1;
+
+        /* Diagonal candidates: offsets -1,-2 and then +1,+2.  The second offset
+         * of a side is only tried when the first one lowered the score. */
+        for(int side=-1; side<=1; side+=2){
+            for(int j=side; j!=3*side; j+=side){
+                int s = ABS(up[j-1] - dn[-j-1])
+                      + ABS(up[j  ] - dn[-j  ])
+                      + ABS(up[j+1] - dn[-j+1]);
+                if(s >= score)
+                    break;
+                score = s;
+                pred  = (up[j] + dn[-j])>>1;
+            }
+        }
+
+        if(mode<2){
+            int b = (before[x-2*refs] + after[x-2*refs])>>1;
+            int f = (before[x+2*refs] + after[x+2*refs])>>1;
+            int hi = MAX3(d-e, d-c, MIN(b-c, f-e));
+            int lo = MIN3(d-e, d-c, MAX(b-c, f-e));
+
+            diff = MAX3(diff, lo, -hi);
+        }
+
+        /* clamp the spatial prediction to within diff of d */
+        if(pred > d + diff)
+            pred = d + diff;
+        else if(pred < d - diff)
+            pred = d - diff;
+        dst[x] = pred;
+    }
+}
+
+filter_line_func get_filter_func(unsigned int cpu) {
+    /* Returns the most specialised line filter that both the build and the cpu flags allow: SSSE3, SSE2, MMX2, then C. */
+#if defined __GNUC__  && defined HAVE_MMX
+#if (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__>1)
+    if (cpu & X264_CPU_SSSE3)
+        return filter_line_ssse3;
+    if (cpu & (X264_CPU_SSE2|X264_CPU_SSE2_IS_SLOW|X264_CPU_SSE2_IS_FAST))
+        return filter_line_sse2;
+#endif
+    if (cpu & X264_CPU_MMXEXT)
+        return filter_line_mmx2;
+#endif
+    return filter_line_c;
+}
Index: filters/video/yadif_filter_line.h
===================================================================
--- /dev/null
+++ filters/video/yadif_filter_line.h
@@ -0,0 +1,27 @@
+/*****************************************************************************
+ * yadif_filter_line.h: x264 yadif filter
+ *****************************************************************************
+ * Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
+ * Avisynth port (C) 2007 Alexander G. Balakhnin aka Fizick  http://avisynth.org.ru
+ * x264 port (C) 2010 James Darnley <james.darnley@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+#ifndef X264_YADIF_FILTER_LINE_H
+#define X264_YADIF_FILTER_LINE_H /* include guard: keeps the typedef below from being repeated in one translation unit */
+#include <inttypes.h>
+typedef void (*filter_line_func)(int mode, uint8_t *dst, const uint8_t *prev, const uint8_t *cur, const uint8_t *next, int w, int refs, int parity); /* filters one line of w pixels from prev/cur/next into dst */
+filter_line_func get_filter_func(unsigned int cpu); /* picks an implementation from the X264_CPU_* bits in cpu */
+#endif /* X264_YADIF_FILTER_LINE_H */

From: James Darnley <james.darnley@gmail.com>
Subject: [PATCH 3/3] Add pad filter

 Makefile              |    2 +-
 configure             |    2 +-
 filters/video/pad.c   |  190 +++++++++++++++++++++++++++++++++++++++++++++++++
 filters/video/video.c |    1 +
 4 files changed, 193 insertions(+), 2 deletions(-)
 create mode 100644 filters/video/pad.c

Index: Makefile
===================================================================
--- Makefile_orig
+++ Makefile
@@ -19,7 +19,7 @@ SRCCLI = x264.c input/input.c input/timecode.c input/raw.c input/y4m.c \
          filters/video/video.c filters/video/source.c filters/video/internal.c \
          filters/video/resize.c filters/video/cache.c filters/video/fix_vfr_pts.c \
          filters/video/select_every.c filters/video/crop.c filters/video/hqdn3d.c \
-         filters/video/yadif.c filters/video/yadif_filter_line.c
+         filters/video/yadif.c filters/video/yadif_filter_line.c filters/video/pad.c

 SRCSO =

Index: configure
===================================================================
--- configure_orig
+++ configure
@@ -770,7 +770,7 @@ Libs: $pclibs
 Cflags: -I$includedir
 EOF

-filters="crop select_every hqdn3d yadif"
+filters="crop select_every hqdn3d yadif pad"
 [ $swscale = yes ] && filters="resize $filters"

 cat > conftest.log <<EOF
Index: filters/video/pad.c
===================================================================
--- /dev/null
+++ filters/video/pad.c
@@ -0,0 +1,190 @@
+/*****************************************************************************
+ * pad.c: x264 pad filter
+ *****************************************************************************
+ * Copyright (C) 2010 James Darnley <james.darnley@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "internal.h"
+#include "video.h"
+#define NAME "pad"
+#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, NAME, __VA_ARGS__ )
+
+cli_vid_filter_t pad_filter;
+
+typedef struct {
+    hnd_t prev_handle;              /* handle of the upstream filter */
+    cli_vid_filter_t prev_filter;   /* entry points of the upstream filter */
+    int width;                      /* padded output width */
+    int height;                     /* padded output height */
+    int cols;                       /* x offset of the input picture inside the output */
+    int rows;                       /* y offset of the input picture inside the output */
+    unsigned char colour[4];        /* fill byte per plane; unsigned so 0x80..0xFF are stored exactly */
+    cli_pic_t buffer;               /* output picture, filled with colour by init */
+    const x264_cli_csp_t *csp;      /* colourspace entry for the input csp */
+} pad_handle_t;
+
+static void help( int longhelp )
+{
+    /* the syntax line is always printed; the description only for the long help */
+    printf( "      "NAME":[left][,top][,right][,bottom][,width][,height][,colour]\n" );
+    if( longhelp )
+        printf( "            adds pixels to the frame edge\n"
+                "            colour values are in YUV not RGB\n"
+                "            default colour is black\n" );
+}
+
+static const char *pad_optlist[] = { "left", "top", "right", "bottom", "width",
+                                     "height", "colour", "color", NULL };
+
+/* Parses the split option list into h->width/height/cols/rows/colour.  Nothing is
+ * allocated here, so the early return hidden in FAIL_IF_ERROR cannot leak.
+ * Returns 0 if the options are usable, non-zero (after logging) otherwise. */
+static int parse_pad_options( pad_handle_t *h, char **opts, const video_info_t *info, const x264_cli_csp_t *csp )
+{
+    int arg[7]; /* left, top, right, bottom, width, height, colour */
+    char *opt;
+    for(int i=0; i<6; i++) {
+        int mod = i&1 ? (csp->mod_height << info->interlaced) : csp->mod_width;
+        opt = x264_get_option( pad_optlist[i], opts );
+        arg[i] = x264_otoi(opt, 0);
+        /* a negative value would place the picture outside the output buffer */
+        FAIL_IF_ERROR( arg[i] < 0, "%s pad value '%s' is negative\n", pad_optlist[i], opt )
+        FAIL_IF_ERROR( arg[i] % mod, "%s pad value '%s' is not a "
+            "multiple of %d\n", pad_optlist[i], opt, mod )
+    }
+    opt = x264_get_option( pad_optlist[6], opts );
+    if(!opt)
+        opt = x264_get_option( pad_optlist[7], opts );
+    arg[6] = x264_otoi(opt, -1);
+    if(arg[6] > -1) {
+        h->colour[0] = (arg[6]&0xFF0000) >> 16;
+        h->colour[1] = (arg[6]&0xFF00) >> 8;
+        h->colour[2] = arg[6]&0xFF;
+    } else {
+        h->colour[0] = 0;
+        h->colour[1] = 0x80;
+        h->colour[2] = 0x80;
+    }
+    const int left = arg[0], top = arg[1], right = arg[2], bottom = arg[3];
+    const int req_width = arg[4], req_height = arg[5];
+    FAIL_IF_ERROR( req_width && req_width < info->width + left + right,
+        "requested width (%d) is less than requested padding (%d + %d + %d)\n",
+        req_width, info->width, left, right )
+    FAIL_IF_ERROR( req_height && req_height < info->height + top + bottom,
+        "requested height (%d) is less than requested padding (%d + %d + %d)\n",
+        req_height, info->height, top, bottom )
+
+    h->width  = req_width  ? req_width  : info->width + left + right;
+    h->height = req_height ? req_height : info->height + top + bottom;
+    /* picture origin: explicit left/top, else derived from right/bottom, else centred */
+    h->cols = left ? left
+            : right ? h->width - right - info->width
+            : (h->width - info->width)/2;
+    h->cols = ((h->cols+1) / csp->mod_width) * csp->mod_width;
+    h->rows = top ? top
+            : bottom ? h->height - bottom - info->height
+            : (h->height - info->height)/2;
+    h->rows = ((h->rows+1) / csp->mod_height) * csp->mod_height;
+    return 0;
+}
+
+/* Filter constructor.  Rejects unsupported colourspaces before allocating anything,
+ * parses the options, allocates the output picture filled with the pad colour and
+ * installs pad_filter in front of *filter / *handle (untouched if nothing changes).
+ * Returns 0 on success, -1 on failure; h is freed on every failure path. */
+static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string )
+{
+    const x264_cli_csp_t *csp = x264_cli_get_csp(info->csp);
+    FAIL_IF_ERROR( !(info->csp == X264_CSP_I420 || info->csp == X264_CSP_I422
+        || info->csp == X264_CSP_I444 || info->csp == X264_CSP_YV12 ),
+        "Only planar YUV images currently supported, patches welcome\n" )
+
+    pad_handle_t *h = calloc( 1, sizeof(*h) );
+    if( !h )
+        return -1;
+    char **opts = x264_split_options( opt_string, pad_optlist );
+    int bad_options = parse_pad_options( h, opts, info, csp );
+    x264_free_string_array(opts);
+    if( bad_options )
+        goto fail;
+    if( h->width == info->width && h->height == info->height ) {
+        free(h);
+        return 0;
+    }
+    if(x264_cli_pic_alloc( &h->buffer, info->csp, h->width, h->height ))
+        goto fail;
+    for(int i=0; i<h->buffer.img.planes; i++) {
+        memset( h->buffer.img.plane[i], h->colour[i],
+            h->height * csp->height[i] * h->buffer.img.stride[i] );
+    }
+    x264_cli_log( NAME, X264_LOG_INFO,
+        "expanding frame to %dx%d, picture starting at (%d,%d)\n",
+        h->width, h->height, h->cols, h->rows );
+
+    info->width  = h->width;
+    info->height = h->height;
+    h->prev_filter = *filter;
+    h->prev_handle = *handle;
+    h->csp = csp;
+    *handle = h;
+    *filter = pad_filter;
+    return 0;
+fail:
+    free( h );
+    return -1;
+}
+/* Pulls `frame` from the upstream filter and copies each plane into the pad buffer at the (cols, rows) offset; returns -1 if upstream fails. */
+static int get_frame( hnd_t handle, cli_pic_t *out, int frame )
+{
+    pad_handle_t *h = handle;
+    cli_pic_t in;
+
+    if( h->prev_filter.get_frame( h->prev_handle, &in, frame ) )
+        return -1;
+
+    /* NOTE(review): every field of *out now comes from the buffer, so anything
+     * else `in` carries per frame is not forwarded -- confirm that is intended. */
+    *out = h->buffer;
+
+    for(int i=0; i<in.img.planes; i++) {
+        float scale_w = h->csp->width[i];   /* per-plane scale factors from the csp entry */
+        float scale_h = h->csp->height[i];
+        int dst_stride = out->img.stride[i];
+        /* Each term is truncated to int before the multiply by the stride: done in
+         * float, the product stops being exact once it exceeds 2^24 bytes. */
+        int offset = (int)(h->cols*scale_w) + (int)(h->rows*scale_h) * dst_stride;
+        x264_cli_plane_copy( out->img.plane[i]+offset, dst_stride, in.img.plane[i], in.img.stride[i],
+            (int)(in.img.width * scale_w), (int)(in.img.height * scale_h) );
+    }
+
+    return h->prev_filter.release_frame( h->prev_handle, &in, frame );
+}
+
+/* No-op: get_frame already releases the upstream frame before returning and *out is a copy of h->buffer, so this just returns 0. */
+static int release_frame( hnd_t handle, cli_pic_t *pic, int frame )
+{
+    return 0;
+}
+/* Tears down one pad instance: the upstream filter first, then the padded picture buffer, then the handle. */
+static void free_filter( hnd_t handle )
+{
+    pad_handle_t *ctx = handle;
+    ctx->prev_filter.free( ctx->prev_handle );
+    x264_cli_pic_clean( &ctx->buffer );
+    free( ctx );
+}
+/* Filter descriptor installed by init and registered in video.c via REGISTER_VFILTER( pad ); the last slot is left NULL here. */
+cli_vid_filter_t pad_filter = { NAME, help, init, get_frame, release_frame, free_filter, NULL };
Index: filters/video/video.c
===================================================================
--- filters/video/video_orig.c
+++ filters/video/video.c
@@ -48,6 +48,7 @@ void x264_register_vid_filters()
     REGISTER_VFILTER( select_every );
     REGISTER_VFILTER( hqdn3d );
     REGISTER_VFILTER( yadif );
+    REGISTER_VFILTER( pad );
 }

 int x264_init_vid_filter( const char *name, hnd_t *handle, cli_vid_filter_t *filter,