SHARE
TWEET

First public test patch for x264's OpenCL lookahead

a guest May 18th, 2012 578 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. # HG changeset patch
  2. # User Steve Borho <steve@borho.org>
  3. # Date 1337123100 18000
  4. # Node ID 9a682cb0c74fbf0ef19ef7eaa04b20fc0e27d360
  5. # Parent  087c10a1fec707bc7c53e91f5efeb3567f0bf77f
  6. OpenCL lookahead
  7.  
  8. diff -r 087c10a1fec7 -r 9a682cb0c74f Makefile
  9. --- a/Makefile  Thu Mar 29 14:14:07 2012 -0700
  10. +++ b/Makefile  Tue May 15 18:05:00 2012 -0500
  11. @@ -8,6 +8,8 @@
  12.  vpath %.asm $(SRCPATH)
  13.  vpath %.rc $(SRCPATH)
  14.  
  15. +GENERATED =
  16. +
  17.  all: default
  18.  default:
  19.  
  20. @@ -145,6 +147,36 @@
  21.  endif
  22.  endif
  23.  
  24. +QUOTED_CFLAGS := $(CFLAGS)
  25. +
  26. +ifeq ($(HAVE_OPENCL),yes)
  27. +empty:=
  28. +space:=$(empty) $(empty)
  29. +escaped:=\ $(empty)
  30. +open:=(
  31. +escopen:=\(
  32. +close:=)
  33. +escclose:=\)
  34. +SAFE_INC_DIR := $(subst $(space),$(escaped),$(OPENCL_INC_DIR))
  35. +SAFE_INC_DIR := $(subst $(open),$(escopen),$(SAFE_INC_DIR))
  36. +SAFE_INC_DIR := $(subst $(close),$(escclose),$(SAFE_INC_DIR))
  37. +SAFE_LIB_DIR := $(subst $(space),$(escaped),$(OPENCL_LIB_DIR))
  38. +SAFE_LIB_DIR := $(subst $(open),$(escopen),$(SAFE_LIB_DIR))
  39. +SAFE_LIB_DIR := $(subst $(close),$(escclose),$(SAFE_LIB_DIR))
  40. +# For normal CFLAGS and LDFLAGS, we must escape spaces with a backslash to
  41. +# make gcc happy
  42. +CFLAGS += -I$(SAFE_INC_DIR)
  43. +LDFLAGS += -l$(OPENCL_LIB) -L$(SAFE_LIB_DIR)
  44. +# For the CFLAGS used by the .depend rule, we must add quotes because
  45. +# the rule does an extra level of shell expansions
  46. +QUOTED_CFLAGS += -I"$(OPENCL_INC_DIR)"
  47. +encoder/oclobj.h: common/opencl/x264-cl.h $(wildcard common/opencl/*.cl)
  48. +       echo "static const char x264_opencl_source [] = {" > $@
  49. +       cat $^ | xxd -i >> $@
  50. +       echo ",0x00 };" >> $@
  51. +GENERATED += encoder/oclobj.h
  52. +endif
  53. +
  54.  OBJS   += $(SRCS:%.c=%.o)
  55.  OBJCLI += $(SRCCLI:%.c=%.o)
  56.  OBJSO  += $(SRCSO:%.c=%.o)
  57. @@ -164,17 +196,22 @@
  58.         $(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
  59.  
  60.  ifneq ($(EXE),)
  61. -.PHONY: x264 checkasm
  62. +.PHONY: x264 checkasm test-opencl
  63.  x264: x264$(EXE)
  64.  checkasm: checkasm$(EXE)
  65. +test-opencl: test-opencl$(EXE)
  66.  endif
  67.  
  68. -x264$(EXE): .depend $(OBJCLI) $(CLI_LIBX264)
  69. +x264$(EXE): $(GENERATED) .depend $(OBJCLI) $(CLI_LIBX264)
  70.         $(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS)
  71.  
  72. -checkasm$(EXE): .depend $(OBJCHK) $(LIBX264)
  73. +checkasm$(EXE): $(GENERATED) .depend $(OBJCHK) $(LIBX264)
  74.         $(LD)$@ $(OBJCHK) $(LIBX264) $(LDFLAGS)
  75.  
  76. +OBJOCL = tools/test-opencl.o
  77. +test-opencl$(EXE): .depend ${OBJOCL} $(LIBX264)
  78. +       $(LD)$@ $(OBJOCL) $(LIBX264) $(LDFLAGS)
  79. +
  80.  $(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK): .depend
  81.  
  82.  %.o: %.asm
  83. @@ -193,7 +230,7 @@
  84.  
  85.  .depend: config.mak
  86.         @rm -f .depend
  87. -       @$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(CC) $(CFLAGS) $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.c=%.o) $(DEPMM) 1>> .depend;)
  88. +       @$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(CC) $(QUOTED_CFLAGS) $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.c=%.o) $(DEPMM) 1>> .depend;)
  89.  
  90.  config.mak:
  91.         ./configure
  92. @@ -231,7 +268,7 @@
  93.  
  94.  clean:
  95.         rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(SONAME) *.a *.lib *.exp *.pdb x264 x264.exe .depend TAGS
  96. -       rm -f checkasm checkasm.exe $(OBJCHK)
  97. +       rm -f checkasm checkasm.exe $(OBJCHK) $(GENERATED) x264_lookahead.clbin
  98.         rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock
  99.  
  100.  distclean: clean
  101. diff -r 087c10a1fec7 -r 9a682cb0c74f common/common.c
  102. --- a/common/common.c   Thu Mar 29 14:14:07 2012 -0700
  103. +++ b/common/common.c   Tue May 15 18:05:00 2012 -0500
  104. @@ -170,6 +170,10 @@
  105.      param->b_pic_struct = 0;
  106.      param->b_fake_interlaced = 0;
  107.      param->i_frame_packing = -1;
  108. +#if HAVE_OPENCL
  109. +    param->b_opencl = 1;
  110. +    param->psz_clbin_file = NULL;
  111. +#endif
  112.  }
  113.  
  114.  static int x264_param_apply_preset( x264_param_t *param, const char *preset )
  115. @@ -1013,6 +1017,10 @@
  116.          p->b_fake_interlaced = atobool(value);
  117.      OPT("frame-packing")
  118.          p->i_frame_packing = atoi(value);
  119. +    OPT("opencl")
  120. +        p->b_opencl = atobool( value );
  121. +    OPT("clbin-file")
  122. +        p->psz_clbin_file = strdup( value );
  123.      else
  124.          return X264_PARAM_BAD_NAME;
  125.  #undef OPT
  126. diff -r 087c10a1fec7 -r 9a682cb0c74f common/common.h
  127. --- a/common/common.h   Thu Mar 29 14:14:07 2012 -0700
  128. +++ b/common/common.h   Tue May 15 18:05:00 2012 -0500
  129. @@ -92,6 +92,11 @@
  130.  #include <assert.h>
  131.  #include <limits.h>
  132.  
  133. +#if HAVE_OPENCL
  134. +#    define NUM_IMAGE_SCALES 4
  135. +#    include "CL/cl.h"
  136. +#endif
  137. +
  138.  #if HAVE_INTERLACED
  139.  #   define MB_INTERLACED h->mb.b_interlaced
  140.  #   define SLICE_MBAFF h->sh.b_mbaff
  141. @@ -943,6 +948,65 @@
  142.      struct visualize_t *visualize;
  143.  #endif
  144.      x264_lookahead_t *lookahead;
  145. +
  146. +#if HAVE_OPENCL
  147. +#define MAX_FINISH_COPIES 1024
  148. +#define PAGE_LOCKED_BUF_SIZE 32 * 1024 * 1024
  149. +    struct
  150. +    {
  151. +        cl_context       context;
  152. +        cl_device_id     device;
  153. +        cl_command_queue queue;
  154. +
  155. +        cl_program       lookahead_program;
  156. +        cl_int           last_buf;
  157. +
  158. +        cl_mem           page_locked_buffer;
  159. +        char            *page_locked_ptr;
  160. +        int              pl_occupancy;
  161. +
  162. +        struct
  163. +        {
  164. +            void *src;
  165. +            void *dest;
  166. +            int   bytes;
  167. +        } copies[MAX_FINISH_COPIES];
  168. +        int         num_copies;
  169. +
  170. +        int         b_device_AMD_SI;
  171. +
  172. +        /* downscale lowres luma */
  173. +        cl_kernel   downscale_hpel_kernel;
  174. +        cl_kernel   downscale_kernel1;
  175. +        cl_kernel   downscale_kernel2;
  176. +        cl_mem      luma_16x16_image[2];
  177. +
  178. +        /* weightp filtering */
  179. +        cl_kernel   weightp_hpel_kernel;
  180. +        cl_kernel   weightp_scaled_images_kernel;
  181. +        cl_mem      weighted_scaled_images[NUM_IMAGE_SCALES];
  182. +        cl_mem      weighted_luma_hpel;
  183. +
  184. +        /* intra */
  185. +        cl_kernel   memset_kernel;
  186. +        cl_kernel   intra_kernel;
  187. +        cl_kernel   rowsum_intra_kernel;
  188. +        cl_mem      row_satds[2];
  189. +
  190. +        /* hierarchical motion estimation */
  191. +        cl_kernel   hme_kernel;
  192. +        cl_kernel   subpel_refine_kernel;
  193. +        cl_mem      mv_buffers[2];
  194. +        cl_mem      lowres_mv_costs;
  195. +        cl_mem      mvp_buffer;
  196. +
  197. +        /* bidir */
  198. +        cl_kernel   mode_select_kernel;
  199. +        cl_kernel   rowsum_inter_kernel;
  200. +        cl_mem      lowres_costs[2];
  201. +        cl_mem      frame_stats[2]; /* cost_est, cost_est_aq, intra_mbs */
  202. +    } opencl;
  203. +#endif
  204.  };
  205.  
  206.  // included at the end because it needs x264_t
  207. diff -r 087c10a1fec7 -r 9a682cb0c74f common/frame.c
  208. --- a/common/frame.c    Thu Mar 29 14:14:07 2012 -0700
  209. +++ b/common/frame.c    Tue May 15 18:05:00 2012 -0500
  210. @@ -258,6 +258,23 @@
  211.      return NULL;
  212.  }
  213.  
  214. +#if HAVE_OPENCL
  215. +static void x264_opencl_frame_delete( x264_frame_t *frame )
  216. +{
  217. +#define RELEASEBUF(mem) if( mem ) clReleaseMemObject( mem );
  218. +    for( int j = 0; j < NUM_IMAGE_SCALES; j++ )
  219. +        RELEASEBUF( frame->opencl.scaled_image2Ds[j] );
  220. +    RELEASEBUF( frame->opencl.luma_hpel );
  221. +    RELEASEBUF( frame->opencl.inv_qscale_factor );
  222. +    RELEASEBUF( frame->opencl.intra_cost );
  223. +    RELEASEBUF( frame->opencl.lowres_mvs0 );
  224. +    RELEASEBUF( frame->opencl.lowres_mvs1 );
  225. +    RELEASEBUF( frame->opencl.lowres_mv_costs0 );
  226. +    RELEASEBUF( frame->opencl.lowres_mv_costs1 );
  227. +#undef RELEASEBUF
  228. +}
  229. +#endif
  230. +
  231.  void x264_frame_delete( x264_frame_t *frame )
  232.  {
  233.      /* Duplicate frames are blank copies of real frames (including pointers),
  234. @@ -302,6 +319,9 @@
  235.          x264_free( frame->ref[1] );
  236.          x264_pthread_mutex_destroy( &frame->mutex );
  237.          x264_pthread_cond_destroy( &frame->cv );
  238. +#if HAVE_OPENCL
  239. +        x264_opencl_frame_delete( frame );
  240. +#endif
  241.      }
  242.      x264_free( frame );
  243.  }
  244. diff -r 087c10a1fec7 -r 9a682cb0c74f common/frame.h
  245. --- a/common/frame.h    Thu Mar 29 14:14:07 2012 -0700
  246. +++ b/common/frame.h    Tue May 15 18:05:00 2012 -0500
  247. @@ -170,6 +170,20 @@
  248.      /* user frame properties */
  249.      uint8_t *mb_info;
  250.      void (*mb_info_free)( void* );
  251. +
  252. +#if HAVE_OPENCL
  253. +    struct
  254. +    {
  255. +        cl_mem scaled_image2Ds[NUM_IMAGE_SCALES];
  256. +        cl_mem luma_hpel;
  257. +        cl_mem inv_qscale_factor;
  258. +        cl_mem intra_cost;
  259. +        cl_mem lowres_mvs0;
  260. +        cl_mem lowres_mvs1;
  261. +        cl_mem lowres_mv_costs0;
  262. +        cl_mem lowres_mv_costs1;
  263. +    } opencl;
  264. +#endif
  265.  } x264_frame_t;
  266.  
  267.  /* synchronized frame list */
  268. diff -r 087c10a1fec7 -r 9a682cb0c74f common/opencl/bidir.cl
  269. --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
  270. +++ b/common/opencl/bidir.cl    Tue May 15 18:05:00 2012 -0500
  271. @@ -0,0 +1,254 @@
  272. +
  273. +/* Four threads cooperatively measure 8x8 BIDIR cost with SATD */
  274. +int bidir_satd_8x8_ii_coop4(
  275. +    read_only image2d_t fenc_lowres,  int2 fencpos,
  276. +    read_only image2d_t fref0_planes, int2 qpos0,
  277. +    read_only image2d_t fref1_planes, int2 qpos1,
  278. +    int weight,
  279. +    local sum2_t *tmpp,
  280. +    int idx )
  281. +{
  282. +    volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp;
  283. +    sum2_t b0, b1, b2, b3;
  284. +    sum2_t sum = 0;
  285. +
  286. +    // fencpos is full-pel position of original MB
  287. +    // qpos0 is qpel position within reference frame 0
  288. +    // qpos1 is qpel position within reference frame 1
  289. +
  290. +    int2 fref0Apos = (int2)(qpos0.x>>2, qpos0.y>>2);
  291. +    int hpel0A = ((qpos0.x&2)>>1) + (qpos0.y&2);
  292. +
  293. +    int2 qpos0B = (int2)qpos0 + (int2)(((qpos0.x&1)<<1), ((qpos0.y&1)<<1));
  294. +    int2 fref0Bpos = (int2)(qpos0B.x>>2, qpos0B.y>>2);
  295. +    int hpel0B = ((qpos0B.x&2)>>1) + (qpos0B.y&2);
  296. +
  297. +    int2 fref1Apos = (int2)(qpos1.x>>2, qpos1.y>>2);
  298. +    int hpel1A = ((qpos1.x&2)>>1) + (qpos1.y&2);
  299. +
  300. +    int2 qpos1B = (int2)qpos1 + (int2)(((qpos1.x&1)<<1), ((qpos1.y&1)<<1));
  301. +    int2 fref1Bpos = (int2)(qpos1B.x>>2, qpos1B.y>>2);
  302. +    int hpel1B = ((qpos1B.x&2)>>1) + (qpos1B.y&2);
  303. +
  304. +    uint mask_shift0A = 8 * (3 - hpel0A), mask_shift0B = 8 * (3 - hpel0B);
  305. +    uint mask0A = 0x000000ff << mask_shift0A, mask0B = 0x000000ff << mask_shift0B;
  306. +    uint mask_shift1A = 8 * (3 - hpel1A), mask_shift1B = 8 * (3 - hpel1B);
  307. +    uint mask1A = 0x000000ff << mask_shift1A, mask1B = 0x000000ff << mask_shift1B;
  308. +
  309. +    uint vA, vB;
  310. +    uint enc, ref0, ref1;
  311. +    uint a0, a1;
  312. +    const int weight2 = 64 - weight;
  313. +
  314. +#define READ_BIDIR_DIFF( OUT, X )\
  315. +    enc = read_imageui( fenc_lowres, sampler, fencpos + (int2)(X, idx) ).s0;\
  316. +    vA = (read_imageui( fref0_planes, sampler, fref0Apos + (int2)(X, idx) ).s0 & mask0A) >> mask_shift0A;\
  317. +    vB = (read_imageui( fref0_planes, sampler, fref0Bpos + (int2)(X, idx) ).s0 & mask0B) >> mask_shift0B;\
  318. +    ref0 = rhadd( vA, vB );\
  319. +    vA = (read_imageui( fref1_planes, sampler, fref1Apos + (int2)(X, idx) ).s0 & mask1A) >> mask_shift1A;\
  320. +    vB = (read_imageui( fref1_planes, sampler, fref1Bpos + (int2)(X, idx) ).s0 & mask1B) >> mask_shift1B;\
  321. +    ref1 = rhadd( vA, vB );\
  322. +    OUT = enc - ((ref0 * weight + ref1 * weight2 + (1 << 5)) >> 6);
  323. +
  324. +#define READ_DIFF_EX( OUT, a, b )\
  325. +    READ_BIDIR_DIFF( a0, a );\
  326. +    READ_BIDIR_DIFF( a1, b );\
  327. +    OUT = a0 + (a1<<BITS_PER_SUM);
  328. +
  329. +#define ROW_8x4_SATD( a, b, c )\
  330. +    fencpos.y += a;\
  331. +    fref0Apos.y += b;\
  332. +    fref0Bpos.y += b;\
  333. +    fref1Apos.y += c;\
  334. +    fref1Bpos.y += c;\
  335. +    READ_DIFF_EX( b0, 0, 4 );\
  336. +    READ_DIFF_EX( b1, 1, 5 );\
  337. +    READ_DIFF_EX( b2, 2, 6 );\
  338. +    READ_DIFF_EX( b3, 3, 7 );\
  339. +    HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\
  340. +    HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\
  341. +    sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 );
  342. +
  343. +    ROW_8x4_SATD( 0, 0, 0 );
  344. +    ROW_8x4_SATD( 4, 4, 4 );
  345. +
  346. +#undef READ_BIDIR_DIFF
  347. +#undef READ_DIFF_EX
  348. +#undef ROW_8x4_SATD
  349. +
  350. +    return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;
  351. +}
  352. +
  353. +/*
  354. + * mode selection
  355. + *
  356. + * global launch dimensions:  [mb_width, mb_height]
  357. + *
  358. + * If this is a B frame, launch dims are [mb_width * 4, mb_height]
  359. + */
  360. +kernel void mode_selection(
  361. +    read_only image2d_t   fenc_lowres,
  362. +    read_only image2d_t   fref0_planes,
  363. +    read_only image2d_t   fref1_planes,
  364. +    const global short2  *fenc_lowres_mvs0,
  365. +    const global short2  *fenc_lowres_mvs1,
  366. +    const global short2  *fref1_lowres_mvs0,
  367. +    const global int16_t *fenc_lowres_mv_costs0,
  368. +    const global int16_t *fenc_lowres_mv_costs1,
  369. +    const global uint16_t *fenc_intra_cost,
  370. +    global uint16_t      *lowres_costs,
  371. +    global int           *frame_stats,
  372. +    local int16_t        *cost_local,
  373. +    local sum2_t         *satd_local,
  374. +    int                   mb_width,
  375. +    int                   bipred_weight,
  376. +    int                   dist_scale_factor,
  377. +    int                   b,
  378. +    int                   p0,
  379. +    int                   p1,
  380. +    int                   lambda )
  381. +{
  382. +    int mb_x = get_global_id( 0 );
  383. +    int b_bidir = (b < p1);
  384. +    if( b_bidir )
  385. +    {
  386. +        mb_x >>= 2;
  387. +        if( mb_x >= mb_width )
  388. +            return;
  389. +    }
  390. +    int mb_y = get_global_id( 1 );
  391. +    int mb_height = get_global_size( 1 );
  392. +    int mb_count = mb_width * mb_height;
  393. +    int mb_xy = mb_x + mb_y * mb_width;
  394. +
  395. +    /* Initialize frame stats for next kernel (sum_inter_cost) */
  396. +    if( mb_x < 4 && mb_y == 0 )
  397. +        frame_stats[mb_x] = 0;
  398. +
  399. +    int bcost = COST_MAX;
  400. +    int list_used = 0;
  401. +
  402. +    if( !b_bidir )
  403. +    {
  404. +        int icost = fenc_intra_cost[mb_xy];
  405. +        COPY2_IF_LT( bcost, icost, list_used, 0 );
  406. +    }
  407. +    if( b != p0 )
  408. +    {
  409. +        int mv_cost0 = fenc_lowres_mv_costs0[(b - p0 - 1) * mb_count + mb_xy];
  410. +        COPY2_IF_LT( bcost, mv_cost0, list_used, 1 );
  411. +    }
  412. +    if( b != p1 )
  413. +    {
  414. +        int mv_cost1 = fenc_lowres_mv_costs1[(p1 - b - 1) * mb_count + mb_xy];
  415. +        COPY2_IF_LT( bcost, mv_cost1, list_used, 2 );
  416. +    }
  417. +
  418. +    if( b_bidir )
  419. +    {
  420. +        int2 coord = (int2)(mb_x << 3, mb_y << 3);
  421. +        int mb_i = get_global_id( 0 ) & 3;
  422. +        int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
  423. +        cost_local += mb_in_group * 4;
  424. +        satd_local += mb_in_group * 16;
  425. +
  426. +#define TRY_BIDIR( mv0, mv1, penalty )\
  427. +    {\
  428. +        int2 qpos0 = (int2)((coord.x<<2) + mv0.x, (coord.y<<2) + mv0.y);\
  429. +        int2 qpos1 = (int2)((coord.x<<2) + mv1.x, (coord.y<<2) + mv1.y);\
  430. +        cost_local[mb_i] = bidir_satd_8x8_ii_coop4( fenc_lowres, coord, fref0_planes, qpos0, fref1_planes, qpos1, bipred_weight, satd_local, mb_i );\
  431. +        int cost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];\
  432. +        COPY2_IF_LT( bcost, penalty * lambda + cost, list_used, 3 );\
  433. +    }
  434. +        /* temporal prediction */
  435. +        short2 dmv0, dmv1;
  436. +        short2 mvr = fref1_lowres_mvs0[mb_xy];
  437. +        dmv0.x = (mvr.x * dist_scale_factor + 128) >> 8;
  438. +        dmv0.y = (mvr.y * dist_scale_factor + 128) >> 8;
  439. +        dmv1.x = dmv0.x - mvr.x;
  440. +        dmv1.y = dmv0.y - mvr.y;
  441. +        TRY_BIDIR( dmv0, dmv1, 0 )
  442. +
  443. +        if( as_uint( dmv0 ) || as_uint( dmv1 ) )
  444. +        {
  445. +            /* B-direct prediction */
  446. +            dmv0 = 0; dmv1 = 0;
  447. +            TRY_BIDIR( dmv0, dmv1, 0 );
  448. +        }
  449. +
  450. +        /* L0+L1 prediction */
  451. +        dmv0 = fenc_lowres_mvs0[(b - p0 - 1) * mb_count + mb_xy];
  452. +        dmv1 = fenc_lowres_mvs1[(p1 - b - 1) * mb_count + mb_xy];
  453. +        TRY_BIDIR( dmv0, dmv1, 5 );
  454. +#undef TRY_BIDIR
  455. +    }
  456. +
  457. +    lowres_costs[mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT);
  458. +}
  459. +
  460. +/*
  461. + * parallel sum inter costs
  462. + *
  463. + * global launch dimensions: [256, mb_height]
  464. + */
  465. +kernel void sum_inter_cost(
  466. +    const global uint16_t *fenc_lowres_costs,
  467. +    const global uint16_t *inv_qscale_factor,
  468. +    global int           *fenc_row_satds,
  469. +    global int           *frame_stats,
  470. +    int                   mb_width,
  471. +    int                   bframe_bias,
  472. +    int                   b,
  473. +    int                   p0,
  474. +    int                   p1 )
  475. +{
  476. +    int y = get_global_id( 1 );
  477. +    int mb_height = get_global_size( 1 );
  478. +
  479. +    int row_satds = 0;
  480. +    int cost_est = 0;
  481. +    int cost_est_aq = 0;
  482. +    int intra_mbs = 0;
  483. +
  484. +    for( int x = get_global_id( 0 ); x < mb_width; x += get_global_size( 0 ))
  485. +    {
  486. +        int mb_xy = x + y * mb_width;
  487. +        int cost = fenc_lowres_costs[mb_xy] & LOWRES_COST_MASK;
  488. +        int list = fenc_lowres_costs[mb_xy] >> LOWRES_COST_SHIFT;
  489. +        int b_frame_score_mb = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2;
  490. +
  491. +        if( list == 0 && b_frame_score_mb )
  492. +        {
  493. +            intra_mbs++;
  494. +        }
  495. +
  496. +        int cost_aq = (cost * inv_qscale_factor[mb_xy] + 128) >> 8;
  497. +
  498. +        row_satds += cost_aq;
  499. +
  500. +        if( b_frame_score_mb )
  501. +        {
  502. +            cost_est += cost;
  503. +            cost_est_aq += cost_aq;
  504. +        }
  505. +    }
  506. +
  507. +    local int buffer[256];
  508. +    int x = get_global_id( 0 );
  509. +
  510. +    row_satds   = parallel_sum( row_satds, x, buffer );
  511. +    cost_est    = parallel_sum( cost_est, x, buffer );
  512. +    cost_est_aq = parallel_sum( cost_est_aq, x, buffer );
  513. +    intra_mbs   = parallel_sum( intra_mbs, x, buffer );
  514. +
  515. +    if( b != p1 )
  516. +        cost_est = (int)((float)cost_est * 100 / (120 + bframe_bias));
  517. +
  518. +    if( get_global_id( 0 ) == 0 )
  519. +    {
  520. +        fenc_row_satds[y] = row_satds;
  521. +        atomic_add( frame_stats + 0, cost_est );
  522. +        atomic_add( frame_stats + 1, cost_est_aq );
  523. +        atomic_add( frame_stats + 2, intra_mbs );
  524. +    }
  525. +}
  526. diff -r 087c10a1fec7 -r 9a682cb0c74f common/opencl/downscale.cl
  527. --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
  528. +++ b/common/opencl/downscale.cl        Tue May 15 18:05:00 2012 -0500
  529. @@ -0,0 +1,138 @@
  530. +/*
  531. + * downscale lowres luma: full-res buffer to down scale image, and to packed hpel image
  532. + *
  533. + * --
  534. + *
  535. + * fenc_img is an output image (area of memory referenced through a texture
  536. + * cache). A read of any pixel location (x,y) returns four pixel values:
  537. + *
  538. + * val.s0 = P(x,y)
  539. + * val.s1 = P(x+1,y)
  540. + * val.s2 = P(x+2,y)
  541. + * val.s3 = P(x+3,y)
  542. + *
  543. + * This is a 4x replication of the lowres pixels, a trade-off between memory
  544. + * size and read latency.
  545. + *
  546. + * --
  547. + *
  548. + * hpel_planes is an output image that contains the four HPEL planes used for
  549. + * subpel refinement. A read of any pixel location (x,y) returns four HPEL pixel
  550. + * values:
  551. + *
  552. + * val.s0 = F(x,y) - aka the full pel pixel
  553. + * val.s1 = H(x,y) - horizontally interpolated pixel
  554. + * val.s2 = V(x,y) - vertically interpolated pixel
  555. + * val.s3 = C(x,y) - center (or corner) pixel
  556. + *
  557. + * launch dimensions:  [lowres-width, lowres-height]
  558. + */
  559. +kernel void downscale_hpel(
  560. +    const global pixel *fenc,
  561. +    write_only image2d_t fenc_img,
  562. +    write_only image2d_t hpel_planes,
  563. +    int stride )
  564. +{
  565. +    int x = get_global_id( 0 );
  566. +    int y = get_global_id( 1 );
  567. +    uint4 values;
  568. +
  569. +    fenc += y * stride * 2;
  570. +    const global pixel *src1 = fenc + stride;
  571. +    const global pixel *src2 = (y == get_global_size( 1 )-1) ? src1 : src1 + stride;
  572. +    int2 pos = (int2)(x, y);
  573. +    pixel right, left;
  574. +
  575. +    right = rhadd( fenc[x*2], src1[x*2] );
  576. +    left  = rhadd( fenc[x*2+1], src1[x*2+1] );
  577. +    values.s0 = rhadd( right, left );           // F
  578. +
  579. +    right = rhadd( fenc[2*x+1], src1[2*x+1] );
  580. +    left  = rhadd( fenc[2*x+2], src1[2*x+2] );
  581. +    values.s1 = rhadd( right, left );           // H
  582. +
  583. +    right = rhadd( src1[2*x], src2[2*x] );
  584. +    left  = rhadd( src1[2*x+1], src2[2*x+1] );
  585. +    values.s2 = rhadd( right, left );           // V
  586. +
  587. +    right = rhadd( src1[2*x+1], src2[2*x+1] );
  588. +    left  = rhadd( src1[2*x+2], src2[2*x+2] );
  589. +    values.s3 = rhadd( right, left );           // C
  590. +
  591. +    uint4 val = (uint4)(((values.s0 & 0xff) << 24) + ((values.s1 & 0xff) << 16) + ((values.s2 & 0xff) << 8) + (values.s3 & 0xff), 0, 0, 0);
  592. +    write_imageui( hpel_planes, pos, val );
  593. +
  594. +    x = x+1 < get_global_size( 0 ) ? x+1 : x;
  595. +    right = rhadd( fenc[x*2], src1[x*2] );
  596. +    left  = rhadd( fenc[x*2+1], src1[x*2+1] );
  597. +    values.s1 = rhadd( right, left );
  598. +
  599. +    x = x+1 < get_global_size( 0 ) ? x+1 : x;
  600. +    right = rhadd( fenc[x*2], src1[x*2] );
  601. +    left  = rhadd( fenc[x*2+1], src1[x*2+1] );
  602. +    values.s2 = rhadd( right, left );
  603. +
  604. +    x = x+1 < get_global_size( 0 ) ? x+1 : x;
  605. +    right = rhadd( fenc[x*2], src1[x*2] );
  606. +    left  = rhadd( fenc[x*2+1], src1[x*2+1] );
  607. +    values.s3 = rhadd( right, left );
  608. +
  609. +    write_imageui( fenc_img, pos, values );
  610. +}
  611. +
  612. +/*
  613. + * downscale lowres hierarchical motion search image
  614. + *
  615. + * launch dimensions:  [lowres-width, lowres-height]
  616. + */
  617. +kernel void downscale1( read_only image2d_t higher_res, write_only image2d_t lower_res )
  618. +{
  619. +    int x = get_global_id( 0 );
  620. +    int y = get_global_id( 1 );
  621. +    int2 pos = (int2)(x, y);
  622. +    int gs = get_global_size( 0 );
  623. +    uint4 top, bot, values;
  624. +    top = read_imageui( higher_res, sampler, (int2)( x*2, 2*y ) );
  625. +    bot = read_imageui( higher_res, sampler, (int2)( x*2, 2*y+1 ) );
  626. +    values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) );
  627. +
  628. +    // these select statements appear redundant, and they should be, but tests break when
  629. +    // they are not here.
  630. +    values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) );
  631. +    top = read_imageui( higher_res, sampler, (int2)( x*2+4, 2*y ) );
  632. +    bot = read_imageui( higher_res, sampler, (int2)( x*2+4, 2*y+1 ) );
  633. +    values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) );
  634. +    values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) );
  635. +    write_imageui( lower_res, pos, (uint4)(values) );
  636. +}
  637. +
  638. +/*
  639. + * Second copy of downscale kernel, no differences. This is a (no perf loss) workaround for a
  640. + * scheduling bug in current Tahiti drivers.
  641. + */
  642. +kernel void downscale2( read_only image2d_t higher_res, write_only image2d_t lower_res )
  643. +{
  644. +    int x = get_global_id( 0 );
  645. +    int y = get_global_id( 1 );
  646. +    int2 pos = (int2)(x, y);
  647. +    int gs = get_global_size( 0 );
  648. +    uint4 top, bot, values;
  649. +    top = read_imageui( higher_res, sampler, (int2)( x*2, 2*y ) );
  650. +    bot = read_imageui( higher_res, sampler, (int2)( x*2, 2*y+1 ) );
  651. +    values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) );
  652. +
  653. +    // these select statements appear redundant, and they should be, but tests break when
  654. +    // they are not here.
  655. +    values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) );
  656. +    top = read_imageui( higher_res, sampler, (int2)( x*2+4, 2*y ) );
  657. +    bot = read_imageui( higher_res, sampler, (int2)( x*2+4, 2*y+1 ) );
  658. +    values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) );
  659. +    values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) );
  660. +    write_imageui( lower_res, pos, (uint4)(values) );
  661. +}
  662. +
  663. +/* OpenCL 1.2 finally added a memset command, but we're not targeting 1.2 */
  664. +kernel void memset_int16( global int16_t *buf, int16_t value )
  665. +{
  666. +    buf[get_global_id( 0 )] = value;
  667. +}
  668. diff -r 087c10a1fec7 -r 9a682cb0c74f common/opencl/intra.cl
  669. --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
  670. +++ b/common/opencl/intra.cl    Tue May 15 18:05:00 2012 -0500
  671. @@ -0,0 +1,1053 @@
  672. +
  673. +inline uint32_t pack16to32( uint32_t a, uint32_t b )
  674. +{
  675. +    return a + (b << 16);
  676. +}
  677. +
  678. +inline uint32_t pack8to16( uint32_t a, uint32_t b )
  679. +{
  680. +    return a + (b << 8);
  681. +}
  682. +
  683. +inline int clamp_int( int a, int b, int c )
  684. +{
  685. +    return clamp( a, b, c );
  686. +}
  687. +
  688. +#if VECTORIZE
  689. +int satd_8x4_intra_lr( const local pixel *data, int data_stride, int8 pr0, int8 pr1, int8 pr2, int8 pr3 )
  690. +{
  691. +    int8 a_v, d_v;
  692. +    int2 tmp00, tmp01, tmp02, tmp03, tmp10, tmp11, tmp12, tmp13;
  693. +    int2 tmp20, tmp21, tmp22, tmp23, tmp30, tmp31, tmp32, tmp33;
  694. +
  695. +    d_v = convert_int8( vload8( 0, data ) );
  696. +    a_v.s01234567 = (d_v - pr0).s04152637;
  697. +    HADAMARD4V( tmp00, tmp01, tmp02, tmp03, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
  698. +
  699. +    data += data_stride;
  700. +    d_v = convert_int8( vload8( 0, data ) );
  701. +    a_v.s01234567 = (d_v - pr1).s04152637;
  702. +    HADAMARD4V( tmp10, tmp11, tmp12, tmp13, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
  703. +
  704. +    data += data_stride;
  705. +    d_v = convert_int8( vload8( 0, data ) );
  706. +    a_v.s01234567 = (d_v - pr2).s04152637;
  707. +    HADAMARD4V( tmp20, tmp21, tmp22, tmp23, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
  708. +
  709. +    data += data_stride;
  710. +    d_v = convert_int8( vload8( 0, data ) );
  711. +    a_v.s01234567 = (d_v - pr3).s04152637;
  712. +    HADAMARD4V( tmp30, tmp31, tmp32, tmp33, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
  713. +
  714. +    uint8 sum_v;
  715. +
  716. +    HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp00, tmp10, tmp20, tmp30 );
  717. +    sum_v = abs( a_v );
  718. +
  719. +    HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp01, tmp11, tmp21, tmp31 );
  720. +    sum_v += abs( a_v );
  721. +
  722. +    HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp02, tmp12, tmp22, tmp32 );
  723. +    sum_v += abs( a_v );
  724. +
  725. +    HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp03, tmp13, tmp23, tmp33 );
  726. +    sum_v += abs( a_v );
  727. +
  728. +    uint4 sum2 = sum_v.hi + sum_v.lo;
  729. +    uint2 sum3 = sum2.hi + sum2.lo;
  730. +    return ( sum3.hi + sum3.lo ) >> 1;
  731. +}
  732. +#else
  733. +SATD_C_8x4_Q( satd_8x4_lp, const local, private )
  734. +#endif
  735. +
  736. +/****************************************************************************
  737. + * 8x8 prediction for intra luma block
  738. + ****************************************************************************/
  739. +
  740. +#define F1            rhadd
  741. +#define F2( a, b, c ) ( a+2*b+c+2 )>>2
  742. +
  743. +#if VECTORIZE
/* SATD cost of Intra_8x8 diagonal-down-left prediction (vectorized path).
 * The predicted block is built four rows at a time into int8 registers
 * (pr0..pr3, one vector per row) and scored with satd_8x4_intra_lr, so
 * pred[] is never written to memory.
 * src: 8x8 source block in local memory; top: 16 top/top-right neighbors. */
int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top )
{
    int8 pr0, pr1, pr2, pr3;

    // Upper half of pred[]
    pr0.s0 = ( 2 + top[0] + 2*top[1] + top[2] ) >> 2;
    pr0.s1 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2;
    pr0.s2 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
    pr0.s3 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
    pr0.s4 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
    pr0.s5 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
    pr0.s6 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
    pr0.s7 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;

    pr1.s0 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2;
    pr1.s1 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
    pr1.s2 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
    pr1.s3 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
    pr1.s4 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
    pr1.s5 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
    pr1.s6 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr1.s7 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;

    pr2.s0 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
    pr2.s1 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
    pr2.s2 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
    pr2.s3 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
    pr2.s4 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
    pr2.s5 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr2.s6 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
    pr2.s7 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;

    pr3.s0 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
    pr3.s1 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
    pr3.s2 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
    pr3.s3 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
    pr3.s4 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr3.s5 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
    pr3.s6 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
    pr3.s7 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );

    // Lower half of pred[]
    pr0.s0 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
    pr0.s1 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
    pr0.s2 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
    pr0.s3 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr0.s4 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
    pr0.s5 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
    pr0.s6 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
    pr0.s7 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;

    pr1.s0 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
    pr1.s1 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
    pr1.s2 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr1.s3 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
    pr1.s4 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
    pr1.s5 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
    pr1.s6 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
    pr1.s7 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;

    pr2.s0 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
    pr2.s1 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr2.s2 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
    pr2.s3 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
    pr2.s4 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
    pr2.s5 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
    pr2.s6 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
    pr2.s7 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2;

    pr3.s0 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr3.s1 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
    pr3.s2 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
    pr3.s3 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
    pr3.s4 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
    pr3.s5 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
    pr3.s6 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2;
    // Bottom-right corner: top[15] is replicated past the end of the row.
    pr3.s7 = ( 2 + top[14] + 3*top[15] ) >> 2;

    return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
}
  825. +
/* SATD cost of Intra_8x8 diagonal-down-right prediction (vectorized path).
 * Each filtered neighbor value lies on one down-right diagonal of pred[],
 * so it is assigned to every vector lane on that diagonal at once.
 * left: 8 left neighbors, top: top neighbors, left_top: corner pixel. */
int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
{
    int8 pr0, pr1, pr2, pr3;

    // Upper half of pred[]
    pr3.s0 = F2( left[1], left[2], left[3] );
    pr2.s0 = pr3.s1 = F2( left[0], left[1], left[2] );
    pr1.s0 = pr2.s1 = pr3.s2 = F2( left[1], left[0], left_top );
    pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[0], left_top, top[0] );
    pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left_top, top[0], top[1] );
    pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( top[0], top[1], top[2] );
    pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( top[1], top[2], top[3] );
    pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( top[2], top[3], top[4] );
    pr0.s5 = pr1.s6 = pr2.s7 = F2( top[3], top[4], top[5] );
    pr0.s6 = pr1.s7 = F2( top[4], top[5], top[6] );
    pr0.s7 = F2( top[5], top[6], top[7] );
    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );

    // Lower half of pred[]: same diagonals shifted down four rows.
    pr3.s0 = F2( left[5], left[6], left[7] );
    pr2.s0 = pr3.s1 = F2( left[4], left[5], left[6] );
    pr1.s0 = pr2.s1 = pr3.s2 = F2( left[3], left[4], left[5] );
    pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[2], left[3], left[4] );
    pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left[1], left[2], left[3] );
    pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( left[0], left[1], left[2] );
    pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( left[1], left[0], left_top );
    pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( left[0], left_top, top[0] );
    pr0.s5 = pr1.s6 = pr2.s7 = F2( left_top, top[0], top[1] );
    pr0.s6 = pr1.s7 = F2( top[0], top[1], top[2] );
    pr0.s7 = F2( top[1], top[2], top[3] );
    return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
}
  858. +
  859. +int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
  860. +{
  861. +    int8 pr0, pr1, pr2, pr3;
  862. +
  863. +    // Upper half of pred[]
  864. +    pr2.s0 = F2( left[1], left[0], left_top );
  865. +    pr3.s0 = F2( left[2], left[1], left[0] );
  866. +    pr1.s0 = pr3.s1 = F2( left[0], left_top, top[0] );
  867. +    pr0.s0 = pr2.s1 = F1( left_top, top[0] );
  868. +    pr1.s1 = pr3.s2 = F2( left_top, top[0], top[1] );
  869. +    pr0.s1 = pr2.s2 = F1( top[0], top[1] );
  870. +    pr1.s2 = pr3.s3 = F2( top[0], top[1], top[2] );
  871. +    pr0.s2 = pr2.s3 = F1( top[1], top[2] );
  872. +    pr1.s3 = pr3.s4 = F2( top[1], top[2], top[3] );
  873. +    pr0.s3 = pr2.s4 = F1( top[2], top[3] );
  874. +    pr1.s4 = pr3.s5 = F2( top[2], top[3], top[4] );
  875. +    pr0.s4 = pr2.s5 = F1( top[3], top[4] );
  876. +    pr1.s5 = pr3.s6 = F2( top[3], top[4], top[5] );
  877. +    pr0.s5 = pr2.s6 = F1( top[4], top[5] );
  878. +    pr1.s6 = pr3.s7 = F2( top[4], top[5], top[6] );
  879. +    pr0.s6 = pr2.s7 = F1( top[5], top[6] );
  880. +    pr1.s7 = F2( top[5], top[6], top[7] );
  881. +    pr0.s7 = F1( top[6], top[7] );
  882. +    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
  883. +
  884. +    // Lower half of pred[]
  885. +    pr2.s0 = F2( left[5], left[4], left[3] );
  886. +    pr3.s0 = F2( left[6], left[5], left[4] );
  887. +    pr0.s0 = pr2.s1 = F2( left[3], left[2], left[1] );
  888. +    pr1.s0 = pr3.s1 = F2( left[4], left[3], left[2] );
  889. +    pr0.s1 = pr2.s2 = F2( left[1], left[0], left_top );
  890. +    pr1.s1 = pr3.s2 = F2( left[2], left[1], left[0] );
  891. +    pr1.s2 = pr3.s3 = F2( left[0], left_top, top[0] );
  892. +    pr0.s2 = pr2.s3 = F1( left_top, top[0] );
  893. +    pr1.s3 = pr3.s4 = F2( left_top, top[0], top[1] );
  894. +    pr0.s3 = pr2.s4 = F1( top[0], top[1] );
  895. +    pr1.s4 = pr3.s5 = F2( top[0], top[1], top[2] );
  896. +    pr0.s4 = pr2.s5 = F1( top[1], top[2] );
  897. +    pr1.s5 = pr3.s6 = F2( top[1], top[2], top[3] );
  898. +    pr0.s5 = pr2.s6 = F1( top[2], top[3] );
  899. +    pr1.s6 = pr3.s7 = F2( top[2], top[3], top[4] );
  900. +    pr0.s6 = pr2.s7 = F1( top[3], top[4] );
  901. +    pr1.s7 = F2( top[3], top[4], top[5] );
  902. +    pr0.s7 = F1( top[4], top[5] );
  903. +    return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
  904. +#undef PRED
  905. +}
  906. +
/* SATD cost of Intra_8x8 horizontal-down prediction (vectorized path).
 * Row values walk down the left neighbor column two samples per row,
 * spilling into the top row through the left_top corner. */
int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
{
    int8 pr0, pr1, pr2, pr3;

    // Upper half of pred[]
    pr0.s0 = F1( left_top, left[0] ); pr0.s1 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
    pr0.s2 = F2( top[1], top[0], left_top ); pr0.s3 = F2( top[2], top[1], top[0] );
    pr0.s4 = F2( top[3], top[2], top[1] ); pr0.s5 = F2( top[4], top[3], top[2] );
    pr0.s6 = F2( top[5], top[4], top[3] ); pr0.s7 = F2( top[6], top[5], top[4] );

    pr1.s0 = F1( left[0], left[1] ); pr1.s1 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
    pr1.s2 = F1( left_top, left[0] ); pr1.s3 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
    pr1.s4 = F2( top[1], top[0], left_top ); pr1.s5 = F2( top[2], top[1], top[0] );
    pr1.s6 = F2( top[3], top[2], top[1] ); pr1.s7 = F2( top[4], top[3], top[2] );

    pr2.s0 = F1( left[1], left[2] ); pr2.s1 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
    pr2.s2 = F1( left[0], left[1] ); pr2.s3 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
    pr2.s4 = F1( left_top, left[0] ); pr2.s5 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
    pr2.s6 = F2( top[1], top[0], left_top ); pr2.s7 = F2( top[2], top[1], top[0] );

    pr3.s0 = F1( left[2], left[3] ); pr3.s1 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
    pr3.s2 = F1( left[1], left[2] ); pr3.s3 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
    pr3.s4 = F1( left[0], left[1] ); pr3.s5 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
    pr3.s6 = F1( left_top, left[0] ); pr3.s7 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );

    // Lower half of pred[]: continues down the left column, rows 4..7.
    pr0.s0 = F1( left[3], left[4] ); pr0.s1 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
    pr0.s2 = F1( left[2], left[3] ); pr0.s3 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
    pr0.s4 = F1( left[1], left[2] ); pr0.s5 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
    pr0.s6 = F1( left[0], left[1] ); pr0.s7 = (left_top + 2 * left[0] + left[1] + 2) >> 2;

    pr1.s0 = F1( left[4], left[5] ); pr1.s1 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
    pr1.s2 = F1( left[3], left[4] ); pr1.s3 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
    pr1.s4 = F1( left[2], left[3] ); pr1.s5 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
    pr1.s6 = F1( left[1], left[2] ); pr1.s7 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;

    pr2.s0 = F1( left[5], left[6] ); pr2.s1 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
    pr2.s2 = F1( left[4], left[5] ); pr2.s3 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
    pr2.s4 = F1( left[3], left[4] ); pr2.s5 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
    pr2.s6 = F1( left[2], left[3] ); pr2.s7 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;

    pr3.s0 = F1( left[6], left[7] ); pr3.s1 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
    pr3.s2 = F1( left[5], left[6] ); pr3.s3 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
    pr3.s4 = F1( left[4], left[5] ); pr3.s5 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
    pr3.s6 = F1( left[3], left[4] ); pr3.s7 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
    return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
}
  955. +
/* SATD cost of Intra_8x8 vertical-left prediction (vectorized path).
 * Even rows (pr0/pr2) use the 2-tap F1 filter over top[], odd rows
 * (pr1/pr3) the 3-tap F2; the window shifts right one pixel every two rows. */
int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top )
{
    int8 pr0, pr1, pr2, pr3;

    // Upper half of pred[]
    pr0.s0 = F1( top[0], top[1] );
    pr1.s0 = F2( top[0], top[1], top[2] );
    pr2.s0 = pr0.s1 = F1( top[1], top[2] );
    pr3.s0 = pr1.s1 = F2( top[1], top[2], top[3] );
    pr2.s1 = pr0.s2 = F1( top[2], top[3] );
    pr3.s1 = pr1.s2 = F2( top[2], top[3], top[4] );
    pr2.s2 = pr0.s3 = F1( top[3], top[4] );
    pr3.s2 = pr1.s3 = F2( top[3], top[4], top[5] );
    pr2.s3 = pr0.s4 = F1( top[4], top[5] );
    pr3.s3 = pr1.s4 = F2( top[4], top[5], top[6] );
    pr2.s4 = pr0.s5 = F1( top[5], top[6] );
    pr3.s4 = pr1.s5 = F2( top[5], top[6], top[7] );
    pr2.s5 = pr0.s6 = F1( top[6], top[7] );
    pr3.s5 = pr1.s6 = F2( top[6], top[7], top[8] );
    pr2.s6 = pr0.s7 = F1( top[7], top[8] );
    pr3.s6 = pr1.s7 = F2( top[7], top[8], top[9] );
    pr2.s7 = F1( top[8], top[9] );
    pr3.s7 = F2( top[8], top[9], top[10] );
    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );

    // Lower half of pred[]: same pattern, window advanced two pixels.
    pr0.s0 = F1( top[2], top[3] );
    pr1.s0 = F2( top[2], top[3], top[4] );
    pr2.s0 = pr0.s1 = F1( top[3], top[4] );
    pr3.s0 = pr1.s1 = F2( top[3], top[4], top[5] );
    pr2.s1 = pr0.s2 = F1( top[4], top[5] );
    pr3.s1 = pr1.s2 = F2( top[4], top[5], top[6] );
    pr2.s2 = pr0.s3 = F1( top[5], top[6] );
    pr3.s2 = pr1.s3 = F2( top[5], top[6], top[7] );
    pr2.s3 = pr0.s4 = F1( top[6], top[7] );
    pr3.s3 = pr1.s4 = F2( top[6], top[7], top[8] );
    pr2.s4 = pr0.s5 = F1( top[7], top[8] );
    pr3.s4 = pr1.s5 = F2( top[7], top[8], top[9] );
    pr2.s5 = pr0.s6 = F1( top[8], top[9] );
    pr3.s5 = pr1.s6 = F2( top[8], top[9], top[10] );
    pr2.s6 = pr0.s7 = F1( top[9], top[10] );
    pr3.s6 = pr1.s7 = F2( top[9], top[10], top[11] );
    pr2.s7 = F1( top[10], top[11] );
    pr3.s7 = F2( top[10], top[11], top[12] );
    return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
}
  1002. +
/* SATD cost of Intra_8x8 horizontal-up prediction (vectorized path).
 * Interpolates down the left neighbor column; once the column is exhausted
 * the remaining samples are a flat run of left[7] (bottom-most neighbor). */
int x264_predict_8x8_hu( const local pixel *src, int src_stride, const local pixel *left )
{
    int8 pr0, pr1, pr2, pr3;

    // Upper half of pred[]
    pr0.s0 = F1( left[0], left[1] ); pr0.s1 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
    pr0.s2 = F1( left[1], left[2] ); pr0.s3 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
    pr0.s4 = F1( left[2], left[3] ); pr0.s5 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
    pr0.s6 = F1( left[3], left[4] ); pr0.s7 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;

    pr1.s0 = F1( left[1], left[2] ); pr1.s1 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
    pr1.s2 = F1( left[2], left[3] ); pr1.s3 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
    pr1.s4 = F1( left[3], left[4] ); pr1.s5 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
    pr1.s6 = F1( left[4], left[5] ); pr1.s7 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;

    pr2.s0 = F1( left[2], left[3] ); pr2.s1 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
    pr2.s2 = F1( left[3], left[4] ); pr2.s3 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
    pr2.s4 = F1( left[4], left[5] ); pr2.s5 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
    pr2.s6 = F1( left[5], left[6] ); pr2.s7 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;

    pr3.s0 = F1( left[3], left[4] ); pr3.s1 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
    pr3.s2 = F1( left[4], left[5] ); pr3.s3 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
    pr3.s4 = F1( left[5], left[6] ); pr3.s5 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
    // left[7] is doubled here: the column is padded by replicating its last value.
    pr3.s6 = F1( left[6], left[7] ); pr3.s7 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );

    // Lower half of pred[]: tail of the column, then flat left[7].
    pr0.s0 = F1( left[4], left[5] ); pr0.s1 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
    pr0.s2 = F1( left[5], left[6] ); pr0.s3 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
    pr0.s4 = F1( left[6], left[7] ); pr0.s5 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
    pr0.s6 = left[7]; pr0.s7 = left[7];

    pr1.s0 = F1( left[5], left[6] ); pr1.s1 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
    pr1.s2 = F1( left[6], left[7] ); pr1.s3 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
    pr1.s4 = left[7]; pr1.s5 = left[7];
    pr1.s6 = left[7]; pr1.s7 = left[7];

    pr2.s0 = F1( left[6], left[7] ); pr2.s1 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
    pr2.s2 = left[7]; pr2.s3 = left[7];
    pr2.s4 = left[7]; pr2.s5 = left[7];
    pr2.s6 = left[7]; pr2.s7 = left[7];

    // Last row is entirely the replicated bottom-left neighbor.
    pr3 = (int8)left[7];

    return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
}
  1049. +
  1050. +int x264_predict_8x8c_h( const local pixel *src, int src_stride )
  1051. +{
  1052. +    const local pixel *src_l = src;
  1053. +    int8 pr0, pr1, pr2, pr3;
  1054. +
  1055. +    // Upper half of pred[]
  1056. +    pr0 = (int8)src[-1]; src += src_stride;
  1057. +    pr1 = (int8)src[-1]; src += src_stride;
  1058. +    pr2 = (int8)src[-1]; src += src_stride;
  1059. +    pr3 = (int8)src[-1]; src += src_stride;
  1060. +    int satd = satd_8x4_intra_lr( src_l, src_stride, pr0, pr1, pr2, pr3 );
  1061. +
  1062. +    //Lower half of pred[]
  1063. +    pr0 = (int8)src[-1]; src += src_stride;
  1064. +    pr1 = (int8)src[-1]; src += src_stride;
  1065. +    pr2 = (int8)src[-1]; src += src_stride;
  1066. +    pr3 = (int8)src[-1];
  1067. +    return satd + satd_8x4_intra_lr( src_l + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
  1068. +}
  1069. +
  1070. +int x264_predict_8x8c_v( const local pixel *src, int src_stride )
  1071. +{
  1072. +    int8 pred = convert_int8( vload8( 0, &src[-src_stride] ));
  1073. +    return satd_8x4_intra_lr( src, src_stride, pred, pred, pred, pred ) +
  1074. +           satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pred, pred, pred, pred );
  1075. +}
  1076. +
/* SATD cost of 8x8 chroma plane prediction.
 * Fits a linear ramp pred(x,y) = (a + b*x + c*y) >> 5 from the border
 * pixels: H/V are weighted horizontal/vertical gradients, a anchors the
 * plane, b/c are the per-pixel steps.  i00 tracks the running value of
 * the first column and advances by c per row; pix walks the row by b. */
int x264_predict_8x8c_p( const local pixel *src, int src_stride )
{
    int H = 0, V = 0;
    for( int i = 0; i < 4; i++ )
    {
        H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]);
        V += (i + 1) * (src[-1 + (i + 4) * src_stride] - src[-1 + (2 - i) * src_stride]);
    }

    int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]);
    int b = (17 * H + 16) >> 5;
    int c = (17 * V + 16) >> 5;
    int i00 = a - 3 * b - 3 * c + 16;

    // Upper half of pred[]
    int pix = i00;
    int8 pr0, pr1, pr2, pr3;
    pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;

    pix = i00;
    pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;

    pix = i00;
    pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;

    pix = i00;
    pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );

    // Lower half of pred[]: i00 carries over from the fourth row above.
    pix = i00;
    pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;

    pix = i00;
    pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;

    pix = i00;
    pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;

    pix = i00;
    pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
    return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
}
  1176. +
  1177. +int x264_predict_8x8c_dc( const local pixel *src, int src_stride )
  1178. +{
  1179. +    int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
  1180. +    for( int i = 0; i < 4; i++ )
  1181. +    {
  1182. +        s0 += src[i - src_stride];
  1183. +        s1 += src[i + 4 - src_stride];
  1184. +        s2 += src[-1 + i * src_stride];
  1185. +        s3 += src[-1 + (i+4)*src_stride];
  1186. +    }
  1187. +
  1188. +    // Upper half of pred[]
  1189. +    int8 dc0;
  1190. +    dc0.lo = (int4)( (s0 + s2 + 4) >> 3 );
  1191. +    dc0.hi = (int4)( (s1 + 2) >> 2 );
  1192. +    int satd = satd_8x4_intra_lr( src, src_stride, dc0, dc0, dc0, dc0 );
  1193. +
  1194. +    // Lower half of pred[]
  1195. +    dc0.lo = (int4)( (s3 + 2) >> 2 );
  1196. +    dc0.hi = (int4)( (s1 + s3 + 4) >> 3 );
  1197. +    return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, dc0, dc0, dc0, dc0 );
  1198. +}
  1199. +
  1200. +#else  /* not vectorized: private is cheap registers are scarce */
  1201. +
/* Scalar fallback: SATD cost of Intra_8x8 diagonal-down-left prediction.
 * Materializes each 8x4 half of pred[] in a small private array and scores
 * it with satd_8x4_lp. */
int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top )
{
    private pixel pred[32];
    int x_plus_y;

    // Upper half of pred[]
    for( int y = 0; y < 4; y++ )
    {
        for( int x = 0; x < 8; x++ )
        {
            // Clamp keeps top[x_plus_y + 2] within the 16 available neighbors.
            x_plus_y = clamp_int( x + y, 0, 13 );
            pred[x + y*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2;
        }
    }
    int satd = satd_8x4_lp( src, src_stride, pred, 8 );
    //Lower half of pred[]
    for( int y = 4; y < 8; y++ )
    {
        for( int x = 0; x < 8; x++ )
        {
            x_plus_y = clamp_int( x + y, 0, 13 );
            pred[x + ( y - 4 )*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2;
        }
    }
    // Corner sample (x+y == 14) was clamped above; overwrite it with the
    // correct replicated-edge value.
    pred[31] = ( 2 + top[14] + 3*top[15] ) >> 2;
    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
    return satd;
}
  1230. +
/* Scalar fallback: SATD cost of Intra_8x8 diagonal-down-right prediction.
 * Each filtered value fills one down-right diagonal of the 8x4 half. */
int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
{
    private pixel pred[32];
#define PRED( x, y ) pred[(x) + (y)*8]
    // Upper half of pred[]
    PRED( 0, 3 ) = F2( left[1], left[2], left[3] );
    PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[0], left[1], left[2] );
    PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[1], left[0], left_top );
    PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] );
    PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( left_top, top[0], top[1] );
    PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] );
    PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] );
    PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] );
    PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( top[3], top[4], top[5] );
    PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[4], top[5], top[6] );
    PRED( 7, 0 ) = F2( top[5], top[6], top[7] );
    int satd = satd_8x4_lp( src, src_stride, pred, 8 );

    // Lower half of pred[]: same diagonals shifted down four rows.
    PRED( 0, 3 ) = F2( left[5], left[6], left[7] );
    PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[4], left[5], left[6] );
    PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[3], left[4], left[5] );
    PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[2], left[3], left[4] );
    PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( left[1], left[2], left[3] );
    PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( left[0], left[1], left[2] );
    PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( left[1], left[0], left_top );
    PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( left[0], left_top, top[0] );
    PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( left_top, top[0], top[1] );
    PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[0], top[1], top[2] );
    PRED( 7, 0 ) = F2( top[1], top[2], top[3] );
    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
    return satd;
#undef PRED
}
  1265. +
/* Scalar fallback: SATD cost of Intra_8x8 vertical-right prediction.
 * Even rows use the 2-tap F1 filter, odd rows the 3-tap F2; values repeat
 * two rows down, one column right. */
int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
{
    private pixel pred[32];
#define PRED( x, y ) pred[(x) + (y)*8]
    // Upper half of pred[]
    PRED( 0, 2 ) = F2( left[1], left[0], left_top );
    PRED( 0, 3 ) = F2( left[2], left[1], left[0] );
    PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[0], left_top, top[0] );
    PRED( 0, 0 ) = PRED( 1, 2 ) = F1( left_top, top[0] );
    PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left_top, top[0], top[1] );
    PRED( 1, 0 ) = PRED( 2, 2 ) = F1( top[0], top[1] );
    PRED( 2, 1 ) = PRED( 3, 3 ) = F2( top[0], top[1], top[2] );
    PRED( 2, 0 ) = PRED( 3, 2 ) = F1( top[1], top[2] );
    PRED( 3, 1 ) = PRED( 4, 3 ) = F2( top[1], top[2], top[3] );
    PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[2], top[3] );
    PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[2], top[3], top[4] );
    PRED( 4, 0 ) = PRED( 5, 2 ) = F1( top[3], top[4] );
    PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[3], top[4], top[5] );
    PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[4], top[5] );
    PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[4], top[5], top[6] );
    PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[5], top[6] );
    PRED( 7, 1 ) = F2( top[5], top[6], top[7] );
    PRED( 7, 0 ) = F1( top[6], top[7] );
    int satd = satd_8x4_lp( src, src_stride, pred, 8 );

    //Lower half of pred[]
    PRED( 0, 2 ) = F2( left[5], left[4], left[3] );
    PRED( 0, 3 ) = F2( left[6], left[5], left[4] );
    PRED( 0, 0 ) = PRED( 1, 2 ) = F2( left[3], left[2], left[1] );
    PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[4], left[3], left[2] );
    PRED( 1, 0 ) = PRED( 2, 2 ) = F2( left[1], left[0], left_top );
    PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left[2], left[1], left[0] );
    PRED( 2, 1 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] );
    PRED( 2, 0 ) = PRED( 3, 2 ) = F1( left_top, top[0] );
    PRED( 3, 1 ) = PRED( 4, 3 ) = F2( left_top, top[0], top[1] );
    PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[0], top[1] );
    PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] );
    PRED( 4, 0 ) = PRED( 5, 2 ) = F1( top[1], top[2] );
    PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] );
    PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[2], top[3] );
    PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] );
    PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[3], top[4] );
    PRED( 7, 1 ) = F2( top[3], top[4], top[5] );
    PRED( 7, 0 ) = F1( top[4], top[5] );
    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
    return satd;
#undef PRED
}
  1314. +
/* Scalar fallback: SATD cost of Intra_8x8 horizontal-down prediction.
 * Precomputes the 11 distinct filtered pixel pairs packed two-per-int
 * (pack8to16), then assembles each row with 4-byte vector stores; each
 * row reuses its predecessor shifted right by one pair. */
int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
{
    private pixel pred[32];
    int satd;
    int p1 =  pack8to16( (F1( left[6], left[7] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) );
    int p2 =  pack8to16( (F1( left[5], left[6] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) );
    int p3 =  pack8to16( (F1( left[4], left[5] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) );
    int p4 =  pack8to16( (F1( left[3], left[4] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) );
    int p5 =  pack8to16( (F1( left[2], left[3] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) );
    int p6 =  pack8to16( (F1( left[1], left[2] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) );
    int p7 =  pack8to16( (F1( left[0], left[1] )), ((left_top + 2 * left[0] + left[1] + 2) >> 2) );
    int p8 =  pack8to16( (F1( left_top, left[0] )), ((left[0] + 2 * left_top + top[0] + 2) >> 2) );
    int p9 =  pack8to16( (F2( top[1], top[0], left_top )), (F2( top[2], top[1], top[0] )) );
    int p10 =  pack8to16( (F2( top[3], top[2], top[1] )), (F2( top[4], top[3], top[2] )) );
    int p11 =  pack8to16( (F2( top[5], top[4], top[3] )), (F2( top[6], top[5], top[4] )) );
    // Upper half of pred[]
    vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[0 + 0 * 8] );
    vstore4( as_uchar4( pack16to32( p10, p11 ) ), 0, &pred[4 + 0 * 8] );
    vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[0 + 1 * 8] );
    vstore4( as_uchar4( pack16to32( p9, p10 ) ), 0, &pred[4 + 1 * 8] );
    vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[0 + 2 * 8] );
    vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[4 + 2 * 8] );
    vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[0 + 3 * 8] );
    vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[4 + 3 * 8] );
    satd = satd_8x4_lp( src, src_stride, pred, 8 );
    // Lower half of pred[]
    vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[0 + 0 * 8] );
    vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[4 + 0 * 8] );
    vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[0 + 1 * 8] );
    vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[4 + 1 * 8] );
    vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[0 + 2 * 8] );
    vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[4 + 2 * 8] );
    vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[0 + 3 * 8] );
    vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[4 + 3 * 8] );
    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
    return satd;
}
  1352. +
/* Intra 8x8 vertical-left (VL) prediction + SATD cost.
 *
 * Fills pred[] directly from the top-neighbor array using the F1 (2-tap
 * average) and F2 (3-tap smooth) filters; each predicted pixel is shared
 * between two PRED() positions along the VL diagonal.  pred[] holds 4 rows,
 * so the block is scored as two satd_8x4_lp halves.
 *
 * src        - source pixels in the local cache
 * src_stride - stride of that cache
 * top        - filtered top-neighbor pixels (indices up to top[12] are read)
 * returns the 8x8 SATD for this prediction mode
 */
int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top )
{
    private pixel pred[32];
    int satd;
#define PRED( x, y ) pred[(x) + (y)*8]
    // Upper half of pred[] (rows 0-3)
    PRED( 0, 0 ) = F1( top[0], top[1] );
    PRED( 0, 1 ) = F2( top[0], top[1], top[2] );
    PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[1], top[2] );
    PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[1], top[2], top[3] );
    PRED( 1, 2 ) = PRED( 2, 0 ) = F1( top[2], top[3] );
    PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[2], top[3], top[4] );
    PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[3], top[4] );
    PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[3], top[4], top[5] );
    PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[4], top[5] );
    PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[4], top[5], top[6] );
    PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[5], top[6] );
    PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[5], top[6], top[7] );
    PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[6], top[7] );
    PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[6], top[7], top[8] );
    PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[7], top[8] );
    PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[7], top[8], top[9] );
    PRED( 7, 2 ) = F1( top[8], top[9] );
    PRED( 7, 3 ) = F2( top[8], top[9], top[10] );
    satd = satd_8x4_lp( src, src_stride, pred, 8 );
    // Lower half of pred[] (rows 4-7): same pattern shifted two top-pixels
    // further along the diagonal
    PRED( 0, 0 ) = F1( top[2], top[3] );
    PRED( 0, 1 ) = F2( top[2], top[3], top[4] );
    PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[3], top[4] );
    PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[3], top[4], top[5] );
    PRED( 1, 2 ) = PRED( 2, 0 ) = F1( top[4], top[5] );
    PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[4], top[5], top[6] );
    PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[5], top[6] );
    PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[5], top[6], top[7] );
    PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[6], top[7] );
    PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[6], top[7], top[8] );
    PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[7], top[8] );
    PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[7], top[8], top[9] );
    PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[8], top[9] );
    PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[8], top[9], top[10] );
    PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[9], top[10] );
    PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[9], top[10], top[11] );
    PRED( 7, 2 ) = F1( top[10], top[11] );
    PRED( 7, 3 ) = F2( top[10], top[11], top[12] );
    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
    return satd;
#undef PRED
}
  1401. +
/* Intra 8x8 horizontal-up (HU) prediction + SATD cost.
 *
 * p1..p7 each pack a 2-tap (F1) and a 3-tap smoothed pixel from successive
 * left-edge positions; p8 replicates left[7] (the HU mode saturates to the
 * bottom-left pixel).  pack16to32 + vstore4 writes four predicted pixels per
 * call; pred[] holds 4 rows, so SATD is accumulated in two 8x4 passes.
 *
 * src        - source pixels in the local cache
 * src_stride - stride of that cache
 * left       - filtered left-neighbor pixels
 * returns the 8x8 SATD for this prediction mode
 */
int x264_predict_8x8_hu( const local pixel *src, int src_stride, const local pixel *left )
{
    private pixel pred[32];
    int satd;
    int p1 = pack8to16( (F1( left[0], left[1] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) );
    int p2 = pack8to16( (F1( left[1], left[2] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) );
    int p3 = pack8to16( (F1( left[2], left[3] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) );
    int p4 = pack8to16( (F1( left[3], left[4] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) );
    int p5 = pack8to16( (F1( left[4], left[5] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) );
    int p6 = pack8to16( (F1( left[5], left[6] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) );
    int p7 = pack8to16( (F1( left[6], left[7] )), ((left[6] + 2 * left[7] + left[7] + 2) >> 2) );
    int p8 = pack8to16( left[7], left[7] );
    // Upper half of pred[] (rows 0-3): each row starts one pair lower
    vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[( 0 ) + ( 0 ) * 8] );
    vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] );
    vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] );
    vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] );
    vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] );
    vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] );
    vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] );
    vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] );
    satd = satd_8x4_lp( src, src_stride, pred, 8 );
    // Lower half of pred[] (rows 4-7): tail rows flatten out to p8
    vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 0 ) + ( 0 ) * 8] );
    vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] );
    vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] );
    vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] );
    vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] );
    vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] );
    vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] );
    vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] );
    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
    return satd;
}
  1436. +
  1437. +int x264_predict_8x8c_h( const local pixel *src, int src_stride )
  1438. +{
  1439. +    private pixel pred[32];
  1440. +    const local pixel *src_l = src;
  1441. +
  1442. +    // Upper half of pred[]
  1443. +    vstore8( (uchar8)(src[-1]), 0, pred ); src += src_stride;
  1444. +    vstore8( (uchar8)(src[-1]), 1, pred ); src += src_stride;
  1445. +    vstore8( (uchar8)(src[-1]), 2, pred ); src += src_stride;
  1446. +    vstore8( (uchar8)(src[-1]), 3, pred ); src += src_stride;
  1447. +    int satd = satd_8x4_lp( src_l, src_stride, pred, 8 );
  1448. +
  1449. +    // Lower half of pred[]
  1450. +    vstore8( (uchar8)(src[-1]), 0, pred ); src += src_stride;
  1451. +    vstore8( (uchar8)(src[-1]), 1, pred ); src += src_stride;
  1452. +    vstore8( (uchar8)(src[-1]), 2, pred ); src += src_stride;
  1453. +    vstore8( (uchar8)(src[-1]), 3, pred );
  1454. +    return satd + satd_8x4_lp( src_l + ( src_stride << 2 ), src_stride, pred, 8 );
  1455. +}
  1456. +
  1457. +int x264_predict_8x8c_v( const local pixel *src, int src_stride )
  1458. +{
  1459. +    private pixel pred[32];
  1460. +    uchar16 v16;
  1461. +    v16.lo = vload8( 0, &src[-src_stride] );
  1462. +    v16.hi = vload8( 0, &src[-src_stride] );
  1463. +
  1464. +    vstore16( v16, 0, pred );
  1465. +    vstore16( v16, 1, pred );
  1466. +
  1467. +    return satd_8x4_lp( src, src_stride, pred, 8 ) +
  1468. +           satd_8x4_lp( src + (src_stride << 2), src_stride, pred, 8 );
  1469. +}
  1470. +
  1471. +int x264_predict_8x8c_p( const local pixel *src, int src_stride )
  1472. +{
  1473. +    int H = 0, V = 0;
  1474. +    private pixel pred[32];
  1475. +    int satd;
  1476. +
  1477. +    for( int i = 0; i < 4; i++ )
  1478. +    {
  1479. +        H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]);
  1480. +        V += (i + 1) * (src[-1 + (i + 4) * src_stride] - src[-1 + (2 - i) * src_stride]);
  1481. +    }
  1482. +
  1483. +    int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]);
  1484. +    int b = (17 * H + 16) >> 5;
  1485. +    int c = (17 * V + 16) >> 5;
  1486. +    int i00 = a - 3 * b - 3 * c + 16;
  1487. +
  1488. +    // Upper half of pred[]
  1489. +    for( int y = 0; y < 4; y++ )
  1490. +    {
  1491. +        int pix = i00;
  1492. +        for( int x = 0; x < 8; x++ )
  1493. +        {
  1494. +            pred[x + y*8] = x264_clip_pixel( pix >> 5 );
  1495. +            pix += b;
  1496. +        }
  1497. +        i00 += c;
  1498. +    }
  1499. +    satd = satd_8x4_lp( src, src_stride, pred, 8 );
  1500. +    // Lower half of pred[]
  1501. +    for( int y = 0; y < 4; y++ )
  1502. +    {
  1503. +        int pix = i00;
  1504. +        for( int x = 0; x < 8; x++ )
  1505. +        {
  1506. +            pred[x + y*8] = x264_clip_pixel( pix >> 5 );
  1507. +            pix += b;
  1508. +        }
  1509. +        i00 += c;
  1510. +    }
  1511. +    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
  1512. +    return satd;
  1513. +}
  1514. +
/* Intra chroma 8x8 DC prediction + SATD cost.
 *
 * s0/s1 sum the 4 top neighbors of the left/right 4x4 quadrants; s2/s3 sum
 * the 4 left neighbors of the upper/lower quadrants.  Each 4x4 quadrant
 * gets its own DC value:
 *   top-left:  (s0+s2+4)>>3   top-right:    (s1+2)>>2
 *   bot-left:  (s3+2)>>2      bot-right:    (s1+s3+4)>>3
 * pred[] holds 4 rows, so the block is scored in two 8x4 SATD passes.
 */
int x264_predict_8x8c_dc( const local pixel *src, int src_stride )
{
    private pixel pred[32];
    int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
    for( int i = 0; i < 4; i++ )
    {
        s0 += src[i - src_stride];
        s1 += src[i + 4 - src_stride];
        s2 += src[-1 + i * src_stride];
        s3 += src[-1 + (i+4)*src_stride];
    }

    // Upper half of pred[]: lo = top-left quadrant DC, hi = top-right
    uchar8 dc0;
    dc0.lo = (uchar4)( (s0 + s2 + 4) >> 3 );
    dc0.hi = (uchar4)( (s1 + 2) >> 2 );
    vstore8( dc0, 0, pred );
    vstore8( dc0, 1, pred );
    vstore8( dc0, 2, pred );
    vstore8( dc0, 3, pred );
    int satd = satd_8x4_lp( src, src_stride, pred, 8 );

    // Lower half of pred[]: lo = bottom-left quadrant DC, hi = bottom-right
    dc0.lo = (uchar4)( (s3 + 2) >> 2 );
    dc0.hi = (uchar4)( (s1 + s3 + 4) >> 3 );
    vstore8( dc0, 0, pred );
    vstore8( dc0, 1, pred );
    vstore8( dc0, 2, pred );
    vstore8( dc0, 3, pred );
    return satd + satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
}
  1546. +#endif
  1547. +
/* Per-macroblock intra SATD cost estimation (8x8 lowres MBs).
 *
 * Each work-group covers a horizontal strip of up to 32 macroblocks
 * (inferred from the 32-wide local arrays and the lx/ly usage — confirm
 * against the host launch dimensions).  lx selects the MB within the group,
 * ly selects the prediction mode.  The group cooperatively:
 *   1. loads a 9-row pixel strip (plus one left border column) into cache[],
 *   2. builds filtered top/left/corner neighbor arrays,
 *   3. has each (lx, ly) thread score one intra prediction mode,
 *      reducing per-MB minima into cost_buf[] via atom_min.
 *
 * fenc            - encoded frame as an image
 * fenc_intra_cost - output: per-MB minimum intra cost (+5*lambda mode bits)
 * frame_stats     - output: first two entries zeroed here for later kernels
 * lambda          - rate cost multiplier
 * mb_width        - frame width in MBs (grid may be padded wider)
 * slow            - nonzero: also try the two least-likely 8x8 modes
 */
kernel void mb_intra_cost_satd_8x8(
    read_only image2d_t  fenc,
    global uint16_t     *fenc_intra_cost,
    global int          *frame_stats,
    int                  lambda,
    int                  mb_width,
    int                  slow )
{
/* cache[] geometry: 9 rows of CACHE_STRIDE pixels; column 0 holds the left
 * border, BLOCK_OFFSET ( = CACHE_STRIDE + 1 ) is pixel (0,0) of MB 0. */
#define CACHE_STRIDE (265)
#define BLOCK_OFFSET (266)
    local pixel cache[2385];
    local int cost_buf[32];
    local pixel top[32 * 16];
    local pixel left[32 * 8];
    local pixel left_top[32];

    int lx = (int)get_local_id( 0 );
    int ly = (int)get_local_id( 1 );
    int gx = (int)get_global_id( 0 );
    int gy = (int)get_global_id( 1 );
    int gidx = (int)get_group_id( 0 );
    int gidy = (int)get_group_id( 1 );
    int linear_id = (int)(ly * get_local_size( 0 ) + lx);
    int satd = COST_MAX;
    int basex = (gidx << 8);       // 256 pixels (32 MBs) per group horizontally
    int basey = (gidy << 3) - 1;   // -1: strip starts one row above the MBs

    // Cooperative load of the pixel strip, 4 pixels per image read
    for( int y = 0; y < 9; y++ ) // TODO: consider unrolling
    {
        for( int x = 4*linear_id; x < (33 * 8); x += 1024 )
        {
            uint4 data = read_imageui( fenc, sampler, (int2)(x + basex, y + basey) );
            cache[y * CACHE_STRIDE + 1 + x] = data.s0;
            cache[y * CACHE_STRIDE + 1 + x + 1] = data.s1;
            cache[y * CACHE_STRIDE + 1 + x + 2] = data.s2;
            cache[y * CACHE_STRIDE + 1 + x + 3] = data.s3;
        }
    }
    // First 9 threads fetch the left border column (x = basex - 1)
    if( linear_id < 9 )
        cache[linear_id * CACHE_STRIDE] = read_imageui( fenc, sampler, (int2)( basex - 1, linear_id + basey) ).s0;

    barrier( CLK_LOCAL_MEM_FENCE );

    // Cooperatively build the top edge for the macroblock using smoothing function
    int j = ly;
    top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j - 1, -1, 15 )] +
                       2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j, 0, 15 )] +
                       cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j + 1, 0, 15 )] + 2 ) >> 2;
    j += 8; // each thread fills two of the 16 top-edge entries
    top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j - 1, -1, 15 )] +
                       2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j, 0, 15 )] +
                       cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j + 1, 0, 15 )] + 2 ) >> 2;
    // Cooperatively build the left edge for the macroblock using smoothing function
    left[lx*8 + ly] = ( cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*(ly - 1)] +
                        2*cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*ly] +
                        cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*clamp_int((ly + 1), 0, 7 )] + 2 ) >> 2;
    // One left_top per macroblock; also reset its running cost minimum
    if( 0 == ly )
    {
        left_top[lx] = ( cache[BLOCK_OFFSET + 8*lx - 1] + 2*cache[BLOCK_OFFSET + 8*lx - 1 - CACHE_STRIDE] +
                         cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE] + 2 ) >> 2;
        cost_buf[lx] = COST_MAX;
    }
    barrier( CLK_LOCAL_MEM_FENCE );

    // each warp/wavefront generates a different prediction type; no divergence
    switch( ly )
    {
        case 0:
            satd = x264_predict_8x8c_h( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
            break;
        case 1:
            satd = x264_predict_8x8c_v( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
            break;
        case 2:
            satd = x264_predict_8x8c_dc( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
            break;
        case 3:
            satd = x264_predict_8x8c_p( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
            break;
        case 4:
            satd = x264_predict_8x8_ddr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
            break;
        case 5:
            satd = x264_predict_8x8_vr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
            break;
        case 6:
            satd = x264_predict_8x8_hd( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
            break;
        case 7:
            satd = x264_predict_8x8_hu( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &left[8*lx] );
            break;
        default:
            break;
    }
    // Reduce the per-mode costs for this MB to a single minimum
    atom_min( &cost_buf[lx], satd );
    if( slow )
    {
        // Do the remaining two (least likely) prediction modes
        switch( ly )
        {
            case 0: // DDL
                satd = x264_predict_8x8_ddl( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] );
                atom_min( &cost_buf[lx], satd );
                break;
            case 1: // VL
                satd = x264_predict_8x8_vl( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] );
                atom_min( &cost_buf[lx], satd );
                break;
            default:
                break;
        }
    }
    barrier( CLK_LOCAL_MEM_FENCE );

    // One thread per (in-frame) MB writes the final cost; the +5*lambda
    // term accounts for intra mode signaling bits
    if( (0 == ly) && (gx < mb_width) )
    {
        fenc_intra_cost[gidy * mb_width + gx] = cost_buf[lx]+ 5*lambda;
    }

    // Auxiliary work for one of the following kernels: zero the two
    // frame_stats accumulators that sum_intra_cost atomically adds into
    if( gx < 2 && gy == 0 )
        frame_stats[gx] = 0;
#undef CACHE_STRIDE
#undef BLOCK_OFFSET
}
  1674. +
  1675. +
  1676. +
  1677. +/*
  1678. + * parallel sum intra costs
  1679. + *
  1680. + * global launch dimensions: [256, mb_height]
  1681. + */
  1682. +kernel void sum_intra_cost(
  1683. +    const global uint16_t *fenc_intra_cost,
  1684. +    const global uint16_t *inv_qscale_factor,
  1685. +    global int           *fenc_row_satds,
  1686. +    global int           *frame_stats,
  1687. +    int                   mb_width )
  1688. +{
  1689. +    int y = get_global_id( 1 );
  1690. +    int mb_height = get_global_size( 1 );
  1691. +
  1692. +    int row_satds = 0;
  1693. +    int cost_est = 0;
  1694. +    int cost_est_aq = 0;
  1695. +
  1696. +    for( int x = get_global_id( 0 ); x < mb_width; x += get_global_size( 0 ))
  1697. +    {
  1698. +        int mb_xy = x + y * mb_width;
  1699. +        int cost = fenc_intra_cost[mb_xy];
  1700. +        int cost_aq = (cost * inv_qscale_factor[mb_xy] + 128) >> 8;
  1701. +        int b_frame_score_mb = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2;
  1702. +
  1703. +        row_satds += cost_aq;
  1704. +        if( b_frame_score_mb )
  1705. +        {
  1706. +            cost_est += cost;
  1707. +            cost_est_aq += cost_aq;
  1708. +        }
  1709. +    }
  1710. +
  1711. +    local int buffer[256];
  1712. +    int x = get_global_id( 0 );
  1713. +
  1714. +    row_satds   = parallel_sum( row_satds, x, buffer );
  1715. +    cost_est    = parallel_sum( cost_est, x, buffer );
  1716. +    cost_est_aq = parallel_sum( cost_est_aq, x, buffer );
  1717. +
  1718. +    if( get_global_id( 0 ) == 0 )
  1719. +    {
  1720. +        fenc_row_satds[y] = row_satds;
  1721. +        atomic_add( frame_stats + 0, cost_est );
  1722. +        atomic_add( frame_stats + 1, cost_est_aq );
  1723. +    }
  1724. +}
  1725. diff -r 087c10a1fec7 -r 9a682cb0c74f common/opencl/motionsearch.cl
  1726. --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
  1727. +++ b/common/opencl/motionsearch.cl     Tue May 15 18:05:00 2012 -0500
  1728. @@ -0,0 +1,276 @@
  1729. +
  1730. +
  1731. +
  1732. +int find_downscale_mb_xy( int x, int y, int mb_width, int mb_height )
  1733. +{
  1734. +    /* edge macroblocks might not have a direct descendant, use nearest */
  1735. +    x = (x == mb_width-1)  ? (x - (mb_width&1)) >> 1 : x >> 1;
  1736. +    y = (y == mb_height-1) ? (y - (mb_height&1)) >> 1 : y >> 1;
  1737. +    return (mb_width>>1) * y + x;
  1738. +}
  1739. +
  1740. +/* Four threads calculate an 8x8 SAD.  Each does two rows */
  1741. +int sad_8x8_ii_coop4( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos, int idx, local int16_t *costs )
  1742. +{
  1743. +    frefpos.y += idx << 1;
  1744. +    fencpos.y += idx << 1;
  1745. +    int cost = 0;
  1746. +    if( frefpos.x < 0 )
  1747. +    {
  1748. +        /* slow path when MV goes past right edge */
  1749. +        for( int y = 0; y < 2; y++ )
  1750. +        {
  1751. +            for( int x = 0; x < 8; x++ )
  1752. +            {
  1753. +                pixel enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0;
  1754. +                pixel ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0;
  1755. +                cost += abs_diff( enc, ref );
  1756. +            }
  1757. +        }
  1758. +    }
  1759. +    else
  1760. +    {
  1761. +        uint4 enc, ref, costs = 0;
  1762. +        enc = read_imageui( fenc, sampler, fencpos );
  1763. +        ref = read_imageui( fref, sampler, frefpos );
  1764. +        costs += abs_diff( enc, ref );
  1765. +        enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 0) );
  1766. +        ref = read_imageui( fref, sampler, frefpos + (int2)(4, 0) );
  1767. +        costs += abs_diff( enc, ref );
  1768. +        enc = read_imageui( fenc, sampler, fencpos + (int2)(0, 1) );
  1769. +        ref = read_imageui( fref, sampler, frefpos + (int2)(0, 1) );
  1770. +        costs += abs_diff( enc, ref );
  1771. +        enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 1) );
  1772. +        ref = read_imageui( fref, sampler, frefpos + (int2)(4, 1) );
  1773. +        costs += abs_diff( enc, ref );
  1774. +        cost = costs.s0 + costs.s1 + costs.s2 + costs.s3;
  1775. +    }
  1776. +    costs[idx] = cost;
  1777. +    return costs[0] + costs[1] + costs[2] + costs[3];
  1778. +}
  1779. +
  1780. +/* One thread performs 8x8 SAD */
  1781. +int sad_8x8_ii( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos )
  1782. +{
  1783. +    if( frefpos.x < 0 )
  1784. +    {
  1785. +        /* slow path when MV goes past right edge */
  1786. +        int cost = 0;
  1787. +        for( int y = 0; y < 8; y++ )
  1788. +        {
  1789. +            for( int x = 0; x < 8; x++ )
  1790. +            {
  1791. +                uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0;
  1792. +                uint ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0;
  1793. +                cost += abs_diff( enc, ref );
  1794. +            }
  1795. +        }
  1796. +        return cost;
  1797. +    }
  1798. +    else
  1799. +    {
  1800. +        uint4 enc, ref, cost = 0;
  1801. +        for( int y = 0; y < 8; y++ )
  1802. +        {
  1803. +            for( int x = 0; x < 8; x += 4 )
  1804. +            {
  1805. +                enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) );
  1806. +                ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) );
  1807. +                cost += abs_diff( enc, ref );
  1808. +            }
  1809. +        }
  1810. +        return cost.s0 + cost.s1 + cost.s2 + cost.s3;
  1811. +    }
  1812. +}
  1813. +
  1814. +
  1815. +/*
  1816. + * hierarchical motion estimation
  1817. + *
  1818. + * Each kernel launch is a single iteration
  1819. + *
  1820. + * MB per work group is determined by lclx / 4 * lcly
  1821. + *
  1822. + * global launch dimensions:  [mb_width * 4, mb_height]
  1823. + */
/* One iteration of hierarchical (pyramid) motion estimation.
 *
 * Four consecutive threads (mb_i = 0..3) cooperate on one macroblock:
 * they share cost_local/mvc_local slices and jointly run a diamond search
 * around the motion vector predictor (MVP).  The MVP is the median of
 * neighbor MV candidates from the previous iteration; when b_shift_index
 * is set, candidates come from the coarser (downscaled) pyramid level.
 *
 * NOTE(review): the cost_local exchange inside the diamond loop has no
 * barrier — presumably the 4 cooperating threads share a SIMD group;
 * confirm against host-side work-group configuration.
 *
 * fenc/fref             - encode/reference frames as images
 * in_mvs/out_mvs        - MV fields from the previous / for this iteration
 * out_mv_costs          - per-MB best cost output (thread mb_i == 0 writes)
 * mvp_buffer            - per-MB MVP from the previous call; early-out when unchanged
 * cost_local/mvc_local  - local scratch, 4 entries per MB in the group
 * scale                 - pyramid level shift applied to candidate MVs
 * b_shift_index         - nonzero when reading MVs from the coarser level
 * b_first_iteration     - nonzero on the first pass (MVP forced to zero)
 * b_reverse_references  - alternates neighbor direction per iteration
 */
kernel void hierarchical_motion(
    read_only image2d_t  fenc,
    read_only image2d_t  fref,
    const global short2 *in_mvs,
    global short2       *out_mvs,
    global int16_t      *out_mv_costs,
    global short2       *mvp_buffer,
    local int16_t       *cost_local,
    local short2        *mvc_local,
    int                  mb_width,
    int                  lambda,
    int                  me_range,
    int                  scale,
    int                  b_shift_index,
    int                  b_first_iteration,
    int                  b_reverse_references )
{
    // 4 threads per MB: global id / 4 selects the MB, id & 3 the lane
    int mb_x = get_global_id( 0 ) >> 2;
    if( mb_x >= mb_width )
        return;
    int mb_height = get_global_size( 1 );
    int mb_i = get_global_id( 0 ) & 3;
    int mb_y = get_global_id( 1 );
    int mb_xy = mb_y * mb_width + mb_x;
    const int mb_size = 8;
    int2 coord;
    coord.x = mb_x * mb_size;
    coord.y = mb_y * mb_size;

    // Slice the shared local arrays: 4 entries per MB within the group
    const int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
    cost_local += 4 * mb_in_group;

    int i_mvc = 0;
    mvc_local += 4 * mb_in_group;
    mvc_local[mb_i] = 0;  // zero-init so median of < 3 candidates is defined
    short2 mvp;

    if( b_first_iteration )
    {
        mvp.x = 0;
        mvp.y = 0;
    }
    else
    {
/* Append one neighbor MV candidate (at MB offset DX,DY) to mvc_local,
 * rescaled to this pyramid level. */
#define MVC( DX, DY )\
    {\
        int px = mb_x + DX;\
        int py = mb_y + DY;\
        if( b_shift_index )\
            mvc_local[i_mvc] = in_mvs[find_downscale_mb_xy( px, py, mb_width, mb_height )];\
        else\
            mvc_local[i_mvc] = in_mvs[mb_width * py + px];\
        mvc_local[i_mvc].x >>= scale;\
        mvc_local[i_mvc].y >>= scale;\
        i_mvc++;\
    }
        /* Find MVP from median of MVCs */
        if( b_reverse_references )
        {
            /* odd iterations: derive MVP from down and right */
            if( mb_x < mb_width - 1 )
                MVC( 1, 0 );
            if( mb_y < mb_height - 1 )
            {
                MVC( 0, 1 );
                if( mb_x > b_shift_index )
                    MVC( -1, 1 );
                if( mb_x < mb_width - 1 )
                    MVC( 1, 1 );
            }
        }
        else
        {
            /* even iterations: derive MVP from up and left */
            if( mb_x > 0 )
                MVC( -1, 0 );
            if( mb_y > 0 )
            {
                MVC( 0, -1 );
                if( mb_x < mb_width - 1 )
                    MVC( 1, -1 );
                if( mb_x > b_shift_index )
                    MVC( -1, -1 );
            }
        }
        if( i_mvc <= 1 )
        {
            mvp = mvc_local[0];
        }
        else
            mvp = x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] );
#undef MVC
    }
    // New MVP == old MVP: the search would repeat last iteration's work,
    // so copy the input MV to the output MV and exit early.
    if( (!b_shift_index) && (mvp.x == mvp_buffer[mb_xy].x) && (mvp.y == mvp_buffer[mb_xy].y) )
    {
        out_mvs[mb_xy] = in_mvs[mb_xy];
        return;
    }
    mvp_buffer[mb_xy] = mvp;
    // MV search window: frame bounds plus 4 pixels of padding on each side
    short2 mv_min;
    short2 mv_max;
    mv_min.x = -mb_size * mb_x - 4;
    mv_max.x = mb_size * (mb_width - mb_x - 1) + 4;
    mv_min.y = -mb_size * mb_y - 4;
    mv_max.y = mb_size * (mb_height - mb_y - 1) + 4;

    short2 bestmv;
    bestmv.x = x264_clip3( mvp.x, mv_min.x, mv_max.x );
    bestmv.y = x264_clip3( mvp.y, mv_min.y, mv_max.y );

    int2 refcrd;
    refcrd.x = coord.x + bestmv.x;
    refcrd.y = coord.y + bestmv.y;
    /* measure cost at bestmv (4 threads cooperate on the SAD) */
    int bcost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +
                lambda * calc_mv_cost( abs_diff( bestmv.x, mvp.x ) << (2 + scale), abs_diff( bestmv.y, mvp.y ) << (2 + scale) );

    // Diamond search: each of the 4 threads probes one diamond offset per
    // step; the packed (cost<<2 | mb_i) trick picks the winner and its lane
    do
    {
        /* measure costs at offsets from bestmv */
        refcrd.x = coord.x + bestmv.x + dia_offs[mb_i].x;
        refcrd.y = coord.y + bestmv.y + dia_offs[mb_i].y;
        short2 trymv = bestmv + dia_offs[mb_i];
        int cost = sad_8x8_ii( fenc, coord, fref, refcrd ) +
                   lambda * calc_mv_cost( abs_diff( trymv.x, mvp.x ) << (2 + scale), abs_diff( trymv.y, mvp.y ) << (2 + scale) );

        cost_local[mb_i] = (cost<<2) | mb_i;
        cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) );

        if( (cost >> 2) >= bcost )
            break;  // no diamond point improved: search has converged

        bestmv += dia_offs[cost&3];
        bcost = cost>>2;

        // stop if the search reached the edge of the MV window
        if( bestmv.x >= mv_max.x || bestmv.x <= mv_min.x || bestmv.y >= mv_max.y || bestmv.y <= mv_min.y )
            break;
    }
    while( --me_range > 0 );

    short2 trymv;

/* Probe (X,Y) with MV-cost weight L, unless it is within 1 of the MVP
 * (already covered by the diamond search); keep it if it beats bcost. */
#define COST_MV_NO_PAD( X, Y, L )\
    trymv.x = x264_clip3( X, mv_min.x, mv_max.x );\
    trymv.y = x264_clip3( Y, mv_min.y, mv_max.y );\
    if( abs_diff( mvp.x, trymv.x ) > 1 || abs_diff( mvp.y, trymv.y ) > 1 ) {\
        int2 refcrd = coord; refcrd.x += trymv.x; refcrd.y += trymv.y;\
        int cost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +\
                   L * calc_mv_cost( abs_diff( trymv.x, mvp.x ) << (2 + scale), abs_diff( trymv.y, mvp.y ) << (2 + scale) );\
        if( cost < bcost ) { bcost = cost; bestmv = trymv; } }

    // Always check the zero MV (no rate penalty: L = 0)
    COST_MV_NO_PAD( 0, 0, 0 );

    if( !b_first_iteration )
    {
        /* try cost at previous iteration's MV, if MVP was too far away */
        short2 prevmv;
        if( b_shift_index )
            prevmv = in_mvs[find_downscale_mb_xy( mb_x, mb_y, mb_width, mb_height )];
        else
            prevmv = in_mvs[mb_xy];
        prevmv.x >>= scale;
        prevmv.y >>= scale;
        COST_MV_NO_PAD( prevmv.x, prevmv.y, lambda );
    }

    for( int i = 0; i < i_mvc; i++ )
    {
        /* try cost at each candidate MV, if MVP was too far away */
        COST_MV_NO_PAD( mvc_local[i].x, mvc_local[i].y, lambda );
    }

    // Lane 0 publishes the result, rescaled back to full-level units
    if( mb_i == 0 )
    {
        bestmv.x <<= scale;
        bestmv.y <<= scale;
        out_mvs[mb_xy] = bestmv;
        out_mv_costs[mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK );
    }
}
  2005. diff -r 087c10a1fec7 -r 9a682cb0c74f common/opencl/subpel.cl
  2006. --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
  2007. +++ b/common/opencl/subpel.cl   Tue May 15 18:05:00 2012 -0500
  2008. @@ -0,0 +1,283 @@
  2009. +
  2010. +/* One thread performs 8x8 SAD */
  2011. +int sad_8x8_ii_hpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos )
  2012. +{
  2013. +    int2 frefpos = (int2)(qpos.x >> 2, qpos.y >> 2);
  2014. +    int hpel_idx = ((qpos.x & 2) >> 1) + (qpos.y & 2);
  2015. +
  2016. +    uint mask_shift = 8 * (3 - hpel_idx);
  2017. +    uint mask = 0x000000ff << mask_shift;
  2018. +
  2019. +    uint4 cost4 = 0;
  2020. +
  2021. +    for( int y = 0; y < 8; y++ )
  2022. +    {
  2023. +        uint4 enc, val4;
  2024. +        enc = read_imageui( fenc, sampler, fencpos + (int2)(0, y));
  2025. +        val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(0, y)).s0 & mask) >> mask_shift;
  2026. +        val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(1, y)).s0 & mask) >> mask_shift;
  2027. +        val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(2, y)).s0 & mask) >> mask_shift;
  2028. +        val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(3, y)).s0 & mask) >> mask_shift;
  2029. +        cost4 += abs_diff( enc, val4 );
  2030. +
  2031. +        enc = read_imageui( fenc, sampler, fencpos + (int2)(4, y));
  2032. +        val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(4, y)).s0 & mask) >> mask_shift;
  2033. +        val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(5, y)).s0 & mask) >> mask_shift;
  2034. +        val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(6, y)).s0 & mask) >> mask_shift;
  2035. +        val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(7, y)).s0 & mask) >> mask_shift;
  2036. +        cost4 += abs_diff( enc, val4 );
  2037. +    }
  2038. +
  2039. +    return cost4.s0 + cost4.s1 + cost4.s2 + cost4.s3;
  2040. +}
  2041. +
  2042. +/* One thread measures 8x8 SAD cost at a QPEL offset into an HPEL plane */
  2043. +int sad_8x8_ii_qpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos )
  2044. +{
  2045. +    int2 frefApos = qpos >> 2;
  2046. +    int hpelA = ((qpos.x & 2) >> 1) + (qpos.y & 2);
  2047. +
  2048. +    int2 qposB = qpos + ((qpos & 1) << 1);
  2049. +    int2 frefBpos = qposB >> 2;
  2050. +    int hpelB = ((qposB.x & 2) >> 1) + (qposB.y & 2);
  2051. +
  2052. +    uint mask_shift0 = 8 * (3 - hpelA), mask_shift1 = 8 * (3 - hpelB);
  2053. +    uint mask0 = 0x000000ff << mask_shift0, mask1 = 0x000000ff << mask_shift1;
  2054. +
  2055. +    int cost = 0;
  2056. +
  2057. +    for( int y = 0; y < 8; y++ )
  2058. +    {
  2059. +        for( int x = 0; x < 8; x++ )
  2060. +        {
  2061. +            uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y)).s0;
  2062. +            uint vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(x, y)).s0 & mask0) >> mask_shift0;
  2063. +            uint vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(x, y)).s0 & mask1) >> mask_shift1;
  2064. +            uint ref = rhadd( vA, vB );
  2065. +            cost += abs_diff( enc, ref );
  2066. +        }
  2067. +    }
  2068. +
  2069. +    return cost;
  2070. +}
  2071. +
/* Four thread measures 8x8 SATD cost at a QPEL offset into an HPEL plane
 *
 * Each thread collects 1/4 of the rows of diffs and processes one quarter of
 * the transforms
 *
 * idx (0..3) is the thread's lane within its 4-thread cooperative group; tmpp
 * points to 16 local sum2_t entries shared by the group.  Two 16-bit residual
 * sums are packed per sum2_t so each 8-wide row fits in four values.
 *
 * NOTE(review): the two HADAMARD4 passes write and then read the shared tmp[]
 * array with no barrier between them — this assumes the four cooperating
 * threads execute in lockstep (same wavefront/warp); confirm for targets
 * where that does not hold. */
int satd_8x8_ii_qpel_coop4(
    read_only image2d_t fenc, int2 fencpos,
    read_only image2d_t fref_planes, int2 qpos,
    local sum2_t *tmpp,
    int idx )
{
    /* view the flat local buffer as a 4x4 transpose scratchpad */
    volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp;
    sum2_t b0, b1, b2, b3;

    // fencpos is full-pel position of original MB
    // qpos is qpel position within reference frame
    int2 frefApos = (int2)(qpos.x>>2, qpos.y>>2);
    int hpelA = ((qpos.x&2)>>1) + (qpos.y&2);

    /* second half-pel sample rounded toward the quarter-pel fraction */
    int2 qposB = (int2)qpos + (int2)(((qpos.x&1)<<1), ((qpos.y&1)<<1));
    int2 frefBpos = (int2)(qposB.x>>2, qposB.y>>2);
    int hpelB = ((qposB.x&2)>>1) + (qposB.y&2);

    /* byte-lane masks selecting the F/H/V/C plane within each packed texel */
    uint mask_shift0 = 8 * (3 - hpelA), mask_shift1 = 8 * (3 - hpelB);
    uint mask0 = 0x000000ff << mask_shift0, mask1 = 0x000000ff << mask_shift1;

    uint vA, vB;
    uint a0, a1;
    uint enc;
    sum2_t sum = 0;


/* residual at column X of this thread's row: encode pixel minus the rounded
 * average of the two half-pel reference samples */
#define READ_DIFF( OUT, X )\
    enc = read_imageui( fenc, sampler, fencpos + (int2)(X, idx) ).s0;\
    vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(X, idx) ).s0 & mask0) >> mask_shift0;\
    vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(X, idx) ).s0 & mask1) >> mask_shift1;\
    OUT = enc - rhadd( vA, vB );

/* pack two column residuals (a and b) into one sum2_t, 16 bits each */
#define READ_DIFF_EX( OUT, a, b )\
    {\
        READ_DIFF( a0, a );\
        READ_DIFF( a1, b );\
        OUT = a0 + (a1<<BITS_PER_SUM);\
    }
/* one 8x4 SATD: horizontal Hadamard into shared tmp, transposed vertical
 * Hadamard back out, absolute sums accumulated into sum */
#define ROW_8x4_SATD( a, b )\
    {\
        fencpos.y += a;\
        frefApos.y += b;\
        frefBpos.y += b;\
        READ_DIFF_EX( b0, 0, 4 );\
        READ_DIFF_EX( b1, 1, 5 );\
        READ_DIFF_EX( b2, 2, 6 );\
        READ_DIFF_EX( b3, 3, 7 );\
        HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\
        HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\
        sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 );\
    }
    /* top and bottom 8x4 halves of the 8x8 block */
    ROW_8x4_SATD( 0, 0 );
    ROW_8x4_SATD( 4, 4 );

#undef READ_DIFF
#undef READ_DIFF_EX
#undef ROW_8x4_SATD
    /* fold the two packed 16-bit partial sums together; >>1 matches the
     * CPU satd normalization */
    return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;
}
  2137. +
/* Half-pel diamond search pattern in QPEL units (+/-2): up, left, right,
 * down.  Indexed by the lane id (mb_i) of the 4-thread cooperative group. */
constant short2 hpoffs[4] =
{
    {0, -2}, {-2, 0}, {2, 0}, {0, 2}
};
  2142. +
/*
 * sub pixel refinement of motion vectors, output MVs and costs are moved from
 * temporary buffers into final per-frame buffer
 *
 * global launch dimensions:  [mb_width * 4, mb_height]
 *
 * With X being the source 16x16 pixels, F is the lowres pixel used by the
 * motion search.  We will now utilize the H V and C pixels (stored in separate
 * planes) to search at half-pel increments.
 *
 * X X X X X X
 *  F H F H F
 * X X X X X X
 *  V C V C V
 * X X X X X X
 *  F H F H F
 * X X X X X X
 *
 * The YX HPEL bits of the motion vector selects the plane we search in.  The
 * four planes are packed in the fref_planes 2D image buffer.  Each sample
 * returns:  s0 = F, s1 = H, s2 = V, s3 = C
 *
 * Four threads cooperate per macroblock (mb_i = 0..3); each evaluates one
 * diamond point (or one quarter of a SATD) and the group reduces through the
 * cost_local scratch array.
 *
 * NOTE(review): cost_local is int16_t and holds (cost<<2)+mb_i — large
 * SAD+lambda costs can exceed 32767 and wrap; confirm the cost range.
 * NOTE(review): cost_local/satd_local are written and read with no barrier;
 * assumes the 4 cooperating threads share a wavefront — verify per target.
 */
kernel void subpel_refine(
    read_only image2d_t   fenc,
    read_only image2d_t   fref_planes,
    const global short2  *in_mvs,
    const global int16_t *in_sad_mv_costs,
    local int16_t        *cost_local,
    local sum2_t         *satd_local,
    local short2         *mvc_local,
    global short2        *fenc_lowres_mv,
    global int16_t       *fenc_lowres_mv_costs,
    int                   mb_width,
    int                   lambda,
    int                   b,
    int                   ref,
    int                   b_islist1 )
{
    int mb_x = get_global_id( 0 ) >> 2;
    if( mb_x >= mb_width )
        return;
    int mb_height = get_global_size( 1 );

    /* lane within the 4-thread group for this macroblock */
    int mb_i = get_global_id( 0 ) & 3;
    int mb_y = get_global_id( 1 );
    int mb_xy = mb_y * mb_width + mb_x;

    /* offset the output pointers to this (b, ref) frame pair's plane */
    fenc_lowres_mv += ( b_islist1 ? ( ref - b - 1 ) : ( b - ref - 1 ) ) * mb_width * mb_height;
    fenc_lowres_mv_costs += ( b_islist1 ? ( ref - b - 1 ) : ( b - ref - 1 ) ) * mb_width * mb_height;

    /* carve out this macroblock's slice of the local scratch buffers */
    int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
    cost_local += mb_in_group * 4;
    satd_local += mb_in_group * 16;
    mvc_local += mb_in_group * 4;

    /* gather MV candidates from left / top / top-right / top-left neighbors
     * and take their median as the MV prediction */
    int i_mvc = 0;
    mvc_local[0] = 0;
    mvc_local[1] = 0;
    mvc_local[2] = 0;
    mvc_local[3] = 0;
    short2 mvp;
#define MVC( DX, DY ) mvc_local[i_mvc++] = in_mvs[mb_width * (mb_y + DY) + (mb_x + DX)];
    if( mb_x > 0 )
        MVC( -1, 0 );
    if( mb_y > 0 )
    {
        MVC( 0, -1 );
        if( mb_x < mb_width - 1 )
            MVC( 1, -1 );
        if( mb_x > 0 )
            MVC( -1, -1 );
    }
    /* with <= 1 candidate the median degenerates to the candidate (or zero);
     * unfilled entries were zeroed above, matching that behavior */
    if( i_mvc <= 1 )
        mvp = mvc_local[0];
    else
        mvp = x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] );
#undef MVC

    /* starting point: full-pel result of the previous motion search */
    int bcost =  in_sad_mv_costs[mb_xy];
    short2 bmv = in_mvs[mb_xy];

    /* Make mvp a QPEL MV */
    mvp.x <<= 2;
    mvp.y <<= 2;

    /* Make bmv a QPEL MV */
    bmv.x <<= 2;
    bmv.y <<= 2;

    /* full-pel pixel coordinate of this 8x8 lowres macroblock */
    int2 coord;
    coord.x = mb_x << 3;
    coord.y = mb_y << 3;
    short2 trymv;
    int2 qpos;
    int cost = 0;

/* each of the 4 threads probes one diamond point; costs (tagged with the
 * lane id in the low 2 bits) are min-reduced through cost_local */
#define HPEL_QPEL( ARR, FUNC )\
    {\
        trymv = bmv + ARR[mb_i];\
        qpos.x = (coord.x << 2) + trymv.x;\
        qpos.y = (coord.y << 2) + trymv.y;\
        cost = FUNC( fenc, coord, fref_planes, qpos ) + lambda * calc_mv_cost( abs_diff( trymv.x, mvp.x ), abs_diff( trymv.y, mvp.y ) );\
        cost_local[mb_i] = (cost<<2) + mb_i;\
        cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) );\
        if( (cost>>2) < bcost )\
        {\
            bmv += ARR[cost&3];\
            bcost = cost>>2;\
        }\
    }

    /* one half-pel diamond iteration with SAD */
    HPEL_QPEL( hpoffs, sad_8x8_ii_hpel );

#define CHEAP_QPEL 1

#if CHEAP_QPEL
    /* one quarter-pel diamond iteration with (cheaper) SAD */
    HPEL_QPEL( dia_offs, sad_8x8_ii_qpel );
#endif

    /* remeasure with SATD */
    qpos.x = (coord.x << 2) + bmv.x;
    qpos.y = (coord.y << 2) + bmv.y;
    cost_local[mb_i] = satd_8x8_ii_qpel_coop4( fenc, coord, fref_planes, qpos, satd_local, mb_i );
    bcost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];
    bcost += lambda * calc_mv_cost( abs_diff( bmv.x, mvp.x ), abs_diff( bmv.y, mvp.y ) );
    short2 bestmv = 0;

#if !CHEAP_QPEL
/* expensive path: probe each quarter-pel diamond point with cooperative SATD,
 * all 4 threads evaluating the same point together */
#define QPEL_SATD( ARR, FUNC )\
    {\
        trymv = bmv + ARR;\
        qpos.x = (coord.x << 2) + trymv.x;\
        qpos.y = (coord.y << 2) + trymv.y;\
        cost_local[mb_i] = FUNC( fenc, coord, fref_planes, qpos, satd_local, mb_i );\
        cost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];\
        cost += lambda * calc_mv_cost( abs_diff( trymv.x, mvp.x ), abs_diff( trymv.y, mvp.y ) );\
        if( cost < bcost )\
        {\
            bestmv = ARR;\
            bcost = cost;\
        }\
    }
    for( int i = 0; i<4; i++ )
        QPEL_SATD( dia_offs[i], satd_8x8_ii_qpel_coop4 );
#endif

    /* all 4 threads computed identical results; writes are redundant but
     * consistent (presumably benign — only lane 0's write need land) */
    fenc_lowres_mv[mb_xy] = bmv+bestmv;
    fenc_lowres_mv_costs[mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK );
}
  2292. diff -r 087c10a1fec7 -r 9a682cb0c74f common/opencl/weightp.cl
  2293. --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
  2294. +++ b/common/opencl/weightp.cl  Tue May 15 18:05:00 2012 -0500
  2295. @@ -0,0 +1,45 @@
  2296. +/*
  2297. +Launch dimensions: width x height (in pixels)
  2298. +*/
  2299. +kernel void weightp_scaled_images( read_only image2d_t in_plane,
  2300. +                                   write_only image2d_t out_plane,
  2301. +                                   uint offset,
  2302. +                                   uint scale,
  2303. +                                   uint denom )
  2304. +{
  2305. +    int gx = get_global_id( 0 );
  2306. +    int gy = get_global_id( 1 );
  2307. +    uint4 input_val;
  2308. +    uint4 output_val;
  2309. +
  2310. +    input_val = read_imageui( in_plane, sampler, (int2)(gx, gy));
  2311. +    output_val = (uint4)(offset) + ( ( ((uint4)(scale)) * input_val ) >> ((uint4)(denom)) );
  2312. +    write_imageui( out_plane, (int2)(gx, gy), output_val );
  2313. +}
  2314. +
  2315. +/*
  2316. +Launch dimensions: width x height (in pixels)
  2317. +*/
  2318. +kernel void weightp_hpel( read_only image2d_t in_plane,
  2319. +                          write_only image2d_t out_plane,
  2320. +                          uint offset,
  2321. +                          uint scale,
  2322. +                          uint denom )
  2323. +{
  2324. +    int gx = get_global_id( 0 );
  2325. +    int gy = get_global_id( 1 );
  2326. +    uint input_val;
  2327. +    uint output_val;
  2328. +
  2329. +    input_val = read_imageui( in_plane, sampler, (int2)(gx, gy)).s0;
  2330. +    //Unpack
  2331. +    uint4 temp;
  2332. +    temp.s0 = input_val & 0x00ff; temp.s1 = (input_val >> 8) & 0x00ff;
  2333. +    temp.s2 = (input_val >> 16) & 0x00ff; temp.s3 = (input_val >> 24) & 0x00ff;
  2334. +
  2335. +    temp = (uint4)(offset) + ( ( ((uint4)(scale)) * temp ) >> ((uint4)(denom)) );
  2336. +
  2337. +    //Pack
  2338. +    output_val = temp.s0 | (temp.s1 << 8) | (temp.s2 << 16) | (temp.s3 << 24);
  2339. +    write_imageui( out_plane, (int2)(gx, gy), output_val );
  2340. +}
  2341. diff -r 087c10a1fec7 -r 9a682cb0c74f common/opencl/x264-cl.h
  2342. --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
  2343. +++ b/common/opencl/x264-cl.h   Tue May 15 18:05:00 2012 -0500
  2344. @@ -0,0 +1,156 @@
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable

/* All image reads use unnormalized integer coordinates, clamped at the
 * frame edges, with no filtering. */
constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

/* 7.18.1.1  Exact-width integer types */
typedef signed char int8_t;
typedef unsigned char   uint8_t;
typedef short  int16_t;
typedef unsigned short  uint16_t;
typedef int  int32_t;
typedef unsigned   uint32_t;
typedef long  int64_t;
typedef unsigned long uint64_t;

typedef union
{
    uint32_t i;
    uint16_t b[2];
    uint8_t  c[4];
}  x264_union32_t;

/* mirror the CPU-side lowres types: 8-bit pixels, 16-bit residual sums,
 * and 32-bit values holding two packed 16-bit sums for SATD */
typedef uint8_t  pixel;
typedef uint16_t sum_t;
typedef uint32_t sum2_t;

/* lowres cost word: low 14 bits cost, high bits reference index */
#define LOWRES_COST_MASK ((1<<14)-1)
#define LOWRES_COST_SHIFT 14
#define COST_MAX (1<<28)

#define X264_MIN( a, b ) ((a)<(b) ? (a) : (b))

#define PIXEL_MAX 255
#define BITS_PER_SUM (8 * sizeof(sum_t))

/* if y beats x, take it and record b alongside */
#define COPY2_IF_LT( x, y, a, b )\
    if((y)<(x))\
    {\
        (x) = (y);\
        (a) = (b);\
    }
/* quarter/full-pel diamond search pattern: up, left, right, down */
constant short2 dia_offs[4] =
{
    {0, -1}, {-1, 0}, {1, 0}, {0, 1},
};
  2389. +
  2390. +inline pixel x264_clip_pixel( int x )
  2391. +{
  2392. +    return ((x & ~PIXEL_MAX) ? (-x) >> 31 & PIXEL_MAX : x);
  2393. +}
  2394. +
  2395. +inline int x264_clip3( int v, int i_min, int i_max )
  2396. +{
  2397. +    return ((v < i_min) ? i_min : (v > i_max) ? i_max : v);
  2398. +}
  2399. +
/* Branchless median of three (same trick as x264's CPU code).  The masked
 * subtractions first order a/b so a <= b, then clamp b against c from both
 * sides.  Relies on arithmetic right shift of negative ints; correct as long
 * as (a-b) etc. cannot overflow, which holds for motion-vector components. */
inline int x264_median( int a, int b, int c )
{
    int t = (a - b) & ((a - b) >> 31);
    a -= t;
    b += t;
    b -= (b - c) & ((b - c) >> 31);
    b += (a - b) & ((a - b) >> 31);
    return b;
}
  2409. +
  2410. +inline short2 x264_median_mv( short2 a, short2 b, short2 c )
  2411. +{
  2412. +    short2 dst;
  2413. +    dst.x = x264_median( a.x, b.x, c.x );
  2414. +    dst.y = x264_median( a.y, b.y, c.y );
  2415. +    return dst;
  2416. +}
  2417. +
/* Absolute value of the two 16-bit lanes packed in a sum2_t, computed
 * SIMD-style: s becomes 0xFFFF per negative lane (sign bit replicated into
 * the whole lane), then (a + s) ^ s negates exactly those lanes. */
inline sum2_t abs2( sum2_t a )
{
    sum2_t s = ((a >> (BITS_PER_SUM - 1)) & (((sum2_t)1 << BITS_PER_SUM) + 1)) * ((sum_t)-1);
    return (a + s) ^ s;
}
  2423. +
/* 4-point Hadamard butterfly on packed sum2_t lanes (two 16-bit diffs per
 * value); outputs may alias inputs only through the temporaries. */
#define HADAMARD4( d0, d1, d2, d3, s0, s1, s2, s3 ) {\
        sum2_t t0 = s0 + s1;\
        sum2_t t1 = s0 - s1;\
        sum2_t t2 = s2 + s3;\
        sum2_t t3 = s2 - s3;\
        d0 = t0 + t2;\
        d2 = t0 - t2;\
        d1 = t1 + t3;\
        d3 = t1 - t3;\
}

/* Same butterfly on int2 vectors (one diff per component). */
#define HADAMARD4V( d0, d1, d2, d3, s0, s1, s2, s3 ) {\
        int2 t0 = s0 + s1;\
        int2 t1 = s0 - s1;\
        int2 t2 = s2 + s3;\
        int2 t3 = s2 - s3;\
        d0 = t0 + t2;\
        d2 = t0 - t2;\
        d1 = t1 + t3;\
        d3 = t1 - t3;\
}

/* Instantiate an 8x4 SATD over pixel pointers with the given address-space
 * qualifiers (q1/q2); port of the scalar CPU satd_8x4, packing two columns
 * per sum2_t lane. */
#define SATD_C_8x4_Q( name, q1, q2 )\
    int name( q1 pixel *pix1, int i_pix1, q2 pixel *pix2, int i_pix2 )\
    {\
        sum2_t tmp[4][4];\
        sum2_t a0, a1, a2, a3;\
        sum2_t sum = 0;\
        for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )\
        {\
            a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);\
            a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);\
            a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);\
            a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);\
            HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3 );\
        }\
        for( int i = 0; i < 4; i++ )\
        {\
            HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );\
            sum += abs2( a0 ) + abs2( a1 ) + abs2( a2 ) + abs2( a3 );\
        }\
        return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;\
    }
  2467. +
/*
 * Utility function to perform a parallel sum reduction of an array of integers
 *
 * x is this work-item's index into the shared array; every work-item of the
 * first dimension must call this (barrier is collective).  Returns the total.
 *
 * NOTE(review): barriers are skipped once dim <= 32 and there is no barrier
 * before the final read of array[0] — this assumes wavefront-synchronous
 * execution (>= 32-wide SIMD); confirm for the targeted devices.
 */
int parallel_sum( int value, int x, volatile local int *array )
{
    array[x] = value;
    barrier( CLK_LOCAL_MEM_FENCE );

    int dim = get_local_size( 0 );

    /* tree reduction: halve the active range each step */
    while( dim > 1 )
    {
        dim >>= 1;

        if( x < dim )
        {
            array[x] += array[x + dim];
        }

        /* within one wavefront the threads run in lockstep, so the barrier
         * is only needed while the range spans multiple wavefronts */
        if( dim > 32 )
        {
            barrier( CLK_LOCAL_MEM_FENCE );
        }
    }

    return array[0];
}
  2495. +
/* Approximate bit cost of an MV residual (dx, dy): roughly 2*log2(d+1) bits
 * per component plus one sign bit when non-zero, mimicking the CPU lookahead's
 * exp-Golomb cost table.  The 0.718f term biases the rounding to better match
 * that table.  NOTE(review): float log2/round may round differently across
 * devices, so costs can differ slightly from the CPU path — presumably
 * acceptable for lookahead decisions. */
int calc_mv_cost( int dx, int dy )
{
    return round( (log2( (float)(dx + 1) ) * 2.0f + 0.718f + !!dx) ) +
           round( (log2( (float)(dy + 1) ) * 2.0f + 0.718f + !!dy) );
}
  2501. diff -r 087c10a1fec7 -r 9a682cb0c74f configure
  2502. --- a/configure Thu Mar 29 14:14:07 2012 -0700
  2503. +++ b/configure Tue May 15 18:05:00 2012 -0500
  2504. @@ -25,6 +25,7 @@
  2505.    --system-libx264         use system libx264 instead of internal
  2506.    --enable-shared          build shared library
  2507.    --enable-static          build static library
  2508. +  --disable-opencl         disable OpenCL features
  2509.    --disable-gpl            disable GPL-only features
  2510.    --disable-thread         disable multithreaded encoding
  2511.    --enable-win32thread     use win32threads (windows only)
  2512. @@ -273,6 +274,7 @@
  2513.  bit_depth="8"
  2514.  chroma_format="all"
  2515.  compiler="GNU"
  2516. +opencl="yes"
  2517.  
  2518.  CFLAGS="$CFLAGS -Wall -I. -I\$(SRCPATH)"
  2519.  LDFLAGS="$LDFLAGS"
  2520. @@ -381,6 +383,9 @@
  2521.          --host=*)
  2522.              host="$optarg"
  2523.              ;;
  2524. +        --disable-opencl)
  2525. +            opencl="no"
  2526. +            ;;
  2527.          --cross-prefix=*)
  2528.              cross_prefix="$optarg"
  2529.              ;;
  2530. @@ -1082,6 +1087,37 @@
  2531.  PROF_USE_LD=$PROF_USE_LD
  2532.  EOF
  2533.  
  2534. +if [ "$opencl" = "yes" ]; then
  2535. +    log_check "looking for xxd"
  2536. +    if ! $(xxd -v 2>/dev/null); then
  2537. +        echo 'OpenCL support requires xxd to compile.'
  2538. +        echo 'use --disable-opencl to compile without OpenCL'
  2539. +        exit 1
  2540. +    elif [ "$CUDA_PATH" != "" ]; then
  2541. +        echo 'HAVE_OPENCL=yes' >> config.mak
  2542. +        echo 'OPENCL_LIB=OpenCL' >> config.mak
  2543. +        echo 'OPENCL_INC_DIR=$(CUDA_PATH)include' >> config.mak
  2544. +        if [ "$ARCH" = "X86" ]; then
  2545. +            echo 'OPENCL_LIB_DIR=$(CUDA_PATH)lib/Win32' >> config.mak
  2546. +        else
  2547. +            echo 'OPENCL_LIB_DIR=$(CUDA_PATH)lib/x64' >> config.mak
  2548. +        fi
  2549. +        define HAVE_OPENCL
  2550. +    elif [ "$AMDAPPSDKROOT" != "" ]; then
  2551. +        echo 'HAVE_OPENCL=yes' >> config.mak
  2552. +        echo 'OPENCL_LIB=OpenCL' >> config.mak
  2553. +        echo 'OPENCL_INC_DIR=$(AMDAPPSDKROOT)/include' >> config.mak
  2554. +        if [ "$ARCH" = "X86" ]; then
  2555. +            echo 'OPENCL_LIB_DIR=$(AMDAPPSDKROOT)/lib/x86' >> config.mak
  2556. +        else
  2557. +            echo 'OPENCL_LIB_DIR=$(AMDAPPSDKROOT)/lib/x86_64' >> config.mak
  2558. +        fi
  2559. +        define HAVE_OPENCL
  2560. +    else
  2561. +        opencl="no"
  2562. +    fi
  2563. +fi
  2564. +
  2565.  if [ $compiler = ICL ]; then
  2566.      echo '%.o: %.c' >> config.mak
  2567.      echo '     $(CC) $(CFLAGS) -c -Fo$@ $<' >> config.mak
  2568. @@ -1192,6 +1228,7 @@
  2569.  visualize:     $vis
  2570.  bit depth:     $bit_depth
  2571.  chroma format: $chroma_format
  2572. +opencl:        $opencl
  2573.  EOF
  2574.  
  2575.  echo >> config.log
  2576. diff -r 087c10a1fec7 -r 9a682cb0c74f encoder/encoder.c
  2577. --- a/encoder/encoder.c Thu Mar 29 14:14:07 2012 -0700
  2578. +++ b/encoder/encoder.c Tue May 15 18:05:00 2012 -0500
  2579. @@ -37,6 +37,10 @@
  2580.  #include "common/visualize.h"
  2581.  #endif
  2582.  
  2583. +#if HAVE_OPENCL
  2584. +#include "opencl.c"
  2585. +#endif
  2586. +
  2587.  //#define DEBUG_MB_TYPE
  2588.  
  2589.  #define bs_write_ue bs_write_ue_big
  2590. @@ -515,6 +519,21 @@
  2591.      if( h->i_thread_frames > 1 )
  2592.          h->param.nalu_process = NULL;
  2593.  
  2594. +#ifndef HAVE_OPENCL
  2595. +    if( h->param.b_opencl )
  2596. +    {
  2597. +        x264_log( h, X264_LOG_WARNING, "not compiled with OpenCL support!\n" );
  2598. +        h->param.b_opencl = 0;
  2599. +    }
  2600. +#endif
  2601. +#if BIT_DEPTH > 8
  2602. +    if( h->param.b_opencl )
  2603. +    {
  2604. +        x264_log( h, X264_LOG_WARNING, "OpenCL lookahead does not support high bit depth!\n" );
  2605. +        h->param.b_opencl = 0;
  2606. +    }
  2607. +#endif
  2608. +
  2609.      h->param.i_keyint_max = x264_clip3( h->param.i_keyint_max, 1, X264_KEYINT_MAX_INFINITE );
  2610.      if( h->param.i_keyint_max == 1 )
  2611.      {
  2612. @@ -1306,6 +1325,14 @@
  2613.              goto fail;
  2614.      }
  2615.  
  2616. +#if HAVE_OPENCL
  2617. +    if( h->param.b_opencl )
  2618. +    {
  2619. +        if( x264_opencl_init( h ) )
  2620. +            h->param.b_opencl = 0;
  2621. +    }
  2622. +#endif
  2623. +
  2624.      if( x264_lookahead_init( h, i_slicetype_length ) )
  2625.          goto fail;
  2626.  
  2627. @@ -3451,6 +3478,10 @@
  2628.                     || h->stat.i_mb_count[SLICE_TYPE_P][I_PCM]
  2629.                     || h->stat.i_mb_count[SLICE_TYPE_B][I_PCM];
  2630.  
  2631. +#if HAVE_OPENCL
  2632. +    x264_opencl_free( h );
  2633. +#endif
  2634. +
  2635.      x264_lookahead_delete( h );
  2636.  
  2637.      if( h->param.b_sliced_threads )
  2638. diff -r 087c10a1fec7 -r 9a682cb0c74f encoder/opencl.c
  2639. --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
  2640. +++ b/encoder/opencl.c  Tue May 15 18:05:00 2012 -0500
  2641. @@ -0,0 +1,424 @@
  2642. +/*****************************************************************************
  2643. + * opencl.c: OpenCL initialization and kernel compilation
  2644. + *****************************************************************************
  2645. + * Copyright (C) 2010-2012 x264 project
  2646. + *
  2647. + * Authors: Steve Borho <sborho@multicorewareinc.com>
  2648. + *
  2649. + * This program is free software; you can redistribute it and/or modify
  2650. + * it under the terms of the GNU General Public License as published by
  2651. + * the Free Software Foundation; either version 2 of the License, or
  2652. + * (at your option) any later version.
  2653. + *
  2654. + * This program is distributed in the hope that it will be useful,
  2655. + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  2656. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  2657. + * GNU General Public License for more details.
  2658. + *
  2659. + * You should have received a copy of the GNU General Public License
  2660. + * along with this program; if not, write to the Free Software
  2661. + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  2662. + *
  2663. + * This program is also available under a commercial proprietary license.
  2664. + * For more information, contact us at licensing@x264.com.
  2665. + *****************************************************************************/
  2666. +
  2667. +#if HAVE_OPENCL
  2668. +#include "common/common.h"
  2669. +#include "encoder/oclobj.h"
  2670. +
  2671. +/* Try to load the cached compiled program binary, verify the device context is
  2672. + * still valid before reuse */
  2673. +static cl_program x264_opencl_cache_load( x264_t *h, char *devname, char *devvendor, char *driverversion )
  2674. +{
  2675. +    cl_program program = NULL;
  2676. +    cl_int status;
  2677. +
  2678. +    /* try to load cached program binary */
  2679. +    FILE *fp = fopen( h->param.psz_clbin_file, "rb" );
  2680. +    if( !fp )
  2681. +        return NULL;
  2682. +
  2683. +    fseek( fp, 0L, SEEK_END );
  2684. +    size_t size = ftell( fp );
  2685. +    rewind( fp );
  2686. +    uint8_t *binary = x264_malloc( size );
  2687. +    if( !binary )
  2688. +        goto fail;
  2689. +
  2690. +    fread( binary, 1, size, fp );
  2691. +    const uint8_t *ptr = (const uint8_t*)binary;
  2692. +
  2693. +#define CHECK_STRING( STR )\
  2694. +    do {\
  2695. +        size_t len = strlen( STR );\
  2696. +        if( size <= len || strncmp( (char*)ptr, STR, len ) )\
  2697. +            goto fail;\
  2698. +        else {\
  2699. +            size -= (len+1); ptr += (len+1);\
  2700. +        }\
  2701. +    } while( 0 )
  2702. +
  2703. +    CHECK_STRING( devname );
  2704. +    CHECK_STRING( devvendor );
  2705. +    CHECK_STRING( driverversion );
  2706. +#undef CHECK_STRING
  2707. +
  2708. +    program = clCreateProgramWithBinary( h->opencl.context, 1, &h->opencl.device, &size, &ptr, NULL, &status );
  2709. +    if( status != CL_SUCCESS )
  2710. +        program = NULL;
  2711. +
  2712. +fail:
  2713. +    fclose( fp );
  2714. +    x264_free( binary );
  2715. +    return program;
  2716. +}
  2717. +
  2718. +/* Save the compiled program binary to a file for later reuse.  Device context
  2719. + * is also saved in the cache file so we do not reuse stale binaries */
  2720. +static void x264_opencl_cache_save( x264_t *h, cl_program program, char *devname, char *devvendor, char *driverversion )
  2721. +{
  2722. +    FILE *fp = fopen( h->param.psz_clbin_file, "wb" );
  2723. +    if( !fp )
  2724. +        return;
  2725. +
  2726. +    size_t size;
  2727. +    cl_int status = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL );
  2728. +    if( status == CL_SUCCESS )
  2729. +    {
  2730. +        unsigned char *binary = x264_malloc( size );
  2731. +        status = clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(unsigned char *), &binary, NULL );
  2732. +        if( status == CL_SUCCESS )
  2733. +        {
  2734. +            fwrite( devname, 1, strlen( devname ), fp );
  2735. +            fwrite( "\n", 1, 1, fp );
  2736. +            fwrite( devvendor, 1, strlen( devvendor ), fp );
  2737. +            fwrite( "\n", 1, 1, fp );
  2738. +            fwrite( driverversion, 1, strlen( driverversion ), fp );
  2739. +            fwrite( "\n", 1, 1, fp );
  2740. +            fwrite( binary, 1, size, fp );
  2741. +        }
  2742. +        x264_free( binary );
  2743. +    }
  2744. +    fclose( fp );
  2745. +}
  2746. +
  2747. +static int x264_detect_AMD_GPU( cl_device_id device, int *b_is_SI )
  2748. +{
  2749. +    char extensions[512];
  2750. +    char boardname[64];
  2751. +    char devname[64];
  2752. +
  2753. +    *b_is_SI = 0;
  2754. +
  2755. +    cl_int status = clGetDeviceInfo( device, CL_DEVICE_EXTENSIONS, sizeof(extensions), extensions, NULL );
  2756. +    if( status != CL_SUCCESS || !strstr( extensions, "cl_amd_media_ops" ) )
  2757. +        return 0;
  2758. +
  2759. +    /* Detect whether GPU is a SouthernIsland based device */
  2760. +#define CL_DEVICE_BOARD_NAME_AMD                    0x4038
  2761. +    status = clGetDeviceInfo( device, CL_DEVICE_BOARD_NAME_AMD, sizeof(boardname), boardname, NULL );
  2762. +    if( status == CL_SUCCESS )
  2763. +    {
  2764. +        boardname[15] = boardname[16] = boardname[17] = 'X';
  2765. +        *b_is_SI = !strcmp( boardname, "AMD Radeon HD 7XXX Series" );
  2766. +        return 1;
  2767. +    }
  2768. +
  2769. +    /* Fall back to checking the device name */
  2770. +    status = clGetDeviceInfo( device, CL_DEVICE_NAME, sizeof(devname), devname, NULL );
  2771. +    if( status != CL_SUCCESS )
  2772. +        return 1;
  2773. +
  2774. +    const char *tahiti_names[] = { "Tahiti", "Pitcairn", "Capeverde", "Bali", NULL };
  2775. +    for( int i = 0; tahiti_names[i]; i++ )
  2776. +        if( !strcmp( devname, tahiti_names[i] ) )
  2777. +        {
  2778. +            *b_is_SI = 1;
  2779. +            return 1;
  2780. +        }
  2781. +    return 1;
  2782. +}
  2783. +
  2784. +/* The OpenCL source under common/opencl will be merged into encoder/oclobj.h by
  2785. + * the Makefile. It defines a x264_opencl_source byte array which we will pass
  2786. + * to clCreateProgramWithSource().  We also attempt to use a cache file for the
  2787. + * compiled binary, stored in the current working folder.  */
  2788. +static cl_program x264_opencl_compile( x264_t *h )
  2789. +{
  2790. +    cl_program program;
  2791. +    cl_int status;
  2792. +
  2793. +    char devname[64];
  2794. +    char devvendor[64];
  2795. +    char driverversion[64];
  2796. +    status  = clGetDeviceInfo( h->opencl.device, CL_DEVICE_NAME,    sizeof(devname), devname, NULL );
  2797. +    status |= clGetDeviceInfo( h->opencl.device, CL_DEVICE_VENDOR,  sizeof(devvendor), devvendor, NULL );
  2798. +    status |= clGetDeviceInfo( h->opencl.device, CL_DRIVER_VERSION, sizeof(driverversion), driverversion, NULL );
  2799. +    if( status != CL_SUCCESS )
  2800. +        return NULL;
  2801. +
  2802. +    int b_isamd = x264_detect_AMD_GPU( h->opencl.device, &h->opencl.b_device_AMD_SI );
  2803. +
  2804. +    x264_log( h, X264_LOG_INFO, "OpenCL: %s %s %s\n", devvendor, devname, h->opencl.b_device_AMD_SI ? "(SI)" : "" );
  2805. +
  2806. +    program = x264_opencl_cache_load( h, devname, devvendor, driverversion );
  2807. +    if( !program )
  2808. +    {
  2809. +        /* clCreateProgramWithSource() requires a pointer variable, you cannot just use &x264_opencl_source */
  2810. +        x264_log( h, X264_LOG_INFO, "Compiling OpenCL kernels...\n" );
  2811. +        const char *strptr = (const char*)x264_opencl_source;
  2812. +        size_t size = sizeof(x264_opencl_source);
  2813. +        program = clCreateProgramWithSource( h->opencl.context, 1, &strptr, &size, &status );
  2814. +        if( status != CL_SUCCESS || !program )
  2815. +        {
  2816. +            x264_log( h, X264_LOG_ERROR, "OpenCL: unable to create program\n" );
  2817. +            return NULL;
  2818. +        }
  2819. +    }
  2820. +
  2821. +    /* Build the program binary for the OpenCL device */
  2822. +    const char *buildopts = "";
  2823. +    if( b_isamd && !h->opencl.b_device_AMD_SI )
  2824. +        buildopts = "-DVECTORIZE=1";
  2825. +    status = clBuildProgram( program, 1, &h->opencl.device, buildopts, NULL, NULL );
  2826. +    if( status == CL_SUCCESS )
  2827. +    {
  2828. +        x264_opencl_cache_save( h, program, devname, devvendor, driverversion );
  2829. +        return program;
  2830. +    }
  2831. +
  2832. +    /* Compile failure, should not happen with production code. */
  2833. +
  2834. +    size_t build_log_len = 0;
  2835. +
  2836. +    status = clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, build_log_len, NULL, &build_log_len );
  2837. +    if( status != CL_SUCCESS )
  2838. +    {
  2839. +        x264_log( h, X264_LOG_ERROR, "OpenCL: Compilation failed, unable to get build log\n" );
  2840. +        return NULL;
  2841. +    }
  2842. +
  2843. +    char *build_log = x264_malloc( build_log_len );
  2844. +    if( !build_log )
  2845. +        return NULL;
  2846. +
  2847. +    status = clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, build_log_len, build_log, NULL );
  2848. +    if( status != CL_SUCCESS )
  2849. +    {
  2850. +        x264_log( h, X264_LOG_ERROR, "OpenCL: Compilation failed, unable to get build log\n" );
  2851. +        x264_free( build_log );
  2852. +        return NULL;
  2853. +    }
  2854. +
  2855. +    FILE *lg = fopen( "x264_kernel_build_log.txt", "w" );
  2856. +    if( lg )
  2857. +    {
  2858. +        fwrite( build_log, 1, build_log_len, lg );
  2859. +        fclose( lg );
  2860. +        x264_log( h, X264_LOG_ERROR, "OpenCL: kernel build errors written to x264_kernel_build_log.txt\n" );
  2861. +    }
  2862. +
  2863. +    x264_free( build_log );
  2864. +    return NULL;
  2865. +}
  2866. +
  2867. +static void x264_opencl_free_lookahead( x264_t *h )
  2868. +{
  2869. +#define RELEASE( a, f ) if( a ) f( a );
  2870. +    RELEASE( h->opencl.intra_kernel, clReleaseKernel )
  2871. +    RELEASE( h->opencl.rowsum_intra_kernel, clReleaseKernel )
  2872. +    RELEASE( h->opencl.downscale_kernel1, clReleaseKernel )
  2873. +    RELEASE( h->opencl.downscale_kernel2, clReleaseKernel )
  2874. +    RELEASE( h->opencl.downscale_hpel_kernel, clReleaseKernel )
  2875. +    RELEASE( h->opencl.weightp_hpel_kernel, clReleaseKernel )
  2876. +    RELEASE( h->opencl.weightp_scaled_images_kernel, clReleaseKernel )
  2877. +    RELEASE( h->opencl.memset_kernel, clReleaseKernel )
  2878. +    RELEASE( h->opencl.hme_kernel, clReleaseKernel )
  2879. +    RELEASE( h->opencl.subpel_refine_kernel, clReleaseKernel )
  2880. +    RELEASE( h->opencl.mode_select_kernel, clReleaseKernel )
  2881. +    RELEASE( h->opencl.rowsum_inter_kernel, clReleaseKernel )
  2882. +    RELEASE( h->opencl.lookahead_program, clReleaseProgram )
  2883. +    RELEASE( h->opencl.row_satds[0], clReleaseMemObject )
  2884. +    RELEASE( h->opencl.row_satds[1], clReleaseMemObject )
  2885. +    RELEASE( h->opencl.frame_stats[0], clReleaseMemObject )
  2886. +    RELEASE( h->opencl.frame_stats[1], clReleaseMemObject )
  2887. +    RELEASE( h->opencl.mv_buffers[0], clReleaseMemObject )
  2888. +    RELEASE( h->opencl.mv_buffers[1], clReleaseMemObject )
  2889. +    RELEASE( h->opencl.mvp_buffer, clReleaseMemObject )
  2890. +    RELEASE( h->opencl.luma_16x16_image[0], clReleaseMemObject )
  2891. +    RELEASE( h->opencl.luma_16x16_image[1], clReleaseMemObject )
  2892. +    RELEASE( h->opencl.lowres_mv_costs, clReleaseMemObject )
  2893. +    RELEASE( h->opencl.lowres_costs[0], clReleaseMemObject )
  2894. +    RELEASE( h->opencl.lowres_costs[1], clReleaseMemObject )
  2895. +    RELEASE( h->opencl.page_locked_buffer, clReleaseMemObject )
  2896. +    RELEASE( h->opencl.weighted_luma_hpel, clReleaseMemObject )
  2897. +    for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
  2898. +    {
  2899. +        RELEASE( h->opencl.weighted_scaled_images[i], clReleaseMemObject )
  2900. +    }
  2901. +#undef RELEASE
  2902. +}
  2903. +
  2904. +static int x264_opencl_init_lookahead( x264_t *h )
  2905. +{
  2906. +    if( h->param.rc.i_lookahead == 0 )
  2907. +        return -1;
  2908. +
  2909. +    char *kernelnames[] = {
  2910. +        "mb_intra_cost_satd_8x8",
  2911. +        "sum_intra_cost",
  2912. +        "downscale_hpel",
  2913. +        "downscale1",
  2914. +        "downscale2",
  2915. +        "memset_int16",
  2916. +        "weightp_scaled_images",
  2917. +        "weightp_hpel",
  2918. +        "hierarchical_motion",
  2919. +        "subpel_refine",
  2920. +        "mode_selection",
  2921. +        "sum_inter_cost"
  2922. +    };
  2923. +    cl_kernel *kernels[] = {
  2924. +        &h->opencl.intra_kernel,
  2925. +        &h->opencl.rowsum_intra_kernel,
  2926. +        &h->opencl.downscale_hpel_kernel,
  2927. +        &h->opencl.downscale_kernel1,
  2928. +        &h->opencl.downscale_kernel2,
  2929. +        &h->opencl.memset_kernel,
  2930. +        &h->opencl.weightp_scaled_images_kernel,
  2931. +        &h->opencl.weightp_hpel_kernel,
  2932. +        &h->opencl.hme_kernel,
  2933. +        &h->opencl.subpel_refine_kernel,
  2934. +        &h->opencl.mode_select_kernel,
  2935. +        &h->opencl.rowsum_inter_kernel
  2936. +    };
  2937. +    cl_int status;
  2938. +
  2939. +    h->opencl.lookahead_program = x264_opencl_compile( h );
  2940. +    if( !h->opencl.lookahead_program )
  2941. +    {
  2942. +        x264_opencl_free_lookahead( h );
  2943. +        return -1;
  2944. +    }
  2945. +
  2946. +    for( int i = 0; i < sizeof(kernelnames)/sizeof(char*); i++ )
  2947. +    {
  2948. +        *kernels[i] = clCreateKernel( h->opencl.lookahead_program, kernelnames[i], &status );
  2949. +        if( status != CL_SUCCESS )
  2950. +        {
  2951. +            x264_log( h, X264_LOG_ERROR, "Unable to compile kernel '%s' (%d)\n", kernelnames[i], status );
  2952. +            x264_opencl_free_lookahead( h );
  2953. +            return -1;
  2954. +        }
  2955. +    }
  2956. +
  2957. +    h->opencl.page_locked_buffer = clCreateBuffer( h->opencl.context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, PAGE_LOCKED_BUF_SIZE, NULL, &status );
  2958. +    if( status != CL_SUCCESS )
  2959. +    {
  2960. +        x264_log( h, X264_LOG_ERROR, "Unable to allocate page-locked buffer, error '%d'\n", status );
  2961. +        return -1;
  2962. +    }
  2963. +    h->opencl.page_locked_ptr = clEnqueueMapBuffer( h->opencl.queue, h->opencl.page_locked_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
  2964. +                                                    0, PAGE_LOCKED_BUF_SIZE, 0, NULL, NULL, &status );
  2965. +    if( status != CL_SUCCESS )
  2966. +    {
  2967. +        x264_log( h, X264_LOG_ERROR, "Unable to map page-locked buffer, error '%d'\n", status );
  2968. +        return -1;
  2969. +    }
  2970. +
  2971. +    return 0;
  2972. +}
  2973. +
  2974. +static void  x264_opencl_error_notify( const char *errinfo, const void *private_info, size_t cb, void *user_data )
  2975. +{
  2976. +    x264_t *h = (x264_t*)user_data;
  2977. +    x264_log( h, X264_LOG_ERROR, "OpenCL: %s\n", errinfo );
  2978. +}
  2979. +
  2980. +static int x264_opencl_init( x264_t *h )
  2981. +{
  2982. +    cl_int status;
  2983. +    cl_uint numPlatforms;
  2984. +    int ret = -1;
  2985. +
  2986. +    status = clGetPlatformIDs( 0, NULL, &numPlatforms );
  2987. +    if( status != CL_SUCCESS || numPlatforms == 0 )
  2988. +        return -1;
  2989. +
  2990. +    cl_platform_id *platforms = (cl_platform_id*)x264_malloc( numPlatforms * sizeof(cl_platform_id) );
  2991. +    status = clGetPlatformIDs( numPlatforms, platforms, NULL );
  2992. +    if( status != CL_SUCCESS )
  2993. +    {
  2994. +        x264_free( platforms );
  2995. +        return -1;
  2996. +    }
  2997. +
  2998. +    /* Select first OpenCL platform that supports a GPU device */
  2999. +    for( cl_uint i = 0; i < numPlatforms; ++i )
  3000. +    {
  3001. +        cl_uint gpu_count;
  3002. +        status = clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &gpu_count );
  3003. +        if( status == CL_SUCCESS && gpu_count > 0 )
  3004. +        {
  3005. +            /* take GPU 0 */
  3006. +            status = clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, 1, &h->opencl.device, NULL );
  3007. +            if( status != CL_SUCCESS )
  3008. +                continue;
  3009. +
  3010. +            h->opencl.context = clCreateContext( NULL, 1, &h->opencl.device, (void*)x264_opencl_error_notify, (void*)h, &status );
  3011. +            if( status != CL_SUCCESS )
  3012. +                continue;
  3013. +
  3014. +            cl_bool image_support;
  3015. +            clGetDeviceInfo( h->opencl.device, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &image_support, NULL );
  3016. +            if( !image_support )
  3017. +                continue;
  3018. +
  3019. +            cl_uint count = 0;
  3020. +            cl_image_format imageType[100];
  3021. +            clGetSupportedImageFormats( h->opencl.context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, 100, imageType, &count );
  3022. +            count = X264_MIN( count, 100 );
  3023. +
  3024. +            int b_has_r = 0;
  3025. +            int b_has_rgba = 0;
  3026. +            for( cl_uint j = 0; j<count; j++ )
  3027. +            {
  3028. +                if( imageType[j].image_channel_order == CL_R )
  3029. +                    b_has_r = 1;
  3030. +                else if( imageType[j].image_channel_order == CL_RGBA )
  3031. +                    b_has_rgba = 1;
  3032. +            }
  3033. +            if( !b_has_r || !b_has_rgba )
  3034. +                continue;
  3035. +
  3036. +            h->opencl.queue = clCreateCommandQueue( h->opencl.context, h->opencl.device, 0, &status );
  3037. +            if( status != CL_SUCCESS )
  3038. +                continue;
  3039. +
  3040. +            ret = 0;
  3041. +            break;
  3042. +        }
  3043. +    }
  3044. +
  3045. +    x264_free( platforms );
  3046. +
  3047. +    if( !h->param.psz_clbin_file )
  3048. +        h->param.psz_clbin_file = "x264_lookahead.clbin";
  3049. +
  3050. +    if( !ret )
  3051. +        ret = x264_opencl_init_lookahead( h );
  3052. +
  3053. +    return ret;
  3054. +}
  3055. +
  3056. +void x264_opencl_free( x264_t *h )
  3057. +{
  3058. +    x264_opencl_free_lookahead( h );
  3059. +
  3060. +    if( h->opencl.queue )
  3061. +        clReleaseCommandQueue( h->opencl.queue );
  3062. +    if( h->opencl.context )
  3063. +        clReleaseContext( h->opencl.context );
  3064. +}
  3065. +#endif /* HAVE_OPENCL */
  3066. diff -r 087c10a1fec7 -r 9a682cb0c74f encoder/slicetype-cl.c
  3067. --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
  3068. +++ b/encoder/slicetype-cl.c    Tue May 15 18:05:00 2012 -0500
  3069. @@ -0,0 +1,655 @@
  3070. +/*****************************************************************************
  3071. + * slicetype-cl.c: OpenCL slicetype decision code (lowres lookahead)
  3072. + *****************************************************************************
  3073. + * Copyright (C) 2012 x264 project
  3074. + *
  3075. + * Authors: Steve Borho <sborho@multicorewareinc.com>
  3076. + *
  3077. + * This program is free software; you can redistribute it and/or modify
  3078. + * it under the terms of the GNU General Public License as published by
  3079. + * the Free Software Foundation; either version 2 of the License, or
  3080. + * (at your option) any later version.
  3081. + *
  3082. + * This program is distributed in the hope that it will be useful,
  3083. + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  3084. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  3085. + * GNU General Public License for more details.
  3086. + *
  3087. + * You should have received a copy of the GNU General Public License
  3088. + * along with this program; if not, write to the Free Software
  3089. + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  3090. + *
  3091. + * This program is also available under a commercial proprietary license.
  3092. + * For more information, contact us at licensing@x264.com.
  3093. + *****************************************************************************/
  3094. +
  3095. +#define OCLCHECK( method, ... )\
  3096. +    status = method( __VA_ARGS__ ); if( status != CL_SUCCESS ) {\
  3097. +        x264_log( h, X264_LOG_ERROR, # method " error '%d'\n", status ); return status; }
  3098. +
  3099. +static void x264_opencl_flush( x264_t *h )
  3100. +{
  3101. +    clFinish( h->opencl.queue );
  3102. +
  3103. +    /* Finish copies from the GPU by copying from the page-locked buffer to
  3104. +     * their final destination
  3105. +     */
  3106. +    for( int i = 0; i < h->opencl.num_copies; i++ )
  3107. +        memcpy( h->opencl.copies[i].dest, h->opencl.copies[i].src, h->opencl.copies[i].bytes );
  3108. +    h->opencl.num_copies = 0;
  3109. +    h->opencl.pl_occupancy = 0;
  3110. +}
  3111. +
  3112. +static void *x264_opencl_alloc_locked( x264_t *h, int bytes )
  3113. +{
  3114. +    if( h->opencl.pl_occupancy + bytes >= PAGE_LOCKED_BUF_SIZE )
  3115. +        x264_opencl_flush( h );
  3116. +    assert( bytes < PAGE_LOCKED_BUF_SIZE );
  3117. +    char *ptr = h->opencl.page_locked_ptr + h->opencl.pl_occupancy;
  3118. +    h->opencl.pl_occupancy += bytes;
  3119. +    return ptr;
  3120. +}
  3121. +
/* Per-frame lowres/intra setup on the GPU: uploads the frame's luma plane,
 * produces the lowres image, downscale pyramid and HPEL image, then runs the
 * intra-cost and intra-rowsum kernels.  Results (intra costs, row SATDs, and
 * the first two ints of frame_stats into i_cost_est/i_cost_est_aq) are read
 * back asynchronously through the page-locked buffer and recorded in
 * h->opencl.copies[]; x264_opencl_flush() completes those copies.
 * Returns 0 on success, -1 on allocation failure, or a cl error code
 * propagated by OCLCHECK. */
static int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda )
{
    /* run at most once per frame */
    if( fenc->b_intra_calculated )
        return 0;
    fenc->b_intra_calculated = 1;

    int luma_length = fenc->i_stride[0] * fenc->i_lines[0];

/* Local helpers: create a buffer/2D image or log the cl error and bail out.
 * NOTE(review): only CREATEBUF is #undef'd at the end; CREATEIMAGE leaks past
 * this function — confirm that is intentional. */
#define CREATEBUF( out, flags, size )\
    out = clCreateBuffer( h->opencl.context, (flags), (size), NULL, &status );\
    if( status != CL_SUCCESS ) { x264_log( h, X264_LOG_ERROR, "clCreateBuffer error '%d'\n", status ); return -1; }
#define CREATEIMAGE( out, flags, pf, width, height )\
    out = clCreateImage2D( h->opencl.context, (flags), &pf, width, height, 0, NULL, &status );\
    if( status != CL_SUCCESS ) { x264_log( h, X264_LOG_ERROR, "clCreateImage2D error '%d'\n", status ); return -1; }

    int mb_count = h->mb.i_mb_count;
    cl_int status;

    if( !h->opencl.lowres_mv_costs )
    {
        /* Allocate shared memory buffers (once, on the first frame; keyed off
         * lowres_mv_costs being NULL) */
        int width = h->mb.i_mb_width * 8 * sizeof(pixel);
        int height = h->mb.i_mb_height * 8 * sizeof(pixel);

        cl_image_format pixel_format;
        pixel_format.image_channel_order = CL_R;
        pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
        CREATEIMAGE( h->opencl.weighted_luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );

        /* one RGBA image per pyramid level, halving dimensions each step */
        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
        {
            pixel_format.image_channel_order = CL_RGBA;
            pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
            CREATEIMAGE( h->opencl.weighted_scaled_images[i], CL_MEM_READ_WRITE, pixel_format, width, height );
            width >>= 1;
            height >>= 1;
        }

        CREATEBUF( h->opencl.lowres_mv_costs,  CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
        CREATEBUF( h->opencl.lowres_costs[0],  CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
        CREATEBUF( h->opencl.lowres_costs[1],  CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
        CREATEBUF( h->opencl.mv_buffers[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
        CREATEBUF( h->opencl.mv_buffers[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
        CREATEBUF( h->opencl.mvp_buffer,    CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
        CREATEBUF( h->opencl.frame_stats[0],   CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
        CREATEBUF( h->opencl.frame_stats[1],   CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
        CREATEBUF( h->opencl.row_satds[0],     CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
        CREATEBUF( h->opencl.row_satds[1],     CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
        CREATEBUF( h->opencl.luma_16x16_image[0], CL_MEM_READ_ONLY, luma_length );
        CREATEBUF( h->opencl.luma_16x16_image[1], CL_MEM_READ_ONLY, luma_length );
    }

    if( !fenc->opencl.intra_cost )
    {
        /* Allocate per-frame buffers (once per x264_frame_t) */
        int width = h->mb.i_mb_width * 8 * sizeof(pixel);
        int height = h->mb.i_mb_height * 8 * sizeof(pixel);

        cl_image_format pixel_format;
        pixel_format.image_channel_order = CL_R;
        pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
        CREATEIMAGE( fenc->opencl.luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );

        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
        {
            pixel_format.image_channel_order = CL_RGBA;
            pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
            CREATEIMAGE( fenc->opencl.scaled_image2Ds[i], CL_MEM_READ_WRITE, pixel_format, width, height );
            width >>= 1;
            height >>= 1;
        }
        CREATEBUF( fenc->opencl.inv_qscale_factor, CL_MEM_READ_ONLY,  mb_count * sizeof(int16_t) );
        CREATEBUF( fenc->opencl.intra_cost,        CL_MEM_WRITE_ONLY, mb_count * sizeof(int16_t) );
        CREATEBUF( fenc->opencl.lowres_mvs0,       CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
        CREATEBUF( fenc->opencl.lowres_mvs1,       CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
        CREATEBUF( fenc->opencl.lowres_mv_costs0,  CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
        CREATEBUF( fenc->opencl.lowres_mv_costs1,  CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
    }
#undef CREATEBUF

    /* Copy image to the GPU, downscale to unpadded 8x8, then continue for all scales */

    char *locked = x264_opencl_alloc_locked( h, luma_length );
    memcpy( locked, fenc->plane[0], luma_length );
    /* last_buf alternates 0/1 so uploads double-buffer across frames */
    OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue,  h->opencl.luma_16x16_image[h->opencl.last_buf], CL_FALSE, 0, luma_length, locked, 0, NULL, NULL );

    size_t gdim[2];
    if( h->param.rc.i_aq_mode && fenc->i_inv_qscale_factor )
    {
        /* AQ enabled: upload the per-MB inverse qscale factors */
        int size = h->mb.i_mb_count * sizeof(int16_t);
        locked = x264_opencl_alloc_locked( h, size );
        memcpy( locked, fenc->i_inv_qscale_factor, size );
        OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, fenc->opencl.inv_qscale_factor, CL_FALSE, 0, size, locked, 0, NULL, NULL );
    }
    else
    {
        /* Fill fenc->opencl.inv_qscale_factor with NOP (256) */
        cl_uint arg = 0;
        int16_t value = 256;
        OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
        OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(int16_t), &value );
        gdim[0] = h->mb.i_mb_count;
        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.memset_kernel, 1, NULL, gdim, NULL, 0, NULL, NULL );
    }

    /* Downscale to lowres and build the HPEL image in one kernel */
    int stride = fenc->i_stride[0];
    cl_uint arg = 0;
    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.luma_16x16_image[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.luma_hpel );
    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(int), &stride );
    gdim[0] = 8 * h->mb.i_mb_width;
    gdim[1] = 8 * h->mb.i_mb_height;
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.downscale_hpel_kernel, 2, NULL, gdim, NULL, 0, NULL, NULL );

    for( int i = 0; i < NUM_IMAGE_SCALES - 1; i++ )
    {
        /* Workaround for AMD Southern Island, alternate kernel instances.  No
         * perf impact to this, so we do it for all GPUs
         */
        cl_kernel kern = i & 1 ? h->opencl.downscale_kernel1 : h->opencl.downscale_kernel2;

        arg = 0;
        OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i] );
        OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i+1] );
        gdim[0] >>= 1;
        gdim[1] >>= 1;
        /* stop the pyramid once a level would be smaller than 16x16 */
        if( gdim[0] < 16 || gdim[1] < 16 )
            break;
        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, kern, 2, NULL, gdim, NULL, 0, NULL, NULL );
    }

    /* Intra SATD cost kernel: one work-group row per 32 MBs, local 32x8 */
    size_t ldim[2];
    gdim[0] = ((h->mb.i_mb_width + 31)>>5)<<5;
    gdim[1] = 8*h->mb.i_mb_height;
    ldim[0] = 32;
    ldim[1] = 8;
    arg = 0;
    int slow = h->param.analyse.i_subpel_refine > 7;
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &lambda );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &slow );
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );

    /* Intra row-sum reduction: one 256-wide work-group per MB row */
    gdim[0] = 256;
    gdim[1] = h->mb.i_mb_height;
    ldim[0] = 256;
    ldim[1] = 1;
    arg = 0;
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );

    /* Four copy records are queued below; flush first if they would not fit */
    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
        x264_opencl_flush( h );

    /* Async readback of per-MB intra costs */
    int size = h->mb.i_mb_count * sizeof(int16_t);
    locked = x264_opencl_alloc_locked( h, size );
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.intra_cost, CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[0][0];
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].bytes = size;
    h->opencl.num_copies++;

    /* Async readback of per-row SATD sums */
    size = h->mb.i_mb_height * sizeof(int);
    locked = x264_opencl_alloc_locked( h, size );
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[0][0];
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].bytes = size;
    h->opencl.num_copies++;

    /* Async readback of frame stats; first two ints feed i_cost_est and
     * i_cost_est_aq respectively */
    size = sizeof(int) * 4;
    locked = x264_opencl_alloc_locked( h, size );
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[0][0];
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
    h->opencl.num_copies++;
    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[0][0];
    h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
    h->opencl.num_copies++;

    /* swap double-buffered resources for the next frame */
    h->opencl.last_buf = !h->opencl.last_buf;
    return 0;
}
  3315. +
/* Choose local work dimensions (ldims, 2D) — and adjust the global ones
 * (gdims) — for launching `kernel` on `device`.  Starting from the device's
 * preferred work-group size multiple, the loops below ensure that:
 *   - ldims[1] evenly divides gdims[1],
 *   - ldims[0]*ldims[1] stays under the kernel's max work-group size,
 *   - gdims[0] becomes a multiple of ldims[0] (rounded up or down),
 *   - enough work-groups exist to occupy the device's compute units.
 * gdims[0] may be modified in place; gdims[1] is never changed. */
static void x264_optimal_launch_dims( size_t *gdims, size_t *ldims, const cl_kernel kernel, const cl_device_id device )
{
    size_t max_work_group = 256;    /* reasonable defaults for OpenCL 1.0 devices, below APIs may fail */
    size_t preferred_multiple = 64;
    cl_uint num_cus = 6;

    /* Query the real limits; return codes are deliberately ignored, leaving
     * the defaults above in effect on failure */
    clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group, NULL );
    clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &preferred_multiple, NULL );
    clGetDeviceInfo( device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &num_cus, NULL );

    ldims[0] = preferred_multiple;
    ldims[1] = 8;

    /* make ldims[1] an even divisor of gdims[1] */
    /* NOTE(review): the bit-mask test assumes ldims[1] stays a power of two;
     * total work-items are preserved by widening ldims[0] as ldims[1] shrinks */
    while( gdims[1] & (ldims[1] - 1) )
    {
        ldims[0] <<= 1;
        ldims[1] >>= 1;
    }
    /* make total ldims fit under the max work-group dimensions for the device */
    while( ldims[0] * ldims[1] > max_work_group )
    {
        if( (ldims[0] <= preferred_multiple) && (ldims[1] > 1) )
            ldims[1] >>= 1;
        else
            ldims[0] >>= 1;
    }

    if( ldims[0] > gdims[0] )
    {
        /* remove preferred multiples until we're close to gdims[0] */
        while( gdims[0] + preferred_multiple < ldims[0] )
            ldims[0] -= preferred_multiple;
        gdims[0] = ldims[0];
    }
    else
    {
        /* make gdims an even multiple of ldims */
        gdims[0] = (gdims[0]+ldims[0]-1)/ldims[0];
        gdims[0] *= ldims[0];
    }

    /* make ldims smaller to spread work across compute units */
    while( (gdims[0]/ldims[0]) * (gdims[1]/ldims[1]) * 2 <= num_cus )
    {
        if( ldims[0] > preferred_multiple )
            ldims[0] >>= 1;
        else if( ldims[1] > 1 )
            ldims[1] >>= 1;
        else
            break;
    }
    /* for smaller GPUs, try not to abuse their texture cache */
    if( num_cus == 6 && ldims[0] == 64 && ldims[1] == 4 )
        ldims[0] = 32;
}
  3372. +
  3373. +static int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w )
  3374. +{
  3375. +    x264_frame_t *fenc = frames[b];
  3376. +    x264_frame_t *fref = frames[ref];
  3377. +
  3378. +    cl_mem ref_scaled_images[NUM_IMAGE_SCALES];
  3379. +    cl_mem ref_luma_hpel;
  3380. +    cl_int status;
  3381. +
  3382. +    if( w && (w->i_denom) != 0 )
  3383. +    {
  3384. +        size_t gdims[2];
  3385. +
  3386. +        gdims[0] = 8 * h->mb.i_mb_width;
  3387. +        gdims[1] = 8 * h->mb.i_mb_height;
  3388. +
  3389. +        /* WeightP: Perform a filter on fref->opencl.scaled_image2Ds[] and fref->opencl.luma_hpel */
  3390. +        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
  3391. +        {
  3392. +            cl_uint arg = 0;
  3393. +            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &fref->opencl.scaled_image2Ds[i] );
  3394. +            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_scaled_images[i] );
  3395. +            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_offset );
  3396. +            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_scale );
  3397. +            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_denom );
  3398. +            OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_scaled_images_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );
  3399. +
  3400. +            gdims[0] >>= 1;
  3401. +            gdims[1] >>= 1;
  3402. +            if( gdims[0] < 16 || gdims[1] < 16 )
  3403. +                break;
  3404. +        }
  3405. +
  3406. +        cl_uint arg = 0;
  3407. +        gdims[0] = 8 * h->mb.i_mb_width;
  3408. +        gdims[1] = 8 * h->mb.i_mb_height;
  3409. +
  3410. +        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &fref->opencl.luma_hpel );
  3411. +        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_luma_hpel );
  3412. +        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_offset );
  3413. +        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_scale );
  3414. +        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_denom );
  3415. +        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_hpel_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );
  3416. +
  3417. +        /* Use weighted reference planes for motion search */
  3418. +        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
  3419. +            ref_scaled_images[i] = h->opencl.weighted_scaled_images[i];
  3420. +        ref_luma_hpel = h->opencl.weighted_luma_hpel;
  3421. +    }
  3422. +    else
  3423. +    {
  3424. +        /* Use unweighted reference planes for motion search */
  3425. +        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
  3426. +            ref_scaled_images[i] = fref->opencl.scaled_image2Ds[i];
  3427. +        ref_luma_hpel = fref->opencl.luma_hpel;
  3428. +    }
  3429. +
  3430. +    const int num_iterations[NUM_IMAGE_SCALES] = { 1, 1, 2, 3 };
  3431. +    int b_first_iteration = 1;
  3432. +    int b_reverse_references = 1;
  3433. +    int A = 1;
  3434. +
  3435. +
  3436. +    int mb_per_group = 0;
  3437. +    int cost_local_size = 0;
  3438. +    int mvc_local_size = 0;
  3439. +    int fenc_local_size = 0;
  3440. +    int mb_width;
  3441. +
  3442. +    size_t gdims[2];
  3443. +    size_t ldims[2];
  3444. +
  3445. +    /* scale 0 is 8x8 */
  3446. +    for( int scale = NUM_IMAGE_SCALES-1; scale >= 0; scale-- )
  3447. +    {
  3448. +        mb_width = h->mb.i_mb_width >> scale;
  3449. +        gdims[0] = mb_width;
  3450. +        gdims[1] = h->mb.i_mb_height >> scale;
  3451. +        if( gdims[0] < 2 || gdims[1] < 2 )
  3452. +            continue;
  3453. +        gdims[0] <<= 2;
  3454. +        x264_optimal_launch_dims( gdims, ldims, h->opencl.hme_kernel, h->opencl.device );
  3455. +
  3456. +        mb_per_group = (ldims[0] >> 2) * ldims[1];
  3457. +        cost_local_size = 4 * mb_per_group * sizeof(int16_t);
  3458. +        fenc_local_size = 16 * sizeof(cl_uint4) * mb_per_group;
  3459. +        mvc_local_size = 4 * mb_per_group * sizeof(int16_t) * 2;
  3460. +        int scaled_me_range = h->param.analyse.i_me_range >> scale;
  3461. +        int b_shift_index = 1;
  3462. +
  3463. +        cl_uint arg = 0;
  3464. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[scale] );
  3465. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &ref_scaled_images[scale] );
  3466. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
  3467. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
  3468. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
  3469. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), (void*)&h->opencl.mvp_buffer );
  3470. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, cost_local_size, NULL );
  3471. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, mvc_local_size, NULL );
  3472. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &mb_width );
  3473. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &lambda );
  3474. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scaled_me_range );
  3475. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scale );
  3476. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_shift_index );
  3477. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_first_iteration );
  3478. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_reverse_references );
  3479. +
  3480. +        for( int iter = 0; iter < num_iterations[scale]; iter++ )
  3481. +        {
  3482. +            OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.hme_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
  3483. +
  3484. +            b_shift_index = 0;
  3485. +            b_first_iteration = 0;
  3486. +            if( scale > 2 )
  3487. +                b_reverse_references = !b_reverse_references;
  3488. +            else
  3489. +                b_reverse_references = 0;
  3490. +            A = !A;
  3491. +            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 2, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
  3492. +            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 3, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
  3493. +            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 3, sizeof(int), &b_shift_index );
  3494. +            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 2, sizeof(int), &b_first_iteration );
  3495. +            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 1, sizeof(int), &b_reverse_references );
  3496. +        }
  3497. +    }
  3498. +
  3499. +    int satd_local_size = mb_per_group * sizeof(uint32_t) * 16;
  3500. +    cl_uint arg = 0;
  3501. +    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
  3502. +    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &ref_luma_hpel );
  3503. +    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
  3504. +    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
  3505. +    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, cost_local_size, NULL );
  3506. +    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, satd_local_size, NULL );
  3507. +    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, mvc_local_size, NULL );
  3508. +
  3509. +    if( b_islist1 )
  3510. +    {
  3511. +        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
  3512. +        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
  3513. +    }
  3514. +    else
  3515. +    {
  3516. +        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
  3517. +        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
  3518. +    }
  3519. +
  3520. +    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &mb_width );
  3521. +    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &lambda );
  3522. +    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b );
  3523. +    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &ref );
  3524. +    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b_islist1 );
  3525. +
  3526. +    if( h->opencl.b_device_AMD_SI )
  3527. +    {
   3528. +        /* workaround for AMD Southern Islands: perform a meaningless small copy */
  3529. +        OCLCHECK( clEnqueueCopyBuffer, h->opencl.queue, h->opencl.mv_buffers[A], h->opencl.mv_buffers[!A], 0, 0, 20, 0, NULL, NULL );
  3530. +    }
  3531. +
  3532. +    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.subpel_refine_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
  3533. +
  3534. +    int mvlen = 2 * sizeof(int16_t) * h->mb.i_mb_count;
  3535. +
  3536. +    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 1 )
  3537. +        x264_opencl_flush( h );
  3538. +
  3539. +    char *locked = x264_opencl_alloc_locked( h, mvlen );
  3540. +    h->opencl.copies[h->opencl.num_copies].src = locked;
  3541. +    h->opencl.copies[h->opencl.num_copies].bytes = mvlen;
  3542. +
  3543. +    if( b_islist1 )
  3544. +    {
  3545. +        int mvs_offset = mvlen * (ref - b - 1);
  3546. +        OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs1, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
  3547. +        h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[1][ref - b - 1];
  3548. +    }
  3549. +    else
  3550. +    {
  3551. +        int mvs_offset = mvlen * (b - ref - 1);
  3552. +        OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs0, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
  3553. +        h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[0][b - ref - 1];
  3554. +    }
  3555. +
  3556. +    h->opencl.num_copies++;
  3557. +
  3558. +    return 0;
  3559. +}
  3560. +
   3561. +static int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor ) /* queue mode-select + row-sum kernels and async readbacks for frame b's cost estimate */
   3562. +{
   3563. +    cl_int status; /* consumed by the OCLCHECK error-checking macro on every CL call below */
   3564. +    x264_frame_t *fenc = frames[b];   /* frame whose cost is being estimated */
   3565. +    x264_frame_t *fref0 = frames[p0]; /* list0 (past) reference */
   3566. +    x264_frame_t *fref1 = frames[p1]; /* list1 (future) reference */
   3567. +
   3568. +    int bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor >> 2) : 32; /* 32 == equal bidir weighting */
   3569. +
   3570. +    /* Tasks for this kernel:
   3571. +     * 1. Select least cost mode (intra, ref0, ref1)
   3572. +     *    list_used 0, 1, 2, or 3.  if B frame, do not allow intra
   3573. +     * 2. if B frame, try bidir predictions.
   3574. +     * 3. lowres_costs[i_mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT);
   3575. +     */
   3576. +    size_t gdims[2] = { h->mb.i_mb_width, h->mb.i_mb_height }; /* default: one work-item per MB */
   3577. +    size_t ldim_bidir[2];
   3578. +    size_t *ldims = NULL; /* NULL local size lets the driver choose (non-B path) */
   3579. +    int cost_local_size = 4; /* placeholder sizes; enlarged below when the BIDIR path needs local memory */
   3580. +    int satd_local_size = 4;
   3581. +    if( b < p1 )
   3582. +    {
   3583. +        /* For B frames, use 4 threads per MB for BIDIR checks */
   3584. +        ldims = ldim_bidir;
   3585. +        gdims[0] <<= 2; /* 4 work-items per MB in the x dimension */
   3586. +        x264_optimal_launch_dims( gdims, ldims, h->opencl.mode_select_kernel, h->opencl.device );
   3587. +        int mb_per_group = (ldims[0] >> 2) * ldims[1];
   3588. +        cost_local_size = 4 * mb_per_group * sizeof(int16_t);
   3589. +        satd_local_size = 16 * mb_per_group * sizeof(uint32_t);
   3590. +    }
   3591. +
   3592. +    cl_uint arg = 0; /* argument order below must match the kernel signature in common/opencl */
   3593. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
   3594. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref0->opencl.luma_hpel );
   3595. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.luma_hpel );
   3596. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
   3597. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
   3598. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.lowres_mvs0 );
   3599. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
   3600. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
   3601. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
   3602. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
   3603. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
   3604. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, cost_local_size, NULL ); /* NULL data: local (scratch) memory allocation */
   3605. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, satd_local_size, NULL );
   3606. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
   3607. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &bipred_weight );
   3608. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &dist_scale_factor );
   3609. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &b );
   3610. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p0 );
   3611. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p1 );
   3612. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &lambda );
   3613. +    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.mode_select_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
   3614. +
   3615. +    /* Sum costs across rows, atomicAdd down frame */
   3616. +    size_t gdim[2] = { 256, h->mb.i_mb_height }; /* one 256-wide work-group per MB row */
   3617. +    size_t ldim[2] = { 256, 1 };
   3618. +
   3619. +    arg = 0;
   3620. +    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
   3621. +    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
   3622. +    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
   3623. +    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
   3624. +    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
   3625. +    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->param.i_bframe_bias );
   3626. +    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &b );
   3627. +    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p0 );
   3628. +    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p1 );
   3629. +    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_inter_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
   3630. +
   3631. +    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 ) /* up to 4 more copy slots are consumed below */
   3632. +        x264_opencl_flush( h );
   3633. +
   3634. +    int size =  h->mb.i_mb_count * sizeof(int16_t);
   3635. +    char *locked = x264_opencl_alloc_locked( h, size );
   3636. +    h->opencl.copies[h->opencl.num_copies].src = locked; /* per-MB lowres costs back to the CPU-side array */
   3637. +    h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[b - p0][p1 - b];
   3638. +    h->opencl.copies[h->opencl.num_copies].bytes = size;
   3639. +    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.lowres_costs[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL ); /* CL_FALSE: non-blocking; valid after flush */
   3640. +    h->opencl.num_copies++;
   3641. +
   3642. +    size =  h->mb.i_mb_height * sizeof(int);
   3643. +    locked = x264_opencl_alloc_locked( h, size );
   3644. +    h->opencl.copies[h->opencl.num_copies].src = locked; /* per-row SATD sums */
   3645. +    h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[b - p0][p1 - b];
   3646. +    h->opencl.copies[h->opencl.num_copies].bytes = size;
   3647. +    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
   3648. +    h->opencl.num_copies++;
   3649. +
   3650. +    size =  4 * sizeof(int); /* frame_stats holds 4 ints; [0..2] are unpacked below */
   3651. +    locked = x264_opencl_alloc_locked( h, size );
   3652. +    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
   3653. +    h->opencl.last_buf = !h->opencl.last_buf; /* ping-pong between the two staging buffer sets */
   3654. +
   3655. +    h->opencl.copies[h->opencl.num_copies].src = locked; /* frame_stats[0]: summed frame cost */
   3656. +    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[b - p0][p1 - b];
   3657. +    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
   3658. +    h->opencl.num_copies++;
   3659. +    h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int); /* frame_stats[1]: AQ-weighted cost */
   3660. +    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[b - p0][p1 - b];
   3661. +    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
   3662. +    h->opencl.num_copies++;
   3663. +
   3664. +    if( b == p1 ) // P frames only
   3665. +    {
   3666. +        h->opencl.copies[h->opencl.num_copies].src = locked + 2 * sizeof(int); /* frame_stats[2]: intra MB count */
   3667. +        h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_intra_mbs[b - p0];
   3668. +        h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
   3669. +        h->opencl.num_copies++;
   3670. +    }
   3671. +    return 0;
   3672. +}
  3673. +
   3674. +static int x264_opencl_precalculate_frame_cost( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int b ) /* returns 1 if GPU work was queued, 0 if the cost was already computed */
   3675. +{
   3676. +    if( (frames[b]->i_cost_est[b-p0][p1-b] >= 0) || (b == p0 && b == p1) ) /* cached result, or intra-only (no references) */
   3677. +        return 0;
   3678. +    else
   3679. +    {
   3680. +        int do_search[2]; /* [0]=list0 (past), [1]=list1 (future) */
   3681. +        int dist_scale_factor = 128; /* 1/256 fixed point; 128 == midpoint */
   3682. +        const x264_weight_t *w = x264_weight_none;
   3683. +
   3684. +        // avoid duplicating work
   3685. +        frames[b]->i_cost_est[b-p0][p1-b] = 0;
   3686. +
   3687. +        do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF; /* 0x7FFF: presumably the 'not yet searched' sentinel used by the CPU lookahead — verify against slicetype.c */
   3688. +        do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
   3689. +        if( do_search[0] )
   3690. +        {
   3691. +            if( h->param.analyse.i_weighted_pred && b == p1 ) /* weightp only applies to P frames */
   3692. +            {
   3693. +                x264_emms(); /* clear MMX state before float math in weights analysis */
   3694. +                x264_weights_analyse( h, frames[b], frames[p0], 1 );
   3695. +                w = frames[b]->weight[0];
   3696. +            }
   3697. +            frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0; /* clear sentinel so this search is not repeated */
   3698. +        }
   3699. +        if( do_search[1] )
   3700. +            frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
   3701. +        if( b == p1 )
   3702. +            frames[b]->i_intra_mbs[b-p0] = 0;
   3703. +        if( p1 != p0 )
   3704. +            dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0); /* rounded (b-p0)/(p1-p0) in 1/256 units */
   3705. +
   3706. +        frames[b]->i_cost_est[b-p0][p1-b] = 0;
   3707. +        frames[b]->i_cost_est_aq[b-p0][p1-b] = 0;
   3708. +
   3709. +        x264_opencl_lowres_init( h, frames[b], a->i_lambda ); /* presumably uploads/initializes fenc's lowres data on the GPU — see slicetype-cl.c */
   3710. +
   3711. +        if( do_search[0] )
   3712. +        {
   3713. +            x264_opencl_lowres_init( h, frames[p0], a->i_lambda ); /* references must also be resident before motion search */
   3714. +            x264_opencl_motionsearch( h, frames, b, p0, 0, a->i_lambda, w );
   3715. +        }
   3716. +        if( do_search[1] )
   3717. +        {
   3718. +            x264_opencl_lowres_init( h, frames[p1], a->i_lambda );
   3719. +            x264_opencl_motionsearch( h, frames, b, p1, 1, a->i_lambda, w );
   3720. +        }
   3721. +        x264_opencl_finalize_cost( h, a->i_lambda, frames, p0, p1, b, dist_scale_factor ); /* queues kernels + async readbacks; results valid after x264_opencl_flush() */
   3722. +        return 1;
   3723. +    }
   3724. +}
  3725. diff -r 087c10a1fec7 -r 9a682cb0c74f encoder/slicetype.c
  3726. --- a/encoder/slicetype.c       Thu Mar 29 14:14:07 2012 -0700
  3727. +++ b/encoder/slicetype.c       Tue May 15 18:05:00 2012 -0500
  3728. @@ -36,6 +36,15 @@
  3729.                                        x264_frame_t **frames, int p0, int p1, int b,
  3730.                                        int b_intra_penalty );
  3731.  
  3732. +static void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead );
  3733. +
  3734. +#if HAVE_OPENCL
  3735. +#ifdef _WIN32
  3736. +#include "windows.h"
  3737. +#endif
  3738. +#include "slicetype-cl.c"
  3739. +#endif
  3740. +
  3741.  static void x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a )
  3742.  {
  3743.      a->i_qp = X264_LOOKAHEAD_QP;
  3744. @@ -748,38 +757,62 @@
  3745.          frames[b]->i_cost_est[b-p0][p1-b] = 0;
  3746.          frames[b]->i_cost_est_aq[b-p0][p1-b] = 0;
  3747.  
  3748. -        /* Lowres lookahead goes backwards because the MVs are used as predictors in the main encode.
  3749. -         * This considerably improves MV prediction overall. */
  3750. +#if HAVE_OPENCL
  3751. +        if( h->param.b_opencl )
  3752. +        {
  3753. +            x264_opencl_lowres_init(h, frames[b], a->i_lambda );
  3754. +            if( do_search[0] )
  3755. +            {
  3756. +                x264_opencl_lowres_init( h, frames[p0], a->i_lambda );
  3757. +                x264_opencl_motionsearch( h, frames, b, p0, 0, a->i_lambda, w );
  3758. +            }
  3759. +            if( do_search[1] )
  3760. +            {
  3761. +                x264_opencl_lowres_init( h, frames[p1], a->i_lambda );
  3762. +                x264_opencl_motionsearch( h, frames, b, p1, 1, a->i_lambda, w );
  3763. +            }
  3764. +            if( b != p0 )
  3765. +                x264_opencl_finalize_cost( h, a->i_lambda, frames, p0, p1, b, dist_scale_factor );
  3766. +            x264_opencl_flush( h );
  3767.  
  3768. -        /* The edge mbs seem to reduce the predictive quality of the
  3769. -         * whole frame's score, but are needed for a spatial distribution. */
  3770. -        if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size ||
  3771. -            h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2 )
  3772. -        {
  3773. -            for( h->mb.i_mb_y = h->mb.i_mb_height - 1; h->mb.i_mb_y >= 0; h->mb.i_mb_y-- )
  3774. -            {
  3775. -                row_satd[h->mb.i_mb_y] = 0;
  3776. -                if( !frames[b]->b_intra_calculated )
  3777. -                    row_satd_intra[h->mb.i_mb_y] = 0;
  3778. -                for( h->mb.i_mb_x = h->mb.i_mb_width - 1; h->mb.i_mb_x >= 0; h->mb.i_mb_x-- )
  3779. -                    x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
  3780. -            }
  3781. +            i_score = frames[b]->i_cost_est[b-p0][p1-b];
  3782.          }
  3783.          else
  3784. +#endif
  3785.          {
  3786. -            for( h->mb.i_mb_y = h->mb.i_mb_height - 2; h->mb.i_mb_y >= 1; h->mb.i_mb_y-- )
  3787. -                for( h->mb.i_mb_x = h->mb.i_mb_width - 2; h->mb.i_mb_x >= 1; h->mb.i_mb_x-- )
  3788. -                    x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
  3789. +            /* Lowres lookahead goes backwards because the MVs are used as predictors in
  3790. +             * the main encode. This considerably improves MV prediction overall. */
  3791. +
  3792. +            /* The edge mbs seem to reduce the predictive quality of the
  3793. +             * whole frame's score, but are needed for a spatial distribution. */
  3794. +            if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size ||
  3795. +                    h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2 )
  3796. +            {
  3797. +                for( h->mb.i_mb_y = h->mb.i_mb_height - 1; h->mb.i_mb_y >= 0; h->mb.i_mb_y-- )
  3798. +                {
  3799. +                    row_satd[h->mb.i_mb_y] = 0;
  3800. +                    if( !frames[b]->b_intra_calculated )
  3801. +                        row_satd_intra[h->mb.i_mb_y] = 0;
  3802. +                    for( h->mb.i_mb_x = h->mb.i_mb_width - 1; h->mb.i_mb_x >= 0; h->mb.i_mb_x-- )
  3803. +                        x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
  3804. +                }
  3805. +            }
  3806. +            else
  3807. +            {
  3808. +                for( h->mb.i_mb_y = h->mb.i_mb_height - 2; h->mb.i_mb_y >= 1; h->mb.i_mb_y-- )
  3809. +                    for( h->mb.i_mb_x = h->mb.i_mb_width - 2; h->mb.i_mb_x >= 1; h->mb.i_mb_x-- )
  3810. +                        x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
  3811. +            }
  3812. +
  3813. +            i_score = frames[b]->i_cost_est[b-p0][p1-b];
  3814. +            if( b != p1 )
  3815. +                i_score = (uint64_t)i_score * 100 / (120 + h->param.i_bframe_bias);
  3816. +            else
  3817. +                frames[b]->b_intra_calculated = 1;
  3818. +
  3819. +            frames[b]->i_cost_est[b-p0][p1-b] = i_score;
  3820. +            x264_emms();
  3821.          }
  3822. -
  3823. -        i_score = frames[b]->i_cost_est[b-p0][p1-b];
  3824. -        if( b != p1 )
  3825. -            i_score = (uint64_t)i_score * 100 / (120 + h->param.i_bframe_bias);
  3826. -        else
  3827. -            frames[b]->b_intra_calculated = 1;
  3828. -
  3829. -        frames[b]->i_cost_est[b-p0][p1-b] = i_score;
  3830. -        x264_emms();
  3831.      }
  3832.  
  3833.      if( b_intra_penalty )
  3834. @@ -1344,6 +1377,61 @@
  3835.          return;
  3836.      }
  3837.  
  3838. +#if HAVE_OPENCL
  3839. +#ifdef _WIN32
  3840. +    int my_pri = THREAD_PRIORITY_NORMAL, ocl_pri = THREAD_PRIORITY_NORMAL;
  3841. +#endif
  3842. +
  3843. +    if( h->param.b_opencl )
  3844. +    {
  3845. +#ifdef _WIN32
  3846. +#define CL_QUEUE_THREAD_HANDLE_AMD 0x403E
  3847. +        HANDLE id = GetCurrentThread();
  3848. +        my_pri = GetThreadPriority( id );
  3849. +        SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
  3850. +        cl_int status = clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
  3851. +        if( status == CL_SUCCESS )
  3852. +        {
  3853. +            ocl_pri = GetThreadPriority( id );
  3854. +            SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
  3855. +        }
  3856. +#endif
  3857. +
  3858. +        int b_work_done = 0;
  3859. +
  3860. +        /* precalculate intra and I frames */
  3861. +        for( int i = 0; i <= num_frames; i++ )
  3862. +        {
  3863. +            b_work_done |= x264_opencl_lowres_init( h, frames[i], a.i_lambda );
  3864. +        }
  3865. +        if( b_work_done && h->param.analyse.i_weighted_pred )
  3866. +        {
  3867. +            /* weightp estimation requires intra costs to be on the CPU */
  3868. +            x264_opencl_flush( h );
  3869. +        }
  3870. +
  3871. +        if( h->param.i_bframe && h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS )
  3872. +        {
  3873. +            /* Precalculate motion searches in both directions */
  3874. +            for( int i = 0; i < num_frames; i++ )
  3875. +            {
  3876. +                for (int b = 0; b < h->param.i_bframe; b++)
  3877. +                {
  3878. +                    if (i <= b)
  3879. +                    {
  3880. +                        break;
  3881. +                    }
  3882. +
  3883. +                    b_work_done |= x264_opencl_precalculate_frame_cost( h, &a, frames, i, i-b-1, i-b-1 );
  3884. +                    b_work_done |= x264_opencl_precalculate_frame_cost( h, &a, frames, i-b-1, i, i );
  3885. +                }
  3886. +            }
  3887. +        }
  3888. +        if( b_work_done )
  3889. +            x264_opencl_flush( h );
  3890. +    }
  3891. +#endif
  3892. +
  3893.      if( h->param.i_bframe )
  3894.      {
  3895.          if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS )
  3896. @@ -1377,6 +1465,18 @@
  3897.                      continue;
  3898.                  }
  3899.  
  3900. +#if HAVE_OPENCL
  3901. +                if( h->param.b_opencl )
  3902. +                {
  3903. +                    int b_work_done = 0;
  3904. +                    b_work_done |= x264_opencl_precalculate_frame_cost(h, &a, frames, i+0, i+2, i+1 );
  3905. +                    b_work_done |= x264_opencl_precalculate_frame_cost(h, &a, frames, i+0, i+1, i+1 );
  3906. +                    b_work_done |= x264_opencl_precalculate_frame_cost(h, &a, frames, i+1, i+2, i+2 );
  3907. +                    if( b_work_done )
  3908. +                        x264_opencl_flush( h );
  3909. +                }
  3910. +#endif
  3911. +
  3912.                  cost1b1 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+2, i+1, 0 );
  3913.                  cost1p0 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+1, i+1, 0 );
  3914.                  cost2p0 = x264_slicetype_frame_cost( h, &a, frames, i+1, i+2, i+2, 0 );
  3915. @@ -1459,6 +1559,19 @@
  3916.      /* Restore frametypes for all frames that haven't actually been decided yet. */
  3917.      for( int j = reset_start; j <= num_frames; j++ )
  3918.          frames[j]->i_type = X264_TYPE_AUTO;
  3919. +
  3920. +#ifdef _WIN32
  3921. +#if HAVE_OPENCL
  3922. +    if( h->param.b_opencl )
  3923. +    {
  3924. +        HANDLE id = GetCurrentThread();
  3925. +        SetThreadPriority( id, my_pri );
  3926. +        cl_int status = clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
  3927. +        if( status == CL_SUCCESS )
  3928. +            SetThreadPriority( id, ocl_pri );
  3929. +    }
  3930. +#endif
  3931. +#endif
  3932.  }
  3933.  
  3934.  void x264_slicetype_decide( x264_t *h )
  3935. diff -r 087c10a1fec7 -r 9a682cb0c74f x264.c
  3936. --- a/x264.c    Thu Mar 29 14:14:07 2012 -0700
  3937. +++ b/x264.c    Tue May 15 18:05:00 2012 -0500
  3938. @@ -805,6 +805,8 @@
  3939.          "                                  as opposed to letting them select different algorithms\n" );
  3940.      H2( "      --asm <integer>         Override CPU detection\n" );
  3941.      H2( "      --no-asm                Disable all CPU optimizations\n" );
  3942. +    H2( "      --no-opencl             Disable use of OpenCL\n" );
  3943. +    H2( "      --clbin-file <string>   Specify path of compiled OpenCL kernel cache\n" );
  3944.      H2( "      --visualize             Show MB types overlayed on the encoded video\n" );
  3945.      H2( "      --dump-yuv <string>     Save reconstructed frames\n" );
  3946.      H2( "      --sps-id <integer>      Set SPS and PPS id numbers [%d]\n", defaults->i_sps_id );
  3947. @@ -909,6 +911,8 @@
  3948.      { "ref",         required_argument, NULL, 'r' },
  3949.      { "asm",         required_argument, NULL, 0 },
  3950.      { "no-asm",            no_argument, NULL, 0 },
  3951. +    { "no-opencl",         no_argument, NULL, 0 },
  3952. +    { "clbin-file",  required_argument, NULL, 0 },
  3953.      { "sar",         required_argument, NULL, 0 },
  3954.      { "fps",         required_argument, NULL, OPT_FPS },
  3955.      { "frames",      required_argument, NULL, OPT_FRAMES },
  3956. diff -r 087c10a1fec7 -r 9a682cb0c74f x264.h
  3957. --- a/x264.h    Thu Mar 29 14:14:07 2012 -0700
  3958. +++ b/x264.h    Tue May 15 18:05:00 2012 -0500
  3959. @@ -462,6 +462,9 @@
  3960.  
  3961.      int b_fake_interlaced;
  3962.  
  3963. +    int b_opencl;            /* use OpenCL when available */
  3964. +    char *psz_clbin_file;    /* compiled OpenCL kernel cache file */
  3965. +
  3966.      /* Slicing parameters */
  3967.      int i_slice_max_size;    /* Max size per slice in bytes; includes estimated NAL overhead. */
  3968.      int i_slice_max_mbs;     /* Max number of MBs per slice; overrides i_slice_count. */
RAW Paste Data
Top