Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # HG changeset patch
- # User Steve Borho <steve@borho.org>
- # Date 1337123100 18000
- # Node ID 9a682cb0c74fbf0ef19ef7eaa04b20fc0e27d360
- # Parent 087c10a1fec707bc7c53e91f5efeb3567f0bf77f
- OpenCL lookahead
- diff -r 087c10a1fec7 -r 9a682cb0c74f Makefile
- --- a/Makefile Thu Mar 29 14:14:07 2012 -0700
- +++ b/Makefile Tue May 15 18:05:00 2012 -0500
- @@ -8,6 +8,8 @@
- vpath %.asm $(SRCPATH)
- vpath %.rc $(SRCPATH)
- +GENERATED =
- +
- all: default
- default:
- @@ -145,6 +147,36 @@
- endif
- endif
- +QUOTED_CFLAGS := $(CFLAGS)
- +
- +ifeq ($(HAVE_OPENCL),yes)
- +empty:=
- +space:=$(empty) $(empty)
- +escaped:=\ $(empty)
- +open:=(
- +escopen:=\(
- +close:=)
- +escclose:=\)
- +SAFE_INC_DIR := $(subst $(space),$(escaped),$(OPENCL_INC_DIR))
- +SAFE_INC_DIR := $(subst $(open),$(escopen),$(SAFE_INC_DIR))
- +SAFE_INC_DIR := $(subst $(close),$(escclose),$(SAFE_INC_DIR))
- +SAFE_LIB_DIR := $(subst $(space),$(escaped),$(OPENCL_LIB_DIR))
- +SAFE_LIB_DIR := $(subst $(open),$(escopen),$(SAFE_LIB_DIR))
- +SAFE_LIB_DIR := $(subst $(close),$(escclose),$(SAFE_LIB_DIR))
- +# For normal CFLAGS and LDFLAGS, we must escape spaces with a backslash to
- +# make gcc happy
- +CFLAGS += -I$(SAFE_INC_DIR)
- +LDFLAGS += -l$(OPENCL_LIB) -L$(SAFE_LIB_DIR)
- +# For the CFLAGS used by the .depend rule, we must add quotes because
- +# the rule does an extra level of shell expansions
- +QUOTED_CFLAGS += -I"$(OPENCL_INC_DIR)"
- +encoder/oclobj.h: common/opencl/x264-cl.h $(wildcard common/opencl/*.cl)
- + echo "static const char x264_opencl_source [] = {" > $@
- + cat $^ | xxd -i >> $@
- + echo ",0x00 };" >> $@
- +GENERATED += encoder/oclobj.h
- +endif
- +
- OBJS += $(SRCS:%.c=%.o)
- OBJCLI += $(SRCCLI:%.c=%.o)
- OBJSO += $(SRCSO:%.c=%.o)
- @@ -164,17 +196,22 @@
- $(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
- ifneq ($(EXE),)
- -.PHONY: x264 checkasm
- +.PHONY: x264 checkasm test-opencl
- x264: x264$(EXE)
- checkasm: checkasm$(EXE)
- +test-opencl: test-opencl$(EXE)
- endif
- -x264$(EXE): .depend $(OBJCLI) $(CLI_LIBX264)
- +x264$(EXE): $(GENERATED) .depend $(OBJCLI) $(CLI_LIBX264)
- $(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS)
- -checkasm$(EXE): .depend $(OBJCHK) $(LIBX264)
- +checkasm$(EXE): $(GENERATED) .depend $(OBJCHK) $(LIBX264)
- $(LD)$@ $(OBJCHK) $(LIBX264) $(LDFLAGS)
- +OBJOCL = tools/test-opencl.o
- +test-opencl$(EXE): .depend ${OBJOCL} $(LIBX264)
- + $(LD)$@ $(OBJOCL) $(LIBX264) $(LDFLAGS)
- +
- $(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK): .depend
- %.o: %.asm
- @@ -193,7 +230,7 @@
- .depend: config.mak
- @rm -f .depend
- - @$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(CC) $(CFLAGS) $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.c=%.o) $(DEPMM) 1>> .depend;)
- + @$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(CC) $(QUOTED_CFLAGS) $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.c=%.o) $(DEPMM) 1>> .depend;)
- config.mak:
- ./configure
- @@ -231,7 +268,7 @@
- clean:
- rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(SONAME) *.a *.lib *.exp *.pdb x264 x264.exe .depend TAGS
- - rm -f checkasm checkasm.exe $(OBJCHK)
- + rm -f checkasm checkasm.exe $(OBJCHK) $(GENERATED) x264_lookahead.clbin
- rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock
- distclean: clean
- diff -r 087c10a1fec7 -r 9a682cb0c74f common/common.c
- --- a/common/common.c Thu Mar 29 14:14:07 2012 -0700
- +++ b/common/common.c Tue May 15 18:05:00 2012 -0500
- @@ -170,6 +170,10 @@
- param->b_pic_struct = 0;
- param->b_fake_interlaced = 0;
- param->i_frame_packing = -1;
- +#if HAVE_OPENCL
- + param->b_opencl = 1;
- + param->psz_clbin_file = NULL;
- +#endif
- }
- static int x264_param_apply_preset( x264_param_t *param, const char *preset )
- @@ -1013,6 +1017,10 @@
- p->b_fake_interlaced = atobool(value);
- OPT("frame-packing")
- p->i_frame_packing = atoi(value);
- + OPT("opencl")
- + p->b_opencl = atobool( value );
- + OPT("clbin-file")
- + p->psz_clbin_file = strdup( value );
- else
- return X264_PARAM_BAD_NAME;
- #undef OPT
- diff -r 087c10a1fec7 -r 9a682cb0c74f common/common.h
- --- a/common/common.h Thu Mar 29 14:14:07 2012 -0700
- +++ b/common/common.h Tue May 15 18:05:00 2012 -0500
- @@ -92,6 +92,11 @@
- #include <assert.h>
- #include <limits.h>
- +#if HAVE_OPENCL
- +# define NUM_IMAGE_SCALES 4
- +# include "CL/cl.h"
- +#endif
- +
- #if HAVE_INTERLACED
- # define MB_INTERLACED h->mb.b_interlaced
- # define SLICE_MBAFF h->sh.b_mbaff
- @@ -943,6 +948,65 @@
- struct visualize_t *visualize;
- #endif
- x264_lookahead_t *lookahead;
- +
- +#if HAVE_OPENCL
- +#define MAX_FINISH_COPIES 1024
- +#define PAGE_LOCKED_BUF_SIZE 32 * 1024 * 1024
- + struct
- + {
- + cl_context context;
- + cl_device_id device;
- + cl_command_queue queue;
- +
- + cl_program lookahead_program;
- + cl_int last_buf;
- +
- + cl_mem page_locked_buffer;
- + char *page_locked_ptr;
- + int pl_occupancy;
- +
- + struct
- + {
- + void *src;
- + void *dest;
- + int bytes;
- + } copies[MAX_FINISH_COPIES];
- + int num_copies;
- +
- + int b_device_AMD_SI;
- +
- + /* downscale lowres luma */
- + cl_kernel downscale_hpel_kernel;
- + cl_kernel downscale_kernel1;
- + cl_kernel downscale_kernel2;
- + cl_mem luma_16x16_image[2];
- +
- + /* weightp filtering */
- + cl_kernel weightp_hpel_kernel;
- + cl_kernel weightp_scaled_images_kernel;
- + cl_mem weighted_scaled_images[NUM_IMAGE_SCALES];
- + cl_mem weighted_luma_hpel;
- +
- + /* intra */
- + cl_kernel memset_kernel;
- + cl_kernel intra_kernel;
- + cl_kernel rowsum_intra_kernel;
- + cl_mem row_satds[2];
- +
- + /* hierarchical motion estimation */
- + cl_kernel hme_kernel;
- + cl_kernel subpel_refine_kernel;
- + cl_mem mv_buffers[2];
- + cl_mem lowres_mv_costs;
- + cl_mem mvp_buffer;
- +
- + /* bidir */
- + cl_kernel mode_select_kernel;
- + cl_kernel rowsum_inter_kernel;
- + cl_mem lowres_costs[2];
- + cl_mem frame_stats[2]; /* cost_est, cost_est_aq, intra_mbs */
- + } opencl;
- +#endif
- };
- // included at the end because it needs x264_t
- diff -r 087c10a1fec7 -r 9a682cb0c74f common/frame.c
- --- a/common/frame.c Thu Mar 29 14:14:07 2012 -0700
- +++ b/common/frame.c Tue May 15 18:05:00 2012 -0500
- @@ -258,6 +258,23 @@
- return NULL;
- }
- +#if HAVE_OPENCL
- +static void x264_opencl_frame_delete( x264_frame_t *frame )
- +{
- +#define RELEASEBUF(mem) if( mem ) clReleaseMemObject( mem );
- + for( int j = 0; j < NUM_IMAGE_SCALES; j++ )
- + RELEASEBUF( frame->opencl.scaled_image2Ds[j] );
- + RELEASEBUF( frame->opencl.luma_hpel );
- + RELEASEBUF( frame->opencl.inv_qscale_factor );
- + RELEASEBUF( frame->opencl.intra_cost );
- + RELEASEBUF( frame->opencl.lowres_mvs0 );
- + RELEASEBUF( frame->opencl.lowres_mvs1 );
- + RELEASEBUF( frame->opencl.lowres_mv_costs0 );
- + RELEASEBUF( frame->opencl.lowres_mv_costs1 );
- +#undef RELEASEBUF
- +}
- +#endif
- +
- void x264_frame_delete( x264_frame_t *frame )
- {
- /* Duplicate frames are blank copies of real frames (including pointers),
- @@ -302,6 +319,9 @@
- x264_free( frame->ref[1] );
- x264_pthread_mutex_destroy( &frame->mutex );
- x264_pthread_cond_destroy( &frame->cv );
- +#if HAVE_OPENCL
- + x264_opencl_frame_delete( frame );
- +#endif
- }
- x264_free( frame );
- }
- diff -r 087c10a1fec7 -r 9a682cb0c74f common/frame.h
- --- a/common/frame.h Thu Mar 29 14:14:07 2012 -0700
- +++ b/common/frame.h Tue May 15 18:05:00 2012 -0500
- @@ -170,6 +170,20 @@
- /* user frame properties */
- uint8_t *mb_info;
- void (*mb_info_free)( void* );
- +
- +#if HAVE_OPENCL
- + struct
- + {
- + cl_mem scaled_image2Ds[NUM_IMAGE_SCALES];
- + cl_mem luma_hpel;
- + cl_mem inv_qscale_factor;
- + cl_mem intra_cost;
- + cl_mem lowres_mvs0;
- + cl_mem lowres_mvs1;
- + cl_mem lowres_mv_costs0;
- + cl_mem lowres_mv_costs1;
- + } opencl;
- +#endif
- } x264_frame_t;
- /* synchronized frame list */
- diff -r 087c10a1fec7 -r 9a682cb0c74f common/opencl/bidir.cl
- --- /dev/null Thu Jan 01 00:00:00 1970 +0000
- +++ b/common/opencl/bidir.cl Tue May 15 18:05:00 2012 -0500
- @@ -0,0 +1,254 @@
- +
- +/* Four threads cooperatively measure 8x8 BIDIR cost with SATD */
- +int bidir_satd_8x8_ii_coop4(
- + read_only image2d_t fenc_lowres, int2 fencpos,
- + read_only image2d_t fref0_planes, int2 qpos0,
- + read_only image2d_t fref1_planes, int2 qpos1,
- + int weight,
- + local sum2_t *tmpp,
- + int idx )
- +{
- + volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp;
- + sum2_t b0, b1, b2, b3;
- + sum2_t sum = 0;
- +
- + // fencpos is full-pel position of original MB
- + // qpos0 is qpel position within reference frame 0
- + // qpos1 is qpel position within reference frame 1
- +
- + int2 fref0Apos = (int2)(qpos0.x>>2, qpos0.y>>2);
- + int hpel0A = ((qpos0.x&2)>>1) + (qpos0.y&2);
- +
- + int2 qpos0B = (int2)qpos0 + (int2)(((qpos0.x&1)<<1), ((qpos0.y&1)<<1));
- + int2 fref0Bpos = (int2)(qpos0B.x>>2, qpos0B.y>>2);
- + int hpel0B = ((qpos0B.x&2)>>1) + (qpos0B.y&2);
- +
- + int2 fref1Apos = (int2)(qpos1.x>>2, qpos1.y>>2);
- + int hpel1A = ((qpos1.x&2)>>1) + (qpos1.y&2);
- +
- + int2 qpos1B = (int2)qpos1 + (int2)(((qpos1.x&1)<<1), ((qpos1.y&1)<<1));
- + int2 fref1Bpos = (int2)(qpos1B.x>>2, qpos1B.y>>2);
- + int hpel1B = ((qpos1B.x&2)>>1) + (qpos1B.y&2);
- +
- + uint mask_shift0A = 8 * (3 - hpel0A), mask_shift0B = 8 * (3 - hpel0B);
- + uint mask0A = 0x000000ff << mask_shift0A, mask0B = 0x000000ff << mask_shift0B;
- + uint mask_shift1A = 8 * (3 - hpel1A), mask_shift1B = 8 * (3 - hpel1B);
- + uint mask1A = 0x000000ff << mask_shift1A, mask1B = 0x000000ff << mask_shift1B;
- +
- + uint vA, vB;
- + uint enc, ref0, ref1;
- + uint a0, a1;
- + const int weight2 = 64 - weight;
- +
- +#define READ_BIDIR_DIFF( OUT, X )\
- + enc = read_imageui( fenc_lowres, sampler, fencpos + (int2)(X, idx) ).s0;\
- + vA = (read_imageui( fref0_planes, sampler, fref0Apos + (int2)(X, idx) ).s0 & mask0A) >> mask_shift0A;\
- + vB = (read_imageui( fref0_planes, sampler, fref0Bpos + (int2)(X, idx) ).s0 & mask0B) >> mask_shift0B;\
- + ref0 = rhadd( vA, vB );\
- + vA = (read_imageui( fref1_planes, sampler, fref1Apos + (int2)(X, idx) ).s0 & mask1A) >> mask_shift1A;\
- + vB = (read_imageui( fref1_planes, sampler, fref1Bpos + (int2)(X, idx) ).s0 & mask1B) >> mask_shift1B;\
- + ref1 = rhadd( vA, vB );\
- + OUT = enc - ((ref0 * weight + ref1 * weight2 + (1 << 5)) >> 6);
- +
- +#define READ_DIFF_EX( OUT, a, b )\
- + READ_BIDIR_DIFF( a0, a );\
- + READ_BIDIR_DIFF( a1, b );\
- + OUT = a0 + (a1<<BITS_PER_SUM);
- +
- +#define ROW_8x4_SATD( a, b, c )\
- + fencpos.y += a;\
- + fref0Apos.y += b;\
- + fref0Bpos.y += b;\
- + fref1Apos.y += c;\
- + fref1Bpos.y += c;\
- + READ_DIFF_EX( b0, 0, 4 );\
- + READ_DIFF_EX( b1, 1, 5 );\
- + READ_DIFF_EX( b2, 2, 6 );\
- + READ_DIFF_EX( b3, 3, 7 );\
- + HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\
- + HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\
- + sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 );
- +
- + ROW_8x4_SATD( 0, 0, 0 );
- + ROW_8x4_SATD( 4, 4, 4 );
- +
- +#undef READ_BIDIR_DIFF
- +#undef READ_DIFF_EX
- +#undef ROW_8x4_SATD
- +
- + return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;
- +}
- +
- +/*
- + * mode selection
- + *
- + * global launch dimensions: [mb_width, mb_height]
- + *
- + * If this is a B frame, launch dims are [mb_width * 4, mb_height]
- + */
- +kernel void mode_selection(
- + read_only image2d_t fenc_lowres,
- + read_only image2d_t fref0_planes,
- + read_only image2d_t fref1_planes,
- + const global short2 *fenc_lowres_mvs0,
- + const global short2 *fenc_lowres_mvs1,
- + const global short2 *fref1_lowres_mvs0,
- + const global int16_t *fenc_lowres_mv_costs0,
- + const global int16_t *fenc_lowres_mv_costs1,
- + const global uint16_t *fenc_intra_cost,
- + global uint16_t *lowres_costs,
- + global int *frame_stats,
- + local int16_t *cost_local,
- + local sum2_t *satd_local,
- + int mb_width,
- + int bipred_weight,
- + int dist_scale_factor,
- + int b,
- + int p0,
- + int p1,
- + int lambda )
- +{
- + int mb_x = get_global_id( 0 );
- + int b_bidir = (b < p1);
- + if( b_bidir )
- + {
- + mb_x >>= 2;
- + if( mb_x >= mb_width )
- + return;
- + }
- + int mb_y = get_global_id( 1 );
- + int mb_height = get_global_size( 1 );
- + int mb_count = mb_width * mb_height;
- + int mb_xy = mb_x + mb_y * mb_width;
- +
- + /* Initialize frame stats for next kernel (sum_inter_cost) */
- + if( mb_x < 4 && mb_y == 0 )
- + frame_stats[mb_x] = 0;
- +
- + int bcost = COST_MAX;
- + int list_used = 0;
- +
- + if( !b_bidir )
- + {
- + int icost = fenc_intra_cost[mb_xy];
- + COPY2_IF_LT( bcost, icost, list_used, 0 );
- + }
- + if( b != p0 )
- + {
- + int mv_cost0 = fenc_lowres_mv_costs0[(b - p0 - 1) * mb_count + mb_xy];
- + COPY2_IF_LT( bcost, mv_cost0, list_used, 1 );
- + }
- + if( b != p1 )
- + {
- + int mv_cost1 = fenc_lowres_mv_costs1[(p1 - b - 1) * mb_count + mb_xy];
- + COPY2_IF_LT( bcost, mv_cost1, list_used, 2 );
- + }
- +
- + if( b_bidir )
- + {
- + int2 coord = (int2)(mb_x << 3, mb_y << 3);
- + int mb_i = get_global_id( 0 ) & 3;
- + int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
- + cost_local += mb_in_group * 4;
- + satd_local += mb_in_group * 16;
- +
- +#define TRY_BIDIR( mv0, mv1, penalty )\
- + {\
- + int2 qpos0 = (int2)((coord.x<<2) + mv0.x, (coord.y<<2) + mv0.y);\
- + int2 qpos1 = (int2)((coord.x<<2) + mv1.x, (coord.y<<2) + mv1.y);\
- + cost_local[mb_i] = bidir_satd_8x8_ii_coop4( fenc_lowres, coord, fref0_planes, qpos0, fref1_planes, qpos1, bipred_weight, satd_local, mb_i );\
- + int cost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];\
- + COPY2_IF_LT( bcost, penalty * lambda + cost, list_used, 3 );\
- + }
- + /* temporal prediction */
- + short2 dmv0, dmv1;
- + short2 mvr = fref1_lowres_mvs0[mb_xy];
- + dmv0.x = (mvr.x * dist_scale_factor + 128) >> 8;
- + dmv0.y = (mvr.y * dist_scale_factor + 128) >> 8;
- + dmv1.x = dmv0.x - mvr.x;
- + dmv1.y = dmv0.y - mvr.y;
- + TRY_BIDIR( dmv0, dmv1, 0 )
- +
- + if( as_uint( dmv0 ) || as_uint( dmv1 ) )
- + {
- + /* B-direct prediction */
- + dmv0 = 0; dmv1 = 0;
- + TRY_BIDIR( dmv0, dmv1, 0 );
- + }
- +
- + /* L0+L1 prediction */
- + dmv0 = fenc_lowres_mvs0[(b - p0 - 1) * mb_count + mb_xy];
- + dmv1 = fenc_lowres_mvs1[(p1 - b - 1) * mb_count + mb_xy];
- + TRY_BIDIR( dmv0, dmv1, 5 );
- +#undef TRY_BIDIR
- + }
- +
- + lowres_costs[mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT);
- +}
- +
- +/*
- + * parallel sum inter costs
- + *
- + * global launch dimensions: [256, mb_height]
- + */
- +kernel void sum_inter_cost(
- + const global uint16_t *fenc_lowres_costs,
- + const global uint16_t *inv_qscale_factor,
- + global int *fenc_row_satds,
- + global int *frame_stats,
- + int mb_width,
- + int bframe_bias,
- + int b,
- + int p0,
- + int p1 )
- +{
- + int y = get_global_id( 1 );
- + int mb_height = get_global_size( 1 );
- +
- + int row_satds = 0;
- + int cost_est = 0;
- + int cost_est_aq = 0;
- + int intra_mbs = 0;
- +
- + for( int x = get_global_id( 0 ); x < mb_width; x += get_global_size( 0 ))
- + {
- + int mb_xy = x + y * mb_width;
- + int cost = fenc_lowres_costs[mb_xy] & LOWRES_COST_MASK;
- + int list = fenc_lowres_costs[mb_xy] >> LOWRES_COST_SHIFT;
- + int b_frame_score_mb = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2;
- +
- + if( list == 0 && b_frame_score_mb )
- + {
- + intra_mbs++;
- + }
- +
- + int cost_aq = (cost * inv_qscale_factor[mb_xy] + 128) >> 8;
- +
- + row_satds += cost_aq;
- +
- + if( b_frame_score_mb )
- + {
- + cost_est += cost;
- + cost_est_aq += cost_aq;
- + }
- + }
- +
- + local int buffer[256];
- + int x = get_global_id( 0 );
- +
- + row_satds = parallel_sum( row_satds, x, buffer );
- + cost_est = parallel_sum( cost_est, x, buffer );
- + cost_est_aq = parallel_sum( cost_est_aq, x, buffer );
- + intra_mbs = parallel_sum( intra_mbs, x, buffer );
- +
- + if( b != p1 )
- + cost_est = (int)((float)cost_est * 100 / (120 + bframe_bias));
- +
- + if( get_global_id( 0 ) == 0 )
- + {
- + fenc_row_satds[y] = row_satds;
- + atomic_add( frame_stats + 0, cost_est );
- + atomic_add( frame_stats + 1, cost_est_aq );
- + atomic_add( frame_stats + 2, intra_mbs );
- + }
- +}
- diff -r 087c10a1fec7 -r 9a682cb0c74f common/opencl/downscale.cl
- --- /dev/null Thu Jan 01 00:00:00 1970 +0000
- +++ b/common/opencl/downscale.cl Tue May 15 18:05:00 2012 -0500
- @@ -0,0 +1,138 @@
- +/*
- + * downscale lowres luma: full-res buffer to down scale image, and to packed hpel image
- + *
- + * --
- + *
- + * fenc_img is an output image (area of memory referenced through a texture
- + * cache). A read of any pixel location (x,y) returns four pixel values:
- + *
- + * val.s0 = P(x,y)
- + * val.s1 = P(x+1,y)
- + * val.s2 = P(x+2,y)
- + * val.s3 = P(x+3,y)
- + *
- + * This is a 4x replication of the lowres pixels, a trade-off between memory
- + * size and read latency.
- + *
- + * --
- + *
- + * hpel_planes is an output image that contains the four HPEL planes used for
- + * subpel refinement. A read of any pixel location (x,y) returns four HPEL pixel
- + * values:
- + *
- + * val.s0 = F(x,y) - aka the full pel pixel
- + * val.s1 = H(x,y) - horizontally interpolated pixel
- + * val.s2 = V(x,y) - vertically interpolated pixel
- + * val.s3 = C(x,y) - center (or corner) pixel
- + *
- + * launch dimensions: [lowres-width, lowres-height]
- + */
- +kernel void downscale_hpel(
- + const global pixel *fenc,
- + write_only image2d_t fenc_img,
- + write_only image2d_t hpel_planes,
- + int stride )
- +{
- + int x = get_global_id( 0 );
- + int y = get_global_id( 1 );
- + uint4 values;
- +
- + fenc += y * stride * 2;
- + const global pixel *src1 = fenc + stride;
- + const global pixel *src2 = (y == get_global_size( 1 )-1) ? src1 : src1 + stride;
- + int2 pos = (int2)(x, y);
- + pixel right, left;
- +
- + right = rhadd( fenc[x*2], src1[x*2] );
- + left = rhadd( fenc[x*2+1], src1[x*2+1] );
- + values.s0 = rhadd( right, left ); // F
- +
- + right = rhadd( fenc[2*x+1], src1[2*x+1] );
- + left = rhadd( fenc[2*x+2], src1[2*x+2] );
- + values.s1 = rhadd( right, left ); // H
- +
- + right = rhadd( src1[2*x], src2[2*x] );
- + left = rhadd( src1[2*x+1], src2[2*x+1] );
- + values.s2 = rhadd( right, left ); // V
- +
- + right = rhadd( src1[2*x+1], src2[2*x+1] );
- + left = rhadd( src1[2*x+2], src2[2*x+2] );
- + values.s3 = rhadd( right, left ); // C
- +
- + uint4 val = (uint4)(((values.s0 & 0xff) << 24) + ((values.s1 & 0xff) << 16) + ((values.s2 & 0xff) << 8) + (values.s3 & 0xff), 0, 0, 0);
- + write_imageui( hpel_planes, pos, val );
- +
- + x = x+1 < get_global_size( 0 ) ? x+1 : x;
- + right = rhadd( fenc[x*2], src1[x*2] );
- + left = rhadd( fenc[x*2+1], src1[x*2+1] );
- + values.s1 = rhadd( right, left );
- +
- + x = x+1 < get_global_size( 0 ) ? x+1 : x;
- + right = rhadd( fenc[x*2], src1[x*2] );
- + left = rhadd( fenc[x*2+1], src1[x*2+1] );
- + values.s2 = rhadd( right, left );
- +
- + x = x+1 < get_global_size( 0 ) ? x+1 : x;
- + right = rhadd( fenc[x*2], src1[x*2] );
- + left = rhadd( fenc[x*2+1], src1[x*2+1] );
- + values.s3 = rhadd( right, left );
- +
- + write_imageui( fenc_img, pos, values );
- +}
- +
- +/*
- + * downscale lowres hierarchical motion search image
- + *
- + * launch dimensions: [lowres-width, lowres-height]
- + */
- +kernel void downscale1( read_only image2d_t higher_res, write_only image2d_t lower_res )
- +{
- + int x = get_global_id( 0 );
- + int y = get_global_id( 1 );
- + int2 pos = (int2)(x, y);
- + int gs = get_global_size( 0 );
- + uint4 top, bot, values;
- + top = read_imageui( higher_res, sampler, (int2)( x*2, 2*y ) );
- + bot = read_imageui( higher_res, sampler, (int2)( x*2, 2*y+1 ) );
- + values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) );
- +
- + // these select statements appear redundant, and they should be, but tests break when
- + // they are not here.
- + values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) );
- + top = read_imageui( higher_res, sampler, (int2)( x*2+4, 2*y ) );
- + bot = read_imageui( higher_res, sampler, (int2)( x*2+4, 2*y+1 ) );
- + values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) );
- + values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) );
- + write_imageui( lower_res, pos, (uint4)(values) );
- +}
- +
- +/*
- + * Second copy of downscale kernel, no differences. This is a (no perf loss) workaround for a
- + * scheduling bug in current Tahiti drivers.
- + */
- +kernel void downscale2( read_only image2d_t higher_res, write_only image2d_t lower_res )
- +{
- + int x = get_global_id( 0 );
- + int y = get_global_id( 1 );
- + int2 pos = (int2)(x, y);
- + int gs = get_global_size( 0 );
- + uint4 top, bot, values;
- + top = read_imageui( higher_res, sampler, (int2)( x*2, 2*y ) );
- + bot = read_imageui( higher_res, sampler, (int2)( x*2, 2*y+1 ) );
- + values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) );
- +
- + // these select statements appear redundant, and they should be, but tests break when
- + // they are not here.
- + values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) );
- + top = read_imageui( higher_res, sampler, (int2)( x*2+4, 2*y ) );
- + bot = read_imageui( higher_res, sampler, (int2)( x*2+4, 2*y+1 ) );
- + values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) );
- + values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) );
- + write_imageui( lower_res, pos, (uint4)(values) );
- +}
- +
- +/* OpenCL 1.2 finally added a memset command, but we're not targeting 1.2 */
- +kernel void memset_int16( global int16_t *buf, int16_t value )
- +{
- + buf[get_global_id( 0 )] = value;
- +}
- diff -r 087c10a1fec7 -r 9a682cb0c74f common/opencl/intra.cl
- --- /dev/null Thu Jan 01 00:00:00 1970 +0000
- +++ b/common/opencl/intra.cl Tue May 15 18:05:00 2012 -0500
- @@ -0,0 +1,1053 @@
- +
- +inline uint32_t pack16to32( uint32_t a, uint32_t b )
- +{
- + return a + (b << 16);
- +}
- +
- +inline uint32_t pack8to16( uint32_t a, uint32_t b )
- +{
- + return a + (b << 8);
- +}
- +
- +inline int clamp_int( int a, int b, int c )
- +{
- + return clamp( a, b, c );
- +}
- +
- +#if VECTORIZE
- +int satd_8x4_intra_lr( const local pixel *data, int data_stride, int8 pr0, int8 pr1, int8 pr2, int8 pr3 )
- +{
- + int8 a_v, d_v;
- + int2 tmp00, tmp01, tmp02, tmp03, tmp10, tmp11, tmp12, tmp13;
- + int2 tmp20, tmp21, tmp22, tmp23, tmp30, tmp31, tmp32, tmp33;
- +
- + d_v = convert_int8( vload8( 0, data ) );
- + a_v.s01234567 = (d_v - pr0).s04152637;
- + HADAMARD4V( tmp00, tmp01, tmp02, tmp03, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
- +
- + data += data_stride;
- + d_v = convert_int8( vload8( 0, data ) );
- + a_v.s01234567 = (d_v - pr1).s04152637;
- + HADAMARD4V( tmp10, tmp11, tmp12, tmp13, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
- +
- + data += data_stride;
- + d_v = convert_int8( vload8( 0, data ) );
- + a_v.s01234567 = (d_v - pr2).s04152637;
- + HADAMARD4V( tmp20, tmp21, tmp22, tmp23, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
- +
- + data += data_stride;
- + d_v = convert_int8( vload8( 0, data ) );
- + a_v.s01234567 = (d_v - pr3).s04152637;
- + HADAMARD4V( tmp30, tmp31, tmp32, tmp33, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
- +
- + uint8 sum_v;
- +
- + HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp00, tmp10, tmp20, tmp30 );
- + sum_v = abs( a_v );
- +
- + HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp01, tmp11, tmp21, tmp31 );
- + sum_v += abs( a_v );
- +
- + HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp02, tmp12, tmp22, tmp32 );
- + sum_v += abs( a_v );
- +
- + HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp03, tmp13, tmp23, tmp33 );
- + sum_v += abs( a_v );
- +
- + uint4 sum2 = sum_v.hi + sum_v.lo;
- + uint2 sum3 = sum2.hi + sum2.lo;
- + return ( sum3.hi + sum3.lo ) >> 1;
- +}
- +#else
- +SATD_C_8x4_Q( satd_8x4_lp, const local, private )
- +#endif
- +
- +/****************************************************************************
- + * 8x8 prediction for intra luma block
- + ****************************************************************************/
- +
- +#define F1 rhadd
- +#define F2( a, b, c ) ( a+2*b+c+2 )>>2
- +
- +#if VECTORIZE
- +int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top )
- +{
- + int8 pr0, pr1, pr2, pr3;
- +
- + // Upper half of pred[]
- + pr0.s0 = ( 2 + top[0] + 2*top[1] + top[2] ) >> 2;
- + pr0.s1 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2;
- + pr0.s2 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
- + pr0.s3 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
- + pr0.s4 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
- + pr0.s5 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
- + pr0.s6 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
- + pr0.s7 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
- +
- + pr1.s0 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2;
- + pr1.s1 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
- + pr1.s2 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
- + pr1.s3 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
- + pr1.s4 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
- + pr1.s5 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
- + pr1.s6 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
- + pr1.s7 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
- +
- + pr2.s0 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
- + pr2.s1 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
- + pr2.s2 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
- + pr2.s3 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
- + pr2.s4 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
- + pr2.s5 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
- + pr2.s6 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
- + pr2.s7 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
- +
- + pr3.s0 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
- + pr3.s1 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
- + pr3.s2 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
- + pr3.s3 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
- + pr3.s4 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
- + pr3.s5 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
- + pr3.s6 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
- + pr3.s7 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
- + int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
- +
- + // Lower half of pred[]
- + pr0.s0 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
- + pr0.s1 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
- + pr0.s2 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
- + pr0.s3 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
- + pr0.s4 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
- + pr0.s5 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
- + pr0.s6 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
- + pr0.s7 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
- +
- + pr1.s0 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
- + pr1.s1 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
- + pr1.s2 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
- + pr1.s3 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
- + pr1.s4 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
- + pr1.s5 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
- + pr1.s6 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
- + pr1.s7 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
- +
- + pr2.s0 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
- + pr2.s1 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
- + pr2.s2 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
- + pr2.s3 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
- + pr2.s4 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
- + pr2.s5 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
- + pr2.s6 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
- + pr2.s7 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2;
- +
- + pr3.s0 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
- + pr3.s1 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
- + pr3.s2 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
- + pr3.s3 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
- + pr3.s4 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
- + pr3.s5 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
- + pr3.s6 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2;
- + pr3.s7 = ( 2 + top[14] + 3*top[15] ) >> 2;
- +
- + return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
- +}
- +
- +int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
- +{
- + int8 pr0, pr1, pr2, pr3;
- +
- + // Upper half of pred[]
- + pr3.s0 = F2( left[1], left[2], left[3] );
- + pr2.s0 = pr3.s1 = F2( left[0], left[1], left[2] );
- + pr1.s0 = pr2.s1 = pr3.s2 = F2( left[1], left[0], left_top );
- + pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[0], left_top, top[0] );
- + pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left_top, top[0], top[1] );
- + pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( top[0], top[1], top[2] );
- + pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( top[1], top[2], top[3] );
- + pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( top[2], top[3], top[4] );
- + pr0.s5 = pr1.s6 = pr2.s7 = F2( top[3], top[4], top[5] );
- + pr0.s6 = pr1.s7 = F2( top[4], top[5], top[6] );
- + pr0.s7 = F2( top[5], top[6], top[7] );
- + int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
- +
- + // Lower half of pred[]
- + pr3.s0 = F2( left[5], left[6], left[7] );
- + pr2.s0 = pr3.s1 = F2( left[4], left[5], left[6] );
- + pr1.s0 = pr2.s1 = pr3.s2 = F2( left[3], left[4], left[5] );
- + pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[2], left[3], left[4] );
- + pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left[1], left[2], left[3] );
- + pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( left[0], left[1], left[2] );
- + pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( left[1], left[0], left_top );
- + pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( left[0], left_top, top[0] );
- + pr0.s5 = pr1.s6 = pr2.s7 = F2( left_top, top[0], top[1] );
- + pr0.s6 = pr1.s7 = F2( top[0], top[1], top[2] );
- + pr0.s7 = F2( top[1], top[2], top[3] );
- + return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
- +}
- +
- +int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
- +{
- + int8 pr0, pr1, pr2, pr3;
- +
- + // Upper half of pred[]
- + pr2.s0 = F2( left[1], left[0], left_top );
- + pr3.s0 = F2( left[2], left[1], left[0] );
- + pr1.s0 = pr3.s1 = F2( left[0], left_top, top[0] );
- + pr0.s0 = pr2.s1 = F1( left_top, top[0] );
- + pr1.s1 = pr3.s2 = F2( left_top, top[0], top[1] );
- + pr0.s1 = pr2.s2 = F1( top[0], top[1] );
- + pr1.s2 = pr3.s3 = F2( top[0], top[1], top[2] );
- + pr0.s2 = pr2.s3 = F1( top[1], top[2] );
- + pr1.s3 = pr3.s4 = F2( top[1], top[2], top[3] );
- + pr0.s3 = pr2.s4 = F1( top[2], top[3] );
- + pr1.s4 = pr3.s5 = F2( top[2], top[3], top[4] );
- + pr0.s4 = pr2.s5 = F1( top[3], top[4] );
- + pr1.s5 = pr3.s6 = F2( top[3], top[4], top[5] );
- + pr0.s5 = pr2.s6 = F1( top[4], top[5] );
- + pr1.s6 = pr3.s7 = F2( top[4], top[5], top[6] );
- + pr0.s6 = pr2.s7 = F1( top[5], top[6] );
- + pr1.s7 = F2( top[5], top[6], top[7] );
- + pr0.s7 = F1( top[6], top[7] );
- + int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
- +
- + // Lower half of pred[]
- + pr2.s0 = F2( left[5], left[4], left[3] );
- + pr3.s0 = F2( left[6], left[5], left[4] );
- + pr0.s0 = pr2.s1 = F2( left[3], left[2], left[1] );
- + pr1.s0 = pr3.s1 = F2( left[4], left[3], left[2] );
- + pr0.s1 = pr2.s2 = F2( left[1], left[0], left_top );
- + pr1.s1 = pr3.s2 = F2( left[2], left[1], left[0] );
- + pr1.s2 = pr3.s3 = F2( left[0], left_top, top[0] );
- + pr0.s2 = pr2.s3 = F1( left_top, top[0] );
- + pr1.s3 = pr3.s4 = F2( left_top, top[0], top[1] );
- + pr0.s3 = pr2.s4 = F1( top[0], top[1] );
- + pr1.s4 = pr3.s5 = F2( top[0], top[1], top[2] );
- + pr0.s4 = pr2.s5 = F1( top[1], top[2] );
- + pr1.s5 = pr3.s6 = F2( top[1], top[2], top[3] );
- + pr0.s5 = pr2.s6 = F1( top[2], top[3] );
- + pr1.s6 = pr3.s7 = F2( top[2], top[3], top[4] );
- + pr0.s6 = pr2.s7 = F1( top[3], top[4] );
- + pr1.s7 = F2( top[3], top[4], top[5] );
- + pr0.s7 = F1( top[4], top[5] );
- + return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
- +#undef PRED
- +}
- +
- +int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
- +{
- + int8 pr0, pr1, pr2, pr3;
- +
- + // Upper half of pred[]
- + pr0.s0 = F1( left_top, left[0] ); pr0.s1 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
- + pr0.s2 = F2( top[1], top[0], left_top ); pr0.s3 = F2( top[2], top[1], top[0] );
- + pr0.s4 = F2( top[3], top[2], top[1] ); pr0.s5 = F2( top[4], top[3], top[2] );
- + pr0.s6 = F2( top[5], top[4], top[3] ); pr0.s7 = F2( top[6], top[5], top[4] );
- +
- + pr1.s0 = F1( left[0], left[1] ); pr1.s1 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
- + pr1.s2 = F1( left_top, left[0] ); pr1.s3 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
- + pr1.s4 = F2( top[1], top[0], left_top ); pr1.s5 = F2( top[2], top[1], top[0] );
- + pr1.s6 = F2( top[3], top[2], top[1] ); pr1.s7 = F2( top[4], top[3], top[2] );
- +
- + pr2.s0 = F1( left[1], left[2] ); pr2.s1 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
- + pr2.s2 = F1( left[0], left[1] ); pr2.s3 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
- + pr2.s4 = F1( left_top, left[0] ); pr2.s5 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
- + pr2.s6 = F2( top[1], top[0], left_top ); pr2.s7 = F2( top[2], top[1], top[0] );
- +
- + pr3.s0 = F1( left[2], left[3] ); pr3.s1 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
- + pr3.s2 = F1( left[1], left[2] ); pr3.s3 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
- + pr3.s4 = F1( left[0], left[1] ); pr3.s5 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
- + pr3.s6 = F1( left_top, left[0] ); pr3.s7 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
- + int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
- +
- + // Lower half of pred[]
- + pr0.s0 = F1( left[3], left[4] ); pr0.s1 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
- + pr0.s2 = F1( left[2], left[3] ); pr0.s3 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
- + pr0.s4 = F1( left[1], left[2] ); pr0.s5 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
- + pr0.s6 = F1( left[0], left[1] ); pr0.s7 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
- +
- + pr1.s0 = F1( left[4], left[5] ); pr1.s1 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
- + pr1.s2 = F1( left[3], left[4] ); pr1.s3 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
- + pr1.s4 = F1( left[2], left[3] ); pr1.s5 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
- + pr1.s6 = F1( left[1], left[2] ); pr1.s7 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
- +
- + pr2.s0 = F1( left[5], left[6] ); pr2.s1 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
- + pr2.s2 = F1( left[4], left[5] ); pr2.s3 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
- + pr2.s4 = F1( left[3], left[4] ); pr2.s5 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
- + pr2.s6 = F1( left[2], left[3] ); pr2.s7 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
- +
- + pr3.s0 = F1( left[6], left[7] ); pr3.s1 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
- + pr3.s2 = F1( left[5], left[6] ); pr3.s3 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
- + pr3.s4 = F1( left[4], left[5] ); pr3.s5 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
- + pr3.s6 = F1( left[3], left[4] ); pr3.s7 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
- + return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
- +}
- +
- +int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top )
- +{
- + int8 pr0, pr1, pr2, pr3;
- +
- + // Upper half of pred[]
- + pr0.s0 = F1( top[0], top[1] );
- + pr1.s0 = F2( top[0], top[1], top[2] );
- + pr2.s0 = pr0.s1 = F1( top[1], top[2] );
- + pr3.s0 = pr1.s1 = F2( top[1], top[2], top[3] );
- + pr2.s1 = pr0.s2 = F1( top[2], top[3] );
- + pr3.s1 = pr1.s2 = F2( top[2], top[3], top[4] );
- + pr2.s2 = pr0.s3 = F1( top[3], top[4] );
- + pr3.s2 = pr1.s3 = F2( top[3], top[4], top[5] );
- + pr2.s3 = pr0.s4 = F1( top[4], top[5] );
- + pr3.s3 = pr1.s4 = F2( top[4], top[5], top[6] );
- + pr2.s4 = pr0.s5 = F1( top[5], top[6] );
- + pr3.s4 = pr1.s5 = F2( top[5], top[6], top[7] );
- + pr2.s5 = pr0.s6 = F1( top[6], top[7] );
- + pr3.s5 = pr1.s6 = F2( top[6], top[7], top[8] );
- + pr2.s6 = pr0.s7 = F1( top[7], top[8] );
- + pr3.s6 = pr1.s7 = F2( top[7], top[8], top[9] );
- + pr2.s7 = F1( top[8], top[9] );
- + pr3.s7 = F2( top[8], top[9], top[10] );
- + int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
- +
- + // Lower half of pred[]
- + pr0.s0 = F1( top[2], top[3] );
- + pr1.s0 = F2( top[2], top[3], top[4] );
- + pr2.s0 = pr0.s1 = F1( top[3], top[4] );
- + pr3.s0 = pr1.s1 = F2( top[3], top[4], top[5] );
- + pr2.s1 = pr0.s2 = F1( top[4], top[5] );
- + pr3.s1 = pr1.s2 = F2( top[4], top[5], top[6] );
- + pr2.s2 = pr0.s3 = F1( top[5], top[6] );
- + pr3.s2 = pr1.s3 = F2( top[5], top[6], top[7] );
- + pr2.s3 = pr0.s4 = F1( top[6], top[7] );
- + pr3.s3 = pr1.s4 = F2( top[6], top[7], top[8] );
- + pr2.s4 = pr0.s5 = F1( top[7], top[8] );
- + pr3.s4 = pr1.s5 = F2( top[7], top[8], top[9] );
- + pr2.s5 = pr0.s6 = F1( top[8], top[9] );
- + pr3.s5 = pr1.s6 = F2( top[8], top[9], top[10] );
- + pr2.s6 = pr0.s7 = F1( top[9], top[10] );
- + pr3.s6 = pr1.s7 = F2( top[9], top[10], top[11] );
- + pr2.s7 = F1( top[10], top[11] );
- + pr3.s7 = F2( top[10], top[11], top[12] );
- + return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
- +}
- +
- +int x264_predict_8x8_hu( const local pixel *src, int src_stride, const local pixel *left )
- +{
- + int8 pr0, pr1, pr2, pr3;
- +
- + // Upper half of pred[]
- + pr0.s0 = F1( left[0], left[1] ); pr0.s1 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
- + pr0.s2 = F1( left[1], left[2] ); pr0.s3 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
- + pr0.s4 = F1( left[2], left[3] ); pr0.s5 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
- + pr0.s6 = F1( left[3], left[4] ); pr0.s7 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
- +
- + pr1.s0 = F1( left[1], left[2] ); pr1.s1 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
- + pr1.s2 = F1( left[2], left[3] ); pr1.s3 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
- + pr1.s4 = F1( left[3], left[4] ); pr1.s5 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
- + pr1.s6 = F1( left[4], left[5] ); pr1.s7 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
- +
- + pr2.s0 = F1( left[2], left[3] ); pr2.s1 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
- + pr2.s2 = F1( left[3], left[4] ); pr2.s3 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
- + pr2.s4 = F1( left[4], left[5] ); pr2.s5 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
- + pr2.s6 = F1( left[5], left[6] ); pr2.s7 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
- +
- + pr3.s0 = F1( left[3], left[4] ); pr3.s1 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
- + pr3.s2 = F1( left[4], left[5] ); pr3.s3 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
- + pr3.s4 = F1( left[5], left[6] ); pr3.s5 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
- + pr3.s6 = F1( left[6], left[7] ); pr3.s7 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
- + int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
- +
- + // Lower half of pred[]
- + pr0.s0 = F1( left[4], left[5] ); pr0.s1 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
- + pr0.s2 = F1( left[5], left[6] ); pr0.s3 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
- + pr0.s4 = F1( left[6], left[7] ); pr0.s5 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
- + pr0.s6 = left[7]; pr0.s7 = left[7];
- +
- + pr1.s0 = F1( left[5], left[6] ); pr1.s1 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
- + pr1.s2 = F1( left[6], left[7] ); pr1.s3 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
- + pr1.s4 = left[7]; pr1.s5 = left[7];
- + pr1.s6 = left[7]; pr1.s7 = left[7];
- +
- + pr2.s0 = F1( left[6], left[7] ); pr2.s1 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
- + pr2.s2 = left[7]; pr2.s3 = left[7];
- + pr2.s4 = left[7]; pr2.s5 = left[7];
- + pr2.s6 = left[7]; pr2.s7 = left[7];
- +
- + pr3 = (int8)left[7];
- +
- + return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
- +}
- +
- +int x264_predict_8x8c_h( const local pixel *src, int src_stride )
- +{
- + const local pixel *src_l = src;
- + int8 pr0, pr1, pr2, pr3;
- +
- + // Upper half of pred[]
- + pr0 = (int8)src[-1]; src += src_stride;
- + pr1 = (int8)src[-1]; src += src_stride;
- + pr2 = (int8)src[-1]; src += src_stride;
- + pr3 = (int8)src[-1]; src += src_stride;
- + int satd = satd_8x4_intra_lr( src_l, src_stride, pr0, pr1, pr2, pr3 );
- +
- + //Lower half of pred[]
- + pr0 = (int8)src[-1]; src += src_stride;
- + pr1 = (int8)src[-1]; src += src_stride;
- + pr2 = (int8)src[-1]; src += src_stride;
- + pr3 = (int8)src[-1];
- + return satd + satd_8x4_intra_lr( src_l + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
- +}
- +
- +int x264_predict_8x8c_v( const local pixel *src, int src_stride )
- +{
- + int8 pred = convert_int8( vload8( 0, &src[-src_stride] ));
- + return satd_8x4_intra_lr( src, src_stride, pred, pred, pred, pred ) +
- + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pred, pred, pred, pred );
- +}
- +
- +int x264_predict_8x8c_p( const local pixel *src, int src_stride )
- +{
- + int H = 0, V = 0;
- + for( int i = 0; i < 4; i++ )
- + {
- + H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]);
- + V += (i + 1) * (src[-1 + (i + 4) * src_stride] - src[-1 + (2 - i) * src_stride]);
- + }
- +
- + int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]);
- + int b = (17 * H + 16) >> 5;
- + int c = (17 * V + 16) >> 5;
- + int i00 = a - 3 * b - 3 * c + 16;
- +
- + // Upper half of pred[]
- + int pix = i00;
- + int8 pr0, pr1, pr2, pr3;
- + pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr0.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
- +
- + pix = i00;
- + pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
- +
- + pix = i00;
- + pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
- +
- + pix = i00;
- + pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr3.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
- + int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
- +
- + //Lower half of pred[]
- + pix = i00;
- + pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr0.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
- +
- + pix = i00;
- + pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
- +
- + pix = i00;
- + pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
- +
- + pix = i00;
- + pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
- + pr3.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
- + return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
- +}
- +
- +int x264_predict_8x8c_dc( const local pixel *src, int src_stride )
- +{
- + int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
- + for( int i = 0; i < 4; i++ )
- + {
- + s0 += src[i - src_stride];
- + s1 += src[i + 4 - src_stride];
- + s2 += src[-1 + i * src_stride];
- + s3 += src[-1 + (i+4)*src_stride];
- + }
- +
- + // Upper half of pred[]
- + int8 dc0;
- + dc0.lo = (int4)( (s0 + s2 + 4) >> 3 );
- + dc0.hi = (int4)( (s1 + 2) >> 2 );
- + int satd = satd_8x4_intra_lr( src, src_stride, dc0, dc0, dc0, dc0 );
- +
- + // Lower half of pred[]
- + dc0.lo = (int4)( (s3 + 2) >> 2 );
- + dc0.hi = (int4)( (s1 + s3 + 4) >> 3 );
- + return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, dc0, dc0, dc0, dc0 );
- +}
- +
- +#else /* not vectorized: private is cheap registers are scarce */
- +
- +int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top )
- +{
- + private pixel pred[32];
- + int x_plus_y;
- +
- + // Upper half of pred[]
- + for( int y = 0; y < 4; y++ )
- + {
- + for( int x = 0; x < 8; x++ )
- + {
- + x_plus_y = clamp_int( x + y, 0, 13 );
- + pred[x + y*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2;
- + }
- + }
- + int satd = satd_8x4_lp( src, src_stride, pred, 8 );
- + //Lower half of pred[]
- + for( int y = 4; y < 8; y++ )
- + {
- + for( int x = 0; x < 8; x++ )
- + {
- + x_plus_y = clamp_int( x + y, 0, 13 );
- + pred[x + ( y - 4 )*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2;
- + }
- + }
- + pred[31] = ( 2 + top[14] + 3*top[15] ) >> 2;
- + satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
- + return satd;
- +}
- +
- +int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
- +{
- + private pixel pred[32];
- +#define PRED( x, y ) pred[(x) + (y)*8]
- + // Upper half of pred[]
- + PRED( 0, 3 ) = F2( left[1], left[2], left[3] );
- + PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[0], left[1], left[2] );
- + PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[1], left[0], left_top );
- + PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] );
- + PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( left_top, top[0], top[1] );
- + PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] );
- + PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] );
- + PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] );
- + PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( top[3], top[4], top[5] );
- + PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[4], top[5], top[6] );
- + PRED( 7, 0 ) = F2( top[5], top[6], top[7] );
- + int satd = satd_8x4_lp( src, src_stride, pred, 8 );
- +
- + // Lower half of pred[]
- + PRED( 0, 3 ) = F2( left[5], left[6], left[7] );
- + PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[4], left[5], left[6] );
- + PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[3], left[4], left[5] );
- + PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[2], left[3], left[4] );
- + PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( left[1], left[2], left[3] );
- + PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( left[0], left[1], left[2] );
- + PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( left[1], left[0], left_top );
- + PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( left[0], left_top, top[0] );
- + PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( left_top, top[0], top[1] );
- + PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[0], top[1], top[2] );
- + PRED( 7, 0 ) = F2( top[1], top[2], top[3] );
- + satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
- + return satd;
- +#undef PRED
- +}
- +
- +int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
- +{
- + private pixel pred[32];
- +#define PRED( x, y ) pred[(x) + (y)*8]
- + // Upper half of pred[]
- + PRED( 0, 2 ) = F2( left[1], left[0], left_top );
- + PRED( 0, 3 ) = F2( left[2], left[1], left[0] );
- + PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[0], left_top, top[0] );
- + PRED( 0, 0 ) = PRED( 1, 2 ) = F1( left_top, top[0] );
- + PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left_top, top[0], top[1] );
- + PRED( 1, 0 ) = PRED( 2, 2 ) = F1( top[0], top[1] );
- + PRED( 2, 1 ) = PRED( 3, 3 ) = F2( top[0], top[1], top[2] );
- + PRED( 2, 0 ) = PRED( 3, 2 ) = F1( top[1], top[2] );
- + PRED( 3, 1 ) = PRED( 4, 3 ) = F2( top[1], top[2], top[3] );
- + PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[2], top[3] );
- + PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[2], top[3], top[4] );
- + PRED( 4, 0 ) = PRED( 5, 2 ) = F1( top[3], top[4] );
- + PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[3], top[4], top[5] );
- + PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[4], top[5] );
- + PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[4], top[5], top[6] );
- + PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[5], top[6] );
- + PRED( 7, 1 ) = F2( top[5], top[6], top[7] );
- + PRED( 7, 0 ) = F1( top[6], top[7] );
- + int satd = satd_8x4_lp( src, src_stride, pred, 8 );
- +
- + //Lower half of pred[]
- + PRED( 0, 2 ) = F2( left[5], left[4], left[3] );
- + PRED( 0, 3 ) = F2( left[6], left[5], left[4] );
- + PRED( 0, 0 ) = PRED( 1, 2 ) = F2( left[3], left[2], left[1] );
- + PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[4], left[3], left[2] );
- + PRED( 1, 0 ) = PRED( 2, 2 ) = F2( left[1], left[0], left_top );
- + PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left[2], left[1], left[0] );
- + PRED( 2, 1 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] );
- + PRED( 2, 0 ) = PRED( 3, 2 ) = F1( left_top, top[0] );
- + PRED( 3, 1 ) = PRED( 4, 3 ) = F2( left_top, top[0], top[1] );
- + PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[0], top[1] );
- + PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] );
- + PRED( 4, 0 ) = PRED( 5, 2 ) = F1( top[1], top[2] );
- + PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] );
- + PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[2], top[3] );
- + PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] );
- + PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[3], top[4] );
- + PRED( 7, 1 ) = F2( top[3], top[4], top[5] );
- + PRED( 7, 0 ) = F1( top[4], top[5] );
- + satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
- + return satd;
- +#undef PRED
- +}
- +
- +int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
- +{
- + private pixel pred[32];
- + int satd;
- + int p1 = pack8to16( (F1( left[6], left[7] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) );
- + int p2 = pack8to16( (F1( left[5], left[6] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) );
- + int p3 = pack8to16( (F1( left[4], left[5] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) );
- + int p4 = pack8to16( (F1( left[3], left[4] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) );
- + int p5 = pack8to16( (F1( left[2], left[3] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) );
- + int p6 = pack8to16( (F1( left[1], left[2] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) );
- + int p7 = pack8to16( (F1( left[0], left[1] )), ((left_top + 2 * left[0] + left[1] + 2) >> 2) );
- + int p8 = pack8to16( (F1( left_top, left[0] )), ((left[0] + 2 * left_top + top[0] + 2) >> 2) );
- + int p9 = pack8to16( (F2( top[1], top[0], left_top )), (F2( top[2], top[1], top[0] )) );
- + int p10 = pack8to16( (F2( top[3], top[2], top[1] )), (F2( top[4], top[3], top[2] )) );
- + int p11 = pack8to16( (F2( top[5], top[4], top[3] )), (F2( top[6], top[5], top[4] )) );
- + // Upper half of pred[]
- + vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[0 + 0 * 8] );
- + vstore4( as_uchar4( pack16to32( p10, p11 ) ), 0, &pred[4 + 0 * 8] );
- + vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[0 + 1 * 8] );
- + vstore4( as_uchar4( pack16to32( p9, p10 ) ), 0, &pred[4 + 1 * 8] );
- + vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[0 + 2 * 8] );
- + vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[4 + 2 * 8] );
- + vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[0 + 3 * 8] );
- + vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[4 + 3 * 8] );
- + satd = satd_8x4_lp( src, src_stride, pred, 8 );
- + // Lower half of pred[]
- + vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[0 + 0 * 8] );
- + vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[4 + 0 * 8] );
- + vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[0 + 1 * 8] );
- + vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[4 + 1 * 8] );
- + vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[0 + 2 * 8] );
- + vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[4 + 2 * 8] );
- + vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[0 + 3 * 8] );
- + vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[4 + 3 * 8] );
- + satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
- + return satd;
- +}
- +
- +int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top )
- +{
- + private pixel pred[32];
- + int satd;
- +#define PRED( x, y ) pred[(x) + (y)*8]
- + // Upper half of pred[]
- + PRED( 0, 0 ) = F1( top[0], top[1] );
- + PRED( 0, 1 ) = F2( top[0], top[1], top[2] );
- + PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[1], top[2] );
- + PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[1], top[2], top[3] );
- + PRED( 1, 2 ) = PRED( 2, 0 ) = F1( top[2], top[3] );
- + PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[2], top[3], top[4] );
- + PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[3], top[4] );
- + PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[3], top[4], top[5] );
- + PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[4], top[5] );
- + PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[4], top[5], top[6] );
- + PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[5], top[6] );
- + PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[5], top[6], top[7] );
- + PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[6], top[7] );
- + PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[6], top[7], top[8] );
- + PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[7], top[8] );
- + PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[7], top[8], top[9] );
- + PRED( 7, 2 ) = F1( top[8], top[9] );
- + PRED( 7, 3 ) = F2( top[8], top[9], top[10] );
- + satd = satd_8x4_lp( src, src_stride, pred, 8 );
- + // Lower half of pred[]
- + PRED( 0, 0 ) = F1( top[2], top[3] );
- + PRED( 0, 1 ) = F2( top[2], top[3], top[4] );
- + PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[3], top[4] );
- + PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[3], top[4], top[5] );
- + PRED( 1, 2 ) = PRED( 2, 0 ) = F1( top[4], top[5] );
- + PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[4], top[5], top[6] );
- + PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[5], top[6] );
- + PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[5], top[6], top[7] );
- + PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[6], top[7] );
- + PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[6], top[7], top[8] );
- + PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[7], top[8] );
- + PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[7], top[8], top[9] );
- + PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[8], top[9] );
- + PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[8], top[9], top[10] );
- + PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[9], top[10] );
- + PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[9], top[10], top[11] );
- + PRED( 7, 2 ) = F1( top[10], top[11] );
- + PRED( 7, 3 ) = F2( top[10], top[11], top[12] );
- + satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
- + return satd;
- +#undef PRED
- +}
- +
- +int x264_predict_8x8_hu( const local pixel *src, int src_stride, const local pixel *left )
- +{
- + private pixel pred[32];
- + int satd;
- + int p1 = pack8to16( (F1( left[0], left[1] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) );
- + int p2 = pack8to16( (F1( left[1], left[2] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) );
- + int p3 = pack8to16( (F1( left[2], left[3] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) );
- + int p4 = pack8to16( (F1( left[3], left[4] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) );
- + int p5 = pack8to16( (F1( left[4], left[5] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) );
- + int p6 = pack8to16( (F1( left[5], left[6] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) );
- + int p7 = pack8to16( (F1( left[6], left[7] )), ((left[6] + 2 * left[7] + left[7] + 2) >> 2) );
- + int p8 = pack8to16( left[7], left[7] );
- + // Upper half of pred[]
- + vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[( 0 ) + ( 0 ) * 8] );
- + vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] );
- + vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] );
- + vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] );
- + vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] );
- + vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] );
- + vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] );
- + vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] );
- + satd = satd_8x4_lp( src, src_stride, pred, 8 );
- + // Lower half of pred[]
- + vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 0 ) + ( 0 ) * 8] );
- + vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] );
- + vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] );
- + vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] );
- + vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] );
- + vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] );
- + vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] );
- + vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] );
- + satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
- + return satd;
- +}
- +
- +int x264_predict_8x8c_h( const local pixel *src, int src_stride )
- +{
- + private pixel pred[32];
- + const local pixel *src_l = src;
- +
- + // Upper half of pred[]
- + vstore8( (uchar8)(src[-1]), 0, pred ); src += src_stride;
- + vstore8( (uchar8)(src[-1]), 1, pred ); src += src_stride;
- + vstore8( (uchar8)(src[-1]), 2, pred ); src += src_stride;
- + vstore8( (uchar8)(src[-1]), 3, pred ); src += src_stride;
- + int satd = satd_8x4_lp( src_l, src_stride, pred, 8 );
- +
- + // Lower half of pred[]
- + vstore8( (uchar8)(src[-1]), 0, pred ); src += src_stride;
- + vstore8( (uchar8)(src[-1]), 1, pred ); src += src_stride;
- + vstore8( (uchar8)(src[-1]), 2, pred ); src += src_stride;
- + vstore8( (uchar8)(src[-1]), 3, pred );
- + return satd + satd_8x4_lp( src_l + ( src_stride << 2 ), src_stride, pred, 8 );
- +}
- +
- +int x264_predict_8x8c_v( const local pixel *src, int src_stride )
- +{
- + private pixel pred[32];
- + uchar16 v16;
- + v16.lo = vload8( 0, &src[-src_stride] );
- + v16.hi = vload8( 0, &src[-src_stride] );
- +
- + vstore16( v16, 0, pred );
- + vstore16( v16, 1, pred );
- +
- + return satd_8x4_lp( src, src_stride, pred, 8 ) +
- + satd_8x4_lp( src + (src_stride << 2), src_stride, pred, 8 );
- +}
- +
- +int x264_predict_8x8c_p( const local pixel *src, int src_stride )
- +{
- + int H = 0, V = 0;
- + private pixel pred[32];
- + int satd;
- +
- + for( int i = 0; i < 4; i++ )
- + {
- + H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]);
- + V += (i + 1) * (src[-1 + (i + 4) * src_stride] - src[-1 + (2 - i) * src_stride]);
- + }
- +
- + int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]);
- + int b = (17 * H + 16) >> 5;
- + int c = (17 * V + 16) >> 5;
- + int i00 = a - 3 * b - 3 * c + 16;
- +
- + // Upper half of pred[]
- + for( int y = 0; y < 4; y++ )
- + {
- + int pix = i00;
- + for( int x = 0; x < 8; x++ )
- + {
- + pred[x + y*8] = x264_clip_pixel( pix >> 5 );
- + pix += b;
- + }
- + i00 += c;
- + }
- + satd = satd_8x4_lp( src, src_stride, pred, 8 );
- + // Lower half of pred[]
- + for( int y = 0; y < 4; y++ )
- + {
- + int pix = i00;
- + for( int x = 0; x < 8; x++ )
- + {
- + pred[x + y*8] = x264_clip_pixel( pix >> 5 );
- + pix += b;
- + }
- + i00 += c;
- + }
- + satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
- + return satd;
- +}
- +
- +int x264_predict_8x8c_dc( const local pixel *src, int src_stride )
- +{
- + private pixel pred[32];
- + int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
- + for( int i = 0; i < 4; i++ )
- + {
- + s0 += src[i - src_stride];
- + s1 += src[i + 4 - src_stride];
- + s2 += src[-1 + i * src_stride];
- + s3 += src[-1 + (i+4)*src_stride];
- + }
- +
- + // Upper half of pred[]
- + uchar8 dc0;
- + dc0.lo = (uchar4)( (s0 + s2 + 4) >> 3 );
- + dc0.hi = (uchar4)( (s1 + 2) >> 2 );
- + vstore8( dc0, 0, pred );
- + vstore8( dc0, 1, pred );
- + vstore8( dc0, 2, pred );
- + vstore8( dc0, 3, pred );
- + int satd = satd_8x4_lp( src, src_stride, pred, 8 );
- +
- + // Lower half of pred[]
- + dc0.lo = (uchar4)( (s3 + 2) >> 2 );
- + dc0.hi = (uchar4)( (s1 + s3 + 4) >> 3 );
- + vstore8( dc0, 0, pred );
- + vstore8( dc0, 1, pred );
- + vstore8( dc0, 2, pred );
- + vstore8( dc0, 3, pred );
- + return satd + satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
- +}
- +#endif
- +
- +kernel void mb_intra_cost_satd_8x8(
- + read_only image2d_t fenc,
- + global uint16_t *fenc_intra_cost,
- + global int *frame_stats,
- + int lambda,
- + int mb_width,
- + int slow )
- +{
- +#define CACHE_STRIDE (265)
- +#define BLOCK_OFFSET (266)
- + local pixel cache[2385];
- + local int cost_buf[32];
- + local pixel top[32 * 16];
- + local pixel left[32 * 8];
- + local pixel left_top[32];
- +
- + int lx = (int)get_local_id( 0 );
- + int ly = (int)get_local_id( 1 );
- + int gx = (int)get_global_id( 0 );
- + int gy = (int)get_global_id( 1 );
- + int gidx = (int)get_group_id( 0 );
- + int gidy = (int)get_group_id( 1 );
- + int linear_id = (int)(ly * get_local_size( 0 ) + lx);
- + int satd = COST_MAX;
- + int basex = (gidx << 8);
- + int basey = (gidy << 3) - 1;
- +
- + for( int y = 0; y < 9; y++ ) //Maybe unrolled later?
- + {
- + for( int x = 4*linear_id; x < (33 * 8); x += 1024 )
- + {
- + uint4 data = read_imageui( fenc, sampler, (int2)(x + basex, y + basey) );
- + cache[y * CACHE_STRIDE + 1 + x] = data.s0;
- + cache[y * CACHE_STRIDE + 1 + x + 1] = data.s1;
- + cache[y * CACHE_STRIDE + 1 + x + 2] = data.s2;
- + cache[y * CACHE_STRIDE + 1 + x + 3] = data.s3;
- + }
- + }
- + if( linear_id < 9 )
- + cache[linear_id * CACHE_STRIDE] = read_imageui( fenc, sampler, (int2)( basex - 1, linear_id + basey) ).s0;
- +
- + barrier( CLK_LOCAL_MEM_FENCE );
- +
- + // Cooperatively build the top edge for the macroblock using smoothing function
- + int j = ly;
- + top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j - 1, -1, 15 )] +
- + 2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j, 0, 15 )] +
- + cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j + 1, 0, 15 )] + 2 ) >> 2;
- + j += 8;
- + top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j - 1, -1, 15 )] +
- + 2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j, 0, 15 )] +
- + cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j + 1, 0, 15 )] + 2 ) >> 2;
- + // Cooperatively build the left edge for the macroblock using smoothing function
- + left[lx*8 + ly] = ( cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*(ly - 1)] +
- + 2*cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*ly] +
- + cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*clamp_int((ly + 1), 0, 7 )] + 2 ) >> 2;
- + // One left_top per macroblock
- + if( 0 == ly )
- + {
- + left_top[lx] = ( cache[BLOCK_OFFSET + 8*lx - 1] + 2*cache[BLOCK_OFFSET + 8*lx - 1 - CACHE_STRIDE] +
- + cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE] + 2 ) >> 2;
- + cost_buf[lx] = COST_MAX;
- + }
- + barrier( CLK_LOCAL_MEM_FENCE );
- +
- + // each warp/wavefront generates a different prediction type; no divergence
- + switch( ly )
- + {
- + case 0:
- + satd = x264_predict_8x8c_h( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
- + break;
- + case 1:
- + satd = x264_predict_8x8c_v( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
- + break;
- + case 2:
- + satd = x264_predict_8x8c_dc( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
- + break;
- + case 3:
- + satd = x264_predict_8x8c_p( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
- + break;
- + case 4:
- + satd = x264_predict_8x8_ddr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
- + break;
- + case 5:
- + satd = x264_predict_8x8_vr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
- + break;
- + case 6:
- + satd = x264_predict_8x8_hd( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
- + break;
- + case 7:
- + satd = x264_predict_8x8_hu( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &left[8*lx] );
- + break;
- + default:
- + break;
- + }
- + atom_min( &cost_buf[lx], satd );
- + if( slow )
- + {
- + // Do the remaining two (least likely) prediction modes
- + switch( ly )
- + {
- + case 0: // DDL
- + satd = x264_predict_8x8_ddl( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] );
- + atom_min( &cost_buf[lx], satd );
- + break;
- + case 1: // VL
- + satd = x264_predict_8x8_vl( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] );
- + atom_min( &cost_buf[lx], satd );
- + break;
- + default:
- + break;
- + }
- + }
- + barrier( CLK_LOCAL_MEM_FENCE );
- +
- + if( (0 == ly) && (gx < mb_width) )
- + {
- + fenc_intra_cost[gidy * mb_width + gx] = cost_buf[lx]+ 5*lambda;
- + }
- +
- + //Auxiliary work for one of the following kernels
- + if( gx < 2 && gy == 0 )
- + frame_stats[gx] = 0;
- +#undef CACHE_STRIDE
- +#undef BLOCK_OFFSET
- +}
- +
- +
- +
- +/*
- + * parallel sum intra costs
- + *
- + * global launch dimensions: [256, mb_height]
- + */
- +kernel void sum_intra_cost(
- + const global uint16_t *fenc_intra_cost,
- + const global uint16_t *inv_qscale_factor,
- + global int *fenc_row_satds,
- + global int *frame_stats,
- + int mb_width )
- +{
- + int y = get_global_id( 1 );
- + int mb_height = get_global_size( 1 );
- +
- + int row_satds = 0;
- + int cost_est = 0;
- + int cost_est_aq = 0;
- +
- + for( int x = get_global_id( 0 ); x < mb_width; x += get_global_size( 0 ))
- + {
- + int mb_xy = x + y * mb_width;
- + int cost = fenc_intra_cost[mb_xy];
- + int cost_aq = (cost * inv_qscale_factor[mb_xy] + 128) >> 8;
- + int b_frame_score_mb = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2;
- +
- + row_satds += cost_aq;
- + if( b_frame_score_mb )
- + {
- + cost_est += cost;
- + cost_est_aq += cost_aq;
- + }
- + }
- +
- + local int buffer[256];
- + int x = get_global_id( 0 );
- +
- + row_satds = parallel_sum( row_satds, x, buffer );
- + cost_est = parallel_sum( cost_est, x, buffer );
- + cost_est_aq = parallel_sum( cost_est_aq, x, buffer );
- +
- + if( get_global_id( 0 ) == 0 )
- + {
- + fenc_row_satds[y] = row_satds;
- + atomic_add( frame_stats + 0, cost_est );
- + atomic_add( frame_stats + 1, cost_est_aq );
- + }
- +}
- diff -r 087c10a1fec7 -r 9a682cb0c74f common/opencl/motionsearch.cl
- --- /dev/null Thu Jan 01 00:00:00 1970 +0000
- +++ b/common/opencl/motionsearch.cl Tue May 15 18:05:00 2012 -0500
- @@ -0,0 +1,276 @@
- +
- +
- +
- +int find_downscale_mb_xy( int x, int y, int mb_width, int mb_height )
- +{
- + /* edge macroblocks might not have a direct descendant, use nearest */
- + x = (x == mb_width-1) ? (x - (mb_width&1)) >> 1 : x >> 1;
- + y = (y == mb_height-1) ? (y - (mb_height&1)) >> 1 : y >> 1;
- + return (mb_width>>1) * y + x;
- +}
- +
- +/* Four threads calculate an 8x8 SAD. Each does two rows */
- +int sad_8x8_ii_coop4( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos, int idx, local int16_t *costs )
- +{
- + frefpos.y += idx << 1;
- + fencpos.y += idx << 1;
- + int cost = 0;
- + if( frefpos.x < 0 )
- + {
- + /* slow path when MV goes past right edge */
- + for( int y = 0; y < 2; y++ )
- + {
- + for( int x = 0; x < 8; x++ )
- + {
- + pixel enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0;
- + pixel ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0;
- + cost += abs_diff( enc, ref );
- + }
- + }
- + }
- + else
- + {
- + uint4 enc, ref, costs = 0;
- + enc = read_imageui( fenc, sampler, fencpos );
- + ref = read_imageui( fref, sampler, frefpos );
- + costs += abs_diff( enc, ref );
- + enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 0) );
- + ref = read_imageui( fref, sampler, frefpos + (int2)(4, 0) );
- + costs += abs_diff( enc, ref );
- + enc = read_imageui( fenc, sampler, fencpos + (int2)(0, 1) );
- + ref = read_imageui( fref, sampler, frefpos + (int2)(0, 1) );
- + costs += abs_diff( enc, ref );
- + enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 1) );
- + ref = read_imageui( fref, sampler, frefpos + (int2)(4, 1) );
- + costs += abs_diff( enc, ref );
- + cost = costs.s0 + costs.s1 + costs.s2 + costs.s3;
- + }
- + costs[idx] = cost;
- + return costs[0] + costs[1] + costs[2] + costs[3];
- +}
- +
- +/* One thread performs 8x8 SAD */
- +int sad_8x8_ii( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos )
- +{
- + if( frefpos.x < 0 )
- + {
- + /* slow path when MV goes past right edge */
- + int cost = 0;
- + for( int y = 0; y < 8; y++ )
- + {
- + for( int x = 0; x < 8; x++ )
- + {
- + uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0;
- + uint ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0;
- + cost += abs_diff( enc, ref );
- + }
- + }
- + return cost;
- + }
- + else
- + {
- + uint4 enc, ref, cost = 0;
- + for( int y = 0; y < 8; y++ )
- + {
- + for( int x = 0; x < 8; x += 4 )
- + {
- + enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) );
- + ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) );
- + cost += abs_diff( enc, ref );
- + }
- + }
- + return cost.s0 + cost.s1 + cost.s2 + cost.s3;
- + }
- +}
- +
- +
- +/*
- + * hierarchical motion estimation
- + *
- + * Each kernel launch is a single iteration
- + *
- + * MB per work group is determined by lclx / 4 * lcly
- + *
- + * global launch dimensions: [mb_width * 4, mb_height]
- + */
- +kernel void hierarchical_motion(
- + read_only image2d_t fenc,
- + read_only image2d_t fref,
- + const global short2 *in_mvs,
- + global short2 *out_mvs,
- + global int16_t *out_mv_costs,
- + global short2 *mvp_buffer,
- + local int16_t *cost_local,
- + local short2 *mvc_local,
- + int mb_width,
- + int lambda,
- + int me_range,
- + int scale,
- + int b_shift_index,
- + int b_first_iteration,
- + int b_reverse_references )
- +{
- + int mb_x = get_global_id( 0 ) >> 2;
- + if( mb_x >= mb_width )
- + return;
- + int mb_height = get_global_size( 1 );
- + int mb_i = get_global_id( 0 ) & 3;
- + int mb_y = get_global_id( 1 );
- + int mb_xy = mb_y * mb_width + mb_x;
- + const int mb_size = 8;
- + int2 coord;
- + coord.x = mb_x * mb_size;
- + coord.y = mb_y * mb_size;
- +
- + const int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
- + cost_local += 4 * mb_in_group;
- +
- + int i_mvc = 0;
- + mvc_local += 4 * mb_in_group;
- + mvc_local[mb_i] = 0;
- + short2 mvp;
- +
- + if( b_first_iteration )
- + {
- + mvp.x = 0;
- + mvp.y = 0;
- + }
- + else
- + {
- +#define MVC( DX, DY )\
- + {\
- + int px = mb_x + DX;\
- + int py = mb_y + DY;\
- + if( b_shift_index )\
- + mvc_local[i_mvc] = in_mvs[find_downscale_mb_xy( px, py, mb_width, mb_height )];\
- + else\
- + mvc_local[i_mvc] = in_mvs[mb_width * py + px];\
- + mvc_local[i_mvc].x >>= scale;\
- + mvc_local[i_mvc].y >>= scale;\
- + i_mvc++;\
- + }
- + /* Find MVP from median of MVCs */
- + if( b_reverse_references )
- + {
- + /* odd iterations: derive MVP from down and right */
- + if( mb_x < mb_width - 1 )
- + MVC( 1, 0 );
- + if( mb_y < mb_height - 1 )
- + {
- + MVC( 0, 1 );
- + if( mb_x > b_shift_index )
- + MVC( -1, 1 );
- + if( mb_x < mb_width - 1 )
- + MVC( 1, 1 );
- + }
- + }
- + else
- + {
- + /* even iterations: derive MVP from up and left */
- + if( mb_x > 0 )
- + MVC( -1, 0 );
- + if( mb_y > 0 )
- + {
- + MVC( 0, -1 );
- + if( mb_x < mb_width - 1 )
- + MVC( 1, -1 );
- + if( mb_x > b_shift_index )
- + MVC( -1, -1 );
- + }
- + }
- + if( i_mvc <= 1 )
- + {
- + mvp = mvc_local[0];
- + }
- + else
- + mvp = x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] );
- +#undef MVC
- + }
- + // If the new MVP equals the old MVP, copy the input MV to the output MV and exit.
- + if( (!b_shift_index) && (mvp.x == mvp_buffer[mb_xy].x) && (mvp.y == mvp_buffer[mb_xy].y) )
- + {
- + out_mvs[mb_xy] = in_mvs[mb_xy];
- + return;
- + }
- + mvp_buffer[mb_xy] = mvp;
- + short2 mv_min;
- + short2 mv_max;
- + mv_min.x = -mb_size * mb_x - 4;
- + mv_max.x = mb_size * (mb_width - mb_x - 1) + 4;
- + mv_min.y = -mb_size * mb_y - 4;
- + mv_max.y = mb_size * (mb_height - mb_y - 1) + 4;
- +
- + short2 bestmv;
- + bestmv.x = x264_clip3( mvp.x, mv_min.x, mv_max.x );
- + bestmv.y = x264_clip3( mvp.y, mv_min.y, mv_max.y );
- +
- + int2 refcrd;
- + refcrd.x = coord.x + bestmv.x;
- + refcrd.y = coord.y + bestmv.y;
- + /* measure cost at bestmv */
- + int bcost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +
- + lambda * calc_mv_cost( abs_diff( bestmv.x, mvp.x ) << (2 + scale), abs_diff( bestmv.y, mvp.y ) << (2 + scale) );
- +
- + do
- + {
- + /* measure costs at offsets from bestmv */
- + refcrd.x = coord.x + bestmv.x + dia_offs[mb_i].x;
- + refcrd.y = coord.y + bestmv.y + dia_offs[mb_i].y;
- + short2 trymv = bestmv + dia_offs[mb_i];
- + int cost = sad_8x8_ii( fenc, coord, fref, refcrd ) +
- + lambda * calc_mv_cost( abs_diff( trymv.x, mvp.x ) << (2 + scale), abs_diff( trymv.y, mvp.y ) << (2 + scale) );
- +
- + cost_local[mb_i] = (cost<<2) | mb_i;
- + cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) );
- +
- + if( (cost >> 2) >= bcost )
- + break;
- +
- + bestmv += dia_offs[cost&3];
- + bcost = cost>>2;
- +
- + if( bestmv.x >= mv_max.x || bestmv.x <= mv_min.x || bestmv.y >= mv_max.y || bestmv.y <= mv_min.y )
- + break;
- + }
- + while( --me_range > 0 );
- +
- + short2 trymv;
- +
- +#define COST_MV_NO_PAD( X, Y, L )\
- + trymv.x = x264_clip3( X, mv_min.x, mv_max.x );\
- + trymv.y = x264_clip3( Y, mv_min.y, mv_max.y );\
- + if( abs_diff( mvp.x, trymv.x ) > 1 || abs_diff( mvp.y, trymv.y ) > 1 ) {\
- + int2 refcrd = coord; refcrd.x += trymv.x; refcrd.y += trymv.y;\
- + int cost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +\
- + L * calc_mv_cost( abs_diff( trymv.x, mvp.x ) << (2 + scale), abs_diff( trymv.y, mvp.y ) << (2 + scale) );\
- + if( cost < bcost ) { bcost = cost; bestmv = trymv; } }
- +
- + COST_MV_NO_PAD( 0, 0, 0 );
- +
- + if( !b_first_iteration )
- + {
- + /* try cost at previous iteration's MV, if MVP was too far away */
- + short2 prevmv;
- + if( b_shift_index )
- + prevmv = in_mvs[find_downscale_mb_xy( mb_x, mb_y, mb_width, mb_height )];
- + else
- + prevmv = in_mvs[mb_xy];
- + prevmv.x >>= scale;
- + prevmv.y >>= scale;
- + COST_MV_NO_PAD( prevmv.x, prevmv.y, lambda );
- + }
- +
- + for( int i = 0; i < i_mvc; i++ )
- + {
- + /* try cost at each candidate MV, if MVP was too far away */
- + COST_MV_NO_PAD( mvc_local[i].x, mvc_local[i].y, lambda );
- + }
- +
- + if( mb_i == 0 )
- + {
- + bestmv.x <<= scale;
- + bestmv.y <<= scale;
- + out_mvs[mb_xy] = bestmv;
- + out_mv_costs[mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK );
- + }
- +}
- diff -r 087c10a1fec7 -r 9a682cb0c74f common/opencl/subpel.cl
- --- /dev/null Thu Jan 01 00:00:00 1970 +0000
- +++ b/common/opencl/subpel.cl Tue May 15 18:05:00 2012 -0500
- @@ -0,0 +1,283 @@
- +
- +/* One thread performs 8x8 SAD */
- +int sad_8x8_ii_hpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos )
- +{
- + int2 frefpos = (int2)(qpos.x >> 2, qpos.y >> 2);
- + int hpel_idx = ((qpos.x & 2) >> 1) + (qpos.y & 2);
- +
- + uint mask_shift = 8 * (3 - hpel_idx);
- + uint mask = 0x000000ff << mask_shift;
- +
- + uint4 cost4 = 0;
- +
- + for( int y = 0; y < 8; y++ )
- + {
- + uint4 enc, val4;
- + enc = read_imageui( fenc, sampler, fencpos + (int2)(0, y));
- + val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(0, y)).s0 & mask) >> mask_shift;
- + val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(1, y)).s0 & mask) >> mask_shift;
- + val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(2, y)).s0 & mask) >> mask_shift;
- + val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(3, y)).s0 & mask) >> mask_shift;
- + cost4 += abs_diff( enc, val4 );
- +
- + enc = read_imageui( fenc, sampler, fencpos + (int2)(4, y));
- + val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(4, y)).s0 & mask) >> mask_shift;
- + val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(5, y)).s0 & mask) >> mask_shift;
- + val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(6, y)).s0 & mask) >> mask_shift;
- + val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(7, y)).s0 & mask) >> mask_shift;
- + cost4 += abs_diff( enc, val4 );
- + }
- +
- + return cost4.s0 + cost4.s1 + cost4.s2 + cost4.s3;
- +}
- +
- +/* One thread measures 8x8 SAD cost at a QPEL offset into an HPEL plane */
- +int sad_8x8_ii_qpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos )
- +{
- + int2 frefApos = qpos >> 2;
- + int hpelA = ((qpos.x & 2) >> 1) + (qpos.y & 2);
- +
- + int2 qposB = qpos + ((qpos & 1) << 1);
- + int2 frefBpos = qposB >> 2;
- + int hpelB = ((qposB.x & 2) >> 1) + (qposB.y & 2);
- +
- + uint mask_shift0 = 8 * (3 - hpelA), mask_shift1 = 8 * (3 - hpelB);
- + uint mask0 = 0x000000ff << mask_shift0, mask1 = 0x000000ff << mask_shift1;
- +
- + int cost = 0;
- +
- + for( int y = 0; y < 8; y++ )
- + {
- + for( int x = 0; x < 8; x++ )
- + {
- + uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y)).s0;
- + uint vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(x, y)).s0 & mask0) >> mask_shift0;
- + uint vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(x, y)).s0 & mask1) >> mask_shift1;
- + uint ref = rhadd( vA, vB );
- + cost += abs_diff( enc, ref );
- + }
- + }
- +
- + return cost;
- +}
- +
- +/* Four threads measure 8x8 SATD cost at a QPEL offset into an HPEL plane
- + *
- + * Each thread collects 1/4 of the rows of diffs and processes one quarter of
- + * the transforms
- + */
- +int satd_8x8_ii_qpel_coop4(
- + read_only image2d_t fenc, int2 fencpos,
- + read_only image2d_t fref_planes, int2 qpos,
- + local sum2_t *tmpp,
- + int idx )
- +{
- + volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp;
- + sum2_t b0, b1, b2, b3;
- +
- + // fencpos is full-pel position of original MB
- + // qpos is qpel position within reference frame
- + int2 frefApos = (int2)(qpos.x>>2, qpos.y>>2);
- + int hpelA = ((qpos.x&2)>>1) + (qpos.y&2);
- +
- + int2 qposB = (int2)qpos + (int2)(((qpos.x&1)<<1), ((qpos.y&1)<<1));
- + int2 frefBpos = (int2)(qposB.x>>2, qposB.y>>2);
- + int hpelB = ((qposB.x&2)>>1) + (qposB.y&2);
- +
- + uint mask_shift0 = 8 * (3 - hpelA), mask_shift1 = 8 * (3 - hpelB);
- + uint mask0 = 0x000000ff << mask_shift0, mask1 = 0x000000ff << mask_shift1;
- +
- + uint vA, vB;
- + uint a0, a1;
- + uint enc;
- + sum2_t sum = 0;
- +
- +
- +#define READ_DIFF( OUT, X )\
- + enc = read_imageui( fenc, sampler, fencpos + (int2)(X, idx) ).s0;\
- + vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(X, idx) ).s0 & mask0) >> mask_shift0;\
- + vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(X, idx) ).s0 & mask1) >> mask_shift1;\
- + OUT = enc - rhadd( vA, vB );
- +
- +#define READ_DIFF_EX( OUT, a, b )\
- + {\
- + READ_DIFF( a0, a );\
- + READ_DIFF( a1, b );\
- + OUT = a0 + (a1<<BITS_PER_SUM);\
- + }
- +#define ROW_8x4_SATD( a, b )\
- + {\
- + fencpos.y += a;\
- + frefApos.y += b;\
- + frefBpos.y += b;\
- + READ_DIFF_EX( b0, 0, 4 );\
- + READ_DIFF_EX( b1, 1, 5 );\
- + READ_DIFF_EX( b2, 2, 6 );\
- + READ_DIFF_EX( b3, 3, 7 );\
- + HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\
- + HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\
- + sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 );\
- + }
- + ROW_8x4_SATD( 0, 0 );
- + ROW_8x4_SATD( 4, 4 );
- +
- +#undef READ_DIFF
- +#undef READ_DIFF_EX
- +#undef ROW_8x4_SATD
- + return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;
- +}
- +
- +constant short2 hpoffs[4] =
- +{
- + {0, -2}, {-2, 0}, {2, 0}, {0, 2}
- +};
- +
- +/*
- + * sub pixel refinement of motion vectors, output MVs and costs are moved from
- + * temporary buffers into final per-frame buffer
- + *
- + * global launch dimensions: [mb_width * 4, mb_height]
- + *
- + * With X being the source 16x16 pixels, F is the lowres pixel used by the
- + * motion search. We will now utilize the H V and C pixels (stored in separate
- + * planes) to search at half-pel increments.
- + *
- + * X X X X X X
- + * F H F H F
- + * X X X X X X
- + * V C V C V
- + * X X X X X X
- + * F H F H F
- + * X X X X X X
- + *
- + * The YX HPEL bits of the motion vector selects the plane we search in. The
- + * four planes are packed in the fref_planes 2D image buffer. Each sample
- + * returns: s0 = F, s1 = H, s2 = V, s3 = C
- + */
- +kernel void subpel_refine(
- + read_only image2d_t fenc,
- + read_only image2d_t fref_planes,
- + const global short2 *in_mvs,
- + const global int16_t *in_sad_mv_costs,
- + local int16_t *cost_local,
- + local sum2_t *satd_local,
- + local short2 *mvc_local,
- + global short2 *fenc_lowres_mv,
- + global int16_t *fenc_lowres_mv_costs,
- + int mb_width,
- + int lambda,
- + int b,
- + int ref,
- + int b_islist1 )
- +{
- + int mb_x = get_global_id( 0 ) >> 2;
- + if( mb_x >= mb_width )
- + return;
- + int mb_height = get_global_size( 1 );
- +
- + int mb_i = get_global_id( 0 ) & 3;
- + int mb_y = get_global_id( 1 );
- + int mb_xy = mb_y * mb_width + mb_x;
- +
- + fenc_lowres_mv += ( b_islist1 ? ( ref - b - 1 ) : ( b - ref - 1 ) ) * mb_width * mb_height;
- + fenc_lowres_mv_costs += ( b_islist1 ? ( ref - b - 1 ) : ( b - ref - 1 ) ) * mb_width * mb_height;
- +
- + int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
- + cost_local += mb_in_group * 4;
- + satd_local += mb_in_group * 16;
- + mvc_local += mb_in_group * 4;
- +
- + int i_mvc = 0;
- + mvc_local[0] = 0;
- + mvc_local[1] = 0;
- + mvc_local[2] = 0;
- + mvc_local[3] = 0;
- + short2 mvp;
- +#define MVC( DX, DY ) mvc_local[i_mvc++] = in_mvs[mb_width * (mb_y + DY) + (mb_x + DX)];
- + if( mb_x > 0 )
- + MVC( -1, 0 );
- + if( mb_y > 0 )
- + {
- + MVC( 0, -1 );
- + if( mb_x < mb_width - 1 )
- + MVC( 1, -1 );
- + if( mb_x > 0 )
- + MVC( -1, -1 );
- + }
- + if( i_mvc <= 1 )
- + mvp = mvc_local[0];
- + else
- + mvp = x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] );
- +#undef MVC
- +
- + int bcost = in_sad_mv_costs[mb_xy];
- + short2 bmv = in_mvs[mb_xy];
- +
- + /* Make mvp a QPEL MV */
- + mvp.x <<= 2;
- + mvp.y <<= 2;
- +
- + /* Make bmv a QPEL MV */
- + bmv.x <<= 2;
- + bmv.y <<= 2;
- +
- + int2 coord;
- + coord.x = mb_x << 3;
- + coord.y = mb_y << 3;
- + short2 trymv;
- + int2 qpos;
- + int cost = 0;
- +
- +#define HPEL_QPEL( ARR, FUNC )\
- + {\
- + trymv = bmv + ARR[mb_i];\
- + qpos.x = (coord.x << 2) + trymv.x;\
- + qpos.y = (coord.y << 2) + trymv.y;\
- + cost = FUNC( fenc, coord, fref_planes, qpos ) + lambda * calc_mv_cost( abs_diff( trymv.x, mvp.x ), abs_diff( trymv.y, mvp.y ) );\
- + cost_local[mb_i] = (cost<<2) + mb_i;\
- + cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) );\
- + if( (cost>>2) < bcost )\
- + {\
- + bmv += ARR[cost&3];\
- + bcost = cost>>2;\
- + }\
- + }
- +
- + HPEL_QPEL( hpoffs, sad_8x8_ii_hpel );
- +
- +#define CHEAP_QPEL 1
- +
- +#if CHEAP_QPEL
- + HPEL_QPEL( dia_offs, sad_8x8_ii_qpel );
- +#endif
- +
- + /* remeasure with SATD */
- + qpos.x = (coord.x << 2) + bmv.x;
- + qpos.y = (coord.y << 2) + bmv.y;
- + cost_local[mb_i] = satd_8x8_ii_qpel_coop4( fenc, coord, fref_planes, qpos, satd_local, mb_i );
- + bcost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];
- + bcost += lambda * calc_mv_cost( abs_diff( bmv.x, mvp.x ), abs_diff( bmv.y, mvp.y ) );
- + short2 bestmv = 0;
- +
- +#if !CHEAP_QPEL
- +#define QPEL_SATD( ARR, FUNC )\
- + {\
- + trymv = bmv + ARR;\
- + qpos.x = (coord.x << 2) + trymv.x;\
- + qpos.y = (coord.y << 2) + trymv.y;\
- + cost_local[mb_i] = FUNC( fenc, coord, fref_planes, qpos, satd_local, mb_i );\
- + cost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];\
- + cost += lambda * calc_mv_cost( abs_diff( trymv.x, mvp.x ), abs_diff( trymv.y, mvp.y ) );\
- + if( cost < bcost )\
- + {\
- + bestmv = ARR;\
- + bcost = cost;\
- + }\
- + }
- + for( int i = 0; i<4; i++ )
- + QPEL_SATD( dia_offs[i], satd_8x8_ii_qpel_coop4 );
- +#endif
- +
- + fenc_lowres_mv[mb_xy] = bmv+bestmv;
- + fenc_lowres_mv_costs[mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK );
- +}
- diff -r 087c10a1fec7 -r 9a682cb0c74f common/opencl/weightp.cl
- --- /dev/null Thu Jan 01 00:00:00 1970 +0000
- +++ b/common/opencl/weightp.cl Tue May 15 18:05:00 2012 -0500
- @@ -0,0 +1,45 @@
- +/*
- +Launch dimensions: width x height (in pixels)
- +*/
- +kernel void weightp_scaled_images( read_only image2d_t in_plane,
- + write_only image2d_t out_plane,
- + uint offset,
- + uint scale,
- + uint denom )
- +{
- + int gx = get_global_id( 0 );
- + int gy = get_global_id( 1 );
- + uint4 input_val;
- + uint4 output_val;
- +
- + input_val = read_imageui( in_plane, sampler, (int2)(gx, gy));
- + output_val = (uint4)(offset) + ( ( ((uint4)(scale)) * input_val ) >> ((uint4)(denom)) );
- + write_imageui( out_plane, (int2)(gx, gy), output_val );
- +}
- +
- +/*
- +Launch dimensions: width x height (in pixels)
- +*/
- +kernel void weightp_hpel( read_only image2d_t in_plane,
- + write_only image2d_t out_plane,
- + uint offset,
- + uint scale,
- + uint denom )
- +{
- + int gx = get_global_id( 0 );
- + int gy = get_global_id( 1 );
- + uint input_val;
- + uint output_val;
- +
- + input_val = read_imageui( in_plane, sampler, (int2)(gx, gy)).s0;
- + //Unpack
- + uint4 temp;
- + temp.s0 = input_val & 0x00ff; temp.s1 = (input_val >> 8) & 0x00ff;
- + temp.s2 = (input_val >> 16) & 0x00ff; temp.s3 = (input_val >> 24) & 0x00ff;
- +
- + temp = (uint4)(offset) + ( ( ((uint4)(scale)) * temp ) >> ((uint4)(denom)) );
- +
- + //Pack
- + output_val = temp.s0 | (temp.s1 << 8) | (temp.s2 << 16) | (temp.s3 << 24);
- + write_imageui( out_plane, (int2)(gx, gy), output_val );
- +}
- diff -r 087c10a1fec7 -r 9a682cb0c74f common/opencl/x264-cl.h
- --- /dev/null Thu Jan 01 00:00:00 1970 +0000
- +++ b/common/opencl/x264-cl.h Tue May 15 18:05:00 2012 -0500
- @@ -0,0 +1,156 @@
- +#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
- +
- +constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
- +
- +/* 7.18.1.1 Exact-width integer types */
- +typedef signed char int8_t;
- +typedef unsigned char uint8_t;
- +typedef short int16_t;
- +typedef unsigned short uint16_t;
- +typedef int int32_t;
- +typedef unsigned uint32_t;
- +typedef long int64_t;
- +typedef unsigned long uint64_t;
- +
- +typedef union
- +{
- + uint32_t i;
- + uint16_t b[2];
- + uint8_t c[4];
- +} x264_union32_t;
- +
- +typedef uint8_t pixel;
- +typedef uint16_t sum_t;
- +typedef uint32_t sum2_t;
- +
- +#define LOWRES_COST_MASK ((1<<14)-1)
- +#define LOWRES_COST_SHIFT 14
- +#define COST_MAX (1<<28)
- +
- +#define X264_MIN( a, b ) ((a)<(b) ? (a) : (b))
- +
- +#define PIXEL_MAX 255
- +#define BITS_PER_SUM (8 * sizeof(sum_t))
- +
- +#define COPY2_IF_LT( x, y, a, b )\
- + if((y)<(x))\
- + {\
- + (x) = (y);\
- + (a) = (b);\
- + }
- +constant short2 dia_offs[4] =
- +{
- + {0, -1}, {-1, 0}, {1, 0}, {0, 1},
- +};
- +
- +inline pixel x264_clip_pixel( int x )
- +{
- + return ((x & ~PIXEL_MAX) ? (-x) >> 31 & PIXEL_MAX : x);
- +}
- +
- +inline int x264_clip3( int v, int i_min, int i_max )
- +{
- + return ((v < i_min) ? i_min : (v > i_max) ? i_max : v);
- +}
- +
- +inline int x264_median( int a, int b, int c )
- +{
- + int t = (a - b) & ((a - b) >> 31);
- + a -= t;
- + b += t;
- + b -= (b - c) & ((b - c) >> 31);
- + b += (a - b) & ((a - b) >> 31);
- + return b;
- +}
- +
- +inline short2 x264_median_mv( short2 a, short2 b, short2 c )
- +{
- + short2 dst;
- + dst.x = x264_median( a.x, b.x, c.x );
- + dst.y = x264_median( a.y, b.y, c.y );
- + return dst;
- +}
- +
- +inline sum2_t abs2( sum2_t a )
- +{
- + sum2_t s = ((a >> (BITS_PER_SUM - 1)) & (((sum2_t)1 << BITS_PER_SUM) + 1)) * ((sum_t)-1);
- + return (a + s) ^ s;
- +}
- +
- +#define HADAMARD4( d0, d1, d2, d3, s0, s1, s2, s3 ) {\
- + sum2_t t0 = s0 + s1;\
- + sum2_t t1 = s0 - s1;\
- + sum2_t t2 = s2 + s3;\
- + sum2_t t3 = s2 - s3;\
- + d0 = t0 + t2;\
- + d2 = t0 - t2;\
- + d1 = t1 + t3;\
- + d3 = t1 - t3;\
- +}
- +
- +#define HADAMARD4V( d0, d1, d2, d3, s0, s1, s2, s3 ) {\
- + int2 t0 = s0 + s1;\
- + int2 t1 = s0 - s1;\
- + int2 t2 = s2 + s3;\
- + int2 t3 = s2 - s3;\
- + d0 = t0 + t2;\
- + d2 = t0 - t2;\
- + d1 = t1 + t3;\
- + d3 = t1 - t3;\
- +}
- +
- +#define SATD_C_8x4_Q( name, q1, q2 )\
- + int name( q1 pixel *pix1, int i_pix1, q2 pixel *pix2, int i_pix2 )\
- + {\
- + sum2_t tmp[4][4];\
- + sum2_t a0, a1, a2, a3;\
- + sum2_t sum = 0;\
- + for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )\
- + {\
- + a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);\
- + a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);\
- + a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);\
- + a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);\
- + HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3 );\
- + }\
- + for( int i = 0; i < 4; i++ )\
- + {\
- + HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );\
- + sum += abs2( a0 ) + abs2( a1 ) + abs2( a2 ) + abs2( a3 );\
- + }\
- + return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;\
- + }
- +
- +/*
- + * Utility function to perform a parallel sum reduction of an array of integers
- + */
- +int parallel_sum( int value, int x, volatile local int *array )
- +{
- + array[x] = value;
- + barrier( CLK_LOCAL_MEM_FENCE );
- +
- + int dim = get_local_size( 0 );
- +
- + while( dim > 1 )
- + {
- + dim >>= 1;
- +
- + if( x < dim )
- + {
- + array[x] += array[x + dim];
- + }
- +
- + if( dim > 32 )
- + {
- + barrier( CLK_LOCAL_MEM_FENCE );
- + }
- + }
- +
- + return array[0];
- +}
- +
- +int calc_mv_cost( int dx, int dy )
- +{
- + return round( (log2( (float)(dx + 1) ) * 2.0f + 0.718f + !!dx) ) +
- + round( (log2( (float)(dy + 1) ) * 2.0f + 0.718f + !!dy) );
- +}
- diff -r 087c10a1fec7 -r 9a682cb0c74f configure
- --- a/configure Thu Mar 29 14:14:07 2012 -0700
- +++ b/configure Tue May 15 18:05:00 2012 -0500
- @@ -25,6 +25,7 @@
- --system-libx264 use system libx264 instead of internal
- --enable-shared build shared library
- --enable-static build static library
- + --disable-opencl disable OpenCL features
- --disable-gpl disable GPL-only features
- --disable-thread disable multithreaded encoding
- --enable-win32thread use win32threads (windows only)
- @@ -273,6 +274,7 @@
- bit_depth="8"
- chroma_format="all"
- compiler="GNU"
- +opencl="yes"
- CFLAGS="$CFLAGS -Wall -I. -I\$(SRCPATH)"
- LDFLAGS="$LDFLAGS"
- @@ -381,6 +383,9 @@
- --host=*)
- host="$optarg"
- ;;
- + --disable-opencl)
- + opencl="no"
- + ;;
- --cross-prefix=*)
- cross_prefix="$optarg"
- ;;
- @@ -1082,6 +1087,37 @@
- PROF_USE_LD=$PROF_USE_LD
- EOF
- +if [ "$opencl" = "yes" ]; then
- + log_check "looking for xxd"
- + if ! $(xxd -v 2>/dev/null); then
- + echo 'OpenCL support requires xxd to compile.'
- + echo 'use --disable-opencl to compile without OpenCL'
- + exit 1
- + elif [ "$CUDA_PATH" != "" ]; then
- + echo 'HAVE_OPENCL=yes' >> config.mak
- + echo 'OPENCL_LIB=OpenCL' >> config.mak
- + echo 'OPENCL_INC_DIR=$(CUDA_PATH)include' >> config.mak
- + if [ "$ARCH" = "X86" ]; then
- + echo 'OPENCL_LIB_DIR=$(CUDA_PATH)lib/Win32' >> config.mak
- + else
- + echo 'OPENCL_LIB_DIR=$(CUDA_PATH)lib/x64' >> config.mak
- + fi
- + define HAVE_OPENCL
- + elif [ "$AMDAPPSDKROOT" != "" ]; then
- + echo 'HAVE_OPENCL=yes' >> config.mak
- + echo 'OPENCL_LIB=OpenCL' >> config.mak
- + echo 'OPENCL_INC_DIR=$(AMDAPPSDKROOT)/include' >> config.mak
- + if [ "$ARCH" = "X86" ]; then
- + echo 'OPENCL_LIB_DIR=$(AMDAPPSDKROOT)/lib/x86' >> config.mak
- + else
- + echo 'OPENCL_LIB_DIR=$(AMDAPPSDKROOT)/lib/x86_64' >> config.mak
- + fi
- + define HAVE_OPENCL
- + else
- + opencl="no"
- + fi
- +fi
- +
- if [ $compiler = ICL ]; then
- echo '%.o: %.c' >> config.mak
- echo ' $(CC) $(CFLAGS) -c -Fo$@ $<' >> config.mak
- @@ -1192,6 +1228,7 @@
- visualize: $vis
- bit depth: $bit_depth
- chroma format: $chroma_format
- +opencl: $opencl
- EOF
- echo >> config.log
- diff -r 087c10a1fec7 -r 9a682cb0c74f encoder/encoder.c
- --- a/encoder/encoder.c Thu Mar 29 14:14:07 2012 -0700
- +++ b/encoder/encoder.c Tue May 15 18:05:00 2012 -0500
- @@ -37,6 +37,10 @@
- #include "common/visualize.h"
- #endif
- +#if HAVE_OPENCL
- +#include "opencl.c"
- +#endif
- +
- //#define DEBUG_MB_TYPE
- #define bs_write_ue bs_write_ue_big
- @@ -515,6 +519,21 @@
- if( h->i_thread_frames > 1 )
- h->param.nalu_process = NULL;
- +#ifndef HAVE_OPENCL
- + if( h->param.b_opencl )
- + {
- + x264_log( h, X264_LOG_WARNING, "not compiled with OpenCL support!\n" );
- + h->param.b_opencl = 0;
- + }
- +#endif
- +#if BIT_DEPTH > 8
- + if( h->param.b_opencl )
- + {
- + x264_log( h, X264_LOG_WARNING, "OpenCL lookahead does not support high bit depth!\n" );
- + h->param.b_opencl = 0;
- + }
- +#endif
- +
- h->param.i_keyint_max = x264_clip3( h->param.i_keyint_max, 1, X264_KEYINT_MAX_INFINITE );
- if( h->param.i_keyint_max == 1 )
- {
- @@ -1306,6 +1325,14 @@
- goto fail;
- }
- +#if HAVE_OPENCL
- + if( h->param.b_opencl )
- + {
- + if( x264_opencl_init( h ) )
- + h->param.b_opencl = 0;
- + }
- +#endif
- +
- if( x264_lookahead_init( h, i_slicetype_length ) )
- goto fail;
- @@ -3451,6 +3478,10 @@
- || h->stat.i_mb_count[SLICE_TYPE_P][I_PCM]
- || h->stat.i_mb_count[SLICE_TYPE_B][I_PCM];
- +#if HAVE_OPENCL
- + x264_opencl_free( h );
- +#endif
- +
- x264_lookahead_delete( h );
- if( h->param.b_sliced_threads )
- diff -r 087c10a1fec7 -r 9a682cb0c74f encoder/opencl.c
- --- /dev/null Thu Jan 01 00:00:00 1970 +0000
- +++ b/encoder/opencl.c Tue May 15 18:05:00 2012 -0500
- @@ -0,0 +1,424 @@
- +/*****************************************************************************
- + * opencl.c: OpenCL initialization and kernel compilation
- + *****************************************************************************
- + * Copyright (C) 2010-2012 x264 project
- + *
- + * Authors: Steve Borho <sborho@multicorewareinc.com>
- + *
- + * This program is free software; you can redistribute it and/or modify
- + * it under the terms of the GNU General Public License as published by
- + * the Free Software Foundation; either version 2 of the License, or
- + * (at your option) any later version.
- + *
- + * This program is distributed in the hope that it will be useful,
- + * but WITHOUT ANY WARRANTY; without even the implied warranty of
- + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- + * GNU General Public License for more details.
- + *
- + * You should have received a copy of the GNU General Public License
- + * along with this program; if not, write to the Free Software
- + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- + *
- + * This program is also available under a commercial proprietary license.
- + * For more information, contact us at licensing@x264.com.
- + *****************************************************************************/
- +
- +#if HAVE_OPENCL
- +#include "common/common.h"
- +#include "encoder/oclobj.h"
- +
- +/* Try to load the cached compiled program binary, verify the device context is
- + * still valid before reuse */
- +static cl_program x264_opencl_cache_load( x264_t *h, char *devname, char *devvendor, char *driverversion )
- +{
- + cl_program program = NULL;
- + cl_int status;
- +
- + /* try to load cached program binary */
- + FILE *fp = fopen( h->param.psz_clbin_file, "rb" );
- + if( !fp )
- + return NULL;
- +
- + fseek( fp, 0L, SEEK_END );
- + size_t size = ftell( fp );
- + rewind( fp );
- + uint8_t *binary = x264_malloc( size );
- + if( !binary )
- + goto fail;
- +
- + fread( binary, 1, size, fp );
- + const uint8_t *ptr = (const uint8_t*)binary;
- +
- +#define CHECK_STRING( STR )\
- + do {\
- + size_t len = strlen( STR );\
- + if( size <= len || strncmp( (char*)ptr, STR, len ) )\
- + goto fail;\
- + else {\
- + size -= (len+1); ptr += (len+1);\
- + }\
- + } while( 0 )
- +
- + CHECK_STRING( devname );
- + CHECK_STRING( devvendor );
- + CHECK_STRING( driverversion );
- +#undef CHECK_STRING
- +
- + program = clCreateProgramWithBinary( h->opencl.context, 1, &h->opencl.device, &size, &ptr, NULL, &status );
- + if( status != CL_SUCCESS )
- + program = NULL;
- +
- +fail:
- + fclose( fp );
- + x264_free( binary );
- + return program;
- +}
- +
- +/* Save the compiled program binary to a file for later reuse. Device context
- + * is also saved in the cache file so we do not reuse stale binaries */
- +static void x264_opencl_cache_save( x264_t *h, cl_program program, char *devname, char *devvendor, char *driverversion )
- +{
- + FILE *fp = fopen( h->param.psz_clbin_file, "wb" );
- + if( !fp )
- + return;
- +
- + size_t size;
- + cl_int status = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL );
- + if( status == CL_SUCCESS )
- + {
- + unsigned char *binary = x264_malloc( size );
- + status = clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(unsigned char *), &binary, NULL );
- + if( status == CL_SUCCESS )
- + {
- + fwrite( devname, 1, strlen( devname ), fp );
- + fwrite( "\n", 1, 1, fp );
- + fwrite( devvendor, 1, strlen( devvendor ), fp );
- + fwrite( "\n", 1, 1, fp );
- + fwrite( driverversion, 1, strlen( driverversion ), fp );
- + fwrite( "\n", 1, 1, fp );
- + fwrite( binary, 1, size, fp );
- + }
- + x264_free( binary );
- + }
- + fclose( fp );
- +}
- +
- +static int x264_detect_AMD_GPU( cl_device_id device, int *b_is_SI )
- +{
- + char extensions[512];
- + char boardname[64];
- + char devname[64];
- +
- + *b_is_SI = 0;
- +
- + cl_int status = clGetDeviceInfo( device, CL_DEVICE_EXTENSIONS, sizeof(extensions), extensions, NULL );
- + if( status != CL_SUCCESS || !strstr( extensions, "cl_amd_media_ops" ) )
- + return 0;
- +
- + /* Detect whether GPU is a SouthernIsland based device */
- +#define CL_DEVICE_BOARD_NAME_AMD 0x4038
- + status = clGetDeviceInfo( device, CL_DEVICE_BOARD_NAME_AMD, sizeof(boardname), boardname, NULL );
- + if( status == CL_SUCCESS )
- + {
- + boardname[15] = boardname[16] = boardname[17] = 'X';
- + *b_is_SI = !strcmp( boardname, "AMD Radeon HD 7XXX Series" );
- + return 1;
- + }
- +
- + /* Fall back to checking the device name */
- + status = clGetDeviceInfo( device, CL_DEVICE_NAME, sizeof(devname), devname, NULL );
- + if( status != CL_SUCCESS )
- + return 1;
- +
- + const char *tahiti_names[] = { "Tahiti", "Pitcairn", "Capeverde", "Bali", NULL };
- + for( int i = 0; tahiti_names[i]; i++ )
- + if( !strcmp( devname, tahiti_names[i] ) )
- + {
- + *b_is_SI = 1;
- + return 1;
- + }
- + return 1;
- +}
- +
- +/* The OpenCL source under common/opencl will be merged into encoder/oclobj.h by
- + * the Makefile. It defines a x264_opencl_source byte array which we will pass
- + * to clCreateProgramWithSource(). We also attempt to use a cache file for the
- + * compiled binary, stored in the current working folder. */
- +static cl_program x264_opencl_compile( x264_t *h )
- +{
- + cl_program program;
- + cl_int status;
- +
- + char devname[64];
- + char devvendor[64];
- + char driverversion[64];
- + status = clGetDeviceInfo( h->opencl.device, CL_DEVICE_NAME, sizeof(devname), devname, NULL );
- + status |= clGetDeviceInfo( h->opencl.device, CL_DEVICE_VENDOR, sizeof(devvendor), devvendor, NULL );
- + status |= clGetDeviceInfo( h->opencl.device, CL_DRIVER_VERSION, sizeof(driverversion), driverversion, NULL );
- + if( status != CL_SUCCESS )
- + return NULL;
- +
- + int b_isamd = x264_detect_AMD_GPU( h->opencl.device, &h->opencl.b_device_AMD_SI );
- +
- + x264_log( h, X264_LOG_INFO, "OpenCL: %s %s %s\n", devvendor, devname, h->opencl.b_device_AMD_SI ? "(SI)" : "" );
- +
- + program = x264_opencl_cache_load( h, devname, devvendor, driverversion );
- + if( !program )
- + {
- + /* clCreateProgramWithSource() requires a pointer variable, you cannot just use &x264_opencl_source */
- + x264_log( h, X264_LOG_INFO, "Compiling OpenCL kernels...\n" );
- + const char *strptr = (const char*)x264_opencl_source;
- + size_t size = sizeof(x264_opencl_source);
- + program = clCreateProgramWithSource( h->opencl.context, 1, &strptr, &size, &status );
- + if( status != CL_SUCCESS || !program )
- + {
- + x264_log( h, X264_LOG_ERROR, "OpenCL: unable to create program\n" );
- + return NULL;
- + }
- + }
- +
- + /* Build the program binary for the OpenCL device */
- + const char *buildopts = "";
- + if( b_isamd && !h->opencl.b_device_AMD_SI )
- + buildopts = "-DVECTORIZE=1";
- + status = clBuildProgram( program, 1, &h->opencl.device, buildopts, NULL, NULL );
- + if( status == CL_SUCCESS )
- + {
- + x264_opencl_cache_save( h, program, devname, devvendor, driverversion );
- + return program;
- + }
- +
- + /* Compile failure, should not happen with production code. */
- +
- + size_t build_log_len = 0;
- +
- + status = clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, build_log_len, NULL, &build_log_len );
- + if( status != CL_SUCCESS )
- + {
- + x264_log( h, X264_LOG_ERROR, "OpenCL: Compilation failed, unable to get build log\n" );
- + return NULL;
- + }
- +
- + char *build_log = x264_malloc( build_log_len );
- + if( !build_log )
- + return NULL;
- +
- + status = clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, build_log_len, build_log, NULL );
- + if( status != CL_SUCCESS )
- + {
- + x264_log( h, X264_LOG_ERROR, "OpenCL: Compilation failed, unable to get build log\n" );
- + x264_free( build_log );
- + return NULL;
- + }
- +
- + FILE *lg = fopen( "x264_kernel_build_log.txt", "w" );
- + if( lg )
- + {
- + fwrite( build_log, 1, build_log_len, lg );
- + fclose( lg );
- + x264_log( h, X264_LOG_ERROR, "OpenCL: kernel build errors written to x264_kernel_build_log.txt\n" );
- + }
- +
- + x264_free( build_log );
- + return NULL;
- +}
- +
- +static void x264_opencl_free_lookahead( x264_t *h )
- +{
- +#define RELEASE( a, f ) if( a ) f( a );
- + RELEASE( h->opencl.intra_kernel, clReleaseKernel )
- + RELEASE( h->opencl.rowsum_intra_kernel, clReleaseKernel )
- + RELEASE( h->opencl.downscale_kernel1, clReleaseKernel )
- + RELEASE( h->opencl.downscale_kernel2, clReleaseKernel )
- + RELEASE( h->opencl.downscale_hpel_kernel, clReleaseKernel )
- + RELEASE( h->opencl.weightp_hpel_kernel, clReleaseKernel )
- + RELEASE( h->opencl.weightp_scaled_images_kernel, clReleaseKernel )
- + RELEASE( h->opencl.memset_kernel, clReleaseKernel )
- + RELEASE( h->opencl.hme_kernel, clReleaseKernel )
- + RELEASE( h->opencl.subpel_refine_kernel, clReleaseKernel )
- + RELEASE( h->opencl.mode_select_kernel, clReleaseKernel )
- + RELEASE( h->opencl.rowsum_inter_kernel, clReleaseKernel )
- + RELEASE( h->opencl.lookahead_program, clReleaseProgram )
- + RELEASE( h->opencl.row_satds[0], clReleaseMemObject )
- + RELEASE( h->opencl.row_satds[1], clReleaseMemObject )
- + RELEASE( h->opencl.frame_stats[0], clReleaseMemObject )
- + RELEASE( h->opencl.frame_stats[1], clReleaseMemObject )
- + RELEASE( h->opencl.mv_buffers[0], clReleaseMemObject )
- + RELEASE( h->opencl.mv_buffers[1], clReleaseMemObject )
- + RELEASE( h->opencl.mvp_buffer, clReleaseMemObject )
- + RELEASE( h->opencl.luma_16x16_image[0], clReleaseMemObject )
- + RELEASE( h->opencl.luma_16x16_image[1], clReleaseMemObject )
- + RELEASE( h->opencl.lowres_mv_costs, clReleaseMemObject )
- + RELEASE( h->opencl.lowres_costs[0], clReleaseMemObject )
- + RELEASE( h->opencl.lowres_costs[1], clReleaseMemObject )
- + RELEASE( h->opencl.page_locked_buffer, clReleaseMemObject )
- + RELEASE( h->opencl.weighted_luma_hpel, clReleaseMemObject )
- + for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
- + {
- + RELEASE( h->opencl.weighted_scaled_images[i], clReleaseMemObject )
- + }
- +#undef RELEASE
- +}
- +
- +static int x264_opencl_init_lookahead( x264_t *h )
- +{
- + if( h->param.rc.i_lookahead == 0 )
- + return -1;
- +
- + char *kernelnames[] = {
- + "mb_intra_cost_satd_8x8",
- + "sum_intra_cost",
- + "downscale_hpel",
- + "downscale1",
- + "downscale2",
- + "memset_int16",
- + "weightp_scaled_images",
- + "weightp_hpel",
- + "hierarchical_motion",
- + "subpel_refine",
- + "mode_selection",
- + "sum_inter_cost"
- + };
- + cl_kernel *kernels[] = {
- + &h->opencl.intra_kernel,
- + &h->opencl.rowsum_intra_kernel,
- + &h->opencl.downscale_hpel_kernel,
- + &h->opencl.downscale_kernel1,
- + &h->opencl.downscale_kernel2,
- + &h->opencl.memset_kernel,
- + &h->opencl.weightp_scaled_images_kernel,
- + &h->opencl.weightp_hpel_kernel,
- + &h->opencl.hme_kernel,
- + &h->opencl.subpel_refine_kernel,
- + &h->opencl.mode_select_kernel,
- + &h->opencl.rowsum_inter_kernel
- + };
- + cl_int status;
- +
- + h->opencl.lookahead_program = x264_opencl_compile( h );
- + if( !h->opencl.lookahead_program )
- + {
- + x264_opencl_free_lookahead( h );
- + return -1;
- + }
- +
- + for( int i = 0; i < sizeof(kernelnames)/sizeof(char*); i++ )
- + {
- + *kernels[i] = clCreateKernel( h->opencl.lookahead_program, kernelnames[i], &status );
- + if( status != CL_SUCCESS )
- + {
- + x264_log( h, X264_LOG_ERROR, "Unable to compile kernel '%s' (%d)\n", kernelnames[i], status );
- + x264_opencl_free_lookahead( h );
- + return -1;
- + }
- + }
- +
- + h->opencl.page_locked_buffer = clCreateBuffer( h->opencl.context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, PAGE_LOCKED_BUF_SIZE, NULL, &status );
- + if( status != CL_SUCCESS )
- + {
- + x264_log( h, X264_LOG_ERROR, "Unable to allocate page-locked buffer, error '%d'\n", status );
- + return -1;
- + }
- + h->opencl.page_locked_ptr = clEnqueueMapBuffer( h->opencl.queue, h->opencl.page_locked_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
- + 0, PAGE_LOCKED_BUF_SIZE, 0, NULL, NULL, &status );
- + if( status != CL_SUCCESS )
- + {
- + x264_log( h, X264_LOG_ERROR, "Unable to map page-locked buffer, error '%d'\n", status );
- + return -1;
- + }
- +
- + return 0;
- +}
- +
- +static void x264_opencl_error_notify( const char *errinfo, const void *private_info, size_t cb, void *user_data )
- +{
- + x264_t *h = (x264_t*)user_data;
- + x264_log( h, X264_LOG_ERROR, "OpenCL: %s\n", errinfo );
- +}
- +
- +static int x264_opencl_init( x264_t *h )
- +{
- + cl_int status;
- + cl_uint numPlatforms;
- + int ret = -1;
- +
- + status = clGetPlatformIDs( 0, NULL, &numPlatforms );
- + if( status != CL_SUCCESS || numPlatforms == 0 )
- + return -1;
- +
- + cl_platform_id *platforms = (cl_platform_id*)x264_malloc( numPlatforms * sizeof(cl_platform_id) );
- + status = clGetPlatformIDs( numPlatforms, platforms, NULL );
- + if( status != CL_SUCCESS )
- + {
- + x264_free( platforms );
- + return -1;
- + }
- +
- + /* Select first OpenCL platform that supports a GPU device */
- + for( cl_uint i = 0; i < numPlatforms; ++i )
- + {
- + cl_uint gpu_count;
- + status = clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &gpu_count );
- + if( status == CL_SUCCESS && gpu_count > 0 )
- + {
- + /* take GPU 0 */
- + status = clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, 1, &h->opencl.device, NULL );
- + if( status != CL_SUCCESS )
- + continue;
- +
- + h->opencl.context = clCreateContext( NULL, 1, &h->opencl.device, (void*)x264_opencl_error_notify, (void*)h, &status );
- + if( status != CL_SUCCESS )
- + continue;
- +
- + cl_bool image_support;
- + clGetDeviceInfo( h->opencl.device, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &image_support, NULL );
- + if( !image_support )
- + continue;
- +
- + cl_uint count = 0;
- + cl_image_format imageType[100];
- + clGetSupportedImageFormats( h->opencl.context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, 100, imageType, &count );
- + count = X264_MIN( count, 100 );
- +
- + int b_has_r = 0;
- + int b_has_rgba = 0;
- + for( cl_uint j = 0; j<count; j++ )
- + {
- + if( imageType[j].image_channel_order == CL_R )
- + b_has_r = 1;
- + else if( imageType[j].image_channel_order == CL_RGBA )
- + b_has_rgba = 1;
- + }
- + if( !b_has_r || !b_has_rgba )
- + continue;
- +
- + h->opencl.queue = clCreateCommandQueue( h->opencl.context, h->opencl.device, 0, &status );
- + if( status != CL_SUCCESS )
- + continue;
- +
- + ret = 0;
- + break;
- + }
- + }
- +
- + x264_free( platforms );
- +
- + if( !h->param.psz_clbin_file )
- + h->param.psz_clbin_file = "x264_lookahead.clbin";
- +
- + if( !ret )
- + ret = x264_opencl_init_lookahead( h );
- +
- + return ret;
- +}
- +
- +void x264_opencl_free( x264_t *h )
- +{
- + x264_opencl_free_lookahead( h );
- +
- + if( h->opencl.queue )
- + clReleaseCommandQueue( h->opencl.queue );
- + if( h->opencl.context )
- + clReleaseContext( h->opencl.context );
- +}
- +#endif /* HAVE_OPENCL */
- diff -r 087c10a1fec7 -r 9a682cb0c74f encoder/slicetype-cl.c
- --- /dev/null Thu Jan 01 00:00:00 1970 +0000
- +++ b/encoder/slicetype-cl.c Tue May 15 18:05:00 2012 -0500
- @@ -0,0 +1,655 @@
- +/*****************************************************************************
- + * slicetype-cl.c: OpenCL slicetype decision code (lowres lookahead)
- + *****************************************************************************
- + * Copyright (C) 2012 x264 project
- + *
- + * Authors: Steve Borho <sborho@multicorewareinc.com>
- + *
- + * This program is free software; you can redistribute it and/or modify
- + * it under the terms of the GNU General Public License as published by
- + * the Free Software Foundation; either version 2 of the License, or
- + * (at your option) any later version.
- + *
- + * This program is distributed in the hope that it will be useful,
- + * but WITHOUT ANY WARRANTY; without even the implied warranty of
- + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- + * GNU General Public License for more details.
- + *
- + * You should have received a copy of the GNU General Public License
- + * along with this program; if not, write to the Free Software
- + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- + *
- + * This program is also available under a commercial proprietary license.
- + * For more information, contact us at licensing@x264.com.
- + *****************************************************************************/
- +
- +#define OCLCHECK( method, ... )\
- + status = method( __VA_ARGS__ ); if( status != CL_SUCCESS ) {\
- + x264_log( h, X264_LOG_ERROR, # method " error '%d'\n", status ); return status; }
- +
- +static void x264_opencl_flush( x264_t *h )
- +{
- + clFinish( h->opencl.queue );
- +
- + /* Finish copies from the GPU by copying from the page-locked buffer to
- + * their final destination
- + */
- + for( int i = 0; i < h->opencl.num_copies; i++ )
- + memcpy( h->opencl.copies[i].dest, h->opencl.copies[i].src, h->opencl.copies[i].bytes );
- + h->opencl.num_copies = 0;
- + h->opencl.pl_occupancy = 0;
- +}
- +
- +static void *x264_opencl_alloc_locked( x264_t *h, int bytes )
- +{
- + if( h->opencl.pl_occupancy + bytes >= PAGE_LOCKED_BUF_SIZE )
- + x264_opencl_flush( h );
- + assert( bytes < PAGE_LOCKED_BUF_SIZE );
- + char *ptr = h->opencl.page_locked_ptr + h->opencl.pl_occupancy;
- + h->opencl.pl_occupancy += bytes;
- + return ptr;
- +}
- +
- +static int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda )
- +{
- + if( fenc->b_intra_calculated )
- + return 0;
- + fenc->b_intra_calculated = 1;
- +
- + int luma_length = fenc->i_stride[0] * fenc->i_lines[0];
- +
- +#define CREATEBUF( out, flags, size )\
- + out = clCreateBuffer( h->opencl.context, (flags), (size), NULL, &status );\
- + if( status != CL_SUCCESS ) { x264_log( h, X264_LOG_ERROR, "clCreateBuffer error '%d'\n", status ); return -1; }
- +#define CREATEIMAGE( out, flags, pf, width, height )\
- + out = clCreateImage2D( h->opencl.context, (flags), &pf, width, height, 0, NULL, &status );\
- + if( status != CL_SUCCESS ) { x264_log( h, X264_LOG_ERROR, "clCreateImage2D error '%d'\n", status ); return -1; }
- +
- + int mb_count = h->mb.i_mb_count;
- + cl_int status;
- +
- + if( !h->opencl.lowres_mv_costs )
- + {
- + /* Allocate shared memory buffers */
- + int width = h->mb.i_mb_width * 8 * sizeof(pixel);
- + int height = h->mb.i_mb_height * 8 * sizeof(pixel);
- +
- + cl_image_format pixel_format;
- + pixel_format.image_channel_order = CL_R;
- + pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
- + CREATEIMAGE( h->opencl.weighted_luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );
- +
- + for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
- + {
- + pixel_format.image_channel_order = CL_RGBA;
- + pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
- + CREATEIMAGE( h->opencl.weighted_scaled_images[i], CL_MEM_READ_WRITE, pixel_format, width, height );
- + width >>= 1;
- + height >>= 1;
- + }
- +
- + CREATEBUF( h->opencl.lowres_mv_costs, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
- + CREATEBUF( h->opencl.lowres_costs[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
- + CREATEBUF( h->opencl.lowres_costs[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
- + CREATEBUF( h->opencl.mv_buffers[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
- + CREATEBUF( h->opencl.mv_buffers[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
- + CREATEBUF( h->opencl.mvp_buffer, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
- + CREATEBUF( h->opencl.frame_stats[0], CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
- + CREATEBUF( h->opencl.frame_stats[1], CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
- + CREATEBUF( h->opencl.row_satds[0], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
- + CREATEBUF( h->opencl.row_satds[1], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
- + CREATEBUF( h->opencl.luma_16x16_image[0], CL_MEM_READ_ONLY, luma_length );
- + CREATEBUF( h->opencl.luma_16x16_image[1], CL_MEM_READ_ONLY, luma_length );
- + }
- +
- + if( !fenc->opencl.intra_cost )
- + {
- + /* Allocate per-frame buffers */
- + int width = h->mb.i_mb_width * 8 * sizeof(pixel);
- + int height = h->mb.i_mb_height * 8 * sizeof(pixel);
- +
- + cl_image_format pixel_format;
- + pixel_format.image_channel_order = CL_R;
- + pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
- + CREATEIMAGE( fenc->opencl.luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );
- +
- + for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
- + {
- + pixel_format.image_channel_order = CL_RGBA;
- + pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
- + CREATEIMAGE( fenc->opencl.scaled_image2Ds[i], CL_MEM_READ_WRITE, pixel_format, width, height );
- + width >>= 1;
- + height >>= 1;
- + }
- + CREATEBUF( fenc->opencl.inv_qscale_factor, CL_MEM_READ_ONLY, mb_count * sizeof(int16_t) );
- + CREATEBUF( fenc->opencl.intra_cost, CL_MEM_WRITE_ONLY, mb_count * sizeof(int16_t) );
- + CREATEBUF( fenc->opencl.lowres_mvs0, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
- + CREATEBUF( fenc->opencl.lowres_mvs1, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
- + CREATEBUF( fenc->opencl.lowres_mv_costs0, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
- + CREATEBUF( fenc->opencl.lowres_mv_costs1, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
- + }
- +#undef CREATEBUF
- +
- + /* Copy image to the GPU, downscale to unpadded 8x8, then continue for all scales */
- +
- + char *locked = x264_opencl_alloc_locked( h, luma_length );
- + memcpy( locked, fenc->plane[0], luma_length );
- + OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, h->opencl.luma_16x16_image[h->opencl.last_buf], CL_FALSE, 0, luma_length, locked, 0, NULL, NULL );
- +
- + size_t gdim[2];
- + if( h->param.rc.i_aq_mode && fenc->i_inv_qscale_factor )
- + {
- + int size = h->mb.i_mb_count * sizeof(int16_t);
- + locked = x264_opencl_alloc_locked( h, size );
- + memcpy( locked, fenc->i_inv_qscale_factor, size );
- + OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, fenc->opencl.inv_qscale_factor, CL_FALSE, 0, size, locked, 0, NULL, NULL );
- + }
- + else
- + {
- + /* Fill fenc->opencl.inv_qscale_factor with NOP (256) */
- + cl_uint arg = 0;
- + int16_t value = 256;
- + OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
- + OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(int16_t), &value );
- + gdim[0] = h->mb.i_mb_count;
- + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.memset_kernel, 1, NULL, gdim, NULL, 0, NULL, NULL );
- + }
- +
- + int stride = fenc->i_stride[0];
- + cl_uint arg = 0;
- + OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.luma_16x16_image[h->opencl.last_buf] );
- + OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
- + OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.luma_hpel );
- + OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(int), &stride );
- + gdim[0] = 8 * h->mb.i_mb_width;
- + gdim[1] = 8 * h->mb.i_mb_height;
- + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.downscale_hpel_kernel, 2, NULL, gdim, NULL, 0, NULL, NULL );
- +
- + for( int i = 0; i < NUM_IMAGE_SCALES - 1; i++ )
- + {
- + /* Workaround for AMD Southern Island, alternate kernel instances. No
- + * perf impact to this, so we do it for all GPUs
- + */
- + cl_kernel kern = i & 1 ? h->opencl.downscale_kernel1 : h->opencl.downscale_kernel2;
- +
- + arg = 0;
- + OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i] );
- + OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i+1] );
- + gdim[0] >>= 1;
- + gdim[1] >>= 1;
- + if( gdim[0] < 16 || gdim[1] < 16 )
- + break;
- + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, kern, 2, NULL, gdim, NULL, 0, NULL, NULL );
- + }
- +
- + size_t ldim[2];
- + gdim[0] = ((h->mb.i_mb_width + 31)>>5)<<5;
- + gdim[1] = 8*h->mb.i_mb_height;
- + ldim[0] = 32;
- + ldim[1] = 8;
- + arg = 0;
- + int slow = h->param.analyse.i_subpel_refine > 7;
- + OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
- + OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
- + OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
- + OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &lambda );
- + OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
- + OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &slow );
- + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
- +
- + gdim[0] = 256;
- + gdim[1] = h->mb.i_mb_height;
- + ldim[0] = 256;
- + ldim[1] = 1;
- + arg = 0;
- + OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
- + OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
- + OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
- + OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
- + OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
- + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
- +
- + if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
- + x264_opencl_flush( h );
- +
- + int size = h->mb.i_mb_count * sizeof(int16_t);
- + locked = x264_opencl_alloc_locked( h, size );
- + OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.intra_cost, CL_FALSE, 0, size, locked, 0, NULL, NULL );
- + h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[0][0];
- + h->opencl.copies[h->opencl.num_copies].src = locked;
- + h->opencl.copies[h->opencl.num_copies].bytes = size;
- + h->opencl.num_copies++;
- +
- + size = h->mb.i_mb_height * sizeof(int);
- + locked = x264_opencl_alloc_locked( h, size );
- + OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
- + h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[0][0];
- + h->opencl.copies[h->opencl.num_copies].src = locked;
- + h->opencl.copies[h->opencl.num_copies].bytes = size;
- + h->opencl.num_copies++;
- +
- + size = sizeof(int) * 4;
- + locked = x264_opencl_alloc_locked( h, size );
- + OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
- + h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[0][0];
- + h->opencl.copies[h->opencl.num_copies].src = locked;
- + h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
- + h->opencl.num_copies++;
- + h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[0][0];
- + h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
- + h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
- + h->opencl.num_copies++;
- +
- + h->opencl.last_buf = !h->opencl.last_buf;
- + return 0;
- +}
- +
- +static void x264_optimal_launch_dims( size_t *gdims, size_t *ldims, const cl_kernel kernel, const cl_device_id device )
- +{
- + size_t max_work_group = 256; /* reasonable defaults for OpenCL 1.0 devices, below APIs may fail */
- + size_t preferred_multiple = 64;
- + cl_uint num_cus = 6;
- +
- + clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group, NULL );
- + clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &preferred_multiple, NULL );
- + clGetDeviceInfo( device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &num_cus, NULL );
- +
- + ldims[0] = preferred_multiple;
- + ldims[1] = 8;
- +
- + /* make ldims[1] an even divisor of gdims[1] */
- + while( gdims[1] & (ldims[1] - 1) )
- + {
- + ldims[0] <<= 1;
- + ldims[1] >>= 1;
- + }
- + /* make total ldims fit under the max work-group dimensions for the device */
- + while( ldims[0] * ldims[1] > max_work_group )
- + {
- + if( (ldims[0] <= preferred_multiple) && (ldims[1] > 1) )
- + ldims[1] >>= 1;
- + else
- + ldims[0] >>= 1;
- + }
- +
- + if( ldims[0] > gdims[0] )
- + {
- + /* remove preferred multiples until we're close to gdims[0] */
- + while( gdims[0] + preferred_multiple < ldims[0] )
- + ldims[0] -= preferred_multiple;
- + gdims[0] = ldims[0];
- + }
- + else
- + {
- + /* make gdims an even multiple of ldims */
- + gdims[0] = (gdims[0]+ldims[0]-1)/ldims[0];
- + gdims[0] *= ldims[0];
- + }
- +
- + /* make ldims smaller to spread work across compute units */
- + while( (gdims[0]/ldims[0]) * (gdims[1]/ldims[1]) * 2 <= num_cus )
- + {
- + if( ldims[0] > preferred_multiple )
- + ldims[0] >>= 1;
- + else if( ldims[1] > 1 )
- + ldims[1] >>= 1;
- + else
- + break;
- + }
- + /* for smaller GPUs, try not to abuse their texture cache */
- + if( num_cus == 6 && ldims[0] == 64 && ldims[1] == 4 )
- + ldims[0] = 32;
- +}
- +
- +static int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w )
- +{
- + x264_frame_t *fenc = frames[b];
- + x264_frame_t *fref = frames[ref];
- +
- + cl_mem ref_scaled_images[NUM_IMAGE_SCALES];
- + cl_mem ref_luma_hpel;
- + cl_int status;
- +
- + if( w && (w->i_denom) != 0 )
- + {
- + size_t gdims[2];
- +
- + gdims[0] = 8 * h->mb.i_mb_width;
- + gdims[1] = 8 * h->mb.i_mb_height;
- +
- + /* WeightP: Perform a filter on fref->opencl.scaled_image2Ds[] and fref->opencl.luma_hpel */
- + for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
- + {
- + cl_uint arg = 0;
- + OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &fref->opencl.scaled_image2Ds[i] );
- + OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_scaled_images[i] );
- + OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_offset );
- + OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_scale );
- + OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_denom );
- + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_scaled_images_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );
- +
- + gdims[0] >>= 1;
- + gdims[1] >>= 1;
- + if( gdims[0] < 16 || gdims[1] < 16 )
- + break;
- + }
- +
- + cl_uint arg = 0;
- + gdims[0] = 8 * h->mb.i_mb_width;
- + gdims[1] = 8 * h->mb.i_mb_height;
- +
- + OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &fref->opencl.luma_hpel );
- + OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_luma_hpel );
- + OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_offset );
- + OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_scale );
- + OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_denom );
- + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_hpel_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );
- +
- + /* Use weighted reference planes for motion search */
- + for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
- + ref_scaled_images[i] = h->opencl.weighted_scaled_images[i];
- + ref_luma_hpel = h->opencl.weighted_luma_hpel;
- + }
- + else
- + {
- + /* Use unweighted reference planes for motion search */
- + for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
- + ref_scaled_images[i] = fref->opencl.scaled_image2Ds[i];
- + ref_luma_hpel = fref->opencl.luma_hpel;
- + }
- +
- + const int num_iterations[NUM_IMAGE_SCALES] = { 1, 1, 2, 3 };
- + int b_first_iteration = 1;
- + int b_reverse_references = 1;
- + int A = 1;
- +
- +
- + int mb_per_group = 0;
- + int cost_local_size = 0;
- + int mvc_local_size = 0;
- + int fenc_local_size = 0;
- + int mb_width;
- +
- + size_t gdims[2];
- + size_t ldims[2];
- +
- + /* scale 0 is 8x8 */
- + for( int scale = NUM_IMAGE_SCALES-1; scale >= 0; scale-- )
- + {
- + mb_width = h->mb.i_mb_width >> scale;
- + gdims[0] = mb_width;
- + gdims[1] = h->mb.i_mb_height >> scale;
- + if( gdims[0] < 2 || gdims[1] < 2 )
- + continue;
- + gdims[0] <<= 2;
- + x264_optimal_launch_dims( gdims, ldims, h->opencl.hme_kernel, h->opencl.device );
- +
- + mb_per_group = (ldims[0] >> 2) * ldims[1];
- + cost_local_size = 4 * mb_per_group * sizeof(int16_t);
- + fenc_local_size = 16 * sizeof(cl_uint4) * mb_per_group;
- + mvc_local_size = 4 * mb_per_group * sizeof(int16_t) * 2;
- + int scaled_me_range = h->param.analyse.i_me_range >> scale;
- + int b_shift_index = 1;
- +
- + cl_uint arg = 0;
- + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[scale] );
- + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &ref_scaled_images[scale] );
- + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
- + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
- + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
- + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), (void*)&h->opencl.mvp_buffer );
- + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, cost_local_size, NULL );
- + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, mvc_local_size, NULL );
- + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &mb_width );
- + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &lambda );
- + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scaled_me_range );
- + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scale );
- + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_shift_index );
- + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_first_iteration );
- + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_reverse_references );
- +
- + for( int iter = 0; iter < num_iterations[scale]; iter++ )
- + {
- + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.hme_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
- +
- + b_shift_index = 0;
- + b_first_iteration = 0;
- + if( scale > 2 )
- + b_reverse_references = !b_reverse_references;
- + else
- + b_reverse_references = 0;
- + A = !A;
- + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 2, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
- + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 3, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
- + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 3, sizeof(int), &b_shift_index );
- + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 2, sizeof(int), &b_first_iteration );
- + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 1, sizeof(int), &b_reverse_references );
- + }
- + }
- +
- + int satd_local_size = mb_per_group * sizeof(uint32_t) * 16;
- + cl_uint arg = 0;
- + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
- + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &ref_luma_hpel );
- + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
- + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
- + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, cost_local_size, NULL );
- + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, satd_local_size, NULL );
- + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, mvc_local_size, NULL );
- +
- + if( b_islist1 )
- + {
- + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
- + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
- + }
- + else
- + {
- + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
- + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
- + }
- +
- + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &mb_width );
- + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &lambda );
- + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b );
- + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &ref );
- + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b_islist1 );
- +
- + if( h->opencl.b_device_AMD_SI )
- + {
- + /* workaround for AMD Southern Island, perform meaningless small copy */
- + OCLCHECK( clEnqueueCopyBuffer, h->opencl.queue, h->opencl.mv_buffers[A], h->opencl.mv_buffers[!A], 0, 0, 20, 0, NULL, NULL );
- + }
- +
- + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.subpel_refine_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
- +
- + int mvlen = 2 * sizeof(int16_t) * h->mb.i_mb_count;
- +
- + if( h->opencl.num_copies >= MAX_FINISH_COPIES - 1 )
- + x264_opencl_flush( h );
- +
- + char *locked = x264_opencl_alloc_locked( h, mvlen );
- + h->opencl.copies[h->opencl.num_copies].src = locked;
- + h->opencl.copies[h->opencl.num_copies].bytes = mvlen;
- +
- + if( b_islist1 )
- + {
- + int mvs_offset = mvlen * (ref - b - 1);
- + OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs1, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
- + h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[1][ref - b - 1];
- + }
- + else
- + {
- + int mvs_offset = mvlen * (b - ref - 1);
- + OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs0, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
- + h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[0][b - ref - 1];
- + }
- +
- + h->opencl.num_copies++;
- +
- + return 0;
- +}
- +
- +/* Finalize the lowres cost estimate for frame b against references p0/p1 on
- + * the GPU: run the mode-select kernel (with 4-thread-per-MB BIDIR checks when
- + * b is a B frame), sum costs across each macroblock row, then queue
- + * NON-BLOCKING readbacks of the per-MB costs, row SATDs and frame stats into
- + * pinned host buffers; the copies are completed later (presumably by
- + * x264_opencl_flush() -- confirm against slicetype-cl.c).  Returns 0 on
- + * success; on an OpenCL error the OCLCHECK macro (defined elsewhere)
- + * presumably returns early using `status` -- TODO confirm. */
- +static int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor )
- +{
- + cl_int status;
- + x264_frame_t *fenc = frames[b];
- + x264_frame_t *fref0 = frames[p0];
- + x264_frame_t *fref1 = frames[p1];
- +
- + /* Weighted bipred: derive the list0 weight from the temporal distance
- + * scale factor; otherwise use an even 32/32 split. */
- + int bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor >> 2) : 32;
- +
- + /* Tasks for this kernel:
- + * 1. Select least cost mode (intra, ref0, ref1)
- + * list_used 0, 1, 2, or 3. if B frame, do not allow intra
- + * 2. if B frame, try bidir predictions.
- + * 3. lowres_costs[i_mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT);
- + */
- + size_t gdims[2] = { h->mb.i_mb_width, h->mb.i_mb_height };
- + size_t ldim_bidir[2];
- + size_t *ldims = NULL; /* NULL local dims: driver picks workgroup size (P frames) */
- + int cost_local_size = 4; /* minimal non-zero local-memory sizes for P frames */
- + int satd_local_size = 4;
- + if( b < p1 )
- + {
- + /* For B frames, use 4 threads per MB for BIDIR checks */
- + ldims = ldim_bidir;
- + gdims[0] <<= 2;
- + x264_optimal_launch_dims( gdims, ldims, h->opencl.mode_select_kernel, h->opencl.device );
- + int mb_per_group = (ldims[0] >> 2) * ldims[1];
- + cost_local_size = 4 * mb_per_group * sizeof(int16_t);
- + satd_local_size = 16 * mb_per_group * sizeof(uint32_t);
- + }
- +
- + /* Argument order is positional and must match the mode-select kernel's
- + * parameter list exactly. */
- + cl_uint arg = 0;
- + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
- + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref0->opencl.luma_hpel );
- + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.luma_hpel );
- + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
- + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
- + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.lowres_mvs0 );
- + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
- + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
- + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
- + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
- + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
- + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, cost_local_size, NULL );
- + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, satd_local_size, NULL );
- + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
- + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &bipred_weight );
- + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &dist_scale_factor );
- + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &b );
- + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p0 );
- + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p1 );
- + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &lambda );
- + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.mode_select_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
- +
- + /* Sum costs across rows, atomicAdd down frame */
- + size_t gdim[2] = { 256, h->mb.i_mb_height }; /* one 256-wide workgroup per MB row */
- + size_t ldim[2] = { 256, 1 };
- +
- + arg = 0;
- + OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
- + OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
- + OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
- + OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
- + OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
- + OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->param.i_bframe_bias );
- + OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &b );
- + OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p0 );
- + OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p1 );
- + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_inter_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
- +
- + /* Up to 4 host copies are queued below; flush first if the pending-copy
- + * list is nearly full. */
- + if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
- + x264_opencl_flush( h );
- +
- + /* Copy 1: per-MB lowres costs -> fenc->lowres_costs (async read into a
- + * locked/pinned staging buffer; memcpy'd to dest at flush time). */
- + int size = h->mb.i_mb_count * sizeof(int16_t);
- + char *locked = x264_opencl_alloc_locked( h, size );
- + h->opencl.copies[h->opencl.num_copies].src = locked;
- + h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[b - p0][p1 - b];
- + h->opencl.copies[h->opencl.num_copies].bytes = size;
- + OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.lowres_costs[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
- + h->opencl.num_copies++;
- +
- + /* Copy 2: per-row SATD sums -> fenc->i_row_satds. */
- + size = h->mb.i_mb_height * sizeof(int);
- + locked = x264_opencl_alloc_locked( h, size );
- + h->opencl.copies[h->opencl.num_copies].src = locked;
- + h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[b - p0][p1 - b];
- + h->opencl.copies[h->opencl.num_copies].bytes = size;
- + OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
- + h->opencl.num_copies++;
- +
- + /* Copies 3/4(/5): frame stats read as 4 ints; fan out to cost_est,
- + * cost_est_aq, and (P frames) intra MB count. */
- + size = 4 * sizeof(int);
- + locked = x264_opencl_alloc_locked( h, size );
- + OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
- + /* Flip the buffer index -- lowres_costs/row_satds/frame_stats appear to be
- + * double-buffered so the next frame's kernels can overlap these reads
- + * (NOTE(review): confirm last_buf semantics at the allocation site). */
- + h->opencl.last_buf = !h->opencl.last_buf;
- +
- + h->opencl.copies[h->opencl.num_copies].src = locked;
- + h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[b - p0][p1 - b];
- + h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
- + h->opencl.num_copies++;
- + h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
- + h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[b - p0][p1 - b];
- + h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
- + h->opencl.num_copies++;
- +
- + if( b == p1 ) // P frames only
- + {
- + h->opencl.copies[h->opencl.num_copies].src = locked + 2 * sizeof(int);
- + h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_intra_mbs[b - p0];
- + h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
- + h->opencl.num_copies++;
- + }
- + return 0;
- +}
- +
- +/* Queue the full GPU cost estimation for frames[b] with references p0/p1
- + * (lowres init, forward/backward motion search as needed, then cost
- + * finalization).  Returns 0 if the cost was already computed (or b is an
- + * intra-only case where b == p0 == p1), 1 if new GPU work was enqueued --
- + * callers use the return value to decide whether a flush is required. */
- +static int x264_opencl_precalculate_frame_cost( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int b )
- +{
- + if( (frames[b]->i_cost_est[b-p0][p1-b] >= 0) || (b == p0 && b == p1) )
- + return 0;
- + else
- + {
- + int do_search[2];
- + int dist_scale_factor = 128; /* default: equal temporal distance (128 == 0.5 in 8.8 fixed point) */
- + const x264_weight_t *w = x264_weight_none;
- +
- + // avoid duplicating work
- + frames[b]->i_cost_est[b-p0][p1-b] = 0;
- +
- + /* lowres_mvs[list][ref][0][0] == 0x7FFF appears to be the "not yet
- + * searched" sentinel (it is cleared to 0 below once the search is
- + * queued) -- NOTE(review): confirm against the CPU lookahead path. */
- + do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
- + do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
- + if( do_search[0] )
- + {
- + /* weightp analysis runs on the CPU, hence the x264_emms() before
- + * touching float state. */
- + if( h->param.analyse.i_weighted_pred && b == p1 )
- + {
- + x264_emms();
- + x264_weights_analyse( h, frames[b], frames[p0], 1 );
- + w = frames[b]->weight[0];
- + }
- + frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
- + }
- + if( do_search[1] )
- + frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
- + if( b == p1 )
- + frames[b]->i_intra_mbs[b-p0] = 0;
- + if( p1 != p0 )
- + /* (b-p0)/(p1-p0) in 8.8 fixed point, rounded to nearest. */
- + dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
- +
- + /* NOTE(review): i_cost_est[b-p0][p1-b] was already zeroed above; this
- + * second store is redundant (harmless, but could be dropped). */
- + frames[b]->i_cost_est[b-p0][p1-b] = 0;
- + frames[b]->i_cost_est_aq[b-p0][p1-b] = 0;
- +
- + x264_opencl_lowres_init( h, frames[b], a->i_lambda );
- +
- + if( do_search[0] )
- + {
- + x264_opencl_lowres_init( h, frames[p0], a->i_lambda );
- + x264_opencl_motionsearch( h, frames, b, p0, 0, a->i_lambda, w );
- + }
- + if( do_search[1] )
- + {
- + x264_opencl_lowres_init( h, frames[p1], a->i_lambda );
- + x264_opencl_motionsearch( h, frames, b, p1, 1, a->i_lambda, w );
- + }
- + x264_opencl_finalize_cost( h, a->i_lambda, frames, p0, p1, b, dist_scale_factor );
- + return 1;
- + }
- +}
- diff -r 087c10a1fec7 -r 9a682cb0c74f encoder/slicetype.c
- --- a/encoder/slicetype.c Thu Mar 29 14:14:07 2012 -0700
- +++ b/encoder/slicetype.c Tue May 15 18:05:00 2012 -0500
- @@ -36,6 +36,15 @@
- x264_frame_t **frames, int p0, int p1, int b,
- int b_intra_penalty );
- +static void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead );
- +
- +#if HAVE_OPENCL
- +#ifdef _WIN32
- +#include "windows.h"
- +#endif
- +#include "slicetype-cl.c"
- +#endif
- +
- static void x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a )
- {
- a->i_qp = X264_LOOKAHEAD_QP;
- @@ -748,38 +757,62 @@
- frames[b]->i_cost_est[b-p0][p1-b] = 0;
- frames[b]->i_cost_est_aq[b-p0][p1-b] = 0;
- - /* Lowres lookahead goes backwards because the MVs are used as predictors in the main encode.
- - * This considerably improves MV prediction overall. */
- +#if HAVE_OPENCL
- + if( h->param.b_opencl )
- + {
- + x264_opencl_lowres_init(h, frames[b], a->i_lambda );
- + if( do_search[0] )
- + {
- + x264_opencl_lowres_init( h, frames[p0], a->i_lambda );
- + x264_opencl_motionsearch( h, frames, b, p0, 0, a->i_lambda, w );
- + }
- + if( do_search[1] )
- + {
- + x264_opencl_lowres_init( h, frames[p1], a->i_lambda );
- + x264_opencl_motionsearch( h, frames, b, p1, 1, a->i_lambda, w );
- + }
- + if( b != p0 )
- + x264_opencl_finalize_cost( h, a->i_lambda, frames, p0, p1, b, dist_scale_factor );
- + x264_opencl_flush( h );
- - /* The edge mbs seem to reduce the predictive quality of the
- - * whole frame's score, but are needed for a spatial distribution. */
- - if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size ||
- - h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2 )
- - {
- - for( h->mb.i_mb_y = h->mb.i_mb_height - 1; h->mb.i_mb_y >= 0; h->mb.i_mb_y-- )
- - {
- - row_satd[h->mb.i_mb_y] = 0;
- - if( !frames[b]->b_intra_calculated )
- - row_satd_intra[h->mb.i_mb_y] = 0;
- - for( h->mb.i_mb_x = h->mb.i_mb_width - 1; h->mb.i_mb_x >= 0; h->mb.i_mb_x-- )
- - x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
- - }
- + i_score = frames[b]->i_cost_est[b-p0][p1-b];
- }
- else
- +#endif
- {
- - for( h->mb.i_mb_y = h->mb.i_mb_height - 2; h->mb.i_mb_y >= 1; h->mb.i_mb_y-- )
- - for( h->mb.i_mb_x = h->mb.i_mb_width - 2; h->mb.i_mb_x >= 1; h->mb.i_mb_x-- )
- - x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
- + /* Lowres lookahead goes backwards because the MVs are used as predictors in
- + * the main encode. This considerably improves MV prediction overall. */
- +
- + /* The edge mbs seem to reduce the predictive quality of the
- + * whole frame's score, but are needed for a spatial distribution. */
- + if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size ||
- + h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2 )
- + {
- + for( h->mb.i_mb_y = h->mb.i_mb_height - 1; h->mb.i_mb_y >= 0; h->mb.i_mb_y-- )
- + {
- + row_satd[h->mb.i_mb_y] = 0;
- + if( !frames[b]->b_intra_calculated )
- + row_satd_intra[h->mb.i_mb_y] = 0;
- + for( h->mb.i_mb_x = h->mb.i_mb_width - 1; h->mb.i_mb_x >= 0; h->mb.i_mb_x-- )
- + x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
- + }
- + }
- + else
- + {
- + for( h->mb.i_mb_y = h->mb.i_mb_height - 2; h->mb.i_mb_y >= 1; h->mb.i_mb_y-- )
- + for( h->mb.i_mb_x = h->mb.i_mb_width - 2; h->mb.i_mb_x >= 1; h->mb.i_mb_x-- )
- + x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
- + }
- +
- + i_score = frames[b]->i_cost_est[b-p0][p1-b];
- + if( b != p1 )
- + i_score = (uint64_t)i_score * 100 / (120 + h->param.i_bframe_bias);
- + else
- + frames[b]->b_intra_calculated = 1;
- +
- + frames[b]->i_cost_est[b-p0][p1-b] = i_score;
- + x264_emms();
- }
- -
- - i_score = frames[b]->i_cost_est[b-p0][p1-b];
- - if( b != p1 )
- - i_score = (uint64_t)i_score * 100 / (120 + h->param.i_bframe_bias);
- - else
- - frames[b]->b_intra_calculated = 1;
- -
- - frames[b]->i_cost_est[b-p0][p1-b] = i_score;
- - x264_emms();
- }
- if( b_intra_penalty )
- @@ -1344,6 +1377,61 @@
- return;
- }
- +#if HAVE_OPENCL
- +#ifdef _WIN32
- + int my_pri = THREAD_PRIORITY_NORMAL, ocl_pri = THREAD_PRIORITY_NORMAL;
- +#endif
- +
- + if( h->param.b_opencl )
- + {
- +#ifdef _WIN32
- +#define CL_QUEUE_THREAD_HANDLE_AMD 0x403E
- + HANDLE id = GetCurrentThread();
- + my_pri = GetThreadPriority( id );
- + SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
- + cl_int status = clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
- + if( status == CL_SUCCESS )
- + {
- + ocl_pri = GetThreadPriority( id );
- + SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
- + }
- +#endif
- +
- + int b_work_done = 0;
- +
- + /* precalculate intra and I frames */
- + for( int i = 0; i <= num_frames; i++ )
- + {
- + b_work_done |= x264_opencl_lowres_init( h, frames[i], a.i_lambda );
- + }
- + if( b_work_done && h->param.analyse.i_weighted_pred )
- + {
- + /* weightp estimation requires intra costs to be on the CPU */
- + x264_opencl_flush( h );
- + }
- +
- + if( h->param.i_bframe && h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS )
- + {
- + /* Precalculate motion searches in both directions */
- + for( int i = 0; i < num_frames; i++ )
- + {
- + for (int b = 0; b < h->param.i_bframe; b++)
- + {
- + if (i <= b)
- + {
- + break;
- + }
- +
- + b_work_done |= x264_opencl_precalculate_frame_cost( h, &a, frames, i, i-b-1, i-b-1 );
- + b_work_done |= x264_opencl_precalculate_frame_cost( h, &a, frames, i-b-1, i, i );
- + }
- + }
- + }
- + if( b_work_done )
- + x264_opencl_flush( h );
- + }
- +#endif
- +
- if( h->param.i_bframe )
- {
- if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS )
- @@ -1377,6 +1465,18 @@
- continue;
- }
- +#if HAVE_OPENCL
- + if( h->param.b_opencl )
- + {
- + int b_work_done = 0;
- + b_work_done |= x264_opencl_precalculate_frame_cost(h, &a, frames, i+0, i+2, i+1 );
- + b_work_done |= x264_opencl_precalculate_frame_cost(h, &a, frames, i+0, i+1, i+1 );
- + b_work_done |= x264_opencl_precalculate_frame_cost(h, &a, frames, i+1, i+2, i+2 );
- + if( b_work_done )
- + x264_opencl_flush( h );
- + }
- +#endif
- +
- cost1b1 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+2, i+1, 0 );
- cost1p0 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+1, i+1, 0 );
- cost2p0 = x264_slicetype_frame_cost( h, &a, frames, i+1, i+2, i+2, 0 );
- @@ -1459,6 +1559,19 @@
- /* Restore frametypes for all frames that haven't actually been decided yet. */
- for( int j = reset_start; j <= num_frames; j++ )
- frames[j]->i_type = X264_TYPE_AUTO;
- +
- +#ifdef _WIN32
- +#if HAVE_OPENCL
- + if( h->param.b_opencl )
- + {
- + HANDLE id = GetCurrentThread();
- + SetThreadPriority( id, my_pri );
- + cl_int status = clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
- + if( status == CL_SUCCESS )
- + SetThreadPriority( id, ocl_pri );
- + }
- +#endif
- +#endif
- }
- void x264_slicetype_decide( x264_t *h )
- diff -r 087c10a1fec7 -r 9a682cb0c74f x264.c
- --- a/x264.c Thu Mar 29 14:14:07 2012 -0700
- +++ b/x264.c Tue May 15 18:05:00 2012 -0500
- @@ -805,6 +805,8 @@
- " as opposed to letting them select different algorithms\n" );
- H2( " --asm <integer> Override CPU detection\n" );
- H2( " --no-asm Disable all CPU optimizations\n" );
- + H2( " --no-opencl Disable use of OpenCL\n" );
- + H2( " --clbin-file <string> Specify path of compiled OpenCL kernel cache\n" );
- H2( " --visualize Show MB types overlayed on the encoded video\n" );
- H2( " --dump-yuv <string> Save reconstructed frames\n" );
- H2( " --sps-id <integer> Set SPS and PPS id numbers [%d]\n", defaults->i_sps_id );
- @@ -909,6 +911,8 @@
- { "ref", required_argument, NULL, 'r' },
- { "asm", required_argument, NULL, 0 },
- { "no-asm", no_argument, NULL, 0 },
- + { "no-opencl", no_argument, NULL, 0 },
- + { "clbin-file", required_argument, NULL, 0 },
- { "sar", required_argument, NULL, 0 },
- { "fps", required_argument, NULL, OPT_FPS },
- { "frames", required_argument, NULL, OPT_FRAMES },
- diff -r 087c10a1fec7 -r 9a682cb0c74f x264.h
- --- a/x264.h Thu Mar 29 14:14:07 2012 -0700
- +++ b/x264.h Tue May 15 18:05:00 2012 -0500
- @@ -462,6 +462,9 @@
- int b_fake_interlaced;
- + int b_opencl; /* use OpenCL when available */
- + char *psz_clbin_file; /* compiled OpenCL kernel cache file */
- +
- /* Slicing parameters */
- int i_slice_max_size; /* Max size per slice in bytes; includes estimated NAL overhead. */
- int i_slice_max_mbs; /* Max number of MBs per slice; overrides i_slice_count. */
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement