SHARE
TWEET

First public test patch for x264's OpenCL lookahead

a guest May 18th, 2012 578 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. # HG changeset patch
  2. # User Steve Borho <steve@borho.org>
  3. # Date 1337123100 18000
  4. # Node ID 9a682cb0c74fbf0ef19ef7eaa04b20fc0e27d360
  5. # Parent  087c10a1fec707bc7c53e91f5efeb3567f0bf77f
  6. OpenCL lookahead
  7.  
  8. diff -r 087c10a1fec7 -r 9a682cb0c74f Makefile
  9. --- a/Makefile  Thu Mar 29 14:14:07 2012 -0700
  10. +++ b/Makefile  Tue May 15 18:05:00 2012 -0500
  11. @@ -8,6 +8,8 @@
  12.  vpath %.asm $(SRCPATH)
  13.  vpath %.rc $(SRCPATH)
  14.  
  15. +GENERATED =
  16. +
  17.  all: default
  18.  default:
  19.  
  20. @@ -145,6 +147,36 @@
  21.  endif
  22.  endif
  23.  
  24. +QUOTED_CFLAGS := $(CFLAGS)
  25. +
  26. +ifeq ($(HAVE_OPENCL),yes)
  27. +empty:=
  28. +space:=$(empty) $(empty)
  29. +escaped:=\ $(empty)
  30. +open:=(
  31. +escopen:=\(
  32. +close:=)
  33. +escclose:=\)
  34. +SAFE_INC_DIR := $(subst $(space),$(escaped),$(OPENCL_INC_DIR))
  35. +SAFE_INC_DIR := $(subst $(open),$(escopen),$(SAFE_INC_DIR))
  36. +SAFE_INC_DIR := $(subst $(close),$(escclose),$(SAFE_INC_DIR))
  37. +SAFE_LIB_DIR := $(subst $(space),$(escaped),$(OPENCL_LIB_DIR))
  38. +SAFE_LIB_DIR := $(subst $(open),$(escopen),$(SAFE_LIB_DIR))
  39. +SAFE_LIB_DIR := $(subst $(close),$(escclose),$(SAFE_LIB_DIR))
  40. +# For normal CFLAGS and LDFLAGS, we must escape spaces with a backslash to
  41. +# make gcc happy
  42. +CFLAGS += -I$(SAFE_INC_DIR)
  43. +LDFLAGS += -l$(OPENCL_LIB) -L$(SAFE_LIB_DIR)
  44. +# For the CFLAGS used by the .depend rule, we must add quotes because
  45. +# the rule does an extra level of shell expansions
  46. +QUOTED_CFLAGS += -I"$(OPENCL_INC_DIR)"
  47. +encoder/oclobj.h: common/opencl/x264-cl.h $(wildcard common/opencl/*.cl)
  48. +       echo "static const char x264_opencl_source [] = {" > $@
  49. +       cat $^ | xxd -i >> $@
  50. +       echo ",0x00 };" >> $@
  51. +GENERATED += encoder/oclobj.h
  52. +endif
  53. +
  54.  OBJS   += $(SRCS:%.c=%.o)
  55.  OBJCLI += $(SRCCLI:%.c=%.o)
  56.  OBJSO  += $(SRCSO:%.c=%.o)
  57. @@ -164,17 +196,22 @@
  58.         $(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
  59.  
  60.  ifneq ($(EXE),)
  61. -.PHONY: x264 checkasm
  62. +.PHONY: x264 checkasm test-opencl
  63.  x264: x264$(EXE)
  64.  checkasm: checkasm$(EXE)
  65. +test-opencl: test-opencl$(EXE)
  66.  endif
  67.  
  68. -x264$(EXE): .depend $(OBJCLI) $(CLI_LIBX264)
  69. +x264$(EXE): $(GENERATED) .depend $(OBJCLI) $(CLI_LIBX264)
  70.         $(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS)
  71.  
  72. -checkasm$(EXE): .depend $(OBJCHK) $(LIBX264)
  73. +checkasm$(EXE): $(GENERATED) .depend $(OBJCHK) $(LIBX264)
  74.         $(LD)$@ $(OBJCHK) $(LIBX264) $(LDFLAGS)
  75.  
  76. +OBJOCL = tools/test-opencl.o
  77. +test-opencl$(EXE): .depend ${OBJOCL} $(LIBX264)
  78. +       $(LD)$@ $(OBJOCL) $(LIBX264) $(LDFLAGS)
  79. +
  80.  $(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK): .depend
  81.  
  82.  %.o: %.asm
  83. @@ -193,7 +230,7 @@
  84.  
  85.  .depend: config.mak
  86.         @rm -f .depend
  87. -       @$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(CC) $(CFLAGS) $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.c=%.o) $(DEPMM) 1>> .depend;)
  88. +       @$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(CC) $(QUOTED_CFLAGS) $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.c=%.o) $(DEPMM) 1>> .depend;)
  89.  
  90.  config.mak:
  91.         ./configure
  92. @@ -231,7 +268,7 @@
  93.  
  94.  clean:
  95.         rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(SONAME) *.a *.lib *.exp *.pdb x264 x264.exe .depend TAGS
  96. -       rm -f checkasm checkasm.exe $(OBJCHK)
  97. +       rm -f checkasm checkasm.exe $(OBJCHK) $(GENERATED) x264_lookahead.clbin
  98.         rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock
  99.  
  100.  distclean: clean
  101. diff -r 087c10a1fec7 -r 9a682cb0c74f common/common.c
  102. --- a/common/common.c   Thu Mar 29 14:14:07 2012 -0700
  103. +++ b/common/common.c   Tue May 15 18:05:00 2012 -0500
  104. @@ -170,6 +170,10 @@
  105.      param->b_pic_struct = 0;
  106.      param->b_fake_interlaced = 0;
  107.      param->i_frame_packing = -1;
  108. +#if HAVE_OPENCL
  109. +    param->b_opencl = 1;
  110. +    param->psz_clbin_file = NULL;
  111. +#endif
  112.  }
  113.  
  114.  static int x264_param_apply_preset( x264_param_t *param, const char *preset )
  115. @@ -1013,6 +1017,10 @@
  116.          p->b_fake_interlaced = atobool(value);
  117.      OPT("frame-packing")
  118.          p->i_frame_packing = atoi(value);
  119. +    OPT("opencl")
  120. +        p->b_opencl = atobool( value );
  121. +    OPT("clbin-file")
  122. +        p->psz_clbin_file = strdup( value );
  123.      else
  124.          return X264_PARAM_BAD_NAME;
  125.  #undef OPT
  126. diff -r 087c10a1fec7 -r 9a682cb0c74f common/common.h
  127. --- a/common/common.h   Thu Mar 29 14:14:07 2012 -0700
  128. +++ b/common/common.h   Tue May 15 18:05:00 2012 -0500
  129. @@ -92,6 +92,11 @@
  130.  #include <assert.h>
  131.  #include <limits.h>
  132.  
  133. +#if HAVE_OPENCL
  134. +#    define NUM_IMAGE_SCALES 4
  135. +#    include "CL/cl.h"
  136. +#endif
  137. +
  138.  #if HAVE_INTERLACED
  139.  #   define MB_INTERLACED h->mb.b_interlaced
  140.  #   define SLICE_MBAFF h->sh.b_mbaff
  141. @@ -943,6 +948,65 @@
  142.      struct visualize_t *visualize;
  143.  #endif
  144.      x264_lookahead_t *lookahead;
  145. +
  146. +#if HAVE_OPENCL
  147. +#define MAX_FINISH_COPIES 1024
  148. +#define PAGE_LOCKED_BUF_SIZE 32 * 1024 * 1024
  149. +    struct
  150. +    {
  151. +        cl_context       context;
  152. +        cl_device_id     device;
  153. +        cl_command_queue queue;
  154. +
  155. +        cl_program       lookahead_program;
  156. +        cl_int           last_buf;
  157. +
  158. +        cl_mem           page_locked_buffer;
  159. +        char            *page_locked_ptr;
  160. +        int              pl_occupancy;
  161. +
  162. +        struct
  163. +        {
  164. +            void *src;
  165. +            void *dest;
  166. +            int   bytes;
  167. +        } copies[MAX_FINISH_COPIES];
  168. +        int         num_copies;
  169. +
  170. +        int         b_device_AMD_SI;
  171. +
  172. +        /* downscale lowres luma */
  173. +        cl_kernel   downscale_hpel_kernel;
  174. +        cl_kernel   downscale_kernel1;
  175. +        cl_kernel   downscale_kernel2;
  176. +        cl_mem      luma_16x16_image[2];
  177. +
  178. +        /* weightp filtering */
  179. +        cl_kernel   weightp_hpel_kernel;
  180. +        cl_kernel   weightp_scaled_images_kernel;
  181. +        cl_mem      weighted_scaled_images[NUM_IMAGE_SCALES];
  182. +        cl_mem      weighted_luma_hpel;
  183. +
  184. +        /* intra */
  185. +        cl_kernel   memset_kernel;
  186. +        cl_kernel   intra_kernel;
  187. +        cl_kernel   rowsum_intra_kernel;
  188. +        cl_mem      row_satds[2];
  189. +
  190. +        /* hierarchical motion estimation */
  191. +        cl_kernel   hme_kernel;
  192. +        cl_kernel   subpel_refine_kernel;
  193. +        cl_mem      mv_buffers[2];
  194. +        cl_mem      lowres_mv_costs;
  195. +        cl_mem      mvp_buffer;
  196. +
  197. +        /* bidir */
  198. +        cl_kernel   mode_select_kernel;
  199. +        cl_kernel   rowsum_inter_kernel;
  200. +        cl_mem      lowres_costs[2];
  201. +        cl_mem      frame_stats[2]; /* cost_est, cost_est_aq, intra_mbs */
  202. +    } opencl;
  203. +#endif
  204.  };
  205.  
  206.  // included at the end because it needs x264_t
  207. diff -r 087c10a1fec7 -r 9a682cb0c74f common/frame.c
  208. --- a/common/frame.c    Thu Mar 29 14:14:07 2012 -0700
  209. +++ b/common/frame.c    Tue May 15 18:05:00 2012 -0500
  210. @@ -258,6 +258,23 @@
  211.      return NULL;
  212.  }
  213.  
  214. +#if HAVE_OPENCL
  215. +static void x264_opencl_frame_delete( x264_frame_t *frame )
  216. +{
  217. +#define RELEASEBUF(mem) if( mem ) clReleaseMemObject( mem );
  218. +    for( int j = 0; j < NUM_IMAGE_SCALES; j++ )
  219. +        RELEASEBUF( frame->opencl.scaled_image2Ds[j] );
  220. +    RELEASEBUF( frame->opencl.luma_hpel );
  221. +    RELEASEBUF( frame->opencl.inv_qscale_factor );
  222. +    RELEASEBUF( frame->opencl.intra_cost );
  223. +    RELEASEBUF( frame->opencl.lowres_mvs0 );
  224. +    RELEASEBUF( frame->opencl.lowres_mvs1 );
  225. +    RELEASEBUF( frame->opencl.lowres_mv_costs0 );
  226. +    RELEASEBUF( frame->opencl.lowres_mv_costs1 );
  227. +#undef RELEASEBUF
  228. +}
  229. +#endif
  230. +
  231.  void x264_frame_delete( x264_frame_t *frame )
  232.  {
  233.      /* Duplicate frames are blank copies of real frames (including pointers),
  234. @@ -302,6 +319,9 @@
  235.          x264_free( frame->ref[1] );
  236.          x264_pthread_mutex_destroy( &frame->mutex );
  237.          x264_pthread_cond_destroy( &frame->cv );
  238. +#if HAVE_OPENCL
  239. +        x264_opencl_frame_delete( frame );
  240. +#endif
  241.      }
  242.      x264_free( frame );
  243.  }
  244. diff -r 087c10a1fec7 -r 9a682cb0c74f common/frame.h
  245. --- a/common/frame.h    Thu Mar 29 14:14:07 2012 -0700
  246. +++ b/common/frame.h    Tue May 15 18:05:00 2012 -0500
  247. @@ -170,6 +170,20 @@
  248.      /* user frame properties */
  249.      uint8_t *mb_info;
  250.      void (*mb_info_free)( void* );
  251. +
  252. +#if HAVE_OPENCL
  253. +    struct
  254. +    {
  255. +        cl_mem scaled_image2Ds[NUM_IMAGE_SCALES];
  256. +        cl_mem luma_hpel;
  257. +        cl_mem inv_qscale_factor;
  258. +        cl_mem intra_cost;
  259. +        cl_mem lowres_mvs0;
  260. +        cl_mem lowres_mvs1;
  261. +        cl_mem lowres_mv_costs0;
  262. +        cl_mem lowres_mv_costs1;
  263. +    } opencl;
  264. +#endif
  265.  } x264_frame_t;
  266.  
  267.  /* synchronized frame list */
  268. diff -r 087c10a1fec7 -r 9a682cb0c74f common/opencl/bidir.cl
  269. --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
  270. +++ b/common/opencl/bidir.cl    Tue May 15 18:05:00 2012 -0500
  271. @@ -0,0 +1,254 @@
  272. +
  273. +/* Four threads cooperatively measure 8x8 BIDIR cost with SATD */
  274. +int bidir_satd_8x8_ii_coop4(
  275. +    read_only image2d_t fenc_lowres,  int2 fencpos,
  276. +    read_only image2d_t fref0_planes, int2 qpos0,
  277. +    read_only image2d_t fref1_planes, int2 qpos1,
  278. +    int weight,
  279. +    local sum2_t *tmpp,
  280. +    int idx )
  281. +{
  282. +    volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp;
  283. +    sum2_t b0, b1, b2, b3;
  284. +    sum2_t sum = 0;
  285. +
  286. +    // fencpos is full-pel position of original MB
  287. +    // qpos0 is qpel position within reference frame 0
  288. +    // qpos1 is qpel position within reference frame 1
  289. +
  290. +    int2 fref0Apos = (int2)(qpos0.x>>2, qpos0.y>>2);
  291. +    int hpel0A = ((qpos0.x&2)>>1) + (qpos0.y&2);
  292. +
  293. +    int2 qpos0B = (int2)qpos0 + (int2)(((qpos0.x&1)<<1), ((qpos0.y&1)<<1));
  294. +    int2 fref0Bpos = (int2)(qpos0B.x>>2, qpos0B.y>>2);
  295. +    int hpel0B = ((qpos0B.x&2)>>1) + (qpos0B.y&2);
  296. +
  297. +    int2 fref1Apos = (int2)(qpos1.x>>2, qpos1.y>>2);
  298. +    int hpel1A = ((qpos1.x&2)>>1) + (qpos1.y&2);
  299. +
  300. +    int2 qpos1B = (int2)qpos1 + (int2)(((qpos1.x&1)<<1), ((qpos1.y&1)<<1));
  301. +    int2 fref1Bpos = (int2)(qpos1B.x>>2, qpos1B.y>>2);
  302. +    int hpel1B = ((qpos1B.x&2)>>1) + (qpos1B.y&2);
  303. +
  304. +    uint mask_shift0A = 8 * (3 - hpel0A), mask_shift0B = 8 * (3 - hpel0B);
  305. +    uint mask0A = 0x000000ff << mask_shift0A, mask0B = 0x000000ff << mask_shift0B;
  306. +    uint mask_shift1A = 8 * (3 - hpel1A), mask_shift1B = 8 * (3 - hpel1B);
  307. +    uint mask1A = 0x000000ff << mask_shift1A, mask1B = 0x000000ff << mask_shift1B;
  308. +
  309. +    uint vA, vB;
  310. +    uint enc, ref0, ref1;
  311. +    uint a0, a1;
  312. +    const int weight2 = 64 - weight;
  313. +
  314. +#define READ_BIDIR_DIFF( OUT, X )\
  315. +    enc = read_imageui( fenc_lowres, sampler, fencpos + (int2)(X, idx) ).s0;\
  316. +    vA = (read_imageui( fref0_planes, sampler, fref0Apos + (int2)(X, idx) ).s0 & mask0A) >> mask_shift0A;\
  317. +    vB = (read_imageui( fref0_planes, sampler, fref0Bpos + (int2)(X, idx) ).s0 & mask0B) >> mask_shift0B;\
  318. +    ref0 = rhadd( vA, vB );\
  319. +    vA = (read_imageui( fref1_planes, sampler, fref1Apos + (int2)(X, idx) ).s0 & mask1A) >> mask_shift1A;\
  320. +    vB = (read_imageui( fref1_planes, sampler, fref1Bpos + (int2)(X, idx) ).s0 & mask1B) >> mask_shift1B;\
  321. +    ref1 = rhadd( vA, vB );\
  322. +    OUT = enc - ((ref0 * weight + ref1 * weight2 + (1 << 5)) >> 6);
  323. +
  324. +#define READ_DIFF_EX( OUT, a, b )\
  325. +    READ_BIDIR_DIFF( a0, a );\
  326. +    READ_BIDIR_DIFF( a1, b );\
  327. +    OUT = a0 + (a1<<BITS_PER_SUM);
  328. +
  329. +#define ROW_8x4_SATD( a, b, c )\
  330. +    fencpos.y += a;\
  331. +    fref0Apos.y += b;\
  332. +    fref0Bpos.y += b;\
  333. +    fref1Apos.y += c;\
  334. +    fref1Bpos.y += c;\
  335. +    READ_DIFF_EX( b0, 0, 4 );\
  336. +    READ_DIFF_EX( b1, 1, 5 );\
  337. +    READ_DIFF_EX( b2, 2, 6 );\
  338. +    READ_DIFF_EX( b3, 3, 7 );\
  339. +    HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\
  340. +    HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\
  341. +    sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 );
  342. +
  343. +    ROW_8x4_SATD( 0, 0, 0 );
  344. +    ROW_8x4_SATD( 4, 4, 4 );
  345. +
  346. +#undef READ_BIDIR_DIFF
  347. +#undef READ_DIFF_EX
  348. +#undef ROW_8x4_SATD
  349. +
  350. +    return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;
  351. +}
  352. +
  353. +/*
  354. + * mode selection
  355. + *
  356. + * global launch dimensions:  [mb_width, mb_height]
  357. + *
  358. + * If this is a B frame, launch dims are [mb_width * 4, mb_height]
  359. + */
  360. +kernel void mode_selection(
  361. +    read_only image2d_t   fenc_lowres,
  362. +    read_only image2d_t   fref0_planes,
  363. +    read_only image2d_t   fref1_planes,
  364. +    const global short2  *fenc_lowres_mvs0,
  365. +    const global short2  *fenc_lowres_mvs1,
  366. +    const global short2  *fref1_lowres_mvs0,
  367. +    const global int16_t *fenc_lowres_mv_costs0,
  368. +    const global int16_t *fenc_lowres_mv_costs1,
  369. +    const global uint16_t *fenc_intra_cost,
  370. +    global uint16_t      *lowres_costs,
  371. +    global int           *frame_stats,
  372. +    local int16_t        *cost_local,
  373. +    local sum2_t         *satd_local,
  374. +    int                   mb_width,
  375. +    int                   bipred_weight,
  376. +    int                   dist_scale_factor,
  377. +    int                   b,
  378. +    int                   p0,
  379. +    int                   p1,
  380. +    int                   lambda )
  381. +{
  382. +    int mb_x = get_global_id( 0 );
  383. +    int b_bidir = (b < p1);
  384. +    if( b_bidir )
  385. +    {
  386. +        mb_x >>= 2;
  387. +        if( mb_x >= mb_width )
  388. +            return;
  389. +    }
  390. +    int mb_y = get_global_id( 1 );
  391. +    int mb_height = get_global_size( 1 );
  392. +    int mb_count = mb_width * mb_height;
  393. +    int mb_xy = mb_x + mb_y * mb_width;
  394. +
  395. +    /* Initialize frame stats for next kernel (sum_inter_cost) */
  396. +    if( mb_x < 4 && mb_y == 0 )
  397. +        frame_stats[mb_x] = 0;
  398. +
  399. +    int bcost = COST_MAX;
  400. +    int list_used = 0;
  401. +
  402. +    if( !b_bidir )
  403. +    {
  404. +        int icost = fenc_intra_cost[mb_xy];
  405. +        COPY2_IF_LT( bcost, icost, list_used, 0 );
  406. +    }
  407. +    if( b != p0 )
  408. +    {
  409. +        int mv_cost0 = fenc_lowres_mv_costs0[(b - p0 - 1) * mb_count + mb_xy];
  410. +        COPY2_IF_LT( bcost, mv_cost0, list_used, 1 );
  411. +    }
  412. +    if( b != p1 )
  413. +    {
  414. +        int mv_cost1 = fenc_lowres_mv_costs1[(p1 - b - 1) * mb_count + mb_xy];
  415. +        COPY2_IF_LT( bcost, mv_cost1, list_used, 2 );
  416. +    }
  417. +
  418. +    if( b_bidir )
  419. +    {
  420. +        int2 coord = (int2)(mb_x << 3, mb_y << 3);
  421. +        int mb_i = get_global_id( 0 ) & 3;
  422. +        int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
  423. +        cost_local += mb_in_group * 4;
  424. +        satd_local += mb_in_group * 16;
  425. +
  426. +#define TRY_BIDIR( mv0, mv1, penalty )\
  427. +    {\
  428. +        int2 qpos0 = (int2)((coord.x<<2) + mv0.x, (coord.y<<2) + mv0.y);\
  429. +        int2 qpos1 = (int2)((coord.x<<2) + mv1.x, (coord.y<<2) + mv1.y);\
  430. +        cost_local[mb_i] = bidir_satd_8x8_ii_coop4( fenc_lowres, coord, fref0_planes, qpos0, fref1_planes, qpos1, bipred_weight, satd_local, mb_i );\
  431. +        int cost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];\
  432. +        COPY2_IF_LT( bcost, penalty * lambda + cost, list_used, 3 );\
  433. +    }
  434. +        /* temporal prediction */
  435. +        short2 dmv0, dmv1;
  436. +        short2 mvr = fref1_lowres_mvs0[mb_xy];
  437. +        dmv0.x = (mvr.x * dist_scale_factor + 128) >> 8;
  438. +        dmv0.y = (mvr.y * dist_scale_factor + 128) >> 8;
  439. +        dmv1.x = dmv0.x - mvr.x;
  440. +        dmv1.y = dmv0.y - mvr.y;
  441. +        TRY_BIDIR( dmv0, dmv1, 0 )
  442. +
  443. +        if( as_uint( dmv0 ) || as_uint( dmv1 ) )
  444. +        {
  445. +            /* B-direct prediction */
  446. +            dmv0 = 0; dmv1 = 0;
  447. +            TRY_BIDIR( dmv0, dmv1, 0 );
  448. +        }
  449. +
  450. +        /* L0+L1 prediction */
  451. +        dmv0 = fenc_lowres_mvs0[(b - p0 - 1) * mb_count + mb_xy];
  452. +        dmv1 = fenc_lowres_mvs1[(p1 - b - 1) * mb_count + mb_xy];
  453. +        TRY_BIDIR( dmv0, dmv1, 5 );
  454. +#undef TRY_BIDIR
  455. +    }
  456. +
  457. +    lowres_costs[mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT);
  458. +}
  459. +
  460. +/*
  461. + * parallel sum inter costs
  462. + *
  463. + * global launch dimensions: [256, mb_height]
  464. + */
  465. +kernel void sum_inter_cost(
  466. +    const global uint16_t *fenc_lowres_costs,
  467. +    const global uint16_t *inv_qscale_factor,
  468. +    global int           *fenc_row_satds,
  469. +    global int           *frame_stats,
  470. +    int                   mb_width,
  471. +    int                   bframe_bias,
  472. +    int                   b,
  473. +    int                   p0,
  474. +    int                   p1 )
  475. +{
  476. +    int y = get_global_id( 1 );
  477. +    int mb_height = get_global_size( 1 );
  478. +
  479. +    int row_satds = 0;
  480. +    int cost_est = 0;
  481. +    int cost_est_aq = 0;
  482. +    int intra_mbs = 0;
  483. +
  484. +    for( int x = get_global_id( 0 ); x < mb_width; x += get_global_size( 0 ))
  485. +    {
  486. +        int mb_xy = x + y * mb_width;
  487. +        int cost = fenc_lowres_costs[mb_xy] & LOWRES_COST_MASK;
  488. +        int list = fenc_lowres_costs[mb_xy] >> LOWRES_COST_SHIFT;
  489. +        int b_frame_score_mb = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2;
  490. +
  491. +        if( list == 0 && b_frame_score_mb )
  492. +        {
  493. +            intra_mbs++;
  494. +        }
  495. +
  496. +        int cost_aq = (cost * inv_qscale_factor[mb_xy] + 128) >> 8;
  497. +
  498. +        row_satds += cost_aq;
  499. +
  500. +        if( b_frame_score_mb )
  501. +        {
  502. +            cost_est += cost;
  503. +            cost_est_aq += cost_aq;
  504. +        }
  505. +    }
  506. +
  507. +    local int buffer[256];
  508. +    int x = get_global_id( 0 );
  509. +
  510. +    row_satds   = parallel_sum( row_satds, x, buffer );
  511. +    cost_est    = parallel_sum( cost_est, x, buffer );
  512. +    cost_est_aq = parallel_sum( cost_est_aq, x, buffer );
  513. +    intra_mbs   = parallel_sum( intra_mbs, x, buffer );
  514. +
  515. +    if( b != p1 )
  516. +        cost_est = (int)((float)cost_est * 100 / (120 + bframe_bias));
  517. +
  518. +    if( get_global_id( 0 ) == 0 )
  519. +    {
  520. +        fenc_row_satds[y] = row_satds;
  521. +        atomic_add( frame_stats + 0, cost_est );
  522. +        atomic_add( frame_stats + 1, cost_est_aq );
  523. +        atomic_add( frame_stats + 2, intra_mbs );
  524. +    }
  525. +}
  526. diff -r 087c10a1fec7 -r 9a682cb0c74f common/opencl/downscale.cl
  527. --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
  528. +++ b/common/opencl/downscale.cl        Tue May 15 18:05:00 2012 -0500
  529. @@ -0,0 +1,138 @@
  530. +/*
  531. + * downscale lowres luma: full-res buffer to down scale image, and to packed hpel image
  532. + *
  533. + * --
  534. + *
  535. + * fenc_img is an output image (area of memory referenced through a texture
  536. + * cache). A read of any pixel location (x,y) returns four pixel values:
  537. + *
  538. + * val.s0 = P(x,y)
  539. + * val.s1 = P(x+1,y)
  540. + * val.s2 = P(x+2,y)
  541. + * val.s3 = P(x+3,y)
  542. + *
  543. + * This is a 4x replication of the lowres pixels, a trade-off between memory
  544. + * size and read latency.
  545. + *
  546. + * --
  547. + *
  548. + * hpel_planes is an output image that contains the four HPEL planes used for
  549. + * subpel refinement. A read of any pixel location (x,y) returns four HPEL pixel
  550. + * values:
  551. + *
  552. + * val.s0 = F(x,y) - aka the full pel pixel
  553. + * val.s1 = H(x,y) - horizontally interpolated pixel
  554. + * val.s2 = V(x,y) - vertically interpolated pixel
  555. + * val.s3 = C(x,y) - center (or corner) pixel
  556. + *
  557. + * launch dimensions:  [lowres-width, lowres-height]
  558. + */
  559. +kernel void downscale_hpel(
  560. +    const global pixel *fenc,
  561. +    write_only image2d_t fenc_img,
  562. +    write_only image2d_t hpel_planes,
  563. +    int stride )
  564. +{
  565. +    int x = get_global_id( 0 );
  566. +    int y = get_global_id( 1 );
  567. +    uint4 values;
  568. +
  569. +    fenc += y * stride * 2;
  570. +    const global pixel *src1 = fenc + stride;
  571. +    const global pixel *src2 = (y == get_global_size( 1 )-1) ? src1 : src1 + stride;
  572. +    int2 pos = (int2)(x, y);
  573. +    pixel right, left;
  574. +
  575. +    right = rhadd( fenc[x*2], src1[x*2] );
  576. +    left  = rhadd( fenc[x*2+1], src1[x*2+1] );
  577. +    values.s0 = rhadd( right, left );           // F
  578. +
  579. +    right = rhadd( fenc[2*x+1], src1[2*x+1] );
  580. +    left  = rhadd( fenc[2*x+2], src1[2*x+2] );
  581. +    values.s1 = rhadd( right, left );           // H
  582. +
  583. +    right = rhadd( src1[2*x], src2[2*x] );
  584. +    left  = rhadd( src1[2*x+1], src2[2*x+1] );
  585. +    values.s2 = rhadd( right, left );           // V
  586. +
  587. +    right = rhadd( src1[2*x+1], src2[2*x+1] );
  588. +    left  = rhadd( src1[2*x+2], src2[2*x+2] );
  589. +    values.s3 = rhadd( right, left );           // C
  590. +
  591. +    uint4 val = (uint4)(((values.s0 & 0xff) << 24) + ((values.s1 & 0xff) << 16) + ((values.s2 & 0xff) << 8) + (values.s3 & 0xff), 0, 0, 0);
  592. +    write_imageui( hpel_planes, pos, val );
  593. +
  594. +    x = x+1 < get_global_size( 0 ) ? x+1 : x;
  595. +    right = rhadd( fenc[x*2], src1[x*2] );
  596. +    left  = rhadd( fenc[x*2+1], src1[x*2+1] );
  597. +    values.s1 = rhadd( right, left );
  598. +
  599. +    x = x+1 < get_global_size( 0 ) ? x+1 : x;
  600. +    right = rhadd( fenc[x*2], src1[x*2] );
  601. +    left  = rhadd( fenc[x*2+1], src1[x*2+1] );
  602. +    values.s2 = rhadd( right, left );
  603. +
  604. +    x = x+1 < get_global_size( 0 ) ? x+1 : x;
  605. +    right = rhadd( fenc[x*2], src1[x*2] );
  606. +    left  = rhadd( fenc[x*2+1], src1[x*2+1] );
  607. +    values.s3 = rhadd( right, left );
  608. +
  609. +    write_imageui( fenc_img, pos, values );
  610. +}
  611. +
  612. +/*
  613. + * downscale lowres hierarchical motion search image
  614. + *
  615. + * launch dimensions:  [lowres-width, lowres-height]
  616. + */
  617. +kernel void downscale1( read_only image2d_t higher_res, write_only image2d_t lower_res )
  618. +{
  619. +    int x = get_global_id( 0 );
  620. +    int y = get_global_id( 1 );
  621. +    int2 pos = (int2)(x, y);
  622. +    int gs = get_global_size( 0 );
  623. +    uint4 top, bot, values;
  624. +    top = read_imageui( higher_res, sampler, (int2)( x*2, 2*y ) );
  625. +    bot = read_imageui( higher_res, sampler, (int2)( x*2, 2*y+1 ) );
  626. +    values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) );
  627. +
  628. +    // these select statements appear redundant, and they should be, but tests break when
  629. +    // they are not here.
  630. +    values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) );
  631. +    top = read_imageui( higher_res, sampler, (int2)( x*2+4, 2*y ) );
  632. +    bot = read_imageui( higher_res, sampler, (int2)( x*2+4, 2*y+1 ) );
  633. +    values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) );
  634. +    values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) );
  635. +    write_imageui( lower_res, pos, (uint4)(values) );
  636. +}
  637. +
  638. +/*
  639. + * Second copy of downscale kernel, no differences. This is a (no perf loss) workaround for a
  640. + * scheduling bug in current Tahiti drivers.
  641. + */
  642. +kernel void downscale2( read_only image2d_t higher_res, write_only image2d_t lower_res )
  643. +{
  644. +    int x = get_global_id( 0 );
  645. +    int y = get_global_id( 1 );
  646. +    int2 pos = (int2)(x, y);
  647. +    int gs = get_global_size( 0 );
  648. +    uint4 top, bot, values;
  649. +    top = read_imageui( higher_res, sampler, (int2)( x*2, 2*y ) );
  650. +    bot = read_imageui( higher_res, sampler, (int2)( x*2, 2*y+1 ) );
  651. +    values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) );
  652. +
  653. +    // these select statements appear redundant, and they should be, but tests break when
  654. +    // they are not here.
  655. +    values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) );
  656. +    top = read_imageui( higher_res, sampler, (int2)( x*2+4, 2*y ) );
  657. +    bot = read_imageui( higher_res, sampler, (int2)( x*2+4, 2*y+1 ) );
  658. +    values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) );
  659. +    values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) );
  660. +    write_imageui( lower_res, pos, (uint4)(values) );
  661. +}
  662. +
  663. +/* OpenCL 1.2 finally added a memset command, but we're not targeting 1.2 */
  664. +kernel void memset_int16( global int16_t *buf, int16_t value )
  665. +{
  666. +    buf[get_global_id( 0 )] = value;
  667. +}
  668. diff -r 087c10a1fec7 -r 9a682cb0c74f common/opencl/intra.cl
  669. --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
  670. +++ b/common/opencl/intra.cl    Tue May 15 18:05:00 2012 -0500
  671. @@ -0,0 +1,1053 @@
  672. +
  673. +inline uint32_t pack16to32( uint32_t a, uint32_t b )
  674. +{
  675. +    return a + (b << 16);
  676. +}
  677. +
  678. +inline uint32_t pack8to16( uint32_t a, uint32_t b )
  679. +{
  680. +    return a + (b << 8);
  681. +}
  682. +
  683. +inline int clamp_int( int a, int b, int c )
  684. +{
  685. +    return clamp( a, b, c );
  686. +}
  687. +
  688. +#if VECTORIZE
  689. +int satd_8x4_intra_lr( const local pixel *data, int data_stride, int8 pr0, int8 pr1, int8 pr2, int8 pr3 )
  690. +{
  691. +    int8 a_v, d_v;
  692. +    int2 tmp00, tmp01, tmp02, tmp03, tmp10, tmp11, tmp12, tmp13;
  693. +    int2 tmp20, tmp21, tmp22, tmp23, tmp30, tmp31, tmp32, tmp33;
  694. +
  695. +    d_v = convert_int8( vload8( 0, data ) );
  696. +    a_v.s01234567 = (d_v - pr0).s04152637;
  697. +    HADAMARD4V( tmp00, tmp01, tmp02, tmp03, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
  698. +
  699. +    data += data_stride;
  700. +    d_v = convert_int8( vload8( 0, data ) );
  701. +    a_v.s01234567 = (d_v - pr1).s04152637;
  702. +    HADAMARD4V( tmp10, tmp11, tmp12, tmp13, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
  703. +
  704. +    data += data_stride;
  705. +    d_v = convert_int8( vload8( 0, data ) );
  706. +    a_v.s01234567 = (d_v - pr2).s04152637;
  707. +    HADAMARD4V( tmp20, tmp21, tmp22, tmp23, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
  708. +
  709. +    data += data_stride;
  710. +    d_v = convert_int8( vload8( 0, data ) );
  711. +    a_v.s01234567 = (d_v - pr3).s04152637;
  712. +    HADAMARD4V( tmp30, tmp31, tmp32, tmp33, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
  713. +
  714. +    uint8 sum_v;
  715. +
  716. +    HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp00, tmp10, tmp20, tmp30 );
  717. +    sum_v = abs( a_v );
  718. +
  719. +    HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp01, tmp11, tmp21, tmp31 );
  720. +    sum_v += abs( a_v );
  721. +
  722. +    HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp02, tmp12, tmp22, tmp32 );
  723. +    sum_v += abs( a_v );
  724. +
  725. +    HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp03, tmp13, tmp23, tmp33 );
  726. +    sum_v += abs( a_v );
  727. +
  728. +    uint4 sum2 = sum_v.hi + sum_v.lo;
  729. +    uint2 sum3 = sum2.hi + sum2.lo;
  730. +    return ( sum3.hi + sum3.lo ) >> 1;
  731. +}
  732. +#else
  733. +SATD_C_8x4_Q( satd_8x4_lp, const local, private )
  734. +#endif
  735. +
  736. +/****************************************************************************
  737. + * 8x8 prediction for intra luma block
  738. + ****************************************************************************/
  739. +
  740. +#define F1            rhadd
  741. +#define F2( a, b, c ) ( a+2*b+c+2 )>>2
  742. +
  743. +#if VECTORIZE
/* SATD cost of Intra_8x8 diagonal-down-left prediction (vectorized path).
 * The predicted block is built four rows at a time into int8 registers
 * (pr0..pr3, one vector per row) and scored with satd_8x4_intra_lr, so
 * pred[] is never written to memory.
 * src: 8x8 source block in local memory; top: 16 top/top-right neighbors. */
int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top )
{
    int8 pr0, pr1, pr2, pr3;

    // Upper half of pred[]
    pr0.s0 = ( 2 + top[0] + 2*top[1] + top[2] ) >> 2;
    pr0.s1 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2;
    pr0.s2 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
    pr0.s3 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
    pr0.s4 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
    pr0.s5 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
    pr0.s6 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
    pr0.s7 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;

    pr1.s0 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2;
    pr1.s1 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
    pr1.s2 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
    pr1.s3 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
    pr1.s4 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
    pr1.s5 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
    pr1.s6 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr1.s7 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;

    pr2.s0 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
    pr2.s1 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
    pr2.s2 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
    pr2.s3 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
    pr2.s4 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
    pr2.s5 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr2.s6 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
    pr2.s7 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;

    pr3.s0 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
    pr3.s1 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
    pr3.s2 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
    pr3.s3 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
    pr3.s4 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr3.s5 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
    pr3.s6 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
    pr3.s7 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );

    // Lower half of pred[]
    pr0.s0 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
    pr0.s1 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
    pr0.s2 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
    pr0.s3 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr0.s4 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
    pr0.s5 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
    pr0.s6 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
    pr0.s7 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;

    pr1.s0 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
    pr1.s1 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
    pr1.s2 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr1.s3 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
    pr1.s4 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
    pr1.s5 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
    pr1.s6 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
    pr1.s7 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;

    pr2.s0 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
    pr2.s1 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr2.s2 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
    pr2.s3 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
    pr2.s4 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
    pr2.s5 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
    pr2.s6 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
    pr2.s7 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2;

    pr3.s0 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr3.s1 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
    pr3.s2 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
    pr3.s3 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
    pr3.s4 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
    pr3.s5 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
    pr3.s6 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2;
    // Bottom-right corner: top[15] is replicated past the end of the row.
    pr3.s7 = ( 2 + top[14] + 3*top[15] ) >> 2;

    return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
}
  825. +
/* SATD cost of Intra_8x8 diagonal-down-right prediction (vectorized path).
 * Each filtered neighbor value lies on one down-right diagonal of pred[],
 * so it is assigned to every vector lane on that diagonal at once.
 * left: 8 left neighbors, top: top neighbors, left_top: corner pixel. */
int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
{
    int8 pr0, pr1, pr2, pr3;

    // Upper half of pred[]
    pr3.s0 = F2( left[1], left[2], left[3] );
    pr2.s0 = pr3.s1 = F2( left[0], left[1], left[2] );
    pr1.s0 = pr2.s1 = pr3.s2 = F2( left[1], left[0], left_top );
    pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[0], left_top, top[0] );
    pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left_top, top[0], top[1] );
    pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( top[0], top[1], top[2] );
    pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( top[1], top[2], top[3] );
    pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( top[2], top[3], top[4] );
    pr0.s5 = pr1.s6 = pr2.s7 = F2( top[3], top[4], top[5] );
    pr0.s6 = pr1.s7 = F2( top[4], top[5], top[6] );
    pr0.s7 = F2( top[5], top[6], top[7] );
    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );

    // Lower half of pred[]: same diagonals shifted down four rows.
    pr3.s0 = F2( left[5], left[6], left[7] );
    pr2.s0 = pr3.s1 = F2( left[4], left[5], left[6] );
    pr1.s0 = pr2.s1 = pr3.s2 = F2( left[3], left[4], left[5] );
    pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[2], left[3], left[4] );
    pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left[1], left[2], left[3] );
    pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( left[0], left[1], left[2] );
    pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( left[1], left[0], left_top );
    pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( left[0], left_top, top[0] );
    pr0.s5 = pr1.s6 = pr2.s7 = F2( left_top, top[0], top[1] );
    pr0.s6 = pr1.s7 = F2( top[0], top[1], top[2] );
    pr0.s7 = F2( top[1], top[2], top[3] );
    return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
}
  858. +
  859. +int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
  860. +{
  861. +    int8 pr0, pr1, pr2, pr3;
  862. +
  863. +    // Upper half of pred[]
  864. +    pr2.s0 = F2( left[1], left[0], left_top );
  865. +    pr3.s0 = F2( left[2], left[1], left[0] );
  866. +    pr1.s0 = pr3.s1 = F2( left[0], left_top, top[0] );
  867. +    pr0.s0 = pr2.s1 = F1( left_top, top[0] );
  868. +    pr1.s1 = pr3.s2 = F2( left_top, top[0], top[1] );
  869. +    pr0.s1 = pr2.s2 = F1( top[0], top[1] );
  870. +    pr1.s2 = pr3.s3 = F2( top[0], top[1], top[2] );
  871. +    pr0.s2 = pr2.s3 = F1( top[1], top[2] );
  872. +    pr1.s3 = pr3.s4 = F2( top[1], top[2], top[3] );
  873. +    pr0.s3 = pr2.s4 = F1( top[2], top[3] );
  874. +    pr1.s4 = pr3.s5 = F2( top[2], top[3], top[4] );
  875. +    pr0.s4 = pr2.s5 = F1( top[3], top[4] );
  876. +    pr1.s5 = pr3.s6 = F2( top[3], top[4], top[5] );
  877. +    pr0.s5 = pr2.s6 = F1( top[4], top[5] );
  878. +    pr1.s6 = pr3.s7 = F2( top[4], top[5], top[6] );
  879. +    pr0.s6 = pr2.s7 = F1( top[5], top[6] );
  880. +    pr1.s7 = F2( top[5], top[6], top[7] );
  881. +    pr0.s7 = F1( top[6], top[7] );
  882. +    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
  883. +
  884. +    // Lower half of pred[]
  885. +    pr2.s0 = F2( left[5], left[4], left[3] );
  886. +    pr3.s0 = F2( left[6], left[5], left[4] );
  887. +    pr0.s0 = pr2.s1 = F2( left[3], left[2], left[1] );
  888. +    pr1.s0 = pr3.s1 = F2( left[4], left[3], left[2] );
  889. +    pr0.s1 = pr2.s2 = F2( left[1], left[0], left_top );
  890. +    pr1.s1 = pr3.s2 = F2( left[2], left[1], left[0] );
  891. +    pr1.s2 = pr3.s3 = F2( left[0], left_top, top[0] );
  892. +    pr0.s2 = pr2.s3 = F1( left_top, top[0] );
  893. +    pr1.s3 = pr3.s4 = F2( left_top, top[0], top[1] );
  894. +    pr0.s3 = pr2.s4 = F1( top[0], top[1] );
  895. +    pr1.s4 = pr3.s5 = F2( top[0], top[1], top[2] );
  896. +    pr0.s4 = pr2.s5 = F1( top[1], top[2] );
  897. +    pr1.s5 = pr3.s6 = F2( top[1], top[2], top[3] );
  898. +    pr0.s5 = pr2.s6 = F1( top[2], top[3] );
  899. +    pr1.s6 = pr3.s7 = F2( top[2], top[3], top[4] );
  900. +    pr0.s6 = pr2.s7 = F1( top[3], top[4] );
  901. +    pr1.s7 = F2( top[3], top[4], top[5] );
  902. +    pr0.s7 = F1( top[4], top[5] );
  903. +    return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
  904. +#undef PRED
  905. +}
  906. +
/* SATD cost of Intra_8x8 horizontal-down prediction (vectorized path).
 * Row values walk down the left neighbor column two samples per row,
 * spilling into the top row through the left_top corner. */
int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
{
    int8 pr0, pr1, pr2, pr3;

    // Upper half of pred[]
    pr0.s0 = F1( left_top, left[0] ); pr0.s1 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
    pr0.s2 = F2( top[1], top[0], left_top ); pr0.s3 = F2( top[2], top[1], top[0] );
    pr0.s4 = F2( top[3], top[2], top[1] ); pr0.s5 = F2( top[4], top[3], top[2] );
    pr0.s6 = F2( top[5], top[4], top[3] ); pr0.s7 = F2( top[6], top[5], top[4] );

    pr1.s0 = F1( left[0], left[1] ); pr1.s1 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
    pr1.s2 = F1( left_top, left[0] ); pr1.s3 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
    pr1.s4 = F2( top[1], top[0], left_top ); pr1.s5 = F2( top[2], top[1], top[0] );
    pr1.s6 = F2( top[3], top[2], top[1] ); pr1.s7 = F2( top[4], top[3], top[2] );

    pr2.s0 = F1( left[1], left[2] ); pr2.s1 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
    pr2.s2 = F1( left[0], left[1] ); pr2.s3 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
    pr2.s4 = F1( left_top, left[0] ); pr2.s5 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
    pr2.s6 = F2( top[1], top[0], left_top ); pr2.s7 = F2( top[2], top[1], top[0] );

    pr3.s0 = F1( left[2], left[3] ); pr3.s1 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
    pr3.s2 = F1( left[1], left[2] ); pr3.s3 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
    pr3.s4 = F1( left[0], left[1] ); pr3.s5 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
    pr3.s6 = F1( left_top, left[0] ); pr3.s7 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );

    // Lower half of pred[]: continues down the left column, rows 4..7.
    pr0.s0 = F1( left[3], left[4] ); pr0.s1 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
    pr0.s2 = F1( left[2], left[3] ); pr0.s3 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
    pr0.s4 = F1( left[1], left[2] ); pr0.s5 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
    pr0.s6 = F1( left[0], left[1] ); pr0.s7 = (left_top + 2 * left[0] + left[1] + 2) >> 2;

    pr1.s0 = F1( left[4], left[5] ); pr1.s1 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
    pr1.s2 = F1( left[3], left[4] ); pr1.s3 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
    pr1.s4 = F1( left[2], left[3] ); pr1.s5 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
    pr1.s6 = F1( left[1], left[2] ); pr1.s7 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;

    pr2.s0 = F1( left[5], left[6] ); pr2.s1 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
    pr2.s2 = F1( left[4], left[5] ); pr2.s3 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
    pr2.s4 = F1( left[3], left[4] ); pr2.s5 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
    pr2.s6 = F1( left[2], left[3] ); pr2.s7 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;

    pr3.s0 = F1( left[6], left[7] ); pr3.s1 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
    pr3.s2 = F1( left[5], left[6] ); pr3.s3 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
    pr3.s4 = F1( left[4], left[5] ); pr3.s5 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
    pr3.s6 = F1( left[3], left[4] ); pr3.s7 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
    return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
}
  955. +
/* SATD cost of Intra_8x8 vertical-left prediction (vectorized path).
 * Even rows (pr0/pr2) use the 2-tap F1 filter over top[], odd rows
 * (pr1/pr3) the 3-tap F2; the window shifts right one pixel every two rows. */
int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top )
{
    int8 pr0, pr1, pr2, pr3;

    // Upper half of pred[]
    pr0.s0 = F1( top[0], top[1] );
    pr1.s0 = F2( top[0], top[1], top[2] );
    pr2.s0 = pr0.s1 = F1( top[1], top[2] );
    pr3.s0 = pr1.s1 = F2( top[1], top[2], top[3] );
    pr2.s1 = pr0.s2 = F1( top[2], top[3] );
    pr3.s1 = pr1.s2 = F2( top[2], top[3], top[4] );
    pr2.s2 = pr0.s3 = F1( top[3], top[4] );
    pr3.s2 = pr1.s3 = F2( top[3], top[4], top[5] );
    pr2.s3 = pr0.s4 = F1( top[4], top[5] );
    pr3.s3 = pr1.s4 = F2( top[4], top[5], top[6] );
    pr2.s4 = pr0.s5 = F1( top[5], top[6] );
    pr3.s4 = pr1.s5 = F2( top[5], top[6], top[7] );
    pr2.s5 = pr0.s6 = F1( top[6], top[7] );
    pr3.s5 = pr1.s6 = F2( top[6], top[7], top[8] );
    pr2.s6 = pr0.s7 = F1( top[7], top[8] );
    pr3.s6 = pr1.s7 = F2( top[7], top[8], top[9] );
    pr2.s7 = F1( top[8], top[9] );
    pr3.s7 = F2( top[8], top[9], top[10] );
    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );

    // Lower half of pred[]: same pattern, window advanced two pixels.
    pr0.s0 = F1( top[2], top[3] );
    pr1.s0 = F2( top[2], top[3], top[4] );
    pr2.s0 = pr0.s1 = F1( top[3], top[4] );
    pr3.s0 = pr1.s1 = F2( top[3], top[4], top[5] );
    pr2.s1 = pr0.s2 = F1( top[4], top[5] );
    pr3.s1 = pr1.s2 = F2( top[4], top[5], top[6] );
    pr2.s2 = pr0.s3 = F1( top[5], top[6] );
    pr3.s2 = pr1.s3 = F2( top[5], top[6], top[7] );
    pr2.s3 = pr0.s4 = F1( top[6], top[7] );
    pr3.s3 = pr1.s4 = F2( top[6], top[7], top[8] );
    pr2.s4 = pr0.s5 = F1( top[7], top[8] );
    pr3.s4 = pr1.s5 = F2( top[7], top[8], top[9] );
    pr2.s5 = pr0.s6 = F1( top[8], top[9] );
    pr3.s5 = pr1.s6 = F2( top[8], top[9], top[10] );
    pr2.s6 = pr0.s7 = F1( top[9], top[10] );
    pr3.s6 = pr1.s7 = F2( top[9], top[10], top[11] );
    pr2.s7 = F1( top[10], top[11] );
    pr3.s7 = F2( top[10], top[11], top[12] );
    return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
}
  1002. +
/* SATD cost of Intra_8x8 horizontal-up prediction (vectorized path).
 * Interpolates down the left neighbor column; once the column is exhausted
 * the remaining samples are a flat run of left[7] (bottom-most neighbor). */
int x264_predict_8x8_hu( const local pixel *src, int src_stride, const local pixel *left )
{
    int8 pr0, pr1, pr2, pr3;

    // Upper half of pred[]
    pr0.s0 = F1( left[0], left[1] ); pr0.s1 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
    pr0.s2 = F1( left[1], left[2] ); pr0.s3 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
    pr0.s4 = F1( left[2], left[3] ); pr0.s5 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
    pr0.s6 = F1( left[3], left[4] ); pr0.s7 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;

    pr1.s0 = F1( left[1], left[2] ); pr1.s1 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
    pr1.s2 = F1( left[2], left[3] ); pr1.s3 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
    pr1.s4 = F1( left[3], left[4] ); pr1.s5 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
    pr1.s6 = F1( left[4], left[5] ); pr1.s7 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;

    pr2.s0 = F1( left[2], left[3] ); pr2.s1 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
    pr2.s2 = F1( left[3], left[4] ); pr2.s3 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
    pr2.s4 = F1( left[4], left[5] ); pr2.s5 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
    pr2.s6 = F1( left[5], left[6] ); pr2.s7 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;

    pr3.s0 = F1( left[3], left[4] ); pr3.s1 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
    pr3.s2 = F1( left[4], left[5] ); pr3.s3 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
    pr3.s4 = F1( left[5], left[6] ); pr3.s5 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
    // left[7] is doubled here: the column is padded by replicating its last value.
    pr3.s6 = F1( left[6], left[7] ); pr3.s7 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );

    // Lower half of pred[]: tail of the column, then flat left[7].
    pr0.s0 = F1( left[4], left[5] ); pr0.s1 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
    pr0.s2 = F1( left[5], left[6] ); pr0.s3 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
    pr0.s4 = F1( left[6], left[7] ); pr0.s5 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
    pr0.s6 = left[7]; pr0.s7 = left[7];

    pr1.s0 = F1( left[5], left[6] ); pr1.s1 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
    pr1.s2 = F1( left[6], left[7] ); pr1.s3 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
    pr1.s4 = left[7]; pr1.s5 = left[7];
    pr1.s6 = left[7]; pr1.s7 = left[7];

    pr2.s0 = F1( left[6], left[7] ); pr2.s1 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
    pr2.s2 = left[7]; pr2.s3 = left[7];
    pr2.s4 = left[7]; pr2.s5 = left[7];
    pr2.s6 = left[7]; pr2.s7 = left[7];

    // Last row is entirely the replicated bottom-left neighbor.
    pr3 = (int8)left[7];

    return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
}
  1049. +
  1050. +int x264_predict_8x8c_h( const local pixel *src, int src_stride )
  1051. +{
  1052. +    const local pixel *src_l = src;
  1053. +    int8 pr0, pr1, pr2, pr3;
  1054. +
  1055. +    // Upper half of pred[]
  1056. +    pr0 = (int8)src[-1]; src += src_stride;
  1057. +    pr1 = (int8)src[-1]; src += src_stride;
  1058. +    pr2 = (int8)src[-1]; src += src_stride;
  1059. +    pr3 = (int8)src[-1]; src += src_stride;
  1060. +    int satd = satd_8x4_intra_lr( src_l, src_stride, pr0, pr1, pr2, pr3 );
  1061. +
  1062. +    //Lower half of pred[]
  1063. +    pr0 = (int8)src[-1]; src += src_stride;
  1064. +    pr1 = (int8)src[-1]; src += src_stride;
  1065. +    pr2 = (int8)src[-1]; src += src_stride;
  1066. +    pr3 = (int8)src[-1];
  1067. +    return satd + satd_8x4_intra_lr( src_l + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
  1068. +}
  1069. +
  1070. +int x264_predict_8x8c_v( const local pixel *src, int src_stride )
  1071. +{
  1072. +    int8 pred = convert_int8( vload8( 0, &src[-src_stride] ));
  1073. +    return satd_8x4_intra_lr( src, src_stride, pred, pred, pred, pred ) +
  1074. +           satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pred, pred, pred, pred );
  1075. +}
  1076. +
/* SATD cost of 8x8 chroma plane prediction.
 * Fits a linear ramp pred(x,y) = (a + b*x + c*y) >> 5 from the border
 * pixels: H/V are weighted horizontal/vertical gradients, a anchors the
 * plane, b/c are the per-pixel steps.  i00 tracks the running value of
 * the first column and advances by c per row; pix walks the row by b. */
int x264_predict_8x8c_p( const local pixel *src, int src_stride )
{
    int H = 0, V = 0;
    for( int i = 0; i < 4; i++ )
    {
        H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]);
        V += (i + 1) * (src[-1 + (i + 4) * src_stride] - src[-1 + (2 - i) * src_stride]);
    }

    int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]);
    int b = (17 * H + 16) >> 5;
    int c = (17 * V + 16) >> 5;
    int i00 = a - 3 * b - 3 * c + 16;

    // Upper half of pred[]
    int pix = i00;
    int8 pr0, pr1, pr2, pr3;
    pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;

    pix = i00;
    pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;

    pix = i00;
    pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;

    pix = i00;
    pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );

    // Lower half of pred[]: i00 carries over from the fourth row above.
    pix = i00;
    pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;

    pix = i00;
    pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;

    pix = i00;
    pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;

    pix = i00;
    pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
    return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
}
  1176. +
  1177. +int x264_predict_8x8c_dc( const local pixel *src, int src_stride )
  1178. +{
  1179. +    int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
  1180. +    for( int i = 0; i < 4; i++ )
  1181. +    {
  1182. +        s0 += src[i - src_stride];
  1183. +        s1 += src[i + 4 - src_stride];
  1184. +        s2 += src[-1 + i * src_stride];
  1185. +        s3 += src[-1 + (i+4)*src_stride];
  1186. +    }
  1187. +
  1188. +    // Upper half of pred[]
  1189. +    int8 dc0;
  1190. +    dc0.lo = (int4)( (s0 + s2 + 4) >> 3 );
  1191. +    dc0.hi = (int4)( (s1 + 2) >> 2 );
  1192. +    int satd = satd_8x4_intra_lr( src, src_stride, dc0, dc0, dc0, dc0 );
  1193. +
  1194. +    // Lower half of pred[]
  1195. +    dc0.lo = (int4)( (s3 + 2) >> 2 );
  1196. +    dc0.hi = (int4)( (s1 + s3 + 4) >> 3 );
  1197. +    return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, dc0, dc0, dc0, dc0 );
  1198. +}
  1199. +
  1200. +#else  /* not vectorized: private is cheap registers are scarce */
  1201. +
/* Scalar fallback: SATD cost of Intra_8x8 diagonal-down-left prediction.
 * Materializes each 8x4 half of pred[] in a small private array and scores
 * it with satd_8x4_lp. */
int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top )
{
    private pixel pred[32];
    int x_plus_y;

    // Upper half of pred[]
    for( int y = 0; y < 4; y++ )
    {
        for( int x = 0; x < 8; x++ )
        {
            // Clamp keeps top[x_plus_y + 2] within the 16 available neighbors.
            x_plus_y = clamp_int( x + y, 0, 13 );
            pred[x + y*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2;
        }
    }
    int satd = satd_8x4_lp( src, src_stride, pred, 8 );
    //Lower half of pred[]
    for( int y = 4; y < 8; y++ )
    {
        for( int x = 0; x < 8; x++ )
        {
            x_plus_y = clamp_int( x + y, 0, 13 );
            pred[x + ( y - 4 )*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2;
        }
    }
    // Corner sample (x+y == 14) was clamped above; overwrite it with the
    // correct replicated-edge value.
    pred[31] = ( 2 + top[14] + 3*top[15] ) >> 2;
    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
    return satd;
}
  1230. +
/* Scalar fallback: SATD cost of Intra_8x8 diagonal-down-right prediction.
 * Each filtered value fills one down-right diagonal of the 8x4 half. */
int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
{
    private pixel pred[32];
#define PRED( x, y ) pred[(x) + (y)*8]
    // Upper half of pred[]
    PRED( 0, 3 ) = F2( left[1], left[2], left[3] );
    PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[0], left[1], left[2] );
    PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[1], left[0], left_top );
    PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] );
    PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( left_top, top[0], top[1] );
    PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] );
    PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] );
    PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] );
    PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( top[3], top[4], top[5] );
    PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[4], top[5], top[6] );
    PRED( 7, 0 ) = F2( top[5], top[6], top[7] );
    int satd = satd_8x4_lp( src, src_stride, pred, 8 );

    // Lower half of pred[]: same diagonals shifted down four rows.
    PRED( 0, 3 ) = F2( left[5], left[6], left[7] );
    PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[4], left[5], left[6] );
    PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[3], left[4], left[5] );
    PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[2], left[3], left[4] );
    PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( left[1], left[2], left[3] );
    PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( left[0], left[1], left[2] );
    PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( left[1], left[0], left_top );
    PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( left[0], left_top, top[0] );
    PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( left_top, top[0], top[1] );
    PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[0], top[1], top[2] );
    PRED( 7, 0 ) = F2( top[1], top[2], top[3] );
    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
    return satd;
#undef PRED
}
  1265. +
/* Scalar fallback: SATD cost of Intra_8x8 vertical-right prediction.
 * Even rows use the 2-tap F1 filter, odd rows the 3-tap F2; values repeat
 * two rows down, one column right. */
int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
{
    private pixel pred[32];
#define PRED( x, y ) pred[(x) + (y)*8]
    // Upper half of pred[]
    PRED( 0, 2 ) = F2( left[1], left[0], left_top );
    PRED( 0, 3 ) = F2( left[2], left[1], left[0] );
    PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[0], left_top, top[0] );
    PRED( 0, 0 ) = PRED( 1, 2 ) = F1( left_top, top[0] );
    PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left_top, top[0], top[1] );
    PRED( 1, 0 ) = PRED( 2, 2 ) = F1( top[0], top[1] );
    PRED( 2, 1 ) = PRED( 3, 3 ) = F2( top[0], top[1], top[2] );
    PRED( 2, 0 ) = PRED( 3, 2 ) = F1( top[1], top[2] );
    PRED( 3, 1 ) = PRED( 4, 3 ) = F2( top[1], top[2], top[3] );
    PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[2], top[3] );
    PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[2], top[3], top[4] );
    PRED( 4, 0 ) = PRED( 5, 2 ) = F1( top[3], top[4] );
    PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[3], top[4], top[5] );
    PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[4], top[5] );
    PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[4], top[5], top[6] );
    PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[5], top[6] );
    PRED( 7, 1 ) = F2( top[5], top[6], top[7] );
    PRED( 7, 0 ) = F1( top[6], top[7] );
    int satd = satd_8x4_lp( src, src_stride, pred, 8 );

    //Lower half of pred[]
    PRED( 0, 2 ) = F2( left[5], left[4], left[3] );
    PRED( 0, 3 ) = F2( left[6], left[5], left[4] );
    PRED( 0, 0 ) = PRED( 1, 2 ) = F2( left[3], left[2], left[1] );
    PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[4], left[3], left[2] );
    PRED( 1, 0 ) = PRED( 2, 2 ) = F2( left[1], left[0], left_top );
    PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left[2], left[1], left[0] );
    PRED( 2, 1 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] );
    PRED( 2, 0 ) = PRED( 3, 2 ) = F1( left_top, top[0] );
    PRED( 3, 1 ) = PRED( 4, 3 ) = F2( left_top, top[0], top[1] );
    PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[0], top[1] );
    PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] );
    PRED( 4, 0 ) = PRED( 5, 2 ) = F1( top[1], top[2] );
    PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] );
    PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[2], top[3] );
    PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] );
    PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[3], top[4] );
    PRED( 7, 1 ) = F2( top[3], top[4], top[5] );
    PRED( 7, 0 ) = F1( top[4], top[5] );
    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
    return satd;
#undef PRED
}
  1314. +
/* Scalar fallback: SATD cost of Intra_8x8 horizontal-down prediction.
 * Precomputes the 11 distinct filtered pixel pairs packed two-per-int
 * (pack8to16), then assembles each row with 4-byte vector stores; each
 * row reuses its predecessor shifted right by one pair. */
int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
{
    private pixel pred[32];
    int satd;
    int p1 =  pack8to16( (F1( left[6], left[7] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) );
    int p2 =  pack8to16( (F1( left[5], left[6] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) );
    int p3 =  pack8to16( (F1( left[4], left[5] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) );
    int p4 =  pack8to16( (F1( left[3], left[4] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) );
    int p5 =  pack8to16( (F1( left[2], left[3] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) );
    int p6 =  pack8to16( (F1( left[1], left[2] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) );
    int p7 =  pack8to16( (F1( left[0], left[1] )), ((left_top + 2 * left[0] + left[1] + 2) >> 2) );
    int p8 =  pack8to16( (F1( left_top, left[0] )), ((left[0] + 2 * left_top + top[0] + 2) >> 2) );
    int p9 =  pack8to16( (F2( top[1], top[0], left_top )), (F2( top[2], top[1], top[0] )) );
    int p10 =  pack8to16( (F2( top[3], top[2], top[1] )), (F2( top[4], top[3], top[2] )) );
    int p11 =  pack8to16( (F2( top[5], top[4], top[3] )), (F2( top[6], top[5], top[4] )) );
    // Upper half of pred[]
    vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[0 + 0 * 8] );
    vstore4( as_uchar4( pack16to32( p10, p11 ) ), 0, &pred[4 + 0 * 8] );
    vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[0 + 1 * 8] );
    vstore4( as_uchar4( pack16to32( p9, p10 ) ), 0, &pred[4 + 1 * 8] );
    vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[0 + 2 * 8] );
    vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[4 + 2 * 8] );
    vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[0 + 3 * 8] );
    vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[4 + 3 * 8] );
    satd = satd_8x4_lp( src, src_stride, pred, 8 );
    // Lower half of pred[]
    vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[0 + 0 * 8] );
    vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[4 + 0 * 8] );
    vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[0 + 1 * 8] );
    vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[4 + 1 * 8] );
    vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[0 + 2 * 8] );
    vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[4 + 2 * 8] );
    vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[0 + 3 * 8] );
    vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[4 + 3 * 8] );
    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
    return satd;
}
  1352. +
/* Intra 8x8 vertical-left (VL) prediction + SATD cost.
 *
 * Fills pred[] directly from the top-neighbor array using the F1 (2-tap
 * average) and F2 (3-tap smooth) filters; each predicted pixel is shared
 * between two PRED() positions along the VL diagonal.  pred[] holds 4 rows,
 * so the block is scored as two satd_8x4_lp halves.
 *
 * src        - source pixels in the local cache
 * src_stride - stride of that cache
 * top        - filtered top-neighbor pixels (indices up to top[12] are read)
 * returns the 8x8 SATD for this prediction mode
 */
int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top )
{
    private pixel pred[32];
    int satd;
#define PRED( x, y ) pred[(x) + (y)*8]
    // Upper half of pred[] (rows 0-3)
    PRED( 0, 0 ) = F1( top[0], top[1] );
    PRED( 0, 1 ) = F2( top[0], top[1], top[2] );
    PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[1], top[2] );
    PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[1], top[2], top[3] );
    PRED( 1, 2 ) = PRED( 2, 0 ) = F1( top[2], top[3] );
    PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[2], top[3], top[4] );
    PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[3], top[4] );
    PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[3], top[4], top[5] );
    PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[4], top[5] );
    PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[4], top[5], top[6] );
    PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[5], top[6] );
    PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[5], top[6], top[7] );
    PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[6], top[7] );
    PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[6], top[7], top[8] );
    PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[7], top[8] );
    PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[7], top[8], top[9] );
    PRED( 7, 2 ) = F1( top[8], top[9] );
    PRED( 7, 3 ) = F2( top[8], top[9], top[10] );
    satd = satd_8x4_lp( src, src_stride, pred, 8 );
    // Lower half of pred[] (rows 4-7): same pattern shifted two top-pixels
    // further along the diagonal
    PRED( 0, 0 ) = F1( top[2], top[3] );
    PRED( 0, 1 ) = F2( top[2], top[3], top[4] );
    PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[3], top[4] );
    PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[3], top[4], top[5] );
    PRED( 1, 2 ) = PRED( 2, 0 ) = F1( top[4], top[5] );
    PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[4], top[5], top[6] );
    PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[5], top[6] );
    PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[5], top[6], top[7] );
    PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[6], top[7] );
    PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[6], top[7], top[8] );
    PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[7], top[8] );
    PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[7], top[8], top[9] );
    PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[8], top[9] );
    PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[8], top[9], top[10] );
    PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[9], top[10] );
    PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[9], top[10], top[11] );
    PRED( 7, 2 ) = F1( top[10], top[11] );
    PRED( 7, 3 ) = F2( top[10], top[11], top[12] );
    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
    return satd;
#undef PRED
}
  1401. +
/* Intra 8x8 horizontal-up (HU) prediction + SATD cost.
 *
 * p1..p7 each pack a 2-tap (F1) and a 3-tap smoothed pixel from successive
 * left-edge positions; p8 replicates left[7] (the HU mode saturates to the
 * bottom-left pixel).  pack16to32 + vstore4 writes four predicted pixels per
 * call; pred[] holds 4 rows, so SATD is accumulated in two 8x4 passes.
 *
 * src        - source pixels in the local cache
 * src_stride - stride of that cache
 * left       - filtered left-neighbor pixels
 * returns the 8x8 SATD for this prediction mode
 */
int x264_predict_8x8_hu( const local pixel *src, int src_stride, const local pixel *left )
{
    private pixel pred[32];
    int satd;
    int p1 = pack8to16( (F1( left[0], left[1] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) );
    int p2 = pack8to16( (F1( left[1], left[2] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) );
    int p3 = pack8to16( (F1( left[2], left[3] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) );
    int p4 = pack8to16( (F1( left[3], left[4] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) );
    int p5 = pack8to16( (F1( left[4], left[5] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) );
    int p6 = pack8to16( (F1( left[5], left[6] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) );
    int p7 = pack8to16( (F1( left[6], left[7] )), ((left[6] + 2 * left[7] + left[7] + 2) >> 2) );
    int p8 = pack8to16( left[7], left[7] );
    // Upper half of pred[] (rows 0-3): each row starts one pair lower
    vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[( 0 ) + ( 0 ) * 8] );
    vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] );
    vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] );
    vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] );
    vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] );
    vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] );
    vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] );
    vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] );
    satd = satd_8x4_lp( src, src_stride, pred, 8 );
    // Lower half of pred[] (rows 4-7): tail rows flatten out to p8
    vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 0 ) + ( 0 ) * 8] );
    vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] );
    vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] );
    vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] );
    vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] );
    vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] );
    vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] );
    vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] );
    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
    return satd;
}
  1436. +
  1437. +int x264_predict_8x8c_h( const local pixel *src, int src_stride )
  1438. +{
  1439. +    private pixel pred[32];
  1440. +    const local pixel *src_l = src;
  1441. +
  1442. +    // Upper half of pred[]
  1443. +    vstore8( (uchar8)(src[-1]), 0, pred ); src += src_stride;
  1444. +    vstore8( (uchar8)(src[-1]), 1, pred ); src += src_stride;
  1445. +    vstore8( (uchar8)(src[-1]), 2, pred ); src += src_stride;
  1446. +    vstore8( (uchar8)(src[-1]), 3, pred ); src += src_stride;
  1447. +    int satd = satd_8x4_lp( src_l, src_stride, pred, 8 );
  1448. +
  1449. +    // Lower half of pred[]
  1450. +    vstore8( (uchar8)(src[-1]), 0, pred ); src += src_stride;
  1451. +    vstore8( (uchar8)(src[-1]), 1, pred ); src += src_stride;
  1452. +    vstore8( (uchar8)(src[-1]), 2, pred ); src += src_stride;
  1453. +    vstore8( (uchar8)(src[-1]), 3, pred );
  1454. +    return satd + satd_8x4_lp( src_l + ( src_stride << 2 ), src_stride, pred, 8 );
  1455. +}
  1456. +
  1457. +int x264_predict_8x8c_v( const local pixel *src, int src_stride )
  1458. +{
  1459. +    private pixel pred[32];
  1460. +    uchar16 v16;
  1461. +    v16.lo = vload8( 0, &src[-src_stride] );
  1462. +    v16.hi = vload8( 0, &src[-src_stride] );
  1463. +
  1464. +    vstore16( v16, 0, pred );
  1465. +    vstore16( v16, 1, pred );
  1466. +
  1467. +    return satd_8x4_lp( src, src_stride, pred, 8 ) +
  1468. +           satd_8x4_lp( src + (src_stride << 2), src_stride, pred, 8 );
  1469. +}
  1470. +
  1471. +int x264_predict_8x8c_p( const local pixel *src, int src_stride )
  1472. +{
  1473. +    int H = 0, V = 0;
  1474. +    private pixel pred[32];
  1475. +    int satd;
  1476. +
  1477. +    for( int i = 0; i < 4; i++ )
  1478. +    {
  1479. +        H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]);
  1480. +        V += (i + 1) * (src[-1 + (i + 4) * src_stride] - src[-1 + (2 - i) * src_stride]);
  1481. +    }
  1482. +
  1483. +    int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]);
  1484. +    int b = (17 * H + 16) >> 5;
  1485. +    int c = (17 * V + 16) >> 5;
  1486. +    int i00 = a - 3 * b - 3 * c + 16;
  1487. +
  1488. +    // Upper half of pred[]
  1489. +    for( int y = 0; y < 4; y++ )
  1490. +    {
  1491. +        int pix = i00;
  1492. +        for( int x = 0; x < 8; x++ )
  1493. +        {
  1494. +            pred[x + y*8] = x264_clip_pixel( pix >> 5 );
  1495. +            pix += b;
  1496. +        }
  1497. +        i00 += c;
  1498. +    }
  1499. +    satd = satd_8x4_lp( src, src_stride, pred, 8 );
  1500. +    // Lower half of pred[]
  1501. +    for( int y = 0; y < 4; y++ )
  1502. +    {
  1503. +        int pix = i00;
  1504. +        for( int x = 0; x < 8; x++ )
  1505. +        {
  1506. +            pred[x + y*8] = x264_clip_pixel( pix >> 5 );
  1507. +            pix += b;
  1508. +        }
  1509. +        i00 += c;
  1510. +    }
  1511. +    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
  1512. +    return satd;
  1513. +}
  1514. +
/* Intra chroma 8x8 DC prediction + SATD cost.
 *
 * s0/s1 sum the 4 top neighbors of the left/right 4x4 quadrants; s2/s3 sum
 * the 4 left neighbors of the upper/lower quadrants.  Each 4x4 quadrant
 * gets its own DC value:
 *   top-left:  (s0+s2+4)>>3   top-right:    (s1+2)>>2
 *   bot-left:  (s3+2)>>2      bot-right:    (s1+s3+4)>>3
 * pred[] holds 4 rows, so the block is scored in two 8x4 SATD passes.
 */
int x264_predict_8x8c_dc( const local pixel *src, int src_stride )
{
    private pixel pred[32];
    int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
    for( int i = 0; i < 4; i++ )
    {
        s0 += src[i - src_stride];
        s1 += src[i + 4 - src_stride];
        s2 += src[-1 + i * src_stride];
        s3 += src[-1 + (i+4)*src_stride];
    }

    // Upper half of pred[]: lo = top-left quadrant DC, hi = top-right
    uchar8 dc0;
    dc0.lo = (uchar4)( (s0 + s2 + 4) >> 3 );
    dc0.hi = (uchar4)( (s1 + 2) >> 2 );
    vstore8( dc0, 0, pred );
    vstore8( dc0, 1, pred );
    vstore8( dc0, 2, pred );
    vstore8( dc0, 3, pred );
    int satd = satd_8x4_lp( src, src_stride, pred, 8 );

    // Lower half of pred[]: lo = bottom-left quadrant DC, hi = bottom-right
    dc0.lo = (uchar4)( (s3 + 2) >> 2 );
    dc0.hi = (uchar4)( (s1 + s3 + 4) >> 3 );
    vstore8( dc0, 0, pred );
    vstore8( dc0, 1, pred );
    vstore8( dc0, 2, pred );
    vstore8( dc0, 3, pred );
    return satd + satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
}
  1546. +#endif
  1547. +
/* Per-macroblock intra SATD cost estimation (8x8 lowres MBs).
 *
 * Each work-group covers a horizontal strip of up to 32 macroblocks
 * (inferred from the 32-wide local arrays and the lx/ly usage — confirm
 * against the host launch dimensions).  lx selects the MB within the group,
 * ly selects the prediction mode.  The group cooperatively:
 *   1. loads a 9-row pixel strip (plus one left border column) into cache[],
 *   2. builds filtered top/left/corner neighbor arrays,
 *   3. has each (lx, ly) thread score one intra prediction mode,
 *      reducing per-MB minima into cost_buf[] via atom_min.
 *
 * fenc            - encoded frame as an image
 * fenc_intra_cost - output: per-MB minimum intra cost (+5*lambda mode bits)
 * frame_stats     - output: first two entries zeroed here for later kernels
 * lambda          - rate cost multiplier
 * mb_width        - frame width in MBs (grid may be padded wider)
 * slow            - nonzero: also try the two least-likely 8x8 modes
 */
kernel void mb_intra_cost_satd_8x8(
    read_only image2d_t  fenc,
    global uint16_t     *fenc_intra_cost,
    global int          *frame_stats,
    int                  lambda,
    int                  mb_width,
    int                  slow )
{
/* cache[] geometry: 9 rows of CACHE_STRIDE pixels; column 0 holds the left
 * border, BLOCK_OFFSET ( = CACHE_STRIDE + 1 ) is pixel (0,0) of MB 0. */
#define CACHE_STRIDE (265)
#define BLOCK_OFFSET (266)
    local pixel cache[2385];
    local int cost_buf[32];
    local pixel top[32 * 16];
    local pixel left[32 * 8];
    local pixel left_top[32];

    int lx = (int)get_local_id( 0 );
    int ly = (int)get_local_id( 1 );
    int gx = (int)get_global_id( 0 );
    int gy = (int)get_global_id( 1 );
    int gidx = (int)get_group_id( 0 );
    int gidy = (int)get_group_id( 1 );
    int linear_id = (int)(ly * get_local_size( 0 ) + lx);
    int satd = COST_MAX;
    int basex = (gidx << 8);       // 256 pixels (32 MBs) per group horizontally
    int basey = (gidy << 3) - 1;   // -1: strip starts one row above the MBs

    // Cooperative load of the pixel strip, 4 pixels per image read
    for( int y = 0; y < 9; y++ ) // TODO: consider unrolling
    {
        for( int x = 4*linear_id; x < (33 * 8); x += 1024 )
        {
            uint4 data = read_imageui( fenc, sampler, (int2)(x + basex, y + basey) );
            cache[y * CACHE_STRIDE + 1 + x] = data.s0;
            cache[y * CACHE_STRIDE + 1 + x + 1] = data.s1;
            cache[y * CACHE_STRIDE + 1 + x + 2] = data.s2;
            cache[y * CACHE_STRIDE + 1 + x + 3] = data.s3;
        }
    }
    // First 9 threads fetch the left border column (x = basex - 1)
    if( linear_id < 9 )
        cache[linear_id * CACHE_STRIDE] = read_imageui( fenc, sampler, (int2)( basex - 1, linear_id + basey) ).s0;

    barrier( CLK_LOCAL_MEM_FENCE );

    // Cooperatively build the top edge for the macroblock using smoothing function
    int j = ly;
    top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j - 1, -1, 15 )] +
                       2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j, 0, 15 )] +
                       cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j + 1, 0, 15 )] + 2 ) >> 2;
    j += 8; // each thread fills two of the 16 top-edge entries
    top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j - 1, -1, 15 )] +
                       2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j, 0, 15 )] +
                       cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j + 1, 0, 15 )] + 2 ) >> 2;
    // Cooperatively build the left edge for the macroblock using smoothing function
    left[lx*8 + ly] = ( cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*(ly - 1)] +
                        2*cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*ly] +
                        cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*clamp_int((ly + 1), 0, 7 )] + 2 ) >> 2;
    // One left_top per macroblock; also reset its running cost minimum
    if( 0 == ly )
    {
        left_top[lx] = ( cache[BLOCK_OFFSET + 8*lx - 1] + 2*cache[BLOCK_OFFSET + 8*lx - 1 - CACHE_STRIDE] +
                         cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE] + 2 ) >> 2;
        cost_buf[lx] = COST_MAX;
    }
    barrier( CLK_LOCAL_MEM_FENCE );

    // each warp/wavefront generates a different prediction type; no divergence
    switch( ly )
    {
        case 0:
            satd = x264_predict_8x8c_h( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
            break;
        case 1:
            satd = x264_predict_8x8c_v( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
            break;
        case 2:
            satd = x264_predict_8x8c_dc( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
            break;
        case 3:
            satd = x264_predict_8x8c_p( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
            break;
        case 4:
            satd = x264_predict_8x8_ddr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
            break;
        case 5:
            satd = x264_predict_8x8_vr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
            break;
        case 6:
            satd = x264_predict_8x8_hd( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
            break;
        case 7:
            satd = x264_predict_8x8_hu( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &left[8*lx] );
            break;
        default:
            break;
    }
    // Reduce the per-mode costs for this MB to a single minimum
    atom_min( &cost_buf[lx], satd );
    if( slow )
    {
        // Do the remaining two (least likely) prediction modes
        switch( ly )
        {
            case 0: // DDL
                satd = x264_predict_8x8_ddl( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] );
                atom_min( &cost_buf[lx], satd );
                break;
            case 1: // VL
                satd = x264_predict_8x8_vl( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] );
                atom_min( &cost_buf[lx], satd );
                break;
            default:
                break;
        }
    }
    barrier( CLK_LOCAL_MEM_FENCE );

    // One thread per (in-frame) MB writes the final cost; the +5*lambda
    // term accounts for intra mode signaling bits
    if( (0 == ly) && (gx < mb_width) )
    {
        fenc_intra_cost[gidy * mb_width + gx] = cost_buf[lx]+ 5*lambda;
    }

    // Auxiliary work for one of the following kernels: zero the two
    // frame_stats accumulators that sum_intra_cost atomically adds into
    if( gx < 2 && gy == 0 )
        frame_stats[gx] = 0;
#undef CACHE_STRIDE
#undef BLOCK_OFFSET
}
  1674. +
  1675. +
  1676. +
  1677. +/*
  1678. + * parallel sum intra costs
  1679. + *
  1680. + * global launch dimensions: [256, mb_height]
  1681. + */
  1682. +kernel void sum_intra_cost(
  1683. +    const global uint16_t *fenc_intra_cost,
  1684. +    const global uint16_t *inv_qscale_factor,
  1685. +    global int           *fenc_row_satds,
  1686. +    global int           *frame_stats,
  1687. +    int                   mb_width )
  1688. +{
  1689. +    int y = get_global_id( 1 );
  1690. +    int mb_height = get_global_size( 1 );
  1691. +
  1692. +    int row_satds = 0;
  1693. +    int cost_est = 0;
  1694. +    int cost_est_aq = 0;
  1695. +
  1696. +    for( int x = get_global_id( 0 ); x < mb_width; x += get_global_size( 0 ))
  1697. +    {
  1698. +        int mb_xy = x + y * mb_width;
  1699. +        int cost = fenc_intra_cost[mb_xy];
  1700. +        int cost_aq = (cost * inv_qscale_factor[mb_xy] + 128) >> 8;
  1701. +        int b_frame_score_mb = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2;
  1702. +
  1703. +        row_satds += cost_aq;
  1704. +        if( b_frame_score_mb )
  1705. +        {
  1706. +            cost_est += cost;
  1707. +            cost_est_aq += cost_aq;
  1708. +        }
  1709. +    }
  1710. +
  1711. +    local int buffer[256];
  1712. +    int x = get_global_id( 0 );
  1713. +
  1714. +    row_satds   = parallel_sum( row_satds, x, buffer );
  1715. +    cost_est    = parallel_sum( cost_est, x, buffer );
  1716. +    cost_est_aq = parallel_sum( cost_est_aq, x, buffer );
  1717. +
  1718. +    if( get_global_id( 0 ) == 0 )
  1719. +    {
  1720. +        fenc_row_satds[y] = row_satds;
  1721. +        atomic_add( frame_stats + 0, cost_est );
  1722. +        atomic_add( frame_stats + 1, cost_est_aq );
  1723. +    }
  1724. +}
  1725. diff -r 087c10a1fec7 -r 9a682cb0c74f common/opencl/motionsearch.cl
  1726. --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
  1727. +++ b/common/opencl/motionsearch.cl     Tue May 15 18:05:00 2012 -0500
  1728. @@ -0,0 +1,276 @@
  1729. +
  1730. +
  1731. +
  1732. +int find_downscale_mb_xy( int x, int y, int mb_width, int mb_height )
  1733. +{
  1734. +    /* edge macroblocks might not have a direct descendant, use nearest */
  1735. +    x = (x == mb_width-1)  ? (x - (mb_width&1)) >> 1 : x >> 1;
  1736. +    y = (y == mb_height-1) ? (y - (mb_height&1)) >> 1 : y >> 1;
  1737. +    return (mb_width>>1) * y + x;
  1738. +}
  1739. +
  1740. +/* Four threads calculate an 8x8 SAD.  Each does two rows */
  1741. +int sad_8x8_ii_coop4( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos, int idx, local int16_t *costs )
  1742. +{
  1743. +    frefpos.y += idx << 1;
  1744. +    fencpos.y += idx << 1;
  1745. +    int cost = 0;
  1746. +    if( frefpos.x < 0 )
  1747. +    {
  1748. +        /* slow path when MV goes past right edge */
  1749. +        for( int y = 0; y < 2; y++ )
  1750. +        {
  1751. +            for( int x = 0; x < 8; x++ )
  1752. +            {
  1753. +                pixel enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0;
  1754. +                pixel ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0;
  1755. +                cost += abs_diff( enc, ref );
  1756. +            }
  1757. +        }
  1758. +    }
  1759. +    else
  1760. +    {
  1761. +        uint4 enc, ref, costs = 0;
  1762. +        enc = read_imageui( fenc, sampler, fencpos );
  1763. +        ref = read_imageui( fref, sampler, frefpos );
  1764. +        costs += abs_diff( enc, ref );
  1765. +        enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 0) );
  1766. +        ref = read_imageui( fref, sampler, frefpos + (int2)(4, 0) );
  1767. +        costs += abs_diff( enc, ref );
  1768. +        enc = read_imageui( fenc, sampler, fencpos + (int2)(0, 1) );
  1769. +        ref = read_imageui( fref, sampler, frefpos + (int2)(0, 1) );
  1770. +        costs += abs_diff( enc, ref );
  1771. +        enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 1) );
  1772. +        ref = read_imageui( fref, sampler, frefpos + (int2)(4, 1) );
  1773. +        costs += abs_diff( enc, ref );
  1774. +        cost = costs.s0 + costs.s1 + costs.s2 + costs.s3;
  1775. +    }
  1776. +    costs[idx] = cost;
  1777. +    return costs[0] + costs[1] + costs[2] + costs[3];
  1778. +}
  1779. +
  1780. +/* One thread performs 8x8 SAD */
  1781. +int sad_8x8_ii( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos )
  1782. +{
  1783. +    if( frefpos.x < 0 )
  1784. +    {
  1785. +        /* slow path when MV goes past right edge */
  1786. +        int cost = 0;
  1787. +        for( int y = 0; y < 8; y++ )
  1788. +        {
  1789. +            for( int x = 0; x < 8; x++ )
  1790. +            {
  1791. +                uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0;
  1792. +                uint ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0;
  1793. +                cost += abs_diff( enc, ref );
  1794. +            }
  1795. +        }
  1796. +        return cost;
  1797. +    }
  1798. +    else
  1799. +    {
  1800. +        uint4 enc, ref, cost = 0;
  1801. +        for( int y = 0; y < 8; y++ )
  1802. +        {
  1803. +            for( int x = 0; x < 8; x += 4 )
  1804. +            {
  1805. +                enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) );
  1806. +                ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) );
  1807. +                cost += abs_diff( enc, ref );
  1808. +            }
  1809. +        }
  1810. +        return cost.s0 + cost.s1 + cost.s2 + cost.s3;
  1811. +    }
  1812. +}
  1813. +
  1814. +
  1815. +/*
  1816. + * hierarchical motion estimation
  1817. + *
  1818. + * Each kernel launch is a single iteration
  1819. + *
  1820. + * MB per work group is determined by lclx / 4 * lcly
  1821. + *
  1822. + * global launch dimensions:  [mb_width * 4, mb_height]
  1823. + */
/* One iteration of hierarchical (pyramid) motion estimation.
 *
 * Four consecutive threads (mb_i = 0..3) cooperate on one macroblock:
 * they share cost_local/mvc_local slices and jointly run a diamond search
 * around the motion vector predictor (MVP).  The MVP is the median of
 * neighbor MV candidates from the previous iteration; when b_shift_index
 * is set, candidates come from the coarser (downscaled) pyramid level.
 *
 * NOTE(review): the cost_local exchange inside the diamond loop has no
 * barrier — presumably the 4 cooperating threads share a SIMD group;
 * confirm against host-side work-group configuration.
 *
 * fenc/fref             - encode/reference frames as images
 * in_mvs/out_mvs        - MV fields from the previous / for this iteration
 * out_mv_costs          - per-MB best cost output (thread mb_i == 0 writes)
 * mvp_buffer            - per-MB MVP from the previous call; early-out when unchanged
 * cost_local/mvc_local  - local scratch, 4 entries per MB in the group
 * scale                 - pyramid level shift applied to candidate MVs
 * b_shift_index         - nonzero when reading MVs from the coarser level
 * b_first_iteration     - nonzero on the first pass (MVP forced to zero)
 * b_reverse_references  - alternates neighbor direction per iteration
 */
kernel void hierarchical_motion(
    read_only image2d_t  fenc,
    read_only image2d_t  fref,
    const global short2 *in_mvs,
    global short2       *out_mvs,
    global int16_t      *out_mv_costs,
    global short2       *mvp_buffer,
    local int16_t       *cost_local,
    local short2        *mvc_local,
    int                  mb_width,
    int                  lambda,
    int                  me_range,
    int                  scale,
    int                  b_shift_index,
    int                  b_first_iteration,
    int                  b_reverse_references )
{
    // 4 threads per MB: global id / 4 selects the MB, id & 3 the lane
    int mb_x = get_global_id( 0 ) >> 2;
    if( mb_x >= mb_width )
        return;
    int mb_height = get_global_size( 1 );
    int mb_i = get_global_id( 0 ) & 3;
    int mb_y = get_global_id( 1 );
    int mb_xy = mb_y * mb_width + mb_x;
    const int mb_size = 8;
    int2 coord;
    coord.x = mb_x * mb_size;
    coord.y = mb_y * mb_size;

    // Slice the shared local arrays: 4 entries per MB within the group
    const int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
    cost_local += 4 * mb_in_group;

    int i_mvc = 0;
    mvc_local += 4 * mb_in_group;
    mvc_local[mb_i] = 0;  // zero-init so median of < 3 candidates is defined
    short2 mvp;

    if( b_first_iteration )
    {
        mvp.x = 0;
        mvp.y = 0;
    }
    else
    {
/* Append one neighbor MV candidate (at MB offset DX,DY) to mvc_local,
 * rescaled to this pyramid level. */
#define MVC( DX, DY )\
    {\
        int px = mb_x + DX;\
        int py = mb_y + DY;\
        if( b_shift_index )\
            mvc_local[i_mvc] = in_mvs[find_downscale_mb_xy( px, py, mb_width, mb_height )];\
        else\
            mvc_local[i_mvc] = in_mvs[mb_width * py + px];\
        mvc_local[i_mvc].x >>= scale;\
        mvc_local[i_mvc].y >>= scale;\
        i_mvc++;\
    }
        /* Find MVP from median of MVCs */
        if( b_reverse_references )
        {
            /* odd iterations: derive MVP from down and right */
            if( mb_x < mb_width - 1 )
                MVC( 1, 0 );
            if( mb_y < mb_height - 1 )
            {
                MVC( 0, 1 );
                if( mb_x > b_shift_index )
                    MVC( -1, 1 );
                if( mb_x < mb_width - 1 )
                    MVC( 1, 1 );
            }
        }
        else
        {
            /* even iterations: derive MVP from up and left */
            if( mb_x > 0 )
                MVC( -1, 0 );
            if( mb_y > 0 )
            {
                MVC( 0, -1 );
                if( mb_x < mb_width - 1 )
                    MVC( 1, -1 );
                if( mb_x > b_shift_index )
                    MVC( -1, -1 );
            }
        }
        if( i_mvc <= 1 )
        {
            mvp = mvc_local[0];
        }
        else
            mvp = x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] );
#undef MVC
    }
    // New MVP == old MVP: the search would repeat last iteration's work,
    // so copy the input MV to the output MV and exit early.
    if( (!b_shift_index) && (mvp.x == mvp_buffer[mb_xy].x) && (mvp.y == mvp_buffer[mb_xy].y) )
    {
        out_mvs[mb_xy] = in_mvs[mb_xy];
        return;
    }
    mvp_buffer[mb_xy] = mvp;
    // MV search window: frame bounds plus 4 pixels of padding on each side
    short2 mv_min;
    short2 mv_max;
    mv_min.x = -mb_size * mb_x - 4;
    mv_max.x = mb_size * (mb_width - mb_x - 1) + 4;
    mv_min.y = -mb_size * mb_y - 4;
    mv_max.y = mb_size * (mb_height - mb_y - 1) + 4;

    short2 bestmv;
    bestmv.x = x264_clip3( mvp.x, mv_min.x, mv_max.x );
    bestmv.y = x264_clip3( mvp.y, mv_min.y, mv_max.y );

    int2 refcrd;
    refcrd.x = coord.x + bestmv.x;
    refcrd.y = coord.y + bestmv.y;
    /* measure cost at bestmv (4 threads cooperate on the SAD) */
    int bcost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +
                lambda * calc_mv_cost( abs_diff( bestmv.x, mvp.x ) << (2 + scale), abs_diff( bestmv.y, mvp.y ) << (2 + scale) );

    // Diamond search: each of the 4 threads probes one diamond offset per
    // step; the packed (cost<<2 | mb_i) trick picks the winner and its lane
    do
    {
        /* measure costs at offsets from bestmv */
        refcrd.x = coord.x + bestmv.x + dia_offs[mb_i].x;
        refcrd.y = coord.y + bestmv.y + dia_offs[mb_i].y;
        short2 trymv = bestmv + dia_offs[mb_i];
        int cost = sad_8x8_ii( fenc, coord, fref, refcrd ) +
                   lambda * calc_mv_cost( abs_diff( trymv.x, mvp.x ) << (2 + scale), abs_diff( trymv.y, mvp.y ) << (2 + scale) );

        cost_local[mb_i] = (cost<<2) | mb_i;
        cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) );

        if( (cost >> 2) >= bcost )
            break;  // no diamond point improved: search has converged

        bestmv += dia_offs[cost&3];
        bcost = cost>>2;

        // stop if the search reached the edge of the MV window
        if( bestmv.x >= mv_max.x || bestmv.x <= mv_min.x || bestmv.y >= mv_max.y || bestmv.y <= mv_min.y )
            break;
    }
    while( --me_range > 0 );

    short2 trymv;

/* Probe (X,Y) with MV-cost weight L, unless it is within 1 of the MVP
 * (already covered by the diamond search); keep it if it beats bcost. */
#define COST_MV_NO_PAD( X, Y, L )\
    trymv.x = x264_clip3( X, mv_min.x, mv_max.x );\
    trymv.y = x264_clip3( Y, mv_min.y, mv_max.y );\
    if( abs_diff( mvp.x, trymv.x ) > 1 || abs_diff( mvp.y, trymv.y ) > 1 ) {\
        int2 refcrd = coord; refcrd.x += trymv.x; refcrd.y += trymv.y;\
        int cost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +\
                   L * calc_mv_cost( abs_diff( trymv.x, mvp.x ) << (2 + scale), abs_diff( trymv.y, mvp.y ) << (2 + scale) );\
        if( cost < bcost ) { bcost = cost; bestmv = trymv; } }

    // Always check the zero MV (no rate penalty: L = 0)
    COST_MV_NO_PAD( 0, 0, 0 );

    if( !b_first_iteration )
    {
        /* try cost at previous iteration's MV, if MVP was too far away */
        short2 prevmv;
        if( b_shift_index )
            prevmv = in_mvs[find_downscale_mb_xy( mb_x, mb_y, mb_width, mb_height )];
        else
            prevmv = in_mvs[mb_xy];
        prevmv.x >>= scale;
        prevmv.y >>= scale;
        COST_MV_NO_PAD( prevmv.x, prevmv.y, lambda );
    }

    for( int i = 0; i < i_mvc; i++ )
    {
        /* try cost at each candidate MV, if MVP was too far away */
        COST_MV_NO_PAD( mvc_local[i].x, mvc_local[i].y, lambda );
    }

    // Lane 0 publishes the result, rescaled back to full-level units
    if( mb_i == 0 )
    {
        bestmv.x <<= scale;
        bestmv.y <<= scale;
        out_mvs[mb_xy] = bestmv;
        out_mv_costs[mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK );
    }
}
  2005. diff -r 087c10a1fec7 -r 9a682cb0c74f common/opencl/subpel.cl
  2006. --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
  2007. +++ b/common/opencl/subpel.cl   Tue May 15 18:05:00 2012 -0500
  2008. @@ -0,0 +1,283 @@
  2009. +
  2010. +/* One thread performs 8x8 SAD */
  2011. +int sad_8x8_ii_hpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos )
  2012. +{
  2013. +    int2 frefpos = (int2)(qpos.x >> 2, qpos.y >> 2);
  2014. +    int hpel_idx = ((qpos.x & 2) >> 1) + (qpos.y & 2);
  2015. +
  2016. +    uint mask_shift = 8 * (3 - hpel_idx);
  2017. +    uint mask = 0x000000ff << mask_shift;
  2018. +
  2019. +    uint4 cost4 = 0;
  2020. +
  2021. +    for( int y = 0; y < 8; y++ )
  2022. +    {
  2023. +        uint4 enc, val4;
  2024. +        enc = read_imageui( fenc, sampler, fencpos + (int2)(0, y));
  2025. +        val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(0, y)).s0 & mask) >> mask_shift;
  2026. +        val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(1, y)).s0 & mask) >> mask_shift;
  2027. +        val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(2, y)).s0 & mask) >> mask_shift;
  2028. +        val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(3, y)).s0 & mask) >> mask_shift;
  2029. +        cost4 += abs_diff( enc, val4 );
  2030. +
  2031. +        enc = read_imageui( fenc, sampler, fencpos + (int2)(4, y));
  2032. +        val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(4, y)).s0 & mask) >> mask_shift;
  2033. +        val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(5, y)).s0 & mask) >> mask_shift;
  2034. +        val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(6, y)).s0 & mask) >> mask_shift;
  2035. +        val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(7, y)).s0 & mask) >> mask_shift;
  2036. +        cost4 += abs_diff( enc, val4 );
  2037. +    }
  2038. +
  2039. +    return cost4.s0 + cost4.s1 + cost4.s2 + cost4.s3;
  2040. +}
  2041. +
  2042. +/* One thread measures 8x8 SAD cost at a QPEL offset into an HPEL plane */
  2043. +int sad_8x8_ii_qpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos )
  2044. +{
  2045. +    int2 frefApos = qpos >> 2;
  2046. +    int hpelA = ((qpos.x & 2) >> 1) + (qpos.y & 2);
  2047. +
  2048. +    int2 qposB = qpos + ((qpos & 1) << 1);
  2049. +    int2 frefBpos = qposB >> 2;
  2050. +    int hpelB = ((qposB.x & 2) >> 1) + (qposB.y & 2);
  2051. +
  2052. +    uint mask_shift0 = 8 * (3 - hpelA), mask_shift1 = 8 * (3 - hpelB);
  2053. +    uint mask0 = 0x000000ff << mask_shift0, mask1 = 0x000000ff << mask_shift1;
  2054. +
  2055. +    int cost = 0;
  2056. +
  2057. +    for( int y = 0; y < 8; y++ )
  2058. +    {
  2059. +        for( int x = 0; x < 8; x++ )
  2060. +        {
  2061. +            uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y)).s0;
  2062. +            uint vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(x, y)).s0 & mask0) >> mask_shift0;
  2063. +            uint vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(x, y)).s0 & mask1) >> mask_shift1;
  2064. +            uint ref = rhadd( vA, vB );
  2065. +            cost += abs_diff( enc, ref );
  2066. +        }
  2067. +    }
  2068. +
  2069. +    return cost;
  2070. +}
  2071. +
/* Four thread measures 8x8 SATD cost at a QPEL offset into an HPEL plane
 *
 * Each thread collects 1/4 of the rows of diffs and processes one quarter of
 * the transforms
 *
 * idx (0..3) is the thread's lane within its 4-thread cooperative group; tmpp
 * points to 16 local sum2_t entries shared by the group.  Two 16-bit residual
 * sums are packed per sum2_t so each 8-wide row fits in four values.
 *
 * NOTE(review): the two HADAMARD4 passes write and then read the shared tmp[]
 * array with no barrier between them — this assumes the four cooperating
 * threads execute in lockstep (same wavefront/warp); confirm for targets
 * where that does not hold. */
int satd_8x8_ii_qpel_coop4(
    read_only image2d_t fenc, int2 fencpos,
    read_only image2d_t fref_planes, int2 qpos,
    local sum2_t *tmpp,
    int idx )
{
    /* view the flat local buffer as a 4x4 transpose scratchpad */
    volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp;
    sum2_t b0, b1, b2, b3;

    // fencpos is full-pel position of original MB
    // qpos is qpel position within reference frame
    int2 frefApos = (int2)(qpos.x>>2, qpos.y>>2);
    int hpelA = ((qpos.x&2)>>1) + (qpos.y&2);

    /* second half-pel sample rounded toward the quarter-pel fraction */
    int2 qposB = (int2)qpos + (int2)(((qpos.x&1)<<1), ((qpos.y&1)<<1));
    int2 frefBpos = (int2)(qposB.x>>2, qposB.y>>2);
    int hpelB = ((qposB.x&2)>>1) + (qposB.y&2);

    /* byte-lane masks selecting the F/H/V/C plane within each packed texel */
    uint mask_shift0 = 8 * (3 - hpelA), mask_shift1 = 8 * (3 - hpelB);
    uint mask0 = 0x000000ff << mask_shift0, mask1 = 0x000000ff << mask_shift1;

    uint vA, vB;
    uint a0, a1;
    uint enc;
    sum2_t sum = 0;


/* residual at column X of this thread's row: encode pixel minus the rounded
 * average of the two half-pel reference samples */
#define READ_DIFF( OUT, X )\
    enc = read_imageui( fenc, sampler, fencpos + (int2)(X, idx) ).s0;\
    vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(X, idx) ).s0 & mask0) >> mask_shift0;\
    vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(X, idx) ).s0 & mask1) >> mask_shift1;\
    OUT = enc - rhadd( vA, vB );

/* pack two column residuals (a and b) into one sum2_t, 16 bits each */
#define READ_DIFF_EX( OUT, a, b )\
    {\
        READ_DIFF( a0, a );\
        READ_DIFF( a1, b );\
        OUT = a0 + (a1<<BITS_PER_SUM);\
    }
/* one 8x4 SATD: horizontal Hadamard into shared tmp, transposed vertical
 * Hadamard back out, absolute sums accumulated into sum */
#define ROW_8x4_SATD( a, b )\
    {\
        fencpos.y += a;\
        frefApos.y += b;\
        frefBpos.y += b;\
        READ_DIFF_EX( b0, 0, 4 );\
        READ_DIFF_EX( b1, 1, 5 );\
        READ_DIFF_EX( b2, 2, 6 );\
        READ_DIFF_EX( b3, 3, 7 );\
        HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\
        HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\
        sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 );\
    }
    /* top and bottom 8x4 halves of the 8x8 block */
    ROW_8x4_SATD( 0, 0 );
    ROW_8x4_SATD( 4, 4 );

#undef READ_DIFF
#undef READ_DIFF_EX
#undef ROW_8x4_SATD
    /* fold the two packed 16-bit partial sums together; >>1 matches the
     * CPU satd normalization */
    return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;
}
  2137. +
/* Half-pel diamond search pattern in QPEL units (+/-2): up, left, right,
 * down.  Indexed by the lane id (mb_i) of the 4-thread cooperative group. */
constant short2 hpoffs[4] =
{
    {0, -2}, {-2, 0}, {2, 0}, {0, 2}
};
  2142. +
/*
 * sub pixel refinement of motion vectors, output MVs and costs are moved from
 * temporary buffers into final per-frame buffer
 *
 * global launch dimensions:  [mb_width * 4, mb_height]
 *
 * With X being the source 16x16 pixels, F is the lowres pixel used by the
 * motion search.  We will now utilize the H V and C pixels (stored in separate
 * planes) to search at half-pel increments.
 *
 * X X X X X X
 *  F H F H F
 * X X X X X X
 *  V C V C V
 * X X X X X X
 *  F H F H F
 * X X X X X X
 *
 * The YX HPEL bits of the motion vector selects the plane we search in.  The
 * four planes are packed in the fref_planes 2D image buffer.  Each sample
 * returns:  s0 = F, s1 = H, s2 = V, s3 = C
 *
 * Four threads cooperate per macroblock (mb_i = 0..3); each evaluates one
 * diamond point (or one quarter of a SATD) and the group reduces through the
 * cost_local scratch array.
 *
 * NOTE(review): cost_local is int16_t and holds (cost<<2)+mb_i — large
 * SAD+lambda costs can exceed 32767 and wrap; confirm the cost range.
 * NOTE(review): cost_local/satd_local are written and read with no barrier;
 * assumes the 4 cooperating threads share a wavefront — verify per target.
 */
kernel void subpel_refine(
    read_only image2d_t   fenc,
    read_only image2d_t   fref_planes,
    const global short2  *in_mvs,
    const global int16_t *in_sad_mv_costs,
    local int16_t        *cost_local,
    local sum2_t         *satd_local,
    local short2         *mvc_local,
    global short2        *fenc_lowres_mv,
    global int16_t       *fenc_lowres_mv_costs,
    int                   mb_width,
    int                   lambda,
    int                   b,
    int                   ref,
    int                   b_islist1 )
{
    int mb_x = get_global_id( 0 ) >> 2;
    if( mb_x >= mb_width )
        return;
    int mb_height = get_global_size( 1 );

    /* lane within the 4-thread group for this macroblock */
    int mb_i = get_global_id( 0 ) & 3;
    int mb_y = get_global_id( 1 );
    int mb_xy = mb_y * mb_width + mb_x;

    /* offset the output pointers to this (b, ref) frame pair's plane */
    fenc_lowres_mv += ( b_islist1 ? ( ref - b - 1 ) : ( b - ref - 1 ) ) * mb_width * mb_height;
    fenc_lowres_mv_costs += ( b_islist1 ? ( ref - b - 1 ) : ( b - ref - 1 ) ) * mb_width * mb_height;

    /* carve out this macroblock's slice of the local scratch buffers */
    int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
    cost_local += mb_in_group * 4;
    satd_local += mb_in_group * 16;
    mvc_local += mb_in_group * 4;

    /* gather MV candidates from left / top / top-right / top-left neighbors
     * and take their median as the MV prediction */
    int i_mvc = 0;
    mvc_local[0] = 0;
    mvc_local[1] = 0;
    mvc_local[2] = 0;
    mvc_local[3] = 0;
    short2 mvp;
#define MVC( DX, DY ) mvc_local[i_mvc++] = in_mvs[mb_width * (mb_y + DY) + (mb_x + DX)];
    if( mb_x > 0 )
        MVC( -1, 0 );
    if( mb_y > 0 )
    {
        MVC( 0, -1 );
        if( mb_x < mb_width - 1 )
            MVC( 1, -1 );
        if( mb_x > 0 )
            MVC( -1, -1 );
    }
    /* with <= 1 candidate the median degenerates to the candidate (or zero);
     * unfilled entries were zeroed above, matching that behavior */
    if( i_mvc <= 1 )
        mvp = mvc_local[0];
    else
        mvp = x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] );
#undef MVC

    /* starting point: full-pel result of the previous motion search */
    int bcost =  in_sad_mv_costs[mb_xy];
    short2 bmv = in_mvs[mb_xy];

    /* Make mvp a QPEL MV */
    mvp.x <<= 2;
    mvp.y <<= 2;

    /* Make bmv a QPEL MV */
    bmv.x <<= 2;
    bmv.y <<= 2;

    /* full-pel pixel coordinate of this 8x8 lowres macroblock */
    int2 coord;
    coord.x = mb_x << 3;
    coord.y = mb_y << 3;
    short2 trymv;
    int2 qpos;
    int cost = 0;

/* each of the 4 threads probes one diamond point; costs (tagged with the
 * lane id in the low 2 bits) are min-reduced through cost_local */
#define HPEL_QPEL( ARR, FUNC )\
    {\
        trymv = bmv + ARR[mb_i];\
        qpos.x = (coord.x << 2) + trymv.x;\
        qpos.y = (coord.y << 2) + trymv.y;\
        cost = FUNC( fenc, coord, fref_planes, qpos ) + lambda * calc_mv_cost( abs_diff( trymv.x, mvp.x ), abs_diff( trymv.y, mvp.y ) );\
        cost_local[mb_i] = (cost<<2) + mb_i;\
        cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) );\
        if( (cost>>2) < bcost )\
        {\
            bmv += ARR[cost&3];\
            bcost = cost>>2;\
        }\
    }

    /* one half-pel diamond iteration with SAD */
    HPEL_QPEL( hpoffs, sad_8x8_ii_hpel );

#define CHEAP_QPEL 1

#if CHEAP_QPEL
    /* one quarter-pel diamond iteration with (cheaper) SAD */
    HPEL_QPEL( dia_offs, sad_8x8_ii_qpel );
#endif

    /* remeasure with SATD */
    qpos.x = (coord.x << 2) + bmv.x;
    qpos.y = (coord.y << 2) + bmv.y;
    cost_local[mb_i] = satd_8x8_ii_qpel_coop4( fenc, coord, fref_planes, qpos, satd_local, mb_i );
    bcost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];
    bcost += lambda * calc_mv_cost( abs_diff( bmv.x, mvp.x ), abs_diff( bmv.y, mvp.y ) );
    short2 bestmv = 0;

#if !CHEAP_QPEL
/* expensive path: probe each quarter-pel diamond point with cooperative SATD,
 * all 4 threads evaluating the same point together */
#define QPEL_SATD( ARR, FUNC )\
    {\
        trymv = bmv + ARR;\
        qpos.x = (coord.x << 2) + trymv.x;\
        qpos.y = (coord.y << 2) + trymv.y;\
        cost_local[mb_i] = FUNC( fenc, coord, fref_planes, qpos, satd_local, mb_i );\
        cost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];\
        cost += lambda * calc_mv_cost( abs_diff( trymv.x, mvp.x ), abs_diff( trymv.y, mvp.y ) );\
        if( cost < bcost )\
        {\
            bestmv = ARR;\
            bcost = cost;\
        }\
    }
    for( int i = 0; i<4; i++ )
        QPEL_SATD( dia_offs[i], satd_8x8_ii_qpel_coop4 );
#endif

    /* all 4 threads computed identical results; writes are redundant but
     * consistent (presumably benign — only lane 0's write need land) */
    fenc_lowres_mv[mb_xy] = bmv+bestmv;
    fenc_lowres_mv_costs[mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK );
}
  2292. diff -r 087c10a1fec7 -r 9a682cb0c74f common/opencl/weightp.cl
  2293. --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
  2294. +++ b/common/opencl/weightp.cl  Tue May 15 18:05:00 2012 -0500
  2295. @@ -0,0 +1,45 @@
  2296. +/*
  2297. +Launch dimensions: width x height (in pixels)
  2298. +*/
  2299. +kernel void weightp_scaled_images( read_only image2d_t in_plane,
  2300. +                                   write_only image2d_t out_plane,
  2301. +                                   uint offset,
  2302. +                                   uint scale,
  2303. +                                   uint denom )
  2304. +{
  2305. +    int gx = get_global_id( 0 );
  2306. +    int gy = get_global_id( 1 );
  2307. +    uint4 input_val;
  2308. +    uint4 output_val;
  2309. +
  2310. +    input_val = read_imageui( in_plane, sampler, (int2)(gx, gy));
  2311. +    output_val = (uint4)(offset) + ( ( ((uint4)(scale)) * input_val ) >> ((uint4)(denom)) );
  2312. +    write_imageui( out_plane, (int2)(gx, gy), output_val );
  2313. +}
  2314. +
  2315. +/*
  2316. +Launch dimensions: width x height (in pixels)
  2317. +*/
  2318. +kernel void weightp_hpel( read_only image2d_t in_plane,
  2319. +                          write_only image2d_t out_plane,
  2320. +                          uint offset,
  2321. +                          uint scale,
  2322. +                          uint denom )
  2323. +{
  2324. +    int gx = get_global_id( 0 );
  2325. +    int gy = get_global_id( 1 );
  2326. +    uint input_val;
  2327. +    uint output_val;
  2328. +
  2329. +    input_val = read_imageui( in_plane, sampler, (int2)(gx, gy)).s0;
  2330. +    //Unpack
  2331. +    uint4 temp;
  2332. +    temp.s0 = input_val & 0x00ff; temp.s1 = (input_val >> 8) & 0x00ff;
  2333. +    temp.s2 = (input_val >> 16) & 0x00ff; temp.s3 = (input_val >> 24) & 0x00ff;
  2334. +
  2335. +    temp = (uint4)(offset) + ( ( ((uint4)(scale)) * temp ) >> ((uint4)(denom)) );
  2336. +
  2337. +    //Pack
  2338. +    output_val = temp.s0 | (temp.s1 << 8) | (temp.s2 << 16) | (temp.s3 << 24);
  2339. +    write_imageui( out_plane, (int2)(gx, gy), output_val );
  2340. +}
  2341. diff -r 087c10a1fec7 -r 9a682cb0c74f common/opencl/x264-cl.h
  2342. --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
  2343. +++ b/common/opencl/x264-cl.h   Tue May 15 18:05:00 2012 -0500
  2344. @@ -0,0 +1,156 @@
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable

/* All image reads use unnormalized integer coordinates, clamped at the
 * frame edges, with no filtering. */
constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

/* 7.18.1.1  Exact-width integer types */
typedef signed char int8_t;
typedef unsigned char   uint8_t;
typedef short  int16_t;
typedef unsigned short  uint16_t;
typedef int  int32_t;
typedef unsigned   uint32_t;
typedef long  int64_t;
typedef unsigned long uint64_t;

typedef union
{
    uint32_t i;
    uint16_t b[2];
    uint8_t  c[4];
}  x264_union32_t;

/* mirror the CPU-side lowres types: 8-bit pixels, 16-bit residual sums,
 * and 32-bit values holding two packed 16-bit sums for SATD */
typedef uint8_t  pixel;
typedef uint16_t sum_t;
typedef uint32_t sum2_t;

/* lowres cost word: low 14 bits cost, high bits reference index */
#define LOWRES_COST_MASK ((1<<14)-1)
#define LOWRES_COST_SHIFT 14
#define COST_MAX (1<<28)

#define X264_MIN( a, b ) ((a)<(b) ? (a) : (b))

#define PIXEL_MAX 255
#define BITS_PER_SUM (8 * sizeof(sum_t))

/* if y beats x, take it and record b alongside */
#define COPY2_IF_LT( x, y, a, b )\
    if((y)<(x))\
    {\
        (x) = (y);\
        (a) = (b);\
    }
/* quarter/full-pel diamond search pattern: up, left, right, down */
constant short2 dia_offs[4] =
{
    {0, -1}, {-1, 0}, {1, 0}, {0, 1},
};
  2389. +
  2390. +inline pixel x264_clip_pixel( int x )
  2391. +{
  2392. +    return ((x & ~PIXEL_MAX) ? (-x) >> 31 & PIXEL_MAX : x);
  2393. +}
  2394. +
  2395. +inline int x264_clip3( int v, int i_min, int i_max )
  2396. +{
  2397. +    return ((v < i_min) ? i_min : (v > i_max) ? i_max : v);
  2398. +}
  2399. +
/* Branchless median of three (same trick as x264's CPU code).  The masked
 * subtractions first order a/b so a <= b, then clamp b against c from both
 * sides.  Relies on arithmetic right shift of negative ints; correct as long
 * as (a-b) etc. cannot overflow, which holds for motion-vector components. */
inline int x264_median( int a, int b, int c )
{
    int t = (a - b) & ((a - b) >> 31);
    a -= t;
    b += t;
    b -= (b - c) & ((b - c) >> 31);
    b += (a - b) & ((a - b) >> 31);
    return b;
}
  2409. +
  2410. +inline short2 x264_median_mv( short2 a, short2 b, short2 c )
  2411. +{
  2412. +    short2 dst;
  2413. +    dst.x = x264_median( a.x, b.x, c.x );
  2414. +    dst.y = x264_median( a.y, b.y, c.y );
  2415. +    return dst;
  2416. +}
  2417. +
/* Absolute value of the two 16-bit lanes packed in a sum2_t, computed
 * SIMD-style: s becomes 0xFFFF per negative lane (sign bit replicated into
 * the whole lane), then (a + s) ^ s negates exactly those lanes. */
inline sum2_t abs2( sum2_t a )
{
    sum2_t s = ((a >> (BITS_PER_SUM - 1)) & (((sum2_t)1 << BITS_PER_SUM) + 1)) * ((sum_t)-1);
    return (a + s) ^ s;
}
  2423. +
/* 4-point Hadamard butterfly on packed sum2_t lanes (two 16-bit diffs per
 * value); outputs may alias inputs only through the temporaries. */
#define HADAMARD4( d0, d1, d2, d3, s0, s1, s2, s3 ) {\
        sum2_t t0 = s0 + s1;\
        sum2_t t1 = s0 - s1;\
        sum2_t t2 = s2 + s3;\
        sum2_t t3 = s2 - s3;\
        d0 = t0 + t2;\
        d2 = t0 - t2;\
        d1 = t1 + t3;\
        d3 = t1 - t3;\
}

/* Same butterfly on int2 vectors (one diff per component). */
#define HADAMARD4V( d0, d1, d2, d3, s0, s1, s2, s3 ) {\
        int2 t0 = s0 + s1;\
        int2 t1 = s0 - s1;\
        int2 t2 = s2 + s3;\
        int2 t3 = s2 - s3;\
        d0 = t0 + t2;\
        d2 = t0 - t2;\
        d1 = t1 + t3;\
        d3 = t1 - t3;\
}

/* Instantiate an 8x4 SATD over pixel pointers with the given address-space
 * qualifiers (q1/q2); port of the scalar CPU satd_8x4, packing two columns
 * per sum2_t lane. */
#define SATD_C_8x4_Q( name, q1, q2 )\
    int name( q1 pixel *pix1, int i_pix1, q2 pixel *pix2, int i_pix2 )\
    {\
        sum2_t tmp[4][4];\
        sum2_t a0, a1, a2, a3;\
        sum2_t sum = 0;\
        for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )\
        {\
            a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);\
            a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);\
            a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);\
            a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);\
            HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3 );\
        }\
        for( int i = 0; i < 4; i++ )\
        {\
            HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );\
            sum += abs2( a0 ) + abs2( a1 ) + abs2( a2 ) + abs2( a3 );\
        }\
        return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;\
    }
  2467. +
/*
 * Utility function to perform a parallel sum reduction of an array of integers
 *
 * x is this work-item's index into the shared array; every work-item of the
 * first dimension must call this (barrier is collective).  Returns the total.
 *
 * NOTE(review): barriers are skipped once dim <= 32 and there is no barrier
 * before the final read of array[0] — this assumes wavefront-synchronous
 * execution (>= 32-wide SIMD); confirm for the targeted devices.
 */
int parallel_sum( int value, int x, volatile local int *array )
{
    array[x] = value;
    barrier( CLK_LOCAL_MEM_FENCE );

    int dim = get_local_size( 0 );

    /* tree reduction: halve the active range each step */
    while( dim > 1 )
    {
        dim >>= 1;

        if( x < dim )
        {
            array[x] += array[x + dim];
        }

        /* within one wavefront the threads run in lockstep, so the barrier
         * is only needed while the range spans multiple wavefronts */
        if( dim > 32 )
        {
            barrier( CLK_LOCAL_MEM_FENCE );
        }
    }

    return array[0];
}
  2495. +
/* Approximate bit cost of an MV residual (dx, dy): roughly 2*log2(d+1) bits
 * per component plus one sign bit when non-zero, mimicking the CPU lookahead's
 * exp-Golomb cost table.  The 0.718f term biases the rounding to better match
 * that table.  NOTE(review): float log2/round may round differently across
 * devices, so costs can differ slightly from the CPU path — presumably
 * acceptable for lookahead decisions. */
int calc_mv_cost( int dx, int dy )
{
    return round( (log2( (float)(dx + 1) ) * 2.0f + 0.718f + !!dx) ) +
           round( (log2( (float)(dy + 1) ) * 2.0f + 0.718f + !!dy) );
}
  2501. diff -r 087c10a1fec7 -r 9a682cb0c74f configure
  2502. --- a/configure Thu Mar 29 14:14:07 2012 -0700
  2503. +++ b/configure Tue May 15 18:05:00 2012 -0500
  2504. @@ -25,6 +25,7 @@
  2505.    --system-libx264         use system libx264 instead of internal
  2506.    --enable-shared          build shared library
  2507.    --enable-static          build static library
  2508. +  --disable-opencl         disable OpenCL features
  2509.    --disable-gpl            disable GPL-only features
  2510.    --disable-thread         disable multithreaded encoding
  2511.    --enable-win32thread     use win32threads (windows only)
  2512. @@ -273,6 +274,7 @@
  2513.  bit_depth="8"
  2514.  chroma_format="all"
  2515.  compiler="GNU"
  2516. +opencl="yes"
  2517.  
  2518.  CFLAGS="$CFLAGS -Wall -I. -I\$(SRCPATH)"
  2519.  LDFLAGS="$LDFLAGS"
  2520. @@ -381,6 +383,9 @@
  2521.          --host=*)
  2522.              host="$optarg"
  2523.              ;;
  2524. +        --disable-opencl)
  2525. +            opencl="no"
  2526. +            ;;
  2527.          --cross-prefix=*)
  2528.              cross_prefix="$optarg"
  2529.              ;;
  2530. @@ -1082,6 +1087,37 @@
  2531.  PROF_USE_LD=$PROF_USE_LD
  2532.  EOF
  2533.  
  2534. +if [ "$opencl" = "yes" ]; then
  2535. +    log_check "looking for xxd"
  2536. +    if ! $(xxd -v 2>/dev/null); then
  2537. +        echo 'OpenCL support requires xxd to compile.'
  2538. +        echo 'use --disable-opencl to compile without OpenCL'
  2539. +        exit 1
  2540. +    elif [ "$CUDA_PATH" != "" ]; then
  2541. +        echo 'HAVE_OPENCL=yes' >> config.mak
  2542. +        echo 'OPENCL_LIB=OpenCL' >> config.mak
  2543. +        echo 'OPENCL_INC_DIR=$(CUDA_PATH)include' >> config.mak
  2544. +        if [ "$ARCH" = "X86" ]; then
  2545. +            echo 'OPENCL_LIB_DIR=$(CUDA_PATH)lib/Win32' >> config.mak
  2546. +        else
  2547. +            echo 'OPENCL_LIB_DIR=$(CUDA_PATH)lib/x64' >> config.mak
  2548. +        fi
  2549. +        define HAVE_OPENCL
  2550. +    elif [ "$AMDAPPSDKROOT" != "" ]; then
  2551. +        echo 'HAVE_OPENCL=yes' >> config.mak
  2552. +        echo 'OPENCL_LIB=OpenCL' >> config.mak
  2553. +        echo 'OPENCL_INC_DIR=$(AMDAPPSDKROOT)/include' >> config.mak
  2554. +        if [ "$ARCH" = "X86" ]; then
  2555. +            echo 'OPENCL_LIB_DIR=$(AMDAPPSDKROOT)/lib/x86' >> config.mak
  2556. +        else
  2557. +            echo 'OPENCL_LIB_DIR=$(AMDAPPSDKROOT)/lib/x86_64' >> config.mak
  2558. +        fi
  2559. +        define HAVE_OPENCL
  2560. +    else
  2561. +        opencl="no"
  2562. +    fi
  2563. +fi
  2564. +
  2565.  if [ $compiler = ICL ]; then
  2566.      echo '%.o: %.c' >> config.mak
  2567.      echo '     $(CC) $(CFLAGS) -c -Fo$@ $<' >> config.mak
  2568. @@ -1192,6 +1228,7 @@
  2569.  visualize:     $vis
  2570.  bit depth:     $bit_depth
  2571.  chroma format: $chroma_format
  2572. +opencl:        $opencl
  2573.  EOF
  2574.  
  2575.  echo >> config.log
  2576. diff -r 087c10a1fec7 -r 9a682cb0c74f encoder/encoder.c
  2577. --- a/encoder/encoder.c Thu Mar 29 14:14:07 2012 -0700
  2578. +++ b/encoder/encoder.c Tue May 15 18:05:00 2012 -0500
  2579. @@ -37,6 +37,10 @@
  2580.  #include "common/visualize.h"
  2581.  #endif
  2582.  
  2583. +#if HAVE_OPENCL
  2584. +#include "opencl.c"
  2585. +#endif
  2586. +
  2587.  //#define DEBUG_MB_TYPE
  2588.  
  2589.  #define bs_write_ue bs_write_ue_big
  2590. @@ -515,6 +519,21 @@
  2591.      if( h->i_thread_frames > 1 )
  2592.          h->param.nalu_process = NULL;
  2593.  
  2594. +#ifndef HAVE_OPENCL
  2595. +    if( h->param.b_opencl )
  2596. +    {
  2597. +        x264_log( h, X264_LOG_WARNING, "not compiled with OpenCL support!\n" );
  2598. +        h->param.b_opencl = 0;
  2599. +    }
  2600. +#endif
  2601. +#if BIT_DEPTH > 8
  2602. +    if( h->param.b_opencl )
  2603. +    {
  2604. +        x264_log( h, X264_LOG_WARNING, "OpenCL lookahead does not support high bit depth!\n" );
  2605. +        h->param.b_opencl = 0;
  2606. +    }
  2607. +#endif
  2608. +
  2609.      h->param.i_keyint_max = x264_clip3( h->param.i_keyint_max, 1, X264_KEYINT_MAX_INFINITE );
  2610.      if( h->param.i_keyint_max == 1 )
  2611.      {
  2612. @@ -1306,6 +1325,14 @@
  2613.              goto fail;
  2614.      }
  2615.  
  2616. +#if HAVE_OPENCL
  2617. +    if( h->param.b_opencl )
  2618. +    {
  2619. +        if( x264_opencl_init( h ) )
  2620. +            h->param.b_opencl = 0;
  2621. +    }
  2622. +#endif
  2623. +
  2624.      if( x264_lookahead_init( h, i_slicetype_length ) )
  2625.          goto fail;
  2626.  
  2627. @@ -3451,6 +3478,10 @@
  2628.                     || h->stat.i_mb_count[SLICE_TYPE_P][I_PCM]
  2629.                     || h->stat.i_mb_count[SLICE_TYPE_B][I_PCM];
  2630.  
  2631. +#if HAVE_OPENCL
  2632. +    x264_opencl_free( h );
  2633. +#endif
  2634. +
  2635.      x264_lookahead_delete( h );
  2636.  
  2637.      if( h->param.b_sliced_threads )
  2638. diff -r 087c10a1fec7 -r 9a682cb0c74f encoder/opencl.c
  2639. --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
  2640. +++ b/encoder/opencl.c  Tue May 15 18:05:00 2012 -0500
  2641. @@ -0,0 +1,424 @@
  2642. +/*****************************************************************************
  2643. + * opencl.c: OpenCL initialization and kernel compilation
  2644. + *****************************************************************************
  2645. + * Copyright (C) 2010-2012 x264 project
  2646. + *
  2647. + * Authors: Steve Borho <sborho@multicorewareinc.com>
  2648. + *
  2649. + * This program is free software; you can redistribute it and/or modify
  2650. + * it under the terms of the GNU General Public License as published by
  2651. + * the Free Software Foundation; either version 2 of the License, or
  2652. + * (at your option) any later version.
  2653. + *
  2654. + * This program is distributed in the hope that it will be useful,
  2655. + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  2656. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  2657. + * GNU General Public License for more details.
  2658. + *
  2659. + * You should have received a copy of the GNU General Public License
  2660. + * along with this program; if not, write to the Free Software
  2661. + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  2662. + *
  2663. + * This program is also available under a commercial proprietary license.
  2664. + * For more information, contact us at licensing@x264.com.
  2665. + *****************************************************************************/
  2666. +
  2667. +#if HAVE_OPENCL
  2668. +#include "common/common.h"
  2669. +#include "encoder/oclobj.h"
  2670. +
  2671. +/* Try to load the cached compiled program binary, verify the device context is
  2672. + * still valid before reuse */
  2673. +static cl_program x264_opencl_cache_load( x264_t *h, char *devname, char *devvendor, char *driverversion )
  2674. +{
  2675. +    cl_program program = NULL;
  2676. +    cl_int status;
  2677. +
  2678. +    /* try to load cached program binary */
  2679. +    FILE *fp = fopen( h->param.psz_clbin_file, "rb" );
  2680. +    if( !fp )
  2681. +        return NULL;
  2682. +
  2683. +    fseek( fp, 0L, SEEK_END );
  2684. +    size_t size = ftell( fp );
  2685. +    rewind( fp );
  2686. +    uint8_t *binary = x264_malloc( size );
  2687. +    if( !binary )
  2688. +        goto fail;
  2689. +
  2690. +    fread( binary, 1, size, fp );
  2691. +    const uint8_t *ptr = (const uint8_t*)binary;
  2692. +
  2693. +#define CHECK_STRING( STR )\
  2694. +    do {\
  2695. +        size_t len = strlen( STR );\
  2696. +        if( size <= len || strncmp( (char*)ptr, STR, len ) )\
  2697. +            goto fail;\
  2698. +        else {\
  2699. +            size -= (len+1); ptr += (len+1);\
  2700. +        }\
  2701. +    } while( 0 )
  2702. +
  2703. +    CHECK_STRING( devname );
  2704. +    CHECK_STRING( devvendor );
  2705. +    CHECK_STRING( driverversion );
  2706. +#undef CHECK_STRING
  2707. +
  2708. +    program = clCreateProgramWithBinary( h->opencl.context, 1, &h->opencl.device, &size, &ptr, NULL, &status );
  2709. +    if( status != CL_SUCCESS )
  2710. +        program = NULL;
  2711. +
  2712. +fail:
  2713. +    fclose( fp );
  2714. +    x264_free( binary );
  2715. +    return program;
  2716. +}
  2717. +
  2718. +/* Save the compiled program binary to a file for later reuse.  Device context
  2719. + * is also saved in the cache file so we do not reuse stale binaries */
  2720. +static void x264_opencl_cache_save( x264_t *h, cl_program program, char *devname, char *devvendor, char *driverversion )
  2721. +{
  2722. +    FILE *fp = fopen( h->param.psz_clbin_file, "wb" );
  2723. +    if( !fp )
  2724. +        return;
  2725. +
  2726. +    size_t size;
  2727. +    cl_int status = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL );
  2728. +    if( status == CL_SUCCESS )
  2729. +    {
  2730. +        unsigned char *binary = x264_malloc( size );
  2731. +        status = clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(unsigned char *), &binary, NULL );
  2732. +        if( status == CL_SUCCESS )
  2733. +        {
  2734. +            fwrite( devname, 1, strlen( devname ), fp );
  2735. +            fwrite( "\n", 1, 1, fp );
  2736. +            fwrite( devvendor, 1, strlen( devvendor ), fp );
  2737. +            fwrite( "\n", 1, 1, fp );
  2738. +            fwrite( driverversion, 1, strlen( driverversion ), fp );
  2739. +            fwrite( "\n", 1, 1, fp );
  2740. +            fwrite( binary, 1, size, fp );
  2741. +        }
  2742. +        x264_free( binary );
  2743. +    }
  2744. +    fclose( fp );
  2745. +}
  2746. +
  2747. +static int x264_detect_AMD_GPU( cl_device_id device, int *b_is_SI )
  2748. +{
  2749. +    char extensions[512];
  2750. +    char boardname[64];
  2751. +    char devname[64];
  2752. +
  2753. +    *b_is_SI = 0;
  2754. +
  2755. +    cl_int status = clGetDeviceInfo( device, CL_DEVICE_EXTENSIONS, sizeof(extensions), extensions, NULL );
  2756. +    if( status != CL_SUCCESS || !strstr( extensions, "cl_amd_media_ops" ) )
  2757. +        return 0;
  2758. +
  2759. +    /* Detect whether GPU is a SouthernIsland based device */
  2760. +#define CL_DEVICE_BOARD_NAME_AMD                    0x4038
  2761. +    status = clGetDeviceInfo( device, CL_DEVICE_BOARD_NAME_AMD, sizeof(boardname), boardname, NULL );
  2762. +    if( status == CL_SUCCESS )
  2763. +    {
  2764. +        boardname[15] = boardname[16] = boardname[17] = 'X';
  2765. +        *b_is_SI = !strcmp( boardname, "AMD Radeon HD 7XXX Series" );
  2766. +        return 1;
  2767. +    }
  2768. +
  2769. +    /* Fall back to checking the device name */
  2770. +    status = clGetDeviceInfo( device, CL_DEVICE_NAME, sizeof(devname), devname, NULL );
  2771. +    if( status != CL_SUCCESS )
  2772. +        return 1;
  2773. +
  2774. +    const char *tahiti_names[] = { "Tahiti", "Pitcairn", "Capeverde", "Bali", NULL };
  2775. +    for( int i = 0; tahiti_names[i]; i++ )
  2776. +        if( !strcmp( devname, tahiti_names[i] ) )
  2777. +        {
  2778. +            *b_is_SI = 1;
  2779. +            return 1;
  2780. +        }
  2781. +    return 1;
  2782. +}
  2783. +
  2784. +/* The OpenCL source under common/opencl will be merged into encoder/oclobj.h by
  2785. + * the Makefile. It defines a x264_opencl_source byte array which we will pass
  2786. + * to clCreateProgramWithSource().  We also attempt to use a cache file for the
  2787. + * compiled binary, stored in the current working folder.  */
  2788. +static cl_program x264_opencl_compile( x264_t *h )
  2789. +{
  2790. +    cl_program program;
  2791. +    cl_int status;
  2792. +
  2793. +    char devname[64];
  2794. +    char devvendor[64];
  2795. +    char driverversion[64];
  2796. +    status  = clGetDeviceInfo( h->opencl.device, CL_DEVICE_NAME,    sizeof(devname), devname, NULL );
  2797. +    status |= clGetDeviceInfo( h->opencl.device, CL_DEVICE_VENDOR,  sizeof(devvendor), devvendor, NULL );
  2798. +    status |= clGetDeviceInfo( h->opencl.device, CL_DRIVER_VERSION, sizeof(driverversion), driverversion, NULL );
  2799. +    if( status != CL_SUCCESS )
  2800. +        return NULL;
  2801. +
  2802. +    int b_isamd = x264_detect_AMD_GPU( h->opencl.device, &h->opencl.b_device_AMD_SI );
  2803. +
  2804. +    x264_log( h, X264_LOG_INFO, "OpenCL: %s %s %s\n", devvendor, devname, h->opencl.b_device_AMD_SI ? "(SI)" : "" );
  2805. +
  2806. +    program = x264_opencl_cache_load( h, devname, devvendor, driverversion );
  2807. +    if( !program )
  2808. +    {
  2809. +        /* clCreateProgramWithSource() requires a pointer variable, you cannot just use &x264_opencl_source */
  2810. +        x264_log( h, X264_LOG_INFO, "Compiling OpenCL kernels...\n" );
  2811. +        const char *strptr = (const char*)x264_opencl_source;
  2812. +        size_t size = sizeof(x264_opencl_source);
  2813. +        program = clCreateProgramWithSource( h->opencl.context, 1, &strptr, &size, &status );
  2814. +        if( status != CL_SUCCESS || !program )
  2815. +        {
  2816. +            x264_log( h, X264_LOG_ERROR, "OpenCL: unable to create program\n" );
  2817. +            return NULL;
  2818. +        }
  2819. +    }
  2820. +
  2821. +    /* Build the program binary for the OpenCL device */
  2822. +    const char *buildopts = "";
  2823. +    if( b_isamd && !h->opencl.b_device_AMD_SI )
  2824. +        buildopts = "-DVECTORIZE=1";
  2825. +    status = clBuildProgram( program, 1, &h->opencl.device, buildopts, NULL, NULL );
  2826. +    if( status == CL_SUCCESS )
  2827. +    {
  2828. +        x264_opencl_cache_save( h, program, devname, devvendor, driverversion );
  2829. +        return program;
  2830. +    }
  2831. +
  2832. +    /* Compile failure, should not happen with production code. */
  2833. +
  2834. +    size_t build_log_len = 0;
  2835. +
  2836. +    status = clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, build_log_len, NULL, &build_log_len );
  2837. +    if( status != CL_SUCCESS )
  2838. +    {
  2839. +        x264_log( h, X264_LOG_ERROR, "OpenCL: Compilation failed, unable to get build log\n" );
  2840. +        return NULL;
  2841. +    }
  2842. +
  2843. +    char *build_log = x264_malloc( build_log_len );
  2844. +    if( !build_log )
  2845. +        return NULL;
  2846. +
  2847. +    status = clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, build_log_len, build_log, NULL );
  2848. +    if( status != CL_SUCCESS )
  2849. +    {
  2850. +        x264_log( h, X264_LOG_ERROR, "OpenCL: Compilation failed, unable to get build log\n" );
  2851. +        x264_free( build_log );
  2852. +        return NULL;
  2853. +    }
  2854. +
  2855. +    FILE *lg = fopen( "x264_kernel_build_log.txt", "w" );
  2856. +    if( lg )
  2857. +    {
  2858. +        fwrite( build_log, 1, build_log_len, lg );
  2859. +        fclose( lg );
  2860. +        x264_log( h, X264_LOG_ERROR, "OpenCL: kernel build errors written to x264_kernel_build_log.txt\n" );
  2861. +    }
  2862. +
  2863. +    x264_free( build_log );
  2864. +    return NULL;
  2865. +}
  2866. +
  2867. +static void x264_opencl_free_lookahead( x264_t *h )
  2868. +{
  2869. +#define RELEASE( a, f ) if( a ) f( a );
  2870. +    RELEASE( h->opencl.intra_kernel, clReleaseKernel )
  2871. +    RELEASE( h->opencl.rowsum_intra_kernel, clReleaseKernel )
  2872. +    RELEASE( h->opencl.downscale_kernel1, clReleaseKernel )
  2873. +    RELEASE( h->opencl.downscale_kernel2, clReleaseKernel )
  2874. +    RELEASE( h->opencl.downscale_hpel_kernel, clReleaseKernel )
  2875. +    RELEASE( h->opencl.weightp_hpel_kernel, clReleaseKernel )
  2876. +    RELEASE( h->opencl.weightp_scaled_images_kernel, clReleaseKernel )
  2877. +    RELEASE( h->opencl.memset_kernel, clReleaseKernel )
  2878. +    RELEASE( h->opencl.hme_kernel, clReleaseKernel )
  2879. +    RELEASE( h->opencl.subpel_refine_kernel, clReleaseKernel )
  2880. +    RELEASE( h->opencl.mode_select_kernel, clReleaseKernel )
  2881. +    RELEASE( h->opencl.rowsum_inter_kernel, clReleaseKernel )
  2882. +    RELEASE( h->opencl.lookahead_program, clReleaseProgram )
  2883. +    RELEASE( h->opencl.row_satds[0], clReleaseMemObject )
  2884. +    RELEASE( h->opencl.row_satds[1], clReleaseMemObject )
  2885. +    RELEASE( h->opencl.frame_stats[0], clReleaseMemObject )
  2886. +    RELEASE( h->opencl.frame_stats[1], clReleaseMemObject )
  2887. +    RELEASE( h->opencl.mv_buffers[0], clReleaseMemObject )
  2888. +    RELEASE( h->opencl.mv_buffers[1], clReleaseMemObject )
  2889. +    RELEASE( h->opencl.mvp_buffer, clReleaseMemObject )
  2890. +    RELEASE( h->opencl.luma_16x16_image[0], clReleaseMemObject )
  2891. +    RELEASE( h->opencl.luma_16x16_image[1], clReleaseMemObject )
  2892. +    RELEASE( h->opencl.lowres_mv_costs, clReleaseMemObject )
  2893. +    RELEASE( h->opencl.lowres_costs[0], clReleaseMemObject )
  2894. +    RELEASE( h->opencl.lowres_costs[1], clReleaseMemObject )
  2895. +    RELEASE( h->opencl.page_locked_buffer, clReleaseMemObject )
  2896. +    RELEASE( h->opencl.weighted_luma_hpel, clReleaseMemObject )
  2897. +    for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
  2898. +    {
  2899. +        RELEASE( h->opencl.weighted_scaled_images[i], clReleaseMemObject )
  2900. +    }
  2901. +#undef RELEASE
  2902. +}
  2903. +
  2904. +static int x264_opencl_init_lookahead( x264_t *h )
  2905. +{
  2906. +    if( h->param.rc.i_lookahead == 0 )
  2907. +        return -1;
  2908. +
  2909. +    char *kernelnames[] = {
  2910. +        "mb_intra_cost_satd_8x8",
  2911. +        "sum_intra_cost",
  2912. +        "downscale_hpel",
  2913. +        "downscale1",
  2914. +        "downscale2",
  2915. +        "memset_int16",
  2916. +        "weightp_scaled_images",
  2917. +        "weightp_hpel",
  2918. +        "hierarchical_motion",
  2919. +        "subpel_refine",
  2920. +        "mode_selection",
  2921. +        "sum_inter_cost"
  2922. +    };
  2923. +    cl_kernel *kernels[] = {
  2924. +        &h->opencl.intra_kernel,
  2925. +        &h->opencl.rowsum_intra_kernel,
  2926. +        &h->opencl.downscale_hpel_kernel,
  2927. +        &h->opencl.downscale_kernel1,
  2928. +        &h->opencl.downscale_kernel2,
  2929. +        &h->opencl.memset_kernel,
  2930. +        &h->opencl.weightp_scaled_images_kernel,
  2931. +        &h->opencl.weightp_hpel_kernel,
  2932. +        &h->opencl.hme_kernel,
  2933. +        &h->opencl.subpel_refine_kernel,
  2934. +        &h->opencl.mode_select_kernel,
  2935. +        &h->opencl.rowsum_inter_kernel
  2936. +    };
  2937. +    cl_int status;
  2938. +
  2939. +    h->opencl.lookahead_program = x264_opencl_compile( h );
  2940. +    if( !h->opencl.lookahead_program )
  2941. +    {
  2942. +        x264_opencl_free_lookahead( h );
  2943. +        return -1;
  2944. +    }
  2945. +
  2946. +    for( int i = 0; i < sizeof(kernelnames)/sizeof(char*); i++ )
  2947. +    {
  2948. +        *kernels[i] = clCreateKernel( h->opencl.lookahead_program, kernelnames[i], &status );
  2949. +        if( status != CL_SUCCESS )
  2950. +        {
  2951. +            x264_log( h, X264_LOG_ERROR, "Unable to compile kernel '%s' (%d)\n", kernelnames[i], status );
  2952. +            x264_opencl_free_lookahead( h );
  2953. +            return -1;
  2954. +        }
  2955. +    }
  2956. +
  2957. +    h->opencl.page_locked_buffer = clCreateBuffer( h->opencl.context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, PAGE_LOCKED_BUF_SIZE, NULL, &status );
  2958. +    if( status != CL_SUCCESS )
  2959. +    {
  2960. +        x264_log( h, X264_LOG_ERROR, "Unable to allocate page-locked buffer, error '%d'\n", status );
  2961. +        return -1;
  2962. +    }
  2963. +    h->opencl.page_locked_ptr = clEnqueueMapBuffer( h->opencl.queue, h->opencl.page_locked_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
  2964. +                                                    0, PAGE_LOCKED_BUF_SIZE, 0, NULL, NULL, &status );
  2965. +    if( status != CL_SUCCESS )
  2966. +    {
  2967. +        x264_log( h, X264_LOG_ERROR, "Unable to map page-locked buffer, error '%d'\n", status );
  2968. +        return -1;
  2969. +    }
  2970. +
  2971. +    return 0;
  2972. +}
  2973. +
  2974. +static void  x264_opencl_error_notify( const char *errinfo, const void *private_info, size_t cb, void *user_data )
  2975. +{
  2976. +    x264_t *h = (x264_t*)user_data;
  2977. +    x264_log( h, X264_LOG_ERROR, "OpenCL: %s\n", errinfo );
  2978. +}
  2979. +
  2980. +static int x264_opencl_init( x264_t *h )
  2981. +{
  2982. +    cl_int status;
  2983. +    cl_uint numPlatforms;
  2984. +    int ret = -1;
  2985. +
  2986. +    status = clGetPlatformIDs( 0, NULL, &numPlatforms );
  2987. +    if( status != CL_SUCCESS || numPlatforms == 0 )
  2988. +        return -1;
  2989. +
  2990. +    cl_platform_id *platforms = (cl_platform_id*)x264_malloc( numPlatforms * sizeof(cl_platform_id) );
  2991. +    status = clGetPlatformIDs( numPlatforms, platforms, NULL );
  2992. +    if( status != CL_SUCCESS )
  2993. +    {
  2994. +        x264_free( platforms );
  2995. +        return -1;
  2996. +    }
  2997. +
  2998. +    /* Select first OpenCL platform that supports a GPU device */
  2999. +    for( cl_uint i = 0; i < numPlatforms; ++i )
  3000. +    {
  3001. +        cl_uint gpu_count;
  3002. +        status = clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &gpu_count );
  3003. +        if( status == CL_SUCCESS && gpu_count > 0 )
  3004. +        {
  3005. +            /* take GPU 0 */
  3006. +            status = clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, 1, &h->opencl.device, NULL );
  3007. +            if( status != CL_SUCCESS )
  3008. +                continue;
  3009. +
  3010. +            h->opencl.context = clCreateContext( NULL, 1, &h->opencl.device, (void*)x264_opencl_error_notify, (void*)h, &status );
  3011. +            if( status != CL_SUCCESS )
  3012. +                continue;
  3013. +
  3014. +            cl_bool image_support;
  3015. +            clGetDeviceInfo( h->opencl.device, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &image_support, NULL );
  3016. +            if( !image_support )
  3017. +                continue;
  3018. +
  3019. +            cl_uint count = 0;
  3020. +            cl_image_format imageType[100];
  3021. +            clGetSupportedImageFormats( h->opencl.context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, 100, imageType, &count );
  3022. +            count = X264_MIN( count, 100 );
  3023. +
  3024. +            int b_has_r = 0;
  3025. +            int b_has_rgba = 0;
  3026. +            for( cl_uint j = 0; j<count; j++ )
  3027. +            {
  3028. +                if( imageType[j].image_channel_order == CL_R )
  3029. +                    b_has_r = 1;
  3030. +                else if( imageType[j].image_channel_order == CL_RGBA )
  3031. +                    b_has_rgba = 1;
  3032. +            }
  3033. +            if( !b_has_r || !b_has_rgba )
  3034. +                continue;
  3035. +
  3036. +            h->opencl.queue = clCreateCommandQueue( h->opencl.context, h->opencl.device, 0, &status );
  3037. +            if( status != CL_SUCCESS )
  3038. +                continue;
  3039. +
  3040. +            ret = 0;
  3041. +            break;
  3042. +        }
  3043. +    }
  3044. +
  3045. +    x264_free( platforms );
  3046. +
  3047. +    if( !h->param.psz_clbin_file )
  3048. +        h->param.psz_clbin_file = "x264_lookahead.clbin";
  3049. +
  3050. +    if( !ret )
  3051. +        ret = x264_opencl_init_lookahead( h );
  3052. +
  3053. +    return ret;
  3054. +}
  3055. +
  3056. +void x264_opencl_free( x264_t *h )
  3057. +{
  3058. +    x264_opencl_free_lookahead( h );
  3059. +
  3060. +    if( h->opencl.queue )
  3061. +        clReleaseCommandQueue( h->opencl.queue );
  3062. +    if( h->opencl.context )
  3063. +        clReleaseContext( h->opencl.context );
  3064. +}
  3065. +#endif /* HAVE_OPENCL */
  3066. diff -r 087c10a1fec7 -r 9a682cb0c74f encoder/slicetype-cl.c
  3067. --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
  3068. +++ b/encoder/slicetype-cl.c    Tue May 15 18:05:00 2012 -0500
  3069. @@ -0,0 +1,655 @@
  3070. +/*****************************************************************************
  3071. + * slicetype-cl.c: OpenCL slicetype decision code (lowres lookahead)
  3072. + *****************************************************************************
  3073. + * Copyright (C) 2012 x264 project
  3074. + *
  3075. + * Authors: Steve Borho <sborho@multicorewareinc.com>
  3076. + *
  3077. + * This program is free software; you can redistribute it and/or modify
  3078. + * it under the terms of the GNU General Public License as published by
  3079. + * the Free Software Foundation; either version 2 of the License, or
  3080. + * (at your option) any later version.
  3081. + *
  3082. + * This program is distributed in the hope that it will be useful,
  3083. + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  3084. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  3085. + * GNU General Public License for more details.
  3086. + *
  3087. + * You should have received a copy of the GNU General Public License
  3088. + * along with this program; if not, write to the Free Software
  3089. + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  3090. + *
  3091. + * This program is also available under a commercial proprietary license.
  3092. + * For more information, contact us at licensing@x264.com.
  3093. + *****************************************************************************/
  3094. +
  3095. +#define OCLCHECK( method, ... )\
  3096. +    status = method( __VA_ARGS__ ); if( status != CL_SUCCESS ) {\
  3097. +        x264_log( h, X264_LOG_ERROR, # method " error '%d'\n", status ); return status; }
  3098. +
  3099. +static void x264_opencl_flush( x264_t *h )
  3100. +{
  3101. +    clFinish( h->opencl.queue );
  3102. +
  3103. +    /* Finish copies from the GPU by copying from the page-locked buffer to
  3104. +     * their final destination
  3105. +     */
  3106. +    for( int i = 0; i < h->opencl.num_copies; i++ )
  3107. +        memcpy( h->opencl.copies[i].dest, h->opencl.copies[i].src, h->opencl.copies[i].bytes );
  3108. +    h->opencl.num_copies = 0;
  3109. +    h->opencl.pl_occupancy = 0;
  3110. +}
  3111. +
  3112. +static void *x264_opencl_alloc_locked( x264_t *h, int bytes )
  3113. +{
  3114. +    if( h->opencl.pl_occupancy + bytes >= PAGE_LOCKED_BUF_SIZE )
  3115. +        x264_opencl_flush( h );
  3116. +    assert( bytes < PAGE_LOCKED_BUF_SIZE );
  3117. +    char *ptr = h->opencl.page_locked_ptr + h->opencl.pl_occupancy;
  3118. +    h->opencl.pl_occupancy += bytes;
  3119. +    return ptr;
  3120. +}
  3121. +
/* Per-frame lowres/intra setup on the GPU: uploads the frame's luma plane,
 * produces the lowres image, downscale pyramid and HPEL image, then runs the
 * intra-cost and intra-rowsum kernels.  Results (intra costs, row SATDs, and
 * the first two ints of frame_stats into i_cost_est/i_cost_est_aq) are read
 * back asynchronously through the page-locked buffer and recorded in
 * h->opencl.copies[]; x264_opencl_flush() completes those copies.
 * Returns 0 on success, -1 on allocation failure, or a cl error code
 * propagated by OCLCHECK. */
static int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda )
{
    /* run at most once per frame */
    if( fenc->b_intra_calculated )
        return 0;
    fenc->b_intra_calculated = 1;

    int luma_length = fenc->i_stride[0] * fenc->i_lines[0];

/* Local helpers: create a buffer/2D image or log the cl error and bail out.
 * NOTE(review): only CREATEBUF is #undef'd at the end; CREATEIMAGE leaks past
 * this function — confirm that is intentional. */
#define CREATEBUF( out, flags, size )\
    out = clCreateBuffer( h->opencl.context, (flags), (size), NULL, &status );\
    if( status != CL_SUCCESS ) { x264_log( h, X264_LOG_ERROR, "clCreateBuffer error '%d'\n", status ); return -1; }
#define CREATEIMAGE( out, flags, pf, width, height )\
    out = clCreateImage2D( h->opencl.context, (flags), &pf, width, height, 0, NULL, &status );\
    if( status != CL_SUCCESS ) { x264_log( h, X264_LOG_ERROR, "clCreateImage2D error '%d'\n", status ); return -1; }

    int mb_count = h->mb.i_mb_count;
    cl_int status;

    if( !h->opencl.lowres_mv_costs )
    {
        /* Allocate shared memory buffers (once, on the first frame; keyed off
         * lowres_mv_costs being NULL) */
        int width = h->mb.i_mb_width * 8 * sizeof(pixel);
        int height = h->mb.i_mb_height * 8 * sizeof(pixel);

        cl_image_format pixel_format;
        pixel_format.image_channel_order = CL_R;
        pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
        CREATEIMAGE( h->opencl.weighted_luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );

        /* one RGBA image per pyramid level, halving dimensions each step */
        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
        {
            pixel_format.image_channel_order = CL_RGBA;
            pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
            CREATEIMAGE( h->opencl.weighted_scaled_images[i], CL_MEM_READ_WRITE, pixel_format, width, height );
            width >>= 1;
            height >>= 1;
        }

        CREATEBUF( h->opencl.lowres_mv_costs,  CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
        CREATEBUF( h->opencl.lowres_costs[0],  CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
        CREATEBUF( h->opencl.lowres_costs[1],  CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
        CREATEBUF( h->opencl.mv_buffers[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
        CREATEBUF( h->opencl.mv_buffers[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
        CREATEBUF( h->opencl.mvp_buffer,    CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
        CREATEBUF( h->opencl.frame_stats[0],   CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
        CREATEBUF( h->opencl.frame_stats[1],   CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
        CREATEBUF( h->opencl.row_satds[0],     CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
        CREATEBUF( h->opencl.row_satds[1],     CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
        CREATEBUF( h->opencl.luma_16x16_image[0], CL_MEM_READ_ONLY, luma_length );
        CREATEBUF( h->opencl.luma_16x16_image[1], CL_MEM_READ_ONLY, luma_length );
    }

    if( !fenc->opencl.intra_cost )
    {
        /* Allocate per-frame buffers (once per x264_frame_t) */
        int width = h->mb.i_mb_width * 8 * sizeof(pixel);
        int height = h->mb.i_mb_height * 8 * sizeof(pixel);

        cl_image_format pixel_format;
        pixel_format.image_channel_order = CL_R;
        pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
        CREATEIMAGE( fenc->opencl.luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );

        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
        {
            pixel_format.image_channel_order = CL_RGBA;
            pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
            CREATEIMAGE( fenc->opencl.scaled_image2Ds[i], CL_MEM_READ_WRITE, pixel_format, width, height );
            width >>= 1;
            height >>= 1;
        }
        CREATEBUF( fenc->opencl.inv_qscale_factor, CL_MEM_READ_ONLY,  mb_count * sizeof(int16_t) );
        CREATEBUF( fenc->opencl.intra_cost,        CL_MEM_WRITE_ONLY, mb_count * sizeof(int16_t) );
        CREATEBUF( fenc->opencl.lowres_mvs0,       CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
        CREATEBUF( fenc->opencl.lowres_mvs1,       CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
        CREATEBUF( fenc->opencl.lowres_mv_costs0,  CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
        CREATEBUF( fenc->opencl.lowres_mv_costs1,  CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
    }
#undef CREATEBUF

    /* Copy image to the GPU, downscale to unpadded 8x8, then continue for all scales */

    char *locked = x264_opencl_alloc_locked( h, luma_length );
    memcpy( locked, fenc->plane[0], luma_length );
    /* last_buf alternates 0/1 so uploads double-buffer across frames */
    OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue,  h->opencl.luma_16x16_image[h->opencl.last_buf], CL_FALSE, 0, luma_length, locked, 0, NULL, NULL );

    size_t gdim[2];
    if( h->param.rc.i_aq_mode && fenc->i_inv_qscale_factor )
    {
        /* AQ enabled: upload the per-MB inverse qscale factors */
        int size = h->mb.i_mb_count * sizeof(int16_t);
        locked = x264_opencl_alloc_locked( h, size );
        memcpy( locked, fenc->i_inv_qscale_factor, size );
        OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, fenc->opencl.inv_qscale_factor, CL_FALSE, 0, size, locked, 0, NULL, NULL );
    }
    else
    {
        /* Fill fenc->opencl.inv_qscale_factor with NOP (256) */
        cl_uint arg = 0;
        int16_t value = 256;
        OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
        OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(int16_t), &value );
        gdim[0] = h->mb.i_mb_count;
        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.memset_kernel, 1, NULL, gdim, NULL, 0, NULL, NULL );
    }

    /* Downscale to lowres and build the HPEL image in one kernel */
    int stride = fenc->i_stride[0];
    cl_uint arg = 0;
    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.luma_16x16_image[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.luma_hpel );
    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(int), &stride );
    gdim[0] = 8 * h->mb.i_mb_width;
    gdim[1] = 8 * h->mb.i_mb_height;
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.downscale_hpel_kernel, 2, NULL, gdim, NULL, 0, NULL, NULL );

    for( int i = 0; i < NUM_IMAGE_SCALES - 1; i++ )
    {
        /* Workaround for AMD Southern Island, alternate kernel instances.  No
         * perf impact to this, so we do it for all GPUs
         */
        cl_kernel kern = i & 1 ? h->opencl.downscale_kernel1 : h->opencl.downscale_kernel2;

        arg = 0;
        OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i] );
        OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i+1] );
        gdim[0] >>= 1;
        gdim[1] >>= 1;
        /* stop the pyramid once a level would be smaller than 16x16 */
        if( gdim[0] < 16 || gdim[1] < 16 )
            break;
        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, kern, 2, NULL, gdim, NULL, 0, NULL, NULL );
    }

    /* Intra SATD cost kernel: one work-group row per 32 MBs, local 32x8 */
    size_t ldim[2];
    gdim[0] = ((h->mb.i_mb_width + 31)>>5)<<5;
    gdim[1] = 8*h->mb.i_mb_height;
    ldim[0] = 32;
    ldim[1] = 8;
    arg = 0;
    int slow = h->param.analyse.i_subpel_refine > 7;
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &lambda );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &slow );
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );

    /* Intra row-sum reduction: one 256-wide work-group per MB row */
    gdim[0] = 256;
    gdim[1] = h->mb.i_mb_height;
    ldim[0] = 256;
    ldim[1] = 1;
    arg = 0;
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );

    /* Four copy records are queued below; flush first if they would not fit */
    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
        x264_opencl_flush( h );

    /* Async readback of per-MB intra costs */
    int size = h->mb.i_mb_count * sizeof(int16_t);
    locked = x264_opencl_alloc_locked( h, size );
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.intra_cost, CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[0][0];
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].bytes = size;
    h->opencl.num_copies++;

    /* Async readback of per-row SATD sums */
    size = h->mb.i_mb_height * sizeof(int);
    locked = x264_opencl_alloc_locked( h, size );
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[0][0];
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].bytes = size;
    h->opencl.num_copies++;

    /* Async readback of frame stats; first two ints feed i_cost_est and
     * i_cost_est_aq respectively */
    size = sizeof(int) * 4;
    locked = x264_opencl_alloc_locked( h, size );
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[0][0];
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
    h->opencl.num_copies++;
    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[0][0];
    h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
    h->opencl.num_copies++;

    /* swap double-buffered resources for the next frame */
    h->opencl.last_buf = !h->opencl.last_buf;
    return 0;
}
  3315. +
/* Choose local work dimensions (ldims, 2D) — and adjust the global ones
 * (gdims) — for launching `kernel` on `device`.  Starting from the device's
 * preferred work-group size multiple, the loops below ensure that:
 *   - ldims[1] evenly divides gdims[1],
 *   - ldims[0]*ldims[1] stays under the kernel's max work-group size,
 *   - gdims[0] becomes a multiple of ldims[0] (rounded up or down),
 *   - enough work-groups exist to occupy the device's compute units.
 * gdims[0] may be modified in place; gdims[1] is never changed. */
static void x264_optimal_launch_dims( size_t *gdims, size_t *ldims, const cl_kernel kernel, const cl_device_id device )
{
    size_t max_work_group = 256;    /* reasonable defaults for OpenCL 1.0 devices, below APIs may fail */
    size_t preferred_multiple = 64;
    cl_uint num_cus = 6;

    /* Query the real limits; return codes are deliberately ignored, leaving
     * the defaults above in effect on failure */
    clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group, NULL );
    clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &preferred_multiple, NULL );
    clGetDeviceInfo( device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &num_cus, NULL );

    ldims[0] = preferred_multiple;
    ldims[1] = 8;

    /* make ldims[1] an even divisor of gdims[1] */
    /* NOTE(review): the bit-mask test assumes ldims[1] stays a power of two;
     * total work-items are preserved by widening ldims[0] as ldims[1] shrinks */
    while( gdims[1] & (ldims[1] - 1) )
    {
        ldims[0] <<= 1;
        ldims[1] >>= 1;
    }
    /* make total ldims fit under the max work-group dimensions for the device */
    while( ldims[0] * ldims[1] > max_work_group )
    {
        if( (ldims[0] <= preferred_multiple) && (ldims[1] > 1) )
            ldims[1] >>= 1;
        else
            ldims[0] >>= 1;
    }

    if( ldims[0] > gdims[0] )
    {
        /* remove preferred multiples until we're close to gdims[0] */
        while( gdims[0] + preferred_multiple < ldims[0] )
            ldims[0] -= preferred_multiple;
        gdims[0] = ldims[0];
    }
    else
    {
        /* make gdims an even multiple of ldims */
        gdims[0] = (gdims[0]+ldims[0]-1)/ldims[0];
        gdims[0] *= ldims[0];
    }

    /* make ldims smaller to spread work across compute units */
    while( (gdims[0]/ldims[0]) * (gdims[1]/ldims[1]) * 2 <= num_cus )
    {
        if( ldims[0] > preferred_multiple )
            ldims[0] >>= 1;
        else if( ldims[1] > 1 )
            ldims[1] >>= 1;
        else
            break;
    }
    /* for smaller GPUs, try not to abuse their texture cache */
    if( num_cus == 6 && ldims[0] == 64 && ldims[1] == 4 )
        ldims[0] = 32;
}
  3372. +
  3373. +static int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w )
  3374. +{
  3375. +    x264_frame_t *fenc = frames[b];
  3376. +    x264_frame_t *fref = frames[ref];
  3377. +
  3378. +    cl_mem ref_scaled_images[NUM_IMAGE_SCALES];
  3379. +    cl_mem ref_luma_hpel;
  3380. +    cl_int status;
  3381. +
  3382. +    if( w && (w->i_denom) != 0 )
  3383. +    {
  3384. +        size_t gdims[2];
  3385. +
  3386. +        gdims[0] = 8 * h->mb.i_mb_width;
  3387. +        gdims[1] = 8 * h->mb.i_mb_height;
  3388. +
  3389. +        /* WeightP: Perform a filter on fref->opencl.scaled_image2Ds[] and fref->opencl.luma_hpel */
  3390. +        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
  3391. +        {
  3392. +            cl_uint arg = 0;
  3393. +            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &fref->opencl.scaled_image2Ds[i] );
  3394. +            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_scaled_images[i] );
  3395. +            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_offset );
  3396. +            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_scale );
  3397. +            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_denom );
  3398. +            OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_scaled_images_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );
  3399. +
  3400. +            gdims[0] >>= 1;
  3401. +            gdims[1] >>= 1;
  3402. +            if( gdims[0] < 16 || gdims[1] < 16 )
  3403. +                break;
  3404. +        }
  3405. +
  3406. +        cl_uint arg = 0;
  3407. +        gdims[0] = 8 * h->mb.i_mb_width;
  3408. +        gdims[1] = 8 * h->mb.i_mb_height;
  3409. +
  3410. +        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &fref->opencl.luma_hpel );
  3411. +        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_luma_hpel );
  3412. +        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_offset );
  3413. +        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_scale );
  3414. +        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_denom );
  3415. +        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_hpel_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );
  3416. +
  3417. +        /* Use weighted reference planes for motion search */
  3418. +        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
  3419. +            ref_scaled_images[i] = h->opencl.weighted_scaled_images[i];
  3420. +        ref_luma_hpel = h->opencl.weighted_luma_hpel;
  3421. +    }
  3422. +    else
  3423. +    {
  3424. +        /* Use unweighted reference planes for motion search */
  3425. +        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
  3426. +            ref_scaled_images[i] = fref->opencl.scaled_image2Ds[i];
  3427. +        ref_luma_hpel = fref->opencl.luma_hpel;
  3428. +    }
  3429. +
  3430. +    const int num_iterations[NUM_IMAGE_SCALES] = { 1, 1, 2, 3 };
  3431. +    int b_first_iteration = 1;
  3432. +    int b_reverse_references = 1;
  3433. +    int A = 1;
  3434. +
  3435. +
  3436. +    int mb_per_group = 0;
  3437. +    int cost_local_size = 0;
  3438. +    int mvc_local_size = 0;
  3439. +    int fenc_local_size = 0;
  3440. +    int mb_width;
  3441. +
  3442. +    size_t gdims[2];
  3443. +    size_t ldims[2];
  3444. +
  3445. +    /* scale 0 is 8x8 */
  3446. +    for( int scale = NUM_IMAGE_SCALES-1; scale >= 0; scale-- )
  3447. +    {
  3448. +        mb_width = h->mb.i_mb_width >> scale;
  3449. +        gdims[0] = mb_width;
  3450. +        gdims[1] = h->mb.i_mb_height >> scale;
  3451. +        if( gdims[0] < 2 || gdims[1] < 2 )
  3452. +            continue;
  3453. +        gdims[0] <<= 2;
  3454. +        x264_optimal_launch_dims( gdims, ldims, h->opencl.hme_kernel, h->opencl.device );
  3455. +
  3456. +        mb_per_group = (ldims[0] >> 2) * ldims[1];
  3457. +        cost_local_size = 4 * mb_per_group * sizeof(int16_t);
  3458. +        fenc_local_size = 16 * sizeof(cl_uint4) * mb_per_group;
  3459. +        mvc_local_size = 4 * mb_per_group * sizeof(int16_t) * 2;
  3460. +        int scaled_me_range = h->param.analyse.i_me_range >> scale;
  3461. +        int b_shift_index = 1;
  3462. +
  3463. +        cl_uint arg = 0;
  3464. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[scale] );
  3465. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &ref_scaled_images[scale] );
  3466. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
  3467. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
  3468. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
  3469. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), (void*)&h->opencl.mvp_buffer );
  3470. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, cost_local_size, NULL );
  3471. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, mvc_local_size, NULL );
  3472. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &mb_width );
  3473. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &lambda );
  3474. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scaled_me_range );
  3475. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scale );
  3476. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_shift_index );
  3477. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_first_iteration );
  3478. +        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_reverse_references );
  3479. +
  3480. +        for( int iter = 0; iter < num_iterations[scale]; iter++ )
  3481. +        {
  3482. +            OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.hme_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
  3483. +
  3484. +            b_shift_index = 0;
  3485. +            b_first_iteration = 0;
  3486. +            if( scale > 2 )
  3487. +                b_reverse_references = !b_reverse_references;
  3488. +            else
  3489. +                b_reverse_references = 0;
  3490. +            A = !A;
  3491. +            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 2, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
  3492. +            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 3, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
  3493. +            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 3, sizeof(int), &b_shift_index );
  3494. +            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 2, sizeof(int), &b_first_iteration );
  3495. +            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 1, sizeof(int), &b_reverse_references );
  3496. +        }
  3497. +    }
  3498. +
  3499. +    int satd_local_size = mb_per_group * sizeof(uint32_t) * 16;
  3500. +    cl_uint arg = 0;
  3501. +    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
  3502. +    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &ref_luma_hpel );
  3503. +    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
  3504. +    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
  3505. +    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, cost_local_size, NULL );
  3506. +    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, satd_local_size, NULL );
  3507. +    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, mvc_local_size, NULL );
  3508. +
  3509. +    if( b_islist1 )
  3510. +    {
  3511. +        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
  3512. +        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
  3513. +    }
  3514. +    else
  3515. +    {
  3516. +        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
  3517. +        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
  3518. +    }
  3519. +
  3520. +    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &mb_width );
  3521. +    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &lambda );
  3522. +    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b );
  3523. +    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &ref );
  3524. +    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b_islist1 );
  3525. +
  3526. +    if( h->opencl.b_device_AMD_SI )
  3527. +    {
   3528. +        /* workaround for AMD Southern Islands: perform a meaningless small copy */
  3529. +        OCLCHECK( clEnqueueCopyBuffer, h->opencl.queue, h->opencl.mv_buffers[A], h->opencl.mv_buffers[!A], 0, 0, 20, 0, NULL, NULL );
  3530. +    }
  3531. +
  3532. +    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.subpel_refine_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
  3533. +
  3534. +    int mvlen = 2 * sizeof(int16_t) * h->mb.i_mb_count;
  3535. +
  3536. +    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 1 )
  3537. +        x264_opencl_flush( h );
  3538. +
  3539. +    char *locked = x264_opencl_alloc_locked( h, mvlen );
  3540. +    h->opencl.copies[h->opencl.num_copies].src = locked;
  3541. +    h->opencl.copies[h->opencl.num_copies].bytes = mvlen;
  3542. +
  3543. +    if( b_islist1 )
  3544. +    {
  3545. +        int mvs_offset = mvlen * (ref - b - 1);
  3546. +        OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs1, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
  3547. +        h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[1][ref - b - 1];
  3548. +    }
  3549. +    else
  3550. +    {
  3551. +        int mvs_offset = mvlen * (b - ref - 1);
  3552. +        OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs0, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
  3553. +        h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[0][b - ref - 1];
  3554. +    }
  3555. +
  3556. +    h->opencl.num_copies++;
  3557. +
  3558. +    return 0;
  3559. +}
  3560. +
   3561. +static int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor ) /* queue mode-select + row-sum kernels and async readbacks for frame b's cost estimate */
   3562. +{
   3563. +    cl_int status; /* consumed by the OCLCHECK error-checking macro on every CL call below */
   3564. +    x264_frame_t *fenc = frames[b];   /* frame whose cost is being estimated */
   3565. +    x264_frame_t *fref0 = frames[p0]; /* list0 (past) reference */
   3566. +    x264_frame_t *fref1 = frames[p1]; /* list1 (future) reference */
   3567. +
   3568. +    int bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor >> 2) : 32; /* 32 == equal bidir weighting */
   3569. +
   3570. +    /* Tasks for this kernel:
   3571. +     * 1. Select least cost mode (intra, ref0, ref1)
   3572. +     *    list_used 0, 1, 2, or 3.  if B frame, do not allow intra
   3573. +     * 2. if B frame, try bidir predictions.
   3574. +     * 3. lowres_costs[i_mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT);
   3575. +     */
   3576. +    size_t gdims[2] = { h->mb.i_mb_width, h->mb.i_mb_height }; /* default: one work-item per MB */
   3577. +    size_t ldim_bidir[2];
   3578. +    size_t *ldims = NULL; /* NULL local size lets the driver choose (non-B path) */
   3579. +    int cost_local_size = 4; /* placeholder sizes; enlarged below when the BIDIR path needs local memory */
   3580. +    int satd_local_size = 4;
   3581. +    if( b < p1 )
   3582. +    {
   3583. +        /* For B frames, use 4 threads per MB for BIDIR checks */
   3584. +        ldims = ldim_bidir;
   3585. +        gdims[0] <<= 2; /* 4 work-items per MB in the x dimension */
   3586. +        x264_optimal_launch_dims( gdims, ldims, h->opencl.mode_select_kernel, h->opencl.device );
   3587. +        int mb_per_group = (ldims[0] >> 2) * ldims[1];
   3588. +        cost_local_size = 4 * mb_per_group * sizeof(int16_t);
   3589. +        satd_local_size = 16 * mb_per_group * sizeof(uint32_t);
   3590. +    }
   3591. +
   3592. +    cl_uint arg = 0; /* argument order below must match the kernel signature in common/opencl */
   3593. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
   3594. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref0->opencl.luma_hpel );
   3595. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.luma_hpel );
   3596. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
   3597. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
   3598. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.lowres_mvs0 );
   3599. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
   3600. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
   3601. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
   3602. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
   3603. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
   3604. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, cost_local_size, NULL ); /* NULL data: local (scratch) memory allocation */
   3605. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, satd_local_size, NULL );
   3606. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
   3607. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &bipred_weight );
   3608. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &dist_scale_factor );
   3609. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &b );
   3610. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p0 );
   3611. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p1 );
   3612. +    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &lambda );
   3613. +    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.mode_select_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
   3614. +
   3615. +    /* Sum costs across rows, atomicAdd down frame */
   3616. +    size_t gdim[2] = { 256, h->mb.i_mb_height }; /* one 256-wide work-group per MB row */
   3617. +    size_t ldim[2] = { 256, 1 };
   3618. +
   3619. +    arg = 0;
   3620. +    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
   3621. +    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
   3622. +    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
   3623. +    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
   3624. +    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
   3625. +    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->param.i_bframe_bias );
   3626. +    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &b );
   3627. +    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p0 );
   3628. +    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p1 );
   3629. +    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_inter_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
   3630. +
   3631. +    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 ) /* up to 4 more copy slots are consumed below */
   3632. +        x264_opencl_flush( h );
   3633. +
   3634. +    int size =  h->mb.i_mb_count * sizeof(int16_t);
   3635. +    char *locked = x264_opencl_alloc_locked( h, size );
   3636. +    h->opencl.copies[h->opencl.num_copies].src = locked; /* per-MB lowres costs back to the CPU-side array */
   3637. +    h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[b - p0][p1 - b];
   3638. +    h->opencl.copies[h->opencl.num_copies].bytes = size;
   3639. +    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.lowres_costs[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL ); /* CL_FALSE: non-blocking; valid after flush */
   3640. +    h->opencl.num_copies++;
   3641. +
   3642. +    size =  h->mb.i_mb_height * sizeof(int);
   3643. +    locked = x264_opencl_alloc_locked( h, size );
   3644. +    h->opencl.copies[h->opencl.num_copies].src = locked; /* per-row SATD sums */
   3645. +    h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[b - p0][p1 - b];
   3646. +    h->opencl.copies[h->opencl.num_copies].bytes = size;
   3647. +    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
   3648. +    h->opencl.num_copies++;
   3649. +
   3650. +    size =  4 * sizeof(int); /* frame_stats holds 4 ints; [0..2] are unpacked below */
   3651. +    locked = x264_opencl_alloc_locked( h, size );
   3652. +    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
   3653. +    h->opencl.last_buf = !h->opencl.last_buf; /* ping-pong between the two staging buffer sets */
   3654. +
   3655. +    h->opencl.copies[h->opencl.num_copies].src = locked; /* frame_stats[0]: summed frame cost */
   3656. +    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[b - p0][p1 - b];
   3657. +    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
   3658. +    h->opencl.num_copies++;
   3659. +    h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int); /* frame_stats[1]: AQ-weighted cost */
   3660. +    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[b - p0][p1 - b];
   3661. +    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
   3662. +    h->opencl.num_copies++;
   3663. +
   3664. +    if( b == p1 ) // P frames only
   3665. +    {
   3666. +        h->opencl.copies[h->opencl.num_copies].src = locked + 2 * sizeof(int); /* frame_stats[2]: intra MB count */
   3667. +        h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_intra_mbs[b - p0];
   3668. +        h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
   3669. +        h->opencl.num_copies++;
   3670. +    }
   3671. +    return 0;
   3672. +}
  3673. +
   3674. +static int x264_opencl_precalculate_frame_cost( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int b ) /* returns 1 if GPU work was queued, 0 if the cost was already computed */
   3675. +{
   3676. +    if( (frames[b]->i_cost_est[b-p0][p1-b] >= 0) || (b == p0 && b == p1) ) /* cached result, or intra-only (no references) */
   3677. +        return 0;
   3678. +    else
   3679. +    {
   3680. +        int do_search[2]; /* [0]=list0 (past), [1]=list1 (future) */
   3681. +        int dist_scale_factor = 128; /* 1/256 fixed point; 128 == midpoint */
   3682. +        const x264_weight_t *w = x264_weight_none;
   3683. +
   3684. +        // avoid duplicating work
   3685. +        frames[b]->i_cost_est[b-p0][p1-b] = 0;
   3686. +
   3687. +        do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF; /* 0x7FFF: presumably the 'not yet searched' sentinel used by the CPU lookahead — verify against slicetype.c */
   3688. +        do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
   3689. +        if( do_search[0] )
   3690. +        {
   3691. +            if( h->param.analyse.i_weighted_pred && b == p1 ) /* weightp only applies to P frames */
   3692. +            {
   3693. +                x264_emms(); /* clear MMX state before float math in weights analysis */
   3694. +                x264_weights_analyse( h, frames[b], frames[p0], 1 );
   3695. +                w = frames[b]->weight[0];
   3696. +            }
   3697. +            frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0; /* clear sentinel so this search is not repeated */
   3698. +        }
   3699. +        if( do_search[1] )
   3700. +            frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
   3701. +        if( b == p1 )
   3702. +            frames[b]->i_intra_mbs[b-p0] = 0;
   3703. +        if( p1 != p0 )
   3704. +            dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0); /* rounded (b-p0)/(p1-p0) in 1/256 units */
   3705. +
   3706. +        frames[b]->i_cost_est[b-p0][p1-b] = 0;
   3707. +        frames[b]->i_cost_est_aq[b-p0][p1-b] = 0;
   3708. +
   3709. +        x264_opencl_lowres_init( h, frames[b], a->i_lambda ); /* presumably uploads/initializes fenc's lowres data on the GPU — see slicetype-cl.c */
   3710. +
   3711. +        if( do_search[0] )
   3712. +        {
   3713. +            x264_opencl_lowres_init( h, frames[p0], a->i_lambda ); /* references must also be resident before motion search */
   3714. +            x264_opencl_motionsearch( h, frames, b, p0, 0, a->i_lambda, w );
   3715. +        }
   3716. +        if( do_search[1] )
   3717. +        {
   3718. +            x264_opencl_lowres_init( h, frames[p1], a->i_lambda );
   3719. +            x264_opencl_motionsearch( h, frames, b, p1, 1, a->i_lambda, w );
   3720. +        }
   3721. +        x264_opencl_finalize_cost( h, a->i_lambda, frames, p0, p1, b, dist_scale_factor ); /* queues kernels + async readbacks; results valid after x264_opencl_flush() */
   3722. +        return 1;
   3723. +    }
   3724. +}
  3725. diff -r 087c10a1fec7 -r 9a682cb0c74f encoder/slicetype.c
  3726. --- a/encoder/slicetype.c       Thu Mar 29 14:14:07 2012 -0700
  3727. +++ b/encoder/slicetype.c       Tue May 15 18:05:00 2012 -0500
  3728. @@ -36,6 +36,15 @@
  3729.                                        x264_frame_t **frames, int p0, int p1, int b,
  3730.                                        int b_intra_penalty );
  3731.  
  3732. +static void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead );
  3733. +
  3734. +#if HAVE_OPENCL
  3735. +#ifdef _WIN32
  3736. +#include "windows.h"
  3737. +#endif
  3738. +#include "slicetype-cl.c"
  3739. +#endif
  3740. +
  3741.  static void x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a )
  3742.  {
  3743.      a->i_qp = X264_LOOKAHEAD_QP;
  3744. @@ -748,38 +757,62 @@
  3745.          frames[b]->i_cost_est[b-p0][p1-b] = 0;
  3746.          frames[b]->i_cost_est_aq[b-p0][p1-b] = 0;
  3747.  
  3748. -        /* Lowres lookahead goes backwards because the MVs are used as predictors in the main encode.
  3749. -         * This considerably improves MV prediction overall. */
  3750. +#if HAVE_OPENCL
  3751. +        if( h->param.b_opencl )
  3752. +        {
  3753. +            x264_opencl_lowres_init(h, frames[b], a->i_lambda );
  3754. +            if( do_search[0] )
  3755. +            {
  3756. +                x264_opencl_lowres_init( h, frames[p0], a->i_lambda );
  3757. +                x264_opencl_motionsearch( h, frames, b, p0, 0, a->i_lambda, w );
  3758. +            }
  3759. +            if( do_search[1] )
  3760. +            {
  3761. +                x264_opencl_lowres_init( h, frames[p1], a->i_lambda );
  3762. +                x264_opencl_motionsearch( h, frames, b, p1, 1, a->i_lambda, w );
  3763. +            }
  3764. +            if( b != p0 )
  3765. +                x264_opencl_finalize_cost( h, a->i_lambda, frames, p0, p1, b, dist_scale_factor );
  3766. +            x264_opencl_flush( h );
  3767.  
  3768. -        /* The edge mbs seem to reduce the predictive quality of the
  3769. -         * whole frame's score, but are needed for a spatial distribution. */
  3770. -        if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size ||
  3771. -            h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2 )
  3772. -        {
  3773. -            for( h->mb.i_mb_y = h->mb.i_mb_height - 1; h->mb.i_mb_y >= 0; h->mb.i_mb_y-- )
  3774. -            {
  3775. -                row_satd[h->mb.i_mb_y] = 0;
  3776. -                if( !frames[b]->b_intra_calculated )
  3777. -                    row_satd_intra[h->mb.i_mb_y] = 0;
  3778. -                for( h->mb.i_mb_x = h->mb.i_mb_width - 1; h->mb.i_mb_x >= 0; h->mb.i_mb_x-- )
  3779. -                    x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
  3780. -            }
  3781. +            i_score = frames[b]->i_cost_est[b-p0][p1-b];
  3782.          }
  3783.          else
  3784. +#endif
  3785.          {
  3786. -            for( h->mb.i_mb_y = h->mb.i_mb_height - 2; h->mb.i_mb_y >= 1; h->mb.i_mb_y-- )
  3787. -                for( h->mb.i_mb_x = h->mb.i_mb_width - 2; h->mb.i_mb_x >= 1; h->mb.i_mb_x-- )
  3788. -                    x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
  3789. +            /* Lowres lookahead goes backwards because the MVs are used as predictors in
  3790. +             * the main encode. This considerably improves MV prediction overall. */
  3791. +
  3792. +            /* The edge mbs seem to reduce the predictive quality of the
  3793. +             * whole frame's score, but are needed for a spatial distribution. */
  3794. +            if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size ||
  3795. +                    h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2 )
  3796. +            {
  3797. +                for( h->mb.i_mb_y = h->mb.i_mb_height - 1; h->mb.i_mb_y >= 0; h->mb.i_mb_y-- )
  3798. +                {
  3799. +                    row_satd[h->mb.i_mb_y] = 0;
  3800. +                    if( !frames[b]->b_intra_calculated )
  3801. +                        row_satd_intra[h->mb.i_mb_y] = 0;
  3802. +                    for( h->mb.i_mb_x = h->mb.i_mb_width - 1; h->mb.i_mb_x >= 0; h->mb.i_mb_x-- )
  3803. +                        x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
  3804. +                }
  3805. +            }
  3806. +            else
  3807. +            {
  3808. +                for( h->mb.i_mb_y = h->mb.i_mb_height - 2; h->mb.i_mb_y >= 1; h->mb.i_mb_y-- )
  3809. +                    for( h->mb.i_mb_x = h->mb.i_mb_width - 2; h->mb.i_mb_x >= 1; h->mb.i_mb_x-- )
  3810. +                        x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
  3811. +            }
  3812. +
  3813. +            i_score = frames[b]->i_cost_est[b-p0][p1-b];
  3814. +            if( b != p1 )
  3815. +                i_score = (uint64_t)i_score * 100 / (120 + h->param.i_bframe_bias);
  3816. +            else
  3817. +                frames[b]->b_intra_calculated = 1;
  3818. +
  3819. +            frames[b]->i_cost_est[b-p0][p1-b] = i_score;
  3820. +            x264_emms();
  3821.          }
  3822. -
  3823. -        i_score = frames[b]->i_cost_est[b-p0][p1-b];
  3824. -        if( b != p1 )
  3825. -            i_score = (uint64_t)i_score * 100 / (120 + h->param.i_bframe_bias);
  3826. -        else
  3827. -            frames[b]->b_intra_calculated = 1;
  3828. -
  3829. -        frames[b]->i_cost_est[b-p0][p1-b] = i_score;
  3830. -        x264_emms();
  3831.      }
  3832.  
  3833.      if( b_intra_penalty )
  3834. @@ -1344,6 +1377,61 @@
  3835.          return;
  3836.      }
  3837.  
  3838. +#if HAVE_OPENCL
  3839. +#ifdef _WIN32
  3840. +    int my_pri = THREAD_PRIORITY_NORMAL, ocl_pri = THREAD_PRIORITY_NORMAL;
  3841. +#endif
  3842. +
  3843. +    if( h->param.b_opencl )
  3844. +    {
  3845. +#ifdef _WIN32
  3846. +#define CL_QUEUE_THREAD_HANDLE_AMD 0x403E
  3847. +        HANDLE id = GetCurrentThread();
  3848. +        my_pri = GetThreadPriority( id );
  3849. +        SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
  3850. +        cl_int status = clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
  3851. +        if( status == CL_SUCCESS )
  3852. +        {
  3853. +            ocl_pri = GetThreadPriority( id );
  3854. +            SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
  3855. +        }
  3856. +#endif
  3857. +
  3858. +        int b_work_done = 0;
  3859. +
  3860. +        /* precalculate intra and I frames */
  3861. +        for( int i = 0; i <= num_frames; i++ )
  3862. +        {
  3863. +            b_work_done |= x264_opencl_lowres_init( h, frames[i], a.i_lambda );
  3864. +        }
  3865. +        if( b_work_done && h->param.analyse.i_weighted_pred )
  3866. +        {
  3867. +            /* weightp estimation requires intra costs to be on the CPU */
  3868. +            x264_opencl_flush( h );
  3869. +        }
  3870. +
  3871. +        if( h->param.i_bframe && h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS )
  3872. +        {
  3873. +            /* Precalculate motion searches in both directions */
  3874. +            for( int i = 0; i < num_frames; i++ )
  3875. +            {
  3876. +                for (int b = 0; b < h->param.i_bframe; b++)
  3877. +                {
  3878. +                    if (i <= b)
  3879. +                    {
  3880. +                        break;
  3881. +                    }
  3882. +
  3883. +                    b_work_done |= x264_opencl_precalculate_frame_cost( h, &a, frames, i, i-b-1, i-b-1 );
  3884. +                    b_work_done |= x264_opencl_precalculate_frame_cost( h, &a, frames, i-b-1, i, i );
  3885. +                }
  3886. +            }
  3887. +        }
  3888. +        if( b_work_done )
  3889. +            x264_opencl_flush( h );
  3890. +    }
  3891. +#endif
  3892. +
  3893.      if( h->param.i_bframe )
  3894.      {
  3895.          if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS )
  3896. @@ -1377,6 +1465,18 @@
  3897.                      continue;
  3898.                  }
  3899.  
  3900. +#if HAVE_OPENCL
  3901. +                if( h->param.b_opencl )
  3902. +                {
  3903. +                    int b_work_done = 0;
  3904. +                    b_work_done |= x264_opencl_precalculate_frame_cost(h, &a, frames, i+0, i+2, i+1 );
  3905. +                    b_work_done |= x264_opencl_precalculate_frame_cost(h, &a, frames, i+0, i+1, i+1 );
  3906. +                    b_work_done |= x264_opencl_precalculate_frame_cost(h, &a, frames, i+1, i+2, i+2 );
  3907. +                    if( b_work_done )
  3908. +                        x264_opencl_flush( h );
  3909. +                }
  3910. +#endif
  3911. +
  3912.                  cost1b1 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+2, i+1, 0 );
  3913.                  cost1p0 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+1, i+1, 0 );
  3914.                  cost2p0 = x264_slicetype_frame_cost( h, &a, frames, i+1, i+2, i+2, 0 );
  3915. @@ -1459,6 +1559,19 @@
  3916.      /* Restore frametypes for all frames that haven't actually been decided yet. */
  3917.      for( int j = reset_start; j <= num_frames; j++ )
  3918.          frames[j]->i_type = X264_TYPE_AUTO;
  3919. +
  3920. +#ifdef _WIN32
  3921. +#if HAVE_OPENCL
  3922. +    if( h->param.b_opencl )
  3923. +    {
  3924. +        HANDLE id = GetCurrentThread();
  3925. +        SetThreadPriority( id, my_pri );
  3926. +        cl_int status = clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
  3927. +        if( status == CL_SUCCESS )
  3928. +            SetThreadPriority( id, ocl_pri );
  3929. +    }
  3930. +#endif
  3931. +#endif
  3932.  }
  3933.  
  3934.  void x264_slicetype_decide( x264_t *h )
  3935. diff -r 087c10a1fec7 -r 9a682cb0c74f x264.c
  3936. --- a/x264.c    Thu Mar 29 14:14:07 2012 -0700
  3937. +++ b/x264.c    Tue May 15 18:05:00 2012 -0500
  3938. @@ -805,6 +805,8 @@
  3939.          "                                  as opposed to letting them select different algorithms\n" );
  3940.      H2( "      --asm <integer>         Override CPU detection\n" );
  3941.      H2( "      --no-asm                Disable all CPU optimizations\n" );
  3942. +    H2( "      --no-opencl             Disable use of OpenCL\n" );
  3943. +    H2( "      --clbin-file <string>   Specify path of compiled OpenCL kernel cache\n" );
  3944.      H2( "      --visualize             Show MB types overlayed on the encoded video\n" );
  3945.      H2( "      --dump-yuv <string>     Save reconstructed frames\n" );
  3946.      H2( "      --sps-id <integer>      Set SPS and PPS id numbers [%d]\n", defaults->i_sps_id );
  3947. @@ -909,6 +911,8 @@
  3948.      { "ref",         required_argument, NULL, 'r' },
  3949.      { "asm",         required_argument, NULL, 0 },
  3950.      { "no-asm",            no_argument, NULL, 0 },
  3951. +    { "no-opencl",         no_argument, NULL, 0 },
  3952. +    { "clbin-file",  required_argument, NULL, 0 },
  3953.      { "sar",         required_argument, NULL, 0 },
  3954.      { "fps",         required_argument, NULL, OPT_FPS },
  3955.      { "frames",      required_argument, NULL, OPT_FRAMES },
  3956. diff -r 087c10a1fec7 -r 9a682cb0c74f x264.h
  3957. --- a/x264.h    Thu Mar 29 14:14:07 2012 -0700
  3958. +++ b/x264.h    Tue May 15 18:05:00 2012 -0500
  3959. @@ -462,6 +462,9 @@
  3960.  
  3961.      int b_fake_interlaced;
  3962.  
  3963. +    int b_opencl;            /* use OpenCL when available */
  3964. +    char *psz_clbin_file;    /* compiled OpenCL kernel cache file */
  3965. +
  3966.      /* Slicing parameters */
  3967.      int i_slice_max_size;    /* Max size per slice in bytes; includes estimated NAL overhead. */
  3968.      int i_slice_max_mbs;     /* Max number of MBs per slice; overrides i_slice_count. */
RAW Paste Data
Top