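/*
 * H.264 luma deblocking (loop) filter for Xbox 360 VMX128, written against
 * the xtl.h intrinsics and following the structure of FFmpeg's AltiVec
 * implementation in libavcodec.
 */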
#include <xtl.h>

#include "libavcodec/dsputil.h"
#include "libavcodec/h264data.h"
#include "libavcodec/h264dsp.h"

/*
 * Inline helpers
 */

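/**
 * Store the 64 bytes held in r0..r3 as 16 rows of 4 bytes each, dst_stride
 * apart: the vectors are spilled to an aligned scratch buffer and then copied
 * back out four bytes per row with scalar stores (hence the FIXME below).
 */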
static inline void write16x4(uint8_t *dst, int dst_stride,
                             register __vector4 r0, register __vector4 r1,
                             register __vector4 r2, register __vector4 r3) {
    DECLARE_ALIGNED(16, unsigned char, result)[64];
    uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
    int int_dst_stride = dst_stride/4;

    __stvx(r0, result,  0);
    __stvx(r1, result, 16);
    __stvx(r2, result, 32);
    __stvx(r3, result, 48);
    /* FIXME: there has to be a better way!!!! */
    *dst_int = *src_int;
    *(dst_int +    int_dst_stride) = *(src_int +  1);
    *(dst_int +  2*int_dst_stride) = *(src_int +  2);
    *(dst_int +  3*int_dst_stride) = *(src_int +  3);
    *(dst_int +  4*int_dst_stride) = *(src_int +  4);
    *(dst_int +  5*int_dst_stride) = *(src_int +  5);
    *(dst_int +  6*int_dst_stride) = *(src_int +  6);
    *(dst_int +  7*int_dst_stride) = *(src_int +  7);
    *(dst_int +  8*int_dst_stride) = *(src_int +  8);
    *(dst_int +  9*int_dst_stride) = *(src_int +  9);
    *(dst_int + 10*int_dst_stride) = *(src_int + 10);
    *(dst_int + 11*int_dst_stride) = *(src_int + 11);
    *(dst_int + 12*int_dst_stride) = *(src_int + 12);
    *(dst_int + 13*int_dst_stride) = *(src_int + 13);
    *(dst_int + 14*int_dst_stride) = *(src_int + 14);
    *(dst_int + 15*int_dst_stride) = *(src_int + 15);
}

/** \brief Loads the unaligned vector at \a src + \a offset and returns it.
 *  __lvx ignores the low four address bits, so the two aligned vectors
 *  covering the wanted bytes are fetched and re-aligned with the permute
 *  vector that lvsl derives from the address. */
static inline __vector4 unaligned_load(int offset, uint8_t *src)
{
    register __vector4 first  = __lvx(src, offset);
    register __vector4 second = __lvx(src, offset + 15);
    register __vector4 mask   = __lvsl(src, offset);
    return __vperm(first, second, mask);
}

/** \brief Reads 16 rows from \a src and transposes them, returning the first
 *  six columns (one 16-byte vector each) through \a out8 .. \a out13.
 *  \todo FIXME: see if we can't spare some vec_lvsl() by factorizing them
 *  out of unaligned_load() */
static inline void readAndTranspose16x6(
    uint8_t *src,
    int src_stride,
    __vector4 *out8,
    __vector4 *out9,
    __vector4 *out10,
    __vector4 *out11,
    __vector4 *out12,
    __vector4 *out13
)
{
    register __vector4 r0 = unaligned_load(0,             src);
    register __vector4 r1 = unaligned_load(   src_stride, src);
    register __vector4 r2 = unaligned_load( 2*src_stride, src);
    register __vector4 r3 = unaligned_load( 3*src_stride, src);
    register __vector4 r4 = unaligned_load( 4*src_stride, src);
    register __vector4 r5 = unaligned_load( 5*src_stride, src);
    register __vector4 r6 = unaligned_load( 6*src_stride, src);
    register __vector4 r7 = unaligned_load( 7*src_stride, src);
    register __vector4 r14 = unaligned_load(14*src_stride, src);
    register __vector4 r15 = unaligned_load(15*src_stride, src);

    register __vector4 r8  = unaligned_load( 8*src_stride, src);
    register __vector4 r9  = unaligned_load( 9*src_stride, src);
    register __vector4 r10 = unaligned_load(10*src_stride, src);
    register __vector4 r11 = unaligned_load(11*src_stride, src);
    register __vector4 r12 = unaligned_load(12*src_stride, src);
    register __vector4 r13 = unaligned_load(13*src_stride, src);

    /* Merge first pairs */
    r0 = __vmrghb(r0, r8);   /* 0, 8 */
    r1 = __vmrghb(r1, r9);   /* 1, 9 */
    r2 = __vmrghb(r2, r10);  /* 2,10 */
    r3 = __vmrghb(r3, r11);  /* 3,11 */
    r4 = __vmrghb(r4, r12);  /* 4,12 */
    r5 = __vmrghb(r5, r13);  /* 5,13 */
    r6 = __vmrghb(r6, r14);  /* 6,14 */
    r7 = __vmrghb(r7, r15);  /* 7,15 */

    /* Merge second pairs */
    r8  = __vmrghb(r0, r4);  /* 0,4, 8,12 set 0 */
    r9  = __vmrglb(r0, r4);  /* 0,4, 8,12 set 1 */
    r10 = __vmrghb(r1, r5);  /* 1,5, 9,13 set 0 */
    r11 = __vmrglb(r1, r5);  /* 1,5, 9,13 set 1 */
    r12 = __vmrghb(r2, r6);  /* 2,6,10,14 set 0 */
    r13 = __vmrglb(r2, r6);  /* 2,6,10,14 set 1 */
    r14 = __vmrghb(r3, r7);  /* 3,7,11,15 set 0 */
    r15 = __vmrglb(r3, r7);  /* 3,7,11,15 set 1 */

    /* Third merge */
    r0 = __vmrghb(r8, r12);  /* 0,2,4,6,8,10,12,14 set 0 */
    r1 = __vmrglb(r8, r12);  /* 0,2,4,6,8,10,12,14 set 1 */
    r2 = __vmrghb(r9, r13);  /* 0,2,4,6,8,10,12,14 set 2 */
    r4 = __vmrghb(r10, r14); /* 1,3,5,7,9,11,13,15 set 0 */
    r5 = __vmrglb(r10, r14); /* 1,3,5,7,9,11,13,15 set 1 */
    r6 = __vmrghb(r11, r15); /* 1,3,5,7,9,11,13,15 set 2 */
    /* Don't need to compute 3 and 7 */

    /* Final merge */
    r8  = __vmrghb(r0, r4);  /* all set 0 */
    r9  = __vmrglb(r0, r4);  /* all set 1 */
    r10 = __vmrghb(r1, r5);  /* all set 2 */
    r11 = __vmrglb(r1, r5);  /* all set 3 */
    r12 = __vmrghb(r2, r6);  /* all set 4 */
    r13 = __vmrglb(r2, r6);  /* all set 5 */
    /* Don't need to compute 14 and 15 */

    /* Hand the six transposed rows back to the caller. */
    *out8  = r8;
    *out9  = r9;
    *out10 = r10;
    *out11 = r11;
    *out12 = r12;
    *out13 = r13;
}

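/**
 * In-place 4x16 byte transpose: *r0..*r3 enter holding four 16-byte rows and
 * leave holding the same block in 16x4 (row-major) order, ready for
 * write16x4(). Done with two rounds of byte merges.
 */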
static inline void transpose4x16(
    __vector4 *r0,
    __vector4 *r1,
    __vector4 *r2,
    __vector4 *r3
){
    register __vector4 r4;
    register __vector4 r5;
    register __vector4 r6;
    register __vector4 r7;

    r4 = __vmrghb(*r0, *r2);  /* 0, 2 set 0 */
    r5 = __vmrglb(*r0, *r2);  /* 0, 2 set 1 */
    r6 = __vmrghb(*r1, *r3);  /* 1, 3 set 0 */
    r7 = __vmrglb(*r1, *r3);  /* 1, 3 set 1 */

    *r0 = __vmrghb(r4, r6);   /* all set 0 */
    *r1 = __vmrglb(r4, r6);   /* all set 1 */
    *r2 = __vmrghb(r5, r7);   /* all set 2 */
    *r3 = __vmrglb(r5, r7);   /* all set 3 */
}

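/**
 * p0/q0 update of the H.264 deblocking filter:
 *   delta = clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc0, tc0)
 * evaluated branchlessly on unsigned bytes: the positive and negative parts
 * of delta are built with averages and saturating adds/subs around a bias of
 * 160, clamped against tc0masked, then added to *p0 and subtracted from *q0.
 */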
static inline void h264_deblock_p0_q0(
    __vector4 *p0,
    register __vector4 p1,
    __vector4 *q0,
    register __vector4 q1,
    register __vector4 tc0masked
)
{
    const __vector4 A0v = __vslb(__vspltisb(10), __vspltisb(4)); /* 10 << 4 = 160 */

    register __vector4 pq0bit = __vxor(*p0, *q0);
    register __vector4 q1minus;
    register __vector4 p0minus;
    register __vector4 stage1;
    register __vector4 stage2;
    register __vector4 vec160;
    register __vector4 delta;
    register __vector4 deltaneg;

    q1minus = __vnor(q1, q1);                /* 255 - q1 */
    stage1  = __vavgub(p1, q1minus);         /* (p1 - q1 + 256)>>1 */
    stage2  = __vsrb(stage1, __vspltisb(1)); /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */
    p0minus = __vnor(*p0, *p0);              /* 255 - p0 */
    stage1  = __vavgub(*q0, p0minus);        /* (q0 - p0 + 256)>>1 */
    pq0bit  = __vand(pq0bit, __vspltisb(1));
    stage2  = __vavgub(stage2, pq0bit);      /* 32 + ((q0 - p0)&1 + (p1 - q1) >> 2 + 1) >> 1 */
    stage2  = __vaddubs(stage2, stage1);     /* 160 + ((p0 - q0) + (p1 - q1) >> 2 + 1) >> 1 */
    vec160  = __lvx(&A0v, 0);
    deltaneg = __vsububs(vec160, stage2);    /* -d */
    delta    = __vsububs(stage2, vec160);    /* d */
    deltaneg = __vminub(tc0masked, deltaneg);
    delta    = __vminub(tc0masked, delta);
    *p0 = __vsububs(*p0, deltaneg);
    *q0 = __vsububs(*q0, delta);
    *p0 = __vaddubs(*p0, delta);
    *q0 = __vaddubs(*q0, deltaneg);
}

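/*
 * __vavgub(a, b) computes (a + b + 1) >> 1, i.e. it rounds up; the
 * (p2 ^ avg(p0,q0)) & 1 term below subtracts that rounding bit again so the
 * result matches the expression (p2 + ((p0 + q0 + 1) >> 1)) >> 1 exactly.
 */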
// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
static inline __vector4 h264_deblock_q1(
    register __vector4 p0,
    register __vector4 p1,
    register __vector4 p2,
    register __vector4 q0,
    register __vector4 tc0
)
{
    register __vector4 average = __vavgub(p0, q0);
    register __vector4 temp;
    register __vector4 unclipped;
    register __vector4 ones;
    register __vector4 max;
    register __vector4 min;
    register __vector4 newp1;

    temp      = __vxor(average, p2);
    average   = __vavgub(average, p2);    /* avg(p2, avg(p0, q0)) */
    ones      = __vspltisb(1);
    temp      = __vand(temp, ones);       /* (p2 ^ avg(p0, q0)) & 1 */
    unclipped = __vsububs(average, temp); /* (p2 + ((p0 + q0 + 1) >> 1)) >> 1 */
    max       = __vaddubs(p1, tc0);
    min       = __vsububs(p1, tc0);
    newp1     = __vmaxub(min, unclipped);
    newp1     = __vminub(max, newp1);
    return newp1;
}

// out: o = |x-y| < a
static inline __vector4 diff_lt_altivec (
    __vector4 x,
    __vector4 y,
    __vector4 a
)
{
    __vector4 diff    = __vsububs(x, y);
    __vector4 diffneg = __vsububs(y, x);
    __vector4 o = __vor(diff, diffneg); /* |x-y| */
    o = __vcmpgtub(a, o);               /* a > |x-y| */
    return o;
}

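/**
 * Per-pixel filter condition of the H.264 deblocker:
 *   |p0 - q0| < alpha  &&  |p1 - p0| < beta  &&  |q1 - q0| < beta
 * returned as an all-ones/all-zeros byte mask.
 */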
static inline __vector4 h264_deblock_mask (
    __vector4 p0,
    __vector4 p1,
    __vector4 q0,
    __vector4 q1,
    __vector4 alpha,
    __vector4 beta
) {
    __vector4 mask;
    __vector4 tempmask;

    mask = diff_lt_altivec(p0, q0, alpha);
    tempmask = diff_lt_altivec(p1, p0, beta);
    mask = __vand(mask, tempmask);
    tempmask = diff_lt_altivec(q1, q0, beta);
    mask = __vand(mask, tempmask);

    return mask;
}

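/**
 * Normal (bS < 4) H.264 luma deblocking across one 16-pixel edge.
 * alpha/beta gate the whole edge, tc0[] holds one clipping value per group of
 * four pixels; each tc0 byte is replicated to its four lanes, p1/q1 get the
 * optional secondary filter, and p0/q0 the clipped delta. The filtered
 * p1, p0, q0 and q1 are returned through the pointer arguments.
 */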
static inline void h264_loop_filter_luma_altivec(
    register __vector4 p2,
    __vector4 *p1,
    __vector4 *p0,
    __vector4 *q0,
    __vector4 *q1,
    register __vector4 q2,
    int alpha,
    int beta,
    int8_t *tc0)
{
    DECLARE_ALIGNED(16, unsigned char, temp)[16];
    __vector4 alphavec, betavec, mask, p1mask, q1mask, tc0vec, finaltc0, tc0masked, newp1, newq1;

    temp[0] = alpha;
    temp[1] = beta;

    alphavec = __lvx(temp, 0);
    betavec  = __vspltb(alphavec, 0x1);
    alphavec = __vspltb(alphavec, 0x0);
    mask = h264_deblock_mask(*p0, *p1, *q0, *q1, alphavec, betavec); /* if in block */

    *((int *)temp) = *((int *)tc0);
    tc0vec = __lvx((signed char*)temp, 0);
    tc0vec = __vmrghb(tc0vec, tc0vec);
    tc0vec = __vmrghb(tc0vec, tc0vec);        /* each tc0[i] now covers its 4 pixels */
    mask = __vand(mask, __vcmpgtsb(tc0vec, __vspltisb(-1))); /* if tc0[i] >= 0 (signed compare) */
    finaltc0 = __vand(tc0vec, mask);

    p1mask = diff_lt_altivec(p2, *p0, betavec);
    p1mask = __vand(p1mask, mask);            /* if ( |p2 - p0| < beta ) */
    tc0masked = __vand(p1mask, tc0vec);
    finaltc0 = __vsububm(finaltc0, p1mask);   /* tc++ */
    newp1 = h264_deblock_q1(*p0, *p1, p2, *q0, tc0masked);

    q1mask = diff_lt_altivec(q2, *q0, betavec);
    q1mask = __vand(q1mask, mask);            /* if ( |q2 - q0| < beta ) */
    tc0masked = __vand(q1mask, tc0vec);
    finaltc0 = __vsububm(finaltc0, q1mask);   /* tc++ */
    newq1 = h264_deblock_q1(*p0, *q1, q2, *q0, tc0masked);

    h264_deblock_p0_q0(p0, *p1, q0, *q1, finaltc0);
    *p1 = newp1;
    *q1 = newq1;
}

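/**
 * "Vertical" luma filter: the six lines p2..q2 live at pix - 3*stride ..
 * pix + 2*stride and are each 16 contiguous bytes, so they load and store
 * directly with __lvx/__stvx. Those intrinsics drop the low four address
 * bits, so pix and stride are expected to keep each line 16-byte aligned.
 */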
static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
    if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0)
    {
        __vector4 p2 = __lvx(pix, -3*stride);
        __vector4 p1 = __lvx(pix, -2*stride);
        __vector4 p0 = __lvx(pix, -1*stride);
        __vector4 q0 = __lvx(pix, 0);
        __vector4 q1 = __lvx(pix, stride);
        __vector4 q2 = __lvx(pix, 2*stride);
        h264_loop_filter_luma_altivec(p2, &p1, &p0, &q0, &q1, q2, alpha, beta, tc0);
        __stvx(p1, pix, -2*stride);
        __stvx(p0, pix, -1*stride);
        __stvx(q0, pix, 0);
        __stvx(q1, pix, stride);
    }
}
/** Not finished **/
#if 0
static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {

    __vector4 line0, line1, line2, line3, line4, line5;
    if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
        return;
    readAndTranspose16x6(pix-3, stride, &line0, &line1, &line2, &line3, &line4, &line5);
    h264_loop_filter_luma_altivec(line0, &line1, &line2, &line3, &line4, line5, alpha, beta, tc0);
    transpose4x16(&line1, &line2, &line3, &line4);
    write16x4(pix-2, stride, line1, line2, line3, line4);
}
#endif

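/**
 * Install the VMX vertical luma filter; the horizontal hook is left commented
 * out until the transposed path above is finished.
 */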
void ff_h264dsp_init_luma_vmx(H264DSPContext *c)
{
    c->h264_v_loop_filter_luma = h264_v_loop_filter_luma_altivec;
    //c->h264_h_loop_filter_luma = h264_h_loop_filter_luma_altivec;
}