;*****************************************************************************
;* SSE2-optimized HEVC deblocking code
;*****************************************************************************
;* Copyright (C) 2013 VTT
;*
;* Authors: Seppo Tomperi <[email protected]>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_pixel_max: times 8 dw ((1 << 10)-1)
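; 0x3FF in every word: the maximum sample value for 10-bit content, used by CLIPW in the 10-bit paths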

SECTION .text
INIT_XMM sse2

; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
[base], [base+stride], [base+stride*2], [base3], \
[base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]

; in: 8 rows of 4 bytes in %1..%8
; out: 4 rows of 8 words in m0..m3
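; (loads eight 4-pixel rows, interleaves them into four transposed rows and zero-extends the bytes to words)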
%macro TRANSPOSE4x8B_LOAD 8
movd m0, %1
movd m2, %2
movd m1, %3
movd m3, %4

punpcklbw m0, m2
punpcklbw m1, m3
punpcklwd m0, m1

movd m4, %5
movd m6, %6
movd m5, %7
movd m7, %8

punpcklbw m4, m6
punpcklbw m5, m7
punpcklwd m4, m5

movdqa m2, m0
punpckldq m0, m4
punpckhdq m2, m4
movdqa m1, m0
movdqa m3, m2

pxor m5, m5
punpcklbw m0, m5
punpckhbw m1, m5
punpcklbw m2, m5
punpckhbw m3, m5
%endmacro

; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8
packuswb m0, m0
packuswb m1, m1
packuswb m2, m2
packuswb m3, m3

punpcklbw m0, m1
punpcklbw m2, m3

movdqa m6, m0

punpcklwd m0, m2
punpckhwd m6, m2

movd %1, m0
pshufd m0, m0, 0x39
movd %2, m0
pshufd m0, m0, 0x39
movd %3, m0
pshufd m0, m0, 0x39
movd %4, m0

movd %5, m6
pshufd m6, m6, 0x39
movd %6, m6
pshufd m6, m6, 0x39
movd %7, m6
pshufd m6, m6, 0x39
movd %8, m6
%endmacro

; in: 8 rows of 4 words in %1..%8
; out: 4 rows of 8 words in m0..m3
%macro TRANSPOSE4x8W_LOAD 8
movq m0, %1
movq m2, %2
movq m1, %3
movq m3, %4

punpcklwd m0, m2
punpcklwd m1, m3
movdqa m2, m0
punpckldq m0, m1
punpckhdq m2, m1

movq m4, %5
movq m6, %6
movq m5, %7
movq m7, %8

punpcklwd m4, m6
punpcklwd m5, m7
movdqa m6, m4
punpckldq m4, m5
punpckhdq m6, m5

movdqa m1, m0
punpcklqdq m0, m4
punpckhqdq m1, m4
movdqa m3, m2
punpcklqdq m2, m6
punpckhqdq m3, m6

%endmacro

; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 words in %1..%8
%macro TRANSPOSE8x4W_STORE 8
pxor m5, m5; zeros reg
CLIPW m0, m5, [pw_pixel_max]
CLIPW m1, m5, [pw_pixel_max]
CLIPW m2, m5, [pw_pixel_max]
CLIPW m3, m5, [pw_pixel_max]

movdqa m4, m0
movdqa m5, m2

punpcklwd m0, m1
punpcklwd m2, m3
movdqa m6, m0
punpckldq m0, m2
punpckhdq m6, m2

movq %1, m0
punpckhqdq m0, m0
movq %2, m0
movq %3, m6
punpckhqdq m6, m6
movq %4, m6

punpckhwd m4, m1
punpckhwd m5, m3
movdqa m6, m4
punpckldq m4, m5
punpckhdq m6, m5

movq %5, m4
punpckhqdq m4, m4
movq %6, m4
movq %7, m6
punpckhqdq m6, m6
movq %8, m6
%endmacro

; in: 8 rows of 8 bytes in %1..%8
; out: 8 rows of 8 words in m0..m7
%macro TRANSPOSE8x8B_LOAD 8
movq m7, %1
movq m2, %2
movq m1, %3
movq m3, %4

punpcklbw m7, m2
punpcklbw m1, m3
movdqa m3, m7
punpcklwd m3, m1
punpckhwd m7, m1

movq m4, %5
movq m6, %6
movq m5, %7
movq m15, %8

punpcklbw m4, m6
punpcklbw m5, m15
movdqa m9, m4
punpcklwd m9, m5
punpckhwd m4, m5

movdqa m1, m3
punpckldq m1, m9; 0, 1
punpckhdq m3, m9; 2, 3

movdqa m5, m7
punpckldq m5, m4; 4, 5
punpckhdq m7, m4; 6, 7

pxor m13, m13

movdqa m0, m1
punpcklbw m0, m13; 0 in 16 bit
punpckhbw m1, m13; 1 in 16 bit

movdqa m2, m3;
punpcklbw m2, m13; 2
punpckhbw m3, m13; 3

movdqa m4, m5;
punpcklbw m4, m13; 4
punpckhbw m5, m13; 5

movdqa m6, m7
punpcklbw m6, m13; 6
punpckhbw m7, m13; 7
%endmacro


; in: 8 rows of 8 words in m0..m7
; out: 8 rows of 8 bytes in %1..%8
%macro TRANSPOSE8x8B_STORE 8
packuswb m0, m0
packuswb m1, m1
packuswb m2, m2
packuswb m3, m3
packuswb m4, m4
packuswb m5, m5
packuswb m6, m6
packuswb m7, m7

punpcklbw m0, m1
punpcklbw m2, m3

movdqa m8, m0
punpcklwd m0, m2
punpckhwd m8, m2

punpcklbw m4, m5
punpcklbw m6, m7

movdqa m9, m4
punpcklwd m4, m6
punpckhwd m9, m6

movdqa m10, m0
punpckldq m0, m4; 0, 1
punpckhdq m10, m4; 2, 3

movdqa m11, m8
punpckldq m11, m9; 4, 5
punpckhdq m8, m9; 6, 7
movq %1, m0
pshufd m0, m0, 0x4E
movq %2, m0
movq %3, m10
pshufd m10, m10, 0x4E
movq %4, m10
movq %5, m11
pshufd m11, m11, 0x4E
movq %6, m11
movq %7, m8
pshufd m8, m8, 0x4E
movq %8, m8
%endmacro

; in: 8 rows of 8 words in m0..m7
; out: 8 rows of 8 words in m0..m7
%macro TRANSPOSE8x8W 0
movdqa m8, m0
movdqa m9, m2
movdqa m10, m4
movdqa m11, m6

punpcklwd m0, m1
punpcklwd m2, m3
punpcklwd m4, m5
punpcklwd m6, m7
punpckhwd m8, m1
punpckhwd m9, m3
punpckhwd m10, m5
punpckhwd m11, m7

movdqa m3, m0
movdqa m1, m4
punpckldq m0, m2
punpckldq m4, m6
punpckhdq m3, m2
punpckhdq m1, m6

movdqa m5, m8
movdqa m7, m10
punpckldq m5, m9
punpckldq m10, m11
punpckhdq m8, m9
punpckhdq m7, m11

movdqa m6, m0
movdqa m2, m3
punpcklqdq m0, m4
punpckhqdq m6, m4
punpcklqdq m2, m1
punpckhqdq m3, m1
movdqa m1, m6

movdqa m4, m5
movdqa m6, m8
punpcklqdq m4, m10
punpckhqdq m5, m10
punpcklqdq m6, m7
punpckhqdq m8, m7
movdqa m7, m8
%endmacro

; in: 8 rows of 8 words in %1..%8
; out: 8 rows of 8 words in m0..m7
%macro TRANSPOSE8x8W_LOAD 8
movdqu m0, %1
movdqu m1, %2
movdqu m2, %3
movdqu m3, %4
movdqu m4, %5
movdqu m5, %6
movdqu m6, %7
movdqu m7, %8
TRANSPOSE8x8W
%endmacro

; in: 8 rows of 8 words in m0..m7
; out: 8 rows of 8 words in %1..%8
%macro TRANSPOSE8x8W_STORE 8
TRANSPOSE8x8W

pxor m8, m8
CLIPW m0, m8, [pw_pixel_max]
CLIPW m1, m8, [pw_pixel_max]
CLIPW m2, m8, [pw_pixel_max]
CLIPW m3, m8, [pw_pixel_max]
CLIPW m4, m8, [pw_pixel_max]
CLIPW m5, m8, [pw_pixel_max]
CLIPW m6, m8, [pw_pixel_max]
CLIPW m7, m8, [pw_pixel_max]

movdqu %1, m0
movdqu %2, m1
movdqu %3, m2
movdqu %4, m3
movdqu %5, m4
movdqu %6, m5
movdqu %7, m6
movdqu %8, m7
%endmacro

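; shifts the two ints at [%1] and [%1+4] left by 2 in place, scaling the beta/tc
; parameters by 1 << (BIT_DEPTH - 8) for the 10-bit functions; clobbers r7 and r8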
%macro SHIFT_LEFT_PARAM 1
mov r7d, [%1]
mov r8d, [%1+4]
shl r7d, 2
shl r8d, 2
mov [%1], r7d
mov [%1+4], r8d
%endmacro

; in: %2 clobbered
; out: %1
; mask in m11
; clobbers m10
%macro MASKED_COPY 2
pand %2, m11 ; and mask
movdqa m10, m11
pandn m10, %1; and -mask
por %2, m10
movdqa %1, %2
%endmacro

; in: %2 clobbered
; out: %1
; mask in %3, will be clobbered
%macro MASKED_COPY2 3
pand %2, %3 ; and mask
pandn %3, %1; and -mask
por %2, %3
movdqa %1, %2
%endmacro

ALIGN 16
; input: p1, p0, q0, q1 in m0..m3 and the two tc values in r2. Output: p0' in m1 and q0' in m2
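; For reference, the scalar formula implemented below (tc is pre-scaled by
; << (BIT_DEPTH - 8) before it is used as the clipping bound):
;   delta0 = av_clip(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc)
;   p0' = p0 + delta0
;   q0' = q0 - delta0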
%macro CHROMA_DEBLOCK_BODY 1
movdqa m4, m2; temp copy of q0
movdqa m5, m0; temp copy of p1
psubw m4, m1; q0 - p0
psubw m5, m3; p1 - q1
psllw m4, 2; << 2
paddw m5, m4;

;tc calculations
movd m6, [r2]; tc0
add r2, 4;
punpcklwd m6, m6
movd m7, [r2]; tc1
punpcklwd m7, m7
shufps m6, m7, 0; tc0, tc1
movdqa m4, m6
pcmpeqw m7, m7; set all bits to 1
pxor m4, m7; flip all bits of first reg
psrlw m7, 15; 1 in every cell
paddw m4, m7; -tc0, -tc1
;end tc calculations

psllw m7, 2; 4 in every cell
paddw m5, m7; +4
psraw m5, 3; >> 3

psllw m4, %1-8; << (BIT_DEPTH - 8)
psllw m6, %1-8; << (BIT_DEPTH - 8)
pmaxsw m5, m4
pminsw m5, m6
paddw m1, m5; p0 + delta0
psubw m2, m5; q0 - delta0
%endmacro

%if ARCH_X86_64
INIT_XMM ssse3
ALIGN 16
; input: p3..q3 in m0..m7, betas in r2, tcs in r3. Output: p2'..q2' in m1..m6; r4 = 1 on bypass
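; Outline of the decision logic below (a summary of the inline comments):
;   dp = |p2 - 2*p1 + p0|, dq = |q2 - 2*q1 + q0| per line, d = dp + dq
;   a 4-line block is filtered when d0 + d3 < beta
;   the strong filter is chosen when additionally (d << 1) < (beta >> 2),
;   |p3 - p0| + |q3 - q0| < (beta >> 3) and |p0 - q0| < tc25 = (5*tc + 1) >> 1
;   otherwise the weak filter is applied (see weakfilter below)
;   r4 is set to 1 (bypass) when nothing is filtered or tc0 + tc1 == 0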
ff_hevc_luma_deblock_body:
movdqa m9, m2
psllw m9, 1; *2
movdqa m10, m1
psubw m10, m9
paddw m10, m3
pabsw m10, m10 ; 0dp0, 0dp3 , 1dp0, 1dp3

movdqa m9, m5
psllw m9, 1; *2
movdqa m11, m6
psubw m11, m9
paddw m11, m4
pabsw m11, m11 ; 0dq0, 0dq3 , 1dq0, 1dq3

;beta calculations
movd m13, [r2]; beta0
mov r11d, [r2];
add r2, 4;
punpcklwd m13, m13
movd m14, [r2]; beta1
mov r12d, [r2];
punpcklwd m14, m14
shufps m13, m14, 0; beta0, beta1
;end beta calculations

movdqa m9, m10
paddw m9, m11; 0d0, 0d3 , 1d0, 1d3

pshufhw m14, m9, 0x0f ;0b00001111; 0d3 0d3 0d0 0d0 in high
pshuflw m14, m14, 0x0f ;0b00001111; 1d3 1d3 1d0 1d0 in low

pshufhw m9, m9, 0xf0 ;0b11110000; 0d0 0d0 0d3 0d3
pshuflw m9, m9, 0xf0 ;0b11110000; 1d0 1d0 1d3 1d3

paddw m14, m9; 0d0+0d3, 1d0+1d3
movdqa m15, m13; beta0, beta1

;compare
pcmpgtw m15, m14
movmskps r13, m15 ;filtering mask 0d0 + 0d3 < beta0 (bit 2 or 3) , 1d0 + 1d3 < beta1 (bit 0 or 1)
cmp r13, 0
je bypasswrite

;weak / strong decision compare to beta_2
movdqa m15, m13; beta0, beta1
psraw m15, 2; beta >> 2
movdqa m8, m9;
psllw m8, 1;
pcmpgtw m15, m8; (d0 << 1) < beta_2, (d3 << 1) < beta_2
movmskps r14, m15;
;end weak / strong decision

; weak filter nd_p/q calculation
pshufd m8, m10, 0x31
psrld m8, 16
paddw m8, m10
movd r7, m8
and r7, 0xffff; 1dp0 + 1dp3
pshufd m8, m8, 0x4E
movd r8, m8
and r8, 0xffff; 0dp0 + 0dp3

pshufd m8, m11, 0x31
psrld m8, 16
paddw m8, m11
movd r9, m8
and r9, 0xffff; 1dq0 + 1dq3
pshufd m8, m8, 0x4E
movd r10, m8
and r10, 0xffff; 0dq0 + 0dq3
; end calc for weak filter

; filtering mask
mov r2, r13
shr r2, 3
movd m15, r2
and r13, 1
movd m11, r13
shufps m11, m15, 0
shl r2, 1
or r13, r2

pcmpeqd m15, m15; set all bits to 1
psrld m15, 31; set to 32bit 1
pcmpeqd m11, m15; filtering mask

;decide between strong and weak filtering
;tc25 calculations
movd m8, [r3]; tc0
mov r2d, [r3];
add r3, 4;
punpcklwd m8, m8
movd m9, [r3]; tc1
add r2d, [r3]; tc0 + tc1
cmp r2d, 0;
je bypasswrite
punpcklwd m9, m9
shufps m8, m9, 0; tc0, tc1
movdqa m9, m8
psllw m8, 2; tc << 2
pavgw m8, m9; tc25 = ((tc * 5 + 1) >> 1)
;end tc25 calculations

;----beta_3 comparison-----
movdqa m12, m0; p3
psubw m12, m3; p3 - p0
pabsw m12, m12; abs(p3 - p0)

movdqa m15, m7; q3
psubw m15, m4; q3 - q0
pabsw m15, m15; abs(q3 - q0)

paddw m12, m15; abs(p3 - p0) + abs(q3 - q0)

pshufhw m12, m12, 0xf0 ;0b11110000;
pshuflw m12, m12, 0xf0 ;0b11110000;

psraw m13, 3; beta >> 3
pcmpgtw m13, m12;
movmskps r2, m13;
and r14, r2; strong mask , beta_2 and beta_3 comparisons
;----beta_3 comparison end-----
;----tc25 comparison---
movdqa m12, m3; p0
psubw m12, m4; p0 - q0
pabsw m12, m12; abs(p0 - q0)

pshufhw m12, m12, 0xf0 ;0b11110000;
pshuflw m12, m12, 0xf0 ;0b11110000;

pcmpgtw m8, m12; tc25 comparisons
movmskps r2, m8;
and r14, r2; strong mask, beta_2, beta_3 and tc25 comparisons
;----tc25 comparison end---
mov r2, r14;
shr r2, 1;
and r14, r2; strong mask, bits 2 and 0

pcmpeqw m13, m13; set all bits to 1
movdqa m14, m9; tc
pxor m14, m13; invert bits
psrlw m13, 15; 1 in every cell
paddw m14, m13; -tc

psllw m9, 1; tc * 2
psllw m14, 1; -tc * 2

and r14, 5; 0b101
mov r2, r14; strong mask
shr r14, 2;
movd m12, r14; store to xmm for mask generation
shl r14, 1
and r2, 1
movd m10, r2; store to xmm for mask generation
or r14, r2; final strong mask, bits 1 and 0
cmp r14, 0;
je weakfilter

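; Strong filter (applied where the strong mask is set), each result clipped to
; +/- 2*tc around the original sample, as spelled out by the comments below:
;   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
;   p1' = (p2 + p1 + p0 + q0 + 2) >> 2
;   p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
;   q0' = (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >> 3
;   q1' = (p0 + q0 + q1 + q2 + 2) >> 2
;   q2' = (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3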
shufps m10, m12, 0

pcmpeqd m12, m12; set all bits to 1
psrld m12, 31; set to 32bit 1
pcmpeqd m10, m12; strong mask

psllw m13, 2; 4 in every cell
pand m11, m10; combine filtering mask and strong mask
movdqa m12, m2; p1
paddw m12, m3; p1 + p0
paddw m12, m4; p1 + p0 + q0
movdqa m10, m12; copy
psllw m12, 1; 2*p1 + 2*p0 + 2*q0
paddw m12, m1; p2 + 2*p1 + 2*p0 + 2*q0
paddw m12, m5; p2 + 2*p1 + 2*p0 + 2*q0 + q1
paddw m12, m13; p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4
psraw m12, 3; ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3)
psubw m12, m3; ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3) - p0
pmaxsw m12, m14
pminsw m12, m9; av_clip( , -2 * tc, 2 * tc)
paddw m12, m3; p0'

movdqa m15, m1; p2
paddw m15, m10; p2 + p1 + p0 + q0
psrlw m13, 1; 2 in every cell
paddw m15, m13; p2 + p1 + p0 + q0 + 2
psraw m15, 2; (p2 + p1 + p0 + q0 + 2) >> 2
psubw m15, m2;((p2 + p1 + p0 + q0 + 2) >> 2) - p1
pmaxsw m15, m14
pminsw m15, m9; av_clip( , -2 * tc, 2 * tc)
paddw m15, m2; p1'

movdqa m8, m1; p2
paddw m8, m0; p3 + p2
psllw m8, 1; 2*p3 + 2*p2
paddw m8, m1; 2*p3 + 3*p2
paddw m8, m10; 2*p3 + 3*p2 + p1 + p0 + q0
psllw m13, 1; 4 in every cell
paddw m8, m13; 2*p3 + 3*p2 + p1 + p0 + q0 + 4
psraw m8, 3; (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
psubw m8, m1; ((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3) - p2
pmaxsw m8, m14
pminsw m8, m9; av_clip( , -2 * tc, 2 * tc)
paddw m8, m1; p2'
MASKED_COPY m1, m8

movdqa m8, m3; p0
paddw m8, m4; p0 + q0
paddw m8, m5; p0 + q0 + q1
psllw m8, 1; 2*p0 + 2*q0 + 2*q1
paddw m8, m2; p1 + 2*p0 + 2*q0 + 2*q1
paddw m8, m6; p1 + 2*p0 + 2*q0 + 2*q1 + q2
paddw m8, m13; p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4
psraw m8, 3; (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >>3
psubw m8, m4;
pmaxsw m8, m14
pminsw m8, m9; av_clip( , -2 * tc, 2 * tc)
paddw m8, m4; q0'
MASKED_COPY m2, m15

movdqa m15, m3; p0
paddw m15, m4; p0 + q0
paddw m15, m5; p0 + q0 + q1
movdqa m10, m15;
paddw m15, m6; p0 + q0 + q1 + q2
psrlw m13, 1; 2 in every cell
paddw m15, m13; p0 + q0 + q1 + q2 + 2
psraw m15, 2; (p0 + q0 + q1 + q2 + 2) >> 2
psubw m15, m5; ((p0 + q0 + q1 + q2 + 2) >> 2) - q1
pmaxsw m15, m14
pminsw m15, m9; av_clip( , -2 * tc, 2 * tc)
paddw m15, m5; q1'

paddw m13, m7; q3 + 2
paddw m13, m6; q3 + q2 + 2
psllw m13, 1; 2*q3 + 2*q2 + 4
paddw m13, m6; 2*q3 + 3*q2 + 4
paddw m13, m10; 2*q3 + 3*q2 + q1 + q0 + p0 + 4
psraw m13, 3; (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
psubw m13, m6; ((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3) - q2
pmaxsw m13, m14
pminsw m13, m9; av_clip( , -2 * tc, 2 * tc)
paddw m13, m6; q2'

MASKED_COPY m6, m13
MASKED_COPY m5, m15
MASKED_COPY m4, m8
MASKED_COPY m3, m12

weakfilter:
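; Weak filter, as built up by the comments below:
;   delta0 = av_clip((9*(q0 - p0) - 3*(q1 - p1) + 8) >> 4, -tc, tc)
;   applied (p0 += delta0, q0 -= delta0) only where abs(delta0) < 10*tc
;   p1' = p1 + av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc/2, tc/2)
;   q1' = q1 + av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc/2, tc/2)
;   p1'/q1' are written only where dp0 + dp3 / dq0 + dq3 < (beta + (beta >> 1)) >> 3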
not r14; strong mask -> weak mask
and r14, r13; final weak filtering mask, bits 0 and 1
cmp r14, 0;
je ready

; weak filtering mask
mov r2, r14
shr r2, 1
movd m12, r2
and r14, 1
movd m11, r14
shufps m11, m12, 0

pcmpeqd m12, m12; set all bits to 1
psrld m12, 31; set to 32bit 1
pcmpeqd m11, m12; filtering mask

mov r13, r11; beta0
shr r13, 1;
add r11, r13
shr r11, 3; (beta0 + (beta0 >> 1)) >> 3

mov r13, r12; beta1
shr r13, 1;
add r12, r13
shr r12, 3; (beta1 + (beta1 >> 1)) >> 3

pcmpeqw m13, m13; set all bits to 1
psrlw m13, 15; 1 in every cell
psllw m13, 3; 8 in every cell

movdqa m12, m4 ; q0
psubw m12, m3 ; q0 - p0
movdqa m10, m12
psllw m10, 3; 8 * (q0 - p0)
paddw m12, m10 ; 9 * (q0 - p0)

movdqa m10, m5 ; q1
psubw m10, m2 ; q1 - p1
movdqa m8, m10
psllw m8, 1; 2 * ( q1 - p1 )
paddw m10, m8; 3 * ( q1 - p1 )
psubw m12, m10; 9 * (q0 - p0) - 3 * ( q1 - p1 )
paddw m12, m13; + 8
psraw m12, 4; >> 4 , delta0
pabsw m13, m12; abs(delta0)


movdqa m10, m9; 2*tc
psllw m10, 2; 8 * tc
paddw m10, m9; 10 * tc
pcmpgtw m10, m13
pand m11, m10

psraw m9, 1; tc * 2 -> tc
psraw m14, 1; -tc * 2 -> -tc

pmaxsw m12, m14
pminsw m12, m9; av_clip(delta0, -tc, tc)

pcmpeqw m13, m13; set all bits to 1
psraw m9, 1; tc -> tc / 2
movdqa m14, m9;
pxor m14, m13; complement -tc
psrlw m13, 15; set all cells to 1
paddw m14, m13; add 1, -tc / 2

movdqa m15, m1; p2
pavgw m15, m3; (p2 + p0 + 1) >> 1
psubw m15, m2; ((p2 + p0 + 1) >> 1) - p1
paddw m15, m12; ((p2 + p0 + 1) >> 1) - p1 + delta0
psraw m15, 1; (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1
pmaxsw m15, m14
pminsw m15, m9; av_clip(deltap1, -tc/2, tc/2)
paddw m15, m2; p1'

;beta calculations
movd m10, r11; beta0
punpcklwd m10, m10
movd m13, r12; beta1
punpcklwd m13, m13
shufps m10, m13, 0; betax0, betax1

movd m13, r7; 1dp0 + 1dp3
movd m8, r8; 0dp0 + 0dp3
punpcklwd m8, m8
punpcklwd m13, m13
shufps m13, m8, 0;
movdqa m8, m10; copy of beta
pcmpgtw m8, m13
pand m8, m11
;end beta calculations
MASKED_COPY2 m2, m15, m8; write p1'

movdqa m8, m6; q2
pavgw m8, m4; (q2 + q0 + 1) >> 1
psubw m8, m5; ((q2 + q0 + 1) >> 1) - q1
psubw m8, m12; ((q2 + q0 + 1) >> 1) - q1 - delta0
psraw m8, 1; (((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1
pmaxsw m8, m14
pminsw m8, m9; av_clip(deltaq1, -tc/2, tc/2)
paddw m8, m5; q1'

movd m13, r9;
movd m15, r10;
punpcklwd m15, m15
punpcklwd m13, m13
shufps m13, m15, 0; dq0 + dq3

pcmpgtw m10, m13; compare to ((beta+(beta>>1))>>3)
pand m10, m11
MASKED_COPY2 m5, m8, m10; write q1'

movdqa m15, m3 ; p0
paddw m15, m12 ; p0 + delta0
MASKED_COPY m3, m15

movdqa m8, m4 ; q0
psubw m8, m12 ; q0 - delta0
MASKED_COPY m4, m8
ready:
mov r4, 0
ret
bypasswrite:
mov r4, 1
ret
%endif
INIT_XMM sse2
;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
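; The vertical (v) filters deblock a vertical edge: the pixels straddling the
; edge are loaded and transposed so the shared deblock body can operate on rows,
; then transposed back when stored.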
cglobal hevc_v_loop_filter_chroma_8, 3, 6, 8
sub r0, 2
lea r5, [3*r1]
mov r4, r0
add r0, r5
TRANSPOSE4x8B_LOAD PASS8ROWS(r4, r0, r1, r5)
CHROMA_DEBLOCK_BODY 8
TRANSPOSE8x4B_STORE PASS8ROWS(r4, r0, r1, r5)
RET

cglobal hevc_v_loop_filter_chroma_10, 3, 6, 8
sub r0, 4
lea r5, [3*r1]
mov r4, r0
add r0, r5
TRANSPOSE4x8W_LOAD PASS8ROWS(r4, r0, r1, r5)
CHROMA_DEBLOCK_BODY 10
TRANSPOSE8x4W_STORE PASS8ROWS(r4, r0, r1, r5)
RET

;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
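; The horizontal (h) filters work on a horizontal edge, so the p1/p0/q0/q1 rows
; are loaded directly (no transpose) and only the modified p0'/q0' rows are
; written back.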
cglobal hevc_h_loop_filter_chroma_8, 3, 6, 8
mov r5, r0; pix
sub r5, r1
sub r5, r1
movq m0, [r5]; p1
movq m1, [r5+r1]; p0
movq m2, [r0]; q0
movq m3, [r0+r1]; q1
pxor m5, m5; zeros reg
punpcklbw m0, m5
punpcklbw m1, m5
punpcklbw m2, m5
punpcklbw m3, m5
CHROMA_DEBLOCK_BODY 8
packuswb m1, m1 ; p0' packed in bytes on low quadword
packuswb m2, m2 ; q0' packed in bytes on low quadword
movq [r5+r1], m1
movq [r0], m2
RET

cglobal hevc_h_loop_filter_chroma_10, 3, 6, 8
mov r5, r0; pix
sub r5, r1
sub r5, r1
movdqu m0, [r5]; p1
movdqu m1, [r5+r1]; p0
movdqu m2, [r0]; q0
movdqu m3, [r0+r1]; q1
CHROMA_DEBLOCK_BODY 10
pxor m5, m5; zeros reg
CLIPW m1, m5, [pw_pixel_max]
CLIPW m2, m5, [pw_pixel_max]
movdqu [r5+r1], m1
movdqu [r0], m2
RET

%if ARCH_X86_64
INIT_XMM ssse3
;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int *_beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
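; As with chroma, the vertical luma filters transpose an 8x8 block around the
; edge, call the shared ff_hevc_luma_deblock_body and transpose the result back,
; unless the body reports a bypass (r4 == 1). The 10-bit variants first scale the
; beta/tc parameters with SHIFT_LEFT_PARAM.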
cglobal hevc_v_loop_filter_luma_8, 4, 15, 16
sub r0, 4
lea r5, [3*r1]
mov r6, r0
add r0, r5
TRANSPOSE8x8B_LOAD PASS8ROWS(r6, r0, r1, r5)
call ff_hevc_luma_deblock_body
cmp r4, 1
je bypassvluma_8
TRANSPOSE8x8B_STORE PASS8ROWS(r6, r0, r1, r5)
bypassvluma_8:
RET

cglobal hevc_v_loop_filter_luma_10, 4, 15, 16
sub r0, 8; 4 pixels * 2 bytes back to p3
lea r5, [3*r1]
mov r6, r0
add r0, r5
TRANSPOSE8x8W_LOAD PASS8ROWS(r6, r0, r1, r5)
SHIFT_LEFT_PARAM r2
SHIFT_LEFT_PARAM r3
call ff_hevc_luma_deblock_body
cmp r4, 1
je bypassvluma_10
TRANSPOSE8x8W_STORE PASS8ROWS(r6, r0, r1, r5)
bypassvluma_10:
RET

;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int *_beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
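; The horizontal luma filters load the p3..q3 rows directly, call the shared
; body and, unless bypassed, store back the six rows p2'..q2' that the filter
; may modify.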
cglobal hevc_h_loop_filter_luma_8, 4, 15, 16
lea r6, [3*r1]
mov r5, r0
sub r5, r6
sub r5, r1
movq m0, [r5]; p3
movq m1, [r5+r1]; p2
movq m2, [r5+2*r1]; p1
movq m3, [r5+r6]; p0
movq m4, [r0]; q0
movq m5, [r0+r1]; q1
movq m6, [r0+2*r1]; q2
movq m7, [r0+r6]; q3
pxor m8, m8
punpcklbw m0, m8
punpcklbw m1, m8
punpcklbw m2, m8
punpcklbw m3, m8
punpcklbw m4, m8
punpcklbw m5, m8
punpcklbw m6, m8
punpcklbw m7, m8
call ff_hevc_luma_deblock_body
cmp r4, 1
je bypasshluma_8
packuswb m1, m1; p2
packuswb m2, m2; p1
packuswb m3, m3; p0
packuswb m4, m4; q0
packuswb m5, m5; q1
packuswb m6, m6; q2
movq [r5+r1], m1; p2
movq [r5+2*r1], m2; p1
movq [r5+r6], m3; p0
movq [r0], m4; q0
movq [r0+r1], m5; q1
movq [r0+2*r1], m6; q2
bypasshluma_8:
RET

cglobal hevc_h_loop_filter_luma_10, 4, 15, 16
lea r6, [3*r1]
mov r5, r0
sub r5, r6
sub r5, r1
movdqu m0, [r5]; p3
movdqu m1, [r5+r1]; p2
movdqu m2, [r5+2*r1]; p1
movdqu m3, [r5+r6]; p0
movdqu m4, [r0]; q0
movdqu m5, [r0+r1]; q1
movdqu m6, [r0+2*r1]; q2
movdqu m7, [r0+r6]; q3
SHIFT_LEFT_PARAM r2
SHIFT_LEFT_PARAM r3
call ff_hevc_luma_deblock_body
cmp r4, 1
je bypasshluma_10
pxor m8, m8; zeros reg
CLIPW m1, m8, [pw_pixel_max]
CLIPW m2, m8, [pw_pixel_max]
CLIPW m3, m8, [pw_pixel_max]
CLIPW m4, m8, [pw_pixel_max]
CLIPW m5, m8, [pw_pixel_max]
CLIPW m6, m8, [pw_pixel_max]
movdqu [r5+r1], m1; p2
movdqu [r5+2*r1], m2; p1
movdqu [r5+r6], m3; p0
movdqu [r0], m4; q0
movdqu [r0+r1], m5; q1
movdqu [r0+2*r1], m6; q2
bypasshluma_10:
RET
%endif