Advertisement
Guest User

Untitled

a guest
Sep 19th, 2017
442
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Diff 33.62 KB | None | 0 0
  1. From 489134cf2343edbb65bb17d2ed99dc2c114657bf Mon Sep 17 00:00:00 2001
  2. From: Daniel Kang <daniel.d.kang@gmail.com>
  3. Date: Wed, 22 Jun 2011 17:40:50 -0400
  4. Subject: [PATCH 1/2] luma mc first pass done \o/
  5.  
  6. ---
  7. libavcodec/x86/Makefile              |    1 +
  8.  libavcodec/x86/dsputil_mmx.c         |   47 ++
  9.  libavcodec/x86/h264_qpel_10bit.asm   |  813 ++++++++++++++++++++++++++++++++++
  10.  libavcodec/x86/h264_qpel_mmx_10bit.c |  141 ++++++
  11.  4 files changed, 1002 insertions(+), 0 deletions(-)
  12.  create mode 100644 libavcodec/x86/h264_qpel_10bit.asm
  13.  create mode 100755 libavcodec/x86/h264_qpel_mmx_10bit.c
  14.  
  15. diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
  16. index 022ab27..d3cf0da 100644
  17. --- a/libavcodec/x86/Makefile
  18. +++ b/libavcodec/x86/Makefile
  19. @@ -46,6 +46,7 @@ MMX-OBJS-$(HAVE_YASM)                  += x86/dsputil_yasm.o            \
  20.                                            x86/fmtconvert.o              \
  21.                                            x86/h264_chromamc.o           \
  22.                                            x86/h264_chromamc_10bit.o     \
  23. +                                          x86/h264_qpel_10bit.o         \
  24.                                            $(YASM-OBJS-yes)
  25.  
  26.  MMX-OBJS-$(CONFIG_FFT)                 += x86/fft.o
  27. diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
  28. index 5c5ecb2..43ff26a 100644
  29. --- a/libavcodec/x86/dsputil_mmx.c
  30. +++ b/libavcodec/x86/dsputil_mmx.c
  31. @@ -1896,6 +1896,7 @@ PREFETCH(prefetch_3dnow, prefetch)
  32.  #undef PREFETCH
  33.  
  34.  #include "h264_qpel_mmx.c"
  35. +#include "h264_qpel_mmx_10bit.c"
  36.  
  37.  void ff_put_h264_chroma_mc8_mmx_rnd   (uint8_t *dst, uint8_t *src,
  38.                                         int stride, int h, int x, int y);
  39. @@ -2649,6 +2650,33 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
  40.              SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2);
  41.              SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2);
  42.              }
  43. +#if HAVE_YASM
  44. +#define SET_QPEL_FUNCS_10(PFX, IDX, SIZE, CPU) \
  45. +            c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## SIZE ## _mc00_10_ ## CPU; \
  46. +            c->PFX ## _pixels_tab[IDX][ 1] = ff_ ## PFX ## SIZE ## _mc10_10_ ## CPU; \
  47. +            c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## SIZE ## _mc20_10_ ## CPU; \
  48. +            c->PFX ## _pixels_tab[IDX][ 3] = ff_ ## PFX ## SIZE ## _mc30_10_ ## CPU; \
  49. +            c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## SIZE ## _mc01_10_ ## CPU; \
  50. +            c->PFX ## _pixels_tab[IDX][ 5] = ff_ ## PFX ## SIZE ## _mc11_10_ ## CPU; \
  51. +            c->PFX ## _pixels_tab[IDX][ 6] = ff_ ## PFX ## SIZE ## _mc21_10_ ## CPU; \
  52. +            c->PFX ## _pixels_tab[IDX][ 7] = ff_ ## PFX ## SIZE ## _mc31_10_ ## CPU; \
  53. +            c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## SIZE ## _mc02_10_ ## CPU; \
  54. +            c->PFX ## _pixels_tab[IDX][ 9] = ff_ ## PFX ## SIZE ## _mc12_10_ ## CPU; \
  55. +            c->PFX ## _pixels_tab[IDX][10] = ff_ ## PFX ## SIZE ## _mc22_10_ ## CPU; \
  56. +            c->PFX ## _pixels_tab[IDX][11] = ff_ ## PFX ## SIZE ## _mc32_10_ ## CPU; \
  57. +            c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## SIZE ## _mc03_10_ ## CPU; \
  58. +            c->PFX ## _pixels_tab[IDX][13] = ff_ ## PFX ## SIZE ## _mc13_10_ ## CPU; \
  59. +            c->PFX ## _pixels_tab[IDX][14] = ff_ ## PFX ## SIZE ## _mc23_10_ ## CPU; \
  60. +            c->PFX ## _pixels_tab[IDX][15] = ff_ ## PFX ## SIZE ## _mc33_10_ ## CPU
  61. +            else if (bit_depth == 10) {
  62. +                SET_QPEL_FUNCS_10(put_h264_qpel, 0, 16, mmxext);
  63. +                SET_QPEL_FUNCS_10(put_h264_qpel, 1, 8,  mmxext);
  64. +                SET_QPEL_FUNCS_10(put_h264_qpel, 2, 4,  mmxext);
  65. +                SET_QPEL_FUNCS_10(avg_h264_qpel, 0, 16, mmxext);
  66. +                SET_QPEL_FUNCS_10(avg_h264_qpel, 1, 8,  mmxext);
  67. +                SET_QPEL_FUNCS_10(avg_h264_qpel, 2, 4,  mmxext);
  68. +            }
  69. +#endif
  70.  
  71.              SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2);
  72.              SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2);
  73. @@ -2777,7 +2805,26 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
  74.              H264_QPEL_FUNCS(3, 3, sse2);
  75.              }
  76.  #if HAVE_YASM
  77. +#define H264_QPEL_FUNCS_10(x, y, CPU)\
  78. +            c->put_h264_qpel_pixels_tab[0][x+y*4] = ff_put_h264_qpel16_mc##x##y##_10_##CPU;\
  79. +            c->put_h264_qpel_pixels_tab[1][x+y*4] = ff_put_h264_qpel8_mc##x##y##_10_##CPU;\
  80. +            c->avg_h264_qpel_pixels_tab[0][x+y*4] = ff_avg_h264_qpel16_mc##x##y##_10_##CPU;\
  81. +            c->avg_h264_qpel_pixels_tab[1][x+y*4] = ff_avg_h264_qpel8_mc##x##y##_10_##CPU;
  82.              if (bit_depth == 10) {
  83. +                H264_QPEL_FUNCS_10(0, 0, sse2);
  84. +                H264_QPEL_FUNCS_10(0, 1, sse2);
  85. +                H264_QPEL_FUNCS_10(0, 2, sse2);
  86. +                H264_QPEL_FUNCS_10(0, 3, sse2);
  87. +                H264_QPEL_FUNCS_10(1, 1, sse2);
  88. +                H264_QPEL_FUNCS_10(1, 2, sse2);
  89. +                H264_QPEL_FUNCS_10(1, 3, sse2);
  90. +                H264_QPEL_FUNCS_10(2, 1, sse2);
  91. +                H264_QPEL_FUNCS_10(2, 2, sse2);
  92. +                H264_QPEL_FUNCS_10(2, 3, sse2);
  93. +                H264_QPEL_FUNCS_10(3, 1, sse2);
  94. +                H264_QPEL_FUNCS_10(3, 2, sse2);
  95. +                H264_QPEL_FUNCS_10(3, 3, sse2);
  96. +
  97.                  c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_sse2;
  98.                  c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_sse2;
  99.              }
  100. diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm
  101. new file mode 100644
  102. index 0000000..fc44e85
  103. --- /dev/null
  104. +++ b/libavcodec/x86/h264_qpel_10bit.asm
  105. @@ -0,0 +1,813 @@
  106. +;*****************************************************************************
  107. +;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code
  108. +;*****************************************************************************
  109. +;* Copyright (C) 2005-2011 x264 project
  110. +;*
  111. +;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
  112. +;*
  113. +;* This file is part of Libav.
  114. +;*
  115. +;* Libav is free software; you can redistribute it and/or
  116. +;* modify it under the terms of the GNU Lesser General Public
  117. +;* License as published by the Free Software Foundation; either
  118. +;* version 2.1 of the License, or (at your option) any later version.
  119. +;*
  120. +;* Libav is distributed in the hope that it will be useful,
  121. +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  122. +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  123. +;* Lesser General Public License for more details.
  124. +;*
  125. +;* You should have received a copy of the GNU Lesser General Public
  126. +;* License along with Libav; if not, write to the Free Software
  127. +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  128. +;******************************************************************************
  129. +
  130. +%include "x86inc.asm"
  131. +%include "x86util.asm"
  132. +
  133. +SECTION_RODATA 32
  134. +
  135. +cextern pw_16
  136. +cextern pw_1
  137. +cextern pb_0
  138. +
  139. +pw_pixel_max: times 8 dw ((1 << 10)-1)
  140. +
  141. +pad10: times 8 dw 10*1023
  142. +pad20: times 8 dw 20*1023
  143. +pad30: times 8 dw 30*1023
  144. +depad: times 4 dd 32*20*1023 + 512
  145. +depad2: times 8 dw 20*1023 + 16*1022 + 16
  146. +unpad: times 8 dw 16*1022/32 ; needs to be mod 16
  147. +
  148. +tap1: times 4 dw  1, -5
  149. +tap2: times 4 dw 20, 20
  150. +tap3: times 4 dw -5,  1
  151. +pd_0f: times 4 dd 0xffff
  152. +
  153. +SECTION .text
  154. +
  155. +; All of  the 2x2 functions are probably no faster than the C version.
  156. +
  157. +;-----------------------------------------------------------------------------
  158. +; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
  159. +;-----------------------------------------------------------------------------
  160. +%macro MC00 1
  161. +INIT_MMX
  162. +cglobal %1_h264_qpel4_mc00_10_mmxext, 3,4
  163. +    lea           r3, [r2*3]
  164. +    movq          m0, [r1     ]
  165. +    OP_MOV [r0     ], m0
  166. +    movq          m0, [r1+r2  ]
  167. +    OP_MOV [r0+r2  ], m0
  168. +    movq          m0, [r1+r2*2]
  169. +    OP_MOV [r0+r2*2], m0
  170. +    movq          m0, [r1+r3  ]
  171. +    OP_MOV [r0+r3  ], m0
  172. +    RET
  173. +
  174. +INIT_XMM
  175. +cglobal %1_h264_qpel8_mc00_10_sse2, 3,3
  176. +%rep 4
  177. +    movu        m0, [r1   ]
  178. +    OP_MOV [r0   ], m0
  179. +    movu        m0, [r1+r2]
  180. +    OP_MOV [r0+r2], m0
  181. +    lea         r0, [r0+r2*2]
  182. +    lea         r1, [r1+r2*2]
  183. +%endrep
  184. +    RET
  185. +
  186. +cglobal %1_h264_qpel16_mc00_10_sse2, 3,3
  187. +%rep 8
  188. +    movu           m0, [r1      ]
  189. +    movu           m1, [r1   +16]
  190. +    OP_MOV [r0      ], m0
  191. +    OP_MOV [r0   +16], m1
  192. +    movu           m0, [r1+r2   ]
  193. +    movu           m1, [r1+r2+16]
  194. +    OP_MOV [r0+r2   ], m0
  195. +    OP_MOV [r0+r2+16], m1
  196. +    lea            r0, [r0+r2*2]
  197. +    lea            r1, [r1+r2*2]
  198. +%endrep
  199. +    RET
  200. +%endmacro
  201. +
  202. +%macro AVG_MOV 2
  203. +    pavgw %2, %1
  204. +    mova  %1, %2
  205. +%endmacro
  206. +
  207. +%define OP_MOV mova
  208. +MC00 put
  209. +
  210. +%define OP_MOV AVG_MOV
  211. +MC00 avg
  212. +
  213. +;-----------------------------------------------------------------------------
  214. +; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
  215. +;-----------------------------------------------------------------------------
  216. +%macro FILT_H 4
  217. +    paddw  %1, %4
  218. +    psubw  %1, %2  ; a-b
  219. +    psraw  %1, 2   ; (a-b)/4
  220. +    psubw  %1, %2  ; (a-b)/4-b
  221. +    paddw  %1, %3  ; (a-b)/4-b+c
  222. +    psraw  %1, 2   ; ((a-b)/4-b+c)/4
  223. +    paddw  %1, %3  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
  224. +%endmacro
  225. +
  226. +%macro ADDW 3
  227. +%if mmsize == 8
  228. +    paddw %1, %2
  229. +%else
  230. +    movu  %3, %2
  231. +    paddw %1, %3
  232. +%endif
  233. +%endmacro
  234. +
  235. +%macro MC20 3
  236. +cglobal %2_h264_qpel%3_mc20_10_%1, 3,4,7
  237. +    mov     r3d, %3
  238. +    pxor     m0, m0
  239. +    mova     m1, [pw_pixel_max]
  240. +    mova     m6, [pw_16]
  241. +.nextrow
  242. +    movu     m2, [r1-4]
  243. +    movu     m3, [r1-2]
  244. +    movu     m4, [r1+0]
  245. +    ADDW     m4, [r1+2], m5
  246. +    ADDW     m3, [r1+4], m5
  247. +    ADDW     m2, [r1+6], m5
  248. +
  249. +    FILT_H   m2, m3, m4, m6
  250. +    psraw    m2, 1
  251. +    CLIPW    m2, m0, m1
  252. +    OP_MOV [r0], m2
  253. +    add      r0, r2
  254. +    add      r1, r2
  255. +    dec     r3d
  256. +    jg .nextrow
  257. +    REP_RET
  258. +%endmacro
  259. +
  260. +%define OP_MOV mova
  261. +INIT_MMX
  262. +MC20 mmxext, put, 4
  263. +INIT_XMM
  264. +MC20 sse2  , put, 8
  265. +
  266. +%define OP_MOV AVG_MOV
  267. +INIT_MMX
  268. +MC20 mmxext, avg, 4
  269. +INIT_XMM
  270. +MC20 sse2  , avg, 8
  271. +
  272. +;-----------------------------------------------------------------------------
  273. +; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
  274. +;-----------------------------------------------------------------------------
  275. +%macro MC30 3
  276. +cglobal %2_h264_qpel%3_mc30_10_%1, 3,5,6
  277. +    lea r4, [r1+2]
  278. +    jmp mangle(ff_%2_h264_qpel%3_mc10_10_%1.body)
  279. +%endmacro
  280. +
  281. +%define OP_MOV mova
  282. +INIT_MMX
  283. +MC30 mmxext, put, 4
  284. +INIT_XMM
  285. +MC30 sse2  , put, 8
  286. +
  287. +%define OP_MOV AVG_MOV
  288. +INIT_MMX
  289. +MC30 mmxext, avg, 4
  290. +INIT_XMM
  291. +MC30 sse2  , avg, 8
  292. +
  293. +;-----------------------------------------------------------------------------
  294. +; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
  295. +;-----------------------------------------------------------------------------
  296. +%macro MC10 3
  297. +cglobal %2_h264_qpel%3_mc10_10_%1, 3,5,7
  298. +    mov      r4, r1
  299. +.body
  300. +    mov     r3d, %3
  301. +    pxor     m0, m0
  302. +    mova     m1, [pw_pixel_max]
  303. +    mova     m6, [pw_16]
  304. +.nextrow
  305. +    movu     m2, [r1-4]
  306. +    movu     m3, [r1-2]
  307. +    movu     m4, [r1+0]
  308. +    ADDW     m4, [r1+2], m5
  309. +    ADDW     m3, [r1+4], m5
  310. +    ADDW     m2, [r1+6], m5
  311. +
  312. +    FILT_H   m2, m3, m4, m6
  313. +    psraw    m2, 1
  314. +    CLIPW    m2, m0, m1
  315. +    movu     m3, [r4]
  316. +    pavgw    m2, m3
  317. +    OP_MOV [r0], m2
  318. +    add      r0, r2
  319. +    add      r1, r2
  320. +    add      r4, r2
  321. +    dec     r3d
  322. +    jg .nextrow
  323. +    REP_RET
  324. +%endmacro
  325. +
  326. +%define OP_MOV mova
  327. +INIT_MMX
  328. +MC10 mmxext, put, 4
  329. +INIT_XMM
  330. +MC10 sse2  , put, 8
  331. +
  332. +%define OP_MOV AVG_MOV
  333. +INIT_MMX
  334. +MC10 mmxext, avg, 4
  335. +INIT_XMM
  336. +MC10 sse2  , avg, 8
  337. +
  338. +;-----------------------------------------------------------------------------
  339. +; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
  340. +;-----------------------------------------------------------------------------
  341. +%macro FILT_V 8
  342. +    movu     %6, [r1]
  343. +    paddw    %1, %6
  344. +    mova     %7, %2
  345. +    paddw    %7, %5
  346. +    mova     %8, %3
  347. +    paddw    %8, %4
  348. +    FILT_H   %1, %7, %8, [pw_16]
  349. +    psraw    %1, 1
  350. +    CLIPW    %1, [pb_0], [pw_pixel_max]
  351. +%endmacro
  352. +
  353. +%macro MC02 3
  354. +cglobal %2_h264_qpel%3_mc02_10_%1, 3,4,8
  355. +    lea      r3, [r2*2]
  356. +    sub      r1, r3
  357. +    movu     m0, [r1]
  358. +    movu     m1, [r1+r2]
  359. +    add      r1, r3
  360. +    movu     m2, [r1]
  361. +    movu     m3, [r1+r2]
  362. +    add      r1, r3
  363. +    movu     m4, [r1]
  364. +    add      r1, r2
  365. +
  366. +%rep %3-1
  367. +    FILT_V   m0, m1, m2, m3, m4, m5, m6, m7
  368. +    OP_MOV [r0], m0
  369. +    add      r1, r2
  370. +    add      r0, r2
  371. +    SWAP 0,1,2,3,4,5
  372. +%endrep
  373. +    FILT_V   m0, m1, m2, m3, m4, m5, m6, m7
  374. +    OP_MOV [r0], m0
  375. +    RET
  376. +%endmacro
  377. +
  378. +%define OP_MOV mova
  379. +INIT_MMX
  380. +MC02 mmxext, put, 4
  381. +INIT_XMM
  382. +MC02 sse2  , put, 8
  383. +
  384. +%define OP_MOV AVG_MOV
  385. +INIT_MMX
  386. +MC02 mmxext, avg, 4
  387. +INIT_XMM
  388. +MC02 sse2  , avg, 8
  389. +
  390. +;-----------------------------------------------------------------------------
  391. +; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
  392. +;-----------------------------------------------------------------------------
  393. +%macro MC01 3
  394. +cglobal %2_h264_qpel%3_mc01_10_%1, 3,5,8
  395. +    mov      r4, r1
  396. +.body
  397. +    lea      r3, [r2*2]
  398. +    sub      r1, r3
  399. +    movu     m0, [r1]
  400. +    movu     m1, [r1+r2]
  401. +    add      r1, r3
  402. +    movu     m2, [r1]
  403. +    movu     m3, [r1+r2]
  404. +    add      r1, r3
  405. +    movu     m4, [r1]
  406. +    add      r1, r2
  407. +
  408. +%rep %3-1
  409. +    FILT_V   m0, m1, m2, m3, m4, m5, m6, m7
  410. +    movu     m7, [r4]
  411. +    pavgw    m0, m7
  412. +    OP_MOV [r0], m0
  413. +    add      r4, r2
  414. +    add      r1, r2
  415. +    add      r0, r2
  416. +    SWAP 0,1,2,3,4,5
  417. +%endrep
  418. +    FILT_V   m0, m1, m2, m3, m4, m5, m6, m7
  419. +    movu     m7, [r4]
  420. +    pavgw    m0, m7
  421. +    OP_MOV [r0], m0
  422. +    RET
  423. +%endmacro
  424. +
  425. +%define OP_MOV mova
  426. +INIT_MMX
  427. +MC01 mmxext, put, 4
  428. +INIT_XMM
  429. +MC01 sse2  , put, 8
  430. +
  431. +%define OP_MOV AVG_MOV
  432. +INIT_MMX
  433. +MC01 mmxext, avg, 4
  434. +INIT_XMM
  435. +MC01 sse2  , avg, 8
  436. +
  437. +;-----------------------------------------------------------------------------
  438. +; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
  439. +;-----------------------------------------------------------------------------
  440. +%macro MC03 3
  441. +cglobal %2_h264_qpel%3_mc03_10_%1, 3,5,8
  442. +    lea r4, [r1+r2]
  443. +    jmp mangle(ff_%2_h264_qpel%3_mc01_10_%1.body)
  444. +%endmacro
  445. +
  446. +%define OP_MOV mova
  447. +INIT_MMX
  448. +MC03 mmxext, put, 4
  449. +INIT_XMM
  450. +MC03 sse2  , put, 8
  451. +
  452. +%define OP_MOV AVG_MOV
  453. +INIT_MMX
  454. +MC03 mmxext, avg, 4
  455. +INIT_XMM
  456. +MC03 sse2  , avg, 8
  457. +
  458. +;-----------------------------------------------------------------------------
  459. +; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
  460. +;-----------------------------------------------------------------------------
  461. +%macro MC11 3
  462. +; this REALLY needs x86_64
  463. +cglobal %2_h264_qpel%3_mc11_10_%1, 3,5,8
  464. +    mov      r4, r1
  465. +.body
  466. +    lea      r3, [r2*2]
  467. +    sub      r1, r3
  468. +    movu     m0, [r1]
  469. +    movu     m1, [r1+r2]
  470. +    add      r1, r3
  471. +    movu     m2, [r1]
  472. +    movu     m3, [r1+r2]
  473. +    add      r1, r3
  474. +    movu     m4, [r1]
  475. +    add      r1, r2
  476. +
  477. +%assign i 0
  478. +%rep %3
  479. +%assign i i+1
  480. +    FILT_V   m0, m1, m2, m3, m4, m5, m6, m7
  481. +;now do FILT_H with fewer registers. probably faster than doing FILT_V then FILT_H
  482. +;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used next in the next iteration
  483. +;unfortunately I need three registers, so m5 will have to be re-read from memory
  484. +    movu     m5, [r4-4]
  485. +    ADDW     m5, [r4+6], m7
  486. +    movu     m6, [r4-2]
  487. +    ADDW     m6, [r4+4], m7
  488. +    paddw    m5, [pw_16]
  489. +    psubw    m5, m6  ; a-b
  490. +    psraw    m5, 2   ; (a-b)/4
  491. +    psubw    m5, m6  ; (a-b)/4-b
  492. +;now I need to load c...
  493. +    movu     m6, [r4+0]
  494. +    ADDW     m6, [r4+2], m7
  495. +    paddw    m5, m6  ; (a-b)/4-b+c
  496. +    psraw    m5, 2   ; ((a-b)/4-b+c)/4
  497. +    paddw    m5, m6  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
  498. +    psraw    m5, 1
  499. +    CLIPW    m5, [pb_0], [pw_pixel_max]
  500. +;avg FILT_V, FILT_H and reload m5
  501. +    pavgw    m0, m5
  502. +    OP_MOV [r0], m0
  503. +%if i<%3
  504. +    movu     m5, [r1]
  505. +    add      r4, r2
  506. +    add      r1, r2
  507. +    add      r0, r2
  508. +    SWAP 0,1,2,3,4,5
  509. +%endif
  510. +%endrep
  511. +    RET
  512. +%endmacro
  513. +
  514. +%define OP_MOV mova
  515. +INIT_MMX
  516. +MC11 mmxext, put, 4
  517. +INIT_XMM
  518. +MC11 sse2  , put, 8
  519. +
  520. +%define OP_MOV AVG_MOV
  521. +INIT_MMX
  522. +MC11 mmxext, avg, 4
  523. +INIT_XMM
  524. +MC11 sse2  , avg, 8
  525. +
  526. +;-----------------------------------------------------------------------------
  527. +; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
  528. +;-----------------------------------------------------------------------------
  529. +%macro MC31 3
  530. +cglobal %2_h264_qpel%3_mc31_10_%1, 3,5,8
  531. +    mov r4, r1
  532. +    add r1, 2
  533. +    jmp mangle(ff_%2_h264_qpel%3_mc11_10_%1.body)
  534. +%endmacro
  535. +
  536. +%define OP_MOV mova
  537. +INIT_MMX
  538. +MC31 mmxext, put, 4
  539. +INIT_XMM
  540. +MC31 sse2  , put, 8
  541. +
  542. +%define OP_MOV AVG_MOV
  543. +INIT_MMX
  544. +MC31 mmxext, avg, 4
  545. +INIT_XMM
  546. +MC31 sse2  , avg, 8
  547. +
  548. +;-----------------------------------------------------------------------------
  549. +; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
  550. +;-----------------------------------------------------------------------------
  551. +%macro MC13 3
  552. +cglobal %2_h264_qpel%3_mc13_10_%1, 3,5,8
  553. +    lea r4, [r1+r2]
  554. +    jmp mangle(ff_%2_h264_qpel%3_mc11_10_%1.body)
  555. +%endmacro
  556. +
  557. +%define OP_MOV mova
  558. +INIT_MMX
  559. +MC13 mmxext, put, 4
  560. +INIT_XMM
  561. +MC13 sse2  , put, 8
  562. +
  563. +%define OP_MOV AVG_MOV
  564. +INIT_MMX
  565. +MC13 mmxext, avg, 4
  566. +INIT_XMM
  567. +MC13 sse2  , avg, 8
  568. +
  569. +;-----------------------------------------------------------------------------
  570. +; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
  571. +;-----------------------------------------------------------------------------
  572. +%macro MC33 3
  573. +cglobal %2_h264_qpel%3_mc33_10_%1, 3,5,8
  574. +    lea r4, [r1+r2]
  575. +    add r1, 2
  576. +    jmp mangle(ff_%2_h264_qpel%3_mc11_10_%1.body)
  577. +%endmacro
  578. +
  579. +%define OP_MOV mova
  580. +INIT_MMX
  581. +MC33 mmxext, put, 4
  582. +INIT_XMM
  583. +MC33 sse2  , put, 8
  584. +
  585. +%define OP_MOV AVG_MOV
  586. +INIT_MMX
  587. +MC33 mmxext, avg, 4
  588. +INIT_XMM
  589. +MC33 sse2  , avg, 8
  590. +
  591. +
  592. +
  593. +;-----------------------------------------------------------------------------
  594. +; void h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
  595. +;-----------------------------------------------------------------------------
  596. +%macro FILT_H2 3
  597. +    psubw  %1, %2  ; a-b
  598. +    psubw  %2, %3  ; b-c
  599. +    psllw  %2, 2
  600. +    psubw  %1, %2  ; a-5*b+4*c
  601. +    psllw  %3, 4
  602. +    paddw  %1, %3  ; a-5*b+20*c
  603. +%endmacro
  604. +
  605. +%macro FILT_VNRD 8
  606. +    movu     %6, [r1]
  607. +    paddw    %1, %6
  608. +    mova     %7, %2
  609. +    paddw    %7, %5
  610. +    mova     %8, %3
  611. +    paddw    %8, %4
  612. +    FILT_H2  %1, %7, %8
  613. +%endmacro
  614. +
  615. +%macro MC22 3
  616. +%2_hv%3_10_%1:
  617. +    add     rsp, gprsize
  618. +    neg      r2           ; This actually saves instructions
  619. +    lea      r1, [r1+r2*2]
  620. +    sub      r1, mmsize
  621. +    xor      r4, r4
  622. +    mov      r3, 3
  623. +.v_loop:
  624. +    movu     m0, [r1]
  625. +    sub      r1, r2
  626. +    movu     m1, [r1]
  627. +    sub      r1, r2
  628. +    movu     m2, [r1]
  629. +    sub      r1, r2
  630. +    movu     m3, [r1]
  631. +    sub      r1, r2
  632. +    movu     m4, [r1]
  633. +    sub      r1, r2
  634. +%assign i 0
  635. +%rep %3-1
  636. +    FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
  637. +    psubw    m0, [pad20]
  638. +    mova     [rsp+r4+i*mmsize*3], m0
  639. +    sub      r1, r2
  640. +    SWAP 0,1,2,3,4,5
  641. +%assign i i+1
  642. +%endrep
  643. +    FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
  644. +    psubw    m0, [pad20]
  645. +    mova     [rsp+r4+i*mmsize*3], m0
  646. +    add      r4, mmsize
  647. +    lea      r1, [r1+r2*8+mmsize]
  648. +%if %3==8
  649. +    lea      r1, [r1+r2*4]
  650. +%endif
  651. +    dec      r3
  652. +    jg .v_loop
  653. +    sub     rsp, gprsize
  654. +    neg      r2
  655. +    ret
  656. +
  657. +cglobal %2_h264_qpel%3_mc22_10_%1, 3,7,10
  658. +    mov      r6, rsp          ; backup stack pointer
  659. +    and     rsp, ~(mmsize-1)  ; align stack
  660. +    sub     rsp, 4096         ; TODO: calculate this correctly
  661. +
  662. +    call %2_hv%3_10_%1
  663. +
  664. +    mov        r4, mmsize
  665. +    mov       r3d, %3
  666. +    mova       m0, [tap1]
  667. +    mova       m7, [tap3]
  668. +%if num_mmregs > 8
  669. +    mova       m8, [tap2]
  670. +    mova       m9, [depad]
  671. +    %define s1 m8
  672. +    %define s2 m9
  673. +%else
  674. +    %define s1 [tap2]
  675. +    %define s2 [depad]
  676. +%endif
  677. +.h_loop:
  678. +    movu       m1, [rsp+r4-4]
  679. +    movu       m2, [rsp+r4-2]
  680. +    mova       m3, [rsp+r4+0]
  681. +    movu       m4, [rsp+r4+2]
  682. +    movu       m5, [rsp+r4+4]
  683. +    movu       m6, [rsp+r4+6]
  684. +    pmaddwd    m1, m0
  685. +    pmaddwd    m2, m0
  686. +    pmaddwd    m3, s1
  687. +    pmaddwd    m4, s1
  688. +    pmaddwd    m5, m7
  689. +    pmaddwd    m6, m7
  690. +    paddd      m1, s2
  691. +    paddd      m2, s2
  692. +    paddd      m3, m5
  693. +    paddd      m4, m6
  694. +    paddd      m1, m3
  695. +    paddd      m2, m4
  696. +    psrad      m1, 10
  697. +    psrad      m2, 10
  698. +    pslld      m2, 16
  699. +    pand       m1, [pd_0f]
  700. +    por        m1, m2
  701. +    CLIPW      m1, [pb_0], [pw_pixel_max]
  702. +    OP_MOV   [r0], m1
  703. +    add        r4, mmsize*3
  704. +    add        r0, r2
  705. +    dec       r3d
  706. +    jg .h_loop
  707. +
  708. +    mov     rsp, r6          ; restore stack pointer
  709. +    RET
  710. +%endmacro
  711. +
  712. +%define OP_MOV mova
  713. +INIT_MMX
  714. +MC22 mmxext, put, 4
  715. +INIT_XMM
  716. +MC22 sse2  , put, 8
  717. +
  718. +%define OP_MOV AVG_MOV
  719. +INIT_MMX
  720. +MC22 mmxext, avg, 4
  721. +INIT_XMM
  722. +MC22 sse2  , avg, 8
  723. +
  724. +;-----------------------------------------------------------------------------
  725. +; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
  726. +;-----------------------------------------------------------------------------
  727. +%macro MC12 3
  728. +cglobal %2_h264_qpel%3_mc12_10_%1, 3,7,10
  729. +    mov      r6, rsp          ; backup stack pointer
  730. +    and     rsp, ~(mmsize-1)  ; align stack
  731. +    sub     rsp, 4096         ; TODO: calculate this correctly
  732. +
  733. +    call %2_hv%3_10_%1
  734. +
  735. +    xor        r4, r4
  736. +.body
  737. +    mov       r3d, %3
  738. +    mova       m0, [tap1]
  739. +    mova       m7, [tap3]
  740. +%if num_mmregs > 8
  741. +    mova       m8, [tap2]
  742. +    mova       m9, [depad]
  743. +    %define s1 m8
  744. +    %define s2 m9
  745. +%else
  746. +    %define s1 [tap2]
  747. +    %define s2 [depad]
  748. +%endif
  749. +.h_loop:
  750. +    movu       m1, [rsp+mmsize-4]
  751. +    movu       m2, [rsp+mmsize-2]
  752. +    mova       m3, [rsp+mmsize+0]
  753. +    movu       m4, [rsp+mmsize+2]
  754. +    movu       m5, [rsp+mmsize+4]
  755. +    movu       m6, [rsp+mmsize+6]
  756. +    pmaddwd    m1, m0
  757. +    pmaddwd    m2, m0
  758. +    pmaddwd    m3, s1
  759. +    pmaddwd    m4, s1
  760. +    pmaddwd    m5, m7
  761. +    pmaddwd    m6, m7
  762. +    paddd      m1, s2
  763. +    paddd      m2, s2
  764. +    paddd      m3, m5
  765. +    paddd      m4, m6
  766. +    paddd      m1, m3
  767. +    paddd      m2, m4
  768. +    psrad      m1, 10
  769. +    psrad      m2, 10
  770. +    pslld      m2, 16
  771. +    pand       m1, [pd_0f]
  772. +    por        m1, m2
  773. +    CLIPW      m1, [pw_0], [pw_pixel_max]
  774. +
  775. +    movu       m3, [rsp+r4+mmsize] ; movu needed for mc32
  776. +    paddw      m3, [depad2]
  777. +    psrlw      m3, 5
  778. +    psubw      m3, [unpad]
  779. +    CLIPW      m3, [pw_0], [pw_pixel_max]
  780. +    pavgw      m1, m3
  781. +
  782. +    OP_MOV   [r0], m1
  783. +    add       rsp, mmsize*3
  784. +    add        r0, r2
  785. +    dec       r3d
  786. +    jg .h_loop
  787. +
  788. +    mov     rsp, r6          ; restore stack pointer
  789. +    RET
  790. +%endmacro
  791. +
  792. +%define OP_MOV mova
  793. +INIT_MMX
  794. +MC12 mmxext, put, 4
  795. +INIT_XMM
  796. +MC12 sse2  , put, 8
  797. +
  798. +%define OP_MOV AVG_MOV
  799. +INIT_MMX
  800. +MC12 mmxext, avg, 4
  801. +INIT_XMM
  802. +MC12 sse2  , avg, 8
  803. +
  804. +;-----------------------------------------------------------------------------
  805. +; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
  806. +;-----------------------------------------------------------------------------
  807. +%macro MC32 3
  808. +cglobal %2_h264_qpel%3_mc32_10_%1, 3,7,10
  809. +    mov  r6, rsp          ; backup stack pointer
  810. +    and rsp, ~(mmsize-1)  ; align stack
  811. +    sub rsp, 4096         ; TODO: calculate this correctly
  812. +
  813. +    call %2_hv%3_10_%1
  814. +
  815. +    mov  r4, 2            ; sizeof(pixel)
  816. +    jmp mangle(ff_%2_h264_qpel%3_mc12_10_%1.body)
  817. +%endmacro
  818. +
  819. +%define OP_MOV mova
  820. +INIT_MMX
  821. +MC32 mmxext, put, 4
  822. +INIT_XMM
  823. +MC32 sse2  , put, 8
  824. +
  825. +%define OP_MOV AVG_MOV
  826. +INIT_MMX
  827. +MC32 mmxext, avg, 4
  828. +INIT_XMM
  829. +MC32 sse2  , avg, 8
  830. +
  831. +;-----------------------------------------------------------------------------
  832. +; void h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
  833. +;-----------------------------------------------------------------------------
  834. +%macro MC21 3
  835. +%2_h%3_10_%1:
  836. +    add       rsp, gprsize
  837. +    mov       r3d, %3
  838. +    xor        r4, r4
  839. +    mova       m6, [pad20]
  840. +.nextrow
  841. +    movu       m2, [r5-4]
  842. +    movu       m3, [r5-2]
  843. +    movu       m4, [r5+0]
  844. +    ADDW       m4, [r5+2], m5
  845. +    ADDW       m3, [r5+4], m5
  846. +    ADDW       m2, [r5+6], m5
  847. +
  848. +    FILT_H2    m2, m3, m4
  849. +    psubw      m2, m6
  850. +    mova [rsp+r4], m2
  851. +    add        r4, mmsize*3
  852. +    add        r5, r2
  853. +    dec       r3d
  854. +    jg .nextrow
  855. +    sub       rsp, gprsize
  856. +    ret
  857. +
  858. +cglobal %2_h264_qpel%3_mc21_10_%1, 3,7,10
  859. +    mov   r6, rsp          ; backup stack pointer
  860. +    and  rsp, ~(mmsize-1)  ; align stack
  861. +    sub  rsp, 4096         ; TODO: calculate this correctly
  862. +
  863. +    mov   r5, r1
  864. +    call  %2_hv%3_10_%1
  865. +
  866. +%define PAD mmsize*16*3*2     ; SIZE*16*3*sizeof(pixel)
  867. +    add  rsp, PAD
  868. +    call %2_h%3_10_%1
  869. +    sub  rsp, PAD
  870. +
  871. +    mov  r4, PAD-mmsize            ; H buffer
  872. +    jmp mangle(ff_%2_h264_qpel%3_mc12_10_%1.body)
  873. +%endmacro
  874. +
  875. +%define OP_MOV mova
  876. +INIT_MMX
  877. +MC21 mmxext, put, 4
  878. +INIT_XMM
  879. +MC21 sse2  , put, 8
  880. +
  881. +%define OP_MOV AVG_MOV
  882. +INIT_MMX
  883. +MC21 mmxext, avg, 4
  884. +INIT_XMM
  885. +MC21 sse2  , avg, 8
  886. +
  887. +;-----------------------------------------------------------------------------
  888. +; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
  889. +;-----------------------------------------------------------------------------
  890. +%macro MC23 3 ; %1 = cpu suffix, %2 = put/avg, %3 = block width (4 or 8)
  891. +cglobal %2_h264_qpel%3_mc23_10_%1, 3,7,10
  892. +    mov   r6, rsp          ; backup stack pointer
  893. +    and  rsp, ~(mmsize-1)  ; align stack
  894. +    sub  rsp, 4096         ; TODO: calculate this correctly
  895. +
  896. +    lea   r5, [r1+r2]      ; r5 = src + stride (one row below mc21's source)
  897. +    call  %2_hv%3_10_%1    ; 2-D (hv) pass into the stack buffer
  898. +
  899. +%define PAD mmsize*16*3*2     ; SIZE*16*3*sizeof(pixel)
  900. +    add  rsp, PAD          ; h pass writes above the hv buffer
  901. +    call %2_h%3_10_%1
  902. +    sub  rsp, PAD
  903. +
  904. +    mov  r4, PAD-mmsize            ; H buffer
  905. +    jmp mangle(ff_%2_h264_qpel%3_mc12_10_%1.body) ; shared tail: combines h+hv and stores (see mc12)
  906. +%endmacro
  907. +
  908. +%define OP_MOV mova    ; plain store -> "put" variants
  909. +INIT_MMX
  910. +MC23 mmxext, put, 4    ; 4-wide blocks in MMX registers
  911. +INIT_XMM
  912. +MC23 sse2  , put, 8    ; 8-wide blocks in XMM registers
  913. +
  914. +%define OP_MOV AVG_MOV ; averaging store -> "avg" variants
  915. +INIT_MMX
  916. +MC23 mmxext, avg, 4
  917. +INIT_XMM
  918. +MC23 sse2  , avg, 8
  919. diff --git a/libavcodec/x86/h264_qpel_mmx_10bit.c b/libavcodec/x86/h264_qpel_mmx_10bit.c
  920. new file mode 100755
  921. index 0000000..98cf6da
  922. --- /dev/null
  923. +++ b/libavcodec/x86/h264_qpel_mmx_10bit.c
  924. @@ -0,0 +1,141 @@
  925. +/*
  926. + * Copyright (c) 2011 Daniel Kang
  927. + *
  928. + * This file is part of Libav.
  929. + *
  930. + * Libav is free software; you can redistribute it and/or
  931. + * modify it under the terms of the GNU Lesser General Public
  932. + * License as published by the Free Software Foundation; either
  933. + * version 2.1 of the License, or (at your option) any later version.
  934. + *
  935. + * Libav is distributed in the hope that it will be useful,
  936. + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  937. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  938. + * Lesser General Public License for more details.
  939. + *
  940. + * You should have received a copy of the GNU Lesser General Public
  941. + * License along with Libav; if not, write to the Free Software
  942. + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  943. + */
  944. +
  945. +#include "dsputil_mmx.h"
  946. +
  947. +#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \
  948. +void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT \
  949. +    (uint8_t *dst, uint8_t *src, int stride); /* prototype for one qpel MC function (implemented in asm or by the QPEL8/QPEL16 wrappers below) */
  950. +
  951. +#define LUMA_MC_ALL(DEPTH, TYPE, OPT) \
  952. +    LUMA_MC_OP(put,  4, DEPTH, TYPE, OPT) \
  953. +    LUMA_MC_OP(avg,  4, DEPTH, TYPE, OPT) \
  954. +    LUMA_MC_OP(put,  8, DEPTH, TYPE, OPT) \
  955. +    LUMA_MC_OP(avg,  8, DEPTH, TYPE, OPT) \
  956. +    LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
  957. +    LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT) /* put+avg prototypes for all three block sizes (4/8/16) */
  958. +
  959. +#define LUMA_MC_816(DEPTH, TYPE, OPT) \
  960. +    LUMA_MC_OP(put,  8, DEPTH, TYPE, OPT) \
  961. +    LUMA_MC_OP(avg,  8, DEPTH, TYPE, OPT) \
  962. +    LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
  963. +    LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT) /* 8/16 only -- used for the sse2 variants, which have no 4-wide version */
  964. +
  965. +LUMA_MC_ALL(10, mc00, mmxext) /* mmxext: declare 4/8/16 put+avg for all 16 quarter-pel positions */
  966. +LUMA_MC_ALL(10, mc10, mmxext)
  967. +LUMA_MC_ALL(10, mc20, mmxext)
  968. +LUMA_MC_ALL(10, mc30, mmxext)
  969. +LUMA_MC_ALL(10, mc01, mmxext)
  970. +LUMA_MC_ALL(10, mc11, mmxext)
  971. +LUMA_MC_ALL(10, mc21, mmxext)
  972. +LUMA_MC_ALL(10, mc31, mmxext)
  973. +LUMA_MC_ALL(10, mc02, mmxext)
  974. +LUMA_MC_ALL(10, mc12, mmxext)
  975. +LUMA_MC_ALL(10, mc22, mmxext)
  976. +LUMA_MC_ALL(10, mc32, mmxext)
  977. +LUMA_MC_ALL(10, mc03, mmxext)
  978. +LUMA_MC_ALL(10, mc13, mmxext)
  979. +LUMA_MC_ALL(10, mc23, mmxext)
  980. +LUMA_MC_ALL(10, mc33, mmxext)
  981. +
  982. +LUMA_MC_816(10, mc00, sse2) /* sse2: 8/16 put+avg for all 16 positions */
  983. +LUMA_MC_816(10, mc10, sse2)
  984. +LUMA_MC_816(10, mc20, sse2)
  985. +LUMA_MC_816(10, mc30, sse2)
  986. +LUMA_MC_816(10, mc01, sse2)
  987. +LUMA_MC_816(10, mc11, sse2)
  988. +LUMA_MC_816(10, mc21, sse2)
  989. +LUMA_MC_816(10, mc31, sse2)
  990. +LUMA_MC_816(10, mc02, sse2)
  991. +LUMA_MC_816(10, mc12, sse2)
  992. +LUMA_MC_816(10, mc22, sse2)
  993. +LUMA_MC_816(10, mc32, sse2)
  994. +LUMA_MC_816(10, mc03, sse2)
  995. +LUMA_MC_816(10, mc13, sse2)
  996. +LUMA_MC_816(10, mc23, sse2)
  997. +LUMA_MC_816(10, mc33, sse2)
  998. +
  999. +#define QPEL8_OPMC(OP, MC, MMX)\
  1000. +void ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1001. +    ff_ ## OP ## _h264_qpel4_ ## MC ## _10_ ## MMX(dst  , src  , stride);\
  1002. +    ff_ ## OP ## _h264_qpel4_ ## MC ## _10_ ## MMX(dst+8, src+8, stride);\
  1003. +    src += 4*stride;\
  1004. +    dst += 4*stride;\
  1005. +    ff_ ## OP ## _h264_qpel4_ ## MC ## _10_ ## MMX(dst  , src  , stride);\
  1006. +    ff_ ## OP ## _h264_qpel4_ ## MC ## _10_ ## MMX(dst+8, src+8, stride);\
  1007. +} /* 8x8 from four 4x4 calls; +8 bytes == 4 pixels at 10 bit, stride is in bytes */
  1008. +
  1009. +#define QPEL8_OP(MC, MMX)\
  1010. +QPEL8_OPMC(put, MC, MMX)\
  1011. +QPEL8_OPMC(avg, MC, MMX) /* put + avg pair for one quarter-pel position */
  1012. +
  1013. +#define QPEL8(MMX)\
  1014. +QPEL8_OP(mc00, MMX)\
  1015. +QPEL8_OP(mc01, MMX)\
  1016. +QPEL8_OP(mc02, MMX)\
  1017. +QPEL8_OP(mc03, MMX)\
  1018. +QPEL8_OP(mc10, MMX)\
  1019. +QPEL8_OP(mc11, MMX)\
  1020. +QPEL8_OP(mc12, MMX)\
  1021. +QPEL8_OP(mc13, MMX)\
  1022. +QPEL8_OP(mc20, MMX)\
  1023. +QPEL8_OP(mc21, MMX)\
  1024. +QPEL8_OP(mc22, MMX)\
  1025. +QPEL8_OP(mc23, MMX)\
  1026. +QPEL8_OP(mc30, MMX)\
  1027. +QPEL8_OP(mc31, MMX)\
  1028. +QPEL8_OP(mc32, MMX)\
  1029. +QPEL8_OP(mc33, MMX) /* 8x8 wrappers for all 16 quarter-pel positions, mc00 included */
  1030. +
  1031. +#define QPEL16_OPMC(OP, MC, MMX)\
  1032. +void ff_ ## OP ## _h264_qpel16_ ## MC ## _10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1033. +    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst   , src   , stride);\
  1034. +    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
  1035. +    src += 8*stride;\
  1036. +    dst += 8*stride;\
  1037. +    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst   , src   , stride);\
  1038. +    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
  1039. +} /* 16x16 from four 8x8 calls; +16 bytes == 8 pixels at 10 bit, stride is in bytes */
  1040. +
  1041. +#define QPEL16_OP(MC, MMX)\
  1042. +QPEL16_OPMC(put, MC, MMX)\
  1043. +QPEL16_OPMC(avg, MC, MMX) /* put + avg pair for one quarter-pel position */
  1044. +
  1045. +#define QPEL16(MMX)\
  1046. +QPEL16_OP(mc01, MMX)\
  1047. +QPEL16_OP(mc02, MMX)\
  1048. +QPEL16_OP(mc03, MMX)\
  1049. +QPEL16_OP(mc10, MMX)\
  1050. +QPEL16_OP(mc11, MMX)\
  1051. +QPEL16_OP(mc12, MMX)\
  1052. +QPEL16_OP(mc13, MMX)\
  1053. +QPEL16_OP(mc20, MMX)\
  1054. +QPEL16_OP(mc21, MMX)\
  1055. +QPEL16_OP(mc22, MMX)\
  1056. +QPEL16_OP(mc23, MMX)\
  1057. +QPEL16_OP(mc30, MMX)\
  1058. +QPEL16_OP(mc31, MMX)\
  1059. +QPEL16_OP(mc32, MMX)\
  1060. +QPEL16_OP(mc33, MMX) /* 15 positions only: mc00 is instantiated separately below */
  1061. +
  1062. +QPEL8(mmxext)           /* 8x8 mmxext built from the 4x4 asm functions */
  1063. +QPEL16_OP(mc00, mmxext) /* mc00 for mmxext, since QPEL16 omits it */
  1064. +QPEL16(mmxext)
  1065. +QPEL16(sse2)            /* 16x16 sse2 built from the qpel8 sse2 functions declared above */
  1066. --
  1067. 1.7.5.1
  1068.  
  1069.  
  1070. From 970a7a164d0ae110821c174a7b9081f93172c095 Mon Sep 17 00:00:00 2001
  1071. From: Daniel Kang <daniel.d.kang@gmail.com>
  1072. Date: Sat, 25 Jun 2011 13:28:49 -0400
  1073. Subject: [PATCH 2/2] improvement?
  1074.  
  1075. ---
  1076. libavcodec/x86/h264_qpel_10bit.asm |   40 ++++++++++++++++++++---------------
  1077.  1 files changed, 23 insertions(+), 17 deletions(-)
  1078.  
  1079. diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm
  1080. index fc44e85..bc0e78d 100644
  1081. --- a/libavcodec/x86/h264_qpel_10bit.asm
  1082. +++ b/libavcodec/x86/h264_qpel_10bit.asm
  1083. @@ -620,7 +620,7 @@ MC22 sse2  , avg, 8
  1084.  ; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
  1085.  ;-----------------------------------------------------------------------------
  1086.  %macro MC12 3
  1087. -cglobal %2_h264_qpel%3_mc12_10_%1, 3,7,10
  1088. +cglobal %2_h264_qpel%3_mc12_10_%1, 3,7,12
  1089.      mov      r6, rsp          ; backup stack pointer
  1090.      and     rsp, ~(mmsize-1)  ; align stack
  1091.      sub     rsp, 4096         ; TODO: calculate this correctly
  1092. @@ -630,16 +630,22 @@ cglobal %2_h264_qpel%3_mc12_10_%1, 3,7,10
  1093.      xor        r4, r4
  1094.  .body
  1095.      mov       r3d, %3
  1096. -    mova       m0, [tap1]
  1097. -    mova       m7, [tap3]
  1098. +    pxor       m0, m0
  1099. +    mova       m7, [pw_pixel_max]
  1100.  %if num_mmregs > 8
  1101. -    mova       m8, [tap2]
  1102. -    mova       m9, [depad]
  1103. +    mova       m8, [tap1]
  1104. +    mova       m9, [tap2]
  1105. +    mova      m10, [tap3]
  1106. +    mova      m11, [depad]
  1107.      %define s1 m8
  1108.      %define s2 m9
  1109. +    %define s3 m10
  1110. +    %define d1 m11
  1111.  %else
  1112. -    %define s1 [tap2]
  1113. -    %define s2 [depad]
  1114. +    %define s1 [tap1]
  1115. +    %define s2 [tap2]
  1116. +    %define s3 [tap3]
  1117. +    %define d1 [depad]
  1118.  %endif
  1119.  .h_loop:
  1120.      movu       m1, [rsp+mmsize-4]
  1121. @@ -648,14 +654,14 @@ cglobal %2_h264_qpel%3_mc12_10_%1, 3,7,10
  1122.      movu       m4, [rsp+mmsize+2]
  1123.      movu       m5, [rsp+mmsize+4]
  1124.      movu       m6, [rsp+mmsize+6]
  1125. -    pmaddwd    m1, m0
  1126. -    pmaddwd    m2, m0
  1127. -    pmaddwd    m3, s1
  1128. -    pmaddwd    m4, s1
  1129. -    pmaddwd    m5, m7
  1130. -    pmaddwd    m6, m7
  1131. -    paddd      m1, s2
  1132. -    paddd      m2, s2
  1133. +    pmaddwd    m1, s1
  1134. +    pmaddwd    m2, s1
  1135. +    pmaddwd    m3, s2
  1136. +    pmaddwd    m4, s2
  1137. +    pmaddwd    m5, s3
  1138. +    pmaddwd    m6, s3
  1139. +    paddd      m1, d1
  1140. +    paddd      m2, d1
  1141.      paddd      m3, m5
  1142.      paddd      m4, m6
  1143.      paddd      m1, m3
  1144. @@ -665,13 +671,13 @@ cglobal %2_h264_qpel%3_mc12_10_%1, 3,7,10
  1145.      pslld      m2, 16
  1146.      pand       m1, [pd_0f]
  1147.      por        m1, m2
  1148. -    CLIPW      m1, [pw_0], [pw_pixel_max]
  1149. +    CLIPW      m1, m0, m7
  1150.  
  1151.      movu       m3, [rsp+r4+mmsize] ; movu needed for mc32
  1152.      paddw      m3, [depad2]
  1153.      psrlw      m3, 5
  1154.      psubw      m3, [unpad]
  1155. -    CLIPW      m3, [pw_0], [pw_pixel_max]
  1156. +    CLIPW      m3, m0, m7
  1157.      pavgw      m1, m3
  1158.  
  1159.      OP_MOV   [r0], m1
  1160. --
  1161. 1.7.5.1
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement