Advertisement
Guest User

Untitled

a guest
Sep 20th, 2017
412
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Diff 58.20 KB | None | 0 0
  1. From 489134cf2343edbb65bb17d2ed99dc2c114657bf Mon Sep 17 00:00:00 2001
  2. From: Daniel Kang <daniel.d.kang@gmail.com>
  3. Date: Wed, 22 Jun 2011 17:40:50 -0400
  4. Subject: [PATCH 1/5] luma mc first pass done \o/
  5.  
  6. ---
  7. libavcodec/x86/Makefile              |    1 +
  8.  libavcodec/x86/dsputil_mmx.c         |   47 ++
  9.  libavcodec/x86/h264_qpel_10bit.asm   |  813 ++++++++++++++++++++++++++++++++++
  10.  libavcodec/x86/h264_qpel_mmx_10bit.c |  141 ++++++
  11.  4 files changed, 1002 insertions(+), 0 deletions(-)
  12.  create mode 100644 libavcodec/x86/h264_qpel_10bit.asm
  13.  create mode 100755 libavcodec/x86/h264_qpel_mmx_10bit.c
  14.  
  15. diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
  16. index 022ab27..d3cf0da 100644
  17. --- a/libavcodec/x86/Makefile
  18. +++ b/libavcodec/x86/Makefile
  19. @@ -46,6 +46,7 @@ MMX-OBJS-$(HAVE_YASM)                  += x86/dsputil_yasm.o            \
  20.                                            x86/fmtconvert.o              \
  21.                                            x86/h264_chromamc.o           \
  22.                                            x86/h264_chromamc_10bit.o     \
  23. +                                          x86/h264_qpel_10bit.o         \
  24.                                            $(YASM-OBJS-yes)
  25.  
  26.  MMX-OBJS-$(CONFIG_FFT)                 += x86/fft.o
  27. diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
  28. index 5c5ecb2..43ff26a 100644
  29. --- a/libavcodec/x86/dsputil_mmx.c
  30. +++ b/libavcodec/x86/dsputil_mmx.c
  31. @@ -1896,6 +1896,7 @@ PREFETCH(prefetch_3dnow, prefetch)
  32.  #undef PREFETCH
  33.  
  34.  #include "h264_qpel_mmx.c"
  35. +#include "h264_qpel_mmx_10bit.c"
  36.  
  37.  void ff_put_h264_chroma_mc8_mmx_rnd   (uint8_t *dst, uint8_t *src,
  38.                                         int stride, int h, int x, int y);
  39. @@ -2649,6 +2650,33 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
  40.              SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2);
  41.              SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2);
  42.              }
  43. +#if HAVE_YASM
  44. +#define SET_QPEL_FUNCS_10(PFX, IDX, SIZE, CPU) \
  45. +            c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## SIZE ## _mc00_10_ ## CPU; \
  46. +            c->PFX ## _pixels_tab[IDX][ 1] = ff_ ## PFX ## SIZE ## _mc10_10_ ## CPU; \
  47. +            c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## SIZE ## _mc20_10_ ## CPU; \
  48. +            c->PFX ## _pixels_tab[IDX][ 3] = ff_ ## PFX ## SIZE ## _mc30_10_ ## CPU; \
  49. +            c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## SIZE ## _mc01_10_ ## CPU; \
  50. +            c->PFX ## _pixels_tab[IDX][ 5] = ff_ ## PFX ## SIZE ## _mc11_10_ ## CPU; \
  51. +            c->PFX ## _pixels_tab[IDX][ 6] = ff_ ## PFX ## SIZE ## _mc21_10_ ## CPU; \
  52. +            c->PFX ## _pixels_tab[IDX][ 7] = ff_ ## PFX ## SIZE ## _mc31_10_ ## CPU; \
  53. +            c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## SIZE ## _mc02_10_ ## CPU; \
  54. +            c->PFX ## _pixels_tab[IDX][ 9] = ff_ ## PFX ## SIZE ## _mc12_10_ ## CPU; \
  55. +            c->PFX ## _pixels_tab[IDX][10] = ff_ ## PFX ## SIZE ## _mc22_10_ ## CPU; \
  56. +            c->PFX ## _pixels_tab[IDX][11] = ff_ ## PFX ## SIZE ## _mc32_10_ ## CPU; \
  57. +            c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## SIZE ## _mc03_10_ ## CPU; \
  58. +            c->PFX ## _pixels_tab[IDX][13] = ff_ ## PFX ## SIZE ## _mc13_10_ ## CPU; \
  59. +            c->PFX ## _pixels_tab[IDX][14] = ff_ ## PFX ## SIZE ## _mc23_10_ ## CPU; \
  60. +            c->PFX ## _pixels_tab[IDX][15] = ff_ ## PFX ## SIZE ## _mc33_10_ ## CPU
  61. +            else if (bit_depth == 10) {
  62. +                SET_QPEL_FUNCS_10(put_h264_qpel, 0, 16, mmxext);
  63. +                SET_QPEL_FUNCS_10(put_h264_qpel, 1, 8,  mmxext);
  64. +                SET_QPEL_FUNCS_10(put_h264_qpel, 2, 4,  mmxext);
  65. +                SET_QPEL_FUNCS_10(avg_h264_qpel, 0, 16, mmxext);
  66. +                SET_QPEL_FUNCS_10(avg_h264_qpel, 1, 8,  mmxext);
  67. +                SET_QPEL_FUNCS_10(avg_h264_qpel, 2, 4,  mmxext);
  68. +            }
  69. +#endif
  70.  
  71.              SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2);
  72.              SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2);
  73. @@ -2777,7 +2805,26 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
  74.              H264_QPEL_FUNCS(3, 3, sse2);
  75.              }
  76.  #if HAVE_YASM
  77. +#define H264_QPEL_FUNCS_10(x, y, CPU)\
  78. +            c->put_h264_qpel_pixels_tab[0][x+y*4] = ff_put_h264_qpel16_mc##x##y##_10_##CPU;\
  79. +            c->put_h264_qpel_pixels_tab[1][x+y*4] = ff_put_h264_qpel8_mc##x##y##_10_##CPU;\
  80. +            c->avg_h264_qpel_pixels_tab[0][x+y*4] = ff_avg_h264_qpel16_mc##x##y##_10_##CPU;\
  81. +            c->avg_h264_qpel_pixels_tab[1][x+y*4] = ff_avg_h264_qpel8_mc##x##y##_10_##CPU;
  82.              if (bit_depth == 10) {
  83. +                H264_QPEL_FUNCS_10(0, 0, sse2);
  84. +                H264_QPEL_FUNCS_10(0, 1, sse2);
  85. +                H264_QPEL_FUNCS_10(0, 2, sse2);
  86. +                H264_QPEL_FUNCS_10(0, 3, sse2);
  87. +                H264_QPEL_FUNCS_10(1, 1, sse2);
  88. +                H264_QPEL_FUNCS_10(1, 2, sse2);
  89. +                H264_QPEL_FUNCS_10(1, 3, sse2);
  90. +                H264_QPEL_FUNCS_10(2, 1, sse2);
  91. +                H264_QPEL_FUNCS_10(2, 2, sse2);
  92. +                H264_QPEL_FUNCS_10(2, 3, sse2);
  93. +                H264_QPEL_FUNCS_10(3, 1, sse2);
  94. +                H264_QPEL_FUNCS_10(3, 2, sse2);
  95. +                H264_QPEL_FUNCS_10(3, 3, sse2);
  96. +
  97.                  c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_sse2;
  98.                  c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_sse2;
  99.              }
  100. diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm
  101. new file mode 100644
  102. index 0000000..fc44e85
  103. --- /dev/null
  104. +++ b/libavcodec/x86/h264_qpel_10bit.asm
  105. @@ -0,0 +1,813 @@
  106. +;*****************************************************************************
  107. +;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code
  108. +;*****************************************************************************
  109. +;* Copyright (C) 2005-2011 x264 project
  110. +;*
  111. +;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
  112. +;*
  113. +;* This file is part of Libav.
  114. +;*
  115. +;* Libav is free software; you can redistribute it and/or
  116. +;* modify it under the terms of the GNU Lesser General Public
  117. +;* License as published by the Free Software Foundation; either
  118. +;* version 2.1 of the License, or (at your option) any later version.
  119. +;*
  120. +;* Libav is distributed in the hope that it will be useful,
  121. +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  122. +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  123. +;* Lesser General Public License for more details.
  124. +;*
  125. +;* You should have received a copy of the GNU Lesser General Public
  126. +;* License along with Libav; if not, write to the Free Software
  127. +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  128. +;******************************************************************************
  129. +
  130. +%include "x86inc.asm"
  131. +%include "x86util.asm"
  132. +
  133. +SECTION_RODATA 32
  134. +
  135. +cextern pw_16
  136. +cextern pw_1
  137. +cextern pb_0
  138. +
  139. +pw_pixel_max: times 8 dw ((1 << 10)-1)
  140. +
  141. +pad10: times 8 dw 10*1023
  142. +pad20: times 8 dw 20*1023
  143. +pad30: times 8 dw 30*1023
  144. +depad: times 4 dd 32*20*1023 + 512
  145. +depad2: times 8 dw 20*1023 + 16*1022 + 16
  146. +unpad: times 8 dw 16*1022/32 ; needs to be mod 16
  147. +
  148. +tap1: times 4 dw  1, -5
  149. +tap2: times 4 dw 20, 20
  150. +tap3: times 4 dw -5,  1
  151. +pd_0f: times 4 dd 0xffff
  152. +
  153. +SECTION .text
  154. +
  155. +; All of  the 2x2 functions are probably no faster than the C version.
  156. +
  157. +;-----------------------------------------------------------------------------
  158. +; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
  159. +;-----------------------------------------------------------------------------
  160. +%macro MC00 1
  161. +INIT_MMX
  162. +cglobal %1_h264_qpel4_mc00_10_mmxext, 3,4
  163. +    lea           r3, [r2*3]
  164. +    movq          m0, [r1     ]
  165. +    OP_MOV [r0     ], m0
  166. +    movq          m0, [r1+r2  ]
  167. +    OP_MOV [r0+r2  ], m0
  168. +    movq          m0, [r1+r2*2]
  169. +    OP_MOV [r0+r2*2], m0
  170. +    movq          m0, [r1+r3  ]
  171. +    OP_MOV [r0+r3  ], m0
  172. +    RET
  173. +
  174. +INIT_XMM
  175. +cglobal %1_h264_qpel8_mc00_10_sse2, 3,3
  176. +%rep 4
  177. +    movu        m0, [r1   ]
  178. +    OP_MOV [r0   ], m0
  179. +    movu        m0, [r1+r2]
  180. +    OP_MOV [r0+r2], m0
  181. +    lea         r0, [r0+r2*2]
  182. +    lea         r1, [r1+r2*2]
  183. +%endrep
  184. +    RET
  185. +
  186. +cglobal %1_h264_qpel16_mc00_10_sse2, 3,3
  187. +%rep 8
  188. +    movu           m0, [r1      ]
  189. +    movu           m1, [r1   +16]
  190. +    OP_MOV [r0      ], m0
  191. +    OP_MOV [r0   +16], m1
  192. +    movu           m0, [r1+r2   ]
  193. +    movu           m1, [r1+r2+16]
  194. +    OP_MOV [r0+r2   ], m0
  195. +    OP_MOV [r0+r2+16], m1
  196. +    lea            r0, [r0+r2*2]
  197. +    lea            r1, [r1+r2*2]
  198. +%endrep
  199. +    RET
  200. +%endmacro
  201. +
  202. +%macro AVG_MOV 2
  203. +    pavgw %2, %1
  204. +    mova  %1, %2
  205. +%endmacro
  206. +
  207. +%define OP_MOV mova
  208. +MC00 put
  209. +
  210. +%define OP_MOV AVG_MOV
  211. +MC00 avg
  212. +
  213. +;-----------------------------------------------------------------------------
  214. +; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
  215. +;-----------------------------------------------------------------------------
  216. +%macro FILT_H 4
  217. +    paddw  %1, %4
  218. +    psubw  %1, %2  ; a-b
  219. +    psraw  %1, 2   ; (a-b)/4
  220. +    psubw  %1, %2  ; (a-b)/4-b
  221. +    paddw  %1, %3  ; (a-b)/4-b+c
  222. +    psraw  %1, 2   ; ((a-b)/4-b+c)/4
  223. +    paddw  %1, %3  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
  224. +%endmacro
  225. +
  226. +%macro ADDW 3
  227. +%if mmsize == 8
  228. +    paddw %1, %2
  229. +%else
  230. +    movu  %3, %2
  231. +    paddw %1, %3
  232. +%endif
  233. +%endmacro
  234. +
  235. +%macro MC20 3
  236. +cglobal %2_h264_qpel%3_mc20_10_%1, 3,4,7
  237. +    mov     r3d, %3
  238. +    pxor     m0, m0
  239. +    mova     m1, [pw_pixel_max]
  240. +    mova     m6, [pw_16]
  241. +.nextrow
  242. +    movu     m2, [r1-4]
  243. +    movu     m3, [r1-2]
  244. +    movu     m4, [r1+0]
  245. +    ADDW     m4, [r1+2], m5
  246. +    ADDW     m3, [r1+4], m5
  247. +    ADDW     m2, [r1+6], m5
  248. +
  249. +    FILT_H   m2, m3, m4, m6
  250. +    psraw    m2, 1
  251. +    CLIPW    m2, m0, m1
  252. +    OP_MOV [r0], m2
  253. +    add      r0, r2
  254. +    add      r1, r2
  255. +    dec     r3d
  256. +    jg .nextrow
  257. +    REP_RET
  258. +%endmacro
  259. +
  260. +%define OP_MOV mova
  261. +INIT_MMX
  262. +MC20 mmxext, put, 4
  263. +INIT_XMM
  264. +MC20 sse2  , put, 8
  265. +
  266. +%define OP_MOV AVG_MOV
  267. +INIT_MMX
  268. +MC20 mmxext, avg, 4
  269. +INIT_XMM
  270. +MC20 sse2  , avg, 8
  271. +
  272. +;-----------------------------------------------------------------------------
  273. +; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
  274. +;-----------------------------------------------------------------------------
  275. +%macro MC30 3
  276. +cglobal %2_h264_qpel%3_mc30_10_%1, 3,5,6
  277. +    lea r4, [r1+2]
  278. +    jmp mangle(ff_%2_h264_qpel%3_mc10_10_%1.body)
  279. +%endmacro
  280. +
  281. +%define OP_MOV mova
  282. +INIT_MMX
  283. +MC30 mmxext, put, 4
  284. +INIT_XMM
  285. +MC30 sse2  , put, 8
  286. +
  287. +%define OP_MOV AVG_MOV
  288. +INIT_MMX
  289. +MC30 mmxext, avg, 4
  290. +INIT_XMM
  291. +MC30 sse2  , avg, 8
  292. +
  293. +;-----------------------------------------------------------------------------
  294. +; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
  295. +;-----------------------------------------------------------------------------
  296. +%macro MC10 3
  297. +cglobal %2_h264_qpel%3_mc10_10_%1, 3,5,7
  298. +    mov      r4, r1
  299. +.body
  300. +    mov     r3d, %3
  301. +    pxor     m0, m0
  302. +    mova     m1, [pw_pixel_max]
  303. +    mova     m6, [pw_16]
  304. +.nextrow
  305. +    movu     m2, [r1-4]
  306. +    movu     m3, [r1-2]
  307. +    movu     m4, [r1+0]
  308. +    ADDW     m4, [r1+2], m5
  309. +    ADDW     m3, [r1+4], m5
  310. +    ADDW     m2, [r1+6], m5
  311. +
  312. +    FILT_H   m2, m3, m4, m6
  313. +    psraw    m2, 1
  314. +    CLIPW    m2, m0, m1
  315. +    movu     m3, [r4]
  316. +    pavgw    m2, m3
  317. +    OP_MOV [r0], m2
  318. +    add      r0, r2
  319. +    add      r1, r2
  320. +    add      r4, r2
  321. +    dec     r3d
  322. +    jg .nextrow
  323. +    REP_RET
  324. +%endmacro
  325. +
  326. +%define OP_MOV mova
  327. +INIT_MMX
  328. +MC10 mmxext, put, 4
  329. +INIT_XMM
  330. +MC10 sse2  , put, 8
  331. +
  332. +%define OP_MOV AVG_MOV
  333. +INIT_MMX
  334. +MC10 mmxext, avg, 4
  335. +INIT_XMM
  336. +MC10 sse2  , avg, 8
  337. +
  338. +;-----------------------------------------------------------------------------
  339. +; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
  340. +;-----------------------------------------------------------------------------
  341. +%macro FILT_V 8
  342. +    movu     %6, [r1]
  343. +    paddw    %1, %6
  344. +    mova     %7, %2
  345. +    paddw    %7, %5
  346. +    mova     %8, %3
  347. +    paddw    %8, %4
  348. +    FILT_H   %1, %7, %8, [pw_16]
  349. +    psraw    %1, 1
  350. +    CLIPW    %1, [pb_0], [pw_pixel_max]
  351. +%endmacro
  352. +
  353. +%macro MC02 3
  354. +cglobal %2_h264_qpel%3_mc02_10_%1, 3,4,8
  355. +    lea      r3, [r2*2]
  356. +    sub      r1, r3
  357. +    movu     m0, [r1]
  358. +    movu     m1, [r1+r2]
  359. +    add      r1, r3
  360. +    movu     m2, [r1]
  361. +    movu     m3, [r1+r2]
  362. +    add      r1, r3
  363. +    movu     m4, [r1]
  364. +    add      r1, r2
  365. +
  366. +%rep %3-1
  367. +    FILT_V   m0, m1, m2, m3, m4, m5, m6, m7
  368. +    OP_MOV [r0], m0
  369. +    add      r1, r2
  370. +    add      r0, r2
  371. +    SWAP 0,1,2,3,4,5
  372. +%endrep
  373. +    FILT_V   m0, m1, m2, m3, m4, m5, m6, m7
  374. +    OP_MOV [r0], m0
  375. +    RET
  376. +%endmacro
  377. +
  378. +%define OP_MOV mova
  379. +INIT_MMX
  380. +MC02 mmxext, put, 4
  381. +INIT_XMM
  382. +MC02 sse2  , put, 8
  383. +
  384. +%define OP_MOV AVG_MOV
  385. +INIT_MMX
  386. +MC02 mmxext, avg, 4
  387. +INIT_XMM
  388. +MC02 sse2  , avg, 8
  389. +
  390. +;-----------------------------------------------------------------------------
  391. +; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
  392. +;-----------------------------------------------------------------------------
  393. +%macro MC01 3
  394. +cglobal %2_h264_qpel%3_mc01_10_%1, 3,5,8
  395. +    mov      r4, r1
  396. +.body
  397. +    lea      r3, [r2*2]
  398. +    sub      r1, r3
  399. +    movu     m0, [r1]
  400. +    movu     m1, [r1+r2]
  401. +    add      r1, r3
  402. +    movu     m2, [r1]
  403. +    movu     m3, [r1+r2]
  404. +    add      r1, r3
  405. +    movu     m4, [r1]
  406. +    add      r1, r2
  407. +
  408. +%rep %3-1
  409. +    FILT_V   m0, m1, m2, m3, m4, m5, m6, m7
  410. +    movu     m7, [r4]
  411. +    pavgw    m0, m7
  412. +    OP_MOV [r0], m0
  413. +    add      r4, r2
  414. +    add      r1, r2
  415. +    add      r0, r2
  416. +    SWAP 0,1,2,3,4,5
  417. +%endrep
  418. +    FILT_V   m0, m1, m2, m3, m4, m5, m6, m7
  419. +    movu     m7, [r4]
  420. +    pavgw    m0, m7
  421. +    OP_MOV [r0], m0
  422. +    RET
  423. +%endmacro
  424. +
  425. +%define OP_MOV mova
  426. +INIT_MMX
  427. +MC01 mmxext, put, 4
  428. +INIT_XMM
  429. +MC01 sse2  , put, 8
  430. +
  431. +%define OP_MOV AVG_MOV
  432. +INIT_MMX
  433. +MC01 mmxext, avg, 4
  434. +INIT_XMM
  435. +MC01 sse2  , avg, 8
  436. +
  437. +;-----------------------------------------------------------------------------
  438. +; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
  439. +;-----------------------------------------------------------------------------
  440. +%macro MC03 3
  441. +cglobal %2_h264_qpel%3_mc03_10_%1, 3,5,8
  442. +    lea r4, [r1+r2]
  443. +    jmp mangle(ff_%2_h264_qpel%3_mc01_10_%1.body)
  444. +%endmacro
  445. +
  446. +%define OP_MOV mova
  447. +INIT_MMX
  448. +MC03 mmxext, put, 4
  449. +INIT_XMM
  450. +MC03 sse2  , put, 8
  451. +
  452. +%define OP_MOV AVG_MOV
  453. +INIT_MMX
  454. +MC03 mmxext, avg, 4
  455. +INIT_XMM
  456. +MC03 sse2  , avg, 8
  457. +
  458. +;-----------------------------------------------------------------------------
  459. +; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
  460. +;-----------------------------------------------------------------------------
  461. +%macro MC11 3
  462. +; this REALLY needs x86_64
  463. +cglobal %2_h264_qpel%3_mc11_10_%1, 3,5,8
  464. +    mov      r4, r1
  465. +.body
  466. +    lea      r3, [r2*2]
  467. +    sub      r1, r3
  468. +    movu     m0, [r1]
  469. +    movu     m1, [r1+r2]
  470. +    add      r1, r3
  471. +    movu     m2, [r1]
  472. +    movu     m3, [r1+r2]
  473. +    add      r1, r3
  474. +    movu     m4, [r1]
  475. +    add      r1, r2
  476. +
  477. +%assign i 0
  478. +%rep %3
  479. +%assign i i+1
  480. +    FILT_V   m0, m1, m2, m3, m4, m5, m6, m7
  481. +;now do FILT_H with fewer registers. probably faster than doing FILT_V then FILT_H
  482. +;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used next in the next iteration
  483. +;unfortunately I need three registers, so m5 will have to be re-read from memory
  484. +    movu     m5, [r4-4]
  485. +    ADDW     m5, [r4+6], m7
  486. +    movu     m6, [r4-2]
  487. +    ADDW     m6, [r4+4], m7
  488. +    paddw    m5, [pw_16]
  489. +    psubw    m5, m6  ; a-b
  490. +    psraw    m5, 2   ; (a-b)/4
  491. +    psubw    m5, m6  ; (a-b)/4-b
  492. +;now I need to load c...
  493. +    movu     m6, [r4+0]
  494. +    ADDW     m6, [r4+2], m7
  495. +    paddw    m5, m6  ; (a-b)/4-b+c
  496. +    psraw    m5, 2   ; ((a-b)/4-b+c)/4
  497. +    paddw    m5, m6  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
  498. +    psraw    m5, 1
  499. +    CLIPW    m5, [pb_0], [pw_pixel_max]
  500. +;avg FILT_V, FILT_H and reload m5
  501. +    pavgw    m0, m5
  502. +    OP_MOV [r0], m0
  503. +%if i<%3
  504. +    movu     m5, [r1]
  505. +    add      r4, r2
  506. +    add      r1, r2
  507. +    add      r0, r2
  508. +    SWAP 0,1,2,3,4,5
  509. +%endif
  510. +%endrep
  511. +    RET
  512. +%endmacro
  513. +
  514. +%define OP_MOV mova
  515. +INIT_MMX
  516. +MC11 mmxext, put, 4
  517. +INIT_XMM
  518. +MC11 sse2  , put, 8
  519. +
  520. +%define OP_MOV AVG_MOV
  521. +INIT_MMX
  522. +MC11 mmxext, avg, 4
  523. +INIT_XMM
  524. +MC11 sse2  , avg, 8
  525. +
  526. +;-----------------------------------------------------------------------------
  527. +; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
  528. +;-----------------------------------------------------------------------------
  529. +%macro MC31 3
  530. +cglobal %2_h264_qpel%3_mc31_10_%1, 3,5,8
  531. +    mov r4, r1
  532. +    add r1, 2
  533. +    jmp mangle(ff_%2_h264_qpel%3_mc11_10_%1.body)
  534. +%endmacro
  535. +
  536. +%define OP_MOV mova
  537. +INIT_MMX
  538. +MC31 mmxext, put, 4
  539. +INIT_XMM
  540. +MC31 sse2  , put, 8
  541. +
  542. +%define OP_MOV AVG_MOV
  543. +INIT_MMX
  544. +MC31 mmxext, avg, 4
  545. +INIT_XMM
  546. +MC31 sse2  , avg, 8
  547. +
  548. +;-----------------------------------------------------------------------------
  549. +; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
  550. +;-----------------------------------------------------------------------------
  551. +%macro MC13 3
  552. +cglobal %2_h264_qpel%3_mc13_10_%1, 3,5,8
  553. +    lea r4, [r1+r2]
  554. +    jmp mangle(ff_%2_h264_qpel%3_mc11_10_%1.body)
  555. +%endmacro
  556. +
  557. +%define OP_MOV mova
  558. +INIT_MMX
  559. +MC13 mmxext, put, 4
  560. +INIT_XMM
  561. +MC13 sse2  , put, 8
  562. +
  563. +%define OP_MOV AVG_MOV
  564. +INIT_MMX
  565. +MC13 mmxext, avg, 4
  566. +INIT_XMM
  567. +MC13 sse2  , avg, 8
  568. +
  569. +;-----------------------------------------------------------------------------
  570. +; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
  571. +;-----------------------------------------------------------------------------
  572. +%macro MC33 3
  573. +cglobal %2_h264_qpel%3_mc33_10_%1, 3,5,8
  574. +    lea r4, [r1+r2]
  575. +    add r1, 2
  576. +    jmp mangle(ff_%2_h264_qpel%3_mc11_10_%1.body)
  577. +%endmacro
  578. +
  579. +%define OP_MOV mova
  580. +INIT_MMX
  581. +MC33 mmxext, put, 4
  582. +INIT_XMM
  583. +MC33 sse2  , put, 8
  584. +
  585. +%define OP_MOV AVG_MOV
  586. +INIT_MMX
  587. +MC33 mmxext, avg, 4
  588. +INIT_XMM
  589. +MC33 sse2  , avg, 8
  590. +
  591. +
  592. +
  593. +;-----------------------------------------------------------------------------
  594. +; void h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
  595. +;-----------------------------------------------------------------------------
  596. +%macro FILT_H2 3
  597. +    psubw  %1, %2  ; a-b
  598. +    psubw  %2, %3  ; b-c
  599. +    psllw  %2, 2
  600. +    psubw  %1, %2  ; a-5*b+4*c
  601. +    psllw  %3, 4
  602. +    paddw  %1, %3  ; a-5*b+20*c
  603. +%endmacro
  604. +
  605. +%macro FILT_VNRD 8
  606. +    movu     %6, [r1]
  607. +    paddw    %1, %6
  608. +    mova     %7, %2
  609. +    paddw    %7, %5
  610. +    mova     %8, %3
  611. +    paddw    %8, %4
  612. +    FILT_H2  %1, %7, %8
  613. +%endmacro
  614. +
  615. +%macro MC22 3
  616. +%2_hv%3_10_%1:
  617. +    add     rsp, gprsize
  618. +    neg      r2           ; This actually saves instructions
  619. +    lea      r1, [r1+r2*2]
  620. +    sub      r1, mmsize
  621. +    xor      r4, r4
  622. +    mov      r3, 3
  623. +.v_loop:
  624. +    movu     m0, [r1]
  625. +    sub      r1, r2
  626. +    movu     m1, [r1]
  627. +    sub      r1, r2
  628. +    movu     m2, [r1]
  629. +    sub      r1, r2
  630. +    movu     m3, [r1]
  631. +    sub      r1, r2
  632. +    movu     m4, [r1]
  633. +    sub      r1, r2
  634. +%assign i 0
  635. +%rep %3-1
  636. +    FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
  637. +    psubw    m0, [pad20]
  638. +    mova     [rsp+r4+i*mmsize*3], m0
  639. +    sub      r1, r2
  640. +    SWAP 0,1,2,3,4,5
  641. +%assign i i+1
  642. +%endrep
  643. +    FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
  644. +    psubw    m0, [pad20]
  645. +    mova     [rsp+r4+i*mmsize*3], m0
  646. +    add      r4, mmsize
  647. +    lea      r1, [r1+r2*8+mmsize]
  648. +%if %3==8
  649. +    lea      r1, [r1+r2*4]
  650. +%endif
  651. +    dec      r3
  652. +    jg .v_loop
  653. +    sub     rsp, gprsize
  654. +    neg      r2
  655. +    ret
  656. +
  657. +cglobal %2_h264_qpel%3_mc22_10_%1, 3,7,10
  658. +    mov      r6, rsp          ; backup stack pointer
  659. +    and     rsp, ~(mmsize-1)  ; align stack
  660. +    sub     rsp, 4096         ; TODO: calculate this correctly
  661. +
  662. +    call %2_hv%3_10_%1
  663. +
  664. +    mov        r4, mmsize
  665. +    mov       r3d, %3
  666. +    mova       m0, [tap1]
  667. +    mova       m7, [tap3]
  668. +%if num_mmregs > 8
  669. +    mova       m8, [tap2]
  670. +    mova       m9, [depad]
  671. +    %define s1 m8
  672. +    %define s2 m9
  673. +%else
  674. +    %define s1 [tap2]
  675. +    %define s2 [depad]
  676. +%endif
  677. +.h_loop:
  678. +    movu       m1, [rsp+r4-4]
  679. +    movu       m2, [rsp+r4-2]
  680. +    mova       m3, [rsp+r4+0]
  681. +    movu       m4, [rsp+r4+2]
  682. +    movu       m5, [rsp+r4+4]
  683. +    movu       m6, [rsp+r4+6]
  684. +    pmaddwd    m1, m0
  685. +    pmaddwd    m2, m0
  686. +    pmaddwd    m3, s1
  687. +    pmaddwd    m4, s1
  688. +    pmaddwd    m5, m7
  689. +    pmaddwd    m6, m7
  690. +    paddd      m1, s2
  691. +    paddd      m2, s2
  692. +    paddd      m3, m5
  693. +    paddd      m4, m6
  694. +    paddd      m1, m3
  695. +    paddd      m2, m4
  696. +    psrad      m1, 10
  697. +    psrad      m2, 10
  698. +    pslld      m2, 16
  699. +    pand       m1, [pd_0f]
  700. +    por        m1, m2
  701. +    CLIPW      m1, [pb_0], [pw_pixel_max]
  702. +    OP_MOV   [r0], m1
  703. +    add        r4, mmsize*3
  704. +    add        r0, r2
  705. +    dec       r3d
  706. +    jg .h_loop
  707. +
  708. +    mov     rsp, r6          ; restore stack pointer
  709. +    RET
  710. +%endmacro
  711. +
  712. +%define OP_MOV mova
  713. +INIT_MMX
  714. +MC22 mmxext, put, 4
  715. +INIT_XMM
  716. +MC22 sse2  , put, 8
  717. +
  718. +%define OP_MOV AVG_MOV
  719. +INIT_MMX
  720. +MC22 mmxext, avg, 4
  721. +INIT_XMM
  722. +MC22 sse2  , avg, 8
  723. +
  724. +;-----------------------------------------------------------------------------
  725. +; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
  726. +;-----------------------------------------------------------------------------
  727. +%macro MC12 3
  728. +cglobal %2_h264_qpel%3_mc12_10_%1, 3,7,10
  729. +    mov      r6, rsp          ; backup stack pointer
  730. +    and     rsp, ~(mmsize-1)  ; align stack
  731. +    sub     rsp, 4096         ; TODO: calculate this correctly
  732. +
  733. +    call %2_hv%3_10_%1
  734. +
  735. +    xor        r4, r4
  736. +.body
  737. +    mov       r3d, %3
  738. +    mova       m0, [tap1]
  739. +    mova       m7, [tap3]
  740. +%if num_mmregs > 8
  741. +    mova       m8, [tap2]
  742. +    mova       m9, [depad]
  743. +    %define s1 m8
  744. +    %define s2 m9
  745. +%else
  746. +    %define s1 [tap2]
  747. +    %define s2 [depad]
  748. +%endif
  749. +.h_loop:
  750. +    movu       m1, [rsp+mmsize-4]
  751. +    movu       m2, [rsp+mmsize-2]
  752. +    mova       m3, [rsp+mmsize+0]
  753. +    movu       m4, [rsp+mmsize+2]
  754. +    movu       m5, [rsp+mmsize+4]
  755. +    movu       m6, [rsp+mmsize+6]
  756. +    pmaddwd    m1, m0
  757. +    pmaddwd    m2, m0
  758. +    pmaddwd    m3, s1
  759. +    pmaddwd    m4, s1
  760. +    pmaddwd    m5, m7
  761. +    pmaddwd    m6, m7
  762. +    paddd      m1, s2
  763. +    paddd      m2, s2
  764. +    paddd      m3, m5
  765. +    paddd      m4, m6
  766. +    paddd      m1, m3
  767. +    paddd      m2, m4
  768. +    psrad      m1, 10
  769. +    psrad      m2, 10
  770. +    pslld      m2, 16
  771. +    pand       m1, [pd_0f]
  772. +    por        m1, m2
  773. +    CLIPW      m1, [pw_0], [pw_pixel_max]
  774. +
  775. +    movu       m3, [rsp+r4+mmsize] ; movu needed for mc32
  776. +    paddw      m3, [depad2]
  777. +    psrlw      m3, 5
  778. +    psubw      m3, [unpad]
  779. +    CLIPW      m3, [pw_0], [pw_pixel_max]
  780. +    pavgw      m1, m3
  781. +
  782. +    OP_MOV   [r0], m1
  783. +    add       rsp, mmsize*3
  784. +    add        r0, r2
  785. +    dec       r3d
  786. +    jg .h_loop
  787. +
  788. +    mov     rsp, r6          ; restore stack pointer
  789. +    RET
  790. +%endmacro
  791. +
  792. +%define OP_MOV mova
  793. +INIT_MMX
  794. +MC12 mmxext, put, 4
  795. +INIT_XMM
  796. +MC12 sse2  , put, 8
  797. +
  798. +%define OP_MOV AVG_MOV
  799. +INIT_MMX
  800. +MC12 mmxext, avg, 4
  801. +INIT_XMM
  802. +MC12 sse2  , avg, 8
  803. +
  804. +;-----------------------------------------------------------------------------
  805. +; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
  806. +;-----------------------------------------------------------------------------
  807. +%macro MC32 3
  808. +cglobal %2_h264_qpel%3_mc32_10_%1, 3,7,10
  809. +    mov  r6, rsp          ; backup stack pointer
  810. +    and rsp, ~(mmsize-1)  ; align stack
  811. +    sub rsp, 4096         ; TODO: calculate this correctly
  812. +
  813. +    call %2_hv%3_10_%1
  814. +
  815. +    mov  r4, 2            ; sizeof(pixel)
  816. +    jmp mangle(ff_%2_h264_qpel%3_mc12_10_%1.body)
  817. +%endmacro
  818. +
  819. +%define OP_MOV mova
  820. +INIT_MMX
  821. +MC32 mmxext, put, 4
  822. +INIT_XMM
  823. +MC32 sse2  , put, 8
  824. +
  825. +%define OP_MOV AVG_MOV
  826. +INIT_MMX
  827. +MC32 mmxext, avg, 4
  828. +INIT_XMM
  829. +MC32 sse2  , avg, 8
  830. +
  831. +;-----------------------------------------------------------------------------
  832. +; void h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
  833. +;-----------------------------------------------------------------------------
  834. +%macro MC21 3
  835. +%2_h%3_10_%1:
  836. +    add       rsp, gprsize
  837. +    mov       r3d, %3
  838. +    xor        r4, r4
  839. +    mova       m6, [pad20]
  840. +.nextrow
  841. +    movu       m2, [r5-4]
  842. +    movu       m3, [r5-2]
  843. +    movu       m4, [r5+0]
  844. +    ADDW       m4, [r5+2], m5
  845. +    ADDW       m3, [r5+4], m5
  846. +    ADDW       m2, [r5+6], m5
  847. +
  848. +    FILT_H2    m2, m3, m4
  849. +    psubw      m2, m6
  850. +    mova [rsp+r4], m2
  851. +    add        r4, mmsize*3
  852. +    add        r5, r2
  853. +    dec       r3d
  854. +    jg .nextrow
  855. +    sub       rsp, gprsize
  856. +    ret
  857. +
  858. +cglobal %2_h264_qpel%3_mc21_10_%1, 3,7,10
  859. +    mov   r6, rsp          ; backup stack pointer
  860. +    and  rsp, ~(mmsize-1)  ; align stack
  861. +    sub  rsp, 4096         ; TODO: calculate this correctly
  862. +
  863. +    mov   r5, r1
  864. +    call  %2_hv%3_10_%1
  865. +
  866. +%define PAD mmsize*16*3*2     ; SIZE*16*3*sizeof(pixel)
  867. +    add  rsp, PAD
  868. +    call %2_h%3_10_%1
  869. +    sub  rsp, PAD
  870. +
  871. +    mov  r4, PAD-mmsize            ; H buffer
  872. +    jmp mangle(ff_%2_h264_qpel%3_mc12_10_%1.body)
  873. +%endmacro
  874. +
  875. +%define OP_MOV mova
  876. +INIT_MMX
  877. +MC21 mmxext, put, 4
  878. +INIT_XMM
  879. +MC21 sse2  , put, 8
  880. +
  881. +%define OP_MOV AVG_MOV
  882. +INIT_MMX
  883. +MC21 mmxext, avg, 4
  884. +INIT_XMM
  885. +MC21 sse2  , avg, 8
  886. +
  887. +;-----------------------------------------------------------------------------
  888. +; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
  889. +;-----------------------------------------------------------------------------
  890. +%macro MC23 3
  891. +cglobal %2_h264_qpel%3_mc23_10_%1, 3,7,10
  892. +    mov   r6, rsp          ; backup stack pointer
  893. +    and  rsp, ~(mmsize-1)  ; align stack
  894. +    sub  rsp, 4096         ; TODO: calculate this correctly
  895. +
  896. +    lea   r5, [r1+r2]
  897. +    call  %2_hv%3_10_%1
  898. +
  899. +%define PAD mmsize*16*3*2     ; SIZE*16*3*sizeof(pixel)
  900. +    add  rsp, PAD
  901. +    call %2_h%3_10_%1
  902. +    sub  rsp, PAD
  903. +
  904. +    mov  r4, PAD-mmsize            ; H buffer
  905. +    jmp mangle(ff_%2_h264_qpel%3_mc12_10_%1.body)
  906. +%endmacro
  907. +
  908. +%define OP_MOV mova
  909. +INIT_MMX
  910. +MC23 mmxext, put, 4
  911. +INIT_XMM
  912. +MC23 sse2  , put, 8
  913. +
  914. +%define OP_MOV AVG_MOV
  915. +INIT_MMX
  916. +MC23 mmxext, avg, 4
  917. +INIT_XMM
  918. +MC23 sse2  , avg, 8
  919. diff --git a/libavcodec/x86/h264_qpel_mmx_10bit.c b/libavcodec/x86/h264_qpel_mmx_10bit.c
  920. new file mode 100755
  921. index 0000000..98cf6da
  922. --- /dev/null
  923. +++ b/libavcodec/x86/h264_qpel_mmx_10bit.c
  924. @@ -0,0 +1,141 @@
  925. +/*
  926. + * Copyright (c) 2011 Daniel Kang
  927. + *
  928. + * This file is part of Libav.
  929. + *
  930. + * Libav is free software; you can redistribute it and/or
  931. + * modify it under the terms of the GNU Lesser General Public
  932. + * License as published by the Free Software Foundation; either
  933. + * version 2.1 of the License, or (at your option) any later version.
  934. + *
  935. + * Libav is distributed in the hope that it will be useful,
  936. + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  937. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  938. + * Lesser General Public License for more details.
  939. + *
  940. + * You should have received a copy of the GNU Lesser General Public
  941. + * License along with Libav; if not, write to the Free Software
  942. + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  943. + */
  944. +
  945. +#include "dsputil_mmx.h"
  946. +
  947. +#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \
  948. +void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT \
  949. +    (uint8_t *dst, uint8_t *src, int stride);
  950. +
  951. +#define LUMA_MC_ALL(DEPTH, TYPE, OPT) \
  952. +    LUMA_MC_OP(put,  4, DEPTH, TYPE, OPT) \
  953. +    LUMA_MC_OP(avg,  4, DEPTH, TYPE, OPT) \
  954. +    LUMA_MC_OP(put,  8, DEPTH, TYPE, OPT) \
  955. +    LUMA_MC_OP(avg,  8, DEPTH, TYPE, OPT) \
  956. +    LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
  957. +    LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)
  958. +
  959. +#define LUMA_MC_816(DEPTH, TYPE, OPT) \
  960. +    LUMA_MC_OP(put,  8, DEPTH, TYPE, OPT) \
  961. +    LUMA_MC_OP(avg,  8, DEPTH, TYPE, OPT) \
  962. +    LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
  963. +    LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)
  964. +
  965. +LUMA_MC_ALL(10, mc00, mmxext)
  966. +LUMA_MC_ALL(10, mc10, mmxext)
  967. +LUMA_MC_ALL(10, mc20, mmxext)
  968. +LUMA_MC_ALL(10, mc30, mmxext)
  969. +LUMA_MC_ALL(10, mc01, mmxext)
  970. +LUMA_MC_ALL(10, mc11, mmxext)
  971. +LUMA_MC_ALL(10, mc21, mmxext)
  972. +LUMA_MC_ALL(10, mc31, mmxext)
  973. +LUMA_MC_ALL(10, mc02, mmxext)
  974. +LUMA_MC_ALL(10, mc12, mmxext)
  975. +LUMA_MC_ALL(10, mc22, mmxext)
  976. +LUMA_MC_ALL(10, mc32, mmxext)
  977. +LUMA_MC_ALL(10, mc03, mmxext)
  978. +LUMA_MC_ALL(10, mc13, mmxext)
  979. +LUMA_MC_ALL(10, mc23, mmxext)
  980. +LUMA_MC_ALL(10, mc33, mmxext)
  981. +
  982. +LUMA_MC_816(10, mc00, sse2)
  983. +LUMA_MC_816(10, mc10, sse2)
  984. +LUMA_MC_816(10, mc20, sse2)
  985. +LUMA_MC_816(10, mc30, sse2)
  986. +LUMA_MC_816(10, mc01, sse2)
  987. +LUMA_MC_816(10, mc11, sse2)
  988. +LUMA_MC_816(10, mc21, sse2)
  989. +LUMA_MC_816(10, mc31, sse2)
  990. +LUMA_MC_816(10, mc02, sse2)
  991. +LUMA_MC_816(10, mc12, sse2)
  992. +LUMA_MC_816(10, mc22, sse2)
  993. +LUMA_MC_816(10, mc32, sse2)
  994. +LUMA_MC_816(10, mc03, sse2)
  995. +LUMA_MC_816(10, mc13, sse2)
  996. +LUMA_MC_816(10, mc23, sse2)
  997. +LUMA_MC_816(10, mc33, sse2)
  998. +
  999. +#define QPEL8_OPMC(OP, MC, MMX)\
  1000. +void ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1001. +    ff_ ## OP ## _h264_qpel4_ ## MC ## _10_ ## MMX(dst  , src  , stride);\
  1002. +    ff_ ## OP ## _h264_qpel4_ ## MC ## _10_ ## MMX(dst+8, src+8, stride);\
  1003. +    src += 4*stride;\
  1004. +    dst += 4*stride;\
  1005. +    ff_ ## OP ## _h264_qpel4_ ## MC ## _10_ ## MMX(dst  , src  , stride);\
  1006. +    ff_ ## OP ## _h264_qpel4_ ## MC ## _10_ ## MMX(dst+8, src+8, stride);\
  1007. +}
  1008. +
  1009. +#define QPEL8_OP(MC, MMX)\
  1010. +QPEL8_OPMC(put, MC, MMX)\
  1011. +QPEL8_OPMC(avg, MC, MMX)
  1012. +
  1013. +#define QPEL8(MMX)\
  1014. +QPEL8_OP(mc00, MMX)\
  1015. +QPEL8_OP(mc01, MMX)\
  1016. +QPEL8_OP(mc02, MMX)\
  1017. +QPEL8_OP(mc03, MMX)\
  1018. +QPEL8_OP(mc10, MMX)\
  1019. +QPEL8_OP(mc11, MMX)\
  1020. +QPEL8_OP(mc12, MMX)\
  1021. +QPEL8_OP(mc13, MMX)\
  1022. +QPEL8_OP(mc20, MMX)\
  1023. +QPEL8_OP(mc21, MMX)\
  1024. +QPEL8_OP(mc22, MMX)\
  1025. +QPEL8_OP(mc23, MMX)\
  1026. +QPEL8_OP(mc30, MMX)\
  1027. +QPEL8_OP(mc31, MMX)\
  1028. +QPEL8_OP(mc32, MMX)\
  1029. +QPEL8_OP(mc33, MMX)
  1030. +
  1031. +#define QPEL16_OPMC(OP, MC, MMX)\
  1032. +void ff_ ## OP ## _h264_qpel16_ ## MC ## _10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1033. +    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst   , src   , stride);\
  1034. +    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
  1035. +    src += 8*stride;\
  1036. +    dst += 8*stride;\
  1037. +    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst   , src   , stride);\
  1038. +    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
  1039. +}
  1040. +
  1041. +#define QPEL16_OP(MC, MMX)\
  1042. +QPEL16_OPMC(put, MC, MMX)\
  1043. +QPEL16_OPMC(avg, MC, MMX)
  1044. +
  1045. +#define QPEL16(MMX)\
  1046. +QPEL16_OP(mc01, MMX)\
  1047. +QPEL16_OP(mc02, MMX)\
  1048. +QPEL16_OP(mc03, MMX)\
  1049. +QPEL16_OP(mc10, MMX)\
  1050. +QPEL16_OP(mc11, MMX)\
  1051. +QPEL16_OP(mc12, MMX)\
  1052. +QPEL16_OP(mc13, MMX)\
  1053. +QPEL16_OP(mc20, MMX)\
  1054. +QPEL16_OP(mc21, MMX)\
  1055. +QPEL16_OP(mc22, MMX)\
  1056. +QPEL16_OP(mc23, MMX)\
  1057. +QPEL16_OP(mc30, MMX)\
  1058. +QPEL16_OP(mc31, MMX)\
  1059. +QPEL16_OP(mc32, MMX)\
  1060. +QPEL16_OP(mc33, MMX)
  1061. +
  1062. +QPEL8(mmxext)
  1063. +QPEL16_OP(mc00, mmxext)
  1064. +QPEL16(mmxext)
  1065. +QPEL16(sse2)
  1066. --
  1067. 1.7.5.1
  1068.  
  1069.  
  1070. From 970a7a164d0ae110821c174a7b9081f93172c095 Mon Sep 17 00:00:00 2001
  1071. From: Daniel Kang <daniel.d.kang@gmail.com>
  1072. Date: Sat, 25 Jun 2011 13:28:49 -0400
  1073. Subject: [PATCH 2/5] improvement?
  1074.  
  1075. ---
  1076. libavcodec/x86/h264_qpel_10bit.asm |   40 ++++++++++++++++++++---------------
  1077.  1 files changed, 23 insertions(+), 17 deletions(-)
  1078.  
  1079. diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm
  1080. index fc44e85..bc0e78d 100644
  1081. --- a/libavcodec/x86/h264_qpel_10bit.asm
  1082. +++ b/libavcodec/x86/h264_qpel_10bit.asm
  1083. @@ -620,7 +620,7 @@ MC22 sse2  , avg, 8
  1084.  ; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
  1085.  ;-----------------------------------------------------------------------------
  1086.  %macro MC12 3
  1087. -cglobal %2_h264_qpel%3_mc12_10_%1, 3,7,10
  1088. +cglobal %2_h264_qpel%3_mc12_10_%1, 3,7,12
  1089.      mov      r6, rsp          ; backup stack pointer
  1090.      and     rsp, ~(mmsize-1)  ; align stack
  1091.      sub     rsp, 4096         ; TODO: calculate this correctly
  1092. @@ -630,16 +630,22 @@ cglobal %2_h264_qpel%3_mc12_10_%1, 3,7,10
  1093.      xor        r4, r4
  1094.  .body
  1095.      mov       r3d, %3
  1096. -    mova       m0, [tap1]
  1097. -    mova       m7, [tap3]
  1098. +    pxor       m0, m0
  1099. +    mova       m7, [pw_pixel_max]
  1100.  %if num_mmregs > 8
  1101. -    mova       m8, [tap2]
  1102. -    mova       m9, [depad]
  1103. +    mova       m8, [tap1]
  1104. +    mova       m9, [tap2]
  1105. +    mova      m10, [tap3]
  1106. +    mova      m11, [depad]
  1107.      %define s1 m8
  1108.      %define s2 m9
  1109. +    %define s3 m10
  1110. +    %define s1 m11
  1111.  %else
  1112. -    %define s1 [tap2]
  1113. -    %define s2 [depad]
  1114. +    %define s1 [tap1]
  1115. +    %define s2 [tap2]
  1116. +    %define s3 [tap3]
  1117. +    %define d1 [depad]
  1118.  %endif
  1119.  .h_loop:
  1120.      movu       m1, [rsp+mmsize-4]
  1121. @@ -648,14 +654,14 @@ cglobal %2_h264_qpel%3_mc12_10_%1, 3,7,10
  1122.      movu       m4, [rsp+mmsize+2]
  1123.      movu       m5, [rsp+mmsize+4]
  1124.      movu       m6, [rsp+mmsize+6]
  1125. -    pmaddwd    m1, m0
  1126. -    pmaddwd    m2, m0
  1127. -    pmaddwd    m3, s1
  1128. -    pmaddwd    m4, s1
  1129. -    pmaddwd    m5, m7
  1130. -    pmaddwd    m6, m7
  1131. -    paddd      m1, s2
  1132. -    paddd      m2, s2
  1133. +    pmaddwd    m1, s1
  1134. +    pmaddwd    m2, s1
  1135. +    pmaddwd    m3, s2
  1136. +    pmaddwd    m4, s2
  1137. +    pmaddwd    m5, s3
  1138. +    pmaddwd    m6, s3
  1139. +    paddd      m1, d1
  1140. +    paddd      m2, d1
  1141.      paddd      m3, m5
  1142.      paddd      m4, m6
  1143.      paddd      m1, m3
  1144. @@ -665,13 +671,13 @@ cglobal %2_h264_qpel%3_mc12_10_%1, 3,7,10
  1145.      pslld      m2, 16
  1146.      pand       m1, [pd_0f]
  1147.      por        m1, m2
  1148. -    CLIPW      m1, [pw_0], [pw_pixel_max]
  1149. +    CLIPW      m1, m0, m7
  1150.  
  1151.      movu       m3, [rsp+r4+mmsize] ; movu needed for mc32
  1152.      paddw      m3, [depad2]
  1153.      psrlw      m3, 5
  1154.      psubw      m3, [unpad]
  1155. -    CLIPW      m3, [pw_0], [pw_pixel_max]
  1156. +    CLIPW      m3, m0, m7
  1157.      pavgw      m1, m3
  1158.  
  1159.      OP_MOV   [r0], m1
  1160. --
  1161. 1.7.5.1
  1162.  
  1163.  
  1164. From 6e61dab2e3c479e7c8258368a2025f41074e8bae Mon Sep 17 00:00:00 2001
  1165. From: Daniel Kang <daniel.d.kang@gmail.com>
  1166. Date: Sun, 26 Jun 2011 00:07:08 -0400
  1167. Subject: [PATCH 3/5] pengvado's fixes pt 1
  1168.  
  1169. ---
  1170. libavcodec/x86/h264_qpel_10bit.asm |  354 ++++++++++++------------------------
  1171.  1 files changed, 115 insertions(+), 239 deletions(-)
  1172.  
  1173. diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm
  1174. index bc0e78d..ec17cf6 100644
  1175. --- a/libavcodec/x86/h264_qpel_10bit.asm
  1176. +++ b/libavcodec/x86/h264_qpel_10bit.asm
  1177. @@ -47,7 +47,56 @@ pd_0f: times 4 dd 0xffff
  1178.  
  1179.  SECTION .text
  1180.  
  1181. -; All of  the 2x2 functions are probably no faster than the C version.
  1182. +
  1183. +%macro AVG_MOV 2
  1184. +    pavgw %2, %1
  1185. +    mova  %1, %2
  1186. +%endmacro
  1187. +
  1188. +%macro ADDW 3
  1189. +%if mmsize == 8
  1190. +    paddw %1, %2
  1191. +%else
  1192. +    movu  %3, %2
  1193. +    paddw %1, %3
  1194. +%endif
  1195. +%endmacro
  1196. +
  1197. +%macro FILT_H 4
  1198. +    paddw  %1, %4
  1199. +    psubw  %1, %2  ; a-b
  1200. +    psraw  %1, 2   ; (a-b)/4
  1201. +    psubw  %1, %2  ; (a-b)/4-b
  1202. +    paddw  %1, %3  ; (a-b)/4-b+c
  1203. +    psraw  %1, 2   ; ((a-b)/4-b+c)/4
  1204. +    paddw  %1, %3  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
  1205. +%endmacro
  1206. +
  1207. +%macro FILT_V 8
  1208. +    movu     %6, [r1]
  1209. +    paddw    %1, %6
  1210. +    mova     %7, %2
  1211. +    paddw    %7, %5
  1212. +    mova     %8, %3
  1213. +    paddw    %8, %4
  1214. +    FILT_H   %1, %7, %8, [pw_16]
  1215. +    psraw    %1, 1
  1216. +    CLIPW    %1, [pb_0], [pw_pixel_max]
  1217. +%endmacro
  1218. +
  1219. +%macro MC 1
  1220. +%define OP_MOV mova
  1221. +INIT_MMX
  1222. +%1 mmxext, put, 4
  1223. +INIT_XMM
  1224. +%1 sse2  , put, 8
  1225. +
  1226. +%define OP_MOV AVG_MOV
  1227. +INIT_MMX
  1228. +%1 mmxext, avg, 4
  1229. +INIT_XMM
  1230. +%1 sse2  , avg, 8
  1231. +%endmacro
  1232.  
  1233.  ;-----------------------------------------------------------------------------
  1234.  ; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
  1235. @@ -94,11 +143,6 @@ cglobal %1_h264_qpel16_mc00_10_sse2, 3,3
  1236.      RET
  1237.  %endmacro
  1238.  
  1239. -%macro AVG_MOV 2
  1240. -    pavgw %2, %1
  1241. -    mova  %1, %2
  1242. -%endmacro
  1243. -
  1244.  %define OP_MOV mova
  1245.  MC00 put
  1246.  
  1247. @@ -108,25 +152,6 @@ MC00 avg
  1248.  ;-----------------------------------------------------------------------------
  1249.  ; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
  1250.  ;-----------------------------------------------------------------------------
  1251. -%macro FILT_H 4
  1252. -    paddw  %1, %4
  1253. -    psubw  %1, %2  ; a-b
  1254. -    psraw  %1, 2   ; (a-b)/4
  1255. -    psubw  %1, %2  ; (a-b)/4-b
  1256. -    paddw  %1, %3  ; (a-b)/4-b+c
  1257. -    psraw  %1, 2   ; ((a-b)/4-b+c)/4
  1258. -    paddw  %1, %3  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
  1259. -%endmacro
  1260. -
  1261. -%macro ADDW 3
  1262. -%if mmsize == 8
  1263. -    paddw %1, %2
  1264. -%else
  1265. -    movu  %3, %2
  1266. -    paddw %1, %3
  1267. -%endif
  1268. -%endmacro
  1269. -
  1270.  %macro MC20 3
  1271.  cglobal %2_h264_qpel%3_mc20_10_%1, 3,4,7
  1272.      mov     r3d, %3
  1273. @@ -137,9 +162,9 @@ cglobal %2_h264_qpel%3_mc20_10_%1, 3,4,7
  1274.      movu     m2, [r1-4]
  1275.      movu     m3, [r1-2]
  1276.      movu     m4, [r1+0]
  1277. -    ADDW     m4, [r1+2], m5
  1278. -    ADDW     m3, [r1+4], m5
  1279.      ADDW     m2, [r1+6], m5
  1280. +    ADDW     m3, [r1+4], m5
  1281. +    ADDW     m4, [r1+2], m5
  1282.  
  1283.      FILT_H   m2, m3, m4, m6
  1284.      psraw    m2, 1
  1285. @@ -152,17 +177,7 @@ cglobal %2_h264_qpel%3_mc20_10_%1, 3,4,7
  1286.      REP_RET
  1287.  %endmacro
  1288.  
  1289. -%define OP_MOV mova
  1290. -INIT_MMX
  1291. -MC20 mmxext, put, 4
  1292. -INIT_XMM
  1293. -MC20 sse2  , put, 8
  1294. -
  1295. -%define OP_MOV AVG_MOV
  1296. -INIT_MMX
  1297. -MC20 mmxext, avg, 4
  1298. -INIT_XMM
  1299. -MC20 sse2  , avg, 8
  1300. +MC MC20
  1301.  
  1302.  ;-----------------------------------------------------------------------------
  1303.  ; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
  1304. @@ -173,17 +188,7 @@ cglobal %2_h264_qpel%3_mc30_10_%1, 3,5,6
  1305.      jmp mangle(ff_%2_h264_qpel%3_mc10_10_%1.body)
  1306.  %endmacro
  1307.  
  1308. -%define OP_MOV mova
  1309. -INIT_MMX
  1310. -MC30 mmxext, put, 4
  1311. -INIT_XMM
  1312. -MC30 sse2  , put, 8
  1313. -
  1314. -%define OP_MOV AVG_MOV
  1315. -INIT_MMX
  1316. -MC30 mmxext, avg, 4
  1317. -INIT_XMM
  1318. -MC30 sse2  , avg, 8
  1319. +MC MC30
  1320.  
  1321.  ;-----------------------------------------------------------------------------
  1322.  ; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
  1323. @@ -200,9 +205,9 @@ cglobal %2_h264_qpel%3_mc10_10_%1, 3,5,7
  1324.      movu     m2, [r1-4]
  1325.      movu     m3, [r1-2]
  1326.      movu     m4, [r1+0]
  1327. -    ADDW     m4, [r1+2], m5
  1328. -    ADDW     m3, [r1+4], m5
  1329.      ADDW     m2, [r1+6], m5
  1330. +    ADDW     m3, [r1+4], m5
  1331. +    ADDW     m4, [r1+2], m5
  1332.  
  1333.      FILT_H   m2, m3, m4, m6
  1334.      psraw    m2, 1
  1335. @@ -218,33 +223,11 @@ cglobal %2_h264_qpel%3_mc10_10_%1, 3,5,7
  1336.      REP_RET
  1337.  %endmacro
  1338.  
  1339. -%define OP_MOV mova
  1340. -INIT_MMX
  1341. -MC10 mmxext, put, 4
  1342. -INIT_XMM
  1343. -MC10 sse2  , put, 8
  1344. -
  1345. -%define OP_MOV AVG_MOV
  1346. -INIT_MMX
  1347. -MC10 mmxext, avg, 4
  1348. -INIT_XMM
  1349. -MC10 sse2  , avg, 8
  1350. +MC MC10
  1351.  
  1352.  ;-----------------------------------------------------------------------------
  1353.  ; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
  1354.  ;-----------------------------------------------------------------------------
  1355. -%macro FILT_V 8
  1356. -    movu     %6, [r1]
  1357. -    paddw    %1, %6
  1358. -    mova     %7, %2
  1359. -    paddw    %7, %5
  1360. -    mova     %8, %3
  1361. -    paddw    %8, %4
  1362. -    FILT_H   %1, %7, %8, [pw_16]
  1363. -    psraw    %1, 1
  1364. -    CLIPW    %1, [pb_0], [pw_pixel_max]
  1365. -%endmacro
  1366. -
  1367.  %macro MC02 3
  1368.  cglobal %2_h264_qpel%3_mc02_10_%1, 3,4,8
  1369.      lea      r3, [r2*2]
  1370. @@ -270,17 +253,7 @@ cglobal %2_h264_qpel%3_mc02_10_%1, 3,4,8
  1371.      RET
  1372.  %endmacro
  1373.  
  1374. -%define OP_MOV mova
  1375. -INIT_MMX
  1376. -MC02 mmxext, put, 4
  1377. -INIT_XMM
  1378. -MC02 sse2  , put, 8
  1379. -
  1380. -%define OP_MOV AVG_MOV
  1381. -INIT_MMX
  1382. -MC02 mmxext, avg, 4
  1383. -INIT_XMM
  1384. -MC02 sse2  , avg, 8
  1385. +MC MC02
  1386.  
  1387.  ;-----------------------------------------------------------------------------
  1388.  ; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
  1389. @@ -317,17 +290,7 @@ cglobal %2_h264_qpel%3_mc01_10_%1, 3,5,8
  1390.      RET
  1391.  %endmacro
  1392.  
  1393. -%define OP_MOV mova
  1394. -INIT_MMX
  1395. -MC01 mmxext, put, 4
  1396. -INIT_XMM
  1397. -MC01 sse2  , put, 8
  1398. -
  1399. -%define OP_MOV AVG_MOV
  1400. -INIT_MMX
  1401. -MC01 mmxext, avg, 4
  1402. -INIT_XMM
  1403. -MC01 sse2  , avg, 8
  1404. +MC MC01
  1405.  
  1406.  ;-----------------------------------------------------------------------------
  1407.  ; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
  1408. @@ -338,17 +301,7 @@ cglobal %2_h264_qpel%3_mc03_10_%1, 3,5,8
  1409.      jmp mangle(ff_%2_h264_qpel%3_mc01_10_%1.body)
  1410.  %endmacro
  1411.  
  1412. -%define OP_MOV mova
  1413. -INIT_MMX
  1414. -MC03 mmxext, put, 4
  1415. -INIT_XMM
  1416. -MC03 sse2  , put, 8
  1417. -
  1418. -%define OP_MOV AVG_MOV
  1419. -INIT_MMX
  1420. -MC03 mmxext, avg, 4
  1421. -INIT_XMM
  1422. -MC03 sse2  , avg, 8
  1423. +MC MC03
  1424.  
  1425.  ;-----------------------------------------------------------------------------
  1426.  ; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
  1427. @@ -406,17 +359,7 @@ cglobal %2_h264_qpel%3_mc11_10_%1, 3,5,8
  1428.      RET
  1429.  %endmacro
  1430.  
  1431. -%define OP_MOV mova
  1432. -INIT_MMX
  1433. -MC11 mmxext, put, 4
  1434. -INIT_XMM
  1435. -MC11 sse2  , put, 8
  1436. -
  1437. -%define OP_MOV AVG_MOV
  1438. -INIT_MMX
  1439. -MC11 mmxext, avg, 4
  1440. -INIT_XMM
  1441. -MC11 sse2  , avg, 8
  1442. +MC MC11
  1443.  
  1444.  ;-----------------------------------------------------------------------------
  1445.  ; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
  1446. @@ -428,17 +371,7 @@ cglobal %2_h264_qpel%3_mc31_10_%1, 3,5,8
  1447.      jmp mangle(ff_%2_h264_qpel%3_mc11_10_%1.body)
  1448.  %endmacro
  1449.  
  1450. -%define OP_MOV mova
  1451. -INIT_MMX
  1452. -MC31 mmxext, put, 4
  1453. -INIT_XMM
  1454. -MC31 sse2  , put, 8
  1455. -
  1456. -%define OP_MOV AVG_MOV
  1457. -INIT_MMX
  1458. -MC31 mmxext, avg, 4
  1459. -INIT_XMM
  1460. -MC31 sse2  , avg, 8
  1461. +MC MC31
  1462.  
  1463.  ;-----------------------------------------------------------------------------
  1464.  ; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
  1465. @@ -449,17 +382,7 @@ cglobal %2_h264_qpel%3_mc13_10_%1, 3,5,8
  1466.      jmp mangle(ff_%2_h264_qpel%3_mc11_10_%1.body)
  1467.  %endmacro
  1468.  
  1469. -%define OP_MOV mova
  1470. -INIT_MMX
  1471. -MC13 mmxext, put, 4
  1472. -INIT_XMM
  1473. -MC13 sse2  , put, 8
  1474. -
  1475. -%define OP_MOV AVG_MOV
  1476. -INIT_MMX
  1477. -MC13 mmxext, avg, 4
  1478. -INIT_XMM
  1479. -MC13 sse2  , avg, 8
  1480. +MC MC13
  1481.  
  1482.  ;-----------------------------------------------------------------------------
  1483.  ; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
  1484. @@ -471,19 +394,7 @@ cglobal %2_h264_qpel%3_mc33_10_%1, 3,5,8
  1485.      jmp mangle(ff_%2_h264_qpel%3_mc11_10_%1.body)
  1486.  %endmacro
  1487.  
  1488. -%define OP_MOV mova
  1489. -INIT_MMX
  1490. -MC33 mmxext, put, 4
  1491. -INIT_XMM
  1492. -MC33 sse2  , put, 8
  1493. -
  1494. -%define OP_MOV AVG_MOV
  1495. -INIT_MMX
  1496. -MC33 mmxext, avg, 4
  1497. -INIT_XMM
  1498. -MC33 sse2  , avg, 8
  1499. -
  1500. -
  1501. +MC MC33
  1502.  
  1503.  ;-----------------------------------------------------------------------------
  1504.  ; void h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
  1505. @@ -507,8 +418,8 @@ MC33 sse2  , avg, 8
  1506.      FILT_H2  %1, %7, %8
  1507.  %endmacro
  1508.  
  1509. -%macro MC22 3
  1510. -%2_hv%3_10_%1:
  1511. +%macro HV 2
  1512. +put_hv%2_10_%1:
  1513.      add     rsp, gprsize
  1514.      neg      r2           ; This actually saves instructions
  1515.      lea      r1, [r1+r2*2]
  1516. @@ -527,7 +438,7 @@ MC33 sse2  , avg, 8
  1517.      movu     m4, [r1]
  1518.      sub      r1, r2
  1519.  %assign i 0
  1520. -%rep %3-1
  1521. +%rep %2-1
  1522.      FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
  1523.      psubw    m0, [pad20]
  1524.      mova     [rsp+r4+i*mmsize*3], m0
  1525. @@ -540,7 +451,7 @@ MC33 sse2  , avg, 8
  1526.      mova     [rsp+r4+i*mmsize*3], m0
  1527.      add      r4, mmsize
  1528.      lea      r1, [r1+r2*8+mmsize]
  1529. -%if %3==8
  1530. +%if %2==8
  1531.      lea      r1, [r1+r2*4]
  1532.  %endif
  1533.      dec      r3
  1534. @@ -548,13 +459,21 @@ MC33 sse2  , avg, 8
  1535.      sub     rsp, gprsize
  1536.      neg      r2
  1537.      ret
  1538. +%endmacro
  1539.  
  1540. +INIT_MMX
  1541. +HV mmxext, 4
  1542. +INIT_XMM
  1543. +HV sse2  , 8
  1544. +
  1545. +%macro MC22 3
  1546.  cglobal %2_h264_qpel%3_mc22_10_%1, 3,7,10
  1547. +%define PAD mmsize*8*3*2      ; SIZE*16*4*sizeof(pixel)
  1548.      mov      r6, rsp          ; backup stack pointer
  1549.      and     rsp, ~(mmsize-1)  ; align stack
  1550. -    sub     rsp, 4096         ; TODO: calculate this correctly
  1551. +    sub     rsp, PAD
  1552.  
  1553. -    call %2_hv%3_10_%1
  1554. +    call put_hv%3_10_%1
  1555.  
  1556.      mov        r4, mmsize
  1557.      mov       r3d, %3
  1558. @@ -604,28 +523,19 @@ cglobal %2_h264_qpel%3_mc22_10_%1, 3,7,10
  1559.      RET
  1560.  %endmacro
  1561.  
  1562. -%define OP_MOV mova
  1563. -INIT_MMX
  1564. -MC22 mmxext, put, 4
  1565. -INIT_XMM
  1566. -MC22 sse2  , put, 8
  1567. -
  1568. -%define OP_MOV AVG_MOV
  1569. -INIT_MMX
  1570. -MC22 mmxext, avg, 4
  1571. -INIT_XMM
  1572. -MC22 sse2  , avg, 8
  1573. +MC MC22
  1574.  
  1575.  ;-----------------------------------------------------------------------------
  1576.  ; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
  1577.  ;-----------------------------------------------------------------------------
  1578.  %macro MC12 3
  1579.  cglobal %2_h264_qpel%3_mc12_10_%1, 3,7,12
  1580. -    mov      r6, rsp          ; backup stack pointer
  1581. -    and     rsp, ~(mmsize-1)  ; align stack
  1582. -    sub     rsp, 4096         ; TODO: calculate this correctly
  1583. +%define PAD mmsize*8*3*2        ; SIZE*16*4*sizeof(pixel)
  1584. +    mov        r6, rsp          ; backup stack pointer
  1585. +    and       rsp, ~(mmsize-1)  ; align stack
  1586. +    sub       rsp, PAD
  1587.  
  1588. -    call %2_hv%3_10_%1
  1589. +    call put_hv%3_10_%1
  1590.  
  1591.      xor        r4, r4
  1592.  .body
  1593. @@ -640,7 +550,7 @@ cglobal %2_h264_qpel%3_mc12_10_%1, 3,7,12
  1594.      %define s1 m8
  1595.      %define s2 m9
  1596.      %define s3 m10
  1597. -    %define s1 m11
  1598. +    %define d1 m11
  1599.  %else
  1600.      %define s1 [tap1]
  1601.      %define s2 [tap2]
  1602. @@ -673,7 +583,7 @@ cglobal %2_h264_qpel%3_mc12_10_%1, 3,7,12
  1603.      por        m1, m2
  1604.      CLIPW      m1, m0, m7
  1605.  
  1606. -    movu       m3, [rsp+r4+mmsize] ; movu needed for mc32
  1607. +    movu       m3, [rsp+r4+mmsize] ; movu needed for mc32, etc
  1608.      paddw      m3, [depad2]
  1609.      psrlw      m3, 5
  1610.      psubw      m3, [unpad]
  1611. @@ -690,61 +600,42 @@ cglobal %2_h264_qpel%3_mc12_10_%1, 3,7,12
  1612.      RET
  1613.  %endmacro
  1614.  
  1615. -%define OP_MOV mova
  1616. -INIT_MMX
  1617. -MC12 mmxext, put, 4
  1618. -INIT_XMM
  1619. -MC12 sse2  , put, 8
  1620. -
  1621. -%define OP_MOV AVG_MOV
  1622. -INIT_MMX
  1623. -MC12 mmxext, avg, 4
  1624. -INIT_XMM
  1625. -MC12 sse2  , avg, 8
  1626. +MC MC12
  1627.  
  1628.  ;-----------------------------------------------------------------------------
  1629.  ; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
  1630.  ;-----------------------------------------------------------------------------
  1631.  %macro MC32 3
  1632.  cglobal %2_h264_qpel%3_mc32_10_%1, 3,7,10
  1633. +%define PAD mmsize*8*3*2  ; SIZE*16*4*sizeof(pixel)
  1634.      mov  r6, rsp          ; backup stack pointer
  1635.      and rsp, ~(mmsize-1)  ; align stack
  1636. -    sub rsp, 4096         ; TODO: calculate this correctly
  1637. +    sub rsp, PAD
  1638.  
  1639. -    call %2_hv%3_10_%1
  1640. +    call put_hv%3_10_%1
  1641.  
  1642.      mov  r4, 2            ; sizeof(pixel)
  1643.      jmp mangle(ff_%2_h264_qpel%3_mc12_10_%1.body)
  1644.  %endmacro
  1645.  
  1646. -%define OP_MOV mova
  1647. -INIT_MMX
  1648. -MC32 mmxext, put, 4
  1649. -INIT_XMM
  1650. -MC32 sse2  , put, 8
  1651. -
  1652. -%define OP_MOV AVG_MOV
  1653. -INIT_MMX
  1654. -MC32 mmxext, avg, 4
  1655. -INIT_XMM
  1656. -MC32 sse2  , avg, 8
  1657. +MC MC32
  1658.  
  1659.  ;-----------------------------------------------------------------------------
  1660.  ; void h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
  1661.  ;-----------------------------------------------------------------------------
  1662. -%macro MC21 3
  1663. -%2_h%3_10_%1:
  1664. +%macro H_NRD 2
  1665. +put_h%2_10_%1:
  1666.      add       rsp, gprsize
  1667. -    mov       r3d, %3
  1668. +    mov       r3d, %2
  1669.      xor        r4, r4
  1670.      mova       m6, [pad20]
  1671.  .nextrow
  1672.      movu       m2, [r5-4]
  1673.      movu       m3, [r5-2]
  1674.      movu       m4, [r5+0]
  1675. -    ADDW       m4, [r5+2], m5
  1676. -    ADDW       m3, [r5+4], m5
  1677.      ADDW       m2, [r5+6], m5
  1678. +    ADDW       m3, [r5+4], m5
  1679. +    ADDW       m4, [r5+2], m5
  1680.  
  1681.      FILT_H2    m2, m3, m4
  1682.      psubw      m2, m6
  1683. @@ -755,65 +646,50 @@ MC32 sse2  , avg, 8
  1684.      jg .nextrow
  1685.      sub       rsp, gprsize
  1686.      ret
  1687. +%endmacro
  1688.  
  1689. +INIT_MMX
  1690. +H_NRD mmxext, 4
  1691. +INIT_XMM
  1692. +H_NRD sse2  , 8
  1693. +
  1694. +%macro MC21 3
  1695.  cglobal %2_h264_qpel%3_mc21_10_%1, 3,7,10
  1696. +%define PAD mmsize*8*3*2   ; SIZE*16*3*sizeof(pixel)
  1697.      mov   r6, rsp          ; backup stack pointer
  1698.      and  rsp, ~(mmsize-1)  ; align stack
  1699. -    sub  rsp, 4096         ; TODO: calculate this correctly
  1700.  
  1701. +    sub  rsp, PAD
  1702.      mov   r5, r1
  1703. -    call  %2_hv%3_10_%1
  1704. +    call put_h%3_10_%1
  1705.  
  1706. -%define PAD mmsize*16*3*2     ; SIZE*16*3*sizeof(pixel)
  1707. -    add  rsp, PAD
  1708. -    call %2_h%3_10_%1
  1709.      sub  rsp, PAD
  1710. +    call put_hv%3_10_%1
  1711.  
  1712. -    mov  r4, PAD-mmsize            ; H buffer
  1713. +    mov  r4, PAD-mmsize    ; H buffer
  1714.      jmp mangle(ff_%2_h264_qpel%3_mc12_10_%1.body)
  1715.  %endmacro
  1716.  
  1717. -%define OP_MOV mova
  1718. -INIT_MMX
  1719. -MC21 mmxext, put, 4
  1720. -INIT_XMM
  1721. -MC21 sse2  , put, 8
  1722. -
  1723. -%define OP_MOV AVG_MOV
  1724. -INIT_MMX
  1725. -MC21 mmxext, avg, 4
  1726. -INIT_XMM
  1727. -MC21 sse2  , avg, 8
  1728. +MC MC21
  1729.  
  1730.  ;-----------------------------------------------------------------------------
  1731.  ; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
  1732.  ;-----------------------------------------------------------------------------
  1733.  %macro MC23 3
  1734.  cglobal %2_h264_qpel%3_mc23_10_%1, 3,7,10
  1735. +%define PAD mmsize*8*3*2   ; SIZE*16*3*sizeof(pixel)
  1736.      mov   r6, rsp          ; backup stack pointer
  1737.      and  rsp, ~(mmsize-1)  ; align stack
  1738. -    sub  rsp, 4096         ; TODO: calculate this correctly
  1739.  
  1740. +    sub  rsp, PAD
  1741.      lea   r5, [r1+r2]
  1742. -    call  %2_hv%3_10_%1
  1743. +    call put_h%3_10_%1
  1744.  
  1745. -%define PAD mmsize*16*3*2     ; SIZE*16*3*sizeof(pixel)
  1746. -    add  rsp, PAD
  1747. -    call %2_h%3_10_%1
  1748.      sub  rsp, PAD
  1749. +    call put_hv%3_10_%1
  1750.  
  1751. -    mov  r4, PAD-mmsize            ; H buffer
  1752. +    mov  r4, PAD-mmsize    ; H buffer
  1753.      jmp mangle(ff_%2_h264_qpel%3_mc12_10_%1.body)
  1754.  %endmacro
  1755.  
  1756. -%define OP_MOV mova
  1757. -INIT_MMX
  1758. -MC23 mmxext, put, 4
  1759. -INIT_XMM
  1760. -MC23 sse2  , put, 8
  1761. -
  1762. -%define OP_MOV AVG_MOV
  1763. -INIT_MMX
  1764. -MC23 mmxext, avg, 4
  1765. -INIT_XMM
  1766. -MC23 sse2  , avg, 8
  1767. +MC MC23
  1768. --
  1769. 1.7.5.1
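
For reference, the mc12/mc32/mc21/mc23 paths in the patch above all compute the centre half-pel plane into the aligned stack buffer via put_hv, then branch into the shared mc12 .body, where r4 selects the second half-pel plane to blend against (0 for mc12, sizeof(pixel) for mc32, PAD-mmsize for the H plane that mc21/mc23 computed with put_h). A minimal C sketch of that final blend, assuming 10-bit pixels in uint16_t; the names are illustrative, and the asm's depad/unpad fixed-point bookkeeping on the unrounded intermediates is deliberately omitted:

    #include <stdint.h>

    /* pavgw-style rounded average, as used for the quarter-pel positions */
    static inline uint16_t avg_round(uint16_t a, uint16_t b)
    {
        return (uint16_t)((a + b + 1) >> 1);
    }

    /* hv:    centre half-pel plane (what put_hv leaves in the stack buffer)
     * other: the neighbouring horizontal/vertical half-pel plane
     * off:   mirrors the r4 byte offset (0 for mc12, +1 pixel for mc32) */
    static void blend_qpel_ref(uint16_t *dst, const uint16_t *hv,
                               const uint16_t *other, int off,
                               int w, int h, int stride)
    {
        for (int y = 0; y < h; y++)
            for (int x = 0; x < w; x++)
                dst[y * stride + x] =
                    avg_round(hv[y * stride + x], other[y * stride + x + off]);
    }
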
  1770.  
  1771.  
  1772. From d1ccd3f604b353aa32cc5460a3934fbbbaca0841 Mon Sep 17 00:00:00 2001
  1773. From: Daniel Kang <daniel.d.kang@gmail.com>
  1774. Date: Sun, 26 Jun 2011 13:04:22 -0400
  1775. Subject: [PATCH 4/5] h264_qpel_10bit: factor the vertical filter into shared v_filt subroutines
  1776.  
  1777. ---
  1778. libavcodec/x86/h264_qpel_10bit.asm |   58 ++++++++++++++++++++++++++++--------
  1779.  1 file changed, 45 insertions(+), 13 deletions(-)
  1780.  
  1781. diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm
  1782. index ec17cf6..b5dfa8a 100644
  1783. --- a/libavcodec/x86/h264_qpel_10bit.asm
  1784. +++ b/libavcodec/x86/h264_qpel_10bit.asm
  1785. @@ -228,6 +228,30 @@ MC MC10
  1786.  ;-----------------------------------------------------------------------------
  1787.  ; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
  1788.  ;-----------------------------------------------------------------------------
  1789. +%macro V_FILT 11
  1790. +v_filt%9_%10_10_%11:
  1791. +    FILT_V m0, m1, m2, m3, m4, m5, m6, m7
  1792. +    ret
  1793. +%endmacro
  1794. +
  1795. +INIT_MMX
  1796. +RESET_MM_PERMUTATION
  1797. +%assign i 0
  1798. +%rep 4
  1799. +V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i, mmxext
  1800. +SWAP 0,1,2,3,4,5
  1801. +%assign i i+1
  1802. +%endrep
  1803. +
  1804. +INIT_XMM
  1805. +RESET_MM_PERMUTATION
  1806. +%assign i 0
  1807. +%rep 6
  1808. +V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i, sse2
  1809. +SWAP 0,1,2,3,4,5
  1810. +%assign i i+1
  1811. +%endrep
  1812. +
  1813.  %macro MC02 3
  1814.  cglobal %2_h264_qpel%3_mc02_10_%1, 3,4,8
  1815.      lea      r3, [r2*2]
  1816. @@ -241,15 +265,19 @@ cglobal %2_h264_qpel%3_mc02_10_%1, 3,4,8
  1817.      movu     m4, [r1]
  1818.      add      r1, r2
  1819.  
  1820. -%rep %3-1
  1821. -    FILT_V   m0, m1, m2, m3, m4, m5, m6, m7
  1822. +%assign i 0
  1823. +%assign j 0
  1824. +%rep %3
  1825. +    call v_filt%3_ %+ i %+ _10_%1
  1826.      OP_MOV [r0], m0
  1827. +%if j<%3-1
  1828.      add      r1, r2
  1829.      add      r0, r2
  1830.      SWAP 0,1,2,3,4,5
  1831. +%endif
  1832. +    %assign j j+1
  1833. +    %assign i (j % 6)
  1834.  %endrep
  1835. -    FILT_V   m0, m1, m2, m3, m4, m5, m6, m7
  1836. -    OP_MOV [r0], m0
  1837.      RET
  1838.  %endmacro
  1839.  
  1840. @@ -273,20 +301,22 @@ cglobal %2_h264_qpel%3_mc01_10_%1, 3,5,8
  1841.      movu     m4, [r1]
  1842.      add      r1, r2
  1843.  
  1844. -%rep %3-1
  1845. -    FILT_V   m0, m1, m2, m3, m4, m5, m6, m7
  1846. +%assign i 0
  1847. +%assign j 0
  1848. +%rep %3
  1849. +    call v_filt%3_ %+ i %+ _10_%1
  1850.      movu     m7, [r4]
  1851.      pavgw    m0, m7
  1852.      OP_MOV [r0], m0
  1853. +%if j<%3-1
  1854.      add      r4, r2
  1855.      add      r1, r2
  1856.      add      r0, r2
  1857.      SWAP 0,1,2,3,4,5
  1858. +%endif
  1859. +    %assign j j+1
  1860. +    %assign i (j % 6)
  1861.  %endrep
  1862. -    FILT_V   m0, m1, m2, m3, m4, m5, m6, m7
  1863. -    movu     m7, [r4]
  1864. -    pavgw    m0, m7
  1865. -    OP_MOV [r0], m0
  1866.      RET
  1867.  %endmacro
  1868.  
  1869. @@ -323,9 +353,9 @@ cglobal %2_h264_qpel%3_mc11_10_%1, 3,5,8
  1870.      add      r1, r2
  1871.  
  1872.  %assign i 0
  1873. +%assign j 0
  1874.  %rep %3
  1875. -%assign i i+1
  1876. -    FILT_V   m0, m1, m2, m3, m4, m5, m6, m7
  1877. +    call v_filt%3_ %+ i %+ _10_%1
  1878.  ;now do FILT_H with fewer registers. probably faster than doing FILT_V then FILT_H
  1879. ;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used in the next iteration
  1880.  ;unfortunately I need three registers, so m5 will have to be re-read from memory
  1881. @@ -348,13 +378,15 @@ cglobal %2_h264_qpel%3_mc11_10_%1, 3,5,8
  1882.  ;avg FILT_V, FILT_H and reload m5
  1883.      pavgw    m0, m5
  1884.      OP_MOV [r0], m0
  1885. -%if i<%3
  1886. +%if j<%3-1
  1887.      movu     m5, [r1]
  1888.      add      r4, r2
  1889.      add      r1, r2
  1890.      add      r0, r2
  1891.      SWAP 0,1,2,3,4,5
  1892.  %endif
  1893. +    %assign j j+1
  1894. +    %assign i (j % 6)
  1895.  %endrep
  1896.      RET
  1897.  %endmacro
  1898. --
  1899. 1.7.5.1
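
The j % 6 indexing introduced above exists because SWAP renames the row registers after every output row: six copies of v_filt are emitted for the 8-wide SSE2 version (four suffice for the 4-wide MMX one), one per rotation phase, so each call lands in a body whose register assignment matches the caller's current permutation. A C model of the same rotation, assuming 10-bit pixels in uint16_t, a stride in pixels rather than bytes, and the plain (v+16)>>5 rounding in place of the asm's pad/depad arithmetic:

    #include <stdint.h>

    static inline uint16_t clip10(int v)
    {
        return (uint16_t)(v < 0 ? 0 : v > 1023 ? 1023 : v);
    }

    /* mc02-style vertical half-pel filter, 8 columns wide; the caller must
     * supply two valid rows above src and three below the last output row,
     * exactly as the asm expects of its source plane */
    static void mc02_ref(uint16_t *dst, const uint16_t *src, int stride, int h)
    {
        const uint16_t *rows[6];
        for (int k = 0; k < 5; k++)            /* the five preloaded rows */
            rows[k] = src + (k - 2) * stride;

        for (int y = 0; y < h; y++) {
            rows[5] = src + (y + 3) * stride;  /* movu m5, [r1] inside FILT_V */
            for (int x = 0; x < 8; x++) {
                int v = rows[0][x] - 5 * rows[1][x] + 20 * rows[2][x]
                      + 20 * rows[3][x] - 5 * rows[4][x] + rows[5][x];
                dst[y * stride + x] = clip10((v + 16) >> 5);
            }
            for (int k = 0; k < 5; k++)        /* SWAP 0,1,2,3,4,5 */
                rows[k] = rows[k + 1];
        }
    }
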
  1900.  
  1901.  
  1902. From ba151cdffadece22319d6a4722c01fe48eee10d3 Mon Sep 17 00:00:00 2001
  1903. From: Daniel Kang <daniel.d.kang@gmail.com>
  1904. Date: Sun, 26 Jun 2011 16:27:12 -0400
  1905. Subject: [PATCH 5/5] h264_qpel_10bit: add PRELOAD_V and fold pointer updates into v_filt
  1906.  
  1907. ---
  1908. libavcodec/x86/h264_qpel_10bit.asm |  113 ++++++++++++++----------------------
  1909.  1 file changed, 43 insertions(+), 70 deletions(-)
  1910.  
  1911. diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm
  1912. index b5dfa8a..cb9b077 100644
  1913. --- a/libavcodec/x86/h264_qpel_10bit.asm
  1914. +++ b/libavcodec/x86/h264_qpel_10bit.asm
  1915. @@ -72,6 +72,18 @@ SECTION .text
  1916.      paddw  %1, %3  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
  1917.  %endmacro
  1918.  
  1919. +%macro PRELOAD_V 0
  1920. +    lea      r3, [r2*3]
  1921. +    sub      r1, r3
  1922. +    movu     m0, [r1+r2]
  1923. +    movu     m1, [r1+r2*2]
  1924. +    add      r1, r3
  1925. +    movu     m2, [r1]
  1926. +    movu     m3, [r1+r2]
  1927. +    movu     m4, [r1+r2*2]
  1928. +    add      r1, r3
  1929. +%endmacro
  1930. +
  1931.  %macro FILT_V 8
  1932.      movu     %6, [r1]
  1933.      paddw    %1, %6
  1934. @@ -127,8 +139,9 @@ cglobal %1_h264_qpel8_mc00_10_sse2, 3,3
  1935.  %endrep
  1936.      RET
  1937.  
  1938. -cglobal %1_h264_qpel16_mc00_10_sse2, 3,3
  1939. -%rep 8
  1940. +cglobal %1_h264_qpel16_mc00_10_sse2_asm, 3,4
  1941. +    mov r3d, 8
  1942. +.loop:
  1943.      movu           m0, [r1      ]
  1944.      movu           m1, [r1   +16]
  1945.      OP_MOV [r0      ], m0
  1946. @@ -139,7 +152,8 @@ cglobal %1_h264_qpel16_mc00_10_sse2, 3,3
  1947.      OP_MOV [r0+r2+16], m1
  1948.      lea            r0, [r0+r2*2]
  1949.      lea            r1, [r1+r2*2]
  1950. -%endrep
  1951. +    dec r3d
  1952. +    jg .loop
  1953.      RET
  1954.  %endmacro
  1955.  
  1956. @@ -230,7 +244,11 @@ MC MC10
  1957.  ;-----------------------------------------------------------------------------
  1958.  %macro V_FILT 11
  1959.  v_filt%9_%10_10_%11:
  1960. +    add    r4, r2
  1961. +.no_addr4:
  1962.      FILT_V m0, m1, m2, m3, m4, m5, m6, m7
  1963. +    add    r1, r2
  1964. +    add    r0, r2
  1965.      ret
  1966.  %endmacro
  1967.  
  1968. @@ -254,29 +272,16 @@ SWAP 0,1,2,3,4,5
  1969.  
  1970.  %macro MC02 3
  1971.  cglobal %2_h264_qpel%3_mc02_10_%1, 3,4,8
  1972. -    lea      r3, [r2*2]
  1973. -    sub      r1, r3
  1974. -    movu     m0, [r1]
  1975. -    movu     m1, [r1+r2]
  1976. -    add      r1, r3
  1977. -    movu     m2, [r1]
  1978. -    movu     m3, [r1+r2]
  1979. -    add      r1, r3
  1980. -    movu     m4, [r1]
  1981. -    add      r1, r2
  1982. +    PRELOAD_V
  1983.  
  1984. -%assign i 0
  1985. +    sub      r0, r2
  1986.  %assign j 0
  1987.  %rep %3
  1988. -    call v_filt%3_ %+ i %+ _10_%1
  1989. +    %assign i (j % 6)
  1990. +    call v_filt%3_ %+ i %+ _10_%1.no_addr4
  1991.      OP_MOV [r0], m0
  1992. -%if j<%3-1
  1993. -    add      r1, r2
  1994. -    add      r0, r2
  1995.      SWAP 0,1,2,3,4,5
  1996. -%endif
  1997.      %assign j j+1
  1998. -    %assign i (j % 6)
  1999.  %endrep
  2000.      RET
  2001.  %endmacro
  2002. @@ -290,32 +295,19 @@ MC MC02
  2003.  cglobal %2_h264_qpel%3_mc01_10_%1, 3,5,8
  2004.      mov      r4, r1
  2005.  .body
  2006. -    lea      r3, [r2*2]
  2007. -    sub      r1, r3
  2008. -    movu     m0, [r1]
  2009. -    movu     m1, [r1+r2]
  2010. -    add      r1, r3
  2011. -    movu     m2, [r1]
  2012. -    movu     m3, [r1+r2]
  2013. -    add      r1, r3
  2014. -    movu     m4, [r1]
  2015. -    add      r1, r2
  2016. +    PRELOAD_V
  2017.  
  2018. -%assign i 0
  2019. +    sub      r4, r2
  2020. +    sub      r0, r2
  2021.  %assign j 0
  2022.  %rep %3
  2023. +    %assign i (j % 6)
  2024.      call v_filt%3_ %+ i %+ _10_%1
  2025.      movu     m7, [r4]
  2026.      pavgw    m0, m7
  2027.      OP_MOV [r0], m0
  2028. -%if j<%3-1
  2029. -    add      r4, r2
  2030. -    add      r1, r2
  2031. -    add      r0, r2
  2032.      SWAP 0,1,2,3,4,5
  2033. -%endif
  2034.      %assign j j+1
  2035. -    %assign i (j % 6)
  2036.  %endrep
  2037.      RET
  2038.  %endmacro
  2039. @@ -338,23 +330,18 @@ MC MC03
  2040.  ;-----------------------------------------------------------------------------
  2041.  %macro MC11 3
  2042.  ; this REALLY needs x86_64
  2043. -cglobal %2_h264_qpel%3_mc11_10_%1, 3,5,8
  2044. +cglobal %2_h264_qpel%3_mc11_10_%1, 3,6,8
  2045.      mov      r4, r1
  2046.  .body
  2047. -    lea      r3, [r2*2]
  2048. -    sub      r1, r3
  2049. -    movu     m0, [r1]
  2050. -    movu     m1, [r1+r2]
  2051. -    add      r1, r3
  2052. -    movu     m2, [r1]
  2053. -    movu     m3, [r1+r2]
  2054. -    add      r1, r3
  2055. -    movu     m4, [r1]
  2056. -    add      r1, r2
  2057. +    PRELOAD_V
  2058.  
  2059. -%assign i 0
  2060. +    sub      r0, r2
  2061. +    sub      r4, r2
  2062. +    mov      r5, r2
  2063. +    neg      r5
  2064.  %assign j 0
  2065.  %rep %3
  2066. +    %assign i (j % 6)
  2067.      call v_filt%3_ %+ i %+ _10_%1
  2068.  ;now do FILT_H with fewer registers. probably faster than doing FILT_V then FILT_H
  2069. ;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used in the next iteration
  2070. @@ -379,14 +366,10 @@ cglobal %2_h264_qpel%3_mc11_10_%1, 3,5,8
  2071.      pavgw    m0, m5
  2072.      OP_MOV [r0], m0
  2073.  %if j<%3-1
  2074. -    movu     m5, [r1]
  2075. -    add      r4, r2
  2076. -    add      r1, r2
  2077. -    add      r0, r2
  2078. +    movu     m5, [r1+r5]
  2079.      SWAP 0,1,2,3,4,5
  2080.  %endif
  2081.      %assign j j+1
  2082. -    %assign i (j % 6)
  2083.  %endrep
  2084.      RET
  2085.  %endmacro
  2086. @@ -397,7 +380,7 @@ MC MC11
  2087.  ; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
  2088.  ;-----------------------------------------------------------------------------
  2089.  %macro MC31 3
  2090. -cglobal %2_h264_qpel%3_mc31_10_%1, 3,5,8
  2091. +cglobal %2_h264_qpel%3_mc31_10_%1, 3,6,8
  2092.      mov r4, r1
  2093.      add r1, 2
  2094.      jmp mangle(ff_%2_h264_qpel%3_mc11_10_%1.body)
  2095. @@ -409,7 +392,7 @@ MC MC31
  2096.  ; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
  2097.  ;-----------------------------------------------------------------------------
  2098.  %macro MC13 3
  2099. -cglobal %2_h264_qpel%3_mc13_10_%1, 3,5,8
  2100. +cglobal %2_h264_qpel%3_mc13_10_%1, 3,6,8
  2101.      lea r4, [r1+r2]
  2102.      jmp mangle(ff_%2_h264_qpel%3_mc11_10_%1.body)
  2103.  %endmacro
  2104. @@ -420,7 +403,7 @@ MC MC13
  2105.  ; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
  2106.  ;-----------------------------------------------------------------------------
  2107.  %macro MC33 3
  2108. -cglobal %2_h264_qpel%3_mc33_10_%1, 3,5,8
  2109. +cglobal %2_h264_qpel%3_mc33_10_%1, 3,6,8
  2110.      lea r4, [r1+r2]
  2111.      add r1, 2
  2112.      jmp mangle(ff_%2_h264_qpel%3_mc11_10_%1.body)
  2113. @@ -687,12 +670,13 @@ H_NRD sse2  , 8
  2114.  
  2115.  %macro MC21 3
  2116.  cglobal %2_h264_qpel%3_mc21_10_%1, 3,7,10
  2117. +    mov   r5, r1
  2118. +.body
  2119. %define PAD mmsize*8*3*2   ; SIZE*16*3*sizeof(pixel)
  2120.      mov   r6, rsp          ; backup stack pointer
  2121.      and  rsp, ~(mmsize-1)  ; align stack
  2122.  
  2123.      sub  rsp, PAD
  2124. -    mov   r5, r1
  2125.      call put_h%3_10_%1
  2126.  
  2127.      sub  rsp, PAD
  2128. @@ -709,19 +693,8 @@ MC MC21
  2129.  ;-----------------------------------------------------------------------------
  2130.  %macro MC23 3
  2131.  cglobal %2_h264_qpel%3_mc23_10_%1, 3,7,10
  2132. -%define PAD mmsize*8*3*2   ; SIZE*16*3*sizeof(pixel)
  2133. -    mov   r6, rsp          ; backup stack pointer
  2134. -    and  rsp, ~(mmsize-1)  ; align stack
  2135. -
  2136. -    sub  rsp, PAD
  2137.      lea   r5, [r1+r2]
  2138. -    call put_h%3_10_%1
  2139. -
  2140. -    sub  rsp, PAD
  2141. -    call put_hv%3_10_%1
  2142. -
  2143. -    mov  r4, PAD-mmsize    ; H buffer
  2144. -    jmp mangle(ff_%2_h264_qpel%3_mc12_10_%1.body)
  2145. +    jmp mangle(ff_%2_h264_qpel%3_mc21_10_%1.body)
  2146.  %endmacro
  2147.  
  2148.  MC MC23
  2149. --
  2150. 1.7.5.1
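
The qpel16_mc00 change in this last patch swaps an 8x %rep unroll for a counted loop over row pairs, shrinking the emitted code without touching the inner copies. A C equivalent of the looped put flavour (the avg flavour would additionally pavgw against dst), assuming the byte stride the asm uses and with memcpy standing in for the two unaligned 16-byte movu per row:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void put_qpel16_mc00_ref(uint8_t *dst, const uint8_t *src,
                                    ptrdiff_t stride)  /* byte stride, as in r2 */
    {
        for (int i = 0; i < 8; i++) {        /* mov r3d, 8 ... dec r3d / jg .loop */
            memcpy(dst,          src,          32);  /* 16 pixels x 2 bytes */
            memcpy(dst + stride, src + stride, 32);
            dst += 2 * stride;               /* lea r0, [r0+r2*2] */
            src += 2 * stride;
        }
    }
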