Advertisement
Guest User

Untitled

a guest
Sep 17th, 2017
453
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Diff 26.70 KB | None | 0 0
  1. From dc3ee1e5dcc8cf010223976be504f88bbe4d9889 Mon Sep 17 00:00:00 2001
  2. From: Daniel Kang <daniel.d.kang@gmail.com>
  3. Date: Sun, 5 Jun 2011 18:33:23 -0400
  4. Subject: [PATCH 1/2] H.264: Add x86 assembly for 10-bit MC Chroma H.264
  5.  functions.
  6.  
  7. Mainly ported from 8-bit H.264 MC Chroma.
  8. ---
  9. libavcodec/x86/Makefile                |    1 +
  10.  libavcodec/x86/dsputil_mmx.c           |   32 ++++
  11.  libavcodec/x86/h264_chromamc_10bit.asm |  279 ++++++++++++++++++++++++++++++++
  12.  3 files changed, 312 insertions(+), 0 deletions(-)
  13.  create mode 100644 libavcodec/x86/h264_chromamc_10bit.asm
  14.  
  15. diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
  16. index 1c451c8..ea57bd1 100644
  17. --- a/libavcodec/x86/Makefile
  18. +++ b/libavcodec/x86/Makefile
  19. @@ -44,6 +44,7 @@ MMX-OBJS-$(HAVE_YASM)                  += x86/dsputil_yasm.o            \
  20.                                            x86/deinterlace.o             \
  21.                                            x86/fmtconvert.o              \
  22.                                            x86/h264_chromamc.o           \
  23. +                                          x86/h264_chromamc_10bit.o     \
  24.                                            $(YASM-OBJS-yes)
  25.  
  26.  MMX-OBJS-$(CONFIG_FFT)                 += x86/fft.o
  27. diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
  28. index 1cc6991..7ac9679 100644
  29. --- a/libavcodec/x86/dsputil_mmx.c
  30. +++ b/libavcodec/x86/dsputil_mmx.c
  31. @@ -1938,6 +1938,19 @@ void ff_avg_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
  32.  void ff_avg_h264_chroma_mc4_ssse3     (uint8_t *dst, uint8_t *src,
  33.                                         int stride, int h, int x, int y);
  34.  
  35. +#define CHROMA_MC(OP, NUM, DEPTH, OPT) \
  36. +void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
  37. +                                      (uint8_t *dst, uint8_t *src,\
  38. +                                       int stride, int h, int x, int y);
  39. +
  40. +CHROMA_MC(put, 2, 10, mmxext)
  41. +CHROMA_MC(avg, 2, 10, mmxext)
  42. +CHROMA_MC(put, 4, 10, mmxext)
  43. +CHROMA_MC(avg, 4, 10, mmxext)
  44. +CHROMA_MC(put, 8, 10, sse2)
  45. +CHROMA_MC(avg, 8, 10, sse2)
  46. +CHROMA_MC(put, 8, 10, avx)
  47. +CHROMA_MC(avg, 8, 10, avx)
  48.  
  49.  /* CAVS specific */
  50.  void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
  51. @@ -2420,6 +2433,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
  52.  {
  53.      int mm_flags = av_get_cpu_flags();
  54.      const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
  55. +    const int bit_depth = avctx->bits_per_raw_sample;
  56.  
  57.      if (avctx->dsp_mask) {
  58.          if (avctx->dsp_mask & AV_CPU_FLAG_FORCE)
  59. @@ -2651,6 +2665,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
  60.              c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2;
  61.              c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_mmx2;
  62.              }
  63. +            if (bit_depth == 10) {
  64. +                c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_10_mmxext;
  65. +                c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_10_mmxext;
  66. +                c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_10_mmxext;
  67. +                c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_10_mmxext;
  68. +            }
  69.  
  70.              c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
  71.  #endif
  72. @@ -2756,6 +2776,10 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
  73.              H264_QPEL_FUNCS(3, 2, sse2);
  74.              H264_QPEL_FUNCS(3, 3, sse2);
  75.              }
  76. +            if (bit_depth == 10) {
  77. +                c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_sse2;
  78. +                c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_sse2;
  79. +            }
  80.          }
  81.  #if HAVE_SSSE3
  82.          if(mm_flags & AV_CPU_FLAG_SSSE3){
  83. @@ -2854,6 +2878,14 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
  84.              }
  85.  #endif
  86.          }
  87. +#if HAVE_AVX
  88. +        if (mm_flags & AV_CPU_FLAG_AVX) {
  89. +            if (bit_depth == 10) {
  90. +                c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_avx;
  91. +                c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_avx;
  92. +            }
  93. +        }
  94. +#endif
  95.      }
  96.  
  97.      if (CONFIG_ENCODERS)
  98. diff --git a/libavcodec/x86/h264_chromamc_10bit.asm b/libavcodec/x86/h264_chromamc_10bit.asm
  99. new file mode 100644
  100. index 0000000..757d99f
  101. --- /dev/null
  102. +++ b/libavcodec/x86/h264_chromamc_10bit.asm
  103. @@ -0,0 +1,279 @@
  104. +;*****************************************************************************
  105. +;* MMX/SSE2/AVX-optimized 10-bit H.264 chroma MC code
  106. +;*****************************************************************************
  107. +;* Copyright (C) 2005-2011 x264 project
  108. +;*
  109. +;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
  110. +;*
  111. +;* This file is part of Libav.
  112. +;*
  113. +;* Libav is free software; you can redistribute it and/or
  114. +;* modify it under the terms of the GNU Lesser General Public
  115. +;* License as published by the Free Software Foundation; either
  116. +;* version 2.1 of the License, or (at your option) any later version.
  117. +;*
  118. +;* Libav is distributed in the hope that it will be useful,
  119. +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  120. +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  121. +;* Lesser General Public License for more details.
  122. +;*
  123. +;* You should have received a copy of the GNU Lesser General Public
  124. +;* License along with Libav; if not, write to the Free Software
  125. +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  126. +;******************************************************************************
  127. +
  128. +%include "x86inc.asm"
  129. +%include "x86util.asm"
  130. +
  131. +SECTION_RODATA
  132. +
  133. +cextern pw_4
  134. +cextern pw_8
  135. +cextern pw_32
  136. +cextern pw_64
  137. +
  138. +SECTION .text
  139. +
  140. +
  141. +%macro mv0_pixels_mc8 0
  142. +    lea           r4, [r2*2 ]
  143. +.next4rows
  144. +    movu          m0, [r1   ]
  145. +    movu          m1, [r1+r2]
  146. +    CHROMAMC_AVG  m0, [r0   ]
  147. +    CHROMAMC_AVG  m1, [r0+r2]
  148. +    mova     [r0   ], m0
  149. +    mova     [r0+r2], m1
  150. +    add           r0, r4
  151. +    add           r1, r4
  152. +    movu          m0, [r1   ]
  153. +    movu          m1, [r1+r2]
  154. +    CHROMAMC_AVG  m0, [r0   ]
  155. +    CHROMAMC_AVG  m1, [r0+r2]
  156. +    add           r1, r4
  157. +    mova     [r0   ], m0
  158. +    mova     [r0+r2], m1
  159. +    add           r0, r4
  160. +    sub          r3d, 4
  161. +    jne .next4rows
  162. +%endmacro
  163. +
  164. +;-----------------------------------------------------------------------------
  165. +; void put/avg_h264_chroma_mc8(pixel *dst, pixel *src, int stride, int h, int mx, int my)
  166. +;-----------------------------------------------------------------------------
  167. +%macro CHROMA_MC8 2
  168. +; put/avg_h264_chroma_mc8_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/,
  169. +;                              int stride, int h, int mx, int my)
  170. +cglobal %1_h264_chroma_mc8_10_%2, 6,7,8
  171. +    movsxdifnidn  r2, r2d
  172. +    mov          r6d, r5d
  173. +    or           r6d, r4d
  174. +    jne .at_least_one_non_zero
  175. +    ; mx == 0 AND my == 0 - no filter needed
  176. +    mv0_pixels_mc8
  177. +    REP_RET
  178. +
  179. +.at_least_one_non_zero
  180. +    mov          r6d, 2
  181. +    test         r5d, r5d
  182. +    je .my_is_zero
  183. +    mov           r6, r2        ; dxy = x ? 1 : stride
  184. +    test         r4d, r4d
  185. +    jne .both_non_zero
  186. +.my_is_zero
  187. +    ; mx == 0 XOR my == 0 - 1 dimensional filter only
  188. +    or           r4d, r5d       ; x + y
  189. +    movd          m5, r4d
  190. +    mova          m4, [pw_8]
  191. +    mova          m6, [pw_4]    ; mm6 = rnd >> 3
  192. +    SPLATW        m5, m5        ; mm5 = B = x
  193. +    psubw         m4, m5        ; mm4 = A = 8-x
  194. +
  195. +.next1drow
  196. +    movu          m0, [r1   ]   ; mm0 = src[0..7]
  197. +    movu          m2, [r1+r6]   ; mm2 = src[1..8]
  198. +
  199. +    pmullw        m0, m4        ; mm0 = A * src[0..7]
  200. +    pmullw        m2, m5        ; mm2 = B * src[1..8]
  201. +
  202. +    paddw         m0, m6
  203. +    paddw         m0, m2
  204. +    psrlw         m0, 3
  205. +    CHROMAMC_AVG  m0, [r0]
  206. +    mova        [r0], m0        ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3
  207. +
  208. +    add           r0, r2
  209. +    add           r1, r2
  210. +    dec           r3d
  211. +    jne .next1drow
  212. +    REP_RET
  213. +
  214. +.both_non_zero ; general case, bilinear
  215. +    movd          m4, r4m         ; x
  216. +    movd          m6, r5m         ; y
  217. +
  218. +    SPLATW        m4, m4          ; mm4 = x words
  219. +    SPLATW        m6, m6          ; mm6 = y words
  220. +    psllw         m5, m4, 3       ; mm5 = 8x
  221. +    pmullw        m4, m6          ; mm4 = x * y
  222. +    psllw         m6, 3           ; mm6 = 8y
  223. +    paddw         m1, m5, m6      ; mm1 = 8x+8y
  224. +    mova          m7, m4          ; DD = x * y
  225. +    psubw         m5, m4          ; mm5 = B = 8x - xy
  226. +    psubw         m6, m4          ; mm6 = C = 8y - xy
  227. +    paddw         m4, [pw_64]
  228. +    psubw         m4, m1          ; mm4 = A = xy - (8x+8y) + 64
  229. +
  230. +    movu          m0, [r1  ]      ; mm0 = src[0..7]
  231. +    movu          m1, [r1+2]      ; mm1 = src[1..8]
  232. +.next2drow
  233. +    add           r1, r2
  234. +
  235. +    pmullw        m2, m0, m4
  236. +    pmullw        m1, m5
  237. +    paddw         m2, m1          ; mm2 = A * src[0..7] + B * src[1..8]
  238. +
  239. +    movu          m0, [r1]
  240. +    movu          m1, [r1+2]
  241. +    pmullw        m3, m0, m6
  242. +    paddw         m2, m3          ; mm2 += C * src[0..7+stride]
  243. +    pmullw        m3, m1, m7
  244. +    paddw         m2, m3          ; mm2 += D * src[1..8+stride]
  245. +
  246. +    paddw         m2, [pw_32]
  247. +    psrlw         m2, 6
  248. +    CHROMAMC_AVG  m2, [r0]
  249. +    mova        [r0], m2          ; dst[0..7] = (mm2 + 32) >> 6
  250. +
  251. +    add           r0, r2
  252. +    dec          r3d
  253. +    jne .next2drow
  254. +    REP_RET
  255. +%endmacro
  256. +
  257. +;-----------------------------------------------------------------------------
  258. +; void put/avg_h264_chroma_mc4(pixel *dst, pixel *src, int stride, int h, int mx, int my)
  259. +;-----------------------------------------------------------------------------
  260. +;TODO: xmm mc4
  261. +%macro MC4_OP 2
  262. +    movq          %1, [r1  ]
  263. +    movq          m1, [r1+2]
  264. +    add           r1, r2
  265. +    pmullw        %1, m4
  266. +    pmullw        m1, m2
  267. +    paddw         m1, %1
  268. +    mova          %1, m1
  269. +
  270. +    pmullw        %2, m5
  271. +    pmullw        m1, m3
  272. +    paddw         %2, [pw_32]
  273. +    paddw         m1, %2
  274. +    psrlw         m1, 6
  275. +    CHROMAMC_AVG4 m1, %2, [r0]
  276. +    movq        [r0], m1
  277. +    add           r0, r2
  278. +%endmacro
  279. +
  280. +%macro CHROMA_MC4 2
  281. +cglobal %1_h264_chroma_mc4_10_%2, 6,6,7
  282. +    movsxdifnidn  r2, r2d
  283. +    movd          m2, r4m         ; x
  284. +    movd          m3, r5m         ; y
  285. +    mova          m4, [pw_8]
  286. +    mova          m5, m4
  287. +    SPLATW        m2, m2
  288. +    SPLATW        m3, m3
  289. +    psubw         m4, m2
  290. +    psubw         m5, m3
  291. +
  292. +    movq          m0, [r1  ]
  293. +    movq          m6, [r1+2]
  294. +    add           r1, r2
  295. +    pmullw        m0, m4
  296. +    pmullw        m6, m2
  297. +    paddw         m6, m0
  298. +
  299. +.next2rows
  300. +    MC4_OP m0, m6
  301. +    MC4_OP m6, m0
  302. +    sub   r3d, 2
  303. +    jnz .next2rows
  304. +    REP_RET
  305. +%endmacro
  306. +
  307. +;-----------------------------------------------------------------------------
  308. +; void put/avg_h264_chroma_mc2(pixel *dst, pixel *src, int stride, int h, int mx, int my)
  309. +;-----------------------------------------------------------------------------
  310. +%macro CHROMA_MC2 2
  311. +cglobal %1_h264_chroma_mc2_10_%2, 6,7
  312. +    movsxdifnidn  r2, r2d
  313. +    mov          r6d, r4d
  314. +    shl          r4d, 16
  315. +    sub          r4d, r6d
  316. +    add          r4d, 8
  317. +    imul         r5d, r4d         ; x*y<<16 | y*(8-x)
  318. +    shl          r4d, 3
  319. +    sub          r4d, r5d         ; x*(8-y)<<16 | (8-x)*(8-y)
  320. +
  321. +    movd          m5, r4d
  322. +    movd          m6, r5d
  323. +    punpckldq     m5, m5          ; mm5 = {A,B,A,B}
  324. +    punpckldq     m6, m6          ; mm6 = {C,D,C,D}
  325. +    movq          m2, [r1]
  326. +    pxor          m7, m7
  327. +    pshufw        m2, m2, 0x94    ; mm0 = src[0,1,1,2]
  328. +
  329. +.nextrow
  330. +    add           r1, r2
  331. +    movq          m1, m2
  332. +    pmaddwd       m1, m5          ; mm1 = A * src[0,1] + B * src[1,2]
  333. +    movq          m0, [r1]
  334. +    pshufw        m0, m0, 0x94    ; mm0 = src[0,1,1,2]
  335. +    movq          m2, m0
  336. +    pmaddwd       m0, m6
  337. +    paddw         m1, [pw_32]
  338. +    paddw         m1, m0          ; mm1 += C * src[0,1] + D * src[1,2]
  339. +    psrlw         m1, 6
  340. +    packssdw      m1, m7
  341. +    CHROMAMC_AVG4 m1, m3, [r0]
  342. +    movd        [r0], m1
  343. +    add           r0, r2
  344. +    sub          r3d, 1
  345. +    jnz .nextrow
  346. +    REP_RET
  347. +%endmacro
  348. +
  349. +%macro NOTHING 2-3
  350. +%endmacro
  351. +%macro DIRECT_AVG 2
  352. +    PAVG          %1, %2
  353. +%endmacro
  354. +%macro COPY_AVG 3
  355. +    movq          %2, %3
  356. +    PAVG          %1, %2
  357. +%endmacro
  358. +
  359. +%define CHROMAMC_AVG  NOTHING
  360. +%define CHROMAMC_AVG4 NOTHING
  361. +INIT_XMM
  362. +CHROMA_MC8 put, sse2
  363. +%ifdef HAVE_AVX
  364. +CHROMA_MC8 put, avx
  365. +%endif
  366. +INIT_XMM
  367. +CHROMA_MC4 put, mmxext
  368. +INIT_MMX
  369. +CHROMA_MC2 put, mmxext
  370. +
  371. +%define CHROMAMC_AVG  DIRECT_AVG
  372. +%define CHROMAMC_AVG4 COPY_AVG
  373. +%define PAVG          pavgw
  374. +INIT_XMM
  375. +CHROMA_MC8 avg, sse2
  376. +%ifdef HAVE_AVX
  377. +CHROMA_MC8 avg, avx
  378. +%endif
  379. +INIT_XMM
  380. +CHROMA_MC4 avg, mmxext
  381. +INIT_MMX
  382. +CHROMA_MC2 avg, mmxext
  383. --
  384. 1.7.5.1
  385.  
  386.  
  387. From 068d2421c97a6f34d47275fe193a55629051fee0 Mon Sep 17 00:00:00 2001
  388. From: Daniel Kang <daniel.d.kang@gmail.com>
  389. Date: Sun, 12 Jun 2011 18:06:56 -0400
  390. Subject: [PATCH 2/2] H.264: Add x86 assembly for 10-bit weight/biweight H.264
  391.  functions.
  392.  
  393. Mainly ported from 8-bit H.264 weight/biweight.
  394. ---
  395. libavcodec/x86/Makefile              |    1 +
  396.  libavcodec/x86/h264_weight_10bit.asm |  298 ++++++++++++++++++++++++++++++++++
  397.  libavcodec/x86/h264dsp_mmx.c         |   72 ++++++++
  398.  3 files changed, 371 insertions(+), 0 deletions(-)
  399.  create mode 100644 libavcodec/x86/h264_weight_10bit.asm
  400.  
  401. diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
  402. index ea57bd1..022ab27 100644
  403. --- a/libavcodec/x86/Makefile
  404. +++ b/libavcodec/x86/Makefile
  405. @@ -15,6 +15,7 @@ YASM-OBJS-$(CONFIG_H264DSP)            += x86/h264_deblock.o            \
  406.                                            x86/h264_idct.o               \
  407.                                            x86/h264_idct_10bit.o         \
  408.                                            x86/h264_weight.o             \
  409. +                                          x86/h264_weight_10bit.o       \
  410.  
  411.  YASM-OBJS-$(CONFIG_H264PRED)           += x86/h264_intrapred.o          \
  412.                                            x86/h264_intrapred_10bit.o
  413. diff --git a/libavcodec/x86/h264_weight_10bit.asm b/libavcodec/x86/h264_weight_10bit.asm
  414. new file mode 100644
  415. index 0000000..ea6ed83
  416. --- /dev/null
  417. +++ b/libavcodec/x86/h264_weight_10bit.asm
  418. @@ -0,0 +1,298 @@
  419. +;*****************************************************************************
  420. +;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code
  421. +;*****************************************************************************
  422. +;* Copyright (C) 2005-2011 x264 project
  423. +;*
  424. +;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
  425. +;*
  426. +;* This file is part of Libav.
  427. +;*
  428. +;* Libav is free software; you can redistribute it and/or
  429. +;* modify it under the terms of the GNU Lesser General Public
  430. +;* License as published by the Free Software Foundation; either
  431. +;* version 2.1 of the License, or (at your option) any later version.
  432. +;*
  433. +;* Libav is distributed in the hope that it will be useful,
  434. +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  435. +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  436. +;* Lesser General Public License for more details.
  437. +;*
  438. +;* You should have received a copy of the GNU Lesser General Public
  439. +;* License along with Libav; if not, write to the Free Software
  440. +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  441. +;******************************************************************************
  442. +
  443. +%include "x86inc.asm"
  444. +%include "x86util.asm"
  445. +
  446. +SECTION_RODATA 32
  447. +
  448. +pw_pixel_max: times 8 dw ((1 << 10)-1)
  449. +sq_1: dq 1
  450. +      dq 0
  451. +
  452. +cextern pw_1
  453. +
  454. +SECTION .text
  455. +
  456. +;-----------------------------------------------------------------------------
  457. +; void h264_weight(uint8_t *dst, int stride, int log2_denom,
  458. +;                  int weight, int offset);
  459. +;-----------------------------------------------------------------------------
  460. +
  461. +%macro WEIGHT_SETUP 1
  462. +    mova       m0, [pw_1]
  463. +    movd       m2, r2d
  464. +    pslld      m0, m2       ; 1<<log2_denom
  465. +    SPLATW     m0, m0
  466. +    shl        r4, 19       ; *8, move to upper half of dword
  467. +    lea        r4, [r4+r3*2+0x10000]
  468. +    movd       m3, r4d      ; ((offset<<3)+1)<<16 | (weight<<1)
  469. +    pshufd     m3, m3, 0
  470. +    mova       m4, [pw_pixel_max]
  471. +    paddw      m2, [sq_1]   ; log2_denom+1
  472. +%ifnidn %1, sse4
  473. +    pxor       m7, m7
  474. +%endif
  475. +%endmacro
  476. +
  477. +%macro WEIGHT_OP 2-3
  478. +%if %0==2
  479. +    mova        m5, [r0+%2]
  480. +    punpckhwd   m6, m5, m0
  481. +    punpcklwd   m5, m0
  482. +%else
  483. +    movq        m5, [r0+%2]
  484. +    movq        m6, [r0+%3]
  485. +    punpcklwd   m5, m0
  486. +    punpcklwd   m6, m0
  487. +%endif
  488. +    pmaddwd     m5, m3
  489. +    pmaddwd     m6, m3
  490. +    psrad       m5, m2
  491. +    psrad       m6, m2
  492. +%ifidn %1, sse4
  493. +    packusdw    m5, m6
  494. +    pminsw      m5, m4
  495. +%else
  496. +    packssdw    m5, m6
  497. +    CLIPW       m5, m7, m4
  498. +%endif
  499. +%endmacro
  500. +
  501. +%macro WEIGHT_FUNC_DBL 1
  502. +cglobal h264_weight_16x16_10_%1, 5,7,8
  503. +    mov        r6, 16
  504. +.body
  505. +    WEIGHT_SETUP %1
  506. +.nextrow
  507. +    WEIGHT_OP %1,  0
  508. +    mova     [r0   ], m5
  509. +    WEIGHT_OP %1, 16
  510. +    mova     [r0+16], m5
  511. +    add        r0, r1
  512. +    dec        r6
  513. +    jnz .nextrow
  514. +    REP_RET
  515. +
  516. +cglobal h264_weight_16x8_10_%1, 5,7,8
  517. +    mov r6, 8
  518. +    jmp mangle(ff_h264_weight_16x16_10_%1.body)
  519. +%endmacro
  520. +
  521. +INIT_XMM
  522. +WEIGHT_FUNC_DBL sse2
  523. +WEIGHT_FUNC_DBL sse4
  524. +
  525. +
  526. +%macro WEIGHT_FUNC_MM 1
  527. +cglobal h264_weight_8x16_10_%1, 5,7,8
  528. +    mov        r6, 16
  529. +.body
  530. +    WEIGHT_SETUP %1
  531. +.nextrow
  532. +    WEIGHT_OP  %1, 0
  533. +    mova     [r0], m5
  534. +    add        r0, r1
  535. +    dec        r6
  536. +    jnz .nextrow
  537. +    REP_RET
  538. +
  539. +cglobal h264_weight_8x8_10_%1, 5,7,8
  540. +    mov r6, 8
  541. +    jmp mangle(ff_h264_weight_8x16_10_%1.body)
  542. +
  543. +cglobal h264_weight_8x4_10_%1, 5,7,8
  544. +    mov r6, 4
  545. +    jmp mangle(ff_h264_weight_8x16_10_%1.body)
  546. +%endmacro
  547. +
  548. +INIT_XMM
  549. +WEIGHT_FUNC_MM sse2
  550. +WEIGHT_FUNC_MM sse4
  551. +
  552. +
  553. +%macro WEIGHT_FUNC_HALF_MM 1
  554. +cglobal h264_weight_4x8_10_%1, 5,7,8
  555. +    mov        r6, 4
  556. +.body
  557. +    WEIGHT_SETUP %1
  558. +    lea        r3, [r1*2]
  559. +.nextrow
  560. +    WEIGHT_OP   %1, 0, r1
  561. +    movh      [r0], m5
  562. +    movhps [r0+r1], m5
  563. +    add         r0, r3
  564. +    dec         r6
  565. +    jnz .nextrow
  566. +    REP_RET
  567. +
  568. +cglobal h264_weight_4x4_10_%1, 5,7,8
  569. +    mov r6, 2
  570. +    jmp mangle(ff_h264_weight_4x8_10_%1.body)
  571. +
  572. +cglobal h264_weight_4x2_10_%1, 5,7,8
  573. +    mov r6, 1
  574. +    jmp mangle(ff_h264_weight_4x8_10_%1.body)
  575. +%endmacro
  576. +
  577. +INIT_XMM
  578. +WEIGHT_FUNC_HALF_MM sse2
  579. +WEIGHT_FUNC_HALF_MM sse4
  580. +
  581. +
  582. +;-----------------------------------------------------------------------------
  583. +; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
  584. +;                    int weightd, int weights, int offset);
  585. +;-----------------------------------------------------------------------------
  586. +%macro BIWEIGHT_SETUP 1
  587. +    lea        r6, [r6*4+1] ; (offset<<2)+1
  588. +    or         r6, 1
  589. +    shl        r5, 16
  590. +    or         r4, r5
  591. +    movd       m4, r4d      ; weightd | weights
  592. +    movd       m5, r6d      ; ((offset<<2)+1)|1
  593. +    movd       m6, r3d      ; log2_denom
  594. +    pslld      m5, m6       ; (((offset<<2)+1)|1)<<log2_denom
  595. +    paddd      m6, [sq_1]
  596. +    pshufd     m4, m4, 0
  597. +    pshufd     m5, m5, 0
  598. +    mova       m3, [pw_pixel_max]
  599. +%ifnidn %1, sse4
  600. +    pxor       m7, m7
  601. +%endif
  602. +%endmacro
  603. +
  604. +%macro BIWEIGHT 2-3
  605. +%if %0==2
  606. +    mova       m0, [r0+%2]
  607. +    mova       m1, [r1+%2]
  608. +    punpckhwd  m2, m0, m1
  609. +    punpcklwd  m0, m1
  610. +%else
  611. +    movq       m0, [r0+%2]
  612. +    movq       m1, [r1+%2]
  613. +    punpcklwd  m0, m1
  614. +    movq       m2, [r0+%3]
  615. +    movq       m1, [r1+%3]
  616. +    punpcklwd  m2, m1
  617. +%endif
  618. +    pmaddwd    m0, m4
  619. +    pmaddwd    m2, m4
  620. +    paddd      m0, m5
  621. +    paddd      m2, m5
  622. +    psrad      m0, m6
  623. +    psrad      m2, m6
  624. +%ifidn %1, sse4
  625. +    packusdw   m0, m2
  626. +    pminsw     m0, m3
  627. +%else
  628. +    packssdw   m0, m2
  629. +    CLIPW      m0, m7, m3
  630. +%endif
  631. +%endmacro
  632. +
  633. +%macro BIWEIGHT_FUNC_DBL 1
  634. +cglobal h264_biweight_16x16_10_%1, 7,7,8
  635. +    BIWEIGHT_SETUP %1
  636. +    mov        r3, 16
  637. +.nextrow
  638. +    BIWEIGHT   %1,  0
  639. +    mova  [r0   ], m0
  640. +    BIWEIGHT   %1, 16
  641. +    mova  [r0+16], m0
  642. +    add        r0, r2
  643. +    add        r1, r2
  644. +    dec        r3
  645. +    jnz .nextrow
  646. +    REP_RET
  647. +
  648. +cglobal h264_biweight_16x8_10_%1, 7,7,8
  649. +    BIWEIGHT_SETUP %1
  650. +    mov r3, 8
  651. +    jmp mangle(ff_h264_biweight_16x16_10_%1.nextrow)
  652. +%endmacro
  653. +
  654. +INIT_XMM
  655. +BIWEIGHT_FUNC_DBL sse2
  656. +BIWEIGHT_FUNC_DBL sse4
  657. +
  658. +%macro BIWEIGHT_FUNC 1
  659. +cglobal h264_biweight_8x16_10_%1, 7,7,8
  660. +    BIWEIGHT_SETUP %1
  661. +    mov      r3, 16
  662. +.nextrow
  663. +    BIWEIGHT %1, 0
  664. +    mova   [r0], m0
  665. +    add      r0, r2
  666. +    add      r1, r2
  667. +    dec      r3
  668. +    jnz .nextrow
  669. +    REP_RET
  670. +
  671. +cglobal h264_biweight_8x8_10_%1, 7,7,8
  672. +    BIWEIGHT_SETUP %1
  673. +    mov r3, 8
  674. +    jmp mangle(ff_h264_biweight_8x16_10_%1.nextrow)
  675. +
  676. +cglobal h264_biweight_8x4_10_%1, 7,7,8
  677. +    BIWEIGHT_SETUP %1
  678. +    mov r3, 4
  679. +    jmp mangle(ff_h264_biweight_8x16_10_%1.nextrow)
  680. +%endmacro
  681. +
  682. +INIT_XMM
  683. +BIWEIGHT_FUNC sse2
  684. +BIWEIGHT_FUNC sse4
  685. +
  686. +%macro BIWEIGHT_FUNC_HALF 1
  687. +cglobal h264_biweight_4x8_10_%1, 7,7,8
  688. +    BIWEIGHT_SETUP %1
  689. +    mov        r3, 4
  690. +    lea        r4, [r2*2]
  691. +.nextrow
  692. +    BIWEIGHT    %1, 0, r2
  693. +    movh   [r0   ], m0
  694. +    movhps [r0+r2], m0
  695. +    add         r0, r4
  696. +    add         r1, r4
  697. +    dec         r3
  698. +    jnz .nextrow
  699. +    REP_RET
  700. +
  701. +cglobal h264_biweight_4x4_10_%1, 7,7,8
  702. +    BIWEIGHT_SETUP %1
  703. +    mov        r3, 2
  704. +    lea        r4, [r2*2]
  705. +    jmp mangle(ff_h264_biweight_4x8_10_%1.nextrow)
  706. +
  707. +cglobal h264_biweight_4x2_10_%1, 7,7,8
  708. +    BIWEIGHT_SETUP %1
  709. +    mov        r3, 1
  710. +    lea        r4, [r2*2]
  711. +    jmp mangle(ff_h264_biweight_4x8_10_%1.nextrow)
  712. +%endmacro
  713. +
  714. +INIT_XMM
  715. +BIWEIGHT_FUNC_HALF sse2
  716. +BIWEIGHT_FUNC_HALF sse4
  717. diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c
  718. index 3fccd08..d57314f 100644
  719. --- a/libavcodec/x86/h264dsp_mmx.c
  720. +++ b/libavcodec/x86/h264dsp_mmx.c
  721. @@ -326,6 +326,41 @@ H264_BIWEIGHT_MMX    ( 4,  8)
  722.  H264_BIWEIGHT_MMX    ( 4,  4)
  723.  H264_BIWEIGHT_MMX    ( 4,  2)
  724.  
  725. +#define H264_WEIGHT_10(W, H, DEPTH, OPT) \
  726. +void ff_h264_weight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
  727. +    int stride, int log2_denom, int weight, int offset);
  728. +
  729. +#define H264_BIWEIGHT_10(W, H, DEPTH, OPT) \
  730. +void ff_h264_biweight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT \
  731. +    (uint8_t *dst, uint8_t *src, int stride, int log2_denom, \
  732. +   int weightd, int weights, int offset);
  733. +
  734. +#define H264_WEIGHT_10_SSE(W, H, DEPTH) \
  735. +H264_WEIGHT_10(W, H, DEPTH, sse2) \
  736. +H264_WEIGHT_10(W, H, DEPTH, sse4)
  737. +
  738. +#define H264_BIWEIGHT_10_SSE(W, H, DEPTH) \
  739. +H264_BIWEIGHT_10(W, H, DEPTH, sse2) \
  740. +H264_BIWEIGHT_10(W, H, DEPTH, sse4)
  741. +
  742. +H264_WEIGHT_10_SSE(16, 16, 10)
  743. +H264_WEIGHT_10_SSE(16,  8, 10)
  744. +H264_WEIGHT_10_SSE( 8, 16, 10)
  745. +H264_WEIGHT_10_SSE( 8,  8, 10)
  746. +H264_WEIGHT_10_SSE( 8,  4, 10)
  747. +H264_WEIGHT_10_SSE( 4,  8, 10)
  748. +H264_WEIGHT_10_SSE( 4,  4, 10)
  749. +H264_WEIGHT_10_SSE( 4,  2, 10)
  750. +
  751. +H264_BIWEIGHT_10_SSE(16, 16, 10)
  752. +H264_BIWEIGHT_10_SSE(16,  8, 10)
  753. +H264_BIWEIGHT_10_SSE( 8, 16, 10)
  754. +H264_BIWEIGHT_10_SSE( 8,  8, 10)
  755. +H264_BIWEIGHT_10_SSE( 8,  4, 10)
  756. +H264_BIWEIGHT_10_SSE( 4,  8, 10)
  757. +H264_BIWEIGHT_10_SSE( 4,  4, 10)
  758. +H264_BIWEIGHT_10_SSE( 4,  2, 10)
  759. +
  760.  void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
  761.  {
  762.      int mm_flags = av_get_cpu_flags();
  763. @@ -454,6 +489,24 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
  764.                  c->h264_idct8_add4     = ff_h264_idct8_add4_10_sse2;
  765.  #endif
  766.  
  767. +                c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_10_sse2;
  768. +                c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_10_sse2;
  769. +                c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_10_sse2;
  770. +                c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_10_sse2;
  771. +                c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_10_sse2;
  772. +                c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_10_sse2;
  773. +                c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_10_sse2;
  774. +                c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_10_sse2;
  775. +
  776. +                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_10_sse2;
  777. +                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_10_sse2;
  778. +                c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_10_sse2;
  779. +                c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_10_sse2;
  780. +                c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_10_sse2;
  781. +                c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_10_sse2;
  782. +                c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_10_sse2;
  783. +                c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_10_sse2;
  784. +
  785.                  c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2;
  786.                  c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2;
  787.  #if HAVE_ALIGNED_STACK
  788. @@ -463,6 +516,25 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
  789.                  c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
  790.  #endif
  791.              }
  792. +            if (mm_flags&AV_CPU_FLAG_SSE4) {
  793. +                c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_10_sse4;
  794. +                c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_10_sse4;
  795. +                c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_10_sse4;
  796. +                c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_10_sse4;
  797. +                c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_10_sse4;
  798. +                c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_10_sse4;
  799. +                c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_10_sse4;
  800. +                c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_10_sse4;
  801. +
  802. +                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_10_sse4;
  803. +                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_10_sse4;
  804. +                c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_10_sse4;
  805. +                c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_10_sse4;
  806. +                c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_10_sse4;
  807. +                c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_10_sse4;
  808. +                c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_10_sse4;
  809. +                c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_10_sse4;
  810. +            }
  811.  #if HAVE_AVX
  812.              if (mm_flags&AV_CPU_FLAG_AVX) {
  813.                  c->h264_idct_dc_add    =
  814. --
  815. 1.7.5.1
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement