Advertisement
Guest User

Untitled

a guest
Sep 15th, 2017
455
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Diff 26.81 KB | None | 0 0
  1. From 8a876dc2921aff14ef3173d75fbb3391c817c259 Mon Sep 17 00:00:00 2001
  2. From: Daniel Kang <daniel.d.kang@gmail.com>
  3. Date: Sun, 5 Jun 2011 18:33:23 -0400
  4. Subject: [PATCH 1/2] H.264: Add x86 assembly for 10-bit MC Chroma H.264
  5.  functions.
  6.  
  7. Mainly ported from 8-bit H.264 MC Chroma.
  8. ---
  9. libavcodec/x86/Makefile                |    1 +
  10.  libavcodec/x86/dsputil_mmx.c           |   32 ++++
  11.  libavcodec/x86/h264_chromamc_10bit.asm |  278 ++++++++++++++++++++++++++++++++
  12.  3 files changed, 311 insertions(+), 0 deletions(-)
  13.  create mode 100644 libavcodec/x86/h264_chromamc_10bit.asm
  14.  
  15. diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
  16. index 1c451c8..ea57bd1 100644
  17. --- a/libavcodec/x86/Makefile
  18. +++ b/libavcodec/x86/Makefile
  19. @@ -44,6 +44,7 @@ MMX-OBJS-$(HAVE_YASM)                  += x86/dsputil_yasm.o            \
  20.                                            x86/deinterlace.o             \
  21.                                            x86/fmtconvert.o              \
  22.                                            x86/h264_chromamc.o           \
  23. +                                          x86/h264_chromamc_10bit.o     \
  24.                                            $(YASM-OBJS-yes)
  25.  
  26.  MMX-OBJS-$(CONFIG_FFT)                 += x86/fft.o
  27. diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
  28. index 1cc6991..d68a493 100644
  29. --- a/libavcodec/x86/dsputil_mmx.c
  30. +++ b/libavcodec/x86/dsputil_mmx.c
  31. @@ -1938,6 +1938,19 @@ void ff_avg_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
  32.  void ff_avg_h264_chroma_mc4_ssse3     (uint8_t *dst, uint8_t *src,
  33.                                         int stride, int h, int x, int y);
  34.  
  35. +#define CHROMA_MC(OP, NUM, DEPTH, OPT) \
  36. +void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
  37. +                                      (uint8_t *dst, uint8_t *src,\
  38. +                                       int stride, int h, int x, int y);
  39. +
  40. +CHROMA_MC(put, 2, 10, mmxext)
  41. +CHROMA_MC(avg, 2, 10, mmxext)
  42. +CHROMA_MC(put, 4, 10, sse2)
  43. +CHROMA_MC(avg, 4, 10, sse2)
  44. +CHROMA_MC(put, 8, 10, sse2)
  45. +CHROMA_MC(avg, 8, 10, sse2)
  46. +CHROMA_MC(put, 8, 10, avx)
  47. +CHROMA_MC(avg, 8, 10, avx)
  48.  
  49.  /* CAVS specific */
  50.  void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
  51. @@ -2420,6 +2433,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
  52.  {
  53.      int mm_flags = av_get_cpu_flags();
  54.      const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
  55. +    const int bit_depth = avctx->bits_per_raw_sample;
  56.  
  57.      if (avctx->dsp_mask) {
  58.          if (avctx->dsp_mask & AV_CPU_FLAG_FORCE)
  59. @@ -2651,6 +2665,10 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
  60.              c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2;
  61.              c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_mmx2;
  62.              }
  63. +            if (bit_depth == 10) {
  64. +                c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_10_mmxext;
  65. +                c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_10_mmxext;
  66. +            }
  67.  
  68.              c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
  69.  #endif
  70. @@ -2756,6 +2774,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
  71.              H264_QPEL_FUNCS(3, 2, sse2);
  72.              H264_QPEL_FUNCS(3, 3, sse2);
  73.              }
  74. +            if (bit_depth == 10) {
  75. +                c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_sse2;
  76. +                c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_sse2;
  77. +                c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_10_sse2;
  78. +                c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_10_sse2;
  79. +            }
  80.          }
  81.  #if HAVE_SSSE3
  82.          if(mm_flags & AV_CPU_FLAG_SSSE3){
  83. @@ -2854,6 +2878,14 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
  84.              }
  85.  #endif
  86.          }
  87. +#if HAVE_AVX
  88. +        if (mm_flags & AV_CPU_FLAG_AVX) {
  89. +            if (bit_depth == 10) {
  90. +                c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_avx;
  91. +                c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_avx;
  92. +            }
  93. +        }
  94. +#endif
  95.      }
  96.  
  97.      if (CONFIG_ENCODERS)
  98. diff --git a/libavcodec/x86/h264_chromamc_10bit.asm b/libavcodec/x86/h264_chromamc_10bit.asm
  99. new file mode 100644
  100. index 0000000..c4ad900
  101. --- /dev/null
  102. +++ b/libavcodec/x86/h264_chromamc_10bit.asm
  103. @@ -0,0 +1,278 @@
  104. +;*****************************************************************************
  105. +;* MMX/SSE2/AVX-optimized 10-bit H.264 chroma MC code
  106. +;*****************************************************************************
  107. +;* Copyright (C) 2005-2011 x264 project
  108. +;*
  109. +;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
  110. +;*
  111. +;* This file is part of Libav.
  112. +;*
  113. +;* Libav is free software; you can redistribute it and/or
  114. +;* modify it under the terms of the GNU Lesser General Public
  115. +;* License as published by the Free Software Foundation; either
  116. +;* version 2.1 of the License, or (at your option) any later version.
  117. +;*
  118. +;* Libav is distributed in the hope that it will be useful,
  119. +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  120. +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  121. +;* Lesser General Public License for more details.
  122. +;*
  123. +;* You should have received a copy of the GNU Lesser General Public
  124. +;* License along with Libav; if not, write to the Free Software
  125. +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  126. +;******************************************************************************
  127. +
  128. +%include "x86inc.asm"
  129. +%include "x86util.asm"
  130. +
  131. +SECTION_RODATA
  132. +
  133. +cextern pw_4
  134. +cextern pw_8
  135. +cextern pw_32
  136. +cextern pw_64
  137. +
  138. +SECTION .text
  139. +
  140. +
  141. +%macro mv0_pixels_mc8 0
  142. +    lea           r4, [r2*2 ]
  143. +.next4rows
  144. +    movu          m0, [r1   ]
  145. +    movu          m1, [r1+r2]
  146. +    CHROMAMC_AVG  m0, [r0   ]
  147. +    CHROMAMC_AVG  m1, [r0+r2]
  148. +    mova     [r0   ], m0
  149. +    mova     [r0+r2], m1
  150. +    add           r0, r4
  151. +    add           r1, r4
  152. +    movu          m0, [r1   ]
  153. +    movu          m1, [r1+r2]
  154. +    CHROMAMC_AVG  m0, [r0   ]
  155. +    CHROMAMC_AVG  m1, [r0+r2]
  156. +    add           r1, r4
  157. +    mova     [r0   ], m0
  158. +    mova     [r0+r2], m1
  159. +    add           r0, r4
  160. +    sub          r3d, 4
  161. +    jne .next4rows
  162. +%endmacro
  163. +
  164. +;-----------------------------------------------------------------------------
  165. +; void put/avg_h264_chroma_mc8(pixel *dst, pixel *src, int stride, int h, int mx, int my)
  166. +;-----------------------------------------------------------------------------
  167. +%macro CHROMA_MC8 2
  168. +; put/avg_h264_chroma_mc8_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/,
  169. +;                              int stride, int h, int mx, int my)
  170. +cglobal %1_h264_chroma_mc8_10_%2, 6,7,8
  171. +    movsxdifnidn  r2, r2d
  172. +    mov          r6d, r5d
  173. +    or           r6d, r4d
  174. +    jne .at_least_one_non_zero
  175. +    ; mx == 0 AND my == 0 - no filter needed
  176. +    mv0_pixels_mc8
  177. +    REP_RET
  178. +
  179. +.at_least_one_non_zero
  180. +    mov          r6d, 2
  181. +    test         r5d, r5d
  182. +    je .my_is_zero
  183. +    mov           r6, r2        ; dxy = x ? 1 : stride
  184. +    test         r4d, r4d
  185. +    jne .both_non_zero
  186. +.my_is_zero
  187. +    ; mx == 0 XOR my == 0 - 1 dimensional filter only
  188. +    or           r4d, r5d       ; x + y
  189. +    movd          m5, r4d
  190. +    mova          m4, [pw_8]
  191. +    mova          m6, [pw_4]    ; mm6 = rnd >> 3
  192. +    SPLATW        m5, m5        ; mm5 = B = x
  193. +    psubw         m4, m5        ; mm4 = A = 8-x
  194. +
  195. +.next1drow
  196. +    movu          m0, [r1   ]   ; mm0 = src[0..7]
  197. +    movu          m2, [r1+r6]   ; mm2 = src[1..8]
  198. +
  199. +    pmullw        m0, m4        ; mm0 = A * src[0..7]
  200. +    pmullw        m2, m5        ; mm2 = B * src[1..8]
  201. +
  202. +    paddw         m0, m6
  203. +    paddw         m0, m2
  204. +    psrlw         m0, 3
  205. +    CHROMAMC_AVG  m0, [r0]
  206. +    mova        [r0], m0        ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3
  207. +
  208. +    add           r0, r2
  209. +    add           r1, r2
  210. +    dec           r3d
  211. +    jne .next1drow
  212. +    REP_RET
  213. +
  214. +.both_non_zero ; general case, bilinear
  215. +    movd          m4, r4m         ; x
  216. +    movd          m6, r5m         ; y
  217. +
  218. +    SPLATW        m4, m4          ; mm4 = x words
  219. +    SPLATW        m6, m6          ; mm6 = y words
  220. +    psllw         m5, m4, 3       ; mm5 = 8x
  221. +    pmullw        m4, m6          ; mm4 = x * y
  222. +    psllw         m6, 3           ; mm6 = 8y
  223. +    paddw         m1, m5, m6      ; mm1 = 8x+8y
  224. +    mova          m7, m4          ; DD = x * y
  225. +    psubw         m5, m4          ; mm5 = B = 8x - xy
  226. +    psubw         m6, m4          ; mm6 = C = 8y - xy
  227. +    paddw         m4, [pw_64]
  228. +    psubw         m4, m1          ; mm4 = A = xy - (8x+8y) + 64
  229. +
  230. +    movu          m0, [r1  ]      ; mm0 = src[0..7]
  231. +    movu          m1, [r1+2]      ; mm1 = src[1..8]
  232. +.next2drow
  233. +    add           r1, r2
  234. +
  235. +    pmullw        m2, m0, m4
  236. +    pmullw        m1, m5
  237. +    paddw         m2, m1          ; mm2 = A * src[0..7] + B * src[1..8]
  238. +
  239. +    movu          m0, [r1]
  240. +    movu          m1, [r1+2]
  241. +    pmullw        m3, m0, m6
  242. +    paddw         m2, m3          ; mm2 += C * src[0..7+stride]
  243. +    pmullw        m3, m1, m7
  244. +    paddw         m2, m3          ; mm2 += D * src[1..8+stride]
  245. +
  246. +    paddw         m2, [pw_32]
  247. +    psrlw         m2, 6
  248. +    CHROMAMC_AVG  m2, [r0]
  249. +    mova        [r0], m2          ; dst[0..7] = (mm2 + 32) >> 6
  250. +
  251. +    add           r0, r2
  252. +    dec          r3d
  253. +    jne .next2drow
  254. +    REP_RET
  255. +%endmacro
  256. +
  257. +;-----------------------------------------------------------------------------
  258. +; void put/avg_h264_chroma_mc4(pixel *dst, pixel *src, int stride, int h, int mx, int my)
  259. +;-----------------------------------------------------------------------------
  260. +%macro MC4_OP 2
  261. +    movq          %1, [r1  ]
  262. +    movq          m1, [r1+2]
  263. +    add           r1, r2
  264. +    pmullw        %1, m4
  265. +    pmullw        m1, m2
  266. +    paddw         m1, %1
  267. +    mova          %1, m1
  268. +
  269. +    pmullw        %2, m5
  270. +    pmullw        m1, m3
  271. +    paddw         %2, [pw_32]
  272. +    paddw         m1, %2
  273. +    psrlw         m1, 6
  274. +    CHROMAMC_AVG4 m1, %2, [r0]
  275. +    movq        [r0], m1
  276. +    add           r0, r2
  277. +%endmacro
  278. +
  279. +%macro CHROMA_MC4 2
  280. +cglobal %1_h264_chroma_mc4_10_%2, 6,6,7
  281. +    movsxdifnidn  r2, r2d
  282. +    movd          m2, r4m         ; x
  283. +    movd          m3, r5m         ; y
  284. +    mova          m4, [pw_8]
  285. +    mova          m5, m4
  286. +    SPLATW        m2, m2
  287. +    SPLATW        m3, m3
  288. +    psubw         m4, m2
  289. +    psubw         m5, m3
  290. +
  291. +    movq          m0, [r1  ]
  292. +    movq          m6, [r1+2]
  293. +    add           r1, r2
  294. +    pmullw        m0, m4
  295. +    pmullw        m6, m2
  296. +    paddw         m6, m0
  297. +
  298. +.next2rows
  299. +    MC4_OP m0, m6
  300. +    MC4_OP m6, m0
  301. +    sub   r3d, 2
  302. +    jnz .next2rows
  303. +    REP_RET
  304. +%endmacro
  305. +
  306. +;-----------------------------------------------------------------------------
  307. +; void put/avg_h264_chroma_mc2(pixel *dst, pixel *src, int stride, int h, int mx, int my)
  308. +;-----------------------------------------------------------------------------
  309. +%macro CHROMA_MC2 2
  310. +cglobal %1_h264_chroma_mc2_10_%2, 6,7
  311. +    movsxdifnidn  r2, r2d
  312. +    mov          r6d, r4d
  313. +    shl          r4d, 16
  314. +    sub          r4d, r6d
  315. +    add          r4d, 8
  316. +    imul         r5d, r4d         ; x*y<<16 | y*(8-x)
  317. +    shl          r4d, 3
  318. +    sub          r4d, r5d         ; x*(8-y)<<16 | (8-x)*(8-y)
  319. +
  320. +    movd          m5, r4d
  321. +    movd          m6, r5d
  322. +    punpckldq     m5, m5          ; mm5 = {A,B,A,B}
  323. +    punpckldq     m6, m6          ; mm6 = {C,D,C,D}
  324. +    movq          m2, [r1]
  325. +    pxor          m7, m7
  326. +    pshufw        m2, m2, 0x94    ; mm2 = src[0,1,1,2]
  327. +
  328. +.nextrow
  329. +    add           r1, r2
  330. +    movq          m1, m2
  331. +    pmaddwd       m1, m5          ; mm1 = A * src[0,1] + B * src[1,2]
  332. +    movq          m0, [r1]
  333. +    pshufw        m0, m0, 0x94    ; mm0 = src[0,1,1,2]
  334. +    movq          m2, m0
  335. +    pmaddwd       m0, m6
  336. +    paddw         m1, [pw_32]
  337. +    paddw         m1, m0          ; mm1 += C * src[0,1] + D * src[1,2]
  338. +    psrlw         m1, 6
  339. +    packssdw      m1, m7
  340. +    CHROMAMC_AVG4 m1, m3, [r0]
  341. +    movd        [r0], m1
  342. +    add           r0, r2
  343. +    sub          r3d, 1
  344. +    jnz .nextrow
  345. +    REP_RET
  346. +%endmacro
  347. +
  348. +%macro NOTHING 2-3
  349. +%endmacro
  350. +%macro DIRECT_AVG 2
  351. +    PAVG          %1, %2
  352. +%endmacro
  353. +%macro COPY_AVG 3
  354. +    movq          %2, %3
  355. +    PAVG          %1, %2
  356. +%endmacro
  357. +
  358. +%define CHROMAMC_AVG  NOTHING
  359. +%define CHROMAMC_AVG4 NOTHING
  360. +INIT_XMM
  361. +CHROMA_MC8 put, sse2
  362. +%ifdef HAVE_AVX
  363. +CHROMA_MC8 put, avx
  364. +%endif
  365. +INIT_XMM
  366. +CHROMA_MC4 put, sse2
  367. +INIT_MMX
  368. +CHROMA_MC2 put, mmxext
  369. +
  370. +%define CHROMAMC_AVG  DIRECT_AVG
  371. +%define CHROMAMC_AVG4 COPY_AVG
  372. +%define PAVG          pavgw
  373. +INIT_XMM
  374. +CHROMA_MC8 avg, sse2
  375. +%ifdef HAVE_AVX
  376. +CHROMA_MC8 avg, avx
  377. +%endif
  378. +INIT_XMM
  379. +CHROMA_MC4 avg, sse2
  380. +INIT_MMX
  381. +CHROMA_MC2 avg, mmxext
  382. --
  383. 1.7.5.1
  384.  
  385.  
  386. From ab30144e5c598bdf1e058d445dd09ab5633cf5d0 Mon Sep 17 00:00:00 2001
  387. From: Daniel Kang <daniel.d.kang@gmail.com>
  388. Date: Mon, 6 Jun 2011 05:50:49 -0400
  389. Subject: [PATCH 2/2] weight WIP
  390.  
  391. ---
  392. libavcodec/x86/Makefile              |    1 +
  393.  libavcodec/x86/h264_weight_10bit.asm |  301 ++++++++++++++++++++++++++++++++++
  394.  libavcodec/x86/h264dsp_mmx.c         |   72 ++++++++
  395.  3 files changed, 374 insertions(+), 0 deletions(-)
  396.  create mode 100644 libavcodec/x86/h264_weight_10bit.asm
  397.  
  398. diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
  399. index ea57bd1..022ab27 100644
  400. --- a/libavcodec/x86/Makefile
  401. +++ b/libavcodec/x86/Makefile
  402. @@ -15,6 +15,7 @@ YASM-OBJS-$(CONFIG_H264DSP)            += x86/h264_deblock.o            \
  403.                                            x86/h264_idct.o               \
  404.                                            x86/h264_idct_10bit.o         \
  405.                                            x86/h264_weight.o             \
  406. +                                          x86/h264_weight_10bit.o       \
  407.  
  408.  YASM-OBJS-$(CONFIG_H264PRED)           += x86/h264_intrapred.o          \
  409.                                            x86/h264_intrapred_10bit.o
  410. diff --git a/libavcodec/x86/h264_weight_10bit.asm b/libavcodec/x86/h264_weight_10bit.asm
  411. new file mode 100644
  412. index 0000000..55bc8bc
  413. --- /dev/null
  414. +++ b/libavcodec/x86/h264_weight_10bit.asm
  415. @@ -0,0 +1,301 @@
  416. +;*****************************************************************************
  417. +;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code
  418. +;*****************************************************************************
  419. +;* Copyright (C) 2005-2011 x264 project
  420. +;*
  421. +;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
  422. +;*
  423. +;* This file is part of Libav.
  424. +;*
  425. +;* Libav is free software; you can redistribute it and/or
  426. +;* modify it under the terms of the GNU Lesser General Public
  427. +;* License as published by the Free Software Foundation; either
  428. +;* version 2.1 of the License, or (at your option) any later version.
  429. +;*
  430. +;* Libav is distributed in the hope that it will be useful,
  431. +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  432. +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  433. +;* Lesser General Public License for more details.
  434. +;*
  435. +;* You should have received a copy of the GNU Lesser General Public
  436. +;* License along with Libav; if not, write to the Free Software
  437. +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  438. +;******************************************************************************
  439. +
  440. +%include "x86inc.asm"
  441. +%include "x86util.asm"
  442. +
  443. +SECTION_RODATA 32
  444. +
  445. +pw_pixel_max: times 8 dw ((1 << 10)-1)
  446. +sq_1: dq 1
  447. +      dq 0
  448. +
  449. +cextern pw_1
  450. +
  451. +SECTION .text
  452. +
  453. +;-----------------------------------------------------------------------------
  454. +; void h264_weight(uint8_t *dst, int stride, int log2_denom,
  455. +;                  int weight, int offset);
  456. +;-----------------------------------------------------------------------------
  457. +
  458. +%macro WEIGHT_SETUP 1
  459. +    mova       m0, [pw_1]
  460. +    movd       m2, r2d
  461. +    pslld      m0, m2       ; 1<<log2_denom
  462. +    lea        r4, [r4*8+1] ; 1+(offset<<3)
  463. +    shl        r3, 1        ; weight << 1
  464. +    shl        r4, 16
  465. +    or         r4, r3
  466. +    movd       m3, r4d      ; weight<<1 | 1+(offset<<(3))
  467. +    pshufd     m3, m3, 0
  468. +    mova       m4, [pw_pixel_max]
  469. +    paddw      m2, [sq_1]   ; denom+1
  470. +%ifnidn %1, sse4
  471. +    pxor       m7, m7
  472. +%endif
  473. +%endmacro
  474. +
  475. +%macro WEIGHT_OP 3
  476. +;;;;;;;;;;;;;;;; can be optimized
  477. +    movq        m5, [r0+%2]
  478. +    movq        m6, [r0+%3]
  479. +    punpcklwd   m5, m0
  480. +    punpcklwd   m6, m0
  481. +    pmaddwd     m5, m3
  482. +    pmaddwd     m6, m3
  483. +    psrad       m5, m2
  484. +    psrad       m6, m2
  485. +%ifidn %1, sse4
  486. +    packusdw    m5, m6
  487. +    pminsw      m5, m4
  488. +%else
  489. +    packssdw    m5, m6
  490. +    CLIPW       m5, m7, m4
  491. +%endif
  492. +%endmacro
  493. +
  494. +%macro WEIGHT_FUNC_DBL 1
  495. +cglobal h264_weight_16x16_10_%1, 5,5,8
  496. +    WEIGHT_SETUP %1
  497. +    mov        r2, 16
  498. +.nextrow
  499. +    WEIGHT_OP %1,  0,  8
  500. +    mova     [r0   ], m5
  501. +    WEIGHT_OP %1, 16, 24
  502. +    mova     [r0+16], m5
  503. +    add        r0, r1
  504. +    dec        r2
  505. +    jnz .nextrow
  506. +    REP_RET
  507. +
  508. +cglobal h264_weight_16x8_10_%1, 5,5,8
  509. +    WEIGHT_SETUP %1
  510. +    mov        r2, 8
  511. +    jmp mangle(ff_h264_weight_16x16_10_%1.nextrow)
  512. +%endmacro
  513. +
  514. +INIT_XMM
  515. +WEIGHT_FUNC_DBL sse2
  516. +WEIGHT_FUNC_DBL sse4
  517. +
  518. +
  519. +%macro WEIGHT_FUNC_MM 1
  520. +cglobal h264_weight_8x16_10_%1, 7,7,8
  521. +    WEIGHT_SETUP %1
  522. +    mov        r2, 16
  523. +.nextrow
  524. +    WEIGHT_OP  %1, 0, 8
  525. +    mova     [r0], m5
  526. +    add        r0, r1
  527. +    dec        r2
  528. +    jnz .nextrow
  529. +    REP_RET
  530. +
  531. +cglobal h264_weight_8x8_10_%1, 7,7,8
  532. +    WEIGHT_SETUP %1
  533. +    mov        r2, 8
  534. +    jmp mangle(ff_h264_weight_8x16_10_%1.nextrow)
  535. +
  536. +cglobal h264_weight_8x4_10_%1, 7,7,8
  537. +    WEIGHT_SETUP %1
  538. +    mov        r2, 4
  539. +    jmp mangle(ff_h264_weight_8x16_10_%1.nextrow)
  540. +%endmacro
  541. +
  542. +INIT_XMM
  543. +WEIGHT_FUNC_MM sse2
  544. +WEIGHT_FUNC_MM sse4
  545. +
  546. +
  547. +%macro WEIGHT_FUNC_HALF_MM 1
  548. +cglobal h264_weight_4x8_10_%1, 5,5,8
  549. +    WEIGHT_SETUP %1
  550. +    mov        r2, 4
  551. +    lea        r3, [r1*2]
  552. +.nextrow
  553. +    WEIGHT_OP   %1, 0, r1
  554. +    movh      [r0], m5
  555. +    movhps [r0+r1], m5
  556. +    add         r0, r3
  557. +    dec         r2
  558. +    jnz .nextrow
  559. +    REP_RET
  560. +
  561. +cglobal h264_weight_4x4_10_%1, 5,5,8
  562. +    WEIGHT_SETUP %1
  563. +    mov        r2, 2
  564. +    lea        r3, [r1*2]
  565. +    jmp mangle(ff_h264_weight_4x8_10_%1.nextrow)
  566. +
  567. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; NEEDS TO BE TESTED
  568. +cglobal h264_weight_4x2_10_%1, 5,5,8
  569. +int 3
  570. +    WEIGHT_SETUP %1
  571. +    mov        r2, 1
  572. +    lea        r3, [r1*2]
  573. +    jmp mangle(ff_h264_weight_4x8_10_%1.nextrow)
  574. +%endmacro
  575. +
  576. +INIT_XMM
  577. +WEIGHT_FUNC_HALF_MM sse2
  578. +WEIGHT_FUNC_HALF_MM sse4
  579. +
  580. +
  581. +;-----------------------------------------------------------------------------
  582. +; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
  583. +;                    int weightd, int weights, int offset);
  584. +;-----------------------------------------------------------------------------
  585. +%macro BIWEIGHT_SETUP 1
  586. +    add        r6, 1
  587. +    or         r6, 1
  588. +    shl        r5, 16
  589. +    or         r4, r5
  590. +    movd       m4, r4d ; weightd | weights
  591. +    movd       m5, r6d ; (offset+1)|1
  592. +    movd       m6, r3d ; log2_denom
  593. +    pslld      m5, m6  ; ((offset+1)|1)<<log2_denom
  594. +    paddd      m6, [sq_1]
  595. +    pshufd     m4, m4, 0
  596. +    pshufd     m5, m5, 0
  597. +    mova       m3, [pw_pixel_max]
  598. +%ifnidn %1, sse4
  599. +    pxor       m7, m7
  600. +%endif
  601. +%endmacro
  602. +
  603. +%macro BIWEIGHT 2-3
  604. +%if %0==2
  605. +    mova       m0, [r0+%2]
  606. +    mova       m1, [r1+%2]
  607. +    punpckhwd  m2, m0, m1
  608. +    punpcklwd  m0, m1
  609. +%else
  610. +    movq       m0, [r0+%2]
  611. +    movq       m1, [r1+%2]
  612. +    punpcklwd  m0, m1
  613. +    movq       m2, [r0+%3]
  614. +    movq       m1, [r1+%3]
  615. +    punpcklwd  m2, m1
  616. +%endif
  617. +    pmaddwd    m0, m4
  618. +    pmaddwd    m2, m4
  619. +    paddd      m0, m5
  620. +    paddd      m2, m5
  621. +    psrad      m0, m6
  622. +    psrad      m2, m6
  623. +%ifidn %1, sse4
  624. +    packusdw   m0, m2
  625. +    pminsw     m0, m3
  626. +%else
  627. +    packssdw   m0, m2
  628. +    CLIPW      m0, m7, m3
  629. +%endif
  630. +%endmacro
  631. +
  632. +%macro BIWEIGHT_FUNC_DBL 1
  633. +cglobal h264_biweight_16x16_10_%1, 7,7,8
  634. +    BIWEIGHT_SETUP %1
  635. +    mov        r3, 16
  636. +.nextrow
  637. +    BIWEIGHT   %1,  0
  638. +    mova  [r0   ], m0
  639. +    BIWEIGHT   %1, 16
  640. +    mova  [r0+16], m0
  641. +    add        r0, r2
  642. +    add        r1, r2
  643. +    dec        r3
  644. +    jnz .nextrow
  645. +    REP_RET
  646. +
  647. +cglobal h264_biweight_16x8_10_%1, 7,7,8
  648. +    BIWEIGHT_SETUP %1
  649. +    mov r3, 8
  650. +    jmp mangle(ff_h264_biweight_16x16_10_%1.nextrow)
  651. +%endmacro
  652. +
  653. +INIT_XMM
  654. +BIWEIGHT_FUNC_DBL sse2
  655. +BIWEIGHT_FUNC_DBL sse4
  656. +
  657. +%macro BIWEIGHT_FUNC 1
  658. +cglobal h264_biweight_8x16_10_%1, 7,7,8
  659. +    BIWEIGHT_SETUP %1
  660. +    mov      r3, 16
  661. +.nextrow
  662. +    BIWEIGHT %1, 0
  663. +    mova   [r0], m0
  664. +    add      r0, r2
  665. +    add      r1, r2
  666. +    dec      r3
  667. +    jnz .nextrow
  668. +    REP_RET
  669. +
  670. +cglobal h264_biweight_8x8_10_%1, 7,7,8
  671. +    BIWEIGHT_SETUP %1
  672. +    mov r3, 8
  673. +    jmp mangle(ff_h264_biweight_8x16_10_%1.nextrow)
  674. +
  675. +cglobal h264_biweight_8x4_10_%1, 7,7,8
  676. +    BIWEIGHT_SETUP %1
  677. +    mov r3, 4
  678. +    jmp mangle(ff_h264_biweight_8x16_10_%1.nextrow)
  679. +%endmacro
  680. +
  681. +INIT_XMM
  682. +BIWEIGHT_FUNC sse2
  683. +BIWEIGHT_FUNC sse4
  684. +
  685. +%macro BIWEIGHT_FUNC_HALF 1
  686. +cglobal h264_biweight_4x8_10_%1, 7,7,8
  687. +    BIWEIGHT_SETUP %1
  688. +    mov        r3, 4
  689. +    lea        r4, [r2*2]
  690. +.nextrow
  691. +    BIWEIGHT    %1, 0, r2
  692. +    movh   [r0   ], m0
  693. +    movhps [r0+r2], m0
  694. +    add         r0, r4
  695. +    add         r1, r4
  696. +    dec         r3
  697. +    jnz .nextrow
  698. +    REP_RET
  699. +
  700. +cglobal h264_biweight_4x4_10_%1, 7,7,8
  701. +    BIWEIGHT_SETUP %1
  702. +    mov        r3, 2
  703. +    lea        r4, [r2*2]
  704. +    jmp mangle(ff_h264_biweight_4x8_10_%1.nextrow)
  705. +
  706. +cglobal h264_biweight_4x2_10_%1, 7,7,8
  707. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; NEEDS TO BE TESTED
  708. +    BIWEIGHT_SETUP %1
  709. +    mov        r3, 2
  710. +    lea        r4, [r2*2]
  711. +    jmp mangle(ff_h264_biweight_4x8_10_%1.nextrow)
  712. +%endmacro
  713. +
  714. +INIT_XMM
  715. +BIWEIGHT_FUNC_HALF sse2
  716. +BIWEIGHT_FUNC_HALF sse4
  717. diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c
  718. index 3fccd08..d57314f 100644
  719. --- a/libavcodec/x86/h264dsp_mmx.c
  720. +++ b/libavcodec/x86/h264dsp_mmx.c
  721. @@ -326,6 +326,41 @@ H264_BIWEIGHT_MMX    ( 4,  8)
  722.  H264_BIWEIGHT_MMX    ( 4,  4)
  723.  H264_BIWEIGHT_MMX    ( 4,  2)
  724.  
  725. +#define H264_WEIGHT_10(W, H, DEPTH, OPT) \
  726. +void ff_h264_weight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
  727. +    int stride, int log2_denom, int weight, int offset);
  728. +
  729. +#define H264_BIWEIGHT_10(W, H, DEPTH, OPT) \
  730. +void ff_h264_biweight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT \
  731. +    (uint8_t *dst, uint8_t *src, int stride, int log2_denom, \
  732. +   int weightd, int weights, int offset);
  733. +
  734. +#define H264_WEIGHT_10_SSE(W, H, DEPTH) \
  735. +H264_WEIGHT_10(W, H, DEPTH, sse2) \
  736. +H264_WEIGHT_10(W, H, DEPTH, sse4)
  737. +
  738. +#define H264_BIWEIGHT_10_SSE(W, H, DEPTH) \
  739. +H264_BIWEIGHT_10(W, H, DEPTH, sse2) \
  740. +H264_BIWEIGHT_10(W, H, DEPTH, sse4)
  741. +
  742. +H264_WEIGHT_10_SSE(16, 16, 10)
  743. +H264_WEIGHT_10_SSE(16,  8, 10)
  744. +H264_WEIGHT_10_SSE( 8, 16, 10)
  745. +H264_WEIGHT_10_SSE( 8,  8, 10)
  746. +H264_WEIGHT_10_SSE( 8,  4, 10)
  747. +H264_WEIGHT_10_SSE( 4,  8, 10)
  748. +H264_WEIGHT_10_SSE( 4,  4, 10)
  749. +H264_WEIGHT_10_SSE( 4,  2, 10)
  750. +
  751. +H264_BIWEIGHT_10_SSE(16, 16, 10)
  752. +H264_BIWEIGHT_10_SSE(16,  8, 10)
  753. +H264_BIWEIGHT_10_SSE( 8, 16, 10)
  754. +H264_BIWEIGHT_10_SSE( 8,  8, 10)
  755. +H264_BIWEIGHT_10_SSE( 8,  4, 10)
  756. +H264_BIWEIGHT_10_SSE( 4,  8, 10)
  757. +H264_BIWEIGHT_10_SSE( 4,  4, 10)
  758. +H264_BIWEIGHT_10_SSE( 4,  2, 10)
  759. +
  760.  void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
  761.  {
  762.      int mm_flags = av_get_cpu_flags();
  763. @@ -454,6 +489,24 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
  764.                  c->h264_idct8_add4     = ff_h264_idct8_add4_10_sse2;
  765.  #endif
  766.  
  767. +                c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_10_sse2;
  768. +                c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_10_sse2;
  769. +                c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_10_sse2;
  770. +                c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_10_sse2;
  771. +                c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_10_sse2;
  772. +                c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_10_sse2;
  773. +                c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_10_sse2;
  774. +                c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_10_sse2;
  775. +
  776. +                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_10_sse2;
  777. +                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_10_sse2;
  778. +                c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_10_sse2;
  779. +                c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_10_sse2;
  780. +                c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_10_sse2;
  781. +                c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_10_sse2;
  782. +                c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_10_sse2;
  783. +                c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_10_sse2;
  784. +
  785.                  c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2;
  786.                  c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2;
  787.  #if HAVE_ALIGNED_STACK
  788. @@ -463,6 +516,25 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
  789.                  c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
  790.  #endif
  791.              }
  792. +            if (mm_flags&AV_CPU_FLAG_SSE4) {
  793. +                c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_10_sse4;
  794. +                c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_10_sse4;
  795. +                c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_10_sse4;
  796. +                c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_10_sse4;
  797. +                c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_10_sse4;
  798. +                c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_10_sse4;
  799. +                c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_10_sse4;
  800. +                c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_10_sse4;
  801. +
  802. +                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_10_sse4;
  803. +                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_10_sse4;
  804. +                c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_10_sse4;
  805. +                c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_10_sse4;
  806. +                c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_10_sse4;
  807. +                c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_10_sse4;
  808. +                c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_10_sse4;
  809. +                c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_10_sse4;
  810. +            }
  811.  #if HAVE_AVX
  812.              if (mm_flags&AV_CPU_FLAG_AVX) {
  813.                  c->h264_idct_dc_add    =
  814. --
  815. 1.7.5.1
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement