Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- From dc3ee1e5dcc8cf010223976be504f88bbe4d9889 Mon Sep 17 00:00:00 2001
- From: Daniel Kang <daniel.d.kang@gmail.com>
- Date: Sun, 5 Jun 2011 18:33:23 -0400
- Subject: [PATCH 1/2] H.264: Add x86 assembly for 10-bit MC Chroma H.264
- functions.
- Mainly ported from 8-bit H.264 MC Chroma.
- ---
- libavcodec/x86/Makefile | 1 +
- libavcodec/x86/dsputil_mmx.c | 32 ++++
- libavcodec/x86/h264_chromamc_10bit.asm | 279 ++++++++++++++++++++++++++++++++
- 3 files changed, 312 insertions(+), 0 deletions(-)
- create mode 100644 libavcodec/x86/h264_chromamc_10bit.asm
- diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
- index 1c451c8..ea57bd1 100644
- --- a/libavcodec/x86/Makefile
- +++ b/libavcodec/x86/Makefile
- @@ -44,6 +44,7 @@ MMX-OBJS-$(HAVE_YASM) += x86/dsputil_yasm.o \
- x86/deinterlace.o \
- x86/fmtconvert.o \
- x86/h264_chromamc.o \
- + x86/h264_chromamc_10bit.o \
- $(YASM-OBJS-yes)
- MMX-OBJS-$(CONFIG_FFT) += x86/fft.o
- diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
- index 1cc6991..7ac9679 100644
- --- a/libavcodec/x86/dsputil_mmx.c
- +++ b/libavcodec/x86/dsputil_mmx.c
- @@ -1938,6 +1938,19 @@ void ff_avg_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
- void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
- +#define CHROMA_MC(OP, NUM, DEPTH, OPT) \
- +void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
- + (uint8_t *dst, uint8_t *src,\
- + int stride, int h, int x, int y);
- +
- +CHROMA_MC(put, 2, 10, mmxext)
- +CHROMA_MC(avg, 2, 10, mmxext)
- +CHROMA_MC(put, 4, 10, mmxext)
- +CHROMA_MC(avg, 4, 10, mmxext)
- +CHROMA_MC(put, 8, 10, sse2)
- +CHROMA_MC(avg, 8, 10, sse2)
- +CHROMA_MC(put, 8, 10, avx)
- +CHROMA_MC(avg, 8, 10, avx)
- /* CAVS specific */
- void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
- @@ -2420,6 +2433,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
- {
- int mm_flags = av_get_cpu_flags();
- const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
- + const int bit_depth = avctx->bits_per_raw_sample;
- if (avctx->dsp_mask) {
- if (avctx->dsp_mask & AV_CPU_FLAG_FORCE)
- @@ -2651,6 +2665,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
- c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2;
- c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_mmx2;
- }
- + if (bit_depth == 10) {
- + c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_10_mmxext;
- + c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_10_mmxext;
- + c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_10_mmxext;
- + c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_10_mmxext;
- + }
- c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
- #endif
- @@ -2756,6 +2776,10 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
- H264_QPEL_FUNCS(3, 2, sse2);
- H264_QPEL_FUNCS(3, 3, sse2);
- }
- + if (bit_depth == 10) {
- + c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_sse2;
- + c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_sse2;
- + }
- }
- #if HAVE_SSSE3
- if(mm_flags & AV_CPU_FLAG_SSSE3){
- @@ -2854,6 +2878,14 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
- }
- #endif
- }
- +#if HAVE_AVX
- + if (mm_flags & AV_CPU_FLAG_AVX) {
- + if (bit_depth == 10) {
- + c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_avx;
- + c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_avx;
- + }
- + }
- +#endif
- }
- if (CONFIG_ENCODERS)
- diff --git a/libavcodec/x86/h264_chromamc_10bit.asm b/libavcodec/x86/h264_chromamc_10bit.asm
- new file mode 100644
- index 0000000..757d99f
- --- /dev/null
- +++ b/libavcodec/x86/h264_chromamc_10bit.asm
- @@ -0,0 +1,279 @@
- +;*****************************************************************************
- +;* MMX/SSE2/AVX-optimized 10-bit H.264 chroma MC code
- +;*****************************************************************************
- +;* Copyright (C) 2005-2011 x264 project
- +;*
- +;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
- +;*
- +;* This file is part of Libav.
- +;*
- +;* Libav is free software; you can redistribute it and/or
- +;* modify it under the terms of the GNU Lesser General Public
- +;* License as published by the Free Software Foundation; either
- +;* version 2.1 of the License, or (at your option) any later version.
- +;*
- +;* Libav is distributed in the hope that it will be useful,
- +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
- +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- +;* Lesser General Public License for more details.
- +;*
- +;* You should have received a copy of the GNU Lesser General Public
- +;* License along with Libav; if not, write to the Free Software
- +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- +;******************************************************************************
- +
- +%include "x86inc.asm"
- +%include "x86util.asm"
- +
- +SECTION_RODATA
- +
- +cextern pw_4
- +cextern pw_8
- +cextern pw_32
- +cextern pw_64
- +
- +SECTION .text
- +
- +
; Copy (put) or average-with-dst (avg, via the CHROMAMC_AVG hook) r3d rows of
; 8 chroma pixels (16 bytes at 10-bit) from src (r1) to dst (r0), four rows
; per iteration.  Used for the mx == my == 0 case where no filtering is
; needed.  Clobbers r4 (holds 2*stride), m0, m1; counts r3d down to 0.
%macro mv0_pixels_mc8 0
    lea           r4, [r2*2 ]       ; r4 = 2*stride (bytes)
.next4rows
    movu          m0, [r1   ]       ; rows 0 and 1
    movu          m1, [r1+r2]
    CHROMAMC_AVG  m0, [r0   ]       ; no-op for put, pavgw with dst for avg
    CHROMAMC_AVG  m1, [r0+r2]
    mova          [r0   ], m0
    mova          [r0+r2], m1
    add           r0, r4
    add           r1, r4
    movu          m0, [r1   ]       ; rows 2 and 3
    movu          m1, [r1+r2]
    CHROMAMC_AVG  m0, [r0   ]
    CHROMAMC_AVG  m1, [r0+r2]
    add           r1, r4
    mova          [r0   ], m0
    mova          [r0+r2], m1
    add           r0, r4
    sub           r3d, 4            ; four rows done
    jne .next4rows
%endmacro
- +
- +;-----------------------------------------------------------------------------
- +; void put/avg_h264_chroma_mc8(pixel *dst, pixel *src, int stride, int h, int mx, int my)
- +;-----------------------------------------------------------------------------
%macro CHROMA_MC8 2
; %1 = put/avg, %2 = cpu suffix.
; void %1_h264_chroma_mc8_10_%2(pixel *dst /*align 16*/, pixel *src,
;                               int stride, int h, int mx, int my)
; 10-bit pixels are 16-bit words, so one row of 8 chroma pixels is 16 bytes
; (a full xmm register) and "next pixel" is a byte offset of 2.
; Bilinear coefficients: A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy.
cglobal %1_h264_chroma_mc8_10_%2, 6,7,8
    movsxdifnidn r2, r2d      ; sign-extend stride on x86_64 ABIs
    mov r6d, r5d
    or  r6d, r4d              ; mx | my
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero
    mov r6d, 2                ; dxy = 2 bytes (next pixel) when my == 0
    test r5d, r5d
    je .my_is_zero
    mov r6, r2                ; dxy = stride (next row) when mx == 0
    test r4d, r4d
    jne .both_non_zero
.my_is_zero
    ; mx == 0 XOR my == 0 - 1 dimensional filter only
    or   r4d, r5d             ; x + y (whichever of the two is non-zero)
    movd m5, r4d
    mova m4, [pw_8]
    mova m6, [pw_4]           ; mm6 = rnd >> 3
    SPLATW m5, m5             ; mm5 = B = x
    psubw  m4, m5             ; mm4 = A = 8-x

.next1drow
    movu m0, [r1   ]          ; mm0 = src[0..7]
    movu m2, [r1+r6]          ; mm2 = src[1..8] (or next row, per r6)

    pmullw m0, m4             ; mm0 = A * src[0..7]
    pmullw m2, m5             ; mm2 = B * src[1..8]

    paddw m0, m6
    paddw m0, m2
    psrlw m0, 3
    CHROMAMC_AVG m0, [r0]     ; no-op for put, pavgw with dst for avg
    mova [r0], m0             ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3

    add r0, r2
    add r1, r2
    dec r3d                   ; h rows in total
    jne .next1drow
    REP_RET

.both_non_zero ; general case, bilinear
    movd m4, r4m              ; x
    movd m6, r5m              ; y

    SPLATW m4, m4             ; mm4 = x words
    SPLATW m6, m6             ; mm6 = y words
    psllw  m5, m4, 3          ; mm5 = 8x (3-op form; x86inc emulates pre-AVX)
    pmullw m4, m6             ; mm4 = x * y
    psllw  m6, 3              ; mm6 = 8y
    paddw  m1, m5, m6         ; mm1 = 8x+8y
    mova   m7, m4             ; mm7 = D = x * y
    psubw  m5, m4             ; mm5 = B = 8x - xy
    psubw  m6, m4             ; mm6 = C = 8y - xy
    paddw  m4, [pw_64]
    psubw  m4, m1             ; mm4 = A = xy - (8x+8y) + 64

    movu m0, [r1  ]           ; mm0 = src[0..7]
    movu m1, [r1+2]           ; mm1 = src[1..8]
.next2drow
    add r1, r2                ; current row already loaded; advance to next

    pmullw m2, m0, m4
    pmullw m1, m5
    paddw  m2, m1             ; mm2 = A * src[0..7] + B * src[1..8]

    movu m0, [r1]             ; next row; becomes "current" next iteration
    movu m1, [r1+2]
    pmullw m3, m0, m6
    paddw  m2, m3             ; mm2 += C * src[0..7+stride]
    pmullw m3, m1, m7
    paddw  m2, m3             ; mm2 += D * src[1..8+stride]

    paddw m2, [pw_32]
    psrlw m2, 6
    CHROMAMC_AVG m2, [r0]
    mova [r0], m2             ; dst[0..7] = (mm2 + 32) >> 6

    add r0, r2
    dec r3d
    jne .next2drow
    REP_RET
%endmacro
- +
- +;-----------------------------------------------------------------------------
- +; void put/avg_h264_chroma_mc4(pixel *dst, pixel *src, int stride, int h, int mx, int my)
- +;-----------------------------------------------------------------------------
- +;TODO: xmm mc4
%macro MC4_OP 2
; One row of the 4-wide bilinear filter.  %1 receives this row's horizontal
; term ((8-x)*src[0..3] + x*src[1..4]) for use by the NEXT call; %2 holds the
; previous row's term.  The caller alternates MC4_OP m0,m6 / MC4_OP m6,m0 so
; each horizontal combination is computed once and used for two output rows.
; Coefficients: m4 = 8-x, m2 = x (horizontal); m5 = 8-y, m3 = y (vertical).
    movq %1, [r1  ]
    movq m1, [r1+2]
    add r1, r2
    pmullw %1, m4
    pmullw m1, m2
    paddw m1, %1               ; current row: (8-x)*cur + x*next pixel
    mova %1, m1                ; keep it for the next MC4_OP call

    pmullw %2, m5              ; (8-y) * previous row's term
    pmullw m1, m3              ; y * current row's term
    paddw %2, [pw_32]          ; rounding constant
    paddw m1, %2
    psrlw m1, 6
    CHROMAMC_AVG4 m1, %2, [r0] ; %2 doubles as scratch for the avg variant
    movq [r0], m1              ; store 4 pixels (8 bytes at 10-bit)
    add r0, r2
%endmacro
- +
%macro CHROMA_MC4 2
; void %1_h264_chroma_mc4_10_%2(pixel *dst, pixel *src,
;                               int stride, int h, int mx, int my)
; 4-pixel-wide bilinear chroma MC (8 bytes per row at 10-bit); h must be even.
cglobal %1_h264_chroma_mc4_10_%2, 6,6,7
    movsxdifnidn r2, r2d
    movd m2, r4m               ; x
    movd m3, r5m               ; y
    mova m4, [pw_8]
    mova m5, m4
    SPLATW m2, m2              ; m2 = x words
    SPLATW m3, m3              ; m3 = y words
    psubw m4, m2               ; m4 = 8-x
    psubw m5, m3               ; m5 = 8-y

    ; prime the two-row pipeline with row 0's horizontal term
    movq m0, [r1  ]
    movq m6, [r1+2]
    add r1, r2
    pmullw m0, m4
    pmullw m6, m2
    paddw m6, m0               ; m6 = (8-x)*src[0..3] + x*src[1..4]

.next2rows
    MC4_OP m0, m6              ; even row; fresh term lands in m0
    MC4_OP m6, m0              ; odd row; fresh term lands in m6
    sub r3d, 2                 ; two rows per iteration
    jnz .next2rows
    REP_RET
%endmacro
- +
- +;-----------------------------------------------------------------------------
- +; void put/avg_h264_chroma_mc2(pixel *dst, pixel *src, int stride, int h, int mx, int my)
- +;-----------------------------------------------------------------------------
%macro CHROMA_MC2 2
; void %1_h264_chroma_mc2_10_%2(pixel *dst, pixel *src,
;                               int stride, int h, int mx, int my)
; 2-pixel-wide chroma MC on MMX registers: the source row is shuffled to
; words [s0,s1,s1,s2] so a single pmaddwd produces both output pixels.
cglobal %1_h264_chroma_mc2_10_%2, 6,7
    movsxdifnidn r2, r2d
    mov r6d, r4d
    shl r4d, 16
    sub r4d, r6d
    add r4d, 8                 ; r4 = x<<16 | (8-x)
    imul r5d, r4d              ; x*y<<16 | y*(8-x)
    shl r4d, 3
    sub r4d, r5d               ; x*(8-y)<<16 | (8-x)*(8-y)

    movd m5, r4d               ; A,B bilinear coefficients
    movd m6, r5d               ; C,D bilinear coefficients
    punpckldq m5, m5           ; mm5 = {A,B,A,B}
    punpckldq m6, m6           ; mm6 = {C,D,C,D}
    movq m2, [r1]
    pxor m7, m7                ; zero for the final pack
    pshufw m2, m2, 0x94        ; mm2 = src[0,1,1,2]

.nextrow
    add r1, r2
    movq m1, m2
    pmaddwd m1, m5             ; mm1 = A * src[0,1] + B * src[1,2]
    movq m0, [r1]
    pshufw m0, m0, 0x94        ; mm0 = next row's src[0,1,1,2]
    movq m2, m0                ; keep it for the next iteration
    pmaddwd m0, m6
    ; word-wise add is safe here: all four coefficients sum to 64 and
    ; pixels are <= 1023, so each dword stays below 2^16
    paddw m1, [pw_32]
    paddw m1, m0               ; mm1 += C * src[0,1] + D * src[1,2]
    psrlw m1, 6
    packssdw m1, m7
    CHROMAMC_AVG4 m1, m3, [r0] ; m3 is scratch for the avg variant
    movd [r0], m1              ; store 2 pixels (4 bytes)
    add r0, r2
    sub r3d, 1
    jnz .nextrow
    REP_RET
%endmacro
- +
; Hooks plugged into CHROMAMC_AVG/CHROMAMC_AVG4 above: the "put" variants use
; NOTHING (store as-is); the "avg" variants average with the existing dst via
; PAVG (pavgw).  COPY_AVG takes an extra scratch register for a memory source.
%macro NOTHING 2-3
%endmacro
%macro DIRECT_AVG 2
    PAVG %1, %2
%endmacro
%macro COPY_AVG 3
    movq %2, %3
    PAVG %1, %2
%endmacro
- +
; Instantiate the put/avg variants.
; NOTE two fixes versus the draft:
;  - CHROMA_MC4/CHROMA_MC2 work on 8/4-byte rows and are installed for
;    MMX2-only CPUs, so they must be assembled with INIT_MMX; the draft's
;    INIT_XMM made the "_mmxext" mc4 functions contain SSE2 instructions
;    (movdqa via mova, pshuflw via SPLATW) -> SIGILL on pre-SSE2 CPUs.
;  - The avx variants were assembled without INIT_AVX, yielding code
;    byte-identical to the sse2 version (no VEX encoding) under an _avx name.
%define CHROMAMC_AVG  NOTHING
%define CHROMAMC_AVG4 NOTHING
INIT_XMM
CHROMA_MC8 put, sse2
%ifdef HAVE_AVX
INIT_AVX
CHROMA_MC8 put, avx
%endif
INIT_MMX
CHROMA_MC4 put, mmxext
CHROMA_MC2 put, mmxext

%define CHROMAMC_AVG  DIRECT_AVG
%define CHROMAMC_AVG4 COPY_AVG
%define PAVG          pavgw
INIT_XMM
CHROMA_MC8 avg, sse2
%ifdef HAVE_AVX
INIT_AVX
CHROMA_MC8 avg, avx
%endif
INIT_MMX
CHROMA_MC4 avg, mmxext
CHROMA_MC2 avg, mmxext
- --
- 1.7.5.1
- From 068d2421c97a6f34d47275fe193a55629051fee0 Mon Sep 17 00:00:00 2001
- From: Daniel Kang <daniel.d.kang@gmail.com>
- Date: Sun, 12 Jun 2011 18:06:56 -0400
- Subject: [PATCH 2/2] H.264: Add x86 assembly for 10-bit weight/biweight H.264
- functions.
- Mainly ported from 8-bit H.264 weight/biweight.
- ---
- libavcodec/x86/Makefile | 1 +
- libavcodec/x86/h264_weight_10bit.asm | 298 ++++++++++++++++++++++++++++++++++
- libavcodec/x86/h264dsp_mmx.c | 72 ++++++++
- 3 files changed, 371 insertions(+), 0 deletions(-)
- create mode 100644 libavcodec/x86/h264_weight_10bit.asm
- diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
- index ea57bd1..022ab27 100644
- --- a/libavcodec/x86/Makefile
- +++ b/libavcodec/x86/Makefile
- @@ -15,6 +15,7 @@ YASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \
- x86/h264_idct.o \
- x86/h264_idct_10bit.o \
- x86/h264_weight.o \
- + x86/h264_weight_10bit.o \
- YASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o \
- x86/h264_intrapred_10bit.o
- diff --git a/libavcodec/x86/h264_weight_10bit.asm b/libavcodec/x86/h264_weight_10bit.asm
- new file mode 100644
- index 0000000..ea6ed83
- --- /dev/null
- +++ b/libavcodec/x86/h264_weight_10bit.asm
- @@ -0,0 +1,298 @@
- +;*****************************************************************************
- +;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code
- +;*****************************************************************************
- +;* Copyright (C) 2005-2011 x264 project
- +;*
- +;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
- +;*
- +;* This file is part of Libav.
- +;*
- +;* Libav is free software; you can redistribute it and/or
- +;* modify it under the terms of the GNU Lesser General Public
- +;* License as published by the Free Software Foundation; either
- +;* version 2.1 of the License, or (at your option) any later version.
- +;*
- +;* Libav is distributed in the hope that it will be useful,
- +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
- +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- +;* Lesser General Public License for more details.
- +;*
- +;* You should have received a copy of the GNU Lesser General Public
- +;* License along with Libav; if not, write to the Free Software
- +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- +;******************************************************************************
- +
- +%include "x86inc.asm"
- +%include "x86util.asm"
- +
- +SECTION_RODATA 32
- +
- +pw_pixel_max: times 8 dw ((1 << 10)-1)
- +sq_1: dq 1
- + dq 0
- +
- +cextern pw_1
- +
- +SECTION .text
- +
- +;-----------------------------------------------------------------------------
- +; void h264_weight(uint8_t *dst, int stride, int log2_denom,
- +; int weight, int offset);
- +;-----------------------------------------------------------------------------
- +
%macro WEIGHT_SETUP 1
; Builds the per-call constants from (log2_denom=r2, weight=r3, offset=r4):
;   m0 = words of 1<<log2_denom        m2 = log2_denom+1 (psrad shift count)
;   m3 = dwords of ((offset<<3)+1)<<16 | (weight<<1)  -- pmaddwd multiplier
;   m4 = pixel max (clip upper bound)  m7 = 0 (clip lower bound, pre-sse4)
; Each dst word is later paired as (pixel, 1<<log2_denom), so
;   pmaddwd(m3) = 2*weight*pixel + ((offset<<3)+1)<<log2_denom
; and psrad by log2_denom+1 gives (weight*pixel >> log2_denom) + 4*offset,
; rounded (offset presumably pre-scaled for 10-bit by the caller -- confirm
; against the C init code).
    mova m0, [pw_1]
    movd m2, r2d
    pslld m0, m2               ; 1<<log2_denom
    SPLATW m0, m0
    shl r4, 19                 ; offset*8, moved into the upper word
    lea r4, [r4+r3*2+0x10000]  ; ((offset<<3)+1)<<16 | weight<<1
    movd m3, r4d
    pshufd m3, m3, 0
    mova m4, [pw_pixel_max]
    paddw m2, [sq_1]           ; log2_denom+1
%ifnidn %1, sse4
    pxor m7, m7                ; sse4 path clamps via packusdw instead
%endif
%endmacro
- +
%macro WEIGHT_OP 2-3
; Weight 8 pixels of dst in place (result left in m5, clipped to
; [0, pixel_max]).  2-arg form: one aligned 16-byte load at r0+%2.
; 3-arg form: two 8-byte loads at r0+%2 and r0+%3 (two 4-pixel rows packed
; into low/high halves).  Clobbers m5, m6.  Constants from WEIGHT_SETUP.
%if %0==2
    mova m5, [r0+%2]
    punpckhwd m6, m5, m0       ; (pixel, 1<<log2_denom) dword pairs, high half
    punpcklwd m5, m0           ; low half
%else
    movq m5, [r0+%2]
    movq m6, [r0+%3]
    punpcklwd m5, m0
    punpcklwd m6, m0
%endif
    pmaddwd m5, m3             ; 2*w*p + rounding/offset term (WEIGHT_SETUP)
    pmaddwd m6, m3
    psrad m5, m2               ; >> (log2_denom+1)
    psrad m6, m2
%ifidn %1, sse4
    packusdw m5, m6            ; unsigned-saturating pack clamps negatives
    pminsw m5, m4              ; then clip to pixel max
%else
    packssdw m5, m6
    CLIPW m5, m7, m4           ; clip to [0, pixel_max]
%endif
%endmacro
- +
; 16-wide weighted prediction.  16x8 tail-jumps into the shared 16x16 .body
; after loading a different row count.
; FIX: the draft set the counter in r5 ("mov r5, 16") but decremented r6,
; which is not even among the 6 allocated GPRs (cglobal ...,5,6,8) -- an
; infinite loop that also clobbers an unsaved register.  The counter is now
; r5 (the one free allocated scratch register) throughout, including 16x8.
%macro WEIGHT_FUNC_DBL 1
cglobal h264_weight_16x16_10_%1, 5,6,8
    mov r5, 16
.body
    WEIGHT_SETUP %1
.nextrow
    WEIGHT_OP %1,  0           ; pixels 0-7
    mova [r0   ], m5
    WEIGHT_OP %1, 16           ; pixels 8-15 (byte offset 16 at 10-bit)
    mova [r0+16], m5
    add r0, r1
    dec r5                     ; row counter (was: dec r6)
    jnz .nextrow
    REP_RET

cglobal h264_weight_16x8_10_%1, 5,6,8
    mov r5, 8                  ; must match the counter .body decrements
    jmp mangle(ff_h264_weight_16x16_10_%1.body)
%endmacro
- +
- +INIT_XMM
- +WEIGHT_FUNC_DBL sse2
- +WEIGHT_FUNC_DBL sse4
- +
- +
; 8-wide weighted prediction; 8x8 and 8x4 reuse the 8x16 .body.
; FIX: the draft loaded the counter into r6 (not among the 6 allocated GPRs)
; and decremented r2 -- r2 is log2_denom, so the loop never terminated and
; trashed an argument register.  The counter is now r5, the free allocated
; scratch register, in all three entry points.
%macro WEIGHT_FUNC_MM 1
cglobal h264_weight_8x16_10_%1, 5,6,8
    mov r5, 16
.body
    WEIGHT_SETUP %1
.nextrow
    WEIGHT_OP %1, 0
    mova [r0], m5
    add r0, r1
    dec r5                     ; row counter (was: dec r2 == log2_denom!)
    jnz .nextrow
    REP_RET

cglobal h264_weight_8x8_10_%1, 5,6,8
    mov r5, 8
    jmp mangle(ff_h264_weight_8x16_10_%1.body)

cglobal h264_weight_8x4_10_%1, 5,6,8
    mov r5, 4
    jmp mangle(ff_h264_weight_8x16_10_%1.body)
%endmacro
- +
- +INIT_XMM
- +WEIGHT_FUNC_MM sse2
- +WEIGHT_FUNC_MM sse4
- +
- +
; 4-wide weighted prediction, two rows per iteration (low/high xmm halves).
; FIX: the draft kept the iteration counter in r6, which lies outside the
; declared register allocation (cglobal ...,5,6,8 provides r0-r5 only), so
; r6 was used unsaved.  The counter is now r5, the free allocated scratch.
; r3 (log2_denom) is reloaded with 2*stride only AFTER WEIGHT_SETUP has
; consumed it, which is safe.
%macro WEIGHT_FUNC_HALF_MM 1
cglobal h264_weight_4x8_10_%1, 5,6,8
    mov r5, 4                  ; 8 rows, two per iteration
.body
    WEIGHT_SETUP %1
    lea r3, [r1*2]             ; r3 is dead after WEIGHT_SETUP
.nextrow
    WEIGHT_OP %1, 0, r1        ; low half = row 0, high half = row 1
    movh   [r0   ], m5
    movhps [r0+r1], m5
    add r0, r3
    dec r5                     ; row-pair counter (was: dec r6)
    jnz .nextrow
    REP_RET

cglobal h264_weight_4x4_10_%1, 5,6,8
    mov r5, 2
    jmp mangle(ff_h264_weight_4x8_10_%1.body)

cglobal h264_weight_4x2_10_%1, 5,6,8
    mov r5, 1
    jmp mangle(ff_h264_weight_4x8_10_%1.body)
%endmacro
- +
- +INIT_XMM
- +WEIGHT_FUNC_HALF_MM sse2
- +WEIGHT_FUNC_HALF_MM sse4
- +
- +
- +;-----------------------------------------------------------------------------
- +; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
- +; int weightd, int weights, int offset);
- +;-----------------------------------------------------------------------------
; Builds the bi-weight constants from (log2_denom=r3, weightd=r4,
; weights=r5, offset=r6):
;   m4 = dwords of weights<<16 | weightd   (pmaddwd multiplier)
;   m5 = ((offset<<2)+1) << log2_denom     (rounding + offset term)
;   m6 = log2_denom+1                      (psrad shift count)
;   m3 = pixel max, m7 = 0                 (clip bounds; m7 pre-sse4 only)
; FIX: the draft followed "lea r6, [r6*4+1]" with "or r6, 1" -- a no-op,
; since 4*offset+1 is always odd; it has been dropped and the stale
; "(offset+1)|1" comment corrected.
%macro BIWEIGHT_SETUP 1
    lea r6, [r6*4+1]           ; (offset<<2)+1, always odd
    shl r5, 16
    or  r4, r5
    movd m4, r4d               ; weights<<16 | weightd
    movd m5, r6d               ; (offset<<2)+1
    movd m6, r3d               ; log2_denom
    pslld m5, m6               ; ((offset<<2)+1)<<log2_denom
    paddd m6, [sq_1]           ; log2_denom+1
    pshufd m4, m4, 0
    pshufd m5, m5, 0
    mova m3, [pw_pixel_max]
%ifnidn %1, sse4
    pxor m7, m7                ; sse4 path clamps via packusdw instead
%endif
%endmacro
- +
%macro BIWEIGHT 2-3
; Bi-weight 8 pixels from dst (r0) and src (r1); result in m0, clipped to
; [0, pixel_max].  2-arg form: aligned 16-byte loads at offset %2.  3-arg
; form: two 8-byte rows at offsets %2 and %3, packed into low/high halves.
; (dst, src) words are interleaved into dword pairs so pmaddwd against m4
; computes dst*weightd + src*weights per pixel.  Clobbers m0-m2.
%if %0==2
    mova m0, [r0+%2]
    mova m1, [r1+%2]
    punpckhwd m2, m0, m1       ; high-half (dst, src) pairs
    punpcklwd m0, m1           ; low-half pairs
%else
    movq m0, [r0+%2]
    movq m1, [r1+%2]
    punpcklwd m0, m1
    movq m2, [r0+%3]
    movq m1, [r1+%3]
    punpcklwd m2, m1
%endif
    pmaddwd m0, m4             ; dst*weightd + src*weights
    pmaddwd m2, m4
    paddd m0, m5               ; + ((offset<<2)+1)<<log2_denom
    paddd m2, m5
    psrad m0, m6               ; >> (log2_denom+1)
    psrad m2, m6
%ifidn %1, sse4
    packusdw m0, m2            ; unsigned-saturating pack clamps negatives
    pminsw m0, m3              ; then clip to pixel max
%else
    packssdw m0, m2
    CLIPW m0, m7, m3           ; clip to [0, pixel_max]
%endif
%endmacro
- +
%macro BIWEIGHT_FUNC_DBL 1
; 16-wide bi-weight.  16x8 runs its own BIWEIGHT_SETUP, loads a smaller row
; count and jumps into the shared 16x16 loop body.
cglobal h264_biweight_16x16_10_%1, 7,7,8
    BIWEIGHT_SETUP %1
    mov r3, 16                 ; r3 (log2_denom) is free once SETUP has run
.nextrow
    BIWEIGHT %1,  0            ; pixels 0-7
    mova [r0   ], m0
    BIWEIGHT %1, 16            ; pixels 8-15 (byte offset 16 at 10-bit)
    mova [r0+16], m0
    add r0, r2
    add r1, r2
    dec r3
    jnz .nextrow
    REP_RET

cglobal h264_biweight_16x8_10_%1, 7,7,8
    BIWEIGHT_SETUP %1
    mov r3, 8
    jmp mangle(ff_h264_biweight_16x16_10_%1.nextrow)
%endmacro
- +
- +INIT_XMM
- +BIWEIGHT_FUNC_DBL sse2
- +BIWEIGHT_FUNC_DBL sse4
- +
%macro BIWEIGHT_FUNC 1
; 8-wide bi-weight; 8x8 and 8x4 reuse the 8x16 loop via .nextrow after
; running their own BIWEIGHT_SETUP and loading the right row count.
cglobal h264_biweight_8x16_10_%1, 7,7,8
    BIWEIGHT_SETUP %1
    mov r3, 16                 ; row counter (r3 is dead after SETUP)
.nextrow
    BIWEIGHT %1, 0
    mova [r0], m0
    add r0, r2
    add r1, r2
    dec r3
    jnz .nextrow
    REP_RET

cglobal h264_biweight_8x8_10_%1, 7,7,8
    BIWEIGHT_SETUP %1
    mov r3, 8
    jmp mangle(ff_h264_biweight_8x16_10_%1.nextrow)

cglobal h264_biweight_8x4_10_%1, 7,7,8
    BIWEIGHT_SETUP %1
    mov r3, 4
    jmp mangle(ff_h264_biweight_8x16_10_%1.nextrow)
%endmacro
- +
- +INIT_XMM
- +BIWEIGHT_FUNC sse2
- +BIWEIGHT_FUNC sse4
- +
; 4-wide bi-weight, two rows per iteration (low/high xmm halves).
; FIX: the draft set r3 = 2 for the 4x2 case, but the loop processes TWO
; rows per iteration, so 4x2 ran 4 rows and read/wrote two rows past the
; block.  4x2 must run exactly one iteration: r3 = 1.
%macro BIWEIGHT_FUNC_HALF 1
cglobal h264_biweight_4x8_10_%1, 7,7,8
    BIWEIGHT_SETUP %1
    mov r3, 4                  ; 8 rows, two per iteration
    lea r4, [r2*2]
.nextrow
    BIWEIGHT %1, 0, r2         ; low half = row 0, high half = row 1
    movh   [r0   ], m0
    movhps [r0+r2], m0
    add r0, r4
    add r1, r4
    dec r3
    jnz .nextrow
    REP_RET

cglobal h264_biweight_4x4_10_%1, 7,7,8
    BIWEIGHT_SETUP %1
    mov r3, 2                  ; 4 rows -> 2 iterations
    lea r4, [r2*2]
    jmp mangle(ff_h264_biweight_4x8_10_%1.nextrow)

cglobal h264_biweight_4x2_10_%1, 7,7,8
    BIWEIGHT_SETUP %1
    mov r3, 1                  ; was 2: 2 rows -> exactly ONE iteration
    lea r4, [r2*2]
    jmp mangle(ff_h264_biweight_4x8_10_%1.nextrow)
%endmacro
- +
- +INIT_XMM
- +BIWEIGHT_FUNC_HALF sse2
- +BIWEIGHT_FUNC_HALF sse4
- diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c
- index 3fccd08..d57314f 100644
- --- a/libavcodec/x86/h264dsp_mmx.c
- +++ b/libavcodec/x86/h264dsp_mmx.c
- @@ -326,6 +326,41 @@ H264_BIWEIGHT_MMX ( 4, 8)
- H264_BIWEIGHT_MMX ( 4, 4)
- H264_BIWEIGHT_MMX ( 4, 2)
- +#define H264_WEIGHT_10(W, H, DEPTH, OPT) \
- +void ff_h264_weight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
- + int stride, int log2_denom, int weight, int offset);
- +
- +#define H264_BIWEIGHT_10(W, H, DEPTH, OPT) \
- +void ff_h264_biweight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT \
- + (uint8_t *dst, uint8_t *src, int stride, int log2_denom, \
- + int weightd, int weights, int offset);
- +
- +#define H264_WEIGHT_10_SSE(W, H, DEPTH) \
- +H264_WEIGHT_10(W, H, DEPTH, sse2) \
- +H264_WEIGHT_10(W, H, DEPTH, sse4)
- +
- +#define H264_BIWEIGHT_10_SSE(W, H, DEPTH) \
- +H264_BIWEIGHT_10(W, H, DEPTH, sse2) \
- +H264_BIWEIGHT_10(W, H, DEPTH, sse4)
- +
- +H264_WEIGHT_10_SSE(16, 16, 10)
- +H264_WEIGHT_10_SSE(16, 8, 10)
- +H264_WEIGHT_10_SSE( 8, 16, 10)
- +H264_WEIGHT_10_SSE( 8, 8, 10)
- +H264_WEIGHT_10_SSE( 8, 4, 10)
- +H264_WEIGHT_10_SSE( 4, 8, 10)
- +H264_WEIGHT_10_SSE( 4, 4, 10)
- +H264_WEIGHT_10_SSE( 4, 2, 10)
- +
- +H264_BIWEIGHT_10_SSE(16, 16, 10)
- +H264_BIWEIGHT_10_SSE(16, 8, 10)
- +H264_BIWEIGHT_10_SSE( 8, 16, 10)
- +H264_BIWEIGHT_10_SSE( 8, 8, 10)
- +H264_BIWEIGHT_10_SSE( 8, 4, 10)
- +H264_BIWEIGHT_10_SSE( 4, 8, 10)
- +H264_BIWEIGHT_10_SSE( 4, 4, 10)
- +H264_BIWEIGHT_10_SSE( 4, 2, 10)
- +
- void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
- {
- int mm_flags = av_get_cpu_flags();
- @@ -454,6 +489,24 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
- c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2;
- #endif
- + c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_10_sse2;
- + c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_10_sse2;
- + c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_10_sse2;
- + c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_10_sse2;
- + c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_10_sse2;
- + c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_10_sse2;
- + c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_10_sse2;
- + c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_10_sse2;
- +
- + c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_10_sse2;
- + c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_10_sse2;
- + c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_10_sse2;
- + c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_10_sse2;
- + c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_10_sse2;
- + c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_10_sse2;
- + c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_10_sse2;
- + c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_10_sse2;
- +
- c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2;
- c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2;
- #if HAVE_ALIGNED_STACK
- @@ -463,6 +516,25 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
- c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
- #endif
- }
- + if (mm_flags&AV_CPU_FLAG_SSE4) {
- + c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_10_sse4;
- + c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_10_sse4;
- + c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_10_sse4;
- + c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_10_sse4;
- + c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_10_sse4;
- + c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_10_sse4;
- + c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_10_sse4;
- + c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_10_sse4;
- +
- + c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_10_sse4;
- + c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_10_sse4;
- + c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_10_sse4;
- + c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_10_sse4;
- + c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_10_sse4;
- + c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_10_sse4;
- + c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_10_sse4;
- + c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_10_sse4;
- + }
- #if HAVE_AVX
- if (mm_flags&AV_CPU_FLAG_AVX) {
- c->h264_idct_dc_add =
- --
- 1.7.5.1
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement