Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ;*****************************************************************************
;* MMX/SSE2-optimized H.264 qpel code
- ;*****************************************************************************
- ;* Copyright (C) 2011 Daniel Kang
- ;*
- ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
- ;*
- ;* This file is part of Libav.
- ;*
- ;* Libav is free software; you can redistribute it and/or
- ;* modify it under the terms of the GNU Lesser General Public
- ;* License as published by the Free Software Foundation; either
- ;* version 2.1 of the License, or (at your option) any later version.
- ;*
- ;* Libav is distributed in the hope that it will be useful,
- ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
- ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- ;* Lesser General Public License for more details.
- ;*
- ;* You should have received a copy of the GNU Lesser General Public
- ;* License along with Libav; if not, write to the Free Software
- ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- ;******************************************************************************
- %include "x86inc.asm"
- %include "x86util.asm"
- SECTION_RODATA 32
- cextern pw_16
- cextern pw_5
- SECTION .text
; AVG_MOV mem, reg
; "avg" store operation: byte-wise average reg with the pixels already at
; mem (pavgb computes (a+b+1)>>1) and write the result back through movh.
; NOTE(review): movh writes only 4 bytes under INIT_MMX and 8 under
; INIT_XMM, and pavgb with an XMM memory operand requires 16-byte
; alignment of mem -- correct for kernels producing <= movh-width rows
; into aligned dst; verify the wider mc00 avg cases below.
%macro AVG_MOV 2
pavgb %2, %1
movh %1, %2
%endmacro
; MC kernel-macro
; Instantiate %1 (an MC kernel macro taking: cpu-suffix, put/avg, width)
; in all four flavours: put/avg x mmxext (MMX regs, width 4) / sse2
; (XMM regs, width 8).  OP_MOV is the store operation each kernel uses
; for its result: a plain movh for "put", AVG_MOV for "avg".
%macro MC 1
%define OP_MOV movh
INIT_MMX
%1 mmxext, put, 4
INIT_XMM
%1 sse2 , put, 8
%define OP_MOV AVG_MOV
INIT_MMX
%1 mmxext, avg, 4
INIT_XMM
%1 sse2 , avg, 8
%endmacro
; MCAxA cpu, put/avg, mc-type, N, 2N, nargs, nregs, nxmm
; Conditionally emit the tiled 2Nx2N wrapper (MCAxA_OP).  On x86-64 the
; mmxext flavour is skipped, since SSE2 is always available there; on
; x86-32 every flavour is emitted.
%macro MCAxA 8
%ifndef ARCH_X86_64
MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
%elifnidn %1,mmxext
MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
%endif
%endmacro
; MCAxA_OP cpu, put/avg, mc-type, N, 2N, nargs, nregs, nxmm
; Build the 2Nx2N function out of four calls to the NxN stub, one per
; quadrant: top-left, top-right, bottom-left, bottom-right.  The stub is
; assumed to leave r2 (stride) intact.
; NOTE(review): the horizontal quadrant offset is %4*2 bytes, i.e. it
; assumes 2-byte (10-bit) pixels, but the kernels in this file load and
; store 8-bit pixels -- for those the offset should presumably be just
; %4 bytes.  Verify against the intended bit depth.
%macro MCAxA_OP 8
cglobal %2_h264_qpel%5_%3_%1, %6,%7,%8
%ifdef ARCH_X86_32
; x86-32: no spare registers to cache dst/src across the calls, so
; reload the original pointer arguments from their stack slots (r0m/r1m)
; before repositioning for each quadrant.
call stub_%2_h264_qpel%4_%3_%1
mov r0, r0m
mov r1, r1m
add r0, %4*2
add r1, %4*2
call stub_%2_h264_qpel%4_%3_%1
mov r0, r0m
mov r1, r1m
lea r0, [r0+r2*%4]
lea r1, [r1+r2*%4]
call stub_%2_h264_qpel%4_%3_%1
mov r0, r0m
mov r1, r1m
lea r0, [r0+r2*%4+%4*2]
lea r1, [r1+r2*%4+%4*2]
call stub_%2_h264_qpel%4_%3_%1
RET
%else ; ARCH_X86_64
; x86-64: keep copies of dst/src in r10/r11 (caller-saved scratch in the
; pre-AVX x86inc register numbering) instead of going via the stack.
mov r10, r0
mov r11, r1
call stub_%2_h264_qpel%4_%3_%1
lea r0, [r10+%4*2]
lea r1, [r11+%4*2]
call stub_%2_h264_qpel%4_%3_%1
lea r0, [r10+r2*%4]
lea r1, [r11+r2*%4]
call stub_%2_h264_qpel%4_%3_%1
lea r0, [r10+r2*%4+%4*2]
lea r1, [r11+r2*%4+%4*2]
%ifndef UNIX64 ; fall through to function
; On UNIX64 the final call is omitted: control falls through into the
; NxN public entry point that cglobal_mc emits immediately after this
; wrapper, which itself falls through into the stub -- saving a
; call/ret pair.
call stub_%2_h264_qpel%4_%3_%1
RET
%endif
%endif
%endmacro
;cpu, put/avg, mc, 4/8, ...
; cglobal_mc cpu, put/avg, mc-type, N, nargs, nregs, nxmm
; Emits (1) the 2Nx2N tiled wrapper via MCAxA, (2) the NxN public entry
; point, and (3) opens the stub_ label: the instructions that follow the
; macro invocation form the shared stub body.
%macro cglobal_mc 7
%assign i %4*2 ; 2N, used in the doubled function's name
MCAxA %1, %2, %3, %4, i, %5,%6,%7
cglobal %2_h264_qpel%4_%3_%1, %5,%6,%7
%ifndef UNIX64 ; no prologue or epilogue for UNIX64
; Other ABIs need cglobal's prologue/epilogue (e.g. WIN64 register
; saves), so the public symbol is a thin wrapper calling the stub.
call stub_%2_h264_qpel%4_%3_%1
RET
%endif
; On UNIX64 the public entry point simply falls through into the stub.
stub_%2_h264_qpel%4_%3_%1:
%endmacro
;-----------------------------------------------------------------------------
; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; COPY4 load-op
; Copy (or, for "avg", average into dst) four consecutive rows from
; src (r1) to dst (r0).  %1 is the load instruction (movh/movu); OP_MOV
; is the store operation.  Requires r3 == r2*3 (stride*3), set up by the
; caller.
%macro COPY4 1
%1 m0, [r1 ]
OP_MOV [r0 ], m0
%1 m0, [r1+r2 ]
OP_MOV [r0+r2 ], m0
%1 m0, [r1+r2*2]
OP_MOV [r0+r2*2], m0
%1 m0, [r1+r3 ]
OP_MOV [r0+r3 ], m0
%endmacro
; MC00 put/avg
; Full-pel (mc00) copy/average functions: qpel4 and qpel8 in MMX,
; qpel16 in SSE2.  Stores go through OP_MOV, which the instantiation
; site defines as MOV_OP (plain move, "put") or AVG_MOV ("avg").
; NOTE(review): for "avg", AVG_MOV stores through movh (4 bytes in MMX,
; 8 in SSE2), so the qpel8/qpel16 avg variants would update only part
; of each row, and SSE2 pavgb with a memory operand needs 16-byte
; alignment of dst; upstream splits avg into width-specific ops --
; verify before relying on the wide avg paths.
%macro MC00 1
INIT_MMX
%define MOV_OP movh ; 4-byte rows: movh is movd under INIT_MMX
cglobal %1_h264_qpel4_mc00_mmxext,3,4
lea r3, [r2*3 ] ; r3 = stride*3 for COPY4
COPY4 movh
RET
%define MOV_OP mova ; 8-byte rows: full MMX-register moves
cglobal %1_h264_qpel8_mc00_mmxext,3,4
lea r3, [r2*3 ]
COPY4 movu
lea r0, [r0+r2*4] ; advance to the lower four rows
lea r1, [r1+r2*4]
COPY4 movu
RET
INIT_XMM
; NOTE(review): MOV_OP is still mova here, so the qpel16 put stores are
; movdqa and assume 16-byte-aligned dst -- confirm callers guarantee it.
cglobal %1_h264_qpel16_mc00_sse2,3,5
lea r3, [r2*3 ]
mov r4d, 4 ; 4 iterations x 4 rows = 16 rows
.loop:
COPY4 movu
lea r0, [r0+r2*4]
lea r1, [r1+r2*4]
dec r4d
jg .loop
REP_RET
%endmacro
; Instantiate the full-pel copy functions.
INIT_MMX
%define OP_MOV MOV_OP ; put: store with the plain width-appropriate move
MC00 put
INIT_MMX
%define OP_MOV AVG_MOV ; avg: average with the pixels already at dst
MC00 avg
;-----------------------------------------------------------------------------
; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
;
; Horizontal half-pel MC: each output pixel is the H.264 6-tap
; (1,-5,20,20,-5,1) luma filter of src[x-2..x+3], biased by 16,
; shifted right by 5 and clipped to 8 bits.
; %1 = cpu suffix, %2 = put/avg, %3 = block width and height (4/8).
;-----------------------------------------------------------------------------
%macro MC20 3
cglobal_mc %1, %2, mc20, %3, 3,4,9
mov r3d, %3 ; r3d = remaining rows
pxor m7, m7 ; constant zero for byte->word unpacking
mova m4, [pw_5] ; filter factor
mova m5, [pw_16] ; rounding bias, 1 << (5-1)
.nextrow: ; colon added: silences NASM/yasm orphan-label warning,
          ; consistent with .loop: above
; words: m1 = src[-1]+src[2] (taps -5), m2 = src[0]+src[1] (taps 20)
movh m1, [r1-1]
movh m2, [r1+0]
movh m3, [r1+1]
movh m0, [r1+2]
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m0, m7
paddw m1, m0
paddw m2, m3
; m0 = src[-2]+src[3] (outermost taps, weight 1)
movh m0, [r1-2]
movh m3, [r1+3]
punpcklbw m0, m7
punpcklbw m3, m7
paddw m0, m3
; m2 = 5*(4*(src[0]+src[1]) - (src[-1]+src[2])) = 20*(b+c) - 5*(a+d)
psllw m2, 2
psubw m2, m1
pmullw m2, m4
; m0 = full filter sum + 16, then >>5 and clip to unsigned bytes
paddw m0, m5
paddw m0, m2
psraw m0, 5
packuswb m0, m0
OP_MOV [r0], m0 ; put: plain store / avg: average with dst
add r0, r2
add r1, r2
dec r3d
jg .nextrow
rep ret ; rep prefix: avoids AMD ret-directly-after-branch penalty
%endmacro
MC MC20
Add Comment
Please, Sign In to add comment