Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ;*****************************************************************************
;* MMX/SSE2-optimized H.264 qpel code
- ;*****************************************************************************
- ;* Copyright (C) 2011 Daniel Kang
- ;*
- ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
- ;*
- ;* This file is part of Libav.
- ;*
- ;* Libav is free software; you can redistribute it and/or
- ;* modify it under the terms of the GNU Lesser General Public
- ;* License as published by the Free Software Foundation; either
- ;* version 2.1 of the License, or (at your option) any later version.
- ;*
- ;* Libav is distributed in the hope that it will be useful,
- ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
- ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- ;* Lesser General Public License for more details.
- ;*
- ;* You should have received a copy of the GNU Lesser General Public
- ;* License along with Libav; if not, write to the Free Software
- ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- ;******************************************************************************
- %include "x86inc.asm"
- %include "x86util.asm"
- SECTION_RODATA 32
- cextern pw_16
- cextern pw_5
- SECTION .text
; AVG_MOV mem, reg
; "avg" store operation: byte-wise average reg with the pixels already at
; mem (pavgb computes (a+b+1)>>1) and write the result back through movh.
; NOTE(review): movh writes only 4 bytes under INIT_MMX and 8 under
; INIT_XMM, and pavgb with an XMM memory operand requires 16-byte
; alignment of mem -- correct for kernels producing <= movh-width rows
; into aligned dst; verify the wider mc00 avg cases below.
%macro AVG_MOV 2
pavgb %2, %1
movh %1, %2
%endmacro
; MC kernel-macro
; Instantiate %1 (an MC kernel macro taking: cpu-suffix, put/avg, width)
; in all four flavours: put/avg x mmxext (MMX regs, width 4) / sse2
; (XMM regs, width 8).  OP_MOV is the store operation each kernel uses
; for its result: a plain movh for "put", AVG_MOV for "avg".
%macro MC 1
%define OP_MOV movh
INIT_MMX
%1 mmxext, put, 4
INIT_XMM
%1 sse2 , put, 8
%define OP_MOV AVG_MOV
INIT_MMX
%1 mmxext, avg, 4
INIT_XMM
%1 sse2 , avg, 8
%endmacro
; MCAxA cpu, put/avg, mc-type, N, 2N, nargs, nregs, nxmm
; Conditionally emit the tiled 2Nx2N wrapper (MCAxA_OP).  On x86-64 the
; mmxext flavour is skipped, since SSE2 is always available there; on
; x86-32 every flavour is emitted.
%macro MCAxA 8
%ifndef ARCH_X86_64
MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
%elifnidn %1,mmxext
MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
%endif
%endmacro
; MCAxA_OP cpu, put/avg, mc-type, N, 2N, nargs, nregs, nxmm
; Build the 2Nx2N function out of four calls to the NxN stub, one per
; quadrant: top-left, top-right, bottom-left, bottom-right.  The stub is
; assumed to leave r2 (stride) intact.
; NOTE(review): the horizontal quadrant offset is %4*2 bytes, i.e. it
; assumes 2-byte (10-bit) pixels, but the kernels in this file load and
; store 8-bit pixels -- for those the offset should presumably be just
; %4 bytes.  Verify against the intended bit depth.
%macro MCAxA_OP 8
cglobal %2_h264_qpel%5_%3_%1, %6,%7,%8
%ifdef ARCH_X86_32
; x86-32: no spare registers to cache dst/src across the calls, so
; reload the original pointer arguments from their stack slots (r0m/r1m)
; before repositioning for each quadrant.
call stub_%2_h264_qpel%4_%3_%1
mov r0, r0m
mov r1, r1m
add r0, %4*2
add r1, %4*2
call stub_%2_h264_qpel%4_%3_%1
mov r0, r0m
mov r1, r1m
lea r0, [r0+r2*%4]
lea r1, [r1+r2*%4]
call stub_%2_h264_qpel%4_%3_%1
mov r0, r0m
mov r1, r1m
lea r0, [r0+r2*%4+%4*2]
lea r1, [r1+r2*%4+%4*2]
call stub_%2_h264_qpel%4_%3_%1
RET
%else ; ARCH_X86_64
; x86-64: keep copies of dst/src in r10/r11 (caller-saved scratch in the
; pre-AVX x86inc register numbering) instead of going via the stack.
mov r10, r0
mov r11, r1
call stub_%2_h264_qpel%4_%3_%1
lea r0, [r10+%4*2]
lea r1, [r11+%4*2]
call stub_%2_h264_qpel%4_%3_%1
lea r0, [r10+r2*%4]
lea r1, [r11+r2*%4]
call stub_%2_h264_qpel%4_%3_%1
lea r0, [r10+r2*%4+%4*2]
lea r1, [r11+r2*%4+%4*2]
%ifndef UNIX64 ; fall through to function
; On UNIX64 the final call is omitted: control falls through into the
; NxN public entry point that cglobal_mc emits immediately after this
; wrapper, which itself falls through into the stub -- saving a
; call/ret pair.
call stub_%2_h264_qpel%4_%3_%1
RET
%endif
%endif
%endmacro
;cpu, put/avg, mc, 4/8, ...
; cglobal_mc cpu, put/avg, mc-type, N, nargs, nregs, nxmm
; Emits (1) the 2Nx2N tiled wrapper via MCAxA, (2) the NxN public entry
; point, and (3) opens the stub_ label: the instructions that follow the
; macro invocation form the shared stub body.
%macro cglobal_mc 7
%assign i %4*2 ; 2N, used in the doubled function's name
MCAxA %1, %2, %3, %4, i, %5,%6,%7
cglobal %2_h264_qpel%4_%3_%1, %5,%6,%7
%ifndef UNIX64 ; no prologue or epilogue for UNIX64
; Other ABIs need cglobal's prologue/epilogue (e.g. WIN64 register
; saves), so the public symbol is a thin wrapper calling the stub.
call stub_%2_h264_qpel%4_%3_%1
RET
%endif
; On UNIX64 the public entry point simply falls through into the stub.
stub_%2_h264_qpel%4_%3_%1:
%endmacro
;-----------------------------------------------------------------------------
; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; COPY4 load-op
; Copy (or, for "avg", average into dst) four consecutive rows from
; src (r1) to dst (r0).  %1 is the load instruction (movh/movu); OP_MOV
; is the store operation.  Requires r3 == r2*3 (stride*3), set up by the
; caller.
%macro COPY4 1
%1 m0, [r1 ]
OP_MOV [r0 ], m0
%1 m0, [r1+r2 ]
OP_MOV [r0+r2 ], m0
%1 m0, [r1+r2*2]
OP_MOV [r0+r2*2], m0
%1 m0, [r1+r3 ]
OP_MOV [r0+r3 ], m0
%endmacro
; MC00 put/avg
; Full-pel (mc00) copy/average functions: qpel4 and qpel8 in MMX,
; qpel16 in SSE2.  Stores go through OP_MOV, which the instantiation
; site defines as MOV_OP (plain move, "put") or AVG_MOV ("avg").
; NOTE(review): for "avg", AVG_MOV stores through movh (4 bytes in MMX,
; 8 in SSE2), so the qpel8/qpel16 avg variants would update only part
; of each row, and SSE2 pavgb with a memory operand needs 16-byte
; alignment of dst; upstream splits avg into width-specific ops --
; verify before relying on the wide avg paths.
%macro MC00 1
INIT_MMX
%define MOV_OP movh ; 4-byte rows: movh is movd under INIT_MMX
cglobal %1_h264_qpel4_mc00_mmxext,3,4
lea r3, [r2*3 ] ; r3 = stride*3 for COPY4
COPY4 movh
RET
%define MOV_OP mova ; 8-byte rows: full MMX-register moves
cglobal %1_h264_qpel8_mc00_mmxext,3,4
lea r3, [r2*3 ]
COPY4 movu
lea r0, [r0+r2*4] ; advance to the lower four rows
lea r1, [r1+r2*4]
COPY4 movu
RET
INIT_XMM
; NOTE(review): MOV_OP is still mova here, so the qpel16 put stores are
; movdqa and assume 16-byte-aligned dst -- confirm callers guarantee it.
cglobal %1_h264_qpel16_mc00_sse2,3,5
lea r3, [r2*3 ]
mov r4d, 4 ; 4 iterations x 4 rows = 16 rows
.loop:
COPY4 movu
lea r0, [r0+r2*4]
lea r1, [r1+r2*4]
dec r4d
jg .loop
REP_RET
%endmacro
; Instantiate the full-pel copy functions.
INIT_MMX
%define OP_MOV MOV_OP ; put: store with the plain width-appropriate move
MC00 put
INIT_MMX
%define OP_MOV AVG_MOV ; avg: average with the pixels already at dst
MC00 avg
;-----------------------------------------------------------------------------
; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
;
; Horizontal half-pel MC: each output pixel is the H.264 6-tap
; (1,-5,20,20,-5,1) luma filter of src[x-2..x+3], biased by 16,
; shifted right by 5 and clipped to 8 bits.
; %1 = cpu suffix, %2 = put/avg, %3 = block width and height (4/8).
;-----------------------------------------------------------------------------
%macro MC20 3
cglobal_mc %1, %2, mc20, %3, 3,4,9
mov r3d, %3 ; r3d = remaining rows
pxor m7, m7 ; constant zero for byte->word unpacking
mova m4, [pw_5] ; filter factor
mova m5, [pw_16] ; rounding bias, 1 << (5-1)
.nextrow: ; colon added: silences NASM/yasm orphan-label warning,
          ; consistent with .loop: above
; words: m1 = src[-1]+src[2] (taps -5), m2 = src[0]+src[1] (taps 20)
movh m1, [r1-1]
movh m2, [r1+0]
movh m3, [r1+1]
movh m0, [r1+2]
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m0, m7
paddw m1, m0
paddw m2, m3
; m0 = src[-2]+src[3] (outermost taps, weight 1)
movh m0, [r1-2]
movh m3, [r1+3]
punpcklbw m0, m7
punpcklbw m3, m7
paddw m0, m3
; m2 = 5*(4*(src[0]+src[1]) - (src[-1]+src[2])) = 20*(b+c) - 5*(a+d)
psllw m2, 2
psubw m2, m1
pmullw m2, m4
; m0 = full filter sum + 16, then >>5 and clip to unsigned bytes
paddw m0, m5
paddw m0, m2
psraw m0, 5
packuswb m0, m0
OP_MOV [r0], m0 ; put: plain store / avg: average with dst
add r0, r2
add r1, r2
dec r3d
jg .nextrow
rep ret ; rep prefix: avoids AMD ret-directly-after-branch penalty
%endmacro
MC MC20
Add Comment
Please, Sign In to add comment