Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ;/*
- ; * Provide SSE luma mc functions for HEVC decoding
- ; * Copyright (c) 2013 Pierre-Edouard LEPERE
- ; *
- ; * This file is part of Libav.
- ; *
- ; * Libav is free software; you can redistribute it and/or
- ; * modify it under the terms of the GNU Lesser General Public
- ; * License as published by the Free Software Foundation; either
- ; * version 2.1 of the License, or (at your option) any later version.
- ; *
- ; * Libav is distributed in the hope that it will be useful,
- ; * but WITHOUT ANY WARRANTY; without even the implied warranty of
- ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- ; * Lesser General Public License for more details.
- ; *
- ; * You should have received a copy of the GNU Lesser General Public
- ; * License along with Libav; if not, write to the Free Software
- ; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- ; */
- ;%include "libavutil/x86/x86inc.asm"
- %include "libavutil/x86/x86util.asm"
- SECTION_RODATA
- SECTION .text
- INIT_XMM sse4 ; adds ff_ and _sse4 to function name
- ;******************************
- ;void put_hevc_mc_pixels_8(int16_t *dst, ptrdiff_t dststride,
- ; uint8_t *_src, ptrdiff_t _srcstride,
- ; int width, int height, int mx, int my,
- ; int16_t* mcbuffer)
- ;
- ; r0 : *dst
- ; r1 : dststride
- ; r2 : *src
- ; r3 : srcstride
- ; r4 : width
- ; r5 : height
- ;
- ;******************************
;-----------------------------------------------------------------------
; put_hevc_mc_pixels_2_8 -- scalar path, one pixel per inner iteration.
; Uses no SIMD instructions, so it can run on any processor.
;
; For each row y in [0, height) and each column x in [0, width):
;     dst[x] = src[x] << 6      ; 8-bit sample widened to 16 bits
; After each row: dst += dststride (int16_t elements),
;                 src += srcstride (bytes).
;
; In:    r0  = dst         r1  = dststride
;        r2  = src         r3  = srcstride
;        r4d = width       r5d = height
; Temps: r6d = y, r7 = x, r8 = current sample
;
; width and height are C ints, so only their low 32 bits are compared;
; the upper halves of r4/r5 are not guaranteed to be defined.
; Both loops test at the bottom: width and height must be >= 1.
; Only the six arguments actually used are requested from cglobal.
;-----------------------------------------------------------------------
cglobal put_hevc_mc_pixels_2_8, 6, 9
    xor         r6d, r6d                ; y = 0
.loop_h:
    xor         r7d, r7d                ; x = 0 (32-bit write clears all of r7)
.loop_w:
    movzx       r8d, byte [r2+r7]       ; zero-extended src[x]
    shl         r8d, 6                  ; scale to the 14-bit intermediate
    mov         [r0+2*r7], r8w          ; dst[x]
    inc         r7d
    cmp         r7d, r4d                ; x < width ?
    jl          .loop_w
    lea         r0, [r0+2*r1]           ; dst += dststride (16-bit elements)
    add         r2, r3                  ; src += srcstride
    inc         r6d
    cmp         r6d, r5d                ; y < height ?
    jl          .loop_h
    RET
;-----------------------------------------------------------------------
; put_hevc_mc_pixels_4_8 -- four pixels per inner iteration.
;
; For each row, in steps of 4 columns while x < width:
;     dst[x .. x+3] = src[x .. x+3] << 6   ; bytes widened to 16-bit words
; After each row: dst += dststride (int16_t elements),
;                 src += srcstride (bytes).
;
; In:    r0  = dst         r1  = dststride
;        r2  = src         r3  = srcstride
;        r4d = width       r5d = height
; Temps: r6d = y, r7 = x, xmm0 = zero, xmm1 = working vector
;
; width and height are C ints, so only their low 32 bits are compared.
; Both loops test at the bottom: width and height must be >= 1, and a
; width that is not a multiple of 4 is rounded up to the next multiple.
; Exactly 4 source bytes are read per iteration (movd), so the load never
; reaches past the pixels that are actually converted.
;-----------------------------------------------------------------------
cglobal put_hevc_mc_pixels_4_8, 6, 8, 2
    pxor        xmm0, xmm0              ; zero bytes to interleave with samples
    xor         r6d, r6d                ; y = 0
.loop_h:
    xor         r7d, r7d                ; x = 0 (32-bit write clears all of r7)
.loop_w:
    movd        xmm1, [r2+r7]           ; 4 source bytes, upper lanes zeroed
    punpcklbw   xmm1, xmm0              ; bytes -> zero-extended words
    psllw       xmm1, 6                 ; << (14 - bit depth)
    movq        [r0+2*r7], xmm1         ; store 4 words (64 bits)
    add         r7d, 4
    cmp         r7d, r4d                ; x < width ?
    jl          .loop_w
    lea         r0, [r0+2*r1]           ; dst += dststride (16-bit elements)
    add         r2, r3                  ; src += srcstride
    inc         r6d
    cmp         r6d, r5d                ; y < height ?
    jl          .loop_h
    RET
;-----------------------------------------------------------------------
; put_hevc_mc_pixels_8_8 -- eight pixels per inner iteration.
;
; For each row, in steps of 8 columns while x < width:
;     dst[x .. x+7] = src[x .. x+7] << 6   ; bytes widened to 16-bit words
; After each row: dst += dststride (int16_t elements),
;                 src += srcstride (bytes).
;
; In:    r0  = dst         r1  = dststride
;        r2  = src         r3  = srcstride
;        r4d = width       r5d = height
; Temps: r6d = y, r7 = x, xmm0 = zero, xmm1 = working vector
;
; width and height are C ints, so only their low 32 bits are compared.
; Both loops test at the bottom: width and height must be >= 1, and a
; width that is not a multiple of 8 is rounded up to the next multiple.
; The store is unaligned (movdqu); no alignment of dst is assumed.
;-----------------------------------------------------------------------
cglobal put_hevc_mc_pixels_8_8, 6, 8, 2
    pxor        xmm0, xmm0              ; zero bytes to interleave with samples
    xor         r6d, r6d                ; y = 0
.loop_h:
    xor         r7d, r7d                ; x = 0 (32-bit write clears all of r7)
.loop_w:
    movq        xmm1, [r2+r7]           ; 8 source bytes, upper half zeroed
    punpcklbw   xmm1, xmm0              ; bytes -> zero-extended words
    psllw       xmm1, 6                 ; << (14 - bit depth)
    movdqu      [r0+2*r7], xmm1         ; store 8 words (128 bits)
    add         r7d, 8
    cmp         r7d, r4d                ; x < width ?
    jl          .loop_w
    lea         r0, [r0+2*r1]           ; dst += dststride (16-bit elements)
    add         r2, r3                  ; src += srcstride
    inc         r6d
    cmp         r6d, r5d                ; y < height ?
    jl          .loop_h
    RET
;-----------------------------------------------------------------------
; put_hevc_mc_pixels_16_8 -- sixteen pixels per inner iteration.
;
; For each row, in steps of 16 columns while x < width:
;     dst[x .. x+15] = src[x .. x+15] << 6  ; bytes widened to 16-bit words
; After each row: dst += dststride (int16_t elements),
;                 src += srcstride (bytes).
;
; In:    r0  = dst         r1  = dststride
;        r2  = src         r3  = srcstride
;        r4d = width       r5d = height
; Temps: r6d = y, r7 = x, xmm0 = zero,
;        xmm1 = source bytes / low 8 words, xmm2 = high 8 words
;
; width and height are C ints, so only their low 32 bits are compared.
; Both loops test at the bottom: width and height must be >= 1, and a
; width that is not a multiple of 16 is rounded up to the next multiple.
; Loads and stores are unaligned (movdqu); no alignment is assumed.
;-----------------------------------------------------------------------
cglobal put_hevc_mc_pixels_16_8, 6, 8, 3
    pxor        xmm0, xmm0              ; zero bytes to interleave with samples
    xor         r6d, r6d                ; y = 0
.loop_h:
    xor         r7d, r7d                ; x = 0 (32-bit write clears all of r7)
.loop_w:
    movdqu      xmm1, [r2+r7]           ; 16 source bytes
    movdqa      xmm2, xmm1              ; keep a copy for the upper half
    punpcklbw   xmm1, xmm0              ; bytes 0..7  -> words
    punpckhbw   xmm2, xmm0              ; bytes 8..15 -> words
    psllw       xmm1, 6                 ; << (14 - bit depth)
    psllw       xmm2, 6
    movdqu      [r0+2*r7], xmm1         ; dst[x .. x+7]
    movdqu      [r0+2*r7+16], xmm2      ; dst[x+8 .. x+15]
    add         r7d, 16                 ; advance 16 pixels
    cmp         r7d, r4d                ; x < width ?
    jl          .loop_w
    lea         r0, [r0+2*r1]           ; dst += dststride (16-bit elements)
    add         r2, r3                  ; src += srcstride
    inc         r6d
    cmp         r6d, r5d                ; y < height ?
    jl          .loop_h
    RET
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement