Not a member of Pastebin yet?
                        Sign Up,
                        it unlocks many cool features!                    
                - ;/*
 - ; * Provide SSE luma mc functions for HEVC decoding
 - ; * Copyright (c) 2013 Pierre-Edouard LEPERE
 - ; *
 - ; * This file is part of Libav.
 - ; *
 - ; * Libav is free software; you can redistribute it and/or
 - ; * modify it under the terms of the GNU Lesser General Public
 - ; * License as published by the Free Software Foundation; either
 - ; * version 2.1 of the License, or (at your option) any later version.
 - ; *
 - ; * Libav is distributed in the hope that it will be useful,
 - ; * but WITHOUT ANY WARRANTY; without even the implied warranty of
 - ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 - ; * Lesser General Public License for more details.
 - ; *
 - ; * You should have received a copy of the GNU Lesser General Public
 - ; * License along with Libav; if not, write to the Free Software
 - ; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 - ; */
 - ;%include "libavutil/x86/x86inc.asm"
 - %include "libavutil/x86/x86util.asm"
 - SECTION_RODATA
 - SECTION .text
 - INIT_XMM sse4 ; adds ff_ and _sse4 to function name
 - ;******************************
 - ;void put_hevc_mc_pixels_8(int16_t *dst, ptrdiff_t dststride,
 - ; uint8_t *_src, ptrdiff_t _srcstride,
 - ; int width, int height, int mx, int my,
 - ; int16_t* mcbuffer)
 - ;
 - ; r0 : *dst
 - ; r1 : dststride
 - ; r2 : *src
 - ; r3 : srcstride
 - ; r4 : width
 - ; r5 : height
 - ;
 - ;******************************
 ;-----------------------------------------------------------------------
 ; put_hevc_mc_pixels_2_8 -- scalar path, one pixel per inner iteration.
 ; Uses general-purpose registers only, so it needs no SIMD support.
 ;
 ; For every row y in [0, height) and column x in [0, width):
 ;     dst[x] = src[x] << 6      ; 8-bit sample widened to a 16-bit word
 ; After each row: dst += 2*dststride bytes, src += srcstride bytes.
 ;
 ; In:    r0 = dst, r1 = dststride, r2 = src, r3 = srcstride,
 ;        r4d = width, r5d = height
 ; Temps: r6 = row counter, r7 = column counter, r9 = current sample
 ;
 ; Both loops are bottom-tested: width and height must be >= 1.
 ; width/height are C ints, so only the low 32 bits of r4/r5 are defined
 ; on x86-64; the loop bounds are therefore compared as 32-bit values.
 ;
 ; NOTE(review): only six arguments and three temporaries are used; the
 ; "9, 12" counts below look larger than necessary -- confirm and tighten.
 ;-----------------------------------------------------------------------
 cglobal put_hevc_mc_pixels_2_8, 9, 12
     xor     r6d, r6d                    ; y = 0 (32-bit write clears upper half)
 .loop_h:
     xor     r7d, r7d                    ; x = 0
 .loop_w:
     movzx   r9d, byte [r2+r7]           ; r9 = src[x], zero-extended
     shl     r9d, 6                      ; scale sample by 64
     mov     [r0+2*r7], r9w              ; dst[x] = 16-bit result
     inc     r7d
     cmp     r7d, r4d                    ; x < width ?
     jl      .loop_w
     lea     r0, [r0+2*r1]               ; dst += dststride (16-bit elements)
     add     r2, r3                      ; src += srcstride
     inc     r6d
     cmp     r6d, r5d                    ; y < height ?
     jl      .loop_h
     RET
 ;-----------------------------------------------------------------------
 ; put_hevc_mc_pixels_4_8 -- four pixels per inner iteration.
 ;
 ; For every row, in steps of 4 columns while x < width:
 ;     dst[x..x+3] = src[x..x+3] << 6   ; bytes widened to 16-bit words
 ; After each row: dst += 2*dststride bytes, src += srcstride bytes.
 ; A width that is not a multiple of 4 is effectively rounded up.
 ;
 ; In:    r0 = dst, r1 = dststride, r2 = src, r3 = srcstride,
 ;        r4d = width, r5d = height
 ; Temps: r6 = row counter, r7 = column counter, xmm0 = zero, xmm1 = data
 ;
 ; Both loops are bottom-tested: width and height must be >= 1.
 ; width/height are C ints, so only the low 32 bits of r4/r5 are defined
 ; on x86-64; the loop bounds are therefore compared as 32-bit values.
 ;
 ; NOTE(review): only six arguments and two temporaries are used; the
 ; "9, 12" counts below look larger than necessary -- confirm and tighten.
 ;-----------------------------------------------------------------------
 cglobal put_hevc_mc_pixels_4_8, 9, 12
     pxor        xmm0, xmm0              ; zero source for the byte->word unpack
     xor         r6d, r6d                ; y = 0
 .loop_h:
     xor         r7d, r7d                ; x = 0
 .loop_w:
     movd        xmm1, [r2+r7]           ; load exactly 4 source bytes (no over-read)
     punpcklbw   xmm1, xmm0              ; interleave with zero: 4 bytes -> 4 words
     psllw       xmm1, 6                 ; shift left 6 bits (14 - bit depth)
     movq        [r0+2*r7], xmm1         ; store 4 x 16-bit results (64 bits)
     add         r7d, 4
     cmp         r7d, r4d                ; x < width ?
     jl          .loop_w
     lea         r0, [r0+2*r1]           ; dst += dststride (16-bit elements)
     add         r2, r3                  ; src += srcstride
     inc         r6d
     cmp         r6d, r5d                ; y < height ?
     jl          .loop_h
     RET
 ;-----------------------------------------------------------------------
 ; put_hevc_mc_pixels_8_8 -- eight pixels per inner iteration.
 ;
 ; For every row, in steps of 8 columns while x < width:
 ;     dst[x..x+7] = src[x..x+7] << 6   ; bytes widened to 16-bit words
 ; After each row: dst += 2*dststride bytes, src += srcstride bytes.
 ; A width that is not a multiple of 8 is effectively rounded up.
 ;
 ; In:    r0 = dst, r1 = dststride, r2 = src, r3 = srcstride,
 ;        r4d = width, r5d = height
 ; Temps: r6 = row counter, r7 = column counter, xmm0 = zero, xmm1 = data
 ;
 ; Both loops are bottom-tested: width and height must be >= 1.
 ; width/height are C ints, so only the low 32 bits of r4/r5 are defined
 ; on x86-64; the loop bounds are therefore compared as 32-bit values.
 ;
 ; NOTE(review): only six arguments and two temporaries are used; the
 ; "9, 12" counts below look larger than necessary -- confirm and tighten.
 ;-----------------------------------------------------------------------
 cglobal put_hevc_mc_pixels_8_8, 9, 12
     pxor        xmm0, xmm0              ; zero source for the byte->word unpack
     xor         r6d, r6d                ; y = 0
 .loop_h:
     xor         r7d, r7d                ; x = 0
 .loop_w:
     movq        xmm1, [r2+r7]           ; load 8 source bytes (upper half zeroed)
     punpcklbw   xmm1, xmm0              ; interleave with zero: 8 bytes -> 8 words
     psllw       xmm1, 6                 ; shift left 6 bits (14 - bit depth)
     movdqu      [r0+2*r7], xmm1         ; store 8 x 16-bit results, unaligned
     add         r7d, 8
     cmp         r7d, r4d                ; x < width ?
     jl          .loop_w
     lea         r0, [r0+2*r1]           ; dst += dststride (16-bit elements)
     add         r2, r3                  ; src += srcstride
     inc         r6d
     cmp         r6d, r5d                ; y < height ?
     jl          .loop_h
     RET
 ;-----------------------------------------------------------------------
 ; put_hevc_mc_pixels_16_8 -- sixteen pixels per inner iteration.
 ;
 ; For every row, in steps of 16 columns while x < width:
 ;     dst[x..x+15] = src[x..x+15] << 6  ; bytes widened to 16-bit words
 ; After each row: dst += 2*dststride bytes, src += srcstride bytes.
 ; A width that is not a multiple of 16 is effectively rounded up.
 ;
 ; In:    r0 = dst, r1 = dststride, r2 = src, r3 = srcstride,
 ;        r4d = width, r5d = height
 ; Temps: r6 = row counter, r7 = column counter,
 ;        xmm0 = zero, xmm1 = low 8 samples, xmm2 = high 8 samples
 ;
 ; Both loops are bottom-tested: width and height must be >= 1.
 ; width/height are C ints, so only the low 32 bits of r4/r5 are defined
 ; on x86-64; the loop bounds are therefore compared as 32-bit values.
 ;
 ; NOTE(review): only six arguments and two temporaries are used; the
 ; "9, 12" counts below look larger than necessary -- confirm and tighten.
 ;-----------------------------------------------------------------------
 cglobal put_hevc_mc_pixels_16_8, 9, 12
     pxor        xmm0, xmm0              ; zero source for the byte->word unpack
     xor         r6d, r6d                ; y = 0
 .loop_h:
     xor         r7d, r7d                ; x = 0
 .loop_w:
     movdqu      xmm1, [r2+r7]           ; load 16 source bytes, unaligned
     movdqa      xmm2, xmm1              ; keep a copy for the high half
     punpcklbw   xmm1, xmm0              ; low  8 bytes -> 8 words
     punpckhbw   xmm2, xmm0              ; high 8 bytes -> 8 words
     psllw       xmm1, 6                 ; shift left 6 bits (14 - bit depth)
     psllw       xmm2, 6
     movdqu      [r0+2*r7], xmm1         ; store results for x .. x+7
     movdqu      [r0+2*r7+16], xmm2      ; store results for x+8 .. x+15
     add         r7d, 16                 ; advance 16 columns
     cmp         r7d, r4d                ; x < width ?
     jl          .loop_w
     lea         r0, [r0+2*r1]           ; dst += dststride (16-bit elements)
     add         r2, r3                  ; src += srcstride
     inc         r6d
     cmp         r6d, r5d                ; y < height ?
     jl          .loop_h
     RET
 
Advertisement
 
                    Add Comment                
                
                        Please, Sign In to add comment