Advertisement
Guest User

Untitled

a guest
Nov 18th, 2013
81
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. ;/*
  2. ; * Provide SSE luma mc functions for HEVC decoding
  3. ; * Copyright (c) 2013 Pierre-Edouard LEPERE
  4. ; *
  5. ; * This file is part of Libav.
  6. ; *
  7. ; * Libav is free software; you can redistribute it and/or
  8. ; * modify it under the terms of the GNU Lesser General Public
  9. ; * License as published by the Free Software Foundation; either
  10. ; * version 2.1 of the License, or (at your option) any later version.
  11. ; *
  12. ; * Libav is distributed in the hope that it will be useful,
  13. ; * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15. ; * Lesser General Public License for more details.
  16. ; *
  17. ; * You should have received a copy of the GNU Lesser General Public
  18. ; * License along with Libav; if not, write to the Free Software
  19. ; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ; */
  21. ;%include "libavutil/x86/x86inc.asm"
  22. %include "libavutil/x86/x86util.asm"
  23.  
  24. SECTION_RODATA
  25.  
  26.  
  27. SECTION .text
  28.  
  29.  
  30. INIT_XMM sse4   ; adds ff_ and _sse4 to function name
  31.  
  32. ;******************************
  33. ;void put_hevc_mc_pixels_8(int16_t *dst, ptrdiff_t dststride,
  34. ;                                       uint8_t *_src, ptrdiff_t _srcstride,
  35. ;                                       int width, int height, int mx, int my,
  36. ;                                       int16_t* mcbuffer)
  37. ;
  38. ;   r0 : *dst
  39. ;   r1 : dststride
  40. ;   r2 : *src
  41. ;   r3 : srcstride
  42. ;   r4 : width
  43. ;   r5 : height
  44. ;
  45. ;******************************
  ;-----------------------------------------------------------------------------
  ; put_hevc_mc_pixels_2_8
  ;
  ; Scalar path: converts 8-bit source pixels to the 16-bit intermediate format,
  ; one pixel per iteration:  dst[x] = src[x] << 6   (6 = 14 - 8 bit depth).
  ; Uses no SIMD instructions.
  ;
  ; In (x86inc argument numbering):
  ;   r0  = dst        (int16_t *)
  ;   r1  = dststride  (in int16_t elements; scaled by 2 below)
  ;   r2  = src        (uint8_t *)
  ;   r3  = srcstride  (in bytes)
  ;   r4d = width      (C int: only the low 32 bits are meaningful)
  ;   r5d = height     (C int: only the low 32 bits are meaningful)
  ; Scratch:
  ;   r6  = column index, r7 = pixel temporary
  ;
  ; Both loops are bottom-tested, so one pixel of one row is always processed;
  ; callers are expected to pass width >= 1 and height >= 1.
  ; r0, r2 and r5 are modified.
  ;-----------------------------------------------------------------------------
  cglobal put_hevc_mc_pixels_2_8, 6, 8
  .row:
      xor         r6d, r6d                ; x = 0 (also clears the upper half of r6)
  .col:
      movzx       r7d, byte [r2+r6]       ; zero-extended load of src[x]
      shl         r7d, 6                  ; scale 8-bit sample to 14 bits
      mov         [r0+2*r6], r7w          ; dst[x] (16-bit store)
      inc         r6d
      cmp         r6d, r4d                ; 32-bit compare: width is an int
      jl          .col
      lea         r0, [r0+2*r1]           ; dst += dststride (int16_t elements)
      add         r2, r3                  ; src += srcstride
      dec         r5d                     ; one row done
      jg          .row                    ; loop while rows remain
      RET
  ;-----------------------------------------------------------------------------
  ; put_hevc_mc_pixels_4_8
  ;
  ; Converts 8-bit source pixels to the 16-bit intermediate format, four pixels
  ; per iteration:  dst[x..x+3] = src[x..x+3] << 6   (6 = 14 - 8 bit depth).
  ;
  ; In (x86inc argument numbering):
  ;   r0  = dst        (int16_t *)
  ;   r1  = dststride  (in int16_t elements; scaled by 2 below)
  ;   r2  = src        (uint8_t *)
  ;   r3  = srcstride  (in bytes)
  ;   r4d = width      (C int: only the low 32 bits are meaningful)
  ;   r5d = height     (C int: only the low 32 bits are meaningful)
  ; Scratch:
  ;   r6 = column index, xmm0 = zero, xmm1 = working vector
  ;
  ; Each iteration reads exactly 4 source bytes and writes 8 destination bytes.
  ; If width is not a multiple of 4, the last iteration of a row still handles
  ; a full group of 4 and therefore touches columns beyond width-1.
  ; Both loops are bottom-tested: width >= 1 and height >= 1 are expected.
  ; r0, r2 and r5 are modified.
  ;-----------------------------------------------------------------------------
  cglobal put_hevc_mc_pixels_4_8, 6, 7
      pxor        xmm0, xmm0              ; zero vector used for byte->word unpack
  .row:
      xor         r6d, r6d                ; x = 0 (also clears the upper half of r6)
  .col:
      movd        xmm1, [r2+r6]           ; load only the 4 bytes that are used
      punpcklbw   xmm1, xmm0              ; zero-extend bytes to 16-bit words
      psllw       xmm1, 6                 ; scale 8-bit samples to 14 bits
      movq        [r0+2*r6], xmm1         ; store 4 x int16_t
      add         r6d, 4
      cmp         r6d, r4d                ; 32-bit compare: width is an int
      jl          .col
      lea         r0, [r0+2*r1]           ; dst += dststride (int16_t elements)
      add         r2, r3                  ; src += srcstride
      dec         r5d                     ; one row done
      jg          .row                    ; loop while rows remain
      RET
  93.  
  ;-----------------------------------------------------------------------------
  ; put_hevc_mc_pixels_8_8
  ;
  ; Converts 8-bit source pixels to the 16-bit intermediate format, eight pixels
  ; per iteration:  dst[x..x+7] = src[x..x+7] << 6   (6 = 14 - 8 bit depth).
  ;
  ; In (x86inc argument numbering):
  ;   r0  = dst        (int16_t *)
  ;   r1  = dststride  (in int16_t elements; scaled by 2 below)
  ;   r2  = src        (uint8_t *)
  ;   r3  = srcstride  (in bytes)
  ;   r4d = width      (C int: only the low 32 bits are meaningful)
  ;   r5d = height     (C int: only the low 32 bits are meaningful)
  ; Scratch:
  ;   r6 = column index, xmm0 = zero, xmm1 = working vector
  ;
  ; Each iteration reads 8 source bytes and writes 16 destination bytes with an
  ; unaligned store, so dst needs no particular alignment.
  ; If width is not a multiple of 8, the last iteration of a row still handles
  ; a full group of 8 and therefore touches columns beyond width-1.
  ; Both loops are bottom-tested: width >= 1 and height >= 1 are expected.
  ; r0, r2 and r5 are modified.
  ;-----------------------------------------------------------------------------
  cglobal put_hevc_mc_pixels_8_8, 6, 7
      pxor        xmm0, xmm0              ; zero vector used for byte->word unpack
  .row:
      xor         r6d, r6d                ; x = 0 (also clears the upper half of r6)
  .col:
      movq        xmm1, [r2+r6]           ; load 8 source bytes (upper half zeroed)
      punpcklbw   xmm1, xmm0              ; zero-extend bytes to 16-bit words
      psllw       xmm1, 6                 ; scale 8-bit samples to 14 bits
      movdqu      [r0+2*r6], xmm1         ; store 8 x int16_t
      add         r6d, 8
      cmp         r6d, r4d                ; 32-bit compare: width is an int
      jl          .col
      lea         r0, [r0+2*r1]           ; dst += dststride (int16_t elements)
      add         r2, r3                  ; src += srcstride
      dec         r5d                     ; one row done
      jg          .row                    ; loop while rows remain
      RET
  120.  
  ;-----------------------------------------------------------------------------
  ; put_hevc_mc_pixels_16_8
  ;
  ; Converts 8-bit source pixels to the 16-bit intermediate format, sixteen
  ; pixels per iteration:  dst[x..x+15] = src[x..x+15] << 6   (6 = 14 - 8).
  ;
  ; In (x86inc argument numbering):
  ;   r0  = dst        (int16_t *)
  ;   r1  = dststride  (in int16_t elements; scaled by 2 below)
  ;   r2  = src        (uint8_t *)
  ;   r3  = srcstride  (in bytes)
  ;   r4d = width      (C int: only the low 32 bits are meaningful)
  ;   r5d = height     (C int: only the low 32 bits are meaningful)
  ; Scratch:
  ;   r6 = column index, xmm0 = zero, xmm1/xmm2 = low/high halves
  ;
  ; Each iteration reads 16 source bytes and writes 32 destination bytes; all
  ; vector memory accesses are unaligned-safe (movdqu).
  ; If width is not a multiple of 16, the last iteration of a row still handles
  ; a full group of 16 and therefore touches columns beyond width-1.
  ; Both loops are bottom-tested: width >= 1 and height >= 1 are expected.
  ; r0, r2 and r5 are modified.
  ;-----------------------------------------------------------------------------
  cglobal put_hevc_mc_pixels_16_8, 6, 7
      pxor        xmm0, xmm0              ; zero vector used for byte->word unpack
  .row:
      xor         r6d, r6d                ; x = 0 (also clears the upper half of r6)
  .col:
      movdqu      xmm1, [r2+r6]           ; load 16 source bytes
      movdqa      xmm2, xmm1              ; keep a copy for the high half
      punpcklbw   xmm1, xmm0              ; pixels 0..7  -> 16-bit words
      punpckhbw   xmm2, xmm0              ; pixels 8..15 -> 16-bit words
      psllw       xmm1, 6                 ; scale 8-bit samples to 14 bits
      psllw       xmm2, 6
      movdqu      [r0+2*r6], xmm1         ; store dst[x..x+7]
      movdqu      [r0+2*r6+16], xmm2      ; store dst[x+8..x+15]
      add         r6d, 16
      cmp         r6d, r4d                ; 32-bit compare: width is an int
      jl          .col
      lea         r0, [r0+2*r1]           ; dst += dststride (int16_t elements)
      add         r2, r3                  ; src += srcstride
      dec         r5d                     ; one row done
      jg          .row                    ; loop while rows remain
      RET
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement