;******************************************************************************
;* V210 SIMD unpack
;* Copyright (c) 2011 Loren Merritt <lorenm@u.washington.edu>
;* Copyright (c) 2011 Kieran Kunhya <kieran@kunhya.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

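; v210 packs three 10-bit samples into each little-endian 32-bit word, at bit
; offsets 0, 10 and 20 (bits 30-31 are padding); one 16-byte group therefore
; carries 6 pixels, with samples ordered Cb Y Cr / Y Cb Y / Cr Y Cb / Y Cr Y
; from low bits to high.  The constants below serve the planar unpack:
; v210_mask keeps one 10-bit field per dword, v210_mult (followed by a 6-bit
; right shift) aligns the other two fields, and the shuf tables regroup the
; results into separate luma and chroma runs.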
v210_mask: times 4 dd 0x3ff
v210_mult: dw 64,4,64,4,64,4,64,4
v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1
v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1

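; Constants for the UYVY unpack further down.  The three mask/shuf/mult sets
; correspond to the three byte-alignment phases of the packed stream: after
; the byte-granular pshufb, each 10-bit sample still sits 0, 2 or 4 bits too
; high in its 16-bit lane, and pmulhrsw by 0x7fff, 0x2000 or 0x800
; ((x*m + 0x4000) >> 15) acts, in effect, as a rounded right shift by 0, 2 or
; 4 bits, standing in for a per-lane variable shift.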
v210_uyvy_chroma_mask:  times 2 db 0xff, 0x03, 0xf0, 0x3f, 0x00, 0xfc, 0x0f, 0x00
v210_uyvy_chroma_mask2: times 2 db 0xf0, 0x3f, 0x00, 0xfc, 0x0f, 0x00, 0xff, 0x03
v210_uyvy_luma_mask:    times 2 db 0x00, 0xfc, 0x0f, 0x00, 0xff, 0x03, 0xf0, 0x3f
v210_uyvy_luma_mask2:   times 2 db 0x0f, 0x00, 0xff, 0x03, 0xf0, 0x3f, 0x00, 0xfc

v210_uyvy_chroma_shuf:  db 0, 1, -1, -1, 2,  3, -1, -1, 5, 6, -1, -1, 8, 9, -1, -1
v210_uyvy_luma_shuf:  db -1, -1, 1, 2, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 9, 10
v210_uyvy_chroma_shuf2: db 0, 1, -1, -1, 3, 4, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1
v210_uyvy_luma_shuf2: db -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 7, 8, -1, -1, 10, 11
v210_uyvy_chroma_shuf3: db 5, 6, -1, -1, 8, 9, -1, -1, 10, 11, -1, -1, 13, 14, -1, -1
v210_uyvy_luma_shuf3: db -1, -1, 6, 7, -1, -1, 9, 10, -1, -1, 12, 13, -1, -1, 14, 15

v210_uyvy_chroma_mult:    times 2 dw 0x7fff, 0, 0x800, 0, 0x2000, 0, 0x7fff, 0
v210_uyvy_luma_mult:    times 2 dw 0, 0x2000, 0, 0x7fff, 0, 0x800, 0, 0x2000
v210_uyvy_chroma_mult2:   times 2 dw 0x800, 0, 0x2000, 0, 0x7fff, 0, 0x800, 0
v210_uyvy_luma_mult2:   times 2 dw 0, 0x7fff, 0, 0x800, 0, 0x2000, 0, 0x7fff
v210_uyvy_chroma_mult3:   times 2 dw 0x2000, 0, 0x7fff, 0, 0x800, 0, 0x2000, 0
v210_uyvy_luma_mult3:   times 2 dw 0, 0x800, 0, 0x2000, 0, 0x7fff, 0, 0x800

SECTION .text

%macro v210_planar_unpack 1

; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
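; Each iteration reads one 16-byte v210 group (6 pixels) and stores six 16-bit
; luma samples plus three Cb and three Cr samples.  The destination pointers
; are biased by the width up front so that r4 can run from -width up to 0,
; doubling as loop counter and store offset (width counts luma samples).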
cglobal v210_planar_unpack_%1, 5, 5, 7
    movsxdifnidn r4, r4d
    lea    r1, [r1+2*r4]
    add    r2, r4
    add    r3, r4
    neg    r4

    mova   m3, [v210_mult]
    mova   m4, [v210_mask]
    mova   m5, [v210_luma_shuf]
    mova   m6, [v210_chroma_shuf]
.loop:
%ifidn %1, unaligned
    movu   m0, [r0]
%else
    mova   m0, [r0]
%endif

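    ; Scalar view of one 32-bit word x, for reference:
    ;   bottom = x & 0x3ff,  middle = (x >> 10) & 0x3ff,  top = (x >> 20) & 0x3ff
    ; pmullw by {64,4} followed by psrlw 6 right-aligns the bottom and top
    ; samples in the two 16-bit halves of each dword (the stray middle-sample
    ; bits overflow out of the lane), while psrld 10 plus the 0x3ff mask
    ; isolates the middle sample.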
    pmullw m1, m0, m3
    psrld  m0, 10
    psrlw  m1, 6  ; u0 v0 y1 y2 v1 u2 y4 y5
    pand   m0, m4 ; y0 __ u1 __ y3 __ v2 __

    shufps m2, m1, m0, 0x8d ; y1 y2 y4 y5 y0 __ y3 __
    pshufb m2, m5 ; y0 y1 y2 y3 y4 y5 __ __
    movu   [r1+2*r4], m2

    shufps m1, m0, 0xd8 ; u0 v0 v1 u2 u1 __ v2 __
    pshufb m1, m6 ; u0 u1 u2 __ v0 v1 v2 __
    movq   [r2+r4], m1
    movhps [r3+r4], m1
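    ; The movu/movq/movhps stores write a few don't-care bytes beyond the six
    ; luma and three+three chroma samples produced here; the next iteration
    ; overwrites them, and the overshoot of the final group presumably relies
    ; on the caller providing suitably padded destination buffers.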

    add r0, mmsize
    add r4, 6
    jl  .loop

    REP_RET
%endmacro

INIT_XMM ssse3
v210_planar_unpack unaligned

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
v210_planar_unpack unaligned
%endif

INIT_XMM ssse3
v210_planar_unpack aligned

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
v210_planar_unpack aligned
%endif

%macro v210_uyvy_unpack 1

; v210_uyvy_unpack(const uint32_t *src, uint16_t *uyvy, int64_t width)
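; Each iteration consumes two 16-byte v210 groups (32 bytes, 12 pixels) and
; writes 48 bytes of interleaved UYVY with every 10-bit sample zero-extended
; to a 16-bit word.  Fourteen xmm registers are used, which makes this path
; x86-64 only.  r2 is pre-scaled to the output size in bytes (width * 4) and,
; as in the planar routine, counts up from -size to 0.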
cglobal v210_uyvy_unpack_%1, 3, 3, 14
    shl    r2, 2
    add    r1, r2
    neg    r2

    mova m6, [v210_uyvy_luma_mask]
    mova m7, [v210_uyvy_chroma_mask]
    mova m8, [v210_uyvy_luma_mask2]
    mova m9, [v210_uyvy_chroma_mask2]

    mova   m10, [v210_uyvy_chroma_shuf]
    mova   m11, [v210_uyvy_chroma_mult]
    mova   m12, [v210_uyvy_luma_shuf]
    mova   m13, [v210_uyvy_luma_mult]

.loop:
%ifidn %1, unaligned
    movu   m0, [r0]
    movu   m4, [r0+mmsize]
%else
    mova   m0, [r0]
    mova   m4, [r0+mmsize]
%endif

    palignr  m2, m4, m0, 10
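    ; m0 holds input bytes 0-15 and m4 bytes 16-31; palignr builds m2 from
    ; bytes 10-25, the span straddling the two loads.  The three register
    ; pairs (m0/m1, m2/m3, m4/m5) then cover all 24 samples, one
    ; mask/shuffle/multiplier phase each.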

    pand     m1, m0, m6
    pand     m0, m7

    pand     m3, m2, m8
    pand     m2, m9

    pand     m5, m4, m6
    pand     m4, m7

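    ; Each phase gathers the masked chroma and luma words into their output
    ; slots with pshufb, right-aligns them to 10 bits with pmulhrsw (shift by
    ; 0, 2 or 4, as encoded in the multiplier tables), and merges the two
    ; halves with por into one register of eight U Y V Y words.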
    pshufb   m0, m10
    pmulhrsw m0, m11
    pshufb   m1, m12
    pmulhrsw m1, m13
    por      m0, m1

    pshufb   m2, [v210_uyvy_chroma_shuf2]
    pmulhrsw m2, [v210_uyvy_chroma_mult2]
    pshufb   m3, [v210_uyvy_luma_shuf2]
    pmulhrsw m3, [v210_uyvy_luma_mult2]
    por      m2, m3

    pshufb   m4, [v210_uyvy_chroma_shuf3]
    pmulhrsw m4, [v210_uyvy_chroma_mult3]
    pshufb   m5, [v210_uyvy_luma_shuf3]
    pmulhrsw m5, [v210_uyvy_luma_mult3]
    por      m4, m5

    mova [r1+r2], m0
    mova [r1+r2+mmsize], m2
    mova [r1+r2+2*mmsize], m4

    add r0, 2*mmsize
    add r2, 3*mmsize
    jl  .loop

    REP_RET
%endmacro

INIT_XMM ssse3
v210_uyvy_unpack unaligned

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
v210_uyvy_unpack unaligned
%endif

INIT_XMM ssse3
v210_uyvy_unpack aligned

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
v210_uyvy_unpack aligned
%endif