Guest User

Untitled

a guest
Feb 18th, 2018
287
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. ;*****************************************************************************
;* MMX/SSE2/AVX-optimized 8-bit H.264 qpel code
  3. ;*****************************************************************************
  4. ;* Copyright (C) 2011 Daniel Kang
  5. ;*
  6. ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
  7. ;*
  8. ;* This file is part of Libav.
  9. ;*
  10. ;* Libav is free software; you can redistribute it and/or
  11. ;* modify it under the terms of the GNU Lesser General Public
  12. ;* License as published by the Free Software Foundation; either
  13. ;* version 2.1 of the License, or (at your option) any later version.
  14. ;*
  15. ;* Libav is distributed in the hope that it will be useful,
  16. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18. ;* Lesser General Public License for more details.
  19. ;*
  20. ;* You should have received a copy of the GNU Lesser General Public
  21. ;* License along with Libav; if not, write to the Free Software
  22. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23. ;******************************************************************************
  24.  
  25. %include "x86inc.asm"
  26. %include "x86util.asm"
  27.  
SECTION_RODATA 32

; Shared constant tables, defined in another object of the project:
cextern pw_16              ; words of 16 — rounding bias added before the >>5
cextern pw_5               ; words of 5  — six-tap filter coefficient

SECTION .text
  34.  
; AVG_MOV mem, reg
; "avg"-mode store: average the freshly computed pixels (%2) with the pixels
; already at the destination (%1), then write the result back.
; pavgb rounds up: per byte, (a + b + 1) >> 1.
; NOTE(review): movh stores only the low half of an XMM register; confirm an
; 8-byte store is intended on the 16-byte-wide SSE2 paths that use this.
%macro AVG_MOV 2
    pavgb  %2, %1              ; %2 = byte-wise rounded average of %2 and [mem]
    movh   %1, %2              ; store the (low half of the) averaged result
%endmacro
  39.  
; MC one_mc_macro
; Instantiate a motion-compensation macro (%1) in all four flavours:
;   put/mmxext (4 px wide), put/sse2 (8 px wide),
;   avg/mmxext (4 px wide), avg/sse2 (8 px wide).
; OP_MOV selects the store operation used by the MC body: a plain movh for
; "put", the read-modify-write AVG_MOV for "avg".
%macro MC 1
%define OP_MOV movh
INIT_MMX
%1 mmxext, put, 4
INIT_XMM
%1 sse2  , put, 8

%define OP_MOV AVG_MOV
INIT_MMX
%1 mmxext, avg, 4
INIT_XMM
%1 sse2  , avg, 8
%endmacro
  53.  
; MCAxA cpu, put/avg, mc_name, size, 2*size, nargs, nregs, nxmm
; Conditionally emit the double-width composite wrapper (MCAxA_OP).
; On x86-64 the mmxext variant is skipped — NOTE(review): presumably the
; composite mmxext size is not registered in the 64-bit function tables;
; confirm against the C side before relying on this.
%macro MCAxA 8
%ifdef ARCH_X86_64
%ifnidn %1,mmxext
MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
%endif
%else
MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
%endif
%endmacro
  63.  
; MCAxA_OP cpu, put/avg, mc_name, size, 2*size, nargs, nregs, nxmm (%1..%8)
; Build the (2N x 2N) function out of four calls to the stub of the (N x N)
; version, one per quadrant: top-left, top-right, bottom-left, bottom-right.
; r0 = dst, r1 = src, r2 = stride (same stride for both planes).
; NOTE(review): the horizontal quadrant step is %4*2 bytes, i.e. 2 bytes per
; pixel as for 16-bit samples — but the MC bodies in this file handle 8-bit
; pixels, for which a step of %4 bytes would be expected. Confirm.
%macro MCAxA_OP 8
cglobal %2_h264_qpel%5_%3_%1, %6,%7,%8
%ifdef ARCH_X86_32
    ; The stub advances/clobbers r0 and r1, so reload them from the
    ; argument stack slots (r0m/r1m) before each quadrant.
    call stub_%2_h264_qpel%4_%3_%1
    mov  r0, r0m
    mov  r1, r1m
    add  r0, %4*2                  ; right quadrant column offset
    add  r1, %4*2
    call stub_%2_h264_qpel%4_%3_%1
    mov  r0, r0m
    mov  r1, r1m
    lea  r0, [r0+r2*%4]            ; down N rows
    lea  r1, [r1+r2*%4]
    call stub_%2_h264_qpel%4_%3_%1
    mov  r0, r0m
    mov  r1, r1m
    lea  r0, [r0+r2*%4+%4*2]       ; down N rows + right column offset
    lea  r1, [r1+r2*%4+%4*2]
    call stub_%2_h264_qpel%4_%3_%1
    RET
%else ; ARCH_X86_64
    ; r10/r11 survive the stub calls (the stub bodies in this file only
    ; touch r0-r3 and vector registers), so keep the original dst/src
    ; pointers in them instead of reloading from memory.
    mov r10, r0
    mov r11, r1
    call stub_%2_h264_qpel%4_%3_%1
    lea  r0, [r10+%4*2]
    lea  r1, [r11+%4*2]
    call stub_%2_h264_qpel%4_%3_%1
    lea  r0, [r10+r2*%4]
    lea  r1, [r11+r2*%4]
    call stub_%2_h264_qpel%4_%3_%1
    lea  r0, [r10+r2*%4+%4*2]
    lea  r1, [r11+r2*%4+%4*2]
%ifndef UNIX64 ; fall through to function
    call stub_%2_h264_qpel%4_%3_%1
    RET
%endif
    ; On UNIX64 the fourth quadrant is handled by falling straight through
    ; into the NxN entry emitted right after this macro expansion (that
    ; entry has no prologue on UNIX64), saving a call/ret pair.
%endif
%endmacro
  102.  
;cpu, put/avg, mc, 4/8, ...
; cglobal_mc cpu, put/avg, mc_name, size, nargs, nregs, nxmm
; Emits, in order:
;   1. the 2Nx2N composite wrapper (via MCAxA), which calls the stub below;
;   2. the public NxN entry point;
;   3. the stub_... label — the actual NxN implementation body is the code
;      that follows the cglobal_mc invocation inside the calling macro.
; On UNIX64 the public NxN entry needs no prologue/epilogue, so it simply
; falls through into the stub instead of call+RET.
%macro cglobal_mc 7
%assign i %4*2                 ; composite size = twice the base block size
MCAxA %1, %2, %3, %4, i, %5,%6,%7

cglobal %2_h264_qpel%4_%3_%1, %5,%6,%7
%ifndef UNIX64 ; no prologue or epilogue for UNIX64
    call stub_%2_h264_qpel%4_%3_%1
    RET
%endif

stub_%2_h264_qpel%4_%3_%1:
%endmacro
  116.  
;-----------------------------------------------------------------------------
; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; COPY4 load_op
; Copy four consecutive rows from src (r1) to dst (r0), using %1 for the
; loads and OP_MOV for the stores (plain store for "put", averaging store
; for "avg").  Expects r2 = stride and r3 = 3*stride.  Clobbers m0.
%macro COPY4 1
    %1            m0, [r1     ]    ; row 0
    OP_MOV [r0     ], m0
    %1            m0, [r1+r2  ]    ; row 1
    OP_MOV [r0+r2  ], m0
    %1            m0, [r1+r2*2]    ; row 2
    OP_MOV [r0+r2*2], m0
    %1            m0, [r1+r3  ]    ; row 3 (r3 = 3*stride)
    OP_MOV [r0+r3  ], m0
%endmacro
  130.  
; MC00 put/avg
; mc00 is a plain block copy (no interpolation): 4x4 and 8x8 with MMX,
; 16x16 with SSE2.
; NOTE(review): MOV_OP (defined here) and OP_MOV (used by COPY4) are
; distinct symbols. For "put", OP_MOV is %define'd as MOV_OP outside this
; macro and therefore resolves lazily to the most recent MOV_OP definition
; below — the near-identical names are easy to confuse; verify each width
; ends up with the intended store.
%macro MC00 1
INIT_MMX
%define MOV_OP movh
cglobal %1_h264_qpel4_mc00_mmxext,3,4
    lea   r3, [r2*3   ]            ; r3 = 3*stride, needed by COPY4
    COPY4 movh
    RET

%define MOV_OP mova
cglobal %1_h264_qpel8_mc00_mmxext,3,4
    lea   r3, [r2*3   ]
    COPY4 movu                     ; unaligned loads: src need not be aligned
    lea   r0, [r0+r2*4]            ; advance both planes four rows
    lea   r1, [r1+r2*4]
    COPY4 movu
    RET

INIT_XMM
cglobal %1_h264_qpel16_mc00_sse2,3,5
    lea   r3, [r2*3   ]
    mov  r4d, 4                    ; 4 iterations x 4 rows = 16 rows
.loop:
    COPY4 movu
    lea   r0, [r0+r2*4]
    lea   r1, [r1+r2*4]
    dec  r4d
    jg .loop
    REP_RET
%endmacro
  160.  
; Instantiate the mc00 block-copy functions for both store modes.
INIT_MMX
%define OP_MOV MOV_OP   ; put: plain store, width chosen by MOV_OP inside MC00
MC00 put

INIT_MMX
%define OP_MOV AVG_MOV  ; avg: read-modify-write averaging store
MC00 avg
  168.  
  169. ;-----------------------------------------------------------------------------
  170. ; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
  171. ;-----------------------------------------------------------------------------
  172. %macro MC20 3
  173. cglobal_mc %1, %2, mc20, %3, 3,4,9
  174.     mov      r3d, %3
  175.     pxor      m7, m7
  176.     mova      m4, [pw_5]
  177.     mova      m5, [pw_16]
  178. .nextrow
  179.     movh      m1, [r1-1]
  180.     movh      m2, [r1+0]
  181.     movh      m3, [r1+1]
  182.     movh      m0, [r1+2]
  183.     punpcklbw m1, m7
  184.     punpcklbw m2, m7
  185.     punpcklbw m3, m7
  186.     punpcklbw m0, m7
  187.     paddw     m1, m0
  188.     paddw     m2, m3
  189.     movh      m0, [r1-2]
  190.     movh      m3, [r1+3]
  191.     punpcklbw m0, m7
  192.     punpcklbw m3, m7
  193.     paddw     m0, m3
  194.     psllw     m2, 2
  195.     psubw     m2, m1
  196.     pmullw    m2, m4
  197.     paddw     m0, m5
  198.     paddw     m0, m2
  199.     psraw     m0, 5
  200.     packuswb  m0, m0
  201.     OP_MOV [r0], m0
  202.     add      r0, r2
  203.     add      r1, r2
  204.     dec     r3d
  205.     jg .nextrow
  206.     rep ret
  207. %endmacro
  208.  
; Instantiate mc20 for put/avg x mmxext/sse2 (see the MC macro above).
MC MC20
Add Comment
Please, Sign In to add comment