Advertisement
xiahanlu

快速拷贝Alpha汇编

Jun 9th, 2018
200
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.29 KB | None | 0 0
  .686                                  ; 32-bit x86 instruction set
  .mmx
  .xmm                                  ; enable the SSE/SSE2 mnemonics used below
  .model flat, stdcall                  ; 32-bit flat memory model
  option casemap :none                  ; identifiers are case sensitive

;=============================================================================
; int CAlphaSingle32Unalign16_SSE (void * _vptr, int dword_byte,
;                                  int dst_x, int dst_y,
;                                  int dst_w, int dst_h,
;                                  COLORREF col,
;                                  BYTE alpha_dst, BYTE alpha_src );
;
; Syntax: MASM (Intel operand order).  Target: 32-bit x86, SSE2 or later.
; ABI:    C calling convention (arguments on the stack, caller cleans up).
;
; Blends one constant colour into a rectangle of 32-bit pixels, in place.
; For every byte of every pixel in the rectangle:
;
;       dst = saturate_u8( (dst * alpha_dst >> 8) + (col * alpha_src >> 8) )
;
; In:    _vptr      base address of the pixel buffer (no alignment required)
;        dword_byte distance between two rows, in bytes
;        dst_x      left edge of the rectangle, in pixels (4 bytes each)
;        dst_y      top edge of the rectangle, in rows
;        dst_w      rectangle width in pixels
;        dst_h      rectangle height in rows
;        col        32-bit colour that is blended in
;        alpha_dst  weight applied to the existing pixels (low byte is used)
;        alpha_src  weight applied to col                 (low byte is used)
; Out:   eax = 0 on success, -1 when dst_w <= 0 or dst_h <= 0
;        (nothing is read or written in the failure case)
; Clobb: eax, ecx, edx, xmm0-xmm7, flags.  edi and esi are preserved.
;
; No clipping is performed; the caller must pass a rectangle that lies
; inside the buffer.
;
; TODO (carried over from the original author's note, wording was unclear):
;       further vector optimisation, better register allocation, aligned
;       loads/stores.
;=============================================================================

STACK_PUSH_CC   equ 8                           ; bytes pushed below (edi + esi)

; Argument offsets from esp once edi and esi have been pushed.
ARG_VPTR        equ ( 4 + STACK_PUSH_CC)
ARG_PITCH       equ ( 8 + STACK_PUSH_CC)
ARG_DST_X       equ (12 + STACK_PUSH_CC)
ARG_DST_Y       equ (16 + STACK_PUSH_CC)
ARG_DST_W       equ (20 + STACK_PUSH_CC)
ARG_DST_H       equ (24 + STACK_PUSH_CC)
ARG_COL         equ (28 + STACK_PUSH_CC)
ARG_ALPHA_DST   equ (32 + STACK_PUSH_CC)
ARG_ALPHA_SRC   equ (36 + STACK_PUSH_CC)

PIXEL_BYTES     equ 4                           ; one 32-bit pixel
BLOCK_PIXELS    equ 16                          ; pixels handled per vector block
BLOCK_BYTES     equ (BLOCK_PIXELS * PIXEL_BYTES)

;-----------------------------------------------------------------------------
; splat_byte0 xreg
;   Copies byte 0 of xreg into all 16 bytes of xreg using SSE2 only.
;   Produces exactly what "pshufb xreg, <all-zero mask>" produces, without
;   requiring SSSE3.
;-----------------------------------------------------------------------------
splat_byte0 macro xreg
        punpcklbw  xreg, xreg           ;; word 0 = b0 | b0 << 8
        pshuflw    xreg, xreg, 0        ;; words 0..3 = word 0
        punpcklqdq xreg, xreg           ;; words 4..7 = words 0..3
endm

;-----------------------------------------------------------------------------
; mixer_make mixer_out
;   Scales each of the 16 bytes in mixer_out by the destination weight:
;   byte = byte * alpha_dst >> 8.
;   Expects xmm7 = 0 and xmm5 = alpha_dst << 8 in every word.  Clobbers xmm4.
;-----------------------------------------------------------------------------
mixer_make macro mixer_out
        movdqa     xmm4, mixer_out      ;; keep a copy for the high 8 bytes
        punpcklbw  mixer_out, xmm7      ;; low  8 bytes -> words
        pmulhuw    mixer_out, xmm5
        punpckhbw  xmm4, xmm7           ;; high 8 bytes -> words
        pmulhuw    xmm4, xmm5
        packuswb   mixer_out, xmm4      ;; back to 16 bytes
endm

;-----------------------------------------------------------------------------
; blend_pixel
;   Blends the single pixel at [edi] and advances edi by one pixel.
;   Clobbers xmm0, xmm4, flags.
;-----------------------------------------------------------------------------
blend_pixel macro
        movd       xmm0, dword ptr [edi]
        mixer_make xmm0
        paddusb    xmm0, xmm6
        movd       dword ptr [edi], xmm0
        add        edi, PIXEL_BYTES
endm

;-----------------------------------------------------------------------------
; blend_block16
;   Blends the 16 pixels at [edi] (unaligned access) and advances edi by
;   64 bytes.  Clobbers xmm0-xmm4, flags.
;-----------------------------------------------------------------------------
blend_block16 macro
        movdqu     xmm0, xmmword ptr [edi + 000h]
        movdqu     xmm1, xmmword ptr [edi + 010h]
        movdqu     xmm2, xmmword ptr [edi + 020h]
        movdqu     xmm3, xmmword ptr [edi + 030h]

        mixer_make xmm0
        mixer_make xmm1
        mixer_make xmm2
        mixer_make xmm3

        paddusb    xmm0, xmm6
        paddusb    xmm1, xmm6
        paddusb    xmm2, xmm6
        paddusb    xmm3, xmm6

        movdqu     xmmword ptr [edi + 000h], xmm0
        movdqu     xmmword ptr [edi + 010h], xmm1
        movdqu     xmmword ptr [edi + 020h], xmm2
        movdqu     xmmword ptr [edi + 030h], xmm3

        add        edi, BLOCK_BYTES
endm

.code

option prologue:none, epilogue:none     ; the stack is managed by hand below

align 16
CAlphaSingle32Unalign16_SSE proc C

        push    edi                             ; preserved for the caller
        push    esi                             ; preserved for the caller

        ; ---- validate the rectangle before touching anything --------------
        mov     esi, [esp + ARG_DST_W]          ; esi = width in pixels
        mov     edx, [esp + ARG_DST_H]          ; edx = rows remaining
        test    esi, esi
        jle     V_EXIT_FAILED                   ; signed: width  <= 0
        test    edx, edx
        jle     V_EXIT_FAILED                   ; signed: height <= 0

        ; ---- edi = _vptr + dst_y * pitch + dst_x * 4 ----------------------
        mov     edi, [esp + ARG_VPTR]
        mov     ecx, [esp + ARG_PITCH]
        mov     eax, [esp + ARG_DST_Y]
        imul    eax, ecx
        add     edi, eax
        mov     eax, [esp + ARG_DST_X]
        lea     edi, [edi + eax*4]

        ; ---- ecx = bytes from the end of one row span to the next start ---
        lea     eax, [esi*4]
        sub     ecx, eax

        ; ---- loop-invariant vector constants ------------------------------
        ;   xmm7 = 0
        ;   xmm5 = alpha_dst << 8 in every word
        ;   xmm6 = (col * alpha_src >> 8), repeated in all four dwords
        pxor      xmm7, xmm7
        movd      xmm6, dword ptr [esp + ARG_COL]
        movd      xmm5, dword ptr [esp + ARG_ALPHA_DST]
        movd      xmm0, dword ptr [esp + ARG_ALPHA_SRC]
        splat_byte0 xmm0
        psllw     xmm0, 8                       ; word = alpha_src << 8
        splat_byte0 xmm5
        psllw     xmm5, 8                       ; word = alpha_dst << 8
        punpcklbw xmm6, xmm7                    ; colour bytes -> words
        pmulhuw   xmm6, xmm0                    ; (c * (a << 8)) >> 16 = c * a >> 8
        packuswb  xmm6, xmm6
        pshufd    xmm6, xmm6, 0                 ; replicate the pixel 4 times

        ; ---- one iteration per row ----------------------------------------
        ; Invariants: edi = first pixel of the current row span,
        ;             esi = width (> 0), edx = rows remaining (> 0),
        ;             ecx = row skip in bytes.
align 16
_rowLoop:
        mov     eax, esi
        and     eax, -BLOCK_PIXELS              ; pixels covered by whole blocks
        jz      _rowTail                        ; width < 16: no vector blocks

_blockLoop:
        blend_block16
        sub     eax, BLOCK_PIXELS
        jnz     _blockLoop

_rowTail:
        mov     eax, esi
        and     eax, BLOCK_PIXELS - 1           ; 0..15 left-over pixels
        jz      _rowDone

_pixelLoop:
        blend_pixel
        dec     eax
        jnz     _pixelLoop

_rowDone:
        add     edi, ecx                        ; move to the next row span
        dec     edx
        jnz     _rowLoop

V_EXIT_SUCCESS:
        pop     esi
        pop     edi
        xor     eax, eax                        ; return 0
        ret

V_EXIT_FAILED:
        pop     esi
        pop     edi
        mov     eax, -1                         ; return -1
        ret

CAlphaSingle32Unalign16_SSE endp

end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement