Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; Swap2
- ;
- ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ; Byte-select masks for the 256-bit path of Swap2. The segment is 32-byte
- ; aligned and each mask is exactly 32 bytes, so both masks start on a
- ; 32-byte boundary.
- Const32 segment align(32) ".Const"
- himask  DQ 4 DUP (0FF00FF00FF00FF00H)   ; keeps the odd-addressed byte of each word
- lomask  DQ 4 DUP (000FF00FF00FF00FFH)   ; keeps the even-addressed byte of each word
- Const32 ends
- ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- .Code
- ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; Version 4
- ;
- ; ----------------------------------------------------------------------
- ; Swap2 - swap the two bytes of every 16-bit word of a buffer, in place.
- ;
- ; In:    RCX = lpbaBuffer  start of the buffer (any alignment)
- ;        RDX = cbBuffer    size in bytes; an odd size is rounded down,
- ;                          so a trailing odd byte is left untouched
- ; Out:   nothing
- ; Clobb: RAX, RCX, RDX, R8-R11, YMM0-YMM3, flags
- ; Notes: Leaf procedure, no stack use. Arguments in RCX/RDX follow the
- ;        Microsoft x64 convention, where XMM6-XMM15 are callee-saved,
- ;        so only volatile vector registers are used.
- ;        Buffers of 128 bytes or more take the 256-bit path, which
- ;        needs AVX2 (integer vpand/vpor/vpsrldq/vpslldq on YMM).
- ; ----------------------------------------------------------------------
- Swap2 Proc ; lpbaBuffer:RCX, cbBuffer:RDX
-         and     rdx, -2                     ; whole words only
-         cmp     rdx, 128                    ; full 64-bit compare of the size
-         jae     big_buffer                  ; 128 bytes or more? use AVX2
- 
- ; ---- scalar path: RDX < 128, processed from the end towards the start
-         test    dl, 0F8H
-         jz      small_check_4               ; fewer than 8 bytes left
-         mov     r10, 0FF00FF00FF00FF00H     ; high byte of each word
-         mov     r11, 000FF00FF00FF00FFH     ; low byte of each word
-         align 4
- small_loop_8:
-         sub     rdx, 8                      ; RDX = offset of this 8-byte group
-         mov     r8, [rcx + rdx]
-         mov     r9, r8
-         and     r8, r10
-         and     r9, r11
-         shr     r8, 8                       ; high bytes move down
-         shl     r9, 8                       ; low bytes move up
-         or      r9, r8
-         mov     [rcx + rdx], r9
-         test    dl, 0F8H
-         jnz     small_loop_8                ; another 8 bytes?
- small_check_4:
-         test    dl, 0FCH
-         jz      small_check_2               ; fewer than 4 bytes left
-         sub     rdx, 4
-         mov     r8d, [rcx + rdx]
-         mov     r9d, r8d
-         and     r8d, 0FF00FF00H
-         and     r9d, 000FF00FFH
-         shr     r8d, 8
-         shl     r9d, 8
-         or      r9d, r8d
-         mov     [rcx + rdx], r9d
- small_check_2:
-         test    dl, 0FEH
-         jz      small_done                  ; nothing left
-         mov     ax, [rcx + rdx - 2]         ; the one remaining word
-         rol     ax, 8
-         mov     [rcx + rdx - 2], ax
- small_done:
-         ret
-         int     3                           ; padding, never executed
- 
- ; ---- 256-bit path: RDX >= 128 and even
- big_buffer:
-         test    cl, 1
-         jnz     trim_tail                   ; an odd address can never reach
-                                             ; 32-byte alignment; the stream
-                                             ; loop uses unaligned accesses
-         jmp     check_front_align
-         align 4
- loop_front_align:                           ; swap single words until RCX
-         mov     ax, word ptr [rcx]          ; is 32-byte aligned
-         rol     ax, 8
-         mov     word ptr [rcx], ax
-         add     rcx, 2
-         sub     rdx, 2
- check_front_align:
-         test    cl, 1FH
-         jnz     loop_front_align            ; at most 15 words (30 bytes)
- trim_tail:
-         test    dl, 1FH
-         jz      load_masks                  ; size already a multiple of 32
-         align 4
- loop_back_align:                            ; swap single words at the end
-         sub     rdx, 2                      ; until RDX is a multiple of 32
-         mov     ax, word ptr [rcx + rdx]
-         rol     ax, 8
-         mov     word ptr [rcx + rdx], ax
-         test    dl, 1FH
-         jnz     loop_back_align             ; at most 15 words (30 bytes)
- load_masks:                                 ; here RDX >= 64, multiple of 32
-         vmovdqa ymm2, ymmword ptr himask    ; odd-addressed bytes
-         vmovdqa ymm3, ymmword ptr lomask    ; even-addressed bytes
-         align 4
- stream:
-         vmovdqu ymm0, ymmword ptr [rcx + rdx - 32]  ; last unprocessed 32 bytes
-         vpand   ymm1, ymm2, ymm0            ; high byte of each word
-         vpand   ymm0, ymm3, ymm0            ; low byte of each word
-         vpsrldq ymm1, ymm1, 1               ; per-lane byte shifts; the byte
-         vpslldq ymm0, ymm0, 1               ; shifted out is always masked zero
-         vpor    ymm0, ymm0, ymm1
-         vmovdqu ymmword ptr [rcx + rdx - 32], ymm0
-         sub     rdx, 32
-         jnz     stream                      ; next 32 bytes?
-         vzeroupper                          ; avoid AVX/SSE transition penalty
-         ret                                 ; in the caller
- Swap2 EndP
- ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- End
- ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement