Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ; ========================================================================================
- ; FillColor(x,y,w,h,c) - fill a rectangle of the video buffer with one 32-bit colour
- ; ----------------------------------------------------------------------------------------
- ; ABI: private register convention (arguments are NOT in Win64 / SysV order)
- ; p1: r8d = x          (pixels; must be a multiple of 4)
- ; p2: r9d = y          (rows)
- ; p3: r10d = width     (pixels; whole groups of 4 are filled, a remainder is dropped)
- ; p4: r11d = height    (rows)
- ; p5: r12d = rgb color (value stored to every pixel, 4 bytes per pixel)
- ;
- ; Out:   nothing
- ; Keeps: rbx, rbp, rdi (saved and restored here); r11 and r12 are only read
- ; Clobb: rax, rcx, rdx, r8, r9, r10, xmm0, xmm1, ymm0, flags
- ;
- ; Destination address = VIDEO$ + y*2000h + x*4 (one row = 2048 pixels * 4 bytes).
- ; No clipping against the buffer size is done; the caller must pass an in-range rectangle.
- ; A width below 4 or a height of 0 returns immediately without writing anything.
- ;
- ; Dispatch: FillColorAVX is used only when all of the following hold, otherwise
- ; FillColorSSE:
- ;   - the start address is 32-byte aligned and the width is a multiple of 8 pixels
- ;   - CPUID.1:ECX reports OSXSAVE (bit 27) and AVX (bit 28)
- ;   - XCR0 shows the OS saves XMM and YMM state (bits 1 and 2)
- ;   - CPUID.(EAX=7,ECX=0):EBX reports AVX2 (bit 5)
- ; NOTE(review): detection runs on every call and cpuid is a serializing instruction;
- ; consider computing the result once at start-up and caching it.
- ; ========================================================================================
- MEMORY$ equ 20000000h ; base of emulator RAM
- VIDEO$ equ 24000000h ; 2048 x 1080 x 4 = 870000h bytes
- SPRITE$ equ VIDEO$+880000h ; 2048 x 64 x 4 = 80000h bytes (512k)
- REU$ equ 25000000h ; 16 MB REU
- global FillColor
- export FillColor
- align 16
- FillColor:
-         ; Reject empty work up front: the workers count stores with dec/jnz, so a
-         ; count of zero would wrap around instead of terminating.
-         cmp     r10d, 4                 ; narrower than one 4-pixel store?
-         jb      .Done
-         test    r11d, r11d              ; zero rows?
-         jz      .Done
-         ; ----- Enter -----
-         ; Three save slots. Assuming the caller's rsp was 16-byte aligned at its call,
-         ; rsp is 8 mod 16 on entry and 16-byte aligned again after this adjustment.
-         sub     rsp, 18h
-         mov     [rsp+00h], rdi          ; rdi/rbp carry the destination pointers
-         mov     [rsp+08h], rbx          ; rbx is overwritten by cpuid
-         mov     [rsp+10h], rbp
-         ; Compute starting dst: base + y*pitch + x*4
-         mov     edi, VIDEO$             ; base
-         shl     r8d, 2                  ; x * 4 bytes per pixel
-         shl     r9d, 13                 ; y * 2000h bytes per row
-         add     edi, r8d
-         add     edi, r9d                ; edi = dst ptr
-         mov     ebp, edi                ; ebp = 1st row ptr
-         ; --------------------------------------
-         ; Geometry check for the 32-byte path
-         ; --------------------------------------
-         test    edi, 31                 ; start address 32-byte aligned?
-         jnz     .use_sse
-         test    r10d, 7                 ; width a whole number of 8-pixel groups?
-         jnz     .use_sse
-         ; --------------------------------------
-         ; CPU + OS feature detection
-         ; --------------------------------------
-         mov     eax, 1
-         cpuid                           ; writes eax, ebx, ecx, edx
-         and     ecx, (1<<27)|(1<<28)    ; OSXSAVE | AVX
-         cmp     ecx, (1<<27)|(1<<28)
-         jne     .use_sse                ; xgetbv below is only legal when OSXSAVE is set
-         xor     ecx, ecx                ; select XCR0
-         xgetbv                          ; edx:eax = XCR0
-         and     eax, 6                  ; bit 1 = XMM state, bit 2 = YMM state
-         cmp     eax, 6
-         jne     .use_sse                ; OS does not preserve YMM registers
-         xor     eax, eax
-         cpuid                           ; eax = highest supported standard leaf
-         cmp     eax, 7
-         jb      .use_sse
-         mov     eax, 7
-         xor     ecx, ecx                ; sub-leaf 0
-         cpuid
-         test    ebx, 1<<5               ; AVX2
-         jz      .use_sse
-         ; --------
-         ; AVX path
-         ; --------
-         call    FillColorAVX
-         jmp     .Exit
-         ; --------
-         ; SSE path
-         ; --------
- .use_sse:
-         call    FillColorSSE
-         ; ----- Exit -----
- .Exit:
-         mov     rdi, [rsp+00h]
-         mov     rbx, [rsp+08h]
-         mov     rbp, [rsp+10h]
-         add     rsp, 18h
- .Done:
-         ret
- ; ========================================================================================
- ; ========================================================================================
- ; FillColorAVX: fill a rectangle with one 32-bit colour, 8 pixels (32 bytes) per store
- ; ----------------------------------------------------------------------------------------
- ; Worker routine. It does not take (x,y) itself; the caller passes ready-made pointers.
- ; In:    edi  = address of the first pixel of the first row
- ;        ebp  = start-of-row address; every following row starts 2000h bytes after
- ;               the previous value of ebp (the caller normally passes ebp = edi)
- ;        r10d = width in pixels; only whole groups of 8 are written (w mod 8 is dropped)
- ;        r11d = height in rows
- ;        r12d = color
- ; Out:   nothing
- ; Clobb: ecx, edx, rdi, rbp, r10, ymm0, flags
- ; Needs: AVX with OS support for YMM state. No AVX2 instruction is used.
- ; The stores are unaligned-capable, so the start address needs no 32-byte alignment.
- ; A width below 8 or a height of 0 writes nothing.
- ; ========================================================================================
- global FillColorAVX
- ;export FillColorAVX
- align 16
- FillColorAVX:
-         shr     r10d, 3                 ; r10d = 32-byte stores per row
-         jz      .done                   ; fewer than 8 pixels: a zero count would wrap dec/jnz
-         test    r11d, r11d
-         jz      .done                   ; zero rows: same wrap hazard for the row counter
-         ; Broadcast color to all 8 dwords of ymm0 using AVX-only instructions
-         vmovd   xmm0, r12d
-         vpshufd xmm0, xmm0, 0           ; color in the 4 dwords of the low lane
-         vinsertf128 ymm0, ymm0, xmm0, 1 ; copy the low lane into the high lane
-         mov     edi, edi                ; zero-extend: [rdi] now addresses exactly what [edi] did
-         ; ----- The Loop -----
-         mov     edx, r11d               ; edx = rows remaining (> 0 here)
- .row:
-         mov     ecx, r10d               ; ecx = stores remaining in this row (> 0 here)
- .store:
-         vmovdqu [rdi], ymm0
-         add     edi, 32                 ; 8 pixels * 4 bytes; 32-bit add keeps rdi zero-extended
-         dec     ecx
-         jnz     .store
-         ; next row
-         add     ebp, 2000h              ; row pitch: 2048 pixels * 4 bytes
-         mov     edi, ebp                ; next dst ptr
-         dec     edx
-         jnz     .row
-         vzeroupper                      ; avoid AVX/SSE transition stalls in the caller's non-VEX code
- .done:
-         ret
- ; ========================================================================================
- ; ========================================================================================
- ; FillColorSSE: fill a rectangle with one 32-bit colour, 4 pixels (16 bytes) per store
- ; ----------------------------------------------------------------------------------------
- ; Worker routine. It does not take (x,y) itself; the caller passes ready-made pointers.
- ; In:    edi  = address of the first pixel of the first row; must be 16-byte aligned
- ;               because the stores use movdqa
- ;        ebp  = start-of-row address; every following row starts 2000h bytes after
- ;               the previous value of ebp (the caller normally passes ebp = edi)
- ;        r10d = width in pixels; only whole groups of 4 are written (w mod 4 is dropped)
- ;        r11d = height in rows
- ;        r12d = color
- ; Out:   nothing
- ; Clobb: ecx, edx, rdi, rbp, r10, xmm0, flags
- ; A width below 4 or a height of 0 writes nothing.
- ; ========================================================================================
- global FillColorSSE
- ;export FillColorSSE
- align 16
- FillColorSSE:
-         shr     r10d, 2                 ; r10d = 16-byte stores per row
-         jz      .done                   ; fewer than 4 pixels: a zero count would wrap dec/jnz
-         test    r11d, r11d
-         jz      .done                   ; zero rows: same wrap hazard for the row counter
-         movd    xmm0, r12d
-         pshufd  xmm0, xmm0, 0           ; replicate color to 4 dwords
-         mov     edi, edi                ; zero-extend: [rdi] now addresses exactly what [edi] did
-         ; ----- The Loop -----
-         mov     edx, r11d               ; edx = rows remaining (> 0 here)
- .row:
-         mov     ecx, r10d               ; ecx = stores remaining in this row (> 0 here)
- .store:
-         movdqa  [rdi], xmm0
-         add     edi, 16                 ; 4 pixels * 4 bytes; 32-bit add keeps rdi zero-extended
-         dec     ecx
-         jnz     .store
-         ; next row
-         add     ebp, 2000h              ; row pitch: 2048 pixels * 4 bytes
-         mov     edi, ebp                ; next dst ptr
-         dec     edx
-         jnz     .row
- .done:
-         ret
- ; ========================================================================================
Advertisement