Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
;-----------------------------------------------------------------------
; void _denseToDenseAddAVX512_nocache_64_linux(const double *a,
;                                              const double *b,
;                                              double       *c,
;                                              size_t        count,
;                                              double        lambda)
;
; Dense to dense, AVX-512, without tolerances, without cache:
;       c[i] = a[i] + lambda * b[i]        for 0 <= i < count
;
; The product and the sum are rounded separately (vmulpd + vaddpd, no FMA).
; Full 16-element blocks are written with non-temporal stores (vmovntpd),
; i.e. the result bypasses the cache; the remaining count % 16 elements
; are written with ordinary scalar stores.
;
; Syntax: NASM (Intel operand order)
; ABI:    System V AMD64, leaf function, no stack usage
; In:     rdi  = a
;         rsi  = b
;         rdx  = c      (must be 64-byte aligned when count >= 16:
;                        the 512-bit vmovntpd faults on unaligned memory)
;         rcx  = count  (number of doubles)
;         xmm0 = lambda
; Out:    nothing
; Clobb:  rax, rcx, rdx, rsi, rdi, flags, zmm0-zmm3, zmm7
;         (vzeroupper at exit clears bits 128+ of zmm0-zmm15)
;-----------------------------------------------------------------------
global _denseToDenseAddAVX512_nocache_64_linux
_denseToDenseAddAVX512_nocache_64_linux:
.ZMM_BYTES      equ 64                  ; bytes in one zmm register (8 doubles)
.BLOCK_BYTES    equ 2 * .ZMM_BYTES      ; bytes consumed per block iteration
.BLOCK_SHIFT    equ 4                   ; log2(doubles per block) = log2(16)
.BLOCK_MASK     equ 0x0F                ; doubles per block - 1
.DOUBLE_BYTES   equ 8

        mov     rax, rcx
        shr     rcx, .BLOCK_SHIFT       ; rcx = number of full 16-double blocks
        and     eax, .BLOCK_MASK        ; rax = leftover doubles (count % 16)
        vbroadcastsd zmm7, xmm0         ; zmm7 = lambda in every lane
        test    rcx, rcx
        jz      .tail_check

.block_loop:                            ; invariant: rcx > 0 blocks remaining
        vmovupd zmm0, [rdi]                     ; a[0..7]
        vmulpd  zmm1, zmm7, [rsi]               ; lambda * b[0..7]
        vaddpd  zmm0, zmm0, zmm1                ; c = a + lambda * b
        vmovntpd [rdx], zmm0

        vmovupd zmm2, [rdi + .ZMM_BYTES]        ; a[8..15]
        vmulpd  zmm3, zmm7, [rsi + .ZMM_BYTES]  ; lambda * b[8..15]
        vaddpd  zmm2, zmm2, zmm3                ; c = a + lambda * b
        vmovntpd [rdx + .ZMM_BYTES], zmm2

        add     rdi, .BLOCK_BYTES
        add     rsi, .BLOCK_BYTES
        add     rdx, .BLOCK_BYTES
        dec     rcx
        jnz     .block_loop

        sfence                          ; non-temporal stores are weakly ordered:
                                        ; make them visible before any later store

.tail_check:
        test    eax, eax
        jz      .done

.tail_loop:                             ; invariant: eax > 0 doubles remaining
        ; VEX-encoded scalar ops: no legacy-SSE instruction is executed while
        ; the upper zmm state is dirty.
        vmovsd  xmm0, [rdi]                     ; a[i]
        vmulsd  xmm1, xmm7, [rsi]               ; lambda * b[i]
        vaddsd  xmm0, xmm0, xmm1                ; c = a + lambda * b
        vmovsd  [rdx], xmm0

        add     rdi, .DOUBLE_BYTES
        add     rsi, .DOUBLE_BYTES
        add     rdx, .DOUBLE_BYTES
        dec     eax
        jnz     .tail_loop

.done:
        vzeroupper                      ; leave clean upper state for SSE code in the caller
        ret
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement