Advertisement
Guest User

Untitled

a guest
Nov 22nd, 2019
159
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.69 KB | None | 0 0
  ;-----------------------------------------------------------------------
  ; Dense-to-dense vector add, AVX-512, non-temporal ("no cache") result
  ; stores, no tolerance handling.
  ;
  ;   c[i] = a[i] + lambda * b[i]        for i in [0, count)
  ;
  ; Presumed C-equivalent prototype (verify against the caller):
  ;   void _denseToDenseAddAVX512_nocache_64_linux(const double *a,
  ;                                                const double *b,
  ;                                                double *c,
  ;                                                unsigned long count,
  ;                                                double lambda);
  ;
  ; Syntax: NASM (Intel operand order).   ABI: System V AMD64.
  ; In:     rdi  = a       (source, 8-byte elements; any alignment)
  ;         rsi  = b       (source, 8-byte elements; any alignment)
  ;         rdx  = c       (destination; MUST be 64-byte aligned whenever
  ;                         count >= 16, because vmovntpd faults otherwise)
  ;         rcx  = count   (number of doubles; 0 is allowed)
  ;         xmm0 = lambda
  ; Out:    nothing meaningful; rax is left holding count mod 16.
  ; Clobb:  rax, rcx, rdx, rsi, rdi, zmm0-zmm3, zmm7, flags.
  ;         vzeroupper clears the upper bits of zmm0-zmm15 before return.
  ; Stack:  none (leaf routine, no frame, no locals).
  ;
  ; Layout: a main loop handles 16 doubles per iteration (two ZMM vectors),
  ; then a scalar loop handles the remaining count mod 16 elements.
  ; The multiply and the add are separate instructions (no FMA).
  ;-----------------------------------------------------------------------

  global _denseToDenseAddAVX512_nocache_64_linux
  _denseToDenseAddAVX512_nocache_64_linux:

          mov     rax, rcx
          and     eax, 0x0F                   ; rax = count mod 16 (scalar tail length)
          shr     rcx, 4                      ; rcx = number of full 16-double blocks

          vbroadcastsd zmm7, xmm0             ; zmm7 = lambda in all 8 lanes

          test    rcx, rcx
          jz      .tail                       ; fewer than 16 elements: scalar path only

  .block_loop:                                ; invariant: rcx = blocks remaining, > 0
          vmovupd zmm0, [rdi]                 ; a[0..7]   (unaligned-safe load)
          vmulpd  zmm1, zmm7, [rsi]           ; lambda * b[0..7]
          vaddpd  zmm0, zmm0, zmm1            ; a + lambda * b
          vmovntpd [rdx], zmm0                ; streaming store, needs 64-byte alignment

          vmovupd zmm2, [rdi + 64]            ; a[8..15]
          vmulpd  zmm3, zmm7, [rsi + 64]      ; lambda * b[8..15]
          vaddpd  zmm2, zmm2, zmm3            ; a + lambda * b
          vmovntpd [rdx + 64], zmm2

          add     rdi, 128                    ; advance all three pointers by 16 doubles
          add     rsi, 128
          add     rdx, 128

          dec     rcx
          jnz     .block_loop

          sfence                              ; non-temporal stores are weakly ordered:
                                              ; make them globally visible before return

  .tail:
          test    rax, rax
          jz      .done

          mov     rcx, rax                    ; rcx = elements remaining (1..15)

  .scalar_loop:
          ; VEX-encoded scalar ops: legacy SSE here would run with dirty
          ; upper ZMM state and may incur SSE/AVX transition penalties.
          vmovsd  xmm0, [rdi]                 ; a[i]
          vmulsd  xmm1, xmm7, [rsi]           ; lambda * b[i]
          vaddsd  xmm0, xmm0, xmm1            ; a[i] + lambda * b[i]
          vmovsd  [rdx], xmm0                 ; ordinary (cached) store for the tail

          add     rdi, 8
          add     rsi, 8
          add     rdx, 8

          dec     rcx
          jnz     .scalar_loop

  .done:
          vzeroupper                          ; leave clean upper state for SSE callers
          ret
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement