Advertisement
thezer0th

sol128.asm

Mar 29th, 2020
238
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.89 KB | None | 0 0
  1. section .text
  2. global pix
  3. extern pixtime
  4.  
  5.  
  6. ; 0x0,%1:%2 += 0x0,%3:%4, fp
  7. %macro add128 4
  8. add %2, %4
  9. adc %1, %3
  10. %endmacro
  11.  
  12. ; 0x0,%1:%2 -= 0x0,%3:%4, fp
  13. %macro sub128 4
  14. sub %2, %4
  15. sbb %1, %3
  16. %endmacro
  17.  
  18. ; %1:%2 <<= %3
  19. %macro shl128 3
  20. shld %1, %2, %3
  21. shl %2, %3
  22. %endmacro
  23.  
  24. ; %1:%2 >>= %3
  25. %macro shr128 3
  26. shrd %2, %1, %3
  27. shr %1, %3
  28. %endmacro
  29.  
  30.  
  31. ; %1:%2 = rdi * rbp:rax + rdx
  32. ; (n_1 2^B + n_2) = d (q_1 2^B + q_2) + r
  33. %macro div128_64 2
  34. xor edx, edx
  35. mov rax, %1
  36. div rsi ; n_1 = d q_1 + \hat{r}
  37. mov rbp, rax ; save q_1
  38. mov rax, %2
  39. div rsi ; 2^B * \hat{r} + n_2 = d q_2 + r
  40. %endmacro
  41.  
  42.  
  43. ; computes 16^{8m-n} \mod{8n+q}. operates in 64-bit regime. in order to be
  44. ; correct, we require (M-1)^2 < 2^64, or (8m+5)^2 < 64, or m ~< 550*10^6 or so
  45. ; uses conventions from `series_part`:
  46. ; - 8m-n in rdi, 8n+q in rsi (neither are to be modified);
  47. ; - rbx, r8..r12 are not to be modified.
  48. ; the result is in rcx
  49. %macro powmod16 0
  50. mov eax, 16 ; base <- 16
  51. mov ecx, 1 ; res <- 1
  52. mov r13, rdi ; \vareps <- 8m-n
  53.  
  54. %%loop_vareps:
  55. test r13, r13
  56. je %%loop_vareps_end ; if \vareps = 0, break
  57.  
  58. bt r13, 0
  59. jnc %%skip_update_res ; if exp & 1 == 1, we need to update res
  60. xchg rcx, rax ; swap base and res (we need res in rax)
  61. imul rcx ; res *= base
  62. xor edx, edx ; clear rdx for div/mul
  63. div rsi ; rdx <- res % mod
  64. mov rax, rcx
  65. mov rcx, rdx ; rax <- base, r8 <- res, i.e. as before
  66. %%skip_update_res:
  67.  
  68. mul rax ; base *= base
  69. xor edx, edx ; clear rdx for div/mul
  70. div rsi ; rdx <- base % mod
  71. mov rax, rdx ; base <- rdx = base % mod
  72.  
  73. shr r13, 1 ; exp /= 2
  74. jmp %%loop_vareps
  75. %%loop_vareps_end:
  76. %endmacro
  77.  
  78.  
  79. ; computes 128 upper bits of the fractional part of
  80. ; 2^{32m} \sum_{n=0}^\infty \frac{1}{16^m} \frac{1}{8n+q}
  81. ; parameters:
  82. ; - 8m [rdi];
  83. ; - q [rsi].
  84. ; modifies: all volatile registers
  85. ; returns in r11:r12
  86. series_part:
  87. ; rdi: 8m-n, rsi:8n+q in the procedure
  88. xor ebx, ebx ; n <- 0
  89. xor r11d, r11d
  90. xor r12d, r12d ; r11:r12 =: res <- 0
  91.  
  92.  
  93. ; series for n=0..8m
  94. .loop_prim:
  95. cmp rbx, r10
  96. jg .loop_prim_end ; if n > 8m (8m saved from `series`), break
  97.  
  98. powmod16 ; now rcx = 16^{8m-n} \mod{8n+q}
  99. div128_64 -1, -1 ; 2^128-1 = (8n+q) rbp:rax + rdx
  100.  
  101. ; here we mul rbp:rax and rcx
  102. mul rcx ; now rdx:rax = rax * rcx
  103. imul rcx, rbp ; now rcx is lower part of rbp * rax
  104. add rdx, rcx ; now rdx:rax = rbp:rax * rcx
  105.  
  106. add128 r11, r12, rdx, rax ; res += (16^{8m-n} \mod{8n+q})/(8n+q), fp
  107.  
  108. inc rbx ; ++n
  109. dec rdi ; adjust 8m-n
  110. add rsi, 8 ; adjust 8n+q
  111. jmp .loop_prim
  112. .loop_prim_end:
  113.  
  114. ; series for n=8m+1..\infty (or rather while the terms are >=2^{-128})
  115. mov r13, 0x1000000000000000
  116. xor r14d, r14d ; r13:r14 <- 2^124 = 2^128/16^{n-8m}
  117. .loop_pert:
  118.  
  119. test r13, r13
  120. jne .skip_check
  121. test r14, r14
  122. je .loop_pert_end ; if r13 = r14 = 0, r13:r14 = 0: break
  123. .skip_check:
  124.  
  125. div128_64 r13, r14 ; r13:r14 = (8n+q) rbp:rax + rdx
  126. add128 r11, r12, rbp, rax ; res += 1/(16^{n-8m}(8n+q)), fp
  127.  
  128. ; ++n
  129. add rsi, 8 ; adjust 8n+q
  130. shr128 r13, r14, 4 ; adjust r13:r14
  131. jmp .loop_pert
  132. .loop_pert_end:
  133.  
  134. ret
  135.  
  136.  
  137. ; computes* digits 32m..32m+63 of \pi (* with possible errors)
  138. ; parameters:
  139. ; - m [rdi]: m from the description.
  140. ; returns: digits 32m..32m+63 of \pi
  141. ; modifies: all volatile registers
  142. series:
  143. xor r8d, r8d
  144. xor r9d, r9d ; r8:r9 =: res <- 0
  145. shl rdi, 3 ; m *= 8
  146. mov r10, rdi ; save 8m
  147.  
  148. mov esi, 1
  149. call series_part
  150. shl128 r11, r12, 2
  151. add128 r8, r9, r11, r12 ; res = fp(res + 4 series_part(8m+1, 1)) (S(m, q) henceforth)
  152.  
  153. mov rdi, r10
  154. mov esi, 4
  155. call series_part
  156. shl128 r11, r12, 1
  157. sub128 r8, r9, r11, r12 ; res = fp(res - 2 S(m, 4))
  158.  
  159. mov rdi, r10
  160. mov esi, 5
  161. call series_part
  162. sub128 r8, r9, r11, r12 ; res = fp(res - S(m, 5))
  163.  
  164. mov rdi, r10
  165. mov esi, 6
  166. call series_part
  167. sub128 r8, r9, r11, r12 ; res = fp(res - S(m, 6))
  168.  
  169. mov rax, r8 ; move upper 64 bits into rax
  170. ret
  171.  
  172.  
  173. ; invokes pixtime with the result of rdtsc
  174. ; modifies: rdi, rax, rdx
  175. run_pixtime:
  176. rdtsc ; call rdtsc - result is edx:eax, where upper parts are 0
  177. shl rdx, 32 ; move ebx part of the result from the lower half of rdx into the upper
  178. lea rdi, [rdx+rax] ; now rdx+rax is real result - move it into 1st parameter slot
  179. lea rsp, [rsp-0x8] ; need to align rsp for ABI once again
  180. call pixtime
  181. lea rsp, [rsp+0x8] ; realign rsp
  182. ret
  183.  
  184.  
  185. ; an implementation of pix
  186. ; parameters:
  187. ; - ppi [rdi]: ptr to result array;
  188. ; - pidx [rsi]: ptr to index;
  189. ; - max [rdx]: max index.
  190. ; modifies: all volatile registers
  191. pix:
  192. ; save args (we will need registers)
  193. push rdi
  194. push rsi
  195. push rdx
  196.  
  197. call run_pixtime
  198.  
  199. ; save non-volatiles
  200. push rbx
  201. push rbp
  202. push r12
  203. push r13
  204. push r14
  205.  
  206. ; positions of args wrt rsp (+)
  207. MAX equ 0x28
  208. PIDX_PTR equ MAX+0x8
  209. PPI_PTR equ PIDX_PTR+0x8
  210.  
  211. .loop_m:
  212. mov edi, 2 ; we shall increment *pidx by 2 (i.e. do 64 bits at once)
  213. mov rcx, [rsp+PIDX_PTR] ; load pidx into rcx
  214. lock xadd [rcx], rdi ; load m = (*pidx += 2) atomically into rdi
  215. cmp rdi, [rsp+MAX]
  216. jge .loop_m_end ; if m >= max, end
  217.  
  218. push rdi ; save m
  219. call series ; compute digits 32m..32m+63 into rax
  220. pop rdi ; restore m
  221.  
  222. mov rcx, [rsp+PPI_PTR] ; load ppi into rcx
  223.  
  224. cmp rdi, [rsp+MAX]
  225. jge .skip_upper
  226. mov DWORD [rcx+4*rdi+4], eax; if m+1 < max, load the lower part into ppi[m+1]
  227. .skip_upper:
  228. shr rax, 32 ; move the higher part into the lower
  229. mov DWORD [rcx+4*rdi], eax ; load eax into ppi[m]
  230.  
  231. jmp .loop_m
  232. .loop_m_end:
  233. lea rsp, [rsp-0x8] ; align stack for ABI
  234. call run_pixtime
  235. lea rsp, [rsp+0x8]
  236.  
  237. ; restore non-volatiles
  238. pop r14
  239. pop r13
  240. pop r12
  241. pop rbp
  242. pop rbx
  243.  
  244. lea rsp, [rsp+0x18] ; skip saved args
  245. ret
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement