Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
; BBP (Bailey-Borwein-Plouffe) hex-digit extractor for pi.
; NASM syntax, x86-64, System V AMD64 ABI.
; Exports `pix`; expects an external `pixtime` (called with an rdtsc stamp).
section .text
global pix
extern pixtime
; 128-bit add: 0x0,%1:%2 += 0x0,%3:%4 (hi:lo register pairs), fixed-point.
; Low halves first so `adc` propagates the carry into the high halves;
; the instruction order is load-bearing. Clobbers flags only.
%macro add128 4
    add     %2, %4              ; lo += lo, sets CF
    adc     %1, %3              ; hi += hi + CF
%endmacro
; 128-bit subtract: 0x0,%1:%2 -= 0x0,%3:%4 (hi:lo register pairs), fixed-point.
; Low halves first so `sbb` propagates the borrow into the high halves;
; the instruction order is load-bearing. Clobbers flags only.
%macro sub128 4
    sub     %2, %4              ; lo -= lo, sets CF (borrow)
    sbb     %1, %3              ; hi -= hi + borrow
%endmacro
- ; %1:%2 <<= %3
- %macro shl128 3
- shld %1, %2, %3
- shl %2, %3
- %endmacro
- ; %1:%2 >>= %3
- %macro shr128 3
- shrd %2, %1, %3
- shr %1, %3
- %endmacro
; 128-bit / 64-bit long division: rbp:rax <- %1:%2 / rsi, rdx <- %1:%2 % rsi,
; i.e. %1:%2 = rsi * rbp:rax + rdx.  (The divisor is rsi — the original
; header comment said rdi, which is wrong; rdi is never read here.)
; Schoolbook division in base 2^B, B = 64:
;   n_1 2^B + n_2 = d (q_1 2^B + q_2) + r
; Neither `div` can #DE-overflow: the first runs with rdx = 0, the second
; with rdx = \hat{r} < d.
; Clobbers: rax, rdx, rbp, flags.
%macro div128_64 2
    xor     edx, edx            ; rdx:rax <- 0:n_1
    mov     rax, %1
    div     rsi                 ; n_1 = d q_1 + \hat{r}
    mov     rbp, rax            ; save q_1
    mov     rax, %2             ; rdx:rax <- \hat{r}:n_2
    div     rsi                 ; 2^B \hat{r} + n_2 = d q_2 + r
%endmacro
; computes 16^{8m-n} \mod{8n+q} by binary (square-and-multiply) exponentiation,
; entirely in the 64-bit regime. in order to be correct, we require
; (M-1)^2 < 2^64 with M = 8n+q, i.e. (8m+5)^2 < 2^64, or m ~< 536*10^6 or so.
; uses conventions from `series_part`:
; - 8m-n in rdi, 8n+q in rsi (neither is modified);
; - rbx, r8..r12 are not modified (but r13 IS used as exponent scratch).
; the result is in rcx; clobbers rax, rdx, rcx, r13, flags.
%macro powmod16 0
    mov     eax, 16             ; base <- 16
    mov     ecx, 1              ; res <- 1
    mov     r13, rdi            ; \vareps (remaining exponent) <- 8m-n
%%loop_vareps:
    test    r13, r13
    je      %%loop_vareps_end   ; if \vareps = 0, break
    bt      r13, 0
    jnc     %%skip_update_res   ; if exp & 1 == 1, we need to fold base into res
    xchg    rcx, rax            ; swap base and res (mul needs res in rax)
    imul    rcx                 ; rdx:rax = res * base; product < 2^64 by the
                                ; (M-1)^2 < 2^64 constraint, so rax holds it all
    xor     edx, edx            ; discard imul's sign-extension of rdx so that
                                ; rdx:rax is the true unsigned product
    div     rsi                 ; rdx <- (res * base) % mod
    mov     rax, rcx
    mov     rcx, rdx            ; rax <- base, rcx <- res, i.e. as before the swap
%%skip_update_res:
    mul     rax                 ; rdx:rax = base^2 (< 2^64 by the constraint)
    xor     edx, edx            ; clear rdx (defensive) before the div
    div     rsi                 ; rdx <- base^2 % mod
    mov     rax, rdx            ; base <- base^2 % mod
    shr     r13, 1              ; exp /= 2
    jmp     %%loop_vareps
%%loop_vareps_end:
%endmacro
; computes the 128 upper bits of the fractional part of one BBP partial series
;   2^{32m} \sum_{n=0}^\infty \frac{1}{16^n} \frac{1}{8n+q}
; parameters:
; - 8m [rdi] (the SAME value must also sit in r10 — `series` arranges this);
; - q [rsi].
; returns the fixed-point fractional part in r11:r12.
; modifies: all volatile registers, PLUS the callee-saved rbx, rbp, r13, r14
;   (the exported entry point `pix` saves those on this file's behalf).
series_part:
    ; loop invariants: rdi = 8m-n, rsi = 8n+q throughout the procedure
    xor     ebx, ebx            ; n <- 0
    xor     r11d, r11d
    xor     r12d, r12d          ; r11:r12 =: res <- 0
    ; principal part: n = 0..8m, where 16^{8m-n} >= 1 and we must reduce mod 8n+q
.loop_prim:
    cmp     rbx, r10
    jg      .loop_prim_end      ; if n > 8m (8m saved in r10 by `series`), break
    powmod16                    ; now rcx = 16^{8m-n} \mod{8n+q}
    div128_64 -1, -1            ; 2^128-1 = (8n+q) rbp:rax + rdx, so rbp:rax is
                                ; 1/(8n+q) as a 128-bit fixed-point fraction
    ; multiply rbp:rax by rcx, keeping only the low 128 bits of the product
    mul     rcx                 ; now rdx:rax = rax * rcx
    imul    rcx, rbp            ; now rcx is the lower part of rbp * rcx
    add     rdx, rcx            ; now rdx:rax = low 128 bits of rbp:rax * rcx
    add128  r11, r12, rdx, rax  ; res += (16^{8m-n} \mod{8n+q})/(8n+q), fp
    inc     rbx                 ; ++n
    dec     rdi                 ; maintain rdi = 8m-n
    add     rsi, 8              ; maintain rsi = 8n+q
    jmp     .loop_prim
.loop_prim_end:
    ; tail: n = 8m+1..\infty, summed only while the terms are >= 2^{-128}
    mov     r13, 0x1000000000000000
    xor     r14d, r14d          ; r13:r14 <- 2^124 = 2^128/16^{n-8m} at n = 8m+1
.loop_pert:
    test    r13, r13
    jne     .skip_check
    test    r14, r14
    je      .loop_pert_end      ; if r13:r14 = 0, remaining terms < 2^{-128}: break
.skip_check:
    div128_64 r13, r14          ; r13:r14 = (8n+q) rbp:rax + rdx
    add128  r11, r12, rbp, rax  ; res += 1/(16^{n-8m}(8n+q)), fp
    ; ++n
    add     rsi, 8              ; maintain rsi = 8n+q
    shr128  r13, r14, 4         ; r13:r14 /= 16 for the next term
    jmp     .loop_pert
.loop_pert_end:
    ret
; computes* hex digits 32m..32m+63 of \pi via the BBP formula
;   \pi = \sum_n 16^{-n} (4/(8n+1) - 2/(8n+4) - 1/(8n+5) - 1/(8n+6))
; (* with possible errors in the last digits from truncation/carry).
; parameters:
; - m [rdi]: m from the description.
; returns: digits 32m..32m+63 of \pi in rax.
; modifies: all volatile registers, plus rbx, rbp, r13, r14 via series_part;
;   r10 carries 8m across the four series_part calls (series_part reads it).
series:
    xor     r8d, r8d
    xor     r9d, r9d            ; r8:r9 =: res <- 0
    shl     rdi, 3              ; m *= 8
    mov     r10, rdi            ; save 8m (read back by series_part's loop bound)
    mov     esi, 1
    call    series_part
    shl128  r11, r12, 2
    add128  r8, r9, r11, r12    ; res = fp(res + 4 S(m, 1)) (S(m, q) := series_part)
    mov     rdi, r10
    mov     esi, 4
    call    series_part
    shl128  r11, r12, 1
    sub128  r8, r9, r11, r12    ; res = fp(res - 2 S(m, 4))
    mov     rdi, r10
    mov     esi, 5
    call    series_part
    sub128  r8, r9, r11, r12    ; res = fp(res - S(m, 5))
    mov     rdi, r10
    mov     esi, 6
    call    series_part
    sub128  r8, r9, r11, r12    ; res = fp(res - S(m, 6))
    mov     rax, r8             ; return the upper 64 bits of the fraction
    ret
; invokes pixtime with the result of rdtsc as its single argument.
; modifies: rdi, rax, rdx (plus whatever pixtime itself clobbers)
run_pixtime:
    rdtsc                       ; timestamp into edx:eax; upper 32 bits of both are 0
    shl     rdx, 32             ; move the edx part into the upper half of rdx
    lea     rdi, [rdx+rax]      ; rdx+rax is the full 64-bit stamp - 1st arg slot
    lea     rsp, [rsp-0x8]      ; re-align rsp to 16 for the ABI before the call
    call    pixtime
    lea     rsp, [rsp+0x8]      ; undo the alignment adjustment
    ret
; an implementation of pix: worker loop that atomically claims indices from
; *pidx and fills ppi[] with 32-bit chunks of \pi's hex digits.
; parameters (SysV AMD64):
; - ppi [rdi]: ptr to result array of 32-bit words;
; - pidx [rsi]: ptr to the shared (atomically incremented) index;
; - max [rdx]: number of entries in ppi / exclusive index bound.
; modifies: all volatile registers.
; BUGFIX: the guard before storing ppi[m+1] used to test m >= max, but the
; store needs m+1 < max (as its own comment said); with an odd max and
; m = max-1 it wrote 4 bytes past the end of ppi. We now compare m+1.
pix:
    ; save args (we will need the registers)
    push    rdi
    push    rsi
    push    rdx
    call    run_pixtime
    ; save callee-saved registers clobbered by series/series_part
    push    rbx
    push    rbp
    push    r12
    push    r13
    push    r14
    ; positions of the saved args relative to rsp
    MAX      equ 0x28           ; 5 saved registers * 8 bytes above us
    PIDX_PTR equ MAX+0x8
    PPI_PTR  equ PIDX_PTR+0x8
.loop_m:
    mov     edi, 2              ; we increment *pidx by 2 (64 digits at once)
    mov     rcx, [rsp+PIDX_PTR] ; load pidx into rcx
    lock xadd [rcx], rdi        ; m = (*pidx += 2) - 2, atomically, into rdi
    cmp     rdi, [rsp+MAX]
    jge     .loop_m_end         ; if m >= max, we are done
    push    rdi                 ; save m across the call
    call    series              ; compute digits 32m..32m+63 into rax
    pop     rdi                 ; restore m
    mov     rcx, [rsp+PPI_PTR]  ; load ppi into rcx
    lea     rdx, [rdi+1]        ; rdx = m+1 (rdx is free scratch after `series`)
    cmp     rdx, [rsp+MAX]
    jge     .skip_upper         ; only store ppi[m+1] when m+1 < max
    mov     DWORD [rcx+4*rdi+4], eax ; lower half of rax -> ppi[m+1]
.skip_upper:
    shr     rax, 32             ; move the higher half into the lower
    mov     DWORD [rcx+4*rdi], eax ; -> ppi[m]
    jmp     .loop_m
.loop_m_end:
    lea     rsp, [rsp-0x8]      ; align stack for the ABI
    call    run_pixtime
    lea     rsp, [rsp+0x8]
    ; restore callee-saved registers
    pop     r14
    pop     r13
    pop     r12
    pop     rbp
    pop     rbx
    lea     rsp, [rsp+0x18]     ; drop the three saved args
    ret
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement