sol128.asm

section .text
global pix
extern pixtime


    ; 0x0,%1:%2 += 0x0,%3:%4, fp
%macro add128 4
    add %2, %4
    adc %1, %3
%endmacro

    ; 0x0,%1:%2 -= 0x0,%3:%4, fp
%macro sub128 4
    sub %2, %4
    sbb %1, %3
%endmacro

    ; %1:%2 <<= %3
%macro shl128 3
    shld %1, %2, %3
    shl %2, %3
%endmacro

    ; %1:%2 >>= %3
%macro shr128 3
    shrd %2, %1, %3
    shr %1, %3
%endmacro


    ; %1:%2 = rdi * rbp:rax + rdx
    ; (n_1 2^B + n_2) = d (q_1 2^B + q_2) + r
%macro div128_64 2
    xor edx, edx
    mov rax, %1
    div rsi                     ; n_1 = d q_1 + \hat{r}
    mov rbp, rax                ; save q_1
    mov rax, %2
    div rsi                     ; 2^B * \hat{r} + n_2 = d q_2 + r
%endmacro


    ; computes 16^{8m-n} \mod{8n+q}. operates in 64-bit regime. in order to be
    ; correct, we require (M-1)^2 < 2^64, or (8m+5)^2 < 64, or m ~< 550*10^6 or so
    ; uses conventions from `series_part`:
    ; - 8m-n in rdi, 8n+q in rsi (neither are to be modified);
    ; - rbx, r8..r12 are not to be modified.
    ; the result is in rcx
%macro powmod16 0
    mov eax, 16                 ; base <- 16
    mov ecx, 1                  ; res <- 1
    mov r13, rdi                ; \vareps <- 8m-n

%%loop_vareps:
    test r13, r13
    je %%loop_vareps_end        ; if \vareps = 0, break

    bt r13, 0
    jnc %%skip_update_res       ; if exp & 1 == 1, we need to update res
    xchg rcx, rax               ; swap base and res (we need res in rax)
    imul rcx                    ; res *= base
    xor edx, edx                ; clear rdx for div/mul
    div rsi                     ; rdx <- res % mod
    mov rax, rcx
    mov rcx, rdx                ; rax <- base, r8 <- res, i.e. as before
%%skip_update_res:

    mul rax                     ; base *= base
    xor edx, edx                ; clear rdx for div/mul
    div rsi                     ; rdx <- base % mod
    mov rax, rdx                ; base <- rdx = base % mod

    shr r13, 1                  ; exp /= 2
    jmp %%loop_vareps
%%loop_vareps_end:
%endmacro


    ; computes 128 upper bits of the fractional part of
    ; 2^{32m} \sum_{n=0}^\infty \frac{1}{16^m} \frac{1}{8n+q}
    ; parameters:
    ; - 8m [rdi];
    ; - q [rsi].
    ; modifies: all volatile registers
    ; returns in r11:r12
series_part:
    ; rdi: 8m-n, rsi:8n+q in the procedure
    xor ebx, ebx                ; n <- 0
    xor r11d, r11d
    xor r12d, r12d              ;  r11:r12 =: res <- 0


    ; series for n=0..8m
.loop_prim:
    cmp rbx, r10
    jg .loop_prim_end           ; if n > 8m (8m saved from `series`), break

    powmod16                    ; now rcx = 16^{8m-n} \mod{8n+q}
    div128_64 -1, -1            ; 2^128-1 = (8n+q) rbp:rax + rdx

    ; here we mul rbp:rax and rcx
    mul rcx                     ; now rdx:rax = rax * rcx
    imul rcx, rbp               ; now rcx is lower part of rbp * rax
    add rdx, rcx                ; now rdx:rax = rbp:rax * rcx

    add128 r11, r12, rdx, rax   ; res += (16^{8m-n} \mod{8n+q})/(8n+q), fp

    inc rbx                     ; ++n
    dec rdi                     ; adjust 8m-n
    add rsi, 8                  ; adjust 8n+q
    jmp .loop_prim
.loop_prim_end:

    ; series for n=8m+1..\infty (or rather while the terms are >=2^{-128})
    mov r13, 0x1000000000000000
    xor r14d, r14d              ; r13:r14 <- 2^124 = 2^128/16^{n-8m}
.loop_pert:

    test r13, r13
    jne .skip_check
    test r14, r14
    je .loop_pert_end           ; if r13 = r14 = 0, r13:r14 = 0: break
.skip_check:

    div128_64 r13, r14          ; r13:r14 = (8n+q) rbp:rax + rdx
    add128 r11, r12, rbp, rax   ; res += 1/(16^{n-8m}(8n+q)), fp

    ; ++n
    add rsi, 8                  ; adjust 8n+q
    shr128 r13, r14, 4          ; adjust r13:r14
    jmp .loop_pert
.loop_pert_end:

    ret


    ; computes* digits 32m..32m+63 of \pi (* with possible errors)
    ; parameters:
    ; - m [rdi]: m from the description.
    ; returns: digits 32m..32m+63 of \pi
    ; modifies: all volatile registers
series:
    xor r8d, r8d
    xor r9d, r9d                ; r8:r9 =: res <- 0
    shl rdi, 3                  ; m *= 8
    mov r10, rdi                ; save 8m

    mov esi, 1
    call series_part
    shl128 r11, r12, 2
    add128 r8, r9, r11, r12     ; res = fp(res + 4 series_part(8m+1, 1)) (S(m, q) henceforth)

    mov rdi, r10
    mov esi, 4
    call series_part
    shl128 r11, r12, 1
    sub128 r8, r9, r11, r12     ; res = fp(res - 2 S(m, 4))

    mov rdi, r10
    mov esi, 5
    call series_part
    sub128 r8, r9, r11, r12     ; res = fp(res - S(m, 5))

    mov rdi, r10
    mov esi, 6
    call series_part
    sub128 r8, r9, r11, r12     ; res = fp(res - S(m, 6))

    mov rax, r8                 ; move upper 64 bits into rax
    ret


    ; invokes pixtime with the result of rdtsc
    ; modifies: rdi, rax, rdx
run_pixtime:
    rdtsc                       ; call rdtsc - result is edx:eax, where upper parts are 0
    shl rdx, 32                 ; move ebx part of the result from the lower half of rdx into the upper
    lea rdi, [rdx+rax]          ; now rdx+rax is real result - move it into 1st parameter slot
    lea rsp, [rsp-0x8]          ; need to align rsp for ABI once again
    call pixtime
    lea rsp, [rsp+0x8]          ; realign rsp
    ret


    ; an implementation of pix
    ; parameters:
    ; - ppi [rdi]: ptr to result array;
    ; - pidx [rsi]: ptr to index;
    ; - max [rdx]: max index.
    ; modifies: all volatile registers
pix:
    ; save args (we will need registers)
    push rdi
    push rsi
    push rdx

    call run_pixtime

    ; save non-volatiles
    push rbx
    push rbp
    push r12
    push r13
    push r14

    ; positions of args wrt rsp (+)
    MAX equ 0x28
    PIDX_PTR equ MAX+0x8
    PPI_PTR equ PIDX_PTR+0x8

.loop_m:
    mov edi, 2                  ; we shall increment *pidx by 2 (i.e. do 64 bits at once)
    mov rcx, [rsp+PIDX_PTR]     ; load pidx into rcx
    lock xadd [rcx], rdi        ; load m = (*pidx += 2) atomically into rdi
    cmp rdi, [rsp+MAX]
    jge .loop_m_end             ; if m >= max, end

    push rdi                    ; save m
    call series                 ; compute digits 32m..32m+63 into rax
    pop rdi                     ; restore m

    mov rcx, [rsp+PPI_PTR]      ; load ppi into rcx

    cmp rdi, [rsp+MAX]
    jge .skip_upper
    mov DWORD [rcx+4*rdi+4], eax; if m+1 < max, load the lower part into ppi[m+1]
.skip_upper:
    shr rax, 32                 ; move the higher part into the lower
    mov DWORD [rcx+4*rdi], eax  ; load eax into ppi[m]

    jmp .loop_m
.loop_m_end:
    lea rsp, [rsp-0x8]          ; align stack for ABI
    call run_pixtime
    lea rsp, [rsp+0x8]

    ; restore non-volatiles
    pop r14
    pop r13
    pop r12
    pop rbp
    pop rbx

    lea rsp, [rsp+0x18]         ; skip saved args
    ret