Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
; Byte offset of the last array element (N*8 - 8). With the value below the
; routine reads and writes exactly 999 doubles per array.
FDM_LAST_OFFSET equ 7984

;-----------------------------------------------------------------------
; FDM_unaligned
;
; Runs rcx time steps of a three-point stencil over two ping-pong buffers
; of doubles ("alt" = old values, "neu" = new values). The buffers are
; swapped after every step.
;
; Per step:
;   neu[0]   = alt[1] * c3
;   neu[i]   = half*alt[i] + quarter*(alt[i-1] + alt[i+1]),  0 < i < N-1
;   neu[N-1] = alt[N-2] + c2
;
; In:    rcx = number of time steps (0 -> returns immediately)
;        rdx = alt buffer, must be 16-byte aligned (movapd loads)
;        r8  = neu buffer, must be 16-byte aligned (movapd stores)
;        The register assignment matches the Microsoft x64 argument
;        order - presumably called from C/C++; confirm against the caller.
; Out:   none in registers; after an odd number of steps the latest values
;        are in the buffer originally passed in r8, after an even number
;        in the one originally passed in rdx.
; Clobb: rax, rcx, rdx, r8, r9, r10, xmm0-xmm5, flags
;
; NOTE(review): c2, c3, half, quarter and reg6..reg14 are defined outside
; this block. They are accessed with movapd, so they are assumed to be
; 16-byte aligned 128-bit memory operands; half and quarter are used with
; packed multiplies and therefore need the factor in both lanes. Saving
; xmm6/7/12/13/14 to the fixed reg* locations instead of the stack means
; concurrent or re-entrant calls would overwrite each other's saved values.
;-----------------------------------------------------------------------
FDM_unaligned proc
        ; Without this guard a step count of 0 would wrap to 2^64-1 at the
        ; "sub rcx, 1 / jnz" below and the loop would effectively never end.
        test    rcx, rcx
        jz      NoSteps

        ; preserve the callee-saved xmm registers used below
        movapd  reg6, xmm6
        movapd  reg7, xmm7
        movapd  reg12, xmm12
        movapd  reg13, xmm13
        movapd  reg14, xmm14

        ; load constants
        movapd  xmm5, c2                ; added to form neu[N-1]
        movapd  xmm14, c3               ; factor for neu[0]
        movapd  xmm13, half             ; weight of the centre element
        movapd  xmm12, quarter          ; weight of each neighbour

        mov     r9, rdx                 ; r9  = base of alt for this step
        mov     r10, r8                 ; r10 = base of neu for this step

TimeLoop:
        ; rax = address at which the inner loop stops. The loop exits on
        ; exact equality, which works because rdx starts 16 bytes into the
        ; buffer and (FDM_LAST_OFFSET - 16) is a multiple of the 32-byte stride.
        lea     rax, [rdx + FDM_LAST_OFFSET]

        ; ---- first two elements -------------------------------------
        movapd  xmm0, [rdx]             ; alt[0,1]
        movupd  xmm1, [rdx+8]           ; alt[1,2] -> reused by the inner loop
        movsd   xmm7, xmm1              ; xmm7.lo = alt[1]
        ; NOTE(review): the original comment read "neu[0]=alt[1]/c3", but the
        ; instruction multiplies. Either c3 stores a reciprocal or that
        ; comment was wrong - confirm against the definition of c3.
        mulsd   xmm7, xmm14             ; neu[0] = alt[1] * c3
        movhlps xmm6, xmm1              ; xmm6.lo = alt[2]
        addsd   xmm6, xmm0              ; alt[0] + alt[2]
        mulsd   xmm6, xmm12             ; quarter * (alt[0] + alt[2])
        movsd   xmm0, xmm1              ; xmm0.lo now holds alt[1]
        mulsd   xmm0, xmm13             ; half * alt[1]
        addsd   xmm0, xmm6              ; = neu[1]
        movlhps xmm7, xmm0              ; xmm7 = { neu[0], neu[1] }
        movapd  xmmword ptr [r8], xmm7
        add     rdx, 16
        add     r8, 16

        ; ---- interior: four elements per iteration ------------------
        ; On entry to every iteration xmm1 holds the pair that starts one
        ; element before [rdx] (the left neighbours of the first result pair).
InnerLoop:
        movupd  xmm2, [rdx+8]           ; right neighbours of pair 1
        movapd  xmm4, [rdx+16]          ; centre values of pair 2
        addpd   xmm1, xmm2              ; left + right neighbours (pair 1)
        mulpd   xmm1, xmm12
        movapd  xmm3, [rdx]             ; centre values of pair 1
        mulpd   xmm3, xmm13
        addpd   xmm3, xmm1
        movapd  [r8], xmm3              ; store pair 1

        movupd  xmm1, [rdx+24]          ; right neighbours of pair 2; also the
                                        ; left neighbours for the next iteration
        addpd   xmm2, xmm1              ; xmm2 doubles as left neighbours of pair 2
        mulpd   xmm2, xmm12
        mulpd   xmm4, xmm13
        addpd   xmm4, xmm2
        movapd  [r8+16], xmm4           ; store pair 2

        add     r8, 32
        add     rdx, 32
        cmp     rdx, rax
        jne     InnerLoop

        ; ---- last element -------------------------------------------
        ; xmm1.lo is alt[N-2] here (low lane of the final [rdx+24] load).
        movsd   xmm6, xmm1
        addsd   xmm6, xmm5              ; neu[N-1] = alt[N-2] + c2
        movsd   qword ptr [r8], xmm6

        ; ---- swap buffers for the next step -------------------------
        xchg    r9, r10
        mov     rdx, r9
        mov     r8, r10

        sub     rcx, 1
        jnz     TimeLoop

        ; restore the callee-saved xmm registers
        movapd  xmm6, reg6
        movapd  xmm7, reg7
        movapd  xmm12, reg12
        movapd  xmm13, reg13
        movapd  xmm14, reg14

NoSteps:
        ret
FDM_unaligned endp
Advertisement
Add Comment
Please, Sign In to add comment