Guest User

Untitled

a guest
Feb 3rd, 2013
31
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  ;-----------------------------------------------------------------------
  ; FDM_999_ASM_reuse
  ;
  ; Runs rcx time steps of a 3-point stencil over an array of N = 999
  ; doubles, ping-ponging between two buffers:
  ;
  ;     neu[0]   = alt[1] * c3
  ;     neu[i]   = half * alt[i] + quarter * (alt[i-1] + alt[i+1]),  0 < i < N-1
  ;     neu[N-1] = alt[N-2] + c2
  ;
  ; After every step the roles of the two buffers are swapped, so the most
  ; recent result is in the buffer passed in r8 when the step count is odd
  ; and in the buffer passed in rdx when it is even.
  ;
  ; ABI:   register usage matches the Microsoft x64 convention
  ;        (NOTE(review): presumed from rcx/rdx/r8 and the xmm6+ saves).
  ; In:    rcx = number of time steps (0 returns immediately)
  ;        rdx = alt, source buffer
  ;        r8  = neu, destination buffer
  ; Out:   nothing in registers; results are written to the buffers.
  ; Clobb: rax, rcx, rdx, r8, r9, r10, xmm0-xmm5, flags
  ;
  ; Requirements visible from the code:
  ;   * Both buffers are accessed with movapd, so they must be 16-byte
  ;     aligned.
  ;   * The last inner-loop iteration loads the pair alt[N-1], alt[N] with
  ;     one aligned load, so each buffer needs room for N+1 = 1000 doubles.
  ;     The extra element is loaded but never enters any result.
  ;   * reg6, reg7, reg12, reg13, reg14 (save slots) and c2, c3, half,
  ;     quarter (constants) are defined outside this procedure and are
  ;     accessed with movapd, so they must be 16-byte aligned xmmwords.
  ;     NOTE(review): half/quarter presumably hold 0.5/0.25 - confirm at
  ;     their definition.
  ;   * xmm6/7/12/13/14 are preserved in those memory slots rather than on
  ;     the stack; if the slots are shared globals the routine is not
  ;     reentrant - TODO confirm.
  ;-----------------------------------------------------------------------
  FDM_999_N           equ 999
  FDM_999_LAST_OFFSET equ (FDM_999_N - 1) * 8     ; byte offset of element N-1 (= 7984)

  FDM_999_ASM_reuse proc
      ; Zero time steps: without this check "sub rcx,1 / jnz" would wrap
      ; around and iterate 2^64 times. Nothing has been saved yet, so we
      ; can go straight to the return.
      test    rcx, rcx
      jz      NoTimeSteps

      ; preserve the callee-saved XMM registers used below
      movapd  reg6,  xmm6
      movapd  reg7,  xmm7
      movapd  reg12, xmm12
      movapd  reg13, xmm13
      movapd  reg14, xmm14

      ; load constants once; they stay in registers for all time steps
      movapd  xmm5,  c2               ; added to alt[N-2] for neu[N-1]
      movapd  xmm14, c3               ; multiplied with alt[1] for neu[0]
      movapd  xmm13, half             ; weight of the centre element
      movapd  xmm12, quarter          ; weight of the neighbour sum

      lea     rax, [rdx + FDM_999_LAST_OFFSET]    ; rax = &alt[N-1], inner-loop end marker

      mov     r9,  rdx                ; r9  = base of current source buffer
      mov     r10, r8                 ; r10 = base of current destination buffer

  TimeLoop:
      ; ---- first two elements: neu[0] (boundary) and neu[1] ----
      movapd  xmm0, [rdx]             ; xmm0 = alt[0], alt[1]
      movapd  xmm3, [rdx+16]          ; xmm3 = alt[2], alt[3]

      movapd  xmm1, xmm0
      shufpd  xmm1, xmm3, 01b         ; xmm1 = alt[1], alt[2] (reused by the inner loop)

      movsd   xmm7, xmm1              ; low(xmm7) = alt[1]
      mulsd   xmm7, xmm14             ; neu[0] = alt[1] * c3
                                      ; NOTE(review): the original comment said "/c3";
                                      ; the instruction multiplies, so c3 presumably
                                      ; already holds a reciprocal - confirm.

      movhlps xmm6, xmm1              ; low(xmm6) = alt[2]
      addsd   xmm6, xmm0              ; alt[0] + alt[2]
      mulsd   xmm6, xmm12             ; quarter * (alt[0] + alt[2])

      movsd   xmm0, xmm1              ; low(xmm0) now holds alt[1]
      mulsd   xmm0, xmm13             ; half * alt[1]
      addsd   xmm0, xmm6              ; = neu[1]

      movlhps xmm7, xmm0              ; xmm7 = neu[0], neu[1]
      movapd  xmmword ptr [r8], xmm7

      add     rdx, 16                 ; rdx -> alt[2]
      add     r8,  16                 ; r8  -> neu[2]

      ; ---- interior elements, four per iteration ----
      ; Invariant at InnerLoop (i = element index rdx points at):
      ;   xmm3 = alt[i],   alt[i+1]   (aligned pair at [rdx])
      ;   xmm1 = alt[i-1], alt[i]     (left neighbours, built by shufpd)
      ; The unaligned neighbour pairs are assembled from aligned loads with
      ; shufpd instead of being loaded with movupd.
      ; Starting at rdx = alt+16 and stepping 32 bytes, rdx hits rax exactly
      ; after (N-3)/4 = 249 iterations, so the jne exit test is sufficient.
  InnerLoop:
      movapd  xmm4, [rdx+16]          ; xmm4 = alt[i+2], alt[i+3]

      movapd  xmm2, xmm3
      shufpd  xmm2, xmm4, 01b         ; xmm2 = alt[i+1], alt[i+2] (right neighbours)

      addpd   xmm1, xmm2              ; left + right neighbours
      mulpd   xmm1, xmm12             ; * quarter

      mulpd   xmm3, xmm13             ; half * centre
      addpd   xmm3, xmm1
      movapd  [r8], xmm3              ; neu[i], neu[i+1]

      movapd  xmm3, [rdx+32]          ; xmm3 = alt[i+4], alt[i+5] (centre pair of next iteration)
      movapd  xmm1, xmm4
      shufpd  xmm1, xmm3, 01b         ; xmm1 = alt[i+3], alt[i+4] (right neighbours now,
                                      ; left neighbours of the next iteration)
      addpd   xmm2, xmm1              ; xmm2 held alt[i+1], alt[i+2] = left neighbours here
      mulpd   xmm2, xmm12             ; * quarter

      mulpd   xmm4, xmm13             ; half * centre
      addpd   xmm4, xmm2
      movapd  [r8+16], xmm4           ; neu[i+2], neu[i+3]

      add     r8,  32
      add     rdx, 32
      cmp     rdx, rax                ; reached &alt[N-1]?
      jne     InnerLoop

      ; ---- last element: neu[N-1] = alt[N-2] + c2 ----
      ; On exit xmm1 = alt[N-2], alt[N-1] and r8 -> neu[N-1].
      movsd   xmm6, xmm1              ; low(xmm6) = alt[N-2]
      addsd   xmm6, xmm5
      movsd   qword ptr [r8], xmm6

      ; ---- swap source and destination for the next time step ----
      xchg    r9,  r10
      mov     rdx, r9
      mov     r8,  r10
      lea     rax, [rdx + FDM_999_LAST_OFFSET]    ; end marker for the new source

      sub     rcx, 1
      jnz     TimeLoop

      ; restore the callee-saved XMM registers
      movapd  xmm6,  reg6
      movapd  xmm7,  reg7
      movapd  xmm12, reg12
      movapd  xmm13, reg13
      movapd  xmm14, reg14

  NoTimeSteps:
      ret
  FDM_999_ASM_reuse endp
Advertisement
Add Comment
Please, Sign In to add comment