Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ;//
- ;// INTEL CORPORATION PROPRIETARY INFORMATION
- ;// This software is supplied under the terms of a license agreement or
- ;// nondisclosure agreement with Intel Corporation and may not be copied
- ;// or disclosed except in accordance with the terms of that agreement.
- ;// Copyright (c) 2000 Intel Corporation. All Rights Reserved.
- ;//
- ;//
- ; log_wmt.asm
- ;
- ; double log(double);
- ;
- ; Initial version: 12/15/2000
- ; Updated with bug fixes: 2/20/2001
- ;
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ;; ;;
- ;; Another important feature is that we use the table of log(1/B) ;;
- ;; throughout. To ensure numerical accuracy, we only need to ensure that ;;
- ;; T(0)_hi = B(last)_hi, T(0)_lo = B(last)_lo. This ensures W_hi = 0 and ;;
- ;; W_lo = 0 exactly in the case of |X-1| <= 2^(-7). ;;
- ;; Finally, we do away with the need for extra-precision addition by the ;;
- ;; following observation. The three pieces at the end are ;;
- ;; A = W_hi + r_hi; B = r_lo; C = P + W_lo. ;;
- ;; When W_hi = W_lo = 0, the addition sequence (A+B) + C is accurate as ;;
- ;; the sum A+B is exact. ;;
- ;; Otherwise, A + (B+C) is accurate as B is going to be largely shifted ;;
- ;; off compared to the final result. ;;
- ;; Hence if we use compare and mask operations to ;;
- ;; create alpha = (r_lo or 0), beta = (0 or r_lo), Res_hi <- W_hi+alpha, ;;
- ;; Res_lo <- C + beta, then result is accurately computed as ;;
- ;; Res_hi+Res_lo. ;;
- ;; ;;
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- .686P
- .387
- .XMM
- .MODEL FLAT,C
- EXTRN C __libm_error_support : NEAR
- CONST SEGMENT PARA PUBLIC USE32 'CONST'
- ALIGN 16
- emask DQ 000FFFFFFFFFFFFFH, 000FFFFFFFFFFFFFH ; mask off sign/expo field
- Magic DQ 428FFFFFFFFFF80FH, 428FFFFFFFFFF80FH ; 2^(42)-1+2^(-7)
- hi_mask DQ 7FFFFFFFFFE00000H, 7FFFFFFFFFE00000H ; mask of bottom 21 bits
- LOG_2 DQ 3FE62E42FEFA3800H, 3D2EF35793C76730H ; L_hi,L_lo -> [L_lo|L_hi]
- place_L DQ 0000000000000000H,0FFFFFFFFFFFFFFFFH ; 0,1 -> [FF..FF|00..00]
- place_R DQ 0FFFFFFFFFFFFFFFFH, 0000000000000000H ; 1,0 -> [00..00|FF..FF]
- One DQ 3ff0000000000000H, 3ff0000000000000H ; 1,1
- Zero DQ 0000000000000000H, 0000000000000000H ; 0,0
- Two52 DQ 4330000000000000H, 4330000000000000H ; 2^52 for normalization
- Infs DQ 0FFF0000000000000H, 7FF0000000000000H ; -inf,+inf --> [+inf|-inf]
- NaN DQ 7FF0000000000001H, 7FF0000000000001H ; NaN for log(-ve), log(Nan)
- coeff DQ 3FC24998090DC555H, 0BFCFFFFFFF201E13H ; p6,p3 ->[p3|p6]
- DQ 0BFC555C54DD57D75H, 3FD55555555555A7H ; p5,p2 ->[p2|p5]
- DQ 3FC9999998867A53H, 0BFE000000000001CH ; p4,p1 ->[p1|p4]
- ;-------Table B-----------
- B_Tbl DQ 3FF0000000000000H, 3FF0000000000000H
- DQ 3FEF820000000000H, 3FEF820000000000H
- DQ 3FEF080000000000H, 3FEF080000000000H
- DQ 3FEE920000000000H, 3FEE920000000000H
- DQ 3FEE1E0000000000H, 3FEE1E0000000000H
- DQ 3FEDAE0000000000H, 3FEDAE0000000000H
- DQ 3FED420000000000H, 3FED420000000000H
- DQ 3FECD80000000000H, 3FECD80000000000H
- DQ 3FEC720000000000H, 3FEC720000000000H
- DQ 3FEC0E0000000000H, 3FEC0E0000000000H
- DQ 3FEBAC0000000000H, 3FEBAC0000000000H
- DQ 3FEB4E0000000000H, 3FEB4E0000000000H
- DQ 3FEAF20000000000H, 3FEAF20000000000H
- DQ 3FEA980000000000H, 3FEA980000000000H
- DQ 3FEA420000000000H, 3FEA420000000000H
- DQ 3FE9EC0000000000H, 3FE9EC0000000000H
- DQ 3FE99A0000000000H, 3FE99A0000000000H
- DQ 3FE9480000000000H, 3FE9480000000000H
- DQ 3FE8FA0000000000H, 3FE8FA0000000000H
- DQ 3FE8AC0000000000H, 3FE8AC0000000000H
- DQ 3FE8620000000000H, 3FE8620000000000H
- DQ 3FE8180000000000H, 3FE8180000000000H
- DQ 3FE7D00000000000H, 3FE7D00000000000H
- DQ 3FE78A0000000000H, 3FE78A0000000000H
- DQ 3FE7460000000000H, 3FE7460000000000H
- DQ 3FE7020000000000H, 3FE7020000000000H
- DQ 3FE6C20000000000H, 3FE6C20000000000H
- DQ 3FE6820000000000H, 3FE6820000000000H
- DQ 3FE6420000000000H, 3FE6420000000000H
- DQ 3FE6060000000000H, 3FE6060000000000H
- DQ 3FE5CA0000000000H, 3FE5CA0000000000H
- DQ 3FE58E0000000000H, 3FE58E0000000000H
- DQ 3FE5560000000000H, 3FE5560000000000H
- DQ 3FE51E0000000000H, 3FE51E0000000000H
- DQ 3FE4E60000000000H, 3FE4E60000000000H
- DQ 3FE4B00000000000H, 3FE4B00000000000H
- DQ 3FE47A0000000000H, 3FE47A0000000000H
- DQ 3FE4460000000000H, 3FE4460000000000H
- DQ 3FE4140000000000H, 3FE4140000000000H
- DQ 3FE3E20000000000H, 3FE3E20000000000H
- DQ 3FE3B20000000000H, 3FE3B20000000000H
- DQ 3FE3820000000000H, 3FE3820000000000H
- DQ 3FE3520000000000H, 3FE3520000000000H
- DQ 3FE3240000000000H, 3FE3240000000000H
- DQ 3FE2F60000000000H, 3FE2F60000000000H
- DQ 3FE2CA0000000000H, 3FE2CA0000000000H
- DQ 3FE29E0000000000H, 3FE29E0000000000H
- DQ 3FE2740000000000H, 3FE2740000000000H
- DQ 3FE24A0000000000H, 3FE24A0000000000H
- DQ 3FE2200000000000H, 3FE2200000000000H
- DQ 3FE1F80000000000H, 3FE1F80000000000H
- DQ 3FE1D00000000000H, 3FE1D00000000000H
- DQ 3FE1A80000000000H, 3FE1A80000000000H
- DQ 3FE1820000000000H, 3FE1820000000000H
- DQ 3FE15C0000000000H, 3FE15C0000000000H
- DQ 3FE1360000000000H, 3FE1360000000000H
- DQ 3FE1120000000000H, 3FE1120000000000H
- DQ 3FE0EC0000000000H, 3FE0EC0000000000H
- DQ 3FE0CA0000000000H, 3FE0CA0000000000H
- DQ 3FE0A60000000000H, 3FE0A60000000000H
- DQ 3FE0840000000000H, 3FE0840000000000H
- DQ 3FE0620000000000H, 3FE0620000000000H
- DQ 3FE0420000000000H, 3FE0420000000000H
- DQ 3FE0200000000000H, 3FE0200000000000H
- DQ 3FE0000000000000H, 3FE0000000000000H
- ;-------Table T_hi,T_lo so that movapd gives [ T_lo | T_hi ]
- T_Tbl DQ 0000000000000000H, 0000000000000000H
- DQ 3F8FBEA8B13C0000H, 3CDEC927B17E4E13H
- DQ 3F9F7A9B16780000H, 3D242AD9271BE7D7H
- DQ 3FA766D923C20000H, 3D1FF0A82F1C24C1H
- DQ 3FAF0C30C1114000H, 3D31A88653BA4140H
- DQ 3FB345179B63C000H, 3D3D4203D36150D0H
- DQ 3FB6EF528C056000H, 3D24573A51306A44H
- DQ 3FBA956D3ECAC000H, 3D3E63794C02C4AFH
- DQ 3FBE2507702AE000H, 3D303B433FD6EEDCH
- DQ 3FC0D79E7CD48000H, 3D3CB422847849E4H
- DQ 3FC299D30C606000H, 3D3D4D0079DC08D9H
- DQ 3FC44F8B726F8000H, 3D3DF6A4432B9BB4H
- DQ 3FC601B076E7A000H, 3D3152D7D4DFC8E5H
- DQ 3FC7B00916515000H, 3D146280D3E606A3H
- DQ 3FC9509AA0044000H, 3D3F1E675B4D35C6H
- DQ 3FCAF6895610D000H, 3D375BEBBA042B64H
- DQ 3FCC8DF7CB9A8000H, 3D3EEE42F58E1E6EH
- DQ 3FCE2A877A6B2000H, 3D3823817787081AH
- DQ 3FCFB7D86EEE3000H, 3D371FCF1923FB43H
- DQ 3FD0A504E97BB000H, 3D303094E6690C44H
- DQ 3FD1661CAECB9800H, 3D2D1C000C076A8BH
- DQ 3FD22981FBEF7800H, 3D17AF7A7DA9FC99H
- DQ 3FD2E9E2BCE12000H, 3D24300C128D1DC2H
- DQ 3FD3A71C56BB4800H, 3D08C46FB5A88483H
- DQ 3FD4610BC29C5800H, 3D385F4D833BCDC7H
- DQ 3FD51D1D93104000H, 3D35B0FAA20D9C8EH
- DQ 3FD5D01DC49FF000H, 3D2740AB8CFA5ED3H
- DQ 3FD68518244CF800H, 3D28722FF88BF119H
- DQ 3FD73C1800DC0800H, 3D3320DBF75476C0H
- DQ 3FD7E9883FA49800H, 3D3FAFF96743F289H
- DQ 3FD898D38A893000H, 3D31F666071E2F57H
- DQ 3FD94A0428036000H, 3D30E7BCB08C6B44H
- DQ 3FD9F123F4BF6800H, 3D36892015F2401FH
- DQ 3FDA99FCABDB8000H, 3D11E89C5F87A311H
- DQ 3FDB44977C148800H, 3D3C6A343FB526DBH
- DQ 3FDBEACD9E271800H, 3D268A6EDB879B51H
- DQ 3FDC92B7D6BB0800H, 3D10FE9FFF876CC2H
- DQ 3FDD360E90C38000H, 3D342CDB58440FD6H
- DQ 3FDDD4AA04E1C000H, 3D32D8512DF01AFDH
- DQ 3FDE74D262788800H, 3CFEB945ED9457BCH
- DQ 3FDF100F6C2EB000H, 3D2CCE779D37F3D8H
- DQ 3FDFACC89C9A9800H, 3D163E0D100EC76CH
- DQ 3FE02582A5C9D000H, 3D222C6C4E98E18CH
- DQ 3FE0720E5C40DC00H, 3D38E27400B03FBEH
- DQ 3FE0BF52E7353800H, 3D19B5899CD387D3H
- DQ 3FE109EB9E2E4C00H, 3D12DA67293E0BE7H
- DQ 3FE15533D3B8D400H, 3D3D981CA8B0D3C3H
- DQ 3FE19DB6BA0BA400H, 3D2B675885A4A268H
- DQ 3FE1E6DF676FF800H, 3D1A58BA81B983AAH
- DQ 3FE230B0D8BEBC00H, 3D12FC066E48667BH
- DQ 3FE2779E1EC93C00H, 3D36523373359B79H
- DQ 3FE2BF29F9841C00H, 3CFD8A3861D3B7ECH
- DQ 3FE30757344F0C00H, 3D309BE85662F034H
- DQ 3FE34C80A8958000H, 3D1D4093FCAC34BDH
- DQ 3FE39240DDE5CC00H, 3D3493DBEAB758B3H
- DQ 3FE3D89A6B1A5400H, 3D28C7CD5FA81E3EH
- DQ 3FE41BCFF4860000H, 3D076FD6B90E2A84H
- DQ 3FE4635BCF40DC00H, 3D2CE8D5D412CAADH
- DQ 3FE4A3E862342400H, 3D224FA993F78464H
- DQ 3FE4E8D015786C00H, 3D38B1C0D0303623H
- DQ 3FE52A6D269BC400H, 3D30022268F689C9H
- DQ 3FE56C91D71CF800H, 3CE07BAFD1366E9EH
- DQ 3FE5AB505B390400H, 3CD5627AF66563FAH
- DQ 3FE5EE82AA241800H, 3D2202380CDA46BEH
- DQ 3FE62E42FEFA3800H, 3D2EF35793C76730H
- ALIGN 16
- CONST ENDS
- $cmpsd MACRO op1, op2, op3
- LOCAL begin_cmpsd, end_cmpsd
- begin_cmpsd:
- cmppd op1, op2, op3
- end_cmpsd:
- org begin_cmpsd
- db 0F2h
- org end_cmpsd
- ENDM
- _TEXT SEGMENT PARA PUBLIC USE32 'CODE'
- ALIGN 16
- PUBLIC _log_pentium4, _CIlog_pentium4
- _CIlog_pentium4 PROC NEAR
- push ebp
- mov ebp, esp
- sub esp, 8 ; for argument DBLSIZE
- and esp, 0fffffff0h
- fstp qword ptr [esp]
- movq xmm0, qword ptr [esp]
- call start
- leave
- ret
- ;----------------------;
- ;--Argument Reduction--;
- ;----------------------;
- _log_pentium4 label proc
- movlpd xmm0, QWORD PTR [4+esp] ;... load X to low part of xmm0
- start:
- mov edx,0 ;... set edx to 0
- DENORMAL_RETRY:
- movapd xmm5,xmm0
- unpcklpd xmm0,xmm0 ;... [X|X]
- psrlq xmm5,52
- pextrw ecx,xmm5,0
- movapd xmm1, QWORD PTR [emask] ;... pair of 000FF...FF
- movapd xmm3, QWORD PTR [One] ;... pair of 3FF000...000
- movapd xmm4, QWORD PTR [Magic] ;... pair of 2^(42)-1+2^(-7)
- movapd xmm6, QWORD PTR [hi_mask] ;... pair of 7FFFFFFF..FE00000
- andpd xmm0,xmm1
- orpd xmm0,xmm3 ;... [Y|Y]
- addpd xmm4,xmm0 ;... 11 lsb contains the index to B
- ;... the last 4 lsb are don't cares, the
- ;... 7 bits following that is the index
- ;... Hence by masking, we already have index*16
- pextrw eax,xmm4,0
- and eax,000007F0H ;... eax is offset
- movapd xmm4, QWORD PTR [eax+B_Tbl] ;... [B|B]
- movapd xmm7, QWORD PTR [eax+T_Tbl]
- andpd xmm6,xmm0 ;... [Y_hi|Y_hi]
- subpd xmm0,xmm6 ;... [Y_lo|Y_lo]
- mulpd xmm6,xmm4 ;... [B*Y_hi|B*Y_hi]
- subpd xmm6,xmm3 ;... [R_hi|R_hi]
- addsd xmm7,xmm6 ;... [T_lo|T_hi+R_hi]
- mulpd xmm0,xmm4 ;... [R_lo|R_lo]
- movapd xmm4,xmm0 ;... [R_lo|R_lo]
- addpd xmm0,xmm6 ;... [R|R]
- ;-----------------------------------------;
- ;--Approx and Reconstruction in parallel--;
- ;-----------------------------------------;
- ;...m is in ecx, [T_lo,T_hi+R_hi] in xmm7
- ;...xmm4 through xmm6 will be used
- and ecx,00000FFFH ;... note we need sign and biased exponent
- sub ecx,1
- cmp ecx,2045 ;... the largest biased exponent 2046-1
- ;... if ecx is ABOVE (unsigned) this, either
- ;... the sign is +ve and biased exponent is 7FF
- ;... or the sign is +ve and exponent is 0, or
- ;... the sign is -ve (i.e. sign bit 1)
- ja SPECIAL_CASES
- sub ecx,1022 ;... m in integer format
- add ecx,edx ;... this is the denormal adjustment
- cvtsi2sd xmm6,ecx
- unpcklpd xmm6,xmm6 ;... [m | m] in FP format
- shl ecx,10
- add eax,ecx ;16*(64*m + j) 0 <=> (m=-1 & j=64) or (m=0 & j=0)
- mov ecx,16
- mov edx,0
- cmp eax,0
- cmove edx,ecx ;this is the index into the mask table (place_{L,R})
- movapd xmm1, QWORD PTR [coeff] ;... loading [p3|p6]
- movapd xmm3,xmm0
- movapd xmm2, QWORD PTR [coeff+16] ;... loading [p2|p5]
- mulpd xmm1,xmm0 ;... [p3 R | p6 R]
- mulpd xmm3,xmm3 ;... [R^2|R^2]
- addpd xmm1,xmm2 ;... [p2+p3 R |p5+p6 R]
- movapd xmm2, QWORD PTR [coeff+32] ;... [p1|p4]
- mulsd xmm3,xmm3 ;... [R^2|R^4]
- movapd xmm5, QWORD PTR [LOG_2] ;... loading [L_lo|L_hi]
- ;... [T_lo|T_hi+R_hi] already in xmm7
- mulpd xmm6,xmm5 ;... [m L_lo | m L_hi]
- movapd xmm5, QWORD PTR [edx+place_L] ;... [FF..FF|00.00] or [00..00|FF..FF]
- andpd xmm4,xmm5 ;... [R_lo|0] or [0|R_lo]
- addpd xmm7,xmm6 ;... [W_lo|W_hi]
- addpd xmm7,xmm4 ;... [A_lo|A_hi]
- mulpd xmm1,xmm0 ;... [p2 R+p3 R^2|p5 R+p6 R^2]
- mulsd xmm3,xmm0 ;... [R^2|R^5]
- addpd xmm1,xmm2 ;... [p1+.. | p4+...]
- movapd xmm6,xmm7
- unpckhpd xmm6,xmm6 ;... [*|A_lo]
- mulpd xmm1,xmm3 ;... [P_hi|P_lo]
- sub esp, 16
- movapd xmm0,xmm1 ;... copy of [P_hi|P_lo]
- unpckhpd xmm1,xmm1 ;... [P_hi|P_hi]
- ;...[P_hi|P_lo] in xmm1 at this point
- addsd xmm0,xmm1 ;... [*|P]
- addsd xmm0,xmm6
- addsd xmm0,xmm7
- movlpd QWORD PTR [esp+4], xmm0 ; return result
- fld QWORD PTR [esp+4] ;
- add esp, 16
- ret
- SPECIAL_CASES:
- movlpd xmm0, QWORD PTR [4+esp] ;... load X again
- movapd xmm1, QWORD PTR [Zero]
- $cmpsd xmm1,xmm0,0
- pextrw eax,xmm1,0 ;... ones if X = +-0.0
- cmp eax,0
- ja INPUT_ZERO
- cmp ecx,-1 ;... ecx = -1 iff X is positive denormal
- je INPUT_DENORM
- cmp ecx,000007FEH
- ja INPUT_NEGATIVE
- movlpd xmm0, QWORD PTR [4+esp]
- movapd xmm1, QWORD PTR [emask]
- movapd xmm2, QWORD PTR [One]
- andpd xmm0,xmm1
- orpd xmm0,xmm2 ;... xmm0 is 1 iff the input argument was +inf
- $cmpsd xmm2,xmm0,0
- pextrw eax,xmm2,0 ;... 0 if X is NaN
- cmp eax, 0
- je INPUT_NaN
- INPUT_INF:
- ;....Input is +Inf
- fld QWORD PTR [Infs+8] ;
- ret
- INPUT_NaN:
- ; movlpd xmm0, QWORD PTR [esp+4]
- ; addsd xmm0, xmm0
- ; sub esp, 16
- ; movlpd QWORD PTR [esp+4], xmm0 ; return result
- ; fld QWORD PTR [esp+4] ;
- ; add esp, 16
- ; ret
- mov edx, 1000
- jmp CALL_LIBM_ERROR
- INPUT_ZERO:
- ; raise Divide by Zero
- movlpd xmm2, QWORD PTR [One]
- divsd xmm2, xmm0
- movlpd xmm1, QWORD PTR [Infs]
- mov edx, 2
- jmp CALL_LIBM_ERROR
- INPUT_DENORM:
- ;....check for zero or denormal
- ;....for now I assume this is simply denormal
- ;....in reality, we need to check for zero and handle appropriately
- movlpd xmm1,Two52
- mulsd xmm0,xmm1
- mov edx,-52 ;...set adjustment to exponent
- jmp DENORMAL_RETRY ;...branch back
- INPUT_NEGATIVE:
- add ecx,1
- and ecx, 7ffH
- cmp ecx, 7ffH
- jae NEG_INF_NAN
- NEG_NORMAL_INFINITY:
- ; xmm1=0
- xorpd xmm1, xmm1
- ; raise Invalid
- divsd xmm1, xmm1
- mov edx, 3
- CALL_LIBM_ERROR:
- ;call libm_error_support(void *arg1,void *arg2,void *retval,error_types input_tag)
- sub esp, 28
- movlpd QWORD PTR [esp+16], xmm1
- mov DWORD PTR [esp+12],edx
- mov edx, esp
- add edx,16
- mov DWORD PTR [esp+8],edx
- add edx,16
- mov DWORD PTR [esp+4],edx
- mov DWORD PTR [esp],edx
- call NEAR PTR __libm_error_support
- ; movlpd xmm0, QWORD PTR [esp+16]
- ; movlpd QWORD PTR [esp+16], xmm0 ; return result
- fld QWORD PTR [esp+16] ;
- add esp,28
- ret
- NEG_INF_NAN:
- movlpd xmm2, QWORD PTR [esp+4]
- movlpd xmm0, QWORD PTR [esp+4]
- movd eax, xmm2
- psrlq xmm2, 32
- movd ecx, xmm2
- and ecx, 0fffffH ; eliminate sign/exponent
- or eax, ecx
- cmp eax,0
- jz NEG_NORMAL_INFINITY ; negative infinity
- ; addsd xmm0, xmm0
- ; sub esp,16
- ; movlpd QWORD PTR [esp+4], xmm0
- ; fld QWORD PTR [esp+4]
- ; add esp, 16
- ; ret
- mov edx, 1000
- jmp CALL_LIBM_ERROR
- _CIlog_pentium4 ENDP
- ALIGN 16
- _TEXT ENDS
- END
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement