/*
 * math-sll.h
 *
 * Modified sllatan() and _sllexp() to return correct results.
 */

/*
 * $Id: math-sll.c,v 1.15 2002/08/20 18:01:54 andrewm Exp $
 *
 * Change: CHUI
 *
 * Purpose
 *  A fixed point (31.32 bit) math library.
 *
 * Description
 *  Floating point packs the most accuracy in the available bits, but it
 *  often provides more accuracy than is required.  It is time consuming to
 *  carry the extra precision around, particularly on platforms that don't
 *  have a dedicated floating point processor.
 *
 *  This library is a compromise.  All math is done using the 64 bit signed
 *  "long long" format (sll), and is not intended to be portable, just as
 *  fast as possible.  Since "long long" is an elementary type, it can be
 *  passed around without resorting to the use of pointers.  Since the
 *  format used is fixed point, there is never a need to do time consuming
 *  checks and adjustments to maintain normalized numbers, as is the case
 *  in floating point.
 *
 *  Simply put, this library is limited to handling numbers with a whole
 *  part of up to 2^31 - 1 = 2.147483647e9 in magnitude, and fractional
 *  parts down to 2^-32 = 2.3283064365e-10 in magnitude.  This yields a
 *  decent range and accuracy for many applications.
 *
 * IMPORTANT
 *  No checking for arguments out of range (error).
 *  No checking for divide by zero (error).
 *  No checking for overflow (error).
 *  No checking for underflow (warning).
 *  Chops, doesn't round.
 *
 * Functions
 *  sll dbl2sll(double x)           double -> sll
 *  double sll2dbl(sll x)           sll -> double
 *
 *  sll slladd(sll x, sll y)        x + y
 *  sll sllsub(sll x, sll y)        x - y
 *  sll sllmul(sll x, sll y)        x * y
 *  sll slldiv(sll x, sll y)        x / y
 *
 *  sll sllinv(sll x)           1 / x
 *  sll sllmul2(sll x)          x * 2
 *  sll sllmul4(sll x)          x * 4
 *  sll sllmul2n(sll x, int n)      x * 2^n, 0 <= n <= 31
 *  sll slldiv2(sll x)          x / 2
 *  sll slldiv4(sll x)          x / 4
 *  sll slldiv2n(sll x, int n)      x / 2^n, 0 <= n <= 31
 *
 *  sll sllcos(sll x)           cos x
 *  sll sllsin(sll x)           sin x
 *  sll slltan(sll x)           tan x
 *  sll sllatan(sll x)          atan x
 *
 *  sll sllexp(sll x)           e^x
 *  sll slllog(sll x)           ln x
 *
 *  sll sllpow(sll x, sll y)        x^y
 *  sll sllsqrt(sll x)          x^(1 / 2)
 *
 * History
 *  * Aug 20 2002 Nicolas Pitre <nico@cam.org> v1.15
 *  - Replaced all shifting assembly with C equivalents
 *  - Reformatted ARM asm and changed comments to begin with @
 *  - Updated C version of sllmul()
 *  - Removed the unsupported architecture #error - should be portable now
 *
 *  * Aug 17 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.14
 *  - Fixed sign handling of ARM sllmul()
 *  - Ported sllmul() to x86 - it can be inlined now
 *  - Updated the sllmul() comments to reflect my changes
 *  - Updated the header comments
 *
 *  * Aug 17 2002 Nicolas Pitre <nico@cam.org> v1.13
 *  - Corrected and expanded upon Andrew's sllmul() comments
 *  - Added in a non-optimal but portable C version of sllmul()
 *
 *  * Aug 16 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.12
 *  - Added in corrected optimized sllmul() for ARM by Nicolas Pitre
 *  - Changed comments on multiplication to describe Nicolas's method
 *
 *  * Jun 17 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.11
 *  - Reverted optimized sllmul() for ARM because of bug
 *
 *  * Jun 17 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.10
 *  - Added in optimized sllmul() for ARM by Nicolas Pitre
 *  - Changed comments on multiplication to describe Nicolas's method
 *  - Optimized multiplications and divisions by powers of 2
 *
 *  * Feb  5 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.9
 *  - Optimized multiplications and divisions by powers of 2
 *
 *  * Feb  5 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.8
 *  - Consolidated constants
 *  - Added macros for _slladd() _sllsub()
 *  - Removed __inline__ from slladd() sllsub()
 *  - Renamed umul() to ullmul() and made it global
 *  - Added function prototypes
 *  - Corrected header comment about fractional range
 *  - Added warning for non-Linux operating systems
 *
 *  * Feb  5 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.7
 *  - Corrected some i386 assembly comments
 *  - Renamed calc_*() to _sll*()
 *  - Moved _sllexp() closer to sllexp()
 *
 *  * Feb  5 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.6
 *  - Added sllmul2() sllmul4() sllmul2n() for i386
 *  - Added slldiv2() slldiv4() slldiv2n() for i386
 *  - Removed input constraints on sllmul2() sllmul4() sllmul2n() for ARM
 *  - Removed input constraints on slldiv2() slldiv4() slldiv2n() for ARM
 *  - Modified ARM assembly for WYSIWYG output
 *  - Changed asm to __asm__
 *
 *  * Feb  5 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.5
 *  - Fixed umul() for i386
 *  - Fixed dbl2sll() and sll2dbl() - I forgot ARM doubles are big-endian
 *  - Very lightly tested on ARM and i386 and it seems okay
 *
 *  * Feb  4 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.4
 *  - Added umul() for i386
 *
 *  * Jan 20 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.3
 *  - Added fast multiplication functions sllmul2(), sllmul4(), sllmul2n()
 *  - Added fast division functions slldiv2(), slldiv4(), slldiv2n()
 *  - Added square root function sllsqrt()
 *  - Added library roll-call
 *  - Reformatted the history to RPM format (ick)
 *  - Moved sllexp() closer to related functions
 *  - Added algorithm description to sllpow()
 *
 *  * Jan 19 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.1
 *  - Corrected constants, thanks to Mark A. Lisher for noticing
 *  - Put source under CVS control
 *
 *  * Jan 18 2002 Andrew E. Mileski <andrewm@isoar.ca>
 *  - Added some more explanation to calc_cos() and calc_sin()
 *  - Added sllatan() and documented it fairly verbosely
 *
 *  * July 13 2000 Andrew E. Mileski <andrewm@isoar.ca>
 *  - Corrected documentation for multiplication algorithm
 *
 *  * May 21 2000 Andrew E. Mileski <andrewm@isoar.ca>
 *  - Rewrote slltan() to avoid scaling the argument for both sine and cosine
 *
 *  * May 19 2000  Andrew E. Mileski <andrewm@isoar.ca>
 *  - Unrolled loops
 *  - Added sllinv() and sllneg()
 *  - Changed constants to type "LL" (was "UL" - oops)
 *  - Changed all routines to use inverse constants instead of division
 *
 *  * May 15, 2000 - Andrew E. Mileski <andrewm@isoar.ca>
 *  - Fixed slltan() - used sin/cos instead of sllsin/sllcos.  Doh!
 *  - Added slllog(x) and sllpow(x,y)
 *
 *  * May 11, 2000 - Andrew E. Mileski <andrewm@isoar.ca>
 *  - Added a simple tan(x) that could stand some optimization
 *  - Added more constants (see <math.h>)
 *
 *  * May 3, 2000 - Andrew E. Mileski <andrewm@isoar.ca>
 *  - Added sllsin(), sllcos(), and trig constants
 *
 *  * May 2, 2000 - Andrew E. Mileski <andrewm@isoar.ca>
 *  - All routines and macros now have sll in their identifiers
 *  - Changed mul() to umul() and added sllmul() to handle signed numbers
 *  - Added and tested sllexp(), sllint(), and sllfrac()
 *  - Added some constants
 *
 *  * Apr 26, 2000 - Andrew E. Mileski <andrewm@isoar.ca>
 *  - Added mul(), and began testing it (unsigned only)
 *
 *  * Apr 25, 2000 - Andrew E. Mileski <andrewm@isoar.ca>
 *  - Added sll2dbl() [convert a signed long long to a double]
 *  - Began testing.  Well gee whiz it works! :)
 *
 *  * Apr 24, 2000 - Andrew E. Mileski <andrewm@isoar.ca>
 *  - Added dbl2sll() [convert a double to a signed long long]
 *  - Began documenting
 *
 *  * Apr ??, 2000 - Andrew E. Mileski <andrewm@isoar.ca>
 *  - Conceived, written, and fiddled with
 *
 *
 *      Copyright (C) 2000 Andrew E. Mileski
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License version 2 as published by the Free Software Foundation.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
 */

#ifndef MATHSLL_H
#define MATHSLL_H

#ifdef __cplusplus
extern "C" {
#endif


#ifndef USE_FIXED_POINT

#include <math.h>

typedef double sll;
typedef double ull;

#define sllvalue(X) ((double)(X))
#define int2sll(X)  ((sll)(X))
#define sll2int(X)  ((int)(X))
#define sll_abs(X)  (fabs(X))
#define sllneg(X)   (-(X))


static __inline__ double sll2dbl(sll x)
{
    return (double)x;
}

static __inline__ sll slladd(sll x, sll y)
{
    return x + y;
}

static __inline__ sll sllsub(sll x, sll y)
{
    return x - y;
}

static __inline__ sll sllmul(sll x, sll y)
{
    return x * y;
}

static __inline__ sll slldiv(sll x, sll y)
{
    return x / y;
}

static __inline__ sll sllinv(sll x)
{
    return 1 / x;
}


static __inline__ float sll2float(sll s)
{
    return (float)s;
}

static __inline__ sll float2sll(float f)
{
    return (sll)f;
}

static __inline__ sll sllsin(sll x)
{
    return sin(x);
}

static __inline__ sll sllcos(sll x)
{
    return cos(x);
}

static __inline__ sll slltan(sll x)
{
    return tan(x);
}

static __inline__ sll sllsqrt(sll x)
{
    return sqrt(x);
}

static __inline__ sll dbl2sll(double dbl)
{
    return dbl;    /* sll is double here; a (float) cast would only lose precision */
}

static __inline__ sll sllpow(sll x, sll y)
{
    return pow(x, y);
}

#else

/* Data types */
typedef signed long long sll;
typedef unsigned long long ull;

/* Macros */
#define int2sll(X)  (((sll) (X)) << 32)
#define sllvalue(X)     (X)
#define sll2int(X)  ((int) ((X) >> 32))
#define sll_abs(X)  ((X) < 0 ? -(X) : (X))
#define sllint(X)   ((X) & 0xffffffff00000000LL)
#define sllfrac(X)  ((X) & 0x00000000ffffffffLL)
#define sllneg(X)   (-(X))
#define _slladd(X,Y)    ((X) + (Y))
#define _sllsub(X,Y)    ((X) - (Y))

/* Constants (converted from double) */
#define CONST_0     0x0000000000000000LL
#define CONST_1     0x0000000100000000LL
#define CONST_2     0x0000000200000000LL
#define CONST_3     0x0000000300000000LL
#define CONST_4     0x0000000400000000LL
#define CONST_10    0x0000000a00000000LL
#define CONST_1_2   0x0000000080000000LL
#define CONST_1_3   0x0000000055555555LL
#define CONST_1_4   0x0000000040000000LL
#define CONST_1_5   0x0000000033333333LL
#define CONST_1_6   0x000000002aaaaaaaLL
#define CONST_1_7   0x0000000024924924LL
#define CONST_1_8   0x0000000020000000LL
#define CONST_1_9   0x000000001c71c71cLL
#define CONST_1_10  0x0000000019999999LL
#define CONST_1_11  0x000000001745d174LL
#define CONST_1_12  0x0000000015555555LL
#define CONST_1_20  0x000000000cccccccLL
#define CONST_1_30  0x0000000008888888LL
#define CONST_1_42  0x0000000006186186LL
#define CONST_1_56  0x0000000004924924LL
#define CONST_1_72  0x00000000038e38e3LL
#define CONST_1_90  0x0000000002d82d82LL
#define CONST_1_110 0x000000000253c825LL
#define CONST_1_132 0x0000000001f07c1fLL
#define CONST_1_156 0x0000000001a41a41LL
#define CONST_E     0x00000002b7e15162LL
#define CONST_1_E   0x000000005e2d58d8LL
#define CONST_SQRTE 0x00000001a61298e1LL
#define CONST_1_SQRTE   0x000000009b4597e3LL
#define CONST_LOG2_E    0x0000000171547652LL
#define CONST_LOG10_E   0x000000006f2dec54LL
#define CONST_LN2   0x00000000b17217f7LL
#define CONST_LN10  0x000000024d763776LL
#define CONST_PI    0x00000003243f6a88LL
#define CONST_PI_2  0x00000001921fb544LL
#define CONST_PI_4  0x00000000c90fdaa2LL
#define CONST_1_PI  0x00000000517cc1b7LL
#define CONST_2_PI  0x00000000a2f9836eLL
#define CONST_2_SQRTPI  0x0000000120dd7504LL
#define CONST_SQRT2 0x000000016a09e667LL
#define CONST_1_SQRT2   0x00000000b504f333LL

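/*
 * Each constant above is just its value scaled by 2^32 and chopped,
 * e.g. 0.5 * 2^32 = 0x0000000080000000 (CONST_1_2) and
 * pi * 2^32 = 0x00000003243f6a88 (CONST_PI).  A minimal sketch of the
 * conversion -- mksll() is our illustrative helper, not part of the
 * library:
 */
static __inline__ sll mksll(double v)
{
    return (sll)(v * 4294967296.0);    /* v * 2^32, chops like the rest of the library */
}
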
static __inline__ sll slladd(sll x, sll y)
{
    return (x + y);
}

static __inline__ sll sllsub(sll x, sll y)
{
    return (x - y);
}

/*
 * Let a = A * 2^32 + a_h * 2^0 + a_l * 2^(-32)
 * Let b = B * 2^32 + b_h * 2^0 + b_l * 2^(-32)
 *
 * Where:
 *   *_h is the integer part
 *   *_l is the fractional part
 *   A and B are the signs (0 for positive, -1 for negative).
 *
 * a * b = (A * 2^32 + a_h * 2^0 + a_l * 2^-32)
 *       * (B * 2^32 + b_h * 2^0 + b_l * 2^-32)
 *
 * Expanding the terms, we get:
 *
 *   = A * B * 2^64 + A * b_h * 2^32 + A * b_l * 2^0
 *   + a_h * B * 2^32 + a_h * b_h * 2^0 + a_h * b_l * 2^-32
 *   + a_l * B * 2^0 + a_l * b_h * 2^-32 + a_l * b_l * 2^-64
 *
 * Grouping by powers of 2, we get:
 *
 *   = A * B * 2^64
 *   Meaningless overflow from sign extension - ignore
 *
 *   + (A * b_h + a_h * B) * 2^32
 *   Overflow which we can't handle - ignore
 *
 *   + (A * b_l + a_h * b_h + a_l * B) * 2^0
 *   We only need the low 32 bits of this term, as the rest is overflow
 *
 *   + (a_h * b_l + a_l * b_h) * 2^-32
 *   We need all 64 bits of this term
 *
 *   + a_l * b_l * 2^-64
 *   We only need the high 32 bits of this term, as the rest is underflow
 *
 * Note that:
 *   a > 0 && b > 0: A =  0, B =  0 and the third term is a_h * b_h
 *   a < 0 && b > 0: A = -1, B =  0 and the third term is a_h * b_h - b_l
 *   a > 0 && b < 0: A =  0, B = -1 and the third term is a_h * b_h - a_l
 *   a < 0 && b < 0: A = -1, B = -1 and the third term is a_h * b_h - a_l - b_l
 */
#if defined(__arm__)
static __inline__ sll sllmul(sll left, sll right)
{
    /*
     * From gcc/config/arm/arm.h:
     *   In a pair of registers containing a DI or DF value the 'Q'
     *   operand returns the register number of the register containing
     *   the least significant part of the value.  The 'R' operand returns
     *   the register number of the register containing the most
     *   significant part of the value.
     */
    sll retval;

    __asm__ (
        "@ sllmul\n\t"
        "umull  %R0, %Q0, %Q1, %Q2\n\t"
        "mul    %R0, %R1, %R2\n\t"
        "umlal  %Q0, %R0, %Q1, %R2\n\t"
        "umlal  %Q0, %R0, %Q2, %R1\n\t"
        "tst    %R1, #0x80000000\n\t"
        "subne  %R0, %R0, %Q2\n\t"
        "tst    %R2, #0x80000000\n\t"
        "subne  %R0, %R0, %Q1\n\t"
        : "=&r" (retval)
        : "%r" (left), "r" (right)
        : "cc"
    );

    return retval;
}
#elif defined(__i386__)
static __inline__ sll sllmul(sll left, sll right)
{
    register sll retval;
    __asm__(
        "# sllmul\n\t"
        "   movl    %1, %%eax\n\t"
        "   mull    %3\n\t"
        "   movl    %%edx, %%ebx\n\t"
        "\n\t"
        "   movl    %2, %%eax\n\t"
        "   mull    %4\n\t"
        "   movl    %%eax, %%ecx\n\t"
        "\n\t"
        "   movl    %1, %%eax\n\t"
        "   mull    %4\n\t"
        "   addl    %%eax, %%ebx\n\t"
        "   adcl    %%edx, %%ecx\n\t"
        "\n\t"
        "   movl    %2, %%eax\n\t"
        "   mull    %3\n\t"
        "   addl    %%ebx, %%eax\n\t"
        "   adcl    %%ecx, %%edx\n\t"
        "\n\t"
        "   btl $31, %2\n\t"
        "   jnc 1f\n\t"
        "   subl    %3, %%edx\n\t"
        "1: btl $31, %4\n\t"
        "   jnc 1f\n\t"
        "   subl    %1, %%edx\n\t"
        "1:\n\t"
        : "=&A" (retval)
        : "m" (left), "m" (((unsigned *) &left)[1]),
          "m" (right), "m" (((unsigned *) &right)[1])
        : "ebx", "ecx", "cc"
    );
    return retval;
}
#else
/* Plain C version: not optimal, but portable. */
#warning Using the non-optimal but portable C version of sllmul()
static __inline__ sll sllmul(sll a, sll b)
{
    unsigned int a_lo, b_lo;
    signed int a_hi, b_hi;
    sll x;

    a_lo = a;
    a_hi = (ull) a >> 32;
    b_lo = b;
    b_hi = (ull) b >> 32;

    x = ((ull) (a_hi * b_hi) << 32)
      + (((ull) a_lo * b_lo) >> 32)
      + (sll) a_lo * b_hi
      + (sll) b_lo * a_hi;

    return x;
}
#endif
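
/*
 * A hedged sanity sketch of sllmul() -- check_sllmul() is our
 * illustrative helper, not part of the library.  In 31.32 fixed point,
 * 1.5 is 0x0000000180000000 and 2.5 is 0x0000000280000000; their exact
 * product 3.75 is 0x00000003c0000000.
 */
static __inline__ int check_sllmul(void)
{
    return sllmul(0x0000000180000000LL, 0x0000000280000000LL)
        == 0x00000003c0000000LL;
}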

static __inline__ sll sllinv(sll v)
{
    int sgn = 0;
    sll u;
    ull s = -1;

    /* Use positive numbers, or the approximation won't work */
    if (v < CONST_0) {
        v = sllneg(v);
        sgn = 1;
    }

    /* A rough first guess - always less than the actual reciprocal */
    for (u = v; u; u = (ull) u >> 1)
        s >>= 1;

    /* Newton's Method */
    u = sllmul(s, _sllsub(CONST_2, sllmul(v, s)));
    u = sllmul(u, _sllsub(CONST_2, sllmul(v, u)));
    u = sllmul(u, _sllsub(CONST_2, sllmul(v, u)));
    u = sllmul(u, _sllsub(CONST_2, sllmul(v, u)));
    u = sllmul(u, _sllsub(CONST_2, sllmul(v, u)));
    u = sllmul(u, _sllsub(CONST_2, sllmul(v, u)));

    return ((sgn) ? sllneg(u) : u);
}
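
/*
 * Note on sllinv(): the six unrolled steps above are Newton's method on
 * f(u) = 1/u - v, whose update is u' = u * (2 - v * u).  Convergence is
 * quadratic, so each pass roughly doubles the number of correct bits,
 * and six passes from the shifted seed are enough for the 64-bit format.
 */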

static __inline__ sll slldiv(sll left, sll right)
{
    return sllmul(left, sllinv(right));
}

static __inline__ sll sllmul2(sll x)
{
    return x << 1;
}

static __inline__ sll sllmul4(sll x)
{
    return x << 2;
}

static __inline__ sll sllmul2n(sll x, int n)
{
    sll y;

#if defined(__arm__)
    /*
     * On ARM we need to use explicit assembly, since the compiler
     * doesn't know the range of n is limited and decides to call
     * a library function instead.
     */
    __asm__ (
        "@ sllmul2n\n\t"
        "mov    %R0, %R1, lsl %2\n\t"
        "orr    %R0, %R0, %Q1, lsr %3\n\t"
        "mov    %Q0, %Q1, lsl %2\n\t"
        : "=r" (y)
        : "r" (x), "rM" (n), "rM" (32 - n)
    );
#else
    y = x << n;
#endif

    return y;
}

static __inline__ sll slldiv2(sll x)
{
    return x >> 1;
}

static __inline__ sll slldiv4(sll x)
{
    return x >> 2;
}

static __inline__ sll slldiv2n(sll x, int n)
{
    sll y;

#if defined(__arm__)
    /*
     * On ARM we need to use explicit assembly, since the compiler
     * doesn't know the range of n is limited and decides to call
     * a library function instead.
     */
    __asm__ (
        "@ slldiv2n\n\t"
        "mov    %Q0, %Q1, lsr %2\n\t"
        "orr    %Q0, %Q0, %R1, lsl %3\n\t"
        "mov    %R0, %R1, asr %2\n\t"
        : "=r" (y)
        : "r" (x), "rM" (n), "rM" (32 - n)
    );
#else
    y = x >> n;
#endif

    return y;
}
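
/*
 * A caution on the shift-based divisions above: an arithmetic right
 * shift rounds toward minus infinity rather than chopping toward zero,
 * so for negative arguments slldiv2() et al. can differ from slldiv()
 * by one least significant bit.  For example, slldiv2(-1LL) stays -1LL
 * (the smallest negative fraction) instead of chopping to zero.
 */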

/*
 * Unpack the IEEE floating point double format and put it in fixed point
 * sll format.
 */
static __inline__ sll dbl2sll(double dbl)
{
    union {
        double d;
        unsigned u[2];
        ull ull;
        sll sll;
    } in, retval;
    register unsigned exp;

    /* Move into memory as args might be passed in regs */
    in.d = dbl;

#if defined(__arm__)

    /* ARM architecture has a big-endian double */
    exp = in.u[0];
    in.u[0] = in.u[1];
    in.u[1] = exp;

#endif /* defined(__arm__) */

    /* Leading 1 is assumed by IEEE */
    retval.u[1] = 0x40000000;

    /* Unpack the mantissa into the unsigned long */
    retval.u[1] |= (in.u[1] << 10) & 0x3ffffc00;
    retval.u[1] |= (in.u[0] >> 22) & 0x000003ff;
    retval.u[0] = in.u[0] << 10;

    /* Extract the exponent and align the decimals */
    exp = (in.u[1] >> 20) & 0x7ff;
    if (exp)
        retval.ull >>= 1053 - exp;
    else
        return 0L;

    /* Negate if negative flag set */
    if (in.u[1] & 0x80000000)
        retval.sll = -retval.sll;

    return retval.sll;
}
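
/*
 * A hedged sanity sketch of dbl2sll() on a little-endian host --
 * check_dbl2sll() is our illustrative helper, not part of the library.
 * 1.0 should unpack to CONST_1, and -2.5 to the negation of
 * 0x0000000280000000.
 */
static __inline__ int check_dbl2sll(void)
{
    return dbl2sll(1.0) == CONST_1
        && dbl2sll(-2.5) == -0x0000000280000000LL;
}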

static __inline__ sll float2sll(float f)
{
    return dbl2sll((double)f);
}

static __inline__ double sll2dbl(sll s)
{
    union {
        double d;
        unsigned u[2];
        ull ull;
        sll sll;
    } in, retval;
    register unsigned exp;
    register unsigned flag;

    if (s == 0)
        return 0.0;

    /* Move into memory as args might be passed in regs */
    in.sll = s;

    /* Handle the negative flag */
    if (in.sll < 0) {
        flag = 0x80000000;
        in.ull = sllneg(in.sll);
    } else
        flag = 0x00000000;

    /* Normalize */
    for (exp = 1053; in.ull && (in.u[1] & 0x80000000) == 0; exp--) {
        in.ull <<= 1;
    }
    in.ull <<= 1;
    exp++;
    in.ull >>= 12;
    retval.ull = in.ull;
    retval.u[1] |= flag | (exp << 20);

#if defined(__arm__)

    /* ARM architecture has a big-endian double */
    exp = retval.u[0];
    retval.u[0] = retval.u[1];
    retval.u[1] = exp;

#endif /* defined(__arm__) */

    return retval.d;
}

static __inline__ float sll2float(sll s)
{
    return ((float)sll2dbl(s));
}
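
/*
 * A hedged round-trip sketch -- check_roundtrip() is our illustrative
 * helper, not part of the library.  3.25 needs only two fractional
 * bits, so it should survive double -> sll -> double exactly.
 */
static __inline__ int check_roundtrip(void)
{
    return sll2dbl(dbl2sll(3.25)) == 3.25;
}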

/*
 * Calculate cos x where -pi/4 <= x <= pi/4
 *
 * Description:
 *  cos x = 1 - x^2 / 2! + x^4 / 4! - ... + x^(2N) / (2N)!
 *  Note that (pi/4)^12 / 12! < 2^-32, which is below the smallest
 *  representable fraction.
 */
static __inline__ sll _sllcos(sll x)
{
    sll retval, x2;
    x2 = sllmul(x, x);
    /*
     * cos x = t0 + t1 + t2 + t3 + t4 + t5 + t6
     *
     * f0 =  0! =  1
     * f1 =  2! =  2 *  1 * f0 =   2 * f0
     * f2 =  4! =  4 *  3 * f1 =  12 * f1
     * f3 =  6! =  6 *  5 * f2 =  30 * f2
     * f4 =  8! =  8 *  7 * f3 =  56 * f3
     * f5 = 10! = 10 *  9 * f4 =  90 * f4
     * f6 = 12! = 12 * 11 * f5 = 132 * f5
     *
     * t0 = 1
     * t1 = -t0 * x2 /   2 = -t0 * x2 * CONST_1_2
     * t2 = -t1 * x2 /  12 = -t1 * x2 * CONST_1_12
     * t3 = -t2 * x2 /  30 = -t2 * x2 * CONST_1_30
     * t4 = -t3 * x2 /  56 = -t3 * x2 * CONST_1_56
     * t5 = -t4 * x2 /  90 = -t4 * x2 * CONST_1_90
     * t6 = -t5 * x2 / 132 = -t5 * x2 * CONST_1_132
     */
    retval = _sllsub(CONST_1, sllmul(x2, CONST_1_132));
    retval = _sllsub(CONST_1, sllmul(sllmul(x2, retval), CONST_1_90));
    retval = _sllsub(CONST_1, sllmul(sllmul(x2, retval), CONST_1_56));
    retval = _sllsub(CONST_1, sllmul(sllmul(x2, retval), CONST_1_30));
    retval = _sllsub(CONST_1, sllmul(sllmul(x2, retval), CONST_1_12));
    retval = _sllsub(CONST_1, slldiv2(sllmul(x2, retval)));
    return retval;
}

/*
 * Calculate sin x where -pi/4 <= x <= pi/4
 *
 * Description:
 *  sin x = x - x^3 / 3! + x^5 / 5! - ... + x^(2N+1) / (2N+1)!
 *  Note that (pi/4)^13 / 13! < 2^-32, which is below the smallest
 *  representable fraction.
 */
static __inline__ sll _sllsin(sll x)
{
    sll retval, x2;
    x2 = sllmul(x, x);
    /*
     * sin x = t0 + t1 + t2 + t3 + t4 + t5 + t6
     *
     * f0 =  1! =  1
     * f1 =  3! =  3 *  2 * f0 =   6 * f0
     * f2 =  5! =  5 *  4 * f1 =  20 * f1
     * f3 =  7! =  7 *  6 * f2 =  42 * f2
     * f4 =  9! =  9 *  8 * f3 =  72 * f3
     * f5 = 11! = 11 * 10 * f4 = 110 * f4
     * f6 = 13! = 13 * 12 * f5 = 156 * f5
     *
     * t0 = x
     * t1 = -t0 * x2 /   6 = -t0 * x2 * CONST_1_6
     * t2 = -t1 * x2 /  20 = -t1 * x2 * CONST_1_20
     * t3 = -t2 * x2 /  42 = -t2 * x2 * CONST_1_42
     * t4 = -t3 * x2 /  72 = -t3 * x2 * CONST_1_72
     * t5 = -t4 * x2 / 110 = -t4 * x2 * CONST_1_110
     * t6 = -t5 * x2 / 156 = -t5 * x2 * CONST_1_156
     */
    retval = _sllsub(x, sllmul(x2, CONST_1_156));
    retval = _sllsub(x, sllmul(sllmul(x2, retval), CONST_1_110));
    retval = _sllsub(x, sllmul(sllmul(x2, retval), CONST_1_72));
    retval = _sllsub(x, sllmul(sllmul(x2, retval), CONST_1_42));
    retval = _sllsub(x, sllmul(sllmul(x2, retval), CONST_1_20));
    retval = _sllsub(x, sllmul(sllmul(x2, retval), CONST_1_6));
    return retval;
}

static __inline__ sll sllcos(sll x)
{
    int i;
    sll retval;

    /* Calculate cos (x - i * pi/2), where -pi/4 <= x - i * pi/2 <= pi/4 */
    i = sll2int(_slladd(sllmul(x, CONST_2_PI), CONST_1_2));
    x = _sllsub(x, sllmul(int2sll(i), CONST_PI_2));

    switch (i & 3) {
        default:
        case 0:
            retval = _sllcos(x);
            break;
        case 1:
            retval = sllneg(_sllsin(x));
            break;
        case 2:
            retval = sllneg(_sllcos(x));
            break;
        case 3:
            retval = _sllsin(x);
            break;
    }
    return retval;
}

static __inline__ sll sllsin(sll x)
{
    int i;
    sll retval;

    /* Calculate sin (x - i * pi/2), where -pi/4 <= x - i * pi/2 <= pi/4 */
    i = sll2int(_slladd(sllmul(x, CONST_2_PI), CONST_1_2));
    x = _sllsub(x, sllmul(int2sll(i), CONST_PI_2));

    switch (i & 3) {
        default:
        case 0:
            retval = _sllsin(x);
            break;
        case 1:
            retval = _sllcos(x);
            break;
        case 2:
            retval = sllneg(_sllsin(x));
            break;
        case 3:
            retval = sllneg(_sllcos(x));
            break;
    }
    return retval;
}

static __inline__ sll slltan(sll x)
{
    int i;
    sll retval;
    i = sll2int(_slladd(sllmul(x, CONST_2_PI), CONST_1_2));
    x = _sllsub(x, sllmul(int2sll(i), CONST_PI_2));
    switch (i & 3) {
        default:
        case 0:
        case 2:
            retval = slldiv(_sllsin(x), _sllcos(x));
            break;
        case 1:
        case 3:
            retval = sllneg(slldiv(_sllcos(x), _sllsin(x)));
            break;
    }
    return retval;
}
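
/*
 * A hedged sanity sketch of the trig routines -- check_sllsin() is our
 * illustrative helper, not part of the library.  sin(pi/6) = 0.5, so
 * the fixed-point result should land within a small tolerance of
 * CONST_1_2 (0x10000 here is 2^-16, roughly 1.5e-5).
 */
static __inline__ int check_sllsin(void)
{
    sll err = _sllsub(sllsin(slldiv(CONST_PI, int2sll(6))), CONST_1_2);
    return sll_abs(err) < 0x0000000000010000LL;
}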

/*
 * atan x = SUM[n=0,) (-1)^n * x^(2n + 1)/(2n + 1), |x| < 1
 *
 * Two term approximation
 *  a = x - x^3/3
 * Gives us
 *  atan x = a + e
 * where e is the residual error.  Let e = atan d
 *  atan x = a + atan d
 * Rearrange
 *  atan x - a = atan d
 * Apply tan to both sides
 *  tan (atan x - a) = tan atan d
 *  tan (atan x - a) = d
 * Applying the standard formula
 *  tan (u - v) = (tan u - tan v) / (1 + tan u * tan v)
 * Gives us
 *  tan (atan x - a) = (tan atan x - tan a) / (1 + tan atan x * tan a)
 * Let t = tan a
 *  tan (atan x - a) = (x - t) / (1 + x * t)
 * So finally
 *  atan x = a + atan ((x - t) / (1 + x * t))
 * And the typical worst case is x = 1.0, which converges in 3 iterations.
 */
static __inline__ sll _sllatan(sll x)
{
    sll a, t, retval;

    /* First iteration */
    a = sllmul(x, _sllsub(CONST_1, sllmul(x, sllmul(x, CONST_1_3))));
    retval = a;

    /* Second iteration */
    t = slldiv(_sllsin(a), _sllcos(a));
    x = slldiv(_sllsub(x, t), _slladd(CONST_1, sllmul(t, x)));
    a = sllmul(x, _sllsub(CONST_1, sllmul(x, sllmul(x, CONST_1_3))));
    retval = _slladd(retval, a);

    /* Third iteration */
    t = slldiv(_sllsin(a), _sllcos(a));
    x = slldiv(_sllsub(x, t), _slladd(CONST_1, sllmul(t, x)));
    a = sllmul(x, _sllsub(CONST_1, sllmul(x, sllmul(x, CONST_1_3))));
    return _slladd(retval, a);
}

static __inline__ sll sllatan(sll x)
{
    sll retval;

    if (x < sllneg(CONST_1))
        /* if x < -1 then atan x = -pi/2 - atan 1/x */
        retval = sllneg(CONST_PI_2);
    else if (x > CONST_1)
        /* if x > 1 then atan x = pi/2 - atan 1/x */
        retval = CONST_PI_2;
    else
        return _sllatan(x);
    return _sllsub(retval, _sllatan(sllinv(x)));
}
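
/*
 * A hedged sanity sketch of sllatan() -- check_sllatan() is our
 * illustrative helper, not part of the library.  atan 1 = pi/4, the
 * documented worst case for convergence.
 */
static __inline__ int check_sllatan(void)
{
    sll err = _sllsub(sllatan(CONST_1), CONST_PI_4);
    return sll_abs(err) < 0x0000000000010000LL;    /* within ~1.5e-5 */
}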

/*
 * Calculate e^x where -0.5 <= x <= 0.5
 *
 * Description:
 *  e^x = x^0 / 0! + x^1 / 1! + ... + x^N / N!
 *  Note that 0.5^11 / 11! < 2^-32, which is below the smallest
 *  representable fraction.
 */
static __inline__ sll _sllexp(sll x)
{
    sll retval;
    retval = _slladd(CONST_1, sllmul(x, CONST_1_11)); /* merged from two redundant lines */
    retval = _slladd(CONST_1, sllmul(retval, sllmul(x, CONST_1_10)));
    retval = _slladd(CONST_1, sllmul(retval, sllmul(x, CONST_1_9)));
    retval = _slladd(CONST_1, sllmul(retval, slldiv2n(x, 3)));
    retval = _slladd(CONST_1, sllmul(retval, sllmul(x, CONST_1_7)));
    retval = _slladd(CONST_1, sllmul(retval, sllmul(x, CONST_1_6)));
    retval = _slladd(CONST_1, sllmul(retval, sllmul(x, CONST_1_5)));
    retval = _slladd(CONST_1, sllmul(retval, slldiv4(x)));
    retval = _slladd(CONST_1, sllmul(retval, sllmul(x, CONST_1_3)));
    retval = _slladd(CONST_1, sllmul(retval, slldiv2(x)));
    retval = _slladd(CONST_1, sllmul(retval, x)); /* the final x/1! term; the result is incorrect without it */
    return retval;
}

/*
 * Calculate e^x where x is arbitrary
 */
static __inline__ sll sllexp(sll x)
{
    int i;
    sll e, retval;

    e = CONST_E;

    /* -0.5 <= x <= 0.5 */
    i = sll2int(_slladd(x, CONST_1_2));
    retval = _sllexp(_sllsub(x, int2sll(i)));

    /* i >= 0 */
    if (i < 0) {
        i = -i;
        e = CONST_1_E;
    }

    /* Scale the result */
    for (; i; i >>= 1) {
        if (i & 1)
            retval = sllmul(retval, e);
        e = sllmul(e, e);
    }
    return retval;
}
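
/*
 * A hedged sanity sketch of sllexp() -- check_sllexp() is our
 * illustrative helper, not part of the library.  e^1 reduces to
 * _sllexp(0) scaled once by CONST_E, so it should land within a few
 * LSBs of CONST_E.
 */
static __inline__ int check_sllexp(void)
{
    sll err = _sllsub(sllexp(CONST_1), CONST_E);
    return sll_abs(err) < 0x0000000000010000LL;
}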

/*
 * Calculate the natural logarithm using the Newton-Raphson method
 */
static __inline__ sll slllog(sll x)
{
    sll x1, ln = 0;

    /* Scale: e^(-1/2) <= x <= e^(1/2) */
    while (x < CONST_1_SQRTE) {
        ln = _sllsub(ln, CONST_1);
        x = sllmul(x, CONST_E);
    }
    while (x > CONST_SQRTE) {
        ln = _slladd(ln, CONST_1);
        x = sllmul(x, CONST_1_E);
    }

    /* First iteration */
    x1 = sllmul(_sllsub(x, CONST_1), slldiv2(_sllsub(x, CONST_3)));
    ln = _sllsub(ln, x1);
    x = sllmul(x, _sllexp(x1));

    /* Second iteration */
    x1 = sllmul(_sllsub(x, CONST_1), slldiv2(_sllsub(x, CONST_3)));
    ln = _sllsub(ln, x1);
    x = sllmul(x, _sllexp(x1));

    /* Third iteration */
    x1 = sllmul(_sllsub(x, CONST_1), slldiv2(_sllsub(x, CONST_3)));
    ln = _sllsub(ln, x1);

    return ln;
}
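
/*
 * A hedged sanity sketch of slllog() -- check_slllog() is our
 * illustrative helper, not part of the library.  ln e = 1, and the
 * scaling loop reduces the argument to roughly 1 in a single step.
 */
static __inline__ int check_slllog(void)
{
    sll err = _sllsub(slllog(CONST_E), CONST_1);
    return sll_abs(err) < 0x0000000000010000LL;
}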

/*
 * ln x^y = y * ln x
 * e^(ln x^y) = e^(y * ln x)
 * x^y = e^(y * ln x), for x > 0 since slllog() is used
 */
static __inline__ sll sllpow(sll x, sll y)
{
    if (y == CONST_0)
        return CONST_1;
    return sllexp(sllmul(y, slllog(x)));
}
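
/*
 * A hedged sanity sketch of sllpow() -- check_sllpow() is our
 * illustrative helper, not part of the library.  2^0.5 should land
 * close to CONST_SQRT2; the tolerance is wider here (2^-12, roughly
 * 2.4e-4) because errors from slllog() and sllexp() compound.
 */
static __inline__ int check_sllpow(void)
{
    sll err = _sllsub(sllpow(CONST_2, CONST_1_2), CONST_SQRT2);
    return sll_abs(err) < 0x0000000000100000LL;
}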

/*
 * Consider a parabola centered on the y-axis
 *  y = a * x^2 + b
 * It has zeros (y = 0) at
 *  a * x^2 + b = 0
 *  a * x^2 = -b
 *  x^2 = -b / a
 *  x = +- (-b / a)^(1 / 2)
 * Letting a = 1 and b = -X
 *  y = x^2 - X
 *  x = +- X^(1 / 2)
 * Which is convenient since we want to find the square root of X, and we
 * can use Newton's Method to find the zeros of any f(x)
 *  xn = x - f(x) / f'(x)
 * Applying Newton's Method to our parabola
 *  f(x) = x^2 - X
 *  xn = x - (x^2 - X) / (2 * x)
 *  xn = x - (x - X / x) / 2
 * To make this converge quickly, we scale X so that
 *  X = 4^n * z
 * Taking the square root of both sides
 *  X^(1 / 2) = (4^n * z)^(1 / 2)
 *  X^(1 / 2) = 2^n * z^(1 / 2)
 * Let N = 2^n
 *  X^(1 / 2) = N * z^(1 / 2)
 * We want this to converge to the positive root, so we must start at a
 * point
 *  0 < start <= z^(1 / 2)
 * or
 *  z^(1 / 2) <= start
 * After scaling, 0.5 <= z < 2, so z^(1 / 2) lies between
 *  (1/2)^(1/2) = 0.707
 *  2^(1/2) = 1.414
 * A good choice is 1, which lies in the middle and takes 4 iterations to
 * converge from either extreme.
 */
static __inline__ sll sllsqrt(sll x)
{
    sll n, xn;

    /* Start with a scaling factor of 1 */
    n = CONST_1;

    /* Quick solutions for the simple cases */
    if (x <= CONST_0 || x == CONST_1)
        return x;

    /* Scale x so that 0.5 <= x < 2 */
    while (x >= CONST_2) {
        x = slldiv4(x);
        n = sllmul2(n);
    }
    while (x < CONST_1_2) {
        x = sllmul4(x);
        n = slldiv2(n);
    }

    /* Simple solution if x = 4^n */
    if (x == CONST_1)
        return n;

    /* The starting point */
    xn = CONST_1;

    /* Four iterations will be enough */
    xn = _sllsub(xn, slldiv2(_sllsub(xn, slldiv(x, xn))));
    xn = _sllsub(xn, slldiv2(_sllsub(xn, slldiv(x, xn))));
    xn = _sllsub(xn, slldiv2(_sllsub(xn, slldiv(x, xn))));
    xn = _sllsub(xn, slldiv2(_sllsub(xn, slldiv(x, xn))));

    /* Scale the result */
    return sllmul(n, xn);
}
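
/*
 * A hedged sanity sketch of sllsqrt() -- check_sllsqrt() is our
 * illustrative helper, not part of the library.  4 is a power of four,
 * so it reduces to the exact shortcut above and must return exactly 2.
 */
static __inline__ int check_sllsqrt(void)
{
    return sllsqrt(CONST_4) == CONST_2;
}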

#endif

#ifdef __cplusplus
}
#endif

#endif /* MATHSLL_H */