Untitled

/**
 * Helper functions used for the diagnostic of the vdt routines.
 * They are not optimised for speed.
 * Authors: Danilo Piparo CERN
 **/


/*
 * VDT is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef VDTHELPER_H_
#define VDTHELPER_H_

#include <bitset>
#include <iostream>
#include <sstream>
#include <string>
#include <functional>
#include "inttypes.h"
// #include "x86intrin.h"
#include <cmath> //for log2
#include "time.h"
#include "sys/time.h"

#ifdef __APPLE__
#include <CoreServices/CoreServices.h>
#include <mach/mach.h>
#include <mach/mach_time.h>
#include <unistd.h>
#endif


namespace{

// Establish the size of the double and single precision and the bitsets
constexpr double _tmp=0;
constexpr uint32_t dp_size_in_bits = sizeof(_tmp)*8;
using dp_bitset = std::bitset<dp_size_in_bits>;

}

namespace vdth{

//------------------------------------------------------------------------------

// Useful alias for some functions
using dpdpfunction = std::function<double(double)>;
using dpdpfunctionv = std::function<void (uint32_t, double*, double*)>;
using spspfunction = std::function<float(float)>;
using spspfunctionv = std::function<void(uint32_t, float*, float*)>;

using dpdp2function = std::function<double(double,double)>;
using dpdp2functionv = std::function<void (uint32_t, double*, double*, double*)>;
using spsp2function = std::function<float(float,float)>;
using spsp2functionv = std::function<void(uint32_t, float*, float*, float*)>;
//maybe for convenience
template<class T> using genfpfunction = std::function<T(T)>;
template<class T> using genfpfunctionv = std::function<void(uint32_t, T*, T*)>;
template<class T> using genfp2function = std::function<T(T,T)>;
template<class T> using genfp2functionv = std::function<void(uint32_t, T*, T*, T*)>;
//------------------------------------------------------------------------------
/// Useful union

union standard{
    double dp;
    float sp[2];
    uint64_t li;
};

//------------------------------------------------------------------------------

template<class T>
uint32_t inline getSizeInbits(const T x){
    return sizeof(x) * 8;
}

//------------------------------------------------------------------------------

/// Convert a fp into a uint64_t not optimised for speed
template<class T>
inline uint64_t fp2uint64(const T x){
    const uint32_t size = getSizeInbits(x);
    standard conv;
    conv.dp=0.;
    if (size==dp_size_in_bits)
        conv.dp=x;
    else
        conv.sp[0]=x;
    return conv.li;

}

//------------------------------------------------------------------------------
/// Convert a double into a bitset
template<class T>
inline const dp_bitset fp2bs( const T x ){
    dp_bitset const bits (fp2uint64(x));
    return bits;
}

//------------------------------------------------------------------------------
/// Print as a dp formatted bitset
template<class T>
const std::string getbsasstr(const T x){

    const uint32_t size = getSizeInbits(x);

    uint32_t offset = 0;
    uint32_t exp_size = 11;
    uint32_t mant_size = 52;
    if (size!=dp_size_in_bits){
        offset = 32;
        exp_size = 8;
        mant_size = 23;
    }

    // Convert the bitstream to string
    std::string bitset_as_string (fp2bs(x).to_string());

    std::ostringstream os;

    // sign
    os  << bitset_as_string[offset] << " ";
    // exponent
    for (unsigned int i=offset+1;i<offset+1+exp_size;i++)
        os <<  bitset_as_string[i];
    os << " ";
    //mantissa
    for (unsigned int i=offset+1+exp_size;i<offset+1+exp_size+mant_size;i++)
        os <<  bitset_as_string[i];

    return os.str();
}


//------------------------------------------------------------------------------
/// Returns most significative different bit dp
template <class T>
uint16_t diffbit(const T a,const T b ){
    /// make a xor
    uint64_t ia = fp2uint64(a);
    uint64_t ib = fp2uint64(b);
    uint64_t c = ia>ib? ia-ib : ib -ia;
    //uint64_t c = ia^ib;
    /// return the log2+1
    return log2(c)+1;
}

//------------------------------------------------------------------------------

///Check and print which instructions sets are enabled.
void print_instructions_info(){

    std::ostringstream os;
    os << "List of enabled instructions' sets:\n";

    os << " o SSE2 instructions set "
#ifndef __SSE2__
            << "not "
#endif
            << "enabled.\n"

            << " o SSE3 instructions set "
#ifndef __SSE3__
            << "not "
#endif
            << "enabled.\n"

            << " o SSE4.1 instructions set "
#ifndef __SSE4_1__
            << "not "
#endif
            << "enabled.\n"

            << " o AVX instructions set "
#ifndef __AVX__
            << "not "
#endif
            << "enabled.\n";
    std::cout << os.str();
}

//------------------------------------------------------------------------------

/// Print the different bit of two fp numbers
template<class T>
void print_different_bit(const T a, const T b, const bool show_identical=true){

    std::cout.precision(10);
    std::cout << "Different bit between " << a << " and " << b
            << " is " << diffbit(a,b) << std::endl;
    if (show_identical)
        std::cout << getbsasstr(a) << std::endl
        << getbsasstr(b) << std::endl<< std::endl;
}


//------------------------------------------------------------------------------

/// Invoke two functions and print on screen their argument and different bits
template<class T>
void printFuncDiff(const std::string& func_name, std::function<T(T)> f1,std::function<T(T)> f2, const T x){
    std::cout << "Function " << func_name << "(" << x << ")" <<  std::endl;
    print_different_bit(f1(x),f2(x),true);
}

/// Invoke two functions and print on screen their argument and different bits
template<class T>
void printFuncDiff(const std::string& func_name, std::function<T(T,T)> f1,std::function<T(T,T)> f2, const T x, const T y){
    std::cout << "Function " << func_name << "(" << x << ", "<< y <<")" <<  std::endl;
    print_different_bit(f1(x,y),f2(x,y),true);
}

//------------------------------------------------------------------------------

/// Invoke two functions and print on screen their argument and different bits
template<class T>
void printFuncDiff(const std::string& func_name,
        genfpfunctionv<T> f1,
        genfpfunctionv<T> f2,
        T* x_arr,
        const uint32_t size){
    std::cout << "Function " << func_name << std::endl;
    T* res_1 = new T[size];
    f1(size,x_arr,res_1);
    T* res_2 = new T[size];
    f2(size,x_arr,res_2);
    for (uint32_t i=0;i<size;i++){
        std::cout << "Calculated in " << x_arr[i] << std::endl;
        print_different_bit(res_1[i],res_2[i],true);
    };
    delete [] res_1;
    delete [] res_2;
}

/// Invoke two functions and print on screen their argument and different bits
template<class T>
void printFuncDiff(const std::string& func_name,
        genfp2functionv<T> f1,
        genfp2functionv<T> f2,
        T* x_arr,
        T* y_arr,
        const uint32_t size){
    std::cout << "Function " << func_name << std::endl;
    T* res_1 = new T[size];
    f1(size,x_arr,y_arr,res_1);
    T* res_2 = new T[size];
    f2(size,x_arr,y_arr,res_2);
    for (uint32_t i=0;i<size;i++){
        std::cout << "Calculated in (" << x_arr[i] << ", " << y_arr[i] << ")" << std::endl;
        print_different_bit(res_1[i],res_2[i],true);
    };
    delete [] res_1;
    delete [] res_2;
}

//------------------------------------------------------------------------------
// Function tests
/// Test a fp function with a double (double) signatures
template<class T>
void printFuncDiff(const std::string& name,
        std::function<T(T)> fpfunction,
        std::function<T(T)> fpfunction_ref,
        T* fpvals,
        const uint32_t size){

    for (uint32_t i=0;i<size;i++)
        printFuncDiff ( name,
                (std::function<T(T)>) fpfunction,
                (std::function<T(T)>) fpfunction_ref,
                fpvals[i] );

}


//------------------------------------------------------------------------------
// Function tests
/// Test a fp function with a double (double) signatures
template<class T>
void printFuncDiff(const std::string& name,
        std::function<T(T,T)> fpfunction,
        std::function<T(T,T)> fpfunction_ref,
        T* fpvals1,
        T* fpvals2,
        const uint32_t size){

    for (uint32_t i=0;i<size;i++)
        printFuncDiff ( name,
                (std::function<T(T,T)>) fpfunction,
                (std::function<T(T,T)>) fpfunction_ref,
                fpvals1[i],
                fpvals2[i]);

}

//------------------------------------------------------------------------------
/// Get the clock cycles
class timer{
public:
    timer(){}
    ~timer(){}
    void print(){
        const uint64_t nsecs=get_elapsed_time();
        std::cout << "Time elapsed: " << nsecs << " nanoseconds.\n";// ("
        //<< m_get_elapsed_clocks(nsecs) << " clock)\n";
    }
#if defined (__APPLE__)
    void inline start(){m_time1=mach_absolute_time();}
    void inline stop(){m_time2=mach_absolute_time();}
    uint64_t get_elapsed_time(){
        static mach_timebase_info_data_t    sTimebaseInfo;
        const uint64_t elapsed = m_time2 - m_time1;
        // Convert to nanoseconds.
        // Have to do some pointer fun because AbsoluteToNanoseconds
        // works in terms of UnsignedWide, which is a structure rather
        // than a proper 64-bit integer.

        if ( sTimebaseInfo.denom == 0 ) {
            (void) mach_timebase_info(&sTimebaseInfo);
        }

        // Do the maths. We hope that the multiplication doesn't
        // overflow; the price you pay for working in fixed point.

        uint64_t elapsedNano = elapsed * sTimebaseInfo.numer / sTimebaseInfo.denom;

        return elapsedNano;
    }

private:
    uint64_t m_time1,m_time2;

#else
    void inline start(){
        clock_gettime(CLOCK_THREAD_CPUTIME_ID, &m_time1);
    }
    void inline stop(){
        clock_gettime(CLOCK_THREAD_CPUTIME_ID, &m_time2);
    }

    /// Return time in nanoseconds

    uint64_t get_elapsed_time(){
        timespec temp;
        temp.tv_sec = m_time2.tv_sec-m_time1.tv_sec;
        temp.tv_nsec = m_time2.tv_nsec-m_time1.tv_nsec;
        uint64_t elapsed_time = temp.tv_nsec;
        elapsed_time += 1e9*temp.tv_sec;
        return elapsed_time;
    }

private:
    timespec m_time1,m_time2;
#endif

};

//------------------------------------------------------------------------------
// inline uint64_t getcpuclock() {
//  return __rdtsc();
// }


}//end of namespace vdth
#endif

/*
 * vdtcore_common.h
 * Common functions for the vdt routines.
 * The basic idea is to exploit Pade polynomials.
 * A lot of ideas were inspired by the cephes math library (by Stephen L. Moshier
 * moshier@na-net.ornl.gov) as well as actual code for the exp, log, sin, cos,
 * tan, asin, acos and atan functions. The Cephes library can be found here:
 * http://www.netlib.org/cephes/
 *
 *  Created on: Jun 23, 2012
 *      Author: Danilo Piparo, Thomas Hauth, Vincenzo Innocente
 */

/*
 * VDT is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef VDTCOMMON_H_
#define VDTCOMMON_H_

#include "inttypes.h"
#include <cmath>

namespace vdt{

namespace details{

// Constants
const double TWOPI = 2.*M_PI;
const double PI = M_PI;
const double PIO2 = M_PI_2;
const double PIO4 = M_PI_4;
const double ONEOPIO4 = 4./M_PI;

const float TWOPIF = 2.*M_PI;
const float PIF = M_PI;
const float PIO2F = M_PI_2;
const float PIO4F = M_PI_4;
const float ONEOPIO4F = 4./M_PI;

const double MOREBITS = 6.123233995736765886130E-17;


const float MAXNUMF = 3.4028234663852885981170418348451692544e38f;

//------------------------------------------------------------------------------

/// Used to switch between different type of interpretations of the data (64 bits)
union ieee754{
    ieee754 () {};
    ieee754 (double thed) {d=thed;};
    ieee754 (uint64_t thell) {ll=thell;};
    ieee754 (float thef) {f[0]=thef;};
    ieee754 (uint32_t thei) {i[0]=thei;};
  double d;
  float f[2];
  uint32_t i[2];
  uint64_t ll;
  uint16_t s[4];
};

//------------------------------------------------------------------------------

/// Converts an unsigned long long to a double
inline double uint642dp(uint64_t ll) {
  ieee754 tmp;
  tmp.ll=ll;
  return tmp.d;
}

//------------------------------------------------------------------------------

/// Converts a double to an unsigned long long
inline uint64_t dp2uint64(double x) {
  ieee754 tmp;
  tmp.d=x;
  return tmp.ll;
}

//------------------------------------------------------------------------------
/// Makes an AND of a double and a unsigned long long
inline double dpANDuint64(const double x, const uint64_t i ){
  return uint642dp(dp2uint64(x) & i);
}
//------------------------------------------------------------------------------
/// Makes an OR of a double and a unsigned long long
inline double dpORuint64(const double x, const uint64_t i ){
  return uint642dp(dp2uint64(x) | i);
}

/// Makes a XOR of a double and a unsigned long long
inline double dpXORuint64(const double x, const uint64_t i ){
  return uint642dp(dp2uint64(x) ^ i);
}

//------------------------------------------------------------------------------
inline uint64_t getSignMask(const double x){
  const uint64_t mask=0x8000000000000000ULL;
  return dp2uint64(x) & mask;
}

//------------------------------------------------------------------------------
/// Converts an int to a float
inline float uint322sp(int x) {
    ieee754 tmp;
    tmp.i[0]=x;
    return tmp.f[0];
  }

//------------------------------------------------------------------------------
/// Converts a float to an int
inline uint32_t sp2uint32(float x) {
    ieee754 tmp;
    tmp.f[0]=x;
    return tmp.i[0];
  }

//------------------------------------------------------------------------------
/// Makes an AND of a float and a unsigned long
inline float spANDuint32(const float x, const uint32_t i ){
  return uint322sp(sp2uint32(x) & i);
}
//------------------------------------------------------------------------------
/// Makes an OR of a float and a unsigned long
inline float spORuint32(const float x, const uint32_t i ){
  return uint322sp(sp2uint32(x) | i);
}

//------------------------------------------------------------------------------
/// Makes an OR of a float and a unsigned long
inline float spXORuint32(const float x, const uint32_t i ){
  return uint322sp(sp2uint32(x) ^ i);
}
//------------------------------------------------------------------------------
/// Get the sign mask
inline uint32_t getSignMask(const float x){
  const uint32_t mask=0x80000000;
  return sp2uint32(x) & mask;
}

//------------------------------------------------------------------------------
/// Like frexp but vectorising and the exponent is a double.
inline double getMantExponent(const double x, double & fe){

  uint64_t n = dp2uint64(x);

  // Shift to the right up to the beginning of the exponent.
  // Then with a mask, cut off the sign bit
  uint64_t le = (n >> 52);

  // chop the head of the number: an int contains more than 11 bits (32)
  int32_t e = le; // This is important since sums on uint64_t do not vectorise
  fe = e-1023 ;

  // This puts to 11 zeroes the exponent
  n &=0x800FFFFFFFFFFFFFULL;
  // build a mask which is 0.5, i.e. an exponent equal to 1022
  // which means *2, see the above +1.
  const uint64_t p05 = 0x3FE0000000000000ULL; //dp2uint64(0.5);
  n |= p05;

  return uint642dp(n);
}

//------------------------------------------------------------------------------
/// Like frexp but vectorising and the exponent is a float.
inline float getMantExponentf(const float x, float & fe){

    uint32_t n = sp2uint32(x);
    int32_t e = (n >> 23)-127;
    fe = e;

    // fractional part
    const uint32_t p05f = 0x3f000000; // //sp2uint32(0.5);
    n &= 0x807fffff;// ~0x7f800000;
    n |= p05f;

    return uint322sp(n);

}

//------------------------------------------------------------------------------
/// Converts a fp to an int
inline uint32_t fp2uint(float x) {
    return sp2uint32(x);
  }
/// Converts a fp to an int
inline uint64_t fp2uint(double x) {
    return dp2uint64(x);
  }
/// Converts an int to fp
inline float int2fp(uint32_t i) {
    return uint322sp(i);
  }
/// Converts an int to fp
inline double int2fp(uint64_t i) {
    return uint642dp(i);
  }

//------------------------------------------------------------------------------
/**
 * A vectorisable floor implementation, not only triggered by fast-math.
 * These functions do not distinguish between -0.0 and 0.0, so are not IEC6509
 * compliant for argument -0.0
**/
inline double fpfloor(const double x){
  // no problem since exp is defined between -708 and 708. Int is enough for it!
  int32_t ret = int32_t (x);
  ret-=(sp2uint32(x)>>31);
  return ret;

}
//------------------------------------------------------------------------------
/**
 * A vectorisable floor implementation, not only triggered by fast-math.
 * These functions do not distinguish between -0.0 and 0.0, so are not IEC6509
 * compliant for argument -0.0
**/
inline float fpfloor(const float x){
  int32_t ret = int32_t (x);
  ret-=(sp2uint32(x)>>31);
  return ret;

}

//------------------------------------------------------------------------------


}

} // end of namespace vdt

#endif /* VDTCOMMON_H_ */


/*
 * sincos_common.h
 * The basic idea is to exploit Pade polynomials.
 * A lot of ideas were inspired by the cephes math library (by Stephen L. Moshier
 * moshier@na-net.ornl.gov) as well as actual code.
 * The Cephes library can be found here:  http://www.netlib.org/cephes/
 *
 *  Created on: Jun 23, 2012
 *      Author: Danilo Piparo, Thomas Hauth, Vincenzo Innocente
 */

/*
 * VDT is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <cmath>
#include <limits>

#ifndef SINCOS_COMMON_H_
#define SINCOS_COMMON_H_

namespace vdt{

namespace details{

// double precision constants

const double DP1sc = 7.85398125648498535156E-1;
const double DP2sc = 3.77489470793079817668E-8;
const double DP3sc = 2.69515142907905952645E-15;

const double C1sin = 1.58962301576546568060E-10;
const double C2sin =-2.50507477628578072866E-8;
const double C3sin = 2.75573136213857245213E-6;
const double C4sin =-1.98412698295895385996E-4;
const double C5sin = 8.33333333332211858878E-3;
const double C6sin =-1.66666666666666307295E-1;

const double C1cos =-1.13585365213876817300E-11;
const double C2cos = 2.08757008419747316778E-9;
const double C3cos =-2.75573141792967388112E-7;
const double C4cos = 2.48015872888517045348E-5;
const double C5cos =-1.38888888888730564116E-3;
const double C6cos = 4.16666666666665929218E-2;

const double DP1 = 7.853981554508209228515625E-1;
const double DP2 = 7.94662735614792836714E-9;
const double DP3 = 3.06161699786838294307E-17;

// single precision constants

const float DP1F = 0.78515625;
const float DP2F = 2.4187564849853515625e-4;
const float DP3F = 3.77489497744594108e-8;

const float T24M1 = 16777215.;

//------------------------------------------------------------------------------

inline double get_sin_px(const double x){
    double px=C1sin;
    px *= x;
    px += C2sin;
    px *= x;
    px += C3sin;
    px *= x;
    px += C4sin;
    px *= x;
    px += C5sin;
    px *= x;
    px += C6sin;
    return px;
}

//------------------------------------------------------------------------------

inline double get_cos_px(const double x){
    double px=C1cos;
    px *= x;
    px += C2cos;
    px *= x;
    px += C3cos;
    px *= x;
    px += C4cos;
    px *= x;
    px += C5cos;
    px *= x;
    px += C6cos;
    return px;
}


//------------------------------------------------------------------------------
/// Reduce to 0 to 45
inline double reduce2quadrant(double x, int32_t& quad) {

    x = fabs(x);
    quad = int (ONEOPIO4 * x); // always positive, so (int) == std::floor
    quad = (quad+1) & (~1);
    const double y = double (quad);
    // Extended precision modular arithmetic
    return ((x - y * DP1) - y * DP2) - y * DP3;
  }

//------------------------------------------------------------------------------
/// Sincos only for -45deg < x < 45deg
inline void fast_sincos_m45_45( const double z, double & s, double &c ) {

    double zz = z * z;
    s = z  +  z * zz * get_sin_px(zz);
    c = 1.0 - zz * .5 + zz * zz * get_cos_px(zz);
  }


//------------------------------------------------------------------------------

} // End namespace details

/// Double precision sincos
/*inline */ void __attribute__ ((noinline)) fast_sincos( const double xx, double & s, double &c ) {
    // I have to use doubles to make it vectorise...

    int j;
    double x = details::reduce2quadrant(xx,j);
    const double signS = (j&4);

    j-=2;

    const double signC = (j&4);
    const double poly = j&2;

    details::fast_sincos_m45_45(x,s,c);

    //swap
    if( poly==0 ) {
      const double tmp = c;
      c=s;
      s=tmp;
    }

    if(signC == 0.)
      c = -c;
    if(signS != 0.)
      s = -s;
    if (xx < 0.)
      s = -s;

  }


// Single precision functions

namespace details {
//------------------------------------------------------------------------------
/// Reduce to 0 to 45
inline float reduce2quadrant(float x, int & quad) {
    /* make argument positive */
    x = fabs(x);

    quad = int (ONEOPIO4F * x); /* integer part of x/PIO4 */

    quad = (quad+1) & (~1);
    const float y = float(quad);
    // quad &=4;
    // Extended precision modular arithmetic
    return ((x - y * DP1F) - y * DP2F) - y * DP3F;
  }


//------------------------------------------------------------------------------


/// Sincos only for -45deg < x < 45deg
inline void fast_sincosf_m45_45( const float x, float & s, float &c ) {

    float z = x * x;

    s = (((-1.9515295891E-4f * z
       + 8.3321608736E-3f) * z
      - 1.6666654611E-1f) * z * x)
      + x;

    c = ((  2.443315711809948E-005f * z
        - 1.388731625493765E-003f) * z
     + 4.166664568298827E-002f) * z * z
      - 0.5f * z + 1.0f;
  }

//------------------------------------------------------------------------------

} // end details namespace

/// Single precision sincos
inline void fast_sincosf( const float xx, float & s, float &c ) {


    int j;
    const float x = details::reduce2quadrant(xx,j);
    int signS = (j&4);

    j-=2;

    const int signC = (j&4);
    const int poly = j&2;

    float ls,lc;
    details::fast_sincosf_m45_45(x,ls,lc);

    //swap
    if( poly==0 ) {
      const float tmp = lc;
      lc=ls; ls=tmp;
    }

    if(signC == 0) lc = -lc;
    if(signS != 0) ls = -ls;
    if (xx<0)  ls = -ls;
    c=lc;
    s=ls;
  }


} // end namespace vdt

#endif


#include <iostream>

#include <cmath>
#include <stdint.h>

const int samples = 2048; // Table size.
const double pi = 3.1415926535897932;
const double pi_half = pi / 2;
const double inverse_pi_half = 1 / pi_half;
const double two_pi = pi * 2;
const double interval = pi_half / samples;
const double inverse_interval = samples / pi_half;

static double table_sin[samples + 2];
static double table_cos_interval[samples + 2];
static double table_cos[samples + 2];

void PopulateTrigonometricTable(double* sin_table, double* cos_table_interval, double* cos_table_2, int samples) {
  double* sin_buffer = sin_table;
  double* cos_buffer = cos_table_interval;
  const double pi_half = 3.1415926535897932 / 2;
  double interval = pi_half / samples;
  for (int i = 0; i < samples + 1; i++) {
    double sample = std::sin(i * interval);
    sin_buffer[i] = sample;
    cos_table_2[samples - i] = sample;
    cos_buffer[samples - i] = sample * interval;
  }

  // Fill this to catch out of bound accesses when calculating Math.sin(pi/2).
  sin_buffer[samples + 1] = std::sin(pi_half + interval);
  cos_table_2[samples + 1] = std::cos(pi_half + interval);
  cos_buffer[samples + 1] = std::cos(pi_half + interval) * interval;
}

void InitTrigonometricFunctions() {
  PopulateTrigonometricTable(table_sin, table_cos_interval, table_cos, samples);
}

// This implements the following algorithm.
// 1) Multiplication takes care of to-number conversion.
// 2) Reduce x to the first quadrant [0, pi/2].
// Conveniently enough, in case of +/-Infinity, we get NaN.
// 3) Replace x by (pi/2-x) if x was in the 2nd or 4th quadrant.
// 4) Do a table lookup for the closest samples to the left and right of x.
// 5) Find the derivatives at those sampling points by table lookup:
// dsin(x)/dx = cos(x) = sin(pi/2-x) for x in [0, pi/2].
// 6) Use cubic spline interpolation to approximate sin(x).
// 7) Negate the result if x was in the 3rd or 4th quadrant.
// 8) Get rid of -0 by adding 0.
double __attribute__ ((noinline)) MathSinImpl(double x) {
  double multiple = std::floor(x * inverse_pi_half);
  int multiple_and_one = ((int)multiple) & 1;
  x = multiple_and_one * pi_half + (1 - (multiple_and_one << 1)) * (x - multiple * pi_half);
  int multiple_and_two = ((int)multiple) & 2;
  double double_index = x * inverse_interval;
  int index = (int)double_index/* | 0*/;
  double t1 = double_index - index;
  double t2 = 1 - t1;
  double y1 = table_sin[index];
  double y2 = table_sin[index + 1];
  double dy = y2 - y1;
  return (t2 * y1 + t1 * y2 +
    t1 * t2 * ((table_cos_interval[index] - dy) * t2 +
           (dy - table_cos_interval[index + 1]) * t1)) *
    (1 - multiple_and_two) + 0;
};

void MathSinCosImpl(double x, double &sinx, double &cosx) {
  sinx = MathSinImpl(x);
  cosx = MathSinImpl(x + pi_half);
}

void MathSinCosImpl2(double x, double &sinx, double &cosx) {
  double multiple = std::floor(x * inverse_pi_half);
  int multiple_and_one = ((int)multiple) & 1;
  x = multiple_and_one * pi_half + (1 - (multiple_and_one << 1)) * (x - multiple * pi_half);
  int multiple_and_two = ((int)multiple) & 2;
  double double_index = x * inverse_interval;
  int index = (int)double_index/* | 0*/;
  double t1 = double_index - index;
  double t2 = 1 - t1;
  double y1 = table_sin[index];
  double y2 = table_sin[index + 1];
  double ycos1 = table_cos[index];
  double ycos2 = table_cos[index + 1];
  double dy = y2 - y1;
  double dycos = ycos1 - ycos2;
  sinx = (t2 * y1 + t1 * y2 + t1 * t2 * ((ycos1 * interval - dy) * t2 + (dy - ycos2 * interval) * t1)) *  (1 - multiple_and_two) + 0;
  cosx = (t1 * ycos2 + t2 * ycos1 + t2 * t1 * ((y2 * interval - dycos) * t1 + (dycos - y1 * interval) * t2)) *  (1 - multiple_and_two) + 0;
};

int main(int argc, char **argv) {
  InitTrigonometricFunctions();
  const int64_t n = (int64_t)1e+8;
  double topval = 10000.0;
  double topvalDiv2p32 = topval / std::numeric_limits<uint32_t>::max();
  double step = topval / n;
  double x = 0.0;
  double maxerr = 0;
  int maxbitserr = 0;
  std::cout.precision(15);
  int64_t sumerr = 0;
  uint32_t r = 0;

  double a = 0;
  for(int64_t i = 0; i < n; i++) {
    r = 69069 * r + 1;
    x = r * topvalDiv2p32;
    // x += step;

    double sin_fast, cos_fast;

    sin_fast = MathSinImpl(x);
    // MathSinCosImpl(x, sin_fast, cos_fast);
    //vdt::fast_sincos(x, sin_fast, cos_fast);
    //sin_fast = std::sin(x);

    a += sin_fast;
#if 0
    double stdsin = std::sin(x);

    //double err = std::abs(1.0 - sin_fast / stdsin);
    int diffbits = vdth::diffbit<double>(sin_fast, stdsin);
    sumerr += diffbits;
    if(diffbits > maxbitserr) {
      std::cout << diffbits  << " (x=" << x << ")" << std::endl;
      maxbitserr = diffbits;
      //maxerr =  err;
    }
#endif
  }
  std::cout << a << std::endl;
  std::cout << ((double)sumerr / n) << std::endl;
    //std::cout << "Hello, world!" << std::endl;
    return 0;
}