fastPow with union, with Benchmark + warmup

#include <iostream>
#include <cmath>
#include <conio.h>
#include <windows.h>


// uses union, no -fno-alias... required
inline double fastPow(const double a, const double b) {
  union {
    double d;
    int x[2];
  } u = { a };
  u.x[1] = (int)(b * (u.x[1] - 1072632447) + 1072632447);
  u.x[0] = 0;
  return u.d;
}

inline double fastStructPow(const double a, const double b) {
  union {
    double d;
    struct {
      int a;
      int b;
    } s;
  } u = { a };
  u.s.b = (int)(b * (u.s.b - 1072632447) + 1072632447);
  u.s.a = 0;
  return u.d;
}


// old version
inline double oldFastPow(const double a, const double b)
{
  int tmp = *(1 + (int*)&a);
  int tmp2 = (int)(b * (tmp - 1072632447) + 1072632447);
  double p = 0.0;
  *(1 + (int * )&p) = tmp2;
  return p;
}

// should be much more precise with large b
inline double fastPrecisePow(double a, const double b) {
  // calculate approximation with just the fraction of the exponent
  int exp = (int) b;
  union {
    double d;
    int x[2];
  } u = { a };
  u.x[1] = (int)((b - exp) * (u.x[1] - 1072632447) + 1072632447);
  u.x[0] = 0;

  // exponentiation by squaring with the exponent's integer part
  // double r = u.d makes everything much slower, not sure why
  double r = 1.0;
  while (exp) {
    if (exp & 1) {
      r *= a;
    }
    a *= a;
    exp >>= 1;
  }

  return r * u.d;
}


int main() {
  double x_max = 4.5423;
  double y_max = 32.3211;

  size_t times = 300000000;
  double dx = x_max / times;
  double dy = y_max / times;

  // warmup
  LARGE_INTEGER frequency;
  QueryPerformanceFrequency(&frequency);
  LARGE_INTEGER start;
  LARGE_INTEGER end;
  QueryPerformanceCounter(&start);
  double sum = 0;
  double x = 0.1;
  double y = 0.1;
  double a = 1.0 / times;
  for (size_t i=0; i<times; ++i) {
    x += dx;
    y += dy;
    sum += x * y + x + y / (i - y);
  }
  QueryPerformanceCounter(&end);
  std::cout << sum << " warmup" << std::endl << std::endl;


  // original pow ///////////////////////////////////
  QueryPerformanceCounter(&start);
  sum = 0;
  x = 0;
  y = 0;
  for (size_t i=0; i<times; ++i) {
    x += dx;
    y += dy;
    sum += pow(x, y);
  }
  QueryPerformanceCounter(&end);
  double t_pow = ((end.QuadPart - start.QuadPart) * 1.0 / frequency.QuadPart);
  std::cout << sum << " pow() " << t_pow << " sec" << std::endl;


  // fastPow ////////////////////////////////
  QueryPerformanceCounter(&start);
  sum = 0;
  x = 0;
  y = 0;
  for (size_t i=0; i<times; ++i) {
    x += dx;
    y += dy;
    sum += fastPow(x, y);
  }
  QueryPerformanceCounter(&end);
  double t_fastPow = ((end.QuadPart - start.QuadPart) * 1.0 / frequency.QuadPart);
  std::cout << sum << " fastPow() " << t_fastPow << " sec, speedup=" << (t_pow / t_fastPow) << std::endl;

  // oldFastPow ////////////////////////////////
  QueryPerformanceCounter(&start);
  sum = 0;
  x = 0;
  y = 0;
  for (size_t i=0; i<times; ++i) {
    x += dx;
    y += dy;
    sum += oldFastPow(x, y);
  }
  QueryPerformanceCounter(&end);
  double t_oldFastPow = ((end.QuadPart - start.QuadPart) * 1.0 / frequency.QuadPart);
  std::cout << sum << " oldFastPow() " << t_oldFastPow << " sec, speedup=" << (t_pow / t_oldFastPow) << std::endl;

  // fastStructPow ////////////////////////////////
  QueryPerformanceCounter(&start);
  sum = 0;
  x = 0;
  y = 0;
  for (size_t i=0; i<times; ++i) {
    x += dx;
    y += dy;
    sum += fastStructPow(x, y);
  }
  QueryPerformanceCounter(&end);
  double t_fastStructPow = ((end.QuadPart - start.QuadPart) * 1.0 / frequency.QuadPart);
  std::cout << sum << " fastStructPow() " << t_fastStructPow << " sec, speedup=" << (t_pow / t_fastStructPow) << std::endl;


  // fastPrecisePow ////////////////////////////////
  QueryPerformanceCounter(&start);
  sum = 0;
  x = 0;
  y = 0;
  for (size_t i=0; i<times; ++i) {
    x += dx;
    y += dy;
    sum += fastPrecisePow(x, y);
  }
  QueryPerformanceCounter(&end);
  double t_fastPrecisePow = ((end.QuadPart - start.QuadPart) * 1.0 / frequency.QuadPart);
  std::cout << sum << " fastPrecisePow() " << t_fastPrecisePow << " sec, speedup=" << (t_pow / t_fastPrecisePow) << std::endl;
}