Sieve of Eratosthenes - segmented for sieving up to 2^64-1

// zrbj_sx_sieve64_v1.cpp - Sieve of Eratosthenes reference implementation
// 2014-11-09
//
// This implements a segmented odds-only Sieve of Eratosthenes for sieving up to 2^64-1,
// for a maximum window size of 32 bits (corresponding to bitmaps with 2^31 bits). All
// index-related (32-bit) variables use the uintxx_t typedef so that potential bugs
// related to index math can be experimentally excluded by changing that typedef to
// uint64_t. The digests for the test ranges and the actual primes have been verified
// against the console version of the primesieve.org program, except for those that are
// beyond its limits.
//
// The actual sieve is implemented in sieve32(), the rest is just overhead for testing.

#undef NDEBUG

//#include "zrbj/hdr_pre.h"
#include <cassert>
#include <climits>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <algorithm>
#include <vector>
//#include "zrbj/hdr_post.h"

// Dealing only with odd numbers and packed (odd-only) bitmaps makes a lot of things easier.
// The same goes for specifying all ranges - like sieve windows - via bit indices and not via
// the corresponding numbers.
//
// For one thing, the small factors bitmap needs only 31 bits, which buys a bit of head room
// for bitmap index variables even if those are uint32_t. It also reduces drastically the
// number of special cases that need to be handled, pretty much everywhere, and this makes
// the code much more robust. Last but not least, this makes it possible to specify all ranges
// as half-open or base + count, without dirty tricks like decreeing that 0 be treated as 2^32.
// Closed ranges (i.e. 'max_bit' instead of 'end_bit') can be made to work but they require
// more care and cause a lot of +1/-1 noise.
//
// The code here is designed for an index type with 32 bits. When bit indexes can go beyond that,
// the corresponding number type (uint64_t) is used instead.
//
// Changing this typedef to uint64_t eliminates all potential bugs related to 32-bit index math
// but it makes the code slower (slightly so if compiled as 64-bit code, signficantly for 32-bit
// code). Compiling as 32-bit code with gcc, the difference is 8.9 s instead of 4.7 s for the
// sieve init, 9.7 s instead of 5 s for sieving a million primes just below 2^64.

typedef uint_fast32_t uintxx_t;

///////////////////////////////////////////////////////////////////////////////////////////////////
// Bit manipulation functions are sometimes faster using the machine word size, sometimes using
// unsigned char. Using templated functions makes it possible to mix and match. It also makes
// it easy to experiment with intrinsics, by the simple expedient of defining specialisations
// for certain types and implementing them via intrinsics.

template<typename word_t>
void set_bit (word_t *p, uintxx_t index)
{
   enum {  BITS_PER_WORD = sizeof(word_t) * CHAR_BIT  };

   // we can trust the compiler to use masking and shifting instead of division; we cannot do that
   // ourselves without having the log2 which cannot easily be computed as a constexpr

   p[index / BITS_PER_WORD] |= word_t(1) << (index % BITS_PER_WORD);
}

//-------------------------------------------------------------------------------------------------
// Returning the result as unsigned char makes this compatible with the intrinsics for equivalent
// CPU instructions; forcing the intrinsics to bool instead would cause potential inefficiencies.

template<typename word_t>
unsigned char bit (word_t const *p, uintxx_t index)
{
   enum {  BITS_PER_WORD = sizeof(word_t) * CHAR_BIT  };

   return (unsigned char)((p[index / BITS_PER_WORD] >> (index % BITS_PER_WORD)) & 1u);
}

///////////////////////////////////////////////////////////////////////////////////////////////////
// As far as prime counts and checksums are concerned, bit 0 is often treated as a stand-in for the
// non-representable number 2 (which, being even, is the odd man out in such a bitmap). This makes
// some things simpler; for example, it makes the number of primes in a bitmap equal to the number
// of 0 bits.
//
// Bit 0 needs special consideration in any case since 1 it is neither prime nor composite.
//
// Prime counts, sums and products (primorials) are often available from sources like oeis.org.
// They are independent of position and order, which makes it easy to combine the results of
// separately processed blocks. The checksum is order-dependent - intendedly so - and it further
// enhances the error detection capability of the digest.

template<typename word_t>
struct digest_pod_t
{
   word_t count;
   word_t sum;
   word_t product;
   word_t checksum;

   bool operator != (digest_pod_t const &other) const
   {
      return !(*this == other);
   }

   bool operator == (digest_pod_t const &other) const
   {
      return checksum == other.checksum
          && product  == other.product
          && sum      == other.sum
          && count    == other.count;
   }

   void add_prime (word_t n)
   {
      count    += 1;
      sum      += n;
      product  *= n;
      checksum += n * count;
   }

   template<typename const_forward_iterator_t>
   void add_primes (const_forward_iterator_t begin, const_forward_iterator_t end)
   {
      for (const_forward_iterator_t p = begin; p != end; ++p)
      {
         add_prime(*p);
      }
   }

   void add_odd_composites_bitmap (
      void const *bitmap,
      uintxx_t    min_bit,
      uintxx_t    end_bit,           // first bit past the range (max_bit + 1)
      word_t      bit_offset = 0  )  // virtual offset of bit 0 of the bitmap (first number >> 1)
   {
      unsigned char const *odd_composites = static_cast<unsigned char const *>(bitmap);

      if (min_bit < end_bit)
      {
         if (min_bit == 0 && bit_offset == 0)
         {
            if (bit(odd_composites, 0) == 0)  add_prime(2);

            ++min_bit;
         }

         word_t base = (bit_offset << 1) + 1;

         for (uintxx_t k = min_bit; k < end_bit; ++k)
         {
            if (bit(odd_composites, k))  continue;

            add_prime(word_t(base + (k << 1)));
         }
      }
   }
};

//-------------------------------------------------------------------------------------------------
// mixed-size comparison

bool operator == (digest_pod_t<uint32_t> const &d32, digest_pod_t<uint64_t> const &d64)
{
   return d32.checksum == uint32_t(d64.checksum)
       && d32.product  == uint32_t(d64.product)
       && d32.sum      == uint32_t(d64.sum)
       && d32.count    == uint32_t(d64.count);
}

bool operator == (digest_pod_t<uint64_t> const &d64, digest_pod_t<uint32_t> const &d32)
{
   return d32 == d64;
}

bool operator != (digest_pod_t<uint32_t> const &d32, digest_pod_t<uint64_t> const &d64)
{
   return !(d32 == d64);
}

bool operator != (digest_pod_t<uint64_t> const &d64, digest_pod_t<uint32_t> const &d32)
{
   return !(d64 == d32);
}

//-------------------------------------------------------------------------------------------------
// convenience

template<typename value_t>
void print (
   char const *prefix,
   digest_pod_t<value_t> const &digest,
   unsigned count_digits = 9  )
{
   enum {  H = 2 * sizeof(value_t)  };

   std::printf("%s{ %*llu, %0*llX, %0*llX, %0*llX }",
      prefix ? prefix : "",
      count_digits, uint64_t(digest.count),
      H, uint64_t(digest.sum),
      H, uint64_t(digest.product),
      H, uint64_t(digest.checksum)  );
}

template<typename word_t>
struct digest_t: digest_pod_t<word_t>
{
   digest_t ()
   {
      this->count = this->sum = this->checksum = 0;
      this->product = 1;
   }

   digest_t (word_t c, word_t s, word_t p, word_t x)
   {
      this->count = c;  this->sum = s;  this->product = p;  this->checksum = x;
   }
};

typedef digest_t<uint32_t> digest32_t;
typedef digest_t<uint64_t> digest64_t;

///////////////////////////////////////////////////////////////////////////////////////////////////
// all ranges have been verified against the primesieve.org console executable, except for those
// which exceed its cap (18446744030759878665 == UINT64_MAX - 10 * UINT32_MAX).

struct test_range_t
{
   uint64_t lo, hi;
   digest_pod_t<uint64_t> digest;
};

#define _( lo, hi, first_prime, last_prime, cnt, sum, prd, chk )  \
   {  lo ## u, hi ## u, {  cnt ## u, 0x ## sum ## u, 0x ## prd ## u, 0x ## chk ## u  }  }

test_range_t const Ranges_1M[] =
{
   _(                    1,             10000000 /*   10000000 */,                    2,              9999991 /*   9999990 */,    664579, 000002E9D50C6334, E284CDD9939E68AA, 13F0C0CBB8773897 ),
   _(             81673073,            100000000 /*   18326928 */,             81673073,             99999989 /*  18326917 */,   1000000, 00005299CB5A5478, 7236ED226E8BA3D5, 8B636C3E423B6BAA ),
   _(            979292933,           1000000000 /*   20707068 */,            979292933,            999999937 /*  20707005 */,   1000000, 000384138D549EA0, 879FC41968FAD3E9, EAFCC52696740814 ),
   _(           9976982339,          10000000000 /*   23017662 */,           9976982339,           9999999967 /*  23017629 */,   1000000, 00237C7AA33A931A, FEEAC78D65A10667, D7C1D531E4400712 ),
   _(          99974683489,         100000000000 /*   25316512 */,          99974683489,          99999999977 /*  25316489 */,   1000000, 016339F4D1418D36, A1C1A63E3E302FEF, 478434FCE8AB133E ),
   _(         999972400027,        1000000000000 /*   27599974 */,         999972400027,         999999999989 /*  27599963 */,   1000000, 0DE0AA269BDFD8BA, CEE122E76B72227F, D50470D359BE4DFE ),
   _(        9999970054519,       10000000000000 /*   29945482 */,        9999970054519,        9999999999971 /*  29945453 */,   1000000, 8AC715685B09F258, 8593DE5292613F85, 8B32A42E25B99C64 ),
   _(       99999967808393,      100000000000000 /*   32191608 */,       99999967808393,       99999999999973 /*  32191581 */,   1000000, 6BC74F88FD0C5A6C, 1AB5F68D9123D491, D9CDC114AA6B0AAE ),
   _(      999999965485057,     1000000000000000 /*   34514944 */,      999999965485057,      999999999999989 /*  34514933 */,   1000000, 35C99E13AAF0731E, FC1D792962ADB95B, 1AF84035F20F6A04 ),
   _(     9999999963142769,    10000000000000000 /*   36857232 */,     9999999963142769,     9999999999999937 /*  36857169 */,   1000000, 19E0B8F8BCF0E2DE, 98216464A377198F, D6CB044A5230ADB8 ),
   _(    99999999960819169,   100000000000000000 /*   39180832 */,    99999999960819169,    99999999999999997 /*  39180829 */,   1000000, 02C7CF7BB7826270, 3B6966E20059EEA9, 5D85F14A7AF08E06 ),
   _(   999999999958516801,  1000000000000000000 /*   41483200 */,   999999999958516801,   999999999999999989 /*  41483189 */,   1000000, 1BCEBA0D5ECD2FEE, 5946DFC574AB3CF3, D118C1B8D724181C ),
   _(  9999999999956286497, 10000000000000000000 /*   43713504 */,  9999999999956286497,  9999999999999999961 /*  43713465 */,   1000000, 1613ED6695FB8F08, C1237D83F03C887D, 862E500F9255057E ),
   _( 18446744030715545269, 18446744030759878665 /*   44333397 */, 18446744030715545269, 18446744030759878627 /*  44333359 */,   1000000, FF675557CDDE0298, 7EA5838E55BEEEC5, 71F3BDD5137CBDEC ),
   _( 18446744073665204447, 18446744073709551615 /*   44347169 */, 18446744073665204447, 18446744073709551557 /*  44347111 */,   1000000, FFFFEBD3966C841C, 5E37B0500AB5B8BD, 995B0CEF6CA1E4A8 )
};

test_range_t const Ranges_10M[] =
{
   _(                    1,            100000000 /*  100000000 */,                    2,             99999989 /*  99999988 */,   5761455, 0000FDF0985FB44C, 6D921F88B2D434D2, C0B16F72901E85FF ),
   _(            793877173,           1000000000 /*  206122828 */,            793877173,            999999937 /* 206122765 */,  10000000, 001FDBE1C333ECF8, 651A0A9574621079, C45243BCC5A786A2 ),
   _(           9769857367,          10000000000 /*  230142634 */,           9769857367,           9999999967 /* 230142601 */,  10000000, 015F2EBED9399456, 9E5053E10173B0AB, 13AC8B7227297280 ),
   _(          99746761237,         100000000000 /*  253238764 */,          99746761237,          99999999977 /* 253238741 */,  10000000, 0DDC371E71307554, AABB2057BB24D2C9, C6B49F02CC79C6F4 ),
   _(         999723733787,        1000000000000 /*  276266214 */,         999723733787,         999999999989 /* 276266203 */,  10000000, 8AC23ACFC174A930, 66BAD7F180005B21, 23284B3010659CD0 ),
   _(        9999700629011,       10000000000000 /*  299370990 */,        9999700629011,        9999999999971 /* 299370961 */,  10000000, 6BC20C8E38DE8F36, ED985713434EEDFB, 7B1FD08D2CF8ECD4 ),
   _(       99999677617663,      100000000000000 /*  322382338 */,       99999677617663,       99999999999973 /* 322382311 */,  10000000, 35C3F3CFA654F69A, 72861CA76E6E313F, F8CCDA86C924E3C6 ),
   _(      999999654617569,     1000000000000000 /*  345382432 */,      999999654617569,      999999999999989 /* 345382421 */,  10000000, 19DAA726AC5A2024, 2E27E7F123B0F7B1, 38455B855C8EECAE ),
   _(     9999999631636541,    10000000000000000 /*  368363460 */,     9999999631636541,     9999999999999937 /* 368363397 */,  10000000, 02C1563C4B6432C4, C36AB4E6E6B37F65, D29BB9FED3D6DA14 ),
   _(    99999999608465399,   100000000000000000 /*  391534602 */,    99999999608465399,    99999999999999997 /* 391534599 */,  10000000, 1BC7D81E877E2454, 292A3536FD400481, 9395B877E4E6EEC2 ),
   _(   999999999585415333,  1000000000000000000 /*  414584668 */,   999999999585415333,   999999999999999989 /* 414584657 */,  10000000, 160CA3C90CF00EF0, 7E1A4A66DE1CA111, ADB5D2CEF74C6B04 ),
   _(  9999999999562573471, 10000000000000000000 /*  437426530 */,  9999999999562573471,  9999999999999999961 /* 437426491 */,  10000000, DCC0478D1CC9FA52, 2F64319F30800833, D59808CFCB4ADAEA ),
   _( 18446744030316425227, 18446744030759878665 /*  443453439 */, 18446744030316425227, 18446744030759878627 /* 443453401 */,  10000000, FA023EF2BBA7CDA2, 4C5D19CF0CE2099F, 250841775ECFADF0 ),
   _( 18446744073265777349, 18446744073709551615 /*  443774267 */, 18446744073265777349, 18446744073709551557 /* 443774209 */,  10000000, FFF81E195834C5F0, 4EA8082765E419D5, 0FEE0BBB15025848 )
};

#undef _

digest_pod_t<uint64_t> Digest_4G =
{
   203280221, 0x05E8362E0C903F86u, 0x9B20CCAA5B253F12u, 0xE364FF90C46F979Cu
};

///////////////////////////////////////////////////////////////////////////////////////////////////
// Computing the maximum factor - i.e. the integer square root - via uint32_t(sqrt(double(n))) is
// not safe, for various reasons. One of them is that the result for UINT64_MAX would be 0 instead
// of UINT32_MAX. See http://codereview.stackexchange.com/questions/69069
//
// Simpler versions work well enough with VC++ and gcc but the version shown here is the only one
// that is guaranteed to return the correct result regardless of rounding mode, fp strictness and
// compiler optimisation settings. Since the function is not time-critical I did not clean it up,
// to show the strange antisymmetry. The absolute minimum necessary here is checking whether the
// float result is greater than UINT32_MAX before casting it to integral type; in that case the
// worst that can happen is that the result is one or two above the perfect result, leading to at
// most one unnecessary iteration of the outer sieve loop. That can be avoided in almost all cases
// by decrementing the result once if its square is greater than n; in a tiny fraction of cases a
// decrement of two would be necessary, but that is extremely rare, less than one in a billion.

template<typename word_t>
word_t square (word_t n)
{
   return word_t(n * n);
}

uintxx_t max_factor32 (uint64_t n)
{
   uint64_t r = uint64_t(std::sqrt(double(n)));

   while (r < UINT32_MAX && square(r) < n)
      ++r;
   while (r > UINT32_MAX || square(r) > n)
      --r;

   return uintxx_t(r);
}

///////////////////////////////////////////////////////////////////////////////////////////////////

unsigned char odd_composites32[UINT32_MAX / (2 * CHAR_BIT) + 1];   // the small factor sieve
uintxx_t sieved_bits = 0;                                          // how far it's been initialised

void extend_factor_sieve_to_cover (uintxx_t max_factor_bit);       // bit, not number!

void sieve32 (unsigned char *target_segment, uint64_t bit_offset, uintxx_t bit_count)
{
   assert( bit_count > 0 && bit_count <= UINT32_MAX / 2 + 1 );

   uintxx_t max_bit = bit_count - 1;
   uint64_t max_num = 2 * (bit_offset + max_bit) + 1;
   uintxx_t max_factor_bit = (max_factor32(max_num) - 1) / 2;

   if (target_segment != odd_composites32)
   {
      extend_factor_sieve_to_cover(max_factor_bit);
   }

   std::memset(target_segment, 0, std::size_t((max_bit + CHAR_BIT) / CHAR_BIT));

   for (uintxx_t i = 3u >> 1; i <= max_factor_bit; ++i)
   {
      if (bit(odd_composites32, i))  continue;

      uintxx_t n = (i << 1) + 1;   // the actual prime represented by bit i (< 2^32)

      uintxx_t stride = n;         // == (n * 2) / 2
      uint64_t start = (uint64_t(n) * n) >> 1;
      uintxx_t k;

      if (start >= bit_offset)
      {
         k = uintxx_t(start - bit_offset);
      }
      else // start < offset
      {
         k = stride - (bit_offset - start - 1) % stride - 1;
      }

      while (k <= max_bit)
      {
         set_bit(target_segment, k);

         if ((k += stride) < stride)  // k can wrap since strides go up to almost 2^32
         {
            break;
         }
      }
   }
}

//-------------------------------------------------------------------------------------------------
// Using sieve32() for sieving the small factors as well here; that takes about three times as long
// as a specialised small sieve with all the bells and whistles but it is also a couple hundred
// lines of code shorter. See zrbj_sx_sieve32_v4.cpp for the full Monty. Sieving in small segments
// is a necessity since doing it in one go would slow things down by an order of magnitude.

void extend_factor_sieve_to_cover (uintxx_t max_factor_bit)
{
   uintxx_t const SIEVE_BITS = sizeof(odd_composites32) * CHAR_BIT;

   assert( max_factor_bit < SIEVE_BITS );

   if (max_factor_bit >= sieved_bits)
   {
      assert( sieved_bits % CHAR_BIT == 0 );

      uintxx_t bits_to_sieve = max_factor_bit - sieved_bits + 1;
      uintxx_t segment_size = 1u << 18;  // == 32K * CHAR_BIT == L1 cache size
      uintxx_t partial_bits = bits_to_sieve % segment_size;
      uintxx_t segments     = bits_to_sieve / segment_size + (partial_bits != 0);

      if ((SIEVE_BITS - sieved_bits) % segment_size == 0)
      {
         partial_bits = 0;   // always sieve full segments as long as there is enough space
      }

      for ( ; segments--; )
      {
         uintxx_t bits_this_round = segments == 0 && partial_bits ? partial_bits : segment_size;

         sieve32(odd_composites32 + sieved_bits / CHAR_BIT, sieved_bits, bits_this_round);

         sieved_bits += bits_this_round;
      }
   }
}

///////////////////////////////////////////////////////////////////////////////////////////////////
// utility functions for the tests

namespace util
{
   uint32_t random_seed32 = 42;  // must be nonzero

   uint32_t random_uint32 ()  // xorshift32*
   {
      uint32_t x = random_seed32;

      x ^= x << 13;  x ^= x >> 17;  x ^= x << 5;

      random_seed32 = x;

      return x * 0xEA4E58EDu;
   }

   uint32_t random_uint32 (uint32_t modulus)
   {
      if (modulus > 1)
      {
         for ( ; ; )
         {
            uint32_t raw_bits = random_uint32();
            uint32_t result = raw_bits % modulus;
            uint32_t check = uint32_t(raw_bits - result + modulus);

            if (check >= raw_bits || check == 0)
            {
               return result;
            }
         }
      }

      return 0;
   }

   double ms (std::clock_t clock_delta)
   {
      return 1000 * double(clock_delta) / CLOCKS_PER_SEC;
   }

   void terminate_with_failure (char const *message = 0)
   {
      std::printf("%s", message ? message : "\n\n*** FAIL ***\n");
      std::exit(3);
   }

} // namepace util

///////////////////////////////////////////////////////////////////////////////////////////////////
// various tests...

void time_factor_sieve ()
{
   std::printf("\ninitialising the small factor sieve... ");

   sieved_bits = 0;

   std::clock_t t0 = std::clock();
   extend_factor_sieve_to_cover(UINT32_MAX / 2 - 1);
   double seconds = double(std::clock() - t0) / CLOCKS_PER_SEC;

   std::printf("%.3f s (%.1f M/s)", seconds, sieved_bits * std::ldexp(2.0, -20) / seconds);

   std::printf("\nverifying... ");

   digest64_t digest;

   t0 = std::clock();
   digest.add_odd_composites_bitmap(odd_composites32, 0, sieved_bits - 1, 0);
   seconds = double(std::clock() - t0) / CLOCKS_PER_SEC;

   std::printf("%.3f s (%.1f M/s)", seconds, sieved_bits * std::ldexp(2.0, -20) / seconds);

   if (digest != Digest_4G)
   {
      std::printf("\n\ndigest != Digest_4G\n");
      print("\ntst ", digest);
      print("\nref ", Digest_4G);

      util::terminate_with_failure();
   }
}

//-------------------------------------------------------------------------------------------------

void test_factor_sieve ()
{
   std::printf("\ntesting incremental updates of the small factor sieve...");

   uint32_t ref_bytes = uint32_t(sieved_bits) / CHAR_BIT;
   std::vector<unsigned char> ref(odd_composites32, odd_composites32 + ref_bytes);
   uint32_t compared = 0;
   uint32_t limit = sizeof(odd_composites32) * CHAR_BIT;

   sieved_bits = 0;

   while (sieved_bits < limit)
   {
      uint32_t growth_limit = 1u << 23;

      if (util::random_uint32(100) < 33)  growth_limit <<= 4;
      if (util::random_uint32(100) < 33)  growth_limit >>= 4;

      growth_limit = std::min(growth_limit, limit - uint32_t(sieved_bits));

      uint32_t extend_by = util::random_uint32(growth_limit);

      std::printf("\n   %8X + %7X... ", uint32_t(sieved_bits), extend_by);

      std::clock_t t0 = std::clock();

      extend_factor_sieve_to_cover(sieved_bits + extend_by);

      std::printf("%6.1f ms", util::ms(std::clock() - t0));

      uint32_t full_bytes = std::min(ref_bytes, uint32_t(sieved_bits) / CHAR_BIT);

      if (uint32_t compare_now = full_bytes - compared)
      {
         if (std::memcmp(odd_composites32, &ref[0], compare_now))
         {
            util::terminate_with_failure();
         }

         compared += compare_now;
      }
   }

   std::printf("\n-> sieved_bits = %8X", uint32_t(sieved_bits));

   if (compared < sizeof(odd_composites32))
   {
      std::printf("\nNOTE: only %u of %u sieve bytes have been compared",
         compared, uint32_t(sizeof(odd_composites32))  );
   }
}

///////////////////////////////////////////////////////////////////////////////////////////////////

void sieve_and_print_range (
   uint64_t start,          // number, not bit index
   uint32_t count,
   digest_pod_t<uint64_t> const *ref = 0  )
{
   unsigned odd_start = unsigned(start & 1);

   if (count < 2 - odd_start)  return;

   std::printf("\n%20llu .. %20llu", start, start + count - 1);

   uint32_t max_bit = (count - (2 - odd_start)) >> 1;

   std::vector<unsigned char> buffer((max_bit + CHAR_BIT) / CHAR_BIT);
   unsigned char *bitmap = &buffer[0];

   std::clock_t t0 = std::clock();

   if (count > 1 - odd_start)
   {
      sieve32(bitmap, start >> 1, max_bit + 1);
   }

   double ms = util::ms(std::clock() - t0);
   std::printf(" %6.0f ms %6.1f M/s", ms, count / ms * 1000 * std::ldexp(2, -20));

   digest64_t digest;

   if (count > 1 - odd_start && start + count - 1 >= 2)
   {
      if (start == 2)  digest.add_prime(2);  // digest does not add 2 by itself in this case

      digest.add_odd_composites_bitmap(bitmap, 0, max_bit + 1, start >> 1);
   }

   std::printf(" %9u ", uint32_t(digest.count));

   if (ref != 0)  // bloody gcc complains if inner conditional is not wrapped in braces
   {
      if (digest == *ref)
         std::printf(" OK");
      else
         util::terminate_with_failure();
   }
}

//-------------------------------------------------------------------------------------------------

void sieve_and_print_range (test_range_t const &range)
{
   assert( range.lo <= range.hi );
   assert( range.hi - range.lo + 1 <= UINT32_MAX );

   sieve_and_print_range(range.lo, uint32_t(range.hi - range.lo + 1), &range.digest);
}

//-------------------------------------------------------------------------------------------------

void sieve_and_print_ranges (test_range_t const *ranges, std::size_t count)
{
   for (test_range_t const *p = ranges, *e = p + count; p < e; ++p)
   {
      sieve_and_print_range(*p);
   }
}

//-------------------------------------------------------------------------------------------------

template<std::size_t N>
void sieve_and_print_ranges (test_range_t const (&ranges)[N])
{
   sieve_and_print_ranges(ranges, N);
}

///////////////////////////////////////////////////////////////////////////////////////////////////

int main ()
{
   std::printf("\nsizeof(uintxx_t) == %u", unsigned(sizeof(uintxx_t)));
   std::printf("\nsizeof(void*)    == %u\n", unsigned(sizeof(void*)));

   time_factor_sieve();
   test_factor_sieve();

   std::printf("\nsieving some ranges (1M)...");
   sieve_and_print_ranges(Ranges_1M);

   std::printf("\nsieving some bigger ranges (10M)...");
   sieve_and_print_ranges(Ranges_10M);

   std::printf("\nsieving with maximum window size...");
   sieve_and_print_range(0, UINT32_MAX, &Digest_4G);
   sieve_and_print_range(1, UINT32_MAX, &Digest_4G);
   sieve_and_print_range(2, UINT32_MAX, &Digest_4G);

   return 0;
}