// zrbj_sx_sieve64_v1.cpp - Sieve of Eratosthenes reference implementation // 2014-11-09 // // This implements a segmented odds-only Sieve of Eratosthenes for sieving up to 2^64-1, // for a maximum window size of 32 bits (corresponding to bitmaps with 2^31 bits). All // index-related (32-bit) variables use the uintxx_t typedef so that potential bugs // related to index math can be experimentally excluded by changing that typedef to // uint64_t. The digests for the test ranges and the actual primes have been verified // against the console version of the primesieve.org program, except for those that are // beyond its limits. // // The actual sieve is implemented in sieve32(), the rest is just overhead for testing. #undef NDEBUG //#include "zrbj/hdr_pre.h" #include #include #include #include #include #include #include #include #include #include //#include "zrbj/hdr_post.h" // Dealing only with odd numbers and packed (odd-only) bitmaps makes a lot of things easier. // The same goes for specifying all ranges - like sieve windows - via bit indices and not via // the corresponding numbers. // // For one thing, the small factors bitmap needs only 31 bits, which buys a bit of head room // for bitmap index variables even if those are uint32_t. It also reduces drastically the // number of special cases that need to be handled, pretty much everywhere, and this makes // the code much more robust. Last but not least, this makes it possible to specify all ranges // as half-open or base + count, without dirty tricks like decreeing that 0 be treated as 2^32. // Closed ranges (i.e. 'max_bit' instead of 'end_bit') can be made to work but they require // more care and cause a lot of +1/-1 noise. // // The code here is designed for an index type with 32 bits. When bit indexes can go beyond that, // the corresponding number type (uint64_t) is used instead. // // Changing this typedef to uint64_t eliminates all potential bugs related to 32-bit index math // but it makes the code slower (slightly so if compiled as 64-bit code, signficantly for 32-bit // code). Compiling as 32-bit code with gcc, the difference is 8.9 s instead of 4.7 s for the // sieve init, 9.7 s instead of 5 s for sieving a million primes just below 2^64. typedef uint_fast32_t uintxx_t; /////////////////////////////////////////////////////////////////////////////////////////////////// // Bit manipulation functions are sometimes faster using the machine word size, sometimes using // unsigned char. Using templated functions makes it possible to mix and match. It also makes // it easy to experiment with intrinsics, by the simple expedient of defining specialisations // for certain types and implementing them via intrinsics. template void set_bit (word_t *p, uintxx_t index) { enum { BITS_PER_WORD = sizeof(word_t) * CHAR_BIT }; // we can trust the compiler to use masking and shifting instead of division; we cannot do that // ourselves without having the log2 which cannot easily be computed as a constexpr p[index / BITS_PER_WORD] |= word_t(1) << (index % BITS_PER_WORD); } //------------------------------------------------------------------------------------------------- // Returning the result as unsigned char makes this compatible with the intrinsics for equivalent // CPU instructions; forcing the intrinsics to bool instead would cause potential inefficiencies. template unsigned char bit (word_t const *p, uintxx_t index) { enum { BITS_PER_WORD = sizeof(word_t) * CHAR_BIT }; return (unsigned char)((p[index / BITS_PER_WORD] >> (index % BITS_PER_WORD)) & 1u); } /////////////////////////////////////////////////////////////////////////////////////////////////// // As far as prime counts and checksums are concerned, bit 0 is often treated as a stand-in for the // non-representable number 2 (which, being even, is the odd man out in such a bitmap). This makes // some things simpler; for example, it makes the number of primes in a bitmap equal to the number // of 0 bits. // // Bit 0 needs special consideration in any case since 1 it is neither prime nor composite. // // Prime counts, sums and products (primorials) are often available from sources like oeis.org. // They are independent of position and order, which makes it easy to combine the results of // separately processed blocks. The checksum is order-dependent - intendedly so - and it further // enhances the error detection capability of the digest. template struct digest_pod_t { word_t count; word_t sum; word_t product; word_t checksum; bool operator != (digest_pod_t const &other) const { return !(*this == other); } bool operator == (digest_pod_t const &other) const { return checksum == other.checksum && product == other.product && sum == other.sum && count == other.count; } void add_prime (word_t n) { count += 1; sum += n; product *= n; checksum += n * count; } template void add_primes (const_forward_iterator_t begin, const_forward_iterator_t end) { for (const_forward_iterator_t p = begin; p != end; ++p) { add_prime(*p); } } void add_odd_composites_bitmap ( void const *bitmap, uintxx_t min_bit, uintxx_t end_bit, // first bit past the range (max_bit + 1) word_t bit_offset = 0 ) // virtual offset of bit 0 of the bitmap (first number >> 1) { unsigned char const *odd_composites = static_cast(bitmap); if (min_bit < end_bit) { if (min_bit == 0 && bit_offset == 0) { if (bit(odd_composites, 0) == 0) add_prime(2); ++min_bit; } word_t base = (bit_offset << 1) + 1; for (uintxx_t k = min_bit; k < end_bit; ++k) { if (bit(odd_composites, k)) continue; add_prime(word_t(base + (k << 1))); } } } }; //------------------------------------------------------------------------------------------------- // mixed-size comparison bool operator == (digest_pod_t const &d32, digest_pod_t const &d64) { return d32.checksum == uint32_t(d64.checksum) && d32.product == uint32_t(d64.product) && d32.sum == uint32_t(d64.sum) && d32.count == uint32_t(d64.count); } bool operator == (digest_pod_t const &d64, digest_pod_t const &d32) { return d32 == d64; } bool operator != (digest_pod_t const &d32, digest_pod_t const &d64) { return !(d32 == d64); } bool operator != (digest_pod_t const &d64, digest_pod_t const &d32) { return !(d64 == d32); } //------------------------------------------------------------------------------------------------- // convenience template void print ( char const *prefix, digest_pod_t const &digest, unsigned count_digits = 9 ) { enum { H = 2 * sizeof(value_t) }; std::printf("%s{ %*llu, %0*llX, %0*llX, %0*llX }", prefix ? prefix : "", count_digits, uint64_t(digest.count), H, uint64_t(digest.sum), H, uint64_t(digest.product), H, uint64_t(digest.checksum) ); } template struct digest_t: digest_pod_t { digest_t () { this->count = this->sum = this->checksum = 0; this->product = 1; } digest_t (word_t c, word_t s, word_t p, word_t x) { this->count = c; this->sum = s; this->product = p; this->checksum = x; } }; typedef digest_t digest32_t; typedef digest_t digest64_t; /////////////////////////////////////////////////////////////////////////////////////////////////// // all ranges have been verified against the primesieve.org console executable, except for those // which exceed its cap (18446744030759878665 == UINT64_MAX - 10 * UINT32_MAX). struct test_range_t { uint64_t lo, hi; digest_pod_t digest; }; #define _( lo, hi, first_prime, last_prime, cnt, sum, prd, chk ) \ { lo ## u, hi ## u, { cnt ## u, 0x ## sum ## u, 0x ## prd ## u, 0x ## chk ## u } } test_range_t const Ranges_1M[] = { _( 1, 10000000 /* 10000000 */, 2, 9999991 /* 9999990 */, 664579, 000002E9D50C6334, E284CDD9939E68AA, 13F0C0CBB8773897 ), _( 81673073, 100000000 /* 18326928 */, 81673073, 99999989 /* 18326917 */, 1000000, 00005299CB5A5478, 7236ED226E8BA3D5, 8B636C3E423B6BAA ), _( 979292933, 1000000000 /* 20707068 */, 979292933, 999999937 /* 20707005 */, 1000000, 000384138D549EA0, 879FC41968FAD3E9, EAFCC52696740814 ), _( 9976982339, 10000000000 /* 23017662 */, 9976982339, 9999999967 /* 23017629 */, 1000000, 00237C7AA33A931A, FEEAC78D65A10667, D7C1D531E4400712 ), _( 99974683489, 100000000000 /* 25316512 */, 99974683489, 99999999977 /* 25316489 */, 1000000, 016339F4D1418D36, A1C1A63E3E302FEF, 478434FCE8AB133E ), _( 999972400027, 1000000000000 /* 27599974 */, 999972400027, 999999999989 /* 27599963 */, 1000000, 0DE0AA269BDFD8BA, CEE122E76B72227F, D50470D359BE4DFE ), _( 9999970054519, 10000000000000 /* 29945482 */, 9999970054519, 9999999999971 /* 29945453 */, 1000000, 8AC715685B09F258, 8593DE5292613F85, 8B32A42E25B99C64 ), _( 99999967808393, 100000000000000 /* 32191608 */, 99999967808393, 99999999999973 /* 32191581 */, 1000000, 6BC74F88FD0C5A6C, 1AB5F68D9123D491, D9CDC114AA6B0AAE ), _( 999999965485057, 1000000000000000 /* 34514944 */, 999999965485057, 999999999999989 /* 34514933 */, 1000000, 35C99E13AAF0731E, FC1D792962ADB95B, 1AF84035F20F6A04 ), _( 9999999963142769, 10000000000000000 /* 36857232 */, 9999999963142769, 9999999999999937 /* 36857169 */, 1000000, 19E0B8F8BCF0E2DE, 98216464A377198F, D6CB044A5230ADB8 ), _( 99999999960819169, 100000000000000000 /* 39180832 */, 99999999960819169, 99999999999999997 /* 39180829 */, 1000000, 02C7CF7BB7826270, 3B6966E20059EEA9, 5D85F14A7AF08E06 ), _( 999999999958516801, 1000000000000000000 /* 41483200 */, 999999999958516801, 999999999999999989 /* 41483189 */, 1000000, 1BCEBA0D5ECD2FEE, 5946DFC574AB3CF3, D118C1B8D724181C ), _( 9999999999956286497, 10000000000000000000 /* 43713504 */, 9999999999956286497, 9999999999999999961 /* 43713465 */, 1000000, 1613ED6695FB8F08, C1237D83F03C887D, 862E500F9255057E ), _( 18446744030715545269, 18446744030759878665 /* 44333397 */, 18446744030715545269, 18446744030759878627 /* 44333359 */, 1000000, FF675557CDDE0298, 7EA5838E55BEEEC5, 71F3BDD5137CBDEC ), _( 18446744073665204447, 18446744073709551615 /* 44347169 */, 18446744073665204447, 18446744073709551557 /* 44347111 */, 1000000, FFFFEBD3966C841C, 5E37B0500AB5B8BD, 995B0CEF6CA1E4A8 ) }; test_range_t const Ranges_10M[] = { _( 1, 100000000 /* 100000000 */, 2, 99999989 /* 99999988 */, 5761455, 0000FDF0985FB44C, 6D921F88B2D434D2, C0B16F72901E85FF ), _( 793877173, 1000000000 /* 206122828 */, 793877173, 999999937 /* 206122765 */, 10000000, 001FDBE1C333ECF8, 651A0A9574621079, C45243BCC5A786A2 ), _( 9769857367, 10000000000 /* 230142634 */, 9769857367, 9999999967 /* 230142601 */, 10000000, 015F2EBED9399456, 9E5053E10173B0AB, 13AC8B7227297280 ), _( 99746761237, 100000000000 /* 253238764 */, 99746761237, 99999999977 /* 253238741 */, 10000000, 0DDC371E71307554, AABB2057BB24D2C9, C6B49F02CC79C6F4 ), _( 999723733787, 1000000000000 /* 276266214 */, 999723733787, 999999999989 /* 276266203 */, 10000000, 8AC23ACFC174A930, 66BAD7F180005B21, 23284B3010659CD0 ), _( 9999700629011, 10000000000000 /* 299370990 */, 9999700629011, 9999999999971 /* 299370961 */, 10000000, 6BC20C8E38DE8F36, ED985713434EEDFB, 7B1FD08D2CF8ECD4 ), _( 99999677617663, 100000000000000 /* 322382338 */, 99999677617663, 99999999999973 /* 322382311 */, 10000000, 35C3F3CFA654F69A, 72861CA76E6E313F, F8CCDA86C924E3C6 ), _( 999999654617569, 1000000000000000 /* 345382432 */, 999999654617569, 999999999999989 /* 345382421 */, 10000000, 19DAA726AC5A2024, 2E27E7F123B0F7B1, 38455B855C8EECAE ), _( 9999999631636541, 10000000000000000 /* 368363460 */, 9999999631636541, 9999999999999937 /* 368363397 */, 10000000, 02C1563C4B6432C4, C36AB4E6E6B37F65, D29BB9FED3D6DA14 ), _( 99999999608465399, 100000000000000000 /* 391534602 */, 99999999608465399, 99999999999999997 /* 391534599 */, 10000000, 1BC7D81E877E2454, 292A3536FD400481, 9395B877E4E6EEC2 ), _( 999999999585415333, 1000000000000000000 /* 414584668 */, 999999999585415333, 999999999999999989 /* 414584657 */, 10000000, 160CA3C90CF00EF0, 7E1A4A66DE1CA111, ADB5D2CEF74C6B04 ), _( 9999999999562573471, 10000000000000000000 /* 437426530 */, 9999999999562573471, 9999999999999999961 /* 437426491 */, 10000000, DCC0478D1CC9FA52, 2F64319F30800833, D59808CFCB4ADAEA ), _( 18446744030316425227, 18446744030759878665 /* 443453439 */, 18446744030316425227, 18446744030759878627 /* 443453401 */, 10000000, FA023EF2BBA7CDA2, 4C5D19CF0CE2099F, 250841775ECFADF0 ), _( 18446744073265777349, 18446744073709551615 /* 443774267 */, 18446744073265777349, 18446744073709551557 /* 443774209 */, 10000000, FFF81E195834C5F0, 4EA8082765E419D5, 0FEE0BBB15025848 ) }; #undef _ digest_pod_t Digest_4G = { 203280221, 0x05E8362E0C903F86u, 0x9B20CCAA5B253F12u, 0xE364FF90C46F979Cu }; /////////////////////////////////////////////////////////////////////////////////////////////////// // Computing the maximum factor - i.e. the integer square root - via uint32_t(sqrt(double(n))) is // not safe, for various reasons. One of them is that the result for UINT64_MAX would be 0 instead // of UINT32_MAX. See http://codereview.stackexchange.com/questions/69069 // // Simpler versions work well enough with VC++ and gcc but the version shown here is the only one // that is guaranteed to return the correct result regardless of rounding mode, fp strictness and // compiler optimisation settings. Since the function is not time-critical I did not clean it up, // to show the strange antisymmetry. The absolute minimum necessary here is checking whether the // float result is greater than UINT32_MAX before casting it to integral type; in that case the // worst that can happen is that the result is one or two above the perfect result, leading to at // most one unnecessary iteration of the outer sieve loop. That can be avoided in almost all cases // by decrementing the result once if its square is greater than n; in a tiny fraction of cases a // decrement of two would be necessary, but that is extremely rare, less than one in a billion. template word_t square (word_t n) { return word_t(n * n); } uintxx_t max_factor32 (uint64_t n) { uint64_t r = uint64_t(std::sqrt(double(n))); while (r < UINT32_MAX && square(r) < n) ++r; while (r > UINT32_MAX || square(r) > n) --r; return uintxx_t(r); } /////////////////////////////////////////////////////////////////////////////////////////////////// unsigned char odd_composites32[UINT32_MAX / (2 * CHAR_BIT) + 1]; // the small factor sieve uintxx_t sieved_bits = 0; // how far it's been initialised void extend_factor_sieve_to_cover (uintxx_t max_factor_bit); // bit, not number! void sieve32 (unsigned char *target_segment, uint64_t bit_offset, uintxx_t bit_count) { assert( bit_count > 0 && bit_count <= UINT32_MAX / 2 + 1 ); uintxx_t max_bit = bit_count - 1; uint64_t max_num = 2 * (bit_offset + max_bit) + 1; uintxx_t max_factor_bit = (max_factor32(max_num) - 1) / 2; if (target_segment != odd_composites32) { extend_factor_sieve_to_cover(max_factor_bit); } std::memset(target_segment, 0, std::size_t((max_bit + CHAR_BIT) / CHAR_BIT)); for (uintxx_t i = 3u >> 1; i <= max_factor_bit; ++i) { if (bit(odd_composites32, i)) continue; uintxx_t n = (i << 1) + 1; // the actual prime represented by bit i (< 2^32) uintxx_t stride = n; // == (n * 2) / 2 uint64_t start = (uint64_t(n) * n) >> 1; uintxx_t k; if (start >= bit_offset) { k = uintxx_t(start - bit_offset); } else // start < offset { k = stride - (bit_offset - start - 1) % stride - 1; } while (k <= max_bit) { set_bit(target_segment, k); if ((k += stride) < stride) // k can wrap since strides go up to almost 2^32 { break; } } } } //------------------------------------------------------------------------------------------------- // Using sieve32() for sieving the small factors as well here; that takes about three times as long // as a specialised small sieve with all the bells and whistles but it is also a couple hundred // lines of code shorter. See zrbj_sx_sieve32_v4.cpp for the full Monty. Sieving in small segments // is a necessity since doing it in one go would slow things down by an order of magnitude. void extend_factor_sieve_to_cover (uintxx_t max_factor_bit) { uintxx_t const SIEVE_BITS = sizeof(odd_composites32) * CHAR_BIT; assert( max_factor_bit < SIEVE_BITS ); if (max_factor_bit >= sieved_bits) { assert( sieved_bits % CHAR_BIT == 0 ); uintxx_t bits_to_sieve = max_factor_bit - sieved_bits + 1; uintxx_t segment_size = 1u << 18; // == 32K * CHAR_BIT == L1 cache size uintxx_t partial_bits = bits_to_sieve % segment_size; uintxx_t segments = bits_to_sieve / segment_size + (partial_bits != 0); if ((SIEVE_BITS - sieved_bits) % segment_size == 0) { partial_bits = 0; // always sieve full segments as long as there is enough space } for ( ; segments--; ) { uintxx_t bits_this_round = segments == 0 && partial_bits ? partial_bits : segment_size; sieve32(odd_composites32 + sieved_bits / CHAR_BIT, sieved_bits, bits_this_round); sieved_bits += bits_this_round; } } } /////////////////////////////////////////////////////////////////////////////////////////////////// // utility functions for the tests namespace util { uint32_t random_seed32 = 42; // must be nonzero uint32_t random_uint32 () // xorshift32* { uint32_t x = random_seed32; x ^= x << 13; x ^= x >> 17; x ^= x << 5; random_seed32 = x; return x * 0xEA4E58EDu; } uint32_t random_uint32 (uint32_t modulus) { if (modulus > 1) { for ( ; ; ) { uint32_t raw_bits = random_uint32(); uint32_t result = raw_bits % modulus; uint32_t check = uint32_t(raw_bits - result + modulus); if (check >= raw_bits || check == 0) { return result; } } } return 0; } double ms (std::clock_t clock_delta) { return 1000 * double(clock_delta) / CLOCKS_PER_SEC; } void terminate_with_failure (char const *message = 0) { std::printf("%s", message ? message : "\n\n*** FAIL ***\n"); std::exit(3); } } // namepace util /////////////////////////////////////////////////////////////////////////////////////////////////// // various tests... void time_factor_sieve () { std::printf("\ninitialising the small factor sieve... "); sieved_bits = 0; std::clock_t t0 = std::clock(); extend_factor_sieve_to_cover(UINT32_MAX / 2 - 1); double seconds = double(std::clock() - t0) / CLOCKS_PER_SEC; std::printf("%.3f s (%.1f M/s)", seconds, sieved_bits * std::ldexp(2.0, -20) / seconds); std::printf("\nverifying... "); digest64_t digest; t0 = std::clock(); digest.add_odd_composites_bitmap(odd_composites32, 0, sieved_bits - 1, 0); seconds = double(std::clock() - t0) / CLOCKS_PER_SEC; std::printf("%.3f s (%.1f M/s)", seconds, sieved_bits * std::ldexp(2.0, -20) / seconds); if (digest != Digest_4G) { std::printf("\n\ndigest != Digest_4G\n"); print("\ntst ", digest); print("\nref ", Digest_4G); util::terminate_with_failure(); } } //------------------------------------------------------------------------------------------------- void test_factor_sieve () { std::printf("\ntesting incremental updates of the small factor sieve..."); uint32_t ref_bytes = uint32_t(sieved_bits) / CHAR_BIT; std::vector ref(odd_composites32, odd_composites32 + ref_bytes); uint32_t compared = 0; uint32_t limit = sizeof(odd_composites32) * CHAR_BIT; sieved_bits = 0; while (sieved_bits < limit) { uint32_t growth_limit = 1u << 23; if (util::random_uint32(100) < 33) growth_limit <<= 4; if (util::random_uint32(100) < 33) growth_limit >>= 4; growth_limit = std::min(growth_limit, limit - uint32_t(sieved_bits)); uint32_t extend_by = util::random_uint32(growth_limit); std::printf("\n %8X + %7X... ", uint32_t(sieved_bits), extend_by); std::clock_t t0 = std::clock(); extend_factor_sieve_to_cover(sieved_bits + extend_by); std::printf("%6.1f ms", util::ms(std::clock() - t0)); uint32_t full_bytes = std::min(ref_bytes, uint32_t(sieved_bits) / CHAR_BIT); if (uint32_t compare_now = full_bytes - compared) { if (std::memcmp(odd_composites32, &ref[0], compare_now)) { util::terminate_with_failure(); } compared += compare_now; } } std::printf("\n-> sieved_bits = %8X", uint32_t(sieved_bits)); if (compared < sizeof(odd_composites32)) { std::printf("\nNOTE: only %u of %u sieve bytes have been compared", compared, uint32_t(sizeof(odd_composites32)) ); } } /////////////////////////////////////////////////////////////////////////////////////////////////// void sieve_and_print_range ( uint64_t start, // number, not bit index uint32_t count, digest_pod_t const *ref = 0 ) { unsigned odd_start = unsigned(start & 1); if (count < 2 - odd_start) return; std::printf("\n%20llu .. %20llu", start, start + count - 1); uint32_t max_bit = (count - (2 - odd_start)) >> 1; std::vector buffer((max_bit + CHAR_BIT) / CHAR_BIT); unsigned char *bitmap = &buffer[0]; std::clock_t t0 = std::clock(); if (count > 1 - odd_start) { sieve32(bitmap, start >> 1, max_bit + 1); } double ms = util::ms(std::clock() - t0); std::printf(" %6.0f ms %6.1f M/s", ms, count / ms * 1000 * std::ldexp(2, -20)); digest64_t digest; if (count > 1 - odd_start && start + count - 1 >= 2) { if (start == 2) digest.add_prime(2); // digest does not add 2 by itself in this case digest.add_odd_composites_bitmap(bitmap, 0, max_bit + 1, start >> 1); } std::printf(" %9u ", uint32_t(digest.count)); if (ref != 0) // bloody gcc complains if inner conditional is not wrapped in braces { if (digest == *ref) std::printf(" OK"); else util::terminate_with_failure(); } } //------------------------------------------------------------------------------------------------- void sieve_and_print_range (test_range_t const &range) { assert( range.lo <= range.hi ); assert( range.hi - range.lo + 1 <= UINT32_MAX ); sieve_and_print_range(range.lo, uint32_t(range.hi - range.lo + 1), &range.digest); } //------------------------------------------------------------------------------------------------- void sieve_and_print_ranges (test_range_t const *ranges, std::size_t count) { for (test_range_t const *p = ranges, *e = p + count; p < e; ++p) { sieve_and_print_range(*p); } } //------------------------------------------------------------------------------------------------- template void sieve_and_print_ranges (test_range_t const (&ranges)[N]) { sieve_and_print_ranges(ranges, N); } /////////////////////////////////////////////////////////////////////////////////////////////////// int main () { std::printf("\nsizeof(uintxx_t) == %u", unsigned(sizeof(uintxx_t))); std::printf("\nsizeof(void*) == %u\n", unsigned(sizeof(void*))); time_factor_sieve(); test_factor_sieve(); std::printf("\nsieving some ranges (1M)..."); sieve_and_print_ranges(Ranges_1M); std::printf("\nsieving some bigger ranges (10M)..."); sieve_and_print_ranges(Ranges_10M); std::printf("\nsieving with maximum window size..."); sieve_and_print_range(0, UINT32_MAX, &Digest_4G); sieve_and_print_range(1, UINT32_MAX, &Digest_4G); sieve_and_print_range(2, UINT32_MAX, &Digest_4G); return 0; }