Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // zrbj_sx_sieve32_v4.cpp - test program for odd segmented Sieve of Eratosthenes (sieve init)
- // 2014-11-04 DarthGizka <[email protected]>
- //
- // Ingredients for speed:
- // o odds-only bitmap (i.e. an order 2 wheel which happens to have only one non-zero residue)
- // o sieve in small segments that fit into the L1 cache (16K for Atom, 32K for most else)
- // o zero a segment before sieving to adds an extra boost (even if OS hands over zeroed pages)
- // o avoid BT/BTS intrinsics since these have lock semantics and slow things down drastically
- // o byte write access with stride < word size might cause aliasing problems (i.e. multiple
- // modifying accesses to bytes in the same word)
- // -> ameliorated by bulk replication of the small prime bit pattern via memcpy()
- // -> pattern becomes regular starting with the first word not containing any of the primes
- // (since only multiples get crossed out, not the primes)
- //
- // v4: blasting the presieve pattern over the whole sieve instead of over each segment as it
- // is processed; overall timing improves despite the lost cache-warming from the per-segment
- // initialisation copy
- // (VC++ timings improve to 2.3 s but gcc gains only marginally, 1.9 s vs. 2.1 s in v2)
- //
- // Note: the purpose of this program is to experiment with the algorithmic ideas, to formulate
- // the algorithms as clear and clean as possible, with the least possible amount of the noise
- // that makes so much excellent production code totally unreadable and incomprehensible.
- #undef NDEBUG
- //#include "zrbj/hdr_pre.h"
- #include <cassert>
- #include <cmath>
- #include <ctime>
- #include <climits>
- #include <cstdio>
- #include <cstring>
- #include <vector>
- //#include "zrbj/hdr_post.h"
- enum { unsigned_has_at_least_32_bits = sizeof(char[1 - (sizeof(unsigned) * CHAR_BIT < 32)]) };
- template<typename word_t>
- void set_bit (word_t *p, unsigned index)
- {
- enum { BITS_PER_WORD = sizeof(word_t) * CHAR_BIT };
- // we can trust the compiler to use masking and shifting instead of division; we cannot do that
- // ourselves without having the log2 which cannot easily be computed as a constexpr
- p[index / BITS_PER_WORD] |= word_t(1) << (index % BITS_PER_WORD);
- }
- template<typename word_t>
- void clear_bit (word_t *p, unsigned index)
- {
- enum { BITS_PER_WORD = sizeof(word_t) * CHAR_BIT };
- p[index / BITS_PER_WORD] &= ~(word_t(1) << (index % BITS_PER_WORD));
- }
- template<typename word_t>
- unsigned char bit (word_t const *p, unsigned index)
- {
- enum { BITS_PER_WORD = sizeof(word_t) * CHAR_BIT };
- return (unsigned char)((p[index / BITS_PER_WORD] >> (index % BITS_PER_WORD)) & 1u);
- }
- //-------------------------------------------------------------------------------------------------
- struct prime_and_offset_t
- {
- unsigned prime; // needs at most the 16 bits guaranteed by the standard
- unsigned next_offset; // this needs 32 bits
- prime_and_offset_t (unsigned p, unsigned o): prime(p), next_offset(o) { }
- };
- //-------------------------------------------------------------------------------------------------
- void initialise_odd_primes_and_offsets_64K (std::vector<prime_and_offset_t> &result)
- {
- typedef unsigned char byte;
- result.clear();
- result.reserve(6541); // # of odd primes below 2^16
- // bitmap has 32K, so putting it on the stack is not safe (besides, bitset<> is a C++11 feature)
- std::vector<byte> odd_composites_bitmap((1 << 15) / CHAR_BIT);
- byte *odd_composites = &odd_composites_bitmap[0]; // data() is a C++11 feature
- // looping over the odd numbers 3 to 2^16-1 means iterating over bit indices 1 to 2^15-1
- unsigned const max_bit = 65521 >> 1;
- unsigned const max_factor_bit = 255 >> 1;
- for (unsigned k = (3 >> 1); k <= max_factor_bit; ++k)
- {
- if (bit(odd_composites, k)) continue;
- unsigned n = (k << 1) + 1;
- unsigned i = (n * n) >> 1;
- for ( ; i <= max_bit; i += n)
- {
- set_bit(odd_composites, i);
- }
- }
- for (unsigned k = 1; k <= max_bit; ++k)
- {
- if (!bit(odd_composites, k))
- {
- unsigned n = (k << 1) + 1;
- result.push_back(prime_and_offset_t(n, (n * n) >> 1));
- }
- }
- }
- ///////////////////////////////////////////////////////////////////////////////////////////////////
- // 23 is the last prime for which the running product (111546435) is less than 2^31 / CHAR_BIT;
- // its factor is 2.41 (19: 55.35)
- //
- // # primes (max. prime) vs. presieve time vs. total time (VC++):
- //
- // 2 ( 5) 100 ms 3.207 s
- // 3 ( 7) 42 ms 2.919 s
- // 4 (11) 30 ms 2.758 s
- // 5 (13) 13 ms 2.581 s
- // 6 (17) 18 ms 2.474 s
- // 7 (19) 22 ms 2.380 s
- // 8 (23) 68 ms 2.333 s <-- best overall
- unsigned char const small_odd_primes[] = { 3, 5, 7, 11, 13, 17, 19, 23 };
- unsigned const MAX_PRESIEVE_PRIMES = sizeof(small_odd_primes) / sizeof(small_odd_primes[0]);
- void fill_with_presieve_pattern (void *data, unsigned bytes, unsigned primes)
- {
- assert( primes >= 2 && primes <= MAX_PRESIEVE_PRIMES );
- assert( bytes >= 111546435u );
- static unsigned char const pattern_35[] =
- {
- 0x96, 0x34, 0x4B, 0x9A, 0x25, 0xCD, 0x92, 0x66, 0x49, 0xB3, 0xA4, 0x59, 0xD2, 0x2C, 0x69
- };
- unsigned char *p = static_cast<unsigned char *>(data);
- unsigned size = sizeof(pattern_35);
- std::memcpy(p, pattern_35, size);
- for (unsigned i = 2; i < primes; ++i)
- {
- unsigned n = small_odd_primes[i];
- unsigned new_size = size * n;
- unsigned char *h = p + size, *e = p + new_size;
- for ( ; h < e; h += size) // replicate the current pattern n - 1 times
- {
- std::memcpy(h, p, size);
- }
- size = new_size;
- for (unsigned k = n >> 1, bits = size * CHAR_BIT; k < bits; k += n)
- {
- set_bit(p, k);
- }
- }
- unsigned odd_bytes = bytes % size;
- unsigned char *h = p + size; // unprocessed part starts here
- unsigned char *e = p + bytes - odd_bytes; // end of last multiple
- // self-replicating copy might be possible here but it is not guaranteed to work
- for ( ; h < e; h += size) std::memcpy(h, p, size);
- if (odd_bytes) std::memcpy(e, p, odd_bytes);
- // set the primes themselves back to zero
- for (unsigned i = 0; i < primes; ++i)
- {
- clear_bit(p, small_odd_primes[i] / 2u);
- }
- }
- //>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
- // the loop logic must be adjusted if segment_bytes is not a power of two
- unsigned g_presieve_primes = MAX_PRESIEVE_PRIMES;
- void initialise_packed_sieve_4G (void *data, unsigned segment_bytes = 1 << 15, unsigned end_bit = 1u << 31)
- {
- typedef std::vector<prime_and_offset_t>::iterator prime_iter_t;
- std::vector<prime_and_offset_t> small_factors;
- initialise_odd_primes_and_offsets_64K(small_factors);
- fill_with_presieve_pattern(data, (end_bit + CHAR_BIT - 1) / CHAR_BIT, g_presieve_primes);
- unsigned segment_bits = segment_bytes * CHAR_BIT;
- unsigned partial_bits = end_bit % segment_bits;
- unsigned segments = end_bit / segment_bits + (partial_bits != 0);
- for (unsigned offset = 0; segments--; offset += segment_bytes)
- {
- if (segments == 0 && partial_bits) // last segment may not be full size
- {
- segment_bits = partial_bits;
- }
- unsigned char *segment = static_cast<unsigned char *>(data) + offset;
- for (prime_iter_t p = small_factors.begin() + g_presieve_primes; p != small_factors.end(); ++p)
- {
- unsigned n = p->prime;
- unsigned i = p->next_offset;
- for ( ; i < segment_bits; i += n)
- {
- set_bit(segment, i);
- }
- p->next_offset = i - segment_bits;
- }
- }
- }
- ///////////////////////////////////////////////////////////////////////////////////////////////////
- // as far as prime counts and checksums are concerned, bit 0 is treated as a stand-in for the
- // non-representable number 2 (which, being even, is the odd man out in such a bitmap)
- //
- // { 203280220, 0C903F84, 2D929F89, 0DC7071A } 3942.9 ms
- // { 203280221, 0C903F86, 5B253F12, 774A3204 } 4007.3 ms
- struct digest_t
- {
- unsigned count;
- unsigned sum;
- unsigned product;
- unsigned checksum;
- digest_t ()
- : count(0), sum(0), product(1), checksum(0) { }
- digest_t (unsigned c, unsigned s, unsigned p, unsigned x)
- : count(c), sum(s), product(p), checksum(x) { }
- bool operator != (digest_t const &other) const { return !(*this == other); }
- bool operator == (digest_t const &other) const
- {
- return checksum == other.checksum
- && product == other.product
- && sum == other.sum
- && count == other.count;
- }
- void add_prime (unsigned n)
- {
- count += 1;
- sum += n;
- product *= n;
- checksum += n * sum + product;
- }
- void add_odd_composites_bitmap (void const *bitmap, unsigned min_bit = 0, unsigned end_bit = 1u << 31)
- {
- unsigned char const *odd_composites = static_cast<unsigned char const *>(bitmap);
- if (min_bit < end_bit)
- {
- if (min_bit == 0) // conventional stand-in for the non-representable prime 2
- {
- if (bit(odd_composites, 0) == 0) add_prime(2);
- ++min_bit;
- }
- for (unsigned k = min_bit; k < end_bit; ++k)
- {
- if (bit(odd_composites, k)) continue;
- add_prime((k << 1) + 1);
- }
- }
- }
- };
- digest_t const Digest4G( 203280221, 0x0C903F86, 0x5B253F12, 0x774A3204 );
- ///////////////////////////////////////////////////////////////////////////////////////////////////
- int main ()
- {
- unsigned sieve_bits = 1u << 31;
- std::printf("\nsieve bits = %u (equiv. number = %u)", sieve_bits, (sieve_bits - 1) * 2 + 1);
- std::vector<unsigned char> bmv((sieve_bits + CHAR_BIT - 1) / CHAR_BIT);
- unsigned char *bitmap = &bmv[0];
- for (g_presieve_primes = 2; g_presieve_primes <= MAX_PRESIEVE_PRIMES; ++g_presieve_primes)
- {
- std::printf("\n\n*** %u presieve primes (%u to %u) ***\n",
- g_presieve_primes,
- small_odd_primes[0], small_odd_primes[g_presieve_primes - 1] );
- for (unsigned size_bits = 12; size_bits < 20; ++size_bits)
- {
- unsigned segment_size = 1u << size_bits;
- std::printf("\nsegment size %8u (2^%2u) bytes ...", segment_size, size_bits);
- std::clock_t t0 = std::clock();
- initialise_packed_sieve_4G(bitmap, segment_size, sieve_bits);
- double seconds = double(std::clock() - t0) / CLOCKS_PER_SEC;
- std::printf("%7.3f s %8.1f M/s", seconds, sieve_bits * std::ldexp(2.0, -20) / seconds);
- }
- digest_t d;
- d.add_odd_composites_bitmap(bitmap, 0, sieve_bits);
- std::printf("\n\ndigest { %9u, %08X, %08X, %08X }", d.count, d.sum, d.product, d.checksum);
- if (d != Digest4G)
- {
- std::printf("\n*** FAIL ***\nd != Digest4G");
- return 3; // indicate failure
- }
- } // iteration over presieve prime counts
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement