Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // zrbj_sx_sieve32.cpp - test program for odd segmented Sieve of Eratosthenes (sieve init)
- // 2014-11-01 DarthGizka <KnightOwl@live.it>
- //
- // Ingredients for speed:
- // o odds-only bitmap (i.e. an order 2 wheel which happens to have only one non-zero residue)
- // o sieve in small segments that fit into the L1 cache (16K for Atom, 32K for most else)
- // o zero a segment before sieving to adds an extra boost (even if OS hands over zeroed pages)
- // o avoid BT/BTS intrinsics since these have lock semantics and slow things down drastically
- // o byte write access with stride < word size might cause aliasing problems (i.e. multiple
- // modifying accesses to bytes in the same word)
- // -> ameliorated by bulk replication of the small prime bit pattern via memcpy()
- // -> pattern becomes regular starting with the first word not containing any of the primes
- // (since only multiples get crossed out, not the primes)
- //#include "zrbj/hdr_pre.h"
- #include <cassert>
- #include <cmath>
- #include <ctime>
- #include <climits>
- #include <cstdio>
- #include <cstring>
- #include <vector>
- //#include "zrbj/hdr_post.h"
- enum { unsigned_has_at_least_32_bits = sizeof(char[1 - (sizeof(unsigned) * CHAR_BIT < 32)]) };
- template<typename word_t>
- void set_bit (word_t *p, unsigned index)
- {
- enum { BITS_PER_WORD = sizeof(word_t) * CHAR_BIT };
- // we can trust the compiler to use masking and shifting instead of division; we cannot do that
- // ourselves without having the log2 which cannot easily be computed as a constexpr
- p[index / BITS_PER_WORD] |= word_t(1) << (index % BITS_PER_WORD);
- }
- template<typename word_t>
- unsigned char bit (word_t const *p, unsigned index)
- {
- enum { BITS_PER_WORD = sizeof(word_t) * CHAR_BIT };
- return (unsigned char)((p[index / BITS_PER_WORD] >> (index % BITS_PER_WORD)) & 1u);
- }
- //-------------------------------------------------------------------------------------------------
- struct prime_and_offset_t
- {
- unsigned prime; // needs at most the 16 bits guaranteed by the standard
- unsigned next_offset; // this needs 32 bits
- prime_and_offset_t (unsigned p, unsigned o): prime(p), next_offset(o) { }
- };
- //-------------------------------------------------------------------------------------------------
- void initialise_odd_primes_and_offsets_64K (std::vector<prime_and_offset_t> &result)
- {
- typedef unsigned char byte;
- result.clear();
- result.reserve(6541); // # of odd primes below 2^16
- // bitmap has 32K, so putting it on the stack is not safe (besides, bitset<> is a C++11 feature)
- std::vector<byte> odd_composites_bitmap((1 << 15) / CHAR_BIT);
- byte *odd_composites = &odd_composites_bitmap[0]; // data() is a C++11 feature
- // looping over the odd numbers 3 to 2^16-1 means iterating over bit indices 1 to 2^15-1
- unsigned const max_bit = 65521 >> 1;
- unsigned const max_factor_bit = 255 >> 1;
- for (unsigned k = (3 >> 1); k <= max_factor_bit; ++k)
- {
- if (bit(odd_composites, k)) continue;
- unsigned n = (k << 1) + 1;
- unsigned i = (n * n) >> 1;
- for ( ; i <= max_bit; i += n)
- {
- set_bit(odd_composites, i);
- }
- }
- for (unsigned k = 1; k <= max_bit; ++k)
- {
- if (!bit(odd_composites, k))
- {
- unsigned n = (k << 1) + 1;
- result.push_back(prime_and_offset_t(n, (n * n) >> 1));
- }
- }
- }
- //>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
- // the loop logic must be adjusted if segment_bytes is not a power of two
- void initialise_packed_sieve_4G (void *data, unsigned segment_bytes = 1 << 15, unsigned end_bit = 1u << 31)
- {
- typedef std::vector<prime_and_offset_t>::iterator prime_iter_t;
- std::vector<prime_and_offset_t> small_factors;
- initialise_odd_primes_and_offsets_64K(small_factors);
- unsigned segment_bits = segment_bytes * CHAR_BIT;
- unsigned partial_bits = end_bit % segment_bits;
- unsigned segments = end_bit / segment_bits + (partial_bits != 0);
- unsigned char *segment = static_cast<unsigned char *>(data);
- unsigned bytes = segment_bytes;
- for ( ; segments--; segment += segment_bytes)
- {
- if (segments == 0 && partial_bits)
- {
- segment_bits = partial_bits;
- bytes = (partial_bits + CHAR_BIT - 1) / CHAR_BIT;
- }
- std::memset(segment, 0, bytes);
- for (prime_iter_t p = small_factors.begin(); p != small_factors.end(); ++p)
- {
- unsigned n = p->prime;
- unsigned i = p->next_offset;
- for ( ; i < segment_bits; i += n)
- {
- set_bit(segment, i);
- }
- p->next_offset = i - segment_bits;
- }
- }
- }
- ///////////////////////////////////////////////////////////////////////////////////////////////////
- // as far as prime counts and checksums are concerned, bit 0 is treated as a stand-in for the
- // non-representable number 2 (which, being even, is the odd man out in such a bitmap)
- //
- // { 203280220, 0C903F84, 2D929F89, 0DC7071A } 3942.9 ms
- // { 203280221, 0C903F86, 5B253F12, 774A3204 } 4007.3 ms
- struct digest_t
- {
- unsigned count;
- unsigned sum;
- unsigned product;
- unsigned checksum;
- digest_t () : count(0), sum(0), product(1), checksum(0) { }
- bool operator == (digest_t const &other) const
- {
- return checksum == other.checksum
- && product == other.product
- && sum == other.sum
- && count == other.count;
- }
- void add_prime (unsigned n)
- {
- count += 1;
- sum += n;
- product *= n;
- checksum += n * sum + product;
- }
- void add_odd_composites_bitmap (void const *bitmap, unsigned min_bit = 0, unsigned end_bit = 1u << 31)
- {
- unsigned char const *odd_composites = static_cast<unsigned char const *>(bitmap);
- if (min_bit < end_bit)
- {
- if (min_bit == 0) // conventional stand-in for the non-representable prime 2
- {
- if (bit(odd_composites, 0) == 0) add_prime(2);
- ++min_bit;
- }
- for (unsigned k = min_bit; k < end_bit; ++k)
- {
- if (bit(odd_composites, k)) continue;
- add_prime((k << 1) + 1);
- }
- }
- }
- };
- ///////////////////////////////////////////////////////////////////////////////////////////////////
- int main ()
- {
- unsigned sieve_bits = /**/ 1u << 31 /*/ 1000000000u >> 1 /**/;
- std::printf("\nsieve bits = %u (equiv. number = %u)\n", sieve_bits, (sieve_bits - 1) * 2 + 1);
- std::vector<unsigned char> bmv((sieve_bits + CHAR_BIT - 1) / CHAR_BIT);
- unsigned char *bitmap = &bmv[0];
- for (unsigned size_bits = 12; size_bits < 24; ++size_bits)
- {
- unsigned segment_size = 1u << size_bits;
- std::printf("\nsegment size %7u (2^%2u) bytes ... ", segment_size, size_bits);
- std::clock_t t0 = std::clock();
- initialise_packed_sieve_4G(bitmap, segment_size, sieve_bits);
- double seconds = double(std::clock() - t0) / CLOCKS_PER_SEC;
- std::printf("%7.3f s %8.1f M/s", seconds, sieve_bits * std::ldexp(2, -20) / seconds);
- }
- digest_t d;
- d.add_odd_composites_bitmap(bitmap, 0, sieve_bits);
- std::printf("\n\ndigest { %9u, %08X, %08X, %08X }", d.count, d.sum, d.product, d.checksum);
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement