SHARE
TWEET

Sieve of Eratosthenes - segmented with pre-sieving

DarthGizka Nov 3rd, 2014 (edited) 312 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. // zrbj_sx_sieve32_v2.cpp - test program for odd segmented Sieve of Eratosthenes (sieve init)
  2. // 2014-11-03 DarthGizka <KnightOwl@live.it>
  3. //
  4. // Ingredients for speed:
  5. // o odds-only bitmap (i.e. an order 2 wheel which happens to have only one non-zero residue)
  6. // o sieve in small segments that fit into the L1 cache (16K for Atom, 32K for most else)
  7. //   o zero a segment before sieving to adds an extra boost (even if OS hands over zeroed pages)
  8. // o avoid BT/BTS intrinsics since these have lock semantics and slow things down drastically
  9. // o byte write access with stride < word size might cause aliasing problems (i.e. multiple
  10. //   modifying accesses to bytes in the same word)
  11. //   -> ameliorated by bulk replication of the small prime bit pattern via memcpy()
  12. //   -> pattern becomes regular starting with the first word not containing any of the primes
  13. //      (since only multiples get crossed out, not the primes)
  14.  
  15. //#include "zrbj/hdr_pre.h"
  16. #include <cassert>
  17. #include <cmath>
  18. #include <ctime>
  19. #include <climits>
  20. #include <cstdio>
  21. #include <cstring>
  22. #include <vector>
  23. //#include "zrbj/hdr_post.h"
  24.  
  25. enum {  unsigned_has_at_least_32_bits = sizeof(char[1 - (sizeof(unsigned) * CHAR_BIT < 32)])  };
  26.  
  27. template<typename word_t>
  28. void set_bit (word_t *p, unsigned index)
  29. {
  30.    enum {  BITS_PER_WORD = sizeof(word_t) * CHAR_BIT  };
  31.  
  32.    // we can trust the compiler to use masking and shifting instead of division; we cannot do that
  33.    // ourselves without having the log2 which cannot easily be computed as a constexpr
  34.    
  35.    p[index / BITS_PER_WORD] |= word_t(1) << (index % BITS_PER_WORD);
  36. }
  37.  
  38. template<typename word_t>
  39. unsigned char bit (word_t const *p, unsigned index)
  40. {
  41.    enum {  BITS_PER_WORD = sizeof(word_t) * CHAR_BIT  };
  42.  
  43.    return (unsigned char)((p[index / BITS_PER_WORD] >> (index % BITS_PER_WORD)) & 1u);
  44. }
  45.  
  46. //-------------------------------------------------------------------------------------------------
  47.  
  48. struct prime_and_offset_t
  49. {
  50.    unsigned prime;         // needs at most the 16 bits guaranteed by the standard
  51.    unsigned next_offset;   // this needs 32 bits
  52.  
  53.    prime_and_offset_t (unsigned p, unsigned o): prime(p), next_offset(o)  {  }
  54. };
  55.  
  56. //-------------------------------------------------------------------------------------------------
  57.  
  58. void initialise_odd_primes_and_offsets_64K (std::vector<prime_and_offset_t> &result)
  59. {
  60.    typedef unsigned char byte;
  61.  
  62.    result.clear();
  63.    result.reserve(6541);  // # of odd primes below 2^16
  64.  
  65.    // bitmap has 32K, so putting it on the stack is not safe (besides, bitset<> is a C++11 feature)
  66.    std::vector<byte> odd_composites_bitmap((1 << 15) / CHAR_BIT);
  67.    byte *odd_composites = &odd_composites_bitmap[0];  // data() is a C++11 feature
  68.  
  69.    // looping over the odd numbers 3 to 2^16-1 means iterating over bit indices 1 to 2^15-1
  70.    unsigned const max_bit = 65521 >> 1;
  71.    unsigned const max_factor_bit = 255 >> 1;
  72.  
  73.    for (unsigned k = (3 >> 1); k <= max_factor_bit; ++k)
  74.    {
  75.       if (bit(odd_composites, k))  continue;
  76.  
  77.       unsigned n = (k << 1) + 1;
  78.       unsigned i = (n * n) >> 1;
  79.  
  80.       for ( ; i <= max_bit; i += n)
  81.       {
  82.          set_bit(odd_composites, i);
  83.       }
  84.    }
  85.  
  86.    for (unsigned k = 1; k <= max_bit; ++k)
  87.    {
  88.       if (!bit(odd_composites, k))
  89.       {
  90.          unsigned n = (k << 1) + 1;
  91.          
  92.          result.push_back(prime_and_offset_t(n, (n * n) >> 1));
  93.       }
  94.    }
  95. }
  96.  
  97. ///////////////////////////////////////////////////////////////////////////////////////////////////
  98.  
  99. unsigned const LAST_PRESIEVE_PRIME = 13;
  100. unsigned const PATTERN_SIZE = 3 * 5 * 7 * 11 * 13 * (LAST_PRESIEVE_PRIME == 17 ? 17 : 1);
  101. unsigned const PRESIEVED_PRIMES = 5 + (LAST_PRESIEVE_PRIME == 17);
  102. unsigned char const PRESIEVE_0_FLIP = 0x6Eu;
  103.  
  104. void make_presieve_pattern_v1 (std::vector<unsigned char> &result, unsigned segment_bytes)
  105. {
  106.    static unsigned char const pattern_35[] =
  107.    {
  108.       0x96, 0x34, 0x4B, 0x9A, 0x25, 0xCD, 0x92, 0x66, 0x49, 0xB3, 0xA4, 0x59, 0xD2, 0x2C, 0x69
  109.    };
  110.      
  111.    result.resize(segment_bytes + PATTERN_SIZE);
  112.  
  113.    unsigned char *pattern = &result[0];
  114.    unsigned size = sizeof(pattern_35);
  115.  
  116.    std::memcpy(pattern, pattern_35, size);
  117.  
  118.    for (unsigned n = 7; n <= LAST_PRESIEVE_PRIME; n += 2)
  119.    {
  120.       n += (n % 3 == 0) * 2;
  121.       for (unsigned i = 1; i < n; ++i)  std::memcpy(pattern + i * size, pattern, size);
  122.       size *= n;
  123.       for (unsigned i = n >> 1, bits = size * CHAR_BIT; i < bits; i += n)  set_bit(pattern, i);
  124.    }
  125.  
  126.    unsigned char *p = pattern + PATTERN_SIZE, *e = pattern + segment_bytes;
  127.  
  128.    for ( ; p < e; p += PATTERN_SIZE)  std::memcpy(p, pattern, PATTERN_SIZE);
  129.  
  130.    if (unsigned rest = unsigned(e + PATTERN_SIZE - p))  std::memcpy(p, pattern, rest);
  131.  
  132.    pattern[0] &= ~PRESIEVE_0_FLIP;
  133. }
  134.  
  135. //>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
  136. // the loop logic must be adjusted if segment_bytes is not a power of two
  137.  
  138. void initialise_packed_sieve_4G (void *data, unsigned segment_bytes = 1 << 15, unsigned end_bit = 1u << 31)
  139. {
  140.    typedef std::vector<prime_and_offset_t>::iterator prime_iter_t;
  141.    std::vector<prime_and_offset_t> small_factors;
  142.    std::vector<unsigned char> presieve_pattern;
  143.  
  144.    initialise_odd_primes_and_offsets_64K(small_factors);
  145.    make_presieve_pattern_v1(presieve_pattern, segment_bytes);
  146.  
  147.    unsigned segment_bits = segment_bytes * CHAR_BIT;
  148.    unsigned partial_bits = end_bit % segment_bits;
  149.    unsigned segments     = end_bit / segment_bits + (partial_bits != 0);
  150.  
  151.    unsigned bytes = segment_bytes, offset = 0;
  152.  
  153.    for ( ; segments--; offset += segment_bytes)
  154.    {
  155.       if (segments == 0 && partial_bits)
  156.       {
  157.          segment_bits = partial_bits;
  158.          bytes = (partial_bits + CHAR_BIT - 1) / CHAR_BIT;
  159.       }
  160.      
  161.       unsigned char *segment = static_cast<unsigned char *>(data) + offset;
  162.  
  163.       std::memcpy(segment, &presieve_pattern[offset % PATTERN_SIZE], bytes);
  164.      
  165.       for (prime_iter_t p = small_factors.begin() + PRESIEVED_PRIMES; p != small_factors.end(); ++p)
  166.       {
  167.          unsigned n = p->prime;
  168.          unsigned i = p->next_offset;
  169.  
  170.          for ( ; i < segment_bits; i += n)
  171.          {
  172.             set_bit(segment, i);
  173.          }
  174.  
  175.           p->next_offset = i - segment_bits;
  176.       }
  177.  
  178.       presieve_pattern[0] |= PRESIEVE_0_FLIP;
  179.    }
  180. }
  181.  
  182. ///////////////////////////////////////////////////////////////////////////////////////////////////
  183. // as far as prime counts and checksums are concerned, bit 0 is treated as a stand-in for the
  184. // non-representable number 2 (which, being even, is the odd man out in such a bitmap)
  185. //
  186. // { 203280220, 0C903F84, 2D929F89, 0DC7071A } 3942.9 ms
  187. // { 203280221, 0C903F86, 5B253F12, 774A3204 } 4007.3 ms
  188.  
  189. struct digest_t
  190. {
  191.    unsigned count;
  192.    unsigned sum;
  193.    unsigned product;
  194.    unsigned checksum;
  195.  
  196.    digest_t () : count(0), sum(0), product(1), checksum(0)  {  }
  197.  
  198.    bool operator == (digest_t const &other) const
  199.    {  
  200.       return checksum == other.checksum
  201.           && product  == other.product
  202.           && sum      == other.sum
  203.           && count    == other.count;
  204.    }
  205.  
  206.    void add_prime (unsigned n)
  207.    {
  208.       count    += 1;
  209.       sum      += n;
  210.       product  *= n;
  211.       checksum += n * sum + product;
  212.    }
  213.  
  214.    void add_odd_composites_bitmap (void const *bitmap, unsigned min_bit = 0, unsigned end_bit = 1u << 31)
  215.    {
  216.       unsigned char const *odd_composites = static_cast<unsigned char const *>(bitmap);
  217.  
  218.       if (min_bit < end_bit)
  219.       {
  220.          if (min_bit == 0)  // conventional stand-in for the non-representable prime 2
  221.          {
  222.             if (bit(odd_composites, 0) == 0)  add_prime(2);
  223.    
  224.             ++min_bit;
  225.          }
  226.  
  227.          for (unsigned k = min_bit; k < end_bit; ++k)
  228.          {
  229.             if (bit(odd_composites, k))  continue;
  230.  
  231.             add_prime((k << 1) + 1);
  232.          }
  233.       }
  234.    }
  235. };
  236.  
  237. ///////////////////////////////////////////////////////////////////////////////////////////////////
  238.  
  239. int main ()
  240. {
  241.    unsigned sieve_bits = /**/ 1u << 31 /*/ 1000000000u >> 1 /**/;
  242.  
  243.    std::printf("\nsieve bits = %u (equiv. number = %u)\n", sieve_bits, (sieve_bits - 1) * 2 + 1);
  244.  
  245.    std::vector<unsigned char> bmv((sieve_bits + CHAR_BIT - 1) / CHAR_BIT);
  246.    unsigned char *bitmap = &bmv[0];
  247.  
  248.    for (unsigned size_bits = 12; size_bits < 25; ++size_bits)
  249.    {
  250.       unsigned segment_size = 1u << size_bits;
  251.      
  252.       std::printf("\nsegment size %8u (2^%2u) bytes ...", segment_size, size_bits);
  253.  
  254.       std::clock_t t0 = std::clock();
  255.       initialise_packed_sieve_4G(bitmap, segment_size, sieve_bits);
  256.       double seconds = double(std::clock() - t0) / CLOCKS_PER_SEC;
  257.  
  258.       std::printf("%7.3f s %8.1f M/s", seconds, sieve_bits * std::ldexp(2, -20) / seconds);
  259.    }
  260.  
  261.    digest_t d;
  262.    
  263.    d.add_odd_composites_bitmap(bitmap, 0, sieve_bits);
  264.  
  265.    std::printf("\n\ndigest { %9u, %08X, %08X, %08X }", d.count, d.sum, d.product, d.checksum);
  266.  
  267.    return 0;
  268. }
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
Not a member of Pastebin yet?
Sign Up, it unlocks many cool features!
 
Top