Advertisement
DarthGizka

Sieve of Eratosthenes - segmented with pre-sieving #2

Nov 4th, 2014
770
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 11.18 KB | None | 0 0
  1. // zrbj_sx_sieve32_v4.cpp - test program for odd segmented Sieve of Eratosthenes (sieve init)
  2. // 2014-11-04 DarthGizka <KnightOwl@live.it>
  3. //
  4. // Ingredients for speed:
  5. // o odds-only bitmap (i.e. an order 2 wheel which happens to have only one non-zero residue)
  6. // o sieve in small segments that fit into the L1 cache (16K for Atom, 32K for most else)
  7. //   o zero a segment before sieving to adds an extra boost (even if OS hands over zeroed pages)
  8. // o avoid BT/BTS intrinsics since these have lock semantics and slow things down drastically
  9. // o byte write access with stride < word size might cause aliasing problems (i.e. multiple
  10. //   modifying accesses to bytes in the same word)
  11. //   -> ameliorated by bulk replication of the small prime bit pattern via memcpy()
  12. //   -> pattern becomes regular starting with the first word not containing any of the primes
  13. //      (since only multiples get crossed out, not the primes)
  14. //
  15. // v4: blasting the presieve pattern over the whole sieve instead of over each segment as it
  16. //     is processed; overall timing improves despite the lost cache-warming from the per-segment
  17. //     initialisation copy
  18. //     (VC++ timings improve to 2.3 s but gcc gains only marginally, 1.9 s vs. 2.1 s in v2)
  19. //
  20. // Note: the purpose of this program is to experiment with the algorithmic ideas, to formulate
  21. // the algorithms as clear and clean as possible, with the least possible amount of the noise
  22. // that makes so much excellent production code totally unreadable and incomprehensible.
  23.  
  24. #undef NDEBUG
  25.  
  26. //#include "zrbj/hdr_pre.h"
  27. #include <cassert>
  28. #include <cmath>
  29. #include <ctime>
  30. #include <climits>
  31. #include <cstdio>
  32. #include <cstring>
  33. #include <vector>
  34. //#include "zrbj/hdr_post.h"
  35.  
  36. enum {  unsigned_has_at_least_32_bits = sizeof(char[1 - (sizeof(unsigned) * CHAR_BIT < 32)])  };
  37.  
  38. template<typename word_t>
  39. void set_bit (word_t *p, unsigned index)
  40. {
  41.    enum {  BITS_PER_WORD = sizeof(word_t) * CHAR_BIT  };
  42.  
  43.    // we can trust the compiler to use masking and shifting instead of division; we cannot do that
  44.    // ourselves without having the log2 which cannot easily be computed as a constexpr
  45.    
  46.    p[index / BITS_PER_WORD] |= word_t(1) << (index % BITS_PER_WORD);
  47. }
  48.  
  49. template<typename word_t>
  50. void clear_bit (word_t *p, unsigned index)
  51. {
  52.    enum {  BITS_PER_WORD = sizeof(word_t) * CHAR_BIT  };
  53.  
  54.    p[index / BITS_PER_WORD] &= ~(word_t(1) << (index % BITS_PER_WORD));
  55. }
  56.  
  57. template<typename word_t>
  58. unsigned char bit (word_t const *p, unsigned index)
  59. {
  60.    enum {  BITS_PER_WORD = sizeof(word_t) * CHAR_BIT  };
  61.  
  62.    return (unsigned char)((p[index / BITS_PER_WORD] >> (index % BITS_PER_WORD)) & 1u);
  63. }
  64.  
  65. //-------------------------------------------------------------------------------------------------
  66.  
  67. struct prime_and_offset_t
  68. {
  69.    unsigned prime;         // needs at most the 16 bits guaranteed by the standard
  70.    unsigned next_offset;   // this needs 32 bits
  71.  
  72.    prime_and_offset_t (unsigned p, unsigned o): prime(p), next_offset(o)  {  }
  73. };
  74.  
  75. //-------------------------------------------------------------------------------------------------
  76.  
  77. void initialise_odd_primes_and_offsets_64K (std::vector<prime_and_offset_t> &result)
  78. {
  79.    typedef unsigned char byte;
  80.  
  81.    result.clear();
  82.    result.reserve(6541);  // # of odd primes below 2^16
  83.  
  84.    // bitmap has 32K, so putting it on the stack is not safe (besides, bitset<> is a C++11 feature)
  85.    std::vector<byte> odd_composites_bitmap((1 << 15) / CHAR_BIT);
  86.    byte *odd_composites = &odd_composites_bitmap[0];  // data() is a C++11 feature
  87.  
  88.    // looping over the odd numbers 3 to 2^16-1 means iterating over bit indices 1 to 2^15-1
  89.    unsigned const max_bit = 65521 >> 1;
  90.    unsigned const max_factor_bit = 255 >> 1;
  91.  
  92.    for (unsigned k = (3 >> 1); k <= max_factor_bit; ++k)
  93.    {
  94.       if (bit(odd_composites, k))  continue;
  95.  
  96.       unsigned n = (k << 1) + 1;
  97.       unsigned i = (n * n) >> 1;
  98.  
  99.       for ( ; i <= max_bit; i += n)
  100.       {
  101.          set_bit(odd_composites, i);
  102.       }
  103.    }
  104.  
  105.    for (unsigned k = 1; k <= max_bit; ++k)
  106.    {
  107.       if (!bit(odd_composites, k))
  108.       {
  109.          unsigned n = (k << 1) + 1;
  110.          
  111.          result.push_back(prime_and_offset_t(n, (n * n) >> 1));
  112.       }
  113.    }
  114. }
  115.  
  116. ///////////////////////////////////////////////////////////////////////////////////////////////////
  117. // 23 is the last prime for which the running product (111546435) is less than 2^31 / CHAR_BIT;
  118. // its factor is 2.41 (19: 55.35)
  119. //
  120. // # primes (max. prime) vs. presieve time vs. total time (VC++):
  121. //
  122. //    2 ( 5)   100 ms   3.207 s
  123. //    3 ( 7)    42 ms   2.919 s
  124. //    4 (11)    30 ms   2.758 s
  125. //    5 (13)    13 ms   2.581 s
  126. //    6 (17)    18 ms   2.474 s
  127. //    7 (19)    22 ms   2.380 s
  128. //    8 (23)    68 ms   2.333 s  <-- best overall
  129.  
  130. unsigned char const small_odd_primes[] = {  3, 5, 7, 11, 13, 17, 19, 23  };
  131. unsigned const MAX_PRESIEVE_PRIMES = sizeof(small_odd_primes) / sizeof(small_odd_primes[0]);
  132.  
  133. void fill_with_presieve_pattern (void *data, unsigned bytes, unsigned primes)
  134. {
  135.    assert( primes >= 2 && primes <= MAX_PRESIEVE_PRIMES );
  136.    assert( bytes >= 111546435u );
  137.  
  138.    static unsigned char const pattern_35[] =
  139.    {
  140.       0x96, 0x34, 0x4B, 0x9A, 0x25, 0xCD, 0x92, 0x66, 0x49, 0xB3, 0xA4, 0x59, 0xD2, 0x2C, 0x69
  141.    };
  142.  
  143.    unsigned char *p = static_cast<unsigned char *>(data);
  144.    unsigned size = sizeof(pattern_35);
  145.  
  146.    std::memcpy(p, pattern_35, size);
  147.  
  148.    for (unsigned i = 2; i < primes; ++i)
  149.    {
  150.       unsigned n = small_odd_primes[i];
  151.       unsigned new_size = size * n;
  152.       unsigned char *h = p + size, *e = p + new_size;
  153.  
  154.       for ( ; h < e; h += size)  // replicate the current pattern n - 1 times
  155.       {
  156.          std::memcpy(h, p, size);
  157.       }
  158.  
  159.       size = new_size;
  160.  
  161.       for (unsigned k = n >> 1, bits = size * CHAR_BIT; k < bits; k += n)  
  162.       {
  163.          set_bit(p, k);
  164.       }
  165.    }
  166.  
  167.    unsigned odd_bytes = bytes % size;
  168.    unsigned char *h = p + size;                  // unprocessed part starts here
  169.    unsigned char *e = p + bytes - odd_bytes;     // end of last multiple
  170.  
  171.    // self-replicating copy might be possible here but it is not guaranteed to work
  172.    for ( ; h < e; h += size)  std::memcpy(h, p, size);
  173.  
  174.    if (odd_bytes)  std::memcpy(e, p, odd_bytes);
  175.  
  176.    // set the primes themselves back to zero
  177.    for (unsigned i = 0; i < primes; ++i)
  178.    {
  179.       clear_bit(p, small_odd_primes[i] / 2u);
  180.    }
  181. }
  182.  
  183. //>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
  184. // the loop logic must be adjusted if segment_bytes is not a power of two
  185.  
  186. unsigned g_presieve_primes = MAX_PRESIEVE_PRIMES;
  187.  
  188. void initialise_packed_sieve_4G (void *data, unsigned segment_bytes = 1 << 15, unsigned end_bit = 1u << 31)
  189. {
  190.    typedef std::vector<prime_and_offset_t>::iterator prime_iter_t;
  191.    std::vector<prime_and_offset_t> small_factors;
  192.  
  193.    initialise_odd_primes_and_offsets_64K(small_factors);
  194.    fill_with_presieve_pattern(data, (end_bit + CHAR_BIT - 1) / CHAR_BIT, g_presieve_primes);
  195.  
  196.    unsigned segment_bits = segment_bytes * CHAR_BIT;
  197.    unsigned partial_bits = end_bit % segment_bits;
  198.    unsigned segments     = end_bit / segment_bits + (partial_bits != 0);
  199.  
  200.    for (unsigned offset = 0; segments--; offset += segment_bytes)
  201.    {
  202.       if (segments == 0 && partial_bits)  // last segment may not be full size
  203.       {
  204.          segment_bits = partial_bits;
  205.       }
  206.      
  207.       unsigned char *segment = static_cast<unsigned char *>(data) + offset;
  208.  
  209.       for (prime_iter_t p = small_factors.begin() + g_presieve_primes; p != small_factors.end(); ++p)
  210.       {
  211.          unsigned n = p->prime;
  212.          unsigned i = p->next_offset;
  213.  
  214.          for ( ; i < segment_bits; i += n)
  215.          {
  216.             set_bit(segment, i);
  217.          }
  218.  
  219.           p->next_offset = i - segment_bits;
  220.       }
  221.    }
  222. }
  223.  
  224. ///////////////////////////////////////////////////////////////////////////////////////////////////
  225. // as far as prime counts and checksums are concerned, bit 0 is treated as a stand-in for the
  226. // non-representable number 2 (which, being even, is the odd man out in such a bitmap)
  227. //
  228. // { 203280220, 0C903F84, 2D929F89, 0DC7071A } 3942.9 ms
  229. // { 203280221, 0C903F86, 5B253F12, 774A3204 } 4007.3 ms
  230.  
  231. struct digest_t
  232. {
  233.    unsigned count;
  234.    unsigned sum;
  235.    unsigned product;
  236.    unsigned checksum;
  237.  
  238.    digest_t ()
  239.    : count(0), sum(0), product(1), checksum(0)  {  }
  240.    
  241.    digest_t (unsigned c, unsigned s, unsigned p, unsigned x)
  242.    : count(c), sum(s), product(p), checksum(x)  {  }
  243.  
  244.    bool operator != (digest_t const &other) const {  return !(*this == other);  }
  245.  
  246.    bool operator == (digest_t const &other) const
  247.    {  
  248.       return checksum == other.checksum
  249.           && product  == other.product
  250.           && sum      == other.sum
  251.           && count    == other.count;
  252.    }
  253.  
  254.    void add_prime (unsigned n)
  255.    {
  256.       count    += 1;
  257.       sum      += n;
  258.       product  *= n;
  259.       checksum += n * sum + product;
  260.    }
  261.  
  262.    void add_odd_composites_bitmap (void const *bitmap, unsigned min_bit = 0, unsigned end_bit = 1u << 31)
  263.    {
  264.       unsigned char const *odd_composites = static_cast<unsigned char const *>(bitmap);
  265.  
  266.       if (min_bit < end_bit)
  267.       {
  268.          if (min_bit == 0)  // conventional stand-in for the non-representable prime 2
  269.          {
  270.             if (bit(odd_composites, 0) == 0)  add_prime(2);
  271.    
  272.             ++min_bit;
  273.          }
  274.  
  275.          for (unsigned k = min_bit; k < end_bit; ++k)
  276.          {
  277.             if (bit(odd_composites, k))  continue;
  278.  
  279.             add_prime((k << 1) + 1);
  280.          }
  281.       }
  282.    }
  283. };
  284.  
  285. digest_t const Digest4G( 203280221, 0x0C903F86, 0x5B253F12, 0x774A3204 );
  286.  
  287. ///////////////////////////////////////////////////////////////////////////////////////////////////
  288.  
  289. int main ()
  290. {
  291.    unsigned sieve_bits = 1u << 31;
  292.  
  293.    std::printf("\nsieve bits = %u (equiv. number = %u)", sieve_bits, (sieve_bits - 1) * 2 + 1);
  294.  
  295.    std::vector<unsigned char> bmv((sieve_bits + CHAR_BIT - 1) / CHAR_BIT);
  296.    unsigned char *bitmap = &bmv[0];
  297.  
  298.    for (g_presieve_primes = 2; g_presieve_primes <= MAX_PRESIEVE_PRIMES; ++g_presieve_primes)
  299.    {
  300.       std::printf("\n\n*** %u presieve primes (%u to %u) ***\n",
  301.          g_presieve_primes,
  302.          small_odd_primes[0], small_odd_primes[g_presieve_primes - 1]  );
  303.  
  304.       for (unsigned size_bits = 12; size_bits < 20; ++size_bits)
  305.       {
  306.          unsigned segment_size = 1u << size_bits;
  307.      
  308.          std::printf("\nsegment size %8u (2^%2u) bytes ...", segment_size, size_bits);
  309.  
  310.          std::clock_t t0 = std::clock();
  311.          initialise_packed_sieve_4G(bitmap, segment_size, sieve_bits);
  312.          double seconds = double(std::clock() - t0) / CLOCKS_PER_SEC;
  313.  
  314.          std::printf("%7.3f s %8.1f M/s", seconds, sieve_bits * std::ldexp(2.0, -20) / seconds);
  315.       }
  316.  
  317.       digest_t d;
  318.    
  319.       d.add_odd_composites_bitmap(bitmap, 0, sieve_bits);
  320.  
  321.       std::printf("\n\ndigest { %9u, %08X, %08X, %08X }", d.count, d.sum, d.product, d.checksum);
  322.  
  323.       if (d != Digest4G)  
  324.       {
  325.          std::printf("\n*** FAIL ***\nd != Digest4G");
  326.      
  327.          return 3; // indicate failure
  328.       }
  329.    
  330.    } // iteration over presieve prime counts
  331.  
  332.    return 0;
  333. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement