Advertisement
DarthGizka

Sieve of Eratosthenes - timings for different segment sizes

Nov 3rd, 2014
531
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 7.34 KB | None | 0 0
  1. // zrbj_sx_sieve32.cpp - test program for odd segmented Sieve of Eratosthenes (sieve init)
  2. // 2014-11-01 DarthGizka <KnightOwl@live.it>
  3. //
  4. // Ingredients for speed:
  5. // o odds-only bitmap (i.e. an order 2 wheel which happens to have only one non-zero residue)
  6. // o sieve in small segments that fit into the L1 cache (16K for Atom, 32K for most else)
  7. //   o zero a segment before sieving to adds an extra boost (even if OS hands over zeroed pages)
  8. // o avoid BT/BTS intrinsics since these have lock semantics and slow things down drastically
  9. // o byte write access with stride < word size might cause aliasing problems (i.e. multiple
  10. //   modifying accesses to bytes in the same word)
  11. //   -> ameliorated by bulk replication of the small prime bit pattern via memcpy()
  12. //   -> pattern becomes regular starting with the first word not containing any of the primes
  13. //      (since only multiples get crossed out, not the primes)
  14.  
  15. //#include "zrbj/hdr_pre.h"
  16. #include <cassert>
  17. #include <cmath>
  18. #include <ctime>
  19. #include <climits>
  20. #include <cstdio>
  21. #include <cstring>
  22. #include <vector>
  23. //#include "zrbj/hdr_post.h"
  24.  
  25. enum {  unsigned_has_at_least_32_bits = sizeof(char[1 - (sizeof(unsigned) * CHAR_BIT < 32)])  };
  26.  
  27. template<typename word_t>
  28. void set_bit (word_t *p, unsigned index)
  29. {
  30.    enum {  BITS_PER_WORD = sizeof(word_t) * CHAR_BIT  };
  31.  
  32.    // we can trust the compiler to use masking and shifting instead of division; we cannot do that
  33.    // ourselves without having the log2 which cannot easily be computed as a constexpr
  34.    
  35.    p[index / BITS_PER_WORD] |= word_t(1) << (index % BITS_PER_WORD);
  36. }
  37.  
  38. template<typename word_t>
  39. unsigned char bit (word_t const *p, unsigned index)
  40. {
  41.    enum {  BITS_PER_WORD = sizeof(word_t) * CHAR_BIT  };
  42.  
  43.    return (unsigned char)((p[index / BITS_PER_WORD] >> (index % BITS_PER_WORD)) & 1u);
  44. }
  45.  
  46. //-------------------------------------------------------------------------------------------------
  47.  
  48. struct prime_and_offset_t
  49. {
  50.    unsigned prime;         // needs at most the 16 bits guaranteed by the standard
  51.    unsigned next_offset;   // this needs 32 bits
  52.  
  53.    prime_and_offset_t (unsigned p, unsigned o): prime(p), next_offset(o)  {  }
  54. };
  55.  
  56. //-------------------------------------------------------------------------------------------------
  57.  
  58. void initialise_odd_primes_and_offsets_64K (std::vector<prime_and_offset_t> &result)
  59. {
  60.    typedef unsigned char byte;
  61.  
  62.    result.clear();
  63.    result.reserve(6541);  // # of odd primes below 2^16
  64.  
  65.    // bitmap has 32K, so putting it on the stack is not safe (besides, bitset<> is a C++11 feature)
  66.    std::vector<byte> odd_composites_bitmap((1 << 15) / CHAR_BIT);
  67.    byte *odd_composites = &odd_composites_bitmap[0];  // data() is a C++11 feature
  68.  
  69.    // looping over the odd numbers 3 to 2^16-1 means iterating over bit indices 1 to 2^15-1
  70.    unsigned const max_bit = 65521 >> 1;
  71.    unsigned const max_factor_bit = 255 >> 1;
  72.  
  73.    for (unsigned k = (3 >> 1); k <= max_factor_bit; ++k)
  74.    {
  75.       if (bit(odd_composites, k))  continue;
  76.  
  77.       unsigned n = (k << 1) + 1;
  78.       unsigned i = (n * n) >> 1;
  79.  
  80.       for ( ; i <= max_bit; i += n)
  81.       {
  82.          set_bit(odd_composites, i);
  83.       }
  84.    }
  85.  
  86.    for (unsigned k = 1; k <= max_bit; ++k)
  87.    {
  88.       if (!bit(odd_composites, k))
  89.       {
  90.          unsigned n = (k << 1) + 1;
  91.          
  92.          result.push_back(prime_and_offset_t(n, (n * n) >> 1));
  93.       }
  94.    }
  95. }
  96.  
  97. //>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
  98. // the loop logic must be adjusted if segment_bytes is not a power of two
  99.  
  100. void initialise_packed_sieve_4G (void *data, unsigned segment_bytes = 1 << 15, unsigned end_bit = 1u << 31)
  101. {
  102.    typedef std::vector<prime_and_offset_t>::iterator prime_iter_t;
  103.    std::vector<prime_and_offset_t> small_factors;
  104.  
  105.    initialise_odd_primes_and_offsets_64K(small_factors);
  106.  
  107.    unsigned segment_bits = segment_bytes * CHAR_BIT;
  108.    unsigned partial_bits = end_bit % segment_bits;
  109.    unsigned segments     = end_bit / segment_bits + (partial_bits != 0);
  110.  
  111.    unsigned char *segment = static_cast<unsigned char *>(data);
  112.    unsigned bytes = segment_bytes;
  113.  
  114.    for ( ; segments--; segment += segment_bytes)
  115.    {
  116.       if (segments == 0 && partial_bits)
  117.       {
  118.          segment_bits = partial_bits;
  119.          bytes = (partial_bits + CHAR_BIT - 1) / CHAR_BIT;
  120.       }
  121.      
  122.       std::memset(segment, 0, bytes);
  123.      
  124.       for (prime_iter_t p = small_factors.begin(); p != small_factors.end(); ++p)
  125.       {
  126.          unsigned n = p->prime;
  127.          unsigned i = p->next_offset;
  128.  
  129.          for ( ; i < segment_bits; i += n)
  130.          {
  131.             set_bit(segment, i);
  132.          }
  133.  
  134.           p->next_offset = i - segment_bits;
  135.       }
  136.    }
  137. }
  138.  
  139. ///////////////////////////////////////////////////////////////////////////////////////////////////
  140. // as far as prime counts and checksums are concerned, bit 0 is treated as a stand-in for the
  141. // non-representable number 2 (which, being even, is the odd man out in such a bitmap)
  142. //
  143. // { 203280220, 0C903F84, 2D929F89, 0DC7071A } 3942.9 ms
  144. // { 203280221, 0C903F86, 5B253F12, 774A3204 } 4007.3 ms
  145.  
  146. struct digest_t
  147. {
  148.    unsigned count;
  149.    unsigned sum;
  150.    unsigned product;
  151.    unsigned checksum;
  152.  
  153.    digest_t () : count(0), sum(0), product(1), checksum(0)  {  }
  154.  
  155.    bool operator == (digest_t const &other) const
  156.    {  
  157.       return checksum == other.checksum
  158.           && product  == other.product
  159.           && sum      == other.sum
  160.           && count    == other.count;
  161.    }
  162.  
  163.    void add_prime (unsigned n)
  164.    {
  165.       count    += 1;
  166.       sum      += n;
  167.       product  *= n;
  168.       checksum += n * sum + product;
  169.    }
  170.  
  171.    void add_odd_composites_bitmap (void const *bitmap, unsigned min_bit = 0, unsigned end_bit = 1u << 31)
  172.    {
  173.       unsigned char const *odd_composites = static_cast<unsigned char const *>(bitmap);
  174.  
  175.       if (min_bit < end_bit)
  176.       {
  177.          if (min_bit == 0)  // conventional stand-in for the non-representable prime 2
  178.          {
  179.             if (bit(odd_composites, 0) == 0)  add_prime(2);
  180.    
  181.             ++min_bit;
  182.          }
  183.  
  184.          for (unsigned k = min_bit; k < end_bit; ++k)
  185.          {
  186.             if (bit(odd_composites, k))  continue;
  187.  
  188.             add_prime((k << 1) + 1);
  189.          }
  190.       }
  191.    }
  192. };
  193.  
  194. ///////////////////////////////////////////////////////////////////////////////////////////////////
  195.  
  196. int main ()
  197. {
  198.    unsigned sieve_bits = /**/ 1u << 31 /*/ 1000000000u >> 1 /**/;
  199.  
  200.    std::printf("\nsieve bits = %u (equiv. number = %u)\n", sieve_bits, (sieve_bits - 1) * 2 + 1);
  201.  
  202.    std::vector<unsigned char> bmv((sieve_bits + CHAR_BIT - 1) / CHAR_BIT);
  203.    unsigned char *bitmap = &bmv[0];
  204.  
  205.    for (unsigned size_bits = 12; size_bits < 24; ++size_bits)
  206.    {
  207.       unsigned segment_size = 1u << size_bits;
  208.  
  209.       std::printf("\nsegment size %7u (2^%2u) bytes ... ", segment_size, size_bits);
  210.  
  211.       std::clock_t t0 = std::clock();
  212.       initialise_packed_sieve_4G(bitmap, segment_size, sieve_bits);
  213.       double seconds = double(std::clock() - t0) / CLOCKS_PER_SEC;
  214.  
  215.       std::printf("%7.3f s %8.1f M/s", seconds, sieve_bits * std::ldexp(2, -20) / seconds);
  216.    }
  217.  
  218.    digest_t d;
  219.    
  220.    d.add_odd_composites_bitmap(bitmap, 0, sieve_bits);
  221.  
  222.    std::printf("\n\ndigest { %9u, %08X, %08X, %08X }", d.count, d.sum, d.product, d.checksum);
  223.  
  224.    return 0;
  225. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement