Advertisement
Guest User

Untitled

a guest
Mar 12th, 2016
159
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 6.57 KB | None | 0 0
  1. #include <algorithm>
  2. #include <functional>
  3. #include <array>
  4. #include <iostream>
  5. #include <numeric>
  6. #include <omp.h>
  7. #include <chrono>
  8. #include <thread>
  9. #include <sys/mman.h>
  10.  
  11. using std::bind;
  12. using namespace std::placeholders;
  13. using namespace std;
  14.  
  15. template <class T> struct SimpleAllocator {
  16.   typedef T value_type;
  17.   T * allocate(std::size_t n) {
  18.     return (T *)mmap(NULL, n * sizeof(T), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_POPULATE | MAP_ANONYMOUS, 0, 0);
  19.   }
  20.   void deallocate(T * p, std::size_t n) { munmap(p, n * sizeof(T));}
  21. };
  22.  
  23. typedef std::vector<uint32_t, SimpleAllocator<uint32_t>> test_vec_t;
  24.  
  25. uint32_t gen_random(void) {
  26.   const int IM = 139968, IA = 3877, IC = 29573;
  27.   static int last = 42;
  28.   last = (last * IA + IC) % IM;
  29.   return last;
  30. }
  31.  
  32.  
  33. auto gen_def(test_vec_t & v) {
  34.   std::generate(v.begin(), v.end(), gen_random);
  35. }
  36.  
  37.  
  38. typedef __v4df double_vec4_t;
  39.  
  40. double_vec4_t vmod_139968(double_vec4_t n) {
  41.   return n + (_mm256_floor_pd(n * _mm256_set1_pd(1. / 139968)) * _mm256_set1_pd(-139968));
  42. }
  43.  
  44. double_vec4_t vsumm(double_vec4_t apowm, double_vec4_t & vsum_last) {
  45.   double_vec4_t r = _mm256_set_pd(apowm[0] + apowm[1] + apowm[2] + apowm[3], apowm[0] + apowm[1] + apowm[2], apowm[0] + apowm[1], apowm[0]) + vsum_last;
  46.   vsum_last += _mm256_set1_pd(apowm[0] + apowm[1] + apowm[2] + apowm[3]);
  47.   return r;
  48. }
  49.  
  50.  
  51. typedef struct {double_vec4_t _v0, _v1, _v2, _v3, _v4, _v5, _v6, _v7, _v8, _v9, _v10, _v11, _v12, _v13, _v14, _v15;} way_t;
  52.  
  53. typedef union {
  54.   way_t v;
  55.   double_vec4_t _m[16];
  56. } wayu_t;
  57.  
  58. typedef struct {
  59.   wayu_t last_apown;
  60.   double_vec4_t summ_prev_apown;
  61. } ctx_t;
  62.  
  63. wayu_t set_way1(double x) {
  64.   return wayu_t {{
  65.       _mm256_set1_pd(x), _mm256_set1_pd(x), _mm256_set1_pd(x), _mm256_set1_pd(x),
  66.       _mm256_set1_pd(x), _mm256_set1_pd(x), _mm256_set1_pd(x), _mm256_set1_pd(x),
  67.       _mm256_set1_pd(x), _mm256_set1_pd(x), _mm256_set1_pd(x), _mm256_set1_pd(x),
  68.       _mm256_set1_pd(x), _mm256_set1_pd(x), _mm256_set1_pd(x), _mm256_set1_pd(x)
  69.     }
  70.   };
  71. }
  72.  
  73. wayu_t vsum_way16(ctx_t & ctx) {
  74.   uint64_t i = 0;
  75.   wayu_t ret;
  76.   auto last_apown = ctx.last_apown._m, summ_last_apown = ret._m;
  77.   do { summ_last_apown[i] = vsumm(last_apown[i], ctx.summ_prev_apown); ++i; } while(i != 16);
  78.  
  79.   return ret;
  80. }
  81.  
  82.  
  83.  
  84. wayu_t mul_way16(wayu_t a, wayu_t b) {
  85.   wayu_t ret;
  86.   uint64_t i = 0;
  87.   do { ret._m[i] = a._m[i] * b._m[i]; ++i; } while(i != 16);
  88.  
  89.   return ret;
  90. }
  91.  
  92. wayu_t add_way16(wayu_t a, wayu_t b) {
  93.   wayu_t ret;
  94.   uint64_t i = 0;
  95.   do { ret._m[i] = a._m[i] + b._m[i]; ++i; } while(i != 16);
  96.  
  97.   return ret;
  98. }
  99.  
  100. wayu_t mod_139968_way16(wayu_t x) {
  101.   wayu_t ret;
  102.   uint64_t i = 0;
  103.   do { ret._m[i] = vmod_139968(x._m[i]); ++i; } while(i != 16);
  104.  
  105.   return ret;
  106. }
  107.  
  108. wayu_t a_pow_64_coef = set_way1(35521);//3877^64 mod 139968
  109.  
  110. way_t last_apown = {
  111.   _mm256_set_pd(10333, 54553, 3877, 1),  _mm256_set_pd(135565, 115273, 45013, 30193),
  112.   _mm256_set_pd(29821, 133369, 128197, 5665),    _mm256_set_pd(111277, 70825, 116917, 2449),
  113.   _mm256_set_pd(134557, 128089, 82021, 39553),    _mm256_set_pd(108301, 75337, 6229, 16753),
  114.   _mm256_set_pd(139645, 30073, 95173, 118945),    _mm256_set_pd(45421, 21673, 15349, 7441),
  115.   _mm256_set_pd(129757, 22489, 138277, 17473),    _mm256_set_pd(48781, 25609, 31957, 22897),
  116.   _mm256_set_pd(101437, 29305, 78277, 27169),    _mm256_set_pd(47533, 68137, 57781, 101137),
  117.   _mm256_set_pd(71965, 10777, 20581, 87553),    _mm256_set_pd(115981, 104329, 84181, 52081),
  118.   _mm256_set_pd(94909, 25657, 137989, 81121),    _mm256_set_pd(22573, 78889, 14389, 126289),
  119. };
  120.  
  121.  
  122.  
  123. __attribute__((always_inline)) wayu_t gen_vrandom_way16(ctx_t & ctx) {
  124.   wayu_t IC = set_way1(29573), seed = set_way1(42);
  125.   wayu_t apowm_sum = vsum_way16(ctx);
  126.   auto last = mul_way16(ctx.last_apown, set_way1(3877 * 42));
  127.   last = add_way16(last, mul_way16(apowm_sum, IC));
  128.   last = mod_139968_way16(last);
  129.   ctx.last_apown = mod_139968_way16(mul_way16(ctx.last_apown, a_pow_64_coef));
  130.   ctx.summ_prev_apown = vmod_139968(ctx.summ_prev_apown);
  131.   return last;
  132. }
  133.  
  134.  
  135. template<typename F> auto bench(F f, test_vec_t & v, uint64_t n) {
  136.   v.resize(n), v.reserve(n); std::fill(v.begin(), v.end(), 0);
  137.   auto start_time = std::chrono::system_clock::now();
  138.   f(v);
  139.   auto time = std::chrono::duration<double>(std::chrono::system_clock::now() - start_time).count();
  140.   fprintf(stderr, "(%.3fs)%.2lftpc\n", time, (3700000000 * time) / n);
  141. }
  142.  
  143. void diff(test_vec_t & proturbo, test_vec_t & def) {
  144.   uint64_t i = 0;
  145.  
  146.   do {
  147.     if(proturbo[i] != def[i]) {
  148.       fprintf(stderr, "error(%lu): proturbo(%u) -> def(%u)\n", i, proturbo[i], def[i]);
  149.       return;
  150.     }
  151.   } while(++i != proturbo.size());
  152. }
  153.  
  154.  
  155.  
  156. void gen_proturbo_way16_worker(uint32_t * p, uint64_t start, uint64_t len, double pown_coef, double sumpown_coef) {
  157.   __m128i * it = (__m128i *)(p + start), * ali_end = (__m128i *)((p + start + (len & ~0xful)));
  158.   ctx_t ctx = { mod_139968_way16(mul_way16(set_way1(pown_coef), wayu_t{last_apown})), _mm256_set1_pd(sumpown_coef)};
  159.  
  160.   do {
  161.     uint64_t i = 0;
  162.     wayu_t r = gen_vrandom_way16(ctx);
  163.  
  164.     do {
  165. //       _mm_stream_si128((__m128i *)(p + start), _mm256_cvtpd_epi32(r._m[i])); ++it;//
  166.       _mm_stream_si128(it, _mm256_cvtpd_epi32(r._m[i])); ++it;
  167.     } while(++i != 16);
  168.   } while(it < ali_end);
  169. }
  170.  
  171. double mod_139968(double n) {
  172.   return n + (floor(n * 1. / 139968) * -139968);
  173. }
  174.  
  175. unsigned mod_pow(unsigned num, unsigned pow, unsigned mod) {
  176.   unsigned long long test;
  177.   unsigned long long n = num;
  178.  
  179.   for(test = 1; pow; pow >>= 1) {
  180.     if(pow & 1)
  181.       test = ((test % mod) * (n % mod)) % mod;
  182.  
  183.     n = ((n % mod) * (n % mod)) % mod;
  184.   }
  185.  
  186.   return test; /* note this is potentially lossy */
  187. }
  188.  
  189. uint32_t calc_sumpown_new(uint64_t n) {
  190.   return (mod_pow(3877, n, 139968 * 3876) - 1) / 3876;
  191. }
  192.  
  193. void gen_proturbo_way16_mt(test_vec_t & v) {
  194.   uint64_t len = v.size() / 4;
  195.   std::thread a(gen_proturbo_way16_worker, v.data(), 0, len, mod_pow(3877, len * 0, 139968), calc_sumpown_new(len * 0));
  196.   std::thread b(gen_proturbo_way16_worker, v.data(), len, len, mod_pow(3877, len * 1, 139968), calc_sumpown_new(len * 1));
  197.   std::thread c(gen_proturbo_way16_worker, v.data(), len * 2, len, mod_pow(3877, len * 2, 139968), calc_sumpown_new(len * 2));
  198.   std::thread d(gen_proturbo_way16_worker, v.data(), len * 3, len, mod_pow(3877, len * 3, 139968), calc_sumpown_new(len * 3));
  199.   a.join(); b.join();
  200.   c.join(); d.join();
  201. }
  202.  
  203.  
  204. int main(void) {
  205.   uint64_t n = 250000000;//62500000
  206.   test_vec_t proturbo, def, proturbo_way16;
  207.   bench(gen_def, def, n);
  208.   bench(gen_proturbo_way16_mt, proturbo, n);
  209.   diff(proturbo, def);
  210. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement