Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#include <immintrin.h>
#include <omp.h>
#include <sys/mman.h>

#include <algorithm>
#include <array>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <functional>
#include <iostream>
#include <new>
#include <numeric>
#include <thread>
#include <vector>

using std::bind;
using namespace std::placeholders;
using namespace std;
- template <class T> struct SimpleAllocator {
- typedef T value_type;
- T * allocate(std::size_t n) {
- return (T *)mmap(NULL, n * sizeof(T), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_POPULATE | MAP_ANONYMOUS, 0, 0);
- }
- void deallocate(T * p, std::size_t n) { munmap(p, n * sizeof(T));}
- };
- typedef std::vector<uint32_t, SimpleAllocator<uint32_t>> test_vec_t;
- uint32_t gen_random(void) {
- const int IM = 139968, IA = 3877, IC = 29573;
- static int last = 42;
- last = (last * IA + IC) % IM;
- return last;
- }
- auto gen_def(test_vec_t & v) {
- std::generate(v.begin(), v.end(), gen_random);
- }
- typedef __v4df double_vec4_t;
- double_vec4_t vmod_139968(double_vec4_t n) {
- return n + (_mm256_floor_pd(n * _mm256_set1_pd(1. / 139968)) * _mm256_set1_pd(-139968));
- }
- double_vec4_t vsumm(double_vec4_t apowm, double_vec4_t & vsum_last) {
- double_vec4_t r = _mm256_set_pd(apowm[0] + apowm[1] + apowm[2] + apowm[3], apowm[0] + apowm[1] + apowm[2], apowm[0] + apowm[1], apowm[0]) + vsum_last;
- vsum_last += _mm256_set1_pd(apowm[0] + apowm[1] + apowm[2] + apowm[3]);
- return r;
- }
- typedef struct {double_vec4_t _v0, _v1, _v2, _v3, _v4, _v5, _v6, _v7, _v8, _v9, _v10, _v11, _v12, _v13, _v14, _v15;} way_t;
- typedef union {
- way_t v;
- double_vec4_t _m[16];
- } wayu_t;
- typedef struct {
- wayu_t last_apown;
- double_vec4_t summ_prev_apown;
- } ctx_t;
- wayu_t set_way1(double x) {
- return wayu_t {{
- _mm256_set1_pd(x), _mm256_set1_pd(x), _mm256_set1_pd(x), _mm256_set1_pd(x),
- _mm256_set1_pd(x), _mm256_set1_pd(x), _mm256_set1_pd(x), _mm256_set1_pd(x),
- _mm256_set1_pd(x), _mm256_set1_pd(x), _mm256_set1_pd(x), _mm256_set1_pd(x),
- _mm256_set1_pd(x), _mm256_set1_pd(x), _mm256_set1_pd(x), _mm256_set1_pd(x)
- }
- };
- }
- wayu_t vsum_way16(ctx_t & ctx) {
- uint64_t i = 0;
- wayu_t ret;
- auto last_apown = ctx.last_apown._m, summ_last_apown = ret._m;
- do { summ_last_apown[i] = vsumm(last_apown[i], ctx.summ_prev_apown); ++i; } while(i != 16);
- return ret;
- }
- wayu_t mul_way16(wayu_t a, wayu_t b) {
- wayu_t ret;
- uint64_t i = 0;
- do { ret._m[i] = a._m[i] * b._m[i]; ++i; } while(i != 16);
- return ret;
- }
- wayu_t add_way16(wayu_t a, wayu_t b) {
- wayu_t ret;
- uint64_t i = 0;
- do { ret._m[i] = a._m[i] + b._m[i]; ++i; } while(i != 16);
- return ret;
- }
- wayu_t mod_139968_way16(wayu_t x) {
- wayu_t ret;
- uint64_t i = 0;
- do { ret._m[i] = vmod_139968(x._m[i]); ++i; } while(i != 16);
- return ret;
- }
- wayu_t a_pow_64_coef = set_way1(35521);//3877^64 mod 139968
- way_t last_apown = {
- _mm256_set_pd(10333, 54553, 3877, 1), _mm256_set_pd(135565, 115273, 45013, 30193),
- _mm256_set_pd(29821, 133369, 128197, 5665), _mm256_set_pd(111277, 70825, 116917, 2449),
- _mm256_set_pd(134557, 128089, 82021, 39553), _mm256_set_pd(108301, 75337, 6229, 16753),
- _mm256_set_pd(139645, 30073, 95173, 118945), _mm256_set_pd(45421, 21673, 15349, 7441),
- _mm256_set_pd(129757, 22489, 138277, 17473), _mm256_set_pd(48781, 25609, 31957, 22897),
- _mm256_set_pd(101437, 29305, 78277, 27169), _mm256_set_pd(47533, 68137, 57781, 101137),
- _mm256_set_pd(71965, 10777, 20581, 87553), _mm256_set_pd(115981, 104329, 84181, 52081),
- _mm256_set_pd(94909, 25657, 137989, 81121), _mm256_set_pd(22573, 78889, 14389, 126289),
- };
- __attribute__((always_inline)) wayu_t gen_vrandom_way16(ctx_t & ctx) {
- wayu_t IC = set_way1(29573), seed = set_way1(42);
- wayu_t apowm_sum = vsum_way16(ctx);
- auto last = mul_way16(ctx.last_apown, set_way1(3877 * 42));
- last = add_way16(last, mul_way16(apowm_sum, IC));
- last = mod_139968_way16(last);
- ctx.last_apown = mod_139968_way16(mul_way16(ctx.last_apown, a_pow_64_coef));
- ctx.summ_prev_apown = vmod_139968(ctx.summ_prev_apown);
- return last;
- }
- template<typename F> auto bench(F f, test_vec_t & v, uint64_t n) {
- v.resize(n), v.reserve(n); std::fill(v.begin(), v.end(), 0);
- auto start_time = std::chrono::system_clock::now();
- f(v);
- auto time = std::chrono::duration<double>(std::chrono::system_clock::now() - start_time).count();
- fprintf(stderr, "(%.3fs)%.2lftpc\n", time, (3700000000 * time) / n);
- }
- void diff(test_vec_t & proturbo, test_vec_t & def) {
- uint64_t i = 0;
- do {
- if(proturbo[i] != def[i]) {
- fprintf(stderr, "error(%lu): proturbo(%u) -> def(%u)\n", i, proturbo[i], def[i]);
- return;
- }
- } while(++i != proturbo.size());
- }
- void gen_proturbo_way16_worker(uint32_t * p, uint64_t start, uint64_t len, double pown_coef, double sumpown_coef) {
- __m128i * it = (__m128i *)(p + start), * ali_end = (__m128i *)((p + start + (len & ~0xful)));
- ctx_t ctx = { mod_139968_way16(mul_way16(set_way1(pown_coef), wayu_t{last_apown})), _mm256_set1_pd(sumpown_coef)};
- do {
- uint64_t i = 0;
- wayu_t r = gen_vrandom_way16(ctx);
- do {
- // _mm_stream_si128((__m128i *)(p + start), _mm256_cvtpd_epi32(r._m[i])); ++it;//
- _mm_stream_si128(it, _mm256_cvtpd_epi32(r._m[i])); ++it;
- } while(++i != 16);
- } while(it < ali_end);
- }
- double mod_139968(double n) {
- return n + (floor(n * 1. / 139968) * -139968);
- }
- unsigned mod_pow(unsigned num, unsigned pow, unsigned mod) {
- unsigned long long test;
- unsigned long long n = num;
- for(test = 1; pow; pow >>= 1) {
- if(pow & 1)
- test = ((test % mod) * (n % mod)) % mod;
- n = ((n % mod) * (n % mod)) % mod;
- }
- return test; /* note this is potentially lossy */
- }
- uint32_t calc_sumpown_new(uint64_t n) {
- return (mod_pow(3877, n, 139968 * 3876) - 1) / 3876;
- }
- void gen_proturbo_way16_mt(test_vec_t & v) {
- uint64_t len = v.size() / 4;
- std::thread a(gen_proturbo_way16_worker, v.data(), 0, len, mod_pow(3877, len * 0, 139968), calc_sumpown_new(len * 0));
- std::thread b(gen_proturbo_way16_worker, v.data(), len, len, mod_pow(3877, len * 1, 139968), calc_sumpown_new(len * 1));
- std::thread c(gen_proturbo_way16_worker, v.data(), len * 2, len, mod_pow(3877, len * 2, 139968), calc_sumpown_new(len * 2));
- std::thread d(gen_proturbo_way16_worker, v.data(), len * 3, len, mod_pow(3877, len * 3, 139968), calc_sumpown_new(len * 3));
- a.join(); b.join();
- c.join(); d.join();
- }
- int main(void) {
- uint64_t n = 250000000;//62500000
- test_vec_t proturbo, def, proturbo_way16;
- bench(gen_def, def, n);
- bench(gen_proturbo_way16_mt, proturbo, n);
- diff(proturbo, def);
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement