Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- const pik::SIMD_PART(int64_t, 2) i64x2;
- const pik::SIMD_FULL(int64_t) i64xN;
- for(; i + i64xN.N <= size; i += i64xN.N) {
- int64_t* base = (int64_t*) &aos[i];
- const auto vl1 = pik::load(i64x2, base + 0); // [a1, b1]
- const auto vl2 = pik::load(i64x2, base + 2); // [c1, a2]
- const auto vl3 = pik::load(i64x2, base + 4); // [b2, c2]
- const auto tl1 = pik::concat_hi_lo(vl2, vl1); // [a1, a2]
- const auto tl2 = pik::concat_lo_hi(vl3, vl1); // [b1, b2]
- const auto tl3 = pik::concat_hi_lo(vl3, vl2); // [c1, c2]
- #if SIMD_TARGET_WIDTH == 128
- const auto t1 = tl1;
- const auto t2 = tl1;
- const auto t3 = tl1;
- #elif SIMD_TARGET_WIDTH == 256
- base = (int64_t*) &aos[i + 2];
- const auto vh1 = pik::load(i64x2, base + 0); // [a1, b1]
- const auto vh2 = pik::load(i64x2, base + 2); // [c1, a2]
- const auto vh3 = pik::load(i64x2, base + 4); // [b2, c2]
- const auto th1 = pik::concat_hi_lo(vh2, vh1); // [a1, a2]
- const auto th2 = pik::concat_lo_hi(vh3, vh1); // [b1, b2]
- const auto th3 = pik::concat_hi_lo(vh3, vh2); // [c1, c2]
- auto t1 = pik::combine(i64xN, th1, tl1);
- auto t2 = pik::combine(i64xN, th2, tl2);
- auto t3 = pik::combine(i64xN, th3, tl3);
- #endif
- pik::store(t1, i64xN, &soa.a[i]);
- pik::store(t2, i64xN, &soa.b[i]);
- pik::store(t3, i64xN, &soa.c[i]);
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement