Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <immintrin.h>
- #include <cstdint>
- #include <string_view>
- #include <cstdio>
- using namespace std::literals;
- #define NOINLINE __attribute__((__noinline__))
/// Shifts all 32 bytes of `A` towards higher byte indices by `N` bytes,
/// filling the vacated low bytes with zero (a byte-granular logical left
/// shift of the full 256-bit value, carrying across the 128-bit lane boundary).
///
/// @param A  Vector to shift.
/// @param N  Shift distance in bytes. 0 returns `A` unchanged; any value
///           >= 32 returns an all-zero vector.
/// @return   The shifted vector.
///
/// The byte-shift intrinsics only accept compile-time immediates and operate
/// per 128-bit lane, so a run-time `N` is dispatched through a switch with
/// one case per distance.
///
/// The function carries its own `target("avx2")` attribute; the executing
/// CPU must support AVX2.
__attribute__((target("avx2")))
inline static __m256i _mm256_shift_left(__m256i A, const uint32_t N) {
    // permute2x128 control 0x08: bit 3 zeroes the low lane, bits [5:4] = 0
    // copy A's low lane into the high lane  ->  [ high = A.lo | low = 0 ].
    // This is exactly the data that has to be carried into each lane from below.
    const __m256i carry = _mm256_permute2x128_si256(A, A, 0x08);

    switch (N) {
    case 0:
        return A;

    // 1..15: per lane, take (A.lane : carry.lane) and extract 16 bytes starting
    // at offset 16-K, i.e. K bytes of the lower neighbour followed by the
    // lane's own low 16-K bytes.
#define SHL_BYTES_LT16(K) \
    case K:               \
        return _mm256_alignr_epi8(A, carry, 16 - (K));
        SHL_BYTES_LT16(1)
        SHL_BYTES_LT16(2)
        SHL_BYTES_LT16(3)
        SHL_BYTES_LT16(4)
        SHL_BYTES_LT16(5)
        SHL_BYTES_LT16(6)
        SHL_BYTES_LT16(7)
        SHL_BYTES_LT16(8)
        SHL_BYTES_LT16(9)
        SHL_BYTES_LT16(10)
        SHL_BYTES_LT16(11)
        SHL_BYTES_LT16(12)
        SHL_BYTES_LT16(13)
        SHL_BYTES_LT16(14)
        SHL_BYTES_LT16(15)
#undef SHL_BYTES_LT16

    case 16:
        // Exactly one lane: the low lane moves up, the low lane becomes zero.
        return carry;

    // 17..31: the low lane is already zero in `carry`; shift the (former low)
    // lane that now sits in the high half by the remaining K-16 bytes.
#define SHL_BYTES_GT16(K) \
    case K:               \
        return _mm256_slli_si256(carry, (K) - 16);
        SHL_BYTES_GT16(17)
        SHL_BYTES_GT16(18)
        SHL_BYTES_GT16(19)
        SHL_BYTES_GT16(20)
        SHL_BYTES_GT16(21)
        SHL_BYTES_GT16(22)
        SHL_BYTES_GT16(23)
        SHL_BYTES_GT16(24)
        SHL_BYTES_GT16(25)
        SHL_BYTES_GT16(26)
        SHL_BYTES_GT16(27)
        SHL_BYTES_GT16(28)
        SHL_BYTES_GT16(29)
        SHL_BYTES_GT16(30)
        SHL_BYTES_GT16(31)
#undef SHL_BYTES_GT16

    default:
        // N >= 32: every byte has been shifted out.
        return _mm256_setzero_si256();
    }
}
/// Shifts all 32 bytes of `A` towards lower byte indices by `N` bytes,
/// filling the vacated high bytes with zero (a byte-granular logical right
/// shift of the full 256-bit value, carrying across the 128-bit lane boundary).
///
/// @param A  Vector to shift.
/// @param N  Shift distance in bytes. 0 returns `A` unchanged; any value
///           >= 32 returns an all-zero vector.
/// @return   The shifted vector.
///
/// The byte-shift intrinsics only accept compile-time immediates and operate
/// per 128-bit lane, so a run-time `N` is dispatched through a switch with
/// one case per distance.
///
/// The function carries its own `target("avx2")` attribute; the executing
/// CPU must support AVX2.
__attribute__((target("avx2")))
inline static __m256i _mm256_shift_right(__m256i A, const uint32_t N) {
    // permute2x128 control 0x81: bits [1:0] = 1 copy A's high lane into the
    // low lane, bit 7 zeroes the high lane  ->  [ high = 0 | low = A.hi ].
    // This is exactly the data that has to be carried into each lane from above.
    const __m256i carry = _mm256_permute2x128_si256(A, A, 0x81);

    switch (N) {
    case 0:
        return A;

    // 1..15: per lane, take (carry.lane : A.lane) and extract 16 bytes starting
    // at offset K, i.e. the lane's own upper 16-K bytes followed by K bytes of
    // the upper neighbour (zero for the high lane).
#define SHR_BYTES_LT16(K) \
    case K:               \
        return _mm256_alignr_epi8(carry, A, (K));
        SHR_BYTES_LT16(1)
        SHR_BYTES_LT16(2)
        SHR_BYTES_LT16(3)
        SHR_BYTES_LT16(4)
        SHR_BYTES_LT16(5)
        SHR_BYTES_LT16(6)
        SHR_BYTES_LT16(7)
        SHR_BYTES_LT16(8)
        SHR_BYTES_LT16(9)
        SHR_BYTES_LT16(10)
        SHR_BYTES_LT16(11)
        SHR_BYTES_LT16(12)
        SHR_BYTES_LT16(13)
        SHR_BYTES_LT16(14)
        SHR_BYTES_LT16(15)
#undef SHR_BYTES_LT16

    case 16:
        // Exactly one lane: the high lane moves down, the high lane becomes zero.
        return carry;

    // 17..31: the high lane is already zero in `carry`; shift the (former high)
    // lane that now sits in the low half by the remaining K-16 bytes.
#define SHR_BYTES_GT16(K) \
    case K:               \
        return _mm256_srli_si256(carry, (K) - 16);
        SHR_BYTES_GT16(17)
        SHR_BYTES_GT16(18)
        SHR_BYTES_GT16(19)
        SHR_BYTES_GT16(20)
        SHR_BYTES_GT16(21)
        SHR_BYTES_GT16(22)
        SHR_BYTES_GT16(23)
        SHR_BYTES_GT16(24)
        SHR_BYTES_GT16(25)
        SHR_BYTES_GT16(26)
        SHR_BYTES_GT16(27)
        SHR_BYTES_GT16(28)
        SHR_BYTES_GT16(29)
        SHR_BYTES_GT16(30)
        SHR_BYTES_GT16(31)
#undef SHR_BYTES_GT16

    default:
        // N >= 32: every byte has been shifted out.
        return _mm256_setzero_si256();
    }
}
/// Counts how many bytes of `sv` are equal to `c`, using AVX2 compares.
///
/// @param sv  Bytes to scan; may be empty and may contain embedded NULs.
/// @param c   Byte value to count (defaults to a space). Any value,
///            including '\0', is counted correctly.
/// @return    Number of positions `i` with `sv[i] == c`.
///
/// Strategy for inputs of at least 32 bytes:
///   1. head  - one unaligned load of the first 32 bytes; only the bytes in
///              front of the next 32-byte boundary are counted;
///   2. body  - aligned loads, two vectors (64 bytes) per iteration, then at
///              most one more single vector;
///   3. tail  - one unaligned load of the *last* 32 bytes of the view; only
///              the bytes not yet counted are kept.
/// Unwanted lanes are discarded by masking the movemask bits rather than by
/// shifting zero bytes into the vector, so counting '\0' does not pick up
/// padding, and no load ever touches memory outside `sv`.
///
/// The function is never inlined and carries its own
/// `target("avx2,popcnt")` attribute; the executing CPU must support AVX2
/// and POPCNT. `_mm_popcnt_u64` additionally requires a 64-bit x86 target.
__attribute__((__noinline__, target("avx2,popcnt")))
uint64_t char_count(std::string_view sv, char c = ' ') {
    constexpr size_t kVecBytes = sizeof(__m256i);

    const char *p = sv.data();
    size_t len = sv.length();  // full width: no truncation for views >= 4 GiB
    uint64_t count = 0;

    // Short input: a plain byte loop, no vector setup.
    if (len < kVecBytes) {
        for (; len != 0; --len) {
            count += (*p++ == c);
        }
        return count;
    }

    const char *const end = p + len;
    const __m256i vc = _mm256_set1_epi8(c);

    // Head: bring `p` up to a 32-byte boundary. The load covers the first 32
    // bytes of the view (in bounds, since len >= 32); only the low `head`
    // match bits belong to bytes in front of the boundary.
    if (const size_t misalign = reinterpret_cast<uintptr_t>(p) % kVecBytes) {
        const size_t head = kVecBytes - misalign;  // 1..31
        const __m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(p));
        const uint32_t hits =
            static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, vc)));
        count += _mm_popcnt_u32(hits & ((uint32_t{1} << head) - 1));
        p += head;
        len -= head;
    }

    // Body: `p` is 32-byte aligned from here on. Two vectors per iteration so
    // that a single 64-bit popcount covers 64 input bytes.
    while (len >= kVecBytes * 2) {
        const __m256i *const vp = reinterpret_cast<const __m256i *>(p);
        const __m256i v0 = _mm256_load_si256(vp);
        const __m256i v1 = _mm256_load_si256(vp + 1);
        const uint64_t hits0 =
            static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v0, vc)));
        const uint64_t hits1 =
            static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, vc)));
        count += _mm_popcnt_u64((hits0 << 32) | hits1);
        p += kVecBytes * 2;
        len -= kVecBytes * 2;
    }

    if (len >= kVecBytes) {
        const __m256i v = _mm256_load_si256(reinterpret_cast<const __m256i *>(p));
        const uint32_t hits =
            static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, vc)));
        count += _mm_popcnt_u32(hits);
        p += kVecBytes;
        len -= kVecBytes;
    }

    // Tail: 1..31 bytes remain. Re-read the final 32 bytes of the view (the
    // view is at least 32 bytes long, so this stays in bounds); its low
    // 32-len bytes were already counted above, so drop their match bits.
    if (len != 0) {
        const __m256i v =
            _mm256_loadu_si256(reinterpret_cast<const __m256i *>(end - kVecBytes));
        const uint32_t hits =
            static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, vc)));
        count += _mm_popcnt_u32(hits >> (kVecBytes - len));
    }

    return count;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement