Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <functional>
- #include <utility>
- #include <algorithm>
- #define NOMINMAX
- #include <Windows.h>
- #include <intrin.h>
- #include <cstdlib>
- const double clock_cycle = 3.4 * 1000 * 1000 * 1000; // Core i7-4770 @3.4GHz
- class timecounter
- {
- private:
- LARGE_INTEGER liStart, liStop;
- static double freq;
- public:
- void start() { QueryPerformanceCounter(&liStart); }
- void stop() { QueryPerformanceCounter(&liStop); }
- double get() { return (double)(liStop.QuadPart - liStart.QuadPart) / freq; }
- };
- double timecounter::freq = []() {
- LARGE_INTEGER li;
- QueryPerformanceFrequency(&li);
- return (double)li.QuadPart;
- }();
- #define FUNCNAME(fn) { #fn, fn }
- size_t reference_implementation(const char* mem, size_t sz)
- {
- size_t ret = 0;
- for (size_t i = 0; i < sz; ++i)
- if ((mem[i] & 0x80) == 0x00 || (mem[i] & 0xc0) == 0xC0)
- ++ret;
- return ret;
- }
- inline int32_t avx2_horizontal_sum_epi8(__m256i x)
- {
- __m256i sumhi = _mm256_unpackhi_epi8(x, _mm256_setzero_si256());
- __m256i sumlo = _mm256_unpacklo_epi8(x, _mm256_setzero_si256());
- __m256i sum16x16 = _mm256_add_epi16(sumhi, sumlo);
- __m256i sum16x8 = _mm256_add_epi16(sum16x16, _mm256_permute2x128_si256(sum16x16, sum16x16, 1));
- __m256i sum16x4 = _mm256_add_epi16(sum16x8, _mm256_shuffle_epi32(sum16x8, _MM_SHUFFLE(0, 0, 2, 3)));
- uint64_t tmp = _mm256_extract_epi64(sum16x4, 0);
- int32_t result = 0;
- result += (tmp >> 0) & 0xffff;
- result += (tmp >> 16) & 0xffff;
- result += (tmp >> 32) & 0xffff;
- result += (tmp >> 48) & 0xffff;
- return result;
- }
- size_t avx_count_utf8_codepoint(const char *p, size_t sz)
- {
- size_t result = 0;
- for (size_t i = 0; i + 31 < sz;) {
- __m256i sum = _mm256_setzero_si256();
- size_t j = 0;
- for (; j < 255 * 32 && (i + 31) + j < sz; j += 32) {
- const __m256i table = _mm256_setr_epi8(
- 1, 1, 1, 1, 1, 1, 1, 1, // .. 0x7
- 0, 0, 0, 0, // 0x8 .. 0xB
- 1, 1, 1, 1, // 0xC .. 0xF
- 1, 1, 1, 1, 1, 1, 1, 1, // .. 0x7
- 0, 0, 0, 0, // 0x8 .. 0xB
- 1, 1, 1, 1 // 0xC .. 0xF
- );
- __m256i s = _mm256_load_si256(reinterpret_cast<const __m256i *>(p + i + j));
- s = _mm256_and_si256(_mm256_srli_epi16(s, 4), _mm256_set1_epi8(0x0F));
- s = _mm256_shuffle_epi8(table, s);
- sum = _mm256_add_epi8(sum, s);
- }
- i += j;
- result += avx2_horizontal_sum_epi8(sum);
- }
- return result;
- }
- size_t avx_count_utf8_codepoint_loopend(const char *p, size_t sz)
- {
- size_t result = 0;
- for (size_t i = 0; i < sz;) {
- __m256i sum = _mm256_setzero_si256();
- size_t j = 0;
- size_t limit = std::min<size_t>(255 * 32, sz - i);
- for (; j < limit; j += 32) {
- const __m256i table = _mm256_setr_epi8(
- 1, 1, 1, 1, 1, 1, 1, 1, // .. 0x7
- 0, 0, 0, 0, // 0x8 .. 0xB
- 1, 1, 1, 1, // 0xC .. 0xF
- 1, 1, 1, 1, 1, 1, 1, 1, // .. 0x7
- 0, 0, 0, 0, // 0x8 .. 0xB
- 1, 1, 1, 1 // 0xC .. 0xF
- );
- __m256i s = _mm256_load_si256(reinterpret_cast<const __m256i *>(p + i + j));
- s = _mm256_and_si256(_mm256_srli_epi16(s, 4), _mm256_set1_epi8(0x0F));
- s = _mm256_shuffle_epi8(table, s);
- sum = _mm256_add_epi8(sum, s);
- }
- i += j;
- result += avx2_horizontal_sum_epi8(sum);
- }
- return result;
- }
- size_t opt_innermost_content(const char *p, size_t sz)
- {
- size_t result = 0;
- for (size_t i = 0; i + 31 < sz;) {
- __m256i sum = _mm256_setzero_si256();
- size_t j = 0;
- for (; j < 255 * 32 && (i + 31) + j < sz; j += 32) {
- __m256i s = _mm256_load_si256(reinterpret_cast<const __m256i *>(p + i + j));
- sum = _mm256_sub_epi8(sum, _mm256_cmpgt_epi8(s, _mm256_set1_epi8(-0x41)));
- }
- i += j;
- result += avx2_horizontal_sum_epi8(sum);
- }
- return result;
- }
- size_t opt_innermost_content_loopend(const char *p, size_t sz)
- {
- size_t result = 0;
- for (size_t i = 0; i < sz;) {
- __m256i sum = _mm256_setzero_si256();
- size_t j = 0;
- size_t limit = std::min<size_t>(255 * 32, sz - i);
- for (; j < limit; j += 32) {
- __m256i s = _mm256_load_si256(reinterpret_cast<const __m256i *>(p + i + j));
- sum = _mm256_sub_epi8(sum, _mm256_cmpgt_epi8(s, _mm256_set1_epi8(-0x41)));
- }
- i += j;
- result += avx2_horizontal_sum_epi8(sum);
- }
- return result;
- }
- const std::pair<size_t, int> sizereps[] = {
- #ifndef _DEBUG
- // { 32 * 255 * 2, 10000000 },
- { 32 * 256 * 2, 10000000 },
- // { 32 * 255 * 28, 1000000 },
- { 32 * 256 * 28, 1000000 },
- // { 32 * 255 * 768, 40000 },
- { 32 * 256 * 768, 40000 },
- // { 32 * 255 * 16384, 1000 },
- { 32 * 256 * 16384, 1000 },
- #else
- { 32 * 255 * 2, 1 },
- #endif
- };
- /* const をつけると clang の最適化がすごい ※1 */ std::pair<const char*, size_t(*)(const char*, size_t)> funcs[] = {
- // FUNCNAME(reference_implementation),
- FUNCNAME(avx_count_utf8_codepoint),
- FUNCNAME(avx_count_utf8_codepoint_loopend),
- FUNCNAME(opt_innermost_content),
- FUNCNAME(opt_innermost_content_loopend),
- };
- int main()
- {
- for (const auto [sz, rep] : sizereps)
- {
- printf("size %zu\n", sz);
- char* mem = (char*)_aligned_malloc(sz, 32);
- for (size_t i = 0; i < sz; ++i)
- mem[i] = rand();
- const size_t ncodepoint = reference_implementation(mem, sz);
- for (const auto[name, fn] : funcs)
- {
- timecounter tc;
- size_t r;
- printf(" %-32s: ", name);
- tc.start();
- for (int i = 0; i < rep; ++i)
- r = fn(mem, sz); // ※1 のところで const を付けると clang がここの呼び出しが純粋関数になることを利用してループせずに1回だけの実行になってしまう
- tc.stop();
- if (r != ncodepoint)
- {
- printf("wrong answer!\n");
- return 1;
- }
- printf("%10.3fus %5.1fGB/s %5.2fclk/32B\n",
- tc.get() / rep * (1000 * 1000),
- ((double)sz) * rep / tc.get() / (1000 * 1000 * 1000),
- tc.get() * clock_cycle / sz / rep * 32);
- }
- _aligned_free(mem);
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement