Advertisement
Guest User

Untitled

a guest
Apr 23rd, 2019
86
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.60 KB | None | 0 0
  1. #include <functional>
  2. #include <utility>
  3. #include <algorithm>
  4. #define NOMINMAX
  5. #include <Windows.h>
  6. #include <intrin.h>
  7. #include <cstdlib>
  8.  
  9. const double clock_cycle = 3.4 * 1000 * 1000 * 1000; // Core i7-4770 @3.4GHz
  10.  
  11. class timecounter
  12. {
  13. private:
  14. LARGE_INTEGER liStart, liStop;
  15. static double freq;
  16. public:
  17. void start() { QueryPerformanceCounter(&liStart); }
  18. void stop() { QueryPerformanceCounter(&liStop); }
  19. double get() { return (double)(liStop.QuadPart - liStart.QuadPart) / freq; }
  20. };
  21. double timecounter::freq = []() {
  22. LARGE_INTEGER li;
  23. QueryPerformanceFrequency(&li);
  24. return (double)li.QuadPart;
  25. }();
  26.  
  27. #define FUNCNAME(fn) { #fn, fn }
  28.  
  29. size_t reference_implementation(const char* mem, size_t sz)
  30. {
  31. size_t ret = 0;
  32. for (size_t i = 0; i < sz; ++i)
  33. if ((mem[i] & 0x80) == 0x00 || (mem[i] & 0xc0) == 0xC0)
  34. ++ret;
  35. return ret;
  36. }
  37.  
  38. inline int32_t avx2_horizontal_sum_epi8(__m256i x)
  39. {
  40. __m256i sumhi = _mm256_unpackhi_epi8(x, _mm256_setzero_si256());
  41. __m256i sumlo = _mm256_unpacklo_epi8(x, _mm256_setzero_si256());
  42. __m256i sum16x16 = _mm256_add_epi16(sumhi, sumlo);
  43. __m256i sum16x8 = _mm256_add_epi16(sum16x16, _mm256_permute2x128_si256(sum16x16, sum16x16, 1));
  44. __m256i sum16x4 = _mm256_add_epi16(sum16x8, _mm256_shuffle_epi32(sum16x8, _MM_SHUFFLE(0, 0, 2, 3)));
  45. uint64_t tmp = _mm256_extract_epi64(sum16x4, 0);
  46. int32_t result = 0;
  47. result += (tmp >> 0) & 0xffff;
  48. result += (tmp >> 16) & 0xffff;
  49. result += (tmp >> 32) & 0xffff;
  50. result += (tmp >> 48) & 0xffff;
  51. return result;
  52. }
  53.  
  54. size_t avx_count_utf8_codepoint(const char *p, size_t sz)
  55. {
  56. size_t result = 0;
  57. for (size_t i = 0; i + 31 < sz;) {
  58. __m256i sum = _mm256_setzero_si256();
  59. size_t j = 0;
  60. for (; j < 255 * 32 && (i + 31) + j < sz; j += 32) {
  61. const __m256i table = _mm256_setr_epi8(
  62. 1, 1, 1, 1, 1, 1, 1, 1, // .. 0x7
  63. 0, 0, 0, 0, // 0x8 .. 0xB
  64. 1, 1, 1, 1, // 0xC .. 0xF
  65. 1, 1, 1, 1, 1, 1, 1, 1, // .. 0x7
  66. 0, 0, 0, 0, // 0x8 .. 0xB
  67. 1, 1, 1, 1 // 0xC .. 0xF
  68. );
  69. __m256i s = _mm256_load_si256(reinterpret_cast<const __m256i *>(p + i + j));
  70. s = _mm256_and_si256(_mm256_srli_epi16(s, 4), _mm256_set1_epi8(0x0F));
  71. s = _mm256_shuffle_epi8(table, s);
  72. sum = _mm256_add_epi8(sum, s);
  73. }
  74. i += j;
  75. result += avx2_horizontal_sum_epi8(sum);
  76. }
  77. return result;
  78. }
  79.  
  80. size_t avx_count_utf8_codepoint_loopend(const char *p, size_t sz)
  81. {
  82. size_t result = 0;
  83. for (size_t i = 0; i < sz;) {
  84. __m256i sum = _mm256_setzero_si256();
  85. size_t j = 0;
  86. size_t limit = std::min<size_t>(255 * 32, sz - i);
  87. for (; j < limit; j += 32) {
  88. const __m256i table = _mm256_setr_epi8(
  89. 1, 1, 1, 1, 1, 1, 1, 1, // .. 0x7
  90. 0, 0, 0, 0, // 0x8 .. 0xB
  91. 1, 1, 1, 1, // 0xC .. 0xF
  92. 1, 1, 1, 1, 1, 1, 1, 1, // .. 0x7
  93. 0, 0, 0, 0, // 0x8 .. 0xB
  94. 1, 1, 1, 1 // 0xC .. 0xF
  95. );
  96. __m256i s = _mm256_load_si256(reinterpret_cast<const __m256i *>(p + i + j));
  97. s = _mm256_and_si256(_mm256_srli_epi16(s, 4), _mm256_set1_epi8(0x0F));
  98. s = _mm256_shuffle_epi8(table, s);
  99. sum = _mm256_add_epi8(sum, s);
  100. }
  101. i += j;
  102. result += avx2_horizontal_sum_epi8(sum);
  103. }
  104. return result;
  105. }
  106.  
  107. size_t opt_innermost_content(const char *p, size_t sz)
  108. {
  109. size_t result = 0;
  110. for (size_t i = 0; i + 31 < sz;) {
  111. __m256i sum = _mm256_setzero_si256();
  112. size_t j = 0;
  113. for (; j < 255 * 32 && (i + 31) + j < sz; j += 32) {
  114. __m256i s = _mm256_load_si256(reinterpret_cast<const __m256i *>(p + i + j));
  115. sum = _mm256_sub_epi8(sum, _mm256_cmpgt_epi8(s, _mm256_set1_epi8(-0x41)));
  116. }
  117. i += j;
  118. result += avx2_horizontal_sum_epi8(sum);
  119. }
  120. return result;
  121. }
  122.  
  123. size_t opt_innermost_content_loopend(const char *p, size_t sz)
  124. {
  125. size_t result = 0;
  126. for (size_t i = 0; i < sz;) {
  127. __m256i sum = _mm256_setzero_si256();
  128. size_t j = 0;
  129. size_t limit = std::min<size_t>(255 * 32, sz - i);
  130. for (; j < limit; j += 32) {
  131. __m256i s = _mm256_load_si256(reinterpret_cast<const __m256i *>(p + i + j));
  132. sum = _mm256_sub_epi8(sum, _mm256_cmpgt_epi8(s, _mm256_set1_epi8(-0x41)));
  133. }
  134. i += j;
  135. result += avx2_horizontal_sum_epi8(sum);
  136. }
  137. return result;
  138. }
  139.  
  140. const std::pair<size_t, int> sizereps[] = {
  141. #ifndef _DEBUG
  142. // { 32 * 255 * 2, 10000000 },
  143. { 32 * 256 * 2, 10000000 },
  144. // { 32 * 255 * 28, 1000000 },
  145. { 32 * 256 * 28, 1000000 },
  146. // { 32 * 255 * 768, 40000 },
  147. { 32 * 256 * 768, 40000 },
  148. // { 32 * 255 * 16384, 1000 },
  149. { 32 * 256 * 16384, 1000 },
  150. #else
  151. { 32 * 255 * 2, 1 },
  152. #endif
  153. };
  154.  
  155. /* const をつけると clang の最適化がすごい ※1 */ std::pair<const char*, size_t(*)(const char*, size_t)> funcs[] = {
  156. // FUNCNAME(reference_implementation),
  157. FUNCNAME(avx_count_utf8_codepoint),
  158. FUNCNAME(avx_count_utf8_codepoint_loopend),
  159. FUNCNAME(opt_innermost_content),
  160. FUNCNAME(opt_innermost_content_loopend),
  161. };
  162.  
  163. int main()
  164. {
  165. for (const auto [sz, rep] : sizereps)
  166. {
  167. printf("size %zu\n", sz);
  168. char* mem = (char*)_aligned_malloc(sz, 32);
  169. for (size_t i = 0; i < sz; ++i)
  170. mem[i] = rand();
  171.  
  172. const size_t ncodepoint = reference_implementation(mem, sz);
  173.  
  174. for (const auto[name, fn] : funcs)
  175. {
  176. timecounter tc;
  177. size_t r;
  178.  
  179. printf(" %-32s: ", name);
  180. tc.start();
  181. for (int i = 0; i < rep; ++i)
  182. r = fn(mem, sz); // ※1 のところで const を付けると clang がここの呼び出しが純粋関数になることを利用してループせずに1回だけの実行になってしまう
  183. tc.stop();
  184. if (r != ncodepoint)
  185. {
  186. printf("wrong answer!\n");
  187. return 1;
  188. }
  189. printf("%10.3fus %5.1fGB/s %5.2fclk/32B\n",
  190. tc.get() / rep * (1000 * 1000),
  191. ((double)sz) * rep / tc.get() / (1000 * 1000 * 1000),
  192. tc.get() * clock_cycle / sz / rep * 32);
  193. }
  194.  
  195. _aligned_free(mem);
  196. }
  197. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement