Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdio.h>
- #include <time.h>
- #include <nmmintrin.h>
- #include <stdint.h>
- #include <stdlib.h>
- #include <string.h>
- #include <intrin.h>
- #ifdef _MSC_VER
- #define ALIGN(n) __declspec(align(n))
- int __builtin_ctz (unsigned int x) {
- unsigned long res;
- _BitScanForward(&res, x);
- return res;
- }
- #else
- #define ALIGN(n) __attribute__((aligned(n)))
- #endif
- inline int min(int a, int b) { return a < b ? a : b; }
- inline int max(int a, int b) { return a > b ? a : b; }
- //OP's original code
- int CommonAsciiLength_original(int len, unsigned char *p) {
- int i = 0;
- while (i < len && p[i] >= 32 && p[i] <= 127)
- i++;
- return i;
- }
- //OP's improved code with 64-bit integers
- int CommonAsciiLength_bitmask(int len, unsigned char *p) {
- int i = 0;
- while (i < len - 8) {
- uint64_t bytes = *(uint64_t *)(p + i);
- uint64_t middleBits = bytes & 0x6060606060606060;
- uint64_t highBits = bytes & 0x8080808080808080;
- middleBits |= (middleBits >> 1);
- middleBits &= ~(highBits >> 2);
- if ((middleBits & 0x2020202020202020) != 0x2020202020202020)
- break;
- i += 8;
- }
- while (i < len && p[i] >= 32 && p[i] <= 127)
- i++;
- return i;
- }
- //SSE2 vectorized answer by Pete
- int CommonAsciiLength_Pete(int len, unsigned char *p) {
- int i = 0;
- __m128i A;
- __m128i B;
- __m128i C;
- __m128i* pu = (__m128i*)p;
- int const len16 = len / 16;
- while (i < len16) {
- A = pu[i];
- B = _mm_slli_epi32(A, 1);
- C = _mm_slli_epi32(A, 2);
- B = _mm_or_si128(B, C);
- A = _mm_andnot_si128(A, B);
- int mask = _mm_movemask_epi8(A);
- if (mask == 0xFFFF) {
- ++i;
- }
- else {
- if (mask == 0) {
- return i * 16;
- }
- break;
- }
- }
- i *= 16;
- while (i < len && p[i] >= 32 && p[i] <= 127) {
- i++;
- }
- return i;
- }
- //SSE2 vectorization by stgatilov: no unrolling, per-byte tail
- int CommonAsciiLength_sse2(int len, unsigned char *p) {
- const __m128i *ptr = (const __m128i *)p;
- int blocks = len >> 4;
- int cnt;
- for (cnt = 0; cnt < blocks; cnt++) {
- __m128i mask = _mm_cmplt_epi8(ptr[cnt], _mm_set1_epi8(32));
- if (_mm_movemask_epi8(mask))
- break;
- }
- int pos;
- for (pos = cnt * 16; pos < len; pos++)
- if (char(p[pos]) < 32)
- return pos;
- return len;
- }
- //SSE2 vectorization by stgatilov: 4x unrolling, per-byte tail
- int CommonAsciiLength_sse2_x4(int len, unsigned char *p) {
- const __m128i *ptr = (const __m128i *)p;
- int blocks = (len >> 6) << 2;
- int cnt;
- for (cnt = 0; cnt < blocks; cnt += 4) {
- __m128i m0 = _mm_cmplt_epi8(ptr[cnt+0], _mm_set1_epi8(32));
- __m128i m1 = _mm_cmplt_epi8(ptr[cnt+1], _mm_set1_epi8(32));
- __m128i m2 = _mm_cmplt_epi8(ptr[cnt+2], _mm_set1_epi8(32));
- __m128i m3 = _mm_cmplt_epi8(ptr[cnt+3], _mm_set1_epi8(32));
- __m128i mask = _mm_or_si128(_mm_or_si128(m0, m1), _mm_or_si128(m2, m3));
- if (_mm_movemask_epi8(mask))
- break;
- }
- int pos;
- for (pos = cnt * 16; pos < len; pos++)
- if (char(p[pos]) < 32)
- return pos;
- return len;
- }
- //SSE4.2 vectorization by stgatilov: using range string operations
- int CommonAsciiLength_sse42(int len, unsigned char *p) {
- const __m128i *ptr = (const __m128i *)p;
- int blocks = (len >> 4);
- __m128i range = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127, 32);
- int cnt;
- for (cnt = 0; cnt < blocks; cnt++) {
- int res = _mm_cmpestri(range, 2, ptr[cnt], 16, _SIDD_UBYTE_OPS | _SIDD_CMP_RANGES | _SIDD_MASKED_NEGATIVE_POLARITY | _SIDD_LEAST_SIGNIFICANT);
- if (res < 16)
- return cnt * 16 + res;
- }
- int pos = 16 * cnt;
- int res = _mm_cmpestri(range, 2, ptr[cnt], len - pos, _SIDD_UBYTE_OPS | _SIDD_CMP_RANGES | _SIDD_MASKED_NEGATIVE_POLARITY | _SIDD_LEAST_SIGNIFICANT);
- res = (res == 16 ? len - pos : res);
- return pos + res;
- }
- //SSE2 vectorization by stgatilov: no unrolling, fast BSF tail
- int CommonAsciiLength_sse2_end(int len, unsigned char *p) {
- const __m128i *ptr = (const __m128i *)p;
- int blocks = len >> 4;
- int cnt;
- for (cnt = 0; cnt < blocks; cnt++) {
- __m128i mask = _mm_cmplt_epi8(ptr[cnt], _mm_set1_epi8(32));
- int val = _mm_movemask_epi8(mask);
- if (val)
- return 16 * cnt + __builtin_ctz(val);
- }
- __m128i mask = _mm_cmplt_epi8(ptr[cnt], _mm_set1_epi8(32));
- int val = _mm_movemask_epi8(mask);
- val |= -(1 << (len - 16 * cnt));
- return 16 * cnt + __builtin_ctz(val);
- }
- //SSE2 vectorization by stgatilov: x4 unrolling, fast SSE2 + BSF tail
- int CommonAsciiLength_sse2_x4_end(int len, unsigned char *p) {
- const __m128i *ptr = (const __m128i *)p;
- int blocks = len >> 4;
- int fastBlocks = (blocks >> 2) << 2;
- int cnt;
- for (cnt = 0; cnt < fastBlocks; cnt += 4) {
- __m128i m0 = _mm_cmplt_epi8(ptr[cnt+0], _mm_set1_epi8(32));
- __m128i m1 = _mm_cmplt_epi8(ptr[cnt+1], _mm_set1_epi8(32));
- __m128i m2 = _mm_cmplt_epi8(ptr[cnt+2], _mm_set1_epi8(32));
- __m128i m3 = _mm_cmplt_epi8(ptr[cnt+3], _mm_set1_epi8(32));
- __m128i mask = _mm_or_si128(_mm_or_si128(m0, m1), _mm_or_si128(m2, m3));
- if (_mm_movemask_epi8(mask))
- break;
- }
- for (; cnt < blocks; cnt++) {
- __m128i mask = _mm_cmplt_epi8(ptr[cnt], _mm_set1_epi8(32));
- int val = _mm_movemask_epi8(mask);
- if (val)
- return 16 * cnt + __builtin_ctz(val);
- }
- __m128i mask = _mm_cmplt_epi8(ptr[cnt], _mm_set1_epi8(32));
- int val = _mm_movemask_epi8(mask);
- val |= -(1 << (len - 16 * cnt));
- return 16 * cnt + __builtin_ctz(val);
- }
- //========================================== TESTING =================================================
- static const int SIZE = 1<<20;
- unsigned char data[SIZE]; //number of chars in data buffer
- static const int CNT = 1<<16; //number of queries generated (all in one buffer)
- unsigned char *queryPtr[CNT];
- int queryLen[CNT];
- static const int64_t WORK = 1LL<<32; //total amount of work is proportional to this
- static const int AVGLEN = 100; //with probability (1 - 1/A) ASCII character is generated
- int main() {
- for (int i = 0; i < SIZE; i++) {
- if (rand() % AVGLEN == 0)
- data[i] = rand() % 256;
- else
- data[i] = 32 + rand() % (128-32);
- }
- double avgLen = 0.0;
- for (int i = 0; i < CNT; i++) {
- int start = rand() % (SIZE/16) * 16;
- int len = rand() % min(SIZE - start + 1, 1<<12);
- queryPtr[i] = data + start;
- queryLen[i] = len;
- #define VERSIONS 8
- int res[VERSIONS];
- res[0] = CommonAsciiLength_original(queryLen[i], queryPtr[i]);
- res[1] = CommonAsciiLength_bitmask(queryLen[i], queryPtr[i]);
- res[2] = CommonAsciiLength_Pete(queryLen[i], queryPtr[i]);
- res[3] = CommonAsciiLength_sse2(queryLen[i], queryPtr[i]);
- res[4] = CommonAsciiLength_sse2_x4(queryLen[i], queryPtr[i]);
- res[5] = CommonAsciiLength_sse42(queryLen[i], queryPtr[i]);
- res[6] = CommonAsciiLength_sse2_end(queryLen[i], queryPtr[i]);
- res[7] = CommonAsciiLength_sse2_x4_end(queryLen[i], queryPtr[i]);
- avgLen += res[0];
- bool same = true;
- for (int i = 0; i+1 < VERSIONS; i++)
- if (res[i] != res[i+1])
- same = false;
- if (!same)
- printf("Error: %d %d %d %d %d %d %d %d\n", res[0], res[1], res[2], res[3], res[4], res[5], res[6], res[7]);
- }
- avgLen /= CNT;
- printf("All checked.\n");
- printf("Average answer = %0.1lf\n", avgLen);
- int CALLS = WORK / max(AVGLEN, 16);
- {
- int start = clock();
- int sum = 0;
- for (int i = 0; i < CALLS; i++) {
- int k = i & (CNT-1);
- sum += CommonAsciiLength_original(queryLen[k], queryPtr[k]);
- }
- printf("Time = %0.3lf (%d) original\n", double(clock() - start) / CLOCKS_PER_SEC, sum);
- }
- {
- int start = clock();
- int sum = 0;
- for (int i = 0; i < CALLS; i++) {
- int k = i & (CNT-1);
- sum += CommonAsciiLength_bitmask(queryLen[k], queryPtr[k]);
- }
- printf("Time = %0.3lf (%d) bitmask\n", double(clock() - start) / CLOCKS_PER_SEC, sum);
- }
- {
- int start = clock();
- int sum = 0;
- for (int i = 0; i < CALLS; i++) {
- int k = i & (CNT-1);
- sum += CommonAsciiLength_Pete(queryLen[k], queryPtr[k]);
- }
- printf("Time = %0.3lf (%d) Pete\n", double(clock() - start) / CLOCKS_PER_SEC, sum);
- }
- {
- int start = clock();
- int sum = 0;
- for (int i = 0; i < CALLS; i++) {
- int k = i & (CNT-1);
- sum += CommonAsciiLength_sse2(queryLen[k], queryPtr[k]);
- }
- printf("Time = %0.3lf (%d) sse2\n", double(clock() - start) / CLOCKS_PER_SEC, sum);
- }
- {
- int start = clock();
- int sum = 0;
- for (int i = 0; i < CALLS; i++) {
- int k = i & (CNT-1);
- sum += CommonAsciiLength_sse2_x4(queryLen[k], queryPtr[k]);
- }
- printf("Time = %0.3lf (%d) sse2_x4\n", double(clock() - start) / CLOCKS_PER_SEC, sum);
- }
- {
- int start = clock();
- int sum = 0;
- for (int i = 0; i < CALLS; i++) {
- int k = i & (CNT-1);
- sum += CommonAsciiLength_sse42(queryLen[k], queryPtr[k]);
- }
- printf("Time = %0.3lf (%d) sse42\n", double(clock() - start) / CLOCKS_PER_SEC, sum);
- }
- {
- int start = clock();
- int sum = 0;
- for (int i = 0; i < CALLS; i++) {
- int k = i & (CNT-1);
- sum += CommonAsciiLength_sse2_end(queryLen[k], queryPtr[k]);
- }
- printf("Time = %0.3lf (%d) sse2_end\n", double(clock() - start) / CLOCKS_PER_SEC, sum);
- }
- {
- int start = clock();
- int sum = 0;
- for (int i = 0; i < CALLS; i++) {
- int k = i & (CNT-1);
- sum += CommonAsciiLength_sse2_x4_end(queryLen[k], queryPtr[k]);
- }
- printf("Time = %0.3lf (%d) sse2_x4_end\n", double(clock() - start) / CLOCKS_PER_SEC, sum);
- }
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement