Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdio.h>
- #include <stdint.h>
- #include <string.h>
- #include <stdlib.h>
- #include <time.h>
- #include <tmmintrin.h>
- #ifdef _MSC_VER
- #define FORCEINLINE __forceinline
- #define NOINLINE __declspec(noinline)
- #define RESTRICT __restrict
- #else
- #define FORCEINLINE __attribute__((always_inline)) inline
- #define NOINLINE __attribute__((noinline))
- #define RESTRICT __restrict__
- #endif
- //switch to NOINLINE to see performance with forbidden inlining
- #define INLINING_MODE inline
// Byte-at-a-time reference implementation.
// Copies `size` bytes to pDest from the bit stream that begins `srcBitOffset`
// bits after pSrc (bit 0 is the least significant bit of pSrc[0]).
//   pDest        - receives exactly `size` bytes
//   pSrc         - source bytes; when the offset is not a multiple of 8 the
//                  byte just past the copied range is read as well, because it
//                  supplies the high bits of the last output byte
//   srcBitOffset - bit offset into the source, any value 0..255
//   size         - number of bytes to produce
INLINING_MODE void OffsetMemCpy_PaulR(uint8_t* pDest, const uint8_t* pSrc, const uint8_t srcBitOffset, const size_t size)
{
    // Split the offset into whole bytes plus a 0..7 bit shift. Using the raw
    // offset in "8 - srcBitOffset" would be a negative shift count for any
    // offset of 8 or more.
    const uint8_t *src = pSrc + (srcBitOffset >> 3);
    const unsigned shift = srcBitOffset & 7u;

    if (shift == 0)
    {
        // Byte aligned: plain forward copy.
        for (size_t i = 0; i < size; ++i)
        {
            pDest[i] = src[i];
        }
        return;
    }
    if (size == 0)
    {
        return;
    }

    // Each output byte takes its low bits from the current source byte and
    // its high bits from the following one.
    unsigned low = src[0];
    for (size_t i = 0; i < size; ++i)
    {
        const unsigned high = src[i + 1];
        pDest[i] = (uint8_t)((low >> shift) | (high << (8 - shift)));
        low = high;
    }
}
// 64-bit-word variant of the bit-offset copy.
// Produces the same leading `size` bytes as the byte-wise version, but works
// in 8-byte words, so the caller must provide slack:
//   - pDest receives ((size + 7) / 8) * 8 bytes, i.e. `size` rounded up;
//   - when the offset is not word aligned, one extra 8-byte source word past
//     the rounded-up range is read.
// Words are loaded in the host byte order, so the bit numbering only matches
// the byte-wise version on little-endian machines.
//   srcBitOffset - bit offset into the source, any value 0..255
INLINING_MODE void OffsetMemCpy_PaulR_64(uint8_t* pDest, const uint8_t* pSrc, const uint8_t srcBitOffset, size_t size)
{
    const size_t wordCount = (size + 7) >> 3;

    // Split the offset into whole words plus a 0..63 bit shift so that
    // "64 - shift" is always a valid shift count.
    const uint8_t *src = pSrc + (size_t)(srcBitOffset >> 6) * sizeof(uint64_t);
    const unsigned shift = srcBitOffset & 63u;

    // memcpy is used for every word access: the byte pointers carry no
    // alignment guarantee, and dereferencing them as uint64_t would also
    // break the aliasing rules.
    if (shift == 0)
    {
        for (size_t i = 0; i < wordCount; ++i)
        {
            uint64_t word;
            memcpy(&word, src + i * sizeof word, sizeof word);
            memcpy(pDest + i * sizeof word, &word, sizeof word);
        }
        return;
    }
    if (wordCount == 0)
    {
        return;
    }

    uint64_t low;
    memcpy(&low, src, sizeof low);
    for (size_t i = 0; i < wordCount; ++i)
    {
        uint64_t high;
        memcpy(&high, src + (i + 1) * sizeof high, sizeof high);
        const uint64_t merged = (low >> shift) | (high << (64 - shift));
        memcpy(pDest + i * sizeof merged, &merged, sizeof merged);
        low = high;
    }
}
// SSSE3 variant of the bit-offset copy; produces 14 output bytes per iteration.
// Each iteration loads 16 source bytes, places bytes 0..7 in the low 64-bit
// lane and bytes 7..14 in the high lane (byte 7 is duplicated so both lanes
// see the byte that follows their last useful one), shifts both lanes right by
// the bit offset, and then packs the 7 valid bytes of each lane together.
// The caller must provide slack on both sides:
//   - every iteration reads 16 bytes from pSrc and stores 16 bytes to pDest
//     (the last two stored bytes are zero) while advancing only 14;
//   - the loop runs while pSrc < pSrc + size, so output is produced in whole
//     14-byte steps.
//   srcBitOffset - bit offset into the source; the callers in this file pass 0..7
INLINING_MODE void OffsetMemCpy_stgatilov(uint8_t *RESTRICT pDest, const uint8_t *RESTRICT pSrc, const uint8_t srcBitOffset, const size_t size) {
    // _mm_srl_epi64 takes its count from the low 64 bits of the register and
    // _mm_cvtsi32_si128 zero-extends, so the 32-bit move is sufficient and,
    // unlike _mm_cvtsi64_si128, is available on 32-bit targets too.
    const __m128i shiftCount = _mm_cvtsi32_si128(srcBitOffset);
    const __m128i spreadLanes = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, 10, 11, 12, 13, 14);
    const __m128i packLanes = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, -1, -1);
    const uint8_t *pEnd = pSrc + size;
    while (pSrc < pEnd) {
        const __m128i chunk = _mm_loadu_si128((const __m128i *)pSrc);
        const __m128i lanes = _mm_shuffle_epi8(chunk, spreadLanes);
        const __m128i shifted = _mm_srl_epi64(lanes, shiftCount);
        // Index -1 has its top bit set, which makes the shuffle write zero.
        const __m128i packed = _mm_shuffle_epi8(shifted, packLanes);
        _mm_storeu_si128((__m128i *)pDest, packed);
        pSrc += 14;
        pDest += 14;
    }
}
// Benchmark dimensions. Enum constants are integer constant expressions in
// both C and C++, so they can size the file-scope arrays below.
enum {
    ARRAYSIZE    = 1 << 16,  // bytes of random source data
    MAXBLOCKSIZE = 64,       // largest copy length used by the verification pass
    QUERYCNT     = 1 << 12,  // precomputed (start, offset) queries; power of two because main masks with QUERYCNT-1
    LAUNCHES     = 1 << 26   // timed calls per routine and per size
};
uint8_t input[ARRAYSIZE];
// Twice the maximum block size, leaving room for the wide stores of the
// word-based and SIMD routines.
uint8_t output[3][MAXBLOCKSIZE * 2];
uint8_t offsets[QUERYCNT];
// Start positions are drawn from [0, ARRAYSIZE/2), i.e. up to 32767, so they
// need 16 bits; an 8-bit element would truncate every start to 0..255.
uint16_t starts[QUERYCNT];
- int main() {
- for (int i = 0; i < ARRAYSIZE; i++)
- input[i] = rand() & 255;
- for (int i = 0; i < 1000000; i++) {
- int st = rand() % (ARRAYSIZE/2);
- int len = 1 + rand() % MAXBLOCKSIZE;
- int offs = rand() % 8;
- memset(&output[0][0], 0, sizeof(output));
- OffsetMemCpy_PaulR(output[0], input + st, offs, len);
- OffsetMemCpy_PaulR_64(output[1], input + st, offs, len);
- OffsetMemCpy_stgatilov(output[2], input + st, offs, len);
- if (memcmp(output[0], output[1], len) != 0 || memcmp(output[0], output[2], len) != 0) {
- printf("Error:\n");
- printf("len = %d, offset = %d\n", len, offs);
- for (int q = 0; q < 4; q++) {
- for (int i = 0; i < len + (q==0); i++) {
- for (int j = 0; j < 8; j++)
- printf("%d", ((q==0 ? input + st : output[q-1])[i] >> j) & 1);
- printf(" ");
- }
- printf("\n");
- }
- }
- }
- for (int i = 0; i < QUERYCNT; i++) {
- offsets[i] = rand() % 8;
- starts[i] = rand() % (ARRAYSIZE/2);
- }
- printf("(billions of calls per second)\n");
- for (int size = 1; size <= 32; size++) {
- printf("size = %d:\n", size);
- int start = clock();
- for (int i = 0; i < LAUNCHES; i++) {
- int q = i & (QUERYCNT-1);
- OffsetMemCpy_PaulR(output[0], input + starts[q], offsets[q], size);
- }
- printf(" %0.3g (Paul R)\n", 1e-9 * LAUNCHES / (double(clock() - start) / CLOCKS_PER_SEC));
- start = clock();
- for (int i = 0; i < LAUNCHES; i++) {
- int q = i & (QUERYCNT-1);
- OffsetMemCpy_PaulR_64(output[0], input + starts[q], offsets[q], size);
- }
- printf(" %0.3g (Paul R x64)\n", 1e-9 * LAUNCHES / (double(clock() - start) / CLOCKS_PER_SEC));
- start = clock();
- for (int i = 0; i < LAUNCHES; i++) {
- int q = i & (QUERYCNT-1);
- OffsetMemCpy_stgatilov(output[0], input + starts[q], offsets[q], size);
- }
- printf(" %0.3g (stgatilov)\n", 1e-9 * LAUNCHES / (double(clock() - start) / CLOCKS_PER_SEC));
- }
- return 0;
- }
- /*
- Output with MSVC2013 x64 on Ivy Bridge 3.4 GHz:
- (billions of calls per second)
- size = 1:
- 0.261 (Paul R)
- 0.249 (Paul R x64)
- 0.453 (stgatilov)
- size = 2:
- 0.201 (Paul R)
- 0.248 (Paul R x64)
- 0.45 (stgatilov)
- size = 3:
- 0.153 (Paul R)
- 0.249 (Paul R x64)
- 0.45 (stgatilov)
- size = 4:
- 0.132 (Paul R)
- 0.248 (Paul R x64)
- 0.45 (stgatilov)
- size = 5:
- 0.111 (Paul R)
- 0.248 (Paul R x64)
- 0.453 (stgatilov)
- size = 6:
- 0.0981 (Paul R)
- 0.248 (Paul R x64)
- 0.453 (stgatilov)
- size = 7:
- 0.0863 (Paul R)
- 0.248 (Paul R x64)
- 0.45 (stgatilov)
- size = 8:
- 0.0782 (Paul R)
- 0.249 (Paul R x64)
- 0.45 (stgatilov)
- size = 9:
- 0.0712 (Paul R)
- 0.191 (Paul R x64)
- 0.453 (stgatilov)
- size = 10:
- 0.0652 (Paul R)
- 0.191 (Paul R x64)
- 0.453 (stgatilov)
- size = 11:
- 0.0599 (Paul R)
- 0.191 (Paul R x64)
- 0.45 (stgatilov)
- size = 12:
- 0.0559 (Paul R)
- 0.191 (Paul R x64)
- 0.453 (stgatilov)
- size = 13:
- 0.0474 (Paul R)
- 0.19 (Paul R x64)
- 0.453 (stgatilov)
- size = 14:
- 0.0489 (Paul R)
- 0.191 (Paul R x64)
- 0.45 (stgatilov)
- size = 15:
- 0.0453 (Paul R)
- 0.191 (Paul R x64)
- 0.317 (stgatilov)
- size = 16:
- 0.0435 (Paul R)
- 0.191 (Paul R x64)
- 0.317 (stgatilov)
- size = 17:
- 0.0392 (Paul R)
- 0.121 (Paul R x64)
- 0.317 (stgatilov)
- size = 18:
- 0.0373 (Paul R)
- 0.122 (Paul R x64)
- 0.318 (stgatilov)
- size = 19:
- 0.0353 (Paul R)
- 0.121 (Paul R x64)
- 0.317 (stgatilov)
- size = 20:
- 0.0341 (Paul R)
- 0.122 (Paul R x64)
- 0.315 (stgatilov)
- size = 21:
- 0.0327 (Paul R)
- 0.121 (Paul R x64)
- 0.32 (stgatilov)
- size = 22:
- 0.0313 (Paul R)
- 0.122 (Paul R x64)
- 0.317 (stgatilov)
- size = 23:
- 0.0302 (Paul R)
- 0.121 (Paul R x64)
- 0.32 (stgatilov)
- size = 24:
- 0.0291 (Paul R)
- 0.122 (Paul R x64)
- 0.318 (stgatilov)
- size = 25:
- 0.028 (Paul R)
- 0.107 (Paul R x64)
- 0.317 (stgatilov)
- size = 26:
- 0.0271 (Paul R)
- 0.107 (Paul R x64)
- 0.317 (stgatilov)
- size = 27:
- 0.0262 (Paul R)
- 0.107 (Paul R x64)
- 0.318 (stgatilov)
- size = 28:
- 0.0253 (Paul R)
- 0.107 (Paul R x64)
- 0.315 (stgatilov)
- size = 29:
- 0.0246 (Paul R)
- 0.107 (Paul R x64)
- 0.235 (stgatilov)
- size = 30:
- 0.0238 (Paul R)
- 0.107 (Paul R x64)
- 0.235 (stgatilov)
- size = 31:
- 0.023 (Paul R)
- 0.107 (Paul R x64)
- 0.235 (stgatilov)
- size = 32:
- 0.0224 (Paul R)
- 0.107 (Paul R x64)
- 0.235 (stgatilov)
- */
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement