Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /*
- compile with:
- gcc -c -Wall -O2 -march=skylake-avx512 sum64avx512.c
- g++ -s -Wall -O2 -march=skylake-avx512 aligned_add_main.o sum64avx512.o -o tst_64b
- */
- #include <stdint.h>
- #include <string.h>
- #include <intrin.h>
- uint64_t sum64(char* buf, int incr)
- {
- __m512i acc0=_mm512_setzero_si512();
- __m512i acc1=_mm512_setzero_si512();
- __m512i acc2=_mm512_setzero_si512();
- __m512i acc3=_mm512_setzero_si512();
- for (int i = 0; i < 4000; ++i) {
- char* p = buf;
- for (int k = 0; k < 125; ++k) {
- __m512i x0 = _mm512_loadu_si512((__m512i*)&p[0*sizeof(__m512i)]);
- __m512i x1 = _mm512_loadu_si512((__m512i*)&p[1*sizeof(__m512i)]);
- __m512i x2 = _mm512_loadu_si512((__m512i*)&p[2*sizeof(__m512i)]);
- __m512i x3 = _mm512_loadu_si512((__m512i*)&p[3*sizeof(__m512i)]);
- acc0 = _mm512_add_epi64(acc0, x0);
- acc1 = _mm512_add_epi64(acc1, x1);
- acc2 = _mm512_add_epi64(acc2, x2);
- acc3 = _mm512_add_epi64(acc3, x3);
- p += sizeof(__m512i)*4;
- }
- buf += incr;
- }
- acc0 = _mm512_add_epi64(acc0, acc1);
- acc2 = _mm512_add_epi64(acc2, acc3);
- acc0 = _mm512_add_epi64(acc0, acc2);
- __m256i acc = _mm256_add_epi64(_mm512_castsi512_si256(acc0), _mm512_extracti64x4_epi64(acc0, 1));
- return
- _mm256_extract_epi64(acc, 0) +
- _mm256_extract_epi64(acc, 1) +
- _mm256_extract_epi64(acc, 2) +
- _mm256_extract_epi64(acc, 3) ;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement