Guest User

Untitled

a guest
Sep 20th, 2021
57
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. /*
  2. compile with:
  3. gcc -c -Wall -O2 -march=skylake-avx512 sum64avx512.c
  4. g++ -s -Wall -O2 -march=skylake-avx512 aligned_add_main.o sum64avx512.o -o tst_64b
  5. */
  6. #include <stdint.h>
  7. #include <string.h>
  8. #include <intrin.h>
  9.  
  10. uint64_t sum64(char* buf, int incr)
  11. {
  12.   __m512i acc0=_mm512_setzero_si512();
  13.   __m512i acc1=_mm512_setzero_si512();
  14.   __m512i acc2=_mm512_setzero_si512();
  15.   __m512i acc3=_mm512_setzero_si512();
  16.   for (int i = 0; i < 4000; ++i) {
  17.     char* p = buf;
  18.     for (int k = 0; k < 125; ++k) {
  19.       __m512i x0 = _mm512_loadu_si512((__m512i*)&p[0*sizeof(__m512i)]);
  20.       __m512i x1 = _mm512_loadu_si512((__m512i*)&p[1*sizeof(__m512i)]);
  21.       __m512i x2 = _mm512_loadu_si512((__m512i*)&p[2*sizeof(__m512i)]);
  22.       __m512i x3 = _mm512_loadu_si512((__m512i*)&p[3*sizeof(__m512i)]);
  23.       acc0 = _mm512_add_epi64(acc0, x0);
  24.       acc1 = _mm512_add_epi64(acc1, x1);
  25.       acc2 = _mm512_add_epi64(acc2, x2);
  26.       acc3 = _mm512_add_epi64(acc3, x3);
  27.       p += sizeof(__m512i)*4;
  28.     }
  29.     buf += incr;
  30.   }
  31.   acc0 = _mm512_add_epi64(acc0, acc1);
  32.   acc2 = _mm512_add_epi64(acc2, acc3);
  33.   acc0 = _mm512_add_epi64(acc0, acc2);
  34.   __m256i acc = _mm256_add_epi64(_mm512_castsi512_si256(acc0), _mm512_extracti64x4_epi64(acc0, 1));
  35.   return
  36.     _mm256_extract_epi64(acc, 0) +
  37.     _mm256_extract_epi64(acc, 1) +
  38.     _mm256_extract_epi64(acc, 2) +
  39.     _mm256_extract_epi64(acc, 3) ;
  40. }
RAW Paste Data