Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#if defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_IX86)
#include <immintrin.h>
#endif
/*
 * Unaligned scalar loads. Going through memcpy avoids the undefined behavior
 * of dereferencing a uint8_t pointer cast to a wider integer type (alignment
 * and strict aliasing); compilers lower a fixed-size memcpy to a single move.
 */
static inline uint64_t memeq_load64(const uint8_t *p) {
    uint64_t v;
    memcpy(&v, p, sizeof v);
    return v;
}

static inline uint32_t memeq_load32(const uint8_t *p) {
    uint32_t v;
    memcpy(&v, p, sizeof v);
    return v;
}

static inline uint16_t memeq_load16(const uint8_t *p) {
    uint16_t v;
    memcpy(&v, p, sizeof v);
    return v;
}

/*
 * Each memeq_* sweep below requires n >= its block width. It walks the
 * buffers in whole blocks and then, if a partial block remains, compares the
 * LAST full-width block of the range. That final block overlaps bytes that
 * were already checked, which is harmless and avoids a byte-wise tail loop.
 */

#if defined(__AVX512BW__)
/* True when the 64 bytes at a and b are identical. */
static inline bool memeq_block64(const uint8_t *a, const uint8_t *b) {
    __m512i va = _mm512_loadu_si512((const void *)a);
    __m512i vb = _mm512_loadu_si512((const void *)b);
    /* One mask bit per differing byte: equal iff no bit is set. */
    return _mm512_cmpneq_epi8_mask(va, vb) == 0;
}

/* Compares n >= 64 bytes using 512-bit blocks. */
static inline bool memeq_avx512(const uint8_t *a, const uint8_t *b, size_t n) {
    const size_t w = sizeof(__m512i);
    size_t off = 0;
    for (; n - off >= w; off += w) {
        if (!memeq_block64(a + off, b + off)) return false;
    }
    return off == n || memeq_block64(a + n - w, b + n - w);
}
#endif

#if defined(__AVX2__)
/* True when the 32 bytes at a and b are identical. */
static inline bool memeq_block32(const uint8_t *a, const uint8_t *b) {
    __m256i va = _mm256_loadu_si256((const __m256i *)a);
    __m256i vb = _mm256_loadu_si256((const __m256i *)b);
    /* movemask gathers one bit per byte lane; all 32 must be set. */
    return (uint32_t)_mm256_movemask_epi8(_mm256_cmpeq_epi8(va, vb)) == UINT32_MAX;
}

/* Compares n >= 32 bytes using 256-bit blocks. */
static inline bool memeq_avx2(const uint8_t *a, const uint8_t *b, size_t n) {
    const size_t w = sizeof(__m256i);
    size_t off = 0;
    for (; n - off >= w; off += w) {
        if (!memeq_block32(a + off, b + off)) return false;
    }
    return off == n || memeq_block32(a + n - w, b + n - w);
}
#endif

#if defined(__SSE2__) || defined(_M_X64)
/* True when the 16 bytes at a and b are identical. */
static inline bool memeq_block16(const uint8_t *a, const uint8_t *b) {
    __m128i va = _mm_loadu_si128((const __m128i *)a);
    __m128i vb = _mm_loadu_si128((const __m128i *)b);
    /* movemask gathers one bit per byte lane; all 16 must be set. */
    return _mm_movemask_epi8(_mm_cmpeq_epi8(va, vb)) == 0xFFFF;
}

/* Compares n >= 16 bytes using 128-bit blocks. */
static inline bool memeq_sse2(const uint8_t *a, const uint8_t *b, size_t n) {
    const size_t w = sizeof(__m128i);
    size_t off = 0;
    for (; n - off >= w; off += w) {
        if (!memeq_block16(a + off, b + off)) return false;
    }
    return off == n || memeq_block16(a + n - w, b + n - w);
}
#endif

/* Compares n >= 8 bytes using 64-bit scalar words. */
static inline bool memeq_words(const uint8_t *a, const uint8_t *b, size_t n) {
    const size_t w = sizeof(uint64_t);
    size_t off = 0;
    for (; n - off >= w; off += w) {
        if (memeq_load64(a + off) != memeq_load64(b + off)) return false;
    }
    return off == n || memeq_load64(a + n - w) == memeq_load64(b + n - w);
}

/*
 * Tests two byte ranges for equality.
 *
 * pu1, pu2  start of each range; both must be readable for n bytes
 * n         number of bytes to compare
 *
 * Returns true when the first n bytes of both ranges are identical, false
 * otherwise. n == 0 yields true and neither pointer is dereferenced.
 *
 * The widest SIMD tier enabled at compile time (AVX-512BW, AVX2, SSE2) is
 * used for ranges at least one block long; shorter ranges fall through to the
 * next narrower tier and finally to scalar loads. No tier reads outside
 * [p, p + n). The comparison exits on the first differing block, so it is
 * not constant-time.
 */
bool memeq(const uint8_t* pu1, const uint8_t* pu2, size_t n) {
#if defined(__AVX512BW__)
    if (n >= sizeof(__m512i)) return memeq_avx512(pu1, pu2, n);
#endif
#if defined(__AVX2__)
    if (n >= sizeof(__m256i)) return memeq_avx2(pu1, pu2, n);
#endif
#if defined(__SSE2__) || defined(_M_X64)
    if (n >= sizeof(__m128i)) return memeq_sse2(pu1, pu2, n);
#endif
    if (n >= sizeof(uint64_t)) return memeq_words(pu1, pu2, n);

    if (n >= sizeof(uint32_t)) {
        /* 4..7 bytes: first word plus the (possibly overlapping) last word. */
        return memeq_load32(pu1) == memeq_load32(pu2) &&
               memeq_load32(pu1 + n - sizeof(uint32_t)) ==
               memeq_load32(pu2 + n - sizeof(uint32_t));
    }
    if (n >= sizeof(uint16_t)) {
        /* 2..3 bytes: first half-word plus the final byte. */
        return memeq_load16(pu1) == memeq_load16(pu2) && pu1[n - 1] == pu2[n - 1];
    }
    if (n == 1) {
        return pu1[0] == pu2[0];
    }
    return true; /* empty ranges are equal; nothing is read */
}
Add Comment
Please, Sign In to add comment