Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // My Version
- void clCompute::IMPL_BASE_UnPackBits_Scatter(i64 num, const u32 *src, u8 *dst)
- {
- i64 bitIndex = 0;
- for (i32 b = 31; b >= 0; b--)
- {
- u32 mask = 1 << b;
- i64 localBitIndex = bitIndex;
- for (i64 i = 0; i < num; i++, localBitIndex++)
- dst[localBitIndex >> 3] |= ((src[i] & mask) >> b) << (7 - (localBitIndex & 7));
- bitIndex += num;
- }
- }
- // AI Version
- void clCompute::IMPL_AVX2_UnPackBits_Scatter(i64 num, const u32 * src, u8 * dst)
- {
- memset(dst, 0, num * sizeof(u32));
- // Process 8 elements at a time
- const int64_t numChunks = num >> 3;
- const int64_t remainder = num & 3;
- for (int bit = 31; bit >= 0; bit--)
- {
- __m256i bitMask = _mm256_set1_epi32(1 << bit);
- int64_t bitOffset = (31 - bit) * num;
- for (int64_t i = 0; i < numChunks; i++)
- {
- __m256i data = _mm256_loadu_si256((__m256i *) & src[i * 8]);
- __m256i masked = _mm256_and_si256(data, bitMask);
- __m256i shifted = _mm256_srli_epi32(masked, bit);
- uint32_t bitResults =
- ((_mm256_extract_epi32(shifted, 0) & 1) << 7) |
- ((_mm256_extract_epi32(shifted, 1) & 1) << 6) |
- ((_mm256_extract_epi32(shifted, 2) & 1) << 5) |
- ((_mm256_extract_epi32(shifted, 3) & 1) << 4) |
- ((_mm256_extract_epi32(shifted, 4) & 1) << 3) |
- ((_mm256_extract_epi32(shifted, 5) & 1) << 2) |
- ((_mm256_extract_epi32(shifted, 6) & 1) << 1) |
- ((_mm256_extract_epi32(shifted, 7) & 1) << 0);
- dst[(bitOffset + i * 8) >> 3] = (uint8_t)bitResults;
- }
- // Process remaining elements
- int64_t offset = numChunks * 8;
- for (int64_t i = 0; i < remainder; i++)
- {
- uint32_t bitValue = (src[offset + i] & (1 << bit)) >> bit;
- dst[(bitOffset + offset + i) >> 3] |= (bitValue << (7 - ((bitOffset + offset + i) & 7)));
- }
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement