Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- inline static void integral2(const cv::Size2i& size, const uint8_t* srcBase, ptrdiff_t srcStride, uint32_t* sumBase, ptrdiff_t sumStride, double* sqsumBase, ptrdiff_t sqsumStride)
- {
- uint32x4_t v_zero = vmovq_n_u32(0u);
- uint16x8_t v_zero8 = vmovq_n_u16(0u);
- // the first iteration
- const uint8_t* src = internal::getRowPtr(srcBase, srcStride, 0);
- uint32_t* sum = internal::getRowPtr(sumBase, sumStride, 0);
- double* sqsum = internal::getRowPtr(sqsumBase, sqsumStride, 0);
- double prevsq = 0.;
- uint32x4_t prev = v_zero;
- size_t j = 0u;
- for (; j + 7 < size.width; j += 8)
- {
- internal::prefetch(sqsum + j);
- internal::prefetch(sum + j);
- internal::prefetch(src + j);
- uint8x8_t el8shr0 = vld1_u8(src + j);
- uint16x8_t sqel8shr0 = vmull_u8(el8shr0, el8shr0);
- uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8));
- uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16));
- uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24));
- uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2);
- uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3);
- uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03);
- uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8));
- uint32x4_t vsuml = vaddw_u16(prev, vget_low_u16(el8));
- uint32x4_t vsumh = vaddw_u16(prev, el4h);
- vst1q_u32(sum + j, vsuml);
- vst1q_u32(sum + j + 4, vsumh);
- prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3));
- uint16x8_t sqel8shr1 = vextq_u16(v_zero8, sqel8shr0, 7);
- uint32x4_t sqel8shr01l = vaddl_u16(vget_low_u16(sqel8shr0), vget_low_u16(sqel8shr1));
- uint32x4_t sqel8shr01h = vaddl_u16(vget_high_u16(sqel8shr0), vget_high_u16(sqel8shr1));
- uint32x4_t sqel4h = vaddq_u32(sqel8shr01l, sqel8shr01h);
- uint32x2_t sqel2l = vadd_u32(vget_low_u32(sqel8shr01l), vget_high_u32(sqel8shr01l));
- uint32x2_t sqel2hl = vadd_u32(vget_low_u32(sqel4h), vget_high_u32(sqel8shr01l));
- uint32x2_t sqel2hh = vadd_u32(vget_low_u32(sqel4h), vget_high_u32(sqel4h));
- uint32_t buf[8];
- vst1_u32(buf, vget_low_u32(sqel8shr01l));
- vst1_u32(buf + 2, sqel2l);
- vst1_u32(buf + 4, sqel2hl);
- vst1_u32(buf + 6, sqel2hh);
- for (uint32_t k = 0; k < 8; k++)
- sqsum[j + k] = prevsq + buf[k];
- prevsq += buf[7];
- }
- for (uint32_t v = vgetq_lane_u32(prev, 3); j < size.width; ++j)
- {
- sum[j] = (v += src[j]);
- sqsum[j] = (prevsq += src[j] * src[j]);
- }
- // the others
- for (size_t i = 1; i < size.height; ++i)
- {
- src = internal::getRowPtr(srcBase, srcStride, i);
- uint32_t* prevSum = internal::getRowPtr(sumBase, sumStride, i - 1);
- sum = internal::getRowPtr(sumBase, sumStride, i);
- double* prevSqSum = internal::getRowPtr(sqsumBase, sqsumStride, i - 1);
- sqsum = internal::getRowPtr(sqsumBase, sqsumStride, i);
- prev = v_zero;
- prevsq = 0.;
- j = 0u;
- for (; j + 7 < size.width; j += 8)
- {
- internal::prefetch(sqsum + j);
- internal::prefetch(sum + j);
- internal::prefetch(src + j);
- uint32x4_t vsuml = vld1q_u32(prevSum + j);
- uint32x4_t vsumh = vld1q_u32(prevSum + j + 4);
- uint8x8_t el8shr0 = vld1_u8(src + j);
- uint16x8_t sqel8shr0 = vmull_u8(el8shr0, el8shr0);
- uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8));
- uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16));
- uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24));
- vsuml = vaddq_u32(vsuml, prev);
- vsumh = vaddq_u32(vsumh, prev);
- uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2);
- uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3);
- uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03);
- uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8));
- vsuml = vaddw_u16(vsuml, vget_low_u16(el8));
- vsumh = vaddw_u16(vsumh, el4h);
- vst1q_u32(sum + j, vsuml);
- vst1q_u32(sum + j + 4, vsumh);
- prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3));
- uint16x8_t sqel8shr1 = vextq_u16(v_zero8, sqel8shr0, 7);
- uint32x4_t sqel8shr01l = vaddl_u16(vget_low_u16(sqel8shr0), vget_low_u16(sqel8shr1));
- uint32x4_t sqel8shr01h = vaddl_u16(vget_high_u16(sqel8shr0), vget_high_u16(sqel8shr1));
- uint32x4_t sqel4h = vaddq_u32(sqel8shr01l, sqel8shr01h);
- uint32x2_t sqel2l = vadd_u32(vget_low_u32(sqel8shr01l), vget_high_u32(sqel8shr01l));
- uint32x2_t sqel2hl = vadd_u32(vget_low_u32(sqel4h), vget_high_u32(sqel8shr01l));
- uint32x2_t sqel2hh = vadd_u32(vget_low_u32(sqel4h), vget_high_u32(sqel4h));
- uint32_t buf[8];
- vst1_u32(buf, vget_low_u32(sqel8shr01l));
- vst1_u32(buf + 2, sqel2l);
- vst1_u32(buf + 4, sqel2hl);
- vst1_u32(buf + 6, sqel2hh);
- for (uint32_t k = 0; k < 8; k++)
- sqsum[j + k] = prevsq + prevSqSum[j + k] + buf[k];
- prevsq += buf[7];
- }
- for (uint32_t v = vgetq_lane_u32(prev, 3); j < size.width; ++j)
- {
- sum[j] = (v += src[j]) + prevSum[j];
- sqsum[j] = (prevsq += src[j] * src[j]) + prevSqSum[j];
- }
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement