Advertisement
Guest User

Untitled

a guest
Feb 19th, 2020
142
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 5.07 KB | None | 0 0
  1. inline static void integral2(const cv::Size2i& size, const uint8_t* srcBase, ptrdiff_t srcStride, uint32_t* sumBase, ptrdiff_t sumStride, double* sqsumBase, ptrdiff_t sqsumStride)
  2.     {
  3.         uint32x4_t v_zero = vmovq_n_u32(0u);
  4.         uint16x8_t v_zero8 = vmovq_n_u16(0u);
  5.  
  6.         // the first iteration
  7.         const uint8_t* src = internal::getRowPtr(srcBase, srcStride, 0);
  8.         uint32_t* sum = internal::getRowPtr(sumBase, sumStride, 0);
  9.         double* sqsum = internal::getRowPtr(sqsumBase, sqsumStride, 0);
  10.  
  11.         double prevsq = 0.;
  12.  
  13.         uint32x4_t prev = v_zero;
  14.         size_t j = 0u;
  15.  
  16.         for (; j + 7 < size.width; j += 8)
  17.         {
  18.             internal::prefetch(sqsum + j);
  19.             internal::prefetch(sum + j);
  20.             internal::prefetch(src + j);
  21.  
  22.             uint8x8_t el8shr0 = vld1_u8(src + j);
  23.             uint16x8_t sqel8shr0 = vmull_u8(el8shr0, el8shr0);
  24.             uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8));
  25.             uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16));
  26.             uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24));
  27.  
  28.             uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2);
  29.             uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3);
  30.  
  31.             uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03);
  32.             uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8));
  33.  
  34.             uint32x4_t vsuml = vaddw_u16(prev, vget_low_u16(el8));
  35.             uint32x4_t vsumh = vaddw_u16(prev, el4h);
  36.  
  37.             vst1q_u32(sum + j, vsuml);
  38.             vst1q_u32(sum + j + 4, vsumh);
  39.  
  40.             prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3));
  41.  
  42.  
  43.  
  44.             uint16x8_t sqel8shr1 = vextq_u16(v_zero8, sqel8shr0, 7);
  45.  
  46.             uint32x4_t sqel8shr01l = vaddl_u16(vget_low_u16(sqel8shr0), vget_low_u16(sqel8shr1));
  47.             uint32x4_t sqel8shr01h = vaddl_u16(vget_high_u16(sqel8shr0), vget_high_u16(sqel8shr1));
  48.  
  49.             uint32x4_t sqel4h = vaddq_u32(sqel8shr01l, sqel8shr01h);
  50.  
  51.             uint32x2_t sqel2l = vadd_u32(vget_low_u32(sqel8shr01l), vget_high_u32(sqel8shr01l));
  52.             uint32x2_t sqel2hl = vadd_u32(vget_low_u32(sqel4h), vget_high_u32(sqel8shr01l));
  53.             uint32x2_t sqel2hh = vadd_u32(vget_low_u32(sqel4h), vget_high_u32(sqel4h));
  54.  
  55.             uint32_t buf[8];
  56.             vst1_u32(buf, vget_low_u32(sqel8shr01l));
  57.             vst1_u32(buf + 2, sqel2l);
  58.             vst1_u32(buf + 4, sqel2hl);
  59.             vst1_u32(buf + 6, sqel2hh);
  60.             for (uint32_t k = 0; k < 8; k++)
  61.                 sqsum[j + k] = prevsq + buf[k];
  62.             prevsq += buf[7];
  63.  
  64.         }
  65.  
  66.         for (uint32_t v = vgetq_lane_u32(prev, 3); j < size.width; ++j)
  67.         {
  68.             sum[j] = (v += src[j]);
  69.             sqsum[j] = (prevsq += src[j] * src[j]);
  70.         }
  71.  
  72.         // the others
  73.         for (size_t i = 1; i < size.height; ++i)
  74.         {
  75.             src = internal::getRowPtr(srcBase, srcStride, i);
  76.             uint32_t* prevSum = internal::getRowPtr(sumBase, sumStride, i - 1);
  77.             sum = internal::getRowPtr(sumBase, sumStride, i);
  78.  
  79.             double* prevSqSum = internal::getRowPtr(sqsumBase, sqsumStride, i - 1);
  80.             sqsum = internal::getRowPtr(sqsumBase, sqsumStride, i);
  81.  
  82.             prev = v_zero;
  83.             prevsq = 0.;
  84.             j = 0u;
  85.  
  86.             for (; j + 7 < size.width; j += 8)
  87.             {
  88.                 internal::prefetch(sqsum + j);
  89.                 internal::prefetch(sum + j);
  90.                 internal::prefetch(src + j);
  91.  
  92.                 uint32x4_t vsuml = vld1q_u32(prevSum + j);
  93.                 uint32x4_t vsumh = vld1q_u32(prevSum + j + 4);
  94.  
  95.                 uint8x8_t el8shr0 = vld1_u8(src + j);
  96.                 uint16x8_t sqel8shr0 = vmull_u8(el8shr0, el8shr0);
  97.                 uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8));
  98.                 uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16));
  99.                 uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24));
  100.  
  101.                 vsuml = vaddq_u32(vsuml, prev);
  102.                 vsumh = vaddq_u32(vsumh, prev);
  103.  
  104.                 uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2);
  105.                 uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3);
  106.  
  107.                 uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03);
  108.                 uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8));
  109.  
  110.                 vsuml = vaddw_u16(vsuml, vget_low_u16(el8));
  111.                 vsumh = vaddw_u16(vsumh, el4h);
  112.  
  113.                 vst1q_u32(sum + j, vsuml);
  114.                 vst1q_u32(sum + j + 4, vsumh);
  115.  
  116.                 prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3));
  117.  
  118.  
  119.                 uint16x8_t sqel8shr1 = vextq_u16(v_zero8, sqel8shr0, 7);
  120.  
  121.                 uint32x4_t sqel8shr01l = vaddl_u16(vget_low_u16(sqel8shr0), vget_low_u16(sqel8shr1));
  122.                 uint32x4_t sqel8shr01h = vaddl_u16(vget_high_u16(sqel8shr0), vget_high_u16(sqel8shr1));
  123.  
  124.                 uint32x4_t sqel4h = vaddq_u32(sqel8shr01l, sqel8shr01h);
  125.  
  126.                 uint32x2_t sqel2l = vadd_u32(vget_low_u32(sqel8shr01l), vget_high_u32(sqel8shr01l));
  127.                 uint32x2_t sqel2hl = vadd_u32(vget_low_u32(sqel4h), vget_high_u32(sqel8shr01l));
  128.                 uint32x2_t sqel2hh = vadd_u32(vget_low_u32(sqel4h), vget_high_u32(sqel4h));
  129.  
  130.                 uint32_t buf[8];
  131.                 vst1_u32(buf, vget_low_u32(sqel8shr01l));
  132.                 vst1_u32(buf + 2, sqel2l);
  133.                 vst1_u32(buf + 4, sqel2hl);
  134.                 vst1_u32(buf + 6, sqel2hh);
  135.                 for (uint32_t k = 0; k < 8; k++)
  136.                     sqsum[j + k] = prevsq + prevSqSum[j + k] + buf[k];
  137.                 prevsq += buf[7];
  138.  
  139.             }
  140.  
  141.             for (uint32_t v = vgetq_lane_u32(prev, 3); j < size.width; ++j)
  142.             {
  143.                 sum[j] = (v += src[j]) + prevSum[j];
  144.                 sqsum[j] = (prevsq += src[j] * src[j]) + prevSqSum[j];
  145.             }
  146.         }
  147.     }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement