Advertisement
stgatilov

Canny edge detector: nonmaximum suppression on SSE

Sep 16th, 2015
762
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 6.92 KB | None | 0 0
  1. #include <stdio.h>
  2. #include <assert.h>
  3. #include <string.h>
  4. #include <stdlib.h>
  5. #include <time.h>
  6. #include <xmmintrin.h>
  7.  
  8. void NonMaximumSuppression_original(
  9.     float* fpDst,
  10.     float const*const fpMagnitude,
  11.     unsigned char const*const ucpGradient,   ///< [in] 0 -> 0°, 1 -> 45°, 2 -> 90°, 3 -> 135°
  12.     int iXCount,
  13.     int iStride,
  14.     int iYCount,
  15.     int ignoreX,
  16.     int ignoreY
  17. ) {
  18.     memset(fpDst, 0, sizeof(fpDst[0]) * iYCount * iStride);
  19.  
  20.     for (int y = ignoreY; y < iYCount - ignoreY; ++y)
  21.     {
  22.         for (int x = ignoreX; x < iXCount - ignoreX; ++x)
  23.         {
  24.             int idx = iStride * y + x;
  25.             unsigned char dir = ucpGradient[idx];
  26.             float fMag = fpMagnitude[idx];
  27.          
  28.             if (dir == 0 && fpMagnitude[idx - 1]           <= fMag && fMag >= fpMagnitude[idx + 1] ||
  29.                 dir == 1 && fpMagnitude[idx - iXCount + 1] <= fMag && fMag >= fpMagnitude[idx + iXCount - 1] ||
  30.                 dir == 2 && fpMagnitude[idx - iXCount]     <= fMag && fMag >= fpMagnitude[idx + iXCount] ||
  31.                 dir == 3 && fpMagnitude[idx - iXCount - 1] <= fMag && fMag >= fpMagnitude[idx + iXCount + 1]
  32.             )
  33.                 fpDst[idx] = fMag;
  34.             else
  35.                 fpDst[idx] = 0;
  36.         }
  37.     }
  38. }
  39.  
  40. void NonMaximumSuppression_branchless(
  41.     float *dst,
  42.     float const*const src,
  43.     unsigned char const*const dir,
  44.     int width,
  45.     int stride,
  46.     int height,
  47.     int ignoreX,
  48.     int ignoreY
  49. ) {
  50.     size_t delta[4] = {1, stride - 1, stride, stride + 1};
  51.     for (size_t y = ignoreY; y < height - ignoreY; ++y) {
  52.         size_t x = ignoreX;
  53.         size_t idx = stride * y + x;
  54.         for (; x < width - ignoreX; ++x, ++idx) {
  55.             float curr = src[idx];
  56.             int offset = delta[dir[idx]];
  57.             bool isMax = (curr >= src[idx + offset] && curr >= src[idx - offset]);
  58.             dst[idx] = (isMax ? curr : 0.0f);
  59.         }
  60.     }
  61. }
  62.  
  63. void NonMaximumSuppression_scalarsse(
  64.     float *dst,
  65.     float const*const src,
  66.     unsigned char const*const dir,
  67.     int width,
  68.     int stride,
  69.     int height,
  70.     int ignoreX,
  71.     int ignoreY
  72. ) {
  73.     size_t delta[4] = {1, stride - 1, stride, stride + 1};
  74.     for (size_t y = ignoreY; y < height - ignoreY; ++y) {
  75.         size_t x = ignoreX;
  76.         size_t idx = stride * y + x;
  77.         size_t capX = width - ignoreX;
  78. /*        for (; x < capX/4*4; x+=4, idx+=4) {
  79.             #define DOIT(pos) {\
  80.                 size_t offset = delta[dir[pos]];\
  81.                 __m128 curr = _mm_load_ss(&src[pos]);\
  82.                 __m128 cmp1 = _mm_cmpge_ss(curr, _mm_load_ss(&src[pos + offset]));\
  83.                 __m128 cmp2 = _mm_cmpge_ss(curr, _mm_load_ss(&src[pos - offset]));\
  84.                 __m128 res = _mm_and_ps(_mm_and_ps(cmp1, cmp2), curr);\
  85.                 _mm_store_ss(&dst[pos], res);\
  86.             }
  87.             DOIT(idx+0);
  88.             DOIT(idx+1);
  89.             DOIT(idx+2);
  90.             DOIT(idx+3);
  91.         }*/
  92.         for (; x < capX; ++x, ++idx) {
  93.             size_t offset = delta[dir[idx]];
  94.             __m128 curr = _mm_load_ss(&src[idx]);
  95.             __m128 cmp1 = _mm_cmpge_ss(curr, _mm_load_ss(&src[idx + offset]));
  96.             __m128 cmp2 = _mm_cmpge_ss(curr, _mm_load_ss(&src[idx - offset]));
  97.             __m128 res = _mm_and_ps(_mm_and_ps(cmp1, cmp2), curr);
  98.             _mm_store_ss(&dst[idx], res);
  99.         }
  100.     }
  101. }
  102.  
  103. void NonMaximumSuppression_hybrid(
  104.     float *dst,
  105.     float const*const src,
  106.     unsigned char const*const dir,
  107.     int width,
  108.     int stride,
  109.     int height,
  110.     int ignoreX,
  111.     int ignoreY
  112. ) {
  113.     size_t delta[4] = {1, stride - 1, stride, stride + 1};
  114.     for (size_t y = ignoreY; y < height - ignoreY; ++y) {
  115.         size_t x = ignoreX;
  116.         size_t idx = stride * y + x;
  117.         size_t capX = width - ignoreX;
  118.         for (; x < capX/4*4; x+=4, idx+=4) {
  119.             size_t offset0 = delta[dir[idx + 0]];
  120.             size_t offset1 = delta[dir[idx + 1]];
  121.             size_t offset2 = delta[dir[idx + 2]];
  122.             size_t offset3 = delta[dir[idx + 3]];
  123.             __m128 curr = _mm_loadu_ps(&src[idx]);
  124.             __m128 forw = _mm_setr_ps(src[idx+0 + offset0], src[idx+1 + offset1], src[idx+2 + offset2], src[idx+3 + offset3]);
  125.             __m128 back = _mm_setr_ps(src[idx+0 - offset0], src[idx+1 - offset1], src[idx+2 - offset2], src[idx+3 - offset3]);
  126.             __m128 cmp1 = _mm_cmpge_ps(curr, forw);
  127.             __m128 cmp2 = _mm_cmpge_ps(curr, back);
  128.             __m128 res = _mm_and_ps(_mm_and_ps(cmp1, cmp2), curr);
  129.             _mm_storeu_ps(&dst[idx], res);
  130.         }
  131.         for (; x < capX; ++x, ++idx) {
  132.             size_t offset = delta[dir[idx]];
  133.             __m128 curr = _mm_load_ss(&src[idx]);
  134.             __m128 cmp1 = _mm_cmpge_ss(curr, _mm_load_ss(&src[idx + offset]));
  135.             __m128 cmp2 = _mm_cmpge_ss(curr, _mm_load_ss(&src[idx - offset]));
  136.             __m128 res = _mm_and_ps(_mm_and_ps(cmp1, cmp2), curr);
  137.             _mm_store_ss(&dst[idx], res);
  138.         }
  139.     }
  140. }
  141.  
  142.  
  143. const int WIDTH = 1024;
  144. const int HEIGHT = 1024;
  145.  
  146. float src[WIDTH * HEIGHT];
  147. unsigned char dir[WIDTH * HEIGHT];
  148. float dst1[WIDTH * HEIGHT];
  149. float dst2[WIDTH * HEIGHT];
  150. float dst3[WIDTH * HEIGHT];
  151. float dst4[WIDTH * HEIGHT];
  152.  
  153. int main() {
  154.     srand(666);
  155.     for (int p = 0; p < WIDTH * HEIGHT; p++) {
  156.         src[p] = 0.0f + rand();
  157.         dir[p] = rand() % 4;
  158.     }
  159.  
  160.     {
  161.         int start = clock();
  162.         int done = 0;
  163.         do {
  164.             NonMaximumSuppression_original(dst1, src, dir, WIDTH, WIDTH, HEIGHT, 2, 2);
  165.             done++;
  166.         } while (clock() - start < CLOCKS_PER_SEC);
  167.         printf("original: %0.3lf\n", 1000.0 * (clock() - start) / done / CLOCKS_PER_SEC);
  168.     }
  169.  
  170.     {
  171.         int start = clock();
  172.         int done = 0;
  173.         do {
  174.             NonMaximumSuppression_branchless(dst2, src, dir, WIDTH, WIDTH, HEIGHT, 2, 2);
  175.             done++;
  176.         } while (clock() - start < CLOCKS_PER_SEC);
  177.         printf("branchless: %0.3lf\n", 1000.0 * (clock() - start) / done / CLOCKS_PER_SEC);
  178.     }
  179.  
  180.     {
  181.         int start = clock();
  182.         int done = 0;
  183.         do {
  184.             NonMaximumSuppression_scalarsse(dst3, src, dir, WIDTH, WIDTH, HEIGHT, 2, 2);
  185.             done++;
  186.         } while (clock() - start < CLOCKS_PER_SEC);
  187.         printf("scalarsse: %0.3lf\n", 1000.0 * (clock() - start) / done / CLOCKS_PER_SEC);
  188.     }
  189.  
  190.     {
  191.         int start = clock();
  192.         int done = 0;
  193.         do {
  194.             NonMaximumSuppression_hybrid(dst4, src, dir, WIDTH, WIDTH, HEIGHT, 2, 2);
  195.             done++;
  196.         } while (clock() - start < CLOCKS_PER_SEC);
  197.         printf("hybrid: %0.3lf\n", 1000.0 * (clock() - start) / done / CLOCKS_PER_SEC);
  198.     }
  199.  
  200.     for (int p = 0; p < WIDTH * HEIGHT; p++) {
  201.         assert(dst1[p] == dst2[p]);
  202.         assert(dst1[p] == dst3[p]);
  203.         assert(dst1[p] == dst4[p]);
  204.     }
  205.  
  206.     return 0;  
  207. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement