Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdio.h>
- #include <assert.h>
- #include <string.h>
- #include <stdlib.h>
- #include <time.h>
- #include <xmmintrin.h>
/**
 * Reference non-maximum suppression.
 *
 * A pixel keeps its magnitude only if it is at least as large as both of its
 * neighbours along the quantised gradient direction; otherwise it becomes 0.
 * The whole destination (iYCount rows of iStride floats) is cleared first, so
 * the skipped border is 0 in the output.
 *
 * @param fpDst       [out] result image, iStride floats per row
 * @param fpMagnitude [in]  gradient magnitudes, same layout as fpDst
 * @param ucpGradient [in]  direction codes: 0 -> 0 degrees, 1 -> 45 degrees,
 *                          2 -> 90 degrees, 3 -> 135 degrees; any other value
 *                          produces 0 in the output
 * @param iXCount     number of pixels per row
 * @param iStride     distance, in elements, between the starts of two rows
 * @param iYCount     number of rows
 * @param ignoreX     number of columns skipped at the left and right edges
 * @param ignoreY     number of rows skipped at the top and bottom edges
 *
 * Neighbours one row and one column away from each processed pixel are read,
 * so ignoreX and ignoreY should be at least 1 to keep every access inside the
 * image.
 */
void NonMaximumSuppression_original(
    float* fpDst,
    float const*const fpMagnitude,
    unsigned char const*const ucpGradient,
    int iXCount,
    int iStride,
    int iYCount,
    int ignoreX,
    int ignoreY
) {
    /* Nothing to clear or scan; also keeps the memset size from wrapping. */
    if (iYCount <= 0 || iStride <= 0) {
        return;
    }

    memset(fpDst, 0, sizeof(fpDst[0]) * (size_t)iYCount * (size_t)iStride);

    for (int y = ignoreY; y < iYCount - ignoreY; ++y) {
        for (int x = ignoreX; x < iXCount - ignoreX; ++x) {
            const int idx = iStride * y + x;
            const unsigned char direction = ucpGradient[idx];
            const float fMag = fpMagnitude[idx];

            /* Rows are iStride elements apart, so vertical and diagonal
               neighbours must be addressed with iStride, not iXCount. */
            if ((direction == 0 && fpMagnitude[idx - 1] <= fMag && fMag >= fpMagnitude[idx + 1]) ||
                (direction == 1 && fpMagnitude[idx - iStride + 1] <= fMag && fMag >= fpMagnitude[idx + iStride - 1]) ||
                (direction == 2 && fpMagnitude[idx - iStride] <= fMag && fMag >= fpMagnitude[idx + iStride]) ||
                (direction == 3 && fpMagnitude[idx - iStride - 1] <= fMag && fMag >= fpMagnitude[idx + iStride + 1])) {
                fpDst[idx] = fMag;
            } else {
                fpDst[idx] = 0;
            }
        }
    }
}
/**
 * Non-maximum suppression using a direction -> neighbour-offset lookup table
 * instead of a per-direction condition chain.
 *
 * For every pixel inside the processed window the value is kept when it is
 * >= both neighbours along its direction, otherwise 0 is stored. Pixels
 * outside the window are not written.
 *
 * @param dst     [out] result image, stride floats per row
 * @param src     [in]  gradient magnitudes, same layout as dst
 * @param dir     [in]  direction code per pixel; used to index a 4-entry
 *                      table, so every value in the window must be 0..3
 * @param width   number of pixels per row
 * @param stride  distance, in elements, between the starts of two rows
 * @param height  number of rows
 * @param ignoreX number of columns skipped at the left and right edges
 * @param ignoreY number of rows skipped at the top and bottom edges
 *
 * Neighbours up to stride + 1 elements before and after each processed pixel
 * are read, so ignoreX and ignoreY should be at least 1.
 */
void NonMaximumSuppression_branchless(
    float *dst,
    float const*const src,
    unsigned char const*const dir,
    int width,
    int stride,
    int height,
    int ignoreX,
    int ignoreY
) {
    /* Offset to the "forward" neighbour for direction codes 0..3. */
    const size_t delta[4] = {
        1, (size_t)stride - 1, (size_t)stride, (size_t)stride + 1
    };

    /* Bounds stay signed: when the image is smaller than the ignored border
       they are negative and the loops must simply not run. */
    const int endX = width - ignoreX;
    const int endY = height - ignoreY;

    for (int y = ignoreY; y < endY; ++y) {
        size_t idx = (size_t)stride * (size_t)y + (size_t)ignoreX;
        for (int x = ignoreX; x < endX; ++x, ++idx) {
            const float curr = src[idx];
            const size_t offset = delta[dir[idx]];
            const int isMax = (curr >= src[idx + offset] && curr >= src[idx - offset]);
            dst[idx] = (isMax ? curr : 0.0f);
        }
    }
}
/**
 * Non-maximum suppression, one pixel at a time, using scalar SSE
 * instructions: the two >= comparisons produce bit masks that are ANDed with
 * the pixel value, so the stored result is either the value or 0.
 *
 * Pixels outside the processed window are not written.
 *
 * @param dst     [out] result image, stride floats per row
 * @param src     [in]  gradient magnitudes, same layout as dst
 * @param dir     [in]  direction code per pixel; used to index a 4-entry
 *                      table, so every value in the window must be 0..3
 * @param width   number of pixels per row
 * @param stride  distance, in elements, between the starts of two rows
 * @param height  number of rows
 * @param ignoreX number of columns skipped at the left and right edges
 * @param ignoreY number of rows skipped at the top and bottom edges
 *
 * Neighbours up to stride + 1 elements before and after each processed pixel
 * are read, so ignoreX and ignoreY should be at least 1.
 */
void NonMaximumSuppression_scalarsse(
    float *dst,
    float const*const src,
    unsigned char const*const dir,
    int width,
    int stride,
    int height,
    int ignoreX,
    int ignoreY
) {
    /* Offset to the "forward" neighbour for direction codes 0..3. */
    const size_t delta[4] = {
        1, (size_t)stride - 1, (size_t)stride, (size_t)stride + 1
    };

    /* Bounds stay signed: when the image is smaller than the ignored border
       they are negative and the loops must simply not run. */
    const int endX = width - ignoreX;
    const int endY = height - ignoreY;

    for (int y = ignoreY; y < endY; ++y) {
        size_t idx = (size_t)stride * (size_t)y + (size_t)ignoreX;
        for (int x = ignoreX; x < endX; ++x, ++idx) {
            const size_t offset = delta[dir[idx]];
            const __m128 curr = _mm_load_ss(&src[idx]);
            const __m128 geForward = _mm_cmpge_ss(curr, _mm_load_ss(&src[idx + offset]));
            const __m128 geBackward = _mm_cmpge_ss(curr, _mm_load_ss(&src[idx - offset]));
            /* mask & value: all-ones mask keeps the value, zero mask yields 0.0f */
            const __m128 res = _mm_and_ps(_mm_and_ps(geForward, geBackward), curr);
            _mm_store_ss(&dst[idx], res);
        }
    }
}
/**
 * Non-maximum suppression that processes four pixels per step: neighbour
 * values are gathered with scalar loads, then compared and masked with packed
 * SSE instructions. Remaining pixels of a row are handled one at a time.
 *
 * Pixels outside the processed window are not written.
 *
 * @param dst     [out] result image, stride floats per row
 * @param src     [in]  gradient magnitudes, same layout as dst
 * @param dir     [in]  direction code per pixel; used to index a 4-entry
 *                      table, so every value in the window must be 0..3
 * @param width   number of pixels per row
 * @param stride  distance, in elements, between the starts of two rows
 * @param height  number of rows
 * @param ignoreX number of columns skipped at the left and right edges
 * @param ignoreY number of rows skipped at the top and bottom edges
 *
 * Neighbours up to stride + 1 elements before and after each processed pixel
 * are read, so ignoreX and ignoreY should be at least 1.
 */
void NonMaximumSuppression_hybrid(
    float *dst,
    float const*const src,
    unsigned char const*const dir,
    int width,
    int stride,
    int height,
    int ignoreX,
    int ignoreY
) {
    /* Offset to the "forward" neighbour for direction codes 0..3. */
    const size_t delta[4] = {
        1, (size_t)stride - 1, (size_t)stride, (size_t)stride + 1
    };

    /* Bounds stay signed: when the image is smaller than the ignored border
       they are negative and the loops must simply not run. */
    const int endX = width - ignoreX;
    const int endY = height - ignoreY;

    for (int y = ignoreY; y < endY; ++y) {
        int x = ignoreX;
        size_t idx = (size_t)stride * (size_t)y + (size_t)x;

        /* x starts at ignoreX, not 0, so the group [x, x + 3] must be checked
           against endX directly; rounding endX down to a multiple of 4 would
           let the last group spill into the right border. */
        for (; x + 4 <= endX; x += 4, idx += 4) {
            const size_t offset0 = delta[dir[idx + 0]];
            const size_t offset1 = delta[dir[idx + 1]];
            const size_t offset2 = delta[dir[idx + 2]];
            const size_t offset3 = delta[dir[idx + 3]];

            const __m128 curr = _mm_loadu_ps(&src[idx]);
            const __m128 forw = _mm_setr_ps(src[idx + 0 + offset0], src[idx + 1 + offset1],
                                            src[idx + 2 + offset2], src[idx + 3 + offset3]);
            const __m128 back = _mm_setr_ps(src[idx + 0 - offset0], src[idx + 1 - offset1],
                                            src[idx + 2 - offset2], src[idx + 3 - offset3]);

            const __m128 geForward = _mm_cmpge_ps(curr, forw);
            const __m128 geBackward = _mm_cmpge_ps(curr, back);
            /* mask & value: all-ones mask keeps the value, zero mask yields 0.0f */
            const __m128 res = _mm_and_ps(_mm_and_ps(geForward, geBackward), curr);
            _mm_storeu_ps(&dst[idx], res);
        }

        /* Scalar tail for the last (endX - x) < 4 pixels of the row. */
        for (; x < endX; ++x, ++idx) {
            const size_t offset = delta[dir[idx]];
            const __m128 curr = _mm_load_ss(&src[idx]);
            const __m128 geForward = _mm_cmpge_ss(curr, _mm_load_ss(&src[idx + offset]));
            const __m128 geBackward = _mm_cmpge_ss(curr, _mm_load_ss(&src[idx - offset]));
            const __m128 res = _mm_and_ps(_mm_and_ps(geForward, geBackward), curr);
            _mm_store_ss(&dst[idx], res);
        }
    }
}
/* Benchmark image dimensions. Enum constants are integer constant expressions
   in both C and C++, so they can size the file-scope arrays below. */
enum { WIDTH = 1024, HEIGHT = 1024 };

/* Input magnitudes and direction codes shared by all benchmarked variants. */
float src[WIDTH * HEIGHT];
unsigned char dir[WIDTH * HEIGHT];

/* One output image per variant. Being file-scope objects they start out
   zero-filled. */
float dst1[WIDTH * HEIGHT];
float dst2[WIDTH * HEIGHT];
float dst3[WIDTH * HEIGHT];
float dst4[WIDTH * HEIGHT];
- int main() {
- srand(666);
- for (int p = 0; p < WIDTH * HEIGHT; p++) {
- src[p] = 0.0f + rand();
- dir[p] = rand() % 4;
- }
- {
- int start = clock();
- int done = 0;
- do {
- NonMaximumSuppression_original(dst1, src, dir, WIDTH, WIDTH, HEIGHT, 2, 2);
- done++;
- } while (clock() - start < CLOCKS_PER_SEC);
- printf("original: %0.3lf\n", 1000.0 * (clock() - start) / done / CLOCKS_PER_SEC);
- }
- {
- int start = clock();
- int done = 0;
- do {
- NonMaximumSuppression_branchless(dst2, src, dir, WIDTH, WIDTH, HEIGHT, 2, 2);
- done++;
- } while (clock() - start < CLOCKS_PER_SEC);
- printf("branchless: %0.3lf\n", 1000.0 * (clock() - start) / done / CLOCKS_PER_SEC);
- }
- {
- int start = clock();
- int done = 0;
- do {
- NonMaximumSuppression_scalarsse(dst3, src, dir, WIDTH, WIDTH, HEIGHT, 2, 2);
- done++;
- } while (clock() - start < CLOCKS_PER_SEC);
- printf("scalarsse: %0.3lf\n", 1000.0 * (clock() - start) / done / CLOCKS_PER_SEC);
- }
- {
- int start = clock();
- int done = 0;
- do {
- NonMaximumSuppression_hybrid(dst4, src, dir, WIDTH, WIDTH, HEIGHT, 2, 2);
- done++;
- } while (clock() - start < CLOCKS_PER_SEC);
- printf("hybrid: %0.3lf\n", 1000.0 * (clock() - start) / done / CLOCKS_PER_SEC);
- }
- for (int p = 0; p < WIDTH * HEIGHT; p++) {
- assert(dst1[p] == dst2[p]);
- assert(dst1[p] == dst3[p]);
- assert(dst1[p] == dst4[p]);
- }
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement