Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // various SSE1 and SSE2 floor function for float
- // assumes only normal input values
- // does not handle NAN or INF - they may or may not work
- // only guarantee is that 0, subnormal or normal float values will work
- #include <emmintrin.h>
- #ifdef _MSC_VER
- #include <intrin.h>
- #define NOINLINE __declspec(noinline)
- #else
- #define __rdtsc() __builtin_ia32_rdtsc()
- #define NOINLINE __attribute__((noinline))
- #endif
- #include <stdio.h>
- #include <math.h>
- // no inlining prevents compiler to vectorize benchmark loop
- static float NOINLINE std_floor(float x)
- {
- return floorf(x);
- }
- // full float range
- static float sse_floor(float x)
- {
- __m128 kSignBit = _mm_set1_ps(-0.f);
- __m128 kOne = _mm_set1_ps(1.f);
- __m128 kNoFraction = _mm_set1_ps(8388608.f);
- __m128 f = _mm_set_ss(x);
- __m128 r = f;
- // r = (int)f;
- r = _mm_add_ss(r, kNoFraction);
- r = _mm_sub_ss(r, kNoFraction);
- r = _mm_sub_ss(r, kNoFraction);
- r = _mm_add_ss(r, kNoFraction);
- // if (f < r) r -= 1;
- r = _mm_sub_ss(r, _mm_and_ps(kOne, _mm_cmplt_ss(f, r)));
- // if (abs(f) > 2**23) r = f;
- __m128 m = _mm_cmplt_ss(kNoFraction, _mm_andnot_ps(kSignBit, f));
- r = _mm_or_ps(_mm_andnot_ps(m, r), _mm_and_ps(m, f));
- return _mm_cvtss_f32(r);
- }
- // only non-negative floats (x >= 0.f)
- static float sse_floor_pos(float x)
- {
- __m128 kOne = _mm_set1_ps(1.f);
- __m128 kNoFraction = _mm_set1_ps(8388608.f);
- __m128 f = _mm_set_ss(x);
- __m128 r = f;
- r = _mm_add_ss(r, kNoFraction);
- r = _mm_sub_ss(r, kNoFraction);
- r = _mm_sub_ss(r, _mm_and_ps(kOne, _mm_cmplt_ss(f, r)));
- __m128 m = _mm_cmplt_ss(kNoFraction, f);
- r = _mm_or_ps(_mm_andnot_ps(m, r), _mm_and_ps(m, f));
- return _mm_cvtss_f32(r);
- }
- // only non-positive floats (x <= 0.f)
- static float sse_floor_neg(float x)
- {
- __m128 kOne = _mm_set1_ps(1.f);
- __m128 kNoFraction = _mm_set1_ps(-8388608.f);
- __m128 f = _mm_set_ss(x);
- __m128 r = f;
- r = _mm_add_ss(r, kNoFraction);
- r = _mm_sub_ss(r, kNoFraction);
- r = _mm_sub_ss(r, _mm_and_ps(kOne, _mm_cmplt_ss(f, r)));
- __m128 m = _mm_cmplt_ss(f, kNoFraction);
- r = _mm_or_ps(_mm_andnot_ps(m, r), _mm_and_ps(m, f));
- return _mm_cvtss_f32(r);
- }
- // only [-8388608 .. +8388608] range
- static float sse_floor_small(float x)
- {
- __m128 kSignBit = _mm_set1_ps(-0.f);
- __m128 kOne = _mm_set1_ps(1.f);
- __m128 kNoFraction = _mm_set1_ps(8388608.f);
- __m128 f = _mm_set_ss(x);
- __m128 r = f;
- r = _mm_add_ss(r, kNoFraction);
- r = _mm_sub_ss(r, kNoFraction);
- r = _mm_sub_ss(r, kNoFraction);
- r = _mm_add_ss(r, kNoFraction);
- r = _mm_sub_ss(r, _mm_and_ps(kOne, _mm_cmplt_ss(f, r)));
- return _mm_cvtss_f32(r);
- }
- // only [0 .. +8388608] range
- static float sse_floor_small_pos(float x)
- {
- __m128 kOne = _mm_set1_ps(1.f);
- __m128 kNoFraction = _mm_set1_ps(8388608.f);
- __m128 f = _mm_set_ss(x);
- __m128 r = f;
- r = _mm_add_ss(r, kNoFraction);
- r = _mm_sub_ss(r, kNoFraction);
- r = _mm_sub_ss(r, _mm_and_ps(kOne, _mm_cmplt_ss(f, r)));
- return _mm_cvtss_f32(r);
- }
- // only [-8388608 .. 0] range
- static float sse_floor_small_neg(float x)
- {
- __m128 kOne = _mm_set1_ps(1.f);
- __m128 kNoFraction = _mm_set1_ps(-8388608.f);
- __m128 f = _mm_set_ss(x);
- __m128 r = f;
- r = _mm_add_ss(r, kNoFraction);
- r = _mm_sub_ss(r, kNoFraction);
- r = _mm_sub_ss(r, _mm_and_ps(kOne, _mm_cmplt_ss(f, r)));
- return _mm_cvtss_f32(r);
- }
- // full float range
- static float sse2_floor(float x)
- {
- __m128 kSignBit = _mm_set1_ps(-0.f);
- __m128 kOne = _mm_set1_ps(1.f);
- __m128 kMaxValue = _mm_set1_ps(2147483648.f);
- __m128 f = _mm_set_ss(x);
- __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(f));
- __m128 r = _mm_sub_ss(t, _mm_and_ps(_mm_cmplt_ss(f, t), kOne));
- __m128 m = _mm_cmple_ss(kMaxValue, _mm_andnot_ps(kSignBit, f));
- r = _mm_or_ps(_mm_andnot_ps(m, r), _mm_and_ps(m, f));
- return _mm_cvtss_f32(r);
- }
- // only [-2147483648 .. +2147483647) range (actual range interval endpoints are truncated to values that fits in float)
- // or in other words: if floor(x) fits into int32_t, then result will be correct
- static float sse2_floor_small(float x)
- {
- __m128 kOne = _mm_set_ss(1.f);
- __m128 f = _mm_set_ss(x);
- __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(f));
- __m128 r = _mm_sub_ss(t, _mm_and_ps(_mm_cmplt_ss(f, t), kOne));
- return _mm_cvtss_f32(r);
- }
- // only [0 .. +2147483647) range (actual range interval endpoint is truncated to value that fits in float)
- // same as above, bot only for non-negative values
- static float sse2_floor_small_pos(float x)
- {
- __m128 f = _mm_set_ss(x);
- __m128 r = _mm_cvtepi32_ps(_mm_cvttps_epi32(f));
- return _mm_cvtss_f32(r);
- }
- static float u2f(unsigned int u)
- {
- union
- {
- unsigned int u;
- float f;
- } uf;
- uf.u = u;
- return uf.f;
- }
- static unsigned int f2u(float f)
- {
- union
- {
- unsigned int u;
- float f;
- } uf;
- uf.f = f;
- return uf.u;
- }
- // total benchmark array size = 1MB, reasonable value to fully fit in CPU cache?
- #define BENCH_COUNT (256 * 1024)
- // no inlining prevents compiler to optimize bechmark loop to NOP
- static NOINLINE float* bench_data()
- {
- static float data[BENCH_COUNT];
- return data;
- }
- #define CHECK(f, x, expected, func) do \
- { \
- float actual = func(f); \
- if (actual != expected) \
- { \
- printf("ERROR: f=%1.8e (0x%08x) floorf=%1.8e %s=%1.8e\n", f, x, expected, #func, actual); \
- return 0; \
- } \
- } while (0)
- #define BENCH(func) do \
- { \
- volatile float tmp = 0.f; \
- float f = tmp; \
- unsigned long long t1 = __rdtsc(); \
- for (int k=0; k<4096; k++) \
- { \
- float* data = bench_data(); \
- for (int i=0; i<BENCH_COUNT; i++) \
- { \
- f += func(data[i]); \
- } \
- } \
- tmp = f; \
- unsigned long long t2 = __rdtsc(); \
- printf("%-23s%5.1f\n", #func, (double)(t2 - t1) / BENCH_COUNT / 4096); \
- } while (0)
- int main()
- {
- volatile unsigned int x = 0;
- do
- {
- if ((x % (1<<26) == 0))
- {
- printf(".");
- }
- float f = u2f(x);
- int c = fpclassify(f);
- if (c == FP_ZERO || c == FP_NORMAL || c == FP_SUBNORMAL)
- {
- float expected = std_floor(f);
- CHECK(f, x, expected, sse_floor);
- if (f >= 0.f)
- {
- CHECK(f, x, expected, sse_floor_pos);
- }
- if (f <= 0.f)
- {
- CHECK(f, x, expected, sse_floor_neg);
- }
- if (f >= -8388608.f && f <= 8388608.f)
- {
- CHECK(f, x, expected, sse_floor_small);
- }
- if (f >= 0.f && f <= 8388608.f)
- {
- CHECK(f, x, expected, sse_floor_small_pos);
- }
- if (f >= -8388608.f && f <= 0.f)
- {
- CHECK(f, x, expected, sse_floor_small_neg);
- }
- CHECK(f, x, expected, sse2_floor);
- if (f >= -2147483648.f && f < +2147483647.f)
- {
- CHECK(f, x, expected, sse2_floor_small);
- }
- if (f >= 0.f && f < +2147483647.f)
- {
- CHECK(f, x, expected, sse2_floor_small_pos);
- }
- }
- x++;
- }
- while (x != 0);
- printf("\n");
- // go over array just to warm up caches
- {
- volatile float tmp = 0.f;
- float f = tmp;
- float* data = bench_data();
- for (int i=0; i<BENCH_COUNT; i++)
- {
- f += floorf(data[i]);
- }
- tmp = f;
- }
- // NOTE: don't fully trust benchmark results - verify it with your own code and data
- // this bencmark only tries to floor value "0" wich is repeated in a big array
- BENCH(std_floor);
- BENCH(sse_floor);
- BENCH(sse_floor_pos);
- BENCH(sse_floor_neg);
- BENCH(sse_floor_small);
- BENCH(sse_floor_small_pos);
- BENCH(sse_floor_small_neg);
- BENCH(sse2_floor);
- BENCH(sse2_floor_small);
- BENCH(sse2_floor_small_pos);
- }
Add Comment
Please, Sign In to add comment