Guest User

Untitled

a guest
Jun 18th, 2018
82
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 9.21 KB | None | 0 0
  1. // various SSE1 and SSE2 floor function for float
  2. // assumes only normal input values
  3. // does not handle NAN or INF - they may or may not work
  4. // only guarantee is that 0, subnormal or normal float values will work
  5.  
  6. #include <emmintrin.h>
  7.  
  8. #ifdef _MSC_VER
  9. #include <intrin.h>
  10. #define NOINLINE __declspec(noinline)
  11. #else
  12. #define __rdtsc() __builtin_ia32_rdtsc()
  13. #define NOINLINE __attribute__((noinline))
  14. #endif
  15.  
  16. #include <stdio.h>
  17. #include <math.h>
  18.  
  19. // no inlining prevents compiler to vectorize benchmark loop
  20. static float NOINLINE std_floor(float x)
  21. {
  22. return floorf(x);
  23. }
  24.  
  25. // full float range
  26. static float sse_floor(float x)
  27. {
  28. __m128 kSignBit = _mm_set1_ps(-0.f);
  29. __m128 kOne = _mm_set1_ps(1.f);
  30. __m128 kNoFraction = _mm_set1_ps(8388608.f);
  31.  
  32. __m128 f = _mm_set_ss(x);
  33. __m128 r = f;
  34.  
  35. // r = (int)f;
  36. r = _mm_add_ss(r, kNoFraction);
  37. r = _mm_sub_ss(r, kNoFraction);
  38. r = _mm_sub_ss(r, kNoFraction);
  39. r = _mm_add_ss(r, kNoFraction);
  40.  
  41. // if (f < r) r -= 1;
  42. r = _mm_sub_ss(r, _mm_and_ps(kOne, _mm_cmplt_ss(f, r)));
  43.  
  44. // if (abs(f) > 2**23) r = f;
  45. __m128 m = _mm_cmplt_ss(kNoFraction, _mm_andnot_ps(kSignBit, f));
  46. r = _mm_or_ps(_mm_andnot_ps(m, r), _mm_and_ps(m, f));
  47.  
  48. return _mm_cvtss_f32(r);
  49. }
  50.  
  51. // only non-negative floats (x >= 0.f)
  52. static float sse_floor_pos(float x)
  53. {
  54. __m128 kOne = _mm_set1_ps(1.f);
  55. __m128 kNoFraction = _mm_set1_ps(8388608.f);
  56.  
  57. __m128 f = _mm_set_ss(x);
  58. __m128 r = f;
  59.  
  60. r = _mm_add_ss(r, kNoFraction);
  61. r = _mm_sub_ss(r, kNoFraction);
  62.  
  63. r = _mm_sub_ss(r, _mm_and_ps(kOne, _mm_cmplt_ss(f, r)));
  64.  
  65. __m128 m = _mm_cmplt_ss(kNoFraction, f);
  66. r = _mm_or_ps(_mm_andnot_ps(m, r), _mm_and_ps(m, f));
  67.  
  68. return _mm_cvtss_f32(r);
  69. }
  70.  
  71. // only non-positive floats (x <= 0.f)
  72. static float sse_floor_neg(float x)
  73. {
  74. __m128 kOne = _mm_set1_ps(1.f);
  75. __m128 kNoFraction = _mm_set1_ps(-8388608.f);
  76.  
  77. __m128 f = _mm_set_ss(x);
  78. __m128 r = f;
  79.  
  80. r = _mm_add_ss(r, kNoFraction);
  81. r = _mm_sub_ss(r, kNoFraction);
  82.  
  83. r = _mm_sub_ss(r, _mm_and_ps(kOne, _mm_cmplt_ss(f, r)));
  84.  
  85. __m128 m = _mm_cmplt_ss(f, kNoFraction);
  86. r = _mm_or_ps(_mm_andnot_ps(m, r), _mm_and_ps(m, f));
  87.  
  88. return _mm_cvtss_f32(r);
  89. }
  90.  
  91. // only [-8388608 .. +8388608] range
  92. static float sse_floor_small(float x)
  93. {
  94. __m128 kSignBit = _mm_set1_ps(-0.f);
  95. __m128 kOne = _mm_set1_ps(1.f);
  96. __m128 kNoFraction = _mm_set1_ps(8388608.f);
  97.  
  98. __m128 f = _mm_set_ss(x);
  99. __m128 r = f;
  100.  
  101. r = _mm_add_ss(r, kNoFraction);
  102. r = _mm_sub_ss(r, kNoFraction);
  103. r = _mm_sub_ss(r, kNoFraction);
  104. r = _mm_add_ss(r, kNoFraction);
  105.  
  106. r = _mm_sub_ss(r, _mm_and_ps(kOne, _mm_cmplt_ss(f, r)));
  107.  
  108. return _mm_cvtss_f32(r);
  109. }
  110.  
  111. // only [0 .. +8388608] range
  112. static float sse_floor_small_pos(float x)
  113. {
  114. __m128 kOne = _mm_set1_ps(1.f);
  115. __m128 kNoFraction = _mm_set1_ps(8388608.f);
  116.  
  117. __m128 f = _mm_set_ss(x);
  118. __m128 r = f;
  119.  
  120. r = _mm_add_ss(r, kNoFraction);
  121. r = _mm_sub_ss(r, kNoFraction);
  122.  
  123. r = _mm_sub_ss(r, _mm_and_ps(kOne, _mm_cmplt_ss(f, r)));
  124.  
  125. return _mm_cvtss_f32(r);
  126. }
  127.  
  128. // only [-8388608 .. 0] range
  129. static float sse_floor_small_neg(float x)
  130. {
  131. __m128 kOne = _mm_set1_ps(1.f);
  132. __m128 kNoFraction = _mm_set1_ps(-8388608.f);
  133.  
  134. __m128 f = _mm_set_ss(x);
  135. __m128 r = f;
  136.  
  137. r = _mm_add_ss(r, kNoFraction);
  138. r = _mm_sub_ss(r, kNoFraction);
  139.  
  140. r = _mm_sub_ss(r, _mm_and_ps(kOne, _mm_cmplt_ss(f, r)));
  141.  
  142. return _mm_cvtss_f32(r);
  143. }
  144.  
  145. // full float range
  146. static float sse2_floor(float x)
  147. {
  148. __m128 kSignBit = _mm_set1_ps(-0.f);
  149. __m128 kOne = _mm_set1_ps(1.f);
  150. __m128 kMaxValue = _mm_set1_ps(2147483648.f);
  151.  
  152. __m128 f = _mm_set_ss(x);
  153. __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(f));
  154. __m128 r = _mm_sub_ss(t, _mm_and_ps(_mm_cmplt_ss(f, t), kOne));
  155.  
  156. __m128 m = _mm_cmple_ss(kMaxValue, _mm_andnot_ps(kSignBit, f));
  157. r = _mm_or_ps(_mm_andnot_ps(m, r), _mm_and_ps(m, f));
  158.  
  159. return _mm_cvtss_f32(r);
  160. }
  161.  
  162. // only [-2147483648 .. +2147483647) range (actual range interval endpoints are truncated to values that fits in float)
  163. // or in other words: if floor(x) fits into int32_t, then result will be correct
  164. static float sse2_floor_small(float x)
  165. {
  166. __m128 kOne = _mm_set_ss(1.f);
  167.  
  168. __m128 f = _mm_set_ss(x);
  169. __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(f));
  170. __m128 r = _mm_sub_ss(t, _mm_and_ps(_mm_cmplt_ss(f, t), kOne));
  171.  
  172. return _mm_cvtss_f32(r);
  173. }
  174.  
  175. // only [0 .. +2147483647) range (actual range interval endpoint is truncated to value that fits in float)
  176. // same as above, bot only for non-negative values
  177. static float sse2_floor_small_pos(float x)
  178. {
  179. __m128 f = _mm_set_ss(x);
  180. __m128 r = _mm_cvtepi32_ps(_mm_cvttps_epi32(f));
  181.  
  182. return _mm_cvtss_f32(r);
  183. }
  184.  
  185. static float u2f(unsigned int u)
  186. {
  187. union
  188. {
  189. unsigned int u;
  190. float f;
  191. } uf;
  192.  
  193. uf.u = u;
  194. return uf.f;
  195. }
  196.  
  197. static unsigned int f2u(float f)
  198. {
  199. union
  200. {
  201. unsigned int u;
  202. float f;
  203. } uf;
  204.  
  205. uf.f = f;
  206. return uf.u;
  207. }
  208.  
  209. // total benchmark array size = 1MB, reasonable value to fully fit in CPU cache?
  210. #define BENCH_COUNT (256 * 1024)
  211.  
  212. // no inlining prevents compiler to optimize bechmark loop to NOP
  213. static NOINLINE float* bench_data()
  214. {
  215. static float data[BENCH_COUNT];
  216. return data;
  217. }
  218.  
  219. #define CHECK(f, x, expected, func) do \
  220. { \
  221. float actual = func(f); \
  222. if (actual != expected) \
  223. { \
  224. printf("ERROR: f=%1.8e (0x%08x) floorf=%1.8e %s=%1.8e\n", f, x, expected, #func, actual); \
  225. return 0; \
  226. } \
  227. } while (0)
  228.  
  229. #define BENCH(func) do \
  230. { \
  231. volatile float tmp = 0.f; \
  232. float f = tmp; \
  233. unsigned long long t1 = __rdtsc(); \
  234. for (int k=0; k<4096; k++) \
  235. { \
  236. float* data = bench_data(); \
  237. for (int i=0; i<BENCH_COUNT; i++) \
  238. { \
  239. f += func(data[i]); \
  240. } \
  241. } \
  242. tmp = f; \
  243. unsigned long long t2 = __rdtsc(); \
  244. printf("%-23s%5.1f\n", #func, (double)(t2 - t1) / BENCH_COUNT / 4096); \
  245. } while (0)
  246.  
  247. int main()
  248. {
  249. volatile unsigned int x = 0;
  250. do
  251. {
  252. if ((x % (1<<26) == 0))
  253. {
  254. printf(".");
  255. }
  256.  
  257. float f = u2f(x);
  258. int c = fpclassify(f);
  259. if (c == FP_ZERO || c == FP_NORMAL || c == FP_SUBNORMAL)
  260. {
  261. float expected = std_floor(f);
  262.  
  263. CHECK(f, x, expected, sse_floor);
  264. if (f >= 0.f)
  265. {
  266. CHECK(f, x, expected, sse_floor_pos);
  267. }
  268. if (f <= 0.f)
  269. {
  270. CHECK(f, x, expected, sse_floor_neg);
  271. }
  272. if (f >= -8388608.f && f <= 8388608.f)
  273. {
  274. CHECK(f, x, expected, sse_floor_small);
  275. }
  276. if (f >= 0.f && f <= 8388608.f)
  277. {
  278. CHECK(f, x, expected, sse_floor_small_pos);
  279. }
  280. if (f >= -8388608.f && f <= 0.f)
  281. {
  282. CHECK(f, x, expected, sse_floor_small_neg);
  283. }
  284.  
  285. CHECK(f, x, expected, sse2_floor);
  286. if (f >= -2147483648.f && f < +2147483647.f)
  287. {
  288. CHECK(f, x, expected, sse2_floor_small);
  289. }
  290. if (f >= 0.f && f < +2147483647.f)
  291. {
  292. CHECK(f, x, expected, sse2_floor_small_pos);
  293. }
  294. }
  295. x++;
  296. }
  297. while (x != 0);
  298. printf("\n");
  299.  
  300. // go over array just to warm up caches
  301. {
  302. volatile float tmp = 0.f;
  303. float f = tmp;
  304. float* data = bench_data();
  305. for (int i=0; i<BENCH_COUNT; i++)
  306. {
  307. f += floorf(data[i]);
  308. }
  309. tmp = f;
  310. }
  311.  
  312. // NOTE: don't fully trust benchmark results - verify it with your own code and data
  313. // this bencmark only tries to floor value "0" wich is repeated in a big array
  314.  
  315. BENCH(std_floor);
  316. BENCH(sse_floor);
  317. BENCH(sse_floor_pos);
  318. BENCH(sse_floor_neg);
  319. BENCH(sse_floor_small);
  320. BENCH(sse_floor_small_pos);
  321. BENCH(sse_floor_small_neg);
  322. BENCH(sse2_floor);
  323. BENCH(sse2_floor_small);
  324. BENCH(sse2_floor_small_pos);
  325. }
Add Comment
Please, Sign In to add comment