Advertisement
Guest User

Untitled

a guest
Mar 31st, 2012
114
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.66 KB | None | 0 0
  1. On Sandy Bridge i7. All reported numbers are thread cycles (QueryThreadCycleTime),
  2. best out of 7 runs over the given interval.
  3.  
  4. All runs go over the full set of floats in the given range. This tends to bias towards
  5. very small values (since there are more of them) and is not a realistic distribution, so
  6. keep that in mind. Reported values are total number of cycles and average number of cycles
  7. spent per 4-vector processed.
  8.  
  9. On SNB i7s, approx is barely faster; SNB has enough execution units to handle multiple
  10. cases in parallel, so the extra ops don't cost that much, it's all about the critical
  11. path (which is roughly the same length). This should look different on older i7s, Cores
  12. or P4s.
  13.  
  14. normalized values: all floats in [2^(-14),2^20-1]
  15. =================================================
  16. scalar fox tk: 942325268 cycles = 13.22 / vec
  17. "exact": 727730077 cycles = 10.21 / vec
  18. approx: 718301917 cycles = 10.07 / vec
  19.  
  20. denormal values: all floats in [2^(-25),2^(-14)-1] - DAZ/FZ (denormals are zero/flush to zero) flags OFF
  21. ====================================================
  22.  
  23. scalar fox tk: 307415404 cycles = 13.33 / vec
  24. "exact": 3832946063 cycles = 166.15 / vec
  25. approx: 3742647226 cycles = 162.24 / vec
  26.  
  27. denormal values: all floats in [2^(-25),2^(-14)-1] - DAZ/FZ (denormals are zero/flush to zero) flags ON
  28. ====================================================
  29.  
  30. scalar fox tk: 304416821 cycles = 13.20 / vec
  31. "exact": 247482147 cycles = 10.73 / vec
  32. approx: 234123906 cycles = 10.15 / vec
  33.  
  34. large range: all floats in [2^(-25),2^20-1] - DAZ/FZ (denormals are zero/flush to zero) flags OFF
  35. =================================================
  36.  
  37. scalar fox tk: 1251434493 cycles = 13.26 / vec
  38. "exact": 4642612090 cycles = 49.19 / vec
  39. approx: 4459117788 cycles = 47.25 / vec
  40.  
  41. large range: all floats in [2^(-25),2^20-1] - DAZ/FZ (denormals are zero/flush to zero) flags ON
  42. =================================================
  43.  
  44. scalar fox tk: 1249673516 cycles = 13.24 / vec
  45. "exact": 953292988 cycles = 10.10 / vec
  46. approx: 949879436 cycles = 10.07 / vec
  47.  
  48. ----------------
  49.  
  50. // Test code:
  51.  
  52. // int start = (127 - 14) << 23, end = (127 + 20) << 23;
  53. // int start = (127 - 15 - 10) << 23, end = (127 - 14) << 23;
  54. int start = (127 - 15 - 10) << 23, end = (127 + 20) << 23;
  55. static __m128i output[1024];
  56. HANDLE hThread = GetCurrentThread();
  57.  
  58. uint64 best = ~0ull;
  59.  
  60. // comment out next line to get benchmark without FZ/DAZ
  61. _mm_setcsr(_mm_getcsr() | 0x8040); // set FZ/DAZ flags
  62.  
  63. for (int runs=0; runs < 7; runs++)
  64. {
  65. __m128i vals = _mm_set_epi32(start + 3, start + 2, start + 1, start + 0);
  66. __m128i incr = _mm_set1_epi32(4);
  67.  
  68. uint64 tstart, tend;
  69. QueryThreadCycleTime(hThread, &tstart);
  70.  
  71. for (int i=start; i < end; i += 4)
  72. {
  73. #if 0 // scalar
  74. __m128i *p = &output[i & 1023];
  75. p->m128i_u32[0] = float_to_half_foxtk(i + 0);
  76. p->m128i_u32[1] = float_to_half_foxtk(i + 1);
  77. p->m128i_u32[2] = float_to_half_foxtk(i + 2);
  78. p->m128i_u32[3] = float_to_half_foxtk(i + 3);
  79. #else // SSE: flip between float_to_half_SSE2 and approx_float_to_half_SSE2 here
  80. __m128i out = approx_float_to_half_SSE2(_mm_castsi128_ps(vals));
  81. _mm_store_si128(&output[i & 1023], out);
  82. #endif
  83. vals = _mm_add_epi32(vals, incr);
  84. }
  85.  
  86. QueryThreadCycleTime(hThread, &tend);
  87.  
  88. uint64 time = tend - tstart;
  89. if (time < best)
  90. best = time;
  91. }
  92.  
  93. printf("best: %lld cycles = %.2f / vec\n", best, 4.0f * best / (end - start));
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement