Advertisement
Guest User

Untitled

a guest
Nov 2nd, 2011
1,434
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 4.12 KB | None | 0 0
  1. /* invsqrt.c
  2.  * This program is used to analyse difference methods computing
  3.  * square root reciprocal.
  4.  *
  5.  * Written by littleshan <[email protected]>.
  6.  * NO RIGHT RESERVED, you can modify or redistribute this file as you want.
  7.  */
  8.  
  9. #include <stdio.h>
  10. #include <stdlib.h>
  11. #include <math.h>
  12. #include <string.h>
  13. #include <time.h>
  14. #include <stddef.h>
  15. #include <stdint.h>
  16.  
  17. /* Uncomment the following line if you want to check computation error
  18.  * against the naive one.
  19.  */
  20. #define CHECK_ERROR
  21.  
  22. /* The size of the input array. If you want to eliminate factors caused
  23.  * by memory latency, please make it smaller then L1 cache size.
  24.  * Note that you should multiply this value by sizeof(float) to get
  25.  * the actual size.
  26.  */
  27. #ifndef SIZE
  28. #define SIZE 1024   /* default is 4KB */
  29. #endif
  30.  
  31.  
  32. /* We use rdtsc to read timestamp count from CPU. */
  33. uint64_t rdtsc(void){
  34.     uint64_t tick;
  35.  
  36. #ifdef __x86_64__
  37.     __asm__ __volatile__(
  38.         "xorq %%rax, %%rax\n\t"
  39.         "rdtsc\n\t"
  40.         "shlq $32, %%rdx\n\t"
  41.         "xorq %%rdx, %%rax"
  42.         : "=a"(tick)
  43.         :
  44.         : "rdx"
  45.     );
  46. #else
  47.     __asm__ __volatile__(
  48.         "rdtsc"
  49.         : "=A"(tick)
  50.     );
  51. #endif
  52.  
  53.     return tick;
  54. }
  55.  
  56. void inv_sqrt_naive(float* begin, float* end, float* output)
  57. {
  58.     /* naive method */
  59.     for(; begin < end; ++begin, ++output)
  60.         *output = 1.0f / sqrtf(*begin);
  61. }
  62.  
  63. void inv_sqrt_marvelous(float* begin, float* end, float* output)
  64. {
  65.     /* marvelous method
  66.      * from http://www.beyond3d.com/articles/fastinvsqrt/
  67.      */
  68.     float xhalf;
  69.     /* We must use union to prevent breaking the strict aliasing rule. */
  70.     union {
  71.         int i;
  72.         float f;
  73.     } x;
  74.     for(; begin < end; ++begin, ++output){
  75.         xhalf = 0.5f * (*begin);
  76.         x.f = *begin;
  77.         x.i = 0x5f3759df - (x.i>>1);
  78.         *output = x.f * (1.5f - xhalf * x.f * x.f);
  79.     }
  80. }
  81.  
  82. void inv_sqrt_sse(float* begin, float* end, float* output)
  83. {
  84.     /* vectorized SSE */
  85.  
  86.     /* Since we process 16 floating point values at a time, there are
  87.      * remaining ones if the input size is not a multiple of 16, and
  88.      * we must take care of them.
  89.      */
  90.     size_t padding = (end - begin) % 16;
  91.     for(; padding > 0; --padding, ++begin, ++output)
  92.         *output = 1.0f / sqrt(*begin);
  93.  
  94.     __asm__ __volatile__(
  95.     "1:\n\t"
  96.         "cmp %0, %1\n\t"
  97.         "jbe 2f\n\t"
  98.         "movups (%0),   %%xmm0\n\t"
  99.         "movups 16(%0), %%xmm1\n\t"
  100.         "movups 32(%0), %%xmm2\n\t"
  101.         "movups 48(%0), %%xmm3\n\t"
  102.         "rsqrtps %%xmm0, %%xmm0\n\t"
  103.         "rsqrtps %%xmm1, %%xmm1\n\t"
  104.         "rsqrtps %%xmm2, %%xmm2\n\t"
  105.         "rsqrtps %%xmm3, %%xmm3\n\t"
  106.         "movups %%xmm0, (%2)\n\t"
  107.         "movups %%xmm1, 16(%2)\n\t"
  108.         "movups %%xmm2, 32(%2)\n\t"
  109.         "movups %%xmm3, 48(%2)\n\t"
  110.         "add $64, %0\n\t"
  111.         "add $64, %2\n\t"
  112.         "jmp 1b\n\t"
  113.     "2:\n\t"
  114.         :
  115.         : "r"(begin), "r"(end), "r"(output)
  116.         : "xmm0", "xmm1", "xmm2", "xmm3"
  117.     );
  118. }
  119.  
  120. /* This function checks the answer against the naive one. */
  121. float check_error(const float* begin, const float* end, const float* output)
  122. {
  123.     float max_err = 0.0f;
  124.     float err;
  125.     float ans;
  126.     for(; begin < end; ++begin, ++output){
  127.         ans = 1.0f / sqrtf(*begin);
  128.         err = fabsf(ans - *output);
  129.         max_err = err > max_err ? err : max_err;
  130.     }
  131.     return max_err;
  132. }
  133.  
  134. void benchmark(
  135.         const char* name, /* name of this method */
  136.         void (*fptr)(float*, float*, float*), /* function pointer */
  137.         float* begin,
  138.         float* end,
  139.         float* output )
  140. {
  141.     uint64_t tick;
  142.     printf("\n%s\n", name);
  143.     tick = rdtsc();
  144.     fptr(begin, end, output);
  145.     tick = rdtsc() - tick;
  146. #ifdef CHECK_ERROR
  147.     printf(
  148.         "    CPU cycles used: %10llu, error: %f\n",
  149.         tick,
  150.         check_error(begin, end, output)
  151.     );
  152. #else
  153.     printf("    CPU cycles used: %10llu\n", tick);
  154. #endif
  155. }
  156.  
  157. int main(void)
  158. {
  159.     int i;
  160.     float *input = (float*)malloc( sizeof(float) * SIZE );
  161.     float *output = (float*)malloc( sizeof(float) * SIZE );
  162.  
  163.     /* generate random input, from 0.5 to 1.5 */
  164.     srand(time(0));
  165.     for(i = 0; i < SIZE; i++)
  166.         input[i] = (float)rand() / RAND_MAX + 0.5f;
  167.  
  168.     printf("Input array size is %u\n", SIZE);
  169.    
  170.     benchmark("Naive sqrtf()", inv_sqrt_naive, input, input+SIZE, output);
  171.     benchmark("Vectorized SSE", inv_sqrt_sse, input, input+SIZE, output);
  172.     benchmark("Marvelous", inv_sqrt_marvelous, input, input+SIZE, output);
  173.  
  174.     return 0;
  175. }
  176.  
  177.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement