Guest User

Untitled

a guest
Nov 2nd, 2011
45
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #if 0
  2. [root@graphitemaster ~]# ./test
  3.  
  4.  
  5. kernel: 8.980000 (sec)
  6. sqrtf : 8.740000 (sec)
  7. sqrt  : 9.870000 (sec)
  8. johnc : 13.950000 (sec)
  9. sse   : 18.300000 (sec)
  10.  
  11. #endif
  12. #include <stdio.h>
  13. #include <time.h>
  14. #include <math.h>
  15. #include <limits.h>
  16. #include <stdlib.h>
  17. #include <xmmintrin.h>
  18.  
  19. /*
  20.  * Square root lookup table: entry i (0..255) holds 16 * sqrt(i) rounded down,
  21.  * so lookups are fast but results carry only about 8 significant bits.
  22.  */
  23. static const short int sqrt_table[] = {
  24.     0x0000, 0x0010, 0x0016, 0x001B, 0x0020, 0x0023, 0x0027, 0x002A,
  25.     0x002D, 0x0030, 0x0032, 0x0035, 0x0037, 0x0039, 0x003B, 0x003D,
  26.     0x0040, 0x0041, 0x0043, 0x0045, 0x0047, 0x0049, 0x004B, 0x004C,
  27.     0x004E, 0x0050, 0x0051, 0x0053, 0x0054, 0x0056, 0x0057, 0x0059,
  28.     0x005A, 0x005B, 0x005D, 0x005E, 0x0060, 0x0061, 0x0062, 0x0063,
  29.     0x0065, 0x0066, 0x0067, 0x0068, 0x006A, 0x006B, 0x006C, 0x006D,
  30.     0x006E, 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076,
  31.     0x0077, 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E,
  32.     0x0080, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086,
  33.     0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E,
  34.     0x008F, 0x0090, 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095,
  35.     0x0096, 0x0096, 0x0097, 0x0098, 0x0099, 0x009A, 0x009B, 0x009B,
  36.     0x009C, 0x009D, 0x009E, 0x009F, 0x00A0, 0x00A0, 0x00A1, 0x00A2,
  37.     0x00A3, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, 0x00A7, 0x00A8,
  38.     0x00A9, 0x00AA, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AD, 0x00AE,
  39.     0x00AF, 0x00B0, 0x00B0, 0x00B1, 0x00B2, 0x00B2, 0x00B3, 0x00B4,
  40.     0x00B5, 0x00B5, 0x00B6, 0x00B7, 0x00B7, 0x00B8, 0x00B9, 0x00B9,
  41.     0x00BA, 0x00BB, 0x00BB, 0x00BC, 0x00BD, 0x00BD, 0x00BE, 0x00BF,
  42.     0x00C0, 0x00C0, 0x00C1, 0x00C1, 0x00C2, 0x00C3, 0x00C3, 0x00C4,
  43.     0x00C5, 0x00C5, 0x00C6, 0x00C7, 0x00C7, 0x00C8, 0x00C9, 0x00C9,
  44.     0x00CA, 0x00CB, 0x00CB, 0x00CC, 0x00CC, 0x00CD, 0x00CE, 0x00CE,
  45.     0x00CF, 0x00D0, 0x00D0, 0x00D1, 0x00D1, 0x00D2, 0x00D3, 0x00D3,
  46.     0x00D4, 0x00D4, 0x00D5, 0x00D6, 0x00D6, 0x00D7, 0x00D7, 0x00D8,
  47.     0x00D9, 0x00D9, 0x00DA, 0x00DA, 0x00DB, 0x00DB, 0x00DC, 0x00DD,
  48.     0x00DD, 0x00DE, 0x00DE, 0x00DF, 0x00E0, 0x00E0, 0x00E1, 0x00E1,
  49.     0x00E2, 0x00E2, 0x00E3, 0x00E3, 0x00E4, 0x00E5, 0x00E5, 0x00E6,
  50.     0x00E6, 0x00E7, 0x00E7, 0x00E8, 0x00E8, 0x00E9, 0x00EA, 0x00EA,
  51.     0x00EB, 0x00EB, 0x00EC, 0x00EC, 0x00ED, 0x00ED, 0x00EE, 0x00EE,
  52.     0x00EF, 0x00F0, 0x00F0, 0x00F1, 0x00F1, 0x00F2, 0x00F2, 0x00F3,
  53.     0x00F3, 0x00F4, 0x00F4, 0x00F5, 0x00F5, 0x00F6, 0x00F6, 0x00F7,
  54.     0x00F7, 0x00F8, 0x00F8, 0x00F9, 0x00F9, 0x00FA, 0x00FA, 0x00FB,
  55.     0x00FB, 0x00FC, 0x00FC, 0x00FD, 0x00FD, 0x00FE, 0x00FE, 0x00FF,
  56. };
  57.  
  58. inline static int kernel_sqrt(int x)
  59. {
  60.          if (x >= 0x00010000) {
  61.          if (x >= 0x01000000) {
  62.          if (x >= 0x10000000) {
  63.          if (x >= 0x40000000) { return (sqrt_table[x >> 24] << 8); }
  64.     else                      { return (sqrt_table[x >> 22] << 7); }}
  65.     else if (x >= 0x04000000) { return (sqrt_table[x >> 20] << 6); }
  66.     else                      { return (sqrt_table[x >> 18] << 5); }}
  67.     else if (x >= 0x00100000) {
  68.          if (x >= 0x00400000) { return (sqrt_table[x >> 16] << 4); }
  69.     else                      { return (sqrt_table[x >> 14] << 3); }}
  70.     else if (x >= 0x00040000) { return (sqrt_table[x >> 12] << 2); }
  71.     else                      { return (sqrt_table[x >> 10] << 1); }}
  72.     else if (x >= 0x00000100) {
  73.          if (x >= 0x00001000) {
  74.          if (x >= 0x00004000) { return (sqrt_table[x >>  8] >> 0); }
  75.     else                      { return (sqrt_table[x >>  6] >> 1); }}
  76.     else if (x >= 0x00000400) { return (sqrt_table[x >>  4] >> 2); }
  77.     else                      { return (sqrt_table[x >>  2] >> 3); }}
  78.     else if (x >= 0x00000000) { return (sqrt_table[x >>  0] >> 4); }
  79.    
  80.     return -1;
  81. }
  82.  
  83. float johns_sqrt (float x)
  84. {
  85.     float in = x;
  86.     float xhalf = 0.5f*x;
  87.     int i = *(int*)&x;
  88.     i = 0x5f3759df - (i>>1);
  89.     x = *(float*)&i;
  90.     x = x*(1.5f - xhalf*x*x);
  91.     return x*in;
  92. }
  93.  
  94. inline void SSESqrt(float *pOut, float * pIn )
  95. {
  96.    _mm_store_ss( pOut, _mm_sqrt_ss( _mm_load_ss( pIn ) ) );
  97. }
  98.  
  99. float sse_sqrt(float x)
  100. {
  101.     float ret;
  102.     SSESqrt(&ret, &x);
  103.     return ret;
  104. }
  105.  
  106. #define CYCLES (INT_MAX/5)
  107. int main()
  108. {
  109.         int     i,j;
  110.         volatile int v;
  111.         clock_t x,y,z,w,c,k,p,u,f,g;
  112.         srand(time(0));
  113.         x = clock();
  114.         for (i=0; i<CYCLES; i++) {
  115.           v = kernel_sqrt((int)(rand()%9999+1));
  116.         }
  117.         y = clock();
  118.         z = clock();
  119.         for (j=0; j<CYCLES; j++) {
  120.           v = (int)sqrtf((int)(rand()%9999+1));
  121.         }
  122.         w = clock();
  123.         f = clock();
  124.         for (j=0; j<CYCLES; j++) {
  125.           v = (int)sqrt((int)(rand()%9999+1));
  126.         }
  127.         g = clock();
  128.         c = clock();
  129.         for (j=0; j<CYCLES; j++) {
  130.           v = (int)johns_sqrt((int)(rand()%9999+1));
  131.         }
  132.         k = clock();
  133.         p = clock();
  134.         for (j=0; j<CYCLES; j++) {
  135.           v = (int)sse_sqrt((int)(rand()%9999+1));
  136.         }
  137.         u = clock();
  138.  
  139.   printf("\n\n");
  140.   printf("kernel: %f (sec)\n", (double)(y-x)/CLOCKS_PER_SEC);
  141.   printf("sqrtf : %f (sec)\n", (double)(w-z)/CLOCKS_PER_SEC);
  142.   printf("sqrt  : %f (sec)\n", (double)(g-f)/CLOCKS_PER_SEC);
  143.   printf("johnc : %f (sec)\n", (double)(k-c)/CLOCKS_PER_SEC);
  144.   printf("sse   : %f (sec)\n", (double)(u-p)/CLOCKS_PER_SEC);
  145.  
  146.   return 0;
  147. }
  148.  
  149.  
RAW Paste Data