Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #if 0
- [root@graphitemaster ~]# ./test
- kernel: 8.980000 (sec)
- sqrtf : 8.740000 (sec)
- sqrt : 9.870000 (sec)
- johnc : 13.950000 (sec)
- sse : 18.300000 (sec)
- #endif
- #include <stdio.h>
- #include <time.h>
- #include <math.h>
- #include <limits.h>
- #include <stdlib.h>
- #include <xmmintrin.h>
- /*
- * Square root lookup table, this should be
- * fast, again, accuracy is lost.
- */
- static const short int sqrt_table[] = {
- 0x0000, 0x0010, 0x0016, 0x001B, 0x0020, 0x0023, 0x0027, 0x002A,
- 0x002D, 0x0030, 0x0032, 0x0035, 0x0037, 0x0039, 0x003B, 0x003D,
- 0x0040, 0x0041, 0x0043, 0x0045, 0x0047, 0x0049, 0x004B, 0x004C,
- 0x004E, 0x0050, 0x0051, 0x0053, 0x0054, 0x0056, 0x0057, 0x0059,
- 0x005A, 0x005B, 0x005D, 0x005E, 0x0060, 0x0061, 0x0062, 0x0063,
- 0x0065, 0x0066, 0x0067, 0x0068, 0x006A, 0x006B, 0x006C, 0x006D,
- 0x006E, 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076,
- 0x0077, 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E,
- 0x0080, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086,
- 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E,
- 0x008F, 0x0090, 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095,
- 0x0096, 0x0096, 0x0097, 0x0098, 0x0099, 0x009A, 0x009B, 0x009B,
- 0x009C, 0x009D, 0x009E, 0x009F, 0x00A0, 0x00A0, 0x00A1, 0x00A2,
- 0x00A3, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, 0x00A7, 0x00A8,
- 0x00A9, 0x00AA, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AD, 0x00AE,
- 0x00AF, 0x00B0, 0x00B0, 0x00B1, 0x00B2, 0x00B2, 0x00B3, 0x00B4,
- 0x00B5, 0x00B5, 0x00B6, 0x00B7, 0x00B7, 0x00B8, 0x00B9, 0x00B9,
- 0x00BA, 0x00BB, 0x00BB, 0x00BC, 0x00BD, 0x00BD, 0x00BE, 0x00BF,
- 0x00C0, 0x00C0, 0x00C1, 0x00C1, 0x00C2, 0x00C3, 0x00C3, 0x00C4,
- 0x00C5, 0x00C5, 0x00C6, 0x00C7, 0x00C7, 0x00C8, 0x00C9, 0x00C9,
- 0x00CA, 0x00CB, 0x00CB, 0x00CC, 0x00CC, 0x00CD, 0x00CE, 0x00CE,
- 0x00CF, 0x00D0, 0x00D0, 0x00D1, 0x00D1, 0x00D2, 0x00D3, 0x00D3,
- 0x00D4, 0x00D4, 0x00D5, 0x00D6, 0x00D6, 0x00D7, 0x00D7, 0x00D8,
- 0x00D9, 0x00D9, 0x00DA, 0x00DA, 0x00DB, 0x00DB, 0x00DC, 0x00DD,
- 0x00DD, 0x00DE, 0x00DE, 0x00DF, 0x00E0, 0x00E0, 0x00E1, 0x00E1,
- 0x00E2, 0x00E2, 0x00E3, 0x00E3, 0x00E4, 0x00E5, 0x00E5, 0x00E6,
- 0x00E6, 0x00E7, 0x00E7, 0x00E8, 0x00E8, 0x00E9, 0x00EA, 0x00EA,
- 0x00EB, 0x00EB, 0x00EC, 0x00EC, 0x00ED, 0x00ED, 0x00EE, 0x00EE,
- 0x00EF, 0x00F0, 0x00F0, 0x00F1, 0x00F1, 0x00F2, 0x00F2, 0x00F3,
- 0x00F3, 0x00F4, 0x00F4, 0x00F5, 0x00F5, 0x00F6, 0x00F6, 0x00F7,
- 0x00F7, 0x00F8, 0x00F8, 0x00F9, 0x00F9, 0x00FA, 0x00FA, 0x00FB,
- 0x00FB, 0x00FC, 0x00FC, 0x00FD, 0x00FD, 0x00FE, 0x00FE, 0x00FF,
- };
- inline static int kernel_sqrt(int x)
- {
- if (x >= 0x00010000) {
- if (x >= 0x01000000) {
- if (x >= 0x10000000) {
- if (x >= 0x40000000) { return (sqrt_table[x >> 24] << 8); }
- else { return (sqrt_table[x >> 22] << 7); }}
- else if (x >= 0x04000000) { return (sqrt_table[x >> 20] << 6); }
- else { return (sqrt_table[x >> 18] << 5); }}
- else if (x >= 0x00100000) {
- if (x >= 0x00400000) { return (sqrt_table[x >> 16] << 4); }
- else { return (sqrt_table[x >> 14] << 3); }}
- else if (x >= 0x00040000) { return (sqrt_table[x >> 12] << 2); }
- else { return (sqrt_table[x >> 10] << 1); }}
- else if (x >= 0x00000100) {
- if (x >= 0x00001000) {
- if (x >= 0x00004000) { return (sqrt_table[x >> 8] >> 0); }
- else { return (sqrt_table[x >> 6] >> 1); }}
- else if (x >= 0x00000400) { return (sqrt_table[x >> 4] >> 2); }
- else { return (sqrt_table[x >> 2] >> 3); }}
- else if (x >= 0x00000000) { return (sqrt_table[x >> 0] >> 4); }
- return -1;
- }
- float johns_sqrt (float x)
- {
- float in = x;
- float xhalf = 0.5f*x;
- int i = *(int*)&x;
- i = 0x5f3759df - (i>>1);
- x = *(float*)&i;
- x = x*(1.5f - xhalf*x*x);
- return x*in;
- }
- inline void SSESqrt(float *pOut, float * pIn )
- {
- _mm_store_ss( pOut, _mm_sqrt_ss( _mm_load_ss( pIn ) ) );
- }
- float sse_sqrt(float x)
- {
- float ret;
- SSESqrt(&ret, &x);
- return ret;
- }
- #define CYCLES (INT_MAX/5)
- int main()
- {
- int i,j;
- volatile int v;
- clock_t x,y,z,w,c,k,p,u,f,g;
- srand(time(0));
- x = clock();
- for (i=0; i<CYCLES; i++) {
- v = kernel_sqrt((int)(rand()%9999+1));
- }
- y = clock();
- z = clock();
- for (j=0; j<CYCLES; j++) {
- v = (int)sqrtf((int)(rand()%9999+1));
- }
- w = clock();
- f = clock();
- for (j=0; j<CYCLES; j++) {
- v = (int)sqrt((int)(rand()%9999+1));
- }
- g = clock();
- c = clock();
- for (j=0; j<CYCLES; j++) {
- v = (int)johns_sqrt((int)(rand()%9999+1));
- }
- k = clock();
- p = clock();
- for (j=0; j<CYCLES; j++) {
- v = (int)sse_sqrt((int)(rand()%9999+1));
- }
- u = clock();
- printf("\n\n");
- printf("kernel: %f (sec)\n", (double)(y-x)/CLOCKS_PER_SEC);
- printf("sqrtf : %f (sec)\n", (double)(w-z)/CLOCKS_PER_SEC);
- printf("sqrt : %f (sec)\n", (double)(g-f)/CLOCKS_PER_SEC);
- printf("johnc : %f (sec)\n", (double)(k-c)/CLOCKS_PER_SEC);
- printf("sse : %f (sec)\n", (double)(u-p)/CLOCKS_PER_SEC);
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement