Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /* invsqrt.c
- * This program is used to analyse difference methods computing
- * square root reciprocal.
- *
- * Written by littleshan <[email protected]>.
- * NO RIGHT RESERVED, you can modify or redistribute this file as you want.
- */
- #include <stdio.h>
- #include <stdlib.h>
- #include <math.h>
- #include <string.h>
- #include <time.h>
- #include <stddef.h>
- #include <stdint.h>
- /* Uncomment the following line if you want to check computation error
- * against the naive one.
- */
- #define CHECK_ERROR
- /* The size of the input array. If you want to eliminate factors caused
- * by memory latency, please make it smaller then L1 cache size.
- * Note that you should multiply this value by sizeof(float) to get
- * the actual size.
- */
- #ifndef SIZE
- #define SIZE 1024 /* default is 4KB */
- #endif
- /* We use rdtsc to read timestamp count from CPU. */
- uint64_t rdtsc(void){
- uint64_t tick;
- #ifdef __x86_64__
- __asm__ __volatile__(
- "xorq %%rax, %%rax\n\t"
- "rdtsc\n\t"
- "shlq $32, %%rdx\n\t"
- "xorq %%rdx, %%rax"
- : "=a"(tick)
- :
- : "rdx"
- );
- #else
- __asm__ __volatile__(
- "rdtsc"
- : "=A"(tick)
- );
- #endif
- return tick;
- }
- void inv_sqrt_naive(float* begin, float* end, float* output)
- {
- /* naive method */
- for(; begin < end; ++begin, ++output)
- *output = 1.0f / sqrtf(*begin);
- }
- void inv_sqrt_marvelous(float* begin, float* end, float* output)
- {
- /* marvelous method
- * from http://www.beyond3d.com/articles/fastinvsqrt/
- */
- float xhalf;
- /* We must use union to prevent breaking the strict aliasing rule. */
- union {
- int i;
- float f;
- } x;
- for(; begin < end; ++begin, ++output){
- xhalf = 0.5f * (*begin);
- x.f = *begin;
- x.i = 0x5f3759df - (x.i>>1);
- *output = x.f * (1.5f - xhalf * x.f * x.f);
- }
- }
- void inv_sqrt_sse(float* begin, float* end, float* output)
- {
- /* vectorized SSE */
- /* Since we process 16 floating point values at a time, there are
- * remaining ones if the input size is not a multiple of 16, and
- * we must take care of them.
- */
- size_t padding = (end - begin) % 16;
- for(; padding > 0; --padding, ++begin, ++output)
- *output = 1.0f / sqrt(*begin);
- __asm__ __volatile__(
- "1:\n\t"
- "cmp %0, %1\n\t"
- "jbe 2f\n\t"
- "movups (%0), %%xmm0\n\t"
- "movups 16(%0), %%xmm1\n\t"
- "movups 32(%0), %%xmm2\n\t"
- "movups 48(%0), %%xmm3\n\t"
- "rsqrtps %%xmm0, %%xmm0\n\t"
- "rsqrtps %%xmm1, %%xmm1\n\t"
- "rsqrtps %%xmm2, %%xmm2\n\t"
- "rsqrtps %%xmm3, %%xmm3\n\t"
- "movups %%xmm0, (%2)\n\t"
- "movups %%xmm1, 16(%2)\n\t"
- "movups %%xmm2, 32(%2)\n\t"
- "movups %%xmm3, 48(%2)\n\t"
- "add $64, %0\n\t"
- "add $64, %2\n\t"
- "jmp 1b\n\t"
- "2:\n\t"
- :
- : "r"(begin), "r"(end), "r"(output)
- : "xmm0", "xmm1", "xmm2", "xmm3"
- );
- }
- /* This function checks the answer against the naive one. */
- float check_error(const float* begin, const float* end, const float* output)
- {
- float max_err = 0.0f;
- float err;
- float ans;
- for(; begin < end; ++begin, ++output){
- ans = 1.0f / sqrtf(*begin);
- err = fabsf(ans - *output);
- max_err = err > max_err ? err : max_err;
- }
- return max_err;
- }
- void benchmark(
- const char* name, /* name of this method */
- void (*fptr)(float*, float*, float*), /* function pointer */
- float* begin,
- float* end,
- float* output )
- {
- uint64_t tick;
- printf("\n%s\n", name);
- tick = rdtsc();
- fptr(begin, end, output);
- tick = rdtsc() - tick;
- #ifdef CHECK_ERROR
- printf(
- " CPU cycles used: %10llu, error: %f\n",
- tick,
- check_error(begin, end, output)
- );
- #else
- printf(" CPU cycles used: %10llu\n", tick);
- #endif
- }
- int main(void)
- {
- int i;
- float *input = (float*)malloc( sizeof(float) * SIZE );
- float *output = (float*)malloc( sizeof(float) * SIZE );
- /* generate random input, from 0.5 to 1.5 */
- srand(time(0));
- for(i = 0; i < SIZE; i++)
- input[i] = (float)rand() / RAND_MAX + 0.5f;
- printf("Input array size is %u\n", SIZE);
- benchmark("Naive sqrtf()", inv_sqrt_naive, input, input+SIZE, output);
- benchmark("Vectorized SSE", inv_sqrt_sse, input, input+SIZE, output);
- benchmark("Marvelous", inv_sqrt_marvelous, input, input+SIZE, output);
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement