Untitled

/* invsqrt.c
 * This program is used to analyse difference methods computing
 * square root reciprocal.
 *
 * Written by littleshan <[email protected]>.
 * NO RIGHT RESERVED, you can modify or redistribute this file as you want.
 */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <time.h>
#include <stddef.h>
#include <stdint.h>

/* Uncomment the following line if you want to check computation error
 * against the naive one.
 */
#define CHECK_ERROR

/* The size of the input array. If you want to eliminate factors caused
 * by memory latency, please make it smaller then L1 cache size.
 * Note that you should multiply this value by sizeof(float) to get
 * the actual size.
 */
#ifndef SIZE
#define SIZE 1024   /* default is 4KB */
#endif


/* We use rdtsc to read timestamp count from CPU. */
uint64_t rdtsc(void){
    uint64_t tick;

#ifdef __x86_64__
    __asm__ __volatile__(
        "xorq %%rax, %%rax\n\t"
        "rdtsc\n\t"
        "shlq $32, %%rdx\n\t"
        "xorq %%rdx, %%rax"
        : "=a"(tick)
        :
        : "rdx"
    );
#else
    __asm__ __volatile__(
        "rdtsc"
        : "=A"(tick)
    );
#endif

    return tick;
}

void inv_sqrt_naive(float* begin, float* end, float* output)
{
    /* naive method */
    for(; begin < end; ++begin, ++output)
        *output = 1.0f / sqrtf(*begin);
}

void inv_sqrt_marvelous(float* begin, float* end, float* output)
{
    /* marvelous method
     * from http://www.beyond3d.com/articles/fastinvsqrt/
     */
    float xhalf;
    /* We must use union to prevent breaking the strict aliasing rule. */
    union {
        int i;
        float f;
    } x;
    for(; begin < end; ++begin, ++output){
        xhalf = 0.5f * (*begin);
        x.f = *begin;
        x.i = 0x5f3759df - (x.i>>1);
        *output = x.f * (1.5f - xhalf * x.f * x.f);
    }
}

void inv_sqrt_sse(float* begin, float* end, float* output)
{
    /* vectorized SSE */

    /* Since we process 16 floating point values at a time, there are
     * remaining ones if the input size is not a multiple of 16, and
     * we must take care of them.
     */
    size_t padding = (end - begin) % 16;
    for(; padding > 0; --padding, ++begin, ++output)
        *output = 1.0f / sqrt(*begin);

    __asm__ __volatile__(
    "1:\n\t"
        "cmp %0, %1\n\t"
        "jbe 2f\n\t"
        "movups (%0),   %%xmm0\n\t"
        "movups 16(%0), %%xmm1\n\t"
        "movups 32(%0), %%xmm2\n\t"
        "movups 48(%0), %%xmm3\n\t"
        "rsqrtps %%xmm0, %%xmm0\n\t"
        "rsqrtps %%xmm1, %%xmm1\n\t"
        "rsqrtps %%xmm2, %%xmm2\n\t"
        "rsqrtps %%xmm3, %%xmm3\n\t"
        "movups %%xmm0, (%2)\n\t"
        "movups %%xmm1, 16(%2)\n\t"
        "movups %%xmm2, 32(%2)\n\t"
        "movups %%xmm3, 48(%2)\n\t"
        "add $64, %0\n\t"
        "add $64, %2\n\t"
        "jmp 1b\n\t"
    "2:\n\t"
        :
        : "r"(begin), "r"(end), "r"(output)
        : "xmm0", "xmm1", "xmm2", "xmm3"
    );
}

/* This function checks the answer against the naive one. */
float check_error(const float* begin, const float* end, const float* output)
{
    float max_err = 0.0f;
    float err;
    float ans;
    for(; begin < end; ++begin, ++output){
        ans = 1.0f / sqrtf(*begin);
        err = fabsf(ans - *output);
        max_err = err > max_err ? err : max_err;
    }
    return max_err;
}

void benchmark(
        const char* name, /* name of this method */
        void (*fptr)(float*, float*, float*), /* function pointer */
        float* begin,
        float* end,
        float* output )
{
    uint64_t tick;
    printf("\n%s\n", name);
    tick = rdtsc();
    fptr(begin, end, output);
    tick = rdtsc() - tick;
#ifdef CHECK_ERROR
    printf(
        "    CPU cycles used: %10llu, error: %f\n",
        tick,
        check_error(begin, end, output)
    );
#else
    printf("    CPU cycles used: %10llu\n", tick);
#endif
}

int main(void)
{
    int i;
    float *input = (float*)malloc( sizeof(float) * SIZE );
    float *output = (float*)malloc( sizeof(float) * SIZE );

    /* generate random input, from 0.5 to 1.5 */
    srand(time(0));
    for(i = 0; i < SIZE; i++)
        input[i] = (float)rand() / RAND_MAX + 0.5f;

    printf("Input array size is %u\n", SIZE);

    benchmark("Naive sqrtf()", inv_sqrt_naive, input, input+SIZE, output);
    benchmark("Vectorized SSE", inv_sqrt_sse, input, input+SIZE, output);
    benchmark("Marvelous", inv_sqrt_marvelous, input, input+SIZE, output);

    return 0;
}