Mandelbrot SIMD calculation snippets

/**
 * Mandelbrot fractal portable SIMD calculation core
 * Copyright (c) 2017 Joel Yliluoma (http://iki.fi/bisqwit/)
 *
 * T is the underlying (real) numeric type, with which the calculations
 * are peformed. It is ideally float or double.
 *
 * N is the number of mandelbrot coordinates to calculate in parallel (vector size).
 * Ideally N should be a small integer multiple of the largest native vector size for T.
 * Some options:
 *                MMX SSE SSE2 AVX AVX512 Altivec Neon
 *   ------------ --- --- ---- --- ------ ------- ----
 *   float16      *   *   *    *   *      *       8
 *   float        2   4   4    8   16     4       4
 *   double       1   1   2    4   8      1       2
 *   long double  1   1   1    1   1      1       1
 *
 * MaxIter is the maximum number of iterations to perform.
 *
 * If do_smooth is enabled, points that are outside the set are given
 * a smooth (floating point) iteration count rather than a discrete
 * (integral) iteration count. This incurs a slight performance penalty.
 * If you use this, you should also set iter_mul to e.g. 4096.
 *
 * If do_periodicity_checking is enabled, light-weight periodicity checking
 * is enabled to improve the performance significantly for points that are
 * inside the set. It has a slight performance penalty for points that are outside the set.
 *
 * If do_logarithmic_iteration_counts is enabled, a log() function is applied
 * to the returned iteration counts. You should also set iter_mul to e.g. 4096.
 *
 * The returned iteration counts (float) are multiplied by iter_mul before
 * they are converted into integers and returned (fixed-point).
 */
#include <array>
#include <cmath>
template<typename T, unsigned N,
         bool do_smooth = true                        /* S */,
         bool do_periodicity_checking = true          /* P */,
         bool do_logarithmic_iteration_counts = false /* L */,
         unsigned iter_mul = 1                        /* I */>
struct Calculator
{
    static constexpr T color_scale           = 8;
    static constexpr T escape_radius_squared = do_smooth ? 6*6 : 4;

    typedef std::array<T,   N> dvec;
    typedef std::array<int, N> bvec;

    dvec x,y;           // The "c", complex coordinate we are iterating
    dvec r,i;           // The "z", temporary complex number
    dvec dist;          // Last recorded squared distance before iteration was stopped
    //                  // If do_smooth is false, it continues being updated
    //                  // even then, for a moderate improvement in performance.
    dvec perr,peri;     // A recorded sample of "z" used in periodicity checking
    bvec escaped;       // Flag used to stop iterating (indicates whether an escape happened)
    bvec iter;          // The iteration count
    bvec maxiter;       // Maximum iteration count

    /* Start(): Initialize calculation for the specified number of maximum iterations, at given coordinates */
    Calculator(unsigned MaxIter, const T* cr, const T* ci)
    {
        #pragma omp simd
        for(unsigned n=0; n<N; ++n)
        {
            x[n]       = cr[n];
            y[n]       = ci[n];
            r[n]       = cr[n];
            i[n]       = ci[n];
            maxiter[n] = MaxIter;
            iter[n]    = MaxIter;
            perr[n]    = 0;
            peri[n]    = 0;
            dist[n]    = 0;
            escaped[n] = 0;
        }
    }

    /* End(): Returns true if all N coordinates have escaped or MaxIter iterations has been reached */
    bool End() const
    {
        typename bvec::value_type n_escaped = 0;
        #pragma omp simd
        for(unsigned n=0; n<N; ++n) n_escaped += escaped[n];
        return n_escaped >= typename bvec::value_type(N);
    }

    /* Round(): Performs one iteration for all N coordinates */
    void Round()
    {
        dvec r2, i2, ri;
        /* Note: GCC generates slightly better performing code, when this code is
         * in a single loop than when the statements are each in separate loops.
         * However, if one of the statements fails to be vectorized, the entire loop
         * will be de-vectorized. If that happens (inspect the code with gcc -S!),
         * try splitting the loop into separate consecutive loops.
         */
        #pragma omp simd
        for(unsigned n=0; n<N; ++n) { r2[n] = (r[n] * r[n]);
                                      i2[n] = (i[n] * i[n]);
                                      if(!do_smooth || !escaped[n]) { dist[n] = r2[n] + i2[n]; }
                                      escaped[n] = escaped[n] || !iter[n] || (dist[n] >= escape_radius_squared);
                                      iter[n] -= !escaped[n];
                                      ri[n] = (r[n] * i[n]);
                                      i[n] = y[n] + (ri[n] * 2);
                                      r[n] = x[n] + (r2[n] - i2[n]); }

        /* Periodicity checking produces a significant speedup for coordinates
         * that are inside the set (especially if they go into a cycle early on),
         * but at a cost of a slight performance penalty for coordinates that are not.
         */
        if(do_periodicity_checking)
        {
            bvec moment;
            /* When the number of remaining iterations is a power of two,
             * save the current complex number (r & i).
             * Otherwise, compare it to the saved number. If they match, we are inside the set,
             * and can set the iteration count to max immediately.
             */
            #pragma omp simd
            for(unsigned n=0; n<N; ++n) { int tmp = iter[n]; moment[n] = tmp & (tmp-1); }
            #pragma omp simd
            for(unsigned n=0; n<N; ++n) { if(!moment[n])                              { perr[n] = r[n]; peri[n] = i[n]; }
                                          else if(r[n] == perr[n] && i[n] == peri[n]) { iter[n] = 0; } }
        }
    }

    /* Get(): Get the translation iteration counts for all N coordinates according
     *        to the current progress (call after End()==true, but can be called before) */
    std::array<int,N> Get() const
    {
        std::array<int,N> out;
        if(!do_logarithmic_iteration_counts && iter_mul == 1)
        {
            #pragma omp simd
            for(unsigned n=0; n<N; ++n) out[n] = maxiter[n] - iter[n];
            return out;
        }
        dvec iter_f;
        #pragma omp simd
        for(unsigned n=0; n<N; ++n) iter_f[n] = maxiter[n] - iter[n];
        #pragma omp simd
        for(unsigned n=0; n<N; ++n)
            if(do_smooth && iter[n] != 0)
                iter_f[n] += 1 - std::log2(std::log2(dist[n]) / T(2));
        #pragma omp simd
        for(unsigned n=0; n<N; ++n) iter_f[n] = (do_logarithmic_iteration_counts ? std::log(iter_f[n]) : iter_f[n]) * color_scale;
        #pragma omp simd
        for(unsigned n=0; n<N; ++n) out[n] = iter_f[n] * iter_mul;
        return out;
    }
};

/* Utility example: Calculate for N coordinates, return iteration counts as array */
template<unsigned N, bool S,bool P=true,bool L=true,unsigned I, typename T>
std::array<int,N> Iterate(unsigned MaxIter, const T* x, const T* y)
{
    Calculator<T,N,  S,P,L,I> calc(MaxIter, x,y);
    while(!calc.End()) { calc.Round(); }
    return calc.Get();
}

/* Utility example: Calculate for N coordinates passed as std::array, return iteration counts as array */
template<bool S,bool P=true,bool L=true,unsigned I, typename T,std::size_t N>
std::array<int,N> Iterate(unsigned MaxIter, const std::array<T,N>& x, const std::array<T,N>& y)
{
    return Iterate<N, S,P,L,I>(MaxIter, &x[0], &y[0]);
}

/* Utility example: Calculate for N coordinates where N is not a compile-time constant */
template<bool S,bool P=true,bool L=true,unsigned I, typename T>
void Iterate(unsigned MaxIter, const T* x, const T* y, int* out, unsigned N)
{
    while(N >= 16)
    {
        auto tmp = Iterate<16, S,P,L,I>(MaxIter, x,y); for(unsigned n=0; n<16; ++n) out[n] = tmp[n];
        x += 16; y += 16; out += 16; N -= 16;
    }
    if(N == 8)
    {
        auto tmp = Iterate< 8, S,P,L,I>(MaxIter, x,y); for(unsigned n=0; n< 8; ++n) out[n] = tmp[n];
        return;
    }
    while(N >= 4)
    {
        auto tmp = Iterate< 4, S,P,L,I>(MaxIter, x,y); for(unsigned n=0; n< 4; ++n) out[n] = tmp[n];
        x += 4; y += 4; out += 4; N -= 4;
    }
    while(N >= 1)
    {
        auto tmp = Iterate< 1, S,P,L,I>(MaxIter, x,y); for(unsigned n=0; n< 1; ++n) out[n] = tmp[n];
        x += 1; y += 1; out += 1; N -= 1;
    }
}