// BC1V float CppSPMD kernel

// bc1v_kernel.h

struct CPPSPMD_MAKE_NAME(bc1v_kernel) : CPPSPMD::spmd_kernel
{
    // One encoded output block: two 16-bit endpoint words followed by a
    // 32-bit selector word.
    // NOTE(review): bc1_encode4 does not write through these fields; it
    // treats the destination as pairs of 32-bit ints (endpoints, selectors).
    struct bc1_block
    {
        uint16_t m_lo;   // first 16-bit endpoint word
        uint16_t m_hi;   // second 16-bit endpoint word
        uint32_t m_sels; // packed per-pixel selectors
    };

    // Three-component vfloat vector; used for the least-squares endpoint results.
    struct vvec3F { vfloat c[3]; };
    // One pixel position of a block: c[0] = red, c[1] = green, c[2] = blue (as filled in by _call).
    struct vcolor3 { vfloat c[3]; };

    static CPPSPMD_FORCE_INLINE vfloat squarevf(const vfloat &a)
    {
        return a * a;
    }

    static CPPSPMD_FORCE_INLINE vfloat to_5(vfloat v)
    {
        vfloat t = fma(v, 31.0 * (1.0f/256.0f), 128.0f * (1.0f/256.0f));
        return floor( fma(t, 1.0f/256.0f, t) );
    }

    static CPPSPMD_FORCE_INLINE vfloat to_6(vfloat v)
    {
        vfloat t = fma(v, 63.0 * (1.0f/256.0f), 128.0f * (1.0f/256.0f));
        return floor( fma(t, 1.0f/256.0f, t) );
    }

    // Initialization hook. Intentionally empty: nothing is set up here.
    static void global_init()
    {
    }

    CPPSPMD_FORCE_INLINE void bc1_get_block_colors4(vfloat block_r[4], vfloat block_g[4], vfloat block_b[4], const vfloat &lr, const vfloat &lg, const vfloat &lb, const vfloat &hr, const vfloat &hg, const vfloat &hb)
    {
        store_all(block_r[0], fma(lr, 8.0f, floor(lr * .25f)));
        store_all(block_g[0], fma(lg, 4.0f, floor(lg * .0625f)));
        store_all(block_b[0], fma(lb, 8.0f, floor(lb * .25f)));

        store_all(block_r[3], fma(hr, 8.0f, floor(hr * .25f)));
        store_all(block_g[3], fma(hg, 4.0f, floor(hg * .0625f)));
        store_all(block_b[3], fma(hb, 8.0f, floor(hb * .25f)));

        vfloat delta_r = block_r[3] - block_r[0];
        vfloat delta_g = block_g[3] - block_g[0];
        vfloat delta_b = block_b[3] - block_b[0];

        store_all( block_r[1], floor(fma(delta_r, 1.0f/3.0f, block_r[0])) );
        store_all( block_g[1], floor(fma(delta_g, 1.0f/3.0f, block_g[0])) );
        store_all( block_b[1], floor(fma(delta_b, 1.0f/3.0f, block_b[0])) );

        store_all( block_r[2], floor(fma(delta_r, 2.0f/3.0f, block_r[0])) );
        store_all( block_g[2], floor(fma(delta_g, 2.0f/3.0f, block_g[0])) );
        store_all( block_b[2], floor(fma(delta_b, 2.0f/3.0f, block_b[0])) );
    }

    CPPSPMD_FORCE_INLINE void bc1_find_sels4_noerr(const vcolor3* pSrc_pixels, const vfloat &lr, const vfloat &lg, const vfloat &lb, const vfloat &hr, const vfloat &hg, const vfloat &hb, vfloat sels[16])
    {
        vfloat block_r[4], block_g[4], block_b[4];
        bc1_get_block_colors4(block_r, block_g, block_b, lr, lg, lb, hr, hg, hb);

        vfloat ar = block_r[3] - block_r[0], ag = block_g[3] - block_g[0], ab = block_b[3] - block_b[0];

        vfloat dots[4];
        for (uint32_t i = 0; i < 4; i++)
            store_all(dots[i], block_r[i] * ar + block_g[i] * ag + block_b[i] * ab);

        vfloat t0 = dots[0] + dots[1], t1 = dots[1] + dots[2], t2 = dots[2] + dots[3];

        vfloat ar_scaled = ar * 2.0f, ag_scaled = ag * 2.0f, ab_scaled = ab * 2.0f;

        for (uint32_t i = 0; i < 16; i += 4)
        {
            const vfloat d0 = pSrc_pixels[i+0].c[0] * ar_scaled + pSrc_pixels[i+0].c[1] * ag_scaled + pSrc_pixels[i+0].c[2] * ab_scaled;
            const vfloat d1 = pSrc_pixels[i+1].c[0] * ar_scaled + pSrc_pixels[i+1].c[1] * ag_scaled + pSrc_pixels[i+1].c[2] * ab_scaled;
            const vfloat d2 = pSrc_pixels[i+2].c[0] * ar_scaled + pSrc_pixels[i+2].c[1] * ag_scaled + pSrc_pixels[i+2].c[2] * ab_scaled;
            const vfloat d3 = pSrc_pixels[i+3].c[0] * ar_scaled + pSrc_pixels[i+3].c[1] * ag_scaled + pSrc_pixels[i+3].c[2] * ab_scaled;

            vfloat sel0 = (vfloat)(d0 > t0) + (vfloat)(d0 >= t1) + (vfloat)(d0 >= t2);
            vfloat sel1 = (vfloat)(d1 > t0) + (vfloat)(d1 >= t1) + (vfloat)(d1 >= t2);
            vfloat sel2 = (vfloat)(d2 > t0) + (vfloat)(d2 >= t1) + (vfloat)(d2 >= t2);
            vfloat sel3 = (vfloat)(d3 > t0) + (vfloat)(d3 >= t1) + (vfloat)(d3 >= t2);

            store(sels[i+0], sel0);
            store(sels[i+1], sel1);
            store(sels[i+2], sel2);
            store(sels[i+3], sel3);
        }
    }

    CPPSPMD_FORCE_INLINE vfloat select4(const vfloat x[4], const vfloat &sel)
    {
        return vfloat(
            spmd_ternaryf(sel == 0.0f, x[0], spmd_ternaryf(sel == 1.0f, x[1], spmd_ternaryf(sel == 2.0f, x[2], x[3] ) ) )
                );
    }

    CPPSPMD_FORCE_INLINE vbool compute_least_squares_endpoints4_rgb(const vcolor3* pColors, const vfloat* pSelectors, vvec3F* pXl, vvec3F* pXh, const vfloat &total_r, const vfloat &total_g, const vfloat &total_b)
    {
        vfloat q00_r = 0.0f, q00_g = 0.0f, q00_b = 0.0f;
        vfloat z00 = 0.0f, z10 = 0.0f, z11 = 0.0f;
        for (uint32_t i = 0; i < 16; i++)
        {
            const vfloat r = pColors[i].c[0], g = pColors[i].c[1], b = pColors[i].c[2];
            const vfloat sel = pSelectors[i];

            const vfloat w = sel * (1.0f/3.0f);
            const vfloat one_minus_w = 1.0f - w;

            store_all(z00, z00 + w * w);
            store_all(z10, z10 + one_minus_w * w);
            store_all(z11, z11 + one_minus_w * one_minus_w);

            store_all(q00_r, q00_r + w * r);
            store_all(q00_g, q00_g + w * g);
            store_all(q00_b, q00_b + w * b);
        }

        vfloat q10_r = total_r - q00_r;
        vfloat q10_g = total_g - q00_g;
        vfloat q10_b = total_b - q00_b;

        vfloat z01 = z10;

        vfloat det = z00 * z11 - z01 * z10;

        vbool valid_mask = (abs(det) >= 1e-4f);

        vfloat det_fixed = spmd_ternaryf(valid_mask, det, 1e+10f);

        vfloat det_scaled = (1.0f / 255.0f) / det_fixed;

        vfloat iz00 = z11 * det_scaled;
        vfloat iz01 = -z01 * det_scaled;
        vfloat iz10 = -z10 * det_scaled;
        vfloat iz11 = z00 * det_scaled;

        store_all(pXl->c[0], iz00 * q00_r + iz01 * q10_r);
        store_all(pXh->c[0], iz10 * q00_r + iz11 * q10_r);

        store_all(pXl->c[1], iz00 * q00_g + iz01 * q10_g);
        store_all(pXh->c[1], iz10 * q00_g + iz11 * q10_g);

        store_all(pXl->c[2], iz00 * q00_b + iz01 * q10_b);
        store_all(pXh->c[2], iz10 * q00_b + iz11 * q10_b);

        return valid_mask;
    }

    CPPSPMD_FORCE_INLINE void precise_round_565(const vvec3F &xl, const vvec3F &xh,
        vfloat &trial_lr, vfloat &trial_lg, vfloat &trial_lb,
        vfloat &trial_hr, vfloat &trial_hg, vfloat &trial_hb)
    {
        // FIXME: Use precise BC1 rounding.
        store_all(trial_lr, clamp(round_nearest(xl.c[0] * 31.0f), 0.0f, 31.0f));
        store_all(trial_lg, clamp(round_nearest(xl.c[1] * 63.0f), 0.0f, 63.0f));
        store_all(trial_lb, clamp(round_nearest(xl.c[2] * 31.0f), 0.0f, 31.0f));

        store_all(trial_hr, clamp(round_nearest(xh.c[0] * 31.0f), 0.0f, 31.0f));
        store_all(trial_hg, clamp(round_nearest(xh.c[1] * 63.0f), 0.0f, 63.0f));
        store_all(trial_hb, clamp(round_nearest(xh.c[2] * 31.0f), 0.0f, 31.0f));
    }

    static CPPSPMD_FORCE_INLINE vfloat fix_sels(vfloat sels, float m)
    {
        return spmd_ternaryf(sels == 0.0f, 0.0f, spmd_ternaryf(sels == 1.0f, 2.0f * m, spmd_ternaryf(sels == 2.0f, 3.0f * m, m ) ) );
    }

    CPPSPMD_FORCE_INLINE void bc1_encode4(lint indices, bc1_block *pDst_blocks, const vfloat lr, const vfloat lg, const vfloat lb, const vfloat hr, const vfloat hg, const vfloat hb, vfloat sels[16], int pcount)
    {
        vfloat lc16i = lb + (lg * 32.0f) + (lr * 2048.0f);
        vfloat hc16i = hb + (hg * 32.0f) + (hr * 2048.0f);

        SPMD_SIMPLE_IF(lc16i == hc16i)
        {
            SPMD_SIMPLE_IF(hc16i > 0.0f)
            {
                store(hc16i, hc16i - 1.0f);
            }
            SPMD_SIMPLE_ELSE(hc16i > 0.0f)
            {
                store(hc16i, 0.0f);
                store(lc16i, 1.0f);
                for (uint32_t i = 0; i < 16; i++)
                    store(sels[i], 3.0f);
            }
            SPMD_SIMPLE_END_IF
        }
        SPMD_SIMPLE_ELSE(lc16i == hc16i)
        {
            SPMD_SIMPLE_IF(lc16i < hc16i)
            {
                swap(lc16i, hc16i);

                for (uint32_t i = 0; i < 16; i++)
                    store(sels[i], 3.0f - sels[i]);
            }
            SPMD_SIMPLE_END_IF
        }
        SPMD_SIMPLE_END_IF

        vfloat sels0 = fix_sels(sels[0], 1.0f) + fix_sels(sels[1], 4.0f) + fix_sels(sels[2], 16.0f) + fix_sels(sels[3], 64.0f) +
            fix_sels(sels[4], 256.0f) + fix_sels(sels[5], 1024.0f) + fix_sels(sels[6], 4096.0f) + fix_sels(sels[7], 16384.0f);

        vfloat sels1 = fix_sels(sels[8], 1.0f) + fix_sels(sels[9], 4.0f) + fix_sels(sels[10], 16.0f) + fix_sels(sels[11], 64.0f) +
            fix_sels(sels[12], 256.0f) + fix_sels(sels[13], 1024.0f) + fix_sels(sels[14], 4096.0f) + fix_sels(sels[15], 16384.0f);

        vint endpoints = vint(lc16i) | VINT_SHIFT_LEFT(vint(hc16i), 16);
        vint selectors = vint(sels0) | VINT_SHIFT_LEFT(vint(sels1), 16);

        //int *pDst_ints = reinterpret_cast<int *>(pDst_blocks);
        //vint vindices(vint(indices) * 2);
        //store(vindices[pDst_ints], endpoints);
        //store((vindices + 1)[pDst_ints],  selectors);

        int *pDst_ints = reinterpret_cast<int *>(pDst_blocks) + indices.get_first_value() * 2;
        store_strided(pDst_ints, 2, endpoints);
        store_strided(pDst_ints + 1, 2, selectors);
    }

    // Chooses the initial 5:6:5 endpoints (lr..hb) for a block.
    //
    //   pSrc_pixels     the 16 source pixels.
    //   grayscale_flag  per-lane flag selecting the grayscale shortcut.
    //   min_*/max_*     per-channel minimum / maximum over the block.
    //   avg_*           per-channel average over the block.
    //   total_*         per-channel sums; not read by this function.
    //   lr..hb          outputs: low and high endpoint components.
    //
    // Grayscale lanes derive both endpoints from the red channel only. Other
    // lanes estimate a principal axis from the covariance matrix and use the
    // two pixels with the smallest / largest projection onto it.
    //
    // NOTE(review): the first grayscale sub-branch writes the outputs with
    // store_all (presumably unmasked), and every other branch then overwrites
    // its own lanes with masked store(). The result relies on this ordering.
    CPPSPMD_FORCE_INLINE void encode_bc1_pick_initial(const vcolor3 *pSrc_pixels, const vbool grayscale_flag,
            const vfloat min_r, const vfloat min_g, const vfloat min_b, const vfloat max_r, const vfloat max_g, const vfloat max_b,
            const vfloat avg_r, const vfloat avg_g, const vfloat avg_b, const vfloat total_r, const vfloat total_g, const vfloat total_b,
            vfloat &lr, vfloat &lg, vfloat &lb, vfloat &hr, vfloat &hg, vfloat &hb)
    {
        SPMD_SIMPLE_IF(grayscale_flag)
        {
            // Grayscale blocks are a common enough case to specialize.
            SPMD_SIMPLE_IF( (max_r - min_r) < 2.0f )
            {
                // Nearly flat block: both endpoints come from pixel 0.
                const vfloat fr = pSrc_pixels[0].c[0];

                vfloat fr5 = to_5(fr);
                vfloat fr6 = to_6(fr);

                store_all(lr, fr5);
                store_all(lg, fr6);
                store_all(lb, fr5);

                store_all(hr, fr5);
                store_all(hg, fr6);
                store_all(hb, fr5);
            }
            SPMD_SIMPLE_ELSE( (max_r - min_r) < 2.0f )
            {
                // Wider gray range: endpoints are the quantized min and max.
                vfloat min_r5 = to_5(min_r);
                vfloat min_r6 = to_6(min_r);

                vfloat max_r5 = to_5(max_r);
                vfloat max_r6 = to_6(max_r);

                store(lr, min_r5);
                store(lg, min_r6);
                store(lb, min_r5);

                store(hr, max_r5);
                store(hg, max_r6);
                store(hb, max_r5);
            }
            SPMD_SIMPLE_END_IF
        }
        SPMD_SIMPLE_ELSE(grayscale_flag)
        {
            // Select 2 colors along the principal axis. (There must be a faster/simpler way.)
            // Upper triangle of the covariance matrix: rr, rg, rb, gg, gb, bb.
            vfloat icov0 = 0.0f, icov1 = 0.0f, icov2 = 0.0f, icov3 = 0.0f, icov4 = 0.0f, icov5 = 0.0f;

            for (uint32_t i = 0; i < 16; i++)
            {
                vfloat r = pSrc_pixels[i].c[0] - avg_r;
                vfloat g = pSrc_pixels[i].c[1] - avg_g;
                vfloat b = pSrc_pixels[i].c[2] - avg_b;

                store_all(icov0, icov0 + r * r);
                store_all(icov1, icov1 + r * g);
                store_all(icov2, icov2 + r * b);
                store_all(icov3, icov3 + g * g);
                store_all(icov4, icov4 + g * b);
                store_all(icov5, icov5 + b * b);
            }

            // Fallback axis, used when the power iteration result is too small.
            // NOTE(review): 306/601/117 sum to 1024 and look like luma weights - confirm.
            vfloat saxis_r = 306.0f, saxis_g = 601.0f, saxis_b = 117.0f;

            // Starting vector for the power iteration: the bounding-box extent...
            vfloat xr = max_r - min_r;
            vfloat xg = max_g - min_g;
            vfloat xb = max_b - min_b;

            // ...with red/green negated where they are anti-correlated with blue.
            store_all(xr, spmd_ternaryf(icov2 < 0.0f, -xr, xr));
            store_all(xg, spmd_ternaryf(icov4 < 0.0f, -xg, xg));

            vfloat cov0 = icov0 * (1.0f/255.0f);
            vfloat cov1 = icov1 * (1.0f/255.0f);
            vfloat cov2 = icov2 * (1.0f/255.0f);
            vfloat cov3 = icov3 * (1.0f/255.0f);
            vfloat cov4 = icov4 * (1.0f/255.0f);
            vfloat cov5 = icov5 * (1.0f/255.0f);

            // Power iteration: repeatedly multiply the vector by the covariance
            // matrix (no normalization between steps).
            const uint32_t total_power_iters = 4;
            for (uint32_t power_iter = 0; power_iter < total_power_iters; power_iter++)
            {
                vfloat r = xr * cov0 + xg * cov1 + xb * cov2;
                vfloat g = xr * cov1 + xg * cov3 + xb * cov4;
                vfloat b = xr * cov2 + xg * cov4 + xb * cov5;

                store_all(xr, r);
                store_all(xg, g);
                store_all(xb, b);
            }

            // Largest component magnitude of the iterated vector.
            vfloat k = max(max(abs(xr), abs(xg)), abs(xb));

            SPMD_SIMPLE_IF(k >= 2.0f)
            {
                // Rescale so the largest component has magnitude 2048.
                vfloat m = 2048.0f / k;
                store(saxis_r, xr * m);
                store(saxis_g, xg * m);
                store(saxis_b, xb * m);
            }
            SPMD_SIMPLE_END_IF

            // Track the pixels with the lowest and highest projection onto the
            // axis, starting from pixel 0.
            vfloat br = pSrc_pixels[0].c[0], bg = pSrc_pixels[0].c[1], bb = pSrc_pixels[0].c[2];
            vfloat bdot = br * saxis_r + bg * saxis_g + bb * saxis_b;

            vfloat lo_dot = bdot, hi_dot = bdot;
            vfloat lo_r(br), lo_g(bg), lo_b(bb), hi_r(br), hi_g(bg), hi_b(bb);

            for (uint32_t i = 1; i < 16; i++)
            {
                vfloat r = pSrc_pixels[i].c[0], g = pSrc_pixels[i].c[1], b = pSrc_pixels[i].c[2];
                vfloat dot = r * saxis_r + g * saxis_g + b * saxis_b;

                // Strict comparisons: on ties the earlier pixel is kept.
                vbool l = dot < lo_dot;
                vbool h = dot > hi_dot;

                store_all(lo_dot, spmd_ternaryf(l, dot, lo_dot));
                store_all(lo_r, spmd_ternaryf(l, r, lo_r));
                store_all(lo_g, spmd_ternaryf(l, g, lo_g));
                store_all(lo_b, spmd_ternaryf(l, b, lo_b));

                store_all(hi_dot, spmd_ternaryf(h, dot, hi_dot));
                store_all(hi_r, spmd_ternaryf(h, r, hi_r));
                store_all(hi_g, spmd_ternaryf(h, g, hi_g));
                store_all(hi_b, spmd_ternaryf(h, b, hi_b));
            }

            // Masked stores: only the non-grayscale lanes are updated.
            store(lr, to_5(lo_r));
            store(lg, to_6(lo_g));
            store(lb, to_5(lo_b));

            store(hr, to_5(hi_r));
            store(hg, to_6(hi_g));
            store(hb, to_5(hi_b));
        }
        SPMD_SIMPLE_END_IF
    }

    CPPSPMD_FORCE_INLINE void encode_bc1_internal(const lint index, bc1_block* pDst_blocks, const vcolor3 *pSrc_pixels, int pcount)
    {
        const vfloat fr = pSrc_pixels[0].c[0], fg = pSrc_pixels[0].c[1], fb = pSrc_pixels[0].c[2];

        vfloat total_r(fr), total_g(fg), total_b(fb), min_r(fr), min_g(fg), min_b(fb), max_r(fr), max_g(fg), max_b(fb);

        vbool grayscale_flag(true);
        for (uint32_t i = 1; i < 16; i++)
        {
            const vfloat r = pSrc_pixels[i].c[0], g = pSrc_pixels[i].c[1], b = pSrc_pixels[i].c[2];

            store_all(grayscale_flag, grayscale_flag && (r == g) && (r == b));

            store_all(min_r, min(min_r, r));
            store_all(min_g, min(min_g, g));
            store_all(min_b, min(min_b, b));

            store_all(max_r, max(max_r, r));
            store_all(max_g, max(max_g, g));
            store_all(max_b, max(max_b, b));

            store_all(total_r, total_r + r);
            store_all(total_g, total_g + g);
            store_all(total_b, total_b + b);
        }

        vfloat avg_r = floor((total_r + 8) * (1.0f/16.0f));
        vfloat avg_g = floor((total_g + 8) * (1.0f/16.0f));
        vfloat avg_b = floor((total_b + 8) * (1.0f/16.0f));

        vfloat lr, lg, lb, hr, hg, hb;

        encode_bc1_pick_initial(pSrc_pixels, grayscale_flag,
            min_r, min_g, min_b, max_r, max_g, max_b,
            avg_r, avg_g, avg_b, total_r, total_g, total_b,
            lr, lg, lb, hr, hg, hb);

        vfloat sels[16];
        bc1_find_sels4_noerr(pSrc_pixels, lr, lg, lb, hr, hg, hb, sels);

        vfloat trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb;

        vvec3F xl, xh;
        vbool result = compute_least_squares_endpoints4_rgb(pSrc_pixels, sels, &xl, &xh, total_r, total_g, total_b);

        SPMD_SIMPLE_IF(result)
        {
            precise_round_565(xl, xh, trial_hr, trial_hg, trial_hb, trial_lr, trial_lg, trial_lb);

            bc1_find_sels4_noerr(pSrc_pixels, trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb, sels);

            store(lr, trial_lr);
            store(lg, trial_lg);
            store(lb, trial_lb);

            store(hr, trial_hr);
            store(hg, trial_hg);
            store(hb, trial_hb);
        }
        SPMD_SIMPLE_END_IF

        bc1_encode4(index, pDst_blocks, lr, lg, lb, hr, hg, hb, sels, pcount);
    }

    void _call(bc1_block* pDst_blocks, const uint8_t *pSrc_pixels, int n)
    {
        spmd_foreach(0, n,
            [&](const lint index, int pcount)
            {
                const int first_index = index.get_first_value();

                vcolor3 src_pixels[16];

                if ((CPPSPMD_AVX) && (pcount == CPPSPMD::PROGRAM_COUNT))
                {
                    int32_t *pSrc_int32 = (int32_t *)pSrc_pixels;

#if CPPSPMD_AVX2
                    vint vindex16 = vint(index) * 16;
#else
                    int32_t *pCur_int32 = pSrc_int32 + first_index * 16;
#endif

                    for (uint32_t i = 0; i < 16; i++)
                    {
#if CPPSPMD_AVX2
                        vint v = load_all((vindex16 + i)[pSrc_int32]);
#else
                        vint v = load_all_strided(pCur_int32, 16);
                        pCur_int32++;
#endif
                        vfloat r = vfloat(v & 0xFF);
                        vfloat g = vfloat(VINT_SHIFT_RIGHT(v, 8) & 0xFF);
                        vfloat b = vfloat(VINT_SHIFT_RIGHT(v, 16) & 0xFF);

                        store_all(src_pixels[i].c[2], b);
                        store_all(src_pixels[i].c[1], g);
                        store_all(src_pixels[i].c[0], r);
                    }
                }
                else
                {
                    const uint8_t *pSrc = pSrc_pixels + first_index * 16 * 4;

                    for (int p = 0; p < CPPSPMD::PROGRAM_COUNT; p++)
                    {
                        if (p < pcount)
                        {
                            for (uint32_t i = 0; i < 16; i++, pSrc += 4)
                            {
                                uint32_t v = *(uint32_t *)pSrc;

                                ((float *)&src_pixels[i].c[0])[p] = (float)(v & 0xFF);
                                ((float *)&src_pixels[i].c[1])[p] = (float)((v >> 8) & 0xFF);
                                ((float *)&src_pixels[i].c[2])[p] = (float)((v >> 16) & 0xFF);
                            }
                        }
                        else
                        {
                            for (uint32_t i = 0; i < 16; i++, pSrc += 4)
                            {
                                ((float *)&src_pixels[i].c[0])[p] = 0;
                                ((float *)&src_pixels[i].c[1])[p] = 0;
                                ((float *)&src_pixels[i].c[2])[p] = 0;
                            }
                        }
                    }
                }

                encode_bc1_internal(index, pDst_blocks, src_pixels, pcount);
            }
        );

    }
};