Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // bc1v_kernel.h
- struct CPPSPMD_MAKE_NAME(bc1v_kernel) : CPPSPMD::spmd_kernel
- {
// One encoded BC1 block: two 16-bit endpoint words followed by one 32-bit
// selector word. bc1_encode4() fills it through an int pointer, writing
// (lc16i | hc16i << 16) to the first 32 bits and the packed selectors to the
// second 32 bits.
struct bc1_block
{
    uint16_t m_lo;   // first packed endpoint (b + g*32 + r*2048); low half of the endpoint word on a little-endian target
    uint16_t m_hi;   // second packed endpoint; high half of the endpoint word on a little-endian target
    uint32_t m_sels; // sixteen 2-bit selectors, pixel 0 in the least significant bits
};
- struct vvec3F { vfloat c[3]; };
- struct vcolor3 { vfloat c[3]; };
- static CPPSPMD_FORCE_INLINE vfloat squarevf(const vfloat &a)
- {
- return a * a;
- }
- static CPPSPMD_FORCE_INLINE vfloat to_5(vfloat v)
- {
- vfloat t = fma(v, 31.0 * (1.0f/256.0f), 128.0f * (1.0f/256.0f));
- return floor( fma(t, 1.0f/256.0f, t) );
- }
- static CPPSPMD_FORCE_INLINE vfloat to_6(vfloat v)
- {
- vfloat t = fma(v, 63.0 * (1.0f/256.0f), 128.0f * (1.0f/256.0f));
- return floor( fma(t, 1.0f/256.0f, t) );
- }
// One-time setup hook for the kernel. Intentionally empty: nothing in this
// kernel needs precomputed state.
static void global_init() {}
- CPPSPMD_FORCE_INLINE void bc1_get_block_colors4(vfloat block_r[4], vfloat block_g[4], vfloat block_b[4], const vfloat &lr, const vfloat &lg, const vfloat &lb, const vfloat &hr, const vfloat &hg, const vfloat &hb)
- {
- store_all(block_r[0], fma(lr, 8.0f, floor(lr * .25f)));
- store_all(block_g[0], fma(lg, 4.0f, floor(lg * .0625f)));
- store_all(block_b[0], fma(lb, 8.0f, floor(lb * .25f)));
- store_all(block_r[3], fma(hr, 8.0f, floor(hr * .25f)));
- store_all(block_g[3], fma(hg, 4.0f, floor(hg * .0625f)));
- store_all(block_b[3], fma(hb, 8.0f, floor(hb * .25f)));
- vfloat delta_r = block_r[3] - block_r[0];
- vfloat delta_g = block_g[3] - block_g[0];
- vfloat delta_b = block_b[3] - block_b[0];
- store_all( block_r[1], floor(fma(delta_r, 1.0f/3.0f, block_r[0])) );
- store_all( block_g[1], floor(fma(delta_g, 1.0f/3.0f, block_g[0])) );
- store_all( block_b[1], floor(fma(delta_b, 1.0f/3.0f, block_b[0])) );
- store_all( block_r[2], floor(fma(delta_r, 2.0f/3.0f, block_r[0])) );
- store_all( block_g[2], floor(fma(delta_g, 2.0f/3.0f, block_g[0])) );
- store_all( block_b[2], floor(fma(delta_b, 2.0f/3.0f, block_b[0])) );
- }
- CPPSPMD_FORCE_INLINE void bc1_find_sels4_noerr(const vcolor3* pSrc_pixels, const vfloat &lr, const vfloat &lg, const vfloat &lb, const vfloat &hr, const vfloat &hg, const vfloat &hb, vfloat sels[16])
- {
- vfloat block_r[4], block_g[4], block_b[4];
- bc1_get_block_colors4(block_r, block_g, block_b, lr, lg, lb, hr, hg, hb);
- vfloat ar = block_r[3] - block_r[0], ag = block_g[3] - block_g[0], ab = block_b[3] - block_b[0];
- vfloat dots[4];
- for (uint32_t i = 0; i < 4; i++)
- store_all(dots[i], block_r[i] * ar + block_g[i] * ag + block_b[i] * ab);
- vfloat t0 = dots[0] + dots[1], t1 = dots[1] + dots[2], t2 = dots[2] + dots[3];
- vfloat ar_scaled = ar * 2.0f, ag_scaled = ag * 2.0f, ab_scaled = ab * 2.0f;
- for (uint32_t i = 0; i < 16; i += 4)
- {
- const vfloat d0 = pSrc_pixels[i+0].c[0] * ar_scaled + pSrc_pixels[i+0].c[1] * ag_scaled + pSrc_pixels[i+0].c[2] * ab_scaled;
- const vfloat d1 = pSrc_pixels[i+1].c[0] * ar_scaled + pSrc_pixels[i+1].c[1] * ag_scaled + pSrc_pixels[i+1].c[2] * ab_scaled;
- const vfloat d2 = pSrc_pixels[i+2].c[0] * ar_scaled + pSrc_pixels[i+2].c[1] * ag_scaled + pSrc_pixels[i+2].c[2] * ab_scaled;
- const vfloat d3 = pSrc_pixels[i+3].c[0] * ar_scaled + pSrc_pixels[i+3].c[1] * ag_scaled + pSrc_pixels[i+3].c[2] * ab_scaled;
- vfloat sel0 = (vfloat)(d0 > t0) + (vfloat)(d0 >= t1) + (vfloat)(d0 >= t2);
- vfloat sel1 = (vfloat)(d1 > t0) + (vfloat)(d1 >= t1) + (vfloat)(d1 >= t2);
- vfloat sel2 = (vfloat)(d2 > t0) + (vfloat)(d2 >= t1) + (vfloat)(d2 >= t2);
- vfloat sel3 = (vfloat)(d3 > t0) + (vfloat)(d3 >= t1) + (vfloat)(d3 >= t2);
- store(sels[i+0], sel0);
- store(sels[i+1], sel1);
- store(sels[i+2], sel2);
- store(sels[i+3], sel3);
- }
- }
- CPPSPMD_FORCE_INLINE vfloat select4(const vfloat x[4], const vfloat &sel)
- {
- return vfloat(
- spmd_ternaryf(sel == 0.0f, x[0], spmd_ternaryf(sel == 1.0f, x[1], spmd_ternaryf(sel == 2.0f, x[2], x[3] ) ) )
- );
- }
- CPPSPMD_FORCE_INLINE vbool compute_least_squares_endpoints4_rgb(const vcolor3* pColors, const vfloat* pSelectors, vvec3F* pXl, vvec3F* pXh, const vfloat &total_r, const vfloat &total_g, const vfloat &total_b)
- {
- vfloat q00_r = 0.0f, q00_g = 0.0f, q00_b = 0.0f;
- vfloat z00 = 0.0f, z10 = 0.0f, z11 = 0.0f;
- for (uint32_t i = 0; i < 16; i++)
- {
- const vfloat r = pColors[i].c[0], g = pColors[i].c[1], b = pColors[i].c[2];
- const vfloat sel = pSelectors[i];
- const vfloat w = sel * (1.0f/3.0f);
- const vfloat one_minus_w = 1.0f - w;
- store_all(z00, z00 + w * w);
- store_all(z10, z10 + one_minus_w * w);
- store_all(z11, z11 + one_minus_w * one_minus_w);
- store_all(q00_r, q00_r + w * r);
- store_all(q00_g, q00_g + w * g);
- store_all(q00_b, q00_b + w * b);
- }
- vfloat q10_r = total_r - q00_r;
- vfloat q10_g = total_g - q00_g;
- vfloat q10_b = total_b - q00_b;
- vfloat z01 = z10;
- vfloat det = z00 * z11 - z01 * z10;
- vbool valid_mask = (abs(det) >= 1e-4f);
- vfloat det_fixed = spmd_ternaryf(valid_mask, det, 1e+10f);
- vfloat det_scaled = (1.0f / 255.0f) / det_fixed;
- vfloat iz00 = z11 * det_scaled;
- vfloat iz01 = -z01 * det_scaled;
- vfloat iz10 = -z10 * det_scaled;
- vfloat iz11 = z00 * det_scaled;
- store_all(pXl->c[0], iz00 * q00_r + iz01 * q10_r);
- store_all(pXh->c[0], iz10 * q00_r + iz11 * q10_r);
- store_all(pXl->c[1], iz00 * q00_g + iz01 * q10_g);
- store_all(pXh->c[1], iz10 * q00_g + iz11 * q10_g);
- store_all(pXl->c[2], iz00 * q00_b + iz01 * q10_b);
- store_all(pXh->c[2], iz10 * q00_b + iz11 * q10_b);
- return valid_mask;
- }
- CPPSPMD_FORCE_INLINE void precise_round_565(const vvec3F &xl, const vvec3F &xh,
- vfloat &trial_lr, vfloat &trial_lg, vfloat &trial_lb,
- vfloat &trial_hr, vfloat &trial_hg, vfloat &trial_hb)
- {
- // FIXME: Use precise BC1 rounding.
- store_all(trial_lr, clamp(round_nearest(xl.c[0] * 31.0f), 0.0f, 31.0f));
- store_all(trial_lg, clamp(round_nearest(xl.c[1] * 63.0f), 0.0f, 63.0f));
- store_all(trial_lb, clamp(round_nearest(xl.c[2] * 31.0f), 0.0f, 31.0f));
- store_all(trial_hr, clamp(round_nearest(xh.c[0] * 31.0f), 0.0f, 31.0f));
- store_all(trial_hg, clamp(round_nearest(xh.c[1] * 63.0f), 0.0f, 63.0f));
- store_all(trial_hb, clamp(round_nearest(xh.c[2] * 31.0f), 0.0f, 31.0f));
- }
- static CPPSPMD_FORCE_INLINE vfloat fix_sels(vfloat sels, float m)
- {
- return spmd_ternaryf(sels == 0.0f, 0.0f, spmd_ternaryf(sels == 1.0f, 2.0f * m, spmd_ternaryf(sels == 2.0f, 3.0f * m, m ) ) );
- }
- CPPSPMD_FORCE_INLINE void bc1_encode4(lint indices, bc1_block *pDst_blocks, const vfloat lr, const vfloat lg, const vfloat lb, const vfloat hr, const vfloat hg, const vfloat hb, vfloat sels[16], int pcount)
- {
- vfloat lc16i = lb + (lg * 32.0f) + (lr * 2048.0f);
- vfloat hc16i = hb + (hg * 32.0f) + (hr * 2048.0f);
- SPMD_SIMPLE_IF(lc16i == hc16i)
- {
- SPMD_SIMPLE_IF(hc16i > 0.0f)
- {
- store(hc16i, hc16i - 1.0f);
- }
- SPMD_SIMPLE_ELSE(hc16i > 0.0f)
- {
- store(hc16i, 0.0f);
- store(lc16i, 1.0f);
- for (uint32_t i = 0; i < 16; i++)
- store(sels[i], 3.0f);
- }
- SPMD_SIMPLE_END_IF
- }
- SPMD_SIMPLE_ELSE(lc16i == hc16i)
- {
- SPMD_SIMPLE_IF(lc16i < hc16i)
- {
- swap(lc16i, hc16i);
- for (uint32_t i = 0; i < 16; i++)
- store(sels[i], 3.0f - sels[i]);
- }
- SPMD_SIMPLE_END_IF
- }
- SPMD_SIMPLE_END_IF
- vfloat sels0 = fix_sels(sels[0], 1.0f) + fix_sels(sels[1], 4.0f) + fix_sels(sels[2], 16.0f) + fix_sels(sels[3], 64.0f) +
- fix_sels(sels[4], 256.0f) + fix_sels(sels[5], 1024.0f) + fix_sels(sels[6], 4096.0f) + fix_sels(sels[7], 16384.0f);
- vfloat sels1 = fix_sels(sels[8], 1.0f) + fix_sels(sels[9], 4.0f) + fix_sels(sels[10], 16.0f) + fix_sels(sels[11], 64.0f) +
- fix_sels(sels[12], 256.0f) + fix_sels(sels[13], 1024.0f) + fix_sels(sels[14], 4096.0f) + fix_sels(sels[15], 16384.0f);
- vint endpoints = vint(lc16i) | VINT_SHIFT_LEFT(vint(hc16i), 16);
- vint selectors = vint(sels0) | VINT_SHIFT_LEFT(vint(sels1), 16);
- //int *pDst_ints = reinterpret_cast<int *>(pDst_blocks);
- //vint vindices(vint(indices) * 2);
- //store(vindices[pDst_ints], endpoints);
- //store((vindices + 1)[pDst_ints], selectors);
- int *pDst_ints = reinterpret_cast<int *>(pDst_blocks) + indices.get_first_value() * 2;
- store_strided(pDst_ints, 2, endpoints);
- store_strided(pDst_ints + 1, 2, selectors);
- }
- CPPSPMD_FORCE_INLINE void encode_bc1_pick_initial(const vcolor3 *pSrc_pixels, const vbool grayscale_flag,
- const vfloat min_r, const vfloat min_g, const vfloat min_b, const vfloat max_r, const vfloat max_g, const vfloat max_b,
- const vfloat avg_r, const vfloat avg_g, const vfloat avg_b, const vfloat total_r, const vfloat total_g, const vfloat total_b,
- vfloat &lr, vfloat &lg, vfloat &lb, vfloat &hr, vfloat &hg, vfloat &hb)
- {
- SPMD_SIMPLE_IF(grayscale_flag)
- {
- // Grayscale blocks are a common enough case to specialize.
- SPMD_SIMPLE_IF( (max_r - min_r) < 2.0f )
- {
- const vfloat fr = pSrc_pixels[0].c[0];
- vfloat fr5 = to_5(fr);
- vfloat fr6 = to_6(fr);
- store_all(lr, fr5);
- store_all(lg, fr6);
- store_all(lb, fr5);
- store_all(hr, fr5);
- store_all(hg, fr6);
- store_all(hb, fr5);
- }
- SPMD_SIMPLE_ELSE( (max_r - min_r) < 2.0f )
- {
- vfloat min_r5 = to_5(min_r);
- vfloat min_r6 = to_6(min_r);
- vfloat max_r5 = to_5(max_r);
- vfloat max_r6 = to_6(max_r);
- store(lr, min_r5);
- store(lg, min_r6);
- store(lb, min_r5);
- store(hr, max_r5);
- store(hg, max_r6);
- store(hb, max_r5);
- }
- SPMD_SIMPLE_END_IF
- }
- SPMD_SIMPLE_ELSE(grayscale_flag)
- {
- // Select 2 colors along the principle axis. (There must be a faster/simpler way.)
- vfloat icov0 = 0.0f, icov1 = 0.0f, icov2 = 0.0f, icov3 = 0.0f, icov4 = 0.0f, icov5 = 0.0f;
- for (uint32_t i = 0; i < 16; i++)
- {
- vfloat r = pSrc_pixels[i].c[0] - avg_r;
- vfloat g = pSrc_pixels[i].c[1] - avg_g;
- vfloat b = pSrc_pixels[i].c[2] - avg_b;
- store_all(icov0, icov0 + r * r);
- store_all(icov1, icov1 + r * g);
- store_all(icov2, icov2 + r * b);
- store_all(icov3, icov3 + g * g);
- store_all(icov4, icov4 + g * b);
- store_all(icov5, icov5 + b * b);
- }
- vfloat saxis_r = 306.0f, saxis_g = 601.0f, saxis_b = 117.0f;
- vfloat xr = max_r - min_r;
- vfloat xg = max_g - min_g;
- vfloat xb = max_b - min_b;
- store_all(xr, spmd_ternaryf(icov2 < 0.0f, -xr, xr));
- store_all(xg, spmd_ternaryf(icov4 < 0.0f, -xg, xg));
- vfloat cov0 = icov0 * (1.0f/255.0f);
- vfloat cov1 = icov1 * (1.0f/255.0f);
- vfloat cov2 = icov2 * (1.0f/255.0f);
- vfloat cov3 = icov3 * (1.0f/255.0f);
- vfloat cov4 = icov4 * (1.0f/255.0f);
- vfloat cov5 = icov5 * (1.0f/255.0f);
- const uint32_t total_power_iters = 4;
- for (uint32_t power_iter = 0; power_iter < total_power_iters; power_iter++)
- {
- vfloat r = xr * cov0 + xg * cov1 + xb * cov2;
- vfloat g = xr * cov1 + xg * cov3 + xb * cov4;
- vfloat b = xr * cov2 + xg * cov4 + xb * cov5;
- store_all(xr, r);
- store_all(xg, g);
- store_all(xb, b);
- }
- vfloat k = max(max(abs(xr), abs(xg)), abs(xb));
- SPMD_SIMPLE_IF(k >= 2.0f)
- {
- vfloat m = 2048.0f / k;
- store(saxis_r, xr * m);
- store(saxis_g, xg * m);
- store(saxis_b, xb * m);
- }
- SPMD_SIMPLE_END_IF
- vfloat br = pSrc_pixels[0].c[0], bg = pSrc_pixels[0].c[1], bb = pSrc_pixels[0].c[2];
- vfloat bdot = br * saxis_r + bg * saxis_g + bb * saxis_b;
- vfloat lo_dot = bdot, hi_dot = bdot;
- vfloat lo_r(br), lo_g(bg), lo_b(bb), hi_r(br), hi_g(bg), hi_b(bb);
- for (uint32_t i = 1; i < 16; i++)
- {
- vfloat r = pSrc_pixels[i].c[0], g = pSrc_pixels[i].c[1], b = pSrc_pixels[i].c[2];
- vfloat dot = r * saxis_r + g * saxis_g + b * saxis_b;
- vbool l = dot < lo_dot;
- vbool h = dot > hi_dot;
- store_all(lo_dot, spmd_ternaryf(l, dot, lo_dot));
- store_all(lo_r, spmd_ternaryf(l, r, lo_r));
- store_all(lo_g, spmd_ternaryf(l, g, lo_g));
- store_all(lo_b, spmd_ternaryf(l, b, lo_b));
- store_all(hi_dot, spmd_ternaryf(h, dot, hi_dot));
- store_all(hi_r, spmd_ternaryf(h, r, hi_r));
- store_all(hi_g, spmd_ternaryf(h, g, hi_g));
- store_all(hi_b, spmd_ternaryf(h, b, hi_b));
- }
- store(lr, to_5(lo_r));
- store(lg, to_6(lo_g));
- store(lb, to_5(lo_b));
- store(hr, to_5(hi_r));
- store(hg, to_6(hi_g));
- store(hb, to_5(hi_b));
- }
- SPMD_SIMPLE_END_IF
- }
- CPPSPMD_FORCE_INLINE void encode_bc1_internal(const lint index, bc1_block* pDst_blocks, const vcolor3 *pSrc_pixels, int pcount)
- {
- const vfloat fr = pSrc_pixels[0].c[0], fg = pSrc_pixels[0].c[1], fb = pSrc_pixels[0].c[2];
- vfloat total_r(fr), total_g(fg), total_b(fb), min_r(fr), min_g(fg), min_b(fb), max_r(fr), max_g(fg), max_b(fb);
- vbool grayscale_flag(true);
- for (uint32_t i = 1; i < 16; i++)
- {
- const vfloat r = pSrc_pixels[i].c[0], g = pSrc_pixels[i].c[1], b = pSrc_pixels[i].c[2];
- store_all(grayscale_flag, grayscale_flag && (r == g) && (r == b));
- store_all(min_r, min(min_r, r));
- store_all(min_g, min(min_g, g));
- store_all(min_b, min(min_b, b));
- store_all(max_r, max(max_r, r));
- store_all(max_g, max(max_g, g));
- store_all(max_b, max(max_b, b));
- store_all(total_r, total_r + r);
- store_all(total_g, total_g + g);
- store_all(total_b, total_b + b);
- }
- vfloat avg_r = floor((total_r + 8) * (1.0f/16.0f));
- vfloat avg_g = floor((total_g + 8) * (1.0f/16.0f));
- vfloat avg_b = floor((total_b + 8) * (1.0f/16.0f));
- vfloat lr, lg, lb, hr, hg, hb;
- encode_bc1_pick_initial(pSrc_pixels, grayscale_flag,
- min_r, min_g, min_b, max_r, max_g, max_b,
- avg_r, avg_g, avg_b, total_r, total_g, total_b,
- lr, lg, lb, hr, hg, hb);
- vfloat sels[16];
- bc1_find_sels4_noerr(pSrc_pixels, lr, lg, lb, hr, hg, hb, sels);
- vfloat trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb;
- vvec3F xl, xh;
- vbool result = compute_least_squares_endpoints4_rgb(pSrc_pixels, sels, &xl, &xh, total_r, total_g, total_b);
- SPMD_SIMPLE_IF(result)
- {
- precise_round_565(xl, xh, trial_hr, trial_hg, trial_hb, trial_lr, trial_lg, trial_lb);
- bc1_find_sels4_noerr(pSrc_pixels, trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb, sels);
- store(lr, trial_lr);
- store(lg, trial_lg);
- store(lb, trial_lb);
- store(hr, trial_hr);
- store(hg, trial_hg);
- store(hb, trial_hb);
- }
- SPMD_SIMPLE_END_IF
- bc1_encode4(index, pDst_blocks, lr, lg, lb, hr, hg, hb, sels, pcount);
- }
- void _call(bc1_block* pDst_blocks, const uint8_t *pSrc_pixels, int n)
- {
- spmd_foreach(0, n,
- [&](const lint index, int pcount)
- {
- const int first_index = index.get_first_value();
- vcolor3 src_pixels[16];
- if ((CPPSPMD_AVX) && (pcount == CPPSPMD::PROGRAM_COUNT))
- {
- int32_t *pSrc_int32 = (int32_t *)pSrc_pixels;
- #if CPPSPMD_AVX2
- vint vindex16 = vint(index) * 16;
- #else
- int32_t *pCur_int32 = pSrc_int32 + first_index * 16;
- #endif
- for (uint32_t i = 0; i < 16; i++)
- {
- #if CPPSPMD_AVX2
- vint v = load_all((vindex16 + i)[pSrc_int32]);
- #else
- vint v = load_all_strided(pCur_int32, 16);
- pCur_int32++;
- #endif
- vfloat r = vfloat(v & 0xFF);
- vfloat g = vfloat(VINT_SHIFT_RIGHT(v, 8) & 0xFF);
- vfloat b = vfloat(VINT_SHIFT_RIGHT(v, 16) & 0xFF);
- store_all(src_pixels[i].c[2], b);
- store_all(src_pixels[i].c[1], g);
- store_all(src_pixels[i].c[0], r);
- }
- }
- else
- {
- const uint8_t *pSrc = pSrc_pixels + first_index * 16 * 4;
- for (int p = 0; p < CPPSPMD::PROGRAM_COUNT; p++)
- {
- if (p < pcount)
- {
- for (uint32_t i = 0; i < 16; i++, pSrc += 4)
- {
- uint32_t v = *(uint32_t *)pSrc;
- ((float *)&src_pixels[i].c[0])[p] = (float)(v & 0xFF);
- ((float *)&src_pixels[i].c[1])[p] = (float)((v >> 8) & 0xFF);
- ((float *)&src_pixels[i].c[2])[p] = (float)((v >> 16) & 0xFF);
- }
- }
- else
- {
- for (uint32_t i = 0; i < 16; i++, pSrc += 4)
- {
- ((float *)&src_pixels[i].c[0])[p] = 0;
- ((float *)&src_pixels[i].c[1])[p] = 0;
- ((float *)&src_pixels[i].c[2])[p] = 0;
- }
- }
- }
- }
- encode_bc1_internal(index, pDst_blocks, src_pixels, pcount);
- }
- );
- }
- };
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement