Untitled

#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
typedef unsigned char       U8;
typedef unsigned short      U16;
typedef unsigned int        U32;
typedef signed char         S8;
typedef signed short        S16;
typedef signed int          S32;
typedef float               F32;
typedef double              F64;
typedef void                (*FuncPtr)(void);
typedef unsigned long       U64;
typedef signed long         S64;
typedef S64                 SPTR;
typedef U64                 UPTR;
inline U64    doubleToBits    (F64 a)         { return *(U64*)&a; }
inline F64    bitsToDouble    (U64 a)         { return *(F64*)&a; }
inline U32    floatToBits     (F32 a)         { return as_int(a); }
inline F32    bitsToFloat     (U32 a)         { return as_float(a); }
inline F32    scale           (F32 a, int b)  { return a * (exp2((float)b)); }
inline int    popc8           (U32 mask);
inline int    popc16          (U32 mask);
inline int    popc32          (U32 mask);
typedef double2 Vec2d;
typedef int2 Vec2i;
typedef float2 Vec2f;
typedef struct Vec3d_
{
    double          x;
    double          y;
    double          z;
} Vec3d;
typedef struct Vec3i_
{
    int             x;
    int             y;
    int             z;
} Vec3i;
typedef struct Vec3f_
{
    float           x;
    float           y;
    float           z;
} Vec3f;
typedef double4 Vec4d;
typedef int4 Vec4i;
typedef float4 Vec4f;
typedef struct Mat2f_
{
    float   m00, m10;
    float   m01, m11;
} Mat2f;
typedef struct Mat3f_
{
    float   m00, m10, m20;
    float   m01, m11, m21;
    float   m02, m12, m22;
} Mat3f;
typedef struct Mat4f_
{
    float   m00, m10, m20, m30;
    float   m01, m11, m21, m31;
    float   m02, m12, m22, m32;
    float   m03, m13, m23, m33;
} Mat4f;
__constant int c_popc8LUT[] =
{
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};
inline int popc8(U32 mask)
{
    ((void)0);
    return c_popc8LUT[mask];
}
inline int popc16(U32 mask)
{
    ((void)0);
    return c_popc8LUT[mask & 0xffu] + c_popc8LUT[mask >> 8];
}
inline int popc32(U32 mask)
{
    int result = c_popc8LUT[mask & 0xffu];
    result += c_popc8LUT[(mask >> 8) & 0xffu];
    result += c_popc8LUT[(mask >> 16) & 0xffu];
    result += c_popc8LUT[mask >> 24];
    return result;
}

enum RenderFlags
{
    RenderFlags_CoarsePass              = 1 << 0,
    RenderFlags_UseCoarseData           = 1 << 1,
    RenderFlags_VisualizeIterations     = 1 << 2,
    RenderFlags_VisualizeRaycastLevel   = 1 << 3,
};
enum AttachSlot
{
    AttachSlot_Contour = 0,
    AttachSlot_Attribute,
    AttachSlot_AO,
    AttachSlot_Max
};
enum
{
    PageBytesLog2   = 13,
    PageBytes       = 1 << PageBytesLog2,
    PageSizeLog2    = PageBytesLog2 - 2,
    PageSize        = 1 << PageSizeLog2,
    TrunksPerBlock  = 512,
};
enum FindMode
{
    FindMode_Load = 0,
    FindMode_Build,
    FindMode_LoadOrBuild,
    FindMode_Unload,
    FindMode_UnloadDeepest,
    FindMode_Max
};
enum BlockInfo
{
    BlockInfo_SliceID = 0,
    BlockInfo_IndexInSlice,
    BlockInfo_BlockPtr,
    BlockInfo_NumAttach,
    BlockInfo_End
};
enum AttachInfo
{
    AttachInfo_Type = 0,
    AttachInfo_Ptr,
    AttachInfo_End
};
enum AttachType
{
    VoidAttach                  = 0,
    ColorNormalDXTAttach        = 1,
    ColorNormalPaletteAttach    = 3,
    BuildDataAttach             = 4,
    ContourAttach               = 6,
    ColorNormalCornerAttach     = 7,
    AOAttach                    = 10,
    AttachType_Max
};
typedef struct OctreeMatrices_
{
    Mat4f           viewportToCamera;
    Mat4f           cameraToOctree;
    Mat4f           octreeToWorld;
    F32             pixelInOctree;
    Mat4f           worldToOctree;
    Mat3f           octreeToWorldN;
    Vec3f           cameraPosition;
    Mat4f           octreeToViewport;
    Mat4f           viewportToOctreeN;
}OctreeMatrices;
typedef struct StandardInput_
{
    Vec2i           frameSize;
    U32             flags;
    S32             batchSize;
    S32             aaRays;
    F32             maxVoxelSize;
    F32             brightness;
    S32             coarseSize;
    Vec2i           coarseFrameSize;
    S32             numPrimaryRays;
    S32             totalWork;
    OctreeMatrices  octreeMatrices;
}StandardInput;

inline F32          fmaxf3          (F32 a, F32 b, F32 c)           { return fmax(fmax(a, b), c); }
inline F32          fminf3          (F32 a, F32 b, F32 c)           { return fmin(fmin(a, b), c); }
inline F32          smoothstep01    (F32 f)                         { return smoothstep(0.f, 1.f, f); }
inline float4   scaleVec4         (const float4* a, const float4* b)  { float4 res = {(*a).x * (*b).x, (*a).y * (*b).y, (*a).z* (*b).z, (*a).w * (*b).w}; return res; }
inline float4    mulMatVec4     (const Mat4f* m, const float4* v);
inline float4    mulMatVec3     (const Mat3f* m, const float4* v);
inline float4    mulMatVec3const(__constant Mat3f* m, const float4* v);
inline Mat3f     extractMat3f   (__constant Mat4f* m);
inline float4    fromABGR        (U32 abgr);
inline U32       toABGR          (float4 v);
inline void      jenkinsMix      (U32* a, U32* b, U32* c);
inline float4    perpendicular   (const float4* v);
inline float4    mulMatVec4     (const Mat4f* m, const float4* v)
{
    float4 res = {
        (*m).m00 * (*v).x + (*m).m01 * (*v).y + (*m).m02 * (*v).z + (*m).m03 * (*v).w,
        (*m).m10 * (*v).x + (*m).m11 * (*v).y + (*m).m12 * (*v).z + (*m).m13 * (*v).w,
        (*m).m20 * (*v).x + (*m).m21 * (*v).y + (*m).m22 * (*v).z + (*m).m23 * (*v).w,
        (*m).m30 * (*v).x + (*m).m31 * (*v).y + (*m).m32 * (*v).z + (*m).m33 * (*v).w};
    return res;
}
inline float4    mulMat4Vec3        (__constant Mat4f* m, const float4* v)
{
    float4 res = {
        (*m).m00 * (*v).x + (*m).m01 * (*v).y + (*m).m02 * (*v).z + (*m).m03,
        (*m).m10 * (*v).x + (*m).m11 * (*v).y + (*m).m12 * (*v).z + (*m).m13,
        (*m).m20 * (*v).x + (*m).m21 * (*v).y + (*m).m22 * (*v).z + (*m).m23,
        0.0f};
    return res;
}
inline float4    mulMatVec3     (const Mat3f* m, const float4* v)
{
    float4 res = {
        (*m).m00 * (*v).x + (*m).m01 * (*v).y + (*m).m02 * (*v).z,
        (*m).m10 * (*v).x + (*m).m11 * (*v).y + (*m).m12 * (*v).z,
        (*m).m20 * (*v).x + (*m).m21 * (*v).y + (*m).m22 * (*v).z,
        0.0f};
    return res;
}
inline float4    mulMatVec3local    (const Mat3f* m, __local const float4* v)
{
    float4 res ={
        (*m).m00 * (*v).x + (*m).m01 * (*v).y + (*m).m02 * (*v).z,
        (*m).m10 * (*v).x + (*m).m11 * (*v).y + (*m).m12 * (*v).z,
        (*m).m20 * (*v).x + (*m).m21 * (*v).y + (*m).m22 * (*v).z,
        0.0f};
    return res;
}
inline float4    mulMatVec3const     (__constant Mat3f* m, const float4* v)
{
    float4 res = {
        (*m).m00 * (*v).x + (*m).m01 * (*v).y + (*m).m02 * (*v).z,
        (*m).m10 * (*v).x + (*m).m11 * (*v).y + (*m).m12 * (*v).z,
        (*m).m20 * (*v).x + (*m).m21 * (*v).y + (*m).m22 * (*v).z,
        0.0f};
    return res;
}
inline Mat3f extractMat3f(__constant Mat4f* m)
{
    Mat3f n;
    n.m00 = (*m).m00; n.m01 = (*m).m01; n.m02 = (*m).m02;
    n.m10 = (*m).m10; n.m11 = (*m).m11; n.m12 = (*m).m12;
    n.m20 = (*m).m20; n.m21 = (*m).m21; n.m22 = (*m).m22;
    return n;
}
inline float4 fromABGR(U32 abgr)
{
    float4 res = {
        (F32)(abgr & 0xFF),
        (F32)((abgr >> 8) & 0xFF),
        (F32)((abgr >> 16) & 0xFF),
        (F32)(abgr >> 24)};
    return res;
}
inline U32 toABGR(float4 v)
{
    v.x = fmin(fmax(v.x, 0.f), 255.f);
    v.y = fmin(fmax(v.y, 0.f), 255.f);
    v.z = fmin(fmax(v.z, 0.f), 255.f);
    v.w = fmin(fmax(v.w, 0.f), 255.f);
    U32 ir = (U32)(v.x);
    U32 ig = (U32)(v.y);
    U32 ib = (U32)(v.z);
    U32 ia = (U32)(v.w);
    return ir | (ig << 8) | (ib << 16) | (ia << 24);
}
inline void jenkinsMix(U32* a, U32* b, U32* c)
{
    *a -= *b; *a -= *c; *a ^= (*c>>13);
    *b -= *c; *b -= *a; *b ^= (*a<<8);
    *c -= *a; *c -= *b; *c ^= (*b>>13);
    *a -= *b; *a -= *c; *a ^= (*c>>12);
    *b -= *c; *b -= *a; *b ^= (*a<<16);
    *c -= *a; *c -= *b; *c ^= (*b>>5);
    *a -= *b; *a -= *c; *a ^= (*c>>3);
    *b -= *c; *b -= *a; *b ^= (*a<<10);
    *c -= *a; *c -= *b; *c ^= (*b>>15);
}
inline float4 perpendicular(const float4* v)
{
    float4 res;
    float vmin = fmin(fabs((*v).x), fmin(fabs((*v).y), fabs((*v).z)));
    if (vmin == fabs((*v).x))
        res = (float4)(0.0f, (*v).z, -(*v).y, 0.0f);
    else if (vmin == fabs((*v).y))
        res = (float4)(-(*v).z, 0.0f, (*v).x, 0.0f);
    else
        res = (float4)((*v).y, -(*v).x, 0.0f, 0.0f);
    return res;
}

typedef struct Ray_
{
    float4      orig;
    float4      dir;
}Ray;
typedef struct CastResult_
{

    float4      pos;
    int         iter;
    __global S32* node;
    int         childIdx;
    int         stackPtr;
}CastResult;
typedef struct CastStack_
{
    F32         tmax[23 + 1];
    intptr_t    node[23 + 1];
}CastStack;
__global S32* readStack(CastStack* castStack, int idx, F32* tmax)
{
    *tmax = (*castStack).tmax[idx];
    intptr_t e = (*castStack).node[idx];
    return (__global S32*)e;
};
void writeStack(CastStack* castStack, int idx, __global S32* node, F32 tmax)
{
    (*castStack).tmax[idx] = tmax;
    (*castStack).node[idx] = (intptr_t) node;
};
void castRayLocal(__global int* g_rootNode, unsigned int rootNodeOfs, CastResult* res, CastStack* stack, volatile __local Ray* ray)
{
    const float epsilon = exp2((float)-23);
    float ray_orig_sz = (*ray).orig.w;
    int iter = 0;

    if (fabs((*ray).dir.x) < epsilon) (*ray).dir.x = copysign(epsilon, (*ray).dir.x);
    if (fabs((*ray).dir.y) < epsilon) (*ray).dir.y = copysign(epsilon, (*ray).dir.y);
    if (fabs((*ray).dir.z) < epsilon) (*ray).dir.z = copysign(epsilon, (*ray).dir.z);


    float tx_coef = 1.0f / -fabs((*ray).dir.x);
    float ty_coef = 1.0f / -fabs((*ray).dir.y);
    float tz_coef = 1.0f / -fabs((*ray).dir.z);
    float tx_bias = tx_coef * (*ray).orig.x;
    float ty_bias = ty_coef * (*ray).orig.y;
    float tz_bias = tz_coef * (*ray).orig.z;


    int octant_mask = 7;
    if ((*ray).dir.x > 0.0f) octant_mask ^= 1, tx_bias = 3.0f * tx_coef - tx_bias;
    if ((*ray).dir.y > 0.0f) octant_mask ^= 2, ty_bias = 3.0f * ty_coef - ty_bias;
    if ((*ray).dir.z > 0.0f) octant_mask ^= 4, tz_bias = 3.0f * tz_coef - tz_bias;


    float t_min = fmax(fmax(2.0f * tx_coef - tx_bias, 2.0f * ty_coef - ty_bias), 2.0f * tz_coef - tz_bias);
    float t_max = fmin(fmin(tx_coef - tx_bias, ty_coef - ty_bias), tz_coef - tz_bias);
    float h = t_max;
    t_min = fmax(t_min, 0.0f);
    t_max = fmin(t_max, 1.0f);

    __global int*   parent  = g_rootNode + rootNodeOfs;
    int2   child_descriptor = {0, 0};
    int    idx              = 0;
    float4 pos              = {1.0f, 1.0f, 1.0f, 0.0f};
    int    scale            = 23 - 1;
    float  scale_exp2       = 0.5f;
    if (1.5f * tx_coef - tx_bias > t_min) idx ^= 1, pos.x = 1.5f;
    if (1.5f * ty_coef - ty_bias > t_min) idx ^= 2, pos.y = 1.5f;
    if (1.5f * tz_coef - tz_bias > t_min) idx ^= 4, pos.z = 1.5f;


   bool bBreak;

    while (scale < 23)
    {


        iter++;
        if (iter > 10000)
            break;

        if (child_descriptor.x == 0)
        {
            child_descriptor = *((__global int2*)(parent));

        }


        float tx_corner = pos.x * tx_coef - tx_bias;
        float ty_corner = pos.y * ty_coef - ty_bias;
        float tz_corner = pos.z * tz_coef - tz_bias;
        float tc_max = fmin(fmin(tx_corner, ty_corner), tz_corner);


        int child_shift = idx ^ octant_mask;
        int child_masks = child_descriptor.x << child_shift;
        if ((child_masks & 0x8000) != 0 && t_min <= t_max)
        {


            if (tc_max * (*ray).dir.w + ray_orig_sz >= scale_exp2)
                break;


            float tv_max = fmin(t_max, tc_max);
            float half = scale_exp2 * 0.5f;
            float tx_center = half * tx_coef + tx_corner;
            float ty_center = half * ty_coef + ty_corner;
            float tz_center = half * tz_coef + tz_corner;

            int contour_mask = child_descriptor.y << child_shift;
            if ((contour_mask & 0x80) != 0)
            {

                int   ofs    = (unsigned int)child_descriptor.y >> 8;
                int   value  = parent[ofs + popc8(contour_mask & 0x7F)];

                float cthick = (float)(unsigned int)value * scale_exp2 * 0.75f;
                float cpos   = (float)(value << 7) * scale_exp2 * 1.5f;
                float cdirx  = (float)(value << 14) * (*ray).dir.x;
                float cdiry  = (float)(value << 20) * (*ray).dir.y;
                float cdirz  = (float)(value << 26) * (*ray).dir.z;
                float tcoef  = 1.0f / (cdirx + cdiry + cdirz);
                float tavg   = tx_center * cdirx + ty_center * cdiry + tz_center * cdirz + cpos;
                float tdiff  = cthick * tcoef;
                t_min  = fmax(t_min,  tcoef * tavg - fabs(tdiff));
                tv_max = fmin(tv_max, tcoef * tavg + fabs(tdiff));
            }


            if (t_min <= tv_max)
            {


                if ((child_masks & 0x0080) == 0)
                 bBreak = true;
           else
                {
                    if (tc_max < h)
                    {

                        writeStack(stack, scale, parent, t_max);

                    }
                    h = tc_max;

                    int ofs = (unsigned int)child_descriptor.x >> 17;
                    if ((child_descriptor.x & 0x10000) != 0)
                    {
                        ofs = parent[ofs * 2];

                    }
                    ofs += popc8(child_masks & 0x7F);
                    parent += ofs * 2;

                    idx = 0;
                    scale--;
                    scale_exp2 = half;
                    if (tx_center > t_min) idx ^= 1, pos.x += scale_exp2;
                    if (ty_center > t_min) idx ^= 2, pos.y += scale_exp2;
                    if (tz_center > t_min) idx ^= 4, pos.z += scale_exp2;

                    t_max = tv_max;
                    child_descriptor.x = 0;
                    //continue;
                }
            }
        }

        if(bBreak)
         break;


        int step_mask = 0;
        if (tx_corner <= tc_max) step_mask ^= 1, pos.x -= scale_exp2;
        if (ty_corner <= tc_max) step_mask ^= 2, pos.y -= scale_exp2;
        if (tz_corner <= tc_max) step_mask ^= 4, pos.z -= scale_exp2;

        t_min = tc_max;
        idx ^= step_mask;

        if ((idx & step_mask) != 0)
        {


            unsigned int differing_bits = 0;
            if ((step_mask & 1) != 0) differing_bits |= as_int(pos.x) ^ as_int(pos.x + scale_exp2);
            if ((step_mask & 2) != 0) differing_bits |= as_int(pos.y) ^ as_int(pos.y + scale_exp2);
            if ((step_mask & 4) != 0) differing_bits |= as_int(pos.z) ^ as_int(pos.z + scale_exp2);
            scale = (as_int((float)differing_bits) >> 23) - 127;
            scale_exp2 = as_float((scale - 23 + 127) << 23);

            parent = readStack(stack, scale, &t_max);


            int shx = as_int(pos.x) >> scale;
            int shy = as_int(pos.y) >> scale;
            int shz = as_int(pos.z) >> scale;
            pos.x = as_float(shx << scale);
            pos.y = as_float(shy << scale);
            pos.z = as_float(shz << scale);
            idx  = (shx & 1) | ((shy & 1) << 1) | ((shz & 1) << 2);

            h = 0.0f;
            child_descriptor.x = 0;
        }

    }

    if (scale >= 23 || iter > 10000)
    {
        t_min = 2.0f;
    }

    if ((octant_mask & 1) == 0) pos.x = 3.0f - scale_exp2 - pos.x;
    if ((octant_mask & 2) == 0) pos.y = 3.0f - scale_exp2 - pos.y;
    if ((octant_mask & 4) == 0) pos.z = 3.0f - scale_exp2 - pos.z;

    (*res).pos.w = t_min;
    (*res).iter = iter;
    (*res).pos.x = fmin(fmax((*ray).orig.x + t_min * (*ray).dir.x, pos.x + epsilon), pos.x + scale_exp2 - epsilon);
    (*res).pos.y = fmin(fmax((*ray).orig.y + t_min * (*ray).dir.y, pos.y + epsilon), pos.y + scale_exp2 - epsilon);
    (*res).pos.z = fmin(fmax((*ray).orig.z + t_min * (*ray).dir.z, pos.z + epsilon), pos.z + scale_exp2 - epsilon);
    (*res).node = parent;
    (*res).childIdx = idx ^ octant_mask ^ 7;
    (*res).stackPtr = scale;
}
void castRay(__global int* g_rootNode, unsigned int rootNodeOfs, CastResult* res, CastStack* stack, volatile Ray* ray)
{
    const float epsilon = exp2((float)-23);
    float ray_orig_sz = (*ray).orig.w;
    int iter = 0;

    if (fabs((*ray).dir.x) < epsilon) (*ray).dir.x = copysign(epsilon, (*ray).dir.x);
    if (fabs((*ray).dir.y) < epsilon) (*ray).dir.y = copysign(epsilon, (*ray).dir.y);
    if (fabs((*ray).dir.z) < epsilon) (*ray).dir.z = copysign(epsilon, (*ray).dir.z);


    float tx_coef = 1.0f / -fabs((*ray).dir.x);
    float ty_coef = 1.0f / -fabs((*ray).dir.y);
    float tz_coef = 1.0f / -fabs((*ray).dir.z);
    float tx_bias = tx_coef * (*ray).orig.x;
    float ty_bias = ty_coef * (*ray).orig.y;
    float tz_bias = tz_coef * (*ray).orig.z;


    int octant_mask = 7;
    if ((*ray).dir.x > 0.0f) octant_mask ^= 1, tx_bias = 3.0f * tx_coef - tx_bias;
    if ((*ray).dir.y > 0.0f) octant_mask ^= 2, ty_bias = 3.0f * ty_coef - ty_bias;
    if ((*ray).dir.z > 0.0f) octant_mask ^= 4, tz_bias = 3.0f * tz_coef - tz_bias;


    float t_min = fmax(fmax(2.0f * tx_coef - tx_bias, 2.0f * ty_coef - ty_bias), 2.0f * tz_coef - tz_bias);
    float t_max = fmin(fmin(tx_coef - tx_bias, ty_coef - ty_bias), tz_coef - tz_bias);
    float h = t_max;
    t_min = fmax(t_min, 0.0f);
    t_max = fmin(t_max, 1.0f);

    __global int*   parent          = g_rootNode + rootNodeOfs;
    int2   child_descriptor = {0, 0};
    int    idx              = 0;
    float4 pos              = {1.0f, 1.0f, 1.0f, 0.0f};
    int    scale            = 23 - 1;
    float  scale_exp2       = 0.5f;
    if (1.5f * tx_coef - tx_bias > t_min) idx ^= 1, pos.x = 1.5f;
    if (1.5f * ty_coef - ty_bias > t_min) idx ^= 2, pos.y = 1.5f;
    if (1.5f * tz_coef - tz_bias > t_min) idx ^= 4, pos.z = 1.5f;


    while (scale < 23)
    {


        iter++;
        if (iter > 10000)
            break;

        if (child_descriptor.x == 0)
        {
            child_descriptor = *(__global int2*)(parent);

        }


        float tx_corner = pos.x * tx_coef - tx_bias;
        float ty_corner = pos.y * ty_coef - ty_bias;
        float tz_corner = pos.z * tz_coef - tz_bias;
        float tc_max = fmin(fmin(tx_corner, ty_corner), tz_corner);


        int child_shift = idx ^ octant_mask;
        int child_masks = child_descriptor.x << child_shift;
        if ((child_masks & 0x8000) != 0 && t_min <= t_max)
        {


            if (tc_max * (*ray).dir.w + ray_orig_sz >= scale_exp2)
                break;


            float tv_max = fmin(t_max, tc_max);
            float half = scale_exp2 * 0.5f;
            float tx_center = half * tx_coef + tx_corner;
            float ty_center = half * ty_coef + ty_corner;
            float tz_center = half * tz_coef + tz_corner;

            int contour_mask = child_descriptor.y << child_shift;
            if ((contour_mask & 0x80) != 0)
            {

                int   ofs    = (unsigned int)child_descriptor.y >> 8;
                int   value  = parent[ofs + popc8(contour_mask & 0x7F)];

                float cthick = (float)(unsigned int)value * scale_exp2 * 0.75f;
                float cpos   = (float)(value << 7) * scale_exp2 * 1.5f;
                float cdirx  = (float)(value << 14) * (*ray).dir.x;
                float cdiry  = (float)(value << 20) * (*ray).dir.y;
                float cdirz  = (float)(value << 26) * (*ray).dir.z;
                float tcoef  = 1.0f / (cdirx + cdiry + cdirz);
                float tavg   = tx_center * cdirx + ty_center * cdiry + tz_center * cdirz + cpos;
                float tdiff  = cthick * tcoef;
                t_min  = fmax(t_min,  tcoef * tavg - fabs(tdiff));
                tv_max = fmin(tv_max, tcoef * tavg + fabs(tdiff));
            }


            if (t_min <= tv_max)
            {


                if ((child_masks & 0x0080) == 0)
                    break;


                if (tc_max < h)
                {

                    writeStack(stack, scale, parent, t_max);

                }
                h = tc_max;

                int ofs = (unsigned int)child_descriptor.x >> 17;
                if ((child_descriptor.x & 0x10000) != 0)
                {
                    ofs = parent[ofs * 2];

                }
                ofs += popc8(child_masks & 0x7F);
                parent += ofs * 2;

                idx = 0;
                scale--;
                scale_exp2 = half;
                if (tx_center > t_min) idx ^= 1, pos.x += scale_exp2;
                if (ty_center > t_min) idx ^= 2, pos.y += scale_exp2;
                if (tz_center > t_min) idx ^= 4, pos.z += scale_exp2;

                t_max = tv_max;
                child_descriptor.x = 0;
                continue;
            }
        }


        int step_mask = 0;
        if (tx_corner <= tc_max) step_mask ^= 1, pos.x -= scale_exp2;
        if (ty_corner <= tc_max) step_mask ^= 2, pos.y -= scale_exp2;
        if (tz_corner <= tc_max) step_mask ^= 4, pos.z -= scale_exp2;

        t_min = tc_max;
        idx ^= step_mask;

        if ((idx & step_mask) != 0)
        {


            unsigned int differing_bits = 0;
            if ((step_mask & 1) != 0) differing_bits |= as_int(pos.x) ^ as_int(pos.x + scale_exp2);
            if ((step_mask & 2) != 0) differing_bits |= as_int(pos.y) ^ as_int(pos.y + scale_exp2);
            if ((step_mask & 4) != 0) differing_bits |= as_int(pos.z) ^ as_int(pos.z + scale_exp2);
            scale = (as_int((float)differing_bits) >> 23) - 127;
            scale_exp2 = as_float((scale - 23 + 127) << 23);

            parent = readStack(stack, scale, &t_max);


            int shx = as_int(pos.x) >> scale;
            int shy = as_int(pos.y) >> scale;
            int shz = as_int(pos.z) >> scale;
            pos.x = as_float(shx << scale);
            pos.y = as_float(shy << scale);
            pos.z = as_float(shz << scale);
            idx  = (shx & 1) | ((shy & 1) << 1) | ((shz & 1) << 2);

            h = 0.0f;
            child_descriptor.x = 0;
        }

    }

    if (scale >= 23 || iter > 10000)
    {
        t_min = 2.0f;
    }

    if ((octant_mask & 1) == 0) pos.x = 3.0f - scale_exp2 - pos.x;
    if ((octant_mask & 2) == 0) pos.y = 3.0f - scale_exp2 - pos.y;
    if ((octant_mask & 4) == 0) pos.z = 3.0f - scale_exp2 - pos.z;

    (*res).pos.w = t_min;
    (*res).iter = iter;
    (*res).pos.x = fmin(fmax((*ray).orig.x + t_min * (*ray).dir.x, pos.x + epsilon), pos.x + scale_exp2 - epsilon);
    (*res).pos.y = fmin(fmax((*ray).orig.y + t_min * (*ray).dir.y, pos.y + epsilon), pos.y + scale_exp2 - epsilon);
    (*res).pos.z = fmin(fmax((*ray).orig.z + t_min * (*ray).dir.z, pos.z + epsilon), pos.z + scale_exp2 - epsilon);
    (*res).node = parent;
    (*res).childIdx = idx ^ octant_mask ^ 7;
    (*res).stackPtr = scale;
}

inline float4    decodeRawNormal         (U32 value);
inline float4    decodeDXTColor          (U64 block, int texelIdx);
inline float4    decodeDXTNormal         (U64 blockA, U64 blockB, int texelIdx);
inline float     decodeAO                (U64 block, int texelIdx);
void             lookupVoxelColorNormal  (float4* colorRes, float4* normalRes, const CastResult* castRes, const CastStack* stack);
void             lookupVoxelAO           (float* res, const CastResult* castRes, const CastStack* stack);
inline float4 decodeRawNormal(U32 value)
{
    S32 sign = (S32)value >> 31;
    F32 t = (F32)(sign ^ 0x7fffffff);
    F32 u = (F32)((S32)value << 3);
    F32 v = (F32)((S32)value << 18);
    float4 result = { t, u, v, 0.0f };
    if ((value & 0x20000000) != 0)
        result.x = v, result.y = t, result.z = u;
    else if ((value & 0x40000000) != 0)
        result.x = u, result.y = v, result.z = t;
    return result;
}
__constant F32 c_dxtColorCoefs[4] =
{
    1.0f / (F32)(1 << 24),
    0.0f,
    2.0f / (F32)(3 << 24),
    1.0f / (F32)(3 << 24),
};
inline float4 decodeDXTColor(U64 block, int texelIdx)
{
    U32 head = (U32)block;
    U32 bits = (U32)(block >> 32);
    F32 c0 = c_dxtColorCoefs[(bits >> (texelIdx * 2)) & 3];
    F32 c1 = 1.0f / (F32)(1 << 24) - c0;
    float4 res = {
        c0 * (F32)(head << 16) + c1 * (F32)head,
        c0 * (F32)(head << 21) + c1 * (F32)(head << 5),
        c0 * (F32)(head << 27) + c1 * (F32)(head << 11),
        0.0f};
    return res;
}
__constant F32 c_dxtNormalCoefs[4] = {
    -1.0f,
    -1.0f / 3.0f,
    +1.0f / 3.0f,
    +1.0f,
};
inline float4 decodeDXTNormal(U64 blockA, U64 blockB, int texelIdx)
{
    U32 headBase = (U32)blockA;
    U32 headUV   = (U32)blockB;
    U32 bitsU    = (U32)(blockA >> 32);
    U32 bitsV    = (U32)(blockB >> 32);
    int shift = texelIdx * 2;
    F32 cu = c_dxtNormalCoefs[(bitsU >> shift) & 3];
    F32 cv = c_dxtNormalCoefs[(bitsV >> shift) & 3];
    cu *= as_float(((headUV & 15) + (127 + 3 - 13)) << 23);
    cv *= as_float((((headUV >> 16) & 15) + (127 + 3 - 13)) << 23);
    float4 base = decodeRawNormal(headBase);
    float4 res = {
        base.x + cu * (F32)(S32)(headUV << 16) + cv * (F32)(S32)headUV,
        base.y + cu * (F32)(S32)(headUV << 20) + cv * (F32)(S32)(headUV << 4),
        base.z + cu * (F32)(S32)(headUV << 24) + cv * (F32)(S32)(headUV << 8),
        0.0f};
    return res;
}
__constant F32 c_dxtAOCoefs[8] = {
    0.f,
    1.f / ((F32)(1 << 24) * 255.f * 7.f),
    2.f / ((F32)(1 << 24) * 255.f * 7.f),
    3.f / ((F32)(1 << 24) * 255.f * 7.f),
    4.f / ((F32)(1 << 24) * 255.f * 7.f),
    5.f / ((F32)(1 << 24) * 255.f * 7.f),
    6.f / ((F32)(1 << 24) * 255.f * 7.f),
    1.f / ((F32)(1 << 24) * 255.f)
};
inline float decodeAO(U64 block, int texelIdx)
{
    F32 c0 = c_dxtAOCoefs[((U32)(block >> (texelIdx * 3 + 16))) & 7];
    F32 c1 = c_dxtAOCoefs[7] - c0;
    return c0 * (F32)((U32)block << 16) + c1 * (F32)((U32)block << 24);
}
void lookupVoxelColorNormal(float4* colorRes, float4* normalRes, const CastResult* castRes, const CastStack* stack)
{

    __global S32* pageHeader  = (__global S32*)((intptr_t)(*castRes).node & -PageBytes);

    __global S32* blockInfo   = pageHeader + *pageHeader;
    __global S32* blockStart  = blockInfo + blockInfo[BlockInfo_BlockPtr];
    __global S32* attachInfos = blockInfo + BlockInfo_End;
    __global S32* attachInfo  = attachInfos + AttachInfo_End * AttachSlot_Attribute;
    __global S32* attachData  = blockInfo + attachInfo[AttachInfo_Ptr];
    ptrdiff_t foo = (ptrdiff_t)((*castRes).node - blockStart);
    foo = foo >> 2;
    foo *= 6;
    __global S32* bar = attachData + foo;
    __global U64* dxtBlock = (__global U64*) bar;


    U64 colorBlock = dxtBlock[0];
    U64 normalBlockA = dxtBlock[1];
    U64 normalBlockB = dxtBlock[2];


    int texelIdx = (*castRes).childIdx | (int)((((*castRes).node - pageHeader) & 2) << 2);
    float4 tmp = decodeDXTColor(colorBlock, texelIdx);
    *colorRes = (float4)(tmp.x, tmp.y, tmp.z, 255.0f);
    *normalRes = decodeDXTNormal(normalBlockA, normalBlockB, texelIdx);
}
void lookupVoxelAO( float* res, const CastResult* castRes, const CastStack* stack)
{

    __global S32* pageHeader  = (__global S32*)((S32)(*castRes).node & -PageBytes);
    __global S32* blockInfo   = pageHeader + *pageHeader;
    __global S32* blockStart  = blockInfo + blockInfo[BlockInfo_BlockPtr];
    __global S32* attachInfos = blockInfo + BlockInfo_End;
    __global S32* attachInfo  = attachInfos + AttachInfo_End * AttachSlot_AO;
    __global S32* attachData  = blockInfo + attachInfo[AttachInfo_Ptr];

    __global U64* dxtBlock    = (__global U64*)(attachData + (((*castRes).node - blockStart) >> 2) * 2);


    U64 block = dxtBlock[0];

    int texelIdx = (*castRes).childIdx | ((((*castRes).node - pageHeader) & 2) << 2);
    *res = decodeAO(block, texelIdx);
}
typedef struct Aux_
{
    __global U32*        framePtr;
    float       vSizeMultiplier;
    union
    {
        S32     fetchWorkTemp;
        Ray     ray;
        struct
        {
            U32 color;
            U32 alpha;
        }       aa;
    };
}Aux;
Ray constructPrimaryRay(__constant StandardInput* input, __global float* g_frameCoarse, int ppos, int ridx, volatile __local Aux* aux)
{
    float vsize = (*input).maxVoxelSize;
    int xsize = (*input).frameSize.x;

    if ((*input).flags & RenderFlags_CoarsePass)
    {
        vsize = (float)(*input).coarseSize * 2.83f;
        xsize = (*input).coarseFrameSize.x;
    } else
    {

        U32 a = ppos;
        U32 b = ridx;
        U32 c = 0x9e3779b9u;
        jenkinsMix(&a, &b, &c);
        float f = (float)c / ((float)(1u << 31) * 2.f);
        f = .5f + .5f*f;
        (*aux).vSizeMultiplier = f;
        vsize *= f;
    }

    int pixely = ppos / xsize;
    int pixelx = ppos - (pixely * xsize);
    F32 fx = pixelx;
    F32 fy = pixely;
    if ((*input).flags & RenderFlags_CoarsePass)
    {
        fx *= (float)(*input).coarseSize;
        fy *= (float)(*input).coarseSize;
    }
    else
    {
        if ((*input).aaRays == 1)
        {
            fx += .5f;
            fy += .5f;
        }


    }
    F32 tmin = 0.f;
    if ((*input).flags & RenderFlags_UseCoarseData)
    {

        int bx = pixelx / (*input).coarseSize;
        int by = pixely / (*input).coarseSize;
        int bidx = bx + by * (*input).coarseFrameSize.x;
        F32 tmin0 = g_frameCoarse[bidx];
        F32 tmin1 = g_frameCoarse[bidx+1];
        F32 tmin2 = g_frameCoarse[bidx+(*input).coarseFrameSize.x];
        F32 tmin3 = g_frameCoarse[bidx+(*input).coarseFrameSize.x+1];
        tmin = fmin(fmin(tmin0, tmin1), fmin(tmin2, tmin3));
        tmin = fmin(tmin, 0.9999f);
    }
    __constant Mat4f* vtc = &(*input).octreeMatrices.viewportToCamera;
    __constant Mat4f* cto = &(*input).octreeMatrices.cameraToOctree;
    float4 pos = {
        (*vtc).m00 * fx + (*vtc).m01 * fy + (*vtc).m03,
        (*vtc).m10 * fx + (*vtc).m11 * fy + (*vtc).m13,
        (*vtc).m20 * fx + (*vtc).m21 * fy + (*vtc).m23,
        (*vtc).m30 * fx + (*vtc).m31 * fy + (*vtc).m33};
    float4 near = {
        pos.x - (*vtc).m02,
        pos.y - (*vtc).m12,
        pos.z - (*vtc).m22,
        (*input).octreeMatrices.pixelInOctree * vsize};

    float4 diff = {
        (*vtc).m32 * pos.x - (*vtc).m02 * pos.w,
        (*vtc).m32 * pos.y - (*vtc).m12 * pos.w,
        (*vtc).m32 * pos.z - (*vtc).m22 * pos.w,
        near.w * (*vtc).m32};

    float a = 1.0f / (pos.w - (*vtc).m32);
    float b = 2.0f * a / fmax(pos.w + (*vtc).m32, 1.0e-8f);
    float c = tmin * b;
    Ray ray;
    ray.orig = near * a - diff * c;
    ray.dir  = diff * (c - b);
    ray.orig.xyz = (mulMat4Vec3(cto, &ray.orig)).xyz;
    ray.dir = (float4)(
        (*cto).m00 * ray.dir.x + (*cto).m01 * ray.dir.y + (*cto).m02 * ray.dir.z,
        (*cto).m10 * ray.dir.x + (*cto).m11 * ray.dir.y + (*cto).m12 * ray.dir.z,
        (*cto).m20 * ray.dir.x + (*cto).m21 * ray.dir.y + (*cto).m22 * ray.dir.z,
        ray.dir.w);
    return ray;
}
U32 processPrimaryRay(__constant StandardInput* input, __global int * g_rootNode, unsigned int rootNodeOfs,
                    volatile __local Ray* ray, float vSizeMultiplier)
{

    CastResult castRes;
    CastStack stack;
    castRayLocal(g_rootNode, rootNodeOfs, &castRes, &stack, ray);

    if ((*input).flags & RenderFlags_VisualizeIterations)
    {
        F32 v = 255.0f * (F32)castRes.iter / 64.0f;
        return toABGR((float4)(v, v, v, 0.0f));
    }
    else if ((*input).flags & RenderFlags_VisualizeRaycastLevel)
    {
        F32 v = 0.0f;
        if (castRes.pos.w <= 1.0f)
            v = 255.0f - ((F32)23 - (F32)castRes.stackPtr) * (255.0f / 18.0f);
        return toABGR((float4)(v * 0.5f, v, v * 0.5f, 0.0f));
    }

    float4 L = {0.3643f, 0.3535f, 0.8616f, 0.0f};
    Mat3f m3f = extractMat3f(&(*input).octreeMatrices.octreeToWorld);
    float4 I = normalize(mulMatVec3local(&m3f, &(*ray).dir));


    if (castRes.pos.w > 1.0f)
    {
        float4 c;
        if (I.y >= 0.f)
        {
            float4 horz = { 179.0f, 205.0f, 253.0f, 0.0f };
            float4 zen  = { 77.0f,  102.0f, 179.0f, 0.0f };
            c = horz + (zen - horz) * I.y * I.y;
            c *= 2.5f;
        }
        else
        {
            float4 horz = { 192.0f, 154.0f, 102.0f, 0.0f };
            float4 zen  = { 128.0f, 102.0f, 77.0f, 0.0f };
            c = horz - (zen - horz) * I.y;
        }
        c *= fmax(L.y, 0.0f);
        float IL = dot(I, L);
        if (IL > 0.0f)
        {
            float4 f = {255.0f, 179.0f, 102.0f, 0.0f};
            c += f * pow(IL, 1000.0f);
        }

        return toABGR((float4)(c.x, c.y, c.z, 0.0f));
    }

    float4 voxelColor;
    float4 voxelNormal;

    lookupVoxelColorNormal(&voxelColor, &voxelNormal, &castRes, &stack);

    return toABGR(voxelColor);
    F32 voxelAmbient = 1.0f;
    lookupVoxelAO(&voxelAmbient, &castRes, &stack);

    float4 N  = normalize(mulMatVec3const(&(*input).octreeMatrices.octreeToWorldN, &voxelNormal));
    float4 R  = (I - N * (dot(N, I) * 2.0f));
    F32    LN = dot(L, N);

    bool shadow = (LN <= 0.0f);

    float4 shadedColor = voxelColor * (voxelAmbient * (0.25f + LN * ((LN < 0.0f) ? 0.15f : (shadow) ? 0.25f : 1.0f)));
    if (!shadow)
        shadedColor += (float4)(32.f, 32.f, 32.f, 0.0f) * pow(fmax(dot(L, R), 0.0f), 18.0f);
    shadedColor *= (*input).brightness;
    U32 color = toABGR(shadedColor);

    float vSize = (F32)(1 << castRes.stackPtr) / (F32)(1 << 23);
    float pSize = (*ray).orig.w + castRes.pos.w * (*ray).dir.w;
    vSize *= vSizeMultiplier;
    float blurRadius = max(vSize / pSize * (*input).maxVoxelSize, 1.0f);

    shadedColor.w = log2(blurRadius) * 32.0f + 0.5f;
    return toABGR(shadedColor);
}
void fetchWorkFirst(int* warp, int* batchCounter, __global int* g_warpCounter, int batchSize, volatile __local S32* sharedTemp)
{
    *warp = get_local_id(1) + get_group_id(0) * 2;

    *batchCounter = 0;
}
void fetchWorkNext(int* warp, int* batchCounter, __global int* g_warpCounter, int batchSize, volatile __local S32* sharedTemp)
{
    *warp = 0x03FFFFFF;
}
__kernel void renderKernel(__constant StandardInput* input, __global int* g_warpCounter, __global int* g_activeWarps,
                    __global int* g_coarseIndexToPixel, __global float* g_frameCoarse,
                    __global int* g_indexToPixel, __global unsigned int* g_frame,
                    __global unsigned int* g_aaSampleBuffer,
                    __global int* g_rootNode, unsigned int rootNodeOfs)
{
    int dummy = (*input).totalWork;


    __local Aux auxbuf[32 * 2];
    volatile __local Aux* aux0 = &auxbuf[32 * get_local_id(1)];
    volatile __local Aux* aux  = &auxbuf[get_local_id(0) + 32 * get_local_id(1)];


    int warp, batchCounter;
    fetchWorkFirst(&warp, &batchCounter, g_warpCounter, (*input).batchSize, (volatile __local S32*) &(*aux0).fetchWorkTemp);
    if (warp * 32 >= (*input).totalWork)
        return;

    for (;;)
    {

        int ridx = warp * 32 + get_local_id(0);
        if (ridx >= (*input).totalWork)
            return;


        int pidx = ridx;
        int ppos;
        if ((*input).flags & RenderFlags_CoarsePass)
        {
            ppos = g_coarseIndexToPixel[pidx];
            (*aux).framePtr = (__global U32*)g_frameCoarse + ppos;
        }
        else
        {
            ppos = g_indexToPixel[pidx];
            (*aux).framePtr = (__global U32*)g_frame + ppos;
        }

        Ray ray = constructPrimaryRay(input, g_frameCoarse, ppos, ridx, aux);
        (*aux).ray.orig.x   = ray.orig.x;
        (*aux).ray.orig.y   = ray.orig.y;
        (*aux).ray.orig.z   = ray.orig.z;
        (*aux).ray.dir.x    = ray.dir.x;
        (*aux).ray.dir.y    = ray.dir.y;
        (*aux).ray.dir.z    = ray.dir.z;
        (*aux).ray.orig.w   = ray.orig.w;
        (*aux).ray.dir.w    = ray.dir.w;
        if ((*input).flags & RenderFlags_CoarsePass)
        {
            CastResult castRes;
            CastStack stack;
            castRayLocal(g_rootNode, rootNodeOfs, &castRes, &stack, (volatile __local Ray*) &((*aux).ray));
            if (castRes.pos.w < 1.0f)
            {
                F32 size = (F32)(1 << castRes.stackPtr) / (F32)(1 << 23);
                castRes.pos.w -= size / length((float4)((*aux).ray.dir.xyz, 0.0f)) * 0.5f;
            }
            *((__global float*)(*aux).framePtr) = max(castRes.pos.w, 0.0f);
        }
        else
        {

            U32 color; // = processPrimaryRay(input, g_rootNode, rootNodeOfs, (volatile __local Ray*) &(*aux).ray, (*aux).vSizeMultiplier);

            if ((*input).aaRays == 1)
            {
                *(*aux).framePtr = color;
            }
            else
            {

                U32 resc = (color & 0xff) | ((color & 0xff00) << 2) | ((color & 0xff0000) << 4);
                (*aux).aa.color = resc;
                (*aux).aa.alpha = color;

                if ((get_local_id(0) & 3) == 0)
                {

                    U32 resc0 = aux[0].aa.color;
                    U32 resc1 = aux[1].aa.color;
                    U32 resc2 = aux[2].aa.color;
                    U32 resc3 = aux[3].aa.color;
                    resc = (resc0 + resc1 + resc2 + resc3);
                    resc = ((resc >> 2) & 0xff) | ((resc >> 4) & 0xff00) | ((resc >> 6) & 0xff0000);

                    U32 resa0 = aux[0].aa.alpha;
                    U32 resa1 = aux[1].aa.alpha;
                    U32 resa2 = aux[2].aa.alpha;
                    U32 resa3 = aux[3].aa.alpha;
                    U32 resa = min(min(resa0, resa1), min(resa2, resa3));;

                    *(*aux).framePtr = (resa & 0xff000000) | resc;
                }
            }
        }

        fetchWorkNext(&warp, &batchCounter, g_warpCounter, (*input).batchSize, (volatile __local S32*) &(*aux0).fetchWorkTemp);
    }


    const int nWidth = get_global_size(0);

    const int xOut = get_global_id(0);
    const int yOut = get_global_id(1);

    const int idxOut = yOut * nWidth + xOut;

    g_frame[idxOut] = 1000000;
}