Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #pragma OPENCL EXTENSION cl_khr_fp64 : enable
- #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
- typedef unsigned char U8;
- typedef unsigned short U16;
- typedef unsigned int U32;
- typedef signed char S8;
- typedef signed short S16;
- typedef signed int S32;
- typedef float F32;
- typedef double F64;
- typedef void (*FuncPtr)(void);
- typedef unsigned long U64;
- typedef signed long S64;
- typedef S64 SPTR;
- typedef U64 UPTR;
- inline U64 doubleToBits (F64 a) { return *(U64*)&a; }
- inline F64 bitsToDouble (U64 a) { return *(F64*)&a; }
- inline U32 floatToBits (F32 a) { return as_int(a); }
- inline F32 bitsToFloat (U32 a) { return as_float(a); }
- inline F32 scale (F32 a, int b) { return a * (exp2((float)b)); }
- inline int popc8 (U32 mask);
- inline int popc16 (U32 mask);
- inline int popc32 (U32 mask);
- typedef double2 Vec2d;
- typedef int2 Vec2i;
- typedef float2 Vec2f;
- typedef struct Vec3d_
- {
- double x;
- double y;
- double z;
- } Vec3d;
- typedef struct Vec3i_
- {
- int x;
- int y;
- int z;
- } Vec3i;
- typedef struct Vec3f_
- {
- float x;
- float y;
- float z;
- } Vec3f;
- typedef double4 Vec4d;
- typedef int4 Vec4i;
- typedef float4 Vec4f;
- typedef struct Mat2f_
- {
- float m00, m10;
- float m01, m11;
- } Mat2f;
- typedef struct Mat3f_
- {
- float m00, m10, m20;
- float m01, m11, m21;
- float m02, m12, m22;
- } Mat3f;
- typedef struct Mat4f_
- {
- float m00, m10, m20, m30;
- float m01, m11, m21, m31;
- float m02, m12, m22, m32;
- float m03, m13, m23, m33;
- } Mat4f;
- __constant int c_popc8LUT[] =
- {
- 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
- 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
- 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
- 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
- 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
- 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
- 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
- 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
- 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
- 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
- };
- inline int popc8(U32 mask)
- {
- ((void)0);
- return c_popc8LUT[mask];
- }
- inline int popc16(U32 mask)
- {
- ((void)0);
- return c_popc8LUT[mask & 0xffu] + c_popc8LUT[mask >> 8];
- }
- inline int popc32(U32 mask)
- {
- int result = c_popc8LUT[mask & 0xffu];
- result += c_popc8LUT[(mask >> 8) & 0xffu];
- result += c_popc8LUT[(mask >> 16) & 0xffu];
- result += c_popc8LUT[mask >> 24];
- return result;
- }
- enum RenderFlags
- {
- RenderFlags_CoarsePass = 1 << 0,
- RenderFlags_UseCoarseData = 1 << 1,
- RenderFlags_VisualizeIterations = 1 << 2,
- RenderFlags_VisualizeRaycastLevel = 1 << 3,
- };
- enum AttachSlot
- {
- AttachSlot_Contour = 0,
- AttachSlot_Attribute,
- AttachSlot_AO,
- AttachSlot_Max
- };
- enum
- {
- PageBytesLog2 = 13,
- PageBytes = 1 << PageBytesLog2,
- PageSizeLog2 = PageBytesLog2 - 2,
- PageSize = 1 << PageSizeLog2,
- TrunksPerBlock = 512,
- };
- enum FindMode
- {
- FindMode_Load = 0,
- FindMode_Build,
- FindMode_LoadOrBuild,
- FindMode_Unload,
- FindMode_UnloadDeepest,
- FindMode_Max
- };
- enum BlockInfo
- {
- BlockInfo_SliceID = 0,
- BlockInfo_IndexInSlice,
- BlockInfo_BlockPtr,
- BlockInfo_NumAttach,
- BlockInfo_End
- };
- enum AttachInfo
- {
- AttachInfo_Type = 0,
- AttachInfo_Ptr,
- AttachInfo_End
- };
- enum AttachType
- {
- VoidAttach = 0,
- ColorNormalDXTAttach = 1,
- ColorNormalPaletteAttach = 3,
- BuildDataAttach = 4,
- ContourAttach = 6,
- ColorNormalCornerAttach = 7,
- AOAttach = 10,
- AttachType_Max
- };
- typedef struct OctreeMatrices_
- {
- Mat4f viewportToCamera;
- Mat4f cameraToOctree;
- Mat4f octreeToWorld;
- F32 pixelInOctree;
- Mat4f worldToOctree;
- Mat3f octreeToWorldN;
- Vec3f cameraPosition;
- Mat4f octreeToViewport;
- Mat4f viewportToOctreeN;
- }OctreeMatrices;
- typedef struct StandardInput_
- {
- Vec2i frameSize;
- U32 flags;
- S32 batchSize;
- S32 aaRays;
- F32 maxVoxelSize;
- F32 brightness;
- S32 coarseSize;
- Vec2i coarseFrameSize;
- S32 numPrimaryRays;
- S32 totalWork;
- OctreeMatrices octreeMatrices;
- }StandardInput;
- inline F32 fmaxf3 (F32 a, F32 b, F32 c) { return fmax(fmax(a, b), c); }
- inline F32 fminf3 (F32 a, F32 b, F32 c) { return fmin(fmin(a, b), c); }
- inline F32 smoothstep01 (F32 f) { return smoothstep(0.f, 1.f, f); }
- inline float4 scaleVec4 (const float4* a, const float4* b) { float4 res = {(*a).x * (*b).x, (*a).y * (*b).y, (*a).z* (*b).z, (*a).w * (*b).w}; return res; }
- inline float4 mulMatVec4 (const Mat4f* m, const float4* v);
- inline float4 mulMatVec3 (const Mat3f* m, const float4* v);
- inline float4 mulMatVec3const(__constant Mat3f* m, const float4* v);
- inline Mat3f extractMat3f (__constant Mat4f* m);
- inline float4 fromABGR (U32 abgr);
- inline U32 toABGR (float4 v);
- inline void jenkinsMix (U32* a, U32* b, U32* c);
- inline float4 perpendicular (const float4* v);
- inline float4 mulMatVec4 (const Mat4f* m, const float4* v)
- {
- float4 res = {
- (*m).m00 * (*v).x + (*m).m01 * (*v).y + (*m).m02 * (*v).z + (*m).m03 * (*v).w,
- (*m).m10 * (*v).x + (*m).m11 * (*v).y + (*m).m12 * (*v).z + (*m).m13 * (*v).w,
- (*m).m20 * (*v).x + (*m).m21 * (*v).y + (*m).m22 * (*v).z + (*m).m23 * (*v).w,
- (*m).m30 * (*v).x + (*m).m31 * (*v).y + (*m).m32 * (*v).z + (*m).m33 * (*v).w};
- return res;
- }
- inline float4 mulMat4Vec3 (__constant Mat4f* m, const float4* v)
- {
- float4 res = {
- (*m).m00 * (*v).x + (*m).m01 * (*v).y + (*m).m02 * (*v).z + (*m).m03,
- (*m).m10 * (*v).x + (*m).m11 * (*v).y + (*m).m12 * (*v).z + (*m).m13,
- (*m).m20 * (*v).x + (*m).m21 * (*v).y + (*m).m22 * (*v).z + (*m).m23,
- 0.0f};
- return res;
- }
- inline float4 mulMatVec3 (const Mat3f* m, const float4* v)
- {
- float4 res = {
- (*m).m00 * (*v).x + (*m).m01 * (*v).y + (*m).m02 * (*v).z,
- (*m).m10 * (*v).x + (*m).m11 * (*v).y + (*m).m12 * (*v).z,
- (*m).m20 * (*v).x + (*m).m21 * (*v).y + (*m).m22 * (*v).z,
- 0.0f};
- return res;
- }
- inline float4 mulMatVec3local (const Mat3f* m, __local const float4* v)
- {
- float4 res ={
- (*m).m00 * (*v).x + (*m).m01 * (*v).y + (*m).m02 * (*v).z,
- (*m).m10 * (*v).x + (*m).m11 * (*v).y + (*m).m12 * (*v).z,
- (*m).m20 * (*v).x + (*m).m21 * (*v).y + (*m).m22 * (*v).z,
- 0.0f};
- return res;
- }
- inline float4 mulMatVec3const (__constant Mat3f* m, const float4* v)
- {
- float4 res = {
- (*m).m00 * (*v).x + (*m).m01 * (*v).y + (*m).m02 * (*v).z,
- (*m).m10 * (*v).x + (*m).m11 * (*v).y + (*m).m12 * (*v).z,
- (*m).m20 * (*v).x + (*m).m21 * (*v).y + (*m).m22 * (*v).z,
- 0.0f};
- return res;
- }
- inline Mat3f extractMat3f(__constant Mat4f* m)
- {
- Mat3f n;
- n.m00 = (*m).m00; n.m01 = (*m).m01; n.m02 = (*m).m02;
- n.m10 = (*m).m10; n.m11 = (*m).m11; n.m12 = (*m).m12;
- n.m20 = (*m).m20; n.m21 = (*m).m21; n.m22 = (*m).m22;
- return n;
- }
- inline float4 fromABGR(U32 abgr)
- {
- float4 res = {
- (F32)(abgr & 0xFF),
- (F32)((abgr >> 8) & 0xFF),
- (F32)((abgr >> 16) & 0xFF),
- (F32)(abgr >> 24)};
- return res;
- }
- inline U32 toABGR(float4 v)
- {
- v.x = fmin(fmax(v.x, 0.f), 255.f);
- v.y = fmin(fmax(v.y, 0.f), 255.f);
- v.z = fmin(fmax(v.z, 0.f), 255.f);
- v.w = fmin(fmax(v.w, 0.f), 255.f);
- U32 ir = (U32)(v.x);
- U32 ig = (U32)(v.y);
- U32 ib = (U32)(v.z);
- U32 ia = (U32)(v.w);
- return ir | (ig << 8) | (ib << 16) | (ia << 24);
- }
- inline void jenkinsMix(U32* a, U32* b, U32* c)
- {
- *a -= *b; *a -= *c; *a ^= (*c>>13);
- *b -= *c; *b -= *a; *b ^= (*a<<8);
- *c -= *a; *c -= *b; *c ^= (*b>>13);
- *a -= *b; *a -= *c; *a ^= (*c>>12);
- *b -= *c; *b -= *a; *b ^= (*a<<16);
- *c -= *a; *c -= *b; *c ^= (*b>>5);
- *a -= *b; *a -= *c; *a ^= (*c>>3);
- *b -= *c; *b -= *a; *b ^= (*a<<10);
- *c -= *a; *c -= *b; *c ^= (*b>>15);
- }
- inline float4 perpendicular(const float4* v)
- {
- float4 res;
- float vmin = fmin(fabs((*v).x), fmin(fabs((*v).y), fabs((*v).z)));
- if (vmin == fabs((*v).x))
- res = (float4)(0.0f, (*v).z, -(*v).y, 0.0f);
- else if (vmin == fabs((*v).y))
- res = (float4)(-(*v).z, 0.0f, (*v).x, 0.0f);
- else
- res = (float4)((*v).y, -(*v).x, 0.0f, 0.0f);
- return res;
- }
- typedef struct Ray_
- {
- float4 orig;
- float4 dir;
- }Ray;
- typedef struct CastResult_
- {
- float4 pos;
- int iter;
- __global S32* node;
- int childIdx;
- int stackPtr;
- }CastResult;
- typedef struct CastStack_
- {
- F32 tmax[23 + 1];
- intptr_t node[23 + 1];
- }CastStack;
- __global S32* readStack(CastStack* castStack, int idx, F32* tmax)
- {
- *tmax = (*castStack).tmax[idx];
- intptr_t e = (*castStack).node[idx];
- return (__global S32*)e;
- };
- void writeStack(CastStack* castStack, int idx, __global S32* node, F32 tmax)
- {
- (*castStack).tmax[idx] = tmax;
- (*castStack).node[idx] = (intptr_t) node;
- };
- void castRayLocal(__global int* g_rootNode, unsigned int rootNodeOfs, CastResult* res, CastStack* stack, volatile __local Ray* ray)
- {
- const float epsilon = exp2((float)-23);
- float ray_orig_sz = (*ray).orig.w;
- int iter = 0;
- if (fabs((*ray).dir.x) < epsilon) (*ray).dir.x = copysign(epsilon, (*ray).dir.x);
- if (fabs((*ray).dir.y) < epsilon) (*ray).dir.y = copysign(epsilon, (*ray).dir.y);
- if (fabs((*ray).dir.z) < epsilon) (*ray).dir.z = copysign(epsilon, (*ray).dir.z);
- float tx_coef = 1.0f / -fabs((*ray).dir.x);
- float ty_coef = 1.0f / -fabs((*ray).dir.y);
- float tz_coef = 1.0f / -fabs((*ray).dir.z);
- float tx_bias = tx_coef * (*ray).orig.x;
- float ty_bias = ty_coef * (*ray).orig.y;
- float tz_bias = tz_coef * (*ray).orig.z;
- int octant_mask = 7;
- if ((*ray).dir.x > 0.0f) octant_mask ^= 1, tx_bias = 3.0f * tx_coef - tx_bias;
- if ((*ray).dir.y > 0.0f) octant_mask ^= 2, ty_bias = 3.0f * ty_coef - ty_bias;
- if ((*ray).dir.z > 0.0f) octant_mask ^= 4, tz_bias = 3.0f * tz_coef - tz_bias;
- float t_min = fmax(fmax(2.0f * tx_coef - tx_bias, 2.0f * ty_coef - ty_bias), 2.0f * tz_coef - tz_bias);
- float t_max = fmin(fmin(tx_coef - tx_bias, ty_coef - ty_bias), tz_coef - tz_bias);
- float h = t_max;
- t_min = fmax(t_min, 0.0f);
- t_max = fmin(t_max, 1.0f);
- __global int* parent = g_rootNode + rootNodeOfs;
- int2 child_descriptor = {0, 0};
- int idx = 0;
- float4 pos = {1.0f, 1.0f, 1.0f, 0.0f};
- int scale = 23 - 1;
- float scale_exp2 = 0.5f;
- if (1.5f * tx_coef - tx_bias > t_min) idx ^= 1, pos.x = 1.5f;
- if (1.5f * ty_coef - ty_bias > t_min) idx ^= 2, pos.y = 1.5f;
- if (1.5f * tz_coef - tz_bias > t_min) idx ^= 4, pos.z = 1.5f;
- bool bBreak;
- while (scale < 23)
- {
- iter++;
- if (iter > 10000)
- break;
- if (child_descriptor.x == 0)
- {
- child_descriptor = *((__global int2*)(parent));
- }
- float tx_corner = pos.x * tx_coef - tx_bias;
- float ty_corner = pos.y * ty_coef - ty_bias;
- float tz_corner = pos.z * tz_coef - tz_bias;
- float tc_max = fmin(fmin(tx_corner, ty_corner), tz_corner);
- int child_shift = idx ^ octant_mask;
- int child_masks = child_descriptor.x << child_shift;
- if ((child_masks & 0x8000) != 0 && t_min <= t_max)
- {
- if (tc_max * (*ray).dir.w + ray_orig_sz >= scale_exp2)
- break;
- float tv_max = fmin(t_max, tc_max);
- float half = scale_exp2 * 0.5f;
- float tx_center = half * tx_coef + tx_corner;
- float ty_center = half * ty_coef + ty_corner;
- float tz_center = half * tz_coef + tz_corner;
- int contour_mask = child_descriptor.y << child_shift;
- if ((contour_mask & 0x80) != 0)
- {
- int ofs = (unsigned int)child_descriptor.y >> 8;
- int value = parent[ofs + popc8(contour_mask & 0x7F)];
- float cthick = (float)(unsigned int)value * scale_exp2 * 0.75f;
- float cpos = (float)(value << 7) * scale_exp2 * 1.5f;
- float cdirx = (float)(value << 14) * (*ray).dir.x;
- float cdiry = (float)(value << 20) * (*ray).dir.y;
- float cdirz = (float)(value << 26) * (*ray).dir.z;
- float tcoef = 1.0f / (cdirx + cdiry + cdirz);
- float tavg = tx_center * cdirx + ty_center * cdiry + tz_center * cdirz + cpos;
- float tdiff = cthick * tcoef;
- t_min = fmax(t_min, tcoef * tavg - fabs(tdiff));
- tv_max = fmin(tv_max, tcoef * tavg + fabs(tdiff));
- }
- if (t_min <= tv_max)
- {
- if ((child_masks & 0x0080) == 0)
- bBreak = true;
- else
- {
- if (tc_max < h)
- {
- writeStack(stack, scale, parent, t_max);
- }
- h = tc_max;
- int ofs = (unsigned int)child_descriptor.x >> 17;
- if ((child_descriptor.x & 0x10000) != 0)
- {
- ofs = parent[ofs * 2];
- }
- ofs += popc8(child_masks & 0x7F);
- parent += ofs * 2;
- idx = 0;
- scale--;
- scale_exp2 = half;
- if (tx_center > t_min) idx ^= 1, pos.x += scale_exp2;
- if (ty_center > t_min) idx ^= 2, pos.y += scale_exp2;
- if (tz_center > t_min) idx ^= 4, pos.z += scale_exp2;
- t_max = tv_max;
- child_descriptor.x = 0;
- //continue;
- }
- }
- }
- if(bBreak)
- break;
- int step_mask = 0;
- if (tx_corner <= tc_max) step_mask ^= 1, pos.x -= scale_exp2;
- if (ty_corner <= tc_max) step_mask ^= 2, pos.y -= scale_exp2;
- if (tz_corner <= tc_max) step_mask ^= 4, pos.z -= scale_exp2;
- t_min = tc_max;
- idx ^= step_mask;
- if ((idx & step_mask) != 0)
- {
- unsigned int differing_bits = 0;
- if ((step_mask & 1) != 0) differing_bits |= as_int(pos.x) ^ as_int(pos.x + scale_exp2);
- if ((step_mask & 2) != 0) differing_bits |= as_int(pos.y) ^ as_int(pos.y + scale_exp2);
- if ((step_mask & 4) != 0) differing_bits |= as_int(pos.z) ^ as_int(pos.z + scale_exp2);
- scale = (as_int((float)differing_bits) >> 23) - 127;
- scale_exp2 = as_float((scale - 23 + 127) << 23);
- parent = readStack(stack, scale, &t_max);
- int shx = as_int(pos.x) >> scale;
- int shy = as_int(pos.y) >> scale;
- int shz = as_int(pos.z) >> scale;
- pos.x = as_float(shx << scale);
- pos.y = as_float(shy << scale);
- pos.z = as_float(shz << scale);
- idx = (shx & 1) | ((shy & 1) << 1) | ((shz & 1) << 2);
- h = 0.0f;
- child_descriptor.x = 0;
- }
- }
- if (scale >= 23 || iter > 10000)
- {
- t_min = 2.0f;
- }
- if ((octant_mask & 1) == 0) pos.x = 3.0f - scale_exp2 - pos.x;
- if ((octant_mask & 2) == 0) pos.y = 3.0f - scale_exp2 - pos.y;
- if ((octant_mask & 4) == 0) pos.z = 3.0f - scale_exp2 - pos.z;
- (*res).pos.w = t_min;
- (*res).iter = iter;
- (*res).pos.x = fmin(fmax((*ray).orig.x + t_min * (*ray).dir.x, pos.x + epsilon), pos.x + scale_exp2 - epsilon);
- (*res).pos.y = fmin(fmax((*ray).orig.y + t_min * (*ray).dir.y, pos.y + epsilon), pos.y + scale_exp2 - epsilon);
- (*res).pos.z = fmin(fmax((*ray).orig.z + t_min * (*ray).dir.z, pos.z + epsilon), pos.z + scale_exp2 - epsilon);
- (*res).node = parent;
- (*res).childIdx = idx ^ octant_mask ^ 7;
- (*res).stackPtr = scale;
- }
- void castRay(__global int* g_rootNode, unsigned int rootNodeOfs, CastResult* res, CastStack* stack, volatile Ray* ray)
- {
- const float epsilon = exp2((float)-23);
- float ray_orig_sz = (*ray).orig.w;
- int iter = 0;
- if (fabs((*ray).dir.x) < epsilon) (*ray).dir.x = copysign(epsilon, (*ray).dir.x);
- if (fabs((*ray).dir.y) < epsilon) (*ray).dir.y = copysign(epsilon, (*ray).dir.y);
- if (fabs((*ray).dir.z) < epsilon) (*ray).dir.z = copysign(epsilon, (*ray).dir.z);
- float tx_coef = 1.0f / -fabs((*ray).dir.x);
- float ty_coef = 1.0f / -fabs((*ray).dir.y);
- float tz_coef = 1.0f / -fabs((*ray).dir.z);
- float tx_bias = tx_coef * (*ray).orig.x;
- float ty_bias = ty_coef * (*ray).orig.y;
- float tz_bias = tz_coef * (*ray).orig.z;
- int octant_mask = 7;
- if ((*ray).dir.x > 0.0f) octant_mask ^= 1, tx_bias = 3.0f * tx_coef - tx_bias;
- if ((*ray).dir.y > 0.0f) octant_mask ^= 2, ty_bias = 3.0f * ty_coef - ty_bias;
- if ((*ray).dir.z > 0.0f) octant_mask ^= 4, tz_bias = 3.0f * tz_coef - tz_bias;
- float t_min = fmax(fmax(2.0f * tx_coef - tx_bias, 2.0f * ty_coef - ty_bias), 2.0f * tz_coef - tz_bias);
- float t_max = fmin(fmin(tx_coef - tx_bias, ty_coef - ty_bias), tz_coef - tz_bias);
- float h = t_max;
- t_min = fmax(t_min, 0.0f);
- t_max = fmin(t_max, 1.0f);
- __global int* parent = g_rootNode + rootNodeOfs;
- int2 child_descriptor = {0, 0};
- int idx = 0;
- float4 pos = {1.0f, 1.0f, 1.0f, 0.0f};
- int scale = 23 - 1;
- float scale_exp2 = 0.5f;
- if (1.5f * tx_coef - tx_bias > t_min) idx ^= 1, pos.x = 1.5f;
- if (1.5f * ty_coef - ty_bias > t_min) idx ^= 2, pos.y = 1.5f;
- if (1.5f * tz_coef - tz_bias > t_min) idx ^= 4, pos.z = 1.5f;
- while (scale < 23)
- {
- iter++;
- if (iter > 10000)
- break;
- if (child_descriptor.x == 0)
- {
- child_descriptor = *(__global int2*)(parent);
- }
- float tx_corner = pos.x * tx_coef - tx_bias;
- float ty_corner = pos.y * ty_coef - ty_bias;
- float tz_corner = pos.z * tz_coef - tz_bias;
- float tc_max = fmin(fmin(tx_corner, ty_corner), tz_corner);
- int child_shift = idx ^ octant_mask;
- int child_masks = child_descriptor.x << child_shift;
- if ((child_masks & 0x8000) != 0 && t_min <= t_max)
- {
- if (tc_max * (*ray).dir.w + ray_orig_sz >= scale_exp2)
- break;
- float tv_max = fmin(t_max, tc_max);
- float half = scale_exp2 * 0.5f;
- float tx_center = half * tx_coef + tx_corner;
- float ty_center = half * ty_coef + ty_corner;
- float tz_center = half * tz_coef + tz_corner;
- int contour_mask = child_descriptor.y << child_shift;
- if ((contour_mask & 0x80) != 0)
- {
- int ofs = (unsigned int)child_descriptor.y >> 8;
- int value = parent[ofs + popc8(contour_mask & 0x7F)];
- float cthick = (float)(unsigned int)value * scale_exp2 * 0.75f;
- float cpos = (float)(value << 7) * scale_exp2 * 1.5f;
- float cdirx = (float)(value << 14) * (*ray).dir.x;
- float cdiry = (float)(value << 20) * (*ray).dir.y;
- float cdirz = (float)(value << 26) * (*ray).dir.z;
- float tcoef = 1.0f / (cdirx + cdiry + cdirz);
- float tavg = tx_center * cdirx + ty_center * cdiry + tz_center * cdirz + cpos;
- float tdiff = cthick * tcoef;
- t_min = fmax(t_min, tcoef * tavg - fabs(tdiff));
- tv_max = fmin(tv_max, tcoef * tavg + fabs(tdiff));
- }
- if (t_min <= tv_max)
- {
- if ((child_masks & 0x0080) == 0)
- break;
- if (tc_max < h)
- {
- writeStack(stack, scale, parent, t_max);
- }
- h = tc_max;
- int ofs = (unsigned int)child_descriptor.x >> 17;
- if ((child_descriptor.x & 0x10000) != 0)
- {
- ofs = parent[ofs * 2];
- }
- ofs += popc8(child_masks & 0x7F);
- parent += ofs * 2;
- idx = 0;
- scale--;
- scale_exp2 = half;
- if (tx_center > t_min) idx ^= 1, pos.x += scale_exp2;
- if (ty_center > t_min) idx ^= 2, pos.y += scale_exp2;
- if (tz_center > t_min) idx ^= 4, pos.z += scale_exp2;
- t_max = tv_max;
- child_descriptor.x = 0;
- continue;
- }
- }
- int step_mask = 0;
- if (tx_corner <= tc_max) step_mask ^= 1, pos.x -= scale_exp2;
- if (ty_corner <= tc_max) step_mask ^= 2, pos.y -= scale_exp2;
- if (tz_corner <= tc_max) step_mask ^= 4, pos.z -= scale_exp2;
- t_min = tc_max;
- idx ^= step_mask;
- if ((idx & step_mask) != 0)
- {
- unsigned int differing_bits = 0;
- if ((step_mask & 1) != 0) differing_bits |= as_int(pos.x) ^ as_int(pos.x + scale_exp2);
- if ((step_mask & 2) != 0) differing_bits |= as_int(pos.y) ^ as_int(pos.y + scale_exp2);
- if ((step_mask & 4) != 0) differing_bits |= as_int(pos.z) ^ as_int(pos.z + scale_exp2);
- scale = (as_int((float)differing_bits) >> 23) - 127;
- scale_exp2 = as_float((scale - 23 + 127) << 23);
- parent = readStack(stack, scale, &t_max);
- int shx = as_int(pos.x) >> scale;
- int shy = as_int(pos.y) >> scale;
- int shz = as_int(pos.z) >> scale;
- pos.x = as_float(shx << scale);
- pos.y = as_float(shy << scale);
- pos.z = as_float(shz << scale);
- idx = (shx & 1) | ((shy & 1) << 1) | ((shz & 1) << 2);
- h = 0.0f;
- child_descriptor.x = 0;
- }
- }
- if (scale >= 23 || iter > 10000)
- {
- t_min = 2.0f;
- }
- if ((octant_mask & 1) == 0) pos.x = 3.0f - scale_exp2 - pos.x;
- if ((octant_mask & 2) == 0) pos.y = 3.0f - scale_exp2 - pos.y;
- if ((octant_mask & 4) == 0) pos.z = 3.0f - scale_exp2 - pos.z;
- (*res).pos.w = t_min;
- (*res).iter = iter;
- (*res).pos.x = fmin(fmax((*ray).orig.x + t_min * (*ray).dir.x, pos.x + epsilon), pos.x + scale_exp2 - epsilon);
- (*res).pos.y = fmin(fmax((*ray).orig.y + t_min * (*ray).dir.y, pos.y + epsilon), pos.y + scale_exp2 - epsilon);
- (*res).pos.z = fmin(fmax((*ray).orig.z + t_min * (*ray).dir.z, pos.z + epsilon), pos.z + scale_exp2 - epsilon);
- (*res).node = parent;
- (*res).childIdx = idx ^ octant_mask ^ 7;
- (*res).stackPtr = scale;
- }
- inline float4 decodeRawNormal (U32 value);
- inline float4 decodeDXTColor (U64 block, int texelIdx);
- inline float4 decodeDXTNormal (U64 blockA, U64 blockB, int texelIdx);
- inline float decodeAO (U64 block, int texelIdx);
- void lookupVoxelColorNormal (float4* colorRes, float4* normalRes, const CastResult* castRes, const CastStack* stack);
- void lookupVoxelAO (float* res, const CastResult* castRes, const CastStack* stack);
- inline float4 decodeRawNormal(U32 value)
- {
- S32 sign = (S32)value >> 31;
- F32 t = (F32)(sign ^ 0x7fffffff);
- F32 u = (F32)((S32)value << 3);
- F32 v = (F32)((S32)value << 18);
- float4 result = { t, u, v, 0.0f };
- if ((value & 0x20000000) != 0)
- result.x = v, result.y = t, result.z = u;
- else if ((value & 0x40000000) != 0)
- result.x = u, result.y = v, result.z = t;
- return result;
- }
- __constant F32 c_dxtColorCoefs[4] =
- {
- 1.0f / (F32)(1 << 24),
- 0.0f,
- 2.0f / (F32)(3 << 24),
- 1.0f / (F32)(3 << 24),
- };
- inline float4 decodeDXTColor(U64 block, int texelIdx)
- {
- U32 head = (U32)block;
- U32 bits = (U32)(block >> 32);
- F32 c0 = c_dxtColorCoefs[(bits >> (texelIdx * 2)) & 3];
- F32 c1 = 1.0f / (F32)(1 << 24) - c0;
- float4 res = {
- c0 * (F32)(head << 16) + c1 * (F32)head,
- c0 * (F32)(head << 21) + c1 * (F32)(head << 5),
- c0 * (F32)(head << 27) + c1 * (F32)(head << 11),
- 0.0f};
- return res;
- }
- __constant F32 c_dxtNormalCoefs[4] = {
- -1.0f,
- -1.0f / 3.0f,
- +1.0f / 3.0f,
- +1.0f,
- };
- inline float4 decodeDXTNormal(U64 blockA, U64 blockB, int texelIdx)
- {
- U32 headBase = (U32)blockA;
- U32 headUV = (U32)blockB;
- U32 bitsU = (U32)(blockA >> 32);
- U32 bitsV = (U32)(blockB >> 32);
- int shift = texelIdx * 2;
- F32 cu = c_dxtNormalCoefs[(bitsU >> shift) & 3];
- F32 cv = c_dxtNormalCoefs[(bitsV >> shift) & 3];
- cu *= as_float(((headUV & 15) + (127 + 3 - 13)) << 23);
- cv *= as_float((((headUV >> 16) & 15) + (127 + 3 - 13)) << 23);
- float4 base = decodeRawNormal(headBase);
- float4 res = {
- base.x + cu * (F32)(S32)(headUV << 16) + cv * (F32)(S32)headUV,
- base.y + cu * (F32)(S32)(headUV << 20) + cv * (F32)(S32)(headUV << 4),
- base.z + cu * (F32)(S32)(headUV << 24) + cv * (F32)(S32)(headUV << 8),
- 0.0f};
- return res;
- }
- __constant F32 c_dxtAOCoefs[8] = {
- 0.f,
- 1.f / ((F32)(1 << 24) * 255.f * 7.f),
- 2.f / ((F32)(1 << 24) * 255.f * 7.f),
- 3.f / ((F32)(1 << 24) * 255.f * 7.f),
- 4.f / ((F32)(1 << 24) * 255.f * 7.f),
- 5.f / ((F32)(1 << 24) * 255.f * 7.f),
- 6.f / ((F32)(1 << 24) * 255.f * 7.f),
- 1.f / ((F32)(1 << 24) * 255.f)
- };
- inline float decodeAO(U64 block, int texelIdx)
- {
- F32 c0 = c_dxtAOCoefs[((U32)(block >> (texelIdx * 3 + 16))) & 7];
- F32 c1 = c_dxtAOCoefs[7] - c0;
- return c0 * (F32)((U32)block << 16) + c1 * (F32)((U32)block << 24);
- }
- void lookupVoxelColorNormal(float4* colorRes, float4* normalRes, const CastResult* castRes, const CastStack* stack)
- {
- __global S32* pageHeader = (__global S32*)((intptr_t)(*castRes).node & -PageBytes);
- __global S32* blockInfo = pageHeader + *pageHeader;
- __global S32* blockStart = blockInfo + blockInfo[BlockInfo_BlockPtr];
- __global S32* attachInfos = blockInfo + BlockInfo_End;
- __global S32* attachInfo = attachInfos + AttachInfo_End * AttachSlot_Attribute;
- __global S32* attachData = blockInfo + attachInfo[AttachInfo_Ptr];
- ptrdiff_t foo = (ptrdiff_t)((*castRes).node - blockStart);
- foo = foo >> 2;
- foo *= 6;
- __global S32* bar = attachData + foo;
- __global U64* dxtBlock = (__global U64*) bar;
- U64 colorBlock = dxtBlock[0];
- U64 normalBlockA = dxtBlock[1];
- U64 normalBlockB = dxtBlock[2];
- int texelIdx = (*castRes).childIdx | (int)((((*castRes).node - pageHeader) & 2) << 2);
- float4 tmp = decodeDXTColor(colorBlock, texelIdx);
- *colorRes = (float4)(tmp.x, tmp.y, tmp.z, 255.0f);
- *normalRes = decodeDXTNormal(normalBlockA, normalBlockB, texelIdx);
- }
- void lookupVoxelAO( float* res, const CastResult* castRes, const CastStack* stack)
- {
- __global S32* pageHeader = (__global S32*)((S32)(*castRes).node & -PageBytes);
- __global S32* blockInfo = pageHeader + *pageHeader;
- __global S32* blockStart = blockInfo + blockInfo[BlockInfo_BlockPtr];
- __global S32* attachInfos = blockInfo + BlockInfo_End;
- __global S32* attachInfo = attachInfos + AttachInfo_End * AttachSlot_AO;
- __global S32* attachData = blockInfo + attachInfo[AttachInfo_Ptr];
- __global U64* dxtBlock = (__global U64*)(attachData + (((*castRes).node - blockStart) >> 2) * 2);
- U64 block = dxtBlock[0];
- int texelIdx = (*castRes).childIdx | ((((*castRes).node - pageHeader) & 2) << 2);
- *res = decodeAO(block, texelIdx);
- }
- typedef struct Aux_
- {
- __global U32* framePtr;
- float vSizeMultiplier;
- union
- {
- S32 fetchWorkTemp;
- Ray ray;
- struct
- {
- U32 color;
- U32 alpha;
- } aa;
- };
- }Aux;
- Ray constructPrimaryRay(__constant StandardInput* input, __global float* g_frameCoarse, int ppos, int ridx, volatile __local Aux* aux)
- {
- float vsize = (*input).maxVoxelSize;
- int xsize = (*input).frameSize.x;
- if ((*input).flags & RenderFlags_CoarsePass)
- {
- vsize = (float)(*input).coarseSize * 2.83f;
- xsize = (*input).coarseFrameSize.x;
- } else
- {
- U32 a = ppos;
- U32 b = ridx;
- U32 c = 0x9e3779b9u;
- jenkinsMix(&a, &b, &c);
- float f = (float)c / ((float)(1u << 31) * 2.f);
- f = .5f + .5f*f;
- (*aux).vSizeMultiplier = f;
- vsize *= f;
- }
- int pixely = ppos / xsize;
- int pixelx = ppos - (pixely * xsize);
- F32 fx = pixelx;
- F32 fy = pixely;
- if ((*input).flags & RenderFlags_CoarsePass)
- {
- fx *= (float)(*input).coarseSize;
- fy *= (float)(*input).coarseSize;
- }
- else
- {
- if ((*input).aaRays == 1)
- {
- fx += .5f;
- fy += .5f;
- }
- }
- F32 tmin = 0.f;
- if ((*input).flags & RenderFlags_UseCoarseData)
- {
- int bx = pixelx / (*input).coarseSize;
- int by = pixely / (*input).coarseSize;
- int bidx = bx + by * (*input).coarseFrameSize.x;
- F32 tmin0 = g_frameCoarse[bidx];
- F32 tmin1 = g_frameCoarse[bidx+1];
- F32 tmin2 = g_frameCoarse[bidx+(*input).coarseFrameSize.x];
- F32 tmin3 = g_frameCoarse[bidx+(*input).coarseFrameSize.x+1];
- tmin = fmin(fmin(tmin0, tmin1), fmin(tmin2, tmin3));
- tmin = fmin(tmin, 0.9999f);
- }
- __constant Mat4f* vtc = &(*input).octreeMatrices.viewportToCamera;
- __constant Mat4f* cto = &(*input).octreeMatrices.cameraToOctree;
- float4 pos = {
- (*vtc).m00 * fx + (*vtc).m01 * fy + (*vtc).m03,
- (*vtc).m10 * fx + (*vtc).m11 * fy + (*vtc).m13,
- (*vtc).m20 * fx + (*vtc).m21 * fy + (*vtc).m23,
- (*vtc).m30 * fx + (*vtc).m31 * fy + (*vtc).m33};
- float4 near = {
- pos.x - (*vtc).m02,
- pos.y - (*vtc).m12,
- pos.z - (*vtc).m22,
- (*input).octreeMatrices.pixelInOctree * vsize};
- float4 diff = {
- (*vtc).m32 * pos.x - (*vtc).m02 * pos.w,
- (*vtc).m32 * pos.y - (*vtc).m12 * pos.w,
- (*vtc).m32 * pos.z - (*vtc).m22 * pos.w,
- near.w * (*vtc).m32};
- float a = 1.0f / (pos.w - (*vtc).m32);
- float b = 2.0f * a / fmax(pos.w + (*vtc).m32, 1.0e-8f);
- float c = tmin * b;
- Ray ray;
- ray.orig = near * a - diff * c;
- ray.dir = diff * (c - b);
- ray.orig.xyz = (mulMat4Vec3(cto, &ray.orig)).xyz;
- ray.dir = (float4)(
- (*cto).m00 * ray.dir.x + (*cto).m01 * ray.dir.y + (*cto).m02 * ray.dir.z,
- (*cto).m10 * ray.dir.x + (*cto).m11 * ray.dir.y + (*cto).m12 * ray.dir.z,
- (*cto).m20 * ray.dir.x + (*cto).m21 * ray.dir.y + (*cto).m22 * ray.dir.z,
- ray.dir.w);
- return ray;
- }
- U32 processPrimaryRay(__constant StandardInput* input, __global int * g_rootNode, unsigned int rootNodeOfs,
- volatile __local Ray* ray, float vSizeMultiplier)
- {
- CastResult castRes;
- CastStack stack;
- castRayLocal(g_rootNode, rootNodeOfs, &castRes, &stack, ray);
- if ((*input).flags & RenderFlags_VisualizeIterations)
- {
- F32 v = 255.0f * (F32)castRes.iter / 64.0f;
- return toABGR((float4)(v, v, v, 0.0f));
- }
- else if ((*input).flags & RenderFlags_VisualizeRaycastLevel)
- {
- F32 v = 0.0f;
- if (castRes.pos.w <= 1.0f)
- v = 255.0f - ((F32)23 - (F32)castRes.stackPtr) * (255.0f / 18.0f);
- return toABGR((float4)(v * 0.5f, v, v * 0.5f, 0.0f));
- }
- float4 L = {0.3643f, 0.3535f, 0.8616f, 0.0f};
- Mat3f m3f = extractMat3f(&(*input).octreeMatrices.octreeToWorld);
- float4 I = normalize(mulMatVec3local(&m3f, &(*ray).dir));
- if (castRes.pos.w > 1.0f)
- {
- float4 c;
- if (I.y >= 0.f)
- {
- float4 horz = { 179.0f, 205.0f, 253.0f, 0.0f };
- float4 zen = { 77.0f, 102.0f, 179.0f, 0.0f };
- c = horz + (zen - horz) * I.y * I.y;
- c *= 2.5f;
- }
- else
- {
- float4 horz = { 192.0f, 154.0f, 102.0f, 0.0f };
- float4 zen = { 128.0f, 102.0f, 77.0f, 0.0f };
- c = horz - (zen - horz) * I.y;
- }
- c *= fmax(L.y, 0.0f);
- float IL = dot(I, L);
- if (IL > 0.0f)
- {
- float4 f = {255.0f, 179.0f, 102.0f, 0.0f};
- c += f * pow(IL, 1000.0f);
- }
- return toABGR((float4)(c.x, c.y, c.z, 0.0f));
- }
- float4 voxelColor;
- float4 voxelNormal;
- lookupVoxelColorNormal(&voxelColor, &voxelNormal, &castRes, &stack);
- return toABGR(voxelColor);
- F32 voxelAmbient = 1.0f;
- lookupVoxelAO(&voxelAmbient, &castRes, &stack);
- float4 N = normalize(mulMatVec3const(&(*input).octreeMatrices.octreeToWorldN, &voxelNormal));
- float4 R = (I - N * (dot(N, I) * 2.0f));
- F32 LN = dot(L, N);
- bool shadow = (LN <= 0.0f);
- float4 shadedColor = voxelColor * (voxelAmbient * (0.25f + LN * ((LN < 0.0f) ? 0.15f : (shadow) ? 0.25f : 1.0f)));
- if (!shadow)
- shadedColor += (float4)(32.f, 32.f, 32.f, 0.0f) * pow(fmax(dot(L, R), 0.0f), 18.0f);
- shadedColor *= (*input).brightness;
- U32 color = toABGR(shadedColor);
- float vSize = (F32)(1 << castRes.stackPtr) / (F32)(1 << 23);
- float pSize = (*ray).orig.w + castRes.pos.w * (*ray).dir.w;
- vSize *= vSizeMultiplier;
- float blurRadius = max(vSize / pSize * (*input).maxVoxelSize, 1.0f);
- shadedColor.w = log2(blurRadius) * 32.0f + 0.5f;
- return toABGR(shadedColor);
- }
- void fetchWorkFirst(int* warp, int* batchCounter, __global int* g_warpCounter, int batchSize, volatile __local S32* sharedTemp)
- {
- *warp = get_local_id(1) + get_group_id(0) * 2;
- *batchCounter = 0;
- }
- void fetchWorkNext(int* warp, int* batchCounter, __global int* g_warpCounter, int batchSize, volatile __local S32* sharedTemp)
- {
- *warp = 0x03FFFFFF;
- }
- __kernel void renderKernel(__constant StandardInput* input, __global int* g_warpCounter, __global int* g_activeWarps,
- __global int* g_coarseIndexToPixel, __global float* g_frameCoarse,
- __global int* g_indexToPixel, __global unsigned int* g_frame,
- __global unsigned int* g_aaSampleBuffer,
- __global int* g_rootNode, unsigned int rootNodeOfs)
- {
- int dummy = (*input).totalWork;
- __local Aux auxbuf[32 * 2];
- volatile __local Aux* aux0 = &auxbuf[32 * get_local_id(1)];
- volatile __local Aux* aux = &auxbuf[get_local_id(0) + 32 * get_local_id(1)];
- int warp, batchCounter;
- fetchWorkFirst(&warp, &batchCounter, g_warpCounter, (*input).batchSize, (volatile __local S32*) &(*aux0).fetchWorkTemp);
- if (warp * 32 >= (*input).totalWork)
- return;
- for (;;)
- {
- int ridx = warp * 32 + get_local_id(0);
- if (ridx >= (*input).totalWork)
- return;
- int pidx = ridx;
- int ppos;
- if ((*input).flags & RenderFlags_CoarsePass)
- {
- ppos = g_coarseIndexToPixel[pidx];
- (*aux).framePtr = (__global U32*)g_frameCoarse + ppos;
- }
- else
- {
- ppos = g_indexToPixel[pidx];
- (*aux).framePtr = (__global U32*)g_frame + ppos;
- }
- Ray ray = constructPrimaryRay(input, g_frameCoarse, ppos, ridx, aux);
- (*aux).ray.orig.x = ray.orig.x;
- (*aux).ray.orig.y = ray.orig.y;
- (*aux).ray.orig.z = ray.orig.z;
- (*aux).ray.dir.x = ray.dir.x;
- (*aux).ray.dir.y = ray.dir.y;
- (*aux).ray.dir.z = ray.dir.z;
- (*aux).ray.orig.w = ray.orig.w;
- (*aux).ray.dir.w = ray.dir.w;
- if ((*input).flags & RenderFlags_CoarsePass)
- {
- CastResult castRes;
- CastStack stack;
- castRayLocal(g_rootNode, rootNodeOfs, &castRes, &stack, (volatile __local Ray*) &((*aux).ray));
- if (castRes.pos.w < 1.0f)
- {
- F32 size = (F32)(1 << castRes.stackPtr) / (F32)(1 << 23);
- castRes.pos.w -= size / length((float4)((*aux).ray.dir.xyz, 0.0f)) * 0.5f;
- }
- *((__global float*)(*aux).framePtr) = max(castRes.pos.w, 0.0f);
- }
- else
- {
- U32 color; // = processPrimaryRay(input, g_rootNode, rootNodeOfs, (volatile __local Ray*) &(*aux).ray, (*aux).vSizeMultiplier);
- if ((*input).aaRays == 1)
- {
- *(*aux).framePtr = color;
- }
- else
- {
- U32 resc = (color & 0xff) | ((color & 0xff00) << 2) | ((color & 0xff0000) << 4);
- (*aux).aa.color = resc;
- (*aux).aa.alpha = color;
- if ((get_local_id(0) & 3) == 0)
- {
- U32 resc0 = aux[0].aa.color;
- U32 resc1 = aux[1].aa.color;
- U32 resc2 = aux[2].aa.color;
- U32 resc3 = aux[3].aa.color;
- resc = (resc0 + resc1 + resc2 + resc3);
- resc = ((resc >> 2) & 0xff) | ((resc >> 4) & 0xff00) | ((resc >> 6) & 0xff0000);
- U32 resa0 = aux[0].aa.alpha;
- U32 resa1 = aux[1].aa.alpha;
- U32 resa2 = aux[2].aa.alpha;
- U32 resa3 = aux[3].aa.alpha;
- U32 resa = min(min(resa0, resa1), min(resa2, resa3));;
- *(*aux).framePtr = (resa & 0xff000000) | resc;
- }
- }
- }
- fetchWorkNext(&warp, &batchCounter, g_warpCounter, (*input).batchSize, (volatile __local S32*) &(*aux0).fetchWorkTemp);
- }
- const int nWidth = get_global_size(0);
- const int xOut = get_global_id(0);
- const int yOut = get_global_id(1);
- const int idxOut = yOut * nWidth + xOut;
- g_frame[idxOut] = 1000000;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement