// The Witcher 3 - average log luminance distribution

// Per-frame constants read by TestCS.
// The value listed after each member is an example (the TestCS comments use 1920x1080 as the reference case).
cbuffer cbPerFrame : register (b0)
{
	// .x: width of the 1/4 downscaled color buffer; TestCS uses it as the per-row loop bound.
	// .y: presumably the height of that buffer (not read by TestCS) - TODO confirm.
	// .zw: not read by TestCS.
 	float4 cb0_v0;  // (480.0, 270.0, 0.26106, 0.7599)
	// Not read by TestCS; .xy presumably hold the fullscreen resolution - TODO confirm.
	float4 cb0_v1;  // (1920.0, 1080.0, 0, 0)
	// .x: luma value that far-plane pixels are blended towards.
	// .y: blend weight applied when a pixel's depth equals .w (weight is 0 otherwise).
	// .z: factor converting downscaled color coordinates to depth texture coordinates (cast to int).
	// .w: depth value that marks a pixel as lying on the far plane (sky).
	float4 cb0_v2;  // (0.0, 0.51582, 4.0, 0.0)
}

// Per-channel weights used to collapse an RGB color into a single luma value via dot().
// These are the Rec. 709 luminance coefficients.
static const float3 LUMA_RGB = float3(0.2126, 0.7152, 0.0722);

// Inputs:
// 1/4 x 1/4 downscaled fullscreen HDR color buffer.
// TestCS reads .rgb of mip 0 with integer texel coordinates (Load).
Texture2D texture0 : register (t0);

// Fullscreen depth buffer.
// TestCS reads .x of mip 0 at the color coordinates multiplied by cb0_v2.z.
Texture2D texture1 : register (t1);

// Output:
// Luminance histogram. Every thread group atomically adds its per-row bin counts
// into elements [0-255]; TestCS never clears this buffer itself.
RWStructuredBuffer<uint> g_buffer : register (u0);

// Number of bins in the log-luminance histogram (size of shared_data and of the
// range written in g_buffer).
static const uint HISTOGRAM_BIN_COUNT = 256;

// Threads per group along X. Must be kept equal to the literal in [numthreads] below.
static const uint GROUP_THREAD_COUNT = 64;

// How many bins each thread clears and later flushes (256 / 64 = 4).
static const uint BINS_PER_THREAD = HISTOGRAM_BIN_COUNT / GROUP_THREAD_COUNT;

// Per-thread-group histogram, filled while the group walks over its row of pixels.
groupshared uint shared_data[HISTOGRAM_BIN_COUNT];

// Builds a 256-bin histogram of log-luminance values.
//
// Each thread group has 64 threads and processes one row of the 1/4 downscaled HDR
// color buffer: the row index is the group ID, so the dispatch is expected to launch
// one group per row (e.g. 270 groups for a 1920x1080 frame).
//
// Steps:
//   1. Zero the group-shared histogram.
//   2. For every pixel of the row: compute luma, optionally blend it for far-plane (sky)
//      pixels, map it to a bin with min( (uint)(128 * log(luma + 1)), 255 ) and
//      atomically increment that bin.
//   3. Atomically add the group's bins to g_buffer.
//
// GTid : SV_GroupThreadID - only .x is used, range [0-63].
// Gid  : SV_GroupID       - only .x is used, selects the processed row.
[numthreads(64, 1, 1)]
void TestCS(uint3 GTid : SV_GroupThreadID, uint3 Gid : SV_GroupID)
{
    // Row handled by this thread group.
    const uint groupID = Gid.x;

    // Index of this thread within its group.
    const uint threadID = GTid.x;

    // Step 1: clear the shared histogram. With 64 threads and 256 bins each thread
    // zeroes 4 bins, strided by the group size.
    // The loop counters of the two unrolled loops in this function have distinct names
    // on purpose: reusing one name triggers fxc warning X3078 (loop control variable
    // conflicts with a previous declaration in the outer scope).
    [unroll] for (uint clearIdx = 0; clearIdx < BINS_PER_THREAD; clearIdx++)
    {
        shared_data[ threadID + clearIdx * GROUP_THREAD_COUNT ] = 0;
    }

    // Block until every thread of the group has finished clearing its bins.
    GroupMemoryBarrierWithGroupSync();

    // Values that do not change while walking the row are read once, up front.
    // cb0_v0.x is the width of the downscaled color buffer (1920 / 4 = 480 for 1920x1080).
    const float viewportSizeX = cb0_v0.x;

    // The color buffer is 1/4 x 1/4 of the fullscreen resolution, so color coordinates
    // are multiplied by this factor to address the matching depth texel.
    const int depthTextureScale = (int) cb0_v2.z;

    const float skyLuma       = cb0_v2.x;  // luma that sky pixels are blended towards
    const float skyBlendValue = cb0_v2.y;  // blend weight used for sky pixels
    const float farPlaneDepth = cb0_v2.w;  // depth value identifying the far plane

    // Scale applied to log(luma + 1) before it is truncated to a bin index.
    const float logLumaScale = 128.0;

    // Step 2: walk the row in chunks of 64 pixels, one pixel per thread and chunk.
    [loop] for ( uint rowOffsetX = 0; rowOffsetX < viewportSizeX; rowOffsetX += GROUP_THREAD_COUNT )
    {
        const uint pixelX = rowOffsetX + threadID;

        // The last chunk may extend past the end of the row.
        if ( pixelX < viewportSizeX )
        {
            // Load the HDR color and reduce it to luma.
            const uint2 colorPos = uint2( pixelX, groupID );
            const float3 color = texture0.Load( int3(colorPos, 0) ).rgb;
            const float luma = dot( color, LUMA_RGB );

            // Load the depth value that corresponds to this color texel.
            const uint2 depthPos = depthTextureScale * colorPos;
            const float depth = texture1.Load( int3(depthPos, 0) ).x;

            // Pixels lying exactly on the far plane (sky) get a non-zero blend weight.
            // The exact float comparison is intentional: it tests for the far-plane value.
            const float blend = (depth == farPlaneDepth) ? skyBlendValue : 0.0;

            // With a weight of 0.0 this is just 'luma'; otherwise luma is pulled towards
            // skyLuma, reducing the influence of the sky on the histogram.
            const float blendedLuma = lerp( luma, skyLuma, blend );

            // Adding 1 keeps the logarithm defined for a luma of 0 (log(1) = 0),
            // then the natural logarithm is scaled to spread values over the bins.
            const float scaledLogLuma = log( blendedLuma + 1.0 ) * logLumaScale;

            // Truncate to an index and clamp it to the last bin.
            const uint bin = min( (uint) scaledLogLuma, HISTOGRAM_BIN_COUNT - 1 );

            // Count this pixel.
            InterlockedAdd( shared_data[bin], 1 );
        }
    }

    // Wait until all pixels of this row have been counted.
    GroupMemoryBarrierWithGroupSync();

    // Step 3: merge the group's histogram into the structured buffer, 4 bins per thread.
    [unroll] for (uint flushIdx = 0; flushIdx < BINS_PER_THREAD; flushIdx++)
    {
        const uint bin = threadID + flushIdx * GROUP_THREAD_COUNT;
        const uint count = shared_data[bin];

        // Adding zero would not change g_buffer, so empty bins skip the global atomic.
        if ( count != 0 )
        {
            InterlockedAdd( g_buffer[bin], count );
        }
    }
}