Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// Per-frame constants, bound to constant-buffer slot b0.
// The trailing "e.g." values are sample contents for a 1920x1080 frame.
cbuffer cbPerFrame : register (b0)
{
    // x: width of the 1/4-downscaled color buffer (1920 / 4 = 480).
    // y: presumably the matching height (1080 / 4 = 270) -- TODO confirm.
    // z, w: not read by TestCS.
    float4 cb0_v0;    // e.g. (480.0, 270.0, 0.26106, 0.7599)

    // Not read by TestCS. x, y look like the fullscreen resolution -- TODO confirm.
    float4 cb0_v1;    // e.g. (1920.0, 1080.0, 0, 0)

    // x: value that luma is blended towards for far-plane (sky) pixels.
    // y: blend weight applied to far-plane (sky) pixels.
    // z: factor that maps downscaled color coordinates to depth-buffer coordinates.
    // w: depth value that marks a pixel as lying on the far plane.
    float4 cb0_v2;    // e.g. (0.0, 0.51582, 4.0, 0.0)
}
// RGB weights used to turn a color into a single luma value via dot()
// (these are the Rec. 709 luma coefficients).
static const float3 LUMA_RGB = float3(0.2126, 0.7152, 0.0722);

// ---- Inputs ----

// t0: HDR color buffer, downscaled to 1/4 x 1/4 of the fullscreen resolution.
Texture2D texture0 : register (t0);

// t1: fullscreen depth buffer.
Texture2D texture1 : register (t1);

// ---- Output ----

// u0: TestCS atomically adds each thread group's counters into elements [0, 255].
RWStructuredBuffer<uint> g_buffer : register (u0);

// ---- Group-shared scratch ----

// 256 counters shared by the threads of one thread group; TestCS zeroes them,
// accumulates into them, then flushes them into g_buffer.
groupshared uint shared_data[256];
// Builds a 256-bin histogram of log-luma for one row of the downscaled HDR color buffer
// and atomically accumulates it into g_buffer.
//
// Dispatch layout (as described by the original author): one thread group per row of the
// 1/4 x 1/4 downscaled color buffer, so for a 1920x1080 frame Gid.x ranges over [0, 269].
// Each group runs 64 threads that sweep the row in strides of 64 pixels.
//
// Parameters:
//   GTid - index of the thread inside its group; only .x is used, range [0, 63].
//   Gid  - index of the thread group; only .x is used and is treated as the row (Y) to process.
//
// Reads:  cb0_v0.x, cb0_v2, texture0, texture1.
// Writes: shared_data (group-local scratch), g_buffer[0..255] via InterlockedAdd.
[numthreads(64, 1, 1)]
void TestCS(uint3 GTid : SV_GroupThreadID, uint3 Gid : SV_GroupID)
{
    // THREADS_PER_GROUP must match numthreads above; HISTOGRAM_BINS must match the
    // declared size of shared_data.
    const uint THREADS_PER_GROUP = 64;
    const uint HISTOGRAM_BINS    = 256;
    const uint BINS_PER_THREAD   = HISTOGRAM_BINS / THREADS_PER_GROUP;   // 4

    const uint threadID = GTid.x;   // [0, 63]
    const uint rowY     = Gid.x;    // row of the downscaled color buffer handled by this group

    // Step 1: clear the group-shared histogram. Each of the 64 threads zeroes 4 bins,
    // spaced 64 apart, so together they cover all 256.
    // (The two bin loops use distinct counter names: some HLSL compilers leak for-loop
    // variables into the enclosing scope and warn when the same name is declared twice.)
    [unroll] for (uint clearSlot = 0; clearSlot < BINS_PER_THREAD; clearSlot++)
    {
        shared_data[threadID + clearSlot * THREADS_PER_GROUP] = 0;
    }

    // Block every thread of the group until all of them have finished clearing and the
    // group-shared writes are visible.
    GroupMemoryBarrierWithGroupSync();

    // cb0_v0.x is the width of the downscaled color buffer (480 for 1920x1080).
    // Converted to uint once so the loop and bounds test compare like with like.
    const uint rowWidth = (uint) cb0_v0.x;

    // Step 2: walk the row in chunks of 64 pixels; each thread takes one pixel per chunk.
    [loop] for (uint chunkStartX = 0; chunkStartX < rowWidth; chunkStartX += THREADS_PER_GROUP)
    {
        const uint pixelX = chunkStartX + threadID;

        // The last chunk may extend past the end of the row.
        if (pixelX < rowWidth)
        {
            // HDR color -> luma.
            const uint2  colorPos = uint2(pixelX, rowY);
            const float3 color    = texture0.Load( int3(colorPos, 0) ).rgb;
            const float  luma     = dot(color, LUMA_RGB);

            // The color buffer is 1/4 x 1/4 of the fullscreen resolution, so its coordinates
            // are scaled by cb0_v2.z (4 in the example) to address the matching depth texel.
            const uint  depthScale = (uint) cb0_v2.z;
            const uint2 depthPos   = depthScale * colorPos;
            const float depth      = texture1.Load( int3(depthPos, 0) ).x;

            // Pixels lying exactly on the far plane (sky) get blend weight cb0_v2.y;
            // everything else gets 0, which leaves luma untouched in the lerp below.
            const float skyWeight = (depth == cb0_v2.w) ? cb0_v2.y : 0.0;

            // For sky pixels, pull luma towards cb0_v2.x (usually close to 0.0) so that
            // they contribute less to the histogram.
            const float blendedLuma = lerp( luma, cb0_v2.x, skyWeight );

            // Natural log of (luma + 1): log(0) is undefined while log(1) = 0, so the +1
            // maps a luma of 0 to bin 0. The result is then scaled by 128 to spread it
            // over the bins.
            const float scaledLogLuma = log(blendedLuma + 1.0) * 128;

            // Clamp into [0, 255] while still in float, so the conversion to uint never
            // sees a negative or too-large value, then use the result as the bin index.
            const float clampedBin = min( max(scaledLogLuma, 0.0), 255.0 );
            const uint  bin        = (uint) clampedBin;

            // Count this pixel.
            InterlockedAdd( shared_data[bin], 1 );
        }
    }

    // Wait until every pixel of this row has been counted.
    GroupMemoryBarrierWithGroupSync();

    // Step 3: flush the group's histogram into the global buffer, 4 bins per thread.
    // Empty bins are skipped: adding 0 would not change g_buffer, so the atomic is avoided.
    [unroll] for (uint flushSlot = 0; flushSlot < BINS_PER_THREAD; flushSlot++)
    {
        const uint flushBin = threadID + flushSlot * THREADS_PER_GROUP;
        const uint count    = shared_data[flushBin];
        if (count != 0)
        {
            InterlockedAdd( g_buffer[flushBin], count );
        }
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement