SHARE
TWEET

The Witcher 3 - average log luminance distribution

a guest Dec 13th, 2018 105 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  // Per-frame constants consumed by the luminance-histogram pass (TestCS).
  // The trailing comment on each member lists example values; they appear to
  // have been captured from a 1920x1080 frame.
  cbuffer cbPerFrame : register (b0)
  {
      // x: width of the downscaled HDR color buffer; TestCS uses it as the
      //    number of pixels in one row.
      // y/z/w: not read by TestCS.
      float4 cb0_v0;  // (480.0, 270.0, 0.26106, 0.7599)

      // Not read by TestCS. The example values look like the fullscreen
      // resolution -- TODO confirm.
      float4 cb0_v1;  // (1920.0, 1080.0, 0, 0)

      // x: value that luma is blended towards for sky pixels.
      // y: blend weight applied when a pixel's depth equals 'w'.
      // z: scale from color-buffer coordinates to depth-buffer coordinates.
      // w: depth value that marks a sky (far plane) pixel.
      float4 cb0_v2;  // (0.0, 0.51582, 4.0, 0.0)
  }
  8.  
  // Weights used to turn a linear RGB color into a single luma value via a dot
  // product (these are the Rec. 709 luma coefficients).
  static const float3 LUMA_RGB = float3(0.2126, 0.7152, 0.0722);

  // Shader inputs.

  // HDR color buffer, downscaled to 1/4 x 1/4 of the fullscreen resolution.
  Texture2D texture0 : register (t0);

  // Depth buffer at fullscreen resolution.
  Texture2D texture1 : register (t1);

  // Shader output: TestCS atomically adds its per-group bin counts into
  // elements [0, 255] of this buffer.
  // NOTE(review): presumably cleared by the application before the dispatch --
  // nothing in this file resets it.
  RWStructuredBuffer<uint> g_buffer : register (u0);
  19.  
  // Sizes shared by the thread-group declaration, the group-shared histogram and
  // the clear/flush loops. Keeping them in one place guarantees that every bin is
  // owned by exactly one thread.
  #define THREADS_PER_GROUP 64
  #define HISTOGRAM_BINS    256
  #define BINS_PER_THREAD   (HISTOGRAM_BINS / THREADS_PER_GROUP)

  // Histogram of log-luminance for the row handled by the current thread group.
  groupshared uint shared_data[HISTOGRAM_BINS];

  // Builds a 256-bin histogram of log-luminance for one row of the downscaled HDR
  // color buffer and accumulates it into g_buffer.
  //
  // Dispatch layout: one thread group per row of the downscaled color buffer
  // (e.g. 270 groups for a 1920x1080 frame), THREADS_PER_GROUP threads per group.
  //
  // GTid : index of the thread inside its group; only .x is used, range [0, 63].
  // Gid  : index of the thread group; only .x is used and it selects the row.
  [numthreads(THREADS_PER_GROUP, 1, 1)]
  void TestCS(uint3 GTid : SV_GroupThreadID, uint3 Gid : SV_GroupID)
  {
      const uint groupID  = Gid.x;   // row of the downscaled color buffer
      const uint threadID = GTid.x;  // lane inside the group

      // Declared once and reused by the clear and flush loops below. Declaring
      // the counter inside both 'for' statements redeclares it in the enclosing
      // scope under fxc's legacy for-scope rules (warning X3078).
      uint bin;

      // Step 1: clear the group-shared histogram. Thread t clears bins
      // t, t + 64, t + 128 and t + 192.
      [unroll] for (bin = 0; bin < BINS_PER_THREAD; bin++)
      {
          shared_data[threadID + bin * THREADS_PER_GROUP] = 0;
      }

      // Every bin must be zero before any thread starts counting.
      GroupMemoryBarrierWithGroupSync();

      // Step 2: walk the row in strips of THREADS_PER_GROUP pixels; each thread
      // handles one pixel per strip.
      // cb0_v0.x is the width of the downscaled color buffer (1920 / 4 = 480).
      const float ViewportSizeX = cb0_v0.x;

      [loop] for (uint PositionX = 0; PositionX < ViewportSizeX; PositionX += THREADS_PER_GROUP)
      {
          const uint2 colorPos = uint2(PositionX + threadID, groupID);

          // The last strip may extend past the end of the row.
          if (colorPos.x < ViewportSizeX)
          {
              // Luma of the HDR color at this pixel.
              const float3 color = texture0.Load( int3(colorPos, 0) ).rgb;
              const float luma = dot(color, LUMA_RGB);

              // The depth buffer is fullscreen while the color buffer is
              // downscaled, so color coordinates are multiplied by cb0_v2.z
              // (4 in the example values) to fetch the matching depth sample.
              const int iDepthTextureScale = (int) cb0_v2.z;
              const uint2 depthPos = iDepthTextureScale * colorPos;
              const float depth = texture1.Load( int3(depthPos, 0) ).x;

              // Sky pixels are detected by an exact match against cb0_v2.w.
              // For those, luma is blended towards cb0_v2.x with weight cb0_v2.y;
              // for every other pixel the weight is 0 and luma is kept as is.
              const float skyWeight = (depth == cb0_v2.w) ? cb0_v2.y : 0.0;
              float lumaOk = lerp( luma, cb0_v2.x, skyWeight );

              // Natural logarithm of (luma + 1): the +1 maps a luma of 0 to 0
              // instead of evaluating log(0), which is undefined.
              lumaOk = log(lumaOk + 1.0);

              // Scale so that the logarithm spreads across the histogram bins.
              lumaOk *= 128;

              // Convert to a bin index, clamped to the last bin so that very
              // bright pixels cannot index out of bounds.
              const uint uLuma = min( (uint) lumaOk, HISTOGRAM_BINS - 1 );

              // Count this pixel. Several threads may hit the same bin, hence
              // the atomic add.
              InterlockedAdd( shared_data[uLuma], 1 );
          }
      }

      // Step 3: wait until the whole row has been counted.
      GroupMemoryBarrierWithGroupSync();

      // Step 4: add this group's counts to the output histogram. The add is
      // atomic because every thread group accumulates into the same g_buffer.
      [unroll] for (bin = 0; bin < BINS_PER_THREAD; bin++)
      {
          const uint offset = threadID + bin * THREADS_PER_GROUP;

          InterlockedAdd( g_buffer[offset], shared_data[offset] );
      }
  }
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top