Advertisement
Guest User

The Witcher 3 - average log luminance distribution

a guest
Dec 13th, 2018
617
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.38 KB | None | 0 0
  1. // Per-frame constants read by the TestCS compute shader. The tuples in the trailing comments are example values.
  2. cbuffer cbPerFrame : register (b0)
  3. {
  4. float4 cb0_v0; // (480.0, 270.0, 0.26106, 0.7599) - x: width of the 1/4-downscaled color buffer; y: presumably its height (TODO confirm); z, w: not read by TestCS
  5. float4 cb0_v1; // (1920.0, 1080.0, 0, 0) - not read by TestCS; x, y presumably the fullscreen resolution (TODO confirm)
  6. float4 cb0_v2; // (0.0, 0.51582, 4.0, 0.0) - x: value luma is lerped toward; y: lerp weight applied when depth == w; z: color-to-depth coordinate scale (cast to int); w: depth value that triggers weight y
  7. }
  8.  
  9. static const float3 LUMA_RGB = float3(0.2126, 0.7152, 0.0722); // RGB weights dotted with the color to obtain luma (values match the Rec. 709 luma coefficients)
  10.  
  11. // Inputs:
  12. // 1/4 x 1/4 downscaled fullscreen HDR color buffer (read with Load() at mip 0; only .rgb is used)
  13. Texture2D texture0 : register (t0);
  14.  
  15. // Fullscreen depth buffer (read with Load() at mip 0; only .x is used)
  16. Texture2D texture1 : register (t1);
  17.  
  18. RWStructuredBuffer<uint> g_buffer : register (u0); // Output: elements 0..255 receive each thread group's bin counts via InterlockedAdd
  19.  
  20. groupshared uint shared_data[256]; // Per-thread-group histogram: one counter per log-luma bin (0..255)
  // Builds a 256-bin log-luminance histogram for one row of the downscaled HDR color
  // buffer and merges it into g_buffer.
  //
  // Each thread group has 64 threads and its own shared storage (shared_data). One group
  // is run per row of the downscaled buffer; its threads walk the row in strides of 64
  // pixels, count bins in shared_data, and finally add the non-empty bins to g_buffer.
  //
  // GTid - thread index within the group; only .x is used (0..63).
  // Gid  - thread group index; only .x is used, as the row (Y) to process.
  //        For a 1920x1080 frame the downscaled buffer has rows 0..269.
  [numthreads(64, 1, 1)]
  void TestCS(uint3 GTid : SV_GroupThreadID, uint3 Gid : SV_GroupID)
  {
      const uint rowY = Gid.x;
      const uint lane = GTid.x;

      // Zero the group-shared histogram: 64 threads x 4 strided slots = 256 bins.
      // The two short loops in this function deliberately use different control-variable
      // names: fxc scopes a for-loop variable to the enclosing block and emits warning
      // X3078 when a second loop declares the same name.
      [unroll] for (uint clearStep = 0; clearStep < 4; clearStep++)
      {
          shared_data[lane + clearStep * 64] = 0;
      }

      // Every bin must be cleared before any thread starts counting.
      GroupMemoryBarrierWithGroupSync();

      // cb0_v0.x is the width of the downscaled color buffer (1920 / 4 = 480 for 1920x1080).
      const float ViewportSizeX = cb0_v0.x;

      // Loop-invariant factor that maps color-buffer coordinates to depth-buffer
      // coordinates (the color buffer is 1/4 x 1/4 of the fullscreen depth buffer).
      const int iDepthTextureScale = (int) cb0_v2.z;

      [loop] for (uint PositionX = 0; PositionX < ViewportSizeX; PositionX += 64)
      {
          // This thread's pixel in the current 64-pixel stride of the row.
          const uint2 colorPos = uint2(PositionX + lane, rowY);

          // The last stride may extend past the end of the row.
          if (colorPos.x < ViewportSizeX)
          {
              // Luma of the HDR color.
              const float3 color = texture0.Load(int3(colorPos, 0)).rgb;
              const float luma = dot(color, LUMA_RGB);

              // Matching depth sample in the full-resolution depth buffer.
              const uint2 depthPos = iDepthTextureScale * colorPos;
              const float depth = texture1.Load(int3(depthPos, 0)).x;

              // Pixels whose depth equals cb0_v2.w (the far plane, i.e. sky) are blended
              // toward cb0_v2.x with weight cb0_v2.y; every other pixel keeps its luma.
              const float skyWeight = (depth == cb0_v2.w) ? cb0_v2.y : 0.0;
              const float blendedLuma = lerp(luma, cb0_v2.x, skyWeight);

              // log(x + 1) maps zero luma to 0 instead of evaluating log(0);
              // the factor 128 spreads the logarithm over the histogram bins.
              const float scaledLogLuma = log(blendedLuma + 1.0) * 128;

              // Clamp in the float domain before converting, so negative or very large
              // values never reach the float -> uint conversion and the index always
              // stays inside the 256-element histogram.
              const uint bin = (uint) clamp(scaledLogLuma, 0.0, 255.0);

              InterlockedAdd(shared_data[bin], 1);
          }
      }

      // Wait until all pixels in this row have been counted.
      GroupMemoryBarrierWithGroupSync();

      // Merge this row's histogram into the structured buffer. Empty bins are skipped:
      // adding zero would not change g_buffer, so the atomic is unnecessary.
      [unroll] for (uint flushStep = 0; flushStep < 4; flushStep++)
      {
          const uint slot = lane + flushStep * 64;
          const uint count = shared_data[slot];

          if (count != 0)
          {
              InterlockedAdd(g_buffer[slot], count);
          }
      }
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement