SHARE
TWEET

GenerateBinaryRadixTree.comp

amdreallyfast May 19th, 2017 (edited) 61 in 2 days
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. // REQUIRES Version.comp
  2. // REQUIRES ComputeShaderWorkGroupSizes.comp
  3. // REQUIRES SsboBufferBindings.comp
  4. // REQUIRES CrossShaderUniformLocations.comp
  5. // REQUIRES ParticleBuffer.comp
  6. // REQUIRES BvhNodeBuffer.comp
  7.  
  8. // Y and Z work group sizes default to 1
  9. layout (local_size_x = PARTICLE_OPERATIONS_WORK_GROUP_SIZE_X) in;
  10.  
  11.  
  12. // see CLZ1(...) comment block for credit
  13. // Note: This array is a key part of CLZ1, which in turn is a key part of LengthOfCommonPrefix
  14. // (...), which in turn is a key part of the binary radix tree construction.  Let it be a global
  15. // in this shader, and make it shared to prevent copies from hogging more memory than necessary
  16. // (it's honestly not much, but still).
  17. const int[] clz_lkup =
  18. {
  19.     32, 31, 30, 30, 29, 29, 29, 29,
  20.     28, 28, 28, 28, 28, 28, 28, 28,
  21.     27, 27, 27, 27, 27, 27, 27, 27,
  22.     27, 27, 27, 27, 27, 27, 27, 27,
  23.     26, 26, 26, 26, 26, 26, 26, 26,
  24.     26, 26, 26, 26, 26, 26, 26, 26,
  25.     26, 26, 26, 26, 26, 26, 26, 26,
  26.     26, 26, 26, 26, 26, 26, 26, 26,
  27.     25, 25, 25, 25, 25, 25, 25, 25,
  28.     25, 25, 25, 25, 25, 25, 25, 25,
  29.     25, 25, 25, 25, 25, 25, 25, 25,
  30.     25, 25, 25, 25, 25, 25, 25, 25,
  31.     25, 25, 25, 25, 25, 25, 25, 25,
  32.     25, 25, 25, 25, 25, 25, 25, 25,
  33.     25, 25, 25, 25, 25, 25, 25, 25,
  34.     25, 25, 25, 25, 25, 25, 25, 25,
  35.     24, 24, 24, 24, 24, 24, 24, 24,
  36.     24, 24, 24, 24, 24, 24, 24, 24,
  37.     24, 24, 24, 24, 24, 24, 24, 24,
  38.     24, 24, 24, 24, 24, 24, 24, 24,
  39.     24, 24, 24, 24, 24, 24, 24, 24,
  40.     24, 24, 24, 24, 24, 24, 24, 24,
  41.     24, 24, 24, 24, 24, 24, 24, 24,
  42.     24, 24, 24, 24, 24, 24, 24, 24,
  43.     24, 24, 24, 24, 24, 24, 24, 24,
  44.     24, 24, 24, 24, 24, 24, 24, 24,
  45.     24, 24, 24, 24, 24, 24, 24, 24,
  46.     24, 24, 24, 24, 24, 24, 24, 24,
  47.     24, 24, 24, 24, 24, 24, 24, 24,
  48.     24, 24, 24, 24, 24, 24, 24, 24,
  49.     24, 24, 24, 24, 24, 24, 24, 24,
  50.     24, 24, 24, 24, 24, 24, 24, 24
  51. };
  52.  
  53. /*------------------------------------------------------------------------------------------------
  54. Description:
  55.     Algorithm credit goes to Miro Samek with Embedded Gurus.
  56.     http://embeddedgurus.com/state-space/2014/09/fast-deterministic-and-portable-counting-leading-zeros/
  57.  
  58.     I adapted it for use in an OpenGL compute shader.
  59.  
  60.     Note: This algorithm works by figuring out which byte has the most significant 1 bit.  Then
  61.     the whole thing is bit shifted so that the byte with the most significant 1 is in byte 0 of
  62.     the 32bit integer (all trailing bits are considered irrelevant).  This makes the most
  63.     significant byte on the range 0-255.  The "clz_lkup" array has 256 values.  These values are
  64.     the number of leading zeros for every single possible value on range 0-255.  This is less of
  65.     a calculation and more of a lookup.  And it's a fast lookup.
  66.    
  67.     Also Note: Miro Samek also posted a CLZ2(...) function in the same article.  That function
  68.     is meant for embedded systems that are severely limited in ROM and are willing to sacrifice
  69.     a little run speed for a smaller clz_lkup[] footprint.  I am not running this on an embedded
  70.     system and have no problems with available memory, so I will keep using CLZ1(...).
  71. Parameters:
  72.     x   The value to analyze.
  73. Returns:    
  74.     The number of leading zeros in the 32bit unsigned integer
  75. Creator:    amdreallyfast, 5/2017
  76. ------------------------------------------------------------------------------------------------*/
  77. int CLZ1(uint x) {
  78.     int n = 0;
  79.     if (x >= (1U << 16)) {
  80.         if (x >= (1U << 24)) {
  81.             n = 24;
  82.         }
  83.         else {
  84.             n = 16;
  85.         }
  86.     }
  87.     else {
  88.         if (x >= (1 << 8)) {
  89.             n = 8;
  90.         }
  91.         else {
  92.             n = 0;
  93.         }
  94.     }
  95.     return clz_lkup[x >> n] - n;
  96. }
  97.  
  98. /*------------------------------------------------------------------------------------------------
  99. Description:
  100.     Calculates the number of leading bits that the two arguments have in common.
  101.  
  102.     Ex: 0b00001 and 0b10001
  103.     The common prefix is length 0.  There are three bits in common, but they are not leading
  104.     bits.
  105.  
  106.     This is part of the binary radix tree generation described in this paper:
  107.     http://devblogs.nvidia.com/parallelforall/wp-content/uploads/2012/11/karras2012hpg_paper.pdf
  108.  
  109.     This method performs the job of the pseudocode's sigma.  
  110.  
  111.     Note: The paper said to, if the values were equal, concatenate the bits of the index to the
  112.     bits of the values being analyzed and then then determine the longest common prefix.  This
  113.     implementation only lives in 32bit land, so I can't concatenate a 32bit value to another
  114.     32bit value.  There aren't enough bits.  I'd need a a 64bit uint for that.  But I can
  115.     determine the longest common prefix of the values and of their indices and then add the two
  116.     together if necessary.  The CLZ1(...) function is fast enough to make performance not a
  117.     concern.
  118.    
  119.     Also Note: I discovered by experimentation that always adding the lenght of the common
  120.     prefixes together messed up the tree.  So only add the length of the common prefix if the
  121.     values are equal.
  122.  
  123.     Ex 1:
  124.         // suppose our integers live in 5bit land
  125.         value1 at indexA = 0b00010
  126.         value2 at indexB = 0b01111
  127.         longest common prefix = 1 (the most significant bit)
  128.  
  129.     Ex 2:
  130.         // equal values get special handling
  131.         value1 at indexA = 0b11010
  132.         value2 at indexB = 0b11010
  133.         length of common prefix = 5 (all bits identical)
  134.         let indexA = 13 = 0b01101
  135.         let indexB = 14 = 0b01110
  136.         length of common prefix = 3 (most significant bits)
  137.         reported common prefix length = 5 + 3 = 8;
  138.  
  139. Parameters:
  140.     indexA  An index into the "leaf" section of AllBvhNodes.  Expected to be thread ID.
  141.     indexB  Another index into the "leaf" section of AllBvhNodes.
  142. Returns:    
  143.     See Description.
  144. Creator:    amdreallyfast, 5/2017
  145. ------------------------------------------------------------------------------------------------*/
  146. int LengthOfCommonPrefix(int indexA, int indexB)
  147. {
  148.     // don't need to check 'a' because that is a thread ID and therefore should always be in
  149.     // bounds
  150.     // Note: It seems that a >= comparison between int and uint is ok, no cast required.
  151.     if (indexB < 0 || indexB >= uNumActiveParticles)
  152.     {
  153.         return -1;
  154.     }
  155.  
  156.     uint valueA = AllBvhNodes[indexA]._data;
  157.     uint valueB = AllBvhNodes[indexB]._data;
  158.  
  159.     // the XOR will highlight the bits that are different, thus leaving as 0s all the bits that
  160.     // are identical
  161.     int valueNumCommonBits = CLZ1(valueA ^ valueB);
  162.     int indexNumCommonBits = CLZ1(indexA ^ indexB);
  163.     return (valueA == valueB) ? 32 + indexNumCommonBits : valueNumCommonBits;
  164. }
  165.  
  166. /*------------------------------------------------------------------------------------------------
  167. Description:
  168.     Determines how many leaves this internal node covers, minus 1.  I don't have an intuitive
  169.     explanation, but the math works out.  The number of nodes covered depends on depth into the
  170.     tree.
  171.  
  172.     Part of the binary radix tree generation described in this paper:
  173.     http://devblogs.nvidia.com/parallelforall/wp-content/uploads/2012/11/karras2012hpg_paper.pdf
  174.  
  175.     This is only called once by main(), but I thought that it made the algorithm easier to read
  176.     if this part of it was off in its own function.
  177.  
  178.     Note: It may be possible to find the length ("range", or number of leaf nodes that this
  179.     internal node covers) in a single loop, but the only way that I'm aware of is a linear
  180.     search through all items.  The whole point of a binary search is to reduce an O(n) solution
  181.     to O(log n), so keep the 2-loop solution.
  182.  
  183. Parameters:
  184.     startNodeIndex  An index into the internal node section of the BvhNodeBuffer.
  185.     direction       -1 or +1.  If it is 0, something has gone terribly wrong.
  186. Returns:    
  187.     See Description.
  188. Creator:    amdreallyfast, 5/2017
  189. ------------------------------------------------------------------------------------------------*/
  190. int DetermineRange(int startNodeIndex, int direction)
  191. {
  192.     // determine how many leaves this internal node covers ("range")
  193.     int minimumCommonPrefixLength = LengthOfCommonPrefix(startNodeIndex, startNodeIndex - direction);
  194.    
  195.     int maxPossibleLength = 2;
  196.     int secondNodeIndex = startNodeIndex + (maxPossibleLength * direction);
  197.     while (LengthOfCommonPrefix(startNodeIndex, secondNodeIndex) > minimumCommonPrefixLength)
  198.     {
  199.         maxPossibleLength *= 2;
  200.         secondNodeIndex = startNodeIndex + (maxPossibleLength * direction);
  201.     }
  202.  
  203.     // find actual length using a binary search on the max possible length
  204.     int actualLengthSans1 = 0;
  205.     for (int rangeIncrement = maxPossibleLength >> 1; rangeIncrement >= 1; rangeIncrement >>= 1)
  206.     {
  207.         secondNodeIndex = startNodeIndex + ((actualLengthSans1 + rangeIncrement) * direction);
  208.         if (LengthOfCommonPrefix(startNodeIndex, secondNodeIndex) > minimumCommonPrefixLength)
  209.         {
  210.             actualLengthSans1 += rangeIncrement;
  211.         }
  212.     }
  213.  
  214.     return actualLengthSans1;
  215. }
  216.  
  217. /*------------------------------------------------------------------------------------------------
  218. Description:
  219.     Finds the "split index", which is the leaf node just prior to the one where the common
  220.     prefix in the nodes in the range changes.  The number of bits under consider depends on the
  221.     range, which is a function of node depth.
  222.  
  223.     Example tree (taken from by-hand calculations):
  224.         Leaf0 = 0b00001
  225.         Leaf1 = 0b00010
  226.         Leaf2 = 0b00011
  227.         Leaf3 = 0b01001
  228.         Leaf4 = 0b01001
  229.         Leaf5 = 0b01110
  230.         Leaf6 = 0b01111
  231.  
  232.     N-1 internal nodes
  233.     Internal node 0 covers the range 0-6, split is at Leaf2
  234.     Internal node 1 covers the range 1-2, split is at Leaf1
  235.     Internal node 2 covers the range 0-2, split is at Leaf0
  236.     Internal node 3 covers the range 3-6, split is at Leaf4
  237.     Internal node 4 covers the range 3-4, split is at Leaf3
  238.     Internal node 5 covers the range 5-6, split is at Leaf5
  239.  
  240.     It is part of the binary radix tree generation described in this paper:
  241.     http://devblogs.nvidia.com/parallelforall/wp-content/uploads/2012/11/karras2012hpg_paper.pdf
  242.  
  243.     Like DetermineRange(...), this is only called once by main(), but I thought that it made the
  244.     algorithm easier to read if this part of it was off in its own function.
  245.  
  246. Parameters:
  247.     startNodeIndex  An index into the internal node section of the BvhNodeBuffer.
  248.     length          See DetermineRange(...) Description.
  249.     direction       -1 or +1.  If it is 0, something has gone terribly wrong.
  250. Returns:    
  251.     The number of leaf nodes that are below this internal node in the hierarchy.
  252. Creator:    amdreallyfast, 5/2017
  253. ------------------------------------------------------------------------------------------------*/
  254. int FindSplitPosition(int startNodeIndex, int length, int direction)
  255. {
  256.     int otherEndIndex = startNodeIndex + (length * direction);
  257.     int commonPrefixLengthBetweenBeginAndEnd = LengthOfCommonPrefix(startNodeIndex, otherEndIndex);
  258.     int splitOffset = 0;
  259.  
  260.     // Note: The loop should run through rangeIncrement == 1, dividing in half each time (log2
  261.     // ranging search), but calling ceil(...) with anything on the range [0,1] will return 1.  
  262.     // A do-while(...) loop will work nicely here and avoid an infinite loop.
  263.     float rangeIncrement = length;
  264.     do
  265.     {
  266.         // cut the increment in half on each loop
  267.         rangeIncrement = ceil(rangeIncrement * 0.5f);
  268.         int rangeIncrementInteger = int(rangeIncrement);
  269.         int secondNodeIndex = startNodeIndex + ((splitOffset + rangeIncrementInteger) * direction);
  270.         if (LengthOfCommonPrefix(startNodeIndex, secondNodeIndex) > commonPrefixLengthBetweenBeginAndEnd)
  271.         {
  272.             splitOffset += rangeIncrementInteger;
  273.         }
  274.     } while (rangeIncrement > 1.0f);
  275.  
  276.     int splitIndex = startNodeIndex + (splitOffset * direction) + min(direction, 0);
  277.  
  278.     return splitIndex;
  279. }
  280.  
  281.  
  282. /*------------------------------------------------------------------------------------------------
  283. Description:
  284.     Performs a parallel construction of a binary radix tree.
  285.  
  286.     The algorithm has been adapted from this paper to OpenGL compute shaders:
  287.     http://devblogs.nvidia.com/parallelforall/wp-content/uploads/2012/11/karras2012hpg_paper.pdf
  288.  
  289.     Other influence came from the article that linked to the paper:
  290.     https://devblogs.nvidia.com/parallelforall/thinking-parallel-part-iii-tree-construction-gpu/
  291.  
  292.     This algorithm has been worked through by hand and followed by a CPU implementation before
  293.     creating this compute shader version.
  294. Creator:    amdreallyfast, 5/2017
  295. ------------------------------------------------------------------------------------------------*/
  296. void main()
  297. {
  298.     // Note: For N leaves there are n-1 internal nodes, hence the -1.
  299.     uint threadIndex = gl_GlobalInvocationID.x;
  300.     if (threadIndex >= uParticleBufferSize ||
  301.         threadIndex >= (uNumActiveParticles - 1))
  302.     {
  303.         // all inactive particles are sorted to the back of the particle buffer, so returning
  304.         // now won't leave a gap in the tree
  305.         return;
  306.     }
  307.  
  308.     // Note: This algorithm invloves lots of index calculations that must be allowed to go
  309.     // negative.  For example, the function LengthOfCommonPrefix(...) will return -1 if the
  310.     // second index is out of range.  That is an important part of the range calculation.
  311.     int thisLeafIndex = int(threadIndex);
  312.     int thisInternalNodeIndex = int(uBvhNumberLeaves + threadIndex);
  313.     int rootInternalNodeIndex = int(uBvhNumberLeaves);
  314.  
  315.     // build the tree
  316.     int commonPrefixLengthBefore = LengthOfCommonPrefix(thisLeafIndex, thisLeafIndex - 1);
  317.     int commonPrefixLengthAfter = LengthOfCommonPrefix(thisLeafIndex, thisLeafIndex + 1);
  318.  
  319.     // if I made LengthOfCommonPrefix(...) correctly, this should never return 0
  320.     // Note: If direction is 0, then something has gone terribly wrong.
  321.     int d = sign(commonPrefixLengthAfter - commonPrefixLengthBefore);
  322.  
  323.     int range = DetermineRange(thisLeafIndex, d);
  324.     int splitIndex = FindSplitPosition(thisLeafIndex, range, d);
  325.    
  326.     // already calculated in FindSplitPosition(...), but it's already returning a value and this
  327.     // calculation is cheap and it's needed, so do it again
  328.     int otherEndIndex = thisLeafIndex + (range * d);
  329.  
  330.     int leftChildIndex = -1;
  331.     if (min(thisLeafIndex, otherEndIndex) == splitIndex)
  332.     {
  333.         // left child is a leaf node
  334.         leftChildIndex = splitIndex;
  335.     }
  336.     else
  337.     {
  338.         // left child is an internal node
  339.         leftChildIndex = rootInternalNodeIndex + splitIndex;
  340.     }
  341.     AllBvhNodes[thisInternalNodeIndex]._leftChildIndex = leftChildIndex;
  342.  
  343.     int rightChildIndex = -12;
  344.     if (max(thisLeafIndex, otherEndIndex) == (splitIndex + 1))
  345.     {
  346.         // right child is a leaf node
  347.         rightChildIndex = splitIndex + 1;
  348.     }
  349.     else
  350.     {
  351.         // right child is an internal node
  352.         rightChildIndex = rootInternalNodeIndex + splitIndex + 1;
  353.     }
  354.     AllBvhNodes[thisInternalNodeIndex]._rightChildIndex = rightChildIndex;
  355.  
  356.     AllBvhNodes[thisInternalNodeIndex]._threadEntranceCounter = 0;
  357.    
  358.     // in the next stage (constructing the bounding volumes out of this new tree's hierarchy),
  359.     // both nodes need the option of traversing up to their parent
  360.     // Note: This should set both leaf node and internal parent indices.
  361.     // Also Note: If this algorithm is working correctly, then all internal nodes should have
  362.     // exactly 2 children, so there should be no need for an "index is -1" check.
  363.     AllBvhNodes[leftChildIndex]._parentIndex = thisInternalNodeIndex;
  364.     AllBvhNodes[rightChildIndex]._parentIndex = thisInternalNodeIndex;
  365. }
RAW Paste Data
Challenge yourself this year...
Learn something new in 2017
Top