Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #define MAX_KERNEL_WIDTH 64 // number of float4 elements in gKernel; sharedMem also reserves 2 * MAX_KERNEL_WIDTH extra texels beyond one per thread
- #define THREAD_GROUP_WIDTH 128 // texels covered by one thread group along the blur axis; main uses it as the offset of each thread's second load, so it must match the numthreads X size
- cbuffer BlurData : register(b5) // per-pass blur parameters, all packed into constant register c0
- {
- float2 direction : packoffset(c0); // main casts this to int2 and uses it to pick the axis the blur runs along (presumably (1,0) or (0,1) - confirm against the host code)
- float width : packoffset(c0.z); // blur radius in texels: main starts sampling `width` texels before its own texel and sums 2 * width + 1 taps (for an integral width)
- float buffer : packoffset(c0.w); // not referenced anywhere in the visible shader; presumably padding that fills out c0 - TODO confirm
- };
- cbuffer Kernel : register(b6) // blur weights consumed by main
- {
- float4 gKernel[ MAX_KERNEL_WIDTH ]; // main reads this as consecutive scalar weights (x, y, z, w of element 0, then element 1, ...), one scalar per tap
- };
- // Source image for the blur pass; main loads texels from mip level 0 of this texture.
- Texture2D gInputTexture : register(t0);
- // Not read by the visible shader code.
- Texture2D<float4> gNormalTexture : register(t1);
- // Not read by the visible shader code. Moved from t1 to t2: it previously shared register t1 with
- // gNormalTexture, which fails to compile (or aliases the two textures) as soon as both are used.
- // NOTE(review): the host must bind the depth SRV to slot 2 once this texture is sampled - confirm.
- Texture2D<float4> gDepthTexture : register(t2);
- // Unordered-access (read-write) texture that receives the blurred result; main writes one texel per thread.
- RWTexture2D<float4> gOutput : register(u0);
- // Group-shared row of input texels: one slot per thread in the group plus up to 2 * MAX_KERNEL_WIDTH
- // extra texels for the taps that reach past the group's own texels (and one spare slot).
- // Sized from THREAD_GROUP_WIDTH so it stays in step with the offset main uses for the second load.
- groupshared float4 sharedMem[ THREAD_GROUP_WIDTH + 2 * MAX_KERNEL_WIDTH + 1 ];
- // One pass of a separable blur. Each thread produces one output texel; a group covers
- // THREAD_GROUP_WIDTH consecutive texels along the blur axis selected by `direction`.
- //   groupThreadID.x  - index of this thread inside its group (0 .. THREAD_GROUP_WIDTH - 1)
- //   globalThreadID.x - position of the texel along the blur axis
- //   globalThreadID.y - position of the texel across the blur axis
- // The group first stages THREAD_GROUP_WIDTH + 2 * radius input texels in sharedMem, then every
- // thread sums 2 * radius + 1 of them weighted by the scalars stored in gKernel.
- // Requires 2 * radius <= THREAD_GROUP_WIDTH so one extra load per thread can fill the halo.
- [numthreads(THREAD_GROUP_WIDTH, 1, 1)]
- void main(uint3 groupThreadID : SV_GroupThreadID, uint3 globalThreadID : SV_DispatchThreadID)
- {
-     // Integer blur axis; the same value is used for addressing, halo loading and the output position.
-     const int2 axis = (int2)direction;
-     // Single integer radius used everywhere (the base position, halo guard and loop bound previously
-     // mixed float and int forms of `width`). Clamped so sharedMem is never indexed out of range.
-     const int radius = clamp((int)width, 0, MAX_KERNEL_WIDTH);
-     const int lane = (int)groupThreadID.x;
-     // Global texel coordinate of this thread: x advances along the blur axis, y across it.
-     const int2 gpos = (int)globalThreadID.x * axis + (int)globalThreadID.y * (1 - axis);
-     // First texel this thread stages: `radius` texels before its own texel along the axis.
-     // Coordinates outside the texture are not clamped here; Load is issued with them as-is.
-     const int2 basePos = gpos - axis * radius;
-     sharedMem[lane] = gInputTexture.Load(int3(basePos, 0));
-     // The first 2 * radius threads also stage the texels that trail the group's own range.
-     if (lane < 2 * radius)
-     {
-         sharedMem[lane + THREAD_GROUP_WIDTH] = gInputTexture.Load(int3(basePos + THREAD_GROUP_WIDTH * axis, 0));
-     }
-     // Every thread reads texels staged by other threads, so the whole group must both finish its
-     // groupshared writes and reach this point before filtering. DeviceMemoryBarrier() did neither:
-     // it does not cover groupshared memory and does not synchronize the threads.
-     GroupMemoryBarrierWithGroupSync();
-     // Weighted sum over the 2 * radius + 1 staged texels starting at this thread's slot.
-     float4 sum = float4(0, 0, 0, 0);
-     for (int tap = 0; tap <= 2 * radius; ++tap)
-     {
-         // Weights are consecutive scalars inside the float4 array: element tap / 4, component tap % 4.
-         // Reading them in place replaces the 64-entry local copy, which was too short for radius > 31.
-         sum += sharedMem[lane + tap] * gKernel[tap >> 2][tap & 3];
-     }
-     // Output alpha is forced to 1 regardless of the filtered value.
-     sum.w = 1;
-     gOutput[gpos] = sum;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement