// Untitled

// Capacity of the gKernel constant array and of main's scalar copy of the blur weights.
#define MAX_KERNEL_WIDTH                    64
// Offset main uses when a thread loads its second texel into shared memory; the value
// matches the X size given in main's numthreads attribute.
#define THREAD_GROUP_WIDTH                  128

// Per-pass blur parameters, bound to constant-buffer slot b5 and packed into register c0.
cbuffer BlurData                            : register(b5)
{
    // Blur axis; main truncates it to int2 and uses it as the per-texel step.
    float2 direction                        : packoffset(c0);
    // Kernel half-width in texels: main accumulates 2 * width + 1 taps per output texel.
    float width                             : packoffset(c0.z);
    // Not referenced in the visible code - presumably padding that fills c0; TODO confirm.
    float buffer                            : packoffset(c0.w);
};

// Blur weights, bound to constant-buffer slot b6. main casts this array to
// MAX_KERNEL_WIDTH scalars instead of reading a single component of each entry.
cbuffer Kernel                              : register(b6)
{
    float4 gKernel[ MAX_KERNEL_WIDTH ];
};

// Source image for the blur (slot t0); main reads it with Load at mip level 0.
Texture2D gInputTexture                     : register(t0);
// NOTE(review): gNormalTexture and gDepthTexture are both declared on slot t1 and neither
// is referenced in the visible code. If both are ever used the bindings will collide -
// confirm the intended slots against the application's resource setup.
Texture2D<float4> gNormalTexture            : register(t1);
Texture2D<float4> gDepthTexture             : register(t1);

// Read-write 2D texture (UAV slot u0) that receives the blurred result; main writes one
// texel of it per thread.
RWTexture2D<float4> gOutput                 : register(u0);

// Group-shared staging area for the texels one thread group reads: one slot per thread
// plus worst-case room (2 * MAX_KERNEL_WIDTH + 1 slots) for the extra texels the kernel
// needs beyond the group's own range.
groupshared float4 sharedMem[ THREAD_GROUP_WIDTH + 2 * MAX_KERNEL_WIDTH + 1 ];

// One pass of a separable blur along `direction`.
//
// Each thread produces one output texel. The thread group first stages the input texels
// it needs in group-shared memory (every thread loads one texel, and the first
// 2 * radius threads load one more, THREAD_GROUP_WIDTH texels further along the axis),
// then every thread accumulates 2 * radius + 1 weighted taps from that staging area.
//
// groupThreadID  - index of the thread inside its group; .x selects the shared-memory slot.
// globalThreadID - dispatch-wide thread index; .x runs along the blur axis, .y across it.
[numthreads(THREAD_GROUP_WIDTH, 1, 1)]
void main(uint3 groupThreadID : SV_GroupThreadID, uint3 globalThreadID : SV_DispatchThreadID)
{
    // Flatten the float4 constant array into scalars: the cast keeps the first
    // MAX_KERNEL_WIDTH components of gKernel in declaration order, so weight n is
    // component (n % 4) of gKernel[n / 4].
    static float tempKernel[ MAX_KERNEL_WIDTH ] = (float[ MAX_KERNEL_WIDTH ])(gKernel);

    // Integer step along the blur axis.
    const int2 axis = (int2)(direction);

    // Use one integer radius everywhere so the staging loads and the tap loop agree,
    // and cap it so that 2 * radius + 1 taps never index past the end of tempKernel.
    const int radius = min((int)(width), (MAX_KERNEL_WIDTH - 1) / 2);

    // Global texel coordinate of this thread: .x advances along the axis, .y across it.
    const int2 gpos = (int)(globalThreadID.x) * axis + (int)(globalThreadID.y) * (1 - axis);
    const int slot = groupThreadID.x;

    // First texel covered by this thread's kernel footprint.
    const int2 basePos = gpos - axis * radius;

    // Every thread stages its first texel.
    sharedMem[ slot ] = gInputTexture.Load(int3(basePos, 0));

    // The first 2 * radius threads also stage the texels past the end of the group.
    if (slot < 2 * radius)
    {
        sharedMem[ slot + THREAD_GROUP_WIDTH ] = gInputTexture.Load(int3(basePos + THREAD_GROUP_WIDTH * axis, 0));
    }

    // Make all group-shared writes visible and wait until every thread in the group has
    // reached this point. A device-memory barrier neither covers group-shared memory nor
    // synchronizes execution, so threads could read slots that were not written yet.
    GroupMemoryBarrierWithGroupSync();

    // Apply the kernel to this thread's window of staged texels.
    float4 sum = float4(0, 0, 0, 0);
    for (int tap = 0; tap <= 2 * radius; ++tap)
    {
        sum += sharedMem[ slot + tap ] * tempKernel[ tap ];
    }

    // Force the output to be opaque.
    sum.w = 1;

    // Store the blurred texel.
    gOutput[ gpos ] = sum;
}