// CTemporalsoften_cuda.cpp

// CTemporalsoften.cpp cuda attempt
#include "CTemporalSoften.h"
#include <cuda_runtime.h>

/* old code
CTemporalSoften::CTemporalSoften(PClip _child, int _radius, bool _isb, bool _Y, bool _U, bool _V, IScriptEnvironment* env) : GenericVideoFilter(_child), radius(_radius), isb(_isb), Y(_Y), U(_U), V(_V)
{
    if (!vi.IsY() && !vi.IsYUV420() && !vi.IsYUV422() && !vi.IsYUV444())
    {
        env->ThrowError("CTemporalSoften: supported colorspaces are Y, YUV420, YUV422, YUV444!");
    }
    else if (radius < 0 || radius > 7)
    {
        env->ThrowError("CTemporalSoften: radius values must be in the [1, 7] range!");
    }
}

__global__ void ctemporalSoftenKernel(unsigned char* dstp, const unsigned char* srcp, int dst_pitch, int src_pitch, int height, int row_size, int radius)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x >= row_size || y >= height)
        return;

    int64_t temp = 0;
    for (int i = -radius; i <= radius; i++)
    {
        temp += srcp[y * src_pitch + x + i];
    }
    dstp[y * dst_pitch + x] = (unsigned char)(temp / (2 * radius + 1));
}

PVideoFrame __stdcall CTemporalSoften::GetFrame(int n, IScriptEnvironment* env) {

    int i;
    PVideoFrame src[8];
    src[0] = child->GetFrame(n, env);

    for (i = 1; i < radius + 1; i += 1)
    {
        src[i] = isb ? child->GetFrame(max(n - i, 0), env) : child->GetFrame(min(n + i, vi.num_frames - 1), env);
    }
    PVideoFrame dst = env->NewVideoFrame(vi);
    int bits_per_pixel = vi.BitsPerPixel();

    if(bits_per_pixel == 8){
        unsigned char* dstp;
        const unsigned char* srcp;

        int dst_pitch;
        int src_pitch;
        int height;
        int row_size;
        int p;
        int planes[] = { PLANAR_Y, PLANAR_U, PLANAR_V };

    for (p = 0; p < 3; p += 1)
    {
        dstp = dst->GetWritePtr(planes[p]);
        dst_pitch = dst->GetPitch(planes[p]);
        height = dst->GetHeight(planes[p]);
        row_size = dst->GetRowSize(planes[p]);
        srcp[0] = src[0]->GetReadPtr(planes[p]);
        src_pitch[0] = src[0]->GetPitch(planes[p]);

        if (p > 0 && vi.IsY())
        {
            break;
        }
        else if (p == 0 && !Y)
        {
            for (y = 0; y < height; y += 1)
            {
                memcpy(dstp, srcp[0], row_size);
                dstp += dst_pitch;
                srcp[0] += src_pitch[0];
            }
        }
        else if (p == 1 && !U)
        {
            for (y = 0; y < height; y += 1)
            {
                memcpy(dstp, srcp[0], row_size);
                dstp += dst_pitch;
                srcp[0] += src_pitch[0];
            }
        }
        else if (p == 2 && !V)
        {
            for (y = 0; y < height; y += 1)
            {
                memcpy(dstp, srcp[0], row_size);
                dstp += dst_pitch;
                srcp[0] += src_pitch[0];
            }
        }
        else
        {
            for (i = 1; i < radius + 1; i += 1)
            {
                srcp[i] = src[i]->GetReadPtr(planes[p]);
                src_pitch[i] = src[i]->GetPitch(planes[p]);
            }
            for (y = 0; y < height; y += 1)
            {
                for (x = 0; x < row_size; x += 1)
                {
                    temp = 0;
                    for (i = 0; i < radius + 1; i += 1)
                    {
                        temp += srcp[i][x];
                    }
                    dstp[x] = (unsigned char)(temp / (radius + 1));
                }
                dstp += dst_pitch;
                for (i = 0; i < radius + 1; i += 1)
                {
                    srcp[i] += src_pitch[i];
                }
            }
        }
    } */

// new code:
// The CTemporalSoften class itself is declared in "CTemporalSoften.h" (the
// constructor and GetFrame are defined out-of-class against that declaration),
// so it must not be re-declared here.  CUDA also forbids __global__ functions
// from being class members, and the plugin factory / entry point have to live
// at namespace scope (an extern "C" linkage specification is illegal inside a
// class body).  Everything below is therefore a free function.

/*
 * 8-bit horizontal box average of a single plane.
 *
 * Launch layout: 2D grid, one thread per output sample (x, y).
 *
 * dstp / srcp          device pointers to the destination / source plane
 * dst_pitch/src_pitch  distance between rows, in elements of the pointer type
 * height               number of rows
 * row_size             number of samples per row
 * radius               samples taken on each side of x (window = 2*radius+1)
 *
 * NOTE(review): this averages neighbouring samples along x inside ONE plane;
 * it does not combine samples from several frames.
 */
__global__ void ctemporalSoftenKernel(unsigned char* dstp, const unsigned char* srcp, int dst_pitch, int src_pitch, int height, int row_size, int radius)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= row_size || y >= height)
        return;

    const unsigned char* row = srcp + (size_t)y * src_pitch;
    long long sum = 0;
    for (int i = -radius; i <= radius; i++)
    {
        // Clamp to the row so edge samples never read outside the plane
        // (the unclamped x + i went negative / past the row end).
        int xi = x + i;
        xi = xi < 0 ? 0 : (xi >= row_size ? row_size - 1 : xi);
        sum += row[xi];
    }
    dstp[(size_t)y * dst_pitch + x] = (unsigned char)(sum / (2 * radius + 1));
}

/*
 * 10..16-bit overload of the kernel above; same launch layout and parameters.
 * Pitches and row_size are counted in unsigned short elements, not bytes.
 *
 * bits_per_pixel  significant bits per sample.  Each sample is reduced to its
 *                 top 8 bits before summation and the mean is shifted back, so
 *                 the low (bits_per_pixel - 8) bits of the result are zero.
 *
 * NOTE(review): the pre-shift throws away precision that the 64-bit
 * accumulator does not need to give up -- confirm it is intentional.
 */
__global__ void ctemporalSoftenKernel(unsigned short* dstp, const unsigned short* srcp, int dst_pitch, int src_pitch, int height, int row_size, int radius, int bits_per_pixel)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= row_size || y >= height)
        return;

    // A negative shift count is undefined behaviour; treat <= 8 bits as "no shift".
    const int shift = bits_per_pixel > 8 ? bits_per_pixel - 8 : 0;

    const unsigned short* row = srcp + (size_t)y * src_pitch;
    long long sum = 0;
    for (int i = -radius; i <= radius; i++)
    {
        // Same edge clamping as the 8-bit kernel.
        int xi = x + i;
        xi = xi < 0 ? 0 : (xi >= row_size ? row_size - 1 : xi);
        sum += row[xi] >> shift;
    }
    dstp[(size_t)y * dst_pitch + x] = (unsigned short)((sum / (2 * radius + 1)) << shift);
}

/*
 * Filter constructor: stores the parameters and rejects unsupported input.
 *
 * Restored from the commented-out legacy code; without it the factory below
 * has no constructor to call.
 * NOTE(review): drop this definition if another translation unit already
 * provides it.
 *
 * Raises (via env->ThrowError) when the colourspace is not Y / YUV420 /
 * YUV422 / YUV444 or when radius is outside 0..7.
 */
CTemporalSoften::CTemporalSoften(PClip _child, int _radius, bool _isb, bool _Y, bool _U, bool _V, IScriptEnvironment* env) : GenericVideoFilter(_child), radius(_radius), isb(_isb), Y(_Y), U(_U), V(_V)
{
    if (!vi.IsY() && !vi.IsYUV420() && !vi.IsYUV422() && !vi.IsYUV444())
    {
        env->ThrowError("CTemporalSoften: supported colorspaces are Y, YUV420, YUV422, YUV444!");
    }
    else if (radius < 0 || radius > 7)
    {
        // The message now states the range the check actually enforces.
        env->ThrowError("CTemporalSoften: radius values must be in the [0, 7] range!");
    }
}

/*
 * Script-function factory registered with AviSynth.
 *
 * args: [0] clip, [1] radius, [2] isb, [3] Y, [4] U, [5] V.
 * Returns the new filter instance wrapped in an AVSValue.
 */
AVSValue __cdecl Create_CTemporalSoften(AVSValue args, void* user_data, IScriptEnvironment* env)
{
    // The registered signature "ciibbb" declares the third parameter as an
    // int, so reading it with AsBool() accesses the wrong variant.  Accept
    // either type so the call is valid whichever way the signature is spelled.
    const AVSValue& backward = args[2];
    const bool isb = backward.IsBool() ? backward.AsBool() : (backward.AsInt() != 0);

    return new CTemporalSoften(args[0].AsClip(), args[1].AsInt(), isb, args[3].AsBool(), args[4].AsBool(), args[5].AsBool(), env);
}

// Linkage table handed over by the host in AvisynthPluginInit3.
const AVS_Linkage* AVS_linkage = nullptr;

/*
 * Plugin entry point: stores the host linkage table and registers the
 * CTemporalSoften script function.  Returns the plugin description string.
 */
extern "C" __declspec(dllexport) const char* __stdcall AvisynthPluginInit3(IScriptEnvironment* env, const AVS_Linkage* const vectors)
{
    AVS_linkage = vectors;
    env->AddFunction("CTemporalSoften", "ciibbb", Create_CTemporalSoften, 0);
    return "CTemporalsoften";
}

namespace
{
    // Current frame plus at most 7 neighbours; this is the capacity of every
    // per-frame array used by GetFrame.
    const int kMaxTemporalFrames = 8;

    /*
     * Per-sample mean across frame_count planes.
     *
     * Launch layout: 2D grid, one thread per output sample (x, y); threads
     * outside width x height exit immediately.  No shared memory is used.
     *
     * dst          tightly packed output plane (width * height samples)
     * frames       frame_count tightly packed input planes stored back to back
     * width/height plane dimensions in samples
     * frame_count  number of planes to average (>= 1)
     *
     * The sum of 8 samples of at most 65535 fits in an unsigned int.
     */
    template <typename pixel_t>
    __global__ void temporalAverageKernel(pixel_t* __restrict__ dst, const pixel_t* __restrict__ frames, int width, int height, int frame_count)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;
        if (x >= width || y >= height)
            return;

        const size_t plane_samples = (size_t)width * height;
        const size_t idx = (size_t)y * width + x;

        unsigned int sum = 0;
        for (int f = 0; f < frame_count; ++f)
        {
            sum += frames[f * plane_samples + idx];
        }
        // Truncating division, same rounding as the original CPU loop.
        dst[idx] = (pixel_t)(sum / (unsigned int)frame_count);
    }

    /*
     * Uploads one plane from each of frame_count host frames, averages them
     * on the GPU and downloads the result into dstp.
     *
     * dstp / dst_pitch   host destination plane and its pitch in bytes
     * srcp / src_pitch   per-frame host source planes and pitches in bytes
     * row_size           bytes of payload per row
     * height             number of rows
     *
     * Returns the first CUDA error encountered, or cudaSuccess.  The device
     * buffer is released on every path.
     *
     * NOTE(review): the buffer is allocated per call; caching it in the
     * filter instance would avoid a cudaMalloc/cudaFree per plane per frame.
     */
    template <typename pixel_t>
    cudaError_t averagePlaneOnGpu(unsigned char* dstp, int dst_pitch, const unsigned char* const* srcp, const int* src_pitch, int frame_count, int row_size, int height)
    {
        if (row_size <= 0 || height <= 0)
            return cudaSuccess; // empty plane: nothing to do, and a 0-sized grid is an invalid launch

        const size_t plane_bytes = (size_t)row_size * height;

        // One allocation: frame_count packed input planes followed by the output plane.
        unsigned char* d_buffer = nullptr;
        cudaError_t err = cudaMalloc((void**)&d_buffer, plane_bytes * (size_t)(frame_count + 1));
        if (err != cudaSuccess)
            return err;

        unsigned char* d_result = d_buffer + plane_bytes * (size_t)frame_count;

        // cudaMemcpy2D strips the host row padding so the device copy is tightly packed.
        for (int f = 0; err == cudaSuccess && f < frame_count; ++f)
        {
            err = cudaMemcpy2D(d_buffer + plane_bytes * (size_t)f, (size_t)row_size, srcp[f], (size_t)src_pitch[f], (size_t)row_size, (size_t)height, cudaMemcpyHostToDevice);
        }

        if (err == cudaSuccess)
        {
            const int width = row_size / (int)sizeof(pixel_t);
            const dim3 block(32, 8);
            const dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);
            temporalAverageKernel<pixel_t><<<grid, block>>>(reinterpret_cast<pixel_t*>(d_result), reinterpret_cast<const pixel_t*>(d_buffer), width, height, frame_count);
            err = cudaGetLastError(); // launch-configuration errors are only reported here
        }

        if (err == cudaSuccess)
        {
            // Blocking copy: also waits for the kernel and surfaces execution errors.
            err = cudaMemcpy2D(dstp, (size_t)dst_pitch, d_result, (size_t)row_size, (size_t)row_size, (size_t)height, cudaMemcpyDeviceToHost);
        }

        cudaFree(d_buffer);
        return err;
    }
}

/*
 * Produces output frame n.
 *
 * Each enabled plane (Y / U / V flags) becomes the per-sample mean of frame n
 * and its `radius` neighbours -- preceding frames when isb is set, following
 * frames otherwise; neighbour indices are clamped to the clip.  Disabled
 * planes are copied unchanged from frame n.  For Y-only clips only the first
 * plane is processed.
 *
 * Raises (via env->ThrowError) on an out-of-range radius, on sample formats
 * other than 8..16-bit integer, and on any CUDA failure.
 */
PVideoFrame __stdcall CTemporalSoften::GetFrame(int n, IScriptEnvironment* env) {

    const int frame_count = radius + 1;
    if (frame_count < 1 || frame_count > kMaxTemporalFrames)
    {
        // Guards the fixed-size arrays below.
        env->ThrowError("CTemporalSoften: radius values must be in the [0, 7] range!");
    }

    PVideoFrame src[kMaxTemporalFrames];
    src[0] = child->GetFrame(n, env);
    for (int i = 1; i < frame_count; ++i)
    {
        int neighbour;
        if (isb)
        {
            neighbour = n - i;
            if (neighbour < 0)
                neighbour = 0;
        }
        else
        {
            neighbour = n + i;
            if (neighbour > vi.num_frames - 1)
                neighbour = vi.num_frames - 1;
        }
        src[i] = child->GetFrame(neighbour, env);
    }

    PVideoFrame dst = env->NewVideoFrame(vi);

    // Bytes per sample.  BitsPerPixel() describes the whole pixel (e.g. 12
    // for 8-bit 4:2:0), so it cannot be used to detect the sample depth.
    const int component_size = vi.ComponentSize();
    if (component_size != 1 && component_size != 2)
    {
        env->ThrowError("CTemporalSoften: only 8..16 bit integer formats are supported!");
    }

    const int planes[] = { PLANAR_Y, PLANAR_U, PLANAR_V };
    const bool enabled[] = { Y, U, V };

    for (int p = 0; p < 3; ++p)
    {
        if (p > 0 && vi.IsY())
        {
            break; // greyscale: no chroma planes
        }

        const int plane = planes[p];
        unsigned char* dstp = dst->GetWritePtr(plane);
        const int dst_pitch = dst->GetPitch(plane);
        const int height = dst->GetHeight(plane);
        const int row_size = dst->GetRowSize(plane);

        if (!enabled[p] || frame_count == 1)
        {
            // Plane not selected (or nothing to average with): pass frame n through.
            env->BitBlt(dstp, dst_pitch, src[0]->GetReadPtr(plane), src[0]->GetPitch(plane), row_size, height);
            continue;
        }

        const unsigned char* srcp[kMaxTemporalFrames];
        int src_pitch[kMaxTemporalFrames];
        for (int i = 0; i < frame_count; ++i)
        {
            srcp[i] = src[i]->GetReadPtr(plane);
            src_pitch[i] = src[i]->GetPitch(plane);
        }

        const cudaError_t err = (component_size == 1)
            ? averagePlaneOnGpu<unsigned char>(dstp, dst_pitch, srcp, src_pitch, frame_count, row_size, height)
            : averagePlaneOnGpu<unsigned short>(dstp, dst_pitch, srcp, src_pitch, frame_count, row_size, height);
        if (err != cudaSuccess)
        {
            env->ThrowError("CTemporalSoften: CUDA error: %s", cudaGetErrorString(err));
        }
    }

    return dst;
}