CDegrain_cuda.cpp

// Include the CUDA runtime library
#include <cuda_runtime.h>
#include "cdegrain.h"

// Define the CUDA kernel function that will be called
__global__ void CDegrain_kernel(unsigned char* src, unsigned char* den, unsigned char* dst, int nt, int thr, int radius, int blksize)
{
    // Perform the same operations as in CDegrain_Y,
    // CDegrain_YUV420, CDegrain_YUV422, CDegrain_YUV444
    // But use CUDA-specific data types and functions

    if (!vi.IsY() && !vi.IsYUV420() && !vi.IsYUV422() && !vi.IsYUV444())
    {
        env->ThrowError("CDegrain: supported colorspaces are Y, YUV420, YUV422, YUV444!");
    }
    else if ((vi.IsYUV420() || vi.IsYUV422() || vi.IsYUV444()) && blksize < 2)
    {
        env->ThrowError("CDegrain: YUV420/YUV422/YUV444 min. blksize is 2!");
    }
    else if (radius < 0 || radius > 7)
    {
        env->ThrowError("CDegrain: radius values must be in the [1, 7] range!");
    }
    else if (bit_depth != 8 && bit_depth != 10 && bit_depth != 12 && bit_depth != 14 && bit_depth != 16)
    {
        env->ThrowError("CDegrain: bit depth must be 8, 10, 12, 14 or 16!");
    }
    else
    {
        // Create a circular buffer of size radius+1
        // circular_buffer<PVideoFrame> den(radius + 1);
        // Fill the buffer with the frames from the input clip
        for (int i = 0; i < radius; i += 1)
        {
            den[i] = dClip->GetFrame(max(n - radius + i, 0), env);
        }
        den[radius] = dClip->GetFrame(n, env);
        for (int i = 0; i < radius; i += 1)
        {
            den[i + radius + 1] = dClip->GetFrame(min(n + i + 1, vi.num_frames - 1), env);
        }

        // Use std::thread to create multiple threads and
        // divide the workload between them
        std::vector<std::thread> threads;
        for (int i = 0; i < num_threads; i++)
        {
            int start = i * workload_per_thread;
            int end = (i + 1) * workload_per_thread;
            threads.emplace_back([&, start, end](){
                // Process the workload for this thread
                for (int i = start; i < end; i++)
                {
         // Perform the operation on the specific frame
    vi.IsY() ? CDegrain_Y(src, den, dst, nt, thr, radius, blksize, bit_depth) : NULL;
    vi.IsYUV420() ? CDegrain_YUV420(src, den, dst, nt, thr, radius, blksize, bit_depth) : NULL;
    vi.IsYUV422() ? CDegrain_YUV422(src, den, dst, nt, thr, radius, blksize, bit_depth) : NULL;
    vi.IsYUV444() ? CDegrain_YUV444(src, den, dst, nt, thr, radius, blksize, bit_depth) : NULL;
    }     }); }
// Wait for all threads to finish
//for (auto& thread : threads)
{
thread.join();
//old: return dst;
    // ...
}

CDegrain::CDegrain(PClip _child, int _radius, int _nt, int _thr, int _blksize, PClip _dClip, IScriptEnvironment* env) : GenericVideoFilter(_child), radius(_radius), nt(_nt), thr(_thr), blksize(_blksize), dClip(_dClip)
{
    if (!vi.IsY() && !vi.IsYUV420() && !vi.IsYUV422() && !vi.IsYUV444())
    {
        env->ThrowError("CDegrain: supported colorspaces are Y, YUV420, YUV422, YUV444!");
    }
    else if ((vi.IsYUV420() || vi.IsYUV422() || vi.IsYUV444()) && blksize < 2)
    {
        env->ThrowError("CDegrain: YUV420/YUV422/YUV444 min. blksize is 2!");
    }
    else if (radius < 0 || radius > 7)
    {
        env->ThrowError("CDegrain: radius values must be in the [1, 7] range!");
    }
    else
    {
        // Allocate memory on the GPU
        unsigned char* src_gpu, * den_gpu, * dst_gpu;
        size_t size = vi.num_frames * vi.height * vi.width;
        cudaMalloc(&src_gpu, size);
        cudaMalloc(&den_gpu, size);
        cudaMalloc(&dst_gpu, size);

        // Copy the data from the host to the GPU
        cudaMemcpy(src_gpu, src, size, cudaMemcpyHostToDevice);
        cudaMemcpy(den_gpu, den, size, cudaMemcpyHostToDevice);

        // Set the kernel launch configuration
        dim3 dimBlock(blksize, blksize);
        dim3 dimGrid(vi.height / blksize, vi.width / blksize);

        // Launch the kernel on the GPU
        CDegrain_kernel<<<dimGrid, dimBlock>>>(src_gpu, den_gpu, dst_gpu, nt, thr, radius, blksize);

        // Synchronize the execution of the GPU threads
        cudaDeviceSynchronize();

        // Copy the data from the GPU to the host
        cudaMemcpy(dst, dst_gpu, size,
        cudaMemcpyDeviceToHost);

        // Free the memory on the GPU
        cudaFree(src_gpu);

// Include the CUDA runtime library
#include <cuda_runtime.h>

// Define the CUDA kernel function that will be called
__global__ void CDegrain_YUV444_kernel(unsigned char* srcp[3][15], unsigned char* denp[3][15], unsigned char* dstp[3], int src_pitch[3][15], int den_pitch[3][15], int dst_pitch[3], int64_t BSY, int64_t BSX, int nt, int thr, int radius, int blksize)
{
    // Perform the same operations as in the original function
    // But use CUDA-specific data types and functions
    // Get the bit depth of the input video
    int bit_depth = dst->GetBitDepth();

    // Create arrays to hold the pointers to the data for
    // each plane
    const unsigned char* srcp[3][15];
    const unsigned char* denp[3][15];

    // Get the pointers to the data for each plane of each frame
    for (int i = 0; i < 15; i++)
    {
        for (int j = 0; j < 3; j++)
        {
            srcp[j][min(i, radius * 2)] = src[min(i, radius * 2)]->GetReadPtr(PLANAR_Y + j);
            denp[j][min(i, radius * 2)] = den[min(i, radius * 2)]->GetReadPtr(PLANAR_Y + j);
        }
    }

    // Get the pitch for each plane of each frame
    int src_pitch[3][15];
    int den_pitch[3][15];
    for (int i = 0; i < 15; i++)
    {
        for (int j = 0; j < 3; j++)
        {
            src_pitch[j][min(i, radius * 2)] = src[min(i, radius * 2)]->GetPitch(PLANAR_Y + j);
            den_pitch[j][min(i, radius * 2)] = den[min(i, radius * 2)]->GetPitch(PLANAR_Y + j);
        }
    }

    // Get the pointers to the data for the destination
    //frame
    unsigned char* dstp[3];
    dstp[0] = dst->GetWritePtr(PLANAR_Y);
    dstp[1] = dst->GetWritePtr(PLANAR_U);
    dstp[2] = dst->GetWritePtr(PLANAR_V);

    // Get the pitch for the destination frame
    int dst_pitch[3];
    dst_pitch[0] = dst->GetPitch(PLANAR_Y);
    dst_pitch[1] = dst->GetPitch(PLANAR_U);
    dst_pitch[2] = dst->GetPitch(PLANAR_V);

    int height = dst->GetHeight(PLANAR_Y);
    int row_size = dst->GetRowSize(PLANAR_Y);

    // Pre-allocate the SUM array
    int64_t BSY = 0;
    int64_t BSX = 0;
    BSY += min(1, height % blksize);
    BSY += height / blksize;
    BSX += min(1, row_size % blksize);
    BSX += row_size / blksize;
    auto SUM = new int[BSY * BSX][15];
    memset(SUM, 0, BSY * BSX * 15 * sizeof(int));

    int BX, BY;
    int pixel_sum[4];
    int x, y, a, b, temp;

    //DIFFERENCE
    for (y = 0; y < height; y += 1)
    {
        BY = y / blksize;
        BY += y % blksize ? 1 : 0;
        for (x = 0; x < row_size; x += 1)
        {
            BX = x / blksize;
            BX += x % blksize ? 1 : 0;
            for (a = 0; a < 3; a += 1)
            {
                for (b = 0; b < 15; b += 1)
                {
                    temp = srcp[a][b][y*src_pitch[a][b] + x] - denp[a][b][y*den_pitch[a][b] + x];
                    SUM[BY*BSX + BX][b] += temp*temp;
                }
            }
        }
    }

    int sum_pitch = BSX;
    int sum_pitch_uv = BSX >> 1;

    //FILTER
    for (y = 0; y < height; y += 1)
    {
        for (x = 0; x < row_size; x += 1)
        {
            for (a = 0; a < 3; a += 1)
            {
                pixel_sum[0] = 0;
                pixel_sum[1] = 0;
                pixel_sum[2] = 0;
                pixel_sum[3] = 0;

                BY = y / blksize;
                BY += y % blksize ? 1 : 0;
                BX = x / blksize;
                BX += x % blksize ? 1 : 0;

                for (b = 0; b < 15; b += 1)
                {
                    temp = srcp[a][b][y*src_pitch[a][b] + x] - denp[a][b][y*den_pitch[a][b] + x];
                    temp = min(abs(temp), nt);
                    temp = temp < thr ? 0 : temp;
                    temp *= temp;
                    temp *= temp;
                    temp *= temp;

                    if (SUM[(BY - 1)*sum_pitch + BX - 1][b] <= temp)
                    {
                        pixel_sum[0] += denp[a][b][y*den_pitch[a][b] + x];
                    }
                    if (SUM[(BY - 1)*sum_pitch + BX][b] <= temp)
                    {
                        pixel_sum[1] += denp[a][b][y*den_pitch[a][b] + x];
                    }
                    if (SUM[(BY - 1)*sum_pitch + BX + 1][b] <= temp)
                    {
                pixel_sum[2] += denp[a][b][y*den_pitch[a][b] + x];
            }
            if (SUM[(BY)*sum_pitch + BX - 1][b] <= temp)
            {
                pixel_sum[3] += denp[a][b][y*den_pitch[a][b] + x];
            }
        }
        pixel_sum[0] = pixel_sum[0] > pixel_sum[3] ? pixel_sum[0] : pixel_sum[3];
        pixel_sum[1] = pixel_sum[1] > pixel_sum[2] ? pixel_sum[1] : pixel_sum[2];
        pixel_sum[0] = pixel_sum[0] > pixel_sum[1] ? pixel_sum[0] : pixel_sum[1];
        dstp[a][y*dst_pitch[a] + x] = (unsigned char)min(max((int)srcp[a][radius][y*src_pitch[a][radius] + x] - (int)pixel_sum[0] / (radius*2 + 1), 0), (1 << bit_depth) - 1);
    }
    // ...
}

void CDegrain_Y(PVideoFrame(&src)[15], PVideoFrame(&den)[15], PVideoFrame& dst, int nt, int thr, int radius, int blksize)
{
    // Get the bit depth of the input video
    int bit_depth = dst->GetBitDepth();

    // Allocate memory on the GPU for the pointers to the data for each plane
    unsigned char* srcp_gpu[3][15], * denp_gpu[3][15], * dstp_gpu[3];
    cudaMalloc(&srcp_gpu, 15 * 3 * sizeof(unsigned char*));
    cudaMalloc(&denp_gpu, 15 * 3 * sizeof(unsigned char*));
    cudaMalloc(&dstp_gpu, 3 * sizeof(unsigned char*));

    // Copy the data from the host to the GPU
    cudaMemcpy(srcp_gpu, srcp, 15 * 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
    cudaMemcpy(denp_gpu, denp, 15 * 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
    cudaMemcpy(dstp_gpu, dstp, 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);

    // Allocate memory on the GPU for the pitch for each plane of each frame
        int src_pitch_gpu[3][15], den_pitch_gpu[3][15], dst_pitch_gpu[3];
    cudaMalloc(&src_pitch_gpu, 15 * 3 * sizeof(int));
    cudaMalloc(&den_pitch_gpu, 15 * 3 * sizeof(int));
    cudaMalloc(&dst_pitch_gpu, 3 * sizeof(int));

    // Copy the data from the host to the GPU
    cudaMemcpy(src_pitch_gpu, src_pitch, 15 * 3 * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(den_pitch_gpu, den_pitch, 15 * 3 * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dst_pitch_gpu, dst_pitch, 3 * sizeof(int), cudaMemcpyHostToDevice);

    // Allocate memory on the GPU for the SUM array
    int64_t BSY = 0;
    int64_t BSX = 0;
    BSY += min(1, height % blksize);
    BSY += height / blksize;
    BSX += min(1, row_size % blksize);
    BSX += row_size / blksize;
    int* SUM_gpu;
    cudaMalloc(&SUM_gpu, BSY * BSX * 15 * sizeof(int));
    cudaMemset(SUM_gpu, 0, BSY * BSX * 15 * sizeof(int));

    // Set the kernel launch configuration
    dim3 dimBlock(blksize, blksize);
    dim3 dimGrid(height / blksize, row_size / blksize);

    // Launch the kernel on the GPU
    CDegrain_Y_kernel<<<dimGrid, dimBlock>>>(srcp_gpu, denp_gpu, dstp_gpu, src_pitch_gpu, den_pitch_gpu, dst_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
    cudaDeviceSynchronize();

    // Copy the data from the GPU to the host
    cudaMemcpy(srcp, srcp_gpu, 15 * 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
    cudaMemcpy(denp, denp_gpu, 15 * 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
    cudaMemcpy(dstp, dstp_gpu, 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
    cudaMemcpy(src_pitch, src_pitch_gpu, 15 * 3 * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(den_pitch, den_pitch_gpu, 15 * 3 * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy
    //DIFFERENCE
    int sum_pitch = BSX;
    int sum_pitch_uv = BSX >> 1;
    int64_t temp;
    CDegrain_Difference_kernel<<<dimGrid, dimBlock>>>(SUM_gpu, srcp_gpu, denp_gpu, src_pitch_gpu, den_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
    cudaDeviceSynchronize();

    //FILTER
    CDegrain_Filter_kernel<<<dimGrid, dimBlock>>>(dstp_gpu, SUM_gpu, srcp_gpu, denp_gpu, src_pitch_gpu, den_pitch_gpu, dst_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
    cudaDeviceSynchronize();

    // Copy the data from the GPU to the host
    cudaMemcpy(SUM, SUM_gpu, BSY * BSX * 15 * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(dstp, dstp_gpu, 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);

    //Free the memory on the GPU
    cudaFree(srcp_gpu);
    cudaFree(denp_gpu);
    cudaFree(dstp_gpu);
    cudaFree(src_pitch_gpu);
    cudaFree(den_pitch_gpu);
    cudaFree(dst_pitch_gpu);
    cudaFree(SUM_gpu);

void CDegrain_YUV420(PVideoFrame(&src)[15], PVideoFrame(&den)[15], PVideoFrame& dst, int nt, int thr, int radius, int blksize)
{
    // Get the bit depth of the input video
    int bit_depth = dst->GetBitDepth();

    // Allocate memory on the GPU for the pointers to the data for each plane
    unsigned char* srcp_gpu[3][15], * denp_gpu[3][15], * dstp_gpu[3];
    cudaMalloc(&srcp_gpu, 15 * 3 * sizeof(unsigned char*));
    cudaMalloc(&denp_gpu, 15 * 3 * sizeof(unsigned char*));
    cudaMalloc(&dstp_gpu, 3 * sizeof(unsigned char*));

    // Copy the data from the host to the GPU
    cudaMemcpy(srcp_gpu, srcp, 15 * 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
    cudaMemcpy(denp_gpu, denp, 15 * 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
    cudaMemcpy(dstp_gpu, dstp, 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);

    // Allocate memory on the GPU for the pitch for each plane of each frame
        int src_pitch_gpu[3][15], den_pitch_gpu[3][15], dst_pitch_gpu[3];
    cudaMalloc(&src_pitch_gpu, 15 * 3 * sizeof(int));
    cudaMalloc(&den_pitch_gpu, 15 * 3 * sizeof(int));
    cudaMalloc(&dst_pitch_gpu, 3 * sizeof(int));

    // Copy the data from the host to the GPU
    cudaMemcpy(src_pitch_gpu, src_pitch, 15 * 3 * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(den_pitch_gpu, den_pitch, 15 * 3 * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dst_pitch_gpu, dst_pitch, 3 * sizeof(int), cudaMemcpyHostToDevice);

    // Allocate memory on the GPU for the SUM array
    int64_t BSY = 0;
    int64_t BSX = 0;
    BSY += min(1, height % blksize);
    BSY += height / blksize;
    BSX += min(1, row_size % blksize);
    BSX += row_size / blksize;
    int* SUM_gpu;
    cudaMalloc(&SUM_gpu, BSY * BSX * 15 * sizeof(int));
    cudaMemset(SUM_gpu, 0, BSY * BSX * 15 * sizeof(int));

    // Set the kernel launch configuration
    dim3 dimBlock(blksize, blksize);
    dim3 dimGrid(height / blksize, row_size / blksize);

    // Launch the kernel on the GPU
    CDegrain_YUV420_kernel<<<dimGrid, dimBlock>>>(srcp_gpu, denp_gpu, dstp_gpu, src_pitch_gpu, den_pitch_gpu, dst_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
    cudaDeviceSynchronize();

    // Copy the data from the GPU to the host
    cudaMemcpy(srcp, srcp_gpu, 15 * 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
    cudaMemcpy(denp, denp_gpu, 15 * 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
    cudaMemcpy(dstp, dstp_gpu, 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
    cudaMemcpy(src_pitch, src_pitch_gpu, 15 * 3 * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(den_pitch, den_pitch_gpu, 15 * 3 * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy
    //DIFFERENCE
    int sum_pitch = BSX;
    int sum_pitch_uv = BSX >> 1;
    int64_t temp;
    CDegrain_Difference_kernel<<<dimGrid, dimBlock>>>(SUM_gpu, srcp_gpu, denp_gpu, src_pitch_gpu, den_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
    cudaDeviceSynchronize();

    //FILTER
    CDegrain_Filter_kernel<<<dimGrid, dimBlock>>>(dstp_gpu, SUM_gpu, srcp_gpu, denp_gpu, src_pitch_gpu, den_pitch_gpu, dst_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
    cudaDeviceSynchronize();

    // Copy the data from the GPU to the host
    cudaMemcpy(SUM, SUM_gpu, BSY * BSX * 15 * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(dstp, dstp_gpu, 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);

    //Free the memory on the GPU
    cudaFree(srcp_gpu);
    cudaFree(denp_gpu);
    cudaFree(dstp_gpu);
    cudaFree(src_pitch_gpu);
    cudaFree(den_pitch_gpu);
    cudaFree(dst_pitch_gpu);
    cudaFree(SUM_gpu);

void CDegrain_YUV422(PVideoFrame(&src)[15], PVideoFrame(&den)[15], PVideoFrame& dst, int nt, int thr, int radius, int blksize)
{
    // Get the bit depth of the input video
    int bit_depth = dst->GetBitDepth();

    // Allocate memory on the GPU for the pointers to the data for each plane
    unsigned char* srcp_gpu[3][15], * denp_gpu[3][15], * dstp_gpu[3];
    cudaMalloc(&srcp_gpu, 15 * 3 * sizeof(unsigned char*));
    cudaMalloc(&denp_gpu, 15 * 3 * sizeof(unsigned char*));
    cudaMalloc(&dstp_gpu, 3 * sizeof(unsigned char*));

    // Copy the data from the host to the GPU
    cudaMemcpy(srcp_gpu, srcp, 15 * 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
    cudaMemcpy(denp_gpu, denp, 15 * 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
    cudaMemcpy(dstp_gpu, dstp, 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);

    // Allocate memory on the GPU for the pitch for each plane of each frame
        int src_pitch_gpu[3][15], den_pitch_gpu[3][15], dst_pitch_gpu[3];
    cudaMalloc(&src_pitch_gpu, 15 * 3 * sizeof(int));
    cudaMalloc(&den_pitch_gpu, 15 * 3 * sizeof(int));
    cudaMalloc(&dst_pitch_gpu, 3 * sizeof(int));

    // Copy the data from the host to the GPU
    cudaMemcpy(src_pitch_gpu, src_pitch, 15 * 3 * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(den_pitch_gpu, den_pitch, 15 * 3 * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dst_pitch_gpu, dst_pitch, 3 * sizeof(int), cudaMemcpyHostToDevice);

    // Allocate memory on the GPU for the SUM array
    int64_t BSY = 0;
    int64_t BSX = 0;
    BSY += min(1, height % blksize);
    BSY += height / blksize;
    BSX += min(1, row_size % blksize);
    BSX += row_size / blksize;
    int* SUM_gpu;
    cudaMalloc(&SUM_gpu, BSY * BSX * 15 * sizeof(int));
    cudaMemset(SUM_gpu, 0, BSY * BSX * 15 * sizeof(int));

    // Set the kernel launch configuration
    dim3 dimBlock(blksize, blksize);
    dim3 dimGrid(height / blksize, row_size / blksize);

    // Launch the kernel on the GPU
    CDegrain_YUV422_kernel<<<dimGrid, dimBlock>>>(srcp_gpu, denp_gpu, dstp_gpu, src_pitch_gpu, den_pitch_gpu, dst_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
    cudaDeviceSynchronize();

    // Copy the data from the GPU to the host
    cudaMemcpy(srcp, srcp_gpu, 15 * 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
    cudaMemcpy(denp, denp_gpu, 15 * 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
    cudaMemcpy(dstp, dstp_gpu, 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
    cudaMemcpy(src_pitch, src_pitch_gpu, 15 * 3 * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(den_pitch, den_pitch_gpu, 15 * 3 * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy
    //DIFFERENCE
    int sum_pitch = BSX;
    int sum_pitch_uv = BSX >> 1;
    int64_t temp;
    CDegrain_Difference_kernel<<<dimGrid, dimBlock>>>(SUM_gpu, srcp_gpu, denp_gpu, src_pitch_gpu, den_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
    cudaDeviceSynchronize();

    //FILTER
    CDegrain_Filter_kernel<<<dimGrid, dimBlock>>>(dstp_gpu, SUM_gpu, srcp_gpu, denp_gpu, src_pitch_gpu, den_pitch_gpu, dst_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
    cudaDeviceSynchronize();

    // Copy the data from the GPU to the host
    cudaMemcpy(SUM, SUM_gpu, BSY * BSX * 15 * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(dstp, dstp_gpu, 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);

    //Free the memory on the GPU
    cudaFree(srcp_gpu);
    cudaFree(denp_gpu);
    cudaFree(dstp_gpu);
    cudaFree(src_pitch_gpu);
    cudaFree(den_pitch_gpu);
    cudaFree(dst_pitch_gpu);
    cudaFree(SUM_gpu);

void CDegrain_YUV444(PVideoFrame(&src)[15], PVideoFrame(&den)[15], PVideoFrame& dst, int nt, int thr, int radius, int blksize)
{
    // Get the bit depth of the input video
    int bit_depth = dst->GetBitDepth();

    // Allocate memory on the GPU for the pointers to the data for each plane
    unsigned char* srcp_gpu[3][15], * denp_gpu[3][15], * dstp_gpu[3];
    cudaMalloc(&srcp_gpu, 15 * 3 * sizeof(unsigned char*));
    cudaMalloc(&denp_gpu, 15 * 3 * sizeof(unsigned char*));
    cudaMalloc(&dstp_gpu, 3 * sizeof(unsigned char*));

    // Copy the data from the host to the GPU
    cudaMemcpy(srcp_gpu, srcp, 15 * 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
    cudaMemcpy(denp_gpu, denp, 15 * 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
    cudaMemcpy(dstp_gpu, dstp, 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);

    // Allocate memory on the GPU for the pitch for each plane of each frame
        int src_pitch_gpu[3][15], den_pitch_gpu[3][15], dst_pitch_gpu[3];
    cudaMalloc(&src_pitch_gpu, 15 * 3 * sizeof(int));
    cudaMalloc(&den_pitch_gpu, 15 * 3 * sizeof(int));
    cudaMalloc(&dst_pitch_gpu, 3 * sizeof(int));

    // Copy the data from the host to the GPU
    cudaMemcpy(src_pitch_gpu, src_pitch, 15 * 3 * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(den_pitch_gpu, den_pitch, 15 * 3 * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dst_pitch_gpu, dst_pitch, 3 * sizeof(int), cudaMemcpyHostToDevice);

    // Allocate memory on the GPU for the SUM array
    int64_t BSY = 0;
    int64_t BSX = 0;
    BSY += min(1, height % blksize);
    BSY += height / blksize;
    BSX += min(1, row_size % blksize);
    BSX += row_size / blksize;
    int* SUM_gpu;
    cudaMalloc(&SUM_gpu, BSY * BSX * 15 * sizeof(int));
    cudaMemset(SUM_gpu, 0, BSY * BSX * 15 * sizeof(int));

    // Set the kernel launch configuration
    dim3 dimBlock(blksize, blksize);
    dim3 dimGrid(height / blksize, row_size / blksize);

    // Launch the kernel on the GPU
    CDegrain_YUV444_kernel<<<dimGrid, dimBlock>>>(srcp_gpu, denp_gpu, dstp_gpu, src_pitch_gpu, den_pitch_gpu, dst_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
    cudaDeviceSynchronize();

    // Copy the data from the GPU to the host
    cudaMemcpy(srcp, srcp_gpu, 15 * 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
    cudaMemcpy(denp, denp_gpu, 15 * 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
    cudaMemcpy(dstp, dstp_gpu, 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
    cudaMemcpy(src_pitch, src_pitch_gpu, 15 * 3 * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(den_pitch, den_pitch_gpu, 15 * 3 * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy
    //DIFFERENCE
    int sum_pitch = BSX;
    int sum_pitch_uv = BSX >> 1;
    int64_t temp;
    CDegrain_Difference_kernel<<<dimGrid, dimBlock>>>(SUM_gpu, srcp_gpu, denp_gpu, src_pitch_gpu, den_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
    cudaDeviceSynchronize();

    //FILTER
    CDegrain_Filter_kernel<<<dimGrid, dimBlock>>>(dstp_gpu, SUM_gpu, srcp_gpu, denp_gpu, src_pitch_gpu, den_pitch_gpu, dst_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
    cudaDeviceSynchronize();

    // Copy the data from the GPU to the host
    cudaMemcpy(SUM, SUM_gpu, BSY * BSX * 15 * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(dstp, dstp_gpu, 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);

    //Free the memory on the GPU
    cudaFree(srcp_gpu);
    cudaFree(denp_gpu);
    cudaFree(dstp_gpu);
    cudaFree(src_pitch_gpu);
    cudaFree(den_pitch_gpu);
    cudaFree(dst_pitch_gpu);
    cudaFree(SUM_gpu);