Advertisement
Ucurrent

CDegrain_cuda.cpp

Jan 17th, 2023 (edited)
133
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 25.98 KB | Source Code | 0 0
  1. // Include the CUDA runtime library
  2. #include <cuda_runtime.h>
  3. #include "cdegrain.h"
  4.  
  5. // Define the CUDA kernel function that will be called
  6. __global__ void CDegrain_kernel(unsigned char* src, unsigned char* den, unsigned char* dst, int nt, int thr, int radius, int blksize)
  7. {
  8.     // Perform the same operations as in CDegrain_Y,
  9.     // CDegrain_YUV420, CDegrain_YUV422, CDegrain_YUV444
  10.     // But use CUDA-specific data types and functions
  11.  
  12.     if (!vi.IsY() && !vi.IsYUV420() && !vi.IsYUV422() && !vi.IsYUV444())
  13.     {
  14.         env->ThrowError("CDegrain: supported colorspaces are Y, YUV420, YUV422, YUV444!");
  15.     }
  16.     else if ((vi.IsYUV420() || vi.IsYUV422() || vi.IsYUV444()) && blksize < 2)
  17.     {
  18.         env->ThrowError("CDegrain: YUV420/YUV422/YUV444 min. blksize is 2!");
  19.     }
  20.     else if (radius < 0 || radius > 7)
  21.     {
  22.         env->ThrowError("CDegrain: radius values must be in the [1, 7] range!");
  23.     }
  24.     else if (bit_depth != 8 && bit_depth != 10 && bit_depth != 12 && bit_depth != 14 && bit_depth != 16)
  25.     {
  26.         env->ThrowError("CDegrain: bit depth must be 8, 10, 12, 14 or 16!");
  27.     }
  28.     else
  29.     {
  30.         // Create a circular buffer of size radius+1
  31.         // circular_buffer<PVideoFrame> den(radius + 1);
  32.         // Fill the buffer with the frames from the input clip
  33.         for (int i = 0; i < radius; i += 1)
  34.         {
  35.             den[i] = dClip->GetFrame(max(n - radius + i, 0), env);
  36.         }
  37.         den[radius] = dClip->GetFrame(n, env);
  38.         for (int i = 0; i < radius; i += 1)
  39.         {
  40.             den[i + radius + 1] = dClip->GetFrame(min(n + i + 1, vi.num_frames - 1), env);
  41.         }
  42.  
  43.         // Use std::thread to create multiple threads and
  44.         // divide the workload between them
  45.         std::vector<std::thread> threads;
  46.         for (int i = 0; i < num_threads; i++)
  47.         {
  48.             int start = i * workload_per_thread;
  49.             int end = (i + 1) * workload_per_thread;
  50.             threads.emplace_back([&, start, end](){
  51.                 // Process the workload for this thread
  52.                 for (int i = start; i < end; i++)
  53.                 {
  54.          // Perform the operation on the specific frame
  55.     vi.IsY() ? CDegrain_Y(src, den, dst, nt, thr, radius, blksize, bit_depth) : NULL;
  56.     vi.IsYUV420() ? CDegrain_YUV420(src, den, dst, nt, thr, radius, blksize, bit_depth) : NULL;
  57.     vi.IsYUV422() ? CDegrain_YUV422(src, den, dst, nt, thr, radius, blksize, bit_depth) : NULL;
  58.     vi.IsYUV444() ? CDegrain_YUV444(src, den, dst, nt, thr, radius, blksize, bit_depth) : NULL;
  59.     }     }); }
  60. // Wait for all threads to finish
  61. //for (auto& thread : threads)
  62. {    
  63. thread.join();
  64. //old: return dst;
  65.     // ...
  66. }
  67.  
  68. CDegrain::CDegrain(PClip _child, int _radius, int _nt, int _thr, int _blksize, PClip _dClip, IScriptEnvironment* env) : GenericVideoFilter(_child), radius(_radius), nt(_nt), thr(_thr), blksize(_blksize), dClip(_dClip)
  69. {
  70.     if (!vi.IsY() && !vi.IsYUV420() && !vi.IsYUV422() && !vi.IsYUV444())
  71.     {
  72.         env->ThrowError("CDegrain: supported colorspaces are Y, YUV420, YUV422, YUV444!");
  73.     }
  74.     else if ((vi.IsYUV420() || vi.IsYUV422() || vi.IsYUV444()) && blksize < 2)
  75.     {
  76.         env->ThrowError("CDegrain: YUV420/YUV422/YUV444 min. blksize is 2!");
  77.     }
  78.     else if (radius < 0 || radius > 7)
  79.     {
  80.         env->ThrowError("CDegrain: radius values must be in the [1, 7] range!");
  81.     }
  82.     else
  83.     {
  84.         // Allocate memory on the GPU
  85.         unsigned char* src_gpu, * den_gpu, * dst_gpu;
  86.         size_t size = vi.num_frames * vi.height * vi.width;
  87.         cudaMalloc(&src_gpu, size);
  88.         cudaMalloc(&den_gpu, size);
  89.         cudaMalloc(&dst_gpu, size);
  90.  
  91.         // Copy the data from the host to the GPU
  92.         cudaMemcpy(src_gpu, src, size, cudaMemcpyHostToDevice);
  93.         cudaMemcpy(den_gpu, den, size, cudaMemcpyHostToDevice);
  94.  
  95.         // Set the kernel launch configuration
  96.         dim3 dimBlock(blksize, blksize);
  97.         dim3 dimGrid(vi.height / blksize, vi.width / blksize);
  98.  
  99.         // Launch the kernel on the GPU
  100.         CDegrain_kernel<<<dimGrid, dimBlock>>>(src_gpu, den_gpu, dst_gpu, nt, thr, radius, blksize);
  101.  
  102.         // Synchronize the execution of the GPU threads
  103.         cudaDeviceSynchronize();
  104.  
  105.         // Copy the data from the GPU to the host
  106.         cudaMemcpy(dst, dst_gpu, size,
  107.         cudaMemcpyDeviceToHost);
  108.  
  109.         // Free the memory on the GPU
  110.         cudaFree(src_gpu);
  111.  
  112. // Include the CUDA runtime library
  113. #include <cuda_runtime.h>
  114.  
  115. // Define the CUDA kernel function that will be called
  116. __global__ void CDegrain_YUV444_kernel(unsigned char* srcp[3][15], unsigned char* denp[3][15], unsigned char* dstp[3], int src_pitch[3][15], int den_pitch[3][15], int dst_pitch[3], int64_t BSY, int64_t BSX, int nt, int thr, int radius, int blksize)
  117. {
  118.     // Perform the same operations as in the original function
  119.     // But use CUDA-specific data types and functions
  120.     // Get the bit depth of the input video
  121.     int bit_depth = dst->GetBitDepth();
  122.  
  123.     // Create arrays to hold the pointers to the data for
  124.     // each plane
  125.     const unsigned char* srcp[3][15];
  126.     const unsigned char* denp[3][15];
  127.  
  128.     // Get the pointers to the data for each plane of each frame
  129.     for (int i = 0; i < 15; i++)
  130.     {
  131.         for (int j = 0; j < 3; j++)
  132.         {
  133.             srcp[j][min(i, radius * 2)] = src[min(i, radius * 2)]->GetReadPtr(PLANAR_Y + j);
  134.             denp[j][min(i, radius * 2)] = den[min(i, radius * 2)]->GetReadPtr(PLANAR_Y + j);
  135.         }
  136.     }
  137.  
  138.     // Get the pitch for each plane of each frame
  139.     int src_pitch[3][15];
  140.     int den_pitch[3][15];
  141.     for (int i = 0; i < 15; i++)
  142.     {
  143.         for (int j = 0; j < 3; j++)
  144.         {
  145.             src_pitch[j][min(i, radius * 2)] = src[min(i, radius * 2)]->GetPitch(PLANAR_Y + j);
  146.             den_pitch[j][min(i, radius * 2)] = den[min(i, radius * 2)]->GetPitch(PLANAR_Y + j);
  147.         }
  148.     }
  149.  
  150.     // Get the pointers to the data for the destination
  151.     //frame
  152.     unsigned char* dstp[3];
  153.     dstp[0] = dst->GetWritePtr(PLANAR_Y);
  154.     dstp[1] = dst->GetWritePtr(PLANAR_U);
  155.     dstp[2] = dst->GetWritePtr(PLANAR_V);
  156.  
  157.     // Get the pitch for the destination frame
  158.     int dst_pitch[3];
  159.     dst_pitch[0] = dst->GetPitch(PLANAR_Y);
  160.     dst_pitch[1] = dst->GetPitch(PLANAR_U);
  161.     dst_pitch[2] = dst->GetPitch(PLANAR_V);
  162.  
  163.     int height = dst->GetHeight(PLANAR_Y);
  164.     int row_size = dst->GetRowSize(PLANAR_Y);
  165.  
  166.     // Pre-allocate the SUM array
  167.     int64_t BSY = 0;
  168.     int64_t BSX = 0;
  169.     BSY += min(1, height % blksize);
  170.     BSY += height / blksize;
  171.     BSX += min(1, row_size % blksize);
  172.     BSX += row_size / blksize;
  173.     auto SUM = new int[BSY * BSX][15];
  174.     memset(SUM, 0, BSY * BSX * 15 * sizeof(int));
  175.  
  176.     int BX, BY;
  177.     int pixel_sum[4];
  178.     int x, y, a, b, temp;
  179.  
  180.     //DIFFERENCE
  181.     for (y = 0; y < height; y += 1)
  182.     {
  183.         BY = y / blksize;
  184.         BY += y % blksize ? 1 : 0;
  185.         for (x = 0; x < row_size; x += 1)
  186.         {
  187.             BX = x / blksize;
  188.             BX += x % blksize ? 1 : 0;
  189.             for (a = 0; a < 3; a += 1)
  190.             {
  191.                 for (b = 0; b < 15; b += 1)
  192.                 {
  193.                     temp = srcp[a][b][y*src_pitch[a][b] + x] - denp[a][b][y*den_pitch[a][b] + x];
  194.                     SUM[BY*BSX + BX][b] += temp*temp;
  195.                 }
  196.             }
  197.         }
  198.     }
  199.  
  200.     int sum_pitch = BSX;
  201.     int sum_pitch_uv = BSX >> 1;
  202.  
  203.     //FILTER
  204.     for (y = 0; y < height; y += 1)
  205.     {
  206.         for (x = 0; x < row_size; x += 1)
  207.         {
  208.             for (a = 0; a < 3; a += 1)
  209.             {
  210.                 pixel_sum[0] = 0;
  211.                 pixel_sum[1] = 0;
  212.                 pixel_sum[2] = 0;
  213.                 pixel_sum[3] = 0;
  214.  
  215.                 BY = y / blksize;
  216.                 BY += y % blksize ? 1 : 0;
  217.                 BX = x / blksize;
  218.                 BX += x % blksize ? 1 : 0;
  219.  
  220.                 for (b = 0; b < 15; b += 1)
  221.                 {
  222.                     temp = srcp[a][b][y*src_pitch[a][b] + x] - denp[a][b][y*den_pitch[a][b] + x];
  223.                     temp = min(abs(temp), nt);
  224.                     temp = temp < thr ? 0 : temp;
  225.                     temp *= temp;
  226.                     temp *= temp;
  227.                     temp *= temp;
  228.  
  229.                     if (SUM[(BY - 1)*sum_pitch + BX - 1][b] <= temp)
  230.                     {
  231.                         pixel_sum[0] += denp[a][b][y*den_pitch[a][b] + x];
  232.                     }
  233.                     if (SUM[(BY - 1)*sum_pitch + BX][b] <= temp)
  234.                     {
  235.                         pixel_sum[1] += denp[a][b][y*den_pitch[a][b] + x];
  236.                     }
  237.                     if (SUM[(BY - 1)*sum_pitch + BX + 1][b] <= temp)
  238.                     {
  239.                 pixel_sum[2] += denp[a][b][y*den_pitch[a][b] + x];
  240.             }
  241.             if (SUM[(BY)*sum_pitch + BX - 1][b] <= temp)
  242.             {
  243.                 pixel_sum[3] += denp[a][b][y*den_pitch[a][b] + x];
  244.             }
  245.         }
  246.         pixel_sum[0] = pixel_sum[0] > pixel_sum[3] ? pixel_sum[0] : pixel_sum[3];
  247.         pixel_sum[1] = pixel_sum[1] > pixel_sum[2] ? pixel_sum[1] : pixel_sum[2];
  248.         pixel_sum[0] = pixel_sum[0] > pixel_sum[1] ? pixel_sum[0] : pixel_sum[1];
  249.         dstp[a][y*dst_pitch[a] + x] = (unsigned char)min(max((int)srcp[a][radius][y*src_pitch[a][radius] + x] - (int)pixel_sum[0] / (radius*2 + 1), 0), (1 << bit_depth) - 1);
  250.     }
  251.     // ...
  252. }
  253.  
  254. void CDegrain_Y(PVideoFrame(&src)[15], PVideoFrame(&den)[15], PVideoFrame& dst, int nt, int thr, int radius, int blksize)
  255. {
  256.     // Get the bit depth of the input video
  257.     int bit_depth = dst->GetBitDepth();
  258.  
  259.     // Allocate memory on the GPU for the pointers to the data for each plane
  260.     unsigned char* srcp_gpu[3][15], * denp_gpu[3][15], * dstp_gpu[3];
  261.     cudaMalloc(&srcp_gpu, 15 * 3 * sizeof(unsigned char*));
  262.     cudaMalloc(&denp_gpu, 15 * 3 * sizeof(unsigned char*));
  263.     cudaMalloc(&dstp_gpu, 3 * sizeof(unsigned char*));
  264.  
  265.     // Copy the data from the host to the GPU
  266.     cudaMemcpy(srcp_gpu, srcp, 15 * 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
  267.     cudaMemcpy(denp_gpu, denp, 15 * 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
  268.     cudaMemcpy(dstp_gpu, dstp, 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
  269.  
  270.     // Allocate memory on the GPU for the pitch for each plane of each frame
  271.         int src_pitch_gpu[3][15], den_pitch_gpu[3][15], dst_pitch_gpu[3];
  272.     cudaMalloc(&src_pitch_gpu, 15 * 3 * sizeof(int));
  273.     cudaMalloc(&den_pitch_gpu, 15 * 3 * sizeof(int));
  274.     cudaMalloc(&dst_pitch_gpu, 3 * sizeof(int));
  275.  
  276.     // Copy the data from the host to the GPU
  277.     cudaMemcpy(src_pitch_gpu, src_pitch, 15 * 3 * sizeof(int), cudaMemcpyHostToDevice);
  278.     cudaMemcpy(den_pitch_gpu, den_pitch, 15 * 3 * sizeof(int), cudaMemcpyHostToDevice);
  279.     cudaMemcpy(dst_pitch_gpu, dst_pitch, 3 * sizeof(int), cudaMemcpyHostToDevice);
  280.  
  281.     // Allocate memory on the GPU for the SUM array
  282.     int64_t BSY = 0;
  283.     int64_t BSX = 0;
  284.     BSY += min(1, height % blksize);
  285.     BSY += height / blksize;
  286.     BSX += min(1, row_size % blksize);
  287.     BSX += row_size / blksize;
  288.     int* SUM_gpu;
  289.     cudaMalloc(&SUM_gpu, BSY * BSX * 15 * sizeof(int));
  290.     cudaMemset(SUM_gpu, 0, BSY * BSX * 15 * sizeof(int));
  291.  
  292.     // Set the kernel launch configuration
  293.     dim3 dimBlock(blksize, blksize);
  294.     dim3 dimGrid(height / blksize, row_size / blksize);
  295.  
  296.     // Launch the kernel on the GPU
  297.     CDegrain_Y_kernel<<<dimGrid, dimBlock>>>(srcp_gpu, denp_gpu, dstp_gpu, src_pitch_gpu, den_pitch_gpu, dst_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
  298.     cudaDeviceSynchronize();
  299.  
  300.     // Copy the data from the GPU to the host
  301.     cudaMemcpy(srcp, srcp_gpu, 15 * 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
  302.     cudaMemcpy(denp, denp_gpu, 15 * 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
  303.     cudaMemcpy(dstp, dstp_gpu, 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
  304.     cudaMemcpy(src_pitch, src_pitch_gpu, 15 * 3 * sizeof(int), cudaMemcpyDeviceToHost);
  305.     cudaMemcpy(den_pitch, den_pitch_gpu, 15 * 3 * sizeof(int), cudaMemcpyDeviceToHost);
  306.     cudaMemcpy
  307.     //DIFFERENCE
  308.     int sum_pitch = BSX;
  309.     int sum_pitch_uv = BSX >> 1;
  310.     int64_t temp;
  311.     CDegrain_Difference_kernel<<<dimGrid, dimBlock>>>(SUM_gpu, srcp_gpu, denp_gpu, src_pitch_gpu, den_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
  312.     cudaDeviceSynchronize();
  313.  
  314.     //FILTER
  315.     CDegrain_Filter_kernel<<<dimGrid, dimBlock>>>(dstp_gpu, SUM_gpu, srcp_gpu, denp_gpu, src_pitch_gpu, den_pitch_gpu, dst_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
  316.     cudaDeviceSynchronize();
  317.  
  318.     // Copy the data from the GPU to the host
  319.     cudaMemcpy(SUM, SUM_gpu, BSY * BSX * 15 * sizeof(int), cudaMemcpyDeviceToHost);
  320.     cudaMemcpy(dstp, dstp_gpu, 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
  321.  
  322.     //Free the memory on the GPU
  323.     cudaFree(srcp_gpu);
  324.     cudaFree(denp_gpu);
  325.     cudaFree(dstp_gpu);
  326.     cudaFree(src_pitch_gpu);
  327.     cudaFree(den_pitch_gpu);
  328.     cudaFree(dst_pitch_gpu);
  329.     cudaFree(SUM_gpu);
  330.  
  331. void CDegrain_YUV420(PVideoFrame(&src)[15], PVideoFrame(&den)[15], PVideoFrame& dst, int nt, int thr, int radius, int blksize)
  332. {
  333.     // Get the bit depth of the input video
  334.     int bit_depth = dst->GetBitDepth();
  335.  
  336.     // Allocate memory on the GPU for the pointers to the data for each plane
  337.     unsigned char* srcp_gpu[3][15], * denp_gpu[3][15], * dstp_gpu[3];
  338.     cudaMalloc(&srcp_gpu, 15 * 3 * sizeof(unsigned char*));
  339.     cudaMalloc(&denp_gpu, 15 * 3 * sizeof(unsigned char*));
  340.     cudaMalloc(&dstp_gpu, 3 * sizeof(unsigned char*));
  341.  
  342.     // Copy the data from the host to the GPU
  343.     cudaMemcpy(srcp_gpu, srcp, 15 * 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
  344.     cudaMemcpy(denp_gpu, denp, 15 * 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
  345.     cudaMemcpy(dstp_gpu, dstp, 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
  346.  
  347.     // Allocate memory on the GPU for the pitch for each plane of each frame
  348.         int src_pitch_gpu[3][15], den_pitch_gpu[3][15], dst_pitch_gpu[3];
  349.     cudaMalloc(&src_pitch_gpu, 15 * 3 * sizeof(int));
  350.     cudaMalloc(&den_pitch_gpu, 15 * 3 * sizeof(int));
  351.     cudaMalloc(&dst_pitch_gpu, 3 * sizeof(int));
  352.  
  353.     // Copy the data from the host to the GPU
  354.     cudaMemcpy(src_pitch_gpu, src_pitch, 15 * 3 * sizeof(int), cudaMemcpyHostToDevice);
  355.     cudaMemcpy(den_pitch_gpu, den_pitch, 15 * 3 * sizeof(int), cudaMemcpyHostToDevice);
  356.     cudaMemcpy(dst_pitch_gpu, dst_pitch, 3 * sizeof(int), cudaMemcpyHostToDevice);
  357.  
  358.     // Allocate memory on the GPU for the SUM array
  359.     int64_t BSY = 0;
  360.     int64_t BSX = 0;
  361.     BSY += min(1, height % blksize);
  362.     BSY += height / blksize;
  363.     BSX += min(1, row_size % blksize);
  364.     BSX += row_size / blksize;
  365.     int* SUM_gpu;
  366.     cudaMalloc(&SUM_gpu, BSY * BSX * 15 * sizeof(int));
  367.     cudaMemset(SUM_gpu, 0, BSY * BSX * 15 * sizeof(int));
  368.  
  369.     // Set the kernel launch configuration
  370.     dim3 dimBlock(blksize, blksize);
  371.     dim3 dimGrid(height / blksize, row_size / blksize);
  372.  
  373.     // Launch the kernel on the GPU
  374.     CDegrain_YUV420_kernel<<<dimGrid, dimBlock>>>(srcp_gpu, denp_gpu, dstp_gpu, src_pitch_gpu, den_pitch_gpu, dst_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
  375.     cudaDeviceSynchronize();
  376.  
  377.     // Copy the data from the GPU to the host
  378.     cudaMemcpy(srcp, srcp_gpu, 15 * 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
  379.     cudaMemcpy(denp, denp_gpu, 15 * 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
  380.     cudaMemcpy(dstp, dstp_gpu, 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
  381.     cudaMemcpy(src_pitch, src_pitch_gpu, 15 * 3 * sizeof(int), cudaMemcpyDeviceToHost);
  382.     cudaMemcpy(den_pitch, den_pitch_gpu, 15 * 3 * sizeof(int), cudaMemcpyDeviceToHost);
  383.     cudaMemcpy
  384.     //DIFFERENCE
  385.     int sum_pitch = BSX;
  386.     int sum_pitch_uv = BSX >> 1;
  387.     int64_t temp;
  388.     CDegrain_Difference_kernel<<<dimGrid, dimBlock>>>(SUM_gpu, srcp_gpu, denp_gpu, src_pitch_gpu, den_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
  389.     cudaDeviceSynchronize();
  390.  
  391.     //FILTER
  392.     CDegrain_Filter_kernel<<<dimGrid, dimBlock>>>(dstp_gpu, SUM_gpu, srcp_gpu, denp_gpu, src_pitch_gpu, den_pitch_gpu, dst_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
  393.     cudaDeviceSynchronize();
  394.  
  395.     // Copy the data from the GPU to the host
  396.     cudaMemcpy(SUM, SUM_gpu, BSY * BSX * 15 * sizeof(int), cudaMemcpyDeviceToHost);
  397.     cudaMemcpy(dstp, dstp_gpu, 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
  398.  
  399.     //Free the memory on the GPU
  400.     cudaFree(srcp_gpu);
  401.     cudaFree(denp_gpu);
  402.     cudaFree(dstp_gpu);
  403.     cudaFree(src_pitch_gpu);
  404.     cudaFree(den_pitch_gpu);
  405.     cudaFree(dst_pitch_gpu);
  406.     cudaFree(SUM_gpu);
  407.  
  408. void CDegrain_YUV422(PVideoFrame(&src)[15], PVideoFrame(&den)[15], PVideoFrame& dst, int nt, int thr, int radius, int blksize)
  409. {
  410.     // Get the bit depth of the input video
  411.     int bit_depth = dst->GetBitDepth();
  412.  
  413.     // Allocate memory on the GPU for the pointers to the data for each plane
  414.     unsigned char* srcp_gpu[3][15], * denp_gpu[3][15], * dstp_gpu[3];
  415.     cudaMalloc(&srcp_gpu, 15 * 3 * sizeof(unsigned char*));
  416.     cudaMalloc(&denp_gpu, 15 * 3 * sizeof(unsigned char*));
  417.     cudaMalloc(&dstp_gpu, 3 * sizeof(unsigned char*));
  418.  
  419.     // Copy the data from the host to the GPU
  420.     cudaMemcpy(srcp_gpu, srcp, 15 * 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
  421.     cudaMemcpy(denp_gpu, denp, 15 * 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
  422.     cudaMemcpy(dstp_gpu, dstp, 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
  423.  
  424.     // Allocate memory on the GPU for the pitch for each plane of each frame
  425.         int src_pitch_gpu[3][15], den_pitch_gpu[3][15], dst_pitch_gpu[3];
  426.     cudaMalloc(&src_pitch_gpu, 15 * 3 * sizeof(int));
  427.     cudaMalloc(&den_pitch_gpu, 15 * 3 * sizeof(int));
  428.     cudaMalloc(&dst_pitch_gpu, 3 * sizeof(int));
  429.  
  430.     // Copy the data from the host to the GPU
  431.     cudaMemcpy(src_pitch_gpu, src_pitch, 15 * 3 * sizeof(int), cudaMemcpyHostToDevice);
  432.     cudaMemcpy(den_pitch_gpu, den_pitch, 15 * 3 * sizeof(int), cudaMemcpyHostToDevice);
  433.     cudaMemcpy(dst_pitch_gpu, dst_pitch, 3 * sizeof(int), cudaMemcpyHostToDevice);
  434.  
  435.     // Allocate memory on the GPU for the SUM array
  436.     int64_t BSY = 0;
  437.     int64_t BSX = 0;
  438.     BSY += min(1, height % blksize);
  439.     BSY += height / blksize;
  440.     BSX += min(1, row_size % blksize);
  441.     BSX += row_size / blksize;
  442.     int* SUM_gpu;
  443.     cudaMalloc(&SUM_gpu, BSY * BSX * 15 * sizeof(int));
  444.     cudaMemset(SUM_gpu, 0, BSY * BSX * 15 * sizeof(int));
  445.  
  446.     // Set the kernel launch configuration
  447.     dim3 dimBlock(blksize, blksize);
  448.     dim3 dimGrid(height / blksize, row_size / blksize);
  449.  
  450.     // Launch the kernel on the GPU
  451.     CDegrain_YUV422_kernel<<<dimGrid, dimBlock>>>(srcp_gpu, denp_gpu, dstp_gpu, src_pitch_gpu, den_pitch_gpu, dst_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
  452.     cudaDeviceSynchronize();
  453.  
  454.     // Copy the data from the GPU to the host
  455.     cudaMemcpy(srcp, srcp_gpu, 15 * 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
  456.     cudaMemcpy(denp, denp_gpu, 15 * 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
  457.     cudaMemcpy(dstp, dstp_gpu, 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
  458.     cudaMemcpy(src_pitch, src_pitch_gpu, 15 * 3 * sizeof(int), cudaMemcpyDeviceToHost);
  459.     cudaMemcpy(den_pitch, den_pitch_gpu, 15 * 3 * sizeof(int), cudaMemcpyDeviceToHost);
  460.     cudaMemcpy
  461.     //DIFFERENCE
  462.     int sum_pitch = BSX;
  463.     int sum_pitch_uv = BSX >> 1;
  464.     int64_t temp;
  465.     CDegrain_Difference_kernel<<<dimGrid, dimBlock>>>(SUM_gpu, srcp_gpu, denp_gpu, src_pitch_gpu, den_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
  466.     cudaDeviceSynchronize();
  467.  
  468.     //FILTER
  469.     CDegrain_Filter_kernel<<<dimGrid, dimBlock>>>(dstp_gpu, SUM_gpu, srcp_gpu, denp_gpu, src_pitch_gpu, den_pitch_gpu, dst_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
  470.     cudaDeviceSynchronize();
  471.  
  472.     // Copy the data from the GPU to the host
  473.     cudaMemcpy(SUM, SUM_gpu, BSY * BSX * 15 * sizeof(int), cudaMemcpyDeviceToHost);
  474.     cudaMemcpy(dstp, dstp_gpu, 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
  475.  
  476.     //Free the memory on the GPU
  477.     cudaFree(srcp_gpu);
  478.     cudaFree(denp_gpu);
  479.     cudaFree(dstp_gpu);
  480.     cudaFree(src_pitch_gpu);
  481.     cudaFree(den_pitch_gpu);
  482.     cudaFree(dst_pitch_gpu);
  483.     cudaFree(SUM_gpu);
  484.  
  485. void CDegrain_YUV444(PVideoFrame(&src)[15], PVideoFrame(&den)[15], PVideoFrame& dst, int nt, int thr, int radius, int blksize)
  486. {
  487.     // Get the bit depth of the input video
  488.     int bit_depth = dst->GetBitDepth();
  489.  
  490.     // Allocate memory on the GPU for the pointers to the data for each plane
  491.     unsigned char* srcp_gpu[3][15], * denp_gpu[3][15], * dstp_gpu[3];
  492.     cudaMalloc(&srcp_gpu, 15 * 3 * sizeof(unsigned char*));
  493.     cudaMalloc(&denp_gpu, 15 * 3 * sizeof(unsigned char*));
  494.     cudaMalloc(&dstp_gpu, 3 * sizeof(unsigned char*));
  495.  
  496.     // Copy the data from the host to the GPU
  497.     cudaMemcpy(srcp_gpu, srcp, 15 * 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
  498.     cudaMemcpy(denp_gpu, denp, 15 * 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
  499.     cudaMemcpy(dstp_gpu, dstp, 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
  500.  
  501.     // Allocate memory on the GPU for the pitch for each plane of each frame
  502.         int src_pitch_gpu[3][15], den_pitch_gpu[3][15], dst_pitch_gpu[3];
  503.     cudaMalloc(&src_pitch_gpu, 15 * 3 * sizeof(int));
  504.     cudaMalloc(&den_pitch_gpu, 15 * 3 * sizeof(int));
  505.     cudaMalloc(&dst_pitch_gpu, 3 * sizeof(int));
  506.  
  507.     // Copy the data from the host to the GPU
  508.     cudaMemcpy(src_pitch_gpu, src_pitch, 15 * 3 * sizeof(int), cudaMemcpyHostToDevice);
  509.     cudaMemcpy(den_pitch_gpu, den_pitch, 15 * 3 * sizeof(int), cudaMemcpyHostToDevice);
  510.     cudaMemcpy(dst_pitch_gpu, dst_pitch, 3 * sizeof(int), cudaMemcpyHostToDevice);
  511.  
  512.     // Allocate memory on the GPU for the SUM array
  513.     int64_t BSY = 0;
  514.     int64_t BSX = 0;
  515.     BSY += min(1, height % blksize);
  516.     BSY += height / blksize;
  517.     BSX += min(1, row_size % blksize);
  518.     BSX += row_size / blksize;
  519.     int* SUM_gpu;
  520.     cudaMalloc(&SUM_gpu, BSY * BSX * 15 * sizeof(int));
  521.     cudaMemset(SUM_gpu, 0, BSY * BSX * 15 * sizeof(int));
  522.  
  523.     // Set the kernel launch configuration
  524.     dim3 dimBlock(blksize, blksize);
  525.     dim3 dimGrid(height / blksize, row_size / blksize);
  526.  
  527.     // Launch the kernel on the GPU
  528.     CDegrain_YUV444_kernel<<<dimGrid, dimBlock>>>(srcp_gpu, denp_gpu, dstp_gpu, src_pitch_gpu, den_pitch_gpu, dst_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
  529.     cudaDeviceSynchronize();
  530.  
  531.     // Copy the data from the GPU to the host
  532.     cudaMemcpy(srcp, srcp_gpu, 15 * 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
  533.     cudaMemcpy(denp, denp_gpu, 15 * 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
  534.     cudaMemcpy(dstp, dstp_gpu, 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
  535.     cudaMemcpy(src_pitch, src_pitch_gpu, 15 * 3 * sizeof(int), cudaMemcpyDeviceToHost);
  536.     cudaMemcpy(den_pitch, den_pitch_gpu, 15 * 3 * sizeof(int), cudaMemcpyDeviceToHost);
  537.     cudaMemcpy
  538.     //DIFFERENCE
  539.     int sum_pitch = BSX;
  540.     int sum_pitch_uv = BSX >> 1;
  541.     int64_t temp;
  542.     CDegrain_Difference_kernel<<<dimGrid, dimBlock>>>(SUM_gpu, srcp_gpu, denp_gpu, src_pitch_gpu, den_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
  543.     cudaDeviceSynchronize();
  544.  
  545.     //FILTER
  546.     CDegrain_Filter_kernel<<<dimGrid, dimBlock>>>(dstp_gpu, SUM_gpu, srcp_gpu, denp_gpu, src_pitch_gpu, den_pitch_gpu, dst_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
  547.     cudaDeviceSynchronize();
  548.  
  549.     // Copy the data from the GPU to the host
  550.     cudaMemcpy(SUM, SUM_gpu, BSY * BSX * 15 * sizeof(int), cudaMemcpyDeviceToHost);
  551.     cudaMemcpy(dstp, dstp_gpu, 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
  552.  
  553.     //Free the memory on the GPU
  554.     cudaFree(srcp_gpu);
  555.     cudaFree(denp_gpu);
  556.     cudaFree(dstp_gpu);
  557.     cudaFree(src_pitch_gpu);
  558.     cudaFree(den_pitch_gpu);
  559.     cudaFree(dst_pitch_gpu);
  560.     cudaFree(SUM_gpu);
Tags: CUDA
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement