Advertisement
Ucurrent

CTemporalsoften_cuda.cpp

Jan 17th, 2023 (edited)
105
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 10.94 KB | Source Code | 0 0
  1. // CTemporalsoften.cpp cuda attempt
  2. #include "CTemporalSoften.h"
  3. #include <cuda_runtime.h>
  4.  
  5. /* old code
  6. CTemporalSoften::CTemporalSoften(PClip _child, int _radius, bool _isb, bool _Y, bool _U, bool _V, IScriptEnvironment* env) : GenericVideoFilter(_child), radius(_radius), isb(_isb), Y(_Y), U(_U), V(_V)
  7. {
  8.     if (!vi.IsY() && !vi.IsYUV420() && !vi.IsYUV422() && !vi.IsYUV444())
  9.     {
  10.         env->ThrowError("CTemporalSoften: supported colorspaces are Y, YUV420, YUV422, YUV444!");
  11.     }
  12.     else if (radius < 0 || radius > 7)
  13.     {
  14.         env->ThrowError("CTemporalSoften: radius values must be in the [1, 7] range!");
  15.     }
  16. }
  17.  
  18. __global__ void ctemporalSoftenKernel(unsigned char* dstp, const unsigned char* srcp, int dst_pitch, int src_pitch, int height, int row_size, int radius)
  19. {
  20.     int x = threadIdx.x + blockIdx.x * blockDim.x;
  21.     int y = threadIdx.y + blockIdx.y * blockDim.y;
  22.     if (x >= row_size || y >= height)
  23.         return;
  24.  
  25.     int64_t temp = 0;
  26.     for (int i = -radius; i <= radius; i++)
  27.     {
  28.         temp += srcp[y * src_pitch + x + i];
  29.     }
  30.     dstp[y * dst_pitch + x] = (unsigned char)(temp / (2 * radius + 1));
  31. }
  32.  
  33. PVideoFrame __stdcall CTemporalSoften::GetFrame(int n, IScriptEnvironment* env) {
  34.  
  35.     int i;
  36.     PVideoFrame src[8];
  37.     src[0] = child->GetFrame(n, env);
  38.  
  39.     for (i = 1; i < radius + 1; i += 1)
  40.     {
  41.         src[i] = isb ? child->GetFrame(max(n - i, 0), env) : child->GetFrame(min(n + i, vi.num_frames - 1), env);
  42.     }
  43.     PVideoFrame dst = env->NewVideoFrame(vi);
  44.     int bits_per_pixel = vi.BitsPerPixel();
  45.  
  46.     if(bits_per_pixel == 8){
  47.         unsigned char* dstp;
  48.         const unsigned char* srcp;
  49.  
  50.         int dst_pitch;
  51.         int src_pitch;
  52.         int height;
  53.         int row_size;
  54.         int p;
  55.         int planes[] = { PLANAR_Y, PLANAR_U, PLANAR_V };
  56.  
  57.     for (p = 0; p < 3; p += 1)
  58.     {
  59.         dstp = dst->GetWritePtr(planes[p]);
  60.         dst_pitch = dst->GetPitch(planes[p]);
  61.         height = dst->GetHeight(planes[p]);
  62.         row_size = dst->GetRowSize(planes[p]);
  63.         srcp[0] = src[0]->GetReadPtr(planes[p]);
  64.         src_pitch[0] = src[0]->GetPitch(planes[p]);
  65.  
  66.         if (p > 0 && vi.IsY())
  67.         {
  68.             break;
  69.         }
  70.         else if (p == 0 && !Y)
  71.         {
  72.             for (y = 0; y < height; y += 1)
  73.             {
  74.                 memcpy(dstp, srcp[0], row_size);
  75.                 dstp += dst_pitch;
  76.                 srcp[0] += src_pitch[0];
  77.             }
  78.         }
  79.         else if (p == 1 && !U)
  80.         {
  81.             for (y = 0; y < height; y += 1)
  82.             {
  83.                 memcpy(dstp, srcp[0], row_size);
  84.                 dstp += dst_pitch;
  85.                 srcp[0] += src_pitch[0];
  86.             }
  87.         }
  88.         else if (p == 2 && !V)
  89.         {
  90.             for (y = 0; y < height; y += 1)
  91.             {
  92.                 memcpy(dstp, srcp[0], row_size);
  93.                 dstp += dst_pitch;
  94.                 srcp[0] += src_pitch[0];
  95.             }
  96.         }
  97.         else
  98.         {
  99.             for (i = 1; i < radius + 1; i += 1)
  100.             {
  101.                 srcp[i] = src[i]->GetReadPtr(planes[p]);
  102.                 src_pitch[i] = src[i]->GetPitch(planes[p]);
  103.             }
  104.             for (y = 0; y < height; y += 1)
  105.             {
  106.                 for (x = 0; x < row_size; x += 1)
  107.                 {
  108.                     temp = 0;
  109.                     for (i = 0; i < radius + 1; i += 1)
  110.                     {
  111.                         temp += srcp[i][x];
  112.                     }
  113.                     dstp[x] = (unsigned char)(temp / (radius + 1));
  114.                 }
  115.                 dstp += dst_pitch;
  116.                 for (i = 0; i < radius + 1; i += 1)
  117.                 {
  118.                     srcp[i] += src_pitch[i];
  119.                 }
  120.             }
  121.         }
  122.     } */
  123.  
  124. // new code:
  125. class CTemporalSoften : public GenericVideoFilter
  126. {
  127. public:
  128.     __global__ void ctemporalSoftenKernel(unsigned char* dstp, const unsigned char* srcp, int dst_pitch, int src_pitch, int height, int row_size, int radius)
  129.     {
  130.         // kernel function for 8-bit
  131.  
  132.         int x = threadIdx.x + blockIdx.x * blockDim.x;
  133.         int y = threadIdx.y + blockIdx.y * blockDim.y;
  134.         if (x >= row_size || y >= height)
  135.             return;
  136.  
  137.         int64_t temp = 0;
  138.         for (int i = -radius; i <= radius; i++)
  139.         {
  140.             temp += srcp[y * src_pitch + x + i];
  141.         }
  142.         dstp[y * dst_pitch + x] = (unsigned char)(temp / (2 * radius + 1));
  143.     }
  144.  
  145.         __global__ void ctemporalSoftenKernel(unsigned short* dstp, const unsigned short* srcp, int dst_pitch, int src_pitch, int height, int row_size, int radius, int bits_per_pixel)
  146.     {
  147.         // kernel function for 10...16-bit
  148.         int x = threadIdx.x + blockIdx.x * blockDim.x;
  149.         int y = threadIdx.y + blockIdx.y * blockDim.y;
  150.         if (x >= row_size || y >= height)
  151.             return;
  152.  
  153.         int64_t temp = 0;
  154.         int shift = bits_per_pixel - 8;
  155.         for (int i = -radius; i <= radius; i++)
  156.         {
  157.             temp += srcp[y * src_pitch + x + i] >> shift;
  158.         }
  159.         dstp[y * dst_pitch + x] = (unsigned short)(temp / (2 * radius + 1) << shift);
  160.     }
  161.  
  162. AVSValue __cdecl Create_CTemporalSoften(AVSValue args, void* user_data, IScriptEnvironment* env)
  163. {
  164.     return new CTemporalSoften(args[0].AsClip(), args[1].AsInt(), args[2].AsBool(), args[3].AsBool(), args[4].AsBool(), args[5].AsBool(), env);
  165. }
  166.  
  167. const AVS_Linkage* AVS_linkage = nullptr;
  168.  
  169. extern "C" __declspec(dllexport) const char* __stdcall AvisynthPluginInit3(IScriptEnvironment* env, const AVS_Linkage* const vectors)
  170. {
  171.     AVS_linkage = vectors;
  172.     env->AddFunction("CTemporalSoften", "ciibbb", Create_CTemporalSoften, 0);
  173.     return "CTemporalsoften";
  174. }
  175.  
  176. };
  177.  
  178. PVideoFrame __stdcall CTemporalSoften::GetFrame(int n, IScriptEnvironment* env) {
  179.  
  180.     int i;
  181.     PVideoFrame src[8];
  182.     src[0] = child->GetFrame(n, env);
  183.  
  184.     for (i = 1; i < radius + 1; i += 1)
  185.     {
  186.         src[i] = isb ? child->GetFrame(max(n - i, 0), env) : child->GetFrame(min(n + i, vi.num_frames - 1), env);
  187.     }
  188.     PVideoFrame dst = env->NewVideoFrame(vi);
  189.     int bits_per_pixel = vi.BitsPerPixel();
  190.  
  191.     if(bits_per_pixel == 8){
  192.         unsigned char* dstp;
  193.         const unsigned char* srcp;
  194.  
  195.         int dst_pitch;
  196.         int src_pitch;
  197.         int height;
  198.         int row_size;
  199.         int p;
  200.         int planes[] = { PLANAR_Y, PLANAR_U, PLANAR_V };
  201.  
  202.     for (p = 0; p < 3; p += 1)
  203.     {
  204.         dstp = dst->GetWritePtr(planes[p]);
  205.         dst_pitch = dst->GetPitch(planes[p]);
  206.         height = dst->GetHeight(planes[p]);
  207.         row_size = dst->GetRowSize(planes[p]);
  208.         srcp[0] = src[0]->GetReadPtr(planes[p]);
  209.         src_pitch[0] = src[0]->GetPitch(planes[p]);
  210.  
  211.                 if (p > 0 && vi.IsY())
  212.         {
  213.             break;
  214.         }
  215.         else if (p == 0 && Y)
  216.         {
  217.             if(bits_per_pixel == 8){
  218.                 unsigned char* d_srcp[8], *d_dstp;
  219.                 cudaMalloc(&d_srcp[0], src_pitch[0] * height);
  220.                 cudaMalloc(&d_dstp, dst_pitch * height);
  221.  
  222.                 cudaMemcpy(d_srcp[0], srcp[0], src_pitch[0] * height, cudaMemcpyHostToDevice);
  223.  
  224.                 dim3 block(16, 16);
  225.                 dim3 grid((row_size + block.x - 1) / block.x, (height + block.y - 1) / block.y);
  226.                 CtemporalSoftenKernel<<<grid, block>>>(d_dstp, d_srcp[0], dst_pitch, src_pitch[0], height, row_size, radius);
  227.  
  228.                 cudaMemcpy(dstp, d_dstp, dst_pitch * height, cudaMemcpyDeviceToHost);
  229.  
  230.                 cudaFree(d_srcp[0]);
  231.                 cudaFree(d_dstp);
  232.             }else if(bits_per_pixel == 10){
  233.                 unsigned short* d_srcp[8], *d_dstp;
  234.                 cudaMalloc(&d_srcp[0], src_pitch[0] * height*sizeof(unsigned short));
  235.                                  cudaMalloc(&d_dstp, dst_pitch * height*sizeof(unsigned short));
  236.  
  237.                 cudaMemcpy(d_srcp[0], srcp[0], src_pitch[0] * height*sizeof(unsigned short), cudaMemcpyHostToDevice);
  238.  
  239.                 dim3 block(16, 16);
  240.                 dim3 grid((row_size + block.x - 1) / block.x, (height + block.y - 1) / block.y);
  241.                 CtemporalSoftenKernel<<<grid, block>>>(d_dstp, d_srcp[0], dst_pitch, src_pitch[0], height, row_size, radius);
  242.  
  243.                 cudaMemcpy(dstp, d_dstp, dst_pitch * height*sizeof(unsigned short), cudaMemcpyDeviceToHost);
  244.  
  245.                 cudaFree(d_srcp[0]);
  246.                 cudaFree(d_dstp);
  247.             }else if(bits_per_pixel == 12){
  248.                 unsigned short* d_srcp[8], *d_dstp;
  249.                 cudaMalloc(&d_srcp[0], src_pitch[0] * height*sizeof(unsigned short));
  250.                 cudaMalloc(&d_dstp, dst_pitch * height*sizeof(unsigned short));
  251.  
  252.                 cudaMemcpy(d_srcp[0], srcp[0], src_pitch[0] * height*sizeof(unsigned short), cudaMemcpyHostToDevice);
  253.  
  254.                 dim3 block(16, 16);
  255.                 dim3 grid((row_size + block.x - 1) / block.x, (height + block.y - 1) / block.y);
  256.                 CtemporalSoftenKernel<<<grid, block>>>(d_dstp, d_srcp[0], dst_pitch, src_pitch[0], height, row_size, radius);
  257.  
  258.                 cudaMemcpy(dstp, d_dstp, dst_pitch * height*sizeof(unsigned short), cudaMemcpyDeviceToHost);
  259.  
  260.                 cudaFree(d_srcp[0]);
  261.                 cudaFree(d_dstp);
  262.             }else if(bits_per_pixel == 14){
  263.                 unsigned short* d_srcp[8], *d_dstp;
  264.                 cudaMalloc(&d_srcp[0], src_pitch[0] * height*sizeof(unsigned short));
  265.                 cudaMalloc(&d_dstp, dst_pitch * height*sizeof(unsigned short));
  266.  
  267.                 cudaMemcpy(d_srcp[0], srcp[0], src_pitch[0] * height*sizeof(unsigned short), cudaMemcpyHostToDevice);
  268.  
  269.                                 dim3 block(16, 16);
  270.                 dim3 grid((row_size + block.x - 1) / block.x, (height + block.y - 1) / block.y);
  271.                 CtemporalSoftenKernel<<<grid, block>>>(d_dstp, d_srcp[0], dst_pitch, src_pitch[0], height, row_size, radius);
  272.  
  273.                 cudaMemcpy(dstp, d_dstp, dst_pitch * height*sizeof(unsigned short), cudaMemcpyDeviceToHost);
  274.  
  275.                 cudaFree(d_srcp[0]);
  276.                 cudaFree(d_dstp);
  277.             }else if(bits_per_pixel == 16){
  278.                 unsigned short* d_srcp[8], *d_dstp;
  279.                 cudaMalloc(&d_srcp[0], src_pitch[0] * height*sizeof(unsigned short));
  280.                 cudaMalloc(&d_dstp, dst_pitch * height*sizeof(unsigned short));
  281.  
  282.                 cudaMemcpy(d_srcp[0], srcp[0], src_pitch[0] * height*sizeof(unsigned short), cudaMemcpyHostToDevice);
  283.  
  284.                 dim3 block(16, 16);
  285.                 dim3 grid((row_size + block.x - 1) / block.x, (height + block.y - 1) / block.y);
  286.                 CtemporalSoftenKernel<<<grid, block>>>(d_dstp, d_srcp[0], dst_pitch, src_pitch[0], height, row_size, radius);
  287.  
  288.                 cudaMemcpy(dstp, d_dstp, dst_pitch * height*sizeof(unsigned short), cudaMemcpyDeviceToHost);
  289.  
  290.                 cudaFree(d_srcp[0]);
  291.                 cudaFree(d_dstp);
  292.             }
  293.         }
  294.     }
  295.     return dst;
  296. }
  297.  
  298.  
  299.            
  300.  
Tags: CUDA
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement