Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // CTemporalsoften.cpp cuda attempt
- #include "CTemporalSoften.h"
- #include <cuda_runtime.h>
- /* old code
- CTemporalSoften::CTemporalSoften(PClip _child, int _radius, bool _isb, bool _Y, bool _U, bool _V, IScriptEnvironment* env) : GenericVideoFilter(_child), radius(_radius), isb(_isb), Y(_Y), U(_U), V(_V)
- {
- if (!vi.IsY() && !vi.IsYUV420() && !vi.IsYUV422() && !vi.IsYUV444())
- {
- env->ThrowError("CTemporalSoften: supported colorspaces are Y, YUV420, YUV422, YUV444!");
- }
- else if (radius < 0 || radius > 7)
- {
- env->ThrowError("CTemporalSoften: radius values must be in the [1, 7] range!");
- }
- }
- __global__ void ctemporalSoftenKernel(unsigned char* dstp, const unsigned char* srcp, int dst_pitch, int src_pitch, int height, int row_size, int radius)
- {
- int x = threadIdx.x + blockIdx.x * blockDim.x;
- int y = threadIdx.y + blockIdx.y * blockDim.y;
- if (x >= row_size || y >= height)
- return;
- int64_t temp = 0;
- for (int i = -radius; i <= radius; i++)
- {
- temp += srcp[y * src_pitch + x + i];
- }
- dstp[y * dst_pitch + x] = (unsigned char)(temp / (2 * radius + 1));
- }
- PVideoFrame __stdcall CTemporalSoften::GetFrame(int n, IScriptEnvironment* env) {
- int i;
- PVideoFrame src[8];
- src[0] = child->GetFrame(n, env);
- for (i = 1; i < radius + 1; i += 1)
- {
- src[i] = isb ? child->GetFrame(max(n - i, 0), env) : child->GetFrame(min(n + i, vi.num_frames - 1), env);
- }
- PVideoFrame dst = env->NewVideoFrame(vi);
- int bits_per_pixel = vi.BitsPerPixel();
- if(bits_per_pixel == 8){
- unsigned char* dstp;
- const unsigned char* srcp;
- int dst_pitch;
- int src_pitch;
- int height;
- int row_size;
- int p;
- int planes[] = { PLANAR_Y, PLANAR_U, PLANAR_V };
- for (p = 0; p < 3; p += 1)
- {
- dstp = dst->GetWritePtr(planes[p]);
- dst_pitch = dst->GetPitch(planes[p]);
- height = dst->GetHeight(planes[p]);
- row_size = dst->GetRowSize(planes[p]);
- srcp[0] = src[0]->GetReadPtr(planes[p]);
- src_pitch[0] = src[0]->GetPitch(planes[p]);
- if (p > 0 && vi.IsY())
- {
- break;
- }
- else if (p == 0 && !Y)
- {
- for (y = 0; y < height; y += 1)
- {
- memcpy(dstp, srcp[0], row_size);
- dstp += dst_pitch;
- srcp[0] += src_pitch[0];
- }
- }
- else if (p == 1 && !U)
- {
- for (y = 0; y < height; y += 1)
- {
- memcpy(dstp, srcp[0], row_size);
- dstp += dst_pitch;
- srcp[0] += src_pitch[0];
- }
- }
- else if (p == 2 && !V)
- {
- for (y = 0; y < height; y += 1)
- {
- memcpy(dstp, srcp[0], row_size);
- dstp += dst_pitch;
- srcp[0] += src_pitch[0];
- }
- }
- else
- {
- for (i = 1; i < radius + 1; i += 1)
- {
- srcp[i] = src[i]->GetReadPtr(planes[p]);
- src_pitch[i] = src[i]->GetPitch(planes[p]);
- }
- for (y = 0; y < height; y += 1)
- {
- for (x = 0; x < row_size; x += 1)
- {
- temp = 0;
- for (i = 0; i < radius + 1; i += 1)
- {
- temp += srcp[i][x];
- }
- dstp[x] = (unsigned char)(temp / (radius + 1));
- }
- dstp += dst_pitch;
- for (i = 0; i < radius + 1; i += 1)
- {
- srcp[i] += src_pitch[i];
- }
- }
- }
- } */
- // new code:
namespace {

// Upper bound on the number of frames averaged for one output frame:
// the current frame plus at most 7 neighbours (radius is limited to [0, 7]).
constexpr int kMaxFrames = 8;

/*
 * Temporal average of one plane.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per output pixel
 * (x = column in pixels, y = row). Threads outside the plane exit early.
 * No shared memory is used.
 *
 * dstp      - device buffer receiving the averaged plane
 * srcp      - device buffer holding (radius + 1) source planes stacked
 *             vertically: row y of frame f lives at row (f * height + y)
 * dst_pitch - byte distance between consecutive rows of dstp
 * src_pitch - byte distance between consecutive rows of srcp
 * height    - plane height in rows
 * width     - plane width in pixels (not bytes)
 * radius    - number of neighbouring frames averaged with the current one
 *
 * Pixel is the sample type (unsigned char for 8-bit, unsigned short for
 * 10..16-bit). The sum of at most 8 samples of 16 bits fits in 32 bits.
 */
template <typename Pixel>
__global__ void ctemporalSoftenKernel(unsigned char* __restrict__ dstp, const unsigned char* __restrict__ srcp, size_t dst_pitch, size_t src_pitch, int height, int width, int radius)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height)
        return;

    const unsigned int frames = static_cast<unsigned int>(radius) + 1u;
    unsigned int sum = 0;
    for (unsigned int f = 0; f < frames; ++f)
    {
        // Pitches are in bytes, so step rows on the byte pointer and only
        // then reinterpret the row as samples.
        const Pixel* row = reinterpret_cast<const Pixel*>(srcp + (static_cast<size_t>(f) * height + y) * src_pitch);
        sum += row[x];
    }

    Pixel* out = reinterpret_cast<Pixel*>(dstp + static_cast<size_t>(y) * dst_pitch);
    out[x] = static_cast<Pixel>(sum / frames);
}

// Owns one pitched device allocation and releases it on scope exit, so the
// memory is freed even when an error is reported by throwing.
struct DeviceBuffer
{
    unsigned char* ptr = nullptr;
    size_t pitch = 0;

    DeviceBuffer() = default;
    DeviceBuffer(const DeviceBuffer&) = delete;
    DeviceBuffer& operator=(const DeviceBuffer&) = delete;

    ~DeviceBuffer()
    {
        if (ptr)
            cudaFree(ptr);
    }

    // Allocates `rows` rows of `width_bytes` bytes; the row pitch chosen by
    // the runtime is stored in `pitch`.
    cudaError_t allocate(size_t width_bytes, size_t rows)
    {
        void* raw = nullptr;
        const cudaError_t status = cudaMallocPitch(&raw, &pitch, width_bytes, rows);
        if (status == cudaSuccess)
            ptr = static_cast<unsigned char*>(raw);
        return status;
    }
};

// Reports a failed CUDA call through the script environment.
void checkCuda(cudaError_t status, const char* what, IScriptEnvironment* env)
{
    if (status != cudaSuccess)
        env->ThrowError("CTemporalSoften: %s failed: %s", what, cudaGetErrorString(status));
}

/*
 * Uploads (radius + 1) host planes, averages them on the GPU and downloads
 * the result into dstp.
 *
 * srcp / src_pitch hold one entry per source frame; row_size is the plane
 * width in bytes; all pitches are in bytes.
 *
 * The device buffers are allocated and freed on every call. That keeps the
 * function self-contained but is slow; caching them in the filter instance
 * would be the next optimisation.
 */
template <typename Pixel>
void softenPlaneOnGpu(unsigned char* dstp, int dst_pitch, const unsigned char* const* srcp, const int* src_pitch, int row_size, int height, int radius, IScriptEnvironment* env)
{
    const int frames = radius + 1;
    const int width = row_size / static_cast<int>(sizeof(Pixel));
    if (width <= 0 || height <= 0)
        return;

    DeviceBuffer d_src;
    DeviceBuffer d_dst;
    checkCuda(d_src.allocate(row_size, static_cast<size_t>(height) * frames), "cudaMallocPitch (source)", env);
    checkCuda(d_dst.allocate(row_size, height), "cudaMallocPitch (destination)", env);

    for (int f = 0; f < frames; ++f)
    {
        unsigned char* target = d_src.ptr + static_cast<size_t>(f) * height * d_src.pitch;
        checkCuda(cudaMemcpy2D(target, d_src.pitch, srcp[f], src_pitch[f], row_size, height, cudaMemcpyHostToDevice), "cudaMemcpy2D (host to device)", env);
    }

    // 32 threads along x so that one warp reads one contiguous run of a row.
    const dim3 block(32, 8);
    const dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);
    ctemporalSoftenKernel<Pixel><<<grid, block>>>(d_dst.ptr, d_src.ptr, d_dst.pitch, d_src.pitch, height, width, radius);
    checkCuda(cudaGetLastError(), "kernel launch", env);

    // The blocking copy also waits for the kernel, so execution errors
    // surface here.
    checkCuda(cudaMemcpy2D(dstp, dst_pitch, d_dst.ptr, d_dst.pitch, row_size, height, cudaMemcpyDeviceToHost), "cudaMemcpy2D (device to host)", env);
}

} // namespace

/*
 * Temporal softening filter: every output pixel is the average of the same
 * pixel in the current frame and in `radius` neighbouring frames, taken
 * backwards (isb == true) or forwards (isb == false). Planes whose Y/U/V
 * flag is false are copied unchanged.
 *
 * NOTE(review): "CTemporalSoften.h" is included by this file; if that header
 * already declares this class, drop this definition and keep only the
 * out-of-class member definitions below.
 */
class CTemporalSoften : public GenericVideoFilter
{
    int radius;
    bool isb;
    bool Y;
    bool U;
    bool V;

public:
    CTemporalSoften(PClip _child, int _radius, bool _isb, bool _Y, bool _U, bool _V, IScriptEnvironment* env);
    PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env);
};

// Validates the clip format and the radius; reports problems via ThrowError.
CTemporalSoften::CTemporalSoften(PClip _child, int _radius, bool _isb, bool _Y, bool _U, bool _V, IScriptEnvironment* env)
    : GenericVideoFilter(_child), radius(_radius), isb(_isb), Y(_Y), U(_U), V(_V)
{
    if (!vi.IsY() && !vi.IsYUV420() && !vi.IsYUV422() && !vi.IsYUV444())
    {
        env->ThrowError("CTemporalSoften: supported colorspaces are Y, YUV420, YUV422, YUV444!");
    }
    else if (radius < 0 || radius >= kMaxFrames)
    {
        // radius 0 is accepted: it averages a single frame, i.e. a copy.
        env->ThrowError("CTemporalSoften: radius values must be in the [0, 7] range!");
    }
    else if (vi.ComponentSize() > 2)
    {
        // The kernels work on integer samples of one or two bytes only.
        env->ThrowError("CTemporalSoften: only 8..16-bit integer formats are supported!");
    }
}

/*
 * Builds output frame n.
 *
 * Fetches the current frame plus `radius` neighbours (indices clamped to the
 * clip range), then for each plane either copies it from the current frame
 * (flag disabled or radius == 0) or averages it across the fetched frames on
 * the GPU.
 */
PVideoFrame __stdcall CTemporalSoften::GetFrame(int n, IScriptEnvironment* env)
{
    PVideoFrame src[kMaxFrames];
    src[0] = child->GetFrame(n, env);
    for (int i = 1; i <= radius; ++i)
    {
        int neighbour;
        if (isb)
        {
            neighbour = (n - i > 0) ? (n - i) : 0;
        }
        else
        {
            const int last = vi.num_frames - 1;
            neighbour = (n + i < last) ? (n + i) : last;
        }
        src[i] = child->GetFrame(neighbour, env);
    }

    PVideoFrame dst = env->NewVideoFrame(vi);

    const int planes[] = { PLANAR_Y, PLANAR_U, PLANAR_V };
    const bool enabled[] = { Y, U, V };
    const int plane_count = vi.IsY() ? 1 : 3;
    // ComponentSize() is the size of one sample in bytes; BitsPerPixel()
    // would describe a whole pixel and cannot be used to pick the kernel.
    const bool two_byte_samples = vi.ComponentSize() == 2;

    for (int p = 0; p < plane_count; ++p)
    {
        const int plane = planes[p];
        unsigned char* dstp = dst->GetWritePtr(plane);
        const int dst_pitch = dst->GetPitch(plane);
        const int height = dst->GetHeight(plane);
        const int row_size = dst->GetRowSize(plane);

        if (!enabled[p] || radius == 0)
        {
            env->BitBlt(dstp, dst_pitch, src[0]->GetReadPtr(plane), src[0]->GetPitch(plane), row_size, height);
            continue;
        }

        const unsigned char* srcp[kMaxFrames];
        int src_pitch[kMaxFrames];
        for (int i = 0; i <= radius; ++i)
        {
            srcp[i] = src[i]->GetReadPtr(plane);
            src_pitch[i] = src[i]->GetPitch(plane);
        }

        if (two_byte_samples)
        {
            softenPlaneOnGpu<unsigned short>(dstp, dst_pitch, srcp, src_pitch, row_size, height, radius, env);
        }
        else
        {
            softenPlaneOnGpu<unsigned char>(dstp, dst_pitch, srcp, src_pitch, row_size, height, radius, env);
        }
    }

    return dst;
}

// Script-facing factory: CTemporalSoften(clip, int radius, bool isb, bool Y, bool U, bool V).
AVSValue __cdecl Create_CTemporalSoften(AVSValue args, void* user_data, IScriptEnvironment* env)
{
    return new CTemporalSoften(args[0].AsClip(), args[1].AsInt(), args[2].AsBool(), args[3].AsBool(), args[4].AsBool(), args[5].AsBool(), env);
}

const AVS_Linkage* AVS_linkage = nullptr;

// Plugin entry point; must be a free function so that it can be exported.
extern "C" __declspec(dllexport) const char* __stdcall AvisynthPluginInit3(IScriptEnvironment* env, const AVS_Linkage* const vectors)
{
    AVS_linkage = vectors;
    // One clip, one int and four bools, matching what Create_CTemporalSoften reads.
    env->AddFunction("CTemporalSoften", "cibbbb", Create_CTemporalSoften, 0);
    return "CTemporalsoften";
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement