Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // Include the CUDA runtime library
- #include <cuda_runtime.h>
- #include "cdegrain.h"
- // Define the CUDA kernel function that will be called
- __global__ void CDegrain_kernel(unsigned char* src, unsigned char* den, unsigned char* dst, int nt, int thr, int radius, int blksize)
- {
- // Perform the same operations as in CDegrain_Y,
- // CDegrain_YUV420, CDegrain_YUV422, CDegrain_YUV444
- // But use CUDA-specific data types and functions
- if (!vi.IsY() && !vi.IsYUV420() && !vi.IsYUV422() && !vi.IsYUV444())
- {
- env->ThrowError("CDegrain: supported colorspaces are Y, YUV420, YUV422, YUV444!");
- }
- else if ((vi.IsYUV420() || vi.IsYUV422() || vi.IsYUV444()) && blksize < 2)
- {
- env->ThrowError("CDegrain: YUV420/YUV422/YUV444 min. blksize is 2!");
- }
- else if (radius < 0 || radius > 7)
- {
- env->ThrowError("CDegrain: radius values must be in the [1, 7] range!");
- }
- else if (bit_depth != 8 && bit_depth != 10 && bit_depth != 12 && bit_depth != 14 && bit_depth != 16)
- {
- env->ThrowError("CDegrain: bit depth must be 8, 10, 12, 14 or 16!");
- }
- else
- {
- // Create a circular buffer of size radius+1
- // circular_buffer<PVideoFrame> den(radius + 1);
- // Fill the buffer with the frames from the input clip
- for (int i = 0; i < radius; i += 1)
- {
- den[i] = dClip->GetFrame(max(n - radius + i, 0), env);
- }
- den[radius] = dClip->GetFrame(n, env);
- for (int i = 0; i < radius; i += 1)
- {
- den[i + radius + 1] = dClip->GetFrame(min(n + i + 1, vi.num_frames - 1), env);
- }
- // Use std::thread to create multiple threads and
- // divide the workload between them
- std::vector<std::thread> threads;
- for (int i = 0; i < num_threads; i++)
- {
- int start = i * workload_per_thread;
- int end = (i + 1) * workload_per_thread;
- threads.emplace_back([&, start, end](){
- // Process the workload for this thread
- for (int i = start; i < end; i++)
- {
- // Perform the operation on the specific frame
- vi.IsY() ? CDegrain_Y(src, den, dst, nt, thr, radius, blksize, bit_depth) : NULL;
- vi.IsYUV420() ? CDegrain_YUV420(src, den, dst, nt, thr, radius, blksize, bit_depth) : NULL;
- vi.IsYUV422() ? CDegrain_YUV422(src, den, dst, nt, thr, radius, blksize, bit_depth) : NULL;
- vi.IsYUV444() ? CDegrain_YUV444(src, den, dst, nt, thr, radius, blksize, bit_depth) : NULL;
- } }); }
- // Wait for all threads to finish
- //for (auto& thread : threads)
- {
- thread.join();
- //old: return dst;
- // ...
- }
- CDegrain::CDegrain(PClip _child, int _radius, int _nt, int _thr, int _blksize, PClip _dClip, IScriptEnvironment* env) : GenericVideoFilter(_child), radius(_radius), nt(_nt), thr(_thr), blksize(_blksize), dClip(_dClip)
- {
- if (!vi.IsY() && !vi.IsYUV420() && !vi.IsYUV422() && !vi.IsYUV444())
- {
- env->ThrowError("CDegrain: supported colorspaces are Y, YUV420, YUV422, YUV444!");
- }
- else if ((vi.IsYUV420() || vi.IsYUV422() || vi.IsYUV444()) && blksize < 2)
- {
- env->ThrowError("CDegrain: YUV420/YUV422/YUV444 min. blksize is 2!");
- }
- else if (radius < 0 || radius > 7)
- {
- env->ThrowError("CDegrain: radius values must be in the [1, 7] range!");
- }
- else
- {
- // Allocate memory on the GPU
- unsigned char* src_gpu, * den_gpu, * dst_gpu;
- size_t size = vi.num_frames * vi.height * vi.width;
- cudaMalloc(&src_gpu, size);
- cudaMalloc(&den_gpu, size);
- cudaMalloc(&dst_gpu, size);
- // Copy the data from the host to the GPU
- cudaMemcpy(src_gpu, src, size, cudaMemcpyHostToDevice);
- cudaMemcpy(den_gpu, den, size, cudaMemcpyHostToDevice);
- // Set the kernel launch configuration
- dim3 dimBlock(blksize, blksize);
- dim3 dimGrid(vi.height / blksize, vi.width / blksize);
- // Launch the kernel on the GPU
- CDegrain_kernel<<<dimGrid, dimBlock>>>(src_gpu, den_gpu, dst_gpu, nt, thr, radius, blksize);
- // Synchronize the execution of the GPU threads
- cudaDeviceSynchronize();
- // Copy the data from the GPU to the host
- cudaMemcpy(dst, dst_gpu, size,
- cudaMemcpyDeviceToHost);
- // Free the memory on the GPU
- cudaFree(src_gpu);
- // Include the CUDA runtime library
- #include <cuda_runtime.h>
- // Define the CUDA kernel function that will be called
- __global__ void CDegrain_YUV444_kernel(unsigned char* srcp[3][15], unsigned char* denp[3][15], unsigned char* dstp[3], int src_pitch[3][15], int den_pitch[3][15], int dst_pitch[3], int64_t BSY, int64_t BSX, int nt, int thr, int radius, int blksize)
- {
- // Perform the same operations as in the original function
- // But use CUDA-specific data types and functions
- // Get the bit depth of the input video
- int bit_depth = dst->GetBitDepth();
- // Create arrays to hold the pointers to the data for
- // each plane
- const unsigned char* srcp[3][15];
- const unsigned char* denp[3][15];
- // Get the pointers to the data for each plane of each frame
- for (int i = 0; i < 15; i++)
- {
- for (int j = 0; j < 3; j++)
- {
- srcp[j][min(i, radius * 2)] = src[min(i, radius * 2)]->GetReadPtr(PLANAR_Y + j);
- denp[j][min(i, radius * 2)] = den[min(i, radius * 2)]->GetReadPtr(PLANAR_Y + j);
- }
- }
- // Get the pitch for each plane of each frame
- int src_pitch[3][15];
- int den_pitch[3][15];
- for (int i = 0; i < 15; i++)
- {
- for (int j = 0; j < 3; j++)
- {
- src_pitch[j][min(i, radius * 2)] = src[min(i, radius * 2)]->GetPitch(PLANAR_Y + j);
- den_pitch[j][min(i, radius * 2)] = den[min(i, radius * 2)]->GetPitch(PLANAR_Y + j);
- }
- }
- // Get the pointers to the data for the destination
- //frame
- unsigned char* dstp[3];
- dstp[0] = dst->GetWritePtr(PLANAR_Y);
- dstp[1] = dst->GetWritePtr(PLANAR_U);
- dstp[2] = dst->GetWritePtr(PLANAR_V);
- // Get the pitch for the destination frame
- int dst_pitch[3];
- dst_pitch[0] = dst->GetPitch(PLANAR_Y);
- dst_pitch[1] = dst->GetPitch(PLANAR_U);
- dst_pitch[2] = dst->GetPitch(PLANAR_V);
- int height = dst->GetHeight(PLANAR_Y);
- int row_size = dst->GetRowSize(PLANAR_Y);
- // Pre-allocate the SUM array
- int64_t BSY = 0;
- int64_t BSX = 0;
- BSY += min(1, height % blksize);
- BSY += height / blksize;
- BSX += min(1, row_size % blksize);
- BSX += row_size / blksize;
- auto SUM = new int[BSY * BSX][15];
- memset(SUM, 0, BSY * BSX * 15 * sizeof(int));
- int BX, BY;
- int pixel_sum[4];
- int x, y, a, b, temp;
- //DIFFERENCE
- for (y = 0; y < height; y += 1)
- {
- BY = y / blksize;
- BY += y % blksize ? 1 : 0;
- for (x = 0; x < row_size; x += 1)
- {
- BX = x / blksize;
- BX += x % blksize ? 1 : 0;
- for (a = 0; a < 3; a += 1)
- {
- for (b = 0; b < 15; b += 1)
- {
- temp = srcp[a][b][y*src_pitch[a][b] + x] - denp[a][b][y*den_pitch[a][b] + x];
- SUM[BY*BSX + BX][b] += temp*temp;
- }
- }
- }
- }
- int sum_pitch = BSX;
- int sum_pitch_uv = BSX >> 1;
- //FILTER
- for (y = 0; y < height; y += 1)
- {
- for (x = 0; x < row_size; x += 1)
- {
- for (a = 0; a < 3; a += 1)
- {
- pixel_sum[0] = 0;
- pixel_sum[1] = 0;
- pixel_sum[2] = 0;
- pixel_sum[3] = 0;
- BY = y / blksize;
- BY += y % blksize ? 1 : 0;
- BX = x / blksize;
- BX += x % blksize ? 1 : 0;
- for (b = 0; b < 15; b += 1)
- {
- temp = srcp[a][b][y*src_pitch[a][b] + x] - denp[a][b][y*den_pitch[a][b] + x];
- temp = min(abs(temp), nt);
- temp = temp < thr ? 0 : temp;
- temp *= temp;
- temp *= temp;
- temp *= temp;
- if (SUM[(BY - 1)*sum_pitch + BX - 1][b] <= temp)
- {
- pixel_sum[0] += denp[a][b][y*den_pitch[a][b] + x];
- }
- if (SUM[(BY - 1)*sum_pitch + BX][b] <= temp)
- {
- pixel_sum[1] += denp[a][b][y*den_pitch[a][b] + x];
- }
- if (SUM[(BY - 1)*sum_pitch + BX + 1][b] <= temp)
- {
- pixel_sum[2] += denp[a][b][y*den_pitch[a][b] + x];
- }
- if (SUM[(BY)*sum_pitch + BX - 1][b] <= temp)
- {
- pixel_sum[3] += denp[a][b][y*den_pitch[a][b] + x];
- }
- }
- pixel_sum[0] = pixel_sum[0] > pixel_sum[3] ? pixel_sum[0] : pixel_sum[3];
- pixel_sum[1] = pixel_sum[1] > pixel_sum[2] ? pixel_sum[1] : pixel_sum[2];
- pixel_sum[0] = pixel_sum[0] > pixel_sum[1] ? pixel_sum[0] : pixel_sum[1];
- dstp[a][y*dst_pitch[a] + x] = (unsigned char)min(max((int)srcp[a][radius][y*src_pitch[a][radius] + x] - (int)pixel_sum[0] / (radius*2 + 1), 0), (1 << bit_depth) - 1);
- }
- // ...
- }
- void CDegrain_Y(PVideoFrame(&src)[15], PVideoFrame(&den)[15], PVideoFrame& dst, int nt, int thr, int radius, int blksize)
- {
- // Get the bit depth of the input video
- int bit_depth = dst->GetBitDepth();
- // Allocate memory on the GPU for the pointers to the data for each plane
- unsigned char* srcp_gpu[3][15], * denp_gpu[3][15], * dstp_gpu[3];
- cudaMalloc(&srcp_gpu, 15 * 3 * sizeof(unsigned char*));
- cudaMalloc(&denp_gpu, 15 * 3 * sizeof(unsigned char*));
- cudaMalloc(&dstp_gpu, 3 * sizeof(unsigned char*));
- // Copy the data from the host to the GPU
- cudaMemcpy(srcp_gpu, srcp, 15 * 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
- cudaMemcpy(denp_gpu, denp, 15 * 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
- cudaMemcpy(dstp_gpu, dstp, 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
- // Allocate memory on the GPU for the pitch for each plane of each frame
- int src_pitch_gpu[3][15], den_pitch_gpu[3][15], dst_pitch_gpu[3];
- cudaMalloc(&src_pitch_gpu, 15 * 3 * sizeof(int));
- cudaMalloc(&den_pitch_gpu, 15 * 3 * sizeof(int));
- cudaMalloc(&dst_pitch_gpu, 3 * sizeof(int));
- // Copy the data from the host to the GPU
- cudaMemcpy(src_pitch_gpu, src_pitch, 15 * 3 * sizeof(int), cudaMemcpyHostToDevice);
- cudaMemcpy(den_pitch_gpu, den_pitch, 15 * 3 * sizeof(int), cudaMemcpyHostToDevice);
- cudaMemcpy(dst_pitch_gpu, dst_pitch, 3 * sizeof(int), cudaMemcpyHostToDevice);
- // Allocate memory on the GPU for the SUM array
- int64_t BSY = 0;
- int64_t BSX = 0;
- BSY += min(1, height % blksize);
- BSY += height / blksize;
- BSX += min(1, row_size % blksize);
- BSX += row_size / blksize;
- int* SUM_gpu;
- cudaMalloc(&SUM_gpu, BSY * BSX * 15 * sizeof(int));
- cudaMemset(SUM_gpu, 0, BSY * BSX * 15 * sizeof(int));
- // Set the kernel launch configuration
- dim3 dimBlock(blksize, blksize);
- dim3 dimGrid(height / blksize, row_size / blksize);
- // Launch the kernel on the GPU
- CDegrain_Y_kernel<<<dimGrid, dimBlock>>>(srcp_gpu, denp_gpu, dstp_gpu, src_pitch_gpu, den_pitch_gpu, dst_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
- cudaDeviceSynchronize();
- // Copy the data from the GPU to the host
- cudaMemcpy(srcp, srcp_gpu, 15 * 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
- cudaMemcpy(denp, denp_gpu, 15 * 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
- cudaMemcpy(dstp, dstp_gpu, 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
- cudaMemcpy(src_pitch, src_pitch_gpu, 15 * 3 * sizeof(int), cudaMemcpyDeviceToHost);
- cudaMemcpy(den_pitch, den_pitch_gpu, 15 * 3 * sizeof(int), cudaMemcpyDeviceToHost);
- cudaMemcpy
- //DIFFERENCE
- int sum_pitch = BSX;
- int sum_pitch_uv = BSX >> 1;
- int64_t temp;
- CDegrain_Difference_kernel<<<dimGrid, dimBlock>>>(SUM_gpu, srcp_gpu, denp_gpu, src_pitch_gpu, den_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
- cudaDeviceSynchronize();
- //FILTER
- CDegrain_Filter_kernel<<<dimGrid, dimBlock>>>(dstp_gpu, SUM_gpu, srcp_gpu, denp_gpu, src_pitch_gpu, den_pitch_gpu, dst_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
- cudaDeviceSynchronize();
- // Copy the data from the GPU to the host
- cudaMemcpy(SUM, SUM_gpu, BSY * BSX * 15 * sizeof(int), cudaMemcpyDeviceToHost);
- cudaMemcpy(dstp, dstp_gpu, 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
- //Free the memory on the GPU
- cudaFree(srcp_gpu);
- cudaFree(denp_gpu);
- cudaFree(dstp_gpu);
- cudaFree(src_pitch_gpu);
- cudaFree(den_pitch_gpu);
- cudaFree(dst_pitch_gpu);
- cudaFree(SUM_gpu);
- void CDegrain_YUV420(PVideoFrame(&src)[15], PVideoFrame(&den)[15], PVideoFrame& dst, int nt, int thr, int radius, int blksize)
- {
- // Get the bit depth of the input video
- int bit_depth = dst->GetBitDepth();
- // Allocate memory on the GPU for the pointers to the data for each plane
- unsigned char* srcp_gpu[3][15], * denp_gpu[3][15], * dstp_gpu[3];
- cudaMalloc(&srcp_gpu, 15 * 3 * sizeof(unsigned char*));
- cudaMalloc(&denp_gpu, 15 * 3 * sizeof(unsigned char*));
- cudaMalloc(&dstp_gpu, 3 * sizeof(unsigned char*));
- // Copy the data from the host to the GPU
- cudaMemcpy(srcp_gpu, srcp, 15 * 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
- cudaMemcpy(denp_gpu, denp, 15 * 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
- cudaMemcpy(dstp_gpu, dstp, 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
- // Allocate memory on the GPU for the pitch for each plane of each frame
- int src_pitch_gpu[3][15], den_pitch_gpu[3][15], dst_pitch_gpu[3];
- cudaMalloc(&src_pitch_gpu, 15 * 3 * sizeof(int));
- cudaMalloc(&den_pitch_gpu, 15 * 3 * sizeof(int));
- cudaMalloc(&dst_pitch_gpu, 3 * sizeof(int));
- // Copy the data from the host to the GPU
- cudaMemcpy(src_pitch_gpu, src_pitch, 15 * 3 * sizeof(int), cudaMemcpyHostToDevice);
- cudaMemcpy(den_pitch_gpu, den_pitch, 15 * 3 * sizeof(int), cudaMemcpyHostToDevice);
- cudaMemcpy(dst_pitch_gpu, dst_pitch, 3 * sizeof(int), cudaMemcpyHostToDevice);
- // Allocate memory on the GPU for the SUM array
- int64_t BSY = 0;
- int64_t BSX = 0;
- BSY += min(1, height % blksize);
- BSY += height / blksize;
- BSX += min(1, row_size % blksize);
- BSX += row_size / blksize;
- int* SUM_gpu;
- cudaMalloc(&SUM_gpu, BSY * BSX * 15 * sizeof(int));
- cudaMemset(SUM_gpu, 0, BSY * BSX * 15 * sizeof(int));
- // Set the kernel launch configuration
- dim3 dimBlock(blksize, blksize);
- dim3 dimGrid(height / blksize, row_size / blksize);
- // Launch the kernel on the GPU
- CDegrain_YUV420_kernel<<<dimGrid, dimBlock>>>(srcp_gpu, denp_gpu, dstp_gpu, src_pitch_gpu, den_pitch_gpu, dst_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
- cudaDeviceSynchronize();
- // Copy the data from the GPU to the host
- cudaMemcpy(srcp, srcp_gpu, 15 * 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
- cudaMemcpy(denp, denp_gpu, 15 * 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
- cudaMemcpy(dstp, dstp_gpu, 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
- cudaMemcpy(src_pitch, src_pitch_gpu, 15 * 3 * sizeof(int), cudaMemcpyDeviceToHost);
- cudaMemcpy(den_pitch, den_pitch_gpu, 15 * 3 * sizeof(int), cudaMemcpyDeviceToHost);
- cudaMemcpy
- //DIFFERENCE
- int sum_pitch = BSX;
- int sum_pitch_uv = BSX >> 1;
- int64_t temp;
- CDegrain_Difference_kernel<<<dimGrid, dimBlock>>>(SUM_gpu, srcp_gpu, denp_gpu, src_pitch_gpu, den_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
- cudaDeviceSynchronize();
- //FILTER
- CDegrain_Filter_kernel<<<dimGrid, dimBlock>>>(dstp_gpu, SUM_gpu, srcp_gpu, denp_gpu, src_pitch_gpu, den_pitch_gpu, dst_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
- cudaDeviceSynchronize();
- // Copy the data from the GPU to the host
- cudaMemcpy(SUM, SUM_gpu, BSY * BSX * 15 * sizeof(int), cudaMemcpyDeviceToHost);
- cudaMemcpy(dstp, dstp_gpu, 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
- //Free the memory on the GPU
- cudaFree(srcp_gpu);
- cudaFree(denp_gpu);
- cudaFree(dstp_gpu);
- cudaFree(src_pitch_gpu);
- cudaFree(den_pitch_gpu);
- cudaFree(dst_pitch_gpu);
- cudaFree(SUM_gpu);
- void CDegrain_YUV422(PVideoFrame(&src)[15], PVideoFrame(&den)[15], PVideoFrame& dst, int nt, int thr, int radius, int blksize)
- {
- // Get the bit depth of the input video
- int bit_depth = dst->GetBitDepth();
- // Allocate memory on the GPU for the pointers to the data for each plane
- unsigned char* srcp_gpu[3][15], * denp_gpu[3][15], * dstp_gpu[3];
- cudaMalloc(&srcp_gpu, 15 * 3 * sizeof(unsigned char*));
- cudaMalloc(&denp_gpu, 15 * 3 * sizeof(unsigned char*));
- cudaMalloc(&dstp_gpu, 3 * sizeof(unsigned char*));
- // Copy the data from the host to the GPU
- cudaMemcpy(srcp_gpu, srcp, 15 * 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
- cudaMemcpy(denp_gpu, denp, 15 * 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
- cudaMemcpy(dstp_gpu, dstp, 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
- // Allocate memory on the GPU for the pitch for each plane of each frame
- int src_pitch_gpu[3][15], den_pitch_gpu[3][15], dst_pitch_gpu[3];
- cudaMalloc(&src_pitch_gpu, 15 * 3 * sizeof(int));
- cudaMalloc(&den_pitch_gpu, 15 * 3 * sizeof(int));
- cudaMalloc(&dst_pitch_gpu, 3 * sizeof(int));
- // Copy the data from the host to the GPU
- cudaMemcpy(src_pitch_gpu, src_pitch, 15 * 3 * sizeof(int), cudaMemcpyHostToDevice);
- cudaMemcpy(den_pitch_gpu, den_pitch, 15 * 3 * sizeof(int), cudaMemcpyHostToDevice);
- cudaMemcpy(dst_pitch_gpu, dst_pitch, 3 * sizeof(int), cudaMemcpyHostToDevice);
- // Allocate memory on the GPU for the SUM array
- int64_t BSY = 0;
- int64_t BSX = 0;
- BSY += min(1, height % blksize);
- BSY += height / blksize;
- BSX += min(1, row_size % blksize);
- BSX += row_size / blksize;
- int* SUM_gpu;
- cudaMalloc(&SUM_gpu, BSY * BSX * 15 * sizeof(int));
- cudaMemset(SUM_gpu, 0, BSY * BSX * 15 * sizeof(int));
- // Set the kernel launch configuration
- dim3 dimBlock(blksize, blksize);
- dim3 dimGrid(height / blksize, row_size / blksize);
- // Launch the kernel on the GPU
- CDegrain_YUV422_kernel<<<dimGrid, dimBlock>>>(srcp_gpu, denp_gpu, dstp_gpu, src_pitch_gpu, den_pitch_gpu, dst_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
- cudaDeviceSynchronize();
- // Copy the data from the GPU to the host
- cudaMemcpy(srcp, srcp_gpu, 15 * 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
- cudaMemcpy(denp, denp_gpu, 15 * 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
- cudaMemcpy(dstp, dstp_gpu, 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
- cudaMemcpy(src_pitch, src_pitch_gpu, 15 * 3 * sizeof(int), cudaMemcpyDeviceToHost);
- cudaMemcpy(den_pitch, den_pitch_gpu, 15 * 3 * sizeof(int), cudaMemcpyDeviceToHost);
- cudaMemcpy
- //DIFFERENCE
- int sum_pitch = BSX;
- int sum_pitch_uv = BSX >> 1;
- int64_t temp;
- CDegrain_Difference_kernel<<<dimGrid, dimBlock>>>(SUM_gpu, srcp_gpu, denp_gpu, src_pitch_gpu, den_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
- cudaDeviceSynchronize();
- //FILTER
- CDegrain_Filter_kernel<<<dimGrid, dimBlock>>>(dstp_gpu, SUM_gpu, srcp_gpu, denp_gpu, src_pitch_gpu, den_pitch_gpu, dst_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
- cudaDeviceSynchronize();
- // Copy the data from the GPU to the host
- cudaMemcpy(SUM, SUM_gpu, BSY * BSX * 15 * sizeof(int), cudaMemcpyDeviceToHost);
- cudaMemcpy(dstp, dstp_gpu, 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
- //Free the memory on the GPU
- cudaFree(srcp_gpu);
- cudaFree(denp_gpu);
- cudaFree(dstp_gpu);
- cudaFree(src_pitch_gpu);
- cudaFree(den_pitch_gpu);
- cudaFree(dst_pitch_gpu);
- cudaFree(SUM_gpu);
- void CDegrain_YUV444(PVideoFrame(&src)[15], PVideoFrame(&den)[15], PVideoFrame& dst, int nt, int thr, int radius, int blksize)
- {
- // Get the bit depth of the input video
- int bit_depth = dst->GetBitDepth();
- // Allocate memory on the GPU for the pointers to the data for each plane
- unsigned char* srcp_gpu[3][15], * denp_gpu[3][15], * dstp_gpu[3];
- cudaMalloc(&srcp_gpu, 15 * 3 * sizeof(unsigned char*));
- cudaMalloc(&denp_gpu, 15 * 3 * sizeof(unsigned char*));
- cudaMalloc(&dstp_gpu, 3 * sizeof(unsigned char*));
- // Copy the data from the host to the GPU
- cudaMemcpy(srcp_gpu, srcp, 15 * 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
- cudaMemcpy(denp_gpu, denp, 15 * 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
- cudaMemcpy(dstp_gpu, dstp, 3 * sizeof(unsigned char*), cudaMemcpyHostToDevice);
- // Allocate memory on the GPU for the pitch for each plane of each frame
- int src_pitch_gpu[3][15], den_pitch_gpu[3][15], dst_pitch_gpu[3];
- cudaMalloc(&src_pitch_gpu, 15 * 3 * sizeof(int));
- cudaMalloc(&den_pitch_gpu, 15 * 3 * sizeof(int));
- cudaMalloc(&dst_pitch_gpu, 3 * sizeof(int));
- // Copy the data from the host to the GPU
- cudaMemcpy(src_pitch_gpu, src_pitch, 15 * 3 * sizeof(int), cudaMemcpyHostToDevice);
- cudaMemcpy(den_pitch_gpu, den_pitch, 15 * 3 * sizeof(int), cudaMemcpyHostToDevice);
- cudaMemcpy(dst_pitch_gpu, dst_pitch, 3 * sizeof(int), cudaMemcpyHostToDevice);
- // Allocate memory on the GPU for the SUM array
- int64_t BSY = 0;
- int64_t BSX = 0;
- BSY += min(1, height % blksize);
- BSY += height / blksize;
- BSX += min(1, row_size % blksize);
- BSX += row_size / blksize;
- int* SUM_gpu;
- cudaMalloc(&SUM_gpu, BSY * BSX * 15 * sizeof(int));
- cudaMemset(SUM_gpu, 0, BSY * BSX * 15 * sizeof(int));
- // Set the kernel launch configuration
- dim3 dimBlock(blksize, blksize);
- dim3 dimGrid(height / blksize, row_size / blksize);
- // Launch the kernel on the GPU
- CDegrain_YUV444_kernel<<<dimGrid, dimBlock>>>(srcp_gpu, denp_gpu, dstp_gpu, src_pitch_gpu, den_pitch_gpu, dst_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
- cudaDeviceSynchronize();
- // Copy the data from the GPU to the host
- cudaMemcpy(srcp, srcp_gpu, 15 * 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
- cudaMemcpy(denp, denp_gpu, 15 * 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
- cudaMemcpy(dstp, dstp_gpu, 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
- cudaMemcpy(src_pitch, src_pitch_gpu, 15 * 3 * sizeof(int), cudaMemcpyDeviceToHost);
- cudaMemcpy(den_pitch, den_pitch_gpu, 15 * 3 * sizeof(int), cudaMemcpyDeviceToHost);
- cudaMemcpy
- //DIFFERENCE
- int sum_pitch = BSX;
- int sum_pitch_uv = BSX >> 1;
- int64_t temp;
- CDegrain_Difference_kernel<<<dimGrid, dimBlock>>>(SUM_gpu, srcp_gpu, denp_gpu, src_pitch_gpu, den_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
- cudaDeviceSynchronize();
- //FILTER
- CDegrain_Filter_kernel<<<dimGrid, dimBlock>>>(dstp_gpu, SUM_gpu, srcp_gpu, denp_gpu, src_pitch_gpu, den_pitch_gpu, dst_pitch_gpu, BSY, BSX, nt, thr, radius, blksize);
- cudaDeviceSynchronize();
- // Copy the data from the GPU to the host
- cudaMemcpy(SUM, SUM_gpu, BSY * BSX * 15 * sizeof(int), cudaMemcpyDeviceToHost);
- cudaMemcpy(dstp, dstp_gpu, 3 * sizeof(unsigned char*), cudaMemcpyDeviceToHost);
- //Free the memory on the GPU
- cudaFree(srcp_gpu);
- cudaFree(denp_gpu);
- cudaFree(dstp_gpu);
- cudaFree(src_pitch_gpu);
- cudaFree(den_pitch_gpu);
- cudaFree(dst_pitch_gpu);
- cudaFree(SUM_gpu);
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement