Untitled

pixels = fileSizeBytes / 2; // presumably one 16-bit sample per pixel — TODO confirm the file has no header
        heightPx = 3040;
        widthPx = 4096;

        // The launch below covers widthPx x heightPx pixels and the kernel
        // writes three samples per pixel, so every buffer sized from `pixels`
        // must hold at least one full frame. Without this check a short file
        // would lead to out-of-bounds device reads and writes.
        CV_Assert((size_t)pixels >= (size_t)widthPx * (size_t)heightPx);

        // Single device allocation of 4 * pixels samples:
        //   [0, pixels)          raw single-channel input  (d_inp)
        //   [pixels, 4 * pixels) 3-channel output          (d_img)
        // CV_Assert is used for the status checks because it stays active in
        // release builds and reports the failing expression.
        // NOTE(review): no cudaFree(d_inp) is visible in this fragment —
        // confirm the buffer is released (or reused) by the surrounding code.
        CV_Assert(cudaMalloc(&d_inp, pixels*4*sizeof(ushort)) == cudaSuccess);
        d_img = d_inp + pixels;

        CV_Assert(cudaMemcpy(d_inp, h_img, pixels*sizeof(ushort), cudaMemcpyHostToDevice) == cudaSuccess);

        // `block` holds the grid dimensions (number of blocks) and `threads`
        // the per-block dimensions. The truncating division covers the whole
        // image only because 4096 and 3040 are both multiples of 16.
        dim3 block(widthPx/16, heightPx/16);
        dim3 threads(16,16);
        bayerRG <<<block,threads>>>(d_img, d_inp, widthPx,heightPx);
        // A kernel launch returns no status itself; configuration errors are
        // only visible through cudaGetLastError().
        CV_Assert(cudaGetLastError() == cudaSuccess);

        // Blocking copy on the default stream: it waits for the kernel, and any
        // fault raised while the kernel ran is reported through its status.
        CV_Assert(cudaMemcpy(gpu_output, d_img, pixels*3*sizeof(ushort), cudaMemcpyDeviceToHost) == cudaSuccess);

        // Wraps gpu_output without copying, so gpu_output must outlive the Mat.
        // Dimensions come from the same variables passed to the kernel so the
        // two cannot drift apart.
        cv::Mat outputMat_16UC3CUDA = cv::Mat( static_cast<int>(heightPx), static_cast<int>(widthPx), CV_16UC3, gpu_output );

/**
 * Expands a single-channel 16-bit image into a 3-channel interleaved image.
 *
 * Each thread handles one pixel (x, y) and copies the unmodified input sample
 * into all three channels of the corresponding output pixel. No colour
 * interpolation is performed here.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per pixel. The grid may
 * overshoot the image; threads outside width x height do nothing.
 *
 * @param d_img  Device output buffer, at least width * height * 3 samples,
 *               row-major with 3 interleaved channels per pixel.
 * @param d_inp  Device input buffer, at least width * height samples,
 *               row-major, 1 channel. Must not overlap d_img.
 * @param width  Image width in pixels.
 * @param height Image height in pixels.
 */
__global__ void bayerRG(ushort *d_img, ushort *d_inp, uint width, uint height)
{
    const uint x = blockIdx.x * blockDim.x + threadIdx.x;
    const uint y = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads past the right or bottom edge must not touch memory: without
    // this guard a grid that overshoots the image reads and writes beyond
    // the end of both buffers.
    if (x >= width || y >= height)
        return;

    // Index in size_t so y * width * 3 cannot wrap around 32 bits on very
    // large images.
    const size_t inp_i = (size_t)y * width + x; // 1 channel in input
    const size_t img_i = inp_i * 3;             // 3 channels in image

    // Read the sample once; the pointers are not declared restrict, so the
    // compiler could otherwise reload it after every store.
    const ushort sample = d_inp[inp_i];
    d_img[img_i]     = sample;
    d_img[img_i + 1] = sample;
    d_img[img_i + 2] = sample;
}