// CUDA image filtering - improved version


#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "helper_cuda.h"

#include <stdlib.h>
#include <stdio.h>
#include <memory>
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <ctime>

#include "bitmap_image.hpp"
#include "csv_parser/csv_parser.hpp"

// Define __CUDACC__ when the compiler has not already defined it.
// NOTE(review): presumably a workaround so that editor/IDE parsers that are not
// nvcc accept the CUDA-specific syntax used below — confirm it is still needed.
#ifndef __CUDACC__
    #define __CUDACC__
#endif

// Side length, in taps, of the square convolution filter.
const int Filter_Size = 7;
// Total number of filter coefficients (Filter_Size x Filter_Size = 49).
const int Filter_Total_Size = Filter_Size * Filter_Size;

// Filter coefficients in constant memory, stored row by row: the kernels read
// the tap for row offset j and column offset i at index (j + 3) * 7 + (i + 3).
__device__ __constant__ float g_filter[Filter_Total_Size];
// Post-processing terms applied to every filtered sample:
// result = g_factor[0] * sum + g_factor[1].
__device__ __constant__ float g_factor[2];

/**
 * Converts pixel coordinates into an offset inside a row-major image plane.
 *
 * @param x           column of the pixel
 * @param y           row of the pixel
 * @param imageWidth  number of columns in the image
 * @param imageHeight number of rows in the image
 * @return y * imageWidth + x when the pixel lies inside the image, -1 otherwise.
 *
 * The coordinates are unsigned, so a negative signed value passed by a caller
 * wraps to a very large number and is rejected by the same range test.
 */
__device__ int buffadr(unsigned int x, unsigned int y, unsigned int imageWidth, unsigned int imageHeight)
{
    const bool insideImage = (x < imageWidth) && (y < imageHeight);
    return insideImage ? (int)(y * imageWidth + x) : -1;
}

/**
 * Applies the Filter_Size x Filter_Size filter stored in g_filter to one colour plane.
 *
 * Work distribution (2D grid, 2D blocks):
 *  - block (bx, by) owns the image tile starting at (bx * blockPartWidth, by * blockPartHeight)
 *    of size blockPartWidth x blockPartHeight, clipped to the image;
 *  - thread (tx, ty) owns the sub-tile starting at (tx * threadPartWidth, ty * threadPartHeight)
 *    inside that tile, of size threadPartWidth x threadPartHeight, clipped to the block tile.
 * The caller must choose the part sizes so that the grid covers the whole image.
 *
 * For every owned pixel the weighted sum of its neighbourhood is computed; taps that
 * fall outside the image are skipped. The sum is then mapped through
 * g_factor[0] * sum + g_factor[1], clamped to [0, 255] and stored.
 *
 * @param inputColor       source plane, imageWidth * imageHeight bytes, row-major
 * @param outputColor      destination plane, same layout as the source
 * @param imageWidth       image width in pixels
 * @param imageHeight      image height in pixels
 * @param blockPartWidth   width of the tile handled by one block
 * @param blockPartHeight  height of the tile handled by one block
 * @param threadPartWidth  width of the sub-tile handled by one thread
 * @param threadPartHeight height of the sub-tile handled by one thread
 */
__global__ void filterKernel(const unsigned char *inputColor, unsigned char *outputColor, unsigned int imageWidth, unsigned int imageHeight,
    unsigned int blockPartWidth, unsigned int blockPartHeight, unsigned int threadPartWidth, unsigned int threadPartHeight)
{
    const int radius = Filter_Size / 2;

    // Tile owned by this block; the end coordinates are exclusive and clipped to the image.
    // Every thread derives them itself, so no shared memory or barrier is needed.
    const unsigned int blockXstart = blockIdx.x * blockPartWidth;
    const unsigned int blockYstart = blockIdx.y * blockPartHeight;
    unsigned int blockXend = blockXstart + blockPartWidth;
    unsigned int blockYend = blockYstart + blockPartHeight;
    if (blockXend > imageWidth)
    {
        blockXend = imageWidth;
    }
    if (blockYend > imageHeight)
    {
        blockYend = imageHeight;
    }

    // Sub-tile owned by this thread, clipped to the block tile. When the start lies
    // past the clipped end the loops below simply do not execute.
    const unsigned int threadXstart = (threadIdx.x * threadPartWidth) + blockXstart;
    const unsigned int threadYstart = (threadIdx.y * threadPartHeight) + blockYstart;
    unsigned int threadXend = threadXstart + threadPartWidth;
    unsigned int threadYend = threadYstart + threadPartHeight;
    if (threadXend > blockXend)
    {
        threadXend = blockXend;
    }
    if (threadYend > blockYend)
    {
        threadYend = blockYend;
    }

    for (unsigned int y = threadYstart; y < threadYend; y++)
    {
        for (unsigned int x = threadXstart; x < threadXend; x++)
        {
            const int target = buffadr(x, y, imageWidth, imageHeight);
            if (target < 0)
            {
                continue;
            }

            // Accumulate in float: an integer accumulator would truncate every
            // tap's contribution and lose precision for fractional weights.
            float sum = 0.0f;
            for (int j = -radius; j <= radius; j++)
            {
                for (int i = -radius; i <= radius; i++)
                {
                    // Coordinates left of / above the image wrap around as unsigned
                    // values and are rejected by buffadr.
                    const int source = buffadr(x + i, y + j, imageWidth, imageHeight);
                    if (source < 0)
                    {
                        continue;
                    }
                    sum += g_filter[(j + radius) * Filter_Size + (i + radius)] * (float)inputColor[source];
                }
            }

            // Scale and offset, clamp to the byte range, then truncate.
            float value = g_factor[0] * sum + g_factor[1];
            value = fminf(fmaxf(value, 0.0f), 255.0f);
            outputColor[target] = (unsigned char)value;
        }
    }
}

/**
 * Applies the Filter_Size x Filter_Size filter stored in g_filter to the red, green
 * and blue planes of an image in a single pass.
 *
 * Work distribution (2D grid, 2D blocks):
 *  - block (bx, by) owns the image tile starting at (bx * blockPartWidth, by * blockPartHeight)
 *    of size blockPartWidth x blockPartHeight, clipped to the image;
 *  - thread (tx, ty) owns the sub-tile starting at (tx * threadPartWidth, ty * threadPartHeight)
 *    inside that tile, of size threadPartWidth x threadPartHeight, clipped to the block tile.
 * The caller must choose the part sizes so that the grid covers the whole image.
 *
 * For every owned pixel the weighted sum of its neighbourhood is computed per channel;
 * taps with a zero coefficient or outside the image are skipped. Each sum is mapped
 * through g_factor[0] * sum + g_factor[1], clamped to [0, 255] and stored.
 *
 * @param inputR/inputG/inputB    source planes, imageWidth * imageHeight bytes each, row-major
 * @param outputR/outputG/outputB destination planes, same layout as the sources
 * @param imageWidth              image width in pixels
 * @param imageHeight             image height in pixels
 * @param blockPartWidth          width of the tile handled by one block
 * @param blockPartHeight         height of the tile handled by one block
 * @param threadPartWidth         width of the sub-tile handled by one thread
 * @param threadPartHeight        height of the sub-tile handled by one thread
 */
__global__ void newFilterKernel(const unsigned char *inputR, unsigned char *outputR, const unsigned char *inputG,
    unsigned char *outputG, const unsigned char *inputB, unsigned char *outputB, unsigned int imageWidth, unsigned int imageHeight,
    unsigned int blockPartWidth, unsigned int blockPartHeight, unsigned int threadPartWidth, unsigned int threadPartHeight)
{
    const int radius = Filter_Size / 2;

    // Tile owned by this block; the end coordinates are exclusive and clipped to the image.
    // Every thread derives them itself, so no shared memory or barrier is needed.
    const unsigned int blockXstart = blockIdx.x * blockPartWidth;
    const unsigned int blockYstart = blockIdx.y * blockPartHeight;
    unsigned int blockXend = blockXstart + blockPartWidth;
    unsigned int blockYend = blockYstart + blockPartHeight;
    if (blockXend > imageWidth)
    {
        blockXend = imageWidth;
    }
    if (blockYend > imageHeight)
    {
        blockYend = imageHeight;
    }

    // Sub-tile owned by this thread, clipped to the block tile. When the start lies
    // past the clipped end the loops below simply do not execute.
    const unsigned int threadXstart = (threadIdx.x * threadPartWidth) + blockXstart;
    const unsigned int threadYstart = (threadIdx.y * threadPartHeight) + blockYstart;
    unsigned int threadXend = threadXstart + threadPartWidth;
    unsigned int threadYend = threadYstart + threadPartHeight;
    if (threadXend > blockXend)
    {
        threadXend = blockXend;
    }
    if (threadYend > blockYend)
    {
        threadYend = blockYend;
    }

    for (unsigned int y = threadYstart; y < threadYend; y++)
    {
        for (unsigned int x = threadXstart; x < threadXend; x++)
        {
            const int target = buffadr(x, y, imageWidth, imageHeight);
            if (target < 0)
            {
                continue;
            }

            // Accumulate in float: integer accumulators would truncate every
            // tap's contribution and lose precision for fractional weights.
            float sumR = 0.0f;
            float sumG = 0.0f;
            float sumB = 0.0f;
            for (int j = -radius; j <= radius; j++)
            {
                for (int i = -radius; i <= radius; i++)
                {
                    const float weight = g_filter[(j + radius) * Filter_Size + (i + radius)];
                    if (0.0f == weight)
                    {
                        // A zero tap contributes nothing; skip the three loads.
                        continue;
                    }
                    // Coordinates left of / above the image wrap around as unsigned
                    // values and are rejected by buffadr.
                    const int source = buffadr(x + i, y + j, imageWidth, imageHeight);
                    if (source < 0)
                    {
                        continue;
                    }
                    sumR += weight * (float)inputR[source];
                    sumG += weight * (float)inputG[source];
                    sumB += weight * (float)inputB[source];
                }
            }

            // Scale and offset, clamp to the byte range, then truncate.
            const float scale = g_factor[0];
            const float offset = g_factor[1];
            outputR[target] = (unsigned char)fminf(fmaxf(scale * sumR + offset, 0.0f), 255.0f);
            outputG[target] = (unsigned char)fminf(fmaxf(scale * sumG + offset, 0.0f), 255.0f);
            outputB[target] = (unsigned char)fminf(fmaxf(scale * sumB + offset, 0.0f), 255.0f);
        }
    }
}

// Prints the expected command line to standard output and then runs the
// shell "pause" command so the text stays visible until a key is pressed.
void printHelp()
{
    std::cout << "HELP: \n"
              << "exe filter_file_path image_file_path\n"
              << "\n";
    system("pause");
}

/**
 * Selects a CUDA device, prints a summary of its properties and returns the
 * values the rest of the program needs (adapted from the deviceQuery sample).
 *
 * If the device cannot be selected or queried, the error is printed to stderr
 * and the process exits with EXIT_FAILURE.
 *
 * @param devId               ordinal of the device to select and query
 * @param driverVersion       [out] CUDA driver version as reported by the runtime
 * @param runtimeVersion      [out] CUDA runtime version
 * @param totalGlobalMem      [out] global memory size in bytes
 * @param multiProcessorCount [out] number of multiprocessors
 * @param cudaCores           [out] CUDA cores per multiprocessor
 * @param warpSize            [out] warp size in threads
 * @param totalConstantMem    [out] constant memory size in bytes
 * @param sharedMemPerBlock   [out] shared memory available per block in bytes
 * @param maxThreadsPerBlock  [out] maximum number of threads per block
 * @param maxThreadsPerMP     [out] maximum number of resident threads per multiprocessor
 */
void readDeviceAttributes(int devId, int &driverVersion, int &runtimeVersion, unsigned long long &totalGlobalMem,
    int &multiProcessorCount, int &cudaCores, int &warpSize, unsigned int &totalConstantMem,
    unsigned int &sharedMemPerBlock, int &maxThreadsPerBlock, int &maxThreadsPerMP)
{
    cudaDeviceProp deviceProp;

    // Run the queries in order and stop at the first failure so that no
    // uninitialised property is ever printed or returned.
    cudaError_t err = cudaSetDevice(devId);
    if (cudaSuccess == err)
    {
        err = cudaGetDeviceProperties(&deviceProp, devId);
    }
    if (cudaSuccess == err)
    {
        err = cudaDriverGetVersion(&driverVersion);
    }
    if (cudaSuccess == err)
    {
        err = cudaRuntimeGetVersion(&runtimeVersion);
    }
    if (cudaSuccess != err)
    {
        fprintf(stderr, "Failed to query CUDA device %d: %s\n", devId, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    const int coresPerMP = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);

    printf("\nDevice %d: \"%s\"\n", devId, deviceProp.name);
    // Versions are encoded as 1000 * major + 10 * minor.
    printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion / 1000, (driverVersion % 100) / 10, runtimeVersion / 1000, (runtimeVersion % 100) / 10);
    printf("  CUDA Capability Major/Minor version number:    %d.%d\n", deviceProp.major, deviceProp.minor);

    totalGlobalMem = (unsigned long long) deviceProp.totalGlobalMem;
    printf("  Total amount of global memory: (%llu bytes)\n", totalGlobalMem);
    printf("  (%2d) Multiprocessors, (%3d) CUDA Cores/MP:     %d CUDA Cores\n",
        deviceProp.multiProcessorCount,
        coresPerMP,
        coresPerMP * deviceProp.multiProcessorCount);

    // The properties are size_t; print them through unsigned long long so the
    // format specifier matches on every platform (size_t is not unsigned long everywhere).
    printf("  Total amount of constant memory:               %llu bytes\n", (unsigned long long) deviceProp.totalConstMem);
    totalConstantMem = (unsigned int) deviceProp.totalConstMem;
    printf("  Total amount of shared memory per block:       %llu bytes\n", (unsigned long long) deviceProp.sharedMemPerBlock);
    sharedMemPerBlock = (unsigned int) deviceProp.sharedMemPerBlock;
    printf("  Warp size:                                     %d\n", deviceProp.warpSize);

    maxThreadsPerBlock = deviceProp.maxThreadsPerBlock;
    maxThreadsPerMP = deviceProp.maxThreadsPerMultiProcessor;
    multiProcessorCount = deviceProp.multiProcessorCount;
    cudaCores = coresPerMP;
    warpSize = deviceProp.warpSize;
    printf("\n");
}

/**
 * Program entry point.
 *
 * Usage: exe filter_file_path image_file_path
 *
 *  - filter_file_path: CSV file with Filter_Size rows of Filter_Size coefficients
 *    followed by one row holding two values, the scale and the offset applied to
 *    every filtered sample;
 *  - image_file_path: bitmap image to filter.
 *
 * The image is split into R, G and B planes, filtered on the GPU with
 * newFilterKernel and written next to the input as
 * "<input without its last 4 characters>_output_<yymmdd_HHMMSS>.bmp".
 *
 * Failures before any buffer is allocated terminate through exit(EXIT_FAILURE);
 * later failures jump to the Error label, which releases all buffers. Every
 * variable that is live at that label is declared before the first goto (or
 * inside a nested scope) so that no jump bypasses an initialisation.
 *
 * @return EXIT_SUCCESS on success, a non-zero status otherwise.
 */
int main(int argc, char **argv)
{
    const clock_t start = clock();
    clock_t kernelstart = 0, kernelend = 0;
    bool kernelTimed = false; // true once both kernel timestamps have been taken

    int devId = 0;
    int warpSize, multiProcesorCount, cudaCores;
    int runtimeVersion, driverVersion;
    unsigned long long totalGlobalMem;
    unsigned int sharedMemPerBlock, totalConstantMem;
    int maxThreadsPerBlock, maxThreadsPerMP;

    int status = EXIT_SUCCESS;
    std::cout << argv[0] << " starting.. \n";
    std::cout << "CUDA Image filtering - 03:40\n";

    unsigned char *r = 0; //host red
    unsigned char *g = 0; //host green
    unsigned char *b = 0; //host blue
    unsigned char *g_r = 0; //graphics red
    unsigned char *g_g = 0; //graphics green
    unsigned char *g_b = 0; //graphics blue
    unsigned char *g_or = 0; //graphics output red
    unsigned char *g_og = 0; //graphics output green
    unsigned char *g_ob = 0; //graphics output blue
    unsigned char red, green, blue;
    cudaError_t cudaStatus;

    // Launch configuration, filled in by the divideImage section.
    int blocksX = 64;
    int blocksY = 64;
    int threadsX = 8;
    int threadsY = 8;
    unsigned int blockPartX = 0, blockPartY = 0;
    unsigned int threadPartX = 0, threadPartY = 0;
    float dev_score = 0.0f, img_ratio = 0.0f, div_ratio = 0.0f;

    /* read cuda devices */
#pragma region readDevices
    int deviceCount = 0;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
    if (error_id != cudaSuccess)
    {
        std::cout << "cudaGetDeviceCount returned " << (int)error_id << "->" << cudaGetErrorString(error_id) << "\n";
        std::cout << "Result = FAIL\n";
        exit(EXIT_FAILURE);
    }
    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0)
    {
        std::cout << "There are no available device(s) that support CUDA\n";
        std::cout << "Result = FAIL\n";
        exit(EXIT_FAILURE);
    }
    else
    {
        std::cout << "Detected " << deviceCount << " CUDA Capable device(s)\n";
        if (deviceCount > 1)
        {
            std::cout << "Application will run on device with id= " << devId <<"\n";
        }
    }
#pragma endregion readDevices

    /* read input parameters */
#pragma region readAndParseInput
    if (argc < 3)
    {
        std::cout << "You passed " << argc << "arguments\n";
        std::cout << "Not enough arguments passed to program\n";
        printHelp();
        std::cout << "Result = FAIL\n";
        exit(EXIT_FAILURE);
    }
    float filter_value[Filter_Total_Size];
    float factor[2];

    /* read filter file */
    const char field_terminator = ',';
    const char line_terminator  = '\n';
    const char enclosure_char   = '"';

    csv_parser file_parser;
    file_parser.set_skip_lines(0);
    file_parser.init(argv[1]);

    file_parser.set_enclosed_char(enclosure_char, ENCLOSURE_OPTIONAL);
    file_parser.set_field_term_char(field_terminator);
    file_parser.set_line_term_char(line_terminator);

    std::string num;
    // Rows 0..Filter_Size-1 hold the coefficients, row Filter_Size holds scale and offset.
    for (int i = 0; i <= Filter_Size; i++)
    {
        if(!file_parser.has_more_rows() && Filter_Size != i)
        {
            std::cout << "Malformed csv filter\n";
            std::cout << "line:" << i << std::endl;
            printHelp();
            std::cout << "Result = FAIL\n";
            exit(EXIT_FAILURE);
        }
        csv_row row = file_parser.get_row();

        if (i < Filter_Size && Filter_Size == row.size())
        {
            for (int j = 0; j < Filter_Size; j++)
            {
                num = std::string(row[j]);
                filter_value[i * Filter_Size + j] = (float)::strtod(num.c_str(), 0);
            }
        }
        else if(Filter_Size == i && 2 == row.size())
        {
            num = std::string(row[0]);
            factor[0] = (float)::strtod(num.c_str(), 0);
            num = std::string(row[1]);
            factor[1] = (float)::strtod(num.c_str(), 0);
        }
        else
        {
            std::cout << "Malformed csv filter\n";
            std::cout << "line: " << i << std::endl;
            std::cout << "row size: " << row.size() << std::endl;
            printHelp();
            std::cout << "Result = FAIL\n";
            exit(EXIT_FAILURE);
        }
    }

    std::cout << "Filter from file " << argv[1] << " read succesfully \n";

    /* read bmp from file */
    std::string file_name(argv[2]);
    bitmap_image image(file_name);
    if (!image)
    {
        std::cout << "Failed to open image " << file_name.c_str() << "\n";
        printHelp();
        std::cout << "Result = FAIL\n";
        exit(EXIT_FAILURE);
    }

    const unsigned int imgWidth = image.width();
    const unsigned int imgHeight = image.height();
    std::cout << "Loaded image " << imgWidth << "X" << imgHeight << " from file " << file_name.c_str() << " \n";
    if (0 == imgWidth || 0 == imgHeight)
    {
        // Nothing to filter; also avoids a division by zero in the aspect ratio below.
        std::cout << "Image " << file_name.c_str() << " has no pixels\n";
        std::cout << "Result = FAIL\n";
        exit(EXIT_FAILURE);
    }

    // Sizes are kept in size_t / unsigned long long so large images cannot overflow them.
    const size_t imgSize = (size_t)imgWidth * (size_t)imgHeight;   // pixels per plane
    const size_t planeBytes = sizeof(unsigned char) * imgSize;     // bytes per plane
    const size_t imgSizeBytes = 3 * planeBytes;                    // bytes for R, G and B

    /* constant memory holds the filter; global memory holds input and output planes */
    const unsigned long long neededConstMem = sizeof(float) * Filter_Total_Size;
    const unsigned long long neededGlobalMem = 2ULL * imgSizeBytes;
    std::cout << "Application will need total of " << neededGlobalMem <<  " bytes in global memory \n";
    std::cout << "Application will need total of " << neededConstMem << " bytes in constant memory \n";
#pragma endregion readAndParseInput

#pragma region readDeviceProperties
    readDeviceAttributes(devId, driverVersion, runtimeVersion, totalGlobalMem, multiProcesorCount, cudaCores,
        warpSize, totalConstantMem, sharedMemPerBlock, maxThreadsPerBlock, maxThreadsPerMP);

    if (neededConstMem > totalConstantMem || neededGlobalMem > totalGlobalMem)
    {
        std::cout << "Not enough memory. Try smaller image or get better device. \n";
        printHelp();
        std::cout << "Result = FAIL\n";
        exit(EXIT_FAILURE);
    }
#pragma endregion readDeviceProperties

#pragma region allocateAndCopyMem
    r = (unsigned char*)malloc(planeBytes);
    g = (unsigned char*)malloc(planeBytes);
    b = (unsigned char*)malloc(planeBytes);
    if (0 == r || 0 == g || 0 == b)
    {
        std::cout << "malloc failed!\n";
        status = EXIT_FAILURE;
        goto Error;
    }

    // Split the interleaved image into three row-major planes.
    for (std::size_t i = 0; i < imgHeight; i++)
    {
        for (std::size_t j = 0; j < imgWidth; j++)
        {
            image.get_pixel((unsigned int)j, (unsigned int)i, red, green, blue);
            r[i * imgWidth + j] = red;
            g[i * imgWidth + j] = green;
            b[i * imgWidth + j] = blue;
        }
    }

    std::cout << "Allocated mem for rgb tables. " << imgSizeBytes << " bytes in total \n";

    cudaStatus = cudaSetDevice(devId);
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaSetDevice failed! Do you have a CUDA - capable GPU installed ?\n";
        status = EXIT_FAILURE;
        goto Error;
    }

    // Allocate GPU buffers for six vectors (3 input, 3 output).
    {
        unsigned char **deviceBuffers[6] = { &g_r, &g_g, &g_b, &g_or, &g_og, &g_ob };
        for (int p = 0; p < 6; p++)
        {
            cudaStatus = cudaMalloc((void**)deviceBuffers[p], planeBytes);
            if (cudaStatus != cudaSuccess) {
                std::cout << "cudaMalloc failed!\n";
                status = EXIT_FAILURE;
                goto Error;
            }
        }
    }

    std::cout << "Space for input and output images mallocked.\n";

    // Copy input vectors from host memory to GPU buffers.
    {
        unsigned char *const hostInputs[3] = { r, g, b };
        unsigned char *const deviceInputs[3] = { g_r, g_g, g_b };
        for (int p = 0; p < 3; p++)
        {
            cudaStatus = cudaMemcpy(deviceInputs[p], hostInputs[p], planeBytes, cudaMemcpyHostToDevice);
            if (cudaStatus != cudaSuccess) {
                std::cout << "cudaMemcpy failed!\n";
                status = EXIT_FAILURE;
                goto Error;
            }
        }
    }
    std::cout << "Input data coppied into global mem.\n";

    cudaStatus = cudaMemcpyToSymbol(g_filter, filter_value, sizeof(float)* Filter_Total_Size, 0, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMemcpyToSymbol failed!\n";
        status = EXIT_FAILURE;
        goto Error;
    }
    cudaStatus = cudaMemcpyToSymbol(g_factor, factor, sizeof(float)*2, 0, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMemcpyToSymbol failed!\n";
        status = EXIT_FAILURE;
        goto Error;
    }
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaDeviceSynchronize failed: " << cudaGetErrorString(cudaStatus) << "\n";
        status = EXIT_FAILURE;
        goto Error;
    }

    std::cout << "Filter data coppied into constant mem.\n";
#pragma endregion allocateAndCopyMem

#pragma region divideImage
    dev_score = (float)cudaCores / (float)warpSize;
    img_ratio = (float)imgWidth/(float)imgHeight;

    std::cout << "Image ratio: " << img_ratio << " cores/warp: " << dev_score << std::endl;

    div_ratio = (float)threadsX / (float)threadsY;
    // Tile sizes are rounded up, so they are at least 1 for a non-empty image.
    blockPartX = (imgWidth / blocksX) + (imgWidth%blocksX != 0);
    blockPartY = (imgHeight / blocksY) + (imgHeight%blocksY != 0);

    threadPartX = (blockPartX / threadsX) + (blockPartX%threadsX != 0);
    threadPartY = (blockPartY / threadsY) + (blockPartY%threadsY != 0);

    // Grow the block until it has about as many threads as the multiprocessor has
    // cores, steering its shape towards the image's aspect ratio.
    while(threadsX * threadsY < dev_score * warpSize && (threadsX * threadsY) < maxThreadsPerBlock)
    {
        int nextThreadsX = threadsX;
        int nextThreadsY = threadsY;
        if(img_ratio > div_ratio && 1 != threadPartX)
        {
            nextThreadsX += 2;
        }
        else
        {
            nextThreadsY += 2;
        }
        if (nextThreadsX * nextThreadsY > maxThreadsPerBlock)
        {
            // One more step would make the launch configuration invalid.
            break;
        }
        threadsX = nextThreadsX;
        threadsY = nextThreadsY;
        div_ratio = (float)threadsX / (float)threadsY;
        threadPartX = (blockPartX / threadsX) + (blockPartX%threadsX != 0);
        threadPartY = (blockPartY / threadsY) + (blockPartY%threadsY != 0);
        if(1 == threadPartX && 1 == threadPartY)
        { break; }
    }

    std::cout << blocksX << "X" << blocksY << " blocks, each block procesing " << blockPartX << "X" << blockPartY << "pixels.\n";
    std::cout << threadsX << "X" << threadsY << " threads per block, each procesing " << threadPartX << "X" << threadPartY << "pixels.\n";
#pragma endregion divideImage

#pragma region runKernels
    /* Note: the first launch of a kernel includes some driver overhead, so this
       single timed launch slightly overstates the steady-state kernel time.
       filterKernel can be launched once per plane instead of newFilterKernel. */
    kernelstart = clock();
    {
        // dim3 has a constructor, so it lives in its own scope to keep it out of
        // the way of the jumps to Error.
        const dim3 blocksPerGrid(blocksX, blocksY);
        const dim3 threadsPerBlock(threadsX, threadsY);
        newFilterKernel <<<blocksPerGrid, threadsPerBlock >>>(g_r, g_or, g_g, g_og, g_b, g_ob, imgWidth, imgHeight, blockPartX, blockPartY, threadPartX, threadPartY);
    }
    // Launch-configuration errors are reported by cudaGetLastError, execution
    // errors by the following synchronisation.
    cudaStatus = cudaGetLastError();
    if (cudaStatus == cudaSuccess) {
        cudaStatus = cudaDeviceSynchronize();
    }
    if (cudaStatus != cudaSuccess) {
        std::cout << "Kernel execution failed: " << cudaGetErrorString(cudaStatus) << "\n";
        status = EXIT_FAILURE;
        goto Error;
    }
    std::cout << "Red.\n";  std::cout << "Green.\n"; std::cout << "Blue.\n";

    kernelend = clock();
    kernelTimed = true;
#pragma endregion runKernels

#pragma region copyResults
    // Copy output vectors from GPU buffers to host memory.
    {
        unsigned char *const hostOutputs[3] = { r, g, b };
        unsigned char *const deviceOutputs[3] = { g_or, g_og, g_ob };
        for (int p = 0; p < 3; p++)
        {
            cudaStatus = cudaMemcpy(hostOutputs[p], deviceOutputs[p], planeBytes, cudaMemcpyDeviceToHost);
            if (cudaStatus != cudaSuccess) {
                std::cout << "cudaMemcpy failed!\n";
                status = EXIT_FAILURE;
                goto Error;
            }
        }
    }
#pragma endregion copyResults

    /* assemble and save output image */
#pragma region saveOutput
    {
        for (std::size_t i = 0; i < imgHeight; i++)
        {
            for (std::size_t j = 0; j < imgWidth; j++)
            {
                red = r[i * imgWidth + j];
                green = g[i * imgWidth + j];
                blue = b[i * imgWidth + j];
                image.set_pixel((unsigned int)j, (unsigned int)i, red, green, blue);
            }
        }

        // Timestamp "yymmdd_HHMMSS": 13 characters plus the terminator.
        char stamp[14] = "";
        time_t timer = 0;
        time(&timer);
        struct tm * timeinfo = localtime(&timer);
        if (0 != timeinfo)
        {
            strftime(stamp, sizeof(stamp), "%y%m%d_%H%M%S", timeinfo);
        }

        // Drop the last four characters (e.g. ".bmp") when the name is long enough.
        const std::string::size_type stemLength = (file_name.size() >= 4) ? file_name.size() - 4 : file_name.size();
        const std::string outputName = file_name.substr(0, stemLength) + "_output_" + std::string(stamp) + ".bmp";
        std::cout << "Saving result image to " << outputName << " file.\n";
        image.save_image(outputName);
    }
#pragma endregion saveOutput

/* free every mallocked space and exit */
#pragma region freeAndExit
Error:
    cudaFree(g_r);
    cudaFree(g_g);
    cudaFree(g_b);
    cudaFree(g_or);
    cudaFree(g_og);
    cudaFree(g_ob);
    free(r);
    free(g);
    free(b);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    // It runs after the buffers are released because it destroys all allocations.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        if (EXIT_SUCCESS == status)
        {
            status = 1;
        }
    }

    const clock_t end = clock();
    double total_diff = double(end - start) / CLOCKS_PER_SEC;
    printf ("Total elapsed time is %.6lf seconds.\n", total_diff );
    if(kernelTimed)
    {
        double kernel_diff = double(kernelend - kernelstart) / CLOCKS_PER_SEC;
        printf ("Kernel functions elapsed time is %.6lf seconds.\n", kernel_diff );
    }
    system("pause");
    return status;
#pragma endregion freeAndExit
}