// Untitled


#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <device_functions.h>
#include <cuda_runtime_api.h>

#include <stdio.h>
#include <cstdlib>
#include <iostream>
#include <ostream>
#include <iomanip>
#include <windows.h>
#include <conio.h>
#include <cstdint>
#include <chrono>

#include <npp.h>

#include "../common/inc/helper_cuda.h"
#include "../common/inc/helper_image.h"

using namespace std;

// Forward declarations for the host helpers and device kernels defined below.

// Writes the buffer to stdout as raw characters, N rows of M bytes each.
void showImg(unsigned char* matrix, int N, int M);
// Returns a new[]-allocated copy of img with a one-pixel border around it
// (the interior is stored with a row stride of w + 2).
unsigned char* rebuildImg(unsigned char* img, int h, int w);
// Host reference: filters the padded image into newImg (h*w bytes), prints the
// elapsed time, and returns newImg.
unsigned char* CPU(unsigned char* img, unsigned char* newImg, int h, int w);
// Computes one output pixel: the filter-weighted sum of the 3x3 window whose
// top-left padded coordinate is (i, j), divided by divisionCoef.
unsigned char filterCPU(unsigned char* img, int i, int j, int w, int filter[3][3], int divisionCoef);
// Host wrapper: copies the padded image to the device, launches GPUfunc, prints
// the measured time, and returns a new[]-allocated h*w host buffer.
unsigned char* GPU(unsigned char* img, int h, int w);
// Device kernel: each thread computes one filtered output pixel from the padded input.
__global__ void GPUfunc(unsigned char* img_origin, unsigned char* img_new, int w, int h, int dif);
// Prints how many of the first h*w bytes differ between the two buffers.
void compose(unsigned char* matrix1, unsigned char* matrix2,int h,int w);
// Allocates a 30000*10000-byte buffer and fills it with rand()-derived values.
unsigned char* creat300mpx();
// Variant of GPUfunc that stages each result in a __shared__ array before storing it.
__global__ void GPUfuncShared(unsigned char* img_origin, unsigned char* img_new, int w, int h, int dif);
// Host wrapper that launches GPUfuncShared; otherwise follows the same steps as GPU().
unsigned char* GPUShared(unsigned char* img, int h, int w);


/**
 * Applies the fixed 3x3 filter {1,-2,1; -2,5,-2; 1,-2,1} to a padded
 * single-channel image, one output pixel per thread.
 *
 * Launch layout: a 2D grid where the x dimension walks image ROWS and the
 * y dimension walks image COLUMNS (this matches how the host wrapper sizes
 * the grid: grid.x from h, grid.y from w). The grid may overshoot the image;
 * surplus threads exit without touching memory.
 *
 * @param img_origin device buffer holding the padded input, (h + 2) rows of
 *                   (w + 2) bytes each
 * @param img_new    device buffer receiving the h * w output bytes
 * @param w          width of the unpadded image in pixels
 * @param h          height of the unpadded image in pixels
 * @param dif        divisor applied to the weighted sum (must be non-zero)
 */
__global__ void GPUfunc(unsigned char* img_origin, unsigned char* img_new, int w, int h, int dif) {
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    const int col = blockIdx.y * blockDim.y + threadIdx.y;

    // Grid-tail guard: without it, threads past the image edge would read and
    // write outside both buffers whenever h or w is not a multiple of the
    // block dimensions.
    if (row >= h || col >= w)
        return;

    const int filter[3][3] = {
        { 1,-2,1 },{ -2,5,-2 },{ 1,-2,1 }
    };

    const int stride = w + 2;   // row pitch of the padded input
    int acc = 0;

    // (row, col) in the output corresponds to the 3x3 window whose top-left
    // corner sits at (row, col) in the padded input.
    for (int dy = 0; dy < 3; dy++) {
        for (int dx = 0; dx < 3; dx++) {
            acc += img_origin[(row + dy) * stride + (col + dx)] * filter[dy][dx];
        }
    }

    // The narrowing conversion wraps modulo 256, exactly like the CPU path.
    img_new[row * w + col] = (unsigned char)(acc / dif);
}

/**
 * Same 3x3 filter as GPUfunc, but each thread parks its result in a
 * block-local __shared__ array before storing it to global memory.
 *
 * Each thread only ever reads back the shared cell it wrote itself, so no
 * __syncthreads() barrier is required.
 *
 * Launch layout: a 2D grid where the x dimension walks image ROWS and the
 * y dimension walks image COLUMNS. blockDim.x and blockDim.y must both be
 * <= 32, the extent of the shared staging array. The grid may overshoot the
 * image; surplus threads exit without touching memory.
 *
 * @param img_origin device buffer holding the padded input, (h + 2) rows of
 *                   (w + 2) bytes each
 * @param img_new    device buffer receiving the h * w output bytes
 * @param w          width of the unpadded image in pixels
 * @param h          height of the unpadded image in pixels
 * @param dif        divisor applied to the weighted sum (must be non-zero)
 */
__global__ void GPUfuncShared(unsigned char* img_origin, unsigned char* img_new, int w, int h, int dif) {
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    const int col = blockIdx.y * blockDim.y + threadIdx.y;

    __shared__ unsigned char block[32][32];

    // Grid-tail guard: without it, threads past the image edge would read and
    // write outside both buffers whenever h or w is not a multiple of the
    // block dimensions. Returning early is safe because the kernel contains
    // no block-wide barrier.
    if (row >= h || col >= w)
        return;

    const int filter[3][3] = {
        { 1,-2,1 },{ -2,5,-2 },{ 1,-2,1 }
    };

    const int stride = w + 2;   // row pitch of the padded input
    int acc = 0;

    for (int dy = 0; dy < 3; dy++) {
        for (int dx = 0; dx < 3; dx++) {
            acc += img_origin[(row + dy) * stride + (col + dx)] * filter[dy][dx];
        }
    }

    // The narrowing conversion wraps modulo 256, exactly like the CPU path.
    block[threadIdx.y][threadIdx.x] = (unsigned char)(acc / dif);

    img_new[row * w + col] = block[threadIdx.y][threadIdx.x];
}

/**
 * Entry point: loads a PGM image, filters it on the CPU and with both GPU
 * kernels, prints how many pixels differ between the results, and writes
 * the three outputs next to the input file.
 *
 * @return 0 on success, EXIT_FAILURE if the input cannot be loaded or a host
 *         buffer cannot be allocated
 */
int main() {
    unsigned char* img = NULL;
    unsigned int w = 0;
    unsigned int h = 0;
    unsigned int chanels = 1;

    // A missing or unreadable file used to leave img == NULL and crash later.
    if (!__loadPPM("C:\\Users\\messi\\Documents\\Visual Studio 2015\\Projects\\AVP5neW\\AVP5neW\\smile.pgm", &img, &w, &h, &chanels)
        || img == NULL) {
        fprintf(stderr, "Failed to load input image\n");
        system("pause");
        return EXIT_FAILURE;
    }
    //img = creat300mpx();

    // Padded copy of the input that all three filter paths read from.
    unsigned char* rebuild = rebuildImg(img, h, w);

    // Only the CPU path needs a caller-provided output buffer; GPU() and
    // GPUShared() allocate and return their own, so pre-allocating for them
    // (as before) just leaked memory.
    unsigned char* cpuImg = (unsigned char*)calloc((size_t)w * h, sizeof(unsigned char));
    if (cpuImg == NULL) {
        fprintf(stderr, "Failed to allocate CPU output buffer\n");
        delete[] rebuild;
        free(img);
        system("pause");
        return EXIT_FAILURE;
    }

    showImg(img, h, w);

    cpuImg = CPU(rebuild, cpuImg, h, w);
    unsigned char* gpuImg = GPU(rebuild, h, w);
    unsigned char* gpuImgShared = GPUShared(rebuild, h, w);

    cout << h << "x" << w << endl;

    compose(cpuImg, gpuImg, h, w);
    compose(gpuImgShared, gpuImg, h, w);

    __savePPM("C:\\Users\\messi\\Documents\\Visual Studio 2015\\Projects\\AVP5neW\\AVP5neW\\cpu.pgm", cpuImg, w, h, chanels);
    __savePPM("C:\\Users\\messi\\Documents\\Visual Studio 2015\\Projects\\AVP5neW\\AVP5neW\\gpu.pgm", gpuImg, w, h, chanels);
    __savePPM("C:\\Users\\messi\\Documents\\Visual Studio 2015\\Projects\\AVP5neW\\AVP5neW\\gpuShared.pgm", gpuImgShared, w, h, chanels);

    // Release each buffer with the deallocator matching its allocator.
    free(cpuImg);            // calloc above
    delete[] gpuImg;         // new[] inside GPU()
    delete[] gpuImgShared;   // new[] inside GPUShared()
    delete[] rebuild;        // new[] inside rebuildImg()
    // NOTE(review): assumes __loadPPM allocates *data with malloc when it is
    // passed in as NULL (as the CUDA samples' helper_image.h does) - confirm.
    free(img);

    system("pause");
    return 0;
}

// Counts the positions among the first h*w bytes where the two buffers
// disagree and prints that count on its own line.
void compose(unsigned char* matrix1, unsigned char* matrix2,int h,int w) {
    const int total = h * w;
    int mismatches = 0;

    for (int k = 0; k < total; ++k) {
        mismatches += (matrix1[k] != matrix2[k]) ? 1 : 0;
    }

    std::cout << mismatches << std::endl;
}

// Dumps the buffer to stdout as raw characters: a line break precedes each
// of the N rows of M bytes, and a final line break closes the output.
void showImg(unsigned char*matrix, int N, int M) {
    const unsigned char* cursor = matrix;

    for (int row = 0; row < N; ++row) {
        std::cout << std::endl;
        for (int col = 0; col < M; ++col, ++cursor) {
            std::cout << *cursor;
        }
    }

    std::cout << std::endl;
}

/**
 * Builds a copy of a single-channel image surrounded by a one-pixel border
 * in which every border pixel repeats its nearest image pixel (edge
 * replication), so a 3x3 window can be evaluated at every original pixel.
 *
 * The previous implementation computed the bottom border row and bottom
 * corners with indices that were only correct when h == w; for non-square
 * images it left padding uninitialized and could read past the end of img.
 *
 * @param img source image, h rows of w bytes, row-major
 * @param h   image height in pixels
 * @param w   image width in pixels
 * @return    new[]-allocated buffer of (h + 2) * (w + 2) bytes, row-major with
 *            a row stride of w + 2; the caller releases it with delete[].
 *            If img is NULL or either dimension is not positive, the buffer
 *            is zero-filled (non-positive dimensions are treated as 0).
 */
unsigned char* rebuildImg(unsigned char* img, int h, int w) {
    const int rows = (h > 0) ? h : 0;
    const int cols = (w > 0) ? w : 0;
    const size_t paddedRows = (size_t)rows + 2;
    const size_t paddedCols = (size_t)cols + 2;

    if (img == NULL || rows == 0 || cols == 0) {
        // Nothing to replicate: hand back a defined (all-zero) border.
        return new unsigned char[paddedRows * paddedCols]();
    }

    unsigned char* newImg = new unsigned char[paddedRows * paddedCols];

    for (size_t pr = 0; pr < paddedRows; pr++) {
        // Clamp the padded row onto the nearest source row.
        size_t srcRow = (pr == 0) ? 0 : pr - 1;
        if (srcRow > (size_t)rows - 1)
            srcRow = (size_t)rows - 1;

        const unsigned char* src = img + srcRow * (size_t)cols;
        unsigned char* dst = newImg + pr * paddedCols;

        dst[0] = src[0];                    // left border
        for (int j = 0; j < cols; j++) {
            dst[j + 1] = src[j];            // interior
        }
        dst[cols + 1] = src[cols - 1];      // right border
    }

    return newImg;
}

// Host reference implementation: runs filterCPU over every pixel of the
// padded image `img`, stores the h*w results row by row in `newImg`, reports
// the elapsed wall-clock time on stdout, and hands `newImg` back.
unsigned char* CPU(unsigned char* img,unsigned char* newImg, int h, int w) {
    int kernel[3][3] = {
        { 1,-2,1 },{ -2,5,-2 },{ 1,-2,1 }
    };
    int divisor = 1;

    const std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();

    unsigned char* out = newImg;
    for (int row = 0; row < h; ++row) {
        for (int col = 0; col < w; ++col) {
            // `out` advances in row-major order, i.e. it points at newImg[row*w + col].
            *out++ = filterCPU(img, row, col, w, kernel, divisor);
        }
    }

    const std::chrono::steady_clock::time_point finish = std::chrono::steady_clock::now();

    const std::chrono::duration<double, std::milli> elapsedMs(finish - begin);
    std::cout << "CPU time:" << elapsedMs.count() << " ms" << std::endl;

    return newImg;
}

// Evaluates the 3x3 filter for one output pixel. The window's top-left corner
// is at row i, column j of the padded image (row stride w + 2). The weighted
// sum is divided by divisionCoef and narrowed to unsigned char.
unsigned char filterCPU(unsigned char* img, int i, int j, int w, int filter[3][3], int divisionCoef) {
    const int stride = w + 2;
    int weighted = 0;

    for (int dy = 0; dy < 3; ++dy) {
        const unsigned char* line = img + (i + dy) * stride + j;
        for (int dx = 0; dx < 3; ++dx) {
            weighted += line[dx] * filter[dy][dx];
        }
    }

    unsigned char pixel = weighted / divisionCoef;
    return pixel;
}

/**
 * Runs the GPUfuncShared kernel over a padded image and returns the result.
 *
 * Steps: upload the padded input, launch the kernel with 8x4 thread blocks
 * (grid.x covers the rows, grid.y the columns), time the launch with CUDA
 * events, print the elapsed time, and download the h*w output.
 *
 * Any CUDA failure prints a message to stderr and terminates the process
 * with EXIT_FAILURE.
 *
 * @param img host pointer to the padded image, (h + 2) * (w + 2) bytes
 * @param h   height of the unpadded image in pixels
 * @param w   width of the unpadded image in pixels
 * @return    new[]-allocated host buffer of h * w filtered bytes; the caller
 *            releases it with delete[]. For non-positive h or w an empty
 *            buffer is returned and no kernel is launched.
 */
unsigned char* GPUShared(unsigned char* img, int h, int w) {
    int dif = 1;

    if (h <= 0 || w <= 0) {
        // A zero-sized grid is an invalid launch configuration; nothing to do.
        return new unsigned char[0];
    }

    const size_t paddedBytes = (size_t)(h + 2) * (size_t)(w + 2) * sizeof(unsigned char);
    const size_t outBytes = (size_t)h * (size_t)w * sizeof(unsigned char);

    cudaError_t cudaStatus;

    cudaEvent_t startTime;
    cudaEvent_t stopTime;

    if (cudaEventCreate(&startTime) != cudaSuccess || cudaEventCreate(&stopTime) != cudaSuccess) {
        fprintf(stderr, "cudaEventCreate failed!");
        exit(EXIT_FAILURE);
    }

    float resultTime = 0.0f;

    unsigned char* result = new unsigned char[(size_t)w * (size_t)h];
    unsigned char* img_origin;

    cudaStatus = cudaMalloc((void**)&img_origin, paddedBytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        exit(EXIT_FAILURE);
    }

    cudaStatus = cudaMemcpy(img_origin, img, paddedBytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        exit(EXIT_FAILURE);
    }

    unsigned char* img_new;
    cudaStatus = cudaMalloc((void**)&img_new, outBytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        exit(EXIT_FAILURE);
    }

    dim3 bl(8, 4);
    dim3 grid;

    // Ceil-divide so a partial tile at the image edge still gets a block.
    grid.x = (h + bl.x - 1) / bl.x;
    grid.y = (w + bl.y - 1) / bl.y;

    cudaEventRecord(startTime);
    GPUfuncShared << <grid, bl >> > (img_origin, img_new, w, h, dif);

    // Launch-configuration errors are only reported through cudaGetLastError.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "GPUfuncShared launch failed: %s\n", cudaGetErrorString(cudaStatus));
        exit(EXIT_FAILURE);
    }

    // Record the stop event right behind the kernel so no host-side
    // synchronization is included in the measured interval.
    cudaEventRecord(stopTime);

    // Waiting on the stop event also surfaces faults raised while the kernel ran.
    cudaStatus = cudaEventSynchronize(stopTime);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "GPUfuncShared execution failed: %s\n", cudaGetErrorString(cudaStatus));
        exit(EXIT_FAILURE);
    }

    cudaEventElapsedTime(&resultTime, startTime, stopTime);

    cout << "GPU Shared time:" << resultTime << " ms" << endl;

    // Blocking copy: no further explicit synchronization is needed.
    cudaStatus = cudaMemcpy(result, img_new, outBytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        exit(EXIT_FAILURE);
    }

    cudaFree(img_new);
    cudaFree(img_origin);

    // Events are runtime resources too; they used to be leaked on every call.
    cudaEventDestroy(startTime);
    cudaEventDestroy(stopTime);

    return result;
}

/**
 * Runs the GPUfunc kernel over a padded image and returns the result.
 *
 * Steps: upload the padded input, launch the kernel with 8x4 thread blocks
 * (grid.x covers the rows, grid.y the columns), time the launch with CUDA
 * events, print the elapsed time, and download the h*w output.
 *
 * Any CUDA failure prints a message to stderr and terminates the process
 * with EXIT_FAILURE.
 *
 * @param img host pointer to the padded image, (h + 2) * (w + 2) bytes
 * @param h   height of the unpadded image in pixels
 * @param w   width of the unpadded image in pixels
 * @return    new[]-allocated host buffer of h * w filtered bytes; the caller
 *            releases it with delete[]. For non-positive h or w an empty
 *            buffer is returned and no kernel is launched.
 */
unsigned char* GPU(unsigned char* img, int h, int w) {
    int dif = 1;

    if (h <= 0 || w <= 0) {
        // A zero-sized grid is an invalid launch configuration; nothing to do.
        return new unsigned char[0];
    }

    const size_t paddedBytes = (size_t)(h + 2) * (size_t)(w + 2) * sizeof(unsigned char);
    const size_t outBytes = (size_t)h * (size_t)w * sizeof(unsigned char);

    cudaError_t cudaStatus;

    cudaEvent_t startTime;
    cudaEvent_t stopTime;

    if (cudaEventCreate(&startTime) != cudaSuccess || cudaEventCreate(&stopTime) != cudaSuccess) {
        fprintf(stderr, "cudaEventCreate failed!");
        exit(EXIT_FAILURE);
    }

    float resultTime = 0.0f;

    unsigned char* result = new unsigned char[(size_t)w * (size_t)h];
    unsigned char* img_origin;

    cudaStatus = cudaMalloc((void**)&img_origin, paddedBytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        exit(EXIT_FAILURE);
    }

    cudaStatus = cudaMemcpy(img_origin, img, paddedBytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        exit(EXIT_FAILURE);
    }

    unsigned char* img_new;
    cudaStatus = cudaMalloc((void**)&img_new, outBytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        exit(EXIT_FAILURE);
    }

    dim3 bl(8, 4);
    dim3 grid;

    // Ceil-divide so a partial tile at the image edge still gets a block.
    grid.x = (h + bl.x - 1) / bl.x;
    grid.y = (w + bl.y - 1) / bl.y;

    cudaEventRecord(startTime);
    GPUfunc << <grid, bl >> > (img_origin, img_new, w, h, dif);

    // Launch-configuration errors are only reported through cudaGetLastError.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "GPUfunc launch failed: %s\n", cudaGetErrorString(cudaStatus));
        exit(EXIT_FAILURE);
    }

    // Record the stop event right behind the kernel so no host-side
    // synchronization is included in the measured interval.
    cudaEventRecord(stopTime);

    // Waiting on the stop event also surfaces faults raised while the kernel ran.
    cudaStatus = cudaEventSynchronize(stopTime);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "GPUfunc execution failed: %s\n", cudaGetErrorString(cudaStatus));
        exit(EXIT_FAILURE);
    }

    cudaEventElapsedTime(&resultTime, startTime, stopTime);

    cout << "GPU time:" << resultTime << " ms" << endl;

    // Blocking copy: no further explicit synchronization is needed.
    cudaStatus = cudaMemcpy(result, img_new, outBytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        exit(EXIT_FAILURE);
    }

    cudaFree(img_new);
    cudaFree(img_origin);

    // Events are runtime resources too; they used to be leaked on every call.
    cudaEventDestroy(startTime);
    cudaEventDestroy(stopTime);

    return result;
}

/**
 * Creates a synthetic 30000 x 10000 single-channel test image whose pixels
 * are pseudo-random values in [0, 200), drawn with rand().
 *
 * @return calloc-allocated buffer of 300,000,000 bytes that the caller
 *         releases with free(), or NULL if the allocation fails
 */
unsigned char* creat300mpx() {
    const size_t pixelCount = (size_t)30000 * 10000;

    unsigned char *image = (unsigned char*)calloc(pixelCount, sizeof(unsigned char));
    if (image == NULL) {
        // ~300 MB can legitimately fail; do not write through a null pointer.
        fprintf(stderr, "creat300mpx: allocation failed\n");
        return NULL;
    }

    for (size_t i = 0; i < pixelCount; i++)
    {
        // Reduce modulo 200 BEFORE narrowing. The old `(unsigned char)rand() % 200`
        // truncated rand() to 0..255 first, which skewed the values toward 0..55.
        image[i] = (unsigned char)(rand() % 200);
    }
    return image;
}