// Untitled


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cmath>

#include "timewindows.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"


// Number of threads per block along each axis of the 2D kernel launches; also
// the divisor used to size the grid so that it covers the whole image.
#define BLOCK_SIZE  16
// Presumably BLOCK_SIZE plus a one-pixel border on each side, intended for a
// shared-memory tile; not used by any active code in this file.
#define BLOCK_SIZE_SH 18
// Number of bytes read from the start of the input file as its header and
// written back verbatim at the start of every output file.
// NOTE(review): 122 looks like a 14-byte file header plus a 108-byte
// BITMAPV4HEADER; files with a different header length are not handled - confirm.
#define HEADER_SIZE 122

// Unsigned 8-bit value, used for raw file bytes and output color channels.
typedef unsigned char BYTE;

/**
 * Structure that represents a BMP image.
 *
 * Instances are produced by readBMP() and released with freeBMP().
 */
typedef struct
{
    int   width;   // Image width, read from byte offset 18 of the file header
    int   height;  // Image height, read from byte offset 22 of the file header
    float* data;   // Pixel bytes converted to float, one float per file byte
                   // (consumed as B, G, R triplets by the grayscale step)
} BMPImage;

// Shorthand for the timestamp type filled by gettimeofday() and consumed by
// get_elapsed().
typedef struct timeval tval;

// Header bytes of the input file: filled by readBMP() and written back
// unchanged by writeBMPGrayscale() as the header of each output image.
BYTE g_info[HEADER_SIZE]; // Reference header

/**
 * Reads a BMP 24bpp file and returns a BMPImage structure.
 * Thanks to https://stackoverflow.com/a/9296467
 *
 * The first HEADER_SIZE bytes of the file are stored in g_info so that they
 * can be reused as the header of the images written later. The width, height
 * and pixel-data size are taken from byte offsets 18, 22 and 34 of that
 * header, and every byte of pixel data is converted to one float.
 *
 * The returned buffer always holds at least width * height * 3 floats; any
 * element that could not be read from the file is zero.
 *
 * NOTE(review): rows are assumed to be stored without padding and the pixel
 * data is assumed to start right after HEADER_SIZE bytes - confirm for the
 * images this tool is used with.
 *
 * @param filename Path of the BMP file to read.
 * @return The image on success. On failure a message is printed to stderr and
 *         a zeroed structure (data == NULL, width == height == 0) is returned.
 */
BMPImage readBMP(char* filename)
{
    BMPImage bitmap = { 0 };
    int      width = 0;
    int      height = 0;
    int      size = 0;
    size_t   expected = 0; // Bytes needed for width x height BGR pixels
    size_t   bytes = 0;    // Bytes of pixel data requested from the file
    size_t   count = 0;    // Number of floats in the returned buffer
    size_t   got = 0;      // Bytes of pixel data actually read
    BYTE*    raw = NULL;
    float*   pixels = NULL;
    FILE*    file = NULL;

    if (filename == NULL)
    {
        fprintf(stderr, "Error: No BMP filename was provided.\n");
        return bitmap;
    }

    file = fopen(filename, "rb");

    if (file == NULL)
    {
        fprintf(stderr, "Error: Unable to open \"%s\".\n", filename);
        return bitmap;
    }

    // Read the header (expected BGR - 24bpp)
    if (fread(g_info, sizeof(BYTE), HEADER_SIZE, file) != (size_t)HEADER_SIZE)
    {
        fprintf(stderr, "Error: \"%s\" is too short to contain a BMP header.\n",
            filename);
        fclose(file);
        return bitmap;
    }

    // Get the image width / height from the header. memcpy is used because
    // the fields are not aligned for a direct int access.
    memcpy(&width, &g_info[18], sizeof(int));
    memcpy(&height, &g_info[22], sizeof(int));
    memcpy(&size, &g_info[34], sizeof(int));

    if (width <= 0 || height <= 0)
    {
        fprintf(stderr, "Error: Unsupported image size (width=%d height=%d) in \"%s\".\n",
            width, height, filename);
        fclose(file);
        return bitmap;
    }

    // The size field may legitimately be zero for uncompressed images, so
    // fall back to the size implied by the dimensions in that case.
    expected = (size_t)width * (size_t)height * 3;
    bytes = (size > 0) ? (size_t)size : expected;
    count = (bytes > expected) ? bytes : expected;

    // Zero-initialized so that a short read never exposes garbage
    raw = (BYTE*)calloc(bytes, sizeof(BYTE));
    pixels = (float*)calloc(count, sizeof(float));

    if (raw == NULL || pixels == NULL)
    {
        fprintf(stderr, "Error: Out of memory while reading \"%s\".\n", filename);
        free(raw);
        free(pixels);
        fclose(file);
        return bitmap;
    }

    // Read the image data
    got = fread(raw, sizeof(BYTE), bytes, file);

    if (got != bytes)
    {
        fprintf(stderr, "Warning: \"%s\" holds %lu of the %lu expected pixel bytes; "
            "the remainder is treated as zero.\n",
            filename, (unsigned long)got, (unsigned long)bytes);
    }

    // Convert the pixel values to float
    for (size_t i = 0; i < bytes; i++)
    {
        pixels[i] = (float)raw[i];
    }

    fclose(file);
    free(raw);

    bitmap.width = width;
    bitmap.height = height;
    bitmap.data = pixels;

    return bitmap;
}

/**
 * Writes a BMP file in grayscale given its image data and a filename.
 *
 * The file starts with the HEADER_SIZE bytes stored in g_info, followed by
 * the image expanded to three identical bytes (B, G, R) per pixel. Values are
 * clamped to the [0, 255] range before being converted to a byte; NaN values
 * are written as 0.
 *
 * If the file cannot be opened or written, a message is printed to stderr
 * and the function returns without any other effect.
 *
 * @param width    Number of pixels per row in image.
 * @param height   Number of rows in image.
 * @param image    Grayscale image, one float per pixel, row after row.
 * @param filename Path of the file to create.
 */
void writeBMPGrayscale(int width, int height, float* image, char* filename)
{
    FILE* file = NULL;

    if (image == NULL || filename == NULL)
    {
        fprintf(stderr, "Error: Missing image data or filename for the output image.\n");
        return;
    }

    file = fopen(filename, "wb");

    if (file == NULL)
    {
        fprintf(stderr, "Error: Unable to open \"%s\" for writing.\n", filename);
        return;
    }

    // Write the reference header
    fwrite(g_info, sizeof(BYTE), HEADER_SIZE, file);

    // Unwrap the 8-bit grayscale into a 24bpp (for simplicity)
    for (int h = 0; h < height; h++)
    {
        int offset = h * width;

        for (int w = 0; w < width; w++)
        {
            const float value = image[offset + w];

            // Written so that NaN fails both comparisons and ends up as 0
            // instead of reaching the float-to-byte conversion.
            BYTE pixel = (value > 255.0f) ? (BYTE)255 :
                (value > 0.0f) ? (BYTE)value :
                (BYTE)0;

            // Repeat the same pixel value for BGR
            fputc(pixel, file);
            fputc(pixel, file);
            fputc(pixel, file);
        }
    }

    if (ferror(file))
    {
        fprintf(stderr, "Error: Failed while writing \"%s\".\n", filename);
    }

    fclose(file);
}

/**
 * Frees the pixel buffer owned by the given BMPImage.
 *
 * The structure is received by value, so the caller's copy still holds the
 * (now invalid) pointer afterwards.
 */
void freeBMP(BMPImage bitmap)
{
    float* pixels = bitmap.data;

    free(pixels);
}

/**
 * Queries the last CUDA error with cudaGetLastError(). When it is anything
 * other than cudaSuccess, the error code and its description are printed and
 * the program terminates with exit code -1; otherwise the function simply
 * returns.
 */
void checkCUDAError()
{
    const cudaError_t status = cudaGetLastError();

    // Nothing to report
    if (status == cudaSuccess)
    {
        return;
    }

    const char* description = cudaGetErrorString(status);

    printf("CUDA Error: Returned %d: %s\n", status, description);
    exit(-1);
}

/**
 * Returns the time elapsed from t0 to t1, in milliseconds.
 *
 * The seconds and microseconds fields are converted separately and summed.
 */
double get_elapsed(tval t0, tval t1)
{
    // Whole seconds -> milliseconds
    const long double from_seconds = (double)(t1.tv_sec - t0.tv_sec) * 1000.0L;
    // Microseconds -> milliseconds
    const long double from_micros = (double)(t1.tv_usec - t0.tv_usec) / 1000.0L;

    return (double)(from_seconds + from_micros);
}

/**
 * Writes the result image(s) of a pipeline step and reports its timings.
 *
 * image is stored as "images/hw3_result_<index>.bmp". When image2 is given it
 * is stored as "images/hw3_result_<index>_gpu.bmp", and that second path is
 * the one shown in the completion message. An elapsed_gpu of exactly 0 is
 * reported as "GPU version not available".
 */
void store_result(int index, double elapsed_cpu, double elapsed_gpu,
    int width, int height, float* image, float* image2=nullptr)
{
    char       filename[255];
    const bool has_second_image = (image2 != nullptr);

    // First image
    sprintf(filename, "images/hw3_result_%d.bmp", index);
    writeBMPGrayscale(width, height, image, filename);

    // Optional second image
    if (has_second_image)
    {
        sprintf(filename, "images/hw3_result_%d_gpu.bmp", index);
        writeBMPGrayscale(width, height, image2, filename);
    }

    printf("Step #%d Completed - Result stored in \"%s\".\n", index, filename);
    printf("Elapsed CPU: %fms / ", elapsed_cpu);

    if (elapsed_gpu != 0)
    {
        printf("Elapsed GPU: %fms\n", elapsed_gpu);
    }
    else
    {
        printf("[GPU version not available]\n");
    }
}

/**
 * CPU conversion of a 3-channel (B, G, R) image into a single-channel
 * grayscale image.
 *
 * image holds three floats per pixel and image_out receives one float per
 * pixel; both are laid out row after row with `width` pixels per row.
 */
void cpu_grayscale(int width, int height, float* image, float* image_out)
{
    for (int row = 0; row < height; ++row)
    {
        const int first_out = row * width;  // First output element of the row
        const int first_in = first_out * 3; // First input channel of the row

        for (int col = 0; col < width; ++col)
        {
            const float blue = image[first_in + col * 3 + 0];
            const float green = image[first_in + col * 3 + 1];
            const float red = image[first_in + col * 3 + 2];

            // Weighted sum of the channels ("luminance" model)
            image_out[first_out + col] = blue * 0.0722f + green * 0.7152f + red * 0.2126f;
        }
    }
}

/**
 * Converts a given 24bpp image into 8bpp grayscale using the GPU.
 *
 * Expects a 2D launch in which each thread handles the pixel at
 * (blockIdx.x * blockDim.x + threadIdx.x, blockIdx.y * blockDim.y + threadIdx.y).
 * The grid may be larger than the image; threads outside it do nothing.
 *
 * @param width     Number of pixels per row.
 * @param height    Number of rows.
 * @param image     Input image with three floats (B, G, R) per pixel.
 * @param image_out Output image with one float per pixel.
 */
__global__ void gpu_grayscale(int width, int height, float* image, float* image_out)
{
    const int index_x = blockIdx.x * blockDim.x + threadIdx.x;
    const int index_y = blockIdx.y * blockDim.y + threadIdx.y;

    // Valid coordinates are [0, width) x [0, height). The comparison must be
    // ">=": a thread at index_x == width or index_y == height is already
    // outside the image and would otherwise access memory past the buffers.
    if (index_x >= width || index_y >= height)
        return;

    const int offset = index_y * width + index_x;

    const float* pixel = &image[offset * 3];

    // Convert to grayscale following the "luminance" model
    image_out[offset] = pixel[0] * 0.0722f + // B
                        pixel[1] * 0.7152f + // G
                        pixel[2] * 0.2126f;  // R
}

/**
 * Computes the weighted sum of a square window of an image with a square
 * convolution matrix. Callable from both host and device code.
 *
 * @param image      Pointer to the top-left element of the window.
 * @param stride     Distance, in elements, between two consecutive image rows.
 * @param matrix     filter_dim x filter_dim weights stored row after row.
 * @param filter_dim Side of the window and of the matrix.
 * @return Sum of image[r * stride + c] * matrix[r * filter_dim + c] over all
 *         rows r and columns c of the window.
 */
__device__ __host__ float applyFilter(float* image, int stride, float* matrix, int filter_dim)
{
    float acc = 0.0f;

    for (int r = 0; r < filter_dim; ++r)
    {
        const float* window_row = image + r * stride;
        const float* weight_row = matrix + r * filter_dim;

        for (int c = 0; c < filter_dim; ++c)
        {
            acc += window_row[c] * weight_row[c];
        }
    }

    return acc;
}

/**
 * CPU version of the 3x3 Gaussian blur.
 *
 * Every pixel that has a full 3x3 neighbourhood is replaced by the weighted
 * average of that neighbourhood; the one-pixel border of image_out is not
 * written.
 */
void cpu_gaussian(int width, int height, float* image, float* image_out)
{
    float weights[9] = { 0.0625f, 0.125f, 0.0625f,
                         0.125f,  0.25f,  0.125f,
                         0.0625f, 0.125f, 0.0625f };

    // (x, y) is the pixel being written, i.e. the center of the window
    for (int y = 1; y < (height - 1); y++)
    {
        float* window_row = &image[(y - 1) * width];
        float* out_row = &image_out[y * width];

        for (int x = 1; x < (width - 1); x++)
        {
            out_row[x] = applyFilter(&window_row[x - 1], width, weights, 3);
        }
    }
}

/**
 * GPU version of the 3x3 Gaussian blur.
 *
 * Expects a 2D launch; the thread with global coordinates (col, row) reads the
 * 3x3 window whose top-left corner is (col, row) and writes the pixel at
 * (col + 1, row + 1). Threads whose window would not fit inside the image do
 * nothing, so the one-pixel border of image_out is not written.
 *
 * Each thread reads its window directly through `image`; a shared-memory
 * tiled variant is not implemented.
 */
__global__ void gpu_gaussian(int width, int height, float* image, float* image_out)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // The window must fit entirely inside the image
    if (col >= (width - 2) || row >= (height - 2))
    {
        return;
    }

    float weights[9] = { 0.0625f, 0.125f, 0.0625f,
                         0.125f,  0.25f,  0.125f,
                         0.0625f, 0.125f, 0.0625f };

    const int window = row * width + col;             // Top-left of the window
    const int center = (row + 1) * width + (col + 1); // Pixel being written

    image_out[center] = applyFilter(&image[window], width, weights, 3);
}

/**
 * CPU version of the Sobel gradient magnitude.
 *
 * For every pixel that has a full 3x3 neighbourhood, the horizontal and
 * vertical Sobel responses are computed and their Euclidean norm is stored.
 * The one-pixel border of image_out is not written.
 */
void cpu_sobel(int width, int height, float* image, float* image_out)
{
    float kernel_x[9] = { 1.0f,  0.0f, -1.0f,
                          2.0f,  0.0f, -2.0f,
                          1.0f,  0.0f, -1.0f };
    float kernel_y[9] = { 1.0f,  2.0f,  1.0f,
                          0.0f,  0.0f,  0.0f,
                         -1.0f, -2.0f, -1.0f };

    // (x, y) is the pixel being written, i.e. the center of the window
    for (int y = 1; y < (height - 1); y++)
    {
        for (int x = 1; x < (width - 1); x++)
        {
            float* window = &image[(y - 1) * width + (x - 1)];

            const float gx = applyFilter(window, width, kernel_x, 3);
            const float gy = applyFilter(window, width, kernel_y, 3);

            // The magnitude is never negative but may exceed the max. color
            // value of 255; it is clamped later, when the file is stored.
            image_out[y * width + x] = sqrtf(gx * gx + gy * gy);
        }
    }
}

/**
 * GPU version of the Sobel gradient magnitude.
 *
 * Expects a 2D launch; the thread with global coordinates (col, row) reads the
 * 3x3 window whose top-left corner is (col, row) and writes the gradient
 * magnitude of the pixel at (col + 1, row + 1). Threads whose window would not
 * fit inside the image do nothing, so the one-pixel border of image_out is
 * not written.
 */
__global__ void gpu_sobel(int width, int height, float* image, float* image_out)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // The window must fit entirely inside the image
    if (col >= (width - 2) || row >= (height - 2))
    {
        return;
    }

    float kernel_x[9] = { 1.0f,  0.0f, -1.0f,
                          2.0f,  0.0f, -2.0f,
                          1.0f,  0.0f, -1.0f };
    float kernel_y[9] = { 1.0f,  2.0f,  1.0f,
                          0.0f,  0.0f,  0.0f,
                         -1.0f, -2.0f, -1.0f };

    float* window = &image[row * width + col];
    const int center = (row + 1) * width + (col + 1);

    const float gx = applyFilter(window, width, kernel_x, 3);
    const float gy = applyFilter(window, width, kernel_y, 3);

    image_out[center] = sqrtf(gx * gx + gy * gy);
}

/**
 * Runs the grayscale -> Gaussian -> Sobel pipeline on the image given as the
 * only command-line argument, once on the CPU and once on the GPU, and stores
 * the result and the elapsed time of every step through store_result().
 *
 * The GPU time of each step covers the kernel launch plus the blocking copy
 * of the result back to the host.
 *
 * @return 0 on success, -1 when the argument is missing, the image cannot be
 *         read or host memory cannot be allocated. CUDA errors terminate the
 *         program through checkCUDAError().
 */
int main(int argc, char** argv)
{
    BMPImage bitmap = { 0 };
    float*   d_bitmap = NULL;
    float*   image_out[2] = { 0 };
    float*   image_out_gpu[2] = { 0 };
    float*   d_image_out[2] = { 0 };
    int      image_size = 0;
    size_t   gray_bytes = 0;                // Bytes of a 1-channel image
    size_t   color_bytes = 0;               // Bytes of the 3-channel input
    bool     host_ok = true;
    tval     t[2] = { 0 };
    double   elapsed[2] = { 0 };
    dim3     grid(1);                       // The grid will be defined later
    dim3     block(BLOCK_SIZE, BLOCK_SIZE); // The block size will not change

    // Make sure the filename is provided
    if (argc != 2)
    {
        fprintf(stderr, "Error: The filename is missing!\n");
        return -1;
    }

    // Read the input image and make sure there is something to process
    bitmap = readBMP(argv[1]);

    if (bitmap.data == NULL || bitmap.width <= 0 || bitmap.height <= 0)
    {
        fprintf(stderr, "Error: Unable to read a valid image from \"%s\".\n", argv[1]);
        freeBMP(bitmap);
        return -1;
    }

    // Update the grid dimension so that it covers the whole image
    image_size = bitmap.width * bitmap.height;
    gray_bytes = (size_t)image_size * sizeof(float);
    color_bytes = gray_bytes * 3;
    grid = dim3(((bitmap.width + (BLOCK_SIZE - 1)) / BLOCK_SIZE),
        ((bitmap.height + (BLOCK_SIZE - 1)) / BLOCK_SIZE));

    printf("Image opened (width=%d height=%d).\n", bitmap.width, bitmap.height);

    // Allocate the intermediate host buffers for each step (zero-filled,
    // because the filters never write the one-pixel border)
    for (int i = 0; i < 2; i++)
    {
        image_out[i] = (float*)calloc(image_size, sizeof(float));
        image_out_gpu[i] = (float*)calloc(image_size, sizeof(float));

        host_ok = host_ok && (image_out[i] != NULL) && (image_out_gpu[i] != NULL);
    }

    if (!host_ok)
    {
        fprintf(stderr, "Error: Unable to allocate the host image buffers.\n");

        for (int i = 0; i < 2; i++)
        {
            free(image_out[i]);
            free(image_out_gpu[i]);
        }

        freeBMP(bitmap);
        return -1;
    }

    // Allocate the intermediate device buffers and upload the input image
    for (int i = 0; i < 2; i++)
    {
        cudaMalloc(&d_image_out[i], gray_bytes);
        checkCUDAError();

        cudaMemset(d_image_out[i], 0, gray_bytes);
        checkCUDAError();
    }

    cudaMalloc(&d_bitmap, color_bytes);
    checkCUDAError();

    cudaMemcpy(d_bitmap, bitmap.data, color_bytes, cudaMemcpyHostToDevice);
    checkCUDAError();

    // Step 1: Convert to grayscale
    {
        // Launch the CPU version
        gettimeofday(&t[0], NULL);
        cpu_grayscale(bitmap.width, bitmap.height, bitmap.data, image_out[0]);
        gettimeofday(&t[1], NULL);

        elapsed[0] = get_elapsed(t[0], t[1]);

        // Launch the GPU version
        gettimeofday(&t[0], NULL);
        gpu_grayscale<<<grid, block>>>(bitmap.width, bitmap.height, d_bitmap, d_image_out[0]);
        checkCUDAError(); // Launch errors (e.g. invalid configuration)

        cudaMemcpy(image_out_gpu[0], d_image_out[0], gray_bytes, cudaMemcpyDeviceToHost);
        gettimeofday(&t[1], NULL);
        checkCUDAError(); // Errors raised while the kernel ran or during the copy

        elapsed[1] = get_elapsed(t[0], t[1]);

        // Store the result image in grayscale
        store_result(1, elapsed[0], elapsed[1], bitmap.width, bitmap.height, image_out[0], image_out_gpu[0]);
    }

    // Step 2: Apply a 3x3 Gaussian filter
    {
        // Launch the CPU version
        gettimeofday(&t[0], NULL);
        cpu_gaussian(bitmap.width, bitmap.height, image_out[0], image_out[1]);
        gettimeofday(&t[1], NULL);

        elapsed[0] = get_elapsed(t[0], t[1]);

        // Launch the GPU version
        gettimeofday(&t[0], NULL);
        gpu_gaussian<<<grid, block>>>(bitmap.width, bitmap.height,
                                      d_image_out[0], d_image_out[1]);
        checkCUDAError();

        cudaMemcpy(image_out_gpu[1], d_image_out[1], gray_bytes, cudaMemcpyDeviceToHost);
        gettimeofday(&t[1], NULL);
        checkCUDAError();

        elapsed[1] = get_elapsed(t[0], t[1]);

        // Store the result image with the Gaussian filter applied
        store_result(2, elapsed[0], elapsed[1], bitmap.width, bitmap.height, image_out[1], image_out_gpu[1]);
    }

    // Step 3: Apply a Sobel filter
    {
        // Launch the CPU version
        gettimeofday(&t[0], NULL);
        cpu_sobel(bitmap.width, bitmap.height, image_out[1], image_out[0]);
        gettimeofday(&t[1], NULL);

        elapsed[0] = get_elapsed(t[0], t[1]);

        // Launch the GPU version
        gettimeofday(&t[0], NULL);
        gpu_sobel<<<grid, block>>>(bitmap.width, bitmap.height,
                                   d_image_out[1], d_image_out[0]);
        checkCUDAError();

        cudaMemcpy(image_out_gpu[0], d_image_out[0], gray_bytes, cudaMemcpyDeviceToHost);
        gettimeofday(&t[1], NULL);
        checkCUDAError();

        elapsed[1] = get_elapsed(t[0], t[1]);

        // Store the final result image with the Sobel filter applied
        store_result(3, elapsed[0], elapsed[1], bitmap.width, bitmap.height, image_out[0], image_out_gpu[0]);
    }

    // Release the allocated memory (host copies of the GPU results included)
    for (int i = 0; i < 2; i++)
    {
        free(image_out[i]);
        free(image_out_gpu[i]);
        cudaFree(d_image_out[i]);
    }

    freeBMP(bitmap);
    cudaFree(d_bitmap);

    return 0;
}