Advertisement
Guest User

Untitled

a guest
Nov 21st, 2019
112
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 12.44 KB | None | 0 0
  1.  
  2. #include <stdio.h>
  3. #include <stdlib.h>
  4. #include <string.h>
  5. #include <cmath>
  6.  
  7. #include "timewindows.h"
  8. #include "cuda_runtime.h"
  9. #include "device_launch_parameters.h"
  10.  
  11.  
  12. #define BLOCK_SIZE  16
  13. #define BLOCK_SIZE_SH 18
  14. #define HEADER_SIZE 122
  15.  
  16. typedef unsigned char BYTE;
  17.  
  18. /**
  19.  * Structure that represents a BMP image.
  20.  */
  21. typedef struct
  22. {
  23.     int   width;
  24.     int   height;
  25.     float* data;
  26. } BMPImage;
  27.  
  28. typedef struct timeval tval;
  29.  
  30. BYTE g_info[HEADER_SIZE]; // Reference header
  31.  
  32. /**
  33.  * Reads a BMP 24bpp file and returns a BMPImage structure.
  34.  * Thanks to https://stackoverflow.com/a/9296467
  35.  */
  36. BMPImage readBMP(char* filename)
  37. {
  38.     BMPImage bitmap = { 0 };
  39.     int      size = 0;
  40.     BYTE* data = NULL;
  41.     FILE* file = fopen(filename, "rb");
  42.  
  43.     // Read the header (expected BGR - 24bpp)
  44.     fread(g_info, sizeof(BYTE), HEADER_SIZE, file);
  45.  
  46.     // Get the image width / height from the header
  47.     bitmap.width = *((int*)&g_info[18]);
  48.     bitmap.height = *((int*)&g_info[22]);
  49.     size = *((int*)&g_info[34]);
  50.  
  51.     // Read the image data
  52.     data = (BYTE*)malloc(sizeof(BYTE) * size);
  53.     fread(data, sizeof(BYTE), size, file);
  54.  
  55.     // Convert the pixel values to float
  56.     bitmap.data = (float*)malloc(sizeof(float) * size);
  57.  
  58.     for (int i = 0; i < size; i++)
  59.     {
  60.         bitmap.data[i] = (float)data[i];
  61.     }
  62.  
  63.     fclose(file);
  64.     free(data);
  65.  
  66.     return bitmap;
  67. }
  68.  
  69. /**
  70.  * Writes a BMP file in grayscale given its image data and a filename.
  71.  */
  72. void writeBMPGrayscale(int width, int height, float* image, char* filename)
  73. {
  74.     FILE* file = NULL;
  75.  
  76.     file = fopen(filename, "wb");
  77.  
  78.     // Write the reference header
  79.     fwrite(g_info, sizeof(BYTE), HEADER_SIZE, file);
  80.  
  81.     // Unwrap the 8-bit grayscale into a 24bpp (for simplicity)
  82.     for (int h = 0; h < height; h++)
  83.     {
  84.         int offset = h * width;
  85.  
  86.         for (int w = 0; w < width; w++)
  87.         {
  88.             BYTE pixel = (BYTE)((image[offset + w] > 255.0f) ? 255.0f :
  89.                 (image[offset + w] < 0.0f) ? 0.0f :
  90.                 image[offset + w]);
  91.  
  92.             // Repeat the same pixel value for BGR
  93.             fputc(pixel, file);
  94.             fputc(pixel, file);
  95.             fputc(pixel, file);
  96.         }
  97.     }
  98.  
  99.     fclose(file);
  100. }
  101.  
  102. /**
  103.  * Releases a given BMPImage.
  104.  */
  105. void freeBMP(BMPImage bitmap)
  106. {
  107.     free(bitmap.data);
  108. }
  109.  
  110. /**
  111.  * Checks if there has been any CUDA error. The method will automatically print
  112.  * some information and exit the program when an error is found.
  113.  */
  114. void checkCUDAError()
  115. {
  116.     cudaError_t cudaError = cudaGetLastError();
  117.  
  118.     if (cudaError != cudaSuccess)
  119.     {
  120.         printf("CUDA Error: Returned %d: %s\n", cudaError,
  121.             cudaGetErrorString(cudaError));
  122.         exit(-1);
  123.     }
  124. }
  125.  
  126. /**
  127.  * Calculates the elapsed time between two time intervals (in milliseconds).
  128.  */
  129. double get_elapsed(tval t0, tval t1)
  130. {
  131.     return (double)(t1.tv_sec - t0.tv_sec) * 1000.0L + (double)(t1.tv_usec - t0.tv_usec) / 1000.0L;
  132. }
  133.  
  134. /**
  135.  * Stores the result image and prints a message.
  136.  */
  137. void store_result(int index, double elapsed_cpu, double elapsed_gpu,
  138.     int width, int height, float* image, float* image2=nullptr)
  139. {
  140.     char path[255];
  141.  
  142.     sprintf(path, "images/hw3_result_%d.bmp", index);
  143.     writeBMPGrayscale(width, height, image, path);
  144.  
  145.     if (image2)
  146.     {
  147.         sprintf(path, "images/hw3_result_%d_gpu.bmp", index);
  148.         writeBMPGrayscale(width, height, image2, path);
  149.     }
  150.  
  151.     printf("Step #%d Completed - Result stored in \"%s\".\n", index, path);
  152.     printf("Elapsed CPU: %fms / ", elapsed_cpu);
  153.  
  154.     if (elapsed_gpu == 0)
  155.     {
  156.         printf("[GPU version not available]\n");
  157.     }
  158.     else
  159.     {
  160.         printf("Elapsed GPU: %fms\n", elapsed_gpu);
  161.     }
  162. }
  163.  
  164. /**
  165.  * Converts a given 24bpp image into 8bpp grayscale using the CPU.
  166.  */
  167. void cpu_grayscale(int width, int height, float* image, float* image_out)
  168. {
  169.     for (int h = 0; h < height; h++)
  170.     {
  171.         int offset_out = h * width;      // 1 color per pixel
  172.         int offset = offset_out * 3; // 3 colors per pixel
  173.  
  174.         for (int w = 0; w < width; w++)
  175.         {
  176.             float* pixel = &image[offset + w * 3];
  177.  
  178.             // Convert to grayscale following the "luminance" model
  179.             image_out[offset_out + w] = pixel[0] * 0.0722f + // B
  180.                 pixel[1] * 0.7152f + // G
  181.                 pixel[2] * 0.2126f;  // R
  182.         }
  183.     }
  184. }
  185.  
  186. /**
  187.  * Converts a given 24bpp image into 8bpp grayscale using the GPU.
  188.  */
  189. __global__ void gpu_grayscale(int width, int height, float* image, float* image_out)
  190. {
  191.     int index_x = blockIdx.x * blockDim.x + threadIdx.x;
  192.     int index_y = blockIdx.y * blockDim.y + threadIdx.y;
  193.  
  194.     if (index_x > width || index_y > height)
  195.         return;
  196.  
  197.     int offset = index_y * width + index_x;
  198.  
  199.     float* pixel = &image[offset * 3];
  200.  
  201.     image_out[offset] = pixel[0] * 0.0722f + // B
  202.                         pixel[1] * 0.7152f + // G
  203.                         pixel[2] * 0.2126f;  // R
  204. }
  205.  
  206. /**
  207.  * Applies a 3x3 convolution matrix to a pixel using the GPU.
  208.  */
  209. __device__ __host__ float applyFilter(float* image, int stride, float* matrix, int filter_dim)
  210. {
  211.     float pixel = 0.0f;
  212.  
  213.     for (int h = 0; h < filter_dim; h++)
  214.     {
  215.         int offset = h * stride;
  216.         int offset_kernel = h * filter_dim;
  217.  
  218.         for (int w = 0; w < filter_dim; w++)
  219.         {
  220.             pixel += image[offset + w] * matrix[offset_kernel + w];
  221.         }
  222.     }
  223.  
  224.     return pixel;
  225. }
  226.  
  227. /**
  228.  * Applies a Gaussian 3x3 filter to a given image using the CPU.
  229.  */
  230. void cpu_gaussian(int width, int height, float* image, float* image_out)
  231. {
  232.     float gaussian[9] = { 1.0f / 16.0f, 2.0f / 16.0f, 1.0f / 16.0f,
  233.                           2.0f / 16.0f, 4.0f / 16.0f, 2.0f / 16.0f,
  234.                           1.0f / 16.0f, 2.0f / 16.0f, 1.0f / 16.0f };
  235.  
  236.     for (int h = 0; h < (height - 2); h++)
  237.     {
  238.         int offset_t = h * width;
  239.         int offset = (h + 1) * width;
  240.  
  241.         for (int w = 0; w < (width - 2); w++)
  242.         {
  243.             image_out[offset + (w + 1)] = applyFilter(&image[offset_t + w],
  244.                 width, gaussian, 3);
  245.         }
  246.     }
  247. }
  248.  
  249. /**
  250.  * Applies a Gaussian 3x3 filter to a given image using the GPU.
  251.  */
  252. __global__ void gpu_gaussian(int width, int height, float* image, float* image_out)
  253. {
  254.     //__shared__ float sh_block[BLOCK_SIZE_SH * BLOCK_SIZE_SH];
  255.  
  256.     float gaussian[9] = { 1.0f / 16.0f, 2.0f / 16.0f, 1.0f / 16.0f,
  257.                           2.0f / 16.0f, 4.0f / 16.0f, 2.0f / 16.0f,
  258.                           1.0f / 16.0f, 2.0f / 16.0f, 1.0f / 16.0f };
  259.  
  260.     int index_x = blockIdx.x * blockDim.x + threadIdx.x;
  261.     int index_y = blockIdx.y * blockDim.y + threadIdx.y;
  262.  
  263.     if (index_x < (width - 2) && index_y < (height - 2))
  264.     {
  265.         int offset_t = index_y * width + index_x;
  266.         int offset = (index_y + 1) * width + (index_x + 1);
  267.  
  268.         //int sh_block_offset = threadIdx.y * BLOCK_SIZE_SH + threadIdx.x;
  269.         //sh_block[sh_block_offset] = image[offset_t];
  270.         //__syncthreads();
  271.  
  272.         //image_out[offset] = applyFilter(&sh_block[sh_block_offset], width, gaussian, 3);
  273.         image_out[offset] = applyFilter(&image[offset_t], width, gaussian, 3);
  274.     }
  275. }
  276.  
  277. /**
  278.  * Calculates the gradient of an image using a Sobel filter on the CPU.
  279.  */
  280. void cpu_sobel(int width, int height, float* image, float* image_out)
  281. {
  282.     float sobel_x[9] = { 1.0f,  0.0f, -1.0f,
  283.                          2.0f,  0.0f, -2.0f,
  284.                          1.0f,  0.0f, -1.0f };
  285.     float sobel_y[9] = { 1.0f,  2.0f,  1.0f,
  286.                          0.0f,  0.0f,  0.0f,
  287.                         -1.0f, -2.0f, -1.0f };
  288.  
  289.     for (int h = 0; h < (height - 2); h++)
  290.     {
  291.         int offset_t = h * width;
  292.         int offset = (h + 1) * width;
  293.  
  294.         for (int w = 0; w < (width - 2); w++)
  295.         {
  296.             float gx = applyFilter(&image[offset_t + w], width, sobel_x, 3);
  297.             float gy = applyFilter(&image[offset_t + w], width, sobel_y, 3);
  298.  
  299.             // Note: The output can be negative or exceed the max. color value
  300.             // of 255. We compensate this afterwards while storing the file.
  301.             image_out[offset + (w + 1)] = sqrtf(gx * gx + gy * gy);
  302.         }
  303.     }
  304. }
  305.  
  306. /**
  307.  * Calculates the gradient of an image using a Sobel filter on the GPU.
  308.  */
  309. __global__ void gpu_sobel(int width, int height, float* image, float* image_out)
  310. {
  311.     //__shared__ float sh_block[BLOCK_SIZE_SH * BLOCK_SIZE_SH];
  312.  
  313.     float sobel_x[9] = { 1.0f,  0.0f, -1.0f,
  314.                          2.0f,  0.0f, -2.0f,
  315.                          1.0f,  0.0f, -1.0f };
  316.     float sobel_y[9] = { 1.0f,  2.0f,  1.0f,
  317.                          0.0f,  0.0f,  0.0f,
  318.                         -1.0f, -2.0f, -1.0f };
  319.  
  320.     int index_x = blockIdx.x * blockDim.x + threadIdx.x;
  321.     int index_y = blockIdx.y * blockDim.y + threadIdx.y;
  322.  
  323.     if (index_x < (width - 2) && index_y < (height - 2))
  324.     {
  325.         int offset_t = index_y * width + index_x;
  326.         int offset = (index_y + 1) * width + (index_x + 1);
  327.  
  328.         float gx = applyFilter(&image[offset_t], width, sobel_x, 3);
  329.         float gy = applyFilter(&image[offset_t], width, sobel_y, 3);
  330.  
  331.         image_out[offset] = sqrtf(gx * gx + gy * gy);
  332.     }
  333.    
  334. }
  335.  
  336. int main(int argc, char** argv)
  337. {
  338.     BMPImage bitmap = { 0 };
  339.     float* d_bitmap = { 0 };
  340.     float* image_out[2] = { 0 };
  341.     float* image_out_gpu[2] = { 0 };
  342.     float* d_image_out[2] = { 0 };
  343.     int      image_size = 0;
  344.     tval     t[2] = { 0 };
  345.     double   elapsed[2] = { 0 };
  346.     dim3     grid(1);                       // The grid will be defined later
  347.     dim3     block(BLOCK_SIZE, BLOCK_SIZE); // The block size will not change
  348.  
  349.     // Make sure the filename is provided
  350.     if (argc != 2)
  351.     {
  352.         fprintf(stderr, "Error: The filename is missing!\n");
  353.         return -1;
  354.     }
  355.  
  356.     // Read the input image and update the grid dimension
  357.     bitmap = readBMP(argv[1]);
  358.     image_size = bitmap.width * bitmap.height;
  359.     grid = dim3(((bitmap.width + (BLOCK_SIZE - 1)) / BLOCK_SIZE),
  360.         ((bitmap.height + (BLOCK_SIZE - 1)) / BLOCK_SIZE));
  361.  
  362. #if 0
  363.     {
  364.         int* gpu_index, index;
  365.         cudaMalloc(&gpu_index, sizeof(int));
  366.         cudaMemset(gpu_index, 0, sizeof(int));
  367.         gpu_grayscale << <grid, block >> > (bitmap.width, bitmap.height,
  368.             d_bitmap, d_image_out[0], gpu_index);
  369.  
  370.         cudaMemcpy(&index, gpu_index, sizeof(float), cudaMemcpyDeviceToHost);
  371.  
  372.         printf("INDEX %d", index);
  373.     }
  374. #endif
  375.  
  376.     printf("Image opened (width=%d height=%d).\n", bitmap.width, bitmap.height);
  377.  
  378.     // Allocate the intermediate image buffers for each step
  379.     for (int i = 0; i < 2; i++)
  380.     {
  381.         image_out[i] = (float*)calloc(image_size, sizeof(float));
  382.         image_out_gpu[i] = (float*)calloc(image_size, sizeof(float));
  383.  
  384.         cudaMalloc(&d_image_out[i], image_size * sizeof(float));
  385.         cudaMemset(d_image_out[i], 0, image_size * sizeof(float));
  386.     }
  387.  
  388.     cudaMalloc(&d_bitmap, image_size * sizeof(float) * 3);
  389.     cudaMemcpy(d_bitmap, bitmap.data,
  390.         image_size * sizeof(float) * 3, cudaMemcpyHostToDevice);
  391.  
  392.     // Step 1: Convert to grayscale
  393.     {
  394.         // Launch the CPU version
  395.         gettimeofday(&t[0], NULL);
  396.         cpu_grayscale(bitmap.width, bitmap.height, bitmap.data, image_out[0]);
  397.         gettimeofday(&t[1], NULL);
  398.  
  399.         elapsed[0] = get_elapsed(t[0], t[1]);
  400.  
  401.         // Launch the GPU version
  402.         gettimeofday(&t[0], NULL);
  403.         gpu_grayscale<<<grid, block>>>(bitmap.width, bitmap.height, d_bitmap, d_image_out[0]);
  404.  
  405.         cudaMemcpy(image_out_gpu[0], d_image_out[0],
  406.                    image_size * sizeof(float), cudaMemcpyDeviceToHost);
  407.         gettimeofday(&t[1], NULL);
  408.  
  409.         elapsed[1] = get_elapsed(t[0], t[1]);
  410.  
  411.         // Store the result image in grayscale
  412.         store_result(1, elapsed[0], elapsed[1], bitmap.width, bitmap.height, image_out[0], image_out_gpu[0]);
  413.     }
  414.  
  415.     // Step 2: Apply a 3x3 Gaussian filter
  416.     {
  417.         // Launch the CPU version
  418.         gettimeofday(&t[0], NULL);
  419.         cpu_gaussian(bitmap.width, bitmap.height, image_out[0], image_out[1]);
  420.         gettimeofday(&t[1], NULL);
  421.  
  422.         elapsed[0] = get_elapsed(t[0], t[1]);
  423.  
  424.         // Launch the GPU version
  425.         gettimeofday(&t[0], NULL);
  426.         gpu_gaussian<<<grid, block>>>(bitmap.width, bitmap.height,
  427.                                       d_image_out[0], d_image_out[1]);
  428.  
  429.         cudaMemcpy(image_out_gpu[1], d_image_out[1],
  430.                    image_size * sizeof(float), cudaMemcpyDeviceToHost);
  431.         gettimeofday(&t[1], NULL);
  432.  
  433.         elapsed[1] = get_elapsed(t[0], t[1]);
  434.  
  435.         // Store the result image with the Gaussian filter applied
  436.         store_result(2, elapsed[0], elapsed[1], bitmap.width, bitmap.height, image_out[1], image_out_gpu[1]);
  437.     }
  438.  
  439.     // Step 3: Apply a Sobel filter
  440.     {
  441.         // Launch the CPU version
  442.         gettimeofday(&t[0], NULL);
  443.         cpu_sobel(bitmap.width, bitmap.height, image_out[1], image_out[0]);
  444.         gettimeofday(&t[1], NULL);
  445.  
  446.         elapsed[0] = get_elapsed(t[0], t[1]);
  447.  
  448.         // Launch the GPU version
  449.         gettimeofday(&t[0], NULL);
  450.         gpu_sobel<<<grid, block>>>(bitmap.width, bitmap.height,
  451.                                    d_image_out[1], d_image_out[0]);
  452.  
  453.         cudaMemcpy(image_out_gpu[0], d_image_out[0],
  454.                    image_size * sizeof(float), cudaMemcpyDeviceToHost);
  455.         gettimeofday(&t[1], NULL);
  456.  
  457.         elapsed[1] = get_elapsed(t[0], t[1]);
  458.  
  459.         // Store the final result image with the Sobel filter applied
  460.         store_result(3, elapsed[0], elapsed[1], bitmap.width, bitmap.height, image_out[0], image_out_gpu[0]);
  461.     }
  462.  
  463.     // Release the allocated memory
  464.     for (int i = 0; i < 2; i++)
  465.     {
  466.         free(image_out[i]);
  467.         cudaFree(d_image_out[i]);
  468.     }
  469.  
  470.     freeBMP(bitmap);
  471.     cudaFree(d_bitmap);
  472.  
  473.     return 0;
  474. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement