// Untitled

// Histogram Equalization

#include <cstdio>
#include <cstdlib>

#include <wb.h>

// Threads per block for the 1D kernel launches in main; 256 also equals the
// number of histogram bins the kernels work with.
#define BLOCK_SIZE 256
// NOTE(review): not referenced anywhere in the visible code - possibly a
// leftover from a 2D launch configuration; confirm before removing.
#define BLOCK_WIDTH 16

//@@ insert code here

//First kernel converts the data from float to unsigned char
//
// Launch layout: 1D grid of 1D blocks, one thread per element. Threads whose
// global index is >= len do nothing, so the grid may overshoot len.
//   input  - device array of len floats (presumably normalized to [0, 1];
//            main passes the data obtained from wbImage_getData - TODO confirm)
//   output - device array of len bytes receiving 255 * input, truncated
//   len    - number of elements in both arrays
__global__
void floatToChar(float *input, unsigned char *output, int len){
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if(i < len){
          float scaled = 255.0f * input[i];
          // Converting an out-of-range float to unsigned char is undefined,
          // so clamp to the byte range first (fmaxf also maps NaN to 0).
          scaled = fminf(fmaxf(scaled, 0.0f), 255.0f);
          // Cast the whole product explicitly; truncates toward zero.
          output[i] = (unsigned char) scaled;
     }
}

//Second kernel: collapses interleaved 3-channel pixels into one gray byte each
//
// Launch layout: 1D grid of 1D blocks, one thread per pixel.
//   input  - device array holding 3 bytes per pixel, read as R, G, B
//   output - device array of size bytes, one intensity per pixel
//   size   - number of pixels
__global__
void rgbToGray(unsigned char *input, unsigned char *output, int size){
     const int pixel = threadIdx.x + blockIdx.x * blockDim.x;
     // Threads past the last pixel have nothing to do.
     if(pixel >= size)
          return;
     // The three channel bytes of a pixel sit back to back.
     const unsigned char *rgb = input + pixel * 3;
     const unsigned char red = rgb[0];
     const unsigned char green = rgb[1];
     const unsigned char blue = rgb[2];
     // Weighted channel sum, truncated to a byte, is the gray intensity.
     const float intensity = 0.21f*red + 0.71f*green + 0.07f*blue;
     output[pixel] = (unsigned char) intensity;
}

//Third kernel that calculates the histogram
//
// Launch layout: 1D grid of 1D blocks of any size. Each block accumulates a
// private 256-bin histogram in shared memory (1 KB static) and then merges it
// into the global histogram with atomic adds.
//   input  - device array of len bytes; each value selects a bin
//   output - device array of 256 counters; the kernel only adds to it, so the
//            caller must zero it before the launch
//   len    - number of input bytes
__global__
void pixelHisto(unsigned char *input, unsigned int *output, int len){
     __shared__ unsigned int privHisto[256];
     // Block-strided loop: every bin gets cleared even when the block has
     // fewer than 256 threads.
     for(int bin = threadIdx.x; bin < 256; bin += blockDim.x)
          privHisto[bin] = 0;
     __syncthreads();
     int i = blockDim.x * blockIdx.x + threadIdx.x;
     int stride = blockDim.x * gridDim.x;
     // Each thread walks the input in steps of the total thread count, so
     // any grid size covers the whole array.
     while(i<len){
          atomicAdd(&privHisto[input[i]], 1u);
          i += stride;
     }
     // All private counts must be final before any of them is merged.
     __syncthreads();
     for(int bin = threadIdx.x; bin < 256; bin += blockDim.x){
          unsigned int count = privHisto[bin];
          // Empty bins contribute nothing; skip the global atomic for them.
          if(count != 0)
               atomicAdd(&output[bin], count);
     }
}

//Fourth kernel that calculates the correct color of the image
//
// Launch layout: 1D grid of 1D blocks, one thread per element. Threads whose
// global index is >= len return before touching any array.
//   input  - device array of len bytes; each value indexes into cdf
//   output - device array of len floats receiving the remapped value / 255
//   cdf    - device array of 256 floats; cdf[0] is used as the minimum
//   len    - number of elements in input and output
// All arithmetic is done in single precision.
__global__
void equalizeImage(unsigned char *input, float *output, float *cdf, int len){
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     // Check bounds first: the grid may overshoot len and input[i] must not
     // be read for those threads.
     if(i >= len)
          return;
     unsigned char val = input[i];
     float cdfMin = cdf[0];
     float range = 1.0f - cdfMin;
     float ans;
     if(range > 0.0f)
          ans = 255.0f*(cdf[val] - cdfMin)/range;
     else
          // cdf[0] >= 1 makes the denominator zero (or negative); there is
          // nothing to stretch, so keep the value as it is.
          ans = (float) val;
     // Clamp before narrowing: converting an out-of-range float to
     // unsigned char is undefined (fmaxf also maps NaN to 0).
     ans = fminf(fmaxf(ans, 0.0f), 255.0f);
     unsigned char var = (unsigned char) ans;
     output[i] = (float) var/255.0f;
}


// Stops the program with a message when a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA failure (%s): %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Number of BLOCK_SIZE-thread blocks needed to cover count items (ceil-div).
static unsigned int blocksFor(int count) {
  return (unsigned int) ((count + BLOCK_SIZE - 1) / BLOCK_SIZE);
}

// Histogram-equalizes the image named by the first input argument.
//
// Pipeline: float -> byte conversion, RGB -> gray, gray histogram (all on the
// device), CDF on the host, then per-channel remapping on the device. The
// result is handed to wbSolution. Returns 0 on success, 1 when the image is
// empty or does not have 3 channels; CUDA failures terminate the process.
int main(int argc, char **argv) {
  wbArg_t args;
  int imageWidth;
  int imageHeight;
  int imageChannels;
  wbImage_t inputImage;
  wbImage_t outputImage;
  float *hostInput;
  float *hostOutput;
  float *deviceInput;
  float *deviceCDF;
  unsigned char *grayDeviceInput;
  unsigned char *charDeviceInput;
  unsigned int *deviceHisto;
  const char *inputImageFile;
  // Histogram and CDF are small and fixed-size, so they live on the stack.
  const int numBins = 256;
  unsigned int hostHisto[numBins];
  float cdf[numBins];

  args = wbArg_read(argc, argv); /* parse the input arguments */

  inputImageFile = wbArg_getInputFile(args, 0);

  wbTime_start(Generic, "Importing data and creating memory on host");
  inputImage = wbImport(inputImageFile);
  imageWidth = wbImage_getWidth(inputImage);
  imageHeight = wbImage_getHeight(inputImage);
  imageChannels = wbImage_getChannels(inputImage);
  outputImage = wbImage_new(imageWidth, imageHeight, imageChannels);
  wbTime_stop(Generic, "Importing data and creating memory on host");

  // Pixel count, and element count including the colour channels.
  int imageSize = imageHeight * imageWidth;
  int rgbImageSize = imageSize * imageChannels;

  // rgbToGray reads exactly 3 bytes per pixel, and an empty image would
  // divide by zero in the CDF and launch an empty grid.
  if (imageSize <= 0 || imageChannels != 3) {
    fprintf(stderr, "Expected a non-empty 3-channel image, got %dx%d with %d channels\n",
            imageWidth, imageHeight, imageChannels);
    return 1;
  }

  hostInput = wbImage_getData(inputImage);
  hostOutput = (float *) malloc(rgbImageSize * sizeof(float));
  if (hostOutput == NULL) {
    fprintf(stderr, "Out of host memory for the output image\n");
    return 1;
  }

  // Device buffers for the whole pipeline.
  checkCuda(cudaMalloc((void **) &deviceInput, rgbImageSize * sizeof(float)),
            "allocating float image");
  checkCuda(cudaMalloc((void **) &grayDeviceInput, imageSize * sizeof(unsigned char)),
            "allocating gray image");
  checkCuda(cudaMalloc((void **) &charDeviceInput, rgbImageSize * sizeof(unsigned char)),
            "allocating byte image");
  checkCuda(cudaMalloc((void **) &deviceHisto, numBins * sizeof(unsigned int)),
            "allocating histogram");
  checkCuda(cudaMalloc((void **) &deviceCDF, numBins * sizeof(float)),
            "allocating CDF");

  // cudaMalloc does not clear memory and pixelHisto only adds to the bins.
  checkCuda(cudaMemset(deviceHisto, 0, numBins * sizeof(unsigned int)),
            "zeroing histogram");

  checkCuda(cudaMemcpy(deviceInput, hostInput, rgbImageSize * sizeof(float),
                       cudaMemcpyHostToDevice),
            "copying image to device");

  // One thread per channel value for the element-wise kernels, one thread
  // per pixel for the gray conversion and the histogram.
  dim3 dimBlock(BLOCK_SIZE);
  dim3 rgbGrid(blocksFor(rgbImageSize));
  dim3 pixelGrid(blocksFor(imageSize));

  // Step 1: float image -> unsigned char image.
  floatToChar <<<rgbGrid, dimBlock>>> (deviceInput, charDeviceInput, rgbImageSize);
  checkCuda(cudaGetLastError(), "launching floatToChar");

  // Step 2: RGB bytes -> gray bytes.
  rgbToGray <<<pixelGrid, dimBlock>>> (charDeviceInput, grayDeviceInput, imageSize);
  checkCuda(cudaGetLastError(), "launching rgbToGray");

  // Step 3: histogram of the gray image.
  pixelHisto <<<pixelGrid, dimBlock>>> (grayDeviceInput, deviceHisto, imageSize);
  checkCuda(cudaGetLastError(), "launching pixelHisto");

  // Step 4: CDF on the host. The blocking copy also surfaces any fault
  // raised while the kernels above were running.
  checkCuda(cudaMemcpy(hostHisto, deviceHisto, numBins * sizeof(unsigned int),
                       cudaMemcpyDeviceToHost),
            "copying histogram to host");
  cdf[0] = (float) hostHisto[0]/imageSize;
  for(int i = 1; i < numBins; ++i){
       cdf[i] = cdf[i-1] + (float) hostHisto[i]/imageSize;
  }
  checkCuda(cudaMemcpy(deviceCDF, cdf, numBins * sizeof(float),
                       cudaMemcpyHostToDevice),
            "copying CDF to device");

  // Step 5: remap every channel value; the float buffer is reused for output.
  equalizeImage <<<rgbGrid, dimBlock>>> (charDeviceInput, deviceInput, deviceCDF, rgbImageSize);
  checkCuda(cudaGetLastError(), "launching equalizeImage");

  checkCuda(cudaMemcpy(hostOutput, deviceInput, rgbImageSize * sizeof(float),
                       cudaMemcpyDeviceToHost),
            "copying result to host");
  wbImage_setData(outputImage, hostOutput);
  wbSolution(args, outputImage);

  // NOTE(review): hostOutput is not freed here because it was handed to
  // outputImage via wbImage_setData; whether the image takes ownership is
  // not visible in this file - confirm against the wb library.
  checkCuda(cudaFree(deviceInput), "freeing float image");
  checkCuda(cudaFree(grayDeviceInput), "freeing gray image");
  checkCuda(cudaFree(charDeviceInput), "freeing byte image");
  checkCuda(cudaFree(deviceHisto), "freeing histogram");
  checkCuda(cudaFree(deviceCDF), "freeing CDF");

  return 0;
}