// Histogram Equalization #include #include #include #define HISTOGRAM_LENGTH 256 //@@ insert code here #define wbCheck(stmt) \ do { \ cudaError_t err = stmt; \ if (err != cudaSuccess) { \ wbLog(ERROR, "Failed to run stmt ", #stmt); \ wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \ return -1; \ } \ } while (0) __global__ void FloattoUChar(float *Pin, unsigned char *Pout, int imgsize){ unsigned int t = blockIdx.x*blockDim.x + threadIdx.x; if (t < imgsize){ Pout[t] = (unsigned char) ((HISTOGRAM_LENGTH - 1) * Pin[t]); } } __global__ void ColortoGray(unsigned char *Pin, unsigned char *Pout, int imgSize){ unsigned int t = blockIdx.x*blockDim.x + threadIdx.x; if(t < imgSize){ unsigned char red, grn, blu; red = Pin[t * 3]; grn = Pin[t * 3 + 1]; blu = Pin[t * 3 + 2]; Pout[t] = (unsigned char) ((0.21 * red) + (0.71 * grn) + (0.07 * blu)); } } __global__ void GraytoHist(unsigned char *Pin, unsigned int *Pout, int imgsize){ __shared__ unsigned int hist[HISTOGRAM_LENGTH]; unsigned int t = blockIdx.x*blockDim.x + threadIdx.x; if (threadIdx.x < HISTOGRAM_LENGTH){ // set histogram initial values to zero hist[threadIdx.x] = 0; } __syncthreads(); if (t < imgsize){ // feeding the histogram, Pin[t] always from 0 to 255, so hist[Pin[t]] matches atomicAdd(&hist[Pin[t]], 1); } __syncthreads(); if (threadIdx.x < HISTOGRAM_LENGTH){ // adding results from different block to the output atomicAdd(&Pout[threadIdx.x], hist[threadIdx.x]); } } __global__ void HistScanCDF(unsigned int *Pin, float *Pout, int imgsize){ __shared__ float Scan[HISTOGRAM_LENGTH]; unsigned int t = threadIdx.x; if (t < HISTOGRAM_LENGTH){ Scan[t] = Pin[t]; } __syncthreads(); for (unsigned int stride = 1; stride < blockDim.x; stride *= 2){ __syncthreads(); if (t >= stride) Scan[t] += Scan[t - stride]; } __syncthreads(); Pout[t] = Scan[t]/imgsize; } __global__ void Equalizer(unsigned char* Pin, float* Pout, float* CDF, int imgsize) { unsigned int t = blockIdx.x * blockDim.x + threadIdx.x; if(t < imgsize){ float val = (float) (255 * (CDF[Pin[t]] - CDF[0]) / (1.0 - CDF[0])) / (HISTOGRAM_LENGTH - 1.0); Pout[t] = (float) min(max(val, 0.0), 255.0); } } int main(int argc, char **argv) { wbArg_t args; int imageWidth; int imageHeight; int imageChannels; wbImage_t inputImage; wbImage_t outputImage; float *hostInputImageData; float *hostOutputImageData; const char *inputImageFile; float *deviceInput; unsigned char *deviceUChar; unsigned char *deviceGray; unsigned int *deviceHist; float *deviceCDF; float *deviceOutput; unsigned int *hostHist; unsigned char *hostGray; float *hostCDF; args = wbArg_read(argc, argv); /* parse the input arguments */ inputImageFile = wbArg_getInputFile(args, 0); //Import data and create memory on host inputImage = wbImport(inputImageFile); imageWidth = wbImage_getWidth(inputImage); imageHeight = wbImage_getHeight(inputImage); imageChannels = wbImage_getChannels(inputImage); outputImage = wbImage_new(imageWidth, imageHeight, imageChannels); hostInputImageData = wbImage_getData(inputImage); hostOutputImageData = wbImage_getData(outputImage); unsigned int imgsize = imageWidth*imageHeight*imageChannels; wbCheck(cudaMalloc((void**)&deviceInput, imgsize * sizeof(float))); wbCheck(cudaMalloc((void**)&deviceUChar, imgsize * sizeof(unsigned char))); wbCheck(cudaMalloc((void**)&deviceGray, imageWidth*imageHeight * sizeof(unsigned char))); wbCheck(cudaMalloc((void**)&deviceHist, HISTOGRAM_LENGTH * sizeof(unsigned int))); wbCheck(cudaMalloc((void**)&deviceCDF, HISTOGRAM_LENGTH * sizeof(float))); wbCheck(cudaMalloc((void**)&deviceOutput, imgsize * sizeof(float))); wbCheck(cudaMemcpy(deviceInput, hostInputImageData, imgsize * sizeof(float), cudaMemcpyHostToDevice)); wbCheck(cudaMemset((void *) deviceHist, 0, HISTOGRAM_LENGTH * sizeof(unsigned int))); wbCheck(cudaMemset((void *) deviceCDF, 0, HISTOGRAM_LENGTH * sizeof(float))); //memset(hostHist, 0, HISTOGRAM_LENGTH * sizeof(unsigned int)); hostHist = (unsigned int *)malloc(HISTOGRAM_LENGTH * sizeof(unsigned int)); hostGray = (unsigned char *)malloc(imageWidth*imageHeight * sizeof(unsigned char)); hostCDF = (float *)malloc(HISTOGRAM_LENGTH * sizeof(float)); dim3 DimGrid(((imageWidth*imageHeight*imageChannels) - 1) / HISTOGRAM_LENGTH + 1, 1, 1); dim3 DimGrid_CDF(1,1,1); dim3 DimBlock(HISTOGRAM_LENGTH, 1, 1); FloattoUChar <<>>(deviceInput, deviceUChar, imgsize); cudaDeviceSynchronize(); ColortoGray <<>>(deviceUChar, deviceGray, imageWidth*imageHeight); cudaDeviceSynchronize(); // wbCheck(cudaMemcpy(hostGray, deviceGray, imageWidth*imageHeight * sizeof(unsigned char), cudaMemcpyDeviceToHost)); // for (int i = 0; i < imageWidth*imageHeight; i++){ // printf("%hhu\n", hostGray[i]); // } GraytoHist <<>>(deviceGray, deviceHist, imageWidth*imageHeight); cudaDeviceSynchronize(); wbCheck(cudaMemcpy(hostHist, deviceHist, HISTOGRAM_LENGTH * sizeof(unsigned int), cudaMemcpyDeviceToHost)); // for (int i = 0; i < HISTOGRAM_LENGTH; i++){ // printf("%d\n", hostHist[i]); // } HistScanCDF <<>>(deviceHist, deviceCDF, imageWidth*imageHeight); cudaDeviceSynchronize(); // wbCheck(cudaMemcpy(hostCDF, deviceCDF, HISTOGRAM_LENGTH * sizeof(float), cudaMemcpyDeviceToHost)); // for (int i = 0; i < HISTOGRAM_LENGTH; i++){ // printf("%lf\n", hostCDF[i]); // } Equalizer <<>>(deviceUChar, deviceOutput, deviceCDF, imgsize); cudaDeviceSynchronize(); wbCheck(cudaMemcpy(hostOutputImageData, deviceOutput, imgsize * sizeof(float), cudaMemcpyDeviceToHost)); // for (int i = 0; i < imgsize; i++){ // printf("%lf\n", hostOutputImageData[i]); // } wbImage_setData(outputImage, hostOutputImageData); wbSolution(args, outputImage); cudaFree(deviceInput); cudaFree(deviceUChar); cudaFree(deviceGray); cudaFree(deviceHist); cudaFree(deviceCDF); cudaFree(deviceOutput); free(hostCDF); free(hostGray); free(hostHist); free(hostInputImageData); free(hostOutputImageData); return 0; }