Untitled

/*
 * Copyright 2011 Juan Gabriel Gomila Salas <joan-by@hotmail.com>
 *
 * All rights reserved.
 *
 *
 * Patent warning:
 *
 * This file implement one algorithm possibly linked to the patent
 *
 * # N. Moroney, R.G. Beausoleil and I. Sobel, Local Color Correction,
 * US Patent 6,822,762, (November 23, 2004)
 *
 * This file is made available for the exclusive aim of serving as
 * scientific tool to verify the soundness and completeness of the
 * algorithm description. Compilation, execution and redistribution
 * of this file may violate patents rights in certain countries.
 * The situation being different for every country and changing
 * over time, it is your responsibility to determine which patent
 * rights restrictions apply to you before you compile, use,
 * modify, or redistribute this file. A patent lawyer is qualified
 * to make this determination.
 * If and only if they don't conflict with any patent terms, you
 * can benefit from the following license terms attached to this
 * file.
 *
 * License:
 *
 * This program is provided for scientific and educational only:
 * you can use and/or modify it for these purposes, but you are
 * not allowed to redistribute this work or derivative works in
 * source or executable form. A license must be obtained from the
 * patent right holders for any other use.
 *
 *
 */

/**
 * @file localcolorcorrection.c
 * @brief Local Color Correction algorithm
 *
 * Parameters:
 *
 * input image
 * output image
 *
 * Read/write operations (png format) make use
 * of io_png.c and io_png.h, by Nicolas Limare
 *
 * Conversion operations between color spaces, make use
 * colorspace library, by Pascal Getreuer
 *
 * @author Juan Gabriel Gomila Salas <joan-by@hotmail.com>
 *
 */

/**
 * @mainpage Local Color Correction
 *
 * README.txt:
 * @verbinclude README.txt
 */

/**
 * @file   localcolorcorrection.c
 * @brief  Main executable file
 *
 *
 * @author Juan Gabriel Gomila Salas <joan-by@hotmail.com>
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "structs.h"
#include "io_png.h"
#include <limits.h>
#include <xmmintrin.h>  // SSE
#include <pmmintrin.h>  // SSE
#include <assert.h>
#include <omp.h>


#define MAX(X, Y) (((X) > (Y)) ? (X) : (Y))
#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
/**
 * @brief main function call
 */
void *contrast_enhance ( pixel *input, int w, int h )
{
    int   radius, n, nx, ny, t;
    float *Mask, *Mask2, *Kernel;
    float sum;

    pixel * output = (pixel *) malloc ( sizeof ( pixel ) * w * h );
    radius = 40;

    //I     = (float *) malloc ( w * h * sizeof ( float ) );
    Mask  = (float *) malloc ( w * h * sizeof ( float ) );
    Mask2 = (float *) malloc ( w * h * sizeof ( float ) );
    /*Create convolution kernel*/
    Kernel = (float *) malloc ( ( 2 * radius + 1 ) * sizeof ( float ) );
/*
    for ( n = 0; n < w * h; n++ ) {
        I[n] = ( ( input[n].R / 255.0 ) + ( input[n].G / 255.0 ) + ( input[n].B / 255.0 ) ) / 3.0f;
    }

    for ( n = 0; n < w * h; n++ ) {
        Mask[n]  = I[n];
        Mask2[n] = Mask[n];
    }*/

    /*#pragma omp parallel private(step,start,cpu,stop) shared(w,h,input,Mask,Mask2)
    {
      //step = (w*h) / omp_get_num_threads ();
      //cpu = omp_get_thread_num ();
      //stop = min ( ( cpu + 1 ) * step, (w*h));
      //start = cpu * step;
      #pragma omp for private(n)
      for ( n = 0; n < w*h; n++) {
        Mask[n] = ( ( input[n].R / 255.0 ) + ( input[n].G / 255.0 ) + ( input[n].B / 255.0 ) ) / 3.0f;
        Mask2[n]  = Mask[n];
    }
  }*/
  #pragma omp parallel for private(n) shared(w,h,input,Mask,Mask2)
    for ( n = 0; n < w*h; n++) {
      Mask[n] = ( ( input[n].R / 255.0 ) + ( input[n].G / 255.0 ) + ( input[n].B / 255.0 ) ) / 3.0f;
      Mask2[n]  = Mask[n];
  }


  int step, cpu, stop, start;
/*  #pragma omp parallel default(none) shared(input, Mask, Mask2, h, w) private(start, stop, cpu, step, n)
  {
    step = (w*h) / omp_get_num_threads();
    cpu = omp_get_thread_num();
    printf("Thread number %i\n", cpu);
    stop = MIN ((cpu+1)*step, w*h);
    start = cpu*step;
      for ( n = start; n < stop; n++) {
        Mask[n] = ( ( input[n].R / 255.0 ) + ( input[n].G / 255.0 ) + ( input[n].B / 255.0 ) ) / 3.0f;
        Mask2[n]  = Mask[n];
    }
  }*/

    if ( radius > 0 ) {
        #pragma omp parallel for default(none) private(n) shared(radius, Kernel)
        for ( n = 0; n < 2 * radius + 1; n++ ) {
            Kernel[n] =
                (float) exp ( (double) ( -2.0f * ( -radius + n ) * ( -radius + n ) / ( radius * radius ) ) );
        }

        sum = 0.0f;
        //#pragma omp parallel for default(none) reduction(+:sum) shared(radius, Kernel)
        for ( n = 0; n < 2 * radius + 1; n++ ) {
                   sum = sum + Kernel[n];
        }
        #pragma omp parallel for default(none) private(n) shared(radius, Kernel, sum)
        for ( n = 0; n < 2 * radius + 1; n++ ) {
            Kernel[n] = (float) ( Kernel[n] / sum );
        }

        /*If the radius of the mask is less than the mid-size of the image,
           we perform the desired algorithm*/
        if ( radius < w / 2 && radius < h / 2 ) {
          /*Convolve in the x direction*/
        /*  #pragma omp parallel for default(none) shared(radius,w,h,Mask,Kernel,Mask2) private(ny,t, sum)
            for ( nx = 0; nx < w; nx++ ) {
                for ( ny = 0; ny < h; ny++ ) {
                    sum = 0.0f;
                    for ( t = -radius; t <= radius; t++ ) {
                        if ( nx + t >= w ) {
                            sum = sum + Mask[2 * w - 2 - nx - t + ny * w] * Kernel[t + radius];
                        }
                        else{
                            sum = sum + Mask[abs ( nx + t ) + ny * w] * Kernel[t + radius];
                        }
                    }
                    Mask2[nx + ny * w] = (float) sum;
                }
            }*/

/*
    //WITHOUT ABS()
 #define MAX(X, Y) (((X) > (Y)) ? (X) : (Y))
 #define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
     int limit, limit2;
     for ( nx = 0; nx < w; nx++ ) {
         for ( ny = 0; ny < h; ny++ ) {
             sum = 0.0f;
             for ( t = w-nx; t <= radius; t++ ) {
                sum = sum + Mask[2 * w - 2 - nx - t + ny * w] * Kernel[t + radius];
             }
             limit2 = MAX(-nx, -radius-1);
             for ( t = -radius; t <= limit2; t++ ) {
                 sum = sum + Mask[ -(nx + t ) + ny * w] * Kernel[t + radius];
             }
             limit = MIN(w-nx-1, radius);
             for ( t = limit2+1; t <= limit; t++ ) {
                 sum = sum + Mask[( nx + t ) + ny * w] * Kernel[t + radius];
             }
             Mask2[nx + ny * w] = (float) sum;
         }
     }
*/
     //WITH OpenMP
    /* int limit, limit2;
     #pragma omp parallel for private(t, ny, limit, limit2, sum) shared(w,h,Mask,Mask2,Kernel, radius) //collapse(2)
     for ( nx = 0; nx < w; nx++ ) {
       for ( ny = 0; ny < h; ny++ ) {
         sum = 0.0f;
         for ( t = w-nx; t <= radius; t++ ) {
           sum = sum + Mask[2 * w - 2 - nx - t + ny * w] * Kernel[t + radius];
         }
         limit2 = MAX(-nx, -radius-1);
         for ( t = -radius; t <= limit2; t++ ) {
           sum = sum + Mask[ -(nx + t ) + ny * w] * Kernel[t + radius];
         }
         limit = MIN(w-nx-1, radius);
         for ( t = limit2+1; t <= limit; t++ ) {
           sum = sum + Mask[( nx + t ) + ny * w] * Kernel[t + radius];
         }
         Mask2[nx + ny * w] = (float) sum;
       }
     }*/

     int limit, limit2;
     //#pragma omp parallel private(t, ny, limit, limit2, sum) shared(w,h,Mask,Mask2,Kernel, radius) //collapse(2)
     #pragma omp parallel private(t, ny, nx, limit, limit2, sum, start,step, cpu, stop) shared(w,h,Mask,Mask2,Kernel,radius) //collapse(2)
     {
       step = (w) / omp_get_num_threads();
       cpu = omp_get_thread_num();
       stop = MIN ((cpu+1)*step, w);
       start = cpu*step;
       for ( nx = start; nx < stop; nx++ ) {
         for ( ny = 0; ny < h; ny++ ) {
           sum = 0.0f;
           for ( t = w-nx; t <= radius; t++ ) {
             sum = sum + Mask[2 * w - 2 - nx - t + ny * w] * Kernel[t + radius];
           }
           limit2 = MAX(-nx, -radius-1);
           for ( t = -radius; t <= limit2; t++ ) {
             sum = sum + Mask[ -(nx + t ) + ny * w] * Kernel[t + radius];
           }
           limit = MIN(w-nx-1, radius);
           for ( t = limit2+1; t <= limit; t++ ) {
             sum = sum + Mask[( nx + t ) + ny * w] * Kernel[t + radius];
           }
           Mask2[nx + ny * w] = (float) sum;
         }
       }
     }
     /*int limit, limit2;
     #pragma omp parallel for private(t, ny, limit, limit2) shared(w,h,Mask,Mask2,Kernel, radius,start, stop, cpu, step) //collapse(2)
     for ( nx = 0; nx < w; nx++ ) {
     #pragma omp parallel default(none) shared(Kernel, Mask, Mask2, h, w, nx, radius) private(start, stop, cpu, step, ny, sum,t, limit, limit2)
     {
       step = (h) / omp_get_num_threads();
       cpu = omp_get_thread_num();
       printf("Thread number %i\n", cpu);
       stop = MIN ((cpu+1)*step, h);
       start = cpu*step;
       for ( ny = start; ny < stop; ny++ ) {
         sum = 0.0f;
         for ( t = w-nx; t <= radius; t++ ) {
           sum = sum + Mask[2 * w - 2 - nx - t + ny * w] * Kernel[t + radius];
         }
         limit2 = MAX(-nx, -radius-1);
         for ( t = -radius; t <= limit2; t++ ) {
           sum = sum + Mask[ -(nx + t ) + ny * w] * Kernel[t + radius];
         }
         limit = MIN(w-nx-1, radius);
         for ( t = limit2+1; t <= limit; t++ ) {
           sum = sum + Mask[( nx + t ) + ny * w] * Kernel[t + radius];
         }
         Mask2[nx + ny * w] = (float) sum;
       }
      }
    }*/


/*#define MAX(X, Y) (((X) > (Y)) ? (X) : (Y))
#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
int limit, limit2;
#pragma omp parallel for default(none) shared(w) private(nx)
for ( nx = 0; nx < w; nx++ ) {
  #pragma omp parallel for default(none) shared(h, w, radius, Mask2, nx) private(ny, sum, limit, limit2)
  for ( ny = 0; ny < h; ny++ ) {
    sum = 0.0f;
    #pragma omp parallel for reduction(+:sum) default(none) shared(radius, nx, ny, Mask, Kernel, w) private(t)
    for ( t = w-nx; t <= radius; t++ ) {
      sum = sum + Mask[2 * w - 2 - nx - t + ny * w] * Kernel[t + radius];
    }
    limit2 = MAX(-nx, -radius-1);
    #pragma omp parallel for reduction(+:sum) default(none) shared(radius, nx, ny, Mask, Kernel, limit2, w) private(t)
    for ( t = -radius; t <= limit2; t++ ) {
      sum = sum + Mask[ -(nx + t ) + ny * w] * Kernel[t + radius];
    }
    limit = MIN(w-nx-1, radius);
    #pragma omp parallel for reduction(+:sum) default(none) shared(radius, nx, ny, Mask, Kernel, limit2, limit, w) private(t)
    for ( t = limit2+1; t <= limit; t++ ) {
      sum = sum + Mask[( nx + t ) + ny * w] * Kernel[t + radius];
    }
    Mask2[nx + ny * w] = (float) sum;
  }
}/*
            /*Convolve in the y direction*/
            /*#pragma omp parallel for default(none) shared(radius,w,h,Mask,Kernel,Mask2) private(nx,t, sum)
            for ( ny = 0; ny < h; ny++ ) {
                for ( nx = 0; nx < w; nx++ ) {
                    sum = 0.0f;
                    for ( t = -radius; t <= radius; t++ ) {
                        if ( ny + t >= h ) {
                            sum = sum + Mask2[nx + ( 2 * h - 2 - ny - t ) * w] *
                                  Kernel[t + radius];
                        }
                        else{
                            sum = sum + Mask2[nx + abs ( ny + t ) * w] *
                                  Kernel[t + radius];
                        }
                    }
                    Mask[nx + ny * w] = (float) sum;
                }
            }*/
            /*#pragma omp parallel for default(none) private(t, nx, limit, limit2, sum) shared(w,h,Mask,Mask2,Kernel, radius) // collapse(2)
            for ( ny = 0; ny < h; ny++ ) {
                for ( nx = 0; nx < w; nx++ ) {
                    sum = 0.0f;
                    for ( t = h-ny; t <= radius; t++ ) {
                      sum = sum + Mask2[nx + ( 2 * h - 2 - ny - t ) * w] * Kernel[t + radius];
                    }
                    limit2 = MAX(-ny, -radius-1);
                    for ( t = -radius; t <= limit2; t++ ) {
                      sum = sum + Mask2[nx - ( ny + t ) * w] * Kernel[t + radius];
                    }
                    limit = MIN(h-ny-1, radius);
                    for ( t = limit2+1; t <= limit; t++ ){
                      sum = sum + Mask2[nx + ( ny + t ) * w] * Kernel[t + radius];
                    }
                    Mask[nx + ny * w] = (float) sum;
                }
            }*/

            #pragma omp parallel private(t, ny, nx, limit, limit2, sum, start,step, cpu, stop) shared(w,h,Mask,Mask2,Kernel,radius) //collapse(2)
            {
              step = (h) / omp_get_num_threads();
              cpu = omp_get_thread_num();
              stop = MIN ((cpu+1)*step, h);
              start = cpu*step;
              for ( ny = start; ny < stop; ny++ ) {
                  for ( nx = 0; nx < w; nx++ ) {
                      sum = 0.0f;
                      for ( t = h-ny; t <= radius; t++ ) {
                        sum = sum + Mask2[nx + ( 2 * h - 2 - ny - t ) * w] * Kernel[t + radius];
                      }
                      limit2 = MAX(-ny, -radius-1);
                      for ( t = -radius; t <= limit2; t++ ) {
                        sum = sum + Mask2[nx - ( ny + t ) * w] * Kernel[t + radius];
                      }
                      limit = MIN(h-ny-1, radius);
                      for ( t = limit2+1; t <= limit; t++ ){
                        sum = sum + Mask2[nx + ( ny + t ) * w] * Kernel[t + radius];
                      }
                      Mask[nx + ny * w] = (float) sum;
                  }
              }
            }

        }
        else {
            /*Otherwise, perform a simple gamma correction*/
            //#pragma omp parallel for default(none) private(nx) shared(Mask, h, w) reduction(+:sum) collapse(2)
            for ( ny = 0; ny < h; ny++ ) {
                for ( nx = 0; nx < w; nx++ ) {
                    sum = sum + Mask[nx + ny * w];
                }
            }
            sum = sum / ( w * h );
            #pragma omp parallel for default(none) private(nx) shared(Mask, h, w, sum)  collapse(2)
            for ( ny = 0; ny < h; ny++ ) {
                for ( nx = 0; nx < w; nx++ ) {
                    Mask[nx + ny * w] = sum;
                }
            }
        }
    }

  /*  #pragma omp parallel for default(none) private(n) shared(Mask, output, input, h, w)
    for ( n = 0; n < w * h; n++ ) {
        Mask[n] = (float) ( 2 * Mask[n] - 1 );
        Mask[n] = (float) pow ( 2.0, (double) Mask[n] );

        output[n].R = min ( 255, 255 * (float) pow ( (double) input[n].R / 255.0, (double) Mask[n] ) );
        output[n].G = min ( 255, 255 * (float) pow ( (double) ( input[n].G / 255.0 ), (double) Mask[n] ) );
        output[n].B = min ( 255, 255 * (float) pow ( (double) ( input[n].B / 255.0 ), (double) Mask[n] ) );
    }*/
    #pragma omp parallel default(none) private(n,start,step, cpu, stop) shared(Mask, output, input, h, w)
    {
      step = (h*w) / omp_get_num_threads();
      cpu = omp_get_thread_num();
      stop = MIN ((cpu+1)*step, h*w);
      start = cpu*step;
      for ( n = start; n < stop; n++ ) {
          Mask[n] = (float) ( 2 * Mask[n] - 1 );
          Mask[n] = (float) pow ( 2.0, (double) Mask[n] );

          output[n].R = min ( 255, 255 * (float) pow ( (double) input[n].R / 255.0, (double) Mask[n] ) );
          output[n].G = min ( 255, 255 * (float) pow ( (double) ( input[n].G / 255.0 ), (double) Mask[n] ) );
          output[n].B = min ( 255, 255 * (float) pow ( (double) ( input[n].B / 255.0 ), (double) Mask[n] ) );
      }
    }
    //free ( I );
    free ( Mask );
    free ( Mask2 );
    free ( Kernel );

    return output;
}