Advertisement
Guest User

Untitled

a guest
Mar 22nd, 2017
68
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 15.63 KB | None | 0 0
  1.   /* Test and timing harness program for developing a multichannel
  2.      multikernel convolution (as used in deep learning networks)
  3.  
  4.      Note there are some simplifications around this implementation,
  5.      in particular with respect to computing the convolution at edge
  6.      pixels of the image.
  7.  
  8.      Author: David Gregg
  9.      Date:   February 2017
  10.  
  11.  
  12.      Version 1.4 : Modified the random generator to reduce the range
  13.                    of generated values;
  14.                    Changed the summation in the checking code from
  15.                    float to double to try to bring the checked value
  16.                    closer to the "true" value
  17.  
  18.      Version 1.3 : Fixed which loop variables were being incremented
  19.                    in write_out();
  20.                    Fixed dimensions of output and control_output
  21.                    matrices in main function
  22.  
  23.      Version 1.2 : Changed distribution of test data to (hopefully)
  24.                    eliminate random walk of floating point error;
  25.                    Also introduced checks to restrict kernel-order to
  26.                    a small set of values
  27.  
  28.      Version 1.1 : Fixed bug in code to create 4d matrix
  29.   */
  30.  
  31.   #include <stdio.h>
  32.   #include <stdlib.h>
  33.   #include <sys/time.h>
  34.   #include <assert.h>
  35.   #include <omp.h>
  36.   #include <math.h>
  37.   #include <x86intrin.h>
  38.  
  39.   /* the following two definitions of DEBUGGING control whether or not
  40.      debugging information is written out. To put the program into
  41.      debugging mode, uncomment the following line: */
  42.   /*#define DEBUGGING(_x) _x */
  43.   /* to stop the printing of debugging information, use the following line: */
  44.   #define DEBUGGING(_x)
  45.   #define ZERO_VALUE  0
  46.  
  47.  
  48.   /* write 3d matrix to stdout */
  49.      void write_out(float *** a, int dim0, int dim1, int dim2)
  50.      {
  51.       int i, j, k;
  52.  
  53.       for ( i = 0; i < dim0; i++ ) {
  54.         printf("Outer dimension number %d\n", i);
  55.         for ( j = 0; j < dim1; j++ ) {
  56.           for ( k = 0; k < dim2 - 1; k++ ) {
  57.             printf("%f, ", a[i][j][k]);
  58.           }
  59.         // print end of line
  60.           printf("%f\n", a[i][j][dim2-1]);
  61.         }
  62.       }
  63.     }
  64.  
  65.  
  66.   /* create new empty 4d matrix */
  67.     float **** new_empty_4d_matrix(int dim0, int dim1, int dim2, int dim3)
  68.     {
  69.       float **** result = malloc(dim0 * sizeof(float***));
  70.       float *** mat1 = malloc(dim0 * dim1 * sizeof(float**));
  71.       float ** mat2 = malloc(dim0 * dim1 * dim2 * sizeof(float*));
  72.       float * mat3 = malloc(dim0 * dim1 * dim2 *dim3 * sizeof(float));
  73.       int i, j, k;
  74.  
  75.  
  76.       for ( i = 0; i < dim0; i++ ) {
  77.         result[i] = &(mat1[i*dim1]);
  78.         for ( j = 0; j < dim1; j++ ) {
  79.           result[i][j] = &(mat2[i*dim1*dim2 + j*dim2]);
  80.           for ( k = 0; k < dim2; k++ ) {
  81.             result[i][j][k] = &(mat3[i*dim1*dim2*dim3+j*dim2*dim3+k*dim3]);
  82.           }
  83.         }
  84.       }
  85.  
  86.       return result;
  87.     }
  88.  
  89.   /* create new empty 3d matrix */
  90.     float *** new_empty_3d_matrix(int dim0, int dim1, int dim2)
  91.     {
  92.       float **** mat4d;
  93.       float *** mat3d;
  94.  
  95.     // create a 4d matrix with single first dimension
  96.       mat4d = new_empty_4d_matrix(1, dim0, dim1, dim2);
  97.     // now throw away out first dimension
  98.       mat3d = mat4d[0];
  99.       free(mat4d);
  100.       return mat3d;
  101.     }
  102.  
  103.   /* take a copy of the matrix asnd return in a newly allocated matrix */
  104.     float **** copy_4d_matrix(float **** source_matrix, int dim0,
  105.       int dim1, int dim2, int dim3)
  106.     {
  107.       int i, j, k, l;
  108.       float **** result = new_empty_4d_matrix(dim0, dim1, dim2, dim3);
  109.  
  110.       for ( i = 0; i < dim0; i++ ) {
  111.         for ( j = 0; j < dim1; j++ ) {
  112.           for ( k = 0; k < dim2; k++ ) {
  113.             for ( l = 0; l < dim3; l++ ) {
  114.               result[i][j][k][l] = source_matrix[i][j][k][l];
  115.             }
  116.           }
  117.         }
  118.       }
  119.       return result;
  120.     }
  121.  
  122.   /* create a matrix and fill it with random numbers */
  123.     float **** gen_random_4d_matrix(int dim0, int dim1, int dim2, int dim3)
  124.     {
  125.       float **** result;
  126.       int i, j, k, l;
  127.       struct timeval seedtime;
  128.       int seed;
  129.  
  130.       result = new_empty_4d_matrix(dim0, dim1, dim2, dim3);
  131.  
  132.     /* use the microsecond part of the current time as a pseudorandom seed */
  133.       gettimeofday(&seedtime, NULL);
  134.       seed = seedtime.tv_usec;
  135.       srandom(seed);
  136.  
  137.     /* fill the matrix with random numbers */
  138.     const int range = 1 << 12; // 2^12
  139.     const int bias = 1 << 16; // 2^16
  140.     float offset = 0.0;
  141.     for ( i = 0; i < dim0; i++ ) {
  142.       for ( j = 0; j < dim1; j++ ) {
  143.         for ( k = 0; k < dim2; k++ ) {
  144.           for ( l = 0; l < dim3; l++ ) {
  145.             // generate uniform random integer with mean of zero
  146.             long long rand = random();
  147.             // now cut down the range and bias the mean to reduce
  148.             // the likelihood of large floating point round-off errors
  149.             int reduced_range = (rand % range);
  150.             float num = (((float) reduced_range) / ((float) bias))+offset;
  151.             result[i][j][k][l] = num;
  152.           }
  153.         }
  154.       }
  155.     }
  156.  
  157.     return result;
  158.   }
  159.  
  160.   /* create a matrix and fill it with random numbers */
  161.   float *** gen_random_3d_matrix(int dim0, int dim1, int dim2)
  162.   {
  163.     float **** mat4d;
  164.     float *** mat3d;
  165.  
  166.     // create a 4d matrix with single first dimension
  167.     mat4d = gen_random_4d_matrix(1, dim0, dim1, dim2);
  168.     // now throw away out first dimension
  169.     mat3d = mat4d[0];
  170.     free(mat4d);
  171.     return mat3d;
  172.   }
  173.  
  174.   /* check the sum of absolute differences is within reasonable epsilon */
  175.   void check_result(float *** result, float *** control,
  176.     int dim0, int dim1, int dim2)
  177.   {
  178.     int i, j, k;
  179.     double sum_abs_diff = 0.0;
  180.     const double EPSILON = 0.0625;
  181.  
  182.     //printf("SAD\n");
  183.    
  184.     for ( i = 0; i < dim0; i++ ) {
  185.       for ( j = 0; j < dim1; j++ ) {
  186.         for ( k = 0; k < dim2; k++ ) {
  187.           double diff = fabs(control[i][j][k] - result[i][j][k]);
  188.           assert( diff >= 0.0 );
  189.           sum_abs_diff = sum_abs_diff + diff;
  190.         }
  191.       }
  192.     }
  193.  
  194.     if ( sum_abs_diff > EPSILON ) {
  195.       fprintf(stderr, "WARNING: sum of absolute differences (%f) > EPSILON (%f)\n",
  196.         sum_abs_diff, EPSILON);
  197.     }
  198.     else {
  199.       printf("COMMENT: sum of absolute differences (%f)  within acceptable range (%f)\n", sum_abs_diff, EPSILON);
  200.     }
  201.   }
  202.  
  203.   /* the slow but correct version of matmul written by David */
  204.   void multichannel_conv(float *** image, float **** kernels, float *** output,
  205.    int width, int height, int nchannels, int nkernels,
  206.    int kernel_order)
  207.   {
  208.     int h, w, x, y, c, m;
  209.     //#pragma omp parallel for private(m,w,h,c,x,y) collapse(3)
  210.     for ( m = 0; m < nkernels; m++ ) { //cycle through each kernel
  211.       for ( w = 0; w < width; w++ ) { //cycle through each x cooridinate of the image
  212.         for ( h = 0; h < height; h++ ) { //cycle through each y coordinate of the image
  213.           double sum = 0.0;
  214.           for ( c = 0; c < nchannels; c++ ) { //cycle through each channel of the image and the kernel
  215.             for ( x = 0; x < kernel_order; x++) { //cycle through each x coordinate of the kernel
  216.               for ( y = 0; y < kernel_order; y++ ) { //cycle through each y coordinate of the kernel
  217.                 sum += image[w+x][h+y][c] * kernels[m][c][x][y];
  218.               }
  219.             }
  220.           }
  221.           output[m][w][h] = sum;
  222.         }
  223.       }
  224.     }
  225.   }
  226.  
  227.  
  228.   /* the fast version of matmul written by the team */
  229.   void team_conv(float *** image, float **** kernels, float *** output,
  230.    int width, int height, int nchannels, int nkernels,
  231.    int kernel_order)
  232.   {
  233.     int h, w, x, y, c, m;
  234.     __m128 imageVector = _mm_setzero_ps(); //initialise imageVector
  235.     __m128 kernelVector = _mm_setzero_ps(); //initialise kernelVector
  236.     __m128 productVector = _mm_setzero_ps(); //initialise productVector
  237.     __m128 sumVector = _mm_setzero_ps(); //initialise sumVector
  238.  
  239.         float **** myReversedKernel = new_empty_4d_matrix(nkernels,kernel_order,kernel_order,nchannels);
  240.         for ( m = 0; m < nkernels; m++ ) { //cycle through each kernel
  241.           for(x = 0; x<kernel_order; x++){
  242.             for(y=0; y<kernel_order; y++){
  243.               for(c=0; c<nchannels; c++){
  244.                 //creates reordered kernel so that channels can be accessed in innermost loop
  245.                 myReversedKernel[m][x][y][c] = kernels[m][c][x][y];
  246.               }
  247.             }
  248.           }
  249.         }
  250.  
  251.     if(kernel_order == 1){
  252.  
  253.     //parallelizes three outermost for loops
  254.     #pragma omp parallel for private(m,w,h,c,imageVector,kernelVector,productVector,sumVector) collapse(3)
  255.     for ( m = 0; m < nkernels; m++ ) { //cycle through each kernel
  256.       for ( w = 0; w < width; w++ ) { //cycle through each x cooridinate of the image
  257.         for ( h = 0; h < height; h++ ) { //cycle through each y coordinate of the image
  258.           double sum = 0.0;
  259.           for ( c = 0; c < nchannels-3; c = c + 4 ) { //cycle through each channel of the image and the kernel
  260.             imageVector = _mm_loadu_ps(&image[w][h][c]); //loads 4 different channels of an image pixel into vector
  261.             kernelVector = _mm_loadu_ps(&myReversedKernel[m][0][0][c]); //loads 4 different channels of a kernel pixel into vector
  262.             productVector = _mm_mul_ps(imageVector,kernelVector); //multiplies two vectors together
  263.             sumVector = _mm_add_ps(sumVector, productVector); //adds two vectors together
  264.           }
  265.  
  266.           //if number of channels remaining is not divisible by 4
  267.           for (; c < nchannels; c++){
  268.             //horizontally sum channel value of remaining pixel as this cannot be loaded into vector
  269.             sum += image[w][h][c] * myReversedKernel[m][0][0][c];
  270.           }
  271.           //horizontally adds sumVector (adjacent elements in the same operand are added together)
  272.           sumVector = _mm_hadd_ps(sumVector,sumVector);
  273.           sumVector = _mm_hadd_ps(sumVector,sumVector);
  274.          
  275.           sum += _mm_cvtss_f32(sumVector); //extracts the lower order floating point value from sumVector
  276.           output[m][w][h] = sum; //stores sum in output array
  277.           sumVector = _mm_setzero_ps(); //set sumVector equal to zero
  278.         }
  279.       }
  280.     }
  281.   }else{ //when kernel_order is 3, 5 or 7
  282.         imageVector = _mm_setzero_ps(); //set imageVector equal to zero
  283.         kernelVector = _mm_setzero_ps(); //set kernelVector equal to zero
  284.         productVector = _mm_setzero_ps(); //set productVector equal to zero
  285.         sumVector = _mm_setzero_ps(); //set sumVector equal to zero
  286.  
  287.       //parallelizes three outermost for loops
  288.       #pragma omp parallel for private(m,w,h,c,x,y,imageVector,kernelVector,productVector,sumVector) collapse(3)
  289.         for ( m = 0; m < nkernels; m++ ) { //cycle through each kernel
  290.           for ( w = 0; w < width; w++ ) { //cycle through each x cooridinate of the image
  291.             for ( h = 0; h < height; h++ ) { //cycle through each y coordinate of the image
  292.               double sum = 0.0;
  293.               //cycle through each channel of the image and the kernel
  294.                 for ( x = 0; x < kernel_order; x++) { //cycle through each x coordinate of the kernel
  295.                   for ( y = 0; y < kernel_order; y++ ) { //cycle through each y coordinate of the kernel
  296.                     for ( c = 0; c < nchannels-3; c = c+4 ) {
  297.                     imageVector = _mm_load_ps(&image[w+x][h+y][c]); //loads 4 different channels of an image pixel into vector
  298.                     kernelVector = _mm_load_ps(&myReversedKernel[m][x][y][c]); //loads 4 different channels of a kernel pixel into vector
  299.                     productVector = _mm_mul_ps(imageVector,kernelVector); //multiplies two vectors together
  300.                     sumVector = _mm_add_ps(sumVector, productVector);  //adds two vectors together
  301.                     //sum += image[w+x][h+y][c] * kernels[m][c][x][y];
  302.                   }
  303.                 }
  304.         //if number of channels remaining is not divisible by 4
  305.           for (; c < nchannels; c++){
  306.             for(x=0; x<kernel_order; x++){
  307.               for(y=0; y<kernel_order; y++){
  308.             //horizontally sum channel value of remaining pixel as this cannot be loaded into vector
  309.             sum += image[w+x][h+y][c] * myReversedKernel[m][x][y][c];
  310.           }
  311.         }
  312.       }  
  313.  
  314.           //horizontally adds sumVector (adjacent elements in the same operand are added together)
  315.           sumVector = _mm_hadd_ps(sumVector,sumVector);
  316.           sumVector = _mm_hadd_ps(sumVector,sumVector);
  317.          
  318.           sum += _mm_cvtss_f32(sumVector); //extracts the lower order floating point value from sumVector
  319.           output[m][w][h] = sum; //stores sum in output array
  320.           sumVector = _mm_setzero_ps(); //set sumVector to zero
  321.               }
  322.             }
  323.           }
  324.         }
  325.       }
  326.     }
  327.  
  328.  
  329.     int main(int argc, char ** argv)
  330.     {
  331.     //float image[W][H][C];
  332.     //float kernels[M][C][K][K];
  333.     //float output[M][W][H];
  334.  
  335.       float *** image, **** kernels, *** output;
  336.       float *** control_output;
  337.       long long mul_time_my_team, mul_time_david_gregg;
  338.       int width, height, kernel_order, nchannels, nkernels;
  339.       struct timeval start_time, start_time2;
  340.       struct timeval stop_time, stop_time2;
  341.  
  342.       if ( argc != 6 ) {
  343.         fprintf(stderr, "Usage: conv-harness <image_width> <image_height> <kernel_order> <number of channels> <number of kernels>\n");
  344.         exit(1);
  345.       }
  346.       else {
  347.         width = atoi(argv[1]);
  348.         height = atoi(argv[2]);
  349.         kernel_order = atoi(argv[3]);
  350.         nchannels = atoi(argv[4]);
  351.         nkernels = atoi(argv[5]);
  352.       }
  353.       switch ( kernel_order ) {
  354.         case 1:
  355.         case 3:
  356.         case 5:
  357.         case 7: break;
  358.         default:
  359.         fprintf(stderr, "FATAL: kernel_order must be 1, 3, 5 or 7, not %d\n",
  360.           kernel_order);
  361.         exit(1);
  362.       }
  363.  
  364.     /* allocate the matrices */
  365.       image = gen_random_3d_matrix(width+kernel_order, height + kernel_order,
  366.        nchannels);
  367.       kernels = gen_random_4d_matrix(nkernels, nchannels, kernel_order, kernel_order);
  368.       output = new_empty_3d_matrix(nkernels, width, height);
  369.       control_output = new_empty_3d_matrix(nkernels, width, height);
  370.  
  371.     //DEBUGGING(write_out(A, a_dim1, a_dim2));
  372.  
  373.     /* use a simple multichannel convolution routine to produce control result */
  374.       gettimeofday(&start_time2, NULL);
  375.  
  376.       multichannel_conv(image, kernels, control_output, width,
  377.         height, nchannels, nkernels, kernel_order);
  378.  
  379.       gettimeofday(&stop_time2, NULL);
  380.  
  381.       mul_time_david_gregg = (stop_time2.tv_sec - start_time2.tv_sec) * 1000000L +
  382.       (stop_time2.tv_usec - start_time2.tv_usec);
  383.  
  384.  
  385.     /* record starting time of team's code*/
  386.       gettimeofday(&start_time, NULL);
  387.  
  388.     /* perform student team's multichannel convolution */
  389.       team_conv(image, kernels, output, width,
  390.         height, nchannels, nkernels, kernel_order);
  391.  
  392.     /* record finishing time */
  393.       gettimeofday(&stop_time, NULL);
  394.       mul_time_my_team = (stop_time.tv_sec - start_time.tv_sec) * 1000000L +
  395.       (stop_time.tv_usec - start_time.tv_usec);
  396.  
  397.  
  398.       printf("David Gregg conv time: %lld microseconds\n", mul_time_david_gregg);
  399.       printf("Our Team conv time: %lld microseconds\n", mul_time_my_team);
  400.  
  401.       long long speed = mul_time_david_gregg /mul_time_my_team;
  402.       printf("Speed Factor : %lld \n ", speed);
  403.  
  404.       DEBUGGING(write_out(output, nkernels, width, height));
  405.  
  406.     /* now check that the team's multichannel convolution routine
  407.        gives the same answer as the known working version */
  408.       check_result(output, control_output, nkernels, width, height);
  409.  
  410.       return 0;
  411.     }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement