SHARE
TWEET

final_code

a guest Mar 20th, 2017 53 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1.   /* Test and timing harness program for developing a multichannel
  2.      multikernel convolution (as used in deep learning networks)
  3.  
  4.      Note there are some simplifications around this implementation,
  5.      in particular with respect to computing the convolution at edge
  6.      pixels of the image.
  7.  
  8.      Author: David Gregg
  9.      Date:   February 2017
  10.  
  11.  
  12.      Version 1.4 : Modified the random generator to reduce the range
  13.                    of generated values;
  14.                    Changed the summation in the checking code from
  15.                    float to double to try to bring the checked value
  16.                    closer to the "true" value
  17.  
  18.      Version 1.3 : Fixed which loop variables were being incremented
  19.                    in write_out();
  20.                    Fixed dimensions of output and control_output
  21.                    matrices in main function
  22.  
  23.      Version 1.2 : Changed distribution of test data to (hopefully)
  24.                    eliminate random walk of floating point error;
  25.                    Also introduced checks to restrict kernel-order to
  26.                    a small set of values
  27.  
  28.      Version 1.1 : Fixed bug in code to create 4d matrix
  29.   */
  30.  
  31.   #include <stdio.h>
  32.   #include <stdlib.h>
  33.   #include <sys/time.h>
  34.   #include <assert.h>
  35.   #include <omp.h>
  36.   #include <math.h>
  37.   #include <x86intrin.h>
  38.  
  39.   /* the following two definitions of DEBUGGING control whether or not
  40.      debugging information is written out. To put the program into
  41.      debugging mode, uncomment the following line: */
  42.   /*#define DEBUGGING(_x) _x */
  43.   /* to stop the printing of debugging information, use the following line: */
  44.   #define DEBUGGING(_x)
  45.   #define ZERO_VALUE  0
  46.  
  47.  
  48.   /* write 3d matrix to stdout */
  49.      void write_out(float *** a, int dim0, int dim1, int dim2)
  50.      {
  51.       int i, j, k;
  52.  
  53.       for ( i = 0; i < dim0; i++ ) {
  54.         printf("Outer dimension number %d\n", i);
  55.         for ( j = 0; j < dim1; j++ ) {
  56.           for ( k = 0; k < dim2 - 1; k++ ) {
  57.             printf("%f, ", a[i][j][k]);
  58.           }
  59.         // print end of line
  60.           printf("%f\n", a[i][j][dim2-1]);
  61.         }
  62.       }
  63.     }
  64.  
  65.  
  66.   /* create new empty 4d matrix */
  67.     float **** new_empty_4d_matrix(int dim0, int dim1, int dim2, int dim3)
  68.     {
  69.       float **** result = malloc(dim0 * sizeof(float***));
  70.       float *** mat1 = malloc(dim0 * dim1 * sizeof(float**));
  71.       float ** mat2 = malloc(dim0 * dim1 * dim2 * sizeof(float*));
  72.       float * mat3 = malloc(dim0 * dim1 * dim2 *dim3 * sizeof(float));
  73.       int i, j, k;
  74.  
  75.  
  76.       for ( i = 0; i < dim0; i++ ) {
  77.         result[i] = &(mat1[i*dim1]);
  78.         for ( j = 0; j < dim1; j++ ) {
  79.           result[i][j] = &(mat2[i*dim1*dim2 + j*dim2]);
  80.           for ( k = 0; k < dim2; k++ ) {
  81.             result[i][j][k] = &(mat3[i*dim1*dim2*dim3+j*dim2*dim3+k*dim3]);
  82.           }
  83.         }
  84.       }
  85.  
  86.       return result;
  87.     }
  88.  
  89.   /* create new empty 3d matrix */
  90.     float *** new_empty_3d_matrix(int dim0, int dim1, int dim2)
  91.     {
  92.       float **** mat4d;
  93.       float *** mat3d;
  94.  
  95.     // create a 4d matrix with single first dimension
  96.       mat4d = new_empty_4d_matrix(1, dim0, dim1, dim2);
  97.     // now throw away out first dimension
  98.       mat3d = mat4d[0];
  99.       free(mat4d);
  100.       return mat3d;
  101.     }
  102.  
  103.   /* take a copy of the matrix asnd return in a newly allocated matrix */
  104.     float **** copy_4d_matrix(float **** source_matrix, int dim0,
  105.       int dim1, int dim2, int dim3)
  106.     {
  107.       int i, j, k, l;
  108.       float **** result = new_empty_4d_matrix(dim0, dim1, dim2, dim3);
  109.  
  110.       for ( i = 0; i < dim0; i++ ) {
  111.         for ( j = 0; j < dim1; j++ ) {
  112.           for ( k = 0; k < dim2; k++ ) {
  113.             for ( l = 0; l < dim3; l++ ) {
  114.               result[i][j][k][l] = source_matrix[i][j][k][l];
  115.             }
  116.           }
  117.         }
  118.       }
  119.       return result;
  120.     }
  121.  
  122.   /* create a matrix and fill it with random numbers */
  123.     float **** gen_random_4d_matrix(int dim0, int dim1, int dim2, int dim3)
  124.     {
  125.       float **** result;
  126.       int i, j, k, l;
  127.       struct timeval seedtime;
  128.       int seed;
  129.  
  130.       result = new_empty_4d_matrix(dim0, dim1, dim2, dim3);
  131.  
  132.     /* use the microsecond part of the current time as a pseudorandom seed */
  133.       gettimeofday(&seedtime, NULL);
  134.       seed = seedtime.tv_usec;
  135.       srandom(seed);
  136.  
  137.     /* fill the matrix with random numbers */
  138.     const int range = 1 << 12; // 2^12
  139.     const int bias = 1 << 16; // 2^16
  140.     float offset = 0.0;
  141.     for ( i = 0; i < dim0; i++ ) {
  142.       for ( j = 0; j < dim1; j++ ) {
  143.         for ( k = 0; k < dim2; k++ ) {
  144.           for ( l = 0; l < dim3; l++ ) {
  145.             // generate uniform random integer with mean of zero
  146.             long long rand = random();
  147.             // now cut down the range and bias the mean to reduce
  148.             // the likelihood of large floating point round-off errors
  149.             int reduced_range = (rand % range);
  150.             float num = (((float) reduced_range) / ((float) bias))+offset;
  151.             result[i][j][k][l] = num;
  152.           }
  153.         }
  154.       }
  155.     }
  156.  
  157.     return result;
  158.   }
  159.  
  160.   /* create a matrix and fill it with random numbers */
  161.   float *** gen_random_3d_matrix(int dim0, int dim1, int dim2)
  162.   {
  163.     float **** mat4d;
  164.     float *** mat3d;
  165.  
  166.     // create a 4d matrix with single first dimension
  167.     mat4d = gen_random_4d_matrix(1, dim0, dim1, dim2);
  168.     // now throw away out first dimension
  169.     mat3d = mat4d[0];
  170.     free(mat4d);
  171.     return mat3d;
  172.   }
  173.  
  174.   /* check the sum of absolute differences is within reasonable epsilon */
  175.   void check_result(float *** result, float *** control,
  176.     int dim0, int dim1, int dim2)
  177.   {
  178.     int i, j, k;
  179.     double sum_abs_diff = 0.0;
  180.     const double EPSILON = 0.0625;
  181.  
  182.     //printf("SAD\n");
  183.    
  184.     for ( i = 0; i < dim0; i++ ) {
  185.       for ( j = 0; j < dim1; j++ ) {
  186.         for ( k = 0; k < dim2; k++ ) {
  187.           double diff = fabs(control[i][j][k] - result[i][j][k]);
  188.           assert( diff >= 0.0 );
  189.           sum_abs_diff = sum_abs_diff + diff;
  190.         }
  191.       }
  192.     }
  193.  
  194.     if ( sum_abs_diff > EPSILON ) {
  195.       fprintf(stderr, "WARNING: sum of absolute differences (%f) > EPSILON (%f)\n",
  196.         sum_abs_diff, EPSILON);
  197.     }
  198.     else {
  199.       printf("COMMENT: sum of absolute differences (%f)  within acceptable range (%f)\n", sum_abs_diff, EPSILON);
  200.     }
  201.   }
  202.  
  203.   /* the slow but correct version of matmul written by David */
  204.   void multichannel_conv(float *** image, float **** kernels, float *** output,
  205.    int width, int height, int nchannels, int nkernels,
  206.    int kernel_order)
  207.   {
  208.     int h, w, x, y, c, m, check;
  209.     check = 0;
  210.  
  211.     for ( m = 0; m < nkernels; m++ ) { //cycle through each kernel
  212.       for ( w = 0; w < width; w++ ) { //cycle through each x cooridinate of the image
  213.         for ( h = 0; h < height; h++ ) { //cycle through each y coordinate of the image
  214.           double sum = 0.0;
  215.           for ( c = 0; c < nchannels; c++ ) { //cycle through each channel of the image and the kernel
  216.             for ( x = 0; x < kernel_order; x++) { //cycle through each x coordinate of the kernel
  217.               for ( y = 0; y < kernel_order; y++ ) { //cycle through each y coordinate of the kernel
  218.                 sum += image[w+x][h+y][c] * kernels[m][c][x][y];
  219.               }
  220.             }
  221.           }
  222.           output[m][w][h] = sum;
  223.           //printf("Sum %d %f\n " , check , sum);
  224.           check++;
  225.         }
  226.       }
  227.     }
  228.   }
  229.  
  230.  
/* the fast version of the convolution written by the team */
  232.   void team_conv(float *** image, float **** kernels, float *** output,
  233.    int width, int height, int nchannels, int nkernels,
  234.    int kernel_order)
  235.   {
  236.  
  237.     float *myArray = malloc( 4 * sizeof(float) );
  238.     __m128 myKernel, myImage, productOfMul, sumOfVectors, productOfVectors;
  239.     float arrr[4];
  240.  
  241.  
  242.     // this call here is just dummy code
  243.     // insert your own code instead
  244.     int h, w, x, y, c, m,i,b;
  245.     int check = 1;
  246.     sumOfVectors = _mm_setzero_ps();
  247.     __m128 sumOfVectors2 = _mm_setzero_ps();
  248.     productOfVectors = _mm_setzero_ps();
  249.     __m128 productOfVectors2 = _mm_setzero_ps();
  250.     __m128 kernelVector = _mm_setzero_ps();
  251.     __m128 kernelVector2 = _mm_setzero_ps();
  252.     __m128 imageVector = _mm_setzero_ps();
  253.     __m128 imageVector2 = _mm_setzero_ps();
  254.     __m128 totalSumOfVectors = _mm_setzero_ps();
  255.  
  256.     float sumArray[8] = {0.0,0.0,0.0,0.0};
  257.  
  258.     if(kernel_order == 1)
  259.     {
  260.       for ( m = 0; m < nkernels; m++ ) {
  261.         for ( w = 0; w < width; w++ ) {
  262.           for ( h = 0; h < height; h++ ) {
  263.             float sum = 0.0;
  264.             if(nchannels >= 4)
  265.             {
  266.               for ( c = 0; c < nchannels-3; c=c+4 ) {
  267.               imageVector = _mm_set_ps(image[w][h][c], image[w][h][c+1], image[w][h][c+2], image[w][h][c+3]);
  268.               kernelVector = _mm_set_ps(kernels[m][c][0][0], kernels[m][c+1][0][0],kernels[m][c+2][0][0],kernels[m][c+3][0][0]);
  269.               productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  270.               sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  271.  
  272.  
  273.               //sum += image[w][h][c] * kernels[m][c][0][0];
  274.              
  275.               }
  276.             }
  277.             else if(nchannels >= 8)
  278.             {
  279.               for ( c = 0; c < nchannels-7; c=c+8 ) {
  280.               imageVector = _mm_set_ps(image[w][h][c], image[w][h][c+1], image[w][h][c+2], image[w][h][c+3]);
  281.               imageVector2 = _mm_set_ps(image[w][h][c+4], image[w][h][c+5], image[w][h][c+6], image[w][h][c+7]);
  282.  
  283.               kernelVector = _mm_set_ps(kernels[m][c][0][0], kernels[m][c+1][0][0],kernels[m][c+2][0][0],kernels[m][c+3][0][0]);
  284.               kernelVector2 = _mm_set_ps(kernels[m][c+4][0][0], kernels[m][c+5][0][0],kernels[m][c+6][0][0],kernels[m][c+7][0][0]);
  285.  
  286.               productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  287.               productOfVectors2 = _mm_mul_ps(imageVector2,kernelVector2);
  288.  
  289.               sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  290.               sumOfVectors2 = _mm_add_ps(sumOfVectors2, productOfVectors2);
  291.  
  292.               sumOfVectors = _mm_add_ps(sumOfVectors, sumOfVectors2);
  293.  
  294.               //sum += image[w][h][c] * kernels[m][c][0][0];
  295.              
  296.               }
  297.             }
  298.             for (; c < nchannels; c++)
  299.             {
  300.               sum += image[w][h][c] * kernels[m][c][0][0];
  301.             }
  302.             sumOfVectors = _mm_hadd_ps(sumOfVectors,sumOfVectors);
  303.             sumOfVectors = _mm_hadd_ps(sumOfVectors,sumOfVectors);
  304.             sum = _mm_cvtss_f32(sumOfVectors);
  305.  
  306.             //_mm_store_ps(sumArray,sumOfVectors);
  307.             //sum = sumArray[0] + sumArray[1] + sumArray[2] + sumArray[3] + sumArray[4] + sumArray[5] + sumArray[6] + sumArray[7];
  308.             output[m][w][h] = sum;
  309.             sumOfVectors = _mm_setzero_ps();
  310.             sumOfVectors2 = _mm_setzero_ps();
  311.             productOfVectors = _mm_setzero_ps();
  312.             productOfVectors2 = _mm_setzero_ps();
  313.             kernelVector = _mm_setzero_ps();
  314.             kernelVector2 = _mm_setzero_ps();
  315.             imageVector = _mm_setzero_ps();
  316.             imageVector2 = _mm_setzero_ps();
  317.             totalSumOfVectors = _mm_setzero_ps();;
  318.           }
  319.         }
  320.       }
  321.     }
  322.    
  323.     else
  324.     {
  325.  
  326.  
  327.       sumOfVectors = _mm_setzero_ps();
  328.       productOfVectors = _mm_setzero_ps();
  329.  
  330.       __m128 kernelVector = _mm_setzero_ps();
  331.       __m128 imageVector = _mm_setzero_ps();
  332.  
  333.  
  334.       int q=0;
  335.  
  336.       float test[4] = {0.0, 0.0, 0.0, 0.0};
  337.       int h, w, x, y, c, m, check;
  338.       float sumArray[4] = {0.0, 0.0, 0.0, 0.0};
  339.       check = 0;
  340.       for ( m = 0; m < nkernels; m++ )
  341.       {
  342.         for ( w = 0; w < width; w++ )
  343.         {
  344.           for(h = 0; h < height; h++)
  345.           {
  346.             float sum = 0.0;
  347.  
  348.             for ( c = 0; c < nchannels-3; c=c+4 )
  349.             {
  350.               if(kernel_order == 3)
  351.               {
  352.                 imageVector = _mm_set_ps(image[w+0][h+0][c], image[w+0][h+0][c+1], image[w+0][h+0][c+2], image[w+0][h+0][c+3]);
  353.                 kernelVector = _mm_set_ps(kernels[m][c][0][0], kernels[m][c+1][0][0],kernels[m][c+2][0][0],kernels[m][c+3][0][0]);
  354.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);            
  355.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  356.  
  357.                 imageVector = _mm_set_ps(image[w+0][h+1][c], image[w+0][h+1][c+1], image[w+0][h+1][c+2], image[w+0][h+1][c+3]);
  358.                 kernelVector = _mm_set_ps(kernels[m][c][0][1], kernels[m][c+1][0][1],kernels[m][c+2][0][1],kernels[m][c+3][0][1]);
  359.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  360.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  361.  
  362.                 imageVector = _mm_set_ps(image[w+0][h+2][c], image[w+0][h+2][c+1], image[w+0][h+2][c+2], image[w+0][h+2][c+3]);
  363.                 kernelVector = _mm_set_ps(kernels[m][c][0][2], kernels[m][c+1][0][2],kernels[m][c+2][0][2],kernels[m][c+3][0][2]);
  364.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  365.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  366.  
  367.                 imageVector = _mm_set_ps(image[w+1][h+0][c], image[w+1][h+0][c+1], image[w+1][h+0][c+2], image[w+1][h+0][c+3]);
  368.                 kernelVector = _mm_set_ps(kernels[m][c][1][0], kernels[m][c+1][1][0],kernels[m][c+2][1][0],kernels[m][c+3][1][0]);
  369.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  370.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  371.  
  372.                 imageVector = _mm_set_ps(image[w+1][h+1][c], image[w+1][h+1][c+1], image[w+1][h+1][c+2], image[w+1][h+1][c+3]);
  373.                 kernelVector = _mm_set_ps(kernels[m][c][1][1], kernels[m][c+1][1][1],kernels[m][c+2][1][1],kernels[m][c+3][1][1]);
  374.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  375.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  376.  
  377.                 imageVector = _mm_set_ps(image[w+1][h+2][c], image[w+1][h+2][c+1], image[w+1][h+2][c+2], image[w+1][h+2][c+3]);
  378.                 kernelVector = _mm_set_ps(kernels[m][c][1][2], kernels[m][c+1][1][2],kernels[m][c+2][1][2],kernels[m][c+3][1][2]);
  379.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  380.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  381.  
  382.                 imageVector = _mm_set_ps(image[w+2][h+0][c], image[w+2][h+0][c+1], image[w+2][h+0][c+2], image[w+2][h+0][c+3]);
  383.                 kernelVector = _mm_set_ps(kernels[m][c][2][0], kernels[m][c+1][2][0],kernels[m][c+2][2][0],kernels[m][c+3][2][0]);
  384.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  385.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  386.  
  387.                 imageVector = _mm_set_ps(image[w+2][h+1][c], image[w+2][h+1][c+1], image[w+2][h+1][c+2], image[w+2][h+1][c+3]);
  388.                 kernelVector = _mm_set_ps(kernels[m][c][2][1], kernels[m][c+1][2][1],kernels[m][c+2][2][1],kernels[m][c+3][2][1]);
  389.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  390.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  391.  
  392.                 imageVector = _mm_set_ps(image[w+2][h+2][c], image[w+2][h+2][c+1], image[w+2][h+2][c+2], image[w+2][h+2][c+3]);
  393.                 kernelVector = _mm_set_ps(kernels[m][c][2][2], kernels[m][c+1][2][2],kernels[m][c+2][2][2],kernels[m][c+3][2][2]);
  394.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  395.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  396.               }
  397.               else if(kernel_order == 5)
  398.               {
  399.                 imageVector = _mm_set_ps(image[w+0][h+0][c], image[w+0][h+0][c+1], image[w+0][h+0][c+2], image[w+0][h+0][c+3]);
  400.                 kernelVector = _mm_set_ps(kernels[m][c][0][0], kernels[m][c+1][0][0],kernels[m][c+2][0][0],kernels[m][c+3][0][0]);
  401.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  402.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  403.  
  404.                 imageVector = _mm_set_ps(image[w+0][h+1][c], image[w+0][h+1][c+1], image[w+0][h+1][c+2], image[w+0][h+1][c+3]);
  405.                 kernelVector = _mm_set_ps(kernels[m][c][0][1], kernels[m][c+1][0][1],kernels[m][c+2][0][1],kernels[m][c+3][0][1]);
  406.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  407.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  408.  
  409.                 imageVector = _mm_set_ps(image[w+0][h+2][c], image[w+0][h+2][c+1], image[w+0][h+2][c+2], image[w+0][h+2][c+3]);
  410.                 kernelVector = _mm_set_ps(kernels[m][c][0][2], kernels[m][c+1][0][2],kernels[m][c+2][0][2],kernels[m][c+3][0][2]);
  411.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  412.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  413.  
  414.                 imageVector = _mm_set_ps(image[w+0][h+3][c], image[w+0][h+3][c+1], image[w+0][h+3][c+2], image[w+0][h+3][c+3]);
  415.                 kernelVector = _mm_set_ps(kernels[m][c][0][3], kernels[m][c+1][0][3],kernels[m][c+2][0][3],kernels[m][c+3][0][3]);
  416.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  417.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  418.  
  419.                 imageVector = _mm_set_ps(image[w+0][h+4][c], image[w+0][h+4][c+1], image[w+0][h+4][c+2], image[w+0][h+4][c+3]);
  420.                 kernelVector = _mm_set_ps(kernels[m][c][0][4], kernels[m][c+1][0][4],kernels[m][c+2][0][4],kernels[m][c+3][0][4]);
  421.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  422.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  423.  
  424.                 imageVector = _mm_set_ps(image[w+1][h+0][c], image[w+1][h+0][c+1], image[w+1][h+0][c+2], image[w+1][h+0][c+3]);
  425.                 kernelVector = _mm_set_ps(kernels[m][c][1][0], kernels[m][c+1][1][0],kernels[m][c+2][1][0],kernels[m][c+3][1][0]);
  426.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  427.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  428.  
  429.                 imageVector = _mm_set_ps(image[w+1][h+1][c], image[w+1][h+1][c+1], image[w+1][h+1][c+2], image[w+1][h+1][c+3]);
  430.                 kernelVector = _mm_set_ps(kernels[m][c][1][1], kernels[m][c+1][1][1],kernels[m][c+2][1][1],kernels[m][c+3][1][1]);
  431.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  432.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  433.  
  434.                 imageVector = _mm_set_ps(image[w+1][h+2][c], image[w+1][h+2][c+1], image[w+1][h+2][c+2], image[w+1][h+2][c+3]);
  435.                 kernelVector = _mm_set_ps(kernels[m][c][1][2], kernels[m][c+1][1][2],kernels[m][c+2][1][2],kernels[m][c+3][1][2]);
  436.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  437.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  438.  
  439.                 imageVector = _mm_set_ps(image[w+1][h+3][c], image[w+1][h+3][c+1], image[w+1][h+3][c+2], image[w+1][h+3][c+3]);
  440.                 kernelVector = _mm_set_ps(kernels[m][c][1][3], kernels[m][c+1][1][3],kernels[m][c+2][1][3],kernels[m][c+3][1][3]);
  441.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  442.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  443.  
  444.                 imageVector = _mm_set_ps(image[w+1][h+4][c], image[w+1][h+4][c+1], image[w+1][h+4][c+2], image[w+1][h+4][c+3]);
  445.                 kernelVector = _mm_set_ps(kernels[m][c][1][4], kernels[m][c+1][1][4],kernels[m][c+2][1][4],kernels[m][c+3][1][4]);
  446.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  447.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  448.  
  449.                 imageVector = _mm_set_ps(image[w+2][h+0][c], image[w+2][h+0][c+1], image[w+2][h+0][c+2], image[w+2][h+0][c+3]);
  450.                 kernelVector = _mm_set_ps(kernels[m][c][2][0], kernels[m][c+1][2][0],kernels[m][c+2][2][0],kernels[m][c+3][2][0]);
  451.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  452.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  453.  
  454.                 imageVector = _mm_set_ps(image[w+2][h+1][c], image[w+2][h+1][c+1], image[w+2][h+1][c+2], image[w+2][h+1][c+3]);
  455.                 kernelVector = _mm_set_ps(kernels[m][c][2][1], kernels[m][c+1][2][1],kernels[m][c+2][2][1],kernels[m][c+3][2][1]);
  456.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  457.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  458.  
  459.                 imageVector = _mm_set_ps(image[w+2][h+2][c], image[w+2][h+2][c+1], image[w+2][h+2][c+2], image[w+2][h+2][c+3]);
  460.                 kernelVector = _mm_set_ps(kernels[m][c][2][2], kernels[m][c+1][2][2],kernels[m][c+2][2][2],kernels[m][c+3][2][2]);
  461.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  462.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  463.  
  464.                 imageVector = _mm_set_ps(image[w+2][h+3][c], image[w+2][h+3][c+1], image[w+2][h+3][c+2], image[w+2][h+3][c+3]);
  465.                 kernelVector = _mm_set_ps(kernels[m][c][2][3], kernels[m][c+1][2][3],kernels[m][c+2][2][3],kernels[m][c+3][2][3]);
  466.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  467.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  468.  
  469.                 imageVector = _mm_set_ps(image[w+2][h+4][c], image[w+2][h+4][c+1], image[w+2][h+4][c+2], image[w+2][h+4][c+3]);
  470.                 kernelVector = _mm_set_ps(kernels[m][c][2][4], kernels[m][c+1][2][4],kernels[m][c+2][2][4],kernels[m][c+3][2][4]);
  471.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  472.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  473.  
  474.                 imageVector = _mm_set_ps(image[w+3][h+0][c], image[w+3][h+0][c+1], image[w+3][h+0][c+2], image[w+3][h+0][c+3]);
  475.                 kernelVector = _mm_set_ps(kernels[m][c][3][0], kernels[m][c+1][3][0],kernels[m][c+2][3][0],kernels[m][c+3][3][0]);
  476.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  477.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  478.  
  479.                 imageVector = _mm_set_ps(image[w+3][h+1][c], image[w+3][h+1][c+1], image[w+3][h+1][c+2], image[w+3][h+1][c+3]);
  480.                 kernelVector = _mm_set_ps(kernels[m][c][3][1], kernels[m][c+1][3][1],kernels[m][c+2][3][1],kernels[m][c+3][3][1]);
  481.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  482.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  483.  
  484.                 imageVector = _mm_set_ps(image[w+3][h+2][c], image[w+3][h+2][c+1], image[w+3][h+2][c+2], image[w+3][h+2][c+3]);
  485.                 kernelVector = _mm_set_ps(kernels[m][c][3][2], kernels[m][c+1][3][2],kernels[m][c+2][3][2],kernels[m][c+3][3][2]);
  486.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  487.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  488.  
  489.                 imageVector = _mm_set_ps(image[w+3][h+3][c], image[w+3][h+3][c+1], image[w+3][h+3][c+2], image[w+3][h+3][c+3]);
  490.                 kernelVector = _mm_set_ps(kernels[m][c][3][3], kernels[m][c+1][3][3],kernels[m][c+2][3][3],kernels[m][c+3][3][3]);
  491.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  492.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  493.  
  494.                 imageVector = _mm_set_ps(image[w+3][h+4][c], image[w+3][h+4][c+1], image[w+3][h+4][c+2], image[w+3][h+4][c+3]);
  495.                 kernelVector = _mm_set_ps(kernels[m][c][3][4], kernels[m][c+1][3][4],kernels[m][c+2][3][4],kernels[m][c+3][3][4]);
  496.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  497.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  498.  
  499.                 imageVector = _mm_set_ps(image[w+4][h+0][c], image[w+4][h+0][c+1], image[w+4][h+0][c+2], image[w+4][h+0][c+3]);
  500.                 kernelVector = _mm_set_ps(kernels[m][c][4][0], kernels[m][c+1][4][0],kernels[m][c+2][4][0],kernels[m][c+3][4][0]);
  501.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  502.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  503.  
  504.                 imageVector = _mm_set_ps(image[w+4][h+1][c], image[w+4][h+1][c+1], image[w+4][h+1][c+2], image[w+4][h+1][c+3]);
  505.                 kernelVector = _mm_set_ps(kernels[m][c][4][1], kernels[m][c+1][4][1],kernels[m][c+2][4][1],kernels[m][c+3][4][1]);
  506.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  507.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  508.  
  509.                 imageVector = _mm_set_ps(image[w+4][h+2][c], image[w+4][h+2][c+1], image[w+4][h+2][c+2], image[w+4][h+2][c+3]);
  510.                 kernelVector = _mm_set_ps(kernels[m][c][4][2], kernels[m][c+1][4][2],kernels[m][c+2][4][2],kernels[m][c+3][4][2]);
  511.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  512.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  513.  
  514.                 imageVector = _mm_set_ps(image[w+4][h+3][c], image[w+4][h+3][c+1], image[w+4][h+3][c+2], image[w+4][h+3][c+3]);
  515.                 kernelVector = _mm_set_ps(kernels[m][c][4][3], kernels[m][c+1][4][3],kernels[m][c+2][4][3],kernels[m][c+3][4][3]);
  516.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  517.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  518.  
  519.                 imageVector = _mm_set_ps(image[w+4][h+4][c], image[w+4][h+4][c+1], image[w+4][h+4][c+2], image[w+4][h+4][c+3]);
  520.                 kernelVector = _mm_set_ps(kernels[m][c][4][4], kernels[m][c+1][4][4],kernels[m][c+2][4][4],kernels[m][c+3][4][4]);
  521.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  522.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  523.               }
  524.               else if(kernel_order == 7)
  525.               {
  526.                 imageVector = _mm_set_ps(image[w+0][h+0][c], image[w+0][h+0][c+1], image[w+0][h+0][c+2], image[w+0][h+0][c+3]);
  527.                 kernelVector = _mm_set_ps(kernels[m][c][0][0], kernels[m][c+1][0][0],kernels[m][c+2][0][0],kernels[m][c+3][0][0]);
  528.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  529.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  530.  
  531.                 imageVector = _mm_set_ps(image[w+0][h+1][c], image[w+0][h+1][c+1], image[w+0][h+1][c+2], image[w+0][h+1][c+3]);
  532.                 kernelVector = _mm_set_ps(kernels[m][c][0][1], kernels[m][c+1][0][1],kernels[m][c+2][0][1],kernels[m][c+3][0][1]);
  533.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  534.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  535.  
  536.                 imageVector = _mm_set_ps(image[w+0][h+2][c], image[w+0][h+2][c+1], image[w+0][h+2][c+2], image[w+0][h+2][c+3]);
  537.                 kernelVector = _mm_set_ps(kernels[m][c][0][2], kernels[m][c+1][0][2],kernels[m][c+2][0][2],kernels[m][c+3][0][2]);
  538.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  539.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  540.  
  541.                 imageVector = _mm_set_ps(image[w+0][h+3][c], image[w+0][h+3][c+1], image[w+0][h+3][c+2], image[w+0][h+3][c+3]);
  542.                 kernelVector = _mm_set_ps(kernels[m][c][0][3], kernels[m][c+1][0][3],kernels[m][c+2][0][3],kernels[m][c+3][0][3]);
  543.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  544.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  545.  
  546.                 imageVector = _mm_set_ps(image[w+0][h+4][c], image[w+0][h+4][c+1], image[w+0][h+4][c+2], image[w+0][h+4][c+3]);
  547.                 kernelVector = _mm_set_ps(kernels[m][c][0][4], kernels[m][c+1][0][4],kernels[m][c+2][0][4],kernels[m][c+3][0][4]);
  548.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  549.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  550.  
  551.                 imageVector = _mm_set_ps(image[w+0][h+5][c], image[w+0][h+5][c+1], image[w+0][h+5][c+2], image[w+0][h+5][c+3]);
  552.                 kernelVector = _mm_set_ps(kernels[m][c][0][5], kernels[m][c+1][0][5],kernels[m][c+2][0][5],kernels[m][c+3][0][5]);
  553.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  554.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  555.  
  556.                 imageVector = _mm_set_ps(image[w+0][h+6][c], image[w+0][h+6][c+1], image[w+0][h+6][c+2], image[w+0][h+6][c+3]);
  557.                 kernelVector = _mm_set_ps(kernels[m][c][0][6], kernels[m][c+1][0][6],kernels[m][c+2][0][6],kernels[m][c+3][0][6]);
  558.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  559.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  560.  
  561.                 imageVector = _mm_set_ps(image[w+1][h+0][c], image[w+1][h+0][c+1], image[w+1][h+0][c+2], image[w+1][h+0][c+3]);
  562.                 kernelVector = _mm_set_ps(kernels[m][c][1][0], kernels[m][c+1][1][0],kernels[m][c+2][1][0],kernels[m][c+3][1][0]);
  563.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  564.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  565.  
  566.                 imageVector = _mm_set_ps(image[w+1][h+1][c], image[w+1][h+1][c+1], image[w+1][h+1][c+2], image[w+1][h+1][c+3]);
  567.                 kernelVector = _mm_set_ps(kernels[m][c][1][1], kernels[m][c+1][1][1],kernels[m][c+2][1][1],kernels[m][c+3][1][1]);
  568.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  569.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  570.  
  571.                 imageVector = _mm_set_ps(image[w+1][h+2][c], image[w+1][h+2][c+1], image[w+1][h+2][c+2], image[w+1][h+2][c+3]);
  572.                 kernelVector = _mm_set_ps(kernels[m][c][1][2], kernels[m][c+1][1][2],kernels[m][c+2][1][2],kernels[m][c+3][1][2]);
  573.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  574.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  575.  
  576.                 imageVector = _mm_set_ps(image[w+1][h+3][c], image[w+1][h+3][c+1], image[w+1][h+3][c+2], image[w+1][h+3][c+3]);
  577.                 kernelVector = _mm_set_ps(kernels[m][c][1][3], kernels[m][c+1][1][3],kernels[m][c+2][1][3],kernels[m][c+3][1][3]);
  578.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  579.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  580.  
  581.                 imageVector = _mm_set_ps(image[w+1][h+4][c], image[w+1][h+4][c+1], image[w+1][h+4][c+2], image[w+1][h+4][c+3]);
  582.                 kernelVector = _mm_set_ps(kernels[m][c][1][4], kernels[m][c+1][1][4],kernels[m][c+2][1][4],kernels[m][c+3][1][4]);
  583.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  584.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  585.  
  586.                 imageVector = _mm_set_ps(image[w+1][h+5][c], image[w+1][h+5][c+1], image[w+1][h+5][c+2], image[w+1][h+5][c+3]);
  587.                 kernelVector = _mm_set_ps(kernels[m][c][1][5], kernels[m][c+1][1][5],kernels[m][c+2][1][5],kernels[m][c+3][1][5]);
  588.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  589.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  590.  
  591.                 imageVector = _mm_set_ps(image[w+1][h+6][c], image[w+1][h+6][c+1], image[w+1][h+6][c+2], image[w+1][h+6][c+3]);
  592.                 kernelVector = _mm_set_ps(kernels[m][c][1][6], kernels[m][c+1][1][6],kernels[m][c+2][1][6],kernels[m][c+3][1][6]);
  593.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  594.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  595.  
  596.                 imageVector = _mm_set_ps(image[w+2][h+0][c], image[w+2][h+0][c+1], image[w+2][h+0][c+2], image[w+2][h+0][c+3]);
  597.                 kernelVector = _mm_set_ps(kernels[m][c][2][0], kernels[m][c+1][2][0],kernels[m][c+2][2][0],kernels[m][c+3][2][0]);
  598.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  599.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  600.  
  601.                 imageVector = _mm_set_ps(image[w+2][h+1][c], image[w+2][h+1][c+1], image[w+2][h+1][c+2], image[w+2][h+1][c+3]);
  602.                 kernelVector = _mm_set_ps(kernels[m][c][2][1], kernels[m][c+1][2][1],kernels[m][c+2][2][1],kernels[m][c+3][2][1]);
  603.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  604.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  605.  
  606.                 imageVector = _mm_set_ps(image[w+2][h+2][c], image[w+2][h+2][c+1], image[w+2][h+2][c+2], image[w+2][h+2][c+3]);
  607.                 kernelVector = _mm_set_ps(kernels[m][c][2][2], kernels[m][c+1][2][2],kernels[m][c+2][2][2],kernels[m][c+3][2][2]);
  608.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  609.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  610.  
  611.                 imageVector = _mm_set_ps(image[w+2][h+3][c], image[w+2][h+3][c+1], image[w+2][h+3][c+2], image[w+2][h+3][c+3]);
  612.                 kernelVector = _mm_set_ps(kernels[m][c][2][3], kernels[m][c+1][2][3],kernels[m][c+2][2][3],kernels[m][c+3][2][3]);
  613.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  614.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  615.  
  616.                 imageVector = _mm_set_ps(image[w+2][h+4][c], image[w+2][h+4][c+1], image[w+2][h+4][c+2], image[w+2][h+4][c+3]);
  617.                 kernelVector = _mm_set_ps(kernels[m][c][2][4], kernels[m][c+1][2][4],kernels[m][c+2][2][4],kernels[m][c+3][2][4]);
  618.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  619.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  620.  
  621.                 imageVector = _mm_set_ps(image[w+2][h+5][c], image[w+2][h+5][c+1], image[w+2][h+5][c+2], image[w+2][h+5][c+3]);
  622.                 kernelVector = _mm_set_ps(kernels[m][c][2][5], kernels[m][c+1][2][5],kernels[m][c+2][2][5],kernels[m][c+3][2][5]);
  623.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  624.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  625.  
  626.                 imageVector = _mm_set_ps(image[w+2][h+6][c], image[w+2][h+6][c+1], image[w+2][h+6][c+2], image[w+2][h+6][c+3]);
  627.                 kernelVector = _mm_set_ps(kernels[m][c][2][6], kernels[m][c+1][2][6],kernels[m][c+2][2][6],kernels[m][c+3][2][6]);
  628.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  629.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  630.  
  631.                 imageVector = _mm_set_ps(image[w+3][h+0][c], image[w+3][h+0][c+1], image[w+3][h+0][c+2], image[w+3][h+0][c+3]);
  632.                 kernelVector = _mm_set_ps(kernels[m][c][3][0], kernels[m][c+1][3][0],kernels[m][c+2][3][0],kernels[m][c+3][3][0]);
  633.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  634.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  635.  
  636.                 imageVector = _mm_set_ps(image[w+3][h+1][c], image[w+3][h+1][c+1], image[w+3][h+1][c+2], image[w+3][h+1][c+3]);
  637.                 kernelVector = _mm_set_ps(kernels[m][c][3][1], kernels[m][c+1][3][1],kernels[m][c+2][3][1],kernels[m][c+3][3][1]);
  638.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  639.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  640.  
  641.                 imageVector = _mm_set_ps(image[w+3][h+2][c], image[w+3][h+2][c+1], image[w+3][h+2][c+2], image[w+3][h+2][c+3]);
  642.                 kernelVector = _mm_set_ps(kernels[m][c][3][2], kernels[m][c+1][3][2],kernels[m][c+2][3][2],kernels[m][c+3][3][2]);
  643.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  644.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  645.  
  646.                 imageVector = _mm_set_ps(image[w+3][h+3][c], image[w+3][h+3][c+1], image[w+3][h+3][c+2], image[w+3][h+3][c+3]);
  647.                 kernelVector = _mm_set_ps(kernels[m][c][3][3], kernels[m][c+1][3][3],kernels[m][c+2][3][3],kernels[m][c+3][3][3]);
  648.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  649.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  650.  
  651.                 imageVector = _mm_set_ps(image[w+3][h+4][c], image[w+3][h+4][c+1], image[w+3][h+4][c+2], image[w+3][h+4][c+3]);
  652.                 kernelVector = _mm_set_ps(kernels[m][c][3][4], kernels[m][c+1][3][4],kernels[m][c+2][3][4],kernels[m][c+3][3][4]);
  653.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  654.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  655.  
  656.                 imageVector = _mm_set_ps(image[w+3][h+5][c], image[w+3][h+5][c+1], image[w+3][h+5][c+2], image[w+3][h+5][c+3]);
  657.                 kernelVector = _mm_set_ps(kernels[m][c][3][5], kernels[m][c+1][3][5],kernels[m][c+2][3][5],kernels[m][c+3][3][5]);
  658.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  659.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  660.  
  661.                 imageVector = _mm_set_ps(image[w+3][h+6][c], image[w+3][h+6][c+1], image[w+3][h+6][c+2], image[w+3][h+6][c+3]);
  662.                 kernelVector = _mm_set_ps(kernels[m][c][3][6], kernels[m][c+1][3][6],kernels[m][c+2][3][6],kernels[m][c+3][3][6]);
  663.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  664.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  665.  
  666.                 imageVector = _mm_set_ps(image[w+4][h+0][c], image[w+4][h+0][c+1], image[w+4][h+0][c+2], image[w+4][h+0][c+3]);
  667.                 kernelVector = _mm_set_ps(kernels[m][c][4][0], kernels[m][c+1][4][0],kernels[m][c+2][4][0],kernels[m][c+3][4][0]);
  668.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  669.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  670.  
  671.                 imageVector = _mm_set_ps(image[w+4][h+1][c], image[w+4][h+1][c+1], image[w+4][h+1][c+2], image[w+4][h+1][c+3]);
  672.                 kernelVector = _mm_set_ps(kernels[m][c][4][1], kernels[m][c+1][4][1],kernels[m][c+2][4][1],kernels[m][c+3][4][1]);
  673.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  674.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  675.  
  676.                 imageVector = _mm_set_ps(image[w+4][h+2][c], image[w+4][h+2][c+1], image[w+4][h+2][c+2], image[w+4][h+2][c+3]);
  677.                 kernelVector = _mm_set_ps(kernels[m][c][4][2], kernels[m][c+1][4][2],kernels[m][c+2][4][2],kernels[m][c+3][4][2]);
  678.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  679.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  680.  
  681.                 imageVector = _mm_set_ps(image[w+4][h+3][c], image[w+4][h+3][c+1], image[w+4][h+3][c+2], image[w+4][h+3][c+3]);
  682.                 kernelVector = _mm_set_ps(kernels[m][c][4][3], kernels[m][c+1][4][3],kernels[m][c+2][4][3],kernels[m][c+3][4][3]);
  683.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  684.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  685.  
  686.                 imageVector = _mm_set_ps(image[w+4][h+4][c], image[w+4][h+4][c+1], image[w+4][h+4][c+2], image[w+4][h+4][c+3]);
  687.                 kernelVector = _mm_set_ps(kernels[m][c][4][4], kernels[m][c+1][4][4],kernels[m][c+2][4][4],kernels[m][c+3][4][4]);
  688.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  689.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  690.  
  691.                 imageVector = _mm_set_ps(image[w+4][h+5][c], image[w+4][h+5][c+1], image[w+4][h+5][c+2], image[w+4][h+5][c+3]);
  692.                 kernelVector = _mm_set_ps(kernels[m][c][4][5], kernels[m][c+1][4][5],kernels[m][c+2][4][5],kernels[m][c+3][4][5]);
  693.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  694.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  695.  
  696.                 imageVector = _mm_set_ps(image[w+4][h+6][c], image[w+4][h+6][c+1], image[w+4][h+6][c+2], image[w+4][h+6][c+3]);
  697.                 kernelVector = _mm_set_ps(kernels[m][c][4][6], kernels[m][c+1][4][6],kernels[m][c+2][4][6],kernels[m][c+3][4][6]);
  698.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  699.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  700.  
  701.                 imageVector = _mm_set_ps(image[w+5][h+0][c], image[w+5][h+0][c+1], image[w+5][h+0][c+2], image[w+5][h+0][c+3]);
  702.                 kernelVector = _mm_set_ps(kernels[m][c][5][0], kernels[m][c+1][5][0],kernels[m][c+2][5][0],kernels[m][c+3][5][0]);
  703.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  704.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  705.  
  706.                 imageVector = _mm_set_ps(image[w+5][h+1][c], image[w+5][h+1][c+1], image[w+5][h+1][c+2], image[w+5][h+1][c+3]);
  707.                 kernelVector = _mm_set_ps(kernels[m][c][5][1], kernels[m][c+1][5][1],kernels[m][c+2][5][1],kernels[m][c+3][5][1]);
  708.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  709.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  710.  
  711.                 imageVector = _mm_set_ps(image[w+5][h+2][c], image[w+5][h+2][c+1], image[w+5][h+2][c+2], image[w+5][h+2][c+3]);
  712.                 kernelVector = _mm_set_ps(kernels[m][c][5][2], kernels[m][c+1][5][2],kernels[m][c+2][5][2],kernels[m][c+3][5][2]);
  713.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  714.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  715.  
  716.                 imageVector = _mm_set_ps(image[w+5][h+3][c], image[w+5][h+3][c+1], image[w+5][h+3][c+2], image[w+5][h+3][c+3]);
  717.                 kernelVector = _mm_set_ps(kernels[m][c][5][3], kernels[m][c+1][5][3],kernels[m][c+2][5][3],kernels[m][c+3][5][3]);
  718.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  719.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  720.  
  721.                 imageVector = _mm_set_ps(image[w+5][h+4][c], image[w+5][h+4][c+1], image[w+5][h+4][c+2], image[w+5][h+4][c+3]);
  722.                 kernelVector = _mm_set_ps(kernels[m][c][5][4], kernels[m][c+1][5][4],kernels[m][c+2][5][4],kernels[m][c+3][5][4]);
  723.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  724.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  725.  
  726.                 imageVector = _mm_set_ps(image[w+5][h+5][c], image[w+5][h+5][c+1], image[w+5][h+5][c+2], image[w+5][h+5][c+3]);
  727.                 kernelVector = _mm_set_ps(kernels[m][c][5][5], kernels[m][c+1][5][5],kernels[m][c+2][5][5],kernels[m][c+3][5][5]);
  728.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  729.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  730.  
  731.                 imageVector = _mm_set_ps(image[w+5][h+6][c], image[w+5][h+6][c+1], image[w+5][h+6][c+2], image[w+5][h+6][c+3]);
  732.                 kernelVector = _mm_set_ps(kernels[m][c][5][6], kernels[m][c+1][5][6],kernels[m][c+2][5][6],kernels[m][c+3][5][6]);
  733.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  734.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  735.  
  736.                 imageVector = _mm_set_ps(image[w+6][h+0][c], image[w+6][h+0][c+1], image[w+6][h+0][c+2], image[w+6][h+0][c+3]);
  737.                 kernelVector = _mm_set_ps(kernels[m][c][6][0], kernels[m][c+1][6][0],kernels[m][c+2][6][0],kernels[m][c+3][6][0]);
  738.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  739.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  740.  
  741.                 imageVector = _mm_set_ps(image[w+6][h+1][c], image[w+6][h+1][c+1], image[w+6][h+1][c+2], image[w+6][h+1][c+3]);
  742.                 kernelVector = _mm_set_ps(kernels[m][c][6][1], kernels[m][c+1][6][1],kernels[m][c+2][6][1],kernels[m][c+3][6][1]);
  743.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  744.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  745.  
  746.                 imageVector = _mm_set_ps(image[w+6][h+2][c], image[w+6][h+2][c+1], image[w+6][h+2][c+2], image[w+6][h+2][c+3]);
  747.                 kernelVector = _mm_set_ps(kernels[m][c][6][2], kernels[m][c+1][6][2],kernels[m][c+2][6][2],kernels[m][c+3][6][2]);
  748.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  749.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  750.  
  751.                 imageVector = _mm_set_ps(image[w+6][h+3][c], image[w+6][h+3][c+1], image[w+6][h+3][c+2], image[w+6][h+3][c+3]);
  752.                 kernelVector = _mm_set_ps(kernels[m][c][6][3], kernels[m][c+1][6][3],kernels[m][c+2][6][3],kernels[m][c+3][6][3]);
  753.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  754.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  755.  
  756.                 imageVector = _mm_set_ps(image[w+6][h+4][c], image[w+6][h+4][c+1], image[w+6][h+4][c+2], image[w+6][h+4][c+3]);
  757.                 kernelVector = _mm_set_ps(kernels[m][c][6][4], kernels[m][c+1][6][4],kernels[m][c+2][6][4],kernels[m][c+3][6][4]);
  758.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  759.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  760.  
  761.                 imageVector = _mm_set_ps(image[w+6][h+5][c], image[w+6][h+5][c+1], image[w+6][h+5][c+2], image[w+6][h+5][c+3]);
  762.                 kernelVector = _mm_set_ps(kernels[m][c][6][5], kernels[m][c+1][6][5],kernels[m][c+2][6][5],kernels[m][c+3][6][5]);
  763.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  764.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  765.  
  766.                 imageVector = _mm_set_ps(image[w+6][h+6][c], image[w+6][h+6][c+1], image[w+6][h+6][c+2], image[w+6][h+6][c+3]);
  767.                 kernelVector = _mm_set_ps(kernels[m][c][6][6], kernels[m][c+1][6][6],kernels[m][c+2][6][6],kernels[m][c+3][6][6]);
  768.                 productOfVectors = _mm_mul_ps(imageVector,kernelVector);
  769.                 sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
  770.  
  771.  
  772.               }
  773.             }
  774.  
  775.             _mm_store_ps(sumArray,sumOfVectors);
  776.             sum = sumArray[0] + sumArray[1] + sumArray[2] + sumArray[3];
  777.             for (; c < nchannels; c++)
  778.             {
  779.               sum += image[w][h][c] * kernels[m][c][0][0];
  780.             }
  781.          
  782.             output[m][w][h] = sum;
  783.             check++;
  784.             sumOfVectors = _mm_setzero_ps();
  785.             productOfVectors = _mm_setzero_ps();
  786.             kernelVector = _mm_setzero_ps();
  787.             imageVector = _mm_setzero_ps();
  788.           }
  789.         }
  790.       }
  791.     }
  792.   }
  793.  
  794.  
  795.  
  796.  
  /*
   * Entry point of the convolution test and timing harness.
   *
   * Command line:
   *   conv-harness <image_width> <image_height> <kernel_order>
   *                <number of channels> <number of kernels>
   *
   * Steps:
   *   1. parse and validate the five numeric arguments;
   *   2. build a random image and a random kernel set, plus two empty
   *      output matrices;
   *   3. time the reference routine multichannel_conv(), writing into
   *      control_output;
   *   4. time the routine under test, team_conv(), writing into output;
   *   5. print both timings and their ratio, then hand both outputs to
   *      check_result() for comparison.
   *
   * Matrix shapes, as requested from the allocation helpers:
   *   image          [width + kernel_order][height + kernel_order][nchannels]
   *   kernels        [nkernels][nchannels][kernel_order][kernel_order]
   *   output         [nkernels][width][height]
   *   control_output [nkernels][width][height]
   *
   * Returns 0 on completion. Exits with status 1 on a usage error or an
   * invalid argument value.
   */
  int main(int argc, char ** argv)
  {
    float *** image, **** kernels, *** output;
    float *** control_output;
    long long mul_time_my_team, mul_time_david_gregg;
    int width, height, kernel_order, nchannels, nkernels;
    struct timeval start_time, start_time2;
    struct timeval stop_time, stop_time2;

    if ( argc != 6 ) {
      fprintf(stderr, "Usage: conv-harness <image_width> <image_height> <kernel_order> <number of channels> <number of kernels>\n");
      exit(1);
    }

    width = atoi(argv[1]);
    height = atoi(argv[2]);
    kernel_order = atoi(argv[3]);
    nchannels = atoi(argv[4]);
    nkernels = atoi(argv[5]);

    /* atoi() yields 0 for non-numeric text and accepts negative numbers;
       neither is a usable matrix dimension, so reject them here instead of
       handing a bogus size to the allocation helpers */
    if ( width <= 0 || height <= 0 || nchannels <= 0 || nkernels <= 0 ) {
      fprintf(stderr, "FATAL: image width, image height, number of channels and number of kernels must all be positive integers\n");
      exit(1);
    }

    /* only a small set of odd kernel orders is supported */
    switch ( kernel_order ) {
      case 1:
      case 3:
      case 5:
      case 7: break;
      default:
      fprintf(stderr, "FATAL: kernel_order must be 1, 3, 5 or 7, not %d\n",
        kernel_order);
      exit(1);
    }

    /* allocate the matrices; the image is over-allocated by kernel_order in
       both spatial dimensions (presumably so the kernel window can run past
       the nominal image edge -- confirm against the convolution routines) */
    image = gen_random_3d_matrix(width + kernel_order, height + kernel_order,
      nchannels);
    kernels = gen_random_4d_matrix(nkernels, nchannels, kernel_order, kernel_order);
    output = new_empty_3d_matrix(nkernels, width, height);
    control_output = new_empty_3d_matrix(nkernels, width, height);

    /* use a simple multichannel convolution routine to produce control result */
    gettimeofday(&start_time2, NULL);

    multichannel_conv(image, kernels, control_output, width,
      height, nchannels, nkernels, kernel_order);

    gettimeofday(&stop_time2, NULL);

    /* elapsed microseconds; done in long long so the multiplication cannot
       overflow on platforms where long is 32 bits */
    mul_time_david_gregg =
      (long long)(stop_time2.tv_sec - start_time2.tv_sec) * 1000000LL +
      (long long)(stop_time2.tv_usec - start_time2.tv_usec);

    /* record starting time of team's code */
    gettimeofday(&start_time, NULL);

    /* perform student team's multichannel convolution */
    team_conv(image, kernels, output, width,
      height, nchannels, nkernels, kernel_order);

    /* record finishing time */
    gettimeofday(&stop_time, NULL);
    mul_time_my_team =
      (long long)(stop_time.tv_sec - start_time.tv_sec) * 1000000LL +
      (long long)(stop_time.tv_usec - start_time.tv_usec);

    printf("David Gregg conv time: %lld microseconds\n", mul_time_david_gregg);
    printf("Our Team conv time: %lld microseconds\n", mul_time_my_team);

    /* The ratio is an integer quotient (truncated). The measured team time
       is zero when the routine finishes inside the timer's 1 microsecond
       resolution, and gettimeofday() reads the wall clock, so a clock
       adjustment could even make it negative; dividing then would be
       undefined or meaningless, so no ratio is reported in that case. */
    if ( mul_time_my_team > 0 ) {
      long long speed = mul_time_david_gregg / mul_time_my_team;
      printf("Speed Factor : %lld \n ", speed);
    }
    else {
      printf("Speed Factor : n/a (team time too small to measure) \n ");
    }

    DEBUGGING(write_out(output, nkernels, width, height));

    /* now check that the team's multichannel convolution routine
       gives the same answer as the known working version */
    check_result(output, control_output, nkernels, width, height);

    return 0;
  }
RAW Paste Data
Top