Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/* Test and timing harness program for developing a multichannel
   multikernel convolution (as used in deep learning networks)

   Note there are some simplifications around this implementation,
   in particular with respect to computing the convolution at edge
   pixels of the image.

   Author: David Gregg
   Date:   February 2017

   Version 1.4 : Modified the random generator to reduce the range
                 of generated values;
                 Changed the summation in the checking code from
                 float to double to try to bring the checked value
                 closer to the "true" value

   Version 1.3 : Fixed which loop variables were being incremented
                 in write_out();
                 Fixed dimensions of output and control_output
                 matrices in main function

   Version 1.2 : Changed distribution of test data to (hopefully)
                 eliminate random walk of floating point error;
                 Also introduced checks to restrict kernel-order to
                 a small set of values

   Version 1.1 : Fixed bug in code to create 4d matrix
*/
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <sys/time.h>
#include <assert.h>
#include <omp.h>
#include <math.h>
#include <x86intrin.h>
/* The two definitions of DEBUGGING below control whether or not
   debugging information is written out. DEBUGGING(_x) wraps code that
   should only be compiled in when debugging: the first definition
   expands to its argument, the second expands to nothing.

   To put the program into debugging mode, uncomment the following line
   (and comment out the empty definition after it): */
/*#define DEBUGGING(_x) _x */
/* to stop the printing of debugging information, use the following line: */
#define DEBUGGING(_x)
#define ZERO_VALUE 0
/* write 3d matrix to stdout
 *
 * Prints a[i][j][k] for 0 <= i < dim0, 0 <= j < dim1, 0 <= k < dim2.
 * Every outer index i is introduced by a header line; every (i, j) row
 * is written on its own line with its dim2 values separated by ", ".
 * A row without elements (dim2 <= 0) is written as an empty line instead
 * of reading a[i][j][dim2-1], which would be outside the row.
 */
void write_out(float *** a, int dim0, int dim1, int dim2)
{
  int i, j, k;

  for ( i = 0; i < dim0; i++ ) {
    printf("Outer dimension number %d\n", i);
    for ( j = 0; j < dim1; j++ ) {
      if ( dim2 <= 0 ) {
        // nothing to print in this row
        printf("\n");
        continue;
      }
      for ( k = 0; k < dim2 - 1; k++ ) {
        printf("%f, ", a[i][j][k]);
      }
      // the last value of the row ends the line instead of a separator
      printf("%f\n", a[i][j][dim2-1]);
    }
  }
}
/* create new empty 4d matrix
 *
 * Builds a dim0 x dim1 x dim2 x dim3 matrix of floats that is indexed as
 * result[i][j][k][l]. Four buffers are allocated: one for each of the
 * three levels of pointers, plus a single contiguous buffer that holds
 * all the float values (the last index varies fastest). The float values
 * are left uninitialised.
 *
 * When every dimension is non-zero the four buffers are reachable as
 * result, result[0], result[0][0] and result[0][0][0].
 *
 * Returns NULL if a dimension is negative, if the total size does not
 * fit in a size_t, or if any allocation fails.
 */
float **** new_empty_4d_matrix(int dim0, int dim1, int dim2, int dim3)
{
  if ( dim0 < 0 || dim1 < 0 || dim2 < 0 || dim3 < 0 ) {
    return NULL;
  }

  const size_t d0 = (size_t) dim0;
  const size_t d1 = (size_t) dim1;
  const size_t d2 = (size_t) dim2;
  const size_t d3 = (size_t) dim3;

  // number of entries at each level; done in size_t with explicit checks
  // because the products of int dimensions can overflow
  if ( d1 != 0 && d0 > SIZE_MAX / d1 ) {
    return NULL;
  }
  const size_t n1 = d0 * d1;
  if ( d2 != 0 && n1 > SIZE_MAX / d2 ) {
    return NULL;
  }
  const size_t n2 = n1 * d2;
  if ( d3 != 0 && n2 > SIZE_MAX / d3 ) {
    return NULL;
  }
  const size_t n3 = n2 * d3;

  // the byte counts must fit in a size_t as well
  if ( d0 > SIZE_MAX / sizeof(float ***) || n1 > SIZE_MAX / sizeof(float **) ||
       n2 > SIZE_MAX / sizeof(float *) || n3 > SIZE_MAX / sizeof(float) ) {
    return NULL;
  }

  // request at least one byte so that NULL always means failure
  float **** result = malloc(d0 != 0 ? d0 * sizeof *result : 1);
  float *** mat1 = malloc(n1 != 0 ? n1 * sizeof *mat1 : 1);
  float ** mat2 = malloc(n2 != 0 ? n2 * sizeof *mat2 : 1);
  float * mat3 = malloc(n3 != 0 ? n3 * sizeof *mat3 : 1);
  if ( result == NULL || mat1 == NULL || mat2 == NULL || mat3 == NULL ) {
    free(mat3);
    free(mat2);
    free(mat1);
    free(result);
    return NULL;
  }

  // point each level of pointers at its slice of the next level
  for ( size_t i = 0; i < d0; i++ ) {
    result[i] = &mat1[i * d1];
    for ( size_t j = 0; j < d1; j++ ) {
      const size_t row = i * d1 + j;
      result[i][j] = &mat2[row * d2];
      for ( size_t k = 0; k < d2; k++ ) {
        result[i][j][k] = &mat3[(row * d2 + k) * d3];
      }
    }
  }
  return result;
}
/* create new empty 3d matrix
 *
 * Builds a dim0 x dim1 x dim2 matrix of floats, indexed as
 * result[i][j][k], by creating a 4d matrix whose first dimension is 1
 * and keeping only its single entry. The float values are not
 * initialised here.
 *
 * Returns NULL if the underlying 4d matrix could not be created.
 */
float *** new_empty_3d_matrix(int dim0, int dim1, int dim2)
{
  // create a 4d matrix with single first dimension
  float **** mat4d = new_empty_4d_matrix(1, dim0, dim1, dim2);
  if ( mat4d == NULL ) {
    return NULL;
  }

  // now throw away our first dimension: keep its only entry and release
  // the outermost pointer array, which is no longer needed
  float *** mat3d = mat4d[0];
  free(mat4d);
  return mat3d;
}
/* take a copy of the matrix and return it in a newly allocated matrix
 *
 * source_matrix is read at [i][j][k][l] for every index below
 * dim0, dim1, dim2 and dim3 respectively; the values are copied
 * element by element into a matrix created with new_empty_4d_matrix().
 *
 * Returns the new matrix, or NULL if it could not be allocated.
 */
float **** copy_4d_matrix(float **** source_matrix, int dim0,
                          int dim1, int dim2, int dim3)
{
  int i, j, k, l;
  float **** result = new_empty_4d_matrix(dim0, dim1, dim2, dim3);

  if ( result == NULL ) {
    return NULL;
  }

  for ( i = 0; i < dim0; i++ ) {
    for ( j = 0; j < dim1; j++ ) {
      for ( k = 0; k < dim2; k++ ) {
        for ( l = 0; l < dim3; l++ ) {
          result[i][j][k][l] = source_matrix[i][j][k][l];
        }
      }
    }
  }
  return result;
}
/* create a matrix and fill it with random numbers
 *
 * Allocates a dim0 x dim1 x dim2 x dim3 matrix with
 * new_empty_4d_matrix() and fills every element with a pseudorandom
 * value of the form r / 2^16, where r is an integer in 0 .. 2^12 - 1.
 * The generator is re-seeded on every call from the microsecond part
 * of the current time.
 *
 * Returns the new matrix, or NULL if it could not be allocated.
 */
float **** gen_random_4d_matrix(int dim0, int dim1, int dim2, int dim3)
{
  int i, j, k, l;
  struct timeval seedtime;
  float **** result = new_empty_4d_matrix(dim0, dim1, dim2, dim3);

  if ( result == NULL ) {
    return NULL;
  }

  /* use the microsecond part of the current time as a pseudorandom seed */
  gettimeofday(&seedtime, NULL);
  srandom((unsigned int) seedtime.tv_usec);

  /* fill the matrix with random numbers */
  const int range = 1 << 12; // 2^12
  const int bias = 1 << 16;  // 2^16
  for ( i = 0; i < dim0; i++ ) {
    for ( j = 0; j < dim1; j++ ) {
      for ( k = 0; k < dim2; k++ ) {
        for ( l = 0; l < dim3; l++ ) {
          // random() yields a non-negative pseudorandom integer
          const long raw = random();
          // now cut down the range and scale the value down to reduce
          // the likelihood of large floating point round-off errors
          const int reduced_range = (int) (raw % range);
          result[i][j][k][l] = ((float) reduced_range) / ((float) bias);
        }
      }
    }
  }
  return result;
}
/* create a 3d matrix and fill it with random numbers
 *
 * Builds a dim0 x dim1 x dim2 matrix, indexed as result[i][j][k], by
 * generating a random 4d matrix whose first dimension is 1 and keeping
 * only its single entry.
 *
 * Returns NULL if the underlying 4d matrix could not be created.
 */
float *** gen_random_3d_matrix(int dim0, int dim1, int dim2)
{
  // create a 4d matrix with single first dimension
  float **** mat4d = gen_random_4d_matrix(1, dim0, dim1, dim2);
  if ( mat4d == NULL ) {
    return NULL;
  }

  // now throw away our first dimension: keep its only entry and release
  // the outermost pointer array, which is no longer needed
  float *** mat3d = mat4d[0];
  free(mat4d);
  return mat3d;
}
/* check the sum of absolute differences is within reasonable epsilon
 *
 * Compares result[i][j][k] with control[i][j][k] for every index below
 * dim0, dim1 and dim2, accumulating the absolute differences in a
 * double. If the total exceeds EPSILON a warning is written to stderr;
 * otherwise a confirmation is written to stdout. Neither matrix is
 * modified.
 */
void check_result(float *** result, float *** control,
                  int dim0, int dim1, int dim2)
{
  int i, j, k;
  double sum_abs_diff = 0.0;
  const double EPSILON = 0.0625;

  for ( i = 0; i < dim0; i++ ) {
    for ( j = 0; j < dim1; j++ ) {
      for ( k = 0; k < dim2; k++ ) {
        double diff = fabs(control[i][j][k] - result[i][j][k]);
        // an absolute value is never negative; a NaN difference would
        // also trip this assertion
        assert( diff >= 0.0 );
        sum_abs_diff = sum_abs_diff + diff;
      }
    }
  }

  if ( sum_abs_diff > EPSILON ) {
    fprintf(stderr, "WARNING: sum of absolute differences (%f) > EPSILON (%f)\n",
            sum_abs_diff, EPSILON);
  }
  else {
    printf("COMMENT: sum of absolute differences (%f) within acceptable range (%f)\n", sum_abs_diff, EPSILON);
  }
}
/* the slow but correct version of the convolution written by David
 *
 * For every kernel m and every output position (w, h) this computes
 *
 *   output[m][w][h] = sum over c, x, y of
 *                     image[w+x][h+y][c] * kernels[m][c][x][y]
 *
 * with 0 <= c < nchannels and 0 <= x, y < kernel_order. The sum is
 * accumulated in a double and stored as a float.
 *
 * Because image is read at [w+x][h+y], it must provide at least
 * width + kernel_order - 1 entries in its first dimension and
 * height + kernel_order - 1 entries in its second.
 */
void multichannel_conv(float *** image, float **** kernels, float *** output,
                       int width, int height, int nchannels, int nkernels,
                       int kernel_order)
{
  int h, w, x, y, c, m;

  for ( m = 0; m < nkernels; m++ ) {     // each kernel
    for ( w = 0; w < width; w++ ) {      // each x coordinate of the image
      for ( h = 0; h < height; h++ ) {   // each y coordinate of the image
        double sum = 0.0;
        for ( c = 0; c < nchannels; c++ ) {        // each channel of image and kernel
          for ( x = 0; x < kernel_order; x++ ) {   // each x coordinate of the kernel
            for ( y = 0; y < kernel_order; y++ ) { // each y coordinate of the kernel
              sum += image[w+x][h+y][c] * kernels[m][c][x][y];
            }
          }
        }
        output[m][w][h] = sum;
      }
    }
  }
}
- /* the fast version of matmul written by the team */
- void team_conv(float *** image, float **** kernels, float *** output,
- int width, int height, int nchannels, int nkernels,
- int kernel_order)
- {
- float *myArray = malloc( 4 * sizeof(float) );
- __m128 myKernel, myImage, productOfMul, sumOfVectors, productOfVectors;
- float arrr[4];
- // this call here is just dummy code
- // insert your own code instead
- int h, w, x, y, c, m,i,b;
- int check = 1;
- sumOfVectors = _mm_setzero_ps();
- __m128 sumOfVectors2 = _mm_setzero_ps();
- productOfVectors = _mm_setzero_ps();
- __m128 productOfVectors2 = _mm_setzero_ps();
- __m128 kernelVector = _mm_setzero_ps();
- __m128 kernelVector2 = _mm_setzero_ps();
- __m128 imageVector = _mm_setzero_ps();
- __m128 imageVector2 = _mm_setzero_ps();
- __m128 totalSumOfVectors = _mm_setzero_ps();
- float sumArray[8] = {0.0,0.0,0.0,0.0};
- if(kernel_order == 1)
- {
- for ( m = 0; m < nkernels; m++ ) {
- for ( w = 0; w < width; w++ ) {
- for ( h = 0; h < height; h++ ) {
- float sum = 0.0;
- if(nchannels >= 4)
- {
- for ( c = 0; c < nchannels-3; c=c+4 ) {
- imageVector = _mm_set_ps(image[w][h][c], image[w][h][c+1], image[w][h][c+2], image[w][h][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][0][0], kernels[m][c+1][0][0],kernels[m][c+2][0][0],kernels[m][c+3][0][0]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- //sum += image[w][h][c] * kernels[m][c][0][0];
- }
- }
- else if(nchannels >= 8)
- {
- for ( c = 0; c < nchannels-7; c=c+8 ) {
- imageVector = _mm_set_ps(image[w][h][c], image[w][h][c+1], image[w][h][c+2], image[w][h][c+3]);
- imageVector2 = _mm_set_ps(image[w][h][c+4], image[w][h][c+5], image[w][h][c+6], image[w][h][c+7]);
- kernelVector = _mm_set_ps(kernels[m][c][0][0], kernels[m][c+1][0][0],kernels[m][c+2][0][0],kernels[m][c+3][0][0]);
- kernelVector2 = _mm_set_ps(kernels[m][c+4][0][0], kernels[m][c+5][0][0],kernels[m][c+6][0][0],kernels[m][c+7][0][0]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- productOfVectors2 = _mm_mul_ps(imageVector2,kernelVector2);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- sumOfVectors2 = _mm_add_ps(sumOfVectors2, productOfVectors2);
- sumOfVectors = _mm_add_ps(sumOfVectors, sumOfVectors2);
- //sum += image[w][h][c] * kernels[m][c][0][0];
- }
- }
- for (; c < nchannels; c++)
- {
- sum += image[w][h][c] * kernels[m][c][0][0];
- }
- sumOfVectors = _mm_hadd_ps(sumOfVectors,sumOfVectors);
- sumOfVectors = _mm_hadd_ps(sumOfVectors,sumOfVectors);
- sum = _mm_cvtss_f32(sumOfVectors);
- //_mm_store_ps(sumArray,sumOfVectors);
- //sum = sumArray[0] + sumArray[1] + sumArray[2] + sumArray[3] + sumArray[4] + sumArray[5] + sumArray[6] + sumArray[7];
- output[m][w][h] = sum;
- sumOfVectors = _mm_setzero_ps();
- sumOfVectors2 = _mm_setzero_ps();
- productOfVectors = _mm_setzero_ps();
- productOfVectors2 = _mm_setzero_ps();
- kernelVector = _mm_setzero_ps();
- kernelVector2 = _mm_setzero_ps();
- imageVector = _mm_setzero_ps();
- imageVector2 = _mm_setzero_ps();
- totalSumOfVectors = _mm_setzero_ps();;
- }
- }
- }
- }
- else
- {
- sumOfVectors = _mm_setzero_ps();
- productOfVectors = _mm_setzero_ps();
- __m128 kernelVector = _mm_setzero_ps();
- __m128 imageVector = _mm_setzero_ps();
- int q=0;
- float test[4] = {0.0, 0.0, 0.0, 0.0};
- int h, w, x, y, c, m, check;
- float sumArray[4] = {0.0, 0.0, 0.0, 0.0};
- check = 0;
- for ( m = 0; m < nkernels; m++ )
- {
- for ( w = 0; w < width; w++ )
- {
- for(h = 0; h < height; h++)
- {
- float sum = 0.0;
- for ( c = 0; c < nchannels-3; c=c+4 )
- {
- if(kernel_order == 3)
- {
- imageVector = _mm_set_ps(image[w+0][h+0][c], image[w+0][h+0][c+1], image[w+0][h+0][c+2], image[w+0][h+0][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][0][0], kernels[m][c+1][0][0],kernels[m][c+2][0][0],kernels[m][c+3][0][0]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+0][h+1][c], image[w+0][h+1][c+1], image[w+0][h+1][c+2], image[w+0][h+1][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][0][1], kernels[m][c+1][0][1],kernels[m][c+2][0][1],kernels[m][c+3][0][1]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+0][h+2][c], image[w+0][h+2][c+1], image[w+0][h+2][c+2], image[w+0][h+2][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][0][2], kernels[m][c+1][0][2],kernels[m][c+2][0][2],kernels[m][c+3][0][2]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+1][h+0][c], image[w+1][h+0][c+1], image[w+1][h+0][c+2], image[w+1][h+0][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][1][0], kernels[m][c+1][1][0],kernels[m][c+2][1][0],kernels[m][c+3][1][0]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+1][h+1][c], image[w+1][h+1][c+1], image[w+1][h+1][c+2], image[w+1][h+1][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][1][1], kernels[m][c+1][1][1],kernels[m][c+2][1][1],kernels[m][c+3][1][1]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+1][h+2][c], image[w+1][h+2][c+1], image[w+1][h+2][c+2], image[w+1][h+2][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][1][2], kernels[m][c+1][1][2],kernels[m][c+2][1][2],kernels[m][c+3][1][2]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+2][h+0][c], image[w+2][h+0][c+1], image[w+2][h+0][c+2], image[w+2][h+0][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][2][0], kernels[m][c+1][2][0],kernels[m][c+2][2][0],kernels[m][c+3][2][0]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+2][h+1][c], image[w+2][h+1][c+1], image[w+2][h+1][c+2], image[w+2][h+1][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][2][1], kernels[m][c+1][2][1],kernels[m][c+2][2][1],kernels[m][c+3][2][1]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+2][h+2][c], image[w+2][h+2][c+1], image[w+2][h+2][c+2], image[w+2][h+2][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][2][2], kernels[m][c+1][2][2],kernels[m][c+2][2][2],kernels[m][c+3][2][2]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- }
- else if(kernel_order == 5)
- {
- imageVector = _mm_set_ps(image[w+0][h+0][c], image[w+0][h+0][c+1], image[w+0][h+0][c+2], image[w+0][h+0][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][0][0], kernels[m][c+1][0][0],kernels[m][c+2][0][0],kernels[m][c+3][0][0]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+0][h+1][c], image[w+0][h+1][c+1], image[w+0][h+1][c+2], image[w+0][h+1][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][0][1], kernels[m][c+1][0][1],kernels[m][c+2][0][1],kernels[m][c+3][0][1]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+0][h+2][c], image[w+0][h+2][c+1], image[w+0][h+2][c+2], image[w+0][h+2][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][0][2], kernels[m][c+1][0][2],kernels[m][c+2][0][2],kernels[m][c+3][0][2]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+0][h+3][c], image[w+0][h+3][c+1], image[w+0][h+3][c+2], image[w+0][h+3][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][0][3], kernels[m][c+1][0][3],kernels[m][c+2][0][3],kernels[m][c+3][0][3]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+0][h+4][c], image[w+0][h+4][c+1], image[w+0][h+4][c+2], image[w+0][h+4][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][0][4], kernels[m][c+1][0][4],kernels[m][c+2][0][4],kernels[m][c+3][0][4]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+1][h+0][c], image[w+1][h+0][c+1], image[w+1][h+0][c+2], image[w+1][h+0][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][1][0], kernels[m][c+1][1][0],kernels[m][c+2][1][0],kernels[m][c+3][1][0]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+1][h+1][c], image[w+1][h+1][c+1], image[w+1][h+1][c+2], image[w+1][h+1][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][1][1], kernels[m][c+1][1][1],kernels[m][c+2][1][1],kernels[m][c+3][1][1]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+1][h+2][c], image[w+1][h+2][c+1], image[w+1][h+2][c+2], image[w+1][h+2][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][1][2], kernels[m][c+1][1][2],kernels[m][c+2][1][2],kernels[m][c+3][1][2]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+1][h+3][c], image[w+1][h+3][c+1], image[w+1][h+3][c+2], image[w+1][h+3][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][1][3], kernels[m][c+1][1][3],kernels[m][c+2][1][3],kernels[m][c+3][1][3]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+1][h+4][c], image[w+1][h+4][c+1], image[w+1][h+4][c+2], image[w+1][h+4][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][1][4], kernels[m][c+1][1][4],kernels[m][c+2][1][4],kernels[m][c+3][1][4]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+2][h+0][c], image[w+2][h+0][c+1], image[w+2][h+0][c+2], image[w+2][h+0][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][2][0], kernels[m][c+1][2][0],kernels[m][c+2][2][0],kernels[m][c+3][2][0]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+2][h+1][c], image[w+2][h+1][c+1], image[w+2][h+1][c+2], image[w+2][h+1][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][2][1], kernels[m][c+1][2][1],kernels[m][c+2][2][1],kernels[m][c+3][2][1]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+2][h+2][c], image[w+2][h+2][c+1], image[w+2][h+2][c+2], image[w+2][h+2][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][2][2], kernels[m][c+1][2][2],kernels[m][c+2][2][2],kernels[m][c+3][2][2]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+2][h+3][c], image[w+2][h+3][c+1], image[w+2][h+3][c+2], image[w+2][h+3][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][2][3], kernels[m][c+1][2][3],kernels[m][c+2][2][3],kernels[m][c+3][2][3]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+2][h+4][c], image[w+2][h+4][c+1], image[w+2][h+4][c+2], image[w+2][h+4][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][2][4], kernels[m][c+1][2][4],kernels[m][c+2][2][4],kernels[m][c+3][2][4]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+3][h+0][c], image[w+3][h+0][c+1], image[w+3][h+0][c+2], image[w+3][h+0][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][3][0], kernels[m][c+1][3][0],kernels[m][c+2][3][0],kernels[m][c+3][3][0]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+3][h+1][c], image[w+3][h+1][c+1], image[w+3][h+1][c+2], image[w+3][h+1][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][3][1], kernels[m][c+1][3][1],kernels[m][c+2][3][1],kernels[m][c+3][3][1]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+3][h+2][c], image[w+3][h+2][c+1], image[w+3][h+2][c+2], image[w+3][h+2][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][3][2], kernels[m][c+1][3][2],kernels[m][c+2][3][2],kernels[m][c+3][3][2]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+3][h+3][c], image[w+3][h+3][c+1], image[w+3][h+3][c+2], image[w+3][h+3][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][3][3], kernels[m][c+1][3][3],kernels[m][c+2][3][3],kernels[m][c+3][3][3]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+3][h+4][c], image[w+3][h+4][c+1], image[w+3][h+4][c+2], image[w+3][h+4][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][3][4], kernels[m][c+1][3][4],kernels[m][c+2][3][4],kernels[m][c+3][3][4]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+4][h+0][c], image[w+4][h+0][c+1], image[w+4][h+0][c+2], image[w+4][h+0][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][4][0], kernels[m][c+1][4][0],kernels[m][c+2][4][0],kernels[m][c+3][4][0]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+4][h+1][c], image[w+4][h+1][c+1], image[w+4][h+1][c+2], image[w+4][h+1][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][4][1], kernels[m][c+1][4][1],kernels[m][c+2][4][1],kernels[m][c+3][4][1]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+4][h+2][c], image[w+4][h+2][c+1], image[w+4][h+2][c+2], image[w+4][h+2][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][4][2], kernels[m][c+1][4][2],kernels[m][c+2][4][2],kernels[m][c+3][4][2]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+4][h+3][c], image[w+4][h+3][c+1], image[w+4][h+3][c+2], image[w+4][h+3][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][4][3], kernels[m][c+1][4][3],kernels[m][c+2][4][3],kernels[m][c+3][4][3]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+4][h+4][c], image[w+4][h+4][c+1], image[w+4][h+4][c+2], image[w+4][h+4][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][4][4], kernels[m][c+1][4][4],kernels[m][c+2][4][4],kernels[m][c+3][4][4]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- }
- else if(kernel_order == 7)
- {
- imageVector = _mm_set_ps(image[w+0][h+0][c], image[w+0][h+0][c+1], image[w+0][h+0][c+2], image[w+0][h+0][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][0][0], kernels[m][c+1][0][0],kernels[m][c+2][0][0],kernels[m][c+3][0][0]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+0][h+1][c], image[w+0][h+1][c+1], image[w+0][h+1][c+2], image[w+0][h+1][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][0][1], kernels[m][c+1][0][1],kernels[m][c+2][0][1],kernels[m][c+3][0][1]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+0][h+2][c], image[w+0][h+2][c+1], image[w+0][h+2][c+2], image[w+0][h+2][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][0][2], kernels[m][c+1][0][2],kernels[m][c+2][0][2],kernels[m][c+3][0][2]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+0][h+3][c], image[w+0][h+3][c+1], image[w+0][h+3][c+2], image[w+0][h+3][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][0][3], kernels[m][c+1][0][3],kernels[m][c+2][0][3],kernels[m][c+3][0][3]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+0][h+4][c], image[w+0][h+4][c+1], image[w+0][h+4][c+2], image[w+0][h+4][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][0][4], kernels[m][c+1][0][4],kernels[m][c+2][0][4],kernels[m][c+3][0][4]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+0][h+5][c], image[w+0][h+5][c+1], image[w+0][h+5][c+2], image[w+0][h+5][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][0][5], kernels[m][c+1][0][5],kernels[m][c+2][0][5],kernels[m][c+3][0][5]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+0][h+6][c], image[w+0][h+6][c+1], image[w+0][h+6][c+2], image[w+0][h+6][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][0][6], kernels[m][c+1][0][6],kernels[m][c+2][0][6],kernels[m][c+3][0][6]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+1][h+0][c], image[w+1][h+0][c+1], image[w+1][h+0][c+2], image[w+1][h+0][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][1][0], kernels[m][c+1][1][0],kernels[m][c+2][1][0],kernels[m][c+3][1][0]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+1][h+1][c], image[w+1][h+1][c+1], image[w+1][h+1][c+2], image[w+1][h+1][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][1][1], kernels[m][c+1][1][1],kernels[m][c+2][1][1],kernels[m][c+3][1][1]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+1][h+2][c], image[w+1][h+2][c+1], image[w+1][h+2][c+2], image[w+1][h+2][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][1][2], kernels[m][c+1][1][2],kernels[m][c+2][1][2],kernels[m][c+3][1][2]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+1][h+3][c], image[w+1][h+3][c+1], image[w+1][h+3][c+2], image[w+1][h+3][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][1][3], kernels[m][c+1][1][3],kernels[m][c+2][1][3],kernels[m][c+3][1][3]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+1][h+4][c], image[w+1][h+4][c+1], image[w+1][h+4][c+2], image[w+1][h+4][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][1][4], kernels[m][c+1][1][4],kernels[m][c+2][1][4],kernels[m][c+3][1][4]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+1][h+5][c], image[w+1][h+5][c+1], image[w+1][h+5][c+2], image[w+1][h+5][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][1][5], kernels[m][c+1][1][5],kernels[m][c+2][1][5],kernels[m][c+3][1][5]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+1][h+6][c], image[w+1][h+6][c+1], image[w+1][h+6][c+2], image[w+1][h+6][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][1][6], kernels[m][c+1][1][6],kernels[m][c+2][1][6],kernels[m][c+3][1][6]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+2][h+0][c], image[w+2][h+0][c+1], image[w+2][h+0][c+2], image[w+2][h+0][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][2][0], kernels[m][c+1][2][0],kernels[m][c+2][2][0],kernels[m][c+3][2][0]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+2][h+1][c], image[w+2][h+1][c+1], image[w+2][h+1][c+2], image[w+2][h+1][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][2][1], kernels[m][c+1][2][1],kernels[m][c+2][2][1],kernels[m][c+3][2][1]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+2][h+2][c], image[w+2][h+2][c+1], image[w+2][h+2][c+2], image[w+2][h+2][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][2][2], kernels[m][c+1][2][2],kernels[m][c+2][2][2],kernels[m][c+3][2][2]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+2][h+3][c], image[w+2][h+3][c+1], image[w+2][h+3][c+2], image[w+2][h+3][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][2][3], kernels[m][c+1][2][3],kernels[m][c+2][2][3],kernels[m][c+3][2][3]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+2][h+4][c], image[w+2][h+4][c+1], image[w+2][h+4][c+2], image[w+2][h+4][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][2][4], kernels[m][c+1][2][4],kernels[m][c+2][2][4],kernels[m][c+3][2][4]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+2][h+5][c], image[w+2][h+5][c+1], image[w+2][h+5][c+2], image[w+2][h+5][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][2][5], kernels[m][c+1][2][5],kernels[m][c+2][2][5],kernels[m][c+3][2][5]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+2][h+6][c], image[w+2][h+6][c+1], image[w+2][h+6][c+2], image[w+2][h+6][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][2][6], kernels[m][c+1][2][6],kernels[m][c+2][2][6],kernels[m][c+3][2][6]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+3][h+0][c], image[w+3][h+0][c+1], image[w+3][h+0][c+2], image[w+3][h+0][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][3][0], kernels[m][c+1][3][0],kernels[m][c+2][3][0],kernels[m][c+3][3][0]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+3][h+1][c], image[w+3][h+1][c+1], image[w+3][h+1][c+2], image[w+3][h+1][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][3][1], kernels[m][c+1][3][1],kernels[m][c+2][3][1],kernels[m][c+3][3][1]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+3][h+2][c], image[w+3][h+2][c+1], image[w+3][h+2][c+2], image[w+3][h+2][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][3][2], kernels[m][c+1][3][2],kernels[m][c+2][3][2],kernels[m][c+3][3][2]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+3][h+3][c], image[w+3][h+3][c+1], image[w+3][h+3][c+2], image[w+3][h+3][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][3][3], kernels[m][c+1][3][3],kernels[m][c+2][3][3],kernels[m][c+3][3][3]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+3][h+4][c], image[w+3][h+4][c+1], image[w+3][h+4][c+2], image[w+3][h+4][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][3][4], kernels[m][c+1][3][4],kernels[m][c+2][3][4],kernels[m][c+3][3][4]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+3][h+5][c], image[w+3][h+5][c+1], image[w+3][h+5][c+2], image[w+3][h+5][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][3][5], kernels[m][c+1][3][5],kernels[m][c+2][3][5],kernels[m][c+3][3][5]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+3][h+6][c], image[w+3][h+6][c+1], image[w+3][h+6][c+2], image[w+3][h+6][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][3][6], kernels[m][c+1][3][6],kernels[m][c+2][3][6],kernels[m][c+3][3][6]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+4][h+0][c], image[w+4][h+0][c+1], image[w+4][h+0][c+2], image[w+4][h+0][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][4][0], kernels[m][c+1][4][0],kernels[m][c+2][4][0],kernels[m][c+3][4][0]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+4][h+1][c], image[w+4][h+1][c+1], image[w+4][h+1][c+2], image[w+4][h+1][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][4][1], kernels[m][c+1][4][1],kernels[m][c+2][4][1],kernels[m][c+3][4][1]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+4][h+2][c], image[w+4][h+2][c+1], image[w+4][h+2][c+2], image[w+4][h+2][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][4][2], kernels[m][c+1][4][2],kernels[m][c+2][4][2],kernels[m][c+3][4][2]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+4][h+3][c], image[w+4][h+3][c+1], image[w+4][h+3][c+2], image[w+4][h+3][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][4][3], kernels[m][c+1][4][3],kernels[m][c+2][4][3],kernels[m][c+3][4][3]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+4][h+4][c], image[w+4][h+4][c+1], image[w+4][h+4][c+2], image[w+4][h+4][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][4][4], kernels[m][c+1][4][4],kernels[m][c+2][4][4],kernels[m][c+3][4][4]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+4][h+5][c], image[w+4][h+5][c+1], image[w+4][h+5][c+2], image[w+4][h+5][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][4][5], kernels[m][c+1][4][5],kernels[m][c+2][4][5],kernels[m][c+3][4][5]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+4][h+6][c], image[w+4][h+6][c+1], image[w+4][h+6][c+2], image[w+4][h+6][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][4][6], kernels[m][c+1][4][6],kernels[m][c+2][4][6],kernels[m][c+3][4][6]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+5][h+0][c], image[w+5][h+0][c+1], image[w+5][h+0][c+2], image[w+5][h+0][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][5][0], kernels[m][c+1][5][0],kernels[m][c+2][5][0],kernels[m][c+3][5][0]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+5][h+1][c], image[w+5][h+1][c+1], image[w+5][h+1][c+2], image[w+5][h+1][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][5][1], kernels[m][c+1][5][1],kernels[m][c+2][5][1],kernels[m][c+3][5][1]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+5][h+2][c], image[w+5][h+2][c+1], image[w+5][h+2][c+2], image[w+5][h+2][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][5][2], kernels[m][c+1][5][2],kernels[m][c+2][5][2],kernels[m][c+3][5][2]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+5][h+3][c], image[w+5][h+3][c+1], image[w+5][h+3][c+2], image[w+5][h+3][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][5][3], kernels[m][c+1][5][3],kernels[m][c+2][5][3],kernels[m][c+3][5][3]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+5][h+4][c], image[w+5][h+4][c+1], image[w+5][h+4][c+2], image[w+5][h+4][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][5][4], kernels[m][c+1][5][4],kernels[m][c+2][5][4],kernels[m][c+3][5][4]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+5][h+5][c], image[w+5][h+5][c+1], image[w+5][h+5][c+2], image[w+5][h+5][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][5][5], kernels[m][c+1][5][5],kernels[m][c+2][5][5],kernels[m][c+3][5][5]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+5][h+6][c], image[w+5][h+6][c+1], image[w+5][h+6][c+2], image[w+5][h+6][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][5][6], kernels[m][c+1][5][6],kernels[m][c+2][5][6],kernels[m][c+3][5][6]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+6][h+0][c], image[w+6][h+0][c+1], image[w+6][h+0][c+2], image[w+6][h+0][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][6][0], kernels[m][c+1][6][0],kernels[m][c+2][6][0],kernels[m][c+3][6][0]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+6][h+1][c], image[w+6][h+1][c+1], image[w+6][h+1][c+2], image[w+6][h+1][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][6][1], kernels[m][c+1][6][1],kernels[m][c+2][6][1],kernels[m][c+3][6][1]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+6][h+2][c], image[w+6][h+2][c+1], image[w+6][h+2][c+2], image[w+6][h+2][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][6][2], kernels[m][c+1][6][2],kernels[m][c+2][6][2],kernels[m][c+3][6][2]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+6][h+3][c], image[w+6][h+3][c+1], image[w+6][h+3][c+2], image[w+6][h+3][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][6][3], kernels[m][c+1][6][3],kernels[m][c+2][6][3],kernels[m][c+3][6][3]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+6][h+4][c], image[w+6][h+4][c+1], image[w+6][h+4][c+2], image[w+6][h+4][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][6][4], kernels[m][c+1][6][4],kernels[m][c+2][6][4],kernels[m][c+3][6][4]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+6][h+5][c], image[w+6][h+5][c+1], image[w+6][h+5][c+2], image[w+6][h+5][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][6][5], kernels[m][c+1][6][5],kernels[m][c+2][6][5],kernels[m][c+3][6][5]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- imageVector = _mm_set_ps(image[w+6][h+6][c], image[w+6][h+6][c+1], image[w+6][h+6][c+2], image[w+6][h+6][c+3]);
- kernelVector = _mm_set_ps(kernels[m][c][6][6], kernels[m][c+1][6][6],kernels[m][c+2][6][6],kernels[m][c+3][6][6]);
- productOfVectors = _mm_mul_ps(imageVector,kernelVector);
- sumOfVectors = _mm_add_ps(sumOfVectors, productOfVectors);
- }
- }
- _mm_store_ps(sumArray,sumOfVectors);
- sum = sumArray[0] + sumArray[1] + sumArray[2] + sumArray[3];
- for (; c < nchannels; c++)
- {
- sum += image[w][h][c] * kernels[m][c][0][0];
- }
- output[m][w][h] = sum;
- check++;
- sumOfVectors = _mm_setzero_ps();
- productOfVectors = _mm_setzero_ps();
- kernelVector = _mm_setzero_ps();
- imageVector = _mm_setzero_ps();
- }
- }
- }
- }
- }
/* Entry point of the convolution test/timing harness.
 *
 * Usage: conv-harness <image_width> <image_height> <kernel_order>
 *                     <number of channels> <number of kernels>
 *
 * Builds the input image and kernels with the gen_random_*_matrix helpers,
 * runs the control routine multichannel_conv() and the team routine
 * team_conv() on the same inputs, prints both elapsed wall-clock times in
 * microseconds plus their integer ratio, and finally passes both outputs to
 * check_result().
 *
 * Returns 0 on completion; exits with status 1 when the argument count is
 * wrong, kernel_order is not one of 1, 3, 5, 7, or any size is not positive. */
int main(int argc, char ** argv)
{
  /* logical shapes: image[W][H][C], kernels[M][C][K][K], output[M][W][H] */
  float *** image, **** kernels, *** output;
  float *** control_output;
  long long mul_time_my_team, mul_time_david_gregg;
  int width, height, kernel_order, nchannels, nkernels;
  struct timeval start_time, start_time2;
  struct timeval stop_time, stop_time2;

  if ( argc != 6 ) {
    fprintf(stderr, "Usage: conv-harness <image_width> <image_height> <kernel_order> <number of channels> <number of kernels>\n");
    exit(1);
  }

  width = atoi(argv[1]);
  height = atoi(argv[2]);
  kernel_order = atoi(argv[3]);
  nchannels = atoi(argv[4]);
  nkernels = atoi(argv[5]);

  switch ( kernel_order ) {
  case 1:
  case 3:
  case 5:
  case 7: break;
  default:
    fprintf(stderr, "FATAL: kernel_order must be 1, 3, 5 or 7, not %d\n",
            kernel_order);
    exit(1);
  }

  /* atoi() returns 0 for non-numeric text and passes negative numbers
     through unchanged; neither is a usable matrix dimension, so stop here
     instead of handing them to the allocators. */
  if ( width < 1 || height < 1 || nchannels < 1 || nkernels < 1 ) {
    fprintf(stderr, "FATAL: image width, image height, number of channels and number of kernels must all be positive\n");
    exit(1);
  }

  /* allocate the matrices; the image is allocated kernel_order elements
     larger than width/height in its first two dimensions */
  image = gen_random_3d_matrix(width+kernel_order, height + kernel_order,
                               nchannels);
  kernels = gen_random_4d_matrix(nkernels, nchannels, kernel_order, kernel_order);
  output = new_empty_3d_matrix(nkernels, width, height);
  control_output = new_empty_3d_matrix(nkernels, width, height);

  /* use a simple multichannel convolution routine to produce control result */
  gettimeofday(&start_time2, NULL);
  multichannel_conv(image, kernels, control_output, width,
                    height, nchannels, nkernels, kernel_order);
  gettimeofday(&stop_time2, NULL);
  /* widen to long long before scaling so the product cannot overflow a
     32-bit long */
  mul_time_david_gregg =
    (long long)(stop_time2.tv_sec - start_time2.tv_sec) * 1000000LL +
    (long long)(stop_time2.tv_usec - start_time2.tv_usec);

  /* record starting time of team's code */
  gettimeofday(&start_time, NULL);

  /* perform student team's multichannel convolution */
  team_conv(image, kernels, output, width,
            height, nchannels, nkernels, kernel_order);

  /* record finishing time */
  gettimeofday(&stop_time, NULL);
  mul_time_my_team =
    (long long)(stop_time.tv_sec - start_time.tv_sec) * 1000000LL +
    (long long)(stop_time.tv_usec - start_time.tv_usec);

  printf("David Gregg conv time: %lld microseconds\n", mul_time_david_gregg);
  printf("Our Team conv time: %lld microseconds\n", mul_time_my_team);

  /* For tiny inputs the team routine can finish in under one microsecond
     (or the wall clock can step backwards), which leaves a zero or negative
     divisor; integer division by zero is undefined behaviour, so only form
     the ratio when the divisor is positive. */
  if ( mul_time_my_team > 0 ) {
    long long speed = mul_time_david_gregg / mul_time_my_team;
    printf("Speed Factor : %lld \n ", speed);
  }
  else {
    printf("Speed Factor : not available (team time below timer resolution)\n");
  }

  DEBUGGING(write_out(output, nkernels, width, height));

  /* now check that the team's multichannel convolution routine
     gives the same answer as the known working version */
  check_result(output, control_output, nkernels, width, height);

  return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement