Advertisement
Guest User

Untitled

a guest
Jun 25th, 2017
64
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 2.32 KB | None | 0 0
  1. /* unoptimized
  2. double sumDBIndicies(double* input1, IndiciesInput indicies)
  3. {
  4.     double accumulate = 0;
  5.     for(unsigned int i = 0; i < indicies.nSingleIndicies; i++)
  6.     {
  7.         accumulate += input1[indicies.singleIndicies[i]];
  8.     }
  9.     for(unsigned int i = 0; i < indicies.nRanges; i++)
  10.     {
  11.         for(double* address = indicies.rangeStartPositions[i] + input1; address < indicies.rangeStartPositions[i] + input1 + indicies.rangeLengths[i]; address++)
  12.         {
  13.             accumulate += *address;
  14.         }
  15.     }
  16.     return accumulate;
  17. }
  18. */
  19. double sumDBIndicies(double* input1, IndiciesInput indicies)
  20. {
  21.     double accumulate = 0;
  22.     for(unsigned int i = 0; i < indicies.nSingleIndicies; i++)
  23.     {
  24.         accumulate += input1[indicies.singleIndicies[i]];
  25.     }
  26.     for(unsigned int i = 0; i < indicies.nRanges; i++)
  27.     {
  28.         unsigned int masked = indicies.rangeLengths[i] & ~0xF;
  29.         double* address = indicies.rangeStartPositions[i] + input1;
  30.         for(; address + 15 < indicies.rangeStartPositions[i] + input1 + masked; address+=16)
  31.         {
  32.             __m128d result1 = _mm_add_pd(_mm_load_pd(address), _mm_load_pd(address+2));
  33.             __m128d result2 = _mm_add_pd(_mm_load_pd(address+4), _mm_load_pd(address+6));
  34.             __m128d result3 = _mm_add_pd(_mm_load_pd(address+8), _mm_load_pd(address+10));
  35.             __m128d result4 = _mm_add_pd(_mm_load_pd(address+12), _mm_load_pd(address+14));
  36.            
  37.             //now aggregate those
  38.             __m128d result5 = _mm_add_pd(result1, result2);
  39.             __m128d result6 = _mm_add_pd(result3, result4);
  40.            
  41.             __m128d result7 = _mm_add_pd(result5, result6);
  42.            
  43.             double output[8];
  44.             double* aligned = (double*)((size_t)(output + 4) & ~0xF);
  45.             _mm_store_pd(aligned, result7);
  46.             accumulate += aligned[0] + aligned[1];
  47.         }
  48.         switch(indicies.rangeLengths[i] % 16)
  49.         {
  50.             case 15: accumulate += address[14];
  51.             case 14: accumulate += address[13];
  52.             case 13: accumulate += address[12];
  53.             case 12: accumulate += address[11];
  54.             case 11: accumulate += address[10];
  55.             case 10: accumulate += address[9];
  56.             case 9:  accumulate += address[8];
  57.             case 8:  accumulate += address[7];
  58.             case 7:  accumulate += address[6];
  59.             case 6:  accumulate += address[5];
  60.             case 5:  accumulate += address[4];
  61.             case 4:  accumulate += address[3];
  62.             case 3:  accumulate += address[2];
  63.             case 2:  accumulate += address[1];
  64.             case 1:  accumulate += address[0];
  65.             case 0: break;
  66.            
  67.         }
  68.     }
  69.     return accumulate;
  70. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement