Pastebin launched a little side project called HostCabi.net, check it out ;-) Don't like ads? PRO users don't see any ads ;-)
Guest

SSE vector+=

By: Eubie on Feb 28th, 2013  |  syntax: C++  |  size: 2.46 KB  |  hits: 40  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  #include <ctype.h>

  #include <algorithm>
  #include <array>
  #include <bitset>
  #include <cmath>
  #include <cstdio>
  #include <cstdlib>
  #include <cstring>
  #include <ctime>
  #include <fstream>
  #include <iostream>
  #include <iterator>
  #include <list>
  #include <map>
  #include <sstream>
  #include <unordered_map>
  #include <vector>

  #include <emmintrin.h>
  19.  
  20. using namespace std;
  21.  
  22. #define FAST_SSE
  23.  
  /// Adds `what` element-wise into `toWhat`, in place, using SSE2:
  /// toWhat[i] += what[i] for every i in [0, len).
  ///
  /// @param what    Source addends. Must be 16-byte aligned (aligned SSE2 loads
  ///                are used) and hold at least `len` doubles.
  /// @param toWhat  Accumulator that receives the sums. Although declared
  ///                `const double *` for interface compatibility, the pointed-to
  ///                storage IS written, so it must refer to writable memory.
  ///                Must be 16-byte aligned and hold at least `len` doubles.
  /// @param dest    Unused; kept only so existing callers keep compiling.
  /// @param len     Number of doubles to process. Any value is accepted: full
  ///                groups of four go through the SSE2 path and the remaining
  ///                0-3 elements are added with scalar code, so nothing beyond
  ///                index len-1 is read or written.
  ///
  /// The FAST_SSE macro used to select between two loop spellings that produced
  /// identical results; a single loop is used now, so the macro has no effect here.
  inline void addToDoubleVectorSSE(const double * what, const double * toWhat, volatile double * dest, const unsigned int len)
  {
      (void)dest;  // never read or written by this routine

      // The accumulator is updated in place; make the const removal explicit
      // instead of hiding it inside a C-style vector-pointer cast.
      double * const accum = const_cast<double *>(toWhat);

      // Largest multiple of four that does not exceed len.
      const unsigned int blockEnd = len - (len % 4u);

      for (unsigned int i = 0; i < blockEnd; i += 4)
      {
          // Two 128-bit lanes (2 doubles each) per iteration; both sums are
          // computed before either store, as in the original loop.
          const __m128d sumLo = _mm_add_pd(_mm_load_pd(what + i),     _mm_load_pd(accum + i));
          const __m128d sumHi = _mm_add_pd(_mm_load_pd(what + i + 2), _mm_load_pd(accum + i + 2));

          _mm_store_pd(accum + i,     sumLo);
          _mm_store_pd(accum + i + 2, sumHi);
      }

      // Scalar tail for lengths that are not a multiple of four.
      for (unsigned int i = blockEnd; i < len; ++i)
          accum[i] += what[i];
  }
  55.  
  56. #define ARR_LEN   1000   // doubles allocated per row; also the len passed to addToDoubleVectorSSE
  57. #define ARR_COUNT 1000   // number of rows allocated for each of the tables a, b and c in main
  58. #define REP_COUNT 10000  // how many times main runs the add over every row while timing
  59.  
  60. int main(int argc, const char* argv[]) {
  61.         double ** a = ( double **) _mm_malloc( sizeof(double*) * ARR_COUNT, 16 );
  62.         double ** b = ( double **) _mm_malloc( sizeof(double*) * ARR_COUNT, 16 );
  63.         double ** c = ( double **) _mm_malloc( sizeof(double*) * ARR_COUNT, 16 );
  64.  
  65.         for (int i =0 ; i != ARR_COUNT ; i++)
  66.         {
  67.                 a[i] = ( double *) _mm_malloc( sizeof(double) * ARR_LEN, 16 );
  68.                 b[i] = ( double *) _mm_malloc( sizeof(double) * ARR_LEN, 16 );
  69.                 c[i] = ( double *) _mm_malloc( sizeof(double) * ARR_LEN, 16 );
  70.         }
  71.  
  72.         for (int i =0 ; i != 1000 ; i++)
  73.                 for (int j =0 ; j != 1000 ; j++)
  74.                 {
  75.                         a[i][j] = i*i;
  76.                         b[i][j] = sqrt((double)i);
  77.                         c[i][j] = 0.0;
  78.                 }
  79.    
  80.  
  81.     clock_t start = clock();
  82.     for (int i = 0 ; i != REP_COUNT ; i++)
  83.                 for ( int j = 0; j < ARR_COUNT; j++ )
  84.                         addToDoubleVectorSSE(a[j], b[j], c[j], ARR_LEN);
  85.  
  86.     cout << clock() - start << endl;
  87.         getchar();
  88.  
  89.         for (int i =0 ; i != ARR_COUNT; i++)
  90.         {
  91.                 _mm_free((void*) a[i]);
  92.                 _mm_free((void*) b[i]);
  93.                 _mm_free((void*) c[i]);
  94.         }
  95.         _mm_free((void*) a);
  96.         _mm_free((void*) b);
  97.         _mm_free((void*) c);
  98.  
  99.         return 0;
  100.     }