#include <ctype.h>
#include <emmintrin.h>

#include <algorithm>
#include <array>
#include <bitset>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <iterator>
#include <list>
#include <map>
#include <sstream>
#include <unordered_map>
#include <vector>
using namespace std;
#define FAST_SSE
/**
 * Element-wise addition of two double arrays using SSE2:
 * dest[i] = what[i] + toWhat[i] for every i in [0, len).
 *
 * @param what    first addend, `len` doubles; only read
 * @param toWhat  second addend, `len` doubles; only read
 * @param dest    output, `len` doubles; may alias `what` or `toWhat`
 *                exactly (same start address) for in-place use
 * @param len     element count; any value is accepted, including 0 and
 *                counts that are not a multiple of the vector width
 *
 * None of the pointers has to be 16-byte aligned: inputs are fetched with
 * unaligned loads and the output is written one double at a time.
 */
inline void addToDoubleVectorSSE(const double * what, const double * toWhat, volatile double * dest, const unsigned int len)
{
    // Largest even count <= len: the part that can be handled two doubles
    // (one __m128d) at a time.
    const unsigned int pairedLen = len & ~1u;

    unsigned int i = 0;
    for ( ; i < pairedLen; i += 2 )
    {
        const __m128d sum = _mm_add_pd( _mm_loadu_pd( what + i ), _mm_loadu_pd( toWhat + i ) );

        // `dest` is volatile-qualified, so each lane is stored through the
        // volatile lvalue instead of casting the qualifier away for a
        // packed store.
        dest[i]     = _mm_cvtsd_f64( sum );                           // low lane
        dest[i + 1] = _mm_cvtsd_f64( _mm_unpackhi_pd( sum, sum ) );   // high lane
    }

    // Odd trailing element, if any.
    if ( i < len )
    {
        dest[i] = what[i] + toWhat[i];
    }
}
#define ARR_LEN 1000
#define ARR_COUNT 1000
#define REP_COUNT 10000
/**
 * Benchmark driver for addToDoubleVectorSSE.
 *
 * Allocates three tables of ARR_COUNT rows, each row holding ARR_LEN doubles
 * in 16-byte-aligned storage, fills them, runs the kernel over every row
 * REP_COUNT times, prints the elapsed clock() ticks, waits for a key press
 * and releases all memory.
 *
 * @return EXIT_SUCCESS on a completed run, EXIT_FAILURE if an allocation failed.
 */
int main(int argc, const char* argv[]) {
    (void)argc;
    (void)argv;

    // Row-pointer tables.
    double** a = static_cast<double**>(_mm_malloc(sizeof(double*) * ARR_COUNT, 16));
    double** b = static_cast<double**>(_mm_malloc(sizeof(double*) * ARR_COUNT, 16));
    double** c = static_cast<double**>(_mm_malloc(sizeof(double*) * ARR_COUNT, 16));
    if (a == nullptr || b == nullptr || c == nullptr) {
        std::cerr << "failed to allocate row tables" << std::endl;
        if (a != nullptr) _mm_free(a);
        if (b != nullptr) _mm_free(b);
        if (c != nullptr) _mm_free(c);
        return EXIT_FAILURE;
    }

    // Null every slot first so the cleanup loop below is safe even when a
    // row allocation fails part-way through.
    for (int i = 0; i != ARR_COUNT; i++) {
        a[i] = nullptr;
        b[i] = nullptr;
        c[i] = nullptr;
    }

    bool allocated = true;
    for (int i = 0; allocated && i != ARR_COUNT; i++) {
        a[i] = static_cast<double*>(_mm_malloc(sizeof(double) * ARR_LEN, 16));
        b[i] = static_cast<double*>(_mm_malloc(sizeof(double) * ARR_LEN, 16));
        c[i] = static_cast<double*>(_mm_malloc(sizeof(double) * ARR_LEN, 16));
        allocated = (a[i] != nullptr) && (b[i] != nullptr) && (c[i] != nullptr);
    }

    if (allocated) {
        // Loop bounds come from the same macros that sized the allocations,
        // so changing ARR_COUNT / ARR_LEN cannot overrun a buffer.
        for (int i = 0; i != ARR_COUNT; i++) {
            const double row = static_cast<double>(i);
            for (int j = 0; j != ARR_LEN; j++) {
                a[i][j] = row * row;   // squared in double to avoid int overflow
                b[i][j] = std::sqrt(row);
                c[i][j] = 0.0;
            }
        }

        const clock_t start = clock();
        for (int i = 0; i != REP_COUNT; i++)
            for (int j = 0; j < ARR_COUNT; j++)
                addToDoubleVectorSSE(a[j], b[j], c[j], ARR_LEN);
        // Elapsed time in raw clock() ticks; endl flushes before blocking below.
        std::cout << clock() - start << std::endl;

        // Keep the console open until a key is pressed.
        getchar();
    } else {
        std::cerr << "failed to allocate data rows" << std::endl;
    }

    for (int i = 0; i != ARR_COUNT; i++) {
        if (a[i] != nullptr) _mm_free(a[i]);
        if (b[i] != nullptr) _mm_free(b[i]);
        if (c[i] != nullptr) _mm_free(c[i]);
    }
    _mm_free(a);
    _mm_free(b);
    _mm_free(c);

    return allocated ? EXIT_SUCCESS : EXIT_FAILURE;
}