// NOTE(review): the header names on this file's #include lines were missing.
// The list below is the set of headers the code in this file requires; restore
// any further project-specific includes if other code depends on them.
#include <emmintrin.h>  // SSE2: __m128d, _mm_add_pd, _mm_load_pd, _mm_store_pd
#include <xmmintrin.h>  // _mm_malloc / _mm_free on the common toolchains
#include <cmath>        // sqrt
#include <cstdio>       // getchar
#include <ctime>        // clock, clock_t
#include <iostream>     // cout, cerr

using namespace std;

// Selects the indexed SIMD loop in addToDoubleVectorSSE. Remove this define to
// build the pointer-bumping variant instead (useful for comparing codegen).
#define FAST_SSE

/**
 * Element-wise sum of two arrays of doubles using SSE2:
 *     dest[i] = what[i] + toWhat[i]   for every i in [0, len)
 *
 * @param what    first addend, `len` doubles, 16-byte aligned
 * @param toWhat  second addend, `len` doubles, 16-byte aligned; never written
 * @param dest    receives the sums, `len` doubles, 16-byte aligned. May be the
 *                same buffer as `what` or `toWhat`: each group of four values
 *                is fully loaded before anything is stored.
 * @param len     number of elements; any value is accepted, the remainder that
 *                does not fill a group of four is handled by a scalar loop.
 *
 * The aligned load/store intrinsics fault on addresses that are not a multiple
 * of 16, so all three pointers must come from an aligned allocation such as
 * _mm_malloc(..., 16).
 */
inline void addToDoubleVectorSSE(const double * what, const double * toWhat, volatile double * dest, const unsigned int len)
{
    // `volatile` is part of the public signature, but the SSE store intrinsic
    // takes a plain double*; the buffers passed in are ordinary heap memory.
    double * const out = const_cast<double *>( dest );

    // Largest multiple of 4 not exceeding len: the part that can be processed
    // two __m128d registers (four doubles) at a time.
    const unsigned int simdLen = len & ~3u;
    unsigned int i = 0;

#ifdef FAST_SSE
    // Indexed form: every address is derived from the single counter i.
    for ( ; i < simdLen; i += 4 )
    {
        const __m128d lo = _mm_add_pd( _mm_load_pd( what + i ),     _mm_load_pd( toWhat + i ) );
        const __m128d hi = _mm_add_pd( _mm_load_pd( what + i + 2 ), _mm_load_pd( toWhat + i + 2 ) );
        _mm_store_pd( out + i,     lo );
        _mm_store_pd( out + i + 2, hi );
    }
#else
    // Pointer-bumping form: three cursors advance two doubles per operation.
    const double * lhs = what;
    const double * rhs = toWhat;
    double * dst = out;
    for ( ; i < simdLen; i += 4 )
    {
        const __m128d lo = _mm_add_pd( _mm_load_pd( lhs ), _mm_load_pd( rhs ) );
        lhs += 2;
        rhs += 2;
        const __m128d hi = _mm_add_pd( _mm_load_pd( lhs ), _mm_load_pd( rhs ) );
        lhs += 2;
        rhs += 2;
        _mm_store_pd( dst, lo );
        dst += 2;
        _mm_store_pd( dst, hi );
        dst += 2;
    }
#endif

    // Scalar tail for the 0-3 elements that do not fill a group of four.
    for ( ; i < len; ++i )
        out[i] = what[i] + toWhat[i];
}

#define ARR_LEN 1000
#define ARR_COUNT 1000
#define REP_COUNT 10000

/** Allocates the table of ARR_COUNT row pointers, all initialised to null; returns null on failure. */
static double ** allocRowTable()
{
    double ** rows = static_cast<double **>( _mm_malloc( sizeof(double *) * ARR_COUNT, 16 ) );
    if ( rows )
    {
        // Null entries let freeRowTable() clean up a partially filled table.
        for ( int i = 0; i != ARR_COUNT; i++ )
            rows[i] = 0;
    }
    return rows;
}

/** Allocates one 16-byte aligned row of ARR_LEN doubles; returns null on failure. */
static double * allocRow()
{
    return static_cast<double *>( _mm_malloc( sizeof(double) * ARR_LEN, 16 ) );
}

/** Frees every non-null row of a table and then the table itself; a null table is a no-op. */
static void freeRowTable(double ** rows)
{
    if ( !rows )
        return;
    for ( int i = 0; i != ARR_COUNT; i++ )
    {
        if ( rows[i] )
            _mm_free( rows[i] );
    }
    _mm_free( rows );
}

/**
 * Benchmark driver: builds three ARR_COUNT x ARR_LEN tables, runs
 * addToDoubleVectorSSE over every row REP_COUNT times, prints the elapsed
 * clock() ticks and waits for a key press before exiting.
 *
 * @return 0 on success, 1 if any allocation failed.
 */
int main(int argc, const char* argv[])
{
    (void)argc;
    (void)argv;

    double ** a = allocRowTable();
    double ** b = allocRowTable();
    double ** c = allocRowTable();

    // Rows are allocated interleaved (a[i], b[i], c[i]) so the three operands
    // of each call end up next to each other in allocation order.
    bool ok = a && b && c;
    for ( int i = 0; ok && i != ARR_COUNT; i++ )
    {
        a[i] = allocRow();
        b[i] = allocRow();
        c[i] = allocRow();
        ok = a[i] && b[i] && c[i];
    }
    if ( !ok )
    {
        cerr << "addToDoubleVectorSSE benchmark: aligned allocation failed" << endl;
        freeRowTable( a );
        freeRowTable( b );
        freeRowTable( c );
        return 1;
    }

    for ( int i = 0; i != ARR_COUNT; i++ )
    {
        // Both values depend only on the row, so compute them once per row.
        // The product is formed in double so it cannot overflow int if
        // ARR_COUNT is raised.
        const double squared = static_cast<double>( i ) * i;
        const double root = sqrt( static_cast<double>( i ) );
        for ( int j = 0; j != ARR_LEN; j++ )
        {
            a[i][j] = squared;
            b[i][j] = root;
            c[i][j] = 0.0;
        }
    }

    const clock_t start = clock();
    for ( int rep = 0; rep != REP_COUNT; rep++ )
    {
        for ( int j = 0; j < ARR_COUNT; j++ )
            addToDoubleVectorSSE( a[j], b[j], c[j], ARR_LEN );
    }
    // Elapsed time in raw clock() ticks; endl flushes before the blocking read.
    cout << clock() - start << endl;

    // Keep the console window open until the user presses a key.
    getchar();

    freeRowTable( a );
    freeRowTable( b );
    freeRowTable( c );
    return 0;
}