Advertisement
Guest User

SIMD Quadrant

a guest
Mar 27th, 2014
330
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 6.40 KB | None | 0 0
#include <windows.h>
#include <tchar.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <intrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <vector>
  7. //
  8. static const __m128i zero   = _mm_set_epi32( 0, 0, 0, 0 );
  9. static const __m128i xor    = _mm_set_epi32( 0, 0, -1, -1 );
  10. static const __m128i shfl   = _mm_set_epi8(
  11.     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0
  12. );
  13. static const int Exp        = 16;
  14. //
  15. inline void Quadrant1( const __m128i&pair, volatile int*q1, volatile int*q2 ){
  16.     __m128i gz      = _mm_cmpgt_epi32( pair, zero );
  17.     __m128i lz      = _mm_cmpgt_epi32( zero, pair );
  18.     __m128i p1      = _mm_unpacklo_epi32( gz, lz );
  19.     __m128i p2      = _mm_unpackhi_epi32( gz, lz );
  20.     *q1             = _mm_shuffle_epi8( p1, shfl ).m128i_i32[0];
  21.     *q2             = _mm_shuffle_epi8( p2, shfl ).m128i_i32[0];
  22. }
  23. inline void Quadrant1( int x1, int y1, int x2, int y2, const __m128i&c, volatile int*q1, volatile int*q2 ){
  24.     return Quadrant1( _mm_sub_epi32( _mm_set_epi32( y2, x2, y1, x1 ), c ), q1, q2 );
  25. }
  26. //
  27. inline void Quadrant2( const __m128i&pair, volatile int*q1, volatile int*q2 ){
  28.     __m128i gz      = _mm_cmpgt_epi32( pair, zero );
  29.     __m128i lz      = _mm_cmpgt_epi32( zero, pair );
  30.     __m128i p1      = _mm_unpacklo_epi32( gz, lz );
  31.     __m128i p2      = _mm_unpackhi_epi32( gz, lz );
  32.     //
  33.     if( lz.m128i_i32[1] ){
  34.         p1  = _mm_xor_si128( p1, xor );
  35.     }
  36.     if( lz.m128i_i32[3] ){
  37.         p2  = _mm_xor_si128( p2, xor );
  38.     }
  39.     *q1             = _mm_shuffle_epi8( p1, shfl ).m128i_i32[0];
  40.     *q2             = _mm_shuffle_epi8( p2, shfl ).m128i_i32[0];
  41. }
  42. //
  43. inline void Quadrant2( int x1, int y1, int x2, int y2, const __m128i&c, volatile int*q1, volatile int*q2 ){
  44.     return Quadrant2( _mm_sub_epi32( _mm_set_epi32( y2, x2, y1, x1 ), c ), q1, q2 );
  45. }
  46. //
  47. int _tmain( int argc, LPCTSTR argv[] ){
  48.     static const DWORD Measure  = 5000;
  49.     //
  50.     std::vector< __m128i >Points;
  51.     __m128i c;
  52.     ULONGLONG Start, T0, T1, T2;
  53.     DWORD_PTR Mask;
  54.     double ms;
  55.     SIZE_T ic, Count;
  56.     int cx, cy, p;
  57.     volatile int q1, q2;
  58.     //
  59.     if( ( argc < 2 ) || ( 1 != _stscanf_s( argv[1], TEXT("%Iu"), &Count ) ) ){
  60.         return 0;
  61.     }
  62.     _tprintf( TEXT("count=%Iu\r\n" ), Count );
  63.     Mask    = ::SetThreadAffinityMask( ::GetCurrentThread(), 1 );
  64.     _tprintf( TEXT("measure...\r\n" ) );
  65.     Start   = __rdtsc();
  66.     Sleep( Measure );
  67.     ms      = double( __rdtsc() - Start ) / Measure;
  68.     _tprintf( TEXT("Freq=%lf GHz\r\n" ), ms / 1000000 );
  69.     //
  70.     srand( (unsigned)time( nullptr ) );
  71.     Points.resize( Count / 2 );
  72.     _tprintf( TEXT("fill...\r\n" ) );
  73.     cx  = rand() - INT_MAX / 2;
  74.     cy  = rand() - INT_MAX / 2;
  75.     c   = _mm_set_epi32( cy, cx, cy, cx );
  76.     for( ic = Points.size() ; ic-- ; ){
  77.         Points[ic].m128i_i32[0] = rand() - INT_MAX / 2;
  78.         Points[ic].m128i_i32[1] = rand() - INT_MAX / 2;
  79.         Points[ic].m128i_i32[2] = rand() - INT_MAX / 2;
  80.         Points[ic].m128i_i32[3] = rand() - INT_MAX / 2;
  81.     }
  82.     _tprintf( TEXT("calc...\r\n" ) );
  83.     Start   = __rdtsc();
  84.     for( p = 0, ic = Points.size() / Exp ; ic-- ; p += Exp ){
  85.         Points[p + 0]   = _mm_sub_epi32( Points[p + 0], c );
  86.         Points[p + 1]   = _mm_sub_epi32( Points[p + 1], c );
  87.         Points[p + 2]   = _mm_sub_epi32( Points[p + 2], c );
  88.         Points[p + 3]   = _mm_sub_epi32( Points[p + 3], c );
  89.         Points[p + 4]   = _mm_sub_epi32( Points[p + 4], c );
  90.         Points[p + 5]   = _mm_sub_epi32( Points[p + 5], c );
  91.         Points[p + 6]   = _mm_sub_epi32( Points[p + 6], c );
  92.         Points[p + 7]   = _mm_sub_epi32( Points[p + 7], c );
  93.         Points[p + 8]   = _mm_sub_epi32( Points[p + 8], c );
  94.         Points[p + 9]   = _mm_sub_epi32( Points[p + 9], c );
  95.         Points[p + 10]  = _mm_sub_epi32( Points[p + 10], c );
  96.         Points[p + 11]  = _mm_sub_epi32( Points[p + 11], c );
  97.         Points[p + 12]  = _mm_sub_epi32( Points[p + 12], c );
  98.         Points[p + 13]  = _mm_sub_epi32( Points[p + 13], c );
  99.         Points[p + 14]  = _mm_sub_epi32( Points[p + 14], c );
  100.         Points[p + 15]  = _mm_sub_epi32( Points[p + 15], c );
  101.     }
  102.     T0      = __rdtsc() - Start;
  103.     Start   = __rdtsc();
  104.     for( p = 0, ic = Points.size() / Exp ; ic-- ; p += Exp ){
  105.         Quadrant1( Points[p + 0], &q1, &q2 );
  106.         Quadrant1( Points[p + 1], &q1, &q2 );
  107.         Quadrant1( Points[p + 2], &q1, &q2 );
  108.         Quadrant1( Points[p + 3], &q1, &q2 );
  109.         Quadrant1( Points[p + 4], &q1, &q2 );
  110.         Quadrant1( Points[p + 5], &q1, &q2 );
  111.         Quadrant1( Points[p + 6], &q1, &q2 );
  112.         Quadrant1( Points[p + 7], &q1, &q2 );
  113.         Quadrant1( Points[p + 8], &q1, &q2 );
  114.         Quadrant1( Points[p + 9], &q1, &q2 );
  115.         Quadrant1( Points[p + 10], &q1, &q2 );
  116.         Quadrant1( Points[p + 11], &q1, &q2 );
  117.         Quadrant1( Points[p + 12], &q1, &q2 );
  118.         Quadrant1( Points[p + 13], &q1, &q2 );
  119.         Quadrant1( Points[p + 14], &q1, &q2 );
  120.         Quadrant1( Points[p + 15], &q1, &q2 );
  121.     }
  122.     T1      = __rdtsc() - Start;
  123.     Start   = __rdtsc();
  124.     for( p = 0, ic = Points.size() / Exp ; ic-- ; p += Exp ){
  125.         Quadrant2( Points[p + 0], &q1, &q2 );
  126.         Quadrant2( Points[p + 1], &q1, &q2 );
  127.         Quadrant2( Points[p + 2], &q1, &q2 );
  128.         Quadrant2( Points[p + 3], &q1, &q2 );
  129.         Quadrant2( Points[p + 4], &q1, &q2 );
  130.         Quadrant2( Points[p + 5], &q1, &q2 );
  131.         Quadrant2( Points[p + 6], &q1, &q2 );
  132.         Quadrant2( Points[p + 7], &q1, &q2 );
  133.         Quadrant2( Points[p + 8], &q1, &q2 );
  134.         Quadrant2( Points[p + 9], &q1, &q2 );
  135.         Quadrant2( Points[p + 10], &q1, &q2 );
  136.         Quadrant2( Points[p + 11], &q1, &q2 );
  137.         Quadrant2( Points[p + 12], &q1, &q2 );
  138.         Quadrant2( Points[p + 13], &q1, &q2 );
  139.         Quadrant2( Points[p + 14], &q1, &q2 );
  140.         Quadrant2( Points[p + 15], &q1, &q2 );
  141.     }
  142.     T2      = __rdtsc() - Start;
  143.     _tprintf(
  144.         TEXT("SIMD time Recenter (Freq/1GHz): %lf/%lf ms.\r\n"), T0 / ms, T0 / 1000000.0
  145.     );
  146.     _tprintf(
  147.         TEXT("SIMD time Quadrant1 (Freq/1GHz): %lf/%lf ms.\r\n"), T1 / ms, T1 / 1000000.0
  148.     );
  149.     _tprintf(
  150.         TEXT("SIMD time Quadrant2 (Freq/1GHz): %lf/%lf ms.\r\n"), T2 / ms, T2 / 1000000.0
  151.     );
  152.     //
  153.     _tprintf(
  154.         TEXT("SIMD avg. time Recenter (Freq/1GHz): %lf/%lf ms.\r\n"),
  155.         ( T0 / ms ) / Count, ( T0 / 1000000.0 ) / Count
  156.     );
  157.     _tprintf(
  158.         TEXT("SIMD avg. time Quadrant1 (Freq/1GHz): %lf/%lf ms.\r\n"),
  159.         ( T1 / ms ) / Count, ( T1 / 1000000.0 ) / Count
  160.     );
  161.     _tprintf(
  162.         TEXT("SIMD avg. time Quadrant2 (Freq/1GHz): %lf/%lf ms.\r\n"),
  163.         ( T2 / ms ) / Count, ( T2 / 1000000.0 ) / Count
  164.     );
  165.     _tprintf(
  166.         TEXT("SIMD speed Recenter (Freq/1GHz): %lf/%lf p./s.\r\n"),
  167.         Count / ( T0 / ms ) * 1000, Count / ( T0 / 1000000.0 ) * 1000
  168.     );
  169.     _tprintf(
  170.         TEXT("SIMD speed Quadrant1 (Freq/1GHz): %lf/%lf q./s.\r\n"),
  171.         Count / ( T1 / ms ) * 1000, Count / ( T1 / 1000000.0 ) * 1000
  172.     );
  173.     _tprintf(
  174.         TEXT("SIMD speed Quadrant2 (Freq/1GHz): %lf/%lf q./s.\r\n"),
  175.         Count / ( T2 / ms ) * 1000, Count / ( T2 / 1000000.0 ) * 1000
  176.     );
  177.     ::SetThreadAffinityMask( ::GetCurrentThread(), Mask );
  178.     return 0;
  179. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement