daily pastebin goal
94%
SHARE
TWEET

Untitled

a guest Jan 18th, 2019 68 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  #include <xmmintrin.h>

  #include <cstdio>
  #include <cstdlib>
  #include <ctime>
  #include <iostream>
  using namespace std;
  6.  
  // Number of floats in each input array. The SSE kernels in this file consume
  // 8 or 16 floats per iteration and have no scalar tail, so the count must
  // stay a multiple of 16.
  constexpr int N = 1 << 20;
  static_assert(N % 16 == 0, "N must be a multiple of 16 for the unrolled SSE loops");

  // One result slot per summation variant, in the order the variants are
  // defined: slow, faster, FASTER, and the three SSE kernels.
  float sum1 = 0;
  float sum2 = 0;
  float sum3 = 0;
  float sum4 = 0;
  float sum5 = 0;
  float sum6 = 0;

  // Input data. The 16-byte alignment is what allows the aligned _mm_load_ps
  // loads used by the SSE kernels.
  alignas(16) float a[N];
  alignas(16) float b[N];
  11. void init() {
  12.     for (int i = 0; i < N; i++) {
  13.         a[i] = rand() % 4;
  14.         b[i] = rand() % 4;
  15.     }
  16. }
  17.  
  18. void slow() {
  19.     for (int i = 0; i < N; i++) {
  20.         sum1 += a[i];
  21.     }
  22.     for (int i = 0; i < N; i++) {
  23.         sum1 += b[i];
  24.     }
  25. }
  26.  
  27. void faster() {
  28.     for (int i = 0; i < N; i++) {
  29.         sum2 += a[i] + b[i];
  30.     }
  31. }
  32.  
  33. void FASTER() {
  34.     int i, to = N - 1;
  35.     float s0 = 0, s1 = 0;
  36.     for (i = 0; i < to; i += 2) {
  37.         s0 = a[i] + a[i + 1] + s0;
  38.         s1 = b[i] + b[i + 1] + s1;
  39.     }
  40.     while (i < N) sum3 += a[i] + b[i], i++;
  41.     sum3 = s0 + s1;
  42. }
  43.  
  44. void BIGFUCKINGOPTIMIZATION() {
  45.     __m128 sse_s0 = _mm_setr_ps(0, 0, 0, 0);
  46.     __m128 sse_s1 = _mm_setr_ps(0, 0, 0, 0);
  47.     float s[4];
  48.     for (int i = 0; i < N; i += 8) {
  49.         __m128 sse_a0 = _mm_load_ps(a + i);
  50.         __m128 sse_a1 = _mm_load_ps(a + i + 4);
  51.         __m128 sse_b0 = _mm_load_ps(b + i);
  52.         __m128 sse_b1 = _mm_load_ps(b + i + 4);
  53.          sse_s0 = _mm_add_ps(sse_s0, _mm_add_ps(sse_a0, sse_a1));
  54.          sse_s1 = _mm_add_ps(sse_s1, _mm_add_ps(sse_b0, sse_b1));
  55.     }
  56.     __m128 sse_s = _mm_add_ps(sse_s0, sse_s1);
  57.     _mm_store_ps(s, sse_s);
  58.     sum4 = s[0] + s[1] + s[2] + s[3];
  59. }
  60.  
  61. void KILLME() {
  62.     __m128 sse_s0 = _mm_setr_ps(0, 0, 0, 0);
  63.     __m128 sse_s1 = _mm_setr_ps(0, 0, 0, 0);
  64.     float s[4];
  65.     for (int i = 0; i < N; i += 16) {
  66.         __m128 sse_a0 = _mm_load_ps(a + i);
  67.         __m128 sse_a1 = _mm_load_ps(a + i + 4);
  68.         __m128 sse_a2 = _mm_load_ps(a + i + 8);
  69.         __m128 sse_a3 = _mm_load_ps(a + i + 12);
  70.         __m128 sse_b0 = _mm_load_ps(b + i);
  71.         __m128 sse_b1 = _mm_load_ps(b + i + 4);
  72.         __m128 sse_b2 = _mm_load_ps(b + i + 8);
  73.         __m128 sse_b3 = _mm_load_ps(b + i + 12);
  74.          sse_s0 = _mm_add_ps(sse_s0, _mm_add_ps(_mm_add_ps(sse_a0, sse_a1), _mm_add_ps(sse_a2, sse_a3)));
  75.          sse_s1 = _mm_add_ps(sse_s1, _mm_add_ps(_mm_add_ps(sse_b0, sse_b1), _mm_add_ps(sse_b2, sse_b3)));
  76.     }
  77.     __m128 sse_s = _mm_add_ps(sse_s0, sse_s1);
  78.     _mm_store_ps(s, sse_s);
  79.     sum5 = s[0] + s[1] + s[2] + s[3];
  80. }
  81.  
  82. void NIGHTMARE() {
  83.     __m128 sse_s0 = _mm_setr_ps(0, 0, 0, 0);
  84.     __m128 sse_s1 = _mm_setr_ps(0, 0, 0, 0);
  85.     __m128 sse_s2 = _mm_setr_ps(0, 0, 0, 0);
  86.     __m128 sse_s3 = _mm_setr_ps(0, 0, 0, 0);
  87.     float s[4];
  88.     for (int i = 0; i < N; i += 16) {
  89.         __m128 sse_a0 = _mm_load_ps(a + i);
  90.         __m128 sse_a1 = _mm_load_ps(a + i + 4);
  91.         __m128 sse_a2 = _mm_load_ps(a + i + 8);
  92.         __m128 sse_a3 = _mm_load_ps(a + i + 12);
  93.         __m128 sse_b0 = _mm_load_ps(b + i);
  94.         __m128 sse_b1 = _mm_load_ps(b + i + 4);
  95.         __m128 sse_b2 = _mm_load_ps(b + i + 8);
  96.         __m128 sse_b3 = _mm_load_ps(b + i + 12);
  97.          sse_s0 = _mm_add_ps(sse_s0, _mm_add_ps(sse_a0, sse_a1));
  98.          sse_s1 = _mm_add_ps(sse_s1, _mm_add_ps(sse_a2, sse_a3));
  99.          sse_s2 = _mm_add_ps(sse_s2, _mm_add_ps(sse_b0, sse_b1));
  100.          sse_s3 = _mm_add_ps(sse_s3, _mm_add_ps(sse_b2, sse_b3));
  101.     }
  102.     __m128 sse_s = _mm_add_ps(_mm_add_ps(sse_s0, sse_s1), _mm_add_ps(sse_s2, sse_s3));
  103.     _mm_store_ps(s, sse_s);
  104.     sum6 = s[0] + s[1] + s[2] + s[3];
  105. }
  106.  
  107. int main() {
  108.     srand(time(0));
  109.     init();
  110.     auto sp = clock();
  111.     slow();
  112.     auto fp = clock();
  113.     printf("----Slow sum----\nTime: %.6lf\nResult: %d\n", ((float)fp - sp) / CLOCKS_PER_SEC, (int)sum1);
  114.    
  115.     sp = clock();
  116.     faster();
  117.     fp = clock();
  118.     printf("----Fast sum----\nTime: %.6lf\nResult: %d\n", ((float)fp - sp) / CLOCKS_PER_SEC, (int)sum2);
  119.    
  120.     sp = clock();
  121.     FASTER();
  122.     fp = clock();
  123.     printf("----FASTER sum----\nTime: %.6lf\nResult: %d\n", ((float)fp - sp) / CLOCKS_PER_SEC, (int)sum3);
  124.    
  125.     sp = clock();
  126.     BIGFUCKINGOPTIMIZATION();
  127.     fp = clock();
  128.     printf("----BIGFUCKINGOPTIMIZATION sum----\nTime: %.6lf\nResult: %d\n", ((float)fp - sp) / CLOCKS_PER_SEC, (int)sum4);
  129.    
  130.     sp = clock();
  131.     KILLME();
  132.     fp = clock();
  133.     printf("----KILLME sum----\nTime: %.6lf\nResult: %d\n", ((float)fp - sp) / CLOCKS_PER_SEC, (int)sum5);
  134.    
  135.     sp = clock();
  136.     NIGHTMARE();
  137.     fp = clock();
  138.     printf("----NIGHTMARE sum----\nTime: %.6lf\nResult: %d\n", ((float)fp - sp) / CLOCKS_PER_SEC, (int)sum6);
  139.     return 0;
  140. }
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top