Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <iostream>
- #include <ctime>
- #include <cstdio>
- #include <xmmintrin.h>
- using namespace std;
// Number of elements in each input array (2^20). A typed constant instead of
// a macro so it obeys scoping rules; it is still usable as an array bound and
// in static_assert.
constexpr int N = 1 << 20;

// One result slot per summation variant:
//   sum1 - slow(), sum2 - faster(), sum3 - FASTER(),
//   sum4 - BIGFUCKINGOPTIMIZATION(), sum5 - KILLME(), sum6 - NIGHTMARE().
float sum1 = 0, sum2 = 0, sum3 = 0, sum4 = 0, sum5 = 0, sum6 = 0;

// Input data. The 16-byte alignment is what allows the SSE variants to read
// these arrays with the aligned load _mm_load_ps.
alignas(16) float a[N], b[N];
- void init() {
- for (int i = 0; i < N; i++) {
- a[i] = rand() % 4;
- b[i] = rand() % 4;
- }
- }
- void slow() {
- for (int i = 0; i < N; i++) {
- sum1 += a[i];
- }
- for (int i = 0; i < N; i++) {
- sum1 += b[i];
- }
- }
- void faster() {
- for (int i = 0; i < N; i++) {
- sum2 += a[i] + b[i];
- }
- }
- void FASTER() {
- int i, to = N - 1;
- float s0 = 0, s1 = 0;
- for (i = 0; i < to; i += 2) {
- s0 = a[i] + a[i + 1] + s0;
- s1 = b[i] + b[i + 1] + s1;
- }
- while (i < N) sum3 += a[i] + b[i], i++;
- sum3 = s0 + s1;
- }
- void BIGFUCKINGOPTIMIZATION() {
- __m128 sse_s0 = _mm_setr_ps(0, 0, 0, 0);
- __m128 sse_s1 = _mm_setr_ps(0, 0, 0, 0);
- float s[4];
- for (int i = 0; i < N; i += 8) {
- __m128 sse_a0 = _mm_load_ps(a + i);
- __m128 sse_a1 = _mm_load_ps(a + i + 4);
- __m128 sse_b0 = _mm_load_ps(b + i);
- __m128 sse_b1 = _mm_load_ps(b + i + 4);
- sse_s0 = _mm_add_ps(sse_s0, _mm_add_ps(sse_a0, sse_a1));
- sse_s1 = _mm_add_ps(sse_s1, _mm_add_ps(sse_b0, sse_b1));
- }
- __m128 sse_s = _mm_add_ps(sse_s0, sse_s1);
- _mm_store_ps(s, sse_s);
- sum4 = s[0] + s[1] + s[2] + s[3];
- }
- void KILLME() {
- __m128 sse_s0 = _mm_setr_ps(0, 0, 0, 0);
- __m128 sse_s1 = _mm_setr_ps(0, 0, 0, 0);
- float s[4];
- for (int i = 0; i < N; i += 16) {
- __m128 sse_a0 = _mm_load_ps(a + i);
- __m128 sse_a1 = _mm_load_ps(a + i + 4);
- __m128 sse_a2 = _mm_load_ps(a + i + 8);
- __m128 sse_a3 = _mm_load_ps(a + i + 12);
- __m128 sse_b0 = _mm_load_ps(b + i);
- __m128 sse_b1 = _mm_load_ps(b + i + 4);
- __m128 sse_b2 = _mm_load_ps(b + i + 8);
- __m128 sse_b3 = _mm_load_ps(b + i + 12);
- sse_s0 = _mm_add_ps(sse_s0, _mm_add_ps(_mm_add_ps(sse_a0, sse_a1), _mm_add_ps(sse_a2, sse_a3)));
- sse_s1 = _mm_add_ps(sse_s1, _mm_add_ps(_mm_add_ps(sse_b0, sse_b1), _mm_add_ps(sse_b2, sse_b3)));
- }
- __m128 sse_s = _mm_add_ps(sse_s0, sse_s1);
- _mm_store_ps(s, sse_s);
- sum5 = s[0] + s[1] + s[2] + s[3];
- }
- void NIGHTMARE() {
- __m128 sse_s0 = _mm_setr_ps(0, 0, 0, 0);
- __m128 sse_s1 = _mm_setr_ps(0, 0, 0, 0);
- __m128 sse_s2 = _mm_setr_ps(0, 0, 0, 0);
- __m128 sse_s3 = _mm_setr_ps(0, 0, 0, 0);
- float s[4];
- for (int i = 0; i < N; i += 16) {
- __m128 sse_a0 = _mm_load_ps(a + i);
- __m128 sse_a1 = _mm_load_ps(a + i + 4);
- __m128 sse_a2 = _mm_load_ps(a + i + 8);
- __m128 sse_a3 = _mm_load_ps(a + i + 12);
- __m128 sse_b0 = _mm_load_ps(b + i);
- __m128 sse_b1 = _mm_load_ps(b + i + 4);
- __m128 sse_b2 = _mm_load_ps(b + i + 8);
- __m128 sse_b3 = _mm_load_ps(b + i + 12);
- sse_s0 = _mm_add_ps(sse_s0, _mm_add_ps(sse_a0, sse_a1));
- sse_s1 = _mm_add_ps(sse_s1, _mm_add_ps(sse_a2, sse_a3));
- sse_s2 = _mm_add_ps(sse_s2, _mm_add_ps(sse_b0, sse_b1));
- sse_s3 = _mm_add_ps(sse_s3, _mm_add_ps(sse_b2, sse_b3));
- }
- __m128 sse_s = _mm_add_ps(_mm_add_ps(sse_s0, sse_s1), _mm_add_ps(sse_s2, sse_s3));
- _mm_store_ps(s, sse_s);
- sum6 = s[0] + s[1] + s[2] + s[3];
- }
- int main() {
- srand(time(0));
- init();
- auto sp = clock();
- slow();
- auto fp = clock();
- printf("----Slow sum----\nTime: %.6lf\nResult: %d\n", ((float)fp - sp) / CLOCKS_PER_SEC, (int)sum1);
- sp = clock();
- faster();
- fp = clock();
- printf("----Fast sum----\nTime: %.6lf\nResult: %d\n", ((float)fp - sp) / CLOCKS_PER_SEC, (int)sum2);
- sp = clock();
- FASTER();
- fp = clock();
- printf("----FASTER sum----\nTime: %.6lf\nResult: %d\n", ((float)fp - sp) / CLOCKS_PER_SEC, (int)sum3);
- sp = clock();
- BIGFUCKINGOPTIMIZATION();
- fp = clock();
- printf("----BIGFUCKINGOPTIMIZATION sum----\nTime: %.6lf\nResult: %d\n", ((float)fp - sp) / CLOCKS_PER_SEC, (int)sum4);
- sp = clock();
- KILLME();
- fp = clock();
- printf("----KILLME sum----\nTime: %.6lf\nResult: %d\n", ((float)fp - sp) / CLOCKS_PER_SEC, (int)sum5);
- sp = clock();
- NIGHTMARE();
- fp = clock();
- printf("----NIGHTMARE sum----\nTime: %.6lf\nResult: %d\n", ((float)fp - sp) / CLOCKS_PER_SEC, (int)sum6);
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement