SHOW:
|
|
- or go back to the newest paste.
1 | #include <emmintrin.h> | |
2 | #include <stdlib.h> | |
3 | #include <stdio.h> | |
4 | #include <time.h> | |
5 | ||
/*
 * Bucket boundaries used by func, funcX and func4.
 * Values with magnitude up to NEAR fall in bucket 0; T1, T2 and T3 are the
 * boundaries of the progressively larger outer buckets (see func for the
 * exact inclusive/exclusive edges).
 */
#define NEAR 10
#define T1 100
#define T2 1000
#define T3 10000
10 | ||
11 | int func(int Di) { | |
12 | if (Di <= -T3) return -4; | |
13 | if (Di <= -T2) return -3; | |
14 | if (Di <= -T1) return -2; | |
15 | if (Di < -NEAR) return -1; | |
16 | if (Di <= NEAR) return 0; | |
17 | if (Di < T1) return 1; | |
18 | if (Di < T2) return 2; | |
19 | if (Di < T3) return 3; | |
20 | return 4; | |
21 | } | |
22 | ||
23 | int funcX(int Di) { | |
24 | int cmp_m3 = -(Di > -T3); | |
25 | int cmp_m2 = -(Di > -T2); | |
26 | int cmp_m1 = -(Di > -T1); | |
27 | int cmp_p0 = -(Di > NEAR); | |
28 | int reduce_true = cmp_m3 + cmp_m2 + cmp_m1 + cmp_p0; | |
29 | int cmp_m0 = -(Di < -NEAR); | |
30 | int cmp_p1 = -(Di < T1); | |
31 | int cmp_p2 = -(Di < T2); | |
32 | int cmp_p3 = -(Di < T3); | |
33 | int reduce_false = cmp_p3 + cmp_p2 + cmp_p1 + cmp_m0; | |
34 | return reduce_false - reduce_true; | |
35 | } | |
36 | ||
37 | __m128i func4(__m128i D) { | |
38 | __m128i cmp_m3 = _mm_cmpgt_epi32(D, _mm_set1_epi32(-T3)); | |
39 | __m128i cmp_m2 = _mm_cmpgt_epi32(D, _mm_set1_epi32(-T2)); | |
40 | __m128i cmp_m1 = _mm_cmpgt_epi32(D, _mm_set1_epi32(-T1)); | |
41 | __m128i cmp_p0 = _mm_cmpgt_epi32(D, _mm_set1_epi32(NEAR)); | |
42 | __m128i reduce_true = _mm_add_epi32(_mm_add_epi32(cmp_m3, cmp_m2), _mm_add_epi32(cmp_m1, cmp_p0)); | |
43 | __m128i cmp_m0 = _mm_cmplt_epi32(D, _mm_set1_epi32(-NEAR)); | |
44 | __m128i cmp_p1 = _mm_cmplt_epi32(D, _mm_set1_epi32(T1)); | |
45 | __m128i cmp_p2 = _mm_cmplt_epi32(D, _mm_set1_epi32(T2)); | |
46 | __m128i cmp_p3 = _mm_cmplt_epi32(D, _mm_set1_epi32(T3)); | |
47 | __m128i reduce_false = _mm_add_epi32(_mm_add_epi32(cmp_p3, cmp_p2), _mm_add_epi32(cmp_p1, cmp_m0)); | |
48 | return _mm_sub_epi32(reduce_false, reduce_true); | |
49 | } | |
50 | ||
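// NOTE(review): paste-diff residue. An earlier revision of this paste had a
// stray `data_i[i] = rand() - 16768;` statement on this line; the current
// revision leaves the line blank. COUNT, which the declarations and loops
// below rely on, is not defined anywhere in the visible text.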
52 | ||
53 | static union { | |
54 | int data_i[4 * COUNT]; | |
55 | __m128i data_r[COUNT]; | |
56 | }; | |
57 | ||
/*
 * Result scratch space with two views of the same storage:
 *   res_i - four individual ints (scalar access),
 *   res_r - one SSE register holding the same four ints (vector access).
 *
 * NOTE(review): writing one member and reading the other relies on the
 * compiler supporting union-based type punning.
 */
static union {
    int res_i[4];
    __m128i res_r;
};
62 | ||
63 | int main() { | |
64 | for (int i = 0; i < 4*COUNT; i++) | |
65 | data_i[i] = rand() % 32768 - 16768; | |
66 | ||
67 | for (int i = 0; i < COUNT; i++) { | |
68 | int corr[4]; | |
69 | for (int j = 0; j < 4; j++) | |
70 | corr[j] = func(data_i[4*i+j]); | |
71 | ||
72 | res_r = func4(data_r[i]); | |
73 | for (int j = 0; j < 4; j++) | |
74 | if (corr[j] != res_i[j]) | |
75 | printf("Wrong!"); | |
76 | ||
77 | for (int j = 0; j < 4; j++) | |
78 | res_i[j] = funcX(data_i[4*i+j]); | |
79 | for (int j = 0; j < 4; j++) | |
80 | if (corr[j] != res_i[j]) | |
81 | printf("Wrong!"); | |
82 | } | |
83 | ||
84 | { | |
85 | int start = clock(); | |
86 | int sum = 0; | |
87 | for (int q = 0; q < 10000; q++) { | |
88 | for (int i = 0; i < 4*COUNT; i++) | |
89 | sum += func(data_i[i]); | |
90 | } | |
91 | printf("Time = %0.3lf (%d)\n", double(clock() - start) / CLOCKS_PER_SEC, sum); | |
92 | } | |
93 | ||
94 | { | |
95 | int start = clock(); | |
96 | __m128i sum = _mm_setzero_si128(); | |
97 | for (int q = 0; q < 10000; q++) { | |
98 | for (int i = 0; i < COUNT; i++) | |
99 | sum = _mm_add_epi32(sum, func4(data_r[i])); | |
100 | } | |
101 | res_r = sum; | |
102 | int rsum = res_i[0] + res_i[1] + res_i[2] + res_i[3]; | |
103 | printf("Time = %0.3lf (%d)\n", double(clock() - start) / CLOCKS_PER_SEC, rsum); | |
104 | } | |
105 | ||
106 | return 0; | |
107 | } |