Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # branch-predictor-stressing test code:
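- # loop of 400M iterations with two branches each (the loop back-edge and the data-dependent `if`), 800M branches altogether; the `if` tests a random bit and so mispredicts about half the time, giving ~200M mispredicts (1/4 mispredict rate); perf-measured on both amd64 and arm64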
- #include <stdio.h>
- #include <stdint.h>
- #include <stdlib.h>
/*
 * Branch-predictor stress test.
 *
 * Reads a 1 MiB blob from the file "rand" in the current directory, then runs
 * 400M loop iterations. Each iteration tests one bit of the blob (bits are
 * walked LSB-first within each byte, wrapping around the buffer) and, when the
 * bit is set, executes a block of 14 nops.
 *
 * The command-line arguments are unused.
 *
 * Returns 0 on success, or EXIT_FAILURE (with a message on stderr) when the
 * buffer cannot be allocated or the blob cannot be opened or read in full.
 */
int main(int argc, char **argv) {
    (void) argc;
    (void) argv;

    const size_t len = (size_t) 1 << 20;
    /* unsigned char so the bit test never depends on plain char's signedness */
    unsigned char *buf = (unsigned char *) malloc(len);
    if (buf == NULL) {
        perror("malloc");
        return EXIT_FAILURE;
    }

    FILE *f = fopen("rand", "rb"); // 1MB blob from /dev/urandom
    if (f == NULL) {
        perror("rand");
        free(buf);
        return EXIT_FAILURE;
    }

    /* One item of len bytes: fread returns 1 only if the whole blob was read. */
    const size_t items_read = fread(buf, len, 1, f);
    fclose(f);
    if (items_read != 1) {
        /* A short read would leave part of buf uninitialised and make the
         * branch pattern (and hence the measurement) meaningless. */
        fprintf(stderr, "rand: short read, expected %zu bytes\n", len);
        free(buf);
        return EXIT_FAILURE;
    }

    const uint64_t reps = (uint64_t) 1e8 * 4;
    const uint64_t bpp = sizeof(*buf) * 8; /* bits per buffer element */
    for (uint64_t i = 0; i < reps; ++i) {
        /* Data-dependent branch: taken iff bit (i % bpp) of the current byte is set. */
        if (buf[i / bpp % len] & (1u << (i % bpp))) {
            /* volatile + "memory" clobber keep the compiler from dropping or
             * hoisting the block; __asm__ is the GCC/Clang spelling that is
             * accepted in both C and C++ modes. */
            __asm__ __volatile__ (
                "nop\n\t"
                "nop\n\t"
                "nop\n\t"
                "nop\n\t"
                "nop\n\t"
                "nop\n\t"
                "nop\n\t"
                "nop\n\t"
                "nop\n\t"
                "nop\n\t"
                "nop\n\t"
                "nop\n\t"
                "nop\n\t"
                "nop\n\t"
                : : : "memory");
        }
    }

    free(buf);
    return 0;
}
- # misprediction rate of the test: 1/4 of all branches = 800M / 4 = 200M mispredicts
- # A53 mispredict penalty: 7 clocks, 200M * 7 = 1.4G clocks wasted in mispredicts
- # (http://www.7-cpu.com/cpu/Cortex-A53.html)
- # A72 mispredict penalty: 16 clocks, 200M * 16 = 3.2G clocks wasted in mispredicts
- # (http://www.7-cpu.com/cpu/Cortex-A57.html; A57 and A72 have the same on-paper penalty)
- phablet@ubuntu-phablet:~/proj/xxx$ time ./a.out # A53 clock: 1.5GHz
- real 0m4.106s
- user 0m4.080s
- sys 0m0.000s
- phablet@ubuntu-phablet:~/proj/xxx$ echo "scale=4; 4.106 * 1.5" | bc
- 6.1590 # clocks elapsed, in 10^9 units
- phablet@ubuntu-phablet:~/proj/xxx$ echo "scale=4; 4.106 * 1.5 / .4" | bc
- 15.3975 # clocks per loop iteration (400M iterations)
- phablet@ubuntu-phablet:~/proj/xxx$ echo "scale=4; 4.106 * 1.5 - 1.4" | bc
- 4.7590 # clocks elapsed sans mispredictions, in 10^9 units
- phablet@ubuntu-phablet:~/proj/xxx$ echo "scale=4; (4.106 * 1.5 - 1.4) / .4" | bc
- 11.8975 # clocks per loop iteration (400M iterations) -- misprediction clocks deducted
- blu@macchiato:~/proj/xxx$ time ./a.out # A72 clock: 1.3GHz
- real 0m6.029s
- user 0m6.028s
- sys 0m0.000s
- blu@macchiato:~/proj/xxx$ echo "scale=4; 6.029 * 1.3" | bc
- 7.8377 # clocks elapsed, in 10^9 units
- blu@macchiato:~/proj/xxx$ echo "scale=4; 6.029 * 1.3 / 0.4" | bc
- 19.5942 # clocks per loop iteration (400M iterations)
- blu@macchiato:~/proj/xxx$ echo "scale=4; 6.029 * 1.3 - 3.2" | bc
- 4.6377 # clocks elapsed sans mispredictions, in 10^9 units
- blu@macchiato:~/proj/xxx$ echo "scale=4; (6.029 * 1.3 - 3.2) / 0.4" | bc
- 11.5942 # clocks per loop iteration (400M iterations) -- misprediction clocks deducted
- # So, on this test A53 idealized performance is 11.8975 clk/iteration
- # A72 idealized performance is 11.5942 clk/iteration
- # Do you believe A72 does and A53 does not speculate on this test?
Add Comment
Please, Sign In to add comment