Untitled

# branch-predictor-stressing test code:
# loop of 400M iterations, 800M branches altogether, 200M mispredicts (1/4 mispredict rate), perf-measured on both amd64 and arm64


#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

// Branch-predictor stress test.
//
// Loads 1 MiB of random bytes from the file "rand" in the current directory
// and runs 400M loop iterations. Each iteration tests one bit of the buffer
// (bit i % 8 of byte i / 8 % len, so the buffer is walked bit by bit and
// wraps around) and, when the bit is set, executes a block of 14 nops.
//
// Returns 0 on success, EXIT_FAILURE if the buffer cannot be allocated or
// the input file cannot be opened or read in full.
int main(int, char**) {
    const size_t len = 1 << 20;
    char *buf = (char*) malloc(len);
    if (buf == NULL) {
        perror("malloc");
        return EXIT_FAILURE;
    }

    FILE *f = fopen("rand", "rb"); // 1MB blob from /dev/urandom
    if (f == NULL) {
        perror("rand");
        free(buf);
        return EXIT_FAILURE;
    }

    // One item of `len` bytes: fread returns 1 only if the whole blob was read.
    // A short read would leave part of buf uninitialized and skew the
    // data-dependent branch, so treat it as fatal.
    const size_t items_read = fread(buf, len, 1, f);
    fclose(f);
    if (items_read != 1) {
        fprintf(stderr, "rand: could not read %zu bytes\n", len);
        free(buf);
        return EXIT_FAILURE;
    }

    const uint64_t reps = uint64_t(1e8) * 4;
    const uint64_t bpp = sizeof(*buf) * 8; // bits per buffer element

    for (uint64_t i = 0; i < reps; ++i) {
        // Data-dependent branch on a single random bit.
        if (buf[i / bpp % len] & (1 << i % bpp)) {
            // volatile asm: the taken path must not be optimized away.
            asm volatile (
                "nop\n\t"
                "nop\n\t"
                "nop\n\t"
                "nop\n\t"
                "nop\n\t"
                "nop\n\t"
                "nop\n\t"
                "nop\n\t"
                "nop\n\t"
                "nop\n\t"
                "nop\n\t"
                "nop\n\t"
                "nop\n\t"
                "nop\n\t"
            : : : "memory");
        }
    }

    free(buf);
    return 0;
}

# misprediction rate of the test: 1/4 of all branches, i.e. 800M / 4 = 200M mispredicts
# A53 mispredict penalty: 7 clocks, so 200M * 7 = 1.4G clocks wasted in mispredicts
# (http://www.7-cpu.com/cpu/Cortex-A53.html)
# A72 mispredict penalty: 16 clocks, so 200M * 16 = 3.2G clocks wasted in mispredicts
# (http://www.7-cpu.com/cpu/Cortex-A57.html; A57 and A72 have the same on-paper penalty)

phablet@ubuntu-phablet:~/proj/xxx$ time ./a.out    # A53 clock: 1.5GHz

real    0m4.106s
user    0m4.080s
sys     0m0.000s
phablet@ubuntu-phablet:~/proj/xxx$ echo "scale=4; 4.106 * 1.5" | bc
6.1590    # clocks elapsed, in 10^9 units
phablet@ubuntu-phablet:~/proj/xxx$ echo "scale=4; 4.106 * 1.5 / .4" | bc
15.3975   # clocks per loop iteration (400M iterations)
phablet@ubuntu-phablet:~/proj/xxx$ echo "scale=4; 4.106 * 1.5 - 1.4" | bc
4.7590    # clocks elapsed sans mispredictions, in 10^9 units
phablet@ubuntu-phablet:~/proj/xxx$ echo "scale=4; (4.106 * 1.5 - 1.4) / .4" | bc
11.8975   # clocks per loop iteration (400M iterations) -- misprediction clocks deducted

blu@macchiato:~/proj/xxx$ time ./a.out    # A72 clock: 1.3GHz

real    0m6.029s
user    0m6.028s
sys     0m0.000s
blu@macchiato:~/proj/xxx$ echo "scale=4; 6.029 * 1.3" | bc
7.8377    # clocks elapsed, in 10^9 units
blu@macchiato:~/proj/xxx$ echo "scale=4; 6.029 * 1.3 / 0.4" | bc
19.5942   # clocks per loop iteration (400M iterations)
blu@macchiato:~/proj/xxx$ echo "scale=4; 6.029 * 1.3 - 3.2" | bc
4.6377    # clocks elapsed sans mispredictions, in 10^9 units
blu@macchiato:~/proj/xxx$ echo "scale=4; (6.029 * 1.3 - 3.2) / 0.4" | bc
11.5942   # clocks per loop iteration (400M iterations) -- misprediction clocks deducted

# So, on this test A53 idealized performance is 11.8975 clk/iteration
#                  A72 idealized performance is 11.5942 clk/iteration
# Do you believe A72 does and A53 does not speculate on this test?