Guest User

Untitled

a guest
Jan 7th, 2018
124
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Bash 2.75 KB | None
  1. # branch-predictor-stressing test code:
  2. # loop of 400M iterations, 800M branches altogether, 200M mispredicts (1/4 mispredict rate), perf-measured on both amd64 and arm64
  3.  
  4.  
  5. #include <stdio.h>
  6. #include <stdint.h>
  7. #include <stdlib.h>
  8.  
  9. int main(int, char**) {
  10.     const size_t len = 1 << 20;
  11.     char *buf = (char*) malloc(len);
  12.  
  13.     FILE *f = fopen("rand", "rb"); // 1MB blob from /dev/urandom
  14.     fread(buf, len, 1, f);
  15.     fclose(f);
  16.  
  17.     const uint64_t reps = uint64_t(1e8) * 4;
  18.     const uint64_t bpp = sizeof(*buf) * 8;
  19.  
  20.     for (uint64_t i = 0; i < reps; ++i) {
  21.         if (buf[i / bpp % len] & (1 << i % bpp)) {
  22.             asm volatile (
  23.                 "nop\n\t"
  24.                 "nop\n\t"
  25.                 "nop\n\t"
  26.                 "nop\n\t"
  27.                 "nop\n\t"
  28.                 "nop\n\t"
  29.                 "nop\n\t"
  30.                 "nop\n\t"
  31.                 "nop\n\t"
  32.                 "nop\n\t"
  33.                 "nop\n\t"
  34.                 "nop\n\t"
  35.                 "nop\n\t"
  36.                 "nop\n\t"
  37.             : : : "memory");
  38.         }
  39.     }
  40.  
  41.     free(buf);
  42.     return 0;
  43. }
  44.  
  45. # misprediction rate of the test: 1/4 of all branches = 800M / 4 = 200M mispredicts
  46. # A53 mispredict penalty: 7 clocks, 200M * 7 = 1.4G clocks wasted in mispredicts
  47. # (http://www.7-cpu.com/cpu/Cortex-A53.html)
  48. # A72 mispredict penalty: 16 clocks, 200M * 16 = 3.2G clocks wasted in mispredicts
  49. # (http://www.7-cpu.com/cpu/Cortex-A57.html; A57 and A72 have the same on-paper penalty)
  50.  
  51. phablet@ubuntu-phablet:~/proj/xxx$ time ./a.out    # A53 clock: 1.5GHz
  52.  
  53. real    0m4.106s
  54. user    0m4.080s
  55. sys     0m0.000s
  56. phablet@ubuntu-phablet:~/proj/xxx$ echo "scale=4; 4.106 * 1.5" | bc
  57. 6.1590    # clocks elapsed, in 10^9 units
  58. phablet@ubuntu-phablet:~/proj/xxx$ echo "scale=4; 4.106 * 1.5 / .4" | bc
  59. 15.3975   # clocks per loop iteration (400M iterations)
  60. phablet@ubuntu-phablet:~/proj/xxx$ echo "scale=4; 4.106 * 1.5 - 1.4" | bc
  61. 4.7590    # clocks elapsed sans mispredictions, in 10^9 units
  62. phablet@ubuntu-phablet:~/proj/xxx$ echo "scale=4; (4.106 * 1.5 - 1.4) / .4" | bc
  63. 11.8975   # clocks per loop iteration (400M iterations) -- misprediction clocks deducted
  64.  
  65. blu@macchiato:~/proj/xxx$ time ./a.out    # A72 clock: 1.3GHz
  66.  
  67. real    0m6.029s
  68. user    0m6.028s
  69. sys     0m0.000s
  70. blu@macchiato:~/proj/xxx$ echo "scale=4; 6.029 * 1.3" | bc
  71. 7.8377    # clocks elapsed, in 10^9 units
  72. blu@macchiato:~/proj/xxx$ echo "scale=4; 6.029 * 1.3 / 0.4" | bc
  73. 19.5942   # clocks per loop iteration (400M iterations)
  74. blu@macchiato:~/proj/xxx$ echo "scale=4; 6.029 * 1.3 - 3.2" | bc
  75. 4.6377    # clocks elapsed sans mispredictions, in 10^9 units
  76. blu@macchiato:~/proj/xxx$ echo "scale=4; (6.029 * 1.3 - 3.2) / 0.4" | bc
  77. 11.5942   # clocks per loop iteration (400M iterations) -- misprediction clocks deducted
  78.  
  79. # So, on this test A53 idealized performance is 11.8975 clk/iteration
  80. #                  A72 idealized performance is 11.5942 clk/iteration
  81. # Do you believe A72 does and A53 does not speculate on this test?
RAW Paste Data Copied