# alertmanagerrules.yml
# Pastebin paste bLd759, posted Apr 7th, 2021 (text, 3.70 KB).
  1. groups:
  2. - name: alert_rules
  3. rules:
  4. - alert: InstanceDown
  5. expr: up == 0
  6. for: 5m
  7. labels:
  8. severity: critical
  9. annotations:
  10. summary: "Instance [{{ $labels.instance }}] down"
  11. description: "[{{ $labels.instance }}] of job [{{ $labels.job }}] has been down for more than 1 minute."
  12.  
  # HostHighCpuLoad: warning raised immediately (for: 0m) when the expression
  # exceeds 80. The expression is 100 minus the per-instance average rate of
  # idle-mode node_cpu_seconds_total over a 2m window, multiplied by 100.
  - alert: HostHighCpuLoad
    expr: >-
      100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
    for: 0m
    annotations:
      summary: "Host high CPU load (instance [{{ $labels.instance }}])"
      description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
    labels:
      severity: warning

  # This rule is commented out because it can fire very often, for example
  # during a node compilation. Enable it only for testing purposes.
  #
  # - alert: HostHighCpuLoadSpike
  #   expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[10s])) * 100) > 98
  #   for: 0m
  #   labels:
  #     severity: information
  #   annotations:
  #     summary: Host CPU load spike (instance [{{ $labels.instance }}])
  #     description: "CPU instant load is > 98%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # HostHighRamLoad: warning raised immediately (for: 0m) when the expression
  # exceeds 80. The expression is MemTotal minus MemFree minus
  # (Cached + Buffers), divided by MemTotal and multiplied by 100.
  - alert: HostHighRamLoad
    expr: >-
      (node_memory_MemTotal_bytes - node_memory_MemFree_bytes - (node_memory_Cached_bytes + node_memory_Buffers_bytes)) / node_memory_MemTotal_bytes *100 > 80
    for: 0m
    annotations:
      summary: "Host high RAM usage (instance [{{ $labels.instance }}])"
      description: "RAM usage is > 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
    labels:
      severity: warning

  # HostHighIOUsage: warning raised immediately (for: 0m) when the irate of
  # node_disk_io_time_seconds_total over a 5m range is above 0.8.
  - alert: HostHighIOUsage
    expr: irate(node_disk_io_time_seconds_total [5m]) > 0.8
    for: 0m
    annotations:
      summary: "Host high IO usage (instance [{{ $labels.instance }}])"
      description: "IO usage is > 0.8 seconds (max = 1)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
    labels:
      severity: warning

  # HostHighNetworkRecvLo: warning raised immediately (for: 0m) when the
  # receive rate on device "lo" is above 5. The `*8/1000000` factor converts
  # the byte rate to the Mbps figure quoted in the description.
  - alert: HostHighNetworkRecvLo
    expr: irate(node_network_receive_bytes_total{device="lo"}[5m])*8/1000000 > 5
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Host high Network Receive Lo (instance [{{ $labels.instance }}])"
      description: "Network Receive Lo > 5Mbps (usually < 1Mbps)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # HostHighNetworkRecvEns3: warning raised immediately (for: 0m) when the
  # receive rate on device "ens3" is above 50. The `*8/1000000` factor converts
  # the byte rate to the Mbps figure quoted in the description.
  - alert: HostHighNetworkRecvEns3
    expr: irate(node_network_receive_bytes_total{device="ens3"}[5m])*8/1000000 > 50
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Host high Network Receive Ens3 (instance [{{ $labels.instance }}])"
      description: "Network Receive Ens3 > 50Mbps (usually < 20Mbps)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # HostFsCapacity: warning raised immediately (for: 0m) when the root
  # filesystem ("/", excluding fstype "rootfs") is more than 50% used.
  # The expression is 100 minus the available percentage, i.e. the USED
  # percentage, so the description reports usage rather than availability.
  - alert: HostFsCapacity
    expr: >-
      100 - ((node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs"}) > 50
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Host FS storage > 50% (instance [{{ $labels.instance }}])"
      description: "File storage used is > 50%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # PeersCountLow: warning raised immediately (for: 0m) when
  # polkadot_sub_libp2p_peers_count drops below 20.
  - alert: PeersCountLow
    expr: polkadot_sub_libp2p_peers_count < 20
    for: 0m
    annotations:
      summary: "Low peers count (instance [{{ $labels.instance }}])"
      description: "Number of peers connected under 20\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
    labels:
      severity: warning
# End of paste.