j0h

Pi_brownout_detect

j0h
Dec 30th, 2025
21
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Bash 5.75 KB | None | 0 0
  1. #!/usr/bin/env bash
  # Strict mode: stop on unhandled errors, on unset variables, and when any
  # stage of a pipeline fails.
  set -euo pipefail

  # ---- tunables: every value below may be overridden from the environment ----
  INTERVAL_SEC="${INTERVAL_SEC:-2}"                     # seconds between throttle-flag polls
  SNAPSHOT_COOLDOWN_SEC="${SNAPSHOT_COOLDOWN_SEC:-30}"  # minimum seconds between snapshots
  PER_TAG_COOLDOWN_SEC="${PER_TAG_COOLDOWN_SEC:-60}"    # minimum seconds between logs for same tag
  ETH_IF="${ETH_IF:-eth0}"                              # interface inspected with ethtool

  LOGDIR="${LOGDIR:-/home/pi/e}"
  LOGFILE="${LOGFILE:-$LOGDIR/brownout.log}"
  STATEFILE="${STATEFILE:-$LOGDIR/brownout.state}"
  LOCKFILE="${LOCKFILE:-/run/fr201-brownout-monitor.lock}"

  # The cooldowns are used in shell arithmetic and the interval is handed to
  # sleep, so reject anything that is not a plain number before going further.
  if [[ ! "$INTERVAL_SEC" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
    printf '%s: INTERVAL_SEC must be a non-negative number (got "%s")\n' \
      "${0##*/}" "$INTERVAL_SEC" >&2
    exit 2
  fi
  for _cfg_name in SNAPSHOT_COOLDOWN_SEC PER_TAG_COOLDOWN_SEC; do
    if [[ ! "${!_cfg_name}" =~ ^[0-9]+$ ]]; then
      printf '%s: %s must be a non-negative integer (got "%s")\n' \
        "${0##*/}" "$_cfg_name" "${!_cfg_name}" >&2
      exit 2
    fi
  done
  unset _cfg_name

  # Broad kernel patterns that may indicate (or result from) brownouts
  # (not limited to USB/Ethernet). Extended regular expression, matched
  # case-insensitively by the code that uses it.
  KERNEL_PAT='under[- ]voltage|over[- ]current|brownout|throttl|watchdog|hung task|soft lockup|hard lockup|kernel panic|oops|BUG:|segfault|rcu_sched|mmc|sdhci|mmcblk|I/O error|Buffer I/O error|blk_update_request|EXT4-fs error|XFS.*error|FAT-fs|exFAT|BTRFS.*error|nvme.*reset|pcie.*AER|usb .*disconnect|usb .*reset|xhci|dwc2|uas|link down|link up|NETDEV WATCHDOG'
  17.  
  # Print the current time as an ISO-8601 timestamp with seconds precision.
  ts() {
    date -Iseconds
  }
  # Print the current time as whole seconds since the Unix epoch.
  epoch() {
    date '+%s'
  }
  # Return 0 when "$1" names something the shell can run, non-zero otherwise.
  # The lookup itself prints nothing.
  have() {
    command -v "$1" &>/dev/null
  }
  # Append one line to $LOGFILE: the current timestamp from ts, a space, then
  # all arguments joined into a single string.
  log() {
    local message="$*"
    printf '%s %s\n' "$(ts)" "$message" >>"$LOGFILE"
  }
  22.  
  # Print the firmware "throttled" flag word as a hexadecimal string, or an
  # empty line when no source can supply one. Always returns 0.
  #
  # Sources, tried in order:
  #   1. `vcgencmd get_throttled` - the second '='-separated field of its
  #      output is used.
  #   2. the file named by $THROTTLED_SYSFS (defaults to the firmware node
  #      under /sys).
  #
  # A vcgencmd that is installed but fails, or prints nothing usable, does not
  # end the lookup: the sysfs file is tried next.
  get_throttled_raw() {
    local sysfs_node="${THROTTLED_SYSFS:-/sys/devices/platform/soc/soc:firmware/get_throttled}"
    local value=""

    if have vcgencmd; then
      value="$(vcgencmd get_throttled 2>/dev/null | awk -F= '{print $2}' | tr -d '\r\n')" \
        || value=""
      if [[ -n "$value" ]]; then
        printf '%s' "$value"
        return 0
      fi
    fi

    if [[ -r "$sysfs_node" ]]; then
      value="$(tr -d '\r\n' 2>/dev/null <"$sysfs_node")" || value=""
      if [[ -n "$value" ]]; then
        printf '%s' "$value"
        return 0
      fi
    fi

    echo ""
  }
  34.  
  # Convert a hexadecimal string to decimal and print it.
  #
  # Arguments: $1 - hex digits, optionally prefixed with 0x or 0X; surrounding
  #                 whitespace is ignored. Empty or missing input counts as 0.
  # Outputs:   the decimal value on stdout.
  # Returns:   0 on success; 1 (with a message on stderr and nothing on
  #            stdout) when the input is not hexadecimal. The input is checked
  #            before it reaches shell arithmetic, so arbitrary text is never
  #            evaluated as an expression.
  hex_to_int() {
    local digits="${1-}"

    digits="${digits//[[:space:]]/}"
    digits="${digits#0[xX]}"

    if [[ -z "$digits" ]]; then
      echo 0
      return 0
    fi

    if [[ ! "$digits" =~ ^[0-9a-fA-F]+$ ]]; then
      printf 'hex_to_int: not a hexadecimal number: %s\n' "${1-}" >&2
      return 1
    fi

    echo "$((16#$digits))"
  }
  40.  
  # Turn a numeric throttle mask ($1) into a space-separated list of labels.
  # The low four bits are reported as "NOW:*" and bits 16-19 as "SINCE:*", in
  # ascending bit order; when none of those bits is set the result is "OK".
  # No trailing newline is printed.
  describe_mask() {
    local flags="$1"
    local -a bit_values=(0x1 0x2 0x4 0x8 0x10000 0x20000 0x40000 0x80000)
    local -a bit_labels=(
      "NOW:undervoltage"
      "NOW:freq_capped"
      "NOW:throttling"
      "NOW:temp_limit"
      "SINCE:undervoltage"
      "SINCE:freq_capped"
      "SINCE:throttling"
      "SINCE:temp_limit"
    )
    local -a active=()
    local idx

    for idx in "${!bit_values[@]}"; do
      if (( flags & bit_values[idx] )); then
        active+=("${bit_labels[idx]}")
      fi
    done

    if (( ${#active[@]} == 0 )); then
      active=("OK")
    fi

    printf '%s' "${active[*]}"
  }
  58.  
  # Append a diagnostic snapshot (firmware readings, network, USB, block
  # devices and a short kernel-log window) to $LOGFILE.
  #
  # Arguments: $1 - free-form reason recorded in the begin/skip log line.
  # Globals:   LAST_SNAPSHOT_EPOCH (read/written), SNAPSHOT_COOLDOWN_SEC,
  #            LOGDIR, LOGFILE, ETH_IF, KERNEL_PAT (read);
  #            SNAPSHOT_STAMPFILE (optional override of the stamp file path).
  # Returns:   0, also when the snapshot is skipped because of the cooldown.
  #
  # The cooldown is tracked in a stamp file as well as in LAST_SNAPSHOT_EPOCH.
  # This function is called both from the main shell and from the background
  # kernel follower, and a variable set in one of those processes is invisible
  # to the other, so without the file each caller would run its own cooldown.
  snapshot_devices() {
    local reason="$1"
    local now_e last_e stamp_e=""
    local stamp_file="${SNAPSHOT_STAMPFILE:-$LOGDIR/brownout.snapshot}"
    now_e="$(epoch)"

    # Most recent snapshot known to this process or recorded by any other one.
    last_e="${LAST_SNAPSHOT_EPOCH:-0}"
    if [[ -r "$stamp_file" ]]; then
      IFS= read -r stamp_e <"$stamp_file" || true
      # 10# keeps a value with leading zeros from being read as octal.
      if [[ "$stamp_e" =~ ^[0-9]+$ ]] && (( 10#$stamp_e > last_e )); then
        last_e="$((10#$stamp_e))"
      fi
    fi
    # A stamp in the future means the clock was stepped backwards; do not let
    # it suppress snapshots until the clock catches up.
    if (( last_e > now_e )); then
      last_e=0
    fi

    if (( now_e - last_e < SNAPSHOT_COOLDOWN_SEC )); then
      log "SNAPSHOT skipped (cooldown) reason=$reason"
      return 0
    fi
    LAST_SNAPSHOT_EPOCH="$now_e"
    # Best effort: an unwritable stamp only loses the cross-process cooldown.
    printf '%s\n' "$now_e" 2>/dev/null >"$stamp_file" || true

    log "SNAPSHOT begin reason=$reason"

    if have vcgencmd; then
      {
        vcgencmd get_throttled || true
        vcgencmd measure_volts core || true
        vcgencmd measure_clock arm || true
        vcgencmd measure_temp || true
      } | sed 's/^/  /' >>"$LOGFILE"
    fi

    {
      echo "  [net]"
      ip -br link 2>/dev/null || true
      if have ethtool && ip link show "$ETH_IF" >/dev/null 2>&1; then
        # grep -E instead of egrep: egrep is obsolescent and warns on stderr.
        ethtool "$ETH_IF" 2>/dev/null \
          | grep -Ei 'Speed:|Duplex:|Auto-negotiation:|Link detected:' || true
      fi
    } >>"$LOGFILE"

    {
      echo "  [usb]"
      if have lsusb; then
        lsusb -t 2>/dev/null || true
      fi
    } >>"$LOGFILE"

    {
      echo "  [block]"
      # Retry with the singular MOUNTPOINT column if this lsblk rejects the
      # MOUNTPOINTS column.
      lsblk -o NAME,MODEL,SIZE,FSTYPE,TYPE,TRAN,MOUNTPOINTS 2>/dev/null \
        || lsblk -o NAME,MODEL,SIZE,FSTYPE,TYPE,TRAN,MOUNTPOINT 2>/dev/null \
        || true
    } >>"$LOGFILE"

    # Small kernel window for context, *only* around the event time
    if have journalctl; then
      local window_start window_end
      window_start="$((now_e - 15))"
      window_end="$((now_e + 2))"
      {
        echo "  [kernel window]"
        journalctl -k --since "@$window_start" --until "@$window_end" -o short-iso 2>/dev/null \
          | grep -Ei "$KERNEL_PAT" || true
      } >>"$LOGFILE"
    fi

    log "SNAPSHOT end"
  }
  115.  
  # Classify one kernel log line ($1) and print its category tag.
  # The line is tested case-insensitively against each category's regular
  # expression in the order listed; the first match wins and "OTHER" is
  # printed when nothing matches. nocasematch is switched on for the tests
  # and off again before returning.
  tag_line() {
    local text="$1"
    local -a categories=(POWER MMC BLOCK_IO FS USB NET SCHED KERNEL)
    local -a expressions=(
      'under[- ]voltage|over[- ]current|brownout|throttl'
      'mmc|sdhci|mmcblk'
      'Buffer I/O error|blk_update_request|I/O error'
      'EXT4-fs error|XFS.*error|BTRFS.*error|FAT-fs|exFAT'
      'usb .*disconnect|usb .*reset|xhci|dwc2|uas'
      'link down|link up|NETDEV WATCHDOG'
      'watchdog|soft lockup|hard lockup|hung task'
      'kernel panic|oops|BUG:'
    )
    local verdict="OTHER"
    local idx

    shopt -s nocasematch
    for idx in "${!categories[@]}"; do
      # Unquoted on purpose: the right-hand side must act as a regex.
      if [[ "$text" =~ ${expressions[idx]} ]]; then
        verdict="${categories[idx]}"
        break
      fi
    done
    shopt -u nocasematch

    echo "$verdict"
  }
  140.  
  # Per-tag rate limiter.
  # Returns 0 and records the current epoch in LAST_TAG_EPOCH[$1] when at least
  # PER_TAG_COOLDOWN_SEC seconds have passed since that tag was last accepted
  # (a tag with no entry counts as last accepted at epoch 0); otherwise
  # returns 1 and leaves the record untouched.
  should_log_tag() {
    local tag="$1"
    local now_e previous
    now_e="$(epoch)"
    previous="${LAST_TAG_EPOCH[$tag]:-0}"

    # Still inside the cooldown window: reject without updating the record.
    (( now_e - previous >= PER_TAG_COOLDOWN_SEC )) || return 1

    LAST_TAG_EPOCH[$tag]="$now_e"
    return 0
  }
  152.  
  # Follow the kernel journal and react to lines matching $KERNEL_PAT.
  # For each matching line the tag from tag_line is rate-limited through
  # should_log_tag; accepted lines are logged as "KERNEL[<tag>] <line>" and
  # followed by a device snapshot. Runs until journalctl ends.
  #
  # Returns 0 right away (after logging a warning) when journalctl is missing.
  #
  # Notes:
  #   * `-n 0` starts at the live tail. Without it `journalctl -f` first
  #     prints the last 10 existing lines, which would be handled as if they
  #     had just happened every time the monitor starts.
  #   * Matching is done in the shell, so no process is spawned per line.
  #   * The loop is one stage of a pipeline and therefore runs in a subshell;
  #     variables changed by the functions it calls stay in that subshell.
  follow_kernel() {
    have journalctl || { log "WARN: journalctl missing; kernel-follow disabled"; return 0; }

    local line tag matched

    journalctl -k -f -n 0 -o short-iso 2>/dev/null | while IFS= read -r line; do
      # Case-insensitive match, limited to this one test.
      matched=0
      shopt -s nocasematch
      [[ "$line" =~ $KERNEL_PAT ]] && matched=1
      shopt -u nocasematch
      (( matched )) || continue

      tag="$(tag_line "$line")"
      if should_log_tag "$tag"; then
        log "KERNEL[$tag] $line"
        snapshot_devices "kernel:$tag"
      fi
    done
  }
  167.  
  # Poll the firmware throttle flags every $INTERVAL_SEC seconds, forever.
  #
  # On every change of the decoded mask: write a "THROTTLED" log line, store
  # the new mask in $STATEFILE (written to a temp file, then renamed) and take
  # a device snapshot. The stored mask is read back at start-up so a restart
  # does not report an unchanged value again.
  #
  # The loop is meant to survive bad conditions:
  #   * a reading that cannot be decoded is logged once per distinct value and
  #     then skipped, instead of ending the script under `set -e`;
  #   * a failed state-file write is logged and otherwise ignored.
  poll_throttled() {
    local last_mask="-1"
    local raw mask
    local last_bad=""

    if [[ -r "$STATEFILE" ]]; then
      last_mask="$(cat "$STATEFILE" 2>/dev/null || echo -1)"
    fi

    while true; do
      raw="$(get_throttled_raw || true)"
      if [[ -n "$raw" ]]; then
        if mask="$(hex_to_int "$raw" 2>/dev/null)" && [[ -n "$mask" ]]; then
          last_bad=""
          if [[ "$mask" != "$last_mask" ]]; then
            log "THROTTLED raw=0x${raw#0x} $(describe_mask "$mask")"
            if ! { printf '%s\n' "$mask" >"${STATEFILE}.tmp" \
              && mv -f "${STATEFILE}.tmp" "$STATEFILE"; } 2>/dev/null; then
              log "WARN: could not persist throttled state to $STATEFILE"
            fi
            # Snapshot on any change (rare), not every poll
            snapshot_devices "throttled_change"
            last_mask="$mask"
          fi
        elif [[ "$raw" != "$last_bad" ]]; then
          # Remember the offending value so it is not logged on every poll.
          log "WARN: unparsable throttled value raw=$raw"
          last_bad="$raw"
        fi
      fi
      sleep "$INTERVAL_SEC"
    done
  }
  191.  
  # ---- main ----
  # Prepare the log location, make sure only one monitor runs, then start the
  # kernel follower in the background and poll the throttle flags in the
  # foreground.
  mkdir -p "$LOGDIR"
  touch "$LOGFILE"

  # Single-instance guard: a non-blocking flock on fd 9.
  # A missing flock binary would otherwise look exactly like "lock already
  # held" and end the script silently, so it is checked on its own.
  if ! have flock; then
    log "ERROR: flock not found; cannot enforce a single instance"
    exit 1
  fi
  if ! exec 9>"$LOCKFILE"; then
    log "ERROR: cannot open lock file $LOCKFILE"
    exit 1
  fi
  if ! flock -n 9; then
    # Another instance holds the lock; leave quietly.
    exit 0
  fi

  declare -A LAST_TAG_EPOCH
  LAST_SNAPSHOT_EPOCH=0

  log "monitor start (interval=${INTERVAL_SEC}s, eth_if=${ETH_IF})"

  follow_kernel &
  FOLLOWER_PID=$!

  # Stop the background follower and its pipeline children when this script
  # ends. They inherit fd 9, so a follower left behind would keep the lock
  # and make every later start exit as "already running".
  stop_follower() {
    if have pkill; then
      pkill -TERM -P "$FOLLOWER_PID" 2>/dev/null || true
    fi
    kill "$FOLLOWER_PID" 2>/dev/null || true
  }
  trap stop_follower EXIT
  # Turn the usual termination signals into a normal exit so the EXIT trap runs.
  trap 'exit 129' HUP
  trap 'exit 130' INT
  trap 'exit 143' TERM

  poll_throttled
  208.  
Add Comment
Please, Sign In to add comment