Advertisement
Guest User

ESXi spinpid.sh

a guest
May 18th, 2017
202
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Bash 12.01 KB | None | 0 0
  1. #!/usr/local/bin/bash
  2. # spinpid.sh version 2017-01-01. Run as superuser. See notes at end.
  3.  
  4. ##############################################
  5. #
  6. #  Settings
  7. #
  8. ##############################################
  # Everything the script prints is also written to this file.
  LOG=/root/spinpid.log

  # ipmitool invocation used for every BMC request.  Replace the
  # <IP>/<U> placeholders with the BMC address and credentials.
  IPMITOOL="/usr/local/bin/ipmitool -H <IP> -U <U> -P <U>"

  # Lowest fan duty cycle (%) the script will ever request, so that
  # the fans do not stall.
  FAN_MIN=50

  # Drive temperature control:
  #   SP  - setpoint for the mean drive temperature
  #   T   - minutes between drive checks
  SP=35.0
  T=2

  # PID tunables: proportional, integral and derivative constants.
  Kp=4 Ki=0 Kd=40

  # Correction produced by the controller; nothing to correct yet.
  PID=0
  25.  
  ##############################################
  # function get_disk_name
  # Get disk name from current LINE of DEVLIST
  ##############################################
  # Reads the global LINE (one line of `camcontrol devlist` output) and
  # stores the disk's device name in the global DEVID.
  #
  # The device names sit in the last parenthesised group of the line,
  # e.g. "(pass0,da0)" or "(ada0,pass1)".  The order of the names inside
  # that group is not consistent, so instead of relying on position the
  # function picks the first name that is not a pass-through ("pass#")
  # device.  DEVID is left empty when LINE has no such group or the
  # group names only pass devices.
  function get_disk_name {
    local group name
    local -a names=()

    DEVID=""
    # Nothing to extract without a "(...)" group.
    [[ $LINE == *"("*")"* ]] || return 0

    group=${LINE##*"("}     # text after the last '('
    group=${group%%")"*}    # ... up to the ')' that closes it
    IFS=',' read -r -a names <<< "$group"

    for name in "${names[@]}"; do
      if [[ -n $name && $name != pass* ]]; then
        DEVID=$name
        break
      fi
    done
    return 0
  }
  45.  
  46. ############################################################
  47. # function print_header
  48. # Called when script starts and each quarter day
  49. ############################################################
  function print_header {
    # Column titles for everything that follows the per-drive columns.
    local -a titles=("Tmax" "Tmean" "ERRc" "P" "I" "D" "CPU" "Driver" "RPM" "MODE" "Curr/New" "Adjustments")

    # First header line: the date, padded so that the next two titles
    # land to the right of the drive columns (5 characters per drive).
    SPACES=$(( DEVCOUNT * 5 + 70 ))
    DATE=$(date +"%A, %b %d")
    printf "\n%-*s %-8s %s \n" $SPACES "$DATE" "Fan %" "Interim CPU"

    # Second header line: blank space under the time stamp, one
    # 5-wide column per drive named in DEVLIST, then the titles.
    # No trailing newline is printed.
    printf '%s' "          "
    while read LINE
    do
      get_disk_name
      printf "%-5s" $DEVID
    done <<< "$DEVLIST"
    printf "%4s %5s %5s %6s %5s %6s %3s %s %4s %-7s %s %s" "${titles[@]}"
  }
  61.  
  62. #################################################
  63. # function ipmi_read_duty
  64. #################################################
  function ipmi_read_duty {
    # $1 is appended as the last byte of the raw request.  The reply is
    # stored untouched in the global DUTY_CURR; the caller converts it.
    local last_byte=$1
    # IPMITOOL holds a command plus its options, so it must word-split.
    DUTY_CURR=$( $IPMITOOL raw 0x30 0x70 0x66 0x00 $last_byte )
  }
  68.  
  69. #################################################
  70. # function ipmi_set_duty
  71. #################################################
  function ipmi_set_duty {
    # $1 and $2 are appended, in that order, to the raw request that
    # sets a new duty cycle.  Whatever ipmitool prints is echoed
    # without a trailing newline so the log line is not broken.
    local first_byte=$1 second_byte=$2
    echo -n $( $IPMITOOL raw 0x30 0x70 0x66 1 $first_byte $second_byte )
  }
  76.  
  #################################################
  # helper _hex_to_dec: print, in decimal, the first
  # whitespace-separated word of $1 read as hexadecimal.
  # Prints nothing and returns 1 if that word is not
  # made of hex digits only.
  #################################################
  function _hex_to_dec {
    local word _rest
    local hex_re='^[0-9A-Fa-f]+$'
    read -r word _rest <<< "$1"
    [[ $word =~ $hex_re ]] || return 1
    printf '%d\n' "$(( 16#$word ))"
  }

  #################################################
  # function drive_data: Read, process, print data
  #################################################
  # Reads globals:  Tsum, Tmax, i (sum, max and count of spinning-drive
  #                 temperatures), SP, T, Kp, Ki, Kd, ERRc, I, IPMITOOL,
  #                 DUTY_NEW
  # Sets globals:   Tmean, ERRp, ERRc, ERR, P, I, D, PID (rounded to an
  #                 integer), DUTY_CURR (decimal), MODE, MODEt, RPM
  # Prints:         "^<Tmax> <Tmean>" without a trailing newline
  function drive_data {
    local duty_prev=$DUTY_CURR
    local dec

    if (( i > 0 )); then
      Tmean=$(echo "scale=3; $Tsum / $i" | bc)
      ERRp=$ERRc
      ERRc=$(echo "scale=2; $Tmean - $SP" | bc)
      # NOTE(review): ERR is built from the previous I (already scaled by
      # Ki) rather than from a running error sum, so with Ki != 0 the
      # integral term does not appear to accumulate.  Behaviour kept
      # as-is; harmless with the default Ki=0 -- confirm before using Ki.
      ERR=$(echo "scale=2; $ERRc * $T + $I" | bc)
      P=$(echo "scale=2; $Kp * $ERRc" | bc)
      I=$(echo "scale=2; $Ki * $ERR" | bc)
      D=$(echo "scale=2; $Kd * ($ERRc - $ERRp) / $T" | bc)
      PID=$(echo "scale=2; $P + $I + $D" | bc)  # add 3 corrections
      PID=$(printf %0.f "$PID")  # round
    else
      # No spinning drive was counted, so there is no temperature to
      # average (the division would be by zero).  Request no correction
      # and leave ERRc and I untouched for the next real reading.
      Tmean=0
      P=0
      D=0
      PID=0
    fi

    # Read duty cycle (the BMC answers in hex) and convert to decimal.
    # Some boards apparently return incorrect data; when the reply is
    # not usable, assume the duty is what was last requested.
    ipmi_read_duty 0x01
    if dec=$(_hex_to_dec "$DUTY_CURR"); then
      DUTY_CURR=$dec
    else
      DUTY_CURR=${DUTY_NEW:-${duty_prev:-0}}
    fi

    # Read fan mode (hex) and convert to decimal; empty if unreadable.
    MODE=$($IPMITOOL raw 0x30 0x45 0)
    MODE=$(_hex_to_dec "$MODE") || MODE=""
    # Text for mode
    case $MODE in
      0) MODEt="Standard" ;;
      1) MODEt="Full" ;;
      2) MODEt="Optimal" ;;
      4) MODEt="HeavyIO" ;;
      *) MODEt="Unknown" ;;  # never leave a stale or empty label behind
    esac

    # Get reported fan speed in RPM: the first run of 2 to 5 digits on
    # the sensor line(s) containing "FANA"; 0 when nothing is found, so
    # the value can always be printed as a number.
    RPM=$($IPMITOOL sdr | grep "FANA" | grep -Eo '[0-9]{2,5}' | head -n 1)
    RPM=${RPM:-0}

    # Print current Tmax and Tmean; the caller prints the other columns.
    printf "^%-3d %5.2f" "$Tmax" "$Tmean"
  }
  114.  
  ##############################################
  # function DRIVES_check_adjust
  # Print time on new log line.
  # Go through each drive, getting and printing
  # status and temp.  Calculate sum and max
  # temp of the spinning drives, then call
  # function drive_data and pass current duty
  # plus the PID correction to adjust_fans.
  ##############################################
  # Reads globals:  DEVLIST, plus DUTY_CURR and PID as set by drive_data
  # Sets globals:   TIME, Tmax, Tsum, i, LINE, DEVID, TEMP, RETURN, STATE,
  #                 DUTY_DRIVE, DRIVER, MAX
  function DRIVES_check_adjust {
    local num_re='^[0-9]+$'

    echo  # start new line
    # print time on each line
    TIME=$(date "+%H:%M:%S"); echo -n "$TIME  "
    Tmax=0; Tsum=0  # initialize drive temps for new loop through drives
    i=0  # count number of spinning drives
    while read -r LINE ; do
      get_disk_name
      # Temperature from the SMART attribute table; -n standby makes
      # smartctl leave a sleeping drive alone, so TEMP is empty then.
      TEMP=$(/usr/local/sbin/smartctl -a -n standby "/dev/$DEVID" \
        | grep "Temperature_Celsius" \
        | /usr/local/bin/pcregrep -o1 '([0-9]*)( \(.*\))?$' \
        | head -n 1)
      # Second call is only for its exit status (0 spinning, 2 standby);
      # its output is not needed, so it is discarded rather than written
      # to a fixed file under /var.
      /usr/local/sbin/smartctl -n standby "/dev/$DEVID" > /dev/null
      RETURN=$?
      case $RETURN in
        0) STATE="*" ;;  # spinning
        2) STATE="_" ;;  # standby
        *) STATE="?" ;;  # state unknown
      esac
      printf "%s%-2d  " "$STATE" "$TEMP"
      # Update temperatures each drive; spinners with a usable reading only
      if [[ $STATE == "*" && $TEMP =~ $num_re ]]; then
        TEMP=$(( 10#$TEMP ))  # force base 10 in case of a leading zero
        Tsum=$(( Tsum + TEMP ))
        # Numeric comparison: inside [[ ]] the > operator compares strings.
        if (( TEMP > Tmax )); then Tmax=$TEMP; fi
        i=$(( i + 1 ))
      fi
    done <<< "$DEVLIST"
    drive_data  # manage data
    DUTY_DRIVE=$(( DUTY_CURR + PID ))

    DRIVER="Drives"
    MAX=$DUTY_DRIVE

    adjust_fans "$MAX"  # passing the requested duty to the function adjust_fans
  }
  157.  
  158. ##############################################
  159. # function adjust_fans
  160. # Add correction to current duty,
  161. # set duty, print diagnostic data
  162. ##############################################
  function adjust_fans {
    # Reset BMC if fans seem stuck: cool and >80% OR warm and <30%
    # if [[ $Tmean<$(($SP - 1)) && $DUTY>0x50 ]] || [[ $Tmean>$(($SP + 5)) && $DUTY<0x1E ]]; then
    #    $IPMITOOL bmc reset warm; fi

    # $1 is the duty cycle (%) the caller would like to apply.
    DUTY_NEW=$1

    # Keep the request between FAN_MIN and 95 %.
    (( DUTY_NEW > 95 )) && DUTY_NEW=95
    (( DUTY_NEW < FAN_MIN )) && DUTY_NEW=$FAN_MIN

    # Only talk to the BMC when the duty actually has to change.
    if (( DUTY_NEW != DUTY_CURR )); then
      printf -v DUTYhex "0x%x" $DUTY_NEW  # the BMC wants the value in hex
      ipmi_set_duty 0x01 $DUTYhex
    fi
  }
  180.  
  181. #####################################################
  182. # All this happens only at the beginning
  183. # Initializing values, list of drives, print header
  184. #####################################################
  # Wait three minutes before touching anything.
  # NOTE(review): presumably this lets the host finish starting up
  # before the BMC and drives are queried -- confirm.
  sleep 180

  # Length of one control cycle in seconds (T is given in minutes).
  DRIVE_T=$( bc <<< "$T * 60" )

  # No error and no integral term have been recorded yet.
  ERRc=0
  I=0

  # From here on stdout and stderr go to the console and to a freshly
  # created log file.  Add '-a' to the tee command to append to an
  # existing log instead.
  exec > >(tee -i $LOG) 2>&1

  # Build the list of drives to monitor: everything camcontrol reports
  # except lines describing a virtual disk.  Edit the filter as needed,
  # or select on something unique to the wanted drives instead, for
  # instance only WDC drives:
  # if [[ $LINE != *"WDC"* ]] . . .
  DEVLIST1=$(/sbin/camcontrol devlist)
  DEVLIST=$( sed '/Virtual disk/d' <<< "$DEVLIST1" )
  DEVCOUNT=$( wc -l <<< "$DEVLIST" )

  # Set mode to 'Full' to avoid BMC changing duty cycle
  # Need to wait a tick or it doesn't get 2nd command
  # "echo -n ``" to avoid annoying newline generated in log
  ### Not using 'Full'
  ### echo -n `$IPMITOOL raw 0x30 0x45 1 1`; sleep 1

  # Begin at a 50% duty cycle; the control loop takes over from there.
  DUTY_DRIVE=50
  DUTY_NEW=50
  printf -v DUTYhex "0x%x" $DUTY_NEW
  ipmi_set_duty 0x01 $DUTYhex

  sleep 3  # let fans respond

  # Legend for the per-drive state markers, then the column header.
  printf "\nDrive states:  * spinning;  _ standby;  ? unknown\n"
  print_header
  220.  
  221. ############################################
  222. # Main: Loop through drives every T minutes
  223. ############################################
  # Runs forever: one pass over the drives, one log line, then sleep
  # for DRIVE_T seconds.
  while true; do
    # Print header every quarter day.  HM is the time of day as HHMM;
    # the 10# prefix makes the leading zeros decimal rather than octal.
    HM=$(date +%H%M)
    HM=$(( 10#$HM ))
    R=$(( HM % 600 ))  # 0 at 00:00, 06:00, 12:00 and 18:00
    if (( R < T )); then
      print_header
    fi

    DRIVES_check_adjust
    # Every value is quoted so that an empty one still fills its own
    # column instead of shifting all later values one column left.
    # The CPU column is not measured by this script and is logged as -1.
    printf "%6.2f %6.2f %5.2f %6.2f %3d %-6s %4d %-7s %2d/%-6d" \
      "$ERRc" "$P" "$I" "$D" -1 "$DRIVER" "${RPM:-0}" "$MODEt" \
      "$DUTY_CURR" "$DUTY_NEW"

    sleep "$DRIVE_T"
  done
  238.  
  239. # Adjusts fans based on drive or CPU temperatures, whichever
  240. # needs more cooling. Mean temp among spinning drives is maintained at a setpoint
  241. # using a PID algorithm.  CPU temp regulation uses just core 0
  242. # (they all stay within a few degrees of each other).  CPU temp
  243. # need not and cannot be maintained at a setpoint, so PID is not
  244. # used; instead fan duty cycle is simply increased with temp.
  245.  
  246. # Drives are checked and fans adjusted on a set interval, such as 6 minutes.
  247. # Logging is done at that point.  CPU temps can spike much faster,
  248. # so are checked at a shorter interval, such as 30 seconds.  Those
  249. # adjustments are not logged.
  250.  
  251. # Logs:
  252. #   - disk status (spinning or standby)
  253. #   - disk temperature (Celsius) if spinning
  254. #   - max and mean disk temperature
  255. #   - CPU 0 temperature
  256. #   - fan rpm and mode
  257. #   - current and new fan duty cycle
  258. #   - PID variables
  259. #   - adjustments to fan duty cycle due to interim CPU loops
  260.  
  261. # Includes disks on motherboard and on HBA.
  262.  
  263. #  Relation between percent duty cycle, hex value of that number,
  264. #  and RPMs for my fans.  RPM will vary among fans, is not
  265. #  precisely related to duty cycle, and does not matter to the script.
  266. #  It is merely reported.
  267. #
  268. #  Percent  Hex     RPM
  269. #  10         A     300
  270. #  20        14     400
  271. #  30        1E     500
  272. #  40        28     600/700
  273. #  50        32     800
  274. #  60        3C     900
  275. #  70        46     1000/1100
  276. #  80        50     1100/1200
  277. #  90        5A     1200/1300
  278. # 100        64     1300
  279.  
  280. # Some boards apparently report incorrect duty cycle.
  281. # If that is happening, disable the lines in function drive_data that read the duty cycle and convert it to decimal.
  282. # Then the script will assume the duty cycle is the
  283. # same as it was last set.
  284.  
  285. # Tuning suggestions
  286. # PID tuning advice on the internet generally does not work well in this application.
  287. # First run the script spincheck.sh and get familiar with your temperature and fan variations without any intervention.
  288. # Choose a setpoint that is an actual observed Tmean, given the number of drives you have.  It should be the Tmean associated with the Tmax that you want.  
  289. # Set Ki=0 and leave it there.  You probably will never need it.
  290. # Start with Kp low.  Use a value that results in a rounded correction=1 when error is the lowest value you observe other than 0  (i.e., when ERRc is minimal, Kp ~= 1 / ERRc)
  291. # Set Kd at about Kp*10
  292. # Get Tmean within ~0.3 degree of SP before starting script.
  293. # Start script and run for a few hours or so.  If Tmean oscillates (best to graph it), you probably need to reduce Kd.  If no oscillation but response is too slow, raise Kd.
  294. # Stop script and get Tmean at least 1 C off SP.  Restart.  If there is overshoot and it goes through some cycles, you may need to reduce Kd.
  295. # If you have problems, examine P and D in the log and see which is messing you up.  If all else fails you can try Ki. If you use Ki, make it small, ~ 0.1 or less.
  296.  
  297. # Uses joeschmuck's smartctl method for drive status (returns 0 if spinning, 2 in standby)
  298. # https://forums.freenas.org/index.php?threads/how-to-find-out-if-a-drive-is-spinning-down-properly.2068/#post-28451
  299. # Other method (camcontrol cmd -a) doesn't work with HBA
  300.  
  301. # Removed from drive_data.  Though it was working
  302. # it doesn't seem right to hexify PID ?????
  303. # PID=$( printf "0x%x" $PID )  # fully hexify with '0x' in front
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement