#!/usr/bin/perl

# External fencing agent that uses the NUT daemon to control an external UPS.
# See the comments below, and the various NUT man pages, for how this
# script works. It should work unchanged with most modern "smart" APC UPSes in
# a Redhat/Fedora/RHEL-style distribution with the nut package installed.

# Author: William Seligman
# License: GPLv2

# The Following Agent Has Been Tested With:
# pacemaker-1.1.6
# nut-2.4.3

# As you're designing your UPS and fencing set-up, consider that there may be
# three computers involved:
# 1) the machine running this fencing agent;
# 2) the machine being controlled by this agent;
# 3) the machine that can send commands to the UPS.

# On my cluster, all the UPSes have SNMP smartcards, so every host can communicate
# with every UPS; in other words, machines (1) and (3) are the same. If your UPSes
# are controlled via serial or USB connections, then you might have a
# situation in which host (2) is plugged into a UPS that has a serial connection
# to some master "power-control" computer, and can potentially be fenced
# by any other machine in your cluster.

# You'll probably need the nut daemon running on both the hosts (1) and
# (3). Strictly speaking, there's no reason for NUT to run on (2).
# From a practical standpoint you'll probably want NUT to be running on all the
# systems in your cluster.

# For this agent to work, the following conditions have to be met:

# - NUT has to be installed; on RHEL systems, this requires packages nut and
# nut-client.

# - The nut daemon (the ups or upsd service on RHEL) must be running on hosts
# (1) and (3). This agent does not start/stop the nut daemons for you.

# - The name of the UPS that affects host (2) has to be defined in ups.conf on
# host (3). The format for the --ups option is upsname[@controlhost[:port]]. The
# default controlhost is 'localhost'.
# If you use SNMP management cards, you want
# to make sure you issue commands to a community with read/write privileges; the
# default is the 'private' community. An example ups.conf:
#   [myhost-ups]
#       driver = snmp-ups
#       port = myhost-ups.example.com
#       community = private
#       mibs=apcc

# - The --username and --password options to access the UPS must be defined in
# upsd.users on host (3), with the instcmds for poweron, poweroff, and reset allowed.
# An example upsd.users:
#   [myuser]
#       password = mypassword
#       actions = SET
#       instcmds = ALL

# - Host (1) must be allowed access via upsd.conf and upsd.users on host (3).
# On RHEL systems, these files are in /etc/ups. In nut-2.4 and greater, there are
# no per-host access restrictions, but you'll need to grant access in
# nut-2.2 or lower.

# - If you want to be able to unfence host (2) via stonith_admin, you might want
# to set its BIOS to boot up on AC power restore, as opposed to "last state" or "off."
# Otherwise the machine might not come back on even if the UPS restores power.

# This agent doesn't keep track of which host it controls. Use the
# Pacemaker parameters for that ("man stonithd"); e.g.:
#   primitive StonithMyHost stonith:fence_nut \
#       op monitor interval="60" timeout="30" on-fail="stop" \
#       params pcmk_host_list=myhost.example.com pcmk_host_check=static-list \
#       ups=myhost-ups username=myuser password=mypassword \
#       stonith-timeout="120s"

# Note the use of on-fail="stop". The main way this resource's monitor can fail
# is if we lose communication with the UPS. That's not great if it happens, but
# consider what happens if you allow the default on-fail="fence", especially in a
# two-node cluster; do you want host (1) to be fenced solely because it can
# no longer fence host (2)? If you have more than two nodes, on-fail="restart"
# is an alternative, but if someone's pulled the communications cable from the
# UPS then the resource will just shift from node to node.
# (Maybe there's no
# need to monitor this agent if there's no logical automated response if it fails.)

# More on this agent's options:

# The defaults will probably work with most APC UPS devices. They might work on others;
# 'upscmd -l ${ups}' and 'upsc ${ups}' will list the commands and variables, and you
# can change the values for --poweron, --poweroff, --reset, --statusvar, and --cycledelay
# to suit your UPS. Change --upscmd and --upsc if your NUT binaries are not in /usr/bin.

# If you want to use the "graceful" reboot/shutdown on host (2), described in the
# next two paragraphs, you'll probably want it running APC's PowerChute software
# instead of NUT.

# On most APC UPSes, --poweroff=load.off will cut the power to (2) immediately. The
# option --poweroff=shutdown.stayoff is risky, since the fenced host will continue
# running for ups.delay.shutdown seconds.

# By default, this agent reboots a system by telling the UPS to cut its load, wait
# "cycledelay" seconds, then restore power. If you provide a value for --reset, it will
# use that command instead of cycling power. Note that if you use '--reset=shutdown.return'
# a reboot/reset will wait for ups.delay.shutdown seconds, with power restored after
# ups.delay.start seconds, where ups.* are variables defined in the UPS. To change
# the value of UPS variables, see 'man upsrw'.

# On my rather old UPSes (circa 2002), if the load.off command is immediately followed
# by load.on, the latter command might be ignored. I found I had to wait at least
# 8 seconds for load.on to be accepted without problems. I've decided to be a bit
# cautious and use a default of cycledelay=20.

# I also found that there can be a delay between the time the UPS turns its
# load on/off and the time that fact is updated in its status variables; e.g.,
# the UPS immediately turns off its load in response to load.off, but ups.status
# doesn't show OFF until up to 20 seconds afterwards.
# Therefore I added the
# ondelay and offdelay options, which control how long to wait after issuing
# the corresponding poweron/poweroff command before checking the UPS status.

# If you want to completely ignore the UPS status after issuing a poweron or
# poweroff command, set the --noverifyonoff option. This is useful if you
# trust your UPS (and this agent!) to respond immediately to the command,
# and don't want to wait out the delay until the UPS status is updated.

# Note that large values of cycledelay, ondelay, and offdelay may cause this
# agent to take a long time to run. Be sure to set stonith-timeout to at
# least 3x the largest of these values.

# There's no explicit support for multiple-outlet UPSes/PDUs in this agent, because:
# a) I don't have any UPSes that let me control their individual outlets;
# b) NUT's documentation on how to use them is unclear to me;
# c) You probably want to use fence_apc or fence_apc_snmp instead.
# If commands/variables like these exist for your UPS, these options may work
# (I have not tested them!):
#   --statusvar=outlet.3.status --upscmd=/usr/bin/upsrw \
#   --poweroff="-s outlet.3.switch=0" --poweron="-s outlet.3.switch=1"
# ... assuming host (2) is plugged into outlet 3 of the UPS.

use strict;
use warnings;

# The NUT command default locations. In the RHEL-type nut packages, these binaries
# are in /usr/bin.
my $RHELUPSCMD = "/usr/bin/upscmd";
my $RHELUPSC   = "/usr/bin/upsc";

# Defaults for APC smart UPSes:

# Poweroff = turn off $hostname immediately by cutting the power on $ups.
my $APCPOWEROFF = "load.off";
# Poweron = turn on the power to $ups, which will presumably turn on host (2).
my $APCPOWERON = "load.on";
# Status = returns a short string with the $ups status; OL = on-line,
# OB = on battery, LB = low battery, etc.
my $APCSTATUSVAR = "ups.status";
# Reset = reboot host (2). See the description of the 'reset' parameter above.
# For documentation only at this point, since the user must supply this parameter if
# they want to change the default reboot procedure (cycle the load on the UPS).
my $APCRESET = "shutdown.return";

my $progname = $0;

# Possible exit codes. Declared before the subs so that every sub can see them.
my $EC_SUCCESS = 0;
my $EC_ERROR   = 1;
my $EC_OFF     = 2;

# Print the command-line help text to STDOUT.
sub usage {
    print <<EOF;
Usage:

${progname} [options]

Options:
  -o, --action=<action>      Action: monitor, status, off, on, reset, metadata (default)
  -s, --ups=<upsname>        UPS that controls power to hostname
  -u, --username=<name>      Username for accessing UPS
  -p, --password=<password>  Password for accessing UPS
  -h, --help                 Print this message and exit
  --poweron=<cmd>            NUT command to turn on UPS (default $APCPOWERON)
  --poweroff=<cmd>           NUT command to turn off UPS (default $APCPOWEROFF)
  --statusvar=<var>          UPS status variable (default $APCSTATUSVAR)
  --cycledelay=<seconds>     How long to wait between poweroff and poweron in a reboot
  --ondelay=<seconds>        How long to wait for UPS to turn on during poweron
  --offdelay=<seconds>       How long to wait for UPS to turn off during poweroff
  --noverifyonoff            If set, do not verify the result of poweron/poweroff
  --reset=<cmd>              NUT command to reset UPS (default is to cycle the load)
  --upscmd=<path>            Default ${RHELUPSCMD}
  --upsc=<path>              Default ${RHELUPSC}

For more information:
  fence_nut -o metadata
  less ${progname}
EOF
    return;
}

use Sys::Syslog;

# Parse the options.
use Getopt::Long;

# Options and default values
my $poweron       = $APCPOWERON;
my $poweroff      = $APCPOWEROFF;
my $cycledelay    = 20;
my $ondelay       = 20;
my $offdelay      = 20;
my $noverifyonoff = 0;
my $statusvar     = $APCSTATUSVAR;
my $upscmd        = $RHELUPSCMD;
my $upsc          = $RHELUPSC;

# I'm overly cautious, so the default action is "metadata" instead of
# "reboot" as it is in the other scripts in the fence-agents package.
my $option   = "metadata";
my $ups      = "";
my $username = "";
my $password = "";
my $reset    = "";
my $help     = "";
my $debug    = 0;

# A mistyped option must not silently fall through to some default action
# in a fencing agent, so a parse failure is fatal.
GetOptions(
    'action|o=s'    => \$option,
    'help|h'        => \$help,
    'debug'         => \$debug,
    'noverifyonoff' => \$noverifyonoff,
    'ups|s=s'       => \$ups,
    'username|u=s'  => \$username,
    'password|p=s'  => \$password,
    'poweron=s'     => \$poweron,
    'poweroff=s'    => \$poweroff,
    'cycledelay=i'  => \$cycledelay,
    'ondelay=i'     => \$ondelay,
    'offdelay=i'    => \$offdelay,
    'reset=s'       => \$reset,
    'statusvar=s'   => \$statusvar,
    'upscmd=s'      => \$upscmd,
    'upsc=s'        => \$upsc,
) or do { usage(); exit $EC_ERROR; };

# In stonith-ng, the options can also come in via STDIN.
# They're in the form "param=value", where param is one of the
# parameters defined in the metadata section below; the exception
# is "action", which is supplied by stonith-ng as "option".
#
# Only the parameters listed here may be set from STDIN. The input is
# assigned through references and is never evaluated as Perl code, so a
# value containing quotes, '$' or '@' is stored verbatim. Unknown keys
# (e.g. extra parameters the cluster passes along) are ignored.
my %stdin_param = (
    option        => \$option,
    ups           => \$ups,
    username      => \$username,
    password      => \$password,
    poweron       => \$poweron,
    poweroff      => \$poweroff,
    cycledelay    => \$cycledelay,
    ondelay       => \$ondelay,
    offdelay      => \$offdelay,
    noverifyonoff => \$noverifyonoff,
    reset         => \$reset,
    statusvar     => \$statusvar,
    upscmd        => \$upscmd,
    upsc          => \$upsc,
    debug         => \$debug,
    help          => \$help,
);
my %is_flag = map { $_ => 1 } qw(noverifyonoff debug help);

# Don't block waiting for EOF when run by hand from a terminal.
if ( !-t STDIN ) {
    while ( defined( my $line = <STDIN> ) ) {

        # A tiny bit of syntax checking; trailing whitespace is dropped.
        next unless $line =~ /^(\w+)\s*=\s*(.+?)\s*$/;
        my ( $key, $value ) = ( $1, $2 );

        # Just in case a future version changes "option" back to "action"
        $key =~ s/^action/option/;

        my $target = $stdin_param{$key} or next;

        # "debug=false" and friends must not switch the flag on.
        if ( $is_flag{$key} ) {
            $value = ( $value =~ /^(?:0|no|off|false)$/i ) ? 0 : 1;
        }
        ${$target} = $value;
    }
}

if ($debug) {

    # Facility and priority are given as strings: Sys::Syslog does not
    # export the LOG_* macros by default. The password itself is never logged.
    openlog( $progname, "ndelay,pid", "local0" );
    syslog( "info", "logger action=%s, ups=%s, username=%s, password=%s",
        $option, $ups, $username, ( $password ne "" ? "(set)" : "(unset)" ) );
    closelog();
}

if ($help) {
    usage();
    exit 0;
}

# Parse the action option. Every branch ends in exit().
if ( $option =~ /^(?:on|poweron)/ ) {

    # If the command could not even be sent, report failure right away.
    exit $EC_ERROR if do_nut($poweron) != $EC_SUCCESS;
    exit $EC_SUCCESS if $noverifyonoff;

    # Did we actually turn on the load to the UPS? Wait
    # a little while, then check.
    exit verify_state( $ondelay, $EC_SUCCESS );
}
elsif ( $option =~ /^(?:off|poweroff)/ ) {
    exit $EC_ERROR if do_nut($poweroff) != $EC_SUCCESS;
    exit $EC_SUCCESS if $noverifyonoff;

    # Did we actually turn off the load to the UPS? Wait
    # a little while, then check.
    exit verify_state( $offdelay, $EC_OFF );
}
elsif ( $option =~ /^(?:reboot|reset)/ ) {

    # By default, cycle the power via the UPS. If the
    # user has supplied a command via the reset parameter,
    # use that command instead.
    if ($reset) {
        exit $EC_ERROR if do_nut($reset) != $EC_SUCCESS;
    }
    else {
        # Turn the power off, wait a little while, then turn it on.
        # A failed poweroff means the host was never fenced, so stop there.
        exit $EC_ERROR if do_nut($poweroff) != $EC_SUCCESS;
        sleep $cycledelay;
        exit $EC_ERROR if do_nut($poweron) != $EC_SUCCESS;
    }
    exit $EC_SUCCESS if $noverifyonoff;

    # Did we actually turn on the load to the UPS? Wait
    # a little while, then check.
    exit verify_state( $ondelay, $EC_SUCCESS );
}
elsif ( $option =~ /^status/ ) {

    # The difference between "monitor" and "status"
    # is that status will display a message, and adjust
    # the return code if the UPS is off.
    my $code = status_ups();
    my $status =
        $code == $EC_SUCCESS ? "ON"
      : $code == $EC_OFF     ? "OFF"
      :                        "UNKNOWN";
    print "Status: $status\n";
    exit $code;
}
elsif ( $option =~ /^monitor/ ) {

    # The difference between "monitor" and "status"
    # is that monitor will simply return 0 if the UPS and NUT
    # are working.
    my ( $rc, $ignore ) = monitor_ups();
    exit( $rc != 0 ? $EC_ERROR : $EC_SUCCESS );
}
elsif ( $option =~ /^metadata/ ) {
    print_metadata();
    exit 0;
}
elsif ( $option =~ /^(?:start|stop)/ ) {

    # Do nothing; there's nothing to start or stop
    # in this agent. (These actions are implemented solely
    # to prevent warning messages from crm_verify.)
    exit 0;
}
else {
    print "fence_nut: invalid option '$option'\n";
    usage();
    exit $EC_ERROR;
}

# Print the agent's XML metadata to STDOUT.
# NOTE(review): the XML markup was missing from the copy of the source this
# was rebuilt from; element layout and attribute values follow the usual
# fence-agent metadata shape and should be confirmed against a pristine copy.
sub print_metadata {

    # name, required, getopt, type, default (undef = none), short description
    my @params = (
        [ 'action',   1, '-o, --action=[action]',   'string', 'metadata', 'Fencing Action' ],
        [ 'ups',      1, '-s, --ups=[upsname]',     'string', undef,      'UPS name' ],
        [ 'username', 1, '-u, --username=[name]',   'string', undef,      'Username' ],
        [ 'password', 1, '-p, --password=[password]', 'string', undef,    'Password' ],
        [ 'poweron',  0, '--poweron=[cmd]',  'string', $APCPOWERON,  'UPS Power On command' ],
        [ 'poweroff', 0, '--poweroff=[cmd]', 'string', $APCPOWEROFF, 'UPS Power Off command' ],
        [ 'cycledelay', 0, '--cycledelay=[seconds]', 'string', 20,
            'How long to wait between poweroff and poweron in a reboot' ],
        [ 'ondelay', 0, '--ondelay=[seconds]', 'string', 20,
            'How long to wait for UPS to turn on during poweron' ],
        [ 'offdelay', 0, '--offdelay=[seconds]', 'string', 20,
            'How long to wait for UPS to turn off during poweroff' ],
        [ 'noverifyonoff', 0, '--noverifyonoff', 'boolean', undef,
            'If set, do not verify the result of poweron/poweroff' ],
        [ 'statusvar', 0, '--statusvar=[var]', 'string', $APCSTATUSVAR, 'UPS Status variable' ],
        [ 'reset',  0, '--reset=[cmd]',   'string', undef,       'UPS alternate reset command' ],
        [ 'upscmd', 0, '--upscmd=[path]', 'string', $RHELUPSCMD, 'path to upscmd binary' ],
        [ 'upsc',   0, '--upsc=[path]',   'string', $RHELUPSC,   'path to upsc binary' ],
        [ 'debug',  0, '--debug', 'boolean', undef, 'enable some minor debugging info in log' ],
    );

    print <<EOF;
<?xml version="1.0" ?>
<resource-agent name="fence_nut" shortdesc="Fence agent for a UPS controlled via NUT">
<longdesc>
fence_nut is a Fencing Agent that controls an external UPS via NUT (Network UPS Tools).

Example:

crm configure primitive StonithMyHost stonith:fence_nut \\
    params pcmk_host_list=myhost.example.com pcmk_host_check=static-list \\
    op monitor interval="60" timeout="30" on-fail="stop" \\
    ups=myhost-ups username=myuser password=mypassword

See the comments in ${progname} for more details and advice.
</longdesc>
<vendor-url>http://www.networkupstools.org/</vendor-url>
<parameters>
EOF

    for my $param (@params) {
        my ( $name, $required, $getopt, $type, $default, $desc ) = @{$param};
        my $default_attr = defined $default ? qq{ default="$default"} : "";
        print <<EOF;
    <parameter name="$name" unique="0" required="$required">
        <getopt mixed="$getopt" />
        <content type="$type"$default_attr />
        <shortdesc lang="en">$desc</shortdesc>
    </parameter>
EOF
    }

    print <<EOF;
</parameters>
<actions>
    <action name="on" />
    <action name="off" />
    <action name="reboot" />
    <action name="monitor" />
    <action name="status" />
    <action name="metadata" />
</actions>
</resource-agent>
EOF
    return;
}

# Wait $delay seconds, then compare the UPS state with $expected
# ($EC_SUCCESS = load on, $EC_OFF = load off).
# Returns $EC_SUCCESS on a match and $EC_ERROR otherwise.
sub verify_state {
    my ( $delay, $expected ) = @_;
    sleep $delay;
    return status_ups() == $expected ? $EC_SUCCESS : $EC_ERROR;
}

# Run an external command without going through the shell, so that
# credentials containing shell metacharacters are passed through intact.
#   $display - printable form of the command (secrets already masked)
#   @argv    - program followed by its arguments
# Returns ($status, $output): $status is 0 on success, non-zero otherwise;
# $output is whatever the command wrote to STDOUT.
sub run_nut {
    my ( $display, @argv ) = @_;
    print "cmd=$display\n" if $debug;

    my $pid = open( my $pipe, '-|', @argv );
    return ( -1, "cannot run $argv[0]: $!" ) unless $pid;

    my $output = do { local $/; <$pipe> };
    $output = "" unless defined $output;

    # close() on a pipe waits for the child and sets $?.
    my $status = close($pipe) ? 0 : ( $? || -1 );
    return ( $status, $output );
}

# Send a command to the UPS via NUT.
#   $command - NUT instant command; may carry extra whitespace-separated
#              arguments (e.g. "-s outlet.3.switch=0" when --upscmd is upsrw).
# Returns $EC_SUCCESS if the command ran and exited 0, else $EC_ERROR.
sub do_nut {
    my ($command) = @_;
    if ( !-x $upscmd ) {
        print "fence_nut: Can't find executable ${upscmd}\n";
        return $EC_ERROR;
    }

    # All three are mandatory. High-precedence '!' is used on purpose:
    # 'not' binds looser than '||' and would negate the whole expression.
    if ( !$username || !$password || !$ups ) {
        print "fence_nut: username, password or ups name missing; check configuration\n";
        usage();
        return $EC_ERROR;
    }

    # Execute the command given in the argument.
    my @extra   = split ' ', $command;
    my $display = "$upscmd -u $username -p ******** $ups $command";
    my ( $status, $result ) =
      run_nut( $display, $upscmd, '-u', $username, '-p', $password, $ups, @extra );
    if ( $status != 0 ) {
        print "fence_nut: error executing '$display': $result\n";
        return $EC_ERROR;
    }
    return $EC_SUCCESS;
}

# The difference between "monitor_ups" and
# "status_ups" is that in this routine we
# adjust the return code depending on the
# contents of the message returned by the
# UPS.
# Returns $EC_ERROR if the UPS could not be queried, $EC_OFF if the status
# text contains "off" (any case) or the digit 0, and $EC_SUCCESS otherwise.
sub status_ups {
    my ( $rc, $result ) = monitor_ups();
    return $EC_ERROR if $rc != 0;
    return $result =~ /(?:off|0)/i ? $EC_OFF : $EC_SUCCESS;
}

# Note that we return an array: ($code,$text);
# $code is the return code from executing commands,
# $text is the output returned from the status command.
sub monitor_ups {
    if ( !-x $upsc ) {
        print "fence_nut: Can't find executable $upsc\n";
        return ( 1, "" );
    }

    # Define the command to fetch the UPS status.
    my $display = "${upsc} ${ups} ${statusvar}";
    my ( $status, $result ) = run_nut( $display, $upsc, $ups, $statusvar );
    if ( $status != 0 ) {
        print "fence_nut: error executing '$display': $result\n";
        return ( $status, "" );
    }

    # At this point the UPS is on or off... but either way the
    # device is working.
    return ( 0, $result );
}