#!/usr/bin/perl
# External fencing agent that uses the NUT daemon to control an external UPS.
# See the comments below, and the various NUT man pages, for how this
# script works. It should work unchanged with most modern "smart" APC UPSes in
# a Redhat/Fedora/RHEL-style distribution with the nut package installed.
# Author: William Seligman <seligman@nevis.columbia.edu>
# License: GPLv2
# The Following Agent Has Been Tested With:
# pacemaker-1.1.6
# nut-2.4.3
# As you're designing your UPS and fencing set-up, consider that there may be
# three computers involved:
# 1) the machine running this fencing agent;
# 2) the machine being controlled by this agent;
# 3) the machine that can send commands to the UPS.
# On my cluster, all the UPSes have SNMP smartcards, so every host can communicate
# with every UPS; in other words, machines (1) and (3) are the same. If your UPSes
# are controlled via serial or USB connections, then you might have a
# situation in which host (2) is plugged into a UPS that has a serial connection
# to some master "power-control" computer, and can potentially be fenced
# by any other machine in your cluster.
# You'll probably need the nut daemon running on both the hosts (1) and
# (3). Strictly speaking, there's no reason for NUT to run on (2).
# From a practical standpoint you'll probably want NUT to be running on all the
# systems in your cluster.
# For this agent to work, the following conditions have to be met:
# - NUT has to be installed; on RHEL systems, this requires packages nut and
# nut-client.
# - The nut daemon (the ups or upsd service on RHEL) must be running on hosts
# (1) and (3). This agent does not start/stop the nut daemons for you.
# - The name of the UPS that affects host (2) has to be defined in ups.conf on
# host (3). The format for the --ups option is upsname[@controlhost[:port]]. The
# default controlhost is 'localhost'. If you use SNMP management cards, you want
# to make sure you issue commands to a community with read/write privileges; the
# default is the 'private' community. An example ups.conf:
# [myhost-ups]
# driver = snmp-ups
# port = myhost-ups.example.com
# community = private
# mibs=apcc
# - The --username and --password options to access the UPS must be defined in
# upsd.users on host (3), with the instcmds for poweron, poweroff, and reset allowed.
# An example upsd.users:
# [myuser]
# password = mypassword
# actions = SET
# instcmds = ALL
# - Host (1) must be allowed access via upsd.conf and upsd.users on host (3).
# On RHEL systems, these files are in /etc/ups. In nut-2.4 and greater, there's
# no per-host access restrictions, but you'll need to grant access in
# nut-2.2 or lower.
# - If you want to be able to unfence host (2) via stonith_admin, you might want
# to set its BIOS to boot up on AC power restore, as opposed to "last state" or "off."
# Otherwise the machine might not come back on even if the UPS restores power.
# This agent doesn't keep track of which host it controls. Use the
# Pacemaker parameters for that ("man stonithd"); e.g.,:
# primitive StonithMyHost stonith:fence_nut \
# op monitor interval="60" timeout="30" on-fail="stop" \
# params pcmk_host_list=myhost.example.com pcmk_host_check=static-list \
# ups=myhost-ups username=myuser password=mypassword \
# stonith-timeout="120s"
# Note the use of on-fail="stop". The main way this resource's monitor can fail
# is if we lose communication with the UPS. That's not great if it happens, but
# consider what happens if you allow the default on-fail="fence", especially in a
# two-node cluster; do you want host (1) to be fenced solely because it can
# no longer fence host (2)? If you have more than two nodes, on-fail="restart"
# is an alternative, but if someone's pulled the communications cable from the
# UPS then the resource will just shift from node to node. (Maybe there's no
# need to monitor this agent if there's no logical automated response if it fails.)
# More on this agent's options:
# The defaults will probably work with most APC UPS devices. They might work on others;
# 'upscmd -l ${ups}' and 'upsc ${ups}' will list the commands and variables, and you
# can change the values for --poweron, --poweroff, --reset, --statusvar, and --cycledelay
# to suit your UPS. Change --upscmd and --upsc if your NUT binaries are not in /usr/bin.
# If you want to use the "graceful" reboot/shutdown on host (2), described in the
# next two paragraphs, you'll probably want it running APC's PowerChute software
# instead of NUT.
# On most APC UPSes, --poweroff=load.off will cut the power to (2) immediately. The
# option --poweroff=shutdown.stayoff is risky, since the fenced host will continue
# running for ups.delay.shutdown seconds.
# By default, this agent reboots a system by telling the UPS to cut its load, wait
# "cycledelay" seconds, then restore power. If you provide a value for --reset, it will
# use that command instead of cycling power. Note that if you use '--reset=shutdown.return'
# a reboot/reset will wait for ups.delay.shutdown seconds, with power restored after
# ups.delay.start seconds, where ups.* are variables defined in the UPS. To change
# the value of UPS variables, see 'man upsrw'.
# On my rather old UPSes (circa 2002), if the load.off command is immediately followed
# by load.on, the latter command might be ignored. I found I had to wait at least
# 8 seconds for load.on to be accepted without problems. I've decided to be a bit
# cautious and use a default of cycledelay=20.
# I also found that there can be a delay between the time the UPS turns its
# load on/off and the time that fact is updated in its status variables; e.g.,
# the UPS immediately turns off its load in response to load.off, but ups.status
# doesn't show OFF until up to 20 seconds afterwards. Therefore I added the
# ondelay and offdelay options, which control how long to wait after issuing
# the corresponding poweron/poweroff command before checking the UPS status.
# If you want to completely ignore the UPS status after issuing a poweron or
# poweroff command, set the --noverifyonoff option. This is useful if you
# trust your UPS (and this agent!) to respond immediately to the command,
# and don't want to wait out the delay until the UPS status is updated.
# Note that large values of cycledelay, ondelay, and offdelay may cause this
# agent to take a long time to run. Be sure to set stonith-timeout to at
# least 3x the largest of these values.
# There's no explicit support for multiple-outlet UPSes/PDUs in this agent, because:
# a) I don't have any UPSes that let me control their individual outlets;
# b) NUT's documentation on how to use them is unclear to me;
# c) You probably want to use fence_apc or fence_apc_snmp instead.
# If commands/variables like these exist for your UPS, these options may work
# (I have not tested them!):
# --statusvar=outlet.3.status --upscmd=/usr/bin/upsrw \
# --poweroff="-s outlet.3.switch=0" --poweron="-s outlet.3.switch=1"
# ... assuming host (2) is plugged into outlet 3 of the UPS.
# Default locations of the NUT client binaries. The RHEL-type nut packages
# install both of them in /usr/bin.
my ( $RHELUPSCMD, $RHELUPSC ) = ( '/usr/bin/upscmd', '/usr/bin/upsc' );

# Defaults for APC smart UPSes.
#
# Status variable: holds a short string describing the state of $ups;
# OL = on-line, OB = on battery, LB = low battery, etc.
my $APCSTATUSVAR = 'ups.status';

# Power-on command: restores power on $ups, which will presumably turn
# host (2) back on.
my $APCPOWERON = 'load.on';

# Power-off command: turns off $hostname immediately by cutting the power
# on $ups.
my $APCPOWEROFF = 'load.off';

# Reset command: reboots host (2); see the description of the 'reset'
# parameter above. This is for documentation only at this point, since the
# user must supply the parameter explicitly to replace the default reboot
# procedure (cycling the load on the UPS).
my $APCRESET = 'shutdown.return';

# Name this script was invoked as; shown in the usage and metadata texts.
my $progname = $0;
# Print the command-line help text to standard output.
# Takes no arguments. The text interpolates the built-in defaults
# ($APCPOWERON, $APCPOWEROFF, $APCSTATUSVAR, $RHELUPSCMD, $RHELUPSC) and
# the script name ($progname).
sub usage
{
    my $help_text = <<EOF;
Usage:
fence_nut [options]
Options:
-o, --action=<action> Action: monitor, status, off, on, reset, metadata (default)
-s, --ups=<ups> UPS that controls power to hostname
-u, --username=<username> Username for accessing UPS
-p, --password=<password> Password for accessing UPS
-h, --help Print this message and exit
--poweron=<UPS 'on' cmd> NUT command to turn on UPS (default $APCPOWERON)
--poweroff=<UPS 'off' cmd> NUT command to turn off UPS (default $APCPOWEROFF)
--statusvar=<UPS status var> UPS status variable (default $APCSTATUSVAR)
--cycledelay=<integer> How long to wait between poweroff and poweron in a reboot
--ondelay=<integer> How long to wait for UPS to turn on during poweron
--offdelay=<integer> How long to wait for UPS to turn off during poweroff
--noverifyonoff If set, do not verify the result of poweron/poweroff
--reset=<UPS 'reset' cmd> NUT command to reset UPS (default is to cycle the load)
--upscmd=<path to upscmd> Default ${RHELUPSCMD}
--upsc=<path to upsc> Default ${RHELUPSC}
For more information:
fence_nut -o metadata
less $progname
EOF
    print $help_text;
}
use Sys::Syslog;
# Parse the options.
use Getopt::Long;

# Options and default values. These are file-scoped on purpose: the
# subroutines further down (do_nut, status_ups, monitor_ups) read them.
my $poweron = $APCPOWERON;
my $poweroff = $APCPOWEROFF;
my $cycledelay = 20;
my $ondelay = 20;
my $offdelay = 20;
my $noverifyonoff = 0;
my $statusvar = $APCSTATUSVAR;
my $upscmd = $RHELUPSCMD;
my $upsc = $RHELUPSC;
# I'm overly cautious, so the default action is "metadata" instead of
# "reboot" as it is in the other scripts in the fence-agents package.
my $option = "metadata";
my $ups = "";
my $username = "";
my $password = "";
my $reset = "";
my $help = "";
my $debug = 0;

GetOptions ( 'action|o=s' => \$option,
             'help|h' => \$help,
             'debug' => \$debug,
             'noverifyonoff' => \$noverifyonoff,
             'ups|s=s' => \$ups,
             'username|u=s' => \$username,
             'password|p=s' => \$password,
             'poweron=s' => \$poweron,
             'poweroff=s' => \$poweroff,
             'cycledelay=i' => \$cycledelay,
             'ondelay=i' => \$ondelay,
             'offdelay=i' => \$offdelay,
             'reset=s' => \$reset,
             'statusvar=s' => \$statusvar,
             'upscmd=s' => \$upscmd,
             'upsc=s' => \$upsc );

# In stonith-ng, the options can also come in via STDIN.
# They're in the form "param=value", where param is one of the
# parameters defined in the metadata section below; the exception
# is "action", which is supplied by stonith-ng as "option".
#
# Each accepted parameter name maps to the variable it sets. The values
# are assigned verbatim: they are NOT passed through a string eval, so
# characters such as ", $, @ and \ in a password are preserved and
# cannot be executed as Perl code. Parameters that are not listed here
# are silently ignored.
my %stdin_param = (
    option        => \$option,
    help          => \$help,
    debug         => \$debug,
    noverifyonoff => \$noverifyonoff,
    ups           => \$ups,
    username      => \$username,
    password      => \$password,
    poweron       => \$poweron,
    poweroff      => \$poweroff,
    cycledelay    => \$cycledelay,
    ondelay       => \$ondelay,
    offdelay      => \$offdelay,
    reset         => \$reset,
    statusvar     => \$statusvar,
    upscmd        => \$upscmd,
    upsc          => \$upsc,
);

while ( my $line = <STDIN> )
{
    # A tiny bit of syntax checking; trailing whitespace (including a
    # stray carriage return) is not part of the value.
    next unless $line =~ /^(\w+)\s*=\s*(.*\S)\s*$/;
    my ( $key, $value ) = ( $1, $2 );

    # Just in case a future version changes "option" back to "action"
    $key =~ s/^action/option/;

    my $target = $stdin_param{$key};
    ${$target} = $value if $target;
}

if ( $debug )
{
    # NOTE(review): this writes the UPS password to syslog in clear text.
    openlog($progname, "ndelay,pid", LOG_LOCAL0);
    syslog(LOG_INFO, "logger action=%s, ups=%s, username=%s, password=%s", $option, $ups, $username, $password);
    closelog;
}

if ( $help ) { usage(); exit 0; }
# Possible exit codes. These are file-scoped on purpose: the subroutines
# below (do_nut, status_ups) return them.
my $EC_SUCCESS = 0;
my $EC_ERROR = 1;
my $EC_OFF = 2;

# Parse the action option and carry it out. Every branch ends in exit.
#
# do_nut() returns a true value ($EC_ERROR) when the UPS command could
# not be issued. That result is honoured wherever nothing else would
# catch the failure:
#   - with noverifyonoff set, it is the only evidence we have;
#   - in a reboot, a failed poweroff/reset means the power was never
#     cut, and the "is it on?" check afterwards would pass anyway.
if ( $option =~ /^(?:on|poweron)/ )
{
    my $cmd_rc = do_nut($poweron);
    exit( $cmd_rc ? $EC_ERROR : $EC_SUCCESS ) if $noverifyonoff;

    # Did we actually turn on the load to the UPS? Wait
    # a little while, then check. Anything other than "on"
    # (off, or a weird state) is an error.
    sleep $ondelay;
    exit( status_ups() == $EC_SUCCESS ? $EC_SUCCESS : $EC_ERROR );
}
elsif ( $option =~ /^(?:off|poweroff)/ )
{
    my $cmd_rc = do_nut($poweroff);
    exit( $cmd_rc ? $EC_ERROR : $EC_SUCCESS ) if $noverifyonoff;

    # Did we actually turn off the load to the UPS? Wait
    # a little while, then check. Anything other than "off"
    # (on, or a weird state) is an error.
    sleep $offdelay;
    exit( status_ups() == $EC_OFF ? $EC_SUCCESS : $EC_ERROR );
}
elsif ( $option =~ /^(?:reboot|reset)/ )
{
    # By default, cycle the power via the UPS. If the
    # user has supplied a command via the reset parameter,
    # use that command instead.
    my $cmd_rc;
    if ( $reset )
    {
        $cmd_rc = do_nut($reset);
        # The reset was never issued; do not report a successful fence.
        exit $EC_ERROR if $cmd_rc;
    }
    else
    {
        # Turn the power off, wait a little while, then turn it on.
        # If the power could not be cut, the host was not fenced.
        exit $EC_ERROR if do_nut($poweroff);
        sleep $cycledelay;
        $cmd_rc = do_nut($poweron);
    }
    exit( $cmd_rc ? $EC_ERROR : $EC_SUCCESS ) if $noverifyonoff;

    # Did we actually turn on the load to the UPS? Wait
    # a little while, then check.
    sleep $ondelay;
    exit( status_ups() == $EC_SUCCESS ? $EC_SUCCESS : $EC_ERROR );
}
elsif ( $option =~ /^status/ )
{
    # The difference between "monitor" and "status"
    # is that status will display a message, and adjust
    # the return code if the UPS is off.
    my $code = status_ups();
    my $status = "ON";
    if ( $code == $EC_OFF ) { $status = "OFF"; }
    # Do not claim the load is on when the status could not be read.
    elsif ( $code != $EC_SUCCESS ) { $status = "UNKNOWN"; }
    print "Status: $status\n";
    exit $code;
}
elsif ( $option =~ /^monitor/ )
{
    # The difference between "monitor" and "status"
    # is that monitor will simply return 0 if the UPS and NUT
    # are working.
    my ($rc,$ignore) = monitor_ups();
    if ( $rc != 0 ) { exit $EC_ERROR; }
    exit $EC_SUCCESS;
}
elsif ( $option =~ /^metadata/ )
{
    # Note: '<' and '>' inside XML attribute values must be written as
    # &lt; and &gt; for the document to be well-formed. noverifyonoff and
    # debug are plain flags on the command line, so they take no value.
    print <<EOF;
<?xml version="1.0" ?>
<resource-agent name="fence_nut" shortdesc="Fence agent for UPSes controlled by NUT" >
<longdesc lang="en">
fence_nut is a Fencing Agent that controls an external UPS via NUT (Network UPS Tools).
Example:
crm configure primitive StonithMyHost stonith:fence_nut \
params pcmk_host_list=myhost.example.com pcmk_host_check=static-list \
op monitor interval="60" timeout="30" on-fail="stop" \
ups=myhost-ups username=myuser password=mypassword
See the comments in $progname for more details and advice.
</longdesc>
<vendor-url>http://www.networkupstools.org/</vendor-url>
<parameters>
<parameter name="action">
<getopt mixed="-o, --action=&lt;action&gt;" />
<content type="string" default="metadata" />
<shortdesc lang="en">Fencing Action</shortdesc>
</parameter>
<parameter name="ups" required="1">
<getopt mixed="-s, --ups=&lt;ups&gt;" />
<content type="string" />
<shortdesc lang="en">UPS name</shortdesc>
</parameter>
<parameter name="username" required="1">
<getopt mixed="-u, --username=&lt;username&gt;" />
<content type="string" />
<shortdesc lang="en">Username</shortdesc>
</parameter>
<parameter name="password" required="1">
<getopt mixed="-p, --password=&lt;password&gt;" />
<content type="string" />
<shortdesc lang="en">Password</shortdesc>
</parameter>
<parameter name="poweron">
<getopt mixed="--poweron=&lt;power-on command&gt;" />
<content type="string" default="$APCPOWERON" />
<shortdesc lang="en">UPS Power On command</shortdesc>
</parameter>
<parameter name="poweroff">
<getopt mixed="--poweroff=&lt;power-off command&gt;" />
<content type="string" default="$APCPOWEROFF" />
<shortdesc lang="en">UPS Power Off command</shortdesc>
</parameter>
<parameter name="cycledelay">
<getopt mixed="--cycledelay=&lt;integer&gt;" />
<content type="integer" default="20" />
<shortdesc lang="en">How long to wait between poweroff and poweron in a reboot</shortdesc>
</parameter>
<parameter name="ondelay">
<getopt mixed="--ondelay=&lt;integer&gt;" />
<content type="integer" default="20" />
<shortdesc lang="en">How long to wait for UPS to turn on during poweron</shortdesc>
</parameter>
<parameter name="offdelay">
<getopt mixed="--offdelay=&lt;integer&gt;" />
<content type="integer" default="20" />
<shortdesc lang="en">How long to wait for UPS to turn off during poweroff</shortdesc>
</parameter>
<parameter name="noverifyonoff">
<getopt mixed="--noverifyonoff" />
<content type="integer" default="0" />
<shortdesc lang="en">If set, do not verify the result of poweron/poweroff</shortdesc>
</parameter>
<parameter name="statusvar">
<getopt mixed="--statusvar=&lt;status variable&gt;" />
<content type="string" default="$APCSTATUSVAR" />
<shortdesc lang="en">UPS Status variable</shortdesc>
</parameter>
<parameter name="reset">
<getopt mixed="--reset=&lt;alternate reset command&gt;" />
<content type="string" default="" />
<shortdesc lang="en">UPS alternate reset command</shortdesc>
</parameter>
<parameter name="upscmd">
<getopt mixed="--upscmd=&lt;upscmd path&gt;" />
<content type="string" default="$RHELUPSCMD" />
<shortdesc lang="en">path to upscmd binary</shortdesc>
</parameter>
<parameter name="upsc">
<getopt mixed="--upsc=&lt;upsc path&gt;" />
<content type="string" default="$RHELUPSC" />
<shortdesc lang="en">path to upsc binary</shortdesc>
</parameter>
<parameter name="debug">
<getopt mixed="--debug" />
<content type="integer" default="0" />
<shortdesc lang="en">enable some minor debugging info in log</shortdesc>
</parameter>
</parameters>
<actions>
<action name="on" />
<action name="off" />
<action name="reboot" />
<action name="status" />
<action name="monitor" />
<action name="metadata" />
<action name="start" />
<action name="stop" />
</actions>
</resource-agent>
EOF
    exit 0;
}
elsif ( $option =~ /^(?:start|stop)/ )
{
    # Do nothing; there's nothing to start or stop
    # in this agent. (These actions are implemented solely
    # to prevent warning messages from crm_verify.)
    exit 0;
}
else
{
    print "fence_nut: invalid option '$option'\n";
    usage();
    exit $EC_ERROR;
}
1;
# Send a command to the UPS via NUT.
#
# Argument:
#   $command - the NUT instant command (e.g. "load.off") to pass to the
#              program named by $upscmd. It is appended to the command
#              line as-is, so it may contain several words.
#
# Returns $EC_SUCCESS if the command ran and exited with status 0, and
# $EC_ERROR if the $upscmd binary is not executable, if the username,
# password or UPS name is missing, or if the command failed.
sub do_nut
{
    my ($command) = @_;

    if ( ! -x $upscmd )
    {
        print "fence_nut: Can't find executable ${upscmd}\n";
        return $EC_ERROR;
    }

    # Use the high-precedence '!' here: 'not' binds more loosely than
    # '||', so "not $a || not $b" would test only the first operand.
    if ( !$username || !$password || !$ups )
    {
        print "fence_nut: username, password or ups name missing; check configuration\n";
        usage();
        return $EC_ERROR;
    }

    # Execute the command given in the argument.
    # NOTE(review): the command line is run through the shell, so shell
    # metacharacters in the username, password or UPS name are interpreted
    # by it. Kept this way because multi-word values for $command (such as
    # the upsrw example in the header comments) rely on shell word splitting.
    my $cmd = "$upscmd -u $username -p $password $ups $command";
    print "cmd=$cmd\n" if $debug;
    my $result = `$cmd`;
    if ( $? != 0 )
    {
        print "fence_nut: error executing '$cmd': $result\n";
        return $EC_ERROR;
    }

    return $EC_SUCCESS;
}
# Report whether the UPS load is on or off.
#
# The difference between monitor_ups and status_ups is that this routine
# translates the text returned by the UPS into a return code:
#   $EC_ERROR   - monitor_ups could not query the UPS;
#   $EC_OFF     - the status text contains "off" (any case) or a "0";
#   $EC_SUCCESS - anything else, i.e. the load is taken to be on.
sub status_ups
{
    my ( $query_rc, $status_text ) = monitor_ups();

    if ( $query_rc != 0 )
    {
        return $EC_ERROR;
    }

    return ( $status_text =~ /(off|0)/i ) ? $EC_OFF : $EC_SUCCESS;
}
# Query the UPS status variable through the program named by $upsc.
#
# Returns a two-element list ($code, $text):
#   $code is 0 on success, 1 if the $upsc binary is not executable, or
#         the non-zero value of $? if the status command failed;
#   $text is the output of the status command on success, and the empty
#         string otherwise.
sub monitor_ups
{
    unless ( -x $upsc )
    {
        print "fence_nut: Can't find executable $upsc\n";
        return ( 1, "" );
    }

    # Build the command that fetches the UPS status.
    my $query = join( " ", $upsc, $ups, $statusvar );
    print "cmd=$query\n" if $debug;

    my $output = `$query`;

    # A zero exit status means the UPS answered; whether it is on or off,
    # the device is working.
    return ( 0, $output ) if $? == 0;

    print "fence_nut: error executing '$query': $output\n";
    return ( $?, "" );
}