#!/usr/bin/perl

# External fencing agent that uses the NUT daemon to control an external UPS.
# See the comments below, and the various NUT man pages, for how this
# script works. It should work unchanged with most modern "smart" APC UPSes in
# a Redhat/Fedora/RHEL-style distribution with the nut package installed.

# Author: William Seligman <seligman@nevis.columbia.edu>
# License: GPLv2

# The Following Agent Has Been Tested With:
# pacemaker-1.1.6
# nut-2.4.3

# As you're designing your UPS and fencing set-up, consider that there may be 
# three computers involved:
#   1) the machine running this fencing agent;
#   2) the machine being controlled by this agent;
#   3) the machine that can send commands to the UPS.

# On my cluster, all the UPSes have SNMP smartcards, so every host can communicate
# with every UPS; in other words, machines (1) and (3) are the same. If your UPSes 
# are controlled via serial or USB connections, then you might have a
# situation in which host (2) is plugged into a UPS that has a serial connection
# to some master "power-control" computer, and can potentially be fenced
# by any other machine in your cluster.

# You'll probably need the nut daemon running on both the hosts (1) and
# (3). Strictly speaking, there's no reason for NUT to run on (2).
# From a practical standpoint you'll probably want NUT to be running on all the
# systems in your cluster. 

# For this agent to work, the following conditions have to be met:

# - NUT has to be installed; on RHEL systems, this requires packages nut and 
# nut-client.

# - The nut daemon (the ups or upsd service on RHEL) must be running on hosts 
# (1) and (3). This agent does not start/stop the nut daemons for you.

# - The name of the UPS that affects host (2) has to be defined in ups.conf on 
# host (3). The format for the --ups option is upsname[@controlhost[:port]]. The
# default controlhost is 'localhost'. If you use SNMP management cards, you want 
# to make sure you issue commands to a community with read/write privileges; the 
# default is the 'private' community. An example ups.conf:

# [myhost-ups]
#         driver = snmp-ups
#         port = myhost-ups.example.com
#         community = private
#         mibs=apcc

# - The --username and --password options to access the UPS must be defined in
# upsd.users on host (3), with the instcmds for poweron, poweroff, and reset allowed. 
# An example upsd.users:

# [myuser]
#        password = mypassword
#        actions = SET
#        instcmds = ALL

# - Host (1) must be allowed access via upsd.conf and upsd.users on host (3). 
# On RHEL systems, these files are in /etc/ups. In nut-2.4 and greater, there are
# no per-host access restrictions, but you'll need to grant access in
# nut-2.2 or lower. 

# - If you want to be able to unfence host (2) via stonith_admin, you might want 
# to set its BIOS to boot up on AC power restore, as opposed to "last state" or "off."
# Otherwise the machine might not come back on even if the UPS restores power. 

# This agent doesn't keep track of which host it controls. Use the 
# Pacemaker parameters for that ("man stonithd"); e.g.:
#   primitive StonithMyHost stonith:fence_nut \
#      op monitor interval="60" timeout="30" on-fail="stop" \
#      params pcmk_host_list=myhost.example.com pcmk_host_check=static-list \
#          ups=myhost-ups username=myuser password=mypassword \
#          stonith-timeout="120s"

# Note the use of on-fail="stop". The main way this resource's monitor can fail
# is if we lose communication with the UPS. That's not great if it happens, but
# consider what happens if you allow the default on-fail="fence", especially in a
# two-node cluster; do you want host (1) to be fenced solely because it can 
# no longer fence host (2)? If you have more than two nodes, on-fail="restart"
# is an alternative, but if someone's pulled the communications cable from the
# UPS then the resource will just shift from node to node. (Maybe there's no
# need to monitor this agent if there's no logical automated response if it fails.)

# More on this agent's options: 

# The defaults will probably work with most APC UPS devices. They might work on others; 
# 'upscmd -l ${ups}' and 'upsc ${ups}' will list the commands and variables, and you
# can change the values for --poweron, --poweroff, --reset, --statusvar, and --cycledelay
# to suit your UPS. Change --upscmd and --upsc if your NUT binaries are not in /usr/bin.

# If you want to use the "graceful" reboot/shutdown on host (2), described in the
# next two paragraphs, you'll probably want it running APC's PowerChute software 
# instead of NUT. 

# On most APC UPSes, --poweroff=load.off will cut the power to (2) immediately. The 
# option --poweroff=shutdown.stayoff is risky, since the fenced host will continue 
# running for ups.delay.shutdown seconds. 

# By default, this agent reboots a system by telling the UPS to cut its load, wait
# "cycledelay" seconds, then restore power. If you provide a value for --reset, it will
# use that command instead of cycling power. Note that if you use '--reset=shutdown.return'
# a reboot/reset will wait for ups.delay.shutdown seconds, with power restored after 
# ups.delay.start seconds, where ups.* are variables defined in the UPS. To change 
# the value of UPS variables, see 'man upsrw'.

# On my rather old UPSes (circa 2002), if the load.off command is immediately followed 
# by load.on, the latter command might be ignored. I found I had to wait at least 
# 8 seconds for load.on to be accepted without problems. I've decided to be a bit
# cautious and use a default of cycledelay=20. 

# I also found that there can be a delay between the time the UPS turns its
# load on/off and the time that fact is updated in its status variables; e.g.,
# the UPS immediately turns off its load in response to load.off, but ups.status
# doesn't show OFF until up to 20 seconds afterwards. Therefore I added the 
# ondelay and offdelay options, which control how long to wait after issuing
# the corresponding poweron/poweroff command before checking the UPS status.

# If you want to completely ignore the UPS status after issuing a poweron or
# poweroff command, set the --noverifyonoff option. This is useful if you
# trust your UPS (and this agent!) to respond immediately to the command,
# and don't want to wait out the delay until the UPS status is updated. 

# Note that large values of cycledelay, ondelay, and offdelay may cause this
# agent to take a long time to run. Be sure to set stonith-timeout to at
# least 3x the largest of these values. 

# There's no explicit support for multiple-outlet UPSes/PDUs in this agent, because:
# a) I don't have any UPSes that let me control their individual outlets;
# b) NUT's documentation on how to use them is unclear to me;
# c) You probably want to use fence_apc or fence_apc_snmp instead. 

# If commands/variables like these exist for your UPS, these options may work
# (I have not tested them!):
#       --statusvar=outlet.3.status --upscmd=/usr/bin/upsrw \
#       --poweroff="-s outlet.3.switch=0" --poweron="-s outlet.3.switch=1"
# ... assuming host (2) is plugged into outlet 3 of the UPS. 

# Default locations of the NUT client programs. The RHEL-type nut packages
# install these binaries in /usr/bin.
my ($RHELUPSCMD, $RHELUPSC) = ('/usr/bin/upscmd', '/usr/bin/upsc');

# Defaults for APC smart UPSes:

# Poweron = turn on the power to $ups, which will presumably turn on host (2).
my $APCPOWERON = 'load.on';

# Poweroff = turn off $hostname immediately by cutting the power on $ups.
my $APCPOWEROFF = 'load.off';

# Status = the UPS variable holding a short status string; OL = on-line,
# OB = on battery, LB = low battery, etc.
my $APCSTATUSVAR = 'ups.status';

# Reset = reboot host (2). See the description of the 'reset' parameter above.
# Kept for documentation only: the user must supply the 'reset' parameter
# explicitly to replace the default reboot procedure (cycling the UPS load).
my $APCRESET = 'shutdown.return';


# The name this script was invoked as; shown in the usage text and the
# metadata, and used as the syslog identity when --debug is set.
my $progname = $0;

# Print the command-line help text to the currently selected output
# handle (normally STDOUT). Takes no arguments. The text interpolates the
# built-in defaults and the script name, and lists every option accepted
# by GetOptions, including --debug.
sub usage 
{
	print <<EOF;
Usage:
	fence_nut [options]
Options:
   -o, --action=<action>          Action: monitor, status, off, on, reset, metadata (default)
   -s, --ups=<ups>                UPS that controls power to hostname
   -u, --username=<username>      Username for accessing UPS
   -p, --password=<password>      Password for accessing UPS
   -h, --help                     Print this message and exit
   --poweron=<UPS 'on' cmd>       NUT command to turn on UPS (default $APCPOWERON)
   --poweroff=<UPS 'off' cmd>     NUT command to turn off UPS (default $APCPOWEROFF)
   --statusvar=<UPS status var>   UPS status variable (default $APCSTATUSVAR)
   --cycledelay=<integer>         How long to wait between poweroff and poweron in a reboot
   --ondelay=<integer>            How long to wait for UPS to turn on during poweron
   --offdelay=<integer>           How long to wait for UPS to turn off during poweroff
   --noverifyonoff                If set, do not verify the result of poweron/poweroff
   --debug                        If set, log the options to syslog and print the NUT commands
   --reset=<UPS 'reset' cmd>      NUT command to reset UPS (default is to cycle the load)
   --upscmd=<path to upscmd>      Default ${RHELUPSCMD}
   --upsc=<path to upsc>          Default ${RHELUPSC}
   
   For more information:
     fence_nut -o metadata
     less $progname
EOF
}

use Sys::Syslog;

# Parse the options.
use Getopt::Long;

# Option variables, initialised to their default values.

# NUT commands, status variable and binaries (APC / RHEL defaults).
my ($poweron, $poweroff, $statusvar) = ($APCPOWERON, $APCPOWEROFF, $APCSTATUSVAR);
my ($upscmd, $upsc)                  = ($RHELUPSCMD, $RHELUPSC);

# Waiting times, in seconds.
my ($cycledelay, $ondelay, $offdelay) = (20, 20, 20);

# Boolean switches.
my ($noverifyonoff, $debug) = (0, 0);

# I'm overly cautious, so the default action is "metadata" instead of
# "reboot" as it is in the other scripts in the fence-agents package. 
my $option = "metadata";

# Everything else starts out empty.
my ($ups, $username, $password, $reset, $help) = ("") x 5;

# Command-line options; each one stores straight into its variable.
GetOptions (
	# what to do, and to which UPS
	'action|o=s'    => \$option,
	'ups|s=s'       => \$ups,
	'username|u=s'  => \$username,
	'password|p=s'  => \$password,
	# NUT commands / variables / binaries
	'poweron=s'     => \$poweron,
	'poweroff=s'    => \$poweroff,
	'reset=s'       => \$reset,
	'statusvar=s'   => \$statusvar,
	'upscmd=s'      => \$upscmd,
	'upsc=s'        => \$upsc,
	# delays
	'cycledelay=i'  => \$cycledelay,
	'ondelay=i'     => \$ondelay,
	'offdelay=i'    => \$offdelay,
	# switches
	'noverifyonoff' => \$noverifyonoff,
	'debug'         => \$debug,
	'help|h'        => \$help,
);
             
# In stonith-ng, the options can also come in via STDIN.
# They're in the form "param=value", where param is one of the
# parameters defined in the metadata section below; the exception
# is "action", which is supplied by stonith-ng as "option".
#
# Each recognised parameter is stored through this lookup table instead
# of eval'ing the input line as Perl. Values are therefore taken
# literally (a password may contain quotes, '$', '@' or backslashes) and
# nothing read from STDIN is ever executed. Parameters that are not in
# the table are ignored.
my %stdin_param = (
	option        => \$option,
	help          => \$help,
	debug         => \$debug,
	noverifyonoff => \$noverifyonoff,
	ups           => \$ups,
	username      => \$username,
	password      => \$password,
	poweron       => \$poweron,
	poweroff      => \$poweroff,
	cycledelay    => \$cycledelay,
	ondelay       => \$ondelay,
	offdelay      => \$offdelay,
	reset         => \$reset,
	statusvar     => \$statusvar,
	upscmd        => \$upscmd,
	upsc          => \$upsc,
);

while ( my $line = <STDIN> )
{
	# A tiny bit of syntax checking: "key = value". The value must
	# contain at least one non-blank character; whitespace around it
	# is dropped.
	next unless $line =~ /^(\w+)\s*=\s*(.*\S)\s*$/;
	my ($key, $value) = ($1, $2);

	# Just in case a future version changes "option" back to "action"
	$key =~ s/^action/option/;

	my $target = $stdin_param{$key} or next;
	$$target = $value;
}

# Optional debugging aid: record the options we ended up with in syslog
# (facility local0). The password itself is never written to the log;
# the message only shows whether one was supplied.
if ( $debug )
{
	my $password_shown = ( $password ne "" ) ? "<hidden>" : "<unset>";
	openlog($progname, "ndelay,pid", LOG_LOCAL0);
	syslog(LOG_INFO, "logger action=%s, ups=%s, username=%s, password=%s", $option, $ups, $username, $password_shown);
	closelog;
}

# Help requested: show the usage text and leave successfully.
if ( $help )
{
	usage();
	exit 0;
}

# Possible exit codes: success (UPS load is on), generic error, and
# "the UPS load is off".
my ($EC_SUCCESS, $EC_ERROR, $EC_OFF) = (0, 1, 2);

# Parse the action option and carry it out. Every branch ends the
# program with one of the exit codes defined above.
#
# do_nut() returns a true value only when the UPS command could not be
# run or failed, so "exit $EC_ERROR if do_nut(...)" aborts the action
# instead of reporting a fence that never happened as a success.
if ( $option =~ "^(on|poweron)" )
{
	exit $EC_ERROR if do_nut($poweron);
	exit $EC_SUCCESS if $noverifyonoff;
	# Did we actually turn on the load to the UPS? Wait
	# a little while, then check. 
	sleep $ondelay;
	my $rc = status_ups();
	if ( $rc == $EC_SUCCESS )
	{
		# We tried to turn it on, and it's on.
		exit $EC_SUCCESS;
	}
	else
	{
		# We tried to turn it on, but somehow it's off
		# or in a weird state.
		exit $EC_ERROR;
	}
}
elsif ( $option =~ "^(off|poweroff)" )
{
	exit $EC_ERROR if do_nut($poweroff);
	exit $EC_SUCCESS if $noverifyonoff;
	# Did we actually turn off the load to the UPS? Wait
	# a little while, then check. 
	sleep $offdelay;
	my $rc = status_ups();
	if ( $rc == $EC_OFF )
	{
		# We tried to turn it off, and it's off.
		exit $EC_SUCCESS;
	}
	else
	{
		# We tried to turn it off, but somehow it's on
		# or in a weird state.
		exit $EC_ERROR;
	}
}
elsif ( $option =~ "^(reboot|reset)" )
{
	# By default, cycle the power via the UPS. If the
	# user has supplied a command via the reset parameter,
	# use that command instead. 
	if ( $reset )
	{
		exit $EC_ERROR if do_nut($reset);
	}
	else
	{
		# Turn the power off, wait a little while, then turn it on.
		# If the power-off command fails the host was never fenced,
		# so stop right there.
		exit $EC_ERROR if do_nut($poweroff);
		sleep $cycledelay;
		exit $EC_ERROR if do_nut($poweron);
	}
	exit $EC_SUCCESS if $noverifyonoff;
	# Did we actually turn on the load to the UPS? Wait
	# a little while, then check. 
	sleep $ondelay;
	my $rc = status_ups();
	if ( $rc == $EC_SUCCESS )
	{
		# We tried to turn it on, and it's on.
		exit $EC_SUCCESS;
	}
	else
	{
		# We tried to turn it on, but somehow it's off
		# or in a weird state.
		exit $EC_ERROR;
	}
}
elsif ( $option =~ "^status" )
{
	# The difference between "monitor" and "status"
	# is that status will display a message, and adjust
	# the return code if the UPS is off. 
	my $code = status_ups();
	# Only claim ON when the query succeeded; a failed query is
	# reported as UNKNOWN (the exit code is $EC_ERROR in that case).
	my $status = "UNKNOWN";
	if    ( $code == $EC_SUCCESS ) { $status = "ON"; }
	elsif ( $code == $EC_OFF )     { $status = "OFF"; }
	print "Status: $status\n";
	exit $code;
}
elsif ( $option =~ "^monitor" )
{
	# The difference between "monitor" and "status"
	# is that monitor will simply return 0 if the UPS and NUT
	# are working. 
	my ($rc,$ignore) = monitor_ups();
	if ( $rc != 0 ) { exit $EC_ERROR; }
	exit $EC_SUCCESS;
}
elsif ( $option =~ "^metadata" )
{
	# Note: this heredoc interpolates, so the line-continuation
	# backslashes in the example have to be written as "\\".
	print <<EOF;
<?xml version="1.0" ?>
<resource-agent name="fence_nut" shortdesc="Fence agent for UPSes controlled by NUT" >
<longdesc lang="en">
fence_nut is a Fencing Agent that controls an external UPS via NUT (Network UPS Tools).

Example:
   crm configure primitive StonithMyHost stonith:fence_nut \\
          op monitor interval="60" timeout="30" on-fail="stop" \\
          params pcmk_host_list=myhost.example.com pcmk_host_check=static-list \\
          ups=myhost-ups username=myuser password=mypassword

See the comments in $progname for more details and advice.
</longdesc>
<vendor-url>http://www.networkupstools.org/</vendor-url>
<parameters>
	<parameter name="action">
		<getopt mixed="-o, --action=&lt;action&gt;" />
		<content type="string" default="metadata" />
		<shortdesc lang="en">Fencing Action</shortdesc>
	</parameter>
	<parameter name="ups" required="1">
		<getopt mixed="-s, --ups=&lt;ups&gt;" />
		<content type="string" />
		<shortdesc lang="en">UPS name</shortdesc>
	</parameter>
	<parameter name="username" required="1">
		<getopt mixed="-u, --username=&lt;username&gt;" />
		<content type="string" />
		<shortdesc lang="en">Username</shortdesc>
	</parameter>
	<parameter name="password" required="1">
		<getopt mixed="-p, --password=&lt;password&gt;" />
		<content type="string" />
		<shortdesc lang="en">Password</shortdesc>
	</parameter>
	<parameter name="poweron">
		<getopt mixed="--poweron=&lt;power-on command&gt;" />
		<content type="string" default="$APCPOWERON" />
		<shortdesc lang="en">UPS Power On command</shortdesc>
	</parameter>
	<parameter name="poweroff">
		<getopt mixed="--poweroff=&lt;power-off command&gt;" />
		<content type="string" default="$APCPOWEROFF" />
		<shortdesc lang="en">UPS Power Off command</shortdesc>
	</parameter>
	<parameter name="cycledelay">
		<getopt mixed="--cycledelay=&lt;integer&gt;" />
		<content type="integer" default="20" />
		<shortdesc lang="en">How long to wait between poweroff and poweron in a reboot</shortdesc>
	</parameter>
	<parameter name="ondelay">
		<getopt mixed="--ondelay=&lt;integer&gt;" />
		<content type="integer" default="20" />
		<shortdesc lang="en">How long to wait for UPS to turn on during poweron</shortdesc>
	</parameter>
	<parameter name="offdelay">
		<getopt mixed="--offdelay=&lt;integer&gt;" />
		<content type="integer" default="20" />
		<shortdesc lang="en">How long to wait for UPS to turn off during poweroff</shortdesc>
	</parameter>
	<parameter name="noverifyonoff">
		<getopt mixed="--noverifyonoff" />
		<content type="integer" default="0" />
		<shortdesc lang="en">If set, do not verify the result of poweron/poweroff</shortdesc>
	</parameter>
	<parameter name="statusvar">
		<getopt mixed="--statusvar=&lt;status variable&gt;" />
		<content type="string" default="$APCSTATUSVAR" />
		<shortdesc lang="en">UPS Status variable</shortdesc>
	</parameter>
	<parameter name="reset">
		<getopt mixed="--reset=&lt;alternate reset command&gt;" />
		<content type="string" default="" />
		<shortdesc lang="en">UPS alternate reset command</shortdesc>
	</parameter>
	<parameter name="upscmd">
		<getopt mixed="--upscmd=&lt;upscmd path&gt;" />
		<content type="string" default="$RHELUPSCMD" />
		<shortdesc lang="en">path to upscmd binary</shortdesc>
	</parameter>
	<parameter name="upsc">
		<getopt mixed="--upsc=&lt;upsc path&gt;" />
		<content type="string" default="$RHELUPSC" />
		<shortdesc lang="en">path to upsc binary</shortdesc>
	</parameter>
	<parameter name="debug">
		<getopt mixed="--debug" />
		<content type="integer" default="0" />
		<shortdesc lang="en">enable some minor debugging info in log</shortdesc>
	</parameter>
</parameters>
<actions>
    <action name="on" />
    <action name="off" />
    <action name="reboot" />
    <action name="status" />
    <action name="monitor" />
    <action name="metadata" />
    <action name="start" />
    <action name="stop" />
</actions>
</resource-agent>
EOF
exit 0;
}
elsif ( $option =~ "^(start|stop)" )
{
	# Do nothing; there's nothing to start or stop
	# in this agent. (These actions are implemented solely
	# to prevent warning messages from crm_verify.)
	exit 0;
}
else
{
	print "fence_nut: invalid option '$option'\n";
	usage();
	exit $EC_ERROR;
}

1;


# Send a command to the UPS via NUT.
#
# Argument:
#   $command - the NUT command to run, e.g. "load.off". It is split on
#              whitespace, so it may consist of several words
#              (e.g. "-s outlet.3.switch=0" together with --upscmd=upsrw).
# Returns:
#   $EC_SUCCESS if the program named by $upscmd ran and exited with
#   status 0; $EC_ERROR if it is not executable, if username, password
#   or ups name is missing, or if the command failed. A diagnostic is
#   printed in each error case.
sub do_nut 
{
	my ($command) = @_;

	if ( ! -x $upscmd )
	{
		print "fence_nut: Can't find executable ${upscmd}\n";
		return $EC_ERROR;
	}
	# All three are required. ("not" binds more loosely than "||", so
	# the high-precedence "!" is needed to test each value separately.)
	if ( !$username || !$password || !$ups )
	{
		print "fence_nut: username, password or ups name missing; check configuration\n";
		usage();
		return $EC_ERROR;
	}

	# The text form of the command is used only for messages.
	my $cmd = "$upscmd -u $username -p $password $ups $command";
	print "cmd=$cmd\n" if $debug;

	# Execute the command given in the argument. The program is run
	# directly from an argument list, without a shell, so characters
	# such as quotes, '$' or ';' in the password or UPS name are passed
	# through unchanged.
	my @argv = ( $upscmd, '-u', $username, '-p', $password, $ups,
	             split(' ', $command) );
	my $pipe;
	if ( ! open($pipe, '-|', @argv) )
	{
		print "fence_nut: error executing '$cmd': $!\n";
		return $EC_ERROR;
	}
	my $result = do { local $/; <$pipe> };
	$result = "" unless defined $result;
	# Closing the pipe waits for the program and sets $? to its status.
	close($pipe);
	if ( $? != 0 )
	{
		print "fence_nut: error executing '$cmd': $result\n";
		return $EC_ERROR;
	}
	return $EC_SUCCESS;
}

# Report the state of the UPS load as one of the agent's exit codes.
#
# The difference between "monitor_ups" and "status_ups" is that this
# routine also looks at the text returned by the UPS:
#   $EC_ERROR   - the status could not be fetched;
#   $EC_OFF     - the text contains "off" (in any case) or a "0";
#   $EC_SUCCESS - anything else.
sub status_ups
{
	my ($failed, $text) = monitor_ups();
	if ( $failed != 0 ) { return $EC_ERROR; }

	return ( $text =~ /(off|0)/i ) ? $EC_OFF : $EC_SUCCESS;
}

# Ask NUT for the UPS status variable.
#
# Returns a two-element list ($code,$text):
#   $code is 0 when the status command ran successfully, 1 when the
#         upsc program is not executable, or the non-zero value of $?
#         when the command failed;
#   $text is the output of the status command (empty on any failure).
sub monitor_ups
{
	unless ( -x $upsc )
	{
		print "fence_nut: Can't find executable $upsc\n";
		return (1,"");
	}

	# Build and run the command that fetches the UPS status.
	my $query = "${upsc} ${ups} ${statusvar}";
	if ( $debug ) { print "cmd=$query\n"; }
	my $output = qx{$query};

	# The UPS may be on or off here... but either way the device
	# answered, so it is working.
	return (0,$output) if $? == 0;

	print "fence_nut: error executing '$query': $output\n";
	return ($?,"");
}