1. #!/usr/bin/perl
  2.  
  3. # External fencing agent that uses the NUT daemon to control an external UPS.
  4. # See the comments below, and the various NUT man pages, for how this
  5. # script works. It should work unchanged with most modern "smart" APC UPSes in
  6. # a Redhat/Fedora/RHEL-style distribution with the nut package installed.
  7.  
  8. # Author: William Seligman <seligman@nevis.columbia.edu>
  9. # License: GPLv2
  10.  
  11. # The Following Agent Has Been Tested With:
  12. # pacemaker-1.1.6
  13. # nut-2.4.3
  14.  
  15. # As you're designing your UPS and fencing set-up, consider that there may be
  16. # three computers involved:
  17. # 1) the machine running this fencing agent;
  18. # 2) the machine being controlled by this agent;
  19. # 3) the machine that can send commands to the UPS.
  20.  
  21. # On my cluster, all the UPSes have SNMP smartcards, so every host can communicate
  22. # with every UPS; in other words, machines (1) and (3) are the same. If your UPSes
  23. # are controlled via serial or USB connections, then you might have a
  24. # situation in which host (2) is plugged into a UPS that has a serial connection
  25. # to some master "power-control" computer, and can potentially be fenced
  26. # by any other machine in your cluster.
  27.  
  28. # You'll probably need the nut daemon running on both the hosts (1) and
  29. # (3). Strictly speaking, there's no reason for NUT to run on (2).
  30. # From a practical standpoint you'll probably want NUT to be running on all the
  31. # systems in your cluster.
  32.  
  33. # For this agent to work, the following conditions have to be met:
  34.  
  35. # - NUT has to be installed; on RHEL systems, this requires packages nut and
  36. # nut-client.
  37.  
  38. # - The nut daemon (the ups or upsd service on RHEL) must be running on hosts
  39. # (1) and (3). This agent does not start/stop the nut daemons for you.
  40.  
  41. # - The name of the UPS that affects host (2) has to be defined in ups.conf on
  42. # host (3). The format for the --ups option is upsname[@controlhost[:port]]. The
  43. # default controlhost is 'localhost'. If you use SNMP management cards, you want
  44. # to make sure you issue commands to a community with read/write privileges; the
  45. # default is the 'private' community. An example ups.conf:
  46.  
  47. # [myhost-ups]
  48. # driver = snmp-ups
  49. # port = myhost-ups.example.com
  50. # community = private
  51. # mibs=apcc
  52.  
  53. # - The --username and --password options to access the UPS must be defined in
  54. # upsd.users on host (3), with the instcmds for poweron, poweroff, and reset allowed.
  55. # An example upsd.users:
  56.  
  57. # [myuser]
  58. # password = mypassword
  59. # actions = SET
  60. # instcmds = ALL
  61.  
  62. # - Host (1) must be allowed access via upsd.conf and upsd.users on host (3).
  63. # On RHEL systems, these files are in /etc/ups. In nut-2.4 and greater, there are
  64. # no per-host access restrictions, but you'll need to grant access in
  65. # nut-2.2 or lower.
  66.  
  67. # - If you want to be able to unfence host (2) via stonith_admin, you might want
  68. # to set its BIOS to boot up on AC power restore, as opposed to "last state" or "off."
  69. # Otherwise the machine might not come back on even if the UPS restores power.
  70.  
  71. # This agent doesn't keep track of which host it controls. Use the
  72. # Pacemaker parameters for that ("man stonithd"); e.g.,:
  73. # primitive StonithMyHost stonith:fence_nut \
  74. # op monitor interval="60" timeout="30" on-fail="stop" \
  75. # params pcmk_host_list=myhost.example.com pcmk_host_check=static-list \
  76. # ups=myhost-ups username=myuser password=mypassword \
  77. # stonith-timeout="120s"
  78.  
  79. # Note the use of on-fail="stop". The main way this resource's monitor can fail
  80. # is if we lose communication with the UPS. That's not great if it happens, but
  81. # consider what happens if you allow the default on-fail="fence", especially in a
  82. # two-node cluster; do you want host (1) to be fenced solely because it can
  83. # no longer fence host (2)? If you have more than two nodes, on-fail="restart"
  84. # is an alternative, but if someone's pulled the communications cable from the
  85. # UPS then the resource will just shift from node to node. (Maybe there's no
  86. # need to monitor this agent if there's no logical automated response if it fails.)
  87.  
  88. # More on this agent's options:
  89.  
  90. # The defaults will probably work with most APC UPS devices. They might work on others;
  91. # 'upscmd -l ${ups}' and 'upsc ${ups}' will list the commands and variables, and you
  92. # can change the values for --poweron, --poweroff, --reset, --statusvar, and --cycledelay
  93. # to suit your UPS. Change --upscmd and --upsc if your NUT binaries are not in /usr/bin.
  94.  
  95. # If you want to use the "graceful" reboot/shutdown on host (2), described in the
  96. # next two paragraphs, you'll probably want it running APC's PowerChute software
  97. # instead of NUT.
  98.  
  99. # On most APC UPSes, --poweroff=load.off will cut the power to (2) immediately. The
  100. # option --poweroff=shutdown.stayoff is risky, since the fenced host will continue
  101. # running for ups.delay.shutdown seconds.
  102.  
  103. # By default, this agent reboots a system by telling the UPS to cut its load, wait
  104. # "cycledelay" seconds, then restore power. If you provide a value for --reset, it will
  105. # use that command instead of cycling power. Note that if you use '--reset=shutdown.return'
  106. # a reboot/reset will wait for ups.delay.shutdown seconds, with power restored after
  107. # ups.delay.start seconds, where ups.* are variables defined in the UPS. To change
  108. # the value of UPS variables, see 'man upsrw'.
  109.  
  110. # On my rather old UPSes (circa 2002), if the load.off command is immediately followed
  111. # by load.on, the latter command might be ignored. I found I had to wait at least
  112. # 8 seconds for load.on to be accepted without problems. I've decided to be a bit
  113. # cautious and use a default of cycledelay=20.
  114.  
  115. # I also found that there can be a delay between the time the UPS turns its
  116. # load on/off and the time that fact is updated in its status variables; e.g.,
  117. # the UPS immediately turns off its load in response to load.off, but ups.status
  118. # doesn't show OFF until up to 20 seconds afterwards. Therefore I added the
  119. # ondelay and offdelay options, which control how long to wait after issuing
  120. # the corresponding poweron/poweroff command before checking the UPS status.
  121.  
  122. # If you want to completely ignore the UPS status after issuing a poweron or
  123. # poweroff command, set the --noverifyonoff option. This is useful if you
  124. # trust your UPS (and this agent!) to respond immediately to the command,
  125. # and don't want to wait out the delay until the UPS status is updated.
  126.  
  127. # Note that large values of cycledelay, ondelay, and offdelay may cause this
  128. # agent to take a long time to run. Be sure to set stonith-timeout to at
  129. # least 3x the largest of these values.
  130.  
  131. # There's no explicit support for multiple-outlet UPSes/PDUs in this agent, because:
  132. # a) I don't have any UPSes that let me control their individual outlets;
  133. # b) NUT's documentation on how to use them is unclear to me;
  134. # c) You probably want to use fence_apc or fence_apc_snmp instead.
  135.  
  136. # If commands/variables like these exist for your UPS, these options may work
  137. # (I have not tested them!):
  138. # --statusvar=outlet.3.status --upscmd=/usr/bin/upsrw \
  139. # --poweroff="-s outlet.3.switch=0" --poweron="-s outlet.3.switch=1"
  140. # ... assuming host (2) is plugged into outlet 3 of the UPS.
  141.  
  142. # The NUT command default locations. In the RHEL-type nut packages, these binaries
  143. # are in /usr/bin.
  144. my $RHELUPSCMD="/usr/bin/upscmd";
  145. my $RHELUPSC="/usr/bin/upsc";
  146.  
  147. # Defaults for APC smart UPSes:
  148.  
  149. # Poweroff = turn off $hostname immediately by cutting the power on $ups.
  150. my $APCPOWEROFF="load.off";
  151.  
  152. # Poweron = turn on the power to $ups, which will presumably turn on host (2).
  153. my $APCPOWERON="load.on";
  154.  
  155. # Status = returns a short string with the $ups status; OL = on-line,
  156. # OB = on battery, LB = low battery, etc.
  157. my $APCSTATUSVAR="ups.status";
  158.  
  159. # Reset = reboot host (2). See the description of the 'reset' parameter above.
  160. # For documentation only at this point, since the user must supply this parameter if
  161. # they want to change the default reboot procedure (cycle the load on the UPS).
  162. my $APCRESET="shutdown.return";
  163.  
  164.  
  165. my $progname = $0;
  166.  
  167. sub usage
  168. {
  169. print <<EOF;
  170. Usage:
  171. fence_nut [options]
  172. Options:
  173. -o, --action=<action> Action: monitor, status, off, on, reset, metadata (default)
  174. -s, --ups=<ups> UPS that controls power to hostname
  175. -u, --username=<username> Username for accessing UPS
  176. -p, --password=<password> Password for accessing UPS
  177. -h, --help Print this message and exit
  178. --poweron=<UPS 'on' cmd> NUT command to turn on UPS (default $APCPOWERON)
  179. --poweroff=<UPS 'off' cmd> NUT command to turn off UPS (default $APCPOWEROFF)
  180. --statusvar=<UPS status var> UPS status variable (default $APCSTATUSVAR)
  181. --cycledelay=<integer> How long to wait between poweroff and poweron in a reboot
  182. --ondelay=<integer> How long to wait for UPS to turn on during poweron
  183. --offdelay=<integer> How long to wait for UPS to turn off during poweroff
  184. --noverifyonoff If set, do not verify the result of poweron/poweroff
  185. --reset=<UPS 'reset' cmd> NUT command to reset UPS (default is to cycle the load)
  186. --upscmd=<path to upscmd> Default ${RHELUPSCMD}
  187. --upsc=<path to upsc> Default ${RHELUPSC}
  188.  
  189. For more information:
  190. fence_nut -o metadata
  191. less $progname
  192. EOF
  193. }
  194.  
  195. use Sys::Syslog;
  196.  
  197. # Parse the options.
  198. use Getopt::Long;
  199.  
  200. # Options and default values
  201. my $poweron = $APCPOWERON;
  202. my $poweroff = $APCPOWEROFF;
  203. my $cycledelay = 20;
  204. my $ondelay = 20;
  205. my $offdelay = 20;
  206. my $noverifyonoff = 0;
  207. my $statusvar = $APCSTATUSVAR;
  208. my $upscmd = $RHELUPSCMD;
  209. my $upsc = $RHELUPSC;
  210. # I'm overly cautious, so the default action is "metadata" instead of
  211. # "reboot" as it is in the other scripts in the fence-agents package.
  212. my $option = "metadata";
  213. my $ups = "";
  214. my $username = "";
  215. my $password = "";
  216. my $reset = "";
  217. my $help = "";
  218. my $debug = 0;
  219.  
  220. GetOptions ( 'action|o=s' => \$option,
  221. 'help|h' => \$help,
  222. 'debug' => \$debug,
  223. 'noverifyonoff' => \$noverifyonoff,
  224. 'ups|s=s' => \$ups,
  225. 'username|u=s' => \$username,
  226. 'password|p=s' => \$password,
  227. 'poweron=s' => \$poweron,
  228. 'poweroff=s' => \$poweroff,
  229. 'cycledelay=i' => \$cycledelay,
  230. 'ondelay=i' => \$ondelay,
  231. 'offdelay=i' => \$offdelay,
  232. 'reset=s' => \$reset,
  233. 'statusvar=s' => \$statusvar,
  234. 'upscmd=s' => \$upscmd,
  235. 'upsc=s' => \$upsc );
  236.  
  237. # In stonith-ng, the options can also come in via STDIN.
  238. # They're in the form "param=value", where param is one of the
  239. # parameter defined in the metadata section below; the exception
  240. # is "action", which is supplied by stonith-ng as "option".
  241.  
  242. while ($line = <STDIN>)
  243. {
  244. # A tiny bit of syntax checking.
  245. if ( $line =~ /^(\w+)\s*=\s*(.+)\s*$/ )
  246. {
  247. my $key = $1;
  248. my $value = $2;
  249. # Just in case a future version changes "option" back to "action"
  250. $key =~ s/^action/option/;
  251. # Execute the input line as a perl statement.
  252. my $command = "\$$key=\"$value\"\;";
  253. eval $command;
  254. }
  255. }
  256.  
  257. if ( $debug )
  258. {
  259. openlog($progname, "ndelay,pid", LOG_LOCAL0);
  260. syslog(LOG_INFO, "logger action=%s, ups=%s, username=%s, password=%s", $option, $ups, $username, $password);
  261. closelog;
  262. }
  263.  
  264. if ( $help ) { usage; exit 0; }
  265.  
  266. # Possible exit codes
  267. my $EC_SUCCESS = 0;
  268. my $EC_ERROR = 1;
  269. my $EC_OFF = 2;
  270.  
  271. # Parse the action option.
  272. if ( $option =~ "^(on|poweron)" )
  273. {
  274. do_nut($poweron);
  275. exit $EC_SUCCESS if $noverifyonoff;
  276. # Did we actually turn on the load to the UPS? Wait
  277. # a little while, then check.
  278. sleep $ondelay;
  279. my $rc = status_ups();
  280. if ( $rc == 0 )
  281. {
  282. # We tried to turn it on, and it's on.
  283. exit $EC_SUCCESS;
  284. }
  285. else
  286. {
  287. # We tried to turn it on, but somehow it's off
  288. # or in a weird state.
  289. exit $EC_ERROR;
  290. }
  291. }
  292. elsif ( $option =~ "^(off|poweroff)" )
  293. {
  294. do_nut($poweroff);
  295. exit $EC_SUCCESS if $noverifyonoff;
  296. # Did we actually turn off the load to the UPS? Wait
  297. # a little while, then check.
  298. sleep $offdelay;
  299. my $rc = status_ups();
  300. if ( $rc == 2 )
  301. {
  302. # We tried to turn it off, and it's off.
  303. exit $EC_SUCCESS;
  304. }
  305. else
  306. {
  307. # We tried to turn it off, but somehow it's on
  308. # or in a weird state.
  309. exit $EC_ERROR;
  310. }
  311. }
  312. elsif ( $option =~ "^(reboot|reset)" )
  313. {
  314. # By default, cycle the power via the UPS. If the
  315. # user has supplied a command via the reset parameter,
  316. # use that command instead.
  317. if ( $reset )
  318. {
  319. do_nut($reset);
  320. }
  321. else
  322. {
  323. # Turn the power off, wait a little while, then turn it on.
  324. do_nut($poweroff);
  325. sleep $cycledelay;
  326. do_nut($poweron);
  327. }
  328. exit $EC_SUCCESS if $noverifyonoff;
  329. # Did we actually turn on the load to the UPS? Wait
  330. # a little while, then check.
  331. sleep $ondelay;
  332. my $rc = status_ups();
  333. if ( $rc == 0 )
  334. {
  335. # We tried to turn it on, and it's on.
  336. exit $EC_SUCCESS;
  337. }
  338. else
  339. {
  340. # We tried to turn it on, but somehow it's off
  341. # or in a weird state.
  342. exit $EC_ERROR;
  343. }
  344. }
  345. elsif ( $option =~ "^status" )
  346. {
  347. # The difference between "monitor" and "status"
  348. # is that status will display a message, and adjust
  349. # the return code if the UPS is off.
  350. my $code = status_ups();
  351. my $status = "ON";
  352. if ( $code == $EC_OFF ) { $status = "OFF"; }
  353. print "Status: $status\n";
  354. exit $code;
  355. }
  356. elsif ( $option =~ "^monitor" )
  357. {
  358. # The difference between "monitor" and "status"
  359. # is that monitor will simply return 0 if the UPS and NUT
  360. # are working.
  361. my ($rc,$ignore) = monitor_ups();
  362. if ( $rc != 0 ) { exit $EC_ERROR; }
  363. exit $EC_SUCCESS;
  364. }
  365. elsif ( $option =~ "^metadata" )
  366. {
  367. print <<EOF;
  368. <?xml version="1.0" ?>
  369. <resource-agent name="fence_nut" shortdesc="Fence agent for UPSes controlled by NUT" >
  370. <longdesc lang="en">
  371. fence_nut is a Fencing Agent that controls an external UPS via NUT (Network UPS Tools).
  372.  
  373. Example:
  374. crm configure primitive StonithMyHost stonith:fence_nut \
  375. params pcmk_host_list=myhost.example.com pcmk_host_check=static-list \
  376. op monitor interval="60" timeout="30" on-fail="stop" \
  377. ups=myhost-ups username=myuser password=mypassword
  378.  
  379. See the comments in $progname for more details and advice.
  380. </longdesc>
  381. <vendor-url>http://www.networkupstools.org/</vendor-url>
  382. <parameters>
  383. <parameter name="action">
  384. <getopt mixed="-o, --action=&lt;action&gt;" />
  385. <content type="string" default="metadata" />
  386. <shortdesc lang="en">Fencing Action</shortdesc>
  387. </parameter>
  388. <parameter name="ups" required="1">
  389. <getopt mixed="-s, --ups=&lt;ups&gt;" />
  390. <content type="string" />
  391. <shortdesc lang="en">UPS name</shortdesc>
  392. </parameter>
  393. <parameter name="username" required="1">
  394. <getopt mixed="-u, --username=&lt;username&gt;" />
  395. <content type="string" />
  396. <shortdesc lang="en">Username</shortdesc>
  397. </parameter>
  398. <parameter name="password" required="1">
  399. <getopt mixed="-p, --password=&lt;password&gt;" />
  400. <content type="string" />
  401. <shortdesc lang="en">Password</shortdesc>
  402. </parameter>
  403. <parameter name="poweron">
  404. <getopt mixed="--poweron=&lt;power-on command&gt;" />
  405. <content type="string" default="$APCPOWERON" />
  406. <shortdesc lang="en">UPS Power On command</shortdesc>
  407. </parameter>
  408. <parameter name="poweroff">
  409. <getopt mixed="--poweroff=&lt;power-off command&gt;" />
  410. <content type="string" default="$APCPOWEROFF" />
  411. <shortdesc lang="en">UPS Power Off command</shortdesc>
  412. </parameter>
  413. <parameter name="cycledelay">
  414. <getopt mixed="--cycledelay=&lt;integer&gt;" />
  415. <content type="integer" default="20" />
  416. <shortdesc lang="en">How long to wait between poweroff and poweron in a reboot</shortdesc>
  417. </parameter>
  418. <parameter name="ondelay">
  419. <getopt mixed="--ondelay=&lt;integer&gt;" />
  420. <content type="integer" default="20" />
  421. <shortdesc lang="en">How long to wait for UPS to turn on during poweron</shortdesc>
  422. </parameter>
  423. <parameter name="offdelay">
  424. <getopt mixed="--offdelay=&lt;integer&gt;" />
  425. <content type="integer" default="20" />
  426. <shortdesc lang="en">How long to wait for UPS to turn off during poweroff</shortdesc>
  427. </parameter>
  428. <parameter name="noverifyonoff">
  429. <getopt mixed="--offdelay=&lt;0 or 1&gt;" />
  430. <content type="integer" default="0" />
  431. <shortdesc lang="en">If set, do not verify the result of poweron/poweroff</shortdesc>
  432. </parameter>
  433. <parameter name="statusvar">
  434. <getopt mixed="--statusvar=&lt;status variable&gt;" />
  435. <content type="string" default="$APCSTATUSVAR" />
  436. <shortdesc lang="en">UPS Status variable</shortdesc>
  437. </parameter>
  438. <parameter name="reset">
  439. <getopt mixed="--reset=&lt;alternate reset command&gt;" />
  440. <content type="string" default="" />
  441. <shortdesc lang="en">UPS alternate reset command</shortdesc>
  442. </parameter>
  443. <parameter name="upscmd">
  444. <getopt mixed="--upscmd=&lt;upscmd path&gt;" />
  445. <content type="string" default="$RHELUPSCMD" />
  446. <shortdesc lang="en">path to upscmd binary</shortdesc>
  447. </parameter>
  448. <parameter name="upsc">
  449. <getopt mixed="--upsc=&lt;upsc path&gt;" />
  450. <content type="string" default="$RHELUPSC" />
  451. <shortdesc lang="en">path to upsc binary</shortdesc>
  452. </parameter>
  453. <parameter name="debug">
  454. <getopt mixed="--debug=&lt;0 or 1&gt;" />
  455. <content type="integer" default="0" />
  456. <shortdesc lang="en">enable some minor debugging info in log</shortdesc>
  457. </parameter>
  458. </parameters>
  459. <actions>
  460. <action name="on" />
  461. <action name="off" />
  462. <action name="reboot" />
  463. <action name="status" />
  464. <action name="monitor" />
  465. <action name="metadata" />
  466. <action name="start" />
  467. <action name="stop" />
  468. </actions>
  469. </resource-agent>
  470. EOF
  471. exit 0;
  472. }
  473. elsif ( $option =~ "^(start|stop)" )
  474. {
  475. # Do nothing; there's nothing to start or stop
  476. # in this agent. (These actions are implemented solely
  477. # to prevent warning messages from crm_verify.)
  478. exit 0;
  479. }
  480. else
  481. {
  482. print "fence_nut: invalid option '$option'\n";
  483. usage();
  484. exit $EC_ERROR;
  485. }
  486.  
  487. 1;
  488.  
  489.  
  490. # Send a command to the UPS via NUT.
  491. sub do_nut
  492. {
  493. my ($command) = @_;
  494.  
  495. if ( ! -x $upscmd )
  496. {
  497. print "fence_nut: Can't find executable ${upscmd}\n";
  498. return $EC_ERROR;
  499. }
  500. if ( not $username || not $password || not $ups )
  501. {
  502. print "fence_nut: username, password or ups name missing; check configuration\n";
  503. usage();
  504. return $EC_ERROR;
  505. }
  506. # Execute the command given in the argument.
  507. my $cmd = "$upscmd -u $username -p $password $ups $command";
  508. print "cmd=$cmd\n" if $debug;
  509. $result = `$cmd`;
  510. if ( $? != 0 )
  511. {
  512. print "fence_nut: error executing '$cmd': $result\n";
  513. return $EC_ERROR;
  514. }
  515. return $EC_SUCCCESS;
  516. }
  517.  
  518. sub status_ups
  519. {
  520. # The different between "monitor_ups" and
  521. # "status_ups" is that in this routine we
  522. # adjust the return code depending on the
  523. # contents of the message returned by the
  524. # UPS.
  525.  
  526. my ($rc, $result) = monitor_ups();
  527. return $EC_ERROR if $rc != 0;
  528.  
  529. my $code = $EC_SUCCESS;
  530. if ( $result =~ /(off|0)/i )
  531. {
  532. $code = $EC_OFF;
  533. }
  534. return $code;
  535. }
  536.  
  537. # Note that we return an array: ($code,$text);
  538. # $code is the return code from executing commands,
  539. # $text is the output returned from the status command.
  540. sub monitor_ups
  541. {
  542. if ( ! -x $upsc )
  543. {
  544. print "fence_nut: Can't find executable $upsc\n";
  545. return (1,"");
  546. }
  547. # Define the command to fetch the UPS status.
  548. my $cmd = "${upsc} ${ups} ${statusvar}";
  549. print "cmd=$cmd\n" if $debug;
  550. my $result = `$cmd`;
  551. if ( $? != 0 )
  552. {
  553. print "fence_nut: error executing '$cmd': $result\n";
  554. return ($?,"");
  555. }
  556. # At this point the UPS is on or off... but either way the
  557. # device is working.
  558. return (0,$result);
  559. }