# test_rabbit_master_failover

#!/bin/bash
# Tests failover of the RabbitMQ master resource in a static scenario (no manipulation of the slaves).
# Given env 1 with nodes 1, 2 and 3. Assume node-2 is always the master under the kill test and its virsh domain name is env61_2_slave-03.
# 1) Move the master to node-2
# 2) Wait until the OSTF HA check passes
# 3) Kill node-2 and start timing the failover in a wait loop (wait until the OSTF HA check passes)
#    a. also get the pcs status of the cluster nodes
#    b. get the rabbit pacemaker resource status
#    c. get the rabbitmqctl cluster_status output
# 4) Report the estimated failover time
# 5) Power on node-2 and wait until it has joined the rabbit cluster
# 6) Repeat 1-5

# Reset the iteration counter and begin a fresh log file (truncates out.txt).
count=0
printf '%s\n' start > out.txt

#######################################
# Block until the OSTF HA health check for env 1 comes back clean.
#
# Re-runs `fuel health --check ha --env 1` until its stdout is non-empty and
# contains none of the substrings "fail", "skip" or "err".
# Globals:   none
# Arguments: none
# Returns:   0 once a clean run has been seen; loops forever otherwise
#######################################
wait_ostf_ha() {
  local report
  local rc
  while true; do
    report=$(fuel health --check ha --env 1 2>/dev/null)
    if [[ -z "${report}" ]]; then
      # Empty output gives nothing to inspect (stderr is discarded above), so
      # it must not be mistaken for a passing run. Retry after a short pause
      # to avoid spinning on a command that produces nothing.
      sleep 1
      continue
    fi
    rc=0
    grep -q -e fail -e skip -e err <<<"${report}" || rc=$?
    # grep exits 1 only when none of the failure markers is present.
    if (( rc == 1 )); then
      return 0
    fi
  done
}

# before
# Drop any leftover location constraint for the rabbit resource. The remote
# command is piped into `true`, so the remote pipeline exits 0 even when there
# is no such constraint to delete.
ssh node-1 "pcs constraint delete location-p_rabbitmq-server 2>/dev/null | true"

# Refuse to start while node-2 (virsh domain env61_2_slave-03) is reported OFF.
# The check must run in the current shell: an `exit` inside a ( ... ) subshell
# would only terminate the subshell and the script would carry on regardless.
if fence_virsh -o status -a 10.109.0.1 --shell-timeout 10 --login-timeout 10 -l fence -p 0ff3nc3 -n env61_2_slave-03 2>&1 \
  | grep -q OFF; then
  echo "Power on the node-2 first!" >&2
  exit 1
fi

# start test
# Each iteration: move the rabbit master to node-2, time how long the OSTF HA
# check needs to pass, power node-2 off, time the failover again, log cluster
# state, power node-2 back on and wait for it to rejoin the rabbit cluster.
# The loop ends only when a corosync split brain is detected.
while true; do
  echo "Iteration ${count}" >> out.txt

  failover_start_time=$(date +%s)
  # Move the rabbit master to the node-2: temporarily forbid the master role
  # (score=-INFINITY) on every node whose #uname is not node-2, then remove the
  # constraint again after 2 seconds.
  ssh node-1 "pcs constraint delete location-p_rabbitmq-server 2>/dev/null | true"
  ssh node-1 "pcs constraint location p_rabbitmq-server rule role=master score=-INFINITY \#uname ne node-2.test.domain.local"
  sleep 2
  ssh node-1 "pcs constraint delete location-p_rabbitmq-server"

  # Wait for ostf ha passed
  wait_ostf_ha
  now_time=$(date +%s)
  failover_time=$(( now_time - failover_start_time ))
  echo "Failover (moved master) time was: ${failover_time}" >> out.txt

  # Kill the node running the rabbit master resource (node-2)
  fence_virsh -o off -a 10.109.0.1 --shell-timeout 10 --login-timeout 10 -l fence -p 0ff3nc3 -n env61_2_slave-03 >> out.txt

  failover_start_time=$(date +%s)
  # Wait for ostf ha passed
  wait_ostf_ha
  now_time=$(date +%s)
  failover_time=$(( now_time - failover_start_time ))
  echo "Failover (master node destroyed) time was: ${failover_time}" >> out.txt

  # Check for corosync clusterstatus
  out=$(dockerctl shell astute bash -c "mco rpc -v execute_shell_command execute cmd=\"hostname; date; crm_mon -1 | grep -i -e 'DC.*node' -e 'line:.*node' -e 'node.*UNCLEAN'\" | grep -A3 -e \"^[123]\"" 2>&1 &)
  sleep 5
  # $out is deliberately left unquoted below: word splitting joins the captured
  # output into a single line, and tr then turns every backslash (and newline)
  # into a line break, squeezing repeats.
  # shellcheck disable=SC2086
  echo $out | tr -s "\\\\\n" "\n" | grep "^n\w.*" >> out.txt

  # Stop the test when the cluster status shows UNCLEAN, WITHOUT or two OFFLINE
  # nodes. This has to be a plain `if`: a `break` inside a ( ... ) subshell
  # cannot leave the enclosing loop.
  # shellcheck disable=SC2086
  if echo $out | grep -E -q -e UNCLEAN -e WITHOUT -e 'OFFLINE: \[ (\S+) (\S+) \]'; then
    echo "CAUGHT COROSYNC SPLITBRAIN at $count"
    break
  fi

  # Check for rabbit resource status
  out=$(dockerctl shell astute bash -c "mco rpc -v execute_shell_command execute cmd=\"hostname; date; pcs resource | grep -A2 master_p_rabbitmq-server\" | grep -A3 -e \"^[123]\"" 2>&1 &)
  sleep 5
  # shellcheck disable=SC2086
  echo $out | tr -s "\\\\\n" "\n" | grep -v stderr >> out.txt

  # Check for rabbit cluster status
  out=$(dockerctl shell astute bash -c "mco rpc -v execute_shell_command execute cmd=\"hostname; date; rabbitmqctl cluster_status | grep -e nodes -e running\" | grep -A3 -e \"^[123]\"" 2>&1 &)
  sleep 5
  # shellcheck disable=SC2086
  echo $out | tr -s "\\\\\n" "\n" | grep -v stderr >> out.txt


  # Power on the killed node-2
  fence_virsh -o on -a 10.109.0.1 --shell-timeout 10 --login-timeout 10 -l fence -p 0ff3nc3 -n env61_2_slave-03 >> out.txt
  # Do not reboot more often than each 10 minutes, or UMM will be activated
  # NOTE(review): nothing in this loop enforces that 10 minute spacing; it only
  # holds if an iteration happens to take that long - confirm.

  # Wait for the node-2 joined rabbit cluster, polling every 10 seconds.
  # The pattern is quoted so the shell cannot glob-expand it against files in
  # the current directory.
  until ssh node-1 rabbitmqctl cluster_status | grep -q 'running.*node-2'; do
    sleep 10
  done
  # Pause 10 s after the join is detected before starting the next iteration.
  sleep 10

  # Next iter
  count=$(( count + 1 ))
done