Advertisement
Guest User

test_rabbit_master_failover

a guest
May 21st, 2015
115
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Bash 3.72 KB | None | 0 0
  #!/bin/bash
  # Tests RabbitMQ master resource failover in a static scenario (the slaves are not manipulated).
  # Given env 1 with nodes 1, 2, 3. Assumes node-2 is always the master under the kill test and its virsh domain name is env61_2_slave-03.
  # 1) Move the master to node-2
  # 2) Wait until the OSTF HA check passes
  # 3) Kill node-2 and start timing the failover in a wait loop (wait until the OSTF HA check passes)
  # a. also get the pcs status for the cluster nodes
  # b. and get the rabbit pacemaker resource status,
  # c. and get the rabbitmqctl cluster_status output
  # 4) Report the estimated failover time
  # 5) Power on node-2 and wait until it has joined the rabbit cluster
  # 6) Repeat 1-5

  # Start a fresh report file; every later step appends to it.
  echo start > out.txt
  # Iteration counter, written to the report and shown when a split-brain is caught.
  count=0
  #######################################
  # Block until the OSTF HA health check for env 1 reports a clean pass.
  #
  # Polls `fuel health --check ha --env 1` and inspects its stdout. A report
  # counts as passed only when it is non-empty and contains none of the
  # substrings "fail", "skip" or "err". An empty report (for example when the
  # fuel command itself fails, since its stderr is discarded) is retried
  # instead of being mistaken for a pass.
  # Globals:
  #   out    (written) - stdout of the most recent health check
  #   passed (written) - 1 on return; left global, as it was before
  #   OSTF_RETRY_DELAY (read, optional) - seconds to sleep after a check that
  #                      produced no report; defaults to 1
  # Arguments: none
  # Returns:   0 once a check has passed; does not return until then
  #######################################
  wait_ostf_ha() {
    passed=0
    while [[ ${passed} -ne 1 ]]; do
      # stderr is dropped on purpose: only the report on stdout is examined.
      out=$(fuel health --check ha --env 1 2>/dev/null)
      if [[ -z "${out//[[:space:]]/}" ]]; then
        # No report at all is not a pass; back off briefly so a command that
        # fails instantly does not turn this into a busy loop.
        sleep "${OSTF_RETRY_DELAY:-1}"
        continue
      fi
      # grep exits 0 on a match (something still failing), 1 on no match
      # (all clear) and >1 on error; only an exit status of 1 ends the loop.
      grep -q -e fail -e skip -e err <<<"${out}"
      passed=$?
    done
  }
  # before
  # Drop any location constraint left over from an earlier run. A failure here
  # (nothing to delete) is expected and ignored; `|| true` is used instead of
  # piping into `true`, which could kill pcs with SIGPIPE.
  ssh node-1 "pcs constraint delete location-p_rabbitmq-server >/dev/null 2>&1 || true"

  # Refuse to start while node-2 is powered off. The check must be an `if` in
  # the current shell: an `exit` inside a ( ... ) subshell ends only that
  # subshell and the script would carry on regardless.
  # NOTE(review): the fence credentials are hard-coded on the command line.
  if fence_virsh -o status -a 10.109.0.1 --shell-timeout 10 --login-timeout 10 -l fence -p 0ff3nc3 -n env61_2_slave-03 2>&1 | grep -q OFF; then
    echo "Power on the node-2 first!" >&2
    exit 1
  fi
  # start test
  # Each iteration: pin the rabbit master to node-2, time how long the OSTF HA
  # check takes to pass, power node-2 off, time the recovery again, record the
  # corosync / pacemaker / rabbit status reported via mco, then power node-2
  # back on and wait for it to rejoin the rabbit cluster. The loop stops only
  # when the corosync status looks like a split-brain.

  # Print $1 folded onto one space-separated line, exactly as an unquoted
  # `echo $out` would, but with globbing switched off so that characters such
  # as `*` or `[` in the captured output are never expanded into file names.
  # Runs in a subshell so `set -f` does not leak into the caller.
  flatten() (
    set -f
    # shellcheck disable=SC2086  # word splitting is the whole point here
    echo $1
  )

  while true; do
    echo "Iteration ${count}" >> out.txt

    failover_start_time=$(date +%s)
    # Move the rabbit master to the node-2: ban the master role everywhere
    # else, give pacemaker a moment to react, then lift the ban again.
    # `|| true` replaces the original `| true`, which could kill pcs with SIGPIPE.
    ssh node-1 "pcs constraint delete location-p_rabbitmq-server >/dev/null 2>&1 || true"
    ssh node-1 "pcs constraint location p_rabbitmq-server rule role=master score=-INFINITY \#uname ne node-2.test.domain.local"
    sleep 2
    ssh node-1 "pcs constraint delete location-p_rabbitmq-server"

    # Wait for ostf ha passed
    wait_ostf_ha
    now_time=$(date +%s)
    failover_time=$(( now_time - failover_start_time ))
    echo "Failover (moved master) time was: ${failover_time}" >> out.txt

    # Kill the node running the rabbit master resource (node-2)
    fence_virsh -o off -a 10.109.0.1 --shell-timeout 10 --login-timeout 10 -l fence -p 0ff3nc3 -n env61_2_slave-03 >> out.txt

    failover_start_time=$(date +%s)
    # Wait for ostf ha passed
    wait_ostf_ha
    now_time=$(date +%s)
    failover_time=$(( now_time - failover_start_time ))
    echo "Failover (master node destroyed) time was: ${failover_time}" >> out.txt

    # Check for corosync clusterstatus
    # NOTE(review): the trailing `&` inside the substitution is kept from the
    # original; the substitution still collects the command's output.
    # Presumably it detaches dockerctl from the script's stdin - confirm
    # before removing it.
    out=$(dockerctl shell astute bash -c "mco rpc -v execute_shell_command execute cmd=\"hostname; date; crm_mon -1 | grep -i -e 'DC.*node' -e 'line:.*node' -e 'node.*UNCLEAN'\" | grep -A3 -e \"^[123]\"" 2>&1 &)
    sleep 5
    # Fold the output to one line, then break it on backslashes / newlines.
    flatten "${out}" | tr -s "\\\\\n" "\n" | grep "^n\w.*" >> out.txt
    flatten "${out}" | grep -Eq -e UNCLEAN -e WITHOUT -e 'OFFLINE: \[ (\S+) (\S+) \]'
    rc=$?

    # Stop the whole test on a suspected split-brain. `break` has to run in
    # the current shell; inside a ( ... ) subshell it cannot leave this loop.
    if [[ ${rc} -eq 0 ]]; then
      echo "CAUGHT COROSYNC SPLITBRAIN at $count"
      break
    fi

    # Check for rabbit resource status
    out=$(dockerctl shell astute bash -c "mco rpc -v execute_shell_command execute cmd=\"hostname; date; pcs resource | grep -A2 master_p_rabbitmq-server\" | grep -A3 -e \"^[123]\"" 2>&1 &)
    sleep 5
    flatten "${out}" | tr -s "\\\\\n" "\n" | grep -v stderr >> out.txt

    # Check for rabbit cluster status
    out=$(dockerctl shell astute bash -c "mco rpc -v execute_shell_command execute cmd=\"hostname; date; rabbitmqctl cluster_status | grep -e nodes -e running\" | grep -A3 -e \"^[123]\"" 2>&1 &)
    sleep 5
    flatten "${out}" | tr -s "\\\\\n" "\n" | grep -v stderr >> out.txt

    # Power on the killed node-2
    fence_virsh -o on -a 10.109.0.1 --shell-timeout 10 --login-timeout 10 -l fence -p 0ff3nc3 -n env61_2_slave-03 >> out.txt
    # Do not reboot more often than each 10 minutes, or UMM will be activated

    # Wait for the node-2 joined rabbit cluster
    joined=1
    while [[ ${joined} -ne 0 ]]; do
      # Pattern is quoted so the shell cannot glob-expand it against the cwd.
      ssh node-1 rabbitmqctl cluster_status | grep -q 'running.*node-2'
      joined=$?
      sleep 10
    done

    # Next iter
    count=$(( count + 1 ))
  done
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement