Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/bin/bash
# Tests RabbitMQ master resource failover in a static scenario (the slave
# resources are not manipulated).
#
# Assumptions: environment 1 with nodes 1, 2 and 3. node-2 is always the
# master under the kill test; its virsh domain name is env61_2_slave-03.
#
# Each iteration:
#   1) Move the master to node-2.
#   2) Wait until the OSTF HA checks pass.
#   3) Kill node-2 and time the failover by waiting for the OSTF HA checks
#      to pass again. Also collect:
#        a. the pcs status of the cluster nodes,
#        b. the rabbit pacemaker resource status,
#        c. the rabbitmqctl cluster_status output.
#   4) Report the estimated failover time.
#   5) Power node-2 back on and wait for it to rejoin the rabbit cluster.
#   6) Repeat 1-5.
#
# Results are written to out.txt in the current directory.

# Start a fresh log; everything after this appends to it.
echo start > out.txt
# Iteration counter, reported in the log and in the split-brain message.
count=0
#######################################
# Block until `fuel health --check ha --env 1` produces a report that
# mentions no failed, skipped or errored test.
# Globals:   none (the report is kept in a local variable)
# Arguments: none
# Outputs:   nothing
# Returns:   0 once a clean report has been seen; never returns otherwise
#######################################
wait_ostf_ha() {
  local report
  while true; do
    # stderr is dropped, so a broken invocation shows up only as an empty
    # report.
    report=$(fuel health --check ha --env 1 2>/dev/null)

    # An empty report means the check did not run at all; it must not be
    # mistaken for "no failures found".
    [[ -n "${report}" ]] || continue

    grep -q -e fail -e skip -e err <<<"${report}"
    # grep status: 0 = a problem keyword was found, 1 = none found (clean
    # run), >1 = grep itself failed. Only the clean run ends the wait.
    if (( $? == 1 )); then
      return 0
    fi
  done
}
# Pre-flight checks, run once before the test loop.

# Drop any location constraint left on the rabbit resource. Errors are
# silenced and the trailing "| true" forces a zero status on the remote side,
# so a missing constraint is not treated as a failure.
ssh node-1 "pcs constraint delete location-p_rabbitmq-server 2>/dev/null | true"

# Ask the fence agent for the power state of node-2's VM and look for "OFF".
# NOTE(review): the fence credentials are hard-coded on the command line;
# consider reading them from the environment or a file.
fence_virsh -o status -a 10.109.0.1 --shell-timeout 10 --login-timeout 10 \
  -l fence -p 0ff3nc3 -n env61_2_slave-03 2>&1 | grep -q OFF

# grep status 1 means "OFF" was not reported; anything else aborts the run.
# The exit must happen in the current shell: wrapped in ( ... ) it would only
# leave a subshell and the test would carry on with node-2 powered off.
if [[ $? -ne 1 ]]; then
  echo "Power on the node-2 first!" >&2
  exit 1
fi
# start test

#######################################
# Apply a power action to node-2's VM through the fence_virsh agent.
# Arguments: $1 - fence action, e.g. "off" or "on"
# Outputs:   whatever fence_virsh prints
# NOTE(review): the fence credentials are hard-coded; consider reading them
# from the environment or a file.
#######################################
fence_node2() {
  fence_virsh -o "$1" -a 10.109.0.1 --shell-timeout 10 --login-timeout 10 \
    -l fence -p 0ff3nc3 -n env61_2_slave-03
}

#######################################
# Run "hostname; date; <command>" through `mco rpc execute_shell_command`
# inside the astute container and print the reply, keeping the lines that
# start with 1, 2 or 3 plus three lines after each (stderr is merged in).
# Arguments: $1 - shell command text appended after "hostname; date; "
#######################################
mco_exec() {
  dockerctl shell astute bash -c "mco rpc -v execute_shell_command execute cmd=\"hostname; date; $1\" | grep -A3 -e \"^[123]\"" 2>&1
}

while true; do
  echo "Iteration ${count}" >> out.txt
  failover_start_time=$(date +%s)

  # Move the rabbit master to node-2: clear any old constraint, forbid the
  # master role everywhere except node-2, then remove the constraint again
  # after a short pause.
  ssh node-1 "pcs constraint delete location-p_rabbitmq-server 2>/dev/null | true"
  ssh node-1 "pcs constraint location p_rabbitmq-server rule role=master score=-INFINITY \#uname ne node-2.test.domain.local"
  sleep 2
  ssh node-1 "pcs constraint delete location-p_rabbitmq-server"

  # Wait for the OSTF HA checks to pass, then log how long the move took.
  wait_ostf_ha
  now_time=$(date +%s)
  failover_time=$(( now_time - failover_start_time ))
  echo "Failover (moved master) time was: ${failover_time}" >> out.txt

  # Kill the node running the rabbit master resource (node-2).
  fence_node2 off >> out.txt
  failover_start_time=$(date +%s)

  # Wait for the OSTF HA checks to pass again, then log the failover time.
  wait_ostf_ha
  now_time=$(date +%s)
  failover_time=$(( now_time - failover_start_time ))
  echo "Failover (master node destroyed) time was: ${failover_time}" >> out.txt

  # Check the corosync cluster status.
  # The command substitution already waits for the command to finish, so it
  # is not put in the background; the pause is kept to preserve pacing.
  out=$(mco_exec "crm_mon -1 | grep -i -e 'DC.*node' -e 'line:.*node' -e 'node.*UNCLEAN'")
  sleep 5
  # Unquoted on purpose: word splitting flattens the reply to one line, then
  # tr turns every backslash/newline run into a line break and grep keeps the
  # lines starting with "n" (presumably what follows escaped "\n" sequences
  # in the reply -- confirm against real mco output).
  # shellcheck disable=SC2086
  echo $out | tr -s "\\\\\n" "\n" | grep "^n\w.*" >> out.txt
  # Stop the whole test when a split brain is detected. The break must run in
  # the current shell: inside ( ... ) it would not leave the loop.
  # shellcheck disable=SC2086
  if echo $out | grep -HE -q -e UNCLEAN -e WITHOUT -e 'OFFLINE: \[ (\S+) (\S+) \]'; then
    echo "CAUGHT COROSYNC SPLITBRAIN at $count"
    break
  fi

  # Check the rabbit pacemaker resource status.
  out=$(mco_exec "pcs resource | grep -A2 master_p_rabbitmq-server")
  sleep 5
  # shellcheck disable=SC2086
  echo $out | tr -s "\\\\\n" "\n" | grep -v stderr >> out.txt

  # Check the rabbit cluster status.
  out=$(mco_exec "rabbitmqctl cluster_status | grep -e nodes -e running")
  sleep 5
  # shellcheck disable=SC2086
  echo $out | tr -s "\\\\\n" "\n" | grep -v stderr >> out.txt

  # Power the killed node-2 back on.
  fence_node2 on >> out.txt

  # Do not reboot more often than every 10 minutes, or UMM will be activated.
  # Wait until node-2 shows up among the running rabbit cluster nodes,
  # probing every 10 seconds.
  until ssh node-1 rabbitmqctl cluster_status | grep -q 'running.*node-2'; do
    sleep 10
  done
  # Same 10 s pause after a successful probe as the original loop had.
  sleep 10

  # Next iteration.
  count=$(( count + 1 ))
done
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement