 #!/usr/bin/env bash

+<% if p('cf_mysql_enabled') == true %>
 set -e -o pipefail

 <%
-  require "shellwords"
-
   cluster_ips = link('mysql').instances.map(&:address)
   if_link('arbitrator') do
     cluster_ips += link('arbitrator').instances.map(&:address)
   end
 %>

-CLUSTER_NODES=(<%= cluster_ips.map{|e| Shellwords.escape e}.join(' ') %>)
-MYSQL_PORT=<%= Shellwords.escape p("cf_mysql.mysql.port") %>
-
-function prepend_datetime() {
-  awk -W interactive '{ system("echo -n [$(date +%FT%T%z)]"); print " " $0 }'
-}
-
-function wsrep_var() {
-  local var_name="$1"
-  local host="$2"
-  if [[ $var_name =~ ^wsrep_[a-z_]+$ ]]; then
-    timeout 5 \
-      /usr/local/bin/mysql --defaults-file=/var/vcap/jobs/mysql/config/drain.cnf -h "$host" -P "$MYSQL_PORT" \
-      --execute="SHOW STATUS LIKE '$var_name'" -N \
-      | awk '{print $2}' \
-      | tr -d '\n'
-  fi
-}
-
+CLUSTER_NODES=(<%= cluster_ips.map{|e| e }.join(' ') %>)
+MYSQL_PORT=<%= p("cf_mysql.mysql.port") %>
+GALERA_HEALTHCHECK_PORT=<%= p("cf_mysql.mysql.galera_healthcheck.port") %>
 LOG_DIR="/var/vcap/sys/log/mysql"

-exec 3>&1
-exec \
-  1> >(prepend_datetime >> $LOG_DIR/drain.out.log) \
-  2> >(prepend_datetime >> $LOG_DIR/drain.err.log)
-
-# if the node ain't running, ain't got nothin' to drain
-if ! ps -p $(</var/vcap/sys/run/mysql/mysql.pid) >/dev/null; then
-  echo "mysql is not running: drain OK"
-  echo 0 >&3; exit 0 # drain success
+# If the node is not running, exit drain successfully
+if ! ps -p "$(</var/vcap/sys/run/mysql/mysql.pid)" >/dev/null; then
+  echo "$(date): mysql is not running: OK to drain" >> "${LOG_DIR}/drain.log"
+  echo 0; exit 0 # drain success
 fi

-# Check each cluster node's availability.
-# Jump to next node if unreachable(timeout 5 sec), then do not add it as test component.
-# Node may have been deleted or mysql port has been updated.
+# Check the galera healthcheck endpoint on all of the nodes. If the http status returned is 000, there
+# is no node at that IP, so we assume we are scaling down. If the http status returned is 200 from all nodes
+# it will continue to drain. If it detects any other nodes to be unhealthy, it will fail to drain
+# and exit.
 for NODE in "${CLUSTER_NODES[@]}"; do
-  { nc -zv -w 5 $NODE $MYSQL_PORT \
-    && CLUSTER_TEST_NODES=(${CLUSTER_TEST_NODES[@]} $NODE); } \
-    || continue
-done
-
-# Check if all nodes are part of the PRIMARY component; if not then
-# something is terribly wrong (loss of quorum or split-brain) and doing a
-# rolling restart can actually cause data loss (e.g. if a node that is out
-# of sync is used to bootstrap the cluster): in this case we fail immediately.
-for TEST_NODE in "${CLUSTER_TEST_NODES[@]}"; do
-  cluster_status=$(wsrep_var wsrep_cluster_status "$TEST_NODE")
-  if [ "$cluster_status" != Primary ]; then
-    echo "wsrep_cluster_status of node '$TEST_NODE' is '$cluster_status' (expected 'Primary'): drain failed"
-    exit -1 # drain failed
+  set +e
+  status_code=$(curl -s -o "/dev/null" -w "%{http_code}" "$NODE:$GALERA_HEALTHCHECK_PORT")
+  set -e
+  if [[ $status_code -eq 000 || $status_code -eq 200 ]]; then
+    continue
+  else
+    echo "$(date): galera healthcheck returned $status_code; drain failed on node ${NODE}" >> "${LOG_DIR}/drain.err.log"
+    exit -1
   fi
 done

-# Check if all nodes are synced: if not we wait and retry
-# This check must be done against *ALL* nodes, not just against the local node.
-# Consider a 3 node cluster: if node1 is donor for node2 and we shut down node3
-# -that is synced- then node1 is joining, node2 is donor and node3 is down: as
-# a result the cluster lose quorum until node1/node2 complete the transfer!)
-for TEST_NODE in "${CLUSTER_TEST_NODES[@]}"; do
-  state=$(wsrep_var wsrep_local_state_comment "$TEST_NODE")
-  if [ "$state" != Synced ]; then
-    echo "wsrep_local_state_comment of node '$TEST_NODE' is '$state' (expected 'Synced'): retry drain in 5 seconds"
-    # TODO: rewrite to avoid using dynamic drain (soon to be deprecated)
-    echo -5 >&3; exit 0 # retry in 5 seconds
-  fi
-done
+# Actually drain with a kill_and_wait on the mysql pid
+PIDFILE=/var/vcap/sys/run/mariadb_ctl/mariadb_ctl.pid
+source /var/vcap/packages/cf-mysql-common/pid_utils.sh
+
+set +e
+kill_and_wait "${PIDFILE}" 300 0 > /dev/null
+return_code=$?
+
+echo 0
+exit ${return_code}

-echo "Drain Success"
-echo 0 >&3; exit 0 # drain success
+<% else %>
+echo 0
+exit 0
+<% end %>
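
For reference, the per-node probe the new loop performs can be reproduced by hand against any cluster member. The sketch below is illustrative only: the IP and port are made-up example values (the real port comes from the cf_mysql.mysql.galera_healthcheck.port property rendered into GALERA_HEALTHCHECK_PORT), and the case arms simply restate the 000/200 handling the drain script applies.

#!/usr/bin/env bash
# Illustrative sketch; 10.244.1.2 and 9200 are placeholder values, not taken from this commit.
NODE="10.244.1.2"
GALERA_HEALTHCHECK_PORT="9200"

status_code=$(curl -s -o /dev/null -w "%{http_code}" "${NODE}:${GALERA_HEALTHCHECK_PORT}")
case "${status_code}" in
  000) echo "no listener at ${NODE} (treated as a node removed by scale-down)" ;;
  200) echo "node ${NODE} reports healthy" ;;
  *)   echo "node ${NODE} unhealthy (HTTP ${status_code}); drain would fail" ;;
esac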
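
On the output side, both versions follow the BOSH drain convention the script itself hints at: the integer written to stdout is what the director reads (0 means "drained, no extra wait"), while the removed code's `echo -5 >&3` used the dynamic-drain form, flagged in its own TODO as soon to be deprecated, to ask for a retry in 5 seconds, keeping fd 3 pointed at the real stdout after logs were redirected. A minimal skeleton of that contract, as a hedged sketch rather than anything taken from this commit:

#!/usr/bin/env bash
# Minimal drain-script skeleton (sketch; not part of this commit).
# BOSH reads a single integer from stdout:
#   N >= 0 -> drain finished, wait N seconds before stopping the job
#   N <  0 -> dynamic drain (deprecated): re-run this script after |N| seconds

# Keep the real stdout on fd 3 so only the integer reaches BOSH after redirection.
exec 3>&1 1>>/var/vcap/sys/log/mysql/drain.log 2>&1

echo "doing drain work"  # goes to the log file
echo 0 >&3               # tells BOSH the drain succeeded
exit 0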