
Commit e465f81

APShirley authored and pcf-core-services-writer committed
Use galera-healthcheck endpoint instead of connecting directly to mysql and drain with kill_and_wait
[#157656403] Signed-off-by: Joseph Palermo <[email protected]>
1 parent 01c449d commit e465f81

File tree

5 files changed: +34 -76 lines changed

jobs/mysql/spec

Lines changed: 0 additions & 5 deletions
@@ -4,8 +4,6 @@ name: mysql
 templates:
   disable_mysql_cli_history.sh.erb: config/disable_mysql_cli_history.sh
   drain.sh: bin/drain
-  drain_user_setup.sql.erb: config/drain_user_setup.sql
-  drain.cnf.erb: config/drain.cnf
   mariadb_ctl.erb: bin/mariadb_ctl
   my.cnf.erb: config/my.cnf
   mylogin.cnf.erb: config/mylogin.cnf
@@ -207,9 +205,6 @@ properties:
   cf_mysql.mysql.galera_healthcheck.db_password:
     description: 'Password used by the sidecar to connect to the database'
 
-  cf_mysql.mysql.drain.db_password:
-    description: 'Password used by the drain script to connect to the database. Use bosh "--skip-drain" flag when need to update drain db_password'
-
   cf_mysql.mysql.disable_auto_sst:
     description: 'When disable_auto_sst is true, nodes unable to IST will be prevented from automatically deleting their data and performing an SST'
     default: false

jobs/mysql/templates/drain.cnf.erb

Lines changed: 0 additions & 3 deletions
This file was deleted.

jobs/mysql/templates/drain.sh

Lines changed: 34 additions & 62 deletions
@@ -1,82 +1,54 @@
 #!/usr/bin/env bash
 
+<% if p('cf_mysql_enabled') == true %>
 set -e -o pipefail
 
 <%
-  require "shellwords"
-
   cluster_ips = link('mysql').instances.map(&:address)
   if_link('arbitrator') do
     cluster_ips += link('arbitrator').instances.map(&:address)
   end
 %>
 
-CLUSTER_NODES=(<%= cluster_ips.map{|e| Shellwords.escape e}.join(' ') %>)
-MYSQL_PORT=<%= Shellwords.escape p("cf_mysql.mysql.port") %>
-
-function prepend_datetime() {
-  awk -W interactive '{ system("echo -n [$(date +%FT%T%z)]"); print " " $0 }'
-}
-
-function wsrep_var() {
-  local var_name="$1"
-  local host="$2"
-  if [[ $var_name =~ ^wsrep_[a-z_]+$ ]]; then
-    timeout 5 \
-      /usr/local/bin/mysql --defaults-file=/var/vcap/jobs/mysql/config/drain.cnf -h "$host" -P "$MYSQL_PORT" \
-      --execute="SHOW STATUS LIKE '$var_name'" -N \
-      | awk '{print $2}' \
-      | tr -d '\n'
-  fi
-}
-
+CLUSTER_NODES=(<%= cluster_ips.map{|e| e }.join(' ') %>)
+MYSQL_PORT=<%= p("cf_mysql.mysql.port") %>
+GALERA_HEALTHCHECK_PORT=<%= p("cf_mysql.mysql.galera_healthcheck.port") %>
 LOG_DIR="/var/vcap/sys/log/mysql"
 
-exec 3>&1
-exec \
-  1> >(prepend_datetime >> $LOG_DIR/drain.out.log) \
-  2> >(prepend_datetime >> $LOG_DIR/drain.err.log)
-
-# if the node ain't running, ain't got nothin' to drain
-if ! ps -p $(</var/vcap/sys/run/mysql/mysql.pid) >/dev/null; then
-  echo "mysql is not running: drain OK"
-  echo 0 >&3; exit 0 # drain success
+# If the node is not running, exit drain successfully
+if ! ps -p "$(</var/vcap/sys/run/mysql/mysql.pid)" >/dev/null; then
+  echo "$(date): mysql is not running: OK to drain" >> "${LOG_DIR}/drain.log"
+  echo 0; exit 0 # drain success
 fi
 
-# Check each cluster node's availability.
-# Jump to next node if unreachable(timeout 5 sec), then do not add it as test component.
-# Node may have been deleted or mysql port has been updated.
+# Check the galera healthcheck endpoint on all of the nodes. If the http status returned is 000, there
+# is no node at that IP, so we assume we are scaling down. If the http status returned is 200 from all nodes
+# it will continue to drain. If it detects any other nodes to be unhealthy, it will fail to drain
+# and exit.
 for NODE in "${CLUSTER_NODES[@]}"; do
-  { nc -zv -w 5 $NODE $MYSQL_PORT \
-    && CLUSTER_TEST_NODES=(${CLUSTER_TEST_NODES[@]} $NODE); } \
-    || continue
-done
-
-# Check if all nodes are part of the PRIMARY component; if not then
-# something is terribly wrong (loss of quorum or split-brain) and doing a
-# rolling restart can actually cause data loss (e.g. if a node that is out
-# of sync is used to bootstrap the cluster): in this case we fail immediately.
-for TEST_NODE in "${CLUSTER_TEST_NODES[@]}"; do
-  cluster_status=$(wsrep_var wsrep_cluster_status "$TEST_NODE")
-  if [ "$cluster_status" != Primary ]; then
-    echo "wsrep_cluster_status of node '$TEST_NODE' is '$cluster_status' (expected 'Primary'): drain failed"
-    exit -1 # drain failed
+  set +e
+  status_code=$(curl -s -o "/dev/null" -w "%{http_code}" "$NODE:$GALERA_HEALTHCHECK_PORT")
+  set -e
+  if [[ $status_code -eq 000 || $status_code -eq 200 ]]; then
+    continue
+  else
+    echo "$(date): galera heathcheck returned $status_code; drain failed on node ${NODE}" >> "${LOG_DIR}/drain.err.log"
+    exit -1
   fi
 done
 
-# Check if all nodes are synced: if not we wait and retry
-# This check must be done against *ALL* nodes, not just against the local node.
-# Consider a 3 node cluster: if node1 is donor for node2 and we shut down node3
-# -that is synced- then node1 is joining, node2 is donor and node3 is down: as
-# a result the cluster lose quorum until node1/node2 complete the transfer!)
-for TEST_NODE in "${CLUSTER_TEST_NODES[@]}"; do
-  state=$(wsrep_var wsrep_local_state_comment "$TEST_NODE")
-  if [ "$state" != Synced ]; then
-    echo "wsrep_local_state_comment of node '$TEST_NODE' is '$state' (expected 'Synced'): retry drain in 5 seconds"
-    # TODO: rewrite to avoid using dynamic drain (soon to be deprecated)
-    echo -5 >&3; exit 0 # retry in 5 seconds
-  fi
-done
+# Actually drain with a kill_and_wait on the mysql pid
+PIDFILE=/var/vcap/sys/run/mariadb_ctl/mariadb_ctl.pid
+source /var/vcap/packages/cf-mysql-common/pid_utils.sh
+
+set +e
+kill_and_wait "${PIDFILE}" 300 0 > /dev/null
+return_code=$?
+
+echo 0
+exit ${return_code}
 
-echo "Drain Success"
-echo 0 >&3; exit 0 # drain success
+<% else %>
+echo 0
+exit 0
+<% end %>
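
For reference, a minimal standalone sketch of the probe the new drain script runs against each node. The IP address and port 9200 below are illustrative assumptions, not values from this commit; the real script renders the node list from the mysql/arbitrator links and the port from the cf_mysql.mysql.galera_healthcheck.port property.

#!/usr/bin/env bash
# Hedged sketch: probe one node's galera-healthcheck endpoint the way the new
# drain script does. NODE_IP and HEALTHCHECK_PORT are placeholder values.
NODE_IP="10.0.16.10"
HEALTHCHECK_PORT="9200"

# curl reports 000 when nothing answers at the address (node likely removed
# during a scale-down) and 200 when galera-healthcheck says the node is healthy.
status_code=$(curl -s -o /dev/null -w "%{http_code}" "${NODE_IP}:${HEALTHCHECK_PORT}")

if [[ "${status_code}" == "000" || "${status_code}" == "200" ]]; then
  echo "node ${NODE_IP}: ok to continue draining (status ${status_code})"
else
  echo "node ${NODE_IP}: unhealthy (status ${status_code}); drain would fail" >&2
  exit 1
fi

Once every node passes this check, the script stops mysql by calling kill_and_wait from the sourced pid_utils.sh against the mariadb_ctl pidfile and echoes 0, which is how a BOSH drain script signals that the job can be stopped without further waiting.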

jobs/mysql/templates/drain_user_setup.sql.erb

Lines changed: 0 additions & 5 deletions
This file was deleted.

jobs/mysql/templates/mariadb_ctl_config.yml.erb

Lines changed: 0 additions & 1 deletion
@@ -39,7 +39,6 @@ Db:
     Password: <%= seed["password"] %>
   <% end %>
   PostStartSQLFiles:
-  - /var/vcap/jobs/mysql/config/drain_user_setup.sql
   - /var/vcap/jobs/mysql/config/galera_healthcheck_setup.sql
   - /var/vcap/jobs/mysql/config/cluster_health_logger_setup.sql
   Socket: /var/vcap/sys/run/mysql/mysqld.sock

0 commit comments
