Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions ops/pod_replace_drain_agent.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/usr/bin/env bash

#######################################
# Print every task in state TASK_RUNNING that is placed on one Mesos agent.
#
# Fetches <core.dcos_url>/mesos/state with the ACS token of the currently
# configured `dcos` CLI and filters the result with jq.
#
# Arguments: $1 - Mesos agent ID to match against each task's `slave_id`
# Outputs:   stdout - one JSON object per matching task with the keys
#                     id, name, framework_id, executor_id and state
# Returns:   exit status of jq (`-e`: non-zero when nothing was produced)
#######################################
function mesos_agent_get_tasks () {
  local agent_id=$1

  # curl flags: -k skips TLS certificate verification, -s/-S silence progress
  # output but keep error messages, -L follows redirects.
  #
  # The agent ID is handed to jq as a variable (--arg) rather than being
  # spliced into the filter text, so quotes or jq syntax inside the ID can
  # never alter the filter.
  curl -skSL \
    -H "Authorization: token=$(dcos config show core.dcos_acs_token)" \
    -H "Content-Type: application/json" \
    "$(dcos config show core.dcos_url)/mesos/state" \
    | jq -er --arg agent_id "${agent_id}" '
        .frameworks[].tasks[]
        | select(.slave_id == $agent_id)
        | select(.state == "TASK_RUNNING")
        | {id: .id, name: .name, framework_id: .framework_id, executor_id: .executor_id, state: .state}'
}

#######################################
# Look up the name of a Mesos framework by its ID.
#
# Queries <core.dcos_url>/mesos/frameworks?framework_id=<id> using the ACS
# token of the currently configured `dcos` CLI.
#
# Arguments: $1 - Mesos framework ID
# Outputs:   stdout - the `name` of every framework in the response
# Returns:   exit status of jq (`-e`: non-zero when nothing was produced)
#######################################
function mesos_framework_name_for_id() {
  local fw_id=$1
  local acs_token
  local cluster_url

  # Resolve the credentials and the base URL first, token before URL.
  acs_token=$(dcos config show core.dcos_acs_token)
  cluster_url=$(dcos config show core.dcos_url)

  curl -skSL \
    -H "Authorization: token=${acs_token}" \
    -H "Content-Type: application/json" \
    "${cluster_url}/mesos/frameworks?framework_id=${fw_id}" \
    | jq -er '.frameworks[].name'
}

# ---------------------------------------------------------------------------
# Main: print one `dcos kafka ... pod replace` command for every SDK pod that
# has a running task on the given Mesos agent. Nothing is replaced here; the
# commands are only printed so the caller can review or execute them.
#
# Usage:   bash pod_replace_drain_agent.sh <mesos-agent-id>
# Outputs: stdout - the replace commands, one per line
#          stderr - progress messages
# ---------------------------------------------------------------------------
agent_id=${1:-}
if [[ -z "${agent_id}" ]]; then
  echo "Usage: ${0##*/} <mesos-agent-id>" >&2
  exit 2
fi

echo "Getting the list of tasks on ${agent_id} ..." >&2
tasks=$(mesos_agent_get_tasks "${agent_id}")
task_names=$(jq -er '.name' <<<"${tasks}")

# The framework IDs are read from fd 3 so that no command inside the loop can
# consume the list by reading stdin. awk drops duplicate IDs while keeping
# their original order, also when the duplicates are not adjacent.
while IFS= read -r framework_id <&3; do
  framework_name=$(mesos_framework_name_for_id "${framework_id}")
  echo "Framework ${framework_name} has tasks on node." >&2

  # It does not matter which SDK sub-command is used here (kafka is just one
  # of them), as long as the framework is an SDK service and --name points to
  # it. Frameworks for which no pod list can be obtained are skipped.
  pod_list=$(dcos kafka --name="${framework_name}" pod list | jq -er '.[]' 2>/dev/null)
  [[ -n "${pod_list}" ]] || continue

  while IFS= read -r pod; do
    # A task belongs to a pod when its name is the pod name itself or the pod
    # name followed by "-". A bare prefix match would also select pod
    # "kafka-1" for a task named "kafka-10-broker" and trigger a replace of
    # the wrong pod. The comparison is literal, not a regular expression.
    if awk -v pod="${pod}" '
         $0 == pod || index($0, pod "-") == 1 { found = 1; exit }
         END { exit !found }' <<<"${task_names}"; then
      echo "Pod to issue replace for \"${pod}\":" >&2
      # %q shell-quotes both values so the printed command stays correct when
      # it is later executed, whatever characters the names contain.
      printf 'dcos kafka --name=%q pod replace %q\n' "${framework_name}" "${pod}"
    fi
  done <<<"${pod_list}"
done 3< <(jq -r '.framework_id' <<<"${tasks}" | awk '!seen[$0]++')
41 changes: 41 additions & 0 deletions ops/safe_node_decommission.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Replace an agent node that runs data services and/or K8s

## 1. List the nodes to decommission and have each node's Mesos agent ID ready
```
agent1-cluster111.team.acme.com 172.31.3.226 85409ae2-9f36-4738-ae5d-712bba77ebc3-S13
agent2-cluster111.team.acme.com 172.31.14.16 aaf0a62f-a6eb-4c1d-80db-5fdd26fe8008-S2
[...]
```

## 2. Move one-by-one: Set the first agent into maintenance mode
```
# taken from https://docs.mesosphere.com/1.11/administering-clusters/update-a-node/
cat <<EOF > maintenance.json
{
"windows" : [
{
"machine_ids" : [
{ "hostname" : "agent1-cluster111.team.acme.com", "ip" : "172.31.3.226" }
],
"unavailability" : {
"start" : { "nanoseconds" : 1 },
"duration" : { "nanoseconds" : 3600000000000 }
}
}
]
}
EOF
bash ../mesos/maintain-agents.sh maintenance.json
```

## 3. Issue pod replacements for each SDK-based service
```
# Quote the command substitution so that every generated command stays on its own line.
eval "$(bash pod_replace_drain_agent.sh '85409ae2-9f36-4738-ae5d-712bba77ebc3-S13')"
```

## 4. Verify that the node is fully drained of SDK services, then wait until each service is healthy and running on a new node

## 5. Decommission node
```
dcos node decommission 85409ae2-9f36-4738-ae5d-712bba77ebc3-S13
```