Skip to content

Commit 7280716

Browse files
authored
CLOUDP-75590: Opensource Readiness Probe Code (#337)
1 parent 6b76b5a commit 7280716

22 files changed

+1140
-4
lines changed

cmd/readiness/main.go

Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
package main
2+
3+
import (
4+
"encoding/json"
5+
"fmt"
6+
"io/ioutil"
7+
"os"
8+
"time"
9+
10+
"github.com/mongodb/mongodb-kubernetes-operator/pkg/readiness/config"
11+
"github.com/mongodb/mongodb-kubernetes-operator/pkg/readiness/headless"
12+
"github.com/mongodb/mongodb-kubernetes-operator/pkg/readiness/health"
13+
"github.com/mongodb/mongodb-kubernetes-operator/pkg/util/contains"
14+
15+
"k8s.io/client-go/kubernetes"
16+
"k8s.io/client-go/rest"
17+
18+
"go.uber.org/zap"
19+
)
20+
21+
const (
22+
headlessAgent = "HEADLESS_AGENT"
23+
)
24+
25+
var riskySteps []string
26+
var logger *zap.SugaredLogger
27+
28+
func init() {
29+
riskySteps = []string{"WaitAllRsMembersUp", "WaitRsInit"}
30+
31+
// By default we log to the output (convenient for tests)
32+
cfg := zap.NewDevelopmentConfig()
33+
log, err := cfg.Build()
34+
if err != nil {
35+
panic(err)
36+
}
37+
logger = log.Sugar()
38+
}
39+
40+
// isPodReady main function which makes decision if the pod is ready or not. The decision is based on the information
41+
// from the AA health status file.
42+
// The logic depends on if the pod is a standard MongoDB or an AppDB one.
43+
// - If MongoDB: then just the 'statuses[0].IsInGoalState` field is used to learn if the Agent has reached the goal
44+
// - if AppDB: the 'mmsStatus[0].lastGoalVersionAchieved' field is compared with the one from mounted automation config
45+
// Additionally if the previous check hasn't returned 'true' the "deadlock" case is checked to make sure the Agent is
46+
// not waiting for the other members.
47+
func isPodReady(conf config.Config) bool {
48+
fd, err := os.Open(conf.HealthStatusFilePath)
49+
if err != nil {
50+
logger.Warn("No health status file exists, assuming the Automation agent is old")
51+
return true
52+
}
53+
defer fd.Close()
54+
55+
health, err := readAgentHealthStatus(fd)
56+
if err != nil {
57+
logger.Errorf("Failed to read agent health status file: %s", err)
58+
// panicking allows to see the problem in the events for the pod (kubectl describe pod ..)
59+
panic("Failed to read agent health status file: %s")
60+
}
61+
62+
// The 'statuses' file can be empty only for OM Agents
63+
if len(health.Healthiness) == 0 && !isHeadlessMode() {
64+
logger.Info("'statuses' is empty. We assume there is no automation config for the agent yet.")
65+
return true
66+
}
67+
68+
// If the agent has reached the goal state - returning true
69+
ok, err := isInGoalState(health, conf)
70+
71+
if err != nil {
72+
logger.Errorf("There was problem checking the health status: %s", err)
73+
panic(err)
74+
}
75+
76+
if ok {
77+
logger.Info("Agent has reached goal state")
78+
return true
79+
}
80+
81+
// Failback logic: the agent is not in goal state and got stuck in some steps
82+
if hasDeadlockedSteps(health) {
83+
return true
84+
}
85+
86+
return false
87+
}
88+
89+
func readAgentHealthStatus(file *os.File) (health.Status, error) {
90+
var health health.Status
91+
92+
data, err := ioutil.ReadAll(file)
93+
if err != nil {
94+
return health, err
95+
}
96+
97+
err = json.Unmarshal(data, &health)
98+
return health, err
99+
}
100+
101+
// hasDeadlockedSteps returns true if the agent is stuck on waiting for the other agents
102+
func hasDeadlockedSteps(health health.Status) bool {
103+
currentStep := findCurrentStep(health.ProcessPlans)
104+
if currentStep != nil {
105+
return isDeadlocked(currentStep)
106+
}
107+
return false
108+
}
109+
110+
// findCurrentStep returns the step which seems to be run by the Agent now. The step is always in the last plan
111+
// (see https://github.com/10gen/ops-manager-kubernetes/pull/401#discussion_r333071555) so we iterate over all the steps
112+
// there and find the last step which has "Started" non nil
113+
// (indeed this is not the perfect logic as sometimes the agent doesn't update the 'Started' as well - see
114+
// 'health-status-ok.json', but seems it works for finding deadlocks still
115+
//noinspection GoNilness
116+
func findCurrentStep(processStatuses map[string]health.MmsDirectorStatus) *health.StepStatus {
117+
var currentPlan *health.PlanStatus
118+
if len(processStatuses) == 0 {
119+
// Seems shouldn't happen but let's check anyway - may be needs to be changed to Info if this happens
120+
logger.Warnf("There is no information about Agent process plans")
121+
return nil
122+
}
123+
if len(processStatuses) > 1 {
124+
logger.Errorf("Only one process status is expected but got %d!", len(processStatuses))
125+
return nil
126+
}
127+
// There is always only one process managed by the Agent - so there will be only one loop
128+
for k, v := range processStatuses {
129+
if len(v.Plans) == 0 {
130+
logger.Errorf("The process %s doesn't contain any plans!", k)
131+
return nil
132+
}
133+
currentPlan = v.Plans[len(v.Plans)-1]
134+
}
135+
136+
if currentPlan.Completed != nil {
137+
logger.Debugf("The Agent hasn't reported working on the new config yet, the last plan finished at %s",
138+
currentPlan.Completed.Format(time.RFC3339))
139+
return nil
140+
}
141+
142+
var lastStartedStep *health.StepStatus
143+
for _, m := range currentPlan.Moves {
144+
for _, s := range m.Steps {
145+
if s.Started != nil {
146+
lastStartedStep = s
147+
}
148+
}
149+
}
150+
151+
return lastStartedStep
152+
}
153+
154+
func isDeadlocked(status *health.StepStatus) bool {
155+
// Some logic behind 15 seconds: the health status file is dumped each 10 seconds so we are sure that if the agent
156+
// has been in the the step for 10 seconds - this means it is waiting for the other hosts and they are not available
157+
fifteenSecondsAgo := time.Now().Add(time.Duration(-15) * time.Second)
158+
if contains.String(riskySteps, status.Step) && status.Completed == nil && status.Started.Before(fifteenSecondsAgo) {
159+
logger.Infof("Indicated a possible deadlock, status: %s, started at %s but hasn't finished "+
160+
"yet. Marking the probe as ready", status.Step, status.Started.Format(time.RFC3339))
161+
return true
162+
}
163+
return false
164+
}
165+
166+
func isInGoalState(health health.Status, conf config.Config) (bool, error) {
167+
if isHeadlessMode() {
168+
return headless.PerformCheckHeadlessMode(health, conf)
169+
}
170+
return performCheckOMMode(health), nil
171+
}
172+
173+
// performCheckOMMode does a general check if the Agent has reached the goal state - must be called when Agent is in
174+
// "OM mode"
175+
func performCheckOMMode(health health.Status) bool {
176+
for _, v := range health.Healthiness {
177+
logger.Debug(v)
178+
if v.IsInGoalState {
179+
return true
180+
}
181+
}
182+
return false
183+
}
184+
185+
func isHeadlessMode() bool {
186+
return os.Getenv(headlessAgent) == "true"
187+
}
188+
189+
func kubernetesClientset() (kubernetes.Interface, error) {
190+
config, err := rest.InClusterConfig()
191+
if err != nil {
192+
return nil, fmt.Errorf("failed to get in cluster config: %s", err)
193+
}
194+
// creates the clientset
195+
clientset, err := kubernetes.NewForConfig(config)
196+
if err != nil {
197+
return nil, fmt.Errorf("failed to build config: %s", err)
198+
}
199+
return clientset, nil
200+
}
201+
func main() {
202+
clientSet, err := kubernetesClientset()
203+
if err != nil {
204+
panic(err)
205+
}
206+
207+
config, err := config.BuildFromEnvVariables(clientSet, isHeadlessMode())
208+
if err != nil {
209+
panic(err)
210+
}
211+
cfg := zap.NewDevelopmentConfig()
212+
// In production we log to the file
213+
cfg.OutputPaths = []string{
214+
config.LogFilePath,
215+
}
216+
log, err := cfg.Build()
217+
if err != nil {
218+
panic(err)
219+
}
220+
logger = log.Sugar()
221+
if !isPodReady(config) {
222+
os.Exit(1)
223+
}
224+
}

cmd/readiness/readiness_test.go

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
package main
2+
3+
import (
4+
"context"
5+
"os"
6+
"testing"
7+
"time"
8+
9+
"github.com/mongodb/mongodb-kubernetes-operator/cmd/readiness/testdata"
10+
"github.com/mongodb/mongodb-kubernetes-operator/pkg/readiness/config"
11+
"github.com/mongodb/mongodb-kubernetes-operator/pkg/readiness/health"
12+
13+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
14+
"k8s.io/client-go/kubernetes/fake"
15+
16+
"github.com/stretchr/testify/assert"
17+
)
18+
19+
// TestDeadlockDetection verifies that if the agent is stuck in "WaitAllRsMembersUp" phase (started > 15 seconds ago)
20+
// then the function returns "ready"
21+
func TestDeadlockDetection(t *testing.T) {
22+
assert.True(t, isPodReady(testConfig("testdata/health-status-deadlocked.json")))
23+
}
24+
25+
// TestNoDeadlock verifies that if the agent has started (but not finished) "WaitRsInit" and then there is another
26+
// started phase ("WaitFeatureCompatibilityVersionCorrect") then no deadlock is found as the latter is considered to
27+
// be the "current" step
28+
func TestNoDeadlock(t *testing.T) {
29+
health := readHealthinessFile("testdata/health-status-no-deadlock.json")
30+
stepStatus := findCurrentStep(health.ProcessPlans)
31+
32+
assert.Equal(t, "WaitFeatureCompatibilityVersionCorrect", stepStatus.Step)
33+
34+
assert.False(t, isPodReady(testConfig("testdata/health-status-no-deadlock.json")))
35+
}
36+
37+
// TestDeadlockDetection verifies that if the agent is in "WaitAllRsMembersUp" phase but started < 15 seconds ago
38+
// then the function returns "not ready". To achieve this "started" is put into some long future.
39+
// Note, that the status file is artificial: it has two plans (the first one is complete and has no moves) to make sure
40+
// the readiness logic takes only the last plan for consideration
41+
func TestNotReadyWaitingForRsReady(t *testing.T) {
42+
assert.False(t, isPodReady(testConfig("testdata/health-status-pending.json")))
43+
}
44+
45+
// TestNotReadyHealthFileHasNoPlans verifies that the readiness script doesn't panic if the health file has unexpected
46+
// data (there are no plans at all)
47+
func TestNotReadyHealthFileHasNoPlans(t *testing.T) {
48+
assert.False(t, isPodReady(testConfig("testdata/health-status-no-plans.json")))
49+
}
50+
51+
// TestNotReadyHealthFileHasNoProcesses verifies that the readiness script doesn't panic if the health file has unexpected
52+
// data (there are no processes at all)
53+
func TestNotReadyHealthFileHasNoProcesses(t *testing.T) {
54+
assert.False(t, isPodReady(testConfig("testdata/health-status-no-processes.json")))
55+
}
56+
57+
// TestReady verifies that the probe reports "ready" despite "WaitRsInit" stage reporting as not reached
58+
// (this is some bug in Automation Agent which we can work with)
59+
func TestReady(t *testing.T) {
60+
assert.True(t, isPodReady(testConfig("testdata/health-status-ok.json")))
61+
}
62+
63+
// TestNoDeadlockForDownloadProcess verifies that the steps not listed as "riskySteps" (like "download") are not
64+
// considered as stuck
65+
func TestNoDeadlockForDownloadProcess(t *testing.T) {
66+
before := time.Now().Add(time.Duration(-30) * time.Second)
67+
downloadStatus := &health.StepStatus{
68+
Step: "Download",
69+
Started: &before,
70+
Completed: nil,
71+
Result: "",
72+
}
73+
74+
assert.False(t, isDeadlocked(downloadStatus))
75+
}
76+
77+
// TestNoDeadlockForImmediateWaitRs verifies the "WaitRsInit" step is not marked as deadlocked if
78+
// it was started < 15 seconds ago
79+
func TestNoDeadlockForImmediateWaitRs(t *testing.T) {
80+
before := time.Now().Add(time.Duration(-10) * time.Second)
81+
downloadStatus := &health.StepStatus{
82+
Step: "WaitRsInit",
83+
Started: &before,
84+
Completed: nil,
85+
Result: "Wait",
86+
}
87+
88+
assert.False(t, isDeadlocked(downloadStatus))
89+
}
90+
91+
// TestHeadlessAgentHasntReachedGoal verifies that the probe reports "false" if the config version is higher than the
92+
// last achieved version of the Agent
93+
// Note that the edge case is checked here: the health-status-ok.json has the "WaitRsInit" phase stuck in the last plan
94+
// (as Agent doesn't marks all the step statuses finished when it reaches the goal) but this doesn't affect the result
95+
// as the whole plan is complete already
96+
func TestHeadlessAgentHasntReachedGoal(t *testing.T) {
97+
_ = os.Setenv(headlessAgent, "true")
98+
c := testConfig("testdata/health-status-ok.json")
99+
c.ClientSet = fake.NewSimpleClientset(testdata.TestPod(c.Namespace, c.Hostname), testdata.TestSecret(c.Namespace, c.AutomationConfigSecretName, 6))
100+
assert.False(t, isPodReady(c))
101+
thePod, _ := c.ClientSet.CoreV1().Pods(c.Namespace).Get(context.TODO(), c.Hostname, metav1.GetOptions{})
102+
assert.Equal(t, map[string]string{"agent.mongodb.com/version": "5"}, thePod.Annotations)
103+
}
104+
105+
// TestHeadlessAgentReachedGoal verifies that the probe reports "true" if the config version is equal to the
106+
// last achieved version of the Agent
107+
func TestHeadlessAgentReachedGoal(t *testing.T) {
108+
_ = os.Setenv(headlessAgent, "true")
109+
c := testConfig("testdata/health-status-ok.json")
110+
c.ClientSet = fake.NewSimpleClientset(testdata.TestPod(c.Namespace, c.Hostname), testdata.TestSecret(c.Namespace, c.AutomationConfigSecretName, 5))
111+
assert.True(t, isPodReady(c))
112+
thePod, _ := c.ClientSet.CoreV1().Pods(c.Namespace).Get(context.TODO(), c.Hostname, metav1.GetOptions{})
113+
assert.Equal(t, map[string]string{"agent.mongodb.com/version": "5"}, thePod.Annotations)
114+
}
115+
116+
func readHealthinessFile(path string) health.Status {
117+
fd, _ := os.Open(path)
118+
health, _ := readAgentHealthStatus(fd)
119+
return health
120+
}
121+
122+
func testConfig(healthFilePath string) config.Config {
123+
return config.Config{
124+
HealthStatusFilePath: healthFilePath,
125+
Namespace: "test-ns",
126+
AutomationConfigSecretName: "test-mongodb-automation-config",
127+
Hostname: "test-mongodb-0",
128+
}
129+
}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"version": 5
3+
}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"version": 6
3+
}

0 commit comments

Comments
 (0)