mongodb
diff --git a/‎cmd/readiness/main.go
Lines changed: 224 additions & 0 deletions b/‎cmd/readiness/main.go
Lines changed: 224 additions & 0 deletions
diff --git a/‎cmd/readiness/readiness_test.go
Lines changed: 129 additions & 0 deletions b/‎cmd/readiness/readiness_test.go
Lines changed: 129 additions & 0 deletions
diff --git a/‎cmd/readiness/testdata/config-current-version.json
Lines changed: 3 additions & 0 deletions b/‎cmd/readiness/testdata/config-current-version.json
Lines changed: 3 additions & 0 deletions
diff --git a/‎cmd/readiness/testdata/config-new-version.json
Lines changed: 3 additions & 0 deletions b/‎cmd/readiness/testdata/config-new-version.json
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,224 @@
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"time"
+
+	"github.com/mongodb/mongodb-kubernetes-operator/pkg/readiness/config"
+	"github.com/mongodb/mongodb-kubernetes-operator/pkg/readiness/headless"
+	"github.com/mongodb/mongodb-kubernetes-operator/pkg/readiness/health"
+	"github.com/mongodb/mongodb-kubernetes-operator/pkg/util/contains"
+
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/rest"
+
+	"go.uber.org/zap"
+)
+
+const (
+	// headlessAgent is the name of the environment variable which isHeadlessMode compares with "true"
+	headlessAgent = "HEADLESS_AGENT"
+)
+
+// riskySteps holds the names of the Agent steps which isDeadlocked treats as deadlock candidates (filled in init)
+var riskySteps []string
+// logger is the package-wide logger: init installs a development logger, main replaces it with a file-based one
+var logger *zap.SugaredLogger
+
+// init prepares the package-level state: it installs a logger built from the zap development configuration
+// and fills in the list of step names which are checked for deadlocks.
+func init() {
+	// By default we log to the output (convenient for tests)
+	devCfg := zap.NewDevelopmentConfig()
+	devLogger, buildErr := devCfg.Build()
+	if buildErr != nil {
+		panic(buildErr)
+	}
+	logger = devLogger.Sugar()
+
+	riskySteps = []string{"WaitAllRsMembersUp", "WaitRsInit"}
+}
+
+// isPodReady is the main function which decides if the pod is ready or not. The decision is based on the information
+// from the AA health status file located at conf.HealthStatusFilePath.
+// The logic depends on if the pod is a standard MongoDB or an AppDB one.
+// - If MongoDB: then just the 'statuses[0].IsInGoalState` field is used to learn if the Agent has reached the goal
+// - if AppDB: the 'mmsStatus[0].lastGoalVersionAchieved' field is compared with the one from mounted automation config
+// Additionally if the previous check hasn't returned 'true' the "deadlock" case is checked to make sure the Agent is
+// not waiting for the other members.
+//
+// The function returns true if the health status file cannot be opened, and panics if the file cannot be read or
+// parsed or if the goal state check returns an error.
+func isPodReady(conf config.Config) bool {
+	fd, err := os.Open(conf.HealthStatusFilePath)
+	if err != nil {
+		logger.Warn("No health status file exists, assuming the Automation agent is old")
+		return true
+	}
+	defer fd.Close()
+
+	health, err := readAgentHealthStatus(fd)
+	if err != nil {
+		logger.Errorf("Failed to read agent health status file: %s", err)
+		// panicking allows to see the problem in the events for the pod (kubectl describe pod ..)
+		// The message must be formatted explicitly: panic doesn't interpolate format verbs
+		panic(fmt.Sprintf("Failed to read agent health status file: %s", err))
+	}
+
+	// The 'statuses' file can be empty only for OM Agents
+	if len(health.Healthiness) == 0 && !isHeadlessMode() {
+		logger.Info("'statuses' is empty. We assume there is no automation config for the agent yet.")
+		return true
+	}
+
+	// If the agent has reached the goal state - returning true
+	ok, err := isInGoalState(health, conf)
+
+	if err != nil {
+		logger.Errorf("There was problem checking the health status: %s", err)
+		panic(err)
+	}
+
+	if ok {
+		logger.Info("Agent has reached goal state")
+		return true
+	}
+
+	// Fallback logic: the agent is not in goal state and got stuck in some steps
+	return hasDeadlockedSteps(health)
+}
+
+// readAgentHealthStatus reads the whole file and decodes its JSON content into a health.Status.
+// The status (empty if reading failed, possibly partially filled if decoding failed) is returned together with
+// the read or decode error.
+func readAgentHealthStatus(file *os.File) (health.Status, error) {
+	var status health.Status
+
+	raw, readErr := ioutil.ReadAll(file)
+	if readErr != nil {
+		return status, readErr
+	}
+
+	if decodeErr := json.Unmarshal(raw, &status); decodeErr != nil {
+		return status, decodeErr
+	}
+	return status, nil
+}
+
+// hasDeadlockedSteps looks up the step the agent is currently working on (see findCurrentStep) and reports
+// whether that step is considered deadlocked (see isDeadlocked). If no current step is found the result is false.
+func hasDeadlockedSteps(health health.Status) bool {
+	step := findCurrentStep(health.ProcessPlans)
+	if step == nil {
+		return false
+	}
+	return isDeadlocked(step)
+}
+
+// findCurrentStep returns the step which seems to be run by the Agent now. The step is always in the last plan
+// (see https://github.com/10gen/ops-manager-kubernetes/pull/401#discussion_r333071555) so we iterate over all the steps
+// there and find the last step which has "Started" non nil
+// (indeed this is not the perfect logic as sometimes the agent doesn't update the 'Started' as well - see
+// 'health-status-ok.json', but seems it works for finding deadlocks still).
+//
+// nil is returned if there is not exactly one process, if the process has no plans, if the last plan is missing
+// or already completed, or if none of its steps has been started.
+//noinspection GoNilness
+func findCurrentStep(processStatuses map[string]health.MmsDirectorStatus) *health.StepStatus {
+	var currentPlan *health.PlanStatus
+	if len(processStatuses) == 0 {
+		// Seems shouldn't happen but let's check anyway - may be needs to be changed to Info if this happens
+		logger.Warnf("There is no information about Agent process plans")
+		return nil
+	}
+	if len(processStatuses) > 1 {
+		logger.Errorf("Only one process status is expected but got %d!", len(processStatuses))
+		return nil
+	}
+	// There is always only one process managed by the Agent - so there will be only one loop
+	for k, v := range processStatuses {
+		if len(v.Plans) == 0 {
+			logger.Errorf("The process %s doesn't contain any plans!", k)
+			return nil
+		}
+		currentPlan = v.Plans[len(v.Plans)-1]
+		// A 'null' entry in the health file would be decoded as a nil plan - don't dereference it below
+		if currentPlan == nil {
+			logger.Errorf("The last plan of the process %s is empty!", k)
+			return nil
+		}
+	}
+
+	if currentPlan.Completed != nil {
+		logger.Debugf("The Agent hasn't reported working on the new config yet, the last plan finished at %s",
+			currentPlan.Completed.Format(time.RFC3339))
+		return nil
+	}
+
+	var lastStartedStep *health.StepStatus
+	for _, m := range currentPlan.Moves {
+		for _, s := range m.Steps {
+			// Skipping nil steps ('null' entries in the health file) instead of panicking on them
+			if s != nil && s.Started != nil {
+				lastStartedStep = s
+			}
+		}
+	}
+
+	return lastStartedStep
+}
+
+// isDeadlocked returns true if the step is one of the 'riskySteps', hasn't been completed and was started more
+// than 15 seconds ago. A nil step or a step which has no start time is never considered deadlocked.
+func isDeadlocked(status *health.StepStatus) bool {
+	// Without the start time it's impossible to say how long the agent has been in the step
+	if status == nil || status.Started == nil {
+		return false
+	}
+	// Some logic behind 15 seconds: the health status file is dumped each 10 seconds so we are sure that if the agent
+	// has been in the step for 15 seconds - this means it is waiting for the other hosts and they are not available
+	fifteenSecondsAgo := time.Now().Add(-15 * time.Second)
+	if contains.String(riskySteps, status.Step) && status.Completed == nil && status.Started.Before(fifteenSecondsAgo) {
+		logger.Infof("Indicated a possible deadlock, status: %s, started at %s but hasn't finished "+
+			"yet. Marking the probe as ready", status.Step, status.Started.Format(time.RFC3339))
+		return true
+	}
+	return false
+}
+
+// isInGoalState picks the goal state check matching the current mode: the plain OM check (which never fails)
+// unless the probe runs in headless mode, in which case the check is delegated to the 'headless' package.
+func isInGoalState(health health.Status, conf config.Config) (bool, error) {
+	if !isHeadlessMode() {
+		return performCheckOMMode(health), nil
+	}
+	return headless.PerformCheckHeadlessMode(health, conf)
+}
+
+// performCheckOMMode is the general goal state check which must be used when the Agent is in "OM mode".
+// It returns true as soon as one of the 'health.Healthiness' entries reports 'IsInGoalState'; every inspected
+// entry is logged on debug level.
+func performCheckOMMode(health health.Status) bool {
+	for _, processHealth := range health.Healthiness {
+		logger.Debug(processHealth)
+		if !processHealth.IsInGoalState {
+			continue
+		}
+		return true
+	}
+	return false
+}
+
+// isHeadlessMode returns true only if the environment variable named by 'headlessAgent' is set to exactly "true"
+func isHeadlessMode() bool {
+	mode, found := os.LookupEnv(headlessAgent)
+	return found && mode == "true"
+}
+
+// kubernetesClientset builds a Kubernetes clientset from the in-cluster configuration.
+// The underlying error is wrapped (%w) so the callers can still inspect it with errors.Is / errors.As.
+func kubernetesClientset() (kubernetes.Interface, error) {
+	// Not named 'config' to avoid shadowing the imported 'config' package
+	restConfig, err := rest.InClusterConfig()
+	if err != nil {
+		return nil, fmt.Errorf("failed to get in cluster config: %w", err)
+	}
+	// creates the clientset
+	clientset, err := kubernetes.NewForConfig(restConfig)
+	if err != nil {
+		return nil, fmt.Errorf("failed to build clientset: %w", err)
+	}
+	return clientset, nil
+}
+// main builds the Kubernetes clientset and the probe configuration, switches the logger to the configured
+// log file and exits with code 1 if isPodReady reports the pod as not ready. Any setup error results in a panic.
+func main() {
+	k8sClient, err := kubernetesClientset()
+	if err != nil {
+		panic(err)
+	}
+
+	conf, err := config.BuildFromEnvVariables(k8sClient, isHeadlessMode())
+	if err != nil {
+		panic(err)
+	}
+
+	// In production we log to the file
+	zapCfg := zap.NewDevelopmentConfig()
+	zapCfg.OutputPaths = []string{conf.LogFilePath}
+	fileLogger, err := zapCfg.Build()
+	if err != nil {
+		panic(err)
+	}
+	logger = fileLogger.Sugar()
+
+	if isPodReady(conf) {
+		return
+	}
+	os.Exit(1)
+}
@@ -0,0 +1,129 @@
+package main
+
+import (
+	"context"
+	"os"
+	"testing"
+	"time"
+
+	"github.com/mongodb/mongodb-kubernetes-operator/cmd/readiness/testdata"
+	"github.com/mongodb/mongodb-kubernetes-operator/pkg/readiness/config"
+	"github.com/mongodb/mongodb-kubernetes-operator/pkg/readiness/health"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes/fake"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestDeadlockDetection checks that the probe reports "ready" when the agent is stuck in the
+// "WaitAllRsMembersUp" phase (started > 15 seconds ago)
+func TestDeadlockDetection(t *testing.T) {
+	conf := testConfig("testdata/health-status-deadlocked.json")
+	ready := isPodReady(conf)
+	assert.True(t, ready)
+}
+
+// TestNoDeadlock verifies that if the agent has started (but not finished) "WaitRsInit" and then there is another
+// started phase ("WaitFeatureCompatibilityVersionCorrect") then no deadlock is found as the latter is considered to
+// be the "current" step
+func TestNoDeadlock(t *testing.T) {
+	status := readHealthinessFile("testdata/health-status-no-deadlock.json")
+	stepStatus := findCurrentStep(status.ProcessPlans)
+
+	// A missing step must be reported as a failed assertion, not as a nil pointer panic
+	if assert.NotNil(t, stepStatus) {
+		assert.Equal(t, "WaitFeatureCompatibilityVersionCorrect", stepStatus.Step)
+	}
+
+	assert.False(t, isPodReady(testConfig("testdata/health-status-no-deadlock.json")))
+}
+
+// TestNotReadyWaitingForRsReady verifies that if the agent is in "WaitAllRsMembersUp" phase but started < 15 seconds
+// ago then the function returns "not ready". To achieve this "started" is put into some long future.
+// Note, that the status file is artificial: it has two plans (the first one is complete and has no moves) to make sure
+// the readiness logic takes only the last plan for consideration
+func TestNotReadyWaitingForRsReady(t *testing.T) {
+	conf := testConfig("testdata/health-status-pending.json")
+	ready := isPodReady(conf)
+	assert.False(t, ready)
+}
+
+// TestNotReadyHealthFileHasNoPlans checks that unexpected data in the health file (no plans at all) makes the
+// readiness script report "not ready" instead of panicking
+func TestNotReadyHealthFileHasNoPlans(t *testing.T) {
+	conf := testConfig("testdata/health-status-no-plans.json")
+	ready := isPodReady(conf)
+	assert.False(t, ready)
+}
+
+// TestNotReadyHealthFileHasNoProcesses checks that unexpected data in the health file (no processes at all) makes
+// the readiness script report "not ready" instead of panicking
+func TestNotReadyHealthFileHasNoProcesses(t *testing.T) {
+	conf := testConfig("testdata/health-status-no-processes.json")
+	ready := isPodReady(conf)
+	assert.False(t, ready)
+}
+
+// TestReady checks that the probe reports "ready" even though the "WaitRsInit" stage is reported as not reached
+// (this is some bug in Automation Agent which we can work with)
+func TestReady(t *testing.T) {
+	conf := testConfig("testdata/health-status-ok.json")
+	ready := isPodReady(conf)
+	assert.True(t, ready)
+}
+
+// TestNoDeadlockForDownloadProcess checks that a step which is not listed in "riskySteps" (like "download") is not
+// considered as stuck even if it was started 30 seconds ago and hasn't completed
+func TestNoDeadlockForDownloadProcess(t *testing.T) {
+	startedAt := time.Now().Add(-30 * time.Second)
+
+	var step health.StepStatus
+	step.Step = "Download"
+	step.Started = &startedAt
+	step.Completed = nil
+	step.Result = ""
+
+	assert.False(t, isDeadlocked(&step))
+}
+
+// TestNoDeadlockForImmediateWaitRs checks that the "WaitRsInit" step is not marked as deadlocked if
+// it was started < 15 seconds ago (10 seconds here)
+func TestNoDeadlockForImmediateWaitRs(t *testing.T) {
+	startedAt := time.Now().Add(-10 * time.Second)
+
+	var waitRsInitStatus health.StepStatus
+	waitRsInitStatus.Step = "WaitRsInit"
+	waitRsInitStatus.Started = &startedAt
+	waitRsInitStatus.Completed = nil
+	waitRsInitStatus.Result = "Wait"
+
+	assert.False(t, isDeadlocked(&waitRsInitStatus))
+}
+
+// TestHeadlessAgentHasntReachedGoal verifies that the probe reports "false" if the config version is higher than the
+// last achieved version of the Agent
+// Note that the edge case is checked here: the health-status-ok.json has the "WaitRsInit" phase stuck in the last plan
+// (as Agent doesn't marks all the step statuses finished when it reaches the goal) but this doesn't affect the result
+// as the whole plan is complete already
+func TestHeadlessAgentHasntReachedGoal(t *testing.T) {
+	_ = os.Setenv(headlessAgent, "true")
+	c := testConfig("testdata/health-status-ok.json")
+	c.ClientSet = fake.NewSimpleClientset(testdata.TestPod(c.Namespace, c.Hostname), testdata.TestSecret(c.Namespace, c.AutomationConfigSecretName, 6))
+	assert.False(t, isPodReady(c))
+	thePod, _ := c.ClientSet.CoreV1().Pods(c.Namespace).Get(context.TODO(), c.Hostname, metav1.GetOptions{})
+	assert.Equal(t, map[string]string{"agent.mongodb.com/version": "5"}, thePod.Annotations)
+}
+
+// TestHeadlessAgentReachedGoal verifies that the probe reports "true" if the config version is equal to the
+// last achieved version of the Agent
+func TestHeadlessAgentReachedGoal(t *testing.T) {
+	_ = os.Setenv(headlessAgent, "true")
+	c := testConfig("testdata/health-status-ok.json")
+	c.ClientSet = fake.NewSimpleClientset(testdata.TestPod(c.Namespace, c.Hostname), testdata.TestSecret(c.Namespace, c.AutomationConfigSecretName, 5))
+	assert.True(t, isPodReady(c))
+	thePod, _ := c.ClientSet.CoreV1().Pods(c.Namespace).Get(context.TODO(), c.Hostname, metav1.GetOptions{})
+	assert.Equal(t, map[string]string{"agent.mongodb.com/version": "5"}, thePod.Annotations)
+}
+
+// readHealthinessFile is a test helper which opens the health status file at 'path' and parses it.
+// It panics if the file cannot be opened or parsed so a broken fixture is reported right away instead of
+// resulting in an empty status.
+func readHealthinessFile(path string) health.Status {
+	fd, err := os.Open(path)
+	if err != nil {
+		panic(err)
+	}
+	// The file must be closed - otherwise the descriptor leaks for the rest of the test run
+	defer fd.Close()
+
+	status, err := readAgentHealthStatus(fd)
+	if err != nil {
+		panic(err)
+	}
+	return status
+}
+
+func testConfig(healthFilePath string) config.Config {
+	return config.Config{
+		HealthStatusFilePath:       healthFilePath,
+		Namespace:                  "test-ns",
+		AutomationConfigSecretName: "test-mongodb-automation-config",
+		Hostname:                   "test-mongodb-0",
+	}
+}
@@ -0,0 +1,3 @@
+{
+    "version": 5
+}
@@ -0,0 +1,3 @@
+{
+    "version": 6
+}