From 28f082a8871deaa4a73f654ceb421d5984880875 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Bavelier?= Date: Tue, 3 Dec 2024 17:35:29 +0100 Subject: [PATCH 01/10] comment lots of stuff to simplify dev --- test/e2e/kind_test.go | 279 ++++++++++++++++++------------------ test/e2e/rc_updater_test.go | 23 ++- 2 files changed, 152 insertions(+), 150 deletions(-) diff --git a/test/e2e/kind_test.go b/test/e2e/kind_test.go index ebb7df0f2..91aebd052 100644 --- a/test/e2e/kind_test.go +++ b/test/e2e/kind_test.go @@ -23,6 +23,7 @@ import ( "github.com/DataDog/datadog-agent/test/new-e2e/pkg/runner/parameters" "github.com/DataDog/datadog-api-client-go/v2/api/datadog" "github.com/DataDog/datadog-api-client-go/v2/api/datadogV1" + "github.com/DataDog/datadog-api-client-go/v2/api/datadogV2" "github.com/DataDog/test-infra-definitions/common/utils" "github.com/DataDog/test-infra-definitions/components/datadog/agent" localKubernetes "github.com/DataDog/test-infra-definitions/components/kubernetes" @@ -36,7 +37,7 @@ import ( "github.com/pulumi/pulumi/sdk/v3/go/auto" "github.com/pulumi/pulumi/sdk/v3/go/pulumi" "github.com/stretchr/testify/assert" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + // v1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) type kindEnv struct { @@ -52,6 +53,7 @@ type datadogClient struct { ctx context.Context metricsApi *datadogV1.MetricsApi logsApi *datadogV1.LogsApi + spansApi *datadogV2.SpansApi } func (suite *kindSuite) SetupSuite() { @@ -81,7 +83,8 @@ func TestKindSuite(t *testing.T) { e2eParams := []e2e.SuiteOption{ e2e.WithStackName(fmt.Sprintf("operator-kind-%s", k8sVersion)), e2e.WithProvisioner(kindProvisioner(k8sVersion, nil)), - e2e.WithDevMode(), + // Un-comment to keep the VM running after the test + // e2e.WithDevMode(), } e2e.Run[kindEnv](t, &kindSuite{}, e2eParams...) @@ -230,142 +233,142 @@ func (s *kindSuite) TestKindRun() { verifyNumPodsForSelector(t, kubectlOptions, 1, clusterAgentSelector+",agent.datadoghq.com/e2e-test=datadog-agent-minimum") }) - s.T().Run("Autodiscovery works", func(t *testing.T) { - // Add nginx with annotations - var nginxConfigPath string - nginxConfigPath, err = getAbsPath(filepath.Join(manifestsPath, "autodiscovery-annotation.yaml")) - assert.NoError(t, err) - k8s.KubectlApply(t, kubectlOptions, nginxConfigPath) - - verifyNumPodsForSelector(t, kubectlOptions, 1, "agent.datadoghq.com/e2e-test=datadog-agent-autodiscovery-annotated") - - // check agent pods for http check - s.EventuallyWithTf(func(c *assert.CollectT) { - agentPods, err := k8s.ListPodsE(t, kubectlOptions, v1.ListOptions{ - LabelSelector: nodeAgentSelector + ",agent.datadoghq.com/e2e-test=datadog-agent-minimum", - }) - assert.NoError(c, err) - - for _, pod := range agentPods { - k8s.WaitUntilPodAvailable(t, kubectlOptions, pod.Name, 9, 15*time.Second) - - output, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "exec", "-it", pod.Name, "--", "agent", "status", "-j") - assert.NoError(c, err) - - verifyCheck(c, output, "http_check") - } - }, 900*time.Second, 30*time.Second, "could not validate http check on agent pod") - - s.EventuallyWithTf(func(c *assert.CollectT) { - verifyHTTPCheck(s, c) - }, 600*time.Second, 30*time.Second, "could not validate http.can_connect check with api client") - }) - - s.T().Run("Kubelet check works", func(t *testing.T) { - s.EventuallyWithTf(func(c *assert.CollectT) { - agentPods, err := k8s.ListPodsE(t, kubectlOptions, v1.ListOptions{ - LabelSelector: nodeAgentSelector + ",agent.datadoghq.com/e2e-test=datadog-agent-minimum", - }) - assert.NoError(c, err) - - for _, pod := range agentPods { - k8s.WaitUntilPodAvailable(t, kubectlOptions, pod.Name, 9, 15*time.Second) - - output, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "exec", "-it", pod.Name, "--", "agent", "status", "collector", "-j") - assert.NoError(c, err) - - verifyCheck(c, output, "kubelet") - } - }, 900*time.Second, 30*time.Second, "could not validate kubelet check on agent pod") - - metricQuery := fmt.Sprintf("exclude_null(avg:kubernetes.cpu.usage.total{kube_cluster_name:%s, container_id:*})", s.Env().Kind.ClusterName) - s.EventuallyWithTf(func(c *assert.CollectT) { - resp, _, err := s.datadogClient.metricsApi.QueryMetrics(s.datadogClient.ctx, time.Now().Add(-time.Minute*5).Unix(), time.Now().Add(time.Minute*5).Unix(), metricQuery) - - assert.Truef(c, len(resp.Series) > 0, "expected metric series for query `%s` to not be empty: %s", metricQuery, err) - }, 600*time.Second, 30*time.Second, fmt.Sprintf("metric series has not changed to not empty with query %s", metricQuery)) - }) - - s.T().Run("KSM Check Works (cluster check)", func(t *testing.T) { - s.EventuallyWithTf(func(c *assert.CollectT) { - clusterAgentPods, err := k8s.ListPodsE(t, kubectlOptions, v1.ListOptions{ - LabelSelector: clusterAgentSelector + ",agent.datadoghq.com/e2e-test=datadog-agent-minimum", - }) - assert.NoError(t, err) - - for _, pod := range clusterAgentPods { - k8s.WaitUntilPodAvailable(t, kubectlOptions, pod.Name, 9, 15*time.Second) - output, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "exec", "-it", pod.Name, "--", "agent", "status", "collector", "-j") - assert.NoError(t, err) - - verifyCheck(c, output, "kubernetes_state_core") - } - }, 1200*time.Second, 30*time.Second, "could not validate kubernetes_state_core check on cluster agent pod") - - s.EventuallyWithTf(func(c *assert.CollectT) { - verifyKSMCheck(s, c) - }, 600*time.Second, 30*time.Second, "could not validate kubernetes_state_core check with api client") - }) - - s.T().Run("KSM Check Works (cluster check runner)", func(t *testing.T) { - // Update DDA - ddaConfigPath, err = getAbsPath(filepath.Join(manifestsPath, "datadog-agent-ccr-enabled.yaml")) - assert.NoError(t, err) - k8s.KubectlApply(t, kubectlOptions, ddaConfigPath) - verifyAgentPods(t, kubectlOptions, nodeAgentSelector+",agent.datadoghq.com/e2e-test=datadog-agent-ccr-enabled") - verifyNumPodsForSelector(t, kubectlOptions, 1, clusterAgentSelector+",agent.datadoghq.com/e2e-test=datadog-agent-ccr-enabled") - verifyNumPodsForSelector(t, kubectlOptions, 1, clusterCheckRunnerSelector+",agent.datadoghq.com/e2e-test=datadog-agent-ccr-enabled") - - s.EventuallyWithTf(func(c *assert.CollectT) { - ccrPods, err := k8s.ListPodsE(t, kubectlOptions, v1.ListOptions{ - LabelSelector: clusterCheckRunnerSelector + ",agent.datadoghq.com/e2e-test=datadog-agent-ccr-enabled", - }) - assert.NoError(c, err) - - for _, ccr := range ccrPods { - k8s.WaitUntilPodAvailable(t, kubectlOptions, ccr.Name, 9, 15*time.Second) - output, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "exec", "-it", ccr.Name, "--", "agent", "status", "collector", "-j") - assert.NoError(c, err) - - verifyCheck(c, output, "kubernetes_state_core") - } - }, 1200*time.Second, 30*time.Second, "could not validate kubernetes_state_core check on cluster check runners pod") - - s.EventuallyWithTf(func(c *assert.CollectT) { - verifyKSMCheck(s, c) - }, 600*time.Second, 30*time.Second, "could not validate kubernetes_state_core check with api client") - }) - - s.T().Run("Logs collection works", func(t *testing.T) { - // Update DDA - ddaConfigPath, err = getAbsPath(filepath.Join(manifestsPath, "datadog-agent-logs.yaml")) - assert.NoError(t, err) - - k8s.KubectlApply(t, kubectlOptions, ddaConfigPath) - verifyAgentPods(t, kubectlOptions, nodeAgentSelector+",agent.datadoghq.com/e2e-test=datadog-agent-logs") - - // Verify logs collection on agent pod - s.EventuallyWithTf(func(c *assert.CollectT) { - agentPods, err := k8s.ListPodsE(t, kubectlOptions, v1.ListOptions{ - LabelSelector: nodeAgentSelector + ",agent.datadoghq.com/e2e-test=datadog-agent-logs", - }) - assert.NoError(c, err) - - for _, pod := range agentPods { - k8s.WaitUntilPodAvailable(t, kubectlOptions, pod.Name, 9, 15*time.Second) - - output, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "exec", "-it", pod.Name, "--", "agent", "status", "logs agent", "-j") - assert.NoError(c, err) - - verifyAgentPodLogs(c, output) - } - }, 900*time.Second, 30*time.Second, "could not validate log check on agent pod") - - s.EventuallyWithTf(func(c *assert.CollectT) { - verifyAPILogs(s, c) - }, 600*time.Second, 30*time.Second, "could not valid logs collection with api client") - - }) + // s.T().Run("Autodiscovery works", func(t *testing.T) { + // // Add nginx with annotations + // var nginxConfigPath string + // nginxConfigPath, err = getAbsPath(filepath.Join(manifestsPath, "autodiscovery-annotation.yaml")) + // assert.NoError(t, err) + // k8s.KubectlApply(t, kubectlOptions, nginxConfigPath) + + // verifyNumPodsForSelector(t, kubectlOptions, 1, "agent.datadoghq.com/e2e-test=datadog-agent-autodiscovery-annotated") + + // // check agent pods for http check + // s.EventuallyWithTf(func(c *assert.CollectT) { + // agentPods, err := k8s.ListPodsE(t, kubectlOptions, v1.ListOptions{ + // LabelSelector: nodeAgentSelector + ",agent.datadoghq.com/e2e-test=datadog-agent-minimum", + // }) + // assert.NoError(c, err) + + // for _, pod := range agentPods { + // k8s.WaitUntilPodAvailable(t, kubectlOptions, pod.Name, 9, 15*time.Second) + + // output, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "exec", "-it", pod.Name, "--", "agent", "status", "-j") + // assert.NoError(c, err) + + // verifyCheck(c, output, "http_check") + // } + // }, 900*time.Second, 30*time.Second, "could not validate http check on agent pod") + + // s.EventuallyWithTf(func(c *assert.CollectT) { + // verifyHTTPCheck(s, c) + // }, 600*time.Second, 30*time.Second, "could not validate http.can_connect check with api client") + // }) + + // s.T().Run("Kubelet check works", func(t *testing.T) { + // s.EventuallyWithTf(func(c *assert.CollectT) { + // agentPods, err := k8s.ListPodsE(t, kubectlOptions, v1.ListOptions{ + // LabelSelector: nodeAgentSelector + ",agent.datadoghq.com/e2e-test=datadog-agent-minimum", + // }) + // assert.NoError(c, err) + + // for _, pod := range agentPods { + // k8s.WaitUntilPodAvailable(t, kubectlOptions, pod.Name, 9, 15*time.Second) + + // output, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "exec", "-it", pod.Name, "--", "agent", "status", "collector", "-j") + // assert.NoError(c, err) + + // verifyCheck(c, output, "kubelet") + // } + // }, 900*time.Second, 30*time.Second, "could not validate kubelet check on agent pod") + + // metricQuery := fmt.Sprintf("exclude_null(avg:kubernetes.cpu.usage.total{kube_cluster_name:%s, container_id:*})", s.Env().Kind.ClusterName) + // s.EventuallyWithTf(func(c *assert.CollectT) { + // resp, _, err := s.datadogClient.metricsApi.QueryMetrics(s.datadogClient.ctx, time.Now().Add(-time.Minute*5).Unix(), time.Now().Add(time.Minute*5).Unix(), metricQuery) + + // assert.Truef(c, len(resp.Series) > 0, "expected metric series for query `%s` to not be empty: %s", metricQuery, err) + // }, 600*time.Second, 30*time.Second, fmt.Sprintf("metric series has not changed to not empty with query %s", metricQuery)) + // }) + + // s.T().Run("KSM Check Works (cluster check)", func(t *testing.T) { + // s.EventuallyWithTf(func(c *assert.CollectT) { + // clusterAgentPods, err := k8s.ListPodsE(t, kubectlOptions, v1.ListOptions{ + // LabelSelector: clusterAgentSelector + ",agent.datadoghq.com/e2e-test=datadog-agent-minimum", + // }) + // assert.NoError(t, err) + + // for _, pod := range clusterAgentPods { + // k8s.WaitUntilPodAvailable(t, kubectlOptions, pod.Name, 9, 15*time.Second) + // output, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "exec", "-it", pod.Name, "--", "agent", "status", "collector", "-j") + // assert.NoError(t, err) + + // verifyCheck(c, output, "kubernetes_state_core") + // } + // }, 1200*time.Second, 30*time.Second, "could not validate kubernetes_state_core check on cluster agent pod") + + // s.EventuallyWithTf(func(c *assert.CollectT) { + // verifyKSMCheck(s, c) + // }, 600*time.Second, 30*time.Second, "could not validate kubernetes_state_core check with api client") + // }) + + // s.T().Run("KSM Check Works (cluster check runner)", func(t *testing.T) { + // // Update DDA + // ddaConfigPath, err = getAbsPath(filepath.Join(manifestsPath, "datadog-agent-ccr-enabled.yaml")) + // assert.NoError(t, err) + // k8s.KubectlApply(t, kubectlOptions, ddaConfigPath) + // verifyAgentPods(t, kubectlOptions, nodeAgentSelector+",agent.datadoghq.com/e2e-test=datadog-agent-ccr-enabled") + // verifyNumPodsForSelector(t, kubectlOptions, 1, clusterAgentSelector+",agent.datadoghq.com/e2e-test=datadog-agent-ccr-enabled") + // verifyNumPodsForSelector(t, kubectlOptions, 1, clusterCheckRunnerSelector+",agent.datadoghq.com/e2e-test=datadog-agent-ccr-enabled") + + // s.EventuallyWithTf(func(c *assert.CollectT) { + // ccrPods, err := k8s.ListPodsE(t, kubectlOptions, v1.ListOptions{ + // LabelSelector: clusterCheckRunnerSelector + ",agent.datadoghq.com/e2e-test=datadog-agent-ccr-enabled", + // }) + // assert.NoError(c, err) + + // for _, ccr := range ccrPods { + // k8s.WaitUntilPodAvailable(t, kubectlOptions, ccr.Name, 9, 15*time.Second) + // output, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "exec", "-it", ccr.Name, "--", "agent", "status", "collector", "-j") + // assert.NoError(c, err) + + // verifyCheck(c, output, "kubernetes_state_core") + // } + // }, 1200*time.Second, 30*time.Second, "could not validate kubernetes_state_core check on cluster check runners pod") + + // s.EventuallyWithTf(func(c *assert.CollectT) { + // verifyKSMCheck(s, c) + // }, 600*time.Second, 30*time.Second, "could not validate kubernetes_state_core check with api client") + // }) + + // s.T().Run("Logs collection works", func(t *testing.T) { + // // Update DDA + // ddaConfigPath, err = getAbsPath(filepath.Join(manifestsPath, "datadog-agent-logs.yaml")) + // assert.NoError(t, err) + + // k8s.KubectlApply(t, kubectlOptions, ddaConfigPath) + // verifyAgentPods(t, kubectlOptions, nodeAgentSelector+",agent.datadoghq.com/e2e-test=datadog-agent-logs") + + // // Verify logs collection on agent pod + // s.EventuallyWithTf(func(c *assert.CollectT) { + // agentPods, err := k8s.ListPodsE(t, kubectlOptions, v1.ListOptions{ + // LabelSelector: nodeAgentSelector + ",agent.datadoghq.com/e2e-test=datadog-agent-logs", + // }) + // assert.NoError(c, err) + + // for _, pod := range agentPods { + // k8s.WaitUntilPodAvailable(t, kubectlOptions, pod.Name, 9, 15*time.Second) + + // output, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "exec", "-it", pod.Name, "--", "agent", "status", "logs agent", "-j") + // assert.NoError(c, err) + + // verifyAgentPodLogs(c, output) + // } + // }, 900*time.Second, 30*time.Second, "could not validate log check on agent pod") + + // s.EventuallyWithTf(func(c *assert.CollectT) { + // verifyAPILogs(s, c) + // }, 600*time.Second, 30*time.Second, "could not valid logs collection with api client") + + // }) s.T().Run("Cleanup DDA", func(t *testing.T) { deleteDda(t, kubectlOptions, ddaConfigPath) diff --git a/test/e2e/rc_updater_test.go b/test/e2e/rc_updater_test.go index 98292b460..52b87d60a 100644 --- a/test/e2e/rc_updater_test.go +++ b/test/e2e/rc_updater_test.go @@ -11,7 +11,6 @@ package e2e import ( "fmt" "path/filepath" - "testing" "time" "github.com/DataDog/datadog-agent/test/new-e2e/pkg/e2e" @@ -32,20 +31,20 @@ type updaterSuite struct { configID string } -func TestUpdaterSuite(t *testing.T) { +// func TestUpdaterSuite(t *testing.T) { - e2eParams := []e2e.SuiteOption{ - e2e.WithStackName(fmt.Sprintf("operator-kind-rc-%s", k8sVersion)), - e2e.WithProvisioner(kindProvisioner(k8sVersion, []string{"rc-e2e-manager.yaml"})), - } +// e2eParams := []e2e.SuiteOption{ +// e2e.WithStackName(fmt.Sprintf("operator-kind-rc-%s", k8sVersion)), +// e2e.WithProvisioner(kindProvisioner(k8sVersion, []string{"rc-e2e-manager.yaml"})), +// } - apiKey, _ := api.GetAPIKey() - appKey, _ := api.GetAPPKey() - require.NotEmpty(t, apiKey, "Could not get APIKey") - require.NotEmpty(t, appKey, "Could not get APPKey") - e2e.Run[kindEnv](t, &updaterSuite{clusterName: "rc-updater-e2e-test-cluster"}, e2eParams...) +// apiKey, _ := api.GetAPIKey() +// appKey, _ := api.GetAPPKey() +// require.NotEmpty(t, apiKey, "Could not get APIKey") +// require.NotEmpty(t, appKey, "Could not get APPKey") +// e2e.Run[kindEnv](t, &updaterSuite{clusterName: "rc-updater-e2e-test-cluster"}, e2eParams...) -} +// } func (u *updaterSuite) SetupSuite() { u.BaseSuite.SetupSuite() From 01883828c5831c596fb08f18cc023a7d846c8800 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Bavelier?= Date: Thu, 5 Dec 2024 10:38:23 +0100 Subject: [PATCH 02/10] apm e2e test and un-comment for dev --- test/e2e/kind_test.go | 368 +++++++++++------- test/e2e/manifests/apm/datadog-agent-apm.yaml | 39 ++ test/e2e/manifests/apm/tracegen-deploy.yaml | 78 ++++ test/e2e/rc_updater_test.go | 23 +- 4 files changed, 359 insertions(+), 149 deletions(-) create mode 100644 test/e2e/manifests/apm/datadog-agent-apm.yaml create mode 100644 test/e2e/manifests/apm/tracegen-deploy.yaml diff --git a/test/e2e/kind_test.go b/test/e2e/kind_test.go index 91aebd052..4b333097a 100644 --- a/test/e2e/kind_test.go +++ b/test/e2e/kind_test.go @@ -37,7 +37,7 @@ import ( "github.com/pulumi/pulumi/sdk/v3/go/auto" "github.com/pulumi/pulumi/sdk/v3/go/pulumi" "github.com/stretchr/testify/assert" - // v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) type kindEnv struct { @@ -77,6 +77,7 @@ func (suite *kindSuite) SetupSuite() { client := datadog.NewAPIClient(configuration) suite.datadogClient.metricsApi = datadogV1.NewMetricsApi(client) suite.datadogClient.logsApi = datadogV1.NewLogsApi(client) + suite.datadogClient.spansApi = datadogV2.NewSpansApi(client) } func TestKindSuite(t *testing.T) { @@ -84,7 +85,8 @@ func TestKindSuite(t *testing.T) { e2e.WithStackName(fmt.Sprintf("operator-kind-%s", k8sVersion)), e2e.WithProvisioner(kindProvisioner(k8sVersion, nil)), // Un-comment to keep the VM running after the test - // e2e.WithDevMode(), + // TODO: should be commented on main + e2e.WithDevMode(), } e2e.Run[kindEnv](t, &kindSuite{}, e2eParams...) @@ -233,142 +235,185 @@ func (s *kindSuite) TestKindRun() { verifyNumPodsForSelector(t, kubectlOptions, 1, clusterAgentSelector+",agent.datadoghq.com/e2e-test=datadog-agent-minimum") }) - // s.T().Run("Autodiscovery works", func(t *testing.T) { - // // Add nginx with annotations - // var nginxConfigPath string - // nginxConfigPath, err = getAbsPath(filepath.Join(manifestsPath, "autodiscovery-annotation.yaml")) - // assert.NoError(t, err) - // k8s.KubectlApply(t, kubectlOptions, nginxConfigPath) - - // verifyNumPodsForSelector(t, kubectlOptions, 1, "agent.datadoghq.com/e2e-test=datadog-agent-autodiscovery-annotated") - - // // check agent pods for http check - // s.EventuallyWithTf(func(c *assert.CollectT) { - // agentPods, err := k8s.ListPodsE(t, kubectlOptions, v1.ListOptions{ - // LabelSelector: nodeAgentSelector + ",agent.datadoghq.com/e2e-test=datadog-agent-minimum", - // }) - // assert.NoError(c, err) - - // for _, pod := range agentPods { - // k8s.WaitUntilPodAvailable(t, kubectlOptions, pod.Name, 9, 15*time.Second) - - // output, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "exec", "-it", pod.Name, "--", "agent", "status", "-j") - // assert.NoError(c, err) - - // verifyCheck(c, output, "http_check") - // } - // }, 900*time.Second, 30*time.Second, "could not validate http check on agent pod") - - // s.EventuallyWithTf(func(c *assert.CollectT) { - // verifyHTTPCheck(s, c) - // }, 600*time.Second, 30*time.Second, "could not validate http.can_connect check with api client") - // }) - - // s.T().Run("Kubelet check works", func(t *testing.T) { - // s.EventuallyWithTf(func(c *assert.CollectT) { - // agentPods, err := k8s.ListPodsE(t, kubectlOptions, v1.ListOptions{ - // LabelSelector: nodeAgentSelector + ",agent.datadoghq.com/e2e-test=datadog-agent-minimum", - // }) - // assert.NoError(c, err) - - // for _, pod := range agentPods { - // k8s.WaitUntilPodAvailable(t, kubectlOptions, pod.Name, 9, 15*time.Second) - - // output, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "exec", "-it", pod.Name, "--", "agent", "status", "collector", "-j") - // assert.NoError(c, err) - - // verifyCheck(c, output, "kubelet") - // } - // }, 900*time.Second, 30*time.Second, "could not validate kubelet check on agent pod") - - // metricQuery := fmt.Sprintf("exclude_null(avg:kubernetes.cpu.usage.total{kube_cluster_name:%s, container_id:*})", s.Env().Kind.ClusterName) - // s.EventuallyWithTf(func(c *assert.CollectT) { - // resp, _, err := s.datadogClient.metricsApi.QueryMetrics(s.datadogClient.ctx, time.Now().Add(-time.Minute*5).Unix(), time.Now().Add(time.Minute*5).Unix(), metricQuery) - - // assert.Truef(c, len(resp.Series) > 0, "expected metric series for query `%s` to not be empty: %s", metricQuery, err) - // }, 600*time.Second, 30*time.Second, fmt.Sprintf("metric series has not changed to not empty with query %s", metricQuery)) - // }) - - // s.T().Run("KSM Check Works (cluster check)", func(t *testing.T) { - // s.EventuallyWithTf(func(c *assert.CollectT) { - // clusterAgentPods, err := k8s.ListPodsE(t, kubectlOptions, v1.ListOptions{ - // LabelSelector: clusterAgentSelector + ",agent.datadoghq.com/e2e-test=datadog-agent-minimum", - // }) - // assert.NoError(t, err) - - // for _, pod := range clusterAgentPods { - // k8s.WaitUntilPodAvailable(t, kubectlOptions, pod.Name, 9, 15*time.Second) - // output, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "exec", "-it", pod.Name, "--", "agent", "status", "collector", "-j") - // assert.NoError(t, err) - - // verifyCheck(c, output, "kubernetes_state_core") - // } - // }, 1200*time.Second, 30*time.Second, "could not validate kubernetes_state_core check on cluster agent pod") - - // s.EventuallyWithTf(func(c *assert.CollectT) { - // verifyKSMCheck(s, c) - // }, 600*time.Second, 30*time.Second, "could not validate kubernetes_state_core check with api client") - // }) - - // s.T().Run("KSM Check Works (cluster check runner)", func(t *testing.T) { - // // Update DDA - // ddaConfigPath, err = getAbsPath(filepath.Join(manifestsPath, "datadog-agent-ccr-enabled.yaml")) - // assert.NoError(t, err) - // k8s.KubectlApply(t, kubectlOptions, ddaConfigPath) - // verifyAgentPods(t, kubectlOptions, nodeAgentSelector+",agent.datadoghq.com/e2e-test=datadog-agent-ccr-enabled") - // verifyNumPodsForSelector(t, kubectlOptions, 1, clusterAgentSelector+",agent.datadoghq.com/e2e-test=datadog-agent-ccr-enabled") - // verifyNumPodsForSelector(t, kubectlOptions, 1, clusterCheckRunnerSelector+",agent.datadoghq.com/e2e-test=datadog-agent-ccr-enabled") - - // s.EventuallyWithTf(func(c *assert.CollectT) { - // ccrPods, err := k8s.ListPodsE(t, kubectlOptions, v1.ListOptions{ - // LabelSelector: clusterCheckRunnerSelector + ",agent.datadoghq.com/e2e-test=datadog-agent-ccr-enabled", - // }) - // assert.NoError(c, err) - - // for _, ccr := range ccrPods { - // k8s.WaitUntilPodAvailable(t, kubectlOptions, ccr.Name, 9, 15*time.Second) - // output, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "exec", "-it", ccr.Name, "--", "agent", "status", "collector", "-j") - // assert.NoError(c, err) - - // verifyCheck(c, output, "kubernetes_state_core") - // } - // }, 1200*time.Second, 30*time.Second, "could not validate kubernetes_state_core check on cluster check runners pod") - - // s.EventuallyWithTf(func(c *assert.CollectT) { - // verifyKSMCheck(s, c) - // }, 600*time.Second, 30*time.Second, "could not validate kubernetes_state_core check with api client") - // }) - - // s.T().Run("Logs collection works", func(t *testing.T) { - // // Update DDA - // ddaConfigPath, err = getAbsPath(filepath.Join(manifestsPath, "datadog-agent-logs.yaml")) - // assert.NoError(t, err) - - // k8s.KubectlApply(t, kubectlOptions, ddaConfigPath) - // verifyAgentPods(t, kubectlOptions, nodeAgentSelector+",agent.datadoghq.com/e2e-test=datadog-agent-logs") - - // // Verify logs collection on agent pod - // s.EventuallyWithTf(func(c *assert.CollectT) { - // agentPods, err := k8s.ListPodsE(t, kubectlOptions, v1.ListOptions{ - // LabelSelector: nodeAgentSelector + ",agent.datadoghq.com/e2e-test=datadog-agent-logs", - // }) - // assert.NoError(c, err) - - // for _, pod := range agentPods { - // k8s.WaitUntilPodAvailable(t, kubectlOptions, pod.Name, 9, 15*time.Second) - - // output, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "exec", "-it", pod.Name, "--", "agent", "status", "logs agent", "-j") - // assert.NoError(c, err) - - // verifyAgentPodLogs(c, output) - // } - // }, 900*time.Second, 30*time.Second, "could not validate log check on agent pod") - - // s.EventuallyWithTf(func(c *assert.CollectT) { - // verifyAPILogs(s, c) - // }, 600*time.Second, 30*time.Second, "could not valid logs collection with api client") - - // }) + s.T().Run("Autodiscovery works", func(t *testing.T) { + // Add nginx with annotations + var nginxConfigPath string + nginxConfigPath, err = getAbsPath(filepath.Join(manifestsPath, "autodiscovery-annotation.yaml")) + assert.NoError(t, err) + k8s.KubectlApply(t, kubectlOptions, nginxConfigPath) + + verifyNumPodsForSelector(t, kubectlOptions, 1, "agent.datadoghq.com/e2e-test=datadog-agent-autodiscovery-annotated") + + // check agent pods for http check + s.EventuallyWithTf(func(c *assert.CollectT) { + agentPods, err := k8s.ListPodsE(t, kubectlOptions, v1.ListOptions{ + LabelSelector: nodeAgentSelector + ",agent.datadoghq.com/e2e-test=datadog-agent-minimum", + }) + assert.NoError(c, err) + + for _, pod := range agentPods { + k8s.WaitUntilPodAvailable(t, kubectlOptions, pod.Name, 9, 15*time.Second) + + output, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "exec", "-it", pod.Name, "--", "agent", "status", "-j") + assert.NoError(c, err) + + verifyCheck(c, output, "http_check") + } + }, 900*time.Second, 30*time.Second, "could not validate http check on agent pod") + + s.EventuallyWithTf(func(c *assert.CollectT) { + verifyHTTPCheck(s, c) + }, 600*time.Second, 30*time.Second, "could not validate http.can_connect check with api client") + // TODO: remove nginx deployment to clean environment + }) + + s.T().Run("Kubelet check works", func(t *testing.T) { + s.EventuallyWithTf(func(c *assert.CollectT) { + agentPods, err := k8s.ListPodsE(t, kubectlOptions, v1.ListOptions{ + LabelSelector: nodeAgentSelector + ",agent.datadoghq.com/e2e-test=datadog-agent-minimum", + }) + assert.NoError(c, err) + + for _, pod := range agentPods { + k8s.WaitUntilPodAvailable(t, kubectlOptions, pod.Name, 9, 15*time.Second) + + output, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "exec", "-it", pod.Name, "--", "agent", "status", "collector", "-j") + assert.NoError(c, err) + + verifyCheck(c, output, "kubelet") + } + }, 900*time.Second, 30*time.Second, "could not validate kubelet check on agent pod") + + metricQuery := fmt.Sprintf("exclude_null(avg:kubernetes.cpu.usage.total{kube_cluster_name:%s, container_id:*})", s.Env().Kind.ClusterName) + s.EventuallyWithTf(func(c *assert.CollectT) { + resp, _, err := s.datadogClient.metricsApi.QueryMetrics(s.datadogClient.ctx, time.Now().Add(-time.Minute*5).Unix(), time.Now().Add(time.Minute*5).Unix(), metricQuery) + + assert.Truef(c, len(resp.Series) > 0, "expected metric series for query `%s` to not be empty: %s", metricQuery, err) + }, 600*time.Second, 30*time.Second, fmt.Sprintf("metric series has not changed to not empty with query %s", metricQuery)) + }) + + s.T().Run("KSM Check Works (cluster check)", func(t *testing.T) { + s.EventuallyWithTf(func(c *assert.CollectT) { + clusterAgentPods, err := k8s.ListPodsE(t, kubectlOptions, v1.ListOptions{ + LabelSelector: clusterAgentSelector + ",agent.datadoghq.com/e2e-test=datadog-agent-minimum", + }) + assert.NoError(t, err) + + for _, pod := range clusterAgentPods { + k8s.WaitUntilPodAvailable(t, kubectlOptions, pod.Name, 9, 15*time.Second) + output, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "exec", "-it", pod.Name, "--", "agent", "status", "collector", "-j") + assert.NoError(t, err) + + verifyCheck(c, output, "kubernetes_state_core") + } + }, 1200*time.Second, 30*time.Second, "could not validate kubernetes_state_core check on cluster agent pod") + + s.EventuallyWithTf(func(c *assert.CollectT) { + verifyKSMCheck(s, c) + }, 600*time.Second, 30*time.Second, "could not validate kubernetes_state_core check with api client") + }) + + s.T().Run("KSM Check Works (cluster check runner)", func(t *testing.T) { + // Update DDA + ddaConfigPath, err = getAbsPath(filepath.Join(manifestsPath, "datadog-agent-ccr-enabled.yaml")) + assert.NoError(t, err) + k8s.KubectlApply(t, kubectlOptions, ddaConfigPath) + verifyAgentPods(t, kubectlOptions, nodeAgentSelector+",agent.datadoghq.com/e2e-test=datadog-agent-ccr-enabled") + verifyNumPodsForSelector(t, kubectlOptions, 1, clusterAgentSelector+",agent.datadoghq.com/e2e-test=datadog-agent-ccr-enabled") + verifyNumPodsForSelector(t, kubectlOptions, 1, clusterCheckRunnerSelector+",agent.datadoghq.com/e2e-test=datadog-agent-ccr-enabled") + + s.EventuallyWithTf(func(c *assert.CollectT) { + ccrPods, err := k8s.ListPodsE(t, kubectlOptions, v1.ListOptions{ + LabelSelector: clusterCheckRunnerSelector + ",agent.datadoghq.com/e2e-test=datadog-agent-ccr-enabled", + }) + assert.NoError(c, err) + + for _, ccr := range ccrPods { + k8s.WaitUntilPodAvailable(t, kubectlOptions, ccr.Name, 9, 15*time.Second) + output, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "exec", "-it", ccr.Name, "--", "agent", "status", "collector", "-j") + assert.NoError(c, err) + + verifyCheck(c, output, "kubernetes_state_core") + } + }, 1200*time.Second, 30*time.Second, "could not validate kubernetes_state_core check on cluster check runners pod") + + s.EventuallyWithTf(func(c *assert.CollectT) { + verifyKSMCheck(s, c) + }, 600*time.Second, 30*time.Second, "could not validate kubernetes_state_core check with api client") + }) + + s.T().Run("Logs collection works", func(t *testing.T) { + // Update DDA + ddaConfigPath, err = getAbsPath(filepath.Join(manifestsPath, "datadog-agent-logs.yaml")) + assert.NoError(t, err) + + k8s.KubectlApply(t, kubectlOptions, ddaConfigPath) + verifyAgentPods(t, kubectlOptions, nodeAgentSelector+",agent.datadoghq.com/e2e-test=datadog-agent-logs") + + // Verify logs collection on agent pod + s.EventuallyWithTf(func(c *assert.CollectT) { + agentPods, err := k8s.ListPodsE(t, kubectlOptions, v1.ListOptions{ + LabelSelector: nodeAgentSelector + ",agent.datadoghq.com/e2e-test=datadog-agent-logs", + }) + assert.NoError(c, err) + + for _, pod := range agentPods { + k8s.WaitUntilPodAvailable(t, kubectlOptions, pod.Name, 9, 15*time.Second) + + output, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "exec", "-it", pod.Name, "--", "agent", "status", "logs agent", "-j") + assert.NoError(c, err) + + verifyAgentPodLogs(c, output) + } + }, 900*time.Second, 30*time.Second, "could not validate log check on agent pod") + + s.EventuallyWithTf(func(c *assert.CollectT) { + verifyAPILogs(s, c) + }, 600*time.Second, 30*time.Second, "could not valid logs collection with api client") + + }) + + s.T().Run("APM (hostPort, local service, UDP) works", func(t *testing.T) { + var apmAgentSelector = ",agent.datadoghq.com/e2e-test=datadog-agent-apm" + // Update DDA + ddaConfigPath, err = getAbsPath(filepath.Join(manifestsPath, "apm", "datadog-agent-apm.yaml")) + assert.NoError(t, err) + k8s.KubectlApply(t, kubectlOptions, ddaConfigPath) + // Ensure agent pods are running + verifyAgentPods(t, kubectlOptions, nodeAgentSelector+apmAgentSelector) + + // Deploy trace generator + traceGenConfigPath, err := getAbsPath(filepath.Join(manifestsPath, "apm", "tracegen-deploy.yaml")) + assert.NoError(t, err) + k8s.KubectlApply(t, kubectlOptions, traceGenConfigPath) + + // Verify traces collection on agent pod + s.EventuallyWithTf(func(c *assert.CollectT) { + agentPods, err := k8s.ListPodsE(t, kubectlOptions, v1.ListOptions{ + LabelSelector: nodeAgentSelector + apmAgentSelector, + }) + assert.NoError(c, err) + + // This works because we have a single Agent pod (so located on same node as tracegen) + // Otherwise, we would need to deploy tracegen on the same node as the Agent pod / as a DaemonSet + for _, pod := range agentPods { + k8s.WaitUntilPodAvailable(t, kubectlOptions, pod.Name, 9, 15*time.Second) + + output, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "exec", "-it", pod.Name, "--", "agent", "status", "apm agent", "-j") + assert.NoError(c, err) + + verifyAgentTraces(c, output) + } + }, 180*time.Second, 30*time.Second, "could not validate traces on agent pod") // 3 minutes could be reduced even further + // Cleanup trace generator + k8s.KubectlDelete(t, kubectlOptions, traceGenConfigPath) + + // Verify traces collection with API client + // 4 queries within a minute to ensure traces are collected from the last 15 minutes + s.EventuallyWithTf(func(c *assert.CollectT) { + verifyAPISpans(s, c) + }, 60*time.Second, 15*time.Second, "could not validate traces with api client") + }) s.T().Run("Cleanup DDA", func(t *testing.T) { deleteDda(t, kubectlOptions, ddaConfigPath) @@ -400,6 +445,53 @@ func verifyAgentPodLogs(c *assert.CollectT, collectorOutput string) { assert.True(c, tailedIntegrations >= totalIntegrations*80/100, "Expected at least 80%% of integrations to be tailed, got %d/%d", tailedIntegrations, totalIntegrations) } +func verifyAgentTraces(c *assert.CollectT, collectorOutput string) { + apmAgentJson := parseCollectorJson(collectorOutput) + // The order of services in the Agent JSON output is not guaranteed. + // We use a map to assert that we have received traces for all expected services. + expectedServices := map[string]bool{ + "e2e-test-apm-hostip": true, + "e2e-test-apm-socket": true, + "e2e-test-apm-agent-service": true, + } + // Track found services + foundServices := map[string]bool{} + + if apmAgentJson != nil { + apmStats := apmAgentJson["apmStats"].(map[string]interface{})["receiver"].([]interface{}) + for _, service := range apmStats { + serviceName := service.(map[string]interface{})["Service"].(string) + tracesReceived := service.(map[string]interface{})["TracesReceived"].(float64) + // Ensure we received at least one trace for the service + assert.Greater(c, tracesReceived, float64(0), "Expected traces to be received for service %s", serviceName) + // Mark the service as found + foundServices[serviceName] = true + } + } + assert.Equal(c, expectedServices, foundServices, "The found services do not match the expected services") +} + +func verifyAPISpans(s *kindSuite, c *assert.CollectT) { + // TODO: check uniqueness of kind.ClusterName per test run + // If not unique, we can also use pod_name to filter spans + spansQuery := fmt.Sprintf("(env:e2e-operator service:e2e-test-apm* kube_cluster_name:%s)", s.Env().Kind.ClusterName) + timeNow := time.Now() + timeNowMinus15Mins := timeNow.Add(-15 * time.Minute) + timeNowMillisStr := fmt.Sprintf("%d", timeNow.UnixMilli()) + timeNowMinus15MinsMillisStr := fmt.Sprintf("%d", timeNowMinus15Mins.UnixMilli()) + // Query spans from the last 15 minutes, limit to 20 spans to reduce size of response + spansBody := datadogV2.ListSpansGetOptionalParameters{ + FilterQuery: datadog.PtrString(spansQuery), + FilterFrom: datadog.PtrString(timeNowMinus15MinsMillisStr), + FilterTo: datadog.PtrString(timeNowMillisStr), + PageLimit: datadog.PtrInt32(20), + } + resp, _, err := s.datadogClient.spansApi.ListSpansGet(s.datadogClient.ctx, spansBody) + + assert.NoError(c, err, "failed to query spans: %v", err) + assert.Greater(c, len(resp.Data), 0, fmt.Sprintf("expected spans to not be empty: %s", err)) +} + func verifyCheck(c *assert.CollectT, collectorOutput string, checkName string) { var runningChecks map[string]interface{} diff --git a/test/e2e/manifests/apm/datadog-agent-apm.yaml b/test/e2e/manifests/apm/datadog-agent-apm.yaml new file mode 100644 index 000000000..26ec679eb --- /dev/null +++ b/test/e2e/manifests/apm/datadog-agent-apm.yaml @@ -0,0 +1,39 @@ +apiVersion: datadoghq.com/v2alpha1 +kind: DatadogAgent +metadata: + name: datadog +spec: + global: + credentials: + apiSecret: + keyName: api-key + secretName: datadog-secret + appSecret: + keyName: app-key + secretName: datadog-secret + kubelet: + tlsVerify: false + features: + apm: + enabled: true + hostPortConfig: + enabled: true + hostPort: 8126 + unixDomainSocketConfig: + enabled: true + path: /var/run/datadog/apm.socket + override: + nodeAgent: + labels: + agent.datadoghq.com/e2e-test: datadog-agent-apm + containers: + agent: + env: + - name: DD_SKIP_SSL_VALIDATION + value: "true" + env: + - name: DD_CLUSTER_NAME + valueFrom: + configMapKeyRef: + name: datadog-cluster-name + key: DD_CLUSTER_NAME diff --git a/test/e2e/manifests/apm/tracegen-deploy.yaml b/test/e2e/manifests/apm/tracegen-deploy.yaml new file mode 100644 index 000000000..fb6e2e2b4 --- /dev/null +++ b/test/e2e/manifests/apm/tracegen-deploy.yaml @@ -0,0 +1,78 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: tracegen-tribrid + labels: + app: tracegen-tribrid +spec: + replicas: 1 + selector: + matchLabels: + app: tracegen-tribrid + template: + metadata: + labels: + app: tracegen-tribrid + spec: + containers: + - name: tracegen-tcp-hostip + image: ghcr.io/datadog/apps-tracegen:main + env: + # IP of the node - listened by the trace-Agent if hostPort is enabled + - name: DD_AGENT_HOST + valueFrom: + fieldRef: + fieldPath: status.hostIP + - name: DD_SERVICE + value: "e2e-test-apm-hostip" + - name: DD_ENV + value: "e2e-operator" + resources: + requests: + memory: "32Mi" + cpu: "2m" + limits: + memory: "32Mi" + cpu: "10m" + - name: tracegen-tcp-agent-service + image: ghcr.io/datadog/apps-tracegen:main + env: + # Kubernetes service of the node Agent - enabled by default with the APM feature + - name: DD_AGENT_HOST + value: "datadog-agent" + - name: DD_SERVICE + value: "e2e-test-apm-agent-service" + - name: DD_ENV + value: "e2e-operator" + resources: + requests: + memory: "32Mi" + cpu: "2m" + limits: + memory: "32Mi" + cpu: "10m" + - name: tracegen-udp + image: ghcr.io/datadog/apps-tracegen:main + # Socket of the trace-agent + env: + - name: DD_TRACE_AGENT_URL + value: "unix:///var/run/datadog/apm.socket" + - name: DD_SERVICE + value: "e2e-test-apm-socket" + - name: DD_ENV + value: "e2e-operator" + resources: + requests: + memory: "32Mi" + cpu: "2m" + limits: + memory: "32Mi" + cpu: "10m" + volumeMounts: + - name: apmsocketpath + mountPath: /var/run/datadog/apm.socket + volumes: + - name: apmsocketpath + hostPath: + path: /var/run/datadog/apm.socket + type: Socket diff --git a/test/e2e/rc_updater_test.go b/test/e2e/rc_updater_test.go index 52b87d60a..98292b460 100644 --- a/test/e2e/rc_updater_test.go +++ b/test/e2e/rc_updater_test.go @@ -11,6 +11,7 @@ package e2e import ( "fmt" "path/filepath" + "testing" "time" "github.com/DataDog/datadog-agent/test/new-e2e/pkg/e2e" @@ -31,20 +32,20 @@ type updaterSuite struct { configID string } -// func TestUpdaterSuite(t *testing.T) { +func TestUpdaterSuite(t *testing.T) { -// e2eParams := []e2e.SuiteOption{ -// e2e.WithStackName(fmt.Sprintf("operator-kind-rc-%s", k8sVersion)), -// e2e.WithProvisioner(kindProvisioner(k8sVersion, []string{"rc-e2e-manager.yaml"})), -// } + e2eParams := []e2e.SuiteOption{ + e2e.WithStackName(fmt.Sprintf("operator-kind-rc-%s", k8sVersion)), + e2e.WithProvisioner(kindProvisioner(k8sVersion, []string{"rc-e2e-manager.yaml"})), + } -// apiKey, _ := api.GetAPIKey() -// appKey, _ := api.GetAPPKey() -// require.NotEmpty(t, apiKey, "Could not get APIKey") -// require.NotEmpty(t, appKey, "Could not get APPKey") -// e2e.Run[kindEnv](t, &updaterSuite{clusterName: "rc-updater-e2e-test-cluster"}, e2eParams...) + apiKey, _ := api.GetAPIKey() + appKey, _ := api.GetAPPKey() + require.NotEmpty(t, apiKey, "Could not get APIKey") + require.NotEmpty(t, appKey, "Could not get APPKey") + e2e.Run[kindEnv](t, &updaterSuite{clusterName: "rc-updater-e2e-test-cluster"}, e2eParams...) -// } +} func (u *updaterSuite) SetupSuite() { u.BaseSuite.SetupSuite() From ccc3a555ea6e041c7189cd894d2e813a7aeb5065 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Bavelier?= Date: Wed, 8 Jan 2025 12:38:00 +0100 Subject: [PATCH 03/10] rebase on latest fanny --- test/e2e/kind_test.go | 47 ++++--------------------------------------- 1 file changed, 4 insertions(+), 43 deletions(-) diff --git a/test/e2e/kind_test.go b/test/e2e/kind_test.go index d17e0788a..986cb7a55 100644 --- a/test/e2e/kind_test.go +++ b/test/e2e/kind_test.go @@ -260,6 +260,9 @@ func (s *kindSuite) TestKindRun() { } }, 300*time.Second, 30*time.Second, "could not validate http check on agent pod") + // cleanup nginx deployment + k8s.KubectlDelete(t, kubectlOptions, nginxConfigPath) + s.EventuallyWithTf(func(c *assert.CollectT) { verifyHTTPCheck(s, c) }, 300*time.Second, 30*time.Second, "could not validate http.can_connect check with api client") @@ -375,49 +378,7 @@ func (s *kindSuite) TestKindRun() { }) - s.T().Run("APM (hostPort, local service, UDP) works", func(t *testing.T) { - var apmAgentSelector = ",agent.datadoghq.com/e2e-test=datadog-agent-apm" - // Update DDA - ddaConfigPath, err = getAbsPath(filepath.Join(manifestsPath, "apm", "datadog-agent-apm.yaml")) - assert.NoError(t, err) - k8s.KubectlApply(t, kubectlOptions, ddaConfigPath) - // Ensure agent pods are running - verifyAgentPods(t, kubectlOptions, nodeAgentSelector+apmAgentSelector) - - // Deploy trace generator - traceGenConfigPath, err := getAbsPath(filepath.Join(manifestsPath, "apm", "tracegen-deploy.yaml")) - assert.NoError(t, err) - k8s.KubectlApply(t, kubectlOptions, traceGenConfigPath) - - // Verify traces collection on agent pod - s.EventuallyWithTf(func(c *assert.CollectT) { - agentPods, err := k8s.ListPodsE(t, kubectlOptions, v1.ListOptions{ - LabelSelector: nodeAgentSelector + apmAgentSelector, - }) - assert.NoError(c, err) - - // This works because we have a single Agent pod (so located on same node as tracegen) - // Otherwise, we would need to deploy tracegen on the same node as the Agent pod / as a DaemonSet - for _, pod := range agentPods { - k8s.WaitUntilPodAvailable(t, kubectlOptions, pod.Name, 9, 15*time.Second) - - output, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "exec", "-it", pod.Name, "--", "agent", "status", "apm agent", "-j") - assert.NoError(c, err) - - verifyAgentTraces(c, output) - } - }, 180*time.Second, 30*time.Second, "could not validate traces on agent pod") // 3 minutes could be reduced even further - // Cleanup trace generator - k8s.KubectlDelete(t, kubectlOptions, traceGenConfigPath) - - // Verify traces collection with API client - // 4 queries within a minute to ensure traces are collected from the last 15 minutes - s.EventuallyWithTf(func(c *assert.CollectT) { - verifyAPISpans(s, c) - }, 60*time.Second, 15*time.Second, "could not validate traces with api client") - }) - - s.T().Run("APM (hostPort, local service, UDP) works", func(t *testing.T) { + s.T().Run("APM hostPort k8s service UDP works", func(t *testing.T) { var apmAgentSelector = ",agent.datadoghq.com/e2e-test=datadog-agent-apm" // Update DDA ddaConfigPath, err = getAbsPath(filepath.Join(manifestsPath, "apm", "datadog-agent-apm.yaml")) From ba813ff7d44ebc15f9d9d6ce830dec098f0b536d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Bavelier?= Date: Wed, 8 Jan 2025 15:04:24 +0100 Subject: [PATCH 04/10] apm e2e test refactor --- test/e2e/manifests/apm/datadog-agent-apm.yaml | 27 +------ .../new_manifests/apm/datadog-agent-apm.yaml | 19 +++++ .../new_manifests/apm/tracegen-deploy.yaml | 78 +++++++++++++++++++ test/e2e/tests/k8s_suite/k8s_suite_test.go | 67 +++++++++++++++- test/e2e/tests/utils/utils.go | 33 +++++++- 5 files changed, 196 insertions(+), 28 deletions(-) create mode 100644 test/e2e/manifests/new_manifests/apm/datadog-agent-apm.yaml create mode 100644 test/e2e/manifests/new_manifests/apm/tracegen-deploy.yaml diff --git a/test/e2e/manifests/apm/datadog-agent-apm.yaml b/test/e2e/manifests/apm/datadog-agent-apm.yaml index 26ec679eb..8de6a8a8d 100644 --- a/test/e2e/manifests/apm/datadog-agent-apm.yaml +++ b/test/e2e/manifests/apm/datadog-agent-apm.yaml @@ -1,16 +1,12 @@ apiVersion: datadoghq.com/v2alpha1 kind: DatadogAgent metadata: - name: datadog + name: datadog-agent-apm + namespace: e2e-operator + labels: + agent.datadoghq.com/e2e-test: datadog-agent-apm spec: global: - credentials: - apiSecret: - keyName: api-key - secretName: datadog-secret - appSecret: - keyName: app-key - secretName: datadog-secret kubelet: tlsVerify: false features: @@ -22,18 +18,3 @@ spec: unixDomainSocketConfig: enabled: true path: /var/run/datadog/apm.socket - override: - nodeAgent: - labels: - agent.datadoghq.com/e2e-test: datadog-agent-apm - containers: - agent: - env: - - name: DD_SKIP_SSL_VALIDATION - value: "true" - env: - - name: DD_CLUSTER_NAME - valueFrom: - configMapKeyRef: - name: datadog-cluster-name - key: DD_CLUSTER_NAME diff --git a/test/e2e/manifests/new_manifests/apm/datadog-agent-apm.yaml b/test/e2e/manifests/new_manifests/apm/datadog-agent-apm.yaml new file mode 100644 index 000000000..741bf381b --- /dev/null +++ b/test/e2e/manifests/new_manifests/apm/datadog-agent-apm.yaml @@ -0,0 +1,19 @@ +apiVersion: datadoghq.com/v2alpha1 +kind: DatadogAgent +metadata: + namespace: e2e-operator + labels: + agent.datadoghq.com/e2e-test: datadog-agent-apm +spec: + global: + kubelet: + tlsVerify: false + features: + apm: + enabled: true + hostPortConfig: + enabled: true + hostPort: 8126 + unixDomainSocketConfig: + enabled: true + path: /var/run/datadog/apm.socket diff --git a/test/e2e/manifests/new_manifests/apm/tracegen-deploy.yaml b/test/e2e/manifests/new_manifests/apm/tracegen-deploy.yaml new file mode 100644 index 000000000..fb6e2e2b4 --- /dev/null +++ b/test/e2e/manifests/new_manifests/apm/tracegen-deploy.yaml @@ -0,0 +1,78 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: tracegen-tribrid + labels: + app: tracegen-tribrid +spec: + replicas: 1 + selector: + matchLabels: + app: tracegen-tribrid + template: + metadata: + labels: + app: tracegen-tribrid + spec: + containers: + - name: tracegen-tcp-hostip + image: ghcr.io/datadog/apps-tracegen:main + env: + # IP of the node - listened by the trace-Agent if hostPort is enabled + - name: DD_AGENT_HOST + valueFrom: + fieldRef: + fieldPath: status.hostIP + - name: DD_SERVICE + value: "e2e-test-apm-hostip" + - name: DD_ENV + value: "e2e-operator" + resources: + requests: + memory: "32Mi" + cpu: "2m" + limits: + memory: "32Mi" + cpu: "10m" + - name: tracegen-tcp-agent-service + image: ghcr.io/datadog/apps-tracegen:main + env: + # Kubernetes service of the node Agent - enabled by default with the APM feature + - name: DD_AGENT_HOST + value: "datadog-agent" + - name: DD_SERVICE + value: "e2e-test-apm-agent-service" + - name: DD_ENV + value: "e2e-operator" + resources: + requests: + memory: "32Mi" + cpu: "2m" + limits: + memory: "32Mi" + cpu: "10m" + - name: tracegen-udp + image: ghcr.io/datadog/apps-tracegen:main + # Socket of the trace-agent + env: + - name: DD_TRACE_AGENT_URL + value: "unix:///var/run/datadog/apm.socket" + - name: DD_SERVICE + value: "e2e-test-apm-socket" + - name: DD_ENV + value: "e2e-operator" + resources: + requests: + memory: "32Mi" + cpu: "2m" + limits: + memory: "32Mi" + cpu: "10m" + volumeMounts: + - name: apmsocketpath + mountPath: /var/run/datadog/apm.socket + volumes: + - name: apmsocketpath + hostPath: + path: /var/run/datadog/apm.socket + type: Socket diff --git a/test/e2e/tests/k8s_suite/k8s_suite_test.go b/test/e2e/tests/k8s_suite/k8s_suite_test.go index 936c92cde..122346c67 100644 --- a/test/e2e/tests/k8s_suite/k8s_suite_test.go +++ b/test/e2e/tests/k8s_suite/k8s_suite_test.go @@ -201,8 +201,7 @@ func (s *k8sSuite) TestGenericK8s() { utils.VerifyAgentPods(s.T(), c, common.NamespaceName, s.Env().KubernetesCluster.Client(), common.NodeAgentSelector+",agent.datadoghq.com/name=dda-autodiscovery") // check agent pods for http check - agentPods, err := s.Env().KubernetesCluster.Client().CoreV1().Pods(common.NamespaceName).List(context.TODO(), metav1.ListOptions{LabelSelector: common.NodeAgentSelector + ",agent.datadoghq.com/name=dda-autodiscovery", - FieldSelector: "status.phase=Running"}) + agentPods, err := s.Env().KubernetesCluster.Client().CoreV1().Pods(common.NamespaceName).List(context.TODO(), metav1.ListOptions{LabelSelector: common.NodeAgentSelector + ",agent.datadoghq.com/name=dda-autodiscovery"}) assert.NoError(c, err) for _, pod := range agentPods.Items { @@ -254,6 +253,59 @@ func (s *k8sSuite) TestGenericK8s() { s.verifyAPILogs() }, 900*time.Second, 15*time.Second, "could not valid logs collection in time") }) + + s.T().Run("APM hostPort k8s service UDP works", func(t *testing.T) { + var apmAgentSelector = ",agent.datadoghq.com/name=datadog-agent-apm" + + ddaConfigPath, err := common.GetAbsPath(filepath.Join(common.ManifestsPath, "apm", "datadog-agent-apm.yaml")) + assert.NoError(s.T(), err) + + ddaOpts := []agentwithoperatorparams.Option{ + agentwithoperatorparams.WithDDAConfig(agentwithoperatorparams.DDAConfig{ + Name: "datadog-agent-apm", + YamlFilePath: ddaConfigPath, + }), + } + ddaOpts = append(ddaOpts, defaultDDAOpts...) + + provisionerOptions := []provisioners.KubernetesProvisionerOption{ + provisioners.WithTestName("e2e-operator-apm"), + provisioners.WithDDAOptions(ddaOpts...), + provisioners.WithYAMLWorkload(provisioners.YAMLWorkload{ + Name: "tracegen-deploy", + Path: strings.Join([]string{common.ManifestsPath, "apm", "tracegen-deploy.yaml"}, "/"), + }), + provisioners.WithLocal(s.local), + } + provisionerOptions = append(provisionerOptions, defaultProvisionerOpts...) + + // Deploy trace generator + s.UpdateEnv(provisioners.KubernetesProvisioner(provisionerOptions...)) + + // Verify traces collection on agent pod + s.EventuallyWithTf(func(c *assert.CollectT) { + // Verify tracegen deployment is running + utils.VerifyNumPodsForSelector(s.T(), c, common.NamespaceName, s.Env().KubernetesCluster.Client(), 1, "app=tracegen-tribrid") + + // Verify agent pods are running + utils.VerifyAgentPods(s.T(), c, common.NamespaceName, s.Env().KubernetesCluster.Client(), common.NodeAgentSelector+apmAgentSelector) + agentPods, err := s.Env().KubernetesCluster.Client().CoreV1().Pods(common.NamespaceName).List(context.TODO(), metav1.ListOptions{LabelSelector: common.NodeAgentSelector + apmAgentSelector}) + assert.NoError(c, err) + + // This works because we have a single Agent pod (so located on same node as tracegen) + // Otherwise, we would need to deploy tracegen on the same node as the Agent pod / as a DaemonSet + for _, pod := range agentPods.Items { + + output, _, err := s.Env().KubernetesCluster.KubernetesClient.PodExec(common.NamespaceName, pod.Name, "agent", []string{"agent", "status", "apm agent", "-j"}) + assert.NoError(c, err) + + utils.VerifyAgentTraces(c, output) + } + + // Verify traces collection with API client + s.verifyAPITraces(c) + }, 600*time.Second, 15*time.Second, "could not validate traces on agent pod") // TODO: check duration + }) } func (s *k8sSuite) verifyAPILogs() { @@ -262,6 +314,17 @@ func (s *k8sSuite) verifyAPILogs() { s.Assert().NotEmptyf(logs, fmt.Sprintf("Expected fake intake-ingested logs to not be empty: %s", err)) } +func (s *k8sSuite) verifyAPITraces(c *assert.CollectT) { + _, err := s.Env().FakeIntake.Client().GetTraces() + assert.NoError(c, err) + + // for _, trace := range traces { + // tags := lo.MapToSlice(trace.Tags, func(k string, v string) string { + // return k + ":" + v + // }) + +} + func (s *k8sSuite) verifyKSMCheck(c *assert.CollectT) { metricNames, err := s.Env().FakeIntake.Client().GetMetricNames() assert.NoError(c, err) diff --git a/test/e2e/tests/utils/utils.go b/test/e2e/tests/utils/utils.go index 6ef50cda5..c026278ba 100644 --- a/test/e2e/tests/utils/utils.go +++ b/test/e2e/tests/utils/utils.go @@ -8,13 +8,14 @@ package utils import ( "context" "fmt" + "strconv" + "strings" + "testing" + "github.com/DataDog/datadog-operator/test/e2e/common" "github.com/stretchr/testify/assert" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" kubeClient "k8s.io/client-go/kubernetes" - "strconv" - "strings" - "testing" ) func VerifyOperator(t *testing.T, c *assert.CollectT, namespace string, k8sClient kubeClient.Interface) { @@ -106,3 +107,29 @@ func VerifyAgentPodLogs(c *assert.CollectT, collectorOutput string) { totalIntegrations := len(agentLogs) assert.True(c, tailedIntegrations >= totalIntegrations*80/100, "Expected at least 80%% of integrations to be tailed, got %d/%d", tailedIntegrations, totalIntegrations) } + +func VerifyAgentTraces(c *assert.CollectT, collectorOutput string) { + apmAgentJson := common.ParseCollectorJson(collectorOutput) + // The order of services in the Agent JSON output is not guaranteed. + // We use a map to assert that we have received traces for all expected services. + expectedServices := map[string]bool{ + "e2e-test-apm-hostip": true, + "e2e-test-apm-socket": true, + "e2e-test-apm-agent-service": true, + } + // Track found services + foundServices := map[string]bool{} + + if apmAgentJson != nil { + apmStats := apmAgentJson["apmStats"].(map[string]interface{})["receiver"].([]interface{}) + for _, service := range apmStats { + serviceName := service.(map[string]interface{})["Service"].(string) + tracesReceived := service.(map[string]interface{})["TracesReceived"].(float64) + // Ensure we received at least one trace for the service + assert.Greater(c, tracesReceived, float64(0), "Expected traces to be received for service %s", serviceName) + // Mark the service as found + foundServices[serviceName] = true + } + } + assert.Equal(c, expectedServices, foundServices, "The found services do not match the expected services") +} From c4379f7ad69a820fdce27db2a624d86d15aa3ff7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Bavelier?= Date: Wed, 8 Jan 2025 16:10:51 +0100 Subject: [PATCH 05/10] fix apm refactor --- .../manifests/new_manifests/apm/tracegen-deploy.yaml | 4 +++- test/e2e/tests/k8s_suite/k8s_suite_test.go | 11 +++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/test/e2e/manifests/new_manifests/apm/tracegen-deploy.yaml b/test/e2e/manifests/new_manifests/apm/tracegen-deploy.yaml index fb6e2e2b4..75f71e89e 100644 --- a/test/e2e/manifests/new_manifests/apm/tracegen-deploy.yaml +++ b/test/e2e/manifests/new_manifests/apm/tracegen-deploy.yaml @@ -2,6 +2,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: tracegen-tribrid + namespace: e2e-operator labels: app: tracegen-tribrid spec: @@ -38,8 +39,9 @@ spec: image: ghcr.io/datadog/apps-tracegen:main env: # Kubernetes service of the node Agent - enabled by default with the APM feature + # The service is created by the Datadog Operator following convention: -agent - name: DD_AGENT_HOST - value: "datadog-agent" + value: "datadog-agent-apm-agent" - name: DD_SERVICE value: "e2e-test-apm-agent-service" - name: DD_ENV diff --git a/test/e2e/tests/k8s_suite/k8s_suite_test.go b/test/e2e/tests/k8s_suite/k8s_suite_test.go index 122346c67..bf35fc62a 100644 --- a/test/e2e/tests/k8s_suite/k8s_suite_test.go +++ b/test/e2e/tests/k8s_suite/k8s_suite_test.go @@ -315,13 +315,12 @@ func (s *k8sSuite) verifyAPILogs() { } func (s *k8sSuite) verifyAPITraces(c *assert.CollectT) { - _, err := s.Env().FakeIntake.Client().GetTraces() - assert.NoError(c, err) + traces, err := s.Env().FakeIntake.Client().GetTraces() + s.Assert().NoError(err) + s.Assert().NotEmptyf(traces, fmt.Sprintf("Expected fake intake-ingested traces to not be empty: %s", err)) - // for _, trace := range traces { - // tags := lo.MapToSlice(trace.Tags, func(k string, v string) string { - // return k + ":" + v - // }) + // TODO: implement finer trace verification by checking tags + // https://github.com/DataDog/datadog-agent/blob/271a3aa2b5ec9c00c3d845a048c652e4b21e6659/test/new-e2e/tests/containers/k8s_test.go#L1451 } From d7598c7ffbda979d2017f8b16044e573603851c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Bavelier?= Date: Wed, 8 Jan 2025 16:16:21 +0100 Subject: [PATCH 06/10] remove pre-refactor apm test --- test/e2e/kind_test.go | 66 ---------------- test/e2e/manifests/apm/datadog-agent-apm.yaml | 20 ----- test/e2e/manifests/apm/tracegen-deploy.yaml | 78 ------------------- 3 files changed, 164 deletions(-) delete mode 100644 test/e2e/manifests/apm/datadog-agent-apm.yaml delete mode 100644 test/e2e/manifests/apm/tracegen-deploy.yaml diff --git a/test/e2e/kind_test.go b/test/e2e/kind_test.go index 986cb7a55..cb8218b75 100644 --- a/test/e2e/kind_test.go +++ b/test/e2e/kind_test.go @@ -25,7 +25,6 @@ import ( "github.com/DataDog/datadog-agent/test/new-e2e/pkg/runner/parameters" "github.com/DataDog/datadog-api-client-go/v2/api/datadog" "github.com/DataDog/datadog-api-client-go/v2/api/datadogV1" - "github.com/DataDog/datadog-api-client-go/v2/api/datadogV2" "github.com/DataDog/test-infra-definitions/common/utils" localKubernetes "github.com/DataDog/test-infra-definitions/components/kubernetes" resAws "github.com/DataDog/test-infra-definitions/resources/aws" @@ -54,7 +53,6 @@ type datadogClient struct { ctx context.Context metricsApi *datadogV1.MetricsApi logsApi *datadogV1.LogsApi - spansApi *datadogV2.SpansApi } func (suite *kindSuite) SetupSuite() { @@ -78,7 +76,6 @@ func (suite *kindSuite) SetupSuite() { client := datadog.NewAPIClient(configuration) suite.datadogClient.metricsApi = datadogV1.NewMetricsApi(client) suite.datadogClient.logsApi = datadogV1.NewLogsApi(client) - suite.datadogClient.spansApi = datadogV2.NewSpansApi(client) } func TestKindSuite(t *testing.T) { @@ -378,48 +375,6 @@ func (s *kindSuite) TestKindRun() { }) - s.T().Run("APM hostPort k8s service UDP works", func(t *testing.T) { - var apmAgentSelector = ",agent.datadoghq.com/e2e-test=datadog-agent-apm" - // Update DDA - ddaConfigPath, err = getAbsPath(filepath.Join(manifestsPath, "apm", "datadog-agent-apm.yaml")) - assert.NoError(t, err) - k8s.KubectlApply(t, kubectlOptions, ddaConfigPath) - // Ensure agent pods are running - verifyAgentPods(t, kubectlOptions, nodeAgentSelector+apmAgentSelector) - - // Deploy trace generator - traceGenConfigPath, err := getAbsPath(filepath.Join(manifestsPath, "apm", "tracegen-deploy.yaml")) - assert.NoError(t, err) - k8s.KubectlApply(t, kubectlOptions, traceGenConfigPath) - - // Verify traces collection on agent pod - s.EventuallyWithTf(func(c *assert.CollectT) { - agentPods, err := k8s.ListPodsE(t, kubectlOptions, v1.ListOptions{ - LabelSelector: nodeAgentSelector + apmAgentSelector, - }) - assert.NoError(c, err) - - // This works because we have a single Agent pod (so located on same node as tracegen) - // Otherwise, we would need to deploy tracegen on the same node as the Agent pod / as a DaemonSet - for _, pod := range agentPods { - k8s.WaitUntilPodAvailable(t, kubectlOptions, pod.Name, 9, 15*time.Second) - - output, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "exec", "-it", pod.Name, "--", "agent", "status", "apm agent", "-j") - assert.NoError(c, err) - - verifyAgentTraces(c, output) - } - }, 180*time.Second, 30*time.Second, "could not validate traces on agent pod") // 3 minutes could be reduced even further - // Cleanup trace generator - k8s.KubectlDelete(t, kubectlOptions, traceGenConfigPath) - - // Verify traces collection with API client - // 4 queries within a minute to ensure traces are collected from the last 15 minutes - s.EventuallyWithTf(func(c *assert.CollectT) { - verifyAPISpans(s, c) - }, 60*time.Second, 15*time.Second, "could not validate traces with api client") - }) - s.T().Run("Cleanup DDA", func(t *testing.T) { deleteDda(t, kubectlOptions, ddaConfigPath) }) @@ -476,27 +431,6 @@ func verifyAgentTraces(c *assert.CollectT, collectorOutput string) { assert.Equal(c, expectedServices, foundServices, "The found services do not match the expected services") } -func verifyAPISpans(s *kindSuite, c *assert.CollectT) { - // TODO: check uniqueness of kind.ClusterName per test run - // If not unique, we can also use pod_name to filter spans - spansQuery := fmt.Sprintf("(env:e2e-operator service:e2e-test-apm* kube_cluster_name:%s)", s.Env().Kind.ClusterName) - timeNow := time.Now() - timeNowMinus15Mins := timeNow.Add(-15 * time.Minute) - timeNowMillisStr := fmt.Sprintf("%d", timeNow.UnixMilli()) - timeNowMinus15MinsMillisStr := fmt.Sprintf("%d", timeNowMinus15Mins.UnixMilli()) - // Query spans from the last 15 minutes, limit to 20 spans to reduce size of response - spansBody := datadogV2.ListSpansGetOptionalParameters{ - FilterQuery: datadog.PtrString(spansQuery), - FilterFrom: datadog.PtrString(timeNowMinus15MinsMillisStr), - FilterTo: datadog.PtrString(timeNowMillisStr), - PageLimit: datadog.PtrInt32(20), - } - resp, _, err := s.datadogClient.spansApi.ListSpansGet(s.datadogClient.ctx, spansBody) - - assert.NoError(c, err, "failed to query spans: %v", err) - assert.Greater(c, len(resp.Data), 0, fmt.Sprintf("expected spans to not be empty: %s", err)) -} - func verifyCheck(c *assert.CollectT, collectorOutput string, checkName string) { var runningChecks map[string]interface{} diff --git a/test/e2e/manifests/apm/datadog-agent-apm.yaml b/test/e2e/manifests/apm/datadog-agent-apm.yaml deleted file mode 100644 index 8de6a8a8d..000000000 --- a/test/e2e/manifests/apm/datadog-agent-apm.yaml +++ /dev/null @@ -1,20 +0,0 @@ -apiVersion: datadoghq.com/v2alpha1 -kind: DatadogAgent -metadata: - name: datadog-agent-apm - namespace: e2e-operator - labels: - agent.datadoghq.com/e2e-test: datadog-agent-apm -spec: - global: - kubelet: - tlsVerify: false - features: - apm: - enabled: true - hostPortConfig: - enabled: true - hostPort: 8126 - unixDomainSocketConfig: - enabled: true - path: /var/run/datadog/apm.socket diff --git a/test/e2e/manifests/apm/tracegen-deploy.yaml b/test/e2e/manifests/apm/tracegen-deploy.yaml deleted file mode 100644 index fb6e2e2b4..000000000 --- a/test/e2e/manifests/apm/tracegen-deploy.yaml +++ /dev/null @@ -1,78 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: tracegen-tribrid - labels: - app: tracegen-tribrid -spec: - replicas: 1 - selector: - matchLabels: - app: tracegen-tribrid - template: - metadata: - labels: - app: tracegen-tribrid - spec: - containers: - - name: tracegen-tcp-hostip - image: ghcr.io/datadog/apps-tracegen:main - env: - # IP of the node - listened by the trace-Agent if hostPort is enabled - - name: DD_AGENT_HOST - valueFrom: - fieldRef: - fieldPath: status.hostIP - - name: DD_SERVICE - value: "e2e-test-apm-hostip" - - name: DD_ENV - value: "e2e-operator" - resources: - requests: - memory: "32Mi" - cpu: "2m" - limits: - memory: "32Mi" - cpu: "10m" - - name: tracegen-tcp-agent-service - image: ghcr.io/datadog/apps-tracegen:main - env: - # Kubernetes service of the node Agent - enabled by default with the APM feature - - name: DD_AGENT_HOST - value: "datadog-agent" - - name: DD_SERVICE - value: "e2e-test-apm-agent-service" - - name: DD_ENV - value: "e2e-operator" - resources: - requests: - memory: "32Mi" - cpu: "2m" - limits: - memory: "32Mi" - cpu: "10m" - - name: tracegen-udp - image: ghcr.io/datadog/apps-tracegen:main - # Socket of the trace-agent - env: - - name: DD_TRACE_AGENT_URL - value: "unix:///var/run/datadog/apm.socket" - - name: DD_SERVICE - value: "e2e-test-apm-socket" - - name: DD_ENV - value: "e2e-operator" - resources: - requests: - memory: "32Mi" - cpu: "2m" - limits: - memory: "32Mi" - cpu: "10m" - volumeMounts: - - name: apmsocketpath - mountPath: /var/run/datadog/apm.socket - volumes: - - name: apmsocketpath - hostPath: - path: /var/run/datadog/apm.socket - type: Socket From 0c08145295bb47551ec67de47ca3021fe4a2919b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Bavelier?= Date: Wed, 8 Jan 2025 16:17:33 +0100 Subject: [PATCH 07/10] remove apm pre refactor util func --- test/e2e/kind_test.go | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/test/e2e/kind_test.go b/test/e2e/kind_test.go index cb8218b75..d57022542 100644 --- a/test/e2e/kind_test.go +++ b/test/e2e/kind_test.go @@ -405,32 +405,6 @@ func verifyAgentPodLogs(c *assert.CollectT, collectorOutput string) { assert.True(c, tailedIntegrations >= totalIntegrations*80/100, "Expected at least 80%% of integrations to be tailed, got %d/%d", tailedIntegrations, totalIntegrations) } -func verifyAgentTraces(c *assert.CollectT, collectorOutput string) { - apmAgentJson := parseCollectorJson(collectorOutput) - // The order of services in the Agent JSON output is not guaranteed. - // We use a map to assert that we have received traces for all expected services. - expectedServices := map[string]bool{ - "e2e-test-apm-hostip": true, - "e2e-test-apm-socket": true, - "e2e-test-apm-agent-service": true, - } - // Track found services - foundServices := map[string]bool{} - - if apmAgentJson != nil { - apmStats := apmAgentJson["apmStats"].(map[string]interface{})["receiver"].([]interface{}) - for _, service := range apmStats { - serviceName := service.(map[string]interface{})["Service"].(string) - tracesReceived := service.(map[string]interface{})["TracesReceived"].(float64) - // Ensure we received at least one trace for the service - assert.Greater(c, tracesReceived, float64(0), "Expected traces to be received for service %s", serviceName) - // Mark the service as found - foundServices[serviceName] = true - } - } - assert.Equal(c, expectedServices, foundServices, "The found services do not match the expected services") -} - func verifyCheck(c *assert.CollectT, collectorOutput string, checkName string) { var runningChecks map[string]interface{} From 1822d969b37d9eeb411d790b35b664ddb1af0c76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Bavelier?= Date: Wed, 8 Jan 2025 16:22:16 +0100 Subject: [PATCH 08/10] remove nginx cleanup since it's for pre refactor test --- test/e2e/kind_test.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/test/e2e/kind_test.go b/test/e2e/kind_test.go index d57022542..a4d7b5f35 100644 --- a/test/e2e/kind_test.go +++ b/test/e2e/kind_test.go @@ -257,9 +257,6 @@ func (s *kindSuite) TestKindRun() { } }, 300*time.Second, 30*time.Second, "could not validate http check on agent pod") - // cleanup nginx deployment - k8s.KubectlDelete(t, kubectlOptions, nginxConfigPath) - s.EventuallyWithTf(func(c *assert.CollectT) { verifyHTTPCheck(s, c) }, 300*time.Second, 30*time.Second, "could not validate http.can_connect check with api client") From 8ee16e0080de80ca1185e238b8ec7904135816df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Bavelier?= Date: Thu, 9 Jan 2025 11:24:22 +0100 Subject: [PATCH 09/10] fix apm e2e refactor: avoid race + remove socket from volume def --- .../new_manifests/apm/tracegen-deploy.yaml | 7 ++-- test/e2e/tests/k8s_suite/k8s_suite_test.go | 35 +++++++++++-------- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/test/e2e/manifests/new_manifests/apm/tracegen-deploy.yaml b/test/e2e/manifests/new_manifests/apm/tracegen-deploy.yaml index 75f71e89e..222cb057b 100644 --- a/test/e2e/manifests/new_manifests/apm/tracegen-deploy.yaml +++ b/test/e2e/manifests/new_manifests/apm/tracegen-deploy.yaml @@ -55,8 +55,8 @@ spec: cpu: "10m" - name: tracegen-udp image: ghcr.io/datadog/apps-tracegen:main - # Socket of the trace-agent env: + # Socket of the trace-agent - name: DD_TRACE_AGENT_URL value: "unix:///var/run/datadog/apm.socket" - name: DD_SERVICE @@ -72,9 +72,8 @@ spec: cpu: "10m" volumeMounts: - name: apmsocketpath - mountPath: /var/run/datadog/apm.socket + mountPath: /var/run/datadog volumes: - name: apmsocketpath hostPath: - path: /var/run/datadog/apm.socket - type: Socket + path: /var/run/datadog/ \ No newline at end of file diff --git a/test/e2e/tests/k8s_suite/k8s_suite_test.go b/test/e2e/tests/k8s_suite/k8s_suite_test.go index bf35fc62a..4140b6b34 100644 --- a/test/e2e/tests/k8s_suite/k8s_suite_test.go +++ b/test/e2e/tests/k8s_suite/k8s_suite_test.go @@ -201,7 +201,8 @@ func (s *k8sSuite) TestGenericK8s() { utils.VerifyAgentPods(s.T(), c, common.NamespaceName, s.Env().KubernetesCluster.Client(), common.NodeAgentSelector+",agent.datadoghq.com/name=dda-autodiscovery") // check agent pods for http check - agentPods, err := s.Env().KubernetesCluster.Client().CoreV1().Pods(common.NamespaceName).List(context.TODO(), metav1.ListOptions{LabelSelector: common.NodeAgentSelector + ",agent.datadoghq.com/name=dda-autodiscovery"}) + agentPods, err := s.Env().KubernetesCluster.Client().CoreV1().Pods(common.NamespaceName).List(context.TODO(), metav1.ListOptions{LabelSelector: common.NodeAgentSelector + ",agent.datadoghq.com/name=dda-autodiscovery", + FieldSelector: "status.phase=Running"}) assert.NoError(c, err) for _, pod := range agentPods.Items { @@ -255,8 +256,18 @@ func (s *k8sSuite) TestGenericK8s() { }) s.T().Run("APM hostPort k8s service UDP works", func(t *testing.T) { - var apmAgentSelector = ",agent.datadoghq.com/name=datadog-agent-apm" + // Cleanup to avoid potential lingering DatadogAgent + // Avoid race with the new Agent not being able to bind to the hostPort + withoutDDAProvisionerOptions := []provisioners.KubernetesProvisionerOption{ + provisioners.WithTestName("e2e-operator-apm"), + provisioners.WithoutDDA(), + provisioners.WithLocal(s.local), + } + withoutDDAProvisionerOptions = append(withoutDDAProvisionerOptions, defaultProvisionerOpts...) + s.UpdateEnv(provisioners.KubernetesProvisioner(withoutDDAProvisionerOptions...)) + + var apmAgentSelector = ",agent.datadoghq.com/name=datadog-agent-apm" ddaConfigPath, err := common.GetAbsPath(filepath.Join(common.ManifestsPath, "apm", "datadog-agent-apm.yaml")) assert.NoError(s.T(), err) @@ -268,7 +279,7 @@ func (s *k8sSuite) TestGenericK8s() { } ddaOpts = append(ddaOpts, defaultDDAOpts...) - provisionerOptions := []provisioners.KubernetesProvisionerOption{ + ddaProvisionerOptions := []provisioners.KubernetesProvisionerOption{ provisioners.WithTestName("e2e-operator-apm"), provisioners.WithDDAOptions(ddaOpts...), provisioners.WithYAMLWorkload(provisioners.YAMLWorkload{ @@ -277,10 +288,10 @@ func (s *k8sSuite) TestGenericK8s() { }), provisioners.WithLocal(s.local), } - provisionerOptions = append(provisionerOptions, defaultProvisionerOpts...) + ddaProvisionerOptions = append(ddaProvisionerOptions, defaultProvisionerOpts...) - // Deploy trace generator - s.UpdateEnv(provisioners.KubernetesProvisioner(provisionerOptions...)) + // Deploy APM DatadogAgent and tracegen + s.UpdateEnv(provisioners.KubernetesProvisioner(ddaProvisionerOptions...)) // Verify traces collection on agent pod s.EventuallyWithTf(func(c *assert.CollectT) { @@ -289,7 +300,7 @@ func (s *k8sSuite) TestGenericK8s() { // Verify agent pods are running utils.VerifyAgentPods(s.T(), c, common.NamespaceName, s.Env().KubernetesCluster.Client(), common.NodeAgentSelector+apmAgentSelector) - agentPods, err := s.Env().KubernetesCluster.Client().CoreV1().Pods(common.NamespaceName).List(context.TODO(), metav1.ListOptions{LabelSelector: common.NodeAgentSelector + apmAgentSelector}) + agentPods, err := s.Env().KubernetesCluster.Client().CoreV1().Pods(common.NamespaceName).List(context.TODO(), metav1.ListOptions{LabelSelector: common.NodeAgentSelector + apmAgentSelector, FieldSelector: "status.phase=Running"}) assert.NoError(c, err) // This works because we have a single Agent pod (so located on same node as tracegen) @@ -302,7 +313,7 @@ func (s *k8sSuite) TestGenericK8s() { utils.VerifyAgentTraces(c, output) } - // Verify traces collection with API client + // Verify traces collection ingestion by fakeintake s.verifyAPITraces(c) }, 600*time.Second, 15*time.Second, "could not validate traces on agent pod") // TODO: check duration }) @@ -316,12 +327,8 @@ func (s *k8sSuite) verifyAPILogs() { func (s *k8sSuite) verifyAPITraces(c *assert.CollectT) { traces, err := s.Env().FakeIntake.Client().GetTraces() - s.Assert().NoError(err) - s.Assert().NotEmptyf(traces, fmt.Sprintf("Expected fake intake-ingested traces to not be empty: %s", err)) - - // TODO: implement finer trace verification by checking tags - // https://github.com/DataDog/datadog-agent/blob/271a3aa2b5ec9c00c3d845a048c652e4b21e6659/test/new-e2e/tests/containers/k8s_test.go#L1451 - + assert.NoError(c, err) + assert.NotEmptyf(c, traces, fmt.Sprintf("Expected fake intake-ingested traces to not be empty: %s", err)) } func (s *k8sSuite) verifyKSMCheck(c *assert.CollectT) { From c3c50d1aa63433ce7c862912a4a137c8db703f0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Bavelier?= Date: Fri, 10 Jan 2025 11:29:20 +0100 Subject: [PATCH 10/10] verify k8s version before checking local service traces --- test/e2e/tests/utils/utils.go | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/test/e2e/tests/utils/utils.go b/test/e2e/tests/utils/utils.go index c026278ba..98f1e04e6 100644 --- a/test/e2e/tests/utils/utils.go +++ b/test/e2e/tests/utils/utils.go @@ -108,14 +108,33 @@ func VerifyAgentPodLogs(c *assert.CollectT, collectorOutput string) { assert.True(c, tailedIntegrations >= totalIntegrations*80/100, "Expected at least 80%% of integrations to be tailed, got %d/%d", tailedIntegrations, totalIntegrations) } +// isInternalTrafficPolicySupported checks if the internalTrafficPolicy field is supported in the current Kubernetes version. +// This is accomplished by checking if the Kubernetes minor version is >= 22. +func isInternalTrafficPolicySupported() bool { + k8sVersion := common.K8sVersion + splits := strings.Split(k8sVersion, ".") + // Avoid panics by checking if the version is in the expected format (X.Y) + if len(splits) < 2 { + return false + } + minorVersion, err := strconv.Atoi(splits[1]) + if err != nil { + return false + } + return minorVersion >= 22 +} + func VerifyAgentTraces(c *assert.CollectT, collectorOutput string) { apmAgentJson := common.ParseCollectorJson(collectorOutput) // The order of services in the Agent JSON output is not guaranteed. // We use a map to assert that we have received traces for all expected services. expectedServices := map[string]bool{ - "e2e-test-apm-hostip": true, - "e2e-test-apm-socket": true, - "e2e-test-apm-agent-service": true, + "e2e-test-apm-hostip": true, + "e2e-test-apm-socket": true, + } + // On Kubernetes >= 1.22, the node Agent k8s service is created since internalTrafficPolicy is supported. + if isInternalTrafficPolicySupported() { + expectedServices["e2e-test-apm-agent-service"] = true } // Track found services foundServices := map[string]bool{}