From 4f6af2040070b996ba21133af1f6ea2b8659f0ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Bavelier?= <97530782+tbavelier@users.noreply.github.com> Date: Fri, 10 Jan 2025 17:37:36 +0100 Subject: [PATCH] [CECO-1153] APM e2e test (#1549) * comment lots of stuff to simplify dev * apm e2e test and un-comment for dev * rebase on latest fanny * apm e2e test refactor * fix apm refactor * remove pre-refactor apm test * remove apm pre refactor util func * remove nginx cleanup since it's for pre refactor test * fix apm e2e refactor: avoid race + remove socket from volume def * verify k8s version before checking local service traces --------- Co-authored-by: Fanny Jiang --- test/e2e/kind_test.go | 3 +- .../new_manifests/apm/datadog-agent-apm.yaml | 19 +++++ .../new_manifests/apm/tracegen-deploy.yaml | 79 +++++++++++++++++++ test/e2e/tests/k8s_suite/k8s_suite_test.go | 69 ++++++++++++++++ test/e2e/tests/utils/utils.go | 52 +++++++++++- 5 files changed, 218 insertions(+), 4 deletions(-) create mode 100644 test/e2e/manifests/new_manifests/apm/datadog-agent-apm.yaml create mode 100644 test/e2e/manifests/new_manifests/apm/tracegen-deploy.yaml diff --git a/test/e2e/kind_test.go b/test/e2e/kind_test.go index 4b09317cf..a4d7b5f35 100644 --- a/test/e2e/kind_test.go +++ b/test/e2e/kind_test.go @@ -11,13 +11,14 @@ package e2e import ( "context" "fmt" - "github.com/DataDog/datadog-agent/test/new-e2e/pkg/provisioners" "path/filepath" "strconv" "strings" "testing" "time" + "github.com/DataDog/datadog-agent/test/new-e2e/pkg/provisioners" + "github.com/DataDog/datadog-agent/test/new-e2e/pkg/components" "github.com/DataDog/datadog-agent/test/new-e2e/pkg/e2e" "github.com/DataDog/datadog-agent/test/new-e2e/pkg/runner" diff --git a/test/e2e/manifests/new_manifests/apm/datadog-agent-apm.yaml b/test/e2e/manifests/new_manifests/apm/datadog-agent-apm.yaml new file mode 100644 index 000000000..741bf381b --- /dev/null +++ b/test/e2e/manifests/new_manifests/apm/datadog-agent-apm.yaml @@ -0,0 +1,19 @@ +apiVersion: datadoghq.com/v2alpha1 +kind: DatadogAgent +metadata: + namespace: e2e-operator + labels: + agent.datadoghq.com/e2e-test: datadog-agent-apm +spec: + global: + kubelet: + tlsVerify: false + features: + apm: + enabled: true + hostPortConfig: + enabled: true + hostPort: 8126 + unixDomainSocketConfig: + enabled: true + path: /var/run/datadog/apm.socket diff --git a/test/e2e/manifests/new_manifests/apm/tracegen-deploy.yaml b/test/e2e/manifests/new_manifests/apm/tracegen-deploy.yaml new file mode 100644 index 000000000..222cb057b --- /dev/null +++ b/test/e2e/manifests/new_manifests/apm/tracegen-deploy.yaml @@ -0,0 +1,79 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: tracegen-tribrid + namespace: e2e-operator + labels: + app: tracegen-tribrid +spec: + replicas: 1 + selector: + matchLabels: + app: tracegen-tribrid + template: + metadata: + labels: + app: tracegen-tribrid + spec: + containers: + - name: tracegen-tcp-hostip + image: ghcr.io/datadog/apps-tracegen:main + env: + # IP of the node - listened by the trace-Agent if hostPort is enabled + - name: DD_AGENT_HOST + valueFrom: + fieldRef: + fieldPath: status.hostIP + - name: DD_SERVICE + value: "e2e-test-apm-hostip" + - name: DD_ENV + value: "e2e-operator" + resources: + requests: + memory: "32Mi" + cpu: "2m" + limits: + memory: "32Mi" + cpu: "10m" + - name: tracegen-tcp-agent-service + image: ghcr.io/datadog/apps-tracegen:main + env: + # Kubernetes service of the node Agent - enabled by default with the APM feature + # The service is created by the Datadog Operator following convention: -agent + - name: DD_AGENT_HOST + value: "datadog-agent-apm-agent" + - name: DD_SERVICE + value: "e2e-test-apm-agent-service" + - name: DD_ENV + value: "e2e-operator" + resources: + requests: + memory: "32Mi" + cpu: "2m" + limits: + memory: "32Mi" + cpu: "10m" + - name: tracegen-udp + image: ghcr.io/datadog/apps-tracegen:main + env: + # Socket of the trace-agent + - name: DD_TRACE_AGENT_URL + value: "unix:///var/run/datadog/apm.socket" + - name: DD_SERVICE + value: "e2e-test-apm-socket" + - name: DD_ENV + value: "e2e-operator" + resources: + requests: + memory: "32Mi" + cpu: "2m" + limits: + memory: "32Mi" + cpu: "10m" + volumeMounts: + - name: apmsocketpath + mountPath: /var/run/datadog + volumes: + - name: apmsocketpath + hostPath: + path: /var/run/datadog/ \ No newline at end of file diff --git a/test/e2e/tests/k8s_suite/k8s_suite_test.go b/test/e2e/tests/k8s_suite/k8s_suite_test.go index 936c92cde..4140b6b34 100644 --- a/test/e2e/tests/k8s_suite/k8s_suite_test.go +++ b/test/e2e/tests/k8s_suite/k8s_suite_test.go @@ -254,6 +254,69 @@ func (s *k8sSuite) TestGenericK8s() { s.verifyAPILogs() }, 900*time.Second, 15*time.Second, "could not valid logs collection in time") }) + + s.T().Run("APM hostPort k8s service UDP works", func(t *testing.T) { + + // Cleanup to avoid potential lingering DatadogAgent + // Avoid race with the new Agent not being able to bind to the hostPort + withoutDDAProvisionerOptions := []provisioners.KubernetesProvisionerOption{ + provisioners.WithTestName("e2e-operator-apm"), + provisioners.WithoutDDA(), + provisioners.WithLocal(s.local), + } + withoutDDAProvisionerOptions = append(withoutDDAProvisionerOptions, defaultProvisionerOpts...) + s.UpdateEnv(provisioners.KubernetesProvisioner(withoutDDAProvisionerOptions...)) + + var apmAgentSelector = ",agent.datadoghq.com/name=datadog-agent-apm" + ddaConfigPath, err := common.GetAbsPath(filepath.Join(common.ManifestsPath, "apm", "datadog-agent-apm.yaml")) + assert.NoError(s.T(), err) + + ddaOpts := []agentwithoperatorparams.Option{ + agentwithoperatorparams.WithDDAConfig(agentwithoperatorparams.DDAConfig{ + Name: "datadog-agent-apm", + YamlFilePath: ddaConfigPath, + }), + } + ddaOpts = append(ddaOpts, defaultDDAOpts...) + + ddaProvisionerOptions := []provisioners.KubernetesProvisionerOption{ + provisioners.WithTestName("e2e-operator-apm"), + provisioners.WithDDAOptions(ddaOpts...), + provisioners.WithYAMLWorkload(provisioners.YAMLWorkload{ + Name: "tracegen-deploy", + Path: strings.Join([]string{common.ManifestsPath, "apm", "tracegen-deploy.yaml"}, "/"), + }), + provisioners.WithLocal(s.local), + } + ddaProvisionerOptions = append(ddaProvisionerOptions, defaultProvisionerOpts...) + + // Deploy APM DatadogAgent and tracegen + s.UpdateEnv(provisioners.KubernetesProvisioner(ddaProvisionerOptions...)) + + // Verify traces collection on agent pod + s.EventuallyWithTf(func(c *assert.CollectT) { + // Verify tracegen deployment is running + utils.VerifyNumPodsForSelector(s.T(), c, common.NamespaceName, s.Env().KubernetesCluster.Client(), 1, "app=tracegen-tribrid") + + // Verify agent pods are running + utils.VerifyAgentPods(s.T(), c, common.NamespaceName, s.Env().KubernetesCluster.Client(), common.NodeAgentSelector+apmAgentSelector) + agentPods, err := s.Env().KubernetesCluster.Client().CoreV1().Pods(common.NamespaceName).List(context.TODO(), metav1.ListOptions{LabelSelector: common.NodeAgentSelector + apmAgentSelector, FieldSelector: "status.phase=Running"}) + assert.NoError(c, err) + + // This works because we have a single Agent pod (so located on same node as tracegen) + // Otherwise, we would need to deploy tracegen on the same node as the Agent pod / as a DaemonSet + for _, pod := range agentPods.Items { + + output, _, err := s.Env().KubernetesCluster.KubernetesClient.PodExec(common.NamespaceName, pod.Name, "agent", []string{"agent", "status", "apm agent", "-j"}) + assert.NoError(c, err) + + utils.VerifyAgentTraces(c, output) + } + + // Verify traces collection ingestion by fakeintake + s.verifyAPITraces(c) + }, 600*time.Second, 15*time.Second, "could not validate traces on agent pod") // TODO: check duration + }) } func (s *k8sSuite) verifyAPILogs() { @@ -262,6 +325,12 @@ func (s *k8sSuite) verifyAPILogs() { s.Assert().NotEmptyf(logs, fmt.Sprintf("Expected fake intake-ingested logs to not be empty: %s", err)) } +func (s *k8sSuite) verifyAPITraces(c *assert.CollectT) { + traces, err := s.Env().FakeIntake.Client().GetTraces() + assert.NoError(c, err) + assert.NotEmptyf(c, traces, fmt.Sprintf("Expected fake intake-ingested traces to not be empty: %s", err)) +} + func (s *k8sSuite) verifyKSMCheck(c *assert.CollectT) { metricNames, err := s.Env().FakeIntake.Client().GetMetricNames() assert.NoError(c, err) diff --git a/test/e2e/tests/utils/utils.go b/test/e2e/tests/utils/utils.go index 6ef50cda5..98f1e04e6 100644 --- a/test/e2e/tests/utils/utils.go +++ b/test/e2e/tests/utils/utils.go @@ -8,13 +8,14 @@ package utils import ( "context" "fmt" + "strconv" + "strings" + "testing" + "github.com/DataDog/datadog-operator/test/e2e/common" "github.com/stretchr/testify/assert" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" kubeClient "k8s.io/client-go/kubernetes" - "strconv" - "strings" - "testing" ) func VerifyOperator(t *testing.T, c *assert.CollectT, namespace string, k8sClient kubeClient.Interface) { @@ -106,3 +107,48 @@ func VerifyAgentPodLogs(c *assert.CollectT, collectorOutput string) { totalIntegrations := len(agentLogs) assert.True(c, tailedIntegrations >= totalIntegrations*80/100, "Expected at least 80%% of integrations to be tailed, got %d/%d", tailedIntegrations, totalIntegrations) } + +// isInternalTrafficPolicySupported checks if the internalTrafficPolicy field is supported in the current Kubernetes version. +// This is accomplished by checking if the Kubernetes minor version is >= 22. +func isInternalTrafficPolicySupported() bool { + k8sVersion := common.K8sVersion + splits := strings.Split(k8sVersion, ".") + // Avoid panics by checking if the version is in the expected format (X.Y) + if len(splits) < 2 { + return false + } + minorVersion, err := strconv.Atoi(splits[1]) + if err != nil { + return false + } + return minorVersion >= 22 +} + +func VerifyAgentTraces(c *assert.CollectT, collectorOutput string) { + apmAgentJson := common.ParseCollectorJson(collectorOutput) + // The order of services in the Agent JSON output is not guaranteed. + // We use a map to assert that we have received traces for all expected services. + expectedServices := map[string]bool{ + "e2e-test-apm-hostip": true, + "e2e-test-apm-socket": true, + } + // On Kubernetes >= 1.22, the node Agent k8s service is created since internalTrafficPolicy is supported. + if isInternalTrafficPolicySupported() { + expectedServices["e2e-test-apm-agent-service"] = true + } + // Track found services + foundServices := map[string]bool{} + + if apmAgentJson != nil { + apmStats := apmAgentJson["apmStats"].(map[string]interface{})["receiver"].([]interface{}) + for _, service := range apmStats { + serviceName := service.(map[string]interface{})["Service"].(string) + tracesReceived := service.(map[string]interface{})["TracesReceived"].(float64) + // Ensure we received at least one trace for the service + assert.Greater(c, tracesReceived, float64(0), "Expected traces to be received for service %s", serviceName) + // Mark the service as found + foundServices[serviceName] = true + } + } + assert.Equal(c, expectedServices, foundServices, "The found services do not match the expected services") +}