diff --git a/api/datadoghq/v2alpha1/datadogagent_types.go b/api/datadoghq/v2alpha1/datadogagent_types.go index 857767b44e..461f98fdb9 100644 --- a/api/datadoghq/v2alpha1/datadogagent_types.go +++ b/api/datadoghq/v2alpha1/datadogagent_types.go @@ -82,6 +82,8 @@ type DatadogFeatures struct { SBOM *SBOMFeatureConfig `json:"sbom,omitempty"` // ServiceDiscovery ServiceDiscovery *ServiceDiscoveryFeatureConfig `json:"serviceDiscovery,omitempty"` + // GPU monitoring + GPU *GPUFeatureConfig `json:"gpu,omitempty"` // Cluster-level features @@ -498,6 +500,20 @@ type ServiceDiscoveryFeatureConfig struct { Enabled *bool `json:"enabled,omitempty"` } +// GPUFeatureConfig contains the GPU monitoring configuration. +type GPUFeatureConfig struct { + // Enabled enables GPU monitoring. + // Default: false + // +optional + Enabled *bool `json:"enabled,omitempty"` + + // PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature. + // If the value is an empty string, the runtime class is not set. + // Default: nvidia + // +optional + PodRuntimeClassName *string `json:"requiredRuntimeClassName,omitempty"` +} + // DogstatsdFeatureConfig contains the Dogstatsd configuration parameters. 
// +k8s:openapi-gen=true type DogstatsdFeatureConfig struct { diff --git a/api/datadoghq/v2alpha1/zz_generated.deepcopy.go b/api/datadoghq/v2alpha1/zz_generated.deepcopy.go index a36ac97786..cc3d92e54e 100644 --- a/api/datadoghq/v2alpha1/zz_generated.deepcopy.go +++ b/api/datadoghq/v2alpha1/zz_generated.deepcopy.go @@ -1232,6 +1232,11 @@ func (in *DatadogFeatures) DeepCopyInto(out *DatadogFeatures) { *out = new(ServiceDiscoveryFeatureConfig) (*in).DeepCopyInto(*out) } + if in.GPU != nil { + in, out := &in.GPU, &out.GPU + *out = new(GPUFeatureConfig) + (*in).DeepCopyInto(*out) + } if in.EventCollection != nil { in, out := &in.EventCollection, &out.EventCollection *out = new(EventCollectionFeatureConfig) @@ -1545,6 +1550,31 @@ func (in *FIPSConfig) DeepCopy() *FIPSConfig { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GPUFeatureConfig) DeepCopyInto(out *GPUFeatureConfig) { + *out = *in + if in.Enabled != nil { + in, out := &in.Enabled, &out.Enabled + *out = new(bool) + **out = **in + } + if in.PodRuntimeClassName != nil { + in, out := &in.PodRuntimeClassName, &out.PodRuntimeClassName + *out = new(string) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUFeatureConfig. +func (in *GPUFeatureConfig) DeepCopy() *GPUFeatureConfig { + if in == nil { + return nil + } + out := new(GPUFeatureConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *GlobalConfig) DeepCopyInto(out *GlobalConfig) { *out = *in diff --git a/api/datadoghq/v2alpha1/zz_generated.openapi.go b/api/datadoghq/v2alpha1/zz_generated.openapi.go index 9dfdc495f3..f1aa9771c3 100644 --- a/api/datadoghq/v2alpha1/zz_generated.openapi.go +++ b/api/datadoghq/v2alpha1/zz_generated.openapi.go @@ -675,6 +675,12 @@ func schema_datadog_operator_api_datadoghq_v2alpha1_DatadogFeatures(ref common.R Ref: ref("github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ServiceDiscoveryFeatureConfig"), }, }, + "gpu": { + SchemaProps: spec.SchemaProps{ + Description: "GPU monitoring", + Ref: ref("github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.GPUFeatureConfig"), + }, + }, "eventCollection": { SchemaProps: spec.SchemaProps{ Description: "EventCollection configuration.", @@ -733,7 +739,7 @@ func schema_datadog_operator_api_datadoghq_v2alpha1_DatadogFeatures(ref common.R }, }, Dependencies: []string{ - "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.APMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ASMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.AdmissionControllerFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.AutoscalingFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.CSPMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.CWSFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ClusterChecksFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.DogstatsdFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.EBPFCheckFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.EventCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ExternalMetricsServerFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.HelmCheckFeatureConfig", 
"github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.KubeStateMetricsCoreFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.LiveContainerCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.LiveProcessCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.LogCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.NPMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OOMKillFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OTLPFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OrchestratorExplorerFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OtelCollectorFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ProcessDiscoveryFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.PrometheusScrapeFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.RemoteConfigurationFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.SBOMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ServiceDiscoveryFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.TCPQueueLengthFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.USMFeatureConfig"}, + "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.APMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ASMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.AdmissionControllerFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.AutoscalingFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.CSPMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.CWSFeatureConfig", 
"github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ClusterChecksFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.DogstatsdFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.EBPFCheckFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.EventCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ExternalMetricsServerFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.GPUFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.HelmCheckFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.KubeStateMetricsCoreFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.LiveContainerCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.LiveProcessCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.LogCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.NPMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OOMKillFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OTLPFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OrchestratorExplorerFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OtelCollectorFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ProcessDiscoveryFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.PrometheusScrapeFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.RemoteConfigurationFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.SBOMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ServiceDiscoveryFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.TCPQueueLengthFeatureConfig", 
"github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.USMFeatureConfig"}, } } diff --git a/config/crd/bases/v1/datadoghq.com_datadogagents.yaml b/config/crd/bases/v1/datadoghq.com_datadogagents.yaml index 679f7ce12a..383dc41888 100644 --- a/config/crd/bases/v1/datadoghq.com_datadogagents.yaml +++ b/config/crd/bases/v1/datadoghq.com_datadogagents.yaml @@ -1019,6 +1019,21 @@ spec: Default: false type: boolean type: object + gpu: + description: GPU monitoring + properties: + enabled: + description: |- + Enabled enables GPU monitoring. + Default: false + type: boolean + requiredRuntimeClassName: + description: |- + PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature. + If the value is an empty string, the runtime class is not set. + Default: nvidia + type: string + type: object helmCheck: description: HelmCheck configuration. properties: @@ -7883,6 +7898,21 @@ spec: Default: false type: boolean type: object + gpu: + description: GPU monitoring + properties: + enabled: + description: |- + Enabled enables GPU monitoring. + Default: false + type: boolean + requiredRuntimeClassName: + description: |- + PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature. + If the value is an empty string, the runtime class is not set. + Default: nvidia + type: string + type: object helmCheck: description: HelmCheck configuration. 
properties: diff --git a/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json b/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json index 02d401ef45..324d0267b2 100644 --- a/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json +++ b/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json @@ -1065,6 +1065,21 @@ }, "type": "object" }, + "gpu": { + "additionalProperties": false, + "description": "GPU monitoring", + "properties": { + "enabled": { + "description": "Enabled enables GPU monitoring.\nDefault: false", + "type": "boolean" + }, + "requiredRuntimeClassName": { + "description": "PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature.\nIf the value is an empty string, the runtime class is not set.\nDefault: nvidia", + "type": "string" + } + }, + "type": "object" + }, "helmCheck": { "additionalProperties": false, "description": "HelmCheck configuration.", @@ -7871,6 +7886,21 @@ }, "type": "object" }, + "gpu": { + "additionalProperties": false, + "description": "GPU monitoring", + "properties": { + "enabled": { + "description": "Enabled enables GPU monitoring.\nDefault: false", + "type": "boolean" + }, + "requiredRuntimeClassName": { + "description": "PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature.\nIf the value is an empty string, the runtime class is not set.\nDefault: nvidia", + "type": "string" + } + }, + "type": "object" + }, "helmCheck": { "additionalProperties": false, "description": "HelmCheck configuration.", diff --git a/docs/configuration.v2alpha1.md b/docs/configuration.v2alpha1.md index 1e37d145cd..2932d7d5ce 100644 --- a/docs/configuration.v2alpha1.md +++ b/docs/configuration.v2alpha1.md @@ -111,6 +111,8 @@ spec: | features.externalMetricsServer.registerAPIService | RegisterAPIService registers the External Metrics endpoint as an APIService Default: true | | features.externalMetricsServer.useDatadogMetrics | UseDatadogMetrics enables 
usage of the DatadogMetrics CRD (allowing one to scale on arbitrary Datadog metric queries). Default: true | | features.externalMetricsServer.wpaController | WPAController enables the informer and controller of the Watermark Pod Autoscaler. NOTE: The Watermark Pod Autoscaler controller needs to be installed. See also: https://github.com/DataDog/watermarkpodautoscaler. Default: false | +| features.gpu.enabled | Enables GPU monitoring. Default: false | +| features.gpu.requiredRuntimeClassName | PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature. If the value is an empty string, the runtime class is not set. Default: nvidia | | features.helmCheck.collectEvents | CollectEvents set to `true` enables event collection in the Helm check (Requires Agent 7.36.0+ and Cluster Agent 1.20.0+) Default: false | | features.helmCheck.enabled | Enables the Helm check. Default: false | | features.helmCheck.valuesAsTags | ValuesAsTags collects Helm values from a release and uses them as tags (Requires Agent and Cluster Agent 7.40.0+). 
Default: {} | diff --git a/examples/datadogagent/datadog-agent-all.yaml b/examples/datadogagent/datadog-agent-all.yaml index ea0cff3c00..dd786ed542 100644 --- a/examples/datadogagent/datadog-agent-all.yaml +++ b/examples/datadogagent/datadog-agent-all.yaml @@ -47,6 +47,8 @@ spec: enabled: true serviceDiscovery: enabled: true + gpu: + enabled: true eventCollection: collectKubernetesEvents: true orchestratorExplorer: diff --git a/internal/controller/datadogagent/controller.go b/internal/controller/datadogagent/controller.go index 7c29bdd319..6eb7f2e659 100644 --- a/internal/controller/datadogagent/controller.go +++ b/internal/controller/datadogagent/controller.go @@ -34,6 +34,7 @@ import ( _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/enabledefault" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/eventcollection" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/externalmetrics" + _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/gpu" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/helmcheck" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/kubernetesstatecore" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/livecontainer" diff --git a/internal/controller/datadogagent/defaults/datadogagent_default.go b/internal/controller/datadogagent/defaults/datadogagent_default.go index 1ebe569709..26e118105b 100644 --- a/internal/controller/datadogagent/defaults/datadogagent_default.go +++ b/internal/controller/datadogagent/defaults/datadogagent_default.go @@ -37,6 +37,8 @@ const ( defaultEBPFCheckEnabled bool = false + defaultGPUMonitoringEnabled bool = false + defaultServiceDiscoveryEnabled bool = false defaultAPMEnabled bool = true @@ -265,6 +267,12 @@ func defaultFeaturesConfig(ddaSpec *v2alpha1.DatadogAgentSpec) { } 
apiutils.DefaultBooleanIfUnset(&ddaSpec.Features.ServiceDiscovery.Enabled, defaultServiceDiscoveryEnabled) + // GPU monitoring feature + if ddaSpec.Features.GPU == nil { + ddaSpec.Features.GPU = &v2alpha1.GPUFeatureConfig{} + } + apiutils.DefaultBooleanIfUnset(&ddaSpec.Features.GPU.Enabled, defaultGPUMonitoringEnabled) + // APM Feature // APM is enabled by default if ddaSpec.Features.APM == nil { diff --git a/internal/controller/datadogagent/defaults/datadogagent_default_test.go b/internal/controller/datadogagent/defaults/datadogagent_default_test.go index 98d61fd6b1..18310a1167 100644 --- a/internal/controller/datadogagent/defaults/datadogagent_default_test.go +++ b/internal/controller/datadogagent/defaults/datadogagent_default_test.go @@ -198,6 +198,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPU: &v2alpha1.GPUFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -333,6 +336,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPU: &v2alpha1.GPUFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(valueFalse), }, @@ -423,6 +429,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPU: &v2alpha1.GPUFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(valueFalse), }, @@ -549,6 +558,9 @@ func Test_defaultFeatures(t *testing.T) { 
ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPU: &v2alpha1.GPUFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -696,6 +708,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPU: &v2alpha1.GPUFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -838,6 +853,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPU: &v2alpha1.GPUFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(valueTrue), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -980,6 +998,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPU: &v2alpha1.GPUFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -1131,6 +1152,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPU: &v2alpha1.GPUFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: 
apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -1273,6 +1297,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPU: &v2alpha1.GPUFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -1418,6 +1445,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPU: &v2alpha1.GPUFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -1602,6 +1632,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPU: &v2alpha1.GPUFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, CSPM: &v2alpha1.CSPMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultCSPMEnabled), }, @@ -1717,6 +1750,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPU: &v2alpha1.GPUFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -1860,6 +1896,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPU: &v2alpha1.GPUFeatureConfig{ + Enabled: 
apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -1979,6 +2018,7 @@ func Test_defaultFeatures(t *testing.T) { OOMKill: &v2alpha1.OOMKillFeatureConfig{}, TCPQueueLength: &v2alpha1.TCPQueueLengthFeatureConfig{}, EBPFCheck: &v2alpha1.EBPFCheckFeatureConfig{}, + GPU: &v2alpha1.GPUFeatureConfig{}, ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{}, APM: &v2alpha1.APMFeatureConfig{}, ASM: &v2alpha1.ASMFeatureConfig{}, @@ -2024,6 +2064,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPU: &v2alpha1.GPUFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -2169,6 +2212,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPU: &v2alpha1.GPUFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ diff --git a/internal/controller/datadogagent/feature/gpu/const.go b/internal/controller/datadogagent/feature/gpu/const.go new file mode 100644 index 0000000000..fedde4d703 --- /dev/null +++ b/internal/controller/datadogagent/feature/gpu/const.go @@ -0,0 +1,15 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. 
+ +package gpu + +const ( + nvidiaDevicesMountPath = "/var/run/nvidia-container-devices/all" + nvidiaDevicesVolumeName = "nvidia-devices" + devNullPath = "/dev/null" // host path mounted at nvidiaDevicesMountPath in the container; its content is irrelevant, the mount is just used as a "signal" to the nvidia runtime to use the nvidia devices + + // defaultGPURuntimeClass default runtime class for GPU pods + defaultGPURuntimeClass = "nvidia" +) diff --git a/internal/controller/datadogagent/feature/gpu/envvar.go b/internal/controller/datadogagent/feature/gpu/envvar.go new file mode 100644 index 0000000000..5c8a0b96f6 --- /dev/null +++ b/internal/controller/datadogagent/feature/gpu/envvar.go @@ -0,0 +1,9 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. + +package gpu + +const DDEnableGPUMonitoringEnvVar = "DD_GPU_MONITORING_ENABLED" +const NVIDIAVisibleDevicesEnvVar = "NVIDIA_VISIBLE_DEVICES" diff --git a/internal/controller/datadogagent/feature/gpu/feature.go b/internal/controller/datadogagent/feature/gpu/feature.go new file mode 100644 index 0000000000..8aebca2128 --- /dev/null +++ b/internal/controller/datadogagent/feature/gpu/feature.go @@ -0,0 +1,163 @@ +package gpu + +import ( + corev1 "k8s.io/api/core/v1" + + apicommon "github.com/DataDog/datadog-operator/api/datadoghq/common" + "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1" + apiutils "github.com/DataDog/datadog-operator/api/utils" + "github.com/DataDog/datadog-operator/internal/controller/datadogagent/component/agent" + "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature" + "github.com/DataDog/datadog-operator/internal/controller/datadogagent/object/volume" +) + +func init() { + if err := feature.Register(feature.GPUIDType, buildFeature); err != nil { + panic(err) + } +} + +func 
buildFeature(*feature.Options) feature.Feature { + return &gpuFeature{} +} + +type gpuFeature struct { + // podRuntimeClassName is the value to set in the runtimeClassName + // configuration of the agent pod. If this is empty, the runtimeClassName + // will not be changed. + podRuntimeClassName string +} + +// ID returns the ID of the Feature +func (f *gpuFeature) ID() feature.IDType { + return feature.GPUIDType +} + +// Configure is used to configure the feature from a v2alpha1.DatadogAgent instance. +func (f *gpuFeature) Configure(dda *v2alpha1.DatadogAgent) (reqComp feature.RequiredComponents) { + if dda.Spec.Features == nil || dda.Spec.Features.GPU == nil || !apiutils.BoolValue(dda.Spec.Features.GPU.Enabled) { + return reqComp + } + + reqComp.Agent = feature.RequiredComponent{ + IsRequired: apiutils.NewBoolPointer(true), + Containers: []apicommon.AgentContainerName{apicommon.CoreAgentContainerName, apicommon.SystemProbeContainerName}, + } + + if dda.Spec.Features.GPU.PodRuntimeClassName == nil { + // Configuration option not set, so revert to the default + f.podRuntimeClassName = defaultGPURuntimeClass + } else { + // Configuration option set, use the value. Note that here the value might be an empty + // string, which tells us to not change the runtime class. + f.podRuntimeClassName = *dda.Spec.Features.GPU.PodRuntimeClassName + } + + return reqComp +} + +// ManageDependencies allows a feature to manage its dependencies. +// Feature's dependencies should be added in the store. +func (f *gpuFeature) ManageDependencies(feature.ResourceManagers, feature.RequiredComponents) error { + return nil +} + +// ManageClusterAgent allows a feature to configure the ClusterAgent's corev1.PodTemplateSpec +// It should do nothing if the feature doesn't need to configure it. 
+func (f *gpuFeature) ManageClusterAgent(feature.PodTemplateManagers) error { + return nil +} + +func configureSystemProbe(managers feature.PodTemplateManagers) { + // annotations + managers.Annotation().AddAnnotation(v2alpha1.SystemProbeAppArmorAnnotationKey, v2alpha1.SystemProbeAppArmorAnnotationValue) + + // security context capabilities + managers.SecurityContext().AddCapabilitiesToContainer(agent.DefaultCapabilitiesForSystemProbe(), apicommon.SystemProbeContainerName) + + // procdir and socket volume mounts (the socket needs write perms for the system probe container but not the others) + procdirVol, procdirMount := volume.GetVolumes(v2alpha1.ProcdirVolumeName, v2alpha1.ProcdirHostPath, v2alpha1.ProcdirMountPath, true) + managers.VolumeMount().AddVolumeMountToContainer(&procdirMount, apicommon.SystemProbeContainerName) + managers.Volume().AddVolume(&procdirVol) + + socketVol, socketVolMount := volume.GetVolumesEmptyDir(v2alpha1.SystemProbeSocketVolumeName, v2alpha1.SystemProbeSocketVolumePath, false) + managers.Volume().AddVolume(&socketVol) + managers.VolumeMount().AddVolumeMountToContainer(&socketVolMount, apicommon.SystemProbeContainerName) + + _, socketVolMountReadOnly := volume.GetVolumesEmptyDir(v2alpha1.SystemProbeSocketVolumeName, v2alpha1.SystemProbeSocketVolumePath, true) + managers.VolumeMount().AddVolumeMountToContainer(&socketVolMountReadOnly, apicommon.CoreAgentContainerName) + + socketEnvVar := &corev1.EnvVar{ + Name: v2alpha1.DDSystemProbeSocket, + Value: v2alpha1.DefaultSystemProbeSocketPath, + } + + managers.EnvVar().AddEnvVarToContainer(apicommon.CoreAgentContainerName, socketEnvVar) + managers.EnvVar().AddEnvVarToContainer(apicommon.SystemProbeContainerName, socketEnvVar) +} + +// ManageNodeAgent allows a feature to configure the Node Agent's corev1.PodTemplateSpec +// It should do nothing if the feature doesn't need to configure it. 
+func (f *gpuFeature) ManageNodeAgent(managers feature.PodTemplateManagers, _ string) error { + configureSystemProbe(managers) + + // env var to enable the GPU module + enableEnvVar := &corev1.EnvVar{ + Name: DDEnableGPUMonitoringEnvVar, + Value: "true", + } + + // Both in the core agent and the system probe + managers.EnvVar().AddEnvVarToContainers([]apicommon.AgentContainerName{apicommon.CoreAgentContainerName, apicommon.SystemProbeContainerName}, enableEnvVar) + + // The agent check does not need to be manually enabled, the init config container will + // check if GPU monitoring is enabled and will enable the check automatically (see + // Dockerfiles/agent/cont-init.d/60-sysprobe-check.sh in the datadog-agent repo). + managers.EnvVar().AddEnvVarToInitContainer(apicommon.InitConfigContainerName, enableEnvVar) + + // Now we need to add the NVIDIA_VISIBLE_DEVICES env var to both agents again so + // that the nvidia runtime can expose the GPU devices in the container + nvidiaVisibleDevicesEnvVar := &corev1.EnvVar{ + Name: NVIDIAVisibleDevicesEnvVar, + Value: "all", + } + + managers.EnvVar().AddEnvVarToContainers([]apicommon.AgentContainerName{apicommon.CoreAgentContainerName, apicommon.SystemProbeContainerName}, nvidiaVisibleDevicesEnvVar) + + // Some nvidia-container-runtime setups ignore the NVIDIA_VISIBLE_DEVICES + // env variable. This is usually configured with the options + // accept-nvidia-visible-devices-envvar-when-unprivileged = false + // accept-nvidia-visible-devices-as-volume-mounts = true + // in the NVIDIA container runtime config. In this case, we need to mount the + // /var/run/nvidia-container-devices/all path into the container, so that + // the nvidia-container-runtime can see that we want to use all GPUs. 
+ devicesVol, devicesMount := volume.GetVolumes(nvidiaDevicesVolumeName, devNullPath, nvidiaDevicesMountPath, true) + managers.Volume().AddVolume(&devicesVol) + managers.VolumeMount().AddVolumeMountToContainers(&devicesMount, []apicommon.AgentContainerName{apicommon.CoreAgentContainerName, apicommon.SystemProbeContainerName}) + + // Configure the runtime class for the pod + if f.podRuntimeClassName != "" { + managers.PodTemplateSpec().Spec.RuntimeClassName = &f.podRuntimeClassName + } + + // Note: we don't need to mount the NVML library, as it's mounted + // automatically by the nvidia-container-runtime. However, if needed, we + // could add a config option for that and mount that in the agent and + // system-probe folders, and then set the correct configuration option so + // that the binaries can find the library. + + return nil +} + +// ManageSingleContainerNodeAgent allows a feature to configure the Agent container for the Node Agent's corev1.PodTemplateSpec +// if SingleContainerStrategy is enabled and can be used with the configured feature set. +// It should do nothing if the feature doesn't need to configure it. +func (f *gpuFeature) ManageSingleContainerNodeAgent(feature.PodTemplateManagers, string) error { + return nil +} + +// ManageClusterChecksRunner allows a feature to configure the ClusterChecksRunner's corev1.PodTemplateSpec +// It should do nothing if the feature doesn't need to configure it. 
+func (f *gpuFeature) ManageClusterChecksRunner(feature.PodTemplateManagers) error {
+	return nil
+}
diff --git a/internal/controller/datadogagent/feature/gpu/feature_test.go b/internal/controller/datadogagent/feature/gpu/feature_test.go
new file mode 100644
index 0000000000..2202bf7dc9
--- /dev/null
+++ b/internal/controller/datadogagent/feature/gpu/feature_test.go
@@ -0,0 +1,187 @@
+package gpu
+
+import (
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"github.com/stretchr/testify/assert"
+	corev1 "k8s.io/api/core/v1"
+
+	apicommon "github.com/DataDog/datadog-operator/api/datadoghq/common"
+	"github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1"
+	apiutils "github.com/DataDog/datadog-operator/api/utils"
+	"github.com/DataDog/datadog-operator/internal/controller/datadogagent/component/agent"
+	"github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature"
+	"github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/fake"
+	"github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/test"
+)
+
+const alternativeRuntimeClass = "nvidia-like"
+
+// Test_GPUMonitoringFeature_Configure runs the GPU feature through the shared feature test suite with the
+// feature disabled, enabled with defaults, and enabled with an alternative or an empty runtime class name.
+func Test_GPUMonitoringFeature_Configure(t *testing.T) {
+	ddaGPUMonitoringDisabled := v2alpha1.DatadogAgent{
+		Spec: v2alpha1.DatadogAgentSpec{
+			Features: &v2alpha1.DatadogFeatures{
+				GPU: &v2alpha1.GPUFeatureConfig{
+					Enabled: apiutils.NewBoolPointer(false),
+				},
+			},
+		},
+	}
+	ddaGPUMonitoringEnabled := ddaGPUMonitoringDisabled.DeepCopy()
+	ddaGPUMonitoringEnabled.Spec.Features.GPU.Enabled = apiutils.NewBoolPointer(true)
+
+	ddaGPUMonitoringEnabledAlternativeRuntimeClass := ddaGPUMonitoringEnabled.DeepCopy()
+	ddaGPUMonitoringEnabledAlternativeRuntimeClass.Spec.Features.GPU.PodRuntimeClassName = apiutils.NewStringPointer(alternativeRuntimeClass)
+
+	ddaGPUMonitoringEnabledNoRuntimeClass := ddaGPUMonitoringEnabled.DeepCopy()
+	ddaGPUMonitoringEnabledNoRuntimeClass.Spec.Features.GPU.PodRuntimeClassName = apiutils.NewStringPointer("")
+
+	// GPUMonitoringAgentNodeWantFunc checks the capabilities, volume mounts, volumes, env vars and runtime class
+	// recorded by the fake managers; an empty expectedRuntimeClass means no runtime class must be set.
+	GPUMonitoringAgentNodeWantFunc := func(t testing.TB, mgrInterface feature.PodTemplateManagers, expectedRuntimeClass string) {
+		mgr := mgrInterface.(*fake.PodTemplateManagers)
+
+		// check security context capabilities
+		sysProbeCapabilities := mgr.SecurityContextMgr.CapabilitiesByC[apicommon.SystemProbeContainerName]
+		assert.True(
+			t,
+			apiutils.IsEqualStruct(sysProbeCapabilities, agent.DefaultCapabilitiesForSystemProbe()),
+			"System Probe security context capabilities \ndiff = %s",
+			cmp.Diff(sysProbeCapabilities, agent.DefaultCapabilitiesForSystemProbe()),
+		)
+
+		// check volume mounts
+		wantCoreAgentVolMounts := []corev1.VolumeMount{
+			{
+				Name:      v2alpha1.SystemProbeSocketVolumeName,
+				MountPath: v2alpha1.SystemProbeSocketVolumePath,
+				ReadOnly:  true,
+			},
+			{
+				Name:      nvidiaDevicesVolumeName,
+				MountPath: nvidiaDevicesMountPath,
+				ReadOnly:  true,
+			},
+		}
+
+		wantSystemProbeVolMounts := []corev1.VolumeMount{
+			{
+				Name:      v2alpha1.ProcdirVolumeName,
+				MountPath: v2alpha1.ProcdirMountPath,
+				ReadOnly:  true,
+			},
+			{
+				Name:      v2alpha1.SystemProbeSocketVolumeName,
+				MountPath: v2alpha1.SystemProbeSocketVolumePath,
+				ReadOnly:  false,
+			},
+			{
+				Name:      nvidiaDevicesVolumeName,
+				MountPath: nvidiaDevicesMountPath,
+				ReadOnly:  true,
+			},
+		}
+
+		coreAgentVolumeMounts := mgr.VolumeMountMgr.VolumeMountsByC[apicommon.CoreAgentContainerName]
+		assert.True(t, apiutils.IsEqualStruct(coreAgentVolumeMounts, wantCoreAgentVolMounts), "Core agent volume mounts \ndiff = %s", cmp.Diff(coreAgentVolumeMounts, wantCoreAgentVolMounts))
+
+		systemProbeVolumeMounts := mgr.VolumeMountMgr.VolumeMountsByC[apicommon.SystemProbeContainerName]
+		assert.True(t, apiutils.IsEqualStruct(systemProbeVolumeMounts, wantSystemProbeVolMounts), "System Probe volume mounts \ndiff = %s", cmp.Diff(systemProbeVolumeMounts, wantSystemProbeVolMounts))
+
+		// check volumes
+		wantVolumes := []corev1.Volume{
+			{
+				Name: v2alpha1.ProcdirVolumeName,
+				VolumeSource: corev1.VolumeSource{
+					HostPath: &corev1.HostPathVolumeSource{
+						Path: v2alpha1.ProcdirHostPath,
+					},
+				},
+			},
+			{
+				Name: v2alpha1.SystemProbeSocketVolumeName,
+				VolumeSource: corev1.VolumeSource{
+					EmptyDir: &corev1.EmptyDirVolumeSource{},
+				},
+			},
+			{
+				Name: nvidiaDevicesVolumeName,
+				VolumeSource: corev1.VolumeSource{
+					HostPath: &corev1.HostPathVolumeSource{
+						Path: devNullPath,
+					},
+				},
+			},
+		}
+
+		volumes := mgr.VolumeMgr.Volumes
+		assert.True(t, apiutils.IsEqualStruct(volumes, wantVolumes), "Volumes \ndiff = %s", cmp.Diff(volumes, wantVolumes))
+
+		// check env vars
+		wantEnvVars := []*corev1.EnvVar{
+			{
+				Name:  v2alpha1.DDSystemProbeSocket,
+				Value: v2alpha1.DefaultSystemProbeSocketPath,
+			},
+			{
+				Name:  DDEnableGPUMonitoringEnvVar,
+				Value: "true",
+			},
+			{
+				Name:  NVIDIAVisibleDevicesEnvVar,
+				Value: "all",
+			},
+		}
+		agentEnvVars := mgr.EnvVarMgr.EnvVarsByC[apicommon.CoreAgentContainerName]
+		assert.True(t, apiutils.IsEqualStruct(agentEnvVars, wantEnvVars), "Agent envvars \ndiff = %s", cmp.Diff(agentEnvVars, wantEnvVars))
+
+		systemProbeEnvVars := mgr.EnvVarMgr.EnvVarsByC[apicommon.SystemProbeContainerName]
+		assert.True(t, apiutils.IsEqualStruct(systemProbeEnvVars, wantEnvVars), "System Probe envvars \ndiff = %s", cmp.Diff(systemProbeEnvVars, wantEnvVars))
+
+		// Check runtime class. The dereference is guarded so that a missing runtime
+		// class is reported as a test failure instead of a nil-pointer panic.
+		if expectedRuntimeClass == "" {
+			assert.Nil(t, mgr.PodTemplateSpec().Spec.RuntimeClassName)
+		} else if runtimeClass := mgr.PodTemplateSpec().Spec.RuntimeClassName; assert.NotNil(t, runtimeClass) {
+			assert.Equal(t, expectedRuntimeClass, *runtimeClass)
+		}
+	}
+
+	tests := test.FeatureTestSuite{
+		{
+			Name:          "gpu monitoring not enabled",
+			DDA:           ddaGPUMonitoringDisabled.DeepCopy(),
+			WantConfigure: false,
+		},
+		{
+			Name:          "gpu monitoring enabled",
+			DDA:           ddaGPUMonitoringEnabled,
+			WantConfigure: true,
+			Agent: test.NewDefaultComponentTest().WithWantFunc(func(t testing.TB, mgrInterface feature.PodTemplateManagers) {
+				GPUMonitoringAgentNodeWantFunc(t, mgrInterface, defaultGPURuntimeClass)
+			}),
+		},
+		{
+			Name:          "gpu monitoring enabled, alternative runtime class",
+			DDA:           ddaGPUMonitoringEnabledAlternativeRuntimeClass,
+			WantConfigure: true,
+			Agent: test.NewDefaultComponentTest().WithWantFunc(func(t testing.TB, mgrInterface feature.PodTemplateManagers) {
+				GPUMonitoringAgentNodeWantFunc(t, mgrInterface, alternativeRuntimeClass)
+			}),
+		},
+
+		{
+			Name:          "gpu monitoring enabled, no runtime class",
+			DDA:           ddaGPUMonitoringEnabledNoRuntimeClass,
+			WantConfigure: true,
+			Agent: test.NewDefaultComponentTest().WithWantFunc(func(t testing.TB, mgrInterface feature.PodTemplateManagers) {
+				GPUMonitoringAgentNodeWantFunc(t, mgrInterface, "")
+			}),
+		},
+	}
+
+	tests.Run(t, buildFeature)
+}
diff --git a/internal/controller/datadogagent/feature/ids.go b/internal/controller/datadogagent/feature/ids.go
index b395d720de..274599b08f 100644
--- a/internal/controller/datadogagent/feature/ids.go
+++ b/internal/controller/datadogagent/feature/ids.go
@@ -71,4 +71,6 @@ const (
 	DummyIDType = "dummy"
 	// ServiceDiscoveryType service discovery feature.
 	ServiceDiscoveryType = "service_discovery"
+	// GPUIDType GPU monitoring feature.
+ GPUIDType = "gpu" ) diff --git a/internal/controller/datadogagent/feature/test/factory_test.go b/internal/controller/datadogagent/feature/test/factory_test.go index 8d0491d174..5b6ed0e58e 100644 --- a/internal/controller/datadogagent/feature/test/factory_test.go +++ b/internal/controller/datadogagent/feature/test/factory_test.go @@ -12,6 +12,7 @@ import ( _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/apm" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/cspm" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/enabledefault" + _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/gpu" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/livecontainer" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/npm" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/otelcollector" @@ -295,6 +296,22 @@ func TestBuilder(t *testing.T) { common.AgentDataPlaneContainerName: false, }, }, + { + name: "GPU monitoring enabled, 4 agents", + dda: testutils.NewDatadogAgentBuilder(). + WithGPUMonitoringEnabled(true). 
+				BuildWithDefaults(),
+			wantAgentContainer: map[common.AgentContainerName]bool{
+				common.UnprivilegedSingleAgentContainerName: false,
+				common.CoreAgentContainerName:               true,
+				common.ProcessAgentContainerName:            true,
+				common.TraceAgentContainerName:              true,
+				common.SystemProbeContainerName:             true,
+				common.SecurityAgentContainerName:           false,
+				common.OtelAgent:                            false,
+				common.AgentDataPlaneContainerName:          false,
+			},
+		},
 	}
 
 	for _, tt := range tests {
@@ -304,7 +321,7 @@ func TestBuilder(t *testing.T) {
 			assert.True(t, *requiredComponents.Agent.IsRequired)
 
 			for name, required := range tt.wantAgentContainer {
-				assert.Equal(t, required, wantAgentContainer(name, requiredComponents), "Check", name)
+				assert.Equal(t, required, wantAgentContainer(name, requiredComponents), "container %s", name)
 			}
 		})
 	}
diff --git a/internal/controller/datadogagent_controller_test.go b/internal/controller/datadogagent_controller_test.go
index b8442043aa..97d7e6def4 100644
--- a/internal/controller/datadogagent_controller_test.go
+++ b/internal/controller/datadogagent_controller_test.go
@@ -168,6 +168,11 @@ var _ = Describe("V2 Controller - DatadogAgent Deployment", func() {
 		"with overrides",
 		testFunction(testutils.NewDatadogAgentWithOverrides(namespace, "with-overrides")),
 	)
+
+	Context(
+		"with GPU monitoring",
+		testFunction(testutils.NewDatadogAgentWithGPUMonitoring(namespace, "with-gpu-monitoring")),
+	)
 })
 
 func testFunction(agent v2alpha1.DatadogAgent) func() {
diff --git a/internal/controller/testutils/agent.go b/internal/controller/testutils/agent.go
index 89244d7255..9bdf855fc6 100644
--- a/internal/controller/testutils/agent.go
+++ b/internal/controller/testutils/agent.go
@@ -351,6 +351,19 @@ func NewDatadogAgentWithUSM(namespace string, name string) v2alpha1.DatadogAgent
 	)
 }
 
+// NewDatadogAgentWithGPUMonitoring returns the agent built by newDatadogAgentWithFeatures for a feature set that only enables GPU monitoring.
+func NewDatadogAgentWithGPUMonitoring(namespace string, name string) v2alpha1.DatadogAgent {
+	return newDatadogAgentWithFeatures(
+		namespace,
+		name,
+		&v2alpha1.DatadogFeatures{
+			GPU: &v2alpha1.GPUFeatureConfig{
+				Enabled: apiutils.NewBoolPointer(true),
+			},
+		},
+	)
+}
+
 // NewDatadogAgentWithGlobalConfigSettings returns an agent with some global
 // settings set
 func NewDatadogAgentWithGlobalConfigSettings(namespace string, name string) v2alpha1.DatadogAgent {
diff --git a/pkg/testutils/builder.go b/pkg/testutils/builder.go
index 28a2f4e61d..e049886a27 100644
--- a/pkg/testutils/builder.go
+++ b/pkg/testutils/builder.go
@@ -949,3 +949,17 @@ func (builder *DatadogAgentBuilder) WithFIPS(fipsConfig v2alpha1.FIPSConfig) *Da
 	builder.datadogAgent.Spec.Global.FIPS = &fipsConfig
 	return builder
 }
+
+// GPU: initGPUMonitoring creates Spec.Features.GPU when it is nil; WithGPUMonitoringEnabled sets its Enabled flag and returns the builder.
+
+func (builder *DatadogAgentBuilder) initGPUMonitoring() {
+	if builder.datadogAgent.Spec.Features.GPU == nil {
+		builder.datadogAgent.Spec.Features.GPU = &v2alpha1.GPUFeatureConfig{}
+	}
+}
+
+func (builder *DatadogAgentBuilder) WithGPUMonitoringEnabled(enabled bool) *DatadogAgentBuilder {
+	builder.initGPUMonitoring()
+	builder.datadogAgent.Spec.Features.GPU.Enabled = apiutils.NewBoolPointer(enabled)
+	return builder
+}