Skip to content

Commit

Permalink
Add support for GPU monitoring (#1601)
Browse files Browse the repository at this point in the history
* Add support for GPU feature

* Add tests for runtime class changes

* Documentation

* Update docs

* Update api/datadoghq/v2alpha1/datadogagent_types.go

Co-authored-by: Celene <[email protected]>

* Remove debug changes

* Move const variables to gpu package

* GPUMonitoringType -> GPUIDType

* Rename gpuMonitoringFeature to gpuFeature

* Apply suggestion

* defaultGPURuntimeClass

* Rename GPUMonitoringConfig

* Fix comment

* Generate code

---------

Co-authored-by: Celene <[email protected]>
  • Loading branch information
gjulianm and celenechang authored Jan 30, 2025
1 parent 8c8d714 commit 7f198ad
Show file tree
Hide file tree
Showing 19 changed files with 593 additions and 2 deletions.
16 changes: 16 additions & 0 deletions api/datadoghq/v2alpha1/datadogagent_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ type DatadogFeatures struct {
SBOM *SBOMFeatureConfig `json:"sbom,omitempty"`
// ServiceDiscovery
ServiceDiscovery *ServiceDiscoveryFeatureConfig `json:"serviceDiscovery,omitempty"`
// GPU monitoring
GPU *GPUFeatureConfig `json:"gpu,omitempty"`

// Cluster-level features

Expand Down Expand Up @@ -498,6 +500,20 @@ type ServiceDiscoveryFeatureConfig struct {
Enabled *bool `json:"enabled,omitempty"`
}

// GPUFeatureConfig contains the GPU monitoring configuration.
//
// Both fields are optional pointers so that an unset value can be told apart
// from an explicit one. Note that PodRuntimeClassName is serialized under the
// JSON key "requiredRuntimeClassName"; the key is kept as-is for compatibility
// with existing manifests.
type GPUFeatureConfig struct {
	// Enabled enables GPU monitoring.
	// Default: false
	// +optional
	Enabled *bool `json:"enabled,omitempty"`

	// PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature.
	// If the value is an empty string, the runtime class is not set.
	// Default: nvidia
	// +optional
	PodRuntimeClassName *string `json:"requiredRuntimeClassName,omitempty"`
}

// DogstatsdFeatureConfig contains the Dogstatsd configuration parameters.
// +k8s:openapi-gen=true
type DogstatsdFeatureConfig struct {
Expand Down
30 changes: 30 additions & 0 deletions api/datadoghq/v2alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 7 additions & 1 deletion api/datadoghq/v2alpha1/zz_generated.openapi.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

30 changes: 30 additions & 0 deletions config/crd/bases/v1/datadoghq.com_datadogagents.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1019,6 +1019,21 @@ spec:
Default: false
type: boolean
type: object
gpu:
description: GPU monitoring
properties:
enabled:
description: |-
Enabled enables GPU monitoring.
Default: false
type: boolean
requiredRuntimeClassName:
description: |-
PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature.
If the value is an empty string, the runtime class is not set.
Default: nvidia
type: string
type: object
helmCheck:
description: HelmCheck configuration.
properties:
Expand Down Expand Up @@ -7883,6 +7898,21 @@ spec:
Default: false
type: boolean
type: object
gpu:
description: GPU monitoring
properties:
enabled:
description: |-
Enabled enables GPU monitoring.
Default: false
type: boolean
requiredRuntimeClassName:
description: |-
PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature.
If the value is an empty string, the runtime class is not set.
Default: nvidia
type: string
type: object
helmCheck:
description: HelmCheck configuration.
properties:
Expand Down
30 changes: 30 additions & 0 deletions config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json
Original file line number Diff line number Diff line change
Expand Up @@ -1065,6 +1065,21 @@
},
"type": "object"
},
"gpu": {
"additionalProperties": false,
"description": "GPU monitoring",
"properties": {
"enabled": {
"description": "Enabled enables GPU monitoring.\nDefault: false",
"type": "boolean"
},
"requiredRuntimeClassName": {
"description": "PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature.\nIf the value is an empty string, the runtime class is not set.\nDefault: nvidia",
"type": "string"
}
},
"type": "object"
},
"helmCheck": {
"additionalProperties": false,
"description": "HelmCheck configuration.",
Expand Down Expand Up @@ -7871,6 +7886,21 @@
},
"type": "object"
},
"gpu": {
"additionalProperties": false,
"description": "GPU monitoring",
"properties": {
"enabled": {
"description": "Enabled enables GPU monitoring.\nDefault: false",
"type": "boolean"
},
"requiredRuntimeClassName": {
"description": "PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature.\nIf the value is an empty string, the runtime class is not set.\nDefault: nvidia",
"type": "string"
}
},
"type": "object"
},
"helmCheck": {
"additionalProperties": false,
"description": "HelmCheck configuration.",
Expand Down
2 changes: 2 additions & 0 deletions docs/configuration.v2alpha1.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,8 @@ spec:
| features.externalMetricsServer.registerAPIService | RegisterAPIService registers the External Metrics endpoint as an APIService Default: true |
| features.externalMetricsServer.useDatadogMetrics | UseDatadogMetrics enables usage of the DatadogMetrics CRD (allowing one to scale on arbitrary Datadog metric queries). Default: true |
| features.externalMetricsServer.wpaController | WPAController enables the informer and controller of the Watermark Pod Autoscaler. NOTE: The Watermark Pod Autoscaler controller needs to be installed. See also: https://github.com/DataDog/watermarkpodautoscaler. Default: false |
| features.gpu.enabled | Enables GPU monitoring. Default: false |
| features.gpu.requiredRuntimeClassName | PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature. If the value is an empty string, the runtime class is not set. Default: nvidia |
| features.helmCheck.collectEvents | CollectEvents set to `true` enables event collection in the Helm check (Requires Agent 7.36.0+ and Cluster Agent 1.20.0+) Default: false |
| features.helmCheck.enabled | Enables the Helm check. Default: false |
| features.helmCheck.valuesAsTags | ValuesAsTags collects Helm values from a release and uses them as tags (Requires Agent and Cluster Agent 7.40.0+). Default: {} |
Expand Down
2 changes: 2 additions & 0 deletions examples/datadogagent/datadog-agent-all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ spec:
enabled: true
serviceDiscovery:
enabled: true
gpu:
enabled: true
eventCollection:
collectKubernetesEvents: true
orchestratorExplorer:
Expand Down
1 change: 1 addition & 0 deletions internal/controller/datadogagent/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
_ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/enabledefault"
_ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/eventcollection"
_ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/externalmetrics"
_ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/gpu"
_ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/helmcheck"
_ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/kubernetesstatecore"
_ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/livecontainer"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ const (

defaultEBPFCheckEnabled bool = false

defaultGPUMonitoringEnabled bool = false

defaultServiceDiscoveryEnabled bool = false

defaultAPMEnabled bool = true
Expand Down Expand Up @@ -265,6 +267,12 @@ func defaultFeaturesConfig(ddaSpec *v2alpha1.DatadogAgentSpec) {
}
apiutils.DefaultBooleanIfUnset(&ddaSpec.Features.ServiceDiscovery.Enabled, defaultServiceDiscoveryEnabled)

// GPU monitoring feature
if ddaSpec.Features.GPU == nil {
ddaSpec.Features.GPU = &v2alpha1.GPUFeatureConfig{}
}
apiutils.DefaultBooleanIfUnset(&ddaSpec.Features.GPU.Enabled, defaultGPUMonitoringEnabled)

// APM Feature
// APM is enabled by default
if ddaSpec.Features.APM == nil {
Expand Down
Loading

0 comments on commit 7f198ad

Please sign in to comment.