Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion charts/dataplane/templates/prometheus/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -157,10 +157,29 @@ data:
- role: pod
namespaces:
names:
- kube-system
- {{ index .Values "dcgm-exporter" "namespace" | default "kube-system" }}
selectors:
- role: pod
label: app.kubernetes.io/name=dcgm-exporter
# Target relabeling for the pods discovered via the
# app.kubernetes.io/name=dcgm-exporter selector; applied before each scrape.
relabel_configs:
# Overwrite the scrape address with "<pod IP>:9400".
# NOTE(review): 9400 is presumably dcgm-exporter's metrics port; confirm it
# matches the port the exporter is configured to listen on.
- source_labels: [__meta_kubernetes_pod_ip]
regex: '(.*)'
target_label: __address__
replacement: '${1}:9400'
action: replace
# Copy the pod's namespace into the `namespace` label.
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: namespace
# Copy the pod's name into the `pod` label.
- source_labels: [__meta_kubernetes_pod_name]
action: replace
target_label: pod
# Copy the name of the node hosting the pod into the `node` label.
- source_labels: [__meta_kubernetes_pod_node_name]
action: replace
target_label: node
# Sample-level filtering: `action: keep` retains only series whose metric name
# matches one of the alternatives in the regex below and drops every other
# series scraped from this target. Prometheus anchors relabel regexes, so a
# metric must be listed here by its exact name to be ingested.
metric_relabel_configs:
- source_labels: [__name__]
regex: "DCGM_FI_DEV_GPU_UTIL|DCGM_FI_DEV_MEM_COPY_UTIL|DCGM_FI_DEV_FB_USED|DCGM_FI_DEV_FB_FREE|DCGM_FI_PROF_GR_ENGINE_ACTIVE|DCGM_FI_PROF_SM_ACTIVE|DCGM_FI_PROF_SM_OCCUPANCY|DCGM_FI_PROF_PIPE_TENSOR_ACTIVE|DCGM_FI_PROF_DRAM_ACTIVE|DCGM_FI_DEV_POWER_USAGE|DCGM_FI_DEV_GPU_TEMP"
action: keep
{{- end }}

{{- if .Values.cost.enabled }}
Expand Down
10 changes: 10 additions & 0 deletions charts/dataplane/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -650,11 +650,21 @@ dcgm-exporter:
# Support a very high memory usage as dcgm-exporter can spike
# depending on GPU count.
resources: {}
# -- Namespace where dcgm-exporter is deployed. The Prometheus scrape config
# discovers dcgm-exporter pods in this namespace.
# Override if dcgm-exporter runs outside kube-system (e.g. the gpu-operator
# namespace). If left empty, the scrape config falls back to "kube-system".
namespace: "kube-system"

# Extra environment variables set under the dcgm-exporter values.
extraEnv:
# Required to map GPUs to the actual Kubernetes pods that used them.
# TLM uses this to map GPU metrics to task pods.
- name: DCGM_EXPORTER_KUBERNETES_GPU_ID_TYPE
value: "uid"
# Required for MIG (Multi-Instance GPU) partition profiling on A100/H100.
# Allows DCGM to access per-partition profiling counters (DCGM_FI_PROF_*).
# Without this, DCGM_FI_PROF_SM_ACTIVE and related metrics return no data
# for MIG slices.
# NOTE(review): the NVIDIA container toolkit documents this variable as
# requiring CAP_SYS_ADMIN on the container; confirm the dcgm-exporter
# securityContext grants it, otherwise the setting may be rejected.
- name: NVIDIA_MIG_MONITOR_DEVICES
value: "all"

# Based on default metrics list at
# https://github.com/NVIDIA/dcgm-exporter/blob/3446595fbb31e22e45a8dbcd63ade14e3da49810/deployment/templates/metrics-configmap.yaml#L13
Expand Down
Loading
Loading