Skip to content

Commit

Permalink
feat: output worker metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
0x5457 committed Mar 11, 2025
1 parent 985f2d2 commit 90e2ed1
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 27 deletions.
20 changes: 0 additions & 20 deletions internal/controller/pod_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,6 @@ import (

tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
"github.com/NexusGPU/tensor-fusion/internal/constants"
"github.com/NexusGPU/tensor-fusion/internal/metrics"
webhookv1 "github.com/NexusGPU/tensor-fusion/internal/webhook/v1"
"github.com/prometheus/client_golang/prometheus"
"github.com/samber/lo"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
Expand Down Expand Up @@ -60,10 +57,6 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R
log.Error(err, "Failed to get Pod")
return ctrl.Result{}, err
}
tfInfo, err := webhookv1.ParseTensorFusionInfo(ctx, r.Client, pod)
if err != nil {
return ctrl.Result{}, fmt.Errorf("parse tf resources: %w", err)
}

// generate tensor fusion connections and apply to cluster
tfConnection := generateTensorFusionConnection(pod)
Expand All @@ -76,19 +69,6 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R
}
}

// update metrics
for _, container := range tfInfo.ContainerNames {
labels := prometheus.Labels{
"pod": pod.Name,
"namespace": pod.Namespace,
"container": container,
}
metrics.GpuTflopsRequest.With(labels).Set(tfInfo.Profile.Resources.Requests.Tflops.AsApproximateFloat64())
metrics.GpuTflopsLimit.With(labels).Set(tfInfo.Profile.Resources.Limits.Tflops.AsApproximateFloat64())
metrics.VramBytesRequest.With(labels).Set(tfInfo.Profile.Resources.Requests.Vram.AsApproximateFloat64())
metrics.VramBytesLimit.With(labels).Set(tfInfo.Profile.Resources.Limits.Vram.AsApproximateFloat64())
}

return ctrl.Result{}, nil
}

Expand Down
28 changes: 25 additions & 3 deletions internal/controller/tensorfusionworkload_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,11 @@ import (
tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion/api/v1"
tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
"github.com/NexusGPU/tensor-fusion/internal/constants"
"github.com/NexusGPU/tensor-fusion/internal/metrics"
scheduler "github.com/NexusGPU/tensor-fusion/internal/scheduler"
"github.com/NexusGPU/tensor-fusion/internal/utils"
"github.com/NexusGPU/tensor-fusion/internal/worker"
"github.com/prometheus/client_golang/prometheus"
)

// TensorFusionWorkloadReconciler reconciles a TensorFusionWorkload object
Expand Down Expand Up @@ -141,7 +143,7 @@ func (r *TensorFusionWorkloadReconciler) Reconcile(ctx context.Context, req ctrl

// Calculate how many pods need to be removed
podsToRemove := int(currentReplicas - desiredReplicas)
if err := r.scaleDownWorkers(ctx, podList.Items[:podsToRemove]); err != nil {
if err := r.scaleDownWorkers(ctx, workload, podList.Items[:podsToRemove]); err != nil {
return ctrl.Result{}, err
}
}
Expand Down Expand Up @@ -185,7 +187,7 @@ func (r *TensorFusionWorkloadReconciler) tryStartWorker(
}

// scaleDownWorkers handles the scaling down of worker pods
func (r *TensorFusionWorkloadReconciler) scaleDownWorkers(ctx context.Context, pods []corev1.Pod) error {
func (r *TensorFusionWorkloadReconciler) scaleDownWorkers(ctx context.Context, workload *tfv1.TensorFusionWorkload, pods []corev1.Pod) error {
log := log.FromContext(ctx)

for i := range pods {
Expand All @@ -196,6 +198,16 @@ func (r *TensorFusionWorkloadReconciler) scaleDownWorkers(ctx context.Context, p
if err := r.deletePod(ctx, podToDelete); err != nil {
return err
}

labels := prometheus.Labels{
"worker": podToDelete.Name,
"namespace": podToDelete.Namespace,
"pool": workload.Spec.PoolName,
}
metrics.GpuTflopsRequest.Delete(labels)
metrics.GpuTflopsLimit.Delete(labels)
metrics.VramBytesRequest.Delete(labels)
metrics.VramBytesLimit.Delete(labels)
}
return nil
}
Expand Down Expand Up @@ -271,7 +283,7 @@ func (r *TensorFusionWorkloadReconciler) scaleUpWorkers(ctx context.Context, wor
return fmt.Errorf("schedule GPU: %w", err)
}

_, err = r.tryStartWorker(ctx, workerGenerator, gpu, workload)
pod, err := r.tryStartWorker(ctx, workerGenerator, gpu, workload)
if err != nil {
// Try to release the GPU resource if pod creation fails
releaseErr := r.Scheduler.Release(ctx, workload.Spec.Resources.Requests, gpu)
Expand All @@ -280,6 +292,16 @@ func (r *TensorFusionWorkloadReconciler) scaleUpWorkers(ctx context.Context, wor
}
return fmt.Errorf("create worker pod: %w", err)
}

labels := prometheus.Labels{
"worker": pod.Name,
"namespace": pod.Namespace,
"pool": workload.Spec.PoolName,
}
metrics.GpuTflopsRequest.With(labels).Set(workload.Spec.Resources.Requests.Tflops.AsApproximateFloat64())
metrics.GpuTflopsLimit.With(labels).Set(workload.Spec.Resources.Limits.Tflops.AsApproximateFloat64())
metrics.VramBytesRequest.With(labels).Set(workload.Spec.Resources.Requests.Vram.AsApproximateFloat64())
metrics.VramBytesLimit.With(labels).Set(workload.Spec.Resources.Limits.Vram.AsApproximateFloat64())
}

return nil
Expand Down
12 changes: 8 additions & 4 deletions internal/metrics/connection.go → internal/metrics/worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,32 +6,36 @@ import (
)

var (
	// labels is the label set shared by every worker-level GPU metric:
	// the worker pod's namespace, the worker pod's name, and the GPU
	// pool (workload.Spec.PoolName) the worker is scheduled on.
	labels = []string{
		"namespace", "worker", "pool",
	}

	// GpuTflopsRequest reports the GPU compute requested by each worker,
	// in TFLOPS (set from the workload's Resources.Requests.Tflops).
	GpuTflopsRequest = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "gpu_tflops_request",
		},
		labels,
	)

	// GpuTflopsLimit reports the GPU compute limit of each worker,
	// in TFLOPS (set from the workload's Resources.Limits.Tflops).
	GpuTflopsLimit = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "gpu_tflops_limit",
		},
		labels,
	)

	// VramBytesRequest reports the GPU memory requested by each worker,
	// in bytes (set from the workload's Resources.Requests.Vram).
	VramBytesRequest = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "vram_bytes_request",
		},
		labels,
	)

	// VramBytesLimit reports the GPU memory limit of each worker,
	// in bytes (set from the workload's Resources.Limits.Vram).
	VramBytesLimit = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "vram_bytes_limit",
		},
		labels,
	)
)

Expand Down

0 comments on commit 90e2ed1

Please sign in to comment.