@@ -20,12 +20,17 @@ package controller
20
20
import (
21
21
"context"
22
22
"fmt"
23
+ "strings"
23
24
24
25
grovev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1"
26
+ "k8s.io/apimachinery/pkg/api/errors"
27
+
25
28
networkingv1beta1 "istio.io/client-go/pkg/apis/networking/v1beta1"
26
29
corev1 "k8s.io/api/core/v1"
27
30
networkingv1 "k8s.io/api/networking/v1"
28
31
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
32
+ "k8s.io/apimachinery/pkg/runtime/schema"
33
+ "k8s.io/client-go/scale"
29
34
"k8s.io/client-go/tools/record"
30
35
ctrl "sigs.k8s.io/controller-runtime"
31
36
"sigs.k8s.io/controller-runtime/pkg/builder"
@@ -50,6 +55,20 @@ const (
50
55
PendingState State = "pending"
51
56
)
52
57
58
+ var (
59
+ // Grove GroupVersionResources for scaling operations
60
+ podCliqueGVR = schema.GroupVersionResource {
61
+ Group : "grove.io" ,
62
+ Version : "v1alpha1" ,
63
+ Resource : "podcliques" ,
64
+ }
65
+ podCliqueScalingGroupGVR = schema.GroupVersionResource {
66
+ Group : "grove.io" ,
67
+ Version : "v1alpha1" ,
68
+ Resource : "podcliquescalinggroups" ,
69
+ }
70
+ )
71
+
53
72
// etcdStorage is the storage dependency declared in this package; the only
// operation it exposes is deleting keys by prefix.
type etcdStorage interface {
54
73
// DeleteKeys takes a context and a key prefix and reports failure through the returned error. Presumably it removes every key stored under prefix — confirm against the implementation.
DeleteKeys (ctx context.Context , prefix string ) error
55
74
}
@@ -60,12 +79,15 @@ type DynamoGraphDeploymentReconciler struct {
60
79
Config commonController.Config
61
80
Recorder record.EventRecorder
62
81
DockerSecretRetriever dockerSecretRetriever
82
+ ScaleClient scale.ScalesGetter
63
83
}
64
84
65
85
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments,verbs=get;list;watch;create;update;patch;delete
66
86
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/status,verbs=get;update;patch
67
87
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/finalizers,verbs=update
68
88
// +kubebuilder:rbac:groups=grove.io,resources=podgangsets,verbs=get;list;watch;create;update;patch;delete
89
+ // +kubebuilder:rbac:groups=grove.io,resources=podcliques/scale,verbs=get;update;patch
90
+ // +kubebuilder:rbac:groups=grove.io,resources=podcliquescalinggroups/scale,verbs=get;update;patch
69
91
70
92
// Reconcile is part of the main kubernetes reconciliation loop which aims to
71
93
// move the current state of the cluster closer to the desired state.
@@ -156,6 +178,80 @@ func (r *DynamoGraphDeploymentReconciler) reconcileResources(ctx context.Context
156
178
157
179
}
158
180
181
+ // scaleGroveResource scales a Grove resource using the generic scaling function
182
+ func (r * DynamoGraphDeploymentReconciler ) scaleGroveResource (ctx context.Context , resourceName , namespace string , newReplicas int32 , resourceType string ) error {
183
+ logger := log .FromContext (ctx )
184
+ // Determine the GroupVersionResource based on resource type
185
+ var gvr schema.GroupVersionResource
186
+ switch resourceType {
187
+ case "PodClique" :
188
+ gvr = podCliqueGVR
189
+ case "PodCliqueScalingGroup" :
190
+ gvr = podCliqueScalingGroupGVR
191
+ default :
192
+ return fmt .Errorf ("unsupported Grove resource type: %s" , resourceType )
193
+ }
194
+
195
+ // Use the generic scaling function
196
+ err := commonController .ScaleResource (ctx , r .ScaleClient , gvr , namespace , resourceName , newReplicas )
197
+ if err != nil {
198
+ if errors .IsNotFound (err ) {
199
+ // Resource doesn't exist yet - this is normal during initial creation when Grove is still creating the resources asynchronously
200
+ logger .V (1 ).Info ("Grove resource not found yet, skipping scaling for now - will retry on next reconciliation" , "gvr" , gvr , "name" , resourceName , "namespace" , namespace )
201
+ return nil
202
+ }
203
+ }
204
+ return err
205
+ }
206
+
207
+ // reconcileGroveScaling handles scaling operations for Grove resources based on service replica changes
208
+ func (r * DynamoGraphDeploymentReconciler ) reconcileGroveScaling (ctx context.Context , dynamoDeployment * nvidiacomv1alpha1.DynamoGraphDeployment ) error {
209
+ logger := log .FromContext (ctx )
210
+ logger .V (1 ).Info ("Reconciling Grove scaling operations" )
211
+
212
+ replicaIndex := 0
213
+ for serviceName , component := range dynamoDeployment .Spec .Services {
214
+ // Skip if replicas are not specified
215
+ if component .Replicas == nil {
216
+ continue
217
+ }
218
+
219
+ numberOfNodes := component .GetNumberOfNodes ()
220
+ isMultinode := numberOfNodes > 1
221
+
222
+ if isMultinode {
223
+ // Scale PodCliqueScalingGroup for multinode services
224
+ // Grove naming pattern: {DGD.name}-{replicaIndex}-{serviceName}
225
+ resourceName := fmt .Sprintf ("%s-%d-%s" , dynamoDeployment .Name , replicaIndex , strings .ToLower (serviceName ))
226
+ err := r .scaleGroveResource (ctx ,
227
+ resourceName ,
228
+ dynamoDeployment .Namespace ,
229
+ * component .Replicas ,
230
+ "PodCliqueScalingGroup" )
231
+ if err != nil {
232
+ logger .Error (err , "Failed to scale PodCliqueScalingGroup" , "serviceName" , serviceName , "resourceName" , resourceName , "replicas" , * component .Replicas )
233
+ return fmt .Errorf ("failed to scale PodCliqueScalingGroup %s: %w" , resourceName , err )
234
+ }
235
+ } else {
236
+ // Scale individual PodClique for single-node services
237
+ // Grove naming pattern: {DGD.name}-{replicaIndex}-{serviceName}
238
+ resourceName := fmt .Sprintf ("%s-%d-%s" , dynamoDeployment .Name , replicaIndex , strings .ToLower (serviceName ))
239
+ err := r .scaleGroveResource (ctx ,
240
+ resourceName ,
241
+ dynamoDeployment .Namespace ,
242
+ * component .Replicas ,
243
+ "PodClique" )
244
+ if err != nil {
245
+ logger .Error (err , "Failed to scale PodClique" , "serviceName" , serviceName , "resourceName" , resourceName , "replicas" , * component .Replicas )
246
+ return fmt .Errorf ("failed to scale PodClique %s: %w" , resourceName , err )
247
+ }
248
+ }
249
+ }
250
+
251
+ logger .V (1 ).Info ("Successfully reconciled Grove scaling operations" )
252
+ return nil
253
+ }
254
+
159
255
func (r * DynamoGraphDeploymentReconciler ) reconcileGroveResources (ctx context.Context , dynamoDeployment * nvidiacomv1alpha1.DynamoGraphDeployment ) (State , Reason , Message , error ) {
160
256
logger := log .FromContext (ctx )
161
257
// generate the dynamoComponentsDeployments from the config
@@ -177,6 +273,13 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Co
177
273
}
178
274
return false
179
275
})
276
+
277
+ // Handle Grove scaling operations after structural changes
278
+ if err := r .reconcileGroveScaling (ctx , dynamoDeployment ); err != nil {
279
+ logger .Error (err , "failed to reconcile Grove scaling" )
280
+ return FailedState , "grove_scaling_failed" , Message (err .Error ()), err
281
+ }
282
+
180
283
resources := []Resource {groveGangSetAsResource }
181
284
for componentName , component := range dynamoDeployment .Spec .Services {
182
285
if component .ComponentType == consts .ComponentTypeFrontend {
@@ -203,10 +306,6 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Co
203
306
ingressSpec = * component .Ingress
204
307
}
205
308
mainComponentIngress := dynamo .GenerateComponentIngress (ctx , dynamo .GetDynamoComponentName (dynamoDeployment , componentName ), dynamoDeployment .Namespace , ingressSpec )
206
- if err != nil {
207
- logger .Error (err , "failed to generate the main component ingress" )
208
- return "" , "" , "" , fmt .Errorf ("failed to generate the main component ingress: %w" , err )
209
- }
210
309
_ , syncedMainComponentIngress , err := commonController .SyncResource (ctx , r , dynamoDeployment , func (ctx context.Context ) (* networkingv1.Ingress , bool , error ) {
211
310
if ! ingressSpec .Enabled || ingressSpec .IngressControllerClassName == nil {
212
311
logger .Info ("Ingress is not enabled" )
@@ -224,10 +323,6 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Co
224
323
// generate the main component virtual service
225
324
if r .Config .IngressConfig .UseVirtualService () {
226
325
mainComponentVirtualService := dynamo .GenerateComponentVirtualService (ctx , dynamo .GetDynamoComponentName (dynamoDeployment , componentName ), dynamoDeployment .Namespace , ingressSpec )
227
- if err != nil {
228
- logger .Error (err , "failed to generate the main component virtual service" )
229
- return "" , "" , "" , fmt .Errorf ("failed to generate the main component virtual service: %w" , err )
230
- }
231
326
_ , syncedMainComponentVirtualService , err := commonController .SyncResource (ctx , r , dynamoDeployment , func (ctx context.Context ) (* networkingv1beta1.VirtualService , bool , error ) {
232
327
if ! ingressSpec .IsVirtualServiceEnabled () {
233
328
logger .Info ("VirtualService is not enabled" )
0 commit comments