@@ -55,65 +55,55 @@ type LeftToRightPolicy struct{}
55
55
56
56
// FirstFitPolicy is a field-less policy type; Reconcile assigns
// &FirstFitPolicy{} to its AllocationPolicy variable.
type FirstFitPolicy struct{}
57
57
58
- // TODO: remove this and find a better way to reduce duplicates update via controller runtime
59
- var processedPodDeletion []string
60
-
61
58
//+kubebuilder:rbac:groups=inference.codeflare.dev,resources=instaslices,verbs=get;list;watch;create;update;patch;delete
62
59
//+kubebuilder:rbac:groups=inference.codeflare.dev,resources=instaslices/status,verbs=get;update;patch
63
60
//+kubebuilder:rbac:groups=inference.codeflare.dev,resources=instaslices/finalizers,verbs=update
64
61
//+kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;create;update;patch;delete
65
62
//+kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch;create;update;patch;delete
66
63
67
64
func (r * InstasliceReconciler ) Reconcile (ctx context.Context , req ctrl.Request ) (ctrl.Result , error ) {
68
- _ = log .FromContext (ctx )
69
- logger := log .Log .WithName ("InstaSlice-controller" )
65
+
70
66
var policy AllocationPolicy
71
67
policy = & FirstFitPolicy {}
72
68
pod := & v1.Pod {}
73
69
var isPodGated = false
74
70
err := r .Get (ctx , req .NamespacedName , pod )
75
71
if err != nil {
76
- if errors .IsNotFound (err ) {
77
- // Pod not found. It might have been deleted.
78
- return ctrl.Result {}, nil
79
- }
80
72
// Error fetching the Pod
81
- return ctrl. Result {}, err
73
+ log . FromContext ( ctx ). Error ( err , "unable to fetch pod, trying to delete instaslice allocation" )
82
74
}
83
75
84
76
isPodGated = checkIfPodGated (pod , isPodGated )
85
77
86
78
var instasliceList inferencev1alpha1.InstasliceList
87
79
88
80
if err := r .List (ctx , & instasliceList , & client.ListOptions {}); err != nil {
89
- logger .Error (err , "Error listing Instaslice" )
81
+ log . FromContext ( ctx ) .Error (err , "Error listing Instaslice" )
90
82
}
91
83
// Handles graceful termination of pods: waits about 30 seconds from the time the DeletionTimestamp is set on the pod.
92
84
if ! pod .DeletionTimestamp .IsZero () {
93
- if controllerutil .ContainsFinalizer (pod , "org.instaslice/accelarator" ) && isPodDeletionProcessed (pod .Name , processedPodDeletion ) {
85
+ log .FromContext (ctx ).Info ("set status to deleted for " , "pod" , pod .Name )
86
+ if controllerutil .ContainsFinalizer (pod , "org.instaslice/accelarator" ) {
94
87
for _ , instaslice := range instasliceList .Items {
95
88
for podUuid , allocation := range instaslice .Spec .Allocations {
96
89
if podUuid == string (pod .UID ) {
97
90
elapsed := time .Since (pod .DeletionTimestamp .Time )
98
91
if elapsed > 30 * time .Second {
99
92
if controllerutil .RemoveFinalizer (pod , "org.instaslice/accelarator" ) {
100
93
if err := r .Update (ctx , pod ); err != nil {
101
- return ctrl.Result {}, err
94
+ log .FromContext (ctx ).Info ("unable to update removal of finalizer, retrying" )
95
+ return ctrl.Result {RequeueAfter : 1 * time .Second }, nil
96
+ }
97
+ log .FromContext (ctx ).Info ("finalizer deleted" )
98
+ allocation .Allocationstatus = "deleted"
99
+ instaslice .Spec .Allocations [podUuid ] = allocation
100
+ errUpdatingInstaslice := r .Update (ctx , & instaslice )
101
+ if errUpdatingInstaslice != nil {
102
+ log .FromContext (ctx ).Info ("unable to set instaslice to state deleted" )
103
+ return ctrl.Result {RequeueAfter : 1 * time .Second }, nil
102
104
}
103
- logger .Info ("finalizer deleted" )
104
- }
105
- allocation .Allocationstatus = "deleted"
106
- instaslice .Spec .Allocations [podUuid ] = allocation
107
- err := r .Update (ctx , & instaslice )
108
- if errors .IsConflict (err ) {
109
- //not retrying as daemonset might be updating the instaslice object for other pods
110
- logger .Info ("Latest version for instaslice object not found, retrying in next iteration" )
111
- return ctrl.Result {Requeue : true }, nil
112
- }
113
- if err != nil {
114
- logger .Info ("allocation set to deleted for" , "pod" , pod .Name )
115
- processedPodDeletion = append (processedPodDeletion , pod .Name )
116
105
}
106
+
117
107
} else {
118
108
remainingTime := 30 * time .Second - elapsed
119
109
return ctrl.Result {RequeueAfter : remainingTime }, nil
@@ -143,7 +133,7 @@ func (r *InstasliceReconciler) Reconcile(ctx context.Context, req ctrl.Request)
143
133
if allocations .Allocationstatus == "created" && allocations .PodUUID == string (pod .UID ) {
144
134
pod := r .unGatePod (pod )
145
135
errForUngating := r .Update (ctx , pod )
146
- if errors . IsConflict ( errForUngating ) {
136
+ if errForUngating != nil {
147
137
// Pod updates are retried because the controller is the only entity working on pod updates.
148
138
return ctrl.Result {Requeue : true }, nil
149
139
}
@@ -156,15 +146,42 @@ func (r *InstasliceReconciler) Reconcile(ctx context.Context, req ctrl.Request)
156
146
if _ , exists := instaslice .Spec .Allocations [string (pod .UID )]; ! exists {
157
147
r .findDeviceForASlice (& instaslice , profileName , policy , pod )
158
148
}
159
- //update all created allocations belonging to different pods to state ungated
149
+
160
150
if err := r .Update (ctx , & instaslice ); err != nil {
161
- logger .Error (err , "Error updating instaslice allocations" )
162
- return ctrl.Result {}, err
151
+ log . FromContext ( ctx ) .Error (err , "Error updating instaslice allocations" )
152
+ return ctrl.Result {Requeue : true }, nil
163
153
}
164
154
}
165
155
166
156
}
167
- // no gated pod found, do nothing
157
+ podSearch := & v1.Pod {}
158
+ for _ , instaslice := range instasliceList .Items {
159
+ for podUuid , allocation := range instaslice .Spec .Allocations {
160
+ if allocation .Allocationstatus == "ungated" {
161
+ nsName := types.NamespacedName {
162
+ Name : allocation .PodName ,
163
+ Namespace : allocation .Namespace ,
164
+ }
165
+ log .FromContext (ctx ).Info ("checking if pod exist with " , "name" , allocation .PodName )
166
+ err := r .Get (ctx , nsName , podSearch )
167
+ if err != nil {
168
+ if errors .IsNotFound (err ) {
169
+ log .FromContext (ctx ).Info ("Pod deleted still, instaslice allocation exists in ungated state" )
170
+ allocation .Allocationstatus = "deleted"
171
+ instaslice .Spec .Allocations [podUuid ] = allocation
172
+ errUpdatingInstaslice := r .Update (ctx , & instaslice )
173
+ if errUpdatingInstaslice != nil {
174
+ log .FromContext (ctx ).Info ("unable to set instaslice allocation to deleted when no pod exists" )
175
+ return ctrl.Result {RequeueAfter : 1 * time .Second }, nil
176
+ }
177
+
178
+ }
179
+ }
180
+ }
181
+
182
+ }
183
+ }
184
+ // no gated pod or dangling reference found
168
185
return ctrl.Result {}, nil
169
186
}
170
187
@@ -239,14 +256,15 @@ func (*InstasliceReconciler) getStartIndexFromPreparedState(instaslice *inferenc
239
256
for i := range gpuAllocatedIndex {
240
257
gpuAllocatedIndex [i ] = 0
241
258
}
242
- for _ , item := range instaslice .Spec .Prepared {
243
- if item .Parent == gpuUUID {
244
- for i := 0 ; i < int (item .Size ); i ++ {
245
- gpuAllocatedIndex [int (item .Start )+ i ] = 1
246
- }
259
+ //avoid double counting
260
+ // for _, item := range instaslice.Spec.Prepared {
261
+ // if item.Parent == gpuUUID {
262
+ // for i := 0; i < int(item.Size); i++ {
263
+ // gpuAllocatedIndex[int(item.Start)+i] = 1
264
+ // }
247
265
248
- }
249
- }
266
+ // }
267
+ // }
250
268
251
269
for _ , item := range instaslice .Spec .Allocations {
252
270
if item .GPUUUID == gpuUUID {
@@ -327,7 +345,7 @@ func checkIfPodGated(pod *v1.Pod, isPodGated bool) bool {
327
345
func (r * InstasliceReconciler ) podMapFunc (ctx context.Context , obj client.Object ) []reconcile.Request {
328
346
instaslice := obj .(* inferencev1alpha1.Instaslice )
329
347
for _ , allocation := range instaslice .Spec .Allocations {
330
- if allocation .Allocationstatus == "created" || allocation . Allocationstatus == "deleting" {
348
+ if allocation .Allocationstatus == "created" {
331
349
return []reconcile.Request {{NamespacedName : types.NamespacedName {Namespace : allocation .Namespace , Name : allocation .PodName }}}
332
350
}
333
351
}
0 commit comments