Skip to content

Commit 4750a45

Browse files
committed
fix cm update issue
1 parent 5ba90e9 commit 4750a45

File tree

5 files changed

+344
-313
lines changed

5 files changed

+344
-313
lines changed

deploy/custom-configmapwithprofiles.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@ metadata:
44
name: test
55
namespace: gpu-operator
66
data:
7-
a100-40gb: |-
7+
update-capacity: |-
88
version: v1
99
flags:
1010
migStrategy: mixed
11-
a100-40gb-1: |-
11+
update-capacity-1: |-
1212
version: v1
1313
flags:
1414
migStrategy: mixed

deploy/setup.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ echo "GPU operator installation commands executed successfully"
6060

6161
kubectl apply -f ./deploy/custom-configmapwithprofiles.yaml
6262

63-
kubectl label node --all nvidia.com/device-plugin.config=a100-40gb
63+
kubectl label node --all nvidia.com/device-plugin.config=update-capacity
6464

6565
#for already deployed GPU operator
6666
#To avoid waiting for minutes, for now run the below command manually

internal/controller/instaslice_controller.go

Lines changed: 57 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -55,65 +55,55 @@ type LeftToRightPolicy struct{}
5555

5656
type FirstFitPolicy struct{}
5757

58-
// TODO: remove this and find a better way to reduce duplicates update via controller runtime
59-
var processedPodDeletion []string
60-
6158
//+kubebuilder:rbac:groups=inference.codeflare.dev,resources=instaslices,verbs=get;list;watch;create;update;patch;delete
6259
//+kubebuilder:rbac:groups=inference.codeflare.dev,resources=instaslices/status,verbs=get;update;patch
6360
//+kubebuilder:rbac:groups=inference.codeflare.dev,resources=instaslices/finalizers,verbs=update
6461
//+kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;create;update;patch;delete
6562
//+kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch;create;update;patch;delete
6663

6764
func (r *InstasliceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
68-
_ = log.FromContext(ctx)
69-
logger := log.Log.WithName("InstaSlice-controller")
65+
7066
var policy AllocationPolicy
7167
policy = &FirstFitPolicy{}
7268
pod := &v1.Pod{}
7369
var isPodGated = false
7470
err := r.Get(ctx, req.NamespacedName, pod)
7571
if err != nil {
76-
if errors.IsNotFound(err) {
77-
// Pod not found. It might have been deleted.
78-
return ctrl.Result{}, nil
79-
}
8072
// Error fetching the Pod
81-
return ctrl.Result{}, err
73+
log.FromContext(ctx).Error(err, "unable to fetch pod, trying to delete instaslice allocation")
8274
}
8375

8476
isPodGated = checkIfPodGated(pod, isPodGated)
8577

8678
var instasliceList inferencev1alpha1.InstasliceList
8779

8880
if err := r.List(ctx, &instasliceList, &client.ListOptions{}); err != nil {
89-
logger.Error(err, "Error listing Instaslice")
81+
log.FromContext(ctx).Error(err, "Error listing Instaslice")
9082
}
9183
// handles graceful termination of pods: wait about 30 seconds from the time the DeletionTimestamp is set on the pod
9284
if !pod.DeletionTimestamp.IsZero() {
93-
if controllerutil.ContainsFinalizer(pod, "org.instaslice/accelarator") && isPodDeletionProcessed(pod.Name, processedPodDeletion) {
85+
log.FromContext(ctx).Info("set status to deleted for ", "pod", pod.Name)
86+
if controllerutil.ContainsFinalizer(pod, "org.instaslice/accelarator") {
9487
for _, instaslice := range instasliceList.Items {
9588
for podUuid, allocation := range instaslice.Spec.Allocations {
9689
if podUuid == string(pod.UID) {
9790
elapsed := time.Since(pod.DeletionTimestamp.Time)
9891
if elapsed > 30*time.Second {
9992
if controllerutil.RemoveFinalizer(pod, "org.instaslice/accelarator") {
10093
if err := r.Update(ctx, pod); err != nil {
101-
return ctrl.Result{}, err
94+
log.FromContext(ctx).Info("unable to update removal of finalizer, retrying")
95+
return ctrl.Result{RequeueAfter: 1 * time.Second}, nil
96+
}
97+
log.FromContext(ctx).Info("finalizer deleted")
98+
allocation.Allocationstatus = "deleted"
99+
instaslice.Spec.Allocations[podUuid] = allocation
100+
errUpdatingInstaslice := r.Update(ctx, &instaslice)
101+
if errUpdatingInstaslice != nil {
102+
log.FromContext(ctx).Info("unable to set instaslice to state deleted")
103+
return ctrl.Result{RequeueAfter: 1 * time.Second}, nil
102104
}
103-
logger.Info("finalizer deleted")
104-
}
105-
allocation.Allocationstatus = "deleted"
106-
instaslice.Spec.Allocations[podUuid] = allocation
107-
err := r.Update(ctx, &instaslice)
108-
if errors.IsConflict(err) {
109-
//not retrying as daemonset might be updating the instaslice object for other pods
110-
logger.Info("Latest version for instaslice object not found, retrying in next iteration")
111-
return ctrl.Result{Requeue: true}, nil
112-
}
113-
if err != nil {
114-
logger.Info("allocation set to deleted for", "pod", pod.Name)
115-
processedPodDeletion = append(processedPodDeletion, pod.Name)
116105
}
106+
117107
} else {
118108
remainingTime := 30*time.Second - elapsed
119109
return ctrl.Result{RequeueAfter: remainingTime}, nil
@@ -143,7 +133,7 @@ func (r *InstasliceReconciler) Reconcile(ctx context.Context, req ctrl.Request)
143133
if allocations.Allocationstatus == "created" && allocations.PodUUID == string(pod.UID) {
144134
pod := r.unGatePod(pod)
145135
errForUngating := r.Update(ctx, pod)
146-
if errors.IsConflict(errForUngating) {
136+
if errForUngating != nil {
147137
//pod updates are retried as controller is the only entity working on pod updates.
148138
return ctrl.Result{Requeue: true}, nil
149139
}
@@ -156,15 +146,42 @@ func (r *InstasliceReconciler) Reconcile(ctx context.Context, req ctrl.Request)
156146
if _, exists := instaslice.Spec.Allocations[string(pod.UID)]; !exists {
157147
r.findDeviceForASlice(&instaslice, profileName, policy, pod)
158148
}
159-
//update all created allocations belonging to different pods to state ungated
149+
160150
if err := r.Update(ctx, &instaslice); err != nil {
161-
logger.Error(err, "Error updating instaslice allocations")
162-
return ctrl.Result{}, err
151+
log.FromContext(ctx).Error(err, "Error updating instaslice allocations")
152+
return ctrl.Result{Requeue: true}, nil
163153
}
164154
}
165155

166156
}
167-
// no gated pod found, do nothing
157+
podSearch := &v1.Pod{}
158+
for _, instaslice := range instasliceList.Items {
159+
for podUuid, allocation := range instaslice.Spec.Allocations {
160+
if allocation.Allocationstatus == "ungated" {
161+
nsName := types.NamespacedName{
162+
Name: allocation.PodName,
163+
Namespace: allocation.Namespace,
164+
}
165+
log.FromContext(ctx).Info("checking if pod exist with ", "name", allocation.PodName)
166+
err := r.Get(ctx, nsName, podSearch)
167+
if err != nil {
168+
if errors.IsNotFound(err) {
169+
log.FromContext(ctx).Info("Pod deleted still, instaslice allocation exists in ungated state")
170+
allocation.Allocationstatus = "deleted"
171+
instaslice.Spec.Allocations[podUuid] = allocation
172+
errUpdatingInstaslice := r.Update(ctx, &instaslice)
173+
if errUpdatingInstaslice != nil {
174+
log.FromContext(ctx).Info("unable to set instaslice allocation to deleted when no pod exists")
175+
return ctrl.Result{RequeueAfter: 1 * time.Second}, nil
176+
}
177+
178+
}
179+
}
180+
}
181+
182+
}
183+
}
184+
// no gated pod or dangling reference found
168185
return ctrl.Result{}, nil
169186
}
170187

@@ -239,14 +256,15 @@ func (*InstasliceReconciler) getStartIndexFromPreparedState(instaslice *inferenc
239256
for i := range gpuAllocatedIndex {
240257
gpuAllocatedIndex[i] = 0
241258
}
242-
for _, item := range instaslice.Spec.Prepared {
243-
if item.Parent == gpuUUID {
244-
for i := 0; i < int(item.Size); i++ {
245-
gpuAllocatedIndex[int(item.Start)+i] = 1
246-
}
259+
//avoid double counting
260+
// for _, item := range instaslice.Spec.Prepared {
261+
// if item.Parent == gpuUUID {
262+
// for i := 0; i < int(item.Size); i++ {
263+
// gpuAllocatedIndex[int(item.Start)+i] = 1
264+
// }
247265

248-
}
249-
}
266+
// }
267+
// }
250268

251269
for _, item := range instaslice.Spec.Allocations {
252270
if item.GPUUUID == gpuUUID {
@@ -327,7 +345,7 @@ func checkIfPodGated(pod *v1.Pod, isPodGated bool) bool {
327345
func (r *InstasliceReconciler) podMapFunc(ctx context.Context, obj client.Object) []reconcile.Request {
328346
instaslice := obj.(*inferencev1alpha1.Instaslice)
329347
for _, allocation := range instaslice.Spec.Allocations {
330-
if allocation.Allocationstatus == "created" || allocation.Allocationstatus == "deleting" {
348+
if allocation.Allocationstatus == "created" {
331349
return []reconcile.Request{{NamespacedName: types.NamespacedName{Namespace: allocation.Namespace, Name: allocation.PodName}}}
332350
}
333351
}

0 commit comments

Comments
 (0)