-
Notifications
You must be signed in to change notification settings - Fork 33
[WIP] PCP-5556: MAAS CAPI Provider is failing to clean up Stale IP addresses #323
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: spectro-master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -8,6 +8,7 @@ import ( | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| "net" | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| "regexp" | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| "strings" | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| "time" | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| "github.com/pkg/errors" | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| "github.com/spectrocloud/cluster-api-provider-maas/pkg/maas/maintenance" | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
@@ -58,6 +59,7 @@ func logVMHostDiagnostics(s *Service, err error) { | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| sys := m[1] | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| s.scope.Info("Releasing broken machine", "system-id", sys) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| ctx := context.TODO() | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| s.releaseIPsForMachine(ctx, sys) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| _, _ = s.maasClient.Machines().Machine(sys).Releaser().WithForce().Release(ctx) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
@@ -96,18 +98,76 @@ func (s *Service) GetMachine(systemID string) (*infrav1beta1.Machine, error) { | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| return machine, nil | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| // releaseIPBestEffort releases a single IP in MAAS. Used to avoid "User reserved" IP exhaustion | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| // when a machine is never released (e.g. crash, force delete) or release fails. Logs errors only. | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| func (s *Service) releaseIPBestEffort(ctx context.Context, ip string) { | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| if ip == "" { | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| return | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| if err := s.maasClient.IPAddresses().Release(ctx, ip); err != nil { | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| s.scope.V(1).Info("Best-effort IP release failed (may already be released)", "ip", ip, "error", err) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| // releaseIPsForMachine fetches the machine by systemID and releases each of its IPs (best-effort). | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| // Used so force-release and other paths that do not call ReleaseMachine still release IPs. | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| func (s *Service) releaseIPsForMachine(ctx context.Context, systemID string) { | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| if systemID == "" { | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| return | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| m, err := s.maasClient.Machines().Machine(systemID).Get(ctx) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| if err != nil { | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| return | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| for _, ip := range m.IPAddresses() { | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| if ip != nil { | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| s.releaseIPBestEffort(ctx, ip.String()) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| // ReleaseIP releases a single IP address in MAAS. Used when the machine is gone (e.g. force-deleted) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| // but the MaasMachine spec still had a static IP that may remain "User reserved". Best-effort. | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| func (s *Service) ReleaseIP(ip string) { | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| ctx := context.TODO() | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
Comment on lines
+131
to
+132
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| func (s *Service) ReleaseIP(ip string) { | |
| ctx := context.TODO() | |
| // For new code, prefer ReleaseIPWithContext to allow cancellation/timeouts. | |
| func (s *Service) ReleaseIP(ip string) { | |
| s.ReleaseIPWithContext(context.Background(), ip) | |
| } | |
| // ReleaseIPWithContext releases a single IP address in MAAS using the provided context. | |
| // This allows callers (e.g. controllers) to apply timeouts and cancellation. | |
| func (s *Service) ReleaseIPWithContext(ctx context.Context, ip string) { |
Copilot
AI
Mar 6, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ReleaseMachine now explicitly releases each machine IP (via releaseIPBestEffort/IPAddresses().Release()) before calling Releaser().Release(), but the current unit tests only cover an empty IP list. Please add/extend tests to assert non-empty IPAddresses() triggers the expected IP release calls (and ideally cover the retry path too).
Copilot
AI
Mar 6, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ReleaseMachine treats any Machine().Get() error as "machine already gone" and then releases the scope static IP. If Get fails for a transient/non-404 reason (e.g. MAAS API outage), this can unintentionally release an IP while the machine still exists. Consider only taking the "already gone" path when the error is a confirmed not-found (e.g. the same 404/no machine matches checks used in GetMachine), and otherwise log/return the error (or skip IP release).
| // Machine already gone; release scope static IP so it does not stay "User reserved". | |
| s.releaseIPBestEffort(ctx, s.scope.GetStaticIP()) | |
| // Only treat confirmed/notional "not found" errors as "machine already gone". | |
| errMsg := getErr.Error() | |
| notFound := strings.Contains(errMsg, "404") || | |
| strings.Contains(errMsg, "not found") || | |
| strings.Contains(errMsg, "No matching machine") | |
| if notFound { | |
| s.scope.V(1).Info("Machine not found when fetching before release; releasing scope static IP", "systemID", systemID, "error", getErr) | |
| // Machine already gone; release scope static IP so it does not stay "User reserved". | |
| s.releaseIPBestEffort(ctx, s.scope.GetStaticIP()) | |
| } else { | |
| // Transient or unexpected error; skip scope static IP release to avoid releasing an IP | |
| // that may still be in use, but still attempt machine release below. | |
| s.scope.V(1).Info("Failed to get machine before release; skipping scope static IP release", "systemID", systemID, "error", getErr) | |
| } |
Copilot
AI
Mar 6, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ReleaseMachine uses time.Sleep for backoff while using context.TODO(), so this blocking wait can't be cancelled on shutdown or reconcile timeout. In controller-runtime codepaths this can reduce throughput and delay other reconciliations. Prefer a context-aware backoff (e.g. timer + select on ctx.Done()) and consider threading the caller's ctx into ReleaseMachine instead of creating a TODO context.
| time.Sleep(backoff) | |
| timer := time.NewTimer(backoff) | |
| select { | |
| case <-ctx.Done(): | |
| if !timer.Stop() { | |
| <-timer.C | |
| } | |
| return ctx.Err() | |
| case <-timer.C: | |
| } |
Copilot
AI
Mar 6, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The Release() retry loop retries on every error, including likely non-retriable ones (e.g. 404 not found if the machine was already released by another actor). This can add unnecessary delay and may cause reconciliation to fail even though the desired end state (machine gone) is already reached. Consider detecting "not found" errors and treating them as success, and only retrying errors that are known transient (or at least stopping retries on 4xx).
Copilot
AI
Mar 6, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The deploy-failure cleanup path also performs blocking retry sleeps using the same long-lived ctx variable; this can stall reconcile for multiple seconds and doesn't respect cancellation. Consider using a context-aware backoff (or pushing retries up to the reconciler via requeue) so shutdown and timeouts can interrupt the wait.
| for attempt := 1; attempt <= maxAttempts; attempt++ { | |
| _, releaseErr = m.Releaser().Release(ctx) | |
| if releaseErr == nil { | |
| break | |
| } | |
| log.Error(releaseErr, "Unable to release machine on deploy failure", "attempt", attempt) | |
| if attempt < maxAttempts { | |
| time.Sleep(backoff) | |
| retryRelease: | |
| for attempt := 1; attempt <= maxAttempts; attempt++ { | |
| // Stop retrying if the context has been cancelled. | |
| if err := ctx.Err(); err != nil { | |
| log.Error(err, "Context cancelled while retrying machine release on deploy failure") | |
| break retryRelease | |
| } | |
| _, releaseErr = m.Releaser().Release(ctx) | |
| if releaseErr == nil { | |
| break | |
| } | |
| log.Error(releaseErr, "Unable to release machine on deploy failure", "attempt", attempt) | |
| if attempt < maxAttempts { | |
| select { | |
| case <-ctx.Done(): | |
| // Context cancelled during backoff; stop retrying. | |
| log.Error(ctx.Err(), "Context cancelled during backoff while retrying machine release on deploy failure") | |
| break retryRelease | |
| case <-time.After(backoff): | |
| } |
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -103,9 +103,11 @@ func TestMachine(t *testing.T) { | |||||
| } | ||||||
|
|
||||||
| mockClientSetInterface.EXPECT().Machines().Return(mockMachines) | ||||||
|
||||||
| mockClientSetInterface.EXPECT().Machines().Return(mockMachines) | |
| mockClientSetInterface.EXPECT().Machines().Return(mockMachines).Times(2) // Get, then Releaser |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
reconcileDelete receives a context but discards it (
_ context.Context) and then calls machineSvc.ReleaseIP(), which currently uses context.TODO(). This prevents request-scoped timeouts/cancellation for MAAS calls during delete. Consider keeping the ctx parameter and threading it through to ReleaseIP(ctx, ip).