From f94dd8f08b5276ec12b84270530c1b4b7010567d Mon Sep 17 00:00:00 2001 From: Amit Sahastrabuddhe Date: Fri, 6 Mar 2026 16:03:11 +0530 Subject: [PATCH] PCP-5556: MAAS CAPI Provider is failing to clean up Stale IP addresses --- controllers/maasmachine_controller.go | 4 + go.mod | 2 +- pkg/maas/machine/machine.go | 108 +++++++++++++++++++++++--- pkg/maas/machine/machine_test.go | 6 +- 4 files changed, 106 insertions(+), 14 deletions(-) diff --git a/controllers/maasmachine_controller.go b/controllers/maasmachine_controller.go index 79d1261f8..1a9000ae4 100644 --- a/controllers/maasmachine_controller.go +++ b/controllers/maasmachine_controller.go @@ -206,6 +206,10 @@ func (r *MaasMachineReconciler) reconcileDelete(_ context.Context, machineScope machineScope.Info("Not removing finalizer yet; waiting for deletion age threshold", "age", deletionAge.String()) return ctrl.Result{RequeueAfter: 30 * time.Second}, nil } + // Release static IP from spec if set, so it does not remain "User reserved" in MAAS (e.g. force-deleted machine). + if maasMachine.Spec.StaticIP != nil && maasMachine.Spec.StaticIP.IP != "" { + machineSvc.ReleaseIP(maasMachine.Spec.StaticIP.IP) + } machineScope.V(2).Info("Unable to locate MaaS instance by ID or tags", "system-id", machineScope.GetInstanceID()) r.Recorder.Eventf(maasMachine, corev1.EventTypeWarning, "NoMachineFound", "Unable to find matching MaaS machine") controllerutil.RemoveFinalizer(maasMachine, infrav1beta1.MachineFinalizer) diff --git a/go.mod b/go.mod index b4977862f..8206a1bbb 100644 --- a/go.mod +++ b/go.mod @@ -2,7 +2,7 @@ module github.com/spectrocloud/cluster-api-provider-maas go 1.24.2 -toolchain go1.24.11 +toolchain go1.24.13 require ( github.com/go-logr/logr v1.4.2 diff --git a/pkg/maas/machine/machine.go b/pkg/maas/machine/machine.go index 20eddad1c..02e5780ed 100644 --- a/pkg/maas/machine/machine.go +++ b/pkg/maas/machine/machine.go @@ -8,6 +8,7 @@ import ( "net" "regexp" "strings" + "time" "github.com/pkg/errors" "github.com/spectrocloud/cluster-api-provider-maas/pkg/maas/maintenance" @@ -58,6 +59,7 @@ func logVMHostDiagnostics(s *Service, err error) { sys := m[1] s.scope.Info("Releasing broken machine", "system-id", sys) ctx := context.TODO() + s.releaseIPsForMachine(ctx, sys) _, _ = s.maasClient.Machines().Machine(sys).Releaser().WithForce().Release(ctx) } @@ -96,18 +98,76 @@ func (s *Service) GetMachine(systemID string) (*infrav1beta1.Machine, error) { return machine, nil } +// releaseIPBestEffort releases a single IP in MAAS. Used to avoid "User reserved" IP exhaustion +// when a machine is never released (e.g. crash, force delete) or release fails. Logs errors only. +func (s *Service) releaseIPBestEffort(ctx context.Context, ip string) { + if ip == "" { + return + } + if err := s.maasClient.IPAddresses().Release(ctx, ip); err != nil { + s.scope.V(1).Info("Best-effort IP release failed (may already be released)", "ip", ip, "error", err) + } +} + +// releaseIPsForMachine fetches the machine by systemID and releases each of its IPs (best-effort). +// Used so force-release and other paths that do not call ReleaseMachine still release IPs. +func (s *Service) releaseIPsForMachine(ctx context.Context, systemID string) { + if systemID == "" { + return + } + m, err := s.maasClient.Machines().Machine(systemID).Get(ctx) + if err != nil { + return + } + for _, ip := range m.IPAddresses() { + if ip != nil { + s.releaseIPBestEffort(ctx, ip.String()) + } + } +} + +// ReleaseIP releases a single IP address in MAAS. Used when the machine is gone (e.g. force-deleted) +// but the MaasMachine spec still had a static IP that may remain "User reserved". Best-effort. +func (s *Service) ReleaseIP(ip string) { + ctx := context.TODO() + s.releaseIPBestEffort(ctx, ip) +} + +// ReleaseMachine releases the MAAS machine and its IP allocations. It explicitly releases any +// IPs associated with the machine before calling MAAS Release(), and retries Release() on +// transient failures to reduce "User reserved" IP exhaustion from failed releases. func (s *Service) ReleaseMachine(systemID string) error { ctx := context.TODO() - _, err := s.maasClient.Machines(). - Machine(systemID). - Releaser(). - Release(ctx) - if err != nil { - return errors.Wrapf(err, "Unable to release machine") + // Fetch machine to release its IPs explicitly before release. IPs can remain "User reserved" + // if Release() is never called (crash, force delete, migration failure) or if Release() fails. + m, getErr := s.maasClient.Machines().Machine(systemID).Get(ctx) + if getErr == nil { + for _, ip := range m.IPAddresses() { + if ip != nil { + s.releaseIPBestEffort(ctx, ip.String()) + } + } + } else { + // Machine already gone; release scope static IP so it does not stay "User reserved". + s.releaseIPBestEffort(ctx, s.scope.GetStaticIP()) } + // Proceed with machine release even if Get failed (e.g. machine already gone). - return nil + const maxAttempts = 3 + const backoff = 2 * time.Second + var lastErr error + for attempt := 1; attempt <= maxAttempts; attempt++ { + _, lastErr = s.maasClient.Machines().Machine(systemID).Releaser().Release(ctx) + if lastErr == nil { + return nil + } + s.scope.V(1).Info("Machine release attempt failed, will retry", "attempt", attempt, "systemID", systemID, "error", lastErr) + if attempt < maxAttempts { + time.Sleep(backoff) + } + } + return errors.Wrapf(lastErr, "Unable to release machine after %d attempts", maxAttempts) } func (s *Service) DeployMachine(userDataB64 string) (_ *infrav1beta1.Machine, rerr error) { @@ -180,6 +240,7 @@ func (s *Service) DeployMachine(userDataB64 string) (_ *infrav1beta1.Machine, re if pt == "lxd" || pt == "lxdvm" || pt == "virsh" { s.scope.Info("Rejecting VM host allocation for node(s) under HCP; releasing and retrying", "system-id", m.SystemID(), "powerType", pt, "zone", m.ZoneName(), "pool", m.ResourcePoolName()) + s.releaseIPsForMachine(ctx, m.SystemID()) _, _ = m.Releaser().WithForce().Release(ctx) return nil, ErrBrokenMachine } @@ -225,10 +286,26 @@ func (s *Service) DeployMachine(userDataB64 string) (_ *infrav1beta1.Machine, re defer func() { if rerr != nil { s.scope.Info("Attempting to release machine which failed to deploy") - _, err := m.Releaser().Release(ctx) - if err != nil { - // Is it right to NOT set rerr so we can see the original issue? - log.Error(err, "Unable to release properly") + // Release static IP if we had one to avoid "User reserved" IP exhaustion + if staticIP := s.scope.GetStaticIP(); staticIP != "" { + s.releaseIPBestEffort(ctx, staticIP) + } + // Retry machine release to reduce stuck allocations when Release() fails transiently + const maxAttempts = 3 + const backoff = 2 * time.Second + var releaseErr error + for attempt := 1; attempt <= maxAttempts; attempt++ { + _, releaseErr = m.Releaser().Release(ctx) + if releaseErr == nil { + break + } + log.Error(releaseErr, "Unable to release machine on deploy failure", "attempt", attempt) + if attempt < maxAttempts { + time.Sleep(backoff) + } + } + if releaseErr != nil { + log.Error(releaseErr, "Unable to release properly after retries (IP/machine may need manual release in MAAS)") } // Clear IDs so the next reconcile can allocate a different machine instead of @@ -286,6 +363,13 @@ func (s *Service) createVMViaMAAS(ctx context.Context, userDataB64 string) (*inf if err != nil { return nil, errors.Wrap(err, "failed to get existing VM by system-id") } + cleanupOnError := false + defer func() { + if cleanupOnError { + s.releaseIPBestEffort(ctx, s.scope.GetStaticIP()) + _ = s.ReleaseMachine(m.SystemID()) + } + }() // Best-effort: set hostname and static IP before deploy machineName := s.scope.Machine.Name vmName := fmt.Sprintf("vm-%s", machineName) @@ -295,6 +379,7 @@ func (s *Service) createVMViaMAAS(ctx context.Context, userDataB64 string) (*inf if staticIP := s.scope.GetStaticIP(); staticIP != "" { if err := s.setMachineStaticIP(m.SystemID(), &infrav1beta1.StaticIPConfig{IP: staticIP}); err != nil { // Fail fast so we don't attempt Deploy without a network link configured + cleanupOnError = true return nil, errors.Wrap(err, "failed to configure static IP before deploy") } } @@ -304,6 +389,7 @@ func (s *Service) createVMViaMAAS(ctx context.Context, userDataB64 string) (*inf SetOSSystem("custom"). SetDistroSeries(mm.Spec.Image).Deploy(ctx) if err != nil { + cleanupOnError = true return nil, errors.Wrap(err, "failed to deploy existing VM") } // Determine fallback zone diff --git a/pkg/maas/machine/machine_test.go b/pkg/maas/machine/machine_test.go index 161c03b52..749fd0177 100644 --- a/pkg/maas/machine/machine_test.go +++ b/pkg/maas/machine/machine_test.go @@ -103,9 +103,11 @@ func TestMachine(t *testing.T) { } mockClientSetInterface.EXPECT().Machines().Return(mockMachines) - mockMachines.EXPECT().Machine("abc123").Return(mockMachine) + mockMachines.EXPECT().Machine("abc123").Return(mockMachine).Times(2) // Get, then Releaser + mockMachine.EXPECT().Get(gomock.Any()).Return(mockMachine, nil) + mockMachine.EXPECT().IPAddresses().Return([]net.IP{}) mockMachine.EXPECT().Releaser().Return(mockMachineReleaser) - mockMachineReleaser.EXPECT().Release(context.Background()).Return(mockMachine, nil) + mockMachineReleaser.EXPECT().Release(gomock.Any()).Return(mockMachine, nil) err := s.ReleaseMachine("abc123") g.Expect(err).ToNot(HaveOccurred())