Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions controllers/maasmachine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,10 @@ func (r *MaasMachineReconciler) reconcileDelete(_ context.Context, machineScope
machineScope.Info("Not removing finalizer yet; waiting for deletion age threshold", "age", deletionAge.String())
return ctrl.Result{RequeueAfter: 30 * time.Second}, nil
}
// Release static IP from spec if set, so it does not remain "User reserved" in MAAS (e.g. force-deleted machine).
if maasMachine.Spec.StaticIP != nil && maasMachine.Spec.StaticIP.IP != "" {
machineSvc.ReleaseIP(maasMachine.Spec.StaticIP.IP)
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

reconcileDelete receives a context but discards it (_ context.Context) and then calls machineSvc.ReleaseIP(), which currently uses context.TODO(). This prevents request-scoped timeouts/cancellation for MAAS calls during delete. Consider keeping the ctx parameter and threading it through to ReleaseIP(ctx, ip).

Suggested change
machineSvc.ReleaseIP(maasMachine.Spec.StaticIP.IP)
machineSvc.ReleaseIP(ctx, maasMachine.Spec.StaticIP.IP)

Copilot uses AI. Check for mistakes.
}
machineScope.V(2).Info("Unable to locate MaaS instance by ID or tags", "system-id", machineScope.GetInstanceID())
r.Recorder.Eventf(maasMachine, corev1.EventTypeWarning, "NoMachineFound", "Unable to find matching MaaS machine")
controllerutil.RemoveFinalizer(maasMachine, infrav1beta1.MachineFinalizer)
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ module github.com/spectrocloud/cluster-api-provider-maas

go 1.24.2

toolchain go1.24.11
toolchain go1.24.13

require (
github.com/go-logr/logr v1.4.2
Expand Down
108 changes: 97 additions & 11 deletions pkg/maas/machine/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"net"
"regexp"
"strings"
"time"

"github.com/pkg/errors"
"github.com/spectrocloud/cluster-api-provider-maas/pkg/maas/maintenance"
Expand Down Expand Up @@ -58,6 +59,7 @@ func logVMHostDiagnostics(s *Service, err error) {
sys := m[1]
s.scope.Info("Releasing broken machine", "system-id", sys)
ctx := context.TODO()
s.releaseIPsForMachine(ctx, sys)
_, _ = s.maasClient.Machines().Machine(sys).Releaser().WithForce().Release(ctx)
}

Expand Down Expand Up @@ -96,18 +98,76 @@ func (s *Service) GetMachine(systemID string) (*infrav1beta1.Machine, error) {
return machine, nil
}

// releaseIPBestEffort attempts to release the given IP in MAAS, ignoring failures.
// It exists to prevent "User reserved" IP exhaustion when a machine is never
// released (e.g. crash, force delete) or when release fails; errors are logged only.
func (s *Service) releaseIPBestEffort(ctx context.Context, ip string) {
	if ip == "" {
		return
	}
	err := s.maasClient.IPAddresses().Release(ctx, ip)
	if err == nil {
		return
	}
	// Best-effort semantics: surface the failure at debug verbosity and move on.
	s.scope.V(1).Info("Best-effort IP release failed (may already be released)", "ip", ip, "error", err)
}

// releaseIPsForMachine fetches the machine identified by systemID and releases
// each of its IP addresses (best-effort). Used so force-release and other paths
// that do not go through ReleaseMachine still free their IP allocations instead
// of leaving them "User reserved" in MAAS.
func (s *Service) releaseIPsForMachine(ctx context.Context, systemID string) {
	if systemID == "" {
		return
	}
	m, err := s.maasClient.Machines().Machine(systemID).Get(ctx)
	if err != nil {
		// Best-effort: the machine may already be gone. Previously this error was
		// silently swallowed; log at debug verbosity so failed fetches are diagnosable,
		// consistent with releaseIPBestEffort's logging style.
		s.scope.V(1).Info("Unable to fetch machine for IP release", "system-id", systemID, "error", err)
		return
	}
	for _, ip := range m.IPAddresses() {
		if ip != nil {
			s.releaseIPBestEffort(ctx, ip.String())
		}
	}
}

// ReleaseIP releases a single IP address in MAAS. Used when the machine is gone (e.g. force-deleted)
// but the MaasMachine spec still had a static IP that may remain "User reserved". Best-effort.
func (s *Service) ReleaseIP(ip string) {
ctx := context.TODO()
Comment on lines +131 to +132
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ReleaseIP creates a context.TODO() internally, which makes it hard for callers (controllers) to apply timeouts/cancellation and can block shutdown if MAAS calls hang. Consider changing ReleaseIP to accept a context.Context parameter and passing the reconcile ctx from the controller.

Suggested change
func (s *Service) ReleaseIP(ip string) {
ctx := context.TODO()
// For new code, prefer ReleaseIPWithContext to allow cancellation/timeouts.
func (s *Service) ReleaseIP(ip string) {
s.ReleaseIPWithContext(context.Background(), ip)
}
// ReleaseIPWithContext releases a single IP address in MAAS using the provided context.
// This allows callers (e.g. controllers) to apply timeouts and cancellation.
func (s *Service) ReleaseIPWithContext(ctx context.Context, ip string) {

Copilot uses AI. Check for mistakes.
s.releaseIPBestEffort(ctx, ip)
}

// ReleaseMachine releases the MAAS machine and its IP allocations. It explicitly releases any
// IPs associated with the machine before calling MAAS Release(), and retries Release() on
// transient failures to reduce "User reserved" IP exhaustion from failed releases.
func (s *Service) ReleaseMachine(systemID string) error {
ctx := context.TODO()

_, err := s.maasClient.Machines().
Machine(systemID).
Releaser().
Release(ctx)
if err != nil {
return errors.Wrapf(err, "Unable to release machine")
// Fetch machine to release its IPs explicitly before release. IPs can remain "User reserved"
// if Release() is never called (crash, force delete, migration failure) or if Release() fails.
m, getErr := s.maasClient.Machines().Machine(systemID).Get(ctx)
if getErr == nil {
for _, ip := range m.IPAddresses() {
if ip != nil {
s.releaseIPBestEffort(ctx, ip.String())
}
Comment on lines +145 to +149
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ReleaseMachine now explicitly releases each machine IP (via releaseIPBestEffort/IPAddresses().Release()) before calling Releaser().Release(), but the current unit tests only cover an empty IP list. Please add/extend tests to assert non-empty IPAddresses() triggers the expected IP release calls (and ideally cover the retry path too).

Copilot uses AI. Check for mistakes.
}
} else {
// Machine already gone; release scope static IP so it does not stay "User reserved".
s.releaseIPBestEffort(ctx, s.scope.GetStaticIP())
Comment on lines +152 to +153
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ReleaseMachine treats any Machine().Get() error as "machine already gone" and then releases the scope static IP. If Get fails for a transient/non-404 reason (e.g. MAAS API outage), this can unintentionally release an IP while the machine still exists. Consider only taking the "already gone" path when the error is a confirmed not-found (e.g. the same 404/no machine matches checks used in GetMachine), and otherwise log/return the error (or skip IP release).

Suggested change
// Machine already gone; release scope static IP so it does not stay "User reserved".
s.releaseIPBestEffort(ctx, s.scope.GetStaticIP())
// Only treat confirmed/notional "not found" errors as "machine already gone".
errMsg := getErr.Error()
notFound := strings.Contains(errMsg, "404") ||
strings.Contains(errMsg, "not found") ||
strings.Contains(errMsg, "No matching machine")
if notFound {
s.scope.V(1).Info("Machine not found when fetching before release; releasing scope static IP", "systemID", systemID, "error", getErr)
// Machine already gone; release scope static IP so it does not stay "User reserved".
s.releaseIPBestEffort(ctx, s.scope.GetStaticIP())
} else {
// Transient or unexpected error; skip scope static IP release to avoid releasing an IP
// that may still be in use, but still attempt machine release below.
s.scope.V(1).Info("Failed to get machine before release; skipping scope static IP release", "systemID", systemID, "error", getErr)
}

Copilot uses AI. Check for mistakes.
}
// Proceed with machine release even if Get failed (e.g. machine already gone).

return nil
const maxAttempts = 3
const backoff = 2 * time.Second
var lastErr error
for attempt := 1; attempt <= maxAttempts; attempt++ {
_, lastErr = s.maasClient.Machines().Machine(systemID).Releaser().Release(ctx)
if lastErr == nil {
return nil
}
s.scope.V(1).Info("Machine release attempt failed, will retry", "attempt", attempt, "systemID", systemID, "error", lastErr)
if attempt < maxAttempts {
time.Sleep(backoff)
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ReleaseMachine uses time.Sleep for backoff while using context.TODO(), so this blocking wait can't be cancelled on shutdown or reconcile timeout. In controller-runtime codepaths this can reduce throughput and delay other reconciliations. Prefer a context-aware backoff (e.g. timer + select on ctx.Done()) and consider threading the caller's ctx into ReleaseMachine instead of creating a TODO context.

Suggested change
time.Sleep(backoff)
timer := time.NewTimer(backoff)
select {
case <-ctx.Done():
if !timer.Stop() {
<-timer.C
}
return ctx.Err()
case <-timer.C:
}

Copilot uses AI. Check for mistakes.
}
}
return errors.Wrapf(lastErr, "Unable to release machine after %d attempts", maxAttempts)
Comment on lines +157 to +170
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Release() retry loop retries on every error, including likely non-retriable ones (e.g. 404 not found if the machine was already released by another actor). This can add unnecessary delay and may cause reconciliation to fail even though the desired end state (machine gone) is already reached. Consider detecting "not found" errors and treating them as success, and only retrying errors that are known transient (or at least stopping retries on 4xx).

Copilot uses AI. Check for mistakes.
}

func (s *Service) DeployMachine(userDataB64 string) (_ *infrav1beta1.Machine, rerr error) {
Expand Down Expand Up @@ -180,6 +240,7 @@ func (s *Service) DeployMachine(userDataB64 string) (_ *infrav1beta1.Machine, re
if pt == "lxd" || pt == "lxdvm" || pt == "virsh" {
s.scope.Info("Rejecting VM host allocation for node(s) under HCP; releasing and retrying",
"system-id", m.SystemID(), "powerType", pt, "zone", m.ZoneName(), "pool", m.ResourcePoolName())
s.releaseIPsForMachine(ctx, m.SystemID())
_, _ = m.Releaser().WithForce().Release(ctx)
return nil, ErrBrokenMachine
}
Expand Down Expand Up @@ -225,10 +286,26 @@ func (s *Service) DeployMachine(userDataB64 string) (_ *infrav1beta1.Machine, re
defer func() {
if rerr != nil {
s.scope.Info("Attempting to release machine which failed to deploy")
_, err := m.Releaser().Release(ctx)
if err != nil {
// Is it right to NOT set rerr so we can see the original issue?
log.Error(err, "Unable to release properly")
// Release static IP if we had one to avoid "User reserved" IP exhaustion
if staticIP := s.scope.GetStaticIP(); staticIP != "" {
s.releaseIPBestEffort(ctx, staticIP)
}
// Retry machine release to reduce stuck allocations when Release() fails transiently
const maxAttempts = 3
const backoff = 2 * time.Second
var releaseErr error
for attempt := 1; attempt <= maxAttempts; attempt++ {
_, releaseErr = m.Releaser().Release(ctx)
if releaseErr == nil {
break
}
log.Error(releaseErr, "Unable to release machine on deploy failure", "attempt", attempt)
if attempt < maxAttempts {
time.Sleep(backoff)
Comment on lines +297 to +304
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The deploy-failure cleanup path also performs blocking retry sleeps using the same long-lived ctx variable; this can stall reconcile for multiple seconds and doesn't respect cancellation. Consider using a context-aware backoff (or pushing retries up to the reconciler via requeue) so shutdown and timeouts can interrupt the wait.

Suggested change
for attempt := 1; attempt <= maxAttempts; attempt++ {
_, releaseErr = m.Releaser().Release(ctx)
if releaseErr == nil {
break
}
log.Error(releaseErr, "Unable to release machine on deploy failure", "attempt", attempt)
if attempt < maxAttempts {
time.Sleep(backoff)
retryRelease:
for attempt := 1; attempt <= maxAttempts; attempt++ {
// Stop retrying if the context has been cancelled.
if err := ctx.Err(); err != nil {
log.Error(err, "Context cancelled while retrying machine release on deploy failure")
break retryRelease
}
_, releaseErr = m.Releaser().Release(ctx)
if releaseErr == nil {
break
}
log.Error(releaseErr, "Unable to release machine on deploy failure", "attempt", attempt)
if attempt < maxAttempts {
select {
case <-ctx.Done():
// Context cancelled during backoff; stop retrying.
log.Error(ctx.Err(), "Context cancelled during backoff while retrying machine release on deploy failure")
break retryRelease
case <-time.After(backoff):
}

Copilot uses AI. Check for mistakes.
}
}
if releaseErr != nil {
log.Error(releaseErr, "Unable to release properly after retries (IP/machine may need manual release in MAAS)")
}

// Clear IDs so the next reconcile can allocate a different machine instead of
Expand Down Expand Up @@ -286,6 +363,13 @@ func (s *Service) createVMViaMAAS(ctx context.Context, userDataB64 string) (*inf
if err != nil {
return nil, errors.Wrap(err, "failed to get existing VM by system-id")
}
cleanupOnError := false
defer func() {
if cleanupOnError {
s.releaseIPBestEffort(ctx, s.scope.GetStaticIP())
_ = s.ReleaseMachine(m.SystemID())
}
}()
// Best-effort: set hostname and static IP before deploy
machineName := s.scope.Machine.Name
vmName := fmt.Sprintf("vm-%s", machineName)
Expand All @@ -295,6 +379,7 @@ func (s *Service) createVMViaMAAS(ctx context.Context, userDataB64 string) (*inf
if staticIP := s.scope.GetStaticIP(); staticIP != "" {
if err := s.setMachineStaticIP(m.SystemID(), &infrav1beta1.StaticIPConfig{IP: staticIP}); err != nil {
// Fail fast so we don't attempt Deploy without a network link configured
cleanupOnError = true
return nil, errors.Wrap(err, "failed to configure static IP before deploy")
}
}
Expand All @@ -304,6 +389,7 @@ func (s *Service) createVMViaMAAS(ctx context.Context, userDataB64 string) (*inf
SetOSSystem("custom").
SetDistroSeries(mm.Spec.Image).Deploy(ctx)
if err != nil {
cleanupOnError = true
return nil, errors.Wrap(err, "failed to deploy existing VM")
}
// Determine fallback zone
Expand Down
6 changes: 4 additions & 2 deletions pkg/maas/machine/machine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,11 @@ func TestMachine(t *testing.T) {
}

mockClientSetInterface.EXPECT().Machines().Return(mockMachines)
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ReleaseMachine calls maasClient.Machines() at least twice now (once for Get, once for Releaser().Release(), and more if retries happen), but this test only expects Machines() to be called once. Update the gomock expectation (e.g. Times(2) for the no-retry happy path, or AnyTimes/MinTimes if you want the test to tolerate retries).

Suggested change
mockClientSetInterface.EXPECT().Machines().Return(mockMachines)
mockClientSetInterface.EXPECT().Machines().Return(mockMachines).Times(2) // Get, then Releaser

Copilot uses AI. Check for mistakes.
mockMachines.EXPECT().Machine("abc123").Return(mockMachine)
mockMachines.EXPECT().Machine("abc123").Return(mockMachine).Times(2) // Get, then Releaser
mockMachine.EXPECT().Get(gomock.Any()).Return(mockMachine, nil)
mockMachine.EXPECT().IPAddresses().Return([]net.IP{})
mockMachine.EXPECT().Releaser().Return(mockMachineReleaser)
mockMachineReleaser.EXPECT().Release(context.Background()).Return(mockMachine, nil)
mockMachineReleaser.EXPECT().Release(gomock.Any()).Return(mockMachine, nil)

err := s.ReleaseMachine("abc123")
g.Expect(err).ToNot(HaveOccurred())
Expand Down