Skip to content

Commit 6e58fca

Browse files
author
OpenShift CI Bot
committed
fix(control-plane-operator): add HCP finalizer to AWSEndpointService reconciler
Add a finalizer on the HostedControlPlane to block HCP deletion until all AWS PrivateLink resources (VPC endpoints, security groups, DNS records) are cleaned up by the AWSEndpointService reconciler. - Add hcpAWSPrivateLinkFinalizerName finalizer, placed after client initialization succeeds on the normal reconciliation path - Add reconcileHCPDeletion to clean up AWS resources for each AWSEndpointService CR before removing the HCP finalizer - Replace handler.Funcs{UpdateFunc: ...} HCP watch with EnqueueRequestsFromMapFunc so Create/Delete events also trigger reconciliation (critical for CPO restarts during HCP deletion) - Use convergent multi-CR coordination: each reconciler cleans its own CR, only the last one to finish removes the HCP finalizer - Add comprehensive unit tests covering finalizer lifecycle, HCP deletion cleanup, concurrent reconciler coordination, and SharedVPC scenarios Previously, if the HCP was deleted before AWSEndpointService cleanup, the controller could not construct valid AWS clients — particularly for SharedVPC clusters where cross-account role ARNs are sourced from the HCP spec — orphaning AWS resources. Signed-off-by: OpenShift CI Bot <ci-bot@redhat.com> Commit-Message-Assisted-by: Claude (via Claude Code)
1 parent bc3bda9 commit 6e58fca

2 files changed

Lines changed: 936 additions & 47 deletions

File tree

control-plane-operator/controllers/awsprivatelink/awsprivatelink_controller.go

Lines changed: 203 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,8 @@ import (
4646
"sigs.k8s.io/controller-runtime/pkg/client"
4747
"sigs.k8s.io/controller-runtime/pkg/controller"
4848
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
49-
"sigs.k8s.io/controller-runtime/pkg/event"
5049
"sigs.k8s.io/controller-runtime/pkg/handler"
50+
"sigs.k8s.io/controller-runtime/pkg/log"
5151
"sigs.k8s.io/controller-runtime/pkg/manager"
5252
"sigs.k8s.io/controller-runtime/pkg/reconcile"
5353
"sigs.k8s.io/controller-runtime/pkg/source"
@@ -219,6 +219,16 @@ const (
219219
endpointServiceDeletionRequeueDuration = 5 * time.Second
220220
hypershiftLocalZone = "hypershift.local"
221221
routerDomain = "apps"
222+
223+
// hcpAWSPrivateLinkFinalizerName is a finalizer placed on the HostedControlPlane by
224+
// the AWSEndpointService reconciler. It blocks HCP deletion until all AWS PrivateLink
225+
// resources (VPC endpoints, security groups, DNS records) are cleaned up.
226+
//
227+
// Without this finalizer, the AWSEndpointService CR's own finalizer runs during CR
228+
// deletion, but the controller may not be able to construct valid AWS clients if the
229+
// HCP has already been deleted — particularly for SharedVPC clusters where cross-account
230+
// role ARNs are sourced from the HCP spec. This would orphan AWS resources.
231+
hcpAWSPrivateLinkFinalizerName = "hypershift.openshift.io/aws-private-link-endpoint-cleanup"
222232
)
223233

224234
// AWSEndpointServiceReconciler watches AWSEndpointService resources and reconciles
@@ -371,7 +381,9 @@ func (r *AWSEndpointServiceReconciler) SetupWithManager(mgr ctrl.Manager) error
371381
RateLimiter: workqueue.NewTypedItemExponentialFailureRateLimiter[reconcile.Request](3*time.Second, 30*time.Second),
372382
MaxConcurrentReconciles: 10,
373383
}).
374-
Watches(&hyperv1.HostedControlPlane{}, handler.Funcs{UpdateFunc: r.enqueueOnAccessChange(mgr)}).
384+
Watches(&hyperv1.HostedControlPlane{}, handler.EnqueueRequestsFromMapFunc(
385+
r.mapHCPToAWSEndpointService(),
386+
)).
375387
Build(r)
376388
if err != nil {
377389
return fmt.Errorf("failed setting up with a controller manager: %w", err)
@@ -381,30 +393,42 @@ func (r *AWSEndpointServiceReconciler) SetupWithManager(mgr ctrl.Manager) error
381393
return nil
382394
}
383395

384-
func (r *AWSEndpointServiceReconciler) enqueueOnAccessChange(mgr ctrl.Manager) func(context.Context, event.UpdateEvent, workqueue.TypedRateLimitingInterface[reconcile.Request]) {
385-
return func(ctx context.Context, e event.UpdateEvent, q workqueue.TypedRateLimitingInterface[reconcile.Request]) {
386-
logger := mgr.GetLogger()
387-
newHCP, isOk := e.ObjectNew.(*hyperv1.HostedControlPlane)
388-
if !isOk {
389-
logger.Info("WARNING: enqueueOnAccessChange: new resource is not of type HostedControlPlane")
390-
return
396+
// mapHCPToAWSEndpointService maps HostedControlPlane events to the AWSEndpointService CRs
397+
// in the same namespace. This uses EnqueueRequestsFromMapFunc (instead of handler.Funcs)
398+
// so that all event types — Create, Update, Delete — trigger the mapping. This is critical
399+
// for controller restarts: if the CPO restarts while an HCP is being deleted (DeletionTimestamp
400+
// already set), the informer cache sync generates a Create event, not an Update. Using
401+
// handler.Funcs{UpdateFunc: ...} would miss this, defeating the purpose of the HCP finalizer.
402+
//
403+
// The function only enqueues CRs when the HCP has our finalizer, avoiding unnecessary
404+
// reconciliations for unrelated HCPs. EndpointAccess changes (previously detected via
405+
// old/new comparison) are picked up by the reconciler's periodic 5-minute requeue.
406+
func (r *AWSEndpointServiceReconciler) mapHCPToAWSEndpointService() handler.MapFunc {
407+
return func(ctx context.Context, obj client.Object) []reconcile.Request {
408+
hcp, ok := obj.(*hyperv1.HostedControlPlane)
409+
if !ok {
410+
return nil
411+
}
412+
413+
// Only trigger reconciliation when the HCP has our finalizer; this avoids
414+
// unnecessary reconciliations for HCPs that are not related to AWS PrivateLink.
415+
if !controllerutil.ContainsFinalizer(hcp, hcpAWSPrivateLinkFinalizerName) {
416+
return nil
391417
}
392-
oldHCP, isOk := e.ObjectOld.(*hyperv1.HostedControlPlane)
393-
if !isOk {
394-
logger.Info("WARNING: enqueueOnAccessChange: old resource is not of type HostedControlPlane")
395-
return
418+
419+
awsEndpointServiceList := &hyperv1.AWSEndpointServiceList{}
420+
if err := r.List(ctx, awsEndpointServiceList, client.InNamespace(hcp.Namespace)); err != nil {
421+
log.FromContext(ctx).Error(err, "mapHCPToAWSEndpointService: cannot list AWSEndpointService resources", "namespace", hcp.Namespace)
422+
return nil
396423
}
397-
// Only enqueue awsendpointservices when there is a change in the endpointaccess value, otherwise ignore changes
398-
if newHCP.Spec.Platform.AWS != nil && oldHCP.Spec.Platform.AWS != nil && newHCP.Spec.Platform.AWS.EndpointAccess != oldHCP.Spec.Platform.AWS.EndpointAccess {
399-
awsEndpointServiceList := &hyperv1.AWSEndpointServiceList{}
400-
if err := r.List(context.Background(), awsEndpointServiceList, client.InNamespace(newHCP.Namespace)); err != nil {
401-
logger.Error(err, "enqueueOnAccessChange: cannot list awsendpointservices")
402-
return
403-
}
404-
for i := range awsEndpointServiceList.Items {
405-
q.Add(reconcile.Request{NamespacedName: client.ObjectKeyFromObject(&awsEndpointServiceList.Items[i])})
406-
}
424+
425+
var requests []reconcile.Request
426+
for i := range awsEndpointServiceList.Items {
427+
requests = append(requests, reconcile.Request{
428+
NamespacedName: client.ObjectKeyFromObject(&awsEndpointServiceList.Items[i]),
429+
})
407430
}
431+
return requests
408432
}
409433
}
410434

@@ -443,13 +467,12 @@ func (r *AWSEndpointServiceReconciler) Reconcile(ctx context.Context, req ctrl.R
443467
// the non-deletion path. If the HCP still exists, initialize from it so that
444468
// getClients can succeed and deletion can proceed.
445469
//
446-
// Known issue (SharedVPC): when the HCP is already deleted, the SharedVPC role
447-
// ARNs (needed for cross-account EC2/Route53 access) are lost. Initialization
448-
// cannot happen, getClients will fail, and the finalizer will be preserved until
449-
// the hypershift-operator force-removes it after the grace period — orphaning
450-
// AWS resources in the shared VPC account. A proper fix requires persisting the
451-
// SharedVPC role ARNs in the AWSEndpointService status. See
452-
// TestReconcileDeletionSharedVPC for details.
470+
// The HCP finalizer (hcpAWSPrivateLinkFinalizerName), when present, blocks HCP
471+
// deletion so the HCP remains available during this cleanup. Note: the finalizer
472+
// is only added after a successful normal reconciliation. If the controller has
473+
// not yet reconciled (e.g., controller was down since cluster creation), the HCP
474+
// may be deleted before the finalizer is placed, and this best-effort initialization
475+
// is the only protection. See reconcileHCPDeletion for the HCP-deletion cleanup path.
453476
hcpList := &hyperv1.HostedControlPlaneList{}
454477
if err := r.List(ctx, hcpList, &client.ListOptions{Namespace: req.Namespace}); err == nil && len(hcpList.Items) == 1 {
455478
r.awsClientBuilder.initializeWithHCP(log, &hcpList.Items[0])
@@ -475,7 +498,16 @@ func (r *AWSEndpointServiceReconciler) Reconcile(ctx context.Context, req ctrl.R
475498
return ctrl.Result{}, nil
476499
}
477500

478-
// Ensure the awsEndpointService has a finalizer for cleanup
501+
serviceName := awsEndpointService.Status.EndpointServiceName
502+
if serviceName == "" {
503+
// Service Endpoint is not yet set, wait for hypershift-operator to populate
504+
// Likely observing our own Create
505+
return ctrl.Result{}, nil
506+
}
507+
508+
// Ensure the awsEndpointService has a finalizer for cleanup.
509+
// This is placed after the serviceName check so that CRs not yet populated
510+
// by hypershift-operator don't get a finalizer that would block HCP deletion.
479511
if !controllerutil.ContainsFinalizer(awsEndpointService, finalizer) {
480512
controllerutil.AddFinalizer(awsEndpointService, finalizer)
481513
if err := r.Update(ctx, awsEndpointService); err != nil {
@@ -486,26 +518,18 @@ func (r *AWSEndpointServiceReconciler) Reconcile(ctx context.Context, req ctrl.R
486518
}
487519
}
488520

489-
serviceName := awsEndpointService.Status.EndpointServiceName
490-
if serviceName == "" {
491-
// Service Endpoint is not yet set, wait for hypershift-operator to populate
492-
// Likely observing our own Create
493-
return ctrl.Result{}, nil
494-
}
495-
496-
// Fetch the HostedControlPlane
497-
hcpList := &hyperv1.HostedControlPlaneList{}
498-
if err := r.List(ctx, hcpList, &client.ListOptions{Namespace: req.Namespace}); err != nil {
499-
return ctrl.Result{}, fmt.Errorf("failed to get resource: %w", err)
521+
hcp, err := r.getHostedControlPlane(ctx, req.Namespace)
522+
if err != nil {
523+
return ctrl.Result{}, err
500524
}
501-
if len(hcpList.Items) == 0 {
502-
// Return early if HostedControlPlane is deleted
525+
if hcp == nil {
503526
return ctrl.Result{}, nil
504527
}
505-
if len(hcpList.Items) > 1 {
506-
return ctrl.Result{}, fmt.Errorf("unexpected number of HostedControlPlanes in namespace, expected: 1, actual: %d", len(hcpList.Items))
528+
529+
// Handle HCP deletion: clean up AWS resources while HCP credentials are still valid.
530+
if !hcp.DeletionTimestamp.IsZero() {
531+
return r.reconcileHCPDeletion(ctx, awsEndpointService, hcp, log)
507532
}
508-
hcp := &hcpList.Items[0]
509533

510534
if isPaused, duration := util.IsReconciliationPaused(log, hcp.Spec.PausedUntil); isPaused {
511535
log.Info("Reconciliation paused", "pausedUntil", *hcp.Spec.PausedUntil)
@@ -518,6 +542,13 @@ func (r *AWSEndpointServiceReconciler) Reconcile(ctx context.Context, req ctrl.R
518542
return ctrl.Result{}, err
519543
}
520544

545+
// Add HCP finalizer to block HCP deletion until AWS PrivateLink cleanup is done.
546+
// This is done after initializing clients, ensuring the reconciler is operational
547+
// and there are actually resources to protect.
548+
if result, err := r.ensureHCPFinalizer(ctx, hcp, log); err != nil || !result.IsZero() {
549+
return result, err
550+
}
551+
521552
// Reconcile the AWSEndpointService
522553
oldStatus := awsEndpointService.Status.DeepCopy()
523554
if err := r.reconcileAWSEndpointService(ctx, awsEndpointService, hcp, ec2Client, route53Client); err != nil {
@@ -565,6 +596,131 @@ func isAWSThrottleError(err error) bool {
565596
return false
566597
}
567598

599+
// ensureHCPFinalizer adds the HCP finalizer if not already present.
600+
// Uses optimistic locking via MergeFromWithOptimisticLock to safely update the HCP.
601+
func (r *AWSEndpointServiceReconciler) ensureHCPFinalizer(ctx context.Context, hcp *hyperv1.HostedControlPlane, log logr.Logger) (ctrl.Result, error) {
602+
if controllerutil.ContainsFinalizer(hcp, hcpAWSPrivateLinkFinalizerName) {
603+
return ctrl.Result{}, nil
604+
}
605+
606+
log.Info("Adding HCP finalizer for AWS PrivateLink cleanup")
607+
originalHCP := hcp.DeepCopy()
608+
controllerutil.AddFinalizer(hcp, hcpAWSPrivateLinkFinalizerName)
609+
if err := r.Patch(ctx, hcp, client.MergeFromWithOptions(originalHCP, client.MergeFromWithOptimisticLock{})); err != nil {
610+
if apierrors.IsConflict(err) {
611+
// Use RequeueAfter to avoid a tight retry loop when multiple AWSEndpointService
612+
// reconcilers concurrently try to patch the same HCP.
613+
return ctrl.Result{RequeueAfter: time.Second}, nil
614+
}
615+
return ctrl.Result{}, fmt.Errorf("failed to add HCP finalizer: %w", err)
616+
}
617+
return ctrl.Result{}, nil
618+
}
619+
620+
func (r *AWSEndpointServiceReconciler) getHostedControlPlane(ctx context.Context, namespace string) (*hyperv1.HostedControlPlane, error) {
621+
hcpList := &hyperv1.HostedControlPlaneList{}
622+
if err := r.List(ctx, hcpList, &client.ListOptions{Namespace: namespace}); err != nil {
623+
return nil, fmt.Errorf("failed to get resource: %w", err)
624+
}
625+
if len(hcpList.Items) == 0 {
626+
return nil, nil
627+
}
628+
if len(hcpList.Items) > 1 {
629+
return nil, fmt.Errorf("unexpected number of HostedControlPlanes in namespace, expected: 1, actual: %d", len(hcpList.Items))
630+
}
631+
return &hcpList.Items[0], nil
632+
}
633+
634+
// reconcileHCPDeletion handles HCP deletion by cleaning up AWS PrivateLink resources
635+
// and removing the HCP finalizer. This ensures AWS credentials remain valid during
636+
// the cleanup process, which is critical for SharedVPC clusters where the cross-account
637+
// role ARNs are sourced from the HCP spec.
638+
//
639+
// Convergent multi-CR coordination:
640+
// With MaxConcurrentReconciles: 10 and the HCP watch enqueuing ALL CRs on HCP deletion,
641+
// multiple reconcilers run this function concurrently — one per AWSEndpointService CR.
642+
// Each reconciler cleans up its own CR's AWS resources, then checks whether all CRs in
643+
// the namespace are done. Only the last reconciler to finish (the one that finds no pending
644+
// CRs) removes the HCP finalizer. Earlier reconcilers see pending CRs and return
645+
// RequeueAfter to retry; when they re-enter, they find the HCP finalizer already removed
646+
// and return early at the top of this function. This convergent pattern produces a small
647+
// number of no-op requeues but is correct and self-healing.
648+
func (r *AWSEndpointServiceReconciler) reconcileHCPDeletion(ctx context.Context, awsEndpointService *hyperv1.AWSEndpointService, hcp *hyperv1.HostedControlPlane, log logr.Logger) (ctrl.Result, error) {
649+
if !controllerutil.ContainsFinalizer(hcp, hcpAWSPrivateLinkFinalizerName) {
650+
return ctrl.Result{}, nil
651+
}
652+
653+
// If the AWSEndpointService itself is being deleted, let the existing CR deletion
654+
// path (which returns early at the top of Reconcile) handle cleanup. This prevents
655+
// the two deletion paths from racing under MaxConcurrentReconciles > 1, where the
656+
// CR deletion path and HCP deletion path could both try to clean up the same CR.
657+
if !awsEndpointService.DeletionTimestamp.IsZero() {
658+
return ctrl.Result{}, nil
659+
}
660+
661+
log.Info("HCP is being deleted, cleaning up AWS PrivateLink resources before removing HCP finalizer")
662+
663+
// Initialize AWS clients from the HCP — guaranteed to be available because
664+
// our finalizer blocks HCP deletion.
665+
r.awsClientBuilder.initializeWithHCP(log, hcp)
666+
ec2Client, route53Client, err := r.awsClientBuilder.getClients(ctx)
667+
if err != nil {
668+
return ctrl.Result{}, fmt.Errorf("failed to get AWS clients during HCP deletion cleanup: %w", err)
669+
}
670+
671+
// Clean up this AWSEndpointService's AWS resources.
672+
if controllerutil.ContainsFinalizer(awsEndpointService, finalizer) {
673+
completed, err := r.delete(ctx, awsEndpointService, ec2Client, route53Client)
674+
if err != nil {
675+
return ctrl.Result{}, fmt.Errorf("failed to clean up AWS resources during HCP deletion: %w", err)
676+
}
677+
if !completed {
678+
return ctrl.Result{RequeueAfter: endpointServiceDeletionRequeueDuration}, nil
679+
}
680+
681+
originalAES := awsEndpointService.DeepCopy()
682+
controllerutil.RemoveFinalizer(awsEndpointService, finalizer)
683+
if err := r.Patch(ctx, awsEndpointService, client.MergeFromWithOptions(originalAES, client.MergeFromWithOptimisticLock{})); err != nil {
684+
if apierrors.IsConflict(err) {
685+
return ctrl.Result{RequeueAfter: time.Second}, nil
686+
}
687+
return ctrl.Result{}, fmt.Errorf("failed to remove AWSEndpointService finalizer during HCP deletion: %w", err)
688+
}
689+
log.Info("Removed AWSEndpointService finalizer after AWS resource cleanup")
690+
}
691+
692+
// Check if all AWSEndpointService CRs in this namespace have been cleaned up.
693+
// Only remove the HCP finalizer once all CRs no longer have our CR finalizer,
694+
// meaning all AWS resources have been cleaned up.
695+
awsEndpointServiceList := &hyperv1.AWSEndpointServiceList{}
696+
if err := r.List(ctx, awsEndpointServiceList, client.InNamespace(hcp.Namespace)); err != nil {
697+
return ctrl.Result{}, fmt.Errorf("failed to list AWSEndpointService resources: %w", err)
698+
}
699+
for i := range awsEndpointServiceList.Items {
700+
if controllerutil.ContainsFinalizer(&awsEndpointServiceList.Items[i], finalizer) {
701+
log.Info("Other AWSEndpointService resources still have finalizers, keeping HCP finalizer",
702+
"pendingResource", awsEndpointServiceList.Items[i].Name)
703+
return ctrl.Result{RequeueAfter: endpointServiceDeletionRequeueDuration}, nil
704+
}
705+
}
706+
707+
// All AWSEndpointService resources are cleaned up — remove the HCP finalizer
708+
// to unblock HCP deletion.
709+
log.Info("All AWS PrivateLink resources cleaned up, removing HCP finalizer")
710+
originalHCP := hcp.DeepCopy()
711+
controllerutil.RemoveFinalizer(hcp, hcpAWSPrivateLinkFinalizerName)
712+
if err := r.Patch(ctx, hcp, client.MergeFromWithOptions(originalHCP, client.MergeFromWithOptimisticLock{})); err != nil {
713+
if apierrors.IsConflict(err) {
714+
// Use RequeueAfter to avoid a tight retry loop when multiple AWSEndpointService
715+
// reconcilers concurrently try to patch the same HCP.
716+
return ctrl.Result{RequeueAfter: time.Second}, nil
717+
}
718+
return ctrl.Result{}, fmt.Errorf("failed to remove HCP finalizer: %w", err)
719+
}
720+
721+
return ctrl.Result{}, nil
722+
}
723+
568724
func hasAWSConfig(platform *hyperv1.PlatformSpec) bool {
569725
return platform.Type == hyperv1.AWSPlatform && platform.AWS != nil && platform.AWS.CloudProviderConfig != nil &&
570726
platform.AWS.CloudProviderConfig.Subnet != nil && platform.AWS.CloudProviderConfig.Subnet.ID != nil

0 commit comments

Comments
 (0)