@@ -46,8 +46,8 @@ import (
4646 "sigs.k8s.io/controller-runtime/pkg/client"
4747 "sigs.k8s.io/controller-runtime/pkg/controller"
4848 "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
49- "sigs.k8s.io/controller-runtime/pkg/event"
5049 "sigs.k8s.io/controller-runtime/pkg/handler"
50+ "sigs.k8s.io/controller-runtime/pkg/log"
5151 "sigs.k8s.io/controller-runtime/pkg/manager"
5252 "sigs.k8s.io/controller-runtime/pkg/reconcile"
5353 "sigs.k8s.io/controller-runtime/pkg/source"
@@ -219,6 +219,16 @@ const (
219219 endpointServiceDeletionRequeueDuration = 5 * time .Second
220220 hypershiftLocalZone = "hypershift.local"
221221 routerDomain = "apps"
222+
223+ // hcpAWSPrivateLinkFinalizerName is a finalizer placed on the HostedControlPlane by
224+ // the AWSEndpointService reconciler. It blocks HCP deletion until all AWS PrivateLink
225+ // resources (VPC endpoints, security groups, DNS records) are cleaned up.
226+ //
227+ // Without this finalizer, the AWSEndpointService CR's own finalizer runs during CR
228+ // deletion, but the controller may not be able to construct valid AWS clients if the
229+ // HCP has already been deleted — particularly for SharedVPC clusters where cross-account
230+ // role ARNs are sourced from the HCP spec. This would orphan AWS resources.
231+ hcpAWSPrivateLinkFinalizerName = "hypershift.openshift.io/aws-private-link-endpoint-cleanup"
222232)
223233
224234// AWSEndpointServiceReconciler watches AWSEndpointService resources and reconciles
@@ -371,7 +381,9 @@ func (r *AWSEndpointServiceReconciler) SetupWithManager(mgr ctrl.Manager) error
371381 RateLimiter : workqueue .NewTypedItemExponentialFailureRateLimiter [reconcile.Request ](3 * time .Second , 30 * time .Second ),
372382 MaxConcurrentReconciles : 10 ,
373383 }).
374- Watches (& hyperv1.HostedControlPlane {}, handler.Funcs {UpdateFunc : r .enqueueOnAccessChange (mgr )}).
384+ Watches (& hyperv1.HostedControlPlane {}, handler .EnqueueRequestsFromMapFunc (
385+ r .mapHCPToAWSEndpointService (),
386+ )).
375387 Build (r )
376388 if err != nil {
377389 return fmt .Errorf ("failed setting up with a controller manager: %w" , err )
@@ -381,30 +393,42 @@ func (r *AWSEndpointServiceReconciler) SetupWithManager(mgr ctrl.Manager) error
381393 return nil
382394}
383395
384- func (r * AWSEndpointServiceReconciler ) enqueueOnAccessChange (mgr ctrl.Manager ) func (context.Context , event.UpdateEvent , workqueue.TypedRateLimitingInterface [reconcile.Request ]) {
385- return func (ctx context.Context , e event.UpdateEvent , q workqueue.TypedRateLimitingInterface [reconcile.Request ]) {
386- logger := mgr .GetLogger ()
387- newHCP , isOk := e .ObjectNew .(* hyperv1.HostedControlPlane )
388- if ! isOk {
389- logger .Info ("WARNING: enqueueOnAccessChange: new resource is not of type HostedControlPlane" )
390- return
396+ // mapHCPToAWSEndpointService maps HostedControlPlane events to the AWSEndpointService CRs
397+ // in the same namespace. This uses EnqueueRequestsFromMapFunc (instead of handler.Funcs)
398+ // so that all event types — Create, Update, Delete — trigger the mapping. This is critical
399+ // for controller restarts: if the CPO restarts while an HCP is being deleted (DeletionTimestamp
400+ // already set), the informer cache sync generates a Create event, not an Update. Using
401+ // handler.Funcs{UpdateFunc: ...} would miss this, defeating the purpose of the HCP finalizer.
402+ //
403+ // The function only enqueues CRs when the HCP has our finalizer, avoiding unnecessary
404+ // reconciliations for unrelated HCPs. EndpointAccess changes (previously detected via
405+ // old/new comparison) are picked up by the reconciler's periodic 5-minute requeue.
406+ func (r * AWSEndpointServiceReconciler ) mapHCPToAWSEndpointService () handler.MapFunc {
407+ return func (ctx context.Context , obj client.Object ) []reconcile.Request {
408+ hcp , ok := obj .(* hyperv1.HostedControlPlane )
409+ if ! ok {
410+ return nil
411+ }
412+
413+ // Only trigger reconciliation when the HCP has our finalizer; this avoids
414+ // unnecessary reconciliations for HCPs that are not related to AWS PrivateLink.
415+ if ! controllerutil .ContainsFinalizer (hcp , hcpAWSPrivateLinkFinalizerName ) {
416+ return nil
391417 }
392- oldHCP , isOk := e .ObjectOld .(* hyperv1.HostedControlPlane )
393- if ! isOk {
394- logger .Info ("WARNING: enqueueOnAccessChange: old resource is not of type HostedControlPlane" )
395- return
418+
419+ awsEndpointServiceList := & hyperv1.AWSEndpointServiceList {}
420+ if err := r .List (ctx , awsEndpointServiceList , client .InNamespace (hcp .Namespace )); err != nil {
421+ log .FromContext (ctx ).Error (err , "mapHCPToAWSEndpointService: cannot list AWSEndpointService resources" , "namespace" , hcp .Namespace )
422+ return nil
396423 }
397- // Only enqueue awsendpointservices when there is a change in the endpointaccess value, otherwise ignore changes
398- if newHCP .Spec .Platform .AWS != nil && oldHCP .Spec .Platform .AWS != nil && newHCP .Spec .Platform .AWS .EndpointAccess != oldHCP .Spec .Platform .AWS .EndpointAccess {
399- awsEndpointServiceList := & hyperv1.AWSEndpointServiceList {}
400- if err := r .List (context .Background (), awsEndpointServiceList , client .InNamespace (newHCP .Namespace )); err != nil {
401- logger .Error (err , "enqueueOnAccessChange: cannot list awsendpointservices" )
402- return
403- }
404- for i := range awsEndpointServiceList .Items {
405- q .Add (reconcile.Request {NamespacedName : client .ObjectKeyFromObject (& awsEndpointServiceList .Items [i ])})
406- }
424+
425+ var requests []reconcile.Request
426+ for i := range awsEndpointServiceList .Items {
427+ requests = append (requests , reconcile.Request {
428+ NamespacedName : client .ObjectKeyFromObject (& awsEndpointServiceList .Items [i ]),
429+ })
407430 }
431+ return requests
408432 }
409433}
410434
@@ -443,13 +467,12 @@ func (r *AWSEndpointServiceReconciler) Reconcile(ctx context.Context, req ctrl.R
443467 // the non-deletion path. If the HCP still exists, initialize from it so that
444468 // getClients can succeed and deletion can proceed.
445469 //
446- // Known issue (SharedVPC): when the HCP is already deleted, the SharedVPC role
447- // ARNs (needed for cross-account EC2/Route53 access) are lost. Initialization
448- // cannot happen, getClients will fail, and the finalizer will be preserved until
449- // the hypershift-operator force-removes it after the grace period — orphaning
450- // AWS resources in the shared VPC account. A proper fix requires persisting the
451- // SharedVPC role ARNs in the AWSEndpointService status. See
452- // TestReconcileDeletionSharedVPC for details.
470+ // The HCP finalizer (hcpAWSPrivateLinkFinalizerName), when present, blocks HCP
471+ // deletion so the HCP remains available during this cleanup. Note: the finalizer
472+ // is only added after a successful normal reconciliation. If the controller has
473+ // not yet reconciled (e.g., controller was down since cluster creation), the HCP
474+ // may be deleted before the finalizer is placed, and this best-effort initialization
475+ // is the only protection. See reconcileHCPDeletion for the HCP-deletion cleanup path.
453476 hcpList := & hyperv1.HostedControlPlaneList {}
454477 if err := r .List (ctx , hcpList , & client.ListOptions {Namespace : req .Namespace }); err == nil && len (hcpList .Items ) == 1 {
455478 r .awsClientBuilder .initializeWithHCP (log , & hcpList .Items [0 ])
@@ -475,7 +498,16 @@ func (r *AWSEndpointServiceReconciler) Reconcile(ctx context.Context, req ctrl.R
475498 return ctrl.Result {}, nil
476499 }
477500
478- // Ensure the awsEndpointService has a finalizer for cleanup
501+ serviceName := awsEndpointService .Status .EndpointServiceName
502+ if serviceName == "" {
503+ // Service Endpoint is not yet set, wait for hypershift-operator to populate
504+ // Likely observing our own Create
505+ return ctrl.Result {}, nil
506+ }
507+
508+ // Ensure the awsEndpointService has a finalizer for cleanup.
509+ // This is placed after the serviceName check so that CRs not yet populated
510+ // by hypershift-operator don't get a finalizer that would block HCP deletion.
479511 if ! controllerutil .ContainsFinalizer (awsEndpointService , finalizer ) {
480512 controllerutil .AddFinalizer (awsEndpointService , finalizer )
481513 if err := r .Update (ctx , awsEndpointService ); err != nil {
@@ -486,26 +518,18 @@ func (r *AWSEndpointServiceReconciler) Reconcile(ctx context.Context, req ctrl.R
486518 }
487519 }
488520
489- serviceName := awsEndpointService .Status .EndpointServiceName
490- if serviceName == "" {
491- // Service Endpoint is not yet set, wait for hypershift-operator to populate
492- // Likely observing our own Create
493- return ctrl.Result {}, nil
494- }
495-
496- // Fetch the HostedControlPlane
497- hcpList := & hyperv1.HostedControlPlaneList {}
498- if err := r .List (ctx , hcpList , & client.ListOptions {Namespace : req .Namespace }); err != nil {
499- return ctrl.Result {}, fmt .Errorf ("failed to get resource: %w" , err )
521+ hcp , err := r .getHostedControlPlane (ctx , req .Namespace )
522+ if err != nil {
523+ return ctrl.Result {}, err
500524 }
501- if len (hcpList .Items ) == 0 {
502- // Return early if HostedControlPlane is deleted
525+ if hcp == nil {
503526 return ctrl.Result {}, nil
504527 }
505- if len (hcpList .Items ) > 1 {
506- return ctrl.Result {}, fmt .Errorf ("unexpected number of HostedControlPlanes in namespace, expected: 1, actual: %d" , len (hcpList .Items ))
528+
529+ // Handle HCP deletion: clean up AWS resources while HCP credentials are still valid.
530+ if ! hcp .DeletionTimestamp .IsZero () {
531+ return r .reconcileHCPDeletion (ctx , awsEndpointService , hcp , log )
507532 }
508- hcp := & hcpList .Items [0 ]
509533
510534 if isPaused , duration := util .IsReconciliationPaused (log , hcp .Spec .PausedUntil ); isPaused {
511535 log .Info ("Reconciliation paused" , "pausedUntil" , * hcp .Spec .PausedUntil )
@@ -518,6 +542,13 @@ func (r *AWSEndpointServiceReconciler) Reconcile(ctx context.Context, req ctrl.R
518542 return ctrl.Result {}, err
519543 }
520544
545+ // Add HCP finalizer to block HCP deletion until AWS PrivateLink cleanup is done.
546+ // This is done after initializing clients, ensuring the reconciler is operational
547+ // and there are actually resources to protect.
548+ if result , err := r .ensureHCPFinalizer (ctx , hcp , log ); err != nil || ! result .IsZero () {
549+ return result , err
550+ }
551+
521552 // Reconcile the AWSEndpointService
522553 oldStatus := awsEndpointService .Status .DeepCopy ()
523554 if err := r .reconcileAWSEndpointService (ctx , awsEndpointService , hcp , ec2Client , route53Client ); err != nil {
@@ -565,6 +596,131 @@ func isAWSThrottleError(err error) bool {
565596 return false
566597}
567598
599+ // ensureHCPFinalizer adds the HCP finalizer if not already present.
600+ // Uses optimistic locking via MergeFromWithOptimisticLock to safely update the HCP.
601+ func (r * AWSEndpointServiceReconciler ) ensureHCPFinalizer (ctx context.Context , hcp * hyperv1.HostedControlPlane , log logr.Logger ) (ctrl.Result , error ) {
602+ if controllerutil .ContainsFinalizer (hcp , hcpAWSPrivateLinkFinalizerName ) {
603+ return ctrl.Result {}, nil
604+ }
605+
606+ log .Info ("Adding HCP finalizer for AWS PrivateLink cleanup" )
607+ originalHCP := hcp .DeepCopy ()
608+ controllerutil .AddFinalizer (hcp , hcpAWSPrivateLinkFinalizerName )
609+ if err := r .Patch (ctx , hcp , client .MergeFromWithOptions (originalHCP , client.MergeFromWithOptimisticLock {})); err != nil {
610+ if apierrors .IsConflict (err ) {
611+ // Use RequeueAfter to avoid a tight retry loop when multiple AWSEndpointService
612+ // reconcilers concurrently try to patch the same HCP.
613+ return ctrl.Result {RequeueAfter : time .Second }, nil
614+ }
615+ return ctrl.Result {}, fmt .Errorf ("failed to add HCP finalizer: %w" , err )
616+ }
617+ return ctrl.Result {}, nil
618+ }
619+
620+ func (r * AWSEndpointServiceReconciler ) getHostedControlPlane (ctx context.Context , namespace string ) (* hyperv1.HostedControlPlane , error ) {
621+ hcpList := & hyperv1.HostedControlPlaneList {}
622+ if err := r .List (ctx , hcpList , & client.ListOptions {Namespace : namespace }); err != nil {
623+ return nil , fmt .Errorf ("failed to get resource: %w" , err )
624+ }
625+ if len (hcpList .Items ) == 0 {
626+ return nil , nil
627+ }
628+ if len (hcpList .Items ) > 1 {
629+ return nil , fmt .Errorf ("unexpected number of HostedControlPlanes in namespace, expected: 1, actual: %d" , len (hcpList .Items ))
630+ }
631+ return & hcpList .Items [0 ], nil
632+ }
633+
634+ // reconcileHCPDeletion handles HCP deletion by cleaning up AWS PrivateLink resources
635+ // and removing the HCP finalizer. This ensures AWS credentials remain valid during
636+ // the cleanup process, which is critical for SharedVPC clusters where the cross-account
637+ // role ARNs are sourced from the HCP spec.
638+ //
639+ // Convergent multi-CR coordination:
640+ // With MaxConcurrentReconciles: 10 and the HCP watch enqueuing ALL CRs on HCP deletion,
641+ // multiple reconcilers run this function concurrently — one per AWSEndpointService CR.
642+ // Each reconciler cleans up its own CR's AWS resources, then checks whether all CRs in
643+ // the namespace are done. Only the last reconciler to finish (the one that finds no pending
644+ // CRs) removes the HCP finalizer. Earlier reconcilers see pending CRs and return
645+ // RequeueAfter to retry; when they re-enter, they find the HCP finalizer already removed
646+ // and return early at the top of this function. This convergent pattern produces a small
647+ // number of no-op requeues but is correct and self-healing.
648+ func (r * AWSEndpointServiceReconciler ) reconcileHCPDeletion (ctx context.Context , awsEndpointService * hyperv1.AWSEndpointService , hcp * hyperv1.HostedControlPlane , log logr.Logger ) (ctrl.Result , error ) {
649+ if ! controllerutil .ContainsFinalizer (hcp , hcpAWSPrivateLinkFinalizerName ) {
650+ return ctrl.Result {}, nil
651+ }
652+
653+ // If the AWSEndpointService itself is being deleted, let the existing CR deletion
654+ // path (which returns early at the top of Reconcile) handle cleanup. This prevents
655+ // the two deletion paths from racing under MaxConcurrentReconciles > 1, where the
656+ // CR deletion path and HCP deletion path could both try to clean up the same CR.
657+ if ! awsEndpointService .DeletionTimestamp .IsZero () {
658+ return ctrl.Result {}, nil
659+ }
660+
661+ log .Info ("HCP is being deleted, cleaning up AWS PrivateLink resources before removing HCP finalizer" )
662+
663+ // Initialize AWS clients from the HCP — guaranteed to be available because
664+ // our finalizer blocks HCP deletion.
665+ r .awsClientBuilder .initializeWithHCP (log , hcp )
666+ ec2Client , route53Client , err := r .awsClientBuilder .getClients (ctx )
667+ if err != nil {
668+ return ctrl.Result {}, fmt .Errorf ("failed to get AWS clients during HCP deletion cleanup: %w" , err )
669+ }
670+
671+ // Clean up this AWSEndpointService's AWS resources.
672+ if controllerutil .ContainsFinalizer (awsEndpointService , finalizer ) {
673+ completed , err := r .delete (ctx , awsEndpointService , ec2Client , route53Client )
674+ if err != nil {
675+ return ctrl.Result {}, fmt .Errorf ("failed to clean up AWS resources during HCP deletion: %w" , err )
676+ }
677+ if ! completed {
678+ return ctrl.Result {RequeueAfter : endpointServiceDeletionRequeueDuration }, nil
679+ }
680+
681+ originalAES := awsEndpointService .DeepCopy ()
682+ controllerutil .RemoveFinalizer (awsEndpointService , finalizer )
683+ if err := r .Patch (ctx , awsEndpointService , client .MergeFromWithOptions (originalAES , client.MergeFromWithOptimisticLock {})); err != nil {
684+ if apierrors .IsConflict (err ) {
685+ return ctrl.Result {RequeueAfter : time .Second }, nil
686+ }
687+ return ctrl.Result {}, fmt .Errorf ("failed to remove AWSEndpointService finalizer during HCP deletion: %w" , err )
688+ }
689+ log .Info ("Removed AWSEndpointService finalizer after AWS resource cleanup" )
690+ }
691+
692+ // Check if all AWSEndpointService CRs in this namespace have been cleaned up.
693+ // Only remove the HCP finalizer once all CRs no longer have our CR finalizer,
694+ // meaning all AWS resources have been cleaned up.
695+ awsEndpointServiceList := & hyperv1.AWSEndpointServiceList {}
696+ if err := r .List (ctx , awsEndpointServiceList , client .InNamespace (hcp .Namespace )); err != nil {
697+ return ctrl.Result {}, fmt .Errorf ("failed to list AWSEndpointService resources: %w" , err )
698+ }
699+ for i := range awsEndpointServiceList .Items {
700+ if controllerutil .ContainsFinalizer (& awsEndpointServiceList .Items [i ], finalizer ) {
701+ log .Info ("Other AWSEndpointService resources still have finalizers, keeping HCP finalizer" ,
702+ "pendingResource" , awsEndpointServiceList .Items [i ].Name )
703+ return ctrl.Result {RequeueAfter : endpointServiceDeletionRequeueDuration }, nil
704+ }
705+ }
706+
707+ // All AWSEndpointService resources are cleaned up — remove the HCP finalizer
708+ // to unblock HCP deletion.
709+ log .Info ("All AWS PrivateLink resources cleaned up, removing HCP finalizer" )
710+ originalHCP := hcp .DeepCopy ()
711+ controllerutil .RemoveFinalizer (hcp , hcpAWSPrivateLinkFinalizerName )
712+ if err := r .Patch (ctx , hcp , client .MergeFromWithOptions (originalHCP , client.MergeFromWithOptimisticLock {})); err != nil {
713+ if apierrors .IsConflict (err ) {
714+ // Use RequeueAfter to avoid a tight retry loop when multiple AWSEndpointService
715+ // reconcilers concurrently try to patch the same HCP.
716+ return ctrl.Result {RequeueAfter : time .Second }, nil
717+ }
718+ return ctrl.Result {}, fmt .Errorf ("failed to remove HCP finalizer: %w" , err )
719+ }
720+
721+ return ctrl.Result {}, nil
722+ }
723+
568724func hasAWSConfig (platform * hyperv1.PlatformSpec ) bool {
569725 return platform .Type == hyperv1 .AWSPlatform && platform .AWS != nil && platform .AWS .CloudProviderConfig != nil &&
570726 platform .AWS .CloudProviderConfig .Subnet != nil && platform .AWS .CloudProviderConfig .Subnet .ID != nil
0 commit comments