From 2212107f7617eb2746b846a347ecb9694bac8f8c Mon Sep 17 00:00:00 2001 From: Patrick Dillon Date: Mon, 17 Feb 2025 13:14:59 -0500 Subject: [PATCH 1/8] Add Azure Stack as a valid environment Adds AzureStack as a valid cloud environment. The value "AzureStackCloud" corresponds to the name expected by the azure autorest package, which will expect an environment configuration file to be found at a path specified by the environment variable AZURE_ENVIRONMENT_FILEPATH. See: https://github.com/Azure/go-autorest/blob/main/autorest/azure/environments.go#L300-L302 --- api/v1beta1/types_class.go | 2 ++ azure/defaults.go | 2 ++ .../bases/infrastructure.cluster.x-k8s.io_azureclusters.yaml | 1 + .../infrastructure.cluster.x-k8s.io_azureclustertemplates.yaml | 1 + 4 files changed, 6 insertions(+) diff --git a/api/v1beta1/types_class.go b/api/v1beta1/types_class.go index 452eab517f8..d6b853dba20 100644 --- a/api/v1beta1/types_class.go +++ b/api/v1beta1/types_class.go @@ -48,6 +48,7 @@ type AzureClusterClassSpec struct { // - GermanCloud: "AzureGermanCloud" // - PublicCloud: "AzurePublicCloud" // - USGovernmentCloud: "AzureUSGovernmentCloud" + // - StackCloud: "AzureStackCloud" // // Note that values other than the default must also be accompanied by corresponding changes to the // aso-controller-settings Secret to configure ASO to refer to the non-Public cloud. ASO currently does @@ -186,6 +187,7 @@ type AzureManagedControlPlaneClassSpec struct { // - PublicCloud: "AzurePublicCloud" // - USGovernmentCloud: "AzureUSGovernmentCloud" // + // // Note that values other than the default must also be accompanied by corresponding changes to the // aso-controller-settings Secret to configure ASO to refer to the non-Public cloud. ASO currently does // not support referring to multiple different clouds in a single installation.
The following fields must diff --git a/azure/defaults.go b/azure/defaults.go index 02e5508fa5c..41a08cb37ea 100644 --- a/azure/defaults.go +++ b/azure/defaults.go @@ -44,6 +44,8 @@ const ( ChinaCloudName = "AzureChinaCloud" // USGovernmentCloudName is the name of the Azure US Government cloud. USGovernmentCloudName = "AzureUSGovernmentCloud" + // StackCloudName is the name for Azure Stack hybrid cloud environments. + StackCloudName = "AzureStackCloud" ) const ( diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_azureclusters.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_azureclusters.yaml index 9c7cdabe38c..27db4c7950f 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_azureclusters.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_azureclusters.yaml @@ -92,6 +92,7 @@ spec: - GermanCloud: "AzureGermanCloud" - PublicCloud: "AzurePublicCloud" - USGovernmentCloud: "AzureUSGovernmentCloud" + - StackCloud: "AzureStackCloud" Note that values other than the default must also be accompanied by corresponding changes to the aso-controller-settings Secret to configure ASO to refer to the non-Public cloud. ASO currently does diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_azureclustertemplates.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_azureclustertemplates.yaml index 2b5acaaec48..aa2614d0a72 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_azureclustertemplates.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_azureclustertemplates.yaml @@ -65,6 +65,7 @@ spec: - GermanCloud: "AzureGermanCloud" - PublicCloud: "AzurePublicCloud" - USGovernmentCloud: "AzureUSGovernmentCloud" + - StackCloud: "AzureStackCloud" Note that values other than the default must also be accompanied by corresponding changes to the aso-controller-settings Secret to configure ASO to refer to the non-Public cloud. 
ASO currently does From 70bea61d4e938c041c481d4c4379ffd4de28f60c Mon Sep 17 00:00:00 2001 From: Patrick Dillon Date: Wed, 14 May 2025 14:38:23 -0400 Subject: [PATCH 2/8] Set ARMClientOptions for Azure Stack Sets ARM Client Options when using the Azure Stack environment. Sets the APIVersion to a hybrid cloud profile to ensure compatibility with hybrid environments. --- azure/defaults.go | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/azure/defaults.go b/azure/defaults.go index 41a08cb37ea..312f018c720 100644 --- a/azure/defaults.go +++ b/azure/defaults.go @@ -27,6 +27,7 @@ import ( "github.com/Azure/azure-sdk-for-go/sdk/azcore/policy" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5" "github.com/Azure/azure-sdk-for-go/sdk/tracing/azotel" + azureautorest "github.com/Azure/go-autorest/autorest/azure" "go.opentelemetry.io/otel" "sigs.k8s.io/cluster-api-provider-azure/util/tele" @@ -111,6 +112,12 @@ const ( CustomHeaderPrefix = "infrastructure.cluster.x-k8s.io/custom-header-" ) +const ( + // StackAPIVersionProfile is the API version profile to set for ARM clients. See: + // https://learn.microsoft.com/en-us/azure-stack/user/azure-stack-profiles-azure-resource-manager-versions?view=azs-2408#overview-of-the-2020-09-01-hybrid-profile + StackAPIVersionProfile = "2020-06-01" +) + var ( // LinuxBootstrapExtensionCommand is the command the VM bootstrap extension will execute to verify Linux nodes bootstrap completes successfully. LinuxBootstrapExtensionCommand = fmt.Sprintf("for i in $(seq 1 %d); do test -f %s && break; if [ $i -eq %d ]; then echo 'Error joining node to cluster: kubeadm init or join failed. 
To debug, check the cloud-init, kubelet, or other bootstrap logs: https://capz.sigs.k8s.io/self-managed/troubleshooting.html#checking-cloud-init-logs-ubuntu'; exit 1; else sleep %d; fi; done", bootstrapExtensionRetries, bootstrapSentinelFile, bootstrapExtensionRetries, bootstrapExtensionSleep) @@ -369,6 +376,21 @@ func ARMClientOptions(azureEnvironment string, extraPolicies ...policy.Policy) ( opts.Cloud = cloud.AzureChina case USGovernmentCloudName: opts.Cloud = cloud.AzureGovernment + case StackCloudName: + cloudEnv, err := azureautorest.EnvironmentFromName(azureEnvironment) + if err != nil { + return nil, fmt.Errorf("unable to get Azure Stack cloud environment: %w", err) + } + opts.APIVersion = StackAPIVersionProfile + opts.Cloud = cloud.Configuration{ + ActiveDirectoryAuthorityHost: cloudEnv.ActiveDirectoryEndpoint, + Services: map[cloud.ServiceName]cloud.ServiceConfiguration{ + cloud.ResourceManager: { + Audience: cloudEnv.TokenAudience, + Endpoint: cloudEnv.ResourceManagerEndpoint, + }, + }, + } case "": // No cloud name provided, so leave at defaults. default: From 6ae830471bcbef0057991ccd1debd47d591fd20c Mon Sep 17 00:00:00 2001 From: Patrick Dillon Date: Mon, 3 Mar 2025 14:55:23 -0500 Subject: [PATCH 3/8] Skip privatedns zones on Azure Stack Azure Stack Hub does not support private dns zones, so skip them. --- azure/scope/cluster.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/azure/scope/cluster.go b/azure/scope/cluster.go index f7f035f2e7b..3a530d7722a 100644 --- a/azure/scope/cluster.go +++ b/azure/scope/cluster.go @@ -559,7 +559,7 @@ func (s *ClusterScope) VNetSpec() azure.ASOResourceSpecGetter[*asonetworkv1api20 // PrivateDNSSpec returns the private dns zone spec. 
func (s *ClusterScope) PrivateDNSSpec() (zoneSpec azure.ResourceSpecGetter, linkSpec, recordSpec []azure.ResourceSpecGetter) { - if s.IsAPIServerPrivate() { + if s.IsAPIServerPrivate() && !s.IsAzureStack() { resourceGroup := s.ResourceGroup() if s.AzureCluster.Spec.NetworkSpec.PrivateDNSZoneResourceGroup != "" { resourceGroup = s.AzureCluster.Spec.NetworkSpec.PrivateDNSZoneResourceGroup @@ -1251,3 +1251,8 @@ func (s *ClusterScope) getLastAppliedSecurityRules(nsgName string) map[string]in } return lastAppliedSecurityRules } + +// IsAzureStack returns true if the cluster is running on Azure Stack. +func (s *ClusterScope) IsAzureStack() bool { + return strings.EqualFold(s.Environment.Name, azure.StackCloudName) +} From 435df16c188c67d77637cc8ad013985200325cbe Mon Sep 17 00:00:00 2001 From: Patrick Dillon Date: Mon, 24 Mar 2025 12:18:50 -0400 Subject: [PATCH 4/8] AzureStack: handle missing avail set sku cache The Resource SKU API for availability sets may not be available in an Azure Stack environment. The cache is used to determine the fault domain count. For Azure Stack, we can default to 2. Future work could potentially set this programmatically or expose the fault domain count in the API. --- azure/scope/machine.go | 16 ++++---- azure/services/availabilitysets/spec.go | 53 ++++++++++++++++--------- 2 files changed, 44 insertions(+), 25 deletions(-) diff --git a/azure/scope/machine.go b/azure/scope/machine.go index 67635826d8b..f7f674b4589 100644 --- a/azure/scope/machine.go +++ b/azure/scope/machine.go @@ -150,7 +150,8 @@ func (m *MachineScope) InitMachineCache(ctx context.Context) error { } m.cache.availabilitySetSKU, err = skuCache.Get(ctx, string(armcompute.AvailabilitySetSKUTypesAligned), resourceskus.AvailabilitySets) - if err != nil { + // Resource SKU API for availability sets may not be available in Azure Stack environments.
+ if err != nil && !strings.EqualFold(m.CloudEnvironment(), azure.StackCloudName) { return errors.Wrapf(err, "failed to get availability set SKU %s in compute api", string(armcompute.AvailabilitySetSKUTypesAligned)) } } @@ -497,12 +498,13 @@ func (m *MachineScope) AvailabilitySetSpec() azure.ResourceSpecGetter { } spec := &availabilitysets.AvailabilitySetSpec{ - Name: availabilitySetName, - ResourceGroup: m.NodeResourceGroup(), - ClusterName: m.ClusterName(), - Location: m.Location(), - SKU: nil, - AdditionalTags: m.AdditionalTags(), + Name: availabilitySetName, + ResourceGroup: m.NodeResourceGroup(), + ClusterName: m.ClusterName(), + Location: m.Location(), + CloudEnvironment: m.CloudEnvironment(), + SKU: nil, + AdditionalTags: m.AdditionalTags(), } if m.cache != nil { diff --git a/azure/services/availabilitysets/spec.go b/azure/services/availabilitysets/spec.go index ea522da07ee..4d411edf184 100644 --- a/azure/services/availabilitysets/spec.go +++ b/azure/services/availabilitysets/spec.go @@ -19,24 +19,27 @@ package availabilitysets import ( "context" "strconv" + "strings" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5" "github.com/pkg/errors" "k8s.io/utils/ptr" infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1" + "sigs.k8s.io/cluster-api-provider-azure/azure" "sigs.k8s.io/cluster-api-provider-azure/azure/converters" "sigs.k8s.io/cluster-api-provider-azure/azure/services/resourceskus" ) // AvailabilitySetSpec defines the specification for an availability set. type AvailabilitySetSpec struct { - Name string - ResourceGroup string - ClusterName string - Location string - SKU *resourceskus.SKU - AdditionalTags infrav1.Tags + Name string + ResourceGroup string + ClusterName string + Location string + CloudEnvironment string + SKU *resourceskus.SKU + AdditionalTags infrav1.Tags } // ResourceName returns the name of the availability set. 
@@ -64,20 +67,10 @@ func (s *AvailabilitySetSpec) Parameters(_ context.Context, existing interface{} return nil, nil } - if s.SKU == nil { - return nil, errors.New("unable to get required availability set SKU from machine cache") - } - - var faultDomainCount *int32 - faultDomainCountStr, ok := s.SKU.GetCapability(resourceskus.MaximumPlatformFaultDomainCount) - if !ok { - return nil, errors.Errorf("unable to get required availability set SKU capability %s", resourceskus.MaximumPlatformFaultDomainCount) - } - count, err := strconv.ParseInt(faultDomainCountStr, 10, 32) + faultDomainCount, err := getFaultDomainCount(s.SKU, s.CloudEnvironment) if err != nil { - return nil, errors.Wrapf(err, "unable to parse availability set fault domain count") + return nil, err } - faultDomainCount = ptr.To[int32](int32(count)) asParams := armcompute.AvailabilitySet{ SKU: &armcompute.SKU{ @@ -98,3 +91,27 @@ func (s *AvailabilitySetSpec) Parameters(_ context.Context, existing interface{} return asParams, nil } + +func getFaultDomainCount(sku *resourceskus.SKU, cloudEnvironment string) (*int32, error) { + // Azure Stack environments may not implement the resource SKU API + // for availability sets. Use a default value instead. 
+ if strings.EqualFold(cloudEnvironment, azure.StackCloudName) { + return ptr.To(int32(2)), nil + } + + if sku == nil { + return nil, errors.New("unable to get required availability set SKU from machine cache") + } + + var faultDomainCount *int32 + faultDomainCountStr, ok := sku.GetCapability(resourceskus.MaximumPlatformFaultDomainCount) + if !ok { + return nil, errors.Errorf("unable to get required availability set SKU capability %s", resourceskus.MaximumPlatformFaultDomainCount) + } + count, err := strconv.ParseInt(faultDomainCountStr, 10, 32) + if err != nil { + return nil, errors.Wrapf(err, "unable to parse availability set fault domain count") + } + faultDomainCount = ptr.To(int32(count)) + return faultDomainCount, nil +} From 66bd450d28800169a6d0250a3f0e60045bfb2121 Mon Sep 17 00:00:00 2001 From: Patrick Dillon Date: Thu, 27 Mar 2025 14:14:59 -0400 Subject: [PATCH 5/8] Skip tag reconciliation in Azure Stack The tag service using the V2 SDK is not available in azure stack. Skip tag reconciliation in Azure Stack environments. --- controllers/azuremachine_reconciler.go | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/controllers/azuremachine_reconciler.go b/controllers/azuremachine_reconciler.go index 544ccc02694..294f45922e3 100644 --- a/controllers/azuremachine_reconciler.go +++ b/controllers/azuremachine_reconciler.go @@ -18,6 +18,7 @@ package controllers import ( "context" + "strings" "github.com/pkg/errors" @@ -101,10 +102,19 @@ func newAzureMachineService(machineScope *scope.MachineScope) (*azureMachineServ virtualmachinesSvc, roleAssignmentsSvc, vmextensionsSvc, - tagsSvc, }, skuCache: cache, } + + // The tags service fails in Azure Stack because the current SDK implementation + // will throw an error when trying to get tags at scope on Azure Stack environments. + // This means tags can only be provided on Azure Stack machines at creation time + // and will not be reconciled day-2. 
Once the get-tags-at-scope SDK issue is + // addressed, this change can be reverted to add tagsSvc in all environments. + if !strings.EqualFold(machineScope.CloudEnvironment(), azure.StackCloudName) { + ams.services = append(ams.services, tagsSvc) + } + ams.Reconcile = ams.reconcile ams.Pause = ams.pause ams.Delete = ams.delete From ba8bb880a304bf12afd2bea97b6d03762a4e81e4 Mon Sep 17 00:00:00 2001 From: Patrick Dillon Date: Thu, 27 Mar 2025 14:35:13 -0400 Subject: [PATCH 6/8] Change Disk Client API Version for Azure Stack The standard 2020-06-01 API Version is not supported for disk operations in Azure Stack, so change to the compatible 2018-06-01 profile. --- azure/defaults.go | 4 ++++ azure/services/disks/client.go | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/azure/defaults.go b/azure/defaults.go index 312f018c720..f17ae2d4394 100644 --- a/azure/defaults.go +++ b/azure/defaults.go @@ -116,6 +116,10 @@ const ( // StackAPIVersionProfile is the API version profile to set for ARM clients. See: // https://learn.microsoft.com/en-us/azure-stack/user/azure-stack-profiles-azure-resource-manager-versions?view=azs-2408#overview-of-the-2020-09-01-hybrid-profile StackAPIVersionProfile = "2020-06-01" + + // StackDiskAPIVersionProfile is the API Version to set for the disk client. + // API Version Profile "2020-06-01" is not supported for disks. + StackDiskAPIVersionProfile = "2018-06-01" ) var ( diff --git a/azure/services/disks/client.go b/azure/services/disks/client.go index 58cdb4345fc..edfa947581c 100644 --- a/azure/services/disks/client.go +++ b/azure/services/disks/client.go @@ -18,6 +18,7 @@ package disks import ( "context" + "strings" "time" "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" @@ -38,6 +39,9 @@ type azureClient struct { // newClient creates a new disks client from an authorizer. 
func newClient(auth azure.Authorizer, apiCallTimeout time.Duration) (*azureClient, error) { opts, err := azure.ARMClientOptions(auth.CloudEnvironment()) + if err == nil && strings.EqualFold(auth.CloudEnvironment(), azure.StackCloudName) { + opts.APIVersion = azure.StackDiskAPIVersionProfile + } if err != nil { return nil, errors.Wrap(err, "failed to create disks client options") } From 32e327a26e5b25020d0e00f761e92f3222853b5c Mon Sep 17 00:00:00 2001 From: Patrick Dillon Date: Thu, 27 Mar 2025 17:17:42 -0400 Subject: [PATCH 7/8] Retry VM Delete without Force if Bad Request Azure Stack returns a 400 error when trying to delete a VM with the force flag and the error message suggests retrying without the flag. --- azure/errors.go | 6 ++++++ azure/services/virtualmachines/client.go | 11 +++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/azure/errors.go b/azure/errors.go index 0d719e80037..8e77269acd4 100644 --- a/azure/errors.go +++ b/azure/errors.go @@ -34,6 +34,12 @@ func ResourceNotFound(err error) bool { return errors.As(err, &rerr) && rerr.StatusCode == http.StatusNotFound } +// BadRequest parses an error to check if its status code is Bad Request (400). +func BadRequest(err error) bool { + var rerr *azcore.ResponseError + return errors.As(err, &rerr) && rerr.StatusCode == http.StatusBadRequest +} + // VMDeletedError is returned when a virtual machine is deleted outside of capz. type VMDeletedError struct { ProviderID string diff --git a/azure/services/virtualmachines/client.go b/azure/services/virtualmachines/client.go index 1e2bbea08d4..119f546b33f 100644 --- a/azure/services/virtualmachines/client.go +++ b/azure/services/virtualmachines/client.go @@ -109,14 +109,21 @@ func (ac *AzureClient) CreateOrUpdateAsync(ctx context.Context, spec azure.Resou // request to Azure and if accepted without error, the func will return a Poller which can be used to track the ongoing // progress of the operation.
func (ac *AzureClient) DeleteAsync(ctx context.Context, spec azure.ResourceSpecGetter, resumeToken string) (poller *runtime.Poller[armcompute.VirtualMachinesClientDeleteResponse], err error) { - ctx, _, done := tele.StartSpanWithLogger(ctx, "virtualmachines.AzureClient.Delete") + ctx, log, done := tele.StartSpanWithLogger(ctx, "virtualmachines.AzureClient.Delete") defer done() forceDelete := ptr.To(true) opts := &armcompute.VirtualMachinesClientBeginDeleteOptions{ResumeToken: resumeToken, ForceDeletion: forceDelete} poller, err = ac.virtualmachines.BeginDelete(ctx, spec.ResourceGroupName(), spec.ResourceName(), opts) if err != nil { - return nil, err + if azure.BadRequest(err) { + log.Info("Failed to Begin VM Delete with Force Deletion, retrying without the force flag") + opts.ForceDeletion = ptr.To(false) + poller, err = ac.virtualmachines.BeginDelete(ctx, spec.ResourceGroupName(), spec.ResourceName(), opts) + } + if err != nil { + return nil, err + } } ctx, cancel := context.WithTimeout(ctx, ac.apiCallTimeout) From 170b6f96991d90a06d374f4fc5b11a1fba2d7f68 Mon Sep 17 00:00:00 2001 From: Patrick Dillon Date: Thu, 8 May 2025 14:20:44 -0400 Subject: [PATCH 8/8] Azure Stack: assume all IPs are managed Trying to get tags at scope in Azure Stack results in an error, so just assume that IPs are managed. --- azure/services/publicips/publicips.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/azure/services/publicips/publicips.go b/azure/services/publicips/publicips.go index 0854f89f583..be7e456a7ad 100644 --- a/azure/services/publicips/publicips.go +++ b/azure/services/publicips/publicips.go @@ -18,6 +18,7 @@ package publicips import ( "context" + "strings" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v4" "github.com/pkg/errors" @@ -151,6 +152,12 @@ func (s *Service) Delete(ctx context.Context) error { // isIPManaged returns true if the IP has an owned tag with the cluster name as value, // meaning that the IP's lifecycle is managed. 
func (s *Service) isIPManaged(ctx context.Context, spec azure.ResourceSpecGetter) (bool, error) { + if strings.EqualFold(s.Scope.CloudEnvironment(), azure.StackCloudName) { + // Azure Stack does not yet support getting tags with scope, + // so assume IPs are managed. + return true, nil + } + scope := azure.PublicIPID(s.Scope.SubscriptionID(), spec.ResourceGroupName(), spec.ResourceName()) result, err := s.TagsGetter.GetAtScope(ctx, scope) if err != nil {