Skip to content

Commit 64ca097

Browse files
authored
fix: undefined instance state on provisioning state failed (#7750)
* fix: undefined instance state on provisioning state failed * test: add unit tests for provisioning state failed + fast delete * test: support both fast/not fast delete on an affected test
1 parent 03b8aeb commit 64ca097

File tree

4 files changed

+348
-22
lines changed

4 files changed

+348
-22
lines changed

cluster-autoscaler/cloudprovider/azure/azure_scale_set.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -809,6 +809,7 @@ func addVMToCache(instances *[]cloudprovider.Instance, id, provisioningState *st
809809

810810
// instanceStatusFromProvisioningStateAndPowerState converts the VM provisioning state to cloudprovider.InstanceStatus
811811
// instanceStatusFromProvisioningStateAndPowerState used by orchestrationMode == compute.Flexible
812+
// Suggestion: reunify this with scaleSet.instanceStatusFromVM()
812813
func instanceStatusFromProvisioningStateAndPowerState(resourceID string, provisioningState *string, powerState string, enableFastDeleteOnFailedProvisioning bool) *cloudprovider.InstanceStatus {
813814
if provisioningState == nil {
814815
return nil
@@ -823,6 +824,8 @@ func instanceStatusFromProvisioningStateAndPowerState(resourceID string, provisi
823824
case provisioningStateCreating:
824825
status.State = cloudprovider.InstanceCreating
825826
case provisioningStateFailed:
827+
status.State = cloudprovider.InstanceRunning
828+
826829
if enableFastDeleteOnFailedProvisioning {
827830
// Provisioning can fail both during instance creation or after the instance is running.
828831
// Per https://learn.microsoft.com/en-us/azure/virtual-machines/states-billing#provisioning-states,

cluster-autoscaler/cloudprovider/azure/azure_scale_set_instance_cache.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ func (scaleSet *ScaleSet) setInstanceStatusByProviderID(providerID string, statu
198198
}
199199

200200
// instanceStatusFromVM converts the VM provisioning state to cloudprovider.InstanceStatus.
201+
// Suggestion: reunify this with instanceStatusFromProvisioningStateAndPowerState() in azure_scale_set.go
201202
func (scaleSet *ScaleSet) instanceStatusFromVM(vm *compute.VirtualMachineScaleSetVM) *cloudprovider.InstanceStatus {
202203
// Prefer the proactive cache view of the instance state if we aren't in a terminal state
203204
// This is because the power state may be taking longer to update and we don't want
@@ -224,6 +225,8 @@ func (scaleSet *ScaleSet) instanceStatusFromVM(vm *compute.VirtualMachineScaleSe
224225
case string(compute.GalleryProvisioningStateCreating):
225226
status.State = cloudprovider.InstanceCreating
226227
case string(compute.GalleryProvisioningStateFailed):
228+
status.State = cloudprovider.InstanceRunning
229+
227230
klog.V(3).Infof("VM %s reports failed provisioning state with power state: %s, eligible for fast delete: %s", to.String(vm.ID), powerState, strconv.FormatBool(scaleSet.enableFastDeleteOnFailedProvisioning))
228231
if scaleSet.enableFastDeleteOnFailedProvisioning {
229232
// Provisioning can fail both during instance creation or after the instance is running.

cluster-autoscaler/cloudprovider/azure/azure_scale_set_instance_cache_test.go

Lines changed: 125 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -19,26 +19,11 @@ package azure
1919
import (
2020
"fmt"
2121
"testing"
22-
"time"
2322

2423
"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2022-08-01/compute"
2524
"github.com/stretchr/testify/assert"
26-
"go.uber.org/mock/gomock"
2725

2826
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
29-
30-
"sigs.k8s.io/cloud-provider-azure/pkg/azureclients/vmssvmclient/mockvmssvmclient"
31-
)
32-
33-
var (
34-
ctrl *gomock.Controller
35-
currentTime, expiredTime time.Time
36-
provider *AzureCloudProvider
37-
scaleSet *ScaleSet
38-
mockVMSSVMClient *mockvmssvmclient.MockInterface
39-
expectedVMSSVMs []compute.VirtualMachineScaleSetVM
40-
expectedStates []cloudprovider.InstanceState
41-
instanceCache, expectedInstanceCache []cloudprovider.Instance
4227
)
4328

4429
func testGetInstanceCacheWithStates(t *testing.T, vms []compute.VirtualMachineScaleSetVM,
@@ -53,3 +38,128 @@ func testGetInstanceCacheWithStates(t *testing.T, vms []compute.VirtualMachineSc
5338
}
5439
return instanceCacheTest
5540
}
41+
42+
// Suggestion: could populate all combinations, should reunify with TestInstanceStatusFromProvisioningStateAndPowerState
43+
func TestInstanceStatusFromVM(t *testing.T) {
44+
t.Run("fast delete enablement = false", func(t *testing.T) {
45+
provider := newTestProvider(t)
46+
scaleSet := newTestScaleSet(provider.azureManager, "testScaleSet")
47+
48+
t.Run("provisioning state = failed, power state = starting", func(t *testing.T) {
49+
vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateStarting)
50+
51+
status := scaleSet.instanceStatusFromVM(vm)
52+
53+
assert.NotNil(t, status)
54+
assert.Equal(t, cloudprovider.InstanceRunning, status.State)
55+
})
56+
57+
t.Run("provisioning state = failed, power state = running", func(t *testing.T) {
58+
vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateRunning)
59+
60+
status := scaleSet.instanceStatusFromVM(vm)
61+
62+
assert.NotNil(t, status)
63+
assert.Equal(t, cloudprovider.InstanceRunning, status.State)
64+
})
65+
66+
t.Run("provisioning state = failed, power state = stopping", func(t *testing.T) {
67+
vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateStopping)
68+
69+
status := scaleSet.instanceStatusFromVM(vm)
70+
71+
assert.NotNil(t, status)
72+
assert.Equal(t, cloudprovider.InstanceRunning, status.State)
73+
})
74+
75+
t.Run("provisioning state = failed, power state = stopped", func(t *testing.T) {
76+
vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateStopped)
77+
78+
status := scaleSet.instanceStatusFromVM(vm)
79+
80+
assert.NotNil(t, status)
81+
assert.Equal(t, cloudprovider.InstanceRunning, status.State)
82+
})
83+
84+
t.Run("provisioning state = failed, power state = deallocated", func(t *testing.T) {
85+
vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateDeallocated)
86+
87+
status := scaleSet.instanceStatusFromVM(vm)
88+
89+
assert.NotNil(t, status)
90+
assert.Equal(t, cloudprovider.InstanceRunning, status.State)
91+
})
92+
93+
t.Run("provisioning state = failed, power state = unknown", func(t *testing.T) {
94+
vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateUnknown)
95+
96+
status := scaleSet.instanceStatusFromVM(vm)
97+
98+
assert.NotNil(t, status)
99+
assert.Equal(t, cloudprovider.InstanceRunning, status.State)
100+
})
101+
})
102+
103+
t.Run("fast delete enablement = true", func(t *testing.T) {
104+
provider := newTestProvider(t)
105+
scaleSet := newTestScaleSetWithFastDelete(provider.azureManager, "testScaleSet")
106+
107+
t.Run("provisioning state = failed, power state = starting", func(t *testing.T) {
108+
vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateStarting)
109+
110+
status := scaleSet.instanceStatusFromVM(vm)
111+
112+
assert.NotNil(t, status)
113+
assert.Equal(t, cloudprovider.InstanceRunning, status.State)
114+
})
115+
116+
t.Run("provisioning state = failed, power state = running", func(t *testing.T) {
117+
vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateRunning)
118+
119+
status := scaleSet.instanceStatusFromVM(vm)
120+
121+
assert.NotNil(t, status)
122+
assert.Equal(t, cloudprovider.InstanceRunning, status.State)
123+
})
124+
125+
t.Run("provisioning state = failed, power state = stopping", func(t *testing.T) {
126+
vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateStopping)
127+
128+
status := scaleSet.instanceStatusFromVM(vm)
129+
130+
assert.NotNil(t, status)
131+
assert.Equal(t, cloudprovider.InstanceCreating, status.State)
132+
assert.NotNil(t, status.ErrorInfo)
133+
})
134+
135+
t.Run("provisioning state = failed, power state = stopped", func(t *testing.T) {
136+
vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateStopped)
137+
138+
status := scaleSet.instanceStatusFromVM(vm)
139+
140+
assert.NotNil(t, status)
141+
assert.Equal(t, cloudprovider.InstanceCreating, status.State)
142+
assert.NotNil(t, status.ErrorInfo)
143+
})
144+
145+
t.Run("provisioning state = failed, power state = deallocated", func(t *testing.T) {
146+
vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateDeallocated)
147+
148+
status := scaleSet.instanceStatusFromVM(vm)
149+
150+
assert.NotNil(t, status)
151+
assert.Equal(t, cloudprovider.InstanceCreating, status.State)
152+
assert.NotNil(t, status.ErrorInfo)
153+
})
154+
155+
t.Run("provisioning state = failed, power state = unknown", func(t *testing.T) {
156+
vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateUnknown)
157+
158+
status := scaleSet.instanceStatusFromVM(vm)
159+
160+
assert.NotNil(t, status)
161+
assert.Equal(t, cloudprovider.InstanceCreating, status.State)
162+
assert.NotNil(t, status.ErrorInfo)
163+
})
164+
})
165+
}

0 commit comments

Comments
 (0)