Skip to content

Commit a179997

Browse files
Merge pull request #184 from kyrtapz/fix_capacity_calc
OCPBUGS-63348: Fix capacity calculation
2 parents 8384756 + f6b12fd commit a179997

File tree

8 files changed

+149
-109
lines changed

8 files changed

+149
-109
lines changed

pkg/cloudprovider/aws.go

Lines changed: 19 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,6 @@ import (
1414
"github.com/aws/aws-sdk-go/aws/endpoints"
1515
"github.com/aws/aws-sdk-go/aws/session"
1616
"github.com/aws/aws-sdk-go/service/ec2"
17-
v1 "github.com/openshift/api/cloudnetwork/v1"
18-
"github.com/openshift/cloud-network-config-controller/pkg/cloudprivateipconfig"
1917
"github.com/pkg/errors"
2018
corev1 "k8s.io/api/core/v1"
2119
"k8s.io/apimachinery/pkg/util/sets"
@@ -181,7 +179,7 @@ func (a *AWS) ReleasePrivateIP(ip net.IP, node *corev1.Node) error {
181179
}
182180
}
183181

184-
func (a *AWS) GetNodeEgressIPConfiguration(node *corev1.Node, cloudPrivateIPConfigs []*v1.CloudPrivateIPConfig) ([]*NodeEgressIPConfiguration, error) {
182+
func (a *AWS) GetNodeEgressIPConfiguration(node *corev1.Node, cpicIPs sets.Set[string]) ([]*NodeEgressIPConfiguration, error) {
185183
instance, err := a.getInstance(node)
186184
if err != nil {
187185
return nil, err
@@ -214,10 +212,8 @@ func (a *AWS) GetNodeEgressIPConfiguration(node *corev1.Node, cloudPrivateIPConf
214212
if v6Subnet != nil {
215213
config.IFAddr.IPv6 = v6Subnet.String()
216214
}
217-
capV4, capV6, err := a.getCapacity(instanceV4Capacity, instanceV6Capacity, networkInterface, cloudPrivateIPConfigs)
218-
if err != nil {
219-
return nil, err
220-
}
215+
216+
capV4, capV6 := a.getCapacity(instanceV4Capacity, instanceV6Capacity, networkInterface, cpicIPs)
221217
config.Capacity = capacity{
222218
IPv4: ptr.To(capV4),
223219
IPv6: ptr.To(capV6),
@@ -304,32 +300,28 @@ func (a *AWS) getSubnet(networkInterface *ec2.InstanceNetworkInterface) (*net.IP
304300

305301
// AWS uses a variable capacity per instance type, see:
306302
// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-eni.html#AvailableIpPerENI
307-
// Hence we need to retrieve that and then subtract the amount already assigned
308-
// by default.
309-
func (a *AWS) getCapacity(instanceV4Capacity, instanceV6Capacity int, networkInterface *ec2.InstanceNetworkInterface, cloudPrivateIPConfigs []*v1.CloudPrivateIPConfig) (int, int, error) {
303+
// Capacity represents the number of IPs available for consumption.
304+
// We calculate this as: instance_limit - IPs_consumed_by_non_CPIC_sources
305+
// This way, CNCC managed IPs (regardless of their status) don't reduce capacity.
306+
func (a *AWS) getCapacity(instanceV4Capacity, instanceV6Capacity int, networkInterface *ec2.InstanceNetworkInterface, cpicIPs sets.Set[string]) (int, int) {
310307
currentIPv4Usage, currentIPv6Usage := 0, 0
311308
for _, assignedIPv6 := range networkInterface.Ipv6Addresses {
312309
if assignedIP := net.ParseIP(*assignedIPv6.Ipv6Address); assignedIP != nil {
313-
currentIPv6Usage++
310+
if !cpicIPs.Has(assignedIP.String()) {
311+
currentIPv6Usage++
312+
}
314313
}
315314
}
315+
316316
for _, assignedIPv4 := range networkInterface.PrivateIpAddresses {
317317
if assignedIP := net.ParseIP(*assignedIPv4.PrivateIpAddress); assignedIP != nil {
318-
currentIPv4Usage++
319-
}
320-
}
321-
for _, cloudPrivateIPConfig := range cloudPrivateIPConfigs {
322-
_, ipFamily, err := cloudprivateipconfig.NameToIP(cloudPrivateIPConfig.Name)
323-
if err != nil {
324-
return 0, 0, err
325-
}
326-
if ipFamily == cloudprivateipconfig.IPv4 {
327-
instanceV4Capacity++
328-
} else if ipFamily == cloudprivateipconfig.IPv6 {
329-
instanceV6Capacity++
318+
if !cpicIPs.Has(assignedIP.String()) {
319+
currentIPv4Usage++
320+
}
330321
}
331322
}
332-
return instanceV4Capacity - currentIPv4Usage, instanceV6Capacity - currentIPv6Usage, nil
323+
324+
return instanceV4Capacity - currentIPv4Usage, instanceV6Capacity - currentIPv6Usage
333325
}
334326

335327
func (a *AWS) getNetworkInterfaces(instance *ec2.Instance) ([]*ec2.InstanceNetworkInterface, error) {
@@ -455,6 +447,9 @@ func sharedCredentialsFileFromDirectory(dir string) (string, error) {
455447
return f.Name(), nil
456448
}
457449

450+
func (a *AWS) CleanupNode(nodeName string) {
451+
}
452+
458453
// newConfigForStaticCreds is copied verbatim from:
459454
// https://github.com/openshift/cluster-ingress-operator/blob/1600a0e349ef075fcb52ab65b33445e256358ab8/pkg/util/aws/shared_credentials_file.go#L52
460455
func newConfigForStaticCreds(accessKey string, accessSecret string) []byte {

pkg/cloudprovider/azure.go

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,9 @@ import (
2121
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v6"
2222
azureapi "github.com/Azure/go-autorest/autorest/azure"
2323
"github.com/Azure/msi-dataplane/pkg/dataplane"
24-
v1 "github.com/openshift/api/cloudnetwork/v1"
2524
configv1 "github.com/openshift/api/config/v1"
2625
corev1 "k8s.io/api/core/v1"
26+
"k8s.io/apimachinery/pkg/util/sets"
2727
"k8s.io/klog/v2"
2828
utilnet "k8s.io/utils/net"
2929
)
@@ -322,7 +322,7 @@ func (a *Azure) ReleasePrivateIP(ip net.IP, node *corev1.Node) error {
322322
return a.waitForCompletion(poller)
323323
}
324324

325-
func (a *Azure) GetNodeEgressIPConfiguration(node *corev1.Node, cloudPrivateIPConfigs []*v1.CloudPrivateIPConfig) ([]*NodeEgressIPConfiguration, error) {
325+
func (a *Azure) GetNodeEgressIPConfiguration(node *corev1.Node, cpicIPs sets.Set[string]) ([]*NodeEgressIPConfiguration, error) {
326326
instance, err := a.getInstance(node)
327327
if err != nil {
328328
return nil, err
@@ -352,7 +352,7 @@ func (a *Azure) GetNodeEgressIPConfiguration(node *corev1.Node, cloudPrivateIPCo
352352
}
353353
config.Capacity = capacity{
354354
// IPv4 and IPv6 fields not used by Azure (uses IP-family-agnostic capacity)
355-
IP: ptr.To(a.getCapacity(networkInterface, len(cloudPrivateIPConfigs))),
355+
IP: ptr.To(a.getCapacity(networkInterface, cpicIPs)),
356356
}
357357
return []*NodeEgressIPConfiguration{config}, nil
358358
}
@@ -408,18 +408,17 @@ func (a *Azure) getSubnet(networkInterface armnetwork.Interface) (*net.IPNet, *n
408408
// We need to retrieve the amounts assigned to the node by default and subtract
409409
// that from the default 256 value. Note: there is also a "Private IP addresses
410410
// per virtual network" quota, but that's 65.536, so we can skip that.
411-
func (a *Azure) getCapacity(networkInterface armnetwork.Interface, cloudPrivateIPsCount int) int {
412-
currentIPv4Usage, currentIPv6Usage := 0, 0
411+
func (a *Azure) getCapacity(networkInterface armnetwork.Interface, cpicIPs sets.Set[string]) int {
412+
currentIPUsage := 0
413413
for _, ipConfiguration := range networkInterface.Properties.IPConfigurations {
414414
if assignedIP := net.ParseIP(ptr.Deref(ipConfiguration.Properties.PrivateIPAddress, "")); assignedIP != nil {
415-
if utilnet.IsIPv4(assignedIP) {
416-
currentIPv4Usage++
417-
} else {
418-
currentIPv6Usage++
415+
if !cpicIPs.Has(assignedIP.String()) {
416+
currentIPUsage++
419417
}
420418
}
421419
}
422-
return defaultAzurePrivateIPCapacity + cloudPrivateIPsCount - currentIPv4Usage - currentIPv6Usage
420+
421+
return defaultAzurePrivateIPCapacity - currentIPUsage
423422
}
424423

425424
// This is what the node's providerID looks like on Azure
@@ -668,6 +667,12 @@ func (a *Azure) getNodeLock(nodeName string) *sync.Mutex {
668667
return a.nodeLockMap[nodeName]
669668
}
670669

670+
func (a *Azure) CleanupNode(nodeName string) {
671+
a.nodeMapLock.Lock()
672+
defer a.nodeMapLock.Unlock()
673+
delete(a.nodeLockMap, nodeName)
674+
}
675+
671676
func getNameFromResourceID(id string) string {
672677
return id[strings.LastIndex(id, "/"):]
673678
}

pkg/cloudprovider/cloudprovider.go

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ import (
1313
apifeatures "github.com/openshift/api/features"
1414
"github.com/openshift/library-go/pkg/operator/configobserver/featuregates"
1515

16-
v1 "github.com/openshift/api/cloudnetwork/v1"
1716
corev1 "k8s.io/api/core/v1"
17+
"k8s.io/apimachinery/pkg/util/sets"
1818
)
1919

2020
var (
@@ -60,7 +60,13 @@ type CloudProviderIntf interface {
6060
// for all instance types and IP families (GCP, Azure) or variable per
6161
// instance and IP family (AWS), also: the interface is either keyed by name
6262
// (GCP) or ID (Azure, AWS).
63-
GetNodeEgressIPConfiguration(node *corev1.Node, cloudPrivateIPConfigs []*v1.CloudPrivateIPConfig) ([]*NodeEgressIPConfiguration, error)
63+
// The cpicIPs parameter is a set of IP addresses that are
64+
// managed by CloudPrivateIPConfigs and should be excluded from capacity calculations.
65+
GetNodeEgressIPConfiguration(node *corev1.Node, cpicIPs sets.Set[string]) ([]*NodeEgressIPConfiguration, error)
66+
67+
// CleanupNode removes any internal state associated with the node.
68+
// This should be called when a node is deleted.
69+
CleanupNode(nodeName string)
6470
}
6571

6672
// CloudProviderWithMoveIntf is additional interface that can be added to cloud
@@ -163,6 +169,7 @@ func NewCloudProviderClient(cfg CloudProviderConfig, platformStatus *configv1.Pl
163169
case PlatformTypeGCP:
164170
cloudProviderIntf = &GCP{
165171
CloudProvider: cp,
172+
nodeLockMap: make(map[string]*sync.Mutex),
166173
}
167174
case PlatformTypeOpenStack:
168175
cloudProviderIntf = &OpenStack{

pkg/cloudprovider/cloudprovider_fake.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@ import (
55
"net"
66
"time"
77

8-
v1 "github.com/openshift/api/cloudnetwork/v1"
98
corev1 "k8s.io/api/core/v1"
9+
"k8s.io/apimachinery/pkg/util/sets"
1010
)
1111

1212
type FakeCloudProvider struct {
@@ -63,9 +63,12 @@ func (f *FakeCloudProvider) waitForCompletion() error {
6363
return nil
6464
}
6565

66-
func (f *FakeCloudProvider) GetNodeEgressIPConfiguration(node *corev1.Node, cloudPrivateIPConfigs []*v1.CloudPrivateIPConfig) ([]*NodeEgressIPConfiguration, error) {
66+
func (f *FakeCloudProvider) GetNodeEgressIPConfiguration(node *corev1.Node, cpicIPs sets.Set[string]) ([]*NodeEgressIPConfiguration, error) {
6767
if f.mockErrorOnGetNodeEgressIPConfiguration {
6868
return nil, fmt.Errorf("Get node egress IP configuration failed")
6969
}
7070
return nil, nil
7171
}
72+
73+
func (f *FakeCloudProvider) CleanupNode(nodeName string) {
74+
}

pkg/cloudprovider/gcp.go

Lines changed: 43 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,12 @@ import (
66
"net"
77
"net/url"
88
"strings"
9+
"sync"
910

10-
v1 "github.com/openshift/api/cloudnetwork/v1"
1111
google "google.golang.org/api/compute/v1"
1212
"google.golang.org/api/option"
1313
corev1 "k8s.io/api/core/v1"
14-
utilnet "k8s.io/utils/net"
14+
"k8s.io/apimachinery/pkg/util/sets"
1515
"k8s.io/utils/ptr"
1616
)
1717

@@ -31,7 +31,9 @@ const (
3131
// to the GCP cloud API
3232
type GCP struct {
3333
CloudProvider
34-
client *google.Service
34+
client *google.Service
35+
nodeMapLock sync.Mutex
36+
nodeLockMap map[string]*sync.Mutex
3537
}
3638

3739
func (g *GCP) initCredentials() (err error) {
@@ -81,6 +83,10 @@ func (g *GCP) initCredentials() (err error) {
8183
// GCP can return 10.0.32.25/32 or 10.0.32.25 - we thus need to check for both
8284
// when validating that the IP provided doesn't already exist
8385
func (g *GCP) AssignPrivateIP(ip net.IP, node *corev1.Node) error {
86+
nodeLock := g.getNodeLock(node.Name)
87+
nodeLock.Lock()
88+
defer nodeLock.Unlock()
89+
8490
project, zone, instance, err := g.getInstance(node)
8591
if err != nil {
8692
return err
@@ -114,6 +120,10 @@ func (g *GCP) AssignPrivateIP(ip net.IP, node *corev1.Node) error {
114120
// Important: GCP IP aliases can come in all forms, i.e: if you add 10.0.32.25
115121
// GCP can return 10.0.32.25/32 or 10.0.32.25
116122
func (g *GCP) ReleasePrivateIP(ip net.IP, node *corev1.Node) error {
123+
nodeLock := g.getNodeLock(node.Name)
124+
nodeLock.Lock()
125+
defer nodeLock.Unlock()
126+
117127
project, zone, instance, err := g.getInstance(node)
118128
if err != nil {
119129
return err
@@ -155,7 +165,7 @@ func (g *GCP) ReleasePrivateIP(ip net.IP, node *corev1.Node) error {
155165
return g.waitForCompletion(project, zone, operation.Name)
156166
}
157167

158-
func (g *GCP) GetNodeEgressIPConfiguration(node *corev1.Node, cloudPrivateIPConfigs []*v1.CloudPrivateIPConfig) ([]*NodeEgressIPConfiguration, error) {
168+
func (g *GCP) GetNodeEgressIPConfiguration(node *corev1.Node, cpicIPs sets.Set[string]) ([]*NodeEgressIPConfiguration, error) {
159169
_, _, instance, err := g.getInstance(node)
160170
if err != nil {
161171
return nil, fmt.Errorf("error retrieving instance associated with node, err: %v", err)
@@ -184,7 +194,7 @@ func (g *GCP) GetNodeEgressIPConfiguration(node *corev1.Node, cloudPrivateIPConf
184194
}
185195
config.Capacity = capacity{
186196
// IPv4 and IPv6 fields not used by GCP (uses IP-family-agnostic capacity)
187-
IP: ptr.To(g.getCapacity(networkInterface, len(cloudPrivateIPConfigs))),
197+
IP: ptr.To(g.getCapacity(networkInterface, cpicIPs)),
188198
}
189199
return []*NodeEgressIPConfiguration{config}, nil //nolint:staticcheck
190200
}
@@ -235,25 +245,22 @@ func (g *GCP) getSubnet(networkInterface *google.NetworkInterface) (*net.IPNet,
235245

236246
// Note: there is also a global "alias IP per VPC quota", but OpenShift clusters on
237247
// GCP seem to have that value defined to 15,000. So we can skip that.
238-
func (g *GCP) getCapacity(networkInterface *google.NetworkInterface, cloudPrivateIPsCount int) int {
239-
currentIPv4Usage := 0
240-
currentIPv6Usage := 0
248+
func (g *GCP) getCapacity(networkInterface *google.NetworkInterface, cpicIPs sets.Set[string]) int {
249+
currentIPUsage := 0
241250
for _, aliasIPRange := range networkInterface.AliasIpRanges {
251+
var aliasIP net.IP
242252
if assignedIP := net.ParseIP(aliasIPRange.IpCidrRange); assignedIP != nil {
243-
if utilnet.IsIPv4(assignedIP) {
244-
currentIPv4Usage++
245-
} else {
246-
currentIPv6Usage++
247-
}
253+
aliasIP = assignedIP
248254
} else if _, assignedSubnet, err := net.ParseCIDR(aliasIPRange.IpCidrRange); err == nil {
249-
if utilnet.IsIPv4CIDR(assignedSubnet) {
250-
currentIPv4Usage++
251-
} else {
252-
currentIPv6Usage++
253-
}
255+
aliasIP = assignedSubnet.IP
256+
}
257+
258+
if aliasIP != nil && !cpicIPs.Has(aliasIP.String()) {
259+
currentIPUsage++
254260
}
255261
}
256-
return defaultGCPPrivateIPCapacity + cloudPrivateIPsCount - currentIPv4Usage - currentIPv6Usage
262+
263+
return defaultGCPPrivateIPCapacity - currentIPUsage
257264
}
258265

259266
// getInstance retrieves the GCP instance referred by the Node object.
@@ -317,3 +324,20 @@ func (g *GCP) parseSubnet(subnetURL string) (string, string, string, error) {
317324
return subnetURLParts[len(subnetURLParts)-5], subnetURLParts[len(subnetURLParts)-3],
318325
subnetURLParts[len(subnetURLParts)-1], nil
319326
}
327+
328+
// getNodeLock retrieves node lock from nodeLockMap, If lock doesn't exist, then update map
329+
// with a new lock entry for the given node name.
330+
func (g *GCP) getNodeLock(nodeName string) *sync.Mutex {
331+
g.nodeMapLock.Lock()
332+
defer g.nodeMapLock.Unlock()
333+
if _, ok := g.nodeLockMap[nodeName]; !ok {
334+
g.nodeLockMap[nodeName] = &sync.Mutex{}
335+
}
336+
return g.nodeLockMap[nodeName]
337+
}
338+
339+
func (g *GCP) CleanupNode(nodeName string) {
340+
g.nodeMapLock.Lock()
341+
defer g.nodeMapLock.Unlock()
342+
delete(g.nodeLockMap, nodeName)
343+
}

0 commit comments

Comments
 (0)