From cec1814ada5df8e32477b7a9fa117436e6324c3e Mon Sep 17 00:00:00 2001
From: leo-ryu
Date: Tue, 11 Mar 2025 19:29:22 +0800
Subject: [PATCH] fix topology spread constraints with zonal volume

---
 pkg/controllers/provisioning/provisioner.go   |  14 +-
 .../provisioning/scheduling/existingnode.go   |   9 +-
 .../provisioning/scheduling/nodeclaim.go      |   9 +-
 .../provisioning/scheduling/scheduler.go      |  10 +-
 .../scheduling/scheduling_benchmark_test.go   |   3 +-
 .../provisioning/scheduling/suite_test.go     | 142 ++++++++++++++++++
 .../provisioning/scheduling/topology.go       |  20 ++-
 .../provisioning/scheduling/topologygroup.go  |  82 ++++++++--
 .../provisioning/scheduling/volumetopology.go |  27 +---
 9 files changed, 259 insertions(+), 57 deletions(-)

diff --git a/pkg/controllers/provisioning/provisioner.go b/pkg/controllers/provisioning/provisioner.go
index 0cbfc1a575..8cc04fadc6 100644
--- a/pkg/controllers/provisioning/provisioner.go
+++ b/pkg/controllers/provisioning/provisioner.go
@@ -254,11 +254,11 @@ func (p *Provisioner) NewScheduler(
         instanceTypes[np.Name] = its
     }
 
-    // inject topology constraints
-    pods = p.injectVolumeTopologyRequirements(ctx, pods)
+    // Link volume requirements to pods
+    podsVolumeRequirements := p.convertToPodVolumeRequirements(ctx, pods)
 
     // Calculate cluster topology
-    topology, err := scheduler.NewTopology(ctx, p.kubeClient, p.cluster, stateNodes, nodePools, instanceTypes, pods)
+    topology, err := scheduler.NewTopology(ctx, p.kubeClient, p.cluster, stateNodes, nodePools, instanceTypes, pods, podsVolumeRequirements)
     if err != nil {
         return nil, fmt.Errorf("tracking topology counts, %w", err)
     }
@@ -464,13 +464,13 @@ func validateKarpenterManagedLabelCanExist(p *corev1.Pod) error {
     return nil
 }
 
-func (p *Provisioner) injectVolumeTopologyRequirements(ctx context.Context, pods []*corev1.Pod) []*corev1.Pod {
-    var schedulablePods []*corev1.Pod
+func (p *Provisioner) convertToPodVolumeRequirements(ctx context.Context, pods []*corev1.Pod) map[*corev1.Pod][]corev1.NodeSelectorRequirement {
+    var schedulablePods = make(map[*corev1.Pod][]corev1.NodeSelectorRequirement)
     for _, pod := range pods {
-        if err := p.volumeTopology.Inject(ctx, pod); err != nil {
+        if requirements, err := p.volumeTopology.GetVolumeRequirements(ctx, pod); err != nil {
             log.FromContext(ctx).WithValues("Pod", klog.KObj(pod)).Error(err, "failed getting volume topology requirements")
         } else {
-            schedulablePods = append(schedulablePods, pod)
+            schedulablePods[pod] = requirements
         }
     }
     return schedulablePods
diff --git a/pkg/controllers/provisioning/scheduling/existingnode.go b/pkg/controllers/provisioning/scheduling/existingnode.go
index 804ca1cc61..785fc21443 100644
--- a/pkg/controllers/provisioning/scheduling/existingnode.go
+++ b/pkg/controllers/provisioning/scheduling/existingnode.go
@@ -65,7 +65,7 @@ func NewExistingNode(n *state.StateNode, topology *Topology, taints []v1.Taint,
     return node
 }
 
-func (n *ExistingNode) Add(ctx context.Context, kubeClient client.Client, pod *v1.Pod, podData *PodData) error {
+func (n *ExistingNode) Add(ctx context.Context, kubeClient client.Client, pod *v1.Pod, podData *PodData, volumeRequirements []v1.NodeSelectorRequirement) error {
     // Check Taints
     if err := scheduling.Taints(n.cachedTaints).ToleratesPod(pod); err != nil {
         return err
     }
@@ -111,6 +111,13 @@ func (n *ExistingNode) Add(ctx context.Context, kubeClient client.Client, pod *v
     }
     nodeRequirements.Add(topologyRequirements.Values()...)
 
+    podVolumeRequirements := scheduling.NewNodeSelectorRequirements(volumeRequirements...)
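+    // volumeRequirements capture the zonal constraints derived from the pod's volumes (see VolumeTopology.GetVolumeRequirements)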
+    // Check Pod Volume Requirements
+    if err = nodeRequirements.Compatible(podVolumeRequirements); err != nil {
+        return err
+    }
+    nodeRequirements.Add(podVolumeRequirements.Values()...)
+
     // Update node
     n.Pods = append(n.Pods, pod)
     n.requests = requests
diff --git a/pkg/controllers/provisioning/scheduling/nodeclaim.go b/pkg/controllers/provisioning/scheduling/nodeclaim.go
index 51d8f29d75..c42db466b5 100644
--- a/pkg/controllers/provisioning/scheduling/nodeclaim.go
+++ b/pkg/controllers/provisioning/scheduling/nodeclaim.go
@@ -108,7 +108,7 @@ func NewNodeClaim(
     }
 }
 
-func (n *NodeClaim) Add(ctx context.Context, pod *corev1.Pod, podData *PodData) error {
+func (n *NodeClaim) Add(ctx context.Context, pod *corev1.Pod, podData *PodData, volumeRequirements []corev1.NodeSelectorRequirement) error {
     // Check Taints
     if err := scheduling.Taints(n.Spec.Taints).ToleratesPod(pod); err != nil {
         return err
     }
@@ -137,6 +137,13 @@ func (n *NodeClaim) Add(ctx context.Context, pod *corev1.Pod, podData *PodData)
     }
     nodeClaimRequirements.Add(topologyRequirements.Values()...)
 
+    podVolumeRequirements := scheduling.NewNodeSelectorRequirements(volumeRequirements...)
+    // Check Pod Volume Requirements
+    if err = nodeClaimRequirements.Compatible(podVolumeRequirements, scheduling.AllowUndefinedWellKnownLabels); err != nil {
+        return err
+    }
+    nodeClaimRequirements.Add(podVolumeRequirements.Values()...)
+
     // Check instance type combinations
     requests := resources.Merge(n.Spec.Resources.Requests, podData.Requests)
diff --git a/pkg/controllers/provisioning/scheduling/scheduler.go b/pkg/controllers/provisioning/scheduling/scheduler.go
index 152998c50f..341adb004c 100644
--- a/pkg/controllers/provisioning/scheduling/scheduler.go
+++ b/pkg/controllers/provisioning/scheduling/scheduler.go
@@ -350,9 +350,13 @@ func (s *Scheduler) updateCachedPodData(p *corev1.Pod) {
 
 //nolint:gocyclo
 func (s *Scheduler) add(ctx context.Context, pod *corev1.Pod) error {
+    var volumeRequirements []corev1.NodeSelectorRequirement
+    if _, ok := s.topology.podVolumeRequirements[pod]; ok {
+        volumeRequirements = s.topology.podVolumeRequirements[pod]
+    }
     // first try to schedule against an in-flight real node
     for _, node := range s.existingNodes {
-        if err := node.Add(ctx, s.kubeClient, pod, s.cachedPodData[pod.UID]); err == nil {
+        if err := node.Add(ctx, s.kubeClient, pod, s.cachedPodData[pod.UID], volumeRequirements); err == nil {
             return nil
         }
     }
@@ -362,7 +366,7 @@ func (s *Scheduler) add(ctx context.Context, pod *corev1.Pod) error {
 
     // Pick existing node that we are about to create
     for _, nodeClaim := range s.newNodeClaims {
-        if err := nodeClaim.Add(ctx, pod, s.cachedPodData[pod.UID]); err == nil {
+        if err := nodeClaim.Add(ctx, pod, s.cachedPodData[pod.UID], volumeRequirements); err == nil {
             return nil
         }
     }
@@ -389,7 +393,7 @@ func (s *Scheduler) add(ctx context.Context, pod *corev1.Pod) error {
         }
 
         nodeClaim := NewNodeClaim(nodeClaimTemplate, s.topology, s.daemonOverhead[nodeClaimTemplate], instanceTypes, s.reservationManager, s.reservedOfferingMode)
-        if err := nodeClaim.Add(ctx, pod, s.cachedPodData[pod.UID]); err != nil {
+        if err := nodeClaim.Add(ctx, pod, s.cachedPodData[pod.UID], volumeRequirements); err != nil {
             nodeClaim.Destroy()
             if IsReservedOfferingError(err) {
                 errs = multierr.Append(errs, fmt.Errorf(
diff --git a/pkg/controllers/provisioning/scheduling/scheduling_benchmark_test.go b/pkg/controllers/provisioning/scheduling/scheduling_benchmark_test.go
index 3907ca5242..1be8dc30aa 100644
--- a/pkg/controllers/provisioning/scheduling/scheduling_benchmark_test.go
+++ b/pkg/controllers/provisioning/scheduling/scheduling_benchmark_test.go
@@ -155,9 +155,10 @@ func benchmarkScheduler(b *testing.B, instanceCount, podCount int) {
     pods := makeDiversePods(podCount)
     clock := &clock.RealClock{}
     cluster = state.NewCluster(clock, client, cloudProvider)
+    podsVolumeRequirements := make(map[*corev1.Pod][]corev1.NodeSelectorRequirement)
     topology, err := scheduling.NewTopology(ctx, client, cluster, nil, []*v1.NodePool{nodePool}, map[string][]*cloudprovider.InstanceType{
         nodePool.Name: instanceTypes,
-    }, pods)
+    }, pods, podsVolumeRequirements)
     if err != nil {
         b.Fatalf("creating topology, %s", err)
     }
diff --git a/pkg/controllers/provisioning/scheduling/suite_test.go b/pkg/controllers/provisioning/scheduling/suite_test.go
index e89c723d7d..f33fd5e6f7 100644
--- a/pkg/controllers/provisioning/scheduling/suite_test.go
+++ b/pkg/controllers/provisioning/scheduling/suite_test.go
@@ -3443,6 +3443,148 @@ var _ = Context("Scheduling", func() {
                 Expect(node.Name).ToNot(Equal(node2.Name))
             })
         })
+        Context("Pods with Zonal Volume and Topology Spread", func() {
+            var labels = map[string]string{"test": "test"}
+            var pvcs []*corev1.PersistentVolumeClaim
+            var pods []*corev1.Pod
+            var sc1 *storagev1.StorageClass
+            var sc2 *storagev1.StorageClass
+            var tsc = corev1.TopologySpreadConstraint{
+                MaxSkew:           1,
+                TopologyKey:       corev1.LabelTopologyZone,
+                WhenUnsatisfiable: corev1.DoNotSchedule,
+                LabelSelector:     &metav1.LabelSelector{MatchLabels: labels},
+            }
+            BeforeEach(func() {
+                pvcs = []*corev1.PersistentVolumeClaim{}
+                pods = []*corev1.Pod{}
+                sc1 = test.StorageClass(test.StorageClassOptions{
+                    ObjectMeta: metav1.ObjectMeta{Name: "my-storage-class-1"},
+                    Zones:      []string{"test-zone-1"},
+                })
+                sc2 = test.StorageClass(test.StorageClassOptions{
+                    ObjectMeta: metav1.ObjectMeta{Name: "my-storage-class-2"},
+                    Zones:      []string{"test-zone-2"},
+                })
+                for i := 0; i < 3; i++ {
+                    // one claim is in test-zone-1 and the others are in test-zone-2
+                    scname := sc1.Name
+                    if i > 0 {
+                        scname = sc2.Name
+                    }
+                    pvc := test.PersistentVolumeClaim(test.PersistentVolumeClaimOptions{
+                        ObjectMeta:       metav1.ObjectMeta{Name: fmt.Sprintf("my-claim-%d", i)},
+                        StorageClassName: lo.ToPtr(scname),
+                    })
+                    pod := test.UnschedulablePod(test.PodOptions{
+                        // anti-affinity on the hostname ensures each pod gets its own node
+                        PodAntiRequirements: []corev1.PodAffinityTerm{
+                            {
+                                LabelSelector: &metav1.LabelSelector{MatchLabels: labels},
+                                TopologyKey:   corev1.LabelHostname,
+                            },
+                        },
+                        TopologySpreadConstraints: []corev1.TopologySpreadConstraint{tsc},
+                        PersistentVolumeClaims:    []string{pvc.Name},
+                        ObjectMeta:                metav1.ObjectMeta{Labels: labels},
+                    })
+                    pvcs = append(pvcs, pvc)
+                    pods = append(pods, pod)
+                }
+            })
+            It("should launch nodes when volume zone is compatible with topology spread", func() {
+                node1 := test.Node(test.NodeOptions{
+                    ObjectMeta: metav1.ObjectMeta{
+                        Labels: map[string]string{corev1.LabelTopologyZone: "test-zone-1"},
+                    },
+                })
+                node2 := test.Node(test.NodeOptions{
+                    ObjectMeta: metav1.ObjectMeta{
+                        Labels: map[string]string{corev1.LabelTopologyZone: "test-zone-2"},
+                    },
+                })
+                ExpectApplied(ctx, env.Client, nodePool, sc1, sc2)
+                ExpectApplied(ctx, env.Client, pvcs[0], pvcs[1], pvcs[2])
+                ExpectApplied(ctx, env.Client, pods[0], pods[1], node1, node2)
+                ExpectManualBinding(ctx, env.Client, pods[0], node1)
+                ExpectManualBinding(ctx, env.Client, pods[1], node2)
+
+                ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{node1, node2}, nil)
+
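+                // pods[2] needs its volume in test-zone-2; placing it there keeps the zonal skew within maxSkew, so it can still be scheduled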
+                ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pods[2])
+                ExpectScheduled(ctx, env.Client, pods[2])
+            })
+            It("should not launch nodes when volume zone is not compatible with topology spread", func() {
+                node1 := test.Node(test.NodeOptions{
+                    ObjectMeta: metav1.ObjectMeta{
+                        Labels: map[string]string{corev1.LabelTopologyZone: "test-zone-1"},
+                    },
+                })
+                node2 := test.Node(test.NodeOptions{
+                    ObjectMeta: metav1.ObjectMeta{
+                        Labels: map[string]string{corev1.LabelTopologyZone: "test-zone-2"},
+                    },
+                })
+                node3 := test.Node(test.NodeOptions{
+                    ObjectMeta: metav1.ObjectMeta{
+                        Labels: map[string]string{corev1.LabelTopologyZone: "test-zone-3"},
+                    },
+                })
+
+                ExpectApplied(ctx, env.Client, nodePool, sc1, sc2)
+                ExpectApplied(ctx, env.Client, pvcs[0], pvcs[1], pvcs[2])
+                ExpectApplied(ctx, env.Client, pods[0], pods[1], node1, node2, node3)
+                ExpectManualBinding(ctx, env.Client, pods[0], node1)
+                ExpectManualBinding(ctx, env.Client, pods[1], node2)
+
+                ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{node1, node2, node3}, nil)
+
+                ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pods[2])
+                // topology spread would place the 3rd pod in test-zone-3, but its volume has to be in test-zone-2
+                ExpectNotScheduled(ctx, env.Client, pods[2])
+
+            })
+            It("only nodes matching nodeAffinity/nodeSelector are included in the calculations by default", func() {
+                node1 := test.Node(test.NodeOptions{
+                    ObjectMeta: metav1.ObjectMeta{
+                        Labels: map[string]string{corev1.LabelTopologyZone: "test-zone-1", "test": "test"},
+                    },
+                })
+                node2 := test.Node(test.NodeOptions{
+                    ObjectMeta: metav1.ObjectMeta{
+                        Labels: map[string]string{corev1.LabelTopologyZone: "test-zone-2", "test": "test"},
+                    },
+                })
+                node3 := test.Node(test.NodeOptions{
+                    ObjectMeta: metav1.ObjectMeta{
+                        Labels: map[string]string{corev1.LabelTopologyZone: "test-zone-3"},
+                    },
+                })
+                nodePool.Spec.Template.Spec.Requirements = []v1.NodeSelectorRequirementWithMinValues{
+                    {
+                        NodeSelectorRequirement: corev1.NodeSelectorRequirement{
+                            Key:      "test",
+                            Operator: corev1.NodeSelectorOpIn,
+                            Values:   []string{"test"},
+                        },
+                    },
+                }
+                pods[2].Spec.NodeSelector = map[string]string{"test": "test"}
+
+                ExpectApplied(ctx, env.Client, nodePool, sc1, sc2)
+                ExpectApplied(ctx, env.Client, pvcs[0], pvcs[1], pvcs[2])
+                ExpectApplied(ctx, env.Client, pods[0], pods[1], node1, node2, node3)
+                ExpectManualBinding(ctx, env.Client, pods[0], node1)
+                ExpectManualBinding(ctx, env.Client, pods[1], node2)
+
+                ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{node1, node2, node3}, nil)
+
+                ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pods[2])
+                // since no node in test-zone-3 has the label "test", only test-zone-1 and test-zone-2 are included in the calculations
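+                // pods[2]'s volume must be in test-zone-2, which remains a valid spread domain, so the pod can still be scheduled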
+                ExpectScheduled(ctx, env.Client, pods[2])
+
+            })
+        })
     })
 
     Describe("Deleting Nodes", func() {
diff --git a/pkg/controllers/provisioning/scheduling/topology.go b/pkg/controllers/provisioning/scheduling/topology.go
index 507dc785bd..68dd885f82 100644
--- a/pkg/controllers/provisioning/scheduling/topology.go
+++ b/pkg/controllers/provisioning/scheduling/topology.go
@@ -60,6 +60,9 @@ type Topology struct {
     excludedPods          sets.Set[string]
     cluster               *state.Cluster
     stateNodes            []*state.StateNode
+    // podVolumeRequirements links volume requirements to pods. This is used so we
+    // can track volume requirements in the scheduling simulation
+    podVolumeRequirements map[*corev1.Pod][]corev1.NodeSelectorRequirement
 }
 
 func NewTopology(
@@ -70,6 +73,9 @@ func NewTopology(
     nodePools []*v1.NodePool,
     instanceTypes map[string][]*cloudprovider.InstanceType,
     pods []*corev1.Pod,
+    // podsVolumeRequirements links volume requirements to pods. This is used so we
+    // can track volume requirements in the scheduling simulation
+    podsVolumeRequirements map[*corev1.Pod][]corev1.NodeSelectorRequirement,
 ) (*Topology, error) {
     t := &Topology{
         kubeClient:            kubeClient,
@@ -79,17 +85,18 @@ func NewTopology(
         topologyGroups:        map[uint64]*TopologyGroup{},
         inverseTopologyGroups: map[uint64]*TopologyGroup{},
         excludedPods:          sets.New[string](),
+        podVolumeRequirements: podsVolumeRequirements,
     }
 
     // these are the pods that we intend to schedule, so if they are currently in the cluster we shouldn't count them for
     // topology purposes
-    for _, p := range pods {
+    for p := range podsVolumeRequirements {
         t.excludedPods.Insert(string(p.UID))
     }
     errs := t.updateInverseAffinities(ctx)
-    for i := range pods {
-        errs = multierr.Append(errs, t.Update(ctx, pods[i]))
+    for p := range podsVolumeRequirements {
+        errs = multierr.Append(errs, t.Update(ctx, p))
     }
     if errs != nil {
         return nil, errs
     }
@@ -228,7 +235,7 @@ func (t *Topology) AddRequirements(p *corev1.Pod, taints []corev1.Taint, podRequ
     if nodeRequirements.Has(topology.Key) {
         nodeDomains = nodeRequirements.Get(topology.Key)
     }
-    domains := topology.Get(p, podDomains, nodeDomains)
+    domains := topology.Get(p, podDomains, nodeDomains, len(t.podVolumeRequirements[p]) != 0)
     if domains.Len() == 0 {
         return nil, topologyError{
             topology: topology,
@@ -299,7 +306,7 @@ func (t *Topology) updateInverseAntiAffinity(ctx context.Context, pod *corev1.Po
             return err
         }
 
-        tg := NewTopologyGroup(TopologyTypePodAntiAffinity, term.TopologyKey, pod, namespaces, term.LabelSelector, math.MaxInt32, nil, nil, nil, t.domainGroups[term.TopologyKey])
+        tg := NewTopologyGroup(TopologyTypePodAntiAffinity, term.TopologyKey, pod, namespaces, term.LabelSelector, math.MaxInt32, nil, nil, nil, t.domainGroups[term.TopologyKey], t.cluster)
 
         hash := tg.Hash()
         if existing, ok := t.inverseTopologyGroups[hash]; !ok {
@@ -442,6 +449,7 @@ func (t *Topology) newForTopologies(p *corev1.Pod) []*TopologyGroup {
             tsc.NodeTaintsPolicy,
             tsc.NodeAffinityPolicy,
             t.domainGroups[tsc.TopologyKey],
+            t.cluster,
         ))
     }
     return topologyGroups
@@ -479,7 +487,7 @@ func (t *Topology) newForAffinities(ctx context.Context, p *corev1.Pod) ([]*Topo
             if err != nil {
                 return nil, err
             }
-            topologyGroups = append(topologyGroups, NewTopologyGroup(topologyType, term.TopologyKey, p, namespaces, term.LabelSelector, math.MaxInt32, nil, nil, nil, t.domainGroups[term.TopologyKey]))
+            topologyGroups = append(topologyGroups, NewTopologyGroup(topologyType, term.TopologyKey, p, namespaces, term.LabelSelector, math.MaxInt32, nil, nil, nil, t.domainGroups[term.TopologyKey], t.cluster))
         }
     }
     return topologyGroups, nil
diff --git a/pkg/controllers/provisioning/scheduling/topologygroup.go b/pkg/controllers/provisioning/scheduling/topologygroup.go
index cf257f742d..6be878aece 100644
--- a/pkg/controllers/provisioning/scheduling/topologygroup.go
+++ b/pkg/controllers/provisioning/scheduling/topologygroup.go
@@ -29,6 +29,7 @@ import (
     "k8s.io/apimachinery/pkg/types"
     "k8s.io/apimachinery/pkg/util/sets"
 
+    "sigs.k8s.io/karpenter/pkg/controllers/state"
     "sigs.k8s.io/karpenter/pkg/scheduling"
 )
 
@@ -59,6 +60,7 @@ type TopologyGroup struct {
     Type        TopologyType
     maxSkew     int32
     minDomains  *int32
+    cluster     *state.Cluster
     namespaces  sets.Set[string]
     selector    labels.Selector
     rawSelector *metav1.LabelSelector
@@ -80,6 +82,7 @@ func NewTopologyGroup(
     taintPolicy *corev1.NodeInclusionPolicy,
     affinityPolicy *corev1.NodeInclusionPolicy,
     domainGroup TopologyDomainGroup,
+    cluster *state.Cluster,
 ) *TopologyGroup {
     // the nil *TopologyNodeFilter always passes which is what we need for affinity/anti-affinity
     var nodeFilter TopologyNodeFilter
@@ -110,6 +113,7 @@ func NewTopologyGroup(
     return &TopologyGroup{
         Type:        topologyType,
         Key:         topologyKey,
+        cluster:     cluster,
         namespaces:  namespaces,
         selector:    selector,
         rawSelector: labelSelector,
@@ -122,10 +126,10 @@ func NewTopologyGroup(
     }
 }
 
-func (t *TopologyGroup) Get(pod *corev1.Pod, podDomains, nodeDomains *scheduling.Requirement) *scheduling.Requirement {
+func (t *TopologyGroup) Get(pod *corev1.Pod, podDomains, nodeDomains *scheduling.Requirement, hasVolumeRequirements bool) *scheduling.Requirement {
     switch t.Type {
     case TopologyTypeSpread:
-        return t.nextDomainTopologySpread(pod, podDomains, nodeDomains)
+        return t.nextDomainTopologySpread(pod, podDomains, nodeDomains, hasVolumeRequirements)
     case TopologyTypePodAffinity:
         return t.nextDomainAffinity(pod, podDomains, nodeDomains)
     case TopologyTypePodAntiAffinity:
@@ -202,9 +206,47 @@ func (t *TopologyGroup) Hash() uint64 {
 // If there are multiple eligible domains, we return any random domain that satisfies the `maxSkew` configuration.
 // If there are no eligible domains, we return a `DoesNotExist` requirement, implying that we could not satisfy the topologySpread requirement.
 // nolint:gocyclo
-func (t *TopologyGroup) nextDomainTopologySpread(pod *corev1.Pod, podDomains, nodeDomains *scheduling.Requirement) *scheduling.Requirement {
+func (t *TopologyGroup) nextDomainTopologySpread(pod *corev1.Pod, podDomains, nodeDomains *scheduling.Requirement, hasVolumeRequirement bool) *scheduling.Requirement {
+    var nodes = make(map[string][]*corev1.Node)
+    var blockedDomains = sets.New[string]()
+    var candidateDomains = []string{}
+    var firstDomains = []string{}
+
+    if t.cluster != nil {
+        for _, node := range t.cluster.Nodes() {
+            if node == nil || node.Node == nil {
+                continue
+            }
+            if _, ok := node.Node.GetLabels()[t.Key]; !ok {
+                continue
+            }
+            nodes[node.Node.GetLabels()[t.Key]] = append(nodes[node.Node.GetLabels()[t.Key]], node.Node)
+        }
+    }
+    // empty domains whose existing nodes all fail to match the pod should be excluded from the skew calculations.
+    for _, domain := range t.emptyDomains.UnsortedList() {
+        // no existing node has this domain, so it comes from a nodeclaim and may be created for the first time.
+        if len(nodes[domain]) == 0 {
+            // if the pod has a volume requirement, block this first-time domain, since its count is always 0 and could break the skew calculation.
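+            // the domain is still collected in firstDomains so it can be offered as a candidate when all viable domains are returned below.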
+            if hasVolumeRequirement {
+                firstDomains = append(firstDomains, domain)
+            } else {
+                continue
+            }
+        }
+        var needBlock = true
+        for _, node := range nodes[domain] {
+            if node.GetLabels()[t.Key] == domain && t.nodeFilter.Matches(node.Spec.Taints, scheduling.NewLabelRequirements(node.Labels)) {
+                needBlock = false
+                break
+            }
+        }
+        if needBlock {
+            blockedDomains.Insert(domain)
+        }
+    }
     // min count is calculated across all domains
-    min := t.domainMinCount(podDomains)
+    min := t.domainMinCount(podDomains, blockedDomains)
     selfSelecting := t.selects(pod)
 
     minDomain := ""
@@ -216,41 +258,51 @@ func (t *TopologyGroup) nextDomainTopologySpread(pod *corev1.Pod, podDomains, no
     // lot of t.domains but only a single nodeDomain
     if nodeDomains.Operator() == corev1.NodeSelectorOpIn {
         for _, domain := range nodeDomains.Values() {
-            if count, ok := t.domains[domain]; ok {
+            if count, ok := t.domains[domain]; ok && !blockedDomains.Has(domain) {
                 if selfSelecting {
                     count++
                 }
-                if count-min <= t.maxSkew && count < minCount {
-                    minDomain = domain
-                    minCount = count
+                if count-min <= t.maxSkew {
+                    candidateDomains = append(candidateDomains, domain)
+                    if count < minCount {
+                        minDomain = domain
+                        minCount = count
+                    }
                 }
             }
         }
     } else {
         for domain := range t.domains {
             // but we can only choose from the node domains
-            if nodeDomains.Has(domain) {
+            if nodeDomains.Has(domain) && !blockedDomains.Has(domain) {
                 // comment from kube-scheduler regarding the viable choices to schedule to based on skew is:
                 // 'existing matching num' + 'if self-match (1 or 0)' - 'global min matching num' <= 'maxSkew'
                 count := t.domains[domain]
                 if selfSelecting {
                     count++
                 }
-                if count-min <= t.maxSkew && count < minCount {
-                    minDomain = domain
-                    minCount = count
+                if count-min <= t.maxSkew {
+                    candidateDomains = append(candidateDomains, domain)
+                    if count < minCount {
+                        minDomain = domain
+                        minCount = count
+                    }
                 }
             }
         }
     }
-    if minDomain == "" {
+    if minDomain == "" && len(firstDomains) == 0 {
         // avoids an error message about 'zone in [""]', preferring 'zone in []'
         return scheduling.NewRequirement(podDomains.Key, corev1.NodeSelectorOpDoesNotExist)
     }
+    // for pods with volume requirements, return all candidate domains (plus any first-time domains) instead of just the min domain
+    if hasVolumeRequirement {
+        return scheduling.NewRequirement(podDomains.Key, corev1.NodeSelectorOpIn, append(firstDomains, candidateDomains...)...)
+    }
     return scheduling.NewRequirement(podDomains.Key, corev1.NodeSelectorOpIn, minDomain)
 }
 
-func (t *TopologyGroup) domainMinCount(domains *scheduling.Requirement) int32 {
+func (t *TopologyGroup) domainMinCount(domains *scheduling.Requirement, blockedDomains sets.Set[string]) int32 {
     // hostname based topologies always have a min pod count of zero since we can create one
     if t.Key == corev1.LabelHostname {
         return 0
@@ -260,7 +312,7 @@ func (t *TopologyGroup) domainMinCount(domains *scheduling.Requirement) int32 {
     var numPodSupportedDomains int32
     // determine our current min count
     for domain, count := range t.domains {
-        if domains.Has(domain) {
+        if domains.Has(domain) && !blockedDomains.Has(domain) {
             numPodSupportedDomains++
             if count < min {
                 min = count
diff --git a/pkg/controllers/provisioning/scheduling/volumetopology.go b/pkg/controllers/provisioning/scheduling/volumetopology.go
index 14d218eb13..d5552ddebb 100644
--- a/pkg/controllers/provisioning/scheduling/volumetopology.go
+++ b/pkg/controllers/provisioning/scheduling/volumetopology.go
@@ -39,42 +39,23 @@ type VolumeTopology struct {
     kubeClient client.Client
 }
 
-func (v *VolumeTopology) Inject(ctx context.Context, pod *v1.Pod) error {
+func (v *VolumeTopology) GetVolumeRequirements(ctx context.Context, pod *v1.Pod) ([]v1.NodeSelectorRequirement, error) {
     var requirements []v1.NodeSelectorRequirement
     for _, volume := range pod.Spec.Volumes {
         req, err := v.getRequirements(ctx, pod, volume)
         if err != nil {
-            return err
+            return nil, err
         }
         requirements = append(requirements, req...)
     }
     if len(requirements) == 0 {
-        return nil
-    }
-    if pod.Spec.Affinity == nil {
-        pod.Spec.Affinity = &v1.Affinity{}
-    }
-    if pod.Spec.Affinity.NodeAffinity == nil {
-        pod.Spec.Affinity.NodeAffinity = &v1.NodeAffinity{}
-    }
-    if pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil {
-        pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution = &v1.NodeSelector{}
-    }
-    if len(pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms) == 0 {
-        pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms = []v1.NodeSelectorTerm{{}}
-    }
-
-    // We add our volume topology zonal requirement to every node selector term. This causes it to be AND'd with every existing
-    // requirement so that relaxation won't remove our volume requirement.
-    for i := 0; i < len(pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms); i++ {
-        pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms[i].MatchExpressions = append(
-            pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms[i].MatchExpressions, requirements...)
+        return requirements, nil
     }
     log.FromContext(ctx).
         WithValues("Pod", klog.KObj(pod)).
         V(1).Info(fmt.Sprintf("adding requirements derived from pod volumes, %s", requirements))
-    return nil
+    return requirements, nil
 }
 
 func (v *VolumeTopology) getRequirements(ctx context.Context, pod *v1.Pod, volume v1.Volume) ([]v1.NodeSelectorRequirement, error) {