fix: Fix topology spread constraints with zonal volume #1907

Open · wants to merge 5 commits into base: main
28 changes: 21 additions & 7 deletions pkg/controllers/provisioning/provisioner.go
@@ -283,11 +283,25 @@ func (p *Provisioner) NewScheduler(ctx context.Context, pods []*corev1.Pod, stat
}
}

// inject topology constraints
pods = p.injectVolumeTopologyRequirements(ctx, pods)
// Add Existing Nodes' Domains
for _, n := range stateNodes {
if n.Node != nil {
requirements := scheduling.NewLabelRequirements(n.Node.Labels)
for key, requirement := range requirements {
if domains[key] == nil {
domains[key] = sets.New(requirement.Values()...)
} else {
domains[key].Insert(requirement.Values()...)
}
}
}
}

// Link volume requirements to pods
podsVolumeRequirements := p.convertToPodVolumeRequirements(ctx, pods)

// Calculate cluster topology
topology, err := scheduler.NewTopology(ctx, p.kubeClient, p.cluster, domains, pods)
topology, err := scheduler.NewTopology(ctx, p.kubeClient, p.cluster, domains, podsVolumeRequirements)
if err != nil {
return nil, fmt.Errorf("tracking topology counts, %w", err)
}
@@ -454,13 +468,13 @@ func validateKarpenterManagedLabelCanExist(p *corev1.Pod) error {
return nil
}

func (p *Provisioner) injectVolumeTopologyRequirements(ctx context.Context, pods []*corev1.Pod) []*corev1.Pod {
var schedulablePods []*corev1.Pod
func (p *Provisioner) convertToPodVolumeRequirements(ctx context.Context, pods []*corev1.Pod) map[*corev1.Pod][]corev1.NodeSelectorRequirement {
var schedulablePods = make(map[*corev1.Pod][]corev1.NodeSelectorRequirement)
for _, pod := range pods {
if err := p.volumeTopology.Inject(ctx, pod); err != nil {
if requirements, err := p.volumeTopology.GetVolumeRequirements(ctx, pod); err != nil {
log.FromContext(ctx).WithValues("Pod", klog.KRef(pod.Namespace, pod.Name)).Error(err, "failed getting volume topology requirements")
} else {
schedulablePods = append(schedulablePods, pod)
schedulablePods[pod] = requirements
}
}
return schedulablePods
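
A note on the shape of the refactor above: instead of injecting volume topology into each pod (the old Inject path), the provisioner now collects the volume-derived node selector requirements per pod and hands them to the topology tracker. The sketch below is illustrative only; zonalVolumeRequirement and the zone values are invented, not part of this PR.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

// zonalVolumeRequirement builds the kind of entry GetVolumeRequirements is
// expected to return for a pod whose claim binds to a zonal volume.
func zonalVolumeRequirement(zone string) corev1.NodeSelectorRequirement {
	return corev1.NodeSelectorRequirement{
		Key:      corev1.LabelTopologyZone, // "topology.kubernetes.io/zone"
		Operator: corev1.NodeSelectorOpIn,
		Values:   []string{zone},
	}
}

func main() {
	podWithZonalVolume := &corev1.Pod{}
	podWithoutVolumes := &corev1.Pod{}
	// The map produced by convertToPodVolumeRequirements: every schedulable pod is
	// a key; only pods backed by zonal volumes carry non-empty requirements.
	podVolumeRequirements := map[*corev1.Pod][]corev1.NodeSelectorRequirement{
		podWithZonalVolume: {zonalVolumeRequirement("test-zone-2")},
		podWithoutVolumes:  nil,
	}
	fmt.Println(len(podVolumeRequirements)) // 2
}
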
9 changes: 8 additions & 1 deletion pkg/controllers/provisioning/scheduling/existingnode.go
@@ -65,7 +65,7 @@ func NewExistingNode(n *state.StateNode, topology *Topology, taints []v1.Taint,
return node
}

func (n *ExistingNode) Add(ctx context.Context, kubeClient client.Client, pod *v1.Pod, podRequests v1.ResourceList) error {
func (n *ExistingNode) Add(ctx context.Context, kubeClient client.Client, pod *v1.Pod, podRequests v1.ResourceList, volumeRequirements []v1.NodeSelectorRequirement) error {
// Check Taints
if err := scheduling.Taints(n.cachedTaints).Tolerates(pod); err != nil {
return err
@@ -117,6 +117,13 @@ func (n *ExistingNode) Add(ctx context.Context, kubeClient client.Client, pod *v
}
nodeRequirements.Add(topologyRequirements.Values()...)

podVolumeRequirements := scheduling.NewNodeSelectorRequirements(volumeRequirements...)
// Check Pod Volume Requirements
if err = nodeRequirements.Compatible(podVolumeRequirements); err != nil {
return err
}
nodeRequirements.Add(podVolumeRequirements.Values()...)

// Update node
n.Pods = append(n.Pods, pod)
n.requests = requests
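
To make the new check concrete: if the existing node's labels pin it to one zone and the pod's volume can only be provisioned in another, Compatible fails and Add refuses the placement. A hedged, self-contained sketch using the same helpers the diff calls; the zone names are invented and the import path assumes the upstream sigs.k8s.io/karpenter module layout.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"

	"sigs.k8s.io/karpenter/pkg/scheduling"
)

func main() {
	// An existing node already labeled into test-zone-1.
	nodeRequirements := scheduling.NewLabelRequirements(map[string]string{
		corev1.LabelTopologyZone: "test-zone-1",
	})
	// A pod whose volume can only be provisioned in test-zone-2.
	podVolumeRequirements := scheduling.NewNodeSelectorRequirements(corev1.NodeSelectorRequirement{
		Key:      corev1.LabelTopologyZone,
		Operator: corev1.NodeSelectorOpIn,
		Values:   []string{"test-zone-2"},
	})
	// The zones conflict, so Compatible reports an error and the pod is rejected
	// for this node instead of stranding its volume in another zone.
	if err := nodeRequirements.Compatible(podVolumeRequirements); err != nil {
		fmt.Println("rejected:", err)
	}
}
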
9 changes: 8 additions & 1 deletion pkg/controllers/provisioning/scheduling/nodeclaim.go
@@ -64,7 +64,7 @@ func NewNodeClaim(nodeClaimTemplate *NodeClaimTemplate, topology *Topology, daem
}
}

func (n *NodeClaim) Add(pod *v1.Pod, podRequests v1.ResourceList) error {
func (n *NodeClaim) Add(pod *v1.Pod, podRequests v1.ResourceList, volumeRequirements []v1.NodeSelectorRequirement) error {
// Check Taints
if err := scheduling.Taints(n.Spec.Taints).Tolerates(pod); err != nil {
return err
@@ -100,6 +100,13 @@ func (n *NodeClaim) Add(pod *v1.Pod, podRequests v1.ResourceList) error {
}
nodeClaimRequirements.Add(topologyRequirements.Values()...)

podVolumeRequirements := scheduling.NewNodeSelectorRequirements(volumeRequirements...)
// Check Pod Volume Requirements
if err = nodeClaimRequirements.Compatible(podVolumeRequirements, scheduling.AllowUndefinedWellKnownLabels); err != nil {
return err
}
nodeClaimRequirements.Add(podVolumeRequirements.Values()...)

// Check instance type combinations
requests := resources.Merge(n.Spec.Resources.Requests, podRequests)
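
The NodeClaim path is the mirror image: an in-flight claim that has not pinned a zone yet passes the check (AllowUndefinedWellKnownLabels is taken here, by its name, to tolerate the undefined zone key), and Add then narrows the claim to the volume's zone. A hedged sketch under the same assumptions as the previous one.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"

	"sigs.k8s.io/karpenter/pkg/scheduling"
)

func main() {
	// A claim with no zone requirement yet (empty label set keeps the sketch small).
	nodeClaimRequirements := scheduling.NewLabelRequirements(map[string]string{})
	// The pod's volume pins it to test-zone-2.
	podVolumeRequirements := scheduling.NewNodeSelectorRequirements(corev1.NodeSelectorRequirement{
		Key:      corev1.LabelTopologyZone,
		Operator: corev1.NodeSelectorOpIn,
		Values:   []string{"test-zone-2"},
	})
	// The undefined zone key is tolerated, so the volume requirement is merged in
	// and the claim is now constrained to the volume's zone.
	if err := nodeClaimRequirements.Compatible(podVolumeRequirements, scheduling.AllowUndefinedWellKnownLabels); err == nil {
		nodeClaimRequirements.Add(podVolumeRequirements.Values()...)
	}
	fmt.Println(nodeClaimRequirements.Has(corev1.LabelTopologyZone)) // true
}
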

10 changes: 7 additions & 3 deletions pkg/controllers/provisioning/scheduling/scheduler.go
@@ -266,9 +266,13 @@ func (s *Scheduler) Solve(ctx context.Context, pods []*corev1.Pod) Results {
}

func (s *Scheduler) add(ctx context.Context, pod *corev1.Pod) error {
var volumeRequirements []corev1.NodeSelectorRequirement
if _, ok := s.topology.podVolumeRequirements[pod]; ok {
volumeRequirements = s.topology.podVolumeRequirements[pod]
}
// first try to schedule against an in-flight real node
for _, node := range s.existingNodes {
if err := node.Add(ctx, s.kubeClient, pod, s.cachedPodRequests[pod.UID]); err == nil {
if err := node.Add(ctx, s.kubeClient, pod, s.cachedPodRequests[pod.UID], volumeRequirements); err == nil {
return nil
}
}
@@ -278,7 +282,7 @@

// Pick existing node that we are about to create
for _, nodeClaim := range s.newNodeClaims {
if err := nodeClaim.Add(pod, s.cachedPodRequests[pod.UID]); err == nil {
if err := nodeClaim.Add(pod, s.cachedPodRequests[pod.UID], volumeRequirements); err == nil {
return nil
}
}
@@ -299,7 +303,7 @@ func (s *Scheduler) add(ctx context.Context, pod *corev1.Pod) error {
}
}
nodeClaim := NewNodeClaim(nodeClaimTemplate, s.topology, s.daemonOverhead[nodeClaimTemplate], instanceTypes)
if err := nodeClaim.Add(pod, s.cachedPodRequests[pod.UID]); err != nil {
if err := nodeClaim.Add(pod, s.cachedPodRequests[pod.UID], volumeRequirements); err != nil {
nodeClaim.Destroy() // Ensure we cleanup any changes that we made while mocking out a NodeClaim
errs = multierr.Append(errs, fmt.Errorf("incompatible with nodepool %q, daemonset overhead=%s, %w",
nodeClaimTemplate.NodePoolName,
@@ -168,7 +168,8 @@ func benchmarkScheduler(b *testing.B, instanceCount, podCount int) {
clock := &clock.RealClock{}
cluster = state.NewCluster(clock, client, cloudProvider)
domains := map[string]sets.Set[string]{}
topology, err := scheduling.NewTopology(ctx, client, cluster, domains, pods)
podsVolumeRequirements := make(map[*corev1.Pod][]corev1.NodeSelectorRequirement)
topology, err := scheduling.NewTopology(ctx, client, cluster, domains, podsVolumeRequirements)
if err != nil {
b.Fatalf("creating topology, %s", err)
}
142 changes: 142 additions & 0 deletions pkg/controllers/provisioning/scheduling/suite_test.go
@@ -3437,6 +3437,148 @@ var _ = Context("Scheduling", func() {
Expect(node.Name).ToNot(Equal(node2.Name))
})
})
Context("Pods with Zonal Volume and Topology Spread", func() {
var labels = map[string]string{"test": "test"}
var pvcs []*corev1.PersistentVolumeClaim
var pods []*corev1.Pod
var sc1 *storagev1.StorageClass
var sc2 *storagev1.StorageClass
var tsc = corev1.TopologySpreadConstraint{
MaxSkew: 1,
TopologyKey: corev1.LabelTopologyZone,
WhenUnsatisfiable: corev1.DoNotSchedule,
LabelSelector: &metav1.LabelSelector{MatchLabels: labels},
}
BeforeEach(func() {
pvcs = []*corev1.PersistentVolumeClaim{}
pods = []*corev1.Pod{}
sc1 = test.StorageClass(test.StorageClassOptions{
ObjectMeta: metav1.ObjectMeta{Name: "my-storage-class-1"},
Zones: []string{"test-zone-1"},
})
sc2 = test.StorageClass(test.StorageClassOptions{
ObjectMeta: metav1.ObjectMeta{Name: "my-storage-class-2"},
Zones: []string{"test-zone-2"},
})
for i := 0; i < 3; i++ {
// the first claim is in test-zone-1, the other two are in test-zone-2
scname := sc1.Name
if i > 0 {
scname = sc2.Name
}
pvc := test.PersistentVolumeClaim(test.PersistentVolumeClaimOptions{
ObjectMeta: metav1.ObjectMeta{Name: fmt.Sprintf("my-claim-%d", i)},
StorageClassName: lo.ToPtr(scname),
})
pod := test.UnschedulablePod(test.PodOptions{
// pod anti-affinity ensures each node runs at most one of these pods
PodAntiRequirements: []corev1.PodAffinityTerm{
{
LabelSelector: &metav1.LabelSelector{MatchLabels: labels},
TopologyKey: corev1.LabelHostname,
},
},
TopologySpreadConstraints: []corev1.TopologySpreadConstraint{tsc},
PersistentVolumeClaims: []string{pvc.Name},
ObjectMeta: metav1.ObjectMeta{Labels: labels},
})
pvcs = append(pvcs, pvc)
pods = append(pods, pod)
}
})
It("should launch nodes when volume zone is compatible with topology spread", func() {
node1 := test.Node(test.NodeOptions{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{corev1.LabelTopologyZone: "test-zone-1"},
},
})
node2 := test.Node(test.NodeOptions{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{corev1.LabelTopologyZone: "test-zone-2"},
},
})
ExpectApplied(ctx, env.Client, nodePool, sc1, sc2)
ExpectApplied(ctx, env.Client, pvcs[0], pvcs[1], pvcs[2])
ExpectApplied(ctx, env.Client, pods[0], pods[1], node1, node2)
ExpectManualBinding(ctx, env.Client, pods[0], node1)
ExpectManualBinding(ctx, env.Client, pods[1], node2)

ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{node1, node2}, nil)

ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pods[2])
ExpectScheduled(ctx, env.Client, pods[2])
})
It("should not launch nodes when volume zone is not compatible with topology spread", func() {
node1 := test.Node(test.NodeOptions{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{corev1.LabelTopologyZone: "test-zone-1"},
},
})
node2 := test.Node(test.NodeOptions{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{corev1.LabelTopologyZone: "test-zone-2"},
},
})
node3 := test.Node(test.NodeOptions{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{corev1.LabelTopologyZone: "test-zone-3"},
},
})

ExpectApplied(ctx, env.Client, nodePool, sc1, sc2)
ExpectApplied(ctx, env.Client, pvcs[0], pvcs[1], pvcs[2])
ExpectApplied(ctx, env.Client, pods[0], pods[1], node1, node2, node3)
ExpectManualBinding(ctx, env.Client, pods[0], node1)
ExpectManualBinding(ctx, env.Client, pods[1], node2)

ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{node1, node2, node3}, nil)

ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pods[2])
// topology spread would place the 3rd pod in test-zone-3, but its volume must be provisioned in test-zone-2
ExpectNotScheduled(ctx, env.Client, pods[2])

})
It("only nodes matching nodeAffinity/nodeSelector are included in the calculations by default", func() {
node1 := test.Node(test.NodeOptions{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{corev1.LabelTopologyZone: "test-zone-1", "test": "test"},
},
})
node2 := test.Node(test.NodeOptions{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{corev1.LabelTopologyZone: "test-zone-2", "test": "test"},
},
})
node3 := test.Node(test.NodeOptions{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{corev1.LabelTopologyZone: "test-zone-3"},
},
})
nodePool.Spec.Template.Spec.Requirements = []v1.NodeSelectorRequirementWithMinValues{
{
NodeSelectorRequirement: corev1.NodeSelectorRequirement{
Key: "test",
Operator: corev1.NodeSelectorOpIn,
Values: []string{"test"},
},
},
}
pods[2].Spec.NodeSelector = map[string]string{"test": "test"}

ExpectApplied(ctx, env.Client, nodePool, sc1, sc2)
ExpectApplied(ctx, env.Client, pvcs[0], pvcs[1], pvcs[2])
ExpectApplied(ctx, env.Client, pods[0], pods[1], node1, node2, node3)
ExpectManualBinding(ctx, env.Client, pods[0], node1)
ExpectManualBinding(ctx, env.Client, pods[1], node2)

ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{node1, node2, node3}, nil)

ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pods[2])
// since no node in test-zone-3 has the label test, only test-zone-1 and test-zone-2 are included in the calculations.
ExpectScheduled(ctx, env.Client, pods[2])

})
})
})

Describe("Deleting Nodes", func() {
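For readers unfamiliar with the test helpers used above: a storage class built with Zones: []string{"test-zone-2"} is assumed to expand to an AllowedTopologies term pinning provisioning to that zone, which is what forces the third pod's volume into test-zone-2. A plain client-go sketch of such a StorageClass follows; the provisioner name is a placeholder.

package main

import (
	corev1 "k8s.io/api/core/v1"
	storagev1 "k8s.io/api/storage/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// zonalStorageClass approximates what the test helper is assumed to produce for
// a single-zone storage class.
func zonalStorageClass(name, zone string) *storagev1.StorageClass {
	bindingMode := storagev1.VolumeBindingWaitForFirstConsumer
	return &storagev1.StorageClass{
		ObjectMeta:        metav1.ObjectMeta{Name: name},
		Provisioner:       "example.com/placeholder-csi", // placeholder, not part of this PR
		VolumeBindingMode: &bindingMode,
		AllowedTopologies: []corev1.TopologySelectorTerm{{
			MatchLabelExpressions: []corev1.TopologySelectorLabelRequirement{{
				Key:    corev1.LabelTopologyZone,
				Values: []string{zone},
			}},
		}},
	}
}

func main() {
	_ = zonalStorageClass("my-storage-class-2", "test-zone-2")
}
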
34 changes: 19 additions & 15 deletions pkg/controllers/provisioning/scheduling/topology.go
@@ -54,28 +54,32 @@ type Topology struct {
// excludedPods are the pod UIDs of pods that are excluded from counting. This is used so we can simulate
// moving pods to prevent them from being double counted.
excludedPods sets.Set[string]
cluster *state.Cluster
// podVolumeRequirements links volume requirements to pods. This is used so we
// can track volume requirements in the simulated scheduler.
podVolumeRequirements map[*corev1.Pod][]corev1.NodeSelectorRequirement
cluster *state.Cluster
}

func NewTopology(ctx context.Context, kubeClient client.Client, cluster *state.Cluster, domains map[string]sets.Set[string], pods []*corev1.Pod) (*Topology, error) {
func NewTopology(ctx context.Context, kubeClient client.Client, cluster *state.Cluster, domains map[string]sets.Set[string], podsVolumeRequirements map[*corev1.Pod][]corev1.NodeSelectorRequirement) (*Topology, error) {
t := &Topology{
kubeClient: kubeClient,
cluster: cluster,
domains: domains,
topologies: map[uint64]*TopologyGroup{},
inverseTopologies: map[uint64]*TopologyGroup{},
excludedPods: sets.New[string](),
kubeClient: kubeClient,
cluster: cluster,
domains: domains,
topologies: map[uint64]*TopologyGroup{},
inverseTopologies: map[uint64]*TopologyGroup{},
excludedPods: sets.New[string](),
podVolumeRequirements: podsVolumeRequirements,
}

// these are the pods that we intend to schedule, so if they are currently in the cluster we shouldn't count them for
// topology purposes
for _, p := range pods {
for p := range podsVolumeRequirements {
t.excludedPods.Insert(string(p.UID))
}

errs := t.updateInverseAffinities(ctx)
for i := range pods {
errs = multierr.Append(errs, t.Update(ctx, pods[i]))
for p := range podsVolumeRequirements {
errs = multierr.Append(errs, t.Update(ctx, p))
}
if errs != nil {
return nil, errs
@@ -174,7 +178,7 @@ func (t *Topology) AddRequirements(podRequirements, nodeRequirements scheduling.
if nodeRequirements.Has(topology.Key) {
nodeDomains = nodeRequirements.Get(topology.Key)
}
domains := topology.Get(p, podDomains, nodeDomains)
domains := topology.Get(p, podDomains, nodeDomains, len(t.podVolumeRequirements[p]) != 0)
if domains.Len() == 0 {
return nil, topologyError{
topology: topology,
@@ -245,7 +249,7 @@ func (t *Topology) updateInverseAntiAffinity(ctx context.Context, pod *corev1.Po
return err
}

tg := NewTopologyGroup(TopologyTypePodAntiAffinity, term.TopologyKey, pod, namespaces, term.LabelSelector, math.MaxInt32, nil, t.domains[term.TopologyKey])
tg := NewTopologyGroup(TopologyTypePodAntiAffinity, term.TopologyKey, pod, t.cluster, namespaces, term.LabelSelector, math.MaxInt32, nil, t.domains[term.TopologyKey])

hash := tg.Hash()
if existing, ok := t.inverseTopologies[hash]; !ok {
@@ -323,7 +327,7 @@ func (t *Topology) countDomains(ctx context.Context, tg *TopologyGroup) error {
func (t *Topology) newForTopologies(p *corev1.Pod) []*TopologyGroup {
var topologyGroups []*TopologyGroup
for _, cs := range p.Spec.TopologySpreadConstraints {
topologyGroups = append(topologyGroups, NewTopologyGroup(TopologyTypeSpread, cs.TopologyKey, p, sets.New(p.Namespace), cs.LabelSelector, cs.MaxSkew, cs.MinDomains, t.domains[cs.TopologyKey]))
topologyGroups = append(topologyGroups, NewTopologyGroup(TopologyTypeSpread, cs.TopologyKey, p, t.cluster, sets.New(p.Namespace), cs.LabelSelector, cs.MaxSkew, cs.MinDomains, t.domains[cs.TopologyKey]))
}
return topologyGroups
}
@@ -360,7 +364,7 @@ func (t *Topology) newForAffinities(ctx context.Context, p *corev1.Pod) ([]*Topo
if err != nil {
return nil, err
}
topologyGroups = append(topologyGroups, NewTopologyGroup(topologyType, term.TopologyKey, p, namespaces, term.LabelSelector, math.MaxInt32, nil, t.domains[term.TopologyKey]))
topologyGroups = append(topologyGroups, NewTopologyGroup(topologyType, term.TopologyKey, p, t.cluster, namespaces, term.LabelSelector, math.MaxInt32, nil, t.domains[term.TopologyKey]))
}
}
return topologyGroups, nil
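
Since NewTopology now takes the pod-to-volume-requirements map rather than a pod slice, its excluded-pod set and per-pod topology updates are driven by the map's keys; the provisioner therefore records every schedulable pod as a key, with what is presumably an empty slice for pods that have no zonal volumes. A hedged sketch of that wiring for a caller outside the provisioner; the function name and import aliases are illustrative.

package example

import (
	"context"
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/sets"
	"sigs.k8s.io/controller-runtime/pkg/client"

	scheduler "sigs.k8s.io/karpenter/pkg/controllers/provisioning/scheduling"
	"sigs.k8s.io/karpenter/pkg/controllers/state"
)

// newTopologyForPods registers every pod as a key so exclusion and topology
// updates still cover pods that carry no volume constraints.
func newTopologyForPods(ctx context.Context, kubeClient client.Client, cluster *state.Cluster,
	domains map[string]sets.Set[string], pods []*corev1.Pod) (*scheduler.Topology, error) {
	podVolumeRequirements := map[*corev1.Pod][]corev1.NodeSelectorRequirement{}
	for _, p := range pods {
		podVolumeRequirements[p] = nil
	}
	topology, err := scheduler.NewTopology(ctx, kubeClient, cluster, domains, podVolumeRequirements)
	if err != nil {
		return nil, fmt.Errorf("tracking topology counts, %w", err)
	}
	return topology, nil
}
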