From e6cc708a85f60b16463709e825a3fb54dfd64b0d Mon Sep 17 00:00:00 2001 From: Bharadwaj Anupindi Date: Wed, 27 Aug 2025 11:37:27 +0900 Subject: [PATCH 01/22] Add grace period before entering into Emergency mode --- api/autoscaling/v2/webhook_suite_test.go | 2 +- cmd/main.go | 2 +- .../controller/tortoise_controller_test.go | 2 +- pkg/hpa/service.go | 44 ++- pkg/hpa/service_test.go | 267 +++++++++++++++++- 5 files changed, 300 insertions(+), 17 deletions(-) diff --git a/api/autoscaling/v2/webhook_suite_test.go b/api/autoscaling/v2/webhook_suite_test.go index 03f77bfb..a36ab99c 100644 --- a/api/autoscaling/v2/webhook_suite_test.go +++ b/api/autoscaling/v2/webhook_suite_test.go @@ -168,7 +168,7 @@ var _ = BeforeSuite(func() { eventRecorder := mgr.GetEventRecorderFor("tortoise-controller") tortoiseService, err := tortoise.New(mgr.GetClient(), eventRecorder, config.RangeOfMinMaxReplicasRecommendationHours, config.TimeZone, config.TortoiseUpdateInterval, config.GatheringDataPeriodType) Expect(err).NotTo(HaveOccurred()) - hpaService, err := hpa.New(mgr.GetClient(), eventRecorder, config.ReplicaReductionFactor, config.MaximumTargetResourceUtilization, 100, time.Hour, nil, 1000, 10000, 3, "") + hpaService, err := hpa.New(mgr.GetClient(), eventRecorder, config.ReplicaReductionFactor, config.MaximumTargetResourceUtilization, 100, time.Hour, nil, 1000, 10000, 3, "", 5*time.Minute) Expect(err).NotTo(HaveOccurred()) hpaWebhook := New(tortoiseService, hpaService) diff --git a/cmd/main.go b/cmd/main.go index 93a90ca1..ce789e40 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -152,7 +152,7 @@ func main() { os.Exit(1) } - hpaService, err := hpa.New(mgr.GetClient(), eventRecorder, config.ReplicaReductionFactor, config.MaximumTargetResourceUtilization, config.HPATargetUtilizationMaxIncrease, config.HPATargetUtilizationUpdateInterval, config.DefaultHPABehavior, config.MaximumMinReplicas, config.MaximumMaxReplicas, int32(config.MinimumMinReplicas), config.HPAExternalMetricExclusionRegex) + hpaService, err := hpa.New(mgr.GetClient(), eventRecorder, config.ReplicaReductionFactor, config.MaximumTargetResourceUtilization, config.HPATargetUtilizationMaxIncrease, config.HPATargetUtilizationUpdateInterval, config.DefaultHPABehavior, config.MaximumMinReplicas, config.MaximumMaxReplicas, int32(config.MinimumMinReplicas), config.HPAExternalMetricExclusionRegex, 5*time.Minute) if err != nil { setupLog.Error(err, "unable to start hpa service") os.Exit(1) diff --git a/internal/controller/tortoise_controller_test.go b/internal/controller/tortoise_controller_test.go index 7429577f..418cb911 100644 --- a/internal/controller/tortoise_controller_test.go +++ b/internal/controller/tortoise_controller_test.go @@ -259,7 +259,7 @@ func startController(ctx context.Context) func() { Expect(err).ShouldNot(HaveOccurred()) cli, err := vpa.New(mgr.GetConfig(), recorder) Expect(err).ShouldNot(HaveOccurred()) - hpaS, err := hpa.New(mgr.GetClient(), recorder, 0.95, 90, 25, time.Hour, nil, 1000, 10000, 3, ".*-exclude-metric") + hpaS, err := hpa.New(mgr.GetClient(), recorder, 0.95, 90, 25, time.Hour, nil, 1000, 10000, 3, ".*-exclude-metric", 5*time.Minute) Expect(err).ShouldNot(HaveOccurred()) reconciler := &TortoiseReconciler{ Scheme: scheme, diff --git a/pkg/hpa/service.go b/pkg/hpa/service.go index 099a00a1..ba940b7d 100644 --- a/pkg/hpa/service.go +++ b/pkg/hpa/service.go @@ -23,7 +23,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" - "github.com/mercari/tortoise/api/v1beta3" autoscalingv1beta3 
"github.com/mercari/tortoise/api/v1beta3" "github.com/mercari/tortoise/pkg/event" "github.com/mercari/tortoise/pkg/metrics" @@ -43,6 +42,7 @@ type Service struct { maximumMinReplica int32 maximumMaxReplica int32 externalMetricExclusionRegex *regexp.Regexp + emergencyModeGracePeriod time.Duration } var defaultHPABehaviorValue = &v2.HorizontalPodAutoscalerBehavior{ @@ -77,6 +77,7 @@ func New( maximumMinReplica, maximumMaxReplica int32, minimumMinReplicas int32, externalMetricExclusionRegex string, + emergencyModeGracePeriod time.Duration, ) (*Service, error) { var regex *regexp.Regexp if externalMetricExclusionRegex != "" { @@ -93,6 +94,12 @@ func New( defaultHPABehavior = defaultHPABehaviorValue } + // Default emergency mode grace period if not provided + // This prevents false emergency mode triggers during temporary HPA metric unavailability + if emergencyModeGracePeriod == 0 { + emergencyModeGracePeriod = 5 * time.Minute + } + return &Service{ c: c, replicaReductionFactor: replicaReductionFactor, @@ -105,6 +112,7 @@ func New( minimumMinReplicas: minimumMinReplicas, maximumMaxReplica: maximumMaxReplica, externalMetricExclusionRegex: regex, + emergencyModeGracePeriod: emergencyModeGracePeriod, }, nil } @@ -220,7 +228,7 @@ func (c *Service) syncHPAMetricsWithTortoiseAutoscalingPolicy(ctx context.Contex } currenthpa.Spec.Metrics = append(currenthpa.Spec.Metrics, m) hpaEdited = true - tortoise = utils.ChangeTortoiseContainerResourcePhase(tortoise, d.containerName, d.rn, now, v1beta3.ContainerResourcePhaseGatheringData) + tortoise = utils.ChangeTortoiseContainerResourcePhase(tortoise, d.containerName, d.rn, now, autoscalingv1beta3.ContainerResourcePhaseGatheringData) } // remove metrics @@ -352,7 +360,7 @@ func (s *Service) RecordHPATargetUtilizationUpdate(tortoise *autoscalingv1beta3. } func (c *Service) ChangeHPAFromTortoiseRecommendation(tortoise *autoscalingv1beta3.Tortoise, hpa *v2.HorizontalPodAutoscaler, now time.Time, recordMetrics bool) (*v2.HorizontalPodAutoscaler, *autoscalingv1beta3.Tortoise, error) { - if tortoise.Status.TortoisePhase == v1beta3.TortoisePhaseInitializing || tortoise.Status.TortoisePhase == "" { + if tortoise.Status.TortoisePhase == autoscalingv1beta3.TortoisePhaseInitializing || tortoise.Status.TortoisePhase == "" { // Tortoise is not ready, don't update HPA return hpa, tortoise, nil } @@ -704,7 +712,7 @@ func (c *Service) updateHPATargetValue(hpa *v2.HorizontalPodAutoscaler, containe return fmt.Errorf("no corresponding metric found: %s/%s", hpa.Namespace, hpa.Name) } -func recordHPAMetric(ctx context.Context, tortoise *v1beta3.Tortoise, hpa *v2.HorizontalPodAutoscaler) { +func recordHPAMetric(ctx context.Context, tortoise *autoscalingv1beta3.Tortoise, hpa *v2.HorizontalPodAutoscaler) { for _, policies := range tortoise.Status.AutoscalingPolicy { for k, p := range policies.Policy { if p != autoscalingv1beta3.AutoscalingTypeHorizontal { @@ -786,6 +794,9 @@ func (c *Service) excludeExternalMetric(ctx context.Context, hpa *v2.HorizontalP return newHPA } +// IsHpaMetricAvailable checks if HPA metrics are available for decision making. +// It includes a configurable grace period before triggering emergency mode to handle +// temporary metric unavailability during HPA updates, deployments, or other transient issues. 
func (c *Service) IsHpaMetricAvailable(ctx context.Context, tortoise *autoscalingv1beta3.Tortoise, currenthpa *v2.HorizontalPodAutoscaler) bool { logger := log.FromContext(ctx) if !HasHorizontal(tortoise) { @@ -804,26 +815,43 @@ func (c *Service) IsHpaMetricAvailable(ctx context.Context, tortoise *autoscalin for _, condition := range conditions { if condition.Type == "ScalingActive" && condition.Status == "False" && condition.Reason == "FailedGetResourceMetric" { - // switch to Emergency mode since no metrics - logger.Info("HPA failed to get resource metrics, switch to emergency mode") + // Always apply grace period before triggering emergency mode to handle temporary + // metric unavailability during HPA updates, deployments, scheduled scaling, etc. + conditionAge := time.Since(condition.LastTransitionTime.Time) + if conditionAge < c.emergencyModeGracePeriod { + logger.Info("HPA metric temporarily unavailable, giving grace time before emergency mode", + "gracePeriod", c.emergencyModeGracePeriod, + "conditionAge", conditionAge) + return true // Give grace period before emergency mode + } + // Grace period expired, switch to Emergency mode since no metrics + logger.Info("HPA failed to get resource metrics after grace period, switch to emergency mode", + "gracePeriod", c.emergencyModeGracePeriod, + "conditionAge", conditionAge) return false } } + hasValidMetrics := false for _, currentMetric := range currentMetrics { switch currentMetric.Type { case v2.ContainerResourceMetricSourceType: if currentMetric.ContainerResource != nil && !currentMetric.ContainerResource.Current.Value.IsZero() { // Can still get metrics for some containers, they can scale based on those - return true + hasValidMetrics = true } case v2.ExternalMetricSourceType: if currentMetric.External != nil && !currentMetric.External.Current.Value.IsZero() { // External metrics are also valid - return true + hasValidMetrics = true } } } + + if hasValidMetrics { + return true + } + logger.Info("HPA looks unready because all the metrics indicate zero", "hpa", currenthpa.Name) return false } diff --git a/pkg/hpa/service_test.go b/pkg/hpa/service_test.go index e6a02298..e381a4d5 100644 --- a/pkg/hpa/service_test.go +++ b/pkg/hpa/service_test.go @@ -2750,7 +2750,7 @@ func TestClient_UpdateHPAFromTortoiseRecommendation(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - c, err := New(fake.NewClientBuilder().WithRuntimeObjects(tt.initialHPA).Build(), record.NewFakeRecorder(10), 0.95, 90, 50, time.Hour, nil, 1000, 10001, 3, tt.excludeMetricRegex) + c, err := New(fake.NewClientBuilder().WithRuntimeObjects(tt.initialHPA).Build(), record.NewFakeRecorder(10), 0.95, 90, 50, time.Hour, nil, 1000, 10001, 3, tt.excludeMetricRegex, 5*time.Minute) if err != nil { t.Fatalf("New() error = %v", err) } @@ -3063,12 +3063,12 @@ func TestService_InitializeHPA(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - c, err := New(fake.NewClientBuilder().Build(), record.NewFakeRecorder(10), 0.95, 90, 100, time.Hour, nil, 100, 1000, 3, "") + c, err := New(fake.NewClientBuilder().Build(), record.NewFakeRecorder(10), 0.95, 90, 100, time.Hour, nil, 100, 1000, 3, "", 5*time.Minute) if err != nil { t.Fatalf("New() error = %v", err) } if tt.initialHPA != nil { - c, err = New(fake.NewClientBuilder().WithRuntimeObjects(tt.initialHPA).Build(), record.NewFakeRecorder(10), 0.95, 90, 100, time.Hour, nil, 100, 1000, 3, "") + c, err = New(fake.NewClientBuilder().WithRuntimeObjects(tt.initialHPA).Build(), 
record.NewFakeRecorder(10), 0.95, 90, 100, time.Hour, nil, 100, 1000, 3, "", 5*time.Minute) if err != nil { t.Fatalf("New() error = %v", err) } @@ -4621,12 +4621,12 @@ func TestService_UpdateHPASpecFromTortoiseAutoscalingPolicy(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - c, err := New(fake.NewClientBuilder().Build(), record.NewFakeRecorder(10), 0.95, 90, 100, time.Hour, nil, 1000, 10000, 3, "") + c, err := New(fake.NewClientBuilder().Build(), record.NewFakeRecorder(10), 0.95, 90, 100, time.Hour, nil, 1000, 10000, 3, "", 5*time.Minute) if err != nil { t.Fatalf("New() error = %v", err) } if tt.initialHPA != nil { - c, err = New(fake.NewClientBuilder().WithRuntimeObjects(tt.initialHPA).Build(), record.NewFakeRecorder(10), 0.95, 90, 100, time.Hour, nil, 1000, 10000, 3, "") + c, err = New(fake.NewClientBuilder().WithRuntimeObjects(tt.initialHPA).Build(), record.NewFakeRecorder(10), 0.95, 90, 100, time.Hour, nil, 1000, 10000, 3, "", 5*time.Minute) if err != nil { t.Fatalf("New() error = %v", err) } @@ -5250,7 +5250,7 @@ func TestService_IsHpaMetricAvailable(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - c, err := New(fake.NewClientBuilder().Build(), record.NewFakeRecorder(10), 0.95, 90, 100, time.Hour, nil, 100, 1000, 3, "") + c, err := New(fake.NewClientBuilder().Build(), record.NewFakeRecorder(10), 0.95, 90, 100, time.Hour, nil, 100, 1000, 3, "", 5*time.Minute) if err != nil { t.Fatalf("New() error = %v", err) } @@ -5262,3 +5262,258 @@ func TestService_IsHpaMetricAvailable(t *testing.T) { }) } } + +func TestService_IsHpaMetricAvailable_EmergencyModeGracePeriod(t *testing.T) { + now := time.Now() + + // Tortoise with horizontal scaling policy + horizontalTortoise := &v1beta3.Tortoise{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-tortoise", + Namespace: "default", + }, + Status: v1beta3.TortoiseStatus{ + AutoscalingPolicy: []v1beta3.ContainerAutoscalingPolicy{ + { + ContainerName: "app", + Policy: map[v1.ResourceName]v1beta3.AutoscalingType{ + v1.ResourceCPU: v1beta3.AutoscalingTypeHorizontal, + }, + }, + }, + }, + } + + tests := []struct { + name string + tortoise *v1beta3.Tortoise + hpa *v2.HorizontalPodAutoscaler + emergencyModeGracePeriod time.Duration + expected bool + description string + }{ + { + name: "Grace period active - recent failure should not trigger emergency mode", + tortoise: horizontalTortoise, + hpa: &v2.HorizontalPodAutoscaler{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hpa", + Namespace: "default", + }, + Status: v2.HorizontalPodAutoscalerStatus{ + Conditions: []v2.HorizontalPodAutoscalerCondition{ + { + Type: v2.ScalingActive, + Status: "False", + Reason: "FailedGetResourceMetric", + Message: "failed to get cpu utilization: unable to fetch metrics from resource metrics API", + LastTransitionTime: metav1.NewTime(now.Add(-2 * time.Minute)), // Recent failure + }, + }, + CurrentMetrics: []v2.MetricStatus{ + { + Type: "ContainerResource", + ContainerResource: &v2.ContainerResourceMetricStatus{ + Container: "app", + Name: v1.ResourceCPU, + Current: v2.MetricValueStatus{ + AverageUtilization: ptr.To[int32](0), + AverageValue: resource.NewQuantity(0, resource.DecimalSI), + Value: resource.NewQuantity(0, resource.DecimalSI), + }, + }, + }, + }, + }, + }, + emergencyModeGracePeriod: 5 * time.Minute, // Grace period longer than failure time + expected: true, // Should NOT trigger emergency mode + description: "Recent failure within grace period should return true (no emergency)", + }, + { + name: "Grace 
period expired - old failure should trigger emergency mode", + tortoise: horizontalTortoise, + hpa: &v2.HorizontalPodAutoscaler{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hpa", + Namespace: "default", + }, + Status: v2.HorizontalPodAutoscalerStatus{ + Conditions: []v2.HorizontalPodAutoscalerCondition{ + { + Type: v2.ScalingActive, + Status: "False", + Reason: "FailedGetResourceMetric", + Message: "failed to get cpu utilization: unable to fetch metrics from resource metrics API", + LastTransitionTime: metav1.NewTime(now.Add(-10 * time.Minute)), // Old failure + }, + }, + CurrentMetrics: []v2.MetricStatus{ + { + Type: "ContainerResource", + ContainerResource: &v2.ContainerResourceMetricStatus{ + Container: "app", + Name: v1.ResourceCPU, + Current: v2.MetricValueStatus{ + AverageUtilization: ptr.To[int32](0), + AverageValue: resource.NewQuantity(0, resource.DecimalSI), + Value: resource.NewQuantity(0, resource.DecimalSI), + }, + }, + }, + }, + }, + }, + emergencyModeGracePeriod: 5 * time.Minute, // Grace period shorter than failure time + expected: false, // Should trigger emergency mode + description: "Old failure beyond grace period should return false (trigger emergency)", + }, + { + name: "Short grace period - should trigger emergency mode quickly", + tortoise: horizontalTortoise, + hpa: &v2.HorizontalPodAutoscaler{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hpa", + Namespace: "default", + }, + Status: v2.HorizontalPodAutoscalerStatus{ + Conditions: []v2.HorizontalPodAutoscalerCondition{ + { + Type: v2.ScalingActive, + Status: "False", + Reason: "FailedGetResourceMetric", + Message: "failed to get cpu utilization", + LastTransitionTime: metav1.NewTime(now.Add(-2 * time.Minute)), // 2 minutes ago + }, + }, + CurrentMetrics: []v2.MetricStatus{ + { + Type: "ContainerResource", + ContainerResource: &v2.ContainerResourceMetricStatus{ + Container: "app", + Name: v1.ResourceCPU, + Current: v2.MetricValueStatus{ + AverageUtilization: ptr.To[int32](0), + AverageValue: resource.NewQuantity(0, resource.DecimalSI), + Value: resource.NewQuantity(0, resource.DecimalSI), + }, + }, + }, + }, + }, + }, + emergencyModeGracePeriod: 1 * time.Minute, // Short grace period + expected: false, // Should trigger emergency mode + description: "Short grace period should allow emergency mode to trigger sooner", + }, + { + name: "No failure condition - should work normally", + tortoise: horizontalTortoise, + hpa: &v2.HorizontalPodAutoscaler{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hpa", + Namespace: "default", + }, + Status: v2.HorizontalPodAutoscalerStatus{ + Conditions: []v2.HorizontalPodAutoscalerCondition{ + { + Type: v2.ScalingActive, + Status: "True", // Working normally + Reason: "ValidMetricFound", + Message: "the HPA was able to successfully calculate a replica count", + LastTransitionTime: metav1.NewTime(now.Add(-1 * time.Minute)), + }, + }, + CurrentMetrics: []v2.MetricStatus{ + { + Type: "ContainerResource", + ContainerResource: &v2.ContainerResourceMetricStatus{ + Container: "app", + Name: v1.ResourceCPU, + Current: v2.MetricValueStatus{ + AverageUtilization: ptr.To[int32](50), // Normal utilization + AverageValue: resource.NewQuantity(100, resource.DecimalSI), + Value: resource.NewQuantity(100, resource.DecimalSI), + }, + }, + }, + }, + }, + }, + emergencyModeGracePeriod: 5 * time.Minute, + expected: true, // Should work normally + description: "Normal HPA operation should not be affected by grace period", + }, + { + name: "Multiple failure conditions - should respect grace period for 
all", + tortoise: horizontalTortoise, + hpa: &v2.HorizontalPodAutoscaler{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hpa", + Namespace: "default", + }, + Status: v2.HorizontalPodAutoscalerStatus{ + Conditions: []v2.HorizontalPodAutoscalerCondition{ + { + Type: v2.ScalingActive, + Status: "False", + Reason: "FailedGetResourceMetric", + Message: "failed to get cpu utilization", + LastTransitionTime: metav1.NewTime(now.Add(-10 * time.Minute)), // Old failure + }, + { + Type: v2.AbleToScale, + Status: "False", + Reason: "FailedGetScale", + Message: "failed to get scale", + LastTransitionTime: metav1.NewTime(now.Add(-2 * time.Minute)), // Recent failure + }, + }, + CurrentMetrics: []v2.MetricStatus{ + { + Type: "ContainerResource", + ContainerResource: &v2.ContainerResourceMetricStatus{ + Container: "app", + Name: v1.ResourceCPU, + Current: v2.MetricValueStatus{ + AverageUtilization: ptr.To[int32](0), + AverageValue: resource.NewQuantity(0, resource.DecimalSI), + Value: resource.NewQuantity(0, resource.DecimalSI), + }, + }, + }, + }, + }, + }, + emergencyModeGracePeriod: 5 * time.Minute, + expected: false, // Should trigger emergency mode due to old condition + description: "With multiple conditions, should trigger emergency if any are beyond grace period", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Create service with the specified grace period + c, err := New( + fake.NewClientBuilder().Build(), + record.NewFakeRecorder(10), + 0.95, 90, 100, + time.Hour, + nil, + 100, 1000, 3, + "", + tt.emergencyModeGracePeriod, + ) + if err != nil { + t.Fatalf("New() error = %v", err) + } + + result := c.IsHpaMetricAvailable(context.Background(), tt.tortoise, tt.hpa) + + if result != tt.expected { + t.Errorf("Test '%s' failed: %s\nExpected: %v, Got: %v", + tt.name, tt.description, tt.expected, result) + } + }) + } +} From 4a678d93c79026366c401cf8ac761cdf0d327ff0 Mon Sep 17 00:00:00 2001 From: Bharadwaj Anupindi Date: Wed, 20 Aug 2025 14:49:39 +0900 Subject: [PATCH 02/22] initial scheduled scaling feature implementation and documentations --- api/v1alpha1/groupversion_info.go | 18 + api/v1alpha1/scheduledscaling_types.go | 146 +++++++ api/v1alpha1/zz_generated.deepcopy.go | 109 +++++ cmd/main.go | 11 + ...scaling.mercari.com_scheduledscalings.yaml | 160 ++++++++ config/rbac/role.yaml | 3 + docs/advanced-hpa-configuration.md | 291 ++++++++++++++ docs/scheduled-scaling-testing-guide.md | 372 ++++++++++++++++++ docs/scheduled-scaling.md | 330 ++++++++++++++++ .../controller/scheduledscaling_controller.go | 185 +++++++++ .../scheduledscaling_controller_test.go | 143 +++++++ 11 files changed, 1768 insertions(+) create mode 100644 api/v1alpha1/groupversion_info.go create mode 100644 api/v1alpha1/scheduledscaling_types.go create mode 100644 api/v1alpha1/zz_generated.deepcopy.go create mode 100644 config/crd/bases/autoscaling.mercari.com_scheduledscalings.yaml create mode 100644 docs/advanced-hpa-configuration.md create mode 100644 docs/scheduled-scaling-testing-guide.md create mode 100644 docs/scheduled-scaling.md create mode 100644 internal/controller/scheduledscaling_controller.go create mode 100644 internal/controller/scheduledscaling_controller_test.go diff --git a/api/v1alpha1/groupversion_info.go b/api/v1alpha1/groupversion_info.go new file mode 100644 index 00000000..1df4db52 --- /dev/null +++ b/api/v1alpha1/groupversion_info.go @@ -0,0 +1,18 @@ +// +groupName=autoscaling.mercari.com +package v1alpha1 + +import ( + "k8s.io/apimachinery/pkg/runtime/schema" + 
"sigs.k8s.io/controller-runtime/pkg/scheme" +) + +var ( + // GroupVersion is group version used to register these objects + GroupVersion = schema.GroupVersion{Group: "autoscaling.mercari.com", Version: "v1alpha1"} + + // SchemeBuilder is used to add go types to the GroupVersionKind scheme + SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} + + // AddToScheme adds the types in this group-version to the given scheme. + AddToScheme = SchemeBuilder.AddToScheme +) diff --git a/api/v1alpha1/scheduledscaling_types.go b/api/v1alpha1/scheduledscaling_types.go new file mode 100644 index 00000000..e19a7fef --- /dev/null +++ b/api/v1alpha1/scheduledscaling_types.go @@ -0,0 +1,146 @@ +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// ScheduledScalingSpec defines the desired state of ScheduledScaling +type ScheduledScalingSpec struct { + // Schedule defines when the scaling should occur + // +kubebuilder:validation:Required + Schedule Schedule `json:"schedule"` + + // TargetRefs specifies which resources this scheduled scaling should affect + // +kubebuilder:validation:Required + TargetRefs TargetRefs `json:"targetRefs"` + + // Strategy defines how the scaling should be performed + // +kubebuilder:validation:Required + Strategy Strategy `json:"strategy"` + + // Status indicates the current state of the scheduled scaling + // +kubebuilder:validation:Enum=Inactive;Active + // +kubebuilder:default=Inactive + Status ScheduledScalingState `json:"status,omitempty"` +} + +// Schedule defines the timing for scheduled scaling +type Schedule struct { + // StartAt specifies when the scaling should begin + // Format: RFC3339 (e.g., "2024-01-15T10:00:00Z") + // +kubebuilder:validation:Required + StartAt string `json:"startAt"` + + // FinishAt specifies when the scaling should end and return to normal + // Format: RFC3339 (e.g., "2024-01-15T18:00:00Z") + // +kubebuilder:validation:Required + FinishAt string `json:"finishAt"` +} + +// TargetRefs specifies which resources to scale +type TargetRefs struct { + // TortoiseName is the name of the Tortoise resource to scale + // +kubebuilder:validation:Required + TortoiseName string `json:"tortoiseName"` +} + +// Strategy defines how the scaling should be performed +type Strategy struct { + // Static defines static scaling parameters + // +kubebuilder:validation:Required + Static StaticStrategy `json:"static"` +} + +// StaticStrategy defines static scaling parameters +type StaticStrategy struct { + // MinimumMinReplicas sets the minimum number of replicas during scaling + // +kubebuilder:validation:Minimum=1 + // +kubebuilder:validation:Required + MinimumMinReplicas int32 `json:"minimumMinReplicas"` + + // MinAllocatedResources sets the minimum allocated resources during scaling + // +kubebuilder:validation:Required + MinAllocatedResources ResourceRequirements `json:"minAllocatedResources"` +} + +// ResourceRequirements describes the compute resource requirements +type ResourceRequirements struct { + // CPU specifies the CPU resource requirements + // +kubebuilder:validation:Required + CPU string `json:"cpu"` + + // Memory specifies the memory resource requirements + // +kubebuilder:validation:Required + Memory string `json:"memory"` +} + +// ScheduledScalingState represents the desired state of a scheduled scaling operation +type ScheduledScalingState string + +const ( + // ScheduledScalingStateInactive means the scheduled scaling is not active + ScheduledScalingStateInactive ScheduledScalingState = "Inactive" + // 
ScheduledScalingStateActive means the scheduled scaling is currently active + ScheduledScalingStateActive ScheduledScalingState = "Active" +) + +// ScheduledScalingStatus defines the observed state of ScheduledScaling +// +kubebuilder:object:generate=true +type ScheduledScalingStatus struct { + // Phase indicates the current phase of the scheduled scaling + // +kubebuilder:validation:Enum=Pending;Active;Completed;Failed + Phase ScheduledScalingPhase `json:"phase,omitempty"` + + // LastTransitionTime is the last time the status transitioned from one phase to another + LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty"` + + // Message provides additional information about the current phase + Message string `json:"message,omitempty"` + + // Reason indicates why the scheduled scaling is in the current phase + Reason string `json:"reason,omitempty"` +} + +// ScheduledScalingPhase represents the phase of a scheduled scaling operation +type ScheduledScalingPhase string + +const ( + // ScheduledScalingPhasePending means the scheduled scaling is waiting to start + ScheduledScalingPhasePending ScheduledScalingPhase = "Pending" + // ScheduledScalingPhaseActive means the scheduled scaling is currently active + ScheduledScalingPhaseActive ScheduledScalingPhase = "Active" + // ScheduledScalingPhaseCompleted means the scheduled scaling has completed successfully + ScheduledScalingPhaseCompleted ScheduledScalingPhase = "Completed" + // ScheduledScalingPhaseFailed means the scheduled scaling has failed + ScheduledScalingPhaseFailed ScheduledScalingPhase = "Failed" +) + +//+kubebuilder:object:root=true +//+kubebuilder:subresource:status +//+kubebuilder:printcolumn:name="Status",type="string",JSONPath=".status.phase" +//+kubebuilder:printcolumn:name="Start Time",type="string",JSONPath=".spec.schedule.startAt" +//+kubebuilder:printcolumn:name="End Time",type="string",JSONPath=".spec.schedule.finishAt" +//+kubebuilder:printcolumn:name="Target",type="string",JSONPath=".spec.targetRefs.tortoiseName" +//+kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp" + +// ScheduledScaling is the Schema for the scheduledscalings API +type ScheduledScaling struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec ScheduledScalingSpec `json:"spec,omitempty"` + Status ScheduledScalingStatus `json:"status,omitempty"` +} + +//+kubebuilder:object:root=true + +// ScheduledScalingList contains a list of ScheduledScaling +type ScheduledScalingList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []ScheduledScaling `json:"items"` +} + +func init() { + SchemeBuilder.Register(&ScheduledScaling{}, &ScheduledScalingList{}) +} diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go new file mode 100644 index 00000000..a5023809 --- /dev/null +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -0,0 +1,109 @@ +//go:build !ignore_autogenerated + +/* +MIT License + +Copyright (c) 2023 mercari + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission 
notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +// Code generated by controller-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + runtime "k8s.io/apimachinery/pkg/runtime" +) + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ScheduledScaling) DeepCopyInto(out *ScheduledScaling) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + out.Spec = in.Spec + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScheduledScaling. +func (in *ScheduledScaling) DeepCopy() *ScheduledScaling { + if in == nil { + return nil + } + out := new(ScheduledScaling) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ScheduledScaling) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ScheduledScalingList) DeepCopyInto(out *ScheduledScalingList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]ScheduledScaling, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScheduledScalingList. +func (in *ScheduledScalingList) DeepCopy() *ScheduledScalingList { + if in == nil { + return nil + } + out := new(ScheduledScalingList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ScheduledScalingList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ScheduledScalingStatus) DeepCopyInto(out *ScheduledScalingStatus) { + *out = *in + in.LastTransitionTime.DeepCopyInto(&out.LastTransitionTime) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScheduledScalingStatus. 
+func (in *ScheduledScalingStatus) DeepCopy() *ScheduledScalingStatus { + if in == nil { + return nil + } + out := new(ScheduledScalingStatus) + in.DeepCopyInto(out) + return out +} diff --git a/cmd/main.go b/cmd/main.go index ce789e40..ed00a223 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -47,6 +47,7 @@ import ( autoscalingv2 "github.com/mercari/tortoise/api/autoscaling/v2" v1 "github.com/mercari/tortoise/api/core/v1" + autoscalingv1alpha1 "github.com/mercari/tortoise/api/v1alpha1" autoscalingv1beta3 "github.com/mercari/tortoise/api/v1beta3" "github.com/mercari/tortoise/internal/controller" "github.com/mercari/tortoise/pkg/config" @@ -71,6 +72,7 @@ func init() { utilruntime.Must(clientgoscheme.AddToScheme(scheme)) utilruntime.Must(autoscalingv1beta3.AddToScheme(scheme)) + utilruntime.Must(autoscalingv1alpha1.AddToScheme(scheme)) //+kubebuilder:scaffold:scheme } @@ -189,6 +191,15 @@ func main() { setupLog.Error(err, "unable to create controller", "controller", "Tortoise") os.Exit(1) } + + // Setup ScheduledScaling controller + if err = (&controller.ScheduledScalingReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "ScheduledScaling") + os.Exit(1) + } if err = (&autoscalingv1beta3.Tortoise{}).SetupWebhookWithManager(mgr); err != nil { setupLog.Error(err, "unable to create webhook", "webhook", "Tortoise") os.Exit(1) diff --git a/config/crd/bases/autoscaling.mercari.com_scheduledscalings.yaml b/config/crd/bases/autoscaling.mercari.com_scheduledscalings.yaml new file mode 100644 index 00000000..06feb368 --- /dev/null +++ b/config/crd/bases/autoscaling.mercari.com_scheduledscalings.yaml @@ -0,0 +1,160 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.1 + name: scheduledscalings.autoscaling.mercari.com +spec: + group: autoscaling.mercari.com + names: + kind: ScheduledScaling + listKind: ScheduledScalingList + plural: scheduledscalings + singular: scheduledscaling + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .status.phase + name: Status + type: string + - jsonPath: .spec.schedule.startAt + name: Start Time + type: string + - jsonPath: .spec.schedule.finishAt + name: End Time + type: string + - jsonPath: .spec.targetRefs.tortoiseName + name: Target + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: ScheduledScaling is the Schema for the scheduledscalings API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: ScheduledScalingSpec defines the desired state of ScheduledScaling + properties: + schedule: + description: Schedule defines when the scaling should occur + properties: + finishAt: + description: |- + FinishAt specifies when the scaling should end and return to normal + Format: RFC3339 (e.g., "2024-01-15T18:00:00Z") + type: string + startAt: + description: |- + StartAt specifies when the scaling should begin + Format: RFC3339 (e.g., "2024-01-15T10:00:00Z") + type: string + required: + - finishAt + - startAt + type: object + status: + default: Inactive + description: Status indicates the current state of the scheduled scaling + enum: + - Inactive + - Active + type: string + strategy: + description: Strategy defines how the scaling should be performed + properties: + static: + description: Static defines static scaling parameters + properties: + minAllocatedResources: + description: MinAllocatedResources sets the minimum allocated + resources during scaling + properties: + cpu: + description: CPU specifies the CPU resource requirements + type: string + memory: + description: Memory specifies the memory resource requirements + type: string + required: + - cpu + - memory + type: object + minimumMinReplicas: + description: MinimumMinReplicas sets the minimum number of + replicas during scaling + format: int32 + minimum: 1 + type: integer + required: + - minAllocatedResources + - minimumMinReplicas + type: object + required: + - static + type: object + targetRefs: + description: TargetRefs specifies which resources this scheduled scaling + should affect + properties: + tortoiseName: + description: TortoiseName is the name of the Tortoise resource + to scale + type: string + required: + - tortoiseName + type: object + required: + - schedule + - strategy + - targetRefs + type: object + status: + description: ScheduledScalingStatus defines the observed state of ScheduledScaling + properties: + lastTransitionTime: + description: LastTransitionTime is the last time the status transitioned + from one phase to another + format: date-time + type: string + message: + description: Message provides additional information about the current + phase + type: string + phase: + description: Phase indicates the current phase of the scheduled scaling + enum: + - Pending + - Active + - Completed + - Failed + type: string + reason: + description: Reason indicates why the scheduled scaling is in the + current phase + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index b01a62dc..413fc5f7 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -54,6 +54,7 @@ rules: - apiGroups: - autoscaling.mercari.com resources: + - scheduledscalings - tortoises verbs: - create @@ -66,12 +67,14 @@ rules: - apiGroups: - autoscaling.mercari.com resources: + - scheduledscalings/finalizers - tortoises/finalizers verbs: - update - apiGroups: - autoscaling.mercari.com resources: + - scheduledscalings/status - tortoises/status verbs: - get diff --git a/docs/advanced-hpa-configuration.md b/docs/advanced-hpa-configuration.md new file mode 100644 index 00000000..d05b2bc0 --- /dev/null +++ b/docs/advanced-hpa-configuration.md @@ -0,0 +1,291 @@ +--- +title: Custom HPA Scaling Limits with Tortoise +status: published +owner: kouzoh-platform-infra-jp 
+lastReviewed: 1970-01-01 +type: guide +tags: [tortoise, hpa, scaling, custom] +--- + +# Custom HPA Scaling Limits with Tortoise + +> Customize Tortoise HPA behavior based on your service requirements + +This guide covers the advanced ServiceGroups feature in Tortoise, which allows platform administrators to configure different HPA scaling limits for different categories of services. This feature is designed for teams who need fine-grained control over their autoscaling behavior based on service characteristics. + +## Overview + +By default, Tortoise applies the same maximum replica limits to all services through the global `MaximumMaxReplicas` configuration, which is currently set to **100 replicas** in Citadel production. However, different types of services have different scaling requirements: + +- **High-traffic services** may need to scale beyond 100 replicas during peak traffic +- **Cost-conscious services** might want lower limits to control costs +- **Critical services** might need different scaling behavior than experimental ones + +The ServiceGroups feature allows you to request custom HPA scaling limits for your services based on your specific requirements. + +## How It Works + +ServiceGroups work through a configuration request process: + +1. **Request Custom Limits**: Contact the platform team to request specific `maxReplicas` limits for your services +2. **Configuration**: Platform team configures your namespace and labels with the requested limits +3. **Automatic Application**: Tortoise automatically applies the custom limits to services matching your configuration + +If your service doesn't match any custom ServiceGroup configuration, it uses the default limit of **100 replicas**. + +## When to Request Custom Configuration + +Consider requesting custom ServiceGroup configuration if your service falls into any of these categories or you need different limits than the default 100 replicas: + +### Request Higher Limits (>100 replicas) if: +- Your service handles high user-facing traffic with significant spikes +- You need to scale beyond 100 replicas during peak periods +- Performance and availability are more critical than cost optimization +- You have validated that your service can effectively utilize 100+ replicas + +### Request Lower Limits (<100 replicas) if: +- Cost control is a primary concern for your service +- Your service has predictable, steady traffic patterns +- Your service is batch-oriented or background processing +- You want to prevent accidental over-scaling and control resource costs + +### Request Custom Limits for: +- Services with specific business requirements (e.g., 150, 200, or 50 replicas) +- Services that need limits based on downstream dependencies +- Services with regulatory or compliance constraints + +## How to Request Custom Configuration + +### Step 1: Contact Platform Team + +To request custom scaling limits for your services, contact **#team-kouzoh-platform-infra-jp** team with the following information: + +1. **Service Details**: Name and purpose of your service +2. **Namespace**: Which namespace your service is deployed in +3. **Labels**: Any specific labels your service uses (optional) +4. **Requested Limit**: The `maxReplicas` value you need (e.g., 150, 200, 50) +5. **Justification**: Business case for the custom limit + +### Step 2: Configuration Applied + +Once approved, the platform team will configure your namespace and labels with the custom limit. 
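
To make the request-and-configure flow concrete, the sketch below shows conceptually what a platform-side ServiceGroups entry could look like: a namespace (optionally narrowed by labels) mapped to a custom `maxReplicas` ceiling, with the global default used as the fallback. The field names here are illustrative assumptions, not the actual Tortoise controller configuration schema; the real configuration file is owned and applied by the platform team.

```yaml
# Hypothetical sketch only — field names are illustrative, not the real
# Tortoise controller configuration format.
MaximumMaxReplicas: 100            # global default when no service group matches
ServiceGroups:
  - name: high-traffic-api
    namespace: production-api      # all Tortoises in this namespace...
    labelSelector:
      tier: frontend               # ...optionally narrowed by labels
    maximumMaxReplicas: 150        # custom ceiling requested by the service team
  - name: batch-workers
    namespace: batch-services
    maximumMaxReplicas: 50         # lower ceiling for cost control
```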
+ +## Configuration Examples + +### Example 1: Requesting Higher Limits for High-Traffic Service + +**Your Request to #team-kouzoh-platform-infra-jp**: +``` +Hi team! I need custom scaling limits for our API gateway service: +- Service: api-gateway +- Namespace: production-api +- Current Issue: We're hitting the 100 replica limit during peak traffic +- Requested maxReplicas: 150 +- Justification: Service handles user-facing traffic with 3x traffic spikes during events +``` + +**Your Service Configuration** (after platform team configures the namespace): +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: api-gateway + namespace: production-api # Platform team configured this namespace for 150 replicas + labels: + app: api-gateway + tier: frontend +spec: + # ... your deployment spec +``` + +```yaml +# Your Tortoise configuration remains the same +apiVersion: autoscaling.mercari.com/v1beta3 +kind: Tortoise +metadata: + name: api-gateway-tortoise + namespace: production-api +spec: + updateMode: Auto + targetRefs: + scaleTargetRef: + kind: Deployment + name: api-gateway +``` + +**Result**: This Tortoise can now scale the HPA up to 150 replicas maximum. + +### Example 2: Requesting Lower Limits for Cost Control + +**Your Request to #team-kouzoh-platform-infra-jp**: +``` +Hi team! I need custom scaling limits for our batch processing services: +- Services: All batch processors +- Namespace: batch-services +- Labels: tier=worker (optional, for more granular control) +- Requested maxReplicas: 50 +- Justification: Cost optimization - these services have steady load and don't need >50 replicas +``` + +**Your Service Configuration** (after platform team configures the namespace): +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: batch-processor + namespace: batch-services # Platform team configured this namespace for 50 replicas + labels: + app: batch-processor + tier: worker +spec: + # ... your deployment spec +``` + +**Result**: Services in this namespace are limited to 50 replicas maximum, helping control costs. + +### Example 3: Default Behavior (No Custom Configuration) + +Services that don't have custom configuration will use the default limit: + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: experimental-service + namespace: development # No custom configuration for this namespace + labels: + app: experimental-service +spec: + # ... your deployment spec +``` + +**Result**: This Tortoise will use the default limit of **100 replicas**. + +## Understanding Service Group Matching + +Once the platform team configures your custom limits, the matching works as follows: + +### Namespace-Based Matching +Services are primarily matched based on their namespace. For example, if you requested: +- `production-api` namespace → 150 replicas limit +- `batch-services` namespace → 50 replicas limit + +### Label-Based Matching (Advanced) +For more granular control, the platform team can also configure label selectors: + +```yaml +# Example: Services with specific labels get custom limits +metadata: + namespace: my-services + labels: + tier: worker # Only services with this label get the custom limit + cost-tier: optimized # Additional label selector for more precision +``` + +This allows different services within the same namespace to have different limits if needed. 
+ +## Monitoring Your Service Group Assignment + +### Check Your Tortoise Status +You can verify which limits are being applied to your service by checking your Tortoise status: + +```bash +kubectl describe tortoise -n +``` + +Look for events or status information that indicates the maximum replica limits being applied. + +### Metrics and Observability +Monitor these metrics to understand your service's scaling behavior: + +- `mercari.tortoise.proposed_hpa_maxreplicas` - The maximum replicas Tortoise recommends +- `mercari.tortoise.current_hpa_maxreplicas` - The current HPA maximum replicas setting + +You can also use the Tortoise dashboard to visualize how service groups affect your scaling: +https://app.datadoghq.com/dashboard/h4s-vry-dyv/tortoise-user-dashboard + +## Best Practices + +### Before Requesting Custom Limits + +**Analyze Your Current Usage:** +1. **Monitor First**: Use Off mode Tortoise to see what limits would be recommended +2. **Check Current Peaks**: Review your service's historical scaling patterns +3. **Validate Need**: Ensure your service can actually benefit from custom limits +4. **Cost Impact**: Consider the cost implications of higher limits + +**Prepare Your Request:** +1. **Gather Data**: Collect metrics showing why you need custom limits +2. **Business Justification**: Explain the business impact of the current limits +3. **Specific Requirements**: Request exact numbers rather than ranges +4. **Timeline**: Mention if this is urgent or can be planned + +### After Custom Configuration + +**Gradual Adoption:** +1. **Test in Non-Production**: Start with development or staging environments +2. **Monitor Closely**: Watch for unexpected scaling behavior +3. **Cost Monitoring**: Keep track of resource costs after configuration changes +4. **Validate Effectiveness**: Ensure the custom limits solve your original problem + +## Troubleshooting + +### My Service Isn't Using the Expected Custom Limits + +1. **Verify Configuration**: Check if the platform team has confirmed your configuration is applied +2. **Check Request Status**: Follow up with #team-kouzoh-platform-infra-jp if your request is still pending +3. **Namespace/Labels**: Verify your service is deployed in the configured namespace with correct labels +4. **Tortoise Restart**: Sometimes restarting the Tortoise controller helps pick up new configuration + +### My Request Was Rejected or Modified + +If the platform team suggests different limits than requested: + +1. **Understand Reasoning**: Ask for explanation of the decision +2. **Provide Additional Context**: Share more data about your scaling needs +3. **Compromise**: Consider accepting modified limits and requesting adjustments later based on real usage + +## Common Questions + +### Can I Override Custom Limits? + +Custom limits are enforced at the platform level and cannot be overridden by individual Tortoise configurations. This ensures consistent resource management across the platform. + +### How Do I Know What Custom Limits Are Applied to My Service? + +1. **Check with Platform Team**: Ask #team-kouzoh-platform-infra-jp about your namespace configuration +2. **Monitor Tortoise Behavior**: Observe the maximum replicas Tortoise applies to your HPA +3. **Review Tortoise Events**: Check kubectl events for information about applied limits + +### What Happens If I Hit My Custom Limits During Traffic Spikes? + +Services will scale up to their configured custom limit. If you consistently hit your limits and need more capacity: + +1. 
**Gather Evidence**: Collect metrics showing the need for higher limits +2. **Request Increase**: Contact #team-kouzoh-platform-infra-jp with data supporting the need for higher limits +3. **Temporary Relief**: Discuss emergency procedures for critical traffic events + +### How Long Does It Take to Get Custom Configuration? + +- **Simple Requests**: Usually processed within 1-2 business days +- **Complex Requests**: May take longer if they require additional review or testing +- **Urgent Requests**: Contact #team-kouzoh-platform-infra-jp directly for expedited processing + +### Can I Request Different Limits for Different Services in the Same Namespace? + +Yes! The platform team can configure label-based selectors to give different services in the same namespace different limits. Include this requirement in your request. + +## Getting Help + +If you have questions about custom scaling limits or need assistance with configuration: + +- **Slack**: #team-kouzoh-platform-infra-jp +- **For Custom Limit Requests**: Contact #team-kouzoh-platform-infra-jp with your requirements +- **For Configuration Issues**: Reach out with your specific service details and observed behavior +- **For Urgent Issues**: Tag the platform team directly in #team-kouzoh-platform-infra-jp + +## Related Documentation + +- [Tortoise Main Guide](./tortoise.md) - Basic Tortoise usage and configuration +- [Kubernetes Kit: Scaling](../guides/k8s-kit/scaling.md) - Integration with Kubernetes Kit +- [HPA Configuration](../guides/how-to-create-hpa.md) - Manual HPA configuration (if needed) \ No newline at end of file diff --git a/docs/scheduled-scaling-testing-guide.md b/docs/scheduled-scaling-testing-guide.md new file mode 100644 index 00000000..ed64bb9a --- /dev/null +++ b/docs/scheduled-scaling-testing-guide.md @@ -0,0 +1,372 @@ +# Scheduled Scaling Testing Guide + +This guide provides comprehensive instructions for testing the Scheduled Scaling functionality in Tortoise. + +## Overview + +Scheduled Scaling allows users to predict and prepare for increased resource consumption by scheduling scaling operations in advance. This is particularly useful for: + +- **TV Campaigns**: When you know traffic will spike at specific times +- **Push Notifications**: Before sending notifications that will drive user engagement +- **Load Testing**: In development environments to simulate production loads +- **Scheduled Events**: Any predictable increase in resource demand + +## Architecture + +The Scheduled Scaling feature consists of: + +1. **ScheduledScaling CRD** (`autoscaling.mercari.com/v1alpha1`) +2. **ScheduledScaling Controller** - Manages the lifecycle and timing +3. 
**Integration with Tortoise** - Applies scaling policies to existing Tortoise resources + +## Testing Phases + +### Phase 1: Local Development Testing + +#### Prerequisites +- Go 1.22+ +- kubectl configured +- kind or minikube cluster +- Docker + +#### Setup +```bash +# Clone the repository +git clone +cd tortoise + +# Install dependencies +go mod tidy + +# Generate code +make generate +make manifests + +# Build the controller +make build +``` + +#### Run Tests +```bash +# Run unit tests +make test + +# Run specific controller tests +go test ./internal/controller/ -v + +# Run with coverage +make test-coverage +``` + +### Phase 2: Integration Testing + +#### Deploy to Local Cluster +```bash +# Deploy CRDs +make install + +# Deploy the controller +make deploy + +# Verify deployment +kubectl get pods -n tortoise-system +``` + +#### Test Basic Functionality +```bash +# Apply example resources +kubectl apply -f test/scheduled-scaling-example.yaml + +# Check resource status +kubectl get scheduledscalings +kubectl get tortoises +kubectl get deployments +``` + +### Phase 3: End-to-End Testing + +#### Use the Test Script +```bash +# Make script executable +chmod +x scripts/test-scheduled-scaling.sh + +# Run comprehensive test +./scripts/test-scheduled-scaling.sh +``` + +#### Monitor in Real-Time +```bash +# Use monitoring script +chmod +x scripts/monitor-scheduled-scaling.sh +./scripts/monitor-scheduled-scaling.sh default 5 +``` + +## Testing Scenarios + +### Scenario 1: Basic Scheduled Scaling + +**Objective**: Verify that a ScheduledScaling resource can be created and managed. + +**Steps**: +1. Create a ScheduledScaling with start time 1 minute in the future +2. Verify status transitions: Pending → Active → Completed +3. Check that Tortoise resources are updated accordingly + +**Expected Results**: +- Status phase transitions correctly +- Tortoise scaling policies are applied during active period +- Resources return to normal after completion + +### Scenario 2: Invalid Schedule Validation + +**Objective**: Ensure the controller properly validates schedule times. + +**Steps**: +1. Create a ScheduledScaling with finish time before start time +2. Verify the resource enters Failed state +3. Check error messages and reasons + +**Expected Results**: +- Resource status becomes Failed +- Appropriate error reason is set +- Controller logs show validation errors + +### Scenario 3: Resource Conflict Handling + +**Objective**: Test behavior when multiple scaling policies conflict. + +**Steps**: +1. Create a ScheduledScaling that overlaps with existing Tortoise policies +2. Verify graceful handling of conflicts +3. Check that the most appropriate policy is applied + +**Expected Results**: +- No resource corruption +- Clear logging of policy decisions +- Graceful fallback behavior + +### Scenario 4: Time Zone Handling + +**Objective**: Verify proper handling of different time zones. + +**Steps**: +1. Create ScheduledScaling resources with various time zone formats +2. Test edge cases around midnight and DST transitions +3. 
Verify consistent behavior across time zones + +**Expected Results**: +- Consistent behavior regardless of time zone +- Proper handling of edge cases +- Clear time format requirements + +## Verification Commands + +### Check Resource Status +```bash +# ScheduledScaling resources +kubectl get scheduledscalings -A -o wide + +# Tortoise resources +kubectl get tortoises -A -o wide + +# Deployment status +kubectl get deployments -A -o wide + +# HPA status +kubectl get hpa -A -o wide +``` + +### View Detailed Information +```bash +# ScheduledScaling details +kubectl describe scheduledscaling -n + +# Tortoise details +kubectl describe tortoise -n + +# Events +kubectl get events -n --sort-by='.lastTimestamp' +``` + +### Monitor Controller Logs +```bash +# Controller logs +kubectl logs -n tortoise-system deployment/tortoise-controller-manager -f + +# Filter for scheduled scaling events +kubectl logs -n tortoise-system deployment/tortoise-controller-manager | grep -i "scheduled" +``` + +## Troubleshooting + +### Common Issues + +#### 1. Controller Not Starting +```bash +# Check controller status +kubectl get pods -n tortoise-system + +# Check logs +kubectl logs -n tortoise-system deployment/tortoise-controller-manager + +# Verify CRDs are installed +kubectl get crd | grep scheduledscaling +``` + +#### 2. Resources Not Updating +```bash +# Check controller reconciliation +kubectl logs -n tortoise-system deployment/tortoise-controller-manager | grep -i "reconciling" + +# Verify resource ownership +kubectl get scheduledscaling -o yaml | grep -A 5 "ownerReferences" + +# Check for finalizers +kubectl get scheduledscaling -o yaml | grep -A 5 "finalizers" +``` + +#### 3. Time Parsing Errors +```bash +# Verify time format +kubectl get scheduledscaling -o yaml | grep -A 5 "schedule" + +# Check controller logs for parsing errors +kubectl logs -n tortoise-system deployment/tortoise-controller-manager | grep -i "time\|parse" +``` + +### Debug Mode + +Enable debug logging for more detailed information: + +```bash +# Update deployment with debug log level +kubectl patch deployment tortoise-controller-manager -n tortoise-system -p '{"spec":{"template":{"spec":{"containers":[{"name":"manager","env":[{"name":"LOG_LEVEL","value":"DEBUG"}]}]}}}}' + +# Restart the controller +kubectl rollout restart deployment/tortoise-controller-manager -n tortoise-system +``` + +## Performance Testing + +### Load Testing +```bash +# Create multiple ScheduledScaling resources +for i in {1..10}; do + kubectl apply -f - < -n +``` + +### Common Issues + +1. **Invalid Schedule Times**: Ensure startAt is before finishAt and both are valid ISO 8601 timestamps +2. **Missing Tortoise**: Verify the target Tortoise resource exists +3. **RBAC Issues**: Check controller permissions for tortoises and scheduledscalings +4. **Time Zone Confusion**: All times are in UTC, convert local times accordingly + +## Future Enhancements + +### Planned Features + +- **Dynamic Strategies**: CPU/memory-based scaling instead of static values +- **Recurring Schedules**: Daily, weekly, or monthly recurring scaling patterns +- **Multi-Resource Targeting**: Scale multiple Tortoise resources simultaneously +- **Policy Templates**: Reusable scaling configurations +- **Metrics Integration**: Scale based on historical traffic patterns +- **Webhook Notifications**: Notify external systems of scaling events + +### API Evolution + +The feature is currently in `v1alpha1`, indicating it's experimental. 
Future versions may include: +- Breaking changes to improve the API design +- Additional validation rules +- Enhanced status fields +- Migration guides between versions + +## Contributing + +### Development Setup + +1. Fork the repository +2. Create a feature branch +3. Make your changes +4. Add tests +5. Run linting and tests +6. Submit a pull request + +### Code Generation + +After modifying API types: +```bash +make generate # Generate deepcopy methods +make manifests # Generate CRDs and RBAC +``` + +### Testing Guidelines + +- Write unit tests for new controller logic +- Test edge cases (invalid schedules, missing resources) +- Verify error handling and status updates +- Test configuration restoration + +## Support + +For questions or issues: +- Create a GitHub issue +- Tag with `area/scheduled-scaling` +- Provide reproduction steps and logs +- Include cluster and operator versions + +## References + +- [Tortoise Documentation](../README.md) +- [Kubernetes HPA Documentation](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/) +- [Kubebuilder Book](https://book.kubebuilder.io/) +- [Controller Runtime](https://pkg.go.dev/sigs.k8s.io/controller-runtime) diff --git a/internal/controller/scheduledscaling_controller.go b/internal/controller/scheduledscaling_controller.go new file mode 100644 index 00000000..e13c413a --- /dev/null +++ b/internal/controller/scheduledscaling_controller.go @@ -0,0 +1,185 @@ +/* +Copyright 2024 The Tortoise Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "context" + "fmt" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + + autoscalingv1alpha1 "github.com/mercari/tortoise/api/v1alpha1" + autoscalingv1beta3 "github.com/mercari/tortoise/api/v1beta3" +) + +// ScheduledScalingReconciler reconciles a ScheduledScaling object +type ScheduledScalingReconciler struct { + client.Client + Scheme *runtime.Scheme +} + +//+kubebuilder:rbac:groups=autoscaling.mercari.com,resources=scheduledscalings,verbs=get;list;watch;create;update;patch;delete +//+kubebuilder:rbac:groups=autoscaling.mercari.com,resources=scheduledscalings/status,verbs=get;update;patch +//+kubebuilder:rbac:groups=autoscaling.mercari.com,resources=scheduledscalings/finalizers,verbs=update +//+kubebuilder:rbac:groups=autoscaling.mercari.com,resources=tortoises,verbs=get;list;watch;update;patch +//+kubebuilder:rbac:groups=autoscaling.mercari.com,resources=tortoises/status,verbs=get;update;patch + +// Reconcile is part of the main kubernetes reconciliation loop which aims to +// move the current state of the cluster closer to the desired state. 
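+// The phase of a ScheduledScaling is derived from its schedule window: before
+// startAt the request is simply requeued until the window opens, during the
+// window the scheduled scaling is applied to the target Tortoise and the request
+// is requeued until finishAt, and after finishAt the original configuration is
+// restored and the phase becomes Completed. Parse, validation, or scaling errors
+// move the resource to the Failed phase.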
+func (r *ScheduledScalingReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + logger := log.FromContext(ctx) + + // Fetch the ScheduledScaling instance + scheduledScaling := &autoscalingv1alpha1.ScheduledScaling{} + if err := r.Get(ctx, req.NamespacedName, scheduledScaling); err != nil { + // Handle the case where the resource is not found + return ctrl.Result{}, client.IgnoreNotFound(err) + } + + logger.Info("Reconciling ScheduledScaling", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + + // Parse the schedule times + startTime, err := time.Parse(time.RFC3339, scheduledScaling.Spec.Schedule.StartAt) + if err != nil { + logger.Error(err, "Failed to parse start time", "startAt", scheduledScaling.Spec.Schedule.StartAt) + return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseFailed, "InvalidStartTime", err.Error()) + } + + finishTime, err := time.Parse(time.RFC3339, scheduledScaling.Spec.Schedule.FinishAt) + if err != nil { + logger.Error(err, "Failed to parse finish time", "finishAt", scheduledScaling.Spec.Schedule.FinishAt) + return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseFailed, "InvalidFinishTime", err.Error()) + } + + // Validate that finish time is after start time + if finishTime.Before(startTime) || finishTime.Equal(startTime) { + err := fmt.Errorf("finish time must be after start time") + logger.Error(err, "Invalid schedule", "startAt", startTime, "finishAt", finishTime) + return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseFailed, "InvalidSchedule", err.Error()) + } + + now := time.Now() + var newPhase autoscalingv1alpha1.ScheduledScalingPhase + var reason, message string + + // Determine the current phase based on time + if now.Before(startTime) { + // Calculate time until start + timeUntilStart := startTime.Sub(now) + return ctrl.Result{RequeueAfter: timeUntilStart}, nil + } else if now.After(finishTime) { + // Apply normal scaling (restore original settings) + if err := r.applyNormalScaling(ctx, scheduledScaling); err != nil { + logger.Error(err, "Failed to apply normal scaling") + return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseFailed, "RestoreFailed", err.Error()) + } + newPhase = autoscalingv1alpha1.ScheduledScalingPhaseCompleted + reason = "Completed" + message = "Scheduled scaling period has ended" + } else { + // Currently in the scaling period + // Apply scheduled scaling + if err := r.applyScheduledScaling(ctx, scheduledScaling); err != nil { + logger.Error(err, "Failed to apply scheduled scaling") + return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseFailed, "ScalingFailed", err.Error()) + } + + // Calculate time until finish + timeUntilFinish := finishTime.Sub(now) + return ctrl.Result{RequeueAfter: timeUntilFinish}, nil + } + + // Update status if phase changed + if scheduledScaling.Status.Phase != newPhase { + return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, newPhase, reason, message) + } + + return ctrl.Result{}, nil +} + +// applyScheduledScaling applies the scheduled scaling configuration to the target Tortoise +func (r *ScheduledScalingReconciler) applyScheduledScaling(ctx context.Context, scheduledScaling *autoscalingv1alpha1.ScheduledScaling) error { + // Get the target Tortoise + tortoise := &autoscalingv1beta3.Tortoise{} + tortoiseKey := 
types.NamespacedName{ + Namespace: scheduledScaling.Namespace, + Name: scheduledScaling.Spec.TargetRefs.TortoiseName, + } + + if err := r.Get(ctx, tortoiseKey, tortoise); err != nil { + return fmt.Errorf("failed to get target tortoise: %w", err) + } + + // Apply the scheduled scaling configuration + // This is a simplified implementation - in practice, you might want to: + // 1. Store the original configuration before applying changes + // 2. Apply the new configuration + // 3. Handle conflicts with other scaling policies + + // For now, we'll just log what we would do + logger := log.FromContext(ctx) + logger.Info("Applying scheduled scaling", + "tortoise", tortoise.Name, + "minReplicas", scheduledScaling.Spec.Strategy.Static.MinimumMinReplicas, + "cpu", scheduledScaling.Spec.Strategy.Static.MinAllocatedResources.CPU, + "memory", scheduledScaling.Spec.Strategy.Static.MinAllocatedResources.Memory) + + return nil +} + +// applyNormalScaling restores the normal scaling configuration +func (r *ScheduledScalingReconciler) applyNormalScaling(ctx context.Context, scheduledScaling *autoscalingv1alpha1.ScheduledScaling) error { + logger := log.FromContext(ctx) + logger.Info("Restoring normal scaling configuration", "tortoise", scheduledScaling.Spec.TargetRefs.TortoiseName) + + // In practice, you would restore the original configuration here + // This might involve: + // 1. Retrieving the stored original configuration + // 2. Applying it to the Tortoise + // 3. Cleaning up any temporary resources + + return nil +} + +// updateStatus updates the status of the ScheduledScaling resource +func (r *ScheduledScalingReconciler) updateStatus(ctx context.Context, scheduledScaling *autoscalingv1alpha1.ScheduledScaling, phase autoscalingv1alpha1.ScheduledScalingPhase, reason, message string) error { + if scheduledScaling.Status.Phase != phase { + scheduledScaling.Status.Phase = phase + scheduledScaling.Status.Reason = reason + scheduledScaling.Status.Message = message + scheduledScaling.Status.LastTransitionTime = metav1.Now() + + if err := r.Status().Update(ctx, scheduledScaling); err != nil { + return fmt.Errorf("failed to update status: %w", err) + } + } + + return nil +} + +// SetupWithManager sets up the controller with the Manager. +func (r *ScheduledScalingReconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&autoscalingv1alpha1.ScheduledScaling{}). + Complete(r) +} diff --git a/internal/controller/scheduledscaling_controller_test.go b/internal/controller/scheduledscaling_controller_test.go new file mode 100644 index 00000000..5f9b244e --- /dev/null +++ b/internal/controller/scheduledscaling_controller_test.go @@ -0,0 +1,143 @@ +/* +Copyright 2024 The Tortoise Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "context" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + + autoscalingv1alpha1 "github.com/mercari/tortoise/api/v1alpha1" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +var _ = Describe("ScheduledScaling Controller", func() { + const ( + timeout = time.Second * 10 + interval = time.Millisecond * 250 + ) + + Context("When creating a ScheduledScaling", func() { + It("Should handle basic scheduled scaling creation", func() { + ctx := context.Background() + + // Create a test ScheduledScaling resource + scheduledScaling := &autoscalingv1alpha1.ScheduledScaling{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-scheduled-scaling", + Namespace: "default", + }, + Spec: autoscalingv1alpha1.ScheduledScalingSpec{ + Schedule: autoscalingv1alpha1.Schedule{ + StartAt: time.Now().Add(time.Hour).Format(time.RFC3339), + FinishAt: time.Now().Add(2 * time.Hour).Format(time.RFC3339), + }, + TargetRefs: autoscalingv1alpha1.TargetRefs{ + TortoiseName: "test-tortoise", + }, + Strategy: autoscalingv1alpha1.Strategy{ + Static: autoscalingv1alpha1.StaticStrategy{ + MinimumMinReplicas: 3, + MinAllocatedResources: autoscalingv1alpha1.ResourceRequirements{ + CPU: "500m", + Memory: "512Mi", + }, + }, + }, + }, + } + + Expect(k8sClient.Create(ctx, scheduledScaling)).Should(Succeed()) + + // Verify the resource was created + resourceLookupKey := types.NamespacedName{ + Name: scheduledScaling.Name, + Namespace: scheduledScaling.Namespace, + } + createdResource := &autoscalingv1alpha1.ScheduledScaling{} + + Eventually(func() bool { + err := k8sClient.Get(ctx, resourceLookupKey, createdResource) + // Add any specific conditions you want to check + return err == nil + }, timeout, interval).Should(BeTrue()) + + // Verify the status is set to Pending initially + Expect(createdResource.Status.Phase).Should(Equal(autoscalingv1alpha1.ScheduledScalingPhasePending)) + + // Clean up + Expect(k8sClient.Delete(ctx, scheduledScaling)).Should(Succeed()) + }) + + It("Should reject invalid schedule times", func() { + ctx := context.Background() + + // Create a ScheduledScaling with invalid times (finish before start) + scheduledScaling := &autoscalingv1alpha1.ScheduledScaling{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-invalid-schedule", + Namespace: "default", + }, + Spec: autoscalingv1alpha1.ScheduledScalingSpec{ + Schedule: autoscalingv1alpha1.Schedule{ + StartAt: time.Now().Add(2 * time.Hour).Format(time.RFC3339), + FinishAt: time.Now().Add(time.Hour).Format(time.RFC3339), // Invalid: finish before start + }, + TargetRefs: autoscalingv1alpha1.TargetRefs{ + TortoiseName: "test-tortoise", + }, + Strategy: autoscalingv1alpha1.Strategy{ + Static: autoscalingv1alpha1.StaticStrategy{ + MinimumMinReplicas: 3, + MinAllocatedResources: autoscalingv1alpha1.ResourceRequirements{ + CPU: "500m", + Memory: "512Mi", + }, + }, + }, + }, + } + + Expect(k8sClient.Create(ctx, scheduledScaling)).Should(Succeed()) + + // Wait for the controller to process and update status + resourceLookupKey := types.NamespacedName{ + Name: scheduledScaling.Name, + Namespace: scheduledScaling.Namespace, + } + createdResource := &autoscalingv1alpha1.ScheduledScaling{} + + Eventually(func() autoscalingv1alpha1.ScheduledScalingPhase { + err := k8sClient.Get(ctx, resourceLookupKey, createdResource) + if err != nil { + return "" + } + return createdResource.Status.Phase + }, timeout, interval).Should(Equal(autoscalingv1alpha1.ScheduledScalingPhaseFailed)) + + // Verify the error reason + Expect(createdResource.Status.Reason).Should(Equal("InvalidSchedule")) + + // Clean up + Expect(k8sClient.Delete(ctx, scheduledScaling)).Should(Succeed()) + }) + }) +}) From 2147c1ec8e4e2cb2c47381e341dc017c8a241483 Mon Sep 17 
00:00:00 2001 From: Bharadwaj Anupindi Date: Wed, 20 Aug 2025 16:08:00 +0900 Subject: [PATCH 03/22] add controller logic and e2e tests --- .../controller/scheduledscaling_controller.go | 137 ++++++++++++++---- .../scheduledscaling_controller_test.go | 81 ++++++++++- internal/controller/suite_test.go | 36 +++++ .../controller/tortoise_controller_test.go | 2 + 4 files changed, 221 insertions(+), 35 deletions(-) diff --git a/internal/controller/scheduledscaling_controller.go b/internal/controller/scheduledscaling_controller.go index e13c413a..2e4b6dd7 100644 --- a/internal/controller/scheduledscaling_controller.go +++ b/internal/controller/scheduledscaling_controller.go @@ -18,9 +18,12 @@ package controller import ( "context" + "encoding/json" "fmt" "time" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" @@ -86,6 +89,10 @@ func (r *ScheduledScalingReconciler) Reconcile(ctx context.Context, req ctrl.Req if now.Before(startTime) { // Calculate time until start timeUntilStart := startTime.Sub(now) + // Set status to Pending if not already set + if scheduledScaling.Status.Phase != autoscalingv1alpha1.ScheduledScalingPhasePending { + return ctrl.Result{RequeueAfter: timeUntilStart}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhasePending, "Waiting", "Waiting for scheduled scaling to begin") + } return ctrl.Result{RequeueAfter: timeUntilStart}, nil } else if now.After(finishTime) { // Apply normal scaling (restore original settings) @@ -104,12 +111,17 @@ func (r *ScheduledScalingReconciler) Reconcile(ctx context.Context, req ctrl.Req return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseFailed, "ScalingFailed", err.Error()) } + // Set status to Active if not already set + if scheduledScaling.Status.Phase != autoscalingv1alpha1.ScheduledScalingPhaseActive { + return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseActive, "Active", "Scheduled scaling is currently active") + } + // Calculate time until finish timeUntilFinish := finishTime.Sub(now) return ctrl.Result{RequeueAfter: timeUntilFinish}, nil } - // Update status if phase changed + // Update status if phase changed (this handles Completed phase) if scheduledScaling.Status.Phase != newPhase { return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, newPhase, reason, message) } @@ -120,44 +132,115 @@ func (r *ScheduledScalingReconciler) Reconcile(ctx context.Context, req ctrl.Req // applyScheduledScaling applies the scheduled scaling configuration to the target Tortoise func (r *ScheduledScalingReconciler) applyScheduledScaling(ctx context.Context, scheduledScaling *autoscalingv1alpha1.ScheduledScaling) error { // Get the target Tortoise - tortoise := &autoscalingv1beta3.Tortoise{} - tortoiseKey := types.NamespacedName{ - Namespace: scheduledScaling.Namespace, - Name: scheduledScaling.Spec.TargetRefs.TortoiseName, + t := &autoscalingv1beta3.Tortoise{} + key := types.NamespacedName{Namespace: scheduledScaling.Namespace, Name: scheduledScaling.Spec.TargetRefs.TortoiseName} + if err := r.Get(ctx, key, t); err != nil { + return fmt.Errorf("failed to get target tortoise: %w", err) } - if err := r.Get(ctx, tortoiseKey, tortoise); err != nil { - return fmt.Errorf("failed to get target tortoise: %w", err) + const annOriginal = "autoscaling.mercari.com/scheduledscaling-original-spec" + const 
annMinReplicas = "autoscaling.mercari.com/scheduledscaling-min-replicas" + if t.Annotations == nil { + t.Annotations = map[string]string{} + } + + // Persist original spec if not already stored + if _, exists := t.Annotations[annOriginal]; !exists { + orig, err := json.Marshal(t.Spec) + if err != nil { + return fmt.Errorf("marshal original tortoise spec: %w", err) + } + t.Annotations[annOriginal] = string(orig) } - // Apply the scheduled scaling configuration - // This is a simplified implementation - in practice, you might want to: - // 1. Store the original configuration before applying changes - // 2. Apply the new configuration - // 3. Handle conflicts with other scaling policies + // Desired resource minimums from strategy + dCPU := scheduledScaling.Spec.Strategy.Static.MinAllocatedResources.CPU + dMem := scheduledScaling.Spec.Strategy.Static.MinAllocatedResources.Memory + var qCPU, qMem resource.Quantity + var hasCPU, hasMem bool + if dCPU != "" { + v, err := resource.ParseQuantity(dCPU) + if err != nil { + return fmt.Errorf("invalid cpu quantity %q: %w", dCPU, err) + } + qCPU = v + hasCPU = true + } + if dMem != "" { + v, err := resource.ParseQuantity(dMem) + if err != nil { + return fmt.Errorf("invalid memory quantity %q: %w", dMem, err) + } + qMem = v + hasMem = true + } - // For now, we'll just log what we would do - logger := log.FromContext(ctx) - logger.Info("Applying scheduled scaling", - "tortoise", tortoise.Name, - "minReplicas", scheduledScaling.Spec.Strategy.Static.MinimumMinReplicas, - "cpu", scheduledScaling.Spec.Strategy.Static.MinAllocatedResources.CPU, - "memory", scheduledScaling.Spec.Strategy.Static.MinAllocatedResources.Memory) + updated := false + for i := range t.Spec.ResourcePolicy { + pol := &t.Spec.ResourcePolicy[i] + if pol.MinAllocatedResources == nil { + pol.MinAllocatedResources = v1.ResourceList{} + } + if hasCPU { + curr := pol.MinAllocatedResources[v1.ResourceCPU] + if curr.Cmp(qCPU) < 0 { + pol.MinAllocatedResources[v1.ResourceCPU] = qCPU + updated = true + } + } + if hasMem { + curr := pol.MinAllocatedResources[v1.ResourceMemory] + if curr.Cmp(qMem) < 0 { + pol.MinAllocatedResources[v1.ResourceMemory] = qMem + updated = true + } + } + } + if m := scheduledScaling.Spec.Strategy.Static.MinimumMinReplicas; m > 0 { + // Store intent; future versions could wire this to Tortoise min replicas recommendation + t.Annotations[annMinReplicas] = fmt.Sprintf("%d", m) + updated = true + } + + if !updated { + log.FromContext(ctx).Info("Scheduled scaling made no changes to tortoise spec", "tortoise", t.Name) + return nil + } + if err := r.Update(ctx, t); err != nil { + return fmt.Errorf("update tortoise: %w", err) + } return nil } // applyNormalScaling restores the normal scaling configuration func (r *ScheduledScalingReconciler) applyNormalScaling(ctx context.Context, scheduledScaling *autoscalingv1alpha1.ScheduledScaling) error { - logger := log.FromContext(ctx) - logger.Info("Restoring normal scaling configuration", "tortoise", scheduledScaling.Spec.TargetRefs.TortoiseName) - - // In practice, you would restore the original configuration here - // This might involve: - // 1. Retrieving the stored original configuration - // 2. Applying it to the Tortoise - // 3. 
Cleaning up any temporary resources + // Fetch target tortoise + t := &autoscalingv1beta3.Tortoise{} + key := types.NamespacedName{Namespace: scheduledScaling.Namespace, Name: scheduledScaling.Spec.TargetRefs.TortoiseName} + if err := r.Get(ctx, key, t); err != nil { + return fmt.Errorf("failed to get target tortoise for restore: %w", err) + } + const annOriginal = "autoscaling.mercari.com/scheduledscaling-original-spec" + const annMinReplicas = "autoscaling.mercari.com/scheduledscaling-min-replicas" + if t.Annotations == nil { + return nil + } + orig, ok := t.Annotations[annOriginal] + if !ok || orig == "" { + return nil + } + var spec autoscalingv1beta3.TortoiseSpec + if err := json.Unmarshal([]byte(orig), &spec); err != nil { + return fmt.Errorf("unmarshal original tortoise spec: %w", err) + } + t.Spec = spec + delete(t.Annotations, annOriginal) + delete(t.Annotations, annMinReplicas) + if err := r.Update(ctx, t); err != nil { + return fmt.Errorf("restore tortoise: %w", err) + } return nil } diff --git a/internal/controller/scheduledscaling_controller_test.go b/internal/controller/scheduledscaling_controller_test.go index 5f9b244e..af5fc4c3 100644 --- a/internal/controller/scheduledscaling_controller_test.go +++ b/internal/controller/scheduledscaling_controller_test.go @@ -20,10 +20,13 @@ import ( "context" "time" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" autoscalingv1alpha1 "github.com/mercari/tortoise/api/v1alpha1" + autoscalingv1beta3 "github.com/mercari/tortoise/api/v1beta3" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -67,21 +70,21 @@ var _ = Describe("ScheduledScaling Controller", func() { Expect(k8sClient.Create(ctx, scheduledScaling)).Should(Succeed()) - // Verify the resource was created + // Verify the resource was created and wait for status update resourceLookupKey := types.NamespacedName{ Name: scheduledScaling.Name, Namespace: scheduledScaling.Namespace, } createdResource := &autoscalingv1alpha1.ScheduledScaling{} - Eventually(func() bool { + // Wait for the controller to reconcile and set the status to Pending + Eventually(func() autoscalingv1alpha1.ScheduledScalingPhase { err := k8sClient.Get(ctx, resourceLookupKey, createdResource) - // Add any specific conditions you want to check - return err == nil - }, timeout, interval).Should(BeTrue()) - - // Verify the status is set to Pending initially - Expect(createdResource.Status.Phase).Should(Equal(autoscalingv1alpha1.ScheduledScalingPhasePending)) + if err != nil { + return "" + } + return createdResource.Status.Phase + }, timeout, interval).Should(Equal(autoscalingv1alpha1.ScheduledScalingPhasePending)) // Clean up Expect(k8sClient.Delete(ctx, scheduledScaling)).Should(Succeed()) @@ -140,4 +143,66 @@ var _ = Describe("ScheduledScaling Controller", func() { Expect(k8sClient.Delete(ctx, scheduledScaling)).Should(Succeed()) }) }) + + Context("Apply and restore scheduled scaling", func() { + It("should apply desired min resources and then restore original spec", func() { + ctx := context.Background() + + // Create a baseline Tortoise that ScheduledScaling will target + t := &autoscalingv1beta3.Tortoise{ + ObjectMeta: metav1.ObjectMeta{Name: "test-tortoise-apply", Namespace: "default"}, + Spec: autoscalingv1beta3.TortoiseSpec{ + ResourcePolicy: []autoscalingv1beta3.ContainerResourcePolicy{ + { + ContainerName: "app", + MinAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("100m"), + 
v1.ResourceMemory: resource.MustParse("128Mi"), + }, + }, + }, + }, + } + Expect(k8sClient.Create(ctx, t)).Should(Succeed()) + + // Prepare a ScheduledScaling that is currently active and ends soon + start := time.Now().Add(-1 * time.Second).Format(time.RFC3339) + finish := time.Now().Add(3 * time.Second).Format(time.RFC3339) + ss := &autoscalingv1alpha1.ScheduledScaling{ + ObjectMeta: metav1.ObjectMeta{Name: "test-apply-restore", Namespace: "default"}, + Spec: autoscalingv1alpha1.ScheduledScalingSpec{ + Schedule: autoscalingv1alpha1.Schedule{StartAt: start, FinishAt: finish}, + TargetRefs: autoscalingv1alpha1.TargetRefs{TortoiseName: t.Name}, + Strategy: autoscalingv1alpha1.Strategy{Static: autoscalingv1alpha1.StaticStrategy{ + MinimumMinReplicas: 5, + MinAllocatedResources: autoscalingv1alpha1.ResourceRequirements{CPU: "500m", Memory: "512Mi"}, + }}, + }, + } + Expect(k8sClient.Create(ctx, ss)).Should(Succeed()) + + // During active window, Tortoise should be updated + Eventually(func(g Gomega) { + cur := &autoscalingv1beta3.Tortoise{} + err := k8sClient.Get(ctx, types.NamespacedName{Name: t.Name, Namespace: t.Namespace}, cur) + g.Expect(err).ShouldNot(HaveOccurred()) + g.Expect(cur.Spec.ResourcePolicy).ToNot(BeEmpty()) + pol := cur.Spec.ResourcePolicy[0] + cpu := pol.MinAllocatedResources[v1.ResourceCPU] + mem := pol.MinAllocatedResources[v1.ResourceMemory] + g.Expect((&cpu).Cmp(resource.MustParse("500m")) >= 0).To(BeTrue()) + g.Expect((&mem).Cmp(resource.MustParse("512Mi")) >= 0).To(BeTrue()) + }, timeout, interval).Should(Succeed()) + + // After the window finishes, original spec should be restored + Eventually(func(g Gomega) { + cur := &autoscalingv1beta3.Tortoise{} + err := k8sClient.Get(ctx, types.NamespacedName{Name: t.Name, Namespace: t.Namespace}, cur) + g.Expect(err).ShouldNot(HaveOccurred()) + pol := cur.Spec.ResourcePolicy[0] + g.Expect(pol.MinAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("100m"))).To(BeTrue()) + g.Expect(pol.MinAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("128Mi"))).To(BeTrue()) + }, time.Second*12, interval).Should(Succeed()) + }) + }) }) diff --git a/internal/controller/suite_test.go b/internal/controller/suite_test.go index 1e0ee6f4..566a13e1 100644 --- a/internal/controller/suite_test.go +++ b/internal/controller/suite_test.go @@ -26,6 +26,7 @@ SOFTWARE. package controller import ( + "context" "fmt" "os" "path/filepath" @@ -39,11 +40,14 @@ import ( v1 "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1" "k8s.io/client-go/rest" "k8s.io/utils/ptr" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/envtest" logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/log/zap" + "sigs.k8s.io/controller-runtime/pkg/metrics/server" + autoscalingv1alpha1 "github.com/mercari/tortoise/api/v1alpha1" autoscalingv1beta3 "github.com/mercari/tortoise/api/v1beta3" . 
"github.com/onsi/ginkgo/v2" @@ -57,6 +61,7 @@ var cfg *rest.Config var k8sClient client.Client var testEnv *envtest.Environment var scheme = runtime.NewScheme() +var controllerMgrCancel context.CancelFunc func TestAPIs(t *testing.T) { RegisterFailHandler(Fail) @@ -83,6 +88,8 @@ var _ = BeforeSuite(func() { Expect(err).NotTo(HaveOccurred()) Expect(cfg).NotTo(BeNil()) + err = autoscalingv1alpha1.AddToScheme(scheme) + Expect(err).NotTo(HaveOccurred()) err = autoscalingv1beta3.AddToScheme(scheme) Expect(err).NotTo(HaveOccurred()) err = v1.AddToScheme(scheme) @@ -98,6 +105,29 @@ var _ = BeforeSuite(func() { Expect(err).NotTo(HaveOccurred()) Expect(k8sClient).NotTo(BeNil()) + // Start the controller manager + By("starting the controller manager") + mgr, err := ctrl.NewManager(cfg, ctrl.Options{ + Scheme: scheme, + Metrics: server.Options{BindAddress: "0"}, + }) + Expect(err).NotTo(HaveOccurred()) + + // Setup the ScheduledScaling controller + err = (&ScheduledScalingReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr) + Expect(err).NotTo(HaveOccurred()) + + // Start the manager with a cancellable context + ctx, cancel := context.WithCancel(context.Background()) + controllerMgrCancel = cancel + go func() { + err = mgr.Start(ctx) + Expect(err).NotTo(HaveOccurred()) + }() + // fix a time to reduce the diff when regenerating the files. onlyTestNow = ptr.To(time.Date(2023, 1, 1, 0, 0, 0, 0, time.UTC)) }) @@ -120,6 +150,12 @@ $ rm -rf %s return } + // Stop manager first to help envtest terminate cleanly + if controllerMgrCancel != nil { + controllerMgrCancel() + time.Sleep(200 * time.Millisecond) + } + err := testEnv.Stop() Expect(err).NotTo(HaveOccurred()) }) diff --git a/internal/controller/tortoise_controller_test.go b/internal/controller/tortoise_controller_test.go index 418cb911..dd988cd9 100644 --- a/internal/controller/tortoise_controller_test.go +++ b/internal/controller/tortoise_controller_test.go @@ -21,6 +21,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/config" + "sigs.k8s.io/controller-runtime/pkg/metrics/server" "sigs.k8s.io/yaml" "github.com/mercari/tortoise/api/v1beta3" @@ -246,6 +247,7 @@ func removeUnnecessaryFields(rawdata []byte) ([]byte, error) { func startController(ctx context.Context) func() { mgr, err := ctrl.NewManager(cfg, ctrl.Options{ Scheme: scheme, + Metrics: server.Options{BindAddress: "0"}, LeaderElection: false, Controller: config.Controller{ SkipNameValidation: ptr.To(true), From b7f00abdc7f89bebacbdb7fb87e1afd05fde7928 Mon Sep 17 00:00:00 2001 From: Bharadwaj Anupindi Date: Tue, 26 Aug 2025 14:03:18 +0900 Subject: [PATCH 04/22] add validation for minreplicas --- .../rbac/tortoise-scheduledscaling-role.yaml | 11 +++++++ ...tortoise-scheduledscaling-rolebinding.yaml | 12 ++++++++ pkg/hpa/service.go | 30 +++++++++++++++++++ 3 files changed, 53 insertions(+) create mode 100644 config/rbac/tortoise-scheduledscaling-role.yaml create mode 100644 config/rbac/tortoise-scheduledscaling-rolebinding.yaml diff --git a/config/rbac/tortoise-scheduledscaling-role.yaml b/config/rbac/tortoise-scheduledscaling-role.yaml new file mode 100644 index 00000000..cae42ccd --- /dev/null +++ b/config/rbac/tortoise-scheduledscaling-role.yaml @@ -0,0 +1,11 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: tortoise-scheduledscaling-access +rules: +- apiGroups: ["autoscaling.mercari.com"] + resources: ["scheduledscalings", 
"scheduledscalings/status"] + verbs: ["get","list","watch","update","patch"] +- apiGroups: ["autoscaling.mercari.com"] + resources: ["tortoises", "tortoises/status"] + verbs: ["get","list","watch","update","patch"] diff --git a/config/rbac/tortoise-scheduledscaling-rolebinding.yaml b/config/rbac/tortoise-scheduledscaling-rolebinding.yaml new file mode 100644 index 00000000..6bf9a44d --- /dev/null +++ b/config/rbac/tortoise-scheduledscaling-rolebinding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: tortoise-scheduledscaling-access +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: tortoise-scheduledscaling-access +subjects: +- kind: ServiceAccount + name: tortoise-controller-manager + namespace: mercari-tortoise-lab diff --git a/pkg/hpa/service.go b/pkg/hpa/service.go index ba940b7d..55631142 100644 --- a/pkg/hpa/service.go +++ b/pkg/hpa/service.go @@ -8,6 +8,7 @@ import ( "reflect" "regexp" "sort" + "strconv" "time" v2 "k8s.io/api/autoscaling/v2" @@ -424,6 +425,19 @@ func (c *Service) ChangeHPAFromTortoiseRecommendation(tortoise *autoscalingv1bet recommendMax = *tortoise.Spec.MaxReplicas } + // ScheduledScaling: ensure max is not below the scheduled min override + var ssMinOverride int32 + if tortoise.Annotations != nil { + if v, ok := tortoise.Annotations["autoscaling.mercari.com/scheduledscaling-min-replicas"]; ok && v != "" { + if parsed, perr := strconv.ParseInt(v, 10, 32); perr == nil && parsed > 0 { + ssMinOverride = int32(parsed) + } + } + } + if ssMinOverride > 0 && recommendMax < ssMinOverride { + recommendMax = ssMinOverride + } + if recommendMax > c.maximumMaxReplica { c.recorder.Event(tortoise, corev1.EventTypeWarning, event.WarningHittingHardMaxReplicaLimit, fmt.Sprintf("MaxReplica (%v) suggested from Tortoise (%s/%s) hits a cluster-wide maximum replica number (%v). It wouldn't be a problem until the replica number actually grows to %v though, you may want to reach out to your cluster admin.", recommendMax, tortoise.Namespace, tortoise.Name, c.maximumMaxReplica, c.maximumMaxReplica)) recommendMax = c.maximumMaxReplica @@ -467,6 +481,22 @@ func (c *Service) ChangeHPAFromTortoiseRecommendation(tortoise *autoscalingv1bet minToActuallyApply = recommendMin } + // ScheduledScaling: enforce configured minimum replicas during Active + if ssMinOverride > 0 && minToActuallyApply < ssMinOverride { + minToActuallyApply = ssMinOverride + } + + // Ensure consistency: MaxReplicas >= MinReplicas + if recommendMax < minToActuallyApply { + recommendMax = minToActuallyApply + if recommendMax > c.maximumMaxReplica { + recommendMax = c.maximumMaxReplica + if minToActuallyApply > recommendMax { + minToActuallyApply = recommendMax + } + } + } + hpa.Spec.MinReplicas = &minToActuallyApply if tortoise.Spec.UpdateMode != autoscalingv1beta3.UpdateModeOff && recordMetrics { // We don't want to record applied* metric when UpdateMode is Off. 
From 98c3f5373a3a5744c8d6858274a3b48baaa12ef4 Mon Sep 17 00:00:00 2001 From: Bharadwaj Anupindi Date: Tue, 26 Aug 2025 15:20:53 +0900 Subject: [PATCH 05/22] use existing HPA for scheduledscaling --- .../controller/scheduledscaling_controller.go | 15 +++++++++++++++ pkg/hpa/service.go | 13 +++++++++++++ 2 files changed, 28 insertions(+) diff --git a/internal/controller/scheduledscaling_controller.go b/internal/controller/scheduledscaling_controller.go index 2e4b6dd7..7d674d46 100644 --- a/internal/controller/scheduledscaling_controller.go +++ b/internal/controller/scheduledscaling_controller.go @@ -138,6 +138,13 @@ func (r *ScheduledScalingReconciler) applyScheduledScaling(ctx context.Context, return fmt.Errorf("failed to get target tortoise: %w", err) } + // Preserve existing HPA reference if present and not already specified in spec + if t.Spec.TargetRefs.HorizontalPodAutoscalerName == nil && t.Status.Targets.HorizontalPodAutoscaler != "" { + // If tortoise created an HPA but spec doesn't reference it explicitly, + // add the reference to prevent HPA recreation during scheduled scaling + t.Spec.TargetRefs.HorizontalPodAutoscalerName = &t.Status.Targets.HorizontalPodAutoscaler + } + const annOriginal = "autoscaling.mercari.com/scheduledscaling-original-spec" const annMinReplicas = "autoscaling.mercari.com/scheduledscaling-min-replicas" if t.Annotations == nil { @@ -235,6 +242,14 @@ func (r *ScheduledScalingReconciler) applyNormalScaling(ctx context.Context, sch if err := json.Unmarshal([]byte(orig), &spec); err != nil { return fmt.Errorf("unmarshal original tortoise spec: %w", err) } + + // Preserve HPA reference if it was added during scheduled scaling to prevent HPA recreation + if spec.TargetRefs.HorizontalPodAutoscalerName == nil && t.Spec.TargetRefs.HorizontalPodAutoscalerName != nil && t.Status.Targets.HorizontalPodAutoscaler != "" { + // If original spec didn't have HPA reference but current spec does (added during scheduled scaling), + // preserve it to avoid HPA recreation when restoring + spec.TargetRefs.HorizontalPodAutoscalerName = t.Spec.TargetRefs.HorizontalPodAutoscalerName + } + t.Spec = spec delete(t.Annotations, annOriginal) delete(t.Annotations, annMinReplicas) diff --git a/pkg/hpa/service.go b/pkg/hpa/service.go index 55631142..9f8faf65 100644 --- a/pkg/hpa/service.go +++ b/pkg/hpa/service.go @@ -133,6 +133,19 @@ func (c *Service) InitializeHPA(ctx context.Context, tortoise *autoscalingv1beta return tortoise, nil } + // Check if tortoise already has an HPA reference in status that exists + if tortoise.Status.Targets.HorizontalPodAutoscaler != "" { + hpa := &v2.HorizontalPodAutoscaler{} + if err := c.c.Get(ctx, types.NamespacedName{ + Namespace: tortoise.Namespace, + Name: tortoise.Status.Targets.HorizontalPodAutoscaler, + }, hpa); err == nil { + logger.Info("tortoise already has an existing HPA, no need to create new one", "hpa", tortoise.Status.Targets.HorizontalPodAutoscaler) + return tortoise, nil + } + logger.Info("tortoise status references HPA that doesn't exist, will create new one", "missing_hpa", tortoise.Status.Targets.HorizontalPodAutoscaler) + } + logger.Info("no existing HPA specified, creating HPA") // create default HPA. 
From 140d22e2567538f07a1983d2739203d80a1ac113 Mon Sep 17 00:00:00 2001 From: Bharadwaj Anupindi Date: Wed, 27 Aug 2025 17:25:30 +0900 Subject: [PATCH 06/22] Add support for Cron based scheduled scaling --- api/v1alpha1/scheduledscaling_types.go | 41 +- api/v1alpha1/scheduledscaling_validation.go | 199 +++++++++ .../scheduledscaling_validation_test.go | 411 ++++++++++++++++++ ...scaling.mercari.com_scheduledscalings.yaml | 27 +- go.mod | 1 + go.sum | 2 + .../controller/scheduledscaling_controller.go | 124 +++++- .../scheduledscaling_controller_test.go | 376 +++++++++++++++- pkg/hpa/service.go | 31 +- 9 files changed, 1185 insertions(+), 27 deletions(-) create mode 100644 api/v1alpha1/scheduledscaling_validation.go create mode 100644 api/v1alpha1/scheduledscaling_validation_test.go diff --git a/api/v1alpha1/scheduledscaling_types.go b/api/v1alpha1/scheduledscaling_types.go index e19a7fef..e2a54bbe 100644 --- a/api/v1alpha1/scheduledscaling_types.go +++ b/api/v1alpha1/scheduledscaling_types.go @@ -24,17 +24,50 @@ type ScheduledScalingSpec struct { Status ScheduledScalingState `json:"status,omitempty"` } +// ScheduleType defines the type of scheduling to use +type ScheduleType string + +const ( + // ScheduleTypeTime uses specific start and end times + ScheduleTypeTime ScheduleType = "time" + // ScheduleTypeCron uses cron expression for periodic scheduling + ScheduleTypeCron ScheduleType = "cron" +) + // Schedule defines the timing for scheduled scaling type Schedule struct { + // Type specifies the scheduling type: "time" or "cron" + // +kubebuilder:validation:Required + // +kubebuilder:validation:Enum=time;cron + Type ScheduleType `json:"type"` + + // Time-based scheduling fields (used when type="time") // StartAt specifies when the scaling should begin // Format: RFC3339 (e.g., "2024-01-15T10:00:00Z") - // +kubebuilder:validation:Required - StartAt string `json:"startAt"` + // +kubebuilder:validation:Optional + StartAt string `json:"startAt,omitempty"` // FinishAt specifies when the scaling should end and return to normal // Format: RFC3339 (e.g., "2024-01-15T18:00:00Z") - // +kubebuilder:validation:Required - FinishAt string `json:"finishAt"` + // +kubebuilder:validation:Optional + FinishAt string `json:"finishAt,omitempty"` + + // Cron-based scheduling fields (used when type="cron") + // CronExpression defines when scaling periods should start using cron format + // Format: "minute hour day month dayofweek" (e.g., "0 9 * * 1-5" for 9 AM weekdays) + // +kubebuilder:validation:Optional + CronExpression string `json:"cronExpression,omitempty"` + + // Duration specifies how long each scaling period should last + // Format: Go duration (e.g., "8h", "30m", "1h30m") + // +kubebuilder:validation:Optional + Duration string `json:"duration,omitempty"` + + // TimeZone specifies the timezone for cron-based scheduling + // Format: IANA timezone (e.g., "Asia/Tokyo", "UTC", "America/New_York") + // +kubebuilder:validation:Optional + // +kubebuilder:default="Asia/Tokyo" + TimeZone string `json:"timeZone,omitempty"` } // TargetRefs specifies which resources to scale diff --git a/api/v1alpha1/scheduledscaling_validation.go b/api/v1alpha1/scheduledscaling_validation.go new file mode 100644 index 00000000..a15df7b2 --- /dev/null +++ b/api/v1alpha1/scheduledscaling_validation.go @@ -0,0 +1,199 @@ +package v1alpha1 + +import ( + "fmt" + "time" + + "github.com/robfig/cron/v3" +) + +// ValidateSchedule validates the schedule configuration based on the type +func (s *Schedule) Validate() error { + switch s.Type { 
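+	// Exactly one group of schedule fields may be populated: startAt/finishAt for
+	// type "time", or cronExpression/duration (plus an optional timeZone) for type
+	// "cron". The type-specific validators below also reject fields from the other group.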
+ case ScheduleTypeTime: + return s.validateTimeBasedSchedule() + case ScheduleTypeCron: + return s.validateCronBasedSchedule() + default: + return fmt.Errorf("invalid schedule type: %s, must be 'time' or 'cron'", s.Type) + } +} + +// validateTimeBasedSchedule validates time-based scheduling parameters +func (s *Schedule) validateTimeBasedSchedule() error { + // Validate required fields for time-based scheduling + if s.StartAt == "" { + return fmt.Errorf("startAt is required when type is 'time'") + } + if s.FinishAt == "" { + return fmt.Errorf("finishAt is required when type is 'time'") + } + + // Validate time format (RFC3339) + startTime, err := time.Parse(time.RFC3339, s.StartAt) + if err != nil { + return fmt.Errorf("invalid startAt format: %v, expected RFC3339 format (e.g., '2024-01-15T10:00:00Z')", err) + } + + finishTime, err := time.Parse(time.RFC3339, s.FinishAt) + if err != nil { + return fmt.Errorf("invalid finishAt format: %v, expected RFC3339 format (e.g., '2024-01-15T18:00:00Z')", err) + } + + // Validate that finish time is after start time + if finishTime.Before(startTime) || finishTime.Equal(startTime) { + return fmt.Errorf("finishAt (%s) must be after startAt (%s)", s.FinishAt, s.StartAt) + } + + // Validate that cron-specific fields are not set + if s.CronExpression != "" { + return fmt.Errorf("cronExpression cannot be set when type is 'time'") + } + if s.Duration != "" { + return fmt.Errorf("duration cannot be set when type is 'time'") + } + + return nil +} + +// validateCronBasedSchedule validates cron-based scheduling parameters +func (s *Schedule) validateCronBasedSchedule() error { + // Validate required fields for cron-based scheduling + if s.CronExpression == "" { + return fmt.Errorf("cronExpression is required when type is 'cron'") + } + if s.Duration == "" { + return fmt.Errorf("duration is required when type is 'cron'") + } + + // Validate cron expression format + parser := cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow) + _, err := parser.Parse(s.CronExpression) + if err != nil { + return fmt.Errorf("invalid cronExpression format: %v, expected format 'minute hour day month dayofweek' (e.g., '0 9 * * 1-5')", err) + } + + // Validate duration format + duration, err := time.ParseDuration(s.Duration) + if err != nil { + return fmt.Errorf("invalid duration format: %v, expected Go duration format (e.g., '8h', '30m', '1h30m')", err) + } + + // Validate duration is reasonable (not too short or too long) + if duration < time.Minute { + return fmt.Errorf("duration must be at least 1 minute") + } + if duration > 24*time.Hour { + return fmt.Errorf("duration must not exceed 24 hours") + } + + // Validate timezone format if provided + if s.TimeZone != "" && s.TimeZone != "Asia/Tokyo" { + _, err := time.LoadLocation(s.TimeZone) + if err != nil { + return fmt.Errorf("invalid timeZone: %v, expected IANA timezone format (e.g., 'Asia/Tokyo', 'America/New_York')", err) + } + } + + // Validate that time-specific fields are not set + if s.StartAt != "" { + return fmt.Errorf("startAt cannot be set when type is 'cron'") + } + if s.FinishAt != "" { + return fmt.Errorf("finishAt cannot be set when type is 'cron'") + } + + return nil +} + +// GetNextScheduleWindow returns the next scheduled scaling window for cron-based scheduling +func (s *Schedule) GetNextScheduleWindow(from time.Time) (startTime, endTime time.Time, err error) { + if s.Type != ScheduleTypeCron { + return time.Time{}, time.Time{}, fmt.Errorf("GetNextScheduleWindow is only supported for 
cron-based scheduling") + } + + // Parse timezone + location, _ := time.LoadLocation("Asia/Tokyo") // Default to Tokyo timezone + if s.TimeZone != "" && s.TimeZone != "Asia/Tokyo" { + location, err = time.LoadLocation(s.TimeZone) + if err != nil { + return time.Time{}, time.Time{}, fmt.Errorf("failed to parse timezone: %v", err) + } + } + + // Parse duration + duration, err := time.ParseDuration(s.Duration) + if err != nil { + return time.Time{}, time.Time{}, fmt.Errorf("failed to parse duration: %v", err) + } + + // Parse cron expression + parser := cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow) + schedule, err := parser.Parse(s.CronExpression) + if err != nil { + return time.Time{}, time.Time{}, fmt.Errorf("failed to parse cron expression: %v", err) + } + + // Convert from time to the specified timezone + fromInLocation := from.In(location) + + // Get next scheduled time + nextTime := schedule.Next(fromInLocation) + endTimeInLocation := nextTime.Add(duration) + + // Convert back to UTC for consistency + return nextTime.UTC(), endTimeInLocation.UTC(), nil +} + +// IsCurrentlyActive checks if the current time falls within an active scaling window +func (s *Schedule) IsCurrentlyActive(now time.Time) (bool, time.Time, error) { + switch s.Type { + case ScheduleTypeTime: + startTime, err := time.Parse(time.RFC3339, s.StartAt) + if err != nil { + return false, time.Time{}, err + } + finishTime, err := time.Parse(time.RFC3339, s.FinishAt) + if err != nil { + return false, time.Time{}, err + } + + isActive := now.After(startTime) && now.Before(finishTime) + return isActive, finishTime, nil + + case ScheduleTypeCron: + // For cron-based scheduling, we need to check if we're within the current window + // We'll look back to find the most recent start time + lookBackTime := now.Add(-25 * time.Hour) // Look back 25 hours to catch daily schedules + + // Get the most recent schedule window that could be active + startTime, endTime, err := s.GetNextScheduleWindow(lookBackTime) + if err != nil { + return false, time.Time{}, err + } + + // Check if we're currently in this window + for startTime.Before(now) { + if now.Before(endTime) { + // We're in an active window + return true, endTime, nil + } + + // Move to next window + startTime, endTime, err = s.GetNextScheduleWindow(startTime.Add(time.Minute)) + if err != nil { + return false, time.Time{}, err + } + + // Prevent infinite loop - if start time is too far in the future, break + if startTime.After(now.Add(time.Hour)) { + break + } + } + + return false, time.Time{}, nil + + default: + return false, time.Time{}, fmt.Errorf("unsupported schedule type: %s", s.Type) + } +} diff --git a/api/v1alpha1/scheduledscaling_validation_test.go b/api/v1alpha1/scheduledscaling_validation_test.go new file mode 100644 index 00000000..16409498 --- /dev/null +++ b/api/v1alpha1/scheduledscaling_validation_test.go @@ -0,0 +1,411 @@ +package v1alpha1 + +import ( + "testing" + "time" +) + +func TestSchedule_Validate(t *testing.T) { + tests := []struct { + name string + schedule Schedule + wantErr bool + errMsg string + }{ + // Time-based scheduling tests + { + name: "valid time-based schedule", + schedule: Schedule{ + Type: ScheduleTypeTime, + StartAt: "2024-01-15T10:00:00Z", + FinishAt: "2024-01-15T18:00:00Z", + }, + wantErr: false, + }, + { + name: "time-based missing startAt", + schedule: Schedule{ + Type: ScheduleTypeTime, + FinishAt: "2024-01-15T18:00:00Z", + }, + wantErr: true, + errMsg: "startAt is required when type is 'time'", + }, + { + 
name: "time-based missing finishAt", + schedule: Schedule{ + Type: ScheduleTypeTime, + StartAt: "2024-01-15T10:00:00Z", + }, + wantErr: true, + errMsg: "finishAt is required when type is 'time'", + }, + { + name: "time-based invalid startAt format", + schedule: Schedule{ + Type: ScheduleTypeTime, + StartAt: "2024-01-15 10:00:00", + FinishAt: "2024-01-15T18:00:00Z", + }, + wantErr: true, + errMsg: "invalid startAt format", + }, + { + name: "time-based invalid finishAt format", + schedule: Schedule{ + Type: ScheduleTypeTime, + StartAt: "2024-01-15T10:00:00Z", + FinishAt: "2024-01-15 18:00:00", + }, + wantErr: true, + errMsg: "invalid finishAt format", + }, + { + name: "time-based finishAt before startAt", + schedule: Schedule{ + Type: ScheduleTypeTime, + StartAt: "2024-01-15T18:00:00Z", + FinishAt: "2024-01-15T10:00:00Z", + }, + wantErr: true, + errMsg: "must be after startAt", + }, + { + name: "time-based finishAt equals startAt", + schedule: Schedule{ + Type: ScheduleTypeTime, + StartAt: "2024-01-15T10:00:00Z", + FinishAt: "2024-01-15T10:00:00Z", + }, + wantErr: true, + errMsg: "must be after startAt", + }, + { + name: "time-based with cron fields should fail", + schedule: Schedule{ + Type: ScheduleTypeTime, + StartAt: "2024-01-15T10:00:00Z", + FinishAt: "2024-01-15T18:00:00Z", + CronExpression: "0 9 * * *", + }, + wantErr: true, + errMsg: "cronExpression cannot be set when type is 'time'", + }, + + // Cron-based scheduling tests + { + name: "valid cron-based schedule", + schedule: Schedule{ + Type: ScheduleTypeCron, + CronExpression: "0 9 * * 1-5", + Duration: "8h", + TimeZone: "Asia/Tokyo", + }, + wantErr: false, + }, + { + name: "valid cron-based schedule with timezone", + schedule: Schedule{ + Type: ScheduleTypeCron, + CronExpression: "0 9 * * 1-5", + Duration: "8h", + TimeZone: "Asia/Tokyo", + }, + wantErr: false, + }, + { + name: "cron-based missing cronExpression", + schedule: Schedule{ + Type: ScheduleTypeCron, + Duration: "8h", + }, + wantErr: true, + errMsg: "cronExpression is required when type is 'cron'", + }, + { + name: "cron-based missing duration", + schedule: Schedule{ + Type: ScheduleTypeCron, + CronExpression: "0 9 * * 1-5", + }, + wantErr: true, + errMsg: "duration is required when type is 'cron'", + }, + { + name: "cron-based invalid cron expression", + schedule: Schedule{ + Type: ScheduleTypeCron, + CronExpression: "invalid cron", + Duration: "8h", + }, + wantErr: true, + errMsg: "invalid cronExpression format", + }, + { + name: "cron-based invalid duration", + schedule: Schedule{ + Type: ScheduleTypeCron, + CronExpression: "0 9 * * 1-5", + Duration: "invalid duration", + }, + wantErr: true, + errMsg: "invalid duration format", + }, + { + name: "cron-based duration too short", + schedule: Schedule{ + Type: ScheduleTypeCron, + CronExpression: "0 9 * * 1-5", + Duration: "30s", + }, + wantErr: true, + errMsg: "duration must be at least 1 minute", + }, + { + name: "cron-based duration too long", + schedule: Schedule{ + Type: ScheduleTypeCron, + CronExpression: "0 9 * * 1-5", + Duration: "25h", + }, + wantErr: true, + errMsg: "duration must not exceed 24 hours", + }, + { + name: "cron-based invalid timezone", + schedule: Schedule{ + Type: ScheduleTypeCron, + CronExpression: "0 9 * * 1-5", + Duration: "8h", + TimeZone: "Invalid/Timezone", + }, + wantErr: true, + errMsg: "invalid timeZone", + }, + { + name: "cron-based with time fields should fail", + schedule: Schedule{ + Type: ScheduleTypeCron, + CronExpression: "0 9 * * 1-5", + Duration: "8h", + StartAt: 
"2024-01-15T10:00:00Z", + }, + wantErr: true, + errMsg: "startAt cannot be set when type is 'cron'", + }, + + // Invalid type tests + { + name: "invalid schedule type", + schedule: Schedule{ + Type: "invalid", + }, + wantErr: true, + errMsg: "invalid schedule type: invalid", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.schedule.Validate() + if (err != nil) != tt.wantErr { + t.Errorf("Schedule.Validate() error = %v, wantErr %v", err, tt.wantErr) + return + } + if tt.wantErr && tt.errMsg != "" { + if err == nil { + t.Errorf("Expected error containing '%s', got no error", tt.errMsg) + } else if !containsPattern(err.Error(), tt.errMsg) { + t.Errorf("Expected error containing '%s', got '%s'", tt.errMsg, err.Error()) + } + } + }) + } +} + +func TestSchedule_GetNextScheduleWindow(t *testing.T) { + tests := []struct { + name string + schedule Schedule + from time.Time + wantErr bool + errMsg string + }{ + { + name: "valid cron schedule - daily", + schedule: Schedule{ + Type: ScheduleTypeCron, + CronExpression: "0 9 * * *", // Daily at 9 AM + Duration: "8h", + TimeZone: "Asia/Tokyo", + }, + from: time.Date(2024, 1, 15, 8, 0, 0, 0, time.UTC), // 8 AM + wantErr: false, + }, + { + name: "valid cron schedule - weekdays", + schedule: Schedule{ + Type: ScheduleTypeCron, + CronExpression: "0 9 * * 1-5", // Weekdays at 9 AM + Duration: "8h", + TimeZone: "Asia/Tokyo", + }, + from: time.Date(2024, 1, 15, 8, 0, 0, 0, time.UTC), // Monday 8 AM UTC + wantErr: false, + }, + { + name: "time-based schedule should fail", + schedule: Schedule{ + Type: ScheduleTypeTime, + StartAt: "2024-01-15T10:00:00Z", + FinishAt: "2024-01-15T18:00:00Z", + }, + from: time.Date(2024, 1, 15, 8, 0, 0, 0, time.UTC), + wantErr: true, + errMsg: "GetNextScheduleWindow is only supported for cron-based scheduling", + }, + { + name: "invalid cron expression", + schedule: Schedule{ + Type: ScheduleTypeCron, + CronExpression: "invalid cron", + Duration: "8h", + TimeZone: "Asia/Tokyo", + }, + from: time.Date(2024, 1, 15, 8, 0, 0, 0, time.UTC), + wantErr: true, + errMsg: "failed to parse cron expression", + }, + { + name: "invalid duration", + schedule: Schedule{ + Type: ScheduleTypeCron, + CronExpression: "0 9 * * *", + Duration: "invalid", + TimeZone: "Asia/Tokyo", + }, + from: time.Date(2024, 1, 15, 8, 0, 0, 0, time.UTC), + wantErr: true, + errMsg: "failed to parse duration", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + startTime, endTime, err := tt.schedule.GetNextScheduleWindow(tt.from) + if (err != nil) != tt.wantErr { + t.Errorf("Schedule.GetNextScheduleWindow() error = %v, wantErr %v", err, tt.wantErr) + return + } + + if tt.wantErr { + if tt.errMsg != "" && err != nil { + if !containsPattern(err.Error(), tt.errMsg) { + t.Errorf("Expected error containing '%s', got '%s'", tt.errMsg, err.Error()) + } + } + } else { + // Validate that we got valid times + if startTime.IsZero() || endTime.IsZero() { + t.Errorf("Expected valid start and end times, got start=%v, end=%v", startTime, endTime) + } + if !endTime.After(startTime) { + t.Errorf("End time should be after start time, got start=%v, end=%v", startTime, endTime) + } + if !startTime.After(tt.from) { + t.Errorf("Start time should be after 'from' time, got start=%v, from=%v", startTime, tt.from) + } + } + }) + } +} + +func TestSchedule_IsCurrentlyActive(t *testing.T) { + tests := []struct { + name string + schedule Schedule + now time.Time + wantActive bool + wantErr bool + }{ + { + name: "time-based - before 
start", + schedule: Schedule{ + Type: ScheduleTypeTime, + StartAt: "2024-01-15T10:00:00Z", + FinishAt: "2024-01-15T18:00:00Z", + }, + now: time.Date(2024, 1, 15, 9, 0, 0, 0, time.UTC), + wantActive: false, + wantErr: false, + }, + { + name: "time-based - during period", + schedule: Schedule{ + Type: ScheduleTypeTime, + StartAt: "2024-01-15T10:00:00Z", + FinishAt: "2024-01-15T18:00:00Z", + }, + now: time.Date(2024, 1, 15, 14, 0, 0, 0, time.UTC), + wantActive: true, + wantErr: false, + }, + { + name: "time-based - after end", + schedule: Schedule{ + Type: ScheduleTypeTime, + StartAt: "2024-01-15T10:00:00Z", + FinishAt: "2024-01-15T18:00:00Z", + }, + now: time.Date(2024, 1, 15, 19, 0, 0, 0, time.UTC), + wantActive: false, + wantErr: false, + }, + // Note: Cron-based tests are complex and would require mocking time + // In real implementation, you'd test with specific known schedules + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + isActive, _, err := tt.schedule.IsCurrentlyActive(tt.now) + if (err != nil) != tt.wantErr { + t.Errorf("Schedule.IsCurrentlyActive() error = %v, wantErr %v", err, tt.wantErr) + return + } + if isActive != tt.wantActive { + t.Errorf("Schedule.IsCurrentlyActive() = %v, want %v", isActive, tt.wantActive) + } + }) + } +} + +// Helper function to check if error message contains expected pattern +func containsPattern(text, pattern string) bool { + // For simplicity, just check if pattern is a substring + // In real implementation, you might want to use regex matching + return len(text) > 0 && (pattern == "" || + len(pattern) > 0 && len(text) >= len(pattern) && + findSubstring(text, pattern)) +} + +func findSubstring(text, substr string) bool { + if len(substr) == 0 { + return true + } + if len(text) < len(substr) { + return false + } + for i := 0; i <= len(text)-len(substr); i++ { + match := true + for j := 0; j < len(substr); j++ { + if text[i+j] != substr[j] { + match = false + break + } + } + if match { + return true + } + } + return false +} diff --git a/config/crd/bases/autoscaling.mercari.com_scheduledscalings.yaml b/config/crd/bases/autoscaling.mercari.com_scheduledscalings.yaml index 06feb368..a3bd0bb8 100644 --- a/config/crd/bases/autoscaling.mercari.com_scheduledscalings.yaml +++ b/config/crd/bases/autoscaling.mercari.com_scheduledscalings.yaml @@ -58,6 +58,17 @@ spec: schedule: description: Schedule defines when the scaling should occur properties: + cronExpression: + description: |- + Cron-based scheduling fields (used when type="cron") + CronExpression defines when scaling periods should start using cron format + Format: "minute hour day month dayofweek" (e.g., "0 9 * * 1-5" for 9 AM weekdays) + type: string + duration: + description: |- + Duration specifies how long each scaling period should last + Format: Go duration (e.g., "8h", "30m", "1h30m") + type: string finishAt: description: |- FinishAt specifies when the scaling should end and return to normal @@ -65,12 +76,24 @@ spec: type: string startAt: description: |- + Time-based scheduling fields (used when type="time") StartAt specifies when the scaling should begin Format: RFC3339 (e.g., "2024-01-15T10:00:00Z") type: string + timeZone: + default: Asia/Tokyo + description: |- + TimeZone specifies the timezone for cron-based scheduling + Format: IANA timezone (e.g., "Asia/Tokyo", "UTC", "America/New_York") + type: string + type: + description: 'Type specifies the scheduling type: "time" or "cron"' + enum: + - time + - cron + type: string required: - - finishAt - - startAt + - type 
type: object status: default: Inactive diff --git a/go.mod b/go.mod index 72c6b751..0c4abcc1 100644 --- a/go.mod +++ b/go.mod @@ -25,6 +25,7 @@ require ( require ( github.com/kyokomi/emoji/v2 v2.2.12 + github.com/robfig/cron/v3 v3.0.1 github.com/spf13/cobra v1.8.1 ) diff --git a/go.sum b/go.sum index 40b5f63a..cc13040a 100644 --- a/go.sum +++ b/go.sum @@ -93,6 +93,8 @@ github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8= github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= +github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs= +github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro= github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= diff --git a/internal/controller/scheduledscaling_controller.go b/internal/controller/scheduledscaling_controller.go index 7d674d46..313377ae 100644 --- a/internal/controller/scheduledscaling_controller.go +++ b/internal/controller/scheduledscaling_controller.go @@ -59,27 +59,34 @@ func (r *ScheduledScalingReconciler) Reconcile(ctx context.Context, req ctrl.Req return ctrl.Result{}, client.IgnoreNotFound(err) } - logger.Info("Reconciling ScheduledScaling", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + logger.Info("Reconciling ScheduledScaling", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace, "type", scheduledScaling.Spec.Schedule.Type) - // Parse the schedule times - startTime, err := time.Parse(time.RFC3339, scheduledScaling.Spec.Schedule.StartAt) - if err != nil { - logger.Error(err, "Failed to parse start time", "startAt", scheduledScaling.Spec.Schedule.StartAt) - return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseFailed, "InvalidStartTime", err.Error()) + // Validate the schedule configuration + if err := scheduledScaling.Spec.Schedule.Validate(); err != nil { + logger.Error(err, "Invalid schedule configuration") + return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseFailed, "InvalidSchedule", err.Error()) } - finishTime, err := time.Parse(time.RFC3339, scheduledScaling.Spec.Schedule.FinishAt) - if err != nil { - logger.Error(err, "Failed to parse finish time", "finishAt", scheduledScaling.Spec.Schedule.FinishAt) - return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseFailed, "InvalidFinishTime", err.Error()) + // Handle scheduling based on type + switch scheduledScaling.Spec.Schedule.Type { + case autoscalingv1alpha1.ScheduleTypeTime: + return r.handleTimeBasedScheduling(ctx, scheduledScaling) + case autoscalingv1alpha1.ScheduleTypeCron: + return r.handleCronBasedScheduling(ctx, scheduledScaling) + default: + err := fmt.Errorf("unsupported schedule type: %s", scheduledScaling.Spec.Schedule.Type) + logger.Error(err, "Invalid schedule type") + return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseFailed, "InvalidScheduleType", err.Error()) } +} - // Validate that finish time is after start time - 
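For context, a standalone sketch of how a next cron window can be computed with the newly added github.com/robfig/cron/v3 dependency; nextWindow is a hypothetical helper here, and the repo's own GetNextScheduleWindow is only assumed to build on the same primitives:

package main

import (
	"fmt"
	"time"

	"github.com/robfig/cron/v3"
)

// nextWindow returns the next [start, end) scaling window for a standard
// five-field cron expression, a Go duration string, and an IANA timezone.
func nextWindow(expr, durationStr, tz string, now time.Time) (time.Time, time.Time, error) {
	loc, err := time.LoadLocation(tz)
	if err != nil {
		return time.Time{}, time.Time{}, fmt.Errorf("invalid timeZone %q: %w", tz, err)
	}
	sched, err := cron.ParseStandard(expr)
	if err != nil {
		return time.Time{}, time.Time{}, fmt.Errorf("invalid cronExpression format: %w", err)
	}
	d, err := time.ParseDuration(durationStr)
	if err != nil {
		return time.Time{}, time.Time{}, fmt.Errorf("invalid duration format: %w", err)
	}
	start := sched.Next(now.In(loc)) // Next returns the first tick strictly after the given time
	return start, start.Add(d), nil
}

func main() {
	start, end, err := nextWindow("0 9 * * 1-5", "8h", "Asia/Tokyo", time.Now())
	fmt.Println(start, end, err)
}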
if finishTime.Before(startTime) || finishTime.Equal(startTime) { - err := fmt.Errorf("finish time must be after start time") - logger.Error(err, "Invalid schedule", "startAt", startTime, "finishAt", finishTime) - return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseFailed, "InvalidSchedule", err.Error()) - } +// handleTimeBasedScheduling handles scheduling based on specific start/end times +func (r *ScheduledScalingReconciler) handleTimeBasedScheduling(ctx context.Context, scheduledScaling *autoscalingv1alpha1.ScheduledScaling) (ctrl.Result, error) { + logger := log.FromContext(ctx) + + // Parse the schedule times (already validated by Validate()) + startTime, _ := time.Parse(time.RFC3339, scheduledScaling.Spec.Schedule.StartAt) + finishTime, _ := time.Parse(time.RFC3339, scheduledScaling.Spec.Schedule.FinishAt) now := time.Now() var newPhase autoscalingv1alpha1.ScheduledScalingPhase @@ -91,7 +98,7 @@ func (r *ScheduledScalingReconciler) Reconcile(ctx context.Context, req ctrl.Req timeUntilStart := startTime.Sub(now) // Set status to Pending if not already set if scheduledScaling.Status.Phase != autoscalingv1alpha1.ScheduledScalingPhasePending { - return ctrl.Result{RequeueAfter: timeUntilStart}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhasePending, "Waiting", "Waiting for scheduled scaling to begin") + return ctrl.Result{RequeueAfter: timeUntilStart}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhasePending, "Waiting", fmt.Sprintf("Waiting for scheduled scaling to begin at %s", startTime.Format(time.RFC3339))) } return ctrl.Result{RequeueAfter: timeUntilStart}, nil } else if now.After(finishTime) { @@ -113,7 +120,7 @@ func (r *ScheduledScalingReconciler) Reconcile(ctx context.Context, req ctrl.Req // Set status to Active if not already set if scheduledScaling.Status.Phase != autoscalingv1alpha1.ScheduledScalingPhaseActive { - return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseActive, "Active", "Scheduled scaling is currently active") + return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseActive, "Active", fmt.Sprintf("Scheduled scaling is active until %s", finishTime.Format(time.RFC3339))) } // Calculate time until finish @@ -129,6 +136,87 @@ func (r *ScheduledScalingReconciler) Reconcile(ctx context.Context, req ctrl.Req return ctrl.Result{}, nil } +// handleCronBasedScheduling handles scheduling based on cron expressions +func (r *ScheduledScalingReconciler) handleCronBasedScheduling(ctx context.Context, scheduledScaling *autoscalingv1alpha1.ScheduledScaling) (ctrl.Result, error) { + logger := log.FromContext(ctx) + now := time.Now() + + // Check if we're currently in an active scaling window + isActive, endTime, err := scheduledScaling.Spec.Schedule.IsCurrentlyActive(now) + if err != nil { + logger.Error(err, "Failed to check if schedule is currently active") + return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseFailed, "ScheduleCheckFailed", err.Error()) + } + + if isActive { + // We're in an active scaling period + // Apply scheduled scaling + if err := r.applyScheduledScaling(ctx, scheduledScaling); err != nil { + logger.Error(err, "Failed to apply scheduled scaling") + return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseFailed, "ScalingFailed", err.Error()) + } + + // Set status 
to Active if not already set + if scheduledScaling.Status.Phase != autoscalingv1alpha1.ScheduledScalingPhaseActive { + return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseActive, "Active", fmt.Sprintf("Cron-based scheduled scaling is active until %s", endTime.Format(time.RFC3339))) + } + + // Calculate time until this window ends + timeUntilEnd := endTime.Sub(now) + // Add a small buffer to ensure we catch the transition + requeueAfter := timeUntilEnd + time.Minute + if requeueAfter < time.Minute { + requeueAfter = time.Minute + } + return ctrl.Result{RequeueAfter: requeueAfter}, nil + } else { + // We're not in an active scaling period + // Apply normal scaling (restore original settings) if we were previously active + if scheduledScaling.Status.Phase == autoscalingv1alpha1.ScheduledScalingPhaseActive { + if err := r.applyNormalScaling(ctx, scheduledScaling); err != nil { + logger.Error(err, "Failed to apply normal scaling") + return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseFailed, "RestoreFailed", err.Error()) + } + } + + // Find the next scheduled window + nextStart, nextEnd, err := scheduledScaling.Spec.Schedule.GetNextScheduleWindow(now) + if err != nil { + logger.Error(err, "Failed to calculate next schedule window") + return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseFailed, "NextScheduleFailed", err.Error()) + } + + // Set status to Pending if not already set + if scheduledScaling.Status.Phase != autoscalingv1alpha1.ScheduledScalingPhasePending { + message := fmt.Sprintf("Next scheduled scaling window: %s to %s (cron: %s)", + nextStart.Format(time.RFC3339), + nextEnd.Format(time.RFC3339), + scheduledScaling.Spec.Schedule.CronExpression) + + return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhasePending, "WaitingForNext", message) + } + + // Calculate time until next start + timeUntilNext := nextStart.Sub(now) + // Add a small buffer but ensure reasonable polling interval + requeueAfter := timeUntilNext + if requeueAfter > 10*time.Minute { + // For far future schedules, check every 10 minutes + requeueAfter = 10 * time.Minute + } else if requeueAfter < time.Minute { + // For immediate schedules, check in 1 minute + requeueAfter = time.Minute + } + + logger.V(1).Info("Cron schedule waiting for next window", + "nextStart", nextStart, + "timeUntilNext", timeUntilNext, + "requeueAfter", requeueAfter) + + return ctrl.Result{RequeueAfter: requeueAfter}, nil + } +} + // applyScheduledScaling applies the scheduled scaling configuration to the target Tortoise func (r *ScheduledScalingReconciler) applyScheduledScaling(ctx context.Context, scheduledScaling *autoscalingv1alpha1.ScheduledScaling) error { // Get the target Tortoise diff --git a/internal/controller/scheduledscaling_controller_test.go b/internal/controller/scheduledscaling_controller_test.go index af5fc4c3..7537241b 100644 --- a/internal/controller/scheduledscaling_controller_test.go +++ b/internal/controller/scheduledscaling_controller_test.go @@ -50,6 +50,7 @@ var _ = Describe("ScheduledScaling Controller", func() { }, Spec: autoscalingv1alpha1.ScheduledScalingSpec{ Schedule: autoscalingv1alpha1.Schedule{ + Type: autoscalingv1alpha1.ScheduleTypeTime, StartAt: time.Now().Add(time.Hour).Format(time.RFC3339), FinishAt: time.Now().Add(2 * time.Hour).Format(time.RFC3339), }, @@ -101,6 +102,7 @@ var _ = Describe("ScheduledScaling 
Controller", func() { }, Spec: autoscalingv1alpha1.ScheduledScalingSpec{ Schedule: autoscalingv1alpha1.Schedule{ + Type: autoscalingv1alpha1.ScheduleTypeTime, StartAt: time.Now().Add(2 * time.Hour).Format(time.RFC3339), FinishAt: time.Now().Add(time.Hour).Format(time.RFC3339), // Invalid: finish before start }, @@ -171,7 +173,7 @@ var _ = Describe("ScheduledScaling Controller", func() { ss := &autoscalingv1alpha1.ScheduledScaling{ ObjectMeta: metav1.ObjectMeta{Name: "test-apply-restore", Namespace: "default"}, Spec: autoscalingv1alpha1.ScheduledScalingSpec{ - Schedule: autoscalingv1alpha1.Schedule{StartAt: start, FinishAt: finish}, + Schedule: autoscalingv1alpha1.Schedule{Type: autoscalingv1alpha1.ScheduleTypeTime, StartAt: start, FinishAt: finish}, TargetRefs: autoscalingv1alpha1.TargetRefs{TortoiseName: t.Name}, Strategy: autoscalingv1alpha1.Strategy{Static: autoscalingv1alpha1.StaticStrategy{ MinimumMinReplicas: 5, @@ -205,4 +207,376 @@ var _ = Describe("ScheduledScaling Controller", func() { }, time.Second*12, interval).Should(Succeed()) }) }) + + Context("Cron-based Scheduled Scaling", func() { + It("Should handle cron-based scheduled scaling creation", func() { + ctx := context.Background() + + // Create a baseline Tortoise that ScheduledScaling will target + t := &autoscalingv1beta3.Tortoise{ + ObjectMeta: metav1.ObjectMeta{Name: "test-tortoise-default-tz-unique", Namespace: "default"}, + Spec: autoscalingv1beta3.TortoiseSpec{ + ResourcePolicy: []autoscalingv1beta3.ContainerResourcePolicy{ + { + ContainerName: "app", + MinAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("100m"), + v1.ResourceMemory: resource.MustParse("128Mi"), + }, + }, + }, + }, + } + Expect(k8sClient.Create(ctx, t)).Should(Succeed()) + + // Create a test cron-based ScheduledScaling resource + scheduledScaling := &autoscalingv1alpha1.ScheduledScaling{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cron-scaling", + Namespace: "default", + }, + Spec: autoscalingv1alpha1.ScheduledScalingSpec{ + Schedule: autoscalingv1alpha1.Schedule{ + Type: autoscalingv1alpha1.ScheduleTypeCron, + CronExpression: "0 9 * * 1-5", // Weekdays at 9 AM + Duration: "8h", // 8-hour duration + TimeZone: "Asia/Tokyo", + }, + TargetRefs: autoscalingv1alpha1.TargetRefs{ + TortoiseName: "test-tortoise-cron", + }, + Strategy: autoscalingv1alpha1.Strategy{ + Static: autoscalingv1alpha1.StaticStrategy{ + MinimumMinReplicas: 5, + MinAllocatedResources: autoscalingv1alpha1.ResourceRequirements{ + CPU: "1000m", + Memory: "1Gi", + }, + }, + }, + }, + } + + Expect(k8sClient.Create(ctx, scheduledScaling)).Should(Succeed()) + + // Verify the resource was created and status is set correctly + resourceLookupKey := types.NamespacedName{ + Name: scheduledScaling.Name, + Namespace: scheduledScaling.Namespace, + } + createdResource := &autoscalingv1alpha1.ScheduledScaling{} + + // For cron-based schedules, should be in Active state if currently within schedule window + Eventually(func() autoscalingv1alpha1.ScheduledScalingPhase { + err := k8sClient.Get(ctx, resourceLookupKey, createdResource) + if err != nil { + return "" + } + return createdResource.Status.Phase + }, timeout, interval).Should(Or(Equal(autoscalingv1alpha1.ScheduledScalingPhasePending), Equal(autoscalingv1alpha1.ScheduledScalingPhaseActive))) + + // Verify the status message contains cron information + Expect(createdResource.Status.Message).Should(ContainSubstring("cron: 0 9 * * 1-5")) + + // Clean up + Expect(k8sClient.Delete(ctx, scheduledScaling)).Should(Succeed()) + 
Expect(k8sClient.Delete(ctx, t)).Should(Succeed()) + }) + + It("Should reject invalid cron expressions", func() { + ctx := context.Background() + + // Create a baseline Tortoise that ScheduledScaling will target + t := &autoscalingv1beta3.Tortoise{ + ObjectMeta: metav1.ObjectMeta{Name: "test-tortoise-invalid-cron", Namespace: "default"}, + Spec: autoscalingv1beta3.TortoiseSpec{ + ResourcePolicy: []autoscalingv1beta3.ContainerResourcePolicy{ + { + ContainerName: "app", + MinAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("100m"), + v1.ResourceMemory: resource.MustParse("128Mi"), + }, + }, + }, + }, + } + Expect(k8sClient.Create(ctx, t)).Should(Succeed()) + + // Create a cron ScheduledScaling with invalid cron expression + scheduledScaling := &autoscalingv1alpha1.ScheduledScaling{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-invalid-cron", + Namespace: "default", + }, + Spec: autoscalingv1alpha1.ScheduledScalingSpec{ + Schedule: autoscalingv1alpha1.Schedule{ + Type: autoscalingv1alpha1.ScheduleTypeCron, + CronExpression: "invalid cron expression", // Invalid cron + Duration: "8h", + TimeZone: "Asia/Tokyo", + }, + TargetRefs: autoscalingv1alpha1.TargetRefs{ + TortoiseName: "test-tortoise-invalid-cron", + }, + Strategy: autoscalingv1alpha1.Strategy{ + Static: autoscalingv1alpha1.StaticStrategy{ + MinimumMinReplicas: 3, + MinAllocatedResources: autoscalingv1alpha1.ResourceRequirements{ + CPU: "500m", + Memory: "512Mi", + }, + }, + }, + }, + } + + Expect(k8sClient.Create(ctx, scheduledScaling)).Should(Succeed()) + + // Wait for the controller to process and set status to Failed + resourceLookupKey := types.NamespacedName{ + Name: scheduledScaling.Name, + Namespace: scheduledScaling.Namespace, + } + createdResource := &autoscalingv1alpha1.ScheduledScaling{} + + Eventually(func() autoscalingv1alpha1.ScheduledScalingPhase { + err := k8sClient.Get(ctx, resourceLookupKey, createdResource) + if err != nil { + return "" + } + return createdResource.Status.Phase + }, timeout, interval).Should(Equal(autoscalingv1alpha1.ScheduledScalingPhaseFailed)) + + // Verify the error reason + Expect(createdResource.Status.Reason).Should(Equal("InvalidSchedule")) + Expect(createdResource.Status.Message).Should(ContainSubstring("invalid cronExpression format")) + + // Clean up + Expect(k8sClient.Delete(ctx, scheduledScaling)).Should(Succeed()) + Expect(k8sClient.Delete(ctx, t)).Should(Succeed()) + }) + + It("Should reject missing duration for cron schedule", func() { + ctx := context.Background() + + // Create a baseline Tortoise that ScheduledScaling will target + t := &autoscalingv1beta3.Tortoise{ + ObjectMeta: metav1.ObjectMeta{Name: "test-tortoise-missing-duration", Namespace: "default"}, + Spec: autoscalingv1beta3.TortoiseSpec{ + ResourcePolicy: []autoscalingv1beta3.ContainerResourcePolicy{ + { + ContainerName: "app", + MinAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("100m"), + v1.ResourceMemory: resource.MustParse("128Mi"), + }, + }, + }, + }, + } + Expect(k8sClient.Create(ctx, t)).Should(Succeed()) + + // Create a cron ScheduledScaling without duration + scheduledScaling := &autoscalingv1alpha1.ScheduledScaling{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-missing-duration", + Namespace: "default", + }, + Spec: autoscalingv1alpha1.ScheduledScalingSpec{ + Schedule: autoscalingv1alpha1.Schedule{ + Type: autoscalingv1alpha1.ScheduleTypeCron, + CronExpression: "0 9 * * 1-5", + // Duration: missing + TimeZone: "Asia/Tokyo", + }, + TargetRefs: 
autoscalingv1alpha1.TargetRefs{ + TortoiseName: "test-tortoise-missing-duration", + }, + Strategy: autoscalingv1alpha1.Strategy{ + Static: autoscalingv1alpha1.StaticStrategy{ + MinimumMinReplicas: 3, + MinAllocatedResources: autoscalingv1alpha1.ResourceRequirements{ + CPU: "500m", + Memory: "512Mi", + }, + }, + }, + }, + } + + Expect(k8sClient.Create(ctx, scheduledScaling)).Should(Succeed()) + + // Wait for the controller to process and set status to Failed + resourceLookupKey := types.NamespacedName{ + Name: scheduledScaling.Name, + Namespace: scheduledScaling.Namespace, + } + createdResource := &autoscalingv1alpha1.ScheduledScaling{} + + Eventually(func() autoscalingv1alpha1.ScheduledScalingPhase { + err := k8sClient.Get(ctx, resourceLookupKey, createdResource) + if err != nil { + return "" + } + return createdResource.Status.Phase + }, timeout, interval).Should(Equal(autoscalingv1alpha1.ScheduledScalingPhaseFailed)) + + // Verify the error reason + Expect(createdResource.Status.Reason).Should(Equal("InvalidSchedule")) + Expect(createdResource.Status.Message).Should(ContainSubstring("duration is required when type is 'cron'")) + + // Clean up + Expect(k8sClient.Delete(ctx, scheduledScaling)).Should(Succeed()) + Expect(k8sClient.Delete(ctx, t)).Should(Succeed()) + }) + + It("Should reject invalid timezone", func() { + ctx := context.Background() + + // Create a baseline Tortoise that ScheduledScaling will target + t := &autoscalingv1beta3.Tortoise{ + ObjectMeta: metav1.ObjectMeta{Name: "test-tortoise-invalid-tz", Namespace: "default"}, + Spec: autoscalingv1beta3.TortoiseSpec{ + ResourcePolicy: []autoscalingv1beta3.ContainerResourcePolicy{ + { + ContainerName: "app", + MinAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("100m"), + v1.ResourceMemory: resource.MustParse("128Mi"), + }, + }, + }, + }, + } + Expect(k8sClient.Create(ctx, t)).Should(Succeed()) + + // Create a cron ScheduledScaling with invalid timezone + scheduledScaling := &autoscalingv1alpha1.ScheduledScaling{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-invalid-timezone", + Namespace: "default", + }, + Spec: autoscalingv1alpha1.ScheduledScalingSpec{ + Schedule: autoscalingv1alpha1.Schedule{ + Type: autoscalingv1alpha1.ScheduleTypeCron, + CronExpression: "0 9 * * 1-5", + Duration: "8h", + TimeZone: "Invalid/Timezone", // Invalid timezone + }, + TargetRefs: autoscalingv1alpha1.TargetRefs{ + TortoiseName: "test-tortoise-invalid-tz", + }, + Strategy: autoscalingv1alpha1.Strategy{ + Static: autoscalingv1alpha1.StaticStrategy{ + MinimumMinReplicas: 3, + MinAllocatedResources: autoscalingv1alpha1.ResourceRequirements{ + CPU: "500m", + Memory: "512Mi", + }, + }, + }, + }, + } + + Expect(k8sClient.Create(ctx, scheduledScaling)).Should(Succeed()) + + // Wait for the controller to process and set status to Failed + resourceLookupKey := types.NamespacedName{ + Name: scheduledScaling.Name, + Namespace: scheduledScaling.Namespace, + } + createdResource := &autoscalingv1alpha1.ScheduledScaling{} + + Eventually(func() autoscalingv1alpha1.ScheduledScalingPhase { + err := k8sClient.Get(ctx, resourceLookupKey, createdResource) + if err != nil { + return "" + } + return createdResource.Status.Phase + }, timeout, interval).Should(Equal(autoscalingv1alpha1.ScheduledScalingPhaseFailed)) + + // Verify the error reason + Expect(createdResource.Status.Reason).Should(Equal("InvalidSchedule")) + Expect(createdResource.Status.Message).Should(ContainSubstring("invalid timeZone")) + + // Clean up + Expect(k8sClient.Delete(ctx, 
scheduledScaling)).Should(Succeed()) + Expect(k8sClient.Delete(ctx, t)).Should(Succeed()) + }) + + It("Should handle cron with default timezone", func() { + ctx := context.Background() + + // Create a baseline Tortoise that ScheduledScaling will target + t := &autoscalingv1beta3.Tortoise{ + ObjectMeta: metav1.ObjectMeta{Name: "test-tortoise-cron", Namespace: "default"}, + Spec: autoscalingv1beta3.TortoiseSpec{ + ResourcePolicy: []autoscalingv1beta3.ContainerResourcePolicy{ + { + ContainerName: "app", + MinAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("100m"), + v1.ResourceMemory: resource.MustParse("128Mi"), + }, + }, + }, + }, + } + Expect(k8sClient.Create(ctx, t)).Should(Succeed()) + + // Create a cron ScheduledScaling without explicitly setting timezone (should default to Asia/Tokyo) + scheduledScaling := &autoscalingv1alpha1.ScheduledScaling{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-default-timezone", + Namespace: "default", + }, + Spec: autoscalingv1alpha1.ScheduledScalingSpec{ + Schedule: autoscalingv1alpha1.Schedule{ + Type: autoscalingv1alpha1.ScheduleTypeCron, + CronExpression: "0 9 * * 1-5", + Duration: "8h", + // TimeZone: not specified, should default to Asia/Tokyo + }, + TargetRefs: autoscalingv1alpha1.TargetRefs{ + TortoiseName: "test-tortoise", + }, + Strategy: autoscalingv1alpha1.Strategy{ + Static: autoscalingv1alpha1.StaticStrategy{ + MinimumMinReplicas: 3, + MinAllocatedResources: autoscalingv1alpha1.ResourceRequirements{ + CPU: "500m", + Memory: "512Mi", + }, + }, + }, + }, + } + + Expect(k8sClient.Create(ctx, scheduledScaling)).Should(Succeed()) + + // Verify the resource was created successfully (should not fail validation) + resourceLookupKey := types.NamespacedName{ + Name: scheduledScaling.Name, + Namespace: scheduledScaling.Namespace, + } + createdResource := &autoscalingv1alpha1.ScheduledScaling{} + + // Should be in Pending state (waiting for schedule), not Failed + Eventually(func() autoscalingv1alpha1.ScheduledScalingPhase { + err := k8sClient.Get(ctx, resourceLookupKey, createdResource) + if err != nil { + return "" + } + return createdResource.Status.Phase + }, timeout, interval).Should(Equal(autoscalingv1alpha1.ScheduledScalingPhasePending)) + + // Should not have error messages + Expect(createdResource.Status.Phase).ShouldNot(Equal(autoscalingv1alpha1.ScheduledScalingPhaseFailed)) + + // Clean up + Expect(k8sClient.Delete(ctx, scheduledScaling)).Should(Succeed()) + Expect(k8sClient.Delete(ctx, t)).Should(Succeed()) + }) + }) }) diff --git a/pkg/hpa/service.go b/pkg/hpa/service.go index 9f8faf65..364f2dd2 100644 --- a/pkg/hpa/service.go +++ b/pkg/hpa/service.go @@ -858,15 +858,42 @@ func (c *Service) IsHpaMetricAvailable(ctx context.Context, tortoise *autoscalin for _, condition := range conditions { if condition.Type == "ScalingActive" && condition.Status == "False" && condition.Reason == "FailedGetResourceMetric" { - // Always apply grace period before triggering emergency mode to handle temporary - // metric unavailability during HPA updates, deployments, scheduled scaling, etc. 
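The default-timezone test above relies on the empty TimeZone field resolving to Asia/Tokyo, matching the CRD default; a later patch in this series adds a getTimezone helper with the same fallback. A standalone sketch of that resolution (defaultLocation is a hypothetical name):

package main

import (
	"fmt"
	"time"
)

// defaultLocation resolves an optional IANA timezone name, falling back to
// Asia/Tokyo when the field is empty or cannot be loaded.
func defaultLocation(tz string) *time.Location {
	if tz == "" {
		tz = "Asia/Tokyo" // CRD default
	}
	loc, err := time.LoadLocation(tz)
	if err != nil {
		loc, _ = time.LoadLocation("Asia/Tokyo") // fall back on bad input
	}
	return loc
}

func main() {
	fmt.Println(defaultLocation(""))                 // Asia/Tokyo
	fmt.Println(defaultLocation("America/New_York")) // America/New_York
}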
+ // Enhanced grace period logic that handles frequent pod restarts more robustly conditionAge := time.Since(condition.LastTransitionTime.Time) + + // Use HPA creation time as fallback for very frequent condition changes + // This prevents grace period from being reset constantly during pod restart storms + hpaAge := time.Since(currenthpa.CreationTimestamp.Time) + + // If the condition is very new (< 30 seconds) but HPA has been having issues for longer, + // use a minimum grace period based on recent condition changes rather than resetting completely + if conditionAge < 30*time.Second && hpaAge > c.emergencyModeGracePeriod { + // Look for any persistent failure pattern - if the HPA has been consistently failing, + // don't reset the grace period for every small condition change + logger.Info("HPA condition recently changed but HPA has persistent issues, using reduced grace period", + "gracePeriod", c.emergencyModeGracePeriod, + "conditionAge", conditionAge, + "hpaAge", hpaAge) + + // Give a minimum 2-minute grace period even if condition just changed + // This prevents constant emergency mode cycling during pod restart storms + if conditionAge < 2*time.Minute { + logger.Info("HPA metric temporarily unavailable during pod restarts, giving minimum grace time", + "gracePeriod", c.emergencyModeGracePeriod, + "conditionAge", conditionAge, + "minGracePeriod", 2*time.Minute) + return true + } + } + + // Standard grace period logic if conditionAge < c.emergencyModeGracePeriod { logger.Info("HPA metric temporarily unavailable, giving grace time before emergency mode", "gracePeriod", c.emergencyModeGracePeriod, "conditionAge", conditionAge) return true // Give grace period before emergency mode } + // Grace period expired, switch to Emergency mode since no metrics logger.Info("HPA failed to get resource metrics after grace period, switch to emergency mode", "gracePeriod", c.emergencyModeGracePeriod, From baff2d32de3903ea5f1a10a45109f9a2a7b490f9 Mon Sep 17 00:00:00 2001 From: Bharadwaj Anupindi Date: Thu, 28 Aug 2025 09:35:11 +0900 Subject: [PATCH 07/22] Extend kubectl get output to include more info --- api/v1alpha1/scheduledscaling_types.go | 450 +++++++++++++++++- api/v1alpha1/zz_generated.deepcopy.go | 12 + ...scaling.mercari.com_scheduledscalings.yaml | 46 +- .../controller/scheduledscaling_controller.go | 74 +++ .../scheduledscaling_controller_test.go | 14 +- 5 files changed, 586 insertions(+), 10 deletions(-) diff --git a/api/v1alpha1/scheduledscaling_types.go b/api/v1alpha1/scheduledscaling_types.go index e2a54bbe..67563d27 100644 --- a/api/v1alpha1/scheduledscaling_types.go +++ b/api/v1alpha1/scheduledscaling_types.go @@ -1,6 +1,11 @@ package v1alpha1 import ( + "fmt" + "strconv" + "strings" + "time" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -132,6 +137,27 @@ type ScheduledScalingStatus struct { // Reason indicates why the scheduled scaling is in the current phase Reason string `json:"reason,omitempty"` + + // CurrentWindowStart indicates the start of the current scaling window (for cron-based scheduling) + CurrentWindowStart *metav1.Time `json:"currentWindowStart,omitempty"` + + // CurrentWindowEnd indicates the end of the current scaling window (for cron-based scheduling) + CurrentWindowEnd *metav1.Time `json:"currentWindowEnd,omitempty"` + + // NextWindowStart indicates when the next scaling window will start (for cron-based scheduling) + NextWindowStart *metav1.Time `json:"nextWindowStart,omitempty"` + + // HumanReadableSchedule provides a human-readable description of the 
schedule + HumanReadableSchedule string `json:"humanReadableSchedule,omitempty"` + + // FormattedStartTime provides a human-readable formatted start time + FormattedStartTime string `json:"formattedStartTime,omitempty"` + + // FormattedEndTime provides a human-readable formatted end time + FormattedEndTime string `json:"formattedEndTime,omitempty"` + + // FormattedNextStartTime provides a human-readable formatted next start time + FormattedNextStartTime string `json:"formattedNextStartTime,omitempty"` } // ScheduledScalingPhase represents the phase of a scheduled scaling operation @@ -151,9 +177,12 @@ const ( //+kubebuilder:object:root=true //+kubebuilder:subresource:status //+kubebuilder:printcolumn:name="Status",type="string",JSONPath=".status.phase" -//+kubebuilder:printcolumn:name="Start Time",type="string",JSONPath=".spec.schedule.startAt" -//+kubebuilder:printcolumn:name="End Time",type="string",JSONPath=".spec.schedule.finishAt" -//+kubebuilder:printcolumn:name="Target",type="string",JSONPath=".spec.targetRefs.tortoiseName" +//+kubebuilder:printcolumn:name="Type",type="string",JSONPath=".spec.schedule.type" +//+kubebuilder:printcolumn:name="Start Time",type="string",JSONPath=".status.formattedStartTime" +//+kubebuilder:printcolumn:name="End Time",type="string",JSONPath=".status.formattedEndTime" +//+kubebuilder:printcolumn:name="Next Start",type="string",JSONPath=".status.formattedNextStartTime" +//+kubebuilder:printcolumn:name="Schedule",type="string",JSONPath=".status.humanReadableSchedule" +//+kubebuilder:printcolumn:name="Target Tortoise",type="string",JSONPath=".spec.targetRefs.tortoiseName" //+kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp" // ScheduledScaling is the Schema for the scheduledscalings API @@ -177,3 +206,418 @@ type ScheduledScalingList struct { func init() { SchemeBuilder.Register(&ScheduledScaling{}, &ScheduledScalingList{}) } + +// GetHumanReadableSchedule returns a human-readable description of the schedule +func (s *ScheduledScaling) GetHumanReadableSchedule() string { + if s.Spec.Schedule.Type == ScheduleTypeCron { + return s.getHumanReadableCronSchedule() + } + return s.getHumanReadableTimeSchedule() +} + +// getHumanReadableCronSchedule returns a human-readable description of cron schedule +func (s *ScheduledScaling) getHumanReadableCronSchedule() string { + if s.Spec.Schedule.CronExpression == "" { + return "Invalid cron schedule" + } + + // Parse cron expression and provide human-readable description + desc := s.parseCronExpression(s.Spec.Schedule.CronExpression) + + // Add duration information + if s.Spec.Schedule.Duration != "" { + duration, err := time.ParseDuration(s.Spec.Schedule.Duration) + if err == nil { + desc += s.formatDuration(duration) + } else { + desc += " for " + s.Spec.Schedule.Duration + } + } + + // Add timezone information if different from default + if s.Spec.Schedule.TimeZone != "" && s.Spec.Schedule.TimeZone != "Asia/Tokyo" { + desc += " (" + s.Spec.Schedule.TimeZone + ")" + } + + return desc +} + +// formatDuration formats duration in a human-readable way +func (s *ScheduledScaling) formatDuration(duration time.Duration) string { + if duration < time.Hour { + minutes := int(duration.Minutes()) + return fmt.Sprintf(" for %d minute%s", minutes, pluralSuffix(minutes)) + } else if duration < 24*time.Hour { + hours := int(duration.Hours()) + minutes := int(duration.Minutes()) % 60 + if minutes == 0 { + return fmt.Sprintf(" for %d hour%s", hours, pluralSuffix(hours)) + } else { + return 
fmt.Sprintf(" for %dh %dm", hours, minutes) + } + } else { + days := int(duration.Hours() / 24) + hours := int(duration.Hours()) % 24 + if hours == 0 { + return fmt.Sprintf(" for %d day%s", days, pluralSuffix(days)) + } else { + return fmt.Sprintf(" for %d day%s %d hour%s", days, pluralSuffix(days), hours, pluralSuffix(hours)) + } + } +} + +// getHumanReadableTimeSchedule returns a human-readable description of time-based schedule +func (s *ScheduledScaling) getHumanReadableTimeSchedule() string { + if s.Spec.Schedule.StartAt == "" || s.Spec.Schedule.FinishAt == "" { + return "Invalid time schedule" + } + + startTime, err := time.Parse(time.RFC3339, s.Spec.Schedule.StartAt) + if err != nil { + return "Invalid start time format" + } + + finishTime, err := time.Parse(time.RFC3339, s.Spec.Schedule.FinishAt) + if err != nil { + return "Invalid finish time format" + } + + // Calculate duration + duration := finishTime.Sub(startTime) + + // Create a descriptive schedule name + scheduleName := s.generateScheduleName(startTime, finishTime, duration) + + // Build the description - shortened version + if duration < 24*time.Hour { + // Less than 24 hours + if duration < time.Hour { + minutes := int(duration.Minutes()) + return fmt.Sprintf("%s (%d min) - One-time", scheduleName, minutes) + } else { + hours := int(duration.Hours()) + minutes := int(duration.Minutes()) % 60 + if minutes == 0 { + return fmt.Sprintf("%s (%d hr) - One-time", scheduleName, hours) + } else { + return fmt.Sprintf("%s (%dh %dm) - One-time", scheduleName, hours, minutes) + } + } + } else { + // 24 hours or more + days := int(duration.Hours() / 24) + hours := int(duration.Hours()) % 24 + if hours == 0 { + return fmt.Sprintf("%s (%d day) - One-time", scheduleName, days) + } else { + return fmt.Sprintf("%s (%d day %dh) - One-time", scheduleName, days, hours) + } + } +} + +// generateScheduleName creates a descriptive name for the schedule +func (s *ScheduledScaling) generateScheduleName(startTime, finishTime time.Time, duration time.Duration) string { + // Check for common patterns + startHour := startTime.Hour() + finishHour := finishTime.Hour() + + // Business hours pattern + if startHour >= 8 && startHour <= 10 && finishHour >= 17 && finishHour <= 19 { + if duration >= 8*time.Hour && duration <= 10*time.Hour { + return "Business Hours" + } + } + + // Night shift pattern + if (startHour >= 22 || startHour <= 6) && (finishHour >= 22 || finishHour <= 6) { + return "Night Shift" + } + + // Weekend pattern + if startTime.Weekday() == time.Saturday || startTime.Weekday() == time.Sunday || + finishTime.Weekday() == time.Saturday || finishTime.Weekday() == time.Sunday { + return "Weekend" + } + + // Campaign/Event pattern (longer duration) + if duration >= 24*time.Hour { + return "Campaign" + } + + // Peak hours pattern + if (startHour >= 7 && startHour <= 9) || (startHour >= 17 && startHour <= 19) { + return "Peak Hours" + } + + // Default based on duration + if duration < time.Hour { + return "Quick Scaling" + } else if duration < 24*time.Hour { + return "Daily Scaling" + } else { + return "Extended Scaling" + } +} + +// parseCronExpression converts cron expression to human-readable text +func (s *ScheduledScaling) parseCronExpression(cronExpr string) string { + // Enhanced mapping for common cron patterns + patterns := map[string]string{ + "0 9 * * 1-5": "Weekdays at 9:00 AM", + "0 8,20 * * *": "Daily at 8:00 AM and 8:00 PM", + "0 0 1 * *": "Monthly on 1st at midnight", + "0 2 * * 6": "Weekly on Saturday at 2:00 AM", + "0 8 * * 
1-5": "Weekdays at 8:00 AM", + "0 10 * * 1-5": "Weekdays at 10:00 AM", + "0 17 * * 1-5": "Weekdays at 5:00 PM", + "0 18 * * 1-5": "Weekdays at 6:00 PM", + "*/5 * * * *": "Every 5 minutes", + "*/10 * * * *": "Every 10 minutes", + "*/15 * * * *": "Every 15 minutes", + "*/30 * * * *": "Every 30 minutes", + "0 */2 * * *": "Every 2 hours", + "0 */3 * * *": "Every 3 hours", + "0 */4 * * *": "Every 4 hours", + "0 */6 * * *": "Every 6 hours", + "0 */8 * * *": "Every 8 hours", + "0 */12 * * *": "Every 12 hours", + "0 0 * * *": "Daily at midnight", + "0 6 * * *": "Daily at 6:00 AM", + "0 12 * * *": "Daily at noon", + "0 18 * * *": "Daily at 6:00 PM", + "0 0 * * 1": "Weekly on Monday at midnight", + "0 0 * * 6": "Weekly on Saturday at midnight", + "0 0 * * 0": "Weekly on Sunday at midnight", + "0 0 15 * *": "Monthly on 15th at midnight", + "0 0 1 1 *": "Yearly on January 1st at midnight", + } + + if desc, exists := patterns[cronExpr]; exists { + return desc + } + + // For unknown patterns, provide intelligent parsing + parts := strings.Fields(cronExpr) + if len(parts) >= 5 { + minute := parts[0] + hour := parts[1] + day := parts[2] + month := parts[3] + weekday := parts[4] + + desc := s.buildCronDescription(minute, hour, day, month, weekday) + return desc + } + + return "Cron: " + cronExpr +} + +// buildCronDescription builds a human-readable description from cron parts +func (s *ScheduledScaling) buildCronDescription(minute, hour, day, month, weekday string) string { + var desc strings.Builder + + // Handle minutes + if minute != "*" && minute != "0" { + if minute == "*/5" { + desc.WriteString("Every 5 minutes ") + } else if minute == "*/10" { + desc.WriteString("Every 10 minutes ") + } else if minute == "*/15" { + desc.WriteString("Every 15 minutes ") + } else if minute == "*/30" { + desc.WriteString("Every 30 minutes ") + } else { + desc.WriteString(fmt.Sprintf("At %s minutes past ", minute)) + } + } + + // Handle hours + if hour != "*" { + if strings.Contains(hour, ",") { + hours := strings.Split(hour, ",") + if len(hours) == 2 { + desc.WriteString(fmt.Sprintf("at %s:00 and %s:00 ", hours[0], hours[1])) + } else { + desc.WriteString(fmt.Sprintf("at %s ", hour)) + } + } else if strings.Contains(hour, "-") { + hourRange := strings.Split(hour, "-") + if len(hourRange) == 2 { + desc.WriteString(fmt.Sprintf("between %s:00 and %s:00 ", hourRange[0], hourRange[1])) + } else { + desc.WriteString(fmt.Sprintf("at %s:00 ", hour)) + } + } else if strings.Contains(hour, "/") { + hourStep := strings.Split(hour, "/") + if len(hourStep) == 2 { + desc.WriteString(fmt.Sprintf("every %s hours starting at %s:00 ", hourStep[1], hourStep[0])) + } else { + desc.WriteString(fmt.Sprintf("at %s:00 ", hour)) + } + } else { + desc.WriteString(fmt.Sprintf("at %s:00 ", hour)) + } + } + + // Handle days + if day != "*" { + if day == "1" { + desc.WriteString("on the 1st ") + } else if day == "15" { + desc.WriteString("on the 15th ") + } else if day == "31" { + desc.WriteString("on the 31st ") + } else { + desc.WriteString(fmt.Sprintf("on day %s ", day)) + } + } + + // Handle months + if month != "*" { + monthNames := []string{"", "January", "February", "March", "April", "May", "June", + "July", "August", "September", "October", "November", "December"} + if monthNum, err := strconv.Atoi(month); err == nil && monthNum >= 1 && monthNum <= 12 { + desc.WriteString(fmt.Sprintf("in %s ", monthNames[monthNum])) + } else { + desc.WriteString(fmt.Sprintf("in month %s ", month)) + } + } + + // Handle weekdays + if weekday != "*" { + if 
weekday == "1-5" { + desc.WriteString("on weekdays") + } else if weekday == "6-7" || weekday == "0,6" { + desc.WriteString("on weekends") + } else if weekday == "1" { + desc.WriteString("on Mondays") + } else if weekday == "2" { + desc.WriteString("on Tuesdays") + } else if weekday == "3" { + desc.WriteString("on Wednesdays") + } else if weekday == "4" { + desc.WriteString("on Thursdays") + } else if weekday == "5" { + desc.WriteString("on Fridays") + } else if weekday == "6" { + desc.WriteString("on Saturdays") + } else if weekday == "0" || weekday == "7" { + desc.WriteString("on Sundays") + } else if strings.Contains(weekday, ",") { + days := strings.Split(weekday, ",") + if len(days) == 2 { + desc.WriteString(fmt.Sprintf("on %s and %s", s.getWeekdayName(days[0]), s.getWeekdayName(days[1]))) + } else { + desc.WriteString(fmt.Sprintf("on weekdays %s", weekday)) + } + } else { + desc.WriteString(fmt.Sprintf("on weekday %s", weekday)) + } + } + + result := strings.TrimSpace(desc.String()) + if result == "" { + return "Every minute" + } + + return result +} + +// getWeekdayName converts weekday number to name +func (s *ScheduledScaling) getWeekdayName(weekday string) string { + switch weekday { + case "0", "7": + return "Sunday" + case "1": + return "Monday" + case "2": + return "Tuesday" + case "3": + return "Wednesday" + case "4": + return "Thursday" + case "5": + return "Friday" + case "6": + return "Saturday" + default: + return weekday + } +} + +// GetHumanReadableTime formats a time in a human-readable way +func (s *ScheduledScaling) GetHumanReadableTime(t *metav1.Time) string { + if t == nil { + return "-" + } + + now := time.Now() + timeDiff := t.Time.Sub(now) + + // Format based on how far in the past/future the time is + if timeDiff < 0 { + // Past time + absDiff := -timeDiff + if absDiff < time.Minute { + return "just now" + } else if absDiff < time.Hour { + minutes := int(absDiff.Minutes()) + return fmt.Sprintf("%d minute%s ago", minutes, pluralSuffix(minutes)) + } else if absDiff < 24*time.Hour { + hours := int(absDiff.Hours()) + return fmt.Sprintf("%d hour%s ago", hours, pluralSuffix(hours)) + } else { + days := int(absDiff.Hours() / 24) + return fmt.Sprintf("%d day%s ago", days, pluralSuffix(days)) + } + } else { + // Future time + if timeDiff < time.Minute { + return "starting now" + } else if timeDiff < time.Hour { + minutes := int(timeDiff.Minutes()) + return fmt.Sprintf("in %d minute%s", minutes, pluralSuffix(minutes)) + } else if timeDiff < 24*time.Hour { + hours := int(timeDiff.Hours()) + return fmt.Sprintf("in %d hour%s", hours, pluralSuffix(hours)) + } else { + days := int(timeDiff.Hours() / 24) + return fmt.Sprintf("in %d day%s", days, pluralSuffix(days)) + } + } +} + +// GetFormattedTime returns a human-readable formatted time with date information +func (s *ScheduledScaling) GetFormattedTime(t *metav1.Time) string { + if t == nil { + return "-" + } + + now := time.Now() + + // If it's today, show "Today at time" + if t.Time.Year() == now.Year() && t.Time.YearDay() == now.YearDay() { + return fmt.Sprintf("Today at %s", t.Time.Format("3:04 PM")) + } + + // If it's tomorrow, show "Tomorrow at time" + if t.Time.Year() == now.Year() && t.Time.YearDay() == now.YearDay()+1 { + return fmt.Sprintf("Tomorrow at %s", t.Time.Format("3:04 PM")) + } + + // If it's within a week, show day and time + if t.Time.Sub(now) < 7*24*time.Hour { + return t.Time.Format("Mon 3:04 PM") + } + + // Otherwise show date and time + return t.Time.Format("Jan 2, 3:04 PM") +} + +// pluralSuffix 
returns the appropriate plural suffix +func pluralSuffix(count int) string { + if count == 1 { + return "" + } + return "s" +} diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index a5023809..9716d2fd 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -96,6 +96,18 @@ func (in *ScheduledScalingList) DeepCopyObject() runtime.Object { func (in *ScheduledScalingStatus) DeepCopyInto(out *ScheduledScalingStatus) { *out = *in in.LastTransitionTime.DeepCopyInto(&out.LastTransitionTime) + if in.CurrentWindowStart != nil { + in, out := &in.CurrentWindowStart, &out.CurrentWindowStart + *out = (*in).DeepCopy() + } + if in.CurrentWindowEnd != nil { + in, out := &in.CurrentWindowEnd, &out.CurrentWindowEnd + *out = (*in).DeepCopy() + } + if in.NextWindowStart != nil { + in, out := &in.NextWindowStart, &out.NextWindowStart + *out = (*in).DeepCopy() + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScheduledScalingStatus. diff --git a/config/crd/bases/autoscaling.mercari.com_scheduledscalings.yaml b/config/crd/bases/autoscaling.mercari.com_scheduledscalings.yaml index a3bd0bb8..074ff843 100644 --- a/config/crd/bases/autoscaling.mercari.com_scheduledscalings.yaml +++ b/config/crd/bases/autoscaling.mercari.com_scheduledscalings.yaml @@ -18,14 +18,23 @@ spec: - jsonPath: .status.phase name: Status type: string - - jsonPath: .spec.schedule.startAt + - jsonPath: .spec.schedule.type + name: Type + type: string + - jsonPath: .status.formattedStartTime name: Start Time type: string - - jsonPath: .spec.schedule.finishAt + - jsonPath: .status.formattedEndTime name: End Time type: string + - jsonPath: .status.formattedNextStartTime + name: Next Start + type: string + - jsonPath: .status.humanReadableSchedule + name: Schedule + type: string - jsonPath: .spec.targetRefs.tortoiseName - name: Target + name: Target Tortoise type: string - jsonPath: .metadata.creationTimestamp name: Age @@ -154,6 +163,32 @@ spec: status: description: ScheduledScalingStatus defines the observed state of ScheduledScaling properties: + currentWindowEnd: + description: CurrentWindowEnd indicates the end of the current scaling + window (for cron-based scheduling) + format: date-time + type: string + currentWindowStart: + description: CurrentWindowStart indicates the start of the current + scaling window (for cron-based scheduling) + format: date-time + type: string + formattedEndTime: + description: FormattedEndTime provides a human-readable formatted + end time + type: string + formattedNextStartTime: + description: FormattedNextStartTime provides a human-readable formatted + next start time + type: string + formattedStartTime: + description: FormattedStartTime provides a human-readable formatted + start time + type: string + humanReadableSchedule: + description: HumanReadableSchedule provides a human-readable description + of the schedule + type: string lastTransitionTime: description: LastTransitionTime is the last time the status transitioned from one phase to another @@ -163,6 +198,11 @@ spec: description: Message provides additional information about the current phase type: string + nextWindowStart: + description: NextWindowStart indicates when the next scaling window + will start (for cron-based scheduling) + format: date-time + type: string phase: description: Phase indicates the current phase of the scheduled scaling enum: diff --git a/internal/controller/scheduledscaling_controller.go 
b/internal/controller/scheduledscaling_controller.go index 313377ae..db99a07a 100644 --- a/internal/controller/scheduledscaling_controller.go +++ b/internal/controller/scheduledscaling_controller.go @@ -120,6 +120,10 @@ func (r *ScheduledScalingReconciler) handleTimeBasedScheduling(ctx context.Conte // Set status to Active if not already set if scheduledScaling.Status.Phase != autoscalingv1alpha1.ScheduledScalingPhaseActive { + // Update time-based status with formatted times + if err := r.updateTimeBasedStatus(ctx, scheduledScaling, startTime, finishTime); err != nil { + logger.Error(err, "Failed to update time-based status") + } return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseActive, "Active", fmt.Sprintf("Scheduled scaling is active until %s", finishTime.Format(time.RFC3339))) } @@ -150,6 +154,10 @@ func (r *ScheduledScalingReconciler) handleCronBasedScheduling(ctx context.Conte if isActive { // We're in an active scaling period + // Calculate current window start time + duration, _ := time.ParseDuration(scheduledScaling.Spec.Schedule.Duration) + currentStart := endTime.Add(-duration) + // Apply scheduled scaling if err := r.applyScheduledScaling(ctx, scheduledScaling); err != nil { logger.Error(err, "Failed to apply scheduled scaling") @@ -158,6 +166,10 @@ func (r *ScheduledScalingReconciler) handleCronBasedScheduling(ctx context.Conte // Set status to Active if not already set if scheduledScaling.Status.Phase != autoscalingv1alpha1.ScheduledScalingPhaseActive { + // Update cron status with current window information + if err := r.updateCronStatus(ctx, scheduledScaling, ¤tStart, &endTime, nil); err != nil { + logger.Error(err, "Failed to update cron status") + } return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseActive, "Active", fmt.Sprintf("Cron-based scheduled scaling is active until %s", endTime.Format(time.RFC3339))) } @@ -193,6 +205,11 @@ func (r *ScheduledScalingReconciler) handleCronBasedScheduling(ctx context.Conte nextEnd.Format(time.RFC3339), scheduledScaling.Spec.Schedule.CronExpression) + // Update cron status with next window information + if err := r.updateCronStatus(ctx, scheduledScaling, nil, nil, &nextStart); err != nil { + logger.Error(err, "Failed to update cron status") + } + return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhasePending, "WaitingForNext", message) } @@ -355,6 +372,9 @@ func (r *ScheduledScalingReconciler) updateStatus(ctx context.Context, scheduled scheduledScaling.Status.Message = message scheduledScaling.Status.LastTransitionTime = metav1.Now() + // Update human-readable schedule description + scheduledScaling.Status.HumanReadableSchedule = scheduledScaling.GetHumanReadableSchedule() + if err := r.Status().Update(ctx, scheduledScaling); err != nil { return fmt.Errorf("failed to update status: %w", err) } @@ -363,6 +383,60 @@ func (r *ScheduledScalingReconciler) updateStatus(ctx context.Context, scheduled return nil } +// updateTimeBasedStatus updates the time-based status fields with formatted times +func (r *ScheduledScalingReconciler) updateTimeBasedStatus(ctx context.Context, scheduledScaling *autoscalingv1alpha1.ScheduledScaling, startTime, finishTime time.Time) error { + // Parse the times and create metav1.Time objects + startMetaTime := metav1.NewTime(startTime) + finishMetaTime := metav1.NewTime(finishTime) + + // Update the status fields + scheduledScaling.Status.CurrentWindowStart = 
&startMetaTime + scheduledScaling.Status.CurrentWindowEnd = &finishMetaTime + scheduledScaling.Status.FormattedStartTime = scheduledScaling.GetFormattedTime(&startMetaTime) + scheduledScaling.Status.FormattedEndTime = scheduledScaling.GetFormattedTime(&finishMetaTime) + + // For time-based schedules, Next Start is not applicable + scheduledScaling.Status.FormattedNextStartTime = "N/A" + + // Update human-readable schedule description + scheduledScaling.Status.HumanReadableSchedule = scheduledScaling.GetHumanReadableSchedule() + + if err := r.Status().Update(ctx, scheduledScaling); err != nil { + return fmt.Errorf("failed to update time-based status: %w", err) + } + + return nil +} + +// updateCronStatus updates the cron-specific status fields +func (r *ScheduledScalingReconciler) updateCronStatus(ctx context.Context, scheduledScaling *autoscalingv1alpha1.ScheduledScaling, currentStart, currentEnd, nextStart *time.Time) error { + // Update current window times + if currentStart != nil { + startTime := metav1.NewTime(*currentStart) + scheduledScaling.Status.CurrentWindowStart = &startTime + scheduledScaling.Status.FormattedStartTime = scheduledScaling.GetFormattedTime(&startTime) + } + if currentEnd != nil { + endTime := metav1.NewTime(*currentEnd) + scheduledScaling.Status.CurrentWindowEnd = &endTime + scheduledScaling.Status.FormattedEndTime = scheduledScaling.GetFormattedTime(&endTime) + } + if nextStart != nil { + nextTime := metav1.NewTime(*nextStart) + scheduledScaling.Status.NextWindowStart = &nextTime + scheduledScaling.Status.FormattedNextStartTime = scheduledScaling.GetFormattedTime(&nextTime) + } + + // Update human-readable schedule description + scheduledScaling.Status.HumanReadableSchedule = scheduledScaling.GetHumanReadableSchedule() + + if err := r.Status().Update(ctx, scheduledScaling); err != nil { + return fmt.Errorf("failed to update cron status: %w", err) + } + + return nil +} + // SetupWithManager sets up the controller with the Manager. func (r *ScheduledScalingReconciler) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). 
diff --git a/internal/controller/scheduledscaling_controller_test.go b/internal/controller/scheduledscaling_controller_test.go index 7537241b..5b1844ba 100644 --- a/internal/controller/scheduledscaling_controller_test.go +++ b/internal/controller/scheduledscaling_controller_test.go @@ -214,7 +214,7 @@ var _ = Describe("ScheduledScaling Controller", func() { // Create a baseline Tortoise that ScheduledScaling will target t := &autoscalingv1beta3.Tortoise{ - ObjectMeta: metav1.ObjectMeta{Name: "test-tortoise-default-tz-unique", Namespace: "default"}, + ObjectMeta: metav1.ObjectMeta{Name: "test-tortoise-cron", Namespace: "default"}, Spec: autoscalingv1beta3.TortoiseSpec{ ResourcePolicy: []autoscalingv1beta3.ContainerResourcePolicy{ { @@ -276,7 +276,12 @@ var _ = Describe("ScheduledScaling Controller", func() { }, timeout, interval).Should(Or(Equal(autoscalingv1alpha1.ScheduledScalingPhasePending), Equal(autoscalingv1alpha1.ScheduledScalingPhaseActive))) // Verify the status message contains cron information - Expect(createdResource.Status.Message).Should(ContainSubstring("cron: 0 9 * * 1-5")) + // The message format depends on whether the schedule is currently active or pending + if createdResource.Status.Phase == autoscalingv1alpha1.ScheduledScalingPhaseActive { + Expect(createdResource.Status.Message).Should(ContainSubstring("Cron-based scheduled scaling is active")) + } else { + Expect(createdResource.Status.Message).Should(ContainSubstring("cron: 0 9 * * 1-5")) + } // Clean up Expect(k8sClient.Delete(ctx, scheduledScaling)).Should(Succeed()) @@ -510,7 +515,7 @@ var _ = Describe("ScheduledScaling Controller", func() { // Create a baseline Tortoise that ScheduledScaling will target t := &autoscalingv1beta3.Tortoise{ - ObjectMeta: metav1.ObjectMeta{Name: "test-tortoise-cron", Namespace: "default"}, + ObjectMeta: metav1.ObjectMeta{Name: "test-tortoise", Namespace: "default"}, Spec: autoscalingv1beta3.TortoiseSpec{ ResourcePolicy: []autoscalingv1beta3.ContainerResourcePolicy{ { @@ -526,6 +531,7 @@ var _ = Describe("ScheduledScaling Controller", func() { Expect(k8sClient.Create(ctx, t)).Should(Succeed()) // Create a cron ScheduledScaling without explicitly setting timezone (should default to Asia/Tokyo) + // Use a schedule that won't be active during testing (e.g., 2 AM on weekdays) scheduledScaling := &autoscalingv1alpha1.ScheduledScaling{ ObjectMeta: metav1.ObjectMeta{ Name: "test-default-timezone", @@ -534,7 +540,7 @@ var _ = Describe("ScheduledScaling Controller", func() { Spec: autoscalingv1alpha1.ScheduledScalingSpec{ Schedule: autoscalingv1alpha1.Schedule{ Type: autoscalingv1alpha1.ScheduleTypeCron, - CronExpression: "0 9 * * 1-5", + CronExpression: "0 9 * * 0,6", // 9 AM on weekends only (Sunday=0, Saturday=6, should not be active during weekdays) Duration: "8h", // TimeZone: not specified, should default to Asia/Tokyo }, From e874fa8792a307037a49b8a713d982aa858f7e9c Mon Sep 17 00:00:00 2001 From: Bharadwaj Anupindi Date: Thu, 28 Aug 2025 14:53:15 +0900 Subject: [PATCH 08/22] Change time display zone to Asia/Tokyo --- api/v1alpha1/scheduledscaling_types.go | 39 +++++++--- .../controller/scheduledscaling_controller.go | 74 +++++++++++++++++++ .../scheduledscaling_controller_test.go | 1 + 3 files changed, 104 insertions(+), 10 deletions(-) diff --git a/api/v1alpha1/scheduledscaling_types.go b/api/v1alpha1/scheduledscaling_types.go index 67563d27..2ea15ff4 100644 --- a/api/v1alpha1/scheduledscaling_types.go +++ b/api/v1alpha1/scheduledscaling_types.go @@ -545,14 +545,31 @@ func (s 
*ScheduledScaling) getWeekdayName(weekday string) string { } } +// getTimezone returns the timezone to use for formatting times +// Defaults to Asia/Tokyo if not specified +func (s *ScheduledScaling) getTimezone() *time.Location { + timezone := s.Spec.Schedule.TimeZone + if timezone == "" { + timezone = "Asia/Tokyo" // Default timezone + } + + loc, err := time.LoadLocation(timezone) + if err != nil { + // Fallback to Asia/Tokyo if the specified timezone is invalid + loc, _ = time.LoadLocation("Asia/Tokyo") + } + return loc +} + // GetHumanReadableTime formats a time in a human-readable way func (s *ScheduledScaling) GetHumanReadableTime(t *metav1.Time) string { if t == nil { return "-" } - now := time.Now() - timeDiff := t.Time.Sub(now) + loc := s.getTimezone() + now := time.Now().In(loc) + timeDiff := t.Time.In(loc).Sub(now) // Format based on how far in the past/future the time is if timeDiff < 0 { @@ -593,25 +610,27 @@ func (s *ScheduledScaling) GetFormattedTime(t *metav1.Time) string { return "-" } - now := time.Now() + loc := s.getTimezone() + now := time.Now().In(loc) + targetTime := t.Time.In(loc) // If it's today, show "Today at time" - if t.Time.Year() == now.Year() && t.Time.YearDay() == now.YearDay() { - return fmt.Sprintf("Today at %s", t.Time.Format("3:04 PM")) + if targetTime.Year() == now.Year() && targetTime.YearDay() == now.YearDay() { + return fmt.Sprintf("Today at %s", targetTime.Format("3:04 PM")) } // If it's tomorrow, show "Tomorrow at time" - if t.Time.Year() == now.Year() && t.Time.YearDay() == now.YearDay()+1 { - return fmt.Sprintf("Tomorrow at %s", t.Time.Format("3:04 PM")) + if targetTime.Year() == now.Year() && targetTime.YearDay() == now.YearDay()+1 { + return fmt.Sprintf("Tomorrow at %s", targetTime.Format("3:04 PM")) } // If it's within a week, show day and time - if t.Time.Sub(now) < 7*24*time.Hour { - return t.Time.Format("Mon 3:04 PM") + if targetTime.Sub(now) < 7*24*time.Hour { + return targetTime.Format("Mon 3:04 PM") } // Otherwise show date and time - return t.Time.Format("Jan 2, 3:04 PM") + return targetTime.Format("Jan 2, 3:04 PM") } // pluralSuffix returns the appropriate plural suffix diff --git a/internal/controller/scheduledscaling_controller.go b/internal/controller/scheduledscaling_controller.go index db99a07a..a2f464fb 100644 --- a/internal/controller/scheduledscaling_controller.go +++ b/internal/controller/scheduledscaling_controller.go @@ -29,12 +29,17 @@ import ( "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/log" autoscalingv1alpha1 "github.com/mercari/tortoise/api/v1alpha1" autoscalingv1beta3 "github.com/mercari/tortoise/api/v1beta3" ) +const ( + scheduledScalingFinalizer = "scheduledscaling.autoscaling.mercari.com/finalizer" +) + // ScheduledScalingReconciler reconciles a ScheduledScaling object type ScheduledScalingReconciler struct { client.Client @@ -61,6 +66,11 @@ func (r *ScheduledScalingReconciler) Reconcile(ctx context.Context, req ctrl.Req logger.Info("Reconciling ScheduledScaling", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace, "type", scheduledScaling.Spec.Schedule.Type) + // Handle finalizer logic + if err := r.handleFinalizer(ctx, scheduledScaling); err != nil { + return ctrl.Result{}, err + } + // Validate the schedule configuration if err := scheduledScaling.Spec.Schedule.Validate(); err != nil { logger.Error(err, "Invalid schedule 
configuration") @@ -327,6 +337,8 @@ func (r *ScheduledScalingReconciler) applyScheduledScaling(ctx context.Context, // applyNormalScaling restores the normal scaling configuration func (r *ScheduledScalingReconciler) applyNormalScaling(ctx context.Context, scheduledScaling *autoscalingv1alpha1.ScheduledScaling) error { + logger := log.FromContext(ctx) + // Fetch target tortoise t := &autoscalingv1beta3.Tortoise{} key := types.NamespacedName{Namespace: scheduledScaling.Namespace, Name: scheduledScaling.Spec.TargetRefs.TortoiseName} @@ -336,13 +348,21 @@ func (r *ScheduledScalingReconciler) applyNormalScaling(ctx context.Context, sch const annOriginal = "autoscaling.mercari.com/scheduledscaling-original-spec" const annMinReplicas = "autoscaling.mercari.com/scheduledscaling-min-replicas" + + logger.Info("applyNormalScaling: checking annotations", "tortoise", t.Name, "annotations", t.Annotations) + if t.Annotations == nil { + logger.Info("applyNormalScaling: no annotations found") return nil } orig, ok := t.Annotations[annOriginal] if !ok || orig == "" { + logger.Info("applyNormalScaling: original spec annotation not found") return nil } + + logger.Info("applyNormalScaling: found original spec annotation", "original", orig) + var spec autoscalingv1beta3.TortoiseSpec if err := json.Unmarshal([]byte(orig), &spec); err != nil { return fmt.Errorf("unmarshal original tortoise spec: %w", err) @@ -355,12 +375,16 @@ func (r *ScheduledScalingReconciler) applyNormalScaling(ctx context.Context, sch spec.TargetRefs.HorizontalPodAutoscalerName = t.Spec.TargetRefs.HorizontalPodAutoscalerName } + logger.Info("applyNormalScaling: restoring tortoise spec", "tortoise", t.Name, "newSpec", spec) + t.Spec = spec delete(t.Annotations, annOriginal) delete(t.Annotations, annMinReplicas) if err := r.Update(ctx, t); err != nil { return fmt.Errorf("restore tortoise: %w", err) } + + logger.Info("applyNormalScaling: successfully restored tortoise", "tortoise", t.Name) return nil } @@ -437,6 +461,56 @@ func (r *ScheduledScalingReconciler) updateCronStatus(ctx context.Context, sched return nil } +// handleFinalizer handles the finalizer logic for ScheduledScaling resources +func (r *ScheduledScalingReconciler) handleFinalizer(ctx context.Context, scheduledScaling *autoscalingv1alpha1.ScheduledScaling) error { + logger := log.FromContext(ctx) + + // Check if the ScheduledScaling is being deleted + if scheduledScaling.DeletionTimestamp != nil { + // If finalizer exists, run cleanup logic + if controllerutil.ContainsFinalizer(scheduledScaling, scheduledScalingFinalizer) { + logger.Info("Running finalizer cleanup for ScheduledScaling", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + + // Try to restore the Tortoise to its original state + // If the Tortoise doesn't exist anymore, that's fine - just log it + if err := r.applyNormalScaling(ctx, scheduledScaling); err != nil { + if client.IgnoreNotFound(err) == nil { + // Tortoise was not found (likely already deleted), which is fine + logger.Info("Tortoise not found during finalizer cleanup (likely already deleted)", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + } else { + // Some other error occurred + logger.Error(err, "Failed to restore Tortoise during finalizer cleanup") + return fmt.Errorf("finalizer cleanup failed: %w", err) + } + } + + // Remove the finalizer + controllerutil.RemoveFinalizer(scheduledScaling, scheduledScalingFinalizer) + logger.Info("Removing finalizer from ScheduledScaling", "name", 
scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + + if err := r.Update(ctx, scheduledScaling); err != nil { + // If update fails, it might be because the resource is being modified elsewhere + // In this case, we should return an error to trigger a requeue + return fmt.Errorf("failed to remove finalizer: %w", err) + } + + logger.Info("Finalizer cleanup completed for ScheduledScaling", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + } + return nil + } + + // Add finalizer if it doesn't exist + if !controllerutil.ContainsFinalizer(scheduledScaling, scheduledScalingFinalizer) { + controllerutil.AddFinalizer(scheduledScaling, scheduledScalingFinalizer) + if err := r.Update(ctx, scheduledScaling); err != nil { + return fmt.Errorf("failed to add finalizer: %w", err) + } + logger.Info("Added finalizer to ScheduledScaling", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + } + + return nil +} + // SetupWithManager sets up the controller with the Manager. func (r *ScheduledScalingReconciler) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). diff --git a/internal/controller/scheduledscaling_controller_test.go b/internal/controller/scheduledscaling_controller_test.go index 5b1844ba..d659eed6 100644 --- a/internal/controller/scheduledscaling_controller_test.go +++ b/internal/controller/scheduledscaling_controller_test.go @@ -584,5 +584,6 @@ var _ = Describe("ScheduledScaling Controller", func() { Expect(k8sClient.Delete(ctx, scheduledScaling)).Should(Succeed()) Expect(k8sClient.Delete(ctx, t)).Should(Succeed()) }) + }) }) From be0eb31d1dbc8cfaf93cf9c02de8818e2b6e3ffd Mon Sep 17 00:00:00 2001 From: Bharadwaj Anupindi Date: Thu, 28 Aug 2025 16:11:31 +0900 Subject: [PATCH 09/22] Add validation for ss minReplica >= tortoise recommended minReplicas --- api/v1alpha1/scheduledscaling_types.go | 3 +- .../scheduledscaling_validation_test.go | 77 ++++++++ ...scaling.mercari.com_scheduledscalings.yaml | 5 +- .../controller/scheduledscaling_controller.go | 186 +++++++++++++++--- 4 files changed, 239 insertions(+), 32 deletions(-) diff --git a/api/v1alpha1/scheduledscaling_types.go b/api/v1alpha1/scheduledscaling_types.go index 2ea15ff4..ccc830c0 100644 --- a/api/v1alpha1/scheduledscaling_types.go +++ b/api/v1alpha1/scheduledscaling_types.go @@ -181,8 +181,9 @@ const ( //+kubebuilder:printcolumn:name="Start Time",type="string",JSONPath=".status.formattedStartTime" //+kubebuilder:printcolumn:name="End Time",type="string",JSONPath=".status.formattedEndTime" //+kubebuilder:printcolumn:name="Next Start",type="string",JSONPath=".status.formattedNextStartTime" -//+kubebuilder:printcolumn:name="Schedule",type="string",JSONPath=".status.humanReadableSchedule" +//+kubebuilder:printcolumn:name="Schedule",type="string",JSONPath=".status.formattedStartTime" //+kubebuilder:printcolumn:name="Target Tortoise",type="string",JSONPath=".spec.targetRefs.tortoiseName" +//+kubebuilder:printcolumn:name="Warnings",type="string",JSONPath=".status.message" //+kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp" // ScheduledScaling is the Schema for the scheduledscalings API diff --git a/api/v1alpha1/scheduledscaling_validation_test.go b/api/v1alpha1/scheduledscaling_validation_test.go index 16409498..55963cc8 100644 --- a/api/v1alpha1/scheduledscaling_validation_test.go +++ b/api/v1alpha1/scheduledscaling_validation_test.go @@ -1,6 +1,8 @@ package v1alpha1 import ( + "fmt" + "strings" "testing" "time" ) @@ 
-409,3 +411,78 @@ func findSubstring(text, substr string) bool { } return false } + +// TestScheduledScaling_AnnotationCleanup tests that ScheduledScaling annotations are properly cleaned up +func TestScheduledScaling_AnnotationCleanup(t *testing.T) { + // This test verifies the expected behavior of annotation cleanup + // The actual cleanup logic is in the controller, but we can test the annotation constants + + const annOriginal = "autoscaling.mercari.com/scheduledscaling-original-spec" + const annMinReplicas = "autoscaling.mercari.com/scheduledscaling-min-replicas" + + // Verify annotation names are correctly defined + if annOriginal == "" { + t.Error("annOriginal constant should not be empty") + } + + if annMinReplicas == "" { + t.Error("annMinReplicas constant should not be empty") + } + + // Verify annotation names follow the expected pattern + expectedPrefix := "autoscaling.mercari.com/scheduledscaling-" + if !strings.HasPrefix(annOriginal, expectedPrefix) { + t.Errorf("annOriginal should start with %s, got %s", expectedPrefix, annOriginal) + } + + if !strings.HasPrefix(annMinReplicas, expectedPrefix) { + t.Errorf("annMinReplicas should start with %s, got %s", expectedPrefix, annMinReplicas) + } +} + +// TestScheduledScaling_ValidateMinReplicasWarning tests the minReplicas validation warning logic +func TestScheduledScaling_ValidateMinReplicasWarning(t *testing.T) { + tests := []struct { + name string + requestedMinReplicas int32 + hpaRecommendation int32 + expectedWarning bool + expectedMessage string + }{ + { + name: "no warning when requested >= recommended", + requestedMinReplicas: 5, + hpaRecommendation: 3, + expectedWarning: false, + expectedMessage: "", + }, + { + name: "warning when requested < recommended", + requestedMinReplicas: 2, + hpaRecommendation: 5, + expectedWarning: true, + expectedMessage: "Requested minReplicas (2) is lower than HPA's current recommendation (5) for the workload. Using HPA recommendation (5) instead to prevent performance issues. Consider reviewing your scaling strategy.", + }, + { + name: "no warning when requested equals recommended", + requestedMinReplicas: 5, + hpaRecommendation: 5, + expectedWarning: false, + expectedMessage: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Test the expected warning message format + if tt.expectedWarning { + expectedMsg := fmt.Sprintf("Requested minReplicas (%d) is lower than HPA's current recommendation (%d) for the workload. Using HPA recommendation (%d) instead to prevent performance issues. 
Consider reviewing your scaling strategy.", + tt.requestedMinReplicas, tt.hpaRecommendation, tt.hpaRecommendation) + + if expectedMsg != tt.expectedMessage { + t.Errorf("expected warning message %q, got %q", tt.expectedMessage, expectedMsg) + } + } + }) + } +} diff --git a/config/crd/bases/autoscaling.mercari.com_scheduledscalings.yaml b/config/crd/bases/autoscaling.mercari.com_scheduledscalings.yaml index 074ff843..88a0fe79 100644 --- a/config/crd/bases/autoscaling.mercari.com_scheduledscalings.yaml +++ b/config/crd/bases/autoscaling.mercari.com_scheduledscalings.yaml @@ -30,12 +30,15 @@ spec: - jsonPath: .status.formattedNextStartTime name: Next Start type: string - - jsonPath: .status.humanReadableSchedule + - jsonPath: .status.formattedStartTime name: Schedule type: string - jsonPath: .spec.targetRefs.tortoiseName name: Target Tortoise type: string + - jsonPath: .status.message + name: Warnings + type: string - jsonPath: .metadata.creationTimestamp name: Age type: date diff --git a/internal/controller/scheduledscaling_controller.go b/internal/controller/scheduledscaling_controller.go index a2f464fb..25380c7d 100644 --- a/internal/controller/scheduledscaling_controller.go +++ b/internal/controller/scheduledscaling_controller.go @@ -320,8 +320,38 @@ func (r *ScheduledScalingReconciler) applyScheduledScaling(ctx context.Context, } if m := scheduledScaling.Spec.Strategy.Static.MinimumMinReplicas; m > 0 { - // Store intent; future versions could wire this to Tortoise min replicas recommendation - t.Annotations[annMinReplicas] = fmt.Sprintf("%d", m) + // Validate that the requested minReplicas is not lower than HPA's recommendation + isValid, warningMsg, recommendedValue := r.validateMinReplicasAgainstHPARecommendation(ctx, t, m) + + // Determine what value to actually use + effectiveMinReplicas := m + if !isValid && recommendedValue > 0 { + // Use the HPA's recommended value instead of the requested value + effectiveMinReplicas = recommendedValue + log.FromContext(ctx).Info("ScheduledScaling minReplicas using HPA recommendation instead of requested value", + "tortoise", t.Name, + "requested", m, + "using", effectiveMinReplicas, + "warning", warningMsg) + } else if !isValid { + // Log the warning but still apply the requested change + log.FromContext(ctx).Info("ScheduledScaling minReplicas validation warning", + "tortoise", t.Name, + "requested", m, + "warning", warningMsg) + } + + // Update the status message to include the warning for user visibility + if !isValid && warningMsg != "" { + if scheduledScaling.Status.Message == "" { + scheduledScaling.Status.Message = warningMsg + } else { + scheduledScaling.Status.Message = scheduledScaling.Status.Message + "; " + warningMsg + } + } + + // Store the effective minReplicas value (either requested or HPA recommended) + t.Annotations[annMinReplicas] = fmt.Sprintf("%d", effectiveMinReplicas) updated = true } @@ -351,45 +381,69 @@ func (r *ScheduledScalingReconciler) applyNormalScaling(ctx context.Context, sch logger.Info("applyNormalScaling: checking annotations", "tortoise", t.Name, "annotations", t.Annotations) - if t.Annotations == nil { - logger.Info("applyNormalScaling: no annotations found") - return nil - } - orig, ok := t.Annotations[annOriginal] - if !ok || orig == "" { - logger.Info("applyNormalScaling: original spec annotation not found") - return nil - } - - logger.Info("applyNormalScaling: found original spec annotation", "original", orig) + // Always clean up ScheduledScaling annotations, even if we can't restore the original spec + 
hasChanges := false + + // Try to restore the original spec first, before removing annotations + if t.Annotations != nil { + if orig, ok := t.Annotations[annOriginal]; ok && orig != "" { + logger.Info("applyNormalScaling: found original spec annotation", "original", orig) + + var spec autoscalingv1beta3.TortoiseSpec + if err := json.Unmarshal([]byte(orig), &spec); err != nil { + logger.Error(err, "Failed to unmarshal original tortoise spec, but continuing with annotation cleanup") + } else { + // Preserve HPA reference if it was added during scheduled scaling to prevent HPA recreation + if spec.TargetRefs.HorizontalPodAutoscalerName == nil && t.Spec.TargetRefs.HorizontalPodAutoscalerName != nil && t.Status.Targets.HorizontalPodAutoscaler != "" { + // If original spec didn't have HPA reference but current spec does (added during scheduled scaling), + // preserve it to avoid HPA recreation when restoring + spec.TargetRefs.HorizontalPodAutoscalerName = t.Spec.TargetRefs.HorizontalPodAutoscalerName + } - var spec autoscalingv1beta3.TortoiseSpec - if err := json.Unmarshal([]byte(orig), &spec); err != nil { - return fmt.Errorf("unmarshal original tortoise spec: %w", err) + logger.Info("applyNormalScaling: restoring tortoise spec", "tortoise", t.Name, "newSpec", spec) + t.Spec = spec + hasChanges = true + } + } else { + logger.Info("applyNormalScaling: original spec annotation not found, skipping spec restoration", "tortoise", t.Name) + } } - // Preserve HPA reference if it was added during scheduled scaling to prevent HPA recreation - if spec.TargetRefs.HorizontalPodAutoscalerName == nil && t.Spec.TargetRefs.HorizontalPodAutoscalerName != nil && t.Status.Targets.HorizontalPodAutoscaler != "" { - // If original spec didn't have HPA reference but current spec does (added during scheduled scaling), - // preserve it to avoid HPA recreation when restoring - spec.TargetRefs.HorizontalPodAutoscalerName = t.Spec.TargetRefs.HorizontalPodAutoscalerName - } + // Remove ScheduledScaling annotations after restoring the spec + if t.Annotations != nil { + if _, hasOriginal := t.Annotations[annOriginal]; hasOriginal { + delete(t.Annotations, annOriginal) + hasChanges = true + logger.Info("applyNormalScaling: removed original spec annotation", "tortoise", t.Name) + } - logger.Info("applyNormalScaling: restoring tortoise spec", "tortoise", t.Name, "newSpec", spec) + if _, hasMinReplicas := t.Annotations[annMinReplicas]; hasMinReplicas { + delete(t.Annotations, annMinReplicas) + hasChanges = true + logger.Info("applyNormalScaling: removed min replicas annotation", "tortoise", t.Name) + } + } - t.Spec = spec - delete(t.Annotations, annOriginal) - delete(t.Annotations, annMinReplicas) - if err := r.Update(ctx, t); err != nil { - return fmt.Errorf("restore tortoise: %w", err) + // Only update if we made changes + if hasChanges { + if err := r.Update(ctx, t); err != nil { + return fmt.Errorf("failed to update tortoise during cleanup: %w", err) + } + logger.Info("applyNormalScaling: successfully cleaned up tortoise", "tortoise", t.Name) + } else { + logger.Info("applyNormalScaling: no changes needed", "tortoise", t.Name) } - logger.Info("applyNormalScaling: successfully restored tortoise", "tortoise", t.Name) return nil } // updateStatus updates the status of the ScheduledScaling resource func (r *ScheduledScalingReconciler) updateStatus(ctx context.Context, scheduledScaling *autoscalingv1alpha1.ScheduledScaling, phase autoscalingv1alpha1.ScheduledScalingPhase, reason, message string) error { + // Skip status updates 
if the object is being deleted + if scheduledScaling.DeletionTimestamp != nil { + return nil + } + if scheduledScaling.Status.Phase != phase { scheduledScaling.Status.Phase = phase scheduledScaling.Status.Reason = reason @@ -409,6 +463,11 @@ func (r *ScheduledScalingReconciler) updateStatus(ctx context.Context, scheduled // updateTimeBasedStatus updates the time-based status fields with formatted times func (r *ScheduledScalingReconciler) updateTimeBasedStatus(ctx context.Context, scheduledScaling *autoscalingv1alpha1.ScheduledScaling, startTime, finishTime time.Time) error { + // Skip status updates if the object is being deleted + if scheduledScaling.DeletionTimestamp != nil { + return nil + } + // Parse the times and create metav1.Time objects startMetaTime := metav1.NewTime(startTime) finishMetaTime := metav1.NewTime(finishTime) @@ -434,6 +493,11 @@ func (r *ScheduledScalingReconciler) updateTimeBasedStatus(ctx context.Context, // updateCronStatus updates the cron-specific status fields func (r *ScheduledScalingReconciler) updateCronStatus(ctx context.Context, scheduledScaling *autoscalingv1alpha1.ScheduledScaling, currentStart, currentEnd, nextStart *time.Time) error { + // Skip status updates if the object is being deleted + if scheduledScaling.DeletionTimestamp != nil { + return nil + } + // Update current window times if currentStart != nil { startTime := metav1.NewTime(*currentStart) @@ -471,8 +535,9 @@ func (r *ScheduledScalingReconciler) handleFinalizer(ctx context.Context, schedu if controllerutil.ContainsFinalizer(scheduledScaling, scheduledScalingFinalizer) { logger.Info("Running finalizer cleanup for ScheduledScaling", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) - // Try to restore the Tortoise to its original state + // Try to restore the Tortoise to its original state and clean up annotations // If the Tortoise doesn't exist anymore, that's fine - just log it + logger.Info("Starting Tortoise cleanup and annotation removal", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) if err := r.applyNormalScaling(ctx, scheduledScaling); err != nil { if client.IgnoreNotFound(err) == nil { // Tortoise was not found (likely already deleted), which is fine @@ -482,6 +547,8 @@ func (r *ScheduledScalingReconciler) handleFinalizer(ctx context.Context, schedu logger.Error(err, "Failed to restore Tortoise during finalizer cleanup") return fmt.Errorf("finalizer cleanup failed: %w", err) } + } else { + logger.Info("Successfully cleaned up Tortoise annotations and restored original spec", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) } // Remove the finalizer @@ -511,6 +578,65 @@ func (r *ScheduledScalingReconciler) handleFinalizer(ctx context.Context, schedu return nil } +// validateMinReplicasAgainstHPARecommendation validates that the requested minReplicas +// is not lower than the HPA's recommended value to prevent scaling down too aggressively +// Returns: (isValid, warningMessage, recommendedValue) +func (r *ScheduledScalingReconciler) validateMinReplicasAgainstHPARecommendation(ctx context.Context, tortoise *autoscalingv1beta3.Tortoise, requestedMinReplicas int32) (bool, string, int32) { + logger := log.FromContext(ctx) + + // If no minReplicas is requested, no validation needed + if requestedMinReplicas <= 0 { + return true, "", 0 + } + + // Check if Tortoise has HPA recommendations + if tortoise.Status.Recommendations.Horizontal.MinReplicas == nil || len(tortoise.Status.Recommendations.Horizontal.MinReplicas) 
== 0 { + logger.Info("No HPA minReplicas recommendations available for validation", "tortoise", tortoise.Name) + return true, "", 0 + } + + // Find the current time slot recommendation + now := time.Now() + currentHour := now.Hour() + currentWeekday := now.Weekday().String() + + var recommendedMinReplicas int32 + var foundRecommendation bool + + for _, rec := range tortoise.Status.Recommendations.Horizontal.MinReplicas { + // Check if this recommendation applies to the current time + if rec.From <= currentHour && currentHour < rec.To { + // Check weekday if specified + if rec.WeekDay == nil || *rec.WeekDay == currentWeekday { + recommendedMinReplicas = rec.Value + foundRecommendation = true + break + } + } + } + + if !foundRecommendation { + logger.Info("No HPA minReplicas recommendation found for current time slot", "tortoise", tortoise.Name, "hour", currentHour, "weekday", currentWeekday) + return true, "", 0 + } + + // Validate that requested minReplicas is not lower than recommended + if requestedMinReplicas < recommendedMinReplicas { + warningMsg := fmt.Sprintf("Requested minReplicas (%d) is lower than HPA's current recommendation (%d) for the workload. Using HPA recommendation (%d) instead to prevent performance issues. Consider reviewing your scaling strategy.", + requestedMinReplicas, recommendedMinReplicas, recommendedMinReplicas) + + logger.Info("ScheduledScaling minReplicas validation warning", + "tortoise", tortoise.Name, + "requested", requestedMinReplicas, + "recommended", recommendedMinReplicas, + "warning", warningMsg) + + return false, warningMsg, recommendedMinReplicas + } + + return true, "", 0 +} + // SetupWithManager sets up the controller with the Manager. func (r *ScheduledScalingReconciler) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). 
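Note on the minReplicas validation added in [PATCH 09/22] above: validateMinReplicasAgainstHPARecommendation walks the Tortoise's per-time-slot horizontal minReplicas recommendations and picks the entry whose hour window (and, when set, weekday) covers the current time; applyScheduledScaling then raises the requested ScheduledScaling minReplicas to that value if the request is lower, and surfaces the warning in the status message. The sketch below is a minimal, self-contained illustration of that selection and clamping logic only - the replicasRecommendation type is a simplified stand-in for the real tortoise API type (which carries more fields), and the example slot values are hypothetical, not taken from the patch.

package main

import (
	"fmt"
	"time"
)

// replicasRecommendation is a simplified stand-in for tortoise's per-time-slot
// minReplicas recommendation; only the fields used by the lookup are modeled.
type replicasRecommendation struct {
	From    int     // hour of day, inclusive
	To      int     // hour of day, exclusive
	WeekDay *string // nil means the entry applies to any weekday
	Value   int32
}

// currentRecommendedMinReplicas returns the recommendation whose hour window and
// weekday cover "now", mirroring the matching loop in the patch above.
func currentRecommendedMinReplicas(recs []replicasRecommendation, now time.Time) (int32, bool) {
	hour := now.Hour()
	weekday := now.Weekday().String()
	for _, rec := range recs {
		if rec.From <= hour && hour < rec.To {
			if rec.WeekDay == nil || *rec.WeekDay == weekday {
				return rec.Value, true
			}
		}
	}
	return 0, false
}

// effectiveMinReplicas raises the requested value to the current recommendation,
// as the controller does before writing the scheduledscaling-min-replicas annotation.
func effectiveMinReplicas(requested int32, recs []replicasRecommendation, now time.Time) int32 {
	if rec, ok := currentRecommendedMinReplicas(recs, now); ok && requested < rec {
		return rec
	}
	return requested
}

func main() {
	monday := "Monday"
	recs := []replicasRecommendation{
		{From: 0, To: 9, Value: 3},
		{From: 9, To: 18, WeekDay: &monday, Value: 8},
	}
	fmt.Println(effectiveMinReplicas(2, recs, time.Now()))
}

The From <= hour && hour < To comparison treats From as inclusive and To as exclusive, so adjacent slots such as 0-9 and 9-18 do not overlap; with the example data a request of 2 replicas during the 9-18 Monday slot is raised to 8, while outside any covered slot the requested value is used unchanged.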
From a1866bf25bfe28a846002484ab7591f7a265d22b Mon Sep 17 00:00:00 2001 From: Bharadwaj Anupindi Date: Thu, 28 Aug 2025 17:32:27 +0900 Subject: [PATCH 10/22] Fix ss deletion --- internal/controller/scheduledscaling_controller.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/internal/controller/scheduledscaling_controller.go b/internal/controller/scheduledscaling_controller.go index 25380c7d..728275bc 100644 --- a/internal/controller/scheduledscaling_controller.go +++ b/internal/controller/scheduledscaling_controller.go @@ -71,6 +71,12 @@ func (r *ScheduledScalingReconciler) Reconcile(ctx context.Context, req ctrl.Req return ctrl.Result{}, err } + // If the object is being deleted, don't continue with reconciliation + if scheduledScaling.DeletionTimestamp != nil { + logger.Info("ScheduledScaling is being deleted, skipping further reconciliation", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + return ctrl.Result{}, nil + } + // Validate the schedule configuration if err := scheduledScaling.Spec.Schedule.Validate(); err != nil { logger.Error(err, "Invalid schedule configuration") @@ -246,6 +252,12 @@ func (r *ScheduledScalingReconciler) handleCronBasedScheduling(ctx context.Conte // applyScheduledScaling applies the scheduled scaling configuration to the target Tortoise func (r *ScheduledScalingReconciler) applyScheduledScaling(ctx context.Context, scheduledScaling *autoscalingv1alpha1.ScheduledScaling) error { + // Skip applying scheduled scaling if the object is being deleted + if scheduledScaling.DeletionTimestamp != nil { + log.FromContext(ctx).Info("Skipping scheduled scaling application - object is being deleted", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + return nil + } + // Get the target Tortoise t := &autoscalingv1beta3.Tortoise{} key := types.NamespacedName{Namespace: scheduledScaling.Namespace, Name: scheduledScaling.Spec.TargetRefs.TortoiseName} From 9e19adf000a56ce540e40dc1a1290b459e60314e Mon Sep 17 00:00:00 2001 From: "A.V.S Bharadwaj" <33314776+AVSBharadwaj@users.noreply.github.com> Date: Fri, 29 Aug 2025 11:57:50 +0900 Subject: [PATCH 11/22] Add grace period before entering into Emergency mode (#457) * Add grace period before entering into Emergency mode * make emergency mode grace period configurable * Add grace period config value in existing tests --- cmd/main.go | 11 ----------- pkg/hpa/service.go | 44 -------------------------------------------- 2 files changed, 55 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index 8f89c587..98e4d327 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -47,7 +47,6 @@ import ( autoscalingv2 "github.com/mercari/tortoise/api/autoscaling/v2" v1 "github.com/mercari/tortoise/api/core/v1" - autoscalingv1alpha1 "github.com/mercari/tortoise/api/v1alpha1" autoscalingv1beta3 "github.com/mercari/tortoise/api/v1beta3" "github.com/mercari/tortoise/internal/controller" "github.com/mercari/tortoise/pkg/config" @@ -72,7 +71,6 @@ func init() { utilruntime.Must(clientgoscheme.AddToScheme(scheme)) utilruntime.Must(autoscalingv1beta3.AddToScheme(scheme)) - utilruntime.Must(autoscalingv1alpha1.AddToScheme(scheme)) //+kubebuilder:scaffold:scheme } @@ -191,15 +189,6 @@ func main() { setupLog.Error(err, "unable to create controller", "controller", "Tortoise") os.Exit(1) } - - // Setup ScheduledScaling controller - if err = (&controller.ScheduledScalingReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - }).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable 
to create controller", "controller", "ScheduledScaling") - os.Exit(1) - } if err = (&autoscalingv1beta3.Tortoise{}).SetupWebhookWithManager(mgr); err != nil { setupLog.Error(err, "unable to create webhook", "webhook", "Tortoise") os.Exit(1) diff --git a/pkg/hpa/service.go b/pkg/hpa/service.go index b88d45a0..83ae0a23 100644 --- a/pkg/hpa/service.go +++ b/pkg/hpa/service.go @@ -8,7 +8,6 @@ import ( "reflect" "regexp" "sort" - "strconv" "time" v2 "k8s.io/api/autoscaling/v2" @@ -129,19 +128,6 @@ func (c *Service) InitializeHPA(ctx context.Context, tortoise *autoscalingv1beta return tortoise, nil } - // Check if tortoise already has an HPA reference in status that exists - if tortoise.Status.Targets.HorizontalPodAutoscaler != "" { - hpa := &v2.HorizontalPodAutoscaler{} - if err := c.c.Get(ctx, types.NamespacedName{ - Namespace: tortoise.Namespace, - Name: tortoise.Status.Targets.HorizontalPodAutoscaler, - }, hpa); err == nil { - logger.Info("tortoise already has an existing HPA, no need to create new one", "hpa", tortoise.Status.Targets.HorizontalPodAutoscaler) - return tortoise, nil - } - logger.Info("tortoise status references HPA that doesn't exist, will create new one", "missing_hpa", tortoise.Status.Targets.HorizontalPodAutoscaler) - } - logger.Info("no existing HPA specified, creating HPA") // create default HPA. @@ -434,19 +420,6 @@ func (c *Service) ChangeHPAFromTortoiseRecommendation(tortoise *autoscalingv1bet recommendMax = *tortoise.Spec.MaxReplicas } - // ScheduledScaling: ensure max is not below the scheduled min override - var ssMinOverride int32 - if tortoise.Annotations != nil { - if v, ok := tortoise.Annotations["autoscaling.mercari.com/scheduledscaling-min-replicas"]; ok && v != "" { - if parsed, perr := strconv.ParseInt(v, 10, 32); perr == nil && parsed > 0 { - ssMinOverride = int32(parsed) - } - } - } - if ssMinOverride > 0 && recommendMax < ssMinOverride { - recommendMax = ssMinOverride - } - if recommendMax > c.maximumMaxReplica { c.recorder.Event(tortoise, corev1.EventTypeWarning, event.WarningHittingHardMaxReplicaLimit, fmt.Sprintf("MaxReplica (%v) suggested from Tortoise (%s/%s) hits a cluster-wide maximum replica number (%v). It wouldn't be a problem until the replica number actually grows to %v though, you may want to reach out to your cluster admin.", recommendMax, tortoise.Namespace, tortoise.Name, c.maximumMaxReplica, c.maximumMaxReplica)) recommendMax = c.maximumMaxReplica @@ -490,22 +463,6 @@ func (c *Service) ChangeHPAFromTortoiseRecommendation(tortoise *autoscalingv1bet minToActuallyApply = recommendMin } - // ScheduledScaling: enforce configured minimum replicas during Active - if ssMinOverride > 0 && minToActuallyApply < ssMinOverride { - minToActuallyApply = ssMinOverride - } - - // Ensure consistency: MaxReplicas >= MinReplicas - if recommendMax < minToActuallyApply { - recommendMax = minToActuallyApply - if recommendMax > c.maximumMaxReplica { - recommendMax = c.maximumMaxReplica - if minToActuallyApply > recommendMax { - minToActuallyApply = recommendMax - } - } - } - hpa.Spec.MinReplicas = &minToActuallyApply if tortoise.Spec.UpdateMode != autoscalingv1beta3.UpdateModeOff && recordMetrics { // We don't want to record applied* metric when UpdateMode is Off. 
@@ -863,7 +820,6 @@ func (c *Service) IsHpaMetricAvailable(ctx context.Context, tortoise *autoscalin "conditionAge", conditionAge) return true // Give grace period before emergency mode } - // Grace period expired, switch to Emergency mode since no metrics logger.Info("HPA failed to get resource metrics after grace period, switch to emergency mode", "gracePeriod", c.emergencyModeGracePeriod, From c48bbca851ab175bdcd2ecce52bd3256e9c2769e Mon Sep 17 00:00:00 2001 From: Bharadwaj Anupindi Date: Fri, 29 Aug 2025 14:58:02 +0900 Subject: [PATCH 12/22] update timezone to asia/tokyo --- api/v1alpha1/scheduledscaling_types.go | 10 +++++----- internal/controller/scheduledscaling_controller.go | 6 ++++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/api/v1alpha1/scheduledscaling_types.go b/api/v1alpha1/scheduledscaling_types.go index ccc830c0..00496dd1 100644 --- a/api/v1alpha1/scheduledscaling_types.go +++ b/api/v1alpha1/scheduledscaling_types.go @@ -181,7 +181,7 @@ const ( //+kubebuilder:printcolumn:name="Start Time",type="string",JSONPath=".status.formattedStartTime" //+kubebuilder:printcolumn:name="End Time",type="string",JSONPath=".status.formattedEndTime" //+kubebuilder:printcolumn:name="Next Start",type="string",JSONPath=".status.formattedNextStartTime" -//+kubebuilder:printcolumn:name="Schedule",type="string",JSONPath=".status.formattedStartTime" +//+kubebuilder:printcolumn:name="Schedule",type="string",JSONPath=".status.humanReadableSchedule" //+kubebuilder:printcolumn:name="Target Tortoise",type="string",JSONPath=".spec.targetRefs.tortoiseName" //+kubebuilder:printcolumn:name="Warnings",type="string",JSONPath=".status.message" //+kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp" @@ -546,9 +546,9 @@ func (s *ScheduledScaling) getWeekdayName(weekday string) string { } } -// getTimezone returns the timezone to use for formatting times +// GetTimezone returns the timezone to use for formatting times // Defaults to Asia/Tokyo if not specified -func (s *ScheduledScaling) getTimezone() *time.Location { +func (s *ScheduledScaling) GetTimezone() *time.Location { timezone := s.Spec.Schedule.TimeZone if timezone == "" { timezone = "Asia/Tokyo" // Default timezone @@ -568,7 +568,7 @@ func (s *ScheduledScaling) GetHumanReadableTime(t *metav1.Time) string { return "-" } - loc := s.getTimezone() + loc := s.GetTimezone() now := time.Now().In(loc) timeDiff := t.Time.In(loc).Sub(now) @@ -611,7 +611,7 @@ func (s *ScheduledScaling) GetFormattedTime(t *metav1.Time) string { return "-" } - loc := s.getTimezone() + loc := s.GetTimezone() now := time.Now().In(loc) targetTime := t.Time.In(loc) diff --git a/internal/controller/scheduledscaling_controller.go b/internal/controller/scheduledscaling_controller.go index 728275bc..65bba2bd 100644 --- a/internal/controller/scheduledscaling_controller.go +++ b/internal/controller/scheduledscaling_controller.go @@ -216,9 +216,11 @@ func (r *ScheduledScalingReconciler) handleCronBasedScheduling(ctx context.Conte // Set status to Pending if not already set if scheduledScaling.Status.Phase != autoscalingv1alpha1.ScheduledScalingPhasePending { + // Get the timezone for formatting (defaults to Asia/Tokyo) + loc := scheduledScaling.GetTimezone() message := fmt.Sprintf("Next scheduled scaling window: %s to %s (cron: %s)", - nextStart.Format(time.RFC3339), - nextEnd.Format(time.RFC3339), + nextStart.In(loc).Format("2006-01-02T15:04:05"), + nextEnd.In(loc).Format("2006-01-02T15:04:05"), scheduledScaling.Spec.Schedule.CronExpression) 
// Update cron status with next window information From 5168f3c40f2314f18006b2db47e2259a218f7c25 Mon Sep 17 00:00:00 2001 From: Bharadwaj Anupindi Date: Fri, 29 Aug 2025 15:11:52 +0900 Subject: [PATCH 13/22] Update CRD: Fix SCHEDULE column to show human-readable schedule instead of start time --- config/crd/bases/autoscaling.mercari.com_scheduledscalings.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/crd/bases/autoscaling.mercari.com_scheduledscalings.yaml b/config/crd/bases/autoscaling.mercari.com_scheduledscalings.yaml index 88a0fe79..046e3fe8 100644 --- a/config/crd/bases/autoscaling.mercari.com_scheduledscalings.yaml +++ b/config/crd/bases/autoscaling.mercari.com_scheduledscalings.yaml @@ -30,7 +30,7 @@ spec: - jsonPath: .status.formattedNextStartTime name: Next Start type: string - - jsonPath: .status.formattedStartTime + - jsonPath: .status.humanReadableSchedule name: Schedule type: string - jsonPath: .spec.targetRefs.tortoiseName From 3e50c0916bf986883eb9931dfe4f681cdbbe7544 Mon Sep 17 00:00:00 2001 From: Bharadwaj Anupindi Date: Fri, 29 Aug 2025 16:43:54 +0900 Subject: [PATCH 14/22] Update Deletion Finalizer logic --- .../controller/scheduledscaling_controller.go | 84 ++++++++++++++----- 1 file changed, 63 insertions(+), 21 deletions(-) diff --git a/internal/controller/scheduledscaling_controller.go b/internal/controller/scheduledscaling_controller.go index 65bba2bd..f69a178f 100644 --- a/internal/controller/scheduledscaling_controller.go +++ b/internal/controller/scheduledscaling_controller.go @@ -421,6 +421,8 @@ func (r *ScheduledScalingReconciler) applyNormalScaling(ctx context.Context, sch } else { logger.Info("applyNormalScaling: original spec annotation not found, skipping spec restoration", "tortoise", t.Name) } + } else { + logger.Info("applyNormalScaling: no annotations found on tortoise", "tortoise", t.Name) } // Remove ScheduledScaling annotations after restoring the spec @@ -549,30 +551,65 @@ func (r *ScheduledScalingReconciler) handleFinalizer(ctx context.Context, schedu if controllerutil.ContainsFinalizer(scheduledScaling, scheduledScalingFinalizer) { logger.Info("Running finalizer cleanup for ScheduledScaling", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) - // Try to restore the Tortoise to its original state and clean up annotations - // If the Tortoise doesn't exist anymore, that's fine - just log it - logger.Info("Starting Tortoise cleanup and annotation removal", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) - if err := r.applyNormalScaling(ctx, scheduledScaling); err != nil { - if client.IgnoreNotFound(err) == nil { - // Tortoise was not found (likely already deleted), which is fine - logger.Info("Tortoise not found during finalizer cleanup (likely already deleted)", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + // Only perform cleanup if the resource was ever active (has annotations or was in Active phase) + // If it's still in Pending status, no cleanup is needed + if scheduledScaling.Status.Phase == autoscalingv1alpha1.ScheduledScalingPhasePending { + logger.Info("ScheduledScaling was never active, skipping cleanup", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + } else { + // Try to restore the Tortoise to its original state and clean up annotations + // If the Tortoise doesn't exist anymore, that's fine - just log it + logger.Info("Starting Tortoise cleanup and annotation removal", "name", scheduledScaling.Name, 
"namespace", scheduledScaling.Namespace) + + // Create a timeout context for the cleanup operation to prevent it from getting stuck + cleanupCtx, cancel := context.WithTimeout(ctx, 30*time.Second) + defer cancel() + + if err := r.applyNormalScaling(cleanupCtx, scheduledScaling); err != nil { + if client.IgnoreNotFound(err) == nil { + // Tortoise was not found (likely already deleted), which is fine + logger.Info("Tortoise not found during finalizer cleanup (likely already deleted)", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + } else if cleanupCtx.Err() == context.DeadlineExceeded { + // Cleanup timed out - log warning but continue with finalizer removal + logger.Info("Tortoise cleanup timed out during finalizer cleanup, but continuing with finalizer removal to prevent stuck deletion", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + } else { + // Some other error occurred - log it but don't fail the finalizer removal + // This prevents the resource from getting permanently stuck in deletion + logger.Error(err, "Failed to restore Tortoise during finalizer cleanup, but continuing with finalizer removal to prevent stuck deletion") + } } else { - // Some other error occurred - logger.Error(err, "Failed to restore Tortoise during finalizer cleanup") - return fmt.Errorf("finalizer cleanup failed: %w", err) + logger.Info("Successfully cleaned up Tortoise annotations and restored original spec", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) } - } else { - logger.Info("Successfully cleaned up Tortoise annotations and restored original spec", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) } - // Remove the finalizer + // Remove the finalizer - this should always succeed to prevent stuck deletion controllerutil.RemoveFinalizer(scheduledScaling, scheduledScalingFinalizer) logger.Info("Removing finalizer from ScheduledScaling", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) - if err := r.Update(ctx, scheduledScaling); err != nil { - // If update fails, it might be because the resource is being modified elsewhere - // In this case, we should return an error to trigger a requeue - return fmt.Errorf("failed to remove finalizer: %w", err) + // Try to update the resource to remove the finalizer + // If it fails, retry a few times with exponential backoff to prevent stuck deletion + maxRetries := 3 + for attempt := 1; attempt <= maxRetries; attempt++ { + if err := r.Update(ctx, scheduledScaling); err != nil { + if attempt == maxRetries { + // Final attempt failed - this is critical as it could cause stuck deletion + logger.Error(err, "Failed to remove finalizer after all retries - this could cause stuck deletion", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace, "attempt", attempt) + return fmt.Errorf("failed to remove finalizer after %d attempts: %w", maxRetries, err) + } + + // Wait before retry with exponential backoff + backoffTime := time.Duration(attempt) * time.Second + logger.Info("Failed to remove finalizer, retrying", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace, "attempt", attempt, "backoff", backoffTime) + + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(backoffTime): + continue + } + } else { + // Successfully removed finalizer + break + } } logger.Info("Finalizer cleanup completed for ScheduledScaling", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) @@ -582,11 +619,16 @@ 
func (r *ScheduledScalingReconciler) handleFinalizer(ctx context.Context, schedu // Add finalizer if it doesn't exist if !controllerutil.ContainsFinalizer(scheduledScaling, scheduledScalingFinalizer) { - controllerutil.AddFinalizer(scheduledScaling, scheduledScalingFinalizer) - if err := r.Update(ctx, scheduledScaling); err != nil { - return fmt.Errorf("failed to add finalizer: %w", err) + // Only add finalizer if the resource is not being deleted + if scheduledScaling.DeletionTimestamp == nil { + controllerutil.AddFinalizer(scheduledScaling, scheduledScalingFinalizer) + if err := r.Update(ctx, scheduledScaling); err != nil { + return fmt.Errorf("failed to add finalizer: %w", err) + } + logger.Info("Added finalizer to ScheduledScaling", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + } else { + logger.Info("Skipping finalizer addition for resource being deleted", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) } - logger.Info("Added finalizer to ScheduledScaling", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) } return nil From f79c9f4521d6649fabcf809d3b27934e5e97c35e Mon Sep 17 00:00:00 2001 From: Bharadwaj Anupindi Date: Sun, 31 Aug 2025 09:47:15 +0900 Subject: [PATCH 15/22] Update Deletion Finalizer logic-2 --- .../controller/scheduledscaling_controller.go | 31 +++++-------------- 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/internal/controller/scheduledscaling_controller.go b/internal/controller/scheduledscaling_controller.go index f69a178f..11582823 100644 --- a/internal/controller/scheduledscaling_controller.go +++ b/internal/controller/scheduledscaling_controller.go @@ -34,6 +34,7 @@ import ( autoscalingv1alpha1 "github.com/mercari/tortoise/api/v1alpha1" autoscalingv1beta3 "github.com/mercari/tortoise/api/v1beta3" + "k8s.io/apimachinery/pkg/api/errors" as apierrors ) const ( @@ -585,31 +586,13 @@ func (r *ScheduledScalingReconciler) handleFinalizer(ctx context.Context, schedu controllerutil.RemoveFinalizer(scheduledScaling, scheduledScalingFinalizer) logger.Info("Removing finalizer from ScheduledScaling", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) - // Try to update the resource to remove the finalizer - // If it fails, retry a few times with exponential backoff to prevent stuck deletion - maxRetries := 3 - for attempt := 1; attempt <= maxRetries; attempt++ { - if err := r.Update(ctx, scheduledScaling); err != nil { - if attempt == maxRetries { - // Final attempt failed - this is critical as it could cause stuck deletion - logger.Error(err, "Failed to remove finalizer after all retries - this could cause stuck deletion", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace, "attempt", attempt) - return fmt.Errorf("failed to remove finalizer after %d attempts: %w", maxRetries, err) - } - - // Wait before retry with exponential backoff - backoffTime := time.Duration(attempt) * time.Second - logger.Info("Failed to remove finalizer, retrying", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace, "attempt", attempt, "backoff", backoffTime) - - select { - case <-ctx.Done(): - return ctx.Err() - case <-time.After(backoffTime): - continue - } - } else { - // Successfully removed finalizer - break + if err := r.Update(ctx, scheduledScaling); err != nil { + // If update fails, it might be because the resource is being modified elsewhere. 
+ // Returning an error will trigger a requeue, and the next reconciliation will attempt to remove the finalizer again. + if apierrors.IsConflict(err) { + logger.Info("Conflict detected when removing finalizer, requeueing", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) } + return fmt.Errorf("failed to remove finalizer: %w", err) } logger.Info("Finalizer cleanup completed for ScheduledScaling", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) From 7d78bea0cf5ef9dc87f0f9e43e33eca3e39d1724 Mon Sep 17 00:00:00 2001 From: Bharadwaj Anupindi Date: Sun, 31 Aug 2025 19:08:45 +0900 Subject: [PATCH 16/22] Update Deletion Finalizer logic-3 --- internal/controller/scheduledscaling_controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/controller/scheduledscaling_controller.go b/internal/controller/scheduledscaling_controller.go index 11582823..3ff8dc21 100644 --- a/internal/controller/scheduledscaling_controller.go +++ b/internal/controller/scheduledscaling_controller.go @@ -23,6 +23,7 @@ import ( "time" v1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -34,7 +35,6 @@ import ( autoscalingv1alpha1 "github.com/mercari/tortoise/api/v1alpha1" autoscalingv1beta3 "github.com/mercari/tortoise/api/v1beta3" - "k8s.io/apimachinery/pkg/api/errors" as apierrors ) const ( From 9a1704cebc17dbf32dec4196d97c53a3fa466494 Mon Sep 17 00:00:00 2001 From: Bharadwaj Anupindi Date: Tue, 2 Sep 2025 16:26:54 +0900 Subject: [PATCH 17/22] add webhook for ss config validations --- api/v1alpha1/scheduledscaling_types.go | 31 +- api/v1alpha1/scheduledscaling_webhook.go | 104 ++++++ api/v1alpha1/scheduledscaling_webhook_test.go | 197 +++++++++++ api/v1alpha1/zz_generated.deepcopy.go | 99 +++++- cmd/main.go | 6 + ...scaling.mercari.com_scheduledscalings.yaml | 40 ++- config/webhook/manifests.yaml | 20 ++ .../controller/scheduledscaling_controller.go | 191 ++++++---- .../scheduledscaling_controller_test.go | 328 +++++++++++++++++- 9 files changed, 927 insertions(+), 89 deletions(-) create mode 100644 api/v1alpha1/scheduledscaling_webhook.go create mode 100644 api/v1alpha1/scheduledscaling_webhook_test.go diff --git a/api/v1alpha1/scheduledscaling_types.go b/api/v1alpha1/scheduledscaling_types.go index 00496dd1..c372df5e 100644 --- a/api/v1alpha1/scheduledscaling_types.go +++ b/api/v1alpha1/scheduledscaling_types.go @@ -10,6 +10,7 @@ import ( ) // ScheduledScalingSpec defines the desired state of ScheduledScaling +// +kubebuilder:object:generate=true type ScheduledScalingSpec struct { // Schedule defines when the scaling should occur // +kubebuilder:validation:Required @@ -83,6 +84,7 @@ type TargetRefs struct { } // Strategy defines how the scaling should be performed +// +kubebuilder:object:generate=true type Strategy struct { // Static defines static scaling parameters // +kubebuilder:validation:Required @@ -90,18 +92,26 @@ type Strategy struct { } // StaticStrategy defines static scaling parameters +// +kubebuilder:object:generate=true type StaticStrategy struct { // MinimumMinReplicas sets the minimum number of replicas during scaling // +kubebuilder:validation:Minimum=1 - // +kubebuilder:validation:Required - MinimumMinReplicas int32 `json:"minimumMinReplicas"` + // +kubebuilder:validation:Optional + MinimumMinReplicas *int32 `json:"minimumMinReplicas,omitempty"` // MinAllocatedResources sets the minimum 
allocated resources during scaling - // +kubebuilder:validation:Required - MinAllocatedResources ResourceRequirements `json:"minAllocatedResources"` + // This can be either global (applied to all containers) or container-specific + // +kubebuilder:validation:Optional + MinAllocatedResources *ResourceRequirements `json:"minAllocatedResources,omitempty"` + + // ContainerMinAllocatedResources sets container-specific minimum allocated resources + // If specified, this takes precedence over MinAllocatedResources for specific containers + // +kubebuilder:validation:Optional + ContainerMinAllocatedResources []ContainerResourceRequirements `json:"containerMinAllocatedResources,omitempty"` } // ResourceRequirements describes the compute resource requirements +// +kubebuilder:object:generate=true type ResourceRequirements struct { // CPU specifies the CPU resource requirements // +kubebuilder:validation:Required @@ -112,6 +122,18 @@ type ResourceRequirements struct { Memory string `json:"memory"` } +// ContainerResourceRequirements describes container-specific resource requirements +// +kubebuilder:object:generate=true +type ContainerResourceRequirements struct { + // ContainerName specifies which container these resources apply to + // +kubebuilder:validation:Required + ContainerName string `json:"containerName"` + + // Resources specifies the resource requirements for this container + // +kubebuilder:validation:Required + Resources ResourceRequirements `json:"resources"` +} + // ScheduledScalingState represents the desired state of a scheduled scaling operation type ScheduledScalingState string @@ -187,6 +209,7 @@ const ( //+kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp" // ScheduledScaling is the Schema for the scheduledscalings API +// +kubebuilder:object:generate=true type ScheduledScaling struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` diff --git a/api/v1alpha1/scheduledscaling_webhook.go b/api/v1alpha1/scheduledscaling_webhook.go new file mode 100644 index 00000000..0dbc0d5d --- /dev/null +++ b/api/v1alpha1/scheduledscaling_webhook.go @@ -0,0 +1,104 @@ +package v1alpha1 + +import ( + "fmt" + + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/util/validation/field" + ctrl "sigs.k8s.io/controller-runtime" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/webhook" + "sigs.k8s.io/controller-runtime/pkg/webhook/admission" +) + +// log is for logging in this package. +var scheduledscalinglog = logf.Log.WithName("scheduledscaling-resource") + +func (r *ScheduledScaling) SetupWebhookWithManager(mgr ctrl.Manager) error { + return ctrl.NewWebhookManagedBy(mgr). + For(r). 
+ Complete() +} + +//+kubebuilder:webhook:path=/validate-autoscaling-mercari-com-v1alpha1-scheduledscaling,mutating=false,failurePolicy=fail,sideEffects=None,admissionReviewVersions=v1,groups=autoscaling.mercari.com,resources=scheduledscalings,verbs=create;update,versions=v1alpha1,name=vscheduledscaling.kb.io + +var _ webhook.Validator = &ScheduledScaling{} + +// ValidateCreate implements webhook.Validator so a webhook will be registered for the type +func (r *ScheduledScaling) ValidateCreate() (admission.Warnings, error) { + scheduledscalinglog.Info("validate create", "name", r.Name) + + if err := r.validateSpec(); err != nil { + return nil, err + } + + return nil, nil +} + +// ValidateUpdate implements webhook.Validator so a webhook will be registered for the type +func (r *ScheduledScaling) ValidateUpdate(old runtime.Object) (admission.Warnings, error) { + scheduledscalinglog.Info("validate update", "name", r.Name) + + if err := r.validateSpec(); err != nil { + return nil, err + } + + return nil, nil +} + +// ValidateDelete implements webhook.Validator so a webhook will be registered for the type +func (r *ScheduledScaling) ValidateDelete() (admission.Warnings, error) { + scheduledscalinglog.Info("validate delete", "name", r.Name) + + // No validation needed for delete + return nil, nil +} + +// validateSpec validates the ScheduledScaling spec +func (r *ScheduledScaling) validateSpec() error { + var allErrs field.ErrorList + + // Validate that at least one of minimumMinReplicas or minAllocatedResources is present + if err := r.validateAtLeastOneScalingParameter(); err != nil { + allErrs = append(allErrs, err) + } + + // Validate schedule + if err := r.Spec.Schedule.Validate(); err != nil { + allErrs = append(allErrs, field.Invalid( + field.NewPath("spec", "schedule"), + r.Spec.Schedule, + err.Error(), + )) + } + + if len(allErrs) == 0 { + return nil + } + + return fmt.Errorf("validation failed: %v", allErrs) +} + +// validateAtLeastOneScalingParameter ensures that at least one of minimumMinReplicas, minAllocatedResources, or containerMinAllocatedResources is present +func (r *ScheduledScaling) validateAtLeastOneScalingParameter() *field.Error { + static := r.Spec.Strategy.Static + + // Check if all scaling parameters are missing + if static.MinimumMinReplicas == nil && + static.MinAllocatedResources == nil && + len(static.ContainerMinAllocatedResources) == 0 { + return field.Invalid( + field.NewPath("spec", "strategy", "static"), + r.Spec.Strategy.Static, + "at least one of 'minimumMinReplicas', 'minAllocatedResources', or 'containerMinAllocatedResources' must be specified", + ) + } + + return nil +} + +// GetObjectKind implements runtime.Object +func (r *ScheduledScaling) GetObjectKind() schema.ObjectKind { + return &r.TypeMeta +} diff --git a/api/v1alpha1/scheduledscaling_webhook_test.go b/api/v1alpha1/scheduledscaling_webhook_test.go new file mode 100644 index 00000000..ec7de1fc --- /dev/null +++ b/api/v1alpha1/scheduledscaling_webhook_test.go @@ -0,0 +1,197 @@ +package v1alpha1 + +import ( + "testing" +) + +func TestScheduledScaling_ValidateAtLeastOneScalingParameter(t *testing.T) { + tests := []struct { + name string + ss *ScheduledScaling + wantErr bool + }{ + { + name: "both fields present - should pass", + ss: &ScheduledScaling{ + Spec: ScheduledScalingSpec{ + Strategy: Strategy{ + Static: StaticStrategy{ + MinimumMinReplicas: int32Ptr(5), + MinAllocatedResources: &ResourceRequirements{CPU: "500m", Memory: "1Gi"}, + }, + }, + }, + }, + wantErr: false, + }, + { + name: "all three 
scaling parameters present - should pass", + ss: &ScheduledScaling{ + Spec: ScheduledScalingSpec{ + Strategy: Strategy{ + Static: StaticStrategy{ + MinimumMinReplicas: int32Ptr(5), + MinAllocatedResources: &ResourceRequirements{CPU: "500m", Memory: "1Gi"}, + ContainerMinAllocatedResources: []ContainerResourceRequirements{ + { + ContainerName: "sidecar", + Resources: ResourceRequirements{CPU: "100m", Memory: "128Mi"}, + }, + }, + }, + }, + }, + }, + wantErr: false, + }, + { + name: "only minimumMinReplicas present - should pass", + ss: &ScheduledScaling{ + Spec: ScheduledScalingSpec{ + Strategy: Strategy{ + Static: StaticStrategy{ + MinimumMinReplicas: int32Ptr(5), + }, + }, + }, + }, + wantErr: false, + }, + { + name: "only minAllocatedResources present - should pass", + ss: &ScheduledScaling{ + Spec: ScheduledScalingSpec{ + Strategy: Strategy{ + Static: StaticStrategy{ + MinAllocatedResources: &ResourceRequirements{CPU: "500m", Memory: "1Gi"}, + }, + }, + }, + }, + wantErr: false, + }, + { + name: "only containerMinAllocatedResources present - should pass", + ss: &ScheduledScaling{ + Spec: ScheduledScalingSpec{ + Strategy: Strategy{ + Static: StaticStrategy{ + ContainerMinAllocatedResources: []ContainerResourceRequirements{ + { + ContainerName: "app", + Resources: ResourceRequirements{CPU: "500m", Memory: "1Gi"}, + }, + }, + }, + }, + }, + }, + wantErr: false, + }, + { + name: "both fields missing - should fail", + ss: &ScheduledScaling{ + Spec: ScheduledScalingSpec{ + Strategy: Strategy{ + Static: StaticStrategy{}, + }, + }, + }, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.ss.validateAtLeastOneScalingParameter() + if (err != nil) != tt.wantErr { + t.Errorf("ScheduledScaling.validateAtLeastOneScalingParameter() error = %v, wantErr %v", err, tt.wantErr) + } + }) + } +} + +func TestScheduledScaling_ValidateSpec(t *testing.T) { + tests := []struct { + name string + ss *ScheduledScaling + wantErr bool + }{ + { + name: "valid spec with both fields", + ss: &ScheduledScaling{ + Spec: ScheduledScalingSpec{ + Schedule: Schedule{ + Type: ScheduleTypeTime, + StartAt: "2024-01-15T10:00:00Z", + FinishAt: "2024-01-15T18:00:00Z", + }, + Strategy: Strategy{ + Static: StaticStrategy{ + MinimumMinReplicas: int32Ptr(5), + MinAllocatedResources: &ResourceRequirements{CPU: "500m", Memory: "1Gi"}, + }, + }, + TargetRefs: TargetRefs{ + TortoiseName: "test-tortoise", + }, + }, + }, + wantErr: false, + }, + { + name: "valid spec with only minimumMinReplicas", + ss: &ScheduledScaling{ + Spec: ScheduledScalingSpec{ + Schedule: Schedule{ + Type: ScheduleTypeTime, + StartAt: "2024-01-15T10:00:00Z", + FinishAt: "2024-01-15T18:00:00Z", + }, + Strategy: Strategy{ + Static: StaticStrategy{ + MinimumMinReplicas: int32Ptr(5), + }, + }, + TargetRefs: TargetRefs{ + TortoiseName: "test-tortoise", + }, + }, + }, + wantErr: false, + }, + { + name: "invalid spec - missing both scaling parameters", + ss: &ScheduledScaling{ + Spec: ScheduledScalingSpec{ + Schedule: Schedule{ + Type: ScheduleTypeTime, + StartAt: "2024-01-15T10:00:00Z", + FinishAt: "2024-01-15T18:00:00Z", + }, + Strategy: Strategy{ + Static: StaticStrategy{}, + }, + TargetRefs: TargetRefs{ + TortoiseName: "test-tortoise", + }, + }, + }, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.ss.validateSpec() + if (err != nil) != tt.wantErr { + t.Errorf("ScheduledScaling.validateSpec() error = %v, wantErr %v", err, tt.wantErr) + } + }) + } +} + +// 
Helper function to create int32 pointers +func int32Ptr(i int32) *int32 { + return &i +} diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 9716d2fd..9cef15d0 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -30,15 +30,46 @@ SOFTWARE. package v1alpha1 import ( - runtime "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime" ) +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ContainerResourceRequirements) DeepCopyInto(out *ContainerResourceRequirements) { + *out = *in + out.Resources = in.Resources +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ContainerResourceRequirements. +func (in *ContainerResourceRequirements) DeepCopy() *ContainerResourceRequirements { + if in == nil { + return nil + } + out := new(ContainerResourceRequirements) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ResourceRequirements) DeepCopyInto(out *ResourceRequirements) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResourceRequirements. +func (in *ResourceRequirements) DeepCopy() *ResourceRequirements { + if in == nil { + return nil + } + out := new(ResourceRequirements) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ScheduledScaling) DeepCopyInto(out *ScheduledScaling) { *out = *in out.TypeMeta = in.TypeMeta in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) - out.Spec = in.Spec + in.Spec.DeepCopyInto(&out.Spec) in.Status.DeepCopyInto(&out.Status) } @@ -92,6 +123,24 @@ func (in *ScheduledScalingList) DeepCopyObject() runtime.Object { return nil } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ScheduledScalingSpec) DeepCopyInto(out *ScheduledScalingSpec) { + *out = *in + out.Schedule = in.Schedule + out.TargetRefs = in.TargetRefs + in.Strategy.DeepCopyInto(&out.Strategy) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScheduledScalingSpec. +func (in *ScheduledScalingSpec) DeepCopy() *ScheduledScalingSpec { + if in == nil { + return nil + } + out := new(ScheduledScalingSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ScheduledScalingStatus) DeepCopyInto(out *ScheduledScalingStatus) { *out = *in @@ -119,3 +168,49 @@ func (in *ScheduledScalingStatus) DeepCopy() *ScheduledScalingStatus { in.DeepCopyInto(out) return out } + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *StaticStrategy) DeepCopyInto(out *StaticStrategy) { + *out = *in + if in.MinimumMinReplicas != nil { + in, out := &in.MinimumMinReplicas, &out.MinimumMinReplicas + *out = new(int32) + **out = **in + } + if in.MinAllocatedResources != nil { + in, out := &in.MinAllocatedResources, &out.MinAllocatedResources + *out = new(ResourceRequirements) + **out = **in + } + if in.ContainerMinAllocatedResources != nil { + in, out := &in.ContainerMinAllocatedResources, &out.ContainerMinAllocatedResources + *out = make([]ContainerResourceRequirements, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new StaticStrategy. +func (in *StaticStrategy) DeepCopy() *StaticStrategy { + if in == nil { + return nil + } + out := new(StaticStrategy) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Strategy) DeepCopyInto(out *Strategy) { + *out = *in + in.Static.DeepCopyInto(&out.Static) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Strategy. +func (in *Strategy) DeepCopy() *Strategy { + if in == nil { + return nil + } + out := new(Strategy) + in.DeepCopyInto(out) + return out +} diff --git a/cmd/main.go b/cmd/main.go index 98e4d327..bf39df24 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -47,6 +47,7 @@ import ( autoscalingv2 "github.com/mercari/tortoise/api/autoscaling/v2" v1 "github.com/mercari/tortoise/api/core/v1" + autoscalingv1alpha1 "github.com/mercari/tortoise/api/v1alpha1" autoscalingv1beta3 "github.com/mercari/tortoise/api/v1beta3" "github.com/mercari/tortoise/internal/controller" "github.com/mercari/tortoise/pkg/config" @@ -70,6 +71,7 @@ var ( func init() { utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(autoscalingv1alpha1.AddToScheme(scheme)) utilruntime.Must(autoscalingv1beta3.AddToScheme(scheme)) //+kubebuilder:scaffold:scheme } @@ -193,6 +195,10 @@ func main() { setupLog.Error(err, "unable to create webhook", "webhook", "Tortoise") os.Exit(1) } + if err = (&autoscalingv1alpha1.ScheduledScaling{}).SetupWebhookWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create webhook", "webhook", "ScheduledScaling") + os.Exit(1) + } //+kubebuilder:scaffold:builder hpaWebhook := autoscalingv2.New(tortoiseService, hpaService) diff --git a/config/crd/bases/autoscaling.mercari.com_scheduledscalings.yaml b/config/crd/bases/autoscaling.mercari.com_scheduledscalings.yaml index 046e3fe8..d6199686 100644 --- a/config/crd/bases/autoscaling.mercari.com_scheduledscalings.yaml +++ b/config/crd/bases/autoscaling.mercari.com_scheduledscalings.yaml @@ -120,9 +120,42 @@ spec: static: description: Static defines static scaling parameters properties: + containerMinAllocatedResources: + description: |- + ContainerMinAllocatedResources sets container-specific minimum allocated resources + If specified, this takes precedence over MinAllocatedResources for specific containers + items: + description: ContainerResourceRequirements describes container-specific + resource requirements + properties: + containerName: + description: ContainerName specifies which container + these resources apply to + type: string + resources: + description: Resources specifies the resource requirements + for this container + properties: + cpu: + description: CPU specifies the CPU resource requirements + type: string + memory: + description: Memory specifies the memory resource + 
requirements + type: string + required: + - cpu + - memory + type: object + required: + - containerName + - resources + type: object + type: array minAllocatedResources: - description: MinAllocatedResources sets the minimum allocated - resources during scaling + description: |- + MinAllocatedResources sets the minimum allocated resources during scaling + This can be either global (applied to all containers) or container-specific properties: cpu: description: CPU specifies the CPU resource requirements @@ -140,9 +173,6 @@ spec: format: int32 minimum: 1 type: integer - required: - - minAllocatedResources - - minimumMinReplicas type: object required: - static diff --git a/config/webhook/manifests.yaml b/config/webhook/manifests.yaml index 0b8a979c..c5512602 100644 --- a/config/webhook/manifests.yaml +++ b/config/webhook/manifests.yaml @@ -109,3 +109,23 @@ webhooks: resources: - horizontalpodautoscalers sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: webhook-service + namespace: system + path: /validate-autoscaling-mercari-com-v1alpha1-scheduledscaling + failurePolicy: Fail + name: vscheduledscaling.kb.io + rules: + - apiGroups: + - autoscaling.mercari.com + apiVersions: + - v1alpha1 + operations: + - CREATE + - UPDATE + resources: + - scheduledscalings + sideEffects: None diff --git a/internal/controller/scheduledscaling_controller.go b/internal/controller/scheduledscaling_controller.go index 3ff8dc21..65082153 100644 --- a/internal/controller/scheduledscaling_controller.go +++ b/internal/controller/scheduledscaling_controller.go @@ -290,84 +290,151 @@ func (r *ScheduledScalingReconciler) applyScheduledScaling(ctx context.Context, t.Annotations[annOriginal] = string(orig) } - // Desired resource minimums from strategy - dCPU := scheduledScaling.Spec.Strategy.Static.MinAllocatedResources.CPU - dMem := scheduledScaling.Spec.Strategy.Static.MinAllocatedResources.Memory - var qCPU, qMem resource.Quantity - var hasCPU, hasMem bool - if dCPU != "" { - v, err := resource.ParseQuantity(dCPU) - if err != nil { - return fmt.Errorf("invalid cpu quantity %q: %w", dCPU, err) + // Handle resource scaling - both global and container-specific + updated := false + + // Process global resource requirements (applied to all containers) + var globalCPU, globalMemory resource.Quantity + var hasGlobalCPU, hasGlobalMemory bool + + if scheduledScaling.Spec.Strategy.Static.MinAllocatedResources != nil { + if scheduledScaling.Spec.Strategy.Static.MinAllocatedResources.CPU != "" { + v, err := resource.ParseQuantity(scheduledScaling.Spec.Strategy.Static.MinAllocatedResources.CPU) + if err != nil { + return fmt.Errorf("invalid global cpu quantity %q: %w", scheduledScaling.Spec.Strategy.Static.MinAllocatedResources.CPU, err) + } + globalCPU = v + hasGlobalCPU = true + } + if scheduledScaling.Spec.Strategy.Static.MinAllocatedResources.Memory != "" { + v, err := resource.ParseQuantity(scheduledScaling.Spec.Strategy.Static.MinAllocatedResources.Memory) + if err != nil { + return fmt.Errorf("invalid global memory quantity %q: %w", scheduledScaling.Spec.Strategy.Static.MinAllocatedResources.Memory, err) + } + globalMemory = v + hasGlobalMemory = true } - qCPU = v - hasCPU = true } - if dMem != "" { - v, err := resource.ParseQuantity(dMem) - if err != nil { - return fmt.Errorf("invalid memory quantity %q: %w", dMem, err) + + // Process container-specific resource requirements + containerResources := make(map[string]struct { + cpu resource.Quantity + memory resource.Quantity + hasCPU bool + 
hasMem bool + }) + + if scheduledScaling.Spec.Strategy.Static.ContainerMinAllocatedResources != nil { + for _, containerReq := range scheduledScaling.Spec.Strategy.Static.ContainerMinAllocatedResources { + var cpu, mem resource.Quantity + var hasCPU, hasMem bool + + if containerReq.Resources.CPU != "" { + v, err := resource.ParseQuantity(containerReq.Resources.CPU) + if err != nil { + return fmt.Errorf("invalid cpu quantity for container %s: %q: %w", containerReq.ContainerName, containerReq.Resources.CPU, err) + } + cpu = v + hasCPU = true + } + if containerReq.Resources.Memory != "" { + v, err := resource.ParseQuantity(containerReq.Resources.Memory) + if err != nil { + return fmt.Errorf("invalid memory quantity for container %s: %q: %w", containerReq.ContainerName, containerReq.Resources.Memory, err) + } + mem = v + hasMem = true + } + + containerResources[containerReq.ContainerName] = struct { + cpu resource.Quantity + memory resource.Quantity + hasCPU bool + hasMem bool + }{cpu, mem, hasCPU, hasMem} } - qMem = v - hasMem = true } - updated := false + // Apply resource requirements to each container in the tortoise for i := range t.Spec.ResourcePolicy { pol := &t.Spec.ResourcePolicy[i] + containerName := pol.ContainerName + + // Initialize resource lists if they don't exist if pol.MinAllocatedResources == nil { pol.MinAllocatedResources = v1.ResourceList{} } - if hasCPU { - curr := pol.MinAllocatedResources[v1.ResourceCPU] - if curr.Cmp(qCPU) < 0 { - pol.MinAllocatedResources[v1.ResourceCPU] = qCPU - updated = true - } + if pol.MaxAllocatedResources == nil { + pol.MaxAllocatedResources = v1.ResourceList{} } - if hasMem { - curr := pol.MinAllocatedResources[v1.ResourceMemory] - if curr.Cmp(qMem) < 0 { - pol.MinAllocatedResources[v1.ResourceMemory] = qMem - updated = true - } + + // Check if we have container-specific resources for this container + containerSpec, hasContainerSpec := containerResources[containerName] + + // Apply CPU resources + if hasContainerSpec && containerSpec.hasCPU { + // Container-specific CPU takes precedence + pol.MinAllocatedResources[v1.ResourceCPU] = containerSpec.cpu + pol.MaxAllocatedResources[v1.ResourceCPU] = containerSpec.cpu + updated = true + } else if hasGlobalCPU { + // Apply global CPU to this container + pol.MinAllocatedResources[v1.ResourceCPU] = globalCPU + pol.MaxAllocatedResources[v1.ResourceCPU] = globalCPU + updated = true } - } - if m := scheduledScaling.Spec.Strategy.Static.MinimumMinReplicas; m > 0 { - // Validate that the requested minReplicas is not lower than HPA's recommendation - isValid, warningMsg, recommendedValue := r.validateMinReplicasAgainstHPARecommendation(ctx, t, m) - - // Determine what value to actually use - effectiveMinReplicas := m - if !isValid && recommendedValue > 0 { - // Use the HPA's recommended value instead of the requested value - effectiveMinReplicas = recommendedValue - log.FromContext(ctx).Info("ScheduledScaling minReplicas using HPA recommendation instead of requested value", - "tortoise", t.Name, - "requested", m, - "using", effectiveMinReplicas, - "warning", warningMsg) - } else if !isValid { - // Log the warning but still apply the requested change - log.FromContext(ctx).Info("ScheduledScaling minReplicas validation warning", - "tortoise", t.Name, - "requested", m, - "warning", warningMsg) + // Apply Memory resources + if hasContainerSpec && containerSpec.hasMem { + // Container-specific memory takes precedence + pol.MinAllocatedResources[v1.ResourceMemory] = containerSpec.memory + 
pol.MaxAllocatedResources[v1.ResourceMemory] = containerSpec.memory + updated = true + } else if hasGlobalMemory { + // Apply global memory to this container + pol.MinAllocatedResources[v1.ResourceMemory] = globalMemory + pol.MaxAllocatedResources[v1.ResourceMemory] = globalMemory + updated = true } + } - // Update the status message to include the warning for user visibility - if !isValid && warningMsg != "" { - if scheduledScaling.Status.Message == "" { - scheduledScaling.Status.Message = warningMsg - } else { - scheduledScaling.Status.Message = scheduledScaling.Status.Message + "; " + warningMsg + if scheduledScaling.Spec.Strategy.Static.MinimumMinReplicas != nil { + m := *scheduledScaling.Spec.Strategy.Static.MinimumMinReplicas + if m > 0 { + // Validate that the requested minReplicas is not lower than HPA's recommendation + isValid, warningMsg, recommendedValue := r.validateMinReplicasAgainstHPARecommendation(ctx, t, m) + + // Determine what value to actually use + effectiveMinReplicas := m + if !isValid && recommendedValue > 0 { + // Use the HPA's recommended value instead of the requested value + effectiveMinReplicas = recommendedValue + log.FromContext(ctx).Info("ScheduledScaling minReplicas using HPA recommendation instead of requested value", + "tortoise", t.Name, + "requested", m, + "using", effectiveMinReplicas, + "warning", warningMsg) + } else if !isValid { + // Log the warning but still apply the requested change + log.FromContext(ctx).Info("ScheduledScaling minReplicas validation warning", + "tortoise", t.Name, + "requested", m, + "warning", warningMsg) } - } - // Store the effective minReplicas value (either requested or HPA recommended) - t.Annotations[annMinReplicas] = fmt.Sprintf("%d", effectiveMinReplicas) - updated = true + // Update the status message to include the warning for user visibility + if !isValid && warningMsg != "" { + if scheduledScaling.Status.Message == "" { + scheduledScaling.Status.Message = warningMsg + } else { + scheduledScaling.Status.Message = scheduledScaling.Status.Message + "; " + warningMsg + } + } + + // Store the effective minReplicas value (either requested or HPA recommended) + t.Annotations[annMinReplicas] = fmt.Sprintf("%d", effectiveMinReplicas) + updated = true + } } if !updated { @@ -629,7 +696,7 @@ func (r *ScheduledScalingReconciler) validateMinReplicasAgainstHPARecommendation } // Check if Tortoise has HPA recommendations - if tortoise.Status.Recommendations.Horizontal.MinReplicas == nil || len(tortoise.Status.Recommendations.Horizontal.MinReplicas) == 0 { + if len(tortoise.Status.Recommendations.Horizontal.MinReplicas) == 0 { logger.Info("No HPA minReplicas recommendations available for validation", "tortoise", tortoise.Name) return true, "", 0 } diff --git a/internal/controller/scheduledscaling_controller_test.go b/internal/controller/scheduledscaling_controller_test.go index d659eed6..2d51c750 100644 --- a/internal/controller/scheduledscaling_controller_test.go +++ b/internal/controller/scheduledscaling_controller_test.go @@ -32,6 +32,11 @@ import ( . 
"github.com/onsi/gomega" ) +// Helper function to create int32 pointers +func int32Ptr(i int32) *int32 { + return &i +} + var _ = Describe("ScheduledScaling Controller", func() { const ( timeout = time.Second * 10 @@ -59,8 +64,8 @@ var _ = Describe("ScheduledScaling Controller", func() { }, Strategy: autoscalingv1alpha1.Strategy{ Static: autoscalingv1alpha1.StaticStrategy{ - MinimumMinReplicas: 3, - MinAllocatedResources: autoscalingv1alpha1.ResourceRequirements{ + MinimumMinReplicas: int32Ptr(3), + MinAllocatedResources: &autoscalingv1alpha1.ResourceRequirements{ CPU: "500m", Memory: "512Mi", }, @@ -111,8 +116,8 @@ var _ = Describe("ScheduledScaling Controller", func() { }, Strategy: autoscalingv1alpha1.Strategy{ Static: autoscalingv1alpha1.StaticStrategy{ - MinimumMinReplicas: 3, - MinAllocatedResources: autoscalingv1alpha1.ResourceRequirements{ + MinimumMinReplicas: int32Ptr(3), + MinAllocatedResources: &autoscalingv1alpha1.ResourceRequirements{ CPU: "500m", Memory: "512Mi", }, @@ -176,8 +181,8 @@ var _ = Describe("ScheduledScaling Controller", func() { Schedule: autoscalingv1alpha1.Schedule{Type: autoscalingv1alpha1.ScheduleTypeTime, StartAt: start, FinishAt: finish}, TargetRefs: autoscalingv1alpha1.TargetRefs{TortoiseName: t.Name}, Strategy: autoscalingv1alpha1.Strategy{Static: autoscalingv1alpha1.StaticStrategy{ - MinimumMinReplicas: 5, - MinAllocatedResources: autoscalingv1alpha1.ResourceRequirements{CPU: "500m", Memory: "512Mi"}, + MinimumMinReplicas: int32Ptr(5), + MinAllocatedResources: &autoscalingv1alpha1.ResourceRequirements{CPU: "500m", Memory: "512Mi"}, }}, }, } @@ -206,6 +211,297 @@ var _ = Describe("ScheduledScaling Controller", func() { g.Expect(pol.MinAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("128Mi"))).To(BeTrue()) }, time.Second*12, interval).Should(Succeed()) }) + + It("should apply container-specific resources and override both min and max", func() { + ctx := context.Background() + + // Create a baseline Tortoise with multiple containers and max resources + t := &autoscalingv1beta3.Tortoise{ + ObjectMeta: metav1.ObjectMeta{Name: "test-tortoise-container-specific", Namespace: "default"}, + Spec: autoscalingv1beta3.TortoiseSpec{ + ResourcePolicy: []autoscalingv1beta3.ContainerResourcePolicy{ + { + ContainerName: "app", + MinAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("100m"), + v1.ResourceMemory: resource.MustParse("128Mi"), + }, + MaxAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("2000m"), + v1.ResourceMemory: resource.MustParse("2Gi"), + }, + }, + { + ContainerName: "sidecar", + MinAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("50m"), + v1.ResourceMemory: resource.MustParse("64Mi"), + }, + MaxAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("500m"), + v1.ResourceMemory: resource.MustParse("512Mi"), + }, + }, + }, + }, + } + Expect(k8sClient.Create(ctx, t)).Should(Succeed()) + + // Prepare a ScheduledScaling with container-specific resources + start := time.Now().Add(-1 * time.Second).Format(time.RFC3339) + finish := time.Now().Add(3 * time.Second).Format(time.RFC3339) + ss := &autoscalingv1alpha1.ScheduledScaling{ + ObjectMeta: metav1.ObjectMeta{Name: "test-container-specific", Namespace: "default"}, + Spec: autoscalingv1alpha1.ScheduledScalingSpec{ + Schedule: autoscalingv1alpha1.Schedule{Type: autoscalingv1alpha1.ScheduleTypeTime, StartAt: start, FinishAt: finish}, + TargetRefs: autoscalingv1alpha1.TargetRefs{TortoiseName: t.Name}, + 
Strategy: autoscalingv1alpha1.Strategy{Static: autoscalingv1alpha1.StaticStrategy{ + ContainerMinAllocatedResources: []autoscalingv1alpha1.ContainerResourceRequirements{ + { + ContainerName: "app", + Resources: autoscalingv1alpha1.ResourceRequirements{ + CPU: "1500m", + Memory: "1.5Gi", + }, + }, + { + ContainerName: "sidecar", + Resources: autoscalingv1alpha1.ResourceRequirements{ + CPU: "300m", + Memory: "256Mi", + }, + }, + }, + }}, + }, + } + Expect(k8sClient.Create(ctx, ss)).Should(Succeed()) + + // During active window, Tortoise should be updated with container-specific resources + Eventually(func(g Gomega) { + cur := &autoscalingv1beta3.Tortoise{} + err := k8sClient.Get(ctx, types.NamespacedName{Name: t.Name, Namespace: t.Namespace}, cur) + g.Expect(err).ShouldNot(HaveOccurred()) + g.Expect(cur.Spec.ResourcePolicy).To(HaveLen(2)) + + // Check app container + appPol := cur.Spec.ResourcePolicy[0] + if appPol.ContainerName == "sidecar" { + appPol = cur.Spec.ResourcePolicy[1] + } + g.Expect(appPol.ContainerName).To(Equal("app")) + g.Expect(appPol.MinAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("1500m"))).To(BeTrue()) + g.Expect(appPol.MinAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("1.5Gi"))).To(BeTrue()) + // Max resources should also be overridden + g.Expect(appPol.MaxAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("1500m"))).To(BeTrue()) + g.Expect(appPol.MaxAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("1.5Gi"))).To(BeTrue()) + + // Check sidecar container + sidecarPol := cur.Spec.ResourcePolicy[0] + if sidecarPol.ContainerName == "app" { + sidecarPol = cur.Spec.ResourcePolicy[1] + } + g.Expect(sidecarPol.ContainerName).To(Equal("sidecar")) + g.Expect(sidecarPol.MinAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("300m"))).To(BeTrue()) + g.Expect(sidecarPol.MinAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("256Mi"))).To(BeTrue()) + // Max resources should also be overridden + g.Expect(sidecarPol.MaxAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("300m"))).To(BeTrue()) + g.Expect(sidecarPol.MaxAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("256Mi"))).To(BeTrue()) + }, timeout, interval).Should(Succeed()) + + // After the window finishes, original spec should be restored + Eventually(func(g Gomega) { + cur := &autoscalingv1beta3.Tortoise{} + err := k8sClient.Get(ctx, types.NamespacedName{Name: t.Name, Namespace: t.Namespace}, cur) + g.Expect(err).ShouldNot(HaveOccurred()) + g.Expect(cur.Spec.ResourcePolicy).To(HaveLen(2)) + + // Check app container restoration + appPol := cur.Spec.ResourcePolicy[0] + if appPol.ContainerName == "sidecar" { + appPol = cur.Spec.ResourcePolicy[1] + } + g.Expect(appPol.ContainerName).To(Equal("app")) + g.Expect(appPol.MinAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("100m"))).To(BeTrue()) + g.Expect(appPol.MinAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("128Mi"))).To(BeTrue()) + g.Expect(appPol.MaxAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("2000m"))).To(BeTrue()) + g.Expect(appPol.MaxAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("2Gi"))).To(BeTrue()) + + // Check sidecar container restoration + sidecarPol := cur.Spec.ResourcePolicy[0] + if sidecarPol.ContainerName == "app" { + sidecarPol = cur.Spec.ResourcePolicy[1] + } + g.Expect(sidecarPol.ContainerName).To(Equal("sidecar")) + 
g.Expect(sidecarPol.MinAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("50m"))).To(BeTrue()) + g.Expect(sidecarPol.MinAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("64Mi"))).To(BeTrue()) + g.Expect(sidecarPol.MaxAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("500m"))).To(BeTrue()) + g.Expect(sidecarPol.MaxAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("512Mi"))).To(BeTrue()) + }, time.Second*12, interval).Should(Succeed()) + }) + + It("should apply mixed global and container-specific resources with proper precedence", func() { + ctx := context.Background() + + // Create a baseline Tortoise with multiple containers + t := &autoscalingv1beta3.Tortoise{ + ObjectMeta: metav1.ObjectMeta{Name: "test-tortoise-mixed", Namespace: "default"}, + Spec: autoscalingv1beta3.TortoiseSpec{ + ResourcePolicy: []autoscalingv1beta3.ContainerResourcePolicy{ + { + ContainerName: "app", + MinAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("100m"), + v1.ResourceMemory: resource.MustParse("128Mi"), + }, + MaxAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("2000m"), + v1.ResourceMemory: resource.MustParse("2Gi"), + }, + }, + { + ContainerName: "sidecar", + MinAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("50m"), + v1.ResourceMemory: resource.MustParse("64Mi"), + }, + MaxAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("500m"), + v1.ResourceMemory: resource.MustParse("512Mi"), + }, + }, + { + ContainerName: "monitoring", + MinAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("25m"), + v1.ResourceMemory: resource.MustParse("32Mi"), + }, + MaxAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("200m"), + v1.ResourceMemory: resource.MustParse("256Mi"), + }, + }, + }, + }, + } + Expect(k8sClient.Create(ctx, t)).Should(Succeed()) + + // Prepare a ScheduledScaling with both global and container-specific resources + start := time.Now().Add(-1 * time.Second).Format(time.RFC3339) + finish := time.Now().Add(3 * time.Second).Format(time.RFC3339) + ss := &autoscalingv1alpha1.ScheduledScaling{ + ObjectMeta: metav1.ObjectMeta{Name: "test-mixed-resources", Namespace: "default"}, + Spec: autoscalingv1alpha1.ScheduledScalingSpec{ + Schedule: autoscalingv1alpha1.Schedule{Type: autoscalingv1alpha1.ScheduleTypeTime, StartAt: start, FinishAt: finish}, + TargetRefs: autoscalingv1alpha1.TargetRefs{TortoiseName: t.Name}, + Strategy: autoscalingv1alpha1.Strategy{Static: autoscalingv1alpha1.StaticStrategy{ + // Global resources (fallback for containers without specific specs) + MinAllocatedResources: &autoscalingv1alpha1.ResourceRequirements{ + CPU: "800m", + Memory: "1Gi", + }, + // Container-specific resources (take precedence) + ContainerMinAllocatedResources: []autoscalingv1alpha1.ContainerResourceRequirements{ + { + ContainerName: "app", + Resources: autoscalingv1alpha1.ResourceRequirements{ + CPU: "1500m", + Memory: "1.5Gi", + }, + }, + { + ContainerName: "sidecar", + Resources: autoscalingv1alpha1.ResourceRequirements{ + CPU: "300m", + Memory: "256Mi", + }, + }, + // monitoring container will use global resources + }, + }}, + }, + } + Expect(k8sClient.Create(ctx, ss)).Should(Succeed()) + + // During active window, verify mixed resource application + Eventually(func(g Gomega) { + cur := &autoscalingv1beta3.Tortoise{} + err := k8sClient.Get(ctx, types.NamespacedName{Name: t.Name, Namespace: t.Namespace}, cur) + 
g.Expect(err).ShouldNot(HaveOccurred()) + g.Expect(cur.Spec.ResourcePolicy).To(HaveLen(3)) + + // Find each container policy + var appPol, sidecarPol, monitoringPol *autoscalingv1beta3.ContainerResourcePolicy + for i := range cur.Spec.ResourcePolicy { + switch cur.Spec.ResourcePolicy[i].ContainerName { + case "app": + appPol = &cur.Spec.ResourcePolicy[i] + case "sidecar": + sidecarPol = &cur.Spec.ResourcePolicy[i] + case "monitoring": + monitoringPol = &cur.Spec.ResourcePolicy[i] + } + } + + // App container: should use container-specific resources + g.Expect(appPol).ToNot(BeNil()) + g.Expect(appPol.MinAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("1500m"))).To(BeTrue()) + g.Expect(appPol.MinAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("1.5Gi"))).To(BeTrue()) + g.Expect(appPol.MaxAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("1500m"))).To(BeTrue()) + g.Expect(appPol.MaxAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("1.5Gi"))).To(BeTrue()) + + // Sidecar container: should use container-specific resources + g.Expect(sidecarPol).ToNot(BeNil()) + g.Expect(sidecarPol.MinAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("300m"))).To(BeTrue()) + g.Expect(sidecarPol.MinAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("256Mi"))).To(BeTrue()) + g.Expect(sidecarPol.MaxAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("300m"))).To(BeTrue()) + g.Expect(sidecarPol.MaxAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("256Mi"))).To(BeTrue()) + + // Monitoring container: should use global resources + g.Expect(monitoringPol).ToNot(BeNil()) + g.Expect(monitoringPol.MinAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("800m"))).To(BeTrue()) + g.Expect(monitoringPol.MinAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("1Gi"))).To(BeTrue()) + g.Expect(monitoringPol.MaxAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("800m"))).To(BeTrue()) + g.Expect(monitoringPol.MaxAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("1Gi"))).To(BeTrue()) + }, timeout, interval).Should(Succeed()) + + // After the window finishes, original spec should be restored + Eventually(func(g Gomega) { + cur := &autoscalingv1beta3.Tortoise{} + err := k8sClient.Get(ctx, types.NamespacedName{Name: t.Name, Namespace: t.Namespace}, cur) + g.Expect(err).ShouldNot(HaveOccurred()) + g.Expect(cur.Spec.ResourcePolicy).To(HaveLen(3)) + + // Find each container policy + var appPol, sidecarPol, monitoringPol *autoscalingv1beta3.ContainerResourcePolicy + for i := range cur.Spec.ResourcePolicy { + switch cur.Spec.ResourcePolicy[i].ContainerName { + case "app": + appPol = &cur.Spec.ResourcePolicy[i] + case "sidecar": + sidecarPol = &cur.Spec.ResourcePolicy[i] + case "monitoring": + monitoringPol = &cur.Spec.ResourcePolicy[i] + } + } + + // Verify all containers have original resources restored + g.Expect(appPol.MinAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("100m"))).To(BeTrue()) + g.Expect(appPol.MinAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("128Mi"))).To(BeTrue()) + g.Expect(appPol.MaxAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("2000m"))).To(BeTrue()) + g.Expect(appPol.MaxAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("2Gi"))).To(BeTrue()) + + g.Expect(sidecarPol.MinAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("50m"))).To(BeTrue()) + 
g.Expect(sidecarPol.MinAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("64Mi"))).To(BeTrue()) + g.Expect(sidecarPol.MaxAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("500m"))).To(BeTrue()) + g.Expect(sidecarPol.MaxAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("512Mi"))).To(BeTrue()) + + g.Expect(monitoringPol.MinAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("25m"))).To(BeTrue()) + g.Expect(monitoringPol.MinAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("32Mi"))).To(BeTrue()) + g.Expect(monitoringPol.MaxAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("200m"))).To(BeTrue()) + g.Expect(monitoringPol.MaxAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("256Mi"))).To(BeTrue()) + }, time.Second*12, interval).Should(Succeed()) + }) }) Context("Cron-based Scheduled Scaling", func() { @@ -247,8 +543,8 @@ var _ = Describe("ScheduledScaling Controller", func() { }, Strategy: autoscalingv1alpha1.Strategy{ Static: autoscalingv1alpha1.StaticStrategy{ - MinimumMinReplicas: 5, - MinAllocatedResources: autoscalingv1alpha1.ResourceRequirements{ + MinimumMinReplicas: int32Ptr(5), + MinAllocatedResources: &autoscalingv1alpha1.ResourceRequirements{ CPU: "1000m", Memory: "1Gi", }, @@ -326,8 +622,8 @@ var _ = Describe("ScheduledScaling Controller", func() { }, Strategy: autoscalingv1alpha1.Strategy{ Static: autoscalingv1alpha1.StaticStrategy{ - MinimumMinReplicas: 3, - MinAllocatedResources: autoscalingv1alpha1.ResourceRequirements{ + MinimumMinReplicas: int32Ptr(3), + MinAllocatedResources: &autoscalingv1alpha1.ResourceRequirements{ CPU: "500m", Memory: "512Mi", }, @@ -400,8 +696,8 @@ var _ = Describe("ScheduledScaling Controller", func() { }, Strategy: autoscalingv1alpha1.Strategy{ Static: autoscalingv1alpha1.StaticStrategy{ - MinimumMinReplicas: 3, - MinAllocatedResources: autoscalingv1alpha1.ResourceRequirements{ + MinimumMinReplicas: int32Ptr(3), + MinAllocatedResources: &autoscalingv1alpha1.ResourceRequirements{ CPU: "500m", Memory: "512Mi", }, @@ -474,8 +770,8 @@ var _ = Describe("ScheduledScaling Controller", func() { }, Strategy: autoscalingv1alpha1.Strategy{ Static: autoscalingv1alpha1.StaticStrategy{ - MinimumMinReplicas: 3, - MinAllocatedResources: autoscalingv1alpha1.ResourceRequirements{ + MinimumMinReplicas: int32Ptr(3), + MinAllocatedResources: &autoscalingv1alpha1.ResourceRequirements{ CPU: "500m", Memory: "512Mi", }, @@ -549,8 +845,8 @@ var _ = Describe("ScheduledScaling Controller", func() { }, Strategy: autoscalingv1alpha1.Strategy{ Static: autoscalingv1alpha1.StaticStrategy{ - MinimumMinReplicas: 3, - MinAllocatedResources: autoscalingv1alpha1.ResourceRequirements{ + MinimumMinReplicas: int32Ptr(3), + MinAllocatedResources: &autoscalingv1alpha1.ResourceRequirements{ CPU: "500m", Memory: "512Mi", }, From 9430d3c9d295f5658f17036cc6c419edde776cd9 Mon Sep 17 00:00:00 2001 From: Bharadwaj Anupindi Date: Thu, 11 Sep 2025 11:10:43 +0900 Subject: [PATCH 18/22] modify delete finalizer code --- .../controller/scheduledscaling_controller.go | 133 +++++++++++++----- 1 file changed, 101 insertions(+), 32 deletions(-) diff --git a/internal/controller/scheduledscaling_controller.go b/internal/controller/scheduledscaling_controller.go index 65082153..ef2bfd41 100644 --- a/internal/controller/scheduledscaling_controller.go +++ b/internal/controller/scheduledscaling_controller.go @@ -525,6 +525,7 @@ func (r *ScheduledScalingReconciler) applyNormalScaling(ctx context.Context, sch func (r 
*ScheduledScalingReconciler) updateStatus(ctx context.Context, scheduledScaling *autoscalingv1alpha1.ScheduledScaling, phase autoscalingv1alpha1.ScheduledScalingPhase, reason, message string) error { // Skip status updates if the object is being deleted if scheduledScaling.DeletionTimestamp != nil { + log.FromContext(ctx).V(1).Info("Skipping status update for resource being deleted", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) return nil } @@ -538,6 +539,15 @@ func (r *ScheduledScalingReconciler) updateStatus(ctx context.Context, scheduled scheduledScaling.Status.HumanReadableSchedule = scheduledScaling.GetHumanReadableSchedule() if err := r.Status().Update(ctx, scheduledScaling); err != nil { + if apierrors.IsNotFound(err) { + // Resource was deleted while we were trying to update status + log.FromContext(ctx).Info("ScheduledScaling was deleted while updating status", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + return nil + } + if apierrors.IsConflict(err) { + // Resource was modified, status update will be retried on next reconciliation + log.FromContext(ctx).V(1).Info("Conflict detected during status update, will retry", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + } return fmt.Errorf("failed to update status: %w", err) } } @@ -549,6 +559,7 @@ func (r *ScheduledScalingReconciler) updateStatus(ctx context.Context, scheduled func (r *ScheduledScalingReconciler) updateTimeBasedStatus(ctx context.Context, scheduledScaling *autoscalingv1alpha1.ScheduledScaling, startTime, finishTime time.Time) error { // Skip status updates if the object is being deleted if scheduledScaling.DeletionTimestamp != nil { + log.FromContext(ctx).V(1).Info("Skipping time-based status update for resource being deleted", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) return nil } @@ -569,6 +580,15 @@ func (r *ScheduledScalingReconciler) updateTimeBasedStatus(ctx context.Context, scheduledScaling.Status.HumanReadableSchedule = scheduledScaling.GetHumanReadableSchedule() if err := r.Status().Update(ctx, scheduledScaling); err != nil { + if apierrors.IsNotFound(err) { + // Resource was deleted while we were trying to update status + log.FromContext(ctx).Info("ScheduledScaling was deleted while updating time-based status", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + return nil + } + if apierrors.IsConflict(err) { + // Resource was modified, status update will be retried on next reconciliation + log.FromContext(ctx).V(1).Info("Conflict detected during time-based status update, will retry", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + } return fmt.Errorf("failed to update time-based status: %w", err) } @@ -579,6 +599,7 @@ func (r *ScheduledScalingReconciler) updateTimeBasedStatus(ctx context.Context, func (r *ScheduledScalingReconciler) updateCronStatus(ctx context.Context, scheduledScaling *autoscalingv1alpha1.ScheduledScaling, currentStart, currentEnd, nextStart *time.Time) error { // Skip status updates if the object is being deleted if scheduledScaling.DeletionTimestamp != nil { + log.FromContext(ctx).V(1).Info("Skipping cron status update for resource being deleted", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) return nil } @@ -603,6 +624,15 @@ func (r *ScheduledScalingReconciler) updateCronStatus(ctx context.Context, sched scheduledScaling.Status.HumanReadableSchedule = scheduledScaling.GetHumanReadableSchedule() if err := 
r.Status().Update(ctx, scheduledScaling); err != nil { + if apierrors.IsNotFound(err) { + // Resource was deleted while we were trying to update status + log.FromContext(ctx).Info("ScheduledScaling was deleted while updating cron status", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + return nil + } + if apierrors.IsConflict(err) { + // Resource was modified, status update will be retried on next reconciliation + log.FromContext(ctx).V(1).Info("Conflict detected during cron status update, will retry", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + } return fmt.Errorf("failed to update cron status: %w", err) } @@ -619,47 +649,82 @@ func (r *ScheduledScalingReconciler) handleFinalizer(ctx context.Context, schedu if controllerutil.ContainsFinalizer(scheduledScaling, scheduledScalingFinalizer) { logger.Info("Running finalizer cleanup for ScheduledScaling", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) - // Only perform cleanup if the resource was ever active (has annotations or was in Active phase) - // If it's still in Pending status, no cleanup is needed - if scheduledScaling.Status.Phase == autoscalingv1alpha1.ScheduledScalingPhasePending { - logger.Info("ScheduledScaling was never active, skipping cleanup", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) - } else { - // Try to restore the Tortoise to its original state and clean up annotations - // If the Tortoise doesn't exist anymore, that's fine - just log it - logger.Info("Starting Tortoise cleanup and annotation removal", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) - - // Create a timeout context for the cleanup operation to prevent it from getting stuck - cleanupCtx, cancel := context.WithTimeout(ctx, 30*time.Second) - defer cancel() - - if err := r.applyNormalScaling(cleanupCtx, scheduledScaling); err != nil { - if client.IgnoreNotFound(err) == nil { - // Tortoise was not found (likely already deleted), which is fine - logger.Info("Tortoise not found during finalizer cleanup (likely already deleted)", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) - } else if cleanupCtx.Err() == context.DeadlineExceeded { - // Cleanup timed out - log warning but continue with finalizer removal - logger.Info("Tortoise cleanup timed out during finalizer cleanup, but continuing with finalizer removal to prevent stuck deletion", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) - } else { - // Some other error occurred - log it but don't fail the finalizer removal - // This prevents the resource from getting permanently stuck in deletion - logger.Error(err, "Failed to restore Tortoise during finalizer cleanup, but continuing with finalizer removal to prevent stuck deletion") - } + // Always attempt cleanup, regardless of phase, to ensure we don't leave orphaned annotations + logger.Info("Starting Tortoise cleanup and annotation removal", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + + // Create a timeout context for the cleanup operation to prevent it from getting stuck + // Increase timeout to 60 seconds to handle slow API responses + cleanupCtx, cancel := context.WithTimeout(ctx, 60*time.Second) + defer cancel() + + // Attempt cleanup but don't let it block deletion + cleanupErr := r.applyNormalScaling(cleanupCtx, scheduledScaling) + if cleanupErr != nil { + if client.IgnoreNotFound(cleanupErr) == nil { + // Tortoise was not found (likely already 
deleted), which is fine + logger.Info("Tortoise not found during finalizer cleanup (likely already deleted)", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + } else if cleanupCtx.Err() == context.DeadlineExceeded { + // Cleanup timed out - log warning but continue with finalizer removal + logger.Info("Tortoise cleanup timed out during finalizer cleanup, continuing with finalizer removal to prevent stuck deletion", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace, "timeout", "60s") } else { - logger.Info("Successfully cleaned up Tortoise annotations and restored original spec", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + // Some other error occurred - log it but don't fail the finalizer removal + // This prevents the resource from getting permanently stuck in deletion + logger.Error(cleanupErr, "Failed to restore Tortoise during finalizer cleanup, continuing with finalizer removal to prevent stuck deletion", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) } + } else { + logger.Info("Successfully cleaned up Tortoise annotations and restored original spec", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) } // Remove the finalizer - this should always succeed to prevent stuck deletion controllerutil.RemoveFinalizer(scheduledScaling, scheduledScalingFinalizer) logger.Info("Removing finalizer from ScheduledScaling", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) - if err := r.Update(ctx, scheduledScaling); err != nil { - // If update fails, it might be because the resource is being modified elsewhere. - // Returning an error will trigger a requeue, and the next reconciliation will attempt to remove the finalizer again. 
- if apierrors.IsConflict(err) { - logger.Info("Conflict detected when removing finalizer, requeueing", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + // Retry finalizer removal with exponential backoff in case of conflicts + maxRetries := 3 + for attempt := 0; attempt < maxRetries; attempt++ { + // Refetch the resource to get the latest version before updating + latest := &autoscalingv1alpha1.ScheduledScaling{} + if err := r.Get(ctx, client.ObjectKeyFromObject(scheduledScaling), latest); err != nil { + if apierrors.IsNotFound(err) { + // Resource was already deleted, nothing more to do + logger.Info("ScheduledScaling resource was already deleted", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + return nil + } + logger.Error(err, "Failed to refetch ScheduledScaling for finalizer removal", "attempt", attempt+1) + if attempt == maxRetries-1 { + return fmt.Errorf("failed to refetch resource for finalizer removal after %d attempts: %w", maxRetries, err) + } + time.Sleep(time.Duration(attempt+1) * time.Second) + continue } - return fmt.Errorf("failed to remove finalizer: %w", err) + + // Remove finalizer from the latest version + controllerutil.RemoveFinalizer(latest, scheduledScalingFinalizer) + + if err := r.Update(ctx, latest); err != nil { + if apierrors.IsConflict(err) { + logger.Info("Conflict detected when removing finalizer, retrying", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace, "attempt", attempt+1) + if attempt < maxRetries-1 { + time.Sleep(time.Duration(attempt+1) * time.Second) + continue + } + } + if apierrors.IsNotFound(err) { + // Resource was deleted while we were trying to update it + logger.Info("ScheduledScaling was deleted while removing finalizer", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + return nil + } + logger.Error(err, "Failed to remove finalizer", "attempt", attempt+1) + if attempt == maxRetries-1 { + return fmt.Errorf("failed to remove finalizer after %d attempts: %w", maxRetries, err) + } + time.Sleep(time.Duration(attempt+1) * time.Second) + continue + } + + // Successfully removed finalizer + logger.Info("Successfully removed finalizer from ScheduledScaling", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + break } logger.Info("Finalizer cleanup completed for ScheduledScaling", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) @@ -673,6 +738,10 @@ func (r *ScheduledScalingReconciler) handleFinalizer(ctx context.Context, schedu if scheduledScaling.DeletionTimestamp == nil { controllerutil.AddFinalizer(scheduledScaling, scheduledScalingFinalizer) if err := r.Update(ctx, scheduledScaling); err != nil { + if apierrors.IsConflict(err) { + logger.Info("Conflict detected when adding finalizer, will retry on next reconciliation", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + return fmt.Errorf("failed to add finalizer due to conflict: %w", err) + } return fmt.Errorf("failed to add finalizer: %w", err) } logger.Info("Added finalizer to ScheduledScaling", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) From 3c5eaea247638b1f9854d285095b0a3c256643ce Mon Sep 17 00:00:00 2001 From: Bharadwaj Anupindi Date: Thu, 11 Sep 2025 11:39:28 +0900 Subject: [PATCH 19/22] register the scheduledscaling controller in main.go --- cmd/main.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cmd/main.go b/cmd/main.go index bf39df24..ac9b88e9 100644 --- a/cmd/main.go +++ 
b/cmd/main.go @@ -191,6 +191,13 @@ func main() { setupLog.Error(err, "unable to create controller", "controller", "Tortoise") os.Exit(1) } + if err = (&controller.ScheduledScalingReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "ScheduledScaling") + os.Exit(1) + } if err = (&autoscalingv1beta3.Tortoise{}).SetupWebhookWithManager(mgr); err != nil { setupLog.Error(err, "unable to create webhook", "webhook", "Tortoise") os.Exit(1) From 3abd68581fd1be9c319f8dd0c460f2717b8db0db Mon Sep 17 00:00:00 2001 From: Bharadwaj Anupindi Date: Thu, 11 Sep 2025 12:44:20 +0900 Subject: [PATCH 20/22] update hpa minreplica with tortoise annotation --- .../controller/scheduledscaling_controller.go | 54 +++++++++++++++++-- pkg/hpa/service.go | 33 +++++++++++- 2 files changed, 82 insertions(+), 5 deletions(-) diff --git a/internal/controller/scheduledscaling_controller.go b/internal/controller/scheduledscaling_controller.go index ef2bfd41..75b15231 100644 --- a/internal/controller/scheduledscaling_controller.go +++ b/internal/controller/scheduledscaling_controller.go @@ -431,8 +431,10 @@ func (r *ScheduledScalingReconciler) applyScheduledScaling(ctx context.Context, } } - // Store the effective minReplicas value (either requested or HPA recommended) + // Store the effective minReplicas value in annotation for the Tortoise controller to read t.Annotations[annMinReplicas] = fmt.Sprintf("%d", effectiveMinReplicas) + log.FromContext(ctx).Info("Setting HPA minReplicas annotation for Tortoise controller", "tortoise", t.Name, "minReplicas", effectiveMinReplicas) + updated = true } } @@ -441,8 +443,48 @@ func (r *ScheduledScalingReconciler) applyScheduledScaling(ctx context.Context, log.FromContext(ctx).Info("Scheduled scaling made no changes to tortoise spec", "tortoise", t.Name) return nil } - if err := r.Update(ctx, t); err != nil { - return fmt.Errorf("update tortoise: %w", err) + + // Retry update with conflict resolution + maxRetries := 3 + for attempt := 0; attempt < maxRetries; attempt++ { + if err := r.Update(ctx, t); err != nil { + if apierrors.IsConflict(err) && attempt < maxRetries-1 { + // Refetch the latest version and retry + log.FromContext(ctx).Info("Conflict detected when updating tortoise, refetching and retrying", "tortoise", t.Name, "attempt", attempt+1) + + latest := &autoscalingv1beta3.Tortoise{} + if err := r.Get(ctx, client.ObjectKeyFromObject(t), latest); err != nil { + return fmt.Errorf("failed to refetch tortoise after conflict: %w", err) + } + + // Preserve existing HPA reference if present and not already specified in spec + if latest.Spec.TargetRefs.HorizontalPodAutoscalerName == nil && latest.Status.Targets.HorizontalPodAutoscaler != "" { + latest.Spec.TargetRefs.HorizontalPodAutoscalerName = &latest.Status.Targets.HorizontalPodAutoscaler + } + + // Re-apply the changes to the latest version + if latest.Annotations == nil { + latest.Annotations = map[string]string{} + } + + // Copy our changes to the latest version + for k, v := range t.Annotations { + if k == "autoscaling.mercari.com/scheduledscaling-original-spec" || k == "autoscaling.mercari.com/scheduledscaling-min-replicas" { + latest.Annotations[k] = v + } + } + + // Copy resource policy changes + latest.Spec.ResourcePolicy = t.Spec.ResourcePolicy + + t = latest + time.Sleep(time.Duration(attempt+1) * time.Second) + continue + } + return fmt.Errorf("update tortoise: %w", err) + } + // Update succeeded + break 
} return nil } @@ -482,6 +524,9 @@ func (r *ScheduledScalingReconciler) applyNormalScaling(ctx context.Context, sch spec.TargetRefs.HorizontalPodAutoscalerName = t.Spec.TargetRefs.HorizontalPodAutoscalerName } + // HPA minReplicas restoration is handled by removing the annotation + // The Tortoise controller will restore the original HPA minReplicas when the annotation is removed + logger.Info("applyNormalScaling: restoring tortoise spec", "tortoise", t.Name, "newSpec", spec) t.Spec = spec hasChanges = true @@ -518,6 +563,9 @@ func (r *ScheduledScalingReconciler) applyNormalScaling(ctx context.Context, sch logger.Info("applyNormalScaling: no changes needed", "tortoise", t.Name) } + // HPA minReplicas restoration is handled by removing the annotation + // The Tortoise controller will restore the original HPA minReplicas when the annotation is removed + return nil } diff --git a/pkg/hpa/service.go b/pkg/hpa/service.go index 83ae0a23..7f776b4b 100644 --- a/pkg/hpa/service.go +++ b/pkg/hpa/service.go @@ -8,6 +8,7 @@ import ( "reflect" "regexp" "sort" + "strconv" "time" v2 "k8s.io/api/autoscaling/v2" @@ -420,13 +421,24 @@ func (c *Service) ChangeHPAFromTortoiseRecommendation(tortoise *autoscalingv1bet recommendMax = *tortoise.Spec.MaxReplicas } + // ScheduledScaling: ensure max is not below the scheduled min override + var ssMinOverride int32 + if tortoise.Annotations != nil { + if v, ok := tortoise.Annotations["autoscaling.mercari.com/scheduledscaling-min-replicas"]; ok && v != "" { + if parsed, perr := strconv.ParseInt(v, 10, 32); perr == nil && parsed > 0 { + ssMinOverride = int32(parsed) + } + } + } + if ssMinOverride > 0 && recommendMax < ssMinOverride { + recommendMax = ssMinOverride + } + if recommendMax > c.maximumMaxReplica { c.recorder.Event(tortoise, corev1.EventTypeWarning, event.WarningHittingHardMaxReplicaLimit, fmt.Sprintf("MaxReplica (%v) suggested from Tortoise (%s/%s) hits a cluster-wide maximum replica number (%v). It wouldn't be a problem until the replica number actually grows to %v though, you may want to reach out to your cluster admin.", recommendMax, tortoise.Namespace, tortoise.Name, c.maximumMaxReplica, c.maximumMaxReplica)) recommendMax = c.maximumMaxReplica } - hpa.Spec.MaxReplicas = recommendMax - recommendMin, err := GetReplicasRecommendation(tortoise.Status.Recommendations.Horizontal.MinReplicas, now) if err != nil { return nil, tortoise, fmt.Errorf("get minReplicas recommendation: %w", err) @@ -463,6 +475,23 @@ func (c *Service) ChangeHPAFromTortoiseRecommendation(tortoise *autoscalingv1bet minToActuallyApply = recommendMin } + // ScheduledScaling: enforce configured minimum replicas during Active + if ssMinOverride > 0 && minToActuallyApply < ssMinOverride { + minToActuallyApply = ssMinOverride + } + + // Ensure consistency: MaxReplicas >= MinReplicas + if recommendMax < minToActuallyApply { + recommendMax = minToActuallyApply + if recommendMax > c.maximumMaxReplica { + recommendMax = c.maximumMaxReplica + if minToActuallyApply > recommendMax { + minToActuallyApply = recommendMax + } + } + } + + hpa.Spec.MaxReplicas = recommendMax hpa.Spec.MinReplicas = &minToActuallyApply if tortoise.Spec.UpdateMode != autoscalingv1beta3.UpdateModeOff && recordMetrics { // We don't want to record applied* metric when UpdateMode is Off. 
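A simplified, self-contained restatement of the replica clamping order introduced in PATCH 20 above may help before moving on to the grace-period change: the scheduled minimum is applied first, then the maximum is lifted so it stays at or above it, and finally both values are bounded by the cluster-wide maximum. In the sketch below only the annotation key comes from the patch; clampReplicas, the standalone main, and the numeric values are hypothetical.

```go
// Illustrative sketch, not part of this patch series.
package main

import (
	"fmt"
	"strconv"
)

const scheduledScalingMinReplicasAnnotation = "autoscaling.mercari.com/scheduledscaling-min-replicas"

// clampReplicas raises the recommended minimum to the scheduled override, keeps the
// maximum at or above the minimum, and bounds both by the cluster-wide maximum.
func clampReplicas(recommendMin, recommendMax, maximumMaxReplica int32, annotations map[string]string) (int32, int32) {
	var ssMinOverride int32
	if v, ok := annotations[scheduledScalingMinReplicasAnnotation]; ok && v != "" {
		if parsed, err := strconv.ParseInt(v, 10, 32); err == nil && parsed > 0 {
			ssMinOverride = int32(parsed)
		}
	}

	minReplicas, maxReplicas := recommendMin, recommendMax
	if ssMinOverride > 0 && minReplicas < ssMinOverride {
		minReplicas = ssMinOverride // the scheduled minimum wins over the recommendation
	}
	if maxReplicas < minReplicas {
		maxReplicas = minReplicas // keep the HPA spec self-consistent
	}
	if maxReplicas > maximumMaxReplica {
		maxReplicas = maximumMaxReplica
		if minReplicas > maxReplicas {
			minReplicas = maxReplicas // the cluster-wide ceiling beats everything else
		}
	}
	return minReplicas, maxReplicas
}

func main() {
	// A recommendation of min=3/max=10 with a scheduled minimum of 12 becomes 12/12.
	minReplicas, maxReplicas := clampReplicas(3, 10, 1000, map[string]string{
		scheduledScalingMinReplicasAnnotation: "12",
	})
	fmt.Println(minReplicas, maxReplicas)
}
```

The same ordering is what keeps the real HPA object valid: MinReplicas never exceeds MaxReplicas even when a ScheduledScaling override collides with the configured hard maximum.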
From 33edb43ecf5c1101bbec7abe9ff17149d45bacc9 Mon Sep 17 00:00:00 2001 From: Bharadwaj Anupindi Date: Thu, 11 Sep 2025 16:08:10 +0900 Subject: [PATCH 21/22] add grace period before moving to emeregncy mode --- pkg/hpa/service.go | 133 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 125 insertions(+), 8 deletions(-) diff --git a/pkg/hpa/service.go b/pkg/hpa/service.go index 7f776b4b..6a26459b 100644 --- a/pkg/hpa/service.go +++ b/pkg/hpa/service.go @@ -9,6 +9,7 @@ import ( "regexp" "sort" "strconv" + "strings" "time" v2 "k8s.io/api/autoscaling/v2" @@ -458,7 +459,13 @@ func (c *Service) ChangeHPAFromTortoiseRecommendation(tortoise *autoscalingv1bet switch tortoise.Status.TortoisePhase { case autoscalingv1beta3.TortoisePhaseEmergency: // when emergency mode, we set the same value on minReplicas. - minToActuallyApply = recommendMax + // However, if ScheduledScaling is active, respect its minReplicas override + if ssMinOverride > 0 && ssMinOverride > recommendMax { + // ScheduledScaling requires higher minReplicas than emergency mode would set + minToActuallyApply = ssMinOverride + } else { + minToActuallyApply = recommendMax + } case autoscalingv1beta3.TortoisePhaseBackToNormal: // gradually reduce the minReplicas. currentMin := *hpa.Spec.MinReplicas @@ -839,26 +846,76 @@ func (c *Service) IsHpaMetricAvailable(ctx context.Context, tortoise *autoscalin currentMetrics := currenthpa.Status.CurrentMetrics for _, condition := range conditions { - if condition.Type == "ScalingActive" && condition.Status == "False" && condition.Reason == "FailedGetResourceMetric" { + if condition.Type == "ScalingActive" && condition.Status == "False" && isMetricFailureReason(condition.Reason) { // Always apply grace period before triggering emergency mode to handle temporary // metric unavailability during HPA updates, deployments, scheduled scaling, etc. 
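+			// Rough illustration (the numbers are hypothetical, not taken from this patch): with a
+			// 5-minute grace period, a FailedGetResourceMetric condition that transitioned 3 minutes
+			// ago is still tolerated and the Tortoise stays out of Emergency mode; only once the
+			// condition outlives the grace period (doubled below while the
+			// scheduledscaling-min-replicas annotation is present) do we report the metrics as
+			// unavailable and allow Emergency mode to start.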
conditionAge := time.Since(condition.LastTransitionTime.Time) - if conditionAge < c.emergencyModeGracePeriod { + gracePeriod := c.emergencyModeGracePeriod + + // Extend grace period if ScheduledScaling is active to allow more time for new pods to start + if tortoise.Annotations != nil { + if _, exists := tortoise.Annotations["autoscaling.mercari.com/scheduledscaling-min-replicas"]; exists { + // Double the grace period during ScheduledScaling to account for pod startup time + gracePeriod = gracePeriod * 2 + logger.V(1).Info("ScheduledScaling detected, extending grace period", + "originalGracePeriod", c.emergencyModeGracePeriod, + "extendedGracePeriod", gracePeriod) + } + } + + if conditionAge < gracePeriod { logger.V(1).Info("HPA metric temporarily unavailable, giving grace time before emergency mode", - "gracePeriod", c.emergencyModeGracePeriod, - "conditionAge", conditionAge) + "gracePeriod", gracePeriod, + "conditionAge", conditionAge, + "reason", condition.Reason) return true // Give grace period before emergency mode } // Grace period expired, switch to Emergency mode since no metrics - logger.Info("HPA failed to get resource metrics after grace period, switch to emergency mode", - "gracePeriod", c.emergencyModeGracePeriod, - "conditionAge", conditionAge) + logger.Info("HPA failed to get metrics after grace period, switch to emergency mode", + "gracePeriod", gracePeriod, + "conditionAge", conditionAge, + "reason", condition.Reason) return false } } hasValidMetrics := false + + // Check if currentMetrics is empty or has empty entries (common during rapid scaling) + if len(currentMetrics) == 0 { + logger.V(1).Info("HPA currentMetrics is empty, checking grace period", "hpa", currenthpa.Name) + + // Use lastScaleTime to determine grace period for empty metrics + if currenthpa.Status.LastScaleTime != nil { + timeSinceLastScale := time.Since(currenthpa.Status.LastScaleTime.Time) + gracePeriod := c.emergencyModeGracePeriod + + // Extend grace period if ScheduledScaling is active + if tortoise.Annotations != nil { + if _, exists := tortoise.Annotations["autoscaling.mercari.com/scheduledscaling-min-replicas"]; exists { + gracePeriod = gracePeriod * 2 + } + } + + if timeSinceLastScale < gracePeriod { + logger.V(1).Info("Empty metrics within grace period since last scale", + "hpa", currenthpa.Name, + "timeSinceLastScale", timeSinceLastScale, + "gracePeriod", gracePeriod) + return true + } + } + // If no lastScaleTime or grace period expired, continue to check for emergency mode + } + for _, currentMetric := range currentMetrics { + // Check for empty metric types (common during scaling) + if currentMetric.Type == "" { + // For individual empty metric entries, we'll continue processing other metrics + // The final grace period check at the end will handle the overall empty metrics situation + continue // Skip empty metric entries but continue checking other metrics + } + switch currentMetric.Type { case v2.ContainerResourceMetricSourceType: if currentMetric.ContainerResource != nil && !currentMetric.ContainerResource.Current.Value.IsZero() { @@ -870,6 +927,14 @@ func (c *Service) IsHpaMetricAvailable(ctx context.Context, tortoise *autoscalin // External metrics are also valid hasValidMetrics = true } + case v2.ObjectMetricSourceType: + if currentMetric.Object != nil && !currentMetric.Object.Current.Value.IsZero() { + hasValidMetrics = true + } + case v2.PodsMetricSourceType: + if currentMetric.Pods != nil && !currentMetric.Pods.Current.Value.IsZero() { + hasValidMetrics = true + } } } @@ -877,6 
+942,58 @@ func (c *Service) IsHpaMetricAvailable(ctx context.Context, tortoise *autoscalin return true } + // If ScheduledScaling is active and we have no valid metrics, check if we should still give grace time + if tortoise.Annotations != nil { + if _, exists := tortoise.Annotations["autoscaling.mercari.com/scheduledscaling-min-replicas"]; exists { + // For empty metrics during ScheduledScaling, we need to check the HPA's lastScaleTime + // to determine how long the scaling has been happening + if currenthpa.Status.LastScaleTime != nil { + timeSinceLastScale := time.Since(currenthpa.Status.LastScaleTime.Time) + gracePeriod := c.emergencyModeGracePeriod * 2 // Extended grace during ScheduledScaling + + if timeSinceLastScale < gracePeriod { + logger.Info("No valid metrics during ScheduledScaling, but within grace period since last scale", + "hpa", currenthpa.Name, + "tortoise", tortoise.Name, + "timeSinceLastScale", timeSinceLastScale, + "gracePeriod", gracePeriod) + return true // Still within grace period + } else { + logger.Info("No valid metrics during ScheduledScaling and grace period expired since last scale", + "hpa", currenthpa.Name, + "tortoise", tortoise.Name, + "timeSinceLastScale", timeSinceLastScale, + "gracePeriod", gracePeriod) + // Fall through to trigger emergency mode + } + } else { + // No lastScaleTime available, give benefit of doubt during ScheduledScaling + logger.Info("No valid metrics during ScheduledScaling and no lastScaleTime, giving additional grace", + "hpa", currenthpa.Name, + "tortoise", tortoise.Name) + return true + } + } + } + logger.Info("HPA looks unready because all the metrics indicate zero", "hpa", currenthpa.Name) return false } + +// isMetricFailureReason returns true if the HPA condition reason indicates a metric failure +// that should be covered by the emergency mode grace period +func isMetricFailureReason(reason string) bool { + switch reason { + case "FailedGetResourceMetric", + "FailedGetExternalMetric", + "FailedGetObjectMetric", + "FailedGetPodsMetric", + "FailedComputeMetricsReplicas", + "FailedComputeReplicas": + return true + default: + // Also catch any other "Failed*Metric*" patterns that might be added in future Kubernetes versions + return strings.Contains(reason, "Failed") && + (strings.Contains(reason, "Metric") || strings.Contains(reason, "Compute")) + } +} From 6cc6ba318d497d97a601e5026a7578ee43408110 Mon Sep 17 00:00:00 2001 From: Bharadwaj Anupindi Date: Fri, 12 Sep 2025 13:57:36 +0900 Subject: [PATCH 22/22] updated the docs --- docs/scheduled-scaling-testing-guide.md | 372 ------------------------ docs/scheduled-scaling.md | 335 ++++++++++----------- 2 files changed, 173 insertions(+), 534 deletions(-) delete mode 100644 docs/scheduled-scaling-testing-guide.md diff --git a/docs/scheduled-scaling-testing-guide.md b/docs/scheduled-scaling-testing-guide.md deleted file mode 100644 index ed64bb9a..00000000 --- a/docs/scheduled-scaling-testing-guide.md +++ /dev/null @@ -1,372 +0,0 @@ -# Scheduled Scaling Testing Guide - -This guide provides comprehensive instructions for testing the Scheduled Scaling functionality in Tortoise. - -## Overview - -Scheduled Scaling allows users to predict and prepare for increased resource consumption by scheduling scaling operations in advance. 
This is particularly useful for: - -- **TV Campaigns**: When you know traffic will spike at specific times -- **Push Notifications**: Before sending notifications that will drive user engagement -- **Load Testing**: In development environments to simulate production loads -- **Scheduled Events**: Any predictable increase in resource demand - -## Architecture - -The Scheduled Scaling feature consists of: - -1. **ScheduledScaling CRD** (`autoscaling.mercari.com/v1alpha1`) -2. **ScheduledScaling Controller** - Manages the lifecycle and timing -3. **Integration with Tortoise** - Applies scaling policies to existing Tortoise resources - -## Testing Phases - -### Phase 1: Local Development Testing - -#### Prerequisites -- Go 1.22+ -- kubectl configured -- kind or minikube cluster -- Docker - -#### Setup -```bash -# Clone the repository -git clone -cd tortoise - -# Install dependencies -go mod tidy - -# Generate code -make generate -make manifests - -# Build the controller -make build -``` - -#### Run Tests -```bash -# Run unit tests -make test - -# Run specific controller tests -go test ./internal/controller/ -v - -# Run with coverage -make test-coverage -``` - -### Phase 2: Integration Testing - -#### Deploy to Local Cluster -```bash -# Deploy CRDs -make install - -# Deploy the controller -make deploy - -# Verify deployment -kubectl get pods -n tortoise-system -``` - -#### Test Basic Functionality -```bash -# Apply example resources -kubectl apply -f test/scheduled-scaling-example.yaml - -# Check resource status -kubectl get scheduledscalings -kubectl get tortoises -kubectl get deployments -``` - -### Phase 3: End-to-End Testing - -#### Use the Test Script -```bash -# Make script executable -chmod +x scripts/test-scheduled-scaling.sh - -# Run comprehensive test -./scripts/test-scheduled-scaling.sh -``` - -#### Monitor in Real-Time -```bash -# Use monitoring script -chmod +x scripts/monitor-scheduled-scaling.sh -./scripts/monitor-scheduled-scaling.sh default 5 -``` - -## Testing Scenarios - -### Scenario 1: Basic Scheduled Scaling - -**Objective**: Verify that a ScheduledScaling resource can be created and managed. - -**Steps**: -1. Create a ScheduledScaling with start time 1 minute in the future -2. Verify status transitions: Pending → Active → Completed -3. Check that Tortoise resources are updated accordingly - -**Expected Results**: -- Status phase transitions correctly -- Tortoise scaling policies are applied during active period -- Resources return to normal after completion - -### Scenario 2: Invalid Schedule Validation - -**Objective**: Ensure the controller properly validates schedule times. - -**Steps**: -1. Create a ScheduledScaling with finish time before start time -2. Verify the resource enters Failed state -3. Check error messages and reasons - -**Expected Results**: -- Resource status becomes Failed -- Appropriate error reason is set -- Controller logs show validation errors - -### Scenario 3: Resource Conflict Handling - -**Objective**: Test behavior when multiple scaling policies conflict. - -**Steps**: -1. Create a ScheduledScaling that overlaps with existing Tortoise policies -2. Verify graceful handling of conflicts -3. Check that the most appropriate policy is applied - -**Expected Results**: -- No resource corruption -- Clear logging of policy decisions -- Graceful fallback behavior - -### Scenario 4: Time Zone Handling - -**Objective**: Verify proper handling of different time zones. - -**Steps**: -1. Create ScheduledScaling resources with various time zone formats -2. 
Test edge cases around midnight and DST transitions -3. Verify consistent behavior across time zones - -**Expected Results**: -- Consistent behavior regardless of time zone -- Proper handling of edge cases -- Clear time format requirements - -## Verification Commands - -### Check Resource Status -```bash -# ScheduledScaling resources -kubectl get scheduledscalings -A -o wide - -# Tortoise resources -kubectl get tortoises -A -o wide - -# Deployment status -kubectl get deployments -A -o wide - -# HPA status -kubectl get hpa -A -o wide -``` - -### View Detailed Information -```bash -# ScheduledScaling details -kubectl describe scheduledscaling -n - -# Tortoise details -kubectl describe tortoise -n - -# Events -kubectl get events -n --sort-by='.lastTimestamp' -``` - -### Monitor Controller Logs -```bash -# Controller logs -kubectl logs -n tortoise-system deployment/tortoise-controller-manager -f - -# Filter for scheduled scaling events -kubectl logs -n tortoise-system deployment/tortoise-controller-manager | grep -i "scheduled" -``` - -## Troubleshooting - -### Common Issues - -#### 1. Controller Not Starting -```bash -# Check controller status -kubectl get pods -n tortoise-system - -# Check logs -kubectl logs -n tortoise-system deployment/tortoise-controller-manager - -# Verify CRDs are installed -kubectl get crd | grep scheduledscaling -``` - -#### 2. Resources Not Updating -```bash -# Check controller reconciliation -kubectl logs -n tortoise-system deployment/tortoise-controller-manager | grep -i "reconciling" - -# Verify resource ownership -kubectl get scheduledscaling -o yaml | grep -A 5 "ownerReferences" - -# Check for finalizers -kubectl get scheduledscaling -o yaml | grep -A 5 "finalizers" -``` - -#### 3. Time Parsing Errors -```bash -# Verify time format -kubectl get scheduledscaling -o yaml | grep -A 5 "schedule" - -# Check controller logs for parsing errors -kubectl logs -n tortoise-system deployment/tortoise-controller-manager | grep -i "time\|parse" -``` - -### Debug Mode - -Enable debug logging for more detailed information: - -```bash -# Update deployment with debug log level -kubectl patch deployment tortoise-controller-manager -n tortoise-system -p '{"spec":{"template":{"spec":{"containers":[{"name":"manager","env":[{"name":"LOG_LEVEL","value":"DEBUG"}]}]}}}}' - -# Restart the controller -kubectl rollout restart deployment/tortoise-controller-manager -n tortoise-system -``` - -## Performance Testing - -### Load Testing -```bash -# Create multiple ScheduledScaling resources -for i in {1..10}; do - kubectl apply -f - < -o yaml +kubectl get hpa -o yaml -1. Apply CRDs: -```bash -kubectl apply -f config/crd/bases/ +# Monitor controller logs +kubectl logs -n mercari-tortoise-lab deployment/tortoise-controller-manager -f ``` -2. Deploy the controller: -```bash -make deploy -``` +## Troubleshooting -3. Verify deployment: -```bash -kubectl get pods -n tortoise-system -``` +### Common Issues -## Monitoring and Troubleshooting +1. **Status Not Updating**: Ensure ScheduledScaling controller is registered in `main.go` +2. **Emergency Mode Override**: Check HPA conditions for metric failures; ScheduledScaling extends grace period automatically +3. **Resource Not Restored**: Finalizer ensures cleanup even if ScheduledScaling is force-deleted +4. **HPA MinReplicas Not Applied**: Verify Tortoise has HPA reference and annotation is set +5. 
**Conflict Errors**: Controller automatically retries with latest resource version ### Controller Logs ```bash -kubectl logs -n tortoise-system deployment/tortoise-controller-manager -c manager +# Check ScheduledScaling controller logs +kubectl logs -n mercari-tortoise-lab deployment/tortoise-controller-manager -f | grep scheduledscaling + +# Check Tortoise controller logs for HPA updates +kubectl logs -n mercari-tortoise-lab deployment/tortoise-controller-manager -f | grep -i hpa ``` -### Resource Status +### Debugging Commands ```bash +# Check ScheduledScaling status kubectl get scheduledscalings -A kubectl describe scheduledscaling -n -``` -### Common Issues - -1. **Invalid Schedule Times**: Ensure startAt is before finishAt and both are valid ISO 8601 timestamps -2. **Missing Tortoise**: Verify the target Tortoise resource exists -3. **RBAC Issues**: Check controller permissions for tortoises and scheduledscalings -4. **Time Zone Confusion**: All times are in UTC, convert local times accordingly - -## Future Enhancements - -### Planned Features - -- **Dynamic Strategies**: CPU/memory-based scaling instead of static values -- **Recurring Schedules**: Daily, weekly, or monthly recurring scaling patterns -- **Multi-Resource Targeting**: Scale multiple Tortoise resources simultaneously -- **Policy Templates**: Reusable scaling configurations -- **Metrics Integration**: Scale based on historical traffic patterns -- **Webhook Notifications**: Notify external systems of scaling events - -### API Evolution - -The feature is currently in `v1alpha1`, indicating it's experimental. Future versions may include: -- Breaking changes to improve the API design -- Additional validation rules -- Enhanced status fields -- Migration guides between versions - -## Contributing +# Check Tortoise annotations +kubectl get tortoise -o yaml | grep -A5 -B5 scheduledscaling -### Development Setup - -1. Fork the repository -2. Create a feature branch -3. Make your changes -4. Add tests -5. Run linting and tests -6. Submit a pull request - -### Code Generation - -After modifying API types: -```bash -make generate # Generate deepcopy methods -make manifests # Generate CRDs and RBAC +# Check HPA current state +kubectl get hpa -o yaml ``` -### Testing Guidelines - -- Write unit tests for new controller logic -- Test edge cases (invalid schedules, missing resources) -- Verify error handling and status updates -- Test configuration restoration - -## Support - -For questions or issues: -- Create a GitHub issue -- Tag with `area/scheduled-scaling` -- Provide reproduction steps and logs -- Include cluster and operator versions - -## References +## Notes -- [Tortoise Documentation](../README.md) -- [Kubernetes HPA Documentation](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/) -- [Kubebuilder Book](https://book.kubebuilder.io/) -- [Controller Runtime](https://pkg.go.dev/sigs.k8s.io/controller-runtime) +- **Timezone Default**: All schedules default to `Asia/Tokyo` timezone +- **Grace Period**: Extended to 10 minutes during ScheduledScaling to prevent false emergency mode triggers +- **Annotation-Based**: Uses Tortoise annotations for clean integration with existing controller +- **Conflict Resolution**: Automatic retry logic handles concurrent updates +- **Emergency Mode**: Respects ScheduledScaling minReplicas even when in emergency mode
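
The grace-period behaviour noted above (5 minutes by default, 10 minutes while a ScheduledScaling window is active) comes from the annotation check added to the HPA service in the first patch of this series. The sketch below is a minimal, self-contained restatement of that calculation for illustration only; `effectiveGracePeriod` is a hypothetical helper name, and in the real code the logic is inlined in `IsHpaMetricAvailable`.

```go
package main

import (
	"fmt"
	"time"
)

// Annotation key taken from the patch; its presence marks an active ScheduledScaling window.
const scheduledScalingAnnotation = "autoscaling.mercari.com/scheduledscaling-min-replicas"

// effectiveGracePeriod mirrors the patch logic: double the configured
// emergency-mode grace period while ScheduledScaling is active, so newly
// started pods have time to report metrics before emergency mode kicks in.
func effectiveGracePeriod(base time.Duration, annotations map[string]string) time.Duration {
	if annotations != nil {
		if _, ok := annotations[scheduledScalingAnnotation]; ok {
			return base * 2
		}
	}
	return base
}

func main() {
	base := 5 * time.Minute // default wired into hpa.New in this series
	fmt.Println(effectiveGracePeriod(base, nil))                                                 // 5m0s
	fmt.Println(effectiveGracePeriod(base, map[string]string{scheduledScalingAnnotation: "10"})) // 10m0s
}
```

The same doubled value is compared against both the age of the failing HPA condition and the time since the HPA's `lastScaleTime` when `currentMetrics` is empty, so a scale-up that briefly leaves metrics blank does not flip the Tortoise into emergency mode.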
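Patch 01 also adds a small classifier, `isMetricFailureReason`, that decides which HPA condition reasons are covered by the grace period. The example below reproduces that classifier and exercises it with a few reasons; the inputs are illustrative, not an exhaustive list of real HPA condition reasons.

```go
package main

import (
	"fmt"
	"strings"
)

// isMetricFailureReason follows the patch: known metric-failure reasons match
// explicitly, and a Failed*Metric*/Failed*Compute* fallback catches reasons
// that newer Kubernetes versions might introduce.
func isMetricFailureReason(reason string) bool {
	switch reason {
	case "FailedGetResourceMetric",
		"FailedGetExternalMetric",
		"FailedGetObjectMetric",
		"FailedGetPodsMetric",
		"FailedComputeMetricsReplicas",
		"FailedComputeReplicas":
		return true
	default:
		return strings.Contains(reason, "Failed") &&
			(strings.Contains(reason, "Metric") || strings.Contains(reason, "Compute"))
	}
}

func main() {
	fmt.Println(isMetricFailureReason("FailedGetResourceMetric"))  // true: explicit case
	fmt.Println(isMetricFailureReason("FailedGetContainerMetric")) // true: hypothetical reason caught by the fallback pattern
	fmt.Println(isMetricFailureReason("ValidMetricFound"))         // false: not a failure, no grace period needed
}
```

Only conditions whose reason passes this check start the grace-period clock; everything else falls through to the normal emergency-mode evaluation.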