diff --git a/api/v1alpha1/groupversion_info.go b/api/v1alpha1/groupversion_info.go new file mode 100644 index 00000000..1df4db52 --- /dev/null +++ b/api/v1alpha1/groupversion_info.go @@ -0,0 +1,18 @@ +// +groupName=autoscaling.mercari.com +package v1alpha1 + +import ( + "k8s.io/apimachinery/pkg/runtime/schema" + "sigs.k8s.io/controller-runtime/pkg/scheme" +) + +var ( + // GroupVersion is group version used to register these objects + GroupVersion = schema.GroupVersion{Group: "autoscaling.mercari.com", Version: "v1alpha1"} + + // SchemeBuilder is used to add go types to the GroupVersionKind scheme + SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} + + // AddToScheme adds the types in this group-version to the given scheme. + AddToScheme = SchemeBuilder.AddToScheme +) diff --git a/api/v1alpha1/scheduledscaling_types.go b/api/v1alpha1/scheduledscaling_types.go new file mode 100644 index 00000000..c372df5e --- /dev/null +++ b/api/v1alpha1/scheduledscaling_types.go @@ -0,0 +1,666 @@ +package v1alpha1 + +import ( + "fmt" + "strconv" + "strings" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// ScheduledScalingSpec defines the desired state of ScheduledScaling +// +kubebuilder:object:generate=true +type ScheduledScalingSpec struct { + // Schedule defines when the scaling should occur + // +kubebuilder:validation:Required + Schedule Schedule `json:"schedule"` + + // TargetRefs specifies which resources this scheduled scaling should affect + // +kubebuilder:validation:Required + TargetRefs TargetRefs `json:"targetRefs"` + + // Strategy defines how the scaling should be performed + // +kubebuilder:validation:Required + Strategy Strategy `json:"strategy"` + + // Status indicates the current state of the scheduled scaling + // +kubebuilder:validation:Enum=Inactive;Active + // +kubebuilder:default=Inactive + Status ScheduledScalingState `json:"status,omitempty"` +} + +// ScheduleType defines the type of scheduling to use +type ScheduleType string + 
+const ( + // ScheduleTypeTime uses specific start and end times + ScheduleTypeTime ScheduleType = "time" + // ScheduleTypeCron uses cron expression for periodic scheduling + ScheduleTypeCron ScheduleType = "cron" +) + +// Schedule defines the timing for scheduled scaling +type Schedule struct { + // Type specifies the scheduling type: "time" or "cron" + // +kubebuilder:validation:Required + // +kubebuilder:validation:Enum=time;cron + Type ScheduleType `json:"type"` + + // Time-based scheduling fields (used when type="time") + // StartAt specifies when the scaling should begin + // Format: RFC3339 (e.g., "2024-01-15T10:00:00Z") + // +kubebuilder:validation:Optional + StartAt string `json:"startAt,omitempty"` + + // FinishAt specifies when the scaling should end and return to normal + // Format: RFC3339 (e.g., "2024-01-15T18:00:00Z") + // +kubebuilder:validation:Optional + FinishAt string `json:"finishAt,omitempty"` + + // Cron-based scheduling fields (used when type="cron") + // CronExpression defines when scaling periods should start using cron format + // Format: "minute hour day month dayofweek" (e.g., "0 9 * * 1-5" for 9 AM weekdays) + // +kubebuilder:validation:Optional + CronExpression string `json:"cronExpression,omitempty"` + + // Duration specifies how long each scaling period should last + // Format: Go duration (e.g., "8h", "30m", "1h30m") + // +kubebuilder:validation:Optional + Duration string `json:"duration,omitempty"` + + // TimeZone specifies the timezone for cron-based scheduling + // Format: IANA timezone (e.g., "Asia/Tokyo", "UTC", "America/New_York") + // +kubebuilder:validation:Optional + // +kubebuilder:default="Asia/Tokyo" + TimeZone string `json:"timeZone,omitempty"` +} + +// TargetRefs specifies which resources to scale +type TargetRefs struct { + // TortoiseName is the name of the Tortoise resource to scale + // +kubebuilder:validation:Required + TortoiseName string `json:"tortoiseName"` +} + +// Strategy defines how the scaling should be 
performed +// +kubebuilder:object:generate=true +type Strategy struct { + // Static defines static scaling parameters + // +kubebuilder:validation:Required + Static StaticStrategy `json:"static"` +} + +// StaticStrategy defines static scaling parameters +// +kubebuilder:object:generate=true +type StaticStrategy struct { + // MinimumMinReplicas sets the minimum number of replicas during scaling + // +kubebuilder:validation:Minimum=1 + // +kubebuilder:validation:Optional + MinimumMinReplicas *int32 `json:"minimumMinReplicas,omitempty"` + + // MinAllocatedResources sets the minimum allocated resources during scaling + // This can be either global (applied to all containers) or container-specific + // +kubebuilder:validation:Optional + MinAllocatedResources *ResourceRequirements `json:"minAllocatedResources,omitempty"` + + // ContainerMinAllocatedResources sets container-specific minimum allocated resources + // If specified, this takes precedence over MinAllocatedResources for specific containers + // +kubebuilder:validation:Optional + ContainerMinAllocatedResources []ContainerResourceRequirements `json:"containerMinAllocatedResources,omitempty"` +} + +// ResourceRequirements describes the compute resource requirements +// +kubebuilder:object:generate=true +type ResourceRequirements struct { + // CPU specifies the CPU resource requirements + // +kubebuilder:validation:Required + CPU string `json:"cpu"` + + // Memory specifies the memory resource requirements + // +kubebuilder:validation:Required + Memory string `json:"memory"` +} + +// ContainerResourceRequirements describes container-specific resource requirements +// +kubebuilder:object:generate=true +type ContainerResourceRequirements struct { + // ContainerName specifies which container these resources apply to + // +kubebuilder:validation:Required + ContainerName string `json:"containerName"` + + // Resources specifies the resource requirements for this container + // +kubebuilder:validation:Required + Resources 
ResourceRequirements `json:"resources"` +} + +// ScheduledScalingState represents the desired state of a scheduled scaling operation +type ScheduledScalingState string + +const ( + // ScheduledScalingStateInactive means the scheduled scaling is not active + ScheduledScalingStateInactive ScheduledScalingState = "Inactive" + // ScheduledScalingStateActive means the scheduled scaling is currently active + ScheduledScalingStateActive ScheduledScalingState = "Active" +) + +// ScheduledScalingStatus defines the observed state of ScheduledScaling +// +kubebuilder:object:generate=true +type ScheduledScalingStatus struct { + // Phase indicates the current phase of the scheduled scaling + // +kubebuilder:validation:Enum=Pending;Active;Completed;Failed + Phase ScheduledScalingPhase `json:"phase,omitempty"` + + // LastTransitionTime is the last time the status transitioned from one phase to another + LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty"` + + // Message provides additional information about the current phase + Message string `json:"message,omitempty"` + + // Reason indicates why the scheduled scaling is in the current phase + Reason string `json:"reason,omitempty"` + + // CurrentWindowStart indicates the start of the current scaling window (for cron-based scheduling) + CurrentWindowStart *metav1.Time `json:"currentWindowStart,omitempty"` + + // CurrentWindowEnd indicates the end of the current scaling window (for cron-based scheduling) + CurrentWindowEnd *metav1.Time `json:"currentWindowEnd,omitempty"` + + // NextWindowStart indicates when the next scaling window will start (for cron-based scheduling) + NextWindowStart *metav1.Time `json:"nextWindowStart,omitempty"` + + // HumanReadableSchedule provides a human-readable description of the schedule + HumanReadableSchedule string `json:"humanReadableSchedule,omitempty"` + + // FormattedStartTime provides a human-readable formatted start time + FormattedStartTime string 
`json:"formattedStartTime,omitempty"` + + // FormattedEndTime provides a human-readable formatted end time + FormattedEndTime string `json:"formattedEndTime,omitempty"` + + // FormattedNextStartTime provides a human-readable formatted next start time + FormattedNextStartTime string `json:"formattedNextStartTime,omitempty"` +} + +// ScheduledScalingPhase represents the phase of a scheduled scaling operation +type ScheduledScalingPhase string + +const ( + // ScheduledScalingPhasePending means the scheduled scaling is waiting to start + ScheduledScalingPhasePending ScheduledScalingPhase = "Pending" + // ScheduledScalingPhaseActive means the scheduled scaling is currently active + ScheduledScalingPhaseActive ScheduledScalingPhase = "Active" + // ScheduledScalingPhaseCompleted means the scheduled scaling has completed successfully + ScheduledScalingPhaseCompleted ScheduledScalingPhase = "Completed" + // ScheduledScalingPhaseFailed means the scheduled scaling has failed + ScheduledScalingPhaseFailed ScheduledScalingPhase = "Failed" +) + +//+kubebuilder:object:root=true +//+kubebuilder:subresource:status +//+kubebuilder:printcolumn:name="Status",type="string",JSONPath=".status.phase" +//+kubebuilder:printcolumn:name="Type",type="string",JSONPath=".spec.schedule.type" +//+kubebuilder:printcolumn:name="Start Time",type="string",JSONPath=".status.formattedStartTime" +//+kubebuilder:printcolumn:name="End Time",type="string",JSONPath=".status.formattedEndTime" +//+kubebuilder:printcolumn:name="Next Start",type="string",JSONPath=".status.formattedNextStartTime" +//+kubebuilder:printcolumn:name="Schedule",type="string",JSONPath=".status.humanReadableSchedule" +//+kubebuilder:printcolumn:name="Target Tortoise",type="string",JSONPath=".spec.targetRefs.tortoiseName" +//+kubebuilder:printcolumn:name="Warnings",type="string",JSONPath=".status.message" +//+kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp" + +// ScheduledScaling is the Schema for the 
scheduledscalings API +// +kubebuilder:object:generate=true +type ScheduledScaling struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec ScheduledScalingSpec `json:"spec,omitempty"` + Status ScheduledScalingStatus `json:"status,omitempty"` +} + +//+kubebuilder:object:root=true + +// ScheduledScalingList contains a list of ScheduledScaling +type ScheduledScalingList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []ScheduledScaling `json:"items"` +} + +func init() { + SchemeBuilder.Register(&ScheduledScaling{}, &ScheduledScalingList{}) +} + +// GetHumanReadableSchedule returns a human-readable description of the schedule +func (s *ScheduledScaling) GetHumanReadableSchedule() string { + if s.Spec.Schedule.Type == ScheduleTypeCron { + return s.getHumanReadableCronSchedule() + } + return s.getHumanReadableTimeSchedule() +} + +// getHumanReadableCronSchedule returns a human-readable description of cron schedule +func (s *ScheduledScaling) getHumanReadableCronSchedule() string { + if s.Spec.Schedule.CronExpression == "" { + return "Invalid cron schedule" + } + + // Parse cron expression and provide human-readable description + desc := s.parseCronExpression(s.Spec.Schedule.CronExpression) + + // Add duration information + if s.Spec.Schedule.Duration != "" { + duration, err := time.ParseDuration(s.Spec.Schedule.Duration) + if err == nil { + desc += s.formatDuration(duration) + } else { + desc += " for " + s.Spec.Schedule.Duration + } + } + + // Add timezone information if different from default + if s.Spec.Schedule.TimeZone != "" && s.Spec.Schedule.TimeZone != "Asia/Tokyo" { + desc += " (" + s.Spec.Schedule.TimeZone + ")" + } + + return desc +} + +// formatDuration formats duration in a human-readable way +func (s *ScheduledScaling) formatDuration(duration time.Duration) string { + if duration < time.Hour { + minutes := int(duration.Minutes()) + return fmt.Sprintf(" for %d 
minute%s", minutes, pluralSuffix(minutes)) + } else if duration < 24*time.Hour { + hours := int(duration.Hours()) + minutes := int(duration.Minutes()) % 60 + if minutes == 0 { + return fmt.Sprintf(" for %d hour%s", hours, pluralSuffix(hours)) + } else { + return fmt.Sprintf(" for %dh %dm", hours, minutes) + } + } else { + days := int(duration.Hours() / 24) + hours := int(duration.Hours()) % 24 + if hours == 0 { + return fmt.Sprintf(" for %d day%s", days, pluralSuffix(days)) + } else { + return fmt.Sprintf(" for %d day%s %d hour%s", days, pluralSuffix(days), hours, pluralSuffix(hours)) + } + } +} + +// getHumanReadableTimeSchedule returns a human-readable description of time-based schedule +func (s *ScheduledScaling) getHumanReadableTimeSchedule() string { + if s.Spec.Schedule.StartAt == "" || s.Spec.Schedule.FinishAt == "" { + return "Invalid time schedule" + } + + startTime, err := time.Parse(time.RFC3339, s.Spec.Schedule.StartAt) + if err != nil { + return "Invalid start time format" + } + + finishTime, err := time.Parse(time.RFC3339, s.Spec.Schedule.FinishAt) + if err != nil { + return "Invalid finish time format" + } + + // Calculate duration + duration := finishTime.Sub(startTime) + + // Create a descriptive schedule name + scheduleName := s.generateScheduleName(startTime, finishTime, duration) + + // Build the description - shortened version + if duration < 24*time.Hour { + // Less than 24 hours + if duration < time.Hour { + minutes := int(duration.Minutes()) + return fmt.Sprintf("%s (%d min) - One-time", scheduleName, minutes) + } else { + hours := int(duration.Hours()) + minutes := int(duration.Minutes()) % 60 + if minutes == 0 { + return fmt.Sprintf("%s (%d hr) - One-time", scheduleName, hours) + } else { + return fmt.Sprintf("%s (%dh %dm) - One-time", scheduleName, hours, minutes) + } + } + } else { + // 24 hours or more + days := int(duration.Hours() / 24) + hours := int(duration.Hours()) % 24 + if hours == 0 { + return fmt.Sprintf("%s (%d day) - 
One-time", scheduleName, days) + } else { + return fmt.Sprintf("%s (%d day %dh) - One-time", scheduleName, days, hours) + } + } +} + +// generateScheduleName creates a descriptive name for the schedule +func (s *ScheduledScaling) generateScheduleName(startTime, finishTime time.Time, duration time.Duration) string { + // Check for common patterns + startHour := startTime.Hour() + finishHour := finishTime.Hour() + + // Business hours pattern + if startHour >= 8 && startHour <= 10 && finishHour >= 17 && finishHour <= 19 { + if duration >= 8*time.Hour && duration <= 10*time.Hour { + return "Business Hours" + } + } + + // Night shift pattern + if (startHour >= 22 || startHour <= 6) && (finishHour >= 22 || finishHour <= 6) { + return "Night Shift" + } + + // Weekend pattern + if startTime.Weekday() == time.Saturday || startTime.Weekday() == time.Sunday || + finishTime.Weekday() == time.Saturday || finishTime.Weekday() == time.Sunday { + return "Weekend" + } + + // Campaign/Event pattern (longer duration) + if duration >= 24*time.Hour { + return "Campaign" + } + + // Peak hours pattern + if (startHour >= 7 && startHour <= 9) || (startHour >= 17 && startHour <= 19) { + return "Peak Hours" + } + + // Default based on duration + if duration < time.Hour { + return "Quick Scaling" + } else if duration < 24*time.Hour { + return "Daily Scaling" + } else { + return "Extended Scaling" + } +} + +// parseCronExpression converts cron expression to human-readable text +func (s *ScheduledScaling) parseCronExpression(cronExpr string) string { + // Enhanced mapping for common cron patterns + patterns := map[string]string{ + "0 9 * * 1-5": "Weekdays at 9:00 AM", + "0 8,20 * * *": "Daily at 8:00 AM and 8:00 PM", + "0 0 1 * *": "Monthly on 1st at midnight", + "0 2 * * 6": "Weekly on Saturday at 2:00 AM", + "0 8 * * 1-5": "Weekdays at 8:00 AM", + "0 10 * * 1-5": "Weekdays at 10:00 AM", + "0 17 * * 1-5": "Weekdays at 5:00 PM", + "0 18 * * 1-5": "Weekdays at 6:00 PM", + "*/5 * * * *": "Every 5 
minutes", + "*/10 * * * *": "Every 10 minutes", + "*/15 * * * *": "Every 15 minutes", + "*/30 * * * *": "Every 30 minutes", + "0 */2 * * *": "Every 2 hours", + "0 */3 * * *": "Every 3 hours", + "0 */4 * * *": "Every 4 hours", + "0 */6 * * *": "Every 6 hours", + "0 */8 * * *": "Every 8 hours", + "0 */12 * * *": "Every 12 hours", + "0 0 * * *": "Daily at midnight", + "0 6 * * *": "Daily at 6:00 AM", + "0 12 * * *": "Daily at noon", + "0 18 * * *": "Daily at 6:00 PM", + "0 0 * * 1": "Weekly on Monday at midnight", + "0 0 * * 6": "Weekly on Saturday at midnight", + "0 0 * * 0": "Weekly on Sunday at midnight", + "0 0 15 * *": "Monthly on 15th at midnight", + "0 0 1 1 *": "Yearly on January 1st at midnight", + } + + if desc, exists := patterns[cronExpr]; exists { + return desc + } + + // For unknown patterns, provide intelligent parsing + parts := strings.Fields(cronExpr) + if len(parts) >= 5 { + minute := parts[0] + hour := parts[1] + day := parts[2] + month := parts[3] + weekday := parts[4] + + desc := s.buildCronDescription(minute, hour, day, month, weekday) + return desc + } + + return "Cron: " + cronExpr +} + +// buildCronDescription builds a human-readable description from cron parts +func (s *ScheduledScaling) buildCronDescription(minute, hour, day, month, weekday string) string { + var desc strings.Builder + + // Handle minutes + if minute != "*" && minute != "0" { + if minute == "*/5" { + desc.WriteString("Every 5 minutes ") + } else if minute == "*/10" { + desc.WriteString("Every 10 minutes ") + } else if minute == "*/15" { + desc.WriteString("Every 15 minutes ") + } else if minute == "*/30" { + desc.WriteString("Every 30 minutes ") + } else { + desc.WriteString(fmt.Sprintf("At %s minutes past ", minute)) + } + } + + // Handle hours + if hour != "*" { + if strings.Contains(hour, ",") { + hours := strings.Split(hour, ",") + if len(hours) == 2 { + desc.WriteString(fmt.Sprintf("at %s:00 and %s:00 ", hours[0], hours[1])) + } else { + 
desc.WriteString(fmt.Sprintf("at %s ", hour)) + } + } else if strings.Contains(hour, "-") { + hourRange := strings.Split(hour, "-") + if len(hourRange) == 2 { + desc.WriteString(fmt.Sprintf("between %s:00 and %s:00 ", hourRange[0], hourRange[1])) + } else { + desc.WriteString(fmt.Sprintf("at %s:00 ", hour)) + } + } else if strings.Contains(hour, "/") { + hourStep := strings.Split(hour, "/") + if len(hourStep) == 2 { + desc.WriteString(fmt.Sprintf("every %s hours starting at %s:00 ", hourStep[1], hourStep[0])) + } else { + desc.WriteString(fmt.Sprintf("at %s:00 ", hour)) + } + } else { + desc.WriteString(fmt.Sprintf("at %s:00 ", hour)) + } + } + + // Handle days + if day != "*" { + if day == "1" { + desc.WriteString("on the 1st ") + } else if day == "15" { + desc.WriteString("on the 15th ") + } else if day == "31" { + desc.WriteString("on the 31st ") + } else { + desc.WriteString(fmt.Sprintf("on day %s ", day)) + } + } + + // Handle months + if month != "*" { + monthNames := []string{"", "January", "February", "March", "April", "May", "June", + "July", "August", "September", "October", "November", "December"} + if monthNum, err := strconv.Atoi(month); err == nil && monthNum >= 1 && monthNum <= 12 { + desc.WriteString(fmt.Sprintf("in %s ", monthNames[monthNum])) + } else { + desc.WriteString(fmt.Sprintf("in month %s ", month)) + } + } + + // Handle weekdays + if weekday != "*" { + if weekday == "1-5" { + desc.WriteString("on weekdays") + } else if weekday == "6-7" || weekday == "0,6" { + desc.WriteString("on weekends") + } else if weekday == "1" { + desc.WriteString("on Mondays") + } else if weekday == "2" { + desc.WriteString("on Tuesdays") + } else if weekday == "3" { + desc.WriteString("on Wednesdays") + } else if weekday == "4" { + desc.WriteString("on Thursdays") + } else if weekday == "5" { + desc.WriteString("on Fridays") + } else if weekday == "6" { + desc.WriteString("on Saturdays") + } else if weekday == "0" || weekday == "7" { + desc.WriteString("on 
Sundays") + } else if strings.Contains(weekday, ",") { + days := strings.Split(weekday, ",") + if len(days) == 2 { + desc.WriteString(fmt.Sprintf("on %s and %s", s.getWeekdayName(days[0]), s.getWeekdayName(days[1]))) + } else { + desc.WriteString(fmt.Sprintf("on weekdays %s", weekday)) + } + } else { + desc.WriteString(fmt.Sprintf("on weekday %s", weekday)) + } + } + + result := strings.TrimSpace(desc.String()) + if result == "" { + return "Every minute" + } + + return result +} + +// getWeekdayName converts weekday number to name +func (s *ScheduledScaling) getWeekdayName(weekday string) string { + switch weekday { + case "0", "7": + return "Sunday" + case "1": + return "Monday" + case "2": + return "Tuesday" + case "3": + return "Wednesday" + case "4": + return "Thursday" + case "5": + return "Friday" + case "6": + return "Saturday" + default: + return weekday + } +} + +// GetTimezone returns the timezone to use for formatting times +// Defaults to Asia/Tokyo if not specified +func (s *ScheduledScaling) GetTimezone() *time.Location { + timezone := s.Spec.Schedule.TimeZone + if timezone == "" { + timezone = "Asia/Tokyo" // Default timezone + } + + loc, err := time.LoadLocation(timezone) + if err != nil { + // Fallback to Asia/Tokyo if the specified timezone is invalid + loc, _ = time.LoadLocation("Asia/Tokyo") + } + return loc +} + +// GetHumanReadableTime formats a time in a human-readable way +func (s *ScheduledScaling) GetHumanReadableTime(t *metav1.Time) string { + if t == nil { + return "-" + } + + loc := s.GetTimezone() + now := time.Now().In(loc) + timeDiff := t.Time.In(loc).Sub(now) + + // Format based on how far in the past/future the time is + if timeDiff < 0 { + // Past time + absDiff := -timeDiff + if absDiff < time.Minute { + return "just now" + } else if absDiff < time.Hour { + minutes := int(absDiff.Minutes()) + return fmt.Sprintf("%d minute%s ago", minutes, pluralSuffix(minutes)) + } else if absDiff < 24*time.Hour { + hours := int(absDiff.Hours()) 
+ return fmt.Sprintf("%d hour%s ago", hours, pluralSuffix(hours)) + } else { + days := int(absDiff.Hours() / 24) + return fmt.Sprintf("%d day%s ago", days, pluralSuffix(days)) + } + } else { + // Future time + if timeDiff < time.Minute { + return "starting now" + } else if timeDiff < time.Hour { + minutes := int(timeDiff.Minutes()) + return fmt.Sprintf("in %d minute%s", minutes, pluralSuffix(minutes)) + } else if timeDiff < 24*time.Hour { + hours := int(timeDiff.Hours()) + return fmt.Sprintf("in %d hour%s", hours, pluralSuffix(hours)) + } else { + days := int(timeDiff.Hours() / 24) + return fmt.Sprintf("in %d day%s", days, pluralSuffix(days)) + } + } +} + +// GetFormattedTime returns a human-readable formatted time with date information +func (s *ScheduledScaling) GetFormattedTime(t *metav1.Time) string { + if t == nil { + return "-" + } + + loc := s.GetTimezone() + now := time.Now().In(loc) + targetTime := t.Time.In(loc) + + // If it's today, show "Today at time" + if targetTime.Year() == now.Year() && targetTime.YearDay() == now.YearDay() { + return fmt.Sprintf("Today at %s", targetTime.Format("3:04 PM")) + } + + // If it's tomorrow, show "Tomorrow at time" + if targetTime.Year() == now.Year() && targetTime.YearDay() == now.YearDay()+1 { + return fmt.Sprintf("Tomorrow at %s", targetTime.Format("3:04 PM")) + } + + // If it's within a week, show day and time + if targetTime.Sub(now) < 7*24*time.Hour { + return targetTime.Format("Mon 3:04 PM") + } + + // Otherwise show date and time + return targetTime.Format("Jan 2, 3:04 PM") +} + +// pluralSuffix returns the appropriate plural suffix +func pluralSuffix(count int) string { + if count == 1 { + return "" + } + return "s" +} diff --git a/api/v1alpha1/scheduledscaling_validation.go b/api/v1alpha1/scheduledscaling_validation.go new file mode 100644 index 00000000..a15df7b2 --- /dev/null +++ b/api/v1alpha1/scheduledscaling_validation.go @@ -0,0 +1,199 @@ +package v1alpha1 + +import ( + "fmt" + "time" + + 
"github.com/robfig/cron/v3" +) + +// ValidateSchedule validates the schedule configuration based on the type +func (s *Schedule) Validate() error { + switch s.Type { + case ScheduleTypeTime: + return s.validateTimeBasedSchedule() + case ScheduleTypeCron: + return s.validateCronBasedSchedule() + default: + return fmt.Errorf("invalid schedule type: %s, must be 'time' or 'cron'", s.Type) + } +} + +// validateTimeBasedSchedule validates time-based scheduling parameters +func (s *Schedule) validateTimeBasedSchedule() error { + // Validate required fields for time-based scheduling + if s.StartAt == "" { + return fmt.Errorf("startAt is required when type is 'time'") + } + if s.FinishAt == "" { + return fmt.Errorf("finishAt is required when type is 'time'") + } + + // Validate time format (RFC3339) + startTime, err := time.Parse(time.RFC3339, s.StartAt) + if err != nil { + return fmt.Errorf("invalid startAt format: %v, expected RFC3339 format (e.g., '2024-01-15T10:00:00Z')", err) + } + + finishTime, err := time.Parse(time.RFC3339, s.FinishAt) + if err != nil { + return fmt.Errorf("invalid finishAt format: %v, expected RFC3339 format (e.g., '2024-01-15T18:00:00Z')", err) + } + + // Validate that finish time is after start time + if finishTime.Before(startTime) || finishTime.Equal(startTime) { + return fmt.Errorf("finishAt (%s) must be after startAt (%s)", s.FinishAt, s.StartAt) + } + + // Validate that cron-specific fields are not set + if s.CronExpression != "" { + return fmt.Errorf("cronExpression cannot be set when type is 'time'") + } + if s.Duration != "" { + return fmt.Errorf("duration cannot be set when type is 'time'") + } + + return nil +} + +// validateCronBasedSchedule validates cron-based scheduling parameters +func (s *Schedule) validateCronBasedSchedule() error { + // Validate required fields for cron-based scheduling + if s.CronExpression == "" { + return fmt.Errorf("cronExpression is required when type is 'cron'") + } + if s.Duration == "" { + return 
fmt.Errorf("duration is required when type is 'cron'") + } + + // Validate cron expression format + parser := cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow) + _, err := parser.Parse(s.CronExpression) + if err != nil { + return fmt.Errorf("invalid cronExpression format: %v, expected format 'minute hour day month dayofweek' (e.g., '0 9 * * 1-5')", err) + } + + // Validate duration format + duration, err := time.ParseDuration(s.Duration) + if err != nil { + return fmt.Errorf("invalid duration format: %v, expected Go duration format (e.g., '8h', '30m', '1h30m')", err) + } + + // Validate duration is reasonable (not too short or too long) + if duration < time.Minute { + return fmt.Errorf("duration must be at least 1 minute") + } + if duration > 24*time.Hour { + return fmt.Errorf("duration must not exceed 24 hours") + } + + // Validate timezone format if provided + if s.TimeZone != "" && s.TimeZone != "Asia/Tokyo" { + _, err := time.LoadLocation(s.TimeZone) + if err != nil { + return fmt.Errorf("invalid timeZone: %v, expected IANA timezone format (e.g., 'Asia/Tokyo', 'America/New_York')", err) + } + } + + // Validate that time-specific fields are not set + if s.StartAt != "" { + return fmt.Errorf("startAt cannot be set when type is 'cron'") + } + if s.FinishAt != "" { + return fmt.Errorf("finishAt cannot be set when type is 'cron'") + } + + return nil +} + +// GetNextScheduleWindow returns the next scheduled scaling window for cron-based scheduling +func (s *Schedule) GetNextScheduleWindow(from time.Time) (startTime, endTime time.Time, err error) { + if s.Type != ScheduleTypeCron { + return time.Time{}, time.Time{}, fmt.Errorf("GetNextScheduleWindow is only supported for cron-based scheduling") + } + + // Parse timezone + location, _ := time.LoadLocation("Asia/Tokyo") // Default to Tokyo timezone + if s.TimeZone != "" && s.TimeZone != "Asia/Tokyo" { + location, err = time.LoadLocation(s.TimeZone) + if err != nil { + return time.Time{}, 
time.Time{}, fmt.Errorf("failed to parse timezone: %v", err) + } + } + + // Parse duration + duration, err := time.ParseDuration(s.Duration) + if err != nil { + return time.Time{}, time.Time{}, fmt.Errorf("failed to parse duration: %v", err) + } + + // Parse cron expression + parser := cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow) + schedule, err := parser.Parse(s.CronExpression) + if err != nil { + return time.Time{}, time.Time{}, fmt.Errorf("failed to parse cron expression: %v", err) + } + + // Convert from time to the specified timezone + fromInLocation := from.In(location) + + // Get next scheduled time + nextTime := schedule.Next(fromInLocation) + endTimeInLocation := nextTime.Add(duration) + + // Convert back to UTC for consistency + return nextTime.UTC(), endTimeInLocation.UTC(), nil +} + +// IsCurrentlyActive checks if the current time falls within an active scaling window +func (s *Schedule) IsCurrentlyActive(now time.Time) (bool, time.Time, error) { + switch s.Type { + case ScheduleTypeTime: + startTime, err := time.Parse(time.RFC3339, s.StartAt) + if err != nil { + return false, time.Time{}, err + } + finishTime, err := time.Parse(time.RFC3339, s.FinishAt) + if err != nil { + return false, time.Time{}, err + } + + isActive := now.After(startTime) && now.Before(finishTime) + return isActive, finishTime, nil + + case ScheduleTypeCron: + // For cron-based scheduling, we need to check if we're within the current window + // We'll look back to find the most recent start time + lookBackTime := now.Add(-25 * time.Hour) // Look back 25 hours to catch daily schedules + + // Get the most recent schedule window that could be active + startTime, endTime, err := s.GetNextScheduleWindow(lookBackTime) + if err != nil { + return false, time.Time{}, err + } + + // Check if we're currently in this window + for startTime.Before(now) { + if now.Before(endTime) { + // We're in an active window + return true, endTime, nil + } + + // Move to next 
window + startTime, endTime, err = s.GetNextScheduleWindow(startTime.Add(time.Minute)) + if err != nil { + return false, time.Time{}, err + } + + // Prevent infinite loop - if start time is too far in the future, break + if startTime.After(now.Add(time.Hour)) { + break + } + } + + return false, time.Time{}, nil + + default: + return false, time.Time{}, fmt.Errorf("unsupported schedule type: %s", s.Type) + } +} diff --git a/api/v1alpha1/scheduledscaling_validation_test.go b/api/v1alpha1/scheduledscaling_validation_test.go new file mode 100644 index 00000000..55963cc8 --- /dev/null +++ b/api/v1alpha1/scheduledscaling_validation_test.go @@ -0,0 +1,488 @@ +package v1alpha1 + +import ( + "fmt" + "strings" + "testing" + "time" +) + +func TestSchedule_Validate(t *testing.T) { + tests := []struct { + name string + schedule Schedule + wantErr bool + errMsg string + }{ + // Time-based scheduling tests + { + name: "valid time-based schedule", + schedule: Schedule{ + Type: ScheduleTypeTime, + StartAt: "2024-01-15T10:00:00Z", + FinishAt: "2024-01-15T18:00:00Z", + }, + wantErr: false, + }, + { + name: "time-based missing startAt", + schedule: Schedule{ + Type: ScheduleTypeTime, + FinishAt: "2024-01-15T18:00:00Z", + }, + wantErr: true, + errMsg: "startAt is required when type is 'time'", + }, + { + name: "time-based missing finishAt", + schedule: Schedule{ + Type: ScheduleTypeTime, + StartAt: "2024-01-15T10:00:00Z", + }, + wantErr: true, + errMsg: "finishAt is required when type is 'time'", + }, + { + name: "time-based invalid startAt format", + schedule: Schedule{ + Type: ScheduleTypeTime, + StartAt: "2024-01-15 10:00:00", + FinishAt: "2024-01-15T18:00:00Z", + }, + wantErr: true, + errMsg: "invalid startAt format", + }, + { + name: "time-based invalid finishAt format", + schedule: Schedule{ + Type: ScheduleTypeTime, + StartAt: "2024-01-15T10:00:00Z", + FinishAt: "2024-01-15 18:00:00", + }, + wantErr: true, + errMsg: "invalid finishAt format", + }, + { + name: "time-based 
finishAt before startAt", + schedule: Schedule{ + Type: ScheduleTypeTime, + StartAt: "2024-01-15T18:00:00Z", + FinishAt: "2024-01-15T10:00:00Z", + }, + wantErr: true, + errMsg: "must be after startAt", + }, + { + name: "time-based finishAt equals startAt", + schedule: Schedule{ + Type: ScheduleTypeTime, + StartAt: "2024-01-15T10:00:00Z", + FinishAt: "2024-01-15T10:00:00Z", + }, + wantErr: true, + errMsg: "must be after startAt", + }, + { + name: "time-based with cron fields should fail", + schedule: Schedule{ + Type: ScheduleTypeTime, + StartAt: "2024-01-15T10:00:00Z", + FinishAt: "2024-01-15T18:00:00Z", + CronExpression: "0 9 * * *", + }, + wantErr: true, + errMsg: "cronExpression cannot be set when type is 'time'", + }, + + // Cron-based scheduling tests + { + name: "valid cron-based schedule", + schedule: Schedule{ + Type: ScheduleTypeCron, + CronExpression: "0 9 * * 1-5", + Duration: "8h", + TimeZone: "Asia/Tokyo", + }, + wantErr: false, + }, + { + name: "valid cron-based schedule with timezone", + schedule: Schedule{ + Type: ScheduleTypeCron, + CronExpression: "0 9 * * 1-5", + Duration: "8h", + TimeZone: "Asia/Tokyo", + }, + wantErr: false, + }, + { + name: "cron-based missing cronExpression", + schedule: Schedule{ + Type: ScheduleTypeCron, + Duration: "8h", + }, + wantErr: true, + errMsg: "cronExpression is required when type is 'cron'", + }, + { + name: "cron-based missing duration", + schedule: Schedule{ + Type: ScheduleTypeCron, + CronExpression: "0 9 * * 1-5", + }, + wantErr: true, + errMsg: "duration is required when type is 'cron'", + }, + { + name: "cron-based invalid cron expression", + schedule: Schedule{ + Type: ScheduleTypeCron, + CronExpression: "invalid cron", + Duration: "8h", + }, + wantErr: true, + errMsg: "invalid cronExpression format", + }, + { + name: "cron-based invalid duration", + schedule: Schedule{ + Type: ScheduleTypeCron, + CronExpression: "0 9 * * 1-5", + Duration: "invalid duration", + }, + wantErr: true, + errMsg: "invalid 
duration format", + }, + { + name: "cron-based duration too short", + schedule: Schedule{ + Type: ScheduleTypeCron, + CronExpression: "0 9 * * 1-5", + Duration: "30s", + }, + wantErr: true, + errMsg: "duration must be at least 1 minute", + }, + { + name: "cron-based duration too long", + schedule: Schedule{ + Type: ScheduleTypeCron, + CronExpression: "0 9 * * 1-5", + Duration: "25h", + }, + wantErr: true, + errMsg: "duration must not exceed 24 hours", + }, + { + name: "cron-based invalid timezone", + schedule: Schedule{ + Type: ScheduleTypeCron, + CronExpression: "0 9 * * 1-5", + Duration: "8h", + TimeZone: "Invalid/Timezone", + }, + wantErr: true, + errMsg: "invalid timeZone", + }, + { + name: "cron-based with time fields should fail", + schedule: Schedule{ + Type: ScheduleTypeCron, + CronExpression: "0 9 * * 1-5", + Duration: "8h", + StartAt: "2024-01-15T10:00:00Z", + }, + wantErr: true, + errMsg: "startAt cannot be set when type is 'cron'", + }, + + // Invalid type tests + { + name: "invalid schedule type", + schedule: Schedule{ + Type: "invalid", + }, + wantErr: true, + errMsg: "invalid schedule type: invalid", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.schedule.Validate() + if (err != nil) != tt.wantErr { + t.Errorf("Schedule.Validate() error = %v, wantErr %v", err, tt.wantErr) + return + } + if tt.wantErr && tt.errMsg != "" { + if err == nil { + t.Errorf("Expected error containing '%s', got no error", tt.errMsg) + } else if !containsPattern(err.Error(), tt.errMsg) { + t.Errorf("Expected error containing '%s', got '%s'", tt.errMsg, err.Error()) + } + } + }) + } +} + +func TestSchedule_GetNextScheduleWindow(t *testing.T) { + tests := []struct { + name string + schedule Schedule + from time.Time + wantErr bool + errMsg string + }{ + { + name: "valid cron schedule - daily", + schedule: Schedule{ + Type: ScheduleTypeCron, + CronExpression: "0 9 * * *", // Daily at 9 AM + Duration: "8h", + TimeZone: "Asia/Tokyo", + }, 
+ from: time.Date(2024, 1, 15, 8, 0, 0, 0, time.UTC), // 8 AM + wantErr: false, + }, + { + name: "valid cron schedule - weekdays", + schedule: Schedule{ + Type: ScheduleTypeCron, + CronExpression: "0 9 * * 1-5", // Weekdays at 9 AM + Duration: "8h", + TimeZone: "Asia/Tokyo", + }, + from: time.Date(2024, 1, 15, 8, 0, 0, 0, time.UTC), // Monday 8 AM UTC + wantErr: false, + }, + { + name: "time-based schedule should fail", + schedule: Schedule{ + Type: ScheduleTypeTime, + StartAt: "2024-01-15T10:00:00Z", + FinishAt: "2024-01-15T18:00:00Z", + }, + from: time.Date(2024, 1, 15, 8, 0, 0, 0, time.UTC), + wantErr: true, + errMsg: "GetNextScheduleWindow is only supported for cron-based scheduling", + }, + { + name: "invalid cron expression", + schedule: Schedule{ + Type: ScheduleTypeCron, + CronExpression: "invalid cron", + Duration: "8h", + TimeZone: "Asia/Tokyo", + }, + from: time.Date(2024, 1, 15, 8, 0, 0, 0, time.UTC), + wantErr: true, + errMsg: "failed to parse cron expression", + }, + { + name: "invalid duration", + schedule: Schedule{ + Type: ScheduleTypeCron, + CronExpression: "0 9 * * *", + Duration: "invalid", + TimeZone: "Asia/Tokyo", + }, + from: time.Date(2024, 1, 15, 8, 0, 0, 0, time.UTC), + wantErr: true, + errMsg: "failed to parse duration", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + startTime, endTime, err := tt.schedule.GetNextScheduleWindow(tt.from) + if (err != nil) != tt.wantErr { + t.Errorf("Schedule.GetNextScheduleWindow() error = %v, wantErr %v", err, tt.wantErr) + return + } + + if tt.wantErr { + if tt.errMsg != "" && err != nil { + if !containsPattern(err.Error(), tt.errMsg) { + t.Errorf("Expected error containing '%s', got '%s'", tt.errMsg, err.Error()) + } + } + } else { + // Validate that we got valid times + if startTime.IsZero() || endTime.IsZero() { + t.Errorf("Expected valid start and end times, got start=%v, end=%v", startTime, endTime) + } + if !endTime.After(startTime) { + t.Errorf("End time should be 
after start time, got start=%v, end=%v", startTime, endTime) + } + if !startTime.After(tt.from) { + t.Errorf("Start time should be after 'from' time, got start=%v, from=%v", startTime, tt.from) + } + } + }) + } +} + +func TestSchedule_IsCurrentlyActive(t *testing.T) { + tests := []struct { + name string + schedule Schedule + now time.Time + wantActive bool + wantErr bool + }{ + { + name: "time-based - before start", + schedule: Schedule{ + Type: ScheduleTypeTime, + StartAt: "2024-01-15T10:00:00Z", + FinishAt: "2024-01-15T18:00:00Z", + }, + now: time.Date(2024, 1, 15, 9, 0, 0, 0, time.UTC), + wantActive: false, + wantErr: false, + }, + { + name: "time-based - during period", + schedule: Schedule{ + Type: ScheduleTypeTime, + StartAt: "2024-01-15T10:00:00Z", + FinishAt: "2024-01-15T18:00:00Z", + }, + now: time.Date(2024, 1, 15, 14, 0, 0, 0, time.UTC), + wantActive: true, + wantErr: false, + }, + { + name: "time-based - after end", + schedule: Schedule{ + Type: ScheduleTypeTime, + StartAt: "2024-01-15T10:00:00Z", + FinishAt: "2024-01-15T18:00:00Z", + }, + now: time.Date(2024, 1, 15, 19, 0, 0, 0, time.UTC), + wantActive: false, + wantErr: false, + }, + // Note: Cron-based tests are complex and would require mocking time + // In real implementation, you'd test with specific known schedules + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + isActive, _, err := tt.schedule.IsCurrentlyActive(tt.now) + if (err != nil) != tt.wantErr { + t.Errorf("Schedule.IsCurrentlyActive() error = %v, wantErr %v", err, tt.wantErr) + return + } + if isActive != tt.wantActive { + t.Errorf("Schedule.IsCurrentlyActive() = %v, want %v", isActive, tt.wantActive) + } + }) + } +} + +// Helper function to check if error message contains expected pattern +func containsPattern(text, pattern string) bool { + // For simplicity, just check if pattern is a substring + // In real implementation, you might want to use regex matching + return len(text) > 0 && (pattern == "" || + 
len(pattern) > 0 && len(text) >= len(pattern) && + findSubstring(text, pattern)) +} + +func findSubstring(text, substr string) bool { + if len(substr) == 0 { + return true + } + if len(text) < len(substr) { + return false + } + for i := 0; i <= len(text)-len(substr); i++ { + match := true + for j := 0; j < len(substr); j++ { + if text[i+j] != substr[j] { + match = false + break + } + } + if match { + return true + } + } + return false +} + +// TestScheduledScaling_AnnotationCleanup tests that ScheduledScaling annotations are properly cleaned up +func TestScheduledScaling_AnnotationCleanup(t *testing.T) { + // This test verifies the expected behavior of annotation cleanup + // The actual cleanup logic is in the controller, but we can test the annotation constants + + const annOriginal = "autoscaling.mercari.com/scheduledscaling-original-spec" + const annMinReplicas = "autoscaling.mercari.com/scheduledscaling-min-replicas" + + // Verify annotation names are correctly defined + if annOriginal == "" { + t.Error("annOriginal constant should not be empty") + } + + if annMinReplicas == "" { + t.Error("annMinReplicas constant should not be empty") + } + + // Verify annotation names follow the expected pattern + expectedPrefix := "autoscaling.mercari.com/scheduledscaling-" + if !strings.HasPrefix(annOriginal, expectedPrefix) { + t.Errorf("annOriginal should start with %s, got %s", expectedPrefix, annOriginal) + } + + if !strings.HasPrefix(annMinReplicas, expectedPrefix) { + t.Errorf("annMinReplicas should start with %s, got %s", expectedPrefix, annMinReplicas) + } +} + +// TestScheduledScaling_ValidateMinReplicasWarning tests the minReplicas validation warning logic +func TestScheduledScaling_ValidateMinReplicasWarning(t *testing.T) { + tests := []struct { + name string + requestedMinReplicas int32 + hpaRecommendation int32 + expectedWarning bool + expectedMessage string + }{ + { + name: "no warning when requested >= recommended", + requestedMinReplicas: 5, + 
hpaRecommendation: 3, + expectedWarning: false, + expectedMessage: "", + }, + { + name: "warning when requested < recommended", + requestedMinReplicas: 2, + hpaRecommendation: 5, + expectedWarning: true, + expectedMessage: "Requested minReplicas (2) is lower than HPA's current recommendation (5) for the workload. Using HPA recommendation (5) instead to prevent performance issues. Consider reviewing your scaling strategy.", + }, + { + name: "no warning when requested equals recommended", + requestedMinReplicas: 5, + hpaRecommendation: 5, + expectedWarning: false, + expectedMessage: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Test the expected warning message format + if tt.expectedWarning { + expectedMsg := fmt.Sprintf("Requested minReplicas (%d) is lower than HPA's current recommendation (%d) for the workload. Using HPA recommendation (%d) instead to prevent performance issues. Consider reviewing your scaling strategy.", + tt.requestedMinReplicas, tt.hpaRecommendation, tt.hpaRecommendation) + + if expectedMsg != tt.expectedMessage { + t.Errorf("expected warning message %q, got %q", tt.expectedMessage, expectedMsg) + } + } + }) + } +} diff --git a/api/v1alpha1/scheduledscaling_webhook.go b/api/v1alpha1/scheduledscaling_webhook.go new file mode 100644 index 00000000..0dbc0d5d --- /dev/null +++ b/api/v1alpha1/scheduledscaling_webhook.go @@ -0,0 +1,104 @@ +package v1alpha1 + +import ( + "fmt" + + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/util/validation/field" + ctrl "sigs.k8s.io/controller-runtime" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/webhook" + "sigs.k8s.io/controller-runtime/pkg/webhook/admission" +) + +// log is for logging in this package. 
+var scheduledscalinglog = logf.Log.WithName("scheduledscaling-resource") + +func (r *ScheduledScaling) SetupWebhookWithManager(mgr ctrl.Manager) error { + return ctrl.NewWebhookManagedBy(mgr). + For(r). + Complete() +} + +//+kubebuilder:webhook:path=/validate-autoscaling-mercari-com-v1alpha1-scheduledscaling,mutating=false,failurePolicy=fail,sideEffects=None,admissionReviewVersions=v1,groups=autoscaling.mercari.com,resources=scheduledscalings,verbs=create;update,versions=v1alpha1,name=vscheduledscaling.kb.io + +var _ webhook.Validator = &ScheduledScaling{} + +// ValidateCreate implements webhook.Validator so a webhook will be registered for the type +func (r *ScheduledScaling) ValidateCreate() (admission.Warnings, error) { + scheduledscalinglog.Info("validate create", "name", r.Name) + + if err := r.validateSpec(); err != nil { + return nil, err + } + + return nil, nil +} + +// ValidateUpdate implements webhook.Validator so a webhook will be registered for the type +func (r *ScheduledScaling) ValidateUpdate(old runtime.Object) (admission.Warnings, error) { + scheduledscalinglog.Info("validate update", "name", r.Name) + + if err := r.validateSpec(); err != nil { + return nil, err + } + + return nil, nil +} + +// ValidateDelete implements webhook.Validator so a webhook will be registered for the type +func (r *ScheduledScaling) ValidateDelete() (admission.Warnings, error) { + scheduledscalinglog.Info("validate delete", "name", r.Name) + + // No validation needed for delete + return nil, nil +} + +// validateSpec validates the ScheduledScaling spec +func (r *ScheduledScaling) validateSpec() error { + var allErrs field.ErrorList + + // Validate that at least one of minimumMinReplicas or minAllocatedResources is present + if err := r.validateAtLeastOneScalingParameter(); err != nil { + allErrs = append(allErrs, err) + } + + // Validate schedule + if err := r.Spec.Schedule.Validate(); err != nil { + allErrs = append(allErrs, field.Invalid( + field.NewPath("spec", 
"schedule"), + r.Spec.Schedule, + err.Error(), + )) + } + + if len(allErrs) == 0 { + return nil + } + + return fmt.Errorf("validation failed: %v", allErrs) +} + +// validateAtLeastOneScalingParameter ensures that at least one of minimumMinReplicas, minAllocatedResources, or containerMinAllocatedResources is present +func (r *ScheduledScaling) validateAtLeastOneScalingParameter() *field.Error { + static := r.Spec.Strategy.Static + + // Check if all scaling parameters are missing + if static.MinimumMinReplicas == nil && + static.MinAllocatedResources == nil && + len(static.ContainerMinAllocatedResources) == 0 { + return field.Invalid( + field.NewPath("spec", "strategy", "static"), + r.Spec.Strategy.Static, + "at least one of 'minimumMinReplicas', 'minAllocatedResources', or 'containerMinAllocatedResources' must be specified", + ) + } + + return nil +} + +// GetObjectKind implements runtime.Object +func (r *ScheduledScaling) GetObjectKind() schema.ObjectKind { + return &r.TypeMeta +} diff --git a/api/v1alpha1/scheduledscaling_webhook_test.go b/api/v1alpha1/scheduledscaling_webhook_test.go new file mode 100644 index 00000000..ec7de1fc --- /dev/null +++ b/api/v1alpha1/scheduledscaling_webhook_test.go @@ -0,0 +1,197 @@ +package v1alpha1 + +import ( + "testing" +) + +func TestScheduledScaling_ValidateAtLeastOneScalingParameter(t *testing.T) { + tests := []struct { + name string + ss *ScheduledScaling + wantErr bool + }{ + { + name: "both fields present - should pass", + ss: &ScheduledScaling{ + Spec: ScheduledScalingSpec{ + Strategy: Strategy{ + Static: StaticStrategy{ + MinimumMinReplicas: int32Ptr(5), + MinAllocatedResources: &ResourceRequirements{CPU: "500m", Memory: "1Gi"}, + }, + }, + }, + }, + wantErr: false, + }, + { + name: "all three scaling parameters present - should pass", + ss: &ScheduledScaling{ + Spec: ScheduledScalingSpec{ + Strategy: Strategy{ + Static: StaticStrategy{ + MinimumMinReplicas: int32Ptr(5), + MinAllocatedResources: &ResourceRequirements{CPU: 
"500m", Memory: "1Gi"}, + ContainerMinAllocatedResources: []ContainerResourceRequirements{ + { + ContainerName: "sidecar", + Resources: ResourceRequirements{CPU: "100m", Memory: "128Mi"}, + }, + }, + }, + }, + }, + }, + wantErr: false, + }, + { + name: "only minimumMinReplicas present - should pass", + ss: &ScheduledScaling{ + Spec: ScheduledScalingSpec{ + Strategy: Strategy{ + Static: StaticStrategy{ + MinimumMinReplicas: int32Ptr(5), + }, + }, + }, + }, + wantErr: false, + }, + { + name: "only minAllocatedResources present - should pass", + ss: &ScheduledScaling{ + Spec: ScheduledScalingSpec{ + Strategy: Strategy{ + Static: StaticStrategy{ + MinAllocatedResources: &ResourceRequirements{CPU: "500m", Memory: "1Gi"}, + }, + }, + }, + }, + wantErr: false, + }, + { + name: "only containerMinAllocatedResources present - should pass", + ss: &ScheduledScaling{ + Spec: ScheduledScalingSpec{ + Strategy: Strategy{ + Static: StaticStrategy{ + ContainerMinAllocatedResources: []ContainerResourceRequirements{ + { + ContainerName: "app", + Resources: ResourceRequirements{CPU: "500m", Memory: "1Gi"}, + }, + }, + }, + }, + }, + }, + wantErr: false, + }, + { + name: "both fields missing - should fail", + ss: &ScheduledScaling{ + Spec: ScheduledScalingSpec{ + Strategy: Strategy{ + Static: StaticStrategy{}, + }, + }, + }, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.ss.validateAtLeastOneScalingParameter() + if (err != nil) != tt.wantErr { + t.Errorf("ScheduledScaling.validateAtLeastOneScalingParameter() error = %v, wantErr %v", err, tt.wantErr) + } + }) + } +} + +func TestScheduledScaling_ValidateSpec(t *testing.T) { + tests := []struct { + name string + ss *ScheduledScaling + wantErr bool + }{ + { + name: "valid spec with both fields", + ss: &ScheduledScaling{ + Spec: ScheduledScalingSpec{ + Schedule: Schedule{ + Type: ScheduleTypeTime, + StartAt: "2024-01-15T10:00:00Z", + FinishAt: "2024-01-15T18:00:00Z", + }, + 
Strategy: Strategy{ + Static: StaticStrategy{ + MinimumMinReplicas: int32Ptr(5), + MinAllocatedResources: &ResourceRequirements{CPU: "500m", Memory: "1Gi"}, + }, + }, + TargetRefs: TargetRefs{ + TortoiseName: "test-tortoise", + }, + }, + }, + wantErr: false, + }, + { + name: "valid spec with only minimumMinReplicas", + ss: &ScheduledScaling{ + Spec: ScheduledScalingSpec{ + Schedule: Schedule{ + Type: ScheduleTypeTime, + StartAt: "2024-01-15T10:00:00Z", + FinishAt: "2024-01-15T18:00:00Z", + }, + Strategy: Strategy{ + Static: StaticStrategy{ + MinimumMinReplicas: int32Ptr(5), + }, + }, + TargetRefs: TargetRefs{ + TortoiseName: "test-tortoise", + }, + }, + }, + wantErr: false, + }, + { + name: "invalid spec - missing both scaling parameters", + ss: &ScheduledScaling{ + Spec: ScheduledScalingSpec{ + Schedule: Schedule{ + Type: ScheduleTypeTime, + StartAt: "2024-01-15T10:00:00Z", + FinishAt: "2024-01-15T18:00:00Z", + }, + Strategy: Strategy{ + Static: StaticStrategy{}, + }, + TargetRefs: TargetRefs{ + TortoiseName: "test-tortoise", + }, + }, + }, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.ss.validateSpec() + if (err != nil) != tt.wantErr { + t.Errorf("ScheduledScaling.validateSpec() error = %v, wantErr %v", err, tt.wantErr) + } + }) + } +} + +// Helper function to create int32 pointers +func int32Ptr(i int32) *int32 { + return &i +} diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go new file mode 100644 index 00000000..9cef15d0 --- /dev/null +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -0,0 +1,216 @@ +//go:build !ignore_autogenerated + +/* +MIT License + +Copyright (c) 2023 mercari + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, 
sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +// Code generated by controller-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + "k8s.io/apimachinery/pkg/runtime" +) + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ContainerResourceRequirements) DeepCopyInto(out *ContainerResourceRequirements) { + *out = *in + out.Resources = in.Resources +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ContainerResourceRequirements. +func (in *ContainerResourceRequirements) DeepCopy() *ContainerResourceRequirements { + if in == nil { + return nil + } + out := new(ContainerResourceRequirements) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ResourceRequirements) DeepCopyInto(out *ResourceRequirements) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResourceRequirements. 
+func (in *ResourceRequirements) DeepCopy() *ResourceRequirements { + if in == nil { + return nil + } + out := new(ResourceRequirements) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ScheduledScaling) DeepCopyInto(out *ScheduledScaling) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScheduledScaling. +func (in *ScheduledScaling) DeepCopy() *ScheduledScaling { + if in == nil { + return nil + } + out := new(ScheduledScaling) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ScheduledScaling) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ScheduledScalingList) DeepCopyInto(out *ScheduledScalingList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]ScheduledScaling, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScheduledScalingList. +func (in *ScheduledScalingList) DeepCopy() *ScheduledScalingList { + if in == nil { + return nil + } + out := new(ScheduledScalingList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 
+func (in *ScheduledScalingList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ScheduledScalingSpec) DeepCopyInto(out *ScheduledScalingSpec) { + *out = *in + out.Schedule = in.Schedule + out.TargetRefs = in.TargetRefs + in.Strategy.DeepCopyInto(&out.Strategy) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScheduledScalingSpec. +func (in *ScheduledScalingSpec) DeepCopy() *ScheduledScalingSpec { + if in == nil { + return nil + } + out := new(ScheduledScalingSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ScheduledScalingStatus) DeepCopyInto(out *ScheduledScalingStatus) { + *out = *in + in.LastTransitionTime.DeepCopyInto(&out.LastTransitionTime) + if in.CurrentWindowStart != nil { + in, out := &in.CurrentWindowStart, &out.CurrentWindowStart + *out = (*in).DeepCopy() + } + if in.CurrentWindowEnd != nil { + in, out := &in.CurrentWindowEnd, &out.CurrentWindowEnd + *out = (*in).DeepCopy() + } + if in.NextWindowStart != nil { + in, out := &in.NextWindowStart, &out.NextWindowStart + *out = (*in).DeepCopy() + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScheduledScalingStatus. +func (in *ScheduledScalingStatus) DeepCopy() *ScheduledScalingStatus { + if in == nil { + return nil + } + out := new(ScheduledScalingStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *StaticStrategy) DeepCopyInto(out *StaticStrategy) { + *out = *in + if in.MinimumMinReplicas != nil { + in, out := &in.MinimumMinReplicas, &out.MinimumMinReplicas + *out = new(int32) + **out = **in + } + if in.MinAllocatedResources != nil { + in, out := &in.MinAllocatedResources, &out.MinAllocatedResources + *out = new(ResourceRequirements) + **out = **in + } + if in.ContainerMinAllocatedResources != nil { + in, out := &in.ContainerMinAllocatedResources, &out.ContainerMinAllocatedResources + *out = make([]ContainerResourceRequirements, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new StaticStrategy. +func (in *StaticStrategy) DeepCopy() *StaticStrategy { + if in == nil { + return nil + } + out := new(StaticStrategy) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Strategy) DeepCopyInto(out *Strategy) { + *out = *in + in.Static.DeepCopyInto(&out.Static) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Strategy. 
+func (in *Strategy) DeepCopy() *Strategy { + if in == nil { + return nil + } + out := new(Strategy) + in.DeepCopyInto(out) + return out +} diff --git a/cmd/main.go b/cmd/main.go index 98e4d327..ac9b88e9 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -47,6 +47,7 @@ import ( autoscalingv2 "github.com/mercari/tortoise/api/autoscaling/v2" v1 "github.com/mercari/tortoise/api/core/v1" + autoscalingv1alpha1 "github.com/mercari/tortoise/api/v1alpha1" autoscalingv1beta3 "github.com/mercari/tortoise/api/v1beta3" "github.com/mercari/tortoise/internal/controller" "github.com/mercari/tortoise/pkg/config" @@ -70,6 +71,7 @@ var ( func init() { utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(autoscalingv1alpha1.AddToScheme(scheme)) utilruntime.Must(autoscalingv1beta3.AddToScheme(scheme)) //+kubebuilder:scaffold:scheme } @@ -189,10 +191,21 @@ func main() { setupLog.Error(err, "unable to create controller", "controller", "Tortoise") os.Exit(1) } + if err = (&controller.ScheduledScalingReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "ScheduledScaling") + os.Exit(1) + } if err = (&autoscalingv1beta3.Tortoise{}).SetupWebhookWithManager(mgr); err != nil { setupLog.Error(err, "unable to create webhook", "webhook", "Tortoise") os.Exit(1) } + if err = (&autoscalingv1alpha1.ScheduledScaling{}).SetupWebhookWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create webhook", "webhook", "ScheduledScaling") + os.Exit(1) + } //+kubebuilder:scaffold:builder hpaWebhook := autoscalingv2.New(tortoiseService, hpaService) diff --git a/config/crd/bases/autoscaling.mercari.com_scheduledscalings.yaml b/config/crd/bases/autoscaling.mercari.com_scheduledscalings.yaml new file mode 100644 index 00000000..d6199686 --- /dev/null +++ b/config/crd/bases/autoscaling.mercari.com_scheduledscalings.yaml @@ -0,0 +1,256 @@ +--- +apiVersion: 
apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.1 + name: scheduledscalings.autoscaling.mercari.com +spec: + group: autoscaling.mercari.com + names: + kind: ScheduledScaling + listKind: ScheduledScalingList + plural: scheduledscalings + singular: scheduledscaling + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .status.phase + name: Status + type: string + - jsonPath: .spec.schedule.type + name: Type + type: string + - jsonPath: .status.formattedStartTime + name: Start Time + type: string + - jsonPath: .status.formattedEndTime + name: End Time + type: string + - jsonPath: .status.formattedNextStartTime + name: Next Start + type: string + - jsonPath: .status.humanReadableSchedule + name: Schedule + type: string + - jsonPath: .spec.targetRefs.tortoiseName + name: Target Tortoise + type: string + - jsonPath: .status.message + name: Warnings + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: ScheduledScaling is the Schema for the scheduledscalings API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: ScheduledScalingSpec defines the desired state of ScheduledScaling + properties: + schedule: + description: Schedule defines when the scaling should occur + properties: + cronExpression: + description: |- + Cron-based scheduling fields (used when type="cron") + CronExpression defines when scaling periods should start using cron format + Format: "minute hour day month dayofweek" (e.g., "0 9 * * 1-5" for 9 AM weekdays) + type: string + duration: + description: |- + Duration specifies how long each scaling period should last + Format: Go duration (e.g., "8h", "30m", "1h30m") + type: string + finishAt: + description: |- + FinishAt specifies when the scaling should end and return to normal + Format: RFC3339 (e.g., "2024-01-15T18:00:00Z") + type: string + startAt: + description: |- + Time-based scheduling fields (used when type="time") + StartAt specifies when the scaling should begin + Format: RFC3339 (e.g., "2024-01-15T10:00:00Z") + type: string + timeZone: + default: Asia/Tokyo + description: |- + TimeZone specifies the timezone for cron-based scheduling + Format: IANA timezone (e.g., "Asia/Tokyo", "UTC", "America/New_York") + type: string + type: + description: 'Type specifies the scheduling type: "time" or "cron"' + enum: + - time + - cron + type: string + required: + - type + type: object + status: + default: Inactive + description: Status indicates the current state of the scheduled scaling + enum: + - Inactive + - Active + type: string + strategy: + description: Strategy defines how the scaling should be performed + properties: + static: + description: Static defines static scaling parameters + properties: + containerMinAllocatedResources: + description: |- + ContainerMinAllocatedResources sets container-specific minimum allocated resources + If specified, this takes precedence over 
MinAllocatedResources for specific containers + items: + description: ContainerResourceRequirements describes container-specific + resource requirements + properties: + containerName: + description: ContainerName specifies which container + these resources apply to + type: string + resources: + description: Resources specifies the resource requirements + for this container + properties: + cpu: + description: CPU specifies the CPU resource requirements + type: string + memory: + description: Memory specifies the memory resource + requirements + type: string + required: + - cpu + - memory + type: object + required: + - containerName + - resources + type: object + type: array + minAllocatedResources: + description: |- + MinAllocatedResources sets the minimum allocated resources during scaling + This can be either global (applied to all containers) or container-specific + properties: + cpu: + description: CPU specifies the CPU resource requirements + type: string + memory: + description: Memory specifies the memory resource requirements + type: string + required: + - cpu + - memory + type: object + minimumMinReplicas: + description: MinimumMinReplicas sets the minimum number of + replicas during scaling + format: int32 + minimum: 1 + type: integer + type: object + required: + - static + type: object + targetRefs: + description: TargetRefs specifies which resources this scheduled scaling + should affect + properties: + tortoiseName: + description: TortoiseName is the name of the Tortoise resource + to scale + type: string + required: + - tortoiseName + type: object + required: + - schedule + - strategy + - targetRefs + type: object + status: + description: ScheduledScalingStatus defines the observed state of ScheduledScaling + properties: + currentWindowEnd: + description: CurrentWindowEnd indicates the end of the current scaling + window (for cron-based scheduling) + format: date-time + type: string + currentWindowStart: + description: CurrentWindowStart indicates the 
start of the current + scaling window (for cron-based scheduling) + format: date-time + type: string + formattedEndTime: + description: FormattedEndTime provides a human-readable formatted + end time + type: string + formattedNextStartTime: + description: FormattedNextStartTime provides a human-readable formatted + next start time + type: string + formattedStartTime: + description: FormattedStartTime provides a human-readable formatted + start time + type: string + humanReadableSchedule: + description: HumanReadableSchedule provides a human-readable description + of the schedule + type: string + lastTransitionTime: + description: LastTransitionTime is the last time the status transitioned + from one phase to another + format: date-time + type: string + message: + description: Message provides additional information about the current + phase + type: string + nextWindowStart: + description: NextWindowStart indicates when the next scaling window + will start (for cron-based scheduling) + format: date-time + type: string + phase: + description: Phase indicates the current phase of the scheduled scaling + enum: + - Pending + - Active + - Completed + - Failed + type: string + reason: + description: Reason indicates why the scheduled scaling is in the + current phase + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index b01a62dc..413fc5f7 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -54,6 +54,7 @@ rules: - apiGroups: - autoscaling.mercari.com resources: + - scheduledscalings - tortoises verbs: - create @@ -66,12 +67,14 @@ rules: - apiGroups: - autoscaling.mercari.com resources: + - scheduledscalings/finalizers - tortoises/finalizers verbs: - update - apiGroups: - autoscaling.mercari.com resources: + - scheduledscalings/status - tortoises/status verbs: - get diff --git a/config/rbac/tortoise-scheduledscaling-role.yaml 
b/config/rbac/tortoise-scheduledscaling-role.yaml new file mode 100644 index 00000000..cae42ccd --- /dev/null +++ b/config/rbac/tortoise-scheduledscaling-role.yaml @@ -0,0 +1,11 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: tortoise-scheduledscaling-access +rules: +- apiGroups: ["autoscaling.mercari.com"] + resources: ["scheduledscalings", "scheduledscalings/status"] + verbs: ["get","list","watch","update","patch"] +- apiGroups: ["autoscaling.mercari.com"] + resources: ["tortoises", "tortoises/status"] + verbs: ["get","list","watch","update","patch"] diff --git a/config/rbac/tortoise-scheduledscaling-rolebinding.yaml b/config/rbac/tortoise-scheduledscaling-rolebinding.yaml new file mode 100644 index 00000000..6bf9a44d --- /dev/null +++ b/config/rbac/tortoise-scheduledscaling-rolebinding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: tortoise-scheduledscaling-access +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: tortoise-scheduledscaling-access +subjects: +- kind: ServiceAccount + name: tortoise-controller-manager + namespace: mercari-tortoise-lab diff --git a/config/webhook/manifests.yaml b/config/webhook/manifests.yaml index 0b8a979c..c5512602 100644 --- a/config/webhook/manifests.yaml +++ b/config/webhook/manifests.yaml @@ -109,3 +109,23 @@ webhooks: resources: - horizontalpodautoscalers sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: webhook-service + namespace: system + path: /validate-autoscaling-mercari-com-v1alpha1-scheduledscaling + failurePolicy: Fail + name: vscheduledscaling.kb.io + rules: + - apiGroups: + - autoscaling.mercari.com + apiVersions: + - v1alpha1 + operations: + - CREATE + - UPDATE + resources: + - scheduledscalings + sideEffects: None diff --git a/docs/advanced-hpa-configuration.md b/docs/advanced-hpa-configuration.md new file mode 100644 index 00000000..d05b2bc0 --- 
/dev/null +++ b/docs/advanced-hpa-configuration.md @@ -0,0 +1,291 @@ +--- +title: Custom HPA Scaling Limits with Tortoise +status: published +owner: kouzoh-platform-infra-jp +lastReviewed: 1970-01-01 +type: guide +tags: [tortoise, hpa, scaling, custom] +--- + +# Custom HPA Scaling Limits with Tortoise + +> Customize Tortoise HPA behavior based on your service requirements + +This guide covers the advanced ServiceGroups feature in Tortoise, which allows platform administrators to configure different HPA scaling limits for different categories of services. This feature is designed for teams who need fine-grained control over their autoscaling behavior based on service characteristics. + +## Overview + +By default, Tortoise applies the same maximum replica limits to all services through the global `MaximumMaxReplicas` configuration, which is currently set to **100 replicas** in Citadel production. However, different types of services have different scaling requirements: + +- **High-traffic services** may need to scale beyond 100 replicas during peak traffic +- **Cost-conscious services** might want lower limits to control costs +- **Critical services** might need different scaling behavior than experimental ones + +The ServiceGroups feature allows you to request custom HPA scaling limits for your services based on your specific requirements. + +## How It Works + +ServiceGroups work through a configuration request process: + +1. **Request Custom Limits**: Contact the platform team to request specific `maxReplicas` limits for your services +2. **Configuration**: Platform team configures your namespace and labels with the requested limits +3. **Automatic Application**: Tortoise automatically applies the custom limits to services matching your configuration + +If your service doesn't match any custom ServiceGroup configuration, it uses the default limit of **100 replicas**. 
+ +## When to Request Custom Configuration + +Consider requesting custom ServiceGroup configuration if your service falls into any of these categories or you need different limits than the default 100 replicas: + +### Request Higher Limits (>100 replicas) if: +- Your service handles high user-facing traffic with significant spikes +- You need to scale beyond 100 replicas during peak periods +- Performance and availability are more critical than cost optimization +- You have validated that your service can effectively utilize 100+ replicas + +### Request Lower Limits (<100 replicas) if: +- Cost control is a primary concern for your service +- Your service has predictable, steady traffic patterns +- Your service is batch-oriented or background processing +- You want to prevent accidental over-scaling and control resource costs + +### Request Custom Limits for: +- Services with specific business requirements (e.g., 150, 200, or 50 replicas) +- Services that need limits based on downstream dependencies +- Services with regulatory or compliance constraints + +## How to Request Custom Configuration + +### Step 1: Contact Platform Team + +To request custom scaling limits for your services, contact **#team-kouzoh-platform-infra-jp** team with the following information: + +1. **Service Details**: Name and purpose of your service +2. **Namespace**: Which namespace your service is deployed in +3. **Labels**: Any specific labels your service uses (optional) +4. **Requested Limit**: The `maxReplicas` value you need (e.g., 150, 200, 50) +5. **Justification**: Business case for the custom limit + +### Step 2: Configuration Applied + +Once approved, the platform team will configure your namespace and labels with the custom limit. + +## Configuration Examples + +### Example 1: Requesting Higher Limits for High-Traffic Service + +**Your Request to #team-kouzoh-platform-infra-jp**: +``` +Hi team! 
I need custom scaling limits for our API gateway service: +- Service: api-gateway +- Namespace: production-api +- Current Issue: We're hitting the 100 replica limit during peak traffic +- Requested maxReplicas: 150 +- Justification: Service handles user-facing traffic with 3x traffic spikes during events +``` + +**Your Service Configuration** (after platform team configures the namespace): +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: api-gateway + namespace: production-api # Platform team configured this namespace for 150 replicas + labels: + app: api-gateway + tier: frontend +spec: + # ... your deployment spec +``` + +```yaml +# Your Tortoise configuration remains the same +apiVersion: autoscaling.mercari.com/v1beta3 +kind: Tortoise +metadata: + name: api-gateway-tortoise + namespace: production-api +spec: + updateMode: Auto + targetRefs: + scaleTargetRef: + kind: Deployment + name: api-gateway +``` + +**Result**: This Tortoise can now scale the HPA up to 150 replicas maximum. + +### Example 2: Requesting Lower Limits for Cost Control + +**Your Request to #team-kouzoh-platform-infra-jp**: +``` +Hi team! I need custom scaling limits for our batch processing services: +- Services: All batch processors +- Namespace: batch-services +- Labels: tier=worker (optional, for more granular control) +- Requested maxReplicas: 50 +- Justification: Cost optimization - these services have steady load and don't need >50 replicas +``` + +**Your Service Configuration** (after platform team configures the namespace): +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: batch-processor + namespace: batch-services # Platform team configured this namespace for 50 replicas + labels: + app: batch-processor + tier: worker +spec: + # ... your deployment spec +``` + +**Result**: Services in this namespace are limited to 50 replicas maximum, helping control costs. 
+ +### Example 3: Default Behavior (No Custom Configuration) + +Services that don't have custom configuration will use the default limit: + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: experimental-service + namespace: development # No custom configuration for this namespace + labels: + app: experimental-service +spec: + # ... your deployment spec +``` + +**Result**: This Tortoise will use the default limit of **100 replicas**. + +## Understanding Service Group Matching + +Once the platform team configures your custom limits, the matching works as follows: + +### Namespace-Based Matching +Services are primarily matched based on their namespace. For example, if you requested: +- `production-api` namespace → 150 replicas limit +- `batch-services` namespace → 50 replicas limit + +### Label-Based Matching (Advanced) +For more granular control, the platform team can also configure label selectors: + +```yaml +# Example: Services with specific labels get custom limits +metadata: + namespace: my-services + labels: + tier: worker # Only services with this label get the custom limit + cost-tier: optimized # Additional label selector for more precision +``` + +This allows different services within the same namespace to have different limits if needed. + +## Monitoring Your Service Group Assignment + +### Check Your Tortoise Status +You can verify which limits are being applied to your service by checking your Tortoise status: + +```bash +kubectl describe tortoise <tortoise-name> -n <namespace> +``` + +Look for events or status information that indicates the maximum replica limits being applied.
+ +### Metrics and Observability +Monitor these metrics to understand your service's scaling behavior: + +- `mercari.tortoise.proposed_hpa_maxreplicas` - The maximum replicas Tortoise recommends +- `mercari.tortoise.current_hpa_maxreplicas` - The current HPA maximum replicas setting + +You can also use the Tortoise dashboard to visualize how service groups affect your scaling: +https://app.datadoghq.com/dashboard/h4s-vry-dyv/tortoise-user-dashboard + +## Best Practices + +### Before Requesting Custom Limits + +**Analyze Your Current Usage:** +1. **Monitor First**: Use Off mode Tortoise to see what limits would be recommended +2. **Check Current Peaks**: Review your service's historical scaling patterns +3. **Validate Need**: Ensure your service can actually benefit from custom limits +4. **Cost Impact**: Consider the cost implications of higher limits + +**Prepare Your Request:** +1. **Gather Data**: Collect metrics showing why you need custom limits +2. **Business Justification**: Explain the business impact of the current limits +3. **Specific Requirements**: Request exact numbers rather than ranges +4. **Timeline**: Mention if this is urgent or can be planned + +### After Custom Configuration + +**Gradual Adoption:** +1. **Test in Non-Production**: Start with development or staging environments +2. **Monitor Closely**: Watch for unexpected scaling behavior +3. **Cost Monitoring**: Keep track of resource costs after configuration changes +4. **Validate Effectiveness**: Ensure the custom limits solve your original problem + +## Troubleshooting + +### My Service Isn't Using the Expected Custom Limits + +1. **Verify Configuration**: Check if the platform team has confirmed your configuration is applied +2. **Check Request Status**: Follow up with #team-kouzoh-platform-infra-jp if your request is still pending +3. **Namespace/Labels**: Verify your service is deployed in the configured namespace with correct labels +4. 
**Tortoise Restart**: Sometimes restarting the Tortoise controller helps pick up new configuration + +### My Request Was Rejected or Modified + +If the platform team suggests different limits than requested: + +1. **Understand Reasoning**: Ask for explanation of the decision +2. **Provide Additional Context**: Share more data about your scaling needs +3. **Compromise**: Consider accepting modified limits and requesting adjustments later based on real usage + +## Common Questions + +### Can I Override Custom Limits? + +Custom limits are enforced at the platform level and cannot be overridden by individual Tortoise configurations. This ensures consistent resource management across the platform. + +### How Do I Know What Custom Limits Are Applied to My Service? + +1. **Check with Platform Team**: Ask #team-kouzoh-platform-infra-jp about your namespace configuration +2. **Monitor Tortoise Behavior**: Observe the maximum replicas Tortoise applies to your HPA +3. **Review Tortoise Events**: Check kubectl events for information about applied limits + +### What Happens If I Hit My Custom Limits During Traffic Spikes? + +Services will scale up to their configured custom limit. If you consistently hit your limits and need more capacity: + +1. **Gather Evidence**: Collect metrics showing the need for higher limits +2. **Request Increase**: Contact #team-kouzoh-platform-infra-jp with data supporting the need for higher limits +3. **Temporary Relief**: Discuss emergency procedures for critical traffic events + +### How Long Does It Take to Get Custom Configuration? + +- **Simple Requests**: Usually processed within 1-2 business days +- **Complex Requests**: May take longer if they require additional review or testing +- **Urgent Requests**: Contact #team-kouzoh-platform-infra-jp directly for expedited processing + +### Can I Request Different Limits for Different Services in the Same Namespace? + +Yes! 
The platform team can configure label-based selectors to give different services in the same namespace different limits. Include this requirement in your request. + +## Getting Help + +If you have questions about custom scaling limits or need assistance with configuration: + +- **Slack**: #team-kouzoh-platform-infra-jp +- **For Custom Limit Requests**: Contact #team-kouzoh-platform-infra-jp with your requirements +- **For Configuration Issues**: Reach out with your specific service details and observed behavior +- **For Urgent Issues**: Tag the platform team directly in #team-kouzoh-platform-infra-jp + +## Related Documentation + +- [Tortoise Main Guide](./tortoise.md) - Basic Tortoise usage and configuration +- [Kubernetes Kit: Scaling](../guides/k8s-kit/scaling.md) - Integration with Kubernetes Kit +- [HPA Configuration](../guides/how-to-create-hpa.md) - Manual HPA configuration (if needed) \ No newline at end of file diff --git a/docs/scheduled-scaling.md b/docs/scheduled-scaling.md new file mode 100644 index 00000000..d50bad60 --- /dev/null +++ b/docs/scheduled-scaling.md @@ -0,0 +1,341 @@ +# Scheduled Scaling Feature + +## Overview + +The Scheduled Scaling feature allows users to proactively scale their applications before anticipated load increases occur. This is particularly useful for predictable traffic patterns such as: + +- **TV commercials or marketing campaigns** that drive traffic spikes +- **Push notifications** that trigger user engagement +- **Scheduled events** (e.g., product launches, sales) +- **Load testing** in development environments +- **Seasonal traffic patterns** (e.g., holiday shopping, tax season) + +Instead of waiting for the load to increase and then scaling reactively, Scheduled Scaling enables proactive scaling based on user-defined schedules. + +## Schedule Types + +### Time-Based Scheduling +Specify exact start and end times for one-time scaling events. 
+ +### Cron-Based Scheduling +Use cron expressions for recurring scaling patterns with configurable duration and timezone support. + +## Architecture + +### High-Level Flow + +``` +ScheduledScaling Resource → ScheduledScaling Controller → Tortoise Resource → Tortoise Controller → HPA → Pod Scaling +``` + +### Component Responsibilities + +1. **ScheduledScaling Controller**: Manages time-based scheduling logic and applies scaling configurations to Tortoise resources +2. **Tortoise Controller**: Translates Tortoise policies into HPA configurations (existing functionality) +3. **HPA Controller**: Performs actual pod scaling based on HPA configuration (Kubernetes built-in) + +### Design Principles + +- **Separation of Concerns**: Each controller has a single responsibility +- **API-First Design**: Controllers communicate only through Kubernetes APIs +- **Limited Blast Radius**: Issues in scheduled scaling don't affect core Tortoise functionality +- **Testability**: Each component can be tested independently + +## API Reference + +### Time-Based ScheduledScaling + +```yaml +apiVersion: autoscaling.mercari.com/v1alpha1 +kind: ScheduledScaling +metadata: + name: example-time-scaling + namespace: default +spec: + schedule: + type: time + startAt: "2024-01-15T10:00:00Z" + finishAt: "2024-01-15T18:00:00Z" + targetRefs: + tortoiseName: "my-tortoise" + strategy: + static: + minimumMinReplicas: 15 + minAllocatedResources: + cpu: "500m" + memory: "1Gi" + containerMinAllocatedResources: + - containerName: "app" + resources: + cpu: "800m" + memory: "2Gi" +``` + +### Cron-Based ScheduledScaling + +```yaml +apiVersion: autoscaling.mercari.com/v1alpha1 +kind: ScheduledScaling +metadata: + name: example-cron-scaling + namespace: default +spec: + schedule: + type: cron + cronExpression: "0 9 * * 1-5" # 9 AM on weekdays + duration: "8h" # Active for 8 hours + timeZone: "Asia/Tokyo" # Timezone support + targetRefs: + tortoiseName: "my-tortoise" + strategy: + static: + minimumMinReplicas: 
20 + minAllocatedResources: + cpu: "1000m" + memory: "2Gi" +``` + +### Field Descriptions + +#### Schedule +- **type**: `time` or `cron` - Type of scheduling +- **startAt**: (time only) ISO 8601 timestamp when scaling should begin +- **finishAt**: (time only) ISO 8601 timestamp when scaling should end +- **cronExpression**: (cron only) Standard cron expression (e.g., `"0 9 * * 1-5"`) +- **duration**: (cron only) How long each scaling window lasts (e.g., `"8h"`, `"30m"`) +- **timeZone**: (cron only) Timezone for cron scheduling (default: `"Asia/Tokyo"`) + +#### TargetRefs +- **tortoiseName**: Name of the Tortoise resource to apply scheduled scaling to + +#### Strategy +- **static**: Static scaling configuration + - **minimumMinReplicas**: Minimum number of replicas during the scheduled period + - **minAllocatedResources**: Global minimum resource allocation per container + - **cpu**: CPU request (e.g., `"500m"`) + - **memory**: Memory request (e.g., `"1Gi"`) + - **containerMinAllocatedResources**: Container-specific resource overrides + - **containerName**: Name of the container to override + - **resources**: Resource specifications for this container + +### Status Fields + +```yaml +status: + phase: "Active" # Pending, Active, Completed, Failed + lastTransitionTime: "2024-01-15T10:00:00Z" + reason: "Active" + message: "Scheduled scaling is currently active" + humanReadableSchedule: "Every day at 9:00 AM for 8 hours" + formattedStartTime: "Today at 9:00 AM" + formattedEndTime: "Today at 5:00 PM" + formattedNextStartTime: "Tomorrow at 9:00 AM" +``` + +## Usage Examples + +### Cron-Based Recurring Scaling + +```yaml +apiVersion: autoscaling.mercari.com/v1alpha1 +kind: ScheduledScaling +metadata: + name: business-hours-scaling + namespace: production +spec: + schedule: + type: cron + cronExpression: "0 9 * * 1-5" # 9 AM on weekdays + duration: "8h" # Active for 8 hours (9 AM - 5 PM) + timeZone: "Asia/Tokyo" + targetRefs: + tortoiseName: "my-tortoise" + strategy: + 
static: + minimumMinReplicas: 20 + minAllocatedResources: + cpu: "1000m" + memory: "2Gi" +``` + +### One-Time Event Scaling + +```yaml +apiVersion: autoscaling.mercari.com/v1alpha1 +kind: ScheduledScaling +metadata: + name: tv-commercial-scaling + namespace: marketing +spec: + schedule: + type: time + startAt: "2024-01-20T19:55:00Z" # 5 minutes before commercial + finishAt: "2024-01-20T20:05:00Z" # 5 minutes after commercial + targetRefs: + tortoiseName: "marketing-tortoise" + strategy: + static: + minimumMinReplicas: 15 + minAllocatedResources: + cpu: "800m" + memory: "1.5Gi" +``` + +### Container-Specific Scaling + +```yaml +apiVersion: autoscaling.mercari.com/v1alpha1 +kind: ScheduledScaling +metadata: + name: mixed-container-scaling + namespace: production +spec: + schedule: + type: cron + cronExpression: "*/10 * * * *" # Every 10 minutes (for testing) + duration: "5m" + timeZone: "Asia/Tokyo" + targetRefs: + tortoiseName: "my-tortoise" + strategy: + static: + minimumMinReplicas: 10 + minAllocatedResources: # Global defaults + cpu: "500m" + memory: "1Gi" + containerMinAllocatedResources: # Container-specific overrides + - containerName: "web-server" + resources: + cpu: "1000m" + memory: "2Gi" + - containerName: "sidecar" + resources: + cpu: "200m" + memory: "512Mi" +``` + +## How It Works + +### 1. Pending Phase +- Controller calculates time until scaling should begin +- Sets status to `Pending` with next window information +- Requeues reconciliation at the start time + +### 2. Active Phase +- Controller applies scheduled scaling configuration to the target Tortoise: + - Sets resource requests (CPU/Memory) on containers + - Sets HPA minReplicas through Tortoise annotation + - Stores original configuration for restoration +- Sets status to `Active` +- Requeues reconciliation at the finish time + +### 3. 
Restoration Phase +- Controller restores original Tortoise configuration +- Removes ScheduledScaling annotations +- HPA minReplicas returns to original value +- Sets status to `Pending` (for cron) or `Completed` (for time-based) + +### 4. Emergency Mode Protection +- Extended grace period (10 minutes vs 5 minutes) during ScheduledScaling +- Emergency mode respects ScheduledScaling minReplicas when both are active +- Prevents false emergency triggers during rapid scaling + +### 5. Error Handling +- Robust conflict resolution with retry logic +- Finalizer-based cleanup ensures proper restoration on deletion +- Comprehensive error logging and status updates + +## Key Features + +### Resource Scaling +- **Global Resource Requests**: Apply CPU/memory minimums to all containers +- **Container-Specific Overrides**: Set different resources per container +- **HPA MinReplicas**: Dynamically adjust minimum replica count +- **Consistent Scaling**: Ensures maxReplicas ≥ minReplicas + +### Schedule Management +- **Time-Based**: One-time scaling events with start/end times +- **Cron-Based**: Recurring scaling with cron expressions +- **Timezone Support**: Configure schedules in local timezones +- **Human-Readable Status**: Clear schedule descriptions in status + +### Emergency Mode Integration +- **Extended Grace Period**: 10-minute grace period during ScheduledScaling (vs 5 minutes normally) +- **Metric Failure Protection**: Handles all types of HPA metric failures +- **Smart Emergency Override**: Emergency mode respects ScheduledScaling minReplicas +- **Empty Metrics Handling**: Prevents false emergencies during rapid scaling + +### Reliability Features +- **Finalizer-Based Cleanup**: Ensures proper restoration even if ScheduledScaling is deleted +- **Conflict Resolution**: Automatic retry with exponential backoff +- **Original State Preservation**: Stores and restores complete Tortoise configuration +- **Annotation-Based Communication**: Clean integration with existing 
Tortoise controller + +## Testing + +### Quick Test + +Use the provided test script: +```bash +# Test cron-based scheduling (default) +ASSERT=1 ./scripts/test-scheduled-scaling.sh + +# Test time-based scheduling +SCHEDULE_TYPE=time ASSERT=1 ./scripts/test-scheduled-scaling.sh +``` + +### Manual Testing + +```bash +# Check ScheduledScaling status +kubectl get scheduledscalings -A + +# Check Tortoise and HPA changes +kubectl get tortoise -o yaml +kubectl get hpa -o yaml + +# Monitor controller logs +kubectl logs -n mercari-tortoise-lab deployment/tortoise-controller-manager -f +``` + +## Troubleshooting + +### Common Issues + +1. **Status Not Updating**: Ensure ScheduledScaling controller is registered in `main.go` +2. **Emergency Mode Override**: Check HPA conditions for metric failures; ScheduledScaling extends grace period automatically +3. **Resource Not Restored**: Finalizer ensures cleanup even if ScheduledScaling is force-deleted +4. **HPA MinReplicas Not Applied**: Verify Tortoise has HPA reference and annotation is set +5. 
**Conflict Errors**: Controller automatically retries with latest resource version + +### Controller Logs + +```bash +# Check ScheduledScaling controller logs +kubectl logs -n mercari-tortoise-lab deployment/tortoise-controller-manager -f | grep scheduledscaling + +# Check Tortoise controller logs for HPA updates +kubectl logs -n mercari-tortoise-lab deployment/tortoise-controller-manager -f | grep -i hpa +``` + +### Debugging Commands + +```bash +# Check ScheduledScaling status +kubectl get scheduledscalings -A +kubectl describe scheduledscaling <scheduledscaling-name> -n <namespace> + +# Check Tortoise annotations +kubectl get tortoise -o yaml | grep -A5 -B5 scheduledscaling + +# Check HPA current state +kubectl get hpa -o yaml +``` + +## Notes + +- **Timezone Default**: All schedules default to `Asia/Tokyo` timezone +- **Grace Period**: Extended to 10 minutes during ScheduledScaling to prevent false emergency mode triggers +- **Annotation-Based**: Uses Tortoise annotations for clean integration with existing controller +- **Conflict Resolution**: Automatic retry logic handles concurrent updates +- **Emergency Mode**: Respects ScheduledScaling minReplicas even when in emergency mode diff --git a/go.mod b/go.mod index 72c6b751..0c4abcc1 100644 --- a/go.mod +++ b/go.mod @@ -25,6 +25,7 @@ require ( require ( github.com/kyokomi/emoji/v2 v2.2.12 + github.com/robfig/cron/v3 v3.0.1 github.com/spf13/cobra v1.8.1 ) diff --git a/go.sum b/go.sum index 40b5f63a..cc13040a 100644 --- a/go.sum +++ b/go.sum @@ -93,6 +93,8 @@ github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8= github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= +github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs= +github.com/robfig/cron/v3 v3.0.1/go.mod
h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro= github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= diff --git a/internal/controller/scheduledscaling_controller.go b/internal/controller/scheduledscaling_controller.go new file mode 100644 index 00000000..75b15231 --- /dev/null +++ b/internal/controller/scheduledscaling_controller.go @@ -0,0 +1,868 @@ +/* +Copyright 2024 The Tortoise Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package controller + +import ( + "context" + "encoding/json" + "fmt" + "time" + + v1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/log" + + autoscalingv1alpha1 "github.com/mercari/tortoise/api/v1alpha1" + autoscalingv1beta3 "github.com/mercari/tortoise/api/v1beta3" +) + +const ( + scheduledScalingFinalizer = "scheduledscaling.autoscaling.mercari.com/finalizer" +) + +// ScheduledScalingReconciler reconciles a ScheduledScaling object +type ScheduledScalingReconciler struct { + client.Client + Scheme *runtime.Scheme +} + +//+kubebuilder:rbac:groups=autoscaling.mercari.com,resources=scheduledscalings,verbs=get;list;watch;create;update;patch;delete +//+kubebuilder:rbac:groups=autoscaling.mercari.com,resources=scheduledscalings/status,verbs=get;update;patch +//+kubebuilder:rbac:groups=autoscaling.mercari.com,resources=scheduledscalings/finalizers,verbs=update +//+kubebuilder:rbac:groups=autoscaling.mercari.com,resources=tortoises,verbs=get;list;watch;update;patch +//+kubebuilder:rbac:groups=autoscaling.mercari.com,resources=tortoises/status,verbs=get;update;patch + +// Reconcile is part of the main kubernetes reconciliation loop which aims to +// move the current state of the cluster closer to the desired state. 
+func (r *ScheduledScalingReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + logger := log.FromContext(ctx) + + // Fetch the ScheduledScaling instance + scheduledScaling := &autoscalingv1alpha1.ScheduledScaling{} + if err := r.Get(ctx, req.NamespacedName, scheduledScaling); err != nil { + // Handle the case where the resource is not found + return ctrl.Result{}, client.IgnoreNotFound(err) + } + + logger.Info("Reconciling ScheduledScaling", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace, "type", scheduledScaling.Spec.Schedule.Type) + + // Handle finalizer logic + if err := r.handleFinalizer(ctx, scheduledScaling); err != nil { + return ctrl.Result{}, err + } + + // If the object is being deleted, don't continue with reconciliation + if scheduledScaling.DeletionTimestamp != nil { + logger.Info("ScheduledScaling is being deleted, skipping further reconciliation", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + return ctrl.Result{}, nil + } + + // Validate the schedule configuration + if err := scheduledScaling.Spec.Schedule.Validate(); err != nil { + logger.Error(err, "Invalid schedule configuration") + return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseFailed, "InvalidSchedule", err.Error()) + } + + // Handle scheduling based on type + switch scheduledScaling.Spec.Schedule.Type { + case autoscalingv1alpha1.ScheduleTypeTime: + return r.handleTimeBasedScheduling(ctx, scheduledScaling) + case autoscalingv1alpha1.ScheduleTypeCron: + return r.handleCronBasedScheduling(ctx, scheduledScaling) + default: + err := fmt.Errorf("unsupported schedule type: %s", scheduledScaling.Spec.Schedule.Type) + logger.Error(err, "Invalid schedule type") + return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseFailed, "InvalidScheduleType", err.Error()) + } +} + +// handleTimeBasedScheduling handles scheduling 
based on specific start/end times +func (r *ScheduledScalingReconciler) handleTimeBasedScheduling(ctx context.Context, scheduledScaling *autoscalingv1alpha1.ScheduledScaling) (ctrl.Result, error) { + logger := log.FromContext(ctx) + + // Parse the schedule times (already validated by Validate()) + startTime, _ := time.Parse(time.RFC3339, scheduledScaling.Spec.Schedule.StartAt) + finishTime, _ := time.Parse(time.RFC3339, scheduledScaling.Spec.Schedule.FinishAt) + + now := time.Now() + var newPhase autoscalingv1alpha1.ScheduledScalingPhase + var reason, message string + + // Determine the current phase based on time + if now.Before(startTime) { + // Calculate time until start + timeUntilStart := startTime.Sub(now) + // Set status to Pending if not already set + if scheduledScaling.Status.Phase != autoscalingv1alpha1.ScheduledScalingPhasePending { + return ctrl.Result{RequeueAfter: timeUntilStart}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhasePending, "Waiting", fmt.Sprintf("Waiting for scheduled scaling to begin at %s", startTime.Format(time.RFC3339))) + } + return ctrl.Result{RequeueAfter: timeUntilStart}, nil + } else if now.After(finishTime) { + // Apply normal scaling (restore original settings) + if err := r.applyNormalScaling(ctx, scheduledScaling); err != nil { + logger.Error(err, "Failed to apply normal scaling") + return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseFailed, "RestoreFailed", err.Error()) + } + newPhase = autoscalingv1alpha1.ScheduledScalingPhaseCompleted + reason = "Completed" + message = "Scheduled scaling period has ended" + } else { + // Currently in the scaling period + // Apply scheduled scaling + if err := r.applyScheduledScaling(ctx, scheduledScaling); err != nil { + logger.Error(err, "Failed to apply scheduled scaling") + return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseFailed, "ScalingFailed", 
err.Error()) + } + + // Set status to Active if not already set + if scheduledScaling.Status.Phase != autoscalingv1alpha1.ScheduledScalingPhaseActive { + // Update time-based status with formatted times + if err := r.updateTimeBasedStatus(ctx, scheduledScaling, startTime, finishTime); err != nil { + logger.Error(err, "Failed to update time-based status") + } + return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseActive, "Active", fmt.Sprintf("Scheduled scaling is active until %s", finishTime.Format(time.RFC3339))) + } + + // Calculate time until finish + timeUntilFinish := finishTime.Sub(now) + return ctrl.Result{RequeueAfter: timeUntilFinish}, nil + } + + // Update status if phase changed (this handles Completed phase) + if scheduledScaling.Status.Phase != newPhase { + return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, newPhase, reason, message) + } + + return ctrl.Result{}, nil +} + +// handleCronBasedScheduling handles scheduling based on cron expressions +func (r *ScheduledScalingReconciler) handleCronBasedScheduling(ctx context.Context, scheduledScaling *autoscalingv1alpha1.ScheduledScaling) (ctrl.Result, error) { + logger := log.FromContext(ctx) + now := time.Now() + + // Check if we're currently in an active scaling window + isActive, endTime, err := scheduledScaling.Spec.Schedule.IsCurrentlyActive(now) + if err != nil { + logger.Error(err, "Failed to check if schedule is currently active") + return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseFailed, "ScheduleCheckFailed", err.Error()) + } + + if isActive { + // We're in an active scaling period + // Calculate current window start time + duration, _ := time.ParseDuration(scheduledScaling.Spec.Schedule.Duration) + currentStart := endTime.Add(-duration) + + // Apply scheduled scaling + if err := r.applyScheduledScaling(ctx, scheduledScaling); err != nil { + logger.Error(err, "Failed to apply scheduled 
scaling")
+			return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseFailed, "ScalingFailed", err.Error())
+		}
+
+		// Set status to Active if not already set
+		if scheduledScaling.Status.Phase != autoscalingv1alpha1.ScheduledScalingPhaseActive {
+			// Update cron status with current window information
+			if err := r.updateCronStatus(ctx, scheduledScaling, &currentStart, &endTime, nil); err != nil {
+				logger.Error(err, "Failed to update cron status")
+			}
+			return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseActive, "Active", fmt.Sprintf("Cron-based scheduled scaling is active until %s", endTime.Format(time.RFC3339)))
+		}
+
+		// Calculate time until this window ends
+		timeUntilEnd := endTime.Sub(now)
+		// Add a small buffer to ensure we catch the transition
+		requeueAfter := timeUntilEnd + time.Minute
+		if requeueAfter < time.Minute {
+			requeueAfter = time.Minute
+		}
+		return ctrl.Result{RequeueAfter: requeueAfter}, nil
+	} else {
+		// We're not in an active scaling period
+		// Apply normal scaling (restore original settings) if we were previously active
+		if scheduledScaling.Status.Phase == autoscalingv1alpha1.ScheduledScalingPhaseActive {
+			if err := r.applyNormalScaling(ctx, scheduledScaling); err != nil {
+				logger.Error(err, "Failed to apply normal scaling")
+				return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseFailed, "RestoreFailed", err.Error())
+			}
+		}
+
+		// Find the next scheduled window
+		nextStart, nextEnd, err := scheduledScaling.Spec.Schedule.GetNextScheduleWindow(now)
+		if err != nil {
+			logger.Error(err, "Failed to calculate next schedule window")
+			return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhaseFailed, "NextScheduleFailed", err.Error())
+		}
+
+		// Set status to Pending if not already set
+		if scheduledScaling.Status.Phase != 
autoscalingv1alpha1.ScheduledScalingPhasePending { + // Get the timezone for formatting (defaults to Asia/Tokyo) + loc := scheduledScaling.GetTimezone() + message := fmt.Sprintf("Next scheduled scaling window: %s to %s (cron: %s)", + nextStart.In(loc).Format("2006-01-02T15:04:05"), + nextEnd.In(loc).Format("2006-01-02T15:04:05"), + scheduledScaling.Spec.Schedule.CronExpression) + + // Update cron status with next window information + if err := r.updateCronStatus(ctx, scheduledScaling, nil, nil, &nextStart); err != nil { + logger.Error(err, "Failed to update cron status") + } + + return ctrl.Result{}, r.updateStatus(ctx, scheduledScaling, autoscalingv1alpha1.ScheduledScalingPhasePending, "WaitingForNext", message) + } + + // Calculate time until next start + timeUntilNext := nextStart.Sub(now) + // Add a small buffer but ensure reasonable polling interval + requeueAfter := timeUntilNext + if requeueAfter > 10*time.Minute { + // For far future schedules, check every 10 minutes + requeueAfter = 10 * time.Minute + } else if requeueAfter < time.Minute { + // For immediate schedules, check in 1 minute + requeueAfter = time.Minute + } + + logger.V(1).Info("Cron schedule waiting for next window", + "nextStart", nextStart, + "timeUntilNext", timeUntilNext, + "requeueAfter", requeueAfter) + + return ctrl.Result{RequeueAfter: requeueAfter}, nil + } +} + +// applyScheduledScaling applies the scheduled scaling configuration to the target Tortoise +func (r *ScheduledScalingReconciler) applyScheduledScaling(ctx context.Context, scheduledScaling *autoscalingv1alpha1.ScheduledScaling) error { + // Skip applying scheduled scaling if the object is being deleted + if scheduledScaling.DeletionTimestamp != nil { + log.FromContext(ctx).Info("Skipping scheduled scaling application - object is being deleted", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + return nil + } + + // Get the target Tortoise + t := &autoscalingv1beta3.Tortoise{} + key := 
types.NamespacedName{Namespace: scheduledScaling.Namespace, Name: scheduledScaling.Spec.TargetRefs.TortoiseName} + if err := r.Get(ctx, key, t); err != nil { + return fmt.Errorf("failed to get target tortoise: %w", err) + } + + // Preserve existing HPA reference if present and not already specified in spec + if t.Spec.TargetRefs.HorizontalPodAutoscalerName == nil && t.Status.Targets.HorizontalPodAutoscaler != "" { + // If tortoise created an HPA but spec doesn't reference it explicitly, + // add the reference to prevent HPA recreation during scheduled scaling + t.Spec.TargetRefs.HorizontalPodAutoscalerName = &t.Status.Targets.HorizontalPodAutoscaler + } + + const annOriginal = "autoscaling.mercari.com/scheduledscaling-original-spec" + const annMinReplicas = "autoscaling.mercari.com/scheduledscaling-min-replicas" + if t.Annotations == nil { + t.Annotations = map[string]string{} + } + + // Persist original spec if not already stored + if _, exists := t.Annotations[annOriginal]; !exists { + orig, err := json.Marshal(t.Spec) + if err != nil { + return fmt.Errorf("marshal original tortoise spec: %w", err) + } + t.Annotations[annOriginal] = string(orig) + } + + // Handle resource scaling - both global and container-specific + updated := false + + // Process global resource requirements (applied to all containers) + var globalCPU, globalMemory resource.Quantity + var hasGlobalCPU, hasGlobalMemory bool + + if scheduledScaling.Spec.Strategy.Static.MinAllocatedResources != nil { + if scheduledScaling.Spec.Strategy.Static.MinAllocatedResources.CPU != "" { + v, err := resource.ParseQuantity(scheduledScaling.Spec.Strategy.Static.MinAllocatedResources.CPU) + if err != nil { + return fmt.Errorf("invalid global cpu quantity %q: %w", scheduledScaling.Spec.Strategy.Static.MinAllocatedResources.CPU, err) + } + globalCPU = v + hasGlobalCPU = true + } + if scheduledScaling.Spec.Strategy.Static.MinAllocatedResources.Memory != "" { + v, err := 
resource.ParseQuantity(scheduledScaling.Spec.Strategy.Static.MinAllocatedResources.Memory) + if err != nil { + return fmt.Errorf("invalid global memory quantity %q: %w", scheduledScaling.Spec.Strategy.Static.MinAllocatedResources.Memory, err) + } + globalMemory = v + hasGlobalMemory = true + } + } + + // Process container-specific resource requirements + containerResources := make(map[string]struct { + cpu resource.Quantity + memory resource.Quantity + hasCPU bool + hasMem bool + }) + + if scheduledScaling.Spec.Strategy.Static.ContainerMinAllocatedResources != nil { + for _, containerReq := range scheduledScaling.Spec.Strategy.Static.ContainerMinAllocatedResources { + var cpu, mem resource.Quantity + var hasCPU, hasMem bool + + if containerReq.Resources.CPU != "" { + v, err := resource.ParseQuantity(containerReq.Resources.CPU) + if err != nil { + return fmt.Errorf("invalid cpu quantity for container %s: %q: %w", containerReq.ContainerName, containerReq.Resources.CPU, err) + } + cpu = v + hasCPU = true + } + if containerReq.Resources.Memory != "" { + v, err := resource.ParseQuantity(containerReq.Resources.Memory) + if err != nil { + return fmt.Errorf("invalid memory quantity for container %s: %q: %w", containerReq.ContainerName, containerReq.Resources.Memory, err) + } + mem = v + hasMem = true + } + + containerResources[containerReq.ContainerName] = struct { + cpu resource.Quantity + memory resource.Quantity + hasCPU bool + hasMem bool + }{cpu, mem, hasCPU, hasMem} + } + } + + // Apply resource requirements to each container in the tortoise + for i := range t.Spec.ResourcePolicy { + pol := &t.Spec.ResourcePolicy[i] + containerName := pol.ContainerName + + // Initialize resource lists if they don't exist + if pol.MinAllocatedResources == nil { + pol.MinAllocatedResources = v1.ResourceList{} + } + if pol.MaxAllocatedResources == nil { + pol.MaxAllocatedResources = v1.ResourceList{} + } + + // Check if we have container-specific resources for this container + 
containerSpec, hasContainerSpec := containerResources[containerName] + + // Apply CPU resources + if hasContainerSpec && containerSpec.hasCPU { + // Container-specific CPU takes precedence + pol.MinAllocatedResources[v1.ResourceCPU] = containerSpec.cpu + pol.MaxAllocatedResources[v1.ResourceCPU] = containerSpec.cpu + updated = true + } else if hasGlobalCPU { + // Apply global CPU to this container + pol.MinAllocatedResources[v1.ResourceCPU] = globalCPU + pol.MaxAllocatedResources[v1.ResourceCPU] = globalCPU + updated = true + } + + // Apply Memory resources + if hasContainerSpec && containerSpec.hasMem { + // Container-specific memory takes precedence + pol.MinAllocatedResources[v1.ResourceMemory] = containerSpec.memory + pol.MaxAllocatedResources[v1.ResourceMemory] = containerSpec.memory + updated = true + } else if hasGlobalMemory { + // Apply global memory to this container + pol.MinAllocatedResources[v1.ResourceMemory] = globalMemory + pol.MaxAllocatedResources[v1.ResourceMemory] = globalMemory + updated = true + } + } + + if scheduledScaling.Spec.Strategy.Static.MinimumMinReplicas != nil { + m := *scheduledScaling.Spec.Strategy.Static.MinimumMinReplicas + if m > 0 { + // Validate that the requested minReplicas is not lower than HPA's recommendation + isValid, warningMsg, recommendedValue := r.validateMinReplicasAgainstHPARecommendation(ctx, t, m) + + // Determine what value to actually use + effectiveMinReplicas := m + if !isValid && recommendedValue > 0 { + // Use the HPA's recommended value instead of the requested value + effectiveMinReplicas = recommendedValue + log.FromContext(ctx).Info("ScheduledScaling minReplicas using HPA recommendation instead of requested value", + "tortoise", t.Name, + "requested", m, + "using", effectiveMinReplicas, + "warning", warningMsg) + } else if !isValid { + // Log the warning but still apply the requested change + log.FromContext(ctx).Info("ScheduledScaling minReplicas validation warning", + "tortoise", t.Name, + 
"requested", m, + "warning", warningMsg) + } + + // Update the status message to include the warning for user visibility + if !isValid && warningMsg != "" { + if scheduledScaling.Status.Message == "" { + scheduledScaling.Status.Message = warningMsg + } else { + scheduledScaling.Status.Message = scheduledScaling.Status.Message + "; " + warningMsg + } + } + + // Store the effective minReplicas value in annotation for the Tortoise controller to read + t.Annotations[annMinReplicas] = fmt.Sprintf("%d", effectiveMinReplicas) + log.FromContext(ctx).Info("Setting HPA minReplicas annotation for Tortoise controller", "tortoise", t.Name, "minReplicas", effectiveMinReplicas) + + updated = true + } + } + + if !updated { + log.FromContext(ctx).Info("Scheduled scaling made no changes to tortoise spec", "tortoise", t.Name) + return nil + } + + // Retry update with conflict resolution + maxRetries := 3 + for attempt := 0; attempt < maxRetries; attempt++ { + if err := r.Update(ctx, t); err != nil { + if apierrors.IsConflict(err) && attempt < maxRetries-1 { + // Refetch the latest version and retry + log.FromContext(ctx).Info("Conflict detected when updating tortoise, refetching and retrying", "tortoise", t.Name, "attempt", attempt+1) + + latest := &autoscalingv1beta3.Tortoise{} + if err := r.Get(ctx, client.ObjectKeyFromObject(t), latest); err != nil { + return fmt.Errorf("failed to refetch tortoise after conflict: %w", err) + } + + // Preserve existing HPA reference if present and not already specified in spec + if latest.Spec.TargetRefs.HorizontalPodAutoscalerName == nil && latest.Status.Targets.HorizontalPodAutoscaler != "" { + latest.Spec.TargetRefs.HorizontalPodAutoscalerName = &latest.Status.Targets.HorizontalPodAutoscaler + } + + // Re-apply the changes to the latest version + if latest.Annotations == nil { + latest.Annotations = map[string]string{} + } + + // Copy our changes to the latest version + for k, v := range t.Annotations { + if k == 
"autoscaling.mercari.com/scheduledscaling-original-spec" || k == "autoscaling.mercari.com/scheduledscaling-min-replicas" { + latest.Annotations[k] = v + } + } + + // Copy resource policy changes + latest.Spec.ResourcePolicy = t.Spec.ResourcePolicy + + t = latest + time.Sleep(time.Duration(attempt+1) * time.Second) + continue + } + return fmt.Errorf("update tortoise: %w", err) + } + // Update succeeded + break + } + return nil +} + +// applyNormalScaling restores the normal scaling configuration +func (r *ScheduledScalingReconciler) applyNormalScaling(ctx context.Context, scheduledScaling *autoscalingv1alpha1.ScheduledScaling) error { + logger := log.FromContext(ctx) + + // Fetch target tortoise + t := &autoscalingv1beta3.Tortoise{} + key := types.NamespacedName{Namespace: scheduledScaling.Namespace, Name: scheduledScaling.Spec.TargetRefs.TortoiseName} + if err := r.Get(ctx, key, t); err != nil { + return fmt.Errorf("failed to get target tortoise for restore: %w", err) + } + + const annOriginal = "autoscaling.mercari.com/scheduledscaling-original-spec" + const annMinReplicas = "autoscaling.mercari.com/scheduledscaling-min-replicas" + + logger.Info("applyNormalScaling: checking annotations", "tortoise", t.Name, "annotations", t.Annotations) + + // Always clean up ScheduledScaling annotations, even if we can't restore the original spec + hasChanges := false + + // Try to restore the original spec first, before removing annotations + if t.Annotations != nil { + if orig, ok := t.Annotations[annOriginal]; ok && orig != "" { + logger.Info("applyNormalScaling: found original spec annotation", "original", orig) + + var spec autoscalingv1beta3.TortoiseSpec + if err := json.Unmarshal([]byte(orig), &spec); err != nil { + logger.Error(err, "Failed to unmarshal original tortoise spec, but continuing with annotation cleanup") + } else { + // Preserve HPA reference if it was added during scheduled scaling to prevent HPA recreation + if spec.TargetRefs.HorizontalPodAutoscalerName 
== nil && t.Spec.TargetRefs.HorizontalPodAutoscalerName != nil && t.Status.Targets.HorizontalPodAutoscaler != "" { + // If original spec didn't have HPA reference but current spec does (added during scheduled scaling), + // preserve it to avoid HPA recreation when restoring + spec.TargetRefs.HorizontalPodAutoscalerName = t.Spec.TargetRefs.HorizontalPodAutoscalerName + } + + // HPA minReplicas restoration is handled by removing the annotation + // The Tortoise controller will restore the original HPA minReplicas when the annotation is removed + + logger.Info("applyNormalScaling: restoring tortoise spec", "tortoise", t.Name, "newSpec", spec) + t.Spec = spec + hasChanges = true + } + } else { + logger.Info("applyNormalScaling: original spec annotation not found, skipping spec restoration", "tortoise", t.Name) + } + } else { + logger.Info("applyNormalScaling: no annotations found on tortoise", "tortoise", t.Name) + } + + // Remove ScheduledScaling annotations after restoring the spec + if t.Annotations != nil { + if _, hasOriginal := t.Annotations[annOriginal]; hasOriginal { + delete(t.Annotations, annOriginal) + hasChanges = true + logger.Info("applyNormalScaling: removed original spec annotation", "tortoise", t.Name) + } + + if _, hasMinReplicas := t.Annotations[annMinReplicas]; hasMinReplicas { + delete(t.Annotations, annMinReplicas) + hasChanges = true + logger.Info("applyNormalScaling: removed min replicas annotation", "tortoise", t.Name) + } + } + + // Only update if we made changes + if hasChanges { + if err := r.Update(ctx, t); err != nil { + return fmt.Errorf("failed to update tortoise during cleanup: %w", err) + } + logger.Info("applyNormalScaling: successfully cleaned up tortoise", "tortoise", t.Name) + } else { + logger.Info("applyNormalScaling: no changes needed", "tortoise", t.Name) + } + + // HPA minReplicas restoration is handled by removing the annotation + // The Tortoise controller will restore the original HPA minReplicas when the annotation is 
removed + + return nil +} + +// updateStatus updates the status of the ScheduledScaling resource +func (r *ScheduledScalingReconciler) updateStatus(ctx context.Context, scheduledScaling *autoscalingv1alpha1.ScheduledScaling, phase autoscalingv1alpha1.ScheduledScalingPhase, reason, message string) error { + // Skip status updates if the object is being deleted + if scheduledScaling.DeletionTimestamp != nil { + log.FromContext(ctx).V(1).Info("Skipping status update for resource being deleted", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + return nil + } + + if scheduledScaling.Status.Phase != phase { + scheduledScaling.Status.Phase = phase + scheduledScaling.Status.Reason = reason + scheduledScaling.Status.Message = message + scheduledScaling.Status.LastTransitionTime = metav1.Now() + + // Update human-readable schedule description + scheduledScaling.Status.HumanReadableSchedule = scheduledScaling.GetHumanReadableSchedule() + + if err := r.Status().Update(ctx, scheduledScaling); err != nil { + if apierrors.IsNotFound(err) { + // Resource was deleted while we were trying to update status + log.FromContext(ctx).Info("ScheduledScaling was deleted while updating status", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + return nil + } + if apierrors.IsConflict(err) { + // Resource was modified, status update will be retried on next reconciliation + log.FromContext(ctx).V(1).Info("Conflict detected during status update, will retry", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + } + return fmt.Errorf("failed to update status: %w", err) + } + } + + return nil +} + +// updateTimeBasedStatus updates the time-based status fields with formatted times +func (r *ScheduledScalingReconciler) updateTimeBasedStatus(ctx context.Context, scheduledScaling *autoscalingv1alpha1.ScheduledScaling, startTime, finishTime time.Time) error { + // Skip status updates if the object is being deleted + if 
scheduledScaling.DeletionTimestamp != nil { + log.FromContext(ctx).V(1).Info("Skipping time-based status update for resource being deleted", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + return nil + } + + // Parse the times and create metav1.Time objects + startMetaTime := metav1.NewTime(startTime) + finishMetaTime := metav1.NewTime(finishTime) + + // Update the status fields + scheduledScaling.Status.CurrentWindowStart = &startMetaTime + scheduledScaling.Status.CurrentWindowEnd = &finishMetaTime + scheduledScaling.Status.FormattedStartTime = scheduledScaling.GetFormattedTime(&startMetaTime) + scheduledScaling.Status.FormattedEndTime = scheduledScaling.GetFormattedTime(&finishMetaTime) + + // For time-based schedules, Next Start is not applicable + scheduledScaling.Status.FormattedNextStartTime = "N/A" + + // Update human-readable schedule description + scheduledScaling.Status.HumanReadableSchedule = scheduledScaling.GetHumanReadableSchedule() + + if err := r.Status().Update(ctx, scheduledScaling); err != nil { + if apierrors.IsNotFound(err) { + // Resource was deleted while we were trying to update status + log.FromContext(ctx).Info("ScheduledScaling was deleted while updating time-based status", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + return nil + } + if apierrors.IsConflict(err) { + // Resource was modified, status update will be retried on next reconciliation + log.FromContext(ctx).V(1).Info("Conflict detected during time-based status update, will retry", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + } + return fmt.Errorf("failed to update time-based status: %w", err) + } + + return nil +} + +// updateCronStatus updates the cron-specific status fields +func (r *ScheduledScalingReconciler) updateCronStatus(ctx context.Context, scheduledScaling *autoscalingv1alpha1.ScheduledScaling, currentStart, currentEnd, nextStart *time.Time) error { + // Skip status updates if the 
object is being deleted + if scheduledScaling.DeletionTimestamp != nil { + log.FromContext(ctx).V(1).Info("Skipping cron status update for resource being deleted", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + return nil + } + + // Update current window times + if currentStart != nil { + startTime := metav1.NewTime(*currentStart) + scheduledScaling.Status.CurrentWindowStart = &startTime + scheduledScaling.Status.FormattedStartTime = scheduledScaling.GetFormattedTime(&startTime) + } + if currentEnd != nil { + endTime := metav1.NewTime(*currentEnd) + scheduledScaling.Status.CurrentWindowEnd = &endTime + scheduledScaling.Status.FormattedEndTime = scheduledScaling.GetFormattedTime(&endTime) + } + if nextStart != nil { + nextTime := metav1.NewTime(*nextStart) + scheduledScaling.Status.NextWindowStart = &nextTime + scheduledScaling.Status.FormattedNextStartTime = scheduledScaling.GetFormattedTime(&nextTime) + } + + // Update human-readable schedule description + scheduledScaling.Status.HumanReadableSchedule = scheduledScaling.GetHumanReadableSchedule() + + if err := r.Status().Update(ctx, scheduledScaling); err != nil { + if apierrors.IsNotFound(err) { + // Resource was deleted while we were trying to update status + log.FromContext(ctx).Info("ScheduledScaling was deleted while updating cron status", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + return nil + } + if apierrors.IsConflict(err) { + // Resource was modified, status update will be retried on next reconciliation + log.FromContext(ctx).V(1).Info("Conflict detected during cron status update, will retry", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + } + return fmt.Errorf("failed to update cron status: %w", err) + } + + return nil +} + +// handleFinalizer handles the finalizer logic for ScheduledScaling resources +func (r *ScheduledScalingReconciler) handleFinalizer(ctx context.Context, scheduledScaling 
*autoscalingv1alpha1.ScheduledScaling) error { + logger := log.FromContext(ctx) + + // Check if the ScheduledScaling is being deleted + if scheduledScaling.DeletionTimestamp != nil { + // If finalizer exists, run cleanup logic + if controllerutil.ContainsFinalizer(scheduledScaling, scheduledScalingFinalizer) { + logger.Info("Running finalizer cleanup for ScheduledScaling", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + + // Always attempt cleanup, regardless of phase, to ensure we don't leave orphaned annotations + logger.Info("Starting Tortoise cleanup and annotation removal", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + + // Create a timeout context for the cleanup operation to prevent it from getting stuck + // Increase timeout to 60 seconds to handle slow API responses + cleanupCtx, cancel := context.WithTimeout(ctx, 60*time.Second) + defer cancel() + + // Attempt cleanup but don't let it block deletion + cleanupErr := r.applyNormalScaling(cleanupCtx, scheduledScaling) + if cleanupErr != nil { + if client.IgnoreNotFound(cleanupErr) == nil { + // Tortoise was not found (likely already deleted), which is fine + logger.Info("Tortoise not found during finalizer cleanup (likely already deleted)", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + } else if cleanupCtx.Err() == context.DeadlineExceeded { + // Cleanup timed out - log warning but continue with finalizer removal + logger.Info("Tortoise cleanup timed out during finalizer cleanup, continuing with finalizer removal to prevent stuck deletion", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace, "timeout", "60s") + } else { + // Some other error occurred - log it but don't fail the finalizer removal + // This prevents the resource from getting permanently stuck in deletion + logger.Error(cleanupErr, "Failed to restore Tortoise during finalizer cleanup, continuing with finalizer removal to prevent stuck 
deletion", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + } + } else { + logger.Info("Successfully cleaned up Tortoise annotations and restored original spec", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + } + + // Remove the finalizer - this should always succeed to prevent stuck deletion + controllerutil.RemoveFinalizer(scheduledScaling, scheduledScalingFinalizer) + logger.Info("Removing finalizer from ScheduledScaling", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + + // Retry finalizer removal with exponential backoff in case of conflicts + maxRetries := 3 + for attempt := 0; attempt < maxRetries; attempt++ { + // Refetch the resource to get the latest version before updating + latest := &autoscalingv1alpha1.ScheduledScaling{} + if err := r.Get(ctx, client.ObjectKeyFromObject(scheduledScaling), latest); err != nil { + if apierrors.IsNotFound(err) { + // Resource was already deleted, nothing more to do + logger.Info("ScheduledScaling resource was already deleted", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + return nil + } + logger.Error(err, "Failed to refetch ScheduledScaling for finalizer removal", "attempt", attempt+1) + if attempt == maxRetries-1 { + return fmt.Errorf("failed to refetch resource for finalizer removal after %d attempts: %w", maxRetries, err) + } + time.Sleep(time.Duration(attempt+1) * time.Second) + continue + } + + // Remove finalizer from the latest version + controllerutil.RemoveFinalizer(latest, scheduledScalingFinalizer) + + if err := r.Update(ctx, latest); err != nil { + if apierrors.IsConflict(err) { + logger.Info("Conflict detected when removing finalizer, retrying", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace, "attempt", attempt+1) + if attempt < maxRetries-1 { + time.Sleep(time.Duration(attempt+1) * time.Second) + continue + } + } + if apierrors.IsNotFound(err) { + // Resource was deleted while 
we were trying to update it + logger.Info("ScheduledScaling was deleted while removing finalizer", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + return nil + } + logger.Error(err, "Failed to remove finalizer", "attempt", attempt+1) + if attempt == maxRetries-1 { + return fmt.Errorf("failed to remove finalizer after %d attempts: %w", maxRetries, err) + } + time.Sleep(time.Duration(attempt+1) * time.Second) + continue + } + + // Successfully removed finalizer + logger.Info("Successfully removed finalizer from ScheduledScaling", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + break + } + + logger.Info("Finalizer cleanup completed for ScheduledScaling", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + } + return nil + } + + // Add finalizer if it doesn't exist + if !controllerutil.ContainsFinalizer(scheduledScaling, scheduledScalingFinalizer) { + // Only add finalizer if the resource is not being deleted + if scheduledScaling.DeletionTimestamp == nil { + controllerutil.AddFinalizer(scheduledScaling, scheduledScalingFinalizer) + if err := r.Update(ctx, scheduledScaling); err != nil { + if apierrors.IsConflict(err) { + logger.Info("Conflict detected when adding finalizer, will retry on next reconciliation", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + return fmt.Errorf("failed to add finalizer due to conflict: %w", err) + } + return fmt.Errorf("failed to add finalizer: %w", err) + } + logger.Info("Added finalizer to ScheduledScaling", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + } else { + logger.Info("Skipping finalizer addition for resource being deleted", "name", scheduledScaling.Name, "namespace", scheduledScaling.Namespace) + } + } + + return nil +} + +// validateMinReplicasAgainstHPARecommendation validates that the requested minReplicas +// is not lower than the HPA's recommended value to prevent scaling down too aggressively +// 
Returns: (isValid, warningMessage, recommendedValue) +func (r *ScheduledScalingReconciler) validateMinReplicasAgainstHPARecommendation(ctx context.Context, tortoise *autoscalingv1beta3.Tortoise, requestedMinReplicas int32) (bool, string, int32) { + logger := log.FromContext(ctx) + + // If no minReplicas is requested, no validation needed + if requestedMinReplicas <= 0 { + return true, "", 0 + } + + // Check if Tortoise has HPA recommendations + if len(tortoise.Status.Recommendations.Horizontal.MinReplicas) == 0 { + logger.Info("No HPA minReplicas recommendations available for validation", "tortoise", tortoise.Name) + return true, "", 0 + } + + // Find the current time slot recommendation + now := time.Now() + currentHour := now.Hour() + currentWeekday := now.Weekday().String() + + var recommendedMinReplicas int32 + var foundRecommendation bool + + for _, rec := range tortoise.Status.Recommendations.Horizontal.MinReplicas { + // Check if this recommendation applies to the current time + if rec.From <= currentHour && currentHour < rec.To { + // Check weekday if specified + if rec.WeekDay == nil || *rec.WeekDay == currentWeekday { + recommendedMinReplicas = rec.Value + foundRecommendation = true + break + } + } + } + + if !foundRecommendation { + logger.Info("No HPA minReplicas recommendation found for current time slot", "tortoise", tortoise.Name, "hour", currentHour, "weekday", currentWeekday) + return true, "", 0 + } + + // Validate that requested minReplicas is not lower than recommended + if requestedMinReplicas < recommendedMinReplicas { + warningMsg := fmt.Sprintf("Requested minReplicas (%d) is lower than HPA's current recommendation (%d) for the workload. Using HPA recommendation (%d) instead to prevent performance issues. 
Consider reviewing your scaling strategy.", + requestedMinReplicas, recommendedMinReplicas, recommendedMinReplicas) + + logger.Info("ScheduledScaling minReplicas validation warning", + "tortoise", tortoise.Name, + "requested", requestedMinReplicas, + "recommended", recommendedMinReplicas, + "warning", warningMsg) + + return false, warningMsg, recommendedMinReplicas + } + + return true, "", 0 +} + +// SetupWithManager sets up the controller with the Manager. +func (r *ScheduledScalingReconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&autoscalingv1alpha1.ScheduledScaling{}). + Complete(r) +} diff --git a/internal/controller/scheduledscaling_controller_test.go b/internal/controller/scheduledscaling_controller_test.go new file mode 100644 index 00000000..2d51c750 --- /dev/null +++ b/internal/controller/scheduledscaling_controller_test.go @@ -0,0 +1,885 @@ +/* +Copyright 2024 The Tortoise Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "context" + "time" + + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + + autoscalingv1alpha1 "github.com/mercari/tortoise/api/v1alpha1" + autoscalingv1beta3 "github.com/mercari/tortoise/api/v1beta3" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +// Helper function to create int32 pointers +func int32Ptr(i int32) *int32 { + return &i +} + +var _ = Describe("ScheduledScaling Controller", func() { + const ( + timeout = time.Second * 10 + interval = time.Millisecond * 250 + ) + + Context("When creating a ScheduledScaling", func() { + It("Should handle basic scheduled scaling creation", func() { + ctx := context.Background() + + // Create a test ScheduledScaling resource + scheduledScaling := &autoscalingv1alpha1.ScheduledScaling{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-scheduled-scaling", + Namespace: "default", + }, + Spec: autoscalingv1alpha1.ScheduledScalingSpec{ + Schedule: autoscalingv1alpha1.Schedule{ + Type: autoscalingv1alpha1.ScheduleTypeTime, + StartAt: time.Now().Add(time.Hour).Format(time.RFC3339), + FinishAt: time.Now().Add(2 * time.Hour).Format(time.RFC3339), + }, + TargetRefs: autoscalingv1alpha1.TargetRefs{ + TortoiseName: "test-tortoise", + }, + Strategy: autoscalingv1alpha1.Strategy{ + Static: autoscalingv1alpha1.StaticStrategy{ + MinimumMinReplicas: int32Ptr(3), + MinAllocatedResources: &autoscalingv1alpha1.ResourceRequirements{ + CPU: "500m", + Memory: "512Mi", + }, + }, + }, + }, + } + + Expect(k8sClient.Create(ctx, scheduledScaling)).Should(Succeed()) + + // Verify the resource was created and wait for status update + resourceLookupKey := types.NamespacedName{ + Name: scheduledScaling.Name, + Namespace: scheduledScaling.Namespace, + } + createdResource := &autoscalingv1alpha1.ScheduledScaling{} + + // Wait for the controller to reconcile and set the status to Pending + Eventually(func() autoscalingv1alpha1.ScheduledScalingPhase { + err := k8sClient.Get(ctx, resourceLookupKey, createdResource) + if err != nil { + return "" + } + return createdResource.Status.Phase + }, timeout, interval).Should(Equal(autoscalingv1alpha1.ScheduledScalingPhasePending)) + + // Clean up + Expect(k8sClient.Delete(ctx, scheduledScaling)).Should(Succeed()) + }) + + It("Should 
reject invalid schedule times", func() { + ctx := context.Background() + + // Create a ScheduledScaling with invalid times (finish before start) + scheduledScaling := &autoscalingv1alpha1.ScheduledScaling{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-invalid-schedule", + Namespace: "default", + }, + Spec: autoscalingv1alpha1.ScheduledScalingSpec{ + Schedule: autoscalingv1alpha1.Schedule{ + Type: autoscalingv1alpha1.ScheduleTypeTime, + StartAt: time.Now().Add(2 * time.Hour).Format(time.RFC3339), + FinishAt: time.Now().Add(time.Hour).Format(time.RFC3339), // Invalid: finish before start + }, + TargetRefs: autoscalingv1alpha1.TargetRefs{ + TortoiseName: "test-tortoise", + }, + Strategy: autoscalingv1alpha1.Strategy{ + Static: autoscalingv1alpha1.StaticStrategy{ + MinimumMinReplicas: int32Ptr(3), + MinAllocatedResources: &autoscalingv1alpha1.ResourceRequirements{ + CPU: "500m", + Memory: "512Mi", + }, + }, + }, + }, + } + + Expect(k8sClient.Create(ctx, scheduledScaling)).Should(Succeed()) + + // Wait for the controller to process and update status + resourceLookupKey := types.NamespacedName{ + Name: scheduledScaling.Name, + Namespace: scheduledScaling.Namespace, + } + createdResource := &autoscalingv1alpha1.ScheduledScaling{} + + Eventually(func() autoscalingv1alpha1.ScheduledScalingPhase { + err := k8sClient.Get(ctx, resourceLookupKey, createdResource) + if err != nil { + return "" + } + return createdResource.Status.Phase + }, timeout, interval).Should(Equal(autoscalingv1alpha1.ScheduledScalingPhaseFailed)) + + // Verify the error reason + Expect(createdResource.Status.Reason).Should(Equal("InvalidSchedule")) + + // Clean up + Expect(k8sClient.Delete(ctx, scheduledScaling)).Should(Succeed()) + }) + }) + + Context("Apply and restore scheduled scaling", func() { + It("should apply desired min resources and then restore original spec", func() { + ctx := context.Background() + + // Create a baseline Tortoise that ScheduledScaling will target + t := 
&autoscalingv1beta3.Tortoise{ + ObjectMeta: metav1.ObjectMeta{Name: "test-tortoise-apply", Namespace: "default"}, + Spec: autoscalingv1beta3.TortoiseSpec{ + ResourcePolicy: []autoscalingv1beta3.ContainerResourcePolicy{ + { + ContainerName: "app", + MinAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("100m"), + v1.ResourceMemory: resource.MustParse("128Mi"), + }, + }, + }, + }, + } + Expect(k8sClient.Create(ctx, t)).Should(Succeed()) + + // Prepare a ScheduledScaling that is currently active and ends soon + start := time.Now().Add(-1 * time.Second).Format(time.RFC3339) + finish := time.Now().Add(3 * time.Second).Format(time.RFC3339) + ss := &autoscalingv1alpha1.ScheduledScaling{ + ObjectMeta: metav1.ObjectMeta{Name: "test-apply-restore", Namespace: "default"}, + Spec: autoscalingv1alpha1.ScheduledScalingSpec{ + Schedule: autoscalingv1alpha1.Schedule{Type: autoscalingv1alpha1.ScheduleTypeTime, StartAt: start, FinishAt: finish}, + TargetRefs: autoscalingv1alpha1.TargetRefs{TortoiseName: t.Name}, + Strategy: autoscalingv1alpha1.Strategy{Static: autoscalingv1alpha1.StaticStrategy{ + MinimumMinReplicas: int32Ptr(5), + MinAllocatedResources: &autoscalingv1alpha1.ResourceRequirements{CPU: "500m", Memory: "512Mi"}, + }}, + }, + } + Expect(k8sClient.Create(ctx, ss)).Should(Succeed()) + + // During active window, Tortoise should be updated + Eventually(func(g Gomega) { + cur := &autoscalingv1beta3.Tortoise{} + err := k8sClient.Get(ctx, types.NamespacedName{Name: t.Name, Namespace: t.Namespace}, cur) + g.Expect(err).ShouldNot(HaveOccurred()) + g.Expect(cur.Spec.ResourcePolicy).ToNot(BeEmpty()) + pol := cur.Spec.ResourcePolicy[0] + cpu := pol.MinAllocatedResources[v1.ResourceCPU] + mem := pol.MinAllocatedResources[v1.ResourceMemory] + g.Expect((&cpu).Cmp(resource.MustParse("500m")) >= 0).To(BeTrue()) + g.Expect((&mem).Cmp(resource.MustParse("512Mi")) >= 0).To(BeTrue()) + }, timeout, interval).Should(Succeed()) + + // After the window finishes, original 
spec should be restored + Eventually(func(g Gomega) { + cur := &autoscalingv1beta3.Tortoise{} + err := k8sClient.Get(ctx, types.NamespacedName{Name: t.Name, Namespace: t.Namespace}, cur) + g.Expect(err).ShouldNot(HaveOccurred()) + pol := cur.Spec.ResourcePolicy[0] + g.Expect(pol.MinAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("100m"))).To(BeTrue()) + g.Expect(pol.MinAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("128Mi"))).To(BeTrue()) + }, time.Second*12, interval).Should(Succeed()) + }) + + It("should apply container-specific resources and override both min and max", func() { + ctx := context.Background() + + // Create a baseline Tortoise with multiple containers and max resources + t := &autoscalingv1beta3.Tortoise{ + ObjectMeta: metav1.ObjectMeta{Name: "test-tortoise-container-specific", Namespace: "default"}, + Spec: autoscalingv1beta3.TortoiseSpec{ + ResourcePolicy: []autoscalingv1beta3.ContainerResourcePolicy{ + { + ContainerName: "app", + MinAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("100m"), + v1.ResourceMemory: resource.MustParse("128Mi"), + }, + MaxAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("2000m"), + v1.ResourceMemory: resource.MustParse("2Gi"), + }, + }, + { + ContainerName: "sidecar", + MinAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("50m"), + v1.ResourceMemory: resource.MustParse("64Mi"), + }, + MaxAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("500m"), + v1.ResourceMemory: resource.MustParse("512Mi"), + }, + }, + }, + }, + } + Expect(k8sClient.Create(ctx, t)).Should(Succeed()) + + // Prepare a ScheduledScaling with container-specific resources + start := time.Now().Add(-1 * time.Second).Format(time.RFC3339) + finish := time.Now().Add(3 * time.Second).Format(time.RFC3339) + ss := &autoscalingv1alpha1.ScheduledScaling{ + ObjectMeta: metav1.ObjectMeta{Name: "test-container-specific", Namespace: 
"default"}, + Spec: autoscalingv1alpha1.ScheduledScalingSpec{ + Schedule: autoscalingv1alpha1.Schedule{Type: autoscalingv1alpha1.ScheduleTypeTime, StartAt: start, FinishAt: finish}, + TargetRefs: autoscalingv1alpha1.TargetRefs{TortoiseName: t.Name}, + Strategy: autoscalingv1alpha1.Strategy{Static: autoscalingv1alpha1.StaticStrategy{ + ContainerMinAllocatedResources: []autoscalingv1alpha1.ContainerResourceRequirements{ + { + ContainerName: "app", + Resources: autoscalingv1alpha1.ResourceRequirements{ + CPU: "1500m", + Memory: "1.5Gi", + }, + }, + { + ContainerName: "sidecar", + Resources: autoscalingv1alpha1.ResourceRequirements{ + CPU: "300m", + Memory: "256Mi", + }, + }, + }, + }}, + }, + } + Expect(k8sClient.Create(ctx, ss)).Should(Succeed()) + + // During active window, Tortoise should be updated with container-specific resources + Eventually(func(g Gomega) { + cur := &autoscalingv1beta3.Tortoise{} + err := k8sClient.Get(ctx, types.NamespacedName{Name: t.Name, Namespace: t.Namespace}, cur) + g.Expect(err).ShouldNot(HaveOccurred()) + g.Expect(cur.Spec.ResourcePolicy).To(HaveLen(2)) + + // Check app container + appPol := cur.Spec.ResourcePolicy[0] + if appPol.ContainerName == "sidecar" { + appPol = cur.Spec.ResourcePolicy[1] + } + g.Expect(appPol.ContainerName).To(Equal("app")) + g.Expect(appPol.MinAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("1500m"))).To(BeTrue()) + g.Expect(appPol.MinAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("1.5Gi"))).To(BeTrue()) + // Max resources should also be overridden + g.Expect(appPol.MaxAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("1500m"))).To(BeTrue()) + g.Expect(appPol.MaxAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("1.5Gi"))).To(BeTrue()) + + // Check sidecar container + sidecarPol := cur.Spec.ResourcePolicy[0] + if sidecarPol.ContainerName == "app" { + sidecarPol = cur.Spec.ResourcePolicy[1] + } + g.Expect(sidecarPol.ContainerName).To(Equal("sidecar")) + 
g.Expect(sidecarPol.MinAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("300m"))).To(BeTrue()) + g.Expect(sidecarPol.MinAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("256Mi"))).To(BeTrue()) + // Max resources should also be overridden + g.Expect(sidecarPol.MaxAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("300m"))).To(BeTrue()) + g.Expect(sidecarPol.MaxAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("256Mi"))).To(BeTrue()) + }, timeout, interval).Should(Succeed()) + + // After the window finishes, original spec should be restored + Eventually(func(g Gomega) { + cur := &autoscalingv1beta3.Tortoise{} + err := k8sClient.Get(ctx, types.NamespacedName{Name: t.Name, Namespace: t.Namespace}, cur) + g.Expect(err).ShouldNot(HaveOccurred()) + g.Expect(cur.Spec.ResourcePolicy).To(HaveLen(2)) + + // Check app container restoration + appPol := cur.Spec.ResourcePolicy[0] + if appPol.ContainerName == "sidecar" { + appPol = cur.Spec.ResourcePolicy[1] + } + g.Expect(appPol.ContainerName).To(Equal("app")) + g.Expect(appPol.MinAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("100m"))).To(BeTrue()) + g.Expect(appPol.MinAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("128Mi"))).To(BeTrue()) + g.Expect(appPol.MaxAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("2000m"))).To(BeTrue()) + g.Expect(appPol.MaxAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("2Gi"))).To(BeTrue()) + + // Check sidecar container restoration + sidecarPol := cur.Spec.ResourcePolicy[0] + if sidecarPol.ContainerName == "app" { + sidecarPol = cur.Spec.ResourcePolicy[1] + } + g.Expect(sidecarPol.ContainerName).To(Equal("sidecar")) + g.Expect(sidecarPol.MinAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("50m"))).To(BeTrue()) + g.Expect(sidecarPol.MinAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("64Mi"))).To(BeTrue()) + 
g.Expect(sidecarPol.MaxAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("500m"))).To(BeTrue()) + g.Expect(sidecarPol.MaxAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("512Mi"))).To(BeTrue()) + }, time.Second*12, interval).Should(Succeed()) + }) + + It("should apply mixed global and container-specific resources with proper precedence", func() { + ctx := context.Background() + + // Create a baseline Tortoise with multiple containers + t := &autoscalingv1beta3.Tortoise{ + ObjectMeta: metav1.ObjectMeta{Name: "test-tortoise-mixed", Namespace: "default"}, + Spec: autoscalingv1beta3.TortoiseSpec{ + ResourcePolicy: []autoscalingv1beta3.ContainerResourcePolicy{ + { + ContainerName: "app", + MinAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("100m"), + v1.ResourceMemory: resource.MustParse("128Mi"), + }, + MaxAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("2000m"), + v1.ResourceMemory: resource.MustParse("2Gi"), + }, + }, + { + ContainerName: "sidecar", + MinAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("50m"), + v1.ResourceMemory: resource.MustParse("64Mi"), + }, + MaxAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("500m"), + v1.ResourceMemory: resource.MustParse("512Mi"), + }, + }, + { + ContainerName: "monitoring", + MinAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("25m"), + v1.ResourceMemory: resource.MustParse("32Mi"), + }, + MaxAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("200m"), + v1.ResourceMemory: resource.MustParse("256Mi"), + }, + }, + }, + }, + } + Expect(k8sClient.Create(ctx, t)).Should(Succeed()) + + // Prepare a ScheduledScaling with both global and container-specific resources + start := time.Now().Add(-1 * time.Second).Format(time.RFC3339) + finish := time.Now().Add(3 * time.Second).Format(time.RFC3339) + ss := &autoscalingv1alpha1.ScheduledScaling{ + ObjectMeta: 
metav1.ObjectMeta{Name: "test-mixed-resources", Namespace: "default"}, + Spec: autoscalingv1alpha1.ScheduledScalingSpec{ + Schedule: autoscalingv1alpha1.Schedule{Type: autoscalingv1alpha1.ScheduleTypeTime, StartAt: start, FinishAt: finish}, + TargetRefs: autoscalingv1alpha1.TargetRefs{TortoiseName: t.Name}, + Strategy: autoscalingv1alpha1.Strategy{Static: autoscalingv1alpha1.StaticStrategy{ + // Global resources (fallback for containers without specific specs) + MinAllocatedResources: &autoscalingv1alpha1.ResourceRequirements{ + CPU: "800m", + Memory: "1Gi", + }, + // Container-specific resources (take precedence) + ContainerMinAllocatedResources: []autoscalingv1alpha1.ContainerResourceRequirements{ + { + ContainerName: "app", + Resources: autoscalingv1alpha1.ResourceRequirements{ + CPU: "1500m", + Memory: "1.5Gi", + }, + }, + { + ContainerName: "sidecar", + Resources: autoscalingv1alpha1.ResourceRequirements{ + CPU: "300m", + Memory: "256Mi", + }, + }, + // monitoring container will use global resources + }, + }}, + }, + } + Expect(k8sClient.Create(ctx, ss)).Should(Succeed()) + + // During active window, verify mixed resource application + Eventually(func(g Gomega) { + cur := &autoscalingv1beta3.Tortoise{} + err := k8sClient.Get(ctx, types.NamespacedName{Name: t.Name, Namespace: t.Namespace}, cur) + g.Expect(err).ShouldNot(HaveOccurred()) + g.Expect(cur.Spec.ResourcePolicy).To(HaveLen(3)) + + // Find each container policy + var appPol, sidecarPol, monitoringPol *autoscalingv1beta3.ContainerResourcePolicy + for i := range cur.Spec.ResourcePolicy { + switch cur.Spec.ResourcePolicy[i].ContainerName { + case "app": + appPol = &cur.Spec.ResourcePolicy[i] + case "sidecar": + sidecarPol = &cur.Spec.ResourcePolicy[i] + case "monitoring": + monitoringPol = &cur.Spec.ResourcePolicy[i] + } + } + + // App container: should use container-specific resources + g.Expect(appPol).ToNot(BeNil()) + 
g.Expect(appPol.MinAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("1500m"))).To(BeTrue()) + g.Expect(appPol.MinAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("1.5Gi"))).To(BeTrue()) + g.Expect(appPol.MaxAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("1500m"))).To(BeTrue()) + g.Expect(appPol.MaxAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("1.5Gi"))).To(BeTrue()) + + // Sidecar container: should use container-specific resources + g.Expect(sidecarPol).ToNot(BeNil()) + g.Expect(sidecarPol.MinAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("300m"))).To(BeTrue()) + g.Expect(sidecarPol.MinAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("256Mi"))).To(BeTrue()) + g.Expect(sidecarPol.MaxAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("300m"))).To(BeTrue()) + g.Expect(sidecarPol.MaxAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("256Mi"))).To(BeTrue()) + + // Monitoring container: should use global resources + g.Expect(monitoringPol).ToNot(BeNil()) + g.Expect(monitoringPol.MinAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("800m"))).To(BeTrue()) + g.Expect(monitoringPol.MinAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("1Gi"))).To(BeTrue()) + g.Expect(monitoringPol.MaxAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("800m"))).To(BeTrue()) + g.Expect(monitoringPol.MaxAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("1Gi"))).To(BeTrue()) + }, timeout, interval).Should(Succeed()) + + // After the window finishes, original spec should be restored + Eventually(func(g Gomega) { + cur := &autoscalingv1beta3.Tortoise{} + err := k8sClient.Get(ctx, types.NamespacedName{Name: t.Name, Namespace: t.Namespace}, cur) + g.Expect(err).ShouldNot(HaveOccurred()) + g.Expect(cur.Spec.ResourcePolicy).To(HaveLen(3)) + + // Find each container policy + var appPol, sidecarPol, monitoringPol *autoscalingv1beta3.ContainerResourcePolicy + 
for i := range cur.Spec.ResourcePolicy { + switch cur.Spec.ResourcePolicy[i].ContainerName { + case "app": + appPol = &cur.Spec.ResourcePolicy[i] + case "sidecar": + sidecarPol = &cur.Spec.ResourcePolicy[i] + case "monitoring": + monitoringPol = &cur.Spec.ResourcePolicy[i] + } + } + + // Verify all containers have original resources restored + g.Expect(appPol.MinAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("100m"))).To(BeTrue()) + g.Expect(appPol.MinAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("128Mi"))).To(BeTrue()) + g.Expect(appPol.MaxAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("2000m"))).To(BeTrue()) + g.Expect(appPol.MaxAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("2Gi"))).To(BeTrue()) + + g.Expect(sidecarPol.MinAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("50m"))).To(BeTrue()) + g.Expect(sidecarPol.MinAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("64Mi"))).To(BeTrue()) + g.Expect(sidecarPol.MaxAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("500m"))).To(BeTrue()) + g.Expect(sidecarPol.MaxAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("512Mi"))).To(BeTrue()) + + g.Expect(monitoringPol.MinAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("25m"))).To(BeTrue()) + g.Expect(monitoringPol.MinAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("32Mi"))).To(BeTrue()) + g.Expect(monitoringPol.MaxAllocatedResources[v1.ResourceCPU].Equal(resource.MustParse("200m"))).To(BeTrue()) + g.Expect(monitoringPol.MaxAllocatedResources[v1.ResourceMemory].Equal(resource.MustParse("256Mi"))).To(BeTrue()) + }, time.Second*12, interval).Should(Succeed()) + }) + }) + + Context("Cron-based Scheduled Scaling", func() { + It("Should handle cron-based scheduled scaling creation", func() { + ctx := context.Background() + + // Create a baseline Tortoise that ScheduledScaling will target + t := &autoscalingv1beta3.Tortoise{ + ObjectMeta: 
metav1.ObjectMeta{Name: "test-tortoise-cron", Namespace: "default"}, + Spec: autoscalingv1beta3.TortoiseSpec{ + ResourcePolicy: []autoscalingv1beta3.ContainerResourcePolicy{ + { + ContainerName: "app", + MinAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("100m"), + v1.ResourceMemory: resource.MustParse("128Mi"), + }, + }, + }, + }, + } + Expect(k8sClient.Create(ctx, t)).Should(Succeed()) + + // Create a test cron-based ScheduledScaling resource + scheduledScaling := &autoscalingv1alpha1.ScheduledScaling{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cron-scaling", + Namespace: "default", + }, + Spec: autoscalingv1alpha1.ScheduledScalingSpec{ + Schedule: autoscalingv1alpha1.Schedule{ + Type: autoscalingv1alpha1.ScheduleTypeCron, + CronExpression: "0 9 * * 1-5", // Weekdays at 9 AM + Duration: "8h", // 8-hour duration + TimeZone: "Asia/Tokyo", + }, + TargetRefs: autoscalingv1alpha1.TargetRefs{ + TortoiseName: "test-tortoise-cron", + }, + Strategy: autoscalingv1alpha1.Strategy{ + Static: autoscalingv1alpha1.StaticStrategy{ + MinimumMinReplicas: int32Ptr(5), + MinAllocatedResources: &autoscalingv1alpha1.ResourceRequirements{ + CPU: "1000m", + Memory: "1Gi", + }, + }, + }, + }, + } + + Expect(k8sClient.Create(ctx, scheduledScaling)).Should(Succeed()) + + // Verify the resource was created and status is set correctly + resourceLookupKey := types.NamespacedName{ + Name: scheduledScaling.Name, + Namespace: scheduledScaling.Namespace, + } + createdResource := &autoscalingv1alpha1.ScheduledScaling{} + + // For cron-based schedules, should be in Active state if currently within schedule window + Eventually(func() autoscalingv1alpha1.ScheduledScalingPhase { + err := k8sClient.Get(ctx, resourceLookupKey, createdResource) + if err != nil { + return "" + } + return createdResource.Status.Phase + }, timeout, interval).Should(Or(Equal(autoscalingv1alpha1.ScheduledScalingPhasePending), Equal(autoscalingv1alpha1.ScheduledScalingPhaseActive))) + + // Verify 
the status message contains cron information + // The message format depends on whether the schedule is currently active or pending + if createdResource.Status.Phase == autoscalingv1alpha1.ScheduledScalingPhaseActive { + Expect(createdResource.Status.Message).Should(ContainSubstring("Cron-based scheduled scaling is active")) + } else { + Expect(createdResource.Status.Message).Should(ContainSubstring("cron: 0 9 * * 1-5")) + } + + // Clean up + Expect(k8sClient.Delete(ctx, scheduledScaling)).Should(Succeed()) + Expect(k8sClient.Delete(ctx, t)).Should(Succeed()) + }) + + It("Should reject invalid cron expressions", func() { + ctx := context.Background() + + // Create a baseline Tortoise that ScheduledScaling will target + t := &autoscalingv1beta3.Tortoise{ + ObjectMeta: metav1.ObjectMeta{Name: "test-tortoise-invalid-cron", Namespace: "default"}, + Spec: autoscalingv1beta3.TortoiseSpec{ + ResourcePolicy: []autoscalingv1beta3.ContainerResourcePolicy{ + { + ContainerName: "app", + MinAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("100m"), + v1.ResourceMemory: resource.MustParse("128Mi"), + }, + }, + }, + }, + } + Expect(k8sClient.Create(ctx, t)).Should(Succeed()) + + // Create a cron ScheduledScaling with invalid cron expression + scheduledScaling := &autoscalingv1alpha1.ScheduledScaling{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-invalid-cron", + Namespace: "default", + }, + Spec: autoscalingv1alpha1.ScheduledScalingSpec{ + Schedule: autoscalingv1alpha1.Schedule{ + Type: autoscalingv1alpha1.ScheduleTypeCron, + CronExpression: "invalid cron expression", // Invalid cron + Duration: "8h", + TimeZone: "Asia/Tokyo", + }, + TargetRefs: autoscalingv1alpha1.TargetRefs{ + TortoiseName: "test-tortoise-invalid-cron", + }, + Strategy: autoscalingv1alpha1.Strategy{ + Static: autoscalingv1alpha1.StaticStrategy{ + MinimumMinReplicas: int32Ptr(3), + MinAllocatedResources: &autoscalingv1alpha1.ResourceRequirements{ + CPU: "500m", + Memory: "512Mi", + }, + 
}, + }, + }, + } + + Expect(k8sClient.Create(ctx, scheduledScaling)).Should(Succeed()) + + // Wait for the controller to process and set status to Failed + resourceLookupKey := types.NamespacedName{ + Name: scheduledScaling.Name, + Namespace: scheduledScaling.Namespace, + } + createdResource := &autoscalingv1alpha1.ScheduledScaling{} + + Eventually(func() autoscalingv1alpha1.ScheduledScalingPhase { + err := k8sClient.Get(ctx, resourceLookupKey, createdResource) + if err != nil { + return "" + } + return createdResource.Status.Phase + }, timeout, interval).Should(Equal(autoscalingv1alpha1.ScheduledScalingPhaseFailed)) + + // Verify the error reason + Expect(createdResource.Status.Reason).Should(Equal("InvalidSchedule")) + Expect(createdResource.Status.Message).Should(ContainSubstring("invalid cronExpression format")) + + // Clean up + Expect(k8sClient.Delete(ctx, scheduledScaling)).Should(Succeed()) + Expect(k8sClient.Delete(ctx, t)).Should(Succeed()) + }) + + It("Should reject missing duration for cron schedule", func() { + ctx := context.Background() + + // Create a baseline Tortoise that ScheduledScaling will target + t := &autoscalingv1beta3.Tortoise{ + ObjectMeta: metav1.ObjectMeta{Name: "test-tortoise-missing-duration", Namespace: "default"}, + Spec: autoscalingv1beta3.TortoiseSpec{ + ResourcePolicy: []autoscalingv1beta3.ContainerResourcePolicy{ + { + ContainerName: "app", + MinAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("100m"), + v1.ResourceMemory: resource.MustParse("128Mi"), + }, + }, + }, + }, + } + Expect(k8sClient.Create(ctx, t)).Should(Succeed()) + + // Create a cron ScheduledScaling without duration + scheduledScaling := &autoscalingv1alpha1.ScheduledScaling{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-missing-duration", + Namespace: "default", + }, + Spec: autoscalingv1alpha1.ScheduledScalingSpec{ + Schedule: autoscalingv1alpha1.Schedule{ + Type: autoscalingv1alpha1.ScheduleTypeCron, + CronExpression: "0 9 * * 1-5", + 
// Duration: missing + TimeZone: "Asia/Tokyo", + }, + TargetRefs: autoscalingv1alpha1.TargetRefs{ + TortoiseName: "test-tortoise-missing-duration", + }, + Strategy: autoscalingv1alpha1.Strategy{ + Static: autoscalingv1alpha1.StaticStrategy{ + MinimumMinReplicas: int32Ptr(3), + MinAllocatedResources: &autoscalingv1alpha1.ResourceRequirements{ + CPU: "500m", + Memory: "512Mi", + }, + }, + }, + }, + } + + Expect(k8sClient.Create(ctx, scheduledScaling)).Should(Succeed()) + + // Wait for the controller to process and set status to Failed + resourceLookupKey := types.NamespacedName{ + Name: scheduledScaling.Name, + Namespace: scheduledScaling.Namespace, + } + createdResource := &autoscalingv1alpha1.ScheduledScaling{} + + Eventually(func() autoscalingv1alpha1.ScheduledScalingPhase { + err := k8sClient.Get(ctx, resourceLookupKey, createdResource) + if err != nil { + return "" + } + return createdResource.Status.Phase + }, timeout, interval).Should(Equal(autoscalingv1alpha1.ScheduledScalingPhaseFailed)) + + // Verify the error reason + Expect(createdResource.Status.Reason).Should(Equal("InvalidSchedule")) + Expect(createdResource.Status.Message).Should(ContainSubstring("duration is required when type is 'cron'")) + + // Clean up + Expect(k8sClient.Delete(ctx, scheduledScaling)).Should(Succeed()) + Expect(k8sClient.Delete(ctx, t)).Should(Succeed()) + }) + + It("Should reject invalid timezone", func() { + ctx := context.Background() + + // Create a baseline Tortoise that ScheduledScaling will target + t := &autoscalingv1beta3.Tortoise{ + ObjectMeta: metav1.ObjectMeta{Name: "test-tortoise-invalid-tz", Namespace: "default"}, + Spec: autoscalingv1beta3.TortoiseSpec{ + ResourcePolicy: []autoscalingv1beta3.ContainerResourcePolicy{ + { + ContainerName: "app", + MinAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("100m"), + v1.ResourceMemory: resource.MustParse("128Mi"), + }, + }, + }, + }, + } + Expect(k8sClient.Create(ctx, t)).Should(Succeed()) + + // 
Create a cron ScheduledScaling with invalid timezone + scheduledScaling := &autoscalingv1alpha1.ScheduledScaling{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-invalid-timezone", + Namespace: "default", + }, + Spec: autoscalingv1alpha1.ScheduledScalingSpec{ + Schedule: autoscalingv1alpha1.Schedule{ + Type: autoscalingv1alpha1.ScheduleTypeCron, + CronExpression: "0 9 * * 1-5", + Duration: "8h", + TimeZone: "Invalid/Timezone", // Invalid timezone + }, + TargetRefs: autoscalingv1alpha1.TargetRefs{ + TortoiseName: "test-tortoise-invalid-tz", + }, + Strategy: autoscalingv1alpha1.Strategy{ + Static: autoscalingv1alpha1.StaticStrategy{ + MinimumMinReplicas: int32Ptr(3), + MinAllocatedResources: &autoscalingv1alpha1.ResourceRequirements{ + CPU: "500m", + Memory: "512Mi", + }, + }, + }, + }, + } + + Expect(k8sClient.Create(ctx, scheduledScaling)).Should(Succeed()) + + // Wait for the controller to process and set status to Failed + resourceLookupKey := types.NamespacedName{ + Name: scheduledScaling.Name, + Namespace: scheduledScaling.Namespace, + } + createdResource := &autoscalingv1alpha1.ScheduledScaling{} + + Eventually(func() autoscalingv1alpha1.ScheduledScalingPhase { + err := k8sClient.Get(ctx, resourceLookupKey, createdResource) + if err != nil { + return "" + } + return createdResource.Status.Phase + }, timeout, interval).Should(Equal(autoscalingv1alpha1.ScheduledScalingPhaseFailed)) + + // Verify the error reason + Expect(createdResource.Status.Reason).Should(Equal("InvalidSchedule")) + Expect(createdResource.Status.Message).Should(ContainSubstring("invalid timeZone")) + + // Clean up + Expect(k8sClient.Delete(ctx, scheduledScaling)).Should(Succeed()) + Expect(k8sClient.Delete(ctx, t)).Should(Succeed()) + }) + + It("Should handle cron with default timezone", func() { + ctx := context.Background() + + // Create a baseline Tortoise that ScheduledScaling will target + t := &autoscalingv1beta3.Tortoise{ + ObjectMeta: metav1.ObjectMeta{Name: "test-tortoise", Namespace: 
"default"}, + Spec: autoscalingv1beta3.TortoiseSpec{ + ResourcePolicy: []autoscalingv1beta3.ContainerResourcePolicy{ + { + ContainerName: "app", + MinAllocatedResources: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("100m"), + v1.ResourceMemory: resource.MustParse("128Mi"), + }, + }, + }, + }, + } + Expect(k8sClient.Create(ctx, t)).Should(Succeed()) + + // Create a cron ScheduledScaling without explicitly setting timezone (should default to Asia/Tokyo) + // Use a schedule that won't be active during testing (e.g., 2 AM on weekdays) + scheduledScaling := &autoscalingv1alpha1.ScheduledScaling{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-default-timezone", + Namespace: "default", + }, + Spec: autoscalingv1alpha1.ScheduledScalingSpec{ + Schedule: autoscalingv1alpha1.Schedule{ + Type: autoscalingv1alpha1.ScheduleTypeCron, + CronExpression: "0 9 * * 0,6", // 9 AM on weekends only (Sunday=0, Saturday=6, should not be active during weekdays) + Duration: "8h", + // TimeZone: not specified, should default to Asia/Tokyo + }, + TargetRefs: autoscalingv1alpha1.TargetRefs{ + TortoiseName: "test-tortoise", + }, + Strategy: autoscalingv1alpha1.Strategy{ + Static: autoscalingv1alpha1.StaticStrategy{ + MinimumMinReplicas: int32Ptr(3), + MinAllocatedResources: &autoscalingv1alpha1.ResourceRequirements{ + CPU: "500m", + Memory: "512Mi", + }, + }, + }, + }, + } + + Expect(k8sClient.Create(ctx, scheduledScaling)).Should(Succeed()) + + // Verify the resource was created successfully (should not fail validation) + resourceLookupKey := types.NamespacedName{ + Name: scheduledScaling.Name, + Namespace: scheduledScaling.Namespace, + } + createdResource := &autoscalingv1alpha1.ScheduledScaling{} + + // Should be in Pending state (waiting for schedule), not Failed + Eventually(func() autoscalingv1alpha1.ScheduledScalingPhase { + err := k8sClient.Get(ctx, resourceLookupKey, createdResource) + if err != nil { + return "" + } + return createdResource.Status.Phase + }, timeout, 
interval).Should(Equal(autoscalingv1alpha1.ScheduledScalingPhasePending)) + + // Should not have error messages + Expect(createdResource.Status.Phase).ShouldNot(Equal(autoscalingv1alpha1.ScheduledScalingPhaseFailed)) + + // Clean up + Expect(k8sClient.Delete(ctx, scheduledScaling)).Should(Succeed()) + Expect(k8sClient.Delete(ctx, t)).Should(Succeed()) + }) + + }) +}) diff --git a/internal/controller/suite_test.go b/internal/controller/suite_test.go index 1e0ee6f4..566a13e1 100644 --- a/internal/controller/suite_test.go +++ b/internal/controller/suite_test.go @@ -26,6 +26,7 @@ SOFTWARE. package controller import ( + "context" "fmt" "os" "path/filepath" @@ -39,11 +40,14 @@ import ( v1 "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1" "k8s.io/client-go/rest" "k8s.io/utils/ptr" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/envtest" logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/log/zap" + "sigs.k8s.io/controller-runtime/pkg/metrics/server" + autoscalingv1alpha1 "github.com/mercari/tortoise/api/v1alpha1" autoscalingv1beta3 "github.com/mercari/tortoise/api/v1beta3" . 
"github.com/onsi/ginkgo/v2" @@ -57,6 +61,7 @@ var cfg *rest.Config var k8sClient client.Client var testEnv *envtest.Environment var scheme = runtime.NewScheme() +var controllerMgrCancel context.CancelFunc func TestAPIs(t *testing.T) { RegisterFailHandler(Fail) @@ -83,6 +88,8 @@ var _ = BeforeSuite(func() { Expect(err).NotTo(HaveOccurred()) Expect(cfg).NotTo(BeNil()) + err = autoscalingv1alpha1.AddToScheme(scheme) + Expect(err).NotTo(HaveOccurred()) err = autoscalingv1beta3.AddToScheme(scheme) Expect(err).NotTo(HaveOccurred()) err = v1.AddToScheme(scheme) @@ -98,6 +105,29 @@ var _ = BeforeSuite(func() { Expect(err).NotTo(HaveOccurred()) Expect(k8sClient).NotTo(BeNil()) + // Start the controller manager + By("starting the controller manager") + mgr, err := ctrl.NewManager(cfg, ctrl.Options{ + Scheme: scheme, + Metrics: server.Options{BindAddress: "0"}, + }) + Expect(err).NotTo(HaveOccurred()) + + // Setup the ScheduledScaling controller + err = (&ScheduledScalingReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr) + Expect(err).NotTo(HaveOccurred()) + + // Start the manager with a cancellable context + ctx, cancel := context.WithCancel(context.Background()) + controllerMgrCancel = cancel + go func() { + err = mgr.Start(ctx) + Expect(err).NotTo(HaveOccurred()) + }() + // fix a time to reduce the diff when regenerating the files. 
onlyTestNow = ptr.To(time.Date(2023, 1, 1, 0, 0, 0, 0, time.UTC)) }) @@ -120,6 +150,12 @@ $ rm -rf %s return } + // Stop manager first to help envtest terminate cleanly + if controllerMgrCancel != nil { + controllerMgrCancel() + time.Sleep(200 * time.Millisecond) + } + err := testEnv.Stop() Expect(err).NotTo(HaveOccurred()) }) diff --git a/internal/controller/tortoise_controller_test.go b/internal/controller/tortoise_controller_test.go index 418cb911..dd988cd9 100644 --- a/internal/controller/tortoise_controller_test.go +++ b/internal/controller/tortoise_controller_test.go @@ -21,6 +21,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/config" + "sigs.k8s.io/controller-runtime/pkg/metrics/server" "sigs.k8s.io/yaml" "github.com/mercari/tortoise/api/v1beta3" @@ -246,6 +247,7 @@ func removeUnnecessaryFields(rawdata []byte) ([]byte, error) { func startController(ctx context.Context) func() { mgr, err := ctrl.NewManager(cfg, ctrl.Options{ Scheme: scheme, + Metrics: server.Options{BindAddress: "0"}, LeaderElection: false, Controller: config.Controller{ SkipNameValidation: ptr.To(true), diff --git a/pkg/hpa/service.go b/pkg/hpa/service.go index 83ae0a23..6a26459b 100644 --- a/pkg/hpa/service.go +++ b/pkg/hpa/service.go @@ -8,6 +8,8 @@ import ( "reflect" "regexp" "sort" + "strconv" + "strings" "time" v2 "k8s.io/api/autoscaling/v2" @@ -420,13 +422,24 @@ func (c *Service) ChangeHPAFromTortoiseRecommendation(tortoise *autoscalingv1bet recommendMax = *tortoise.Spec.MaxReplicas } + // ScheduledScaling: ensure max is not below the scheduled min override + var ssMinOverride int32 + if tortoise.Annotations != nil { + if v, ok := tortoise.Annotations["autoscaling.mercari.com/scheduledscaling-min-replicas"]; ok && v != "" { + if parsed, perr := strconv.ParseInt(v, 10, 32); perr == nil && parsed > 0 { + ssMinOverride = int32(parsed) + } + } + } + if ssMinOverride > 0 && recommendMax < ssMinOverride { + 
recommendMax = ssMinOverride + } + if recommendMax > c.maximumMaxReplica { c.recorder.Event(tortoise, corev1.EventTypeWarning, event.WarningHittingHardMaxReplicaLimit, fmt.Sprintf("MaxReplica (%v) suggested from Tortoise (%s/%s) hits a cluster-wide maximum replica number (%v). It wouldn't be a problem until the replica number actually grows to %v though, you may want to reach out to your cluster admin.", recommendMax, tortoise.Namespace, tortoise.Name, c.maximumMaxReplica, c.maximumMaxReplica)) recommendMax = c.maximumMaxReplica } - hpa.Spec.MaxReplicas = recommendMax - recommendMin, err := GetReplicasRecommendation(tortoise.Status.Recommendations.Horizontal.MinReplicas, now) if err != nil { return nil, tortoise, fmt.Errorf("get minReplicas recommendation: %w", err) @@ -446,7 +459,13 @@ func (c *Service) ChangeHPAFromTortoiseRecommendation(tortoise *autoscalingv1bet switch tortoise.Status.TortoisePhase { case autoscalingv1beta3.TortoisePhaseEmergency: // when emergency mode, we set the same value on minReplicas. - minToActuallyApply = recommendMax + // However, if ScheduledScaling is active, respect its minReplicas override + if ssMinOverride > 0 && ssMinOverride > recommendMax { + // ScheduledScaling requires higher minReplicas than emergency mode would set + minToActuallyApply = ssMinOverride + } else { + minToActuallyApply = recommendMax + } case autoscalingv1beta3.TortoisePhaseBackToNormal: // gradually reduce the minReplicas. 
currentMin := *hpa.Spec.MinReplicas @@ -463,6 +482,23 @@ func (c *Service) ChangeHPAFromTortoiseRecommendation(tortoise *autoscalingv1bet minToActuallyApply = recommendMin } + // ScheduledScaling: enforce configured minimum replicas during Active + if ssMinOverride > 0 && minToActuallyApply < ssMinOverride { + minToActuallyApply = ssMinOverride + } + + // Ensure consistency: MaxReplicas >= MinReplicas + if recommendMax < minToActuallyApply { + recommendMax = minToActuallyApply + if recommendMax > c.maximumMaxReplica { + recommendMax = c.maximumMaxReplica + if minToActuallyApply > recommendMax { + minToActuallyApply = recommendMax + } + } + } + + hpa.Spec.MaxReplicas = recommendMax hpa.Spec.MinReplicas = &minToActuallyApply if tortoise.Spec.UpdateMode != autoscalingv1beta3.UpdateModeOff && recordMetrics { // We don't want to record applied* metric when UpdateMode is Off. @@ -810,26 +846,76 @@ func (c *Service) IsHpaMetricAvailable(ctx context.Context, tortoise *autoscalin currentMetrics := currenthpa.Status.CurrentMetrics for _, condition := range conditions { - if condition.Type == "ScalingActive" && condition.Status == "False" && condition.Reason == "FailedGetResourceMetric" { + if condition.Type == "ScalingActive" && condition.Status == "False" && isMetricFailureReason(condition.Reason) { // Always apply grace period before triggering emergency mode to handle temporary // metric unavailability during HPA updates, deployments, scheduled scaling, etc. 
conditionAge := time.Since(condition.LastTransitionTime.Time) - if conditionAge < c.emergencyModeGracePeriod { + gracePeriod := c.emergencyModeGracePeriod + + // Extend grace period if ScheduledScaling is active to allow more time for new pods to start + if tortoise.Annotations != nil { + if _, exists := tortoise.Annotations["autoscaling.mercari.com/scheduledscaling-min-replicas"]; exists { + // Double the grace period during ScheduledScaling to account for pod startup time + gracePeriod = gracePeriod * 2 + logger.V(1).Info("ScheduledScaling detected, extending grace period", + "originalGracePeriod", c.emergencyModeGracePeriod, + "extendedGracePeriod", gracePeriod) + } + } + + if conditionAge < gracePeriod { logger.V(1).Info("HPA metric temporarily unavailable, giving grace time before emergency mode", - "gracePeriod", c.emergencyModeGracePeriod, - "conditionAge", conditionAge) + "gracePeriod", gracePeriod, + "conditionAge", conditionAge, + "reason", condition.Reason) return true // Give grace period before emergency mode } // Grace period expired, switch to Emergency mode since no metrics - logger.Info("HPA failed to get resource metrics after grace period, switch to emergency mode", - "gracePeriod", c.emergencyModeGracePeriod, - "conditionAge", conditionAge) + logger.Info("HPA failed to get metrics after grace period, switch to emergency mode", + "gracePeriod", gracePeriod, + "conditionAge", conditionAge, + "reason", condition.Reason) return false } } hasValidMetrics := false + + // Check if currentMetrics is empty or has empty entries (common during rapid scaling) + if len(currentMetrics) == 0 { + logger.V(1).Info("HPA currentMetrics is empty, checking grace period", "hpa", currenthpa.Name) + + // Use lastScaleTime to determine grace period for empty metrics + if currenthpa.Status.LastScaleTime != nil { + timeSinceLastScale := time.Since(currenthpa.Status.LastScaleTime.Time) + gracePeriod := c.emergencyModeGracePeriod + + // Extend grace period if 
ScheduledScaling is active + if tortoise.Annotations != nil { + if _, exists := tortoise.Annotations["autoscaling.mercari.com/scheduledscaling-min-replicas"]; exists { + gracePeriod = gracePeriod * 2 + } + } + + if timeSinceLastScale < gracePeriod { + logger.V(1).Info("Empty metrics within grace period since last scale", + "hpa", currenthpa.Name, + "timeSinceLastScale", timeSinceLastScale, + "gracePeriod", gracePeriod) + return true + } + } + // If no lastScaleTime or grace period expired, continue to check for emergency mode + } + for _, currentMetric := range currentMetrics { + // Check for empty metric types (common during scaling) + if currentMetric.Type == "" { + // For individual empty metric entries, we'll continue processing other metrics + // The final grace period check at the end will handle the overall empty metrics situation + continue // Skip empty metric entries but continue checking other metrics + } + switch currentMetric.Type { case v2.ContainerResourceMetricSourceType: if currentMetric.ContainerResource != nil && !currentMetric.ContainerResource.Current.Value.IsZero() { @@ -841,6 +927,14 @@ func (c *Service) IsHpaMetricAvailable(ctx context.Context, tortoise *autoscalin // External metrics are also valid hasValidMetrics = true } + case v2.ObjectMetricSourceType: + if currentMetric.Object != nil && !currentMetric.Object.Current.Value.IsZero() { + hasValidMetrics = true + } + case v2.PodsMetricSourceType: + if currentMetric.Pods != nil && !currentMetric.Pods.Current.Value.IsZero() { + hasValidMetrics = true + } } } @@ -848,6 +942,58 @@ func (c *Service) IsHpaMetricAvailable(ctx context.Context, tortoise *autoscalin return true } + // If ScheduledScaling is active and we have no valid metrics, check if we should still give grace time + if tortoise.Annotations != nil { + if _, exists := tortoise.Annotations["autoscaling.mercari.com/scheduledscaling-min-replicas"]; exists { + // For empty metrics during ScheduledScaling, we need to check the HPA's 
lastScaleTime + // to determine how long the scaling has been happening + if currenthpa.Status.LastScaleTime != nil { + timeSinceLastScale := time.Since(currenthpa.Status.LastScaleTime.Time) + gracePeriod := c.emergencyModeGracePeriod * 2 // Extended grace during ScheduledScaling + + if timeSinceLastScale < gracePeriod { + logger.Info("No valid metrics during ScheduledScaling, but within grace period since last scale", + "hpa", currenthpa.Name, + "tortoise", tortoise.Name, + "timeSinceLastScale", timeSinceLastScale, + "gracePeriod", gracePeriod) + return true // Still within grace period + } else { + logger.Info("No valid metrics during ScheduledScaling and grace period expired since last scale", + "hpa", currenthpa.Name, + "tortoise", tortoise.Name, + "timeSinceLastScale", timeSinceLastScale, + "gracePeriod", gracePeriod) + // Fall through to trigger emergency mode + } + } else { + // No lastScaleTime available, give benefit of doubt during ScheduledScaling + logger.Info("No valid metrics during ScheduledScaling and no lastScaleTime, giving additional grace", + "hpa", currenthpa.Name, + "tortoise", tortoise.Name) + return true + } + } + } + logger.Info("HPA looks unready because all the metrics indicate zero", "hpa", currenthpa.Name) return false } + +// isMetricFailureReason returns true if the HPA condition reason indicates a metric failure +// that should be covered by the emergency mode grace period +func isMetricFailureReason(reason string) bool { + switch reason { + case "FailedGetResourceMetric", + "FailedGetExternalMetric", + "FailedGetObjectMetric", + "FailedGetPodsMetric", + "FailedComputeMetricsReplicas", + "FailedComputeReplicas": + return true + default: + // Also catch any other "Failed*Metric*" patterns that might be added in future Kubernetes versions + return strings.Contains(reason, "Failed") && + (strings.Contains(reason, "Metric") || strings.Contains(reason, "Compute")) + } +}