diff --git a/1.architectures/7.sagemaker-hyperpod-eks/task-governance/README.md b/1.architectures/7.sagemaker-hyperpod-eks/task-governance/README.md index f9983c9b0..ba40752b1 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/task-governance/README.md +++ b/1.architectures/7.sagemaker-hyperpod-eks/task-governance/README.md @@ -61,6 +61,8 @@ aws sagemaker \ Each team requires a Compute Allocation to manage their compute capacity. Both teams will have 2 instances allocated, 100 fair-share weight, and 50% borrowing capability. +If you are deploying the HyperPod EKS Terraform module, you can configure compute allocations with the `task_governance_compute_quotas` variable instead of running the following AWS CLI commands manually. See the [Terraform module README](../terraform-modules/README.md#task-governance-compute-allocations) for an example. + ``` aws sagemaker \ --region $REGION \ @@ -200,4 +202,4 @@ Status: Reason: Preempted Status: True Type: Requeued -``` \ No newline at end of file +``` diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/README.md b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/README.md index 4c901b67d..f92f4a759 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/README.md +++ b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/README.md @@ -6,6 +6,7 @@ The diagram below depicts the Terraform modules that have been bundled into a si --- + ## Get the Modules Clone the AWSome Distributed Training repository and navigate to the terraform-modules directory: ```bash @@ -500,6 +501,41 @@ Set the following parameters to `true` in your `custom.tfvars` file to enable op | `create_hyperpod_inference_operator_module` | Installs the [HyperPod inference operator addon](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod-model-deployment-setup.html) for deployment and management of machine learning inference endpoints | | `create_observability_module` | Installs the [HyperPod 
Observability addon](https://docs.aws.amazon.com/sagemaker/latest/dg/hyperpod-observability-addon-setup.html) to publish key metrics to Amazon Managed Service for Prometheus and displays them in Amazon Managed Grafana dashboards | +#### Task governance compute allocations + +When `create_task_governance_module = true`, you can also manage SageMaker HyperPod task governance compute allocations with Terraform by setting `task_governance_compute_quotas`. The Terraform providers do not currently expose a first-class SageMaker compute quota resource, so this module uses the AWS CLI from a Terraform `local-exec` provisioner to call the SageMaker `create-compute-quota`, `update-compute-quota`, and `delete-compute-quota` APIs. The machine running Terraform must have `bash`, `python3`, and the AWS CLI installed, and the AWS CLI must be authenticated for the target account. + +```hcl +create_task_governance_module = true + +task_governance_compute_quotas = [ + { + name = "team-a-quota" + description = "Team A compute allocation" + + compute_quota_resources = [ + { + instance_type = "ml.g5.8xlarge" + count = 2 + } + ] + + resource_sharing_config = { + strategy = "DontLend" + } + + preempt_team_tasks = "LowerPriority" + + target = { + team_name = "team-a" + fair_share_weight = 0 + } + } +] +``` + +For teams that lend and borrow idle capacity, use `strategy = "LendAndBorrow"` and optionally set `borrow_limit` or `absolute_borrow_limits`. + The HyperPod training and inference operators both require the [cert-manager](https://cert-manager.io/) EKS addon to be installed as a prerequisite. The variable `enable_cert_manager` is set to `true` by default, so that when `create_hyperpod_training_operator_module` or `create_hyperpod_inference_operator_module` are also set to `true`, cert-manager will be installed as a dependency of the operators. 
In other words, this stack will not install cert-manager as a standalone component, but it can be disabled if you already have it installed on an existing EKS cluster and wish to use one of the HyperPod operators. The HyperPod inference operator also has the following additional dependencies: @@ -598,5 +634,3 @@ Do not attempt to install these addons later using the console. Once you have your `rig_custom.tfvars` file is created, you can proceed to deployment. --- - - diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/main.tf b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/main.tf index 538eca04f..5b0e0c38d 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/main.tf +++ b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/main.tf @@ -249,7 +249,10 @@ module "task_governance" { count = local.create_task_governance_module ? 1 : 0 source = "./modules/task_governance" + aws_region = var.aws_region + compute_quotas = var.task_governance_compute_quotas eks_cluster_name = var.eks_cluster_name + hyperpod_cluster_arn = length(module.hyperpod_cluster) > 0 ? 
module.hyperpod_cluster[0].hyperpod_cluster_arn : "" depends_on = [module.hyperpod_cluster] } diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/main.tf b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/main.tf index 60243d6e4..68d5c1e0f 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/main.tf +++ b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/main.tf @@ -1,7 +1,90 @@ +locals { + compute_quota_resources = { + for quota in var.compute_quotas : quota.name => [ + for resource in quota.compute_quota_resources : merge( + { + InstanceType = resource.instance_type + }, + resource.count != null ? { + Count = resource.count + } : {}, + resource.accelerators != null ? { + Accelerators = resource.accelerators + } : {}, + resource.vcpu != null ? { + VCpu = resource.vcpu + } : {}, + resource.memory_in_gib != null ? { + MemoryInGiB = resource.memory_in_gib + } : {}, + resource.accelerator_partition != null ? { + AcceleratorPartition = { + Type = resource.accelerator_partition.type + Count = resource.accelerator_partition.count + } + } : {} + ) + ] + } + + compute_quota_absolute_borrow_limits = { + for quota in var.compute_quotas : quota.name => [ + for resource in quota.resource_sharing_config.absolute_borrow_limits : merge( + { + InstanceType = resource.instance_type + }, + resource.count != null ? { + Count = resource.count + } : {}, + resource.accelerators != null ? { + Accelerators = resource.accelerators + } : {}, + resource.vcpu != null ? { + VCpu = resource.vcpu + } : {}, + resource.memory_in_gib != null ? { + MemoryInGiB = resource.memory_in_gib + } : {}, + resource.accelerator_partition != null ? 
{ + AcceleratorPartition = { + Type = resource.accelerator_partition.type + Count = resource.accelerator_partition.count + } + } : {} + ) + ] + } + + compute_quota_configs = { + for quota in var.compute_quotas : quota.name => { + ComputeQuotaResources = local.compute_quota_resources[quota.name] + ResourceSharingConfig = merge( + { + Strategy = quota.resource_sharing_config.strategy + }, + quota.resource_sharing_config.borrow_limit != null ? { + BorrowLimit = quota.resource_sharing_config.borrow_limit + } : {}, + length(local.compute_quota_absolute_borrow_limits[quota.name]) > 0 ? { + AbsoluteBorrowLimits = local.compute_quota_absolute_borrow_limits[quota.name] + } : {} + ) + PreemptTeamTasks = quota.preempt_team_tasks + } + } + + compute_quota_targets = { + for quota in var.compute_quotas : quota.name => { + TeamName = quota.target.team_name + FairShareWeight = quota.target.fair_share_weight + } + } +} + # EKS Addon for Task Governance resource "aws_eks_addon" "task_governance" { - cluster_name = var.eks_cluster_name - addon_name = "amazon-sagemaker-hyperpod-taskgovernance" + cluster_name = var.eks_cluster_name + addon_name = "amazon-sagemaker-hyperpod-taskgovernance" resolve_conflicts_on_create = "OVERWRITE" resolve_conflicts_on_update = "OVERWRITE" } @@ -18,4 +101,51 @@ resource "null_resource" "wait_for_kueue_webhook" { } depends_on = [aws_eks_addon.task_governance] -} \ No newline at end of file +} + +resource "null_resource" "compute_quota" { + for_each = { for quota in var.compute_quotas : quota.name => quota } + + triggers = { + activation_state = each.value.activation_state + cluster_arn = var.hyperpod_cluster_arn + compute_quota_config = jsonencode(local.compute_quota_configs[each.key]) + compute_quota_target = jsonencode(local.compute_quota_targets[each.key]) + description = each.value.description + name = each.value.name + region = var.aws_region + } + + provisioner "local-exec" { + command = "bash ${path.module}/scripts/manage-compute-quota.sh apply" + 
environment = { + ACTIVATION_STATE = self.triggers.activation_state + AWS_REGION = self.triggers.region + CLUSTER_ARN = self.triggers.cluster_arn + COMPUTE_QUOTA_CONFIG = self.triggers.compute_quota_config + COMPUTE_QUOTA_TARGET = self.triggers.compute_quota_target + DESCRIPTION = self.triggers.description + QUOTA_NAME = self.triggers.name + } + } + + provisioner "local-exec" { + when = destroy + command = "bash ${path.module}/scripts/manage-compute-quota.sh delete" + environment = { + ACTIVATION_STATE = self.triggers.activation_state + AWS_REGION = self.triggers.region + CLUSTER_ARN = self.triggers.cluster_arn + COMPUTE_QUOTA_CONFIG = self.triggers.compute_quota_config + COMPUTE_QUOTA_TARGET = self.triggers.compute_quota_target + DESCRIPTION = self.triggers.description + QUOTA_NAME = self.triggers.name + } + } + + lifecycle { + create_before_destroy = true + } + + depends_on = [null_resource.wait_for_kueue_webhook] +} diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/outputs.tf b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/outputs.tf index 5846b16ab..d32ffd855 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/outputs.tf +++ b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/outputs.tf @@ -1,4 +1,9 @@ output "task_governance_addon_arn" { description = "ARN of the task governance addon" - value = aws_eks_addon.task_governance.arn -} \ No newline at end of file + value = aws_eks_addon.task_governance.arn +} + +output "compute_quota_names" { + description = "Names of task governance compute allocations managed by this module" + value = keys(null_resource.compute_quota) +} diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/scripts/manage-compute-quota.sh 
b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/scripts/manage-compute-quota.sh
new file mode 100644
index 000000000..b68c2e94c
--- /dev/null
+++ b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/scripts/manage-compute-quota.sh
@@ -0,0 +1,327 @@
+#!/usr/bin/env bash
+#
+# Creates, updates, or deletes one SageMaker HyperPod compute quota with the AWS
+# CLI. Invoked as: manage-compute-quota.sh apply|delete
+#
+# Inputs are read from the environment:
+#   AWS_REGION, CLUSTER_ARN, QUOTA_NAME   required, non-empty
+#   COMPUTE_QUOTA_CONFIG                  required JSON for --compute-quota-config
+#   COMPUTE_QUOTA_TARGET                  required JSON for --compute-quota-target
+#   ACTIVATION_STATE                      required value for --activation-state
+#   DESCRIPTION                           must be set, may be empty
+#   COMPUTE_QUOTA_WAIT_ATTEMPTS           optional number of polls (default 60)
+#   COMPUTE_QUOTA_WAIT_SECONDS            optional seconds between polls (default 10)
+set -euo pipefail
+
+ACTION="${1:-}"
+# An empty AWS_PAGER turns off the AWS CLI pager for every call below.
+export AWS_PAGER=""
+
+# Prints its arguments on stderr, keeping stdout free for captured JSON.
+log() {
+  printf '%s\n' "$*" >&2
+}
+
+# Exits 1 unless every input variable is set and all but DESCRIPTION are
+# non-empty.
+require_env() {
+  local name
+
+  for name in AWS_REGION CLUSTER_ARN QUOTA_NAME COMPUTE_QUOTA_CONFIG COMPUTE_QUOTA_TARGET ACTIVATION_STATE DESCRIPTION; do
+    if [[ -z "${!name+x}" ]]; then
+      log "Missing required environment variable: ${name}"
+      exit 1
+    fi
+  done
+
+  for name in AWS_REGION CLUSTER_ARN QUOTA_NAME COMPUTE_QUOTA_CONFIG COMPUTE_QUOTA_TARGET ACTIVATION_STATE; do
+    if [[ -z "${!name}" ]]; then
+      log "Required environment variable cannot be empty: ${name}"
+      exit 1
+    fi
+  done
+}
+
+# Runs an "aws sagemaker" subcommand in AWS_REGION.
+aws_sm() {
+  aws sagemaker --region "${AWS_REGION}" "$@"
+}
+
+# Prints the top-level key $1 of the JSON object $2. A missing key or a JSON
+# null prints an empty line rather than the text "None".
+json_field() {
+  local field="$1"
+  local json="$2"
+
+  python3 -c 'import json, sys; value = json.load(sys.stdin).get(sys.argv[1]); print("" if value is None else value)' "${field}" <<< "${json}"
+}
+
+# Prints the compact JSON summary of the one non-deleted quota named QUOTA_NAME
+# in CLUSTER_ARN, or nothing when none exists. Fails with status 2 when several
+# quotas match.
+find_quota() {
+  aws_sm list-compute-quotas \
+    --cluster-arn "${CLUSTER_ARN}" \
+    --name-contains "${QUOTA_NAME}" \
+    --output json |
+    python3 -c '
+import json
+import os
+import sys
+
+data = json.load(sys.stdin)
+name = os.environ["QUOTA_NAME"]
+cluster_arn = os.environ["CLUSTER_ARN"]
+# --name-contains matches substrings, so keep exact name matches only.
+matches = [
+    quota for quota in data.get("ComputeQuotaSummaries", [])
+    if quota.get("Name") == name
+    and quota.get("ClusterArn") == cluster_arn
+    and quota.get("Status") != "Deleted"
+]
+
+if len(matches) > 1:
+    print(
+        f"Found {len(matches)} compute quotas named {name!r} in cluster {cluster_arn!r}; expected at most one.",
+        file=sys.stderr,
+    )
+    sys.exit(2)
+
+if matches:
+    print(json.dumps(matches[0], sort_keys=True, separators=(",", ":")))
+'
+}
+
+# Prints the describe-compute-quota JSON for quota id $1.
+describe_quota() {
+  local quota_id="$1"
+
+  aws_sm describe-compute-quota \
+    --compute-quota-id "${quota_id}" \
+    --output json
+}
+
+# Succeeds when the AWS CLI error text $1 says the resource does not exist.
+is_not_found_error() {
+  [[ "$1" == *ResourceNotFound* || "$1" == *"not found"* || "$1" == *"does not exist"* ]]
+}
+
+# Returns 0 when the describe output $1 matches the desired Description,
+# ComputeQuotaConfig, ComputeQuotaTarget, and ActivationState from the
+# environment, and 1 otherwise. Empty values are dropped and lists are sorted
+# before comparing, so ordering and empty fields do not cause a mismatch.
+quota_matches_desired() {
+  local current_json="$1"
+
+  printf '%s' "${current_json}" |
+    python3 -c '
+import json
+import os
+import sys
+
+def clean(value):
+    if isinstance(value, dict):
+        cleaned = {}
+        for key, child in value.items():
+            child = clean(child)
+            if child is None or child == {} or child == []:
+                continue
+            cleaned[key] = child
+        return cleaned
+    if isinstance(value, list):
+        cleaned = [
+            clean(child) for child in value
+        ]
+        cleaned = [
+            child for child in cleaned
+            if child is not None and child != {} and child != []
+        ]
+        return sorted(cleaned, key=lambda child: json.dumps(child, sort_keys=True, separators=(",", ":")))
+    return value
+
+current = json.load(sys.stdin)
+desired = {
+    "Description": os.environ.get("DESCRIPTION", ""),
+    "ComputeQuotaConfig": json.loads(os.environ["COMPUTE_QUOTA_CONFIG"]),
+    "ComputeQuotaTarget": json.loads(os.environ["COMPUTE_QUOTA_TARGET"]),
+    "ActivationState": os.environ["ACTIVATION_STATE"],
+}
+actual = {
+    "Description": current.get("Description", ""),
+    "ComputeQuotaConfig": current.get("ComputeQuotaConfig", {}),
+    "ComputeQuotaTarget": current.get("ComputeQuotaTarget", {}),
+    "ActivationState": current.get("ActivationState", ""),
+}
+
+sys.exit(0 if clean(actual) == clean(desired) else 1)
+'
+}
+
+# Polls quota id $1 until it settles.
+#   mode "ready" (default): returns 0 on status Created or Updated.
+#   mode "delete": also returns 0 once the quota is Deleted or not found.
+# Exits 1 on a failed status, on any other describe error, or after the
+# configured number of attempts.
+wait_for_stable_quota() {
+  local quota_id="$1"
+  local mode="${2:-ready}"
+  local attempts="${COMPUTE_QUOTA_WAIT_ATTEMPTS:-60}"
+  local sleep_seconds="${COMPUTE_QUOTA_WAIT_SECONDS:-10}"
+  local current_json
+  local describe_error
+  local status
+  local failure_reason
+  local attempt
+
+  for ((attempt = 1; attempt <= attempts; attempt++)); do
+    if ! current_json="$(describe_quota "${quota_id}" 2>/dev/null)"; then
+      # Repeat the call keeping only stderr, so the reason can be inspected.
+      describe_error="$(describe_quota "${quota_id}" 2>&1 >/dev/null || true)"
+      # Only a not-found answer proves the quota is gone; credential, network,
+      # or throttling errors must not be reported as a finished delete.
+      if [[ "${mode}" == "delete" ]] && is_not_found_error "${describe_error}"; then
+        return 0
+      fi
+      log "Failed to describe compute quota ${quota_id}: ${describe_error}"
+      exit 1
+    fi
+
+    status="$(json_field Status "${current_json}")"
+    case "${status}" in
+      Created | Updated)
+        return 0
+        ;;
+      Deleted)
+        if [[ "${mode}" == "delete" ]]; then
+          return 0
+        fi
+        log "Compute quota ${quota_id} was deleted while waiting for it to become ready"
+        exit 1
+        ;;
+      CreateFailed | CreateRollbackFailed | UpdateFailed | UpdateRollbackFailed | DeleteFailed | DeleteRollbackFailed)
+        failure_reason="$(json_field FailureReason "${current_json}")"
+        log "Compute quota ${quota_id} reached failed status ${status}: ${failure_reason}"
+        exit 1
+        ;;
+    esac
+
+    sleep "${sleep_seconds}"
+  done
+
+  log "Timed out waiting for compute quota ${quota_id} to reach a stable status"
+  exit 1
+}
+
+# Creates the quota when it does not exist, otherwise updates it if the live
+# settings differ from the desired ones. Waits for a stable status either way.
+apply_quota() {
+  local quota_json
+  local quota_id
+  local current_json
+  local result_json
+  local target_version
+
+  quota_json="$(find_quota)"
+
+  if [[ -z "${quota_json}" ]]; then
+    log "Creating SageMaker compute quota ${QUOTA_NAME}"
+    local -a create_args
+    create_args=(
+      create-compute-quota
+      --name "${QUOTA_NAME}"
+      --cluster-arn "${CLUSTER_ARN}"
+      --compute-quota-config "${COMPUTE_QUOTA_CONFIG}"
+      --compute-quota-target "${COMPUTE_QUOTA_TARGET}"
+      --activation-state "${ACTIVATION_STATE}"
+      --output json
+    )
+    # The description is only sent on create when it is non-empty.
+    if [[ -n "${DESCRIPTION}" ]]; then
+      create_args+=(--description "${DESCRIPTION}")
+    fi
+
+    result_json="$(aws_sm "${create_args[@]}")"
+    quota_id="$(json_field ComputeQuotaId "${result_json}")"
+    wait_for_stable_quota "${quota_id}" ready
+    return
+  fi
+
+  quota_id="$(json_field ComputeQuotaId "${quota_json}")"
+  wait_for_stable_quota "${quota_id}" ready
+  current_json="$(describe_quota "${quota_id}")"
+
+  if quota_matches_desired "${current_json}"; then
+    log "SageMaker compute quota ${QUOTA_NAME} is up to date"
+    return
+  fi
+
+  target_version="$(json_field ComputeQuotaVersion "${current_json}")"
+  log "Updating SageMaker compute quota ${QUOTA_NAME} at version ${target_version}"
+
+  aws_sm update-compute-quota \
+    --compute-quota-id "${quota_id}" \
+    --target-version "${target_version}" \
+    --compute-quota-config "${COMPUTE_QUOTA_CONFIG}" \
+    --compute-quota-target "${COMPUTE_QUOTA_TARGET}" \
+    --activation-state "${ACTIVATION_STATE}" \
+    --description "${DESCRIPTION}" \
+    --output json >/dev/null
+
+  wait_for_stable_quota "${quota_id}" ready
+}
+
+# Deletes the quota only when its live settings still match this instance's
+# desired values; otherwise it is left in place.
+delete_quota() {
+  local quota_json
+  local quota_id
+  local current_json
+
+  quota_json="$(find_quota)"
+
+  if [[ -z "${quota_json}" ]]; then
+    log "SageMaker compute quota ${QUOTA_NAME} is already absent"
+    return
+  fi
+
+  quota_id="$(json_field ComputeQuotaId "${quota_json}")"
+  wait_for_stable_quota "${quota_id}" ready
+  current_json="$(describe_quota "${quota_id}")"
+
+  if ! quota_matches_desired "${current_json}"; then
+    log "Skipping delete for ${QUOTA_NAME}; the live quota no longer matches this Terraform instance"
+    return
+  fi
+
+  log "Deleting SageMaker compute quota ${QUOTA_NAME}"
+  aws_sm delete-compute-quota \
+    --compute-quota-id "${quota_id}" \
+    --output json >/dev/null
+  wait_for_stable_quota "${quota_id}" delete
+}
+
+if [[ "${ACTION}" != "apply" && "${ACTION}" != "delete" ]]; then
+  log "Usage: $0 apply|delete"
+  exit 1
+fi
+
+if ! command -v aws >/dev/null 2>&1; then
+  log "The AWS CLI is required to manage SageMaker compute quotas"
+  exit 1
+fi
+
+if ! command -v python3 >/dev/null 2>&1; then
+  log "python3 is required to compare SageMaker compute quota JSON"
+  exit 1
+fi
+
+require_env
+
+case "${ACTION}" in
+  apply)
+    apply_quota
+    ;;
+  delete)
+    delete_quota
+    ;;
+esac
diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/variables.tf b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/variables.tf index d76c182a0..0cca7742a 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/variables.tf +++ b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/variables.tf @@ -1,4 +1,111 @@ variable "eks_cluster_name" { description = "The name of the EKS cluster" type = string -} \ No newline at end of file +} + +variable "aws_region" { + description = "AWS Region where the HyperPod cluster and compute allocations exist" + type = string +} + +variable "hyperpod_cluster_arn" { + description = "ARN of the HyperPod cluster used for compute allocations" + type = string + default = "" +} + +variable "compute_quotas" { + description = "SageMaker HyperPod task governance compute allocations to create or update" + type = list(object({ + name = string + description = optional(string, "") + activation_state = optional(string, "Enabled") + compute_quota_resources = list(object({ + instance_type = string + count = optional(number) + accelerators = optional(number) + vcpu = optional(number) + memory_in_gib = 
optional(number) + accelerator_partition = optional(object({ + type = string + count = number + })) + })), []) + }), {}) + preempt_team_tasks = optional(string, "LowerPriority") + target = object({ + team_name = string + fair_share_weight = optional(number, 0) + }) + })) + default = [] + + validation { + condition = length(var.compute_quotas) == 0 || var.hyperpod_cluster_arn != "" + error_message = "hyperpod_cluster_arn is required when compute_quotas are configured." + } + + validation { + condition = alltrue([ + for quota in var.compute_quotas : + contains(["Enabled", "Disabled"], quota.activation_state) + ]) + error_message = "Compute quota activation_state must be Enabled or Disabled." + } + + validation { + condition = alltrue([ + for quota in var.compute_quotas : + contains(["Lend", "DontLend", "LendAndBorrow"], quota.resource_sharing_config.strategy) + ]) + error_message = "Compute quota resource_sharing_config.strategy must be Lend, DontLend, or LendAndBorrow." + } + + validation { + condition = alltrue([ + for quota in var.compute_quotas : + quota.resource_sharing_config.borrow_limit == null || (quota.resource_sharing_config.borrow_limit >= 1 && quota.resource_sharing_config.borrow_limit <= 500) + ]) + error_message = "Compute quota resource_sharing_config.borrow_limit must be between 1 and 500 when set." + } + + validation { + condition = alltrue([ + for quota in var.compute_quotas : + contains(["Never", "LowerPriority"], quota.preempt_team_tasks) + ]) + error_message = "Compute quota preempt_team_tasks must be Never or LowerPriority." + } + + validation { + condition = alltrue([ + for quota in var.compute_quotas : + quota.target.fair_share_weight >= 0 && quota.target.fair_share_weight <= 100 + ]) + error_message = "Compute quota target.fair_share_weight must be between 0 and 100." 
+ } + + validation { + condition = alltrue(flatten([ + for quota in var.compute_quotas : [ + for resource in quota.compute_quota_resources : + resource.count != null || resource.accelerators != null || resource.vcpu != null || resource.memory_in_gib != null || resource.accelerator_partition != null + ] + ])) + error_message = "Each compute quota resource must set at least one of count, accelerators, vcpu, memory_in_gib, or accelerator_partition." + } +} diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/outputs.tf b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/outputs.tf index 0af7bcb53..ca579fc7e 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/outputs.tf +++ b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/outputs.tf @@ -130,6 +130,11 @@ output "hyperpod_cluster_status" { value = var.create_hyperpod_module ? module.hyperpod_cluster[0].hyperpod_cluster_status : null } +output "task_governance_compute_quota_names" { + description = "Names of task governance compute allocations managed by this module" + value = local.create_task_governance_module ? module.task_governance[0].compute_quota_names : [] +} + # Region Output output "aws_region" { description = "AWS region" diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/variables.tf b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/variables.tf index 007e162ce..3d8506e8b 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/variables.tf +++ b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/variables.tf @@ -684,6 +684,97 @@ variable "create_task_governance_module" { default = false } +variable "task_governance_compute_quotas" { + description = "SageMaker HyperPod task governance compute allocations to create or update. 
Only used when create_task_governance_module is true." + type = list(object({ + name = string + description = optional(string, "") + activation_state = optional(string, "Enabled") + compute_quota_resources = list(object({ + instance_type = string + count = optional(number) + accelerators = optional(number) + vcpu = optional(number) + memory_in_gib = optional(number) + accelerator_partition = optional(object({ + type = string + count = number + })) + })) + resource_sharing_config = optional(object({ + strategy = optional(string, "LendAndBorrow") + borrow_limit = optional(number) + absolute_borrow_limits = optional(list(object({ + instance_type = string + count = optional(number) + accelerators = optional(number) + vcpu = optional(number) + memory_in_gib = optional(number) + accelerator_partition = optional(object({ + type = string + count = number + })) + })), []) + }), {}) + preempt_team_tasks = optional(string, "LowerPriority") + target = object({ + team_name = string + fair_share_weight = optional(number, 0) + }) + })) + default = [] + + validation { + condition = alltrue([ + for quota in var.task_governance_compute_quotas : + contains(["Enabled", "Disabled"], quota.activation_state) + ]) + error_message = "Task governance compute quota activation_state must be Enabled or Disabled." + } + + validation { + condition = alltrue([ + for quota in var.task_governance_compute_quotas : + contains(["Lend", "DontLend", "LendAndBorrow"], quota.resource_sharing_config.strategy) + ]) + error_message = "Task governance compute quota resource_sharing_config.strategy must be Lend, DontLend, or LendAndBorrow." 
+ } + + validation { + condition = alltrue([ + for quota in var.task_governance_compute_quotas : + quota.resource_sharing_config.borrow_limit == null || (quota.resource_sharing_config.borrow_limit >= 1 && quota.resource_sharing_config.borrow_limit <= 500) + ]) + error_message = "Task governance compute quota resource_sharing_config.borrow_limit must be between 1 and 500 when set." + } + + validation { + condition = alltrue([ + for quota in var.task_governance_compute_quotas : + contains(["Never", "LowerPriority"], quota.preempt_team_tasks) + ]) + error_message = "Task governance compute quota preempt_team_tasks must be Never or LowerPriority." + } + + validation { + condition = alltrue([ + for quota in var.task_governance_compute_quotas : + quota.target.fair_share_weight >= 0 && quota.target.fair_share_weight <= 100 + ]) + error_message = "Task governance compute quota target.fair_share_weight must be between 0 and 100." + } + + validation { + condition = alltrue(flatten([ + for quota in var.task_governance_compute_quotas : [ + for resource in quota.compute_quota_resources : + resource.count != null || resource.accelerators != null || resource.vcpu != null || resource.memory_in_gib != null || resource.accelerator_partition != null + ] + ])) + error_message = "Each task governance compute quota resource must set at least one of count, accelerators, vcpu, memory_in_gib, or accelerator_partition." + } +} + # HyperPod Training Operator variable "create_hyperpod_training_operator_module" { description = "Whether to enable HyperPod Training Operator"