awslabs · FreCap · May 1, 2026
diff --git a/1.architectures/7.sagemaker-hyperpod-eks/task-governance/README.md b/1.architectures/7.sagemaker-hyperpod-eks/task-governance/README.md
@@ -61,6 +61,8 @@ aws sagemaker \
 
 Each team requires a Compute Allocation to manage their compute capacity. Both teams will have 2 instances allocated, 100 fair-share weight, and 50% borrowing capability.
 
+If you are deploying the HyperPod EKS Terraform module, you can configure compute allocations with the `task_governance_compute_quotas` variable instead of running the following AWS CLI commands manually. See the [Terraform module README](../terraform-modules/README.md#task-governance-compute-allocations) for an example.
+
 ```
 aws sagemaker \
     --region $REGION \
@@ -200,4 +202,4 @@ Status:
     Reason:                Preempted
     Status:                True
     Type:                  Requeued
-```
+```
diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/README.md b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/README.md
@@ -6,6 +6,7 @@ The diagram below depicts the Terraform modules that have been bundled into a si
 
 ---
 
+
 ## Get the Modules
 Clone the AWSome Distributed Training repository and navigate to the terraform-modules directory:
 ```bash
@@ -500,6 +501,41 @@ Set the following parameters to `true` in your `custom.tfvars` file to enable op
 | `create_hyperpod_inference_operator_module`  | Installs the [HyperPod inference operator addon](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod-model-deployment-setup.html) for deployment and management of machine learning inference endpoints |
 | `create_observability_module` | Installs the [HyperPod Observability addon](https://docs.aws.amazon.com/sagemaker/latest/dg/hyperpod-observability-addon-setup.html) to publish key metrics to Amazon Managed Service for Prometheus and displays them in Amazon Managed Grafana dashboards | 
 
+#### Task governance compute allocations
+
+When `create_task_governance_module = true`, you can also manage SageMaker HyperPod task governance compute allocations with Terraform by setting `task_governance_compute_quotas`. The Terraform providers do not currently expose a first-class SageMaker compute quota resource, so this module uses the AWS CLI from a Terraform `local-exec` provisioner to call the SageMaker `create-compute-quota`, `update-compute-quota`, and `delete-compute-quota` APIs. The machine running Terraform must have `bash`, the AWS CLI, and `python3` installed, and the AWS CLI must be configured with credentials for the target account.
+
+```hcl
+create_task_governance_module = true
+
+task_governance_compute_quotas = [
+  {
+    name        = "team-a-quota"
+    description = "Team A compute allocation"
+
+    compute_quota_resources = [
+      {
+        instance_type = "ml.g5.8xlarge"
+        count         = 2
+      }
+    ]
+
+    resource_sharing_config = {
+      strategy = "DontLend"
+    }
+
+    preempt_team_tasks = "LowerPriority"
+
+    target = {
+      team_name         = "team-a"
+      fair_share_weight = 0
+    }
+  }
+]
+```
+
+For teams that lend and borrow idle capacity, use `strategy = "LendAndBorrow"` and optionally set `borrow_limit` or `absolute_borrow_limits` inside `resource_sharing_config`.
+
 The HyperPod training and inference operators both require the [cert-manager](https://cert-manager.io/) EKS addon to be installed as a prerequisite. The variable `enable_cert_manager` is set to `true` by default, so that when `create_hyperpod_training_operator_module` or `create_hyperpod_inference_operator_module` are also set to `true`, cert-manager will be installed as a dependency of the operators. In other words, this stack will not install cert-manager as a standalone component, but it can be disabled if you already have it installed on an existing EKS cluster and wish to use one of the HyperPod operators. 
 
 The HyperPod inference operator also has the following additional dependencies: 
@@ -598,5 +634,3 @@ Do not attempt to install these addons later using the console.
 Once you have your `rig_custom.tfvars` file is created, you can proceed to deployment. 
 
 ---
-
-
diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/main.tf b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/main.tf
@@ -249,7 +249,10 @@ module "task_governance" {
   count  = local.create_task_governance_module ? 1 : 0
   source = "./modules/task_governance"
 
+  aws_region           = var.aws_region
+  compute_quotas       = var.task_governance_compute_quotas
   eks_cluster_name     = var.eks_cluster_name
+  hyperpod_cluster_arn = length(module.hyperpod_cluster) > 0 ? module.hyperpod_cluster[0].hyperpod_cluster_arn : ""
 
   depends_on = [module.hyperpod_cluster]
 }

diff --git a/....sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/main.tf b/....sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/main.tf
@@ -1,7 +1,90 @@
+locals { # Reshapes var.compute_quotas into per-quota objects (keyed by quota name) using the key names of the JSON handed to the quota script.
+  compute_quota_resources = { # quota name => list of allocation objects, one per compute_quota_resources entry
+    for quota in var.compute_quotas : quota.name => [
+      for resource in quota.compute_quota_resources : merge(
+        {
+          InstanceType = resource.instance_type
+        },
+        resource.count != null ? { # each optional attribute is merged in only when non-null, so unset keys are omitted entirely
+          Count = resource.count
+        } : {},
+        resource.accelerators != null ? {
+          Accelerators = resource.accelerators
+        } : {},
+        resource.vcpu != null ? {
+          VCpu = resource.vcpu
+        } : {},
+        resource.memory_in_gib != null ? {
+          MemoryInGiB = resource.memory_in_gib
+        } : {},
+        resource.accelerator_partition != null ? {
+          AcceleratorPartition = {
+            Type  = resource.accelerator_partition.type
+            Count = resource.accelerator_partition.count
+          }
+        } : {}
+      )
+    ]
+  }
+
+  compute_quota_absolute_borrow_limits = { # same per-entry mapping as compute_quota_resources, applied to resource_sharing_config.absolute_borrow_limits
+    for quota in var.compute_quotas : quota.name => [
+      for resource in quota.resource_sharing_config.absolute_borrow_limits : merge( # NOTE(review): iterating fails if absolute_borrow_limits is null; assumes the variable defaults it to [] - confirm
+        {
+          InstanceType = resource.instance_type
+        },
+        resource.count != null ? {
+          Count = resource.count
+        } : {},
+        resource.accelerators != null ? {
+          Accelerators = resource.accelerators
+        } : {},
+        resource.vcpu != null ? {
+          VCpu = resource.vcpu
+        } : {},
+        resource.memory_in_gib != null ? {
+          MemoryInGiB = resource.memory_in_gib
+        } : {},
+        resource.accelerator_partition != null ? {
+          AcceleratorPartition = {
+            Type  = resource.accelerator_partition.type
+            Count = resource.accelerator_partition.count
+          }
+        } : {}
+      )
+    ]
+  }
+
+  compute_quota_configs = { # quota name => object that is JSON-encoded into the compute_quota_config trigger
+    for quota in var.compute_quotas : quota.name => {
+      ComputeQuotaResources = local.compute_quota_resources[quota.name]
+      ResourceSharingConfig = merge(
+        {
+          Strategy = quota.resource_sharing_config.strategy
+        },
+        quota.resource_sharing_config.borrow_limit != null ? { # BorrowLimit is included only when set
+          BorrowLimit = quota.resource_sharing_config.borrow_limit
+        } : {},
+        length(local.compute_quota_absolute_borrow_limits[quota.name]) > 0 ? { # AbsoluteBorrowLimits is omitted when the list is empty
+          AbsoluteBorrowLimits = local.compute_quota_absolute_borrow_limits[quota.name]
+        } : {}
+      )
+      PreemptTeamTasks = quota.preempt_team_tasks
+    }
+  }
+
+  compute_quota_targets = { # quota name => object that is JSON-encoded into the compute_quota_target trigger
+    for quota in var.compute_quotas : quota.name => {
+      TeamName        = quota.target.team_name
+      FairShareWeight = quota.target.fair_share_weight # NOTE(review): passed through even when null, unlike the optional resource fields - confirm the variable requires or defaults it
+    }
+  }
+}
+
 # EKS Addon for Task Governance
 resource "aws_eks_addon" "task_governance" {
-  cluster_name = var.eks_cluster_name
-  addon_name   = "amazon-sagemaker-hyperpod-taskgovernance"
+  cluster_name                = var.eks_cluster_name
+  addon_name                  = "amazon-sagemaker-hyperpod-taskgovernance"
   resolve_conflicts_on_create = "OVERWRITE"
   resolve_conflicts_on_update = "OVERWRITE"
 }
@@ -18,4 +101,51 @@ resource "null_resource" "wait_for_kueue_webhook" {
   }
 
   depends_on = [aws_eks_addon.task_governance]
-}
+}
+
+resource "null_resource" "compute_quota" { # One instance per entry in var.compute_quotas; runs scripts/manage-compute-quota.sh through local-exec.
+  for_each = { for quota in var.compute_quotas : quota.name => quota } # keyed by quota name, so names must be unique within the list
+
+  triggers = { # a change to any of these values replaces the instance, which re-runs the "apply" provisioner
+    activation_state     = each.value.activation_state
+    cluster_arn          = var.hyperpod_cluster_arn
+    compute_quota_config = jsonencode(local.compute_quota_configs[each.key])
+    compute_quota_target = jsonencode(local.compute_quota_targets[each.key])
+    description          = each.value.description
+    name                 = each.value.name
+    region               = var.aws_region
+  }
+
+  provisioner "local-exec" { # creation-time: invoke the script in "apply" mode
+    command = "bash ${path.module}/scripts/manage-compute-quota.sh apply"
+    environment = { # inputs reach the script as environment variables, read back from the stored triggers
+      ACTIVATION_STATE     = self.triggers.activation_state
+      AWS_REGION           = self.triggers.region
+      CLUSTER_ARN          = self.triggers.cluster_arn
+      COMPUTE_QUOTA_CONFIG = self.triggers.compute_quota_config
+      COMPUTE_QUOTA_TARGET = self.triggers.compute_quota_target
+      DESCRIPTION          = self.triggers.description
+      QUOTA_NAME           = self.triggers.name
+    }
+  }
+
+  provisioner "local-exec" { # destroy-time: invoke the script in "delete" mode
+    when    = destroy
+    command = "bash ${path.module}/scripts/manage-compute-quota.sh delete"
+    environment = { # destroy-time provisioners may reference only self, hence self.triggers rather than var.* or local.*
+      ACTIVATION_STATE     = self.triggers.activation_state
+      AWS_REGION           = self.triggers.region
+      CLUSTER_ARN          = self.triggers.cluster_arn
+      COMPUTE_QUOTA_CONFIG = self.triggers.compute_quota_config
+      COMPUTE_QUOTA_TARGET = self.triggers.compute_quota_target
+      DESCRIPTION          = self.triggers.description
+      QUOTA_NAME           = self.triggers.name
+    }
+  }
+
+  lifecycle {
+    create_before_destroy = true # NOTE(review): Terraform's lifecycle docs state that destroy provisioners do not run when this is true - confirm when the "delete" provisioner above actually fires
+  }
+
+  depends_on = [null_resource.wait_for_kueue_webhook] # quotas are applied only after the Kueue webhook wait resource has completed
+}
diff --git a/...gemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/outputs.tf b/...gemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/outputs.tf
@@ -1,4 +1,9 @@
 output "task_governance_addon_arn" {
   description = "ARN of the task governance addon"
-  value       = aws_eks_addon.task_governance.arn 
-}
+  value       = aws_eks_addon.task_governance.arn
+}
+
+output "compute_quota_names" {
+  description = "Names of task governance compute allocations managed by this module"
+  value       = [for quota_name, _ in null_resource.compute_quota : quota_name] # instance keys of the for_each resource are the quota names
+}