-
Notifications
You must be signed in to change notification settings - Fork 139
Torchx mcad coscheduler #693
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
e8005e7
e95a2f9
9e0e34c
0bb15a2
bb4ceed
31b6287
0fc471a
532db7f
10d899e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,10 +13,9 @@ | |
Prerequisites | ||
============== | ||
|
||
TorchX kubernetes scheduler depends on AppWrapper + MCAD. | ||
|
||
Install MCAD | ||
TorchX Kubernetes_MCAD scheduler depends on AppWrapper + MCAD. | ||
|
||
Install MCAD: | ||
See deploying Multi-Cluster-Application-Dispatcher guide | ||
https://github.com/project-codeflare/multi-cluster-app-dispatcher/blob/main/doc/deploy/deployment.md | ||
|
||
|
@@ -168,6 +167,7 @@ def role_to_pod( | |
role: Role, | ||
service_account: Optional[str], | ||
image_secret: Optional[str], | ||
coscheduler_name: Optional[str], | ||
) -> "V1Pod": | ||
from kubernetes.client.models import ( # noqa: F811 redefinition of unused | ||
V1Container, | ||
|
@@ -338,6 +338,7 @@ def role_to_pod( | |
service_account_name=service_account, | ||
volumes=volumes, | ||
node_selector=node_selector, | ||
scheduler_name=coscheduler_name, | ||
), | ||
metadata=V1ObjectMeta( | ||
name=name, | ||
|
@@ -352,6 +353,31 @@ def role_to_pod( | |
) | ||
|
||
|
||
def create_pod_group(role: Role, namespace: str, app_id: str) -> "Dict[str, Any]": | ||
pod_group_name = app_id + "-" + cleanup_str(role.name) + "-pg" | ||
|
||
pod_group: Dict[str, Any] = { | ||
"apiVersion": "scheduling.sigs.k8s.io/v1alpha1", | ||
"kind": "PodGroup", | ||
"metadata": { | ||
"name": pod_group_name, | ||
"namespace": namespace, | ||
"labels": { | ||
"appwrapper.mcad.ibm.com": app_id, | ||
}, | ||
}, | ||
"spec": { | ||
"minMember": role.num_replicas, | ||
}, | ||
} | ||
|
||
genericitem_pod_group: Dict[str, Any] = { | ||
"replicas": 1, | ||
"generictemplate": pod_group, | ||
} | ||
return genericitem_pod_group | ||
|
||
|
||
def mcad_svc(svc_name: str, namespace: str, service_port: str) -> "V1Service": | ||
from kubernetes.client.models import ( # noqa: F401, F811 | ||
V1Container, | ||
|
@@ -436,6 +462,7 @@ def app_to_resource( | |
namespace: str, | ||
service_account: Optional[str], | ||
image_secret: Optional[str], | ||
coscheduler_name: Optional[str], | ||
priority: Optional[int] = None, | ||
) -> Dict[str, Any]: | ||
""" | ||
|
@@ -448,6 +475,12 @@ def app_to_resource( | |
genericitems = [] | ||
|
||
unique_app_id = cleanup_str(make_unique(app.name)) | ||
|
||
if coscheduler_name is not None: | ||
for role_idx, role in enumerate(app.roles): | ||
genericitem_pod_group = create_pod_group(role, namespace, unique_app_id) | ||
genericitems.append(genericitem_pod_group) | ||
|
||
for role_idx, role in enumerate(app.roles): | ||
for replica_id in range(role.num_replicas): | ||
values = macros.Values( | ||
|
@@ -473,8 +506,13 @@ def app_to_resource( | |
replica_role, | ||
service_account, | ||
image_secret, | ||
coscheduler_name, | ||
) | ||
pod.metadata.labels.update( | ||
pod_labels( | ||
app, role_idx, role, replica_id, coscheduler_name, unique_app_id | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. there's a lot of fields here. Think it's better to switch these to use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated the code to include field names. |
||
) | ||
) | ||
pod.metadata.labels.update(pod_labels(app, role_idx, role, replica_id)) | ||
|
||
genericitem: Dict[str, Any] = { | ||
"replicas": 1, | ||
|
@@ -676,6 +714,7 @@ class KubernetesMCADOpts(TypedDict, total=False): | |
service_account: Optional[str] | ||
priority: Optional[int] | ||
image_secret: Optional[str] | ||
coscheduler_name: Optional[str] | ||
|
||
|
||
class KubernetesMCADScheduler(DockerWorkspaceMixin, Scheduler[KubernetesMCADOpts]): | ||
|
@@ -861,9 +900,15 @@ def _submit_dryrun( | |
namespace = cfg.get("namespace") | ||
assert isinstance(namespace, str), "namespace must be a str" | ||
|
||
coscheduler_name = cfg.get("coscheduler_name") | ||
assert coscheduler_name is None or isinstance( | ||
coscheduler_name, str | ||
), "coscheduler_name must be a string" | ||
|
||
resource = app_to_resource( | ||
app, namespace, service_account, image_secret, priority | ||
app, namespace, service_account, image_secret, coscheduler_name, priority | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. likewise kwargs for this There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated |
||
) | ||
|
||
req = KubernetesMCADJob( | ||
resource=resource, | ||
images_to_push=images_to_push, | ||
|
@@ -917,6 +962,11 @@ def run_opts(self) -> runopts: | |
type_=str, | ||
help="The name of the Kubernetes/OpenShift secret set up for private images", | ||
) | ||
opts.add( | ||
"coscheduler_name", | ||
type_=str, | ||
help="Option to run TorchX-MCAD with a co-scheduler. User must provide the co-scheduler name.", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we add some links to any external documentation about coschedulers? Like: https://github.com/kubernetes-sigs/scheduler-plugins/blob/master/pkg/coscheduling/README.md ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated the documentation with brief explanation and references to secondary scheduling, coscheduling and PodGroups, and the PodGroup CRD. |
||
) | ||
return opts | ||
|
||
def describe(self, app_id: str) -> Optional[DescribeAppResponse]: | ||
|
@@ -1070,12 +1120,21 @@ def create_scheduler(session_name: str, **kwargs: Any) -> KubernetesMCADSchedule | |
|
||
# TODO update to Kubernetes standard labels (https://kubernetes.io/docs/concepts/overview/working-with-objects/common-labels/) | ||
def pod_labels( | ||
app: AppDef, role_idx: int, role: Role, replica_id: int | ||
app: AppDef, | ||
role_idx: int, | ||
role: Role, | ||
replica_id: int, | ||
coscheduler_name: Optional[str], | ||
app_id: str, | ||
) -> Dict[str, str]: | ||
return { | ||
labels = { | ||
LABEL_VERSION: torchx.__version__, | ||
LABEL_APP_NAME: app.name, | ||
LABEL_ROLE_INDEX: str(role_idx), | ||
LABEL_ROLE_NAME: role.name, | ||
LABEL_REPLICA_ID: str(replica_id), | ||
} | ||
if coscheduler_name is not None: | ||
pod_group = app_id + "-" + cleanup_str(role.name) + "-pg" | ||
labels.update({"pod-group.scheduling.sigs.k8s.io": pod_group}) | ||
return labels |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would be nice to add a link to the schema definition for this type. I think it's https://github.com/kubernetes-sigs/scheduler-plugins/blob/56e2398e5051f796d5f02adb821d57eb8b6c20a7/config/crd/bases/scheduling.sigs.k8s.io_podgroups.yaml#L9 ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Included the PodGroup CRD (release 1.24) in the updated documentation.