@@ -45,6 +45,7 @@ <h1 class="title">Module <code>codeflare_sdk.job.jobs</code></h1>
45
45
from typing import TYPE_CHECKING, Optional, Dict, List
46
46
from pathlib import Path
47
47
48
+ import openshift as oc
48
49
from torchx.components.dist import ddp
49
50
from torchx.runner import get_runner
50
51
from torchx.specs import AppHandle, parse_app_handle, AppDryRunInfo
@@ -88,8 +89,10 @@ <h1 class="title">Module <code>codeflare_sdk.job.jobs</code></h1>
88
89
max_retries: int = 0,
89
90
mounts: Optional[List[str]] = None,
90
91
rdzv_port: int = 29500,
92
+ rdzv_backend: str = None,
91
93
scheduler_args: Optional[Dict[str, str]] = None,
92
94
image: Optional[str] = None,
95
+ workspace: Optional[str] = f"file://{Path.cwd()}",
93
96
):
94
97
if bool(script) == bool(m): # logical XOR
95
98
raise ValueError(
@@ -108,10 +111,12 @@ <h1 class="title">Module <code>codeflare_sdk.job.jobs</code></h1>
108
111
self.max_retries = max_retries
109
112
self.mounts: List[str] = mounts if mounts is not None else []
110
113
self.rdzv_port = rdzv_port
114
+ self.rdzv_backend = rdzv_backend
111
115
self.scheduler_args: Dict[str, str] = (
112
116
scheduler_args if scheduler_args is not None else dict()
113
117
)
114
118
self.image = image
119
+ self.workspace = workspace
115
120
116
121
def _dry_run(self, cluster: "Cluster"):
117
122
j = f"{cluster.config.max_worker}x{max(cluster.config.gpu, 1)}" # # of proc. = # of gpus
@@ -131,17 +136,23 @@ <h1 class="title">Module <code>codeflare_sdk.job.jobs</code></h1>
131
136
env=self.env,
132
137
max_retries=self.max_retries,
133
138
rdzv_port=self.rdzv_port,
139
+ rdzv_backend=self.rdzv_backend
140
+ if self.rdzv_backend is not None
141
+ else "static",
134
142
mounts=self.mounts,
135
143
),
136
144
scheduler=cluster.torchx_scheduler,
137
145
cfg=cluster.torchx_config(**self.scheduler_args),
138
- workspace=f"file://{Path.cwd()}" ,
146
+ workspace=self.workspace ,
139
147
)
140
148
141
149
# Raises ValueError whose message names the job-definition argument `spec`.
# It never returns a value; elsewhere in this file it is used as the `else` arm
# of a conditional expression (`self.image if self.image is not None else
# self._missing_spec("image")`), so the raise happens only when the field is None.
def _missing_spec(self, spec: str):
142
150
raise ValueError(f"Job definition missing arg: {spec}")
143
151
144
152
def _dry_run_no_cluster(self):
153
+ if self.scheduler_args is not None:
154
+ if self.scheduler_args.get("namespace") is None:
155
+ self.scheduler_args["namespace"] = oc.get_project_name()
145
156
return torchx_runner.dryrun(
146
157
app=ddp(
147
158
*self.script_args,
@@ -166,13 +177,16 @@ <h1 class="title">Module <code>codeflare_sdk.job.jobs</code></h1>
166
177
env=self.env, # should this still exist?
167
178
max_retries=self.max_retries,
168
179
rdzv_port=self.rdzv_port, # should this still exist?
180
+ rdzv_backend=self.rdzv_backend
181
+ if self.rdzv_backend is not None
182
+ else "c10d",
169
183
mounts=self.mounts,
170
184
image=self.image
171
185
if self.image is not None
172
186
else self._missing_spec("image"),
173
187
),
174
188
scheduler="kubernetes_mcad",
175
- cfg=self.scheduler_args if self.scheduler_args is not None else None ,
189
+ cfg=self.scheduler_args,
176
190
workspace="",
177
191
)
178
192
@@ -291,7 +305,7 @@ <h3>Methods</h3>
291
305
</dd>
292
306
<dt id="codeflare_sdk.job.jobs.DDPJobDefinition"><code class="flex name class">
293
307
<span>class <span class="ident">DDPJobDefinition</span></span>
294
- <span>(</span><span>script: Optional[str] = None, m: Optional[str] = None, script_args: Optional[List[str]] = None, name: Optional[str] = None, cpu: Optional[int] = None, gpu: Optional[int] = None, memMB: Optional[int] = None, h: Optional[str] = None, j: Optional[str] = None, env: Optional[Dict[str, str]] = None, max_retries: int = 0, mounts: Optional[List[str]] = None, rdzv_port: int = 29500, scheduler_args: Optional[Dict[str, str]] = None, image: Optional[str] = None)</span>
308
+ <span>(</span><span>script: Optional[str] = None, m: Optional[str] = None, script_args: Optional[List[str]] = None, name: Optional[str] = None, cpu: Optional[int] = None, gpu: Optional[int] = None, memMB: Optional[int] = None, h: Optional[str] = None, j: Optional[str] = None, env: Optional[Dict[str, str]] = None, max_retries: int = 0, mounts: Optional[List[str]] = None, rdzv_port: int = 29500, rdzv_backend: str = None, scheduler_args: Optional[Dict[str, str]] = None, image: Optional[str] = None, workspace: Optional[str] = 'file:///home/meyceoz/Documents/codeflare-sdk')</span>
295
309
</code></dt>
296
310
<dd>
297
311
<div class="desc"></div>
@@ -315,8 +329,10 @@ <h3>Methods</h3>
315
329
max_retries: int = 0,
316
330
mounts: Optional[List[str]] = None,
317
331
rdzv_port: int = 29500,
332
+ rdzv_backend: str = None,
318
333
scheduler_args: Optional[Dict[str, str]] = None,
319
334
image: Optional[str] = None,
335
+ workspace: Optional[str] = f"file://{Path.cwd()}",
320
336
):
321
337
if bool(script) == bool(m): # logical XOR
322
338
raise ValueError(
@@ -335,10 +351,12 @@ <h3>Methods</h3>
335
351
self.max_retries = max_retries
336
352
self.mounts: List[str] = mounts if mounts is not None else []
337
353
self.rdzv_port = rdzv_port
354
+ self.rdzv_backend = rdzv_backend
338
355
self.scheduler_args: Dict[str, str] = (
339
356
scheduler_args if scheduler_args is not None else dict()
340
357
)
341
358
self.image = image
359
+ self.workspace = workspace
342
360
343
361
def _dry_run(self, cluster: "Cluster"):
344
362
j = f"{cluster.config.max_worker}x{max(cluster.config.gpu, 1)}" # # of proc. = # of gpus
@@ -358,17 +376,23 @@ <h3>Methods</h3>
358
376
env=self.env,
359
377
max_retries=self.max_retries,
360
378
rdzv_port=self.rdzv_port,
379
+ rdzv_backend=self.rdzv_backend
380
+ if self.rdzv_backend is not None
381
+ else "static",
361
382
mounts=self.mounts,
362
383
),
363
384
scheduler=cluster.torchx_scheduler,
364
385
cfg=cluster.torchx_config(**self.scheduler_args),
365
- workspace=f"file://{Path.cwd()}" ,
386
+ workspace=self.workspace ,
366
387
)
367
388
368
389
# Raises ValueError whose message names the job-definition argument `spec`.
# It never returns a value; elsewhere in this file it is used as the `else` arm
# of a conditional expression (`self.image if self.image is not None else
# self._missing_spec("image")`), so the raise happens only when the field is None.
def _missing_spec(self, spec: str):
369
390
raise ValueError(f"Job definition missing arg: {spec}")
370
391
371
392
def _dry_run_no_cluster(self):
393
+ if self.scheduler_args is not None:
394
+ if self.scheduler_args.get("namespace") is None:
395
+ self.scheduler_args["namespace"] = oc.get_project_name()
372
396
return torchx_runner.dryrun(
373
397
app=ddp(
374
398
*self.script_args,
@@ -393,13 +417,16 @@ <h3>Methods</h3>
393
417
env=self.env, # should this still exist?
394
418
max_retries=self.max_retries,
395
419
rdzv_port=self.rdzv_port, # should this still exist?
420
+ rdzv_backend=self.rdzv_backend
421
+ if self.rdzv_backend is not None
422
+ else "c10d",
396
423
mounts=self.mounts,
397
424
image=self.image
398
425
if self.image is not None
399
426
else self._missing_spec("image"),
400
427
),
401
428
scheduler="kubernetes_mcad",
402
- cfg=self.scheduler_args if self.scheduler_args is not None else None ,
429
+ cfg=self.scheduler_args,
403
430
workspace="",
404
431
)
405
432
0 commit comments