Skip to content

Commit 65cbb2f

Browse files
authored
Update for 0.4.2 release (#90)
1 parent 6381066 commit 65cbb2f

File tree

4 files changed

+62
-60
lines changed

4 files changed

+62
-60
lines changed

docs/cluster/cluster.html

+24-50
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,14 @@ <h1 class="title">Module <code>codeflare_sdk.cluster.cluster</code></h1>
9595
Called upon cluster object creation, creates an AppWrapper yaml based on
9696
the specifications of the ClusterConfiguration.
9797
&#34;&#34;&#34;
98+
99+
if self.config.namespace is None:
100+
self.config.namespace = oc.get_project_name()
101+
if type(self.config.namespace) is not str:
102+
raise TypeError(
103+
f&#34;Namespace {self.config.namespace} is of type {type(self.config.namespace)}. Check your Kubernetes Authentication.&#34;
104+
)
105+
98106
name = self.config.name
99107
namespace = self.config.namespace
100108
min_cpu = self.config.min_cpus
@@ -317,26 +325,6 @@ <h1 class="title">Module <code>codeflare_sdk.cluster.cluster</code></h1>
317325
return to_return
318326

319327

320-
def get_current_namespace() -&gt; str:
321-
&#34;&#34;&#34;
322-
Returns the user&#39;s current working namespace.
323-
&#34;&#34;&#34;
324-
try:
325-
namespace = oc.invoke(&#34;project&#34;, [&#34;-q&#34;]).actions()[0].out.strip()
326-
except oc.OpenShiftPythonException as osp: # pragma: no cover
327-
error_msg = osp.result.err()
328-
if (
329-
&#34;do not have rights&#34; in error_msg
330-
or &#34;Missing or incomplete configuration&#34; in error_msg
331-
):
332-
raise PermissionError(
333-
&#34;Action not permitted, have you run auth.login() or cluster.up()?&#34;
334-
)
335-
else:
336-
raise osp
337-
return namespace
338-
339-
340328
def list_all_clusters(namespace: str, print_to_console: bool = True):
341329
&#34;&#34;&#34;
342330
Returns (and prints by default) a list of all clusters in a given namespace.
@@ -537,35 +525,6 @@ <h1 class="title">Module <code>codeflare_sdk.cluster.cluster</code></h1>
537525
<section>
538526
<h2 class="section-title" id="header-functions">Functions</h2>
539527
<dl>
540-
<dt id="codeflare_sdk.cluster.cluster.get_current_namespace"><code class="name flex">
541-
<span>def <span class="ident">get_current_namespace</span></span>(<span>) ‑> str</span>
542-
</code></dt>
543-
<dd>
544-
<div class="desc"><p>Returns the user's current working namespace.</p></div>
545-
<details class="source">
546-
<summary>
547-
<span>Expand source code</span>
548-
</summary>
549-
<pre><code class="python">def get_current_namespace() -&gt; str:
550-
&#34;&#34;&#34;
551-
Returns the user&#39;s current working namespace.
552-
&#34;&#34;&#34;
553-
try:
554-
namespace = oc.invoke(&#34;project&#34;, [&#34;-q&#34;]).actions()[0].out.strip()
555-
except oc.OpenShiftPythonException as osp: # pragma: no cover
556-
error_msg = osp.result.err()
557-
if (
558-
&#34;do not have rights&#34; in error_msg
559-
or &#34;Missing or incomplete configuration&#34; in error_msg
560-
):
561-
raise PermissionError(
562-
&#34;Action not permitted, have you run auth.login() or cluster.up()?&#34;
563-
)
564-
else:
565-
raise osp
566-
return namespace</code></pre>
567-
</details>
568-
</dd>
569528
<dt id="codeflare_sdk.cluster.cluster.list_all_clusters"><code class="name flex">
570529
<span>def <span class="ident">list_all_clusters</span></span>(<span>namespace: str, print_to_console: bool = True)</span>
571530
</code></dt>
@@ -655,6 +614,14 @@ <h2 class="section-title" id="header-classes">Classes</h2>
655614
Called upon cluster object creation, creates an AppWrapper yaml based on
656615
the specifications of the ClusterConfiguration.
657616
&#34;&#34;&#34;
617+
618+
if self.config.namespace is None:
619+
self.config.namespace = oc.get_project_name()
620+
if type(self.config.namespace) is not str:
621+
raise TypeError(
622+
f&#34;Namespace {self.config.namespace} is of type {type(self.config.namespace)}. Check your Kubernetes Authentication.&#34;
623+
)
624+
658625
name = self.config.name
659626
namespace = self.config.namespace
660627
min_cpu = self.config.min_cpus
@@ -942,6 +909,14 @@ <h3>Methods</h3>
942909
Called upon cluster object creation, creates an AppWrapper yaml based on
943910
the specifications of the ClusterConfiguration.
944911
&#34;&#34;&#34;
912+
913+
if self.config.namespace is None:
914+
self.config.namespace = oc.get_project_name()
915+
if type(self.config.namespace) is not str:
916+
raise TypeError(
917+
f&#34;Namespace {self.config.namespace} is of type {type(self.config.namespace)}. Check your Kubernetes Authentication.&#34;
918+
)
919+
945920
name = self.config.name
946921
namespace = self.config.namespace
947922
min_cpu = self.config.min_cpus
@@ -1253,7 +1228,6 @@ <h1>Index</h1>
12531228
</li>
12541229
<li><h3><a href="#header-functions">Functions</a></h3>
12551230
<ul class="">
1256-
<li><code><a title="codeflare_sdk.cluster.cluster.get_current_namespace" href="#codeflare_sdk.cluster.cluster.get_current_namespace">get_current_namespace</a></code></li>
12571231
<li><code><a title="codeflare_sdk.cluster.cluster.list_all_clusters" href="#codeflare_sdk.cluster.cluster.list_all_clusters">list_all_clusters</a></code></li>
12581232
<li><code><a title="codeflare_sdk.cluster.cluster.list_all_queued" href="#codeflare_sdk.cluster.cluster.list_all_queued">list_all_queued</a></code></li>
12591233
</ul>

docs/cluster/config.html

+4-3
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ <h1 class="title">Module <code>codeflare_sdk.cluster.config</code></h1>
5353
from dataclasses import dataclass, field
5454
from .auth import Authentication
5555
import pathlib
56+
import openshift
5657

5758
dir = pathlib.Path(__file__).parent.parent.resolve()
5859

@@ -65,7 +66,7 @@ <h1 class="title">Module <code>codeflare_sdk.cluster.config</code></h1>
6566
&#34;&#34;&#34;
6667

6768
name: str
68-
namespace: str = &#34;default&#34;
69+
namespace: str = None
6970
head_info: list = field(default_factory=list)
7071
machine_types: list = field(default_factory=list) # [&#34;m4.xlarge&#34;, &#34;g4dn.xlarge&#34;]
7172
min_cpus: int = 1
@@ -92,7 +93,7 @@ <h2 class="section-title" id="header-classes">Classes</h2>
9293
<dl>
9394
<dt id="codeflare_sdk.cluster.config.ClusterConfiguration"><code class="flex name class">
9495
<span>class <span class="ident">ClusterConfiguration</span></span>
95-
<span>(</span><span>name: str, namespace: str = 'default', head_info: list = &lt;factory&gt;, machine_types: list = &lt;factory&gt;, min_cpus: int = 1, max_cpus: int = 1, min_worker: int = 1, max_worker: int = 1, min_memory: int = 2, max_memory: int = 2, gpu: int = 0, template: str = '/home/meyceoz/Documents/codeflare-sdk/src/codeflare_sdk/templates/new-template.yaml', instascale: bool = False, envs: dict = &lt;factory&gt;, image: str = 'ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103')</span>
96+
<span>(</span><span>name: str, namespace: str = None, head_info: list = &lt;factory&gt;, machine_types: list = &lt;factory&gt;, min_cpus: int = 1, max_cpus: int = 1, min_worker: int = 1, max_worker: int = 1, min_memory: int = 2, max_memory: int = 2, gpu: int = 0, template: str = '/home/meyceoz/Documents/codeflare-sdk/src/codeflare_sdk/templates/new-template.yaml', instascale: bool = False, envs: dict = &lt;factory&gt;, image: str = 'ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103')</span>
9697
</code></dt>
9798
<dd>
9899
<div class="desc"><p>This dataclass is used to specify resource requirements and other details, and
@@ -108,7 +109,7 @@ <h2 class="section-title" id="header-classes">Classes</h2>
108109
&#34;&#34;&#34;
109110

110111
name: str
111-
namespace: str = &#34;default&#34;
112+
namespace: str = None
112113
head_info: list = field(default_factory=list)
113114
machine_types: list = field(default_factory=list) # [&#34;m4.xlarge&#34;, &#34;g4dn.xlarge&#34;]
114115
min_cpus: int = 1

docs/job/jobs.html

+32-5
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ <h1 class="title">Module <code>codeflare_sdk.job.jobs</code></h1>
4545
from typing import TYPE_CHECKING, Optional, Dict, List
4646
from pathlib import Path
4747

48+
import openshift as oc
4849
from torchx.components.dist import ddp
4950
from torchx.runner import get_runner
5051
from torchx.specs import AppHandle, parse_app_handle, AppDryRunInfo
@@ -88,8 +89,10 @@ <h1 class="title">Module <code>codeflare_sdk.job.jobs</code></h1>
8889
max_retries: int = 0,
8990
mounts: Optional[List[str]] = None,
9091
rdzv_port: int = 29500,
92+
rdzv_backend: str = None,
9193
scheduler_args: Optional[Dict[str, str]] = None,
9294
image: Optional[str] = None,
95+
workspace: Optional[str] = f&#34;file://{Path.cwd()}&#34;,
9396
):
9497
if bool(script) == bool(m): # logical XOR
9598
raise ValueError(
@@ -108,10 +111,12 @@ <h1 class="title">Module <code>codeflare_sdk.job.jobs</code></h1>
108111
self.max_retries = max_retries
109112
self.mounts: List[str] = mounts if mounts is not None else []
110113
self.rdzv_port = rdzv_port
114+
self.rdzv_backend = rdzv_backend
111115
self.scheduler_args: Dict[str, str] = (
112116
scheduler_args if scheduler_args is not None else dict()
113117
)
114118
self.image = image
119+
self.workspace = workspace
115120

116121
def _dry_run(self, cluster: &#34;Cluster&#34;):
117122
j = f&#34;{cluster.config.max_worker}x{max(cluster.config.gpu, 1)}&#34; # # of proc. = # of gpus
@@ -131,17 +136,23 @@ <h1 class="title">Module <code>codeflare_sdk.job.jobs</code></h1>
131136
env=self.env,
132137
max_retries=self.max_retries,
133138
rdzv_port=self.rdzv_port,
139+
rdzv_backend=self.rdzv_backend
140+
if self.rdzv_backend is not None
141+
else &#34;static&#34;,
134142
mounts=self.mounts,
135143
),
136144
scheduler=cluster.torchx_scheduler,
137145
cfg=cluster.torchx_config(**self.scheduler_args),
138-
workspace=f&#34;file://{Path.cwd()}&#34;,
146+
workspace=self.workspace,
139147
)
140148

141149
def _missing_spec(self, spec: str):
142150
raise ValueError(f&#34;Job definition missing arg: {spec}&#34;)
143151

144152
def _dry_run_no_cluster(self):
153+
if self.scheduler_args is not None:
154+
if self.scheduler_args.get(&#34;namespace&#34;) is None:
155+
self.scheduler_args[&#34;namespace&#34;] = oc.get_project_name()
145156
return torchx_runner.dryrun(
146157
app=ddp(
147158
*self.script_args,
@@ -166,13 +177,16 @@ <h1 class="title">Module <code>codeflare_sdk.job.jobs</code></h1>
166177
env=self.env, # should this still exist?
167178
max_retries=self.max_retries,
168179
rdzv_port=self.rdzv_port, # should this still exist?
180+
rdzv_backend=self.rdzv_backend
181+
if self.rdzv_backend is not None
182+
else &#34;c10d&#34;,
169183
mounts=self.mounts,
170184
image=self.image
171185
if self.image is not None
172186
else self._missing_spec(&#34;image&#34;),
173187
),
174188
scheduler=&#34;kubernetes_mcad&#34;,
175-
cfg=self.scheduler_args if self.scheduler_args is not None else None,
189+
cfg=self.scheduler_args,
176190
workspace=&#34;&#34;,
177191
)
178192

@@ -291,7 +305,7 @@ <h3>Methods</h3>
291305
</dd>
292306
<dt id="codeflare_sdk.job.jobs.DDPJobDefinition"><code class="flex name class">
293307
<span>class <span class="ident">DDPJobDefinition</span></span>
294-
<span>(</span><span>script: Optional[str] = None, m: Optional[str] = None, script_args: Optional[List[str]] = None, name: Optional[str] = None, cpu: Optional[int] = None, gpu: Optional[int] = None, memMB: Optional[int] = None, h: Optional[str] = None, j: Optional[str] = None, env: Optional[Dict[str, str]] = None, max_retries: int = 0, mounts: Optional[List[str]] = None, rdzv_port: int = 29500, scheduler_args: Optional[Dict[str, str]] = None, image: Optional[str] = None)</span>
308+
<span>(</span><span>script: Optional[str] = None, m: Optional[str] = None, script_args: Optional[List[str]] = None, name: Optional[str] = None, cpu: Optional[int] = None, gpu: Optional[int] = None, memMB: Optional[int] = None, h: Optional[str] = None, j: Optional[str] = None, env: Optional[Dict[str, str]] = None, max_retries: int = 0, mounts: Optional[List[str]] = None, rdzv_port: int = 29500, rdzv_backend: str = None, scheduler_args: Optional[Dict[str, str]] = None, image: Optional[str] = None, workspace: Optional[str] = 'file:///home/meyceoz/Documents/codeflare-sdk')</span>
295309
</code></dt>
296310
<dd>
297311
<div class="desc"></div>
@@ -315,8 +329,10 @@ <h3>Methods</h3>
315329
max_retries: int = 0,
316330
mounts: Optional[List[str]] = None,
317331
rdzv_port: int = 29500,
332+
rdzv_backend: str = None,
318333
scheduler_args: Optional[Dict[str, str]] = None,
319334
image: Optional[str] = None,
335+
workspace: Optional[str] = f&#34;file://{Path.cwd()}&#34;,
320336
):
321337
if bool(script) == bool(m): # logical XOR
322338
raise ValueError(
@@ -335,10 +351,12 @@ <h3>Methods</h3>
335351
self.max_retries = max_retries
336352
self.mounts: List[str] = mounts if mounts is not None else []
337353
self.rdzv_port = rdzv_port
354+
self.rdzv_backend = rdzv_backend
338355
self.scheduler_args: Dict[str, str] = (
339356
scheduler_args if scheduler_args is not None else dict()
340357
)
341358
self.image = image
359+
self.workspace = workspace
342360

343361
def _dry_run(self, cluster: &#34;Cluster&#34;):
344362
j = f&#34;{cluster.config.max_worker}x{max(cluster.config.gpu, 1)}&#34; # # of proc. = # of gpus
@@ -358,17 +376,23 @@ <h3>Methods</h3>
358376
env=self.env,
359377
max_retries=self.max_retries,
360378
rdzv_port=self.rdzv_port,
379+
rdzv_backend=self.rdzv_backend
380+
if self.rdzv_backend is not None
381+
else &#34;static&#34;,
361382
mounts=self.mounts,
362383
),
363384
scheduler=cluster.torchx_scheduler,
364385
cfg=cluster.torchx_config(**self.scheduler_args),
365-
workspace=f&#34;file://{Path.cwd()}&#34;,
386+
workspace=self.workspace,
366387
)
367388

368389
def _missing_spec(self, spec: str):
369390
raise ValueError(f&#34;Job definition missing arg: {spec}&#34;)
370391

371392
def _dry_run_no_cluster(self):
393+
if self.scheduler_args is not None:
394+
if self.scheduler_args.get(&#34;namespace&#34;) is None:
395+
self.scheduler_args[&#34;namespace&#34;] = oc.get_project_name()
372396
return torchx_runner.dryrun(
373397
app=ddp(
374398
*self.script_args,
@@ -393,13 +417,16 @@ <h3>Methods</h3>
393417
env=self.env, # should this still exist?
394418
max_retries=self.max_retries,
395419
rdzv_port=self.rdzv_port, # should this still exist?
420+
rdzv_backend=self.rdzv_backend
421+
if self.rdzv_backend is not None
422+
else &#34;c10d&#34;,
396423
mounts=self.mounts,
397424
image=self.image
398425
if self.image is not None
399426
else self._missing_spec(&#34;image&#34;),
400427
),
401428
scheduler=&#34;kubernetes_mcad&#34;,
402-
cfg=self.scheduler_args if self.scheduler_args is not None else None,
429+
cfg=self.scheduler_args,
403430
workspace=&#34;&#34;,
404431
)
405432

pyproject.toml

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "codeflare-sdk"
3-
version = "0.4.1"
3+
version = "0.4.2"
44
description = "Python SDK for codeflare client"
55

66
license = "Apache-2.0"
@@ -25,4 +25,4 @@ openshift-client = "1.0.18"
2525
rich = "^12.5"
2626
ray = {version = "2.1.0", extras = ["default"]}
2727
kubernetes = "26.1.0"
28-
codeflare-torchx = "0.5.0.dev5"
28+
codeflare-torchx = "0.6.0.dev0"

0 commit comments

Comments
 (0)