Skip to content

Commit 894b557

Browse files
author
Shixian Cui
committed
Add AWS Inf2 instances support for aws_batch scheduler
1 parent 26cb186 commit 894b557

File tree

2 files changed

+73
-0
lines changed

2 files changed

+73
-0
lines changed

torchx/specs/named_resources_aws.py

+44
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,46 @@ def aws_trn1_32xlarge() -> Resource:
348348
)
349349

350350

351+
def aws_inf2_xlarge() -> Resource:
    """Build the named resource for the ``inf2.xlarge`` instance type.

    Returns:
        A ``Resource`` with 4 CPUs, 0 GPUs, ``32 * GiB`` of memory, one
        ``NEURON_DEVICE``, and the ``K8S_ITYPE`` capability set to
        ``"inf2.xlarge"``.
    """
    # NOTE(review): 32 GiB looks like the published accelerator memory for
    # this instance type rather than its host RAM -- confirm against the
    # AWS Inf2 spec table before relying on memMB for scheduling.
    instance_type = "inf2.xlarge"
    neuron_devices = {NEURON_DEVICE: 1}
    return Resource(
        cpu=4,
        gpu=0,
        memMB=32 * GiB,
        capabilities={K8S_ITYPE: instance_type},
        devices=neuron_devices,
    )
359+
360+
361+
def aws_inf2_8xlarge() -> Resource:
    """Build the named resource for the ``inf2.8xlarge`` instance type.

    Returns:
        A ``Resource`` with 32 CPUs, 0 GPUs, ``32 * GiB`` of memory, one
        ``NEURON_DEVICE``, and the ``K8S_ITYPE`` capability set to
        ``"inf2.8xlarge"``.
    """
    # NOTE(review): 32 GiB looks like the published accelerator memory for
    # this instance type rather than its host RAM -- confirm against the
    # AWS Inf2 spec table before relying on memMB for scheduling.
    instance_type = "inf2.8xlarge"
    neuron_devices = {NEURON_DEVICE: 1}
    return Resource(
        cpu=32,
        gpu=0,
        memMB=32 * GiB,
        capabilities={K8S_ITYPE: instance_type},
        devices=neuron_devices,
    )
369+
370+
371+
def aws_inf2_24xlarge() -> Resource:
    """Build the named resource for the ``inf2.24xlarge`` instance type.

    Returns:
        A ``Resource`` with 96 CPUs, 0 GPUs, ``192 * GiB`` of memory, six
        ``NEURON_DEVICE`` entries, and the ``K8S_ITYPE`` capability set to
        ``"inf2.24xlarge"``.
    """
    # NOTE(review): 192 GiB looks like the published accelerator memory for
    # this instance type rather than its host RAM -- confirm against the
    # AWS Inf2 spec table before relying on memMB for scheduling.
    instance_type = "inf2.24xlarge"
    neuron_devices = {NEURON_DEVICE: 6}
    return Resource(
        cpu=96,
        gpu=0,
        memMB=192 * GiB,
        capabilities={K8S_ITYPE: instance_type},
        devices=neuron_devices,
    )
379+
380+
381+
def aws_inf2_48xlarge() -> Resource:
    """Build the named resource for the ``inf2.48xlarge`` instance type.

    Returns:
        A ``Resource`` with 192 CPUs, 0 GPUs, ``384 * GiB`` of memory, twelve
        ``NEURON_DEVICE`` entries, and the ``K8S_ITYPE`` capability set to
        ``"inf2.48xlarge"``.
    """
    # NOTE(review): 384 GiB looks like the published accelerator memory for
    # this instance type rather than its host RAM -- confirm against the
    # AWS Inf2 spec table before relying on memMB for scheduling.
    instance_type = "inf2.48xlarge"
    neuron_devices = {NEURON_DEVICE: 12}
    return Resource(
        cpu=192,
        gpu=0,
        memMB=384 * GiB,
        capabilities={K8S_ITYPE: instance_type},
        devices=neuron_devices,
    )
389+
390+
351391
NAMED_RESOURCES: Mapping[str, Callable[[], Resource]] = {
352392
"aws_t3.medium": aws_t3_medium,
353393
"aws_m5.2xlarge": aws_m5_2xlarge,
@@ -383,4 +423,8 @@ def aws_trn1_32xlarge() -> Resource:
383423
"aws_g6e.48xlarge": aws_g6e_48xlarge,
384424
"aws_trn1.2xlarge": aws_trn1_2xlarge,
385425
"aws_trn1.32xlarge": aws_trn1_32xlarge,
426+
"aws_inf2.xlarge": aws_inf2_xlarge,
427+
"aws_inf2.8xlarge": aws_inf2_8xlarge,
428+
"aws_inf2.24xlarge": aws_inf2_24xlarge,
429+
"aws_inf2.48xlarge": aws_inf2_48xlarge,
386430
}

torchx/specs/test/named_resources_aws_test.py

+29
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,10 @@
3131
aws_g6e_4xlarge,
3232
aws_g6e_8xlarge,
3333
aws_g6e_xlarge,
34+
aws_inf2_24xlarge,
35+
aws_inf2_48xlarge,
36+
aws_inf2_8xlarge,
37+
aws_inf2_xlarge,
3438
aws_m5_2xlarge,
3539
aws_p3_16xlarge,
3640
aws_p3_2xlarge,
@@ -231,6 +235,31 @@ def test_aws_trn1(self) -> None:
231235
self.assertEqual(trn1_32.memMB, trn1_2.memMB * 16)
232236
self.assertEqual({EFA_DEVICE: 8, NEURON_DEVICE: 16}, trn1_32.devices)
233237

238+
def test_aws_inf2(self) -> None:
    """Check cpu, gpu, memMB and devices of every inf2 named resource."""
    # Each row: (factory, expected cpu, expected memMB, expected neuron
    # device count). Rows are checked in order, smallest instance first.
    expectations = (
        (aws_inf2_xlarge, 4, 32 * GiB, 1),
        (aws_inf2_8xlarge, 32, 32 * GiB, 1),
        (aws_inf2_24xlarge, 96, 192 * GiB, 6),
        (aws_inf2_48xlarge, 192, 384 * GiB, 12),
    )
    for make_resource, cpu, mem_mb, neuron_count in expectations:
        resource = make_resource()
        self.assertEqual(cpu, resource.cpu)
        # None of the inf2 resources declare GPUs.
        self.assertEqual(0, resource.gpu)
        self.assertEqual(mem_mb, resource.memMB)
        self.assertEqual({NEURON_DEVICE: neuron_count}, resource.devices)
262+
234263
def test_aws_m5_2xlarge(self) -> None:
235264
resource = aws_m5_2xlarge()
236265
self.assertEqual(8, resource.cpu)

0 commit comments

Comments
 (0)