Skip to content

Commit cda4940

Browse files
committed
More updates
1 parent 11d5497 commit cda4940

File tree

10 files changed

+194
-81
lines changed

10 files changed

+194
-81
lines changed

Dockerfile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,12 @@ ADD . /go/src/github.com/kubeflow/mpi-operator
99
WORKDIR /go/src/github.com/kubeflow/mpi-operator
1010
RUN apt update
1111
RUN apt install -y build-essential cmake zlib1g-dev
12+
RUN apt update \
13+
&& apt install -y --no-install-recommends \
14+
g++ \
15+
gfortran \
16+
libopenmpi-dev \
17+
&& rm -rf /var/lib/apt/lists/*
1218
RUN make RELEASE_VERSION=${RELEASE_VERSION} mpi-operator.$VERSION
1319
RUN ln -s mpi-operator.${VERSION} _output/cmd/bin/mpi-operator
1420

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,5 +213,5 @@ volcano-scheduler-deploy: volcano-scheduler-crd
213213

214214
.PHONY: charm
215215
charm: ${BIN_DIR}
216-
cd $(PROJECT_DIR)/dep-libs/charm && git checkout kubernetes && ./build charm++ netlrts-linux-x86_64 --enable-shrinkexpand -j8 --force
216+
cd $(PROJECT_DIR)/dep-libs/charm && git checkout shrinkexpand-mpi && ./build charm++ mpi-linux-x86_64 --enable-shrinkexpand -j8 --force --with-production
217217
cd pkg/controller && $(PROJECT_DIR)/dep-libs/charm/bin/charmc -language c++ -seq -o ${PROJECT_DIR}/${BIN_DIR}/rescale_client rescale_client.C -lccs-client

deploy/v2beta1/mpi-operator.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8241,7 +8241,7 @@ spec:
82418241
- args:
82428242
- -alsologtostderr
82438243
- --lock-namespace=mpi-operator
8244-
image: adityapb/mpi-operator:rescale-gap
8244+
image: adityapb/mpi-operator:mpi-imm
82458245
imagePullPolicy: Always
82468246
name: mpi-operator
82478247
serviceAccountName: mpi-operator

examples/v2beta1/charm/charm-0.yaml

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ metadata:
66
spec:
77
slotsPerWorker: 1
88
priority: 1
9+
runLauncherAsWorker: true
910
runPolicy:
1011
cleanPodPolicy: Running
1112
ttlSecondsAfterFinished: 600
@@ -16,33 +17,40 @@ spec:
1617
template:
1718
spec:
1819
containers:
19-
- image: adityapb/charm-netlrts:dev-new
20+
- image: adityapb/charm-mpi:latest
2021
imagePullPolicy: Always
2122
name: mpi-launcher
2223
securityContext:
2324
runAsUser: 1000
2425
command:
25-
- /app/charmrun
26+
- /app/charmrun_elastic
2627
args:
2728
- /app/jacobi2d
28-
- "8192"
29+
- "32768"
2930
- "512"
30-
- "600"
31+
- "10000"
3132
- +balancer
32-
- MetisLB
33+
- GreedyCentralLB
3334
- +LBDebug
3435
- "3"
35-
- ++verbose
3636
resources:
3737
limits:
3838
cpu: 1
39+
volumeMounts:
40+
- mountPath: /dev/shm
41+
name: shm-volume
42+
volumes:
43+
- name: shm-volume
44+
emptyDir:
45+
medium: Memory
46+
sizeLimit: 1.5Gi
3947
Worker:
40-
minReplicas: 1
41-
maxReplicas: 10
48+
minReplicas: 15
49+
maxReplicas: 31
4250
template:
4351
spec:
4452
containers:
45-
- image: adityapb/charm-netlrts:dev-new
53+
- image: adityapb/charm-mpi:latest
4654
imagePullPolicy: Always
4755
name: mpi-worker
4856
volumeMounts:
@@ -63,4 +71,4 @@ spec:
6371
- name: shm-volume
6472
emptyDir:
6573
medium: Memory
66-
sizeLimit: 1Gi
74+
sizeLimit: 1.5Gi

examples/v2beta1/charm/charm-2.yaml

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ metadata:
66
spec:
77
slotsPerWorker: 1
88
priority: 3
9+
runLauncherAsWorker: true
910
runPolicy:
1011
cleanPodPolicy: Running
1112
ttlSecondsAfterFinished: 600
@@ -16,35 +17,45 @@ spec:
1617
template:
1718
spec:
1819
containers:
19-
- image: adityapb/charm-netlrts:dev-new
20+
- image: adityapb/charm-mpi:latest
2021
imagePullPolicy: Always
2122
name: mpi-launcher
2223
securityContext:
2324
runAsUser: 1000
2425
command:
25-
- /app/charmrun
26+
- /app/charmrun_elastic
2627
args:
2728
- /app/jacobi2d
28-
- "4096"
29-
- "512"
30-
- "300"
29+
- "2048"
30+
- "128"
31+
- "100"
3132
- +balancer
32-
- MetisLB
33+
- GreedyCentralLB
3334
- +LBDebug
3435
- "3"
35-
- ++verbose
3636
resources:
3737
limits:
3838
cpu: 1
39+
volumeMounts:
40+
- mountPath: /dev/shm
41+
name: shm-volume
42+
volumes:
43+
- name: shm-volume
44+
emptyDir:
45+
medium: Memory
46+
sizeLimit: 1Gi
3947
Worker:
40-
minReplicas: 1
41-
maxReplicas: 10
48+
minReplicas: 43
49+
maxReplicas: 43
4250
template:
4351
spec:
4452
containers:
45-
- image: adityapb/charm-netlrts:dev-new
53+
- image: adityapb/charm-mpi:latest
4654
imagePullPolicy: Always
4755
name: mpi-worker
56+
volumeMounts:
57+
- mountPath: /dev/shm
58+
name: shm-volume
4859
securityContext:
4960
runAsUser: 1000
5061
command:
@@ -56,3 +67,8 @@ spec:
5667
resources:
5768
limits:
5869
cpu: 1
70+
volumes:
71+
- name: shm-volume
72+
emptyDir:
73+
medium: Memory
74+
sizeLimit: 1Gi

examples/v2beta1/charm/generate_jobs.py

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
timesteps_per_job = [1000, 2000, 500]
1212
job_prefixes = ["small", "medium", "large"]
1313

14-
njobs = 20
14+
njobs = 16
1515

1616

1717
def create_job(prefix, job_index, priority, problem_size, min_replicas, max_replicas, timesteps):
@@ -30,40 +30,45 @@ def create_job(prefix, job_index, priority, problem_size, min_replicas, max_repl
3030
problem_size=problem_size,
3131
chare_size=chare_size,
3232
timesteps=timesteps,
33-
min_replicas=min_replicas,
34-
max_replicas=max_replicas,
33+
min_replicas=min(max_replicas-1, 59),
34+
max_replicas=min(max_replicas-1, 59),
3535
)
3636

37-
with open(f"jobs/charm-job-{job_index}.yaml", "w") as file:
37+
with open(f"jobs_elastic/charm-job-{job_index}.yaml", "w") as file:
3838
file.write(job_yaml)
3939

4040

4141
def generate_jobs():
42-
jobs = [0] * int(njobs * 0.25)
43-
jobs += [1] * int(njobs * 0.5)
44-
jobs += [2] * int(njobs * 0.25)
45-
shuffle(jobs)
46-
print(jobs)
42+
sizes_per_pe = [256, 512, 1024, 1024]
43+
min_pes = [2, 4, 8, 16]
44+
timesteps_per_job = [40000, 40000, 40000, 10000]
45+
job_prefixes = ["small", "medium", "large", "xlarge"]
46+
counts = [0, 0, 0, 0]
47+
njobs = 16
48+
49+
jobs = [2, 1, 1, 0, 3, 3, 0, 3, 1, 1, 0, 3, 0, 1, 1, 1]
50+
priorities = [2, 3, 4, 4, 2, 1, 4, 1, 4, 3, 4, 1, 5, 3, 3, 3]
51+
4752
for i, job_index in enumerate(jobs):
4853
idx = job_index
49-
priority = 3 - idx + randint(0, 3)
54+
priority = priorities[i]
5055
min_replicas = min_pes[idx]
5156
max_replicas = 4 * min_replicas
5257
problem_size = min_replicas * sizes_per_pe[idx]
53-
timesteps = timesteps_per_job[idx] + 100 * randint(0, 10)
58+
timesteps = timesteps_per_job[idx]
5459
prefix = job_prefixes[idx]
5560
create_job(prefix, i, priority, problem_size, min_replicas, max_replicas, timesteps)
5661

5762

5863
def submit_jobs():
5964
for job_index in range(njobs):
60-
job_file = f"jobs/charm-job-{job_index}.yaml"
65+
job_file = f"jobs_elastic/charm-job-{job_index}.yaml"
6166
print(f"Submitting {job_file}")
6267
# Submit the job to the cluster with kubectl (swap in your cluster's own submission command if it differs)
6368
os.system(f"kubectl apply -f {job_file}")
6469
# For a dry run, disable the os.system call above and print the command instead:
6570
#print(f"kubectl apply -f {job_file}")
66-
time.sleep(10)
71+
time.sleep(90)
6772

6873

6974
if __name__ == "__main__":

0 commit comments

Comments
 (0)