Commit 11d5497

Add simulation code
1 parent 1af2eae commit 11d5497

File tree: 2 files changed, +895 −0 lines changed
Lines changed: 381 additions & 0 deletions
@@ -0,0 +1,381 @@
from simulate import Simulation, Job, plot_utilization, get_stats
from random import randint, seed, choice
import numpy as np
import sys
from copy import deepcopy
import pandas as pd
import matplotlib.pyplot as plt
import scienceplots
plt.style.use(['science', 'ieee'])
#plt.rcParams.update({'font.size': 18})

seed(42)

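# NOTE: Simulation, Job, plot_utilization and get_stats come from the local
# `simulate` module, which is not part of this diff. The StencilJob subclass
# below assumes Job provides at least `replicas`, `completion_fraction`,
# `min_replicas` and `max_replicas`, and that
# Simulation(total_pes, rescale_gap).simulate(submission_times, jobs) returns
# an event list that get_stats()/plot_utilization() can consume.
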
class StencilJob(Job):
    def __init__(self, job_name, min_replicas, max_replicas, priority, **kwargs):
        super().__init__(job_name, min_replicas, max_replicas, priority)
        self.n = kwargs.pop('n', 16)
        self.max_pes = kwargs.pop('max_pes', 60)
        self.niters = kwargs.pop('niters', 10000)

        # (replicas, per-iteration time in seconds) data points for each problem size n
        self.data = {
            512   : [(2, 0.000996), (4, 0.000675), (8, 0.000627)],
            2048  : [(4, 0.00328), (8, 0.0021), (16, 0.0023)],
            8192  : [(8, 0.0325), (16, 0.0275), (32, 0.016)],
            16384 : [(16, 0.11), (32, 0.064), (59, 0.035)]
        }

        # Fit a piecewise-linear model: one segment per pair of adjacent data points
        self.models = {}
        self.model_replicas = {}
        for n, ndata in self.data.items():
            self.models[n] = [None, None]
            self.model_replicas[n] = [ndata[1][0], ndata[2][0]]
            for i in range(len(ndata)-1):
                x = [ndata[i][0], ndata[i+1][0]]
                y = [ndata[i][1], ndata[i+1][1]]
                A = np.vstack([x, np.ones(len(x))]).T
                m, c = np.linalg.lstsq(A, y, rcond=None)[0]
                self.models[n][i] = (m, c)

        # (replicas, load-balance/rescale time in seconds) data points for each problem size n
        self.lbdata = {
            512   : [(2, 0.006), (4, 0.006), (8, 0.006)],
            2048  : [(4, 0.0097), (8, 0.0097), (16, 0.046659)],
            8192  : [(8, 0.61581), (16, 2.934641), (32, 25.083405)],
            16384 : [(16, 14.601492), (32, 95.771426), (59, 59.793259)]
        }

        self.lbmodels = {}
        self.lbmodel_replicas = {}
        for n, ndata in self.lbdata.items():
            self.lbmodels[n] = [None, None]
            self.lbmodel_replicas[n] = [ndata[1][0], ndata[2][0]]
            for i in range(len(ndata)-1):
                x = [ndata[i][0], ndata[i+1][0]]
                y = [ndata[i][1], ndata[i+1][1]]
                A = np.vstack([x, np.ones(len(x))]).T
                m, c = np.linalg.lstsq(A, y, rcond=None)[0]
                self.lbmodels[n][i] = (m, c)

        self.connect_time_pe = [2, 4, 8, 16, 32, 59]
        self.connect_time_t = [1.248, 1.492, 3.902, 5.035, 6.949, 12.25]
        # Fit linear regression model for connect time
        A = np.vstack([self.connect_time_pe, np.ones(len(self.connect_time_pe))]).T
        self.connect_time_model = np.linalg.lstsq(A, self.connect_time_t, rcond=None)[0]

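    # Worked example of the piecewise-linear per-iteration model built above
    # (used by get_runtime() below): for n = 2048 the two segments are fit over
    # (4, 8) and (8, 16) replicas, so a job running on 6 replicas falls in the
    # first segment and costs roughly
    #     0.00328 + (6 - 4) * (0.0021 - 0.00328) / (8 - 4) ≈ 0.00269 s/iteration.
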
    def get_connect_time(self):
        return self.connect_time_model[0] * self.replicas + self.connect_time_model[1]

    def get_runtime(self):
        models = self.models[self.n]
        replicas = self.model_replicas[self.n]

        for i, r in enumerate(replicas):
            if self.replicas <= r:
                m, c = models[i]
                break  # use the first segment that brackets the current replica count

        return m * self.replicas + c

    def get_completion_time(self):
        return self.get_runtime() * self.niters * (1 - self.completion_fraction)

    def get_rescale_overhead(self):
        models = self.lbmodels[self.n]
        replicas = self.lbmodel_replicas[self.n]

        for i, r in enumerate(replicas):
            if self.replicas <= r:
                m, c = models[i]
                break  # use the first segment that brackets the current replica count

        lbtime = m * self.replicas + c

        #print("LBTIME:", self.replicas, lbtime)

        return lbtime + self.get_connect_time()

    def get_startup_overhead(self):
        return self.get_connect_time()

    def update_runtime(self, runtime):
        # Advance completion_fraction by the share of the remaining work that
        # `runtime` seconds at the current replica count covers.
        fraction = runtime / (self.get_runtime() * self.niters * (1 - self.completion_fraction))
        #print(self.completion_fraction, fraction)
        self.completion_fraction += fraction * (1 - self.completion_fraction)

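
# Example (sketch) of how a single job is built and simulated with the helpers
# below; names and values mirror generate_job_list()/__main__:
#   job = StencilJob("charm-large-0", 8, 32, 3, n=8192, niters=40000)
#   sim = Simulation(60, 3 * 60)
#   stats = get_stats(sim.simulate([0], [job]), 60)
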
def generate_job_list(nexps):
    jobs = []
    indices = [0, 1, 2, 3]
    sizes_per_pe = [256, 512, 1024, 1024]
    min_pes = [2, 4, 8, 16]
    timesteps_per_job = [40000, 40000, 40000, 10000]
    job_prefixes = ["small", "medium", "large", "xlarge"]
    njobs = 16

    for n in range(nexps):
        counts = [0, 0, 0, 0]
        jobs_list = []
        for i in range(njobs):
            idx = choice(indices)
            priority = (3 - idx) + randint(1, 2)
            #priority = randint(1, 5)
            min_replicas = min_pes[idx]
            max_replicas = min(4 * min_replicas, 59)
            problem_size = min_replicas * sizes_per_pe[idx]
            timesteps = timesteps_per_job[idx] #+ 100 * randint(0, 10)
            prefix = job_prefixes[idx]
            #create_job(prefix, i, priority, problem_size, min_replicas, max_replicas, timesteps)

            jobs_list.append(StencilJob("charm-%s-%i" % (prefix, counts[idx]), min_replicas, max_replicas,
                                        priority, n=problem_size, niters=timesteps))
            counts[idx] += 1
        jobs.append(jobs_list)
    return jobs

def run_simulation(jobs_list, mode, max_pes, job_submission_time, rescale_gap):
    if mode == "min_replicas":
        for job in jobs_list:
            job.max_replicas = job.min_replicas
    elif mode == "max_replicas":
        for job in jobs_list:
            job.min_replicas = job.max_replicas

    if mode == "elastic":
        simulator = Simulation(60, rescale_gap)
    else:
        simulator = Simulation(60, 100000 * 60)

    events = simulator.simulate([job_submission_time*i for i in range(len(jobs_list))], jobs_list)
    return get_stats(events, max_pes)

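# Scheduling modes compared below (as implemented in run_simulation above):
#   "min_replicas" - rigid jobs pinned to their minimum replica count
#   "max_replicas" - rigid jobs pinned to their maximum replica count
#   "elastic"      - jobs keep their [min, max] range and may be rescaled every
#                    `rescale_gap` seconds
#   "moldable"     - jobs keep their [min, max] range, but the rescale gap is set
#                    so large that the allocation chosen at launch never changes
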
def vary_submission_time(modes, max_pes):
    nexperiments = 100
    jobs = generate_job_list(nexperiments)

    total, response, completion, util = {}, {}, {}, {}
    for m in modes:
        total[m] = []
        response[m] = []
        completion[m] = []
        util[m] = []

    submission_times = [30*i for i in range(13)]
    for t in submission_times:
        for m in modes:
            print(f"Running simulation for mode {m} with submission time {t}")
            final_times, mean_responses, mean_completions, utilizations = [], [], [], []
            for i in range(nexperiments):
                final_time, mean_response, mean_completion, utilization = run_simulation(deepcopy(jobs[i]), m, max_pes, t, 5*60)
                final_times.append(final_time)
                mean_responses.append(mean_response)
                mean_completions.append(mean_completion)
                utilizations.append(utilization)

            avg_final_time = np.mean(final_times)
            avg_mean_response = np.mean(mean_responses)
            avg_mean_completion = np.mean(mean_completions)
            avg_utilization = np.mean(utilizations)

            total[m].append(avg_final_time)
            response[m].append(avg_mean_response)
            completion[m].append(avg_mean_completion)
            util[m].append(avg_utilization)

    data = {
        'submission_time': submission_times
    }

    for m in modes:
        data[f'total_{m}'] = total[m]
        data[f'response_{m}'] = response[m]
        data[f'completion_{m}'] = completion[m]
        data[f'util_{m}'] = util[m]

    df = pd.DataFrame(data)
    df.to_csv('/home/aditya/mpi-operator/examples/v2beta1/charm/simulation/results.csv', index=False)

    markers = ['o', 's', 'D', '^', 'v', '<', '>', 'p', '*', 'h', 'H', 'x', 'd', '|', '_']

    for i, m in enumerate(modes):
        plt.plot(submission_times, total[m], label=m)
    plt.xlabel('Submission Gap (s)')
    plt.ylabel('Total Time (s)')
    #plt.title('Total Time vs Submission Gap')
    plt.legend()
    plt.grid(True)
    plt.savefig('/home/aditya/mpi-operator/examples/v2beta1/charm/simulation/total_time_plot.pdf')
    plt.close()

    for i, m in enumerate(modes):
        plt.plot(submission_times, response[m], label=m)
    plt.xlabel('Submission Gap (s)')
    plt.ylabel('Response Time (s)')
    #plt.title('Response Time vs Submission Gap')
    plt.legend()
    plt.grid(True)
    plt.savefig('/home/aditya/mpi-operator/examples/v2beta1/charm/simulation/response_time_plot.pdf')
    plt.close()

    for i, m in enumerate(modes):
        plt.plot(submission_times, util[m], label=m)
    plt.xlabel('Submission Gap (s)')
    plt.ylabel('Utilization')
    #plt.title('Utilization vs Submission Gap')
    plt.legend()
    plt.grid(True)
    plt.savefig('/home/aditya/mpi-operator/examples/v2beta1/charm/simulation/utilization_plot.pdf')
    plt.close()

    for i, m in enumerate(modes):
        plt.plot(submission_times, completion[m], label=m)
    plt.xlabel('Submission Gap (s)')
    plt.ylabel('Completion Time (s)')
    #plt.title('Completion Time vs Submission Gap')
    plt.legend()
    plt.grid(True)
    plt.savefig('/home/aditya/mpi-operator/examples/v2beta1/charm/simulation/completion_time_plot.pdf')
    plt.close()

    print(df)

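# Non-elastic modes are insensitive to the rescale gap, so they are simulated
# once (at the first gap value) and that result is reused for every other gap
# (see the early `continue` below).
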
def vary_rescale_gap(modes, max_pes):
    nexperiments = 100
    jobs = generate_job_list(nexperiments)

    total, response, completion, util = {}, {}, {}, {}
    for m in modes:
        total[m] = []
        response[m] = []
        completion[m] = []
        util[m] = []

    rescale_gaps = [120*i for i in range(11)]
    for t in rescale_gaps:
        for m in modes:
            if m != "elastic" and len(total[m]) > 0:
                total[m].append(total[m][0])
                response[m].append(response[m][0])
                completion[m].append(completion[m][0])
                util[m].append(util[m][0])
                continue
            print(f"Running simulation for mode {m} with rescale gap {t}")
            final_times, mean_responses, mean_completions, utilizations = [], [], [], []
            for i in range(nexperiments):
                final_time, mean_response, mean_completion, utilization = run_simulation(deepcopy(jobs[i]), m, max_pes, 180, t)
                final_times.append(final_time)
                mean_responses.append(mean_response)
                mean_completions.append(mean_completion)
                utilizations.append(utilization)

            avg_final_time = np.mean(final_times)
            avg_mean_response = np.mean(mean_responses)
            avg_mean_completion = np.mean(mean_completions)
            avg_utilization = np.mean(utilizations)

            total[m].append(avg_final_time)
            response[m].append(avg_mean_response)
            completion[m].append(avg_mean_completion)
            util[m].append(avg_utilization)

    data = {
        'rescale_gap': rescale_gaps
    }

    for m in modes:
        data[f'total_{m}'] = total[m]
        data[f'response_{m}'] = response[m]
        data[f'completion_{m}'] = completion[m]
        data[f'util_{m}'] = util[m]

    df = pd.DataFrame(data)
    df.to_csv('/home/aditya/mpi-operator/examples/v2beta1/charm/simulation/results_rescale.csv', index=False)
    markers = ['x', '', '', '', 'v', '<', '>', 'p', '*', 'h', 'H', 'o', 'd', '|', '_']

    #plt.figure(figsize=(12, 8))

    for i, m in enumerate(modes):
        plt.plot(rescale_gaps, total[m], label=m, marker=markers[i % len(markers)])
    plt.xlabel('Rescale Gap (s)')
    plt.ylabel('Total Time (s)')
    #plt.title('Total Time vs Rescale gap')
    plt.legend()
    plt.grid(True)
    plt.savefig('/home/aditya/mpi-operator/examples/v2beta1/charm/simulation/total_time_plot_rescale.pdf')
    #plt.show()
    plt.close()

    for i, m in enumerate(modes):
        plt.plot(rescale_gaps, response[m], label=m, marker=markers[i % len(markers)])
    plt.xlabel('Rescale Gap (s)')
    plt.ylabel('Response Time (s)')
    #plt.title('Response Time vs Rescale gap')
    plt.legend()
    plt.grid(True)
    plt.savefig('/home/aditya/mpi-operator/examples/v2beta1/charm/simulation/response_time_plot_rescale.pdf')
    #plt.show()
    plt.close()

    #plt.figure(figsize=(12, 8))

    for i, m in enumerate(modes):
        plt.plot(rescale_gaps, util[m], label=m, marker=markers[i % len(markers)])
    plt.xlabel('Rescale Gap (s)')
    plt.ylabel('Utilization')
    #plt.title('Utilization vs Rescale gap')
    plt.legend()
    plt.grid(True)
    plt.savefig('/home/aditya/mpi-operator/examples/v2beta1/charm/simulation/utilization_plot_rescale.pdf')
    #plt.show()
    plt.close()

    #plt.figure(figsize=(12, 8))

    for i, m in enumerate(modes):
        plt.plot(rescale_gaps, completion[m], label=m, marker=markers[i % len(markers)])
    plt.xlabel('Rescale Gap (s)')
    plt.ylabel('Completion Time (s)')
    #plt.title('Completion Time vs Rescale gap')
    plt.legend()
    plt.grid(True)
    plt.savefig('/home/aditya/mpi-operator/examples/v2beta1/charm/simulation/completion_time_plot_rescale.pdf')
    #plt.show()
    plt.close()

    print(df)

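# Main driver: build one fixed 16-job workload (indices into the size classes
# above, every job pinned to its max replica count, submitted 90 s apart),
# print its summary stats, then sweep the submission gap over the four modes.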
if __name__ == '__main__':
    sizes = [256, 512, 1024, 1024]
    sizes_per_pe = [256, 512, 1024, 1024]
    min_pes = [2, 4, 8, 16]
    timesteps_per_job = [40000, 40000, 40000, 10000]
    job_prefixes = ["small", "medium", "large", "xlarge"]
    counts = [0, 0, 0, 0]
    njobs = 16
    #njobs = 9

    jobs = [2, 3, 2, 1, 0, 1, 1, 1, 0, 1, 1, 2, 0, 1, 1, 1]
    #jobs = [2, 3, 2, 1, 0, 1, 1, 1, 0]

    jobs_list = []
    for i, job_index in enumerate(jobs):
        idx = job_index
        priority = (3 - job_index) + randint(1, 2)
        min_replicas = min_pes[idx]
        max_replicas = min(4 * min_replicas, 59)
        problem_size = min_replicas * sizes_per_pe[idx]
        timesteps = timesteps_per_job[idx] #+ 100 * randint(0, 10)
        prefix = job_prefixes[idx]
        #create_job(prefix, i, priority, problem_size, min_replicas, max_replicas, timesteps)

        jobs_list.append(StencilJob("charm-%s-%i" % (prefix, counts[idx]), max_replicas, max_replicas,
                                    priority, n=problem_size, niters=timesteps))
        counts[idx] += 1

    simulator = Simulation(60, 3 * 60)
    events = simulator.simulate([90*i for i in range(njobs)], jobs_list)
    jobs = ["charm-small-%i" % i for i in range(16)] + ["charm-medium-%i" % i for i in range(16)] + \
           ["charm-large-%i" % i for i in range(16)] + ["charm-xlarge-%i" % i for i in range(16)]
    #plot_utilization(events, jobs, 60)
    #print(events)
    print(get_stats(events, 60))
    #vary_rescale_gap(["elastic", "moldable", "min_replicas", "max_replicas"], 60)
    vary_submission_time(["elastic", "moldable", "min_replicas", "max_replicas"], 60)
