1+ from simulate import Simulation , Job , plot_utilization , get_stats
2+ from random import randint , seed , choice
3+ import numpy as np
4+ import sys
5+ from copy import deepcopy
6+ import pandas as pd
7+ import matplotlib .pyplot as plt
8+ import scienceplots
# Style sheets applied to every figure produced by this script
# (presumably made available by the `scienceplots` import above).
_PLOT_STYLES = ['science', 'ieee']
plt.style.use(_PLOT_STYLES)
#plt.rcParams.update({'font.size': 18})

# Fixed seed so the randomly generated workloads are reproducible.
seed(42)
13+
class StencilJob(Job):
    """Job whose cost model is interpolated from tabulated measurements.

    Three cost models are built at construction time:

    * ``models`` / ``model_replicas``: piecewise-linear time per iteration as
      a function of the replica count, one table per problem size ``n``.
    * ``lbmodels`` / ``lbmodel_replicas``: piecewise-linear rescale
      (load-balancing) time, same layout.
    * ``connect_time_model``: a single least-squares line for connect time.

    Keyword arguments (all optional, popped from ``**kwargs``):
        n: problem size; must be a key of ``self.data`` (512, 2048, 8192 or
            16384) for the runtime/rescale queries to work.  The default of
            16 has no table, so those queries raise ``KeyError`` for it.
        max_pes: stored on the instance; not used by this class itself.
        niters: number of iterations the job runs (default 10000).

    ``self.replicas`` and ``self.completion_fraction`` are read here but not
    assigned; presumably they are maintained by ``Job`` / the simulator.
    """

    def __init__(self, job_name, min_replicas, max_replicas, priority, **kwargs):
        super().__init__(job_name, min_replicas, max_replicas, priority)
        self.n = kwargs.pop('n', 16)
        self.max_pes = kwargs.pop('max_pes', 60)
        self.niters = kwargs.pop('niters', 10000)

        # problem size -> [(replicas, time per iteration), ...]
        self.data = {
            512: [(2, 0.000996), (4, 0.000675), (8, 0.000627)],
            2048: [(4, 0.00328), (8, 0.0021), (16, 0.0023)],
            8192: [(8, 0.0325), (16, 0.0275), (32, 0.016)],
            16384: [(16, 0.11), (32, 0.064), (59, 0.035)]
        }
        self.models, self.model_replicas = self._fit_segments(self.data)

        # problem size -> [(replicas, load-balancing time), ...]
        self.lbdata = {
            512: [(2, 0.006), (4, 0.006), (8, 0.006)],
            2048: [(4, 0.0097), (8, 0.0097), (16, 0.046659)],
            8192: [(8, 0.61581), (16, 2.934641), (32, 25.083405)],
            16384: [(16, 14.601492), (32, 95.771426), (59, 59.793259)]
        }
        self.lbmodels, self.lbmodel_replicas = self._fit_segments(self.lbdata)

        self.connect_time_pe = [2, 4, 8, 16, 32, 59]
        self.connect_time_t = [1.248, 1.492, 3.902, 5.035, 6.949, 12.25]
        # Fit linear regression model for connect time
        self.connect_time_model = self._fit_line(self.connect_time_pe,
                                                 self.connect_time_t)

    @staticmethod
    def _fit_line(x, y):
        """Return the least-squares ``[slope, intercept]`` array for y over x."""
        A = np.vstack([x, np.ones(len(x))]).T
        return np.linalg.lstsq(A, y, rcond=None)[0]

    @staticmethod
    def _fit_segments(table):
        """Fit one line per pair of consecutive points of every table entry.

        Returns ``(models, breakpoints)``: ``models[n][i]`` is the
        ``(slope, intercept)`` of the segment between points ``i`` and
        ``i + 1``, and ``breakpoints[n][i]`` is the replica count at that
        segment's upper end.
        """
        models, breakpoints = {}, {}
        for n, points in table.items():
            breakpoints[n] = [replicas for replicas, _ in points[1:]]
            models[n] = [
                tuple(StencilJob._fit_line([lo[0], hi[0]], [lo[1], hi[1]]))
                for lo, hi in zip(points, points[1:])
            ]
        return models, breakpoints

    @staticmethod
    def _evaluate_piecewise(models, breakpoints, replicas):
        """Evaluate the piecewise-linear model at ``replicas``.

        The FIRST segment whose upper breakpoint covers ``replicas`` is used.
        (Scanning without stopping at the first match would let a later
        segment overwrite the choice, so the first fitted segment would never
        be applied.)  Replica counts beyond the last breakpoint are
        extrapolated with the last segment instead of failing on unbound
        coefficients.
        """
        m, c = models[-1]
        for segment, upper in zip(models, breakpoints):
            if replicas <= upper:
                m, c = segment
                break
        return m * replicas + c

    def get_connect_time(self):
        """Return the fitted connect time for the current replica count."""
        slope, intercept = self.connect_time_model[0], self.connect_time_model[1]
        return slope * self.replicas + intercept

    def get_runtime(self):
        """Return the modelled time per iteration at the current replica count.

        Raises:
            KeyError: if ``self.n`` has no measurement table.
        """
        return self._evaluate_piecewise(self.models[self.n],
                                        self.model_replicas[self.n],
                                        self.replicas)

    def get_completion_time(self):
        """Return the time needed to finish the not-yet-completed iterations."""
        return self.get_runtime() * self.niters * (1 - self.completion_fraction)

    def get_rescale_overhead(self):
        """Return the rescale cost: load-balancing time plus connect time.

        Raises:
            KeyError: if ``self.n`` has no measurement table.
        """
        lbtime = self._evaluate_piecewise(self.lbmodels[self.n],
                                          self.lbmodel_replicas[self.n],
                                          self.replicas)

        #print("LBTIME:", self.replicas, lbtime)

        return lbtime + self.get_connect_time()

    def get_startup_overhead(self):
        """Return the startup cost, which is just the connect time."""
        return self.get_connect_time()

    def update_runtime(self, runtime):
        """Advance ``completion_fraction`` after the job ran for ``runtime``.

        ``fraction`` is the share of the *remaining* work covered by
        ``runtime`` at the current per-iteration time; it is scaled back by
        the remaining share before being added to the overall fraction.
        """
        fraction = runtime / (self.get_runtime() * self.niters * (1 - self.completion_fraction))
        #print(self.completion_fraction, fraction)
        self.completion_fraction += fraction * (1 - self.completion_fraction)
102+
103+
def generate_job_list(nexps):
    """Create ``nexps`` random workloads of 16 ``StencilJob`` objects each.

    Every job is drawn uniformly from four profiles.  Its priority is
    ``(3 - profile index)`` plus a random 1 or 2, its maximum replica count
    is four times the minimum (capped at 59), and its problem size is the
    minimum replica count times the profile's size per PE.

    Returns:
        A list of ``nexps`` lists of jobs.
    """
    # (name prefix, minimum PEs, size per PE, timesteps)
    profiles = [
        ("small", 2, 256, 40000),
        ("medium", 4, 512, 40000),
        ("large", 8, 1024, 40000),
        ("xlarge", 16, 1024, 10000),
    ]
    profile_ids = [0, 1, 2, 3]
    jobs_per_experiment = 16

    experiments = []
    for _ in range(nexps):
        # Per-profile counter used to number the job names.
        seen = [0, 0, 0, 0]
        workload = []
        for _ in range(jobs_per_experiment):
            # Draw order matters for reproducibility: profile first, then priority.
            kind = choice(profile_ids)
            priority = (3 - kind) + randint(1, 2)
            #priority = randint(1, 5)
            prefix, min_replicas, size_per_pe, timesteps = profiles[kind]
            max_replicas = min(4 * min_replicas, 59)
            problem_size = min_replicas * size_per_pe
            job_name = "charm-%s-%i" % (prefix, seen[kind])

            workload.append(StencilJob(job_name, min_replicas, max_replicas,
                                       priority, n=problem_size, niters=timesteps))
            seen[kind] += 1
        experiments.append(workload)
    return experiments
132+
133+
def run_simulation(jobs_list, mode, max_pes, job_submission_time, rescale_gap):
    """Simulate one workload under ``mode`` and return its ``get_stats`` result.

    Args:
        jobs_list: jobs to simulate.  They are modified in place for the
            "min_replicas" / "max_replicas" modes, so pass a copy if the
            originals are needed again.
        mode: "min_replicas" pins every job to its minimum replica count,
            "max_replicas" pins it to its maximum; any other value leaves
            the bounds alone.  Only "elastic" uses ``rescale_gap``.
        max_pes: forwarded to ``get_stats``.
        job_submission_time: gap between consecutive submissions; job ``i``
            is submitted at ``i * job_submission_time``.
        rescale_gap: second ``Simulation`` argument in "elastic" mode.
    """
    if mode == "min_replicas":
        for job in jobs_list:
            job.max_replicas = job.min_replicas
    elif mode == "max_replicas":
        for job in jobs_list:
            job.min_replicas = job.max_replicas

    # Non-elastic modes get a very large gap (presumably so that rescaling
    # never triggers within a run).
    effective_gap = rescale_gap if mode == "elastic" else 100000 * 60

    # NOTE(review): 60 is hard-coded here while `max_pes` is only passed to
    # get_stats; confirm whether Simulation's first argument should be max_pes.
    simulator = Simulation(60, effective_gap)

    submit_times = [job_submission_time * position for position in range(len(jobs_list))]
    events = simulator.simulate(submit_times, jobs_list)
    return get_stats(events, max_pes)
149+
def vary_submission_time(modes, max_pes, output_dir=None, nexperiments=100):
    """Sweep the job submission gap and record averaged statistics per mode.

    For every submission gap (0 to 360 in steps of 30) and every mode, each
    generated workload is simulated with a rescale gap of ``5 * 60`` and the
    four values returned by ``run_simulation`` are averaged over the
    workloads.  The averages are written to ``results.csv`` and plotted into
    four PDF files inside ``output_dir``; the table is also printed.

    Args:
        modes: scheduling mode names passed to ``run_simulation``.
        max_pes: forwarded to ``run_simulation``.
        output_dir: directory for the CSV and PDF files.  Defaults to the
            location this script has always written to.  It is created if it
            does not exist.
        nexperiments: number of random workloads to average over (>= 1).

    CSV columns are ``submission_time`` followed by ``total_<mode>``,
    ``response_<mode>``, ``completion_<mode>`` and ``util_<mode>`` per mode.
    The completion column previously lacked the underscore, unlike the
    other three; it now follows the same pattern.
    """
    import os  # local import keeps this function self-contained

    if output_dir is None:
        output_dir = '/home/aditya/mpi-operator/examples/v2beta1/charm/simulation'
    os.makedirs(output_dir, exist_ok=True)

    jobs = generate_job_list(nexperiments)

    # metric name -> mode -> one averaged value per submission gap
    metric_names = ('total', 'response', 'completion', 'util')
    results = {name: {m: [] for m in modes} for name in metric_names}

    submission_times = [30 * i for i in range(13)]
    for t in submission_times:
        for m in modes:
            print(f"Running simulation for mode {m} with submission time {t}")
            # deepcopy: run_simulation modifies the jobs it is given.
            runs = [run_simulation(deepcopy(workload), m, max_pes, t, 5 * 60)
                    for workload in jobs]
            final_times, mean_responses, mean_completions, utilizations = zip(*runs)

            results['total'][m].append(np.mean(final_times))
            results['response'][m].append(np.mean(mean_responses))
            results['completion'][m].append(np.mean(mean_completions))
            results['util'][m].append(np.mean(utilizations))

    data = {'submission_time': submission_times}
    for m in modes:
        for name in metric_names:
            data[f'{name}_{m}'] = results[name][m]

    df = pd.DataFrame(data)
    df.to_csv(os.path.join(output_dir, 'results.csv'), index=False)

    # (metric, y-axis label, output file), in the order the figures are written
    figures = [
        ('total', 'Total Time (s)', 'total_time_plot.pdf'),
        ('response', 'Response Time (s)', 'response_time_plot.pdf'),
        ('util', 'Utilization', 'utilization_plot.pdf'),
        ('completion', 'Completion Time (s)', 'completion_time_plot.pdf'),
    ]
    for name, ylabel, filename in figures:
        for m in modes:
            plt.plot(submission_times, results[name][m], label=m)
        plt.xlabel('Submission Gap (s)')
        plt.ylabel(ylabel)
        plt.legend()
        plt.grid(True)
        plt.savefig(os.path.join(output_dir, filename))
        plt.close()

    print(df)
239+
def vary_rescale_gap(modes, max_pes, output_dir=None, nexperiments=100):
    """Sweep the rescale gap and record averaged statistics per mode.

    For every rescale gap (0 to 1200 in steps of 120) each generated
    workload is simulated with a fixed submission gap of 180 and the four
    values returned by ``run_simulation`` are averaged over the workloads.
    Only the "elastic" mode is re-simulated for every gap; every other mode
    is simulated once and its first result is repeated, since
    ``run_simulation`` only passes the rescale gap on in "elastic" mode.
    The averages are written to ``results_rescale.csv`` and plotted into
    four ``*_rescale.pdf`` files inside ``output_dir``; the table is also
    printed.

    Args:
        modes: scheduling mode names passed to ``run_simulation``.
        max_pes: forwarded to ``run_simulation``.
        output_dir: directory for the CSV and PDF files.  Defaults to the
            location this script has always written to.  It is created if it
            does not exist.
        nexperiments: number of random workloads to average over (>= 1).

    CSV columns are ``rescale_gap`` followed by ``total_<mode>``,
    ``response_<mode>``, ``completion_<mode>`` and ``util_<mode>`` per mode.
    The completion column previously lacked the underscore, unlike the
    other three; it now follows the same pattern.
    """
    import os  # local import keeps this function self-contained

    if output_dir is None:
        output_dir = '/home/aditya/mpi-operator/examples/v2beta1/charm/simulation'
    os.makedirs(output_dir, exist_ok=True)

    jobs = generate_job_list(nexperiments)

    # metric name -> mode -> one averaged value per rescale gap
    metric_names = ('total', 'response', 'completion', 'util')
    results = {name: {m: [] for m in modes} for name in metric_names}

    rescale_gaps = [120 * i for i in range(11)]
    for t in rescale_gaps:
        for m in modes:
            if m != "elastic" and results['total'][m]:
                # Already simulated once: repeat the first data point.
                for name in metric_names:
                    results[name][m].append(results[name][m][0])
                continue

            print(f"Running simulation for mode {m} with rescale gap {t}")
            # deepcopy: run_simulation modifies the jobs it is given.
            runs = [run_simulation(deepcopy(workload), m, max_pes, 180, t)
                    for workload in jobs]
            final_times, mean_responses, mean_completions, utilizations = zip(*runs)

            results['total'][m].append(np.mean(final_times))
            results['response'][m].append(np.mean(mean_responses))
            results['completion'][m].append(np.mean(mean_completions))
            results['util'][m].append(np.mean(utilizations))

    data = {'rescale_gap': rescale_gaps}
    for m in modes:
        for name in metric_names:
            data[f'{name}_{m}'] = results[name][m]

    df = pd.DataFrame(data)
    df.to_csv(os.path.join(output_dir, 'results_rescale.csv'), index=False)

    # One marker per mode position; '' draws the line without markers.
    markers = ['x', '', '', '', 'v', '<', '>', 'p', '*', 'h', 'H', 'o', 'd', '|', '_']

    # (metric, y-axis label, output file), in the order the figures are written
    figures = [
        ('total', 'Total Time (s)', 'total_time_plot_rescale.pdf'),
        ('response', 'Response Time (s)', 'response_time_plot_rescale.pdf'),
        ('util', 'Utilization', 'utilization_plot_rescale.pdf'),
        ('completion', 'Completion Time (s)', 'completion_time_plot_rescale.pdf'),
    ]
    for name, ylabel, filename in figures:
        for i, m in enumerate(modes):
            plt.plot(rescale_gaps, results[name][m], label=m,
                     marker=markers[i % len(markers)])
        plt.xlabel('Rescale Gap (s)')
        plt.ylabel(ylabel)
        plt.legend()
        plt.grid(True)
        plt.savefig(os.path.join(output_dir, filename))
        #plt.show()
        plt.close()

    print(df)
344+
if __name__ == '__main__':
    # Script entry point: simulate one fixed 16-job workload, print its
    # statistics, then run the submission-gap sweep over all four modes.

    # (name prefix, minimum PEs, size per PE, timesteps)
    profiles = [
        ("small", 2, 256, 40000),
        ("medium", 4, 512, 40000),
        ("large", 8, 1024, 40000),
        ("xlarge", 16, 1024, 10000),
    ]
    counts = [0, 0, 0, 0]
    njobs = 16
    #njobs = 9

    # Profile index of each submitted job, in submission order.
    job_kinds = [2, 3, 2, 1, 0, 1, 1, 1, 0, 1, 1, 2, 0, 1, 1, 1]
    #job_kinds = [2, 3, 2, 1, 0, 1, 1, 1, 0]

    jobs_list = []
    for kind in job_kinds:
        priority = (3 - kind) + randint(1, 2)
        prefix, min_replicas, size_per_pe, timesteps = profiles[kind]
        max_replicas = min(4 * min_replicas, 59)
        problem_size = min_replicas * size_per_pe
        #create_job(prefix, i, priority, problem_size, min_replicas, max_replicas, timesteps)

        # max_replicas is passed for both replica bounds here.
        jobs_list.append(StencilJob("charm-%s-%i" % (prefix, counts[kind]),
                                    max_replicas, max_replicas,
                                    priority, n=problem_size, niters=timesteps))
        counts[kind] += 1

    simulator = Simulation(60, 3 * 60)
    events = simulator.simulate([90 * i for i in range(njobs)], jobs_list)

    # Job names, only needed by the commented-out plot_utilization call below.
    jobs = []
    for prefix in ("small", "medium", "large", "xlarge"):
        jobs += ["charm-" + prefix + "-%i" % i for i in range(16)]
    #plot_utilization(events, jobs, 60)
    #print(events)
    print(get_stats(events, 60))
    #vary_rescale_gap(["elastic", "moldable", "min_replicas", "max_replicas"], 60)
    vary_submission_time(["elastic", "moldable", "min_replicas", "max_replicas"], 60)
0 commit comments