"""
+Module that handles the cluster log:

+* Download from master and slaves
+* Extract app data
+* Extract worker data
"""

import multiprocessing

from boto.manage.cmdshell import sshclient_from_instance

-from config import KEYPAIR_PATH, SPARK_HOME, COREVM, COREHTVM
from util.utils import timing, string_to_datetime


-def download_master(i, output_folder, log_folder):
-    """
+def download_master(i, output_folder, log_folder, config):
+    """Download the log from the master instance.

-    :param i:
-    :param output_folder:
-    :param log_folder:
-    :return:
+    :param i: the master instance
+    :param output_folder: the output folder where the log is saved
+    :param log_folder: the log folder on the master instance
+    :return: the output_folder and the app_id (the application id)
    """
-    ssh_client = sshclient_from_instance(i, KEYPAIR_PATH, user_name='ubuntu')
+    ssh_client = sshclient_from_instance(i, config["Aws"]["KeyPair"], user_name='ubuntu')
    app_id = ""
-    for file in ssh_client.listdir("" + SPARK_HOME + "spark-events/"):
+    for file in ssh_client.listdir("" + config["Spark"]["SparkHome"] + "spark-events/"):
        print("BENCHMARK: " + file)
        print("LOG FOLDER: " + log_folder)
        print("OUTPUT FOLDER: " + output_folder)
@@ -35,11 +38,12 @@ def download_master(i, output_folder, log_folder):
            os.makedirs(output_folder)
        except FileExistsError:
            print("Output folder already exists")
-        inputfile = SPARK_HOME + "spark-events/" + file
-        outputbz = inputfile + ".bz"
+        input_file = config["Spark"]["SparkHome"] + "spark-events/" + file
+        output_bz = input_file + ".bz"
        print("Bzipping event log...")
-        ssh_client.run("pbzip2 -9 -p" + str(COREVM) + " -c " + inputfile + " > " + outputbz)
-        ssh_client.get_file(outputbz, output_folder + "/" + file + ".bz")
+        ssh_client.run("pbzip2 -9 -p" + str(
+            config["Control"]["CoreVM"]) + " -c " + input_file + " > " + output_bz)
+        ssh_client.get_file(output_bz, output_folder + "/" + file + ".bz")
    for file in ssh_client.listdir(log_folder):
        print(file)
        if file != "bench-report.dat":
@@ -48,20 +52,22 @@ def download_master(i, output_folder, log_folder):
    return output_folder, app_id


-def download_slave(i, output_folder, app_id):
-    """
+def download_slave(i, output_folder, app_id, config):
+    """Download the logs from a slave instance:
+    * The worker log, which includes the controller output
+    * The cpu monitoring log

-    :param i:
-    :param output_folder:
-    :param app_id:
-    :return:
+    :param i: the slave instance
+    :param output_folder: the output folder where the logs are saved
+    :param app_id: the application id
+    :return: the output folder
    """
-    ssh_client = sshclient_from_instance(i, KEYPAIR_PATH, user_name='ubuntu')
+    ssh_client = sshclient_from_instance(i, config["Aws"]["KeyPair"], user_name='ubuntu')
    print("Downloading log from slave: " + i.public_dns_name)
    try:
        worker_ip_fixed = i.private_ip_address.replace(".", "-")
        worker_log = "{0}logs/spark-ubuntu-org.apache.spark.deploy.worker.Worker-1-ip-{1}.out".format(
-            SPARK_HOME, worker_ip_fixed)
+            config["Spark"]["SparkHome"], worker_ip_fixed)
        print(worker_log)
        ssh_client.run(
            "screen -ls | grep Detached | cut -d. -f1 | awk '{print $1}' | xargs -r kill")
@@ -73,24 +79,25 @@ def download_slave(i, output_folder, app_id):
    except FileNotFoundError:
        print("worker log not found")
    try:
-        for file in ssh_client.listdir(SPARK_HOME + "work/" + app_id + "/"):
+        for file in ssh_client.listdir(config["Spark"]["SparkHome"] + "work/" + app_id + "/"):
            print("Executor ID: " + file)
-            ssh_client.get_file(SPARK_HOME + "work/" + app_id + "/" + file + "/stderr",
-                                output_folder + "/" + i.public_dns_name + "-" + file + ".stderr")
+            ssh_client.get_file(
+                config["Spark"]["SparkHome"] + "work/" + app_id + "/" + file + "/stderr",
+                output_folder + "/" + i.public_dns_name + "-" + file + ".stderr")
    except FileNotFoundError:
        print("stderr not found")
    return output_folder


@timing
def download(log_folder, instances, master_dns, output_folder):
-    """
+    """Download the logs from the master and the worker nodes.

-    :param log_folder:
-    :param instances:
-    :param master_dns:
-    :param output_folder:
-    :return:
+    :param log_folder: the log folder of the application
+    :param instances: the instances of the cluster
+    :param master_dns: the DNS name of the master instance
+    :param output_folder: the output folder where the logs are saved
+    :return: the output folder
    """
    # MASTER
    print("Downloading log from Master: " + master_dns)
@@ -106,7 +113,6 @@ def download(log_folder, instances, master_dns, output_folder):
    return output_folder


-@timing
def load_app_data(app_log_path):
    """
    Function that parse the application data like stage ids, start, deadline, end,
@@ -181,7 +187,7 @@ def load_app_data(app_log_path):
    return app_info


-def load_worker_data(worker_log, cpu_log):
+def load_worker_data(worker_log, cpu_log, config):
    """
    Load the controller data from the worker_log and combine with the cpu_real data from cpu_log
@@ -245,7 +251,12 @@ def load_worker_data(worker_log, cpu_log):
                    and line[1] != " CPU" and line[0] != "Average:":
                worker_dict["time_cpu"].append(
                    dt.strptime(line[0], '%I:%M:%S %p').replace(year=2016))
-                cpuint = float('{0:.2f}'.format((float(line[2]) * COREHTVM) / 100))
-                worker_dict["cpu_real"].append(cpuint)
+                if config["Aws"]["HyperThreading"]:
+                    cpu_real = float(
+                        '{0:.2f}'.format((float(line[2]) * config["Control"]["CoreVM"] * 2) / 100))
+                else:
+                    cpu_real = float(
+                        '{0:.2f}'.format((float(line[2]) * config["Control"]["CoreVM"]) / 100))
+                worker_dict["cpu_real"].append(cpu_real)
    print(list(worker_dict.keys()))
    return worker_dict
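The config argument threaded through download_master, download_slave and load_worker_data replaces the old module-level constants KEYPAIR_PATH, SPARK_HOME, COREVM and COREHTVM. A minimal sketch of the nested dict these functions now appear to expect, with key names taken from the diff above and purely illustrative placeholder values:

# Illustrative sketch only: key names come from the diff above, values are placeholders.
config = {
    "Aws": {
        "KeyPair": "/path/to/keypair.pem",  # was KEYPAIR_PATH
        "HyperThreading": True,             # COREHTVM is now derived from CoreVM
    },
    "Spark": {
        "SparkHome": "/usr/local/spark/",   # was SPARK_HOME; a trailing slash is assumed
    },
    "Control": {
        "CoreVM": 4,                        # was COREVM
    },
}

# Hypothetical call sites showing the extra argument:
# out, app_id = download_master(instance, output_folder, log_folder, config)
# download_slave(instance, out, app_id, config)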
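The new HyperThreading branch in load_worker_data simply doubles the core count when converting the CPU percentage parsed from the cpu log into cores in use. A quick check with hypothetical numbers (an 80% reading on a 4-core VM):

# Hypothetical values: line[2] = "80.0", config["Control"]["CoreVM"] = 4.
pct, core_vm = 80.0, 4
print((pct * core_vm * 2) / 100)  # HyperThreading on  -> 6.4
print((pct * core_vm) / 100)      # HyperThreading off -> 3.2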