Refactoring, clean-up and fixes

ElfoLiNk · ElfoLiNk · commit edab267aaa1a · 2016-11-26T11:36:22.000+01:00
diff --git a/config.py b/config.py
@@ -1,23 +1,21 @@
 """
-
+Configuration module for the cSpark test benchmark
 """
 
-# import pprint
-
 # AWS
 DATA_AMI = {"eu-west-1": {"ami": 'ami-d3225da0', "az": 'eu-west-1c', "keypair": "gazzettaEU",
                           "price": "0.3"},
             "us-west-2": {"ami": 'ami-7f5ff81f', "snapid": "snap-4f38bf1c", "az": 'us-west-2c',
                           "keypair": "gazzetta",
                           "price": "0.4"}}
 
-CREDENTIAL_PROFILE = 'matteo'
+CREDENTIAL_PROFILE = 'default'
 REGION = "us-west-2"
-KEYPAIR_PATH = "C:\\Users\\Matteo\\Downloads\\" + DATA_AMI[REGION]["keypair"] + ".pem"
+KEY_PAIR_PATH = "C:\\Users\\Matteo\\Downloads\\" + DATA_AMI[REGION]["keypair"] + ".pem"
 SECURITY_GROUP = "spark-cluster"
 PRICE = DATA_AMI[REGION]["price"]
 INSTANCE_TYPE = "r3.4xlarge"
-NUMINSTANCE = 9
+NUM_INSTANCE = 0
 EBS_OPTIMIZED = True if "r3" not in INSTANCE_TYPE else False
 REBOOT = 0
 KILL_JAVA = 1
@@ -31,7 +29,7 @@
 }]
 
 # HDFS
-HDFS_MASTER = "ec2-35-160-124-233.us-west-2.compute.amazonaws.com"
+HDFS_MASTER = ""
 
 # Spark config
 SPARK_2 = "/opt/spark/"
@@ -56,11 +54,11 @@
 OFF_HEAP_BYTES = 30720000000
 
 # Core Config
-COREVM = 8
-COREHTVM = 16
-DISABLEHT = 1
-if DISABLEHT:
-    COREHTVM = COREVM
+CORE_VM = 8
+CORE_HT_VM = 16
+DISABLE_HT = 1
+if DISABLE_HT:
+    CORE_HT_VM = CORE_VM
 
 # CONTROL
 ALPHA = 0.95
@@ -77,13 +75,13 @@
 # 0%  209062
 # 20% 250874
 # 40% 284375
-MAXEXECUTOR = 8
-OVERSCALE = 2
+MAX_EXECUTOR = 8
+OVER_SCALE = 2
 K = 50
 TI = 12000
-TSAMPLE = 1000
-COREQUANTUM = 0.05
-COREMIN = 0.0
+T_SAMPLE = 1000
+CORE_QUANTUM = 0.05
+CORE_MIN = 0.0
 CPU_PERIOD = 100000
 
 # BENCHMARK
@@ -200,17 +198,17 @@
     "Deadline": DEADLINE,
     "Control": {
         "Alpha": ALPHA,
-        "OverScale": OVERSCALE,
-        "MaxExecutor": MAXEXECUTOR,
-        "CoreVM": COREVM,
+        "OverScale": OVER_SCALE,
+        "MaxExecutor": MAX_EXECUTOR,
+        "CoreVM": CORE_VM,
         "K": K,
         "Ti": TI,
-        "TSample": TSAMPLE,
-        "CoreQuantum": COREQUANTUM
+        "TSample": T_SAMPLE,
+        "CoreQuantum": CORE_QUANTUM
     },
     "Aws": {
         "InstanceType": INSTANCE_TYPE,
-        "HyperThreading": not DISABLEHT,
+        "HyperThreading": not DISABLE_HT,
         "Price": PRICE,
         "AMI": DATA_AMI[REGION]["ami"],
         "Region": REGION,
@@ -221,7 +219,7 @@
         "SnapshotId": DATA_AMI[REGION]["snapid"]
     },
     "Spark": {
-        "ExecutorCore": COREVM,
+        "ExecutorCore": CORE_VM,
         "ExecutorMemory": RAM_EXEC,
         "ExternalShuffle": ENABLE_EXTERNAL_SHUFFLE,
         "LocalityWait": LOCALITY_WAIT,
@@ -242,5 +240,3 @@
                "scala-sort-by-key-int": ["240", "241"],
                "scala-count": ["243", "244"],
                "scala-count-w-fltr": ["246", "247"]}
-
-# pprint.pprint(CONFIG_DICT)
diff --git a/launch.py b/launch.py
@@ -1,5 +1,8 @@
 """
-Launch the instance with spot request
+Handle the EC2 instances:
+* Launch new instances with a spot request
+* Terminate instances
+* Check instance connectivity
 """
 
 import socket
@@ -163,10 +166,8 @@ def check_spot_price(client, config):
 
     spot_price_history_response = client.describe_spot_price_history(
         InstanceTypes=[config["Aws"]["InstanceType"]],
-        ProductDescriptions=[
-                                                                         'Linux/UNIX'],
-        AvailabilityZone=
-        config["Aws"]["AZ"])
+        ProductDescriptions=['Linux/UNIX'],
+        AvailabilityZone=config["Aws"]["AZ"])
     print(spot_price_history_response['SpotPriceHistory'][0])
     last_spot_price = [float(x['SpotPrice']) for x in
                        spot_price_history_response['SpotPriceHistory'][:10]]
diff --git a/log.py b/log.py
@@ -90,7 +90,7 @@ def download_slave(i, output_folder, app_id, config):
 
 
 @timing
-def download(log_folder, instances, master_dns, output_folder):
+def download(log_folder, instances, master_dns, output_folder, config):
     """ Download the logs from the master and the worker nodes
 
     :param log_folder: the log folder of the application
@@ -102,7 +102,7 @@ def download(log_folder, instances, master_dns, output_folder):
     # MASTER
     print("Downloading log from Master: " + master_dns)
     master_instance = [i for i in instances if i.public_dns_name == master_dns][0]
-    output_folder, app_id = download_master(master_instance, output_folder, log_folder)
+    output_folder, app_id = download_master(master_instance, output_folder, log_folder, config)
 
     # SLAVE
     with ThreadPoolExecutor(multiprocessing.cpu_count()) as executor:
@@ -193,6 +193,7 @@ def load_worker_data(worker_log, cpu_log, config):
 
     :param worker_log: the path of the log of the worker
     :param cpu_log:  the path of the cpu monitoring tool log of the worker
+    :param config: the configuration dictionary
     :return: worker_dict the dictionary of the worker's  data
     """
     print(worker_log)
diff --git a/main.py b/main.py
@@ -5,21 +5,22 @@
 
 import launch
 import run
-from config import NUMINSTANCE, REGION, TAG, REBOOT, CLUSTER_ID, TERMINATE, RUN, NUM_RUN, \
+from config import NUM_INSTANCE, REGION, TAG, REBOOT, CLUSTER_ID, TERMINATE, RUN, NUM_RUN, \
     CREDENTIAL_PROFILE, CONFIG_DICT
 
 
 def main():
     """ Main function;
-    - Launch spot request of NUMINSTANCE
-    - Run Benchmark
-    :return:
+    * Launch spot request of NUM_INSTANCE instances
+    * Run Benchmark
+    * Download Log
+    * Plot data from log
     """
     session = boto3.Session(profile_name=CREDENTIAL_PROFILE)
     client = session.client('ec2', region_name=REGION)
 
-    if NUMINSTANCE > 0:
-        spot_request_ids = launch.launch(client, NUMINSTANCE, CONFIG_DICT)
+    if NUM_INSTANCE > 0:
+        spot_request_ids = launch.launch(client, NUM_INSTANCE, CONFIG_DICT)
 
         print("CHECK SECURITY GROUP ALLOWED IP SETTINGS!!!")
 
diff --git a/metrics.py b/metrics.py
@@ -65,8 +65,7 @@ def compute_cpu_time(app_id, app_info, workers_dict, config, folder):
                         index = min(range(len(time_cpu)), key=lambda i: abs(time_cpu[i] - time))
                         # print(index)
                     cpu_time_max += (config["Control"]["Tsample"] / 1000) * max(cpu, worker_dict[
-                        "cpu_real"][
-                        index + int(config["Control"]["Tsample"] / 1000)])
+                        "cpu_real"][index + int(config["Control"]["Tsample"] / 1000)])
         except KeyError:
             print(app_id + " not found")
     duration_s = app_info[app_id][max(list(app_info[app_id].keys()))]["end"].timestamp() - \
@@ -91,8 +90,8 @@ def compute_cpu_time(app_id, app_info, workers_dict, config, folder):
     throughput = float(num_task) / duration_s
     if cpu_time == 0:
         cpu_time = ((app_info[app_id][max(list(app_info[app_id].keys()))]["end"].timestamp() -
-                     app_info[app_id][PLOT_SID_STAGE]["start"].timestamp())) * config["Control"][
-                       "MaxExecutor"] * config["Control"]["CoreVM"]
+                     app_info[app_id][PLOT_SID_STAGE]["start"].timestamp())) * \
+                   config["Control"]["MaxExecutor"] * config["Control"]["CoreVM"]
         cpu_time_max = cpu_time
     cpu_time_max = math.floor(cpu_time_max)
     print("CPU_TIME: " + str(cpu_time))
@@ -210,7 +209,7 @@ def compute_metrics(folder):
     app_logs = glob.glob(folder + "*.err") + glob.glob(folder + "*.dat")
     app_info = {}
     for app_log in sorted(app_logs):
-        app_info = load_app_data(app_log, config)
+        app_info = load_app_data(app_log)
 
         for app_id in app_info:
             compute_errors(app_id, app_info[app_id], folder, config)
diff --git a/plot.py b/plot.py
@@ -630,7 +630,7 @@ def plot(folder):
     app_logs = glob.glob(folder + "*.err") + glob.glob(folder + "*.dat")
     app_info = {}
     for app_log in sorted(app_logs):
-        app_info = load_app_data(app_log, config)
+        app_info = load_app_data(app_log)
 
         for app_id in app_info:
             plot_app_overview(app_id, app_info[app_id], folder, config)
diff --git a/run.py b/run.py
diff --git a/util/extract_execution_time.py b/util/extract_execution_time.py