"""
+Module that handles the cluster log:

+* Download from master and slaves
+* Extract app data
+* Extract worker data
"""

import multiprocessing

from boto.manage.cmdshell import sshclient_from_instance

-from config import KEYPAIR_PATH, SPARK_HOME, COREVM, COREHTVM
from util.utils import timing, string_to_datetime


-def download_master(i, output_folder, log_folder):
-    """
+def download_master(i, output_folder, log_folder, config):
+    """Download the log from the master instance.

-    :param i:
-    :param output_folder:
-    :param log_folder:
-    :return:
+    :param i: the master instance
+    :param output_folder: the output folder where the log is saved
+    :param log_folder: the log folder on the master instance
+    :return: the output_folder and the app_id (the application id)
    """
-    ssh_client = sshclient_from_instance(i, KEYPAIR_PATH, user_name='ubuntu')
+    ssh_client = sshclient_from_instance(i, config["Aws"]["KeyPair"], user_name='ubuntu')
    app_id = ""
-    for file in ssh_client.listdir("" + SPARK_HOME + "spark-events/"):
+    for file in ssh_client.listdir("" + config["Spark"]["SparkHome"] + "spark-events/"):
        print("BENCHMARK: " + file)
        print("LOG FOLDER: " + log_folder)
        print("OUTPUT FOLDER: " + output_folder)
@@ -35,11 +38,12 @@ def download_master(i, output_folder, log_folder):
            os.makedirs(output_folder)
        except FileExistsError:
            print("Output folder already exists")
-        inputfile = SPARK_HOME + "spark-events/" + file
-        outputbz = inputfile + ".bz"
+        input_file = config["Spark"]["SparkHome"] + "spark-events/" + file
+        output_bz = input_file + ".bz"
        print("Bzipping event log...")
-        ssh_client.run("pbzip2 -9 -p" + str(COREVM) + " -c " + inputfile + " > " + outputbz)
-        ssh_client.get_file(outputbz, output_folder + "/" + file + ".bz")
+        ssh_client.run("pbzip2 -9 -p" + str(
+            config["Control"]["CoreVM"]) + " -c " + input_file + " > " + output_bz)
+        ssh_client.get_file(output_bz, output_folder + "/" + file + ".bz")
    for file in ssh_client.listdir(log_folder):
        print(file)
        if file != "bench-report.dat":
@@ -48,20 +52,22 @@ def download_master(i, output_folder, log_folder):
    return output_folder, app_id


-def download_slave(i, output_folder, app_id):
-    """
+def download_slave(i, output_folder, app_id, config):
+    """Download the logs from a slave instance:
+    * The worker log, which includes the controller output
+    * The cpu monitoring log

-    :param i:
-    :param output_folder:
-    :param app_id:
-    :return:
+    :param i: the slave instance
+    :param output_folder: the output folder where the logs are saved
+    :param app_id: the application id
+    :return: the output folder
    """
-    ssh_client = sshclient_from_instance(i, KEYPAIR_PATH, user_name='ubuntu')
+    ssh_client = sshclient_from_instance(i, config["Aws"]["KeyPair"], user_name='ubuntu')
    print("Downloading log from slave: " + i.public_dns_name)
    try:
        worker_ip_fixed = i.private_ip_address.replace(".", "-")
        worker_log = "{0}logs/spark-ubuntu-org.apache.spark.deploy.worker.Worker-1-ip-{1}.out".format(
-            SPARK_HOME, worker_ip_fixed)
+            config["Spark"]["SparkHome"], worker_ip_fixed)
        print(worker_log)
        ssh_client.run(
            "screen -ls | grep Detached | cut -d. -f1 | awk '{print $1}' | xargs -r kill")
@@ -73,24 +79,25 @@ def download_slave(i, output_folder, app_id):
    except FileNotFoundError:
        print("worker log not found")
    try:
-        for file in ssh_client.listdir(SPARK_HOME + "work/" + app_id + "/"):
+        for file in ssh_client.listdir(config["Spark"]["SparkHome"] + "work/" + app_id + "/"):
            print("Executor ID: " + file)
-            ssh_client.get_file(SPARK_HOME + "work/" + app_id + "/" + file + "/stderr",
-                                output_folder + "/" + i.public_dns_name + "-" + file + ".stderr")
+            ssh_client.get_file(
+                config["Spark"]["SparkHome"] + "work/" + app_id + "/" + file + "/stderr",
+                output_folder + "/" + i.public_dns_name + "-" + file + ".stderr")
    except FileNotFoundError:
        print("stderr not found")
    return output_folder


@timing
def download(log_folder, instances, master_dns, output_folder):
-    """
+    """Download the logs from the master and the worker nodes.

-    :param log_folder:
-    :param instances:
-    :param master_dns:
-    :param output_folder:
-    :return:
+    :param log_folder: the log folder of the application
+    :param instances: the instances of the cluster
+    :param master_dns: the DNS name of the master instance
+    :param output_folder: the output folder where the logs are saved
+    :return: the output folder
    """
    # MASTER
    print("Downloading log from Master: " + master_dns)
@@ -106,7 +113,6 @@ def download(log_folder, instances, master_dns, output_folder):
    return output_folder


-@timing
def load_app_data(app_log_path):
    """
    Function that parse the application data like stage ids, start, deadline, end,
@@ -181,7 +187,7 @@ def load_app_data(app_log_path):
    return app_info


-def load_worker_data(worker_log, cpu_log):
+def load_worker_data(worker_log, cpu_log, config):
    """
    Load the controller data from the worker_log and combine with the cpu_real data from cpu_log
@@ -245,7 +251,12 @@ def load_worker_data(worker_log, cpu_log):
                    and line[1] != " CPU" and line[0] != "Average:":
                worker_dict["time_cpu"].append(
                    dt.strptime(line[0], '%I:%M:%S %p').replace(year=2016))
-                cpuint = float('{0:.2f}'.format((float(line[2]) * COREHTVM) / 100))
-                worker_dict["cpu_real"].append(cpuint)
+                if config["Aws"]["HyperThreading"]:
+                    cpu_real = float(
+                        '{0:.2f}'.format((float(line[2]) * config["Control"]["CoreVM"] * 2) / 100))
+                else:
+                    cpu_real = float(
+                        '{0:.2f}'.format((float(line[2]) * config["Control"]["CoreVM"]) / 100))
+                worker_dict["cpu_real"].append(cpu_real)
    print(list(worker_dict.keys()))
    return worker_dict
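The config argument threaded through download_master, download_slave and load_worker_data replaces the old module-level constants KEYPAIR_PATH, SPARK_HOME, COREVM and COREHTVM. A minimal sketch of the nested dict these functions now appear to expect, with key names taken from the diff above and purely illustrative placeholder values:

# Illustrative sketch only: key names come from the diff above, values are placeholders.
config = {
    "Aws": {
        "KeyPair": "/path/to/keypair.pem",  # was KEYPAIR_PATH
        "HyperThreading": True,             # COREHTVM is now derived from CoreVM
    },
    "Spark": {
        "SparkHome": "/usr/local/spark/",   # was SPARK_HOME; a trailing slash is assumed
    },
    "Control": {
        "CoreVM": 4,                        # was COREVM
    },
}

# Hypothetical call sites showing the extra argument:
# out, app_id = download_master(instance, output_folder, log_folder, config)
# download_slave(instance, out, app_id, config)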
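The new HyperThreading branch in load_worker_data simply doubles the core count when converting the CPU percentage parsed from the cpu log into cores in use. A quick check with hypothetical numbers (an 80% reading on a 4-core VM):

# Hypothetical values: line[2] = "80.0", config["Control"]["CoreVM"] = 4.
pct, core_vm = 80.0, 4
print((pct * core_vm * 2) / 100)  # HyperThreading on  -> 6.4
print((pct * core_vm) / 100)      # HyperThreading off -> 3.2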