
Commit 9d67a03

fixes

1 parent b2ec301

8 files changed, 27 insertions(+), 41 deletions(-)

config.py (+10 -9)

@@ -5,20 +5,20 @@
 # AWS
 DATA_AMI = {"eu-west-1": {"ami": 'ami-d3225da0',
                           "az": 'eu-west-1c',
-                          "keypair": "gazzettaEU",
+                          "keypair": "giovanni2",
                           "price": "0.3"},
             "us-west-2": {"ami": 'ami-7f5ff81f',
                           "snapid": "snap-4f38bf1c",
                           "az": 'us-west-2c',
-                          "keypair": "gazzetta",
-                          "price": "0.25"}}
+                          "keypair": "giovanni2",
+                          "price": "0.3"}}
 """AMI id for region and availability zone"""

 CREDENTIAL_PROFILE = 'cspark'
 """Credential profile name of AWS"""
 REGION = "us-west-2"
 """Region of AWS to use"""
-KEY_PAIR_PATH = "/home/meteos/" + DATA_AMI[REGION]["keypair"] + ".pem"
+KEY_PAIR_PATH = "/Users/Giovanni/Desktop/" + DATA_AMI[REGION]["keypair"] + ".pem"
 """KeyPair path for the instance"""
 SECURITY_GROUP = "spark-cluster"
 """Secutiry group of the instance"""
@@ -35,7 +35,7 @@
 NUM_RUN = 1
 """Number of run to repeat the benchmark"""

-CLUSTER_ID = "1"
+CLUSTER_ID = "CSPARK"
 """Id of the cluster with the launched instances"""
 print("Cluster ID : " + str(CLUSTER_ID))
 TAG = [{
@@ -44,7 +44,7 @@
 }]

 # HDFS
-HDFS_MASTER = ""
+HDFS_MASTER = "ec2-35-161-111-116.us-west-2.compute.amazonaws.com"
 """Url of the HDFS NameNode if not set the cluster created is an HDFS Cluster"""
 # Spark config
 SPARK_2_HOME = "/opt/spark/"
@@ -81,7 +81,7 @@

 # CONTROL
 ALPHA = 0.95
-DEADLINE = 239474
+DEADLINE = 284375
 # SVM
 # 0% 217500
 # 20% 261000
@@ -94,7 +94,7 @@
 # 0% 209062
 # 20% 250874
 # 40% 284375
-MAX_EXECUTOR = 8
+MAX_EXECUTOR = 9
 OVER_SCALE = 2
 K = 50
 TI = 12000
@@ -159,7 +159,8 @@
     },
     "PageRank": {
         "NUM_OF_PARTITIONS": (3, 1000),
-        "numV": (2, 7000000),
+        "numV": (2, 2000000),
+        "mu": (4, 5.0),
         "MAX_ITERATION": (8, 1)
     },
     "KMeans": {

util/download_log.py → download_log.py (renamed, +2 -2)

@@ -12,10 +12,10 @@
 ])

 logfolder = "./spark-bench/num"
-master_dns = "ec2-35-161-226-18.us-west-2.compute.amazonaws.com"
+master_dns = "ec2-35-165-203-239.us-west-2.compute.amazonaws.com"
 # master_dns = "ec2-54-70-77-95.us-west-2.compute.amazonaws.com"
 output_folder = "./spark-bench/num/"
-output_folder = log.download(logfolder, instances, master_dns, output_folder)
+output_folder = log.download(logfolder, instances, master_dns, output_folder, CONFIG_DICT)

 if output_folder[-1] != "/":
     output_folder += "/"

log.py (+5 -1)

@@ -108,7 +108,7 @@ def download(log_folder, instances, master_dns, output_folder, config):
     with ThreadPoolExecutor(multiprocessing.cpu_count()) as executor:
         for i in instances:
             if i.public_dns_name != master_dns:
-                worker = executor.submit(download_slave, i, output_folder, app_id)
+                worker = executor.submit(download_slave, i, output_folder, app_id, config)
                 output_folder = worker.result()
     return output_folder

@@ -260,5 +260,9 @@ def load_worker_data(worker_log, cpu_log, config):
                 cpu_real = float(
                     '{0:.2f}'.format((float(line[2]) * config["Control"]["CoreVM"]) / 100))
                 worker_dict["cpu_real"].append(cpu_real)
+    for app_id in list(worker_dict):
+        print(app_id)
+        if not len(worker_dict[app_id]) > 0:
+            del worker_dict[app_id]
     print(list(worker_dict.keys()))
     return worker_dict
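
The loop added to load_worker_data prunes applications that ended up with no data. A minimal standalone sketch of the same pattern, with hypothetical worker_dict contents: list(worker_dict) snapshots the keys, so entries can be deleted while looping.

    # Hypothetical data; only the pruning pattern is taken from the diff.
    worker_dict = {"app-001": [0.42, 0.57], "app-002": []}

    for app_id in list(worker_dict):      # snapshot of keys -> safe to delete below
        print(app_id)
        if not len(worker_dict[app_id]) > 0:
            del worker_dict[app_id]       # drop apps with no recorded samples

    print(list(worker_dict.keys()))       # ['app-001']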

main.py (+3 -3)

@@ -43,9 +43,9 @@ def main():

     if REBOOT:
         print("Rebooting instances...")
-        instances = client.instances.filter(
-            Filters=[{'Name': 'instance-state-name', 'Values': ['running']},
-                     {'Name': 'tag:ClusterId', 'Values': [CLUSTER_ID]}])
+        session = boto3.Session(profile_name=CREDENTIAL_PROFILE)
+        ec2 = session.resource('ec2', region_name=REGION)
+        instances = ec2.instances.filter(Filters=[{'Name': 'instance-state-name', 'Values': ['running']}, {'Name': 'tag:ClusterId', 'Values': [CLUSTER_ID]}])
         instance_ids = [x.id for x in instances]
         client.reboot_instances(InstanceIds=instance_ids)
         launch.wait_ping(client, instance_ids, copy.deepcopy(instance_ids))
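
The old code called instances.filter on the low-level EC2 client, which exposes no instances collection; that attribute belongs to the boto3 resource API, so the fix builds a resource from a session first. A hedged standalone sketch of the client-versus-resource split, using the profile, region, and cluster id set in config.py:

    import boto3

    session = boto3.Session(profile_name='cspark')
    ec2 = session.resource('ec2', region_name='us-west-2')    # resource API: collections
    client = session.client('ec2', region_name='us-west-2')   # client API: raw actions

    running = ec2.instances.filter(
        Filters=[{'Name': 'instance-state-name', 'Values': ['running']},
                 {'Name': 'tag:ClusterId', 'Values': ['CSPARK']}])
    instance_ids = [i.id for i in running]
    if instance_ids:                       # skip the call when nothing is running
        client.reboot_instances(InstanceIds=instance_ids)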

plot.py (+3 -2)

@@ -33,7 +33,7 @@
 LABEL_SIZE = 20
 TQ_MICRO = 20
 TQ_KMEANS = 9
-PDF = 1
+PDF = 0

 PLOT_PARAMETERS = {
     'axes.labelsize': LABEL_SIZE,  # fontsize for x and y labels (was 10)
@@ -256,9 +256,10 @@ def plot_worker(app_id, app_info, worker_log, worker_dict, config, first_ts_work
     folder_split = worker_log.split("/")
     name = folder_split[-3].lower() + "-worker-" + folder_split[-2].replace("%", "") + "-" + \
            folder_split[-1].split("-")[-1].replace(".out", "")
-    folder = "/".join(worker_log.split("\\")[:-1])
+    folder = "/".join(worker_log.split("/")[:-1])
     labels = ax1.get_xticklabels()
     plt.setp(labels, rotation=45)
+    print(folder)
     if PDF:
         plt.savefig(folder + "/" + name + ".pdf", bbox_inches='tight', dpi=300)
     else:
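
The folder change in plot_worker is a path-separator bug: on Linux the log path contains no backslashes, so splitting on "\\" yields a single-element list, [:-1] drops it, and folder came out empty. A quick illustration with a hypothetical log path:

    worker_log = "spark-bench/PageRank/0/worker-1.out"   # hypothetical path

    # Old behaviour: no '\\' in the path -> one-element list -> empty folder.
    print("/".join(worker_log.split("\\")[:-1]))   # ''
    # Fixed behaviour: splitting on '/' keeps the real parent directory.
    print("/".join(worker_log.split("/")[:-1]))    # 'spark-bench/PageRank/0'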

requirements.txt (+1 -1)

@@ -1,4 +1,4 @@
-boto3>=1.4.1
+boto3==1.4.1
 matplotlib>=2.0.0b4
 numpy>=1.11.2
 pandas>=0.19.1

run.py (+3 -7)

@@ -91,9 +91,7 @@ def setup_slave(instance, master_dns):
     if UPDATE_SPARK:
         print(" Updating Spark...")
         ssh_client.run(
-            """cd /usr/local/spark && git pull && build/mvn -T 1C -Phive -Pnetlib-lgpl -Pyarn
-            -Phadoop-2.7 -Dhadoop.version=2.7.2 -Dscala-2.11 -DskipTests
-            -Dmaven.test.skip=true package""")
+            """cd /usr/local/spark && git pull && build/mvn clean && build/mvn -T 1C -Phive -Pnetlib-lgpl -Pyarn -Phadoop-2.7 -Dhadoop.version=2.7.2 -Dscala-2.11 -DskipTests -Dmaven.test.skip=true package""")

     # CLEAN UP EXECUTORS APP LOGS
     ssh_client.run("rm -r " + SPARK_HOME + "work/*")
@@ -166,9 +164,7 @@ def setup_master(instance):
     if UPDATE_SPARK_MASTER:
         print(" Updating Spark...")
         ssh_client.run(
-            """cd /usr/local/spark && git pull && build/mvn -T 1C -Phive -Pnetlib-lgpl -Pyarn
-            -Phadoop-2.7 -Dhadoop.version=2.7.2 -Dscala-2.11 -DskipTests
-            -Dmaven.test.skip=true package""")
+            """cd /usr/local/spark && git pull && build/mvn clean && build/mvn -T 1C -Phive -Pnetlib-lgpl -Pyarn -Phadoop-2.7 -Dhadoop.version=2.7.2 -Dscala-2.11 -DskipTests -Dmaven.test.skip=true package""")

     print(" Remove Logs")
     ssh_client.run("rm " + SPARK_HOME + "spark-events/*")

@@ -498,7 +494,7 @@ def run_benchmark():
     status = ssh_client.run('[ ! -e %s ]; echo $?' % (DATA_AMI[REGION]["keypair"] + ".pem"))
     if not int(status[1].decode('utf8').replace("\n", "")):
         ssh_client.put_file(KEY_PAIR_PATH, "/home/ubuntu/" + DATA_AMI[REGION]["keypair"] + ".pem")
-
+        ssh_client.run("chmod 400 "+ "/home/ubuntu/" + DATA_AMI[REGION]["keypair"] + ".pem")
     # LAUNCH BENCHMARK
     if HDFS == 0:
         if len(BENCHMARK_PERF) > 0:
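
The added chmod matters because ssh refuses a private key that other users can read, so the .pem copied to the master must be owner-read-only before the benchmark can connect through it. Done locally in Python rather than through the remote shell, the tightening would look like:

    import os
    import stat

    # Hypothetical local path; run.py performs the same step remotely
    # via "chmod 400" right after uploading the key.
    pem = "/home/ubuntu/giovanni2.pem"
    os.chmod(pem, stat.S_IRUSR)  # 0o400: owner read-only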

util/plot_stages.py (-16)

This file was deleted.
