ShifuML · wuhaifengdhu · Jun 23, 2020
diff --git a/shifu-tensorflow-eval/pom.xml b/shifu-tensorflow-eval/pom.xml
@@ -108,6 +108,11 @@
 									<classifier>hdp-yarn</classifier>
 									<outputDirectory>${project.build.directory}</outputDirectory>
 								</artifactItem>
+								<dependency>
+									<groupId>net.lingala.zip4j</groupId>
+									<artifactId>zip4j</artifactId>
+									<version>1.3.2</version>
+								</dependency>
 								<artifactItem>
 									<groupId>org.tensorflow</groupId>
 									<artifactId>tensorflow</artifactId>

diff --git a/shifu-tensorflow-on-yarn/src/main/resources/backup.py b/shifu-tensorflow-on-yarn/src/main/resources/backup.py
@@ -2,32 +2,21 @@
 Author: Tommy Mulc
 """
 
+import json
+import logging
 # from __future__ import print_function
 import os
-import tensorflow as tf
-import argparse
 import time
-import sys
-import logging
-import gzip
-from StringIO import StringIO
-import random
-import numpy as np
-from tensorflow.python.platform import gfile
-from tensorflow.python.framework import ops
-from tensorflow.python.saved_model import builder
-from tensorflow.python.saved_model import signature_constants
-from tensorflow.python.saved_model import signature_def_utils
-from tensorflow.python.saved_model import tag_constants
-import json
+
+import tensorflow as tf
 
 REPLICAS_TO_AGGREGATE_RATIO = 1
 FEATURE_COUNT = 30
 HIDDEN_NODES_COUNT = 20
 VALID_TRAINING_DATA_RATIO = 0.3
 DELIMITER = '|'
 BATCH_SIZE = 10
-EPOCH = 10 # TODO: should consider recovery from checkpoint, we need to reduce current global step
+EPOCH = 10  # TODO: should consider recovery from checkpoint, we need to reduce current global step
 WORKING_DIR = "hdfs://horton/user/webai/.yarn/"
 
 # read from env
@@ -49,23 +38,22 @@ def main(_):
 
     # allows this node know about all other nodes
     if job_name == 'ps':  # checks if parameter server
-        server = tf.train.Server(cluster,
-                                 job_name="ps",
-                                 task_index=task_index)
+        server = tf.compat.v1.train.Server(cluster,
+                                           job_name="ps",
+                                           task_index=task_index)
         server.join()
     else:  # it must be a worker server
         logging.info("Loading data from worker index = %d" % task_index)
 
-        server = tf.train.Server(cluster,
-                                 job_name="worker",
-                                 task_index=task_index)
+        server = tf.compat.v1.train.Server(cluster,
+                                           job_name="worker",
+                                           task_index=task_index)
 
         logging.info("backup worker join!!")
 
         while True:
             time.sleep(1000000)
 
 
-
 if __name__ == '__main__':
-    tf.app.run()
+    tf.compat.v1.app.run()