Update tf-cnn example (#210)

- now compatible with tensorflow 1.13
- no more kubebench environment variables in code
- update dockerfiles and fix build issues
kubeflow · Aug 7, 2019 · bc682c1 · bc682c1
1 parent 21c1fef
commit bc682c1
Show file tree

Hide file tree

Showing 4 changed files with 29 additions and 26 deletions.
diff --git a/build/images/examples/tf-cnn/runner-cpu.Dockerfile b/build/images/examples/tf-cnn/runner-cpu.Dockerfile
@@ -10,18 +10,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-FROM tensorflow/tensorflow:1.8.0
+FROM tensorflow/tensorflow:1.13.2
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
     ca-certificates \
     build-essential \
     git
 
 RUN mkdir -p /opt
-RUN mkdir -p /kubebench/experiments
 
-RUN git clone -n https://github.com/tensorflow/benchmarks.git /opt/tf-benchmarks
-RUN cd /opt/tf-benchmarks; git checkout 3b90c14fb2bf02ca5d27c188aee878663229a0a7
+RUN git clone --branch=cnn_tf_v1.13_compatible --depth=1 \
+    https://github.com/tensorflow/benchmarks.git /opt/tf-benchmarks
 
 COPY examples/src/tf-cnn/runner.py /opt
 RUN chmod u+x /opt/*

diff --git a/build/images/examples/tf-cnn/runner-gpu.Dockerfile b/build/images/examples/tf-cnn/runner-gpu.Dockerfile
@@ -10,20 +10,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-FROM tensorflow/tensorflow:1.8.0-gpu
+FROM tensorflow/tensorflow:1.13.2-gpu
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
     ca-certificates \
     build-essential \
     git
 
-RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
-ENV LD_LIBRARY_PATH /usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH}
 RUN mkdir -p /opt
-RUN mkdir -p /kubebench/experiments
 
-RUN git clone -n https://github.com/tensorflow/benchmarks.git /opt/tf-benchmarks
-RUN cd /opt/tf-benchmarks; git checkout 3b90c14fb2bf02ca5d27c188aee878663229a0a7
+RUN git clone --branch=cnn_tf_v1.13_compatible --depth=1 \
+    https://github.com/tensorflow/benchmarks.git /opt/tf-benchmarks
 
 COPY examples/src/tf-cnn/runner.py /opt
 RUN chmod u+x /opt/*

diff --git a/examples/src/tf-cnn/post_processor.py b/examples/src/tf-cnn/post_processor.py
@@ -12,6 +12,7 @@
 
 import json
 import os
+import sys
 
 
 def run():
@@ -30,8 +31,11 @@ def run():
     'Optimizer',
     'Variables',
     'Sync']
-  output_dir = os.environ.get("KUBEBENCH_EXP_OUTPUT_PATH")
-  result_dir = os.environ.get("KUBEBENCH_EXP_RESULT_PATH")
+
+  args = sys.argv[1:]
+  output_dir = args[0]
+  result_dir = args[1]
+
   result_file = os.path.join(result_dir, "result.json")
   if not os.path.exists(result_dir):
     os.makedirs(result_dir)

diff --git a/examples/src/tf-cnn/runner.py b/examples/src/tf-cnn/runner.py
@@ -10,10 +10,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import argparse
 import logging
 import json
 import os
-import shutil
 import subprocess
 import sys
 
@@ -45,26 +45,27 @@ def run_and_stream(cmd):
     raise ValueError("cmd: {0} exited with code {1}".format(
       " ".join(cmd), process.returncode))
 
-if __name__ == "__main__":
+
+def main():
+  parser = argparse.ArgumentParser()
+  parser.add_argument("--output_dir", type=str)
+  # split args for runner process and benchmark subprocess
+  runner_args, benchmark_args = parser.parse_known_args(sys.argv[1:])
+
   tf_config = os.environ.get("TF_CONFIG", '{}')
   tf_config_json = json.loads(tf_config)
   cluster = tf_config_json.get("cluster", {})
   job_name = tf_config_json.get("task", {}).get("type", "")
   task_index = tf_config_json.get("task", {}).get("index", "")
 
-  kubebench_exp_output_dir = os.environ.get("KUBEBENCH_EXP_OUTPUT_PATH")
-  if not os.path.exists(kubebench_exp_output_dir):
-    os.makedirs(kubebench_exp_output_dir)
-
-  log_dir = "/tmp/logs"
-  if not os.path.exists(log_dir):
-    os.makedirs(log_dir)
+  output_dir = runner_args.output_dir
+  if not os.path.exists(output_dir):
+    os.makedirs(output_dir)
   jn = job_name if job_name != "" else "worker"
   ti = str(task_index) if task_index != "" else "0"
-  log_file = os.path.join(log_dir, jn + ti + ".log")
+  output_file = os.path.join(output_dir, jn + ti + ".log")
 
-  args = sys.argv[1:]
-  command = ["python", "tf_cnn_benchmarks.py"] + args
+  command = ["python", "tf_cnn_benchmarks.py"] + benchmark_args
   ps_hosts = ",".join(cluster.get("ps", []))
   worker_hosts = ",".join(cluster.get("worker", []))
   if cluster.get("ps", []) or len(cluster.get("worker", [])) > 1:
@@ -75,7 +76,7 @@ def run_and_stream(cmd):
 
   logging.getLogger().setLevel(logging.INFO)
   logging.basicConfig(level=logging.INFO,
-                      filename=log_file,
+                      filename=output_file,
                       filemode='w',
                       format=('%(levelname)s|%(asctime)s'
                               '|%(pathname)s|%(lineno)d| %(message)s'),
@@ -87,4 +88,6 @@ def run_and_stream(cmd):
   run_and_stream(command)
   logging.info("Finished: %s", " ".join(command))
 
-  shutil.copy(log_file, kubebench_exp_output_dir)
+
+if __name__ == "__main__":
+  main()