Skip to content
This repository has been archived by the owner on Aug 9, 2024. It is now read-only.

Commit

Permalink
Update tf-cnn example (#210)
Browse files Browse the repository at this point in the history
- Now compatible with TensorFlow 1.13
- No more Kubebench environment variables in code
- Update Dockerfiles and fix build issues
  • Loading branch information
xyhuang authored and k8s-ci-robot committed Aug 7, 2019
1 parent 21c1fef commit bc682c1
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 26 deletions.
7 changes: 3 additions & 4 deletions build/images/examples/tf-cnn/runner-cpu.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.

FROM tensorflow/tensorflow:1.8.0
FROM tensorflow/tensorflow:1.13.2

RUN apt-get update && apt-get install -y --no-install-recommends \
ca-certificates \
build-essential \
git

RUN mkdir -p /opt
RUN mkdir -p /kubebench/experiments

RUN git clone -n https://github.com/tensorflow/benchmarks.git /opt/tf-benchmarks
RUN cd /opt/tf-benchmarks; git checkout 3b90c14fb2bf02ca5d27c188aee878663229a0a7
RUN git clone --branch=cnn_tf_v1.13_compatible --depth=1 \
https://github.com/tensorflow/benchmarks.git /opt/tf-benchmarks

COPY examples/src/tf-cnn/runner.py /opt
RUN chmod u+x /opt/*
Expand Down
9 changes: 3 additions & 6 deletions build/images/examples/tf-cnn/runner-gpu.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.

FROM tensorflow/tensorflow:1.8.0-gpu
FROM tensorflow/tensorflow:1.13.2-gpu

RUN apt-get update && apt-get install -y --no-install-recommends \
ca-certificates \
build-essential \
git

RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
ENV LD_LIBRARY_PATH /usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH}
RUN mkdir -p /opt
RUN mkdir -p /kubebench/experiments

RUN git clone -n https://github.com/tensorflow/benchmarks.git /opt/tf-benchmarks
RUN cd /opt/tf-benchmarks; git checkout 3b90c14fb2bf02ca5d27c188aee878663229a0a7
RUN git clone --branch=cnn_tf_v1.13_compatible --depth=1 \
https://github.com/tensorflow/benchmarks.git /opt/tf-benchmarks

COPY examples/src/tf-cnn/runner.py /opt
RUN chmod u+x /opt/*
Expand Down
8 changes: 6 additions & 2 deletions examples/src/tf-cnn/post_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

import json
import os
import sys


def run():
Expand All @@ -30,8 +31,11 @@ def run():
'Optimizer',
'Variables',
'Sync']
output_dir = os.environ.get("KUBEBENCH_EXP_OUTPUT_PATH")
result_dir = os.environ.get("KUBEBENCH_EXP_RESULT_PATH")

args = sys.argv[1:]
output_dir = args[0]
result_dir = args[1]

result_file = os.path.join(result_dir, "result.json")
if not os.path.exists(result_dir):
os.makedirs(result_dir)
Expand Down
31 changes: 17 additions & 14 deletions examples/src/tf-cnn/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import logging
import json
import os
import shutil
import subprocess
import sys

Expand Down Expand Up @@ -45,26 +45,27 @@ def run_and_stream(cmd):
raise ValueError("cmd: {0} exited with code {1}".format(
" ".join(cmd), process.returncode))

if __name__ == "__main__":

def main():
parser = argparse.ArgumentParser()
parser.add_argument("--output_dir", type=str)
# split args for runner process and benchmark subprocess
runner_args, benchmark_args = parser.parse_known_args(sys.argv[1:])

tf_config = os.environ.get("TF_CONFIG", '{}')
tf_config_json = json.loads(tf_config)
cluster = tf_config_json.get("cluster", {})
job_name = tf_config_json.get("task", {}).get("type", "")
task_index = tf_config_json.get("task", {}).get("index", "")

kubebench_exp_output_dir = os.environ.get("KUBEBENCH_EXP_OUTPUT_PATH")
if not os.path.exists(kubebench_exp_output_dir):
os.makedirs(kubebench_exp_output_dir)

log_dir = "/tmp/logs"
if not os.path.exists(log_dir):
os.makedirs(log_dir)
output_dir = runner_args.output_dir
if not os.path.exists(output_dir):
os.makedirs(output_dir)
jn = job_name if job_name != "" else "worker"
ti = str(task_index) if task_index != "" else "0"
log_file = os.path.join(log_dir, jn + ti + ".log")
output_file = os.path.join(output_dir, jn + ti + ".log")

args = sys.argv[1:]
command = ["python", "tf_cnn_benchmarks.py"] + args
command = ["python", "tf_cnn_benchmarks.py"] + benchmark_args
ps_hosts = ",".join(cluster.get("ps", []))
worker_hosts = ",".join(cluster.get("worker", []))
if cluster.get("ps", []) or len(cluster.get("worker", [])) > 1:
Expand All @@ -75,7 +76,7 @@ def run_and_stream(cmd):

logging.getLogger().setLevel(logging.INFO)
logging.basicConfig(level=logging.INFO,
filename=log_file,
filename=output_file,
filemode='w',
format=('%(levelname)s|%(asctime)s'
'|%(pathname)s|%(lineno)d| %(message)s'),
Expand All @@ -87,4 +88,6 @@ def run_and_stream(cmd):
run_and_stream(command)
logging.info("Finished: %s", " ".join(command))

shutil.copy(log_file, kubebench_exp_output_dir)

if __name__ == "__main__":
main()

0 comments on commit bc682c1

Please sign in to comment.