Commit fdea370

Introduce an experimental CLI to help train TL programs using multiple GPUs. (tensorlayer#287)
1 parent 21364aa commit fdea370

11 files changed (+226, -7 lines)

.gitignore (+2)

@@ -10,3 +10,5 @@ data/.DS_Store
 *.pyc
 *.gz
 .spyproject/
+*~
+

yapf.yaml → .style.yapf

File renamed without changes.

docs/index.rst (+12)

@@ -56,6 +56,18 @@ method, this part of the documentation is for you.
   modules/distributed
   modules/db
 
+
+Command-line Reference
+----------------------
+
+TensorLayer provides a handy command-line tool `tl` to perform some common tasks.
+
+.. toctree::
+   :maxdepth: 2
+
+   modules/cli
+
+
 Indices and tables
 ==================

docs/modules/cli.rst (+6, new file)

CLI
======

.. automodule:: tensorlayer.cli

.. automodule:: tensorlayer.cli.train

docs/requirements.txt (+2)

@@ -3,3 +3,5 @@ numpydoc
 scipy
 scikit-image
 matplotlib
+pymongo
+sphinx

example/tutorial_imagenet_inceptionV3_distributed.py (+1, -1)

@@ -117,7 +117,7 @@ def load_data(file, task_spec=None, batch_size=16, epochs=1, shuffle_size=0):
        dataset = dataset.shuffle(buffer_size=shuffle_size)

    def _parse_example_fn(line):
-        line_split = line.split(',')
+        line_split = line.decode().split(',')
        filename = line_split[0]
        labels_names = line_split[1:]
        # labels
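
A short aside on why this one-line change matters (this sketch is not part of the commit): under Python 3, the line presumably reaches the parser as a `bytes` object rather than a `str` (TensorFlow hands text lines to Python callbacks as bytes), so `line.split(',')` with a string separator raises a `TypeError`; decoding first restores the old behaviour. A minimal illustration, using a made-up sample line:

    # Minimal illustration of the fix; the sample line below is hypothetical.
    line = b'n01440764/img_0001.JPEG,0,1'      # what the parser sees under Python 3: bytes
    try:
        line.split(',')                        # bytes.split needs a bytes separator
    except TypeError as e:
        print(e)                               # a bytes-like object is required, not 'str'
    line_split = line.decode().split(',')      # decode to str first, as the commit does
    filename, labels_names = line_split[0], line_split[1:]
    print(filename, labels_names)              # n01440764/img_0001.JPEG ['0', '1']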

setup.py (+8, -6)

@@ -14,12 +14,14 @@
     include_package_data=True,
     author='TensorLayer Contributors',
     author_email='[email protected]',
-    url = "https://github.com/zsdonghao/tensorlayer" ,
-    license = "apache" ,
-    packages = find_packages(),
+    url="https://github.com/tensorlayer/tensorlayer",
+    license="apache",
+    packages=find_packages(),
     install_requires=install_requires,
-    # scripts=['tutorial_mnist.py'],
-    description = "Reinforcement Learning and Deep Learning Library for Researcher and Engineer.",
-    keywords = "deep learning, reinforcement learning, tensorflow",
+    scripts=[
+        'tl',
+    ],
+    description="Reinforcement Learning and Deep Learning Library for Researcher and Engineer.",
+    keywords="deep learning, reinforcement learning, tensorflow",
     platform=['any'],
 )
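
Listing the `tl` wrapper under `scripts=[...]` tells setuptools to copy it into the environment's scripts (bin) directory at install time, so after installation the `tl` command, and hence `tl train`, should be available on the PATH. The wrapper itself is the small bash script added at the bottom of this commit.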

tensorlayer/cli/__init__.py (+3, new file)

"""
The tensorlayer.cli module provides a command-line tool for some common tasks.
"""

tensorlayer/cli/__main__.py (+13, new file)

import argparse
from tensorlayer.cli import train

if __name__ == "__main__":
    parser = argparse.ArgumentParser(prog='tl')
    subparsers = parser.add_subparsers(dest='cmd')
    train_parser = subparsers.add_parser('train', help='train a model using multiple local GPUs or CPUs.')
    train.build_arg_parser(train_parser)
    args = parser.parse_args()
    if args.cmd == 'train':
        train.main(args)
    else:
        parser.print_help()
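
Because `tensorlayer/cli` defines a `__main__` module, the CLI can also be invoked directly as `python3 -m tensorlayer.cli train <file>`, which is exactly what the `tl` wrapper script below does; running `python3 -m tensorlayer.cli` with no subcommand falls through to `parser.print_help()`.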

tensorlayer/cli/train.py (+166, new file)

#!/usr/bin/env python
# encoding: utf-8

"""
tl train
========
(Alpha release)

The tensorlayer.cli.train module provides the ``tl train`` subcommand.
It helps the user bootstrap a TensorFlow/TensorLayer program for distributed training
using multiple GPU cards or CPUs on a computer.

You need to first set CUDA_VISIBLE_DEVICES to tell ``tl train``
which GPUs are available. If CUDA_VISIBLE_DEVICES is not given,
``tl train`` will try its best to discover all available GPUs.

In distributed training, each TensorFlow program needs a TF_CONFIG environment
variable to describe the cluster. It also needs a master daemon to
monitor all trainers. ``tl train`` is responsible
for automatically managing these two tasks.

Usage
-----

tl train [-h] [-p NUM_PSS] [-c CPU_TRAINERS] <file> [args [args ...]]

.. code-block:: bash

  tl train example/tutorial_mnist_distributed.py

  # example of using a customized number of CPUs
  tl train -c 16 example/tutorial_imagenet_inceptionV3_distributed.py

  # example of running the training program with customized arguments
  tl train example/tutorial_imagenet_inceptionV3_distributed.py -- --batch_size 16


Parameters
----------

- ``file``: python file path.

- ``NUM_PSS``: The number of parameter servers.

- ``CPU_TRAINERS``: The number of CPU trainers.

  It is recommended that ``NUM_PSS + CPU_TRAINERS <= cpu count``.

- ``args``: Any parameter after ``--`` is passed to the python program.


Notes
-----

A parallel training program requires multiple parameter servers
to help parallel trainers exchange intermediate gradients.
The best number of parameter servers is often proportional to the
size of your model as well as the number of CPUs available.
You can control the number of parameter servers using the ``-p`` parameter.

If you have a single computer with many CPUs, you can use the ``-c`` parameter
to enable CPU-only parallel training.
We do not support GPU-CPU co-training because GPUs and CPUs run at
different speeds; using them together in training would incur stragglers.
"""

import argparse
import json
import multiprocessing
import os
import platform
import re
import subprocess
import sys

PORT_BASE = 10000


def _get_gpu_ids():
    if 'CUDA_VISIBLE_DEVICES' in os.environ:
        return [int(x) for x in os.environ.get('CUDA_VISIBLE_DEVICES', '').split(',')]
    if platform.system() in ['Darwin', 'Linux']:
        return [int(d.replace('nvidia', '')) for d in os.listdir('/dev') if re.match(r'^nvidia\d+$', d)]
    else:
        print('Please set CUDA_VISIBLE_DEVICES (see http://acceleware.com/blog/cudavisibledevices-masking-gpus)')
        return []


GPU_IDS = _get_gpu_ids()


def create_tf_config(cluster_spec, task_type, task_index):
    return {
        'cluster': cluster_spec,
        'task': {
            'type': task_type,
            'index': task_index
        },
    }


def create_tf_jobs(cluster_spec, prog, args):
    gpu_assignment = dict((('worker', idx), gpu_idx) for (idx, gpu_idx) in enumerate(GPU_IDS))
    for job_type in cluster_spec:
        for task_index in range(len(cluster_spec[job_type])):
            new_env = os.environ.copy()
            new_env.update({
                'CUDA_VISIBLE_DEVICES': str(gpu_assignment.get((job_type, task_index), '')),
                'TF_CONFIG': json.dumps(create_tf_config(cluster_spec, job_type, task_index)),
            })
            yield subprocess.Popen(['python3', prog] + args, env=new_env)


def validate_arguments(args):
    if args.num_pss < 1:
        print('Value error: must have at least one parameter server.')
        exit(1)

    if not GPU_IDS:
        num_cpus = multiprocessing.cpu_count()
        if args.cpu_trainers > num_cpus:
            print('Value error: there are %s available CPUs but you are requiring %s.' % (num_cpus, args.cpu_trainers))
            exit(1)

    if not os.path.isfile(args.file):
        print('Value error: model training file does not exist')
        exit(1)


def main(args):
    validate_arguments(args)
    num_workers = len(GPU_IDS) if GPU_IDS else args.cpu_trainers
    print('Using program %s with args %s' % (args.file, ' '.join(args.args)))
    print('Using %d workers, %d parameter servers, %d GPUs.' % (num_workers, args.num_pss, len(GPU_IDS)))
    cluster_spec = {
        'ps': ['localhost:%d' % (PORT_BASE + i) for i in range(args.num_pss)],
        'worker': ['localhost:%d' % (PORT_BASE + args.num_pss + i) for i in range(num_workers)]
    }
    processes = list(create_tf_jobs(cluster_spec, args.file, args.args))
    try:
        print('Press ENTER to exit the training ...')
        sys.stdin.readline()
    except KeyboardInterrupt:  # https://docs.python.org/3/library/exceptions.html#KeyboardInterrupt
        print('Keyboard interrupt received')
    finally:
        print('stopping all subprocesses ...')
        for p in processes:
            p.kill()
        for p in processes:
            p.wait()
        print('END')


def build_arg_parser(parser):
    parser.add_argument('-p', '--pss', dest='num_pss', type=int, default=1, help='number of parameter servers')
    parser.add_argument('-c', '--cpu_trainers', dest='cpu_trainers', type=int, default=1, help='number of CPU trainers')
    parser.add_argument('file', help='model training file path')
    parser.add_argument('args', nargs='*', type=str, help='arguments to <file>')


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    build_arg_parser(parser)
    args = parser.parse_args()
    main(args)
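
To make the orchestration concrete, here is a small sketch (not part of the commit) of what `main` and `create_tf_config` above produce for a CPU-only run such as `tl train -p 1 -c 2 example/tutorial_mnist_distributed.py`: one parameter server and two workers on localhost, each spawned process receiving its own TF_CONFIG through its environment:

    import json

    # Mirrors main() and create_tf_config() above for `-p 1 -c 2` with no GPUs detected.
    PORT_BASE = 10000
    num_pss, num_workers = 1, 2
    cluster_spec = {
        'ps': ['localhost:%d' % (PORT_BASE + i) for i in range(num_pss)],
        'worker': ['localhost:%d' % (PORT_BASE + num_pss + i) for i in range(num_workers)],
    }
    # TF_CONFIG handed to the second worker (task index 1):
    print(json.dumps({'cluster': cluster_spec, 'task': {'type': 'worker', 'index': 1}}))
    # {"cluster": {"ps": ["localhost:10000"], "worker": ["localhost:10001", "localhost:10002"]},
    #  "task": {"type": "worker", "index": 1}}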

tl (+13, new file)

#!/bin/bash

SOURCE="${BASH_SOURCE[0]}"
while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink
  DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
  SOURCE="$(readlink "$SOURCE")"
  [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" # if $SOURCE was a relative symlink, resolve it relative to the path where the symlink file was located
done
DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"

export PYTHONPATH="${DIR}/src:${PYTHONPATH}"

python3 -m tensorlayer.cli "$@"
