|
| 1 | +#!/usr/bin/env python |
| 2 | +# encoding: utf-8 |
| 3 | + |
| 4 | +""" |
| 5 | +tl train |
| 6 | +======== |
| 7 | +(Alpha release) |
| 8 | +
|
| 9 | +The tensorlayer.cli.train module provides the ``tl train`` subcommand. |
| 10 | +It helps the user bootstrap a TensorFlow/TensorLayer program for distributed training |
| 11 | +using multiple GPU cards or CPUs on a computer. |
| 12 | +
|
You need to first set CUDA_VISIBLE_DEVICES to tell ``tl train``
which GPUs are available. If CUDA_VISIBLE_DEVICES is not set,
``tl train`` will try its best to discover all available GPUs.
| 16 | +
|
In distributed training, each TensorFlow program needs a TF_CONFIG environment variable to describe
the cluster. It also needs a master daemon to
monitor all trainers. ``tl train`` is responsible
for automatically managing these two tasks.
| 21 | +
|
| 22 | +Usage |
| 23 | +----- |
| 24 | +
|
| 25 | +tl train [-h] [-p NUM_PSS] [-c CPU_TRAINERS] <file> [args [args ...]] |
| 26 | +
|
| 27 | +.. code-block:: bash |
| 28 | + |
| 29 | + tl train example/tutorial_mnist_distributed.py |
| 30 | +
|
| 31 | + # example of using customized number of CPUs |
| 32 | + tl train -c 16 example/tutorial_imagenet_inceptionV3_distributed.py |
| 33 | +
|
| 34 | + # example of running training program with customized arguments |
| 35 | + tl train example/tutorial_imagenet_inceptionV3_distributed.py -- --batch_size 16 |
| 36 | +
|
| 37 | +
|
| 38 | +Parameters |
| 39 | +---------- |
| 40 | +
|
| 41 | +- ``file``: python file path. |
| 42 | +
|
| 43 | +- ``NUM_PSS`` : The number of parameter servers. |
| 44 | +
|
| 45 | +- ``CPU_TRAINERS``: The number of CPU trainers. |
| 46 | +
|
| 47 | + It is recommended that ``NUM_PSS + CPU_TRAINERS <= cpu count`` |
| 48 | +
|
| 49 | +- ``args``: Any parameter after ``--`` would be passed to the python program. |
| 50 | +
|
| 51 | +
|
| 52 | +Notes |
| 53 | +----- |
| 54 | +
|
| 55 | +A parallel training program would require multiple parameter servers |
| 56 | +to help parallel trainers to exchange intermediate gradients. |
| 57 | +The best number of parameter servers is often proportional to the |
| 58 | +size of your model as well as the number of CPUs available. |
| 59 | +You can control the number of parameter servers using the ``-p`` parameter. |
| 60 | +
|
| 61 | +If you have a single computer with massive CPUs, you can use the ``-c`` parameter |
| 62 | +to enable CPU-only parallel training. |
| 63 | +The reason we are not supporting GPU-CPU co-training is because GPU and |
| 64 | +CPU are running at different speeds. Using them together in training would |
| 65 | +incur stragglers. |
| 66 | +""" |
| 67 | + |
| 68 | +import argparse |
| 69 | +import json |
| 70 | +import multiprocessing |
| 71 | +import os |
| 72 | +import platform |
| 73 | +import re |
| 74 | +import subprocess |
| 75 | +import sys |
| 76 | + |
| 77 | +PORT_BASE = 10000 |
| 78 | + |
| 79 | + |
| 80 | +def _get_gpu_ids(): |
| 81 | + if 'CUDA_VISIBLE_DEVICES' in os.environ: |
| 82 | + return [int(x) for x in os.environ.get('CUDA_VISIBLE_DEVICES', '').split(',')] |
| 83 | + if platform.system() in ['Darwin', 'Linux']: |
| 84 | + return [int(d.replace('nvidia', '')) for d in os.listdir('/dev') if re.match('^nvidia\d+$', d)] |
| 85 | + else: |
| 86 | + print('Please set CUDA_VISIBLE_DEVICES (see http://acceleware.com/blog/cudavisibledevices-masking-gpus)') |
| 87 | + return [] |
| 88 | + |
| 89 | + |
| 90 | +GPU_IDS = _get_gpu_ids() |
| 91 | + |
| 92 | + |
def create_tf_config(cluster_spec, task_type, task_index):
    """Build the dict serialized into one task's TF_CONFIG variable.

    ``cluster_spec`` maps job names ('ps', 'worker') to host:port lists;
    ``task_type``/``task_index`` identify this process within the cluster.
    """
    task = {'type': task_type, 'index': task_index}
    return {'cluster': cluster_spec, 'task': task}
| 101 | + |
| 102 | + |
def create_tf_jobs(cluster_spec, prog, args):
    """Spawn one subprocess per task in ``cluster_spec``.

    Each worker task is pinned to one GPU from ``GPU_IDS`` via
    CUDA_VISIBLE_DEVICES; ps tasks (and surplus workers) get ''.
    Every child receives a task-specific TF_CONFIG. Yields the
    ``subprocess.Popen`` handle of each child.
    """
    gpu_assignment = {('worker', idx): gpu_idx for idx, gpu_idx in enumerate(GPU_IDS)}
    for job_type in cluster_spec:
        for task_index in range(len(cluster_spec[job_type])):
            new_env = os.environ.copy()
            new_env.update({
                'CUDA_VISIBLE_DEVICES': str(gpu_assignment.get((job_type, task_index), '')),
                'TF_CONFIG': json.dumps(create_tf_config(cluster_spec, job_type, task_index)),
            })
            # Use the interpreter running this tool instead of a hard-coded
            # 'python3': works inside virtualenvs and on systems where no
            # 'python3' binary is on PATH (e.g. Windows).
            yield subprocess.Popen([sys.executable, prog] + args, env=new_env)
| 113 | + |
| 114 | + |
def validate_arguments(args):
    """Sanity-check parsed CLI arguments; terminate with exit code 1 on error.

    Checks, in order:
    - at least one parameter server is requested;
    - in CPU-only mode, the requested trainer count does not exceed the
      machine's CPU count;
    - the training program file exists.
    """
    if args.num_pss < 1:
        # the check is `>= 1`; the old message wrongly said "ore than one"
        print('Value error: must have at least one parameter server.')
        sys.exit(1)

    if not GPU_IDS:
        num_cpus = multiprocessing.cpu_count()
        if args.cpu_trainers > num_cpus:
            print('Value error: there are %s available CPUs but you are requiring %s.' % (num_cpus, args.cpu_trainers))
            sys.exit(1)

    if not os.path.isfile(args.file):
        print('Value error: model training file does not exist')
        sys.exit(1)
| 129 | + |
| 130 | + |
def main(args):
    """Launch the distributed training cluster and wait for shutdown.

    Builds a localhost cluster spec (ps tasks first, then workers),
    starts all child processes, then blocks until the user presses
    ENTER (or hits Ctrl-C) and finally kills and reaps every child.
    """
    validate_arguments(args)
    num_workers = len(GPU_IDS) if GPU_IDS else args.cpu_trainers
    print('Using program %s with args %s' % (args.file, ' '.join(args.args)))
    print('Using %d workers, %d parameter servers, %d GPUs.' % (num_workers, args.num_pss, len(GPU_IDS)))

    # ps tasks occupy PORT_BASE..PORT_BASE+num_pss-1; workers follow.
    ps_hosts = ['localhost:%d' % (PORT_BASE + i) for i in range(args.num_pss)]
    worker_hosts = ['localhost:%d' % (PORT_BASE + args.num_pss + i) for i in range(num_workers)]
    cluster_spec = {'ps': ps_hosts, 'worker': worker_hosts}

    jobs = list(create_tf_jobs(cluster_spec, args.file, args.args))
    try:
        print('Press ENTER to exit the training ...')
        sys.stdin.readline()
    except KeyboardInterrupt:  # https://docs.python.org/3/library/exceptions.html#KeyboardInterrupt
        print('Keyboard interrupt received')
    finally:
        # Kill everything first, then reap, so no child outlives the tool.
        print('stopping all subprocesses ...')
        for job in jobs:
            job.kill()
        for job in jobs:
            job.wait()
        print('END')
| 153 | + |
| 154 | + |
def build_arg_parser(parser):
    """Register the ``tl train`` options on *parser*.

    Options: ``-p/--pss`` (parameter server count), ``-c/--cpu_trainers``
    (CPU trainer count), the training ``file`` path, and trailing ``args``
    forwarded verbatim to the training program.
    """
    parser.add_argument('-p', '--pss', dest='num_pss', type=int, default=1, help='number of parameter servers')
    parser.add_argument('-c', '--cpu_trainers', dest='cpu_trainers', type=int, default=1, help='number of CPU trainers')
    # fixed user-facing typo: 'trainning' -> 'training'
    parser.add_argument('file', help='model training file path')
    parser.add_argument('args', nargs='*', type=str, help='arguments to <file>')
| 160 | + |
| 161 | + |
if __name__ == "__main__":
    # Allow running this module directly, outside the `tl` CLI wrapper.
    parser = argparse.ArgumentParser()
    build_arg_parser(parser)
    args = parser.parse_args()
    main(args)
0 commit comments