# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
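
# FtTrainer extends torchtitan's Trainer with fault tolerance: train() runs the
# usual training loop inside ft.maybe_semi_sync_training(), so that (when a
# torchft ft_manager is configured) replica groups train semi-synchronously and
# exchange weights only every `sync_every` steps rather than on every iteration.
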
import os
import time
from datetime import timedelta
from typing import Optional

import torch

import torchtitan.components.ft as ft

from torch.distributed.elastic.multiprocessing.errors import record
from torchtitan.config_manager import JobConfig
from torchtitan.distributed import utils as dist_utils
from torchtitan.tools.logging import init_logger, logger
from torchtitan.tools.profiling import (
    maybe_enable_memory_snapshot,
    maybe_enable_profiling,
)
from torchtitan.train import Trainer


class FtTrainer(Trainer):
    # Override the train method to add fault tolerance
    @record
    def train(self):
        job_config = self.job_config

        self.checkpointer.load(step=job_config.checkpoint.load_step)
        logger.info(f"Training starts at step {self.step + 1}.")

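        # Each of these context managers is a conditional no-op: profiling and the
        # memory snapshot are gated by job_config, and semi-sync training only
        # kicks in when an ft_manager is available. sync_every=2 is hardcoded here;
        # per torchft's semi-sync (LocalSGD/DiLoCo-style) semantics it controls how
        # many local steps run between weight synchronizations.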
        with maybe_enable_profiling(
            job_config, global_step=self.step
        ) as torch_profiler, maybe_enable_memory_snapshot(
            job_config, global_step=self.step
        ) as memory_profiler, ft.maybe_semi_sync_training(
            job_config,
            ft_manager=self.ft_manager,
            model=self.model_parts[0],
            optimizer=self.optimizers,
            sync_every=2,
        ) as semi_sync_training:
            data_iterator = iter(self.dataloader)

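            # Main training loop (mirrors Trainer.train()): fetch a batch, run one
            # training step via self.train_step, checkpoint when due, and tick the
            # profilers.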
            while self.step < job_config.training.steps:
                self.step += 1
                self.gc_handler.run(self.step)
                inputs, labels = self.next_batch(data_iterator)
                self.train_step(inputs, labels)
                self.checkpointer.save(
                    self.step, force=(self.step == job_config.training.steps)
                )

                # signal the profiler that the next profiling step has started
                if torch_profiler:
                    torch_profiler.step()
                if memory_profiler:
                    memory_profiler.step()

                # reduce timeout after first train step for faster signal
                # (assuming lazy init and compilation are finished)
                if self.step == 1:
                    dist_utils.set_pg_timeouts(
                        timeout=timedelta(
                            seconds=job_config.comm.train_timeout_seconds
                        ),
                        world_mesh=self.world_mesh,
                    )

        if torch.distributed.get_rank() == 0:
            logger.info("Sleeping 2 seconds for other ranks to complete")
            time.sleep(2)

        self.metrics_processor.close()
        logger.info("Training completed")


if __name__ == "__main__":
    init_logger()
    config = JobConfig()
    config.maybe_add_custom_args()
    config.parse_args()
    trainer: Optional[Trainer] = None

    try:
        trainer = FtTrainer(config)

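        # Seed-checkpoint mode: instead of training, write an initial, unsharded
        # checkpoint from a single-process run (WORLD_SIZE=1) that distributed
        # jobs can later load as their starting point.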
        if config.checkpoint.create_seed_checkpoint:
            assert (
                int(os.environ["WORLD_SIZE"]) == 1
            ), "Must create seed checkpoint using a single device, to disable sharding."
            assert (
                config.checkpoint.enable_checkpoint
            ), "Must enable checkpointing when creating a seed checkpoint."
            trainer.checkpointer.save(curr_step=0, force=True)
            logger.info("Created seed checkpoint")
        else:
            trainer.train()
    finally:
        if trainer:
            trainer.close()

        if torch.distributed.is_initialized():
            torch.distributed.destroy_process_group()
            logger.info("Process group destroyed.")
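
# Example launch (illustrative only: the script path, GPU count, and config file
# below are placeholders, and the --job.config_file flag follows torchtitan's
# --<section>.<option> CLI convention; verify against your torchtitan version):
#
#   torchrun --nproc_per_node=8 train.py --job.config_file ./my_job_config.toml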