From ab6be0d2446ca72e761b9543c6e7b3e4fb802f4e Mon Sep 17 00:00:00 2001
From: Georgios Ioannides <32945094+gioannides@users.noreply.github.com>
Date: Thu, 14 Nov 2024 10:49:16 -0800
Subject: [PATCH] Fix model checkpoint saving issue when using PEFT (#727)

Fix a model checkpoint saving issue when using PEFT: there is no check for
whether the directory already exists, resulting in an error when using
distributed training.

Co-authored-by: gioannides
---
 optimum/neuron/utils/peft_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/neuron/utils/peft_utils.py b/optimum/neuron/utils/peft_utils.py
index 4855f2ef2..8c753ec38 100644
--- a/optimum/neuron/utils/peft_utils.py
+++ b/optimum/neuron/utils/peft_utils.py
@@ -176,7 +176,7 @@ def state_dict(self):
 
         adapter_shards_dir_model = os.path.join(output_dir, "adapter_shards", "model")
         if not os.path.isdir(adapter_shards_dir_model):
-            os.makedirs(adapter_shards_dir_model)
+            os.makedirs(adapter_shards_dir_model, exist_ok=True)
 
         dummy_mod = DummyModule()
         neuronx_distributed.trainer.save_checkpoint(
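
For context, a minimal sketch of the failure mode this patch guards against;
the path and the two-worker scenario below are illustrative, not taken from
the patched module:

import os

# In distributed training, two workers can both pass the isdir() check
# before either has created the directory; the slower worker's makedirs()
# would then raise FileExistsError unless exist_ok=True makes it idempotent.
path = "adapter_shards/model"  # illustrative path mirroring the patched code
if not os.path.isdir(path):
    os.makedirs(path, exist_ok=True)  # safe even if another worker created it first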