From 46625b0b0633fcd4c1c0e2e3e9fde0c1127a384c Mon Sep 17 00:00:00 2001 From: Andrew Suter-Morris Date: Fri, 16 Feb 2024 18:46:27 +0000 Subject: [PATCH 01/16] tweaks to work local --- requirements.txt | 4 ++-- train/__init__.py | 3 ++- train/base.py | 2 +- train/train_c.py | 3 ++- train/train_c_lora.py | 4 +++- 5 files changed, 10 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index 0a2397e..9e5c031 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ --find-links https://download.pytorch.org/whl/torch_stable.html accelerate>=0.25.0 -torch==2.1.2+cu118 -torchvision==0.16.2+cu118 +#torch==2.1.2+cu118 +#torchvision==0.16.2+cu118 transformers>=4.30.0 numpy>=1.23.5 kornia>=0.7.0 diff --git a/train/__init__.py b/train/__init__.py index 2a65075..484f69b 100755 --- a/train/__init__.py +++ b/train/__init__.py @@ -1,4 +1,5 @@ from .train_b import WurstCore as WurstCoreB from .train_c import WurstCore as WurstCoreC from .train_c_controlnet import WurstCore as ControlNetCore -from .train_c_lora import WurstCore as LoraCore \ No newline at end of file +from .train_c_lora import WurstCore as LoraCore + diff --git a/train/base.py b/train/base.py index 4e8a6ef..2b2a83a 100755 --- a/train/base.py +++ b/train/base.py @@ -310,7 +310,7 @@ def train(self, data: WarpCore.Data, extras: WarpCore.Extras, models: Models, op self.sample(models, data, extras) def save_checkpoints(self, models: Models, optimizers: Optimizers, suffix=None): - barrier() + #barrier() suffix = '' if suffix is None else suffix self.save_info(self.info, suffix=suffix) models_dict = models.to_dict() diff --git a/train/train_c.py b/train/train_c.py index 87c6608..4465b95 100755 --- a/train/train_c.py +++ b/train/train_c.py @@ -192,7 +192,8 @@ def dummy_context(): ) def setup_optimizers(self, extras: Extras, models: Models) -> TrainingCore.Optimizers: - optimizer = optim.AdamW(models.generator.parameters(), lr=self.config.lr) # , eps=1e-7, betas=(0.9, 0.95)) + import bitsandbytes as bnb + optimizer = bnb.optim.AdamW8bit(models.generator.parameters(), lr=self.config.lr) # , eps=1e-7, betas=(0.9, 0.95)) optimizer = self.load_optimizer(optimizer, 'generator_optim', fsdp_model=models.generator if self.config.use_fsdp else None) return self.Optimizers(generator=optimizer) diff --git a/train/train_c_lora.py b/train/train_c_lora.py index 8b83eee..2e71abf 100755 --- a/train/train_c_lora.py +++ b/train/train_c_lora.py @@ -5,7 +5,9 @@ from warmup_scheduler import GradualWarmupScheduler import sys +sys.path.append('.') import os +print(os.getcwd()) import re from dataclasses import dataclass @@ -327,4 +329,4 @@ def decode_latents(self, latents: torch.Tensor, batch: dict, models: Models, ext warpcore.fsdp_defaults['sharding_strategy'] = ShardingStrategy.NO_SHARD # RUN TRAINING - warpcore() + warpcore(single_gpu=True) From 15afedc09a37b1f604a11a852f4f80246bce3468 Mon Sep 17 00:00:00 2001 From: Andrew Suter-Morris Date: Fri, 16 Feb 2024 19:10:56 +0000 Subject: [PATCH 02/16] uncomment, more changes --- core/__init__.py | 2 +- requirements.txt | 4 ++-- train/train_c.py | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/core/__init__.py b/core/__init__.py index 03af283..43ee89a 100644 --- a/core/__init__.py +++ b/core/__init__.py @@ -297,7 +297,7 @@ def __call__(self, single_gpu=False): if self.is_main_node: print() - print("**STARTIG JOB WITH CONFIG:**") + print("**STARTING JOB WITH CONFIG:**") print(yaml.dump(self.config.to_dict(), default_flow_style=False)) 
print("------------------------------------") print() diff --git a/requirements.txt b/requirements.txt index 9e5c031..0a2397e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ --find-links https://download.pytorch.org/whl/torch_stable.html accelerate>=0.25.0 -#torch==2.1.2+cu118 -#torchvision==0.16.2+cu118 +torch==2.1.2+cu118 +torchvision==0.16.2+cu118 transformers>=4.30.0 numpy>=1.23.5 kornia>=0.7.0 diff --git a/train/train_c.py b/train/train_c.py index 4465b95..d819d3f 100755 --- a/train/train_c.py +++ b/train/train_c.py @@ -192,6 +192,7 @@ def dummy_context(): ) def setup_optimizers(self, extras: Extras, models: Models) -> TrainingCore.Optimizers: + print('using bitsandbytes') import bitsandbytes as bnb optimizer = bnb.optim.AdamW8bit(models.generator.parameters(), lr=self.config.lr) # , eps=1e-7, betas=(0.9, 0.95)) optimizer = self.load_optimizer(optimizer, 'generator_optim', From 12fe393852f9dd52f76f8acd8803a9f4bfb95a1c Mon Sep 17 00:00:00 2001 From: Andrew Suter-Morris Date: Fri, 16 Feb 2024 13:59:03 -0700 Subject: [PATCH 03/16] check path --- train/train_c_lora.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/train/train_c_lora.py b/train/train_c_lora.py index 2e71abf..ec946e1 100755 --- a/train/train_c_lora.py +++ b/train/train_c_lora.py @@ -178,6 +178,7 @@ def dummy_context(): else: raise ValueError(f"Unknown model version {self.config.model_version}") + print(self.config.generator_checkpoint_path) if self.config.generator_checkpoint_path is not None: if loading_context is dummy_context: generator.load_state_dict(load_or_fail(self.config.generator_checkpoint_path)) @@ -254,7 +255,10 @@ def dummy_context(): ) def setup_optimizers(self, extras: Extras, models: Models) -> Optimizers: - optimizer = optim.AdamW(models.lora.parameters(), lr=self.config.lr) # , eps=1e-7, betas=(0.9, 0.95)) + print('using bitsandbytes') + import bitsandbytes as bnb + optimizer = bnb.optim.AdamW8bit(models.generator.parameters(), lr=self.config.lr) # , eps=1e-7, betas=(0.9, 0.95)) + #optimizer = optim.AdamW(models.lora.parameters(), lr=self.config.lr) # , eps=1e-7, betas=(0.9, 0.95)) optimizer = self.load_optimizer(optimizer, 'lora_optim', fsdp_model=models.lora if self.config.use_fsdp else None) return self.Optimizers(generator=None, lora=optimizer) From 21607947bd2246c444cd5282f00e2395bc17b9fd Mon Sep 17 00:00:00 2001 From: Andrew Suter-Morris Date: Fri, 16 Feb 2024 14:07:31 -0700 Subject: [PATCH 04/16] print more --- core/utils/save_and_load.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/utils/save_and_load.py b/core/utils/save_and_load.py index 0215f66..04a1f47 100644 --- a/core/utils/save_and_load.py +++ b/core/utils/save_and_load.py @@ -32,6 +32,7 @@ def safe_save(ckpt, path): def load_or_fail(path, wandb_run_id=None): + print(path) accepted_extensions = [".pt", ".ckpt", ".json", ".safetensors"] try: assert any( @@ -45,6 +46,7 @@ def load_or_fail(path, wandb_run_id=None): with open(path, "r", encoding="utf-8") as f: checkpoint = json.load(f) elif path.endswith(".safetensors"): + print(path) checkpoint = {} with safetensors.safe_open(path, framework="pt", device="cpu") as f: for key in f.keys(): From e0fad3693bbcfdd02e3dbef7815c785b6f733f34 Mon Sep 17 00:00:00 2001 From: Andrew Suter-Morris Date: Fri, 16 Feb 2024 14:09:24 -0700 Subject: [PATCH 05/16] logs --- core/utils/save_and_load.py | 4 ++-- train/train_c_lora.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/core/utils/save_and_load.py b/core/utils/save_and_load.py 
index 04a1f47..ae462bf 100644 --- a/core/utils/save_and_load.py +++ b/core/utils/save_and_load.py @@ -32,7 +32,7 @@ def safe_save(ckpt, path): def load_or_fail(path, wandb_run_id=None): - print(path) + print(f'load_or_fail {path}') accepted_extensions = [".pt", ".ckpt", ".json", ".safetensors"] try: assert any( @@ -46,7 +46,7 @@ def load_or_fail(path, wandb_run_id=None): with open(path, "r", encoding="utf-8") as f: checkpoint = json.load(f) elif path.endswith(".safetensors"): - print(path) + print(f'load_or_fail2 {path}') checkpoint = {} with safetensors.safe_open(path, framework="pt", device="cpu") as f: for key in f.keys(): diff --git a/train/train_c_lora.py b/train/train_c_lora.py index ec946e1..74a9dfd 100755 --- a/train/train_c_lora.py +++ b/train/train_c_lora.py @@ -178,7 +178,7 @@ def dummy_context(): else: raise ValueError(f"Unknown model version {self.config.model_version}") - print(self.config.generator_checkpoint_path) + print(f"setup_models {self.config.generator_checkpoint_path}") if self.config.generator_checkpoint_path is not None: if loading_context is dummy_context: generator.load_state_dict(load_or_fail(self.config.generator_checkpoint_path)) From dd647c947d88c2ecbd039d30421833c0d931f10e Mon Sep 17 00:00:00 2001 From: Andrew Suter-Morris Date: Fri, 16 Feb 2024 14:36:23 -0700 Subject: [PATCH 06/16] Add checks for device, Local slurm id first, then cuda, then cpui --- configs/inference/lora_c_1b_bfloat16.yaml | 38 +++++++++++++++++++++++ core/utils/save_and_load.py | 2 -- train/train_b.py | 3 +- train/train_c.py | 4 ++- train/train_c_controlnet.py | 3 +- train/train_c_lora.py | 5 ++- 6 files changed, 47 insertions(+), 8 deletions(-) create mode 100644 configs/inference/lora_c_1b_bfloat16.yaml diff --git a/configs/inference/lora_c_1b_bfloat16.yaml b/configs/inference/lora_c_1b_bfloat16.yaml new file mode 100644 index 0000000..8ccdb32 --- /dev/null +++ b/configs/inference/lora_c_1b_bfloat16.yaml @@ -0,0 +1,38 @@ +# GLOBAL STUFF +experiment_id: stage_c_1b_lora +checkpoint_path: ~/cascade/chk +output_path: ~/cascade/lora_sample +model_version: 1B + +# TRAINING PARAMS +lr: 1.0e-4 +batch_size: 40 +image_size: 768 +multi_aspect_ratio: [1/1, 1/2, 1/3, 2/3, 3/4, 1/5, 2/5, 3/5, 4/5, 1/6, 5/6, 9/16] +grad_accum_steps: 4 +updates: 10000 +backup_every: 1000 +save_every: 100 +warmup_updates: 1 +# use_fsdp: True -> FSDP doesn't work at the moment for LoRA +use_fsdp: False + +# GDF +# adaptive_loss_weight: True + +# LoRA specific. 
'No Defect Train Railcar Wheel' +module_filters: ['.attn'] +rank: 4 +train_tokens: + # - ['^snail', null] # token starts with "snail" -> "snail" & "snails", don't need to be reinitialized + - ['[fernando]', '^dog'] # custom token [snail], initialize as avg of snail & snails + + +# ema_start_iters: 5000 +# ema_iters: 100 +# ema_beta: 0.9 + +webdataset_path: file:/home/asutermo/cascade/data/dataset.tar +effnet_checkpoint_path: models/effnet_encoder.safetensors +previewer_checkpoint_path: models/previewer.safetensors +generator_checkpoint_path: models/stage_c_lite_bf16.safetensors \ No newline at end of file diff --git a/core/utils/save_and_load.py b/core/utils/save_and_load.py index ae462bf..0215f66 100644 --- a/core/utils/save_and_load.py +++ b/core/utils/save_and_load.py @@ -32,7 +32,6 @@ def safe_save(ckpt, path): def load_or_fail(path, wandb_run_id=None): - print(f'load_or_fail {path}') accepted_extensions = [".pt", ".ckpt", ".json", ".safetensors"] try: assert any( @@ -46,7 +45,6 @@ def load_or_fail(path, wandb_run_id=None): with open(path, "r", encoding="utf-8") as f: checkpoint = json.load(f) elif path.endswith(".safetensors"): - print(f'load_or_fail2 {path}') checkpoint = {} with safetensors.safe_open(path, framework="pt", device="cpu") as f: for key in f.keys(): diff --git a/train/train_b.py b/train/train_b.py index 02b7b6e..0b68d44 100755 --- a/train/train_b.py +++ b/train/train_b.py @@ -294,9 +294,10 @@ def decode_latents(self, latents: torch.Tensor, batch: dict, models: Models, ext if __name__ == '__main__': print("Launching Script") + device = torch.device(int(os.environ.get('SLURM_LOCALID')) if 'SLURM_LOCALID' in os.environ else "cuda" if torch.cuda.is_available() else "cpu") warpcore = WurstCore( config_file_path=sys.argv[1] if len(sys.argv) > 1 else None, - device=torch.device(int(os.environ.get("SLURM_LOCALID"))) + device=device ) # core.fsdp_defaults['sharding_strategy'] = ShardingStrategy.NO_SHARD diff --git a/train/train_c.py b/train/train_c.py index d819d3f..c333634 100755 --- a/train/train_c.py +++ b/train/train_c.py @@ -258,9 +258,11 @@ def decode_latents(self, latents: torch.Tensor, batch: dict, models: Models, ext if __name__ == '__main__': print("Launching Script") + + device = torch.device(int(os.environ.get('SLURM_LOCALID')) if 'SLURM_LOCALID' in os.environ else "cuda" if torch.cuda.is_available() else "cpu") warpcore = WurstCore( config_file_path=sys.argv[1] if len(sys.argv) > 1 else None, - device=torch.device(int(os.environ.get("SLURM_LOCALID"))) + device=device ) # core.fsdp_defaults['sharding_strategy'] = ShardingStrategy.NO_SHARD diff --git a/train/train_c_controlnet.py b/train/train_c_controlnet.py index 59d58eb..1223b05 100755 --- a/train/train_c_controlnet.py +++ b/train/train_c_controlnet.py @@ -372,9 +372,10 @@ def sample(self, models: Models, data: WarpCore.Data, extras: Extras): if __name__ == '__main__': print("Launching Script") + device = torch.device(int(os.environ.get('SLURM_LOCALID')) if 'SLURM_LOCALID' in os.environ else "cuda" if torch.cuda.is_available() else "cpu") warpcore = WurstCore( config_file_path=sys.argv[1] if len(sys.argv) > 1 else None, - device=torch.device(int(os.environ.get("SLURM_LOCALID"))) + device=device ) warpcore.fsdp_defaults['sharding_strategy'] = ShardingStrategy.NO_SHARD diff --git a/train/train_c_lora.py b/train/train_c_lora.py index 74a9dfd..3042d95 100755 --- a/train/train_c_lora.py +++ b/train/train_c_lora.py @@ -168,7 +168,6 @@ def dummy_context(): yield None loading_context = dummy_context if self.config.training 
else init_empty_weights - with loading_context(): # Diffusion models if self.config.model_version == '3.6B': @@ -178,7 +177,6 @@ def dummy_context(): else: raise ValueError(f"Unknown model version {self.config.model_version}") - print(f"setup_models {self.config.generator_checkpoint_path}") if self.config.generator_checkpoint_path is not None: if loading_context is dummy_context: generator.load_state_dict(load_or_fail(self.config.generator_checkpoint_path)) @@ -326,9 +324,10 @@ def decode_latents(self, latents: torch.Tensor, batch: dict, models: Models, ext if __name__ == '__main__': print("Launching Script") + device = torch.device(int(os.environ.get('SLURM_LOCALID')) if 'SLURM_LOCALID' in os.environ else "cuda" if torch.cuda.is_available() else "cpu") warpcore = WurstCore( config_file_path=sys.argv[1] if len(sys.argv) > 1 else None, - device=torch.device(int(os.environ.get("SLURM_LOCALID"))) + device=device ) warpcore.fsdp_defaults['sharding_strategy'] = ShardingStrategy.NO_SHARD From 46c494ddde8a052b150788b3dbefde4e13cf5d8d Mon Sep 17 00:00:00 2001 From: Andrew Suter-Morris Date: Fri, 16 Feb 2024 14:39:12 -0700 Subject: [PATCH 07/16] single gpu --- train/train_b.py | 3 ++- train/train_c.py | 4 +++- train/train_c_controlnet.py | 4 +++- train/train_c_lora.py | 3 ++- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/train/train_b.py b/train/train_b.py index 0b68d44..c60cfc2 100755 --- a/train/train_b.py +++ b/train/train_b.py @@ -302,4 +302,5 @@ def decode_latents(self, latents: torch.Tensor, batch: dict, models: Models, ext # core.fsdp_defaults['sharding_strategy'] = ShardingStrategy.NO_SHARD # RUN TRAINING - warpcore() + use_single_gpu = torch.cuda.device_count() == 1 + warpcore(single_gpu=use_single_gpu) diff --git a/train/train_c.py b/train/train_c.py index c333634..6d1d9ff 100755 --- a/train/train_c.py +++ b/train/train_c.py @@ -267,4 +267,6 @@ def decode_latents(self, latents: torch.Tensor, batch: dict, models: Models, ext # core.fsdp_defaults['sharding_strategy'] = ShardingStrategy.NO_SHARD # RUN TRAINING - warpcore() + use_single_gpu = torch.cuda.device_count() == 1 + warpcore(single_gpu=use_single_gpu) + diff --git a/train/train_c_controlnet.py b/train/train_c_controlnet.py index 1223b05..0ddb98c 100755 --- a/train/train_c_controlnet.py +++ b/train/train_c_controlnet.py @@ -380,4 +380,6 @@ def sample(self, models: Models, data: WarpCore.Data, extras: Extras): warpcore.fsdp_defaults['sharding_strategy'] = ShardingStrategy.NO_SHARD # RUN TRAINING - warpcore() + use_single_gpu = torch.cuda.device_count() == 1 + warpcore(single_gpu=use_single_gpu) + diff --git a/train/train_c_lora.py b/train/train_c_lora.py index 3042d95..fa0f078 100755 --- a/train/train_c_lora.py +++ b/train/train_c_lora.py @@ -332,4 +332,5 @@ def decode_latents(self, latents: torch.Tensor, batch: dict, models: Models, ext warpcore.fsdp_defaults['sharding_strategy'] = ShardingStrategy.NO_SHARD # RUN TRAINING - warpcore(single_gpu=True) + use_single_gpu = torch.cuda.device_count() == 1 + warpcore(single_gpu=use_single_gpu) From 90901e5ec6f62b234c805ba125b2da5b5f1279c3 Mon Sep 17 00:00:00 2001 From: Andrew Suter-Morris Date: Fri, 16 Feb 2024 14:45:41 -0700 Subject: [PATCH 08/16] barrier if not single gpu --- core/__init__.py | 3 +++ core/templates/diffusion.py | 3 ++- train/base.py | 3 ++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/core/__init__.py b/core/__init__.py index 43ee89a..c799c72 100644 --- a/core/__init__.py +++ b/core/__init__.py @@ -34,6 +34,8 @@ class Config(Base): 
wandb_project: str = None wandb_entity: str = None + single_gpu: bool = False + @dataclass() # not frozen, means that fields are mutable class Info(): # not inheriting from Base, because we don't want to enforce the default fields wandb_run_id: str = None @@ -141,6 +143,7 @@ def setup_config(self, config_file_path=None, config_dict=None, training=True) - return self.Config(training=training) def setup_ddp(self, experiment_id, single_gpu=False): + self.single_gpu = single_gpu if not single_gpu: local_rank = int(os.environ.get("SLURM_LOCALID")) process_id = int(os.environ.get("SLURM_PROCID")) diff --git a/core/templates/diffusion.py b/core/templates/diffusion.py index f36dc3f..9925f55 100644 --- a/core/templates/diffusion.py +++ b/core/templates/diffusion.py @@ -218,7 +218,8 @@ def models_to_save(self): return ['generator', 'generator_ema'] def save_checkpoints(self, models: Models, optimizers: Optimizers, suffix=None): - barrier() + if not self.single_gpu: + barrier() suffix = '' if suffix is None else suffix self.save_info(self.info, suffix=suffix) models_dict = models.to_dict() diff --git a/train/base.py b/train/base.py index 2b2a83a..dd29468 100755 --- a/train/base.py +++ b/train/base.py @@ -310,7 +310,8 @@ def train(self, data: WarpCore.Data, extras: WarpCore.Extras, models: Models, op self.sample(models, data, extras) def save_checkpoints(self, models: Models, optimizers: Optimizers, suffix=None): - #barrier() + if not self.single_gpu: + barrier() suffix = '' if suffix is None else suffix self.save_info(self.info, suffix=suffix) models_dict = models.to_dict() From a48594b247f2857d2cfe64c458e71c52d082e111 Mon Sep 17 00:00:00 2001 From: Andrew Suter-Morris Date: Fri, 16 Feb 2024 14:46:24 -0700 Subject: [PATCH 09/16] remove bitsandbytes for now --- train/train_c.py | 4 +--- train/train_c_lora.py | 5 +---- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/train/train_c.py b/train/train_c.py index 6d1d9ff..9db71dc 100755 --- a/train/train_c.py +++ b/train/train_c.py @@ -192,9 +192,7 @@ def dummy_context(): ) def setup_optimizers(self, extras: Extras, models: Models) -> TrainingCore.Optimizers: - print('using bitsandbytes') - import bitsandbytes as bnb - optimizer = bnb.optim.AdamW8bit(models.generator.parameters(), lr=self.config.lr) # , eps=1e-7, betas=(0.9, 0.95)) + optimizer = optim.AdamW8bit(models.generator.parameters(), lr=self.config.lr) # , eps=1e-7, betas=(0.9, 0.95)) optimizer = self.load_optimizer(optimizer, 'generator_optim', fsdp_model=models.generator if self.config.use_fsdp else None) return self.Optimizers(generator=optimizer) diff --git a/train/train_c_lora.py b/train/train_c_lora.py index fa0f078..dda723e 100755 --- a/train/train_c_lora.py +++ b/train/train_c_lora.py @@ -253,10 +253,7 @@ def dummy_context(): ) def setup_optimizers(self, extras: Extras, models: Models) -> Optimizers: - print('using bitsandbytes') - import bitsandbytes as bnb - optimizer = bnb.optim.AdamW8bit(models.generator.parameters(), lr=self.config.lr) # , eps=1e-7, betas=(0.9, 0.95)) - #optimizer = optim.AdamW(models.lora.parameters(), lr=self.config.lr) # , eps=1e-7, betas=(0.9, 0.95)) + optimizer = optim.AdamW8bit(models.generator.parameters(), lr=self.config.lr) # , eps=1e-7, betas=(0.9, 0.95)) optimizer = self.load_optimizer(optimizer, 'lora_optim', fsdp_model=models.lora if self.config.use_fsdp else None) return self.Optimizers(generator=None, lora=optimizer) From 2353ce1a6ee6400adcd25cc7622c640b2b4a6478 Mon Sep 17 00:00:00 2001 From: Andrew Suter-Morris Date: Fri, 16 Feb 2024 
14:48:34 -0700 Subject: [PATCH 10/16] more fixes --- train/train_c.py | 2 +- train/train_c_lora.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/train/train_c.py b/train/train_c.py index 9db71dc..47e9104 100755 --- a/train/train_c.py +++ b/train/train_c.py @@ -192,7 +192,7 @@ def dummy_context(): ) def setup_optimizers(self, extras: Extras, models: Models) -> TrainingCore.Optimizers: - optimizer = optim.AdamW8bit(models.generator.parameters(), lr=self.config.lr) # , eps=1e-7, betas=(0.9, 0.95)) + optimizer = optim.AdamW(models.generator.parameters(), lr=self.config.lr) # , eps=1e-7, betas=(0.9, 0.95)) optimizer = self.load_optimizer(optimizer, 'generator_optim', fsdp_model=models.generator if self.config.use_fsdp else None) return self.Optimizers(generator=optimizer) diff --git a/train/train_c_lora.py b/train/train_c_lora.py index dda723e..7b83125 100755 --- a/train/train_c_lora.py +++ b/train/train_c_lora.py @@ -253,7 +253,7 @@ def dummy_context(): ) def setup_optimizers(self, extras: Extras, models: Models) -> Optimizers: - optimizer = optim.AdamW8bit(models.generator.parameters(), lr=self.config.lr) # , eps=1e-7, betas=(0.9, 0.95)) + optimizer = optim.AdamW(models.generator.parameters(), lr=self.config.lr) # , eps=1e-7, betas=(0.9, 0.95)) optimizer = self.load_optimizer(optimizer, 'lora_optim', fsdp_model=models.lora if self.config.use_fsdp else None) return self.Optimizers(generator=None, lora=optimizer) From f4ef10c97624936103625aa4dc60aec52c23a76b Mon Sep 17 00:00:00 2001 From: Andrew Suter-Morris Date: Fri, 16 Feb 2024 14:49:07 -0700 Subject: [PATCH 11/16] cleanup sys --- train/train_c_lora.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/train/train_c_lora.py b/train/train_c_lora.py index 7b83125..204c438 100755 --- a/train/train_c_lora.py +++ b/train/train_c_lora.py @@ -5,9 +5,7 @@ from warmup_scheduler import GradualWarmupScheduler import sys -sys.path.append('.') import os -print(os.getcwd()) import re from dataclasses import dataclass From 94c17305e0c2fa08d679ec42dfd2bb10e7f6fb7c Mon Sep 17 00:00:00 2001 From: Andrew Suter-Morris Date: Fri, 16 Feb 2024 14:55:41 -0700 Subject: [PATCH 12/16] cleanup: remove unused imports --- train/train_c_controlnet.py | 4 +--- train/train_c_lora.py | 3 --- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/train/train_c_controlnet.py b/train/train_c_controlnet.py index 0ddb98c..1239e6e 100755 --- a/train/train_c_controlnet.py +++ b/train/train_c_controlnet.py @@ -15,9 +15,8 @@ from modules import EfficientNetEncoder from modules import StageC -from modules import ResBlock, AttnBlock, TimestepBlock, FeedForwardBlock from modules import Previewer -from modules import ControlNet, ControlNetDeliverer +from modules import ControlNet from modules import controlnet_filters from train.base import DataCore, TrainingCore @@ -26,7 +25,6 @@ from core.utils import EXPECTED, EXPECTED_TRAIN, load_or_fail from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy -from torch.distributed.fsdp.wrap import ModuleWrapPolicy from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy import functools from accelerate import init_empty_weights diff --git a/train/train_c_lora.py b/train/train_c_lora.py index 204c438..5d0764d 100755 --- a/train/train_c_lora.py +++ b/train/train_c_lora.py @@ -15,7 +15,6 @@ from modules.effnet import EfficientNetEncoder from modules.stage_c import StageC -from modules.stage_c import ResBlock, AttnBlock, TimestepBlock, FeedForwardBlock from modules.previewer import 
Previewer from modules.lora import apply_lora, apply_retoken, LoRA, ReToken @@ -26,8 +25,6 @@ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy from torch.distributed.fsdp.wrap import ModuleWrapPolicy -from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy -import functools from accelerate import init_empty_weights from accelerate.utils import set_module_tensor_to_device from contextlib import contextmanager From 39fee6db184cd18a9d3187e819edc2867983cf12 Mon Sep 17 00:00:00 2001 From: Andrew Suter-Morris Date: Fri, 16 Feb 2024 15:02:43 -0700 Subject: [PATCH 13/16] setup 8bit adam --- train/base.py | 3 +++ train/train_b.py | 10 +++++++++- train/train_c.py | 10 +++++++++- train/train_c_controlnet.py | 10 +++++++++- train/train_c_lora.py | 10 +++++++++- 5 files changed, 39 insertions(+), 4 deletions(-) diff --git a/train/base.py b/train/base.py index dd29468..474bb17 100755 --- a/train/base.py +++ b/train/base.py @@ -195,6 +195,9 @@ class Config(DataCore.Config, WarpCore.Config): use_fsdp: bool = None + # Optimizer Params + use_8bit_adam: bool = False + @dataclass() # not frozen, means that fields are mutable. Doesn't support EXPECTED class Info(WarpCore.Info): ema_loss: float = None diff --git a/train/train_b.py b/train/train_b.py index c60cfc2..9ac987a 100755 --- a/train/train_b.py +++ b/train/train_b.py @@ -209,7 +209,15 @@ def dummy_context(): ) def setup_optimizers(self, extras: Extras, models: Models) -> TrainingCore.Optimizers: - optimizer = optim.AdamW(models.generator.parameters(), lr=self.config.lr) # , eps=1e-7, betas=(0.9, 0.95)) + if self.config.use_8bit_adam: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError("To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." + ) + optimizer = bnb.optim.AdamW8bit(models.generator.parameters(), lr=self.config.lr) # , eps=1e-7, betas=(0.9, 0.95)) + else: + optimizer = optim.AdamW(models.generator.parameters(), lr=self.config.lr) # , eps=1e-7, betas=(0.9, 0.95)) optimizer = self.load_optimizer(optimizer, 'generator_optim', fsdp_model=models.generator if self.config.use_fsdp else None) return self.Optimizers(generator=optimizer) diff --git a/train/train_c.py b/train/train_c.py index 47e9104..9bf8156 100755 --- a/train/train_c.py +++ b/train/train_c.py @@ -192,7 +192,15 @@ def dummy_context(): ) def setup_optimizers(self, extras: Extras, models: Models) -> TrainingCore.Optimizers: - optimizer = optim.AdamW(models.generator.parameters(), lr=self.config.lr) # , eps=1e-7, betas=(0.9, 0.95)) + if self.config.use_8bit_adam: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError("To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." 
+ ) + optimizer = bnb.optim.AdamW8bit(models.generator.parameters(), lr=self.config.lr) # , eps=1e-7, betas=(0.9, 0.95)) + else: + optimizer = optim.AdamW(models.generator.parameters(), lr=self.config.lr) # , eps=1e-7, betas=(0.9, 0.95)) optimizer = self.load_optimizer(optimizer, 'generator_optim', fsdp_model=models.generator if self.config.use_fsdp else None) return self.Optimizers(generator=optimizer) diff --git a/train/train_c_controlnet.py b/train/train_c_controlnet.py index 1239e6e..2eccc66 100755 --- a/train/train_c_controlnet.py +++ b/train/train_c_controlnet.py @@ -233,7 +233,15 @@ def dummy_context(): ) def setup_optimizers(self, extras: Extras, models: Models) -> Optimizers: - optimizer = optim.AdamW(models.controlnet.parameters(), lr=self.config.lr) # , eps=1e-7, betas=(0.9, 0.95)) + if self.config.use_8bit_adam: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError("To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." + ) + optimizer = bnb.optim.AdamW8bit(models.generator.parameters(), lr=self.config.lr) # , eps=1e-7, betas=(0.9, 0.95)) + else: + optimizer = optim.AdamW(models.generator.parameters(), lr=self.config.lr) # , eps=1e-7, betas=(0.9, 0.95)) optimizer = self.load_optimizer(optimizer, 'controlnet_optim', fsdp_model=models.controlnet if self.config.use_fsdp else None) return self.Optimizers(generator=None, controlnet=optimizer) diff --git a/train/train_c_lora.py b/train/train_c_lora.py index 5d0764d..fee3eff 100755 --- a/train/train_c_lora.py +++ b/train/train_c_lora.py @@ -248,7 +248,15 @@ def dummy_context(): ) def setup_optimizers(self, extras: Extras, models: Models) -> Optimizers: - optimizer = optim.AdamW(models.generator.parameters(), lr=self.config.lr) # , eps=1e-7, betas=(0.9, 0.95)) + if self.config.use_8bit_adam: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError("To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." + ) + optimizer = bnb.optim.AdamW8bit(models.generator.parameters(), lr=self.config.lr) # , eps=1e-7, betas=(0.9, 0.95)) + else: + optimizer = optim.AdamW(models.generator.parameters(), lr=self.config.lr) # , eps=1e-7, betas=(0.9, 0.95)) optimizer = self.load_optimizer(optimizer, 'lora_optim', fsdp_model=models.lora if self.config.use_fsdp else None) return self.Optimizers(generator=None, lora=optimizer) From 900188c42824daa9fb9f678c3a52fa634d7d7e81 Mon Sep 17 00:00:00 2001 From: Andrew Suter-Morris Date: Fri, 16 Feb 2024 22:14:10 +0000 Subject: [PATCH 14/16] 8bit adam support, move config --- configs/{inference => training}/lora_c_1b_bfloat16.yaml | 0 train/base.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename configs/{inference => training}/lora_c_1b_bfloat16.yaml (100%) diff --git a/configs/inference/lora_c_1b_bfloat16.yaml b/configs/training/lora_c_1b_bfloat16.yaml similarity index 100% rename from configs/inference/lora_c_1b_bfloat16.yaml rename to configs/training/lora_c_1b_bfloat16.yaml diff --git a/train/base.py b/train/base.py index 474bb17..a732d24 100755 --- a/train/base.py +++ b/train/base.py @@ -196,7 +196,7 @@ class Config(DataCore.Config, WarpCore.Config): use_fsdp: bool = None # Optimizer Params - use_8bit_adam: bool = False + use_8bit_adam: bool = None @dataclass() # not frozen, means that fields are mutable. 
Doesn't support EXPECTED class Info(WarpCore.Info): From f0c474c682444dd72d0ddb6f7bc39cb7d35a8caf Mon Sep 17 00:00:00 2001 From: Andrew Suter-Morris Date: Fri, 16 Feb 2024 22:15:29 +0000 Subject: [PATCH 15/16] pathing --- configs/training/lora_c_1b_bfloat16.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/training/lora_c_1b_bfloat16.yaml b/configs/training/lora_c_1b_bfloat16.yaml index 8ccdb32..4bd56a5 100644 --- a/configs/training/lora_c_1b_bfloat16.yaml +++ b/configs/training/lora_c_1b_bfloat16.yaml @@ -1,7 +1,7 @@ # GLOBAL STUFF experiment_id: stage_c_1b_lora -checkpoint_path: ~/cascade/chk -output_path: ~/cascade/lora_sample +checkpoint_path: /tmp/cascade/chk +output_path: /tmp/cascade/lora_sample model_version: 1B # TRAINING PARAMS @@ -35,4 +35,4 @@ train_tokens: webdataset_path: file:/home/asutermo/cascade/data/dataset.tar effnet_checkpoint_path: models/effnet_encoder.safetensors previewer_checkpoint_path: models/previewer.safetensors -generator_checkpoint_path: models/stage_c_lite_bf16.safetensors \ No newline at end of file +generator_checkpoint_path: models/stage_c_lite_bf16.safetensors From cb4656da3cc210481f52bacafc9e462c6167d4a1 Mon Sep 17 00:00:00 2001 From: Andrew Suter-Morris Date: Mon, 19 Feb 2024 17:24:50 +0000 Subject: [PATCH 16/16] don't allow single_gpu and fsdp --- train/train_b.py | 2 +- train/train_c.py | 2 +- train/train_c_controlnet.py | 2 +- train/train_c_lora.py | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/train/train_b.py b/train/train_b.py index 9ac987a..8ab2da2 100755 --- a/train/train_b.py +++ b/train/train_b.py @@ -189,7 +189,7 @@ def dummy_context(): generator_ema = self.load_model(generator_ema, 'generator_ema') generator_ema.to(dtype).to(self.device).eval().requires_grad_(False) - if self.config.use_fsdp: + if not self.single_gpu and self.config.use_fsdp: fsdp_auto_wrap_policy = ModuleWrapPolicy([ResBlock, AttnBlock, TimestepBlock, FeedForwardBlock]) generator = FSDP(generator, **self.fsdp_defaults, auto_wrap_policy=fsdp_auto_wrap_policy, device_id=self.device) if generator_ema is not None: diff --git a/train/train_c.py b/train/train_c.py index 9bf8156..1f748bb 100755 --- a/train/train_c.py +++ b/train/train_c.py @@ -174,7 +174,7 @@ def dummy_context(): generator_ema = self.load_model(generator_ema, 'generator_ema') generator_ema.to(dtype).to(self.device).eval().requires_grad_(False) - if self.config.use_fsdp: + if not self.single_gpu and self.config.use_fsdp: fsdp_auto_wrap_policy = ModuleWrapPolicy([ResBlock, AttnBlock, TimestepBlock, FeedForwardBlock]) generator = FSDP(generator, **self.fsdp_defaults, auto_wrap_policy=fsdp_auto_wrap_policy, device_id=self.device) if generator_ema is not None: diff --git a/train/train_c_controlnet.py b/train/train_c_controlnet.py index 2eccc66..075fbfa 100755 --- a/train/train_c_controlnet.py +++ b/train/train_c_controlnet.py @@ -221,7 +221,7 @@ def dummy_context(): controlnet = self.load_model(controlnet, 'controlnet') controlnet.backbone.eval().requires_grad_(True) - if self.config.use_fsdp: + if not self.single_gpu and self.config.use_fsdp: fsdp_auto_wrap_policy = functools.partial(size_based_auto_wrap_policy, min_num_params=3000) controlnet = FSDP(controlnet, **self.fsdp_defaults, auto_wrap_policy=fsdp_auto_wrap_policy, device_id=self.device) diff --git a/train/train_c_lora.py b/train/train_c_lora.py index fee3eff..5aaffb6 100755 --- a/train/train_c_lora.py +++ b/train/train_c_lora.py @@ -181,7 +181,7 @@ def dummy_context(): generator = 
generator.to(dtype).to(self.device) generator = self.load_model(generator, 'generator') - # if self.config.use_fsdp: + # if not self.single_gpu and self.config.use_fsdp: # fsdp_auto_wrap_policy = functools.partial(size_based_auto_wrap_policy, min_num_params=3000) # generator = FSDP(generator, **self.fsdp_defaults, auto_wrap_policy=fsdp_auto_wrap_policy, device_id=self.device) @@ -235,7 +235,7 @@ def dummy_context(): lora = self.load_model(lora, 'lora') lora.to(self.device).train().requires_grad_(True) - if self.config.use_fsdp: + if not self.single_gpu and self.config.use_fsdp: # fsdp_auto_wrap_policy = functools.partial(size_based_auto_wrap_policy, min_num_params=3000) fsdp_auto_wrap_policy = ModuleWrapPolicy([LoRA, ReToken]) lora = FSDP(lora, **self.fsdp_defaults, auto_wrap_policy=fsdp_auto_wrap_policy, device_id=self.device)
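
Taken together, the patches above make the trainers launchable outside a multi-node SLURM job: the device falls back from SLURM_LOCALID to CUDA to CPU, a single_gpu mode skips the distributed barrier() and FSDP wrapping, and an opt-in use_8bit_adam flag swaps torch.optim.AdamW for bitsandbytes' AdamW8bit. The following is a minimal standalone sketch of that launch pattern, not the repository's own code: resolve_device, build_optimizer, and the toy Linear model are illustrative stand-ins, while the flag names (single_gpu, use_8bit_adam) and the fallback order come from the patches; the parameters passed to the optimizer would be those of whichever module is actually being trained (generator, LoRA, or ControlNet).

import os
import torch
from torch import optim


def resolve_device() -> torch.device:
    # Same fallback order as the patched trainers: SLURM local rank if present,
    # otherwise the default CUDA device, otherwise CPU.
    if "SLURM_LOCALID" in os.environ:
        return torch.device(int(os.environ["SLURM_LOCALID"]))
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")


def build_optimizer(parameters, lr: float, use_8bit_adam: bool = False):
    # Mirrors the use_8bit_adam branch added to setup_optimizers: try
    # bitsandbytes' AdamW8bit, otherwise fall back to plain AdamW.
    if use_8bit_adam:
        try:
            import bitsandbytes as bnb
        except ImportError as e:
            raise ImportError(
                "To use 8-bit Adam, please install the bitsandbytes library: "
                "`pip install bitsandbytes`."
            ) from e
        return bnb.optim.AdamW8bit(parameters, lr=lr)
    return optim.AdamW(parameters, lr=lr)


if __name__ == "__main__":
    device = resolve_device()
    single_gpu = torch.cuda.device_count() == 1

    # Toy stand-in for the trainable module; the real trainers hand over the
    # parameters of the module being trained.
    model = torch.nn.Linear(16, 16).to(device)
    optimizer = build_optimizer(model.parameters(), lr=1e-4, use_8bit_adam=False)

    # With more than one GPU the trainers keep the barrier()/FSDP path;
    # in single-GPU mode they skip it entirely.
    if not single_gpu and torch.distributed.is_initialized():
        torch.distributed.barrier()

    print(f"device={device}, single_gpu={single_gpu}, optimizer={type(optimizer).__name__}")

In the patched trainers the same switches are driven by use_8bit_adam in the training YAML and by calling warpcore(single_gpu=...) at launch, with single_gpu derived from torch.cuda.device_count().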