From 74e1915465957c7d33b164bfd96532a506b681aa Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Mon, 21 Apr 2025 09:06:35 +0000 Subject: [PATCH 1/4] Use logger in place of print statements in finetuning scripts Signed-off-by: Mamta Singh --- QEfficient/cloud/finetune.py | 15 +++++---- QEfficient/finetune/dataset/custom_dataset.py | 8 +++-- .../finetune/dataset/grammar_dataset.py | 10 +++--- QEfficient/finetune/eval.py | 10 +++--- QEfficient/finetune/utils/plot_metrics.py | 6 ++-- QEfficient/finetune/utils/train_utils.py | 33 ++++++++++--------- 6 files changed, 46 insertions(+), 36 deletions(-) diff --git a/QEfficient/cloud/finetune.py b/QEfficient/cloud/finetune.py index f312d00cb..474f7864c 100644 --- a/QEfficient/cloud/finetune.py +++ b/QEfficient/cloud/finetune.py @@ -31,11 +31,12 @@ ) from QEfficient.finetune.utils.train_utils import get_longest_seq_length, print_model_size, train from QEfficient.utils._utils import login_and_download_hf_lm +from QEfficient.utils.logging_utils import logger try: import torch_qaic # noqa: F401 except ImportError as e: - print(f"Warning: {e}. Moving ahead without these qaic modules.") + logger.warning(f"{e}. Moving ahead without these qaic modules.") from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer @@ -114,7 +115,7 @@ def main(**kwargs): # If there is a mismatch between tokenizer vocab size and embedding matrix, # throw a warning and then expand the embedding matrix if len(tokenizer) > model.get_input_embeddings().weight.shape[0]: - print("WARNING: Resizing the embedding matrix to match the tokenizer vocab size.") + logger.warning("Resizing the embedding matrix to match the tokenizer vocab size.") model.resize_token_embeddings(len(tokenizer)) print_model_size(model, train_config) @@ -163,10 +164,10 @@ def main(**kwargs): # ) ## train_dl_kwargs = get_dataloader_kwargs(train_config, dataset_train, dataset_processer, "train") - print("length of dataset_train", len(dataset_train)) + logger.info("length of dataset_train", len(dataset_train)) custom_data_collator = get_custom_data_collator(dataset_processer, dataset_config) if custom_data_collator: - print("custom_data_collator is used") + logger.info("custom_data_collator is used") train_dl_kwargs["collate_fn"] = custom_data_collator # Create DataLoaders for the training and validation dataset @@ -176,7 +177,7 @@ def main(**kwargs): pin_memory=True, **train_dl_kwargs, ) - print(f"--> Num of Training Set Batches loaded = {len(train_dataloader)}") + logger.info(f"--> Num of Training Set Batches loaded = {len(train_dataloader)}") eval_dataloader = None if train_config.run_validation: @@ -200,7 +201,7 @@ def main(**kwargs): f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. 
({len(eval_dataloader)=})" ) else: - print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}") + logger.info(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}") longest_seq_length, _ = get_longest_seq_length( torch.utils.data.ConcatDataset([train_dataloader.dataset, eval_dataloader.dataset]) @@ -208,7 +209,7 @@ def main(**kwargs): else: longest_seq_length, _ = get_longest_seq_length(train_dataloader.dataset) - print( + logger.info( f"The longest sequence length in the train data is {longest_seq_length}, " f"passed context length is {train_config.context_length} and overall model's context length is " f"{model.config.max_position_embeddings}" diff --git a/QEfficient/finetune/dataset/custom_dataset.py b/QEfficient/finetune/dataset/custom_dataset.py index 4bee06c58..7164e13a1 100644 --- a/QEfficient/finetune/dataset/custom_dataset.py +++ b/QEfficient/finetune/dataset/custom_dataset.py @@ -8,6 +8,8 @@ import importlib from pathlib import Path +from QEfficient.utils.logging_utils import logger + def load_module_from_py_file(py_file: str) -> object: """ @@ -40,7 +42,7 @@ def get_custom_dataset(dataset_config, tokenizer, split: str): try: return getattr(module, func_name)(dataset_config, tokenizer, split) except AttributeError as e: - print( + logger.error( f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()})." ) raise e @@ -63,6 +65,6 @@ def get_data_collator(dataset_processer, dataset_config): try: return getattr(module, func_name)(dataset_processer) except AttributeError: - print(f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()}).") - print("Using the default data_collator instead.") + logger.info(f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()}).") + logger.info("Using the default data_collator instead.") return None diff --git a/QEfficient/finetune/dataset/grammar_dataset.py b/QEfficient/finetune/dataset/grammar_dataset.py index 8f04b7544..6ebeeb2d1 100644 --- a/QEfficient/finetune/dataset/grammar_dataset.py +++ b/QEfficient/finetune/dataset/grammar_dataset.py @@ -10,6 +10,8 @@ from datasets import load_dataset from torch.utils.data import Dataset +from QEfficient.utils.logging_utils import logger + class grammar(Dataset): def __init__(self, tokenizer, csv_name=None, context_length=None): @@ -20,7 +22,7 @@ def __init__(self, tokenizer, csv_name=None, context_length=None): delimiter=",", ) except Exception as e: - print( + logger.error( "Loading of grammar dataset failed! Please see [here](https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/datasets/grammar_dataset/grammar_dataset_process.ipynb) for details on how to download the dataset." 
) raise e @@ -36,7 +38,7 @@ def convert_to_features(self, example_batch): # Create prompt and tokenize contexts and questions if self.print_text: - print("Input Text: ", self.clean_text(example_batch["text"])) + logger.info("Input Text: ", self.clean_text(example_batch["text"])) input_ = example_batch["input"] target_ = example_batch["target"] @@ -71,9 +73,9 @@ def get_dataset(dataset_config, tokenizer, csv_name=None, context_length=None): """cover function for handling loading the working dataset""" """dataset loading""" currPath = Path.cwd() / "datasets_grammar" / "grammar_train.csv" - print(f"Loading dataset {currPath}") + logger.info(f"Loading dataset {currPath}") csv_name = str(currPath) - print(csv_name) + logger.info(csv_name) dataset = grammar(tokenizer=tokenizer, csv_name=csv_name, context_length=context_length) return dataset diff --git a/QEfficient/finetune/eval.py b/QEfficient/finetune/eval.py index 918230554..fe9d9ae6a 100644 --- a/QEfficient/finetune/eval.py +++ b/QEfficient/finetune/eval.py @@ -25,12 +25,14 @@ ) from utils.train_utils import evaluation, print_model_size +from QEfficient.utils.logging_utils import logger + try: import torch_qaic # noqa: F401 device = "qaic:0" except ImportError as e: - print(f"Warning: {e}. Moving ahead without these qaic modules.") + logger.warning(f"{e}. Moving ahead without these qaic modules.") device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # Suppress all warnings @@ -76,7 +78,7 @@ def main(**kwargs): # If there is a mismatch between tokenizer vocab size and embedding matrix, # throw a warning and then expand the embedding matrix if len(tokenizer) > model.get_input_embeddings().weight.shape[0]: - print("WARNING: Resizing the embedding matrix to match the tokenizer vocab size.") + logger.warning("Resizing the embedding matrix to match the tokenizer vocab size.") model.resize_token_embeddings(len(tokenizer)) print_model_size(model, train_config) @@ -107,13 +109,13 @@ def main(**kwargs): pin_memory=True, **val_dl_kwargs, ) - print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}") + logger.info(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}") if len(eval_dataloader) == 0: raise ValueError( f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. 
({len(eval_dataloader)=})" ) else: - print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}") + logger.info(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}") model.to(device) _ = evaluation(model, train_config, eval_dataloader, None, tokenizer, device) diff --git a/QEfficient/finetune/utils/plot_metrics.py b/QEfficient/finetune/utils/plot_metrics.py index 5fc54f279..e2dd37f49 100644 --- a/QEfficient/finetune/utils/plot_metrics.py +++ b/QEfficient/finetune/utils/plot_metrics.py @@ -11,6 +11,8 @@ import matplotlib.pyplot as plt +from QEfficient.utils.logging_utils import logger + def plot_metric(data, metric_name, x_label, y_label, title, colors): plt.figure(figsize=(7, 6)) @@ -67,14 +69,14 @@ def plot_metrics_by_step(data, metric_name, x_label, y_label, colors): def plot_metrics(file_path): if not os.path.exists(file_path): - print(f"File {file_path} does not exist.") + logger.error(f"File {file_path} does not exist.") return with open(file_path, "r") as f: try: data = json.load(f) except json.JSONDecodeError: - print("Invalid JSON file.") + logger.error("Invalid JSON file.") return directory = os.path.dirname(file_path) diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py index 2bc701008..68b884cab 100644 --- a/QEfficient/finetune/utils/train_utils.py +++ b/QEfficient/finetune/utils/train_utils.py @@ -19,6 +19,7 @@ from tqdm import tqdm from QEfficient.finetune.configs.training import train_config as TRAIN_CONFIG +from QEfficient.utils.logging_utils import logger try: import torch_qaic # noqa: F401 @@ -27,7 +28,7 @@ import torch_qaic.utils as qaic_utils # noqa: F401 from torch.qaic.amp import GradScaler as QAicGradScaler except ImportError as e: - print(f"Warning: {e}. Moving ahead without these qaic modules.") + logger.warning(f"{e}. Moving ahead without these qaic modules.") from torch.amp import GradScaler @@ -116,12 +117,12 @@ def train( for epoch in range(train_config.num_epochs): if loss_0_counter.item() == train_config.convergence_counter: if train_config.enable_ddp: - print( + logger.info( f"Not proceeding with epoch {epoch + 1} on device {local_rank} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps." ) break else: - print( + logger.info( f"Not proceeding with epoch {epoch + 1} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps." 
) break @@ -129,13 +130,13 @@ def train( if train_config.use_peft and train_config.from_peft_checkpoint: intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-2].split("_")[-1]) - 1 if epoch < intermediate_epoch: - print(f"Skipping epoch {epoch + 1} since fine tuning has already completed for it.") + logger.info(f"Skipping epoch {epoch + 1} since fine tuning has already completed for it.") # to bring the count of train_step in sync with where it left off total_train_steps += len(train_dataloader) continue - print(f"Starting epoch {epoch + 1}/{train_config.num_epochs}") - print(f"train_config.max_train_step: {train_config.max_train_step}") + logger.info(f"Starting epoch {epoch + 1}/{train_config.num_epochs}") + logger.info(f"train_config.max_train_step: {train_config.max_train_step}") # stop when the maximum number of training steps is reached if max_steps_reached: break @@ -162,7 +163,7 @@ def train( # to bring the count of train_step in sync with where it left off if epoch == intermediate_epoch and step == 0: total_train_steps += intermediate_step - print( + logger.info( f"skipping first {intermediate_step} steps for epoch {epoch + 1}, since fine tuning has already completed for them." ) if epoch == intermediate_epoch and step < intermediate_step: @@ -197,7 +198,7 @@ def train( labels = batch["labels"][:, 0] preds = torch.nn.functional.softmax(logits, dim=-1) acc_helper.forward(preds, labels) - print("Mismatches detected:", verifier.get_perop_mismatch_count()) + logger.info("Mismatches detected:", verifier.get_perop_mismatch_count()) else: model_outputs = model(**batch) loss = model_outputs.loss # Forward call @@ -279,13 +280,13 @@ def train( ) if train_config.enable_ddp: if loss_0_counter.item() == train_config.convergence_counter: - print( + logger.info( f"Loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps. Hence, stopping the fine tuning on device {local_rank}." ) break else: if loss_0_counter.item() == train_config.convergence_counter: - print( + logger.info( f"Loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps. Hence, stopping the fine tuning." ) break @@ -347,15 +348,15 @@ def train( if train_config.run_validation: if eval_epoch_loss < best_val_loss: best_val_loss = eval_epoch_loss - print(f"best eval loss on epoch {epoch + 1} is {best_val_loss}") + logger.info(f"best eval loss on epoch {epoch + 1} is {best_val_loss}") val_loss.append(float(eval_epoch_loss)) val_metric.append(float(eval_metric)) if train_config.task_type == "seq_classification": - print( + logger.info( f"Epoch {epoch + 1}: train_acc={metric_val:.4f}, train_epoch_loss={train_epoch_loss:.4f}, epoch time {epoch_end_time}s" ) else: - print( + logger.info( f"Epoch {epoch + 1}: train_metric={metric_val:.4f}, train_epoch_loss={train_epoch_loss:.4f}, epoch time {epoch_end_time}s" ) @@ -459,7 +460,7 @@ def evaluation_helper(model, train_config, eval_dataloader, device): eval_metric = torch.exp(eval_epoch_loss) # Print evaluation metrics - print(f" {eval_metric.detach().cpu()=} {eval_epoch_loss.detach().cpu()=}") + logger.info(f" {eval_metric.detach().cpu()=} {eval_epoch_loss.detach().cpu()=}") return eval_metric, eval_epoch_loss, val_step_loss, val_step_metric @@ -489,9 +490,9 @@ def print_model_size(model, config) -> None: model_name (str): Name of the model. 
""" - print(f"--> Model {config.model_name}") + logger.info(f"--> Model {config.model_name}") total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) - print(f"\n--> {config.model_name} has {total_params / 1e6} Million params\n") + logger.info(f"\n--> {config.model_name} has {total_params / 1e6} Million params\n") def save_to_json( From 669cf983094e56c4afc73339063e777f0ec0db16 Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Fri, 16 May 2025 14:51:18 +0530 Subject: [PATCH 2/4] Address comments Signed-off-by: Mamta Singh --- QEfficient/cloud/finetune.py | 13 ++-- QEfficient/finetune/dataset/custom_dataset.py | 2 +- .../finetune/dataset/grammar_dataset.py | 2 +- QEfficient/finetune/eval.py | 4 +- QEfficient/finetune/utils/config_utils.py | 2 +- QEfficient/finetune/utils/train_utils.py | 39 ++++-------- QEfficient/utils/logging_utils.py | 61 +++++++++++++++++++ 7 files changed, 84 insertions(+), 39 deletions(-) diff --git a/QEfficient/cloud/finetune.py b/QEfficient/cloud/finetune.py index f9e427f2e..8c36bdd3d 100644 --- a/QEfficient/cloud/finetune.py +++ b/QEfficient/cloud/finetune.py @@ -5,6 +5,7 @@ # # ----------------------------------------------------------------------------- +import logging import random import warnings from typing import Any, Dict, Optional, Union @@ -18,7 +19,7 @@ import torch.utils.data from peft import PeftModel, get_peft_model from torch.optim.lr_scheduler import StepLR -from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModel, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer from QEfficient.finetune.configs.training import TrainConfig from QEfficient.finetune.utils.config_utils import ( @@ -33,7 +34,7 @@ ) from QEfficient.finetune.utils.train_utils import get_longest_seq_length, print_model_size, train from QEfficient.utils._utils import login_and_download_hf_lm -from QEfficient.utils.logging_utils import logger +from QEfficient.utils.logging_utils import ft_logger as logger # Try importing QAIC-specific module, proceed without it if unavailable try: @@ -41,8 +42,8 @@ except ImportError as e: logger.warning(f"{e}. Moving ahead without these qaic modules.") +logger.setLevel(logging.INFO) -from transformers import AutoModelForSequenceClassification # Suppress all warnings warnings.filterwarnings("ignore") @@ -245,7 +246,7 @@ def setup_dataloaders( # ) ## train_dl_kwargs = get_dataloader_kwargs(train_config, dataset_train, dataset_processer, "train") - logger.info("length of dataset_train", len(dataset_train)) + logger.info(f"length of dataset_train = {len(dataset_train)}") # FIXME (Meet): Add custom data collator registration from the outside by the user. custom_data_collator = get_custom_data_collator(dataset_processer, dataset_config) @@ -260,7 +261,7 @@ def setup_dataloaders( pin_memory=True, **train_dl_kwargs, ) - logger.info(f"--> Num of Training Set Batches loaded = {len(train_dataloader)}") + logger.info(f"Num of Training Set Batches loaded = {len(train_dataloader)}") eval_dataloader = None if train_config.run_validation: @@ -284,7 +285,7 @@ def setup_dataloaders( f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. 
({len(eval_dataloader)=})" ) else: - logger.info(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}") + logger.info(f"Num of Validation Set Batches loaded = {len(eval_dataloader)}") longest_seq_length, _ = get_longest_seq_length( torch.utils.data.ConcatDataset([train_dataloader.dataset, eval_dataloader.dataset]) diff --git a/QEfficient/finetune/dataset/custom_dataset.py b/QEfficient/finetune/dataset/custom_dataset.py index 7164e13a1..c32100f2b 100644 --- a/QEfficient/finetune/dataset/custom_dataset.py +++ b/QEfficient/finetune/dataset/custom_dataset.py @@ -8,7 +8,7 @@ import importlib from pathlib import Path -from QEfficient.utils.logging_utils import logger +from QEfficient.utils.logging_utils import ft_logger as logger def load_module_from_py_file(py_file: str) -> object: diff --git a/QEfficient/finetune/dataset/grammar_dataset.py b/QEfficient/finetune/dataset/grammar_dataset.py index 6ebeeb2d1..adc8c6550 100644 --- a/QEfficient/finetune/dataset/grammar_dataset.py +++ b/QEfficient/finetune/dataset/grammar_dataset.py @@ -23,7 +23,7 @@ def __init__(self, tokenizer, csv_name=None, context_length=None): ) except Exception as e: logger.error( - "Loading of grammar dataset failed! Please see [here](https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/datasets/grammar_dataset/grammar_dataset_process.ipynb) for details on how to download the dataset." + "Loading of grammar dataset failed! Please check (https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/datasets/grammar_dataset/grammar_dataset_process.ipynb) for details on how to download the dataset." ) raise e diff --git a/QEfficient/finetune/eval.py b/QEfficient/finetune/eval.py index 076628570..87f9b5417 100644 --- a/QEfficient/finetune/eval.py +++ b/QEfficient/finetune/eval.py @@ -109,13 +109,13 @@ def main(**kwargs): pin_memory=True, **val_dl_kwargs, ) - logger.info(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}") + logger.info(f"Num of Validation Set Batches loaded = {len(eval_dataloader)}") if len(eval_dataloader) == 0: raise ValueError( f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. 
({len(eval_dataloader)=})" ) else: - logger.info(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}") + logger.info(f"Num of Validation Set Batches loaded = {len(eval_dataloader)}") model.to(device) _ = evaluation(model, train_config, eval_dataloader, None, tokenizer, device) diff --git a/QEfficient/finetune/utils/config_utils.py b/QEfficient/finetune/utils/config_utils.py index c8b04adf4..bc45a8731 100644 --- a/QEfficient/finetune/utils/config_utils.py +++ b/QEfficient/finetune/utils/config_utils.py @@ -54,7 +54,7 @@ def update_config(config, **kwargs): raise ValueError(f"Config '{config_name}' does not have parameter: '{param_name}'") else: config_type = type(config).__name__ - logger.warning(f"Unknown parameter '{k}' for config type '{config_type}'") + logger.debug(f"Unknown parameter '{k}' for config type '{config_type}'") def generate_peft_config(train_config: TrainConfig, peft_config_file: str = None, **kwargs) -> Any: diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py index 4a7e59b1d..a43f6ab27 100644 --- a/QEfficient/finetune/utils/train_utils.py +++ b/QEfficient/finetune/utils/train_utils.py @@ -19,7 +19,7 @@ from tqdm import tqdm from QEfficient.finetune.configs.training import TrainConfig -from QEfficient.utils.logging_utils import logger +from QEfficient.utils.logging_utils import ft_logger as logger try: import torch_qaic # noqa: F401 @@ -85,10 +85,7 @@ def train( device_type = device.split(":")[0] tensorboard_updates = None - if train_config.enable_ddp: - if local_rank == 0: - tensorboard_updates = SummaryWriter() - else: + if (not train_config.enable_ddp) or (train_config.enable_ddp and local_rank == 0): tensorboard_updates = SummaryWriter() if train_config.grad_scaler: @@ -113,14 +110,9 @@ def train( # Start the training loop for epoch in range(train_config.num_epochs): if loss_0_counter.item() == train_config.convergence_counter: - if train_config.enable_ddp: - logger.info( - f"Not proceeding with epoch {epoch + 1} on device {local_rank} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps." - ) - break - else: + if (not train_config.enable_ddp) or (train_config.enable_ddp and local_rank == 0): logger.info( - f"Not proceeding with epoch {epoch + 1} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps." + f"Skipping epoch {epoch + 1} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps." ) break @@ -161,7 +153,7 @@ def train( if epoch == intermediate_epoch and step == 0: total_train_steps += intermediate_step logger.info( - f"skipping first {intermediate_step} steps for epoch {epoch + 1}, since fine tuning has already completed for them." + f"Skipping first {intermediate_step} steps for epoch {epoch + 1}, since fine tuning has already completed for it." 
) if epoch == intermediate_epoch and step < intermediate_step: total_train_steps += 1 @@ -221,10 +213,7 @@ def train( else: loss_0_counter = torch.tensor([0]).to(device) - if train_config.enable_ddp: - if local_rank == 0: - tensorboard_updates.add_scalars("loss", {"train": loss}, total_train_steps) - else: + if (not train_config.enable_ddp) or (train_config.enable_ddp and local_rank == 0): tensorboard_updates.add_scalars("loss", {"train": loss}, total_train_steps) if train_config.save_metrics: @@ -275,16 +264,10 @@ def train( val_step_metric, val_metric, ) - if train_config.enable_ddp: - if loss_0_counter.item() == train_config.convergence_counter: - logger.info( - f"Loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps. Hence, stopping the fine tuning on device {local_rank}." - ) - break - else: + if (not train_config.enable_ddp) or (train_config.enable_ddp and local_rank == 0): if loss_0_counter.item() == train_config.convergence_counter: logger.info( - f"Loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps. Hence, stopping the fine tuning." + f"Loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps.Hence,stopping the fine tuning." ) break @@ -457,7 +440,7 @@ def evaluation_helper(model, train_config, eval_dataloader, device): eval_metric = torch.exp(eval_epoch_loss) # Print evaluation metrics - logger.info(f" {eval_metric.detach().cpu()=} {eval_epoch_loss.detach().cpu()=}") + logger.info(f"{eval_metric.detach().cpu()=} {eval_epoch_loss.detach().cpu()=}") return eval_epoch_loss, eval_metric, val_step_loss, val_step_metric @@ -487,9 +470,9 @@ def print_model_size(model, config) -> None: model_name (str): Name of the model. """ - logger.info(f"--> Model {config.model_name}") + logger.info(f"Model : {config.model_name}") total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) - logger.info(f"\n--> {config.model_name} has {total_params / 1e6} Million params\n") + logger.info(f"{config.model_name} has {total_params / 1e6} Million params\n") def save_to_json( diff --git a/QEfficient/utils/logging_utils.py b/QEfficient/utils/logging_utils.py index c17fde29c..f73344c4e 100644 --- a/QEfficient/utils/logging_utils.py +++ b/QEfficient/utils/logging_utils.py @@ -56,3 +56,64 @@ def create_logger() -> logging.Logger: # Define the logger object that can be used for logging purposes throughout the module. logger = create_logger() + + +def create_ft_logger(log_file="finetune.log") -> logging.Logger: + """ + Creates a logger object with Colored QEffFormatter. + """ + logger = logging.getLogger("QEfficient") + + # create console handler and set level to debug + ch = logging.StreamHandler() + ch.setLevel(logging.INFO) + ch.setFormatter(QEffFormatter()) + logger.addHandler(ch) + + # create file handler and set level to debug + fh = logging.FileHandler(log_file) + fh.setLevel(logging.INFO) + fh.setFormatter(QEffFormatter()) + logger.addHandler(fh) + + return logger + + +# Define the logger object that can be used for logging purposes throughout the finetuning module. 
+ft_logger = create_ft_logger() +""" + +class FT_Logger: + def __init__(self, level=logging.INFO, log_file="finetune.log"): + self.logger = logging.getLogger("QEfficient") + self.logger.setLevel(level) + self.level = level + + # Create handlers + self.file_handler = logging.FileHandler(log_file) + self.console_handler = logging.StreamHandler() + + self.file_handler.setFormatter(QEffFormatter()) + self.console_handler.setFormatter(QEffFormatter()) + + # Add handlers to the logger + self.logger.addHandler(self.file_handler) + self.logger.addHandler(self.console_handler) + + def get_logger(self): + return self.logger + + def raise_valueerror(self, message): + self.logger.error(message) + raise ValueError(message) + + def raise_runtimeerror(self, message): + self.logger.error(message) + raise RuntimeError(message) + + def raise_filenotfounderror(self, message): + self.logger.error(message) + raise FileNotFoundError(message) + +ft_logger = FT_Logger().get_logger() +""" From ce14058afcbd7dd83017b1094d521b09540a393c Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Thu, 22 May 2025 10:51:30 +0000 Subject: [PATCH 3/4] Update logging_utils and log for zero rank Signed-off-by: Mamta Singh --- QEfficient/cloud/finetune.py | 43 ++++--- QEfficient/finetune/configs/training.py | 2 + QEfficient/finetune/dataset/custom_dataset.py | 15 ++- QEfficient/finetune/eval.py | 4 +- QEfficient/finetune/utils/train_utils.py | 59 ++++++---- QEfficient/utils/_utils.py | 2 +- QEfficient/utils/logging_utils.py | 110 ++++++++---------- 7 files changed, 121 insertions(+), 114 deletions(-) diff --git a/QEfficient/cloud/finetune.py b/QEfficient/cloud/finetune.py index 7b9487c81..8dd572564 100644 --- a/QEfficient/cloud/finetune.py +++ b/QEfficient/cloud/finetune.py @@ -32,17 +32,22 @@ get_custom_data_collator, get_preprocessed_dataset, ) -from QEfficient.finetune.utils.train_utils import get_longest_seq_length, print_model_size, train +from QEfficient.finetune.utils.train_utils import ( + get_longest_seq_length, + print_model_size, + print_trainable_parameters, + train, +) from QEfficient.utils._utils import login_and_download_hf_lm -from QEfficient.utils.logging_utils import ft_logger as logger +from QEfficient.utils.logging_utils import logger + +logger.setLevel(logging.INFO) # Try importing QAIC-specific module, proceed without it if unavailable try: import torch_qaic # noqa: F401 except ImportError as e: - logger.warning(f"{e}. Moving ahead without these qaic modules.") - -logger.setLevel(logging.INFO) + logger.log_rank_zero(f"{e}. 
Moving ahead without these qaic modules.") # Suppress all warnings @@ -121,7 +126,7 @@ def load_model_and_tokenizer( ) if not hasattr(model, "base_model_prefix"): - raise RuntimeError("Given huggingface model does not have 'base_model_prefix' attribute.") + logger.raise_runtimeerror("Given huggingface model does not have 'base_model_prefix' attribute.") for param in getattr(model, model.base_model_prefix).parameters(): param.requires_grad = False @@ -146,7 +151,7 @@ def load_model_and_tokenizer( # If there is a mismatch between tokenizer vocab size and embedding matrix, # throw a warning and then expand the embedding matrix if len(tokenizer) > model.get_input_embeddings().weight.shape[0]: - logger.warning("Resizing the embedding matrix to match the tokenizer vocab size.") + logger.log_rank_zero("Resizing the embedding matrix to match the tokenizer vocab size.", logger.WARNING) model.resize_token_embeddings(len(tokenizer)) # FIXME (Meet): Cover below line inside the logger once it is implemented. @@ -162,7 +167,9 @@ def load_model_and_tokenizer( if hasattr(model, "supports_gradient_checkpointing") and model.supports_gradient_checkpointing: model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"preserve_rng_state": False}) else: - raise RuntimeError("Given model doesn't support gradient checkpointing. Please disable it and run it.") + logger.raise_runtimeerror( + "Given model doesn't support gradient checkpointing. Please disable it and run it." + ) model = apply_peft(model, train_config, peft_config_file, **kwargs) @@ -197,7 +204,7 @@ def apply_peft( else: peft_config = generate_peft_config(train_config, peft_config_file, **kwargs) model = get_peft_model(model, peft_config) - model.print_trainable_parameters() + print_trainable_parameters(model) return model @@ -222,7 +229,7 @@ def setup_dataloaders( - Length of longest sequence in the dataset. Raises: - ValueError: If validation is enabled but the validation set is too small. + RuntimeError: If validation is enabled but the validation set is too small. Notes: - Applies a custom data collator if provided by get_custom_data_collator. @@ -246,12 +253,12 @@ def setup_dataloaders( # ) ## train_dl_kwargs = get_dataloader_kwargs(train_config, dataset_train, dataset_processer, "train") - logger.info(f"length of dataset_train = {len(dataset_train)}") + logger.log_rank_zero(f"Length of dataset_train = {len(dataset_train)}") # FIXME (Meet): Add custom data collator registration from the outside by the user. custom_data_collator = get_custom_data_collator(dataset_processer, dataset_config) if custom_data_collator: - logger.info("custom_data_collator is used") + logger.log_rank_zero("Custom_data_collator is used") train_dl_kwargs["collate_fn"] = custom_data_collator # Create DataLoaders for the training and validation dataset @@ -261,7 +268,7 @@ def setup_dataloaders( pin_memory=True, **train_dl_kwargs, ) - logger.info(f"Num of Training Set Batches loaded = {len(train_dataloader)}") + logger.log_rank_zero(f"Number of Training Set Batches loaded = {len(train_dataloader)}") eval_dataloader = None if train_config.run_validation: @@ -281,11 +288,11 @@ def setup_dataloaders( **val_dl_kwargs, ) if len(eval_dataloader) == 0: - raise ValueError( + logger.raise_runtimeerror( f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. 
({len(eval_dataloader)=})" ) else: - logger.info(f"Num of Validation Set Batches loaded = {len(eval_dataloader)}") + logger.log_rank_zero(f"Number of Validation Set Batches loaded = {len(eval_dataloader)}") longest_seq_length, _ = get_longest_seq_length( torch.utils.data.ConcatDataset([train_dataloader.dataset, eval_dataloader.dataset]) @@ -329,7 +336,7 @@ def main(peft_config_file: str = None, **kwargs) -> None: # Create DataLoaders for the training and validation dataset train_dataloader, eval_dataloader, longest_seq_length = setup_dataloaders(train_config, dataset_config, tokenizer) - logger.info( + logger.log_rank_zero( f"The longest sequence length in the train data is {longest_seq_length}, " f"passed context length is {train_config.context_length} and overall model's context length is " f"{model.config.max_position_embeddings}" @@ -340,7 +347,7 @@ def main(peft_config_file: str = None, **kwargs) -> None: scheduler = StepLR(optimizer, step_size=1, gamma=train_config.gamma) if train_config.enable_ddp: model = nn.parallel.DistributedDataParallel(model, device_ids=[dist.get_rank()]) - results = train( + _ = train( model, tokenizer, train_dataloader, @@ -352,7 +359,7 @@ def main(peft_config_file: str = None, **kwargs) -> None: ) if train_config.enable_ddp: dist.destroy_process_group() - return results + return if __name__ == "__main__": diff --git a/QEfficient/finetune/configs/training.py b/QEfficient/finetune/configs/training.py index 258565c07..1698e680b 100644 --- a/QEfficient/finetune/configs/training.py +++ b/QEfficient/finetune/configs/training.py @@ -105,3 +105,5 @@ class TrainConfig: grad_scaler: bool = True dump_root_dir: str = "meta-llama-samsum-mismatches/step_" opByOpVerifier: bool = False + + dump_logs: bool = True diff --git a/QEfficient/finetune/dataset/custom_dataset.py b/QEfficient/finetune/dataset/custom_dataset.py index d50504d38..b392e4724 100644 --- a/QEfficient/finetune/dataset/custom_dataset.py +++ b/QEfficient/finetune/dataset/custom_dataset.py @@ -8,7 +8,7 @@ import importlib from pathlib import Path -from QEfficient.utils.logging_utils import ft_logger as logger +from QEfficient.utils.logging_utils import logger def load_module_from_py_file(py_file: str) -> object: @@ -32,20 +32,19 @@ def get_custom_dataset(dataset_config, tokenizer, split: str): module_path, func_name = dataset_config.file, "get_custom_dataset" if not module_path.endswith(".py"): - raise ValueError(f"Dataset file {module_path} is not a .py file.") + logger.raise_runtimeerror(f"Dataset file {module_path} is not a .py file.") module_path = Path(module_path) if not module_path.is_file(): - raise FileNotFoundError(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.") + logger.raise_runtimeerror(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.") module = load_module_from_py_file(module_path.as_posix()) try: return getattr(module, func_name)(dataset_config, tokenizer, split) - except AttributeError as e: - logger.error( + except AttributeError: + logger.raise_runtimeerror( f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()})." 
) - raise e def get_data_collator(dataset_processer, dataset_config): @@ -55,11 +54,11 @@ def get_data_collator(dataset_processer, dataset_config): module_path, func_name = dataset_config.file, "get_data_collator" if not module_path.endswith(".py"): - raise ValueError(f"Dataset file {module_path} is not a .py file.") + logger.raise_runtimeerror(f"Dataset file {module_path} is not a .py file.") module_path = Path(module_path) if not module_path.is_file(): - raise FileNotFoundError(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.") + logger.raise_runtimeerror(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.") module = load_module_from_py_file(module_path.as_posix()) try: diff --git a/QEfficient/finetune/eval.py b/QEfficient/finetune/eval.py index e1f9b77e6..1095fe9e9 100644 --- a/QEfficient/finetune/eval.py +++ b/QEfficient/finetune/eval.py @@ -109,13 +109,13 @@ def main(**kwargs): pin_memory=True, **val_dl_kwargs, ) - logger.info(f"Num of Validation Set Batches loaded = {len(eval_dataloader)}") + logger.log_rank_zero(f"Num of Validation Set Batches loaded = {len(eval_dataloader)}") if len(eval_dataloader) == 0: raise ValueError( f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})" ) else: - logger.info(f"Num of Validation Set Batches loaded = {len(eval_dataloader)}") + logger.log_rank_zero(f"Num of Validation Set Batches loaded = {len(eval_dataloader)}") model.to(device) _ = evaluation(model, train_config, eval_dataloader, None, tokenizer, device) diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py index 9089fffe1..2c38ecd2c 100644 --- a/QEfficient/finetune/utils/train_utils.py +++ b/QEfficient/finetune/utils/train_utils.py @@ -19,7 +19,7 @@ from tqdm import tqdm from QEfficient.finetune.configs.training import TrainConfig -from QEfficient.utils.logging_utils import ft_logger as logger +from QEfficient.utils.logging_utils import logger try: import torch_qaic # noqa: F401 @@ -28,7 +28,7 @@ import torch_qaic.utils as qaic_utils # noqa: F401 from torch.qaic.amp import GradScaler as QAicGradScaler except ImportError as e: - logger.warning(f"{e}. Moving ahead without these qaic modules.") + logger.log_rank_zero(f"{e}. Moving ahead without these qaic modules.") from torch.amp import GradScaler @@ -110,22 +110,21 @@ def train( # Start the training loop for epoch in range(train_config.num_epochs): if loss_0_counter.item() == train_config.convergence_counter: - if (not train_config.enable_ddp) or (train_config.enable_ddp and local_rank == 0): - logger.info( - f"Skipping epoch {epoch + 1} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps." - ) - break + logger.log_rank_zero( + f"Skipping epoch {epoch + 1} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps." 
+ ) + break if train_config.use_peft and train_config.from_peft_checkpoint: intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-2].split("_")[-1]) - 1 if epoch < intermediate_epoch: - logger.info(f"Skipping epoch {epoch + 1} since fine tuning has already completed for it.") + logger.log_rank_zero(f"Skipping epoch {epoch + 1} since fine tuning has already completed for it.") # to bring the count of train_step in sync with where it left off total_train_steps += len(train_dataloader) continue - logger.info(f"Starting epoch {epoch + 1}/{train_config.num_epochs}") - logger.info(f"train_config.max_train_step: {train_config.max_train_step}") + logger.log_rank_zero(f"Starting epoch {epoch + 1}/{train_config.num_epochs}") + logger.log_rank_zero(f"train_config.max_train_step: {train_config.max_train_step}") # stop when the maximum number of training steps is reached if max_steps_reached: break @@ -152,7 +151,7 @@ def train( # to bring the count of train_step in sync with where it left off if epoch == intermediate_epoch and step == 0: total_train_steps += intermediate_step - logger.info( + logger.log_rank_zero( f"Skipping first {intermediate_step} steps for epoch {epoch + 1}, since fine tuning has already completed for it." ) if epoch == intermediate_epoch and step < intermediate_step: @@ -264,12 +263,11 @@ def train( val_step_metric, val_metric, ) - if (not train_config.enable_ddp) or (train_config.enable_ddp and local_rank == 0): - if loss_0_counter.item() == train_config.convergence_counter: - logger.info( - f"Loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps.Hence,stopping the fine tuning." - ) - break + if loss_0_counter.item() == train_config.convergence_counter: + logger.log_rank_zero( + f"Loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps.Hence,stopping the fine tuning." + ) + break pbar.close() epoch_end_time = time.perf_counter() - epoch_start_time @@ -328,15 +326,15 @@ def train( if train_config.run_validation: if eval_epoch_loss < best_val_loss: best_val_loss = eval_epoch_loss - logger.info(f"best eval loss on epoch {epoch + 1} is {best_val_loss}") + logger.log_rank_zero(f"best eval loss on epoch {epoch + 1} is {best_val_loss}") val_loss.append(float(eval_epoch_loss)) val_metric.append(float(eval_metric)) if train_config.task_type == "seq_classification": - logger.info( + logger.log_rank_zero( f"Epoch {epoch + 1}: train_acc={metric_val:.4f}, train_epoch_loss={train_epoch_loss:.4f}, epoch time {epoch_end_time}s" ) else: - logger.info( + logger.log_rank_zero( f"Epoch {epoch + 1}: train_metric={metric_val:.4f}, train_epoch_loss={train_epoch_loss:.4f}, epoch time {epoch_end_time}s" ) @@ -440,7 +438,7 @@ def evaluation_helper(model, train_config, eval_dataloader, device): eval_metric = torch.exp(eval_epoch_loss) # Print evaluation metrics - logger.info(f"{eval_metric.detach().cpu()=} {eval_epoch_loss.detach().cpu()=}") + logger.log_rank_zero(f"{eval_metric.detach().cpu()=} {eval_epoch_loss.detach().cpu()=}") return eval_epoch_loss, eval_metric, val_step_loss, val_step_metric @@ -467,12 +465,23 @@ def print_model_size(model, config) -> None: Args: model: The PyTorch model. - model_name (str): Name of the model. + config : Config of the model. 
""" - - logger.info(f"Model : {config.model_name}") total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) - logger.info(f"{config.model_name} has {total_params / 1e6} Million params\n") + logger.log_rank_zero(f"{config.model_name} has {total_params / 1e6} Million params.") + + +def print_trainable_parameters(model) -> None: + """ + Print the number of trainable parameters, all params and percentage of trainablke params. + + Args: + model: The PyTorch model. + """ + trainable_params, all_param = model.get_nb_trainable_parameters() + logger.log_rank_zero( + f"trainable params: {trainable_params:,d} || all params: {all_param:,d} || trainable%: {100 * trainable_params / all_param:.4f}" + ) def save_to_json( diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index f8bc5753c..4e631f5eb 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -36,7 +36,7 @@ class DownloadRetryLimitExceeded(Exception): def login_and_download_hf_lm(model_name, *args, **kwargs): - logger.info(f"loading HuggingFace model for {model_name}") + logger.log_rank_zero(f"loading HuggingFace model for {model_name}") hf_token = kwargs.pop("hf_token", None) cache_dir = kwargs.pop("cache_dir", None) if hf_token is not None: diff --git a/QEfficient/utils/logging_utils.py b/QEfficient/utils/logging_utils.py index 8ed7d2c7d..29ac39936 100644 --- a/QEfficient/utils/logging_utils.py +++ b/QEfficient/utils/logging_utils.py @@ -6,6 +6,12 @@ # ----------------------------------------------------------------------------- import logging +import os +from datetime import datetime + +import torch.distributed as dist + +from QEfficient.utils.constants import ROOT_DIR class QEffFormatter(logging.Formatter): @@ -44,76 +50,60 @@ def create_logger() -> logging.Logger: """ logger = logging.getLogger("QEfficient") - # create console handler and set level to debug + # create console handler and set level ch = logging.StreamHandler() ch.setLevel(logging.INFO) - # define formatter ch.setFormatter(QEffFormatter()) - logger.addHandler(ch) - return logger - - -# Define the logger object that can be used for logging purposes throughout the module. -logger = create_logger() - -def create_ft_logger(log_file="finetune.log") -> logging.Logger: - """ - Creates a logger object with Colored QEffFormatter. - """ - logger = logging.getLogger("QEfficient") - - # create console handler and set level to debug - ch = logging.StreamHandler() - ch.setLevel(logging.INFO) - ch.setFormatter(QEffFormatter()) - logger.addHandler(ch) - - # create file handler and set level to debug - fh = logging.FileHandler(log_file) - fh.setLevel(logging.INFO) - fh.setFormatter(QEffFormatter()) - logger.addHandler(fh) + dump_logs = True + if dump_logs: + logs_path = os.path.join(ROOT_DIR, "logs") + if not os.path.exists(logs_path): + os.makedirs(logs_path, exist_ok=True) + file_name = f"log-file-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}" + ".txt" + log_file = os.path.join(logs_path, file_name) + + # create file handler and set level + fh = logging.FileHandler(log_file) + fh.setLevel(logging.INFO) + formatter = logging.Formatter("%(levelname)s - %(name)s - %(message)s") + fh.setFormatter(formatter) + logger.addHandler(fh) return logger -# Define the logger object that can be used for logging purposes throughout the finetuning module. 
-ft_logger = create_ft_logger() -""" - -class FT_Logger: - def __init__(self, level=logging.INFO, log_file="finetune.log"): - self.logger = logging.getLogger("QEfficient") - self.logger.setLevel(level) - self.level = level - - # Create handlers - self.file_handler = logging.FileHandler(log_file) - self.console_handler = logging.StreamHandler() - - self.file_handler.setFormatter(QEffFormatter()) - self.console_handler.setFormatter(QEffFormatter()) - - # Add handlers to the logger - self.logger.addHandler(self.file_handler) - self.logger.addHandler(self.console_handler) - - def get_logger(self): - return self.logger - - def raise_valueerror(self, message): - self.logger.error(message) - raise ValueError(message) - +class CustomLogger(logging.Logger): def raise_runtimeerror(self, message): - self.logger.error(message) + self.error(message) raise RuntimeError(message) - - def raise_filenotfounderror(self, message): - self.logger.error(message) - raise FileNotFoundError(message) -ft_logger = FT_Logger().get_logger() + def log_rank_zero(self, msg: str, level: int = logging.INFO) -> None: + rank = dist.get_rank() if dist.is_available() and dist.is_initialized() else 0 + if rank != 0: + return + self.log(level, msg, stacklevel=2) + + +""" def dump_logs(self, dump_logs=True): + if dump_logs: + logs_path = os.path.join(ROOT_DIR, "logs") + if not os.path.exists(logs_path): + os.makedirs(logs_path, exist_ok=True) + file_name = f"log-file-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}" + ".txt" + log_file = os.path.join(logs_path, file_name) + + # create file handler and set level + fh = logging.FileHandler(log_file) + fh.setLevel(logging.INFO) + formatter = logging.Formatter("%(levelname)s - %(name)s - %(message)s") + fh.setFormatter(formatter) + logger.addHandler(fh) """ + + +logging.setLoggerClass(CustomLogger) + +# Define the logger object that can be used for logging purposes throughout the module. 
+logger = create_logger() From f97b24e0f56fb60e71dfa95165c74449b6a6d240 Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Mon, 26 May 2025 13:47:47 +0000 Subject: [PATCH 4/4] set log_level and dump_logs flag Signed-off-by: Mamta Singh --- QEfficient/cloud/finetune.py | 6 +++--- QEfficient/cloud/infer.py | 14 +++++++++++++- QEfficient/finetune/configs/training.py | 3 +++ QEfficient/utils/logging_utils.py | 19 +------------------ 4 files changed, 20 insertions(+), 22 deletions(-) diff --git a/QEfficient/cloud/finetune.py b/QEfficient/cloud/finetune.py index 8dd572564..3321e6324 100644 --- a/QEfficient/cloud/finetune.py +++ b/QEfficient/cloud/finetune.py @@ -5,7 +5,6 @@ # # ----------------------------------------------------------------------------- -import logging import random import warnings from typing import Any, Dict, Optional, Union @@ -41,8 +40,6 @@ from QEfficient.utils._utils import login_and_download_hf_lm from QEfficient.utils.logging_utils import logger -logger.setLevel(logging.INFO) - # Try importing QAIC-specific module, proceed without it if unavailable try: import torch_qaic # noqa: F401 @@ -330,6 +327,9 @@ def main(peft_config_file: str = None, **kwargs) -> None: dataset_config = generate_dataset_config(train_config.dataset) update_config(dataset_config, **kwargs) + logger.prepare_dump_logs(train_config.dump_logs) + logger.setLevel(train_config.log_level) + setup_distributed_training(train_config) setup_seeds(train_config.seed) model, tokenizer = load_model_and_tokenizer(train_config, dataset_config, peft_config_file, **kwargs) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 43bbda0ba..4ced552f3 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -311,6 +311,13 @@ def main( action="store_true", help="pass to print info logs", ) + parser.add_argument( + "--log_level", + "--log-level", + type=int, + default=20, + help="set the Log level {NOTSET:0, DEBUG:10, INFO:20, WARNING:30, ERROR:40, CRITICAL:50}", + ) parser.add_argument( "--full_batch_size", "--full-batch-size", @@ -353,6 +360,11 @@ def main( ) compiler_options_dict[key] = value if args.verbose: - logger.setLevel(logging.INFO) + logger.prepare_dump_logs(args.verbose) + if args.log_level: + logger.setLevel(args.log_level) + else: + logger.setLevel(logging.INFO) del args.verbose # type: ignore + del args.log_level # type: ignore main(**args.__dict__, **compiler_options_dict) diff --git a/QEfficient/finetune/configs/training.py b/QEfficient/finetune/configs/training.py index 1698e680b..d95d08679 100644 --- a/QEfficient/finetune/configs/training.py +++ b/QEfficient/finetune/configs/training.py @@ -4,6 +4,8 @@ # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- + +import logging from dataclasses import dataclass @@ -107,3 +109,4 @@ class TrainConfig: opByOpVerifier: bool = False dump_logs: bool = True + log_level: str = logging.INFO diff --git a/QEfficient/utils/logging_utils.py b/QEfficient/utils/logging_utils.py index 29ac39936..d62168ed5 100644 --- a/QEfficient/utils/logging_utils.py +++ b/QEfficient/utils/logging_utils.py @@ -56,21 +56,6 @@ def create_logger() -> logging.Logger: ch.setFormatter(QEffFormatter()) logger.addHandler(ch) - dump_logs = True - if dump_logs: - logs_path = os.path.join(ROOT_DIR, "logs") - if not os.path.exists(logs_path): - os.makedirs(logs_path, exist_ok=True) - file_name = f"log-file-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}" + ".txt" - log_file = os.path.join(logs_path, 
file_name) - - # create file handler and set level - fh = logging.FileHandler(log_file) - fh.setLevel(logging.INFO) - formatter = logging.Formatter("%(levelname)s - %(name)s - %(message)s") - fh.setFormatter(formatter) - logger.addHandler(fh) - return logger @@ -85,8 +70,7 @@ def log_rank_zero(self, msg: str, level: int = logging.INFO) -> None: return self.log(level, msg, stacklevel=2) - -""" def dump_logs(self, dump_logs=True): + def prepare_dump_logs(self, dump_logs=False): if dump_logs: logs_path = os.path.join(ROOT_DIR, "logs") if not os.path.exists(logs_path): @@ -100,7 +84,6 @@ def log_rank_zero(self, msg: str, level: int = logging.INFO) -> None: formatter = logging.Formatter("%(levelname)s - %(name)s - %(message)s") fh.setFormatter(formatter) logger.addHandler(fh) -""" logging.setLoggerClass(CustomLogger)
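
The sketch below illustrates how the logging flow that this series converges on is meant to be consumed. It is an illustrative addendum rather than part of any patch: it assumes the patched QEfficient package is importable and uses placeholder messages, but the calls themselves (prepare_dump_logs, setLevel, log_rank_zero, raise_runtimeerror) and the TrainConfig fields dump_logs/log_level are the ones introduced in the diffs above.

import logging

from QEfficient.finetune.configs.training import TrainConfig
from QEfficient.utils.logging_utils import logger  # shared "QEfficient" logger, a CustomLogger after patch 3/4

train_config = TrainConfig()                       # dump_logs=True, log_level=logging.INFO by default (patch 4)
logger.prepare_dump_logs(train_config.dump_logs)   # optionally attach a file handler under <ROOT_DIR>/logs
logger.setLevel(train_config.log_level)

# log_rank_zero() emits only on rank 0 when torch.distributed is initialized,
# and on every process otherwise.
logger.log_rank_zero("Starting fine tuning run.")
logger.log_rank_zero("Resizing the embedding matrix to match the tokenizer vocab size.", logging.WARNING)

# raise_runtimeerror() logs the message at ERROR level, then raises RuntimeError with it.
try:
    logger.raise_runtimeerror("Example failure path: message is logged before the exception propagates.")
except RuntimeError:
    pass

This mirrors the call sites in QEfficient/cloud/finetune.py after patch 4. Because logging_utils calls logging.setLoggerClass(CustomLogger) before building the module-level logger, every module that imports logger from QEfficient.utils.logging_utils shares the same rank-aware instance.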