From 74e1915465957c7d33b164bfd96532a506b681aa Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Mon, 21 Apr 2025 09:06:35 +0000 Subject: [PATCH 1/4] Use logger in place of print statements in finetuning scripts Signed-off-by: Mamta Singh --- QEfficient/cloud/finetune.py | 15 +++++---- QEfficient/finetune/dataset/custom_dataset.py | 8 +++-- .../finetune/dataset/grammar_dataset.py | 10 +++--- QEfficient/finetune/eval.py | 10 +++--- QEfficient/finetune/utils/plot_metrics.py | 6 ++-- QEfficient/finetune/utils/train_utils.py | 33 ++++++++++--------- 6 files changed, 46 insertions(+), 36 deletions(-) diff --git a/QEfficient/cloud/finetune.py b/QEfficient/cloud/finetune.py index f312d00cb..474f7864c 100644 --- a/QEfficient/cloud/finetune.py +++ b/QEfficient/cloud/finetune.py @@ -31,11 +31,12 @@ ) from QEfficient.finetune.utils.train_utils import get_longest_seq_length, print_model_size, train from QEfficient.utils._utils import login_and_download_hf_lm +from QEfficient.utils.logging_utils import logger try: import torch_qaic # noqa: F401 except ImportError as e: - print(f"Warning: {e}. Moving ahead without these qaic modules.") + logger.warning(f"{e}. Moving ahead without these qaic modules.") from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer @@ -114,7 +115,7 @@ def main(**kwargs): # If there is a mismatch between tokenizer vocab size and embedding matrix, # throw a warning and then expand the embedding matrix if len(tokenizer) > model.get_input_embeddings().weight.shape[0]: - print("WARNING: Resizing the embedding matrix to match the tokenizer vocab size.") + logger.warning("Resizing the embedding matrix to match the tokenizer vocab size.") model.resize_token_embeddings(len(tokenizer)) print_model_size(model, train_config) @@ -163,10 +164,10 @@ def main(**kwargs): # ) ## train_dl_kwargs = get_dataloader_kwargs(train_config, dataset_train, dataset_processer, "train") - print("length of dataset_train", len(dataset_train)) + logger.info("length of dataset_train", len(dataset_train)) custom_data_collator = get_custom_data_collator(dataset_processer, dataset_config) if custom_data_collator: - print("custom_data_collator is used") + logger.info("custom_data_collator is used") train_dl_kwargs["collate_fn"] = custom_data_collator # Create DataLoaders for the training and validation dataset @@ -176,7 +177,7 @@ def main(**kwargs): pin_memory=True, **train_dl_kwargs, ) - print(f"--> Num of Training Set Batches loaded = {len(train_dataloader)}") + logger.info(f"--> Num of Training Set Batches loaded = {len(train_dataloader)}") eval_dataloader = None if train_config.run_validation: @@ -200,7 +201,7 @@ def main(**kwargs): f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. 
({len(eval_dataloader)=})" ) else: - print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}") + logger.info(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}") longest_seq_length, _ = get_longest_seq_length( torch.utils.data.ConcatDataset([train_dataloader.dataset, eval_dataloader.dataset]) @@ -208,7 +209,7 @@ def main(**kwargs): else: longest_seq_length, _ = get_longest_seq_length(train_dataloader.dataset) - print( + logger.info( f"The longest sequence length in the train data is {longest_seq_length}, " f"passed context length is {train_config.context_length} and overall model's context length is " f"{model.config.max_position_embeddings}" diff --git a/QEfficient/finetune/dataset/custom_dataset.py b/QEfficient/finetune/dataset/custom_dataset.py index 4bee06c58..7164e13a1 100644 --- a/QEfficient/finetune/dataset/custom_dataset.py +++ b/QEfficient/finetune/dataset/custom_dataset.py @@ -8,6 +8,8 @@ import importlib from pathlib import Path +from QEfficient.utils.logging_utils import logger + def load_module_from_py_file(py_file: str) -> object: """ @@ -40,7 +42,7 @@ def get_custom_dataset(dataset_config, tokenizer, split: str): try: return getattr(module, func_name)(dataset_config, tokenizer, split) except AttributeError as e: - print( + logger.error( f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()})." ) raise e @@ -63,6 +65,6 @@ def get_data_collator(dataset_processer, dataset_config): try: return getattr(module, func_name)(dataset_processer) except AttributeError: - print(f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()}).") - print("Using the default data_collator instead.") + logger.info(f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()}).") + logger.info("Using the default data_collator instead.") return None diff --git a/QEfficient/finetune/dataset/grammar_dataset.py b/QEfficient/finetune/dataset/grammar_dataset.py index 8f04b7544..6ebeeb2d1 100644 --- a/QEfficient/finetune/dataset/grammar_dataset.py +++ b/QEfficient/finetune/dataset/grammar_dataset.py @@ -10,6 +10,8 @@ from datasets import load_dataset from torch.utils.data import Dataset +from QEfficient.utils.logging_utils import logger + class grammar(Dataset): def __init__(self, tokenizer, csv_name=None, context_length=None): @@ -20,7 +22,7 @@ def __init__(self, tokenizer, csv_name=None, context_length=None): delimiter=",", ) except Exception as e: - print( + logger.error( "Loading of grammar dataset failed! Please see [here](https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/datasets/grammar_dataset/grammar_dataset_process.ipynb) for details on how to download the dataset." 
) raise e @@ -36,7 +38,7 @@ def convert_to_features(self, example_batch): # Create prompt and tokenize contexts and questions if self.print_text: - print("Input Text: ", self.clean_text(example_batch["text"])) + logger.info("Input Text: ", self.clean_text(example_batch["text"])) input_ = example_batch["input"] target_ = example_batch["target"] @@ -71,9 +73,9 @@ def get_dataset(dataset_config, tokenizer, csv_name=None, context_length=None): """cover function for handling loading the working dataset""" """dataset loading""" currPath = Path.cwd() / "datasets_grammar" / "grammar_train.csv" - print(f"Loading dataset {currPath}") + logger.info(f"Loading dataset {currPath}") csv_name = str(currPath) - print(csv_name) + logger.info(csv_name) dataset = grammar(tokenizer=tokenizer, csv_name=csv_name, context_length=context_length) return dataset diff --git a/QEfficient/finetune/eval.py b/QEfficient/finetune/eval.py index 918230554..fe9d9ae6a 100644 --- a/QEfficient/finetune/eval.py +++ b/QEfficient/finetune/eval.py @@ -25,12 +25,14 @@ ) from utils.train_utils import evaluation, print_model_size +from QEfficient.utils.logging_utils import logger + try: import torch_qaic # noqa: F401 device = "qaic:0" except ImportError as e: - print(f"Warning: {e}. Moving ahead without these qaic modules.") + logger.warning(f"{e}. Moving ahead without these qaic modules.") device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # Suppress all warnings @@ -76,7 +78,7 @@ def main(**kwargs): # If there is a mismatch between tokenizer vocab size and embedding matrix, # throw a warning and then expand the embedding matrix if len(tokenizer) > model.get_input_embeddings().weight.shape[0]: - print("WARNING: Resizing the embedding matrix to match the tokenizer vocab size.") + logger.warning("Resizing the embedding matrix to match the tokenizer vocab size.") model.resize_token_embeddings(len(tokenizer)) print_model_size(model, train_config) @@ -107,13 +109,13 @@ def main(**kwargs): pin_memory=True, **val_dl_kwargs, ) - print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}") + logger.info(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}") if len(eval_dataloader) == 0: raise ValueError( f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. 
({len(eval_dataloader)=})" ) else: - print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}") + logger.info(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}") model.to(device) _ = evaluation(model, train_config, eval_dataloader, None, tokenizer, device) diff --git a/QEfficient/finetune/utils/plot_metrics.py b/QEfficient/finetune/utils/plot_metrics.py index 5fc54f279..e2dd37f49 100644 --- a/QEfficient/finetune/utils/plot_metrics.py +++ b/QEfficient/finetune/utils/plot_metrics.py @@ -11,6 +11,8 @@ import matplotlib.pyplot as plt +from QEfficient.utils.logging_utils import logger + def plot_metric(data, metric_name, x_label, y_label, title, colors): plt.figure(figsize=(7, 6)) @@ -67,14 +69,14 @@ def plot_metrics_by_step(data, metric_name, x_label, y_label, colors): def plot_metrics(file_path): if not os.path.exists(file_path): - print(f"File {file_path} does not exist.") + logger.error(f"File {file_path} does not exist.") return with open(file_path, "r") as f: try: data = json.load(f) except json.JSONDecodeError: - print("Invalid JSON file.") + logger.error("Invalid JSON file.") return directory = os.path.dirname(file_path) diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py index 2bc701008..68b884cab 100644 --- a/QEfficient/finetune/utils/train_utils.py +++ b/QEfficient/finetune/utils/train_utils.py @@ -19,6 +19,7 @@ from tqdm import tqdm from QEfficient.finetune.configs.training import train_config as TRAIN_CONFIG +from QEfficient.utils.logging_utils import logger try: import torch_qaic # noqa: F401 @@ -27,7 +28,7 @@ import torch_qaic.utils as qaic_utils # noqa: F401 from torch.qaic.amp import GradScaler as QAicGradScaler except ImportError as e: - print(f"Warning: {e}. Moving ahead without these qaic modules.") + logger.warning(f"{e}. Moving ahead without these qaic modules.") from torch.amp import GradScaler @@ -116,12 +117,12 @@ def train( for epoch in range(train_config.num_epochs): if loss_0_counter.item() == train_config.convergence_counter: if train_config.enable_ddp: - print( + logger.info( f"Not proceeding with epoch {epoch + 1} on device {local_rank} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps." ) break else: - print( + logger.info( f"Not proceeding with epoch {epoch + 1} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps." 
) break @@ -129,13 +130,13 @@ def train( if train_config.use_peft and train_config.from_peft_checkpoint: intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-2].split("_")[-1]) - 1 if epoch < intermediate_epoch: - print(f"Skipping epoch {epoch + 1} since fine tuning has already completed for it.") + logger.info(f"Skipping epoch {epoch + 1} since fine tuning has already completed for it.") # to bring the count of train_step in sync with where it left off total_train_steps += len(train_dataloader) continue - print(f"Starting epoch {epoch + 1}/{train_config.num_epochs}") - print(f"train_config.max_train_step: {train_config.max_train_step}") + logger.info(f"Starting epoch {epoch + 1}/{train_config.num_epochs}") + logger.info(f"train_config.max_train_step: {train_config.max_train_step}") # stop when the maximum number of training steps is reached if max_steps_reached: break @@ -162,7 +163,7 @@ def train( # to bring the count of train_step in sync with where it left off if epoch == intermediate_epoch and step == 0: total_train_steps += intermediate_step - print( + logger.info( f"skipping first {intermediate_step} steps for epoch {epoch + 1}, since fine tuning has already completed for them." ) if epoch == intermediate_epoch and step < intermediate_step: @@ -197,7 +198,7 @@ def train( labels = batch["labels"][:, 0] preds = torch.nn.functional.softmax(logits, dim=-1) acc_helper.forward(preds, labels) - print("Mismatches detected:", verifier.get_perop_mismatch_count()) + logger.info("Mismatches detected:", verifier.get_perop_mismatch_count()) else: model_outputs = model(**batch) loss = model_outputs.loss # Forward call @@ -279,13 +280,13 @@ def train( ) if train_config.enable_ddp: if loss_0_counter.item() == train_config.convergence_counter: - print( + logger.info( f"Loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps. Hence, stopping the fine tuning on device {local_rank}." ) break else: if loss_0_counter.item() == train_config.convergence_counter: - print( + logger.info( f"Loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps. Hence, stopping the fine tuning." ) break @@ -347,15 +348,15 @@ def train( if train_config.run_validation: if eval_epoch_loss < best_val_loss: best_val_loss = eval_epoch_loss - print(f"best eval loss on epoch {epoch + 1} is {best_val_loss}") + logger.info(f"best eval loss on epoch {epoch + 1} is {best_val_loss}") val_loss.append(float(eval_epoch_loss)) val_metric.append(float(eval_metric)) if train_config.task_type == "seq_classification": - print( + logger.info( f"Epoch {epoch + 1}: train_acc={metric_val:.4f}, train_epoch_loss={train_epoch_loss:.4f}, epoch time {epoch_end_time}s" ) else: - print( + logger.info( f"Epoch {epoch + 1}: train_metric={metric_val:.4f}, train_epoch_loss={train_epoch_loss:.4f}, epoch time {epoch_end_time}s" ) @@ -459,7 +460,7 @@ def evaluation_helper(model, train_config, eval_dataloader, device): eval_metric = torch.exp(eval_epoch_loss) # Print evaluation metrics - print(f" {eval_metric.detach().cpu()=} {eval_epoch_loss.detach().cpu()=}") + logger.info(f" {eval_metric.detach().cpu()=} {eval_epoch_loss.detach().cpu()=}") return eval_metric, eval_epoch_loss, val_step_loss, val_step_metric @@ -489,9 +490,9 @@ def print_model_size(model, config) -> None: model_name (str): Name of the model. 
""" - print(f"--> Model {config.model_name}") + logger.info(f"--> Model {config.model_name}") total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) - print(f"\n--> {config.model_name} has {total_params / 1e6} Million params\n") + logger.info(f"\n--> {config.model_name} has {total_params / 1e6} Million params\n") def save_to_json( From 669cf983094e56c4afc73339063e777f0ec0db16 Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Fri, 16 May 2025 14:51:18 +0530 Subject: [PATCH 2/4] Address comments Signed-off-by: Mamta Singh --- QEfficient/cloud/finetune.py | 13 ++-- QEfficient/finetune/dataset/custom_dataset.py | 2 +- .../finetune/dataset/grammar_dataset.py | 2 +- QEfficient/finetune/eval.py | 4 +- QEfficient/finetune/utils/config_utils.py | 2 +- QEfficient/finetune/utils/train_utils.py | 39 ++++-------- QEfficient/utils/logging_utils.py | 61 +++++++++++++++++++ 7 files changed, 84 insertions(+), 39 deletions(-) diff --git a/QEfficient/cloud/finetune.py b/QEfficient/cloud/finetune.py index f9e427f2e..8c36bdd3d 100644 --- a/QEfficient/cloud/finetune.py +++ b/QEfficient/cloud/finetune.py @@ -5,6 +5,7 @@ # # ----------------------------------------------------------------------------- +import logging import random import warnings from typing import Any, Dict, Optional, Union @@ -18,7 +19,7 @@ import torch.utils.data from peft import PeftModel, get_peft_model from torch.optim.lr_scheduler import StepLR -from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModel, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer from QEfficient.finetune.configs.training import TrainConfig from QEfficient.finetune.utils.config_utils import ( @@ -33,7 +34,7 @@ ) from QEfficient.finetune.utils.train_utils import get_longest_seq_length, print_model_size, train from QEfficient.utils._utils import login_and_download_hf_lm -from QEfficient.utils.logging_utils import logger +from QEfficient.utils.logging_utils import ft_logger as logger # Try importing QAIC-specific module, proceed without it if unavailable try: @@ -41,8 +42,8 @@ except ImportError as e: logger.warning(f"{e}. Moving ahead without these qaic modules.") +logger.setLevel(logging.INFO) -from transformers import AutoModelForSequenceClassification # Suppress all warnings warnings.filterwarnings("ignore") @@ -245,7 +246,7 @@ def setup_dataloaders( # ) ## train_dl_kwargs = get_dataloader_kwargs(train_config, dataset_train, dataset_processer, "train") - logger.info("length of dataset_train", len(dataset_train)) + logger.info(f"length of dataset_train = {len(dataset_train)}") # FIXME (Meet): Add custom data collator registration from the outside by the user. custom_data_collator = get_custom_data_collator(dataset_processer, dataset_config) @@ -260,7 +261,7 @@ def setup_dataloaders( pin_memory=True, **train_dl_kwargs, ) - logger.info(f"--> Num of Training Set Batches loaded = {len(train_dataloader)}") + logger.info(f"Num of Training Set Batches loaded = {len(train_dataloader)}") eval_dataloader = None if train_config.run_validation: @@ -284,7 +285,7 @@ def setup_dataloaders( f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. 
({len(eval_dataloader)=})" ) else: - logger.info(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}") + logger.info(f"Num of Validation Set Batches loaded = {len(eval_dataloader)}") longest_seq_length, _ = get_longest_seq_length( torch.utils.data.ConcatDataset([train_dataloader.dataset, eval_dataloader.dataset]) diff --git a/QEfficient/finetune/dataset/custom_dataset.py b/QEfficient/finetune/dataset/custom_dataset.py index 7164e13a1..c32100f2b 100644 --- a/QEfficient/finetune/dataset/custom_dataset.py +++ b/QEfficient/finetune/dataset/custom_dataset.py @@ -8,7 +8,7 @@ import importlib from pathlib import Path -from QEfficient.utils.logging_utils import logger +from QEfficient.utils.logging_utils import ft_logger as logger def load_module_from_py_file(py_file: str) -> object: diff --git a/QEfficient/finetune/dataset/grammar_dataset.py b/QEfficient/finetune/dataset/grammar_dataset.py index 6ebeeb2d1..adc8c6550 100644 --- a/QEfficient/finetune/dataset/grammar_dataset.py +++ b/QEfficient/finetune/dataset/grammar_dataset.py @@ -23,7 +23,7 @@ def __init__(self, tokenizer, csv_name=None, context_length=None): ) except Exception as e: logger.error( - "Loading of grammar dataset failed! Please see [here](https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/datasets/grammar_dataset/grammar_dataset_process.ipynb) for details on how to download the dataset." + "Loading of grammar dataset failed! Please check (https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/datasets/grammar_dataset/grammar_dataset_process.ipynb) for details on how to download the dataset." ) raise e diff --git a/QEfficient/finetune/eval.py b/QEfficient/finetune/eval.py index 076628570..87f9b5417 100644 --- a/QEfficient/finetune/eval.py +++ b/QEfficient/finetune/eval.py @@ -109,13 +109,13 @@ def main(**kwargs): pin_memory=True, **val_dl_kwargs, ) - logger.info(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}") + logger.info(f"Num of Validation Set Batches loaded = {len(eval_dataloader)}") if len(eval_dataloader) == 0: raise ValueError( f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. 
({len(eval_dataloader)=})" ) else: - logger.info(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}") + logger.info(f"Num of Validation Set Batches loaded = {len(eval_dataloader)}") model.to(device) _ = evaluation(model, train_config, eval_dataloader, None, tokenizer, device) diff --git a/QEfficient/finetune/utils/config_utils.py b/QEfficient/finetune/utils/config_utils.py index c8b04adf4..bc45a8731 100644 --- a/QEfficient/finetune/utils/config_utils.py +++ b/QEfficient/finetune/utils/config_utils.py @@ -54,7 +54,7 @@ def update_config(config, **kwargs): raise ValueError(f"Config '{config_name}' does not have parameter: '{param_name}'") else: config_type = type(config).__name__ - logger.warning(f"Unknown parameter '{k}' for config type '{config_type}'") + logger.debug(f"Unknown parameter '{k}' for config type '{config_type}'") def generate_peft_config(train_config: TrainConfig, peft_config_file: str = None, **kwargs) -> Any: diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py index 4a7e59b1d..a43f6ab27 100644 --- a/QEfficient/finetune/utils/train_utils.py +++ b/QEfficient/finetune/utils/train_utils.py @@ -19,7 +19,7 @@ from tqdm import tqdm from QEfficient.finetune.configs.training import TrainConfig -from QEfficient.utils.logging_utils import logger +from QEfficient.utils.logging_utils import ft_logger as logger try: import torch_qaic # noqa: F401 @@ -85,10 +85,7 @@ def train( device_type = device.split(":")[0] tensorboard_updates = None - if train_config.enable_ddp: - if local_rank == 0: - tensorboard_updates = SummaryWriter() - else: + if (not train_config.enable_ddp) or (train_config.enable_ddp and local_rank == 0): tensorboard_updates = SummaryWriter() if train_config.grad_scaler: @@ -113,14 +110,9 @@ def train( # Start the training loop for epoch in range(train_config.num_epochs): if loss_0_counter.item() == train_config.convergence_counter: - if train_config.enable_ddp: - logger.info( - f"Not proceeding with epoch {epoch + 1} on device {local_rank} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps." - ) - break - else: + if (not train_config.enable_ddp) or (train_config.enable_ddp and local_rank == 0): logger.info( - f"Not proceeding with epoch {epoch + 1} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps." + f"Skipping epoch {epoch + 1} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps." ) break @@ -161,7 +153,7 @@ def train( if epoch == intermediate_epoch and step == 0: total_train_steps += intermediate_step logger.info( - f"skipping first {intermediate_step} steps for epoch {epoch + 1}, since fine tuning has already completed for them." + f"Skipping first {intermediate_step} steps for epoch {epoch + 1}, since fine tuning has already completed for it." 
) if epoch == intermediate_epoch and step < intermediate_step: total_train_steps += 1 @@ -221,10 +213,7 @@ def train( else: loss_0_counter = torch.tensor([0]).to(device) - if train_config.enable_ddp: - if local_rank == 0: - tensorboard_updates.add_scalars("loss", {"train": loss}, total_train_steps) - else: + if (not train_config.enable_ddp) or (train_config.enable_ddp and local_rank == 0): tensorboard_updates.add_scalars("loss", {"train": loss}, total_train_steps) if train_config.save_metrics: @@ -275,16 +264,10 @@ def train( val_step_metric, val_metric, ) - if train_config.enable_ddp: - if loss_0_counter.item() == train_config.convergence_counter: - logger.info( - f"Loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps. Hence, stopping the fine tuning on device {local_rank}." - ) - break - else: + if (not train_config.enable_ddp) or (train_config.enable_ddp and local_rank == 0): if loss_0_counter.item() == train_config.convergence_counter: logger.info( - f"Loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps. Hence, stopping the fine tuning." + f"Loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps.Hence,stopping the fine tuning." ) break @@ -457,7 +440,7 @@ def evaluation_helper(model, train_config, eval_dataloader, device): eval_metric = torch.exp(eval_epoch_loss) # Print evaluation metrics - logger.info(f" {eval_metric.detach().cpu()=} {eval_epoch_loss.detach().cpu()=}") + logger.info(f"{eval_metric.detach().cpu()=} {eval_epoch_loss.detach().cpu()=}") return eval_epoch_loss, eval_metric, val_step_loss, val_step_metric @@ -487,9 +470,9 @@ def print_model_size(model, config) -> None: model_name (str): Name of the model. """ - logger.info(f"--> Model {config.model_name}") + logger.info(f"Model : {config.model_name}") total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) - logger.info(f"\n--> {config.model_name} has {total_params / 1e6} Million params\n") + logger.info(f"{config.model_name} has {total_params / 1e6} Million params\n") def save_to_json( diff --git a/QEfficient/utils/logging_utils.py b/QEfficient/utils/logging_utils.py index c17fde29c..f73344c4e 100644 --- a/QEfficient/utils/logging_utils.py +++ b/QEfficient/utils/logging_utils.py @@ -56,3 +56,64 @@ def create_logger() -> logging.Logger: # Define the logger object that can be used for logging purposes throughout the module. logger = create_logger() + + +def create_ft_logger(log_file="finetune.log") -> logging.Logger: + """ + Creates a logger object with Colored QEffFormatter. + """ + logger = logging.getLogger("QEfficient") + + # create console handler and set level to debug + ch = logging.StreamHandler() + ch.setLevel(logging.INFO) + ch.setFormatter(QEffFormatter()) + logger.addHandler(ch) + + # create file handler and set level to debug + fh = logging.FileHandler(log_file) + fh.setLevel(logging.INFO) + fh.setFormatter(QEffFormatter()) + logger.addHandler(fh) + + return logger + + +# Define the logger object that can be used for logging purposes throughout the finetuning module. 
+ft_logger = create_ft_logger() +""" + +class FT_Logger: + def __init__(self, level=logging.INFO, log_file="finetune.log"): + self.logger = logging.getLogger("QEfficient") + self.logger.setLevel(level) + self.level = level + + # Create handlers + self.file_handler = logging.FileHandler(log_file) + self.console_handler = logging.StreamHandler() + + self.file_handler.setFormatter(QEffFormatter()) + self.console_handler.setFormatter(QEffFormatter()) + + # Add handlers to the logger + self.logger.addHandler(self.file_handler) + self.logger.addHandler(self.console_handler) + + def get_logger(self): + return self.logger + + def raise_valueerror(self, message): + self.logger.error(message) + raise ValueError(message) + + def raise_runtimeerror(self, message): + self.logger.error(message) + raise RuntimeError(message) + + def raise_filenotfounderror(self, message): + self.logger.error(message) + raise FileNotFoundError(message) + +ft_logger = FT_Logger().get_logger() +""" From ce14058afcbd7dd83017b1094d521b09540a393c Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Thu, 22 May 2025 10:51:30 +0000 Subject: [PATCH 3/4] Update logging_utils and log for zero rank Signed-off-by: Mamta Singh --- QEfficient/cloud/finetune.py | 43 ++++--- QEfficient/finetune/configs/training.py | 2 + QEfficient/finetune/dataset/custom_dataset.py | 15 ++- QEfficient/finetune/eval.py | 4 +- QEfficient/finetune/utils/train_utils.py | 59 ++++++---- QEfficient/utils/_utils.py | 2 +- QEfficient/utils/logging_utils.py | 110 ++++++++---------- 7 files changed, 121 insertions(+), 114 deletions(-) diff --git a/QEfficient/cloud/finetune.py b/QEfficient/cloud/finetune.py index 7b9487c81..8dd572564 100644 --- a/QEfficient/cloud/finetune.py +++ b/QEfficient/cloud/finetune.py @@ -32,17 +32,22 @@ get_custom_data_collator, get_preprocessed_dataset, ) -from QEfficient.finetune.utils.train_utils import get_longest_seq_length, print_model_size, train +from QEfficient.finetune.utils.train_utils import ( + get_longest_seq_length, + print_model_size, + print_trainable_parameters, + train, +) from QEfficient.utils._utils import login_and_download_hf_lm -from QEfficient.utils.logging_utils import ft_logger as logger +from QEfficient.utils.logging_utils import logger + +logger.setLevel(logging.INFO) # Try importing QAIC-specific module, proceed without it if unavailable try: import torch_qaic # noqa: F401 except ImportError as e: - logger.warning(f"{e}. Moving ahead without these qaic modules.") - -logger.setLevel(logging.INFO) + logger.log_rank_zero(f"{e}. 
Moving ahead without these qaic modules.") # Suppress all warnings @@ -121,7 +126,7 @@ def load_model_and_tokenizer( ) if not hasattr(model, "base_model_prefix"): - raise RuntimeError("Given huggingface model does not have 'base_model_prefix' attribute.") + logger.raise_runtimeerror("Given huggingface model does not have 'base_model_prefix' attribute.") for param in getattr(model, model.base_model_prefix).parameters(): param.requires_grad = False @@ -146,7 +151,7 @@ def load_model_and_tokenizer( # If there is a mismatch between tokenizer vocab size and embedding matrix, # throw a warning and then expand the embedding matrix if len(tokenizer) > model.get_input_embeddings().weight.shape[0]: - logger.warning("Resizing the embedding matrix to match the tokenizer vocab size.") + logger.log_rank_zero("Resizing the embedding matrix to match the tokenizer vocab size.", logger.WARNING) model.resize_token_embeddings(len(tokenizer)) # FIXME (Meet): Cover below line inside the logger once it is implemented. @@ -162,7 +167,9 @@ def load_model_and_tokenizer( if hasattr(model, "supports_gradient_checkpointing") and model.supports_gradient_checkpointing: model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"preserve_rng_state": False}) else: - raise RuntimeError("Given model doesn't support gradient checkpointing. Please disable it and run it.") + logger.raise_runtimeerror( + "Given model doesn't support gradient checkpointing. Please disable it and run it." + ) model = apply_peft(model, train_config, peft_config_file, **kwargs) @@ -197,7 +204,7 @@ def apply_peft( else: peft_config = generate_peft_config(train_config, peft_config_file, **kwargs) model = get_peft_model(model, peft_config) - model.print_trainable_parameters() + print_trainable_parameters(model) return model @@ -222,7 +229,7 @@ def setup_dataloaders( - Length of longest sequence in the dataset. Raises: - ValueError: If validation is enabled but the validation set is too small. + RuntimeError: If validation is enabled but the validation set is too small. Notes: - Applies a custom data collator if provided by get_custom_data_collator. @@ -246,12 +253,12 @@ def setup_dataloaders( # ) ## train_dl_kwargs = get_dataloader_kwargs(train_config, dataset_train, dataset_processer, "train") - logger.info(f"length of dataset_train = {len(dataset_train)}") + logger.log_rank_zero(f"Length of dataset_train = {len(dataset_train)}") # FIXME (Meet): Add custom data collator registration from the outside by the user. custom_data_collator = get_custom_data_collator(dataset_processer, dataset_config) if custom_data_collator: - logger.info("custom_data_collator is used") + logger.log_rank_zero("Custom_data_collator is used") train_dl_kwargs["collate_fn"] = custom_data_collator # Create DataLoaders for the training and validation dataset @@ -261,7 +268,7 @@ def setup_dataloaders( pin_memory=True, **train_dl_kwargs, ) - logger.info(f"Num of Training Set Batches loaded = {len(train_dataloader)}") + logger.log_rank_zero(f"Number of Training Set Batches loaded = {len(train_dataloader)}") eval_dataloader = None if train_config.run_validation: @@ -281,11 +288,11 @@ def setup_dataloaders( **val_dl_kwargs, ) if len(eval_dataloader) == 0: - raise ValueError( + logger.raise_runtimeerror( f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. 
({len(eval_dataloader)=})" ) else: - logger.info(f"Num of Validation Set Batches loaded = {len(eval_dataloader)}") + logger.log_rank_zero(f"Number of Validation Set Batches loaded = {len(eval_dataloader)}") longest_seq_length, _ = get_longest_seq_length( torch.utils.data.ConcatDataset([train_dataloader.dataset, eval_dataloader.dataset]) @@ -329,7 +336,7 @@ def main(peft_config_file: str = None, **kwargs) -> None: # Create DataLoaders for the training and validation dataset train_dataloader, eval_dataloader, longest_seq_length = setup_dataloaders(train_config, dataset_config, tokenizer) - logger.info( + logger.log_rank_zero( f"The longest sequence length in the train data is {longest_seq_length}, " f"passed context length is {train_config.context_length} and overall model's context length is " f"{model.config.max_position_embeddings}" @@ -340,7 +347,7 @@ def main(peft_config_file: str = None, **kwargs) -> None: scheduler = StepLR(optimizer, step_size=1, gamma=train_config.gamma) if train_config.enable_ddp: model = nn.parallel.DistributedDataParallel(model, device_ids=[dist.get_rank()]) - results = train( + _ = train( model, tokenizer, train_dataloader, @@ -352,7 +359,7 @@ def main(peft_config_file: str = None, **kwargs) -> None: ) if train_config.enable_ddp: dist.destroy_process_group() - return results + return if __name__ == "__main__": diff --git a/QEfficient/finetune/configs/training.py b/QEfficient/finetune/configs/training.py index 258565c07..1698e680b 100644 --- a/QEfficient/finetune/configs/training.py +++ b/QEfficient/finetune/configs/training.py @@ -105,3 +105,5 @@ class TrainConfig: grad_scaler: bool = True dump_root_dir: str = "meta-llama-samsum-mismatches/step_" opByOpVerifier: bool = False + + dump_logs: bool = True diff --git a/QEfficient/finetune/dataset/custom_dataset.py b/QEfficient/finetune/dataset/custom_dataset.py index d50504d38..b392e4724 100644 --- a/QEfficient/finetune/dataset/custom_dataset.py +++ b/QEfficient/finetune/dataset/custom_dataset.py @@ -8,7 +8,7 @@ import importlib from pathlib import Path -from QEfficient.utils.logging_utils import ft_logger as logger +from QEfficient.utils.logging_utils import logger def load_module_from_py_file(py_file: str) -> object: @@ -32,20 +32,19 @@ def get_custom_dataset(dataset_config, tokenizer, split: str): module_path, func_name = dataset_config.file, "get_custom_dataset" if not module_path.endswith(".py"): - raise ValueError(f"Dataset file {module_path} is not a .py file.") + logger.raise_runtimeerror(f"Dataset file {module_path} is not a .py file.") module_path = Path(module_path) if not module_path.is_file(): - raise FileNotFoundError(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.") + logger.raise_runtimeerror(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.") module = load_module_from_py_file(module_path.as_posix()) try: return getattr(module, func_name)(dataset_config, tokenizer, split) - except AttributeError as e: - logger.error( + except AttributeError: + logger.raise_runtimeerror( f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()})." 
) - raise e def get_data_collator(dataset_processer, dataset_config): @@ -55,11 +54,11 @@ def get_data_collator(dataset_processer, dataset_config): module_path, func_name = dataset_config.file, "get_data_collator" if not module_path.endswith(".py"): - raise ValueError(f"Dataset file {module_path} is not a .py file.") + logger.raise_runtimeerror(f"Dataset file {module_path} is not a .py file.") module_path = Path(module_path) if not module_path.is_file(): - raise FileNotFoundError(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.") + logger.raise_runtimeerror(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.") module = load_module_from_py_file(module_path.as_posix()) try: diff --git a/QEfficient/finetune/eval.py b/QEfficient/finetune/eval.py index e1f9b77e6..1095fe9e9 100644 --- a/QEfficient/finetune/eval.py +++ b/QEfficient/finetune/eval.py @@ -109,13 +109,13 @@ def main(**kwargs): pin_memory=True, **val_dl_kwargs, ) - logger.info(f"Num of Validation Set Batches loaded = {len(eval_dataloader)}") + logger.log_rank_zero(f"Num of Validation Set Batches loaded = {len(eval_dataloader)}") if len(eval_dataloader) == 0: raise ValueError( f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})" ) else: - logger.info(f"Num of Validation Set Batches loaded = {len(eval_dataloader)}") + logger.log_rank_zero(f"Num of Validation Set Batches loaded = {len(eval_dataloader)}") model.to(device) _ = evaluation(model, train_config, eval_dataloader, None, tokenizer, device) diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py index 9089fffe1..2c38ecd2c 100644 --- a/QEfficient/finetune/utils/train_utils.py +++ b/QEfficient/finetune/utils/train_utils.py @@ -19,7 +19,7 @@ from tqdm import tqdm from QEfficient.finetune.configs.training import TrainConfig -from QEfficient.utils.logging_utils import ft_logger as logger +from QEfficient.utils.logging_utils import logger try: import torch_qaic # noqa: F401 @@ -28,7 +28,7 @@ import torch_qaic.utils as qaic_utils # noqa: F401 from torch.qaic.amp import GradScaler as QAicGradScaler except ImportError as e: - logger.warning(f"{e}. Moving ahead without these qaic modules.") + logger.log_rank_zero(f"{e}. Moving ahead without these qaic modules.") from torch.amp import GradScaler @@ -110,22 +110,21 @@ def train( # Start the training loop for epoch in range(train_config.num_epochs): if loss_0_counter.item() == train_config.convergence_counter: - if (not train_config.enable_ddp) or (train_config.enable_ddp and local_rank == 0): - logger.info( - f"Skipping epoch {epoch + 1} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps." - ) - break + logger.log_rank_zero( + f"Skipping epoch {epoch + 1} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps." 
+ ) + break if train_config.use_peft and train_config.from_peft_checkpoint: intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-2].split("_")[-1]) - 1 if epoch < intermediate_epoch: - logger.info(f"Skipping epoch {epoch + 1} since fine tuning has already completed for it.") + logger.log_rank_zero(f"Skipping epoch {epoch + 1} since fine tuning has already completed for it.") # to bring the count of train_step in sync with where it left off total_train_steps += len(train_dataloader) continue - logger.info(f"Starting epoch {epoch + 1}/{train_config.num_epochs}") - logger.info(f"train_config.max_train_step: {train_config.max_train_step}") + logger.log_rank_zero(f"Starting epoch {epoch + 1}/{train_config.num_epochs}") + logger.log_rank_zero(f"train_config.max_train_step: {train_config.max_train_step}") # stop when the maximum number of training steps is reached if max_steps_reached: break @@ -152,7 +151,7 @@ def train( # to bring the count of train_step in sync with where it left off if epoch == intermediate_epoch and step == 0: total_train_steps += intermediate_step - logger.info( + logger.log_rank_zero( f"Skipping first {intermediate_step} steps for epoch {epoch + 1}, since fine tuning has already completed for it." ) if epoch == intermediate_epoch and step < intermediate_step: @@ -264,12 +263,11 @@ def train( val_step_metric, val_metric, ) - if (not train_config.enable_ddp) or (train_config.enable_ddp and local_rank == 0): - if loss_0_counter.item() == train_config.convergence_counter: - logger.info( - f"Loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps.Hence,stopping the fine tuning." - ) - break + if loss_0_counter.item() == train_config.convergence_counter: + logger.log_rank_zero( + f"Loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps.Hence,stopping the fine tuning." + ) + break pbar.close() epoch_end_time = time.perf_counter() - epoch_start_time @@ -328,15 +326,15 @@ def train( if train_config.run_validation: if eval_epoch_loss < best_val_loss: best_val_loss = eval_epoch_loss - logger.info(f"best eval loss on epoch {epoch + 1} is {best_val_loss}") + logger.log_rank_zero(f"best eval loss on epoch {epoch + 1} is {best_val_loss}") val_loss.append(float(eval_epoch_loss)) val_metric.append(float(eval_metric)) if train_config.task_type == "seq_classification": - logger.info( + logger.log_rank_zero( f"Epoch {epoch + 1}: train_acc={metric_val:.4f}, train_epoch_loss={train_epoch_loss:.4f}, epoch time {epoch_end_time}s" ) else: - logger.info( + logger.log_rank_zero( f"Epoch {epoch + 1}: train_metric={metric_val:.4f}, train_epoch_loss={train_epoch_loss:.4f}, epoch time {epoch_end_time}s" ) @@ -440,7 +438,7 @@ def evaluation_helper(model, train_config, eval_dataloader, device): eval_metric = torch.exp(eval_epoch_loss) # Print evaluation metrics - logger.info(f"{eval_metric.detach().cpu()=} {eval_epoch_loss.detach().cpu()=}") + logger.log_rank_zero(f"{eval_metric.detach().cpu()=} {eval_epoch_loss.detach().cpu()=}") return eval_epoch_loss, eval_metric, val_step_loss, val_step_metric @@ -467,12 +465,23 @@ def print_model_size(model, config) -> None: Args: model: The PyTorch model. - model_name (str): Name of the model. + config : Config of the model. 
""" - - logger.info(f"Model : {config.model_name}") total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) - logger.info(f"{config.model_name} has {total_params / 1e6} Million params\n") + logger.log_rank_zero(f"{config.model_name} has {total_params / 1e6} Million params.") + + +def print_trainable_parameters(model) -> None: + """ + Print the number of trainable parameters, all params and percentage of trainablke params. + + Args: + model: The PyTorch model. + """ + trainable_params, all_param = model.get_nb_trainable_parameters() + logger.log_rank_zero( + f"trainable params: {trainable_params:,d} || all params: {all_param:,d} || trainable%: {100 * trainable_params / all_param:.4f}" + ) def save_to_json( diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index f8bc5753c..4e631f5eb 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -36,7 +36,7 @@ class DownloadRetryLimitExceeded(Exception): def login_and_download_hf_lm(model_name, *args, **kwargs): - logger.info(f"loading HuggingFace model for {model_name}") + logger.log_rank_zero(f"loading HuggingFace model for {model_name}") hf_token = kwargs.pop("hf_token", None) cache_dir = kwargs.pop("cache_dir", None) if hf_token is not None: diff --git a/QEfficient/utils/logging_utils.py b/QEfficient/utils/logging_utils.py index 8ed7d2c7d..29ac39936 100644 --- a/QEfficient/utils/logging_utils.py +++ b/QEfficient/utils/logging_utils.py @@ -6,6 +6,12 @@ # ----------------------------------------------------------------------------- import logging +import os +from datetime import datetime + +import torch.distributed as dist + +from QEfficient.utils.constants import ROOT_DIR class QEffFormatter(logging.Formatter): @@ -44,76 +50,60 @@ def create_logger() -> logging.Logger: """ logger = logging.getLogger("QEfficient") - # create console handler and set level to debug + # create console handler and set level ch = logging.StreamHandler() ch.setLevel(logging.INFO) - # define formatter ch.setFormatter(QEffFormatter()) - logger.addHandler(ch) - return logger - - -# Define the logger object that can be used for logging purposes throughout the module. -logger = create_logger() - -def create_ft_logger(log_file="finetune.log") -> logging.Logger: - """ - Creates a logger object with Colored QEffFormatter. - """ - logger = logging.getLogger("QEfficient") - - # create console handler and set level to debug - ch = logging.StreamHandler() - ch.setLevel(logging.INFO) - ch.setFormatter(QEffFormatter()) - logger.addHandler(ch) - - # create file handler and set level to debug - fh = logging.FileHandler(log_file) - fh.setLevel(logging.INFO) - fh.setFormatter(QEffFormatter()) - logger.addHandler(fh) + dump_logs = True + if dump_logs: + logs_path = os.path.join(ROOT_DIR, "logs") + if not os.path.exists(logs_path): + os.makedirs(logs_path, exist_ok=True) + file_name = f"log-file-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}" + ".txt" + log_file = os.path.join(logs_path, file_name) + + # create file handler and set level + fh = logging.FileHandler(log_file) + fh.setLevel(logging.INFO) + formatter = logging.Formatter("%(levelname)s - %(name)s - %(message)s") + fh.setFormatter(formatter) + logger.addHandler(fh) return logger -# Define the logger object that can be used for logging purposes throughout the finetuning module. 
-ft_logger = create_ft_logger() -""" - -class FT_Logger: - def __init__(self, level=logging.INFO, log_file="finetune.log"): - self.logger = logging.getLogger("QEfficient") - self.logger.setLevel(level) - self.level = level - - # Create handlers - self.file_handler = logging.FileHandler(log_file) - self.console_handler = logging.StreamHandler() - - self.file_handler.setFormatter(QEffFormatter()) - self.console_handler.setFormatter(QEffFormatter()) - - # Add handlers to the logger - self.logger.addHandler(self.file_handler) - self.logger.addHandler(self.console_handler) - - def get_logger(self): - return self.logger - - def raise_valueerror(self, message): - self.logger.error(message) - raise ValueError(message) - +class CustomLogger(logging.Logger): def raise_runtimeerror(self, message): - self.logger.error(message) + self.error(message) raise RuntimeError(message) - - def raise_filenotfounderror(self, message): - self.logger.error(message) - raise FileNotFoundError(message) -ft_logger = FT_Logger().get_logger() + def log_rank_zero(self, msg: str, level: int = logging.INFO) -> None: + rank = dist.get_rank() if dist.is_available() and dist.is_initialized() else 0 + if rank != 0: + return + self.log(level, msg, stacklevel=2) + + +""" def dump_logs(self, dump_logs=True): + if dump_logs: + logs_path = os.path.join(ROOT_DIR, "logs") + if not os.path.exists(logs_path): + os.makedirs(logs_path, exist_ok=True) + file_name = f"log-file-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}" + ".txt" + log_file = os.path.join(logs_path, file_name) + + # create file handler and set level + fh = logging.FileHandler(log_file) + fh.setLevel(logging.INFO) + formatter = logging.Formatter("%(levelname)s - %(name)s - %(message)s") + fh.setFormatter(formatter) + logger.addHandler(fh) """ + + +logging.setLoggerClass(CustomLogger) + +# Define the logger object that can be used for logging purposes throughout the module. 
+logger = create_logger() From f97b24e0f56fb60e71dfa95165c74449b6a6d240 Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Mon, 26 May 2025 13:47:47 +0000 Subject: [PATCH 4/4] set log_level and dump_logs flag Signed-off-by: Mamta Singh --- QEfficient/cloud/finetune.py | 6 +++--- QEfficient/cloud/infer.py | 14 +++++++++++++- QEfficient/finetune/configs/training.py | 3 +++ QEfficient/utils/logging_utils.py | 19 +------------------ 4 files changed, 20 insertions(+), 22 deletions(-) diff --git a/QEfficient/cloud/finetune.py b/QEfficient/cloud/finetune.py index 8dd572564..3321e6324 100644 --- a/QEfficient/cloud/finetune.py +++ b/QEfficient/cloud/finetune.py @@ -5,7 +5,6 @@ # # ----------------------------------------------------------------------------- -import logging import random import warnings from typing import Any, Dict, Optional, Union @@ -41,8 +40,6 @@ from QEfficient.utils._utils import login_and_download_hf_lm from QEfficient.utils.logging_utils import logger -logger.setLevel(logging.INFO) - # Try importing QAIC-specific module, proceed without it if unavailable try: import torch_qaic # noqa: F401 @@ -330,6 +327,9 @@ def main(peft_config_file: str = None, **kwargs) -> None: dataset_config = generate_dataset_config(train_config.dataset) update_config(dataset_config, **kwargs) + logger.prepare_dump_logs(train_config.dump_logs) + logger.setLevel(train_config.log_level) + setup_distributed_training(train_config) setup_seeds(train_config.seed) model, tokenizer = load_model_and_tokenizer(train_config, dataset_config, peft_config_file, **kwargs) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 43bbda0ba..4ced552f3 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -311,6 +311,13 @@ def main( action="store_true", help="pass to print info logs", ) + parser.add_argument( + "--log_level", + "--log-level", + type=int, + default=20, + help="set the Log level {NOTSET:0, DEBUG:10, INFO:20, WARNING:30, ERROR:40, CRITICAL:50}", + ) parser.add_argument( "--full_batch_size", "--full-batch-size", @@ -353,6 +360,11 @@ def main( ) compiler_options_dict[key] = value if args.verbose: - logger.setLevel(logging.INFO) + logger.prepare_dump_logs(args.verbose) + if args.log_level: + logger.setLevel(args.log_level) + else: + logger.setLevel(logging.INFO) del args.verbose # type: ignore + del args.log_level # type: ignore main(**args.__dict__, **compiler_options_dict) diff --git a/QEfficient/finetune/configs/training.py b/QEfficient/finetune/configs/training.py index 1698e680b..d95d08679 100644 --- a/QEfficient/finetune/configs/training.py +++ b/QEfficient/finetune/configs/training.py @@ -4,6 +4,8 @@ # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- + +import logging from dataclasses import dataclass @@ -107,3 +109,4 @@ class TrainConfig: opByOpVerifier: bool = False dump_logs: bool = True + log_level: str = logging.INFO diff --git a/QEfficient/utils/logging_utils.py b/QEfficient/utils/logging_utils.py index 29ac39936..d62168ed5 100644 --- a/QEfficient/utils/logging_utils.py +++ b/QEfficient/utils/logging_utils.py @@ -56,21 +56,6 @@ def create_logger() -> logging.Logger: ch.setFormatter(QEffFormatter()) logger.addHandler(ch) - dump_logs = True - if dump_logs: - logs_path = os.path.join(ROOT_DIR, "logs") - if not os.path.exists(logs_path): - os.makedirs(logs_path, exist_ok=True) - file_name = f"log-file-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}" + ".txt" - log_file = os.path.join(logs_path, 
file_name) - - # create file handler and set level - fh = logging.FileHandler(log_file) - fh.setLevel(logging.INFO) - formatter = logging.Formatter("%(levelname)s - %(name)s - %(message)s") - fh.setFormatter(formatter) - logger.addHandler(fh) - return logger @@ -85,8 +70,7 @@ def log_rank_zero(self, msg: str, level: int = logging.INFO) -> None: return self.log(level, msg, stacklevel=2) - -""" def dump_logs(self, dump_logs=True): + def prepare_dump_logs(self, dump_logs=False): if dump_logs: logs_path = os.path.join(ROOT_DIR, "logs") if not os.path.exists(logs_path): @@ -100,7 +84,6 @@ def log_rank_zero(self, msg: str, level: int = logging.INFO) -> None: formatter = logging.Formatter("%(levelname)s - %(name)s - %(message)s") fh.setFormatter(formatter) logger.addHandler(fh) -""" logging.setLoggerClass(CustomLogger)
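
The sketch below illustrates how the logging flow that this series converges on is meant to be consumed. It is an illustrative addendum rather than part of any patch: it assumes the patched QEfficient package is importable and uses placeholder messages, but the calls themselves (prepare_dump_logs, setLevel, log_rank_zero, raise_runtimeerror) and the TrainConfig fields dump_logs/log_level are the ones introduced in the diffs above.

import logging

from QEfficient.finetune.configs.training import TrainConfig
from QEfficient.utils.logging_utils import logger  # shared "QEfficient" logger, a CustomLogger after patch 3/4

train_config = TrainConfig()                       # dump_logs=True, log_level=logging.INFO by default (patch 4)
logger.prepare_dump_logs(train_config.dump_logs)   # optionally attach a file handler under <ROOT_DIR>/logs
logger.setLevel(train_config.log_level)

# log_rank_zero() emits only on rank 0 when torch.distributed is initialized,
# and on every process otherwise.
logger.log_rank_zero("Starting fine tuning run.")
logger.log_rank_zero("Resizing the embedding matrix to match the tokenizer vocab size.", logging.WARNING)

# raise_runtimeerror() logs the message at ERROR level, then raises RuntimeError with it.
try:
    logger.raise_runtimeerror("Example failure path: message is logged before the exception propagates.")
except RuntimeError:
    pass

This mirrors the call sites in QEfficient/cloud/finetune.py after patch 4. Because logging_utils calls logging.setLoggerClass(CustomLogger) before building the module-level logger, every module that imports logger from QEfficient.utils.logging_utils shares the same rank-aware instance.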