
Commit 3d8a53e

mamtsing authored and quic-mamta committed
modify error handling
Signed-off-by: Mamta Singh <[email protected]>
1 parent 1e1519b commit 3d8a53e

12 files changed: +76 −60 lines changed


QEfficient/cloud/finetune.py

Lines changed: 10 additions & 9 deletions

@@ -5,6 +5,7 @@
 #
 # -----------------------------------------------------------------------------

+import logging
 import random
 import warnings
 from typing import Any, Dict, Optional, Union
@@ -40,7 +41,7 @@
 try:
     import torch_qaic  # noqa: F401
 except ImportError as e:
-    logger.log_rank_zero(f"{e}. Moving ahead without these qaic modules.")
+    logger.log_rank_zero(f"{e}. Moving ahead without these qaic modules.", logging.WARNING)


 # Suppress all warnings
@@ -121,7 +122,7 @@ def load_model_and_tokenizer(
     )

     if not hasattr(model, "base_model_prefix"):
-        logger.raise_runtimeerror("Given huggingface model does not have 'base_model_prefix' attribute.")
+        logger.raise_error("Given huggingface model does not have 'base_model_prefix' attribute.", RuntimeError)

     for param in getattr(model, model.base_model_prefix).parameters():
         param.requires_grad = False
@@ -146,7 +147,7 @@ def load_model_and_tokenizer(
     # If there is a mismatch between tokenizer vocab size and embedding matrix,
     # throw a warning and then expand the embedding matrix
     if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
-        logger.log_rank_zero("Resizing the embedding matrix to match the tokenizer vocab size.", logger.WARNING)
+        logger.log_rank_zero("Resizing the embedding matrix to match the tokenizer vocab size.", logging.WARNING)
         model.resize_token_embeddings(len(tokenizer))

     print_model_size(model)
@@ -161,8 +162,8 @@ def load_model_and_tokenizer(
         if hasattr(model, "supports_gradient_checkpointing") and model.supports_gradient_checkpointing:
             model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"preserve_rng_state": False})
         else:
-            logger.raise_runtimeerror(
-                "Given model doesn't support gradient checkpointing. Please disable it and run it."
+            logger.raise_error(
+                "Given model doesn't support gradient checkpointing. Please disable it and run it.", RuntimeError
             )

     model = apply_peft(model, train_config, peft_config_file, **kwargs)
@@ -237,8 +238,9 @@ def setup_dataloaders(
     if train_config.run_validation:
         eval_dataloader = get_dataloader(tokenizer, dataset_config, train_config, split="val")
         if len(eval_dataloader) == 0:
-            logger.raise_runtimeerror(
-                f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})"
+            logger.raise_error(
+                f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})",
+                ValueError,
             )
         else:
             logger.log_rank_zero(f"Number of Validation Set Batches loaded = {len(eval_dataloader)}")
@@ -280,8 +282,7 @@ def main(peft_config_file: str = None, **kwargs) -> None:
     dataset_config = generate_dataset_config(train_config.dataset)
     update_config(dataset_config, **kwargs)

-    logger.prepare_dump_logs(train_config.dump_logs)
-    logger.setLevel(train_config.log_level)
+    logger.prepare_for_logs(train_config.output_dir, train_config.dump_logs, train_config.log_level)

     setup_distributed_training(train_config)
     setup_seeds(train_config.seed)
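The calling convention this commit applies across the finetune code is small enough to sketch. A minimal illustration follows (both helpers are defined in QEfficient/finetune/utils/logging_utils.py further down in this diff; the try/except here is only so the snippet runs to completion):

import logging

from QEfficient.finetune.utils.logging_utils import logger

# Old pattern (removed in this commit): logger.raise_runtimeerror(msg) logged and always raised RuntimeError.
# New pattern: log the message, then raise a caller-chosen exception type.
try:
    logger.raise_error("Given model doesn't support gradient checkpointing. Please disable it and run it.", RuntimeError)
except RuntimeError:
    pass  # demonstration only; real callers let the exception propagate

# Rank-zero log calls can now pass an explicit logging level for warnings.
logger.log_rank_zero("Resizing the embedding matrix to match the tokenizer vocab size.", logging.WARNING)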

QEfficient/finetune/configs/training.py

Lines changed: 0 additions & 1 deletion

@@ -95,7 +95,6 @@ class TrainConfig:
     use_profiler: bool = False  # Enable pytorch profiler, can not be used with flop counter at the same time.
     # profiler_dir: str = "PATH/to/save/profiler/results" # will be used if using profiler

-    dump_root_dir: str = "mismatches/step_"
     opByOpVerifier: bool = False

     dump_logs: bool = True

QEfficient/finetune/dataset/alpaca_dataset.py

Lines changed: 9 additions & 1 deletion

@@ -11,6 +11,8 @@
 import torch
 from torch.utils.data import Dataset

+from QEfficient.finetune.utils.logging_utils import logger
+
 PROMPT_DICT = {
     "prompt_input": (
         "Below is an instruction that describes a task, paired with an input that provides further context. "
@@ -27,7 +29,13 @@

 class InstructionDataset(Dataset):
     def __init__(self, dataset_config, tokenizer, partition="train", context_length=None):
-        self.ann = json.load(open(dataset_config.data_path))
+        try:
+            self.ann = json.load(open(dataset_config.data_path))
+        except FileNotFoundError:
+            logger.raise_error(
+                "Loading of alpaca dataset failed! Please use (wget -c https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/refs/heads/main/alpaca_data.json -P dataset/) to download the alpaca dataset.",
+                FileNotFoundError,
+            )
         # Use 5% of the dataset for evaluation
         eval_length = int(len(self.ann) / 20)
         if partition == "train":

QEfficient/finetune/dataset/custom_dataset.py

Lines changed: 11 additions & 6 deletions

@@ -32,18 +32,21 @@ def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=Non
     module_path, func_name = dataset_config.file, "get_custom_dataset"

     if not module_path.endswith(".py"):
-        logger.raise_runtimeerror(f"Dataset file {module_path} is not a .py file.")
+        logger.raise_error(f"Dataset file {module_path} is not a .py file.", ValueError)

     module_path = Path(module_path)
     if not module_path.is_file():
-        logger.raise_runtimeerror(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.")
+        logger.raise_error(
+            f"Dataset py file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
+        )

     module = load_module_from_py_file(module_path.as_posix())
     try:
         return getattr(module, func_name)(dataset_config, tokenizer, split, context_length)
     except AttributeError:
-        logger.raise_runtimeerror(
-            f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()})."
+        logger.raise_error(
+            f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()}).",
+            AttributeError,
         )


@@ -54,11 +57,13 @@ def get_data_collator(dataset_processer, dataset_config):
     module_path, func_name = dataset_config.file, "get_data_collator"

     if not module_path.endswith(".py"):
-        logger.raise_runtimeerror(f"Dataset file {module_path} is not a .py file.")
+        logger.raise_error(f"Dataset file {module_path} is not a .py file.", ValueError)

     module_path = Path(module_path)
     if not module_path.is_file():
-        logger.raise_runtimeerror(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.")
+        logger.raise_error(
+            f"Dataset py file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
+        )

     module = load_module_from_py_file(module_path.as_posix())
     try:

QEfficient/finetune/dataset/grammar_dataset.py

Lines changed: 4 additions & 4 deletions

@@ -21,11 +21,11 @@ def __init__(self, tokenizer, csv_name=None, context_length=None):
                 data_files={"train": [csv_name]},  # "eval": "grammar_validation.csv"},
                 delimiter=",",
             )
-        except Exception as e:
-            logger.raise_runtimeerror(
-                "Loading of grammar dataset failed! Please check (https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/datasets/grammar_dataset/grammar_dataset_process.ipynb) for details on how to download the dataset."
+        except FileNotFoundError:
+            logger.raise_error(
+                "Loading of grammar dataset failed! Please check (https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/datasets/grammar_dataset/grammar_dataset_process.ipynb) for details on how to download the dataset.",
+                FileNotFoundError,
             )
-            raise e

         self.context_length = context_length
         self.tokenizer = tokenizer

QEfficient/finetune/eval.py

Lines changed: 5 additions & 4 deletions

@@ -26,7 +26,7 @@

     device = "qaic:0"
 except ImportError as e:
-    logger.warning(f"{e}. Moving ahead without these qaic modules.")
+    logger.log_rank_zero(f"{e}. Moving ahead without these qaic modules.")
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

 # Suppress all warnings
@@ -78,16 +78,17 @@ def main(**kwargs):
     # If there is a mismatch between tokenizer vocab size and embedding matrix,
     # throw a warning and then expand the embedding matrix
     if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
-        logger.warning("Resizing the embedding matrix to match the tokenizer vocab size.")
+        logger.log_rank_zero("Resizing the embedding matrix to match the tokenizer vocab size.")
         model.resize_token_embeddings(len(tokenizer))

     print_model_size(model)

     if train_config.run_validation:
         eval_dataloader = get_dataloader(tokenizer, dataset_config, train_config, split="test")
         if len(eval_dataloader) == 0:
-            raise ValueError(
-                f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})"
+            logger.raise_error(
+                f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})",
+                ValueError,
             )
         else:
             logger.log_rank_zero(f"Number of Validation Set Batches loaded = {len(eval_dataloader)}")

QEfficient/finetune/utils/config_utils.py

Lines changed: 15 additions & 11 deletions

@@ -44,7 +44,9 @@ def update_config(config, **kwargs):
                 if hasattr(config, param_name):
                     setattr(config, param_name, v)
                 else:
-                    raise ValueError(f"Config '{config_name}' does not have parameter: '{param_name}'")
+                    logger.raise_error(
+                        f"Config '{config_name}' does not have parameter: '{param_name}'", ValueError
+                    )
             else:
                 config_type = type(config).__name__
                 logger.debug(f"Unknown parameter '{k}' for config type '{config_type}'")
@@ -70,7 +72,7 @@ def generate_peft_config(train_config: TrainConfig, peft_config_file: str = None
     else:
         config_map = {"lora": (LoraConfig, PeftLoraConfig)}
         if train_config.peft_method not in config_map:
-            raise RuntimeError(f"Peft config not found: {train_config.peft_method}")
+            logger.raise_error(f"Peft config not found: {train_config.peft_method}", RuntimeError)

         config_cls, peft_config_cls = config_map[train_config.peft_method]
         if config_cls is None:
@@ -119,7 +121,7 @@ def validate_config(config_data: Dict[str, Any], config_type: str = "lora") -> N
        - Ensures types match expected values (int, float, list, etc.).
     """
     if config_type.lower() != "lora":
-        raise ValueError(f"Unsupported config_type: {config_type}. Only 'lora' is supported.")
+        logger.raise_error(f"Unsupported config_type: {config_type}. Only 'lora' is supported.", ValueError)

     required_fields = {
         "r": int,
@@ -136,26 +138,28 @@ def validate_config(config_data: Dict[str, Any], config_type: str = "lora") -> N
     # Check for missing required fields
     missing_fields = [field for field in required_fields if field not in config_data]
     if missing_fields:
-        raise ValueError(f"Missing required fields in {config_type} config: {missing_fields}")
+        logger.raise_error(f"Missing required fields in {config_type} config: {missing_fields}", ValueError)

     # Validate types of required fields
     for field, expected_type in required_fields.items():
         if not isinstance(config_data[field], expected_type):
-            raise ValueError(
+            logger.raise_error(
                 f"Field '{field}' in {config_type} config must be of type {expected_type.__name__}, "
-                f"got {type(config_data[field]).__name__}"
+                f"got {type(config_data[field]).__name__}",
+                ValueError,
             )

     # Validate target_modules contains strings
     if not all(isinstance(mod, str) for mod in config_data["target_modules"]):
-        raise ValueError("All elements in 'target_modules' must be strings")
+        logger.raise_error("All elements in 'target_modules' must be strings", ValueError)

     # Validate types of optional fields if present
     for field, expected_type in optional_fields.items():
         if field in config_data and not isinstance(config_data[field], expected_type):
-            raise ValueError(
+            logger.raise_error(
                 f"Field '{field}' in {config_type} config must be of type {expected_type.__name__}, "
-                f"got {type(config_data[field]).__name__}"
+                f"got {type(config_data[field]).__name__}",
+                ValueError,
             )


@@ -173,12 +177,12 @@ def load_config_file(config_path: str) -> Dict[str, Any]:
         ValueError: If the file format is unsupported.
     """
     if not os.path.exists(config_path):
-        raise FileNotFoundError(f"Config file not found: {config_path}")
+        logger.raise_error(f"Config file not found: {config_path}", FileNotFoundError)

     with open(config_path, "r") as f:
         if config_path.endswith(".yaml") or config_path.endswith(".yml"):
             return yaml.safe_load(f)
         elif config_path.endswith(".json"):
             return json.load(f)
         else:
-            raise ValueError("Unsupported config file format. Use .yaml, .yml, or .json")
+            logger.raise_error("Unsupported config file format. Use .yaml, .yml, or .json", ValueError)

QEfficient/finetune/utils/dataset_utils.py

Lines changed: 4 additions & 3 deletions

@@ -18,7 +18,7 @@ def get_preprocessed_dataset(
     tokenizer, dataset_config, split: str = "train", context_length: int = None
 ) -> torch.utils.data.Dataset:
     if dataset_config.dataset not in DATASET_PREPROC:
-        raise NotImplementedError(f"{dataset_config.dataset} is not (yet) implemented")
+        logger.raise_error(f"{dataset_config.dataset} is not (yet) implemented", NotImplementedError)

     def get_split():
         return dataset_config.train_split if split == "train" else dataset_config.test_split
@@ -39,8 +39,9 @@ def get_dataloader_kwargs(train_config, dataset, dataset_processer, split):
     if train_config.enable_ddp:
         if train_config.enable_sorting_for_ddp:
             if train_config.context_length:
-                raise ValueError(
-                    "Sorting cannot be done with padding, Please disable sorting or pass context_length as None to disable padding"
+                logger.raise_error(
+                    "Sorting cannot be done with padding, Please disable sorting or pass context_length as None to disable padding",
+                    ValueError,
                 )
             else:
                 kwargs["batch_sampler"] = DistributedLengthBasedBatchSampler(

QEfficient/finetune/utils/logging_utils.py

Lines changed: 8 additions & 8 deletions

@@ -10,29 +10,29 @@
 from datetime import datetime

 from QEfficient.finetune.utils.helper import is_rank_zero
-from QEfficient.utils.constants import ROOT_DIR


 class FTLogger:
-    def __init__(self, level=logging.DEBUG):
+    def __init__(self):
         self.logger = logging.getLogger("QEfficient")
         if not getattr(self.logger, "_custom_methods_added", False):
             self._bind_custom_methods()
             self.logger._custom_methods_added = True  # Prevent adding handlers/methods twice

     def _bind_custom_methods(self):
-        def raise_runtimeerror(message):
+        def raise_error(message, errortype=RuntimeError):
             self.logger.error(message)
-            raise RuntimeError(message)
+            raise errortype(message)

         def log_rank_zero(msg: str, level: int = logging.INFO):
             if not is_rank_zero:
                 return
             self.logger.log(level, msg, stacklevel=2)

-        def prepare_dump_logs(dump_logs=False, level=logging.INFO):
+        def prepare_for_logs(output_path, dump_logs=False, level=logging.INFO):
+            self.logger.setLevel(level)
             if dump_logs:
-                logs_path = os.path.join(ROOT_DIR, "logs")
+                logs_path = os.path.join(output_path, "logs")
                 if not os.path.exists(logs_path):
                     os.makedirs(logs_path, exist_ok=True)
                 file_name = f"log-file-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}" + ".txt"
@@ -44,9 +44,9 @@ def prepare_dump_logs(dump_logs=False, level=logging.INFO):
                 fh.setFormatter(formatter)
                 self.logger.addHandler(fh)

-        self.logger.raise_runtimeerror = raise_runtimeerror
+        self.logger.raise_error = raise_error
         self.logger.log_rank_zero = log_rank_zero
-        self.logger.prepare_dump_logs = prepare_dump_logs
+        self.logger.prepare_for_logs = prepare_for_logs

     def get_logger(self):
         return self.logger
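For reference, a minimal sketch of the reworked logging setup from a caller's point of view, mirroring the single prepare_for_logs call now made in QEfficient/cloud/finetune.py. The literal output directory below is an illustrative assumption, not a value from this commit:

import logging

from QEfficient.finetune.utils.logging_utils import logger

output_dir = "./results"  # assumption: stands in for train_config.output_dir

# Sets the logger level and, when dump_logs is True, attaches a file handler
# that writes timestamped log files under <output_dir>/logs/
# (replaces the old prepare_dump_logs + setLevel pair).
logger.prepare_for_logs(output_dir, dump_logs=True, level=logging.INFO)

logger.log_rank_zero("Fine-tuning logs will be dumped under the output directory.")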

QEfficient/finetune/utils/parser.py

Lines changed: 5 additions & 9 deletions

@@ -254,18 +254,14 @@ def get_finetune_parser():
         action="store_true",
         help="Enable distributed data parallel training. This will load the replicas of model on given number of devices and train the model. This should be used using torchrun interface. Please check docs for exact usage.",
     )
-    parser.add_argument(
-        "--dump_root_dir",
-        "--dump-root-dir",
-        required=False,
-        type=str,
-        default="mismatches/step_",
-        help="Directory for mismatch dumps by opByOpVerifier",
-    )
     parser.add_argument(
         "--opByOpVerifier",
         action="store_true",
-        help="Enable operation-by-operation verification w.r.t reference device(cpu). It is a context manager interface that captures and verifies each operator against reference device. In case results of test & reference do not match under given tolerances, a standalone unittest is generated at dump_root_dir.",
+        help=argparse.SUPPRESS,
+        # This is for debugging purpose only.
+        # Enables operation-by-operation verification w.r.t reference device(cpu).
+        # It is a context manager interface that captures and verifies each operator against reference device.
+        # In case results of test & reference do not match under given tolerances, a standalone unittest is generated at dump_root_dir.
     )

     return parser

QEfficient/finetune/utils/plot_metrics.py

Lines changed: 2 additions & 2 deletions

@@ -69,14 +69,14 @@ def plot_metrics_by_step(data, metric_name, x_label, y_label, colors):

 def plot_metrics(file_path):
     if not os.path.exists(file_path):
-        logger.error(f"File {file_path} does not exist.")
+        logger.raise_error(f"File {file_path} does not exist.", FileNotFoundError)
         return

     with open(file_path, "r") as f:
         try:
             data = json.load(f)
         except json.JSONDecodeError:
-            logger.error("Invalid JSON file.")
+            logger.raise_error("Invalid JSON file.", json.JSONDecodeError)
             return

     directory = os.path.dirname(file_path)

QEfficient/finetune/utils/train_utils.py

Lines changed: 3 additions & 2 deletions

@@ -85,8 +85,9 @@ def train(
     max_steps_reached = False  # Flag to indicate max training steps reached

     tensorboard_updates = None
+    tensorboard_log_dir = train_config.output_dir + "/runs/" + f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
     if is_rank_zero():
-        tensorboard_updates = SummaryWriter()
+        tensorboard_updates = SummaryWriter(log_dir=tensorboard_log_dir)

     device_type = torch.device(device).type

@@ -181,7 +182,7 @@ def train(
                         atol=1e-1,
                         use_ref_output_on_mismatch=True,
                         filter_config=qaic_debug.DispatchFilterConfig.default(device),
-                        dump_root_dir=train_config.dump_root_dir + str(step),
+                        dump_root_dir=train_config.output_dir + "/mismatches/step_" + str(step),
                     ) as verifier:
                         model_outputs = model(**batch)
                         loss = model_outputs.loss  # Forward call
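Combined with the logging changes above, the artifacts of a run now live under train_config.output_dir. A small sketch of the paths implied by this diff (the output_dir value is an illustrative assumption):

import os
from datetime import datetime

output_dir = "./results"  # assumption: stands in for train_config.output_dir

# TensorBoard event files, written only by rank zero:
tensorboard_log_dir = output_dir + "/runs/" + f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"

# opByOpVerifier mismatch dumps, one directory per training step:
dump_root_dir = output_dir + "/mismatches/step_" + str(0)

# Log files created by logger.prepare_for_logs when dump_logs is enabled:
logs_path = os.path.join(output_dir, "logs")

print(tensorboard_log_dir, dump_root_dir, logs_path)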
