diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml
index 3e265d82..25693192 100644
--- a/.github/workflows/deploy.yaml
+++ b/.github/workflows/deploy.yaml
@@ -2,7 +2,7 @@ name: Deploy Mode Validation & Inference

 on:
   push:
-    branches: [main]
+    branches: [main, TRAIN]
   pull_request:
     branches: [main]

diff --git a/tests/test_tools/test_data_loader.py b/tests/test_tools/test_data_loader.py
index 0cfb9c65..005d8ed2 100644
--- a/tests/test_tools/test_data_loader.py
+++ b/tests/test_tools/test_data_loader.py
@@ -12,7 +12,7 @@ def test_create_dataloader_cache(train_cfg: Config):
     train_cfg.task.data.shuffle = False
     train_cfg.task.data.batch_size = 2

-    cache_file = Path("tests/data/train.cache")
+    cache_file = Path("tests/data/images/train.cache")
     cache_file.unlink(missing_ok=True)

     make_cache_loader = create_dataloader(train_cfg.task.data, train_cfg.dataset)

diff --git a/tests/test_tools/test_solver.py b/tests/test_tools/test_solver.py
index 217f6877..822d991c 100644
--- a/tests/test_tools/test_solver.py
+++ b/tests/test_tools/test_solver.py
@@ -17,7 +17,7 @@
 @pytest.fixture
 def model_validator(validation_cfg: Config, model: YOLO, vec2box: Vec2Box, validation_progress_logger, device):
     validator = ModelValidator(
-        validation_cfg.task, validation_cfg.dataset, model, vec2box, validation_progress_logger, device
+        validation_cfg, model, vec2box, validation_progress_logger, device
     )
     return validator

diff --git a/tests/test_utils/test_bounding_box_utils.py b/tests/test_utils/test_bounding_box_utils.py
index 58a9a917..cdaede78 100644
--- a/tests/test_utils/test_bounding_box_utils.py
+++ b/tests/test_utils/test_bounding_box_utils.py
@@ -146,23 +146,62 @@ def test_anc2box_autoanchor(inference_v7_cfg: Config):

 def test_bbox_nms():
-    cls_dist = tensor(
-        [[[0.1, 0.7, 0.2], [0.6, 0.3, 0.1]], [[0.4, 0.4, 0.2], [0.5, 0.4, 0.1]]]  # Example class distribution
+    cls_dist = torch.tensor(
+        [
+            [
+                [0.7, 0.1, 0.2],  # High confidence, class 0
+                [0.3, 0.6, 0.1],  # High confidence, class 1
+                [-3.0, -2.0, -1.0],  # Low confidence, class 2
+                [0.6, 0.2, 0.2],  # Medium confidence, class 0
+            ],
+            [
+                [0.55, 0.25, 0.2],  # Medium confidence, class 0
+                [-4.0, -0.5, -2.0],  # Low confidence, class 1
+                [0.15, 0.2, 0.65],  # Medium confidence, class 2
+                [0.8, 0.1, 0.1],  # High confidence, class 0
+            ],
+        ],
+        dtype=float32,
     )
-    bbox = tensor(
-        [[[50, 50, 100, 100], [60, 60, 110, 110]], [[40, 40, 90, 90], [70, 70, 120, 120]]],  # Example bounding boxes
+    bbox = torch.tensor(
+        [
+            [
+                [0, 0, 160, 120],  # Overlaps with box 4
+                [160, 120, 320, 240],
+                [0, 120, 160, 240],
+                [16, 12, 176, 132],
+            ],
+            [
+                [0, 0, 160, 120],  # Overlaps with box 4
+                [160, 120, 320, 240],
+                [0, 120, 160, 240],
+                [16, 12, 176, 132],
+            ],
+        ],
         dtype=float32,
     )
     nms_cfg = NMSConfig(min_confidence=0.5, min_iou=0.5)

-    expected_output = [
-        tensor(
+    # Batch 1:
+    # - box 1 is kept with class 0, as it has a higher confidence than box 4, i.e. box 4 is filtered out
+    # - box 2 is kept with class 1
+    # - box 3 is rejected by the confidence filter
+    # Batch 2:
+    # - box 4 is kept with class 0, as it has a higher confidence than box 1, i.e. box 1 is filtered out
+    # - box 2 is rejected by the confidence filter
+    # - box 3 is kept with class 2
+    expected_output = torch.tensor(
+        [
             [
-                [1.0000, 50.0000, 50.0000, 100.0000, 100.0000, 0.6682],
-                [0.0000, 60.0000, 60.0000, 110.0000, 110.0000, 0.6457],
-            ]
-        )
-    ]
+                [0.0, 0.0, 0.0, 160.0, 120.0, 0.6682],
+                [1.0, 160.0, 120.0, 320.0, 240.0, 0.6457],
+            ],
+            [
+                [0.0, 16.0, 12.0, 176.0, 132.0, 0.6900],
+                [2.0, 0.0, 120.0, 160.0, 240.0, 0.6570],
+            ],
+        ]
+    )

     output = bbox_nms(cls_dist, bbox, nms_cfg)

@@ -171,13 +210,52 @@

 def test_calculate_map():
-    predictions = tensor([[0, 60, 60, 160, 160, 0.5], [0, 40, 40, 120, 120, 0.5]])  # [class, x1, y1, x2, y2]
-    ground_truths = tensor([[0, 50, 50, 150, 150], [0, 30, 30, 100, 100]])  # [class, x1, y1, x2, y2]
-
-    mAP = calculate_map(predictions, ground_truths)
-
-    expected_ap50 = tensor(0.5)
-    expected_ap50_95 = tensor(0.2)
+    # set up test data
+    predictions = torch.tensor([
+        [0, 60, 60, 160, 160, 0.9],  # [class, x1, y1, x2, y2, confidence]
+        [0, 40, 40, 120, 120, 0.8],
+        [1, 10, 10, 70, 70, 0.7],
+    ])
+    ground_truths = torch.tensor([
+        [0, 50, 50, 150, 150],  # [class, x1, y1, x2, y2]
+        [1, 15, 15, 65, 65],
+        [0, 30, 30, 100, 100],
+    ])
+
+    # test basic functionality
+    result = calculate_map(predictions, ground_truths)
+    assert "mAP.50" in result
+    assert "mAP.5:.95" in result
+    assert 0 <= result["mAP.50"] <= 1
+    assert 0 <= result["mAP.5:.95"] <= 1
+
+    # test class-level metrics
+    assert "class_mAP" in result
+    assert 0 in result["class_mAP"]
+    assert 1 in result["class_mAP"]
+
+    # test custom IoU thresholds
+    custom_thresholds = [0.3, 0.5, 0.7]
+    result_custom = calculate_map(predictions, ground_truths, iou_thresholds=custom_thresholds)
+    assert "mAP.30" in result_custom
+    assert "mAP.50" in result_custom
+    assert "mAP.70" in result_custom
+
+    # test edge cases: empty predictions and empty ground truths
+    empty_predictions = torch.zeros((0, 6))
+    empty_result = calculate_map(empty_predictions, ground_truths)
+    assert empty_result["mAP.50"] == 0
+
+    empty_ground_truths = torch.zeros((0, 5))
+    empty_gt_result = calculate_map(predictions, empty_ground_truths)
+    assert empty_gt_result["mAP.50"] == 0
+
+    # test perfect match
+    perfect_predictions = torch.tensor([
+        [0, 50, 50, 150, 150, 1.0],
+        [0, 30, 30, 100, 100, 1.0],
+        [1, 15, 15, 65, 65, 1.0],
+    ])
+    perfect_result = calculate_map(perfect_predictions, ground_truths)
+    assert perfect_result["mAP.50"] == pytest.approx(1.0, abs=1e-6)

-    assert isclose(mAP["mAP.5"], expected_ap50, atol=1e-5), f"AP50 mismatch"
-    assert isclose(mAP["mAP.5:.95"], expected_ap50_95, atol=1e-5), f"Mean AP mismatch"
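The expected confidences in `test_bbox_nms` line up with a sigmoid over the raw class logits — a quick sanity check, assuming `bbox_nms` applies `torch.sigmoid` to `cls_dist` before thresholding (logits below 0 then fall under the 0.5 `min_confidence` cutoff, which is why the negative-logit rows are rejected):

```python
import torch

# Sanity check (not part of the patch): the expected confidence values in the
# test are the sigmoid of the winning class logits of the kept boxes.
logits = torch.tensor([0.7, 0.6, 0.8, 0.65])
print(torch.sigmoid(logits))  # tensor([0.6682, 0.6457, 0.6900, 0.6570])
```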
diff --git a/yolo/config/dataset/mock.yaml b/yolo/config/dataset/mock.yaml
index c7d58a10..fc73643c 100644
--- a/yolo/config/dataset/mock.yaml
+++ b/yolo/config/dataset/mock.yaml
@@ -1,6 +1,8 @@
 path: tests/data
-train: train
-validation: val
+image_train: images/train
+label_train: annotations/instances_train.json
+image_validation: images/val
+label_validation: annotations/instances_val.json

 class_num: 80
 class_list: ['Person', 'Bicycle', 'Car', 'Motorcycle', 'Airplane', 'Bus', 'Train', 'Truck', 'Boat', 'Traffic light', 'Fire hydrant', 'Stop sign', 'Parking meter', 'Bench', 'Bird', 'Cat', 'Dog', 'Horse', 'Sheep', 'Cow', 'Elephant', 'Bear', 'Zebra', 'Giraffe', 'Backpack', 'Umbrella', 'Handbag', 'Tie', 'Suitcase', 'Frisbee', 'Skis', 'Snowboard', 'Sports ball', 'Kite', 'Baseball bat', 'Baseball glove', 'Skateboard', 'Surfboard', 'Tennis racket', 'Bottle', 'Wine glass', 'Cup', 'Fork', 'Knife', 'Spoon', 'Bowl', 'Banana', 'Apple', 'Sandwich', 'Orange', 'Broccoli', 'Carrot', 'Hot dog', 'Pizza', 'Donut', 'Cake', 'Chair', 'Couch', 'Potted plant', 'Bed', 'Dining table', 'Toilet', 'Tv', 'Laptop', 'Mouse', 'Remote', 'Keyboard', 'Cell phone', 'Microwave', 'Oven', 'Toaster', 'Sink', 'Refrigerator', 'Book', 'Clock', 'Vase', 'Scissors', 'Teddy bear', 'Hair drier', 'Toothbrush']

diff --git a/yolo/config/task/train.yaml b/yolo/config/task/train.yaml
index d3eab6cf..ff5ffbac 100644
--- a/yolo/config/task/train.yaml
+++ b/yolo/config/task/train.yaml
@@ -1,4 +1,5 @@
 task: train
+mode: detection

 defaults:
   - validation: ../validation

diff --git a/yolo/lazy.py b/yolo/lazy.py
index 1bc5577e..24b45701 100644
--- a/yolo/lazy.py
+++ b/yolo/lazy.py
@@ -32,7 +32,7 @@ def main(cfg: Config):
     if cfg.task.task == "train":
         solver = ModelTrainer(cfg, model, converter, progress, device, use_ddp)
     if cfg.task.task == "validation":
-        solver = ModelValidator(cfg.task, cfg.dataset, model, converter, progress, device)
+        solver = ModelValidator(cfg, model, converter, progress, device)
     if cfg.task.task == "inference":
         solver = ModelTester(cfg, model, converter, progress, device)
     progress.start()

diff --git a/yolo/tools/data_loader.py b/yolo/tools/data_loader.py
index 9ceb455f..23720789 100644
--- a/yolo/tools/data_loader.py
+++ b/yolo/tools/data_loader.py
@@ -24,7 +24,7 @@


 class YoloDataset(Dataset):
-    def __init__(self, data_cfg: DataConfig, dataset_cfg: DatasetConfig, phase: str = "train2017"):
+    def __init__(self, data_cfg: DataConfig, dataset_cfg: DatasetConfig, phase: str = "train"):
         augment_cfg = data_cfg.data_augment
         self.image_size = data_cfg.image_size
         phase_name = dataset_cfg.get(phase, phase)
@@ -32,31 +32,65 @@ def __init__(self, data_cfg: DataConfig, dataset_cfg: DatasetConfig, phase: str
         transforms = [eval(aug)(prob) for aug, prob in augment_cfg.items()]
         self.transform = AugmentationComposer(transforms, self.image_size)
         self.transform.get_more_data = self.get_more_data
-        self.data = self.load_data(Path(dataset_cfg.path), phase_name)
-
-    def load_data(self, dataset_path: Path, phase_name: str):
+
+        self.get_dataset_path(cfg=dataset_cfg, phase=phase)
+
+        self.data = []
+        for images_path, labels_path in zip(self.images_paths, self.labels_paths):
+            datas = self.load_data(images_path, labels_path, phase_name)
+            datas = [(images_path / data[0], *data[1:]) for data in datas]
+            self.data.extend(datas)
+
+    def get_dataset_path(self, cfg: DatasetConfig, phase: str = "train"):
+        # normalize the dataset sources (single string or tuple of sources) to lists of paths
+        images_paths = getattr(cfg, "image_" + phase)
+        if isinstance(images_paths, str):
+            images_paths = [images_paths]
+        elif isinstance(images_paths, tuple):
+            images_paths = list(images_paths)
+        self.images_paths = [Path(cfg.path) / images_path for images_path in images_paths]
+
+        labels_paths = getattr(cfg, "label_" + phase)
+        if isinstance(labels_paths, str):
+            labels_paths = [labels_paths]
+        elif isinstance(labels_paths, tuple):
+            labels_paths = list(labels_paths)
+        self.labels_paths = [Path(cfg.path) / labels_path for labels_path in labels_paths]
+
+        assert len(self.images_paths) == len(self.labels_paths)
+
+    def load_data(self, images_path: Path, labels_path: Path, phase_name: str):
         """
         Loads data from a cache or generates a new cache for a specific dataset phase.

         Parameters:
-            dataset_path (Path): The root path to the dataset directory.
+            images_path (Path): The root path to the images directory.
+            labels_path (Path): The root path to the labels directory.
             phase_name (str): The specific phase of the dataset (e.g., 'train', 'test') to load or generate data for.

         Returns:
             dict: The loaded data from the cache for the specified phase.
         """
-        cache_path = dataset_path / f"{phase_name}.cache"
+        cache_path = images_path.with_suffix(".cache")

         if not cache_path.exists():
-            logger.info("🏭 Generating {} cache", phase_name)
-            data = self.filter_data(dataset_path, phase_name)
+            data = self.filter_data(images_path, labels_path, phase_name)
+            logger.info("🏭 Generating {} cache, containing {} samples", phase_name, len(data))
             torch.save(data, cache_path)
         else:
             data = torch.load(cache_path)
-            logger.info("📦 Loaded {} cache", phase_name)
+            logger.info("📦 Loaded {} cache, containing {} samples", phase_name, len(data))
+            # TODO: validate the cache before trusting it, e.g.:
+            # if data[0][0].parent == Path("images") / phase_name:
+            #     logger.info("✅ Cache validation successful")
+            # else:
+            #     logger.warning("⚠️ Cache validation failed, regenerating")
+            #     data = self.filter_data(images_path, labels_path, phase_name)
+            #     torch.save(data, cache_path)
+
         return data

-    def filter_data(self, dataset_path: Path, phase_name: str) -> list:
+    def filter_data(self, images_path: Path, labels_path: Path, phase_name: str) -> list:
         """
         Filters and collects dataset information by pairing images with their corresponding labels.

@@ -67,8 +101,7 @@ def filter_data(self, dataset_path: Path, phase_name: str) -> list:
         Returns:
             list: A list of tuples, each containing the path to an image file and its associated segmentation as a tensor.
         """
-        images_path = dataset_path / "images" / phase_name
-        labels_path, data_type = locate_label_paths(dataset_path, phase_name)
+        labels_path, data_type = locate_label_paths(labels_path, phase_name)
         images_list = sorted([p.name for p in Path(images_path).iterdir() if p.is_file()])
         if data_type == "json":
             annotations_index, image_info_dict = create_image_metadata(labels_path)
@@ -78,30 +111,29 @@
         for image_name in track(images_list, description="Filtering data"):
             if not image_name.lower().endswith((".jpg", ".jpeg", ".png")):
                 continue
-            image_id = Path(image_name).stem

-            if data_type == "json":
-                image_info = image_info_dict.get(image_id, None)
+            if data_type == "json":
+                image_info = image_info_dict.get(image_name, None)
+                # TODO: negative samples (images without annotations) could also be loaded
                 if image_info is None:
                     continue
                 annotations = annotations_index.get(image_info["id"], [])
-                image_seg_annotations = scale_segmentation(annotations, image_info)
+                image_seg_annotations = scale_segmentation(annotations, image_info)  # coco2yolo
                 if not image_seg_annotations:
                     continue
             elif data_type == "txt":
-                label_path = labels_path / f"{image_id}.txt"
+                label_path = labels_path / Path(image_name).with_suffix(".txt")
                 if not label_path.is_file():
                     continue
-                with open(label_path, "r") as file:
-                    image_seg_annotations = [list(map(float, line.strip().split())) for line in file]
+                with label_path.open("r") as f:
+                    image_seg_annotations = [list(map(float, line.strip().split())) for line in f]
             else:
                 image_seg_annotations = []

+            # TODO: clamp out-of-range boxes and log which image file was corrected
+            labels = self.load_valid_labels(images_path / image_name, image_seg_annotations)

-            labels = self.load_valid_labels(image_id, image_seg_annotations)
-
-            img_path = images_path / image_name
-            data.append((img_path, labels))
+            data.append((image_name, labels))
             valid_inputs += 1

         logger.info("Recorded {}/{} valid inputs", valid_inputs, len(images_list))
         return data
@@ -133,6 +165,7 @@ def load_valid_labels(self, label_path: str, seg_data_one_img: list) -> Union[Te
     def get_data(self, idx):
         img_path, bboxes = self.data[idx]
+        # img_path = self.images_path / img_path
         img = Image.open(img_path).convert("RGB")
         return img, bboxes, img_path
@@ -200,7 +233,7 @@ def create_dataloader(data_cfg: DataConfig, dataset_cfg: DatasetConfig, task: st
     if task == "inference":
         return StreamDataLoader(data_cfg)

-    if dataset_cfg.auto_download:
+    if dataset_cfg.get("auto_download", None):
         prepare_dataset(dataset_cfg, task)

     return YoloDataLoader(data_cfg, dataset_cfg, task, use_ddp)
@@ -300,4 +333,4 @@ def stop(self):
         self.thread.join(timeout=1)

     def __len__(self):
-        return self.queue.qsize() if not self.is_stream else 0
+        return self.queue.qsize() if not self.is_stream else 0
\ No newline at end of file
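Under the new config layout, `image_train`/`label_train` (and their validation counterparts) may be a single string or a tuple of sources. A minimal, standalone sketch of the normalization `get_dataset_path` performs (the function name and paths here are illustrative, not part of the patch):

```python
from pathlib import Path
from typing import List, Tuple, Union

def normalize_sources(root: str, sources: Union[str, Tuple[str, ...], List[str]]) -> List[Path]:
    # Accept one source or a tuple/list of sources and anchor each at the dataset root.
    if isinstance(sources, str):
        sources = [sources]
    return [Path(root) / src for src in sources]

# e.g. a hypothetical two-source training set:
print(normalize_sources("tests/data", ("images/train", "images/extra_train")))
# [PosixPath('tests/data/images/train'), PosixPath('tests/data/images/extra_train')]
```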
diff --git a/yolo/tools/solver.py b/yolo/tools/solver.py
index 69953e39..3ca292fa 100644
--- a/yolo/tools/solver.py
+++ b/yolo/tools/solver.py
@@ -21,7 +21,7 @@
 from yolo.tools.drawer import draw_bboxes, draw_model
 from yolo.tools.loss_functions import create_loss_function
 from yolo.utils.bounding_box_utils import Vec2Box, calculate_map
-from yolo.utils.dataset_utils import locate_label_paths
+from yolo.utils.dataset_utils import locate_label_paths, create_image_metadata
 from yolo.utils.logging_utils import ProgressLogger, log_model_structure
 from yolo.utils.model_utils import (
     ExponentialMovingAverage,
@@ -31,7 +31,7 @@
     create_scheduler,
     predicts_to_json,
 )
-from yolo.utils.solver_utils import calculate_ap
+from yolo.utils.solver_utils import calculate_ap, merge_coco_objects


 class ModelTrainer:
@@ -58,13 +58,13 @@ def __init__(self, cfg: Config, model: YOLO, vec2box: Vec2Box, progress: Progres
             self.validation_dataloader = create_dataloader(
                 cfg.task.validation.data, cfg.dataset, cfg.task.validation.task, use_ddp
             )
-            self.validator = ModelValidator(cfg.task.validation, cfg.dataset, model, vec2box, progress, device)
+            self.validator = ModelValidator(cfg, model, vec2box, progress, self.device)

-        if getattr(train_cfg.ema, "enabled", False):
-            self.ema = ExponentialMovingAverage(model, decay=train_cfg.ema.decay)
-        else:
-            self.ema = None
-        self.scaler = GradScaler()
+        self.ema = ExponentialMovingAverage(model, decay=train_cfg.ema.decay) if getattr(train_cfg.ema, "enabled", False) else None
+
+        # only use AMP gradient scaling on CUDA devices
+        self.scaler = GradScaler() if str(self.device).startswith("cuda") else None

     def train_one_batch(self, images: Tensor, targets: Tensor):
         images, targets = images.to(self.device), targets.to(self.device)

@@ -76,11 +76,16 @@ def train_one_batch(self, images: Tensor, targets: Tensor):
             main_predicts = self.vec2box(predicts["Main"])
             loss, loss_item = self.loss_fn(aux_predicts, main_predicts, targets)

-        self.scaler.scale(loss).backward()
-        self.scaler.unscale_(self.optimizer)
-        torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=10.0)
-        self.scaler.step(self.optimizer)
-        self.scaler.update()
+        if self.scaler:
+            self.scaler.scale(loss).backward()
+            self.scaler.unscale_(self.optimizer)
+            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=10.0)
+            self.scaler.step(self.optimizer)
+            self.scaler.update()
+        else:
+            loss.backward()
+            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=10.0)
+            self.optimizer.step()

         return loss_item
@@ -126,11 +131,11 @@ def save_checkpoint(self, epoch_idx: int, file_name: Optional[str] = None):
         torch.save(checkpoint, file_path)

     def good_epoch(self, mAPs: Dict[str, Tensor]) -> bool:
-        save_flag = True
+        save_flag = False
         for mAP_key, mAP_val in mAPs.items():
+            if not self.mAPs_dict[mAP_key] or mAP_val > max(self.mAPs_dict[mAP_key]):
+                save_flag = True
             self.mAPs_dict[mAP_key].append(mAP_val)
-            if mAP_val < max(self.mAPs_dict[mAP_key]):
-                save_flag = False
         return save_flag

     def solve(self, dataloader: DataLoader):
@@ -212,8 +217,7 @@ def solve(self, dataloader: StreamDataLoader):
 class ModelValidator:
     def __init__(
         self,
-        validation_cfg: ValidationConfig,
-        dataset_cfg: DatasetConfig,
+        cfg: Config,
         model: YOLO,
         vec2box: Vec2Box,
         progress: ProgressLogger,
@@ -222,46 +226,84 @@ def __init__(
         self.model = model
         self.device = device
         self.progress = progress

-        self.post_proccess = PostProccess(vec2box, validation_cfg.nms)
+        self.post_proccess = PostProccess(vec2box, cfg.task.validation.nms)
+        # TODO: datasets with YOLO-format (txt) labels don't need this step
         self.json_path = self.progress.save_path / "predict.json"

         with contextlib.redirect_stdout(io.StringIO()):
             # TODO: load with config file
-            json_path, _ = locate_label_paths(Path(dataset_cfg.path), dataset_cfg.get("validation", "val"))
-            if json_path:
-                self.coco_gt = COCO(json_path)
+            labels_paths = getattr(cfg.dataset, "label_" + cfg.task.validation.task)
+            if isinstance(labels_paths, str):
+                labels_paths = [labels_paths]
+            elif isinstance(labels_paths, tuple):
+                labels_paths = list(labels_paths)
+            self.labels_paths = [Path(cfg.dataset.path) / labels_path for labels_path in labels_paths]
+
+            images_paths = getattr(cfg.dataset, "image_" + cfg.task.validation.task)
+            if isinstance(images_paths, str):
+                images_paths = [images_paths]
+            elif isinstance(images_paths, tuple):
+                images_paths = list(images_paths)
+            self.images_paths = [Path(cfg.dataset.path) / images_path for images_path in images_paths]

+            # locate the ground-truth labels of each source
+            self.labels_infos = [
+                locate_label_paths(labels_path, cfg.dataset.get("validation", "validation"))
+                for labels_path in self.labels_paths
+            ]
+
+            # merge the COCO objects; duplicated data is skipped rather than merged
+            coco_objects = [COCO(json_path[0]) for json_path in self.labels_infos if json_path[0] and json_path[1] == "json"]
+            self.coco_gt = merge_coco_objects(coco_objects)

     def solve(self, dataloader, epoch_idx=1):
         # logger.info("🧪 Start Validation!")
         self.model.eval()
         predict_json, mAPs = [], defaultdict(list)
+        # keep each image path only once, mapped to its COCO image record
+        image_info_dicts = {}
+        for images_path, labels_info in zip(self.images_paths, self.labels_infos):
+            # only COCO (json) ground truth supports the extra pycocotools AP evaluation
+            if labels_info[1] == "json":
+                _, image_info_dict = create_image_metadata(labels_info[0])
+                modified_dict = {f"{images_path / key}": value for key, value in image_info_dict.items()}
+                image_info_dicts.update(modified_dict)
+
         self.progress.start_one_epoch(len(dataloader), task="Validate")
         for batch_size, images, targets, rev_tensor, img_paths in dataloader:
             images, targets, rev_tensor = images.to(self.device), targets.to(self.device), rev_tensor.to(self.device)
             with torch.no_grad():
-                predicts = self.model(images)
-                predicts = self.post_proccess(predicts)
+                output = self.model(images)
+                predicts = self.post_proccess(output)
                 for idx, predict in enumerate(predicts):
                     mAP = calculate_map(predict, targets[idx])
                     for mAP_key, mAP_val in mAP.items():
-                        mAPs[mAP_key].append(mAP_val)
+                        if mAP_key != "class_mAP":
+                            mAPs[mAP_key].append(mAP_val)

             avg_mAPs = {key: torch.mean(torch.stack(val)) for key, val in mAPs.items()}
             self.progress.one_batch(avg_mAPs)
-
-            predict_json.extend(predicts_to_json(img_paths, predicts, rev_tensor))
+            if image_info_dicts:
+                predict_json.extend(predicts_to_json(img_paths, image_info_dicts, predicts, rev_tensor))
         self.progress.finish_one_epoch(avg_mAPs, epoch_idx=epoch_idx)
         self.progress.visualize_image(images, targets, predicts, epoch_idx=epoch_idx)

-        with open(self.json_path, "w") as f:
-            predict_json = collect_prediction(predict_json, self.progress.local_rank)
-            if self.progress.local_rank != 0:
-                return
-            json.dump(predict_json, f)
-        if hasattr(self, "coco_gt"):
-            self.progress.start_pycocotools()
-            result = calculate_ap(self.coco_gt, predict_json)
-            self.progress.finish_pycocotools(result, epoch_idx)
+        if predict_json:
+            with open(self.json_path, "w") as f:
+                predict_json = collect_prediction(predict_json, self.progress.local_rank)
+                if self.progress.local_rank != 0:
+                    return
+                json.dump(predict_json, f)
+
+            if self.coco_gt is not None:
+                self.progress.start_pycocotools()
+                result = calculate_ap(self.coco_gt, predict_json)
+                self.progress.finish_pycocotools(result, epoch_idx)
+            else:
+                logger.warning("⚠️ COCO ground truth not found. Please check the dataset configuration.")
+        else:
+            logger.warning("⚠️ No predictions available for evaluation.")

         return avg_mAPs
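The trainer above only builds a `GradScaler` on CUDA and falls back to a plain fp32 step elsewhere. A minimal, self-contained sketch of that pattern (the linear model and SGD optimizer are stand-ins, not the YOLO trainer):

```python
import torch
from torch.cuda.amp import GradScaler

# Stand-in model/optimizer to illustrate the CUDA-only AMP branch in train_one_batch.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = torch.nn.Linear(4, 2).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scaler = GradScaler() if device.type == "cuda" else None

loss = model(torch.randn(8, 4, device=device)).sum()
if scaler:
    # scaled backward pass; unscale before clipping so the norm is measured in real units
    scaler.scale(loss).backward()
    scaler.unscale_(optimizer)
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=10.0)
    scaler.step(optimizer)
    scaler.update()
else:
    # plain fp32 path on CPU
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=10.0)
    optimizer.step()
```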
diff --git a/yolo/utils/bounding_box_utils.py b/yolo/utils/bounding_box_utils.py
index 12d95c58..ab24837d 100644
--- a/yolo/utils/bounding_box_utils.py
+++ b/yolo/utils/bounding_box_utils.py
@@ -13,6 +13,18 @@


 def calculate_iou(bbox1, bbox2, metrics="iou") -> Tensor:
+    """
+    Calculate Intersection over Union (IoU) between two sets of bounding boxes.
+
+    Args:
+        bbox1 (Tensor): First set of bounding boxes. Shape: [A, 4] or [B, A, 4] or [B, Z, A, 4]
+        bbox2 (Tensor): Second set of bounding boxes. Shape: [B, 4] or [B, A, 4] or [B, Z, A, 4]
+        metrics (str, optional): IoU metric to use. Default: "iou".
+
+    Returns:
+        Tensor: IoU scores between each pair of bounding boxes. Shape: [B, A, B] or [B, Z, A, B]
+    """
+
     metrics = metrics.lower()
     EPS = 1e-9
     dtype = bbox1.dtype
@@ -76,6 +88,16 @@ def calculate_iou(bbox1, bbox2, metrics="iou") -> Tensor:


 def transform_bbox(bbox: Tensor, indicator="xywh -> xyxy"):
+    """
+    Transform bounding boxes between different formats.
+
+    Args:
+        bbox (Tensor): Input bounding boxes. Shape: [B, N, 4]
+        indicator (str, optional): Transformation indicator. Default: "xywh -> xyxy"
+
+    Returns:
+        Tensor: Transformed bounding boxes. Shape: [B, N, 4]
+    """
     data_type = bbox.dtype
     in_type, out_type = indicator.replace(" ", "").split("->")
@@ -388,7 +410,7 @@ def bbox_nms(cls_dist: Tensor, bbox: Tensor, nms_cfg: NMSConfig, confidence: Opt
     valid_box = bbox[valid_mask.repeat(1, 1, 4)].view(-1, 4)

     batch_idx, *_ = torch.where(valid_mask)
-    nms_idx = batched_nms(valid_box, valid_cls, batch_idx, nms_cfg.min_iou)
+    nms_idx = batched_nms(valid_box, valid_con, batch_idx, nms_cfg.min_iou)
     predicts_nms = []
     for idx in range(cls_dist.size(0)):
         instance_idx = nms_idx[idx == batch_idx[nms_idx]]
@@ -401,29 +423,73 @@ def bbox_nms(cls_dist: Tensor, bbox: Tensor, nms_cfg: NMSConfig, confidence: Opt
     return predicts_nms


-def calculate_map(predictions, ground_truths, iou_thresholds=arange(0.5, 1, 0.05)) -> Dict[str, Tensor]:
-    # TODO: Refactor this block, Flexible for calculate different mAP condition?
+def calculate_map(
+    predictions: Tensor,
+    ground_truths: Tensor,
+    iou_thresholds: Union[List[float], Tensor] = torch.arange(0.5, 1.0, 0.05),
+    class_metrics: bool = True,
+) -> Dict[str, Union[Tensor, Dict[int, Tensor]]]:
+    """
+    Calculate Mean Average Precision (mAP) for object detection.
+
+    Args:
+        predictions (Tensor): Predicted bounding boxes and scores. Shape: [N, 6] (class, x1, y1, x2, y2, confidence)
+        ground_truths (Tensor): Ground truth bounding boxes. Shape: [M, 5] (class, x1, y1, x2, y2)
+        iou_thresholds (Union[List[float], Tensor]): IoU thresholds for mAP calculation. Default: [0.5, 0.55, ..., 0.95]
+        class_metrics (bool): Whether to calculate class-wise mAP. Default: True
+
+    Returns:
+        Dict[str, Union[Tensor, Dict[int, Tensor]]]: A dictionary containing mAP scores:
+            - mAP.XX: mAP at IoU threshold XX
+            - mAP.5:.95: mean mAP over IoU thresholds 0.5 to 0.95
+            - class_mAP: class-wise mAP scores (if class_metrics is True)
+    """
     device = predictions.device
     n_preds = predictions.size(0)
     n_gts = (ground_truths[:, 0] != -1).sum()
+
+    # handle the case of no predictions
+    if n_preds == 0:
+        mAP = {f"mAP.{round(float(threshold) * 100)}": torch.tensor(0.0, device=device) for threshold in iou_thresholds}
+        mAP["mAP.5:.95"] = torch.tensor(0.0, device=device)
+
+        if class_metrics:
+            unique_classes = torch.unique(ground_truths[:n_gts, 0])
+            class_mAP = {
+                int(cls.item()): {f"mAP.{round(float(threshold) * 100)}": torch.tensor(0.0, device=device) for threshold in iou_thresholds}
+                for cls in unique_classes
+            }
+            for cls in class_mAP:
+                class_mAP[cls]["mAP.5:.95"] = torch.tensor(0.0, device=device)
+            mAP["class_mAP"] = class_mAP
+        logger.info("🧸 Found no predictions")
+        return mAP
+
+    # handle the case of no ground truths (everything is a false positive)
+    if n_gts == 0:
+        mAP = {f"mAP.{round(float(threshold) * 100)}": torch.tensor(0.0, device=device) for threshold in iou_thresholds}
+        mAP["mAP.5:.95"] = torch.tensor(0.0, device=device)
+        if class_metrics:
+            mAP["class_mAP"] = {}
+        return mAP
+
+    # drop the padded ground-truth rows
     ground_truths = ground_truths[:n_gts]
+
+    ious = calculate_iou(predictions[:, 1:-1], ground_truths[:, 1:5])  # [n_preds, n_gts]
+
+    if isinstance(iou_thresholds, list):
+        iou_thresholds = torch.tensor(iou_thresholds, device=device)

     aps = []
-
-    ious = calculate_iou(predictions[:, 1:-1], ground_truths[:, 1:])  # [n_preds, n_gts]
+    class_aps = {} if class_metrics else None

     for threshold in iou_thresholds:
         tp = torch.zeros(n_preds, device=device, dtype=bool)

+        # best-matching ground truth for every prediction
         max_iou, max_indices = ious.max(dim=1)
+        # keep matches whose IoU clears the threshold ...
         above_threshold = max_iou >= threshold
+        # ... and whose predicted class agrees with the matched ground truth
         matched_classes = predictions[:, 0] == ground_truths[max_indices, 0]
-        max_match = torch.zeros_like(ious)
-        max_match[arange(n_preds), max_indices] = max_iou
-        if max_match.size(0):
-            tp[max_match.argmax(dim=0)] = True
-        tp[~above_threshold | ~matched_classes] = False
+        valid_matches = above_threshold & matched_classes
+        tp[valid_matches] = True

-        _, indices = torch.sort(predictions[:, 1], descending=True)
+        # order by descending confidence (last column), not by the x1 coordinate
+        _, indices = torch.sort(predictions[:, -1], descending=True)
         tp = tp[indices]

         tp_cumsum = torch.cumsum(tp, dim=0)
@@ -432,17 +498,53 @@
         precision = tp_cumsum / (tp_cumsum + fp_cumsum + 1e-6)
         recall = tp_cumsum / (n_gts + 1e-6)

+        # pad with sentinel points so the AP integration is well defined at both ends
         precision = torch.cat([torch.ones(1, device=device), precision, torch.zeros(1, device=device)])
         recall = torch.cat([torch.zeros(1, device=device), recall, torch.ones(1, device=device)])

-        precision, _ = torch.cummax(precision.flip(0), dim=0)
-        precision = precision.flip(0)
+        # smooth the precision curve into its monotone envelope
+        precision = torch.cummax(precision.flip(0), dim=0)[0].flip(0)

-        ap = torch.trapezoid(precision, recall)
+        ap = torch.trapz(precision, recall)
         aps.append(ap)

+        if class_metrics:
+            sorted_classes = predictions[indices, 0]  # align class labels with the confidence-sorted tp
+            for cls in torch.unique(ground_truths[:, 0]):
+                cls_mask = sorted_classes == cls
+                if cls_mask.sum() == 0:
+                    continue
+                cls_tp = tp[cls_mask]
+                cls_fp = ~cls_tp
+                cls_gt_count = (ground_truths[:, 0] == cls).sum()
+
+                cls_precision = torch.cumsum(cls_tp, dim=0) / (torch.cumsum(cls_tp, dim=0) + torch.cumsum(cls_fp, dim=0) + 1e-6)
+                cls_recall = torch.cumsum(cls_tp, dim=0) / (cls_gt_count + 1e-6)
+
+                cls_precision = torch.cat([torch.ones(1, device=device), cls_precision, torch.zeros(1, device=device)])
+                cls_recall = torch.cat([torch.zeros(1, device=device), cls_recall, torch.ones(1, device=device)])
+
+                cls_precision = torch.cummax(cls_precision.flip(0), dim=0)[0].flip(0)
+
+                cls_ap = torch.trapz(cls_precision, cls_recall)
+
+                if cls.item() not in class_aps:
+                    class_aps[int(cls.item())] = []
+                class_aps[int(cls.item())].append(cls_ap)
+
-    mAP = {
-        "mAP.5": aps[0],
-        "mAP.5:.95": torch.mean(torch.stack(aps)),
-    }
+    mAP = {}
+    for i, threshold in enumerate(iou_thresholds):
+        mAP[f"mAP.{round(float(threshold) * 100)}"] = aps[i]
+
+    # overall mAP over the IoU threshold range
+    mAP["mAP.5:.95"] = torch.mean(torch.stack(aps))
+
+    if class_metrics:
+        class_mAP = {}
+        for cls in class_aps:
+            class_mAP[cls] = {}
+            for i, threshold in enumerate(iou_thresholds):
+                class_mAP[cls][f"mAP.{round(float(threshold) * 100)}"] = class_aps[cls][i]
+            class_mAP[cls]["mAP.5:.95"] = torch.mean(torch.stack(class_aps[cls]))
+        mAP["class_mAP"] = class_mAP

     return mAP
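An illustrative call showing the new result layout (the tensor values are made up; the key names follow this patch and its test, with `class_mAP` keyed by integer class ids):

```python
import torch
from yolo.utils.bounding_box_utils import calculate_map

# Two predictions ([class, x1, y1, x2, y2, confidence]) against one ground truth.
preds = torch.tensor([[0.0, 10, 10, 50, 50, 0.9],
                      [1.0, 60, 60, 90, 90, 0.4]])
gts = torch.tensor([[0.0, 12, 12, 48, 48]])

result = calculate_map(preds, gts)
print(result["mAP.50"], result["mAP.5:.95"])
print(result["class_mAP"][0]["mAP.5:.95"])  # per-class breakdown for class 0
```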
diff --git a/yolo/utils/dataset_utils.py b/yolo/utils/dataset_utils.py
index a6c6e1fd..abd740bd 100644
--- a/yolo/utils/dataset_utils.py
+++ b/yolo/utils/dataset_utils.py
@@ -10,7 +10,7 @@
 from yolo.tools.data_conversion import discretize_categories


-def locate_label_paths(dataset_path: Path, phase_name: Path) -> Tuple[Path, Path]:
+def locate_label_paths(label_path: Path, phase_name: str) -> Tuple[Path, Path]:
     """
     Find the path to label files for a specified dataset and phase(e.g. training).

@@ -21,17 +21,14 @@
     Returns:
         Tuple[Path, Path]: A tuple containing the path to the labels file and the file format ("json" or "txt").
     """
-    json_labels_path = dataset_path / "annotations" / f"instances_{phase_name}.json"

-    txt_labels_path = dataset_path / "labels" / phase_name
+    if label_path.is_file():
+        return label_path, "json"

-    if json_labels_path.is_file():
-        return json_labels_path, "json"
-
-    elif txt_labels_path.is_dir():
-        txt_files = [f for f in os.listdir(txt_labels_path) if f.endswith(".txt")]
+    elif label_path.is_dir():
+        txt_files = [f for f in os.listdir(label_path) if f.endswith(".txt")]
         if txt_files:
-            return txt_labels_path, "txt"
+            return label_path, "txt"

     logger.warning("No labels found in the specified dataset path and phase name.")
     return [], None
@@ -52,7 +49,7 @@ def create_image_metadata(labels_path: str) -> Tuple[Dict[str, List], Dict[str,
         labels_data = json.load(file)
         id_to_idx = discretize_categories(labels_data.get("categories", [])) if "categories" in labels_data else None
         annotations_index = organize_annotations_by_image(labels_data, id_to_idx)  # check lookup is a good name?
-        image_info_dict = {Path(img["file_name"]).stem: img for img in labels_data["images"]}
+        image_info_dict = {img["file_name"]: img for img in labels_data["images"]}
         return annotations_index, image_info_dict
@@ -81,7 +78,8 @@ def organize_annotations_by_image(data: Dict[str, Any], id_to_idx: Optional[Dict

 def scale_segmentation(
-    annotations: List[Dict[str, Any]], image_dimensions: Dict[str, int]
+    annotations: List[Dict[str, Any]], image_dimensions: Dict[str, int], mode: str = "detection"
 ) -> Optional[List[List[float]]]:
     """
     Scale the segmentation data based on image dimensions and return a list of scaled segmentation data.
@@ -100,10 +98,13 @@ def scale_segmentation(
     h, w = image_dimensions["height"], image_dimensions["width"]
     for anno in annotations:
         category_id = anno["category_id"]
-        if "segmentation" in anno:
+        if "segmentation" in anno and mode == "segmentation":
             seg_list = [item for sublist in anno["segmentation"] for item in sublist]
-        elif "bbox" in anno:
+        elif "bbox" in anno and mode == "detection":
             seg_list = anno["bbox"]
+        else:
+            # the annotation does not match the requested mode; skip this image
+            return []
         scaled_seg_data = (
             np.array(seg_list).reshape(-1, 2) / [w, h]
         ).tolist()  # make the list group in x, y pairs and scaled with image width, height
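A standalone sketch of the scaling step in `scale_segmentation` (illustrative values, not repo data): a COCO bbox `[x, y, w, h]` is grouped into (x, y)-style pairs and normalized by the image width and height.

```python
import numpy as np

image_w, image_h = 640, 480
bbox = [64, 48, 320, 240]  # COCO-style [x, y, w, h]

# Reshape into pairs and divide by [width, height], as the function does.
scaled = (np.array(bbox).reshape(-1, 2) / [image_w, image_h]).tolist()
print(scaled)  # [[0.1, 0.1], [0.5, 0.5]]
```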
diff --git a/yolo/utils/model_utils.py b/yolo/utils/model_utils.py
index c35b6009..3f456d0b 100644
--- a/yolo/utils/model_utils.py
+++ b/yolo/utils/model_utils.py
@@ -73,6 +73,7 @@ def next_batch(self):
     optimizer_class.next_epoch = next_epoch
     optimizer = optimizer_class(model_parameters, **optim_cfg.args)
     optimizer.max_lr = [0.1, 0, 0]
+    logger.info(f"✅ Optimizer {optim_cfg.type} initialized successfully")
     return optimizer
@@ -160,7 +161,7 @@ def collect_prediction(predict_json: List, local_rank: int) -> List:
     return predict_json


-def predicts_to_json(img_paths, predicts, rev_tensor):
+def predicts_to_json(img_paths, image_id_map, predicts, rev_tensor):
     """
     TODO: function document
     turn a batch of imagepath and predicts(n x 6 for each image) to a List of diction(Detection output)
     """
@@ -171,8 +172,12 @@
         bboxes[:, 1:5] = (bboxes[:, 1:5] - shift[None]) / scale[None]
         bboxes[:, 1:5] = transform_bbox(bboxes[:, 1:5], "xyxy -> xywh")
         for cls, *pos, conf in bboxes:
+            img_path_map = image_id_map.get(str(img_path))
+            # skip predictions whose image is missing from the ground-truth metadata
+            if img_path_map is None:
+                continue
             bbox = {
-                "image_id": int(Path(img_path).stem),
+                "image_id": img_path_map["id"],
                 "category_id": IDX_TO_ID[int(cls)],
                 "bbox": [float(p) for p in pos],
                 "score": float(conf),
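With this change, the COCO `image_id` comes from the metadata map built in `ModelValidator.solve` (full image path → COCO image record) instead of being parsed from the filename stem. A minimal sketch of that lookup, with a hypothetical map and path for illustration:

```python
from pathlib import Path

# Hypothetical metadata map, keyed by full image path as built in ModelValidator.solve.
image_id_map = {
    "tests/data/images/val/000000000139.jpg": {"id": 139, "file_name": "000000000139.jpg"},
}

img_path = Path("tests/data/images/val/000000000139.jpg")
record = image_id_map.get(str(img_path))
image_id = record["id"] if record else None  # predictions without a record are skipped
print(image_id)  # 139
```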
+ """ + if not coco_list: + return None # Return None if the list is empty + + if len(coco_list) == 1: + return coco_list[0] # If there's only one object, return it directly + + # Start with the first COCO object + merged_coco = COCO() + merged_coco.dataset = copy.deepcopy(coco_list[0].dataset) + + for coco in coco_list[1:]: + # Check if the data is identical to the already merged data + if coco.dataset == merged_coco.dataset: + continue # If identical, skip this object + + # Merge images + max_img_id = max(merged_coco.imgs.keys()) if merged_coco.imgs else 0 + for img_id, img_info in coco.imgs.items(): + if img_info not in merged_coco.dataset['images']: + new_img_id = img_id + max_img_id + merged_coco.dataset['images'].append({**img_info, 'id': new_img_id}) + + # Merge categories + max_cat_id = max(merged_coco.cats.keys()) if merged_coco.cats else 0 + for cat_id, cat_info in coco.cats.items(): + if cat_info not in merged_coco.dataset['categories']: + new_cat_id = cat_id + max_cat_id + merged_coco.dataset['categories'].append({**cat_info, 'id': new_cat_id}) + + # Merge annotations + max_ann_id = max(ann['id'] for ann in merged_coco.dataset['annotations']) if merged_coco.dataset['annotations'] else 0 + for ann in coco.dataset['annotations']: + if ann not in merged_coco.dataset['annotations']: + new_ann = copy.deepcopy(ann) + new_ann['id'] = ann['id'] + max_ann_id + new_ann['image_id'] = ann['image_id'] + max_img_id + new_ann['category_id'] = ann['category_id'] + max_cat_id + merged_coco.dataset['annotations'].append(new_ann) + + # Create index + merged_coco.createIndex() + + return merged_coco \ No newline at end of file