3 changes: 1 addition & 2 deletions .gitignore
@@ -1,7 +1,6 @@
.venv
.venv_nerfstudio
data
checkpoints
__pycache__
outputs
.vscode
.vscode
39 changes: 36 additions & 3 deletions README.md
@@ -105,8 +105,39 @@ Our segmentation pipeline for obtaining object and gripper masks. You might want
do human-in-the-loop segmentation by annotating specific frames with positive/ negative
labels for more robust results. We provide a simple GUI for this purpose. The default
automatic annotations using DINO work well in many cases but can struggle with the
gripper masks. All downstream object tracking and reconstruction results are sensitive
to the segmentation quality and thus spending a bit of effort here might be worthwhile.
gripper masks. This is possibly because our particular gripper seems to be out of
distribution for SAM2, which therefore loses track of it in long videos. This can be
addressed by re-prompting SAM2 after a failure. All downstream object tracking and
reconstruction results are sensitive to the segmentation quality, so spending a bit
of effort here might be worthwhile.
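
For reference, a minimal sketch of the re-prompting idea using the SAM2 video predictor API; the config, checkpoint, frame-directory paths, and frame/point values are placeholders, and the actual interface of `segment_moving_obj_data.py` may differ:

```python
# Sketch only: assumes the SAM2 video predictor API and placeholder paths.
import numpy as np
import torch
from sam2.build_sam import build_sam2_video_predictor

predictor = build_sam2_video_predictor(
    "configs/sam2.1/sam2.1_hiera_l.yaml",  # assumed model config
    "checkpoints/sam2.1_hiera_large.pt",   # assumed checkpoint location
)

with torch.inference_mode():
    state = predictor.init_state(video_path="data/my_object/rgb")  # assumed frame dir

    # Initial positive click on the gripper in the first frame.
    predictor.add_new_points_or_box(
        inference_state=state, frame_idx=0, obj_id=1,
        points=np.array([[400, 300]], dtype=np.float32),
        labels=np.array([1], dtype=np.int32),
    )

    # If tracking drifts later in a long video, add a fresh prompt at the
    # failure frame and continue propagation from there.
    failure_frame = 850  # hypothetical frame where the gripper mask is lost
    predictor.add_new_points_or_box(
        inference_state=state, frame_idx=failure_frame, obj_id=1,
        points=np.array([[420, 310]], dtype=np.float32),
        labels=np.array([1], dtype=np.int32),
    )
    for frame_idx, obj_ids, mask_logits in predictor.propagate_in_video(
        state, start_frame_idx=failure_frame
    ):
        masks = (mask_logits > 0.0).cpu().numpy()  # save per-frame masks here
```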

##### Gripper masking with fine-tuned models
Owner comment: I think it would make sense to also make this an option for the `run_asset_generation.py` pipeline. Maybe add a `store_true` argument to enable it. In that case, we have to think about where to put the installation instructions. We could keep them here but refer to them when explaining that argument.

Using fine-tuned SAM2 and GroundingDINO networks for a gripper that is out of distribution
can help remove the extra step of re-prompting after failure.

We provide fine-tuned SAM2 and GroundingDINO networks for annotating and segmenting
the gripper used in our provided dataset. The checkpoints can be downloaded from [here](https://mitprod-my.sharepoint.com/personal/nepfaff_mit_edu/_layouts/15/onedrive.aspx?id=%2Fpersonal%2Fnepfaff%5Fmit%5Fedu%2FDocuments%2Fscalable%5Freal2sim%5Fmodel%5Fweights&ga=1).

Please put the downloaded checkpoint files in the `./checkpoints` directory.
Owner comment: :nit: That directory doesn't currently exist. It might make sense to create it and put a `.gitignore` file inside that ignores everything in the directory apart from the `.gitignore` itself (you can use `!` to exclude it).
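
A possible way to set this up locally; the checkpoint filenames below are placeholders for whatever the download provides:

```bash
mkdir -p checkpoints
# Hypothetical filenames; keep whatever names the downloaded weights use.
mv ~/Downloads/gripper_sam2_finetuned.pt checkpoints/
mv ~/Downloads/gripper_grounding_dino_finetuned.pth checkpoints/
```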

We used mmdetection's implementation to fine-tune Grounding DINO. Please see the
[mmdetection official GitHub](https://github.com/open-mmlab/mmdetection/tree/main)
for installation instructions. Make sure to be in the virtual environment set up with Poetry,
not the Nerfstudio virtual environment.

When `--txt_prompt` is set to `gripper`, the segmentation script will use the fine-tuned gripper
models for annotation and segmentation.
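
For example, a hypothetical invocation (the data path and any other flags are placeholders; see the script's `--help` for its actual interface):

```bash
python segment_moving_obj_data.py \
    --data_dir data/my_object \
    --txt_prompt gripper
```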

To fine-tune your own object detection model for your gripper, see [these instructions](https://github.com/open-mmlab/mmdetection/blob/main/configs/grounding_dino/README.md)
from the mmdetection official GitHub.
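
A sketch of what launching the fine-tuning with mmdetection might look like; the config path is a placeholder for whichever Grounding DINO fine-tuning config you adapt to your gripper dataset:

```bash
cd mmdetection
python tools/train.py configs/grounding_dino/<your_gripper_finetune_config>.py \
    --work-dir work_dirs/gripper_grounding_dino
```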

To fine-tune your own segmentation model for your gripper, see [these instructions](https://github.com/facebookresearch/sam2/blob/main/training/README.md) for training from the
SAM2 official GitHub.
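
Similarly, a sketch of a single-node SAM2 fine-tuning launch, following the SAM2 training README at the time of writing (the config name is a placeholder; check that README for the current interface):

```bash
cd sam2
python training/train.py \
    -c configs/sam2.1_training/<your_gripper_finetune_config>.yaml \
    --use-cluster 0 \
    --num-gpus 1
```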

An example of segmentation failure on the gripper with default models: \
Owner comment: Could you elaborate a bit here on why it can fail without fine-tuning? It could be worth mentioning that our particular gripper seems to be out of distribution for SAM2, and thus it loses track of it in long videos. This can also be solved by re-prompting it after failure and segmenting the video in parts, but fine-tuning removes this extra step.

<img src="assets/mask_sam2_failure.png" width="200">

Gripper segmentation on the same image with custom models: \
<img src="assets/mask_sam2_custom.png" width="200">

### Submodules

@@ -177,7 +208,9 @@ Note that this needs to be done once per environment for the robot data from ste

### 4. Run asset generation

The asset generation can be run with `scalable_real2sim/run_asset_generation.py`.
The asset generation can be run with `scalable_real2sim/run_asset_generation.py`. The `--use-finetuned-gripper-segmentation` flag can be specified to use fine-tuned SAM2 and GroundingDINO
models for gripper segmentation. See the section on `segment_moving_obj_data.py` for installation
instructions.
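
For example (run from the repository root; the data-directory flag and path are placeholders, while `--use-finetuned-gripper-segmentation` is the flag added in this change):

```bash
python run_asset_generation.py \
    --data-dir data/my_scene \
    --use-finetuned-gripper-segmentation
```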

## Figures

Binary file added assets/mask_sam2_custom.png
Binary file added assets/mask_sam2_failure.png
2 changes: 2 additions & 0 deletions checkpoints/.gitignore
@@ -0,0 +1,2 @@
*
!.gitignore
26 changes: 22 additions & 4 deletions run_asset_generation.py
@@ -73,7 +73,9 @@ def downsample_images(data_dir: str, num_images: int) -> None:
)


def run_segmentation(data_dir: str, output_dir: str) -> None:
def run_segmentation(
data_dir: str, output_dir: str, use_finetuned_gripper_networks: bool = False
) -> None:
start = time.perf_counter()

# Detect the object of interest. Need to add a dot for the DINO model.
@@ -89,8 +91,12 @@ def run_segmentation(data_dir: str, output_dir: str) -> None:
logging.info(f"Detected object of interest: {object_of_interest}")

gripper_txt = (
"Blue plastic robotic gripper with two symmetrical, curved arms "
"attached to the end of a metallic robotic arm."
("gripper")
if use_finetuned_gripper_networks
else (
"Blue plastic robotic gripper with two symmetrical, curved arms "
"attached to the end of a metallic robotic arm."
)
)

# Generate the object masks.
@@ -383,6 +389,7 @@ def main(
skip_segmentation: bool = False,
bundle_sdf_interpolate_missing_vertices: bool = False,
use_depth: bool = False,
use_finetuned_gripper_segmentation: bool = False,
):
logging.info("Starting asset generation...")

@@ -422,7 +429,11 @@
# Generate object and gripper masks.
if not skip_segmentation:
logging.info("Running segmentation...")
run_segmentation(data_dir=object_dir, output_dir=object_dir)
run_segmentation(
data_dir=object_dir,
output_dir=object_dir,
use_finetuned_gripper_networks=use_finetuned_gripper_segmentation,
)
else:
logging.info("Skipping segmentation...")
if not os.path.exists(os.path.join(object_dir, "masks")):
@@ -607,6 +618,12 @@ def main(
help="If specified, use depth images for geometric reconstruction when "
"supported by the reconstruction method.",
)
parser.add_argument(
"--use-finetuned-gripper-segmentation",
action="store_true",
help="If specified, use fine tuned SAM2 and GroundingDINO models for gripper"
"segmentation.",
)
args = parser.parse_args()

if not os.path.exists(args.data_dir):
@@ -622,4 +639,5 @@ skip_segmentation=args.skip_segmentation,
skip_segmentation=args.skip_segmentation,
bundle_sdf_interpolate_missing_vertices=args.bundle_sdf_interpolate_missing_vertices,
use_depth=args.use_depth,
use_finetuned_gripper_segmentation=args.use_finetuned_gripper_segmentation,
)
@@ -0,0 +1,102 @@
# This configuration file is taken from https://github.com/open-mmlab/mmdetection/tree/main/configs

# dataset settings
dataset_type = "CocoDataset"
data_root = "data/coco/"

# Example to use different file client
# Method 1: simply set the data root and let the file I/O module
# automatically infer from prefix (not support LMDB and Memcache yet)

# data_root = 's3://openmmlab/datasets/detection/coco/'

# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
# backend_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/': 's3://openmmlab/datasets/detection/',
# 'data/': 's3://openmmlab/datasets/detection/'
# }))
backend_args = None

train_pipeline = [
dict(type="LoadImageFromFile", backend_args=backend_args),
dict(type="LoadAnnotations", with_bbox=True),
dict(type="Resize", scale=(1333, 800), keep_ratio=True),
dict(type="RandomFlip", prob=0.5),
dict(type="PackDetInputs"),
]
test_pipeline = [
dict(type="LoadImageFromFile", backend_args=backend_args),
dict(type="Resize", scale=(1333, 800), keep_ratio=True),
# If you don't have a gt annotation, delete the pipeline
dict(type="LoadAnnotations", with_bbox=True),
dict(
type="PackDetInputs",
meta_keys=("img_id", "img_path", "ori_shape", "img_shape", "scale_factor"),
),
]
train_dataloader = dict(
batch_size=2,
num_workers=2,
persistent_workers=True,
sampler=dict(type="DefaultSampler", shuffle=True),
batch_sampler=dict(type="AspectRatioBatchSampler"),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file="annotations/instances_train2017.json",
data_prefix=dict(img="train2017/"),
filter_cfg=dict(filter_empty_gt=True, min_size=32),
pipeline=train_pipeline,
backend_args=backend_args,
),
)
val_dataloader = dict(
batch_size=1,
num_workers=2,
persistent_workers=True,
drop_last=False,
sampler=dict(type="DefaultSampler", shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file="annotations/instances_val2017.json",
data_prefix=dict(img="val2017/"),
test_mode=True,
pipeline=test_pipeline,
backend_args=backend_args,
),
)
test_dataloader = val_dataloader

val_evaluator = dict(
type="CocoMetric",
ann_file=data_root + "annotations/instances_val2017.json",
metric="bbox",
format_only=False,
backend_args=backend_args,
)
test_evaluator = val_evaluator

# inference on test dataset and
# format the output results for submission.
# test_dataloader = dict(
# batch_size=1,
# num_workers=2,
# persistent_workers=True,
# drop_last=False,
# sampler=dict(type='DefaultSampler', shuffle=False),
# dataset=dict(
# type=dataset_type,
# data_root=data_root,
# ann_file=data_root + 'annotations/image_info_test-dev2017.json',
# data_prefix=dict(img='test2017/'),
# test_mode=True,
# pipeline=test_pipeline))
# test_evaluator = dict(
# type='CocoMetric',
# metric='bbox',
# format_only=True,
# ann_file=data_root + 'annotations/image_info_test-dev2017.json',
# outfile_prefix='./work_dirs/coco_detection/test')
@@ -0,0 +1,28 @@
# This configuration file is taken from https://github.com/open-mmlab/mmdetection/tree/main/configs

default_scope = "mmdet"

default_hooks = dict(
timer=dict(type="IterTimerHook"),
logger=dict(type="LoggerHook", interval=50),
param_scheduler=dict(type="ParamSchedulerHook"),
checkpoint=dict(type="CheckpointHook", interval=1),
sampler_seed=dict(type="DistSamplerSeedHook"),
visualization=dict(type="DetVisualizationHook"),
)

env_cfg = dict(
cudnn_benchmark=False,
mp_cfg=dict(mp_start_method="fork", opencv_num_threads=0),
dist_cfg=dict(backend="nccl"),
)

vis_backends = [dict(type="LocalVisBackend")]
visualizer = dict(
type="DetLocalVisualizer", vis_backends=vis_backends, name="visualizer"
)
log_processor = dict(type="LogProcessor", window_size=50, by_epoch=True)

log_level = "INFO"
load_from = None
resume = False