From a83db7fad67d41a324fdd20fd35ed75e26f339ca Mon Sep 17 00:00:00 2001
From: Sean McCulloch <86432671+seanmcculloch@users.noreply.github.com>
Date: Fri, 13 Feb 2026 14:50:12 -0800
Subject: [PATCH 1/6] feat: split xml ipd

* feat: split xml IP detection

- Add split tile shape support to overlap_detection.py
- Add split path construction and crop passthrough to metadata_builder.py
- Add crop slicing to image_reader.py
- Add fetch_local_xml utility function to pipelines/utils.py
- Update xml_to_dataframe.py for split XML support
- Add uv.lock to gitignore

Tests for this feature are in a separate PR.

* test: add split XML IP detection tests

- Add test_xml_to_dataframe tests for split XML parsing
- Add test_image_reader tests for crop slicing
- Add test_metadata_builder tests for split metadata handling
- Add dataset_split.xml test fixture

These tests verify the split XML support added in feat/split-xml-ipd.

* Initial plan

* Update Rhapso/data_prep/xml_to_dataframe.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update Rhapso/data_prep/xml_to_dataframe.py

Raise a descriptive error for malformed split XMLs.

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Fix channel parsing in parse_image_loader_split_zarr to use .ome.zarr suffix

Co-authored-by: seanmcculloch <86432671+seanmcculloch@users.noreply.github.com>

* Update Rhapso/detection/metadata_builder.py

Round crop_max up (ceil) when downsampling so the scaled crop does not shrink coverage.

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update Rhapso/data_prep/xml_to_dataframe.py

Raise a descriptive error when the split XML is malformed.

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Initial plan

* Fix channel extraction, crop validation, path handling, and test signatures

Co-authored-by: seanmcculloch <86432671+seanmcculloch@users.noreply.github.com>

* Move import to top level and use calculated level instead of hardcoded '0'

Co-authored-by: seanmcculloch <86432671+seanmcculloch@users.noreply.github.com>

* fix: handling of multiscale levels

* fix: prevent skipping IP detection when there is only one split tile

* feat: add overlapping_only flag (default true) to ipd

* Fix non-split zarr path construction in overlap detection (#164)

* Initial plan

* Fix dim_other path to use proper path joining and include level

Co-authored-by: seanmcculloch <86432671+seanmcculloch@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: seanmcculloch <86432671+seanmcculloch@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 .gitignore                                    |   3 +
 Rhapso/data_prep/xml_to_dataframe.py          |  92 +++++++++-
 Rhapso/detection/image_reader.py              |  15 ++
 Rhapso/detection/metadata_builder.py          |  46 +++--
 Rhapso/detection/overlap_detection.py         | 165 ++++++++++++------
 .../pipelines/ray/interest_point_detection.py |   9 +-
 Rhapso/pipelines/utils.py                     |  26 +++
 tests/XML_test_data/dataset_split.xml         | 150 ++++++++++++++++
 tests/test_data_prep/test_xml_to_dataframe.py |  66 ++++++-
 tests/test_detection/test_image_reader.py     | 102 +++++++++++
 tests/test_detection/test_metadata_builder.py | 152 ++++++++++++++++
 .../test_detection/test_overlap_detection.py  |   9 -
 12 files changed, 744 insertions(+), 91 deletions(-)
 create mode 100644 Rhapso/pipelines/utils.py
 create mode 100644 tests/XML_test_data/dataset_split.xml
 create mode 100644 tests/test_detection/test_image_reader.py
 create mode 100644 tests/test_detection/test_metadata_builder.py

diff --git a/.gitignore b/.gitignore
index 6f86969..15abfc6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -91,6 +91,9 @@ ipython_config.py
 #   install all needed dependencies.
 #Pipfile.lock
 
+# uv
+uv.lock
+
 # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 __pypackages__/
 
diff --git a/Rhapso/data_prep/xml_to_dataframe.py b/Rhapso/data_prep/xml_to_dataframe.py
index ee1284c..f165cb8 100644
--- a/Rhapso/data_prep/xml_to_dataframe.py
+++ b/Rhapso/data_prep/xml_to_dataframe.py
@@ -1,5 +1,6 @@
 import pandas as pd
 import xml.etree.ElementTree as ET
+import re
 
 # This component recieves an XML file containing Tiff or Zarr image metadata and converts
 # it into several Dataframes
@@ -83,17 +84,98 @@ def parse_image_loader_tiff(self, root):
         # Convert the list to a DataFrame and return
         return pd.DataFrame(image_loader_data)
     
-    def parse_image_loader_split_zarr(self):
-        pass
+    def parse_image_loader_split_zarr(self, root):
+        """
+        Parses a split.viewerimgloader XML structure where a single source image is virtually
+        subdivided into overlapping tiles via SetupIdDefinitions.
+
+        Parameters
+        ----------
+        root : xml.etree.ElementTree.Element
+            Root element of the parsed XML.
+
+        Returns
+        -------
+        pd.DataFrame
+            One row per split tile with columns: view_setup, timepoint, series, channel,
+            file_path, crop_min, crop_max, zarr_base_path.
+        """
+        outer_loader = root.find(".//ImageLoader[@format='split.viewerimgloader']")
+        if outer_loader is None:
+            raise ValueError(
+                "split.viewerimgloader ImageLoader node not found in XML; "
+                "ensure the XML contains an ImageLoader with format='split.viewerimgloader'."
+            )
+
+        inner_loader = outer_loader.find("ImageLoader")
+        if inner_loader is None:
+            raise ValueError(
+                "Nested ImageLoader node not found inside split.viewerimgloader configuration."
+            )
+
+        zarr_elem = inner_loader.find("zarr")
+        if zarr_elem is None or zarr_elem.text is None:
+            raise ValueError(
+                "<zarr> node with base path is missing from split.viewerimgloader configuration."
+            )
+
+        zarr_base_path = zarr_elem.text.strip()
+        # Build lookup from source setup id to (timepoint, zgroup_path)
+        zgroup_lookup = {}
+        for zg in inner_loader.findall(".//zgroups/zgroup"):
+            setup = zg.get("setup")
+            tp = zg.get("tp") or zg.get("timepoint")
+            path = zg.get("path")
+            zgroup_lookup[setup] = (tp, path)
+
+        image_loader_data = []
+        for sid in outer_loader.findall(".//SetupIds/SetupIdDefinition"):
+            new_id = sid.find("NewId").text.strip()
+            old_id = sid.find("OldId").text.strip()
+            crop_min = sid.find("min").text.strip()
+            crop_max = sid.find("max").text.strip()
+
+            if old_id not in zgroup_lookup:
+                raise ValueError(
+                    "SetupIdDefinition refers to OldId {!r} that is not present in the "
+                    "inner loader's zgroups. Available setup ids: {}".format(
+                        old_id, sorted(zgroup_lookup.keys())
+                    )
+                )
+            tp, zgroup_path = zgroup_lookup[old_id]
+
+            # Extract channel using regex to handle both .zarr and .ome.zarr
+            channel_match = re.search(r'_ch_(\d+)', zgroup_path)
+            if channel_match:
+                channel = channel_match.group(1)
+            else:
+                channel = 0
+
+            image_loader_data.append({
+                "view_setup": new_id,
+                "timepoint": tp,
+                "series": 1,
+                "channel": channel,
+                "file_path": zgroup_path,
+                "crop_min": crop_min,
+                "crop_max": crop_max,
+                "zarr_base_path": zarr_base_path,
+            })
+
+        return pd.DataFrame(image_loader_data)
 
     def route_image_loader(self, root):
         """
         Directs the XML parsing process based on the image loader format specified in the XML.
         """
         format_node = root.find(".//ImageLoader")
-        format_type = format_node.get("format")
+        if format_node is None:
+            raise ValueError("No <ImageLoader> element found in XML; cannot determine image loader format.")
 
-        if "filemap" in format_type:
+        format_type = (format_node.get("format") or "").lower()
+        if "split" in format_type:
+            return self.parse_image_loader_split_zarr(root)
+        elif "filemap" in format_type:
             return self.parse_image_loader_tiff(root)
         else:
             return self.parse_image_loader_zarr(root)
@@ -104,7 +186,7 @@ def parse_view_setups(self, root):
         """
         viewsetups_data = []
 
-        for vs in root.findall(".//ViewSetup"):
+        for vs in root.findall("./SequenceDescription/ViewSetups/ViewSetup"):
             id_ = vs.find("id").text
             # name = vs.find("name").text
             name = vs.findtext("name")
diff --git a/Rhapso/detection/image_reader.py b/Rhapso/detection/image_reader.py
index cbe7076..b4d0ab5 100644
--- a/Rhapso/detection/image_reader.py
+++ b/Rhapso/detection/image_reader.py
@@ -91,6 +91,21 @@ def fetch_image_data(self, record, dsxy, dsz):
         dask_array = dask_array.astype(np.float32)
         dask_array = dask_array.transpose()
 
+        # Apply split tile crop if present
+        crop_min = record.get('crop_min')
+        crop_max = record.get('crop_max')
+        if crop_min is not None and crop_max is not None:
+            if len(crop_min) != 3 or len(crop_max) != 3:
+                raise ValueError(
+                    f"crop_min and crop_max must both be length 3 for 3D cropping; "
+                    f"got crop_min={crop_min}, crop_max={crop_max}"
+                )
+            dask_array = dask_array[
+                crop_min[0]:crop_max[0] + 1,
+                crop_min[1]:crop_max[1] + 1,
+                crop_min[2]:crop_max[2] + 1
+            ]
+
         # Downsample Dask array
         downsampled_stack = self.interface_downsampling(dask_array, dsxy, dsz)
 
diff --git a/Rhapso/detection/metadata_builder.py b/Rhapso/detection/metadata_builder.py
index 203a5be..b03f126 100644
--- a/Rhapso/detection/metadata_builder.py
+++ b/Rhapso/detection/metadata_builder.py
@@ -21,7 +21,7 @@ def __init__(self, dataframes, overlapping_area, image_file_prefix, file_type, d
         self.sub_region_chunking = not chunks_per_bound == 0
         self.metadata = []
     
-    def build_image_metadata(self, process_intervals, file_path, view_id):
+    def build_image_metadata(self, process_intervals, file_path, view_id, crop_min=None, crop_max=None):
         """
         Builds list of metadata with optional sub-chunking
         """
@@ -41,7 +41,9 @@ def build_image_metadata(self, process_intervals, file_path, view_id):
                     'file_path': file_path,
                     'interval_key': interval_key,
                     'offset': 0,
-                    'lb': lb_fixed
+                    'lb': lb_fixed,
+                    'crop_min': crop_min,
+                    'crop_max': crop_max
                 }) 
 
             # Apply sub-region chunking
@@ -73,8 +75,10 @@ def build_image_metadata(self, process_intervals, file_path, view_id):
                             'file_path': file_path,
                             'interval_key': interval_key,
                             'offset': z,
-                            'lb' : lb
-                        })  
+                            'lb' : lb,
+                            'crop_min': crop_min,
+                            'crop_max': crop_max
+                        })
 
                 elif self.file_type == "zarr":
 
@@ -102,26 +106,44 @@ def build_image_metadata(self, process_intervals, file_path, view_id):
                             'file_path': file_path,
                             'interval_key': interval_key,
                             'offset': z,
-                            'lb' : lb
-                        })  
-    
+                            'lb' : lb,
+                            'crop_min': crop_min,
+                            'crop_max': crop_max
+                        })
+
     def build_paths(self):
         """
         Iterates through views to interface metadata building
         """
+        is_split = 'crop_min' in self.image_loader_df.columns
+
         for _, row in self.image_loader_df.iterrows():
             view_id = f"timepoint: {row['timepoint']}, setup: {row['view_setup']}"
             process_intervals = self.overlapping_area[view_id]
-            
+
             if self.file_type == 'zarr':
-                file_path = self.image_file_prefix + row['file_path'] + f'/{self.level}'
+                if is_split:
+                    file_path = row['zarr_base_path'] + row['file_path'] + f'/{self.level}'
+                else:
+                    file_path = self.image_file_prefix + row['file_path'] + f'/{self.level}'
             elif self.file_type == 'tiff':
-                file_path = self.image_file_prefix + row['file_path'] 
+                file_path = self.image_file_prefix + row['file_path']
             else:
                 raise ValueError(f"Unsupported file_type: {self.file_type!r}")
-            
+
+            # Extract and scale crop bounds for split tiles
+            crop_min = None
+            crop_max = None
+            if is_split:
+                scale = 2 ** self.level if self.level is not None else 1
+                cmin = [int(v) // scale for v in row['crop_min'].split()]
+                # For inclusive bounds, use a ceil-style mapping for crop_max to avoid shrinking coverage
+                cmax = [int(np.ceil((int(v) + 1) / scale) - 1) for v in row['crop_max'].split()]
+                crop_min = cmin
+                crop_max = cmax
+
             if self.run_type == 'ray':
-                self.build_image_metadata(process_intervals, file_path, view_id)
+                self.build_image_metadata(process_intervals, file_path, view_id, crop_min, crop_max)
             else:
                 raise ValueError(f"Unsupported run type: {self.run_type!r}")
 
diff --git a/Rhapso/detection/overlap_detection.py b/Rhapso/detection/overlap_detection.py
index 20e748c..f55a1d1 100644
--- a/Rhapso/detection/overlap_detection.py
+++ b/Rhapso/detection/overlap_detection.py
@@ -23,12 +23,13 @@ def time_interval(self):
         pass
 
 class OverlapDetection():
-    def __init__(self, transform_models, dataframes, dsxy, dsz, prefix, file_type):
+    def __init__(self, transform_models, dataframes, dsxy, dsz, prefix, file_type, overlapping_only=True):
         self.transform_models = transform_models
         self.image_loader_df = dataframes['image_loader']
         self.dsxy, self.dsz = dsxy, dsz
         self.prefix = prefix
         self.file_type = file_type
+        self.overlapping_only = overlapping_only
         self.to_process = {}
         self.image_shape_cache = {}
         self.max_interval_size = 0
@@ -66,7 +67,28 @@ def load_image_metadata(self, file_path):
             self.image_shape_cache[file_path] = shape
         
         return shape
-    
+
+    def _split_tile_shape(self, row):
+        """Derive 6D shape tuple from split tile crop bounds.
+
+        Parameters
+        ----------
+        row : pd.Series
+            Row from image_loader_df with 'crop_min' and 'crop_max' columns.
+            Values are space-separated "X Y Z" strings.
+
+        Returns
+        -------
+        tuple
+            6D shape tuple (1, 1, 1, Z, Y, X) matching load_image_metadata format.
+        """
+        cmin = list(map(int, row['crop_min'].split()))
+        cmax = list(map(int, row['crop_max'].split()))
+        x_size = cmax[0] - cmin[0] + 1
+        y_size = cmax[1] - cmin[1] + 1
+        z_size = cmax[2] - cmin[2] + 1
+        return (1, 1, 1, z_size, y_size, x_size)
+
     # def open_and_downsample(self, shape):
     #     X = int(shape[5])
     #     Y = int(shape[4])
@@ -240,15 +262,20 @@ def find_overlapping_area(self):
         """
         Compute XY Z overlap intervals against every other view, accounting for mipmap/downsampling and per-view affine transforms
         """
+        is_split = 'crop_min' in self.image_loader_df.columns
+
         for i, row_i in self.image_loader_df.iterrows():
             view_id = f"timepoint: {row_i['timepoint']}, setup: {row_i['view_setup']}"
             
             # get inverted matrice of downsampling
-            all_intervals = []        
+            all_intervals = []
             if self.file_type == 'zarr':
                 level, leftovers = self.choose_zarr_level()
 
-                dim_base = self.load_image_metadata(os.path.join(self.prefix, row_i['file_path']))
+                if is_split:
+                    dim_base = self._split_tile_shape(row_i)
+                else:
+                    dim_base = self.load_image_metadata(os.path.join(self.prefix, row_i['file_path'], str(level)))
 
                 # isotropic pyramid
                 s = float(2 ** level)  
@@ -264,59 +291,91 @@ def find_overlapping_area(self):
                 level = None
 
             downsampled_dim_base = self.open_and_downsample(dim_base, dsxy, dsz)
-            t1 = self.get_inverse_mipmap_transform(mipmap_of_downsample) 
-
-            # compare with all view_ids
-            for j, row_j in self.image_loader_df.iterrows():
-                if i == j: continue
-                
-                view_id_other = f"timepoint: {row_j['timepoint']}, setup: {row_j['view_setup']}"
-
-                if self.file_type == 'zarr':
-                    dim_other = self.load_image_metadata(self.prefix + row_j['file_path'] + f'/{0}')
-                elif self.file_type == 'tiff':
-                    dim_other = self.load_image_metadata(self.prefix + row_j['file_path'])
-                
-                # get transforms matrix from both view_ids and downsampling matrices
-                matrix = self.transform_models.get(view_id)
-                matrix_other = self.transform_models.get(view_id_other)
-
-                if self.file_type == 'zarr':
-                    s = float(2 ** level)  
-                    mipmap_of_downsample_other = self.affine_with_half_pixel_shift(s, s, s)
-                elif self.file_type == 'tiff':
-                    mipmap_of_downsample_other = self.create_mipmap_transform()
-
-                inverse_mipmap_of_downsample_other = self.get_inverse_mipmap_transform(mipmap_of_downsample_other)
-                inverse_matrix = self.get_inverse_mipmap_transform(matrix)
-
-                concatenated_matrix = np.dot(inverse_matrix, matrix_other) 
-                t2 = np.dot(inverse_mipmap_of_downsample_other, concatenated_matrix)
-
-                intervals = self.estimate_bounds(t1, dim_base)
-                intervals_other = self.estimate_bounds(t2, dim_other)
-
-                bounding_boxes = tuple(map(lambda x: np.round(x).astype(int), intervals))
-                bounding_boxes_other = tuple(map(lambda x: np.round(x).astype(int), intervals_other))
-
-                # find upper and lower bounds of intersection
-                if np.all((bounding_boxes[1] >= bounding_boxes_other[0]) & (bounding_boxes_other[1] >= bounding_boxes[0])):
-                    intersected_boxes = self.calculate_intersection(bounding_boxes, bounding_boxes_other)
-                    intersect = self.calculate_intersection(downsampled_dim_base, intersected_boxes)     
-                    intersect_dict = {
-                        'lower_bound': intersect[0],
-                        'upper_bound': intersect[1],
-                        'span': self.calculate_new_dims(intersect[0], intersect[1])
-                    }
-
-                    lb, ub = intersect[0], intersect[1]
+            t1 = self.get_inverse_mipmap_transform(mipmap_of_downsample)
+
+            if self.overlapping_only:
+                # compare with all view_ids
+                for j, row_j in self.image_loader_df.iterrows():
+                    if i == j: continue
+
+                    view_id_other = f"timepoint: {row_j['timepoint']}, setup: {row_j['view_setup']}"
+
+                    if self.file_type == 'zarr':
+                        if is_split:
+                            dim_other = self._split_tile_shape(row_j)
+                        else:
+                            dim_other = self.load_image_metadata(os.path.join(self.prefix, row_j['file_path'], str(level)))
+                    elif self.file_type == 'tiff':
+                        dim_other = self.load_image_metadata(os.path.join(self.prefix, row_j['file_path']))
+
+                    # get transforms matrix from both view_ids and downsampling matrices
+                    matrix = self.transform_models.get(view_id)
+                    matrix_other = self.transform_models.get(view_id_other)
+
+                    if self.file_type == 'zarr':
+                        s = float(2 ** level)
+                        mipmap_of_downsample_other = self.affine_with_half_pixel_shift(s, s, s)
+                    elif self.file_type == 'tiff':
+                        mipmap_of_downsample_other = self.create_mipmap_transform()
+
+                    inverse_mipmap_of_downsample_other = self.get_inverse_mipmap_transform(mipmap_of_downsample_other)
+                    inverse_matrix = self.get_inverse_mipmap_transform(matrix)
+
+                    concatenated_matrix = np.dot(inverse_matrix, matrix_other)
+                    t2 = np.dot(inverse_mipmap_of_downsample_other, concatenated_matrix)
+
+                    intervals = self.estimate_bounds(t1, dim_base)
+                    intervals_other = self.estimate_bounds(t2, dim_other)
+
+                    bounding_boxes = tuple(map(lambda x: np.round(x).astype(int), intervals))
+                    bounding_boxes_other = tuple(map(lambda x: np.round(x).astype(int), intervals_other))
+
+                    # find upper and lower bounds of intersection
+                    if np.all((bounding_boxes[1] >= bounding_boxes_other[0]) & (bounding_boxes_other[1] >= bounding_boxes[0])):
+                        intersected_boxes = self.calculate_intersection(bounding_boxes, bounding_boxes_other)
+                        intersect = self.calculate_intersection(downsampled_dim_base, intersected_boxes)
+                        intersect_dict = {
+                            'lower_bound': intersect[0],
+                            'upper_bound': intersect[1],
+                            'span': self.calculate_new_dims(intersect[0], intersect[1])
+                        }
+
+                        lb, ub = intersect[0], intersect[1]
+                        sz = self.size_interval(lb, ub)
+                        if sz > self.max_interval_size:
+                            self.max_interval_size = sz
+
+                        # add max size
+                        all_intervals.append(intersect_dict)
+
+                # Single-view dataset: no pairwise overlaps exist, so use the
+                # full downsampled volume as the processing region.
+                if not all_intervals and len(self.image_loader_df) == 1:
+                    lb = np.array(downsampled_dim_base[0])
+                    ub = np.array(downsampled_dim_base[1])
+                    all_intervals.append({
+                        'lower_bound': lb,
+                        'upper_bound': ub,
+                        'span': self.calculate_new_dims(lb, ub),
+                    })
                     sz = self.size_interval(lb, ub)
                     if sz > self.max_interval_size:
                         self.max_interval_size = sz
 
-                    # add max size
-                    all_intervals.append(intersect_dict)        
-    
+            else:
+                # Full-volume mode: use the entire downsampled tile as the
+                # processing region (for registration, not stitching).
+                lb = np.array(downsampled_dim_base[0])
+                ub = np.array(downsampled_dim_base[1])
+                all_intervals.append({
+                    'lower_bound': lb,
+                    'upper_bound': ub,
+                    'span': self.calculate_new_dims(lb, ub),
+                })
+                sz = self.size_interval(lb, ub)
+                if sz > self.max_interval_size:
+                    self.max_interval_size = sz
+
             self.to_process[view_id] = all_intervals
         
         return dsxy, dsz, level, mipmap_of_downsample
diff --git a/Rhapso/pipelines/ray/interest_point_detection.py b/Rhapso/pipelines/ray/interest_point_detection.py
index 1af6aa9..9db2b5c 100644
--- a/Rhapso/pipelines/ray/interest_point_detection.py
+++ b/Rhapso/pipelines/ray/interest_point_detection.py
@@ -13,9 +13,9 @@
 # This class implements the interest point detection pipeline
 
 class InterestPointDetection:
-    def __init__(self, dsxy, dsz, min_intensity, max_intensity, sigma, threshold, file_type, xml_file_path, 
-                 image_file_prefix, xml_output_file_path, n5_output_file_prefix, combine_distance, chunks_per_bound, run_type, 
-                 max_spots, median_filter):
+    def __init__(self, dsxy, dsz, min_intensity, max_intensity, sigma, threshold, file_type, xml_file_path,
+                 image_file_prefix, xml_output_file_path, n5_output_file_prefix, combine_distance, chunks_per_bound, run_type,
+                 max_spots, median_filter, overlapping_only=True):
         self.dsxy = dsxy
         self.dsz = dsz
         self.min_intensity = min_intensity
@@ -32,6 +32,7 @@ def __init__(self, dsxy, dsz, min_intensity, max_intensity, sigma, threshold, fi
         self.run_type = run_type
         self.max_spots = max_spots
         self.median_filter = median_filter
+        self.overlapping_only = overlapping_only
 
     def detection(self):
         # Get XML file
@@ -57,7 +58,7 @@ def detection(self):
         print("Transforms models have been created")
 
         # Use view transform matrices to find areas of overlap
-        overlap_detection = OverlapDetection(view_transform_matrices, dataframes, self.dsxy, self.dsz, self.image_file_prefix, self.file_type)
+        overlap_detection = OverlapDetection(view_transform_matrices, dataframes, self.dsxy, self.dsz, self.image_file_prefix, self.file_type, overlapping_only=self.overlapping_only)
         overlapping_area, new_dsxy, new_dsz, level, max_interval_size, mip_map_downsample = overlap_detection.run()
         print("Overlap detection is done")
 
diff --git a/Rhapso/pipelines/utils.py b/Rhapso/pipelines/utils.py
new file mode 100644
index 0000000..0b33c38
--- /dev/null
+++ b/Rhapso/pipelines/utils.py
@@ -0,0 +1,26 @@
+"""
+Utility functions for pipelines
+"""
+
+
+def fetch_local_xml(file_path):
+    """
+    Read XML content from a local file.
+
+    Parameters
+    ----------
+    file_path : str
+        Path to the XML file
+
+    Returns
+    -------
+    str
+        XML file contents
+    """
+    try:
+        with open(file_path, "r", encoding="utf-8") as file:
+            return file.read()
+    except FileNotFoundError:
+        raise FileNotFoundError(f"Could not find XML file at '{file_path}'")
+    except Exception as e:
+        raise RuntimeError(f"Error reading XML file at '{file_path}': {e}")
diff --git a/tests/XML_test_data/dataset_split.xml b/tests/XML_test_data/dataset_split.xml
new file mode 100644
index 0000000..9cc492f
--- /dev/null
+++ b/tests/XML_test_data/dataset_split.xml
@@ -0,0 +1,150 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<SpimData version="0.2">
+  <BasePath type="relative">.</BasePath>
+  <SequenceDescription>
+    <ViewSetups>
+      <ViewSetup>
+        <id>0</id>
+        <name>Tile 0</name>
+        <size>500 500 100</size>
+        <voxelSize>
+          <unit>um</unit>
+          <size>1.0 1.0 1.0</size>
+        </voxelSize>
+        <attributes>
+          <tile>0</tile>
+        </attributes>
+      </ViewSetup>
+      <ViewSetup>
+        <id>1</id>
+        <name>Tile 1</name>
+        <size>500 500 100</size>
+        <voxelSize>
+          <unit>um</unit>
+          <size>1.0 1.0 1.0</size>
+        </voxelSize>
+        <attributes>
+          <tile>1</tile>
+        </attributes>
+      </ViewSetup>
+      <ViewSetup>
+        <id>2</id>
+        <name>Tile 2</name>
+        <size>500 500 100</size>
+        <voxelSize>
+          <unit>um</unit>
+          <size>1.0 1.0 1.0</size>
+        </voxelSize>
+        <attributes>
+          <tile>2</tile>
+        </attributes>
+      </ViewSetup>
+      <ViewSetup>
+        <id>3</id>
+        <name>Tile 3</name>
+        <size>500 500 100</size>
+        <voxelSize>
+          <unit>um</unit>
+          <size>1.0 1.0 1.0</size>
+        </voxelSize>
+        <attributes>
+          <tile>3</tile>
+        </attributes>
+      </ViewSetup>
+    </ViewSetups>
+    <Timepoints type="range">
+      <first>0</first>
+      <last>0</last>
+    </Timepoints>
+    <MissingViews />
+  </SequenceDescription>
+  <ViewRegistrations>
+    <ViewRegistration timepoint="0" setup="0">
+      <ViewTransform type="affine">
+        <Name>calibration</Name>
+        <affine>1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0</affine>
+      </ViewTransform>
+      <ViewTransform type="affine">
+        <Name>Image Splitting</Name>
+        <affine>1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0</affine>
+      </ViewTransform>
+    </ViewRegistration>
+    <ViewRegistration timepoint="0" setup="1">
+      <ViewTransform type="affine">
+        <Name>calibration</Name>
+        <affine>1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0</affine>
+      </ViewTransform>
+      <ViewTransform type="affine">
+        <Name>Image Splitting</Name>
+        <affine>1.0 0.0 0.0 300.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0</affine>
+      </ViewTransform>
+    </ViewRegistration>
+    <ViewRegistration timepoint="0" setup="2">
+      <ViewTransform type="affine">
+        <Name>calibration</Name>
+        <affine>1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0</affine>
+      </ViewTransform>
+      <ViewTransform type="affine">
+        <Name>Image Splitting</Name>
+        <affine>1.0 0.0 0.0 0.0 0.0 1.0 0.0 300.0 0.0 0.0 1.0 0.0</affine>
+      </ViewTransform>
+    </ViewRegistration>
+    <ViewRegistration timepoint="0" setup="3">
+      <ViewTransform type="affine">
+        <Name>calibration</Name>
+        <affine>1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0</affine>
+      </ViewTransform>
+      <ViewTransform type="affine">
+        <Name>Image Splitting</Name>
+        <affine>1.0 0.0 0.0 300.0 0.0 1.0 0.0 300.0 0.0 0.0 1.0 0.0</affine>
+      </ViewTransform>
+    </ViewRegistration>
+  </ViewRegistrations>
+  <ImageLoader format="split.viewerimgloader">
+    <ImageLoader format="bdv.multimg.zarr" version="3.0">
+      <zarr type="absolute">s3://test-bucket/SPIM.ome.zarr/</zarr>
+      <zgroups>
+        <zgroup setup="0" tp="0" path="Tile_X_0000_Y_0000_Z_0000_ch_405.zarr" indicies="0 0" />
+      </zgroups>
+    </ImageLoader>
+    <SequenceDescription>
+      <ViewSetups>
+        <ViewSetup>
+          <id>0</id>
+          <name>Source Image</name>
+          <size>800 800 100</size>
+          <voxelSize>
+            <unit>um</unit>
+            <size>1.0 1.0 1.0</size>
+          </voxelSize>
+        </ViewSetup>
+      </ViewSetups>
+    </SequenceDescription>
+    <SetupIds>
+      <SetupIdDefinition>
+        <NewId>0</NewId>
+        <OldId>0</OldId>
+        <min>0 0 0</min>
+        <max>499 499 99</max>
+      </SetupIdDefinition>
+      <SetupIdDefinition>
+        <NewId>1</NewId>
+        <OldId>0</OldId>
+        <min>300 0 0</min>
+        <max>799 499 99</max>
+      </SetupIdDefinition>
+      <SetupIdDefinition>
+        <NewId>2</NewId>
+        <OldId>0</OldId>
+        <min>0 300 0</min>
+        <max>499 799 99</max>
+      </SetupIdDefinition>
+      <SetupIdDefinition>
+        <NewId>3</NewId>
+        <OldId>0</OldId>
+        <min>300 300 0</min>
+        <max>799 799 99</max>
+      </SetupIdDefinition>
+    </SetupIds>
+  </ImageLoader>
+</SpimData>
diff --git a/tests/test_data_prep/test_xml_to_dataframe.py b/tests/test_data_prep/test_xml_to_dataframe.py
index 6118b84..fee6318 100644
--- a/tests/test_data_prep/test_xml_to_dataframe.py
+++ b/tests/test_data_prep/test_xml_to_dataframe.py
@@ -67,28 +67,26 @@ def test_parse_view_interest_points(self):
         xml_content = fetch_local_xml(self.xml_content_standard)
         self.parser = XMLToDataFrame(xml_content)
         root = ET.fromstring(xml_content)
-        df = self.parser.parse_view_interest_points(root, "data_prep")
+        df = self.parser.parse_view_interest_points(root)
         self.assertTrue(df.empty)
 
     def test_run(self):
         xml_content = fetch_local_xml(self.xml_content_standard)
         self.parser = XMLToDataFrame(xml_content)
-        result = self.parser.run("data_prep")
+        result = self.parser.run()
         self.assertIn("image_loader", result)
         self.assertIn("view_setups", result)
         self.assertIn("view_registrations", result)
         self.assertIn("view_interest_points", result)
 
     def test_interest_points_already_exist(self):
+        """Test that existing interest points are parsed correctly"""
         xml_content = fetch_local_xml(self.xml_content_interestPoints)
         self.parser = XMLToDataFrame(xml_content)
         root = ET.fromstring(xml_content)
-        with self.assertRaises(Exception) as context:
-            self.parser.parse_view_interest_points(root, "data_prep")
-        self.assertEqual(
-            str(context.exception),
-            "There should be no interest points in this file yet.",
-        )
+        df = self.parser.parse_view_interest_points(root)
+        # Should parse existing interest points without raising an exception
+        self.assertIsInstance(df, pd.DataFrame)
 
     def test_no_labels(self):
         xml_content = fetch_local_xml(self.xml_content_no_tags)
@@ -130,6 +128,58 @@ def test_no_file_mapping_exists(self):
             self.parser.parse_image_loader_tiff(root)
         self.assertEqual(str(context.exception), "There are no files in this XML")
 
+    def test_parse_image_loader_split_zarr(self):
+        """Test split zarr parsing with 4 tiles"""
+        xml_path = "tests/XML_test_data/dataset_split.xml"
+        xml_content = fetch_local_xml(xml_path)
+        self.parser = XMLToDataFrame(xml_content)
+        result = self.parser.run()
+        df = result['image_loader']
+
+        # Should have 4 rows (one per split tile)
+        self.assertEqual(len(df), 4)
+
+        # Check required columns exist
+        expected_cols = {'view_setup', 'timepoint', 'crop_min', 'crop_max', 'zarr_base_path', 'file_path'}
+        self.assertTrue(expected_cols.issubset(set(df.columns)))
+
+        # Check values for each tile
+        self.assertEqual(df.iloc[0]['view_setup'], '0')
+        self.assertEqual(df.iloc[0]['crop_min'], '0 0 0')
+        self.assertEqual(df.iloc[0]['crop_max'], '499 499 99')
+        self.assertEqual(df.iloc[0]['zarr_base_path'], 's3://test-bucket/SPIM.ome.zarr/')
+
+        self.assertEqual(df.iloc[1]['view_setup'], '1')
+        self.assertEqual(df.iloc[1]['crop_min'], '300 0 0')
+        self.assertEqual(df.iloc[1]['crop_max'], '799 499 99')
+
+        self.assertEqual(df.iloc[2]['view_setup'], '2')
+        self.assertEqual(df.iloc[2]['crop_min'], '0 300 0')
+        self.assertEqual(df.iloc[2]['crop_max'], '499 799 99')
+
+        self.assertEqual(df.iloc[3]['view_setup'], '3')
+        self.assertEqual(df.iloc[3]['crop_min'], '300 300 0')
+        self.assertEqual(df.iloc[3]['crop_max'], '799 799 99')
+
+        # All rows should have same file_path and timepoint
+        self.assertEqual(df.iloc[0]['file_path'], df.iloc[3]['file_path'])
+        self.assertEqual(df.iloc[0]['timepoint'], df.iloc[3]['timepoint'])
+
+    def test_parse_view_setups_split(self):
+        """Test that outer ViewSetups are parsed correctly for split XML"""
+        xml_path = "tests/XML_test_data/dataset_split.xml"
+        xml_content = fetch_local_xml(xml_path)
+        self.parser = XMLToDataFrame(xml_content)
+        root = ET.fromstring(xml_content)
+        df = self.parser.parse_view_setups(root)
+
+        # Should have 4 rows (outer ViewSetups only, not the inner one)
+        self.assertEqual(len(df), 4)
+
+        # Check IDs
+        ids = sorted([df.iloc[i]['id'] for i in range(len(df))])
+        self.assertEqual(ids, ['0', '1', '2', '3'])
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/test_detection/test_image_reader.py b/tests/test_detection/test_image_reader.py
new file mode 100644
index 0000000..85f193a
--- /dev/null
+++ b/tests/test_detection/test_image_reader.py
@@ -0,0 +1,102 @@
+import unittest
+from unittest.mock import patch, MagicMock
+import dask.array as da
+import numpy as np
+
+from Rhapso.detection.image_reader import ImageReader
+
+
+class TestImageReader(unittest.TestCase):
+    def test_fetch_image_data_crop_applied_before_downsampling(self):
+        """Test that crop is applied (intended order: after transpose, before downsampling; dsxy=dsz=1 here)"""
+        reader = ImageReader(file_type='zarr')
+
+        # Create a mock record with crop bounds
+        record = {
+            'view_id': 'timepoint: 0, setup: 0',
+            'file_path': 's3://bucket/test.zarr/0',
+            'interval_key': ((0, 0, 0), (50, 50, 25), (51, 51, 26)),
+            'offset': 0,
+            'lb': (0, 0, 0),
+            'crop_min': [2, 2, 2],
+            'crop_max': [7, 7, 7]
+        }
+
+        # Mock the zarr opening to return a known dask array (10x10x10)
+        mock_array = da.ones((1, 1, 10, 10, 10), dtype=np.float32)
+
+        with patch('zarr.open') as mock_zarr, \
+             patch('s3fs.S3FileSystem'), \
+             patch('s3fs.S3Map'), \
+             patch('dask.array.from_zarr', return_value=mock_array):
+
+            # Call fetch_image_data
+            view_id, interval_key, chunk, offset, lower_bound = reader.fetch_image_data(
+                record, dsxy=1, dsz=1
+            )
+
+            # Verify crop was applied: array should be [2:8, 2:8, 2:8] = 6x6x6
+            self.assertEqual(chunk.shape, (6, 6, 6))
+            self.assertEqual(view_id, 'timepoint: 0, setup: 0')
+
+    def test_fetch_image_data_without_crop(self):
+        """Test backward compatibility: records whose crop_min/crop_max are None work normally"""
+        reader = ImageReader(file_type='zarr')
+
+        # Record with crop_min/crop_max explicitly set to None
+        record = {
+            'view_id': 'timepoint: 0, setup: 0',
+            'file_path': 's3://bucket/test.zarr/0',
+            'interval_key': ((0, 0, 0), (50, 50, 25), (51, 51, 26)),
+            'offset': 0,
+            'lb': (0, 0, 0),
+            'crop_min': None,
+            'crop_max': None
+        }
+
+        # Mock the zarr opening to return a known dask array
+        mock_array = da.ones((1, 1, 10, 10, 10), dtype=np.float32)
+
+        with patch('zarr.open') as mock_zarr, \
+             patch('s3fs.S3FileSystem'), \
+             patch('s3fs.S3Map'), \
+             patch('dask.array.from_zarr', return_value=mock_array):
+
+            # Call fetch_image_data - should not raise an error
+            view_id, interval_key, chunk, offset, lower_bound = reader.fetch_image_data(
+                record, dsxy=1, dsz=1
+            )
+
+            # Should succeed without crop
+            self.assertEqual(view_id, 'timepoint: 0, setup: 0')
+
+    def test_fetch_image_data_tiff_no_crop_error(self):
+        """Test that tiff mode without crop works (no changes to tiff path)"""
+        reader = ImageReader(file_type='tiff')
+
+        record = {
+            'view_id': 'timepoint: 0, setup: 0',
+            'file_path': '/path/to/test.tif',
+            'interval_key': ((0, 0, 0), (50, 50, 25), (51, 51, 26)),
+            'offset': 0,
+            'lb': (0, 0, 0),
+            'crop_min': None,
+            'crop_max': None
+        }
+
+        # Mock the BioImage reader
+        mock_bioimage = MagicMock()
+        mock_dask_array = da.ones((1, 1, 1, 10, 10, 10), dtype=np.float32)
+        mock_bioimage.get_dask_stack.return_value = mock_dask_array
+
+        with patch('Rhapso.detection.image_reader.CustomBioImage', return_value=mock_bioimage):
+            # Should not raise an error
+            view_id, interval_key, chunk, offset, lower_bound = reader.fetch_image_data(
+                record, dsxy=1, dsz=1
+            )
+
+            self.assertEqual(view_id, 'timepoint: 0, setup: 0')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_detection/test_metadata_builder.py b/tests/test_detection/test_metadata_builder.py
new file mode 100644
index 0000000..8602985
--- /dev/null
+++ b/tests/test_detection/test_metadata_builder.py
@@ -0,0 +1,152 @@
+import unittest
+import pandas as pd
+import numpy as np
+
+from Rhapso.detection.metadata_builder import MetadataBuilder
+
+
+class TestMetadataBuilder(unittest.TestCase):
+    def test_build_paths_split_uses_zarr_base_path(self):
+        """Test that split mode uses zarr_base_path for file path construction"""
+        # Create mock image_loader_df with split columns
+        image_loader_df = pd.DataFrame({
+            'view_setup': ['0', '1'],
+            'timepoint': ['0', '0'],
+            'file_path': ['Tile_X_0000_Y_0000_Z_0000_ch_405.zarr', 'Tile_X_0000_Y_0000_Z_0000_ch_405.zarr'],
+            'crop_min': ['0 0 0', '300 0 0'],
+            'crop_max': ['499 499 99', '799 499 99'],
+            'zarr_base_path': ['s3://bucket/SPIM.ome.zarr/', 's3://bucket/SPIM.ome.zarr/']
+        })
+
+        # Mock overlapping_area
+        overlapping_area = {
+            'timepoint: 0, setup: 0': [{'lower_bound': np.array([0, 0, 0]), 'upper_bound': np.array([100, 100, 50])}],
+            'timepoint: 0, setup: 1': [{'lower_bound': np.array([50, 0, 0]), 'upper_bound': np.array([150, 100, 50])}]
+        }
+
+        dataframes = {'image_loader': image_loader_df}
+        builder = MetadataBuilder(
+            dataframes=dataframes,
+            overlapping_area=overlapping_area,
+            image_file_prefix='s3://bucket/SPIM.ome.zarr/',
+            file_type='zarr',
+            dsxy=1.0,
+            dsz=1.0,
+            chunks_per_bound=1,
+            sigma=1.0,
+            run_type='ray',
+            level=0
+        )
+        builder.build_paths()
+
+        # Smoke check only: the path contains 'zarr'; this does not pin the zarr_base_path prefix
+        self.assertTrue(
+            'zarr' in builder.metadata[0]['file_path'],
+            f"File path should contain zarr path: {builder.metadata[0]['file_path']}"
+        )
+
+    def test_build_paths_split_passes_crop_bounds(self):
+        """Test that crop bounds are included in metadata records"""
+        image_loader_df = pd.DataFrame({
+            'view_setup': ['0'],
+            'timepoint': ['0'],
+            'file_path': ['Tile_X_0000_Y_0000_Z_0000_ch_405.zarr'],
+            'crop_min': ['0 0 0'],
+            'crop_max': ['499 499 99'],
+            'zarr_base_path': ['s3://bucket/SPIM.ome.zarr/']
+        })
+
+        overlapping_area = {
+            'timepoint: 0, setup: 0': [{'lower_bound': np.array([0, 0, 0]), 'upper_bound': np.array([100, 100, 50])}]
+        }
+
+        dataframes = {'image_loader': image_loader_df}
+        builder = MetadataBuilder(
+            dataframes=dataframes,
+            overlapping_area=overlapping_area,
+            image_file_prefix='s3://bucket/SPIM.ome.zarr/',
+            file_type='zarr',
+            dsxy=1.0,
+            dsz=1.0,
+            chunks_per_bound=0,  # No chunking
+            sigma=1.0,
+            run_type='ray',
+            level=0
+        )
+        builder.build_paths()
+
+        # Check that crop_min and crop_max are in metadata
+        self.assertIn('crop_min', builder.metadata[0])
+        self.assertIn('crop_max', builder.metadata[0])
+        self.assertEqual(builder.metadata[0]['crop_min'], [0, 0, 0])
+        self.assertEqual(builder.metadata[0]['crop_max'], [499, 499, 99])
+
+    def test_build_paths_split_scales_crop_bounds_by_level(self):
+        """Test that crop bounds are scaled down (divided) by 2^level"""
+        image_loader_df = pd.DataFrame({
+            'view_setup': ['0'],
+            'timepoint': ['0'],
+            'file_path': ['Tile_X_0000_Y_0000_Z_0000_ch_405.zarr'],
+            'crop_min': ['300 0 0'],
+            'crop_max': ['799 499 99'],
+            'zarr_base_path': ['s3://bucket/SPIM.ome.zarr/']
+        })
+
+        overlapping_area = {
+            'timepoint: 0, setup: 0': [{'lower_bound': np.array([0, 0, 0]), 'upper_bound': np.array([100, 100, 50])}]
+        }
+
+        dataframes = {'image_loader': image_loader_df}
+        # level=2 means scale by 2^2 = 4
+        builder = MetadataBuilder(
+            dataframes=dataframes,
+            overlapping_area=overlapping_area,
+            image_file_prefix='s3://bucket/SPIM.ome.zarr/',
+            file_type='zarr',
+            dsxy=1.0,
+            dsz=1.0,
+            chunks_per_bound=0,
+            sigma=1.0,
+            run_type='ray',
+            level=2
+        )
+        builder.build_paths()
+
+        # 300 // 4 = 75, 799 // 4 = 199, etc.
+        self.assertEqual(builder.metadata[0]['crop_min'], [75, 0, 0])
+        self.assertEqual(builder.metadata[0]['crop_max'], [199, 124, 24])
+
+    def test_build_paths_regular_zarr_no_crop(self):
+        """Test backward compatibility: regular zarr records have crop_min/crop_max set to None"""
+        image_loader_df = pd.DataFrame({
+            'view_setup': ['0'],
+            'timepoint': ['0'],
+            'file_path': ['test.zarr']
+        })
+
+        overlapping_area = {
+            'timepoint: 0, setup: 0': [{'lower_bound': np.array([0, 0, 0]), 'upper_bound': np.array([100, 100, 50])}]
+        }
+
+        dataframes = {'image_loader': image_loader_df}
+        builder = MetadataBuilder(
+            dataframes=dataframes,
+            overlapping_area=overlapping_area,
+            image_file_prefix='s3://bucket/',
+            file_type='zarr',
+            dsxy=1.0,
+            dsz=1.0,
+            chunks_per_bound=0,
+            sigma=1.0,
+            run_type='ray',
+            level=0
+        )
+        builder.build_paths()
+
+        # Regular zarr should have None for crop fields
+        self.assertIsNone(builder.metadata[0]['crop_min'])
+        self.assertIsNone(builder.metadata[0]['crop_max'])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_detection/test_overlap_detection.py b/tests/test_detection/test_overlap_detection.py
index e4f9e88..f6b7ad1 100644
--- a/tests/test_detection/test_overlap_detection.py
+++ b/tests/test_detection/test_overlap_detection.py
@@ -1,5 +1,4 @@
 import unittest
-<<<<<<< HEAD
 import numpy as np
 
 import pandas as pd
@@ -55,14 +54,6 @@ def test_find_overlapping_area_empty_dataframe(self):
         with self.assertRaises(ValueError) as context:
             self.od.find_overlapping_area()
         self.assertEqual(str(context.exception), "Image Loader dataframe is empty.")
-=======
-
-
-class TestOverlapDetecttion(unittest.TestCase):
-
-    def setUp(self):
-        pass
->>>>>>> main
 
 
 if __name__ == "__main__":

From 97948dba42bf9f8aa1daf302475908d98659a816 Mon Sep 17 00:00:00 2001
From: Sean McCulloch <86432671+seanmcculloch@users.noreply.github.com>
Date: Sat, 14 Feb 2026 16:30:16 -0800
Subject: [PATCH 2/6] chore: typo

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 Rhapso/data_prep/xml_to_dataframe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Rhapso/data_prep/xml_to_dataframe.py b/Rhapso/data_prep/xml_to_dataframe.py
index f165cb8..5271829 100644
--- a/Rhapso/data_prep/xml_to_dataframe.py
+++ b/Rhapso/data_prep/xml_to_dataframe.py
@@ -2,7 +2,7 @@
 import xml.etree.ElementTree as ET
 import re
 
-# This component recieves an XML file containing Tiff or Zarr image metadata and converts
+# This component receives an XML file containing Tiff or Zarr image metadata and converts
 # it into several Dataframes
 
 class XMLToDataFrame:

From 080293391336be7c5d23cd94087a1b7264ed3d25 Mon Sep 17 00:00:00 2001
From: Sean McCulloch <86432671+seanmcculloch@users.noreply.github.com>
Date: Sat, 14 Feb 2026 16:31:10 -0800
Subject: [PATCH 3/6] docs: inline comment on channel parsing

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 Rhapso/data_prep/xml_to_dataframe.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/Rhapso/data_prep/xml_to_dataframe.py b/Rhapso/data_prep/xml_to_dataframe.py
index 5271829..2b2f951 100644
--- a/Rhapso/data_prep/xml_to_dataframe.py
+++ b/Rhapso/data_prep/xml_to_dataframe.py
@@ -144,11 +144,14 @@ def parse_image_loader_split_zarr(self, root):
                 )
             tp, zgroup_path = zgroup_lookup[old_id]
 
-            # Extract channel using regex to handle both .zarr and .ome.zarr
+            # Attempt to extract the channel from the path, assuming filenames include '_ch_<number>'
+            # (e.g. both '.zarr' and '.ome.zarr' variants). If this pattern is not present or is
+            # formatted differently, we deliberately fall back to channel 0 as a default.
             channel_match = re.search(r'_ch_(\d+)', zgroup_path)
             if channel_match:
                 channel = channel_match.group(1)
             else:
+                # Default to channel 0 when channel information cannot be parsed from the path.
                 channel = 0
 
             image_loader_data.append({

From f867da6652a7fa724aa63ba01baac92cde10fc33 Mon Sep 17 00:00:00 2001
From: Sean McCulloch <86432671+seanmcculloch@users.noreply.github.com>
Date: Sat, 14 Feb 2026 16:31:50 -0800
Subject: [PATCH 4/6] chore: .format to f string

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 Rhapso/data_prep/xml_to_dataframe.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/Rhapso/data_prep/xml_to_dataframe.py b/Rhapso/data_prep/xml_to_dataframe.py
index 2b2f951..1a13b4b 100644
--- a/Rhapso/data_prep/xml_to_dataframe.py
+++ b/Rhapso/data_prep/xml_to_dataframe.py
@@ -137,10 +137,8 @@ def parse_image_loader_split_zarr(self, root):
 
             if old_id not in zgroup_lookup:
                 raise ValueError(
-                    "SetupIdDefinition refers to OldId {!r} that is not present in the "
-                    "inner loader's zgroups. Available setup ids: {}".format(
-                        old_id, sorted(zgroup_lookup.keys())
-                    )
+                    f"SetupIdDefinition refers to OldId {old_id!r} that is not present in the "
+                    f"inner loader's zgroups. Available setup ids: {sorted(zgroup_lookup.keys())}"
                 )
             tp, zgroup_path = zgroup_lookup[old_id]
 

From 6dc5000a4f3da9451cfa750dc4351d5de8e29e37 Mon Sep 17 00:00:00 2001
From: Copilot <198982749+Copilot@users.noreply.github.com>
Date: Sat, 14 Feb 2026 18:27:36 -0800
Subject: [PATCH 5/6] chore: Use os.path.join for path construction in
 metadata_builder (#168)

* Initial plan

* Use os.path.join for path construction in metadata_builder

Co-authored-by: seanmcculloch <86432671+seanmcculloch@users.noreply.github.com>

* Use os.path.join consistently for all path construction

Co-authored-by: seanmcculloch <86432671+seanmcculloch@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: seanmcculloch <86432671+seanmcculloch@users.noreply.github.com>
---
 Rhapso/detection/metadata_builder.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/Rhapso/detection/metadata_builder.py b/Rhapso/detection/metadata_builder.py
index b03f126..b4ef913 100644
--- a/Rhapso/detection/metadata_builder.py
+++ b/Rhapso/detection/metadata_builder.py
@@ -1,3 +1,4 @@
+import os
 import numpy as np
 
 """
@@ -123,11 +124,11 @@ def build_paths(self):
 
             if self.file_type == 'zarr':
                 if is_split:
-                    file_path = row['zarr_base_path'] + row['file_path'] + f'/{self.level}'
+                    file_path = os.path.join(row['zarr_base_path'], row['file_path'], str(self.level))
                 else:
-                    file_path = self.image_file_prefix + row['file_path'] + f'/{self.level}'
+                    file_path = os.path.join(self.image_file_prefix, row['file_path'], str(self.level))
             elif self.file_type == 'tiff':
-                file_path = self.image_file_prefix + row['file_path']
+                file_path = os.path.join(self.image_file_prefix, row['file_path'])
             else:
                 raise ValueError(f"Unsupported file_type: {self.file_type!r}")
 

From a82474c62a3aa6f745b3d8f3e71de5f4907eb8f0 Mon Sep 17 00:00:00 2001
From: Copilot <198982749+Copilot@users.noreply.github.com>
Date: Sat, 14 Feb 2026 20:12:08 -0800
Subject: [PATCH 6/6] test: Add crop bounds validation for split XML IPD
 implementation (#169)

* Initial plan

* Add crop bounds validation with comprehensive tests

Co-authored-by: seanmcculloch <86432671+seanmcculloch@users.noreply.github.com>

* Enhance test to verify complete error message with shape

Co-authored-by: seanmcculloch <86432671+seanmcculloch@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: seanmcculloch <86432671+seanmcculloch@users.noreply.github.com>
---
 Rhapso/detection/image_reader.py          | 20 ++++++
 tests/test_detection/test_image_reader.py | 83 +++++++++++++++++++++++
 2 files changed, 103 insertions(+)

diff --git a/Rhapso/detection/image_reader.py b/Rhapso/detection/image_reader.py
index b4d0ab5..c01640e 100644
--- a/Rhapso/detection/image_reader.py
+++ b/Rhapso/detection/image_reader.py
@@ -100,6 +100,26 @@ def fetch_image_data(self, record, dsxy, dsz):
                     f"crop_min and crop_max must both be length 3 for 3D cropping; "
                     f"got crop_min={crop_min}, crop_max={crop_max}"
                 )
+            
+            # Validate crop bounds are within array dimensions
+            array_shape = dask_array.shape
+            for i in range(3):
+                if crop_min[i] < 0:
+                    raise ValueError(
+                        f"crop_min[{i}]={crop_min[i]} is negative; "
+                        f"crop bounds must be non-negative"
+                    )
+                if crop_max[i] >= array_shape[i]:
+                    raise ValueError(
+                        f"crop_max[{i}]={crop_max[i]} exceeds array dimension {i} "
+                        f"(shape={array_shape[i]}); crop_max must be < array shape"
+                    )
+                if crop_min[i] > crop_max[i]:
+                    raise ValueError(
+                        f"crop_min[{i}]={crop_min[i]} > crop_max[{i}]={crop_max[i]}; "
+                        f"crop_min must be <= crop_max"
+                    )
+            
             dask_array = dask_array[
                 crop_min[0]:crop_max[0] + 1,
                 crop_min[1]:crop_max[1] + 1,
diff --git a/tests/test_detection/test_image_reader.py b/tests/test_detection/test_image_reader.py
index 85f193a..e0c10cc 100644
--- a/tests/test_detection/test_image_reader.py
+++ b/tests/test_detection/test_image_reader.py
@@ -97,6 +97,89 @@ def test_fetch_image_data_tiff_no_crop_error(self):
 
             self.assertEqual(view_id, 'timepoint: 0, setup: 0')
 
+    def test_fetch_image_data_crop_bounds_validation(self):
+        """Test that crop bounds exceeding array dimensions raise clear error"""
+        reader = ImageReader(file_type='zarr')
+
+        # Record with crop_max exceeding array dimensions
+        record = {
+            'view_id': 'timepoint: 0, setup: 0',
+            'file_path': 's3://bucket/test.zarr/0',
+            'interval_key': ((0, 0, 0), (50, 50, 25), (51, 51, 26)),
+            'offset': 0,
+            'lb': (0, 0, 0),
+            'crop_min': [0, 0, 0],
+            'crop_max': [15, 5, 5]  # Exceeds dimension 0 (10x10x10 array)
+        }
+
+        # Mock the zarr opening to return a known dask array (10x10x10)
+        mock_array = da.ones((1, 1, 10, 10, 10), dtype=np.float32)
+
+        with patch('zarr.open') as mock_zarr, \
+             patch('s3fs.S3FileSystem'), \
+             patch('s3fs.S3Map'), \
+             patch('dask.array.from_zarr', return_value=mock_array):
+
+            # Should raise ValueError with clear message
+            with self.assertRaises(ValueError) as context:
+                reader.fetch_image_data(record, dsxy=1, dsz=1)
+            
+            error_msg = str(context.exception)
+            self.assertIn('crop_max[0]=15 exceeds array dimension 0', error_msg)
+            self.assertIn('(shape=10)', error_msg)
+
+    def test_fetch_image_data_negative_crop_min(self):
+        """Test that negative crop_min values raise clear error"""
+        reader = ImageReader(file_type='zarr')
+
+        record = {
+            'view_id': 'timepoint: 0, setup: 0',
+            'file_path': 's3://bucket/test.zarr/0',
+            'interval_key': ((0, 0, 0), (50, 50, 25), (51, 51, 26)),
+            'offset': 0,
+            'lb': (0, 0, 0),
+            'crop_min': [-1, 0, 0],
+            'crop_max': [5, 5, 5]
+        }
+
+        mock_array = da.ones((1, 1, 10, 10, 10), dtype=np.float32)
+
+        with patch('zarr.open') as mock_zarr, \
+             patch('s3fs.S3FileSystem'), \
+             patch('s3fs.S3Map'), \
+             patch('dask.array.from_zarr', return_value=mock_array):
+
+            with self.assertRaises(ValueError) as context:
+                reader.fetch_image_data(record, dsxy=1, dsz=1)
+            
+            self.assertIn('crop_min[0]=-1 is negative', str(context.exception))
+
+    def test_fetch_image_data_crop_min_greater_than_crop_max(self):
+        """Test that crop_min > crop_max raises clear error"""
+        reader = ImageReader(file_type='zarr')
+
+        record = {
+            'view_id': 'timepoint: 0, setup: 0',
+            'file_path': 's3://bucket/test.zarr/0',
+            'interval_key': ((0, 0, 0), (50, 50, 25), (51, 51, 26)),
+            'offset': 0,
+            'lb': (0, 0, 0),
+            'crop_min': [5, 0, 0],
+            'crop_max': [3, 5, 5]  # crop_min[0] > crop_max[0]
+        }
+
+        mock_array = da.ones((1, 1, 10, 10, 10), dtype=np.float32)
+
+        with patch('zarr.open') as mock_zarr, \
+             patch('s3fs.S3FileSystem'), \
+             patch('s3fs.S3Map'), \
+             patch('dask.array.from_zarr', return_value=mock_array):
+
+            with self.assertRaises(ValueError) as context:
+                reader.fetch_image_data(record, dsxy=1, dsz=1)
+            
+            self.assertIn('crop_min[0]=5 > crop_max[0]=3', str(context.exception))
+
 
 if __name__ == "__main__":
     unittest.main()