Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@ ipython_config.py
# install all needed dependencies.
#Pipfile.lock

# uv
uv.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

Expand Down
95 changes: 89 additions & 6 deletions Rhapso/data_prep/xml_to_dataframe.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import pandas as pd
import xml.etree.ElementTree as ET
import re

# This component recieves an XML file containing Tiff or Zarr image metadata and converts
# This component receives an XML file containing Tiff or Zarr image metadata and converts
# it into several Dataframes

class XMLToDataFrame:
Expand Down Expand Up @@ -83,17 +84,99 @@ def parse_image_loader_tiff(self, root):
# Convert the list to a DataFrame and return
return pd.DataFrame(image_loader_data)

def parse_image_loader_split_zarr(self):
pass
def parse_image_loader_split_zarr(self, root):
    """
    Parses a split.viewerimgloader XML structure where a single source image is virtually
    subdivided into overlapping tiles via SetupIdDefinitions.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Root element of the parsed XML.

    Returns
    -------
    pd.DataFrame
        One row per split tile with columns: view_setup, timepoint, series, channel,
        file_path, crop_min, crop_max, zarr_base_path.

    Raises
    ------
    ValueError
        If the expected split.viewerimgloader structure is missing or malformed.
    """
    outer_loader = root.find(".//ImageLoader[@format='split.viewerimgloader']")
    if outer_loader is None:
        raise ValueError(
            "split.viewerimgloader ImageLoader node not found in XML; "
            "ensure the XML contains an ImageLoader with format='split.viewerimgloader'."
        )

    inner_loader = outer_loader.find("ImageLoader")
    if inner_loader is None:
        raise ValueError(
            "Nested ImageLoader node not found inside split.viewerimgloader configuration."
        )

    zarr_elem = inner_loader.find("zarr")
    if zarr_elem is None or zarr_elem.text is None:
        raise ValueError(
            "<zarr> node with base path is missing from split.viewerimgloader configuration."
        )

    zarr_base_path = zarr_elem.text.strip()

    # Build lookup from source setup id to (timepoint, zgroup_path)
    zgroup_lookup = {}
    for zg in inner_loader.findall(".//zgroups/zgroup"):
        setup = zg.get("setup")
        # Accept either attribute spelling for the timepoint ('tp' or 'timepoint').
        tp = zg.get("tp") or zg.get("timepoint")
        path = zg.get("path")
        if path is None:
            # A path-less zgroup cannot be resolved to a file; fail early with
            # a clear message instead of crashing later in the regex search.
            raise ValueError(f"zgroup for setup {setup!r} is missing the 'path' attribute.")
        zgroup_lookup[setup] = (tp, path)

    image_loader_data = []
    for sid in outer_loader.findall(".//SetupIds/SetupIdDefinition"):
        # Use findtext + explicit checks so a malformed definition raises a clear
        # ValueError rather than an AttributeError on a missing child element.
        fields = {}
        for tag in ("NewId", "OldId", "min", "max"):
            text = sid.findtext(tag)
            if text is None:
                raise ValueError(
                    f"SetupIdDefinition is missing required child element <{tag}>."
                )
            fields[tag] = text.strip()
        new_id = fields["NewId"]
        old_id = fields["OldId"]
        crop_min = fields["min"]
        crop_max = fields["max"]

        if old_id not in zgroup_lookup:
            raise ValueError(
                f"SetupIdDefinition refers to OldId {old_id!r} that is not present in the "
                f"inner loader's zgroups. Available setup ids: {sorted(zgroup_lookup.keys())}"
            )
        tp, zgroup_path = zgroup_lookup[old_id]

        # Attempt to extract the channel from the path, assuming filenames include '_ch_<number>'
        # (e.g. both '.zarr' and '.ome.zarr' variants). If this pattern is not present or is
        # formatted differently, we deliberately fall back to channel 0 as a default.
        # Cast to int so the 'channel' column has a consistent dtype (previously a str
        # was stored on match but an int 0 on fallback, mixing types in one column).
        channel_match = re.search(r'_ch_(\d+)', zgroup_path)
        channel = int(channel_match.group(1)) if channel_match else 0

        image_loader_data.append({
            "view_setup": new_id,
            "timepoint": tp,
            "series": 1,
            "channel": channel,
            "file_path": zgroup_path,
            "crop_min": crop_min,
            "crop_max": crop_max,
            "zarr_base_path": zarr_base_path,
        })

    return pd.DataFrame(image_loader_data)

def route_image_loader(self, root):
    """
    Directs the XML parsing process based on the image loader format specified in the XML.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Root element of the parsed XML.

    Returns
    -------
    pd.DataFrame
        Image-loader metadata produced by the format-specific parser.

    Raises
    ------
    ValueError
        If no <ImageLoader> element is present in the XML.
    """
    format_node = root.find(".//ImageLoader")
    # Check for the node's existence BEFORE reading its attributes so a missing
    # loader raises a clear ValueError instead of an AttributeError.
    if format_node is None:
        raise ValueError("No <ImageLoader> element found in XML; cannot determine image loader format.")

    # Normalize to lowercase for case-insensitive matching; a missing 'format'
    # attribute becomes "" and falls through to the generic zarr parser.
    format_type = (format_node.get("format") or "").lower()
    if "split" in format_type:
        return self.parse_image_loader_split_zarr(root)
    elif "filemap" in format_type:
        return self.parse_image_loader_tiff(root)
    else:
        return self.parse_image_loader_zarr(root)
Expand All @@ -104,7 +187,7 @@ def parse_view_setups(self, root):
"""
viewsetups_data = []

for vs in root.findall(".//ViewSetup"):
for vs in root.findall("./SequenceDescription/ViewSetups/ViewSetup"):
id_ = vs.find("id").text
# name = vs.find("name").text
name = vs.findtext("name")
Expand Down
35 changes: 35 additions & 0 deletions Rhapso/detection/image_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,41 @@ def fetch_image_data(self, record, dsxy, dsz):
dask_array = dask_array.astype(np.float32)
dask_array = dask_array.transpose()

# Apply split tile crop if present
crop_min = record.get('crop_min')
crop_max = record.get('crop_max')
if crop_min is not None and crop_max is not None:
if len(crop_min) != 3 or len(crop_max) != 3:
raise ValueError(
f"crop_min and crop_max must both be length 3 for 3D cropping; "
f"got crop_min={crop_min}, crop_max={crop_max}"
)

# Validate crop bounds are within array dimensions
array_shape = dask_array.shape
for i in range(3):
if crop_min[i] < 0:
raise ValueError(
f"crop_min[{i}]={crop_min[i]} is negative; "
f"crop bounds must be non-negative"
)
if crop_max[i] >= array_shape[i]:
raise ValueError(
f"crop_max[{i}]={crop_max[i]} exceeds array dimension {i} "
f"(shape={array_shape[i]}); crop_max must be < array shape"
)
if crop_min[i] > crop_max[i]:
raise ValueError(
f"crop_min[{i}]={crop_min[i]} > crop_max[{i}]={crop_max[i]}; "
f"crop_min must be <= crop_max"
)

dask_array = dask_array[
crop_min[0]:crop_max[0] + 1,
crop_min[1]:crop_max[1] + 1,
crop_min[2]:crop_max[2] + 1
]
Comment on lines +97 to +127
Copy link

Copilot AI Feb 13, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No validation that crop bounds are within the array dimensions. If crop_min or crop_max values exceed the array shape after transpose, Dask will raise an IndexError. Consider adding validation to provide a clearer error message, for example: check that crop_max[i] < dask_array.shape[i] for each dimension before applying the crop.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@copilot open a new pull request to apply changes based on this feedback


# Downsample Dask array
downsampled_stack = self.interface_downsampling(dask_array, dsxy, dsz)

Expand Down
47 changes: 35 additions & 12 deletions Rhapso/detection/metadata_builder.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import numpy as np

"""
Expand All @@ -21,7 +22,7 @@ def __init__(self, dataframes, overlapping_area, image_file_prefix, file_type, d
self.sub_region_chunking = not chunks_per_bound == 0
self.metadata = []

def build_image_metadata(self, process_intervals, file_path, view_id):
def build_image_metadata(self, process_intervals, file_path, view_id, crop_min=None, crop_max=None):
"""
Builds list of metadata with optional sub-chunking
"""
Expand All @@ -41,7 +42,9 @@ def build_image_metadata(self, process_intervals, file_path, view_id):
'file_path': file_path,
'interval_key': interval_key,
'offset': 0,
'lb': lb_fixed
'lb': lb_fixed,
'crop_min': crop_min,
'crop_max': crop_max
})

# Apply sub-region chunking
Expand Down Expand Up @@ -73,8 +76,10 @@ def build_image_metadata(self, process_intervals, file_path, view_id):
'file_path': file_path,
'interval_key': interval_key,
'offset': z,
'lb' : lb
})
'lb' : lb,
'crop_min': crop_min,
'crop_max': crop_max
})

elif self.file_type == "zarr":

Expand Down Expand Up @@ -102,26 +107,44 @@ def build_image_metadata(self, process_intervals, file_path, view_id):
'file_path': file_path,
'interval_key': interval_key,
'offset': z,
'lb' : lb
})

'lb' : lb,
'crop_min': crop_min,
'crop_max': crop_max
})

def build_paths(self):
    """
    Iterates through views to interface metadata building.

    For each row of ``image_loader_df`` this resolves the on-disk path for the
    view and, for split tiles (rows carrying ``crop_min``/``crop_max`` columns),
    rescales the crop bounds to the current resolution level before delegating
    to ``build_image_metadata``.

    Raises
    ------
    ValueError
        If ``file_type`` or ``run_type`` is unsupported.
    """
    # Split datasets are identified by the presence of the crop columns.
    is_split = 'crop_min' in self.image_loader_df.columns

    for _, row in self.image_loader_df.iterrows():
        view_id = f"timepoint: {row['timepoint']}, setup: {row['view_setup']}"
        process_intervals = self.overlapping_area[view_id]

        if self.file_type == 'zarr':
            if is_split:
                # Split tiles carry their own zarr base path per row.
                file_path = os.path.join(row['zarr_base_path'], row['file_path'], str(self.level))
            else:
                file_path = os.path.join(self.image_file_prefix, row['file_path'], str(self.level))
        elif self.file_type == 'tiff':
            file_path = os.path.join(self.image_file_prefix, row['file_path'])
        else:
            raise ValueError(f"Unsupported file_type: {self.file_type!r}")

        # Extract and scale crop bounds for split tiles. Bounds are stored as
        # whitespace-separated full-resolution voxel coordinates; each pyramid
        # level is assumed to downsample by a factor of 2 per axis — TODO confirm.
        crop_min = None
        crop_max = None
        if is_split:
            scale = 2 ** self.level if self.level is not None else 1
            crop_min = [int(v) // scale for v in row['crop_min'].split()]
            # For inclusive bounds, use a ceil-style mapping for crop_max so the
            # scaled region never shrinks below the original coverage.
            crop_max = [int(np.ceil((int(v) + 1) / scale) - 1) for v in row['crop_max'].split()]

        if self.run_type == 'ray':
            self.build_image_metadata(process_intervals, file_path, view_id, crop_min, crop_max)
        else:
            raise ValueError(f"Unsupported run type: {self.run_type!r}")

Expand Down
Loading