From 78f26b9d27bd7887444f7063de1b8abca882c0db Mon Sep 17 00:00:00 2001 From: scott Date: Thu, 4 Jan 2024 21:54:52 -0500 Subject: [PATCH 01/21] add `DatasetReader/Writer` protocols --- src/dolphin/_readers.py | 70 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 66 insertions(+), 4 deletions(-) diff --git a/src/dolphin/_readers.py b/src/dolphin/_readers.py index b632e05c..85d80024 100644 --- a/src/dolphin/_readers.py +++ b/src/dolphin/_readers.py @@ -2,9 +2,10 @@ from os import fspath from pathlib import Path -from typing import Generator, Optional, Sequence +from typing import Generator, Optional, Protocol, Sequence, runtime_checkable import numpy as np +from numpy.typing import ArrayLike from osgeo import gdal from dolphin import io, utils @@ -15,7 +16,70 @@ __all__ = ["VRTStack"] -class VRTStack: +@runtime_checkable +class InputDataset(Protocol): + """An array-like interface for reading input datasets. + + `InputDataset` defines the abstract interface that types must conform to in order + to be valid inputs to the ``snaphu.unwrap()`` function. Such objects must export + NumPy-like `dtype`, `shape`, and `ndim` attributes and must support NumPy-style + slice-based indexing. + + Note that this protol allows objects to be passed to `dask.array.from_array` + which needs `.shape`, `.ndim`, `.dtype` and support numpy-style slicing. + + See Also + -------- + OutputDataset + """ + + @property + def dtype(self) -> np.dtype: + """numpy.dtype : Data-type of the array's elements.""" # noqa: D403 + + @property + def shape(self) -> tuple[int, ...]: + """tuple of int : Tuple of array dimensions.""" # noqa: D403 + + @property + def ndim(self) -> int: + """int : Number of array dimensions.""" # noqa: D403 + + def __getitem__(self, key: slice | tuple[slice, ...], /) -> ArrayLike: + """Read a block of data.""" + + +@runtime_checkable +class OutputDataset(Protocol): + """An array-like interface for writing output datasets. + + `OutputDataset` defines the abstract interface that types must conform to in order + to be valid outputs of the ``snaphu.unwrap()`` function. Such objects must export + NumPy-like `dtype`, `shape`, and `ndim` attributes and must support NumPy-style + slice-based indexing. + + See Also + -------- + InputDataset + """ + + @property + def dtype(self) -> np.dtype: + """numpy.dtype : Data-type of the array's elements.""" # noqa: D403 + + @property + def shape(self) -> tuple[int, ...]: + """tuple of int : Tuple of array dimensions.""" # noqa: D403 + + @property + def ndim(self) -> int: + """int : Number of array dimensions.""" # noqa: D403 + + def __setitem__(self, key: slice | tuple[slice, ...], value: np.ndarray, /) -> None: + """Write a block of data.""" + + +class VRTStack(InputDataset): """Class for creating a virtual stack of raster files. Attributes @@ -258,8 +322,6 @@ def __eq__(self, other): and self.outfile == other.outfile ) - # To allow VRTStack to be passed to `dask.array.from_array`, we need: - # .shape, .ndim, .dtype and support numpy-style slicing. 
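A minimal sketch of what the `@runtime_checkable` decorator buys here, assuming `dolphin` is importable with this patch applied (the protocol is named `InputDataset` at this point; a later patch in the series renames it to `DatasetReader`): `isinstance()` becomes a purely structural check against the protocol members `dtype`, `shape`, `ndim`, and `__getitem__`, so no inheritance is required.

import numpy as np

from dolphin._readers import InputDataset  # renamed to `DatasetReader` later in this series

# Any object exposing dtype/shape/ndim and slice-based __getitem__ matches,
# including a plain NumPy array.
arr = np.zeros((5, 128, 128), dtype=np.complex64)
print(isinstance(arr, InputDataset))  # True: structural match, no subclassing needed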
@property def ndim(self): return 3 From 9239724e733a92a46cf757eb996ffd1bddaec412 Mon Sep 17 00:00:00 2001 From: scott Date: Sat, 6 Jan 2024 22:56:43 -0500 Subject: [PATCH 02/21] add working binary stack reader --- src/dolphin/_readers.py | 175 ++++++++++++++++++++++++++++++++-------- 1 file changed, 141 insertions(+), 34 deletions(-) diff --git a/src/dolphin/_readers.py b/src/dolphin/_readers.py index 85d80024..42096849 100644 --- a/src/dolphin/_readers.py +++ b/src/dolphin/_readers.py @@ -1,5 +1,8 @@ from __future__ import annotations +import mmap +from concurrent.futures import ThreadPoolExecutor +from dataclasses import dataclass from os import fspath from pathlib import Path from typing import Generator, Optional, Protocol, Sequence, runtime_checkable @@ -17,69 +20,173 @@ @runtime_checkable -class InputDataset(Protocol): +class DatasetReader(Protocol): """An array-like interface for reading input datasets. - `InputDataset` defines the abstract interface that types must conform to in order + `DatasetReader` defines the abstract interface that types must conform to in order to be valid inputs to the ``snaphu.unwrap()`` function. Such objects must export NumPy-like `dtype`, `shape`, and `ndim` attributes and must support NumPy-style slice-based indexing. Note that this protol allows objects to be passed to `dask.array.from_array` which needs `.shape`, `.ndim`, `.dtype` and support numpy-style slicing. - - See Also - -------- - OutputDataset """ - @property - def dtype(self) -> np.dtype: - """numpy.dtype : Data-type of the array's elements.""" # noqa: D403 + dtype: np.dtype + """numpy.dtype : Data-type of the array's elements.""" - @property - def shape(self) -> tuple[int, ...]: - """tuple of int : Tuple of array dimensions.""" # noqa: D403 + shape: tuple[int, ...] + """tuple of int : Tuple of array dimensions.""" - @property - def ndim(self) -> int: - """int : Number of array dimensions.""" # noqa: D403 + ndim: int + """int : Number of array dimensions.""" - def __getitem__(self, key: slice | tuple[slice, ...], /) -> ArrayLike: + def __getitem__(self, key: tuple[slice, ...], /) -> ArrayLike: """Read a block of data.""" + ... -@runtime_checkable -class OutputDataset(Protocol): - """An array-like interface for writing output datasets. +def _test_mm(filepath, shape, dtype): + with Path(filepath).open("rb") as f: + with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm: + return np.frombuffer(mm, dtype=dtype).reshape(shape).shape - `OutputDataset` defines the abstract interface that types must conform to in order - to be valid outputs of the ``snaphu.unwrap()`` function. Such objects must export - NumPy-like `dtype`, `shape`, and `ndim` attributes and must support NumPy-style - slice-based indexing. + +@dataclass +class BinaryFile(DatasetReader): + """A flat binary file for storing array data. See Also -------- - InputDataset + HDF5Dataset + RasterBand + + Notes + ----- + This class does not store an open file object. Instead, the file is opened on-demand + for reading or writing and closed immediately after each read/write operation. This + allows multiple spawned processes to write to the file in coordination (as long as a + suitable mutex is used to guard file access.) """ - @property - def dtype(self) -> np.dtype: - """numpy.dtype : Data-type of the array's elements.""" # noqa: D403 + filepath: Path + """pathlib.Path : The file path.""" - @property - def shape(self) -> tuple[int, ...]: - """tuple of int : Tuple of array dimensions.""" # noqa: D403 + shape: tuple[int, ...] 
+ dtype: np.dtype + + def __post_init__(self): + self.filepath = Path(self.filepath) + if not self.filepath.exists(): + raise FileNotFoundError(f"File {self.filepath} does not exist.") + self.dtype = np.dtype(self.dtype) + self.filepath = Path(self.filepath) @property - def ndim(self) -> int: + def ndim(self) -> int: # type: ignore[override] """int : Number of array dimensions.""" # noqa: D403 + return len(self.shape) + + def __getitem__(self, key: tuple[slice, ...], /) -> np.ndarray: + with self.filepath.open("rb") as f: + # Memory-map the entire file. + with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm: + # In order to safely close the memory-map, there can't be any dangling + # references to it, so we return a copy (not a view) of the requested + # data and decref the array object. + arr = np.frombuffer(mm, dtype=self.dtype).reshape(self.shape) + data = arr[key].copy() + del arr + return data + + def __array__(self) -> np.ndarray: + return self[:,] + + @classmethod + def from_gdal(cls, filename: Filename, band: int = 1) -> BinaryFile: + """Create a BinaryFile from a GDAL-readable file. + + Parameters + ---------- + filename : Filename + Path to the file to read. + band : int, optional + Band to read from the file, by default 1 + + Returns + ------- + BinaryFile + The BinaryFile object. + """ + import rasterio as rio - def __setitem__(self, key: slice | tuple[slice, ...], value: np.ndarray, /) -> None: - """Write a block of data.""" + with rio.open(filename) as src: + dtype = src.dtypes[band - 1] + shape = src.shape + return cls(Path(filename), shape=shape, dtype=dtype) + + +@dataclass +class BinaryStack(DatasetReader): + """A stack of binary files. + + Parameters + ---------- + file_list : list[Filename] + List of paths to files to read. + shape : tuple[int, int] + Shape of each file. + dtype : np.dtype + Data type of each file. + keep_open : bool, optional + Keep the files open for reading, by default False + """ + + file_list: Sequence[Filename] + shape_2d: tuple[int, int] + dtype: np.dtype + num_threads: int = 1 + + def __post_init__(self): + self.readers = [ + BinaryFile( + f, shape=self.shape_2d, dtype=self.dtype, keep_open=self.keep_open + ) + for f in self.file_list + ] + + def __getitem__(self, key: tuple[slice, ...], /) -> np.ndarray: + # Check that it's a tuple of slices + if not (isinstance(key, tuple) and len(key) == 3): + raise TypeError("Index must be a tuple of 3 slices.") + # unpack the slices + bands, rows, cols = key + if isinstance(bands, slice): + # convert the bands to 0-indexed list + band_idxs = list(range(*bands.indices(len(self)))) + elif isinstance(bands, int): + band_idxs = [bands] + else: + raise ValueError("Band index must be an integer or slice.") + + # Get only the bands we need + if self.num_threads == 1: + return np.stack([self.readers[i][rows, cols] for i in band_idxs], axis=0) + else: + # test read 1 + with ThreadPoolExecutor(max_workers=self.num_threads) as executor: + results = executor.map(lambda i: self.readers[i][rows, cols], band_idxs) + return np.stack(list(results), axis=0) + + def __len__(self): + return len(self.file_list) + + @property + def shape(self): + return (len(self), *self.shape_2d) -class VRTStack(InputDataset): +class VRTStack(DatasetReader): """Class for creating a virtual stack of raster files. 
Attributes From 937695e78c16d5c4d09ea28066f857e55a3c0f7b Mon Sep 17 00:00:00 2001 From: scott Date: Tue, 9 Jan 2024 17:41:12 -0500 Subject: [PATCH 03/21] add rasterio as dep --- .github/workflows/test-build-push.yml | 1 + conda-env.yml | 1 + requirements.txt | 1 + 3 files changed, 3 insertions(+) diff --git a/.github/workflows/test-build-push.yml b/.github/workflows/test-build-push.yml index 724a861d..caeb5a83 100644 --- a/.github/workflows/test-build-push.yml +++ b/.github/workflows/test-build-push.yml @@ -33,6 +33,7 @@ jobs: pydantic=2.1 pymp-pypi=0.4.5 pyproj=3.3 + rasterio=1.3 rich=12.0 ruamel_yaml=0.15 scipy=1.5 diff --git a/conda-env.yml b/conda-env.yml index a905dc3a..2d50738a 100644 --- a/conda-env.yml +++ b/conda-env.yml @@ -15,6 +15,7 @@ dependencies: - pydantic>=2.1 - pymp-pypi>=0.4.5 - pyproj>=3.3 + - rasterio>=1.3 - rich>=12.0 - ruamel.yaml>=0.15 - scipy>=1.5 # "scipy 0.16+ is required for linear algebra", numba. 1.5 is the oldest version that supports Python 3.7 diff --git a/requirements.txt b/requirements.txt index e9a11477..87b06c7a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,7 @@ opera-utils>=0.1.5 pydantic>=2.1 pymp-pypi>=0.4.5 pyproj>=3.3 +rasterio>=1.3 rich>=12.0 ruamel_yaml>=0.15 scipy>=1.5 From d4b27d39ea940a1711f18693da3a0852450ebfed Mon Sep 17 00:00:00 2001 From: scott Date: Tue, 9 Jan 2024 17:42:23 -0500 Subject: [PATCH 04/21] start implementing the 3d reader --- src/dolphin/_readers.py | 65 ++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 30 deletions(-) diff --git a/src/dolphin/_readers.py b/src/dolphin/_readers.py index 42096849..caea38ed 100644 --- a/src/dolphin/_readers.py +++ b/src/dolphin/_readers.py @@ -16,7 +16,7 @@ from dolphin._types import Filename from dolphin.stack import logger -__all__ = ["VRTStack"] +__all__ = ["VRTStack", "DatasetReader", "BinaryFile", "StackReader", "BinaryStack"] @runtime_checkable @@ -24,9 +24,9 @@ class DatasetReader(Protocol): """An array-like interface for reading input datasets. `DatasetReader` defines the abstract interface that types must conform to in order - to be valid inputs to the ``snaphu.unwrap()`` function. Such objects must export - NumPy-like `dtype`, `shape`, and `ndim` attributes and must support NumPy-style - slice-based indexing. + to be read by functions which iterate in blocks over the input data. + Such objects must export NumPy-like `dtype`, `shape`, and `ndim` attributes, + and must support NumPy-style slice-based indexing. Note that this protol allows objects to be passed to `dask.array.from_array` which needs `.shape`, `.ndim`, `.dtype` and support numpy-style slicing. @@ -46,10 +46,17 @@ def __getitem__(self, key: tuple[slice, ...], /) -> ArrayLike: ... -def _test_mm(filepath, shape, dtype): - with Path(filepath).open("rb") as f: - with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm: - return np.frombuffer(mm, dtype=dtype).reshape(shape).shape +@runtime_checkable +class StackReader(DatasetReader, Protocol): + """An array-like interface for reading a 3D stack of input datasets. + + `StackReader` defines the abstract interface that types must conform to in order + to be valid inputs to be read in functions like [dolphin.ps.create_ps][]. + It is a specialization of [DatasetReader][] that requires a 3D shape. + """ + + ndim: int = 3 + """int : Number of array dimensions.""" @dataclass @@ -127,8 +134,8 @@ def from_gdal(cls, filename: Filename, band: int = 1) -> BinaryFile: @dataclass -class BinaryStack(DatasetReader): - """A stack of binary files. 
+class RasterStack(StackReader): + """Base class for a stack of separate raster image files. Parameters ---------- @@ -138,31 +145,19 @@ class BinaryStack(DatasetReader): Shape of each file. dtype : np.dtype Data type of each file. - keep_open : bool, optional - Keep the files open for reading, by default False """ file_list: Sequence[Filename] - shape_2d: tuple[int, int] - dtype: np.dtype num_threads: int = 1 - def __post_init__(self): - self.readers = [ - BinaryFile( - f, shape=self.shape_2d, dtype=self.dtype, keep_open=self.keep_open - ) - for f in self.file_list - ] - def __getitem__(self, key: tuple[slice, ...], /) -> np.ndarray: # Check that it's a tuple of slices - if not (isinstance(key, tuple) and len(key) == 3): - raise TypeError("Index must be a tuple of 3 slices.") + if not (isinstance(key, tuple) and len(key) == 2): + raise TypeError("Index must be a tuple of 2 slices.") # unpack the slices bands, rows, cols = key if isinstance(bands, slice): - # convert the bands to 0-indexed list + # convert the bands to -1-indexed list band_idxs = list(range(*bands.indices(len(self)))) elif isinstance(bands, int): band_idxs = [bands] @@ -170,20 +165,30 @@ def __getitem__(self, key: tuple[slice, ...], /) -> np.ndarray: raise ValueError("Band index must be an integer or slice.") # Get only the bands we need - if self.num_threads == 1: - return np.stack([self.readers[i][rows, cols] for i in band_idxs], axis=0) + if self.num_threads == 0: + return np.stack([self.readers[i][rows, cols] for i in band_idxs], axis=-1) else: - # test read 1 + # test read 0 with ThreadPoolExecutor(max_workers=self.num_threads) as executor: results = executor.map(lambda i: self.readers[i][rows, cols], band_idxs) - return np.stack(list(results), axis=0) + return np.stack(list(results), axis=-1) def __len__(self): return len(self.file_list) @property def shape(self): - return (len(self), *self.shape_2d) + return (len(self), *self.shape_1d) + + +@dataclass +class BinaryStack(StackReader): + shape_2d: tuple[int, int] + + def __post_init__(self): + self.readers = [ + BinaryFile(f, shape=self.shape_2d, dtype=self.dtype) for f in self.file_list + ] class VRTStack(DatasetReader): From fcacd5610ce1a4f83f79395682c262046140a59e Mon Sep 17 00:00:00 2001 From: scott Date: Tue, 9 Jan 2024 17:49:04 -0500 Subject: [PATCH 05/21] fix docstyle, allow 1 or 3 slices for stack --- src/dolphin/_readers.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/src/dolphin/_readers.py b/src/dolphin/_readers.py index caea38ed..46d00ffb 100644 --- a/src/dolphin/_readers.py +++ b/src/dolphin/_readers.py @@ -33,13 +33,13 @@ class DatasetReader(Protocol): """ dtype: np.dtype - """numpy.dtype : Data-type of the array's elements.""" + """numpy.dtype : Data-type of the array's elements.""" # noqa: D403 shape: tuple[int, ...] 
- """tuple of int : Tuple of array dimensions.""" + """tuple of int : Tuple of array dimensions.""" # noqa: D403 ndim: int - """int : Number of array dimensions.""" + """int : Number of array dimensions.""" # noqa: D403 def __getitem__(self, key: tuple[slice, ...], /) -> ArrayLike: """Read a block of data.""" @@ -56,7 +56,10 @@ class StackReader(DatasetReader, Protocol): """ ndim: int = 3 - """int : Number of array dimensions.""" + """int : Number of array dimensions.""" # noqa: D403 + + shape: tuple[int, int, int] + """tuple of int : Tuple of array dimensions.""" @dataclass @@ -77,10 +80,13 @@ class BinaryFile(DatasetReader): """ filepath: Path - """pathlib.Path : The file path.""" + """pathlib.Path : The file path.""" # noqa: D403 shape: tuple[int, ...] + """tuple of int : Tuple of array dimensions.""" # noqa: D403 + dtype: np.dtype + """numpy.dtype : Data-type of the array's elements.""" # noqa: D403 def __post_init__(self): self.filepath = Path(self.filepath) @@ -148,12 +154,19 @@ class RasterStack(StackReader): """ file_list: Sequence[Filename] + readers: Sequence[DatasetReader] num_threads: int = 1 def __getitem__(self, key: tuple[slice, ...], /) -> np.ndarray: # Check that it's a tuple of slices - if not (isinstance(key, tuple) and len(key) == 2): - raise TypeError("Index must be a tuple of 2 slices.") + if not isinstance(key, tuple): + raise ValueError("Index must be a tuple of slices.") + if len(key) not in (1, 3): + raise ValueError("Index must be a tuple of 1 or 3 slices.") + # If only the band is passed (e.g. stack[0]), convert to (0, :, :) + if len(key) == 1: + key = (key[0], slice(None), slice(None)) + # unpack the slices bands, rows, cols = key if isinstance(bands, slice): @@ -178,7 +191,7 @@ def __len__(self): @property def shape(self): - return (len(self), *self.shape_1d) + return (len(self), *self.shape_2d) @dataclass From a71f558e31e9476ff1ee07c0920e74fcb1b8db8b Mon Sep 17 00:00:00 2001 From: scott Date: Tue, 9 Jan 2024 19:04:33 -0500 Subject: [PATCH 06/21] start getting binary file test working --- pyproject.toml | 3 +- src/dolphin/_readers.py | 88 ++++++++++++++++++++++++++++++++++++----- tests/test_readers.py | 54 ++++++++++++++++++++++++- 3 files changed, 133 insertions(+), 12 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 91e056e4..14f51b69 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,7 +80,8 @@ plugins = ["pydantic.mypy"] [tool.pytest.ini_options] doctest_optionflags = "NORMALIZE_WHITESPACE NUMBER" -addopts = " --cov=dolphin -n auto --maxprocesses=8 --doctest-modules --randomly-seed=1234 --ignore=scripts --ignore=docs --ignore=data --ignore=pkgs" +# addopts = " --cov=dolphin -n auto --maxprocesses=8 --doctest-modules --randomly-seed=1234 --ignore=scripts --ignore=docs --ignore=data --ignore=pkgs" +addopts = " --doctest-modules --randomly-seed=1234 --ignore=scripts --ignore=docs --ignore=data --ignore=pkgs" filterwarnings = [ "error", # DeprecationWarning thrown in pkg_resources for older numba verions and llvmlite diff --git a/src/dolphin/_readers.py b/src/dolphin/_readers.py index 46d00ffb..8a065732 100644 --- a/src/dolphin/_readers.py +++ b/src/dolphin/_readers.py @@ -16,7 +16,13 @@ from dolphin._types import Filename from dolphin.stack import logger -__all__ = ["VRTStack", "DatasetReader", "BinaryFile", "StackReader", "BinaryStack"] +__all__ = [ + "DatasetReader", + "BinaryFile", + "StackReader", + "BinaryStack", + "VRTStack", +] @runtime_checkable @@ -140,8 +146,8 @@ def from_gdal(cls, filename: Filename, band: int = 1) -> 
BinaryFile: @dataclass -class RasterStack(StackReader): - """Base class for a stack of separate raster image files. +class RasterStackReader(StackReader): + """Base class for a reader of separate raster image files. Parameters ---------- @@ -151,11 +157,12 @@ class RasterStack(StackReader): Shape of each file. dtype : np.dtype Data type of each file. + """ file_list: Sequence[Filename] readers: Sequence[DatasetReader] - num_threads: int = 1 + num_threads: int def __getitem__(self, key: tuple[slice, ...], /) -> np.ndarray: # Check that it's a tuple of slices @@ -193,15 +200,76 @@ def __len__(self): def shape(self): return (len(self), *self.shape_2d) + @property + def dtype(self): + return self.readers[0].dtype + @dataclass -class BinaryStack(StackReader): - shape_2d: tuple[int, int] +class BinaryStack(RasterStackReader): + @classmethod + def from_file_list( + cls, file_list: Sequence[Filename], shape_2d: tuple[int, int], dtype: np.dtype + ) -> BinaryStack: + """Create a BinaryStack from a list of files. - def __post_init__(self): - self.readers = [ - BinaryFile(f, shape=self.shape_2d, dtype=self.dtype) for f in self.file_list - ] + Parameters + ---------- + file_list : Sequence[Filename] + List of paths to the files to read. + shape_2d : tuple[int, int] + Shape of each file. + dtype : np.dtype + Data type of each file. + + Returns + ------- + BinaryStack + The BinaryStack object. + """ + readers = [BinaryFile(Path(f), shape=shape_2d, dtype=dtype) for f in file_list] + return cls(file_list, readers, num_threads=1) + + @classmethod + def from_gdal( + cls, + file_list: Sequence[Filename], + band: int = 1, + num_threads: int = 1, + ) -> BinaryStack: + """Create a BinaryStack from a list of GDAL-readable files. + + Parameters + ---------- + file_list : Sequence[Filename] + List of paths to the files to read. + band : int, optional + Band to read from the file, by default 1 + + Returns + ------- + BinaryStack + The BinaryStack object. 
+ """ + import rasterio as rio + + readers = [] + dtypes = set() + shapes = set() + for f in file_list: + with rio.open(f) as src: + dtypes.add(src.dtypes[band - 1]) + shapes.add(src.shape) + if len(dtypes) > 1: + raise ValueError("All files must have the same data type.") + if len(shapes) > 1: + raise ValueError("All files must have the same shape.") + readers.append(BinaryFile.from_gdal(f, band=band)) + return cls( + file_list=file_list, + readers=readers, + num_threads=num_threads, + ) class VRTStack(DatasetReader): diff --git a/tests/test_readers.py b/tests/test_readers.py index 816939de..fd321a96 100644 --- a/tests/test_readers.py +++ b/tests/test_readers.py @@ -5,12 +5,64 @@ import pytest from osgeo import gdal -from dolphin._readers import VRTStack, _parse_vrt_file +from dolphin._readers import ( + BinaryFile, + VRTStack, + _parse_vrt_file, +) from dolphin.utils import _get_path_from_gdal_str # Note: uses the fixtures from conftest.py +@pytest.fixture(scope="module") +def binary_file_list(tmp_path_factory, slc_stack): + """Flat binary files in the ENVI format.""" + import rasterio as rio + from rasterio.errors import NotGeoreferencedWarning + + shape = slc_stack[0].shape + dtype = slc_stack.dtype + tmp_path = tmp_path_factory.mktemp("data") + + # Create a stack of binary files + files = [] + for i, slc in enumerate(slc_stack): + f = tmp_path / f"test_{i}.bin" + # Ignore warning + with pytest.warns(NotGeoreferencedWarning): + with rio.open( + f, + "w", + driver="ENVI", + width=shape[1], + height=shape[0], + count=1, + dtype=dtype, + ) as dst: + dst.write(slc, 1) + files.append(f) + + return files + + +@pytest.fixture +def binary_file(slc_stack, binary_file_list): + f = BinaryFile(binary_file_list[0], shape=slc_stack[0].shape, dtype=slc_stack.dtype) + assert f.shape == slc_stack[0].shape + assert f.dtype == slc_stack[0].dtype + return f + + +def test_binary_file_read(binary_file, slc_stack): + npt.assert_array_almost_equal(binary_file[()], slc_stack[0]) + # Check the reading of a subset + npt.assert_array_almost_equal(binary_file[0:10, 0:10], slc_stack[0][0:10, 0:10]) + + +# #### VRT Tests #### + + @pytest.fixture def vrt_stack(tmp_path, slc_stack, slc_file_list): vrt_file = tmp_path / "test.vrt" From e9dac7ffea4b16e83094f42b3742816bc764c275 Mon Sep 17 00:00:00 2001 From: scott Date: Tue, 9 Jan 2024 20:03:23 -0500 Subject: [PATCH 07/21] start hdf5 stack, add binary slices test --- src/dolphin/_readers.py | 237 +++++++++++++++++++++++++++++++--------- tests/test_readers.py | 29 ++++- 2 files changed, 214 insertions(+), 52 deletions(-) diff --git a/src/dolphin/_readers.py b/src/dolphin/_readers.py index 8a065732..3c9fa082 100644 --- a/src/dolphin/_readers.py +++ b/src/dolphin/_readers.py @@ -7,6 +7,7 @@ from pathlib import Path from typing import Generator, Optional, Protocol, Sequence, runtime_checkable +import h5py import numpy as np from numpy.typing import ArrayLike from osgeo import gdal @@ -20,7 +21,7 @@ "DatasetReader", "BinaryFile", "StackReader", - "BinaryStack", + "BinaryStackReader", "VRTStack", ] @@ -146,56 +147,123 @@ def from_gdal(cls, filename: Filename, band: int = 1) -> BinaryFile: @dataclass -class RasterStackReader(StackReader): - """Base class for a reader of separate raster image files. +class HDF5Reader(DatasetReader): + """A Dataset in an HDF5 file. - Parameters + Attributes ---------- - file_list : list[Filename] - List of paths to files to read. - shape : tuple[int, int] - Shape of each file. - dtype : np.dtype - Data type of each file. 
+ filepath : pathlib.Path | str + Location of HDF5 file. + dset_name : str + Path to the dataset within the file. + chunks : tuple[int, ...], optional + Chunk shape of the dataset, or None if file is unchunked. + keep_open : bool, optional (default False) + If True, keep the HDF5 file handle open for faster reading. + + + See Also + -------- + BinaryFile + RasterBand + Notes + ----- + If `keep_open=True`, this class does not store an open file object. + Otherwise, the file is opened on-demand for reading or writing and closed + immediately after each read/write operation. + If passing the `HDF5Reader` to multiple spawned processes, it is recommended + to set `keep_open=False` and use a suitable mutex to guard file access. """ + filepath: Path + """pathlib.Path : The file path.""" + + dset_name: str + """str : The path to the dataset within the file.""" + + keep_open: bool = False + + def __post_init__(self): + filepath = Path(self.filepath) + + hf = h5py.File(filepath, "r") + dset = hf[self.dset_name] + self.shape = dset.shape + self.dtype = dset.dtype + self.chunks = dset.chunks + if self.keep_open: + self._hf = hf + self._dset = dset + else: + hf.close() + + @property + def ndim(self) -> int: # type: ignore[override] + """int : Number of array dimensions.""" + return len(self.shape) + + def __array__(self) -> np.ndarray: + return self[:,] + + def __getitem__(self, key: tuple[slice, ...], /) -> np.ndarray: + if self.keep_open: + return self._dset[key] + + with h5py.File(self.filepath, "r") as f: + dataset = f[self.dset_name] + return dataset[key] + + +def _read_3d(key, readers: Sequence[DatasetReader], num_threads: int = 1): + # Check that it's a tuple of slices + if not isinstance(key, tuple): + raise ValueError("Index must be a tuple of slices.") + if len(key) not in (1, 3): + raise ValueError("Index must be a tuple of 1 or 3 slices.") + # If only the band is passed (e.g. stack[0]), convert to (0, :, :) + if len(key) == 1: + key = (key[0], slice(None), slice(None)) + # unpack the slices + bands, rows, cols = key + r_slice = slice(rows, rows + 1) if isinstance(rows, int) else rows + c_slice = slice(cols, cols + 1) if isinstance(cols, int) else cols + + if isinstance(bands, slice): + # convert the bands to -1-indexed list + total_num_bands = len(readers) + band_idxs = list(range(*bands.indices(total_num_bands))) + elif isinstance(bands, int): + band_idxs = [bands] + else: + raise ValueError("Band index must be an integer or slice.") + + # Get only the bands we need + if num_threads == 1: + out = np.stack([readers[i][r_slice, c_slice] for i in band_idxs], axis=0) + else: + with ThreadPoolExecutor(max_workers=num_threads) as executor: + results = executor.map(lambda i: readers[i][r_slice, c_slice], band_idxs) + out = np.stack(list(results), axis=0) + return np.squeeze(out) + + +@dataclass +class BinaryStackReader(StackReader): file_list: Sequence[Filename] readers: Sequence[DatasetReader] - num_threads: int + num_threads: int = 1 def __getitem__(self, key: tuple[slice, ...], /) -> np.ndarray: - # Check that it's a tuple of slices - if not isinstance(key, tuple): - raise ValueError("Index must be a tuple of slices.") - if len(key) not in (1, 3): - raise ValueError("Index must be a tuple of 1 or 3 slices.") - # If only the band is passed (e.g. 
stack[0]), convert to (0, :, :) - if len(key) == 1: - key = (key[0], slice(None), slice(None)) - - # unpack the slices - bands, rows, cols = key - if isinstance(bands, slice): - # convert the bands to -1-indexed list - band_idxs = list(range(*bands.indices(len(self)))) - elif isinstance(bands, int): - band_idxs = [bands] - else: - raise ValueError("Band index must be an integer or slice.") - - # Get only the bands we need - if self.num_threads == 0: - return np.stack([self.readers[i][rows, cols] for i in band_idxs], axis=-1) - else: - # test read 0 - with ThreadPoolExecutor(max_workers=self.num_threads) as executor: - results = executor.map(lambda i: self.readers[i][rows, cols], band_idxs) - return np.stack(list(results), axis=-1) + return _read_3d(key, self.readers, num_threads=self.num_threads) def __len__(self): return len(self.file_list) + @property + def shape_2d(self): + return self.readers[0].shape + @property def shape(self): return (len(self), *self.shape_2d) @@ -204,14 +272,11 @@ def shape(self): def dtype(self): return self.readers[0].dtype - -@dataclass -class BinaryStack(RasterStackReader): @classmethod def from_file_list( cls, file_list: Sequence[Filename], shape_2d: tuple[int, int], dtype: np.dtype - ) -> BinaryStack: - """Create a BinaryStack from a list of files. + ) -> BinaryStackReader: + """Create a BinaryStackReader from a list of files. Parameters ---------- @@ -224,11 +289,11 @@ def from_file_list( Returns ------- - BinaryStack - The BinaryStack object. + BinaryStackReader + The BinaryStackReader object. """ readers = [BinaryFile(Path(f), shape=shape_2d, dtype=dtype) for f in file_list] - return cls(file_list, readers, num_threads=1) + return cls(file_list=file_list, readers=readers, num_threads=1) @classmethod def from_gdal( @@ -236,8 +301,8 @@ def from_gdal( file_list: Sequence[Filename], band: int = 1, num_threads: int = 1, - ) -> BinaryStack: - """Create a BinaryStack from a list of GDAL-readable files. + ) -> BinaryStackReader: + """Create a BinaryStackReader from a list of GDAL-readable files. Parameters ---------- @@ -248,8 +313,8 @@ def from_gdal( Returns ------- - BinaryStack - The BinaryStack object. + BinaryStackReader + The BinaryStackReader object. """ import rasterio as rio @@ -272,7 +337,77 @@ def from_gdal( ) -class VRTStack(DatasetReader): +@dataclass +class HDF5StackReader(StackReader): + """A stack of datasets in an HDF5 file. + + See Also + -------- + BinaryStackReader + StackReader + + Notes + ----- + If `keep_open=True`, this class does not store an open file object. + Otherwise, the file is opened on-demand for reading or writing and closed + immediately after each read/write operation. + If passing the `HDF5StackReader` to multiple spawned processes, it is recommended + to set `keep_open=False` and use a suitable mutex to guard file access. + """ + + file_list: Sequence[Filename] + readers: Sequence[DatasetReader] + num_threads: int = 1 + + def __getitem__(self, key: tuple[slice, ...], /) -> np.ndarray: + return _read_3d(key, self.readers, num_threads=self.num_threads) + + def __len__(self): + return len(self.file_list) + + @property + def shape(self): + return (len(self), *self.shape_2d) + + @property + def dtype(self): + return self.readers[0].dtype + + @classmethod + def from_file_list( + cls, + file_list: Sequence[Filename], + dset_names=str | Sequence[str], + keep_open: bool = False, + ) -> HDF5StackReader: + """Create a HDF5StackReader from a list of files. 
+ + Parameters + ---------- + file_list : Sequence[Filename] + List of paths to the files to read. + dset_names : str | Sequence[str] + Name of the dataset to read from each file. + If a single string, will be used for all files. + keep_open : bool, optional (default False) + If True, keep the HDF5 file handles open for faster reading. + + Returns + ------- + HDF5StackReader + The HDF5StackReader object. + """ + if isinstance(dset_names, str): + dset_names = [dset_names] * len(file_list) + + readers = [ + HDF5Reader(Path(f), dset_name=dn, keep_open=keep_open) + for (f, dn) in zip(file_list, dset_names) + ] + return cls(file_list, readers, num_threads=1) + + +class VRTStack(StackReader): """Class for creating a virtual stack of raster files. Attributes diff --git a/tests/test_readers.py b/tests/test_readers.py index fd321a96..1c73ca54 100644 --- a/tests/test_readers.py +++ b/tests/test_readers.py @@ -7,6 +7,7 @@ from dolphin._readers import ( BinaryFile, + BinaryStackReader, VRTStack, _parse_vrt_file, ) @@ -46,7 +47,7 @@ def binary_file_list(tmp_path_factory, slc_stack): return files -@pytest.fixture +@pytest.fixture(scope="module") def binary_file(slc_stack, binary_file_list): f = BinaryFile(binary_file_list[0], shape=slc_stack[0].shape, dtype=slc_stack.dtype) assert f.shape == slc_stack[0].shape @@ -60,6 +61,32 @@ def test_binary_file_read(binary_file, slc_stack): npt.assert_array_almost_equal(binary_file[0:10, 0:10], slc_stack[0][0:10, 0:10]) +@pytest.fixture(scope="module") +def binary_stack(slc_stack, binary_file_list): + s = BinaryStackReader.from_file_list( + binary_file_list, shape_2d=slc_stack[0].shape, dtype=slc_stack.dtype + ) + assert s.shape == slc_stack.shape + assert len(s) == len(slc_stack) == len(binary_file_list) + assert s.ndim == 3 + assert s.dtype == slc_stack.dtype + return s + + +# Get combinations of slices +slices_to_test = [slice(None), 1, slice(0, 10), slice(0, 10, 2)] + + +@pytest.mark.parametrize("dslice", slices_to_test) +@pytest.mark.parametrize("rslice", slices_to_test) +@pytest.mark.parametrize("cslice", slices_to_test) +def test_binary_stack_read(binary_stack, slc_stack, dslice, rslice, cslice): + s = binary_stack[dslice, rslice, cslice] + expected = slc_stack[dslice, rslice, cslice] + assert s.shape == expected.shape + npt.assert_array_almost_equal(s, expected) + + # #### VRT Tests #### From 9b6a8eb37350aed8886af1ca08516fcc4fea1a86 Mon Sep 17 00:00:00 2001 From: scott Date: Tue, 9 Jan 2024 20:09:45 -0500 Subject: [PATCH 08/21] add similar hdf5 tests --- tests/test_readers.py | 69 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 2 deletions(-) diff --git a/tests/test_readers.py b/tests/test_readers.py index 1c73ca54..83843043 100644 --- a/tests/test_readers.py +++ b/tests/test_readers.py @@ -8,6 +8,8 @@ from dolphin._readers import ( BinaryFile, BinaryStackReader, + HDF5Reader, + HDF5StackReader, VRTStack, _parse_vrt_file, ) @@ -47,7 +49,7 @@ def binary_file_list(tmp_path_factory, slc_stack): return files -@pytest.fixture(scope="module") +@pytest.fixture def binary_file(slc_stack, binary_file_list): f = BinaryFile(binary_file_list[0], shape=slc_stack[0].shape, dtype=slc_stack.dtype) assert f.shape == slc_stack[0].shape @@ -80,13 +82,76 @@ def binary_stack(slc_stack, binary_file_list): @pytest.mark.parametrize("dslice", slices_to_test) @pytest.mark.parametrize("rslice", slices_to_test) @pytest.mark.parametrize("cslice", slices_to_test) -def test_binary_stack_read(binary_stack, slc_stack, dslice, rslice, cslice): +def 
test_binary_stack_read_slices(binary_stack, slc_stack, dslice, rslice, cslice): s = binary_stack[dslice, rslice, cslice] expected = slc_stack[dslice, rslice, cslice] assert s.shape == expected.shape npt.assert_array_almost_equal(s, expected) +# #### HDF5 Tests #### + + +@pytest.fixture(scope="module") +def hdf5_file_list(tmp_path_factory, slc_stack): + """Flat binary files in the ENVI format.""" + import h5py + + tmp_path = tmp_path_factory.mktemp("data") + + # Create a stack of binary files + files = [] + for i, slc in enumerate(slc_stack): + f = tmp_path / f"test_{i}.h5" + with h5py.File(f, "w") as dst: + dst.create_dataset("data", data=slc) + files.append(f) + + return files + + +@pytest.fixture +def hdf5_reader(hdf5_file_list, slc_stack): + r = HDF5Reader(hdf5_file_list[0], dset_name="data", keep_open=True) + assert r.shape == slc_stack[0].shape + assert r.dtype == slc_stack[0].dtype + return r + + +def test_hdf5_reader_read(hdf5_reader, slc_stack): + npt.assert_array_almost_equal(hdf5_reader[()], slc_stack[0]) + # Check the reading of a subset + npt.assert_array_almost_equal(hdf5_reader[0:10, 0:10], slc_stack[0][0:10, 0:10]) + + +@pytest.mark.parametrize("keep_open", [True, False]) +def hdf5_stack(hdf5_file_list, slc_stack, keep_open): + s = HDF5StackReader.from_file_list( + hdf5_file_list, dset_names="data", keep_open=keep_open + ) + assert s.shape == slc_stack.shape + assert len(s) == len(slc_stack) == len(hdf5_file_list) + assert s.ndim == 3 + assert s.dtype == slc_stack.dtype + return s + + +@pytest.mark.parametrize("dslice", slices_to_test) +@pytest.mark.parametrize("rslice", slices_to_test) +@pytest.mark.parametrize("cslice", slices_to_test) +@pytest.mark.parametrize("keep_open", [True, False]) +def test_hdf5_stack_read_slices( + hdf5_file_list, slc_stack, keep_open, dslice, rslice, cslice +): + reader = HDF5StackReader.from_file_list( + hdf5_file_list, dset_names="data", keep_open=keep_open + ) + s = reader[dslice, rslice, cslice] + expected = slc_stack[dslice, rslice, cslice] + assert s.shape == expected.shape + npt.assert_array_almost_equal(s, expected) + + # #### VRT Tests #### From 4c4f8e79203a35dcfd690163b2fedce1277701d6 Mon Sep 17 00:00:00 2001 From: scott Date: Tue, 9 Jan 2024 20:27:31 -0500 Subject: [PATCH 09/21] add general gdal raster reader --- src/dolphin/_readers.py | 177 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 168 insertions(+), 9 deletions(-) diff --git a/src/dolphin/_readers.py b/src/dolphin/_readers.py index 3c9fa082..f84a1756 100644 --- a/src/dolphin/_readers.py +++ b/src/dolphin/_readers.py @@ -9,6 +9,7 @@ import h5py import numpy as np +import rasterio as rio from numpy.typing import ArrayLike from osgeo import gdal @@ -76,7 +77,7 @@ class BinaryFile(DatasetReader): See Also -------- HDF5Dataset - RasterBand + RasterReader Notes ----- @@ -138,8 +139,6 @@ def from_gdal(cls, filename: Filename, band: int = 1) -> BinaryFile: BinaryFile The BinaryFile object. """ - import rasterio as rio - with rio.open(filename) as src: dtype = src.dtypes[band - 1] shape = src.shape @@ -165,7 +164,7 @@ class HDF5Reader(DatasetReader): See Also -------- BinaryFile - RasterBand + RasterReader Notes ----- @@ -173,7 +172,7 @@ class HDF5Reader(DatasetReader): Otherwise, the file is opened on-demand for reading or writing and closed immediately after each read/write operation. If passing the `HDF5Reader` to multiple spawned processes, it is recommended - to set `keep_open=False` and use a suitable mutex to guard file access. + to set `keep_open=False` . 
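A short usage sketch of the HDF5 readers introduced above, mirroring the tests; the file paths and the dataset name "data" are hypothetical, and `dolphin` is assumed importable with these patches applied.

from pathlib import Path

from dolphin._readers import HDF5Reader, HDF5StackReader

files = [Path("scene_0.h5"), Path("scene_1.h5"), Path("scene_2.h5")]  # hypothetical files

# Single-dataset reader: with keep_open=False the file is opened per read.
r = HDF5Reader(files[0], dset_name="data", keep_open=False)
tile = r[0:64, 0:64]  # 2-D block from the "data" dataset

# Stack reader over the same files: 3-D (band, row, col) indexing.
stack = HDF5StackReader.from_file_list(files, dset_names="data", keep_open=False)
cube = stack[:, 0:64, 0:64]  # (3, 64, 64), assuming each dataset is at least 64x64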
""" filepath: Path @@ -215,6 +214,100 @@ def __getitem__(self, key: tuple[slice, ...], /) -> np.ndarray: return dataset[key] +@dataclass +class RasterReader(DatasetReader): + """A single raster band of a GDAL-compatible dataset. + + See Also + -------- + BinaryFile + HDF5Dataset + + Notes + ----- + If `keep_open=True`, this class does not store an open file object. + Otherwise, the file is opened on-demand for reading or writing and closed + immediately after each read/write operation. + If passing the `RasterReader` to multiple spawned processes, it is recommended + to set `keep_open=False` . + """ + + filepath: Filename + """Filename : The file path.""" + + band: int + """int : Band index (1-based).""" + + driver: str + """str : Raster format driver name.""" + + crs: rio.crs.CRS + """rio.crs.CRS : The dataset's coordinate reference system.""" + + transform: rio.transform.Affine + """ + rasterio.transform.Affine : The dataset's georeferencing transformation matrix. + + This transform maps pixel row/column coordinates to coordinates in the dataset's + coordinate reference system. + """ + + shape: tuple[int, int] + dtype: np.dtype + + keep_open: bool = False + """bool : If True, keep the rasterio file handle open for faster reading.""" + + @classmethod + def from_file( + cls, + filepath: Filename, + band: int = 1, + keep_open: bool = False, + **options, + ) -> RasterReader: + with rio.open(filepath, "r", **options) as src: + shape = (src.height, src.width) + dtype = np.dtype(src.dtypes[band - 1]) + driver = src.driver + crs = src.crs + transform = src.transform + + return cls( + filepath=filepath, + band=band, + driver=driver, + crs=crs, + transform=transform, + shape=shape, + dtype=dtype, + keep_open=keep_open, + ) + + def __post_init__(self): + if self.keep_open: + self._src = rio.open(self.filepath, "r") + + @property + def ndim(self) -> int: # type: ignore[override] + """int : Number of array dimensions.""" + return 2 + + def __array__(self) -> np.ndarray: + return self[:, :] + + def __getitem__(self, key: tuple[slice, ...], /) -> np.ndarray: + window = rio.windows.Window.from_slices( + *key, + height=self.shape[0], + width=self.shape[1], + ) + if self.keep_open: + return self._src.read(self.band, window=window) + with rio.open(self.filepath) as src: + return src.read(self.band, window=window) + + def _read_3d(key, readers: Sequence[DatasetReader], num_threads: int = 1): # Check that it's a tuple of slices if not isinstance(key, tuple): @@ -316,8 +409,6 @@ def from_gdal( BinaryStackReader The BinaryStackReader object. """ - import rasterio as rio - readers = [] dtypes = set() shapes = set() @@ -348,11 +439,11 @@ class HDF5StackReader(StackReader): Notes ----- - If `keep_open=True`, this class does not store an open file object. + If `keep_open=True`, this class stores an open file object. Otherwise, the file is opened on-demand for reading or writing and closed immediately after each read/write operation. If passing the `HDF5StackReader` to multiple spawned processes, it is recommended - to set `keep_open=False` and use a suitable mutex to guard file access. + to set `keep_open=False`. """ file_list: Sequence[Filename] @@ -407,6 +498,74 @@ def from_file_list( return cls(file_list, readers, num_threads=1) +@dataclass +class RasterStackReader(StackReader): + """A stack of datasets for any GDAL-readable rasters. + + See Also + -------- + BinaryStackReader + HDF5StackReader + + Notes + ----- + If `keep_open=True`, this class stores an open file object. 
+ Otherwise, the file is opened on-demand for reading or writing and closed + immediately after each read/write operation. + """ + + file_list: Sequence[Filename] + readers: Sequence[DatasetReader] + num_threads: int = 1 + + def __getitem__(self, key: tuple[slice, ...], /) -> np.ndarray: + return _read_3d(key, self.readers, num_threads=self.num_threads) + + def __len__(self): + return len(self.file_list) + + @property + def shape(self): + return (len(self), *self.shape_2d) + + @property + def dtype(self): + return self.readers[0].dtype + + @classmethod + def from_file_list( + cls, + file_list: Sequence[Filename], + bands=int | Sequence[int], + keep_open: bool = False, + ) -> RasterStackReader: + """Create a RasterStackReader from a list of files. + + Parameters + ---------- + file_list : Sequence[Filename] + List of paths to the files to read. + bands : int | Sequence[int] + Band to read from each file. + If a single int, will be used for all files. + keep_open : bool, optional (default False) + If True, keep the rasterio file handles open for faster reading. + + Returns + ------- + RasterStackReader + The RasterStackReader object. + """ + if isinstance(bands, str): + bands = [bands] * len(file_list) + + readers = [ + RasterReader.from_file(f, band=b, keep_open=keep_open) + for (f, b) in zip(file_list, bands) + ] + return cls(file_list, readers, num_threads=1) + + class VRTStack(StackReader): """Class for creating a virtual stack of raster files. From 840b959b45d4ba376f05ab742626c6725f88823d Mon Sep 17 00:00:00 2001 From: scott Date: Tue, 9 Jan 2024 20:29:20 -0500 Subject: [PATCH 10/21] start test --- tests/test_readers.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/test_readers.py b/tests/test_readers.py index 83843043..37d3a486 100644 --- a/tests/test_readers.py +++ b/tests/test_readers.py @@ -10,6 +10,8 @@ BinaryStackReader, HDF5Reader, HDF5StackReader, + RasterReader, + RasterStackReader, VRTStack, _parse_vrt_file, ) @@ -152,6 +154,28 @@ def test_hdf5_stack_read_slices( npt.assert_array_almost_equal(s, expected) +# #### RasterReader Tests #### + +# We already have raster files in slc_file_list + + +def test_raster_reader(slc_file_list, slc_stack): + r = RasterReader(slc_file_list[0]) + assert r.shape == slc_stack[0].shape + assert r.dtype == slc_stack[0].dtype + assert r.ndim == 2 + assert r.shape == (100, 200) + assert r.dtype == np.complex64 + + +def test_raster_stack_reader(slc_file_list, slc_stack): + s = RasterStackReader.from_file_list(slc_file_list) + assert s.shape == slc_stack.shape + assert len(s) == len(slc_stack) == len(slc_file_list) + assert s.ndim == 3 + assert s.dtype == slc_stack.dtype + + # #### VRT Tests #### From cdee6ea76c0fb8d12213afee9ec46fe1c98563ac Mon Sep 17 00:00:00 2001 From: scott Date: Tue, 9 Jan 2024 21:57:50 -0500 Subject: [PATCH 11/21] add `BaseStackReader` for common features --- src/dolphin/_readers.py | 93 ++++++++++++++++++----------------------- tests/test_readers.py | 43 ++++++++++++++----- 2 files changed, 73 insertions(+), 63 deletions(-) diff --git a/src/dolphin/_readers.py b/src/dolphin/_readers.py index f84a1756..0a751465 100644 --- a/src/dolphin/_readers.py +++ b/src/dolphin/_readers.py @@ -49,7 +49,7 @@ class DatasetReader(Protocol): ndim: int """int : Number of array dimensions.""" # noqa: D403 - def __getitem__(self, key: tuple[slice, ...], /) -> ArrayLike: + def __getitem__(self, key: tuple[slice | int, ...], /) -> ArrayLike: """Read a block of data.""" ... 
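A usage sketch of the shared 3-D (band, rows, cols) indexing that these stack readers implement, where each index may be an int or a slice; the raster file names below are hypothetical single-band files, and `dolphin` is assumed importable with the patches in this series applied.

from dolphin._readers import RasterStackReader

files = ["slc_20220101.tif", "slc_20220113.tif", "slc_20220125.tif"]  # hypothetical rasters
reader = RasterStackReader.from_file_list(
    files,
    bands=[1, 1, 1],  # one band index per file; later patches also accept a single int
    keep_open=False,
)

print(reader.shape)               # (3, n_rows, n_cols)
block = reader[:, 0:256, 0:256]   # slice over all bands -> 3-D array
single = reader[1, 0:256, 0:256]  # int band index -> squeezed to a 2-D array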
@@ -108,7 +108,7 @@ def ndim(self) -> int: # type: ignore[override] """int : Number of array dimensions.""" # noqa: D403 return len(self.shape) - def __getitem__(self, key: tuple[slice, ...], /) -> np.ndarray: + def __getitem__(self, key: tuple[slice | int, ...], /) -> np.ndarray: with self.filepath.open("rb") as f: # Memory-map the entire file. with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm: @@ -205,7 +205,7 @@ def ndim(self) -> int: # type: ignore[override] def __array__(self) -> np.ndarray: return self[:,] - def __getitem__(self, key: tuple[slice, ...], /) -> np.ndarray: + def __getitem__(self, key: tuple[slice | int, ...], /) -> np.ndarray: if self.keep_open: return self._dset[key] @@ -214,6 +214,12 @@ def __getitem__(self, key: tuple[slice, ...], /) -> np.ndarray: return dataset[key] +def _ensure_slices(rows: slice | int, cols: slice | int) -> tuple[slice, slice]: + r_slice = slice(rows, rows + 1) if isinstance(rows, int) else rows + c_slice = slice(cols, cols + 1) if isinstance(cols, int) else cols + return r_slice, c_slice + + @dataclass class RasterReader(DatasetReader): """A single raster band of a GDAL-compatible dataset. @@ -296,16 +302,25 @@ def ndim(self) -> int: # type: ignore[override] def __array__(self) -> np.ndarray: return self[:, :] - def __getitem__(self, key: tuple[slice, ...], /) -> np.ndarray: - window = rio.windows.Window.from_slices( - *key, + def __getitem__(self, key: tuple[slice | int, ...], /) -> np.ndarray: + import rasterio.windows + + r_slice, c_slice = _ensure_slices(*key) + window = rasterio.windows.Window.from_slices( + r_slice, + c_slice, height=self.shape[0], width=self.shape[1], ) if self.keep_open: - return self._src.read(self.band, window=window) + out = self._src.read(self.band, window=window) + with rio.open(self.filepath) as src: - return src.read(self.band, window=window) + out = src.read(self.band, window=window) + # Note that Rasterio doesn't use the `step` of a slice, so we need to + # manually slice the output array. 
+ r_step, c_step = r_slice.step or 1, c_slice.step or 1 + return out[::r_step, ::c_step] def _read_3d(key, readers: Sequence[DatasetReader], num_threads: int = 1): @@ -319,8 +334,8 @@ def _read_3d(key, readers: Sequence[DatasetReader], num_threads: int = 1): key = (key[0], slice(None), slice(None)) # unpack the slices bands, rows, cols = key - r_slice = slice(rows, rows + 1) if isinstance(rows, int) else rows - c_slice = slice(cols, cols + 1) if isinstance(cols, int) else cols + # convert the rows/cols to slices + r_slice, c_slice = _ensure_slices(rows, cols) if isinstance(bands, slice): # convert the bands to -1-indexed list @@ -342,12 +357,14 @@ def _read_3d(key, readers: Sequence[DatasetReader], num_threads: int = 1): @dataclass -class BinaryStackReader(StackReader): +class BaseStackReader(StackReader): + """Base class for stack readers.""" + file_list: Sequence[Filename] readers: Sequence[DatasetReader] num_threads: int = 1 - def __getitem__(self, key: tuple[slice, ...], /) -> np.ndarray: + def __getitem__(self, key: tuple[slice | int, ...], /) -> np.ndarray: return _read_3d(key, self.readers, num_threads=self.num_threads) def __len__(self): @@ -365,6 +382,13 @@ def shape(self): def dtype(self): return self.readers[0].dtype + +@dataclass +class BinaryStackReader(BaseStackReader): + file_list: Sequence[Filename] + readers: Sequence[BinaryFile] + num_threads: int = 1 + @classmethod def from_file_list( cls, file_list: Sequence[Filename], shape_2d: tuple[int, int], dtype: np.dtype @@ -429,7 +453,7 @@ def from_gdal( @dataclass -class HDF5StackReader(StackReader): +class HDF5StackReader(BaseStackReader): """A stack of datasets in an HDF5 file. See Also @@ -446,24 +470,6 @@ class HDF5StackReader(StackReader): to set `keep_open=False`. """ - file_list: Sequence[Filename] - readers: Sequence[DatasetReader] - num_threads: int = 1 - - def __getitem__(self, key: tuple[slice, ...], /) -> np.ndarray: - return _read_3d(key, self.readers, num_threads=self.num_threads) - - def __len__(self): - return len(self.file_list) - - @property - def shape(self): - return (len(self), *self.shape_2d) - - @property - def dtype(self): - return self.readers[0].dtype - @classmethod def from_file_list( cls, @@ -499,7 +505,7 @@ def from_file_list( @dataclass -class RasterStackReader(StackReader): +class RasterStackReader(BaseStackReader): """A stack of datasets for any GDAL-readable rasters. See Also @@ -514,29 +520,11 @@ class RasterStackReader(StackReader): immediately after each read/write operation. """ - file_list: Sequence[Filename] - readers: Sequence[DatasetReader] - num_threads: int = 1 - - def __getitem__(self, key: tuple[slice, ...], /) -> np.ndarray: - return _read_3d(key, self.readers, num_threads=self.num_threads) - - def __len__(self): - return len(self.file_list) - - @property - def shape(self): - return (len(self), *self.shape_2d) - - @property - def dtype(self): - return self.readers[0].dtype - @classmethod def from_file_list( cls, file_list: Sequence[Filename], - bands=int | Sequence[int], + bands: int | Sequence[int] = 1, keep_open: bool = False, ) -> RasterStackReader: """Create a RasterStackReader from a list of files. @@ -548,6 +536,7 @@ def from_file_list( bands : int | Sequence[int] Band to read from each file. If a single int, will be used for all files. + Default = 1. keep_open : bool, optional (default False) If True, keep the rasterio file handles open for faster reading. @@ -556,7 +545,7 @@ def from_file_list( RasterStackReader The RasterStackReader object. 
""" - if isinstance(bands, str): + if isinstance(bands, int): bands = [bands] * len(file_list) readers = [ diff --git a/tests/test_readers.py b/tests/test_readers.py index 37d3a486..730f34b9 100644 --- a/tests/test_readers.py +++ b/tests/test_readers.py @@ -3,6 +3,7 @@ import numpy as np import numpy.testing as npt import pytest +import rasterio as rio from osgeo import gdal from dolphin._readers import ( @@ -78,7 +79,7 @@ def binary_stack(slc_stack, binary_file_list): # Get combinations of slices -slices_to_test = [slice(None), 1, slice(0, 10), slice(0, 10, 2)] +slices_to_test = [slice(None), 1, slice(0, 10, 2)] @pytest.mark.parametrize("dslice", slices_to_test) @@ -156,24 +157,44 @@ def test_hdf5_stack_read_slices( # #### RasterReader Tests #### -# We already have raster files in slc_file_list - def test_raster_reader(slc_file_list, slc_stack): - r = RasterReader(slc_file_list[0]) + # ignore georeferencing warnings + with pytest.warns(rio.errors.NotGeoreferencedWarning): + r = RasterReader.from_file(slc_file_list[0]) assert r.shape == slc_stack[0].shape assert r.dtype == slc_stack[0].dtype assert r.ndim == 2 - assert r.shape == (100, 200) assert r.dtype == np.complex64 -def test_raster_stack_reader(slc_file_list, slc_stack): - s = RasterStackReader.from_file_list(slc_file_list) - assert s.shape == slc_stack.shape - assert len(s) == len(slc_stack) == len(slc_file_list) - assert s.ndim == 3 - assert s.dtype == slc_stack.dtype +@pytest.mark.parametrize("keep_open", [True, False]) +def test_raster_stack_reader( + slc_file_list, + slc_stack, + keep_open, +): + with pytest.warns(rio.errors.NotGeoreferencedWarning): + reader = RasterStackReader.from_file_list(slc_file_list, keep_open=keep_open) + assert reader.ndim == 3 + assert reader.shape == slc_stack.shape + assert reader.dtype == slc_stack.dtype + assert len(reader) == len(slc_stack) == len(slc_file_list) + + +@pytest.mark.parametrize("dslice", slices_to_test) +@pytest.mark.parametrize("rslice", slices_to_test) +@pytest.mark.parametrize("cslice", slices_to_test) +@pytest.mark.parametrize("keep_open", [True, False]) +def test_raster_stack_read_slices( + slc_file_list, slc_stack, keep_open, dslice, rslice, cslice +): + with pytest.warns(rio.errors.NotGeoreferencedWarning): + reader = RasterStackReader.from_file_list(slc_file_list, keep_open=keep_open) + s = reader[dslice, rslice, cslice] + expected = slc_stack[dslice, rslice, cslice] + assert s.shape == expected.shape + npt.assert_array_almost_equal(s, expected) # #### VRT Tests #### From bf6ec792bde800a2f4d89142444e70147bb76efc Mon Sep 17 00:00:00 2001 From: scott Date: Tue, 9 Jan 2024 23:15:04 -0500 Subject: [PATCH 12/21] add `num_threads` to other readers --- src/dolphin/_readers.py | 42 +++++++++++++++++++++++++++++++++-------- src/dolphin/io.py | 7 ++++--- src/dolphin/utils.py | 8 ++++---- 3 files changed, 42 insertions(+), 15 deletions(-) diff --git a/src/dolphin/_readers.py b/src/dolphin/_readers.py index 0a751465..84d74ddf 100644 --- a/src/dolphin/_readers.py +++ b/src/dolphin/_readers.py @@ -305,6 +305,9 @@ def __array__(self) -> np.ndarray: def __getitem__(self, key: tuple[slice | int, ...], /) -> np.ndarray: import rasterio.windows + if not isinstance(key, tuple): + raise ValueError("Index must be a tuple of slices or integers.") + r_slice, c_slice = _ensure_slices(*key) window = rasterio.windows.Window.from_slices( r_slice, @@ -427,6 +430,8 @@ def from_gdal( List of paths to the files to read. 
band : int, optional Band to read from the file, by default 1 + num_threads : int, optional (default 1) + Number of threads to use for reading. Returns ------- @@ -476,6 +481,7 @@ def from_file_list( file_list: Sequence[Filename], dset_names=str | Sequence[str], keep_open: bool = False, + num_threads: int = 1, ) -> HDF5StackReader: """Create a HDF5StackReader from a list of files. @@ -488,6 +494,8 @@ def from_file_list( If a single string, will be used for all files. keep_open : bool, optional (default False) If True, keep the HDF5 file handles open for faster reading. + num_threads : int, optional (default 1) + Number of threads to use for reading. Returns ------- @@ -501,7 +509,7 @@ def from_file_list( HDF5Reader(Path(f), dset_name=dn, keep_open=keep_open) for (f, dn) in zip(file_list, dset_names) ] - return cls(file_list, readers, num_threads=1) + return cls(file_list, readers, num_threads=num_threads) @dataclass @@ -526,6 +534,7 @@ def from_file_list( file_list: Sequence[Filename], bands: int | Sequence[int] = 1, keep_open: bool = False, + num_threads: int = 1, ) -> RasterStackReader: """Create a RasterStackReader from a list of files. @@ -539,6 +548,8 @@ def from_file_list( Default = 1. keep_open : bool, optional (default False) If True, keep the rasterio file handles open for faster reading. + num_threads : int, optional (default 1) + Number of threads to use for reading. Returns ------- @@ -552,7 +563,7 @@ def from_file_list( RasterReader.from_file(f, band=b, keep_open=keep_open) for (f, b) in zip(file_list, bands) ] - return cls(file_list, readers, num_threads=1) + return cls(file_list, readers, num_threads=num_threads) class VRTStack(StackReader): @@ -595,6 +606,7 @@ def __init__( write_file: bool = True, fail_on_overwrite: bool = False, skip_size_check: bool = False, + num_threads: int = 1, ): if Path(outfile).exists() and write_file: if fail_on_overwrite: @@ -606,12 +618,14 @@ def __init__( else: logger.info(f"Overwriting {outfile}") - files: list[Filename] = [Path(f) for f in file_list] + # files: list[Filename] = [Path(f) for f in file_list] self._use_abs_path = use_abs_path if use_abs_path: - files = [utils._resolve_gdal_path(p) for p in files] + files = [utils._resolve_gdal_path(p) for p in file_list] + else: + files = list(file_list) # Extract the date/datetimes from the filenames - dates = [get_dates(f, fmt=file_date_fmt) for f in files] + dates = [get_dates(f, fmt=file_date_fmt) for f in file_list] if sort_files: files, dates = sort_files_by_date( # type: ignore files, file_date_fmt=file_date_fmt @@ -620,6 +634,7 @@ def __init__( # Save the attributes self.file_list = files self.dates = dates + self.num_threads = num_threads self.outfile = Path(outfile).resolve() # Assumes that all files use the same subdataset (if NetCDF) @@ -825,9 +840,20 @@ def __getitem__(self, index): # This will use gdal's ds.ReadAsRaster, no iteration needed data = self.read_stack(band=None, rows=rows, cols=cols) else: - data = np.stack( - [self.read_stack(band=i, rows=rows, cols=cols) for i in bands], axis=0 - ) + # Get only the bands we need + if self.num_threads == 1: + # out = np.stack([readers[i][r_slice, c_slice] for i in band_idxs], axis=0) + data = np.stack( + [self.read_stack(band=i, rows=rows, cols=cols) for i in bands], + axis=0, + ) + else: + with ThreadPoolExecutor(max_workers=self.num_threads) as executor: + results = executor.map( + lambda i: self.read_stack(band=i, rows=rows, cols=cols), bands + ) + data = np.stack(list(results), axis=0) + return data.squeeze() @property diff --git 
a/src/dolphin/io.py b/src/dolphin/io.py index 0efe386d..d428c2bf 100644 --- a/src/dolphin/io.py +++ b/src/dolphin/io.py @@ -180,10 +180,11 @@ def format_nc_filename(filename: Filename, ds_name: Optional[str] = None) -> str If `ds_name` is not provided for a .h5 or .nc file. """ # If we've already formatted the filename, return it - if str(filename).startswith("NETCDF:") or str(filename).startswith("HDF5:"): - return str(filename) + fname_clean = fspath(filename).lstrip('"').lstrip("'").rstrip('"').rstrip("'") + if fname_clean.startswith("NETCDF:") or fname_clean.startswith("HDF5:"): + return fspath(filename) - if not (fspath(filename).endswith(".nc") or fspath(filename).endswith(".h5")): + if not (fname_clean.endswith(".nc") or fname_clean.endswith(".h5")): return fspath(filename) # Now we're definitely dealing with an HDF5/NetCDF file diff --git a/src/dolphin/utils.py b/src/dolphin/utils.py index db2fa868..73db9034 100644 --- a/src/dolphin/utils.py +++ b/src/dolphin/utils.py @@ -115,15 +115,15 @@ def _get_path_from_gdal_str(name: Filename) -> Path: def _resolve_gdal_path(gdal_str: Filename) -> Filename: """Resolve the file portion of a gdal-openable string to an absolute path.""" - s = str(gdal_str) + s_clean = str(gdal_str).lstrip('"').lstrip("'").rstrip('"').rstrip("'") prefixes = ["DERIVED_SUBDATASET", "NETCDF", "HDF"] - is_gdal_str = any(s.upper().startswith(pre) for pre in prefixes) - file_part = str(_get_path_from_gdal_str(gdal_str)) + is_gdal_str = any(s_clean.upper().startswith(pre) for pre in prefixes) + file_part = str(_get_path_from_gdal_str(s_clean)) # strip quotes to add back in after file_part = file_part.strip('"').strip("'") file_part_resolved = Path(file_part).resolve() - resolved = s.replace(file_part, str(file_part_resolved)) + resolved = s_clean.replace(file_part, str(file_part_resolved)) return Path(resolved) if not is_gdal_str else resolved From e2eca6b049a37015d2e719b547b0c4d7c1452e7b Mon Sep 17 00:00:00 2001 From: scott Date: Tue, 9 Jan 2024 23:16:31 -0500 Subject: [PATCH 13/21] fix bad typing --- src/dolphin/_readers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dolphin/_readers.py b/src/dolphin/_readers.py index 84d74ddf..14b8e31e 100644 --- a/src/dolphin/_readers.py +++ b/src/dolphin/_readers.py @@ -479,7 +479,7 @@ class HDF5StackReader(BaseStackReader): def from_file_list( cls, file_list: Sequence[Filename], - dset_names=str | Sequence[str], + dset_names: str | Sequence[str], keep_open: bool = False, num_threads: int = 1, ) -> HDF5StackReader: From 020e8cde801cf51e3c4ab0d7763418b7e9bba55a Mon Sep 17 00:00:00 2001 From: scott Date: Wed, 10 Jan 2024 12:29:07 -0500 Subject: [PATCH 14/21] remove `get_max_block_shape`. 
switch to `StackReader` protocol for `EagerLoader` --- src/dolphin/io.py | 90 +++++------------------------------------------ 1 file changed, 8 insertions(+), 82 deletions(-) diff --git a/src/dolphin/io.py b/src/dolphin/io.py index d428c2bf..2ecc78a8 100644 --- a/src/dolphin/io.py +++ b/src/dolphin/io.py @@ -22,6 +22,7 @@ from dolphin._background import _DEFAULT_TIMEOUT, BackgroundReader, BackgroundWriter from dolphin._blocks import compute_out_shape, iter_blocks from dolphin._log import get_log +from dolphin._readers import DatasetReader from dolphin._types import Bbox, Filename from dolphin.utils import gdal_to_numpy_type, numpy_to_gdal_type, progress @@ -764,23 +765,24 @@ class EagerLoader(BackgroundReader): def __init__( self, - filename: Filename, + reader: DatasetReader, block_shape: tuple[int, int], overlaps: tuple[int, int] = (0, 0), skip_empty: bool = True, + nodata_value: Optional[float] = None, nodata_mask: Optional[ArrayLike] = None, queue_size: int = 1, timeout: float = _DEFAULT_TIMEOUT, show_progress: bool = True, ): super().__init__(nq=queue_size, timeout=timeout, name="EagerLoader") - self.filename = filename + self.reader = reader # Set up the generator of ((row_start, row_end), (col_start, col_end)) - xsize, ysize = get_raster_xysize(filename) # convert the slice generator to a list so we have the size + nrows, ncols = self.reader.shape[-2:] self.slices = list( iter_blocks( - arr_shape=(ysize, xsize), + arr_shape=(nrows, ncols), block_shape=block_shape, overlaps=overlaps, ) @@ -789,14 +791,14 @@ def __init__( self._skip_empty = skip_empty self._nodata_mask = nodata_mask self._block_shape = block_shape - self._nodata = get_raster_nodata(filename) + self._nodata = nodata_value self._show_progress = show_progress if self._nodata is None: self._nodata = np.nan def read(self, rows: slice, cols: slice) -> tuple[np.ndarray, tuple[slice, slice]]: logger.debug(f"EagerLoader reading {rows}, {cols}") - cur_block = load_gdal(self.filename, rows=rows, cols=cols) + cur_block = self.reader[rows, cols] return cur_block, (rows, cols) def iter_blocks( @@ -834,48 +836,6 @@ def iter_blocks( self.notify_finished() -def get_max_block_shape( - filename: Filename, nstack: int, max_bytes: float = 64e6 -) -> tuple[int, int]: - """Find a block shape to load from `filename` with memory size < `max_bytes`. - - Attempts to get an integer number of chunks ("tiles" for geotiffs) from the - file to avoid partial tiles. - - Parameters - ---------- - filename : str - GDAL-readable file name containing 3D dataset. - nstack: int - Number of bands in dataset. - max_bytes : float, optional - Target size of memory (in Bytes) for each block. - Defaults to 64e6. - - Returns - ------- - tuple[int, int]: - (num_rows, num_cols) shape of blocks to load from `vrt_file` - """ - chunk_cols, chunk_rows = get_raster_chunk_size(filename) - xsize, ysize = get_raster_xysize(filename) - # If it's written by line, load at least 16 lines at a time - chunk_cols = min(max(16, chunk_cols), xsize) - chunk_rows = min(max(16, chunk_rows), ysize) - - ds = gdal.Open(fspath(filename)) - shape = (ds.RasterYSize, ds.RasterXSize) - # get the size of the data type from the raster - nbytes = gdal_to_numpy_type(ds.GetRasterBand(1).DataType).itemsize - return _increment_until_max( - max_bytes=max_bytes, - file_chunk_size=[chunk_rows, chunk_cols], - shape=shape, - nstack=nstack, - bytes_per_pixel=nbytes, - ) - - def get_raster_chunk_size(filename: Filename) -> list[int]: """Get size the raster's chunks on disk. 
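# Illustrative usage sketch (not from the patch itself): after this change an
# EagerLoader is constructed from any DatasetReader plus an explicit block shape,
# rather than from a filename plus get_max_block_shape(). The raster path here is
# an assumption for the example.
from dolphin._readers import RasterReader
from dolphin.io import EagerLoader

reader = RasterReader.from_file("20220101.slc.tif", band=1)
loader = EagerLoader(reader=reader, block_shape=(512, 512))
for block, (rows, cols) in loader.iter_blocks():
    ...  # process one (rows, cols) window; blocks that are entirely nodata are skipped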
@@ -888,37 +848,3 @@ def get_raster_chunk_size(filename: Filename) -> list[int]: logger.warning(f"Warning: {filename} bands have different block shapes.") break return block_size - - -def _increment_until_max( - max_bytes: float, - file_chunk_size: Sequence[int], - shape: tuple[int, int], - nstack: int, - bytes_per_pixel: int = 8, -) -> tuple[int, int]: - """Find size of 3D chunk to load while staying at ~`max_bytes` bytes of RAM.""" - chunk_rows, chunk_cols = file_chunk_size - - # How many chunks can we fit in max_bytes? - chunks_per_block = max_bytes / ( - (nstack * chunk_rows * chunk_cols) * bytes_per_pixel - ) - num_chunks = [1, 1] - cur_block_shape = [chunk_rows, chunk_cols] - - idx = 1 # start incrementing cols - while chunks_per_block > 1 and tuple(cur_block_shape) != tuple(shape): - # Alternate between adding a row and column chunk by flipping the idx - chunk_idx = idx % 2 - nc = num_chunks[chunk_idx] - chunk_size = file_chunk_size[chunk_idx] - - cur_block_shape[chunk_idx] = min(nc * chunk_size, shape[chunk_idx]) - - chunks_per_block = max_bytes / ( - nstack * np.prod(cur_block_shape) * bytes_per_pixel - ) - num_chunks[chunk_idx] += 1 - idx += 1 - return cur_block_shape[0], cur_block_shape[1] From 5fc7cc1fcb3f6d129ac4c2cd2a29c3b0b63a87ce Mon Sep 17 00:00:00 2001 From: scott Date: Wed, 10 Jan 2024 13:07:50 -0500 Subject: [PATCH 15/21] reorg tests into classes --- tests/test_readers.py | 210 +++++++++++++++++++++++------------------- 1 file changed, 115 insertions(+), 95 deletions(-) diff --git a/tests/test_readers.py b/tests/test_readers.py index 730f34b9..9d384d59 100644 --- a/tests/test_readers.py +++ b/tests/test_readers.py @@ -7,7 +7,7 @@ from osgeo import gdal from dolphin._readers import ( - BinaryFile, + BinaryReader, BinaryStackReader, HDF5Reader, HDF5StackReader, @@ -20,6 +20,9 @@ # Note: uses the fixtures from conftest.py +# Get combinations of slices +slices_to_test = [slice(None), 1, slice(0, 10, 2)] + @pytest.fixture(scope="module") def binary_file_list(tmp_path_factory, slc_stack): @@ -53,43 +56,44 @@ def binary_file_list(tmp_path_factory, slc_stack): @pytest.fixture -def binary_file(slc_stack, binary_file_list): - f = BinaryFile(binary_file_list[0], shape=slc_stack[0].shape, dtype=slc_stack.dtype) +def binary_reader(slc_stack, binary_file_list): + f = BinaryReader( + binary_file_list[0], shape=slc_stack[0].shape, dtype=slc_stack.dtype + ) assert f.shape == slc_stack[0].shape assert f.dtype == slc_stack[0].dtype return f -def test_binary_file_read(binary_file, slc_stack): - npt.assert_array_almost_equal(binary_file[()], slc_stack[0]) - # Check the reading of a subset - npt.assert_array_almost_equal(binary_file[0:10, 0:10], slc_stack[0][0:10, 0:10]) - - -@pytest.fixture(scope="module") -def binary_stack(slc_stack, binary_file_list): - s = BinaryStackReader.from_file_list( - binary_file_list, shape_2d=slc_stack[0].shape, dtype=slc_stack.dtype - ) - assert s.shape == slc_stack.shape - assert len(s) == len(slc_stack) == len(binary_file_list) - assert s.ndim == 3 - assert s.dtype == slc_stack.dtype - return s - - -# Get combinations of slices -slices_to_test = [slice(None), 1, slice(0, 10, 2)] - - -@pytest.mark.parametrize("dslice", slices_to_test) -@pytest.mark.parametrize("rslice", slices_to_test) -@pytest.mark.parametrize("cslice", slices_to_test) -def test_binary_stack_read_slices(binary_stack, slc_stack, dslice, rslice, cslice): - s = binary_stack[dslice, rslice, cslice] - expected = slc_stack[dslice, rslice, cslice] - assert s.shape == expected.shape - 
npt.assert_array_almost_equal(s, expected) +class TestBinary: + def test_binary_file_read(self, binary_reader, slc_stack): + npt.assert_array_almost_equal(binary_reader[()], slc_stack[0]) + # Check the reading of a subset + npt.assert_array_almost_equal( + binary_reader[0:10, 0:10], slc_stack[0][0:10, 0:10] + ) + + @pytest.fixture(scope="module") + def binary_stack(self, slc_stack, binary_file_list): + s = BinaryStackReader.from_file_list( + binary_file_list, shape_2d=slc_stack[0].shape, dtype=slc_stack.dtype + ) + assert s.shape == slc_stack.shape + assert len(s) == len(slc_stack) == len(binary_file_list) + assert s.ndim == 3 + assert s.dtype == slc_stack.dtype + return s + + @pytest.mark.parametrize("dslice", slices_to_test) + @pytest.mark.parametrize("rslice", slices_to_test) + @pytest.mark.parametrize("cslice", slices_to_test) + def test_binary_stack_read_slices( + self, binary_stack, slc_stack, dslice, rslice, cslice + ): + s = binary_stack[dslice, rslice, cslice] + expected = slc_stack[dslice, rslice, cslice] + assert s.shape == expected.shape + npt.assert_array_almost_equal(s, expected) # #### HDF5 Tests #### @@ -121,44 +125,42 @@ def hdf5_reader(hdf5_file_list, slc_stack): return r -def test_hdf5_reader_read(hdf5_reader, slc_stack): - npt.assert_array_almost_equal(hdf5_reader[()], slc_stack[0]) - # Check the reading of a subset - npt.assert_array_almost_equal(hdf5_reader[0:10, 0:10], slc_stack[0][0:10, 0:10]) - - -@pytest.mark.parametrize("keep_open", [True, False]) -def hdf5_stack(hdf5_file_list, slc_stack, keep_open): - s = HDF5StackReader.from_file_list( - hdf5_file_list, dset_names="data", keep_open=keep_open - ) - assert s.shape == slc_stack.shape - assert len(s) == len(slc_stack) == len(hdf5_file_list) - assert s.ndim == 3 - assert s.dtype == slc_stack.dtype - return s - - -@pytest.mark.parametrize("dslice", slices_to_test) -@pytest.mark.parametrize("rslice", slices_to_test) -@pytest.mark.parametrize("cslice", slices_to_test) -@pytest.mark.parametrize("keep_open", [True, False]) -def test_hdf5_stack_read_slices( - hdf5_file_list, slc_stack, keep_open, dslice, rslice, cslice -): - reader = HDF5StackReader.from_file_list( - hdf5_file_list, dset_names="data", keep_open=keep_open - ) - s = reader[dslice, rslice, cslice] - expected = slc_stack[dslice, rslice, cslice] - assert s.shape == expected.shape - npt.assert_array_almost_equal(s, expected) +class TestHDF5: + def test_hdf5_reader_read(self, hdf5_reader, slc_stack): + npt.assert_array_almost_equal(hdf5_reader[()], slc_stack[0]) + # Check the reading of a subset + npt.assert_array_almost_equal(hdf5_reader[0:10, 0:10], slc_stack[0][0:10, 0:10]) + + @pytest.mark.parametrize("keep_open", [True, False]) + def hdf5_stack(self, hdf5_file_list, slc_stack, keep_open): + s = HDF5StackReader.from_file_list( + hdf5_file_list, dset_names="data", keep_open=keep_open + ) + assert s.shape == slc_stack.shape + assert len(s) == len(slc_stack) == len(hdf5_file_list) + assert s.ndim == 3 + assert s.dtype == slc_stack.dtype + return s + + @pytest.mark.parametrize("dslice", slices_to_test) + @pytest.mark.parametrize("rslice", slices_to_test) + @pytest.mark.parametrize("cslice", slices_to_test) + @pytest.mark.parametrize("keep_open", [True, False]) + def test_hdf5_stack_read_slices( + self, hdf5_file_list, slc_stack, keep_open, dslice, rslice, cslice + ): + reader = HDF5StackReader.from_file_list( + hdf5_file_list, dset_names="data", keep_open=keep_open + ) + s = reader[dslice, rslice, cslice] + expected = slc_stack[dslice, rslice, cslice] + assert 
s.shape == expected.shape + npt.assert_array_almost_equal(s, expected) # #### RasterReader Tests #### - - -def test_raster_reader(slc_file_list, slc_stack): +@pytest.fixture +def raster_reader(slc_file_list, slc_stack): # ignore georeferencing warnings with pytest.warns(rio.errors.NotGeoreferencedWarning): r = RasterReader.from_file(slc_file_list[0]) @@ -168,33 +170,51 @@ def test_raster_reader(slc_file_list, slc_stack): assert r.dtype == np.complex64 -@pytest.mark.parametrize("keep_open", [True, False]) -def test_raster_stack_reader( - slc_file_list, - slc_stack, - keep_open, -): - with pytest.warns(rio.errors.NotGeoreferencedWarning): - reader = RasterStackReader.from_file_list(slc_file_list, keep_open=keep_open) - assert reader.ndim == 3 - assert reader.shape == slc_stack.shape - assert reader.dtype == slc_stack.dtype - assert len(reader) == len(slc_stack) == len(slc_file_list) - - -@pytest.mark.parametrize("dslice", slices_to_test) -@pytest.mark.parametrize("rslice", slices_to_test) -@pytest.mark.parametrize("cslice", slices_to_test) -@pytest.mark.parametrize("keep_open", [True, False]) -def test_raster_stack_read_slices( - slc_file_list, slc_stack, keep_open, dslice, rslice, cslice -): - with pytest.warns(rio.errors.NotGeoreferencedWarning): - reader = RasterStackReader.from_file_list(slc_file_list, keep_open=keep_open) - s = reader[dslice, rslice, cslice] - expected = slc_stack[dslice, rslice, cslice] - assert s.shape == expected.shape - npt.assert_array_almost_equal(s, expected) +class TestRaster: + @pytest.mark.parametrize("keep_open", [True, False]) + def test_raster_stack_reader( + self, + slc_file_list, + slc_stack, + keep_open, + ): + with pytest.warns(rio.errors.NotGeoreferencedWarning): + reader = RasterStackReader.from_file_list( + slc_file_list, keep_open=keep_open + ) + assert reader.ndim == 3 + assert reader.shape == slc_stack.shape + assert reader.dtype == slc_stack.dtype + assert len(reader) == len(slc_stack) == len(slc_file_list) + + @pytest.mark.parametrize("dslice", slices_to_test) + @pytest.mark.parametrize("rslice", slices_to_test) + @pytest.mark.parametrize("cslice", slices_to_test) + @pytest.mark.parametrize("keep_open", [True, False]) + def test_raster_stack_read_slices( + self, slc_file_list, slc_stack, keep_open, dslice, rslice, cslice + ): + with pytest.warns(rio.errors.NotGeoreferencedWarning): + reader = RasterStackReader.from_file_list( + slc_file_list, keep_open=keep_open + ) + s = reader[dslice, rslice, cslice] + expected = slc_stack[dslice, rslice, cslice] + assert s.shape == expected.shape + npt.assert_array_almost_equal(s, expected) + + +@pytest.mark.parametrize("rows", slices_to_test) +@pytest.mark.parametrize("cols", slices_to_test) +def test_ellipsis_reads(binary_reader, hdf5_reader, raster_reader, rows, cols): + # Test that the ellipsis works + assert binary_reader[...] is binary_reader[()] + assert hdf5_reader[...] is hdf5_reader[()] + assert raster_reader[...] 
is raster_reader[()] + # Test we can still do rows/cols with a leading ellipsis + npt.assert_array_equal(binary_reader[..., rows, cols], binary_reader[rows, cols]) + npt.assert_array_equal(hdf5_reader[..., rows, cols], hdf5_reader[rows, cols]) + npt.assert_array_equal(raster_reader[..., rows, cols], raster_reader[rows, cols]) # #### VRT Tests #### From 4da763508ba6ec8db5e78e51c0d89958fd2e6923 Mon Sep 17 00:00:00 2001 From: scott Date: Wed, 10 Jan 2024 13:50:38 -0500 Subject: [PATCH 16/21] reorder locations of background loaders --- mkdocs.yml | 1 + src/dolphin/_background.py | 7 + src/dolphin/_readers.py | 308 +++++++++++++++++++++++++++++++------ src/dolphin/io.py | 117 +++----------- tests/test_io.py | 42 +---- 5 files changed, 295 insertions(+), 180 deletions(-) diff --git a/mkdocs.yml b/mkdocs.yml index c0aa658d..74652e3c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -13,6 +13,7 @@ theme: plugins: - search # plugin suggestions from here: https://mkdocstrings.github.io/recipes/ +- autorefs - gen-files: scripts: - docs/gen_ref_pages.py diff --git a/src/dolphin/_background.py b/src/dolphin/_background.py index 2d264954..adb917b1 100644 --- a/src/dolphin/_background.py +++ b/src/dolphin/_background.py @@ -14,6 +14,13 @@ _DEFAULT_TIMEOUT = 0.5 +__all__ = [ + "BackgroundWorker", + "BackgroundReader", + "BackgroundWriter", + "DummyProcessPoolExecutor", +] + class BackgroundWorker(abc.ABC): """Base class for doing work in a background thread. diff --git a/src/dolphin/_readers.py b/src/dolphin/_readers.py index 14b8e31e..91dc5e21 100644 --- a/src/dolphin/_readers.py +++ b/src/dolphin/_readers.py @@ -5,7 +5,14 @@ from dataclasses import dataclass from os import fspath from pathlib import Path -from typing import Generator, Optional, Protocol, Sequence, runtime_checkable +from typing import ( + TYPE_CHECKING, + Generator, + Optional, + Protocol, + Sequence, + runtime_checkable, +) import h5py import numpy as np @@ -14,18 +21,26 @@ from osgeo import gdal from dolphin import io, utils +from dolphin._background import _DEFAULT_TIMEOUT, BackgroundReader +from dolphin._blocks import iter_blocks from dolphin._dates import get_dates, sort_files_by_date from dolphin._types import Filename from dolphin.stack import logger +from dolphin.utils import progress __all__ = [ "DatasetReader", - "BinaryFile", + "BinaryReader", "StackReader", "BinaryStackReader", "VRTStack", ] +if TYPE_CHECKING: + from builtins import ellipsis + +Index = ellipsis | slice | int + @runtime_checkable class DatasetReader(Protocol): @@ -49,7 +64,10 @@ class DatasetReader(Protocol): ndim: int """int : Number of array dimensions.""" # noqa: D403 - def __getitem__(self, key: tuple[slice | int, ...], /) -> ArrayLike: + masked: bool = False + """bool : If True, return a masked array with the nodata values masked out.""" + + def __getitem__(self, key: tuple[Index, ...], /) -> ArrayLike: """Read a block of data.""" ... @@ -69,9 +87,21 @@ class StackReader(DatasetReader, Protocol): shape: tuple[int, int, int] """tuple of int : Tuple of array dimensions.""" + @property + def __len__(self) -> int: + """int : Number of images in the stack.""" + return self.shape[0] + + +def _mask_array(arr: np.ndarray, nodata_value: float | None) -> np.ndarray: + """Mask an array based on a nodata value.""" + if np.isnan(nodata_value): + return np.ma.masked_invalid(arr) + return np.ma.masked_equal(arr, nodata_value) + @dataclass -class BinaryFile(DatasetReader): +class BinaryReader(DatasetReader): """A flat binary file for storing array data. 
See Also @@ -96,19 +126,21 @@ class BinaryFile(DatasetReader): dtype: np.dtype """numpy.dtype : Data-type of the array's elements.""" # noqa: D403 + nodata_value: Optional[float] = None + """Optional[float] : Value to use for nodata pixels.""" + def __post_init__(self): self.filepath = Path(self.filepath) if not self.filepath.exists(): raise FileNotFoundError(f"File {self.filepath} does not exist.") self.dtype = np.dtype(self.dtype) - self.filepath = Path(self.filepath) @property def ndim(self) -> int: # type: ignore[override] """int : Number of array dimensions.""" # noqa: D403 return len(self.shape) - def __getitem__(self, key: tuple[slice | int, ...], /) -> np.ndarray: + def __getitem__(self, key: tuple[Index, ...], /) -> np.ndarray: with self.filepath.open("rb") as f: # Memory-map the entire file. with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm: @@ -118,14 +150,16 @@ def __getitem__(self, key: tuple[slice | int, ...], /) -> np.ndarray: arr = np.frombuffer(mm, dtype=self.dtype).reshape(self.shape) data = arr[key].copy() del arr - return data + return _mask_array(data, self.nodata_value) if self.masked else data def __array__(self) -> np.ndarray: return self[:,] @classmethod - def from_gdal(cls, filename: Filename, band: int = 1) -> BinaryFile: - """Create a BinaryFile from a GDAL-readable file. + def from_gdal( + cls, filename: Filename, band: int = 1, nodata_value: Optional[float] = None + ) -> BinaryReader: + """Create a BinaryReader from a GDAL-readable file. Parameters ---------- @@ -133,16 +167,25 @@ def from_gdal(cls, filename: Filename, band: int = 1) -> BinaryFile: Path to the file to read. band : int, optional Band to read from the file, by default 1 + nodata_value : float, optional + Value to use for nodata pixels, by default None + If None passed, will search for a nodata value in the file. Returns ------- - BinaryFile - The BinaryFile object. + BinaryReader + The BinaryReader object. """ with rio.open(filename) as src: dtype = src.dtypes[band - 1] shape = src.shape - return cls(Path(filename), shape=shape, dtype=dtype) + nodata = src.nodatavals[band - 1] + return cls( + Path(filename), + shape=shape, + dtype=dtype, + nodata_value=nodata_value or nodata, + ) @dataclass @@ -163,7 +206,7 @@ class HDF5Reader(DatasetReader): See Also -------- - BinaryFile + BinaryReader RasterReader Notes @@ -181,7 +224,14 @@ class HDF5Reader(DatasetReader): dset_name: str """str : The path to the dataset within the file.""" + nodata_value: Optional[float] = None + """Optional[float] : Value to use for nodata pixels. + + If None, looks for `_FillValue` or `missing_value` attributes on the dataset. 
+ """ + keep_open: bool = False + """bool : If True, keep the HDF5 file handle open for faster reading.""" def __post_init__(self): filepath = Path(self.filepath) @@ -191,6 +241,10 @@ def __post_init__(self): self.shape = dset.shape self.dtype = dset.dtype self.chunks = dset.chunks + if self.nodata_value is None: + self.nodata_value = dset.attrs.get("_FillValue", None) + if self.nodata_value is None: + self.nodata_value = dset.attrs.get("missing_value", None) if self.keep_open: self._hf = hf self._dset = dset @@ -205,19 +259,25 @@ def ndim(self) -> int: # type: ignore[override] def __array__(self) -> np.ndarray: return self[:,] - def __getitem__(self, key: tuple[slice | int, ...], /) -> np.ndarray: + def __getitem__(self, key: tuple[Index, ...], /) -> np.ndarray: if self.keep_open: - return self._dset[key] + data = self._dset[key] + else: + with h5py.File(self.filepath, "r") as f: + data = f[self.dset_name][key] + return _mask_array(data, self.nodata_value) if self.masked else data - with h5py.File(self.filepath, "r") as f: - dataset = f[self.dset_name] - return dataset[key] +def _ensure_slices(rows: Index, cols: Index) -> tuple[slice, slice]: + def _parse(key: Index): + if isinstance(key, int): + return slice(key, key + 1) + elif key is ...: + return slice(None) + else: + return key -def _ensure_slices(rows: slice | int, cols: slice | int) -> tuple[slice, slice]: - r_slice = slice(rows, rows + 1) if isinstance(rows, int) else rows - c_slice = slice(cols, cols + 1) if isinstance(cols, int) else cols - return r_slice, c_slice + return _parse(rows), _parse(cols) @dataclass @@ -226,8 +286,8 @@ class RasterReader(DatasetReader): See Also -------- - BinaryFile - HDF5Dataset + BinaryReader + HDF5 Notes ----- @@ -261,6 +321,9 @@ class RasterReader(DatasetReader): shape: tuple[int, int] dtype: np.dtype + nodata_value: Optional[float] = None + """Optional[float] : Value to use for nodata pixels.""" + keep_open: bool = False """bool : If True, keep the rasterio file handle open for faster reading.""" @@ -269,6 +332,7 @@ def from_file( cls, filepath: Filename, band: int = 1, + nodata_value: Optional[float] = None, keep_open: bool = False, **options, ) -> RasterReader: @@ -277,6 +341,7 @@ def from_file( dtype = np.dtype(src.dtypes[band - 1]) driver = src.driver crs = src.crs + nodata_value = nodata_value or src.nodatavals[band - 1] transform = src.transform return cls( @@ -287,6 +352,7 @@ def from_file( transform=transform, shape=shape, dtype=dtype, + nodata_value=nodata_value, keep_open=keep_open, ) @@ -302,13 +368,13 @@ def ndim(self) -> int: # type: ignore[override] def __array__(self) -> np.ndarray: return self[:, :] - def __getitem__(self, key: tuple[slice | int, ...], /) -> np.ndarray: + def __getitem__(self, key: tuple[Index, ...], /) -> np.ndarray: import rasterio.windows if not isinstance(key, tuple): raise ValueError("Index must be a tuple of slices or integers.") - r_slice, c_slice = _ensure_slices(*key) + r_slice, c_slice = _ensure_slices(*key[-2:]) window = rasterio.windows.Window.from_slices( r_slice, c_slice, @@ -320,13 +386,16 @@ def __getitem__(self, key: tuple[slice | int, ...], /) -> np.ndarray: with rio.open(self.filepath) as src: out = src.read(self.band, window=window) + out_masked = _mask_array(out, self.nodata_value) if self.masked else out # Note that Rasterio doesn't use the `step` of a slice, so we need to # manually slice the output array. 
r_step, c_step = r_slice.step or 1, c_slice.step or 1 - return out[::r_step, ::c_step] + return out_masked[::r_step, ::c_step].squeeze() -def _read_3d(key, readers: Sequence[DatasetReader], num_threads: int = 1): +def _read_3d( + key: tuple[Index, ...], readers: Sequence[DatasetReader], num_threads: int = 1 +): # Check that it's a tuple of slices if not isinstance(key, tuple): raise ValueError("Index must be a tuple of slices.") @@ -366,32 +435,73 @@ class BaseStackReader(StackReader): file_list: Sequence[Filename] readers: Sequence[DatasetReader] num_threads: int = 1 + nodata_value: Optional[float] = None - def __getitem__(self, key: tuple[slice | int, ...], /) -> np.ndarray: + def __getitem__(self, key: tuple[Index, ...], /) -> np.ndarray: return _read_3d(key, self.readers, num_threads=self.num_threads) - def __len__(self): - return len(self.file_list) - @property def shape_2d(self): return self.readers[0].shape @property def shape(self): - return (len(self), *self.shape_2d) + return (len(self.file_list), *self.shape_2d) @property def dtype(self): return self.readers[0].dtype + def iter_blocks( + self, + overlaps: tuple[int, int] = (0, 0), + block_shape: tuple[int, int] = (512, 512), + skip_empty: bool = True, + nodata_mask: Optional[np.ndarray] = None, + show_progress: bool = True, + ) -> Generator[tuple[np.ndarray, tuple[slice, slice]], None, None]: + """Iterate over blocks of the stack. + + Loads all images for one window at a time into memory, while eagerly + fetching the next block in the background. + + Parameters + ---------- + overlaps : tuple[int, int], optional + Pixels to overlap each block by (rows, cols) + By default (0, 0) + block_shape : tuple[int, int], optional + 2D shape of blocks to load at a time. + Loads all dates/bands at a time with this shape. + skip_empty : bool, optional (default True) + Skip blocks that are entirely empty (all NaNs) + nodata_mask : bool, optional + Optional mask indicating nodata values. If provided, will skip + blocks that are entirely nodata. + 1s are the nodata values, 0s are valid data. + show_progress : bool, default=True + If true, displays a `rich` ProgressBar. + + Yields + ------ + tuple[np.ndarray, tuple[slice, slice]] + Iterator of (data, (slice(row_start, row_stop), slice(col_start, col_stop)) + + """ + loader = EagerLoader( + self, + block_shape=block_shape, + overlaps=overlaps, + nodata_mask=nodata_mask, + nodata_value=self.nodata_value, + skip_empty=skip_empty, + show_progress=show_progress, + ) + yield from loader.iter_blocks() + @dataclass class BinaryStackReader(BaseStackReader): - file_list: Sequence[Filename] - readers: Sequence[BinaryFile] - num_threads: int = 1 - @classmethod def from_file_list( cls, file_list: Sequence[Filename], shape_2d: tuple[int, int], dtype: np.dtype @@ -412,7 +522,9 @@ def from_file_list( BinaryStackReader The BinaryStackReader object. """ - readers = [BinaryFile(Path(f), shape=shape_2d, dtype=dtype) for f in file_list] + readers = [ + BinaryReader(Path(f), shape=shape_2d, dtype=dtype) for f in file_list + ] return cls(file_list=file_list, readers=readers, num_threads=1) @classmethod @@ -421,6 +533,7 @@ def from_gdal( file_list: Sequence[Filename], band: int = 1, num_threads: int = 1, + nodata_value: Optional[float] = None, ) -> BinaryStackReader: """Create a BinaryStackReader from a list of GDAL-readable files. @@ -432,6 +545,9 @@ def from_gdal( Band to read from the file, by default 1 num_threads : int, optional (default 1) Number of threads to use for reading. 
+ nodata_value : float, optional + Manually set value to use for nodata pixels, by default None + If None passed, will search for a nodata value in the file. Returns ------- @@ -449,11 +565,12 @@ def from_gdal( raise ValueError("All files must have the same data type.") if len(shapes) > 1: raise ValueError("All files must have the same shape.") - readers.append(BinaryFile.from_gdal(f, band=band)) + readers.append(BinaryReader.from_gdal(f, band=band)) return cls( file_list=file_list, readers=readers, num_threads=num_threads, + nodata_value=nodata_value, ) @@ -482,6 +599,7 @@ def from_file_list( dset_names: str | Sequence[str], keep_open: bool = False, num_threads: int = 1, + nodata_value: Optional[float] = None, ) -> HDF5StackReader: """Create a HDF5StackReader from a list of files. @@ -496,6 +614,9 @@ def from_file_list( If True, keep the HDF5 file handles open for faster reading. num_threads : int, optional (default 1) Number of threads to use for reading. + nodata_value : float, optional + Manually set value to use for nodata pixels, by default None + If None passed, will search for a nodata value in the file. Returns ------- @@ -506,10 +627,19 @@ def from_file_list( dset_names = [dset_names] * len(file_list) readers = [ - HDF5Reader(Path(f), dset_name=dn, keep_open=keep_open) + HDF5Reader( + Path(f), dset_name=dn, keep_open=keep_open, nodata_value=nodata_value + ) for (f, dn) in zip(file_list, dset_names) ] - return cls(file_list, readers, num_threads=num_threads) + # Check if nodata values were found in the files + nds = set([r.nodata_value for r in readers]) + if len(nds) == 1: + nodata_value = nds.pop() + + return cls( + file_list, readers, num_threads=num_threads, nodata_value=nodata_value + ) @dataclass @@ -535,6 +665,7 @@ def from_file_list( bands: int | Sequence[int] = 1, keep_open: bool = False, num_threads: int = 1, + nodata_value: Optional[float] = None, ) -> RasterStackReader: """Create a RasterStackReader from a list of files. @@ -550,6 +681,8 @@ def from_file_list( If True, keep the rasterio file handles open for faster reading. num_threads : int, optional (default 1) Number of threads to use for reading. + nodata_value : float, optional + Manually set value to use for nodata pixels, by default None Returns ------- @@ -563,7 +696,13 @@ def from_file_list( RasterReader.from_file(f, band=b, keep_open=keep_open) for (f, b) in zip(file_list, bands) ] - return cls(file_list, readers, num_threads=num_threads) + # Check if nodata values were found in the files + nds = set([r.nodata_value for r in readers]) + if len(nds) == 1: + nodata_value = nds.pop() + return cls( + file_list, readers, num_threads=num_threads, nodata_value=nodata_value + ) class VRTStack(StackReader): @@ -757,7 +896,8 @@ def iter_blocks( ) -> Generator[tuple[np.ndarray, tuple[slice, slice]], None, None]: """Iterate over blocks of the stack. - Loads all images for one window at a time into memory. + Loads all images for one window at a time into memory, while eagerly + fetching the next block in the background. 
Parameters ---------- @@ -782,15 +922,15 @@ def iter_blocks( Iterator of (data, (slice(row_start, row_stop), slice(col_start, col_stop)) """ - self._loader = io.EagerLoader( - self.outfile, + loader = EagerLoader( + self, block_shape=block_shape, overlaps=overlaps, nodata_mask=nodata_mask, skip_empty=skip_empty, show_progress=show_progress, ) - yield from self._loader.iter_blocks() + yield from loader.iter_blocks() @property def shape(self): @@ -907,3 +1047,79 @@ def _parse_vrt_file(vrt_file): filepaths.append(name) return filepaths, sds + + +class EagerLoader(BackgroundReader): + """Class to pre-fetch data chunks in a background thread.""" + + def __init__( + self, + reader: DatasetReader, + block_shape: tuple[int, int], + overlaps: tuple[int, int] = (0, 0), + skip_empty: bool = True, + nodata_value: Optional[float] = None, + nodata_mask: Optional[ArrayLike] = None, + queue_size: int = 1, + timeout: float = _DEFAULT_TIMEOUT, + show_progress: bool = True, + ): + super().__init__(nq=queue_size, timeout=timeout, name="EagerLoader") + self.reader = reader + # Set up the generator of ((row_start, row_end), (col_start, col_end)) + # convert the slice generator to a list so we have the size + nrows, ncols = self.reader.shape[-2:] + self.slices = list( + iter_blocks( + arr_shape=(nrows, ncols), + block_shape=block_shape, + overlaps=overlaps, + ) + ) + self._queue_size = queue_size + self._skip_empty = skip_empty + self._nodata_mask = nodata_mask + self._block_shape = block_shape + self._nodata = nodata_value + self._show_progress = show_progress + if self._nodata is None: + self._nodata = np.nan + + def read(self, rows: slice, cols: slice) -> tuple[np.ndarray, tuple[slice, slice]]: + logger.debug(f"EagerLoader reading {rows}, {cols}") + cur_block = self.reader[..., rows, cols] + return cur_block, (rows, cols) + + def iter_blocks( + self, + ) -> Generator[tuple[np.ndarray, tuple[slice, slice]], None, None]: + # Queue up all slices to the work queue + queued_slices = [] + for rows, cols in self.slices: + # Skip queueing a read if all nodata + if self._skip_empty and self._nodata_mask is not None: + logger.debug("Checking nodata mask") + if self._nodata_mask[rows, cols].all(): + logger.debug("Skipping!") + continue + self.queue_read(rows, cols) + queued_slices.append((rows, cols)) + + s_iter = range(len(queued_slices)) + desc = f"Processing {self._block_shape} sized blocks..." 
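        # Drain results as the background thread finishes each read; a block that
        # turns out to be entirely nodata is dropped here, since the queue-time skip
        # above only applies when an explicit nodata_mask was provided.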
+ with progress(dummy=not self._show_progress) as p: + for _ in p.track(s_iter, description=desc): + cur_block, (rows, cols) = self.get_data() + logger.debug(f"got data for {rows, cols}: {cur_block.shape}") + + # Otherwise look at the actual block we loaded + if np.isnan(self._nodata): + block_nodata = np.isnan(cur_block) + else: + block_nodata = cur_block == self._nodata + if np.all(block_nodata): + logger.debug("Skipping block since it was all nodata") + continue + yield cur_block, (rows, cols) + + self.notify_finished() diff --git a/src/dolphin/io.py b/src/dolphin/io.py index 2ecc78a8..5ccbfc32 100644 --- a/src/dolphin/io.py +++ b/src/dolphin/io.py @@ -11,7 +11,7 @@ from dataclasses import dataclass from os import fspath from pathlib import Path -from typing import Any, Generator, Mapping, Optional, Sequence, Union +from typing import Any, Mapping, Optional, Sequence, Union import h5py import numpy as np @@ -19,12 +19,11 @@ from osgeo import gdal from pyproj import CRS -from dolphin._background import _DEFAULT_TIMEOUT, BackgroundReader, BackgroundWriter -from dolphin._blocks import compute_out_shape, iter_blocks +from dolphin._background import BackgroundWriter +from dolphin._blocks import compute_out_shape from dolphin._log import get_log -from dolphin._readers import DatasetReader from dolphin._types import Bbox, Filename -from dolphin.utils import gdal_to_numpy_type, numpy_to_gdal_type, progress +from dolphin.utils import gdal_to_numpy_type, numpy_to_gdal_type gdal.UseExceptions() @@ -32,7 +31,7 @@ "load_gdal", "write_arr", "write_block", - "EagerLoader", + "Writer", ] @@ -721,6 +720,20 @@ def from_user_inputs( ) +def get_raster_chunk_size(filename: Filename) -> list[int]: + """Get size the raster's chunks on disk. + + This is called blockXsize, blockYsize by GDAL. + """ + ds = gdal.Open(fspath(filename)) + block_size = ds.GetRasterBand(1).GetBlockSize() + for i in range(2, ds.RasterCount + 1): + if block_size != ds.GetRasterBand(i).GetBlockSize(): + logger.warning(f"Warning: {filename} bands have different block shapes.") + break + return block_size + + class Writer(BackgroundWriter): """Class to write data to files in a background thread.""" @@ -729,7 +742,7 @@ def __init__(self, max_queue: int = 0, debug: bool = False, **kwargs): super().__init__(nq=max_queue, name="Writer", **kwargs) else: # Don't start a background thread. 
Just synchronously write data - self.queue_write = lambda *args: write_block(*args) # type: ignore + setattr(self, "queue_write", self.write) def write( self, data: ArrayLike, filename: Filename, row_start: int, col_start: int @@ -758,93 +771,3 @@ def write( def num_queued(self): """Number of items waiting in the queue to be written.""" return self._work_queue.qsize() - - -class EagerLoader(BackgroundReader): - """Class to pre-fetch data chunks in a background thread.""" - - def __init__( - self, - reader: DatasetReader, - block_shape: tuple[int, int], - overlaps: tuple[int, int] = (0, 0), - skip_empty: bool = True, - nodata_value: Optional[float] = None, - nodata_mask: Optional[ArrayLike] = None, - queue_size: int = 1, - timeout: float = _DEFAULT_TIMEOUT, - show_progress: bool = True, - ): - super().__init__(nq=queue_size, timeout=timeout, name="EagerLoader") - self.reader = reader - # Set up the generator of ((row_start, row_end), (col_start, col_end)) - # convert the slice generator to a list so we have the size - nrows, ncols = self.reader.shape[-2:] - self.slices = list( - iter_blocks( - arr_shape=(nrows, ncols), - block_shape=block_shape, - overlaps=overlaps, - ) - ) - self._queue_size = queue_size - self._skip_empty = skip_empty - self._nodata_mask = nodata_mask - self._block_shape = block_shape - self._nodata = nodata_value - self._show_progress = show_progress - if self._nodata is None: - self._nodata = np.nan - - def read(self, rows: slice, cols: slice) -> tuple[np.ndarray, tuple[slice, slice]]: - logger.debug(f"EagerLoader reading {rows}, {cols}") - cur_block = self.reader[rows, cols] - return cur_block, (rows, cols) - - def iter_blocks( - self, - ) -> Generator[tuple[np.ndarray, tuple[slice, slice]], None, None]: - # Queue up all slices to the work queue - queued_slices = [] - for rows, cols in self.slices: - # Skip queueing a read if all nodata - if self._skip_empty and self._nodata_mask is not None: - logger.debug("Checking nodata mask") - if self._nodata_mask[rows, cols].all(): - logger.debug("Skipping!") - continue - self.queue_read(rows, cols) - queued_slices.append((rows, cols)) - - s_iter = range(len(queued_slices)) - desc = f"Processing {self._block_shape} sized blocks..." - with progress(dummy=not self._show_progress) as p: - for _ in p.track(s_iter, description=desc): - cur_block, (rows, cols) = self.get_data() - logger.debug(f"got data for {rows, cols}: {cur_block.shape}") - - # Otherwise look at the actual block we loaded - if np.isnan(self._nodata): - block_nodata = np.isnan(cur_block) - else: - block_nodata = cur_block == self._nodata - if np.all(block_nodata): - logger.debug("Skipping block since it was all nodata") - continue - yield cur_block, (rows, cols) - - self.notify_finished() - - -def get_raster_chunk_size(filename: Filename) -> list[int]: - """Get size the raster's chunks on disk. - - This is called blockXsize, blockYsize by GDAL. 
- """ - ds = gdal.Open(fspath(filename)) - block_size = ds.GetRasterBand(1).GetBlockSize() - for i in range(2, ds.RasterCount + 1): - if block_size != ds.GetRasterBand(i).GetBlockSize(): - logger.warning(f"Warning: {filename} bands have different block shapes.") - break - return block_size diff --git a/tests/test_io.py b/tests/test_io.py index da89e6fa..d3951060 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -6,7 +6,7 @@ import dolphin._blocks from dolphin import io -from dolphin._readers import VRTStack +from dolphin._readers import RasterReader, VRTStack def test_load(raster_100_by_200): @@ -262,44 +262,12 @@ def test_get_raster_block_sizes(raster_100_by_200, tiled_raster_100_by_200): assert io.get_raster_chunk_size(raster_100_by_200) == [200, 1] -def test_get_max_block_shape(raster_100_by_200, tiled_raster_100_by_200): - # for io.get_max_block_shape, the rasters are 8 bytes per pixel - # if we have 1 GB, the whole raster should fit in memory - bs = io.get_max_block_shape(tiled_raster_100_by_200, nstack=1, max_bytes=1e9) - assert bs == (100, 200) - - # for untiled, the block size is one line - bs = io.get_max_block_shape(raster_100_by_200, nstack=1, max_bytes=0) - # The function forces at least 16 lines to be read at a time - assert bs == (16, 200) - bs = io.get_max_block_shape(raster_100_by_200, nstack=1, max_bytes=8 * 17 * 200) - assert bs == (32, 200) - - # Pretend we have a stack of 10 images - nstack = 10 - # one tile should be 8 * 32 * 32 * 10 = 81920 bytes - bytes_per_tile = 8 * 32 * 32 * nstack - bs = io.get_max_block_shape( - tiled_raster_100_by_200, nstack, max_bytes=bytes_per_tile - ) - assert bs == (32, 32) - - # with a little more, we should get 2 tiles - bs = io.get_max_block_shape( - tiled_raster_100_by_200, nstack, max_bytes=1 + bytes_per_tile - ) - assert bs == (32, 64) - - bs = io.get_max_block_shape( - tiled_raster_100_by_200, nstack, max_bytes=4 * bytes_per_tile - ) - assert bs == (64, 64) - - def test_iter_blocks(tiled_raster_100_by_200): # Try the whole raster - bs = io.get_max_block_shape(tiled_raster_100_by_200, 1, max_bytes=1e9) - loader = io.EagerLoader(filename=tiled_raster_100_by_200, block_shape=bs) + bs = (100, 200) + loader = io.EagerLoader( + reader=RasterReader.from_file(tiled_raster_100_by_200), block_shape=bs + ) # `list` should try to load all at once` block_slice_tuples = list(loader.iter_blocks()) assert not loader._thread.is_alive() From e849f8b410995dfc05f9daff9ed3afc45577b51b Mon Sep 17 00:00:00 2001 From: scott Date: Wed, 10 Jan 2024 14:18:39 -0500 Subject: [PATCH 17/21] use `StackReader` in `ps.create_ps`, drop `.iter_blocks` --- src/dolphin/_readers.py | 93 ----------------------------------------- src/dolphin/ps.py | 25 +++++------ 2 files changed, 13 insertions(+), 105 deletions(-) diff --git a/src/dolphin/_readers.py b/src/dolphin/_readers.py index 91dc5e21..aab65341 100644 --- a/src/dolphin/_readers.py +++ b/src/dolphin/_readers.py @@ -452,53 +452,6 @@ def shape(self): def dtype(self): return self.readers[0].dtype - def iter_blocks( - self, - overlaps: tuple[int, int] = (0, 0), - block_shape: tuple[int, int] = (512, 512), - skip_empty: bool = True, - nodata_mask: Optional[np.ndarray] = None, - show_progress: bool = True, - ) -> Generator[tuple[np.ndarray, tuple[slice, slice]], None, None]: - """Iterate over blocks of the stack. - - Loads all images for one window at a time into memory, while eagerly - fetching the next block in the background. 
- - Parameters - ---------- - overlaps : tuple[int, int], optional - Pixels to overlap each block by (rows, cols) - By default (0, 0) - block_shape : tuple[int, int], optional - 2D shape of blocks to load at a time. - Loads all dates/bands at a time with this shape. - skip_empty : bool, optional (default True) - Skip blocks that are entirely empty (all NaNs) - nodata_mask : bool, optional - Optional mask indicating nodata values. If provided, will skip - blocks that are entirely nodata. - 1s are the nodata values, 0s are valid data. - show_progress : bool, default=True - If true, displays a `rich` ProgressBar. - - Yields - ------ - tuple[np.ndarray, tuple[slice, slice]] - Iterator of (data, (slice(row_start, row_stop), slice(col_start, col_stop)) - - """ - loader = EagerLoader( - self, - block_shape=block_shape, - overlaps=overlaps, - nodata_mask=nodata_mask, - nodata_value=self.nodata_value, - skip_empty=skip_empty, - show_progress=show_progress, - ) - yield from loader.iter_blocks() - @dataclass class BinaryStackReader(BaseStackReader): @@ -886,52 +839,6 @@ def from_vrt_file(cls, vrt_file, new_outfile=None, **kwargs): **kwargs, ) - def iter_blocks( - self, - overlaps: tuple[int, int] = (0, 0), - block_shape: tuple[int, int] = (512, 512), - skip_empty: bool = True, - nodata_mask: Optional[np.ndarray] = None, - show_progress: bool = True, - ) -> Generator[tuple[np.ndarray, tuple[slice, slice]], None, None]: - """Iterate over blocks of the stack. - - Loads all images for one window at a time into memory, while eagerly - fetching the next block in the background. - - Parameters - ---------- - overlaps : tuple[int, int], optional - Pixels to overlap each block by (rows, cols) - By default (0, 0) - block_shape : tuple[int, int], optional - 2D shape of blocks to load at a time. - Loads all dates/bands at a time with this shape. - skip_empty : bool, optional (default True) - Skip blocks that are entirely empty (all NaNs) - nodata_mask : bool, optional - Optional mask indicating nodata values. If provided, will skip - blocks that are entirely nodata. - 1s are the nodata values, 0s are valid data. - show_progress : bool, default=True - If true, displays a `rich` ProgressBar. 
- - Yields - ------ - tuple[np.ndarray, tuple[slice, slice]] - Iterator of (data, (slice(row_start, row_stop), slice(col_start, col_stop)) - - """ - loader = EagerLoader( - self, - block_shape=block_shape, - overlaps=overlaps, - nodata_mask=nodata_mask, - skip_empty=skip_empty, - show_progress=show_progress, - ) - yield from loader.iter_blocks() - @property def shape(self): """Get the 3D shape of the stack.""" diff --git a/src/dolphin/ps.py b/src/dolphin/ps.py index 7657de6b..fe6770f6 100644 --- a/src/dolphin/ps.py +++ b/src/dolphin/ps.py @@ -1,5 +1,4 @@ """Find the persistent scatterers in a stack of SLCS.""" - from __future__ import annotations import shutil @@ -14,7 +13,7 @@ from dolphin import io, utils from dolphin._log import get_log -from dolphin._readers import VRTStack +from dolphin._readers import EagerLoader, StackReader from dolphin._types import Filename gdal.UseExceptions() @@ -28,10 +27,11 @@ def create_ps( *, - slc_vrt_file: Filename, + reader: StackReader, output_file: Filename, output_amp_mean_file: Filename, output_amp_dispersion_file: Filename, + like_filename: Filename, amp_dispersion_threshold: float = 0.25, existing_amp_mean_file: Optional[Filename] = None, existing_amp_dispersion_file: Optional[Filename] = None, @@ -44,14 +44,16 @@ def create_ps( Parameters ---------- - slc_vrt_file : Filename - The VRT file pointing to the stack of SLCs. + reader : StackReader + A dataset reader for the 3D SLC stack. output_file : Filename The output PS file (dtype: Byte) output_amp_dispersion_file : Filename The output amplitude dispersion file. output_amp_mean_file : Filename The output mean amplitude file. + like_filename : Filename + The filename to use for the output files' spatial reference. amp_dispersion_threshold : float, optional The threshold for the amplitude dispersion. Default is 0.25. 
existing_amp_mean_file : Optional[Filename], optional @@ -91,29 +93,28 @@ def create_ps( for fn, dtype, nodata in zip(file_list, FILE_DTYPES, NODATA_VALUES): io.write_arr( arr=None, - like_filename=slc_vrt_file, + like_filename=like_filename, output_name=fn, nbands=1, dtype=dtype, nodata=nodata, ) - vrt_stack = VRTStack.from_vrt_file(slc_vrt_file) - # Initialize the intermediate arrays for the calculation - magnitude = np.zeros((len(vrt_stack), *block_shape), dtype=np.float32) + magnitude = np.zeros((reader.shape[0], *block_shape), dtype=np.float32) skip_empty = nodata_mask is None writer = io.Writer() # Make the generator for the blocks - block_gen = vrt_stack.iter_blocks( + block_gen = EagerLoader( + reader, block_shape=block_shape, - skip_empty=skip_empty, nodata_mask=nodata_mask, + skip_empty=skip_empty, show_progress=show_progress, ) - for cur_data, (rows, cols) in block_gen: + for cur_data, (rows, cols) in block_gen.iter_blocks(): cur_rows, cur_cols = cur_data.shape[-2:] if not (np.all(cur_data == 0) or np.all(np.isnan(cur_data))): From af009c39e3334ae37e5291634194e5671f14ed53 Mon Sep 17 00:00:00 2001 From: scott Date: Wed, 10 Jan 2024 14:30:45 -0500 Subject: [PATCH 18/21] fix loading for new `StackReader`, fix `...` as band index --- src/dolphin/_readers.py | 4 +++- src/dolphin/workflows/ps.py | 3 ++- src/dolphin/workflows/wrapped_phase.py | 3 ++- tests/test_ps.py | 7 ++++--- 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/dolphin/_readers.py b/src/dolphin/_readers.py index aab65341..d82dc958 100644 --- a/src/dolphin/_readers.py +++ b/src/dolphin/_readers.py @@ -39,7 +39,7 @@ if TYPE_CHECKING: from builtins import ellipsis -Index = ellipsis | slice | int + Index = ellipsis | slice | int @runtime_checkable @@ -881,6 +881,8 @@ def __getitem__(self, index): if n < 0: n = len(self) + n return self.read_stack(band=n + 1, rows=rows, cols=cols) + elif n is ...: + n = slice(None) bands = list(range(1, 1 + len(self)))[n] if len(bands) == len(self): diff --git a/src/dolphin/workflows/ps.py b/src/dolphin/workflows/ps.py index 1e61ee6b..2ed694a5 100644 --- a/src/dolphin/workflows/ps.py +++ b/src/dolphin/workflows/ps.py @@ -88,10 +88,11 @@ def run( logger.info(f"Creating persistent scatterer file {ps_output}") dolphin.ps.create_ps( - slc_vrt_file=vrt_stack.outfile, + reader=vrt_stack, output_file=output_file_list[0], output_amp_mean_file=output_file_list[1], output_amp_dispersion_file=output_file_list[2], + like_filename=vrt_stack.outfile, amp_dispersion_threshold=cfg.ps_options.amp_dispersion_threshold, block_shape=cfg.worker_settings.block_shape, ) diff --git a/src/dolphin/workflows/wrapped_phase.py b/src/dolphin/workflows/wrapped_phase.py index 186cc39b..6a6c8744 100644 --- a/src/dolphin/workflows/wrapped_phase.py +++ b/src/dolphin/workflows/wrapped_phase.py @@ -74,7 +74,8 @@ def run( existing_amp = existing_disp = None ps.create_ps( - slc_vrt_file=vrt_stack.outfile, + reader=vrt_stack, + like_filename=vrt_stack.outfile, output_file=ps_output, output_amp_mean_file=cfg.ps_options._amp_mean_file, output_amp_dispersion_file=cfg.ps_options._amp_dispersion_file, diff --git a/tests/test_ps.py b/tests/test_ps.py index 91920f8c..a7d17e0e 100644 --- a/tests/test_ps.py +++ b/tests/test_ps.py @@ -61,7 +61,8 @@ def test_create_ps(tmp_path, vrt_stack): amp_dispersion_file = tmp_path / "amp_disp.tif" amp_mean_file = tmp_path / "amp_mean.tif" dolphin.ps.create_ps( - slc_vrt_file=vrt_stack.outfile, + reader=vrt_stack, + like_filename=vrt_stack.outfile, 
output_amp_dispersion_file=amp_dispersion_file, output_amp_mean_file=amp_mean_file, output_file=ps_mask_file, @@ -92,12 +93,12 @@ def test_multilook_ps_file(tmp_path, vrt_stack): amp_dispersion_file = tmp_path / "amp_disp.tif" amp_mean_file = tmp_path / "amp_mean.tif" dolphin.ps.create_ps( - slc_vrt_file=vrt_stack.outfile, + reader=vrt_stack, + like_filename=vrt_stack.outfile, output_amp_dispersion_file=amp_dispersion_file, output_amp_mean_file=amp_mean_file, output_file=ps_mask_file, ) - output_file = dolphin.ps.multilook_ps_mask( strides={"x": 5, "y": 3}, ps_mask_file=ps_mask_file ) From a896abd27621b35d4e623d391a49177f3a7a6e21 Mon Sep 17 00:00:00 2001 From: scott Date: Wed, 10 Jan 2024 15:47:56 -0500 Subject: [PATCH 19/21] fix tests for EagerLoader refactor --- src/dolphin/_readers.py | 89 ++++++++++++++------------ tests/conftest.py | 12 ++-- tests/test_io.py | 123 +---------------------------------- tests/test_readers.py | 138 +++++++++++++++++++++++++++++++++++++--- 4 files changed, 183 insertions(+), 179 deletions(-) diff --git a/src/dolphin/_readers.py b/src/dolphin/_readers.py index d82dc958..1baf7ae9 100644 --- a/src/dolphin/_readers.py +++ b/src/dolphin/_readers.py @@ -87,13 +87,12 @@ class StackReader(DatasetReader, Protocol): shape: tuple[int, int, int] """tuple of int : Tuple of array dimensions.""" - @property def __len__(self) -> int: """int : Number of images in the stack.""" return self.shape[0] -def _mask_array(arr: np.ndarray, nodata_value: float | None) -> np.ndarray: +def _mask_array(arr: np.ndarray, nodata_value: float | None) -> np.ma.MaskedArray: """Mask an array based on a nodata value.""" if np.isnan(nodata_value): return np.ma.masked_invalid(arr) @@ -126,7 +125,7 @@ class BinaryReader(DatasetReader): dtype: np.dtype """numpy.dtype : Data-type of the array's elements.""" # noqa: D403 - nodata_value: Optional[float] = None + nodata: Optional[float] = None """Optional[float] : Value to use for nodata pixels.""" def __post_init__(self): @@ -150,14 +149,14 @@ def __getitem__(self, key: tuple[Index, ...], /) -> np.ndarray: arr = np.frombuffer(mm, dtype=self.dtype).reshape(self.shape) data = arr[key].copy() del arr - return _mask_array(data, self.nodata_value) if self.masked else data + return _mask_array(data, self.nodata) if self.masked else data def __array__(self) -> np.ndarray: return self[:,] @classmethod def from_gdal( - cls, filename: Filename, band: int = 1, nodata_value: Optional[float] = None + cls, filename: Filename, band: int = 1, nodata: Optional[float] = None ) -> BinaryReader: """Create a BinaryReader from a GDAL-readable file. @@ -167,7 +166,7 @@ def from_gdal( Path to the file to read. band : int, optional Band to read from the file, by default 1 - nodata_value : float, optional + nodata : float, optional Value to use for nodata pixels, by default None If None passed, will search for a nodata value in the file. @@ -184,7 +183,7 @@ def from_gdal( Path(filename), shape=shape, dtype=dtype, - nodata_value=nodata_value or nodata, + nodata=nodata or nodata, ) @@ -224,7 +223,7 @@ class HDF5Reader(DatasetReader): dset_name: str """str : The path to the dataset within the file.""" - nodata_value: Optional[float] = None + nodata: Optional[float] = None """Optional[float] : Value to use for nodata pixels. If None, looks for `_FillValue` or `missing_value` attributes on the dataset. 
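# Illustrative sketch (not from the patch itself): readers now expose a `nodata`
# attribute, auto-detected from `_FillValue`/`missing_value` when not supplied, and
# the protocol-level `masked` flag (False by default) switches reads to masked arrays.
# The HDF5 filename, dataset name, and nodata value are assumptions for the example.
from dolphin._readers import HDF5Reader

r = HDF5Reader("slc_20220101.h5", dset_name="data", nodata=0)
block = r[0:128, 0:128]   # plain ndarray by default
r.masked = True
masked = r[0:128, 0:128]  # numpy.ma.MaskedArray with nodata values masked out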
@@ -241,10 +240,10 @@ def __post_init__(self): self.shape = dset.shape self.dtype = dset.dtype self.chunks = dset.chunks - if self.nodata_value is None: - self.nodata_value = dset.attrs.get("_FillValue", None) - if self.nodata_value is None: - self.nodata_value = dset.attrs.get("missing_value", None) + if self.nodata is None: + self.nodata = dset.attrs.get("_FillValue", None) + if self.nodata is None: + self.nodata = dset.attrs.get("missing_value", None) if self.keep_open: self._hf = hf self._dset = dset @@ -265,7 +264,7 @@ def __getitem__(self, key: tuple[Index, ...], /) -> np.ndarray: else: with h5py.File(self.filepath, "r") as f: data = f[self.dset_name][key] - return _mask_array(data, self.nodata_value) if self.masked else data + return _mask_array(data, self.nodata) if self.masked else data def _ensure_slices(rows: Index, cols: Index) -> tuple[slice, slice]: @@ -321,18 +320,21 @@ class RasterReader(DatasetReader): shape: tuple[int, int] dtype: np.dtype - nodata_value: Optional[float] = None + nodata: Optional[float] = None """Optional[float] : Value to use for nodata pixels.""" keep_open: bool = False """bool : If True, keep the rasterio file handle open for faster reading.""" + chunks: Optional[tuple[int, int]] = None + """Optional[tuple[int, int]] : Chunk shape of the dataset, or None if file is unchunked.""" + @classmethod def from_file( cls, filepath: Filename, band: int = 1, - nodata_value: Optional[float] = None, + nodata: Optional[float] = None, keep_open: bool = False, **options, ) -> RasterReader: @@ -341,8 +343,9 @@ def from_file( dtype = np.dtype(src.dtypes[band - 1]) driver = src.driver crs = src.crs - nodata_value = nodata_value or src.nodatavals[band - 1] + nodata = nodata or src.nodatavals[band - 1] transform = src.transform + chunks = src.block_shapes[band - 1] return cls( filepath=filepath, @@ -352,8 +355,9 @@ def from_file( transform=transform, shape=shape, dtype=dtype, - nodata_value=nodata_value, + nodata=nodata, keep_open=keep_open, + chunks=chunks, ) def __post_init__(self): @@ -371,6 +375,9 @@ def __array__(self) -> np.ndarray: def __getitem__(self, key: tuple[Index, ...], /) -> np.ndarray: import rasterio.windows + if key is ... or key == (): + key = (slice(None), slice(None)) + if not isinstance(key, tuple): raise ValueError("Index must be a tuple of slices or integers.") @@ -386,7 +393,7 @@ def __getitem__(self, key: tuple[Index, ...], /) -> np.ndarray: with rio.open(self.filepath) as src: out = src.read(self.band, window=window) - out_masked = _mask_array(out, self.nodata_value) if self.masked else out + out_masked = _mask_array(out, self.nodata) if self.masked else out # Note that Rasterio doesn't use the `step` of a slice, so we need to # manually slice the output array. r_step, c_step = r_slice.step or 1, c_slice.step or 1 @@ -425,6 +432,8 @@ def _read_3d( with ThreadPoolExecutor(max_workers=num_threads) as executor: results = executor.map(lambda i: readers[i][r_slice, c_slice], band_idxs) out = np.stack(list(results), axis=0) + + # TODO: Do i want a "keep_dims" option to not collapse singleton dimensions? 
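    # np.squeeze drops any length-1 axes, so selecting a single band (or a single
    # row/column) returns an array without that singleton dimension.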
return np.squeeze(out) @@ -435,7 +444,7 @@ class BaseStackReader(StackReader): file_list: Sequence[Filename] readers: Sequence[DatasetReader] num_threads: int = 1 - nodata_value: Optional[float] = None + nodata: Optional[float] = None def __getitem__(self, key: tuple[Index, ...], /) -> np.ndarray: return _read_3d(key, self.readers, num_threads=self.num_threads) @@ -486,7 +495,7 @@ def from_gdal( file_list: Sequence[Filename], band: int = 1, num_threads: int = 1, - nodata_value: Optional[float] = None, + nodata: Optional[float] = None, ) -> BinaryStackReader: """Create a BinaryStackReader from a list of GDAL-readable files. @@ -498,7 +507,7 @@ def from_gdal( Band to read from the file, by default 1 num_threads : int, optional (default 1) Number of threads to use for reading. - nodata_value : float, optional + nodata : float, optional Manually set value to use for nodata pixels, by default None If None passed, will search for a nodata value in the file. @@ -523,7 +532,7 @@ def from_gdal( file_list=file_list, readers=readers, num_threads=num_threads, - nodata_value=nodata_value, + nodata=nodata, ) @@ -552,7 +561,7 @@ def from_file_list( dset_names: str | Sequence[str], keep_open: bool = False, num_threads: int = 1, - nodata_value: Optional[float] = None, + nodata: Optional[float] = None, ) -> HDF5StackReader: """Create a HDF5StackReader from a list of files. @@ -567,7 +576,7 @@ def from_file_list( If True, keep the HDF5 file handles open for faster reading. num_threads : int, optional (default 1) Number of threads to use for reading. - nodata_value : float, optional + nodata : float, optional Manually set value to use for nodata pixels, by default None If None passed, will search for a nodata value in the file. @@ -580,19 +589,15 @@ def from_file_list( dset_names = [dset_names] * len(file_list) readers = [ - HDF5Reader( - Path(f), dset_name=dn, keep_open=keep_open, nodata_value=nodata_value - ) + HDF5Reader(Path(f), dset_name=dn, keep_open=keep_open, nodata=nodata) for (f, dn) in zip(file_list, dset_names) ] # Check if nodata values were found in the files - nds = set([r.nodata_value for r in readers]) + nds = set([r.nodata for r in readers]) if len(nds) == 1: - nodata_value = nds.pop() + nodata = nds.pop() - return cls( - file_list, readers, num_threads=num_threads, nodata_value=nodata_value - ) + return cls(file_list, readers, num_threads=num_threads, nodata=nodata) @dataclass @@ -618,7 +623,7 @@ def from_file_list( bands: int | Sequence[int] = 1, keep_open: bool = False, num_threads: int = 1, - nodata_value: Optional[float] = None, + nodata: Optional[float] = None, ) -> RasterStackReader: """Create a RasterStackReader from a list of files. @@ -634,7 +639,7 @@ def from_file_list( If True, keep the rasterio file handles open for faster reading. num_threads : int, optional (default 1) Number of threads to use for reading. 
- nodata_value : float, optional + nodata : float, optional Manually set value to use for nodata pixels, by default None Returns @@ -650,12 +655,10 @@ def from_file_list( for (f, b) in zip(file_list, bands) ] # Check if nodata values were found in the files - nds = set([r.nodata_value for r in readers]) + nds = set([r.nodata for r in readers]) if len(nds) == 1: - nodata_value = nds.pop() - return cls( - file_list, readers, num_threads=num_threads, nodata_value=nodata_value - ) + nodata = nds.pop() + return cls(file_list, readers, num_threads=num_threads, nodata=nodata) class VRTStack(StackReader): @@ -985,6 +988,8 @@ def __init__( overlaps=overlaps, ) ) + if nodata_value is None: + nodata_value = getattr(reader, "nodata", None) self._queue_size = queue_size self._skip_empty = skip_empty self._nodata_mask = nodata_mask @@ -1023,10 +1028,10 @@ def iter_blocks( # Otherwise look at the actual block we loaded if np.isnan(self._nodata): - block_nodata = np.isnan(cur_block) + block_is_nodata = np.isnan(cur_block) else: - block_nodata = cur_block == self._nodata - if np.all(block_nodata): + block_is_nodata = cur_block == self._nodata + if np.all(block_is_nodata): logger.debug("Skipping block since it was all nodata") continue yield cur_block, (rows, cols) diff --git a/tests/conftest.py b/tests/conftest.py index e5044201..64b25c7f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -223,12 +223,12 @@ def raster_10_by_20(tmp_path, tiled_raster_100_by_200): @pytest.fixture -def raster_with_nan(tmpdir, tiled_raster_100_by_200): +def raster_with_nan(tmp_path, tiled_raster_100_by_200): # Raster with one nan pixel start_arr = load_gdal(tiled_raster_100_by_200) nan_arr = start_arr.copy() nan_arr[0, 0] = np.nan - output_name = tmpdir / "with_one_nan.tif" + output_name = tmp_path / "with_one_nan.tif" write_arr( arr=nan_arr, like_filename=tiled_raster_100_by_200, @@ -239,9 +239,9 @@ def raster_with_nan(tmpdir, tiled_raster_100_by_200): @pytest.fixture -def raster_with_nan_block(tmpdir, tiled_raster_100_by_200): +def raster_with_nan_block(tmp_path, tiled_raster_100_by_200): # One full block of 32x32 is nan - output_name = tmpdir / "with_nans.tif" + output_name = tmp_path / "with_nans.tif" nan_arr = load_gdal(tiled_raster_100_by_200) nan_arr[:32, :32] = np.nan write_arr( @@ -254,9 +254,9 @@ def raster_with_nan_block(tmpdir, tiled_raster_100_by_200): @pytest.fixture -def raster_with_zero_block(tmpdir, tiled_raster_100_by_200): +def raster_with_zero_block(tmp_path, tiled_raster_100_by_200): # One full block of 32x32 is nan - output_name = tmpdir / "with_zeros.tif" + output_name = tmp_path / "with_zeros.tif" out_arr = load_gdal(tiled_raster_100_by_200) out_arr[:] = 1.0 diff --git a/tests/test_io.py b/tests/test_io.py index d3951060..f36b2906 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -6,7 +6,7 @@ import dolphin._blocks from dolphin import io -from dolphin._readers import RasterReader, VRTStack +from dolphin._readers import VRTStack def test_load(raster_100_by_200): @@ -262,127 +262,6 @@ def test_get_raster_block_sizes(raster_100_by_200, tiled_raster_100_by_200): assert io.get_raster_chunk_size(raster_100_by_200) == [200, 1] -def test_iter_blocks(tiled_raster_100_by_200): - # Try the whole raster - bs = (100, 200) - loader = io.EagerLoader( - reader=RasterReader.from_file(tiled_raster_100_by_200), block_shape=bs - ) - # `list` should try to load all at once` - block_slice_tuples = list(loader.iter_blocks()) - assert not loader._thread.is_alive() - assert len(block_slice_tuples) == 1 - blocks, 
slices = zip(*list(block_slice_tuples)) - assert blocks[0].shape == (100, 200) - rows, cols = slices[0] - assert rows == slice(0, 100) - assert cols == slice(0, 200) - - # now one block at a time - max_bytes = 8 * 32 * 32 - bs = io.get_max_block_shape(tiled_raster_100_by_200, 1, max_bytes=max_bytes) - loader = io.EagerLoader(filename=tiled_raster_100_by_200, block_shape=bs) - blocks, slices = zip(*list(loader.iter_blocks())) - - row_blocks = 100 // 32 + 1 - col_blocks = 200 // 32 + 1 - expected_num_blocks = row_blocks * col_blocks - assert len(blocks) == expected_num_blocks - assert blocks[0].shape == (32, 32) - # at the ends, the block_slice_tuples are smaller - assert blocks[6].shape == (32, 8) - assert blocks[-1].shape == (4, 8) - - -def test_iter_blocks_rowcols(tiled_raster_100_by_200): - # Block size that is a multiple of the raster size - loader = io.EagerLoader(filename=tiled_raster_100_by_200, block_shape=(10, 20)) - blocks, slices = zip(*list(loader.iter_blocks())) - - assert blocks[0].shape == (10, 20) - for rs, cs in slices: - assert rs.stop - rs.start == 10 - assert cs.stop - cs.start == 20 - loader.notify_finished() - - # Non-multiple block size - loader = io.EagerLoader(filename=tiled_raster_100_by_200, block_shape=(32, 32)) - blocks, slices = zip(*list(loader.iter_blocks())) - assert blocks[0].shape == (32, 32) - for b, (rs, cs) in zip(blocks, slices): - assert b.shape == (rs.stop - rs.start, cs.stop - cs.start) - loader.notify_finished() - - -def test_iter_nodata( - raster_with_nan, - raster_with_nan_block, - raster_with_zero_block, - tiled_raster_100_by_200, -): - # load one block at a time - max_bytes = 8 * 32 * 32 - bs = io.get_max_block_shape(tiled_raster_100_by_200, 1, max_bytes=max_bytes) - loader = io.EagerLoader(filename=tiled_raster_100_by_200, block_shape=bs) - blocks, slices = zip(*list(loader.iter_blocks())) - loader.notify_finished() - - row_blocks = 100 // 32 + 1 - col_blocks = 200 // 32 + 1 - expected_num_blocks = row_blocks * col_blocks - assert len(blocks) == expected_num_blocks - assert blocks[0].shape == (32, 32) - - # One nan should be fine, will get loaded - loader = io.EagerLoader(filename=raster_with_nan, block_shape=bs) - blocks, slices = zip(*list(loader.iter_blocks())) - loader.notify_finished() - assert len(blocks) == expected_num_blocks - - # Now check entire block for a skipped block - loader = io.EagerLoader(filename=raster_with_nan_block, block_shape=bs) - blocks, slices = zip(*list(loader.iter_blocks())) - loader.notify_finished() - assert len(blocks) == expected_num_blocks - 1 - - # Now check entire block for a skipped block - loader = io.EagerLoader(filename=raster_with_zero_block, block_shape=bs) - blocks, slices = zip(*list(loader.iter_blocks())) - loader.notify_finished() - assert len(blocks) == expected_num_blocks - 1 - - -@pytest.mark.skip -def test_iter_blocks_nodata_mask(tiled_raster_100_by_200): - # load one block at a time - max_bytes = 8 * 32 * 32 - bs = io.get_max_block_shape(tiled_raster_100_by_200, 1, max_bytes=max_bytes) - blocks = list(io.iter_blocks(tiled_raster_100_by_200, bs, band=1)) - row_blocks = 100 // 32 + 1 - col_blocks = 200 // 32 + 1 - expected_num_blocks = row_blocks * col_blocks - assert len(blocks) == expected_num_blocks - - nodata_mask = np.zeros((100, 200), dtype=bool) - nodata_mask[:5, :5] = True - # non-full-block should still all be loaded nan should be fine, will get loaded - blocks = list( - io.iter_blocks( - tiled_raster_100_by_200, bs, skip_empty=True, nodata_mask=nodata_mask - ) - ) - assert 
len(blocks) == expected_num_blocks - - nodata_mask[:32, :32] = True - # non-full-block should still all be loaded nan should be fine, will get loaded - blocks = list( - io.iter_blocks( - tiled_raster_100_by_200, bs, skip_empty=True, nodata_mask=nodata_mask - ) - ) - assert len(blocks) == expected_num_blocks - 1 - - def test_format_nc_filename(): expected = 'NETCDF:"/usr/19990101/20200303_20210101.nc":"//variable"' assert ( diff --git a/tests/test_readers.py b/tests/test_readers.py index 9d384d59..5ae228d0 100644 --- a/tests/test_readers.py +++ b/tests/test_readers.py @@ -1,3 +1,4 @@ +import warnings from pathlib import Path import numpy as np @@ -5,10 +6,12 @@ import pytest import rasterio as rio from osgeo import gdal +from rasterio.errors import NotGeoreferencedWarning from dolphin._readers import ( BinaryReader, BinaryStackReader, + EagerLoader, HDF5Reader, HDF5StackReader, RasterReader, @@ -24,6 +27,20 @@ slices_to_test = [slice(None), 1, slice(0, 10, 2)] +# Filter rasterio georeferencing warnings +@pytest.fixture(autouse=True) +def suppress_not_georeferenced_warning(): + """ + Pytest fixture to suppress NotGeoreferencedWarning in tests. + + This fixture automatically applies to all test functions in the module + where it's defined, suppressing the specified warning. + """ + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=NotGeoreferencedWarning) + yield + + @pytest.fixture(scope="module") def binary_file_list(tmp_path_factory, slc_stack): """Flat binary files in the ENVI format.""" @@ -168,6 +185,7 @@ def raster_reader(slc_file_list, slc_stack): assert r.dtype == slc_stack[0].dtype assert r.ndim == 2 assert r.dtype == np.complex64 + return r class TestRaster: @@ -208,9 +226,9 @@ def test_raster_stack_read_slices( @pytest.mark.parametrize("cols", slices_to_test) def test_ellipsis_reads(binary_reader, hdf5_reader, raster_reader, rows, cols): # Test that the ellipsis works - assert binary_reader[...] is binary_reader[()] - assert hdf5_reader[...] is hdf5_reader[()] - assert raster_reader[...] 
is raster_reader[()] + npt.assert_array_equal(binary_reader[...], binary_reader[()]) + npt.assert_array_equal(hdf5_reader[...], hdf5_reader[()]) + npt.assert_array_equal(raster_reader[...], raster_reader[()]) # Test we can still do rows/cols with a leading ellipsis npt.assert_array_equal(binary_reader[..., rows, cols], binary_reader[rows, cols]) npt.assert_array_equal(hdf5_reader[..., rows, cols], hdf5_reader[rows, cols]) @@ -337,22 +355,25 @@ def test_bad_sizes(slc_file_list, raster_10_by_20): def test_iter_blocks(vrt_stack): - blocks, slices = zip(*list(vrt_stack.iter_blocks(block_shape=(5, 5)))) + loader = EagerLoader(reader=vrt_stack, block_shape=(5, 5)) + blocks, slices = zip(*list(loader.iter_blocks())) # (5, 10) total shape, breaks into 5x5 blocks assert len(blocks) == 2 for b in blocks: assert b.shape == (len(vrt_stack), 5, 5) - blocks, slices = zip(*list(vrt_stack.iter_blocks(block_shape=(1, 2)))) - assert len(blocks) == 25 + loader = EagerLoader(reader=vrt_stack, block_shape=(3, 2)) + blocks, slices = zip(*list(loader.iter_blocks())) + assert len(blocks) == 10 for b in blocks: - assert b.shape == (len(vrt_stack), 1, 2) + assert b.shape == (len(vrt_stack), 3, 2) def test_tiled_iter_blocks(tmp_path, tiled_file_list): outfile = tmp_path / "stack.vrt" vrt_stack = VRTStack(tiled_file_list, outfile=outfile) - blocks, slices = zip(*list(vrt_stack.iter_blocks(block_shape=(32, 32)))) + loader = EagerLoader(reader=vrt_stack, block_shape=(32, 32)) + blocks, slices = zip(*list(loader.iter_blocks())) # (100, 200) total shape, breaks into 32x32 blocks assert len(blocks) == len(slices) == 28 for i, b in enumerate(blocks, start=1): @@ -364,7 +385,8 @@ def test_tiled_iter_blocks(tmp_path, tiled_file_list): else: assert b.shape == (len(vrt_stack), 32, 8) - blocks, slices = zip(*list(vrt_stack.iter_blocks(block_shape=(50, 100)))) + loader = EagerLoader(reader=vrt_stack, block_shape=(50, 100)) + blocks, slices = zip(*list(loader.iter_blocks())) assert len(blocks) == len(slices) == 4 @@ -421,3 +443,101 @@ def test_parse_vrt(tmp_path, test_vrt): "t087_185684_iw2_20220104.h5", ] assert sds == "data/VV" + + +class TestEagerLoader: + def test_iter_blocks(self, tiled_raster_100_by_200): + # Try the whole raster + bs = (100, 200) + loader = EagerLoader( + reader=RasterReader.from_file(tiled_raster_100_by_200), block_shape=bs + ) + # `list` should try to load all at once` + block_slice_tuples = list(loader.iter_blocks()) + assert not loader._thread.is_alive() + assert len(block_slice_tuples) == 1 + blocks, slices = zip(*list(block_slice_tuples)) + assert blocks[0].shape == (100, 200) + rows, cols = slices[0] + assert rows == slice(0, 100) + assert cols == slice(0, 200) + + # now one block at a time + bs = (32, 32) + reader = RasterReader.from_file(tiled_raster_100_by_200) + loader = EagerLoader(reader=reader, block_shape=bs) + blocks, slices = zip(*list(loader.iter_blocks())) + + row_blocks = 100 // 32 + 1 + col_blocks = 200 // 32 + 1 + expected_num_blocks = row_blocks * col_blocks + assert len(blocks) == expected_num_blocks + assert blocks[0].shape == (32, 32) + # at the ends, the block_slice_tuples are smaller + assert blocks[6].shape == (32, 8) + assert blocks[-1].shape == (4, 8) + + def test_iter_blocks_rowcols(self, tiled_raster_100_by_200): + # Block size that is a multiple of the raster size + reader = RasterReader.from_file(tiled_raster_100_by_200) + loader = EagerLoader(reader=reader, block_shape=(10, 20)) + blocks, slices = zip(*list(loader.iter_blocks())) + + assert blocks[0].shape == (10, 20) + 
for rs, cs in slices: + assert rs.stop - rs.start == 10 + assert cs.stop - cs.start == 20 + loader.notify_finished() + + # Non-multiple block size + reader = RasterReader.from_file(tiled_raster_100_by_200) + loader = EagerLoader(reader=reader, block_shape=(32, 32)) + blocks, slices = zip(*list(loader.iter_blocks())) + assert blocks[0].shape == (32, 32) + for b, (rs, cs) in zip(blocks, slices): + assert b.shape == (rs.stop - rs.start, cs.stop - cs.start) + loader.notify_finished() + + def test_iter_nodata( + self, + raster_with_nan, + raster_with_nan_block, + raster_with_zero_block, + tiled_raster_100_by_200, + ): + # load one block at a time + bs = (100, 200) + reader = RasterReader.from_file(tiled_raster_100_by_200) + loader = EagerLoader(reader=reader, block_shape=bs) + blocks, slices = zip(*list(loader.iter_blocks())) + loader.notify_finished() + + bs = (32, 32) + row_blocks = 100 // 32 + 1 + col_blocks = 200 // 32 + 1 + expected_num_blocks = row_blocks * col_blocks + loader = EagerLoader(reader=reader, block_shape=bs) + blocks, slices = zip(*list(loader.iter_blocks())) + assert len(blocks) == expected_num_blocks + assert blocks[0].shape == bs + + # One nan should be fine, will get loaded + reader = RasterReader.from_file(raster_with_nan) + loader = EagerLoader(reader=reader, block_shape=bs) + blocks, slices = zip(*list(loader.iter_blocks())) + loader.notify_finished() + assert len(blocks) == expected_num_blocks + + # Now check entire block for a skipped block + reader = RasterReader.from_file(raster_with_nan_block) + loader = EagerLoader(reader=reader, block_shape=bs) + blocks, slices = zip(*list(loader.iter_blocks())) + loader.notify_finished() + assert len(blocks) == expected_num_blocks - 1 + + # Now check entire block for a skipped block + reader = RasterReader.from_file(raster_with_zero_block) + loader = EagerLoader(reader=reader, block_shape=bs) + blocks, slices = zip(*list(loader.iter_blocks())) + loader.notify_finished() + assert len(blocks) == expected_num_blocks - 1 From 6abbae90ef7583e11ad83a3276d74f2a7b2bd7b9 Mon Sep 17 00:00:00 2001 From: scott Date: Wed, 10 Jan 2024 16:11:02 -0500 Subject: [PATCH 20/21] bad math --- tests/test_readers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_readers.py b/tests/test_readers.py index 5ae228d0..1e192d0b 100644 --- a/tests/test_readers.py +++ b/tests/test_readers.py @@ -362,11 +362,11 @@ def test_iter_blocks(vrt_stack): for b in blocks: assert b.shape == (len(vrt_stack), 5, 5) - loader = EagerLoader(reader=vrt_stack, block_shape=(3, 2)) + loader = EagerLoader(reader=vrt_stack, block_shape=(5, 2)) blocks, slices = zip(*list(loader.iter_blocks())) - assert len(blocks) == 10 + assert len(blocks) == 5 for b in blocks: - assert b.shape == (len(vrt_stack), 3, 2) + assert b.shape == (len(vrt_stack), 5, 2) def test_tiled_iter_blocks(tmp_path, tiled_file_list): From 4e3bbcb4ac768652deece78a913b2c4dc181b666 Mon Sep 17 00:00:00 2001 From: scott Date: Wed, 10 Jan 2024 16:24:17 -0500 Subject: [PATCH 21/21] update changelog for new version. 
Close #174 [skip ci]
---
 CHANGELOG.md | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 42591408..2855ad95 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,16 @@
-# [Unreleased](https://github.com/isce-framework/dolphin/compare/v0.8.0...main)
+# [Unreleased](https://github.com/isce-framework/dolphin/compare/v0.9.0...main)
+
+# [v0.9.0](https://github.com/isce-framework/dolphin/compare/v0.8.0...v0.9.0)
+
+**Added**
+- `DatasetReader` and `StackReader` protocols for reading in data from different sources
+  - `DatasetReader` is for reading in a single dataset, like one raster image.
+  - `StackReader` is for reading in a stack of datasets, like a stack of SLCs.
+  - Implementations are provided for flat binary files (`BinaryReader`), HDF5 files (`HDF5Reader`), and GDAL rasters (`RasterReader`).
+
+**Changed**
+- The `VRTStack` no longer has an `.iter_blocks` method
+  - This has been replaced by creating an `EagerLoader` directly and passing the stack reader (e.g. a `VRTStack`) as its `reader` argument
+
 # [v0.8.0](https://github.com/isce-framework/dolphin/compare/v0.7.0...v0.8.0)
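
A rough usage sketch of the reader/loader API recorded in the changelog entry above, pieced together from the diffs and tests in this series. The names `RasterStackReader.from_file_list`, `EagerLoader`, `iter_blocks`, and `notify_finished` all appear in the patches; the file names, block shape, `num_threads` value, and the per-block computation are made-up placeholders, and the `dolphin._readers` import path simply mirrors the test imports rather than any promised public location:

    import numpy as np

    from dolphin._readers import EagerLoader, RasterStackReader

    # Hypothetical co-registered SLC rasters -- placeholders, not files from this repo.
    slc_files = ["slc_20220101.tif", "slc_20220113.tif", "slc_20220125.tif"]

    # One numpy-like, 3-D view over the whole stack (band 1 of each file).
    reader = RasterStackReader.from_file_list(slc_files, num_threads=2)

    # Stream (block, (row_slice, col_slice)) tuples; blocks are prefetched in a
    # background thread so computation can overlap with file reads.
    loader = EagerLoader(reader=reader, block_shape=(256, 256))
    for block, (rows, cols) in loader.iter_blocks():
        # `block` is 3-D: (num_files, rows.stop - rows.start, cols.stop - cols.start)
        mean_amplitude = np.abs(block).mean(axis=0)
        # ... write `mean_amplitude` to the matching window of an output raster
    loader.notify_finished()

Because `EagerLoader` only depends on the `StackReader` protocol, the same loop works unchanged with a `VRTStack`, `HDF5StackReader`, or `BinaryStackReader` in place of the `RasterStackReader`.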