Switch to custom netcdf4/hdf5 backend #395

Merged (12 commits) on Jan 30, 2025
6 changes: 3 additions & 3 deletions virtualizarr/backend.py

@@ -13,7 +13,7 @@
 from virtualizarr.readers import (
     DMRPPVirtualBackend,
     FITSVirtualBackend,
-    HDF5VirtualBackend,
+    HDFVirtualBackend,
     KerchunkVirtualBackend,
     NetCDF3VirtualBackend,
     TIFFVirtualBackend,
@@ -27,9 +27,9 @@
     "kerchunk": KerchunkVirtualBackend,
     "zarr_v3": ZarrV3VirtualBackend,
     "dmrpp": DMRPPVirtualBackend,
+    "hdf5": HDFVirtualBackend,
+    "netcdf4": HDFVirtualBackend,  # note this is the same as for hdf5
     # all the below call one of the kerchunk backends internally (https://fsspec.github.io/kerchunk/reference.html#file-format-backends)
-    "hdf5": HDF5VirtualBackend,
-    "netcdf4": HDF5VirtualBackend,  # note this is the same as for hdf5
     "netcdf3": NetCDF3VirtualBackend,
     "tiff": TIFFVirtualBackend,
     "fits": FITSVirtualBackend,
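To make the registry change concrete, here is a hedged sketch of what it means at the call site; the filename is hypothetical, and the keyword arguments mirror the tests further down in this diff.

```python
from virtualizarr import open_virtual_dataset
from virtualizarr.readers import HDF5VirtualBackend  # kerchunk-based reader
from virtualizarr.readers.hdf import HDFVirtualBackend  # new custom reader

# After this change, the "netcdf4" and "hdf5" filetypes resolve to the
# custom HDFVirtualBackend by default:
vds = open_virtual_dataset("data.nc", filetype="netcdf4", indexes={})

# The kerchunk-based reader is still importable and can be selected
# explicitly via the backend keyword:
vds_kerchunk = open_virtual_dataset("data.nc", indexes={}, backend=HDF5VirtualBackend)
```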
7 changes: 7 additions & 0 deletions virtualizarr/tests/__init__.py

@@ -7,6 +7,8 @@

 from virtualizarr.manifests import ChunkManifest, ManifestArray
 from virtualizarr.manifests.manifest import join
+from virtualizarr.readers import HDF5VirtualBackend
+from virtualizarr.readers.hdf import HDFVirtualBackend
 from virtualizarr.zarr import ZArray, ceildiv

 requires_network = pytest.mark.network
@@ -42,6 +44,11 @@ def _importorskip(
 has_zarr_python, requires_zarr_python = _importorskip("zarr")
 has_zarr_python_v3, requires_zarr_python_v3 = _importorskip("zarr", "3.0.0b")

+parametrize_over_hdf_backends = pytest.mark.parametrize(
+    "hdf_backend",
+    [HDF5VirtualBackend, HDFVirtualBackend] if has_kerchunk else [HDFVirtualBackend],
+)
+

 def create_manifestarray(
     shape: tuple[int, ...], chunks: tuple[int, ...]
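For illustration, a test module would consume the new helper like this (a hedged sketch: the test body is invented, and `netcdf4_file` is assumed to be one of the suite's existing fixtures). The mark injects each available backend class as the test's `hdf_backend` argument, and the list collapses to just `HDFVirtualBackend` when kerchunk is not installed.

```python
from virtualizarr import open_virtual_dataset
from virtualizarr.tests import parametrize_over_hdf_backends


@parametrize_over_hdf_backends
def test_opens_with_any_hdf_backend(netcdf4_file, hdf_backend):
    # pytest runs this once per backend class in the parametrized list;
    # without kerchunk installed, only HDFVirtualBackend is exercised.
    vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
    assert len(vds.dims) > 0
```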
80 changes: 45 additions & 35 deletions virtualizarr/tests/test_backend.py

@@ -15,7 +15,7 @@
 from virtualizarr.readers.hdf import HDFVirtualBackend
 from virtualizarr.tests import (
     has_astropy,
-    requires_kerchunk,
+    parametrize_over_hdf_backends,
     requires_network,
     requires_s3fs,
     requires_scipy,
@@ -82,8 +82,7 @@ def test_FileType():
     FileType(None)


-@requires_kerchunk
-@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+@parametrize_over_hdf_backends
 class TestOpenVirtualDatasetIndexes:
     def test_no_indexes(self, netcdf4_file, hdf_backend):
         vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
@@ -122,8 +121,7 @@ def index_mappings_equal(indexes1: Mapping[str, Index], indexes2: Mapping[str, I
     return True


-@requires_kerchunk
-@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+@parametrize_over_hdf_backends
 def test_cftime_index(tmpdir, hdf_backend):
     """Ensure a virtual dataset contains the same indexes as an Xarray dataset"""
     # Note: Test was created to debug: https://github.com/zarr-developers/VirtualiZarr/issues/168
@@ -152,8 +150,7 @@ def test_cftime_index(tmpdir, hdf_backend):
     assert vds.attrs == ds.attrs


-@requires_kerchunk
-@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+@parametrize_over_hdf_backends
 class TestOpenVirtualDatasetAttrs:
     def test_drop_array_dimensions(self, netcdf4_file, hdf_backend):
         # regression test for GH issue #150
@@ -171,14 +168,16 @@ def test_coordinate_variable_attrs_preserved(self, netcdf4_file, hdf_backend):
     }


-@requires_kerchunk
+@parametrize_over_hdf_backends
 class TestDetermineCoords:
-    def test_infer_one_dimensional_coords(self, netcdf4_file):
-        vds = open_virtual_dataset(netcdf4_file, indexes={})
+    def test_infer_one_dimensional_coords(self, netcdf4_file, hdf_backend):
+        vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
         assert set(vds.coords) == {"time", "lat", "lon"}

-    def test_var_attr_coords(self, netcdf4_file_with_2d_coords):
-        vds = open_virtual_dataset(netcdf4_file_with_2d_coords, indexes={})
+    def test_var_attr_coords(self, netcdf4_file_with_2d_coords, hdf_backend):
+        vds = open_virtual_dataset(
+            netcdf4_file_with_2d_coords, indexes={}, backend=hdf_backend
+        )

         expected_dimension_coords = ["ocean_time", "s_rho"]
         expected_2d_coords = ["lon_rho", "lat_rho", "h"]
@@ -199,7 +198,7 @@ class TestReadFromS3:
     @pytest.mark.parametrize(
         "indexes", [None, {}], ids=["None index", "empty dict index"]
     )
-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+    @parametrize_over_hdf_backends
     def test_anon_read_s3(self, indexes, hdf_backend):
        """Parameterized tests for empty vs supplied indexes and filetypes."""
        # TODO: Switch away from this s3 url after minIO is implemented.
@@ -217,7 +216,7 @@ def test_anon_read_s3(self, indexes, hdf_backend):


 @requires_network
-@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+@parametrize_over_hdf_backends
 class TestReadFromURL:
     @pytest.mark.parametrize(
         "filetype, url",
@@ -318,46 +317,55 @@ def test_virtualizarr_vs_local_nisar(self, hdf_backend):
         xrt.assert_equal(dsXR, dsV)


-@requires_kerchunk
-def test_open_empty_group(empty_netcdf4_file):
-    vds = open_virtual_dataset(empty_netcdf4_file, indexes={})
-    assert isinstance(vds, xr.Dataset)
-    expected = Dataset()
-    xrt.assert_identical(vds, expected)
-
-
-@requires_kerchunk
+@parametrize_over_hdf_backends
 class TestOpenVirtualDatasetHDFGroup:
-    def test_open_subgroup(self, netcdf4_file_with_data_in_multiple_groups):
+    def test_open_empty_group(self, empty_netcdf4_file, hdf_backend):
+        vds = open_virtual_dataset(empty_netcdf4_file, indexes={}, backend=hdf_backend)
+        assert isinstance(vds, xr.Dataset)
+        expected = Dataset()
+        xrt.assert_identical(vds, expected)
+
+    def test_open_subgroup(
+        self, netcdf4_file_with_data_in_multiple_groups, hdf_backend
+    ):
         vds = open_virtual_dataset(
-            netcdf4_file_with_data_in_multiple_groups, group="subgroup", indexes={}
+            netcdf4_file_with_data_in_multiple_groups,
+            group="subgroup",
+            indexes={},
+            backend=hdf_backend,
         )
[Review comment, Contributor Author]
This is what the output looks like for the kerchunk-based reader:

<xarray.Dataset> Size: 16B
Dimensions:  (dim_0: 2)
Dimensions without coordinates: dim_0
Data variables:
    bar      (dim_0) int64 16B ManifestArray<shape=(2,), dtype=int64, chunks=...

vs the custom reader:

<xarray.Dataset> Size: 32B
Dimensions:  (dim_0: 2)
Coordinates:
    dim_0    (dim_0) float64 16B 0.0 0.0
Data variables:
    bar      (dim_0) int64 16B ManifestArray<shape=(2,), dtype=int64, chunks=...

for reference, here is what it looks like if I just naively open it as a regular dataset:

(Pdb) xr.open_dataset(netcdf4_file_with_data_in_multiple_groups, group="subgroup")
<xarray.Dataset> Size: 16B
Dimensions:  (dim_0: 2)
Dimensions without coordinates: dim_0
Data variables:
    bar      (dim_0) int64 16B ...

I think this is probably the source of the nbytes difference too.

[Review comment, Member]
That seems like a bug in the HDF reader. If xarray doesn't think this dimension has a coordinate, then virtualizarr's HDF reader shouldn't create one either.

Opened #401 to track this (FYI @sharkinsspatial)
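A hedged sketch of how to reproduce the comparison above; the file path is hypothetical, standing in for the suite's fixture that puts a `bar` variable inside a `subgroup` group.

```python
from virtualizarr import open_virtual_dataset
from virtualizarr.readers import HDF5VirtualBackend  # kerchunk-based
from virtualizarr.readers.hdf import HDFVirtualBackend  # custom

path = "multiple_groups.nc"  # hypothetical stand-in for the fixture file

vds_kerchunk = open_virtual_dataset(
    path, group="subgroup", indexes={}, backend=HDF5VirtualBackend
)
vds_custom = open_virtual_dataset(
    path, group="subgroup", indexes={}, backend=HDFVirtualBackend
)

# The kerchunk-based reader leaves dim_0 as a dimension without a
# coordinate, while the custom reader materializes one (tracked in #401):
assert "dim_0" not in vds_kerchunk.coords
assert "dim_0" in vds_custom.coords
```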

         assert list(vds.variables) == ["bar"]
         assert isinstance(vds["bar"].data, ManifestArray)
         assert vds["bar"].shape == (2,)

[Review comment, Member]
Couldn't this test be combined with the one below by parameterizing group over ("", None)?

[Review comment, Member]
Done in 8706af5
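The consolidated version would presumably look something like this hedged sketch (not necessarily the exact code of 8706af5): a method on TestOpenVirtualDatasetHDFGroup with `group` parametrized so both root-group spellings share one body.

```python
import pytest

@pytest.mark.parametrize("group", ["", None])
def test_open_root_group(
    self, netcdf4_file_with_data_in_multiple_groups, hdf_backend, group
):
    # group="" selects the root group explicitly; group=None is the default.
    vds = open_virtual_dataset(
        netcdf4_file_with_data_in_multiple_groups,
        group=group,
        indexes={},
        backend=hdf_backend,
    )
    assert list(vds.variables) == ["foo"]
```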

-    def test_open_root_group_manually(self, netcdf4_file_with_data_in_multiple_groups):
+    def test_open_root_group_manually(
+        self, netcdf4_file_with_data_in_multiple_groups, hdf_backend
+    ):
         vds = open_virtual_dataset(
-            netcdf4_file_with_data_in_multiple_groups, group="", indexes={}
+            netcdf4_file_with_data_in_multiple_groups,
+            group="",
+            indexes={},
+            backend=hdf_backend,
         )
         assert list(vds.variables) == ["foo"]
         assert isinstance(vds["foo"].data, ManifestArray)
         assert vds["foo"].shape == (3,)

     def test_open_root_group_by_default(
-        self, netcdf4_file_with_data_in_multiple_groups
+        self, netcdf4_file_with_data_in_multiple_groups, hdf_backend
     ):
         vds = open_virtual_dataset(
-            netcdf4_file_with_data_in_multiple_groups, indexes={}
+            netcdf4_file_with_data_in_multiple_groups,
+            indexes={},
+            backend=hdf_backend,
         )
         assert list(vds.variables) == ["foo"]
         assert isinstance(vds["foo"].data, ManifestArray)
         assert vds["foo"].shape == (3,)

 @requires_kerchunk
 class TestLoadVirtualDataset:
-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+    @parametrize_over_hdf_backends
     def test_loadable_variables(self, netcdf4_file, hdf_backend):
         vars_to_load = ["air", "time"]
         vds = open_virtual_dataset(
@@ -397,7 +405,7 @@ def test_explicit_filetype_and_backend(self, netcdf4_file):
             netcdf4_file, filetype="hdf", backend=HDFVirtualBackend
         )

-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+    @parametrize_over_hdf_backends
     def test_group_kwarg(self, hdf5_groups_file, hdf_backend):
         if hdf_backend == HDFVirtualBackend:
             with pytest.raises(NotImplementedError, match="Nested groups"):
@@ -408,7 +416,9 @@ def test_group_kwarg(self, hdf5_groups_file, hdf_backend):
                 )
         if hdf_backend == HDF5VirtualBackend:
             with pytest.raises(ValueError, match="not found in"):
-                open_virtual_dataset(hdf5_groups_file, group="doesnt_exist")
+                open_virtual_dataset(
+                    hdf5_groups_file, group="doesnt_exist", backend=hdf_backend
+                )

         vars_to_load = ["air", "time"]
         vds = open_virtual_dataset(
@@ -441,13 +451,13 @@ def test_open_virtual_dataset_passes_expected_args(
         }
         mock_read_kerchunk.assert_called_once_with(**args)

-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+    @parametrize_over_hdf_backends
     def test_open_dataset_with_empty(self, hdf5_empty, hdf_backend):
         vds = open_virtual_dataset(hdf5_empty, backend=hdf_backend)
         assert vds.empty.dims == ()
         assert vds.empty.attrs == {"empty": "true"}

-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+    @parametrize_over_hdf_backends
     def test_open_dataset_with_scalar(self, hdf5_scalar, hdf_backend):
         vds = open_virtual_dataset(hdf5_scalar, backend=hdf_backend)
         assert vds.scalar.dims == ()
20 changes: 7 additions & 13 deletions virtualizarr/tests/test_integration.py

[Review comment, Contributor Author]
These mostly still do not run in a kerchunk-free env. See #376 for that piece.

@@ -8,9 +8,7 @@

 from virtualizarr import open_virtual_dataset
 from virtualizarr.manifests import ChunkManifest, ManifestArray
-from virtualizarr.readers import HDF5VirtualBackend
-from virtualizarr.readers.hdf import HDFVirtualBackend
-from virtualizarr.tests import requires_kerchunk
+from virtualizarr.tests import parametrize_over_hdf_backends, requires_kerchunk
 from virtualizarr.translators.kerchunk import (
     dataset_from_kerchunk_refs,
 )
@@ -61,7 +59,7 @@ def test_kerchunk_roundtrip_in_memory_no_concat():
         ),
     ],
 )
-@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+@parametrize_over_hdf_backends
 def test_numpy_arrays_to_inlined_kerchunk_refs(
     netcdf4_file, inline_threshold, vars_to_inline, hdf_backend
 ):
@@ -89,7 +87,7 @@ def test_numpy_arrays_to_inlined_kerchunk_refs(
 @requires_kerchunk
 @pytest.mark.parametrize("format", ["dict", "json", "parquet"])
 class TestKerchunkRoundtrip:
-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+    @parametrize_over_hdf_backends
     def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend):
         # set up example xarray dataset
         ds = xr.tutorial.open_dataset("air_temperature", decode_times=False)
@@ -122,7 +120,7 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend):
         for coord in ds.coords:
             assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs

-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+    @parametrize_over_hdf_backends
     @pytest.mark.parametrize("decode_times,time_vars", [(False, []), (True, ["time"])])
     def test_kerchunk_roundtrip_concat(
         self, tmpdir, format, hdf_backend, decode_times, time_vars
@@ -192,7 +190,7 @@ def test_kerchunk_roundtrip_concat(
         assert roundtrip.time.encoding["units"] == ds.time.encoding["units"]
         assert roundtrip.time.encoding["calendar"] == ds.time.encoding["calendar"]

-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+    @parametrize_over_hdf_backends
     def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend):
         # regression test for GH issue #105

@@ -278,8 +276,7 @@ def test_datetime64_dtype_fill_value(self, tmpdir, format):
     assert roundtrip.a.attrs == ds.a.attrs


-@requires_kerchunk
-@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+@parametrize_over_hdf_backends
 def test_open_scalar_variable(tmpdir, hdf_backend):
     # regression test for GH issue #100

@@ -290,9 +287,8 @@ def test_open_scalar_variable(tmpdir, hdf_backend):
     assert vds["a"].shape == ()


+@parametrize_over_hdf_backends
 class TestPathsToURIs:
-    @requires_kerchunk
-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
     def test_convert_absolute_paths_to_uris(self, netcdf4_file, hdf_backend):
         vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)

@@ -302,8 +298,6 @@ def test_convert_absolute_paths_to_uris(self, netcdf4_file, hdf_backend):
         path = manifest["0.0.0"]["path"]
         assert path == expected_path

-    @requires_kerchunk
-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
     def test_convert_relative_paths_to_uris(self, netcdf4_file, hdf_backend):
         relative_path = relpath(netcdf4_file)
         vds = open_virtual_dataset(relative_path, indexes={}, backend=hdf_backend)
16 changes: 6 additions & 10 deletions virtualizarr/tests/test_xarray.py

@@ -7,8 +7,7 @@

 from virtualizarr import open_virtual_dataset
 from virtualizarr.manifests import ChunkManifest, ManifestArray
-from virtualizarr.readers import HDF5VirtualBackend, HDFVirtualBackend
-from virtualizarr.tests import requires_kerchunk
+from virtualizarr.tests import parametrize_over_hdf_backends
 from virtualizarr.zarr import ZArray

@@ -227,8 +226,7 @@ def test_concat_dim_coords_along_existing_dim(self):
     assert result.data.zarray.zarr_format == zarray.zarr_format


-@requires_kerchunk
-@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+@parametrize_over_hdf_backends
 class TestCombineUsingIndexes:
     def test_combine_by_coords(self, netcdf4_files_factory: Callable, hdf_backend):
         filepath1, filepath2 = netcdf4_files_factory()
@@ -262,8 +260,7 @@ def test_combine_by_coords_keeping_manifestarrays(self, netcdf4_files, hdf_backe
         assert isinstance(combined_vds["lon"].data, ManifestArray)


-@requires_kerchunk
-@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
+@parametrize_over_hdf_backends
 class TestRenamePaths:
     def test_rename_to_str(self, netcdf4_file, hdf_backend):
         vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
@@ -313,14 +310,13 @@ def test_mixture_of_manifestarrays_and_numpy_arrays(
         assert isinstance(renamed_vds["lat"].data, np.ndarray)


-@requires_kerchunk
 def test_nbytes(simple_netcdf4):
     vds = open_virtual_dataset(simple_netcdf4)
-    assert vds.virtualize.nbytes == 32
-    assert vds.nbytes == 48
+    assert vds.virtualize.nbytes == 88
+    assert vds.nbytes == 104

     vds = open_virtual_dataset(simple_netcdf4, loadable_variables=["foo"])
-    assert vds.virtualize.nbytes == 48
+    assert vds.virtualize.nbytes == 104
[Review comment, Contributor Author]
I guess I am not concerned that the nbytes is different now?

[Review comment, Member]
That seems weird... I would have expected them to be the same.

[Review comment, Contributor Author]
I think this has to do with coordinates being populated for dimensions that are supposed to be "without coordinates"; see https://github.com/zarr-developers/VirtualiZarr/pull/395/files#r1934273183.
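A hedged sketch of the suspected mechanism (an interpretation of the thread above, not code from this PR): comparing both backends on the same file should expose the custom reader's extra coordinate bytes. The path is a hypothetical stand-in for the `simple_netcdf4` fixture, and the printed values follow the updated test expectations.

```python
from virtualizarr import open_virtual_dataset
from virtualizarr.readers import HDF5VirtualBackend  # kerchunk-based
from virtualizarr.readers.hdf import HDFVirtualBackend  # custom

path = "simple.nc"  # hypothetical stand-in for the simple_netcdf4 fixture

vds_kerchunk = open_virtual_dataset(path, backend=HDF5VirtualBackend)
vds_custom = open_virtual_dataset(path, backend=HDFVirtualBackend)

# If the custom reader materializes coordinates for otherwise bare
# dimensions (see #401), their bytes inflate both nbytes measures:
print(vds_kerchunk.virtualize.nbytes)  # previously expected: 32
print(vds_custom.virtualize.nbytes)    # now expected: 88
```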


     ds = open_dataset(simple_netcdf4)
     assert ds.virtualize.nbytes == ds.nbytes