diff --git a/docs/releases.rst b/docs/releases.rst
index abc3fb05..a1c095a7 100644
--- a/docs/releases.rst
+++ b/docs/releases.rst
@@ -35,6 +35,9 @@ Breaking changes
   rather than positional or keyword. This change is breaking _only_ where arguments for
   these parameters are currently given positionally.
   (:issue:`341`) By `Chuck Daniels `_.
+- The default backend for netCDF4 and HDF5 is now the custom ``HDFVirtualBackend``, replacing
+  the previous default, which was a wrapper around the kerchunk backend.
+  (:issue:`374`, :pull:`395`) By `Julia Signell `_.
 
 Deprecations
 ~~~~~~~~~~~~
diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py
index c57dc432..96c13322 100644
--- a/virtualizarr/backend.py
+++ b/virtualizarr/backend.py
@@ -13,7 +13,7 @@
 from virtualizarr.readers import (
     DMRPPVirtualBackend,
     FITSVirtualBackend,
-    HDF5VirtualBackend,
+    HDFVirtualBackend,
     KerchunkVirtualBackend,
     NetCDF3VirtualBackend,
     TIFFVirtualBackend,
@@ -27,9 +27,9 @@
     "kerchunk": KerchunkVirtualBackend,
     "zarr_v3": ZarrV3VirtualBackend,
     "dmrpp": DMRPPVirtualBackend,
+    "hdf5": HDFVirtualBackend,
+    "netcdf4": HDFVirtualBackend,  # note this is the same as for hdf5
     # all the below call one of the kerchunk backends internally (https://fsspec.github.io/kerchunk/reference.html#file-format-backends)
-    "hdf5": HDF5VirtualBackend,
-    "netcdf4": HDF5VirtualBackend,  # note this is the same as for hdf5
     "netcdf3": NetCDF3VirtualBackend,
     "tiff": TIFFVirtualBackend,
     "fits": FITSVirtualBackend,
diff --git a/virtualizarr/readers/hdf/hdf.py b/virtualizarr/readers/hdf/hdf.py
index 98da515e..9a0b69e2 100644
--- a/virtualizarr/readers/hdf/hdf.py
+++ b/virtualizarr/readers/hdf/hdf.py
@@ -361,14 +361,14 @@ def _virtual_vars_from_hdf(
         ).open_file()
         f = h5py.File(open_file, mode="r")
 
-        if group is not None:
+        if group is not None and group != "":
             g = f[group]
             group_name = group
             if not isinstance(g, h5py.Group):
                 raise ValueError("The provided group is not an HDF group")
         else:
-            g = f
-            group_name = ""
+            g = f["/"]
+            group_name = "/"
 
         variables = {}
         for key in g.keys():
@@ -381,9 +381,6 @@ def _virtual_vars_from_hdf(
             )
             if variable is not None:
                 variables[key] = variable
-            else:
-                raise NotImplementedError("Nested groups are not yet supported")
-
         return variables
 
     @staticmethod
diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py
index b1fbdf11..f38d5c2c 100644
--- a/virtualizarr/tests/__init__.py
+++ b/virtualizarr/tests/__init__.py
@@ -7,6 +7,8 @@
 
 from virtualizarr.manifests import ChunkManifest, ManifestArray
 from virtualizarr.manifests.manifest import join
+from virtualizarr.readers import HDF5VirtualBackend
+from virtualizarr.readers.hdf import HDFVirtualBackend
 from virtualizarr.zarr import ZArray, ceildiv
 
 requires_network = pytest.mark.network
@@ -42,6 +44,11 @@ def _importorskip(
 has_zarr_python, requires_zarr_python = _importorskip("zarr")
 has_zarr_python_v3, requires_zarr_python_v3 = _importorskip("zarr", "3.0.0b")
 
+parametrize_over_hdf_backends = pytest.mark.parametrize(
+    "hdf_backend",
+    [HDF5VirtualBackend, HDFVirtualBackend] if has_kerchunk else [HDFVirtualBackend],
+)
+
 
 def create_manifestarray(
     shape: tuple[int, ...], chunks: tuple[int, ...]
diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py
index 4d41bb47..e4157b73 100644
--- a/virtualizarr/tests/test_backend.py
+++ b/virtualizarr/tests/test_backend.py
@@ -15,7 +15,9 @@
 from virtualizarr.readers.hdf import HDFVirtualBackend
 from virtualizarr.tests import (
     has_astropy,
-    requires_kerchunk,
+    parametrize_over_hdf_backends,
+    requires_hdf5plugin,
+    requires_imagecodecs,
     requires_network,
     requires_s3fs,
     requires_scipy,
@@ -82,13 +84,14 @@ def test_FileType():
     FileType(None)
 
 
-@requires_kerchunk
-@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+@parametrize_over_hdf_backends
 class TestOpenVirtualDatasetIndexes:
     def test_no_indexes(self, netcdf4_file, hdf_backend):
         vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
         assert vds.indexes == {}
 
+    @requires_hdf5plugin
+    @requires_imagecodecs
     def test_create_default_indexes_for_loadable_variables(
         self, netcdf4_file, hdf_backend
     ):
@@ -122,8 +125,9 @@ def index_mappings_equal(indexes1: Mapping[str, Index], indexes2: Mapping[str, I
     return True
 
 
-@requires_kerchunk
-@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+@requires_hdf5plugin
+@requires_imagecodecs
+@parametrize_over_hdf_backends
 def test_cftime_index(tmpdir, hdf_backend):
     """Ensure a virtual dataset contains the same indexes as an Xarray dataset"""
     # Note: Test was created to debug: https://github.com/zarr-developers/VirtualiZarr/issues/168
@@ -152,8 +156,7 @@ def test_cftime_index(tmpdir, hdf_backend):
     assert vds.attrs == ds.attrs
 
 
-@requires_kerchunk
-@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+@parametrize_over_hdf_backends
 class TestOpenVirtualDatasetAttrs:
     def test_drop_array_dimensions(self, netcdf4_file, hdf_backend):
         # regression test for GH issue #150
@@ -171,14 +174,16 @@ def test_coordinate_variable_attrs_preserved(self, netcdf4_file, hdf_backend):
         }
 
 
-@requires_kerchunk
+@parametrize_over_hdf_backends
 class TestDetermineCoords:
-    def test_infer_one_dimensional_coords(self, netcdf4_file):
-        vds = open_virtual_dataset(netcdf4_file, indexes={})
+    def test_infer_one_dimensional_coords(self, netcdf4_file, hdf_backend):
+        vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
         assert set(vds.coords) == {"time", "lat", "lon"}
 
-    def test_var_attr_coords(self, netcdf4_file_with_2d_coords):
-        vds = open_virtual_dataset(netcdf4_file_with_2d_coords, indexes={})
+    def test_var_attr_coords(self, netcdf4_file_with_2d_coords, hdf_backend):
+        vds = open_virtual_dataset(
+            netcdf4_file_with_2d_coords, indexes={}, backend=hdf_backend
+        )
         expected_dimension_coords = ["ocean_time", "s_rho"]
         expected_2d_coords = ["lon_rho", "lat_rho", "h"]
@@ -189,6 +194,8 @@ def test_var_attr_coords(self, netcdf4_file_with_2d_coords):
             + expected_2d_coords
             + expected_1d_non_dimension_coords
             + expected_scalar_coords
+            # These should not be included in coords; see #401 for more information
+            + (["xi_rho", "eta_rho"] if hdf_backend == HDFVirtualBackend else [])
         )
         assert set(vds.coords) == set(expected_coords)
@@ -199,7 +206,7 @@ class TestReadFromS3:
     @pytest.mark.parametrize(
         "indexes", [None, {}], ids=["None index", "empty dict index"]
     )
-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+    @parametrize_over_hdf_backends
     def test_anon_read_s3(self, indexes, hdf_backend):
         """Parameterized tests for empty vs supplied indexes and filetypes."""
         # TODO: Switch away from this s3 url after minIO is implemented.
@@ -217,7 +224,7 @@ def test_anon_read_s3(self, indexes, hdf_backend):
 
 
 @requires_network
-@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+@parametrize_over_hdf_backends
 class TestReadFromURL:
     @pytest.mark.parametrize(
         "filetype, url",
@@ -320,46 +327,55 @@ def test_virtualizarr_vs_local_nisar(self, hdf_backend):
         xrt.assert_equal(dsXR, dsV)
 
 
-@requires_kerchunk
-def test_open_empty_group(empty_netcdf4_file):
-    vds = open_virtual_dataset(empty_netcdf4_file, indexes={})
-    assert isinstance(vds, xr.Dataset)
-    expected = Dataset()
-    xrt.assert_identical(vds, expected)
-
-
-@requires_kerchunk
+@parametrize_over_hdf_backends
 class TestOpenVirtualDatasetHDFGroup:
-    def test_open_subgroup(self, netcdf4_file_with_data_in_multiple_groups):
+    def test_open_empty_group(self, empty_netcdf4_file, hdf_backend):
+        vds = open_virtual_dataset(empty_netcdf4_file, indexes={}, backend=hdf_backend)
+        assert isinstance(vds, xr.Dataset)
+        expected = Dataset()
+        xrt.assert_identical(vds, expected)
+
+    def test_open_subgroup(
+        self, netcdf4_file_with_data_in_multiple_groups, hdf_backend
+    ):
         vds = open_virtual_dataset(
-            netcdf4_file_with_data_in_multiple_groups, group="subgroup", indexes={}
+            netcdf4_file_with_data_in_multiple_groups,
+            group="subgroup",
+            indexes={},
+            backend=hdf_backend,
+        )
+        # This should just be ["bar"]; see #401 for more information
+        assert list(vds.variables) == (
+            ["bar", "dim_0"] if hdf_backend == HDFVirtualBackend else ["bar"]
         )
-        assert list(vds.variables) == ["bar"]
         assert isinstance(vds["bar"].data, ManifestArray)
         assert vds["bar"].shape == (2,)
 
-    def test_open_root_group_manually(self, netcdf4_file_with_data_in_multiple_groups):
-        vds = open_virtual_dataset(
-            netcdf4_file_with_data_in_multiple_groups, group="", indexes={}
-        )
-        assert list(vds.variables) == ["foo"]
-        assert isinstance(vds["foo"].data, ManifestArray)
-        assert vds["foo"].shape == (3,)
-
-    def test_open_root_group_by_default(
-        self, netcdf4_file_with_data_in_multiple_groups
+    @pytest.mark.parametrize("group", ["", None])
+    def test_open_root_group(
+        self,
+        netcdf4_file_with_data_in_multiple_groups,
+        hdf_backend,
+        group,
     ):
         vds = open_virtual_dataset(
-            netcdf4_file_with_data_in_multiple_groups, indexes={}
+            netcdf4_file_with_data_in_multiple_groups,
+            group=group,
+            indexes={},
+            backend=hdf_backend,
+        )
+        # This should just be ["foo"]; see #401 for more information
+        assert list(vds.variables) == (
+            ["foo", "dim_0"] if hdf_backend == HDFVirtualBackend else ["foo"]
         )
-        assert list(vds.variables) == ["foo"]
         assert isinstance(vds["foo"].data, ManifestArray)
         assert vds["foo"].shape == (3,)
 
 
-@requires_kerchunk
+@requires_hdf5plugin
+@requires_imagecodecs
 class TestLoadVirtualDataset:
-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+    @parametrize_over_hdf_backends
     def test_loadable_variables(self, netcdf4_file, hdf_backend):
         vars_to_load = ["air", "time"]
         vds = open_virtual_dataset(
@@ -399,18 +415,18 @@ def test_explicit_filetype_and_backend(self, netcdf4_file):
             netcdf4_file, filetype="hdf", backend=HDFVirtualBackend
         )
 
-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+    @parametrize_over_hdf_backends
     def test_group_kwarg(self, hdf5_groups_file, hdf_backend):
         if hdf_backend == HDFVirtualBackend:
-            with pytest.raises(NotImplementedError, match="Nested groups"):
-                open_virtual_dataset(hdf5_groups_file, backend=hdf_backend)
             with pytest.raises(KeyError, match="doesn't exist"):
                 open_virtual_dataset(
group="doesnt_exist", backend=hdf_backend ) if hdf_backend == HDF5VirtualBackend: with pytest.raises(ValueError, match="not found in"): - open_virtual_dataset(hdf5_groups_file, group="doesnt_exist") + open_virtual_dataset( + hdf5_groups_file, group="doesnt_exist", backend=hdf_backend + ) vars_to_load = ["air", "time"] vds = open_virtual_dataset( @@ -443,13 +459,13 @@ def test_open_virtual_dataset_passes_expected_args( } mock_read_kerchunk.assert_called_once_with(**args) - @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend]) + @parametrize_over_hdf_backends def test_open_dataset_with_empty(self, hdf5_empty, hdf_backend): vds = open_virtual_dataset(hdf5_empty, backend=hdf_backend) assert vds.empty.dims == () assert vds.empty.attrs == {"empty": "true"} - @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend]) + @parametrize_over_hdf_backends def test_open_dataset_with_scalar(self, hdf5_scalar, hdf_backend): vds = open_virtual_dataset(hdf5_scalar, backend=hdf_backend) assert vds.scalar.dims == () diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 71de89fa..95be3de8 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -8,9 +8,7 @@ from virtualizarr import open_virtual_dataset from virtualizarr.manifests import ChunkManifest, ManifestArray -from virtualizarr.readers import HDF5VirtualBackend -from virtualizarr.readers.hdf import HDFVirtualBackend -from virtualizarr.tests import requires_kerchunk +from virtualizarr.tests import parametrize_over_hdf_backends, requires_kerchunk from virtualizarr.translators.kerchunk import ( dataset_from_kerchunk_refs, ) @@ -61,7 +59,7 @@ def test_kerchunk_roundtrip_in_memory_no_concat(): ), ], ) -@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend]) +@parametrize_over_hdf_backends def test_numpy_arrays_to_inlined_kerchunk_refs( netcdf4_file, inline_threshold, vars_to_inline, hdf_backend ): @@ -89,7 +87,7 @@ def test_numpy_arrays_to_inlined_kerchunk_refs( @requires_kerchunk @pytest.mark.parametrize("format", ["dict", "json", "parquet"]) class TestKerchunkRoundtrip: - @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend]) + @parametrize_over_hdf_backends def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend): # set up example xarray dataset ds = xr.tutorial.open_dataset("air_temperature", decode_times=False) @@ -122,7 +120,7 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend): for coord in ds.coords: assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs - @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend]) + @parametrize_over_hdf_backends @pytest.mark.parametrize("decode_times,time_vars", [(False, []), (True, ["time"])]) def test_kerchunk_roundtrip_concat( self, tmpdir, format, hdf_backend, decode_times, time_vars @@ -192,7 +190,7 @@ def test_kerchunk_roundtrip_concat( assert roundtrip.time.encoding["units"] == ds.time.encoding["units"] assert roundtrip.time.encoding["calendar"] == ds.time.encoding["calendar"] - @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend]) + @parametrize_over_hdf_backends def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend): # regression test for GH issue #105 @@ -278,8 +276,7 @@ def test_datetime64_dtype_fill_value(self, tmpdir, format): assert roundtrip.a.attrs == ds.a.attrs -@requires_kerchunk 
-@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+@parametrize_over_hdf_backends
 def test_open_scalar_variable(tmpdir, hdf_backend):
     # regression test for GH issue #100
 
@@ -290,9 +287,8 @@ def test_open_scalar_variable(tmpdir, hdf_backend):
     assert vds["a"].shape == ()
 
 
+@parametrize_over_hdf_backends
 class TestPathsToURIs:
-    @requires_kerchunk
-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
     def test_convert_absolute_paths_to_uris(self, netcdf4_file, hdf_backend):
         vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
 
@@ -302,8 +298,6 @@ def test_convert_absolute_paths_to_uris(self, netcdf4_file, hdf_backend):
         path = manifest["0.0.0"]["path"]
         assert path == expected_path
 
-    @requires_kerchunk
-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
     def test_convert_relative_paths_to_uris(self, netcdf4_file, hdf_backend):
         relative_path = relpath(netcdf4_file)
         vds = open_virtual_dataset(relative_path, indexes={}, backend=hdf_backend)
diff --git a/virtualizarr/tests/test_readers/test_hdf/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf/test_hdf.py
index 2e6c7c33..598848a5 100644
--- a/virtualizarr/tests/test_readers/test_hdf/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf/test_hdf.py
@@ -142,11 +142,11 @@ def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file):
         )
         assert len(variables) == 3
 
-    def test_nested_groups_not_implemented(self, nested_group_hdf5_file):
-        with pytest.raises(NotImplementedError):
-            HDFVirtualBackend._virtual_vars_from_hdf(
-                path=nested_group_hdf5_file, group="group"
-            )
+    def test_nested_groups_are_ignored(self, nested_group_hdf5_file):
+        variables = HDFVirtualBackend._virtual_vars_from_hdf(
+            path=nested_group_hdf5_file, group="group"
+        )
+        assert len(variables) == 1
 
     def test_drop_variables(self, multiple_datasets_hdf5_file):
         variables = HDFVirtualBackend._virtual_vars_from_hdf(
@@ -185,7 +185,7 @@ def test_coord_names(
 
 @requires_hdf5plugin
 @requires_imagecodecs
-@pytest.mark.parametrize("group", ["subgroup", "subgroup/"])
+@pytest.mark.parametrize("group", [None, "subgroup", "subgroup/"])
 def test_subgroup_variable_names(netcdf4_file_with_data_in_multiple_groups, group):
     # regression test for GH issue #364
     vds = open_virtual_dataset(
@@ -194,17 +194,3 @@ def test_subgroup_variable_names(netcdf4_file_with_data_in_multiple_groups, grou
         backend=HDFVirtualBackend,
     )
     assert list(vds.dims) == ["dim_0"]
-
-
-@requires_hdf5plugin
-@requires_imagecodecs
-def test_nested_groups(hdf5_groups_file):
-    # try to open an empty group
-    with pytest.raises(
-        NotImplementedError, match="Nested groups are not yet supported"
-    ):
-        open_virtual_dataset(
-            hdf5_groups_file,
-            group="/",
-            backend=HDFVirtualBackend,
-        )
diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py
index 856ff395..03cc86f9 100644
--- a/virtualizarr/tests/test_xarray.py
+++ b/virtualizarr/tests/test_xarray.py
@@ -7,8 +7,11 @@
 
 from virtualizarr import open_virtual_dataset
 from virtualizarr.manifests import ChunkManifest, ManifestArray
-from virtualizarr.readers import HDF5VirtualBackend, HDFVirtualBackend
-from virtualizarr.tests import requires_kerchunk
+from virtualizarr.tests import (
+    parametrize_over_hdf_backends,
+    requires_hdf5plugin,
+    requires_imagecodecs,
+)
 from virtualizarr.zarr import ZArray
@@ -227,8 +230,9 @@ def test_concat_dim_coords_along_existing_dim(self):
         assert result.data.zarray.zarr_format == zarray.zarr_format
 
 
-@requires_kerchunk
-@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+@requires_hdf5plugin
+@requires_imagecodecs
+@parametrize_over_hdf_backends
 class TestCombineUsingIndexes:
     def test_combine_by_coords(self, netcdf4_files_factory: Callable, hdf_backend):
         filepath1, filepath2 = netcdf4_files_factory()
@@ -262,8 +266,7 @@ def test_combine_by_coords_keeping_manifestarrays(self, netcdf4_files, hdf_backe
         assert isinstance(combined_vds["lon"].data, ManifestArray)
 
 
-@requires_kerchunk
-@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
+@parametrize_over_hdf_backends
 class TestRenamePaths:
     def test_rename_to_str(self, netcdf4_file, hdf_backend):
         vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
@@ -296,6 +299,8 @@ def test_invalid_type(self, netcdf4_file, hdf_backend):
         with pytest.raises(TypeError):
             vds.virtualize.rename_paths(["file1.nc", "file2.nc"])
 
+    @requires_hdf5plugin
+    @requires_imagecodecs
     def test_mixture_of_manifestarrays_and_numpy_arrays(
         self, netcdf4_file, hdf_backend
     ):
@@ -313,14 +318,15 @@ def test_mixture_of_manifestarrays_and_numpy_arrays(
         assert isinstance(renamed_vds["lat"].data, np.ndarray)
 
 
-@requires_kerchunk
+@requires_hdf5plugin
+@requires_imagecodecs
 def test_nbytes(simple_netcdf4):
     vds = open_virtual_dataset(simple_netcdf4)
-    assert vds.virtualize.nbytes == 32
-    assert vds.nbytes == 48
+    assert vds.virtualize.nbytes == 88
+    assert vds.nbytes == 104
 
     vds = open_virtual_dataset(simple_netcdf4, loadable_variables=["foo"])
-    assert vds.virtualize.nbytes == 48
+    assert vds.virtualize.nbytes == 104
 
     ds = open_dataset(simple_netcdf4)
     assert ds.virtualize.nbytes == ds.nbytes
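
The net effect for users is that ``open_virtual_dataset`` now routes netCDF4/HDF5 files
through ``HDFVirtualBackend`` unless a backend is passed explicitly via the ``backend``
keyword, exactly as the updated tests do. A minimal sketch of both paths, assuming a
local netCDF4 file named ``air.nc`` (the filename is illustrative, not from this PR):

    from virtualizarr import open_virtual_dataset
    from virtualizarr.readers import HDF5VirtualBackend  # kerchunk-based reader
    from virtualizarr.readers.hdf import HDFVirtualBackend  # new default reader

    # New default: netCDF4/HDF5 files are read by the custom HDFVirtualBackend.
    vds = open_virtual_dataset("air.nc", indexes={})

    # The previous kerchunk-based behaviour is still available by passing the
    # backend explicitly; parametrize_over_hdf_backends takes the same approach
    # in the test suite, including HDF5VirtualBackend only when kerchunk is
    # installed.
    vds_kerchunk = open_virtual_dataset(
        "air.nc", indexes={}, backend=HDF5VirtualBackend
    )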