From 90dbb684f0568f313c0f3cfa50df1a467e0a55a4 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Mon, 19 Dec 2022 09:28:26 -0500 Subject: [PATCH 1/3] add parametrized tests for netcdf encoding options --- kerchunk/tests/test_hdf.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/kerchunk/tests/test_hdf.py b/kerchunk/tests/test_hdf.py index 50f1dba7..2a0639ad 100644 --- a/kerchunk/tests/test_hdf.py +++ b/kerchunk/tests/test_hdf.py @@ -286,3 +286,40 @@ def test_compact(): m = fsspec.get_mapper("reference://", fo=out) g = zarr.open(m) assert np.allclose(g.ancillary_data.atlas_sdp_gps_epoch[:], 1.19880002e09) + + +@pytest.mark.parametrize("zlib", [True, False], ids=["zlib", "no_zlib"]) +@pytest.mark.parametrize("shuffle", [True, False], ids=["shuffle", "no_shuffle"]) +@pytest.mark.parametrize("fletcher32", [True, False], ids=["fletcher32", "no_fletcher32"]) +def test_encoding_options(zlib, shuffle, fletcher32, tmp_path): + fname = tmp_path / "test.nc" + + shape = (2, 10) + chunksizes = (1, 10) + + encoding = { + 'zlib': zlib, + 'shuffle': shuffle, + 'complevel': 2, + 'fletcher32': fletcher32, + 'contiguous': False, + 'chunksizes': chunksizes + } + + da = xr.DataArray( + data=np.random.rand(*shape), + dims=['y', 'x'], + name="foo", + attrs={"bar": "baz"} + ) + da.encoding = encoding + ds = da.to_dataset() + ds.to_netcdf(fname, engine="netcdf4", mode="w") + + with fsspec.open(fname) as fp: + h5chunks = kerchunk.hdf.SingleHdf5ToZarr(fp, fname, inline_threshold=0, spec=0) + refs = h5chunks.translate() + + store = fsspec.get_mapper("reference://", fo=refs) + ds2 = xr.open_dataset(store, engine="zarr", chunks={}) + xr.testing.assert_identical(ds, ds2) From 5d8ae215f029c39fe8fe4f029871c7465c45a7b7 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Mon, 19 Dec 2022 09:32:17 -0500 Subject: [PATCH 2/3] pre-commit --- kerchunk/tests/test_hdf.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/kerchunk/tests/test_hdf.py b/kerchunk/tests/test_hdf.py index 2a0639ad..1ed0d20c 100644 --- a/kerchunk/tests/test_hdf.py +++ b/kerchunk/tests/test_hdf.py @@ -290,27 +290,26 @@ def test_compact(): @pytest.mark.parametrize("zlib", [True, False], ids=["zlib", "no_zlib"]) @pytest.mark.parametrize("shuffle", [True, False], ids=["shuffle", "no_shuffle"]) -@pytest.mark.parametrize("fletcher32", [True, False], ids=["fletcher32", "no_fletcher32"]) +@pytest.mark.parametrize( + "fletcher32", [True, False], ids=["fletcher32", "no_fletcher32"] +) def test_encoding_options(zlib, shuffle, fletcher32, tmp_path): fname = tmp_path / "test.nc" - + shape = (2, 10) chunksizes = (1, 10) encoding = { - 'zlib': zlib, - 'shuffle': shuffle, - 'complevel': 2, - 'fletcher32': fletcher32, - 'contiguous': False, - 'chunksizes': chunksizes + "zlib": zlib, + "shuffle": shuffle, + "complevel": 2, + "fletcher32": fletcher32, + "contiguous": False, + "chunksizes": chunksizes, } da = xr.DataArray( - data=np.random.rand(*shape), - dims=['y', 'x'], - name="foo", - attrs={"bar": "baz"} + data=np.random.rand(*shape), dims=["y", "x"], name="foo", attrs={"bar": "baz"} ) da.encoding = encoding ds = da.to_dataset() From 61150102fc22c2566d0573becd600428f6c2a97d Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 19 Dec 2022 10:10:54 -0500 Subject: [PATCH 3/3] fix fletcher --- kerchunk/codecs.py | 13 +++++++++++++ kerchunk/hdf.py | 7 +++---- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/kerchunk/codecs.py b/kerchunk/codecs.py index 28f0fc15..3755b3f5 100644 --- a/kerchunk/codecs.py +++ b/kerchunk/codecs.py @@ -66,6 +66,19 @@ def decode(self, buf, out=None): numcodecs.register_codec(FillStringsCodec, "fill_hdf_strings") +class FletcherDummyFilter(numcodecs.abc.Codec): + codec_id = "fletcher_null" + + def decode(self, buff, out=None): + return buff[:-4] + + def encode(self, buf): + pass + + +numcodecs.register_codec(FletcherDummyFilter, "fletcher_null") + + class GRIBCodec(numcodecs.abc.Codec): """ Read GRIB stream of bytes as a message using eccodes diff --git a/kerchunk/hdf.py b/kerchunk/hdf.py index b54bd525..ebe461f1 100644 --- a/kerchunk/hdf.py +++ b/kerchunk/hdf.py @@ -7,7 +7,7 @@ import zarr from zarr.meta import encode_fill_value import numcodecs -from .codecs import FillStringsCodec +from .codecs import FillStringsCodec, FletcherDummyFilter from .utils import _encode_for_JSON try: @@ -376,6 +376,8 @@ def _translator(self, name: str, h5obj: Union[h5py.Dataset, h5py.Group]): ) # Create a Zarr array equivalent to this HDF5 dataset... + if h5obj.fletcher32: + filters.append(FletcherDummyFilter()) za = self._zroot.create_dataset( h5obj.name, shape=h5obj.shape, @@ -399,9 +401,6 @@ def _translator(self, name: str, h5obj: Union[h5py.Dataset, h5py.Group]): # Store chunk location metadata... if cinfo: for k, v in cinfo.items(): - if h5obj.fletcher32: - logging.info("Discarding fletcher32 checksum") - v["size"] -= 4 self.store[za._chunk_key(k)] = [ self._uri, v["offset"],