diff --git a/.github/workflows/ci-osx.yaml b/.github/workflows/ci-osx.yaml new file mode 100644 index 00000000..9b30d587 --- /dev/null +++ b/.github/workflows/ci-osx.yaml @@ -0,0 +1,75 @@ +name: OSX CI + +on: [push, pull_request] + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build: + runs-on: macos-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + + steps: + - name: Checkout source + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Set up Conda + uses: conda-incubator/setup-miniconda@v3.0.4 + with: + channels: conda-forge + miniforge-version: latest + python-version: ${{ matrix.python-version }} + + - name: Show info about `base` environment + shell: "bash -l {0}" + run: | + conda info + conda config --show-sources + conda list --show-channel-urls + + - name: Set up `env` + shell: "bash -l {0}" + run: > + conda create -n env + c-compiler cxx-compiler 'clang>=12.0.1' + python=${{matrix.python-version}} wheel pip + + - name: Show info about `env` environment + shell: "bash -l {0}" + run: | + conda list --show-channel-urls -n env + + - name: Install numcodecs + shell: "bash -l {0}" + run: | + conda activate env + export DISABLE_NUMCODECS_AVX2="" + python -m pip install -v -e .[test,test_extras,msgpack,zfpy,pcodec] + + - name: List installed packages + shell: "bash -l {0}" + run: | + conda activate env + python -m pip list + + - name: Run tests + shell: "bash -l {0}" + run: | + conda activate env + pytest -v + + - uses: codecov/codecov-action@v3 + with: + #token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos + #files: ./coverage1.xml,./coverage2.xml # optional + #flags: unittests # optional + #name: codecov-umbrella # optional + #fail_ci_if_error: true # optional (default = false) + verbose: true # optional (default = false) diff --git a/.github/workflows/ci-windows.yaml b/.github/workflows/ci-windows.yaml new file mode 100644 index 00000000..397be043 --- /dev/null +++ b/.github/workflows/ci-windows.yaml @@ -0,0 +1,67 @@ +name: Windows CI + +on: [push, pull_request] + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build: + runs-on: windows-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + + steps: + - name: Checkout source + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Set up Conda + uses: conda-incubator/setup-miniconda@v3.0.4 + with: + channels: conda-forge + miniforge-version: latest + python-version: ${{ matrix.python-version }} + + - name: Set up `env` + shell: "bash -l {0}" + run: > + conda create -n env + c-compiler cxx-compiler + python=${{matrix.python-version}} wheel pip + + - name: Show info about `env` environment + shell: "bash -l {0}" + run: | + conda list --show-channel-urls -n env + + - name: Install numcodecs + shell: "bash -l {0}" + run: | + conda activate env + python -m pip install -v -e .[test,test_extras,msgpack,zfpy,pcodec] + + - name: List installed packages + shell: "bash -l {0}" + run: | + conda activate env + python -m pip list + + - name: Run tests + shell: "bash -l {0}" + run: | + conda activate env + pytest -v + + - uses: codecov/codecov-action@v3 + with: + #token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos + #files: ./coverage1.xml,./coverage2.xml # optional + #flags: unittests # optional + #name: codecov-umbrella # optional + #fail_ci_if_error: true # optional (default = false) + verbose: true # optional (default = false) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 3c4aca34..1b6adcb8 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -85,5 +85,14 @@ jobs: - uses: codecov/codecov-action@v4 with: +<<<<<<< HEAD:.github/workflows/ci.yaml token: ${{ secrets.CODECOV_TOKEN }} verbose: true +======= + #token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos + #files: ./coverage1.xml,./coverage2.xml # optional + #flags: unittests # optional + #name: codecov-umbrella # optional + #fail_ci_if_error: true # optional (default = false) + verbose: true # optional (default = false) +>>>>>>> ba38507 (Remove usage example from doctest):.github/workflows/ci-linux.yaml diff --git a/docs/bitinfo.rst b/docs/bitinfo.rst new file mode 100644 index 00000000..ed925056 --- /dev/null +++ b/docs/bitinfo.rst @@ -0,0 +1,10 @@ +PCodec +====== + +.. automodule:: numcodecs.bitinfo + +.. autoclass:: BitInfo + + .. autoattribute:: codec_id + .. automethod:: encode + .. automethod:: decode diff --git a/docs/release.rst b/docs/release.rst index a3b47185..cf00e0e0 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -13,7 +13,8 @@ Unreleased Enhancements ~~~~~~~~~~~~ - +* Add bitinfo codec + By :user:`Timothy Hodson <thodson-usgs>` :pr:`503`. Fix ~~~ diff --git a/numcodecs/__init__.py b/numcodecs/__init__.py index 47c2c616..2c50a30a 100644 --- a/numcodecs/__init__.py +++ b/numcodecs/__init__.py @@ -109,6 +109,9 @@ register_codec(Shuffle) +from numcodecs.bitinfo import BitInfo +register_codec(BitInfo) + from numcodecs.bitround import BitRound register_codec(BitRound) diff --git a/numcodecs/bitinfo.py b/numcodecs/bitinfo.py new file mode 100644 index 00000000..84dce586 --- /dev/null +++ b/numcodecs/bitinfo.py @@ -0,0 +1,294 @@ +import numpy as np + +from .compat import ensure_ndarray_like +from .bitround import BitRound + +# The size in bits of the mantissa/significand for the various floating types +# You cannot keep more bits of data than you have available +# https://en.wikipedia.org/wiki/IEEE_754 + +NMBITS = {64: 12, 32: 9, 16: 6} # number of non mantissa bits for given dtype + + +class BitInfo(BitRound): + """Floating-point bit information codec + + Drops bits from the floating point mantissa, leaving an array more amenable + to compression. The number of bits to keep is determined using the approach + from Klöwer et al. 2021 (https://www.nature.com/articles/s43588-021-00156-2). + See https://github.com/zarr-developers/numcodecs/issues/298 for discussion + and the original implementation in Julia referred to at + https://github.com/milankl/BitInformation.jl + + Parameters + ---------- + + info_level: float + The level of information to preserve in the data. The value should be + between 0. and 1.0. Higher values preserve more information. + + axes: int or list of int, optional + Axes along which to calculate the bit information. If None, all axes + are used. + + Examples + -------- + This example applies the BitInfo codec to an xarray dataset and + writes the data out using xarray's `to_zarr` method. Using this pattern, the + information content is computed chunkwise, which is recommended for + datasets with variable information content. Note that these data have + been quantized creating erroneous results, which is apparent in + the output. Do not use with quantized data in practice. + + import xarray as xr + ds = xr.tutorial.open_dataset("air_temperature") + from numcodecs import Blosc, BitInfo + compressor = Blosc(cname="zstd", clevel=3) + filters = [BitInfo(info_level=0.99)] + encoding = {"air": {"compressor": compressor, "filters": filters}} + ds.to_zarr('xbit.zarr', mode="w", encoding=encoding) + """ + + codec_id = 'bitinfo' + + def __init__(self, info_level: float, axes=None): + if (info_level < 0) or (info_level > 1.0): + raise ValueError("Please provide `info_level` from interval [0.,1.]") + + elif axes is not None and not isinstance(axes, list): + if int(axes) != axes: + raise ValueError("axis must be an integer or a list of integers.") + axes = [axes] + + elif isinstance(axes, list) and not all(int(ax) == ax for ax in axes): + raise ValueError("axis must be an integer or a list of integers.") + + self.info_level = info_level + self.axes = axes + + def encode(self, buf): + """Create int array by rounding floating-point data + + The itemsize will be preserved, but the output should be much more + compressible. + """ + a = ensure_ndarray_like(buf) + dtype = a.dtype + + if not a.dtype.kind == "f" or a.dtype.itemsize > 8: + raise TypeError("Only float arrays (16-64bit) can be bit-rounded") + + if self.axes is None: + self.axes = range(a.ndim) + + itemsize = a.dtype.itemsize + astype = f"u{itemsize}" + if a.dtype in (np.float16, np.float32, np.float64): + a = signed_exponent(a) + + a = a.astype(astype) + keepbits = [] + + for ax in self.axes: + info_per_bit = bitinformation(a, axis=ax) + keepbits.append(get_keepbits(info_per_bit, self.info_level)) + + keepbits = max(keepbits) + + return BitRound.bitround(buf, keepbits, dtype) + + +def exponent_bias(dtype): + """ + Returns the exponent bias for a given floating-point dtype. + + Example + ------- + >>> exponent_bias("f4") + 127 + >>> exponent_bias("f8") + 1023 + """ + info = np.finfo(dtype) + exponent_bits = info.bits - info.nmant - 1 + return 2 ** (exponent_bits - 1) - 1 + + +def exponent_mask(dtype): + """ + Returns exponent mask for a given floating-point dtype. + + Example + ------- + >>> np.binary_repr(exponent_mask(np.float32), width=32) + '01111111100000000000000000000000' + >>> np.binary_repr(exponent_mask(np.float16), width=16) + '0111110000000000' + """ + if dtype == np.float16: + mask = 0x7C00 + elif dtype == np.float32: + mask = 0x7F80_0000 + elif dtype == np.float64: + mask = 0x7FF0_0000_0000_0000 + else: + raise ValueError(f"Unsupported dtype {dtype}") + return mask + + +def signed_exponent(A): + """ + Transform biased exponent notation to signed exponent notation. + + Parameters + ---------- + a : array + Array to transform + + Returns + ------- + array + + Example + ------- + >>> A = np.array(0.03125, dtype="float32") + >>> np.binary_repr(A.view("uint32"), width=32) + '00111101000000000000000000000000' + >>> np.binary_repr(signed_exponent(A), width=32) + '01000010100000000000000000000000' + >>> A = np.array(0.03125, dtype="float64") + >>> np.binary_repr(A.view("uint64"), width=64) + '0011111110100000000000000000000000000000000000000000000000000000' + >>> np.binary_repr(signed_exponent(A), width=64) + '0100000001010000000000000000000000000000000000000000000000000000' + """ + itemsize = A.dtype.itemsize + uinttype = f"u{itemsize}" + inttype = f"i{itemsize}" + + sign_mask = 1 << np.finfo(A.dtype).bits - 1 + sfmask = sign_mask | (1 << np.finfo(A.dtype).nmant) - 1 + emask = exponent_mask(A.dtype) + esignmask = sign_mask >> 1 + + sbits = np.finfo(A.dtype).nmant + if itemsize == 8: + sbits = np.uint64(sbits) + emask = np.uint64(emask) + bias = exponent_bias(A.dtype) + + ui = A.view(uinttype) + sf = ui & sfmask + e = ((ui & emask) >> sbits).astype(inttype) - bias + max_eabs = np.iinfo(A.view(uinttype).dtype).max >> sbits + eabs = abs(e) % (max_eabs + 1) + esign = np.where(e < 0, esignmask, 0) + if itemsize == 8: + eabs = np.uint64(eabs) + esign = np.uint64(esign) + esigned = esign | (eabs << sbits) + return (sf | esigned).view(np.int64) + + +def bitpaircount_u1(a, b): + assert a.dtype == "u1" + assert b.dtype == "u1" + + unpack_a = np.unpackbits(a.flatten()).astype("u1") + unpack_b = np.unpackbits(b.flatten()).astype("u1") + + index = ((unpack_a << 1) | unpack_b).reshape(-1, 8) + + selection = np.array([0, 1, 2, 3], dtype="u1") + sel = np.where((index[..., np.newaxis]) == selection, True, False) + return sel.sum(axis=0).reshape([8, 2, 2]) + + +def bitpaircount(a, b): + assert a.dtype.kind == "u" + assert b.dtype.kind == "u" + + nbytes = max(a.dtype.itemsize, b.dtype.itemsize) + + a, b = np.broadcast_arrays(a, b) + + bytewise_counts = [] + for i in range(nbytes): + s = (nbytes - 1 - i) * 8 + bitc = bitpaircount_u1((a >> s).astype("u1"), (b >> s).astype("u1")) + bytewise_counts.append(bitc) + return np.concatenate(bytewise_counts, axis=0) + + +def mutual_information(a, b, base=2): + """Calculate the mutual information between two arrays.""" + assert a.dtype == b.dtype + assert a.dtype.kind == "u" + + size = np.prod(np.broadcast_shapes(a.shape, b.shape)) + counts = bitpaircount(a, b) + + p = counts.astype("float") / size + p = np.ma.masked_equal(p, 0) + pr = p.sum(axis=-1)[..., np.newaxis] + ps = p.sum(axis=-2)[..., np.newaxis, :] + mutual_info = (p * np.ma.log(p / (pr * ps))).sum(axis=(-1, -2)) / np.log(base) + return mutual_info + + +def bitinformation(a, axis=0): + """Get the information content of each bit in the array. + + Parameters + ---------- + a : array + Array to calculate the bit information. + axis : int + Axis along which to calculate the bit information. + + Returns + ------- + info_per_bit : array + """ + assert a.dtype.kind == "u" + + sa = tuple(slice(0, -1) if i == axis else slice(None) for i in range(len(a.shape))) + sb = tuple( + slice(1, None) if i == axis else slice(None) for i in range(len(a.shape)) + ) + return mutual_information(a[sa], a[sb]) + + +def get_keepbits(info_per_bit, inflevel=0.99): + """Get the number of mantissa bits to keep. + + Parameters + ---------- + info_per_bit : array + Information content of each bit from `get_bitinformation`. + + inflevel : float + Level of information that shall be preserved. + + Returns + ------- + keepbits : int + Number of mantissa bits to keep + + """ + if (inflevel < 0) or (inflevel > 1.0): + raise ValueError("Please provide `inflevel` from interval [0.,1.]") + + cdf = _cdf_from_info_per_bit(info_per_bit) + bitdim_non_mantissa_bits = NMBITS[len(info_per_bit)] + keepmantissabits = (cdf > inflevel).argmax() + 1 - bitdim_non_mantissa_bits + + return keepmantissabits + + +def _cdf_from_info_per_bit(info_per_bit): + """Convert info_per_bit to cumulative distribution function""" + tol = info_per_bit[-4:].max() * 1.5 + info_per_bit[info_per_bit < tol] = 0 + cdf = info_per_bit.cumsum() + return cdf / cdf[-1] diff --git a/numcodecs/bitround.py b/numcodecs/bitround.py index 767e5e43..117d7467 100644 --- a/numcodecs/bitround.py +++ b/numcodecs/bitround.py @@ -54,19 +54,12 @@ def encode(self, buf): raise TypeError("Only float arrays (16-64bit) can be bit-rounded") bits = max_bits[str(a.dtype)] # cast float to int type of same width (preserve endianness) - a_int_dtype = np.dtype(a.dtype.str.replace("f", "i")) - all_set = np.array(-1, dtype=a_int_dtype) if self.keepbits == bits: return a if self.keepbits > bits: raise ValueError("Keepbits too large for given dtype") - b = a.view(a_int_dtype) - maskbits = bits - self.keepbits - mask = (all_set >> maskbits) << maskbits - half_quantum1 = (1 << (maskbits - 1)) - 1 - b += ((b >> maskbits) & 1) + half_quantum1 - b &= mask - return b + + return self.bitround(a, self.keepbits, a.dtype) def decode(self, buf, out=None): """Remake floats from ints @@ -78,3 +71,32 @@ def decode(self, buf, out=None): dt = np.dtype(buf.dtype.str.replace("i", "f")) data = buf.view(dt) return ndarray_copy(data, out) + + @staticmethod + def bitround(buf, keepbits: int, dtype): + """Drop bits from the mantissa of a floating point array + + Parameters + ---------- + buf: ndarray + The input array + keepbits: int + The number of bits to keep + dtype: dtype + The dtype of the input array + + Returns + ------- + ndarray + The bitrounded array transformed to an integer type + """ + bits = max_bits[str(dtype)] + a_int_dtype = np.dtype(buf.dtype.str.replace("f", "i")) + all_set = np.array(-1, dtype=a_int_dtype) + b = buf.view(a_int_dtype) + maskbits = bits - keepbits + mask = (all_set >> maskbits) << maskbits + half_quantum1 = (1 << (maskbits - 1)) - 1 + b += ((b >> maskbits) & 1) + half_quantum1 + b &= mask + return b diff --git a/numcodecs/tests/test_bitinfo.py b/numcodecs/tests/test_bitinfo.py new file mode 100644 index 00000000..aba5b34e --- /dev/null +++ b/numcodecs/tests/test_bitinfo.py @@ -0,0 +1,70 @@ +import numpy as np + +import pytest + +from numcodecs.bitinfo import BitInfo, exponent_bias, mutual_information + + +def test_bitinfo_initialization(): + bitinfo = BitInfo(0.5) + assert bitinfo.info_level == 0.5 + assert bitinfo.axes is None + + bitinfo = BitInfo(0.5, axes=1) + assert bitinfo.axes == [1] + + bitinfo = BitInfo(0.5, axes=[1, 2]) + assert bitinfo.axes == [1, 2] + + with pytest.raises(ValueError): + BitInfo(-0.1) + + with pytest.raises(ValueError): + BitInfo(1.1) + + with pytest.raises(ValueError): + BitInfo(0.5, axes=1.5) + + with pytest.raises(ValueError): + BitInfo(0.5, axes=[1, 1.5]) + + +def test_bitinfo_encode(): + bitinfo = BitInfo(info_level=0.5) + a = np.array([1.0, 2.0, 3.0], dtype="float32") + encoded = bitinfo.encode(a) + decoded = bitinfo.decode(encoded) + assert decoded.dtype == a.dtype + + +def test_bitinfo_encode_errors(): + bitinfo = BitInfo(0.5) + a = np.array([1, 2, 3], dtype="int32") + with pytest.raises(TypeError): + bitinfo.encode(a) + + +def test_exponent_bias(): + assert exponent_bias("f2") == 15 + assert exponent_bias("f4") == 127 + assert exponent_bias("f8") == 1023 + + with pytest.raises(ValueError): + exponent_bias("int32") + + +def test_mutual_information(): + """Test mutual information calculation + + Tests for changes to the mutual_information + but not the correcteness of the original. + """ + a = np.arange(10.0, dtype='float32') + b = a + 1000 + c = a[::-1].copy() + dt = np.dtype('uint32') + a, b, c = map(lambda x: x.view(dt), [a, b, c]) + + assert mutual_information(a, a).sum() == 7.020411549771797 + assert mutual_information(a, b).sum() == 0.0 + assert mutual_information(a, c).sum() == 0.6545015579460758