diff --git a/wums/ioutils.py b/wums/ioutils.py index a47184e..3b2cce2 100644 --- a/wums/ioutils.py +++ b/wums/ioutils.py @@ -2,8 +2,9 @@ import pickle import sys -import wums -sys.modules['narf.ioutils'] = sys.modules['wums.ioutils'] # backwards compatibility to use old files +sys.modules["narf.ioutils"] = sys.modules[ + "wums.ioutils" +] # backwards compatibility to use old files import boost_histogram as bh import h5py @@ -156,9 +157,11 @@ def reduce_Hist(obj): view = get_histogram_view(obj) h5buf = H5Buffer(view) + metadata = obj.metadata if hasattr(obj, "metadata") else None + return ( make_Hist, - (axes, obj.storage_type(), obj.metadata, obj.label, obj.name, h5buf), + (axes, obj.storage_type(), metadata, obj.label, obj.name, h5buf), ) diff --git a/wums/sparse_hist.py b/wums/sparse_hist.py new file mode 100644 index 0000000..f651552 --- /dev/null +++ b/wums/sparse_hist.py @@ -0,0 +1,258 @@ +"""Sparse histogram wrapper combining a scipy sparse array with hist axes. + +The :class:`SparseHist` class pairs a scipy sparse array with a sequence of +hist axes describing its dense N-D shape. The dense layout is always the +with-flow layout (each axis contributes ``axis.extent`` bins). Consumers can +extract either the with-flow or no-flow representation via the ``flow`` +parameter on :meth:`SparseHist.toarray` and :meth:`SparseHist.to_flat_csr`. +""" + +import numpy as np + + +class _AxesTuple(tuple): + """Tuple of hist axes that supports lookup by name as well as by index.""" + + def __getitem__(self, key): + if isinstance(key, str): + for ax in self: + if ax.name == key: + return ax + raise KeyError(f"axis '{key}' not found") + return tuple.__getitem__(self, key) + + +class SparseHist: + """Wrapper combining a scipy sparse array with hist axes describing its dense shape. + + The dense N-D layout is **always the with-flow layout**: each axis contributes + ``axis.extent`` bins, where (for axes with underflow) position 0 is the + underflow bin, regular bins follow at positions 1..size, and (for axes with + overflow) the overflow bin is at the last position. For axes that have + neither underflow nor overflow, ``extent == size`` and the layout matches the + no-flow layout exactly. + + The user provides scipy sparse data whose row-major flattening matches the + row-major flattening of this with-flow dense shape. Consumers (such as the + rabbit ``TensorWriter``) can extract either the with-flow or no-flow layout + via the ``flow`` parameter on :meth:`toarray` and :meth:`to_flat_csr`. + + Parameters + ---------- + data : scipy.sparse array or matrix + Sparse storage. Total element count must equal the product of axis extents. + axes : sequence of hist axes + Axes describing the dense N-D shape. Each axis must have ``.name``. + """ + + @staticmethod + def _underflow_offset(ax): + """Return 1 if the axis has an underflow bin, 0 otherwise.""" + traits = getattr(ax, "traits", None) + if traits is not None and getattr(traits, "underflow", False): + return 1 + return 0 + + def __init__(self, data, axes): + self._axes = _AxesTuple(axes) + self._dense_shape = tuple(int(a.extent) for a in self._axes) + self._size = int(np.prod(self._dense_shape)) + + if not (hasattr(data, "toarray") and hasattr(data, "tocoo")): + raise TypeError( + f"data must be a scipy sparse array/matrix, got {type(data).__name__}" + ) + + if int(np.prod(data.shape)) != self._size: + raise ValueError( + f"Total elements in sparse data ({int(np.prod(data.shape))}) does " + f"not match product of axis extents {self._dense_shape} = {self._size}" + ) + + # Internally store as flat (indices, values) corresponding to row-major + # flatten of the with-flow dense shape. + coo = data.tocoo() + if coo.ndim == 2: + flat_idx = np.ravel_multi_index((coo.row, coo.col), data.shape) + elif coo.ndim == 1: + flat_idx = coo.coords[0] + else: + raise ValueError(f"Unsupported sparse ndim {coo.ndim}") + + self._flat_indices = np.asarray(flat_idx, dtype=np.int64) + self._values = np.asarray(coo.data) + + @classmethod + def _from_flat(cls, flat_indices, values, axes, size): + """Construct directly from flat indices and values, bypassing __init__ checks.""" + obj = cls.__new__(cls) + obj._axes = _AxesTuple(axes) + obj._dense_shape = tuple(int(a.extent) for a in obj._axes) + obj._size = int(size) + obj._flat_indices = np.asarray(flat_indices, dtype=np.int64) + obj._values = np.asarray(values) + return obj + + @property + def axes(self): + return self._axes + + @property + def shape(self): + return self._dense_shape + + @property + def dtype(self): + return self._values.dtype + + @property + def nnz(self): + return len(self._flat_indices) + + def toarray(self, flow=True): + """Return the dense N-D numpy array. + + If ``flow=True`` (default), the result has the with-flow shape (extents). + If ``flow=False``, flow bins are dropped and the result has the no-flow + shape (sizes). + """ + out = np.zeros(self._size, dtype=self._values.dtype) + out[self._flat_indices] = self._values + full = out.reshape(self._dense_shape) + if flow: + return full + slices = tuple( + slice(self._underflow_offset(ax), self._underflow_offset(ax) + len(ax)) + for ax in self._axes + ) + return full[slices] + + def tocoo(self): + """Return a 2D scipy COO array of shape (1, size) in the with-flow layout.""" + import scipy.sparse + + return scipy.sparse.coo_array( + ( + self._values, + (np.zeros(len(self._flat_indices), dtype=np.int64), self._flat_indices), + ), + shape=(1, self._size), + ) + + def to_flat_csr(self, dtype, flow=True): + """Return a flat CSR array of shape (1, size) with sorted indices. + + If ``flow=True`` (default), returns the with-flow CSR (size = product of + extents). If ``flow=False``, drops entries that fall in flow bins and + returns a CSR in the no-flow layout (size = product of sizes), with + indices shifted to that layout. + """ + import scipy.sparse + + if flow: + target_size = self._size + else: + no_flow_shape = tuple(int(len(ax)) for ax in self._axes) + target_size = int(np.prod(no_flow_shape)) + + # Use int64 indices when the flat size exceeds the int32 range, since + # scipy.sparse CSR indices default to int32 and would silently overflow. + idx_dtype = np.int64 if target_size > np.iinfo(np.int32).max else np.int32 + + if flow: + sort_order = np.argsort(self._flat_indices) + sorted_idx = self._flat_indices[sort_order].astype(idx_dtype) + sorted_vals = self._values[sort_order].astype(dtype) + indptr = np.array([0, len(sorted_vals)], dtype=idx_dtype) + return scipy.sparse.csr_array( + (sorted_vals, sorted_idx, indptr), shape=(1, self._size) + ) + + # No-flow extraction: filter entries in flow bins, shift remaining to + # the no-flow layout. + if len(self._flat_indices) == 0: + indptr = np.array([0, 0], dtype=idx_dtype) + return scipy.sparse.csr_array( + ( + np.zeros(0, dtype=dtype), + np.zeros(0, dtype=idx_dtype), + indptr, + ), + shape=(1, target_size), + ) + + multi = np.unravel_index(self._flat_indices, self._dense_shape) + mask = np.ones(len(self._flat_indices), dtype=bool) + for i, ax in enumerate(self._axes): + u = self._underflow_offset(ax) + s = int(len(ax)) + mask &= (multi[i] >= u) & (multi[i] < u + s) + + shifted = tuple( + multi[i][mask] - self._underflow_offset(ax) + for i, ax in enumerate(self._axes) + ) + + if len(no_flow_shape) == 1: + new_flat = shifted[0] + else: + new_flat = np.ravel_multi_index(shifted, no_flow_shape) + + new_values = self._values[mask] + sort_order = np.argsort(new_flat) + sorted_idx = new_flat[sort_order].astype(idx_dtype) + sorted_vals = new_values[sort_order].astype(dtype) + indptr = np.array([0, len(sorted_vals)], dtype=idx_dtype) + return scipy.sparse.csr_array( + (sorted_vals, sorted_idx, indptr), shape=(1, target_size) + ) + + def __getitem__(self, slice_dict): + """Slice along one or more axes by integer index, returning a new SparseHist. + + Slice indices are interpreted as regular-bin indices (0..axis.size-1), + matching hist's ``h[{"name": i}]`` convention. The underflow offset is + added internally so the slice maps to the correct position in the + with-flow dense layout. + """ + if not isinstance(slice_dict, dict): + raise TypeError( + f"SparseHist supports only dict-style index slicing, got {type(slice_dict).__name__}" + ) + + slice_per_axis = {} + for ax_name, ax_idx in slice_dict.items(): + try: + ax_pos = next(i for i, a in enumerate(self._axes) if a.name == ax_name) + except StopIteration as ex: + raise KeyError( + f"Axis '{ax_name}' not found in SparseHist axes " + f"{[a.name for a in self._axes]}" + ) from ex + ax = self._axes[ax_pos] + slice_per_axis[ax_pos] = int(ax_idx) + self._underflow_offset(ax) + + keep_positions = [i for i in range(len(self._axes)) if i not in slice_per_axis] + if not keep_positions: + raise ValueError("Cannot slice all axes of a SparseHist") + axes_keep = [self._axes[i] for i in keep_positions] + + # Convert flat indices back to multi-dim + multi = np.unravel_index(self._flat_indices, self._dense_shape) + + # Filter entries that match the requested slice + mask = np.ones(len(self._flat_indices), dtype=bool) + for ax_pos, sl in slice_per_axis.items(): + mask &= multi[ax_pos] == sl + + new_dense_shape = tuple(int(a.extent) for a in axes_keep) + new_size = int(np.prod(new_dense_shape)) + + if len(keep_positions) == 1: + new_flat = multi[keep_positions[0]][mask] + else: + new_multi = tuple(multi[i][mask] for i in keep_positions) + new_flat = np.ravel_multi_index(new_multi, new_dense_shape) + + new_values = self._values[mask] + return SparseHist._from_flat(new_flat, new_values, axes_keep, new_size)