Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions wums/ioutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
import pickle
import sys

import wums
sys.modules['narf.ioutils'] = sys.modules['wums.ioutils'] # backwards compatibility to use old files
sys.modules["narf.ioutils"] = sys.modules[
"wums.ioutils"
] # backwards compatibility to use old files

import boost_histogram as bh
import h5py
Expand Down Expand Up @@ -156,9 +157,11 @@ def reduce_Hist(obj):
view = get_histogram_view(obj)
h5buf = H5Buffer(view)

metadata = obj.metadata if hasattr(obj, "metadata") else None

return (
make_Hist,
(axes, obj.storage_type(), obj.metadata, obj.label, obj.name, h5buf),
(axes, obj.storage_type(), metadata, obj.label, obj.name, h5buf),
)


Expand Down
258 changes: 258 additions & 0 deletions wums/sparse_hist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,258 @@
"""Sparse histogram wrapper combining a scipy sparse array with hist axes.

The :class:`SparseHist` class pairs a scipy sparse array with a sequence of
hist axes describing its dense N-D shape. The dense layout is always the
with-flow layout (each axis contributes ``axis.extent`` bins). Consumers can
extract either the with-flow or no-flow representation via the ``flow``
parameter on :meth:`SparseHist.toarray` and :meth:`SparseHist.to_flat_csr`.
"""

import numpy as np


class _AxesTuple(tuple):
"""Tuple of hist axes that supports lookup by name as well as by index."""

def __getitem__(self, key):
if isinstance(key, str):
for ax in self:
if ax.name == key:
return ax
raise KeyError(f"axis '{key}' not found")
return tuple.__getitem__(self, key)


class SparseHist:
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if this is the idea but if we want to use SparseHist as drop in replacement for a regular Hist object we should give it the same attributes.

Right now "name" and "label" are the obvious ones missing.

There are also small differences e.g. the .shape for SparseHist includes under/overflow while it it not included in the regular Hist.

On the Hist object I can also do things like "h_dense.axes.name" which doesn't work for the SparseHist.

Functions like "fill" or "project" could be set as "NotImplemented" or "NotSupported"

Just for reference this is the full list:

>>> h_sparse.__dir__()
['_axes', '_dense_shape', '_size', '_flat_indices', '_values', '__module__', '__firstlineno__', '__doc__', '_underflow_offset', '__init__', '_from_flat', 'axes', 'shape', 'dtype', 'nnz', 'toarray', 'tocoo', 'to_flat_csr', '__getitem__', '__static_attributes__', '__dict__', '__weakref__', '__new__', '__repr__', '__hash__', '__str__', '__getattribute__', '__setattr__', '__delattr__', '__lt__', '__le__', '__eq__', '__ne__', '__gt__', '__ge__', '__reduce_ex__', '__reduce__', '__getstate__', '__subclasshook__', '__init_subclass__', '__format__', '__sizeof__', '__dir__', '__class__']

>>> h_dense.__dir__()
['_variance_known', 'name', 'label', '__module__', '__firstlineno__', '__static_attributes__', '__orig_bases__', '__weakref__', '__doc__', '__parameters__', '_family', '__slots__', '__init__', '_generate_axes_', '_repr_html_', '_name_to_index', '_to_uhi_', 'from_columns', 'project', 'T', 'fill', 'fill_flattened', 'sort', '_convert_index_wildcards', '_loc_shortcut', '_step_shortcut', '_index_transform', '__getitem__', '__setitem__', 'profile', 'density', 'show', 'plot', 'plot1d', 'plot2d', 'plot2d_full', 'plot_ratio', 'plot_pull', 'plot_pie', 'stack', 'integrate', '__annotations__', '__init_subclass__', '_clone', '_new_hist', '_from_histogram_cpp', '_from_histogram_object', '_import_bh_', '_export_bh_', '__getattr__', '_from_uhi_', 'ndim', 'view', '__array__', '__hash__', '__eq__', '__ne__', '__add__', '__iadd__', '__radd__', '__sub__', '__isub__', '__mul__', '__rmul__', '__truediv__', '__div__', '__idiv__', '__itruediv__', '__imul__', '_compute_inplace_op', '__str__', '_axis', 'storage_type', '_storage_type', '_reduce', '__copy__', '__deepcopy__', '__getstate__', '__setstate__', '__repr__', '_compute_uhi_index', '_compute_commonindex', 'to_numpy', 'copy', 'reset', 'empty', 'sum', 'size', 'shape', '_handle_slice', '_rebin_with_groups', 'kind', 'values', 'variances', 'counts', '_hist', 'axes', '__dict__', '_types', '__class_getitem__', '__new__', '__getattribute__', '__setattr__', '__delattr__', '__lt__', '__le__', '__gt__', '__ge__', '__reduce_ex__', '__reduce__', '__subclasshook__', '__format__', '__sizeof__', '__dir__', '__class__']

>>> h_dense.__dict__
{'_variance_known': True, 'name': None, 'label': None}

>>> h_sparse.__dict__
{'_axes': (Regular(20, -5, 5, name='x'),), '_dense_shape': (22,), '_size': 22, '_flat_indices': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20]), '_values': array([265., 235., 247., 249., 249., 263., 260., 265., 226., 248., 247.,
       254., 230., 227., 261., 246., 254., 283., 242., 249.])}

"""Wrapper combining a scipy sparse array with hist axes describing its dense shape.

The dense N-D layout is **always the with-flow layout**: each axis contributes
``axis.extent`` bins, where (for axes with underflow) position 0 is the
underflow bin, regular bins follow at positions 1..size, and (for axes with
overflow) the overflow bin is at the last position. For axes that have
neither underflow nor overflow, ``extent == size`` and the layout matches the
no-flow layout exactly.

The user provides scipy sparse data whose row-major flattening matches the
row-major flattening of this with-flow dense shape. Consumers (such as the
rabbit ``TensorWriter``) can extract either the with-flow or no-flow layout
via the ``flow`` parameter on :meth:`toarray` and :meth:`to_flat_csr`.

Parameters
----------
data : scipy.sparse array or matrix
Sparse storage. Total element count must equal the product of axis extents.
axes : sequence of hist axes
Axes describing the dense N-D shape. Each axis must have ``.name``.
"""

@staticmethod
def _underflow_offset(ax):
"""Return 1 if the axis has an underflow bin, 0 otherwise."""
traits = getattr(ax, "traits", None)
if traits is not None and getattr(traits, "underflow", False):
return 1
return 0

def __init__(self, data, axes):
self._axes = _AxesTuple(axes)
self._dense_shape = tuple(int(a.extent) for a in self._axes)
self._size = int(np.prod(self._dense_shape))

if not (hasattr(data, "toarray") and hasattr(data, "tocoo")):
raise TypeError(
f"data must be a scipy sparse array/matrix, got {type(data).__name__}"
)

if int(np.prod(data.shape)) != self._size:
raise ValueError(
f"Total elements in sparse data ({int(np.prod(data.shape))}) does "
f"not match product of axis extents {self._dense_shape} = {self._size}"
)

# Internally store as flat (indices, values) corresponding to row-major
# flatten of the with-flow dense shape.
coo = data.tocoo()
if coo.ndim == 2:
flat_idx = np.ravel_multi_index((coo.row, coo.col), data.shape)
elif coo.ndim == 1:
flat_idx = coo.coords[0]
else:
raise ValueError(f"Unsupported sparse ndim {coo.ndim}")

self._flat_indices = np.asarray(flat_idx, dtype=np.int64)
self._values = np.asarray(coo.data)

@classmethod
def _from_flat(cls, flat_indices, values, axes, size):
"""Construct directly from flat indices and values, bypassing __init__ checks."""
obj = cls.__new__(cls)
obj._axes = _AxesTuple(axes)
obj._dense_shape = tuple(int(a.extent) for a in obj._axes)
obj._size = int(size)
obj._flat_indices = np.asarray(flat_indices, dtype=np.int64)
obj._values = np.asarray(values)
return obj

@property
def axes(self):
return self._axes

@property
def shape(self):
return self._dense_shape

@property
def dtype(self):
return self._values.dtype

@property
def nnz(self):
return len(self._flat_indices)

def toarray(self, flow=True):
"""Return the dense N-D numpy array.

If ``flow=True`` (default), the result has the with-flow shape (extents).
If ``flow=False``, flow bins are dropped and the result has the no-flow
shape (sizes).
"""
out = np.zeros(self._size, dtype=self._values.dtype)
out[self._flat_indices] = self._values
full = out.reshape(self._dense_shape)
if flow:
return full
slices = tuple(
slice(self._underflow_offset(ax), self._underflow_offset(ax) + len(ax))
for ax in self._axes
)
return full[slices]

def tocoo(self):
"""Return a 2D scipy COO array of shape (1, size) in the with-flow layout."""
import scipy.sparse

return scipy.sparse.coo_array(
(
self._values,
(np.zeros(len(self._flat_indices), dtype=np.int64), self._flat_indices),
),
shape=(1, self._size),
)

def to_flat_csr(self, dtype, flow=True):
"""Return a flat CSR array of shape (1, size) with sorted indices.

If ``flow=True`` (default), returns the with-flow CSR (size = product of
extents). If ``flow=False``, drops entries that fall in flow bins and
returns a CSR in the no-flow layout (size = product of sizes), with
indices shifted to that layout.
"""
import scipy.sparse

if flow:
target_size = self._size
else:
no_flow_shape = tuple(int(len(ax)) for ax in self._axes)
target_size = int(np.prod(no_flow_shape))

# Use int64 indices when the flat size exceeds the int32 range, since
# scipy.sparse CSR indices default to int32 and would silently overflow.
idx_dtype = np.int64 if target_size > np.iinfo(np.int32).max else np.int32

if flow:
sort_order = np.argsort(self._flat_indices)
sorted_idx = self._flat_indices[sort_order].astype(idx_dtype)
sorted_vals = self._values[sort_order].astype(dtype)
indptr = np.array([0, len(sorted_vals)], dtype=idx_dtype)
return scipy.sparse.csr_array(
(sorted_vals, sorted_idx, indptr), shape=(1, self._size)
)

# No-flow extraction: filter entries in flow bins, shift remaining to
# the no-flow layout.
if len(self._flat_indices) == 0:
indptr = np.array([0, 0], dtype=idx_dtype)
return scipy.sparse.csr_array(
(
np.zeros(0, dtype=dtype),
np.zeros(0, dtype=idx_dtype),
indptr,
),
shape=(1, target_size),
)

multi = np.unravel_index(self._flat_indices, self._dense_shape)
mask = np.ones(len(self._flat_indices), dtype=bool)
for i, ax in enumerate(self._axes):
u = self._underflow_offset(ax)
s = int(len(ax))
mask &= (multi[i] >= u) & (multi[i] < u + s)

shifted = tuple(
multi[i][mask] - self._underflow_offset(ax)
for i, ax in enumerate(self._axes)
)

if len(no_flow_shape) == 1:
new_flat = shifted[0]
else:
new_flat = np.ravel_multi_index(shifted, no_flow_shape)

new_values = self._values[mask]
sort_order = np.argsort(new_flat)
sorted_idx = new_flat[sort_order].astype(idx_dtype)
sorted_vals = new_values[sort_order].astype(dtype)
indptr = np.array([0, len(sorted_vals)], dtype=idx_dtype)
return scipy.sparse.csr_array(
(sorted_vals, sorted_idx, indptr), shape=(1, target_size)
)

def __getitem__(self, slice_dict):
"""Slice along one or more axes by integer index, returning a new SparseHist.

Slice indices are interpreted as regular-bin indices (0..axis.size-1),
matching hist's ``h[{"name": i}]`` convention. The underflow offset is
added internally so the slice maps to the correct position in the
with-flow dense layout.
"""
if not isinstance(slice_dict, dict):
raise TypeError(
f"SparseHist supports only dict-style index slicing, got {type(slice_dict).__name__}"
)

slice_per_axis = {}
for ax_name, ax_idx in slice_dict.items():
try:
ax_pos = next(i for i, a in enumerate(self._axes) if a.name == ax_name)
except StopIteration as ex:
raise KeyError(
f"Axis '{ax_name}' not found in SparseHist axes "
f"{[a.name for a in self._axes]}"
) from ex
ax = self._axes[ax_pos]
slice_per_axis[ax_pos] = int(ax_idx) + self._underflow_offset(ax)

keep_positions = [i for i in range(len(self._axes)) if i not in slice_per_axis]
if not keep_positions:
raise ValueError("Cannot slice all axes of a SparseHist")
axes_keep = [self._axes[i] for i in keep_positions]

# Convert flat indices back to multi-dim
multi = np.unravel_index(self._flat_indices, self._dense_shape)

# Filter entries that match the requested slice
mask = np.ones(len(self._flat_indices), dtype=bool)
for ax_pos, sl in slice_per_axis.items():
mask &= multi[ax_pos] == sl

new_dense_shape = tuple(int(a.extent) for a in axes_keep)
new_size = int(np.prod(new_dense_shape))

if len(keep_positions) == 1:
new_flat = multi[keep_positions[0]][mask]
else:
new_multi = tuple(multi[i][mask] for i in keep_positions)
new_flat = np.ravel_multi_index(new_multi, new_dense_shape)

new_values = self._values[mask]
return SparseHist._from_flat(new_flat, new_values, axes_keep, new_size)