diff --git a/docs/basic/grammar.md b/docs/basic/grammar.md index 243ed2d0..62d3229b 100644 --- a/docs/basic/grammar.md +++ b/docs/basic/grammar.md @@ -59,6 +59,7 @@ that have *not* been implemented by `formualaic` are explicitly noted also. | `center(...)` | Shift column data so mean is zero. | ✓ | ✓ | 🗙 | | `scale(...)` | Shift column so mean is zero and variance is 1. | ✓ | ✓[^7] | ✓ | | `standardize(...)` | Alias of `scale`. | 🗙 | ✓ | 🗙 | +| `poly(...)` | Generates a polynomial basis, allowing non-linear fits. | ✓ | 🗙 | ✓ | | `bs(...)` | Generates a B-Spline basis, allowing non-linear fits. | ✓ | ✓ | ✓ | | `cr(...)` | Generates a natural cubic spline basis, allowing non-linear fits. | 🗙 | ✓ | ✓ | | `cc(...)` | Generates a cyclic cubic spline basis, allowing non-linear fits. | 🗙 | ✓ | ✓ | diff --git a/formulaic/materializers/base.py b/formulaic/materializers/base.py index 877ffb10..c502ebcb 100644 --- a/formulaic/materializers/base.py +++ b/formulaic/materializers/base.py @@ -5,7 +5,17 @@ import operator from abc import abstractmethod from collections import defaultdict, OrderedDict -from typing import Any, Dict, Generator, List, Iterable, Set, Tuple, TYPE_CHECKING +from typing import ( + Any, + Dict, + Generator, + List, + Iterable, + Set, + Tuple, + Union, + TYPE_CHECKING, +) from interface_meta import InterfaceMeta, inherit_docs @@ -17,6 +27,7 @@ ) from formulaic.materializers.types.factor_values import FactorValuesMetadata from formulaic.model_matrix import ModelMatrix +from formulaic.utils.cast import as_columns from formulaic.utils.layered_mapping import LayeredMapping from formulaic.utils.stateful_transforms import stateful_eval @@ -465,10 +476,17 @@ def wrapped(values, metadata, state, *args, **kwargs): return wrapped + # If we need to unpack values into columns, we do this here. + # Otherwise, we pass through the original values. + factor_values = FactorValues( + self._extract_columns_for_encoding(factor), + metadata=factor.metadata, + ) + encoder_state = spec.encoder_state.get(factor.expr, [None, {}])[1] if factor.metadata.kind is Factor.Kind.CATEGORICAL: encoded = map_dict(self._encode_categorical)( - factor.values, + factor_values, factor.metadata, encoder_state, spec, @@ -477,11 +495,11 @@ def wrapped(values, metadata, state, *args, **kwargs): ) elif factor.metadata.kind is Factor.Kind.NUMERICAL: encoded = map_dict(self._encode_numerical)( - factor.values, factor.metadata, encoder_state, spec, drop_rows + factor_values, factor.metadata, encoder_state, spec, drop_rows ) elif factor.metadata.kind is Factor.Kind.CONSTANT: encoded = map_dict(self._encode_constant)( - factor.values, factor.metadata, encoder_state, spec, drop_rows + factor_values, factor.metadata, encoder_state, spec, drop_rows ) else: raise FactorEncodingError( @@ -519,6 +537,16 @@ def wrapped(values, metadata, state, *args, **kwargs): return self._flatten_encoded_evaled_factor(factor.expr, encoded) + def _extract_columns_for_encoding( + self, factor: EvaluatedFactor + ) -> Union[Any, Dict[str, Any]]: + """ + If incoming factor has values that need to be unpacked into columns + (e.g. a two-dimensions numpy array), do that expansion here. Otherwise, + return the current factor values. + """ + return as_columns(factor.values) + def _flatten_encoded_evaled_factor( self, name: str, values: FactorValues[dict] ) -> Dict[str, Any]: diff --git a/formulaic/materializers/types/factor_values.py b/formulaic/materializers/types/factor_values.py index a1e5a996..50dae75d 100644 --- a/formulaic/materializers/types/factor_values.py +++ b/formulaic/materializers/types/factor_values.py @@ -1,7 +1,7 @@ from __future__ import annotations from dataclasses import dataclass, replace -from typing import Generic, Optional, TypeVar, Union +from typing import Generic, Optional, Tuple, TypeVar, Union import wrapt @@ -31,6 +31,7 @@ class FactorValuesMetadata: """ kind: Factor.Kind = Factor.Kind.UNKNOWN + column_names: Optional[Tuple[str]] = None spans_intercept: bool = False drop_field: Optional[str] = None format: str = "{name}[{field}]" @@ -59,6 +60,7 @@ def __init__( *, metadata: FactorValuesMetadata = MISSING, kind: Union[str, Factor.Kind] = MISSING, + column_names: Tuple[str] = MISSING, spans_intercept: bool = MISSING, drop_field: Optional[str] = MISSING, format: str = MISSING, @@ -67,6 +69,7 @@ def __init__( metadata_constructor = FactorValuesMetadata metadata_kwargs = dict( kind=Factor.Kind(kind) if kind is not MISSING else kind, + column_names=column_names, spans_intercept=spans_intercept, drop_field=drop_field, format=format, diff --git a/formulaic/transforms/__init__.py b/formulaic/transforms/__init__.py index a54a527f..3f2d5259 100644 --- a/formulaic/transforms/__init__.py +++ b/formulaic/transforms/__init__.py @@ -1,12 +1,14 @@ from .basis_spline import basis_spline from .identity import identity from .encode_categorical import encode_categorical +from .poly import poly from .scale import center, scale TRANSFORMS = { "bs": basis_spline, "center": center, + "poly": poly, "scale": scale, "C": encode_categorical, "I": identity, diff --git a/formulaic/transforms/poly.py b/formulaic/transforms/poly.py new file mode 100644 index 00000000..62a6c13e --- /dev/null +++ b/formulaic/transforms/poly.py @@ -0,0 +1,111 @@ +from formulaic.materializers.types import FactorValues +from formulaic.utils.stateful_transforms import stateful_transform + +import numpy +import numpy.typing + + +@stateful_transform +def poly( + x: numpy.typing.ArrayLike, degree: int = 1, raw: bool = False, _state=None +) -> numpy.ndarray: + """ + Generate a basis for a polynomial vector-space representation of `x`. + + The basis vectors returned by this transform can be used, for example, to + capture non-linear dependence on `x` in a linear regression. + + Args: + x: The vector for which a polynomial vector space should be generated. + degree: The degree of the polynomial vector space. + raw: Whether to return "raw" basis vectors (e.g. `[x, x**2, x**3]`). If + `False`, an orthonormal set of basis vectors is returned instead + (see notes below for more information). + + Returns: + A two-dimensional numpy array with `len(x)` rows, and `degree` columns. + The columns represent the basis vectors of the polynomial vector-space. + + Notes: + This transform is an implementation of the "three-term recurrence + relation" for monic orthogonal polynomials. There are many good + introductions to these recurrence relations, including: + https://dec41.user.srcf.net/h/IB_L/numerical_analysis/2_3 + Another common approach is QR factorisation, where the columns of Q are + the orthogonal basis vectors. However, our implementation outperforms + numpy's QR decomposition, and does not require needless computation of + the R matrix. It should also be noted that orthogonal polynomial bases + are unique up to the choice of inner-product and scaling, and so all + methods will result in the same set of polynomials. + + When used as a stateful transform, we retain the coefficients that + uniquely define the polynomials; and so new data will be evaluated + against the same polynomial bases as the original dataset. However, + the polynomial basis will almost certainly *not* be orthogonal for the + new data. This is because changing the incoming dataset is equivalent to + changing your choice of inner product. + + Using orthogonal basis vectors (as compared to the "raw" vectors) allows + you to increase the degree of the polynomial vector space without + affecting the coefficients of lower-order components in a linear + regression. This stability is often attractive during exploratory data + analysis, but does not otherwise change the results of a linear + regression. + + `nan` values in `x` will be ignored and progagated through to generated + polynomials. + + The signature of this transform is intentionally chosen to be compatible + with R. + """ + + if raw: + return numpy.stack([numpy.power(x, k) for k in range(1, degree + 1)], axis=1) + + x = numpy.array(x) + + # Check if we already have already generated the alpha and beta coefficients. + # If not, we enter "training" mode. + training = False + alpha = _state.get("alpha") + norms2 = _state.get("norms2") + + if alpha is None: + training = True + alpha = {} + norms2 = {} + + # Build polynomials iteratively using the monic three-term recurrence relation + # Note that alpha and beta are fixed if not in "training" mode. + P = numpy.empty((x.shape[0], degree + 1)) + P[:, 0] = 1 + + def get_alpha(k): + if training and k not in alpha: + alpha[k] = numpy.sum(x * P[:, k] ** 2) / numpy.sum(P[:, k] ** 2) + return alpha[k] + + def get_norm(k): + if training and k not in norms2: + norms2[k] = numpy.sum(P[:, k] ** 2) + return norms2[k] + + def get_beta(k): + return get_norm(k) / get_norm(k - 1) + + for i in range(1, degree + 1): + P[:, i] = (x - get_alpha(i - 1)) * P[:, i - 1] + if i >= 2: + P[:, i] -= get_beta(i - 1) * P[:, i - 2] + + # Renormalize so we provide an orthonormal basis. + P /= numpy.array([numpy.sqrt(get_norm(k)) for k in range(0, degree + 1)]) + + if training: + _state["alpha"] = alpha + _state["norms2"] = norms2 + + # Return basis dropping the first (constant) column + return FactorValues( + P[:, 1:], column_names=tuple(str(i) for i in range(1, degree + 1)) + ) diff --git a/formulaic/utils/cast.py b/formulaic/utils/cast.py new file mode 100644 index 00000000..8840e42b --- /dev/null +++ b/formulaic/utils/cast.py @@ -0,0 +1,52 @@ +from functools import singledispatch + +from typing import Any + +import numpy +import pandas +import scipy.sparse + + +@singledispatch +def as_columns(data: Any) -> Any: + """ + Get the columns for `data`. If `data` represents a single column, or is a + dictionary (the format used to store columns), it is returned as is. + """ + return data + + +@as_columns.register +def _(data: pandas.DataFrame): + return {col: series for col, series in data.items()} + + +@as_columns.register +def _(data: numpy.ndarray): + if len(data.shape) == 1: + return data + if len(data.shape) > 2: + raise ValueError( + "Formulaic does not know how to convert numpy arrays with more than " + "two dimensions into columns." + ) + if ( + hasattr(data, "__formulaic_metadata__") + and data.__formulaic_metadata__.column_names + ): + column_names = data.__formulaic_metadata__.column_names + else: + column_names = list(range(data.shape[1])) + return {column_names[i]: data[:, i] for i in range(data.shape[1])} + + +@as_columns.register +def _(data: scipy.sparse.csc_matrix): + if ( + hasattr(data, "__formulaic_metadata__") + and data.__formulaic_metadata__.column_names + ): + column_names = data.__formulaic_metadata__.column_names + else: + column_names = list(range(data.shape[1])) + return {column_names[i]: data[:, i] for i in range(data.shape[1])} diff --git a/tests/transforms/test_poly.py b/tests/transforms/test_poly.py new file mode 100644 index 00000000..d89d17fd --- /dev/null +++ b/tests/transforms/test_poly.py @@ -0,0 +1,155 @@ +import numpy +import pytest + +from formulaic.transforms.poly import poly + + +class TestPoly: + @pytest.fixture(scope="session") + def data(self): + return numpy.linspace(0, 1, 21) + + def test_basic(self, data): + state = {} + V = poly(data, _state=state) + + assert V.shape[1] == 1 + + # Comparison data copied from R output of: + # > data = seq(from=0, to=1, by=0.05) + # > poly(data) + r_reference = [ + -3.603750e-01, + -3.243375e-01, + -2.883000e-01, + -2.522625e-01, + -2.162250e-01, + -1.801875e-01, + -1.441500e-01, + -1.081125e-01, + -7.207500e-02, + -3.603750e-02, + -3.000725e-17, + 3.603750e-02, + 7.207500e-02, + 1.081125e-01, + 1.441500e-01, + 1.801875e-01, + 2.162250e-01, + 2.522625e-01, + 2.883000e-01, + 3.243375e-01, + 3.603750e-01, + ] + + assert numpy.allclose( + V[:, 0], + r_reference, + ) + + assert pytest.approx(state["alpha"], {0: 0.5}) + assert pytest.approx(state["norms2"], {0: 21.0, 2: 1.925}) + + assert numpy.allclose( + poly(data, _state=state)[:, 0], + r_reference, + ) + + def test_degree(self, data): + state = {} + V = poly(data, degree=3, _state=state) + + assert V.shape[1] == 3 + + # Comparison data copied from R output of: + # > data = seq(from=0, to=1, by=0.05) + # > poly(data, 3) + r_reference = numpy.array( + [ + [-3.603750e-01, 0.42285541, -4.332979e-01], + [-3.243375e-01, 0.29599879, -1.733191e-01], + [-2.883000e-01, 0.18249549, 1.824412e-02], + [-2.522625e-01, 0.08234553, 1.489937e-01], + [-2.162250e-01, -0.00445111, 2.265312e-01], + [-1.801875e-01, -0.07789442, 2.584584e-01], + [-1.441500e-01, -0.13798440, 2.523770e-01], + [-1.081125e-01, -0.18472105, 2.158888e-01], + [-7.207500e-02, -0.21810437, 1.565954e-01], + [-3.603750e-02, -0.23813436, 8.209854e-02], + [-3.000725e-17, -0.24481103, -4.395626e-17], + [3.603750e-02, -0.23813436, -8.209854e-02], + [7.207500e-02, -0.21810437, -1.565954e-01], + [1.081125e-01, -0.18472105, -2.158888e-01], + [1.441500e-01, -0.13798440, -2.523770e-01], + [1.801875e-01, -0.07789442, -2.584584e-01], + [2.162250e-01, -0.00445111, -2.265312e-01], + [2.522625e-01, 0.08234553, -1.489937e-01], + [2.883000e-01, 0.18249549, -1.824412e-02], + [3.243375e-01, 0.29599879, 1.733191e-01], + [3.603750e-01, 0.42285541, 4.332979e-01], + ] + ) + + assert numpy.allclose( + V, + r_reference, + ) + + assert pytest.approx(state["alpha"], {0: 0.5, 1: 0.5, 2: 0.5}) + assert pytest.approx( + state["norms2"], {1: 0.09166666666666667, 2: 0.07283333333333333} + ) + + assert numpy.allclose( + poly(data, degree=3, _state=state), + r_reference, + ) + + def test_reuse_state(self, data): + state = {} + V = poly(data, degree=3, _state=state) # as tested above + + # Reuse state but with different data + V = poly(data ** 2, degree=3, _state=state) + + assert V.shape[1] == 3 + + # Comparison data copied from R output of: + # > data = seq(from=0, to=1, by=0.05) + # > coefs = attr(poly(data, 3), 'coefs') + # > poly(data^2, 3, coefs=coefs) + r_reference = numpy.array( + [ + [-0.36037499, 0.422855413, -0.43329786], + [-0.35857311, 0.416195441, -0.41855671], + [-0.35316749, 0.396415822, -0.37546400], + [-0.34415811, 0.364117458, -0.30735499], + [-0.33154499, 0.320301848, -0.21959840], + [-0.31532811, 0.266371091, -0.11931132], + [-0.29550749, 0.204127887, -0.01496018], + [-0.27208311, 0.135775535, 0.08415243], + [-0.24505499, 0.063917934, 0.16851486], + [-0.21442312, -0.008440417, 0.22914758], + [-0.18018749, -0.077894418, 0.25845837], + [-0.14234812, -0.140638372, 0.25121156], + [-0.10090500, -0.192465980, 0.20561124], + [-0.05585812, -0.228770342, 0.12449854], + [-0.00720750, -0.244543962, 0.01666296], + [0.04504687, -0.234378741, -0.10173235], + [0.10090500, -0.192465980, -0.20561124], + [0.16036687, -0.112596382, -0.25933114], + [0.22343249, 0.011839952, -0.21491574], + [0.29010186, 0.187853517, -0.01017347], + [0.36037499, 0.422855413, 0.43329786], + ] + ) + + assert numpy.allclose( + V, + r_reference, + ) + + def test_raw(self, data): + assert numpy.allclose( + poly(data, 3, raw=True), numpy.array([data, data ** 2, data ** 3]).T + ) diff --git a/tests/utils/test_cast.py b/tests/utils/test_cast.py new file mode 100644 index 00000000..52757da2 --- /dev/null +++ b/tests/utils/test_cast.py @@ -0,0 +1,50 @@ +import numpy +import pandas +import pytest +import scipy.sparse + +from formulaic import FactorValues +from formulaic.utils.cast import as_columns + + +def test_as_columns(): + assert as_columns(1) == 1 + assert as_columns([1, 2, 3, 4]) == [1, 2, 3, 4] + + # Check pandas types + series = pandas.Series([1, 2, 3]) + assert as_columns(series) is series + assert list( + as_columns(pandas.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})).keys() + ) == ["a", "b"] + + # Check numpy types + assert numpy.all(as_columns(numpy.array([1, 2, 3])) == numpy.array([1, 2, 3])) + assert numpy.all( + as_columns(numpy.array([1, 2, 3]).reshape((-1, 1)))[0] == numpy.array([1, 2, 3]) + ) + assert numpy.all( + as_columns( + FactorValues(numpy.array([1, 2, 3]).reshape((-1, 1)), column_names=("a")) + )["a"] + == numpy.array([1, 2, 3]) + ) + + with pytest.raises( + ValueError, + match="Formulaic does not know how to convert numpy arrays with more than two dimensions into columns.", + ): + as_columns(numpy.array([1, 2, 3]).reshape((1, 1, 3))) + + # Check sparse types + assert numpy.all( + as_columns(scipy.sparse.csc_matrix([[1, 2, 3]]))[0] == numpy.array([1]) + ) + assert numpy.all( + as_columns( + FactorValues( + scipy.sparse.csc_matrix([[1, 2, 3]]), column_names=("a", "b", "c") + ) + )["a"] + == numpy.array([1]) + )