Merge pull request #44 from matthewwardrop/add_poly_transform
Add support for the `poly` transform.
matthewwardrop authored Oct 16, 2021
2 parents 279b97c + 7caa9cd commit 678c916
Showing 8 changed files with 407 additions and 5 deletions.
1 change: 1 addition & 0 deletions docs/basic/grammar.md
@@ -59,6 +59,7 @@ that have *not* been implemented by `formulaic` are explicitly noted also.
| `center(...)` | Shift column data so mean is zero. ||| 🗙 |
| `scale(...)` | Shift column so mean is zero and variance is 1. ||[^7] ||
| `standardize(...)` | Alias of `scale`. | 🗙 || 🗙 |
| `poly(...)` | Generates a polynomial basis, allowing non-linear fits. || 🗙 ||
| `bs(...)` | Generates a B-Spline basis, allowing non-linear fits. ||||
| `cr(...)` | Generates a natural cubic spline basis, allowing non-linear fits. | 🗙 |||
| `cc(...)` | Generates a cyclic cubic spline basis, allowing non-linear fits. | 🗙 |||
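As context for the new `poly(...)` row, here is a minimal sketch of how the transform is used from the formula grammar. The data frame and the printed column labels are illustrative assumptions (the labels follow the default `{name}[{field}]` naming), not verified output:

```python
import pandas
from formulaic import model_matrix

# Hypothetical data; any numeric column can be expanded with `poly`.
df = pandas.DataFrame({"x": [0.1, 0.4, 0.9, 1.6, 2.5]})

# Expands `x` into a degree-3 orthonormal polynomial basis, adding three
# columns to the model matrix alongside the intercept.
mm = model_matrix("poly(x, 3)", df)
print(list(mm.columns))  # expected: ['Intercept', 'poly(x, 3)[1]', 'poly(x, 3)[2]', 'poly(x, 3)[3]']
```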
36 changes: 32 additions & 4 deletions formulaic/materializers/base.py
@@ -5,7 +5,17 @@
import operator
from abc import abstractmethod
from collections import defaultdict, OrderedDict
from typing import Any, Dict, Generator, List, Iterable, Set, Tuple, TYPE_CHECKING
from typing import (
    Any,
    Dict,
    Generator,
    List,
    Iterable,
    Set,
    Tuple,
    Union,
    TYPE_CHECKING,
)

from interface_meta import InterfaceMeta, inherit_docs

@@ -17,6 +27,7 @@
)
from formulaic.materializers.types.factor_values import FactorValuesMetadata
from formulaic.model_matrix import ModelMatrix
from formulaic.utils.cast import as_columns
from formulaic.utils.layered_mapping import LayeredMapping
from formulaic.utils.stateful_transforms import stateful_eval

@@ -465,10 +476,17 @@ def wrapped(values, metadata, state, *args, **kwargs):

return wrapped

# If we need to unpack values into columns, we do this here.
# Otherwise, we pass through the original values.
factor_values = FactorValues(
self._extract_columns_for_encoding(factor),
metadata=factor.metadata,
)

encoder_state = spec.encoder_state.get(factor.expr, [None, {}])[1]
if factor.metadata.kind is Factor.Kind.CATEGORICAL:
encoded = map_dict(self._encode_categorical)(
factor.values,
factor_values,
factor.metadata,
encoder_state,
spec,
@@ -477,11 +495,11 @@ def wrapped(values, metadata, state, *args, **kwargs):
)
elif factor.metadata.kind is Factor.Kind.NUMERICAL:
encoded = map_dict(self._encode_numerical)(
factor.values, factor.metadata, encoder_state, spec, drop_rows
factor_values, factor.metadata, encoder_state, spec, drop_rows
)
elif factor.metadata.kind is Factor.Kind.CONSTANT:
encoded = map_dict(self._encode_constant)(
factor.values, factor.metadata, encoder_state, spec, drop_rows
factor_values, factor.metadata, encoder_state, spec, drop_rows
)
else:
raise FactorEncodingError(
@@ -519,6 +537,16 @@ def wrapped(values, metadata, state, *args, **kwargs):

return self._flatten_encoded_evaled_factor(factor.expr, encoded)

def _extract_columns_for_encoding(
self, factor: EvaluatedFactor
) -> Union[Any, Dict[str, Any]]:
"""
If the incoming factor has values that need to be unpacked into columns
(e.g. a two-dimensional numpy array), do that expansion here. Otherwise,
return the current factor values.
"""
return as_columns(factor.values)

def _flatten_encoded_evaled_factor(
self, name: str, values: FactorValues[dict]
) -> Dict[str, Any]:
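To make the new unpacking step concrete, here is a small sketch of what `_extract_columns_for_encoding` produces for a multi-column factor, using the `column_names` metadata and the `as_columns` helper introduced in the files further down (the array values are arbitrary):

```python
import numpy
from formulaic.materializers.types import FactorValues
from formulaic.utils.cast import as_columns

# A two-column factor value, tagged with column names (as `poly` returns).
values = FactorValues(
    numpy.array([[1.0, 2.0], [3.0, 4.0]]),
    column_names=("1", "2"),
)

# The materializer unpacks this into a dict of columns before encoding, so
# each column is passed to the per-kind encoder independently.
print(list(as_columns(values)))  # ['1', '2']
```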
5 changes: 4 additions & 1 deletion formulaic/materializers/types/factor_values.py
@@ -1,7 +1,7 @@
from __future__ import annotations

from dataclasses import dataclass, replace
from typing import Generic, Optional, TypeVar, Union
from typing import Generic, Optional, Tuple, TypeVar, Union

import wrapt

@@ -31,6 +31,7 @@ class FactorValuesMetadata:
"""

kind: Factor.Kind = Factor.Kind.UNKNOWN
column_names: Optional[Tuple[str]] = None
spans_intercept: bool = False
drop_field: Optional[str] = None
format: str = "{name}[{field}]"
@@ -59,6 +60,7 @@ def __init__(
*,
metadata: FactorValuesMetadata = MISSING,
kind: Union[str, Factor.Kind] = MISSING,
column_names: Tuple[str] = MISSING,
spans_intercept: bool = MISSING,
drop_field: Optional[str] = MISSING,
format: str = MISSING,
@@ -67,6 +69,7 @@
metadata_constructor = FactorValuesMetadata
metadata_kwargs = dict(
kind=Factor.Kind(kind) if kind is not MISSING else kind,
column_names=column_names,
spans_intercept=spans_intercept,
drop_field=drop_field,
format=format,
2 changes: 2 additions & 0 deletions formulaic/transforms/__init__.py
@@ -1,12 +1,14 @@
from .basis_spline import basis_spline
from .identity import identity
from .encode_categorical import encode_categorical
from .poly import poly
from .scale import center, scale


TRANSFORMS = {
"bs": basis_spline,
"center": center,
"poly": poly,
"scale": scale,
"C": encode_categorical,
"I": identity,
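A quick sanity check of the registry wiring (a sketch; nothing beyond the names registered above is assumed):

```python
from formulaic.transforms import TRANSFORMS, poly

# Formula terms like `poly(x, 2)` resolve to the transform registered here.
assert TRANSFORMS["poly"] is poly
```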
111 changes: 111 additions & 0 deletions formulaic/transforms/poly.py
@@ -0,0 +1,111 @@
from formulaic.materializers.types import FactorValues
from formulaic.utils.stateful_transforms import stateful_transform

import numpy
import numpy.typing


@stateful_transform
def poly(
    x: numpy.typing.ArrayLike, degree: int = 1, raw: bool = False, _state=None
) -> numpy.ndarray:
    """
    Generate a basis for a polynomial vector-space representation of `x`.

    The basis vectors returned by this transform can be used, for example, to
    capture non-linear dependence on `x` in a linear regression.

    Args:
        x: The vector for which a polynomial vector space should be generated.
        degree: The degree of the polynomial vector space.
        raw: Whether to return "raw" basis vectors (e.g. `[x, x**2, x**3]`). If
            `False`, an orthonormal set of basis vectors is returned instead
            (see notes below for more information).

    Returns:
        A two-dimensional numpy array with `len(x)` rows and `degree` columns.
        The columns represent the basis vectors of the polynomial vector-space.

    Notes:
        This transform is an implementation of the "three-term recurrence
        relation" for monic orthogonal polynomials. There are many good
        introductions to these recurrence relations, including:
        https://dec41.user.srcf.net/h/IB_L/numerical_analysis/2_3

        Another common approach is QR factorisation, where the columns of Q are
        the orthogonal basis vectors. However, our implementation outperforms
        numpy's QR decomposition, and does not require needless computation of
        the R matrix. It should also be noted that orthogonal polynomial bases
        are unique up to the choice of inner product and scaling, and so all
        methods will result in the same set of polynomials.

        When used as a stateful transform, we retain the coefficients that
        uniquely define the polynomials, and so new data will be evaluated
        against the same polynomial bases as the original dataset. However,
        the polynomial basis will almost certainly *not* be orthogonal for the
        new data, because changing the incoming dataset is equivalent to
        changing your choice of inner product.

        Using orthogonal basis vectors (as compared to the "raw" vectors) allows
        you to increase the degree of the polynomial vector space without
        affecting the coefficients of lower-order components in a linear
        regression. This stability is often attractive during exploratory data
        analysis, but does not otherwise change the results of a linear
        regression.

        `nan` values in `x` are ignored and propagated through to the generated
        polynomials.

        The signature of this transform is intentionally chosen to be compatible
        with R.
    """

    if raw:
        return numpy.stack([numpy.power(x, k) for k in range(1, degree + 1)], axis=1)

    x = numpy.array(x)

    # Check whether we have already generated the alpha and beta coefficients.
    # If not, we enter "training" mode.
    training = False
    alpha = _state.get("alpha")
    norms2 = _state.get("norms2")

    if alpha is None:
        training = True
        alpha = {}
        norms2 = {}

    # Build the polynomials iteratively using the monic three-term recurrence
    # relation. Note that alpha and beta are fixed if not in "training" mode.
    P = numpy.empty((x.shape[0], degree + 1))
    P[:, 0] = 1

    def get_alpha(k):
        if training and k not in alpha:
            alpha[k] = numpy.sum(x * P[:, k] ** 2) / numpy.sum(P[:, k] ** 2)
        return alpha[k]

    def get_norm(k):
        if training and k not in norms2:
            norms2[k] = numpy.sum(P[:, k] ** 2)
        return norms2[k]

    def get_beta(k):
        return get_norm(k) / get_norm(k - 1)

    for i in range(1, degree + 1):
        P[:, i] = (x - get_alpha(i - 1)) * P[:, i - 1]
        if i >= 2:
            P[:, i] -= get_beta(i - 1) * P[:, i - 2]

    # Renormalize so we provide an orthonormal basis.
    P /= numpy.array([numpy.sqrt(get_norm(k)) for k in range(0, degree + 1)])

    if training:
        _state["alpha"] = alpha
        _state["norms2"] = norms2

    # Return the basis, dropping the first (constant) column.
    return FactorValues(
        P[:, 1:], column_names=tuple(str(i) for i in range(1, degree + 1))
    )
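A rough sketch of the behaviour described in the notes: the returned basis is orthonormal on the data it was trained on, and the retained `alpha`/`norms2` state means new data is evaluated against the same polynomials. Passing an explicit `_state` dict to the decorated transform is an assumption about how `stateful_transform` behaves when called outside of formula evaluation:

```python
import numpy
from formulaic.transforms import poly

x = numpy.linspace(0, 1, 20)
state = {}

# "Training" call: alpha and norms2 are computed from `x` and stored in `state`.
P = numpy.asarray(poly(x, degree=3, _state=state))
print(numpy.allclose(P.T @ P, numpy.eye(3)))  # expected: True (orthonormal columns)

# Re-application: the stored recurrence coefficients are reused, so the basis is
# consistent with the training data but generally no longer orthogonal.
P_new = numpy.asarray(poly(numpy.linspace(0.5, 1.5, 20), degree=3, _state=state))
```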
52 changes: 52 additions & 0 deletions formulaic/utils/cast.py
@@ -0,0 +1,52 @@
from functools import singledispatch

from typing import Any

import numpy
import pandas
import scipy.sparse


@singledispatch
def as_columns(data: Any) -> Any:
    """
    Get the columns for `data`. If `data` represents a single column, or is a
    dictionary (the format used to store columns), it is returned as is.
    """
    return data


@as_columns.register
def _(data: pandas.DataFrame):
    return {col: series for col, series in data.items()}


@as_columns.register
def _(data: numpy.ndarray):
    if len(data.shape) == 1:
        return data
    if len(data.shape) > 2:
        raise ValueError(
            "Formulaic does not know how to convert numpy arrays with more than "
            "two dimensions into columns."
        )
    if (
        hasattr(data, "__formulaic_metadata__")
        and data.__formulaic_metadata__.column_names
    ):
        column_names = data.__formulaic_metadata__.column_names
    else:
        column_names = list(range(data.shape[1]))
    return {column_names[i]: data[:, i] for i in range(data.shape[1])}


@as_columns.register
def _(data: scipy.sparse.csc_matrix):
    if (
        hasattr(data, "__formulaic_metadata__")
        and data.__formulaic_metadata__.column_names
    ):
        column_names = data.__formulaic_metadata__.column_names
    else:
        column_names = list(range(data.shape[1]))
    return {column_names[i]: data[:, i] for i in range(data.shape[1])}
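A brief illustration of how `as_columns` treats the main input types (a sketch; the values are arbitrary):

```python
import numpy
import pandas
from formulaic.utils.cast import as_columns

# A pandas DataFrame becomes a dict of its named series.
print(list(as_columns(pandas.DataFrame({"a": [1, 2], "b": [3, 4]}))))  # ['a', 'b']

# A one-dimensional array is a single column and is passed through unchanged.
print(as_columns(numpy.array([1.0, 2.0, 3.0])))

# A two-dimensional array without attached metadata is split into integer-keyed columns.
print(list(as_columns(numpy.array([[1.0, 2.0], [3.0, 4.0]]))))  # [0, 1]
```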