Merge pull request #44 from matthewwardrop/add_poly_transform
Add support for the `poly` transform.
matthewwardrop authored Oct 16, 2021
2 parents 279b97c + 7caa9cd commit 678c916
Showing 8 changed files with 407 additions and 5 deletions.
1 change: 1 addition & 0 deletions docs/basic/grammar.md
@@ -59,6 +59,7 @@ that have *not* been implemented by `formulaic` are explicitly noted also.
| `center(...)` | Shift column data so mean is zero. ||| 🗙 |
| `scale(...)` | Shift column so mean is zero and variance is 1. ||[^7] ||
| `standardize(...)` | Alias of `scale`. | 🗙 || 🗙 |
| `poly(...)` | Generates a polynomial basis, allowing non-linear fits. || 🗙 ||
| `bs(...)` | Generates a B-Spline basis, allowing non-linear fits. ||||
| `cr(...)` | Generates a natural cubic spline basis, allowing non-linear fits. | 🗙 |||
| `cc(...)` | Generates a cyclic cubic spline basis, allowing non-linear fits. | 🗙 |||
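As context for the new `poly(...)` row, here is a minimal sketch of how the transform is used from the formula grammar. The data frame and the printed column labels are illustrative assumptions (the labels follow the default `{name}[{field}]` naming), not verified output:

```python
import pandas
from formulaic import model_matrix

# Hypothetical data; any numeric column can be expanded with `poly`.
df = pandas.DataFrame({"x": [0.1, 0.4, 0.9, 1.6, 2.5]})

# Expands `x` into a degree-3 orthonormal polynomial basis, adding three
# columns to the model matrix alongside the intercept.
mm = model_matrix("poly(x, 3)", df)
print(list(mm.columns))  # expected: ['Intercept', 'poly(x, 3)[1]', 'poly(x, 3)[2]', 'poly(x, 3)[3]']
```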
36 changes: 32 additions & 4 deletions formulaic/materializers/base.py
@@ -5,7 +5,17 @@
import operator
from abc import abstractmethod
from collections import defaultdict, OrderedDict
from typing import Any, Dict, Generator, List, Iterable, Set, Tuple, TYPE_CHECKING
from typing import (
    Any,
    Dict,
    Generator,
    List,
    Iterable,
    Set,
    Tuple,
    Union,
    TYPE_CHECKING,
)

from interface_meta import InterfaceMeta, inherit_docs

@@ -17,6 +27,7 @@
)
from formulaic.materializers.types.factor_values import FactorValuesMetadata
from formulaic.model_matrix import ModelMatrix
from formulaic.utils.cast import as_columns
from formulaic.utils.layered_mapping import LayeredMapping
from formulaic.utils.stateful_transforms import stateful_eval

@@ -465,10 +476,17 @@ def wrapped(values, metadata, state, *args, **kwargs):

return wrapped

# If we need to unpack values into columns, we do this here.
# Otherwise, we pass through the original values.
factor_values = FactorValues(
self._extract_columns_for_encoding(factor),
metadata=factor.metadata,
)

encoder_state = spec.encoder_state.get(factor.expr, [None, {}])[1]
if factor.metadata.kind is Factor.Kind.CATEGORICAL:
encoded = map_dict(self._encode_categorical)(
factor.values,
factor_values,
factor.metadata,
encoder_state,
spec,
@@ -477,11 +495,11 @@ def wrapped(values, metadata, state, *args, **kwargs):
)
elif factor.metadata.kind is Factor.Kind.NUMERICAL:
encoded = map_dict(self._encode_numerical)(
factor.values, factor.metadata, encoder_state, spec, drop_rows
factor_values, factor.metadata, encoder_state, spec, drop_rows
)
elif factor.metadata.kind is Factor.Kind.CONSTANT:
encoded = map_dict(self._encode_constant)(
factor.values, factor.metadata, encoder_state, spec, drop_rows
factor_values, factor.metadata, encoder_state, spec, drop_rows
)
else:
raise FactorEncodingError(
@@ -519,6 +537,16 @@ def wrapped(values, metadata, state, *args, **kwargs):

return self._flatten_encoded_evaled_factor(factor.expr, encoded)

def _extract_columns_for_encoding(
self, factor: EvaluatedFactor
) -> Union[Any, Dict[str, Any]]:
"""
If the incoming factor has values that need to be unpacked into columns
(e.g. a two-dimensional numpy array), do that expansion here. Otherwise,
return the current factor values.
"""
return as_columns(factor.values)

def _flatten_encoded_evaled_factor(
self, name: str, values: FactorValues[dict]
) -> Dict[str, Any]:
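To make the new unpacking step concrete, here is a small sketch of what `_extract_columns_for_encoding` produces for a multi-column factor, using the `column_names` metadata and the `as_columns` helper introduced in the files further down (the array values are arbitrary):

```python
import numpy
from formulaic.materializers.types import FactorValues
from formulaic.utils.cast import as_columns

# A two-column factor value, tagged with column names (as `poly` returns).
values = FactorValues(
    numpy.array([[1.0, 2.0], [3.0, 4.0]]),
    column_names=("1", "2"),
)

# The materializer unpacks this into a dict of columns before encoding, so
# each column is passed to the per-kind encoder independently.
print(list(as_columns(values)))  # ['1', '2']
```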
5 changes: 4 additions & 1 deletion formulaic/materializers/types/factor_values.py
@@ -1,7 +1,7 @@
from __future__ import annotations

from dataclasses import dataclass, replace
from typing import Generic, Optional, TypeVar, Union
from typing import Generic, Optional, Tuple, TypeVar, Union

import wrapt

@@ -31,6 +31,7 @@ class FactorValuesMetadata:
"""

kind: Factor.Kind = Factor.Kind.UNKNOWN
column_names: Optional[Tuple[str]] = None
spans_intercept: bool = False
drop_field: Optional[str] = None
format: str = "{name}[{field}]"
@@ -59,6 +60,7 @@ def __init__(
*,
metadata: FactorValuesMetadata = MISSING,
kind: Union[str, Factor.Kind] = MISSING,
column_names: Tuple[str] = MISSING,
spans_intercept: bool = MISSING,
drop_field: Optional[str] = MISSING,
format: str = MISSING,
@@ -67,6 +69,7 @@
metadata_constructor = FactorValuesMetadata
metadata_kwargs = dict(
kind=Factor.Kind(kind) if kind is not MISSING else kind,
column_names=column_names,
spans_intercept=spans_intercept,
drop_field=drop_field,
format=format,
2 changes: 2 additions & 0 deletions formulaic/transforms/__init__.py
@@ -1,12 +1,14 @@
from .basis_spline import basis_spline
from .identity import identity
from .encode_categorical import encode_categorical
from .poly import poly
from .scale import center, scale


TRANSFORMS = {
"bs": basis_spline,
"center": center,
"poly": poly,
"scale": scale,
"C": encode_categorical,
"I": identity,
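A quick sanity check of the registry wiring (a sketch; nothing beyond the names registered above is assumed):

```python
from formulaic.transforms import TRANSFORMS, poly

# Formula terms like `poly(x, 2)` resolve to the transform registered here.
assert TRANSFORMS["poly"] is poly
```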
111 changes: 111 additions & 0 deletions formulaic/transforms/poly.py
@@ -0,0 +1,111 @@
from formulaic.materializers.types import FactorValues
from formulaic.utils.stateful_transforms import stateful_transform

import numpy
import numpy.typing


@stateful_transform
def poly(
    x: numpy.typing.ArrayLike, degree: int = 1, raw: bool = False, _state=None
) -> numpy.ndarray:
    """
    Generate a basis for a polynomial vector-space representation of `x`.

    The basis vectors returned by this transform can be used, for example, to
    capture non-linear dependence on `x` in a linear regression.

    Args:
        x: The vector for which a polynomial vector space should be generated.
        degree: The degree of the polynomial vector space.
        raw: Whether to return "raw" basis vectors (e.g. `[x, x**2, x**3]`). If
            `False`, an orthonormal set of basis vectors is returned instead
            (see notes below for more information).

    Returns:
        A two-dimensional numpy array with `len(x)` rows and `degree` columns.
        The columns represent the basis vectors of the polynomial vector-space.

    Notes:
        This transform is an implementation of the "three-term recurrence
        relation" for monic orthogonal polynomials. There are many good
        introductions to these recurrence relations, including:
        https://dec41.user.srcf.net/h/IB_L/numerical_analysis/2_3

        Another common approach is QR factorisation, where the columns of Q are
        the orthogonal basis vectors. However, our implementation outperforms
        numpy's QR decomposition, and does not require needless computation of
        the R matrix. It should also be noted that orthogonal polynomial bases
        are unique up to the choice of inner product and scaling, and so all
        methods will result in the same set of polynomials.

        When used as a stateful transform, we retain the coefficients that
        uniquely define the polynomials, and so new data will be evaluated
        against the same polynomial bases as the original dataset. However,
        the polynomial basis will almost certainly *not* be orthogonal for the
        new data, because changing the incoming dataset is equivalent to
        changing your choice of inner product.

        Using orthogonal basis vectors (as compared to the "raw" vectors) allows
        you to increase the degree of the polynomial vector space without
        affecting the coefficients of lower-order components in a linear
        regression. This stability is often attractive during exploratory data
        analysis, but does not otherwise change the results of a linear
        regression.

        `nan` values in `x` are ignored and propagated through to the generated
        polynomials.

        The signature of this transform is intentionally chosen to be compatible
        with R.
    """

    if raw:
        return numpy.stack([numpy.power(x, k) for k in range(1, degree + 1)], axis=1)

    x = numpy.array(x)

    # Check whether we have already generated the alpha and beta coefficients.
    # If not, we enter "training" mode.
    training = False
    alpha = _state.get("alpha")
    norms2 = _state.get("norms2")

    if alpha is None:
        training = True
        alpha = {}
        norms2 = {}

    # Build the polynomials iteratively using the monic three-term recurrence
    # relation. Note that alpha and beta are fixed if not in "training" mode.
    P = numpy.empty((x.shape[0], degree + 1))
    P[:, 0] = 1

    def get_alpha(k):
        if training and k not in alpha:
            alpha[k] = numpy.sum(x * P[:, k] ** 2) / numpy.sum(P[:, k] ** 2)
        return alpha[k]

    def get_norm(k):
        if training and k not in norms2:
            norms2[k] = numpy.sum(P[:, k] ** 2)
        return norms2[k]

    def get_beta(k):
        return get_norm(k) / get_norm(k - 1)

    for i in range(1, degree + 1):
        P[:, i] = (x - get_alpha(i - 1)) * P[:, i - 1]
        if i >= 2:
            P[:, i] -= get_beta(i - 1) * P[:, i - 2]

    # Renormalize so we provide an orthonormal basis.
    P /= numpy.array([numpy.sqrt(get_norm(k)) for k in range(0, degree + 1)])

    if training:
        _state["alpha"] = alpha
        _state["norms2"] = norms2

    # Return the basis, dropping the first (constant) column.
    return FactorValues(
        P[:, 1:], column_names=tuple(str(i) for i in range(1, degree + 1))
    )
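A rough sketch of the behaviour described in the notes: the returned basis is orthonormal on the data it was trained on, and the retained `alpha`/`norms2` state means new data is evaluated against the same polynomials. Passing an explicit `_state` dict to the decorated transform is an assumption about how `stateful_transform` behaves when called outside of formula evaluation:

```python
import numpy
from formulaic.transforms import poly

x = numpy.linspace(0, 1, 20)
state = {}

# "Training" call: alpha and norms2 are computed from `x` and stored in `state`.
P = numpy.asarray(poly(x, degree=3, _state=state))
print(numpy.allclose(P.T @ P, numpy.eye(3)))  # expected: True (orthonormal columns)

# Re-application: the stored recurrence coefficients are reused, so the basis is
# consistent with the training data but generally no longer orthogonal.
P_new = numpy.asarray(poly(numpy.linspace(0.5, 1.5, 20), degree=3, _state=state))
```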
52 changes: 52 additions & 0 deletions formulaic/utils/cast.py
@@ -0,0 +1,52 @@
from functools import singledispatch

from typing import Any

import numpy
import pandas
import scipy.sparse


@singledispatch
def as_columns(data: Any) -> Any:
    """
    Get the columns for `data`. If `data` represents a single column, or is a
    dictionary (the format used to store columns), it is returned as is.
    """
    return data


@as_columns.register
def _(data: pandas.DataFrame):
    return {col: series for col, series in data.items()}


@as_columns.register
def _(data: numpy.ndarray):
    if len(data.shape) == 1:
        return data
    if len(data.shape) > 2:
        raise ValueError(
            "Formulaic does not know how to convert numpy arrays with more than "
            "two dimensions into columns."
        )
    if (
        hasattr(data, "__formulaic_metadata__")
        and data.__formulaic_metadata__.column_names
    ):
        column_names = data.__formulaic_metadata__.column_names
    else:
        column_names = list(range(data.shape[1]))
    return {column_names[i]: data[:, i] for i in range(data.shape[1])}


@as_columns.register
def _(data: scipy.sparse.csc_matrix):
    if (
        hasattr(data, "__formulaic_metadata__")
        and data.__formulaic_metadata__.column_names
    ):
        column_names = data.__formulaic_metadata__.column_names
    else:
        column_names = list(range(data.shape[1]))
    return {column_names[i]: data[:, i] for i in range(data.shape[1])}
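A brief illustration of how `as_columns` treats the main input types (a sketch; the values are arbitrary):

```python
import numpy
import pandas
from formulaic.utils.cast import as_columns

# A pandas DataFrame becomes a dict of its named series.
print(list(as_columns(pandas.DataFrame({"a": [1, 2], "b": [3, 4]}))))  # ['a', 'b']

# A one-dimensional array is a single column and is passed through unchanged.
print(as_columns(numpy.array([1.0, 2.0, 3.0])))

# A two-dimensional array without attached metadata is split into integer-keyed columns.
print(list(as_columns(numpy.array([[1.0, 2.0], [3.0, 4.0]]))))  # [0, 1]
```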