-
Notifications
You must be signed in to change notification settings - Fork 27
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
279b97c
commit dc6e38d
Showing
4 changed files
with
266 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
from formulaic.utils.stateful_transforms import stateful_transform | ||
|
||
import numpy | ||
import numpy.typing | ||
|
||
|
||
@stateful_transform | ||
def poly( | ||
x: numpy.typing.ArrayLike, degree: int = 1, raw: bool = False, _state=None | ||
) -> numpy.ndarray: | ||
""" | ||
Generate a basis for a polynomial vector-space representation of `x`. | ||
The basis vectors returned by this transform can be used, for example, to | ||
capture non-linear dependence on `x` in a linear regression. | ||
Args: | ||
x: The vector for which a polynomial vector space should be generated. | ||
degree: The degree of the polynomial vector space. | ||
raw: Whether to return "raw" basis vectors (e.g. `[x, x**2, x**3]`). If | ||
`False`, an orthonormal set of basis vectors is returned instead | ||
(see notes below for more information). | ||
Returns: | ||
A two-dimensional numpy array with `len(x)` rows, and `degree` columns. | ||
The columns represent the basis vectors of the polynomial vector-space. | ||
Notes: | ||
This transform is an implementation of the "three-term recurrence | ||
relation" for monic orthogonal polynomials. There are many good | ||
introductions to these recurrence relations, including: | ||
https://dec41.user.srcf.net/h/IB_L/numerical_analysis/2_3 | ||
Another common approach is QR factorisation, where the columns of Q are | ||
the orthogonal basis vectors. However, our implementation outperforms | ||
numpy's QR decomposition, and does not require needless computation of | ||
the R matrix. It should also be noted that orthogonal polynomial bases | ||
are unique up to the choice of inner-product and scaling, and so all | ||
methods will result in the same set of polynomials. | ||
When used as a stateful transform, we retain the coefficients that | ||
uniquely define the polynomials; and so new data will be evaluated | ||
against the same polynomial bases as the original dataset. However, | ||
the polynomial basis will almost certainly *not* be orthogonal for the | ||
new data. This is because changing the incoming dataset is equivalent to | ||
changing your choice of inner product. | ||
Using orthogonal basis vectors (as compared to the "raw" vectors) allows | ||
you to increase the degree of the polynomial vector space without | ||
affecting the coefficients of lower-order components in a linear | ||
regression. This stability is often attractive during exploratory data | ||
analysis, but does not otherwise change the results of a linear | ||
regression. | ||
`nan` values in `x` will be ignored and progagated through to generated | ||
polynomials. | ||
The signature of this transform is intentionally chosen to be compatible | ||
with R. | ||
""" | ||
|
||
if raw: | ||
return numpy.stack([numpy.power(x, k) for k in range(1, degree + 1)], axis=1) | ||
|
||
x = numpy.array(x) | ||
|
||
# Check if we already have already generated the alpha and beta coefficients. | ||
# If not, we enter "training" mode. | ||
training = False | ||
alpha = _state.get("alpha") | ||
norms2 = _state.get("norms2") | ||
|
||
if alpha is None: | ||
training = True | ||
alpha = {} | ||
norms2 = {} | ||
|
||
# Build polynomials iteratively using the monic three-term recurrence relation | ||
# Note that alpha and beta are fixed if not in "training" mode. | ||
P = numpy.empty((x.shape[0], degree + 1)) | ||
P[:, 0] = 1 | ||
|
||
def get_alpha(k): | ||
if training and k not in alpha: | ||
alpha[k] = numpy.sum(x * P[:, k] ** 2) / numpy.sum(P[:, k] ** 2) | ||
return alpha[k] | ||
|
||
def get_norm(k): | ||
if training and k not in norms2: | ||
norms2[k] = numpy.sum(P[:, k] ** 2) | ||
return norms2[k] | ||
|
||
def get_beta(k): | ||
return get_norm(k) / get_norm(k - 1) | ||
|
||
for i in range(1, degree + 1): | ||
P[:, i] = (x - get_alpha(i - 1)) * P[:, i - 1] | ||
if i >= 2: | ||
P[:, i] -= get_beta(i - 1) * P[:, i - 2] | ||
|
||
# Renormalize so we provide an orthonormal basis. | ||
P /= numpy.array([numpy.sqrt(get_norm(k)) for k in range(0, degree + 1)]) | ||
|
||
if training: | ||
_state["alpha"] = alpha | ||
_state["norms2"] = norms2 | ||
|
||
# Return basis dropping the first (constant) column | ||
return P[:, 1:] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,155 @@ | ||
import numpy | ||
import pytest | ||
|
||
from formulaic.transforms.poly import poly | ||
|
||
|
||
class TestPoly: | ||
@pytest.fixture(scope="session") | ||
def data(self): | ||
return numpy.linspace(0, 1, 21) | ||
|
||
def test_basic(self, data): | ||
state = {} | ||
V = poly(data, _state=state) | ||
|
||
assert V.shape[1] == 1 | ||
|
||
# Comparison data copied from R output of: | ||
# > data = seq(from=0, to=1, by=0.05) | ||
# > poly(data) | ||
r_reference = [ | ||
-3.603750e-01, | ||
-3.243375e-01, | ||
-2.883000e-01, | ||
-2.522625e-01, | ||
-2.162250e-01, | ||
-1.801875e-01, | ||
-1.441500e-01, | ||
-1.081125e-01, | ||
-7.207500e-02, | ||
-3.603750e-02, | ||
-3.000725e-17, | ||
3.603750e-02, | ||
7.207500e-02, | ||
1.081125e-01, | ||
1.441500e-01, | ||
1.801875e-01, | ||
2.162250e-01, | ||
2.522625e-01, | ||
2.883000e-01, | ||
3.243375e-01, | ||
3.603750e-01, | ||
] | ||
|
||
assert numpy.allclose( | ||
V[:, 0], | ||
r_reference, | ||
) | ||
|
||
assert pytest.approx(state["alpha"], {0: 0.5}) | ||
assert pytest.approx(state["norms2"], {0: 21.0, 2: 1.925}) | ||
|
||
assert numpy.allclose( | ||
poly(data, _state=state)[:, 0], | ||
r_reference, | ||
) | ||
|
||
def test_degree(self, data): | ||
state = {} | ||
V = poly(data, degree=3, _state=state) | ||
|
||
assert V.shape[1] == 3 | ||
|
||
# Comparison data copied from R output of: | ||
# > data = seq(from=0, to=1, by=0.05) | ||
# > poly(data, 3) | ||
r_reference = numpy.array( | ||
[ | ||
[-3.603750e-01, 0.42285541, -4.332979e-01], | ||
[-3.243375e-01, 0.29599879, -1.733191e-01], | ||
[-2.883000e-01, 0.18249549, 1.824412e-02], | ||
[-2.522625e-01, 0.08234553, 1.489937e-01], | ||
[-2.162250e-01, -0.00445111, 2.265312e-01], | ||
[-1.801875e-01, -0.07789442, 2.584584e-01], | ||
[-1.441500e-01, -0.13798440, 2.523770e-01], | ||
[-1.081125e-01, -0.18472105, 2.158888e-01], | ||
[-7.207500e-02, -0.21810437, 1.565954e-01], | ||
[-3.603750e-02, -0.23813436, 8.209854e-02], | ||
[-3.000725e-17, -0.24481103, -4.395626e-17], | ||
[3.603750e-02, -0.23813436, -8.209854e-02], | ||
[7.207500e-02, -0.21810437, -1.565954e-01], | ||
[1.081125e-01, -0.18472105, -2.158888e-01], | ||
[1.441500e-01, -0.13798440, -2.523770e-01], | ||
[1.801875e-01, -0.07789442, -2.584584e-01], | ||
[2.162250e-01, -0.00445111, -2.265312e-01], | ||
[2.522625e-01, 0.08234553, -1.489937e-01], | ||
[2.883000e-01, 0.18249549, -1.824412e-02], | ||
[3.243375e-01, 0.29599879, 1.733191e-01], | ||
[3.603750e-01, 0.42285541, 4.332979e-01], | ||
] | ||
) | ||
|
||
assert numpy.allclose( | ||
V, | ||
r_reference, | ||
) | ||
|
||
assert pytest.approx(state["alpha"], {0: 0.5, 1: 0.5, 2: 0.5}) | ||
assert pytest.approx( | ||
state["norms2"], {1: 0.09166666666666667, 2: 0.07283333333333333} | ||
) | ||
|
||
assert numpy.allclose( | ||
poly(data, degree=3, _state=state), | ||
r_reference, | ||
) | ||
|
||
def test_reuse_state(self, data): | ||
state = {} | ||
V = poly(data, degree=3, _state=state) # as tested above | ||
|
||
# Reuse state but with different data | ||
V = poly(data ** 2, degree=3, _state=state) | ||
|
||
assert V.shape[1] == 3 | ||
|
||
# Comparison data copied from R output of: | ||
# > data = seq(from=0, to=1, by=0.05) | ||
# > coefs = attr(poly(data, 3), 'coefs') | ||
# > poly(data^2, 3, coefs=coefs) | ||
r_reference = numpy.array( | ||
[ | ||
[-0.36037499, 0.422855413, -0.43329786], | ||
[-0.35857311, 0.416195441, -0.41855671], | ||
[-0.35316749, 0.396415822, -0.37546400], | ||
[-0.34415811, 0.364117458, -0.30735499], | ||
[-0.33154499, 0.320301848, -0.21959840], | ||
[-0.31532811, 0.266371091, -0.11931132], | ||
[-0.29550749, 0.204127887, -0.01496018], | ||
[-0.27208311, 0.135775535, 0.08415243], | ||
[-0.24505499, 0.063917934, 0.16851486], | ||
[-0.21442312, -0.008440417, 0.22914758], | ||
[-0.18018749, -0.077894418, 0.25845837], | ||
[-0.14234812, -0.140638372, 0.25121156], | ||
[-0.10090500, -0.192465980, 0.20561124], | ||
[-0.05585812, -0.228770342, 0.12449854], | ||
[-0.00720750, -0.244543962, 0.01666296], | ||
[0.04504687, -0.234378741, -0.10173235], | ||
[0.10090500, -0.192465980, -0.20561124], | ||
[0.16036687, -0.112596382, -0.25933114], | ||
[0.22343249, 0.011839952, -0.21491574], | ||
[0.29010186, 0.187853517, -0.01017347], | ||
[0.36037499, 0.422855413, 0.43329786], | ||
] | ||
) | ||
|
||
assert numpy.allclose( | ||
V, | ||
r_reference, | ||
) | ||
|
||
def test_raw(self, data): | ||
assert numpy.allclose( | ||
poly(data, 3, raw=True), numpy.array([data, data ** 2, data ** 3]).T | ||
) |