From 7dc23bd93ec523b5ed5a8c7b72af1b8d4332547e Mon Sep 17 00:00:00 2001 From: thequackdaddy Date: Wed, 14 Sep 2016 20:33:27 -0500 Subject: [PATCH 1/6] ENH: R poly compatibility Travis fixes --- doc/API-reference.rst | 5 + patsy/__init__.py | 3 + patsy/poly.py | 179 ++++++++++++++++++++++++++++++++ patsy/test_poly_data.py | 37 +++++++ tools/get-R-poly-test-vectors.R | 62 +++++++++++ 5 files changed, 286 insertions(+) create mode 100644 patsy/poly.py create mode 100644 patsy/test_poly_data.py create mode 100644 tools/get-R-poly-test-vectors.R diff --git a/doc/API-reference.rst b/doc/API-reference.rst index 7b6b411..07a80b9 100644 --- a/doc/API-reference.rst +++ b/doc/API-reference.rst @@ -198,6 +198,11 @@ Spline regression .. autofunction:: cc .. autofunction:: te +Orthogonal Polynomial +--------------------- + +.. autofunction:: poly + Working with formulas programmatically -------------------------------------- diff --git a/patsy/__init__.py b/patsy/__init__.py index 29722d0..6a37b0a 100644 --- a/patsy/__init__.py +++ b/patsy/__init__.py @@ -113,5 +113,8 @@ def _reexport(mod): import patsy.mgcv_cubic_splines _reexport(patsy.mgcv_cubic_splines) +import patsy.poly +_reexport(patsy.poly) + # XX FIXME: we aren't exporting any of the explicit parsing interface # yet. Need to figure out how to do that. diff --git a/patsy/poly.py b/patsy/poly.py new file mode 100644 index 0000000..a20402b --- /dev/null +++ b/patsy/poly.py @@ -0,0 +1,179 @@ +# This file is part of Patsy +# Copyright (C) 2012-2013 Nathaniel Smith +# See file LICENSE.txt for license information. + +# R-compatible poly function + +# These are made available in the patsy.* namespace +__all__ = ["poly"] + +import numpy as np + +from patsy.util import have_pandas, no_pickling, assert_no_pickling +from patsy.state import stateful_transform + +if have_pandas: + import pandas + +class Poly(object): + """poly(x, degree=1, raw=False) + + Generates an orthogonal polynomial transformation of x of degree. + Generic usage is something along the lines of:: + + y ~ 1 + poly(x, 4) + + to fit ``y`` as a function of ``x``, with a 4th degree polynomial. + + :arg degree: The number of degrees for the polynomial expansion. + :arg raw: When raw is False (the default), will return orthogonal + polynomials. + + .. versionadded:: 0.4.1 + """ + def __init__(self): + self._tmp = {} + self._degree = None + self._raw = None + + def memorize_chunk(self, x, degree=3, raw=False): + args = {"degree": degree, + "raw": raw + } + self._tmp["args"] = args + # XX: check whether we need x values before saving them + x = np.atleast_1d(x) + if x.ndim == 2 and x.shape[1] == 1: + x = x[:, 0] + if x.ndim > 1: + raise ValueError("input to 'poly' must be 1-d, " + "or a 2-d column vector") + # There's no better way to compute exact quantiles than memorizing + # all data. + x = np.array(x, dtype=float) + self._tmp.setdefault("xs", []).append(x) + + def memorize_finish(self): + tmp = self._tmp + args = tmp["args"] + del self._tmp + + if args["degree"] < 1: + raise ValueError("degree must be greater than 0 (not %r)" + % (args["degree"],)) + if int(args["degree"]) != args["degree"]: + raise ValueError("degree must be an integer (not %r)" + % (self._degree,)) + + # These are guaranteed to all be 1d vectors by the code above + scores = np.concatenate(tmp["xs"]) + scores_mean = scores.mean() + # scores -= scores_mean + self.scores_mean = scores_mean + n = args['degree'] + self.degree = n + raw_poly = scores.reshape((-1, 1)) ** np.arange(n + 1).reshape((1, -1)) + raw = args['raw'] + self.raw = raw + if not raw: + q, r = np.linalg.qr(raw_poly) + # Q is now orthognoal of degree n. To match what R is doing, we + # need to use the three-term recurrence technique to calculate + # new alpha, beta, and norm. + + self.alpha = (np.sum(scores.reshape((-1, 1)) * q[:, :n] ** 2, + axis=0) / + np.sum(q[:, :n] ** 2, axis=0)) + + # For reasons I don't understand, the norms R uses are based off + # of the diagonal of the r upper triangular matrix. + + self.norm = np.linalg.norm(q * np.diag(r), axis=0) + self.beta = (self.norm[1:] / self.norm[:n]) ** 2 + + def transform(self, x, degree=3, raw=False): + if have_pandas: + if isinstance(x, (pandas.Series, pandas.DataFrame)): + to_pandas = True + idx = x.index + else: + to_pandas = False + else: + to_pandas = False + x = np.array(x, ndmin=1).flatten() + + if self.raw: + n = self.degree + p = x.reshape((-1, 1)) ** np.arange(n + 1).reshape((1, -1)) + else: + # This is where the three-term recurrance technique is unwound. + + p = np.empty((x.shape[0], self.degree + 1)) + p[:, 0] = 1 + + for i in np.arange(self.degree): + p[:, i + 1] = (x - self.alpha[i]) * p[:, i] + if i > 0: + p[:, i + 1] = (p[:, i + 1] - + (self.beta[i - 1] * p[:, i - 1])) + p /= self.norm + + p = p[:, 1:] + if to_pandas: + p = pandas.DataFrame(p) + p.index = idx + return p + + __getstate__ = no_pickling + +poly = stateful_transform(Poly) + + +def test_poly_compat(): + from patsy.test_state import check_stateful + from patsy.test_poly_data import (R_poly_test_x, + R_poly_test_data, + R_poly_num_tests) + lines = R_poly_test_data.split("\n") + tests_ran = 0 + start_idx = lines.index("--BEGIN TEST CASE--") + while True: + if not lines[start_idx] == "--BEGIN TEST CASE--": + break + start_idx += 1 + stop_idx = lines.index("--END TEST CASE--", start_idx) + block = lines[start_idx:stop_idx] + test_data = {} + for line in block: + key, value = line.split("=", 1) + test_data[key] = value + # Translate the R output into Python calling conventions + kwargs = { + # integer + "degree": int(test_data["degree"]), + # boolen + "raw": (test_data["raw"] == 'TRUE') + } + # Special case: in R, setting intercept=TRUE increases the effective + # dof by 1. Adjust our arguments to match. + # if kwargs["df"] is not None and kwargs["include_intercept"]: + # kwargs["df"] += 1 + output = np.asarray(eval(test_data["output"])) + # Do the actual test + check_stateful(Poly, False, R_poly_test_x, output, **kwargs) + tests_ran += 1 + # Set up for the next one + start_idx = stop_idx + 1 + assert tests_ran == R_poly_num_tests + + +def test_poly_errors(): + from nose.tools import assert_raises + x = np.arange(27) + # Invalid input shape + assert_raises(ValueError, poly, x.reshape((3, 3, 3))) + assert_raises(ValueError, poly, x.reshape((3, 3, 3)), raw=True) + # Invalid degree + assert_raises(ValueError, poly, x, degree=-1) + assert_raises(ValueError, poly, x, degree=0) + assert_raises(ValueError, poly, x, degree=3.5) diff --git a/patsy/test_poly_data.py b/patsy/test_poly_data.py new file mode 100644 index 0000000..cb4481b --- /dev/null +++ b/patsy/test_poly_data.py @@ -0,0 +1,37 @@ +# This file auto-generated by tools/get-R-bs-test-vectors.R +# Using: R version 3.2.4 Revised (2016-03-16 r70336) +import numpy as np +R_poly_test_x = np.array([1, 1.5, 2.25, 3.375, 5.0625, 7.59375, 11.390625, 17.0859375, 25.62890625, 38.443359375, 57.6650390625, 86.49755859375, 129.746337890625, 194.6195068359375, 291.92926025390625, 437.89389038085938, 656.84083557128906, 985.26125335693359, 1477.8918800354004, 2216.8378200531006, ]) +R_poly_test_data = """ +--BEGIN TEST CASE-- +degree=1 +raw=TRUE +output=np.array([1, 1.5, 2.25, 3.375, 5.0625, 7.59375, 11.390625, 17.0859375, 25.62890625, 38.443359375, 57.6650390625, 86.49755859375, 129.746337890625, 194.6195068359375, 291.92926025390625, 437.89389038085938, 656.84083557128906, 985.26125335693359, 1477.8918800354004, 2216.8378200531006, ]).reshape((20, 1, ), order="F") +--END TEST CASE-- +--BEGIN TEST CASE-- +degree=1 +raw=FALSE +output=np.array([-0.12865949508274149, -0.12846539500908838, -0.12817424489860868, -0.12773751973288924, -0.12708243198431005, -0.12609980036144131, -0.12462585292713815, -0.12241493177568342, -0.11909855004850137, -0.11412397745772825, -0.10666211857156857, -0.095469330242329037, -0.07868014774846975, -0.053496374007680828, -0.015720713396497447, 0.040942777520277619, 0.12593801389544024, 0.25343086845818413, 0.4446701503023, 0.73152907306847381, ]).reshape((20, 1, ), order="F") +--END TEST CASE-- +--BEGIN TEST CASE-- +degree=3 +raw=TRUE +output=np.array([1, 1.5, 2.25, 3.375, 5.0625, 7.59375, 11.390625, 17.0859375, 25.62890625, 38.443359375, 57.6650390625, 86.49755859375, 129.746337890625, 194.6195068359375, 291.92926025390625, 437.89389038085938, 656.84083557128906, 985.26125335693359, 1477.8918800354004, 2216.8378200531006, 1, 2.25, 5.0625, 11.390625, 25.62890625, 57.6650390625, 129.746337890625, 291.92926025390625, 656.84083557128906, 1477.8918800354004, 3325.2567300796509, 7481.8276426792145, 16834.112196028233, 37876.752441063523, 85222.692992392927, 191751.05923288409, 431439.8832739892, 970739.73736647563, 2184164.4090745705, 4914369.9204177829, 1, 3.375, 11.390625, 38.443359375, 129.746337890625, 437.89389038085938, 1477.8918800354004, 4987.8850951194763, 16834.112196028233, 56815.128661595285, 191751.05923288409, 647159.82491098379, 2184164.4090745705, 7371554.8806266747, 24878997.722115029, 83966617.312138215, 283387333.4284665, 956432250.32107437, 3227958844.8336263, 10894361101.313488, ]).reshape((20, 3, ), order="F") +--END TEST CASE-- +--BEGIN TEST CASE-- +degree=3 +raw=FALSE +output=np.array([-0.12865949508274149, -0.12846539500908838, -0.12817424489860868, -0.12773751973288924, -0.12708243198431005, -0.12609980036144131, -0.12462585292713815, -0.12241493177568342, -0.11909855004850137, -0.11412397745772825, -0.10666211857156857, -0.095469330242329037, -0.07868014774846975, -0.053496374007680828, -0.015720713396497447, 0.040942777520277619, 0.12593801389544024, 0.25343086845818413, 0.4446701503023, 0.73152907306847381, 0.11682670564764953, 0.11622774987820758, 0.1153299112445243, 0.11398449209008393, 0.11196937564961051, 0.10895347864407183, 0.10444488285989936, 0.097716301062945765, 0.087700630095951776, 0.072850827534442664, 0.05096695744238839, 0.019020528242278005, -0.026920519697452645, -0.091380250921070119, -0.1780532062130448, -0.28552519567824058, -0.39602393206231051, -0.44767622905753701, -0.26843910749340033, 0.57802660073100254, -0.11560888340228653, -0.11436481217184656, -0.11250218782662975, -0.10971608089390825, -0.10555451667328646, -0.099351692934324679, -0.090136150925525155, -0.076511614727544461, -0.05651941299388475, -0.027522538371457093, 0.013772191900731716, 0.070864671671751547, 0.14593497036033168, 0.23591981919395397, 0.32391016867398448, 0.36336942185480259, 0.25890497941187346, -0.11572025100301592, -0.66076386903314166, 0.27159578788942196, ]).reshape((20, 3, ), order="F") +--END TEST CASE-- +--BEGIN TEST CASE-- +degree=5 +raw=TRUE +output=np.array([1, 1.5, 2.25, 3.375, 5.0625, 7.59375, 11.390625, 17.0859375, 25.62890625, 38.443359375, 57.6650390625, 86.49755859375, 129.746337890625, 194.6195068359375, 291.92926025390625, 437.89389038085938, 656.84083557128906, 985.26125335693359, 1477.8918800354004, 2216.8378200531006, 1, 2.25, 5.0625, 11.390625, 25.62890625, 57.6650390625, 129.746337890625, 291.92926025390625, 656.84083557128906, 1477.8918800354004, 3325.2567300796509, 7481.8276426792145, 16834.112196028233, 37876.752441063523, 85222.692992392927, 191751.05923288409, 431439.8832739892, 970739.73736647563, 2184164.4090745705, 4914369.9204177829, 1, 3.375, 11.390625, 38.443359375, 129.746337890625, 437.89389038085938, 1477.8918800354004, 4987.8850951194763, 16834.112196028233, 56815.128661595285, 191751.05923288409, 647159.82491098379, 2184164.4090745705, 7371554.8806266747, 24878997.722115029, 83966617.312138215, 283387333.4284665, 956432250.32107437, 3227958844.8336263, 10894361101.313488, 1, 5.0625, 25.62890625, 129.746337890625, 656.84083557128906, 3325.2567300796509, 16834.112196028233, 85222.692992392927, 431439.8832739892, 2184164.4090745705, 11057332.320940012, 55977744.87475881, 283387333.4284665, 1434648375.4816115, 7262907400.875659, 36768468716.933022, 186140372879.47342, 942335637702.33411, 4770574165868.0674, 24151031714707.086, 1, 7.59375, 57.6650390625, 437.89389038085938, 3325.2567300796509, 25251.168294042349, 191751.05923288409, 1456109.6060497134, 11057332.320940012, 83966617.31213823, 637621500.21404958, 4841938267.2504387, 36768468716.933022, 279210559319.21014, 2120255184830.252, 16100687809804.727, 122264598055704.64, 928446791485507, 7050392822843070, 53538920498464552, ]).reshape((20, 5, ), order="F") +--END TEST CASE-- +--BEGIN TEST CASE-- +degree=5 +raw=FALSE +output=np.array([-0.12865949508274149, -0.12846539500908838, -0.12817424489860868, -0.12773751973288924, -0.12708243198431005, -0.12609980036144131, -0.12462585292713815, -0.12241493177568342, -0.11909855004850137, -0.11412397745772825, -0.10666211857156857, -0.095469330242329037, -0.07868014774846975, -0.053496374007680828, -0.015720713396497447, 0.040942777520277619, 0.12593801389544024, 0.25343086845818413, 0.4446701503023, 0.73152907306847381, 0.11682670564764953, 0.11622774987820758, 0.1153299112445243, 0.11398449209008393, 0.11196937564961051, 0.10895347864407183, 0.10444488285989936, 0.097716301062945765, 0.087700630095951776, 0.072850827534442664, 0.05096695744238839, 0.019020528242278005, -0.026920519697452645, -0.091380250921070119, -0.1780532062130448, -0.28552519567824058, -0.39602393206231051, -0.44767622905753701, -0.26843910749340033, 0.57802660073100254, -0.11560888340228653, -0.11436481217184656, -0.11250218782662975, -0.10971608089390825, -0.10555451667328646, -0.099351692934324679, -0.090136150925525155, -0.076511614727544461, -0.05651941299388475, -0.027522538371457093, 0.013772191900731716, 0.070864671671751547, 0.14593497036033168, 0.23591981919395397, 0.32391016867398448, 0.36336942185480259, 0.25890497941187346, -0.11572025100301592, -0.66076386903314166, 0.27159578788942196, 0.11925766326375063, 0.11701962699862156, 0.11367531238125347, 0.10868744714732725, 0.10126981942884175, 0.090287103769210786, 0.074134201646975206, 0.050620044131431986, 0.016933017097416861, -0.030116712154368355, -0.093138533517390085, -0.17160263551697441, -0.25618209006285081, -0.3183631162695052, -0.29707753517866498, -0.10102478727647804, 0.30185248746535442, 0.55289166632880227, -0.46108564710186972, 0.081962667419115426, -0.12626707822019206, -0.12250155553682644, -0.11689136915447108, -0.10856147160045609, -0.096257598068575617, -0.078227654788373013, -0.052128116579684983, -0.015063001240831148, 0.035988153544508683, 0.10280803884977513, 0.18263307034840112, 0.26144732880503613, 0.30325203347309243, 0.24116709207723347, -0.00082575540196283526, -0.37830141983168153, -0.42887161757203512, 0.55207091753656046, -0.17171017635275559, 0.016240179713238136, ]).reshape((20, 5, ), order="F") +--END TEST CASE-- +""" +R_poly_num_tests = 6 diff --git a/tools/get-R-poly-test-vectors.R b/tools/get-R-poly-test-vectors.R new file mode 100644 index 0000000..d97c84e --- /dev/null +++ b/tools/get-R-poly-test-vectors.R @@ -0,0 +1,62 @@ +cat("# This file auto-generated by tools/get-R-bs-test-vectors.R\n") +cat(sprintf("# Using: %s\n", R.Version()$version.string)) +cat("import numpy as np\n") + +options(digits=20) +library(splines) +x <- (1.5)^(0:19) + +MISSING <- "MISSING" + +is.missing <- function(obj) { + length(obj) == 1 && obj == MISSING +} + +pyprint <- function(arr) { + if (is.missing(arr)) { + cat("None\n") + } else { + cat("np.array([") + for (val in arr) { + cat(val) + cat(", ") + } + cat("])") + if (!is.null(dim(arr))) { + cat(".reshape((") + for (size in dim(arr)) { + cat(sprintf("%s, ", size)) + } + cat("), order=\"F\")") + } + cat("\n") + } +} + +num.tests <- 0 +dump.poly <- function(degree, raw) { + cat("--BEGIN TEST CASE--\n") + cat(sprintf("degree=%s\n", degree)) + cat(sprintf("raw=%s\n", raw)) + + args <- list(x=x, degree=degree, raw=raw) + + result <- do.call(poly, args) + + cat("output=") + pyprint(result) + cat("--END TEST CASE--\n") + assign("num.tests", num.tests + 1, envir=.GlobalEnv) +} + +cat("R_poly_test_x = ") +pyprint(x) +cat("R_poly_test_data = \"\"\"\n") + +for (degree in c(1, 3, 5)) { + for (raw in c(TRUE, FALSE)) { + dump.poly(degree, raw) + } +} +cat("\"\"\"\n") +cat(sprintf("R_poly_num_tests = %s\n", num.tests)) From 35e092312a2e8df13dfce71b75df3f6a6ac24d96 Mon Sep 17 00:00:00 2001 From: thequackdaddy Date: Tue, 25 Oct 2016 22:23:34 -0500 Subject: [PATCH 2/6] Use poly to calculate poly contrast. Add ability to use standarize in addition to qr decomposition. Added all numpy polynomial types. remove poly --- doc/API-reference.rst | 4 +- patsy/__init__.py | 4 +- patsy/contrasts.py | 9 +- patsy/{poly.py => polynomials.py} | 157 ++++++++++++++++++++++-------- patsy/test_poly_data.py | 2 +- tools/get-R-poly-test-vectors.R | 2 +- 6 files changed, 125 insertions(+), 53 deletions(-) rename patsy/{poly.py => polynomials.py} (50%) diff --git a/doc/API-reference.rst b/doc/API-reference.rst index 07a80b9..1d3ada9 100644 --- a/doc/API-reference.rst +++ b/doc/API-reference.rst @@ -198,8 +198,8 @@ Spline regression .. autofunction:: cc .. autofunction:: te -Orthogonal Polynomial ---------------------- +Polynomial +---------- .. autofunction:: poly diff --git a/patsy/__init__.py b/patsy/__init__.py index 6a37b0a..a4ec751 100644 --- a/patsy/__init__.py +++ b/patsy/__init__.py @@ -113,8 +113,8 @@ def _reexport(mod): import patsy.mgcv_cubic_splines _reexport(patsy.mgcv_cubic_splines) -import patsy.poly -_reexport(patsy.poly) +import patsy.polynomials +_reexport(patsy.polynomials) # XX FIXME: we aren't exporting any of the explicit parsing interface # yet. Need to figure out how to do that. diff --git a/patsy/contrasts.py b/patsy/contrasts.py index 3f1cf54..469aac0 100644 --- a/patsy/contrasts.py +++ b/patsy/contrasts.py @@ -17,6 +17,7 @@ from patsy.util import (repr_pretty_delegate, repr_pretty_impl, safe_issubdtype, no_pickling, assert_no_pickling) +from patsy.polynomials import Poly as Polynomial class ContrastMatrix(object): """A simple container for a matrix used for coding categorical factors. @@ -263,11 +264,9 @@ def _code_either(self, intercept, levels): # quadratic, etc., functions of the raw scores, and then use 'qr' to # orthogonalize each column against those to its left. scores -= scores.mean() - raw_poly = scores.reshape((-1, 1)) ** np.arange(n).reshape((1, -1)) - q, r = np.linalg.qr(raw_poly) - q *= np.sign(np.diag(r)) - q /= np.sqrt(np.sum(q ** 2, axis=1)) - # The constant term is always all 1's -- we don't normalize it. + raw_poly = Polynomial.vander(scores, n - 1, 'poly') + alpha, norm, beta = Polynomial.gen_qr(raw_poly, n - 1) + q = Polynomial.apply_qr(raw_poly, n - 1, alpha, norm, beta) q[:, 0] = 1 names = [".Constant", ".Linear", ".Quadratic", ".Cubic"] names += ["^%s" % (i,) for i in range(4, n)] diff --git a/patsy/poly.py b/patsy/polynomials.py similarity index 50% rename from patsy/poly.py rename to patsy/polynomials.py index a20402b..56ca9b5 100644 --- a/patsy/poly.py +++ b/patsy/polynomials.py @@ -16,7 +16,7 @@ import pandas class Poly(object): - """poly(x, degree=1, raw=False) + """poly(x, degree=3, polytype='poly', raw=False, scaler=None) Generates an orthogonal polynomial transformation of x of degree. Generic usage is something along the lines of:: @@ -26,19 +26,29 @@ class Poly(object): to fit ``y`` as a function of ``x``, with a 4th degree polynomial. :arg degree: The number of degrees for the polynomial expansion. + :arg polytype: Either poly (the default), legendre, laguerre, hermite, or + hermanite_e. :arg raw: When raw is False (the default), will return orthogonal polynomials. + :arg scaler: Choice of 'qr' (default when raw=False) for QR- + decomposition or 'standardize'. .. versionadded:: 0.4.1 """ def __init__(self): self._tmp = {} - self._degree = None - self._raw = None - def memorize_chunk(self, x, degree=3, raw=False): + def memorize_chunk(self, x, degree=3, polytype='poly', raw=False, + scaler=None): + if not raw and (scaler is None): + scaler = 'qr' + if scaler not in ('qr', 'standardize', None): + raise ValueError('input to \'scaler\' %s is not a valid ' + 'scaling technique' % scaler) args = {"degree": degree, - "raw": raw + "raw": raw, + "scaler": scaler, + 'polytype': polytype } self._tmp["args"] = args # XX: check whether we need x values before saving them @@ -63,35 +73,27 @@ def memorize_finish(self): % (args["degree"],)) if int(args["degree"]) != args["degree"]: raise ValueError("degree must be an integer (not %r)" - % (self._degree,)) + % (args['degree'],)) # These are guaranteed to all be 1d vectors by the code above scores = np.concatenate(tmp["xs"]) - scores_mean = scores.mean() - # scores -= scores_mean - self.scores_mean = scores_mean + n = args['degree'] self.degree = n - raw_poly = scores.reshape((-1, 1)) ** np.arange(n + 1).reshape((1, -1)) - raw = args['raw'] - self.raw = raw - if not raw: - q, r = np.linalg.qr(raw_poly) - # Q is now orthognoal of degree n. To match what R is doing, we - # need to use the three-term recurrence technique to calculate - # new alpha, beta, and norm. - - self.alpha = (np.sum(scores.reshape((-1, 1)) * q[:, :n] ** 2, - axis=0) / - np.sum(q[:, :n] ** 2, axis=0)) - - # For reasons I don't understand, the norms R uses are based off - # of the diagonal of the r upper triangular matrix. - - self.norm = np.linalg.norm(q * np.diag(r), axis=0) - self.beta = (self.norm[1:] / self.norm[:n]) ** 2 - - def transform(self, x, degree=3, raw=False): + self.scaler = args['scaler'] + self.raw = args['raw'] + self.polytype = args['polytype'] + + if self.scaler is not None: + raw_poly = self.vander(scores, n, self.polytype) + + if self.scaler == 'qr': + self.alpha, self.norm, self.beta = self.gen_qr(raw_poly, n) + + if self.scaler == 'standardize': + self.mean, self.var = self.gen_standardize(raw_poly) + + def transform(self, x, degree=3, polytype='poly', raw=False, scaler=None): if have_pandas: if isinstance(x, (pandas.Series, pandas.DataFrame)): to_pandas = True @@ -102,21 +104,14 @@ def transform(self, x, degree=3, raw=False): to_pandas = False x = np.array(x, ndmin=1).flatten() - if self.raw: - n = self.degree - p = x.reshape((-1, 1)) ** np.arange(n + 1).reshape((1, -1)) - else: - # This is where the three-term recurrance technique is unwound. + n = self.degree + p = self.vander(x, n, self.polytype) - p = np.empty((x.shape[0], self.degree + 1)) - p[:, 0] = 1 + if self.scaler == 'qr': + p = self.apply_qr(p, n, self.alpha, self.norm, self.beta) - for i in np.arange(self.degree): - p[:, i + 1] = (x - self.alpha[i]) * p[:, i] - if i > 0: - p[:, i + 1] = (p[:, i + 1] - - (self.beta[i - 1] * p[:, i - 1])) - p /= self.norm + if self.scaler == 'standardize': + p = self.apply_standardize(p, self.mean, self.var) p = p[:, 1:] if to_pandas: @@ -124,6 +119,60 @@ def transform(self, x, degree=3, raw=False): p.index = idx return p + @staticmethod + def vander(x, n, polytype): + v_func = {'poly': np.polynomial.polynomial.polyvander, + 'cheb': np.polynomial.chebyshev.chebvander, + 'legendre': np.polynomial.legendre.legvander, + 'laguerre': np.polynomial.laguerre.lagvander, + 'hermite': np.polynomial.hermite.hermvander, + 'hermite_e': np.polynomial.hermite_e.hermevander} + raw_poly = v_func[polytype](x, n) + return raw_poly + + @staticmethod + def gen_qr(raw_poly, n): + # Q is now orthognoal of degree n. To match what R is doing, we + # need to use the three-term recurrence technique to calculate + # new alpha, beta, and norm. + x = raw_poly[:, 1] + q, r = np.linalg.qr(raw_poly) + alpha = (np.sum(x.reshape((-1, 1)) * q[:, :n] ** 2, axis=0) / + np.sum(q[:, :n] ** 2, axis=0)) + + # For reasons I don't understand, the norms R uses are based off + # of the diagonal of the r upper triangular matrix. + + norm = np.linalg.norm(q * np.diag(r), axis=0) + beta = (norm[1:] / norm[:n]) ** 2 + return alpha, norm, beta + + @staticmethod + def gen_standardize(raw_poly): + return raw_poly.mean(axis=0), raw_poly.var(axis=0) + + @staticmethod + def apply_qr(x, n, alpha, norm, beta): + # This is where the three-term recurrence is unwound for the QR + # decomposition. + if np.ndim(x) == 2: + x = x[:, 1] + p = np.empty((x.shape[0], n + 1)) + p[:, 0] = 1 + + for i in np.arange(n): + p[:, i + 1] = (x - alpha[i]) * p[:, i] + if i > 0: + p[:, i + 1] = (p[:, i + 1] - (beta[i - 1] * p[:, i - 1])) + p /= norm + return p + + @staticmethod + def apply_standardize(x, mean, var): + x[:, 1:] = ((x[:, 1:] - mean[1:]) / (var[1:] ** 0.5)) + return x + + __getstate__ = no_pickling poly = stateful_transform(Poly) @@ -166,6 +215,24 @@ def test_poly_compat(): start_idx = stop_idx + 1 assert tests_ran == R_poly_num_tests +def test_poly_smoke(): + # Test that standardized values match. + x = np.arange(27) + vanders = ['poly', 'cheb', 'legendre', 'laguerre', 'hermite', 'hermite_e'] + scalers = ['raw', 'qr', 'standardize'] + for v in vanders: + p1 = poly(x, polytype=v, scaler='standardize') + p2 = poly(x, polytype=v, raw=True) + p2 = (p2 - p2.mean(axis=0)) / p2.std(axis=0) + np.testing.assert_allclose(p1, p2) + + # Don't have tests for all this... so just make sure it works. + for v in vanders: + for s in scalers: + if s == 'raw': + poly(x, raw=True, polytype=v) + else: + poly(x, scaler=s, polytype=v) def test_poly_errors(): from nose.tools import assert_raises @@ -177,3 +244,9 @@ def test_poly_errors(): assert_raises(ValueError, poly, x, degree=-1) assert_raises(ValueError, poly, x, degree=0) assert_raises(ValueError, poly, x, degree=3.5) + + #Invalid Poly Type + assert_raises(KeyError, poly, x, polytype='foo') + + #Invalid scaling type + assert_raises(ValueError, poly, x, scaler='bar') diff --git a/patsy/test_poly_data.py b/patsy/test_poly_data.py index cb4481b..668ee28 100644 --- a/patsy/test_poly_data.py +++ b/patsy/test_poly_data.py @@ -1,4 +1,4 @@ -# This file auto-generated by tools/get-R-bs-test-vectors.R +# This file auto-generated by tools/get-R-poly-test-vectors.R # Using: R version 3.2.4 Revised (2016-03-16 r70336) import numpy as np R_poly_test_x = np.array([1, 1.5, 2.25, 3.375, 5.0625, 7.59375, 11.390625, 17.0859375, 25.62890625, 38.443359375, 57.6650390625, 86.49755859375, 129.746337890625, 194.6195068359375, 291.92926025390625, 437.89389038085938, 656.84083557128906, 985.26125335693359, 1477.8918800354004, 2216.8378200531006, ]) diff --git a/tools/get-R-poly-test-vectors.R b/tools/get-R-poly-test-vectors.R index d97c84e..68e9678 100644 --- a/tools/get-R-poly-test-vectors.R +++ b/tools/get-R-poly-test-vectors.R @@ -1,4 +1,4 @@ -cat("# This file auto-generated by tools/get-R-bs-test-vectors.R\n") +cat("# This file auto-generated by tools/get-R-poly-test-vectors.R\n") cat(sprintf("# Using: %s\n", R.Version()$version.string)) cat("import numpy as np\n") From becad9a7a94726c014c43f6dcd4e2f8289a0307b Mon Sep 17 00:00:00 2001 From: thequackdaddy Date: Thu, 3 Nov 2016 19:54:06 -0500 Subject: [PATCH 3/6] Removing other polytypes, standardize procedure. --- patsy/contrasts.py | 2 +- patsy/polynomials.py | 84 ++++++++------------------------------------ 2 files changed, 15 insertions(+), 71 deletions(-) diff --git a/patsy/contrasts.py b/patsy/contrasts.py index 469aac0..010afb3 100644 --- a/patsy/contrasts.py +++ b/patsy/contrasts.py @@ -264,7 +264,7 @@ def _code_either(self, intercept, levels): # quadratic, etc., functions of the raw scores, and then use 'qr' to # orthogonalize each column against those to its left. scores -= scores.mean() - raw_poly = Polynomial.vander(scores, n - 1, 'poly') + raw_poly = Polynomial.vander(scores, n - 1) alpha, norm, beta = Polynomial.gen_qr(raw_poly, n - 1) q = Polynomial.apply_qr(raw_poly, n - 1, alpha, norm, beta) q[:, 0] = 1 diff --git a/patsy/polynomials.py b/patsy/polynomials.py index 56ca9b5..9137864 100644 --- a/patsy/polynomials.py +++ b/patsy/polynomials.py @@ -5,18 +5,19 @@ # R-compatible poly function # These are made available in the patsy.* namespace -__all__ = ["poly"] - import numpy as np from patsy.util import have_pandas, no_pickling, assert_no_pickling from patsy.state import stateful_transform +__all__ = ["poly"] + if have_pandas: import pandas + class Poly(object): - """poly(x, degree=3, polytype='poly', raw=False, scaler=None) + """poly(x, degree=3, raw=False) Generates an orthogonal polynomial transformation of x of degree. Generic usage is something along the lines of:: @@ -26,29 +27,17 @@ class Poly(object): to fit ``y`` as a function of ``x``, with a 4th degree polynomial. :arg degree: The number of degrees for the polynomial expansion. - :arg polytype: Either poly (the default), legendre, laguerre, hermite, or - hermanite_e. :arg raw: When raw is False (the default), will return orthogonal polynomials. - :arg scaler: Choice of 'qr' (default when raw=False) for QR- - decomposition or 'standardize'. .. versionadded:: 0.4.1 """ def __init__(self): self._tmp = {} - def memorize_chunk(self, x, degree=3, polytype='poly', raw=False, - scaler=None): - if not raw and (scaler is None): - scaler = 'qr' - if scaler not in ('qr', 'standardize', None): - raise ValueError('input to \'scaler\' %s is not a valid ' - 'scaling technique' % scaler) + def memorize_chunk(self, x, degree=3, raw=False): args = {"degree": degree, - "raw": raw, - "scaler": scaler, - 'polytype': polytype + "raw": raw } self._tmp["args"] = args # XX: check whether we need x values before saving them @@ -80,19 +69,12 @@ def memorize_finish(self): n = args['degree'] self.degree = n - self.scaler = args['scaler'] self.raw = args['raw'] - self.polytype = args['polytype'] - - if self.scaler is not None: - raw_poly = self.vander(scores, n, self.polytype) - if self.scaler == 'qr': + if not self.raw: + raw_poly = self.vander(scores, n) self.alpha, self.norm, self.beta = self.gen_qr(raw_poly, n) - if self.scaler == 'standardize': - self.mean, self.var = self.gen_standardize(raw_poly) - def transform(self, x, degree=3, polytype='poly', raw=False, scaler=None): if have_pandas: if isinstance(x, (pandas.Series, pandas.DataFrame)): @@ -120,23 +102,17 @@ def transform(self, x, degree=3, polytype='poly', raw=False, scaler=None): return p @staticmethod - def vander(x, n, polytype): - v_func = {'poly': np.polynomial.polynomial.polyvander, - 'cheb': np.polynomial.chebyshev.chebvander, - 'legendre': np.polynomial.legendre.legvander, - 'laguerre': np.polynomial.laguerre.lagvander, - 'hermite': np.polynomial.hermite.hermvander, - 'hermite_e': np.polynomial.hermite_e.hermevander} - raw_poly = v_func[polytype](x, n) + def vander(x, n): + raw_poly = np.polynomial.polynomial.polyvander(x, n) return raw_poly @staticmethod def gen_qr(raw_poly, n): + x = raw_poly[:, 1] + q, r = np.linalg.qr(raw_poly) # Q is now orthognoal of degree n. To match what R is doing, we # need to use the three-term recurrence technique to calculate # new alpha, beta, and norm. - x = raw_poly[:, 1] - q, r = np.linalg.qr(raw_poly) alpha = (np.sum(x.reshape((-1, 1)) * q[:, :n] ** 2, axis=0) / np.sum(q[:, :n] ** 2, axis=0)) @@ -147,10 +123,6 @@ def gen_qr(raw_poly, n): beta = (norm[1:] / norm[:n]) ** 2 return alpha, norm, beta - @staticmethod - def gen_standardize(raw_poly): - return raw_poly.mean(axis=0), raw_poly.var(axis=0) - @staticmethod def apply_qr(x, n, alpha, norm, beta): # This is where the three-term recurrence is unwound for the QR @@ -166,13 +138,6 @@ def apply_qr(x, n, alpha, norm, beta): p[:, i + 1] = (p[:, i + 1] - (beta[i - 1] * p[:, i - 1])) p /= norm return p - - @staticmethod - def apply_standardize(x, mean, var): - x[:, 1:] = ((x[:, 1:] - mean[1:]) / (var[1:] ** 0.5)) - return x - - __getstate__ = no_pickling poly = stateful_transform(Poly) @@ -215,24 +180,6 @@ def test_poly_compat(): start_idx = stop_idx + 1 assert tests_ran == R_poly_num_tests -def test_poly_smoke(): - # Test that standardized values match. - x = np.arange(27) - vanders = ['poly', 'cheb', 'legendre', 'laguerre', 'hermite', 'hermite_e'] - scalers = ['raw', 'qr', 'standardize'] - for v in vanders: - p1 = poly(x, polytype=v, scaler='standardize') - p2 = poly(x, polytype=v, raw=True) - p2 = (p2 - p2.mean(axis=0)) / p2.std(axis=0) - np.testing.assert_allclose(p1, p2) - - # Don't have tests for all this... so just make sure it works. - for v in vanders: - for s in scalers: - if s == 'raw': - poly(x, raw=True, polytype=v) - else: - poly(x, scaler=s, polytype=v) def test_poly_errors(): from nose.tools import assert_raises @@ -245,8 +192,5 @@ def test_poly_errors(): assert_raises(ValueError, poly, x, degree=0) assert_raises(ValueError, poly, x, degree=3.5) - #Invalid Poly Type - assert_raises(KeyError, poly, x, polytype='foo') - - #Invalid scaling type - assert_raises(ValueError, poly, x, scaler='bar') + p = poly(np.arange(1, 10), degree=3) + assert_no_pickling(p) From 0cd4299024101885a3c414a2c0489ec6755b264a Mon Sep 17 00:00:00 2001 From: thequackdaddy Date: Thu, 3 Nov 2016 20:03:39 -0500 Subject: [PATCH 4/6] Missed a few other places where polytype/scaler were. --- patsy/polynomials.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/patsy/polynomials.py b/patsy/polynomials.py index 9137864..25d18f7 100644 --- a/patsy/polynomials.py +++ b/patsy/polynomials.py @@ -75,7 +75,7 @@ def memorize_finish(self): raw_poly = self.vander(scores, n) self.alpha, self.norm, self.beta = self.gen_qr(raw_poly, n) - def transform(self, x, degree=3, polytype='poly', raw=False, scaler=None): + def transform(self, x, degree=3, raw=False): if have_pandas: if isinstance(x, (pandas.Series, pandas.DataFrame)): to_pandas = True @@ -87,14 +87,11 @@ def transform(self, x, degree=3, polytype='poly', raw=False, scaler=None): x = np.array(x, ndmin=1).flatten() n = self.degree - p = self.vander(x, n, self.polytype) + p = self.vander(x, n) - if self.scaler == 'qr': + if not self.raw: p = self.apply_qr(p, n, self.alpha, self.norm, self.beta) - if self.scaler == 'standardize': - p = self.apply_standardize(p, self.mean, self.var) - p = p[:, 1:] if to_pandas: p = pandas.DataFrame(p) From 0cb862bd0cc57140e8c67b97ca781bd699667bb2 Mon Sep 17 00:00:00 2001 From: thequackdaddy Date: Thu, 3 Nov 2016 20:18:04 -0500 Subject: [PATCH 5/6] assert_no_pickling again --- patsy/polynomials.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/patsy/polynomials.py b/patsy/polynomials.py index 25d18f7..85a35d1 100644 --- a/patsy/polynomials.py +++ b/patsy/polynomials.py @@ -189,5 +189,4 @@ def test_poly_errors(): assert_raises(ValueError, poly, x, degree=0) assert_raises(ValueError, poly, x, degree=3.5) - p = poly(np.arange(1, 10), degree=3) - assert_no_pickling(p) + assert_no_pickling(Poly()) From 4d107c634ee13d2392c85ab5ddb8c6a390b5ba94 Mon Sep 17 00:00:00 2001 From: thequackdaddy Date: Sat, 3 Nov 2018 19:55:14 -0500 Subject: [PATCH 6/6] Increase test coverage --- patsy/polynomials.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/patsy/polynomials.py b/patsy/polynomials.py index 85a35d1..f8d54c2 100644 --- a/patsy/polynomials.py +++ b/patsy/polynomials.py @@ -32,6 +32,7 @@ class Poly(object): .. versionadded:: 0.4.1 """ + def __init__(self): self._tmp = {} @@ -110,8 +111,8 @@ def gen_qr(raw_poly, n): # Q is now orthognoal of degree n. To match what R is doing, we # need to use the three-term recurrence technique to calculate # new alpha, beta, and norm. - alpha = (np.sum(x.reshape((-1, 1)) * q[:, :n] ** 2, axis=0) / - np.sum(q[:, :n] ** 2, axis=0)) + alpha = (np.sum(x.reshape((-1, 1)) * q[:, :n] ** 2, axis=0) + / np.sum(q[:, :n] ** 2, axis=0)) # For reasons I don't understand, the norms R uses are based off # of the diagonal of the r upper triangular matrix. @@ -137,6 +138,7 @@ def apply_qr(x, n, alpha, norm, beta): return p __getstate__ = no_pickling + poly = stateful_transform(Poly) @@ -145,6 +147,8 @@ def test_poly_compat(): from patsy.test_poly_data import (R_poly_test_x, R_poly_test_data, R_poly_num_tests) + from numpy.testing import assert_allclose + lines = R_poly_test_data.split("\n") tests_ran = 0 start_idx = lines.index("--BEGIN TEST CASE--") @@ -172,6 +176,14 @@ def test_poly_compat(): output = np.asarray(eval(test_data["output"])) # Do the actual test check_stateful(Poly, False, R_poly_test_x, output, **kwargs) + raw_poly = Poly.vander(R_poly_test_x, kwargs['degree']) + if kwargs['raw']: + actual = raw_poly[:, 1:] + else: + alpha, norm, beta = Poly.gen_qr(raw_poly, kwargs['degree']) + actual = Poly.apply_qr(R_poly_test_x, kwargs['degree'], alpha, + norm, beta)[:, 1:] + assert_allclose(actual, output) tests_ran += 1 # Set up for the next one start_idx = stop_idx + 1