-
Notifications
You must be signed in to change notification settings - Fork 24
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
added gaussian process #19
base: master
Are you sure you want to change the base?
Changes from all commits
f068daa
8de1682
7bc517e
184dd3b
315087d
aa7244f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -105,3 +105,6 @@ ENV/ | |
|
||
# Pickles | ||
notebooks/pickle_jar/ | ||
|
||
# IDE | ||
.idea/ |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
api | ||
pymc3_models | ||
============ | ||
|
||
.. toctree:: | ||
:maxdepth: 4 | ||
|
||
pymc3_models.models | ||
pymc3_models |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
pymc3\_models package | ||
===================== | ||
|
||
Subpackages | ||
----------- | ||
|
||
.. toctree:: | ||
|
||
pymc3_models.models | ||
|
||
Submodules | ||
---------- | ||
|
||
pymc3\_models.exc module | ||
------------------------ | ||
|
||
.. automodule:: pymc3_models.exc | ||
:members: | ||
:undoc-members: | ||
:show-inheritance: | ||
|
||
|
||
Module contents | ||
--------------- | ||
|
||
.. automodule:: pymc3_models | ||
:members: | ||
:undoc-members: | ||
:show-inheritance: |
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,9 @@ | ||
__version__ = "1.1.3" | ||
|
||
from pymc3_models.models.HierarchicalLogisticRegression import HierarchicalLogisticRegression | ||
from pymc3_models.models.LinearRegression import LinearRegression | ||
from pymc3_models.models.GaussianProcessRegression import GaussianProcessRegression | ||
from pymc3_models.models.SparseGaussianProcessRegression import SparseGaussianProcessRegression | ||
from pymc3_models.models.StudentsTProcessRegression import StudentsTProcessRegression | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,177 @@ | ||
import numpy as np | ||
import pymc3 as pm | ||
from sklearn.metrics import r2_score | ||
import theano | ||
import theano.tensor as T | ||
|
||
from pymc3_models.exc import PyMC3ModelsError | ||
from pymc3_models.models import BayesianModel | ||
|
||
|
||
class GaussianProcessRegression(BayesianModel): | ||
""" | ||
Gaussian Process Regression built using PyMC3. | ||
""" | ||
|
||
def __init__(self, prior_mean=0.0): | ||
self.ppc = None | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could these properties be alphabetized? |
||
self.gp = None | ||
self.num_training_samples = None | ||
self.num_pred = None | ||
self.prior_mean = prior_mean | ||
|
||
super(GaussianProcessRegression, self).__init__() | ||
|
||
def create_model(self): | ||
""" | ||
Creates and returns the PyMC3 model. | ||
|
||
Note: The size of the shared variables must match the size of the | ||
training data. Otherwise, setting the shared variables later will | ||
raise an error. See http://docs.pymc.io/advanced_theano.html | ||
|
||
Returns | ||
---------- | ||
the PyMC3 model | ||
""" | ||
model_input = theano.shared(np.zeros([self.num_training_samples, self.num_pred])) | ||
|
||
model_output = theano.shared(np.zeros(self.num_training_samples)) | ||
|
||
self.shared_vars = { | ||
'model_input': model_input, | ||
'model_output': model_output, | ||
} | ||
|
||
model = pm.Model() | ||
|
||
with model: | ||
length_scale = pm.Gamma('length_scale', alpha=2, beta=1, shape=(1, self.num_pred)) | ||
signal_variance = pm.HalfCauchy('signal_variance', beta=5, shape=(1)) | ||
noise_variance = pm.HalfCauchy('noise_variance', beta=5, shape=(1)) | ||
|
||
# cov = signal_variance**2 * pm.gp.cov.ExpQuad(1, length_scale) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this line needed? |
||
cov = signal_variance ** 2 * pm.gp.cov.Matern52(1, length_scale) | ||
|
||
# mean_function = pm.gp.mean.Zero() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this comment outdated since you allow the user to specify a prior_mean now? |
||
mean_function = pm.gp.mean.Constant(self.prior_mean) | ||
|
||
self.gp = pm.gp.Latent(mean_func=mean_function, cov_func=cov) | ||
|
||
f = self.gp.prior('f', X=model_input.get_value()) | ||
|
||
y = pm.Normal('y', mu=f, sd=noise_variance, observed=model_output) | ||
|
||
return model | ||
|
||
def fit(self, X, y, inference_type='advi', minibatch_size=None, inference_args=None): | ||
""" | ||
Train the Gaussian Process Regression model | ||
|
||
Parameters | ||
---------- | ||
X : numpy array, shape [n_samples, n_features] | ||
|
||
y : numpy array, shape [n_samples, ] | ||
|
||
inference_type : string, specifies which inference method to call. | ||
Defaults to 'advi'. Currently, only 'advi' and 'nuts' are supported | ||
|
||
minibatch_size : number of samples to include in each minibatch | ||
for ADVI, defaults to None, so minibatch is not run by default | ||
|
||
inference_args : dict, arguments to be passed to the inference methods. | ||
Check the PyMC3 docs for permissible values. If no arguments are | ||
specified, default values will be set. | ||
""" | ||
self.num_training_samples, self.num_pred = X.shape | ||
|
||
self.inference_type = inference_type | ||
|
||
if y.ndim != 1: | ||
y = np.squeeze(y) | ||
|
||
if not inference_args: | ||
inference_args = self._set_default_inference_args() | ||
|
||
if self.cached_model is None: | ||
self.cached_model = self.create_model() | ||
|
||
if minibatch_size: | ||
with self.cached_model: | ||
minibatches = { | ||
self.shared_vars['model_input']: pm.Minibatch(X, batch_size=minibatch_size), | ||
self.shared_vars['model_output']: pm.Minibatch(y, batch_size=minibatch_size), | ||
} | ||
|
||
inference_args['more_replacements'] = minibatches | ||
else: | ||
self._set_shared_vars({'model_input': X, 'model_output': y}) | ||
|
||
self._inference(inference_type, inference_args) | ||
|
||
return self | ||
|
||
def predict(self, X, return_std=False): | ||
""" | ||
Predicts values of new data with a trained Gaussian Process Regression model | ||
|
||
Parameters | ||
---------- | ||
X : numpy array, shape [n_samples, n_features] | ||
|
||
return_std : Boolean flag of whether to return standard deviations with mean values. Defaults to False. | ||
""" | ||
|
||
if self.trace is None: | ||
raise PyMC3ModelsError('Run fit on the model before predict.') | ||
|
||
num_samples = X.shape[0] | ||
|
||
if self.cached_model is None: | ||
self.cached_model = self.create_model() | ||
|
||
self._set_shared_vars({'model_input': X, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I prefer to format longer lines like this:
Could you please change your code to match that style of indenting? |
||
'model_output': np.zeros(num_samples)}) | ||
|
||
with self.cached_model: | ||
f_pred = self.gp.conditional("f_pred", X) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please switch to single quotes to be consistent with the rest of the code. |
||
self.ppc = pm.sample_ppc(self.trace, | ||
vars=[f_pred], | ||
samples=2000) | ||
|
||
if return_std: | ||
return self.ppc['f_pred'].mean(axis=0), self.ppc['f_pred'].std(axis=0) | ||
else: | ||
return self.ppc['f_pred'].mean(axis=0) | ||
|
||
def score(self, X, y): | ||
""" | ||
Scores new data with a trained model. | ||
|
||
Parameters | ||
---------- | ||
X : numpy array, shape [n_samples, n_features] | ||
|
||
y : numpy array, shape [n_samples, ] | ||
""" | ||
|
||
return r2_score(y, self.predict(X)) | ||
|
||
def save(self, file_prefix): | ||
params = { | ||
'inference_type': self.inference_type, | ||
'num_pred': self.num_pred, | ||
'num_training_samples': self.num_training_samples | ||
} | ||
|
||
super(GaussianProcessRegression, self).save(file_prefix, params) | ||
|
||
def load(self, file_prefix): | ||
params = super(GaussianProcessRegression, self).load(file_prefix, | ||
load_custom_params=True) | ||
|
||
self.inference_type = params['inference_type'] | ||
self.num_pred = params['num_pred'] | ||
self.num_training_samples = params['num_training_samples'] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
import numpy as np | ||
import pymc3 as pm | ||
from sklearn.metrics import r2_score | ||
import theano | ||
import theano.tensor as T | ||
|
||
from pymc3_models.exc import PyMC3ModelsError | ||
from pymc3_models.models.GaussianProcessRegression import GaussianProcessRegression | ||
|
||
|
||
class SparseGaussianProcessRegression(GaussianProcessRegression): | ||
""" | ||
Sparse Gaussian Process Regression model built using PyMC3. | ||
""" | ||
|
||
def __init__(self, prior_mean=0.0): | ||
super(SparseGaussianProcessRegression, self).__init__(prior_mean=prior_mean) | ||
|
||
def create_model(self): | ||
""" | ||
Creates and returns the PyMC3 model. | ||
|
||
Note: The size of the shared variables must match the size of the | ||
training data. Otherwise, setting the shared variables later will | ||
raise an error. See http://docs.pymc.io/advanced_theano.html | ||
|
||
Returns | ||
---------- | ||
the PyMC3 model | ||
""" | ||
model_input = theano.shared(np.zeros([self.num_training_samples, self.num_pred])) | ||
|
||
model_output = theano.shared(np.zeros(self.num_training_samples)) | ||
|
||
self.shared_vars = { | ||
'model_input': model_input, | ||
'model_output': model_output, | ||
} | ||
|
||
self.gp = None | ||
model = pm.Model() | ||
|
||
with model: | ||
length_scale = pm.Gamma('length_scale', alpha=2, beta=1, shape=(1, self.num_pred)) | ||
signal_variance = pm.HalfCauchy('signal_variance', beta=5, shape=(1)) | ||
noise_variance = pm.HalfCauchy('noise_variance', beta=5, shape=(1)) | ||
|
||
# cov_function = signal_variance**2 * pm.gp.cov.ExpQuad(1, length_scale) | ||
cov_function = signal_variance ** 2 * pm.gp.cov.Matern52(1, length_scale) | ||
|
||
# mean_function = pm.gp.mean.Zero() | ||
mean_function = pm.gp.mean.Constant(self.prior_mean) | ||
|
||
self.gp = pm.gp.MarginalSparse(mean_func=mean_function, | ||
cov_func=cov_function, | ||
approx="FITC") | ||
|
||
# initialize 20 inducing points with K-means | ||
# gp.util | ||
Xu = pm.gp.util.kmeans_inducing_points(20, X=model_input.get_value()) | ||
|
||
y = self.gp.marginal_likelihood('y', | ||
X=model_input.get_value(), | ||
Xu=Xu, | ||
y=model_output.get_value(), | ||
noise=noise_variance) | ||
|
||
return model | ||
|
||
def save(self, file_prefix): | ||
params = { | ||
'inference_type': self.inference_type, | ||
'num_pred': self.num_pred, | ||
'num_training_samples': self.num_training_samples | ||
} | ||
|
||
super(SparseGaussianProcessRegression, self).save(file_prefix, params) | ||
|
||
def load(self, file_prefix): | ||
params = super(SparseGaussianProcessRegression, self).load(file_prefix, | ||
load_custom_params=True) | ||
|
||
self.inference_type = params['inference_type'] | ||
self.num_pred = params['num_pred'] | ||
self.num_training_samples = params['num_training_samples'] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I added in a
_version.py
file in master so you can remove this. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you might have to modify your notebooks too.