Commit a6fd326

adding bernoulli model, support for inputs, and some test code with scipy solve banded functions
1 parent 447ab47 commit a6fd326

8 files changed, +638 -159 lines

examples/bernoulli_lds.py (+32 -16)
@@ -6,9 +6,9 @@
 from pybasicbayes.distributions import Regression
 from pybasicbayes.util.text import progprint_xrange
 from pypolyagamma.distributions import BernoulliRegression
-from pylds.models import CountLDS
+from pylds.models import CountLDS, DefaultBernoulliLDS

-npr.seed(0)
+npr.seed(1)

 # Parameters
 D_obs = 10
@@ -22,7 +22,7 @@

 A = 0.99*np.array([[np.cos(np.pi/24), -np.sin(np.pi/24)],
                    [np.sin(np.pi/24), np.cos(np.pi/24)]])
-B = np.ones((D_latent, D_input))
+B = np.zeros((D_latent, D_input))
 sigma_states = 0.01*np.eye(2)

 C = np.random.randn(D_obs, D_latent)
@@ -45,26 +45,43 @@
         M_0=np.zeros((D_latent, D_latent + D_input)),
         K_0=(D_latent + D_input) * np.eye(D_latent + D_input)),
     emission_distn=BernoulliRegression(D_out=D_obs, D_in=D_latent + D_input))
-model.add_data(data, inputs=inputs)
+model.add_data(data, inputs=inputs, stateseq=np.zeros((T, D_latent)))

-# Run a Gibbs sampler
-N_samples = 500
+# Run a Gibbs sampler with Polya-gamma augmentation
+N_samples = 50
 def gibbs_update(model):
     model.resample_model()
     smoothed_obs = model.states_list[0].smooth()
-    return model.log_likelihood(), \
-        model.states_list[0].gaussian_states, \
-        smoothed_obs
+    ll = model.log_likelihood()
+    return ll, model.states_list[0].gaussian_states, smoothed_obs

-lls, z_smpls, smoothed_obss = \
+lls_gibbs, x_smpls_gibbs, y_smooth_gibbs = \
     zip(*[gibbs_update(model) for _ in progprint_xrange(N_samples)])

+# Fit with a Bernoulli LDS using Laplace approximation for comparison
+model = DefaultBernoulliLDS(D_obs, D_latent, D_input=D_input,
+                            C=0.01 * np.random.randn(D_obs, D_latent),
+                            D=0.01 * np.random.randn(D_obs, D_input))
+model.add_data(data, inputs=inputs, stateseq=np.zeros((T, D_latent)))
+
+N_iters = 50
+def em_update(model):
+    model.EM_step(verbose=True)
+    smoothed_obs = model.states_list[0].smooth()
+    ll = model.log_likelihood()
+    return ll, model.states_list[0].gaussian_states, smoothed_obs
+
+lls_em, x_smpls_em, y_smooth_em = \
+    zip(*[em_update(model) for _ in progprint_xrange(N_iters)])
+
 # Plot the log likelihood over iterations
 plt.figure(figsize=(10,6))
-plt.plot(lls,'-b')
-plt.plot([0,N_samples], truemodel.log_likelihood() * np.ones(2), '-k')
+plt.plot(lls_gibbs, label="gibbs")
+plt.plot(lls_em, label="em")
+plt.plot([0,N_samples], truemodel.log_likelihood() * np.ones(2), '-k', label="true")
 plt.xlabel('iteration')
 plt.ylabel('log likelihood')
+plt.legend(loc="lower right")

 # Plot the smoothed observations
 fig = plt.figure(figsize=(10,10))
@@ -80,9 +97,9 @@ def gibbs_update(model):
     given_ts = np.where(data[:,j]==1)[0]
     ax.plot(given_ts, np.ones_like(given_ts), 'ko', markersize=5)

-    # Plot the inferred rate
-    ax.plot([0], [0], 'b', lw=2, label="smoothed obs.")
-    ax.plot(smoothed_obss[-1][:,j], 'r', lw=2, label="smoothed pr.")
+    ax.plot([0], [0], 'ko', lw=2, label="data")
+    ax.plot(y_smooth_gibbs[-1][:, j], lw=2, label="gibbs probs")
+    ax.plot(y_smooth_em[-1][:, j], lw=2, label="em probs")

     if i == 0:
         plt.legend(loc="upper center", ncol=4, bbox_to_anchor=(0.5, 2.))
@@ -93,4 +110,3 @@ def gibbs_update(model):
     ax.set_ylabel("$x_%d(t)$" % (j+1))

 plt.show()
-
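For context, here is a minimal standalone sketch (not the pylds API; the dimensions, the seed, and the bias-free emission sigma(C x_t + D u_t) are assumptions of this sketch) of the generative model the example above simulates and then fits twice: once by Polya-gamma-augmented Gibbs on a CountLDS with BernoulliRegression emissions, and once by EM on a DefaultBernoulliLDS.

import numpy as np

npr = np.random
npr.seed(1)

T, D_obs, D_latent, D_input = 200, 10, 2, 1

# Stable rotation for the latent dynamics, as in the example above
theta = np.pi / 24
A = 0.99 * np.array([[np.cos(theta), -np.sin(theta)],
                     [np.sin(theta),  np.cos(theta)]])
B = np.zeros((D_latent, D_input))      # input-to-state weights
C = npr.randn(D_obs, D_latent)         # emission weights
D = np.zeros((D_obs, D_input))         # input-to-observation weights (assumed)
sigma_states = 0.01 * np.eye(D_latent)

inputs = npr.randn(T, D_input)
x = np.zeros((T, D_latent))
y = np.zeros((T, D_obs), dtype=int)
x[0] = npr.randn(D_latent)
for t in range(T):
    # Bernoulli emission through a logistic link: y_t ~ Bern(sigmoid(C x_t + D u_t))
    psi = C.dot(x[t]) + D.dot(inputs[t])
    y[t] = npr.rand(D_obs) < 1.0 / (1.0 + np.exp(-psi))
    # Linear-Gaussian latent dynamics with inputs: x_{t+1} = A x_t + B u_t + noise
    if t + 1 < T:
        x[t + 1] = A.dot(x[t]) + B.dot(inputs[t]) + \
            npr.multivariate_normal(np.zeros(D_latent), sigma_states)
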
pylds/distributions.py (+165 -27)
@@ -1,21 +1,23 @@
 import autograd.numpy as np
-from autograd import value_and_grad, hessian_vector_product
+from autograd import value_and_grad
 from autograd.scipy.special import gammaln

 from scipy.optimize import minimize

 from pybasicbayes.distributions import Regression
+from pybasicbayes.util.text import progprint_xrange
+

 class PoissonRegression(Regression):
     """
     Poisson regression with Gaussian distributed inputs and exp link:

-       y ~ Poisson(exp(Ax + b))
+       y ~ Poisson(exp(Ax))

     where x ~ N(mu, sigma)

     Currently, we only support maximum likelihood estimation of the
-    parameters, A and b, given the distribution over inputs, x, and
+    parameters A given the distribution over inputs, x, and
     the observed outputs, y.

     We compute the expected log likelihood in closed form (since
@@ -35,10 +37,18 @@ def __init__(self, D_out, D_in, A=None, verbose=False):

         self.sigma = None

+    @property
+    def D_in(self):
+        return self._D_in
+
+    @property
+    def D_out(self):
+        return self._D_out
+
     def log_likelihood(self,xy):
         assert isinstance(xy, tuple)
         x, y = xy
-        loglmbda = x.dot(self.A.T) + self.b.T
+        loglmbda = x.dot(self.A.T)
         lmbda = np.exp(loglmbda)
         return -gammaln(y+1) - lmbda + y * loglmbda

@@ -68,14 +78,6 @@ def expected_log_likelihood(self, mus, sigmas, y):

         return ll

-    @property
-    def D_in(self):
-        return self._D_in
-
-    @property
-    def D_out(self):
-        return self._D_out
-
     def predict(self, x):
         return np.exp(x.dot(self.A.T))

@@ -97,27 +99,30 @@ def max_likelihood(self, data, weights=None,stats=None):
     def max_expected_likelihood(self, stats, verbose=False):
         # These aren't really "sufficient" statistics, since we
         # need the mean and covariance for each time bin.
-        EyxT = np.sum([s[0] for s in stats], axis=0)
+        EyxuT = np.sum([s[0] for s in stats], axis=0)
         mus = np.vstack([s[1] for s in stats])
-        sigs = np.vstack([s[2] for s in stats])
-        masks = np.vstack(s[3] for s in stats)
+        sigmas = np.vstack([s[2] for s in stats])
+        inputs = np.vstack([s[3] for s in stats])
+        masks = np.vstack(s[4] for s in stats)
         T = mus.shape[0]
-        D = self.D_in
+
+        D_latent = mus.shape[1]
+        sigmas_vec = sigmas.reshape((T, D_latent**2))

         # Optimize each row of A independently
-        for n in range(self.D_out):
+        ns = progprint_xrange(self.D_out) if verbose else range(self.D_out)
+        for n in ns:

             # Flatten the covariance to enable vectorized calculations
-            sigs_vec = sigs.reshape((T,D**2))
             def ll_vec(an):

                 ll = 0
-                ll += np.dot(an, EyxT[n])
+                ll += np.dot(an, EyxuT[n])

                 # Vectorized log likelihood calculation
                 loglmbda = np.dot(mus, an)
-                aa_vec = np.outer(an, an).reshape((D ** 2,))
-                trms = np.exp(loglmbda + 0.5 * np.dot(sigs_vec, aa_vec))
+                aa_vec = np.outer(an[:D_latent], an[:D_latent]).reshape((D_latent ** 2,))
+                trms = np.exp(loglmbda + 0.5 * np.dot(sigmas_vec, aa_vec))
                 ll -= np.sum(trms[masks[:, n]])

                 if not np.isfinite(ll):
@@ -134,11 +139,144 @@ def cbk(x):
             res = minimize(value_and_grad(obj), self.A[n],
                            jac=True,
                            callback=cbk if verbose else None)
-            # res = minimize(value_and_grad(obj), self.A[n],
-            #                tol=1e-3,
-            #                method="Newton-CG",
-            #                jac=True,
-            #                hessp=hessian_vector_product(obj),
-            #                callback=cbk if verbose else None)
             assert res.success
             self.A[n] = res.x
+
+
+class BernoulliRegression(Regression):
+    """
+    Bernoulli regression with Gaussian distributed inputs and logistic link:
+
+       y ~ Bernoulli(logistic(Ax))
+
+    where x ~ N(mu, sigma)
+
+    Currently, we only support maximum likelihood estimation of the
+    parameter A given the distribution over inputs, x, and
+    the observed outputs, y.
+
+    We approximate the expected log likelihood with Monte Carlo.
+    """
+
+    def __init__(self, D_out, D_in, A=None, verbose=False):
+        self._D_out, self._D_in = D_out, D_in
+        self.verbose = verbose
+
+        if A is not None:
+            assert A.shape == (D_out, D_in)
+            self.A = A.copy()
+        else:
+            self.A = 0.01 * np.random.randn(D_out, D_in)
+
+        self.sigma = None
+
+    @property
+    def D_in(self):
+        return self._D_in
+
+    @property
+    def D_out(self):
+        return self._D_out
+
+    def log_likelihood(self,xy):
+        assert isinstance(xy, tuple)
+        x, y = xy
+        psi = x.dot(self.A.T)
+
+        # First term is linear
+        ll = y * psi
+
+        # Compute second term with log-sum-exp trick (see above)
+        logm = np.maximum(0, psi)
+        ll -= np.sum(logm)
+        ll -= np.sum(np.log(np.exp(-logm) + np.exp(psi - logm)))
+
+        return ll
+
+    def predict(self, x):
+        return 1 / (1 + np.exp(-x.dot(self.A.T)))
+
+    def rvs(self, x=None, size=1, return_xy=True):
+        x = np.random.normal(size=(size, self.D_in)) if x is None else x
+        y = np.random.rand(x.shape[0], self.D_out) < self.predict(x)
+        return np.hstack((x, y)) if return_xy else y
+
+    def max_likelihood(self, data, weights=None, stats=None):
+        """
+        Maximize the likelihood for given data
+        :param data:
+        :param weights:
+        :param stats:
+        :return:
+        """
+        if isinstance(data, list):
+            x = np.vstack([d[0] for d in data])
+            y = np.vstack([d[1] for d in data])
+        elif isinstance(data, tuple):
+            assert len(data) == 2
+        elif isinstance(data, np.ndarray):
+            x, y = data[:,:self.D_in], data[:, self.D_in:]
+        else:
+            raise Exception("Invalid data type")
+
+        from sklearn.linear_model import LogisticRegression
+        for n in progprint_xrange(self.D_out):
+            lr = LogisticRegression(fit_intercept=False)
+            lr.fit(x, y[:,n])
+            self.A[n] = lr.coef_
+
+
+    def max_expected_likelihood(self, stats, verbose=False, n_smpls=1):
+
+        # These aren't really "sufficient" statistics, since we
+        # need the mean and covariance for each time bin.
+        EyxuT = np.sum([s[0] for s in stats], axis=0)
+        mus = np.vstack([s[1] for s in stats])
+        sigmas = np.vstack([s[2] for s in stats])
+        inputs = np.vstack([s[3] for s in stats])
+        T = mus.shape[0]
+
+        D_latent = mus.shape[1]
+
+        # Draw Monte Carlo samples of x
+        sigmas_chol = np.linalg.cholesky(sigmas)
+        x_smpls = mus[:, :, None] + np.matmul(sigmas_chol, np.random.randn(T, D_latent, n_smpls))
+
+        # Optimize each row of A independently
+        ns = progprint_xrange(self.D_out) if verbose else range(self.D_out)
+        for n in ns:
+
+            def ll_vec(an):
+                ll = 0
+
+                # todo include mask
+                # First term is linear in psi
+                ll += np.dot(an, EyxuT[n])
+
+                # Second term depends only on x and cannot be computed in closed form
+                # Instead, Monte Carlo sample x
+                psi_smpls = np.einsum('tdm, d -> tm', x_smpls, an[:D_latent])
+                psi_smpls = psi_smpls + np.dot(inputs, an[D_latent:])[:, None]
+                logm = np.maximum(0, psi_smpls)
+                trm2_smpls = logm + np.log(np.exp(-logm) + np.exp(psi_smpls - logm))
+                ll -= np.sum(trm2_smpls) / n_smpls
+
+                if not np.isfinite(ll):
+                    return -np.inf
+
+                return ll / T
+
+            obj = lambda x: -ll_vec(x)
+
+            itr = [0]
+            def cbk(x):
+                itr[0] += 1
+                print("M_step iteration ", itr[0])
+
+            res = minimize(value_and_grad(obj), self.A[n],
+                           jac=True,
+                           # callback=cbk if verbose else None)
+                           callback=None)
+            assert res.success
+            self.A[n] = res.x
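
The vectorized objective in PoissonRegression.max_expected_likelihood relies on the Gaussian moment-generating function: for x ~ N(mu, Sigma), E[exp(a^T x)] = exp(a^T mu + 0.5 a^T Sigma a), which is exactly the `trms` expression above. A small standalone sanity check of that identity (the numbers and names here are illustrative, not from the commit):

import numpy as np

rng = np.random.RandomState(0)
a = np.array([0.5, -0.3])
mu = np.array([0.2, 0.1])
Sigma = np.array([[0.3, 0.1],
                  [0.1, 0.2]])

# Closed form: exp(a^T mu + 0.5 a^T Sigma a)
closed_form = np.exp(a.dot(mu) + 0.5 * a.dot(Sigma).dot(a))

# Monte Carlo estimate of E[exp(a^T x)] with x ~ N(mu, Sigma)
x = rng.multivariate_normal(mu, Sigma, size=500000)
monte_carlo = np.exp(x.dot(a)).mean()

print(closed_form, monte_carlo)   # the two should agree closely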
282+

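BernoulliRegression writes the Bernoulli log likelihood as y * psi - log(1 + exp(psi)) and evaluates the second term with the log-sum-exp trick, log(1 + exp(psi)) = m + log(exp(-m) + exp(psi - m)) with m = max(0, psi). In max_expected_likelihood that same term has no closed-form Gaussian expectation, so x is sampled via a Cholesky factor of each covariance and the term is averaged over samples. A minimal sketch of both ideas, assuming standalone numpy (the function and variable names are mine):

import numpy as np

def log1pexp_stable(psi):
    # log(1 + exp(psi)) = m + log(exp(-m) + exp(psi - m)),  m = max(0, psi),
    # which never exponentiates a large positive number
    m = np.maximum(0, psi)
    return m + np.log(np.exp(-m) + np.exp(psi - m))

psi = np.array([-800.0, -1.0, 0.0, 1.0, 800.0])
print(log1pexp_stable(psi))    # finite for all inputs
print(np.log1p(np.exp(psi)))   # the naive version overflows to inf at psi = 800

# Monte Carlo samples of x ~ N(mu, Sigma) via the Cholesky factor, as in
# max_expected_likelihood: x = mu + L eps with Sigma = L L^T
T, D, n_smpls = 4, 2, 3
mus = np.zeros((T, D))
sigmas = np.tile(0.1 * np.eye(D), (T, 1, 1))
L = np.linalg.cholesky(sigmas)                                    # shape (T, D, D)
x_smpls = mus[:, :, None] + np.matmul(L, np.random.randn(T, D, n_smpls))
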
0 commit comments
