clean up

josephbowles · josephbowles · commit a43c901b03ee · 2024-10-10T16:17:28.000+02:00
diff --git a/requirements.txt b/requirements.txt
@@ -10,3 +10,4 @@ pyyaml~=6.0
 pennyLane~=0.34
 scipy~=1.11
 pandas~=2.2
+numpyro~=0.14.0
diff --git a/src/qml_benchmarks/data/ising.py b/src/qml_benchmarks/data/ising.py
@@ -8,32 +8,8 @@
 from jax import random
 from collections import namedtuple
 from numpyro.infer.mcmc import MCMCKernel
-from qgml.data import SpinConfigurationGeneratorBase
 from tqdm.auto import tqdm
 
-def create_isotropic_interaction_matrix(grid_size: int):
-    """Create an interaction matrix for a 2D isotropic square lattice."""
-    J = jnp.zeros((grid_size * grid_size, grid_size * grid_size))
-
-    for i in range(grid_size):
-        for j in range(grid_size):
-            # Spin index in the grid
-            idx = i * grid_size + j
-
-            # Calculate the indices of the neighbors
-            right_idx = i * grid_size + (j + 1) % grid_size
-            left_idx = i * grid_size + (j - 1) % grid_size
-            bottom_idx = ((i + 1) % grid_size) * grid_size + j
-            top_idx = ((i - 1) % grid_size) * grid_size + j
-
-            # Set the interactions, ensuring each pair is only added once
-            J = J.at[idx, right_idx].set(1)
-            J = J.at[idx, left_idx].set(1)
-            J = J.at[idx, bottom_idx].set(1)
-            J = J.at[idx, top_idx].set(1)
-    return J
-
-
 @jax.jit
 def energy(s, J, b, J_sparse=None):
     """Calculate the Ising energy. For sparse Hamiltonians, it is recommneded to supply a list of nonzero indices of
@@ -51,7 +27,6 @@ def energy(s, J, b, J_sparse=None):
     else:
         return -jnp.einsum("i,j,ij->", s, s, J) / 2.0 - jnp.dot(s, b)
 
-
 def initialize_spins(rng_key, num_spins, num_chains):
     if num_chains == 1:
         spins = random.bernoulli(rng_key, 0.5, (num_spins,))
@@ -119,11 +94,19 @@ def mh_step(i, val):
         return MHState(spins, rng_key)
 
 
-# Define the Ising model class
-class IsingSpins(SpinConfigurationGeneratorBase):
-    """
-    class object used to generate datasets
-    ArgsL
+class IsingSpins:
+    r"""
+    class object used to generate datasets by sampling an Ising distribution with a specified interaction
+    matrix. The distribution is sampled by Markov chain Monte Carlo using the Metropolis-Hastings
+    algorithm.
+
+    In the case of perfect sampling, a spin configuration s is sampled with probability
+    :math:`p(s)=\exp(-H(s)/T)/Z`, where the energy :math:`H(s)=-\frac{1}{2}\sum_{i,j}J_{ij}s_i s_j-\sum_i b_i s_i`
+    corresponds to an Ising Hamiltonian, Z is the partition function, and configurations s are :math:`\pm1` valued.
+
+    The final sampled configurations are converted from a :math:`\pm1` representation to a binary
+    representation via x = (s+1)//2.
+
     N (int): Number of spins
     J (np.array): interaction matrix
     b (np.array): bias terms
@@ -134,14 +117,15 @@ class object used to generate datasets
     def __init__(
         self, N: int, J: jnp.array, b: jnp.array, T: float, sparse=False, compute_partition_fn=False
     ) -> None:
-        super().__init__(N)
+
+        self.N = N
         self.kernel = MetropolisHastings()
         self.J = J
         self.T = T
         self.b = b
         self.J_sparse = jnp.nonzero(J) if sparse else None
 
-       if compute_partition_fn:
+        if compute_partition_fn:
             Z = 0
             for i in tqdm(range(2**self.N), desc="Computing partition function"):
                 lattice = (-1) ** jnp.array(jnp.unravel_index(i, [2] * self.N))
@@ -181,22 +165,70 @@ def sample(
             J_sparse=self.J_sparse,
         )
         samples = mcmc.get_samples()
-        return samples.reshape((-1, self.N))
+        samples.reshape((-1, self.N))
+        return (samples+1)//2
+
+    def probability(self, x: ndarray) -> float:
+        """
+        compute the probability of a binary configuration x
+        Args:
+            x: binary configuration array
+        Returns:
+            (float): the probability of sampling x according to the ising distribution
+        """
+
+        if not(hasattr(self, 'Z')):
+            raise Exception('probability requires partition fuction to have been computed')
 
-    def probability(self, spin_configuration: ndarray) -> float:
         return (
-            jnp.exp(-energy(spin_configuration, self.J, self.b, self.J_sparse) / self.T)
+            jnp.exp(-energy(x, self.J, self.b, self.J_sparse) / self.T)
             / self.Z
         )
 
-def generate_isometric_ising(
-    num_samples: int = 100, T: float = 2.5, grid_size: int = 4
-) -> (ndarray, None):
-    num_spins = grid_size * grid_size
-    num_chains = 2
-    num_steps = 1000
-    J = create_isotropic_interaction_matrix(grid_size)
-    model = IsingSpins(num_spins, J, b=1.0, T=T)
-    # Plot the magnetization and energy trajectories for a single T
-    samples = model.sample(num_samples*num_steps, num_chains=num_chains, num_warmup=10000, key=0)
-    return samples[-num_samples:], None
+def generate_ising(N: int,
+                   num_samples: int,
+                   J: jnp.array,
+                   b: jnp.array,
+                   T: float,
+                   sparse=False,
+                   num_chains=1,
+                   thinning=1,
+                   num_warmup=1000,
+                   key=42):
+    r"""
+    Generating function for ising datasets.
+
+    The dataset is generated by sampling an Ising distribution with a specified interaction
+    matrix. The distribution is sampled by Markov chain Monte Carlo using the Metropolis-Hastings
+    algorithm.
+
+    In the case of perfect sampling, a spin configuration s is sampled with probability
+    :math:`p(s)=\exp(-H(s)/T)/Z`, where the energy :math:`H(s)=-\frac{1}{2}\sum_{i,j}J_{ij}s_i s_j-\sum_i b_i s_i`
+    corresponds to an Ising Hamiltonian, Z is the partition function, and configurations s are :math:`\pm1` valued.
+
+    The final sampled configurations are converted from a :math:`\pm1` representation to a binary
+    representation via x = (s+1)//2.
+
+    Note that in order to use parallelization, the number of available cores has to be specified explicitly
+    to numpyro, i.e. the line `numpyro.set_host_device_count(num_cores)` should appear before running the
+    generator, where num_cores is the number of available CPU cores you want to use.
+
+    N (int): Number of spins
+    num_samples (int): total number of samples to generate per chain
+    J (np.array): interaction matrix of shape (N,N)
+    b (np.array): bias array of shape (N,)
+    T (float): temperature
+    num_chains (int): number of chains, defaults to 1.
+    thinning (int): how much to thin the sampling. e.g. if thinning = 10 a sample will be drawn after each
+        10 steps of mcmc sampling. Larger numbers result in more unbiased samples.
+    num_warmup (int): number of mcmc 'burn in' steps to perform before collecting any samples.
+    key (int): random seed used to initialize sampling.
+    sparse (bool): If true, J is converted to a sparse representation (faster for sparse Hamiltonians)
+
+    Returns:
+        Array of data samples and None (since there are no labels)
+    """
+
+    sampler = IsingSpins(N, J, b, T, sparse=sparse, compute_partition_fn=False)
+    samples = sampler.sample(num_samples, num_chains=num_chains, thinning=thinning, num_warmup=num_warmup, key=key)
+    return samples, None
diff --git a/src/qml_benchmarks/data/spin_blobs.py b/src/qml_benchmarks/data/spin_blobs.py
@@ -16,27 +16,23 @@
 
 import numpy as np
 
-
 class RandomSpinBlobs:
-    """Generate spin configurations with high probabilites for certain spins.
-
-    The dataset is generated by creating random spin samples close to a few
-    chosen `peak_spin` configurations of dimension `N` with each spin having
-    the possible values 0 or 1. We can vary the `peak_probabilities` parameter
-    to create data with different modes, where some samples will have higher
-    probabilities allowing us to study the effects of imbalance in the data.
+    """
+    Class object used to generate spin blob datasets: a binary analog of the
+    'gaussian blobs' dataset, in which bitstrings are sampled close in Hamming
+    distance to a set of specified configurations.
 
-    Samples are generated by selecting one of the peak spin configurations
-    distributed according `peak_probabilities`, and then by flipping some of the
-    spins. The number of spins that are flipped each time, is drawn from a
-    Binomial distribution bin(`N`, `p`) where `p=1` will flip all the spins
-    and `p=0` will not flip any spins therefore creating very narrow distributions
-    around the peak spins.
+    The dataset is generated by specifying a list of configurations (peak spins)
+    that mark the centre of the 'blobs'. Data points are sampled by choosing one of
+    the peak spins (with probabilities specified by peak probabilities), and then
+    flipping some of the bits. Each bit is flipped with probability specified by
+    p, so that (for small p) datapoints are close in Hamming distance to one of
+    the peak spins.
 
     Args:
         N (int): The number of spins.
         num_blobs (int):
-            The number of blobs or peak probabilities.
+            The number of blobs.
         peak_probabilities (list[float], optional):
             The probability of each spin to be selected. If not specified,
             the probabilities are distributed uniformly.
@@ -56,6 +52,7 @@ def __init__(
         peak_spins: list[np.array] = None,
         p: float = 0.01,
     ) -> None:
+
         self.N = N
         self.num_blobs = num_blobs
 
@@ -122,6 +119,54 @@ def sample(self, num_samples: int, return_labels=False) -> np.array:
         else:
             return samples
 
+def generate_spin_blobs(N: int, num_blobs: int, num_samples:int, peak_probabilities: list[float] = None, peak_spins: list[np.array] = None,
+        p: float = 0.01):
+
+    """
+    Generator function for spin blob datasets: a binary analog of the
+    'gaussian blobs' dataset, in which bitstrings are sampled close in Hamming
+    distance to a set of specified configurations.
+
+    The dataset is generated by specifying a list of configurations (peak spins)
+    that mark the centre of the 'blobs'. Data points are sampled by choosing one of
+    the peak spins (with probabilities specified by peak probabilities), and then
+    flipping some of the bits. Each bit is flipped with probability specified by
+    p, so that (for small p) datapoints are close in Hamming distance to one of
+    the peak spins.
+
+    Args:
+        N (int): The number of spins.
+        num_blobs (int):
+            The number of blobs.
+        num_samples (int): The number of samples to generate.
+        peak_probabilities (list[float], optional):
+            The probability of each spin to be selected. If not specified,
+            the probabilities are distributed uniformly.
+        peak_spins (list[np.array], optional):
+            The peak spin configurations. Selected randomly by default.
+        p (float, optional):
+            The value of the parameter `p` in a Binomial distribution specifying
+            the number of spins that are flipped each time during sampling.
+            Defaults to 0.01.
+
+    Returns:
+        tuple(np.ndarray): Dataset array and label array specifying the peak spin
+            that was used to sample each datapoint.
+    """
+
+    sampler = RandomSpinBlobs(
+        N=N,
+        num_blobs=num_blobs,
+        peak_probabilities=peak_probabilities,
+        peak_spins=peak_spins,
+        p=p,
+    )
+
+    X, y = sampler.sample(num_samples=num_samples, return_labels=True)
+    X = X.reshape(-1, N)
+
+    return X, y
+
 
 def generate_8blobs(
     num_samples: int,