diff --git a/examples/02_meta-analyses/12_compare_cbmr_and_cbma.py b/examples/02_meta-analyses/12_compare_cbmr_and_cbma.py
new file mode 100644
index 000000000..37e4e9c26
--- /dev/null
+++ b/examples/02_meta-analyses/12_compare_cbmr_and_cbma.py
@@ -0,0 +1,226 @@
+"""
+
+.. _metas_cbmr_vs_cbma:
+
+================================================================
+Compare coordinate-based meta-regression and meta-analysis methods
+================================================================
+
+A comparison between coordinate-based meta-regression (CBMR) and
+coordinate-based meta-analysis (CBMA) in NiMARE
+
+CBMR is a generative framework to approximate smooth activation intensity function and investigate
+the effect of study-level moderators (e.g., year of pubilication, sample size, subtype of stimuli).
+It allows flexible statistical inference for either spatial homogeneity tests or group comparison
+tests. Additionally, it's a computationally efficient approach with good statistical
+interpretability to model the locations of activation foci.
+
+This tutorial is intended to provide an intuitive comparison of CBMA and MKDA results on
+neurosynth dataset.
+
+For more detailed introduction to CBMR implementation in NiMARE, see the `CBMR tutoral
+<https://nimare.readthedocs.io/en/latest/auto_examples/02_meta-analyses/11_plot_cbmr.html>`_ and
+`documatation <https://nimare.readthedocs.io/en/latest/generated/nimare.meta.cbmr.html>`_.
+
+"""
+import os
+
+from nimare.extract import download_abstracts, fetch_neurosynth
+from nimare.io import convert_neurosynth_to_dataset
+from nimare.meta import models
+from nilearn.plotting import plot_stat_map
+
+###############################################################################
+# Download the Neurosynth Dataset
+# -----------------------------------------------------------------------------
+# Neurosynth is a large-scale functional magnetic resonance imaing (fMRI) database.
+# There are currently 507891 activations reported in 14371 studies in the Neurosynth
+# database, with interactive, downloadable meta-analyses of 1334 terms. There is also
+# a `platform <https://neurosynth.org/>`_ designed for automated synthesis of fMRI data.
+
+out_dir = os.path.abspath("../example_data/")
+os.makedirs(out_dir, exist_ok=True)
+
+files = fetch_neurosynth(
+    data_dir=out_dir,
+    version="7",
+    overwrite=False,
+    source="abstract",
+    vocab="terms",
+)
+# Note that the files are saved to a new folder within "out_dir" named "neurosynth".
+neurosynth_db = files[0]
+
+neurosynth_dset = convert_neurosynth_to_dataset(
+    coordinates_file=neurosynth_db["coordinates"],
+    metadata_file=neurosynth_db["metadata"],
+    annotations_files=neurosynth_db["features"],
+)
+neurosynth_dset.save(os.path.join(out_dir, "neurosynth_dataset.pkl.gz"))
+
+neurosynth_dset = download_abstracts(neurosynth_dset, "example@example.edu")
+neurosynth_dset.save(os.path.join(out_dir, "neurosynth_dataset_with_abstracts.pkl.gz"))
+
+###############################################################################
+# For term-based meta-analyses, we split the whole Neurosynth dataset into two subsets,
+# one including all studies in the Neurosynth database whose abstracts include the term
+# at least once, the other including all the remaining studies. Here, we will conduct
+# meta-analyses based on the term "pain", and explore the spatial convergence between
+# pain studies and other fMRI studies.
+
+# extract study_id for pain dataset and non-pain dataset
+all_study_id = neurosynth_dset.annotations["id"]
+pain_study_id = neurosynth_dset.get_studies_by_label(labels=["terms_abstract_tfidf__pain"])
+non_pain_study_id = list(set(list(all_study_id)) - set(pain_study_id))  # 13855 studies
+# add an additional column for group
+neurosynth_dset.annotations.loc[all_study_id.isin(pain_study_id), "group"] = "pain"
+neurosynth_dset.annotations.loc[all_study_id.isin(non_pain_study_id), "group"] = "non_pain"
+
+###############################################################################
+# Estimation of group-specific spatial intensity functions
+# -----------------------------------------------------------------------------
+# Now we are going to run CBMR framework on the Neurosynth Dataset and estimate
+# spatial intensity functions for both pain studies and non-pain fMRI studies.
+
+from nimare.meta.cbmr import CBMREstimator
+cbmr = CBMREstimator(
+    group_categories="group",
+    moderators=None,
+    spline_spacing=10,  # a reasonable choice is 10 or 5, 100 is for speed
+    model=models.PoissonEstimator,
+    penalty=False,
+    lr=1e-1,
+    tol=1e-2,   # a reasonable choice is 1e-2, 1e3 is for speed
+    device="cpu",  # "cuda" if you have GPU
+)
+results = cbmr.fit(dataset=neurosynth_dset)
+
+###############################################################################
+# Now that we have fitted the model, we can plot the spatial intensity maps.
+
+plot_stat_map(
+    results.get_map("spatialIntensity_group-Pain"),
+    cut_coords=[0, 0, -8],
+    draw_cross=False,
+    cmap="RdBu_r",
+    title="Pain studies",
+    threshold=3e-4,
+    vmax=1e-3,
+)
+plot_stat_map(
+    results.get_map("spatialIntensity_group-Non_pain"),
+    cut_coords=[0, 0, -8],
+    draw_cross=False,
+    cmap="RdBu_r",
+    title="Non-pain fMRI studies",
+    threshold=3e-4,
+    vmax=1e-3,
+)
+
+###############################################################################
+# These two figures correspond to group-specific spatial intensity map of pain group
+# and non-pain group. Areas with stronger spatial intensity are highlighted.
+
+###############################################################################
+# Group-wise tests for spatial homogeneity
+# -----------------------------------------------------------------------------
+# For group-wise spatial homogeneity test, we generate contrast matrix *t_con_groups*
+# by specifying the group names in *create_contrast* function, and generate group-wise
+# p-value and z-score maps for spatial homogeneity tests.
+from nimare.meta.cbmr import CBMRInference
+
+inference = CBMRInference(device="cpu")
+inference.fit(result=results)
+t_con_groups = inference.create_contrast(
+    ["Pain", "Non_pain"], source="groups"
+)
+contrast_result = inference.transform(t_con_groups=t_con_groups)
+
+###############################################################################
+
+# generate z-score maps for group-wise spatial homogeneity test.
+plot_stat_map(
+    contrast_result.get_map("z_group-Pain"),
+    cut_coords=[0, 0, -8],
+    draw_cross=False,
+    cmap="RdBu_r",
+    title="Z-score map for spatial homogeneity test on pain studies",
+    threshold=20,
+    vmax=30,
+)
+
+plot_stat_map(
+    contrast_result.get_map("z_group-Non_pain"),
+    cut_coords=[0, 0, -8],
+    draw_cross=False,
+    cmap="RdBu_r",
+    title="Z-score map for spatial homogeneity test on non-pain fMRI studies",
+    threshold=20,
+    vmax=30,
+)
+
+###############################################################################
+# Group comparison test between pain studies and non-pain fMRI studies
+# -----------------------------------------------------------------------------
+# CBMR framework also allows flexible statistical inference for group comparison
+# between any two or more groups. For example, it's straightforward to generate
+# contrast matrix *t_con_groups* by specifying *contrast_name* as "group1-group2".
+
+inference = CBMRInference(device="cpu")
+inference.fit(result=results)
+t_con_groups = inference.create_contrast(
+    ["Pain-Non_pain"], source="groups"
+)
+contrast_result = inference.transform(t_con_groups=t_con_groups)
+
+###############################################################################
+
+# generate z-statistics maps for each group
+plot_stat_map(
+    contrast_result.get_map("z_group-Pain-Non_pain"),
+    cut_coords=[0, 0, 0],
+    draw_cross=False,
+    cmap="RdBu_r",
+    title="Spatial convergence between pain studies and Non-pain fMRI studies",
+    threshold=6,
+    vmax=20,
+)
+
+###############################################################################
+# This figure (displayed as z-statistics map) shows CBMR group comparison test
+# of spatial intensity between pain studies and non-pain studies in Neurosynth.
+# The null hypothesis assumes spatial intensity estimations of two groups are equal
+# at voxel level, $H_0: \mu_{1j}=\mu_{2j}, j=1,\cdots,N$, where $N$ is number of
+# voxels within brain mask, $j$ is the index of voxel. Areas with significant p-vaules
+# (siginificant difference in spatial intensity estimation between two groups) are
+# highlighted. We found that estimated activation level are significantly different
+# in ... between pain group and non-pain group.
+
+###############################################################################
+# Run MKDA on Neurosynth dataset
+# -----------------------------------------------------------------------------
+# For the purpose of justifying the validity of CBMR framework, we compare the estimated
+# spatial covergence of activation regions between pain studies and non-pain fMRI studies
+# with MKDA.
+
+from nimare.meta.cbma.mkda import MKDAChi2
+
+pain_dset = neurosynth_dset.slice(ids=pain_study_id)
+non_pain_dset = neurosynth_dset.slice(ids=pain_study_id)
+
+meta = MKDAChi2()
+results = meta.fit(pain_dset, non_pain_dset)
+
+plot_stat_map(
+    results.get_map("z_desc-consistency"),
+    cut_coords=[0, 0, -8],
+    draw_cross=False,
+    cmap="RdBu_r",
+    title="MKDA Chi-square analysis between pain studies and non-pain studies",
+    threshold=5,
+)
+
+###############################################################################
+# This figure (displayed as z-statistics map) shows MKDA spatial covergence of
+# activation between pain studies and non-pain fMRI studies. We found the results are
+# very consistent with CBMR approach, with higher specificity but lower sensitivity.
diff --git a/nimare/meta/cbmr.py b/nimare/meta/cbmr.py
index 2dc8b5eee..303873394 100644
--- a/nimare/meta/cbmr.py
+++ b/nimare/meta/cbmr.py
@@ -110,10 +110,10 @@ def __init__(
         spline_spacing=10,
         model=models.PoissonEstimator,
         penalty=False,
-        n_iter=1000,
-        lr=1e-2,
+        n_iter=2000,
+        lr=1,
         lr_decay=0.999,
-        tol=1e-2,
+        tol=1e-9,
         device="cpu",
         **kwargs,
     ):
@@ -359,7 +359,7 @@ def _preprocess_input(self, dataset):
                     n_group_study = len(group_study_id)
                     group_foci_per_study = np.array(
                         [(group_coordinates["study_id"] == i).sum() for i in group_study_id]
-                    )
+                    )  # try groupby
                     group_foci_per_study = group_foci_per_study.reshape((n_group_study, 1))
 
                     foci_per_voxel[group] = group_foci_per_voxel
diff --git a/nimare/meta/models.py b/nimare/meta/models.py
index 85c097ed5..f2a8df30d 100644
--- a/nimare/meta/models.py
+++ b/nimare/meta/models.py
@@ -1,9 +1,7 @@
 """CBMR Models."""
 import abc
-import copy
 import logging
 
-import functorch
 import numpy as np
 import pandas as pd
 import torch
@@ -45,10 +43,10 @@ def __init__(
         spatial_coef_dim=None,
         moderators_coef_dim=None,
         penalty=False,
-        lr=0.1,
+        lr=1,
         lr_decay=0.999,
-        n_iter=1000,
-        tol=1e-2,
+        n_iter=2000,
+        tol=1e-9,
         device="cpu",
     ):
         super().__init__()
@@ -187,47 +185,17 @@ def closure():
             loss.backward()
             return loss
 
-        loss = optimizer.step(closure)
+        optimizer.step(closure)
         scheduler.step()
+        # recalculate the loss function
+        loss = self(coef_spline_bases, moderators, foci_per_voxel, foci_per_study)
+
         if torch.isnan(loss):
             raise ValueError(
                 f"""The current learing rate {str(self.lr)} or choice of model gives rise to
                 NaN log-likelihood, please try Poisson model or adjust learning rate to a smaller
                 value."""
             )
-        # reset the L-BFGS params if NaN appears in coefficient of regression
-        if any(
-            [
-                torch.any(torch.isnan(self.spatial_coef_linears[group].weight))
-                for group in self.groups
-            ]
-        ):
-            if self.iter == 1:  # NaN occurs in the first iteration
-                raise ValueError(
-                    """The current learing rate {str(self.lr)} gives rise to NaN values, adjust
-                    to a smaller value."""
-                )
-            spatial_coef_linears, overdispersion = dict(), dict()
-            for group in self.groups:
-                group_spatial_linear = torch.nn.Linear(
-                    self.spatial_coef_dim, 1, bias=False
-                ).double()
-                group_spatial_linear.weight = torch.nn.Parameter(
-                    self.last_state["spatial_coef_linears." + group + ".weight"]
-                )
-                spatial_coef_linears[group] = group_spatial_linear
-
-                if hasattr(self, "overdispersion"):
-                    group_overdispersion = torch.nn.Parameter(
-                        self.last_state["overdispersion." + group]
-                    )
-                    overdispersion[group] = group_overdispersion
-            self.spatial_coef_linears = torch.nn.ModuleDict(spatial_coef_linears)
-            if hasattr(self, "overdispersion"):
-                self.overdispersion = torch.nn.ParameterDict(overdispersion)
-            LGR.debug("Reset L-BFGS optimizer......")
-        else:
-            self.last_state = copy.deepcopy(self.state_dict())
 
         return loss
 
@@ -247,7 +215,13 @@ def _optimizer(self, coef_spline_bases, moderators_by_group, foci_per_voxel, foc
             Dictionary of group-wise number of foci per study.
         """
         torch.manual_seed(100)
-        optimizer = torch.optim.LBFGS(self.parameters(), self.lr)
+        optimizer = torch.optim.LBFGS(
+            params=self.parameters(),
+            lr=self.lr,
+            max_iter=self.n_iter,
+            tolerance_change=self.tol,
+            line_search_fn="strong_wolfe",
+        )
         # load dataset info to torch.tensor
         coef_spline_bases = torch.tensor(
             coef_spline_bases, dtype=torch.float64, device=self.device
@@ -275,20 +249,14 @@ def _optimizer(self, coef_spline_bases, moderators_by_group, foci_per_voxel, foc
         if self.iter == 0:
             prev_loss = torch.tensor(float("inf"))  # initialization loss difference
 
-        for i in range(self.n_iter):
-            loss = self._update(
-                optimizer,
-                coef_spline_bases,
-                moderators_by_group_tensor,
-                foci_per_voxel_tensor,
-                foci_per_study_tensor,
-                prev_loss,
-            )
-            loss_diff = loss - prev_loss
-            LGR.debug(f"Iter {self.iter:04d}: log-likelihood {loss:.4f}")
-            if torch.abs(loss_diff) < self.tol:
-                break
-            prev_loss = loss
+        self._update(
+            optimizer,
+            coef_spline_bases,
+            moderators_by_group_tensor,
+            foci_per_voxel_tensor,
+            foci_per_study_tensor,
+            prev_loss,
+        )
 
         return
 
@@ -389,7 +357,7 @@ def nll_spatial_coef(group_spatial_coef):
                     **ll_single_group_kwargs,
                 )
 
-            f_spatial_coef = functorch.hessian(nll_spatial_coef)(group_spatial_coef)
+            f_spatial_coef = torch.func.hessian(nll_spatial_coef)(group_spatial_coef)
             f_spatial_coef = f_spatial_coef.reshape((self.spatial_coef_dim, self.spatial_coef_dim))
             cov_spatial_coef = np.linalg.inv(f_spatial_coef.detach().numpy())
             var_spatial_coef = np.diag(cov_spatial_coef)
@@ -423,7 +391,7 @@ def nll_moderators_coef(moderators_coef):
                     **ll_single_group_kwargs,
                 )
 
-            f_moderators_coef = functorch.hessian(nll_moderators_coef)(moderators_coef)
+            f_moderators_coef = torch.func.hessian(nll_moderators_coef)(moderators_coef)
             f_moderators_coef = f_moderators_coef.reshape(
                 (self.moderators_coef_dim, self.moderators_coef_dim)
             )
@@ -560,7 +528,7 @@ def nll_spatial_coef(spatial_coef):
                 **ll_mult_group_kwargs,
             )
 
-        h = functorch.hessian(nll_spatial_coef)(spatial_coef)
+        h = torch.func.hessian(nll_spatial_coef)(spatial_coef)
         h = h.view(n_involved_groups * self.spatial_coef_dim, -1)
 
         return h.detach().cpu().numpy()
@@ -632,7 +600,7 @@ def nll_moderator_coef(moderator_coef):
                 **ll_mult_group_kwargs,
             )
 
-        h = functorch.hessian(nll_moderator_coef)(moderator_coef)
+        h = torch.func.hessian(nll_moderator_coef)(moderator_coef)
         h = h.view(self.moderators_coef_dim, self.moderators_coef_dim)
 
         return h.detach().cpu().numpy()
diff --git a/nimare/tests/test_meta_cbmr.py b/nimare/tests/test_meta_cbmr.py
index 6315f4f72..d77c8f69b 100644
--- a/nimare/tests/test_meta_cbmr.py
+++ b/nimare/tests/test_meta_cbmr.py
@@ -42,8 +42,9 @@ def cbmr_result(testdata_cbmr_simulated, model):
         spline_spacing=100,
         model=model,
         penalty=False,
-        lr=1e-2,
-        tol=1e7,
+        n_iter=1000,
+        lr=1,
+        tol=1e4,
         device="cpu",
     )
     res = cbmr.fit(dataset=dset)
@@ -113,10 +114,10 @@ def test_firth_penalty(testdata_cbmr_simulated):
         group_categories=["diagnosis", "drug_status"],
         moderators=["standardized_sample_sizes", "standardized_avg_age", "schizophrenia_subtype"],
         spline_spacing=100,
-        model=models.ClusteredNegativeBinomialEstimator,
+        model=models.PoissonEstimator,
         penalty=True,
-        lr=1e-1,
-        tol=1e7,
+        lr=1,
+        tol=1e4,
         device="cpu",
     )
     res = cbmr.fit(dataset=dset)
@@ -134,8 +135,8 @@ def test_moderators_none(testdata_cbmr_simulated):
         spline_spacing=100,
         model=models.PoissonEstimator,
         penalty=False,
-        lr=1e-2,
-        tol=1e7,
+        lr=1,
+        tol=1e4,
         device="cpu",
     )
     res = cbmr.fit(dataset=dset)
@@ -162,7 +163,7 @@ def test_CBMREstimator_update(testdata_cbmr_simulated):
     cbmr = CBMREstimator(
         moderators=["standardized_sample_sizes", "standardized_avg_age", "schizophrenia_subtype"],
         model=models.PoissonEstimator,
-        lr=1e-4,
+        lr=1,
     )
 
     cbmr._collect_inputs(testdata_cbmr_simulated, drop_invalid=True)
diff --git a/setup.cfg b/setup.cfg
index ffd73a8e8..85fb92e3a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -57,7 +57,7 @@ install_requires =
     scipy>=1.6.0
     sparse>=0.13.0  # for kernel transformers
     statsmodels!=0.13.2  # this version doesn't install properly
-    torch # for cbmr models
+    torch>=2.0 # for cbmr models
     tqdm  # progress bars throughout package
 packages = find:
 include_package_data = False