docs: update sysvi docs and images (#3225)
@Hrovatin

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
ori-kron-wis and pre-commit-ci[bot] authored Mar 2, 2025
1 parent d28a67f commit 859cc1e
Showing 10 changed files with 168 additions and 19 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -22,7 +22,8 @@ to [Semantic Versioning]. Full commit history is available in the
{class}`~scvi.external.CellAssign` and {class}`~scvi.external.GIMVI`. {pr}`3121`.
- Add {class}`scvi.external.RESOLVI` for bias correction in single-cell resolved spatial
transcriptomics {pr}`3144`.
- Add semisupervised training mixin class
{class}`scvi.model.base.SemisupervisedTrainingMixin`. {pr}`3164`.
- Add scib-metrics support for {class}`scvi.autotune.AutotuneExperiment` and
  {class}`scvi.train._callbacks.ScibCallback` for autotuning with scib metrics {pr}`3168`.
- Add support for dask arrays in AnnTorchDataset. {pr}`3193`.
5 changes: 3 additions & 2 deletions LICENSE
@@ -1,7 +1,8 @@
BSD 3-Clause License

Copyright (c) 2024, Adam Gayoso, Romain Lopez, Martin Kim, Pierre Boyeau, Nir Yosef
Copyright (c) 2025, scverse®
Copyright (c) 2025, The scvi-tools development team

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
10 changes: 5 additions & 5 deletions README.md
@@ -97,10 +97,12 @@ You can cite the scverse publication as follows:
>
> _Nature Biotechnology_ 2023 Apr 10. doi: [10.1038/s41587-023-01733-8](https://doi.org/10.1038/s41587-023-01733-8).
[//]: # (numfocus-fiscal-sponsor-attribution)
scvi-tools is part of the scverse® project ([website](https://scverse.org),
[governance](https://scverse.org/about/roles)) and is fiscally sponsored by [NumFOCUS](https://numfocus.org/).

If you like scverse® and want to support our mission, please consider making a tax-deductible
[donation](https://numfocus.org/donate-to-scverse) to help the project pay for developer time,
professional services, travel, workshops, and a variety of other needs.

<div align="center">
<a href="https://numfocus.org/project/scverse">
@@ -123,12 +125,10 @@ If you like scverse® and want to support our mission, please consider making a
[docs-badge]: https://readthedocs.org/projects/scvi/badge/?version=latest
[docs-link]: https://scvi.readthedocs.io/en/stable/?badge=stable
[documentation]: https://docs.scvi-tools.org/
[donation]: https://numfocus.org/donate-to-scverse
[forum]: https://discourse.scvi-tools.org
[gh-stars-badge]: https://img.shields.io/github/stars/scverse/scvi-tools?style=flat&logo=GitHub&color=blue
[gh-stars-link]: https://github.com/scverse/scvi-tools/stargazers
[issues]: https://github.com/scverse/scvi-tools/issues
[numfocus]: https://numfocus.org/
[pepy-badge]: https://static.pepy.tech/badge/scvi-tools
[pepy-link]: https://pepy.tech/project/scvi-tools
[pypi-badge]: https://img.shields.io/pypi/v/scvi-tools.svg
2 binary image files changed (not displayable in this view).
147 changes: 140 additions & 7 deletions docs/user_guide/models/sysvi.md
@@ -1,20 +1,153 @@
# SysVI

**SysVI** (cross-SYStem Variational Inference,
Python class {class}`~scvi.external.SysVI`)
is a representation learning model that can remove substantial batch effects.

The advantages of SysVI are:

- Improved integration: works well on datasets with **substantial batch effects**
  (e.g. cross-species or organoid-tissue integration), where other models often fail,
  while providing a good tradeoff between batch correction and preservation of
  cell-type and sub-cell-type biological variation.
- Tunable integration: the **integration strength is directly tunable**
  via the cycle-consistency loss weight.
- Generally applicable: the model operates on
  **approximately normally distributed data**
  (e.g. normalized and log+1 transformed scRNA-seq data), which makes it
  applicable beyond scRNA-seq.
- Scalable: can integrate very large datasets, especially when using a GPU.

The limitations of SysVI include:

- Weak batch effects: for datasets with **small batch effects**
  (e.g. multiple subjects from a single laboratory) we recommend using scVI instead,
  as it has slightly higher biological preservation in this setting.
  To determine whether a dataset has substantial batch effects,
  please refer to our paper.
- Model selection: the best performance is achieved by
  **selecting the best model** from multiple
  runs with a few different cycle-consistency loss weights and random seed
  initialisations, as explained in the tutorial.
  However, we provide **defaults** that produce decent results in
  many settings.


```{topic} Tutorials:
- {doc}`/tutorials/notebooks/scrna/sysVI`
```

```{topic} References:
- Paper: Hrovatin and Moinfar, et al.
Integrating single-cell RNA-seq datasets with substantial batch effects.
bioRxiv (2023): https://doi.org/10.1101/2023.11.03.565463
- Talk on caveats of scRNA-seq integration and strategies for removing
substantial batch effects: https://www.youtube.com/watch?v=i-a4BjAn90E
```

## Method background

The model is based on a variational autoencoder (VAE), with the integrated
representation corresponding to the latent space embedding of the cells.

### Stronger batch correction with cycle-consistency loss

Vanilla VAEs struggle to achieve strong batch correction without losing
substantial biological variation. This issue arises because the VAE loss
does not directly penalize the presence of batch covariate information in the
latent space.
Instead, conditional VAEs assume that batch covariate information will be
omitted from the limited-capacity latent space because it is separately
injected into the decoder, making its presence in the latent space
"unnecessary" for reconstruction (Hrovatin and Moinfar, 2023).

To achieve stronger integration than vanilla VAEs, SysVI employs a
cycle-consistency loss in the latent space. In particular, the model embeds a cell
from one system (i.e. the covariate representing the substantial batch effect)
into the latent space and then decodes it using another category of the system
covariate. In this way it generates a biologically identical cell with a
different batch effect. The generated cell is then likewise embedded into the
latent space, and the distance between the embeddings of the original and
the switched-batch cell is computed. The model is trained to minimize this distance.

:::{figure} figures/sysvi_cycleconsistency.png
:align: center
:alt: Cycle consistency loss used to increase batch correction in SysVI.
:class: img-fluid
:::
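
This procedure can be sketched with toy linear stand-ins for the encoder and decoder (a minimal illustration of the loss idea only, not the actual SysVI implementation; all names and shapes below are hypothetical):

```python
import numpy as np

rng = np.random.default_rng(0)

# Toy stand-ins for the encoder and decoder of a conditional VAE.
# W_enc maps expression -> latent; W_dec maps (latent + system one-hot) -> expression.
n_genes, n_latent, n_systems = 50, 10, 2
W_enc = rng.normal(size=(n_genes, n_latent)) * 0.1
W_dec = rng.normal(size=(n_latent + n_systems, n_genes)) * 0.1

def encode(x):
    return x @ W_enc

def decode(z, system):
    one_hot = np.eye(n_systems)[system]
    return np.concatenate([z, one_hot]) @ W_dec

# A cell originating from system 0.
x = rng.normal(size=n_genes)

# 1) Embed the cell, 2) decode it as if it came from system 1,
# 3) re-embed the switched-system cell, 4) penalize the latent distance.
z = encode(x)
x_switched = decode(z, system=1)
z_cycle = encode(x_switched)
cycle_loss = np.mean((z - z_cycle) ** 2)  # added to the VAE loss with a tunable weight
```

Increasing the weight on `cycle_loss` in the total objective increases batch correction, which is the knob referred to under "Tunable integration" above.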

Benefits of this approach:
- As only cells with identical biological background are compared, this method
retains good biological preservation even when removing
substantial batch effects. This distinguishes it from alternative approaches
that compare cells with different biological backgrounds
(e.g. via adversarial loss; see Hrovatin and Moinfar (2023) for details).
- The integration strength can be directly tuned via the cycle-consistency
loss weight.

### Improved biological preservation via the VampPrior

Vanilla VAEs employ a standard normal prior to regularize the latent space.
However, this prior is very restrictive and can lead to the loss of
important biological variation in the latent space.

Instead, we use the
VampPrior ([Tomczak, 2017](https://doi.org/10.48550/arXiv.1705.07120)),
which permits a more expressive latent space. The VampPrior is a multi-modal
prior whose mode positions are learned during training.

:::{figure} figures/sysvi_vampprior.png
:align: center
:alt: VampPrior used to increase the preservation of biological variation in SysVI.
:class: img-fluid
:::
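
The VampPrior density can be sketched as follows (illustrative only; in a real model the pseudo-inputs and encoder are learned jointly, whereas here fixed toy values stand in for the encoded pseudo-inputs):

```python
import numpy as np

rng = np.random.default_rng(0)

# The VampPrior is a mixture of variational posteriors evaluated at K
# learned pseudo-inputs u_k:  p(z) = (1/K) * sum_k q(z | u_k).
n_latent, n_components = 5, 4

# Toy posterior parameters per pseudo-input (in a real model these come from
# passing the learned pseudo-inputs through the encoder).
means = rng.normal(size=(n_components, n_latent))
log_vars = rng.normal(size=(n_components, n_latent)) * 0.1

def log_normal_diag(z, mean, log_var):
    # Log density of a diagonal Gaussian, summed over latent dimensions.
    return np.sum(-0.5 * (log_var + np.log(2 * np.pi)
                          + (z - mean) ** 2 / np.exp(log_var)))

def vamp_prior_log_density(z):
    # log p(z) = logsumexp_k [log q(z | u_k)] - log K
    component_log_probs = np.array(
        [log_normal_diag(z, means[k], log_vars[k]) for k in range(n_components)]
    )
    m = component_log_probs.max()
    return m + np.log(np.exp(component_log_probs - m).sum()) - np.log(n_components)

z = rng.normal(size=n_latent)
log_p = vamp_prior_log_density(z)
```

Because the mode positions (pseudo-inputs) are learned, the prior can place density wherever the data requires it, rather than forcing all cells toward a single Gaussian mode.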

Benefits of this approach:
- A more expressive latent space leads to increased preservation of
  biological variability.
- The VampPrior was more robust with respect to the number of modes than the
  better-known Gaussian mixture prior.

### Application flexibility due to using normally distributed inputs

Many scRNA-seq integration models are specially designed to work with
scRNA-seq data, e.g. raw counts that follow a negative binomial distribution.
However, as a consequence, these models cannot be directly used for other
types of data.

We observed that this specialised setup is not strictly required for
representation learning: SysVI is designed for normally distributed data,
yet performs competitively with the more specialised models
on scRNA-seq data.
To make scRNA-seq data approximately normally distributed, we preprocess it via
size-factor normalization and log+1 transformation.
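
This preprocessing can be sketched in plain NumPy (with scanpy one would typically call `sc.pp.normalize_total` followed by `sc.pp.log1p`; the toy matrix below is purely illustrative):

```python
import numpy as np

# Toy raw count matrix: cells x genes.
counts = np.array([
    [10, 0, 5],
    [2, 8, 0],
    [4, 4, 4],
], dtype=float)

# Size-factor normalization: scale each cell so that its total count matches
# the median total count across cells.
totals = counts.sum(axis=1, keepdims=True)
target = np.median(totals)
normalized = counts / totals * target

# log+1 transformation makes the data approximately normally distributed,
# which is the kind of input SysVI expects.
log_data = np.log1p(normalized)
```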

Thus, SysVI could also be applied to other types of normally distributed data.
However, we did not specifically test its performance on other data types.

## Other tips & tricks for data integration

Besides the benefits of the SysVI model, our paper
([Hrovatin and Moinfar, 2023](https://doi.org/10.1101/2023.11.03.565463))
and
[talk](https://www.youtube.com/watch?v=i-a4BjAn90E)
provide additional advice on scRNA-seq integration that applies beyond SysVI.
The two most important insights are:
- Try to make the **integration task as easy for the model** as possible.
  This means that, where possible, data should be pre-processed in a way that
  already eliminates some of the batch differences:
  - Use the intersection of HVGs across batches with substantial batch effects
    (e.g. the systems).
  - Mitigate known technical artefacts, such as ambient gene expression
    ([Hrovatin and Sikkema, 2024](https://doi.org/10.1038/s41592-024-02532-y)).
- Ensure that **the metrics used to evaluate integration are of high quality**:
  - They should be able to capture the key properties required for downstream tasks.
    For example, the standard cell-type based biological preservation metrics do
    not assess whether subtler biological differences, such as within-cell-type
    disease effects, are preserved.
  - Be cautious of potential biases in integration metric scores:
    the scores may not directly correspond to the desired data property,
    may be influenced by other factors, or
    may be tricked by certain models.
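
The HVG-intersection advice above can be sketched as follows (illustrative pure Python with hypothetical gene names; in practice the per-system HVG sets would come from an HVG-selection method such as scanpy's `highly_variable_genes` run separately per system):

```python
# Hypothetical per-system HVG sets; in a real workflow these come from an
# HVG-selection method applied to each system separately.
hvgs_per_system = {
    "system_1": {"GeneA", "GeneB", "GeneC", "GeneD"},
    "system_2": {"GeneB", "GeneC", "GeneD", "GeneE"},
}

# Keep only genes that are highly variable in every system, so the model does
# not need to reconcile genes that vary in only one batch.
shared_hvgs = set.intersection(*hvgs_per_system.values())
```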
2 changes: 1 addition & 1 deletion src/scvi/model/base/__init__.py
@@ -23,6 +23,7 @@
"RNASeqMixin",
"VAEMixin",
"UnsupervisedTrainingMixin",
"SemisupervisedTrainingMixin",
"PyroSviTrainMixin",
"PyroSampleMixin",
"PyroJitGuideWarmup",
@@ -32,5 +33,4 @@
"BaseMinifiedModeModelClass",
"BaseMudataMinifiedModeModelClass",
"EmbeddingMixin",
"SemisupervisedTrainingMixin",
]
9 changes: 7 additions & 2 deletions src/scvi/model/base/_training_mixin.py
@@ -162,6 +162,8 @@ def train(


class SemisupervisedTrainingMixin:
"""General purpose semisupervised train, predict and interpretability methods."""

_training_plan_cls = SemiSupervisedTrainingPlan

def _set_indices_and_labels(self):
@@ -416,7 +418,10 @@ def get_ranked_genes(
Parameters
----------
adata
AnnData or MuData object that has been registered via corresponding setup
method in model class.
attrs: numpy.ndarray
Attributions matrix.
Returns
Expand All @@ -426,7 +431,7 @@ def get_ranked_genes(
Examples
--------
>>> attrs_df = model.get_ranked_genes(attrs)
"""
if attrs is None:
Warning("Missing Attributions matrix")
9 changes: 8 additions & 1 deletion src/scvi/train/__init__.py
@@ -1,4 +1,10 @@
from ._callbacks import JaxModuleInit, LoudEarlyStopping, SaveBestState, SaveCheckpoint
from ._callbacks import (
JaxModuleInit,
LoudEarlyStopping,
SaveBestState,
SaveCheckpoint,
ScibCallback,
)
from ._constants import METRIC_KEYS
from ._trainer import Trainer
from ._trainingplans import (
@@ -24,6 +30,7 @@
"LoudEarlyStopping",
"SaveBestState",
"SaveCheckpoint",
"ScibCallback",
"JaxModuleInit",
"JaxTrainingPlan",
"METRIC_KEYS",
2 changes: 2 additions & 0 deletions src/scvi/train/_callbacks.py
@@ -387,6 +387,8 @@ def on_train_start(self, trainer, pl_module):


class ScibCallback(Callback):
"""A callback to initialize the Scib-Metrics autotune module."""

def __init__(
self,
):
