diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2fde65420c..7abb10f97f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,7 +22,8 @@ to [Semantic Versioning]. Full commit history is available in the
     {class}`~scvi.external.CellAssign` and {class}`~scvi.external.GIMVI`. {pr}`3121`.
 - Add {class}`scvi.external.RESOLVI` for bias correction in single-cell resolved spatial
     transcriptomics {pr}`3144`.
-- Add semisupervised training mixin class {class}`scvi.model.base.SemisupervisedTrainingMixin` {pr}`3164`.
+- Add semisupervised training mixin class
+    {class}`scvi.model.base.SemisupervisedTrainingMixin`. {pr}`3164`.
 - Add scib-metrics support for {class}`scvi.autotune.AutotuneExperiment` and
     {class}`scvi.train._callbacks.ScibCallback` for autotune for scib metrics {pr}`3168`.
 - Add Support of dask arrays in AnnTorchDataset. {pr}`3193`.
diff --git a/LICENSE b/LICENSE
index b7af094f79..6042364f48 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,7 +1,8 @@
 BSD 3-Clause License

-Copyright (c) 2024, Adam Gayoso, Romain Lopez, Martin Kim, Pierre Boyeau, Nir Yosef
-Copyright (c) 2025, scverse®
+Copyright (c) 2025, The scvi-tools development team
+
+All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
diff --git a/README.md b/README.md
index f9a413ef83..ff21e88ba6 100644
--- a/README.md
+++ b/README.md
@@ -97,10 +97,12 @@ You can cite the scverse publication as follows:
 >
 > _Nature Biotechnology_ 2023 Apr 10. doi: [10.1038/s41587-023-01733-8](https://doi.org/10.1038/s41587-023-01733-8).

-[//]: # (numfocus-fiscal-sponsor-attribution)
+scvi-tools is part of the scverse® project ([website](https://scverse.org),
+[governance](https://scverse.org/about/roles)) and is fiscally sponsored by [NumFOCUS](https://numfocus.org/).

-scvi-tools is part of the scverse® project ([website](https://scverse.org), [governance](https://scverse.org/about/roles)) and is fiscally sponsored by [NumFOCUS](https://numfocus.org/).
-If you like scverse® and want to support our mission, please consider making a tax-deductible [donation](https://numfocus.org/donate-to-scverse) to help the project pay for developer time, professional services, travel, workshops, and a variety of other needs.
+If you like scverse® and want to support our mission, please consider making a tax-deductible
+[donation](https://numfocus.org/donate-to-scverse) to help the project pay for developer time,
+professional services, travel, workshops, and a variety of other needs.
@@ -123,12 +125,10 @@ If you like scverse® and want to support our mission, please consider making a
 [docs-badge]: https://readthedocs.org/projects/scvi/badge/?version=latest
 [docs-link]: https://scvi.readthedocs.io/en/stable/?badge=stable
 [documentation]: https://docs.scvi-tools.org/
-[donation]: https://numfocus.org/donate-to-scverse
 [forum]: https://discourse.scvi-tools.org
 [gh-stars-badge]: https://img.shields.io/github/stars/scverse/scvi-tools?style=flat&logo=GitHub&color=blue
 [gh-stars-link]: https://github.com/scverse/scvi-tools/stargazers
 [issues]: https://github.com/scverse/scvi-tools/issues
-[numfocus]: https://numfocus.org/
 [pepy-badge]: https://static.pepy.tech/badge/scvi-tools
 [pepy-link]: https://pepy.tech/project/scvi-tools
 [pypi-badge]: https://img.shields.io/pypi/v/scvi-tools.svg
diff --git a/docs/user_guide/models/figures/sysvi_cycleconsistency.png b/docs/user_guide/models/figures/sysvi_cycleconsistency.png
new file mode 100644
index 0000000000..4d5a2a7997
Binary files /dev/null and b/docs/user_guide/models/figures/sysvi_cycleconsistency.png differ
diff --git a/docs/user_guide/models/figures/sysvi_vampprior.png b/docs/user_guide/models/figures/sysvi_vampprior.png
new file mode 100644
index 0000000000..22c195d0dd
Binary files /dev/null and b/docs/user_guide/models/figures/sysvi_vampprior.png differ
diff --git a/docs/user_guide/models/sysvi.md b/docs/user_guide/models/sysvi.md
index 249cd140b1..d346b53bb0 100644
--- a/docs/user_guide/models/sysvi.md
+++ b/docs/user_guide/models/sysvi.md
@@ -1,20 +1,153 @@
 # SysVI

-:::{note}
-This page is under construction.
-:::
-
-**sysVI** (Python class {class}`~scvi.external.SysVI`) is a ...
+**sysVI** (cross-SYStem Variational Inference,
+Python class {class}`~scvi.external.SysVI`)
+is a representation learning model that can remove substantial batch effects.

 The advantages of SysVI are:

-- ...
+- Improved integration: works well on datasets with **substantial batch effects**
+(e.g., cross-species or organoid-tissue integration), where other models often fail,
+and provides a good tradeoff between batch correction and preservation of
+cell-type and sub-cell-type biological variation.
+- Tunable integration: The **integration strength is directly tunable**
+via the cycle-consistency loss weight.
+- Generally applicable: The model operates on
+**approximately normally distributed data**
+(e.g. normalized and log1p-transformed scRNA-seq data), which makes it
+applicable beyond scRNA-seq.
+- Scalable: Can integrate very large datasets, especially when using a GPU.

 The limitations of SysVI include:

-- ...
+- Weak batch effects: For datasets with **small batch effects**
+(e.g. multiple subjects from a single laboratory) we recommend using scVI instead,
+as it retains slightly more biological variation in this setting.
+To determine whether a dataset has substantial batch effects,
+please refer to our paper.
+- Model selection: The best performance is achieved by
+**selecting the best model** from multiple
+runs with a few different cycle-consistency loss weights and random seed
+initialisations, as explained in the tutorial and sketched below.
+However, we provide **defaults** that produce decent results in
+many settings.
+
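+A minimal sketch of such a selection loop is shown below. It assumes that the
+cycle-consistency weight can be passed through `plan_kwargs` (the keyword
+`z_distance_cycle_weight` is illustrative; check the {class}`~scvi.external.SysVI`
+documentation for the exact name) and that `score_integration` is a
+user-supplied evaluation function, e.g. built on scib-metrics.
+
+```python
+import scvi
+from scvi.external import SysVI
+
+results = []
+for weight in (2.0, 5.0, 10.0):  # a few cycle-consistency loss weights
+    for seed in (0, 1):  # a few random seed initialisations
+        scvi.settings.seed = seed
+        model = SysVI(adata)  # adata already registered via SysVI.setup_anndata
+        model.train(plan_kwargs={"z_distance_cycle_weight": weight})
+        results.append((weight, seed, score_integration(model)))
+best_weight, best_seed, _ = max(results, key=lambda r: r[2])  # keep the best run
+```
+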
 ```{topic} Tutorials:

 - {doc}`/tutorials/notebooks/scrna/sysVI`
 ```
+
+```{topic} References:
+
+- Paper: Hrovatin and Moinfar, et al.
+Integrating single-cell RNA-seq datasets with substantial batch effects.
+bioRxiv (2023): https://doi.org/10.1101/2023.11.03.565463
+- Talk on caveats of scRNA-seq integration and strategies for removing
+substantial batch effects: https://www.youtube.com/watch?v=i-a4BjAn90E
+```
+
+## Method background
+
+The model is based on a variational autoencoder (VAE), with the integrated
+representation corresponding to the latent-space embedding of the cells.
+
+### Stronger batch correction with cycle-consistency loss
+
+Vanilla VAEs struggle to achieve strong batch correction without losing
+substantial biological variation. This issue arises because the VAE loss
+does not directly penalize the presence of batch-covariate information in the
+latent space.
+Instead, conditional VAEs assume that batch-covariate information will be
+omitted from the limited-capacity latent space because it is separately
+injected into the decoder, making its presence in the latent space
+"unnecessary" for reconstruction (Hrovatin and Moinfar, 2023).
+
+To achieve stronger integration than vanilla VAEs, SysVI employs a
+cycle-consistency loss in the latent space. In particular, the model embeds a cell
+from one system (i.e. the covariate representing the substantial batch effect)
+into the latent space and then decodes it using another category of the system covariate.
+In this way, it generates a biologically identical cell with a
+different batch effect. The generated cell is then likewise embedded into the
+latent space, and the distance between the embeddings of the original and
+the switched-batch cell is computed. The model is trained to minimize this
+distance; a conceptual sketch follows the figure and the list of benefits below.
+
+:::{figure} figures/sysvi_cycleconsistency.png
+:align: center
+:alt: Cycle-consistency loss used to increase batch correction in SysVI.
+:class: img-fluid
+:::
+
+Benefits of this approach:
+- As only cells with identical biological background are compared, this method
+retains good biological preservation even when removing
+substantial batch effects. This distinguishes it from alternative approaches
+that compare cells with different biological backgrounds
+(e.g. via an adversarial loss; see Hrovatin and Moinfar (2023) for details).
+- The integration strength can be directly tuned via the cycle-consistency
+loss weight.
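+
+The following is a conceptual sketch of this penalty, not the exact
+scvi-tools implementation; `encode` and `decode` stand in for the model's
+encoder (returning the latent mean) and decoder:
+
+```python
+import torch
+
+
+def cycle_consistency_loss(encode, decode, x, batch_src, batch_tgt):
+    z = encode(x, batch_src)  # embed the original cell
+    x_switch = decode(z, batch_tgt)  # decode into the other system
+    z_cycle = encode(x_switch, batch_tgt)  # re-embed the switched-batch cell
+    return torch.mean((z - z_cycle) ** 2)  # the distance minimized in training
+```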
+
+### Improved biological preservation via the VampPrior
+
+Vanilla VAEs employ a standard-normal prior to regularize the latent space.
+However, this prior is very restrictive and can lead to the loss of
+important biological variation in the latent space.
+
+Instead, we use the
+VampPrior ([Tomczak, 2017](https://doi.org/10.48550/arXiv.1705.07120)),
+which permits a more expressive latent space. The VampPrior is a multi-modal
+prior whose mode positions are learned during training (a rough sketch of its
+density is given below).
+
+:::{figure} figures/sysvi_vampprior.png
+:align: center
+:alt: VampPrior used to increase the preservation of biological variation in SysVI.
+:class: img-fluid
+:::
+
+Benefits of this approach:
+- A more expressive latent space leads to increased preservation of
+biological variability.
+- The VampPrior was more robust with respect to the number of modes than the
+better-known Gaussian mixture prior.
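+
+As a rough sketch (assuming uniform mode weights; the actual implementation
+may differ), the VampPrior evaluates the latent density as a mixture of
+encoder outputs at `K` learnable pseudo-inputs:
+
+```python
+import torch
+
+
+def vamp_prior_log_prob(z, pseudo_means, pseudo_stds):
+    # pseudo_means/pseudo_stds (K x latent_dim): encoder outputs for the K
+    # learnable pseudo-inputs; z (n_cells x latent_dim): latent samples.
+    comp = torch.distributions.Normal(pseudo_means, pseudo_stds)
+    log_q = comp.log_prob(z.unsqueeze(1)).sum(-1)  # (n_cells, K)
+    k = torch.tensor(float(pseudo_means.shape[0]))
+    return torch.logsumexp(log_q, dim=1) - torch.log(k)  # uniform mixture
+```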
+
+### Application flexibility due to using normally distributed inputs
+
+Many scRNA-seq integration models are specially designed to work with
+scRNA-seq data, e.g. raw counts, which follow a negative binomial distribution.
+However, because of this, these models cannot be directly used for other
+types of data.
+
+We observed that for representation learning this specialised setup is not
+strictly required: SysVI is designed for normally distributed data, yet
+performs competitively in comparison to the more specialised models
+on scRNA-seq data.
+To make scRNA-seq data approximately normally distributed, we preprocess it via
+size-factor normalization and log1p transformation, as sketched below.
+
+Thus, SysVI could also be applied to other types of normally distributed data.
+However, we did not specifically test its performance on other data types.
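+
+A minimal end-to-end sketch of this preprocessing followed by integration;
+the obs key for the system covariate and the use of `batch_key` to register it
+are illustrative, so check the {class}`~scvi.external.SysVI` documentation and
+tutorial for the exact setup:
+
+```python
+import scanpy as sc
+from scvi.external import SysVI
+
+adata.layers["counts"] = adata.X.copy()  # keep raw counts around
+sc.pp.normalize_total(adata)  # size-factor normalization
+sc.pp.log1p(adata)  # log1p transformation -> approximately normal data
+
+SysVI.setup_anndata(adata, batch_key="system")
+model = SysVI(adata)
+model.train()
+adata.obsm["X_sysvi"] = model.get_latent_representation()  # integrated embedding
+```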
+
+## Other tips & tricks for data integration
+
+Besides the benefits of the SysVI model, our paper
+([Hrovatin and Moinfar, 2023](https://doi.org/10.1101/2023.11.03.565463))
+and
+[talk](https://www.youtube.com/watch?v=i-a4BjAn90E)
+provide additional advice on scRNA-seq integration that applies beyond SysVI.
+The two most important insights are:
+- Try to make the **integration task as easy for the model** as possible.
+This means that data should be pre-processed in a way that already eliminates
+some of the batch differences, where possible:
+  - Use the intersection of HVGs across batches with substantial batch effects
+    (e.g. the systems).
+  - Mitigate known technical artefacts, such as ambient gene expression
+    ([Hrovatin and Sikkema, 2024](https://doi.org/10.1038/s41592-024-02532-y)).
+- Ensure that **the metrics used to evaluate integration are of high quality**:
+  - They should capture the key properties required for downstream tasks.
+    For example, the standard cell-type-based biological preservation metrics do
+    not assess whether subtler biological differences, such as within-cell-type
+    disease effects, are preserved.
+  - Be cautious of potential biases within integration metric scores:
+    they may not directly correspond to the desired data property,
+    may be influenced by other factors, or
+    may be tricked by certain models.
diff --git a/src/scvi/model/base/__init__.py b/src/scvi/model/base/__init__.py
index 97f10675d6..569d7de8d8 100644
--- a/src/scvi/model/base/__init__.py
+++ b/src/scvi/model/base/__init__.py
@@ -23,6 +23,7 @@
     "RNASeqMixin",
     "VAEMixin",
     "UnsupervisedTrainingMixin",
+    "SemisupervisedTrainingMixin",
     "PyroSviTrainMixin",
     "PyroSampleMixin",
     "PyroJitGuideWarmup",
@@ -32,5 +33,4 @@
     "BaseMinifiedModeModelClass",
     "BaseMudataMinifiedModeModelClass",
     "EmbeddingMixin",
-    "SemisupervisedTrainingMixin",
 ]
diff --git a/src/scvi/model/base/_training_mixin.py b/src/scvi/model/base/_training_mixin.py
index e803473c34..637e276101 100644
--- a/src/scvi/model/base/_training_mixin.py
+++ b/src/scvi/model/base/_training_mixin.py
@@ -162,6 +162,8 @@ def train(


 class SemisupervisedTrainingMixin:
+    """General-purpose semisupervised train, predict, and interoperability methods."""
+
     _training_plan_cls = SemiSupervisedTrainingPlan

     def _set_indices_and_labels(self):
@@ -416,7 +418,10 @@ def get_ranked_genes(

         Parameters
         ----------
-        attr: numpy.ndarray
+        adata
+            AnnData or MuData object that has been registered via the
+            corresponding setup method of the model class.
+        attrs: numpy.ndarray
             Attributions matrix.

         Returns
@@ -426,7 +431,7 @@

         Examples
         --------
-        >>> attrs_df = interpreter.get_ranked_genes(attrs)
+        >>> attrs_df = model.get_ranked_genes(attrs)
         """
         if attrs is None:
             Warning("Missing Attributions matrix")
diff --git a/src/scvi/train/__init__.py b/src/scvi/train/__init__.py
index 96fa9ba5ff..d0e3d8e2ae 100644
--- a/src/scvi/train/__init__.py
+++ b/src/scvi/train/__init__.py
@@ -1,4 +1,10 @@
-from ._callbacks import JaxModuleInit, LoudEarlyStopping, SaveBestState, SaveCheckpoint
+from ._callbacks import (
+    JaxModuleInit,
+    LoudEarlyStopping,
+    SaveBestState,
+    SaveCheckpoint,
+    ScibCallback,
+)
 from ._constants import METRIC_KEYS
 from ._trainer import Trainer
 from ._trainingplans import (
@@ -24,6 +30,7 @@
     "LoudEarlyStopping",
     "SaveBestState",
     "SaveCheckpoint",
+    "ScibCallback",
     "JaxModuleInit",
     "JaxTrainingPlan",
     "METRIC_KEYS",
diff --git a/src/scvi/train/_callbacks.py b/src/scvi/train/_callbacks.py
index 81ccb58033..8fc1641c9a 100644
--- a/src/scvi/train/_callbacks.py
+++ b/src/scvi/train/_callbacks.py
@@ -387,6 +387,8 @@ def on_train_start(self, trainer, pl_module):


 class ScibCallback(Callback):
+    """A callback to initialize the Scib-Metrics autotune module."""
+
     def __init__(
         self,
     ):