feat(data): add explicit synthetic protein and region var names (#2739)

scverse · Apr 22, 2024 · 7246fae · 7246fae
1 parent 79bc2d1
commit 7246fae
Show file tree

Hide file tree

Showing 8 changed files with 30 additions and 11 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -32,7 +32,8 @@ to [Semantic Versioning]. Full commit history is available in the
 - Add `load_best_on_end` argument to {class}`scvi.train.SaveCheckpoint` to load the best model
     state at the end of training {pr}`2672`.
 - Add experimental class {class}`scvi.distributions.BetaBinomial` implementing the Beta-Binomial
-    distribution with mean-dispersion parameterization for modeling scBS-seq methylation data {pr}`2692`.
+    distribution with mean-dispersion parameterization for modeling scBS-seq methylation data
+    {pr}`2692`.
 
 #### Changed
 
@@ -60,6 +61,8 @@ to [Semantic Versioning]. Full commit history is available in the
     {func}`scvi.model.base._de_core._de_core` {pr}`2731`.
 - Move {func}`scvi.model.base._utils._fdr_de_prediction` to
     {func}`scvi.model.base._de_core._fdr_de_prediction` {pr}`2731`.
+- {func}`scvi.data.synthetic_iid` now generates unique variable names for protein and
+    accessibility data {pr}`2739`.
 
 #### Removed
 

diff --git a/src/scvi/data/_built_in_data/_synthetic.py b/src/scvi/data/_built_in_data/_synthetic.py
@@ -27,9 +27,12 @@ def _generate_synthetic(
     batch_key: str = "batch",
     labels_key: str = "labels",
     rna_key: str = "rna",
+    gene_names_prefix: str = "gene",
     protein_expression_key: str = "protein_expression",
     protein_names_key: str = "protein_names",
+    protein_names_prefix: str = "protein",
     accessibility_key: str = "accessibility",
+    region_names_prefix: str = "region",
     coordinates_key: str = "coordinates",
 ) -> AnnOrMuData:
     n_obs = batch_size * n_batches
@@ -43,17 +46,19 @@ def sparsify_data(data: np.ndarray):
     mask = np.random.binomial(n=1, p=dropout_ratio, size=(n_obs, n_genes))
     rna = rna * mask
     rna = sparsify_data(rna)
+    gene_names = np.array([f"{gene_names_prefix}_{i}" for i in range(n_genes)])
 
     if n_proteins > 0:
         protein = np.random.negative_binomial(5, 0.3, size=(n_obs, n_proteins))
-        protein_names = np.arange(n_proteins).astype(str)
         protein = sparsify_data(protein)
+        protein_names = np.array([f"{protein_names_prefix}_{i}" for i in range(n_proteins)])
 
     if n_regions > 0:
         accessibility = np.random.negative_binomial(5, 0.3, size=(n_obs, n_regions))
         mask = np.random.binomial(n=1, p=dropout_ratio, size=(n_obs, n_regions))
         accessibility = accessibility * mask
         accessibility = sparsify_data(accessibility)
+        region_names = np.array([f"{region_names_prefix}_{i}" for i in range(n_regions)])
 
     batch = []
     for i in range(n_batches):
@@ -67,6 +72,7 @@ def sparsify_data(data: np.ndarray):
         coords = np.random.normal(size=(n_obs, 2))
 
     adata = AnnData(rna)
+    adata.var_names = gene_names
     if return_mudata:
         mod_dict = {rna_key: adata}
 
@@ -75,7 +81,9 @@ def sparsify_data(data: np.ndarray):
             protein_adata.var_names = protein_names
             mod_dict[protein_expression_key] = protein_adata
         if n_regions > 0:
-            mod_dict[accessibility_key] = AnnData(accessibility)
+            accessibility_adata = AnnData(accessibility)
+            accessibility_adata.var_names = region_names
+            mod_dict[accessibility_key] = accessibility_adata
 
         adata = MuData(mod_dict)
     else:

diff --git a/src/scvi/data/_datasets.py b/src/scvi/data/_datasets.py
@@ -517,6 +517,7 @@ def synthetic_iid(
     sparse_format: str | None = None,
     generate_coordinates: bool = False,
     return_mudata: bool = False,
+    **kwargs,
 ) -> AnnOrMuData:
     """Synthetic multimodal dataset.
 
@@ -600,6 +601,7 @@ def synthetic_iid(
         sparse_format=sparse_format,
         generate_coordinates=generate_coordinates,
         return_mudata=return_mudata,
+        **kwargs,
     )
 
 

diff --git a/tests/data/test_mudata.py b/tests/data/test_mudata.py
@@ -81,7 +81,7 @@ def test_setup_mudata_unpaired():
             index=unpaired_adata.obs_names,
         )
     )
-    mdata.mod["protein"] = anndata.concat([protein_adata, pad_adata])
+    mdata.mod["protein"] = anndata.concat([protein_adata, pad_adata], join="outer")
     mdata.update()
     generic_setup_mudata_manager(mdata, layer_mod="rna", protein_expression_mod="protein")
 

diff --git a/tests/external/cellassign/test_model_cellassign.py b/tests/external/cellassign/test_model_cellassign.py
@@ -12,7 +12,7 @@ def get_test_adata_marker_mat():
     adata.obs["size_factor"] = adata.X.sum(1)
 
     marker_df = pd.DataFrame(data=np.random.randint(2, size=(100, 5)))
-    marker_df.index = marker_df.index.map(str)
+    marker_df.index = pd.Index([f"gene_{i}" for i in range(100)])
 
     return adata, marker_df
 

diff --git a/tests/model/test_models_with_minified_data.py b/tests/model/test_models_with_minified_data.py
@@ -375,13 +375,17 @@ def test_scvi_with_minified_adata_posterior_predictive_sample():
     model.adata.obsm["X_latent_qzv"] = qzv
 
     scvi.settings.seed = 1
-    sample_orig = model.posterior_predictive_sample(indices=[1, 2, 3], gene_list=["1", "2"])
+    sample_orig = model.posterior_predictive_sample(
+        indices=[1, 2, 3], gene_list=["gene_1", "gene_2"]
+    )
 
     model.minify_adata()
     assert model.minified_data_type == ADATA_MINIFY_TYPE.LATENT_POSTERIOR
 
     scvi.settings.seed = 1
-    sample_new = model.posterior_predictive_sample(indices=[1, 2, 3], gene_list=["1", "2"])
+    sample_new = model.posterior_predictive_sample(
+        indices=[1, 2, 3], gene_list=["gene_1", "gene_2"]
+    )
     assert sample_new.shape == (3, 2)
 
     np.testing.assert_array_equal(sample_new.todense(), sample_orig.todense())

diff --git a/tests/model/test_scvi.py b/tests/model/test_scvi.py
@@ -238,10 +238,12 @@ def test_scvi(n_latent: int = 5):
     assert denoised.shape == (3, adata2.n_vars)
     sample = model.posterior_predictive_sample(adata2)
     assert sample.shape == adata2.shape
-    sample = model.posterior_predictive_sample(adata2, indices=[1, 2, 3], gene_list=["1", "2"])
+    sample = model.posterior_predictive_sample(
+        adata2, indices=[1, 2, 3], gene_list=["gene_1", "gene_2"]
+    )
     assert sample.shape == (3, 2)
     sample = model.posterior_predictive_sample(
-        adata2, indices=[1, 2, 3], gene_list=["1", "2"], n_samples=3
+        adata2, indices=[1, 2, 3], gene_list=["gene_1", "gene_2"], n_samples=3
     )
     assert sample.shape == (3, 2, 3)
 

diff --git a/tests/model/test_totalvi.py b/tests/model/test_totalvi.py
@@ -204,7 +204,7 @@ def test_totalvi(save_path):
     assert latent_lib_size.shape == (3, 1)
 
     pro_foreground_prob = model.get_protein_foreground_probability(
-        adata2, indices=[1, 2, 3], protein_list=["1", "2"]
+        adata2, indices=[1, 2, 3], protein_list=["protein_1", "protein_2"]
     )
     assert pro_foreground_prob.shape == (3, 2)
     model.posterior_predictive_sample(adata2)
@@ -429,7 +429,7 @@ def test_totalvi_mudata():
     assert latent_lib_size.shape == (3, 1)
 
     pro_foreground_prob = model.get_protein_foreground_probability(
-        mdata2, indices=[1, 2, 3], protein_list=["1", "2"]
+        mdata2, indices=[1, 2, 3], protein_list=["gene_1", "gene_2"]
     )
     assert pro_foreground_prob.shape == (3, 2)
     model.posterior_predictive_sample(mdata2)