Skip to content

Commit

Permalink
feat(data): add explicit synthetic protein and region var names (#2739)
Browse files Browse the repository at this point in the history
  • Loading branch information
martinkim0 authored Apr 22, 2024
1 parent 79bc2d1 commit 7246fae
Show file tree
Hide file tree
Showing 8 changed files with 30 additions and 11 deletions.
5 changes: 4 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ to [Semantic Versioning]. Full commit history is available in the
- Add `load_best_on_end` argument to {class}`scvi.train.SaveCheckpoint` to load the best model
state at the end of training {pr}`2672`.
- Add experimental class {class}`scvi.distributions.BetaBinomial` implementing the Beta-Binomial
distribution with mean-dispersion parameterization for modeling scBS-seq methylation data {pr}`2692`.
distribution with mean-dispersion parameterization for modeling scBS-seq methylation data
{pr}`2692`.

#### Changed

Expand Down Expand Up @@ -60,6 +61,8 @@ to [Semantic Versioning]. Full commit history is available in the
{func}`scvi.model.base._de_core._de_core` {pr}`2731`.
- Move {func}`scvi.model.base._utils._fdr_de_prediction` to
{func}`scvi.model.base._de_core._fdr_de_prediction` {pr}`2731`.
- {func}`scvi.data.synthetic_iid` now generates unique, prefixed variable names for gene
(`gene_{i}`), protein (`protein_{i}`), and accessibility (`region_{i}`) data {pr}`2739`.

#### Removed

Expand Down
12 changes: 10 additions & 2 deletions src/scvi/data/_built_in_data/_synthetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,12 @@ def _generate_synthetic(
batch_key: str = "batch",
labels_key: str = "labels",
rna_key: str = "rna",
gene_names_prefix: str = "gene",
protein_expression_key: str = "protein_expression",
protein_names_key: str = "protein_names",
protein_names_prefix: str = "protein",
accessibility_key: str = "accessibility",
region_names_prefix: str = "region",
coordinates_key: str = "coordinates",
) -> AnnOrMuData:
n_obs = batch_size * n_batches
Expand All @@ -43,17 +46,19 @@ def sparsify_data(data: np.ndarray):
mask = np.random.binomial(n=1, p=dropout_ratio, size=(n_obs, n_genes))
rna = rna * mask
rna = sparsify_data(rna)
gene_names = np.array([f"{gene_names_prefix}_{i}" for i in range(n_genes)])

if n_proteins > 0:
protein = np.random.negative_binomial(5, 0.3, size=(n_obs, n_proteins))
protein_names = np.arange(n_proteins).astype(str)
protein = sparsify_data(protein)
protein_names = np.array([f"{protein_names_prefix}_{i}" for i in range(n_proteins)])

if n_regions > 0:
accessibility = np.random.negative_binomial(5, 0.3, size=(n_obs, n_regions))
mask = np.random.binomial(n=1, p=dropout_ratio, size=(n_obs, n_regions))
accessibility = accessibility * mask
accessibility = sparsify_data(accessibility)
region_names = np.array([f"{region_names_prefix}_{i}" for i in range(n_regions)])

batch = []
for i in range(n_batches):
Expand All @@ -67,6 +72,7 @@ def sparsify_data(data: np.ndarray):
coords = np.random.normal(size=(n_obs, 2))

adata = AnnData(rna)
adata.var_names = gene_names
if return_mudata:
mod_dict = {rna_key: adata}

Expand All @@ -75,7 +81,9 @@ def sparsify_data(data: np.ndarray):
protein_adata.var_names = protein_names
mod_dict[protein_expression_key] = protein_adata
if n_regions > 0:
mod_dict[accessibility_key] = AnnData(accessibility)
accessibility_adata = AnnData(accessibility)
accessibility_adata.var_names = region_names
mod_dict[accessibility_key] = accessibility_adata

adata = MuData(mod_dict)
else:
Expand Down
2 changes: 2 additions & 0 deletions src/scvi/data/_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,6 +517,7 @@ def synthetic_iid(
sparse_format: str | None = None,
generate_coordinates: bool = False,
return_mudata: bool = False,
**kwargs,
) -> AnnOrMuData:
"""Synthetic multimodal dataset.
Expand Down Expand Up @@ -600,6 +601,7 @@ def synthetic_iid(
sparse_format=sparse_format,
generate_coordinates=generate_coordinates,
return_mudata=return_mudata,
**kwargs,
)


Expand Down
2 changes: 1 addition & 1 deletion tests/data/test_mudata.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def test_setup_mudata_unpaired():
index=unpaired_adata.obs_names,
)
)
mdata.mod["protein"] = anndata.concat([protein_adata, pad_adata])
mdata.mod["protein"] = anndata.concat([protein_adata, pad_adata], join="outer")
mdata.update()
generic_setup_mudata_manager(mdata, layer_mod="rna", protein_expression_mod="protein")

Expand Down
2 changes: 1 addition & 1 deletion tests/external/cellassign/test_model_cellassign.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def get_test_adata_marker_mat():
adata.obs["size_factor"] = adata.X.sum(1)

marker_df = pd.DataFrame(data=np.random.randint(2, size=(100, 5)))
marker_df.index = marker_df.index.map(str)
marker_df.index = pd.Index([f"gene_{i}" for i in range(100)])

return adata, marker_df

Expand Down
8 changes: 6 additions & 2 deletions tests/model/test_models_with_minified_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,13 +375,17 @@ def test_scvi_with_minified_adata_posterior_predictive_sample():
model.adata.obsm["X_latent_qzv"] = qzv

scvi.settings.seed = 1
sample_orig = model.posterior_predictive_sample(indices=[1, 2, 3], gene_list=["1", "2"])
sample_orig = model.posterior_predictive_sample(
indices=[1, 2, 3], gene_list=["gene_1", "gene_2"]
)

model.minify_adata()
assert model.minified_data_type == ADATA_MINIFY_TYPE.LATENT_POSTERIOR

scvi.settings.seed = 1
sample_new = model.posterior_predictive_sample(indices=[1, 2, 3], gene_list=["1", "2"])
sample_new = model.posterior_predictive_sample(
indices=[1, 2, 3], gene_list=["gene_1", "gene_2"]
)
assert sample_new.shape == (3, 2)

np.testing.assert_array_equal(sample_new.todense(), sample_orig.todense())
Expand Down
6 changes: 4 additions & 2 deletions tests/model/test_scvi.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,10 +238,12 @@ def test_scvi(n_latent: int = 5):
assert denoised.shape == (3, adata2.n_vars)
sample = model.posterior_predictive_sample(adata2)
assert sample.shape == adata2.shape
sample = model.posterior_predictive_sample(adata2, indices=[1, 2, 3], gene_list=["1", "2"])
sample = model.posterior_predictive_sample(
adata2, indices=[1, 2, 3], gene_list=["gene_1", "gene_2"]
)
assert sample.shape == (3, 2)
sample = model.posterior_predictive_sample(
adata2, indices=[1, 2, 3], gene_list=["1", "2"], n_samples=3
adata2, indices=[1, 2, 3], gene_list=["gene_1", "gene_2"], n_samples=3
)
assert sample.shape == (3, 2, 3)

Expand Down
4 changes: 2 additions & 2 deletions tests/model/test_totalvi.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ def test_totalvi(save_path):
assert latent_lib_size.shape == (3, 1)

pro_foreground_prob = model.get_protein_foreground_probability(
adata2, indices=[1, 2, 3], protein_list=["1", "2"]
adata2, indices=[1, 2, 3], protein_list=["protein_1", "protein_2"]
)
assert pro_foreground_prob.shape == (3, 2)
model.posterior_predictive_sample(adata2)
Expand Down Expand Up @@ -429,7 +429,7 @@ def test_totalvi_mudata():
assert latent_lib_size.shape == (3, 1)

pro_foreground_prob = model.get_protein_foreground_probability(
mdata2, indices=[1, 2, 3], protein_list=["1", "2"]
mdata2, indices=[1, 2, 3], protein_list=["gene_1", "gene_2"]
)
assert pro_foreground_prob.shape == (3, 2)
model.posterior_predictive_sample(mdata2)
Expand Down

0 comments on commit 7246fae

Please sign in to comment.