Skip to content

Commit 2d01c59

Browse files
committed
example for genee using sgkit
1 parent b95d459 commit 2d01c59

File tree

1 file changed

+50
-4
lines changed

1 file changed

+50
-4
lines changed

sgkit/stats/genee.py

+50-4
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import warnings
2+
13
import dask.array as da
24
import numpy as np
35
import pandas as pd
@@ -16,7 +18,9 @@ def genee(ds: Dataset, ld: ArrayLike, *, reg_covar: float = 0.000001) -> DataFra
1618
Parameters
1719
----------
1820
ds
19-
Dataset containing beta values (OLS betas or regularized betas).
21+
Dataset containing marginal variant effect values in the variable "beta"
22+
(OLS betas or regularized betas), and window definitions corresponding to
23+
genomic regions of interest, e.g. genes.
2024
ld
2125
Dense 2D array corresponding to the LD matrix.
2226
reg_covar
@@ -30,6 +34,41 @@ def genee(ds: Dataset, ld: ArrayLike, *, reg_covar: float = 0.000001) -> DataFra
3034
the first mixture component with the largest variance if it is composed
3135
of more than 50% of the SNPs.
3236
37+
Examples
38+
--------
39+
40+
>>> import numpy as np
41+
>>> import xarray as xr
42+
>>> from sgkit.stats.genee import genee
43+
>>> from sgkit.stats.association import gwas_linear_regression
44+
>>> from sgkit.stats.ld import ld_matrix
45+
>>> from sgkit.testing import simulate_genotype_call_dataset
46+
>>> n_variant, n_sample, n_contig, n_covariate, n_trait, seed = 100, 50, 2, 3, 5, 0
47+
>>> rs = np.random.RandomState(seed)
48+
>>> ds = simulate_genotype_call_dataset(n_variant=n_variant, n_sample=n_sample, n_contig=n_contig, seed=seed)
49+
>>> ds_pheno = xr.DataArray(np.random.rand(n_sample), dims=['samples'], name='phenotype')
50+
>>> ds = ds.assign(phenotype=ds_pheno)
51+
>>> ds["call_dosage"] = ds.call_genotype.sum(dim="ploidy")
52+
>>> ds = gwas_linear_regression(ds, dosage="call_dosage", add_intercept=True, traits=["phenotype"], covariates=[])
53+
>>> ds = ds.rename({"variant_linreg_beta": "beta"})
54+
# genee cannot handle the return value of gwas_linear_regression, we need to
55+
# convert it to a dataset with a single variant dimension
56+
>>> ds_beta = xr.DataArray(np.random.rand(n_variant), dims=['variants'], name='beta')
57+
>>> ds = ds.assign(beta=ds_beta)
58+
>>> gene_start = np.array([0, 30])
59+
>>> gene_stop = np.array([20, 40])
60+
>>> gene_contig = np.array([0, 1])
61+
# genes are windows in this simple example
62+
>>> ds["window_contig"] = (["windows"], gene_contig)
63+
>>> ds["window_start"] = (["windows"], gene_start)
64+
>>> ds["window_stop"] = (["windows"], gene_stop)
65+
# this only works as long as the windows on different chromosomes are non-overlapping
66+
# ld_matrix looses all information about contigs
67+
# is there a way to do this properly in chunks and have genee work with the chunks?
68+
>>> ld_temp = ld_matrix(ds).compute().pivot(index="i", columns="j", values="value").fillna(-1).to_numpy()
69+
>>> ld = (ld_temp + ld_temp.T) / 2
70+
>>> df = genee(ds, ld).compute()
71+
3372
Returns
3473
-------
3574
A dataframe containing the following fields:
@@ -38,6 +77,8 @@ def genee(ds: Dataset, ld: ArrayLike, *, reg_covar: float = 0.000001) -> DataFra
3877
- ``q_var``: test variance
3978
- ``pval``: p-value
4079
80+
Each value corresponds to a window in the input dataset.
81+
4182
References
4283
----------
4384
[1] - W. Cheng, S. Ramachandran, and L. Crawford (2020).
@@ -83,7 +124,7 @@ def genee_EM(betas, reg_covar=0.000001):
83124

84125
covars = best_gmm.covariances_.squeeze()
85126
if best_gmm.n_components == 1: # pragma: no cover
86-
epsilon_effect = covars[0]
127+
epsilon_effect = covars[0] if (covars.ndim > 0) else covars
87128
else:
88129
# TODO: handle case where first component composed more than 50% SNPs
89130
# https://github.com/ramachandran-lab/genee/blob/a357a956241df93f16e07664e24f3aeac65f4177/genee/R/genee_EM.R#L28-L29
@@ -132,8 +173,13 @@ def genee_test(gene, ld, betas, epsilon_effect):
132173
test_statistics = betas_g.T @ betas_g
133174
t_var = np.diag((ld_g * epsilon_effect) @ (ld_g * epsilon_effect)).sum()
134175

135-
p_value_g = compute_p_values(e_values, test_statistics)
136-
p_value_g = ensure_positive_real(p_value_g)
176+
if all(e_values == 0.0):
177+
warnings.warn("All eigenvalues of the given gene LD matrix are zero. Cannot compute p-value.",
178+
UserWarning)
179+
p_value_g = np.real_if_close(np.nan)
180+
else:
181+
p_value_g = compute_p_values(e_values, test_statistics)
182+
p_value_g = ensure_positive_real(p_value_g)
137183

138184
return test_statistics.squeeze().item(), t_var, p_value_g.squeeze().item()
139185

0 commit comments

Comments
 (0)