1
+ import warnings
2
+
1
3
import dask .array as da
2
4
import numpy as np
3
5
import pandas as pd
@@ -16,7 +18,9 @@ def genee(ds: Dataset, ld: ArrayLike, *, reg_covar: float = 0.000001) -> DataFra
16
18
Parameters
17
19
----------
18
20
ds
19
- Dataset containing beta values (OLS betas or regularized betas).
21
+ Dataset containing marginal variant effect values in the variable "beta"
22
+ (OLS betas or regularized betas), and window definitions corresponding to
23
+ genomic regions of interest, e.g. genes.
20
24
ld
21
25
Dense 2D array corresponding to the LD matrix.
22
26
reg_covar
@@ -30,6 +34,41 @@ def genee(ds: Dataset, ld: ArrayLike, *, reg_covar: float = 0.000001) -> DataFra
30
34
the first mixture component with the largest variance if it is composed
31
35
of more than 50% of the SNPs.
32
36
37
+ Examples
38
+ --------
39
+
40
+ >>> import numpy as np
41
+ >>> import xarray as xr
42
+ >>> from sgkit.stats.genee import genee
43
+ >>> from sgkit.stats.association import gwas_linear_regression
44
+ >>> from sgkit.stats.ld import ld_matrix
45
+ >>> from sgkit.testing import simulate_genotype_call_dataset
46
+ >>> n_variant, n_sample, n_contig, n_covariate, n_trait, seed = 100, 50, 2, 3, 5, 0
47
+ >>> rs = np.random.RandomState(seed)
48
+ >>> ds = simulate_genotype_call_dataset(n_variant=n_variant, n_sample=n_sample, n_contig=n_contig, seed=seed)
49
+ >>> ds_pheno = xr.DataArray(np.random.rand(n_sample), dims=['samples'], name='phenotype')
50
+ >>> ds = ds.assign(phenotype=ds_pheno)
51
+ >>> ds["call_dosage"] = ds.call_genotype.sum(dim="ploidy")
52
+ >>> ds = gwas_linear_regression(ds, dosage="call_dosage", add_intercept=True, traits=["phenotype"], covariates=[])
53
+ >>> ds = ds.rename({"variant_linreg_beta": "beta"})
54
+ # genee cannot handle the return value of gwas_linear_regression, we need to
55
+ # convert it to a dataset with a single variant dimension
56
+ >>> ds_beta = xr.DataArray(np.random.rand(n_variant), dims=['variants'], name='beta')
57
+ >>> ds = ds.assign(beta=ds_beta)
58
+ >>> gene_start = np.array([0, 30])
59
+ >>> gene_stop = np.array([20, 40])
60
+ >>> gene_contig = np.array([0, 1])
61
+ # genes are windows in this simple example
62
+ >>> ds["window_contig"] = (["windows"], gene_contig)
63
+ >>> ds["window_start"] = (["windows"], gene_start)
64
+ >>> ds["window_stop"] = (["windows"], gene_stop)
65
+ # this only works as long as the windows on different chromosomes are non-overlapping
66
+ # ld_matrix looses all information about contigs
67
+ # is there a way to do this properly in chunks and have genee work with the chunks?
68
+ >>> ld_temp = ld_matrix(ds).compute().pivot(index="i", columns="j", values="value").fillna(-1).to_numpy()
69
+ >>> ld = (ld_temp + ld_temp.T) / 2
70
+ >>> df = genee(ds, ld).compute()
71
+
33
72
Returns
34
73
-------
35
74
A dataframe containing the following fields:
@@ -38,6 +77,8 @@ def genee(ds: Dataset, ld: ArrayLike, *, reg_covar: float = 0.000001) -> DataFra
38
77
- ``q_var``: test variance
39
78
- ``pval``: p-value
40
79
80
+ Each value corresponds to a window in the input dataset.
81
+
41
82
References
42
83
----------
43
84
[1] - W. Cheng, S. Ramachandran, and L. Crawford (2020).
@@ -83,7 +124,7 @@ def genee_EM(betas, reg_covar=0.000001):
83
124
84
125
covars = best_gmm .covariances_ .squeeze ()
85
126
if best_gmm .n_components == 1 : # pragma: no cover
86
- epsilon_effect = covars [0 ]
127
+ epsilon_effect = covars [0 ] if ( covars . ndim > 0 ) else covars
87
128
else :
88
129
# TODO: handle case where first component composed more than 50% SNPs
89
130
# https://github.com/ramachandran-lab/genee/blob/a357a956241df93f16e07664e24f3aeac65f4177/genee/R/genee_EM.R#L28-L29
@@ -132,8 +173,13 @@ def genee_test(gene, ld, betas, epsilon_effect):
132
173
test_statistics = betas_g .T @ betas_g
133
174
t_var = np .diag ((ld_g * epsilon_effect ) @ (ld_g * epsilon_effect )).sum ()
134
175
135
- p_value_g = compute_p_values (e_values , test_statistics )
136
- p_value_g = ensure_positive_real (p_value_g )
176
+ if all (e_values == 0.0 ):
177
+ warnings .warn ("All eigenvalues of the given gene LD matrix are zero. Cannot compute p-value." ,
178
+ UserWarning )
179
+ p_value_g = np .real_if_close (np .nan )
180
+ else :
181
+ p_value_g = compute_p_values (e_values , test_statistics )
182
+ p_value_g = ensure_positive_real (p_value_g )
137
183
138
184
return test_statistics .squeeze ().item (), t_var , p_value_g .squeeze ().item ()
139
185
0 commit comments