-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_stat.py
123 lines (90 loc) · 3.82 KB
/
get_stat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
"""
Get statistics
"""
import numpy as np
from itertools import product
# from pop_hist_opt import HistOpt, PopData, ClusterCov
from pop_hist_opt_wright_fisher import HistOpt, PopData, ClusterCov
def run_freq_bayes(cl_tag, dist_tag, pop_tag):
"""
Run optimisation in particular cluster and species type, under particular hypothesis
:param cl_tag: tag of cluster
:param dist_tag: tag of hypothesis
:param pop_tag: desi/kabuli tag
:return:
"""
seeds = [239, 30, 123] # Seeds for two chains
for iseed in seeds:
np.random.seed(seed=iseed) # Set random seed
file_ndata = path_data + 'n_data_{}.txt'.format(pop_tag) # file with counts of alternative allele
file_qdata = path_data + 'q_data_{}.txt'.format(pop_tag) # file with total allele counts
file_cov = path_cov + '{}_cov_{}.txt'.format(cl_tag, dist_tag) # file with covariance matrix
data = PopData(file_qdata, file_ndata) # Dataset
mx_cov = ClusterCov(file_cov) # Covariance matrix
opt_hist = HistOpt(mx_cov, data, ab_flag=True, ilr_flag=True) # Optimizer
path_res = path_mcmc + '{}_{}_{}_{}/'.format(cl_tag, dist_tag, pop_tag, iseed) # Path for MCMC
result_sum = 0
result_ll = 0
mean_f_a = []
for snp_id in range(opt_hist.n_snp):
# print(snp_id)
k1 = opt_hist.q[:, snp_id]
n1 = opt_hist.n[:, snp_id]
# In the number of alternative alleles in all locations is zero - return
if sum(k1) == 0:
mean_f_a += [0]
continue
if sum(k1) == sum(n1):
mean_f_a += [1]
continue
if (opt_hist.n_loc - sum(k1 == 0)) <= 2:
mean_f_a += [k1.sum() / n1.sum()]
continue
if (opt_hist.n_loc - sum(k1 == n1)) <= 2:
mean_f_a += [k1.sum() / n1.sum()]
continue
x0 = np.loadtxt(path_res + 'mean' + str(snp_id) + '.txt')
if x0[0] == 0:
print('No optimisation', snp_id)
f_a = 1 / (1 + np.exp(x0[0]))
mean_f_a += [f_a]
res = opt_hist.logprob(x0, snp_id)
result_ll += res[0]
x_pop = x0[0] + x0[1:(opt_hist.n_loc + 1)]
res = opt_hist.likelihood_snp(snp_id, x_pop)
result_sum += res
np.savetxt(fname=path_estim + '{}_{}_{}_{}'.format(cl_tag, dist_tag, pop_tag, iseed) + '_freq_a.txt',
X=mean_f_a, fmt='%.10f')
#print(len(mean_f_a))
print(cl_tag, dist_tag, pop_tag, result_sum, result_ll, iseed)
# opt_hist.optimise(path_res, n_thr=30) # Run optimiser
# ------------------------------------------------
# PATHS
# ------------------------------------------------
path_pref = 'data/'
path_data = path_pref + 'samples/'
path_cov = path_pref + 'cov_mx/'
path_mcmc = path_pref + 'mcmc/'
path_estim = path_pref + 'estim/' # Path for estimates
# ------------------------------------------------
# TAGS
# ------------------------------------------------
cluster_tags = ['ETHI', 'IND', 'LEB', 'MOR', 'TUR', 'UZB']
dist_tags = ['routes', 'lin']
pop_tags = ['desi', 'kabuli']
# Combinations to skip: we do not have Kabulis in Ethiopid and India
skip_combinations = [('kabuli', 'ETHI', 'lin'),
('kabuli', 'ETHI', 'routes'),
('kabuli', 'IND', 'lin'),
('kabuli', 'IND', 'routes')]
k = 0
pop_tag = pop_tags[k]
cl_tag = cluster_tags[k]
dist_tag = dist_tags[k]
# ------------------------------------------------
# OPTIMISATION
# ------------------------------------------------
for pop_tag, cl_tag, dist_tag in product(pop_tags, cluster_tags, dist_tags):
if (pop_tag, cl_tag, dist_tag) in skip_combinations:
continue
run_freq_bayes(cl_tag, dist_tag, pop_tag)