"""
Tools to test a species spread hypothesis within a cluster
Input data:
1) covariance matrix between locations -
this covariance matrix is obtained inline with the hypothesis
2.1) counts of alternative allele in each location
2.2) total number number of alleles in each location
"""
import os
import numpy as np
import scipy.stats as st
from functools import partial
from pandas import read_csv
from multiprocess import Pool
from pyhmc import hmc
def ilr(f):
"""
ilr transformation of frequencies
:param f: frequency
:return: ilr-transformation
"""
return np.log((1 - f) / f)
def inv_ilr(x):
"""
    Inverse ilr transformation
:param x: a coordinate within ilr space
:return: a corresponding frequency
"""
return 1 / (1 + np.exp(x))
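# Illustrative sanity check (a minimal sketch, not used by the pipeline):
# the two transforms are mutual inverses on (0, 1), e.g.
#   np.isclose(inv_ilr(ilr(0.3)), 0.3)  # True
#   ilr(0.5) == 0.0                     # True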
class PopData:
"""
The dataset class: everything about SNPs in all locations
"""
def __init__(self, file_qdata, file_ndata):
"""
        :param file_qdata: file with counts of the alternative allele in each location
        :param file_ndata: file with the total number of alleles in each location
"""
        self.q = read_csv(file_qdata, sep='\t', index_col=0)  # counts of the alternative allele
        self.n = read_csv(file_ndata, sep='\t', index_col=0)  # total allele counts
        self.loc_name = list(self.q.index)  # names of locations
        self.n_loc, self.n_snp = self.q.shape  # number of locations and SNPs
class ClusterCov:
"""
Covariance matrix between locations based on the hypothesis
"""
def __init__(self, file_cov):
"""
:param file_cov: File with covariance matrix
"""
        self.cov = read_csv(file_cov, sep='\t', index_col=0)  # covariance matrix
        self.loc_name = list(self.cov.index)  # names of locations
        self.n_loc = self.cov.shape[0]  # number of locations
        # We normalise the covariance matrix by the mean of its diagonal elements.
        # The diagonal elements are the variances of each location relative to the ancestral state,
        # which, in our linear/routes model, can be considered as
        # the lengths of branches of the binary tree from the root to the leaves.
        # To normalise these distances, we divide all variances and covariances
        # by the mean distance from the root to the leaves.
self.cov /= np.mean(np.diag(self.cov))
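        # Illustrative example of this normalisation (not from the original code):
        # a 2x2 matrix [[2, 1], [1, 4]] has mean diagonal element 3 and becomes
        # [[2/3, 1/3], [1/3, 4/3]].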
class HistOpt:
"""
Estimator of alternative allele frequencies within each location and in the ancestral state
"""
def __init__(self, mx_cov: ClusterCov, data: PopData, ab_flag=False, ilr_flag=True):
"""
:param mx_cov: covariance matrix
:param data: the dataset
        :param ab_flag: whether a prior is used for the frequency in the ancestral state
:param ilr_flag: whether to use the ilr transformation
"""
self.ab_flag = ab_flag
self.ilr_flag = ilr_flag
# shared names of locations in covariance matrix and the dataset
self.loc_name = [y for y in mx_cov.loc_name
if y in data.loc_name]
# Dataset
        self.q = data.q.loc[self.loc_name, :]  # alt. allele counts
        self.n = data.n.loc[self.loc_name, :]  # total allele counts
        self.q = self.q.to_numpy()
        self.n = self.n.to_numpy()
self.n_loc = len(self.loc_name) # number of locations
self.n_snp = data.n_snp # number of SNPs
        # freq_anc, freq_pop = self.set_init_freqs()  # sample frequencies of alternative alleles
        # self.x_anc = ilr(freq_anc)  # ilr-transformed frequency in the ancestral state
        # self.x_pop = ilr(freq_pop)  # ilr-transformed frequencies in locations
# # Covariance matrix
self.m_c = mx_cov.cov.loc[self.loc_name, self.loc_name]
        self.m_c = self.m_c.to_numpy()
self.m_c_inv = np.linalg.inv(self.m_c) # Inverse covariance matrix
# # Priors for proportionality coefficients
# self.sigma = 0.01 * np.ones(self.n_snp)
# self.sigma_mean = 0.001
# self.sigma_var = 0.0001
# # Priors for frequency in the ancestral state
# if self.ab_flag:
# self.a = np.ones(self.n_snp)
# self.b = np.ones(self.n_snp)
# self.step = 0.0001
# mcmc chain
# self.chain_freq_opt = []
# self.count = 0
def optimise(self, path_data, n_thr=2):
"""
Optimisation of parameters
:param path_data: directory where to save the data
:param n_thr: number of threads
:return:
"""
if not os.path.exists(path_data): # make dir
os.makedirs(path_data)
# self.load_init(freqs_init) # load initial dataset
        func = partial(self.optimize_snp_new, path_chain=path_data)
        with Pool(n_thr) as workers:  # run the optimisation for each SNP separately
            pmap = workers.map
            pmap(func, range(self.n_snp))
def optimize_snp_new(self, snp_id, path_chain):
"""
        Optimisation of parameters for a single SNP by Hamiltonian Monte Carlo
:param snp_id: Id of SNP
:param path_chain: path to save MCMC chain
"""
# snp_id = 6
# print(snp_id)
k1 = self.q[:, snp_id]
n1 = self.n[:, snp_id]
        # If the number of alternative alleles across all locations is zero - return
        if sum(k1) == 0:
            return
        # If the alternative allele is fixed in all locations - return
        if sum(k1) == sum(n1):
            return
        # Skip if the alternative allele is present in two locations or fewer
        if (self.n_loc - sum(k1 == 0)) <= 2:
            # print('Skip zero', path_chain, snp_id)
            return
        # Skip if the alternative allele is fixed in all but two locations or fewer
        if (self.n_loc - sum(k1 == n1)) <= 2:
            # print('Skip non-zero', path_chain, snp_id)
            return
# niter = 50000
# samples = hmc(self.logprob, x0=[0] + list(np.random.uniform(-1, 1, self.n_loc)) + [0.1] + [1] * 2,
# args=(snp_id,), n_samples=2000, epsilon=0.1)
# print(samples[-1])
# samples = hmc(self.logprob, x0=samples[-1],
# args=(snp_id,), n_samples=niter, epsilon=0.2)
# print(list(zip(k1, n1)))
        # Burn-in: shrink the leapfrog step size until at least a quarter of
        # the draws are unique (i.e. the acceptance rate is reasonable)
        niter = 5000
        samples = [[k1.sum() / n1.sum()] + list(np.random.uniform(-1, 1, self.n_loc)) + [0.1] + [1] * 2]
        print(self.logprob(np.array(samples[-1]), snp_id))
        eps = 0.11
        while len(samples) < (niter / 4) and eps > 0:
            eps = eps - 0.01
            samples = hmc(self.logprob, x0=samples[-1],
                          args=(snp_id,), n_samples=niter, epsilon=eps)
            _, idx = np.unique(samples, axis=0, return_index=True)
            samples = samples[np.sort(idx)]
            print(len(samples), eps)
# Find epsilon after burn-in
niter = 5000
eps = 0.18
x0 = samples[-1]
samples = []
while len(samples) < (niter / 4) and eps > 0:
if eps > 0.019:
eps -= 0.01
else:
eps = eps * 0.7
samples = hmc(self.logprob,
x0=x0,
args=(snp_id,), n_samples=niter, epsilon=eps)
_, idx = np.unique(samples, axis=0, return_index=True)
samples = samples[np.sort(idx)]
print(len(samples), eps)
niter = 100000
if eps > 0:
samples = hmc(self.logprob,
x0=samples[-1],
args=(snp_id,), n_samples=niter, epsilon=eps)
_, idx = np.unique(samples, axis=0, return_index=True)
samples = samples[np.sort(idx)]
else:
print('Bad epsilon')
print('Len', path_chain, snp_id, len(samples), eps)
if len(samples) < niter/4:
print('Low number of samples', path_chain, snp_id)
        np.savetxt(fname=os.path.join(path_chain, 'snp' + str(snp_id) + '.txt'),
                   X=samples, fmt='%.10f')
        np.savetxt(fname=os.path.join(path_chain, 'eps' + str(snp_id) + '.txt'),
                   X=[eps], fmt='%.10f')
        # Thin the chain and save posterior means and variances
        n_thin = 10
        samples = samples[::n_thin]
        np.savetxt(fname=os.path.join(path_chain, 'mean' + str(snp_id) + '.txt'),
                   X=samples.mean(0), fmt='%.10f')
        if 0 in samples.var(0):
            print('Bad News', snp_id)
        np.savetxt(fname=os.path.join(path_chain, 'var' + str(snp_id) + '.txt'),
                   X=samples.var(0), fmt='%.10f')
def likelihood_snp(self, snp_id, x_pop):
"""
Lod-likelihood of the data in one SNP: Binomial distribution
:param snp_id: SNP id
:return: Binimoal log-likelood
"""
# p = 0
# for loc_id in range(self.n_loc):
# if self.n[loc_id, snp_id] > 0:
#
# freq_pop = 1 / (1 + np.exp(self.x_pop[loc_id, snp_id]))
# p = st.binom.logpmf(n=self.n[loc_id, snp_id],
# k=self.q[loc_id, snp_id],
# p=freq_pop)
# print(p)
# return p
freq_pop = 1 / (1 + np.exp(x_pop))
tmp = self.q[:, snp_id] * np.log(freq_pop) + \
(self.n[:, snp_id] - self.q[:, snp_id]) * np.log(1 - freq_pop)
# print(tmp)
return tmp.sum()
# ==========================================================================
# WORK WITH FREQUENCIES and ILR COORDINATES
# ==========================================================================
# def set_init_freqs(self):
# """
#
# :return:
# """
# q_sum = reduce(lambda x, y: x + y, self.q)
# n_sum = reduce(lambda x, y: x + y, self.n)
#
# freq_mean = q_sum / n_sum * 0 + 0.5
# freq_anc = freq_mean
# freq_pop = np.tile(freq_mean, (self.n_loc, 1))
#
# return freq_anc, freq_pop
# ==========================================================================
# HAMILTONIAN MONTE CARLO: log-posteriors and gradients
# ==========================================================================
    def logprob(self, params, snp_id):  # Workable
        """
        Log-posterior and its gradient for one SNP, in the form required by pyhmc.hmc
        :param params: parameter vector [x_a, r_p (n_loc values), z, a, b]
        :param snp_id: SNP id
        :return: tuple (log-posterior, gradient)
        """
k1 = self.q[:, snp_id]
n1 = self.n[:, snp_id]
n_pop = len(k1)
x_a = params[0] # ILR of ancestral state
        r_p = params[1:(n_pop + 1)]  # per-location deviations from the ancestral state
        z = params[n_pop + 1]  # scaling factor for the inverse covariance matrix
        a = params[n_pop + 2]  # prior parameter for the frequency in the ancestral state
        b = params[n_pop + 3]  # prior parameter for the frequency in the ancestral state
x_p = x_a + r_p
f_p = 1 / (1 + np.exp(x_p)) # frequency in populations
f_a = 1 / (1 + np.exp(x_a)) # frequency in ancestral state
q1 = (r_p).T.dot(self.m_c_inv) * z
quad_form = (q1 * (r_p).T).sum() / 2
logp = (k1 * np.log(f_p) + (n1 - k1) * np.log(1 - f_p)).sum()
logp -= quad_form * f_a * (1 - f_a)
# logp += (a - 1) * np.log(f_a) + (a / m * (1-m) - 1) * np.log(1 - f_a)
logp += (b - 1 + n_pop / 2) * x_a - (n_pop + a + b - 2) * np.log(1 + np.exp(x_a))
        barrier = 10  # log-barrier bound on |r_p|
# barrier = 15
logp += (np.log(barrier - np.abs(r_p))).sum()
# additional for z param
z_lambda = 1
logp += n_pop / 2 * np.log(z) - z_lambda * z
# additional for "a" param
logp += st.uniform.logpdf(a / (a + b)) + st.expon.logpdf(a + b)
# grad_bin = (n1 / (1 + np.exp(x_p))) - k1
grad_bin = n1 * f_p - k1
grad = np.concatenate(([sum(grad_bin) +
- (-1) * quad_form * (np.exp(x_a) - np.exp(-x_a)) / (
np.exp(x_a) + 2 + np.exp(-x_a)) ** 2 +
b - 1 + n_pop / 2 - (n_pop + a + b - 2) * (1 - f_a)],
grad_bin - q1 * f_a * (1 - f_a) + (-1) * np.sign(r_p) / (barrier - np.abs(r_p)),
[-quad_form * f_a * (1 - f_a) / z + n_pop / 2 / z - z_lambda],
[np.log(f_a) - 1],
[np.log(1 - f_a) - 1]))
return logp, grad
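# A minimal usage sketch (hypothetical file names, not part of the original
# pipeline): load the per-location allele counts and the hypothesis-based
# covariance matrix, then run the per-SNP HMC optimisation.
if __name__ == '__main__':
    data = PopData(file_qdata='alt_allele_counts.tsv',
                   file_ndata='total_allele_counts.tsv')
    mx_cov = ClusterCov(file_cov='cluster_covariance.tsv')
    model = HistOpt(mx_cov, data)
    model.optimise(path_data='chains/', n_thr=2)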