-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclassify_single_molecule_binding_v4.py
414 lines (331 loc) · 17.9 KB
/
classify_single_molecule_binding_v4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
import pandas as pd
import numpy as np
import argparse
from matplotlib import pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
plt.switch_backend('agg')
from itertools import product, chain, combinations
from scipy.stats import bernoulli
from copy import copy
from os import path
import pickle
from common import load_single_molecule_matrix, \
get_methyl_positions, \
load_tfbs_positions, \
filter_all_converted_reads, \
decorate_single_read_plot2, \
plot_single_read, \
adjust_gcgs, \
fix_missing_data
def compute_hyperparameters():
'''
currently estimating this from what I expect things to be
but in the future can grab from paired EM-seq data? not sure it matters...
'''
# update methylation probabilities for short methylation time data
p_unmeth_given_bound = 0.99 # 0.9
p_unmeth_given_unbound = 0.4 # 0.1
# t is the thing we measure as a 1 in the data (i.e. unmethylated = bound = protected)
p_t_given_meth = 0.01
p_t_given_unmeth = 0.99
# marginalize
p_t_given_bound = p_unmeth_given_bound * p_t_given_unmeth + (1 - p_unmeth_given_bound) * p_t_given_meth
p_t_given_unbound = p_unmeth_given_unbound * p_t_given_unmeth + (1 - p_unmeth_given_unbound) * p_t_given_meth
return (p_t_given_bound, p_t_given_unbound)
def find_first_matching_sublist(lst, span, start):
'''
helper function for below
basically, given a start position, figure out the first subsequent position that satisfies the run being in the range
returns the index of this position, or else returns -1 if it falls off the end
'''
min_span, max_span = span
end = start
while end < len(lst):
temp_span = lst[end] - lst[start] + 1 # fencepost
if temp_span >= min_span and temp_span <= max_span:
return end
else:
end += 1
return -1
def sublist_finder(lst, span, start_ptr):
'''
generic function to compute all possible valid runs of a given length from a list of integers
input is a lst of positive integers (no duplicates, sorted)
also a range (low and high for how long the run can be, in the units of the entries of the list)
also a start (since this is a recursive function)
each valid configuration is a list of tuples of (start,end) of valid runs (so a single output could be [] or [(1,150)] or [(1,150),(161,300)])
output is a list of these lists
'''
valid_substrings = []
start = start_ptr
while start < len(lst):
# from start, find a valid range that works
end = find_first_matching_sublist(lst, span, start)
if end != -1:
# save this tuple
temp_sublist = [(lst[start],lst[end])]
# recursive call, starting from one past the end of the current
recursive_valid_sublists = sublist_finder(lst, span, end+1)
# append the empty run (so that we can get the "just this on run" state)
recursive_valid_sublists += [[]]
# add the current run to every recursive one
for sublist in recursive_valid_sublists:
valid_substrings.append(temp_sublist + sublist)
# now start at the next element in the sequnce
start += 1
# only once, append the empty run
if start_ptr == 0:
valid_substrings += [[]]
return valid_substrings
def powerset(iterable):
'''
computes the powerset of an iterable
stolen from SO
'''
s = list(iterable)
return chain.from_iterable(combinations(s, r) for r in range(len(s)+1))
def augment_methyl_positions(methyl_positions, step_size=10, overhang=30, nuc_length=110): #edited this to 110
'''
adds extra gpc positions to the end of the list (i.e. not observed ones, imaginary)
this is to allow nucs to slide off the end and still be valid
can modify how many and by how much a nucleosome has to overlap the molecule (currently 30 bp)
'''
start = methyl_positions[0]
end = methyl_positions[-1]
return list(range(start-nuc_length+overhang,start-step_size,step_size)) + methyl_positions + list(range(end+step_size, end+nuc_length-overhang, step_size))
def enumerate_states(methyl_positions, tfbs_positions, nuc_span=(110,140)):
'''
generates the list of all possible underlying occupancy states
first adds in a few "ghost" gpcs at the end of the list to allow nucs to slide off the edge
then computes all possible nucleosome occupancy events
then compute all possible tf binding events
returns the outer product of the two lists
'''
augmented_methyl_positions = augment_methyl_positions(methyl_positions)
possible_nuc_positions = sublist_finder(augmented_methyl_positions, nuc_span, 0)
possible_binding_events = powerset(tfbs_positions)
return product(possible_nuc_positions, possible_binding_events)
def nuc_overlaps_tf_binding(methyl_positions, state):
'''
check whether TF binding overlaps nucleosome binding
almost the same as generate_predicted_protection (below)
first fills in all gpcs that are covered by nucs
then figures out if any TFs are bound there too
also checks promoter state, too
'''
space_bw_tf_and_nuc = 25
nuc_positions, binding_sites = state
protection = [0 for _ in range(len(methyl_positions))]
# do nucs
for idx, p in enumerate(methyl_positions):
for nuc in nuc_positions:
if p >= (nuc[0]-space_bw_tf_and_nuc) and p <= (nuc[1]+space_bw_tf_and_nuc):
protection[idx] = 1
# do tfbs
for idx, p in enumerate(methyl_positions):
for tfbs in binding_sites:
if p >= tfbs[0] and p <= tfbs[1]:
if protection[idx] == 1: # checking to see if nucleosome overlaps a bound TF
return True
if (idx > 0 and protection[idx-1] == 1) or (idx+1 < len(methyl_positions) and protection[idx+1] == 1): # also checking if a nucleosome overlaps the _flanks_ of a bound tf, this way a nucleosome cannot abut a bound tf
return True
return False
def prune_states(all_states, methyl_positions):
'''
remove (most of) the states where there is TF and nucleosome bound at the same position
'''
return [s for s in all_states if not nuc_overlaps_tf_binding(methyl_positions, s)]
def generate_predicted_protection(methyl_positions, state):
'''
iterate through each gpc position, if it's bound (by either a nucleosome or a TF) mark the base protected
'''
nuc_positions, binding_sites = state
protection = [0 for _ in range(len(methyl_positions))]
# do nucs
for idx, p in enumerate(methyl_positions):
for nuc in nuc_positions:
if p >= nuc[0] and p <= nuc[1]:
protection[idx] = 1
# do tfbs
for idx, p in enumerate(methyl_positions):
for tfbs in binding_sites:
if p >= tfbs[0] and p <= tfbs[1]:
protection[idx] = 1
return protection
def create_bernoulli_logprob_matrices(predicted_methylations, methyl_positions, p_t_given_bound, p_t_given_unbound, promoter_positions=(75,175)):
'''
for each state, convert the predicted methylation profile into two matrices to use for computing bernoulli likelihood
'''
n_gpcs = len(methyl_positions)
n_states = len(predicted_methylations)
# initialize empty matrices
prob_states_unmeth = np.zeros((n_states,n_gpcs))
prob_states_meth = np.zeros((n_states,n_gpcs))
promoter_low, promoter_high = promoter_positions
for idx in range(n_states):
p_m_array = np.array(predicted_methylations[idx])
# when we see "T" = 1 = protected = unmethylated
prob_states_unmeth[idx] = p_t_given_bound * p_m_array + p_t_given_unbound * (1 - p_m_array)
# when we see "C" = 0 = accessible = methylated!
prob_states_meth[idx] = (1 - p_t_given_bound) * p_m_array + (1 - p_t_given_unbound) * (1 - p_m_array)
# do a little promoter hacking
# i.e. when the promoter dudes are 0s aka accessible, we should allow for seeing both meth or unmeth!
# commenting this out since now we are including promoter states
for i, p in enumerate(methyl_positions):
if p >= promoter_low and p <= promoter_high and p_m_array[i] == 0:
prob_states_unmeth[idx,i] = (1 - p_t_given_unbound)
log_prob_states_meth = np.log(prob_states_meth)
log_prob_states_unmeth = np.log(prob_states_unmeth)
return (log_prob_states_meth, log_prob_states_unmeth)
def determine_chunk_size(n_mols, n_states, data_size, memory_threshold):
'''
Figures out how many times we need to split the data into (along the single molecule axis) to not go above memory limits
'''
return int((n_mols * n_states * data_size) // (memory_threshold * 1e9)) + 1
def classify_all_molecules(single_molecules, log_prob_states_unmeth, log_prob_states_meth, n_matrix_partitions):
'''
Compute Bernoulli likelihood all at once
note that log_prob_states_unmeth and meth and not strictly reciprocals of each other, since we need to change unoccupied promoter probs (see above)
single_molecules is gpcs x molecules
log_prob_states_(un)meth are states x gpcs
result of matmul is states x molecules, argmax down columns
return a list of index of best state per molecule
'''
n_mols = single_molecules.shape[1]
chunk_size = n_mols // n_matrix_partitions + 1
to_return = []
for idx in range(n_matrix_partitions):
sub_mat = single_molecules[:,idx*chunk_size:(idx+1)*chunk_size]
res = np.argmax(np.exp(log_prob_states_unmeth @ sub_mat + log_prob_states_meth @ (1 - sub_mat)), axis=0)
to_return += list(res)
return to_return
def expand_state(state, idx, tfbs_positions):
'''
takes a single "state" and returns a legible version that we can plunk in a df and return
'''
nuc_positions, binding_sites = state
expanded_state = {}
# first do nuc
nuc_num = 1
for nuc in nuc_positions:
expanded_state['nuc{}_present'.format(nuc_num)] = True
expanded_state['nuc{}_start'.format(nuc_num)] = nuc[0]
expanded_state['nuc{}_end'.format(nuc_num)] = nuc[1]
nuc_num += 1
# then do tfbs
tfbs_num = 1
for tfbs in tfbs_positions:
expanded_state['tfbs_{}'.format(tfbs_num)] = tfbs in binding_sites
tfbs_num += 1
# finally tack on index of state
expanded_state['idx'] = idx
return expanded_state
def summarize_assignments(state_list, assignments, tfbs_positions):
'''
gets legible state assignment for every single molecule
put into df and return
'''
expanded_assignments = []
for a in assignments:
expanded_assignments.append(expand_state(state_list[a], a, tfbs_positions))
return pd.DataFrame(expanded_assignments)
def write_valid_states(states, out_path, tfbs_positions):
'''
we are going to write the valid states to a file
this will be in parallel to the file we are already writing (e.g. the per molecule file)
we can use this to assign energies to each state and compare the predicted abundances (boltzman) to the true ones
we'll also annotate the single molecule classifications with the index of each state, so we can aggregate by it later
'''
expanded_assignments = []
for idx, state in enumerate(states):
expanded_assignments.append(expand_state(state, idx, tfbs_positions))
to_write = pd.DataFrame(expanded_assignments)
to_write.to_csv(out_path, sep='\t', header=True, index=False)
def compute_classifications(single_molecules, methyl_positions, tfbs_positions, valid_states_path=None, precomputed_states_file=None, memory_threshold=1.):
# initialize hyperparams
p_t_given_bound, p_t_given_unbound = compute_hyperparameters()
# check whether there already exists single-molecule states for this amplicon
if precomputed_states_file and path.exists(precomputed_states_file):
# then we can save time and just load the precomputed possible methylation states
with open(precomputed_states_file, "rb") as in_file:
valid_states = pickle.load(in_file)
else:
# generate possible single molecule states
print('generating states')
possible_states = list(enumerate_states(methyl_positions, tfbs_positions))
print('total states: {}'.format(len(possible_states)))
# filter for non-overlapping states
valid_states = prune_states(possible_states, methyl_positions)
# write to file
if precomputed_states_file:
with open(precomputed_states_file, "wb") as out_file:
print('started dump')
pickle.dump(valid_states, out_file)
print('ended dump')
print('valid states: {}'.format(len(valid_states)))
# generate predicted methylation profiles per underlying state
predicted_methylations = [generate_predicted_protection(methyl_positions, s) for s in valid_states]
# turn these into matrices
log_prob_states_meth, log_prob_states_unmeth = create_bernoulli_logprob_matrices(predicted_methylations, methyl_positions, p_t_given_bound, p_t_given_unbound)
# determine memory usage
data_size = 8 # can get this down
n_matrix_partitions = determine_chunk_size(single_molecules.shape[1], len(valid_states), data_size, memory_threshold)
print('n chunks: {}'.format(n_matrix_partitions))
# classify single molecules
print('doing classification')
maximum_likelihood_assignments = classify_all_molecules(single_molecules, log_prob_states_unmeth, log_prob_states_meth, n_matrix_partitions)
# coerce (new) assignments to output
print('summarizing output')
assignments_to_return = summarize_assignments(valid_states, maximum_likelihood_assignments, tfbs_positions)
# optionally write valid_states to a file, so we can start building the equilibrium model to predict all microstates
if valid_states_path:
write_valid_states(valid_states, valid_states_path, tfbs_positions)
return assignments_to_return
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Run Bayesian classification algorithm on an input single-molecule matrix')
parser.add_argument("--input", dest="input", type=str, help="Path to single molecule matrix")
parser.add_argument("--output", dest="output", type=str, help="Path for writing the classification output tables to")
parser.add_argument("--all_states_output", dest="all_states_output", type=str, default=None, help="Path for writing the valid state matrix")
parser.add_argument("--precomputed_states_file", dest="precomputed_states_file", type=str, default=None, help="Path to a precomputed methylation file to save computation time, if it doesn't exist it will be generated")
parser.add_argument("--plot", dest="plot_file", type=str, help="Path to output plots file (optional)")
parser.add_argument("--positions", dest="positions_file", type=str, help="Path with positions for calling bound/unbound")
parser.add_argument("--amplicon_name", dest="amplicon_name", type=str, help="Name of amplicon aligned to for grabbing TFBS positions")
parser.add_argument("--reads_to_use", dest="reads_to_use", type=int, default=0, help="Number of reads to sample for classification (if 0, use all)")
parser.add_argument("--filter_threshold", dest="filter_threshold", type=float, default=1., help="Fraction of converted Cs above which a molecule will be thrown out (to get rid of the totally 'closed' molecules")
parser.add_argument("--reads_to_plot", dest="reads_to_plot", type=int, default=10, help="Number of classifications to plot, to make sure the algorithm is performing as expected")
parser.add_argument("--memory_threshold", dest="memory_threshold", type=float, default=1., help="Amount of memory to stay under for the big mat mul (in Gb)")
parser.add_argument("--extra_motif_bps", dest="extra_motif_bps", type=int, default=0, help="A fixed bp offset compared to the position_list, making motifs larger with this number of bps, both to the left and right")
args = parser.parse_args()
with PdfPages(args.plot_file) as plots:
# read files
print('loading data')
mat = load_single_molecule_matrix(args.input)
methyl_positions = get_methyl_positions(mat)
pos_file = load_tfbs_positions(args.positions_file,args.extra_motif_bps)
tfbs_positions = pos_file[args.amplicon_name]
# optionally filter and downsample the matrix
# remove reads with -1s still in them
mat = filter_all_converted_reads(mat, args.filter_threshold)
mat = fix_missing_data(mat)
# optionally apply the 4/3 and 9/8 corrections (e.g. replace methylated 4/9 with the neighbor)
mat = adjust_gcgs(mat, 53, 49) # 4/3
mat = adjust_gcgs(mat, 129, 122) # 9/8
print('num molecules: {}'.format(len(mat)))
# transpose the matrix (to gpcs x molecules, and also convert to numpy array)
single_molecules = mat.T.values
# get assignments
print('starting classification')
assignments = compute_classifications(single_molecules, methyl_positions, tfbs_positions, valid_states_path=args.all_states_output, precomputed_states_file=args.precomputed_states_file, memory_threshold=args.memory_threshold)
assignments.index = mat.index
# write this to a file
assignments.to_csv(args.output, sep='\t', header=True, index=True)
# optionally plot some
if args.reads_to_plot:
print('plotting')
to_plot = assignments.sample(args.reads_to_plot)
for idx, assignment in to_plot.iterrows():
fig, ax = plt.subplots(figsize=(12,8))
plot_single_read(mat, idx, ax, fillbetween=tfbs_positions)
decorate_single_read_plot2(assignment, ax, tfbs_positions)
plots.savefig()
plt.close()