@@ -106,12 +106,14 @@ class TraceSeqRecord(Record):
106
106
"""
107
107
Class for storing information loaded from trace files (.ab1 files).
108
108
"""
109
- def __init__ (self , seq , quality , id = None , traces = None , base_locations = None , reference = None , reverse : bool = None ):
109
+ def __init__ (self , seq , quality , f , id = None , traces = None , base_locations = None , reference = None ,
110
+ reverse : bool = None ):
110
111
"""
111
112
This constructor is only called by read function below or from other functions starting with an instance, so
112
113
the parameters need not be understood in too much details - reflects structure of a trace file.
113
114
:param seq: Seq object with DNA sequence
114
115
:param quality: list of int representing per base quality
116
+ :param f: mixed peaks detection threshold
115
117
:param traces: dict containing a list of trace values for each base
116
118
:param base_locations: list of positions (int) on the x axis, where each base is called
117
119
:param reference: reference sequence in the DB, only added after instantiating
@@ -122,7 +124,8 @@ def __init__(self, seq, quality, id=None, traces=None, base_locations=None, refe
122
124
self .traces = traces
123
125
self .base_locations = base_locations
124
126
self .quality = quality
125
- self .mixed_peaks = self .find_mixed_peaks ()
127
+ self .f = f
128
+ self .mixed_peaks = self .find_mixed_peaks (f )
126
129
# should the original sequence be stored because of finding mixed peaks? Or just ignore al N's...
127
130
self .reference = reference
128
131
self .reverse = reverse
@@ -148,8 +151,8 @@ def read(cls, path):
148
151
# get locations in trace array for all bases
149
152
base_locations = list (record .annotations ['abif_raw' ]["PLOC1" ])
150
153
151
- return cls (record .seq , record .letter_annotations ['phred_quality' ], id = name , traces = traces ,
152
- base_locations = base_locations )
154
+ return cls (record .seq , record .letter_annotations ['phred_quality' ], f = 0.15 , id = name ,
155
+ traces = traces , base_locations = base_locations )
153
156
154
157
def filter_sequence_by_quality (self , threshold , end_threshold ):
155
158
"""
@@ -167,7 +170,8 @@ def filter_sequence_by_quality(self, threshold, end_threshold):
167
170
traces = self .traces ,
168
171
base_locations = self .base_locations ,
169
172
reference = self .reference ,
170
- reverse = self .reverse
173
+ reverse = self .reverse ,
174
+ f = self .f
171
175
)
172
176
173
177
def reverse_complement (self , ** kwargs ):
@@ -183,7 +187,8 @@ def reverse_complement(self, **kwargs):
183
187
traces = {str (Seq (base ).reverse_complement ()): values [::- 1 ] for base , values in self .traces .items ()},
184
188
base_locations = [num_locations - i - 1 for i in self .base_locations [::- 1 ]],
185
189
reverse = False ,
186
- reference = self .reference
190
+ reference = self .reference ,
191
+ f = self .f
187
192
)
188
193
189
194
def has_base_above_threshold (self ):
@@ -242,7 +247,7 @@ def peak_borders(self, i: int):
242
247
end = min (pos + width - 2 , len (self .traces ['A' ]))
243
248
return start , end
244
249
245
- def find_mixed_peaks (self , fraction : float = 0.15 ):
250
+ def find_mixed_peaks (self , fraction : float ):
246
251
"""
247
252
For each position of the sequence, determine if the peak in the chromatogram is "mixed".
248
253
Disregard mixed signals in regions with low signal to noise ratio (generally bad quality region)
@@ -253,12 +258,12 @@ def find_mixed_peaks(self, fraction: float = 0.15):
253
258
stn = [self .signal_to_noise (i ) for i in range (len (self .base_locations ))]
254
259
avg_stn = sum (stn ) / len (stn )
255
260
mixed_peaks = []
261
+ threshold = max (25 , avg_stn * 1.35 )
256
262
257
263
for i , pos in enumerate (self .base_locations ):
258
264
stn_local = stn [i - 10 :i ] + stn [i + 1 :i + 10 ]
259
265
signal_to_noise = sum (stn_local ) / 20
260
266
261
- threshold = max (25 , avg_stn * 1.35 )
262
267
if signal_to_noise < threshold :
263
268
continue
264
269
# bad StN ratio -> disregard potential mixed positions
@@ -269,13 +274,22 @@ def find_mixed_peaks(self, fraction: float = 0.15):
269
274
peaks = {base : values [pos ] for base , values in self .traces .items ()}
270
275
if base != "N" :
271
276
main_peak = peaks [base .upper ()]
272
- for letter , area in areas .items ():
273
- # check for both area and height of peak
274
- if base != letter and area > (areas [base .upper ()] * fraction ) and peaks [letter ] > (main_peak * fraction ) \
275
- and self .is_concave (pos , letter ):
276
- mixed_peaks .append (i )
277
+ else :
278
+ # If the called base is "N", we use the highest peak present as the main peak
279
+ main_peak = max (peaks .values ())
280
+ base = max (peaks , key = peaks .get )
281
+ for letter , area in areas .items ():
282
+ # check for both area and height of peak
283
+ if base != letter and area > (areas [base .upper ()] * fraction ) and peaks [letter ] > (main_peak * fraction ) \
284
+ and self .is_concave (pos , letter ):
285
+ mixed_peaks .append (i )
277
286
return mixed_peaks
278
287
288
+ def re_find_mixed_peaks (self , f ):
289
+ """Function to re-calculate mixed peaks after object has been constructed"""
290
+ self .mixed_peaks = self .find_mixed_peaks (f )
291
+ self .f = f
292
+
279
293
def signal_to_noise (self , i : int ):
280
294
"""
281
295
Calculate the signal to noise ratio of a position as the ratio of the primary peak area to the sum of all other
0 commit comments