Skip to content

Commit

Permalink
collapse bug fix
Browse files Browse the repository at this point in the history
sub-chunking bug prevented comparison of variants which should have been
compared but were separated.
But also, refactored chain to be simplier and to no longer 'slide'
  • Loading branch information
ACEnglish committed Jan 31, 2025
1 parent 7dd8400 commit 466ae6b
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 48 deletions.
5 changes: 2 additions & 3 deletions repo_utils/answer_key/collapse/input1_median_collapsed.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -889,7 +889,7 @@ chr20 419860 . AGTGACCCTGCACCTGGCT A 60 . QNAME=HG002-S9-H2-000001F;QSTART=37411
chr20 420228 . A C 60 . QNAME=HG002-S9-H2-000001F;QSTART=374466;QSTRAND=+ GT:PL:DP 0|1:2,3,6:24,29
chr20 420465 . A AG 60 . QNAME=HG002-S9-H2-000001F;QSTART=374704;QSTRAND=+ GT:PL:DP 0|1:6,10,1:32,35
chr20 420561 . A T 60 . QNAME=HG002-S9-H1-000001F;QSTART=399939;QSTRAND=+ GT:PL:DP 1|1:5,6,7:36,12
chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCCATCCCCCGTCCGCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 61 . QNAME=HG002-S9-H2-000001F;QSTART=374905;QSTRAND=+;SVTYPE=INS;SVLEN=227;NumCollapsed=1;NumConsolidated=0;CollapseId=9.0;CollapseStart=420664;CollapseEnd=420665;CollapseSize=226 GT:PL 0/1:.
chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCCATCCCCCGTCCGCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 61 . QNAME=HG002-S9-H2-000001F;QSTART=374905;QSTRAND=+;SVTYPE=INS;SVLEN=227;NumCollapsed=1;NumConsolidated=0;CollapseId=10.0;CollapseStart=420664;CollapseEnd=420665;CollapseSize=226 GT:PL 0/1:.
chr20 421409 . G A 60 . QNAME=HG002-S9-H1-000001F;QSTART=401013;QSTRAND=+ GT:PL:DP 1|1:2,3,7:14,41
chr20 421527 . T C 60 . QNAME=HG002-S9-H2-000001F;QSTART=375993;QSTRAND=+ GT:PL:DP 0|1:3,8,4:49,42
chr20 422066 . A G 60 . QNAME=HG002-S9-H1-000001F;QSTART=401670;QSTRAND=+ GT:PL:DP 1|1:3,7,8:9,39
Expand Down Expand Up @@ -1263,7 +1263,7 @@ chr20 639104 . A AT 60 . QNAME=HG002-S9-H1-000001F;QSTART=618555;QSTRAND=+ GT:PL
chr20 640046 . C T 60 . QNAME=HG002-S9-H1-000001F;QSTART=619497;QSTRAND=+ GT:PL:DP 1|0:10,10,9:11,48
chr20 640049 . C T 60 . QNAME=HG002-S9-H1-000001F;QSTART=619500;QSTRAND=+ GT:PL:DP 1|1:2,3,4:13,44
chr20 641878 . C G 60 . QNAME=HG002-S9-H1-000001F;QSTART=621329;QSTRAND=+ GT:PL:DP 1|0:8,1,7:7,13
chr20 641913 . G GGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGA 60 . QNAME=HG002-S9-H1-000001F;QSTART=621365;QSTRAND=+;SVTYPE=INS;SVLEN=66 GT:PL:DP 1/0:7,5,7:34,13
chr20 641913 . G GGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGA 60 . QNAME=HG002-S9-H1-000001F;QSTART=621365;QSTRAND=+;SVTYPE=INS;SVLEN=66;NumCollapsed=1;NumConsolidated=0;CollapseId=15.0;CollapseStart=642120;CollapseEnd=642121;CollapseSize=66 GT:PL:DP 1/0:7,5,7:34,13
chr20 641944 . GGA G 60 . QNAME=HG002-S9-H1-000001F;QSTART=621462;QSTRAND=+ GT:PL:DP 1|0:6,6,6:17,7
chr20 642012 . GGT G 60 . QNAME=HG002-S9-H1-000001F;QSTART=621528;QSTRAND=+ GT:PL:DP 1|0:5,7,1:6,23
chr20 642037 . T TG 60 . QNAME=HG002-S9-H1-000001F;QSTART=621551;QSTRAND=+ GT:PL:DP 1|0:4,10,10:14,47
Expand All @@ -1283,7 +1283,6 @@ chr20 642284 . A G 60 . QNAME=HG002-S9-H1-000001F;QSTART=621795;QSTRAND=+ GT:PL:
chr20 642300 . G GCCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGCCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGC 60 . QNAME=HG002-S9-H1-000001F;QSTART=621812;QSTRAND=+;SVTYPE=INS;SVLEN=408 GT:PL:DP 1/0:5,10,5:44,36
chr20 642300 . G GGC 60 . QNAME=HG002-S9-H2-000001F;QSTART=597225;QSTRAND=+ GT:PL:DP 0|1:4,10,8:25,7
chr20 642330 . G C 60 . QNAME=HG002-S9-H1-000001F;QSTART=622249;QSTRAND=+ GT:PL:DP 1|0:2,9,8:16,10
chr20 642330 . G GGCCCAGCGGGGGTGGAGTTGCCTGTGGTGGGGGGCCCAGCGGGGGTGGAGTTGCCTGTGGGGGGGC 60 . QNAME=HG002-S9-H2-000001F;QSTART=597257;QSTRAND=+;SVTYPE=INS;SVLEN=66 GT:PL:DP 0/1:6,1,9:20,25
chr20 642362 . G GC 60 . QNAME=HG002-S9-H1-000001F;QSTART=622282;QSTRAND=+ GT:PL:DP 1|1:10,5,1:17,12
chr20 642391 . G GC 60 . QNAME=HG002-S9-H1-000001F;QSTART=622312;QSTRAND=+ GT:PL:DP 1|1:2,9,2:10,50
chr20 642420 . G GC 60 . QNAME=HG002-S9-H2-000001F;QSTART=597415;QSTRAND=+ GT:PL:DP 0|1:7,1,2:41,16
Expand Down
3 changes: 2 additions & 1 deletion repo_utils/answer_key/collapse/input1_median_removed.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,5 @@
##INFO=<ID=MatchId,Number=.,Type=String,Description="Tuple of base and comparison call ids which were matched">
##INFO=<ID=Multi,Number=0,Type=Flag,Description="Call is false due to non-multimatching">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA24385
chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCATCCCCCGTCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 60 . QNAME=HG002-S9-H1-000001F;QSTART=400044;QSTRAND=+;SVTYPE=INS;SVLEN=226;PctSeqSimilarity=0.9956;PctSizeSimilarity=0.9956;PctRecOverlap=1;SizeDiff=1;StartDistance=0;EndDistance=0;TruScore=99;MatchId=9.0 GT:PL:DP 1/0:4,8,6:32,9
chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCATCCCCCGTCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 60 . QNAME=HG002-S9-H1-000001F;QSTART=400044;QSTRAND=+;SVTYPE=INS;SVLEN=226;PctSeqSimilarity=0.9956;PctSizeSimilarity=0.9956;PctRecOverlap=1;SizeDiff=1;StartDistance=0;EndDistance=0;TruScore=99;MatchId=10.0 GT:PL:DP 1/0:4,8,6:32,9
chr20 642330 . G GGCCCAGCGGGGGTGGAGTTGCCTGTGGTGGGGGGCCCAGCGGGGGTGGAGTTGCCTGTGGGGGGGC 60 . QNAME=HG002-S9-H2-000001F;QSTART=597257;QSTRAND=+;SVTYPE=INS;SVLEN=66;PctSeqSimilarity=0.9627;PctSizeSimilarity=1;PctRecOverlap=0;SizeDiff=0;StartDistance=-417;EndDistance=-417;TruScore=65;MatchId=15.0 GT:PL:DP 0/1:6,1,9:20,25
2 changes: 1 addition & 1 deletion repo_utils/answer_key/collapse/inputissue196_removed.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@
##INFO=<ID=Multi,Number=0,Type=Flag,Description="Call is false due to non-multimatching">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878
chr21 14088212 chr21-14088113-DEL-223 T <DEL> . . END=14088828;SVTYPE=DEL;SVLEN=-393;PctSeqSimilarity=0;PctSizeSimilarity=0.5674;PctRecOverlap=0.5624;SizeDiff=-170;StartDistance=-100;EndDistance=-270;TruScore=37;MatchId=1.0 GT 1|0
chr21 14088312 chr21-14088113-DEL-223 A <DEL> . . END=14089203;SVTYPE=DEL;SVLEN=-668;PctSeqSimilarity=0;PctSizeSimilarity=0.5883;PctRecOverlap=0.5796;SizeDiff=275;StartDistance=100;EndDistance=375;TruScore=38;MatchId=1.0 GT 1|0
chr21 14088312 chr21-14088113-DEL-223 A <DEL> . . END=14089203;SVTYPE=DEL;SVLEN=-668;PctSeqSimilarity=0;PctSizeSimilarity=0.5883;PctRecOverlap=0.5796;SizeDiff=-275;StartDistance=-100;EndDistance=-375;TruScore=38;MatchId=1.0 GT 1|0
86 changes: 43 additions & 43 deletions truvari/collapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,24 +95,31 @@ def combine(self, other):
self.genotype_mask |= other.genotype_mask


def chain_collapse(cur_collapse, all_collapse):
"""
Perform transitive matching of cur_collapse to all_collapse
Check the cur_collapse's entry to all other collapses' consolidated entries
"""
for m_collap in all_collapse:
for other in m_collap.matches:
mat = cur_collapse.entry.match(other.comp)
mat.matid = m_collap.match_id
if mat.state:
# The other's representative entry will report its
# similarity to the matched call that pulled it in
mat.base, mat.comp = mat.comp, mat.base
m_collap.matches.append(mat)
m_collap.combine(cur_collapse)
return True # you can just ignore it later
return False # You'll have to add it to your set of collapsed calls

def find_new_matches(base, remaining_calls, dest_collapse, params):
"""
Pull variants from remaining calls that match to the base entry into the destination collapse
Updates everything in place
"""
# Sort based on size difference to current call
remaining_calls.sort(key=partial(relative_size_sorter, base))
i = 0
while i < len(remaining_calls):
candidate = remaining_calls[i]
mat = base.match(candidate)
mat.matid = dest_collapse.match_id
if params.hap and not hap_resolve(base, candidate):
mat.state = False
if mat.state and dest_collapse.gt_conflict(candidate, params.gt):
mat.state = False
if mat.state:
dest_collapse.matches.append(mat)
remaining_calls.pop(i)
else:
# move to next one
i += 1
# short circuit
if mat.sizesim is not None and mat.sizesim < params.pctsize:
return

def collapse_chunk(chunk, params):
"""
Expand All @@ -121,7 +128,6 @@ def collapse_chunk(chunk, params):
chunk_dict, chunk_id = chunk
remaining_calls = sorted(chunk_dict['base'], key=params.sorter)

remaining_calls.sort(key=params.sorter)
call_id = -1
ret = [] # list of Collapses
while remaining_calls:
Expand All @@ -132,33 +138,27 @@ def collapse_chunk(chunk, params):
m_collap.genotype_mask = m_collap.make_genotype_mask(m_collap.entry,
params.gt)

# Sort based on size difference to current call
for candidate in sorted(remaining_calls, key=partial(relative_size_sorter, m_collap.entry)):
mat = m_collap.entry.match(candidate)
mat.matid = m_collap.match_id
if params.hap and not hap_resolve(m_collap.entry, candidate):
mat.state = False
if mat.state and m_collap.gt_conflict(candidate, params.gt):
mat.state = False
if mat.state:
m_collap.matches.append(mat)
elif mat.sizesim is not None and mat.sizesim < params.pctsize:
# The sort tells us that we're going through most->least
# similar size. So the next one will only be worse...
break

# Does this collap need to go into a previous collap?
if not params.chain or not chain_collapse(m_collap, ret):
ret.append(m_collap)
find_new_matches(m_collap.entry, remaining_calls, m_collap, params)
# Chain will only operate once to prevent 'sliding'
# e.g. collapsing integers within 5 of 1, 4, 6, 8
# without a single operation 1
# with a single operation 1, 8
# not chaining at all 1, 6
if params.chain:
i = 0
while remaining_calls and i < len(m_collap.matches):
find_new_matches(m_collap.matches[i].comp, remaining_calls, m_collap, params)
i += 1

# If hap, only allow the best match
# put the rest of the remaining calls back
if params.hap and m_collap.matches:
mats = sorted(m_collap.matches, reverse=True)
m_collap.matches = [mats.pop(0)]

# Remove everything that was used
to_rm = [_.comp for _ in m_collap.matches]
remaining_calls = [_ for _ in remaining_calls if _ not in to_rm]
remaining_calls.extend(mats)
ret.append(m_collap)
# Sort back to where they need to be to choose the next to evaluate
remaining_calls.sort(key=params.sorter)

if params.no_consolidate:
for val in ret:
Expand Down Expand Up @@ -771,12 +771,12 @@ def concatenate(self, other):

def merge_intervals(intervals):
"""
Merge sorted list of tuples
Merge list of tuples
"""
if not intervals:
return []
intervals.sort(key=lambda x: (x[0], x[1]))
merged = []

current_start, current_end, current_data = intervals[0]
for i in range(1, len(intervals)):
next_start, next_end, next_data = intervals[i]
Expand Down

0 comments on commit 466ae6b

Please sign in to comment.