Skip to content

Commit 626b829

Browse files
authored
Merge pull request #390 from AdamaJava/nanno_contig_int
fix(nanno): deal with contig names that include numbers and letters
2 parents 3c5b8ae + 9079391 commit 626b829

File tree

3 files changed

+126
-46
lines changed

3 files changed

+126
-46
lines changed

qannotate/src/au/edu/qimr/qannotate/nanno/Annotate.java

Lines changed: 61 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,13 @@ public class Annotate {
2323

2424
static final List<String> SEARCH_TERM_VARIETIES = Arrays.asList(">", "->", "-->", "/");
2525

26+
// Define the set of standard GRCh38 contigs (no "chr" prefix version)
27+
private static final Set<String> STANDARD_GRCH38_CONTIGS = Set.of(
28+
"1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
29+
"11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
30+
"21", "22", "X", "Y", "MT", "M"
31+
);
32+
2633
static Comparator<String[]> CUSTOM_COMPARATOR;
2734
static QLogger logger;
2835

@@ -71,70 +78,77 @@ public int engage() throws Exception {
7178

7279
ChrPosition lastCP = null;
7380
try (
74-
VcfFileReader reader = new VcfFileReader(inputFile)) {
81+
VcfFileReader reader = new VcfFileReader(inputFile)) {
7582
logger.info("VcfFileReader has been setup");
7683
int vcfCount = 0;
84+
int nonStandardContigCount = 0;
7785
for (VcfRecord vcf : reader) {
7886
vcfCount++;
7987

8088
ChrPosition thisVcfsCP = vcf.getChrPositionRefAlt();
8189
logger.debug("thisVcfsCP: " + thisVcfsCP.toIGVString());
8290

91+
boolean isStandardContig = isStandardContig(thisVcfsCP);
92+
if (isStandardContig) {
8393

84-
/*
85-
* check that this CP is "after" the last CP
86-
*/
87-
int compare = null != lastCP ? ((ChrPositionRefAlt) thisVcfsCP).compareTo((ChrPositionRefAlt) lastCP) : 0;
88-
if (compare < 0) {
89-
throw new IllegalArgumentException("Incorrect order of vcf records in input vcf file! this vcf: " + thisVcfsCP.toIGVString() + ", last vcf: " + lastCP.toIGVString());
90-
}
9194

92-
93-
String alt = ((ChrPositionRefAlt) thisVcfsCP).getAlt();
94-
String gatkAD = VcfUtils.getFormatField(vcf.getFormatFields(), "AD", 0);
95-
String gatkGT = VcfUtils.getFormatField(vcf.getFormatFields(), "GT", 0);
96-
97-
if (alt.contains(",")) {
98-
logger.info("alt has comma: " + thisVcfsCP);
9995
/*
100-
* split record, create new ChrPositions for each
96+
* check that this CP is "after" the last CP
10197
*/
102-
String[] altArray = alt.split(",");
103-
List<VcfRecord> splitVcfs = new ArrayList<>();
104-
for (String thisAlt : altArray) {
105-
if (thisAlt.equals("*")) {
106-
/*
107-
* ignore
108-
*/
109-
} else {
110-
VcfRecord newVcf = VcfUtils.cloneWithNewAlt(vcf, thisAlt);
111-
splitVcfs.add(newVcf);
112-
}
98+
int compare = null != lastCP ? ((ChrPositionRefAlt) thisVcfsCP).compareTo((ChrPositionRefAlt) lastCP) : 0;
99+
if (compare < 0) {
100+
throw new IllegalArgumentException("Incorrect order of vcf records in input vcf file! this vcf: " + thisVcfsCP.toIGVString() + ", last vcf: " + lastCP.toIGVString());
113101
}
114-
if (splitVcfs.size() > 1) {
102+
103+
104+
String alt = ((ChrPositionRefAlt) thisVcfsCP).getAlt();
105+
String gatkAD = VcfUtils.getFormatField(vcf.getFormatFields(), "AD", 0);
106+
String gatkGT = VcfUtils.getFormatField(vcf.getFormatFields(), "GT", 0);
107+
108+
if (alt.contains(",")) {
109+
logger.info("alt has comma: " + thisVcfsCP);
115110
/*
116-
* sort
111+
* split record, create new ChrPositions for each
117112
*/
118-
splitVcfs.sort(null);
119-
}
120-
for (VcfRecord splitVcf : splitVcfs) {
121-
List<String> annotations = new ArrayList<>(getAnnotationsForPosition(splitVcf.getChrPositionRefAlt(), annotationSources, executor));
122-
queue.add(new ChrPositionAnnotations(splitVcf.getChrPositionRefAlt(), annotations, gatkAD, gatkGT, alt));
123-
}
113+
String[] altArray = alt.split(",");
114+
List<VcfRecord> splitVcfs = new ArrayList<>();
115+
for (String thisAlt : altArray) {
116+
if (thisAlt.equals("*")) {
117+
/*
118+
* ignore
119+
*/
120+
} else {
121+
VcfRecord newVcf = VcfUtils.cloneWithNewAlt(vcf, thisAlt);
122+
splitVcfs.add(newVcf);
123+
}
124+
}
125+
if (splitVcfs.size() > 1) {
126+
/*
127+
* sort
128+
*/
129+
splitVcfs.sort(null);
130+
}
131+
for (VcfRecord splitVcf : splitVcfs) {
132+
List<String> annotations = new ArrayList<>(getAnnotationsForPosition(splitVcf.getChrPositionRefAlt(), annotationSources, executor));
133+
queue.add(new ChrPositionAnnotations(splitVcf.getChrPositionRefAlt(), annotations, gatkAD, gatkGT, alt));
134+
}
124135

125-
} else {
136+
} else {
126137

127-
logger.debug("about to get annotations for: " + thisVcfsCP.toIGVString());
128-
List<String> annotations = getAnnotationsForPosition(thisVcfsCP, annotationSources, executor);
129-
logger.debug("got annotations for: " + thisVcfsCP.toIGVString() + " - adding to queue");
130-
queue.add(new ChrPositionAnnotations(thisVcfsCP, annotations, gatkAD, gatkGT, alt));
138+
logger.debug("about to get annotations for: " + thisVcfsCP.toIGVString());
139+
List<String> annotations = getAnnotationsForPosition(thisVcfsCP, annotationSources, executor);
140+
logger.debug("got annotations for: " + thisVcfsCP.toIGVString() + " - adding to queue");
141+
queue.add(new ChrPositionAnnotations(thisVcfsCP, annotations, gatkAD, gatkGT, alt));
131142

132-
}
143+
}
133144

134-
lastCP = thisVcfsCP;
145+
lastCP = thisVcfsCP;
146+
} else {
147+
nonStandardContigCount++;
148+
}
135149
}
136150

137-
logger.info("# of vcf records: " + vcfCount);
151+
logger.info("# of vcf records: " + vcfCount + ", # of non-standard contigs: " + nonStandardContigCount);
138152
} finally {
139153
/*
140154
* count down the count down latch
@@ -147,6 +161,10 @@ public int engage() throws Exception {
147161
return exitStatus;
148162
}
149163

164+
private boolean isStandardContig(ChrPosition thisVcfsCP) {
165+
return thisVcfsCP.getChromosome().startsWith("chr") ? STANDARD_GRCH38_CONTIGS.contains(thisVcfsCP.getChromosome().substring(3)) : STANDARD_GRCH38_CONTIGS.contains(thisVcfsCP.getChromosome());
166+
}
167+
150168

151169
private static List<String> getAnnotationsForPosition(ChrPosition cp, List<AnnotationSource> annotationSources, Executor executor) {
152170
long contigAndPosition = ((ChrPositionUtils.convertContigAndPositionToLong(cp.getChromosome().startsWith("chr") ? cp.getChromosome().substring(3) : cp.getChromosome(), cp.getStartPosition())));

qcommon/src/org/qcmg/common/util/ChrPositionUtils.java

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,14 @@ public static int convertContigNameToInt(String contigName) {
6464
if (null == contigName || contigName.isEmpty()) {
6565
throw new IllegalArgumentException("null or empty contig name supplied to convertContigNameToInt");
6666
}
67-
int i = Character.isDigit(contigName.charAt(0)) ? Integer.parseInt(contigName) : -1;
68-
if (i > -1) {
69-
return i;
67+
// check if the contig name is a number
68+
// if so, return it as an int
69+
// otherwise strip any "chr" prefix, map X/Y/M/MT to 23/24/25, and fall back to the (prefix-stripped) name's hash code
70+
if (isDigits(contigName)) {
71+
return Integer.parseInt(contigName);
7072
}
7173

74+
7275
if (contigName.length() > 3 && contigName.startsWith("chr")) {
7376
return convertContigNameToInt(contigName.substring(3));
7477
}
@@ -81,6 +84,10 @@ public static int convertContigNameToInt(String contigName) {
8184
};
8285
}
8386

87+
public static boolean isDigits(String str) {
88+
return str != null && !str.isEmpty() && str.chars().allMatch(Character::isDigit);
89+
}
90+
8491
/**
8592
* Checks if two ChrPosition objects overlap with a buffer.
8693
*

qcommon/test/org/qcmg/common/util/ChrPositionUtilsTest.java

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,61 @@ public void testDelta() {
3535
assertTrue(ChrPositionUtils.arePositionsWithinDelta(cp1, cp2, 4));
3636
}
3737

38+
@Test
39+
public void testConvertContigNameToInt_NumericContig() {
40+
assertEquals(1, ChrPositionUtils.convertContigNameToInt("1"));
41+
assertEquals(22, ChrPositionUtils.convertContigNameToInt("22"));
42+
}
43+
44+
@Test
45+
public void testConvertContigNameToInt_ChromosomeWithChrPrefix() {
46+
assertEquals(1, ChrPositionUtils.convertContigNameToInt("chr1"));
47+
assertEquals(22, ChrPositionUtils.convertContigNameToInt("chr22"));
48+
}
49+
50+
@Test
51+
public void testConvertContigNameToInt_SexChromosomes() {
52+
assertEquals(23, ChrPositionUtils.convertContigNameToInt("X"));
53+
assertEquals(24, ChrPositionUtils.convertContigNameToInt("Y"));
54+
}
55+
56+
@Test
57+
public void testConvertContigNameToInt_Mitochondrial() {
58+
assertEquals(25, ChrPositionUtils.convertContigNameToInt("M"));
59+
assertEquals(25, ChrPositionUtils.convertContigNameToInt("MT"));
60+
}
61+
62+
@Test
63+
public void testConvertContigNameToInt_ChromosomeWithChrPrefixSpecialCases() {
64+
assertEquals(23, ChrPositionUtils.convertContigNameToInt("chrX"));
65+
assertEquals(24, ChrPositionUtils.convertContigNameToInt("chrY"));
66+
assertEquals(25, ChrPositionUtils.convertContigNameToInt("chrM"));
67+
}
68+
69+
@Test
70+
public void testConvertContigNameToInt_AltChromosome() {
71+
assertEquals("22_KI270739v1_random".hashCode(), ChrPositionUtils.convertContigNameToInt("chr22_KI270739v1_random"));
72+
assertEquals("Y_KI270740v1_random".hashCode(), ChrPositionUtils.convertContigNameToInt("chrY_KI270740v1_random"));
73+
assertEquals("Un_KI270302v1".hashCode(), ChrPositionUtils.convertContigNameToInt("chrUn_KI270302v1"));
74+
}
75+
76+
@Test
77+
public void testConvertContigNameToInt_OtherValues() {
78+
// For other values, it should return hashCode
79+
String contig = "other";
80+
assertEquals(contig.hashCode(), ChrPositionUtils.convertContigNameToInt(contig));
81+
}
82+
83+
@Test(expected = IllegalArgumentException.class)
84+
public void testConvertContigNameToInt_NullInput() {
85+
ChrPositionUtils.convertContigNameToInt(null);
86+
}
87+
88+
@Test(expected = IllegalArgumentException.class)
89+
public void testConvertContigNameToInt_EmptyInput() {
90+
ChrPositionUtils.convertContigNameToInt("");
91+
}
92+
3893
@Test
3994
public void testConvertChrPositionToLong() {
4095
long expected = ((long) 4 << 32) + 9;

0 commit comments

Comments
 (0)