@@ -23,6 +23,13 @@ public class Annotate {
2323
2424 static final List <String > SEARCH_TERM_VARIETIES = Arrays .asList (">" , "->" , "-->" , "/" );
2525
26+ // Define the set of standard GRCh38 contigs (no "chr" prefix version)
27+ private static final Set <String > STANDARD_GRCH38_CONTIGS = Set .of (
28+ "1" , "2" , "3" , "4" , "5" , "6" , "7" , "8" , "9" , "10" ,
29+ "11" , "12" , "13" , "14" , "15" , "16" , "17" , "18" , "19" , "20" ,
30+ "21" , "22" , "X" , "Y" , "MT" , "M"
31+ );
32+
2633 static Comparator <String []> CUSTOM_COMPARATOR ;
2734 static QLogger logger ;
2835
@@ -71,70 +78,77 @@ public int engage() throws Exception {
7178
7279 ChrPosition lastCP = null ;
7380 try (
74- VcfFileReader reader = new VcfFileReader (inputFile )) {
81+ VcfFileReader reader = new VcfFileReader (inputFile )) {
7582 logger .info ("VcfFileReader has been setup" );
7683 int vcfCount = 0 ;
84+ int nonStandardContigCount = 0 ;
7785 for (VcfRecord vcf : reader ) {
7886 vcfCount ++;
7987
8088 ChrPosition thisVcfsCP = vcf .getChrPositionRefAlt ();
8189 logger .debug ("thisVcfsCP: " + thisVcfsCP .toIGVString ());
8290
91+ boolean isStandardContig = isStandardContig (thisVcfsCP );
92+ if (isStandardContig ) {
8393
84- /*
85- * check that this CP is "after" the last CP
86- */
87- int compare = null != lastCP ? ((ChrPositionRefAlt ) thisVcfsCP ).compareTo ((ChrPositionRefAlt ) lastCP ) : 0 ;
88- if (compare < 0 ) {
89- throw new IllegalArgumentException ("Incorrect order of vcf records in input vcf file! this vcf: " + thisVcfsCP .toIGVString () + ", last vcf: " + lastCP .toIGVString ());
90- }
9194
92-
93- String alt = ((ChrPositionRefAlt ) thisVcfsCP ).getAlt ();
94- String gatkAD = VcfUtils .getFormatField (vcf .getFormatFields (), "AD" , 0 );
95- String gatkGT = VcfUtils .getFormatField (vcf .getFormatFields (), "GT" , 0 );
96-
97- if (alt .contains ("," )) {
98- logger .info ("alt has comma: " + thisVcfsCP );
9995 /*
100- * split record, create new ChrPositions for each
96+ * check that this CP is "after" the last CP
10197 */
102- String [] altArray = alt .split ("," );
103- List <VcfRecord > splitVcfs = new ArrayList <>();
104- for (String thisAlt : altArray ) {
105- if (thisAlt .equals ("*" )) {
106- /*
107- * ignore
108- */
109- } else {
110- VcfRecord newVcf = VcfUtils .cloneWithNewAlt (vcf , thisAlt );
111- splitVcfs .add (newVcf );
112- }
98+ int compare = null != lastCP ? ((ChrPositionRefAlt ) thisVcfsCP ).compareTo ((ChrPositionRefAlt ) lastCP ) : 0 ;
99+ if (compare < 0 ) {
100+ throw new IllegalArgumentException ("Incorrect order of vcf records in input vcf file! this vcf: " + thisVcfsCP .toIGVString () + ", last vcf: " + lastCP .toIGVString ());
113101 }
114- if (splitVcfs .size () > 1 ) {
102+
103+
104+ String alt = ((ChrPositionRefAlt ) thisVcfsCP ).getAlt ();
105+ String gatkAD = VcfUtils .getFormatField (vcf .getFormatFields (), "AD" , 0 );
106+ String gatkGT = VcfUtils .getFormatField (vcf .getFormatFields (), "GT" , 0 );
107+
108+ if (alt .contains ("," )) {
109+ logger .info ("alt has comma: " + thisVcfsCP );
115110 /*
116- * sort
111+ * split record, create new ChrPositions for each
117112 */
118- splitVcfs .sort (null );
119- }
120- for (VcfRecord splitVcf : splitVcfs ) {
121- List <String > annotations = new ArrayList <>(getAnnotationsForPosition (splitVcf .getChrPositionRefAlt (), annotationSources , executor ));
122- queue .add (new ChrPositionAnnotations (splitVcf .getChrPositionRefAlt (), annotations , gatkAD , gatkGT , alt ));
123- }
113+ String [] altArray = alt .split ("," );
114+ List <VcfRecord > splitVcfs = new ArrayList <>();
115+ for (String thisAlt : altArray ) {
116+ if (thisAlt .equals ("*" )) {
117+ /*
118+ * ignore
119+ */
120+ } else {
121+ VcfRecord newVcf = VcfUtils .cloneWithNewAlt (vcf , thisAlt );
122+ splitVcfs .add (newVcf );
123+ }
124+ }
125+ if (splitVcfs .size () > 1 ) {
126+ /*
127+ * sort
128+ */
129+ splitVcfs .sort (null );
130+ }
131+ for (VcfRecord splitVcf : splitVcfs ) {
132+ List <String > annotations = new ArrayList <>(getAnnotationsForPosition (splitVcf .getChrPositionRefAlt (), annotationSources , executor ));
133+ queue .add (new ChrPositionAnnotations (splitVcf .getChrPositionRefAlt (), annotations , gatkAD , gatkGT , alt ));
134+ }
124135
125- } else {
136+ } else {
126137
127- logger .debug ("about to get annotations for: " + thisVcfsCP .toIGVString ());
128- List <String > annotations = getAnnotationsForPosition (thisVcfsCP , annotationSources , executor );
129- logger .debug ("got annotations for: " + thisVcfsCP .toIGVString () + " - adding to queue" );
130- queue .add (new ChrPositionAnnotations (thisVcfsCP , annotations , gatkAD , gatkGT , alt ));
138+ logger .debug ("about to get annotations for: " + thisVcfsCP .toIGVString ());
139+ List <String > annotations = getAnnotationsForPosition (thisVcfsCP , annotationSources , executor );
140+ logger .debug ("got annotations for: " + thisVcfsCP .toIGVString () + " - adding to queue" );
141+ queue .add (new ChrPositionAnnotations (thisVcfsCP , annotations , gatkAD , gatkGT , alt ));
131142
132- }
143+ }
133144
134- lastCP = thisVcfsCP ;
145+ lastCP = thisVcfsCP ;
146+ } else {
147+ nonStandardContigCount ++;
148+ }
135149 }
136150
137- logger .info ("# of vcf records: " + vcfCount );
151+ logger .info ("# of vcf records: " + vcfCount + ", # of non-standard contigs: " + nonStandardContigCount );
138152 } finally {
139153 /*
140154 * count down the count down latch
@@ -147,6 +161,10 @@ public int engage() throws Exception {
147161 return exitStatus ;
148162 }
149163
164+ private boolean isStandardContig (ChrPosition thisVcfsCP ) {
165+ return thisVcfsCP .getChromosome ().startsWith ("chr" ) ? STANDARD_GRCH38_CONTIGS .contains (thisVcfsCP .getChromosome ().substring (3 )) : STANDARD_GRCH38_CONTIGS .contains (thisVcfsCP .getChromosome ());
166+ }
167+
150168
151169 private static List <String > getAnnotationsForPosition (ChrPosition cp , List <AnnotationSource > annotationSources , Executor executor ) {
152170 long contigAndPosition = ((ChrPositionUtils .convertContigAndPositionToLong (cp .getChromosome ().startsWith ("chr" ) ? cp .getChromosome ().substring (3 ) : cp .getChromosome (), cp .getStartPosition ())));
0 commit comments