diff --git a/build.xml b/build.xml
index abe3a32a15..d3e25d4244 100644
--- a/build.xml
+++ b/build.xml
@@ -1,5 +1,5 @@
-
+
+
+
+
+
+
+
+
diff --git a/ivy.xml b/ivy.xml
index 4f41904ba4..06296c6b4a 100644
--- a/ivy.xml
+++ b/ivy.xml
@@ -1,3 +1,26 @@
+
@@ -18,10 +41,9 @@
-
+
-
@@ -40,17 +62,17 @@
-
+
-
+
-
+
diff --git a/public/R/scripts/org/broadinstitute/sting/queue/util/queueJobReport.R b/public/R/scripts/org/broadinstitute/sting/queue/util/queueJobReport.R
index d5ee3626f4..ae340e688d 100644
--- a/public/R/scripts/org/broadinstitute/sting/queue/util/queueJobReport.R
+++ b/public/R/scripts/org/broadinstitute/sting/queue/util/queueJobReport.R
@@ -1,6 +1,7 @@
 library(gsalib)
-require("ggplot2")
-require("gplots")
+library(ggplot2)
+library(gplots)
+library(tools)
 
 #
 # Standard command line switch. Can we loaded interactively for development
@@ -201,4 +202,7 @@ for ( group in gatkReportData ) {
 
 if ( ! is.na(outputPDF) ) {
   dev.off()
-}
+  if (exists("compactPDF")) {
+    compactPDF(outputPDF)
+  }
+}
diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R
index 46bbf7eda5..876cf5cbc9 100644
--- a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R
+++ b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R
@@ -4,11 +4,9 @@
     colnames(d) = tableHeader;
 
     for (i in 1:ncol(d)) {
-        v = suppressWarnings(as.numeric(d[,i]));
-
-        if (length(na.omit(as.numeric(v))) == length(d[,i])) {
-            d[,i] = v;
-        }
+        # use the general type.convert infrastructure of read.table to convert column data to R types
+        v = type.convert(d[,i])
+        d[,i] = v;
     }
 
     usedNames = ls(envir=tableEnv, pattern=tableName);
diff --git a/public/c/bwa/build_linux.sh b/public/c/bwa/build_linux.sh
index b3631a28df..8683bb3772 100755
--- a/public/c/bwa/build_linux.sh
+++ b/public/c/bwa/build_linux.sh
@@ -1,5 +1,5 @@
 #!/bin/sh
-export BWA_HOME="/humgen/gsa-scr1/hanna/src/bwa-trunk/bwa"
+export BWA_HOME="/humgen/gsa-scr1/hanna/src/bio-bwa/bwa"
 export JAVA_INCLUDE="/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/include -I/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/include/linux"
 export TARGET_LIB="libbwa.so"
 export EXTRA_LIBS="-lc -lz -lstdc++ -lpthread"
diff --git a/public/c/bwa/bwa_gateway.cpp b/public/c/bwa/bwa_gateway.cpp
index 00f5aa5bcd..088ee43bf9 100644
--- a/public/c/bwa/bwa_gateway.cpp
+++ b/public/c/bwa/bwa_gateway.cpp
@@ -233,6 +233,8 @@ void BWA::set_disallow_indel_within_range(int indel_range) { options.indel_end_s
 void BWA::set_mismatch_penalty(int penalty) { options.s_mm = penalty; }
 void BWA::set_gap_open_penalty(int penalty) { options.s_gapo = penalty; }
 void BWA::set_gap_extension_penalty(int penalty) { options.s_gape = penalty; }
+void BWA::set_mode_nonstop() { options.mode |= BWA_MODE_NONSTOP; options.max_top2 = 0x7fffffff; }
+void BWA::set_max_entries_in_queue(int max_entries) { options.max_entries = max_entries; }
 
 /**
  * Create a sequence with a set of reasonable initial defaults.
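Two notes on the new gateway entry points before the header and JNI changes below. set_mode_nonstop() mirrors what bwa aln -N does: it disables the iterative search, and raising max_top2 to 0x7fffffff effectively removes the top-hit cutoff so every hit within the edit-distance budget is kept. set_max_entries_in_queue() bounds the partial-alignment queue, much like bwa aln -m. The JNI glue that follows drives such setters through pointers to member functions; the sketch below is a minimal, self-contained illustration of that idiom, with a toy Aligner class and hypothetical dispatcher names standing in for the real BWA gateway and its JNI wrappers:

#include <cstdio>

// Toy stand-in for the gateway class (hypothetical; the real class is BWA in bwa_gateway.h).
class Aligner {
  bool nonstop;
  int max_entries;
public:
  Aligner() : nonstop(false), max_entries(2000000) {}
  void set_mode_nonstop() { nonstop = true; }
  void set_max_entries_in_queue(int n) { max_entries = n; }
  void dump() const { std::printf("nonstop=%d max_entries=%d\n", nonstop, max_entries); }
};

// The same typedef shapes the patch introduces: a no-argument setter for flags,
// an int-valued setter for numeric options.
typedef void (Aligner::*boolean_setter)();
typedef void (Aligner::*int_setter)(int);

// Generic dispatchers through a pointer-to-member-function, analogous to
// set_boolean_configuration_param() / set_int_configuration_param() minus the JNI unboxing.
static void apply_flag(Aligner* a, boolean_setter s) { (a->*s)(); }
static void apply_int(Aligner* a, int_setter s, int v) { (a->*s)(v); }

int main() {
  Aligner a;
  apply_flag(&a, &Aligner::set_mode_nonstop);
  apply_int(&a, &Aligner::set_max_entries_in_queue, 250000);
  a.dump();  // prints: nonstop=1 max_entries=250000
  return 0;
}

The payoff of the idiom: one generic unboxing routine per Java box type (Boolean, Integer, Float) can configure any number of options, instead of requiring a hand-written JNI accessor per option.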
diff --git a/public/c/bwa/bwa_gateway.h b/public/c/bwa/bwa_gateway.h
index 2d26ec6509..62756ec2a1 100644
--- a/public/c/bwa/bwa_gateway.h
+++ b/public/c/bwa/bwa_gateway.h
@@ -60,6 +60,8 @@ class BWA {
   void set_mismatch_penalty(int penalty);
   void set_gap_open_penalty(int penalty);
   void set_gap_extension_penalty(int penalty);
+  void set_mode_nonstop();
+  void set_max_entries_in_queue(int max_entries);
 
   // Perform the alignment
   Alignment* generate_single_alignment(const char* bases,
diff --git a/public/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.cpp b/public/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.cpp
index 1ccbef0d41..90d70d4a1b 100644
--- a/public/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.cpp
+++ b/public/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.cpp
@@ -8,11 +8,13 @@
 #include "bwa_gateway.h"
 #include "org_broadinstitute_sting_alignment_bwa_c_BWACAligner.h"
 
+typedef void (BWA::*boolean_setter)();
 typedef void (BWA::*int_setter)(int value);
 typedef void (BWA::*float_setter)(float value);
 
 static jobject convert_to_java_alignment(JNIEnv* env, const jbyte* read_bases, const jsize read_length, const Alignment& alignment);
 static jstring get_configuration_file(JNIEnv* env, jobject configuration, const char* field_name);
+static void set_boolean_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, boolean_setter setter);
 static void set_int_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, int_setter setter);
 static void set_float_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, float_setter setter);
 static void throw_config_value_exception(JNIEnv* env, const char* field_name, const char* message);
@@ -100,6 +102,10 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner
     if(env->ExceptionCheck()) return;
     set_int_configuration_param(env, configuration, "gapExtensionPenalty", bwa, &BWA::set_gap_extension_penalty);
     if(env->ExceptionCheck()) return;
+    set_boolean_configuration_param(env, configuration, "nonStopMode", bwa, &BWA::set_mode_nonstop);
+    if(env->ExceptionCheck()) return;
+    set_int_configuration_param(env, configuration, "maxEntriesInQueue", bwa, &BWA::set_max_entries_in_queue);
+    if(env->ExceptionCheck()) return;
 }
 
 JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getPaths(JNIEnv *env, jobject instance, jlong java_bwa, jbyteArray java_bases)
@@ -357,6 +363,36 @@ static jstring get_configuration_file(JNIEnv* env, jobject configuration, const
     return path;
 }
 
+static void set_boolean_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, boolean_setter setter) {
+    jclass configuration_class = env->GetObjectClass(configuration);
+    if(configuration_class == NULL) return;
+
+    jfieldID configuration_field = env->GetFieldID(configuration_class, field_name, "Ljava/lang/Boolean;");
+    if(configuration_field == NULL) return;
+
+    jobject boxed_value = env->GetObjectField(configuration,configuration_field);
+    if(env->ExceptionCheck()) return;
+
+    if(boxed_value != NULL) {
+        jclass boolean_box_class = env->FindClass("java/lang/Boolean");
+        if(boolean_box_class == NULL) return;
+
+        jmethodID boolean_extractor = env->GetMethodID(boolean_box_class,"booleanValue", "()Z");
+        if(boolean_extractor == NULL) return;
+
+        jboolean value = env->CallBooleanMethod(boxed_value,boolean_extractor);
+        if(env->ExceptionCheck()) return;
+
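+        // Editorial comment (not in the patch): the field is a boxed java.lang.Boolean
+        // precisely so that null can mean "not configured"; only a non-null, true value
+        // triggers the setter below, leaving the gateway defaults untouched otherwise.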
+        if(value)
+            (bwa->*setter)();
+
+        env->DeleteLocalRef(boolean_box_class);
+    }
+
+    env->DeleteLocalRef(boxed_value);
+    env->DeleteLocalRef(configuration_class);
+}
+
 static void set_int_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, int_setter setter) {
     jclass configuration_class = env->GetObjectClass(configuration);
     if(configuration_class == NULL) return;
diff --git a/public/java/src/net/sf/picard/sam/MergingSamRecordIterator.java b/public/java/src/net/sf/picard/sam/MergingSamRecordIterator.java
deleted file mode 100644
index 4b1c7a9994..0000000000
--- a/public/java/src/net/sf/picard/sam/MergingSamRecordIterator.java
+++ /dev/null
@@ -1,247 +0,0 @@
-/*
- * Copyright (c) 2011, The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-package net.sf.picard.sam;
-
-import net.sf.picard.PicardException;
-
-import java.util.*;
-import java.lang.reflect.Constructor;
-
-import net.sf.samtools.*;
-import net.sf.samtools.util.CloseableIterator;
-
-/**
- * Provides an iterator interface for merging multiple underlying iterators into a single
- * iterable stream. The underlying iterators/files must all have the same sort order unless
- * the requested output format is unsorted, in which case any combination is valid.
- */
-public class MergingSamRecordIterator implements CloseableIterator<SAMRecord> {
-    private final PriorityQueue<ComparableSamRecordIterator> pq;
-    private final SamFileHeaderMerger samHeaderMerger;
-    private final Collection<SAMFileReader> readers;
-    private final SAMFileHeader.SortOrder sortOrder;
-    private final SAMRecordComparator comparator;
-
-    private boolean initialized = false;
-    private boolean iterationStarted = false;
-
-    /**
-     * Constructs a new merging iterator with the same set of readers and sort order as
-     * provided by the header merger parameter.
-     * @param headerMerger The merged header and contents of readers.
-     * @param forcePresorted True to ensure that the iterator checks the headers of the readers for appropriate sort order.
-     * @deprecated replaced by (SamFileHeaderMerger, Collection<SAMFileReader>, boolean)
-     */
-    public MergingSamRecordIterator(final SamFileHeaderMerger headerMerger, final boolean forcePresorted) {
-        this(headerMerger, headerMerger.getReaders(), forcePresorted);
-    }
-
-    /**
-     * Constructs a new merging iterator with the same set of readers and sort order as
-     * provided by the header merger parameter.
-     * @param headerMerger The merged header and contents of readers.
- * @param assumeSorted false ensures that the iterator checks the headers of the readers for appropriate sort order. - */ - public MergingSamRecordIterator(final SamFileHeaderMerger headerMerger, Collection readers, final boolean assumeSorted) { - this.samHeaderMerger = headerMerger; - this.sortOrder = headerMerger.getMergedHeader().getSortOrder(); - this.comparator = getComparator(); - this.readers = readers; - - this.pq = new PriorityQueue(readers.size()); - - for (final SAMFileReader reader : readers) { - if (!assumeSorted && this.sortOrder != SAMFileHeader.SortOrder.unsorted && - reader.getFileHeader().getSortOrder() != this.sortOrder){ - throw new PicardException("Files are not compatible with sort order"); - } - } - } - - /** - * Add a given SAM file iterator to the merging iterator. Use this to restrict the merged iteration to a given genomic interval, - * rather than iterating over every read in the backing file or stream. - * @param reader Reader to add to the merging iterator. - * @param iterator Iterator traversing over reader contents. - */ - public void addIterator(final SAMFileReader reader, final CloseableIterator iterator) { - if(iterationStarted) - throw new PicardException("Cannot add another iterator; iteration has already begun"); - if(!samHeaderMerger.containsHeader(reader.getFileHeader())) - throw new PicardException("All iterators to be merged must be accounted for in the SAM header merger"); - final ComparableSamRecordIterator comparableIterator = new ComparableSamRecordIterator(reader,iterator,comparator); - addIfNotEmpty(comparableIterator); - initialized = true; - } - - private void startIterationIfRequired() { - if(initialized) - return; - for(SAMFileReader reader: readers) - addIterator(reader,reader.iterator()); - iterationStarted = true; - } - - /** - * Close down all open iterators. - */ - public void close() { - // Iterators not in the priority queue have already been closed; only close down the iterators that are still in the priority queue. - for(CloseableIterator iterator: pq) - iterator.close(); - } - - /** Returns true if any of the underlying iterators has more records, otherwise false. */ - public boolean hasNext() { - startIterationIfRequired(); - return !this.pq.isEmpty(); - } - - /** Returns the next record from the top most iterator during merging. 
*/ - public SAMRecord next() { - startIterationIfRequired(); - - final ComparableSamRecordIterator iterator = this.pq.poll(); - final SAMRecord record = iterator.next(); - addIfNotEmpty(iterator); - record.setHeader(this.samHeaderMerger.getMergedHeader()); - - // Fix the read group if needs be - if (this.samHeaderMerger.hasReadGroupCollisions()) { - final String oldGroupId = (String) record.getAttribute(ReservedTagConstants.READ_GROUP_ID); - if (oldGroupId != null ) { - final String newGroupId = this.samHeaderMerger.getReadGroupId(iterator.getReader().getFileHeader(),oldGroupId); - record.setAttribute(ReservedTagConstants.READ_GROUP_ID, newGroupId); - } - } - - // Fix the program group if needs be - if (this.samHeaderMerger.hasProgramGroupCollisions()) { - final String oldGroupId = (String) record.getAttribute(ReservedTagConstants.PROGRAM_GROUP_ID); - if (oldGroupId != null ) { - final String newGroupId = this.samHeaderMerger.getProgramGroupId(iterator.getReader().getFileHeader(),oldGroupId); - record.setAttribute(ReservedTagConstants.PROGRAM_GROUP_ID, newGroupId); - } - } - - // Fix up the sequence indexes if needs be - if (this.samHeaderMerger.hasMergedSequenceDictionary()) { - if (record.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { - record.setReferenceIndex(this.samHeaderMerger.getMergedSequenceIndex(iterator.getReader().getFileHeader(),record.getReferenceIndex())); - } - - if (record.getReadPairedFlag() && record.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { - record.setMateReferenceIndex(this.samHeaderMerger.getMergedSequenceIndex(iterator.getReader().getFileHeader(),record.getMateReferenceIndex())); - } - } - - return record; - } - - /** - * Adds iterator to priority queue. If the iterator has more records it is added - * otherwise it is closed and not added. - */ - private void addIfNotEmpty(final ComparableSamRecordIterator iterator) { - if (iterator.hasNext()) { - pq.offer(iterator); - } - else { - iterator.close(); - } - } - - /** Unsupported operation. */ - public void remove() { - throw new UnsupportedOperationException("MergingSAMRecorderIterator.remove()"); - } - - /** - * Get the right comparator for a given sort order (coordinate, alphabetic). In the - * case of "unsorted" it will return a comparator that gives an arbitrary but reflexive - * ordering. - */ - private SAMRecordComparator getComparator() { - // For unsorted build a fake comparator that compares based on object ID - if (this.sortOrder == SAMFileHeader.SortOrder.unsorted) { - return new SAMRecordComparator() { - public int fileOrderCompare(final SAMRecord lhs, final SAMRecord rhs) { - return System.identityHashCode(lhs) - System.identityHashCode(rhs); - } - - public int compare(final SAMRecord lhs, final SAMRecord rhs) { - return fileOrderCompare(lhs, rhs); - } - }; - } - if (samHeaderMerger.hasMergedSequenceDictionary() && sortOrder.equals(SAMFileHeader.SortOrder.coordinate)) { - return new MergedSequenceDictionaryCoordinateOrderComparator(); - } - - // Otherwise try and figure out what kind of comparator to return and build it - return this.sortOrder.getComparatorInstance(); - } - - /** Returns the merged header that the merging iterator is working from. */ - public SAMFileHeader getMergedHeader() { - return this.samHeaderMerger.getMergedHeader(); - } - - /** - * Ugh. Basically does a regular coordinate compare, but looks up the sequence indices in the merged - * sequence dictionary. 
I hate the fact that this extends SAMRecordCoordinateComparator, but it avoids - * more copy & paste. - */ - private class MergedSequenceDictionaryCoordinateOrderComparator extends SAMRecordCoordinateComparator { - - public int fileOrderCompare(final SAMRecord samRecord1, final SAMRecord samRecord2) { - final int referenceIndex1 = getReferenceIndex(samRecord1); - final int referenceIndex2 = getReferenceIndex(samRecord2); - if (referenceIndex1 != referenceIndex2) { - if (referenceIndex1 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { - return 1; - } else if (referenceIndex2 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { - return -1; - } else { - return referenceIndex1 - referenceIndex2; - } - } - if (referenceIndex1 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { - // Both are unmapped. - return 0; - } - return samRecord1.getAlignmentStart() - samRecord2.getAlignmentStart(); - } - - private int getReferenceIndex(final SAMRecord samRecord) { - if (samRecord.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { - return samHeaderMerger.getMergedSequenceIndex(samRecord.getHeader(), samRecord.getReferenceIndex()); - } - if (samRecord.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { - return samHeaderMerger.getMergedSequenceIndex(samRecord.getHeader(), samRecord.getMateReferenceIndex()); - } - return SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX; - } - } -} diff --git a/public/java/src/net/sf/picard/sam/SamFileHeaderMerger.java b/public/java/src/net/sf/picard/sam/SamFileHeaderMerger.java deleted file mode 100644 index f78cd81dac..0000000000 --- a/public/java/src/net/sf/picard/sam/SamFileHeaderMerger.java +++ /dev/null @@ -1,744 +0,0 @@ -/* - * The MIT License - * - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ -package net.sf.picard.sam; - -import java.util.*; - -import net.sf.picard.PicardException; -import net.sf.samtools.AbstractSAMHeaderRecord; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMFileReader; -import net.sf.samtools.SAMProgramRecord; -import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMSequenceDictionary; -import net.sf.samtools.SAMSequenceRecord; -import net.sf.samtools.util.SequenceUtil; - -/** - * Merges SAMFileHeaders that have the same sequences into a single merged header - * object while providing read group translation for cases where read groups - * clash across input headers. 
- */ -public class SamFileHeaderMerger { - //Super Header to construct - private final SAMFileHeader mergedHeader; - private Collection readers; - private final Collection headers; - - //Translation of old group ids to new group ids - private final Map> samReadGroupIdTranslation = - new IdentityHashMap>(); - - //the read groups from different files use the same group ids - private boolean hasReadGroupCollisions = false; - - //the program records from different files use the same program record ids - private boolean hasProgramGroupCollisions = false; - - //Translation of old program group ids to new program group ids - private Map> samProgramGroupIdTranslation = - new IdentityHashMap>(); - - private boolean hasMergedSequenceDictionary = false; - - // Translation of old sequence dictionary ids to new dictionary ids - // This is an IdentityHashMap because it can be quite expensive to compute the hashCode for - // large SAMFileHeaders. It is possible that two input files will have identical headers so that - // the regular HashMap would fold them together, but the value stored in each of the two - // Map entries will be the same, so it should not hurt anything. - private final Map> samSeqDictionaryIdTranslationViaHeader = - new IdentityHashMap>(); - - //HeaderRecordFactory that creates SAMReadGroupRecord instances. - private static final HeaderRecordFactory READ_GROUP_RECORD_FACTORY = new HeaderRecordFactory() { - public SAMReadGroupRecord createRecord(String id, SAMReadGroupRecord srcReadGroupRecord) { - return new SAMReadGroupRecord(id, srcReadGroupRecord); - } - }; - - //HeaderRecordFactory that creates SAMProgramRecord instances. - private static final HeaderRecordFactory PROGRAM_RECORD_FACTORY = new HeaderRecordFactory() { - public SAMProgramRecord createRecord(String id, SAMProgramRecord srcProgramRecord) { - return new SAMProgramRecord(id, srcProgramRecord); - } - }; - - //comparator used to sort lists of program group and read group records - private static final Comparator RECORD_ID_COMPARATOR = new Comparator() { - public int compare(AbstractSAMHeaderRecord o1, AbstractSAMHeaderRecord o2) { - return o1.getId().compareTo(o2.getId()); - } - }; - - /** - * Create SAMFileHeader with additional information. Required that sequence dictionaries agree. - * - * @param readers sam file readers to combine - * @param sortOrder sort order new header should have - * @deprecated replaced by SamFileHeaderMerger(Collection, SAMFileHeader.SortOrder, boolean) - */ - public SamFileHeaderMerger(final Collection readers, final SAMFileHeader.SortOrder sortOrder) { - this(readers, sortOrder, false); - } - - /** - * Create SAMFileHeader with additional information. - * - * @param readers sam file readers to combine - * @param sortOrder sort order new header should have - * @param mergeDictionaries If true, merge sequence dictionaries in new header. If false, require that - * all input sequence dictionaries be identical. - * @deprecated replaced by SamFileHeaderMerger(Collection, SAMFileHeader.SortOrder, boolean) - */ - public SamFileHeaderMerger(final Collection readers, final SAMFileHeader.SortOrder sortOrder, final boolean mergeDictionaries) { - this(sortOrder, getHeadersFromReaders(readers), mergeDictionaries); - this.readers = readers; - } - - /** - * Create SAMFileHeader with additional information.. This is the preferred constructor. 
- * - * @param sortOrder sort order new header should have - * @param headers sam file headers to combine - * @param mergeDictionaries If true, merge sequence dictionaries in new header. If false, require that - * all input sequence dictionaries be identical. - */ - public SamFileHeaderMerger(final SAMFileHeader.SortOrder sortOrder, final Collection headers, final boolean mergeDictionaries) { - this.headers = headers; - this.mergedHeader = new SAMFileHeader(); - - SAMSequenceDictionary sequenceDictionary; - try { - sequenceDictionary = getSequenceDictionary(headers); - this.hasMergedSequenceDictionary = false; - } - catch (SequenceUtil.SequenceListsDifferException pe) { - if (mergeDictionaries) { - sequenceDictionary = mergeSequenceDictionaries(headers); - this.hasMergedSequenceDictionary = true; - } - else { - throw pe; - } - } - - this.mergedHeader.setSequenceDictionary(sequenceDictionary); - - // Set program that creates input alignments - for (final SAMProgramRecord program : mergeProgramGroups(headers)) { - this.mergedHeader.addProgramRecord(program); - } - - // Set read groups for merged header - final List readGroups = mergeReadGroups(headers); - this.mergedHeader.setReadGroups(readGroups); - this.mergedHeader.setGroupOrder(SAMFileHeader.GroupOrder.none); - - this.mergedHeader.setSortOrder(sortOrder); - - for (final SAMFileHeader header : headers) { - for (final String comment : header.getComments()) { - this.mergedHeader.addComment(comment); - } - } - } - - // Utilility method to make use with old constructor - private static List getHeadersFromReaders(Collection readers) { - List headers = new ArrayList(readers.size()); - for (SAMFileReader reader : readers) { - headers.add(reader.getFileHeader()); - } - return headers; - } - - - /** - * Checks to see if there are clashes where different readers are using the same read - * group IDs. If yes, then those IDs that collided are remapped. - * - * @param headers headers to combine - * @return new list of read groups constructed from all the readers - */ - private List mergeReadGroups(final Collection headers) { - //prepare args for mergeHeaderRecords(..) call - final HashSet idsThatAreAlreadyTaken = new HashSet(); - - final List> readGroupsToProcess = new LinkedList>(); - for (final SAMFileHeader header : headers) { - for (final SAMReadGroupRecord readGroup : header.getReadGroups()) { - //verify that there are no existing id collisions in this input file - if(!idsThatAreAlreadyTaken.add(readGroup.getId())) - throw new PicardException("Input file: " + header + " contains more than one RG with the same id (" + readGroup.getId() + ")"); - - readGroupsToProcess.add(new HeaderRecordAndFileHeader(readGroup, header)); - } - idsThatAreAlreadyTaken.clear(); - } - - final List result = new LinkedList(); - - hasReadGroupCollisions = mergeHeaderRecords(readGroupsToProcess, READ_GROUP_RECORD_FACTORY, idsThatAreAlreadyTaken, samReadGroupIdTranslation, result); - - //sort the result list by record id - Collections.sort(result, RECORD_ID_COMPARATOR); - - return result; - } - - - /** - * Checks to see if there are clashes where different readers are using the same program - * group IDs. If yes, then those IDs that collided are remapped. - * - * @param headers headers to combine - * @return new list of program groups constructed from all the readers - */ - private List mergeProgramGroups(final Collection headers) { - - final List overallResult = new LinkedList(); - - //this Set will accumulate all SAMProgramRecord ids that have been encountered so far. 
- final HashSet idsThatAreAlreadyTaken = new HashSet(); - - //need to process all program groups - List> programGroupsLeftToProcess = new LinkedList>(); - for (final SAMFileHeader header : headers) { - for (final SAMProgramRecord programGroup : header.getProgramRecords()) { - //verify that there are no existing id collisions in this input file - if(!idsThatAreAlreadyTaken.add(programGroup.getId())) - throw new PicardException("Input file: " + header + " contains more than one PG with the same id (" + programGroup.getId() + ")"); - - programGroupsLeftToProcess.add(new HeaderRecordAndFileHeader(programGroup, header)); - } - idsThatAreAlreadyTaken.clear(); - } - - //A program group header (lets say ID=2 PN=B PP=1) may have a PP (previous program) attribute which chains it to - //another program group header (lets say ID=1 PN=A) to indicate that the given file was - //processed by program A followed by program B. These PP attributes potentially - //connect headers into one or more tree structures. Merging is done by - //first merging all headers that don't have PP attributes (eg. tree roots), - //then updating and merging all headers whose PPs point to the tree-root headers, - //and so on until all program group headers are processed. - - //currentProgramGroups is the list of records to merge next. Start by merging the programGroups that don't have a PP attribute (eg. the tree roots). - List< HeaderRecordAndFileHeader > currentProgramGroups = new LinkedList>(); - for(final Iterator> programGroupsLeftToProcessIterator = programGroupsLeftToProcess.iterator(); programGroupsLeftToProcessIterator.hasNext(); ) { - final HeaderRecordAndFileHeader pair = programGroupsLeftToProcessIterator.next(); - if(pair.getHeaderRecord().getAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG) == null) { - programGroupsLeftToProcessIterator.remove(); - currentProgramGroups.add(pair); - } - } - - //merge currentProgramGroups - while(!currentProgramGroups.isEmpty()) - { - final List currentResult = new LinkedList(); - - hasProgramGroupCollisions |= mergeHeaderRecords(currentProgramGroups, PROGRAM_RECORD_FACTORY, idsThatAreAlreadyTaken, samProgramGroupIdTranslation, currentResult); - - //add currentResults to overallResults - overallResult.addAll(currentResult); - - //apply the newly-computed id translations to currentProgramGroups and programGroupsLeftToProcess - currentProgramGroups = translateIds(currentProgramGroups, samProgramGroupIdTranslation, false); - programGroupsLeftToProcess = translateIds(programGroupsLeftToProcess, samProgramGroupIdTranslation, true); - - //find all records in programGroupsLeftToProcess whose ppId points to a record that was just processed (eg. a record that's in currentProgramGroups), - //and move them to the list of programGroupsToProcessNext. 
- LinkedList> programGroupsToProcessNext = new LinkedList>(); - for(final Iterator> programGroupsLeftToProcessIterator = programGroupsLeftToProcess.iterator(); programGroupsLeftToProcessIterator.hasNext(); ) { - final HeaderRecordAndFileHeader pairLeftToProcess = programGroupsLeftToProcessIterator.next(); - final Object ppIdOfRecordLeftToProcess = pairLeftToProcess.getHeaderRecord().getAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG); - //find what currentProgramGroups this ppId points to (NOTE: they have to come from the same file) - for(final HeaderRecordAndFileHeader justProcessedPair : currentProgramGroups) { - String idJustProcessed = justProcessedPair.getHeaderRecord().getId(); - if(pairLeftToProcess.getFileHeader() == justProcessedPair.getFileHeader() && ppIdOfRecordLeftToProcess.equals(idJustProcessed)) { - programGroupsLeftToProcessIterator.remove(); - programGroupsToProcessNext.add(pairLeftToProcess); - break; - } - } - } - - currentProgramGroups = programGroupsToProcessNext; - } - - //verify that all records were processed - if(!programGroupsLeftToProcess.isEmpty()) { - StringBuffer errorMsg = new StringBuffer(programGroupsLeftToProcess.size() + " program groups weren't processed. Do their PP ids point to existing PGs? \n"); - for( final HeaderRecordAndFileHeader pair : programGroupsLeftToProcess ) { - SAMProgramRecord record = pair.getHeaderRecord(); - errorMsg.append("@PG ID:"+record.getProgramGroupId()+" PN:"+record.getProgramName()+" PP:"+record.getPreviousProgramGroupId() +"\n"); - } - throw new PicardException(errorMsg.toString()); - } - - //sort the result list by record id - Collections.sort(overallResult, RECORD_ID_COMPARATOR); - - return overallResult; - } - - - /** - * Utility method that takes a list of program groups and remaps all their - * ids (including ppIds if requested) using the given idTranslationTable. - * - * NOTE: when remapping, this method creates new SAMProgramRecords and - * doesn't mutate any records in the programGroups list. - * - * @param programGroups The program groups to translate. - * @param idTranslationTable The translation table. - * @param translatePpIds Whether ppIds should be translated as well. - * - * @return The list of translated records. - */ - private List> translateIds( - List> programGroups, - Map> idTranslationTable, - boolean translatePpIds) { - - //go through programGroups and translate any IDs and PPs based on the idTranslationTable. - List> result = new LinkedList>(); - for(final HeaderRecordAndFileHeader pair : programGroups ) { - final SAMProgramRecord record = pair.getHeaderRecord(); - final String id = record.getProgramGroupId(); - final String ppId = (String) record.getAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG); - - final SAMFileHeader header = pair.getFileHeader(); - final Map translations = idTranslationTable.get(header); - - //see if one or both ids need to be translated - SAMProgramRecord translatedRecord = null; - if(translations != null) - { - String translatedId = translations.get( id ); - String translatedPpId = translatePpIds ? 
translations.get( ppId ) : null; - - boolean needToTranslateId = translatedId != null && !translatedId.equals(id); - boolean needToTranslatePpId = translatedPpId != null && !translatedPpId.equals(ppId); - - if(needToTranslateId && needToTranslatePpId) { - translatedRecord = new SAMProgramRecord(translatedId, record); - translatedRecord.setAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG, translatedPpId); - } else if(needToTranslateId) { - translatedRecord = new SAMProgramRecord(translatedId, record); - } else if(needToTranslatePpId) { - translatedRecord = new SAMProgramRecord(id, record); - translatedRecord.setAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG, translatedPpId); - } - } - - if(translatedRecord != null) { - result.add(new HeaderRecordAndFileHeader(translatedRecord, header)); - } else { - result.add(pair); //keep the original record - } - } - - return result; - } - - - /** - * Utility method for merging a List of AbstractSAMHeaderRecords. If it finds - * records that have identical ids and attributes, it will collapse them - * into one record. If it finds records that have identical ids but - * non-identical attributes, this is treated as a collision. When collision happens, - * the records' ids are remapped, and an old-id to new-id mapping is added to the idTranslationTable. - * - * NOTE: Non-collided records also get recorded in the idTranslationTable as - * old-id to old-id. This way, an idTranslationTable lookup should never return null. - * - * @param headerRecords The header records to merge. - * @param headerRecordFactory Constructs a specific subclass of AbstractSAMHeaderRecord. - * @param idsThatAreAlreadyTaken If the id of a headerRecord matches an id in this set, it will be treated as a collision, and the headRecord's id will be remapped. - * @param idTranslationTable When records collide, their ids are remapped, and an old-id to new-id - * mapping is added to the idTranslationTable. Non-collided records also get recorded in the idTranslationTable as - * old-id to old-id. This way, an idTranslationTable lookup should never return null. - * - * @param result The list of merged header records. - * - * @return True if there were collisions. - */ - private boolean mergeHeaderRecords(final List> headerRecords, HeaderRecordFactory headerRecordFactory, - final HashSet idsThatAreAlreadyTaken, Map> idTranslationTable, List result) { - - //The outer Map bins the header records by their ids. The nested Map further collapses - //header records which, in addition to having the same id, also have identical attributes. - //In other words, each key in the nested map represents one or more - //header records which have both identical ids and identical attributes. The List of - //SAMFileHeaders keeps track of which readers these header record(s) came from. 
- final Map>> idToRecord = - new HashMap>>(); - - //Populate the idToRecord and seenIds data structures - for (final HeaderRecordAndFileHeader pair : headerRecords) { - final RecordType record = pair.getHeaderRecord(); - final SAMFileHeader header = pair.getFileHeader(); - final String recordId = record.getId(); - Map> recordsWithSameId = idToRecord.get(recordId); - if(recordsWithSameId == null) { - recordsWithSameId = new LinkedHashMap>(); - idToRecord.put(recordId, recordsWithSameId); - } - - List fileHeaders = recordsWithSameId.get(record); - if(fileHeaders == null) { - fileHeaders = new LinkedList(); - recordsWithSameId.put(record, fileHeaders); - } - - fileHeaders.add(header); - } - - //Resolve any collisions between header records by remapping their ids. - boolean hasCollisions = false; - for (final Map.Entry>> entry : idToRecord.entrySet() ) - { - final String recordId = entry.getKey(); - final Map> recordsWithSameId = entry.getValue(); - - - for( Map.Entry> recordWithUniqueAttr : recordsWithSameId.entrySet()) { - final RecordType record = recordWithUniqueAttr.getKey(); - final List fileHeaders = recordWithUniqueAttr.getValue(); - - String newId; - if(!idsThatAreAlreadyTaken.contains(recordId)) { - //don't remap 1st record. If there are more records - //with this id, they will be remapped in the 'else'. - newId = recordId; - idsThatAreAlreadyTaken.add(recordId); - } else { - //there is more than one record with this id. - hasCollisions = true; - - //find a unique newId for this record - int idx=1; - while(idsThatAreAlreadyTaken.contains(newId = recordId + "." + Integer.toString(idx++))) - ; - - idsThatAreAlreadyTaken.add( newId ); - } - - for(SAMFileHeader fileHeader : fileHeaders) { - Map readerTranslationTable = idTranslationTable.get(fileHeader); - if(readerTranslationTable == null) { - readerTranslationTable = new HashMap(); - idTranslationTable.put(fileHeader, readerTranslationTable); - } - readerTranslationTable.put(recordId, newId); - } - - result.add( headerRecordFactory.createRecord(newId, record) ); - } - } - - return hasCollisions; - } - - - /** - * Get the sequences off the SAMFileHeader. Throws runtime exception if the sequence - * are different from one another. - * - * @param headers headers to pull sequences from - * @return sequences from files. Each file should have the same sequence - */ - private SAMSequenceDictionary getSequenceDictionary(final Collection headers) { - SAMSequenceDictionary sequences = null; - for (final SAMFileHeader header : headers) { - - if (sequences == null) { - sequences = header.getSequenceDictionary(); - } - else { - final SAMSequenceDictionary currentSequences = header.getSequenceDictionary(); - SequenceUtil.assertSequenceDictionariesEqual(sequences, currentSequences); - } - } - - return sequences; - } - - /** - * Get the sequences from the SAMFileHeader, and merge the resulting sequence dictionaries. - * - * @param headers headers to pull sequences from - * @return sequences from files. 
Each file should have the same sequence - */ - private SAMSequenceDictionary mergeSequenceDictionaries(final Collection headers) { - SAMSequenceDictionary sequences = new SAMSequenceDictionary(); - for (final SAMFileHeader header : headers) { - final SAMSequenceDictionary currentSequences = header.getSequenceDictionary(); - sequences = mergeSequences(sequences, currentSequences); - } - // second pass, make a map of the original seqeunce id -> new sequence id - createSequenceMapping(headers, sequences); - return sequences; - } - - /** - * They've asked to merge the sequence headers. What we support right now is finding the sequence name superset. - * - * @param mergeIntoDict the result of merging so far. All SAMSequenceRecords in here have been cloned from the originals. - * @param mergeFromDict A new sequence dictionary to merge into mergeIntoDict. - * @return A new sequence dictionary that resulting from merging the two inputs. - */ - private SAMSequenceDictionary mergeSequences(SAMSequenceDictionary mergeIntoDict, SAMSequenceDictionary mergeFromDict) { - - // a place to hold the sequences that we haven't found a home for, in the order the appear in mergeFromDict. - LinkedList holder = new LinkedList(); - - // Return value will be created from this. - LinkedList resultingDict = new LinkedList(); - for (final SAMSequenceRecord sequenceRecord : mergeIntoDict.getSequences()) { - resultingDict.add(sequenceRecord); - } - - // Index into resultingDict of previous SAMSequenceRecord from mergeFromDict that already existed in mergeIntoDict. - int prevloc = -1; - // Previous SAMSequenceRecord from mergeFromDict that already existed in mergeIntoDict. - SAMSequenceRecord previouslyMerged = null; - - for (SAMSequenceRecord sequenceRecord : mergeFromDict.getSequences()) { - // Does it already exist in resultingDict? - int loc = getIndexOfSequenceName(resultingDict, sequenceRecord.getSequenceName()); - if (loc == -1) { - // If doesn't already exist in resultingDict, save it an decide where to insert it later. - holder.add(sequenceRecord.clone()); - } else if (prevloc > loc) { - // If sequenceRecord already exists in resultingDict, but prior to the previous one - // from mergeIntoDict that already existed, cannot merge. - throw new PicardException("Cannot merge sequence dictionaries because sequence " + - sequenceRecord.getSequenceName() + " and " + previouslyMerged.getSequenceName() + - " are in different orders in two input sequence dictionaries."); - } else { - // Since sequenceRecord already exists in resultingDict, don't need to add it. - // Add in all the sequences prior to it that have been held in holder. - resultingDict.addAll(loc, holder); - // Remember the index of sequenceRecord so can check for merge imcompatibility. - prevloc = loc + holder.size(); - previouslyMerged = sequenceRecord; - holder.clear(); - } - } - // Append anything left in holder. - if (holder.size() != 0) { - resultingDict.addAll(holder); - } - return new SAMSequenceDictionary(resultingDict); - } - - /** - * Find sequence in list. - * @param list List to search for the sequence name. - * @param sequenceName Name to search for. - * @return Index of SAMSequenceRecord with the given name in list, or -1 if not found. - */ - private static int getIndexOfSequenceName(final List list, final String sequenceName) { - for (int i = 0; i < list.size(); ++i) { - if (list.get(i).getSequenceName().equals(sequenceName)) { - return i; - } - } - return -1; - } - - /** - * create the sequence mapping. 
This map is used to convert the unmerged header sequence ID's to the merged - * list of sequence id's. - * @param headers the collections of headers. - * @param masterDictionary the superset dictionary we've created. - */ - private void createSequenceMapping(final Collection headers, SAMSequenceDictionary masterDictionary) { - LinkedList resultingDictStr = new LinkedList(); - for (SAMSequenceRecord r : masterDictionary.getSequences()) { - resultingDictStr.add(r.getSequenceName()); - } - for (final SAMFileHeader header : headers) { - Map seqMap = new HashMap(); - SAMSequenceDictionary dict = header.getSequenceDictionary(); - for (SAMSequenceRecord rec : dict.getSequences()) { - seqMap.put(rec.getSequenceIndex(), resultingDictStr.indexOf(rec.getSequenceName())); - } - this.samSeqDictionaryIdTranslationViaHeader.put(header, seqMap); - } - } - - - - /** - * Returns the read group id that should be used for the input read and RG id. - * - * @deprecated replaced by getReadGroupId(SAMFileHeader, String) - * */ - public String getReadGroupId(final SAMFileReader reader, final String originalReadGroupId) { - return getReadGroupId(reader.getFileHeader(), originalReadGroupId); - } - - /** Returns the read group id that should be used for the input read and RG id. */ - public String getReadGroupId(final SAMFileHeader header, final String originalReadGroupId) { - return this.samReadGroupIdTranslation.get(header).get(originalReadGroupId); - } - - /** - * @param reader one of the input files - * @param originalProgramGroupId a program group ID from the above input file - * @return new ID from the merged list of program groups in the output file - * @deprecated replaced by getProgramGroupId(SAMFileHeader, String) - */ - public String getProgramGroupId(final SAMFileReader reader, final String originalProgramGroupId) { - return getProgramGroupId(reader.getFileHeader(), originalProgramGroupId); - } - - /** - * @param header one of the input headers - * @param originalProgramGroupId a program group ID from the above input file - * @return new ID from the merged list of program groups in the output file - */ - public String getProgramGroupId(final SAMFileHeader header, final String originalProgramGroupId) { - return this.samProgramGroupIdTranslation.get(header).get(originalProgramGroupId); - } - - /** Returns true if there are read group duplicates within the merged headers. */ - public boolean hasReadGroupCollisions() { - return this.hasReadGroupCollisions; - } - - /** Returns true if there are program group duplicates within the merged headers. */ - public boolean hasProgramGroupCollisions() { - return hasProgramGroupCollisions; - } - - /** @return if we've merged the sequence dictionaries, return true */ - public boolean hasMergedSequenceDictionary() { - return hasMergedSequenceDictionary; - } - - /** Returns the merged header that should be written to any output merged file. */ - public SAMFileHeader getMergedHeader() { - return this.mergedHeader; - } - - /** Returns the collection of readers that this header merger is working with. May return null. - * @deprecated replaced by getHeaders() - */ - public Collection getReaders() { - return this.readers; - } - - /** Returns the collection of readers that this header merger is working with. - */ - public Collection getHeaders() { - return this.headers; - } - - /** - * Tells whether this header merger contains a given SAM file header. 
Note that header presence - * is confirmed / blocked by == equality, rather than actually testing SAMFileHeader.equals(), for - * reasons of performance. - * @param header header to check for. - * @return True if the header exists in this HeaderMerger. False otherwise. - */ - boolean containsHeader(SAMFileHeader header) { - for(SAMFileHeader headerMergerHeader: headers) { - if(headerMergerHeader == header) - return true; - } - return false; - } - - /** - * returns the new mapping for a specified reader, given it's old sequence index - * @param reader the reader - * @param oldReferenceSequenceIndex the old sequence (also called reference) index - * @return the new index value - * @deprecated replaced by getMergedSequenceIndex(SAMFileHeader, Integer) - */ - public Integer getMergedSequenceIndex(SAMFileReader reader, Integer oldReferenceSequenceIndex) { - return this.getMergedSequenceIndex(reader.getFileHeader(), oldReferenceSequenceIndex); - } - - /** - * Another mechanism for getting the new sequence index, for situations in which the reader is not available. - * Note that if the SAMRecord has already had its header replaced with the merged header, this won't work. - * @param header The original header for the input record in question. - * @param oldReferenceSequenceIndex The original sequence index. - * @return the new index value that is compatible with the merged sequence index. - */ - public Integer getMergedSequenceIndex(final SAMFileHeader header, Integer oldReferenceSequenceIndex) { - final Map mapping = this.samSeqDictionaryIdTranslationViaHeader.get(header); - if (mapping == null) { - throw new PicardException("No sequence dictionary mapping available for header: " + header); - } - - final Integer newIndex = mapping.get(oldReferenceSequenceIndex); - if (newIndex == null) { - throw new PicardException("No mapping for reference index " + oldReferenceSequenceIndex + " from header: " + header); - } - - return newIndex; - } - - - /** - * Implementations of this interface are used by mergeHeaderRecords(..) to instantiate - * specific subclasses of AbstractSAMHeaderRecord. - */ - private static interface HeaderRecordFactory { - - /** - * Constructs a new instance of RecordType. - * @param id The id of the new record. - * @param srcRecord Except for the id, the new record will be a copy of this source record. - */ - public RecordType createRecord(final String id, RecordType srcRecord); - } - - /** - * Struct that groups together a subclass of AbstractSAMHeaderRecord with the - * SAMFileHeader that it came from. 
- */ - private static class HeaderRecordAndFileHeader { - private RecordType headerRecord; - private SAMFileHeader samFileHeader; - - public HeaderRecordAndFileHeader(RecordType headerRecord, SAMFileHeader samFileHeader) { - this.headerRecord = headerRecord; - this.samFileHeader = samFileHeader; - } - - public RecordType getHeaderRecord() { - return headerRecord; - } - public SAMFileHeader getFileHeader() { - return samFileHeader; - } - } -} diff --git a/public/java/src/net/sf/samtools/BAMFileReader.java b/public/java/src/net/sf/samtools/BAMFileReader.java deleted file mode 100644 index 5005b6265f..0000000000 --- a/public/java/src/net/sf/samtools/BAMFileReader.java +++ /dev/null @@ -1,762 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ -package net.sf.samtools; - - -import net.sf.samtools.util.*; -import net.sf.samtools.SAMFileReader.ValidationStringency; - -import java.io.*; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.NoSuchElementException; - -/** - * Internal class for reading and querying BAM files. - */ -class BAMFileReader extends SAMFileReader.ReaderImplementation { - // True if reading from a File rather than an InputStream - private boolean mIsSeekable = false; - - // For converting bytes into other primitive types - private BinaryCodec mStream = null; - - // Underlying compressed data stream. - private final BAMInputStream mInputStream; - private SAMFileHeader mFileHeader = null; - - // Populated if the file is seekable and an index exists - private File mIndexFile; - private BAMIndex mIndex = null; - private long mFirstRecordPointer = 0; - private CloseableIterator mCurrentIterator = null; - - // If true, all SAMRecords are fully decoded as they are read. - private final boolean eagerDecode; - - // For error-checking. - private ValidationStringency mValidationStringency; - - // For creating BAMRecords - private SAMRecordFactory samRecordFactory; - - /** - * Use the caching index reader implementation rather than the disk-hit-per-file model. - */ - private boolean mEnableIndexCaching = false; - - /** - * Use the traditional memory-mapped implementation for BAM file indexes rather than regular I/O. - */ - private boolean mEnableIndexMemoryMapping = true; - - /** - * Add information about the origin (reader and position) to SAM records. 
- */ - private SAMFileReader mFileReader = null; - - /** - * Prepare to read BAM from a stream (not seekable) - * @param stream source of bytes. - * @param eagerDecode if true, decode all BAM fields as reading rather than lazily. - * @param validationStringency Controls how to handle invalidate reads or header lines. - */ - BAMFileReader(final InputStream stream, - final File indexFile, - final boolean eagerDecode, - final ValidationStringency validationStringency, - final SAMRecordFactory factory) - throws IOException { - mIndexFile = indexFile; - mIsSeekable = false; - mInputStream = stream instanceof BAMInputStream ? (BAMInputStream)stream : new BlockCompressedInputStream(stream); - mStream = new BinaryCodec(new DataInputStream((InputStream)mInputStream)); - this.eagerDecode = eagerDecode; - this.mValidationStringency = validationStringency; - this.samRecordFactory = factory; - readHeader(null); - } - - /** - * Prepare to read BAM from a file (seekable) - * @param file source of bytes. - * @param eagerDecode if true, decode all BAM fields as reading rather than lazily. - * @param validationStringency Controls how to handle invalidate reads or header lines. - */ - BAMFileReader(final File file, - final File indexFile, - final boolean eagerDecode, - final ValidationStringency validationStringency, - final SAMRecordFactory factory) - throws IOException { - this(new BlockCompressedInputStream(file), indexFile!=null ? indexFile : findIndexFile(file), eagerDecode, file.getAbsolutePath(), validationStringency, factory); - if (mIndexFile != null && mIndexFile.lastModified() < file.lastModified()) { - System.err.println("WARNING: BAM index file " + mIndexFile.getAbsolutePath() + - " is older than BAM " + file.getAbsolutePath()); - } - } - - BAMFileReader(final SeekableStream strm, - final File indexFile, - final boolean eagerDecode, - final ValidationStringency validationStringency, - final SAMRecordFactory factory) - throws IOException { - this(strm instanceof BAMInputStream ? (BAMInputStream)strm : new BlockCompressedInputStream(strm), - indexFile, - eagerDecode, - strm.getSource(), - validationStringency, - factory); - } - - private BAMFileReader(final BAMInputStream inputStream, - final File indexFile, - final boolean eagerDecode, - final String source, - final ValidationStringency validationStringency, - final SAMRecordFactory factory) - throws IOException { - mIndexFile = indexFile; - mIsSeekable = true; - mInputStream = inputStream; - mStream = new BinaryCodec(new DataInputStream((InputStream)inputStream)); - this.eagerDecode = eagerDecode; - this.mValidationStringency = validationStringency; - this.samRecordFactory = factory; - readHeader(source); - mFirstRecordPointer = inputStream.getFilePointer(); - } - - /** - * If true, writes the source of every read into the source SAMRecords. - * @param enabled true to write source information into each SAMRecord. - */ - void enableFileSource(final SAMFileReader reader, final boolean enabled) { - this.mFileReader = enabled ? reader : null; - } - - /** - * If true, uses the caching version of the index reader. - * @param enabled true to write source information into each SAMRecord. - */ - public void enableIndexCaching(final boolean enabled) { - if(mIndex != null) - throw new SAMException("Unable to turn on index caching; index file has already been loaded."); - this.mEnableIndexCaching = enabled; - } - - /** - * If false, disable the use of memory mapping for accessing index files (default behavior is to use memory mapping). 
- * This is slower but more scalable when accessing large numbers of BAM files sequentially. - * @param enabled True to use memory mapping, false to use regular I/O. - */ - public void enableIndexMemoryMapping(final boolean enabled) { - if (mIndex != null) { - throw new SAMException("Unable to change index memory mapping; index file has already been loaded."); - } - this.mEnableIndexMemoryMapping = enabled; - } - - @Override void enableCrcChecking(final boolean enabled) { - this.mInputStream.setCheckCrcs(enabled); - } - - @Override void setSAMRecordFactory(final SAMRecordFactory factory) { this.samRecordFactory = factory; } - - /** - * @return true if ths is a BAM file, and has an index - */ - public boolean hasIndex() { - return (mIndexFile != null); - } - - /** - * Retrieves the index for the given file type. Ensure that the index is of the specified type. - * @return An index of the given type. - */ - public BAMIndex getIndex() { - if(mIndexFile == null) - throw new SAMException("No index is available for this BAM file."); - if(mIndex == null) - mIndex = mEnableIndexCaching ? new CachingBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping) - : new DiskBasedBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping); - return mIndex; - } - - void close() { - if (mStream != null) { - mStream.close(); - } - if (mIndex != null) { - mIndex.close(); - } - mStream = null; - mFileHeader = null; - mIndex = null; - } - - SAMFileHeader getFileHeader() { - return mFileHeader; - } - - /** - * Set error-checking level for subsequent SAMRecord reads. - */ - void setValidationStringency(final SAMFileReader.ValidationStringency validationStringency) { - this.mValidationStringency = validationStringency; - } - - SAMFileReader.ValidationStringency getValidationStringency() { - return this.mValidationStringency; - } - - /** - * Prepare to iterate through the SAMRecords in file order. - * Only a single iterator on a BAM file can be extant at a time. If getIterator() or a query method has been called once, - * that iterator must be closed before getIterator() can be called again. - * A somewhat peculiar aspect of this method is that if the file is not seekable, a second call to - * getIterator() begins its iteration where the last one left off. That is the best that can be - * done in that situation. - */ - CloseableIterator getIterator() { - if (mStream == null) { - throw new IllegalStateException("File reader is closed"); - } - if (mCurrentIterator != null) { - throw new IllegalStateException("Iteration in progress"); - } - if (mIsSeekable) { - try { - mInputStream.seek(mFirstRecordPointer); - } catch (IOException exc) { - throw new RuntimeException(exc.getMessage(), exc); - } - } - mCurrentIterator = new BAMFileIterator(); - return mCurrentIterator; - } - - @Override - CloseableIterator getIterator(final SAMFileSpan chunks) { - if (mStream == null) { - throw new IllegalStateException("File reader is closed"); - } - if (mCurrentIterator != null) { - throw new IllegalStateException("Iteration in progress"); - } - if (!(chunks instanceof BAMFileSpan)) { - throw new IllegalStateException("BAMFileReader cannot handle this type of file span."); - } - - // Create an iterator over the given chunk boundaries. - mCurrentIterator = new BAMFileIndexIterator(((BAMFileSpan)chunks).toCoordinateArray()); - return mCurrentIterator; - } - - /** - * Gets an unbounded pointer to the first record in the BAM file. 
-    /**
-     * Gets an unbounded pointer to the first record in the BAM file. Because the reader doesn't necessarily know
-     * when the file ends, the rightmost bound of the file pointer will not end exactly where the file ends. However,
-     * the rightmost bound is guaranteed to be after the last read in the file.
-     * @return An unbounded pointer to the first record in the BAM file.
-     */
-    @Override
-    SAMFileSpan getFilePointerSpanningReads() {
-        return new BAMFileSpan(new Chunk(mFirstRecordPointer,Long.MAX_VALUE));
-    }
-
-    /**
-     * Prepare to iterate through the SAMRecords that match the given interval.
-     * Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed
-     * before calling any of the methods that return an iterator.
-     *
-     * Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting
-     * purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate
-     * matches the specified interval.
-     *
-     * Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect
-     * resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval.
-     *
-     * @param sequence Reference sequence sought.
-     * @param start Desired SAMRecords must overlap or be contained in the interval specified by start and end.
-     *              A value of zero implies the start of the reference sequence.
-     * @param end A value of zero implies the end of the reference sequence.
-     * @param contained If true, the alignments for the SAMRecords must be completely contained in the interval
-     *                  specified by start and end. If false, the SAMRecords need only overlap the interval.
-     * @return Iterator for the matching SAMRecords.
-     */
-    CloseableIterator<SAMRecord> query(final String sequence, final int start, final int end, final boolean contained) {
-        if (mStream == null) {
-            throw new IllegalStateException("File reader is closed");
-        }
-        if (mCurrentIterator != null) {
-            throw new IllegalStateException("Iteration in progress");
-        }
-        if (!mIsSeekable) {
-            throw new UnsupportedOperationException("Cannot query stream-based BAM file");
-        }
-        mCurrentIterator = createIndexIterator(sequence, start, end, contained? QueryType.CONTAINED: QueryType.OVERLAPPING);
-        return mCurrentIterator;
-    }
-
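    // For illustration only (not part of the original class): the QueryType
    // flavors used by the query methods reduce to these interval tests
    // (1-based, inclusive coordinates), exactly as applied by
    // BAMQueryFilteringIterator.advance() later in this file.
    private static boolean matchesQuery(final QueryType queryType,
                                        final int alignmentStart, final int alignmentEnd,
                                        final int regionStart, final int regionEnd) {
        switch (queryType) {
            case CONTAINED:   return alignmentStart >= regionStart && alignmentEnd <= regionEnd;
            case OVERLAPPING: return alignmentEnd >= regionStart && alignmentStart <= regionEnd;
            case STARTING_AT: return alignmentStart == regionStart;
            default:          throw new IllegalArgumentException("Unknown query type: " + queryType);
        }
    }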
-    /**
-     * Prepare to iterate through the SAMRecords with the given alignment start.
-     * Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed
-     * before calling any of the methods that return an iterator.
-     *
-     * Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting
-     * purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate
-     * matches the specified interval.
-     *
-     * Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect
-     * resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval.
-     *
-     * @param sequence Reference sequence sought.
-     * @param start Alignment start sought.
-     * @return Iterator for the matching SAMRecords.
-     */
-    CloseableIterator<SAMRecord> queryAlignmentStart(final String sequence, final int start) {
-        if (mStream == null) {
-            throw new IllegalStateException("File reader is closed");
-        }
-        if (mCurrentIterator != null) {
-            throw new IllegalStateException("Iteration in progress");
-        }
-        if (!mIsSeekable) {
-            throw new UnsupportedOperationException("Cannot query stream-based BAM file");
-        }
-        mCurrentIterator = createIndexIterator(sequence, start, -1, QueryType.STARTING_AT);
-        return mCurrentIterator;
-    }
-
-    public CloseableIterator<SAMRecord> queryUnmapped() {
-        if (mStream == null) {
-            throw new IllegalStateException("File reader is closed");
-        }
-        if (mCurrentIterator != null) {
-            throw new IllegalStateException("Iteration in progress");
-        }
-        if (!mIsSeekable) {
-            throw new UnsupportedOperationException("Cannot query stream-based BAM file");
-        }
-        try {
-            final long startOfLastLinearBin = getIndex().getStartOfLastLinearBin();
-            if (startOfLastLinearBin != -1) {
-                mInputStream.seek(startOfLastLinearBin);
-            } else {
-                // No mapped reads in file, just start at the first read in file.
-                mInputStream.seek(mFirstRecordPointer);
-            }
-            mCurrentIterator = new BAMFileIndexUnmappedIterator();
-            return mCurrentIterator;
-        } catch (IOException e) {
-            throw new RuntimeException("IOException seeking to unmapped reads", e);
-        }
-    }
-
-    /**
-     * Reads the header from the file or stream
-     * @param source Note that this is used only for reporting errors.
-     */
-    private void readHeader(final String source)
-        throws IOException {
-
-        final byte[] buffer = new byte[4];
-        mStream.readBytes(buffer);
-        if (!Arrays.equals(buffer, BAMFileConstants.BAM_MAGIC)) {
-            throw new IOException("Invalid BAM file header");
-        }
-
-        final int headerTextLength = mStream.readInt();
-        final String textHeader = mStream.readString(headerTextLength);
-        final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec();
-        headerCodec.setValidationStringency(mValidationStringency);
-        mFileHeader = headerCodec.decode(new StringLineReader(textHeader),
-                                         source);
-
-        final int sequenceCount = mStream.readInt();
-        if (mFileHeader.getSequenceDictionary().size() > 0) {
-            // It is allowed to have binary sequences but no text sequences, so only validate if both are present
-            if (sequenceCount != mFileHeader.getSequenceDictionary().size()) {
-                throw new SAMFormatException("Number of sequences in text header (" +
-                        mFileHeader.getSequenceDictionary().size() +
-                        ") != number of sequences in binary header (" + sequenceCount + ") for file " + source);
-            }
-            for (int i = 0; i < sequenceCount; i++) {
-                final SAMSequenceRecord binarySequenceRecord = readSequenceRecord(source);
-                final SAMSequenceRecord sequenceRecord = mFileHeader.getSequence(i);
-                if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) {
-                    throw new SAMFormatException("For sequence " + i + ", text and binary have different names in file " +
-                            source);
-                }
-                if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) {
-                    throw new SAMFormatException("For sequence " + i + ", text and binary have different lengths in file " +
-                            source);
-                }
-            }
-        } else {
-            // If only binary sequences are present, copy them into mFileHeader
-            final List<SAMSequenceRecord> sequences = new ArrayList<SAMSequenceRecord>(sequenceCount);
-            for (int i = 0; i < sequenceCount; i++) {
-                sequences.add(readSequenceRecord(source));
-            }
-            mFileHeader.setSequenceDictionary(new SAMSequenceDictionary(sequences));
-        }
-    }
-
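    // For reference (added for exposition): the fixed little-endian layout that
    // readHeader() and readSequenceRecord() walk through is
    //
    //     magic     char[4]         "BAM\1"
    //     l_text    int32           length of the SAM-text header
    //     text      char[l_text]    plain-text header
    //     n_ref     int32           number of reference sequences, then per reference:
    //       l_name  int32           name length, including the null terminator
    //       name    char[l_name]    null-terminated reference name
    //       l_ref   int32           reference sequence length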
-    /**
-     * Reads a single binary sequence record from the file or stream
-     * @param source Note that this is used only for reporting errors.
-     */
-    private SAMSequenceRecord readSequenceRecord(final String source) {
-        final int nameLength = mStream.readInt();
-        if (nameLength <= 1) {
-            throw new SAMFormatException("Invalid BAM file header: missing sequence name in file " + source);
-        }
-        final String sequenceName = mStream.readString(nameLength - 1);
-        // Skip the null terminator
-        mStream.readByte();
-        final int sequenceLength = mStream.readInt();
-        return new SAMSequenceRecord(SAMSequenceRecord.truncateSequenceName(sequenceName), sequenceLength);
-    }
-
-    /**
-     * Iterator for non-indexed sequential iteration through all SAMRecords in file.
-     * Starting point of iteration is wherever current file position is when the iterator is constructed.
-     */
-    private class BAMFileIterator implements CloseableIterator<SAMRecord> {
-        private SAMRecord mNextRecord = null;
-        private final BAMRecordCodec bamRecordCodec;
-        private long samRecordIndex = 0; // Records what position (counted in records) we are at in the file
-
-        BAMFileIterator() {
-            this(true);
-        }
-
-        /**
-         * @param advance Trick to enable subclass to do more setup before advancing
-         */
-        BAMFileIterator(final boolean advance) {
-            this.bamRecordCodec = new BAMRecordCodec(getFileHeader(), samRecordFactory);
-            this.bamRecordCodec.setInputStream(BAMFileReader.this.mStream.getInputStream());
-
-            if (advance) {
-                advance();
-            }
-        }
-
-        public void close() {
-            if (mCurrentIterator != null && this != mCurrentIterator) {
-                throw new IllegalStateException("Attempt to close non-current iterator");
-            }
-            mCurrentIterator = null;
-        }
-
-        public boolean hasNext() {
-            return (mNextRecord != null);
-        }
-
-        public SAMRecord next() {
-            final SAMRecord result = mNextRecord;
-            advance();
-            return result;
-        }
-
-        public void remove() {
-            throw new UnsupportedOperationException("Not supported: remove");
-        }
-
-        void advance() {
-            try {
-                mNextRecord = getNextRecord();
-
-                if (mNextRecord != null) {
-                    ++this.samRecordIndex;
-                    // Because some decoding is done lazily, the record needs to remember the validation stringency.
-                    mNextRecord.setValidationStringency(mValidationStringency);
-
-                    if (mValidationStringency != ValidationStringency.SILENT) {
-                        final List<SAMValidationError> validationErrors = mNextRecord.isValid();
-                        SAMUtils.processValidationErrors(validationErrors,
-                                                         this.samRecordIndex, BAMFileReader.this.getValidationStringency());
-                    }
-                }
-                if (eagerDecode && mNextRecord != null) {
-                    mNextRecord.eagerDecode();
-                }
-            } catch (IOException exc) {
-                throw new RuntimeException(exc.getMessage(), exc);
-            }
-        }
-
-        /**
-         * Read the next record from the input stream.
-         */
-        SAMRecord getNextRecord() throws IOException {
-            final long startCoordinate = mInputStream.getFilePointer();
-            final SAMRecord next = bamRecordCodec.decode();
-            final long stopCoordinate = mInputStream.getFilePointer();
-
-            if(mFileReader != null && next != null)
-                next.setFileSource(new SAMFileSource(mFileReader,new BAMFileSpan(new Chunk(startCoordinate,stopCoordinate))));
-
-            return next;
-        }
-
-        /**
-         * @return The record that will be returned by the next call to next()
-         */
-        protected SAMRecord peek() {
-            return mNextRecord;
-        }
-    }
-
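    // Because getNextRecord() above brackets every decode with getFilePointer()
    // calls, each record can carry the exact BGZF chunk it was read from. A
    // sketch (not part of the original class) of how a caller might round-trip
    // that span, assuming file-source tracking was enabled via enableFileSource()
    // and the SAMFileSource accessors of this vintage:
    //
    //     final SAMFileSource source = read.getFileSource();
    //     final SAMFileSpan spanOfOneRecord = source.getFilePointer();
    //     // handing that span back to getIterator(SAMFileSpan) re-reads
    //     // exactly this one record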
-    /**
-     * Prepare to iterate through SAMRecords matching the target interval.
-     * @param sequence Desired reference sequence.
-     * @param start 1-based start of target interval, inclusive.
-     * @param end 1-based end of target interval, inclusive.
-     * @param queryType contained, overlapping, or starting-at query.
-     */
-    private CloseableIterator<SAMRecord> createIndexIterator(final String sequence,
-                                                             final int start,
-                                                             final int end,
-                                                             final QueryType queryType) {
-        long[] filePointers = null;
-
-        // Hit the index to determine the chunk boundaries for the required data.
-        final SAMFileHeader fileHeader = getFileHeader();
-        final int referenceIndex = fileHeader.getSequenceIndex(sequence);
-        if (referenceIndex != -1) {
-            final BAMIndex fileIndex = getIndex();
-            final BAMFileSpan fileSpan = fileIndex.getSpanOverlapping(referenceIndex, start, end);
-            filePointers = fileSpan != null ? fileSpan.toCoordinateArray() : null;
-        }
-
-        // Create an iterator over the above chunk boundaries.
-        final BAMFileIndexIterator iterator = new BAMFileIndexIterator(filePointers);
-
-        // Add some preprocessing filters for edge-case reads that don't fit into this
-        // query type.
-        return new BAMQueryFilteringIterator(iterator,sequence,start,end,queryType);
-    }
-
-    enum QueryType {CONTAINED, OVERLAPPING, STARTING_AT}
-
-    /**
-     * Look for BAM index file according to standard naming convention.
-     *
-     * @param dataFile BAM file name.
-     * @return Index file name, or null if not found.
-     */
-    private static File findIndexFile(final File dataFile) {
-        // If input is foo.bam, look for foo.bai
-        final String bamExtension = ".bam";
-        File indexFile;
-        final String fileName = dataFile.getName();
-        if (fileName.endsWith(bamExtension)) {
-            final String bai = fileName.substring(0, fileName.length() - bamExtension.length()) + BAMIndex.BAMIndexSuffix;
-            indexFile = new File(dataFile.getParent(), bai);
-            if (indexFile.exists()) {
-                return indexFile;
-            }
-        }
-
-        // If foo.bai doesn't exist look for foo.bam.bai
-        indexFile = new File(dataFile.getParent(), dataFile.getName() + ".bai");
-        if (indexFile.exists()) {
-            return indexFile;
-        } else {
-            return null;
-        }
-    }
-
-    private class BAMFileIndexIterator extends BAMFileIterator {
-
-        private long[] mFilePointers = null;
-        private int mFilePointerIndex = 0;
-        private long mFilePointerLimit = -1;
-
-        /**
-         * Prepare to iterate through SAMRecords stored in the specified compressed blocks at the given offset.
-         * @param filePointers the block / offset combination, stored in chunk format.
-         */
-        BAMFileIndexIterator(final long[] filePointers) {
-            super(false); // delay advance() until after construction
-            mFilePointers = filePointers;
-            advance();
-        }
-
-        SAMRecord getNextRecord()
-            throws IOException {
-            // Advance to next file block if necessary
-            while (mInputStream.getFilePointer() >= mFilePointerLimit) {
-                if (mFilePointers == null ||
-                        mFilePointerIndex >= mFilePointers.length) {
-                    return null;
-                }
-                final long startOffset = mFilePointers[mFilePointerIndex++];
-                final long endOffset = mFilePointers[mFilePointerIndex++];
-                mInputStream.seek(startOffset);
-                mFilePointerLimit = endOffset;
-            }
-            // Pull next record from stream
-            return super.getNextRecord();
-        }
-    }
-
-    /**
-     * A decorating iterator that filters out records that are outside the bounds of the
-     * given query parameters.
-     */
-    private class BAMQueryFilteringIterator implements CloseableIterator<SAMRecord> {
-        /**
-         * The wrapped iterator.
-         */
-        private final CloseableIterator<SAMRecord> wrappedIterator;
-
-        /**
-         * The next record to be returned. Will be null if no such record exists.
- */ - private SAMRecord mNextRecord; - - private final int mReferenceIndex; - private final int mRegionStart; - private final int mRegionEnd; - private final QueryType mQueryType; - - public BAMQueryFilteringIterator(final CloseableIterator iterator,final String sequence, final int start, final int end, final QueryType queryType) { - this.wrappedIterator = iterator; - final SAMFileHeader fileHeader = getFileHeader(); - mReferenceIndex = fileHeader.getSequenceIndex(sequence); - mRegionStart = start; - if (queryType == QueryType.STARTING_AT) { - mRegionEnd = mRegionStart; - } else { - mRegionEnd = (end <= 0) ? Integer.MAX_VALUE : end; - } - mQueryType = queryType; - mNextRecord = advance(); - } - - /** - * Returns true if a next element exists; false otherwise. - */ - public boolean hasNext() { - return mNextRecord != null; - } - - /** - * Gets the next record from the given iterator. - * @return The next SAM record in the iterator. - */ - public SAMRecord next() { - if(!hasNext()) - throw new NoSuchElementException("BAMQueryFilteringIterator: no next element available"); - final SAMRecord currentRead = mNextRecord; - mNextRecord = advance(); - return currentRead; - } - - /** - * Closes down the existing iterator. - */ - public void close() { - if (this != mCurrentIterator) { - throw new IllegalStateException("Attempt to close non-current iterator"); - } - mCurrentIterator = null; - } - - /** - * @throws UnsupportedOperationException always. - */ - public void remove() { - throw new UnsupportedOperationException("Not supported: remove"); - } - - SAMRecord advance() { - while (true) { - // Pull next record from stream - if(!wrappedIterator.hasNext()) - return null; - - final SAMRecord record = wrappedIterator.next(); - // If beyond the end of this reference sequence, end iteration - final int referenceIndex = record.getReferenceIndex(); - if (referenceIndex != mReferenceIndex) { - if (referenceIndex < 0 || - referenceIndex > mReferenceIndex) { - return null; - } - // If before this reference sequence, continue - continue; - } - if (mRegionStart == 0 && mRegionEnd == Integer.MAX_VALUE) { - // Quick exit to avoid expensive alignment end calculation - return record; - } - final int alignmentStart = record.getAlignmentStart(); - // If read is unmapped but has a coordinate, return it if the coordinate is within - // the query region, regardless of whether the mapped mate will be returned. - final int alignmentEnd; - if (mQueryType == QueryType.STARTING_AT) { - alignmentEnd = -1; - } else { - alignmentEnd = (record.getAlignmentEnd() != SAMRecord.NO_ALIGNMENT_START? 
- record.getAlignmentEnd(): alignmentStart); - } - - if (alignmentStart > mRegionEnd) { - // If scanned beyond target region, end iteration - return null; - } - // Filter for overlap with region - if (mQueryType == QueryType.CONTAINED) { - if (alignmentStart >= mRegionStart && alignmentEnd <= mRegionEnd) { - return record; - } - } else if (mQueryType == QueryType.OVERLAPPING) { - if (alignmentEnd >= mRegionStart && alignmentStart <= mRegionEnd) { - return record; - } - } else { - if (alignmentStart == mRegionStart) { - return record; - } - } - } - } - } - - private class BAMFileIndexUnmappedIterator extends BAMFileIterator { - private BAMFileIndexUnmappedIterator() { - while (this.hasNext() && peek().getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { - advance(); - } - } - } - -} diff --git a/public/java/src/net/sf/samtools/GATKChunk.java b/public/java/src/net/sf/samtools/GATKChunk.java index c48567f6e2..42f0871316 100644 --- a/public/java/src/net/sf/samtools/GATKChunk.java +++ b/public/java/src/net/sf/samtools/GATKChunk.java @@ -40,6 +40,10 @@ public GATKChunk(final long start, final long stop) { super(start,stop); } + public GATKChunk(final long blockStart, final int blockOffsetStart, final long blockEnd, final int blockOffsetEnd) { + super(blockStart << 16 | blockOffsetStart,blockEnd << 16 | blockOffsetEnd); + } + public GATKChunk(final Chunk chunk) { super(chunk.getChunkStart(),chunk.getChunkEnd()); } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/AutoIndexGatherFunction.scala b/public/java/src/net/sf/samtools/PicardNamespaceUtils.java similarity index 65% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/AutoIndexGatherFunction.scala rename to public/java/src/net/sf/samtools/PicardNamespaceUtils.java index 7fb96e0741..b645f8fdce 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/AutoIndexGatherFunction.scala +++ b/public/java/src/net/sf/samtools/PicardNamespaceUtils.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -22,15 +22,18 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.queue.extensions.gatk - -import org.broadinstitute.sting.queue.function.scattergather.GatherFunction -import org.broadinstitute.sting.queue.function.InProcessFunction +package net.sf.samtools; /** - * A no-op for index files that were automatically generated during the gather step. - * TODO: Allow graph to know that this isn't needed, and/or that one gather job can actually gather N-outputs, and/or look more into generic source->sinks. + * Utils that insist on being in the same package as Picard. */ -class AutoIndexGatherFunction extends InProcessFunction with GatherFunction { - def run() {} +public class PicardNamespaceUtils { + /** + * Private constructor only. Do not instantiate. 
+     */
+    private PicardNamespaceUtils() {}
+
+    public static void setFileSource(final SAMRecord read, final SAMFileSource fileSource) {
+        read.setFileSource(fileSource);
+    }
 }
diff --git a/public/java/src/net/sf/samtools/util/BAMInputStream.java b/public/java/src/net/sf/samtools/util/BAMInputStream.java
deleted file mode 100644
index d825c23d51..0000000000
--- a/public/java/src/net/sf/samtools/util/BAMInputStream.java
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2011, The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package net.sf.samtools.util;
-
-import java.io.IOException;
-
-/**
- * An input stream formulated for use reading BAM files. Supports seeking via BGZF virtual file pointers.
- */
-public interface BAMInputStream {
-    /**
-     * Seek to the given position in the file. Note that pos is a special virtual file pointer,
-     * not an actual byte offset.
-     *
-     * @param pos virtual file pointer
-     */
-    public void seek(final long pos) throws IOException;
-
-    /**
-     * @return virtual file pointer that can be passed to seek() to return to the current position. This is
-     * not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between
-     * the two.
-     */
-    public long getFilePointer();
-
-    /**
-     * Determines whether or not the inflater will re-calculate the CRC on the decompressed data
-     * and check it against the value stored in the GZIP header. CRC checking is an expensive
-     * operation and should be used accordingly.
- */ - public void setCheckCrcs(final boolean check); - - public int read() throws java.io.IOException; - - public int read(byte[] bytes) throws java.io.IOException; - - public int read(byte[] bytes, int i, int i1) throws java.io.IOException; - - public long skip(long l) throws java.io.IOException; - - public int available() throws java.io.IOException; - - public void close() throws java.io.IOException; - - public void mark(int i); - - public void reset() throws java.io.IOException; - - public boolean markSupported(); -} diff --git a/public/java/src/net/sf/samtools/util/BlockCompressedInputStream.java b/public/java/src/net/sf/samtools/util/BlockCompressedInputStream.java deleted file mode 100755 index fae2fc89b4..0000000000 --- a/public/java/src/net/sf/samtools/util/BlockCompressedInputStream.java +++ /dev/null @@ -1,483 +0,0 @@ -/* - * The MIT License - * - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ -package net.sf.samtools.util; - - -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.io.RandomAccessFile; -import java.net.URL; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.util.Arrays; - -import net.sf.samtools.FileTruncatedException; - -/* - * Utility class for reading BGZF block compressed files. The caller can treat this file like any other InputStream. - * It probably is not necessary to wrap this stream in a buffering stream, because there is internal buffering. - * The advantage of BGZF over conventional GZip format is that BGZF allows for seeking without having to read the - * entire file up to the location being sought. Note that seeking is only possible if the ctor(File) is used. - * - * c.f. http://samtools.sourceforge.net/SAM1.pdf for details of BGZF format - */ -public class BlockCompressedInputStream extends InputStream implements BAMInputStream { - private InputStream mStream = null; - private SeekableStream mFile = null; - private byte[] mFileBuffer = null; - private byte[] mCurrentBlock = null; - private int mCurrentOffset = 0; - private long mBlockAddress = 0; - private int mLastBlockLength = 0; - private final BlockGunzipper blockGunzipper = new BlockGunzipper(); - - - /** - * Note that seek() is not supported if this ctor is used. 
- */
-    public BlockCompressedInputStream(final InputStream stream) {
-        mStream = IOUtil.toBufferedStream(stream);
-        mFile = null;
-    }
-
-    /**
-     * Use this ctor if you wish to call seek()
-     */
-    public BlockCompressedInputStream(final File file)
-        throws IOException {
-        mFile = new SeekableFileStream(file);
-        mStream = null;
-
-    }
-
-    public BlockCompressedInputStream(final URL url) {
-        mFile = new SeekableBufferedStream(new SeekableHTTPStream(url));
-        mStream = null;
-    }
-
-    /**
-     * For providing some arbitrary data source. No additional buffering is
-     * provided, so if the underlying source is not buffered, wrap it in a
-     * SeekableBufferedStream before passing to this ctor.
-     */
-    public BlockCompressedInputStream(final SeekableStream strm) {
-        mFile = strm;
-        mStream = null;
-    }
-
-    /**
-     * Determines whether or not the inflater will re-calculate the CRC on the decompressed data
-     * and check it against the value stored in the GZIP header. CRC checking is an expensive
-     * operation and should be used accordingly.
-     */
-    public void setCheckCrcs(final boolean check) {
-        this.blockGunzipper.setCheckCrcs(check);
-    }
-
-    /**
-     * @return the number of bytes that can be read (or skipped over) from this input stream without blocking by the
-     * next caller of a method for this input stream. The next caller might be the same thread or another thread.
-     * Note that although the next caller can read this many bytes without blocking, the available() method call itself
-     * may block in order to fill an internal buffer if it has been exhausted.
-     */
-    public int available()
-        throws IOException {
-        if (mCurrentBlock == null || mCurrentOffset == mCurrentBlock.length) {
-            readBlock();
-        }
-        if (mCurrentBlock == null) {
-            return 0;
-        }
-        return mCurrentBlock.length - mCurrentOffset;
-    }
-
-    /**
-     * Closes the underlying InputStream or RandomAccessFile
-     */
-    public void close()
-        throws IOException {
-        if (mFile != null) {
-            mFile.close();
-            mFile = null;
-        } else if (mStream != null) {
-            mStream.close();
-            mStream = null;
-        }
-        // Encourage garbage collection
-        mFileBuffer = null;
-        mCurrentBlock = null;
-    }
-
-    /**
-     * Reads the next byte of data from the input stream. The value byte is returned as an int in the range 0 to 255.
-     * If no byte is available because the end of the stream has been reached, the value -1 is returned.
-     * This method blocks until input data is available, the end of the stream is detected, or an exception is thrown.
-     *
-     * @return the next byte of data, or -1 if the end of the stream is reached.
-     */
-    public int read()
-        throws IOException {
-        return (available() > 0) ? mCurrentBlock[mCurrentOffset++] : -1;
-    }
-
-    /**
-     * Reads some number of bytes from the input stream and stores them into the buffer array b. The number of bytes
-     * actually read is returned as an integer. This method blocks until input data is available, end of file is detected,
-     * or an exception is thrown.
-     *
-     * read(buf) has the same effect as read(buf, 0, buf.length).
-     *
-     * @param buffer the buffer into which the data is read.
-     * @return the total number of bytes read into the buffer, or -1 if there is no more data because the end of
-     * the stream has been reached.
-     */
-    public int read(final byte[] buffer)
-        throws IOException {
-        return read(buffer, 0, buffer.length);
-    }
-
-    private volatile ByteArrayOutputStream buf = null;
-    private static final byte eol = '\n';
-    private static final byte eolCr = '\r';
-
-    /**
-     * Reads a whole line.
A line is considered to be terminated by either a line feed ('\n'), - * carriage return ('\r') or carriage return followed by a line feed ("\r\n"). - * - * @return A String containing the contents of the line, excluding the line terminating - * character, or null if the end of the stream has been reached - * - * @exception IOException If an I/O error occurs - * @ - */ - public String readLine() throws IOException { - int available = available(); - if (available == 0) { - return null; - } - if(null == buf){ // lazy initialisation - buf = new ByteArrayOutputStream(8192); - } - buf.reset(); - boolean done = false; - boolean foundCr = false; // \r found flag - while (!done) { - int linetmpPos = mCurrentOffset; - int bCnt = 0; - while((available-- > 0)){ - final byte c = mCurrentBlock[linetmpPos++]; - if(c == eol){ // found \n - done = true; - break; - } else if(foundCr){ // previous char was \r - --linetmpPos; // current char is not \n so put it back - done = true; - break; - } else if(c == eolCr){ // found \r - foundCr = true; - continue; // no ++bCnt - } - ++bCnt; - } - if(mCurrentOffset < linetmpPos){ - buf.write(mCurrentBlock, mCurrentOffset, bCnt); - mCurrentOffset = linetmpPos; - } - available = available(); - if(available == 0){ - // EOF - done = true; - } - } - return buf.toString(); - } - - /** - * Reads up to len bytes of data from the input stream into an array of bytes. An attempt is made to read - * as many as len bytes, but a smaller number may be read. The number of bytes actually read is returned as an integer. - * - * This method blocks until input data is available, end of file is detected, or an exception is thrown. - * - * @param buffer buffer into which data is read. - * @param offset the start offset in array b at which the data is written. - * @param length the maximum number of bytes to read. - * @return the total number of bytes read into the buffer, or -1 if there is no more data because the end of - * the stream has been reached. - */ - public int read(final byte[] buffer, int offset, int length) - throws IOException { - final int originalLength = length; - while (length > 0) { - final int available = available(); - if (available == 0) { - // Signal EOF to caller - if (originalLength == length) { - return -1; - } - break; - } - final int copyLength = Math.min(length, available); - System.arraycopy(mCurrentBlock, mCurrentOffset, buffer, offset, copyLength); - mCurrentOffset += copyLength; - offset += copyLength; - length -= copyLength; - } - return originalLength - length; - } - - /** - * Seek to the given position in the file. Note that pos is a special virtual file pointer, - * not an actual byte offset. - * - * @param pos virtual file pointer - */ - public void seek(final long pos) - throws IOException { - if (mFile == null) { - throw new IOException("Cannot seek on stream based file"); - } - // Decode virtual file pointer - // Upper 48 bits is the byte offset into the compressed stream of a block. - // Lower 16 bits is the byte offset into the uncompressed stream inside the block. 
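        // For illustration: this packing is the same bit arithmetic that
        // GATKChunk's four-argument constructor (blockStart << 16 | blockOffsetStart)
        // builds elsewhere in this patch:
        //     pointer            = (blockAddress << 16) | uncompressedOffset;
        //     blockAddress       = pointer >>> 16;             // upper 48 bits
        //     uncompressedOffset = (int) (pointer & 0xFFFF);   // lower 16 bits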
- final long compressedOffset = BlockCompressedFilePointerUtil.getBlockAddress(pos); - final int uncompressedOffset = BlockCompressedFilePointerUtil.getBlockOffset(pos); - final int available; - if (mBlockAddress == compressedOffset && mCurrentBlock != null) { - available = mCurrentBlock.length; - } else { - mFile.seek(compressedOffset); - mBlockAddress = compressedOffset; - mLastBlockLength = 0; - readBlock(); - available = available(); - } - if (uncompressedOffset > available || - (uncompressedOffset == available && !eof())) { - throw new IOException("Invalid file pointer: " + pos); - } - mCurrentOffset = uncompressedOffset; - } - - private boolean eof() throws IOException { - if (mFile.eof()) { - return true; - } - // If the last remaining block is the size of the EMPTY_GZIP_BLOCK, this is the same as being at EOF. - return (mFile.length() - (mBlockAddress + mLastBlockLength) == BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length); - } - - /** - * @return virtual file pointer that can be passed to seek() to return to the current position. This is - * not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between - * the two. - */ - public long getFilePointer() { - if (mCurrentOffset == mCurrentBlock.length) { - // If current offset is at the end of the current block, file pointer should point - // to the beginning of the next block. - return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress + mLastBlockLength, 0); - } - return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress, mCurrentOffset); - } - - public static long getFileBlock(final long bgzfOffset) { - return BlockCompressedFilePointerUtil.getBlockAddress(bgzfOffset); - } - - /** - * @param stream Must be at start of file. Throws RuntimeException if !stream.markSupported(). - * @return true if the given file looks like a valid BGZF file. - */ - public static boolean isValidFile(final InputStream stream) - throws IOException { - if (!stream.markSupported()) { - throw new RuntimeException("Cannot test non-buffered stream"); - } - stream.mark(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); - final byte[] buffer = new byte[BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH]; - final int count = readBytes(stream, buffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); - stream.reset(); - return count == BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH && isValidBlockHeader(buffer); - } - - private static boolean isValidBlockHeader(final byte[] buffer) { - return (buffer[0] == BlockCompressedStreamConstants.GZIP_ID1 && - (buffer[1] & 0xFF) == BlockCompressedStreamConstants.GZIP_ID2 && - (buffer[3] & BlockCompressedStreamConstants.GZIP_FLG) != 0 && - buffer[10] == BlockCompressedStreamConstants.GZIP_XLEN && - buffer[12] == BlockCompressedStreamConstants.BGZF_ID1 && - buffer[13] == BlockCompressedStreamConstants.BGZF_ID2); - } - - private void readBlock() - throws IOException { - - if (mFileBuffer == null) { - mFileBuffer = new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE]; - } - int count = readBytes(mFileBuffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); - if (count == 0) { - // Handle case where there is no empty gzip block at end. 
- mCurrentOffset = 0; - mBlockAddress += mLastBlockLength; - mCurrentBlock = new byte[0]; - return; - } - if (count != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) { - throw new IOException("Premature end of file"); - } - final int blockLength = unpackInt16(mFileBuffer, BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET) + 1; - if (blockLength < BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH || blockLength > mFileBuffer.length) { - throw new IOException("Unexpected compressed block length: " + blockLength); - } - final int remaining = blockLength - BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH; - count = readBytes(mFileBuffer, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH, remaining); - if (count != remaining) { - throw new FileTruncatedException("Premature end of file"); - } - inflateBlock(mFileBuffer, blockLength); - mCurrentOffset = 0; - mBlockAddress += mLastBlockLength; - mLastBlockLength = blockLength; - } - - private void inflateBlock(final byte[] compressedBlock, final int compressedLength) - throws IOException { - final int uncompressedLength = unpackInt32(compressedBlock, compressedLength-4); - byte[] buffer = mCurrentBlock; - mCurrentBlock = null; - if (buffer == null || buffer.length != uncompressedLength) { - try { - buffer = new byte[uncompressedLength]; - } catch (NegativeArraySizeException e) { - throw new RuntimeException("BGZF file has invalid uncompressedLength: " + uncompressedLength, e); - } - } - blockGunzipper.unzipBlock(buffer, compressedBlock, compressedLength); - mCurrentBlock = buffer; - } - - private int readBytes(final byte[] buffer, final int offset, final int length) - throws IOException { - if (mFile != null) { - return readBytes(mFile, buffer, offset, length); - } else if (mStream != null) { - return readBytes(mStream, buffer, offset, length); - } else { - return 0; - } - } - - private static int readBytes(final SeekableStream file, final byte[] buffer, final int offset, final int length) - throws IOException { - int bytesRead = 0; - while (bytesRead < length) { - final int count = file.read(buffer, offset + bytesRead, length - bytesRead); - if (count <= 0) { - break; - } - bytesRead += count; - } - return bytesRead; - } - - private static int readBytes(final InputStream stream, final byte[] buffer, final int offset, final int length) - throws IOException { - int bytesRead = 0; - while (bytesRead < length) { - final int count = stream.read(buffer, offset + bytesRead, length - bytesRead); - if (count <= 0) { - break; - } - bytesRead += count; - } - return bytesRead; - } - - private int unpackInt16(final byte[] buffer, final int offset) { - return ((buffer[offset] & 0xFF) | - ((buffer[offset+1] & 0xFF) << 8)); - } - - private int unpackInt32(final byte[] buffer, final int offset) { - return ((buffer[offset] & 0xFF) | - ((buffer[offset+1] & 0xFF) << 8) | - ((buffer[offset+2] & 0xFF) << 16) | - ((buffer[offset+3] & 0xFF) << 24)); - } - - public enum FileTermination {HAS_TERMINATOR_BLOCK, HAS_HEALTHY_LAST_BLOCK, DEFECTIVE} - - public static FileTermination checkTermination(final File file) - throws IOException { - final long fileSize = file.length(); - if (fileSize < BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length) { - return FileTermination.DEFECTIVE; - } - final RandomAccessFile raFile = new RandomAccessFile(file, "r"); - try { - raFile.seek(fileSize - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length); - byte[] buf = new byte[BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length]; - raFile.readFully(buf); - if 
(Arrays.equals(buf, BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK)) { - return FileTermination.HAS_TERMINATOR_BLOCK; - } - final int bufsize = (int)Math.min(fileSize, BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE); - buf = new byte[bufsize]; - raFile.seek(fileSize - bufsize); - raFile.read(buf); - for (int i = buf.length - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length; - i >= 0; --i) { - if (!preambleEqual(BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE, - buf, i, BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length)) { - continue; - } - final ByteBuffer byteBuffer = ByteBuffer.wrap(buf, i + BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length, 4); - byteBuffer.order(ByteOrder.LITTLE_ENDIAN); - final int totalBlockSizeMinusOne = byteBuffer.getShort() & 0xFFFF; - if (buf.length - i == totalBlockSizeMinusOne + 1) { - return FileTermination.HAS_HEALTHY_LAST_BLOCK; - } else { - return FileTermination.DEFECTIVE; - } - } - return FileTermination.DEFECTIVE; - } finally { - raFile.close(); - } - } - - private static boolean preambleEqual(final byte[] preamble, final byte[] buf, final int startOffset, final int length) { - for (int i = 0; i < length; ++i) { - if (preamble[i] != buf[i + startOffset]) { - return false; - } - } - return true; - } -} - - diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/BWAConfiguration.java b/public/java/src/org/broadinstitute/sting/alignment/bwa/BWAConfiguration.java index 73441cb6a4..e453c7f8a9 100644 --- a/public/java/src/org/broadinstitute/sting/alignment/bwa/BWAConfiguration.java +++ b/public/java/src/org/broadinstitute/sting/alignment/bwa/BWAConfiguration.java @@ -41,4 +41,14 @@ public class BWAConfiguration { * What is the scoring penalty for a gap extension? */ public Integer gapExtensionPenalty = null; + + /** + * Enter bwa's 'non-stop' mode (equivalent to bwa aln -N parameter). + */ + public Boolean nonStopMode = false; + + /** + * Set the max queue size that bwa will use when searching for matches (equivalent to bwa aln -m parameter). 
+ */ + public Integer maxEntriesInQueue = null; } diff --git a/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java b/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java index a399867fa7..a999593413 100755 --- a/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java +++ b/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java @@ -139,11 +139,11 @@ public class AnalyzeCovariates extends CommandLineProgram { */ @Argument(fullName="max_histogram_value", shortName="maxHist", required = false, doc="If supplied, this value will be the max value of the histogram plots") private int MAX_HISTOGRAM_VALUE = 0; + @Hidden @Argument(fullName="do_indel_quality", shortName="indels", required = false, doc="If supplied, do indel quality plotting") private boolean DO_INDEL_QUALITY = false; - ///////////////////////////// // Private Member Variables ///////////////////////////// @@ -274,7 +274,6 @@ private void addCSVData(String line) { RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 ); // Add that datum to all the collapsed tables which will be used in the sequential calculation dataManager.addToAllTables( key, datum, IGNORE_QSCORES_LESS_THAN ); - } private void writeDataTables() { @@ -341,7 +340,7 @@ private void callRScripts() { // for each covariate for( int iii = 1; iii < requestedCovariates.size(); iii++ ) { - Covariate cov = requestedCovariates.get(iii); + final Covariate cov = requestedCovariates.get(iii); final File outputFile = new File(OUTPUT_DIR, readGroup + "." + cov.getClass().getSimpleName()+ ".dat"); if (DO_INDEL_QUALITY) { RScriptExecutor executor = new RScriptExecutor(); @@ -349,7 +348,7 @@ private void callRScripts() { // The second argument is the name of the covariate in order to make the plots look nice executor.addArgs(outputFile, cov.getClass().getSimpleName().split("Covariate")[0]); executor.exec(); - } else { + } else { if( iii == 1 ) { // Analyze reported quality RScriptExecutor executor = new RScriptExecutor(); diff --git a/public/java/src/org/broadinstitute/sting/commandline/Gather.java b/public/java/src/org/broadinstitute/sting/commandline/Gather.java index 59c3f50cbf..d452f708e0 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/Gather.java +++ b/public/java/src/org/broadinstitute/sting/commandline/Gather.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -34,5 +34,6 @@ @Retention(RetentionPolicy.RUNTIME) @Target({ElementType.FIELD}) public @interface Gather { - Class value(); + Class value() default Gather.class; + boolean enabled() default true; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java index 32002e0936..e5aaf23385 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java @@ -35,9 +35,12 @@ import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.classloader.JVMUtils; +import 
org.broadinstitute.sting.utils.crypt.CryptUtils; +import org.broadinstitute.sting.utils.crypt.GATKKey; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.text.ListFileUtils; +import java.security.PublicKey; import java.util.*; /** @@ -78,6 +81,9 @@ protected int execute() throws Exception { Walker walker = engine.getWalkerByName(getAnalysisName()); try { + // Make sure a valid GATK user key is present, if required. + authorizeGATKRun(); + engine.setArguments(getArgumentCollection()); // File lists can require a bit of additional expansion. Set these explicitly by the engine. @@ -130,6 +136,28 @@ protected int execute() throws Exception { return 0; } + /** + * Authorizes this run of the GATK by checking for a valid GATK user key, if required. + * Currently, a key is required only if running with the -et NO_ET or -et STDOUT options. + */ + private void authorizeGATKRun() { + if ( getArgumentCollection().phoneHomeType == GATKRunReport.PhoneHomeOption.NO_ET || + getArgumentCollection().phoneHomeType == GATKRunReport.PhoneHomeOption.STDOUT ) { + if ( getArgumentCollection().gatkKeyFile == null ) { + throw new UserException("Running with the -et NO_ET or -et STDOUT option requires a GATK Key file. " + + "Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home " + + "for more information and instructions on how to obtain a key."); + } + else { + PublicKey gatkPublicKey = CryptUtils.loadGATKDistributedPublicKey(); + GATKKey gatkUserKey = new GATKKey(gatkPublicKey, getArgumentCollection().gatkKeyFile); + + if ( ! gatkUserKey.isValid() ) { + throw new UserException.KeySignatureVerificationException(getArgumentCollection().gatkKeyFile); + } + } + } + } /** * Generate the GATK run report for this walker using the current GATKEngine, if -et is enabled. diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java index b4d337d8df..9c59ffe9a9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java +++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java @@ -25,6 +25,8 @@ package org.broadinstitute.sting.gatk; +import net.sf.picard.PicardException; +import net.sf.samtools.SAMException; import org.broad.tribble.TribbleException; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.ArgumentCollection; @@ -95,7 +97,11 @@ public static void main(String[] argv) { // We can generate Tribble Exceptions in weird places when e.g. VCF genotype fields are // lazy loaded, so they aren't caught elsewhere and made into User Exceptions exitSystemWithUserError(e); - } catch (net.sf.samtools.SAMException e) { + } catch(PicardException e) { + // TODO: Should Picard exceptions be, in general, UserExceptions or ReviewedStingExceptions? 
+            exitSystemWithError(e);
+        }
+        catch (SAMException e) {
             checkForTooManyOpenFilesProblem(e.getMessage());
             exitSystemWithSamError(e);
         } catch (Throwable t) {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
index f954d76501..50ef4653b9 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
@@ -53,6 +53,7 @@
 import org.broadinstitute.sting.utils.exceptions.UserException;
 import org.broadinstitute.sting.utils.interval.IntervalSetRule;
 import org.broadinstitute.sting.utils.interval.IntervalUtils;
+import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
 
 import java.io.File;
 import java.util.*;
@@ -179,10 +180,18 @@ public void setReferenceMetaDataFiles(Collection referenceMetaDataFi
      */
     private static final long GATK_RANDOM_SEED = 47382911L;
     private static Random randomGenerator = new Random(GATK_RANDOM_SEED);
-
     public static Random getRandomGenerator() { return randomGenerator; }
     public static void resetRandomGenerator() { randomGenerator.setSeed(GATK_RANDOM_SEED); }
     public static void resetRandomGenerator(long seed) { randomGenerator.setSeed(seed); }
+
+    /**
+     * Base Quality Score Recalibration helper object
+     */
+    private BaseRecalibration baseRecalibration = null;
+    public BaseRecalibration getBaseRecalibration() { return baseRecalibration; }
+    public boolean hasBaseRecalibration() { return baseRecalibration != null; }
+    public void setBaseRecalibration(File recalFile) { baseRecalibration = new BaseRecalibration(recalFile); }
+
     /**
      * Actually run the GATK with the specified walker.
     *
@@ -205,6 +214,10 @@ public Object execute() {
         if (this.getArguments().nonDeterministicRandomSeed)
             resetRandomGenerator(System.currentTimeMillis());
 
+        // if the user specified an input BQSR recalibration table then enable on the fly recalibration
+        if (this.getArguments().BQSR_RECAL_FILE != null)
+            setBaseRecalibration(this.getArguments().BQSR_RECAL_FILE);
+
         // Determine how the threads should be divided between CPU vs. IO.
         determineThreadAllocation();
@@ -224,7 +237,7 @@ public Object execute() {
         // create temp directories as necessary
         initializeTempDirectory();
 
-        // create the output streams "
+        // create the output streams
         initializeOutputStreams(microScheduler.getOutputTracker());
 
         Iterable<Shard> shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals);
@@ -450,7 +463,15 @@ protected Iterable getShardStrategy(SAMDataSource readsDataSource, Refere
                 return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer());
             else
                 return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer());
-        }
+        }
+        else if(walker instanceof ActiveRegionWalker) {
+            if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
+                throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Active region walkers can only traverse coordinate-sorted data.
Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); + if(intervals == null) + return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer()); + else + return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new LocusShardBalancer()); + } else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) { // Apply special validation to read pair walkers. if(walker instanceof ReadPairWalker) { @@ -749,6 +770,7 @@ private SAMDataSource createReadsDataSource(GATKArgumentCollection argCollection getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? argCollection.BAQMode : BAQ.CalculationMode.OFF, getWalkerBAQQualityMode(), refReader, + getBaseRecalibration(), argCollection.defaultBaseQualities); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java b/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java index daa8ff60db..db22886ce1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java +++ b/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.utils.baq.BAQ; +import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; import java.util.Collection; /** @@ -27,23 +28,20 @@ * information about how they should be downsampled, sorted, and filtered. */ public class ReadProperties { - private Collection readers = null; - private SAMFileHeader header = null; - private SAMFileReader.ValidationStringency validationStringency = SAMFileReader.ValidationStringency.STRICT; - private DownsamplingMethod downsamplingMethod = null; - private ValidationExclusion exclusionList = null; - private Collection supplementalFilters = null; - private boolean includeReadsWithDeletionAtLoci = false; - private boolean useOriginalBaseQualities = false; - private boolean generateExtendedEvents = false; - private BAQ.CalculationMode cmode = BAQ.CalculationMode.OFF; - private BAQ.QualityMode qmode = BAQ.QualityMode.DONT_MODIFY; - IndexedFastaSequenceFile refReader = null; // read for BAQ, if desired - private byte defaultBaseQualities; - - // do we want to generate additional piles of "extended" events (indels) -// immediately after the reference base such event is associated with? 
- + private final Collection readers; + private final SAMFileHeader header; + private final SAMFileReader.ValidationStringency validationStringency; + private final DownsamplingMethod downsamplingMethod; + private final ValidationExclusion exclusionList; + private final Collection supplementalFilters; + private final boolean includeReadsWithDeletionAtLoci; + private final boolean useOriginalBaseQualities; + private final boolean generateExtendedEvents; + private final BAQ.CalculationMode cmode; + private final BAQ.QualityMode qmode; + private final IndexedFastaSequenceFile refReader; // read for BAQ, if desired + private final BaseRecalibration bqsrApplier; + private final byte defaultBaseQualities; /** * Return true if the walker wants to see reads that contain deletions when looking at locus pileups @@ -126,6 +124,8 @@ public IndexedFastaSequenceFile getRefReader() { return refReader; } + public BaseRecalibration getBQSRApplier() { return bqsrApplier; } + /** * @return Default base quality value to fill reads missing base quality information. */ @@ -165,8 +165,9 @@ public ReadProperties( Collection samFiles, boolean includeReadsWithDeletionAtLoci, boolean generateExtendedEvents, BAQ.CalculationMode cmode, - BAQ.QualityMode qmode, + BAQ.QualityMode qmode, IndexedFastaSequenceFile refReader, + BaseRecalibration bqsrApplier, byte defaultBaseQualities) { this.readers = samFiles; this.header = header; @@ -180,6 +181,7 @@ public ReadProperties( Collection samFiles, this.cmode = cmode; this.qmode = qmode; this.refReader = refReader; + this.bqsrApplier = bqsrApplier; this.defaultBaseQualities = defaultBaseQualities; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 08d2c1ad15..02d211a0cb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -65,9 +65,12 @@ public GATKArgumentCollection() { @Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false) public Integer readBufferSize = null; - @Argument(fullName = "phone_home", shortName = "et", doc="What kind of GATK run report should we generate? Standard is the default, can be verbose or NO_ET so nothing is posted to the run repository", required = false) + @Argument(fullName = "phone_home", shortName = "et", doc="What kind of GATK run report should we generate? STANDARD is the default, can be NO_ET so nothing is posted to the run repository. Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home for details.", required = false) public GATKRunReport.PhoneHomeOption phoneHomeType = GATKRunReport.PhoneHomeOption.STANDARD; + @Argument(fullName = "gatk_key", shortName = "K", doc="GATK Key file. Required if running with -et NO_ET. Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home for details.", required = false) + public File gatkKeyFile = null; + @Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually", required = false) public List readFilters = new ArrayList(); @@ -75,6 +78,7 @@ public GATKArgumentCollection() { * Using this option one can instruct the GATK engine to traverse over only part of the genome. This argument can be specified multiple times. 
* One may use samtools-style intervals either explicitly (e.g. -L chr1 or -L chr1:100-200) or listed in a file (e.g. -L myFile.intervals). * Additionally, one may specify a rod file to traverse over the positions for which there is a record in the file (e.g. -L file.vcf). + * To specify the completely unmapped reads in the BAM file (i.e. those without a reference contig) use -L unmapped. */ @Input(fullName = "intervals", shortName = "L", doc = "One or more genomic intervals over which to operate. Can be explicitly specified on the command line or in a file (including a rod file)", required = false) public List> intervals = null; @@ -185,6 +189,15 @@ public static DownsamplingMethod getDefaultDownsamplingMethod() { @Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "If set, use the original base quality scores from the OQ tag when present instead of the standard scores", required=false) public Boolean useOriginalBaseQualities = false; + /** + * After the header, data records occur one per line until the end of the file. The first several items on a line are the + * values of the individual covariates and will change depending on which covariates were specified at runtime. The last + * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches, + * and the raw empirical quality score calculated by phred-scaling the mismatch rate. + */ + @Input(fullName="BQSR", shortName="BQSR", required=false, doc="Filename for the input covariates table recalibration .csv file which enables on the fly base quality score recalibration") + public File BQSR_RECAL_FILE = null; // BUGBUG: need a better argument name once we decide how BQSRs v1 and v2 will live in the code base simultaneously + @Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "If reads are missing some or all base quality scores, this value will be used for all base quality scores", required=false) public byte defaultBaseQualities = -1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java index a6731ee184..d1a2e7519b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java @@ -23,7 +23,7 @@ */ /** - * A LocusView over which the user can iterate. + * A LocusView over which the user can iterate. */ public class AllLocusView extends LocusView { @@ -47,12 +47,13 @@ public class AllLocusView extends LocusView { /** * Create a new queue of locus contexts. + * * @param provider */ - public AllLocusView(LocusShardDataProvider provider) { - super( provider ); + public AllLocusView(LocusShardDataProvider provider) { + super(provider); // Seed the state tracking members with the first possible seek position and the first possible locus context. - locusIterator = new GenomeLocusIterator(genomeLocParser,provider.getLocus()); + locusIterator = new GenomeLocusIterator(genomeLocParser, provider.getLocus()); } public boolean hasNext() { @@ -63,7 +64,7 @@ public boolean hasNext() { public AlignmentContext next() { advance(); - if(nextPosition == null) + if (nextPosition == null) throw new NoSuchElementException("No next is available in the all locus view"); // Flag to the iterator that no data is waiting in the queue to be processed. 
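When no reads cover the next position the view does not skip it: next() hands back an AlignmentContext built over an empty pileup, so locus walkers visit every position exactly once. A sketch of the fabricated context, mirroring createEmptyLocus() in the final hunk of this file (the pileup's element type is an assumption here):

    // given some uncovered position 'site' (a GenomeLoc); element type assumed
    final AlignmentContext empty = new AlignmentContext(site,
            new ReadBackedPileupImpl(site,
                                     Collections.<SAMRecord>emptyList(),    // no reads
                                     Collections.<Integer>emptyList()));    // no offsets
    // the context is empty, but the locus is still delivered to the walker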
@@ -72,7 +73,7 @@ public AlignmentContext next() { AlignmentContext currentLocus; // If actual data is present, return it. Otherwise, return empty data. - if( nextLocus != null && nextLocus.getLocation().equals(nextPosition) ) + if (nextLocus != null && nextLocus.getLocation().equals(nextPosition)) currentLocus = nextLocus; else currentLocus = createEmptyLocus(nextPosition); @@ -82,15 +83,15 @@ public AlignmentContext next() { private void advance() { // Already at the next element? Don't move forward. - if(atNextElement) + if (atNextElement) return; // Out of elements? - if(nextPosition == null && !locusIterator.hasNext()) - return; + if (nextPosition == null && !locusIterator.hasNext()) + return; // If nextLocus has been consumed, clear it out to make room for the next incoming locus. - if(nextPosition != null && nextLocus != null && !nextLocus.getLocation().isPast(nextPosition)) { + if (nextPosition != null && nextLocus != null && !nextLocus.getLocation().isPast(nextPosition)) { nextLocus = null; // Determine the next locus. The trick is that we may have more than one alignment context at the same @@ -98,9 +99,9 @@ private void advance() { // is still at the current position, we do not increment current position and wait for next call to next() to return // that context. If we know that next context is past the current position, we are done with current // position - if(hasNextLocus()) { + if (hasNextLocus()) { nextLocus = nextLocus(); - if(nextPosition.equals(nextLocus.getLocation())) { + if (nextPosition.equals(nextLocus.getLocation())) { atNextElement = true; return; } @@ -108,7 +109,7 @@ private void advance() { } // No elements left in queue? Clear out the position state tracker and return. - if(!locusIterator.hasNext()) { + if (!locusIterator.hasNext()) { nextPosition = null; return; } @@ -119,9 +120,9 @@ private void advance() { // Crank the iterator to (if possible) or past the next context. Be careful not to hold a reference to nextLocus // while using the hasNextLocus() / nextLocus() machinery; this will cause us to use more memory than is optimal. - while(nextLocus == null || nextLocus.getLocation().isBefore(nextPosition)) { + while (nextLocus == null || nextLocus.getLocation().isBefore(nextPosition)) { nextLocus = null; - if(!hasNextLocus()) + if (!hasNextLocus()) break; nextLocus = nextLocus(); } @@ -129,12 +130,15 @@ private void advance() { /** * Creates a blank locus context at the specified location. + * * @param site Site at which to create the blank locus context. * @return empty context. 
*/ private final static List EMPTY_PILEUP_READS = Collections.emptyList(); private final static List EMPTY_PILEUP_OFFSETS = Collections.emptyList(); - private AlignmentContext createEmptyLocus( GenomeLoc site ) { - return new AlignmentContext(site,new ReadBackedPileupImpl(site, EMPTY_PILEUP_READS, EMPTY_PILEUP_OFFSETS)); + private final static List EMPTY_DELETION_STATUS = Collections.emptyList(); + + private AlignmentContext createEmptyLocus(GenomeLoc site) { + return new AlignmentContext(site, new ReadBackedPileupImpl(site, EMPTY_PILEUP_READS, EMPTY_PILEUP_OFFSETS)); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java index f9ed0cb747..a3ce6dd278 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java @@ -25,9 +25,14 @@ */ /** - * A queue of locus context entries. + * The two goals of the LocusView are as follows: + * 1) To provide a 'trigger track' iteration interface so that TraverseLoci can easily switch + * between iterating over all bases in a region, only covered bases in a region covered by + * reads, only bases in a region covered by RODs, or any other sort of trigger track + * implementation one can think of. + * 2) To manage the copious number of iterators that have to be jointly pulled through the + * genome to make a locus traversal function. */ - public abstract class LocusView extends LocusIterator implements View { /** * The locus bounding this view. diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderPosition.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMAccessPlan.java similarity index 58% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderPosition.java rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMAccessPlan.java index 0a6173c1e4..1649713658 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderPosition.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMAccessPlan.java @@ -27,8 +27,10 @@ import net.sf.picard.util.PeekableIterator; import net.sf.samtools.GATKBAMFileSpan; import net.sf.samtools.GATKChunk; +import net.sf.samtools.util.BlockCompressedFilePointerUtil; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import java.util.LinkedList; import java.util.List; /** @@ -38,7 +40,7 @@ * Time: 10:47 PM * To change this template use File | Settings | File Templates. */ -class SAMReaderPosition { +class BAMAccessPlan { private final SAMReaderID reader; private final BlockInputStream inputStream; @@ -51,7 +53,7 @@ class SAMReaderPosition { private long nextBlockAddress; - SAMReaderPosition(final SAMReaderID reader, final BlockInputStream inputStream, GATKBAMFileSpan fileSpan) { + BAMAccessPlan(final SAMReaderID reader, final BlockInputStream inputStream, GATKBAMFileSpan fileSpan) { this.reader = reader; this.inputStream = inputStream; @@ -84,11 +86,45 @@ public int getFirstOffsetInBlock() { } /** - * Retrieves the last offset of interest in the block returned by getBlockAddress(). - * @return First block of interest in this segment. + * Gets the spans overlapping the given block; used to copy the contents of the block into the circular buffer. + * @param blockAddress Block address for which to search. 
+ * @param filePosition Block address at which to terminate the last chunk if the last chunk goes beyond this span. + * @return list of chunks containing that block. */ - public int getLastOffsetInBlock() { - return (nextBlockAddress == positionIterator.peek().getBlockEnd()) ? positionIterator.peek().getBlockOffsetEnd() : 65536; + public List getSpansOverlappingBlock(long blockAddress, long filePosition) { + List spansOverlapping = new LinkedList(); + // While the position iterator overlaps the given block, pull out spans to report. + while(positionIterator.hasNext() && positionIterator.peek().getBlockStart() <= blockAddress) { + // Create a span over as much of the block as is covered by this chunk. + int blockOffsetStart = (blockAddress == positionIterator.peek().getBlockStart()) ? positionIterator.peek().getBlockOffsetStart() : 0; + + // Calculate the end of this span. If the span extends past this block, cap it using the current file position. + long blockEnd; + int blockOffsetEnd; + if(blockAddress < positionIterator.peek().getBlockEnd()) { + blockEnd = filePosition; + blockOffsetEnd = 0; + } + else { + blockEnd = positionIterator.peek().getBlockEnd(); + blockOffsetEnd = positionIterator.peek().getBlockOffsetEnd(); + } + + GATKChunk newChunk = new GATKChunk(blockAddress,blockOffsetStart,blockEnd,blockOffsetEnd); + + if(newChunk.getChunkStart() <= newChunk.getChunkEnd()) + spansOverlapping.add(new GATKChunk(blockAddress,blockOffsetStart,blockEnd,blockOffsetEnd)); + + // If the value currently stored in the position iterator ends past the current block, we must be done. Abort. + if(!positionIterator.hasNext() || positionIterator.peek().getBlockEnd() > blockAddress) + break; + + // If the position iterator ends before the block ends, pull the position iterator forward. + if(positionIterator.peek().getBlockEnd() <= blockAddress) + positionIterator.next(); + } + + return spansOverlapping; } public void reset() { @@ -111,20 +147,16 @@ private void initialize() { * @param filePosition The current position within the file. */ void advancePosition(final long filePosition) { - nextBlockAddress = filePosition >> 16; + nextBlockAddress = BlockCompressedFilePointerUtil.getBlockAddress(filePosition); // Check the current file position against the iterator; if the iterator is before the current file position, // draw the iterator forward. Remember when performing the check that coordinates are half-open! - while(positionIterator.hasNext() && isFilePositionPastEndOfChunk(filePosition,positionIterator.peek())) { + while(positionIterator.hasNext() && isFilePositionPastEndOfChunk(filePosition,positionIterator.peek())) positionIterator.next(); - // If the block iterator has shot past the file pointer, bring the file pointer flush with the start of the current block. - if(positionIterator.hasNext() && filePosition < positionIterator.peek().getChunkStart()) { - nextBlockAddress = positionIterator.peek().getBlockStart(); - //System.out.printf("SAMReaderPosition: next block address advanced to %d%n",nextBlockAddress); - break; - } - } + // If the block iterator has shot past the file pointer, bring the file pointer flush with the start of the current block. + if(positionIterator.hasNext() && filePosition < positionIterator.peek().getChunkStart()) + nextBlockAddress = positionIterator.peek().getBlockStart(); // If we've shot off the end of the block pointer, notify consumers that iteration is complete. 
if(!positionIterator.hasNext()) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java index 657c70aaa3..1d8879d512 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java @@ -407,7 +407,14 @@ public BAMScheduleEntry next() { position(currentPosition); // Read data. - read(binHeader); + int binHeaderBytesRead = read(binHeader); + + // Make sure we read in a complete bin header: + if ( binHeaderBytesRead < INT_SIZE_IN_BYTES * 3 ) { + throw new ReviewedStingException(String.format("Unable to read a complete bin header from BAM schedule file %s for BAM file %s. " + + "The BAM schedule file is likely incomplete/corrupt.", + scheduleFile.getAbsolutePath(), reader.getSamFilePath())); + } // Decode contents. binHeader.flip(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java index bcb726607f..fdc3d2aa73 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java @@ -34,6 +34,8 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.ReadUtils; import java.util.*; @@ -245,7 +247,14 @@ private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final GenomeLoc curr // This will ensure that if the two sets of contigs don't quite match (b36 male vs female ref, hg19 Epstein-Barr), then // we'll be using the correct contig index for the BAMs. // TODO: Warning: assumes all BAMs use the same sequence dictionary! Get around this with contig aliasing. - final int currentContigIndex = dataSource.getHeader().getSequence(currentLocus.getContig()).getSequenceIndex(); + SAMSequenceRecord currentContigSequenceRecord = dataSource.getHeader().getSequence(currentLocus.getContig()); + if ( currentContigSequenceRecord == null ) { + throw new UserException(String.format("Contig %s not present in sequence dictionary for merged BAM header: %s", + currentLocus.getContig(), + ReadUtils.prettyPrintSequenceRecords(dataSource.getHeader().getSequenceDictionary()))); + } + + final int currentContigIndex = currentContigSequenceRecord.getSequenceIndex(); // Stale reference sequence or first invocation. (Re)create the binTreeIterator. 
 if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentContigIndex) {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java
index f468d20204..d75e91bf3a 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java
@@ -44,12 +44,12 @@ public class BGZFBlockLoadingDispatcher {
 private final ExecutorService threadPool;
- private final Queue<SAMReaderPosition> inputQueue;
+ private final Queue<BAMAccessPlan> inputQueue;
 public BGZFBlockLoadingDispatcher(final int numThreads, final int numFileHandles) {
 threadPool = Executors.newFixedThreadPool(numThreads);
 fileHandleCache = new FileHandleCache(numFileHandles);
- inputQueue = new LinkedList<SAMReaderPosition>();
+ inputQueue = new LinkedList<BAMAccessPlan>();
 threadPool.execute(new BlockLoader(this,fileHandleCache,true));
 }
@@ -58,7 +58,7 @@ public BGZFBlockLoadingDispatcher(final int numThreads, final int numFileHandles
 * Initiates a request for a new block load.
 * @param readerPosition Position at which to load.
 */
- void queueBlockLoad(final SAMReaderPosition readerPosition) {
+ void queueBlockLoad(final BAMAccessPlan readerPosition) {
 synchronized(inputQueue) {
 inputQueue.add(readerPosition);
 inputQueue.notify();
@@ -69,7 +69,7 @@ void queueBlockLoad(final SAMReaderPosition readerPosition) {
 * Claims the next work request from the queue.
 * @return The next work request, or null if none is available.
 */
- SAMReaderPosition claimNextWorkRequest() {
+ BAMAccessPlan claimNextWorkRequest() {
 synchronized(inputQueue) {
 while(inputQueue.isEmpty()) {
 try {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java
index cb37bad312..fda5d818c6 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java
@@ -26,24 +26,21 @@
 import net.sf.samtools.GATKBAMFileSpan;
 import net.sf.samtools.GATKChunk;
-import net.sf.samtools.util.BAMInputStream;
-import net.sf.samtools.util.BlockCompressedFilePointerUtil;
 import net.sf.samtools.util.BlockCompressedInputStream;
-import net.sf.samtools.util.RuntimeEOFException;
-import net.sf.samtools.util.SeekableStream;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
 import java.io.IOException;
+import java.io.InputStream;
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
 import java.util.Arrays;
-import java.util.Iterator;
 import java.util.LinkedList;
+import java.util.List;
/**
 * Presents decompressed blocks to the SAMFileReader.
 */
-public class BlockInputStream extends SeekableStream implements BAMInputStream {
+public class BlockInputStream extends InputStream {
 /**
 * Mechanism for triggering block loads.
 */
@@ -65,9 +62,9 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
 private Throwable error;
 /**
- * Current position.
+ * Current access plan.
 */
- private SAMReaderPosition position;
+ private BAMAccessPlan accessPlan;
 /**
 * A stream of compressed data blocks.
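Throughout these BlockInputStream and BAMAccessPlan changes, stream positions are BAM "virtual file offsets": the physical address of a BGZF block on disk occupies the high bits, and an offset into that block's uncompressed data occupies the low 16 bits. A worked sketch of the packing arithmetic that this patch later centralizes in BlockInputStream.makeFilePointer(); the literal values are illustrative only:

    // Pack a BGZF block address and an intra-block offset into one 64-bit virtual pointer.
    final long blockAddress = 1048576L;  // illustrative physical offset of a BGZF block on disk
    final int blockOffset = 100;         // offset into the block's uncompressed data (0..65535)
    final long virtualPointer = blockAddress << 16 | blockOffset;

    // Unpacking reverses the shift and mask; BlockCompressedFilePointerUtil provides equivalent helpers.
    final long unpackedAddress = virtualPointer >>> 16;          // == blockAddress
    final int unpackedOffset = (int) (virtualPointer & 0xFFFF);  // == blockOffset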
@@ -94,11 +91,6 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream { */ private final BlockCompressedInputStream validatingInputStream; - /** - * Has the buffer been filled since last request? - */ - private boolean bufferFilled = false; - /** * Create a new block presenting input stream with a dedicated buffer. * @param dispatcher the block loading messenger. @@ -118,7 +110,7 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream { this.dispatcher = dispatcher; // TODO: Kill the region when all we want to do is start at the beginning of the stream and run to the end of the stream. - this.position = new SAMReaderPosition(reader,this,new GATKBAMFileSpan(new GATKChunk(0,Long.MAX_VALUE))); + this.accessPlan = new BAMAccessPlan(reader,this,new GATKBAMFileSpan(new GATKChunk(0,Long.MAX_VALUE))); // The block offsets / block positions guarantee that the ending offset/position in the data structure maps to // the point in the file just following the last read. These two arrays should never be empty; initializing @@ -151,7 +143,7 @@ public long getFilePointer() { synchronized(lock) { // Find the current block within the input stream. int blockIndex; - for(blockIndex = 0; blockIndex+1 < blockOffsets.size() && buffer.position() >= blockOffsets.get(blockIndex + 1); blockIndex++) + for(blockIndex = 0; blockIndex+1 < blockOffsets.size() && buffer.position() > blockOffsets.get(blockIndex+1); blockIndex++) ; filePointer = blockPositions.get(blockIndex) + (buffer.position()-blockOffsets.get(blockIndex)); } @@ -164,51 +156,8 @@ public long getFilePointer() { return filePointer; } - public void seek(long target) { - //System.out.printf("Thread %s, BlockInputStream %s: seeking to block %d, offset %d%n",Thread.currentThread().getId(),this,BlockCompressedFilePointerUtil.getBlockAddress(target),BlockCompressedFilePointerUtil.getBlockOffset(target)); - synchronized(lock) { - clearBuffers(); - - // Ensure that the position filled in by submitAccessPlan() is in sync with the seek target just specified. - position.advancePosition(target); - - // If the position advances past the end of the target, that must mean that we seeked to a point at the end - // of one of the chunk list's subregions. Make a note of our current position and punt on loading any data. - if(target < position.getBlockAddress() << 16) { - blockOffsets.clear(); - blockOffsets.add(0); - blockPositions.clear(); - blockPositions.add(target); - } - else { - waitForBufferFill(); - // A buffer fill will load the relevant data from the shard, but the buffer position still needs to be - // advanced as appropriate. 
- Iterator blockOffsetIterator = blockOffsets.descendingIterator(); - Iterator blockPositionIterator = blockPositions.descendingIterator(); - while(blockOffsetIterator.hasNext() && blockPositionIterator.hasNext()) { - final int blockOffset = blockOffsetIterator.next(); - final long blockPosition = blockPositionIterator.next(); - if((blockPosition >> 16) == (target >> 16) && (blockPosition&0xFFFF) < (target&0xFFFF)) { - buffer.position(blockOffset + (int)(target&0xFFFF)-(int)(blockPosition&0xFFFF)); - break; - } - } - } - - if(validatingInputStream != null) { - try { - validatingInputStream.seek(target); - } - catch(IOException ex) { - throw new ReviewedStingException("Unable to validate against Picard input stream",ex); - } - } - } - } - private void clearBuffers() { - this.position.reset(); + this.accessPlan.reset(); // Buffer semantics say that outside of a lock, buffer should always be prepared for reading. // Indicate no data to be read. @@ -225,29 +174,41 @@ private void clearBuffers() { public boolean eof() { synchronized(lock) { // TODO: Handle multiple empty BGZF blocks at end of the file. - return position != null && (position.getBlockAddress() < 0 || position.getBlockAddress() >= length); + return accessPlan != null && (accessPlan.getBlockAddress() < 0 || accessPlan.getBlockAddress() >= length); } } - public void setCheckCrcs(final boolean check) { - // TODO: Implement - } - /** - * Submits a new access plan for the given dataset. - * @param position The next seek point for BAM data in this reader. + * Submits a new access plan for the given dataset and seeks to the given point. + * @param accessPlan The next seek point for BAM data in this reader. */ - public void submitAccessPlan(final SAMReaderPosition position) { + public void submitAccessPlan(final BAMAccessPlan accessPlan) { //System.out.printf("Thread %s: submitting access plan for block at position: %d%n",Thread.currentThread().getId(),position.getBlockAddress()); - synchronized(lock) { - // Assume that the access plan is going to tell us to start where we are and move forward. - // If this isn't the case, we'll soon receive a seek request and the buffer will be forced to reset. - if(this.position != null && position.getBlockAddress() < this.position.getBlockAddress()) - position.advancePosition(this.position.getBlockAddress() << 16); + this.accessPlan = accessPlan; + accessPlan.reset(); + + clearBuffers(); + + // Pull the iterator past any oddball chunks at the beginning of the shard (chunkEnd < chunkStart, empty chunks, etc). + // TODO: Don't pass these empty chunks in. + accessPlan.advancePosition(makeFilePointer(accessPlan.getBlockAddress(),0)); + + if(accessPlan.getBlockAddress() >= 0) { + waitForBufferFill(); + } + + if(validatingInputStream != null) { + try { + validatingInputStream.seek(makeFilePointer(accessPlan.getBlockAddress(),0)); + } + catch(IOException ex) { + throw new ReviewedStingException("Unable to validate against Picard input stream",ex); + } } - this.position = position; + } + private void compactBuffer() { // Compact buffer to maximize storage space. int bytesToRemove = 0; @@ -286,27 +247,14 @@ private void compactBuffer() { * Push contents of incomingBuffer into the end of this buffer. * MUST be called from a thread that is NOT the reader thread. * @param incomingBuffer The data being pushed into this input stream. - * @param position target position for the data. + * @param accessPlan target access plan for the data. 
* @param filePosition the current position of the file pointer */ - public void copyIntoBuffer(final ByteBuffer incomingBuffer, final SAMReaderPosition position, final long filePosition) { + public void copyIntoBuffer(final ByteBuffer incomingBuffer, final BAMAccessPlan accessPlan, final long filePosition) { synchronized(lock) { try { - compactBuffer(); - // Open up the buffer for more reading. - buffer.limit(buffer.capacity()); - - // Advance the position to take the most recent read into account. - final long lastBlockAddress = position.getBlockAddress(); - final int blockOffsetStart = position.getFirstOffsetInBlock(); - final int blockOffsetEnd = position.getLastOffsetInBlock(); - - // Where did this read end? It either ended in the middle of a block (for a bounding chunk) or it ended at the start of the next block. - final long endOfRead = (blockOffsetEnd < incomingBuffer.remaining()) ? (lastBlockAddress << 16) | blockOffsetEnd : filePosition << 16; - - byte[] validBytes = null; if(validatingInputStream != null) { - validBytes = new byte[incomingBuffer.remaining()]; + byte[] validBytes = new byte[incomingBuffer.remaining()]; byte[] currentBytes = new byte[incomingBuffer.remaining()]; int pos = incomingBuffer.position(); @@ -317,7 +265,7 @@ public void copyIntoBuffer(final ByteBuffer incomingBuffer, final SAMReaderPosit incomingBuffer.position(pos); long currentFilePointer = validatingInputStream.getFilePointer(); - validatingInputStream.seek(lastBlockAddress << 16); + validatingInputStream.seek(makeFilePointer(accessPlan.getBlockAddress(), 0)); validatingInputStream.read(validBytes); validatingInputStream.seek(currentFilePointer); @@ -325,33 +273,41 @@ public void copyIntoBuffer(final ByteBuffer incomingBuffer, final SAMReaderPosit throw new ReviewedStingException(String.format("Bytes being inserted into BlockInputStream %s are incorrect",this)); } - this.position = position; - position.advancePosition(filePosition << 16); + compactBuffer(); + // Open up the buffer for more reading. + buffer.limit(buffer.capacity()); + + // Get the spans overlapping this particular block... + List spansOverlapping = accessPlan.getSpansOverlappingBlock(accessPlan.getBlockAddress(),filePosition); + + // ...and advance the block + this.accessPlan = accessPlan; + accessPlan.advancePosition(makeFilePointer(filePosition, 0)); - if(buffer.remaining() < incomingBuffer.remaining()) { - //System.out.printf("Thread %s: waiting for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n",Thread.currentThread().getId(),buffer.remaining(),incomingBuffer.remaining()); + if(buffer.remaining() < incomingBuffer.remaining()) lock.wait(); - //System.out.printf("Thread %s: waited for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n", Thread.currentThread().getId(), buffer.remaining(), incomingBuffer.remaining()); - } - // Remove the last position in the list and add in the last read position, in case the two are different. - blockOffsets.removeLast(); - blockOffsets.add(buffer.position()); - blockPositions.removeLast(); - blockPositions.add(lastBlockAddress << 16 | blockOffsetStart); + final int bytesInIncomingBuffer = incomingBuffer.limit(); + + for(GATKChunk spanOverlapping: spansOverlapping) { + // Clear out the endcap tracking state and add in the starting position for this transfer. 
+ blockOffsets.removeLast(); + blockOffsets.add(buffer.position()); + blockPositions.removeLast(); + blockPositions.add(spanOverlapping.getChunkStart()); - // Stream the buffer into the data stream. - incomingBuffer.position(blockOffsetStart); - incomingBuffer.limit(Math.min(incomingBuffer.limit(),blockOffsetEnd)); - buffer.put(incomingBuffer); + // Stream the buffer into the data stream. + incomingBuffer.limit((spanOverlapping.getBlockEnd() > spanOverlapping.getBlockStart()) ? bytesInIncomingBuffer : spanOverlapping.getBlockOffsetEnd()); + incomingBuffer.position(spanOverlapping.getBlockOffsetStart()); + buffer.put(incomingBuffer); - // Then, add the last position read to the very end of the list, just past the end of the last buffer. - blockOffsets.add(buffer.position()); - blockPositions.add(endOfRead); + // Add the endcap for this transfer. + blockOffsets.add(buffer.position()); + blockPositions.add(spanOverlapping.getChunkEnd()); + } // Set up the buffer for reading. buffer.flip(); - bufferFilled = true; lock.notify(); } @@ -447,12 +403,8 @@ public int read(byte[] bytes, final int offset, final int length) { if(remaining < length) return length - remaining; - // Otherwise, if at eof(), return -1. - else if(eof()) - return -1; - - // Otherwise, we must've hit a bug in the system. - throw new ReviewedStingException("BUG: read returned no data, but eof() reports false."); + // Otherwise, return -1. + return -1; } public void close() { @@ -472,20 +424,26 @@ public String getSource() { private void waitForBufferFill() { synchronized(lock) { - bufferFilled = false; if(buffer.remaining() == 0 && !eof()) { //System.out.printf("Thread %s is waiting for a buffer fill from position %d to buffer %s%n",Thread.currentThread().getId(),position.getBlockAddress(),this); - dispatcher.queueBlockLoad(position); + dispatcher.queueBlockLoad(accessPlan); try { lock.wait(); } catch(InterruptedException ex) { throw new ReviewedStingException("Interrupt occurred waiting for buffer to fill",ex); } - - if(bufferFilled && buffer.remaining() == 0) - throw new RuntimeEOFException("No more data left in InputStream"); } } } + + /** + * Create an encoded BAM file pointer given the address of a BGZF block and an offset. + * @param blockAddress Physical address on disk of a BGZF block. + * @param blockOffset Offset into the uncompressed data stored in the BGZF block. + * @return 64-bit pointer encoded according to the BAM spec. 
+ */ + public static long makeFilePointer(final long blockAddress, final int blockOffset) { + return blockAddress << 16 | blockOffset; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java index ab42998026..81a37e53ca 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -70,29 +70,29 @@ public BlockLoader(final BGZFBlockLoadingDispatcher dispatcher, final FileHandle public void run() { for(;;) { - SAMReaderPosition readerPosition = null; + BAMAccessPlan accessPlan = null; try { - readerPosition = dispatcher.claimNextWorkRequest(); - FileInputStream inputStream = fileHandleCache.claimFileInputStream(readerPosition.getReader()); + accessPlan = dispatcher.claimNextWorkRequest(); + FileInputStream inputStream = fileHandleCache.claimFileInputStream(accessPlan.getReader()); - long blockAddress = readerPosition.getBlockAddress(); + //long blockAddress = readerPosition.getBlockAddress(); //System.out.printf("Thread %s: BlockLoader: copying bytes from %s at position %d into %s%n",Thread.currentThread().getId(),inputStream,blockAddress,readerPosition.getInputStream()); - ByteBuffer compressedBlock = readBGZFBlock(inputStream,readerPosition.getBlockAddress()); + ByteBuffer compressedBlock = readBGZFBlock(inputStream,accessPlan.getBlockAddress()); long nextBlockAddress = position(inputStream); - fileHandleCache.releaseFileInputStream(readerPosition.getReader(),inputStream); + fileHandleCache.releaseFileInputStream(accessPlan.getReader(),inputStream); ByteBuffer block = decompress ? 
decompressBGZFBlock(compressedBlock) : compressedBlock; int bytesCopied = block.remaining(); - BlockInputStream bamInputStream = readerPosition.getInputStream(); - bamInputStream.copyIntoBuffer(block,readerPosition,nextBlockAddress); + BlockInputStream bamInputStream = accessPlan.getInputStream(); + bamInputStream.copyIntoBuffer(block,accessPlan,nextBlockAddress); //System.out.printf("Thread %s: BlockLoader: copied %d bytes from %s at position %d into %s%n",Thread.currentThread().getId(),bytesCopied,inputStream,blockAddress,readerPosition.getInputStream()); } catch(Throwable error) { - if(readerPosition != null && readerPosition.getInputStream() != null) - readerPosition.getInputStream().reportException(error); + if(accessPlan != null && accessPlan.getInputStream() != null) + accessPlan.getInputStream().reportException(error); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java index 244438a593..2bf75b0357 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java @@ -25,6 +25,7 @@ import net.sf.samtools.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; import java.io.File; import java.io.FileInputStream; @@ -349,7 +350,18 @@ private long[] readLongs(final int count) { private void read(final ByteBuffer buffer) { try { - fileChannel.read(buffer); + int bytesExpected = buffer.limit(); + int bytesRead = fileChannel.read(buffer); + + // We have a rigid expectation here to read in exactly the number of bytes we've limited + // our buffer to -- if we read in fewer bytes than this, or encounter EOF (-1), the index + // must be truncated or otherwise corrupt: + if ( bytesRead < bytesExpected ) { + throw new UserException.MalformedFile(mFile, String.format("Premature end-of-file while reading BAM index file %s. " + + "It's likely that this file is truncated or corrupt -- " + + "Please try re-indexing the corresponding BAM file.", + mFile)); + } } catch(IOException ex) { throw new ReviewedStingException("Index: unable to read bytes from index file " + mFile); diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java new file mode 100644 index 0000000000..4005f1c321 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.datasources.reads;
+
+import net.sf.samtools.SAMRecord;
+import net.sf.samtools.util.CloseableIterator;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+
+import java.util.List;
+import java.util.NoSuchElementException;
+
+/**
+ * High-efficiency filtering iterator designed to filter out reads that are included
+ * in the query results only because of the granularity of the BAM index.
+ *
+ * Built into the BAM index is a notion of 16kbase granularity -- an index query for
+ * two regions contained within a 16kbase chunk (say, chr1:5-10 and chr1:11-20) will
+ * return exactly the same regions within the BAM file. This iterator is optimized
+ * to subtract out reads which do not at all overlap the interval list passed to the
+ * constructor.
+ *
+ * Example:
+ * interval list: chr20:6-10
+ * Reads that would pass through the filter: chr20:6-10, chr20:1-15, chr20:1-7, chr20:8-15.
+ * Reads that would be discarded by the filter: chr20:1-5, chr20:11-15.
+ */
+class IntervalOverlapFilteringIterator implements CloseableIterator<SAMRecord> {
+ /**
+ * The wrapped iterator.
+ */
+ private CloseableIterator<SAMRecord> iterator;
+
+ /**
+ * The next read, queued up and ready to go.
+ */
+ private SAMRecord nextRead;
+
+ /**
+ * Rather than filtering by the genomic bounds, keep only unmapped reads.
+ */
+ private boolean keepOnlyUnmappedReads;
+
+ /**
+ * Custom representation of interval bounds.
+ * Makes it simpler to track current position.
+ */
+ private int[] intervalContigIndices;
+ private int[] intervalStarts;
+ private int[] intervalEnds;
+
+ /**
+ * Position within the interval list.
+ */
+ private int currentBound = 0;
+
+ public IntervalOverlapFilteringIterator(CloseableIterator<SAMRecord> iterator, List<GenomeLoc> intervals) {
+ this.iterator = iterator;
+
+ // Look at the interval list to detect whether we should worry about unmapped reads.
+ // If we find a mix of mapped/unmapped intervals, throw an exception.
+ boolean foundMappedIntervals = false;
+ for(GenomeLoc location: intervals) {
+ if(! GenomeLoc.isUnmapped(location))
+ foundMappedIntervals = true;
+ keepOnlyUnmappedReads |= GenomeLoc.isUnmapped(location);
+ }
+
+
+ if(foundMappedIntervals) {
+ if(keepOnlyUnmappedReads)
+ throw new ReviewedStingException("Tried to apply IntervalOverlapFilteringIterator to a mix of mapped and unmapped intervals.
Please apply this filter to only mapped or only unmapped reads");
+ this.intervalContigIndices = new int[intervals.size()];
+ this.intervalStarts = new int[intervals.size()];
+ this.intervalEnds = new int[intervals.size()];
+ int i = 0;
+ for(GenomeLoc interval: intervals) {
+ intervalContigIndices[i] = interval.getContigIndex();
+ intervalStarts[i] = interval.getStart();
+ intervalEnds[i] = interval.getStop();
+ i++;
+ }
+ }
+
+ advance();
+ }
+
+ public boolean hasNext() {
+ return nextRead != null;
+ }
+
+ public SAMRecord next() {
+ if(nextRead == null)
+ throw new NoSuchElementException("No more reads left in this iterator.");
+ SAMRecord currentRead = nextRead;
+ advance();
+ return currentRead;
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException("Cannot remove from an IntervalOverlapFilteringIterator");
+ }
+
+
+ public void close() {
+ iterator.close();
+ }
+
+ private void advance() {
+ nextRead = null;
+
+ if(!iterator.hasNext())
+ return;
+
+ SAMRecord candidateRead = iterator.next();
+ while(nextRead == null && (keepOnlyUnmappedReads || currentBound < intervalStarts.length)) {
+ if(!keepOnlyUnmappedReads) {
+ // Mapped read filter; check against GenomeLoc-derived bounds.
+ if(readEndsOnOrAfterStartingBound(candidateRead)) {
+ // This read ends after the current interval begins.
+ // Promising, but this read must be checked against the ending bound.
+ if(readStartsOnOrBeforeEndingBound(candidateRead)) {
+ // Yes, this read is within both bounds. This must be our next read.
+ nextRead = candidateRead;
+ break;
+ }
+ else {
+ // Oops, we're past the end bound. Increment the current bound and try again.
+ currentBound++;
+ continue;
+ }
+ }
+ }
+ else {
+ // Found an unmapped read. We're done.
+ if(candidateRead.getReadUnmappedFlag()) {
+ nextRead = candidateRead;
+ break;
+ }
+ }
+
+ // No more reads available. Stop the search.
+ if(!iterator.hasNext())
+ break;
+
+ // No reasonable read found; advance the iterator.
+ candidateRead = iterator.next();
+ }
+ }
+
+ /**
+ * Check whether the read ends on or after the start of the current bound. If the read is unmapped but placed, its
+ * end will be distorted, so rely only on the alignment start.
+ * @param read The read to position-check.
+ * @return True if the read ends on or after the start of the current bound. False otherwise.
+ */
+ private boolean readEndsOnOrAfterStartingBound(final SAMRecord read) {
+ return
+ // Read ends on a later contig, or...
+ read.getReferenceIndex() > intervalContigIndices[currentBound] ||
+ // Read ends on this contig...
+ (read.getReferenceIndex() == intervalContigIndices[currentBound] &&
+ // either after this location, or...
+ (read.getAlignmentEnd() >= intervalStarts[currentBound] ||
+ // read is unmapped but positioned and alignment start is on or after this start point.
+ (read.getReadUnmappedFlag() && read.getAlignmentStart() >= intervalStarts[currentBound])));
+ }
+
+ /**
+ * Check whether the read starts on or before the end of the current bound.
+ * @param read The read to position-check.
+ * @return True if the read starts on or before the end of the current bound. False otherwise.
+ */
+ private boolean readStartsOnOrBeforeEndingBound(final SAMRecord read) {
+ return
+ // Read starts on a prior contig, or...
+ read.getReferenceIndex() < intervalContigIndices[currentBound] ||
+ // Read starts on this contig and the alignment start is registered before this end point.
+ (read.getReferenceIndex() == intervalContigIndices[currentBound] && read.getAlignmentStart() <= intervalEnds[currentBound]); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java index 8d73b1b158..96b55674ae 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java @@ -36,7 +36,7 @@ */ public class ReadShard extends Shard { /** - * What is the maximum number of reads which should go into a read shard. + * What is the maximum number of reads per BAM file which should go into a read shard. */ public static int MAX_READS = 10000; diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index b215763b53..bf7afe4f0d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -39,7 +39,6 @@ import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.iterators.*; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; -import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.SimpleTimer; @@ -47,6 +46,8 @@ import org.broadinstitute.sting.utils.baq.BAQSamIterator; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.recalibration.BQSRSamIterator; +import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory; import java.io.File; @@ -202,6 +203,7 @@ public SAMDataSource( BAQ.CalculationMode.OFF, BAQ.QualityMode.DONT_MODIFY, null, // no BAQ + null, // no BQSR (byte) -1); } @@ -238,6 +240,7 @@ public SAMDataSource( BAQ.CalculationMode cmode, BAQ.QualityMode qmode, IndexedFastaSequenceFile refReader, + BaseRecalibration bqsrApplier, byte defaultBaseQualities) { this.readMetrics = new ReadMetrics(); this.genomeLocParser = genomeLocParser; @@ -310,6 +313,7 @@ public SAMDataSource( cmode, qmode, refReader, + bqsrApplier, defaultBaseQualities); // cache the read group id (original) -> read group id (merged) @@ -555,7 +559,7 @@ private void initializeReaderPositions(SAMReaders readers) { */ private StingSAMIterator getIterator(SAMReaders readers, Shard shard, boolean enableVerification) { // Set up merging to dynamically merge together multiple BAMs. 
- MergingSamRecordIterator mergingIterator = readers.createMergingIterator(); + Map> iteratorMap = new HashMap>(); for(SAMReaderID id: getReaderIDs()) { CloseableIterator iterator = null; @@ -567,15 +571,23 @@ private StingSAMIterator getIterator(SAMReaders readers, Shard shard, boolean en if(threadAllocation.getNumIOThreads() > 0) { BlockInputStream inputStream = readers.getInputStream(id); - inputStream.submitAccessPlan(new SAMReaderPosition(id,inputStream,(GATKBAMFileSpan)shard.getFileSpans().get(id))); + inputStream.submitAccessPlan(new BAMAccessPlan(id, inputStream, (GATKBAMFileSpan) shard.getFileSpans().get(id))); + BAMRecordCodec codec = new BAMRecordCodec(getHeader(id),factory); + codec.setInputStream(inputStream); + iterator = new BAMCodecIterator(inputStream,readers.getReader(id),codec); } - iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id)); + else { + iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id)); + } + iterator = new MalformedBAMErrorReformatingIterator(id.samFile, iterator); if(shard.getGenomeLocs().size() > 0) iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs()); - mergingIterator.addIterator(readers.getReader(id),iterator); + iteratorMap.put(readers.getReader(id), iterator); } + MergingSamRecordIterator mergingIterator = readers.createMergingIterator(iteratorMap); + return applyDecoratingIterators(shard.getReadMetrics(), enableVerification, readProperties.useOriginalBaseQualities(), @@ -586,9 +598,53 @@ private StingSAMIterator getIterator(SAMReaders readers, Shard shard, boolean en readProperties.getBAQCalculationMode(), readProperties.getBAQQualityMode(), readProperties.getRefReader(), + readProperties.getBQSRApplier(), readProperties.defaultBaseQualities()); } + private class BAMCodecIterator implements CloseableIterator { + private final BlockInputStream inputStream; + private final SAMFileReader reader; + private final BAMRecordCodec codec; + private SAMRecord nextRead; + + private BAMCodecIterator(final BlockInputStream inputStream, final SAMFileReader reader, final BAMRecordCodec codec) { + this.inputStream = inputStream; + this.reader = reader; + this.codec = codec; + advance(); + } + + public boolean hasNext() { + return nextRead != null; + } + + public SAMRecord next() { + if(!hasNext()) + throw new NoSuchElementException("Unable to retrieve next record from BAMCodecIterator; input stream is empty"); + SAMRecord currentRead = nextRead; + advance(); + return currentRead; + } + + public void close() { + // NO-OP. + } + + public void remove() { + throw new UnsupportedOperationException("Unable to remove from BAMCodecIterator"); + } + + private void advance() { + final long startCoordinate = inputStream.getFilePointer(); + nextRead = codec.decode(); + final long stopCoordinate = inputStream.getFilePointer(); + + if(reader != null && nextRead != null) + PicardNamespaceUtils.setFileSource(nextRead,new SAMFileSource(reader,new GATKBAMFileSpan(new GATKChunk(startCoordinate,stopCoordinate)))); + } + } + /** * Filter reads based on user-specified criteria. 
* @@ -612,9 +668,10 @@ protected StingSAMIterator applyDecoratingIterators(ReadMetrics readMetrics, BAQ.CalculationMode cmode, BAQ.QualityMode qmode, IndexedFastaSequenceFile refReader, + BaseRecalibration bqsrApplier, byte defaultBaseQualities) { - if ( useOriginalBaseQualities || defaultBaseQualities >= 0 ) - // only wrap if we are replacing the original qualitiies or using a default base quality + if (useOriginalBaseQualities || defaultBaseQualities >= 0) + // only wrap if we are replacing the original qualities or using a default base quality wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities); // NOTE: this (and other filtering) should be done before on-the-fly sorting @@ -627,6 +684,9 @@ protected StingSAMIterator applyDecoratingIterators(ReadMetrics readMetrics, if (!noValidationOfReadOrder && enableVerification) wrappedIterator = new VerifyingSamIterator(genomeLocParser,wrappedIterator); + if (bqsrApplier != null) + wrappedIterator = new BQSRSamIterator(wrappedIterator, bqsrApplier); + if (cmode != BAQ.CalculationMode.OFF) wrappedIterator = new BAQSamIterator(refReader, wrappedIterator, cmode, qmode); @@ -796,8 +856,13 @@ public String getReadGroupId(final SAMReaderID readerID, final String originalRe return headerMerger.getReadGroupId(header,originalReadGroupID); } - public MergingSamRecordIterator createMergingIterator() { - return new MergingSamRecordIterator(headerMerger,readers.values(),true); + /** + * Creates a new merging iterator from the given map, with the given header. + * @param iteratorMap A map of readers to iterators. + * @return An iterator which will merge those individual iterators. + */ + public MergingSamRecordIterator createMergingIterator(final Map> iteratorMap) { + return new MergingSamRecordIterator(headerMerger,iteratorMap,true); } /** @@ -863,12 +928,9 @@ public ReaderInitializer(final SAMReaderID readerID) { public ReaderInitializer call() { final File indexFile = findIndexFile(readerID.samFile); try { - if (threadAllocation.getNumIOThreads() > 0) { + if (threadAllocation.getNumIOThreads() > 0) blockInputStream = new BlockInputStream(dispatcher,readerID,false); - reader = new SAMFileReader(blockInputStream,indexFile,false); - } - else - reader = new SAMFileReader(readerID.samFile,indexFile,false); + reader = new SAMFileReader(readerID.samFile,indexFile,false); } catch ( RuntimeIOException e ) { if ( e.getCause() != null && e.getCause() instanceof FileNotFoundException ) throw new UserException.CouldNotReadInputFile(readerID.samFile, e); @@ -927,167 +989,6 @@ public SAMRecord next() { */ private class ReadGroupMapping extends HashMap {} - /** - * Filters out reads that do not overlap the current GenomeLoc. - * Note the custom implementation: BAM index querying returns all reads that could - * possibly overlap the given region (and quite a few extras). In order not to drag - * down performance, this implementation is highly customized to its task. - */ - private class IntervalOverlapFilteringIterator implements CloseableIterator { - /** - * The wrapped iterator. - */ - private CloseableIterator iterator; - - /** - * The next read, queued up and ready to go. - */ - private SAMRecord nextRead; - - /** - * Rather than using the straight genomic bounds, use filter out only mapped reads. - */ - private boolean keepOnlyUnmappedReads; - - /** - * Custom representation of interval bounds. - * Makes it simpler to track current position. 
- */ - private int[] intervalContigIndices; - private int[] intervalStarts; - private int[] intervalEnds; - - /** - * Position within the interval list. - */ - private int currentBound = 0; - - public IntervalOverlapFilteringIterator(CloseableIterator iterator, List intervals) { - this.iterator = iterator; - - // Look at the interval list to detect whether we should worry about unmapped reads. - // If we find a mix of mapped/unmapped intervals, throw an exception. - boolean foundMappedIntervals = false; - for(GenomeLoc location: intervals) { - if(! GenomeLoc.isUnmapped(location)) - foundMappedIntervals = true; - keepOnlyUnmappedReads |= GenomeLoc.isUnmapped(location); - } - - - if(foundMappedIntervals) { - if(keepOnlyUnmappedReads) - throw new ReviewedStingException("Tried to apply IntervalOverlapFilteringIterator to a mixed of mapped and unmapped intervals. Please apply this filter to only mapped or only unmapped reads"); - this.intervalContigIndices = new int[intervals.size()]; - this.intervalStarts = new int[intervals.size()]; - this.intervalEnds = new int[intervals.size()]; - int i = 0; - for(GenomeLoc interval: intervals) { - intervalContigIndices[i] = interval.getContigIndex(); - intervalStarts[i] = interval.getStart(); - intervalEnds[i] = interval.getStop(); - i++; - } - } - - advance(); - } - - public boolean hasNext() { - return nextRead != null; - } - - public SAMRecord next() { - if(nextRead == null) - throw new NoSuchElementException("No more reads left in this iterator."); - SAMRecord currentRead = nextRead; - advance(); - return currentRead; - } - - public void remove() { - throw new UnsupportedOperationException("Cannot remove from an IntervalOverlapFilteringIterator"); - } - - - public void close() { - iterator.close(); - } - - private void advance() { - nextRead = null; - - if(!iterator.hasNext()) - return; - - SAMRecord candidateRead = iterator.next(); - while(nextRead == null && (keepOnlyUnmappedReads || currentBound < intervalStarts.length)) { - if(!keepOnlyUnmappedReads) { - // Mapped read filter; check against GenomeLoc-derived bounds. - if(readEndsOnOrAfterStartingBound(candidateRead)) { - // This read ends after the current interval begins. - // Promising, but this read must be checked against the ending bound. - if(readStartsOnOrBeforeEndingBound(candidateRead)) { - // Yes, this read is within both bounds. This must be our next read. - nextRead = candidateRead; - break; - } - else { - // Oops, we're past the end bound. Increment the current bound and try again. - currentBound++; - continue; - } - } - } - else { - // Found an unmapped read. We're done. - if(candidateRead.getReadUnmappedFlag()) { - nextRead = candidateRead; - break; - } - } - - // No more reads available. Stop the search. - if(!iterator.hasNext()) - break; - - // No reasonable read found; advance the iterator. - candidateRead = iterator.next(); - } - } - - /** - * Check whether the read lies after the start of the current bound. If the read is unmapped but placed, its - * end will be distorted, so rely only on the alignment start. - * @param read The read to position-check. - * @return True if the read starts after the current bounds. False otherwise. - */ - private boolean readEndsOnOrAfterStartingBound(final SAMRecord read) { - return - // Read ends on a later contig, or... - read.getReferenceIndex() > intervalContigIndices[currentBound] || - // Read ends of this contig... - (read.getReferenceIndex() == intervalContigIndices[currentBound] && - // either after this location, or... 
- (read.getAlignmentEnd() >= intervalStarts[currentBound] || - // read is unmapped but positioned and alignment start is on or after this start point. - (read.getReadUnmappedFlag() && read.getAlignmentStart() >= intervalStarts[currentBound]))); - } - - /** - * Check whether the read lies before the end of the current bound. - * @param read The read to position-check. - * @return True if the read starts after the current bounds. False otherwise. - */ - private boolean readStartsOnOrBeforeEndingBound(final SAMRecord read) { - return - // Read starts on a prior contig, or... - read.getReferenceIndex() < intervalContigIndices[currentBound] || - // Read starts on this contig and the alignment start is registered before this end point. - (read.getReferenceIndex() == intervalContigIndices[currentBound] && read.getAlignmentStart() <= intervalEnds[currentBound]); - } - } - /** * Locates the index file alongside the given BAM, if present. * TODO: This is currently a hachetjob that reaches into Picard and pulls out its index file locator. Replace with something more permanent. diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index 39e1bdc726..433c7d82fb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -11,7 +11,7 @@ import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.threading.ThreadPoolMonitor; import java.util.Collection; @@ -102,7 +102,7 @@ public Object execute( Walker walker, Iterable shardStrategy ) { while (isShardTraversePending() || isTreeReducePending()) { // Check for errors during execution. if(hasTraversalErrorOccurred()) - throw new ReviewedStingException("An error has occurred during the traversal.",getTraversalError()); + throw getTraversalError(); // Too many files sitting around taking up space? Merge them. if (isMergeLimitExceeded()) @@ -345,10 +345,15 @@ private synchronized boolean hasTraversalErrorOccurred() { return error != null; } - private synchronized Throwable getTraversalError() { + private synchronized StingException getTraversalError() { if(!hasTraversalErrorOccurred()) throw new ReviewedStingException("User has attempted to retrieve a traversal error when none exists"); - return error; + + // If the error is already a StingException, pass it along as is. Otherwise, wrap it. 
+ if(error instanceof StingException) + return (StingException)error; + else + return new ReviewedStingException("An error occurred during the traversal.",error); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index ff5e1064bd..16487054bc 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -10,6 +10,7 @@ import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.io.DirectOutputTracker; import org.broadinstitute.sting.gatk.io.OutputTracker; +import org.broadinstitute.sting.gatk.traversals.TraverseActiveRegions; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.SampleUtils; @@ -55,7 +56,6 @@ public Object execute(Walker walker, Iterable shardStrategy) { traversalEngine.startTimersIfNecessary(); if(shard.getShardType() == Shard.ShardType.LOCUS) { - LocusWalker lWalker = (LocusWalker)walker; WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), getReadIterator(shard), shard.getGenomeLocs(), SampleUtils.getSAMFileSamples(engine)); for(WindowMaker.WindowMakerIterator iterator: windowMaker) { @@ -77,6 +77,12 @@ public Object execute(Walker walker, Iterable shardStrategy) { done = walker.isDone(); } + // Special function call to empty out the work queue. Ugly for now but will be cleaned up when we eventually push this functionality more into the engine + if( traversalEngine instanceof TraverseActiveRegions ) { + final Object result = ((TraverseActiveRegions) traversalEngine).endTraversal(walker, accumulator.getReduceInit()); + accumulator.accumulate(null, result); // Assumes only used with StandardAccumulator + } + Object result = accumulator.finishTraversal(); printOnTraversalDone(result); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index d013db7e84..5080997084 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -128,6 +128,8 @@ protected MicroScheduler(GenomeAnalysisEngine engine, Walker walker, SAMDataSour traversalEngine = new TraverseDuplicates(); } else if (walker instanceof ReadPairWalker) { traversalEngine = new TraverseReadPairs(); + } else if (walker instanceof ActiveRegionWalker) { + traversalEngine = new TraverseActiveRegions(); } else { throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type."); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java index d1f5d80daf..da11d36ddd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java @@ -17,9 +17,21 @@ import java.util.NoSuchElementException; /** - * Buffer shards of data which may or may not contain multiple loci into - * iterators of all data which cover an interval. Its existence is an homage - * to Mark's stillborn WindowMaker, RIP 2009. 
+ * Transforms an iterator of reads which overlap the given interval list into an iterator of covered single-base loci
+ * completely contained within the interval list. To do this, it creates a LocusIteratorByState which will emit a single-bp
+ * locus for every base covered by the read iterator, then uses the WindowMakerIterator.advance() to filter down that stream of
+ * loci to only those covered by the given interval list.
+ *
+ * Example:
+ * Incoming stream of reads: A:chr20:1-5, B:chr20:2-6, C:chr20:2-7, D:chr20:3-8, E:chr20:5-10
+ * Incoming intervals: chr20:3-7
+ *
+ * Locus iterator by state will produce the following stream of data:
+ *  chr20:1 {A}, chr20:2 {A,B,C}, chr20:3 {A,B,C,D}, chr20:4 {A,B,C,D}, chr20:5 {A,B,C,D,E},
+ *  chr20:6 {B,C,D,E}, chr20:7 {C,D,E}, chr20:8 {D,E}, chr20:9 {E}, chr20:10 {E}
+ *
+ * WindowMakerIterator will then filter the incoming stream, emitting the following stream:
+ *  chr20:3 {A,B,C,D}, chr20:4 {A,B,C,D}, chr20:5 {A,B,C,D,E}, chr20:6 {B,C,D,E}, chr20:7 {C,D,E}
 *
 * @author mhanna
 * @version 0.1
diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java
index 75e787e05a..a47c61d0b8 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java
@@ -49,9 +49,13 @@
 import java.util.*;

-/** Iterator that traverses a SAM File, accumulating information on a per-locus basis */
+/**
+ * Iterator that traverses a SAM File, accumulating information on a per-locus basis
+ */
 public class LocusIteratorByState extends LocusIterator {
-    /** our log, which we want to capture anything from this class */
+    /**
+     * our log, which we want to capture anything from this class
+     */
     private static Logger logger = Logger.getLogger(LocusIteratorByState.class);

 // -----------------------------------------------------------------------------------------------------------------
@@ -70,16 +74,15 @@ public class LocusIteratorByState {
     static private class SAMRecordState {
         SAMRecord read;
-        int readOffset = -1;     // how far are we offset from the start of the read bases?
-        int genomeOffset = -1;   // how far are we offset from the alignment start on the genome?
+        int readOffset = -1;                       // how far are we offset from the start of the read bases?
+        int genomeOffset = -1;                     // how far are we offset from the alignment start on the genome?

         Cigar cigar = null;
         int cigarOffset = -1;
         CigarElement curElement = null;
         int nCigarElements = 0;

-        // how far are we into a single cigarElement
-        int cigarElementCounter = -1;
+        int cigarElementCounter = -1;              // how far are we into a single cigarElement

         // The logical model for generating extended events is as follows: the "record state" implements the traversal
         // along the reference; thus stepForwardOnGenome() returns on every and only on actual reference bases. This
         // stepForwardOnGenome(). The next call to stepForwardOnGenome() will clear that memory (as we remember only extended
         // events immediately preceding the current reference base).

-        boolean generateExtendedEvents = true; // should we generate an additional, special pile for indels between the ref bases?
-                                               // the only purpose of this flag is to shield away a few additional lines of code
-                                               // when extended piles are not needed, it may not be even worth it...
- byte[] insertedBases = null; // remember full inserted sequence if we are generating piles of extended events (indels) - int eventLength = -1; // will be set to the length of insertion/deletion if we are generating piles of extended events - byte eventDelayedFlag = 0; // will be set to non-0 if there was an event (indel) right before the - // current base on the ref. We use a counter-like variable here since clearing the indel event is - // delayed by one base, so we need to remember how long ago we have seen the actual event - int eventStart = -1; // where on the read the extended event starts (i.e. the last position on the read prior to the - // event, or -1 if alignment starts with an insertion); this one is easy to recompute on the fly, - // we cache it here mainly for convenience + boolean generateExtendedEvents = true; // should we generate an additional, special pile for indels between the ref bases? + // the only purpose of this flag is to shield away a few additional lines of code + // when extended piles are not needed, it may not be even worth it... + + byte[] insertedBases = null; // remember full inserted sequence if we are generating piles of extended events (indels) + int eventLength = -1; // will be set to the length of insertion/deletion if we are generating piles of extended events + byte eventDelayedFlag = 0; // will be set to non-0 if there was an event (indel) right before the + // current base on the ref. We use a counter-like variable here since clearing the indel event is + // delayed by one base, so we need to remember how long ago we have seen the actual event + + int eventStart = -1; // where on the read the extended event starts (i.e. the last position on the read prior to the + // event, or -1 if alignment starts with an insertion); this one is easy to recompute on the fly, + // we cache it here mainly for convenience public SAMRecordState(SAMRecord read, boolean extended) { @@ -111,23 +116,31 @@ public SAMRecordState(SAMRecord read, boolean extended) { //System.out.printf("Creating a SAMRecordState: %s%n", this); } - public SAMRecord getRead() { return read; } + public SAMRecord getRead() { + return read; + } /** * What is our current offset in the read's bases that aligns us with the reference genome? * * @return */ - public int getReadOffset() { return readOffset; } + public int getReadOffset() { + return readOffset; + } /** * What is the current offset w.r.t. the alignment state that aligns us to the readOffset? * * @return */ - public int getGenomeOffset() { return genomeOffset; } + public int getGenomeOffset() { + return genomeOffset; + } - public int getGenomePosition() { return read.getAlignmentStart() + getGenomeOffset(); } + public int getGenomePosition() { + return read.getAlignmentStart() + getGenomeOffset(); + } public GenomeLoc getLocation(GenomeLocParser genomeLocParser) { return genomeLocParser.createGenomeLoc(read.getReferenceName(), getGenomePosition()); @@ -137,52 +150,66 @@ public CigarOperator getCurrentCigarOperator() { return curElement.getOperator(); } - /** Returns true if we just stepped over insertion/into a deletion prior to the last return from stepForwardOnGenome. + /** + * Returns true if we just stepped over insertion/into a deletion prior to the last return from stepForwardOnGenome. 
* * @return */ public boolean hadIndel() { - return ( eventLength > 0 ); + return (eventLength > 0); } - public int getEventLength() { return eventLength; } + public int getEventLength() { + return eventLength; + } - public byte[] getEventBases() { return insertedBases; } + public byte[] getEventBases() { + return insertedBases; + } - public int getReadEventStartOffset() { return eventStart; } + public int getReadEventStartOffset() { + return eventStart; + } public String toString() { return String.format("%s ro=%d go=%d co=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, cigarOffset, cigarElementCounter, curElement); } + public CigarElement peekForwardOnGenome() { + return ( cigarElementCounter + 1 > curElement.getLength() && cigarOffset + 1 < nCigarElements ? cigar.getCigarElement(cigarOffset + 1) : curElement ); + } + public CigarOperator stepForwardOnGenome() { // we enter this method with readOffset = index of the last processed base on the read // (-1 if we did not process a single base yet); this can be last matching base, or last base of an insertion - if ( curElement == null || ++cigarElementCounter > curElement.getLength() ) { + if (curElement == null || ++cigarElementCounter > curElement.getLength()) { cigarOffset++; - if ( cigarOffset < nCigarElements ) { + if (cigarOffset < nCigarElements) { curElement = cigar.getCigarElement(cigarOffset); cigarElementCounter = 0; // next line: guards against cigar elements of length 0; when new cigar element is retrieved, // we reenter in order to re-check cigarElementCounter against curElement's length return stepForwardOnGenome(); } else { + if (curElement != null && curElement.getOperator() == CigarOperator.D) + throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString()); + // Reads that contain indels model the genomeOffset as the following base in the reference. Because // we fall into this else block only when indels end the read, increment genomeOffset such that the // current offset of this read is the next ref base after the end of the indel. This position will // model a point on the reference somewhere after the end of the read. genomeOffset++; // extended events need that. Logically, it's legal to advance the genomic offset here: - // we do step forward on the ref, and by returning null we also indicate that we are past the read end. + // we do step forward on the ref, and by returning null we also indicate that we are past the read end. - if ( generateExtendedEvents && eventDelayedFlag > 0 ) { + if (generateExtendedEvents && eventDelayedFlag > 0) { // if we had an indel right before the read ended (i.e. insertion was the last cigar element), // we keep it until next reference base; then we discard it and this will allow the LocusIterator to // finally discard this read eventDelayedFlag--; - if ( eventDelayedFlag == 0 ) { + if (eventDelayedFlag == 0) { eventLength = -1; // reset event when we are past it insertedBases = null; eventStart = -1; @@ -193,34 +220,37 @@ public CigarOperator stepForwardOnGenome() { } } - boolean done = false; switch (curElement.getOperator()) { - case H : // ignore hard clips - case P : // ignore pads + case H: // ignore hard clips + case P: // ignore pads cigarElementCounter = curElement.getLength(); break; - case I : // insertion w.r.t. the reference - if ( generateExtendedEvents ) { + case I: // insertion w.r.t. 
the reference
+                if (generateExtendedEvents) {
                     // we see insertions only once, when we step right onto them; the position on the read is scrolled
                     // past the insertion right after that
-                    if ( eventDelayedFlag > 1 ) throw new UserException.MalformedBAM(read, "Adjacent I/D events in read "+read.getReadName());
-                    insertedBases = Arrays.copyOfRange(read.getReadBases(),readOffset+1,readOffset+1+curElement.getLength());
-                    eventLength = curElement.getLength() ;
+                    if (eventDelayedFlag > 1)
+                        throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s", read.getReadName(), read.getCigarString()));
+                    insertedBases = Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + curElement.getLength());
+                    eventLength = curElement.getLength();
                     eventStart = readOffset;
                     eventDelayedFlag = 2; // insertion causes re-entry into stepForwardOnGenome, so we set the delay to 2
                     // System.out.println("Inserted "+(new String (insertedBases)) +" after "+readOffset);
                 }
                 // continue onto the 'S' case !
-            case S : // soft clip
+            case S: // soft clip
                 cigarElementCounter = curElement.getLength();
                 readOffset += curElement.getLength();
                 break;
-            case D : // deletion w.r.t. the reference
-                if ( generateExtendedEvents ) {
-                    if ( cigarElementCounter == 1) {
+            case D: // deletion w.r.t. the reference
+                if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string
+                    throw new UserException.MalformedBAM(read, "Read starting with deletion. Cigar: " + read.getCigarString());
+                if (generateExtendedEvents) {
+                    if (cigarElementCounter == 1) {
                         // generate an extended event only if we just stepped into the deletion (i.e. don't
                         // generate the event at every deleted position on the ref, that's what cigarElementCounter==1 is for!)
-                        if ( eventDelayedFlag > 1 ) throw new UserException.MalformedBAM(read, "Adjacent I/D events in read "+read.getReadName());
+                        if (eventDelayedFlag > 1)
+                            throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s", read.getReadName(), read.getCigarString()));
                         eventLength = curElement.getLength();
                         eventDelayedFlag = 2; // deletion on the ref causes an immediate return, so we have to delay by 1 only
                         eventStart = readOffset;
@@ -232,26 +262,27 @@ public CigarOperator stepForwardOnGenome() {
                 genomeOffset++;
                 done = true;
                 break;
-            case N : // reference skip (looks and gets processed just like a "deletion", just different logical meaning)
+            case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning)
                 genomeOffset++;
                 done = true;
                 break;
-            case M :
+            case M:
                 readOffset++;
                 genomeOffset++;
                 done = true;
                 break;
-            default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator());
+            default:
+                throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator());
         }

-        if ( generateExtendedEvents ) {
-            if ( eventDelayedFlag > 0 && done ) {
-                // if we did make a successful step on the ref, decrement delayed flag. If, upon the decrementthe,
+        if (generateExtendedEvents) {
+            if (eventDelayedFlag > 0 && done) {
+                // if we did make a successful step on the ref, decrement the delayed flag. If, upon decrementing,
                 // the flag is 1, we are standing on the reference base right after the indel (so we have to keep it).
                 // Otherwise, we are away from the previous indel and have to clear our memories...
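                // A minimal standalone sketch of this countdown (an editor's illustration only, not
                // part of this change; the names DelayedEventSketch/onIndel/stepForward are hypothetical).
                // An indel arms a countdown of 2; each successful step on the genome decrements it, so the
                // event stays visible exactly while we stand on the base right after the indel:
                //
                //     class DelayedEventSketch {
                //         int eventDelayedFlag = 0; // set to 2 when an indel is first seen
                //         int eventLength = -1;     // length of the pending indel, -1 when none is pending
                //
                //         void onIndel(final int length) {
                //             eventLength = length;
                //             eventDelayedFlag = 2; // stays visible for exactly one subsequent step
                //         }
                //
                //         void stepForward() {      // called once per reference base traversed
                //             if (eventDelayedFlag > 0 && --eventDelayedFlag == 0)
                //                 eventLength = -1; // past the base following the indel: forget the event
                //         }
                //
                //         boolean hadIndel() { return eventLength > 0; }
                //     }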
                eventDelayedFlag--; // when we notice an indel, we set delayed flag to 2, so now
-                // if eventDelayedFlag == 1, an indel occured right before the current base
-                if ( eventDelayedFlag == 0 ) {
+                // if eventDelayedFlag == 1, an indel occurred right before the current base
+                if (eventDelayedFlag == 0) {
                     eventLength = -1; // reset event when we are past it
                     insertedBases = null;
                     eventStart = -1;
@@ -274,15 +305,15 @@ public CigarOperator stepForwardOnGenome() {
 //
 // -----------------------------------------------------------------------------------------------------------------

-    public LocusIteratorByState(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection samples ) {
+    public LocusIteratorByState(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection samples) {
         this.readInfo = readInformation;
         this.genomeLocParser = genomeLocParser;
         this.samples = new ArrayList(samples);
-        this.readStates = new ReadStateManager(samIterator,readInformation.getDownsamplingMethod());
+        this.readStates = new ReadStateManager(samIterator, readInformation.getDownsamplingMethod());

         // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when
         // there's no read data. So we need to throw this error only when samIterator.hasNext() is true
-        if ( this.samples.isEmpty() && samIterator.hasNext() ) {
+        if (this.samples.isEmpty() && samIterator.hasNext()) {
             throw new IllegalArgumentException("samples list must not be empty");
         }
     }
@@ -322,7 +353,7 @@ private GenomeLoc getLocation() {
 // -----------------------------------------------------------------------------------------------------------------
     public AlignmentContext next() {
         lazyLoadNextAlignmentContext();
-        if(!hasNext())
+        if (!hasNext())
             throw new NoSuchElementException("LocusIteratorByState: out of elements.");
         AlignmentContext currentAlignmentContext = nextAlignmentContext;
         nextAlignmentContext = null;
@@ -334,7 +365,7 @@ public AlignmentContext next() {
      * nextAlignmentContext MUST BE null in order for this method to advance to the next entry.
      */
     private void lazyLoadNextAlignmentContext() {
-        while(nextAlignmentContext == null && readStates.hasNext()) {
+        while (nextAlignmentContext == null && readStates.hasNext()) {
             // this call will set hasExtendedEvents to true if it picks up a read with indel right before the current position on the ref:
             readStates.collectPendingReads();
@@ -350,14 +381,14 @@ private void lazyLoadNextAlignmentContext() {
             // In this case, the subsequent call to next() will emit the normal pileup at the current base
             // and shift the position.
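            // Consumer-side sketch of the ordering just described (an editor's illustration only, not
            // part of this change; it assumes AlignmentContext exposes an accessor along the lines of
            // hasExtendedEventPileup()). An indel between two reference bases yields two consecutive
            // contexts from this iterator:
            //
            //     while (locusIterator.hasNext()) {
            //         final AlignmentContext context = locusIterator.next();
            //         if (context.hasExtendedEventPileup()) {
            //             // emitted first, anchored at the reference base *preceding* the indel
            //         } else {
            //             // the regular base pileup at the current position arrives on the next call
            //         }
            //     }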
if (readInfo.generateExtendedEvents() && hasExtendedEvents) { - Map fullExtendedEventPileup = new HashMap(); + Map fullExtendedEventPileup = new HashMap(); // get current location on the reference and decrement it by 1: the indels we just stepped over // are associated with the *previous* reference base - GenomeLoc loc = genomeLocParser.incPos(getLocation(),-1); + GenomeLoc loc = genomeLocParser.incPos(getLocation(), -1); boolean hasBeenSampled = false; - for(final String sample: samples) { + for (final String sample : samples) { Iterator iterator = readStates.iterator(sample); List indelPile = new ArrayList(readStates.size(sample)); hasBeenSampled |= loc.getStart() <= readStates.getDownsamplingExtent(sample); @@ -368,103 +399,117 @@ private void lazyLoadNextAlignmentContext() { nMQ0Reads = 0; int maxDeletionLength = 0; - while(iterator.hasNext()) { - SAMRecordState state = iterator.next(); - if ( state.hadIndel() ) { + while (iterator.hasNext()) { + final SAMRecordState state = iterator.next(); + final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read + final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator + final int readOffset = state.getReadOffset(); // the base offset on this read + final int eventStartOffset = state.getReadEventStartOffset(); // this will be -1 if base is not a deletion, or if base is the first deletion in the event. Otherwise, it will give the last base before the deletion began. + final int eventLength = state.getEventLength(); + + if (op == CigarOperator.N) // N's are never added to any pileup + continue; + + if (state.hadIndel()) { // this read has an indel associated with the previous position on the ref size++; - if ( state.getEventBases() == null ) { + ExtendedEventPileupElement pileupElement; + if (state.getEventBases() == null) { // Deletion event nDeletions++; - maxDeletionLength = Math.max(maxDeletionLength,state.getEventLength()); + maxDeletionLength = Math.max(maxDeletionLength, state.getEventLength()); + pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength); } - else nInsertions++; - indelPile.add ( new ExtendedEventPileupElement((GATKSAMRecord) state.getRead(), state.getReadEventStartOffset(), state.getEventLength(), state.getEventBases()) ); - - } else { - // HACK: The readahead mechanism for LocusIteratorByState will effectively read past the current position - // and add in extra reads that start after this indel. Skip these reads. - // My belief at this moment after empirically looking at read->ref alignment is that, in a cigar string - // like 1I76M, the first insertion is between alignment start-1 and alignment start, so we shouldn't be - // filtering these out. - // TODO: UPDATE! Eric tells me that we *might* want reads adjacent to the pileup in the pileup. Strike this block. - //if(state.getRead().getAlignmentStart() > loc.getStart()) - // continue; - - if ( state.getCurrentCigarOperator() != CigarOperator.N ) { - // this read has no indel associated with the previous position on the ref; - // we count this read in only if it has actual bases, not N span... 
-                        if ( state.getCurrentCigarOperator() != CigarOperator.D || readInfo.includeReadsWithDeletionAtLoci() ) {
-
-                            // if cigar operator is D but the read has no extended event reported (that's why we ended
-                            // up in this branch), it means that we are currently inside a deletion that started earlier;
-                            // we count such reads (with a longer deletion spanning over a deletion at the previous base we are
-                            // about to report) only if includeReadsWithDeletionAtLoci is true.
-                            size++;
-                            indelPile.add ( new ExtendedEventPileupElement((GATKSAMRecord) state.getRead(), state.getReadOffset()-1, -1) // length=-1 --> noevent
-                            );
-                        }
+                        else { // Insertion event
+                            nInsertions++;
+                            pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength, state.getEventBases());
                         }
+                        if (read.getMappingQuality() == 0)
+                            nMQ0Reads++;
+
+                        indelPile.add(pileupElement);
                     }
-                    if ( state.getRead().getMappingQuality() == 0 ) {
-                        nMQ0Reads++;
+
+                    // this read has no indel event here, so add it to the pileup as a NOEVENT element:
+                    // either a (mis)match, or a deletion that started earlier and merely spans this
+                    // position (and is therefore not an extended event); spanning deletions are added
+                    // only if includeReadsWithDeletionAtLoci is true.
+                    else if (op != CigarOperator.D || readInfo.includeReadsWithDeletionAtLoci()) {
+                        size++;
+                        indelPile.add(new ExtendedEventPileupElement((GATKSAMRecord) state.getRead(), readOffset));
+                        if (read.getMappingQuality() == 0)
+                            nMQ0Reads++;
                     }
                 }
-                if( indelPile.size() != 0 ) fullExtendedEventPileup.put(sample,new ReadBackedExtendedEventPileupImpl(loc,indelPile,size,maxDeletionLength,nInsertions,nDeletions,nMQ0Reads));
+
+                if (indelPile.size() != 0)
+                    fullExtendedEventPileup.put(sample, new ReadBackedExtendedEventPileupImpl(loc, indelPile, size, maxDeletionLength, nInsertions, nDeletions, nMQ0Reads));
                 }
-                hasExtendedEvents = false; // we are done with extended events prior to current ref base
-//                System.out.println("Indel(s) at "+loc);
-//               for ( ExtendedEventPileupElement pe : indelPile ) { if ( pe.isIndel() ) System.out.println("    "+pe.toString()); }
+                hasExtendedEvents = false; // we are done with extended events prior to current ref base
                 nextAlignmentContext = new AlignmentContext(loc, new ReadBackedExtendedEventPileupImpl(loc, fullExtendedEventPileup), hasBeenSampled);
-            } else {
+            }
+            else {
                 // this is a regular event pileup (not extended)
                 GenomeLoc location = getLocation();
-                Map fullPileup = new HashMap();
-
+                Map fullPileup = new HashMap();
                 boolean hasBeenSampled = false;
-                for(final String sample: samples) {
+                for (final String sample : samples) {
                     Iterator iterator = readStates.iterator(sample);
                     List pile = new ArrayList(readStates.size(sample));
                     hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample);
-                    size = 0;
-                    nDeletions = 0;
-                    nMQ0Reads = 0;
+                    size = 0;                                  // number of elements in this sample's pileup
+                    nDeletions = 0;                            // number of deletions in this sample's pileup
+                    nMQ0Reads = 0;                             // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0)

-                    while(iterator.hasNext()) {
-                        SAMRecordState state = iterator.next();
-                        if ( state.getCurrentCigarOperator() != CigarOperator.D && state.getCurrentCigarOperator() != CigarOperator.N ) {
-                            if ( filterBaseInRead((GATKSAMRecord) state.getRead(), location.getStart()) ) {
-                                //discarded_bases++;
-                                //printStatus("Adaptor bases", discarded_adaptor_bases);
-                                continue;
-                            } else {
-                                //observed_bases++;
-                                pile.add(new PileupElement((GATKSAMRecord) state.getRead(), state.getReadOffset()));
+                    while (iterator.hasNext()) {
+                        final SAMRecordState state = iterator.next();          // state object
with the read/offset information + final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read + final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator + final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element + final CigarOperator nextOp = nextElement.getOperator(); + final int readOffset = state.getReadOffset(); // the base offset on this read + + int nextElementLength = nextElement.getLength(); + + if (op == CigarOperator.N) // N's are never added to any pileup + continue; + + if (op == CigarOperator.D) { + if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so + pile.add(new PileupElement(read, readOffset, true, nextOp == CigarOperator.D, nextOp == CigarOperator.I, nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()), + null,nextOp == CigarOperator.D? nextElementLength:-1)); size++; + nDeletions++; + if (read.getMappingQuality() == 0) + nMQ0Reads++; } - } else if ( readInfo.includeReadsWithDeletionAtLoci() && state.getCurrentCigarOperator() != CigarOperator.N ) { - size++; - pile.add(new PileupElement((GATKSAMRecord) state.getRead(), -1)); - nDeletions++; } - - if ( state.getRead().getMappingQuality() == 0 ) { - nMQ0Reads++; + else { + if (!filterBaseInRead(read, location.getStart())) { + String insertedBaseString = null; + if (nextOp == CigarOperator.I) { + insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + nextElement.getLength())); + } + pile.add(new PileupElement(read, readOffset, false, nextOp == CigarOperator.D, nextOp == CigarOperator.I, nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()), + insertedBaseString,nextElementLength)); + size++; + if (read.getMappingQuality() == 0) + nMQ0Reads++; + } } } - if( pile.size() != 0 ) - fullPileup.put(sample,new ReadBackedPileupImpl(location,pile,size,nDeletions,nMQ0Reads)); + if (pile.size() != 0) // if this pileup added at least one base, add it to the full pileup + fullPileup.put(sample, new ReadBackedPileupImpl(location, pile, size, nDeletions, nMQ0Reads)); } - updateReadStates(); // critical - must be called after we get the current state offsets and location - // if we got reads with non-D/N over the current position, we are done - if ( !fullPileup.isEmpty() ) nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location,fullPileup),hasBeenSampled); + updateReadStates(); // critical - must be called after we get the current state offsets and location + if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done + nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled); } } } // fast testing of position private boolean readIsPastCurrentPosition(SAMRecord read) { - if ( readStates.isEmpty() ) + if (readStates.isEmpty()) return false; else { SAMRecordState state = readStates.getFirst(); @@ -485,20 +530,18 @@ private static boolean filterBaseInRead(GATKSAMRecord rec, long pos) { } private void updateReadStates() { - for(final String sample: samples) { + for (final String sample : samples) { Iterator it = readStates.iterator(sample); - while ( it.hasNext() ) { + while (it.hasNext()) { SAMRecordState state = it.next(); CigarOperator op = state.stepForwardOnGenome(); - if ( state.hadIndel() && 
readInfo.generateExtendedEvents() ) hasExtendedEvents = true; - else { + if (state.hadIndel() && readInfo.generateExtendedEvents()) + hasExtendedEvents = true; + else if (op == null) { // we discard the read only when we are past its end AND indel at the end of the read (if any) was // already processed. Keeping the read state that retunred null upon stepForwardOnGenome() is safe // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. - if ( op == null ) { // we've stepped off the end of the object - //if (DEBUG) logger.debug(String.format(" removing read %s at %d", state.getRead().getReadName(), state.getRead().getAlignmentStart())); - it.remove(); - } + it.remove(); // we've stepped off the end of the object } } } @@ -508,20 +551,20 @@ public void remove() { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); } - private class ReadStateManager { + private class ReadStateManager { private final PeekableIterator iterator; private final DownsamplingMethod downsamplingMethod; private final SamplePartitioner samplePartitioner; - private final Map readStatesBySample = new HashMap(); + private final Map readStatesBySample = new HashMap(); private final int targetCoverage; private int totalReadStates = 0; public ReadStateManager(Iterator source, DownsamplingMethod downsamplingMethod) { this.iterator = new PeekableIterator(source); this.downsamplingMethod = downsamplingMethod.type != null ? downsamplingMethod : DownsamplingMethod.NONE; - switch(this.downsamplingMethod.type) { + switch (this.downsamplingMethod.type) { case BY_SAMPLE: - if(downsamplingMethod.toCoverage == null) + if (downsamplingMethod.toCoverage == null) throw new UserException.BadArgumentValue("dcov", "Downsampling coverage (-dcov) must be specified when downsampling by sample"); this.targetCoverage = downsamplingMethod.toCoverage; break; @@ -529,10 +572,10 @@ public ReadStateManager(Iterator source, DownsamplingMethod downsampl this.targetCoverage = Integer.MAX_VALUE; } - Map readSelectors = new HashMap(); - for(final String sample: samples) { - readStatesBySample.put(sample,new PerSampleReadStateManager()); - readSelectors.put(sample,downsamplingMethod.type == DownsampleType.BY_SAMPLE ? new NRandomReadSelector(null,targetCoverage) : new AllReadsSelector()); + Map readSelectors = new HashMap(); + for (final String sample : samples) { + readStatesBySample.put(sample, new PerSampleReadStateManager()); + readSelectors.put(sample, downsamplingMethod.type == DownsampleType.BY_SAMPLE ? new NRandomReadSelector(null, targetCoverage) : new AllReadsSelector()); } samplePartitioner = new SamplePartitioner(readSelectors); @@ -541,6 +584,7 @@ public ReadStateManager(Iterator source, DownsamplingMethod downsampl /** * Returns a iterator over all the reads associated with the given sample. Note that remove() is implemented * for this iterator; if present, total read states will be decremented. + * * @param sample The sample. * @return Iterator over the reads associated with that sample. */ @@ -569,6 +613,7 @@ public boolean isEmpty() { /** * Retrieves the total number of reads in the manager across all samples. + * * @return Total number of reads over all samples. */ public int size() { @@ -577,6 +622,7 @@ public int size() { /** * Retrieves the total number of reads in the manager in the given sample. + * * @param sample The sample. * @return Total number of reads in the given sample. 
*/ @@ -587,6 +633,7 @@ public int size(final String sample) { /** * The extent of downsampling; basically, the furthest base out which has 'fallen * victim' to the downsampler. + * * @param sample Sample, downsampled independently. * @return Integer stop of the furthest undownsampled region. */ @@ -595,9 +642,9 @@ public int getDownsamplingExtent(final String sample) { } public SAMRecordState getFirst() { - for(final String sample: samples) { + for (final String sample : samples) { PerSampleReadStateManager reads = readStatesBySample.get(sample); - if(!reads.isEmpty()) + if (!reads.isEmpty()) return reads.peek(); } return null; @@ -608,19 +655,18 @@ public boolean hasNext() { } public void collectPendingReads() { - if(!iterator.hasNext()) + if (!iterator.hasNext()) return; - if(readStates.size() == 0) { + if (readStates.size() == 0) { int firstContigIndex = iterator.peek().getReferenceIndex(); int firstAlignmentStart = iterator.peek().getAlignmentStart(); - while(iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) { + while (iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) { samplePartitioner.submitRead(iterator.next()); } - } - else { + } else { // Fast fail in the case that the read is past the current position. - if(readIsPastCurrentPosition(iterator.peek())) + if (readIsPastCurrentPosition(iterator.peek())) return; while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) { @@ -629,7 +675,7 @@ public void collectPendingReads() { } samplePartitioner.complete(); - for(final String sample: samples) { + for (final String sample : samples) { ReadSelector aggregator = samplePartitioner.getSelectedReads(sample); Collection newReads = new ArrayList(aggregator.getSelectedReads()); @@ -638,21 +684,20 @@ public void collectPendingReads() { int numReads = statesBySample.size(); int downsamplingExtent = aggregator.getDownsamplingExtent(); - if(numReads+newReads.size()<=targetCoverage || downsamplingMethod.type==DownsampleType.NONE) { + if (numReads + newReads.size() <= targetCoverage || downsamplingMethod.type == DownsampleType.NONE) { long readLimit = aggregator.getNumReadsSeen(); - addReadsToSample(statesBySample,newReads,readLimit); + addReadsToSample(statesBySample, newReads, readLimit); statesBySample.specifyNewDownsamplingExtent(downsamplingExtent); - } - else { + } else { int[] counts = statesBySample.getCountsPerAlignmentStart(); int[] updatedCounts = new int[counts.length]; - System.arraycopy(counts,0,updatedCounts,0,counts.length); + System.arraycopy(counts, 0, updatedCounts, 0, counts.length); boolean readPruned = true; - while(numReads+newReads.size()>targetCoverage && readPruned) { + while (numReads + newReads.size() > targetCoverage && readPruned) { readPruned = false; - for(int alignmentStart=updatedCounts.length-1;numReads+newReads.size()>targetCoverage&&alignmentStart>=0;alignmentStart--) { - if(updatedCounts[alignmentStart] > 1) { + for (int alignmentStart = updatedCounts.length - 1; numReads + newReads.size() > targetCoverage && alignmentStart >= 0; alignmentStart--) { + if (updatedCounts[alignmentStart] > 1) { updatedCounts[alignmentStart]--; numReads--; readPruned = true; @@ -660,7 +705,7 @@ public void collectPendingReads() { } } - if(numReads == targetCoverage) { + if (numReads == targetCoverage) { updatedCounts[0]--; numReads--; } @@ -668,18 +713,18 @@ public void 
collectPendingReads() { BitSet toPurge = new BitSet(readStates.size()); int readOffset = 0; - for(int i = 0; i < updatedCounts.length; i++) { + for (int i = 0; i < updatedCounts.length; i++) { int n = counts[i]; int k = updatedCounts[i]; - for(Integer purgedElement: MathUtils.sampleIndicesWithoutReplacement(n,n-k)) - toPurge.set(readOffset+purgedElement); + for (Integer purgedElement : MathUtils.sampleIndicesWithoutReplacement(n, n - k)) + toPurge.set(readOffset + purgedElement); readOffset += counts[i]; } - downsamplingExtent = Math.max(downsamplingExtent,statesBySample.purge(toPurge)); - - addReadsToSample(statesBySample,newReads,targetCoverage-numReads); + downsamplingExtent = Math.max(downsamplingExtent, statesBySample.purge(toPurge)); + + addReadsToSample(statesBySample, newReads, targetCoverage - numReads); statesBySample.specifyNewDownsamplingExtent(downsamplingExtent); } } @@ -688,23 +733,25 @@ public void collectPendingReads() { /** * Add reads with the given sample name to the given hanger entry. + * * @param readStates The list of read states to add this collection of reads. - * @param reads Reads to add. Selected reads will be pulled from this source. - * @param maxReads Maximum number of reads to add. + * @param reads Reads to add. Selected reads will be pulled from this source. + * @param maxReads Maximum number of reads to add. */ private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection reads, final long maxReads) { - if(reads.isEmpty()) + if (reads.isEmpty()) return; Collection newReadStates = new LinkedList(); int readCount = 0; - for(SAMRecord read: reads) { - if(readCount < maxReads) { + for (SAMRecord read : reads) { + if (readCount < maxReads) { SAMRecordState state = new SAMRecordState(read, readInfo.generateExtendedEvents()); state.stepForwardOnGenome(); newReadStates.add(state); // TODO: What if we downsample the extended events away? - if (state.hadIndel()) hasExtendedEvents = true; + if (state.hadIndel()) + hasExtendedEvents = true; readCount++; } } @@ -735,7 +782,7 @@ public int size() { } public void specifyNewDownsamplingExtent(int downsamplingExtent) { - this.downsamplingExtent = Math.max(this.downsamplingExtent,downsamplingExtent); + this.downsamplingExtent = Math.max(this.downsamplingExtent, downsamplingExtent); } public int getDownsamplingExtent() { @@ -745,7 +792,7 @@ public int getDownsamplingExtent() { public int[] getCountsPerAlignmentStart() { int[] counts = new int[readStateCounter.size()]; int index = 0; - for(Counter counter: readStateCounter) + for (Counter counter : readStateCounter) counts[index++] = counter.getCount(); return counts; } @@ -766,7 +813,7 @@ public void remove() { wrappedIterator.remove(); Counter counter = readStateCounter.peek(); counter.decrement(); - if(counter.getCount() == 0) + if (counter.getCount() == 0) readStateCounter.remove(); } }; @@ -775,13 +822,14 @@ public void remove() { /** * Purge the given elements from the bitset. If an element in the bitset is true, purge * the corresponding read state. + * * @param elements bits from the set to purge. * @return the extent of the final downsampled read. 
*/ public int purge(final BitSet elements) { int downsamplingExtent = 0; - if(elements.isEmpty() || readStates.isEmpty()) return downsamplingExtent; + if (elements.isEmpty() || readStates.isEmpty()) return downsamplingExtent; Iterator readStateIterator = readStates.iterator(); @@ -794,22 +842,22 @@ public int purge(final BitSet elements) { int toPurge = elements.nextSetBit(0); int removedCount = 0; - while(readStateIterator.hasNext() && toPurge >= 0) { + while (readStateIterator.hasNext() && toPurge >= 0) { SAMRecordState state = readStateIterator.next(); - downsamplingExtent = Math.max(downsamplingExtent,state.getRead().getAlignmentEnd()); + downsamplingExtent = Math.max(downsamplingExtent, state.getRead().getAlignmentEnd()); - if(readIndex == toPurge) { + if (readIndex == toPurge) { readStateIterator.remove(); currentCounter.decrement(); - if(currentCounter.getCount() == 0) + if (currentCounter.getCount() == 0) counterIterator.remove(); removedCount++; - toPurge = elements.nextSetBit(toPurge+1); + toPurge = elements.nextSetBit(toPurge + 1); } readIndex++; alignmentStartCounter--; - if(alignmentStartCounter == 0 && counterIterator.hasNext()) { + if (alignmentStartCounter == 0 && counterIterator.hasNext()) { currentCounter = counterIterator.next(); alignmentStartCounter = currentCounter.getCount(); } @@ -849,12 +897,14 @@ public void decrement() { interface ReadSelector { /** * All previous selectors in the chain have allowed this read. Submit it to this selector for consideration. + * * @param read the read to evaluate. */ public void submitRead(SAMRecord read); /** * A previous selector has deemed this read unfit. Notify this selector so that this selector's counts are valid. + * * @param read the read previously rejected. */ public void notifyReadRejected(SAMRecord read); @@ -866,12 +916,14 @@ interface ReadSelector { /** * Retrieve the number of reads seen by this selector so far. + * * @return number of reads seen. */ public long getNumReadsSeen(); /** * Return the number of reads accepted by this selector so far. + * * @return number of reads selected. */ public long getNumReadsSelected(); @@ -880,12 +932,14 @@ interface ReadSelector { * Gets the locus at which the last of the downsampled reads selected by this selector ends. The value returned will be the * last aligned position from this selection to which a downsampled read aligns -- in other words, if a read is thrown out at * position 3 whose cigar string is 76M, the value of this parameter will be 78. + * * @return If any read has been downsampled, this will return the last aligned base of the longest alignment. Else, 0. */ public int getDownsamplingExtent(); /** * Get the reads selected by this selector. + * * @return collection of reads selected by this selector. 
*/ public Collection getSelectedReads(); @@ -911,7 +965,7 @@ public void submitRead(SAMRecord read) { public void notifyReadRejected(SAMRecord read) { readsSeen++; - downsamplingExtent = Math.max(downsamplingExtent,read.getAlignmentEnd()); + downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd()); } public void complete() { @@ -949,18 +1003,18 @@ class NRandomReadSelector implements ReadSelector { private final ReservoirDownsampler reservoir; private final ReadSelector chainedSelector; private long readsSeen = 0; - private int downsamplingExtent = 0; + private int downsamplingExtent = 0; public NRandomReadSelector(ReadSelector chainedSelector, long readLimit) { - this.reservoir = new ReservoirDownsampler((int)readLimit); + this.reservoir = new ReservoirDownsampler((int) readLimit); this.chainedSelector = chainedSelector; } public void submitRead(SAMRecord read) { SAMRecord displaced = reservoir.add(read); - if(displaced != null && chainedSelector != null) { + if (displaced != null && chainedSelector != null) { chainedSelector.notifyReadRejected(read); - downsamplingExtent = Math.max(downsamplingExtent,read.getAlignmentEnd()); + downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd()); } readsSeen++; } @@ -970,9 +1024,9 @@ public void notifyReadRejected(SAMRecord read) { } public void complete() { - for(SAMRecord read: reservoir.getDownsampledContents()) + for (SAMRecord read : reservoir.getDownsampledContents()) chainedSelector.submitRead(read); - if(chainedSelector != null) + if (chainedSelector != null) chainedSelector.complete(); } @@ -987,7 +1041,7 @@ public long getNumReadsSelected() { public int getDownsamplingExtent() { return downsamplingExtent; - } + } public Collection getSelectedReads() { return reservoir.getDownsampledContents(); @@ -996,7 +1050,7 @@ public Collection getSelectedReads() { public void reset() { reservoir.clear(); downsamplingExtent = 0; - if(chainedSelector != null) + if (chainedSelector != null) chainedSelector.reset(); } } @@ -1005,23 +1059,23 @@ public void reset() { * Note: stores reads by sample ID string, not by sample object */ class SamplePartitioner implements ReadSelector { - private final Map readsBySample; + private final Map readsBySample; private long readsSeen = 0; - public SamplePartitioner(Map readSelectors) { + public SamplePartitioner(Map readSelectors) { readsBySample = readSelectors; } public void submitRead(SAMRecord read) { - String sampleName = read.getReadGroup()!=null ? read.getReadGroup().getSample() : null; - if(readsBySample.containsKey(sampleName)) + String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; + if (readsBySample.containsKey(sampleName)) readsBySample.get(sampleName).submitRead(read); readsSeen++; } public void notifyReadRejected(SAMRecord read) { - String sampleName = read.getReadGroup()!=null ? read.getReadGroup().getSample() : null; - if(readsBySample.containsKey(sampleName)) + String sampleName = read.getReadGroup() != null ? 
read.getReadGroup().getSample() : null; + if (readsBySample.containsKey(sampleName)) readsBySample.get(sampleName).notifyReadRejected(read); readsSeen++; } @@ -1040,23 +1094,23 @@ public long getNumReadsSelected() { public int getDownsamplingExtent() { int downsamplingExtent = 0; - for(ReadSelector storage: readsBySample.values()) - downsamplingExtent = Math.max(downsamplingExtent,storage.getDownsamplingExtent()); + for (ReadSelector storage : readsBySample.values()) + downsamplingExtent = Math.max(downsamplingExtent, storage.getDownsamplingExtent()); return downsamplingExtent; } - + public Collection getSelectedReads() { throw new UnsupportedOperationException("Cannot directly get selected reads from a read partitioner."); } public ReadSelector getSelectedReads(String sampleName) { - if(!readsBySample.containsKey(sampleName)) + if (!readsBySample.containsKey(sampleName)) throw new NoSuchElementException("Sample name not found"); return readsBySample.get(sampleName); } public void reset() { - for(ReadSelector storage: readsBySample.values()) + for (ReadSelector storage : readsBySample.values()) storage.reset(); readsSeen = 0; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java index f098655376..bc7d5c6ca8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java @@ -90,19 +90,12 @@ public class GATKRunReport { protected static Logger logger = Logger.getLogger(GATKRunReport.class); - // the listing of the fields is somewhat important; this is the order that the simple XML will output them - @ElementList(required = true, name = "gatk_header_Information") - private List mGATKHeader; - @Element(required = false, name = "id") private final String id; @Element(required = false, name = "exception") private final ExceptionToXML mException; - @Element(required = true, name = "working_directory") - private String currentPath; - @Element(required = true, name = "start_time") private String startTime = "ND"; @@ -112,9 +105,6 @@ public class GATKRunReport { @Element(required = true, name = "run_time") private long runTime = 0; - @Element(required = true, name = "command_line") - private String cmdLine = "COULD NOT BE DETERMINED"; - @Element(required = true, name = "walker_name") private String walkerName; @@ -127,9 +117,6 @@ public class GATKRunReport { @Element(required = true, name = "max_memory") private long maxMemory; - @Element(required = true, name = "java_tmp_directory") - private String tmpDir; - @Element(required = true, name = "user_name") private String userName; @@ -145,18 +132,13 @@ public class GATKRunReport { @Element(required = true, name = "iterations") private long nIterations; - @Element(required = true, name = "reads") - private long nReads; - public enum PhoneHomeOption { /** Disable phone home */ NO_ET, /** Standard option. Writes to local repository if it can be found, or S3 otherwise */ STANDARD, /** Force output to STDOUT. For debugging only */ - STDOUT, - /** Force output to S3. 
For debugging only */ - AWS_S3 // todo -- remove me -- really just for testing purposes + STDOUT } private static final DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH.mm.ss"); @@ -174,15 +156,8 @@ public GATKRunReport(Walker walker, Exception e, GenomeAnalysisEngine engin logger.debug("Aggregating data for run report"); - mGATKHeader = CommandLineGATK.createApplicationHeader(); - currentPath = System.getProperty("user.dir"); - // what did we run? id = org.apache.commons.lang.RandomStringUtils.randomAlphanumeric(32); - try { - cmdLine = engine.createApproximateCommandLineArgumentString(engine, walker); - } catch (Exception ignore) { } - walkerName = engine.getWalkerName(walker.getClass()); svnVersion = CommandLineGATK.getVersionNumber(); @@ -193,7 +168,6 @@ public GATKRunReport(Walker walker, Exception e, GenomeAnalysisEngine engin startTime = dateFormat.format(engine.getStartTime()); runTime = (end.getTime() - engine.getStartTime().getTime()) / 1000L; // difference in seconds } - tmpDir = System.getProperty("java.io.tmpdir"); // deal with memory usage Runtime.getRuntime().gc(); // call GC so totalMemory is ~ used memory @@ -204,12 +178,11 @@ public GATKRunReport(Walker walker, Exception e, GenomeAnalysisEngine engin if ( engine.getCumulativeMetrics() != null ) { // it's possible we aborted so early that these data structures arent initialized nIterations = engine.getCumulativeMetrics().getNumIterations(); - nReads = engine.getCumulativeMetrics().getNumReadsSeen(); } // user and hostname -- information about the runner of the GATK userName = System.getProperty("user.name"); - hostName = "unknown"; // resolveHostname(); + hostName = Utils.resolveHostname(); // basic java information java = Utils.join("-", Arrays.asList(System.getProperty("java.vendor"), System.getProperty("java.version"))); @@ -239,11 +212,8 @@ public void postReport(PhoneHomeOption type) { case STDOUT: postReportToStream(System.out); break; - case AWS_S3: - postReportToAWSS3(); - break; default: - exceptDuringRunReport("BUG: unexcepted PhoneHomeOption "); + exceptDuringRunReport("BUG: unexpected PhoneHomeOption "); break; } } @@ -264,22 +234,8 @@ private void postReportToStream(OutputStream stream) { } } - /** - * Opens the destination file and writes a gzipped version of the XML report there. - * - * @param destination - * @throws IOException - */ - private void postReportToFile(File destination) throws IOException { - BufferedOutputStream out = - new BufferedOutputStream( - new GZIPOutputStream( - new FileOutputStream(destination))); - try { - postReportToStream(out); - } finally { - out.close(); - } + private final String getKey() { + return getID() + ".report.xml.gz"; } /** @@ -288,16 +244,21 @@ private void postReportToFile(File destination) throws IOException { * That is, postReport() is guarenteed not to fail for any reason. 
*/ private File postReportToLocalDisk(File rootDir) { - String filename = getID() + ".report.xml.gz"; - File file = new File(rootDir, filename); + final String filename = getKey(); + final File destination = new File(rootDir, filename); + try { - postReportToFile(file); - logger.debug("Wrote report to " + file); - return file; + final BufferedOutputStream out = new BufferedOutputStream( + new GZIPOutputStream( + new FileOutputStream(destination))); + postReportToStream(out); + out.close(); + logger.debug("Wrote report to " + destination); + return destination; } catch ( Exception e ) { // we catch everything, and no matter what eat the error exceptDuringRunReport("Couldn't read report file", e); - file.delete(); + destination.delete(); return null; } } @@ -305,42 +266,46 @@ private File postReportToLocalDisk(File rootDir) { private void postReportToAWSS3() { // modifying example code from http://jets3t.s3.amazonaws.com/toolkit/code-samples.html this.hostName = Utils.resolveHostname(); // we want to fill in the host name - File localFile = postReportToLocalDisk(new File("./")); - logger.debug("Generating GATK report to AWS S3 based on local file " + localFile); - if ( localFile != null ) { // we succeeded in creating the local file - localFile.deleteOnExit(); - try { - // stop us from printing the annoying, and meaningless, mime types warning - Logger mimeTypeLogger = Logger.getLogger(org.jets3t.service.utils.Mimetypes.class); - mimeTypeLogger.setLevel(Level.FATAL); - - // Your Amazon Web Services (AWS) login credentials are required to manage S3 accounts. These credentials - // are stored in an AWSCredentials object: - - // IAM GATK user credentials -- only right is to PutObject into GATK_Run_Report bucket - String awsAccessKey = "AKIAJXU7VIHBPDW4TDSQ"; // GATK AWS user - String awsSecretKey = "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"; // GATK AWS user - AWSCredentials awsCredentials = new AWSCredentials(awsAccessKey, awsSecretKey); - - // To communicate with S3, create a class that implements an S3Service. We will use the REST/HTTP - // implementation based on HttpClient, as this is the most robust implementation provided with JetS3t. 
-            S3Service s3Service = new RestS3Service(awsCredentials);
-
-            // Create an S3Object based on a file, with Content-Length set automatically and
-            // Content-Type set based on the file's extension (using the Mimetypes utility class)
-            S3Object fileObject = new S3Object(localFile);
-            //logger.info("Created S3Object" + fileObject);
-            //logger.info("Uploading " + localFile + " to AWS bucket");
-            S3Object s3Object = s3Service.putObject(REPORT_BUCKET_NAME, fileObject);
-            logger.debug("Uploaded to AWS: " + s3Object);
-            logger.info("Uploaded run statistics report to AWS S3");
-        } catch ( S3ServiceException e ) {
-            exceptDuringRunReport("S3 exception occurred", e);
-        } catch ( NoSuchAlgorithmException e ) {
-            exceptDuringRunReport("Couldn't calculate MD5", e);
-        } catch ( IOException e ) {
-            exceptDuringRunReport("Couldn't read report file", e);
-        }
+        final String key = getKey();
+        logger.debug("Generating GATK report to AWS S3 with key " + key);
+        try {
+            // create a byte output stream so we can capture the output as a byte[]
+            final ByteArrayOutputStream byteStream = new ByteArrayOutputStream(8096);
+            final OutputStream outputStream = new GZIPOutputStream(byteStream);
+            postReportToStream(outputStream);
+            outputStream.close();
+            final byte[] report = byteStream.toByteArray();
+
+            // stop us from printing the annoying, and meaningless, mime types warning
+            Logger mimeTypeLogger = Logger.getLogger(org.jets3t.service.utils.Mimetypes.class);
+            mimeTypeLogger.setLevel(Level.FATAL);
+
+            // Your Amazon Web Services (AWS) login credentials are required to manage S3 accounts. These credentials
+            // are stored in an AWSCredentials object:
+
+            // IAM GATK user credentials -- only right is to PutObject into GATK_Run_Report bucket
+            String awsAccessKey = "AKIAJXU7VIHBPDW4TDSQ"; // GATK AWS user
+            String awsSecretKey = "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"; // GATK AWS user
+            AWSCredentials awsCredentials = new AWSCredentials(awsAccessKey, awsSecretKey);
+
+            // To communicate with S3, create a class that implements an S3Service. We will use the REST/HTTP
+            // implementation based on HttpClient, as this is the most robust implementation provided with JetS3t.
+ S3Service s3Service = new RestS3Service(awsCredentials); + + // Create an S3Object based on a file, with Content-Length set automatically and + // Content-Type set based on the file's extension (using the Mimetypes utility class) + S3Object fileObject = new S3Object(key, report); + //logger.info("Created S3Object" + fileObject); + //logger.info("Uploading " + localFile + " to AWS bucket"); + S3Object s3Object = s3Service.putObject(REPORT_BUCKET_NAME, fileObject); + logger.debug("Uploaded to AWS: " + s3Object); + logger.info("Uploaded run statistics report to AWS S3"); + } catch ( S3ServiceException e ) { + exceptDuringRunReport("S3 exception occurred", e); + } catch ( NoSuchAlgorithmException e ) { + exceptDuringRunReport("Couldn't calculate MD5", e); + } catch ( IOException e ) { + exceptDuringRunReport("Couldn't read report file", e); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java index b72b20e0b7..b59b550e1c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java @@ -296,6 +296,10 @@ public boolean containsKey(Object primaryKey) { return primaryKeyColumn.contains(primaryKey); } + public Collection getPrimaryKeys() { + return Collections.unmodifiableCollection(primaryKeyColumn); + } + /** * Set the value for a given position in the table * diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java new file mode 100644 index 0000000000..3f24e65852 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -0,0 +1,258 @@ +package org.broadinstitute.sting.gatk.traversals; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.WalkerManager; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.providers.*; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.*; + +/** + * Created by IntelliJ IDEA. 
+ * User: rpoplin + * Date: 12/9/11 + */ + +public class TraverseActiveRegions extends TraversalEngine,LocusShardDataProvider> { + /** + * our log, which we want to capture anything from this class + */ + protected static Logger logger = Logger.getLogger(TraversalEngine.class); + + private final Queue workQueue = new LinkedList(); + private final LinkedHashSet myReads = new LinkedHashSet(); + + @Override + protected String getTraversalType() { + return "active regions"; + } + + @Override + public T traverse( final ActiveRegionWalker walker, + final LocusShardDataProvider dataProvider, + T sum) { + logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider)); + + final LocusView locusView = getLocusView( walker, dataProvider ); + final GenomeLocSortedSet initialIntervals = engine.getIntervals(); // BUGBUG: unfortunate inefficiency that needs to be removed + + final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); + final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension(); + + if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all + + int minStart = Integer.MAX_VALUE; + final ArrayList isActiveList = new ArrayList(); + GenomeLoc firstIsActiveStart = null; + + //ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); + ReferenceOrderedView referenceOrderedDataView = null; + if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) + referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider ); + else + referenceOrderedDataView = (RodLocusView)locusView; + + // We keep processing while the next reference location is within the interval + GenomeLoc prevLoc = null; + while( locusView.hasNext() ) { + final AlignmentContext locus = locusView.next(); + GenomeLoc location = locus.getLocation(); + if(prevLoc != null) { + for(int iii = prevLoc.getStart() + 1; iii < location.getStart(); iii++ ) { + final GenomeLoc fakeLoc = engine.getGenomeLocParser().createGenomeLoc(prevLoc.getContig(), iii, iii); + if( initialIntervals == null || initialIntervals.overlaps( fakeLoc ) ) { + final double isActiveProb = ( walker.presetActiveRegions == null ? 0.0 : ( walker.presetActiveRegions.overlaps(fakeLoc) ? 1.0 : 0.0 ) ); + isActiveList.add( isActiveProb ); + if( firstIsActiveStart == null ) { + firstIsActiveStart = fakeLoc; + } + } + } + } + + dataProvider.getShard().getReadMetrics().incrementNumIterations(); + + // create reference context. Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). + final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); + + // Call the walkers isActive function for this locus and add them to the list to be integrated later + if( initialIntervals == null || initialIntervals.overlaps( location ) ) { + final double isActiveProb = ( walker.presetActiveRegions == null ? walker.isActive( tracker, refContext, locus ) + : ( walker.presetActiveRegions.overlaps(location) ? 
1.0 : 0.0 ) ); + isActiveList.add( isActiveProb ); + if( firstIsActiveStart == null ) { + firstIsActiveStart = location; + } + } + + // Grab all the previously unseen reads from this pileup and add them to the massive read list + for( final PileupElement p : locus.getBasePileup() ) { + final GATKSAMRecord read = p.getRead(); + if( !myReads.contains(read) ) { + myReads.add(read); + } + } + + // If this is the last pileup for this shard calculate the minimum alignment start so that we know + // which active regions in the work queue are now safe to process + if( !locusView.hasNext() ) { + for( final PileupElement p : locus.getBasePileup() ) { + final GATKSAMRecord read = p.getRead(); + if( !myReads.contains(read) ) { + myReads.add(read); + } + if( read.getAlignmentStart() < minStart ) { minStart = read.getAlignmentStart(); } + } + } + prevLoc = location; + printProgress(dataProvider.getShard(), locus.getLocation()); + } + + // Take the individual isActive calls and integrate them into contiguous active regions and + // add these blocks of work to the work queue + final ArrayList activeRegions = integrateActiveList( isActiveList, firstIsActiveStart, activeRegionExtension, walker.presetActiveRegions != null ); + logger.debug("Integrated " + isActiveList.size() + " isActive calls into " + activeRegions.size() + " regions." ); + if( walker.activeRegionOutStream == null ) { + workQueue.addAll( activeRegions ); + } else { // Just want to output the active regions to a file, not actually process them + for( final ActiveRegion activeRegion : activeRegions ) { + if( activeRegion.isActive ) { + walker.activeRegionOutStream.println( activeRegion.getLocation() ); + } + } + } + + // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them + while( workQueue.peek() != null && (workQueue.peek().getExtendedLoc().getStop() < minStart || !workQueue.peek().getExtendedLoc().getContig().equals(dataProvider.getLocus().getContig())) ) { + final ActiveRegion activeRegion = workQueue.remove(); + sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker ); + } + } + + return sum; + } + + // Special function called in LinearMicroScheduler to empty out the work queue. 
Ugly for now but will be cleaned up when we push this functionality more into the engine
+    public T endTraversal( final Walker walker, T sum) {
+        while( workQueue.peek() != null ) {
+            final ActiveRegion activeRegion = workQueue.remove();
+            sum = processActiveRegion( activeRegion, myReads, workQueue, sum, (ActiveRegionWalker) walker );
+        }
+
+        return sum;
+    }
+
+    private T processActiveRegion( final ActiveRegion activeRegion, final LinkedHashSet<GATKSAMRecord> reads, final Queue<ActiveRegion> workQueue, final T sum, final ActiveRegionWalker<M,T> walker ) {
+        final ArrayList<GATKSAMRecord> placedReads = new ArrayList<GATKSAMRecord>();
+        for( final GATKSAMRecord read : reads ) {
+            final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read );
+            if( activeRegion.getLocation().overlapsP( readLoc ) ) {
+                // The region with the highest amount of overlap is chosen as the primary region for the read (tie breaking is done as right most region)
+                long maxOverlap = activeRegion.getLocation().sizeOfOverlap( readLoc );
+                ActiveRegion bestRegion = activeRegion;
+                for( final ActiveRegion otherRegionToTest : workQueue ) {
+                    if( otherRegionToTest.getLocation().sizeOfOverlap(readLoc) >= maxOverlap ) {
+                        maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( readLoc );
+                        bestRegion = otherRegionToTest;
+                    }
+                }
+                bestRegion.add( read );
+
+                // The read is also added to all other regions in which it overlaps but marked as non-primary
+                if( walker.wantsNonPrimaryReads() ) {
+                    if( !bestRegion.equals(activeRegion) ) {
+                        activeRegion.add( read );
+                    }
+                    for( final ActiveRegion otherRegionToTest : workQueue ) {
+                        if( !bestRegion.equals(otherRegionToTest) && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) {
+                            otherRegionToTest.add( read );
+                        }
+                    }
+                }
+                placedReads.add( read );
+            } else if( activeRegion.getExtendedLoc().overlapsP( readLoc ) && walker.wantsNonPrimaryReads() ) {
+                activeRegion.add( read );
+            }
+        }
+        reads.removeAll( placedReads ); // remove all the reads which have been placed into their active region
+
+        logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc());
+        final M x = walker.map( activeRegion, null );
+        return walker.reduce( x, sum );
+    }
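The ">= maxOverlap" comparison above is what implements the right-most tie break described in the comment; a stand-alone sketch (invented method and data, not patch code), with overlap sizes indexed left to right along the genome:

    static int pickPrimaryRegion(final int[] overlapSizes) {
        int bestIndex = 0;
        int maxOverlap = overlapSizes[0];
        for ( int i = 1; i < overlapSizes.length; i++ ) {
            // '>=' lets a later (right-most) region with an equal overlap win;
            // '>' would keep the left-most one instead
            if ( overlapSizes[i] >= maxOverlap ) {
                maxOverlap = overlapSizes[i];
                bestIndex = i;
            }
        }
        return bestIndex; // pickPrimaryRegion(new int[]{30, 50, 50}) == 2, not 1
    }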
+
+    /**
+     * Gets the best view of loci for this walker given the available data.
+     * @param walker walker to interrogate.
+     * @param dataProvider Data with which to drive the locus view.
+     * @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal.
+     */
+    private LocusView getLocusView( final Walker walker, final LocusShardDataProvider dataProvider ) {
+        final DataSource dataSource = WalkerManager.getWalkerDataSource(walker);
+        if( dataSource == DataSource.READS )
+            return new CoveredLocusView(dataProvider);
+        else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers )
+            return new AllLocusView(dataProvider);
+        else if( dataSource == DataSource.REFERENCE_ORDERED_DATA )
+            return new RodLocusView(dataProvider);
+        else
+            throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource);
+    }
+
+    // band-pass filter the list of isActive probabilities and turn into active regions
+    private ArrayList<ActiveRegion> integrateActiveList( final ArrayList<Double> activeList, final GenomeLoc firstIsActiveStart, final int activeRegionExtension, final boolean presetRegions ) {
+
+        final double ACTIVE_PROB_THRESHOLD = 0.2; // BUGBUG: needs to be set-able by the walker author
+        final ArrayList<ActiveRegion> returnList = new ArrayList<ActiveRegion>();
+        if( activeList.size() == 0 ) {
+            return returnList;
+        } else if( activeList.size() == 1 ) {
+            returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart(), firstIsActiveStart.getStart()),
+                    activeList.get(0) > ACTIVE_PROB_THRESHOLD, engine.getGenomeLocParser(), activeRegionExtension ) );
+            return returnList;
+        } else {
+            final Double[] activeProbArray = activeList.toArray(new Double[activeList.size()]);
+            final double[] filteredProbArray = new double[activeProbArray.length];
+            final int FILTER_SIZE = ( presetRegions ? 0 : 50 ); // BUGBUG: needs to be set-able by the walker author
+            final int MAX_ACTIVE_REGION = ( presetRegions ? 16001 : 425 ); // BUGBUG: needs to be set-able by the walker author
+            for( int iii = 0; iii < activeProbArray.length; iii++ ) {
+                double maxVal = 0;
+                for( int jjj = Math.max(0, iii-FILTER_SIZE); jjj < Math.min(activeList.size(), iii+FILTER_SIZE+1); jjj++ ) {
+                    if( activeProbArray[jjj] > maxVal ) { maxVal = activeProbArray[jjj]; }
+                }
+                filteredProbArray[iii] = maxVal;
+            }
+
+            boolean curStatus = filteredProbArray[0] > ACTIVE_PROB_THRESHOLD;
+            int curStart = 0;
+            for(int iii = 1; iii < filteredProbArray.length; iii++ ) {
+                final boolean thisStatus = filteredProbArray[iii] > ACTIVE_PROB_THRESHOLD;
+                if( curStatus != thisStatus || (iii-curStart) > MAX_ACTIVE_REGION ) {
+                    returnList.add( new ActiveRegion(
+                            engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart() + curStart, firstIsActiveStart.getStart() + (iii-1)),
+                            curStatus, engine.getGenomeLocParser(), activeRegionExtension ) );
+                    curStatus = thisStatus;
+                    curStart = iii;
+                }
+            }
+            if( curStart != filteredProbArray.length-1 ) {
+                returnList.add( new ActiveRegion(
+                        engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart() + curStart, firstIsActiveStart.getStart() + (filteredProbArray.length-1)),
+                        curStatus, engine.getGenomeLocParser(), activeRegionExtension ) );
+            }
+            return returnList;
+        }
+    }
+}
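A stand-alone restatement of the integration step above (invented method and values, not patch code): a sliding max filter widens isolated active sites, then thresholding yields the contiguous active/inactive runs that become regions.

    static boolean[] smoothAndThreshold(final double[] isActive, final int filterSize, final double threshold) {
        final boolean[] active = new boolean[isActive.length];
        for ( int i = 0; i < isActive.length; i++ ) {
            // same window as integrateActiveList: max over [i - filterSize, i + filterSize]
            double maxVal = 0.0;
            for ( int j = Math.max(0, i - filterSize); j < Math.min(isActive.length, i + filterSize + 1); j++ ) {
                maxVal = Math.max(maxVal, isActive[j]);
            }
            active[i] = maxVal > threshold;
        }
        return active;
    }
    // smoothAndThreshold(new double[]{0, 0, 1.0, 0, 0}, 1, 0.2) gives {false, true, true, true, false}:
    // the lone active site is widened by one locus on each side before region boundaries are drawn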
diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java
index d99e7c3539..1d14a7f35d 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java
@@ -102,7 +102,9 @@ public T traverse( LocusWalker<M,T> walker,
     }
 
     /**
-     * Gets the best view of loci for this walker given the available data.
+     * Gets the best view of loci for this walker given the available data. The view will function as a 'trigger track'
+     * of sorts, providing a consistent interface so that TraverseLoci doesn't need to be reimplemented for any new datatype
+     * that comes along.
      * @param walker walker to interrogate.
      * @param dataProvider Data with which to drive the locus view.
      * @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal.
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java
new file mode 100644
index 0000000000..bb007893c9
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java
@@ -0,0 +1,19 @@
+package org.broadinstitute.sting.gatk.walkers;
+
+import java.lang.annotation.Documented;
+import java.lang.annotation.Inherited;
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+
+/**
+ * Describes the size of the buffer region that is added to each active region when pulling in covered reads.
+ * User: rpoplin
+ * Date: 1/18/12
+ */
+@Documented
+@Inherited
+@Retention(RetentionPolicy.RUNTIME)
+
+public @interface ActiveRegionExtension {
+    public int extension() default 0;
+}
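A sketch of how the annotation is consumed at runtime (the annotated class is hypothetical; the reflection call is the same one TraverseActiveRegions performs on the walker's class):

    @ActiveRegionExtension(extension = 100)
    static class HypotheticalRegionWalker {}

    static int extensionOf(final Class<?> walkerClass) {
        // RetentionPolicy.RUNTIME keeps the value visible to reflection, and @Inherited
        // means subclasses fall back to the @ActiveRegionExtension(extension=50) declared
        // on ActiveRegionWalker below unless they re-annotate
        return walkerClass.getAnnotation(ActiveRegionExtension.class).extension();
    }
    // extensionOf(HypotheticalRegionWalker.class) == 100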
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java
new file mode 100644
index 0000000000..6403f15a2f
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java
@@ -0,0 +1,90 @@
+package org.broadinstitute.sting.gatk.walkers;
+
+import net.sf.picard.reference.IndexedFastaSequenceFile;
+import org.broad.tribble.Feature;
+import org.broadinstitute.sting.commandline.Input;
+import org.broadinstitute.sting.commandline.IntervalBinding;
+import org.broadinstitute.sting.commandline.Output;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter;
+import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckFilter;
+import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter;
+import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.GenomeLocSortedSet;
+import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
+import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
+import org.broadinstitute.sting.utils.interval.IntervalSetRule;
+import org.broadinstitute.sting.utils.interval.IntervalUtils;
+
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Base class for all the Active Region Walkers.
+ * User: rpoplin
+ * Date: 12/7/11
+ */
+
+@By(DataSource.READS)
+@Requires({DataSource.READS, DataSource.REFERENCE_BASES})
+@PartitionBy(PartitionType.READ)
+@ActiveRegionExtension(extension=50)
+@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class})
+public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
+
+    @Output(fullName="activeRegionOut", shortName="ARO", doc="Output the active region to this interval list file", required = false)
+    public PrintStream activeRegionOutStream = null;
+
+    @Input(fullName="activeRegionIn", shortName="AR", doc="Use this interval list file as the active regions to process", required = false)
+    protected List<IntervalBinding<Feature>> activeRegionBindings = null;
+
+    public GenomeLocSortedSet presetActiveRegions = null;
+
+    @Override
+    public void initialize() {
+        if( activeRegionBindings == null ) { return; }
+        List<GenomeLoc> allIntervals = new ArrayList<GenomeLoc>(0);
+        for ( IntervalBinding<Feature> intervalBinding : activeRegionBindings ) {
+            List<GenomeLoc> intervals = intervalBinding.getIntervals(this.getToolkit());
+
+            if ( intervals.isEmpty() ) {
+                logger.warn("The interval file " + intervalBinding.getSource() + " contains no intervals that could be parsed.");
+            }
+
+            allIntervals = IntervalUtils.mergeListsBySetOperator(intervals, allIntervals, IntervalSetRule.UNION);
+        }
+
+        presetActiveRegions = IntervalUtils.sortAndMergeIntervals(this.getToolkit().getGenomeLocParser(), allIntervals, IntervalMergingRule.ALL);
+    }
+
+    // Do we actually want to operate on the context?
+    public boolean filter(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
+        return true; // We are keeping all the reads
+    }
+
+    public boolean wantsNonPrimaryReads() {
+        return false;
+    }
+
+    // Determine probability of active status over the AlignmentContext
+    public abstract double isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context);
+
+    // Map over the ActiveRegion
+    public abstract MapType map(final ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker);
+
+    public final GenomeLocSortedSet extendIntervals( final GenomeLocSortedSet intervals, final GenomeLocParser genomeLocParser, IndexedFastaSequenceFile reference ) {
+        final int activeRegionExtension = this.getClass().getAnnotation(ActiveRegionExtension.class).extension();
+        final List<GenomeLoc> allIntervals = new ArrayList<GenomeLoc>();
+        for( final GenomeLoc interval : intervals.toList() ) {
+            final int start = Math.max( 1, interval.getStart() - activeRegionExtension );
+            final int stop = Math.min( reference.getSequenceDictionary().getSequence(interval.getContig()).getSequenceLength(), interval.getStop() + activeRegionExtension );
+            allIntervals.add( genomeLocParser.createGenomeLoc(interval.getContig(), start, stop) );
+        }
+        return IntervalUtils.sortAndMergeIntervals(genomeLocParser, allIntervals, IntervalMergingRule.ALL);
+    }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java
index ac69738d3d..0702b08c13 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java
@@ -30,18 +30,19 @@
 import net.sf.samtools.SAMRecord;
 import org.broadinstitute.sting.commandline.Argument;
 import org.broadinstitute.sting.commandline.Output;
+import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
+import
 org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
 import org.broadinstitute.sting.utils.SampleUtils;
 import org.broadinstitute.sting.utils.baq.BAQ;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
 
 import java.io.File;
 import java.util.Collection;
+import java.util.Random;
 import java.util.Set;
 import java.util.TreeSet;
 
-import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
-import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
-import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
-
 /**
  * Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear in the input file.
  *
@@ -70,12 +71,21 @@
  *   -I input2.bam \
  *   --read_filter MappingQualityZero
  *
+ * // Prints the first 2000 reads in the BAM file
  * java -Xmx2g -jar GenomeAnalysisTK.jar \
  *   -R ref.fasta \
  *   -T PrintReads \
  *   -o output.bam \
  *   -I input.bam \
  *   -n 2000
+ *
+ * // Downsamples BAM file to 25%
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T PrintReads \
+ *   -o output.bam \
+ *   -I input.bam \
+ *   -ds 0.25
 *
 *
 */
@@ -95,9 +105,18 @@ public class PrintReadsWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
     @Argument(fullName = "platform", shortName = "platform", doc="Exclude all reads with this platform from the output", required = false)
     String platform = null;
 
+    /**
+     * Only prints the first n reads of the file
+     */
     @Argument(fullName = "number", shortName = "n", doc="Print the first n reads from the file, discarding the rest", required = false)
     int nReadsToPrint = -1;
 
+    /**
+     * Downsamples the bam file by the given ratio, printing only approximately the given percentage of reads. The downsampling is balanced (over the entire coverage)
+     */
+    @Argument(fullName = "downsample_coverage", shortName = "ds", doc="Downsample BAM to desired coverage", required = false)
+    public double downsampleRatio = 1.0;
+
     /**
      * Only reads from samples listed in the provided file(s) will be included in the output.
      */
@@ -112,6 +131,8 @@ public class PrintReadsWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
     private TreeSet<String> samplesToChoose = new TreeSet<String>();
     private boolean SAMPLES_SPECIFIED = false;
 
+
+    Random random;
 
     /**
      * The initialize function.
@@ -132,13 +153,15 @@ public void initialize() {
         if(!samplesToChoose.isEmpty()) {
             SAMPLES_SPECIFIED = true;
         }
+
+        random = GenomeAnalysisEngine.getRandomGenerator();
     }
 
     /**
      * The reads filter function.
      *
-     * @param ref  the reference bases that correspond to our read, if a reference was provided
+     * @param ref the reference bases that correspond to our read, if a reference was provided
      * @param read the read itself, as a SAMRecord
      * @return true if the read passes the filter, false if it doesn't
     */
@@ -177,13 +200,14 @@ else if (nReadsToPrint > 0) {
             nReadsToPrint--; // n > 0 means there are still reads to be printed.
         }
 
-        return true;
-    }
+        // if downsample option is turned off (= 1) then don't waste time getting the next random number.
+        return (downsampleRatio == 1 || random.nextDouble() < downsampleRatio);
+    }
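The new -ds test above is a per-read Bernoulli trial; a stand-alone sketch of the expected behavior (invented method name, plain java.util.Random standing in for the engine's shared generator):

    static int countKeptReads(final Random random, final int totalReads, final double downsampleRatio) {
        int kept = 0;
        for ( int i = 0; i < totalReads; i++ ) {
            // same test as filter() above; the == 1 short-circuit skips the RNG when -ds is unset
            if ( downsampleRatio == 1 || random.nextDouble() < downsampleRatio )
                kept++;
        }
        return kept; // expectation is totalReads * downsampleRatio, spread uniformly over the file
    }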
     /**
      * The reads map function.
      *
-     * @param ref  the reference bases that correspond to our read, if a reference was provided
+     * @param ref the reference bases that correspond to our read, if a reference was provided
      * @param read the read itself, as a SAMRecord
      * @return the read itself
      */
@@ -194,6 +218,7 @@ public SAMRecord map( ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTrac
     /**
      * reduceInit is called once before any calls to the map function. We use it here to setup the output
      * bam file, if it was specified on the command line
+     *
      * @return SAMFileWriter, set to the BAM output file if the command line option was set, null otherwise
      */
     public SAMFileWriter reduceInit() {
@@ -202,7 +227,8 @@ public SAMFileWriter reduceInit() {
 
     /**
      * given a read and an output location, reduce by emitting the read
-     * @param read the read itself
+     *
+     * @param read   the read itself
      * @param output the output source
      * @return the SAMFileWriter, so that the next reduce can emit to the same source
      */
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java
index 312b505ec1..97a4ac4680 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java
@@ -22,16 +22,16 @@ public class BaseQualityRankSumTest extends RankSumTest {
     public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("BaseQRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities")); }
 
-    protected void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals) {
+    protected void fillQualsFromPileup(byte ref, List<Byte> alts, ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals) {
         for ( final PileupElement p : pileup ) {
             if( isUsableBase(p) ) {
-                if ( p.getBase() == ref ) {
+                if ( p.getBase() == ref )
                     refQuals.add((double)p.getQual());
-                } else if ( p.getBase() == alt ) {
+                else if ( alts.contains(p.getBase()) )
                     altQuals.add((double)p.getQual());
-                }
             }
         }
+    }
 
     protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals) {
         // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele ?
@@ -57,8 +57,6 @@ protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List<Double> re
                     refQuals.add(-10.0*refLikelihood);
                 else if (altLikelihood > refLikelihood + INDEL_LIKELIHOOD_THRESH)
                     altQuals.add(-10.0*altLikelihood);
-
-
         }
     }
 }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java
index c4025a25c2..6a825cba79 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java
@@ -54,15 +54,15 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
     private static final double MIN_PVALUE = 1E-320;
 
     public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
-        if ( !
 vc.isVariant() || vc.isFiltered() )
+        if ( !vc.isVariant() )
             return null;
 
         int[][] table;
 
-        if (vc.isBiallelic() && vc.isSNP())
-            table = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAlternateAllele(0));
-        else if (vc.isIndel() || vc.isMixed()) {
-            table = getIndelContingencyTable(stratifiedContexts, vc);
+        if ( vc.isSNP() )
+            table = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount());
+        else if ( vc.isIndel() || vc.isMixed() ) {
+            table = getIndelContingencyTable(stratifiedContexts);
             if (table == null)
                 return null;
         }
@@ -73,7 +73,6 @@ else if (vc.isIndel() || vc.isMixed()) {
         if ( pvalue == null )
             return null;
 
-        // use Math.abs to prevent -0's
         Map<String, Object> map = new HashMap<String, Object>();
         map.put(FS, String.format("%.3f", QualityUtils.phredScaleErrorRate(pvalue)));
         return map;
@@ -206,7 +205,7 @@ private static int[][] getSNPContingencyTable(Map<String, AlignmentContext> stra
         for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
             for (PileupElement p : sample.getValue().getBasePileup()) {
-                if ( p.isDeletion() || p.isReducedRead() ) // ignore deletions and reduced reads
+                if ( p.isDeletion() || p.getRead().isReducedRead() ) // ignore deletions and reduced reads
                     continue;
 
                 if ( p.getRead().getMappingQuality() < 20 || p.getQual() < 20 )
@@ -235,7 +234,7 @@ private static int[][] getSNPContingencyTable(Map<String, AlignmentContext> stra
      *           allele2   #       #
      * @return a 2x2 contingency table
      */
-    private static int[][] getIndelContingencyTable(Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
+    private static int[][] getIndelContingencyTable(Map<String, AlignmentContext> stratifiedContexts) {
         final double INDEL_LIKELIHOOD_THRESH = 0.3;
         final HashMap<PileupElement, LinkedHashMap<Allele, Double>> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap();
 
@@ -259,7 +258,7 @@ else if (context.hasBasePileup())
                 continue;
 
             for (final PileupElement p: pileup) {
-                if ( p.isReducedRead() ) // ignore reduced reads
+                if ( p.getRead().isReducedRead() ) // ignore reduced reads
                     continue;
 
                 if ( p.getRead().getMappingQuality() < 20)
                     continue;
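To make the scale of the emitted FS value concrete (a restatement of the two map.put lines above, assuming QualityUtils.phredScaleErrorRate is the usual phred conversion; not new behavior from this patch):

    static double fisherStrandPhred(final double pvalue) {
        // FS = -10 * log10(p); the MIN_PVALUE clamp above keeps log10 away from zero.
        // p = 0.05 maps to an FS of about 13.0; stronger strand bias means smaller p and larger FS
        return -10.0 * Math.log10(Math.max(pvalue, 1E-320));
    }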
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java
index 551f8e2cf4..f323a7be2d 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java
@@ -24,7 +24,6 @@
 package org.broadinstitute.sting.gatk.walkers.annotator;
 
-import net.sf.samtools.SAMRecord;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@@ -43,6 +42,7 @@
 import org.broadinstitute.sting.utils.pileup.PileupElement;
 import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
 import org.broadinstitute.sting.utils.sam.AlignmentUtils;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
 import org.broadinstitute.sting.utils.variantcontext.Allele;
 import org.broadinstitute.sting.utils.variantcontext.Genotype;
 import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@@ -52,8 +52,7 @@
 /**
  * Consistency of the site with two (and only two) segregating haplotypes. Higher scores
  * are indicative of regions with bad alignments, often leading to artifactual SNP and indel calls.
- * Note that the Haplotype Score is only calculated for sites with read coverage; also, for SNPs, the
- * site must be bi-allelic.
+ * Note that the Haplotype Score is only calculated for sites with read coverage.
  */
 public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnotation {
     private final static boolean DEBUG = false;
@@ -62,15 +61,12 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
     private final static char REGEXP_WILDCARD = '.';
 
     public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
-        if (stratifiedContexts.size() == 0 ) // size 0 means that call was made by someone else and we have no data here
-            return null;
-
-        if (vc.isSNP() && !vc.isBiallelic())
+        if (stratifiedContexts.size() == 0) // size 0 means that call was made by someone else and we have no data here
             return null;
 
         final AlignmentContext context = AlignmentContextUtils.joinContexts(stratifiedContexts.values());
 
-        final int contextWingSize = Math.min(((int)ref.getWindow().size() - 1)/2, MIN_CONTEXT_WING_SIZE);
+        final int contextWingSize = Math.min((ref.getWindow().size() - 1) / 2, MIN_CONTEXT_WING_SIZE);
         final int contextSize = contextWingSize * 2 + 1;
 
         final int locus = ref.getLocus().getStart() + (ref.getLocus().getStop() - ref.getLocus().getStart()) / 2;
@@ -84,14 +80,14 @@ else if (context.hasBasePileup())
         if (pileup == null)
             return null;
-
+
         final List<Haplotype> haplotypes = computeHaplotypes(pileup, contextSize, locus, vc);
 
-        final MathUtils.RunningAverage scoreRA = new MathUtils.RunningAverage();
+        final MathUtils.RunningAverage scoreRA = new MathUtils.RunningAverage();
         if (haplotypes != null) {
-            for ( final Genotype genotype : vc.getGenotypes()) {
+            for (final Genotype genotype : vc.getGenotypes()) {
                 final AlignmentContext thisContext = stratifiedContexts.get(genotype.getSampleName());
-                if ( thisContext != null ) {
+                if (thisContext != null) {
                     final ReadBackedPileup thisPileup;
                     if (thisContext.hasExtendedEventPileup())
                         thisPileup = thisContext.getExtendedEventPileup();
@@ -102,14 +98,13 @@ else if (thisContext.hasBasePileup())
                     if (thisPileup != null) {
                         if (vc.isSNP())
-                            scoreRA.add( scoreReadsAgainstHaplotypes(haplotypes, thisPileup, contextSize, locus) ); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense
+                            scoreRA.add(scoreReadsAgainstHaplotypes(haplotypes, thisPileup, contextSize, locus)); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense
                         else if (vc.isIndel() || vc.isMixed()) {
                             Double d = scoreIndelsAgainstHaplotypes(thisPileup);
                             if (d == null)
                                 return null;
-                            scoreRA.add( d ); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense
-                        }
-                        else
+                            scoreRA.add(d); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense
+                        } else
                             return null;
                     }
                 }
@@ -122,12 +117,12 @@ else if (vc.isIndel() || vc.isMixed()) {
         return map;
     }
 
-    private class HaplotypeComparator implements Comparator<Haplotype>{
+    private class HaplotypeComparator implements Comparator<Haplotype> {
 
         public int compare(Haplotype a, Haplotype b) {
             if (a.getQualitySum() < b.getQualitySum())
                 return 1;
-            if (a.getQualitySum() > b.getQualitySum()){
+            if (a.getQualitySum() > b.getQualitySum()) {
                 return -1;
             }
             return 0;
@@ -137,39 +132,38 @@ public int compare(Haplotype a, Haplotype b) {
     private List<Haplotype> computeHaplotypes(final ReadBackedPileup pileup, final int contextSize, final int locus, final VariantContext vc) {
         // Compute all
possible haplotypes consistent with current pileup - int haplotypesToCompute = vc.getAlternateAlleles().size()+1; + int haplotypesToCompute = vc.getAlternateAlleles().size() + 1; final PriorityQueue candidateHaplotypeQueue = new PriorityQueue(100, new HaplotypeComparator()); final PriorityQueue consensusHaplotypeQueue = new PriorityQueue(MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER, new HaplotypeComparator()); - for ( final PileupElement p : pileup ) { + for (final PileupElement p : pileup) { final Haplotype haplotypeFromRead = getHaplotypeFromRead(p, contextSize, locus); candidateHaplotypeQueue.add(haplotypeFromRead); } // Now that priority queue has been built with all reads at context, we need to merge and find possible segregating haplotypes Haplotype elem; - while ((elem = candidateHaplotypeQueue.poll()) != null) { + while ((elem = candidateHaplotypeQueue.poll()) != null) { boolean foundHaplotypeMatch = false; Haplotype lastCheckedHaplotype = null; - for ( final Haplotype haplotypeFromList : consensusHaplotypeQueue ) { + for (final Haplotype haplotypeFromList : consensusHaplotypeQueue) { final Haplotype consensusHaplotype = getConsensusHaplotype(elem, haplotypeFromList); - if (consensusHaplotype != null) { + if (consensusHaplotype != null) { foundHaplotypeMatch = true; if (consensusHaplotype.getQualitySum() > haplotypeFromList.getQualitySum()) { consensusHaplotypeQueue.remove(haplotypeFromList); consensusHaplotypeQueue.add(consensusHaplotype); } break; - } - else { + } else { lastCheckedHaplotype = haplotypeFromList; } } if (!foundHaplotypeMatch && consensusHaplotypeQueue.size() < MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER) { consensusHaplotypeQueue.add(elem); - } else if (!foundHaplotypeMatch && lastCheckedHaplotype != null && elem.getQualitySum() > lastCheckedHaplotype.getQualitySum() ) { + } else if (!foundHaplotypeMatch && lastCheckedHaplotype != null && elem.getQualitySum() > lastCheckedHaplotype.getQualitySum()) { consensusHaplotypeQueue.remove(lastCheckedHaplotype); consensusHaplotypeQueue.add(elem); } @@ -180,12 +174,14 @@ private List computeHaplotypes(final ReadBackedPileup pileup, final i // The consensus haplotypes are in a quality-ordered priority queue, so the best haplotypes are just the ones at the front of the queue final Haplotype haplotype1 = consensusHaplotypeQueue.poll(); - Listhlist = new ArrayList(); + List hlist = new ArrayList(); hlist.add(new Haplotype(haplotype1.getBases(), 60)); - for (int k=1; k < haplotypesToCompute; k++) { + for (int k = 1; k < haplotypesToCompute; k++) { Haplotype haplotype2 = consensusHaplotypeQueue.poll(); - if(haplotype2 == null ) { haplotype2 = haplotype1; } // Sometimes only the reference haplotype can be found + if (haplotype2 == null) { + haplotype2 = haplotype1; + } // Sometimes only the reference haplotype can be found hlist.add(new Haplotype(haplotype2.getBases(), 20)); } return hlist; @@ -194,36 +190,43 @@ private List computeHaplotypes(final ReadBackedPileup pileup, final i } private Haplotype getHaplotypeFromRead(final PileupElement p, final int contextSize, final int locus) { - final SAMRecord read = p.getRead(); + final GATKSAMRecord read = p.getRead(); int readOffsetFromPileup = p.getOffset(); final byte[] haplotypeBases = new byte[contextSize]; - Arrays.fill(haplotypeBases, (byte)REGEXP_WILDCARD); + Arrays.fill(haplotypeBases, (byte) REGEXP_WILDCARD); final double[] baseQualities = new double[contextSize]; Arrays.fill(baseQualities, 0.0); byte[] readBases = read.getReadBases(); - readBases = 
AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readBases); // Adjust the read bases based on the Cigar string + readBases = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), readBases); // Adjust the read bases based on the Cigar string byte[] readQuals = read.getBaseQualities(); - readQuals = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readQuals); // Shift the location of the qual scores based on the Cigar string + readQuals = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), readQuals); // Shift the location of the qual scores based on the Cigar string - readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), readOffsetFromPileup, p.getRead().getAlignmentStart(), locus); - final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1)/2; + readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(read.getCigar(), p, read.getAlignmentStart(), locus); + final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1) / 2; - for (int i = 0; i < contextSize; i++ ) { + for (int i = 0; i < contextSize; i++) { final int baseOffset = i + baseOffsetStart; - if ( baseOffset < 0 ) { + if (baseOffset < 0) { continue; } - if ( baseOffset >= readBases.length ) { + if (baseOffset >= readBases.length) { break; } - if( readQuals[baseOffset] == PileupElement.DELETION_BASE) { readQuals[baseOffset] = PileupElement.DELETION_QUAL; } - if( !BaseUtils.isRegularBase(readBases[baseOffset]) ) { readBases[baseOffset] = (byte)REGEXP_WILDCARD; readQuals[baseOffset] = (byte) 0; } // N's shouldn't be treated as distinct bases - readQuals[baseOffset] = (byte)Math.min((int)readQuals[baseOffset], p.getMappingQual()); - if( ((int)readQuals[baseOffset]) < 5 ) { readQuals[baseOffset] = (byte) 0; } // quals less than 5 are used as codes and don't have actual probabilistic meaning behind them + if (readQuals[baseOffset] == PileupElement.DELETION_BASE) { + readQuals[baseOffset] = PileupElement.DELETION_QUAL; + } + if (!BaseUtils.isRegularBase(readBases[baseOffset])) { + readBases[baseOffset] = (byte) REGEXP_WILDCARD; + readQuals[baseOffset] = (byte) 0; + } // N's shouldn't be treated as distinct bases + readQuals[baseOffset] = (byte) Math.min((int) readQuals[baseOffset], p.getMappingQual()); + if (((int) readQuals[baseOffset]) < 5) { + readQuals[baseOffset] = (byte) 0; + } // quals less than 5 are used as codes and don't have actual probabilistic meaning behind them haplotypeBases[i] = readBases[baseOffset]; - baseQualities[i] = (double)readQuals[baseOffset]; + baseQualities[i] = (double) readQuals[baseOffset]; } return new Haplotype(haplotypeBases, baseQualities); @@ -238,7 +241,7 @@ private Haplotype getConsensusHaplotype(final Haplotype haplotypeA, final Haplot } byte chA, chB; - final byte wc = (byte)REGEXP_WILDCARD; + final byte wc = (byte) REGEXP_WILDCARD; final int length = a.length; final byte[] consensusChars = new byte[length]; @@ -247,7 +250,7 @@ private Haplotype getConsensusHaplotype(final Haplotype haplotypeA, final Haplot final double[] qualsA = haplotypeA.getQuals(); final double[] qualsB = haplotypeB.getQuals(); - for (int i=0; i < length; i++) { + for (int i = 0; i < length; i++) { chA = a[i]; chB = b[i]; @@ -257,17 +260,15 @@ private Haplotype getConsensusHaplotype(final Haplotype haplotypeA, final Haplot if ((chA == wc) && (chB == wc)) { consensusChars[i] = wc; consensusQuals[i] = 0.0; - } - else if ((chA == wc)) { + } else if ((chA == wc)) { consensusChars[i] = chB; consensusQuals[i] = qualsB[i]; - } - 
else if ((chB == wc)){ + } else if ((chB == wc)) { consensusChars[i] = chA; consensusQuals[i] = qualsA[i]; } else { consensusChars[i] = chA; - consensusQuals[i] = qualsA[i]+qualsB[i]; + consensusQuals[i] = qualsA[i] + qualsB[i]; } } @@ -276,31 +277,33 @@ else if ((chB == wc)){ // calculate the haplotype scores by walking over all reads and comparing them to the haplotypes private double scoreReadsAgainstHaplotypes(final List haplotypes, final ReadBackedPileup pileup, final int contextSize, final int locus) { - if ( DEBUG ) System.out.printf("HAP1: %s%n", haplotypes.get(0)); - if ( DEBUG ) System.out.printf("HAP2: %s%n", haplotypes.get(1)); + if (DEBUG) System.out.printf("HAP1: %s%n", haplotypes.get(0)); + if (DEBUG) System.out.printf("HAP2: %s%n", haplotypes.get(1)); final ArrayList haplotypeScores = new ArrayList(); - for ( final PileupElement p : pileup ) { + for (final PileupElement p : pileup) { // Score all the reads in the pileup, even the filtered ones final double[] scores = new double[haplotypes.size()]; - for ( int i = 0; i < haplotypes.size(); i++ ) { + for (int i = 0; i < haplotypes.size(); i++) { final Haplotype haplotype = haplotypes.get(i); final double score = scoreReadAgainstHaplotype(p, contextSize, haplotype, locus); scores[i] = score; - if ( DEBUG ) { System.out.printf(" vs. haplotype %d = %f%n", i, score); } + if (DEBUG) { + System.out.printf(" vs. haplotype %d = %f%n", i, score); + } } haplotypeScores.add(scores); } double overallScore = 0.0; - for ( final double[] readHaplotypeScores : haplotypeScores ) { + for (final double[] readHaplotypeScores : haplotypeScores) { overallScore += MathUtils.arrayMin(readHaplotypeScores); } return overallScore; } - private double scoreReadAgainstHaplotype(final PileupElement p, final int contextSize, final Haplotype haplotype, final int locus ) { + private double scoreReadAgainstHaplotype(final PileupElement p, final int contextSize, final Haplotype haplotype, final int locus) { double expected = 0.0; double mismatches = 0.0; @@ -315,33 +318,35 @@ private double scoreReadAgainstHaplotype(final PileupElement p, final int contex // the chance that it is actually a mismatch is 1 - e, since any of the other 3 options would be a mismatch. // so the probability-weighted mismatch rate is sum_i ( matched ? e_i / 3 : 1 - e_i ) for i = 1 ... 
n final byte[] haplotypeBases = haplotype.getBases(); - final SAMRecord read = p.getRead(); + final GATKSAMRecord read = p.getRead(); byte[] readBases = read.getReadBases(); readBases = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readBases); // Adjust the read bases based on the Cigar string byte[] readQuals = read.getBaseQualities(); readQuals = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readQuals); // Shift the location of the qual scores based on the Cigar string int readOffsetFromPileup = p.getOffset(); - readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), readOffsetFromPileup, p.getRead().getAlignmentStart(), locus); - final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1)/2; + readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, read.getAlignmentStart(), locus); + final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1) / 2; - for ( int i = 0; i < contextSize; i++ ) { + for (int i = 0; i < contextSize; i++) { final int baseOffset = i + baseOffsetStart; - if ( baseOffset < 0 ) { + if (baseOffset < 0) { continue; } - if ( baseOffset >= readBases.length ) { + if (baseOffset >= readBases.length) { break; } final byte haplotypeBase = haplotypeBases[i]; final byte readBase = readBases[baseOffset]; - final boolean matched = ( readBase == haplotypeBase || haplotypeBase == (byte)REGEXP_WILDCARD ); + final boolean matched = (readBase == haplotypeBase || haplotypeBase == (byte) REGEXP_WILDCARD); byte qual = readQuals[baseOffset]; - if( qual == PileupElement.DELETION_BASE ) { qual = PileupElement.DELETION_QUAL; } // calcAlignmentByteArrayOffset fills the readQuals array with DELETION_BASE at deletions - qual = (byte)Math.min((int)qual, p.getMappingQual()); - if( ((int) qual) >= 5 ) { // quals less than 5 are used as codes and don't have actual probabilistic meaning behind them + if (qual == PileupElement.DELETION_BASE) { + qual = PileupElement.DELETION_QUAL; + } // calcAlignmentByteArrayOffset fills the readQuals array with DELETION_BASE at deletions + qual = (byte) Math.min((int) qual, p.getMappingQual()); + if (((int) qual) >= 5) { // quals less than 5 are used as codes and don't have actual probabilistic meaning behind them final double e = QualityUtils.qualToErrorProb(qual); expected += e; mismatches += matched ? e : 1.0 - e / 3.0; @@ -355,26 +360,27 @@ private double scoreReadAgainstHaplotype(final PileupElement p, final int contex } - private Double scoreIndelsAgainstHaplotypes(final ReadBackedPileup pileup) { final ArrayList haplotypeScores = new ArrayList(); - final HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); + final HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); - if (indelLikelihoodMap== null) + if (indelLikelihoodMap == null) return null; - for (final PileupElement p: pileup) { + for (final PileupElement p : pileup) { if (indelLikelihoodMap.containsKey(p)) { // retrieve likelihood information corresponding to this read - LinkedHashMap el = indelLikelihoodMap.get(p); + LinkedHashMap el = indelLikelihoodMap.get(p); // Score all the reads in the pileup, even the filtered ones final double[] scores = new double[el.size()]; int i = 0; - for (Allele a: el.keySet() ) { + for (Allele a : el.keySet()) { scores[i++] = -el.get(a); - if ( DEBUG ) { System.out.printf(" vs. 
haplotype %d = %f%n", i-1, scores[i-1]); } + if (DEBUG) { + System.out.printf(" vs. haplotype %d = %f%n", i - 1, scores[i - 1]); + } } haplotypeScores.add(scores); @@ -383,7 +389,7 @@ private Double scoreIndelsAgainstHaplotypes(final ReadBackedPileup pileup) { // indel likelihoods are stric log-probs, not phred scored double overallScore = 0.0; - for ( final double[] readHaplotypeScores : haplotypeScores ) { + for (final double[] readHaplotypeScores : haplotypeScores) { overallScore += MathUtils.arrayMin(readHaplotypeScores); } @@ -392,6 +398,11 @@ private Double scoreIndelsAgainstHaplotypes(final ReadBackedPileup pileup) { } - public List getKeyNames() { return Arrays.asList("HaplotypeScore"); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("HaplotypeScore", 1, VCFHeaderLineType.Float, "Consistency of the site with at most two segregating haplotypes")); } + public List getKeyNames() { + return Arrays.asList("HaplotypeScore"); + } + + public List getDescriptions() { + return Arrays.asList(new VCFInfoHeaderLine("HaplotypeScore", 1, VCFHeaderLineType.Float, "Consistency of the site with at most two segregating haplotypes")); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java index b9e6a5b2bc..e38d7d1424 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java @@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation; import org.broadinstitute.sting.utils.MendelianViolation; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -21,9 +22,9 @@ * User: chartl * Date: 9/14/11 * Time: 12:24 PM - * To change this template use File | Settings | File Templates. */ -public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation { + +public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation { private MendelianViolation mendelianViolation = null; private String motherId; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java index 9857c339f3..aa4f26ef3d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java @@ -24,12 +24,12 @@ public class MappingQualityRankSumTest extends RankSumTest { public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("MQRankSum", 1, VCFHeaderLineType.Float, "Z-score From Wilcoxon rank sum test of Alt vs. 
Ref read mapping qualities")); } - protected void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List refQuals, List altQuals) { + protected void fillQualsFromPileup(byte ref, List alts, ReadBackedPileup pileup, List refQuals, List altQuals) { for ( final PileupElement p : pileup ) { if ( isUsableBase(p) ) { if ( p.getBase() == ref ) { refQuals.add((double)p.getMappingQual()); - } else if ( p.getBase() == alt ) { + } else if ( alts.contains(p.getBase()) ) { altQuals.add((double)p.getMappingQual()); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index c5a2df1fd5..00968943d8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -12,6 +12,7 @@ import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -30,32 +31,34 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar static final boolean DEBUG = false; public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if ( stratifiedContexts.size() == 0 ) + if (stratifiedContexts.size() == 0) return null; - + final GenotypesContext genotypes = vc.getGenotypes(); - if ( genotypes == null || genotypes.size() == 0 ) + if (genotypes == null || genotypes.size() == 0) return null; final ArrayList refQuals = new ArrayList(); final ArrayList altQuals = new ArrayList(); - if (vc.isSNP() && vc.isBiallelic()) { - // todo - no current support for multiallelic snps + if ( vc.isSNP() ) { + final List altAlleles = new ArrayList(); + for ( final Allele a : vc.getAlternateAlleles() ) + altAlleles.add(a.getBases()[0]); + for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) { final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); - if ( context == null ) { + if ( context == null ) continue; - } - fillQualsFromPileup(ref.getBase(), vc.getAlternateAllele(0).getBases()[0], context.getBasePileup(), refQuals, altQuals); + + fillQualsFromPileup(ref.getBase(), altAlleles, context.getBasePileup(), refQuals, altQuals); } - } - else if (vc.isIndel() || vc.isMixed()) { + } else if ( vc.isIndel() || vc.isMixed() ) { - for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) { + for (final Genotype genotype : genotypes.iterateInSampleNameOrder()) { final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); - if ( context == null ) { + if (context == null) { continue; } @@ -74,46 +77,47 @@ else if (context.hasBasePileup()) fillIndelQualsFromPileup(pileup, refQuals, altQuals); } - } - else + } else return null; final MannWhitneyU mannWhitneyU = new MannWhitneyU(); - for ( final Double qual : altQuals ) { + for (final Double qual : altQuals) { mannWhitneyU.add(qual, MannWhitneyU.USet.SET1); } - for ( final Double qual : refQuals ) { + for (final Double qual : refQuals) { mannWhitneyU.add(qual, MannWhitneyU.USet.SET2); } if 
 (DEBUG) {
-            System.out.format("%s, REF QUALS:",this.getClass().getName());
-            for ( final Double qual : refQuals )
-                System.out.format("%4.1f ",qual);
+            System.out.format("%s, REF QUALS:", this.getClass().getName());
+            for (final Double qual : refQuals)
+                System.out.format("%4.1f ", qual);
             System.out.println();
-            System.out.format("%s, ALT QUALS:",this.getClass().getName());
-            for ( final Double qual : altQuals )
-                System.out.format("%4.1f ",qual);
+            System.out.format("%s, ALT QUALS:", this.getClass().getName());
+            for (final Double qual : altQuals)
+                System.out.format("%4.1f ", qual);
             System.out.println();
         }
 
         // we are testing that set1 (the alt bases) have lower quality scores than set2 (the ref bases)
-        final Pair<Double, Double> testResults = mannWhitneyU.runOneSidedTest( MannWhitneyU.USet.SET1 );
+        final Pair<Double, Double> testResults = mannWhitneyU.runOneSidedTest(MannWhitneyU.USet.SET1);
 
         final Map<String, Object> map = new HashMap<String, Object>();
-        if ( ! Double.isNaN(testResults.first) )
+        if (!Double.isNaN(testResults.first))
             map.put(getKeyNames().get(0), String.format("%.3f", testResults.first));
         return map;
     }
 
-    protected abstract void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals);
+    protected abstract void fillQualsFromPileup(byte ref, List<Byte> alts, ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals);
+
     protected abstract void fillIndelQualsFromPileup(ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals);
 
-    protected static boolean isUsableBase( final PileupElement p ) {
-        return !( p.isDeletion() ||
-                  p.getMappingQual() == 0 ||
-                  p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE ||
-                  ((int)p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE ); // need the unBAQed quality score here
+    protected static boolean isUsableBase(final PileupElement p) {
+        return !(p.isInsertionAtBeginningOfRead() ||
+                 p.isDeletion() ||
+                 p.getMappingQual() == 0 ||
+                 p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE ||
+                 ((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE); // need the unBAQed quality score here
     }
 }
\ No newline at end of file
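For reference, the complete MannWhitneyU round trip used above, as a stand-alone sketch (method name and quality values invented; add, runOneSidedTest and Pair.first are exactly the calls visible in this hunk):

    static double altVsRefZScore(final double[] altQuals, final double[] refQuals) {
        final MannWhitneyU mannWhitneyU = new MannWhitneyU();
        for ( final double qual : altQuals )
            mannWhitneyU.add(qual, MannWhitneyU.USet.SET1); // alt-supporting observations
        for ( final double qual : refQuals )
            mannWhitneyU.add(qual, MannWhitneyU.USet.SET2); // ref-supporting observations
        // the pair's first element is the z-score that ends up in the INFO field
        return mannWhitneyU.runOneSidedTest(MannWhitneyU.USet.SET1).first;
    }
    // altVsRefZScore(new double[]{10, 12, 15}, new double[]{30, 32, 35, 40}) comes out negative:
    // alt observations rank below ref, the bias these annotations are built to flag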
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java
index d762af4284..a998cd08b4 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java
@@ -24,27 +24,31 @@
  */
 public class ReadPosRankSumTest extends RankSumTest {
 
-    public List<String> getKeyNames() { return Arrays.asList("ReadPosRankSum"); }
+    public List<String> getKeyNames() {
+        return Arrays.asList("ReadPosRankSum");
+    }
 
-    public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("ReadPosRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias")); }
+    public List<VCFInfoHeaderLine> getDescriptions() {
+        return Arrays.asList(new VCFInfoHeaderLine("ReadPosRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias"));
+    }
 
-    protected void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals) {
-        for ( final PileupElement p : pileup ) {
-            if( isUsableBase(p) ) {
-                int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p.getOffset(), 0, 0);
+    protected void fillQualsFromPileup(byte ref, List<Byte> alts, ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals) {
+        for (final PileupElement p : pileup) {
+            if (isUsableBase(p)) {
+                int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, 0, 0);
                 final int numAlignedBases = AlignmentUtils.getNumAlignedBases(p.getRead());
-                if( readPos > numAlignedBases / 2 ) {
-                    readPos = numAlignedBases - ( readPos + 1 );
-                }
+                if (readPos > numAlignedBases / 2)
+                    readPos = numAlignedBases - (readPos + 1);
 
-                if ( p.getBase() == ref ) {
-                    refQuals.add( (double)readPos );
-                } else if ( p.getBase() == alt ) {
-                    altQuals.add( (double)readPos );
-                }
+
+                if ( p.getBase() == ref )
+                    refQuals.add((double) readPos);
+                else if ( alts.contains(p.getBase()) )
+                    altQuals.add((double) readPos);
             }
         }
     }
+
     protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals) {
         // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele
         // to classify a pileup element as ref or alt, we look at the likelihood associated with the allele associated to this element.
@@ -52,18 +56,15 @@ protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List<Double> re
         // To classify a pileup element as Ref or Alt, we look at the likelihood of corresponding alleles.
         // If likelihood of ref allele > highest likelihood of all alt alleles + epsilon, then this pileup element is "ref"
         // otherwise if highest alt allele likelihood is > ref likelihood + epsilon, then this pileup element is "alt"
-        final HashMap<PileupElement, LinkedHashMap<Allele, Double>> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap();
-        for (final PileupElement p: pileup) {
+        final HashMap<PileupElement, LinkedHashMap<Allele, Double>> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap();
+        for (final PileupElement p : pileup) {
             if (indelLikelihoodMap.containsKey(p)) {
-                // retrieve likelihood information corresponding to this read
-                LinkedHashMap<Allele, Double> el = indelLikelihoodMap.get(p);
-                // by design, first element in LinkedHashMap was ref allele
-                double refLikelihood=0.0, altLikelihood=Double.NEGATIVE_INFINITY;
+                LinkedHashMap<Allele, Double> el = indelLikelihoodMap.get(p); // retrieve likelihood information corresponding to this read
+                double refLikelihood = 0.0, altLikelihood = Double.NEGATIVE_INFINITY; // by design, first element in LinkedHashMap was ref allele
                 for (Allele a : el.keySet()) {
-                    if (a.isReference())
-                        refLikelihood =el.get(a);
+                        refLikelihood = el.get(a);
                     else {
                         double like = el.get(a);
                         if (like >= altLikelihood)
@@ -75,23 +76,22 @@ protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List<Double> re
                 final int numAlignedBases = getNumAlignedBases(p.getRead());
 
                 int rp = readPos;
-                if( readPos > numAlignedBases / 2 ) {
-                    readPos = numAlignedBases - ( readPos + 1 );
+                if (readPos > numAlignedBases / 2) {
+                    readPos = numAlignedBases - (readPos + 1);
                 }
-                //if (DEBUG) System.out.format("R:%s start:%d C:%s offset:%d rp:%d readPos:%d
alignedB:%d\n",p.getRead().getReadName(),p.getRead().getAlignmentStart(),p.getRead().getCigarString(),p.getOffset(), rp, readPos, numAlignedBases); // if event is beyond span of read just return and don't consider this element. This can happen, for example, with reads // where soft clipping still left strings of low quality bases but these are later removed by indel-specific clipping. - // if (readPos < -1) + // if (readPos < -1) // return; - if (refLikelihood > (altLikelihood + INDEL_LIKELIHOOD_THRESH)) { - refQuals.add((double)readPos); + if (refLikelihood > (altLikelihood + INDEL_LIKELIHOOD_THRESH)) { + refQuals.add((double) readPos); //if (DEBUG) System.out.format("REF like: %4.1f, pos: %d\n",refLikelihood,readPos); - } - else if (altLikelihood > (refLikelihood + INDEL_LIKELIHOOD_THRESH)) { - altQuals.add((double)readPos); - //if (DEBUG) System.out.format("ALT like: %4.1f, pos: %d\n",refLikelihood,readPos); + } else if (altLikelihood > (refLikelihood + INDEL_LIKELIHOOD_THRESH)) { + altQuals.add((double) readPos); + //if (DEBUG) System.out.format("ALT like: %4.1f, pos: %d\n",refLikelihood,readPos); } @@ -115,7 +115,7 @@ int getNumClippedBasesAtStart(SAMRecord read) { // Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative, // and may leave a string of Q2 bases still hanging off the reads. - for (int i=numStartClippedBases; i < unclippedReadBases.length; i++) { + for (int i = numStartClippedBases; i < unclippedReadBases.length; i++) { if (unclippedReadQuals[i] < PairHMMIndelErrorModel.BASE_QUAL_THRESHOLD) numStartClippedBases++; else @@ -134,7 +134,7 @@ int getNumClippedBasesAtEnd(SAMRecord read) { // compute total number of clipped bases (soft or hard clipped) // check for hard clips (never consider these bases): final Cigar c = read.getCigar(); - CigarElement last = c.getCigarElement(c.numCigarElements()-1); + CigarElement last = c.getCigarElement(c.numCigarElements() - 1); int numEndClippedBases = 0; if (last.getOperator() == CigarOperator.H) { @@ -145,7 +145,7 @@ int getNumClippedBasesAtEnd(SAMRecord read) { // Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative, // and may leave a string of Q2 bases still hanging off the reads. 
- for (int i=unclippedReadBases.length-numEndClippedBases-1; i >= 0; i-- ){ + for (int i = unclippedReadBases.length - numEndClippedBases - 1; i >= 0; i--) { if (unclippedReadQuals[i] < PairHMMIndelErrorModel.BASE_QUAL_THRESHOLD) numEndClippedBases++; else @@ -157,8 +157,6 @@ int getNumClippedBasesAtEnd(SAMRecord read) { } int getOffsetFromClippedReadStart(SAMRecord read, int offset) { - - - return offset - getNumClippedBasesAtStart(read); + return offset - getNumClippedBasesAtStart(read); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java index ecdde1e4fe..1f8ccf6525 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java @@ -7,8 +7,9 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.MendelianViolation; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -18,16 +19,14 @@ /** * Created by IntelliJ IDEA. - * User: rpoplin + * User: rpoplin, lfran, ebanks * Date: 11/14/11 */ -public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements ExperimentalAnnotation { +public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation { private Set trios = null; - private final static int REF = 0; - private final static int HET = 1; - private final static int HOM = 2; + private final static int MIN_NUM_VALID_TRIOS = 5; // don't calculate this population-level statistic if there are fewer than X trios with full genotype likelihood information public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( trios == null ) { @@ -38,10 +37,10 @@ } } - final Map toRet = new HashMap(1); + final Map toRet = new HashMap(1); final HashSet triosToTest = new HashSet(); - for( final Sample child : trios) { + for( final Sample child : trios ) { final boolean hasAppropriateGenotypes = vc.hasGenotype(child.getID()) && vc.getGenotype(child.getID()).hasLikelihoods() && vc.hasGenotype(child.getPaternalID()) && vc.getGenotype(child.getPaternalID()).hasLikelihoods() && vc.hasGenotype(child.getMaternalID()) && vc.getGenotype(child.getMaternalID()).hasLikelihoods(); @@ -50,7 +49,9 @@ } } - toRet.put("TDT", calculateTDT( vc, triosToTest )); + if( triosToTest.size() >= MIN_NUM_VALID_TRIOS ) { + toRet.put("TDT", calculateTDT( vc, triosToTest )); + } return toRet; } @@ -58,33 +59,52 @@ public Map annotate(RefMetaDataTracker tracker, AnnotatorCompati // return the descriptions used for the
VCF INFO meta field public List getKeyNames() { return Arrays.asList("TDT"); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("TDT", 1, VCFHeaderLineType.Float, "Test statistic from Wittkowski transmission disequilibrium test.")); } + public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("TDT", VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Test statistic from Wittkowski transmission disequilibrium test.")); } // Following derivation in http://en.wikipedia.org/wiki/Transmission_disequilibrium_test#A_modified_version_of_the_TDT - private double calculateTDT( final VariantContext vc, final Set triosToTest ) { - - final double nABGivenABandBB = calculateNChildren(vc, triosToTest, HET, HET, HOM) + calculateNChildren(vc, triosToTest, HET, HOM, HET); - final double nBBGivenABandBB = calculateNChildren(vc, triosToTest, HOM, HET, HOM) + calculateNChildren(vc, triosToTest, HOM, HOM, HET); - final double nAAGivenABandAB = calculateNChildren(vc, triosToTest, REF, HET, HET); - final double nBBGivenABandAB = calculateNChildren(vc, triosToTest, HOM, HET, HET); - final double nAAGivenAAandAB = calculateNChildren(vc, triosToTest, REF, REF, HET) + calculateNChildren(vc, triosToTest, REF, HET, REF); - final double nABGivenAAandAB = calculateNChildren(vc, triosToTest, HET, REF, HET) + calculateNChildren(vc, triosToTest, HET, HET, REF); - - final double numer = (nABGivenABandBB - nBBGivenABandBB) + 2.0 * (nAAGivenABandAB - nBBGivenABandAB) + (nAAGivenAAandAB - nABGivenAAandAB); - final double denom = (nABGivenABandBB + nBBGivenABandBB) + 4.0 * (nAAGivenABandAB + nBBGivenABandAB) + (nAAGivenAAandAB + nABGivenAAandAB); - return (numer * numer) / denom; + private List calculateTDT( final VariantContext vc, final Set triosToTest ) { + + List pairwiseTDTs = new ArrayList(10); + final int HomRefIndex = 0; + + // for each pair of alleles, add the likelihoods + int numAltAlleles = vc.getAlternateAlleles().size(); + for ( int alt = 1; alt <= numAltAlleles; alt++ ) { + final int HetIndex = alt; + final int HomVarIndex = determineHomIndex(alt, numAltAlleles+1); + + final double nABGivenABandBB = calculateNChildren(vc, triosToTest, HetIndex, HetIndex, HomVarIndex) + calculateNChildren(vc, triosToTest, HetIndex, HomVarIndex, HetIndex); + final double nBBGivenABandBB = calculateNChildren(vc, triosToTest, HomVarIndex, HetIndex, HomVarIndex) + calculateNChildren(vc, triosToTest, HomVarIndex, HomVarIndex, HetIndex); + final double nAAGivenABandAB = calculateNChildren(vc, triosToTest, HomRefIndex, HetIndex, HetIndex); + final double nBBGivenABandAB = calculateNChildren(vc, triosToTest, HomVarIndex, HetIndex, HetIndex); + final double nAAGivenAAandAB = calculateNChildren(vc, triosToTest, HomRefIndex, HomRefIndex, HetIndex) + calculateNChildren(vc, triosToTest, HomRefIndex, HetIndex, HomRefIndex); + final double nABGivenAAandAB = calculateNChildren(vc, triosToTest, HetIndex, HomRefIndex, HetIndex) + calculateNChildren(vc, triosToTest, HetIndex, HetIndex, HomRefIndex); + + final double numer = (nABGivenABandBB - nBBGivenABandBB) + 2.0 * (nAAGivenABandAB - nBBGivenABandAB) + (nAAGivenAAandAB - nABGivenAAandAB); + final double denom = (nABGivenABandBB + nBBGivenABandBB) + 4.0 * (nAAGivenABandAB + nBBGivenABandAB) + (nAAGivenAAandAB + nABGivenAAandAB); + pairwiseTDTs.add((numer * numer) / denom); + } + + return pairwiseTDTs; } - private double calculateNChildren( final VariantContext vc, final Set triosToTest, final int childIdx, final int parent1Idx, final int parent2Idx ) { + 
private double calculateNChildren( final VariantContext vc, final Set triosToTest, final int childIdx, final int momIdx, final int dadIdx ) { final double likelihoodVector[] = new double[triosToTest.size()]; int iii = 0; for( final Sample child : triosToTest ) { final double[] momGL = vc.getGenotype(child.getMaternalID()).getLikelihoods().getAsVector(); final double[] dadGL = vc.getGenotype(child.getPaternalID()).getLikelihoods().getAsVector(); final double[] childGL = vc.getGenotype(child.getID()).getLikelihoods().getAsVector(); - likelihoodVector[iii++] = momGL[parent1Idx] + dadGL[parent2Idx] + childGL[childIdx]; + likelihoodVector[iii++] = momGL[momIdx] + dadGL[dadIdx] + childGL[childIdx]; } return MathUtils.sumLog10(likelihoodVector); } + + private static int determineHomIndex(final int alleleIndex, int numAlleles) { + int result = 0; + for ( int i = 0; i < alleleIndex; i++ ) + result += numAlleles--; + return result; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index 69560c7cb1..5312c41367 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -32,7 +32,6 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.samples.SampleDB; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; import org.broadinstitute.sting.utils.BaseUtils; @@ -84,7 +83,6 @@ public class VariantAnnotator extends RodWalker implements Ann @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); - public RodBinding getVariantRodBinding() { return variantCollection.variants; } /** * The INFO field will be annotated with information on the most biologically-significant effect @@ -163,6 +161,13 @@ public class VariantAnnotator extends RodWalker implements Ann @Argument(fullName="list", shortName="ls", doc="List the available annotations and exit") protected Boolean LIST = false; + /** + * By default, the dbSNP ID is added only when the ID field in the variant VCF is empty. 
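Stepping back to the TDT change above: determineHomIndex walks what appears to be a row-major diploid genotype ordering (AA, AB, AC, BB, BC, CC for three alleles), in which the het with the reference sits at index alt and the hom for allele a sits at a*N - a*(a-1)/2 for N total alleles. A quick stand-alone cross-check of the loop against that closed form, assuming that ordering:

    // Cross-check of determineHomIndex (logic copied from the TDT annotation above)
    // against the closed-form index a*N - a*(a-1)/2 for a row-major genotype layout.
    public class HomIndexCheck {
        static int determineHomIndex(final int alleleIndex, int numAlleles) {
            int result = 0;
            for (int i = 0; i < alleleIndex; i++)
                result += numAlleles--;
            return result;
        }

        public static void main(String[] args) {
            for (int n = 2; n <= 5; n++)           // total number of alleles
                for (int a = 1; a < n; a++) {      // alt allele index
                    int byLoop = determineHomIndex(a, n);
                    int byFormula = a * n - a * (a - 1) / 2;
                    System.out.printf("alleles=%d alt=%d loop=%d formula=%d match=%b%n",
                                      n, a, byLoop, byFormula, byLoop == byFormula);
                }
        }
    }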
+ */ + @Argument(fullName="alwaysAppendDbsnpId", shortName="alwaysAppendDbsnpId", doc="In conjunction with the dbSNP binding, append the dbSNP ID even when the variant VCF already has the ID field populated") + protected Boolean ALWAYS_APPEND_DBSNP_ID = false; + public boolean alwaysAppendDbsnpId() { return ALWAYS_APPEND_DBSNP_ID; } + @Hidden @Argument(fullName="vcfContainsOnlyIndels", shortName="dels",doc="Use if you are annotating an indel vcf, currently VERY experimental", required = false) protected boolean indelsOnly = false; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index 98d2fe17b1..90d0ad7402 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -195,11 +195,20 @@ public VariantContext annotateContext(RefMetaDataTracker tracker, ReferenceConte private VariantContext annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map infoAnnotations) { for ( Map.Entry, String> dbSet : dbAnnotations.entrySet() ) { if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) { - String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType()); + final String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType()); + + // put the DB key into the INFO field infoAnnotations.put(VCFConstants.DBSNP_KEY, rsID != null); - // annotate dbsnp id if available and not already there - if ( rsID != null && vc.emptyID() ) - vc = new VariantContextBuilder(vc).id(rsID).make(); + + // add the ID if appropriate + if ( rsID != null ) { + if ( vc.emptyID() ) { + vc = new VariantContextBuilder(vc).id(rsID).make(); + } else if ( walker.alwaysAppendDbsnpId() && vc.getID().indexOf(rsID) == -1 ) { + final String newRsID = vc.getID() + VCFConstants.ID_FIELD_SEPARATOR + rsID; + vc = new VariantContextBuilder(vc).id(newRsID).make(); + } + } } else { boolean overlapsComp = false; for ( VariantContext comp : tracker.getValues(dbSet.getKey(), ref.getLocus()) ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java index 7200f841bc..1331ad5df1 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java @@ -8,9 +8,9 @@ public interface AnnotatorCompatibleWalker { // getter methods for various used bindings - public abstract RodBinding getVariantRodBinding(); public abstract RodBinding getSnpEffRodBinding(); public abstract RodBinding getDbsnpRodBinding(); public abstract List> getCompRodBindings(); public abstract List> getResourceRodBindings(); + public abstract boolean alwaysAppendDbsnpId(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java index f827856be9..ec67563dcd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java @@ -241,6 +241,11 @@ public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentC String alleleA = beagleGenotypePairs.get(0); String alleleB = beagleGenotypePairs.get(1); + if ( alleleA.equals("null") || alleleB.equals("null") ) { + logger.warn("Beagle produced 'null' alleles at location "+ref.getLocus().toString()+". Ignoring."); + return 0; + } + // Beagle always produces genotype strings based on the strings we input in the likelihood file. String refString = vc_input.getReference().getDisplayString(); if (refString.length() == 0) // ref was null @@ -315,8 +320,7 @@ else if (originalAlleleB.isReference()) og = a1+"/"+a2; // See if Beagle switched genotypes - if (!((bglAlleleA.equals(originalAlleleA) && bglAlleleB.equals(originalAlleleB) || - (bglAlleleA.equals(originalAlleleB) && bglAlleleB.equals(originalAlleleA))))){ + if (! originalAlleleA.equals(Allele.NO_CALL) && beagleSwitchedGenotypes(bglAlleleA,originalAlleleA,bglAlleleB,originalAlleleB)){ originalAttributes.put("OG",og); numGenotypesChangedByBeagle++; } @@ -359,6 +363,11 @@ else if (originalAlleleB.isReference()) return 1; } + private boolean beagleSwitchedGenotypes(Allele bglAlleleA, Allele originalAlleleA, Allele bglAlleleB, Allele originalAlleleB) { + return !((bglAlleleA.equals(originalAlleleA) && bglAlleleB.equals(originalAlleleB) || + (bglAlleleA.equals(originalAlleleB) && bglAlleleB.equals(originalAlleleA)))); + } + public Integer reduceInit() { return 0; // Nothing to do here } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java new file mode 100644 index 0000000000..a1ab733418 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2011 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.Arrays; +import java.util.BitSet; + +/** + * Created by IntelliJ IDEA. 
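For the BeagleOutputToVCFWalker change above: the extracted beagleSwitchedGenotypes helper is just an unordered-pair comparison, true only when the Beagle allele pair is not the original pair in either order. A stand-alone equivalent, with plain strings in place of Allele:

    // Stand-alone restatement of beagleSwitchedGenotypes: the genotype was switched
    // iff {bglA, bglB} is not the same unordered pair as {origA, origB}.
    public class GenotypeSwitchCheck {
        static boolean switchedGenotypes(String bglA, String origA, String bglB, String origB) {
            return !((bglA.equals(origA) && bglB.equals(origB)) ||
                     (bglA.equals(origB) && bglB.equals(origA)));
        }

        public static void main(String[] args) {
            System.out.println(switchedGenotypes("A", "A", "C", "C")); // false: same order
            System.out.println(switchedGenotypes("C", "A", "A", "C")); // false: swapped order
            System.out.println(switchedGenotypes("A", "A", "T", "C")); // true: different pair
        }
    }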
+ * User: rpoplin + * Date: 9/26/11 + */ + +public class ContextCovariate implements StandardCovariate { + + private int mismatchesContextSize; + private int insertionsContextSize; + private int deletionsContextSize; + + // Initialize any member variables using the command-line arguments passed to the walkers + @Override + public void initialize(final RecalibrationArgumentCollection RAC) { + mismatchesContextSize = RAC.MISMATCHES_CONTEXT_SIZE; + insertionsContextSize = RAC.INSERTIONS_CONTEXT_SIZE; + deletionsContextSize = RAC.DELETIONS_CONTEXT_SIZE; + + if (mismatchesContextSize <= 0 || insertionsContextSize <= 0 || deletionsContextSize <= 0) + throw new UserException(String.format("Context Size must be positive, if you don't want to use the context covariate, just turn it off instead. Mismatches: %d Insertions: %d Deletions:%d", mismatchesContextSize, insertionsContextSize, deletionsContextSize)); + + } + + @Override + public CovariateValues getValues(final GATKSAMRecord read) { + int l = read.getReadLength(); + BitSet[] mismatches = new BitSet[l]; + BitSet[] insertions = new BitSet[l]; + BitSet[] deletions = new BitSet[l]; + + final boolean negativeStrand = read.getReadNegativeStrandFlag(); + byte[] bases = read.getReadBases(); + if (negativeStrand) + bases = BaseUtils.simpleReverseComplement(bases); + + for (int i = 0; i < read.getReadLength(); i++) { + mismatches[i] = contextWith(bases, i, mismatchesContextSize); + insertions[i] = contextWith(bases, i, insertionsContextSize); + deletions[i] = contextWith(bases, i, deletionsContextSize); + } + + if (negativeStrand) { + reverse(mismatches); + reverse(insertions); + reverse(deletions); + } + return new CovariateValues(mismatches, insertions, deletions); + } + + // Used to get the covariate's value from input csv file during on-the-fly recalibration + @Override + public final Object getValue(final String str) { + return str; + } + + /** + * Calculates the context of a base, independent of the covariate mode + * + * @param bases the bases in the read to build the context from + * @param offset the position in the read to calculate the context for + * @param contextSize context size to use when building the context + * @return the BitSet encoding of the context, or null if there are not enough preceding bases or the context contains an N + */ + private BitSet contextWith(byte [] bases, int offset, int contextSize) { + if (offset < contextSize) + return null; + + String context = new String(Arrays.copyOfRange(bases, offset - contextSize, offset)); + if (context.contains("N")) + return null; + + return MathUtils.bitSetFrom(context); + } + + /** + * Reverses the given array in place.
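contextWith above keys each base by the contextSize bases immediately preceding it, returning null when the read start leaves the context incomplete or when the context contains an N. MathUtils.bitSetFrom itself is not shown in this patch; a plausible two-bits-per-base packing with the same null-like semantics might look as follows (packContext and its encoding are my own stand-ins, not GATK code):

    // Hypothetical stand-in for MathUtils.bitSetFrom: pack a short ACGT context into
    // two bits per base; -1 plays the role of contextWith's null return.
    public class ContextPacking {
        static long packContext(byte[] bases, int offset, int contextSize) {
            if (offset < contextSize)
                return -1;                  // not enough preceding bases
            long packed = 0;
            for (int i = offset - contextSize; i < offset; i++) {
                int code;
                switch (bases[i]) {
                    case 'A': code = 0; break;
                    case 'C': code = 1; break;
                    case 'G': code = 2; break;
                    case 'T': code = 3; break;
                    default:  return -1;    // N or other ambiguity code
                }
                packed = (packed << 2) | code;
            }
            return packed;
        }

        public static void main(String[] args) {
            byte[] read = "ACGTNACGT".getBytes();
            System.out.println(packContext(read, 2, 2)); // context "AC" -> 1
            System.out.println(packContext(read, 5, 2)); // context "TN" -> -1 (contains N)
            System.out.println(packContext(read, 1, 2)); // -1 (context incomplete at read start)
        }
    }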
+ * + * @param array any array + */ + private static void reverse(final Object[] array) { + final int arrayLength = array.length; + for (int l = 0, r = arrayLength - 1; l < r; l++, r--) { + final Object temp = array[l]; + array[l] = array[r]; + array[r] = temp; + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Covariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Covariate.java new file mode 100755 index 0000000000..80d8cff5d8 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Covariate.java @@ -0,0 +1,63 @@ +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Oct 30, 2009 + * + * The Covariate interface. A Covariate is a feature used in the recalibration that can be picked out of the read. + * In general most error checking and adjustments to the data are done before the call to the covariates getValue methods in order to speed up the code. + * This unfortunately muddies the code, but most of these corrections can be done per read while the covariates get called per base, resulting in a big speed up. + */ + +public interface Covariate { + /** + * Initialize any member variables using the command-line arguments passed to the walker + * + * @param RAC the recalibration argument collection + */ + public void initialize(RecalibrationArgumentCollection RAC); + + /** + * Calculates covariate values for all positions in the read. + * + * @param read the read to calculate the covariates on. + * @return all the covariate values for every base in the read. 
+ */ + public CovariateValues getValues(GATKSAMRecord read); + + public Object getValue(String str); // Used to get the covariate's value from input csv file during on-the-fly recalibration +} + +interface RequiredCovariate extends Covariate {} + +interface StandardCovariate extends Covariate {} + +interface ExperimentalCovariate extends Covariate {} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CovariateKeySet.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CovariateKeySet.java new file mode 100644 index 0000000000..1b62160a3d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CovariateKeySet.java @@ -0,0 +1,88 @@ +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +/** + * The object temporarily held by a read that describes all of its covariates. + * + * In essence, this is an array of CovariateValues, but it also has some functionality to deal with the optimizations of the NestedHashMap + * + * @author Mauricio Carneiro + * @since 2/8/12 + */ +public class CovariateKeySet { + private Object[][] mismatchesKeySet; + private Object[][] insertionsKeySet; + private Object[][] deletionsKeySet; + + private int nextCovariateIndex; + + private static String mismatchesCovariateName = "M"; + private static String insertionsCovariateName = "I"; + private static String deletionsCovariateName = "D"; + + public CovariateKeySet(int readLength, int numberOfCovariates) { + numberOfCovariates++; // +1 because we are adding the mismatch covariate (to comply with the molten table format) + this.mismatchesKeySet = new Object[readLength][numberOfCovariates]; + this.insertionsKeySet = new Object[readLength][numberOfCovariates]; + this.deletionsKeySet = new Object[readLength][numberOfCovariates]; + initializeCovariateKeySet(this.mismatchesKeySet, mismatchesCovariateName); + initializeCovariateKeySet(this.insertionsKeySet, insertionsCovariateName); + initializeCovariateKeySet(this.deletionsKeySet, deletionsCovariateName); + this.nextCovariateIndex = 0; + } + + public void addCovariate(CovariateValues covariate) { + transposeCovariateValues(mismatchesKeySet, covariate.getMismatches()); + transposeCovariateValues(insertionsKeySet, covariate.getInsertions()); + transposeCovariateValues(deletionsKeySet, covariate.getDeletions()); + nextCovariateIndex++; + } + + public static RecalDataManager.BaseRecalibrationType getErrorModelFromString(final String modelString) { + if (modelString.equals(mismatchesCovariateName)) + return RecalDataManager.BaseRecalibrationType.BASE_SUBSTITUTION; + else if (modelString.equals(insertionsCovariateName)) + return RecalDataManager.BaseRecalibrationType.BASE_INSERTION; + else if (modelString.equals(deletionsCovariateName)) + return RecalDataManager.BaseRecalibrationType.BASE_DELETION; + throw new ReviewedStingException("Unrecognized Base Recalibration model string: " + modelString); + } + + public Object[] getKeySet(final int readPosition, final RecalDataManager.BaseRecalibrationType errorModel) { + switch (errorModel) { + case BASE_SUBSTITUTION: + return getMismatchesKeySet(readPosition); + case BASE_INSERTION: + return getInsertionsKeySet(readPosition); + case BASE_DELETION: + return getDeletionsKeySet(readPosition); + default: + throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel ); + } + } + + public Object[] getMismatchesKeySet(int readPosition) { + return mismatchesKeySet[readPosition]; + } + + 
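addCovariate and transposeCovariateValues flip the layout from one array per covariate (indexed by read position) to one key array per position (indexed by covariate), which is the shape the nested hash map is keyed by. A toy version of that transposition with plain arrays (the data values are invented for illustration):

    // Toy version of the transposition done by CovariateKeySet.addCovariate:
    // per-covariate arrays become per-position key tuples.
    public class KeySetTranspose {
        public static void main(String[] args) {
            int readLength = 4;
            Object[] readGroupValues = {"rg1", "rg1", "rg1", "rg1"}; // one value per position
            Object[] cycleValues     = {1, 2, 3, 4};
            Object[][] perCovariate  = {readGroupValues, cycleValues};

            Object[][] keySet = new Object[readLength][perCovariate.length];
            for (int c = 0; c < perCovariate.length; c++)   // addCovariate, once per covariate
                for (int i = 0; i < readLength; i++)        // transposeCovariateValues
                    keySet[i][c] = perCovariate[c][i];

            System.out.println(java.util.Arrays.toString(keySet[2])); // [rg1, 3]
        }
    }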
public Object[] getInsertionsKeySet(int readPosition) { + return insertionsKeySet[readPosition]; + } + + public Object[] getDeletionsKeySet(int readPosition) { + return deletionsKeySet[readPosition]; + } + + private void transposeCovariateValues (Object [][] keySet, Object [] covariateValues) { + for (int i=0; i DISCRETE_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.ILLUMINA, NGSPlatform.SOLID, NGSPlatform.PACBIO, NGSPlatform.COMPLETE_GENOMICS); + private final static EnumSet FLOW_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.LS454, NGSPlatform.ION_TORRENT); + + // Initialize any member variables using the command-line arguments passed to the walkers + @Override + public void initialize(final RecalibrationArgumentCollection RAC) { + if (RAC.DEFAULT_PLATFORM != null && !NGSPlatform.isKnown(RAC.DEFAULT_PLATFORM)) + throw new UserException.CommandLineException("The requested default platform (" + RAC.DEFAULT_PLATFORM + ") is not a recognized platform."); + } + + // Used to pick out the covariate's value from attributes of the read + @Override + public CovariateValues getValues(final GATKSAMRecord read) { + Integer [] cycles = new Integer[read.getReadLength()]; + final NGSPlatform ngsPlatform = read.getNGSPlatform(); + + // Discrete cycle platforms + if (DISCRETE_CYCLE_PLATFORMS.contains(ngsPlatform)) { + final int init; + final int increment; + if (!read.getReadNegativeStrandFlag()) { + // Differentiate between first and second of pair. + // The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group + // to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair. + // Therefore the cycle covariate must differentiate between first and second of pair reads. + // This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because + // the current sequential model would consider the effects independently instead of jointly. + if (read.getReadPairedFlag() && read.getSecondOfPairFlag()) { + //second of pair, positive strand + init = -1; + increment = -1; + } + else { + //first of pair, positive strand + init = 1; + increment = 1; + } + + } + else { + if (read.getReadPairedFlag() && read.getSecondOfPairFlag()) { + //second of pair, negative strand + init = -read.getReadLength(); + increment = 1; + } + else { + //first of pair, negative strand + init = read.getReadLength(); + increment = -1; + } + } + + int cycle = init; + for (int i = 0; i < read.getReadLength(); i++) { + cycles[i] = cycle; + cycle += increment; + } + } + + // Flow cycle platforms + else if (FLOW_CYCLE_PLATFORMS.contains(ngsPlatform)) { + + final int readLength = read.getReadLength(); + final byte[] bases = read.getReadBases(); + + // Differentiate between first and second of pair. + // The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group + // to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair. + // Therefore the cycle covariate must differentiate between first and second of pair reads. + // This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because + // the current sequential model would consider the effects independently instead of jointly. + final boolean multiplyByNegative1 = read.getReadPairedFlag() && read.getSecondOfPairFlag(); + + int cycle = multiplyByNegative1 ? 
-1 : 1; + + // BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change + // For example, AAAAAAA was probably read in two flow cycles but here we count it as one + if (!read.getReadNegativeStrandFlag()) { // Forward direction + int iii = 0; + while (iii < readLength) { + while (iii < readLength && bases[iii] == (byte) 'T') { + cycles[iii] = cycle; + iii++; + } + while (iii < readLength && bases[iii] == (byte) 'A') { + cycles[iii] = cycle; + iii++; + } + while (iii < readLength && bases[iii] == (byte) 'C') { + cycles[iii] = cycle; + iii++; + } + while (iii < readLength && bases[iii] == (byte) 'G') { + cycles[iii] = cycle; + iii++; + } + if (iii < readLength) { + if (multiplyByNegative1) + cycle--; + else + cycle++; + } + if (iii < readLength && !BaseUtils.isRegularBase(bases[iii])) { + cycles[iii] = cycle; + iii++; + } + + } + } + else { // Negative direction + int iii = readLength - 1; + while (iii >= 0) { + while (iii >= 0 && bases[iii] == (byte) 'T') { + cycles[iii] = cycle; + iii--; + } + while (iii >= 0 && bases[iii] == (byte) 'A') { + cycles[iii] = cycle; + iii--; + } + while (iii >= 0 && bases[iii] == (byte) 'C') { + cycles[iii] = cycle; + iii--; + } + while (iii >= 0 && bases[iii] == (byte) 'G') { + cycles[iii] = cycle; + iii--; + } + if (iii >= 0) { + if (multiplyByNegative1) + cycle--; + else + cycle++; + } + if (iii >= 0 && !BaseUtils.isRegularBase(bases[iii])) { + cycles[iii] = cycle; + iii--; + } + } + } + } + + // Unknown platforms + else { + throw new UserException("The platform (" + read.getReadGroup().getPlatform() + ") associated with read group " + read.getReadGroup() + " is not a recognized platform. Implemented options are e.g. illumina, 454, and solid"); + } + + return new CovariateValues(cycles, cycles, cycles); + } + + // Used to get the covariate's value from input csv file during on-the-fly recalibration + @Override + public final Object getValue(final String str) { + return Integer.parseInt(str); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QualityScoreCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QualityScoreCovariate.java new file mode 100755 index 0000000000..373210bdbb --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QualityScoreCovariate.java @@ -0,0 +1,71 @@ +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
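For the discrete-cycle platforms handled by CycleCovariate above, the cycle numbers form a simple arithmetic sequence whose start and step depend on strand and first/second-of-pair; second-of-pair reads get negated cycles so mates never share covariate values. Condensed into a stand-alone form (the method name is mine):

    // Condensed restatement of the discrete-cycle assignment in CycleCovariate.
    public class DiscreteCycles {
        static int[] cycles(int readLength, boolean negativeStrand, boolean secondOfPair) {
            final int init, increment;
            if (!negativeStrand) {
                init = secondOfPair ? -1 : 1;          // forward strand starts at +/-1
                increment = secondOfPair ? -1 : 1;
            } else {
                init = secondOfPair ? -readLength : readLength; // reverse strand counts down
                increment = secondOfPair ? 1 : -1;
            }
            int[] cycles = new int[readLength];
            for (int i = 0, cycle = init; i < readLength; i++, cycle += increment)
                cycles[i] = cycle;
            return cycles;
        }

        public static void main(String[] args) {
            // first of pair, forward strand:   [1, 2, 3, 4]
            System.out.println(java.util.Arrays.toString(cycles(4, false, false)));
            // second of pair, negative strand: [-4, -3, -2, -1]
            System.out.println(java.util.Arrays.toString(cycles(4, true, true)));
        }
    }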
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Nov 3, 2009 + * + * The Reported Quality Score covariate. + */ + +public class QualityScoreCovariate implements RequiredCovariate { + + // Initialize any member variables using the command-line arguments passed to the walkers + @Override + public void initialize(final RecalibrationArgumentCollection RAC) { + } + + @Override + public CovariateValues getValues(final GATKSAMRecord read) { + int readLength = read.getReadLength(); + + Integer [] mismatches = new Integer[readLength]; + Integer [] insertions = new Integer[readLength]; + Integer [] deletions = new Integer[readLength]; + + byte [] baseQualities = read.getBaseQualities(); + byte [] baseInsertionQualities = read.getBaseInsertionQualities(); + byte [] baseDeletionQualities = read.getBaseDeletionQualities(); + + for (int i=0; i readGroupLookupTable = new HashMap(); + private final HashMap readGroupReverseLookupTable = new HashMap(); + private short nextId = 0; + + // Initialize any member variables using the command-line arguments passed to the walkers + @Override + public void initialize(final RecalibrationArgumentCollection RAC) { + } + + @Override + public CovariateValues getValues(final GATKSAMRecord read) { + final int l = read.getReadLength(); + final String readGroupId = read.getReadGroup().getReadGroupId(); + short shortId; + if (readGroupLookupTable.containsKey(readGroupId)) + shortId = readGroupLookupTable.get(readGroupId); + else { + shortId = nextId; + readGroupLookupTable.put(readGroupId, nextId); + readGroupReverseLookupTable.put(nextId, readGroupId); + nextId++; + } + Short [] readGroups = new Short[l]; + Arrays.fill(readGroups, shortId); + return new CovariateValues(readGroups, readGroups, readGroups); + } + + // Used to get the covariate's value from input csv file during on-the-fly recalibration + @Override + public final Object getValue(final String str) { + return str; + } + + public final String decodeReadGroup(final short id) { + return readGroupReverseLookupTable.get(id); + } +} + + diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java new file mode 100644 index 0000000000..cc60ac0106 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java @@ -0,0 +1,710 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import net.sf.samtools.SAMUtils; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.NestedHashMap; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Nov 6, 2009 + * + * This helper class holds the data HashMap as well as submaps that represent the marginal distributions collapsed over all needed dimensions. + * It also has static methods that are used to perform the various solid recalibration modes that attempt to correct the reference bias. + * This class holds the parsing methods that are shared between CountCovariates and TableRecalibration. + */ + +public class RecalDataManager { + public final NestedHashMap nestedHashMap; // The full dataset + private final HashMap dataCollapsedReadGroup; // Table where everything except read group has been collapsed + private final HashMap dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed + private final HashMap> dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed + + public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores + public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams + public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams + public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color + private static boolean warnUserNullPlatform = false; + + private static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\ + + public enum BaseRecalibrationType { + BASE_SUBSTITUTION, + BASE_INSERTION, + BASE_DELETION + } + + public enum SOLID_RECAL_MODE { + /** + * Treat reference inserted bases as reference matching bases. Very unsafe! + */ + DO_NOTHING, + /** + * Set reference inserted bases and the previous base (because of color space alignment details) to Q0. This is the default option. + */ + SET_Q_ZERO, + /** + * In addition to setting the quality scores to zero, also set the base itself to 'N'. This is useful to visualize in IGV. 
+ */ + SET_Q_ZERO_BASE_N, + /** + * Look at the color quality scores and probabilistically decide to change the reference inserted base to be the base which is implied by the original color space instead of the reference. + */ + REMOVE_REF_BIAS + } + + public enum SOLID_NOCALL_STRATEGY { + /** + * When a no call is detected throw an exception to alert the user that recalibrating this SOLiD data is unsafe. This is the default option. + */ + THROW_EXCEPTION, + /** + * Leave the read in the output bam completely untouched. This mode is only okay if the no calls are very rare. + */ + LEAVE_READ_UNRECALIBRATED, + /** + * Mark these reads as failing vendor quality checks so they can be filtered out by downstream analyses. + */ + PURGE_READ + } + + public RecalDataManager() { + nestedHashMap = new NestedHashMap(); + dataCollapsedReadGroup = null; + dataCollapsedQualityScore = null; + dataCollapsedByCovariate = null; + } + + public RecalDataManager(final boolean createCollapsedTables, final int numCovariates) { + if (createCollapsedTables) { // Initialize all the collapsed tables, only used by on-the-fly recalibration + nestedHashMap = null; + dataCollapsedReadGroup = new HashMap(); + dataCollapsedQualityScore = new HashMap(); + dataCollapsedByCovariate = new HashMap>(); + for ( final BaseRecalibrationType errorModel : BaseRecalibrationType.values() ) { + dataCollapsedReadGroup.put(errorModel, new NestedHashMap()); + dataCollapsedQualityScore.put(errorModel, new NestedHashMap()); + dataCollapsedByCovariate.put(errorModel, new ArrayList()); + for (int iii = 0; iii < numCovariates - 2; iii++) { // readGroup and QualityScore aren't counted here, their tables are separate + dataCollapsedByCovariate.get(errorModel).add(new NestedHashMap()); + } + } + } + else { + nestedHashMap = new NestedHashMap(); + dataCollapsedReadGroup = null; + dataCollapsedQualityScore = null; + dataCollapsedByCovariate = null; + } + } + + public static CovariateKeySet getAllCovariateValuesFor(GATKSAMRecord read) { + return (CovariateKeySet) read.getTemporaryAttribute(COVARS_ATTRIBUTE); + } + + /** + * Add the given mapping to all of the collapsed hash tables + * + * @param key The list of comparables that is the key for this mapping + * @param fullDatum The RecalDatum which is the data for this mapping + * @param PRESERVE_QSCORES_LESS_THAN The threshold in report quality for adding to the aggregate collapsed table + */ + public final void addToAllTables(final Object[] key, final RecalDatum fullDatum, final int PRESERVE_QSCORES_LESS_THAN, final BaseRecalibrationType errorModel ) { + + // The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around + //data.put(key, thisDatum); // add the mapping to the main table + + final int qualityScore = Integer.parseInt(key[1].toString()); + final Object[] readGroupCollapsedKey = new Object[1]; + final Object[] qualityScoreCollapsedKey = new Object[2]; + final Object[] covariateCollapsedKey = new Object[3]; + RecalDatum collapsedDatum; + + // Create dataCollapsedReadGroup, the table where everything except read group has been collapsed + if (qualityScore >= PRESERVE_QSCORES_LESS_THAN) { + readGroupCollapsedKey[0] = key[0]; // Make a new key with just the read group + collapsedDatum = (RecalDatum) dataCollapsedReadGroup.get(errorModel).get(readGroupCollapsedKey); + if (collapsedDatum == null) { + dataCollapsedReadGroup.get(errorModel).put(new RecalDatum(fullDatum), readGroupCollapsedKey); + } + else { + 
collapsedDatum.combine(fullDatum); // using combine instead of increment in order to calculate overall aggregateQReported + } + } + + // Create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed + qualityScoreCollapsedKey[0] = key[0]; // Make a new key with the read group ... + qualityScoreCollapsedKey[1] = key[1]; // and quality score + collapsedDatum = (RecalDatum) dataCollapsedQualityScore.get(errorModel).get(qualityScoreCollapsedKey); + if (collapsedDatum == null) { + dataCollapsedQualityScore.get(errorModel).put(new RecalDatum(fullDatum), qualityScoreCollapsedKey); + } + else { + collapsedDatum.increment(fullDatum); + } + + // Create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed + for (int iii = 0; iii < dataCollapsedByCovariate.get(errorModel).size(); iii++) { + covariateCollapsedKey[0] = key[0]; // Make a new key with the read group ... + covariateCollapsedKey[1] = key[1]; // and quality score ... + final Object theCovariateElement = key[iii + 2]; // and the given covariate + if (theCovariateElement != null) { + covariateCollapsedKey[2] = theCovariateElement; + collapsedDatum = (RecalDatum) dataCollapsedByCovariate.get(errorModel).get(iii).get(covariateCollapsedKey); + if (collapsedDatum == null) { + dataCollapsedByCovariate.get(errorModel).get(iii).put(new RecalDatum(fullDatum), covariateCollapsedKey); + } + else { + collapsedDatum.increment(fullDatum); + } + } + } + } + + /** + * Loop over all the collapsed tables and turn the recalDatums found there into an empirical quality score + * that will be used in the sequential calculation in TableRecalibrationWalker + * + * @param smoothing The smoothing parameter that goes into empirical quality score calculation + * @param maxQual At which value to cap the quality scores + */ + public final void generateEmpiricalQualities(final int smoothing, final int maxQual) { + + for( final BaseRecalibrationType errorModel : BaseRecalibrationType.values() ) { + recursivelyGenerateEmpiricalQualities(dataCollapsedReadGroup.get(errorModel).data, smoothing, maxQual); + recursivelyGenerateEmpiricalQualities(dataCollapsedQualityScore.get(errorModel).data, smoothing, maxQual); + for (NestedHashMap map : dataCollapsedByCovariate.get(errorModel)) { + recursivelyGenerateEmpiricalQualities(map.data, smoothing, maxQual); + checkForSingletons(map.data); + } + } + } + + private void recursivelyGenerateEmpiricalQualities(final Map data, final int smoothing, final int maxQual) { + + for (Object comp : data.keySet()) { + final Object val = data.get(comp); + if (val instanceof RecalDatum) { // We are at the end of the nested hash maps + ((RecalDatum) val).calcCombinedEmpiricalQuality(smoothing, maxQual); + } + else { // Another layer in the nested hash map + recursivelyGenerateEmpiricalQualities((Map) val, smoothing, maxQual); + } + } + } + + private void checkForSingletons(final Map data) { + // todo -- this looks like it's better just as a data.valueSet() call? + for (Object comp : data.keySet()) { + final Object val = data.get(comp); + if (val instanceof RecalDatum) { // We are at the end of the nested hash maps + if (data.keySet().size() == 1) { + data.clear(); // don't TableRecalibrate a non-required covariate if it only has one element because that correction has already been done ... 
+ // in a previous step of the sequential calculation model + } + } + else { // Another layer in the nested hash map + checkForSingletons((Map) val); + } + } + } + + /** + * Get the appropriate collapsed table out of the set of all the tables held by this Object + * + * @param covariate Which covariate indexes the desired collapsed HashMap + * @return The desired collapsed HashMap + */ + public final NestedHashMap getCollapsedTable(final int covariate, final BaseRecalibrationType errorModel) { + if (covariate == 0) { + return dataCollapsedReadGroup.get(errorModel); // Table where everything except read group has been collapsed + } + else if (covariate == 1) { + return dataCollapsedQualityScore.get(errorModel); // Table where everything except read group and quality score has been collapsed + } + else { + return dataCollapsedByCovariate.get(errorModel).get(covariate - 2); // Table where everything except read group, quality score, and given covariate has been collapsed + } + } + + /** + * Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string + * + * @param read The read to adjust + * @param RAC The list of shared command line arguments + */ + public static void parseSAMRecord(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) { + GATKSAMReadGroupRecord readGroup = read.getReadGroup(); + + if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) { + readGroup.setPlatform(RAC.FORCE_PLATFORM); + } + + if (readGroup.getPlatform() == null) { + if (RAC.DEFAULT_PLATFORM != null) { + if (!warnUserNullPlatform) { + Utils.warnUser("The input .bam file contains reads with no platform information. " + + "Defaulting to platform = " + RAC.DEFAULT_PLATFORM + ". " + + "First observed at read with name = " + read.getReadName()); + warnUserNullPlatform = true; + } + readGroup.setPlatform(RAC.DEFAULT_PLATFORM); + } + else { + throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no platform information. First observed at read with name = " + read.getReadName()); + } + } + } + + /** + * Parse through the color space of the read and add a new tag to the SAMRecord that says which bases are inconsistent with the color space + * + * @param read The SAMRecord to parse + */ + public static void parseColorSpace(final GATKSAMRecord read) { + + // If this is a SOLID read then we have to check if the color space is inconsistent. 
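addToAllTables above maintains one marginal table per granularity of the sequential model: read group alone, read group plus reported quality, and read group plus quality plus each remaining covariate. A toy illustration of how one full key fans out into those collapsed keys (plain arrays in place of NestedHashMap; the key values are invented):

    import java.util.Arrays;

    // Toy fan-out of one full key into the collapsed keys built by addToAllTables.
    public class CollapsedKeys {
        public static void main(String[] args) {
            Object[] fullKey = {"rg1", 30, "ACG", -7}; // read group, reported Q, context, cycle

            Object[] readGroupKey = {fullKey[0]};              // -> dataCollapsedReadGroup
            Object[] qualityKey   = {fullKey[0], fullKey[1]};  // -> dataCollapsedQualityScore
            System.out.println(Arrays.toString(readGroupKey)); // [rg1]
            System.out.println(Arrays.toString(qualityKey));   // [rg1, 30]

            // one collapsed key per remaining covariate -> dataCollapsedByCovariate
            for (int cov = 2; cov < fullKey.length; cov++)
                System.out.println(Arrays.toString(
                        new Object[]{fullKey[0], fullKey[1], fullKey[cov]})); // [rg1, 30, ACG] then [rg1, 30, -7]
        }
    }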
This is our only sign that SOLID has inserted the reference base + if (ReadUtils.isSOLiDRead(read)) { + if (read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null) { // Haven't calculated the inconsistency array yet for this read + final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); + if (attr != null) { + byte[] colorSpace; + if (attr instanceof String) { + colorSpace = ((String) attr).getBytes(); + } + else { + throw new UserException.MalformedBAM(read, String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); + } + + // Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read + byte[] readBases = read.getReadBases(); + if (read.getReadNegativeStrandFlag()) { + readBases = BaseUtils.simpleReverseComplement(read.getReadBases()); + } + final byte[] inconsistency = new byte[readBases.length]; + int iii; + byte prevBase = colorSpace[0]; // The sentinel + for (iii = 0; iii < readBases.length; iii++) { + final byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[iii + 1]); + inconsistency[iii] = (byte) (thisBase == readBases[iii] ? 0 : 1); + prevBase = readBases[iii]; + } + read.setAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG, inconsistency); + + } + else { + throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() + + " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); + } + } + } + } + + /** + * Parse through the color space of the read and apply the desired --solid_recal_mode correction to the bases + * This method doesn't add the inconsistent tag to the read like parseColorSpace does + * + * @param read The SAMRecord to parse + * @param originalQualScores The array of original quality scores to modify during the correction + * @param solidRecalMode Which mode of solid recalibration to apply + * @param refBases The reference for this read + * @return A new array of quality scores that have been ref bias corrected + */ + public static byte[] calcColorSpace(final GATKSAMRecord read, byte[] originalQualScores, final SOLID_RECAL_MODE solidRecalMode, final byte[] refBases) { + + final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); + if (attr != null) { + byte[] colorSpace; + if (attr instanceof String) { + colorSpace = ((String) attr).getBytes(); + } + else { + throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); + } + + // Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read + byte[] readBases = read.getReadBases(); + final byte[] colorImpliedBases = readBases.clone(); + byte[] refBasesDirRead = AlignmentUtils.alignmentToByteArray(read.getCigar(), read.getReadBases(), refBases); //BUGBUG: This needs to change when read walkers are changed to give the aligned refBases + if (read.getReadNegativeStrandFlag()) { + readBases = BaseUtils.simpleReverseComplement(read.getReadBases()); + refBasesDirRead = BaseUtils.simpleReverseComplement(refBasesDirRead.clone()); + } + final int[] inconsistency = new int[readBases.length]; + byte prevBase = colorSpace[0]; // The sentinel + for (int iii = 0; iii < readBases.length; iii++) { + final 
byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[iii + 1]); + colorImpliedBases[iii] = thisBase; + inconsistency[iii] = (thisBase == readBases[iii] ? 0 : 1); + prevBase = readBases[iii]; + } + + // Now that we have the inconsistency array apply the desired correction to the inconsistent bases + if (solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO) { // Set inconsistent bases and the one before it to Q0 + final boolean setBaseN = false; + originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN); + } + else if (solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO_BASE_N) { + final boolean setBaseN = true; + originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN); + } + else if (solidRecalMode == SOLID_RECAL_MODE.REMOVE_REF_BIAS) { // Use the color space quality to probabilistically remove ref bases at inconsistent color space bases + solidRecalRemoveRefBias(read, readBases, inconsistency, colorImpliedBases, refBasesDirRead); + } + + } + else { + throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() + + " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); + } + + return originalQualScores; + } + + public static boolean checkNoCallColorSpace(final GATKSAMRecord read) { + if (ReadUtils.isSOLiDRead(read)) { + final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); + if (attr != null) { + byte[] colorSpace; + if (attr instanceof String) { + colorSpace = ((String) attr).substring(1).getBytes(); // trim off the Sentinel + } + else { + throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); + } + + for (byte color : colorSpace) { + if (color != (byte) '0' && color != (byte) '1' && color != (byte) '2' && color != (byte) '3') { + return true; // There is a bad color in this SOLiD read and the user wants to skip over it + } + } + + } + else { + throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() + + " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); + } + } + + return false; // There aren't any color no calls in this SOLiD read + } + + /** + * Perform the SET_Q_ZERO solid recalibration. 
Inconsistent color space bases and their previous base are set to quality zero + * + * @param read The SAMRecord to recalibrate + * @param readBases The bases in the read which have been RC'd if necessary + * @param inconsistency The array of 1/0 that says if this base is inconsistent with its color + * @param originalQualScores The array of original quality scores to set to zero if needed + * @param refBases The reference which has been RC'd if necessary + * @param setBaseN Should we also set the base to N as well as quality zero in order to visualize in IGV or something similar + * @return The byte array of original quality scores some of which might have been set to zero + */ + private static byte[] solidRecalSetToQZero(final GATKSAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] originalQualScores, final byte[] refBases, final boolean setBaseN) { + + final boolean negStrand = read.getReadNegativeStrandFlag(); + for (int iii = 1; iii < originalQualScores.length; iii++) { + if (inconsistency[iii] == 1) { + if (readBases[iii] == refBases[iii]) { + if (negStrand) { + originalQualScores[originalQualScores.length - (iii + 1)] = (byte) 0; + } + else { + originalQualScores[iii] = (byte) 0; + } + if (setBaseN) { + readBases[iii] = (byte) 'N'; + } + } + // Set the prev base to Q0 as well + if (readBases[iii - 1] == refBases[iii - 1]) { + if (negStrand) { + originalQualScores[originalQualScores.length - iii] = (byte) 0; + } + else { + originalQualScores[iii - 1] = (byte) 0; + } + if (setBaseN) { + readBases[iii - 1] = (byte) 'N'; + } + } + } + } + if (negStrand) { + readBases = BaseUtils.simpleReverseComplement(readBases.clone()); // Put the bases back in reverse order to stuff them back in the read + } + read.setReadBases(readBases); + + return originalQualScores; + } + + /** + * Perform the REMOVE_REF_BIAS solid recalibration.
+    /**
+     * Perform the REMOVE_REF_BIAS solid recalibration. Look at the color space qualities and probabilistically decide
+     * if the base should be changed to match the color or left as the reference
+     *
+     * @param read              The SAMRecord to recalibrate
+     * @param readBases         The bases in the read which have been RC'd if necessary
+     * @param inconsistency     The array of 1/0 that says if this base is inconsistent with its color
+     * @param colorImpliedBases The bases implied by the color space, RC'd if necessary
+     * @param refBases          The reference which has been RC'd if necessary
+     */
+    private static void solidRecalRemoveRefBias(final GATKSAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] colorImpliedBases, final byte[] refBases) {
+
+        final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG);
+        if (attr != null) {
+            byte[] colorSpaceQuals;
+            if (attr instanceof String) {
+                String x = (String) attr;
+                colorSpaceQuals = x.getBytes();
+                SAMUtils.fastqToPhred(colorSpaceQuals);
+            }
+            else {
+                throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG, read.getReadName()));
+            }
+
+            for (int iii = 1; iii < inconsistency.length - 1; iii++) {
+                if (inconsistency[iii] == 1) {
+                    for (int jjj = iii - 1; jjj <= iii; jjj++) { // Correct this base and the one before it along the direction of the read
+                        if (jjj == iii || inconsistency[jjj] == 0) { // Don't want to correct the previous base a second time if it was already corrected in the previous step
+                            if (readBases[jjj] == refBases[jjj]) {
+                                if (colorSpaceQuals[jjj] == colorSpaceQuals[jjj + 1]) { // Equal evidence for the color-implied base and the reference base, so flip a coin
+                                    final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt(2);
+                                    if (rand == 0) { // The color-implied base won the coin flip
+                                        readBases[jjj] = colorImpliedBases[jjj];
+                                    }
+                                }
+                                else {
+                                    final int maxQuality = Math.max((int) colorSpaceQuals[jjj], (int) colorSpaceQuals[jjj + 1]);
+                                    final int minQuality = Math.min((int) colorSpaceQuals[jjj], (int) colorSpaceQuals[jjj + 1]);
+                                    int diffInQuality = maxQuality - minQuality;
+                                    int numLow = minQuality;
+                                    if (numLow == 0) {
+                                        numLow++;
+                                        diffInQuality++;
+                                    }
+                                    final int numHigh = Math.round(numLow * (float) Math.pow(10.0f, (float) diffInQuality / 10.0f)); // The color with higher quality is exponentially more likely
+                                    final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt(numLow + numHigh);
+                                    if (rand >= numLow) { // higher q score won
+                                        if (maxQuality == (int) colorSpaceQuals[jjj]) {
+                                            readBases[jjj] = colorImpliedBases[jjj];
+                                        } // else ref color had the higher q score and won out, so nothing to do here
+                                    }
+                                    else { // lower q score won
+                                        if (minQuality == (int) colorSpaceQuals[jjj]) {
+                                            readBases[jjj] = colorImpliedBases[jjj];
+                                        } // else ref color had the lower q score and won out, so nothing to do here
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+
+            if (read.getReadNegativeStrandFlag()) {
+                readBases = BaseUtils.simpleReverseComplement(readBases.clone()); // Put the bases back in reverse order to stuff them back in the read
+            }
+            read.setReadBases(readBases);
+        }
+        else { // No color space quality tag in file
+            throw new UserException.MalformedBAM(read, "REMOVE_REF_BIAS recal mode requires color space qualities but they can't be found for read: " + read.getReadName());
+        }
+    }
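+    // A worked example of the odds computation above (numbers illustrative, not from the
+    // original patch): with color qualities 10 and 20, diffInQuality = 10, so numLow = 10 and
+    // numHigh = round(10 * 10^(10/10)) = 100, and the higher-quality color wins with
+    // probability 100/110, roughly 91%.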
+    /**
+     * Given the base and the color, calculate the next base in the sequence
+     *
+     * @param read     the read, used only for error reporting
+     * @param prevBase The base
+     * @param color    The color
+     * @return The next base in the sequence
+     */
+    private static byte getNextBaseFromColor(GATKSAMRecord read, final byte prevBase, final byte color) {
+        switch (color) {
+            case '0':
+                return prevBase;
+            case '1':
+                return performColorOne(prevBase);
+            case '2':
+                return performColorTwo(prevBase);
+            case '3':
+                return performColorThree(prevBase);
+            default:
+                throw new UserException.MalformedBAM(read, "Unrecognized color space in SOLiD read, color = " + (char) color +
+                        " Unfortunately this bam file cannot be recalibrated without full color space information because of potential reference bias.");
+        }
+    }
+
+    /**
+     * Check if this base is inconsistent with its color space. If it is, then SOLiD inserted the reference here and we should reduce the quality
+     *
+     * @param read   The read which contains the color space to check against
+     * @param offset The offset in the read at which to check
+     * @return Returns true if the base was inconsistent with the color space
+     */
+    public static boolean isInconsistentColorSpace(final GATKSAMRecord read, final int offset) {
+        final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG);
+        if (attr != null) {
+            final byte[] inconsistency = (byte[]) attr;
+            // NOTE: The inconsistency array is in the direction of the read, not aligned to the reference!
+            if (read.getReadNegativeStrandFlag()) { // Negative direction
+                return inconsistency[inconsistency.length - offset - 1] != (byte) 0;
+            }
+            else { // Forward direction
+                return inconsistency[offset] != (byte) 0;
+            }
+
+            // This block of code is for if you want to check both the offset and the next base for color space inconsistency
+            //if( read.getReadNegativeStrandFlag() ) { // Negative direction
+            //    if( offset == 0 ) {
+            //        return inconsistency[0] != 0;
+            //    } else {
+            //        return (inconsistency[inconsistency.length - offset - 1] != 0) || (inconsistency[inconsistency.length - offset] != 0);
+            //    }
+            //} else { // Forward direction
+            //    if( offset == inconsistency.length - 1 ) {
+            //        return inconsistency[inconsistency.length - 1] != 0;
+            //    } else {
+            //        return (inconsistency[offset] != 0) || (inconsistency[offset + 1] != 0);
+            //    }
+            //}
+
+        }
+        else { // No inconsistency array, so nothing is inconsistent
+            return false;
+        }
+    }
+
+    /**
+     * Computes all requested covariates for every offset in the given read
+     * by calling covariate.getValues(..). The results are stored in the read's
+     * temporary COVARS_ATTRIBUTE as a CovariateKeySet in which entry [i][j] is the value
+     * of the jth covariate in the requestedCovariates list at the ith position in the read.
+     *
+     * @param read                The read for which to compute covariate values.
+     * @param requestedCovariates The list of requested covariates.
+     */
+    public static void computeCovariates(final GATKSAMRecord read, final List<Covariate> requestedCovariates) {
+        final int numRequestedCovariates = requestedCovariates.size();
+        final int readLength = read.getReadLength();
+        final CovariateKeySet covariateKeySet = new CovariateKeySet(readLength, numRequestedCovariates);
+
+        // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read
+        for (Covariate covariate : requestedCovariates)
+            covariateKeySet.addCovariate(covariate.getValues(read));
+
+        read.setTemporaryAttribute(COVARS_ATTRIBUTE, covariateKeySet);
+    }
+
+    /**
+     * Perform a certain transversion (A <-> C or G <-> T) on the base.
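+     * For example, per the switch below: performColorOne('A') == 'C' and performColorOne('G') == 'T'.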
+ * + * @param base the base [AaCcGgTt] + * @return the transversion of the base, or the input base if it's not one of the understood ones + */ + private static byte performColorOne(byte base) { + switch (base) { + case 'A': + case 'a': + return 'C'; + case 'C': + case 'c': + return 'A'; + case 'G': + case 'g': + return 'T'; + case 'T': + case 't': + return 'G'; + default: + return base; + } + } + + /** + * Perform a transition (A <-> G or C <-> T) on the base. + * + * @param base the base [AaCcGgTt] + * @return the transition of the base, or the input base if it's not one of the understood ones + */ + private static byte performColorTwo(byte base) { + switch (base) { + case 'A': + case 'a': + return 'G'; + case 'C': + case 'c': + return 'T'; + case 'G': + case 'g': + return 'A'; + case 'T': + case 't': + return 'C'; + default: + return base; + } + } + + /** + * Return the complement (A <-> T or C <-> G) of a base. + * + * @param base the base [AaCcGgTt] + * @return the complementary base, or the input base if it's not one of the understood ones + */ + private static byte performColorThree(byte base) { + switch (base) { + case 'A': + case 'a': + return 'T'; + case 'C': + case 'c': + return 'G'; + case 'G': + case 'g': + return 'C'; + case 'T': + case 't': + return 'A'; + default: + return base; + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java new file mode 100755 index 0000000000..91f865180a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java @@ -0,0 +1,112 @@ +package org.broadinstitute.sting.gatk.walkers.bqsr; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Nov 3, 2009 + * + * An individual piece of recalibration data. Each bin counts up the number of observations and the number of reference mismatches seen for that combination of covariates. 
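+ *
+ * For example (illustrative numbers, not from the original patch): combining a bin of 1,000
+ * observations reported at Q20 (10 expected errors) with a bin of 1,000 observations reported
+ * at Q30 (1 expected error) gives an estimated reported quality of -10 * log10(11 / 2000),
+ * roughly Q22.6.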
+ */ + +public class RecalDatum extends RecalDatumOptimized { + + private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations + private double empiricalQuality; // the empirical quality for datums that have been collapsed together (by read group and reported quality, for example) + + //--------------------------------------------------------------------------------------------------------------- + // + // constructors + // + //--------------------------------------------------------------------------------------------------------------- + + public RecalDatum() { + numObservations = 0L; + numMismatches = 0L; + estimatedQReported = 0.0; + empiricalQuality = 0.0; + } + + public RecalDatum(final long _numObservations, final long _numMismatches, final double _estimatedQReported, final double _empiricalQuality) { + numObservations = _numObservations; + numMismatches = _numMismatches; + estimatedQReported = _estimatedQReported; + empiricalQuality = _empiricalQuality; + } + + public RecalDatum(final RecalDatum copy) { + this.numObservations = copy.numObservations; + this.numMismatches = copy.numMismatches; + this.estimatedQReported = copy.estimatedQReported; + this.empiricalQuality = copy.empiricalQuality; + } + + //--------------------------------------------------------------------------------------------------------------- + // + // increment methods + // + //--------------------------------------------------------------------------------------------------------------- + + public final void combine(final RecalDatum other) { + final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors(); + this.increment(other.numObservations, other.numMismatches); + this.estimatedQReported = -10 * Math.log10(sumErrors / (double) this.numObservations); + //if( this.estimatedQReported > QualityUtils.MAX_REASONABLE_Q_SCORE ) { this.estimatedQReported = QualityUtils.MAX_REASONABLE_Q_SCORE; } + } + + //--------------------------------------------------------------------------------------------------------------- + // + // methods to derive empirical quality score + // + //--------------------------------------------------------------------------------------------------------------- + + public final void calcCombinedEmpiricalQuality(final int smoothing, final int maxQual) { + this.empiricalQuality = empiricalQualDouble(smoothing, maxQual); // cache the value so we don't call log over and over again + } + + //--------------------------------------------------------------------------------------------------------------- + // + // misc. 
methods + // + //--------------------------------------------------------------------------------------------------------------- + + public final double getEstimatedQReported() { + return estimatedQReported; + } + + public final double getEmpiricalQuality() { + return empiricalQuality; + } + + private double calcExpectedErrors() { + return (double) this.numObservations * qualToErrorProb(estimatedQReported); + } + + private double qualToErrorProb(final double qual) { + return Math.pow(10.0, qual / -10.0); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatumOptimized.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatumOptimized.java new file mode 100755 index 0000000000..2333808206 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatumOptimized.java @@ -0,0 +1,115 @@ +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import org.broadinstitute.sting.utils.QualityUtils; + +import java.util.List; + +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Jan 6, 2010 + * + * An individual piece of recalibration data. Optimized for CountCovariates. Extras added to make TableRecalibration fast have been removed. + * Each bin counts up the number of observations and the number of reference mismatches seen for that combination of covariates. 
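+ *
+ * For example (illustrative numbers, not from the original patch): with numMismatches = 0,
+ * numObservations = 100 and a smoothing count of 1, empiricalQualDouble(1, 40) returns
+ * -10 * log10(1 / 101), roughly Q20, so smoothing keeps an error-free bin from claiming an
+ * unbounded quality.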
+ */ + +public class RecalDatumOptimized { + + protected long numObservations; // number of bases seen in total + protected long numMismatches; // number of bases seen that didn't match the reference + + //--------------------------------------------------------------------------------------------------------------- + // + // constructors + // + //--------------------------------------------------------------------------------------------------------------- + + public RecalDatumOptimized() { + numObservations = 0L; + numMismatches = 0L; + } + + public RecalDatumOptimized(final long _numObservations, final long _numMismatches) { + numObservations = _numObservations; + numMismatches = _numMismatches; + } + + public RecalDatumOptimized(final RecalDatumOptimized copy) { + this.numObservations = copy.numObservations; + this.numMismatches = copy.numMismatches; + } + + //--------------------------------------------------------------------------------------------------------------- + // + // increment methods + // + //--------------------------------------------------------------------------------------------------------------- + + public synchronized final void increment(final long incObservations, final long incMismatches) { + numObservations += incObservations; + numMismatches += incMismatches; + } + + public synchronized final void increment(final RecalDatumOptimized other) { + increment(other.numObservations, other.numMismatches); + } + + public synchronized final void increment(final List data) { + for (RecalDatumOptimized other : data) { + this.increment(other); + } + } + + //--------------------------------------------------------------------------------------------------------------- + // + // methods to derive empirical quality score + // + //--------------------------------------------------------------------------------------------------------------- + + public final double empiricalQualDouble(final int smoothing, final double maxQual) { + final double doubleMismatches = (double) (numMismatches + smoothing); + final double doubleObservations = (double) (numObservations + smoothing); + double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations); + return Math.min(empiricalQual, maxQual); + } + + public final byte empiricalQualByte(final int smoothing) { + final double doubleMismatches = (double) (numMismatches + smoothing); + final double doubleObservations = (double) (numObservations + smoothing); + return QualityUtils.probToQual(1.0 - doubleMismatches / doubleObservations); // This is capped at Q40 + } + + public final byte empiricalQualByte() { + return empiricalQualByte(0); // 'default' behavior is to use smoothing value of zero + } + + public final String outputToCSV() { + return String.format("%d,%d,%d", numObservations, numMismatches, (int) empiricalQualByte()); + } + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java new file mode 100755 index 0000000000..cc6f67cc9a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * 
copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.bqsr;
+
+import org.broad.tribble.Feature;
+import org.broadinstitute.sting.commandline.*;
+import org.broadinstitute.sting.gatk.walkers.recalibration.CountCovariatesGatherer;
+
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: rpoplin
+ * Date: Nov 27, 2009
+ *
+ * A collection of the arguments that are common to both CovariateCounterWalker and TableRecalibrationWalker.
+ * This set of arguments will also be passed to the constructor of every Covariate when it is instantiated.
+ */
+
+public class RecalibrationArgumentCollection {
+
+    /**
+     * This algorithm treats every reference mismatch as an indication of error. However, real genetic variation is expected to mismatch the reference,
+     * so it is critical that a database of known polymorphic sites is given to the tool in order to skip over those sites. This tool accepts any number of RodBindings (VCF, Bed, etc.)
+     * for use as this database. Users wishing to exclude an interval list of known variation can simply use -XL my.interval.list to skip over processing those sites.
+     * Please note, however, that the statistics reported by the tool will not accurately reflect those sites skipped by the -XL argument.
+     */
+    @Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites to skip over in the recalibration algorithm", required = false)
+    protected List<RodBinding<Feature>> knownSites = Collections.emptyList();
+
+    /**
+     * After the header, data records occur one per line until the end of the file. The first several items on a line are the
+     * values of the individual covariates and will change depending on which covariates were specified at runtime. The last
+     * three items are the data: the number of observations for this combination of covariates, the number of reference mismatches,
+     * and the raw empirical quality score calculated by phred-scaling the mismatch rate.
+     */
+    @Gather(CountCovariatesGatherer.class)
+    @Output
+    protected PrintStream RECAL_FILE;
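+    // An illustrative recal-file record (values and covariate names invented) for the layout
+    // described above, covariates first and then the three data items:
+    //     ReadGroup,ReportedQuality,Cycle,Dinuc,nObservations,nMismatches,Qempirical
+    //     20FUK.1,25,3,AC,23476,26,29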
+    /**
+     * List all implemented covariates.
+     */
+    @Argument(fullName = "list", shortName = "ls", doc = "List the available covariates and exit", required = false)
+    protected boolean LIST_ONLY = false;
+
+    /**
+     * Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you. See the list of covariates with -list.
+     */
+    @Argument(fullName = "covariate", shortName = "cov", doc = "Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you.", required = false)
+    protected String[] COVARIATES = null;
+
+    /**
+     * Use the standard set of covariates in addition to the ones listed using the -cov argument.
+     */
+    @Argument(fullName = "standard_covs", shortName = "standard", doc = "Use the standard set of covariates in addition to the ones listed using the -cov argument", required = false)
+    protected boolean USE_STANDARD_COVARIATES = true;
+
+    /////////////////////////////
+    // Debugging-only Arguments
+    /////////////////////////////
+    /**
+     * This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option.
+     */
+    @Hidden
+    @Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
+    protected boolean RUN_WITHOUT_DBSNP = false;
+
+    /////////////////////////////
+    // protected Member Variables
+    /////////////////////////////
+    protected final RecalDataManager dataManager = new RecalDataManager(); // Holds the data HashMap used to create collapsed data hashmaps (delta delta tables)
+    protected final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // A list to hold the covariate objects that were requested
+
+    protected final String SKIP_RECORD_ATTRIBUTE = "SKIP"; // used to label reads that should be skipped.
+    protected final String SEEN_ATTRIBUTE = "SEEN"; // used to label reads as processed.
+
+    /**
+     * CountCovariates and TableRecalibration accept a --solid_recal_mode flag which governs how the recalibrator handles the
+     * reads which have had the reference inserted because of color space inconsistencies.
+     */
+    @Argument(fullName = "solid_recal_mode", shortName = "sMode", required = false, doc = "How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS")
+    public RecalDataManager.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalDataManager.SOLID_RECAL_MODE.SET_Q_ZERO;
+    /**
+     * CountCovariates and TableRecalibration accept a --solid_nocall_strategy flag which governs how the recalibrator handles
+     * no calls in the color space tag. Unfortunately, because of the reference-inserted bases mentioned above, reads with no calls in
+     * their color space tag cannot be recalibrated.
+     */
+    @Argument(fullName = "solid_nocall_strategy", shortName = "solid_nocall_strategy", doc = "Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required = false)
+    public RecalDataManager.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION;
+
+    /**
+     * The context covariate will use a context of this size to calculate its covariate value for base mismatches
+     */
+    @Argument(fullName = "mismatches_context_size", shortName = "mcs", doc = "size of the k-mer context to be used for base mismatches", required = false)
+    public int MISMATCHES_CONTEXT_SIZE = 2;
+
+    /**
+     * The context covariate will use a context of this size to calculate its covariate value for base insertions
+     */
+    @Argument(fullName = "insertions_context_size", shortName = "ics", doc = "size of the k-mer context to be used for base insertions", required = false)
+    public int INSERTIONS_CONTEXT_SIZE = 8;
+
+    /**
+     * The context covariate will use a context of this size to calculate its covariate value for base deletions
+     */
+    @Argument(fullName = "deletions_context_size", shortName = "dcs", doc = "size of the k-mer context to be used for base deletions", required = false)
+    public int DELETIONS_CONTEXT_SIZE = 8;
+
+    /**
+     * The default base quality to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read with this default value. A negative value turns it off (default is off)
+     */
+    @Argument(fullName = "mismatches_default_quality", shortName = "mdq", doc = "default quality for the base mismatches covariate", required = false)
+    public byte MISMATCHES_DEFAULT_QUALITY = -1;
+
+    /**
+     * The default base quality to use as a prior (reported quality) in the insertion covariate model. This parameter is used for all reads without insertion quality scores for each base. (default is on)
+     */
+    @Argument(fullName = "insertions_default_quality", shortName = "idq", doc = "default quality for the base insertions covariate", required = false)
+    public byte INSERTIONS_DEFAULT_QUALITY = 45;
+
+    /**
+     * The default base quality to use as a prior (reported quality) in the deletion covariate model. This parameter is used for all reads without deletion quality scores for each base. (default is on)
+     */
+    @Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false)
+    public byte DELETIONS_DEFAULT_QUALITY = 45;
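+    // A hypothetical command line (file names invented, covariate name illustrative) showing
+    // how the arguments above combine on an invocation of the covariate counting walker:
+    //     java -jar GenomeAnalysisTK.jar -T CountCovariates -R ref.fasta -I reads.bam \
+    //         -knownSites dbsnp.vcf -cov CycleCovariate -standard \
+    //         -solid_recal_mode SET_Q_ZERO -solid_nocall_strategy PURGE_READ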
+    @Hidden
+    @Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
+    public String DEFAULT_PLATFORM = null;
+    @Hidden
+    @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
+    public String FORCE_PLATFORM = null;
+
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java
index cbbb3d43f6..833dce9328 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java
@@ -26,8 +26,10 @@
 package org.broadinstitute.sting.gatk.walkers.coverage;
 
 import net.sf.samtools.SAMReadGroupRecord;
+import org.broadinstitute.sting.commandline.Advanced;
 import org.broadinstitute.sting.commandline.Argument;
 import org.broadinstitute.sting.commandline.Output;
+import org.broadinstitute.sting.gatk.DownsampleType;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@@ -113,27 +115,13 @@
 // todo -- allow for user to set linear binning (default is logarithmic)
 // todo -- formatting --> do something special for end bins in getQuantile(int[] foo), this gets mushed into the end+-1 bins for now
 @By(DataSource.REFERENCE)
-@PartitionBy(PartitionType.INTERVAL)
+@PartitionBy(PartitionType.NONE)
+@Downsample(by= DownsampleType.NONE, toCoverage=Integer.MAX_VALUE)
 public class DepthOfCoverageWalker extends LocusWalker<Map<DoCOutputType.Partition,Map<String,int[]>>, CoveragePartitioner> implements TreeReducible<CoveragePartitioner> {
     @Output
     @Multiplex(value=DoCOutputMultiplexer.class,arguments={"partitionTypes","refSeqGeneList","omitDepthOutput","omitIntervals","omitSampleSummary","omitLocusTable"})
     Map<DoCOutputType,PrintStream> out;
-    /**
-     * Sets the low-coverage cutoff for granular binning. All loci with depth < START are counted in the first bin.
-     */
-    @Argument(fullName = "start", doc = "Starting (left endpoint) for granular binning", required = false)
-    int start = 1;
-    /**
-     * Sets the high-coverage cutoff for granular binning. All loci with depth > END are counted in the last bin.
-     */
-    @Argument(fullName = "stop", doc = "Ending (right endpoint) for granular binning", required = false)
-    int stop = 500;
-    /**
-     * Sets the number of bins for granular binning
-     */
-    @Argument(fullName = "nBins", doc = "Number of bins to use for granular binning", required = false)
-    int nBins = 499;
     @Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth. Defaults to -1.", required = false)
     int minMappingQuality = -1;
     @Argument(fullName = "maxMappingQuality", doc = "Maximum mapping quality of reads to count towards depth. Defaults to 2^31-1 (Integer.MAX_VALUE).", required = false)
@@ -142,16 +130,19 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<DoCOutputType.Partition,Map<String,int[]>>, CoveragePartitioner>
+    /**
+     * Sets the low-coverage cutoff for granular binning. All loci with depth < START are counted in the first bin.
+     */
+    @Advanced
+    @Argument(fullName = "start", doc = "Starting (left endpoint) for granular binning", required = false)
+    int start = 1;
+    /**
+     * Sets the high-coverage cutoff for granular binning. All loci with depth > END are counted in the last bin.
+     */
+    @Advanced
+    @Argument(fullName = "stop", doc = "Ending (right endpoint) for granular binning", required = false)
+    int stop = 500;
+    /**
+     * Sets the number of bins for granular binning
+     */
+    @Advanced
+    @Argument(fullName = "nBins", doc = "Number of bins to use for granular binning", required = false)
+    int nBins = 499;
+
     /**
      * Do not tabulate the sample summary statistics (total, mean, median, quartile coverage per sample)
      */
@@ -174,27 +209,22 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<DoCOutputType.Partition,Map<String,int[]>>, CoveragePartitioner>
     Collection<DoCOutputType.Partition> partitionTypes = EnumSet.of(DoCOutputType.Partition.sample);
+
     /**
      * Consider a spanning deletion as contributing to coverage. Also enables deletion counts in per-base output.
     */
+    @Advanced
     @Argument(fullName = "includeDeletions", shortName = "dels", doc = "Include information on deletions", required = false)
     boolean includeDeletions = false;
+
+    @Advanced
     @Argument(fullName = "ignoreDeletionSites", doc = "Ignore sites consisting only of deletions", required = false)
     boolean ignoreDeletionSites = false;
-    /**
-     * Path to the RefSeq file for use in aggregating coverage statistics over genes
-     */
-    @Argument(fullName = "calculateCoverageOverGenes", shortName = "geneList", doc = "Calculate the coverage statistics over this list of genes. Currently accepts RefSeq.", required = false)
-    File refSeqGeneList = null;
-    /**
-     * The format of the output file
-     */
-    @Argument(fullName = "outputFormat", doc = "the format of the output file (e.g. csv, table, rtable); defaults to r-readable table", required = false)
-    String outputFormat = "rtable";
     /**
      * A coverage threshold for summarizing (e.g. % bases >= CT for each sample)
      */
+    @Advanced
     @Argument(fullName = "summaryCoverageThreshold", shortName = "ct", doc = "for summary file outputs, report the % of bases covered to >= this number. Defaults to 15; can take multiple arguments.", required = false)
     int[] coverageThresholds = {15};
@@ -334,24 +364,29 @@ public CoveragePartitioner reduceInit() {
     }
 
     public Map<DoCOutputType.Partition,Map<String,int[]>> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+        if (includeRefNBases || BaseUtils.isRegularBase(ref.getBase())) {
+            if ( ! omitDepthOutput ) {
+                getCorrectStream(null, DoCOutputType.Aggregation.locus, DoCOutputType.FileType.summary).printf("%s",ref.getLocus()); // yes: print locus in map, and the rest of the info in reduce (for eventual cumulatives)
+                //System.out.printf("\t[log]\t%s",ref.getLocus());
+            }
 
-        if ( ! omitDepthOutput ) {
-            getCorrectStream(null, DoCOutputType.Aggregation.locus, DoCOutputType.FileType.summary).printf("%s",ref.getLocus()); // yes: print locus in map, and the rest of the info in reduce (for eventual cumulatives)
-            //System.out.printf("\t[log]\t%s",ref.getLocus());
+            return CoverageUtils.getBaseCountsByPartition(context,minMappingQuality,maxMappingQuality,minBaseQuality,maxBaseQuality,partitionTypes);
+        } else {
+            return null;
         }
-
-        return CoverageUtils.getBaseCountsByPartition(context,minMappingQuality,maxMappingQuality,minBaseQuality,maxBaseQuality,partitionTypes);
     }
 
     public CoveragePartitioner reduce(Map<DoCOutputType.Partition,Map<String,int[]>> thisMap, CoveragePartitioner prevReduce) {
-        if ( ! omitDepthOutput ) {
-            //checkOrder(prevReduce); // tests prevReduce.getIdentifiersByType().get(t) against the initialized header order
-            printDepths(getCorrectStream(null, DoCOutputType.Aggregation.locus, DoCOutputType.FileType.summary),thisMap,prevReduce.getIdentifiersByType());
-            // this is an additional iteration through thisMap, plus dealing with IO, so should be much slower without
-            // turning on omit
-        }
+        if ( thisMap != null ) { // skip sites we didn't want to include in the calculation (ref Ns)
+            if ( !
omitDepthOutput ) { + //checkOrder(prevReduce); // tests prevReduce.getIdentifiersByType().get(t) against the initialized header order + printDepths(getCorrectStream(null, DoCOutputType.Aggregation.locus, DoCOutputType.FileType.summary),thisMap,prevReduce.getIdentifiersByType()); + // this is an additional iteration through thisMap, plus dealing with IO, so should be much slower without + // turning on omit + } - prevReduce.update(thisMap); // note that in "useBoth" cases, this method alters the thisMap object + prevReduce.update(thisMap); // note that in "useBoth" cases, this method alters the thisMap object + } return prevReduce; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java new file mode 100755 index 0000000000..e7a2f74e23 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java @@ -0,0 +1,162 @@ +package org.broadinstitute.sting.gatk.walkers.diagnostics; + +import net.sf.samtools.SAMReadGroupRecord; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.report.GATKReport; +import org.broadinstitute.sting.gatk.report.GATKReportTable; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.io.PrintStream; + +/** + * Computes the read error rate per position in read (in the original 5'->3' orientation that the read had coming off the machine) + * + * Emits a GATKReport containing readgroup, cycle, mismatches, counts, qual, and error rate for each read + * group in the input BAMs FOR ONLY THE FIRST OF PAIR READS. + * + *
+ * <h2>Input</h2>
+ * <p>
+ * Any number of BAM files
+ * </p>
+ *
+ * <h2>Output</h2>
+ * <p>
+ * GATKReport containing readgroup, cycle, mismatches, counts, qual, and error rate.
+ *
+ * For example, running this tool on the NA12878 data sets:
+ *
+ * <pre>
+ *      ##:GATKReport.v0.2 ErrorRatePerCycle : The error rate per sequenced position in the reads
+ *      readgroup  cycle  mismatches  counts  qual  errorrate
+ *      20FUK.1        0          80   23368    25   3.47e-03
+ *      20FUK.1        1          40   23433    28   1.75e-03
+ *      20FUK.1        2          36   23453    28   1.58e-03
+ *      20FUK.1        3          26   23476    29   1.15e-03
+ *      20FUK.1        4          32   23495    29   1.40e-03
+ *      up to 101 cycles
+ *      20FUK.2        0          77   20886    24   3.73e-03
+ *      20FUK.2        1          28   20920    29   1.39e-03
+ *      20FUK.2        2          24   20931    29   1.19e-03
+ *      20FUK.2        3          30   20940    28   1.48e-03
+ *      20FUK.2        4          25   20948    29   1.24e-03
+ *      up to 101 cycles
+ *      20FUK.3        0          78   22038    24   3.58e-03
+ *      20FUK.3        1          40   22091    27   1.86e-03
+ *      20FUK.3        2          23   22108    30   1.09e-03
+ *      20FUK.3        3          36   22126    28   1.67e-03
+ *      </pre>
+ *
+ * <h2>Examples</h2>
+ * <pre>
+ *      -jar GenomeAnalysisTK.jar
+ *      -T ErrorRatePerCycle
+ *      -I bundle/current/b37/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam
+ *      -R bundle/current/b37/human_g1k_v37.fasta
+ *      -o example.gatkreport.txt
+ *  
+ *  </pre>
+ *
+ * @author Kiran Garimella, Mark DePristo
+ */
+public class ErrorRatePerCycle extends LocusWalker<Integer, Integer> {
+    @Output PrintStream out;
+    @Argument(fullName="min_base_quality_score", shortName="mbq", doc="Minimum base quality required to consider a base for calling", required=false)
+    public Integer MIN_BASE_QUAL = 0;
+    @Argument(fullName="min_mapping_quality_score", shortName="mmq", doc="Minimum read mapping quality required to consider a read for calling", required=false)
+    public Integer MIN_MAPPING_QUAL = 20;
+
+    private GATKReport report;
+    private GATKReportTable table;
+    private final static String reportName = "ErrorRatePerCycle";
+    private final static String reportDescription = "The error rate per sequenced position in the reads";
+
+    /**
+     * Allows us to use multiple records for the key (read group x cycle)
+     */
+    private static class TableKey implements Comparable<TableKey> {
+        final String readGroup;
+        final int cycle;
+
+        private TableKey(final String readGroup, final int cycle) {
+            this.readGroup = readGroup;
+            this.cycle = cycle;
+        }
+
+        @Override
+        public int compareTo(final TableKey tableKey) {
+            final int scmp = readGroup.compareTo(tableKey.readGroup);
+            if ( scmp == 0 )
+                return Integer.valueOf(cycle).compareTo(tableKey.cycle);
+            else
+                return scmp;
+        }
+    }
+
+    public void initialize() {
+        report = new GATKReport();
+        report.addTable(reportName, reportDescription);
+        table = report.getTable(reportName);
+        table.addPrimaryKey("key", false);
+        table.addColumn("readgroup", 0);
+        table.addColumn("cycle", 0);
+        table.addColumn("mismatches", 0);
+        table.addColumn("counts", 0);
+        table.addColumn("qual", 0);
+        table.addColumn("errorrate", 0.0f, "%.2e");
+    }
+
+    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+        for ( final PileupElement p : context.getBasePileup() ) {
+            final GATKSAMRecord read = p.getRead();
+            final int offset = p.getOffset();
+            final boolean firstOfPair = ! read.getReadPairedFlag() || read.getFirstOfPairFlag();
+
+            if ( firstOfPair && read.getMappingQuality() >= MIN_MAPPING_QUAL && p.getQual() >= MIN_BASE_QUAL ) {
+                final byte readBase = p.getBase();
+                final byte refBase = ref.getBase();
+                final int cycle = offset;
+
+                if ( BaseUtils.isRegularBase(readBase) && BaseUtils.isRegularBase(refBase) ) {
+                    final TableKey key = new TableKey(read.getReadGroup().getReadGroupId(), cycle);
+
+                    if ( !
table.containsKey(key) ) { + table.set(key, "cycle", cycle); + table.set(key, "readgroup", read.getReadGroup().getReadGroupId()); + } + + table.increment(key, "counts"); + if (readBase != refBase) + table.increment(key, "mismatches"); + } + } + } + + return null; + } + + public Integer reduceInit() { return null; } + + public Integer reduce(Integer value, Integer sum) { return null; } + + public void onTraversalDone(Integer sum) { + for ( final Object key : table.getPrimaryKeys() ) { + final int mismatches = (Integer)table.get(key, "mismatches"); + final int count = (Integer)table.get(key, "counts"); + final double errorRate = (mismatches + 1) / (1.0*(count + 1)); + final int qual = QualityUtils.probToQual(1-errorRate, 0.0); + table.set(key, "qual", qual); + table.set(key, "errorrate", errorRate); + } + + report.print(out); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java new file mode 100644 index 0000000000..14985907d4 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diagnostics; + +import net.sf.samtools.SAMReadGroupRecord; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.report.GATKReport; +import org.broadinstitute.sting.gatk.report.GATKReportTable; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.utils.Median; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.io.PrintStream; +import java.text.DateFormat; +import java.util.HashMap; +import java.util.Map; + +/** + * Emits a GATKReport containing read group, sample, library, platform, center, sequencing data, + * paired end status, simple read type name (e.g. 2x76) median insert size and median read length + * for each read group in every provided BAM file + * + * Note that this walker stops when all read groups have been observed at least a few thousand times so that + * the median statistics are well determined. 
It is safe to run this walker over a whole genome, and it will finish in a reasonable
+ * timeframe.
+ *
+ * <h2>Input</h2>
+ * <p>
+ * Any number of BAM files
+ * </p>
+ *
+ * <h2>Output</h2>
+ * <p>
+ * GATKReport containing read group, sample, library, platform, center, median insert size and median read length.
+ *
+ * For example, running this tool on the NA12878 data sets:
+ *
+ * <pre>
+ *      ##:GATKReport.v0.2 ReadGroupProperties : Table of read group properties
+ *      readgroup  sample   library       platform  center  date     has.any.reads  is.paired.end  n.reads.analyzed  simple.read.type  median.read.length  median.insert.size
+ *      20FUK.1    NA12878  Solexa-18483  illumina  BI      2/2/10   true           true                        498  2x101                            101                 386
+ *      20FUK.2    NA12878  Solexa-18484  illumina  BI      2/2/10   true           true                        476  2x101                            101                 417
+ *      20FUK.3    NA12878  Solexa-18483  illumina  BI      2/2/10   true           true                        407  2x101                            101                 387
+ *      20FUK.4    NA12878  Solexa-18484  illumina  BI      2/2/10   true           true                        389  2x101                            101                 415
+ *      20FUK.5    NA12878  Solexa-18483  illumina  BI      2/2/10   true           true                        433  2x101                            101                 386
+ *      20FUK.6    NA12878  Solexa-18484  illumina  BI      2/2/10   true           true                        480  2x101                            101                 418
+ *      20FUK.7    NA12878  Solexa-18483  illumina  BI      2/2/10   true           true                        450  2x101                            101                 386
+ *      20FUK.8    NA12878  Solexa-18484  illumina  BI      2/2/10   true           true                        438  2x101                            101                 418
+ *      20GAV.1    NA12878  Solexa-18483  illumina  BI      1/26/10  true           true                        490  2x101                            101                 391
+ *      20GAV.2    NA12878  Solexa-18484  illumina  BI      1/26/10  true           true                        485  2x101                            101                 417
+ *      20GAV.3    NA12878  Solexa-18483  illumina  BI      1/26/10  true           true                        460  2x101                            101                 392
+ *      20GAV.4    NA12878  Solexa-18484  illumina  BI      1/26/10  true           true                        434  2x101                            101                 415
+ *      20GAV.5    NA12878  Solexa-18483  illumina  BI      1/26/10  true           true                        479  2x101                            101                 389
+ *      20GAV.6    NA12878  Solexa-18484  illumina  BI      1/26/10  true           true                        461  2x101                            101                 416
+ *      20GAV.7    NA12878  Solexa-18483  illumina  BI      1/26/10  true           true                        509  2x101                            101                 386
+ *      20GAV.8    NA12878  Solexa-18484  illumina  BI      1/26/10  true           true                        476  2x101                            101                 410
+ *      </pre>
+ *
+ * <h2>Examples</h2>
+ * <pre>
+ *      -jar GenomeAnalysisTK.jar
+ *      -T ReadGroupProperties
+ *      -I example1.bam -I example2.bam etc
+ *      -R reference.fasta
+ *      -o example.gatkreport.txt
+ *  
+ *  </pre>
+ *
+ * @author Mark DePristo
+ */
+public class ReadGroupProperties extends ReadWalker<Integer, Integer> {
+    @Output
+    public PrintStream out;
+
+    @Argument(shortName="maxElementsForMedian", doc="Calculate median from the first maxElementsForMedian values observed", required=false)
+    public int MAX_VALUES_FOR_MEDIAN = 10000;
+
+    private final static String TABLE_NAME = "ReadGroupProperties";
+    private final Map<String, PerReadGroupInfo> readGroupInfo = new HashMap<String, PerReadGroupInfo>();
+
+    private class PerReadGroupInfo {
+        public final Median readLength = new Median(MAX_VALUES_FOR_MEDIAN);
+        public final Median insertSize = new Median(MAX_VALUES_FOR_MEDIAN);
+        public int nReadsSeen = 0, nReadsPaired = 0;
+
+        public boolean needsMoreData() {
+            return ! readLength.isFull() || ! insertSize.isFull();
+        }
+    }
+
+    @Override
+    public void initialize() {
+        for ( final SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) {
+            readGroupInfo.put(rg.getId(), new PerReadGroupInfo());
+        }
+    }
+
+    @Override
+    public boolean filter(ReferenceContext ref, GATKSAMRecord read) {
+        return ! (read.getReadFailsVendorQualityCheckFlag() || read.getReadUnmappedFlag());
+    }
+
+    @Override
+    public boolean isDone() {
+        for ( PerReadGroupInfo info : readGroupInfo.values() ) {
+            if ( info.needsMoreData() )
+                return false;
+        }
+
+        return true;
+    }
+
+    @Override
+    public Integer map(ReferenceContext referenceContext, GATKSAMRecord read, ReadMetaDataTracker readMetaDataTracker) {
+        final String rgID = read.getReadGroup().getId();
+        final PerReadGroupInfo info = readGroupInfo.get(rgID);
+
+        if ( info.needsMoreData() ) {
+            info.readLength.add(read.getReadLength());
+            info.nReadsSeen++;
+            if ( read.getReadPairedFlag() ) {
+                info.nReadsPaired++;
+                if ( read.getInferredInsertSize() != 0) {
+                    info.insertSize.add(Math.abs(read.getInferredInsertSize()));
+                }
+            }
+        }
+
+        return null;
+    }
+
+    @Override
+    public Integer reduceInit() {
+        return null;
+    }
+
+    @Override
+    public Integer reduce(Integer integer, Integer integer1) {
+        return null;
+    }
+
+    @Override
+    public void onTraversalDone(Integer sum) {
+        final GATKReport report = new GATKReport();
+        report.addTable(TABLE_NAME, "Table of read group properties");
+        GATKReportTable table = report.getTable(TABLE_NAME);
+        DateFormat dateFormatter = DateFormat.getDateInstance(DateFormat.SHORT);
+
+        table.addPrimaryKey("readgroup");
+        //* Emits a GATKReport containing read group, sample, library, platform, center, median insert size and
+        //* median read length for each read group in every BAM file.
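+        // The remaining columns default to "NA"/0 and are overwritten per read group below;
+        // for example (illustrative values), a paired-end read group with a median read
+        // length of 101 ends up with the simple.read.type string "2x101".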
+        table.addColumn("sample", "NA");
+        table.addColumn("library", "NA");
+        table.addColumn("platform", "NA");
+        table.addColumn("center", "NA");
+        table.addColumn("date", "NA");
+        table.addColumn("has.any.reads", "false");
+        table.addColumn("is.paired.end", "false");
+        table.addColumn("n.reads.analyzed", "NA");
+        table.addColumn("simple.read.type", "NA");
+        table.addColumn("median.read.length", Integer.valueOf(0));
+        table.addColumn("median.insert.size", Integer.valueOf(0));
+
+        for ( final SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) {
+            final String rgID = rg.getId();
+            PerReadGroupInfo info = readGroupInfo.get(rgID);
+
+            // we are paired if > 25% of reads are paired
+            final boolean isPaired = info.nReadsPaired / (1.0 * (info.nReadsSeen+1)) > 0.25;
+            final boolean hasAnyReads = info.nReadsSeen > 0;
+            final int readLength = info.readLength.getMedian(0);
+
+            setTableValue(table, rgID, "sample", rg.getSample());
+            setTableValue(table, rgID, "library", rg.getLibrary());
+            setTableValue(table, rgID, "platform", rg.getPlatform());
+            setTableValue(table, rgID, "center", rg.getSequencingCenter());
+            try {
+                setTableValue(table, rgID, "date", rg.getRunDate() != null ? dateFormatter.format(rg.getRunDate()) : "NA");
+            } catch ( NullPointerException e ) {
+                // TODO: remove me when the bug in Picard that causes an NPE when the date isn't present is fixed
+                setTableValue(table, rgID, "date", "NA");
+            }
+            setTableValue(table, rgID, "has.any.reads", hasAnyReads);
+            setTableValue(table, rgID, "is.paired.end", isPaired);
+            setTableValue(table, rgID, "n.reads.analyzed", info.nReadsSeen);
+            setTableValue(table, rgID, "simple.read.type", hasAnyReads ? String.format("%dx%d", isPaired ? 2 : 1, readLength) : "NA");
+            setTableValue(table, rgID, "median.read.length", hasAnyReads ? readLength : "NA" );
+            setTableValue(table, rgID, "median.insert.size", hasAnyReads && isPaired ? info.insertSize.getMedian(0) : "NA" );
+        }
+
+        report.print(out);
+    }
+
+    private final void setTableValue(GATKReportTable table, final String rgID, final String key, final Object value) {
+        table.set(rgID, key, value == null ? "NA" : value);
+    }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java
new file mode 100644
index 0000000000..60f20074ae
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java
@@ -0,0 +1,22 @@
+package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
+
+/**
+ * Short one line description of the walker.
+ *
+ * @author Mauricio Carneiro
+ * @since 2/1/12
+ */
+public enum CallableStatus {
+    /** the reference base was an N, which is not considered callable by the GATK */
+    REF_N,
+    /** the base satisfied the minimum depth for calling and stayed below maxDepth, avoiding EXCESSIVE_COVERAGE */
+    CALLABLE,
+    /** absolutely no reads were seen at this locus, regardless of the filtering parameters */
+    NO_COVERAGE,
+    /** there were fewer than the minimum-depth number of bases at the locus, after applying filters */
+    LOW_COVERAGE,
+    /** more than -maxDepth reads at the locus, indicating some sort of mapping problem */
+    EXCESSIVE_COVERAGE,
+    /** more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating poor mapping quality of the reads */
+    POOR_QUALITY
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java
new file mode 100644
index 0000000000..979fb665f1
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java
@@ -0,0 +1,172 @@
+package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
+
+import org.broad.tribble.Feature;
+import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.Input;
+import org.broadinstitute.sting.commandline.IntervalBinding;
+import org.broadinstitute.sting.commandline.Output;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.By;
+import org.broadinstitute.sting.gatk.walkers.DataSource;
+import org.broadinstitute.sting.gatk.walkers.LocusWalker;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.GenomeLocComparator;
+import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+
+import java.io.PrintStream;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.TreeSet;
+
+/**
+ * Short one line description of the walker.
+ *
+ * <p>
+ * [Long description of the walker]
+ * </p>
+ *
+ * <h2>Input</h2>
+ * <p>
+ * [Description of the Input]
+ * </p>
+ *
+ * <h2>Output</h2>
+ * <p>
+ * [Description of the Output]
+ * </p>
+ *
+ * <h2>Examples</h2>
+ * <pre>
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T [walker name]
+ *  </pre>
+ *
+ * @author Mauricio Carneiro
+ * @since 2/1/12
+ */
+@By(value = DataSource.READS)
+public class DiagnoseTargets extends LocusWalker<Long, Long> {
+    @Input(fullName = "interval_track", shortName = "int", doc = "", required = true)
+    private IntervalBinding<Feature> intervalTrack = null;
+
+    @Output
+    private PrintStream out = System.out;
+
+    @Argument(fullName = "expand_interval", shortName = "exp", doc = "", required = false)
+    private int expandInterval = 50;
+
+    @Argument(fullName = "minimum_base_quality", shortName = "mbq", doc = "", required = false)
+    private int minimumBaseQuality = 20;
+
+    @Argument(fullName = "minimum_mapping_quality", shortName = "mmq", doc = "", required = false)
+    private int minimumMappingQuality = 20;
+
+    @Argument(fullName = "minimum_coverage", shortName = "mincov", doc = "", required = false)
+    private int minimumCoverage = 5;
+
+    @Argument(fullName = "maximum_coverage", shortName = "maxcov", doc = "", required = false)
+    private int maximumCoverage = 700;
+
+    private TreeSet<GenomeLoc> intervalList = null;                    // The list of intervals of interest (plus expanded intervals if the user wants them)
+    private HashMap<GenomeLoc, IntervalStatistics> intervalMap = null; // interval => statistics
+    private Iterator<GenomeLoc> intervalListIterator;                  // An iterator to go over all the intervals provided as we traverse the genome
+    private GenomeLoc currentInterval = null;                          // The "current" interval loaded and being filled with statistics
+    private IntervalStatistics currentIntervalStatistics = null;       // The statistics object for the "current" interval
+
+    private GenomeLocParser parser;                                    // just an object to allow us to create genome locs (for the expanded intervals)
+
+    @Override
+    public void initialize() {
+        super.initialize();
+
+        if (intervalTrack == null)
+            throw new UserException("This tool currently only works if you provide an interval track");
+
+        parser = new GenomeLocParser(getToolkit().getMasterSequenceDictionary()); // Important to initialize the parser before creating the intervals below
+
+        List<GenomeLoc> originalList = intervalTrack.getIntervals(getToolkit()); // The original list of targets provided by the user that will be expanded or not depending on the options provided
+        intervalList = new TreeSet<GenomeLoc>(new GenomeLocComparator());
+        intervalMap = new HashMap<GenomeLoc, IntervalStatistics>(originalList.size() * 2);
+        for (GenomeLoc interval : originalList)
+            addAndExpandIntervalToLists(interval);
+
+        intervalListIterator = intervalList.iterator();
+    }
+
+    @Override
+    public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+        GenomeLoc refLocus = ref.getLocus();
+        while (currentInterval == null || currentInterval.isBefore(refLocus)) {
+            if (!intervalListIterator.hasNext())
+                return 0L;
+
+            currentInterval = intervalListIterator.next();
+            currentIntervalStatistics = intervalMap.get(currentInterval);
+        }
+
+        if (currentInterval.isPast(refLocus))
+            return 0L;
+
+        byte[] mappingQualities = context.getBasePileup().getMappingQuals();
+        byte[] baseQualities = context.getBasePileup().getQuals();
+        int coverage = context.getBasePileup().getBaseAndMappingFilteredPileup(minimumBaseQuality, minimumMappingQuality).depthOfCoverage();
+        int rawCoverage = context.size();
+
+        IntervalStatisticLocus locusData = new IntervalStatisticLocus(mappingQualities, baseQualities, coverage, rawCoverage);
+        currentIntervalStatistics.addLocus(refLocus, locusData);
+
+        return 1L;
+    }
+
+    @Override
+    public Long reduceInit() {
+        return 0L;
+    }
+
+    @Override
+    public Long reduce(Long value, Long sum) {
+        return sum + value;
+    }
+
+    @Override
+    public void
onTraversalDone(Long result) { + super.onTraversalDone(result); + out.println("Interval\tCallStatus\tCOV\tAVG"); + for (GenomeLoc interval : intervalList) { + IntervalStatistics stats = intervalMap.get(interval); + out.println(String.format("%s\t%s\t%d\t%f", interval, stats.callableStatus(), stats.totalCoverage(), stats.averageCoverage())); + } + } + + private GenomeLoc createIntervalBefore(GenomeLoc interval) { + int start = Math.max(interval.getStart() - expandInterval, 0); + int stop = Math.max(interval.getStart() - 1, 0); + return parser.createGenomeLoc(interval.getContig(), interval.getContigIndex(), start, stop); + } + + private GenomeLoc createIntervalAfter(GenomeLoc interval) { + int contigLimit = getToolkit().getSAMFileHeader().getSequenceDictionary().getSequence(interval.getContigIndex()).getSequenceLength(); + int start = Math.min(interval.getStop() + 1, contigLimit); + int stop = Math.min(interval.getStop() + expandInterval, contigLimit); + return parser.createGenomeLoc(interval.getContig(), interval.getContigIndex(), start, stop); + } + + private void addAndExpandIntervalToLists(GenomeLoc interval) { + if (expandInterval > 0) { + GenomeLoc before = createIntervalBefore(interval); + GenomeLoc after = createIntervalAfter(interval); + intervalList.add(before); + intervalList.add(after); + intervalMap.put(before, new IntervalStatistics(before, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); + intervalMap.put(after, new IntervalStatistics(after, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); + } + intervalList.add(interval); + intervalMap.put(interval, new IntervalStatistics(interval, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatisticLocus.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatisticLocus.java new file mode 100644 index 0000000000..5620c3902a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatisticLocus.java @@ -0,0 +1,34 @@ +package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; + +/** + * The definition of a locus for the DiagnoseTargets walker statistics calculation + * + * @author Mauricio Carneiro + * @since 2/3/12 + */ +class IntervalStatisticLocus { + private final byte[] mappingQuality; + private final byte[] baseQuality; + private final int coverage; + private final int rawCoverage; + + public IntervalStatisticLocus(byte[] mappingQuality, byte[] baseQuality, int coverage, int rawCoverage) { + this.mappingQuality = mappingQuality; + this.baseQuality = baseQuality; + this.coverage = coverage; + this.rawCoverage = rawCoverage; + } + + public IntervalStatisticLocus() { + this(new byte[1], new byte[1], 0, 0); + } + + public int getCoverage() { + return coverage; + } + + public int getRawCoverage() { + return rawCoverage; + } + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java new file mode 100644 index 0000000000..8ee5f76fb7 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java @@ -0,0 +1,122 @@ +package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; + +import org.broadinstitute.sting.utils.GenomeLoc; +import 
org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+
+/**
+ * Per-interval statistics (coverage counts and callable status) accumulated by the
+ * DiagnoseTargets walker, one object per interval of interest.
+ *
+ * @author Mauricio Carneiro
+ * @since 2/1/12
+ */
+class IntervalStatistics {
+    private final GenomeLoc interval;
+    private final ArrayList<IntervalStatisticLocus> loci;
+
+    private final int minimumCoverageThreshold;
+    private final int maximumCoverageThreshold;
+    private final int minimumMappingQuality;
+    private final int minimumBaseQuality;
+
+    private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet)
+
+    private IntervalStatistics(GenomeLoc interval, ArrayList<IntervalStatisticLocus> loci, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) {
+        this.interval = interval;
+        this.loci = loci;
+        this.minimumCoverageThreshold = minimumCoverageThreshold;
+        this.maximumCoverageThreshold = maximumCoverageThreshold;
+        this.minimumMappingQuality = minimumMappingQuality;
+        this.minimumBaseQuality = minimumBaseQuality;
+    }
+
+    public IntervalStatistics(GenomeLoc interval, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) {
+        this(interval, new ArrayList<IntervalStatisticLocus>(interval.size()), minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality);
+
+        // Initialize every locus (this way we don't have to worry about non-existent loci in the object)
+        for (int i = 0; i < interval.size(); i++)
+            this.loci.add(i, new IntervalStatisticLocus());
+    }
+
+    public long totalCoverage() {
+        if (preComputedTotalCoverage < 0)
+            calculateTotalCoverage();
+        return preComputedTotalCoverage;
+    }
+
+    public double averageCoverage() {
+        if (preComputedTotalCoverage < 0)
+            calculateTotalCoverage();
+        return (double) preComputedTotalCoverage / loci.size();
+    }
+
+    /**
+     * Calculates the callable status of the entire interval
+     *
+     * @return the callable status of the entire interval (the status observed at the most loci)
+     */
+    public CallableStatus callableStatus() {
+        long max = -1;
+        CallableStatus maxCallableStatus = null;
+        HashMap<CallableStatus, Integer> statusCounts = new HashMap<CallableStatus, Integer>(CallableStatus.values().length);
+
+        // initialize the statusCounts with all callable states
+        for (CallableStatus key : CallableStatus.values())
+            statusCounts.put(key, 0);
+
+        // calculate the callable status for each locus and keep track of the most common one
+        for (int i = 0; i < loci.size(); i++) {
+            CallableStatus status = callableStatus(i);
+            int count = statusCounts.get(status) + 1;
+            statusCounts.put(status, count);
+
+            if (count > max) {
+                max = count;
+                maxCallableStatus = status;
+            }
+        }
+
+        return maxCallableStatus;
+    }
+
+    public void addLocus(GenomeLoc locus, IntervalStatisticLocus locusData) {
+        if (!interval.containsP(locus))
+            throw new ReviewedStingException(String.format("Locus %s is not part of the Interval", locus));
+
+        int locusIndex = locus.getStart() - interval.getStart();
+
+        loci.set(locusIndex, locusData); // overwrite the pre-filled placeholder; add(index, ..) would shift the list and grow it
+    }
+
+    /**
+     * returns the callable status of this locus without taking the reference base into account.
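+     * <p>A worked example (values are illustrative, using the DiagnoseTargets defaults of
+     * minimum coverage 5 and maximum coverage 700): filtered coverage 800 gives EXCESSIVE_COVERAGE;
+     * filtered coverage 10 gives CALLABLE; filtered coverage 2 with raw coverage 9 gives
+     * POOR_QUALITY; filtered coverage 0 with raw coverage 3 gives LOW_COVERAGE; and raw
+     * coverage 0 gives NO_COVERAGE.</p>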
+ * + * @param locusIndex location in the genome to inquire (only one locus) + * @return the callable status of a locus + */ + private CallableStatus callableStatus(int locusIndex) { + if (loci.get(locusIndex).getCoverage() > maximumCoverageThreshold) + return CallableStatus.EXCESSIVE_COVERAGE; + + if (loci.get(locusIndex).getCoverage() >= minimumCoverageThreshold) + return CallableStatus.CALLABLE; + + if (loci.get(locusIndex).getRawCoverage() >= minimumCoverageThreshold) + return CallableStatus.POOR_QUALITY; + + if (loci.get(locusIndex).getRawCoverage() > 0) + return CallableStatus.LOW_COVERAGE; + + return CallableStatus.NO_COVERAGE; + } + + private void calculateTotalCoverage() { + preComputedTotalCoverage = 0; + for (IntervalStatisticLocus locus : loci) + preComputedTotalCoverage += locus.getCoverage(); + } + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java index 8278dbab76..7eb6fad542 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java @@ -139,6 +139,12 @@ public class VariantFiltrationWalker extends RodWalker { @Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, missing values should be considered failing the expression", required=false) protected Boolean FAIL_MISSING_VALUES = false; + /** + * Invalidate previous filters applied to the VariantContext, applying only the filters here + */ + @Argument(fullName="invalidatePreviousFilters",doc="Remove previous filters applied to the VCF",required=false) + boolean invalidatePrevious = false; + // JEXL expressions for the filters List filterExps; List genotypeFilterExps; @@ -215,6 +221,9 @@ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentCo for ( VariantContext vc : VCs ) { + if ( invalidatePrevious ) { + vc = (new VariantContextBuilder(vc)).filters(new HashSet()).make(); + } // filter based on previous mask position if ( previousMaskPosition != null && // we saw a previous mask site previousMaskPosition.getContig().equals(vc.getChr()) && // it's on the same contig diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java index 681cc1fa68..9f2403bbf7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java @@ -27,7 +27,7 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.PrintStream; import java.util.List; @@ -41,10 +41,11 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { public enum Model { /** The default model with the best performance in all cases */ - EXACT, + EXACT } protected int N; + protected int MAX_ALTERNATE_ALLELES_TO_GENOTYPE; protected Logger logger; protected PrintStream verboseWriter; @@ -53,20 +54,21 @@ protected enum GenotypeType { AA, AB, BB } protected static final double VALUE_NOT_CALCULATED 
= Double.NEGATIVE_INFINITY; - protected AlleleFrequencyCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { + protected AlleleFrequencyCalculationModel(final UnifiedArgumentCollection UAC, final int N, final Logger logger, final PrintStream verboseWriter) { this.N = N; + this.MAX_ALTERNATE_ALLELES_TO_GENOTYPE = UAC.MAX_ALTERNATE_ALLELES; this.logger = logger; this.verboseWriter = verboseWriter; } /** * Must be overridden by concrete subclasses - * @param GLs genotype likelihoods - * @param Alleles Alleles corresponding to GLs + * @param vc variant context with alleles and genotype likelihoods * @param log10AlleleFrequencyPriors priors * @param result (pre-allocated) object to store likelihoods results + * @return the alleles used for genotyping */ - protected abstract void getLog10PNonRef(GenotypesContext GLs, List Alleles, - double[][] log10AlleleFrequencyPriors, - AlleleFrequencyCalculationResult result); + protected abstract List getLog10PNonRef(final VariantContext vc, + final double[][] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result); } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java index 295cf86884..7143606aeb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java @@ -275,19 +275,22 @@ public int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQ public int add(PileupElement elt, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { byte obsBase = elt.getBase(); byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); + if ( qual == 0 ) + return 0; - if ( elt.isReducedRead() ) { + if ( elt.getRead().isReducedRead() ) { // reduced read representation if ( BaseUtils.isRegularBase( obsBase )) { - add(obsBase, qual, (byte)0, (byte)0, elt.getRepresentativeCount()); // fast calculation of n identical likelihoods - return elt.getRepresentativeCount(); // we added nObs bases here + int representativeCount = elt.getRepresentativeCount(); + add(obsBase, qual, (byte)0, (byte)0, representativeCount); // fast calculation of n identical likelihoods + return representativeCount; // we added nObs bases here } // odd bases or deletions => don't use them return 0; } - return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0, 1) : 0; + return add(obsBase, qual, (byte)0, (byte)0, 1); } public int add(List overlappingPair, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { @@ -519,7 +522,7 @@ private static byte qualToUse(PileupElement p, boolean ignoreBadBases, boolean c if ( qual > SAMUtils.MAX_PHRED_SCORE ) throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. 
Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName())); if ( capBaseQualsAtMappingQual ) - qual = (byte)Math.min((int)p.getQual(), p.getMappingQual()); + qual = (byte)Math.min((int)qual, p.getMappingQual()); if ( (int)qual < minBaseQual ) qual = (byte)0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index aa743f86fc..ed737064db 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -35,25 +35,89 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { - private final static boolean DEBUG = false; + // private final static boolean DEBUG = false; private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 - protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { super(UAC, N, logger, verboseWriter); } - public void getLog10PNonRef(final GenotypesContext GLs, - final List alleles, - final double[][] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result) { - final int numAlleles = alleles.size(); + public List getLog10PNonRef(final VariantContext vc, + final double[][] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result) { + + GenotypesContext GLs = vc.getGenotypes(); + List alleles = vc.getAlleles(); + + // don't try to genotype too many alternate alleles + if ( vc.getAlternateAlleles().size() > MAX_ALTERNATE_ALLELES_TO_GENOTYPE ) { + logger.warn("this tool is currently set to genotype at most " + MAX_ALTERNATE_ALLELES_TO_GENOTYPE + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); + + alleles = new ArrayList(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1); + alleles.add(vc.getReference()); + alleles.addAll(chooseMostLikelyAlternateAlleles(vc, MAX_ALTERNATE_ALLELES_TO_GENOTYPE)); + GLs = UnifiedGenotyperEngine.subsetAlleles(vc, alleles, false); + } //linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); - linearExactMultiAllelic(GLs, numAlleles - 1, log10AlleleFrequencyPriors, result, false); + linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result, false); + + return alleles; } + private static final class LikelihoodSum implements Comparable { + public double sum = 0.0; + public Allele allele; + + public LikelihoodSum(Allele allele) { this.allele = allele; } + + public int compareTo(LikelihoodSum other) { + final double diff = sum - other.sum; + return ( diff < 0.0 ) ? 1 : (diff > 0.0 ) ? 
-1 : 0; + } + } + + private static final int PL_INDEX_OF_HOM_REF = 0; + private static final List chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose) { + final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); + final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles]; + for ( int i = 0; i < numOriginalAltAlleles; i++ ) + likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i)); + + // make sure that we've cached enough data + if ( numOriginalAltAlleles > UnifiedGenotyperEngine.PLIndexToAlleleIndex.length - 1 ) + UnifiedGenotyperEngine.calculatePLcache(numOriginalAltAlleles); + + // based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype + final ArrayList GLs = getGLs(vc.getGenotypes()); + for ( final double[] likelihoods : GLs ) { + final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); + if ( PLindexOfBestGL != PL_INDEX_OF_HOM_REF ) { + int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[numOriginalAltAlleles][PLindexOfBestGL]; + if ( alleles[0] != 0 ) + likelihoodSums[alleles[0]-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; + // don't double-count it + if ( alleles[1] != 0 && alleles[1] != alleles[0] ) + likelihoodSums[alleles[1]-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; + } + } + + // sort them by probability mass and choose the best ones + Collections.sort(Arrays.asList(likelihoodSums)); + final ArrayList bestAlleles = new ArrayList(numAllelesToChoose); + for ( int i = 0; i < numAllelesToChoose; i++ ) + bestAlleles.add(likelihoodSums[i].allele); + + final ArrayList orderedBestAlleles = new ArrayList(numAllelesToChoose); + for ( Allele allele : vc.getAlternateAlleles() ) { + if ( bestAlleles.contains(allele) ) + orderedBestAlleles.add(allele); + } + + return orderedBestAlleles; + } + private static final ArrayList getGLs(GenotypesContext GLs) { ArrayList genotypeLikelihoods = new ArrayList(GLs.size()); @@ -70,47 +134,6 @@ private static final ArrayList getGLs(GenotypesContext GLs) { return genotypeLikelihoods; } - - final static double approximateLog10SumLog10(double[] vals) { - if ( vals.length < 2 ) - throw new ReviewedStingException("Passing array with fewer than 2 values when computing approximateLog10SumLog10"); - - double approx = approximateLog10SumLog10(vals[0], vals[1]); - for ( int i = 2; i < vals.length; i++ ) - approx = approximateLog10SumLog10(approx, vals[i]); - return approx; - } - - final static double approximateLog10SumLog10(double small, double big) { - // make sure small is really the smaller value - if ( small > big ) { - final double t = big; - big = small; - small = t; - } - - if (small == Double.NEGATIVE_INFINITY || big == Double.NEGATIVE_INFINITY ) - return big; - - if (big >= small + MathUtils.MAX_JACOBIAN_TOLERANCE) - return big; - - // OK, so |y-x| < tol: we use the following identity then: - // we need to compute log10(10^x + 10^y) - // By Jacobian logarithm identity, this is equal to - // max(x,y) + log10(1+10^-abs(x-y)) - // we compute the second term as a table lookup - // with integer quantization - // we have pre-stored correction for 0,0.1,0.2,... 
10.0 - //final int ind = (int)(((big-small)/JACOBIAN_LOG_TABLE_STEP)); // hard rounding - int ind = (int)(Math.round((big-small)/MathUtils.JACOBIAN_LOG_TABLE_STEP)); // hard rounding - - //double z =Math.log10(1+Math.pow(10.0,-diff)); - //System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind); - return big + MathUtils.jacobianLogTable[ind]; - } - - // ------------------------------------------------------------------------------------- // // Multi-allelic implementation. @@ -207,7 +230,7 @@ public static void linearExactMultiAllelic(final GenotypesContext GLs, final int numChr = 2*numSamples; // queue of AC conformations to process - final Queue ACqueue = new LinkedList(); + final LinkedList ACqueue = new LinkedList(); // mapping of ExactACset indexes to the objects final HashMap indexesToACset = new HashMap(numChr+1); @@ -218,40 +241,57 @@ public static void linearExactMultiAllelic(final GenotypesContext GLs, ACqueue.add(zeroSet); indexesToACset.put(zeroSet.ACcounts, zeroSet); + // optimization: create the temporary storage for computing L(j,k) just once + final int maxPossibleDependencies = numAlternateAlleles + (numAlternateAlleles * (numAlternateAlleles + 1) / 2) + 1; + final double[][] tempLog10ConformationLikelihoods = new double[numSamples+1][maxPossibleDependencies]; + for ( int i = 0; i < maxPossibleDependencies; i++ ) + tempLog10ConformationLikelihoods[0][i] = Double.NEGATIVE_INFINITY; + // keep processing while we have AC conformations that need to be calculated double maxLog10L = Double.NEGATIVE_INFINITY; while ( !ACqueue.isEmpty() ) { // compute log10Likelihoods final ExactACset set = ACqueue.remove(); - final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLog10L, numChr, preserveData, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result); + final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLog10L, numChr, preserveData, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result, tempLog10ConformationLikelihoods); // adjust max likelihood seen if needed maxLog10L = Math.max(maxLog10L, log10LofKs); } } + private static final class DependentSet { + public final int[] ACcounts; + public final int PLindex; + + public DependentSet(final int[] ACcounts, final int PLindex) { + this.ACcounts = ACcounts; + this.PLindex = PLindex; + } + } + private static double calculateAlleleCountConformation(final ExactACset set, final ArrayList genotypeLikelihoods, final double maxLog10L, final int numChr, final boolean preserveData, - final Queue ACqueue, + final LinkedList ACqueue, final HashMap indexesToACset, final double[][] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result) { + final AlleleFrequencyCalculationResult result, + final double[][] tempLog10ConformationLikelihoods) { - if ( DEBUG ) - System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts); + //if ( DEBUG ) + // System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts); // compute the log10Likelihoods - computeLofK(set, genotypeLikelihoods, indexesToACset, log10AlleleFrequencyPriors, result); + computeLofK(set, genotypeLikelihoods, indexesToACset, log10AlleleFrequencyPriors, result, tempLog10ConformationLikelihoods); // clean up memory if ( !preserveData ) { for ( ExactACcounts index : set.dependentACsetsToDelete ) { - indexesToACset.put(index, null); - if ( DEBUG ) - System.out.printf(" *** removing used set=%s after seeing final dependent set=%s%n", index, set.ACcounts); + 
indexesToACset.remove(index); + //if ( DEBUG ) + // System.out.printf(" *** removing used set=%s after seeing final dependent set=%s%n", index, set.ACcounts); } } @@ -259,12 +299,12 @@ private static double calculateAlleleCountConformation(final ExactACset set, // can we abort early because the log10Likelihoods are so small? if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) { - if ( DEBUG ) - System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L); + //if ( DEBUG ) + // System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L); // no reason to keep this data around because nothing depends on it if ( !preserveData ) - indexesToACset.put(set.ACcounts, null); + indexesToACset.remove(set.ACcounts); return log10LofK; } @@ -274,7 +314,6 @@ private static double calculateAlleleCountConformation(final ExactACset set, if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies return log10LofK; - ExactACset lastSet = null; // keep track of the last set placed in the queue so that we can tell it to clean us up when done processing final int numAltAlleles = set.ACcounts.getCounts().length; // genotype likelihoods are a linear vector that can be thought of as a row-wise upper triangular matrix of log10Likelihoods. @@ -285,30 +324,40 @@ private static double calculateAlleleCountConformation(final ExactACset set, for ( int allele = 0; allele < numAltAlleles; allele++ ) { final int[] ACcountsClone = set.ACcounts.getCounts().clone(); ACcountsClone[allele]++; - lastSet = updateACset(ACcountsClone, numChr, set, ++PLindex, ACqueue, indexesToACset); + updateACset(ACcountsClone, numChr, set, ++PLindex, ACqueue, indexesToACset); } // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different if ( ACwiggle > 1 ) { + final ArrayList differentAlleles = new ArrayList(numAltAlleles * numAltAlleles); + final ArrayList sameAlleles = new ArrayList(numAltAlleles); + for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) { for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) { final int[] ACcountsClone = set.ACcounts.getCounts().clone(); ACcountsClone[allele_i]++; ACcountsClone[allele_j]++; - lastSet = updateACset(ACcountsClone, numChr, set, ++PLindex , ACqueue, indexesToACset); + + if ( allele_i == allele_j ) + sameAlleles.add(new DependentSet(ACcountsClone, ++PLindex)); + else + differentAlleles.add(new DependentSet(ACcountsClone, ++PLindex)); } } + + // IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering + for ( DependentSet dependent : differentAlleles ) + updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset); + for ( DependentSet dependent : sameAlleles ) + updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset); } - // if the last dependent set was not at the back of the queue (i.e. 
not just added), then we need to iterate - // over all the dependent sets to find the last one in the queue (otherwise it will be cleaned up too early) - if ( !preserveData && lastSet == null ) { - if ( DEBUG ) - System.out.printf(" *** iterating over dependent sets for set=%s%n", set.ACcounts); - lastSet = determineLastDependentSetInQueue(set.ACcounts, ACqueue); + // determine which is the last dependent set in the queue (not necessarily the last one added above) so we can know when it is safe to clean up this column + if ( !preserveData ) { + final ExactACset lastSet = determineLastDependentSetInQueue(set.ACcounts, ACqueue); + if ( lastSet != null ) + lastSet.dependentACsetsToDelete.add(set.ACcounts); } - if ( lastSet != null ) - lastSet.dependentACsetsToDelete.add(set.ACcounts); return log10LofK; } @@ -316,41 +365,44 @@ private static double calculateAlleleCountConformation(final ExactACset set, // adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and // also adds it as a dependency to the given callingSetIndex. // returns the ExactACset if that set was not already in the queue and null otherwise. - private static ExactACset updateACset(final int[] ACcounts, - final int numChr, - final ExactACset callingSet, - final int PLsetIndex, - final Queue ACqueue, - final HashMap indexesToACset) { + private static void updateACset(final int[] ACcounts, + final int numChr, + final ExactACset callingSet, + final int PLsetIndex, + final Queue ACqueue, + final HashMap indexesToACset) { final ExactACcounts index = new ExactACcounts(ACcounts); - boolean wasInQueue = true; if ( !indexesToACset.containsKey(index) ) { ExactACset set = new ExactACset(numChr/2 +1, index); indexesToACset.put(index, set); ACqueue.add(set); - wasInQueue = false; } // add the given dependency to the set + //if ( DEBUG ) + // System.out.println(" *** adding dependency from " + index + " to " + callingSet.ACcounts); final ExactACset set = indexesToACset.get(index); set.ACsetIndexToPLIndex.put(callingSet.ACcounts, PLsetIndex); - return wasInQueue ? 
null : set; } - private static ExactACset determineLastDependentSetInQueue(final ExactACcounts callingSetIndex, final Queue ACqueue) { - ExactACset set = null; - for ( ExactACset queued : ACqueue ) { - if ( queued.dependentACsetsToDelete.contains(callingSetIndex) ) - set = queued; + private static ExactACset determineLastDependentSetInQueue(final ExactACcounts callingSetIndex, final LinkedList ACqueue) { + Iterator reverseIterator = ACqueue.descendingIterator(); + while ( reverseIterator.hasNext() ) { + final ExactACset queued = reverseIterator.next(); + if ( queued.ACsetIndexToPLIndex.containsKey(callingSetIndex) ) + return queued; } - return set; + + // shouldn't get here + throw new ReviewedStingException("Error: no sets in the queue currently hold " + callingSetIndex + " as a dependent!"); } private static void computeLofK(final ExactACset set, final ArrayList genotypeLikelihoods, final HashMap indexesToACset, final double[][] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result) { + final AlleleFrequencyCalculationResult result, + final double[][] tempLog10ConformationLikelihoods) { set.log10Likelihoods[0] = 0.0; // the zero case final int totalK = set.getACsum(); @@ -362,38 +414,41 @@ private static void computeLofK(final ExactACset set, } // k > 0 for at least one k else { - // all possible likelihoods for a given cell from which to choose the max - final int numPaths = set.ACsetIndexToPLIndex.size() + 1; - final double[] log10ConformationLikelihoods = new double[numPaths]; // TODO can be created just once, since you initialize it + // deal with the non-AA possible conformations + int conformationIndex = 1; + for ( Map.Entry mapping : set.ACsetIndexToPLIndex.entrySet() ) { + //if ( DEBUG ) + // System.out.printf(" *** evaluating set=%s which depends on set=%s%n", set.ACcounts, mapping.getKey()); - for ( int j = 1; j < set.log10Likelihoods.length; j++ ) { - final double[] gl = genotypeLikelihoods.get(j); - final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; + ExactACset dependent = indexesToACset.get(mapping.getKey()); + + for ( int j = 1; j < set.log10Likelihoods.length; j++ ) { - // initialize - for ( int i = 0; i < numPaths; i++ ) - // TODO -- Arrays.fill? - // todo -- is this even necessary? Why not have as else below? 
- log10ConformationLikelihoods[i] = Double.NEGATIVE_INFINITY; - - // deal with the AA case first - if ( totalK < 2*j-1 ) - log10ConformationLikelihoods[0] = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX]; - - // deal with the other possible conformations now - if ( totalK <= 2*j ) { // skip impossible conformations - int conformationIndex = 1; - for ( Map.Entry mapping : set.ACsetIndexToPLIndex.entrySet() ) { - if ( DEBUG ) - System.out.printf(" *** evaluating set=%s which depends on set=%s%n", set.ACcounts, mapping.getKey()); - log10ConformationLikelihoods[conformationIndex++] = - determineCoefficient(mapping.getValue(), j, set.ACcounts.getCounts(), totalK) + indexesToACset.get(mapping.getKey()).log10Likelihoods[j-1] + gl[mapping.getValue()]; + if ( totalK <= 2*j ) { // skip impossible conformations + final double[] gl = genotypeLikelihoods.get(j); + tempLog10ConformationLikelihoods[j][conformationIndex] = + determineCoefficient(mapping.getValue(), j, set.ACcounts.getCounts(), totalK) + dependent.log10Likelihoods[j-1] + gl[mapping.getValue()]; + } else { + tempLog10ConformationLikelihoods[j][conformationIndex] = Double.NEGATIVE_INFINITY; } } - final double log10Max = approximateLog10SumLog10(log10ConformationLikelihoods); + conformationIndex++; + } - // finally, update the L(j,k) value + // finally, deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value + final int numPaths = set.ACsetIndexToPLIndex.size() + 1; + for ( int j = 1; j < set.log10Likelihoods.length; j++ ) { + + if ( totalK < 2*j-1 ) { + final double[] gl = genotypeLikelihoods.get(j); + tempLog10ConformationLikelihoods[j][0] = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX]; + } else { + tempLog10ConformationLikelihoods[j][0] = Double.NEGATIVE_INFINITY; + } + + final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; + final double log10Max = MathUtils.approximateLog10SumLog10(tempLog10ConformationLikelihoods[j], numPaths); set.log10Likelihoods[j] = log10Max - logDenominator; } } @@ -415,10 +470,10 @@ private static void computeLofK(final ExactACset set, // update the likelihoods/posteriors vectors which are collapsed views of each of the various ACs for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) { int AC = set.ACcounts.getCounts()[i]; - result.log10AlleleFrequencyLikelihoods[i][AC] = approximateLog10SumLog10(result.log10AlleleFrequencyLikelihoods[i][AC], log10LofK); + result.log10AlleleFrequencyLikelihoods[i][AC] = MathUtils.approximateLog10SumLog10(result.log10AlleleFrequencyLikelihoods[i][AC], log10LofK); final double prior = log10AlleleFrequencyPriors[nonRefAlleles-1][AC]; - result.log10AlleleFrequencyPosteriors[i][AC] = approximateLog10SumLog10(result.log10AlleleFrequencyPosteriors[i][AC], log10LofK + prior); + result.log10AlleleFrequencyPosteriors[i][AC] = MathUtils.approximateLog10SumLog10(result.log10AlleleFrequencyPosteriors[i][AC], log10LofK + prior); } } } @@ -564,7 +619,7 @@ public int linearExact(GenotypesContext GLs, lastK = k; maxLog10L = Math.max(maxLog10L, log10LofK); if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) { - if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L); + //if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L); done = true; } diff 
--git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java
index b30a254148..fb2428258a 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java
@@ -31,12 +31,14 @@
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
 import org.broadinstitute.sting.utils.BaseUtils;
+import org.broadinstitute.sting.utils.GenomeLocParser;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
 import org.broadinstitute.sting.utils.pileup.PileupElement;
 import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
 import org.broadinstitute.sting.utils.variantcontext.Allele;
 import org.broadinstitute.sting.utils.variantcontext.VariantContext;
 
+import java.util.List;
 import java.util.Map;
 
@@ -72,25 +74,28 @@ protected GenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Log
         this.logger = logger;
     }
 
-    /**
-     * Must be overridden by concrete subclasses
-     *
-     * @param tracker              rod data
-     * @param ref                  reference context
-     * @param contexts             stratified alignment contexts
-     * @param contextType          stratified context type
-     * @param priors               priors to use for GLs
-     * @param alternateAlleleToUse the alternate allele to use, null if not set
-     * @param useBAQedPileup       should we use the BAQed pileup or the raw one?
-     * @return variant context where genotypes are no-called but with GLs
-     */
-    public abstract VariantContext getLikelihoods(RefMetaDataTracker tracker,
-                                                  ReferenceContext ref,
-                                                  Map<String, AlignmentContext> contexts,
-                                                  AlignmentContextUtils.ReadOrientation contextType,
-                                                  GenotypePriors priors,
-                                                  Allele alternateAlleleToUse,
-                                                  boolean useBAQedPileup);
+    /**
+     * Must be implemented by concrete subclasses
+     *
+     * @param tracker               rod data
+     * @param ref                   reference context
+     * @param contexts              stratified alignment contexts
+     * @param contextType           stratified context type
+     * @param priors                priors to use for GLs
+     * @param alternateAllelesToUse the alternate alleles to use, null if not set
+     * @param useBAQedPileup        should we use the BAQed pileup or the raw one?
+ * @param locParser Genome Loc Parser + * @return variant context where genotypes are no-called but with GLs + */ + public abstract VariantContext getLikelihoods(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final Map contexts, + final AlignmentContextUtils.ReadOrientation contextType, + final GenotypePriors priors, + final List alternateAllelesToUse, + final boolean useBAQedPileup, + final GenomeLocParser locParser); + protected int getFilteredDepth(ReadBackedPileup pileup) { int count = 0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index fe2086d474..1b73ef1d70 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -33,9 +33,11 @@ import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -54,19 +56,18 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood private final boolean getAlleleListFromVCF; private boolean DEBUG = false; - + private final boolean doMultiAllelicCalls = true; private boolean ignoreSNPAllelesWhenGenotypingIndels = false; - private PairHMMIndelErrorModel pairModel; - private static ThreadLocal>> indelLikelihoodMap = - new ThreadLocal>>() { - protected synchronized HashMap> initialValue() { - return new HashMap>(); - } - }; + private static ThreadLocal>> indelLikelihoodMap = + new ThreadLocal>>() { + protected synchronized HashMap> initialValue() { + return new HashMap>(); + } + }; - private LinkedHashMap haplotypeMap; + private LinkedHashMap haplotypeMap; // gdebug removeme // todo -cleanup @@ -74,37 +75,37 @@ protected synchronized HashMap> initi private ArrayList alleleList; static { - indelLikelihoodMap.set(new HashMap>()); + indelLikelihoodMap.set(new HashMap>()); } protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); - pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY, - UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.BANDED_INDEL_COMPUTATION); + pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, + UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); alleleList = new ArrayList(); getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING; HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE; DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO; - haplotypeMap = new LinkedHashMap(); + haplotypeMap = new LinkedHashMap(); ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES; } private ArrayList 
computeConsensusAlleles(ReferenceContext ref, Map<String, AlignmentContext> contexts,
-                                                     AlignmentContextUtils.ReadOrientation contextType) {
-        Allele refAllele=null, altAllele=null;
+                                                     AlignmentContextUtils.ReadOrientation contextType, GenomeLocParser locParser) {
+        Allele refAllele = null, altAllele = null;
         GenomeLoc loc = ref.getLocus();
         ArrayList<Allele> aList = new ArrayList<Allele>();
 
-        HashMap<String, Integer> consensusIndelStrings = new HashMap<String, Integer>();
+        HashMap<String, Integer> consensusIndelStrings = new HashMap<String, Integer>();
 
         int insCount = 0, delCount = 0;
         // quick check of total number of indels in pileup
-        for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
+        for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
             AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
 
             final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup();
@@ -114,22 +115,20 @@ private ArrayList<Allele> computeConsensusAlleles(ReferenceContext ref,
         if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping)
             return aList;
-
-        for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
+
+        for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
             // todo -- warning, can be duplicating expensive partition here
             AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
 
             final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup();
-
-
-            for ( ExtendedEventPileupElement p : indelPileup.toExtendedIterable() ) {
+            for (ExtendedEventPileupElement p : indelPileup.toExtendedIterable()) {
                 //SAMRecord read = p.getRead();
-                GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
+                GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
                 if (read == null)
-                    continue;
-                if(ReadUtils.is454Read(read)) {
+                    continue;
+                if (ReadUtils.is454Read(read)) {
                     continue;
                 }
 
@@ -143,60 +142,69 @@ private ArrayList<Allele> computeConsensusAlleles(ReferenceContext ref,
                 String indelString = p.getEventBases();
                 if (p.isInsertion()) {
                     boolean foundKey = false;
+                    // copy of hashmap into temp arrayList
+                    ArrayList<Pair<String, Integer>> cList = new ArrayList<Pair<String, Integer>>();
+                    for (String s : consensusIndelStrings.keySet()) {
+                        cList.add(new Pair<String, Integer>(s, consensusIndelStrings.get(s)));
+                    }
+
                     if (read.getAlignmentEnd() == loc.getStart()) {
                         // first corner condition: a read has an insertion at the end, and we're right at the insertion.
                         // In this case, the read could have any of the inserted bases and we need to build a consensus
-                        for (String s : consensusIndelStrings.keySet()) {
-                            int cnt = consensusIndelStrings.get(s);
-                            if (s.startsWith(indelString)){
-                                // case 1: current insertion is prefix of indel in hash map
-                                consensusIndelStrings.put(s,cnt+1);
+
+                        for (int k=0; k < cList.size(); k++) {
+                            String s = cList.get(k).getFirst();
+                            int cnt = cList.get(k).getSecond();
+                            // case 1: current insertion is prefix of indel in hash map
+                            if (s.startsWith(indelString)) {
+                                cList.set(k, new Pair<String, Integer>(s, cnt+1));
                                 foundKey = true;
-                                break;
                             } else if (indelString.startsWith(s)) {
                                 // case 2: indel stored in hash table is prefix of current insertion
                                 // In this case, new bases are new key.
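+                                // e.g. if the table holds "AC" with count 3 and this read's insertion
+                                // is "ACGT", the key "ACGT" replaces "AC" with count 4 (illustrative values)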
-                                consensusIndelStrings.remove(s);
-                                consensusIndelStrings.put(indelString,cnt+1);
                                 foundKey = true;
-                                break;
+                                cList.set(k, new Pair<String, Integer>(indelString, cnt+1));
                             }
                         }
                         if (!foundKey)
                             // none of the above: event bases not supported by previous table, so add new key
-                            consensusIndelStrings.put(indelString,1);
+                            cList.add(new Pair<String, Integer>(indelString, 1));
                     } else if (read.getAlignmentStart() == loc.getStart()+1) {
                         // opposite corner condition: read will start at current locus with an insertion
-                        for (String s : consensusIndelStrings.keySet()) {
-                            int cnt = consensusIndelStrings.get(s);
-                            if (s.endsWith(indelString)){
-                                // case 1: current insertion is suffix of indel in hash map
-                                consensusIndelStrings.put(s,cnt+1);
+                        for (int k=0; k < cList.size(); k++) {
+                            String s = cList.get(k).getFirst();
+                            int cnt = cList.get(k).getSecond();
+                            if (s.endsWith(indelString)) {
+                                // case 1: current insertion (indelString) is suffix of indel in hash map (s)
+                                cList.set(k, new Pair<String, Integer>(s, cnt+1));
                                 foundKey = true;
-                                break;
                             } else if (indelString.endsWith(s)) {
                                 // case 2: indel stored in hash table is suffix of current insertion
+                                // In this case, new bases are new key.
-                                consensusIndelStrings.remove(s);
-                                consensusIndelStrings.put(indelString,cnt+1);
                                 foundKey = true;
-                                break;
+                                cList.set(k, new Pair<String, Integer>(indelString, cnt+1));
                             }
                         }
                         if (!foundKey)
                             // none of the above: event bases not supported by previous table, so add new key
-                            consensusIndelStrings.put(indelString,1);
+                            cList.add(new Pair<String, Integer>(indelString, 1));
+
+                    } else {
-                        // normal case: insertion somewhere in the middle of a read: add count to hash map
+                        // normal case: insertion somewhere in the middle of a read: add count to arrayList
                         int cnt = consensusIndelStrings.containsKey(indelString)?
consensusIndelStrings.get(indelString):0; - consensusIndelStrings.put(indelString,cnt+1); + cList.add(new Pair(indelString,cnt+1)); + } + + // copy back arrayList into hashMap + consensusIndelStrings.clear(); + for (Pair pair : cList) { + consensusIndelStrings.put(pair.getFirst(),pair.getSecond()); } } @@ -208,78 +216,84 @@ else if (p.isDeletion()) { } } -/* if (DEBUG) { - int icount = indelPileup.getNumberOfInsertions(); - int dcount = indelPileup.getNumberOfDeletions(); - if (icount + dcount > 0) - { - List> eventStrings = indelPileup.getEventStringsWithCounts(ref.getBases()); - System.out.format("#ins: %d, #del:%d\n", insCount, delCount); - - for (int i=0 ; i < eventStrings.size() ; i++ ) { - System.out.format("%s:%d,",eventStrings.get(i).first,eventStrings.get(i).second); - // int k=0; - } - System.out.println(); - } - } */ } + Collection vcs = new ArrayList(); int maxAlleleCnt = 0; String bestAltAllele = ""; + for (String s : consensusIndelStrings.keySet()) { - int curCnt = consensusIndelStrings.get(s); - if (curCnt > maxAlleleCnt) { - maxAlleleCnt = curCnt; - bestAltAllele = s; + int curCnt = consensusIndelStrings.get(s), stop = 0; + // if observed count if above minimum threshold, we will genotype this allele + if (curCnt < minIndelCountForGenotyping) + continue; + + if (s.startsWith("D")) { + // get deletion length + int dLen = Integer.valueOf(s.substring(1)); + // get ref bases of accurate deletion + int startIdxInReference = 1 + loc.getStart() - ref.getWindow().getStart(); + stop = loc.getStart() + dLen; + byte[] refBases = Arrays.copyOfRange(ref.getBases(), startIdxInReference, startIdxInReference + dLen); + + if (Allele.acceptableAlleleBases(refBases)) { + refAllele = Allele.create(refBases, true); + altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false); + } + } else { + // insertion case + if (Allele.acceptableAlleleBases(s)) { + refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true); + altAllele = Allele.create(s, false); + stop = loc.getStart(); + } } -// if (DEBUG) -// System.out.format("Key:%s, number: %d\n",s,consensusIndelStrings.get(s) ); - } //gdebug- - if (maxAlleleCnt < minIndelCountForGenotyping) - return aList; - if (bestAltAllele.startsWith("D")) { - // get deletion length - int dLen = Integer.valueOf(bestAltAllele.substring(1)); - // get ref bases of accurate deletion - int startIdxInReference = 1+loc.getStart()-ref.getWindow().getStart(); + ArrayList vcAlleles = new ArrayList(); + vcAlleles.add(refAllele); + vcAlleles.add(altAllele); - //System.out.println(new String(ref.getBases())); - byte[] refBases = Arrays.copyOfRange(ref.getBases(),startIdxInReference,startIdxInReference+dLen); + final VariantContextBuilder builder = new VariantContextBuilder().source(""); + builder.loc(loc.getContig(), loc.getStart(), stop); + builder.alleles(vcAlleles); + builder.referenceBaseForIndel(ref.getBase()); + builder.noGenotypes(); + if (doMultiAllelicCalls) + vcs.add(builder.make()); + else { + if (curCnt > maxAlleleCnt) { + maxAlleleCnt = curCnt; + vcs.clear(); + vcs.add(builder.make()); + } - if (Allele.acceptableAlleleBases(refBases)) { - refAllele = Allele.create(refBases,true); - altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false); } } - else { - // insertion case - if (Allele.acceptableAlleleBases(bestAltAllele)) { - refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true); - altAllele = Allele.create(bestAltAllele, false); - } - } - if (refAllele != null && altAllele != null) { - aList.add(0,refAllele); - aList.add(1,altAllele); - } + + if 
(vcs.isEmpty()) + return aList; // nothing else to do, no alleles passed minimum count criterion + + VariantContext mergedVC = VariantContextUtils.simpleMerge(locParser, vcs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false); + + aList = new ArrayList(mergedVC.getAlleles()); + return aList; } private final static EnumSet allowableTypes = EnumSet.of(VariantContext.Type.INDEL, VariantContext.Type.MIXED); - public VariantContext getLikelihoods(RefMetaDataTracker tracker, - ReferenceContext ref, - Map contexts, - AlignmentContextUtils.ReadOrientation contextType, - GenotypePriors priors, - Allele alternateAlleleToUse, - boolean useBAQedPileup) { + public VariantContext getLikelihoods(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final Map contexts, + final AlignmentContextUtils.ReadOrientation contextType, + final GenotypePriors priors, + final List alternateAllelesToUse, + final boolean useBAQedPileup, + final GenomeLocParser locParser) { - if ( tracker == null ) + if (tracker == null) return null; GenomeLoc loc = ref.getLocus(); @@ -290,21 +304,21 @@ public VariantContext getLikelihoods(RefMetaDataTracker tracker, // starting a new site: clear allele list alleleList.clear(); lastSiteVisited = ref.getLocus(); - indelLikelihoodMap.set(new HashMap>()); + indelLikelihoodMap.set(new HashMap>()); haplotypeMap.clear(); if (getAlleleListFromVCF) { - for( final VariantContext vc_input : tracker.getValues(UAC.alleles, loc) ) { - if( vc_input != null && - allowableTypes.contains(vc_input.getType()) && - ref.getLocus().getStart() == vc_input.getStart()) { - vc = vc_input; - break; - } - } - // ignore places where we don't have a variant - if ( vc == null ) - return null; + for (final VariantContext vc_input : tracker.getValues(UAC.alleles, loc)) { + if (vc_input != null && + allowableTypes.contains(vc_input.getType()) && + ref.getLocus().getStart() == vc_input.getStart()) { + vc = vc_input; + break; + } + } + // ignore places where we don't have a variant + if (vc == null) + return null; alleleList.clear(); if (ignoreSNPAllelesWhenGenotypingIndels) { @@ -315,15 +329,13 @@ public VariantContext getLikelihoods(RefMetaDataTracker tracker, else alleleList.add(a); - } - else { + } else { for (Allele a : vc.getAlleles()) alleleList.add(a); } - } - else { - alleleList = computeConsensusAlleles(ref,contexts, contextType); + } else { + alleleList = computeConsensusAlleles(ref, contexts, contextType, locParser); if (alleleList.isEmpty()) return null; } @@ -333,21 +345,21 @@ public VariantContext getLikelihoods(RefMetaDataTracker tracker, return null; // check if there is enough reference window to create haplotypes (can be an issue at end of contigs) - if (ref.getWindow().getStop() < loc.getStop()+HAPLOTYPE_SIZE) + if (ref.getWindow().getStop() < loc.getStop() + HAPLOTYPE_SIZE) return null; - if ( !(priors instanceof DiploidIndelGenotypePriors) ) - throw new StingException("Only diploid-based Indel priors are supported in the DINDEL GL model"); + if (!(priors instanceof DiploidIndelGenotypePriors)) + throw new StingException("Only diploid-based Indel priors are supported in the INDEL GL model"); if (alleleList.isEmpty()) return null; - + refAllele = alleleList.get(0); altAllele = alleleList.get(1); // look for alt allele that has biggest length distance to ref allele int maxLenDiff = 0; - for (Allele a: alleleList) { - if(a.isNonReference()) { + for (Allele a : alleleList) { + if 
(a.isNonReference()) { int lenDiff = Math.abs(a.getBaseString().length() - refAllele.getBaseString().length()); if (lenDiff > maxLenDiff) { maxLenDiff = lenDiff; @@ -357,11 +369,11 @@ public VariantContext getLikelihoods(RefMetaDataTracker tracker, } final int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length(); - final int hsize = (int)ref.getWindow().size()-Math.abs(eventLength)-1; - final int numPrefBases= ref.getLocus().getStart()-ref.getWindow().getStart()+1; + final int hsize = (int) ref.getWindow().size() - Math.abs(eventLength) - 1; + final int numPrefBases = ref.getLocus().getStart() - ref.getWindow().getStart() + 1; - if (hsize <=0) { - logger.warn(String.format("Warning: event at location %s can't be genotyped, skipping",loc.toString())); + if (hsize <= 0) { + logger.warn(String.format("Warning: event at location %s can't be genotyped, skipping", loc.toString())); return null; } haplotypeMap = Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(), @@ -379,7 +391,7 @@ public VariantContext getLikelihoods(RefMetaDataTracker tracker, // For each sample, get genotype likelihoods based on pileup // compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with them. - for ( Map.Entry sample : contexts.entrySet() ) { + for (Map.Entry sample : contexts.entrySet()) { AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); ReadBackedPileup pileup = null; @@ -388,8 +400,8 @@ public VariantContext getLikelihoods(RefMetaDataTracker tracker, else if (context.hasBasePileup()) pileup = context.getBasePileup(); - if (pileup != null ) { - final double[] genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap()); + if (pileup != null) { + final double[] genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap()); GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(genotypeLikelihoods); HashMap attributes = new HashMap(); @@ -398,9 +410,9 @@ else if (context.hasBasePileup()) genotypes.add(new Genotype(sample.getKey(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false)); if (DEBUG) { - System.out.format("Sample:%s Alleles:%s GL:",sample.getKey(), alleleList.toString()); - for (int k=0; k < genotypeLikelihoods.length; k++) - System.out.format("%1.4f ",genotypeLikelihoods[k]); + System.out.format("Sample:%s Alleles:%s GL:", sample.getKey(), alleleList.toString()); + for (int k = 0; k < genotypeLikelihoods.length; k++) + System.out.format("%1.4f ", genotypeLikelihoods[k]); System.out.println(); } } @@ -412,21 +424,21 @@ else if (context.hasBasePileup()) private int calculateEndPos(Collection alleles, Allele refAllele, GenomeLoc loc) { // for indels, stop location is one more than ref allele length boolean hasNullAltAllele = false; - for ( Allele a : alleles ) { - if ( a.isNull() ) { + for (Allele a : alleles) { + if (a.isNull()) { hasNullAltAllele = true; break; } } int endLoc = loc.getStart() + refAllele.length(); - if( !hasNullAltAllele ) + if (!hasNullAltAllele) endLoc--; return endLoc; } - public static HashMap> getIndelLikelihoodMap() { + public static HashMap> getIndelLikelihoodMap() { return indelLikelihoodMap.get(); } @@ -434,8 +446,8 @@ public static HashMap> getIndelLikeli // so that per-sample DP will include deletions covering the event. 
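+    // e.g. a pileup with 7 regular bases, 2 deletion elements and 1 insertion at the start
+    // of a read now yields a filtered depth of 10 (illustrative counts)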
protected int getFilteredDepth(ReadBackedPileup pileup) { int count = 0; - for ( PileupElement p : pileup ) { - if (p.isDeletion() || BaseUtils.isRegularBase(p.getBase()) ) + for (PileupElement p : pileup) { + if (p.isDeletion() || p.isInsertionAtBeginningOfRead() || BaseUtils.isRegularBase(p.getBase())) count++; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 57cc5594a3..dd21681f04 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -30,10 +30,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.StingException; @@ -42,34 +39,38 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; import org.broadinstitute.sting.utils.variantcontext.*; -import java.util.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { - private static final int MIN_QUAL_SUM_FOR_ALT_ALLELE = 50; - - private boolean ALLOW_MULTIPLE_ALLELES; - private final boolean useAlleleFromVCF; + private final double[] likelihoodSums = new double[4]; + protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); - ALLOW_MULTIPLE_ALLELES = UAC.MULTI_ALLELIC; useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; + + // make sure the PL cache has been initialized with enough alleles + if ( UnifiedGenotyperEngine.PLIndexToAlleleIndex == null || UnifiedGenotyperEngine.PLIndexToAlleleIndex.length < 4 ) // +1 for 0 alt alleles + UnifiedGenotyperEngine.calculatePLcache(3); } - public VariantContext getLikelihoods(RefMetaDataTracker tracker, - ReferenceContext ref, - Map contexts, - AlignmentContextUtils.ReadOrientation contextType, - GenotypePriors priors, - Allele alternateAlleleToUse, - boolean useBAQedPileup) { + public VariantContext getLikelihoods(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final Map contexts, + final AlignmentContextUtils.ReadOrientation contextType, + final GenotypePriors priors, + final List alternateAllelesToUse, + final boolean useBAQedPileup, + final GenomeLocParser locParser) { if ( !(priors instanceof DiploidSNPGenotypePriors) ) throw new StingException("Only diploid-based SNP priors are supported in the SNP GL model"); - final boolean[] basesToUse = new boolean[4]; final byte refBase = ref.getBase(); final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(refBase); @@ -79,68 +80,66 @@ public VariantContext getLikelihoods(RefMetaDataTracker tracker, alleles.add(Allele.create(refBase, true)); final VariantContextBuilder builder = new 
VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles); + // calculate the GLs + ArrayList GLs = new ArrayList(contexts.size()); + for ( Map.Entry sample : contexts.entrySet() ) { + ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup(); + if ( useBAQedPileup ) + pileup = createBAQedPileup( pileup ); + + // create the GenotypeLikelihoods object + final DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods((DiploidSNPGenotypePriors)priors, UAC.PCR_error); + final int nGoodBases = GL.add(pileup, true, true, UAC.MIN_BASE_QUALTY_SCORE); + if ( nGoodBases > 0 ) + GLs.add(new SampleGenotypeData(sample.getKey(), GL, getFilteredDepth(pileup))); + } + // find the alternate allele(s) that we should be using - if ( alternateAlleleToUse != null ) { - basesToUse[BaseUtils.simpleBaseToBaseIndex(alternateAlleleToUse.getBases()[0])] = true; + if ( alternateAllelesToUse != null ) { + alleles.addAll(alternateAllelesToUse); } else if ( useAlleleFromVCF ) { final VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles); // ignore places where we don't have a SNP if ( vc == null || !vc.isSNP() ) return null; - - for ( Allele allele : vc.getAlternateAlleles() ) - basesToUse[BaseUtils.simpleBaseToBaseIndex(allele.getBases()[0])] = true; + + alleles.addAll(vc.getAlternateAlleles()); } else { - determineAlternateAlleles(basesToUse, refBase, contexts, useBAQedPileup); - - // how many alternate alleles are we using? - int alleleCounter = Utils.countSetBits(basesToUse); + alleles.addAll(determineAlternateAlleles(refBase, GLs)); // if there are no non-ref alleles... - if ( alleleCounter == 0 ) { + if ( alleles.size() == 1 ) { // if we only want variants, then we don't need to calculate genotype likelihoods if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY ) return builder.make(); // otherwise, choose any alternate allele (it doesn't really matter) - basesToUse[indexOfRefBase == 0 ? 1 : 0] = true; + alleles.add(Allele.create(BaseUtils.baseIndexToSimpleBase(indexOfRefBase == 0 ? 
1 : 0))); } } // create the alternate alleles and the allele ordering (the ordering is crucial for the GLs) - final int numAltAlleles = Utils.countSetBits(basesToUse); - final int[] alleleOrdering = new int[numAltAlleles + 1]; - alleleOrdering[0] = indexOfRefBase; - int alleleOrderingIndex = 1; - int numLikelihoods = 1; - for ( int i = 0; i < 4; i++ ) { - if ( i != indexOfRefBase && basesToUse[i] ) { - alleles.add(Allele.create(BaseUtils.baseIndexToSimpleBase(i), false)); - alleleOrdering[alleleOrderingIndex++] = i; - numLikelihoods += alleleOrderingIndex; - } + final int numAlleles = alleles.size(); + final int numAltAlleles = numAlleles - 1; + + final int[] alleleOrdering = new int[numAlleles]; + int alleleOrderingIndex = 0; + int numLikelihoods = 0; + for ( Allele allele : alleles ) { + alleleOrdering[alleleOrderingIndex++] = BaseUtils.simpleBaseToBaseIndex(allele.getBases()[0]); + numLikelihoods += alleleOrderingIndex; } builder.alleles(alleles); // create the genotypes; no-call everyone for now - GenotypesContext genotypes = GenotypesContext.create(); + final GenotypesContext genotypes = GenotypesContext.create(); final List noCall = new ArrayList(); noCall.add(Allele.NO_CALL); - for ( Map.Entry sample : contexts.entrySet() ) { - ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup(); - if ( useBAQedPileup ) - pileup = createBAQedPileup( pileup ); - - // create the GenotypeLikelihoods object - final DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods((DiploidSNPGenotypePriors)priors, UAC.PCR_error); - final int nGoodBases = GL.add(pileup, true, true, UAC.MIN_BASE_QUALTY_SCORE); - if ( nGoodBases == 0 ) - continue; - - final double[] allLikelihoods = GL.getLikelihoods(); + for ( SampleGenotypeData sampleData : GLs ) { + final double[] allLikelihoods = sampleData.GL.getLikelihoods(); final double[] myLikelihoods = new double[numLikelihoods]; int myLikelihoodsIndex = 0; @@ -151,62 +150,46 @@ public VariantContext getLikelihoods(RefMetaDataTracker tracker, } // normalize in log space so that max element is zero. - GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(myLikelihoods, false, true)); + final GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(myLikelihoods, false, true)); - HashMap attributes = new HashMap(); - attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(pileup)); + final HashMap attributes = new HashMap(); + attributes.put(VCFConstants.DEPTH_KEY, sampleData.depth); attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods); - genotypes.add(new Genotype(sample.getKey(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false)); + genotypes.add(new Genotype(sampleData.name, noCall, Genotype.NO_LOG10_PERROR, null, attributes, false)); } return builder.genotypes(genotypes).make(); } - - // fills in the allelesToUse array - protected void determineAlternateAlleles(boolean[] allelesToUse, byte ref, Map contexts, boolean useBAQedPileup) { - int[] qualCounts = new int[4]; - - for ( Map.Entry sample : contexts.entrySet() ) { - // calculate the sum of quality scores for each base - ReadBackedPileup pileup = useBAQedPileup ? 
createBAQedPileup( sample.getValue().getBasePileup() ) : sample.getValue().getBasePileup(); - for ( PileupElement p : pileup ) { - // ignore deletions - if ( p.isDeletion() || (!p.isReducedRead() && p.getQual() < UAC.MIN_BASE_QUALTY_SCORE) ) - continue; - - final int index = BaseUtils.simpleBaseToBaseIndex(p.getBase()); - if ( index >= 0 ) { - qualCounts[index] += p.getQual(); - } + + // determines the alleles to use + protected List determineAlternateAlleles(final byte ref, final List sampleDataList) { + + final int baseIndexOfRef = BaseUtils.simpleBaseToBaseIndex(ref); + final int PLindexOfRef = DiploidGenotype.createDiploidGenotype(ref, ref).ordinal(); + for ( int i = 0; i < 4; i++ ) + likelihoodSums[i] = 0.0; + + // based on the GLs, find the alternate alleles with enough probability + for ( SampleGenotypeData sampleData : sampleDataList ) { + final double[] likelihoods = sampleData.GL.getLikelihoods(); + final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); + if ( PLindexOfBestGL != PLindexOfRef ) { + int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[3][PLindexOfBestGL]; + if ( alleles[0] != baseIndexOfRef ) + likelihoodSums[alleles[0]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef]; + // don't double-count it + if ( alleles[1] != baseIndexOfRef && alleles[1] != alleles[0] ) + likelihoodSums[alleles[1]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef]; } } - if ( ALLOW_MULTIPLE_ALLELES ) { - for ( byte altAllele : BaseUtils.BASES ) { - if ( altAllele == ref ) - continue; - int index = BaseUtils.simpleBaseToBaseIndex(altAllele); - if ( qualCounts[index] >= MIN_QUAL_SUM_FOR_ALT_ALLELE ) { - allelesToUse[index] = true; - } - } - } else { - // set the non-ref base which has the maximum quality score sum - int maxCount = 0; - int indexOfMax = 0; - for ( byte altAllele : BaseUtils.BASES ) { - if ( altAllele == ref ) - continue; - int index = BaseUtils.simpleBaseToBaseIndex(altAllele); - if ( qualCounts[index] > maxCount ) { - maxCount = qualCounts[index]; - indexOfMax = index; - } - } - - if ( maxCount > 0 ) - allelesToUse[indexOfMax] = true; + final List allelesToUse = new ArrayList(3); + for ( int i = 0; i < 4; i++ ) { + if ( likelihoodSums[i] > 0.0 ) + allelesToUse.add(Allele.create(BaseUtils.baseIndexToSimpleBase(i), false)); } + + return allelesToUse; } public ReadBackedPileup createBAQedPileup( final ReadBackedPileup pileup ) { @@ -220,11 +203,23 @@ public ReadBackedPileup createBAQedPileup( final ReadBackedPileup pileup ) { public class BAQedPileupElement extends PileupElement { public BAQedPileupElement( final PileupElement PE ) { - super(PE.getRead(), PE.getOffset()); + super(PE.getRead(), PE.getOffset(), PE.isDeletion(), PE.isBeforeDeletion(), PE.isBeforeInsertion(), PE.isNextToSoftClip()); } @Override public byte getQual( final int offset ) { return BAQ.calcBAQFromTag(getRead(), offset, true); } } -} \ No newline at end of file + private static class SampleGenotypeData { + + public final String name; + public final DiploidSNPGenotypeLikelihoods GL; + public final int depth; + + public SampleGenotypeData(final String name, final DiploidSNPGenotypeLikelihoods GL, final int depth) { + this.name = name; + this.GL = GL; + this.depth = depth; + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGBoundAF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGBoundAF.java index e40054c9f7..99d55bc698 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGBoundAF.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGBoundAF.java @@ -204,6 +204,6 @@ private double simpAux(double[] likelihoods, double a,double b,double eps,double return Math.log10(s_2 + (s_2 - s)/15.0); } - return ExactAFCalculationModel.approximateLog10SumLog10(simpAux(likelihoods,a,c,eps/2,s_l,fa,fc,fd,cap-1),simpAux(likelihoods, c, b, eps / 2, s_r, fc, fb, fe, cap - 1)); + return MathUtils.approximateLog10SumLog10(simpAux(likelihoods,a,c,eps/2,s_l,fa,fc,fd,cap-1),simpAux(likelihoods, c, b, eps / 2, s_r, fc, fb, fe, cap - 1)); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java deleted file mode 100755 index c7e5773937..0000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.commandline.ArgumentCollection; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.DownsampleType; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.HashSet; -import java.util.Set; - - -/** - * Uses the UG engine to determine per-sample genotype likelihoods and emits them as a VCF (using PLs). - * Absolutely not supported or recommended for public use. - * Run this as you would the UnifiedGenotyper, except that you must additionally pass in a VCF bound to - * the name 'allele' so we know which alternate allele to use at each site. 
- */ -@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_INPUT) -@Reference(window=@Window(start=-200,stop=200)) -@By(DataSource.READS) -@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250) -public class UGCalcLikelihoods extends LocusWalker implements TreeReducible { - - @ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); - - // control the output - @Output(doc="File to which variants should be written",required=true) - protected VCFWriter writer = null; - - // the calculation arguments - private UnifiedGenotyperEngine UG_engine = null; - - // enable deletions in the pileup - public boolean includeReadsWithDeletionAtLoci() { return true; } - - // enable extended events for indels - public boolean generateExtendedEvents() { return UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.SNP; } - - public void initialize() { - // get all of the unique sample names - Set samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); - - UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples); - - // initialize the header - Set headerInfo = new HashSet(); - headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?")); - headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); - headerInfo.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)")); - headerInfo.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic")); - - writer.writeHeader(new VCFHeader(headerInfo, samples)) ; - } - - public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { - VariantContext call = UG_engine.calculateLikelihoods(tracker, refContext, rawContext); - return call == null ? 
null : new VariantCallContext(call, true); - } - - public Integer reduceInit() { return 0; } - - public Integer treeReduce(Integer lhs, Integer rhs) { - return lhs + rhs; - } - - public Integer reduce(VariantCallContext value, Integer sum) { - if ( value == null ) - return sum; - - try { - writer.add(value); - } catch (IllegalArgumentException e) { - throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name"); - } - - return sum + 1; - } - - public void onTraversalDone(Integer sum) { - logger.info(String.format("Visited bases: %d", sum)); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java deleted file mode 100755 index 97f7b21eb7..0000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.commandline.ArgumentCollection; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.*; - -import java.util.*; - -/** - * Uses the UG engine to call variants based off of VCFs annotated with GLs (or PLs). - * Absolutely not supported or recommended for public use. - * Run this as you would the UnifiedGenotyper, except that instead of '-I reads' it expects any number - * of GL/PL-annotated VCFs bound to a name starting with 'variant'. 
- */ -public class UGCallVariants extends RodWalker { - - @ArgumentCollection - private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); - - @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true) - public List> variants; - - // control the output - @Output(doc="File to which variants should be written",required=true) - protected VCFWriter writer = null; - - // the calculation arguments - private UnifiedGenotyperEngine UG_engine = null; - - // variant track names - private Set trackNames = new HashSet(); - - public void initialize() { - - for ( RodBinding rb : variants ) - trackNames.add(rb.getName()); - Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), trackNames); - - UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples); - - Set headerInfo = new HashSet(); - headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, -1, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed")); - headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, -1, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed")); - headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes")); - headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); - headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality")); - headerInfo.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)")); - headerInfo.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic")); - if ( UAC.STANDARD_CONFIDENCE_FOR_EMITTING < UAC.STANDARD_CONFIDENCE_FOR_CALLING ) - headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality")); - - // initialize the header - writer.writeHeader(new VCFHeader(headerInfo, samples)); - } - - public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) - return null; - - List VCs = tracker.getValues(variants, context.getLocation()); - - VariantContext mergedVC = mergeVCsWithGLs(VCs); - if ( mergedVC == null ) - return null; - - return UG_engine.calculateGenotypes(tracker, ref, context, mergedVC); - } - - public Integer reduceInit() { return 0; } - - public Integer reduce(VariantCallContext value, Integer sum) { - if ( value == null ) - return sum; - - try { - VariantContextBuilder builder = new VariantContextBuilder(value); - VariantContextUtils.calculateChromosomeCounts(builder, true); - writer.add(builder.make()); - } catch (IllegalArgumentException e) { - throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name"); - } - - return sum + 1; - } - - public void onTraversalDone(Integer result) { - logger.info(String.format("Visited sites: %d", result)); - } - - private static VariantContext mergeVCsWithGLs(List VCs) { - // we can't use the VCUtils classes because our VCs can all be no-calls - if ( VCs.size() == 0 ) - return null; - - VariantContext variantVC = null; - 
GenotypesContext genotypes = GenotypesContext.create(); - for ( VariantContext vc : VCs ) { - if ( variantVC == null && vc.isVariant() ) - variantVC = vc; - genotypes.addAll(getGenotypesWithGLs(vc.getGenotypes())); - } - - if ( variantVC == null ) { - VariantContext vc = VCs.get(0); - throw new UserException("There is no ALT allele in any of the VCF records passed in at " + vc.getChr() + ":" + vc.getStart()); - } - - return new VariantContextBuilder(variantVC).source("VCwithGLs").genotypes(genotypes).make(); - } - - private static GenotypesContext getGenotypesWithGLs(GenotypesContext genotypes) { - GenotypesContext genotypesWithGLs = GenotypesContext.create(genotypes.size()); - for ( final Genotype g : genotypes ) { - if ( g.hasLikelihoods() && g.getLikelihoods().getAsVector() != null ) - genotypesWithGLs.add(g); - } - return genotypesWithGLs; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 5713432b42..82e411c25c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -84,8 +84,8 @@ public class UnifiedArgumentCollection { /** * This argument is not enabled by default because it increases the runtime by an appreciable amount. */ - @Argument(fullName = "computeSLOD", shortName = "sl", doc = "If provided, we will calculate the SLOD", required = false) - public boolean COMPUTE_SLOD = false; + @Argument(fullName = "noSLOD", shortName = "nosl", doc = "If provided, we will not calculate the SLOD", required = false) + public boolean NO_SLOD = false; /** * When the UnifiedGenotyper is put into GENOTYPE_GIVEN_ALLELES mode it will genotype the samples using only the alleles provide in this rod binding @@ -103,21 +103,12 @@ public class UnifiedArgumentCollection { @Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false) public Double MAX_DELETION_FRACTION = 0.05; - /** - * The default behavior of the Unified Genotyper is to allow the genotyping of just one alternate allele in discovery mode; using this flag - * will enable the discovery of multiple alternate alleles. Please note that this works for SNPs only and that it is still highly experimental. - * For advanced users only. - */ - @Advanced - @Argument(fullName = "multiallelic", shortName = "multiallelic", doc = "Allow the discovery of multiple alleles (SNPs only)", required = false) - public boolean MULTI_ALLELIC = false; - /** * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES), - * then this site will be skipped and a warning printed. Note that genotyping sites with many alternate alleles is both CPU and memory intensive. + * then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive. 
*/ @Argument(fullName = "max_alternate_alleles", shortName = "maxAlleles", doc = "Maximum number of alternate alleles to genotype", required = false) - public int MAX_ALTERNATE_ALLELES = 5; + public int MAX_ALTERNATE_ALLELES = 3; // indel-related arguments /** @@ -146,8 +137,8 @@ public class UnifiedArgumentCollection { public int INDEL_HAPLOTYPE_SIZE = 80; @Hidden - @Argument(fullName = "bandedIndel", shortName = "bandedIndel", doc = "Banded Indel likelihood computation", required = false) - public boolean BANDED_INDEL_COMPUTATION = false; + @Argument(fullName = "noBandedIndel", shortName = "noBandedIndel", doc = "Don't do Banded Indel likelihood computation", required = false) + public boolean DONT_DO_BANDED_INDEL_COMPUTATION = false; @Hidden @Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false) @@ -168,7 +159,7 @@ public UnifiedArgumentCollection clone() { uac.PCR_error = PCR_error; uac.GenotypingMode = GenotypingMode; uac.OutputMode = OutputMode; - uac.COMPUTE_SLOD = COMPUTE_SLOD; + uac.NO_SLOD = NO_SLOD; uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING; uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING; uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE; @@ -184,8 +175,7 @@ public UnifiedArgumentCollection clone() { // todo- arguments to remove uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES; - uac.BANDED_INDEL_COMPUTATION = BANDED_INDEL_COMPUTATION; - uac.MULTI_ALLELIC = MULTI_ALLELIC; + uac.DONT_DO_BANDED_INDEL_COMPUTATION = DONT_DO_BANDED_INDEL_COMPUTATION; return uac; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 369c2d0c68..1106fcb527 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -126,10 +126,10 @@ public class UnifiedGenotyper extends LocusWalker getDbsnpRodBinding() { return dbsnp.dbsnp; } - public RodBinding getVariantRodBinding() { return null; } public RodBinding getSnpEffRodBinding() { return null; } public List> getCompRodBindings() { return Collections.emptyList(); } public List> getResourceRodBindings() { return Collections.emptyList(); } + public boolean alwaysAppendDbsnpId() { return false; } /** * A raw, unfiltered, highly specific callset in VCF format. 
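The truncation semantics of --max_alternate_alleles matter because the per-sample PL vector grows quadratically with the allele count: under the standard VCF ordering, the diploid genotype (j,k) with j <= k is stored at PL index k*(k+1)/2 + j, so a site with n alternate alleles carries (n+1)*(n+2)/2 likelihood entries per sample. This is the same ordering that the PLIndexToAlleleIndex cache used elsewhere in this patch encodes. A minimal sketch of the arithmetic, using hypothetical helper names rather than GATK code:

// Sketch only (hypothetical helper, not GATK code): size of the diploid PL
// vector and the VCF-spec index of an unordered genotype (j,k), j <= k.
public final class PLMath {
    /** Number of diploid genotypes, i.e. PL entries, for 1 ref + numAltAlleles alleles. */
    public static int numLikelihoods(final int numAltAlleles) {
        final int numAlleles = numAltAlleles + 1;
        return numAlleles * (numAlleles + 1) / 2;
    }

    /** PL index of genotype (j,k) with allele indexes j <= k (0 = reference). */
    public static int plIndex(final int j, final int k) {
        return k * (k + 1) / 2 + j;
    }

    public static void main(final String[] args) {
        for (int n = 1; n <= 5; n++)
            System.out.printf("%d alt alleles -> %d PL entries%n", n, numLikelihoods(n));
        // prints 3, 6, 10, 15, 21 entries for 1..5 alternate alleles
    }
}

By this count, lowering the default from 5 to 3 alternate alleles shrinks each sample's PL vector from 21 to 10 entries.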
@@ -169,9 +169,11 @@ public class UnifiedGenotyper extends LocusWalker samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); @@ -232,7 +240,7 @@ private Set getHeaderInfo() { headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions()); // annotation (INFO) fields from UnifiedGenotyper - if ( UAC.COMPUTE_SLOD ) + if ( !UAC.NO_SLOD ) headerInfo.add(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias")); headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?")); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index aae7816285..05a977add7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -54,8 +54,9 @@ public enum OUTPUT_MODE { EMIT_VARIANTS_ONLY, /** produces calls at variant sites and confident reference sites */ EMIT_ALL_CONFIDENT_SITES, - /** produces calls at any callable site regardless of confidence; this argument is intended for point - * mutations (SNPs) only and while some indel calls may be produced they are by no means comprehensive */ + /** produces calls at any callable site regardless of confidence; this argument is intended only for point + * mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by + * no means produce a comprehensive set of indels in DISCOVERY mode */ EMIT_ALL_SITES } @@ -236,14 +237,14 @@ public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, Referen // --------------------------------------------------------------------------------------------------------- // private method called by both UnifiedGenotyper and UGCalcLikelihoods entry points into the engine - private VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, Map stratifiedContexts, AlignmentContextUtils.ReadOrientation type, Allele alternateAlleleToUse, boolean useBAQedPileup, final GenotypeLikelihoodsCalculationModel.Model model) { + private VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, Map stratifiedContexts, AlignmentContextUtils.ReadOrientation type, List alternateAllelesToUse, boolean useBAQedPileup, final GenotypeLikelihoodsCalculationModel.Model model) { // initialize the data for this thread if that hasn't been done yet if ( glcm.get() == null ) { glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC)); } - return glcm.get().get(model).getLikelihoods(tracker, refContext, stratifiedContexts, type, getGenotypePriors(model), alternateAlleleToUse, useBAQedPileup && BAQEnabledOnCMDLine); + return glcm.get().get(model).getLikelihoods(tracker, refContext, stratifiedContexts, type, getGenotypePriors(model), alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser); } private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, AlignmentContext rawContext) { @@ -252,7 +253,7 @@ private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, Refe VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, rawContext.getLocation(), false, logger, UAC.alleles); if ( vcInput == null ) return 
null; - vc = new VariantContextBuilder(vcInput).source("UG_call").noID().referenceBaseForIndel(ref.getBase()).attributes(new HashMap()).filters(new HashSet()).make(); + vc = new VariantContextBuilder("UG_call", ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStart(), vcInput.getAlleles()).make(); } else { // deal with bad/non-standard reference bases if ( !Allele.acceptableAlleleBases(new byte[]{ref.getBase()}) ) @@ -294,12 +295,6 @@ public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, Referen } AlleleFrequencyCalculationResult AFresult = alleleFrequencyCalculationResult.get(); - // don't try to genotype too many alternate alleles - if ( vc.getAlternateAlleles().size() > UAC.MAX_ALTERNATE_ALLELES ) { - logger.warn("the Unified Genotyper is currently set to genotype at most " + UAC.MAX_ALTERNATE_ALLELES + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + vc.getAlternateAlleles().size() + " alternate alleles; see the --max_alternate_alleles argument"); - return null; - } - // estimate our confidence in a reference call and return if ( vc.getNSamples() == 0 ) { if ( limitedContext ) @@ -312,25 +307,32 @@ public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, Referen // 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position) clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); clearAFarray(AFresult.log10AlleleFrequencyPosteriors); - afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), AFresult); + List allelesUsedInGenotyping = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model), AFresult); // is the most likely frequency conformation AC=0 for all alternate alleles? boolean bestGuessIsRef = true; // determine which alternate alleles have AF>0 - boolean[] altAllelesToUse = new boolean[vc.getAlternateAlleles().size()]; + final List myAlleles = new ArrayList(vc.getAlleles().size()); + myAlleles.add(vc.getReference()); for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) { - int indexOfBestAC = MathUtils.maxElementIndex(AFresult.log10AlleleFrequencyPosteriors[i]); + final Allele alternateAllele = vc.getAlternateAllele(i); + final int indexOfAllele = allelesUsedInGenotyping.indexOf(alternateAllele); + // the genotyping model may have stripped it out + if ( indexOfAllele == -1 ) + continue; + + int indexOfBestAC = MathUtils.maxElementIndex(AFresult.log10AlleleFrequencyPosteriors[indexOfAllele-1]); // if the most likely AC is not 0, then this is a good alternate allele to use; // make sure to test against log10PosteriorOfAFzero since that no longer is an entry in the array - if ( indexOfBestAC != 0 && AFresult.log10AlleleFrequencyPosteriors[i][indexOfBestAC] > AFresult.log10PosteriorOfAFzero ) { - altAllelesToUse[i] = true; + if ( indexOfBestAC != 0 && AFresult.log10AlleleFrequencyPosteriors[indexOfAllele-1][indexOfBestAC] > AFresult.log10PosteriorOfAFzero ) { + myAlleles.add(alternateAllele); bestGuessIsRef = false; } // if in GENOTYPE_GIVEN_ALLELES mode, we still want to allow the use of a poor allele else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { - altAllelesToUse[i] = true; + myAlleles.add(alternateAllele); } } @@ -366,20 +368,6 @@ else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_M return limitedContext ? 
null : estimateReferenceConfidence(vc, stratifiedContexts, getGenotypePriors(model).getHeterozygosity(), true, 1.0 - PofF); } - // strip out any alleles that aren't going to be used in the VariantContext - final List myAlleles; - if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY ) { - myAlleles = new ArrayList(vc.getAlleles().size()); - myAlleles.add(vc.getReference()); - for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) { - if ( altAllelesToUse[i] ) - myAlleles.add(vc.getAlternateAllele(i)); - } - } else { - // use all of the alleles if we are given them by the user - myAlleles = vc.getAlleles(); - } - // start constructing the resulting VC final GenomeLoc loc = genomeLocParser.createGenomeLoc(vc); final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), myAlleles); @@ -393,7 +381,7 @@ else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_M } // create the genotypes - final GenotypesContext genotypes = assignGenotypes(vc, altAllelesToUse); + final GenotypesContext genotypes = subsetAlleles(vc, myAlleles, true); // print out stats if we have a writer if ( verboseWriter != null && !limitedContext ) @@ -406,33 +394,31 @@ else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_M if ( !limitedContext && rawContext.hasPileupBeenDownsampled() ) attributes.put(VCFConstants.DOWNSAMPLED_KEY, true); - if ( UAC.COMPUTE_SLOD && !limitedContext && !bestGuessIsRef ) { + if ( !UAC.NO_SLOD && !limitedContext && !bestGuessIsRef ) { //final boolean DEBUG_SLOD = false; // the overall lod - VariantContext vcOverall = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, vc.getAlternateAllele(0), false, model); - clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); - clearAFarray(AFresult.log10AlleleFrequencyPosteriors); - afcm.get().getLog10PNonRef(vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), AFresult); //double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0]; double overallLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0); //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); + List alternateAllelesToUse = builder.make().getAlternateAlleles(); + // the forward lod - VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, vc.getAlternateAllele(0), false, model); + VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, alternateAllelesToUse, false, model); clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); clearAFarray(AFresult.log10AlleleFrequencyPosteriors); - afcm.get().getLog10PNonRef(vcForward.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), AFresult); + afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult); //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); double forwardLog10PofNull = AFresult.log10PosteriorOfAFzero; double forwardLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0); //if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); // the reverse lod - VariantContext vcReverse = 
calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, vc.getAlternateAllele(0), false, model); + VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, alternateAllelesToUse, false, model); clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); clearAFarray(AFresult.log10AlleleFrequencyPosteriors); - afcm.get().getLog10PNonRef(vcReverse.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), AFresult); + afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult); //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); double reverseLog10PofNull = AFresult.log10PosteriorOfAFzero; double reverseLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0); @@ -771,52 +757,69 @@ public static VariantContext getVCFromAllelesRod(RefMetaDataTracker tracker, Ref /** * @param vc variant context with genotype likelihoods - * @param allelesToUse bit vector describing which alternate alleles from the vc are okay to use * @return genotypes */ - public static GenotypesContext assignGenotypes(final VariantContext vc, - final boolean[] allelesToUse) { + public static GenotypesContext assignGenotypes(final VariantContext vc) { + return subsetAlleles(vc, vc.getAlleles(), true); + } + + /** + * @param vc variant context with genotype likelihoods + * @param allelesToUse which alleles from the vc are okay to use; *** must be in the same relative order as those in the original VC *** + * @param assignGenotypes true if we should change the genotypes based on the (subsetted) PLs + * @return genotypes + */ + public static GenotypesContext subsetAlleles(final VariantContext vc, + final List allelesToUse, + final boolean assignGenotypes) { - // the no-called genotypes - final GenotypesContext GLs = vc.getGenotypes(); + // the genotypes with PLs + final GenotypesContext oldGTs = vc.getGenotypes(); // samples - final List sampleIndices = GLs.getSampleNamesOrderedByName(); + final List sampleIndices = oldGTs.getSampleNamesOrderedByName(); - // the new called genotypes to create - final GenotypesContext calls = GenotypesContext.create(); + // the new genotypes to create + final GenotypesContext newGTs = GenotypesContext.create(); // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward - final int numOriginalAltAlleles = allelesToUse.length; - final List newAlleles = new ArrayList(numOriginalAltAlleles+1); - newAlleles.add(vc.getReference()); - for ( int i = 0; i < numOriginalAltAlleles; i++ ) { - if ( allelesToUse[i] ) - newAlleles.add(vc.getAlternateAllele(i)); - } - final int numNewAltAlleles = newAlleles.size() - 1; + final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); + final int numNewAltAlleles = allelesToUse.size() - 1; + + // which PLs should be carried forward? 
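A minimal sketch of the idea behind this subsetting step, assuming the same VCF PL ordering as above and an explicit double loop in place of the engine's PLIndexToAlleleIndex cache (names are illustrative, not the engine's code):

import java.util.ArrayList;
import java.util.List;

// Sketch only: list the PL indexes whose genotype uses nothing but surviving alleles.
// Allele index 0 is the reference; alt allele i maps to keepAltAllele[i - 1].
final class PLSubsetSketch {
    static List<Integer> plIndexesToKeep(final int numOriginalAltAlleles,
                                         final boolean[] keepAltAllele) {
        final List<Integer> keep = new ArrayList<Integer>();
        int plIndex = 0;
        // enumerate diploid genotypes (j,k), j <= k, in VCF PL order
        for (int k = 0; k <= numOriginalAltAlleles; k++) {
            for (int j = 0; j <= k; j++, plIndex++) {
                final boolean jSurvives = (j == 0) || keepAltAllele[j - 1];
                final boolean kSurvives = (k == 0) || keepAltAllele[k - 1];
                if (jSurvives && kSurvives)
                    keep.add(plIndex);
            }
        }
        return keep;
    }
}

For example, with two original alternate alleles of which only the first survives, this keeps PL indexes 0, 1 and 2 (ref/ref, ref/alt1 and alt1/alt1) out of the original six.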
ArrayList likelihoodIndexesToUse = null; // an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles, // then we can keep the PLs as is; otherwise, we determine which ones to keep if ( numNewAltAlleles != numOriginalAltAlleles && numNewAltAlleles > 0 ) { likelihoodIndexesToUse = new ArrayList(30); + + // make sure that we've cached enough data + if ( numOriginalAltAlleles > PLIndexToAlleleIndex.length - 1 ) + calculatePLcache(numOriginalAltAlleles); final int[][] PLcache = PLIndexToAlleleIndex[numOriginalAltAlleles]; + final boolean[] altAlleleIndexToUse = new boolean[numOriginalAltAlleles]; + for ( int i = 0; i < numOriginalAltAlleles; i++ ) { + if ( allelesToUse.contains(vc.getAlternateAllele(i)) ) + altAlleleIndexToUse[i] = true; + } + for ( int PLindex = 0; PLindex < PLcache.length; PLindex++ ) { - int[] alleles = PLcache[PLindex]; + final int[] alleles = PLcache[PLindex]; // consider this entry only if both of the alleles are good - if ( (alleles[0] == 0 || allelesToUse[alleles[0] - 1]) && (alleles[1] == 0 || allelesToUse[alleles[1] - 1]) ) + if ( (alleles[0] == 0 || altAlleleIndexToUse[alleles[0] - 1]) && (alleles[1] == 0 || altAlleleIndexToUse[alleles[1] - 1]) ) likelihoodIndexesToUse.add(PLindex); } } // create the new genotypes - for ( int k = GLs.size() - 1; k >= 0; k-- ) { - final String sample = sampleIndices.get(k); - final Genotype g = GLs.get(sample); - if ( !g.hasLikelihoods() ) + for ( int k = 0; k < oldGTs.size(); k++ ) { + final Genotype g = oldGTs.get(sampleIndices.get(k)); + if ( !g.hasLikelihoods() ) { + newGTs.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false)); continue; + } // create the new likelihoods array from the alleles we are allowed to use final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); @@ -833,29 +836,38 @@ public static GenotypesContext assignGenotypes(final VariantContext vc, newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); } - // if there is no mass on the (new) likelihoods and we actually have alternate alleles, then just no-call the sample + // if there is no mass on the (new) likelihoods, then just no-call the sample if ( MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) { - calls.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false)); - continue; + newGTs.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false)); + } + else { + Map attrs = new HashMap(g.getAttributes()); + if ( numNewAltAlleles == 0 ) + attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY); + else + attrs.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.fromLog10Likelihoods(newLikelihoods)); + + // if we weren't asked to assign a genotype, then just no-call the sample + if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) + newGTs.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, attrs, false)); + else + newGTs.add(assignGenotype(g, newLikelihoods, allelesToUse, numNewAltAlleles, attrs)); } - - // find the genotype with maximum likelihoods - int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods); - int[] alleles = PLIndexToAlleleIndex[numNewAltAlleles][PLindex]; - - ArrayList myAlleles = new ArrayList(); - myAlleles.add(newAlleles.get(alleles[0])); - myAlleles.add(newAlleles.get(alleles[1])); - - final double qual = numNewAltAlleles == 0 ? 
Genotype.NO_LOG10_PERROR : GenotypeLikelihoods.getQualFromLikelihoods(PLindex, newLikelihoods); - Map attrs = new HashMap(g.getAttributes()); - if ( numNewAltAlleles == 0 ) - attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY); - else - attrs.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.fromLog10Likelihoods(newLikelihoods)); - calls.add(new Genotype(sample, myAlleles, qual, null, attrs, false)); } - - return calls; + + return newGTs; + } + + protected static Genotype assignGenotype(final Genotype originalGT, final double[] newLikelihoods, final List allelesToUse, final int numNewAltAlleles, final Map attrs) { + // find the genotype with maximum likelihoods + int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods); + int[] alleles = PLIndexToAlleleIndex[numNewAltAlleles][PLindex]; + + ArrayList myAlleles = new ArrayList(); + myAlleles.add(allelesToUse.get(alleles[0])); + myAlleles.add(allelesToUse.get(alleles[1])); + + final double qual = numNewAltAlleles == 0 ? Genotype.NO_LOG10_PERROR : GenotypeLikelihoods.getQualFromLikelihoods(PLindex, newLikelihoods); + return new Genotype(originalGT.getSampleName(), myAlleles, qual, null, attrs, false); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java index 200a250f24..26023bd2ff 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java @@ -454,8 +454,7 @@ public double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, HashMap // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+x0^x2)-log10(2) // First term is approximated by Jacobian log with table lookup. // Second term is a constant added to both likelihoods so will be ignored - haplotypeLikehoodMatrix[i][j] += MathUtils.softMax(readLikelihood[0], - readLikelihood[1]); + haplotypeLikehoodMatrix[i][j] += MathUtils.approximateLog10SumLog10(readLikelihood[0], readLikelihood[1]); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 6410d619d4..64993b43ae 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -166,18 +166,17 @@ private void updateCell(final int indI, final int indJ, final int X_METRIC_LENGT final double pBaseRead = (x == y)? baseMatchArray[(int)qual]:baseMismatchArray[(int)qual]; - matchMetricArray[indI][indJ] = MathUtils.softMax(matchMetricArray[im1][jm1] + pBaseRead, XMetricArray[im1][jm1] + pBaseRead, - YMetricArray[im1][jm1] + pBaseRead); + matchMetricArray[indI][indJ] = pBaseRead + MathUtils.approximateLog10SumLog10(new double[]{matchMetricArray[im1][jm1], XMetricArray[im1][jm1], YMetricArray[im1][jm1]}); final double c1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1]; final double d1 = indJ == Y_METRIC_LENGTH-1 ? 
END_GAP_COST : currentGCP[jm1]; - XMetricArray[indI][indJ] = MathUtils.softMax(matchMetricArray[im1][indJ] + c1, XMetricArray[im1][indJ] + d1); + XMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10(matchMetricArray[im1][indJ] + c1, XMetricArray[im1][indJ] + d1); // update Y array final double c2 = indI == X_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1]; final double d2 = indI == X_METRIC_LENGTH-1 ? END_GAP_COST : currentGCP[jm1]; - YMetricArray[indI][indJ] = MathUtils.softMax(matchMetricArray[indI][jm1] + c2, YMetricArray[indI][jm1] + d2); + YMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10(matchMetricArray[indI][jm1] + c2, YMetricArray[indI][jm1] + d2); } } @@ -316,9 +315,7 @@ else if (bestMetric < maxElementInDiag - DIAG_TOL) final int bestI = X_METRIC_LENGTH - 1, bestJ = Y_METRIC_LENGTH - 1; - final double bestMetric = MathUtils.softMax(matchMetricArray[bestI][bestJ], - XMetricArray[bestI][bestJ], - YMetricArray[bestI][bestJ]); + final double bestMetric = MathUtils.approximateLog10SumLog10(new double[]{ matchMetricArray[bestI][bestJ], XMetricArray[bestI][bestJ], YMetricArray[bestI][bestJ] }); /* if (DEBUG) { @@ -651,7 +648,7 @@ private int computeFirstDifferingPosition(double[] b1, double[] b2) { private final static double[] getHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) { final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes]; - // todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplied to just a single loop without the intermediate NxN matrix + // todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplified to just a single loop without the intermediate NxN matrix for (int i=0; i < numHaplotypes; i++) { for (int j=i; j < numHaplotypes; j++){ // combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j] @@ -665,7 +662,7 @@ private final static double[] getHaplotypeLikelihoods(final int numHaplotypes, f final double li = readLikelihoods[readIdx][i]; final double lj = readLikelihoods[readIdx][j]; final int readCount = readCounts[readIdx]; - haplotypeLikehoodMatrix[i][j] += readCount * (MathUtils.softMax(li, lj) + LOG_ONE_HALF); + haplotypeLikehoodMatrix[i][j] += readCount * (MathUtils.approximateLog10SumLog10(li, lj) + LOG_ONE_HALF); } } } @@ -678,7 +675,7 @@ private final static double[] getHaplotypeLikelihoods(final int numHaplotypes, f } } - // renormalize so that max element is zero. + // renormalize so that max element is zero. return MathUtils.normalizeFromLog10(genotypeLikelihoods, false, true); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java index aa9ae1517e..59a7bd01a2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java @@ -26,6 +26,10 @@ package org.broadinstitute.sting.gatk.walkers.indels; import net.sf.samtools.*; +import org.apache.commons.jexl2.Expression; +import org.apache.commons.jexl2.JexlContext; +import org.apache.commons.jexl2.JexlEngine; +import org.apache.commons.jexl2.MapContext; import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -71,7 +75,7 @@ *
 * <p/>
 * This is a simple, counts-and-cutoffs based tool for calling indels from aligned (preferrably MSA cleaned) sequencing
 * data. Supported output formats are: BED format, extended verbose output (tab separated), and VCF. The latter two outputs
- * include additional statistics such as mismtaches and base qualitites around the calls, read strandness (how many
+ * include additional statistics such as mismatches and base qualities around the calls, read strandedness (how many
 * forward/reverse reads support ref and indel alleles) etc. It is highly recommended to use these additional
 * statistics to perform post-filtering of the calls as the tool is tuned for sensitivity (in other words it will
 * attempt to "call" anything remotely reasonable based only on read counts and will generate all the additional
@@ -88,6 +92,16 @@
 * bam tagging is not required in this case, and tags are completely ignored if still used: all input bams will be merged
 * on the fly and assumed to represent a single sample - this tool does not check for sample id in the read groups).
 *
+ * Which (putative) calls will make it into the output file(s) is controlled by an expression/list of expressions passed with the -filter
+ * flag: if any of the expressions evaluates to TRUE, the site will be discarded. Otherwise the putative call and all the
+ * associated statistics will be printed into the output. Expressions recognize the following variables (in paired-sample
+ * somatic mode the variables are prefixed with T_ and N_ for Tumor and Normal, e.g. N_COV and T_COV are defined instead of COV):
+ * COV for the coverage at the site, INDEL_F for the fraction of reads supporting the consensus indel at the site (wrt total coverage),
+ * INDEL_CF for the fraction of reads with the consensus indel wrt all reads with an indel at the site, and CONS_CNT for the count of
+ * reads supporting the consensus indel at the site. Conventional arithmetic and logical operations are supported. For instance,
+ * N_COV<4||T_COV<6||T_INDEL_F<0.3||T_INDEL_CF<0.7 instructs the tool to output only indel calls with at least 30% observed
+ * allelic fraction, with the consensus indel making up at least 70% of all indel observations at the site, and only at sites
+ * where tumor coverage and normal coverage are at least 6 and 4, respectively.
 *
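To make the filter semantics concrete, here is a small standalone sketch of how such an expression behaves under the commons-jexl2 API that the walker now imports; the class name and metric values are invented for illustration:

import org.apache.commons.jexl2.Expression;
import org.apache.commons.jexl2.JexlContext;
import org.apache.commons.jexl2.JexlEngine;
import org.apache.commons.jexl2.MapContext;

// Standalone illustration of the -filter semantics: TRUE means "discard this site".
// The metric values are invented; only the JEXL API mirrors the walker's usage.
public class FilterExpressionDemo {
    public static void main(final String[] args) {
        final JexlEngine jexl = new JexlEngine();
        final Expression filter =
                jexl.createExpression("T_COV<6||N_COV<4||T_INDEL_F<0.3||T_INDEL_CF<0.7");

        final JexlContext jc = new MapContext();
        jc.set("T_COV", 12);       // tumor coverage at the site
        jc.set("N_COV", 8);        // normal coverage at the site
        jc.set("T_INDEL_F", 0.45); // fraction of tumor reads supporting the consensus indel
        jc.set("T_INDEL_CF", 0.9); // consensus fraction among tumor indel-bearing reads

        final boolean discard = (Boolean) filter.evaluate(jc);
        System.out.println(discard ? "site discarded" : "site emitted"); // site emitted
    }
}

The same mechanism drives the tool's default behavior, which discards a site when T_COV<6||N_COV<4||T_INDEL_F<0.3||T_INDEL_CF<0.7 evaluates to TRUE.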

 * <h2>Input</h2>
 * <p/>
* Tumor and normal bam files (or single sample bam file(s) in --unpaired mode). @@ -147,30 +161,44 @@ public class SomaticIndelDetectorWalker extends ReadWalker { doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false) java.io.File bedOutput = null; + @Deprecated @Argument(fullName="minCoverage", shortName="minCoverage", doc="indel calls will be made only at sites with tumor coverage of minCoverage or more reads; "+ - "with --unpaired (single sample) option, this value is used for minimum sample coverage", required=false) + "with --unpaired (single sample) option, this value is used for minimum sample coverage. "+ + "INSTEAD USE: T_COV { "GENOMIC/UTR/INTRON/CODING and with the gene name", required=false) String RefseqFileName = null; + + @Argument(shortName="filter", doc="One or more logical expressions. If any of the expressions is TRUE, " + + "putative indel will be discarded and nothing will be printed into the output (unless genotyping "+ + "at the specific position is explicitly requested, see -genotype). "+ + "Default: T_COV<6||N_COV<4||T_INDEL_F<0.3||T_INDEL_CF<0.7", required=false) + public ArrayList FILTER_EXPRESSIONS = new ArrayList(); + //@Argument(fullName="blacklistedLanes", shortName="BL", // doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+ // "by this application, so they will not contribute indels to consider and will not be counted.", required=false) @@ -221,7 +256,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker { private Writer verboseWriter = null; - private static String annGenomic = "GENOMIC"; + private static String annGenomic = "GENOMIC\t"; private static String annIntron = "INTRON"; private static String annUTR = "UTR"; private static String annCoding = "CODING"; @@ -245,6 +280,32 @@ enum CallType { private long lastGenotypedPosition = -1; // last position on the currentGenotypeInterval, for which a call was already printed; // can be 1 base before lastGenotyped start + private JexlEngine jexlEngine = new JexlEngine(); + private ArrayList jexlExpressions = new ArrayList(); + + // the following arrays store indel source-specific (normal/tumor) metric names + // for fast access when populating JEXL expression contexts (see IndelPrecall.fillContext()) + private final static String[] normalMetricsCassette = new String[4]; + private final static String[] tumorMetricsCassette = new String[4]; + private final static String[] singleMetricsCassette = new String[4]; + private final static int C_COV=0; + private final static int C_CONS_CNT=1; + private final static int C_INDEL_F=2; + private final static int C_INDEL_CF=3; + static { + normalMetricsCassette[C_COV] = "N_COV"; + tumorMetricsCassette[C_COV] = "T_COV"; + singleMetricsCassette[C_COV] = "COV"; + normalMetricsCassette[C_CONS_CNT] = "N_CONS_CNT"; + tumorMetricsCassette[C_CONS_CNT] = "T_CONS_CNT"; + singleMetricsCassette[C_CONS_CNT] = "CONS_CNT"; + normalMetricsCassette[C_INDEL_F] = "N_INDEL_F"; + tumorMetricsCassette[C_INDEL_F] = "T_INDEL_F"; + singleMetricsCassette[C_INDEL_F] = "INDEL_F"; + normalMetricsCassette[C_INDEL_CF] = "N_INDEL_CF"; + tumorMetricsCassette[C_INDEL_CF] = "T_INDEL_CF"; + singleMetricsCassette[C_INDEL_CF] = "INDEL_CF"; + } // "/humgen/gsa-scr1/GATK_Data/refGene.sorted.txt" @@ -389,6 +450,24 @@ public void initialize() { vcf_writer.writeHeader(new VCFHeader(getVCFHeaderInfo(), SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()))) ; refData = new 
ReferenceDataSource(getToolkit().getArguments().referenceFile); + + // Now initialize JEXL expressions: + if ( FILTER_EXPRESSIONS.size() == 0 ) { + if ( call_unpaired ) { + FILTER_EXPRESSIONS.add("COV<6||INDEL_F<0.3||INDEL_CF<0.7"); + } else { + FILTER_EXPRESSIONS.add("T_COV<6||N_COV<4||T_INDEL_F<0.3||T_INDEL_CF<0.7"); + } + } + for ( String s : FILTER_EXPRESSIONS ) { + try { + Expression e = jexlEngine.createExpression(s); + jexlExpressions.add(e); + } catch (Exception e) { + throw new UserException.BadArgumentValue("Filter expression", "Invalid expression used (" + s + "). Please see the JEXL docs for correct syntax.") ; + } + + } } @@ -661,14 +740,26 @@ private void emit(long position, boolean force) { if ( normal_context.indelsAt(pos).size() == 0 && ! genotype ) continue; IndelPrecall normalCall = new IndelPrecall(normal_context,pos,NQS_WIDTH); + JexlContext jc = new MapContext(); + normalCall.fillContext(jc,singleMetricsCassette); + boolean discard_event = false; - if ( normalCall.getCoverage() < minCoverage && ! genotype ) { - if ( DEBUG ) { - System.out.println("DEBUG>> Indel at "+pos+"; coverare in normal="+normalCall.getCoverage()+" (SKIPPED)"); - } - continue; // low coverage + for ( Expression e : jexlExpressions ) { + if ( ((Boolean)e.evaluate(jc)).booleanValue() ) { discard_event=true; break; } } + if ( discard_event && ! genotype ) { + normal_context.indelsAt(pos).clear(); + continue; //* + } + +// if ( normalCall.getCoverage() < minCoverage && ! genotype ) { +// if ( DEBUG ) { +// System.out.println("DEBUG>> Indel at "+pos+"; coverare in normal="+normalCall.getCoverage()+" (SKIPPED)"); +// } +// continue; // low coverage +// } + if ( DEBUG ) System.out.println("DEBUG>> "+(normalCall.getAllVariantCount() == 0?"No Indel":"Indel")+" at "+pos); long left = Math.max( pos-NQS_WIDTH, normal_context.getStart() ); @@ -697,24 +788,16 @@ private void emit(long position, boolean force) { location = getToolkit().getGenomeLocParser().createGenomeLoc(location.getContig(), pos); - boolean haveCall = normalCall.isCall(); // cache the value - - if ( haveCall || genotype) { - if ( haveCall ) normalCallsMade++; - printVCFLine(vcf_writer,normalCall); - if ( bedWriter != null ) normalCall.printBedLine(bedWriter); - if ( verboseWriter != null ) printVerboseLine(verboseWriter, normalCall); - lastGenotypedPosition = pos; - } + if ( ! discard_event ) normalCallsMade++; + printVCFLine(vcf_writer,normalCall, discard_event); + if ( bedWriter != null ) normalCall.printBedLine(bedWriter); + if ( verboseWriter != null ) printVerboseLine(verboseWriter, normalCall, discard_event); + lastGenotypedPosition = pos; normal_context.indelsAt(pos).clear(); // we dealt with this indel; don't want to see it again // (we might otherwise in the case when 1) there is another indel that follows // within MISMATCH_WIDTH bases and 2) we'd need to wait for more coverage for that next indel) - -// for ( IndelVariant var : variants ) { -// System.out.print("\t"+var.getType()+"\t"+var.getBases()+"\t"+var.getCount()); -// } } if ( DEBUG ) System.out.println("DEBUG>> Actual shift to " + move_to + " ("+adjustedPosition+")"); @@ -829,18 +912,32 @@ private void emit_somatic(long position, boolean force) { IndelPrecall tumorCall = new IndelPrecall(tumor_context,pos,NQS_WIDTH); IndelPrecall normalCall = new IndelPrecall(normal_context,pos,NQS_WIDTH); - if ( tumorCall.getCoverage() < minCoverage && ! 
- if ( tumorCall.getCoverage() < minCoverage && ! genotype ) { - if ( DEBUG ) { - System.out.println("DEBUG>> Indel in tumor at "+pos+"; coverare in tumor="+tumorCall.getCoverage()+" (SKIPPED)"); - } - continue; // low coverage + JexlContext jc = new MapContext(); + tumorCall.fillContext(jc,tumorMetricsCassette); + normalCall.fillContext(jc,normalMetricsCassette); + boolean discard_event = false; + + for ( Expression e : jexlExpressions ) { + if ( ((Boolean)e.evaluate(jc)).booleanValue() ) { discard_event=true; break; } } - if ( normalCall.getCoverage() < minNormalCoverage && ! genotype ) { - if ( DEBUG ) { - System.out.println("DEBUG>> Indel in tumor at "+pos+"; coverare in normal="+normalCall.getCoverage()+" (SKIPPED)"); - } - continue; // low coverage + + if ( discard_event && ! genotype ) { + tumor_context.indelsAt(pos).clear(); + normal_context.indelsAt(pos).clear(); + continue; //* + } +// if ( tumorCall.getCoverage() < minCoverage && ! genotype ) { +// if ( DEBUG ) { +// System.out.println("DEBUG>> Indel in tumor at "+pos+"; coverage in tumor="+tumorCall.getCoverage()+" (SKIPPED)"); +// } +// continue; // low coverage +// } +// if ( normalCall.getCoverage() < minNormalCoverage && ! genotype ) { +// if ( DEBUG ) { +// System.out.println("DEBUG>> Indel in tumor at "+pos+"; coverage in normal="+normalCall.getCoverage()+" (SKIPPED)"); +// } +// continue; // low coverage +// } if ( DEBUG ) { System.out.print("DEBUG>> "+(tumorCall.getAllVariantCount() == 0?"No Indel":"Indel")+" in tumor, "); @@ -868,32 +965,24 @@ private void emit_somatic(long position, boolean force) { if ( right > tumor_context.getStop() ) right = tumor_context.getStop(); // if indel is too close to the end of the window but we need to emit anyway (force-shift), adjust right -// location = getToolkit().getGenomeLocParser().setStart(location,pos); -// location = getToolkit().getGenomeLocParser().setStop(location,pos); // retrieve annotation data - location = getToolkit().getGenomeLocParser().createGenomeLoc(location.getContig(),pos); // retrieve annotation data - boolean haveCall = tumorCall.isCall(); // cache the value +// boolean haveCall = tumorCall.isCall(); // cache the value - if ( haveCall || genotype ) { - if ( haveCall ) tumorCallsMade++; + if ( !
discard_event ) tumorCallsMade++; - printVCFLine(vcf_writer,normalCall,tumorCall); + printVCFLine(vcf_writer,normalCall,tumorCall,discard_event); - if ( bedWriter != null ) tumorCall.printBedLine(bedWriter); + if ( bedWriter != null ) tumorCall.printBedLine(bedWriter); + + if ( verboseWriter != null ) printVerboseLine(verboseWriter, normalCall, tumorCall, discard_event ); + lastGenotypedPosition = pos; - if ( verboseWriter != null ) printVerboseLine(verboseWriter, normalCall, tumorCall ); - lastGenotypedPosition = pos; - } tumor_context.indelsAt(pos).clear(); normal_context.indelsAt(pos).clear(); // we dealt with this indel; don't want to see it again // (we might otherwise in the case when 1) there is another indel that follows // within MISMATCH_WIDTH bases and 2) we'd need to wait for more coverage for that next indel) - -// for ( IndelVariant var : variants ) { -// System.out.print("\t"+var.getType()+"\t"+var.getBases()+"\t"+var.getCount()); -// } } if ( DEBUG ) System.out.println("DEBUG>> Actual shift to " + move_to + " ("+adjustedPosition+")"); @@ -947,14 +1036,14 @@ private String getAnnotationString(RODRecordList ann) { } - public void printVerboseLine(Writer verboseWriter, IndelPrecall normalCall) { + public void printVerboseLine(Writer verboseWriter, IndelPrecall normalCall, boolean discard_event) { RODRecordList annotationList = (refseqIterator == null ? null : refseqIterator.seekForward(location)); String annotationString = (refseqIterator == null ? "" : getAnnotationString(annotationList)); StringBuilder fullRecord = new StringBuilder(); fullRecord.append(makeFullRecord(normalCall)); fullRecord.append(annotationString); - if ( ! normalCall.isCall() && normalCall.getVariant() != null ) fullRecord.append("\tFILTERED_NOCALL"); + if ( discard_event && normalCall.getVariant() != null ) fullRecord.append("\tFILTERED_NOCALL"); try { verboseWriter.write(fullRecord.toString()); verboseWriter.write('\n'); @@ -965,7 +1054,7 @@ public void printVerboseLine(Writer verboseWriter, IndelPrecall normalCall) { } - public void printVerboseLine(Writer verboseWriter, IndelPrecall normalCall, IndelPrecall tumorCall) { + public void printVerboseLine(Writer verboseWriter, IndelPrecall normalCall, IndelPrecall tumorCall, boolean discard_event) { RODRecordList annotationList = (refseqIterator == null ? null : refseqIterator.seekForward(location)); String annotationString = (refseqIterator == null ? "" : getAnnotationString(annotationList)); @@ -1013,7 +1102,7 @@ public void printVerboseLine(Writer verboseWriter, IndelPrecall normalCall, Inde fullRecord.append('\t'); fullRecord.append(annotationString); - if ( ! tumorCall.isCall() && tumorCall.getVariant() != null ) fullRecord.append("\tFILTERED_NOCALL"); + if ( discard_event && tumorCall.getVariant() != null ) fullRecord.append("\tFILTERED_NOCALL"); try { verboseWriter.write(fullRecord.toString()); @@ -1023,7 +1112,7 @@ public void printVerboseLine(Writer verboseWriter, IndelPrecall normalCall, Inde } } - public void printVCFLine(VCFWriter vcf, IndelPrecall call) { + public void printVCFLine(VCFWriter vcf, IndelPrecall call, boolean discard_event) { long start = call.getPosition()-1; // If the beginning of the chromosome is deleted (possible, however unlikely), it's unclear how to proceed. @@ -1060,14 +1149,14 @@ public void printVCFLine(VCFWriter vcf, IndelPrecall call) { Map attrs = call.makeStatsAttributes(null); - if ( call.isCall() ) // we made a call - put actual het genotype here: + if ( ! 
discard_event ) // we made a call - put actual het genotype here: genotypes.add(new Genotype(sample,alleles,Genotype.NO_LOG10_PERROR,null,attrs,false)); else // no call: genotype is ref/ref (but alleles still contain the alt if we observed anything at all) genotypes.add(new Genotype(sample, homref_alleles,Genotype.NO_LOG10_PERROR,null,attrs,false)); } Set filters = null; - if ( call.getVariant() != null && ! call.isCall() ) { + if ( call.getVariant() != null && discard_event ) { filters = new HashSet(); filters.add("NoCall"); } @@ -1095,7 +1184,7 @@ private void fillAlleleList(List l, IndelPrecall call) { } } - public void printVCFLine(VCFWriter vcf, IndelPrecall nCall, IndelPrecall tCall) { + public void printVCFLine(VCFWriter vcf, IndelPrecall nCall, IndelPrecall tCall, boolean discard_event) { long start = tCall.getPosition()-1; long stop = start; @@ -1112,7 +1201,7 @@ public void printVCFLine(VCFWriter vcf, IndelPrecall nCall, IndelPrecall tCall) Map attrs = new HashMap(); boolean isSomatic = false; - if ( nCall.getCoverage() >= minNormalCoverage && nCall.getVariant() == null && tCall.getVariant() != null ) { + if ( nCall.getVariant() == null && tCall.getVariant() != null ) { isSomatic = true; attrs.put(VCFConstants.SOMATIC_KEY,true); } @@ -1155,7 +1244,7 @@ public void printVCFLine(VCFWriter vcf, IndelPrecall nCall, IndelPrecall tCall) } Set filters = null; - if ( tCall.getVariant() != null && ! tCall.isCall() ) { + if ( tCall.getVariant() != null && discard_event ) { filters = new HashSet(); filters.add("NoCall"); } @@ -1602,6 +1691,13 @@ public double getNQSRefAvQual() { public IndelVariant getVariant() { return consensus_indel; } + public void fillContext(JexlContext context,String[] cassette) { + context.set(cassette[C_INDEL_F],((double)consensus_indel_count)/total_coverage); + context.set(cassette[C_INDEL_CF],((double)consensus_indel_count/all_indel_count)); + context.set(cassette[C_COV],total_coverage); + context.set(cassette[C_CONS_CNT],consensus_indel_count); + } +/* public boolean isCall() { boolean ret = ( consensus_indel_count >= minIndelCount && (double)consensus_indel_count > minFraction * total_coverage && @@ -1610,10 +1706,11 @@ public boolean isCall() { " total_count="+all_indel_count+" cov="+total_coverage+ " minConsensusF="+((double)consensus_indel_count)/all_indel_count+ " minF="+((double)consensus_indel_count)/total_coverage); - return ret; + return ret; +// return true; } - +*/ /** Utility method: finds the indel variant with the largest count (ie consensus) among all the observed * variants, and sets the counts of consensus observations and all observations of any indels (including non-consensus) * @param variants diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AllelePair.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AllelePair.java index cb123c8683..c629bd313b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AllelePair.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AllelePair.java @@ -36,7 +36,7 @@ public class AllelePair { public AllelePair(Genotype gt) { if (gt.getPloidy() != 2) - throw new ReviewedStingException("AllelePair must have ploidy of 2!"); + throw new ReviewedStingException("AllelePair must have ploidy of 2! 
incoming gt was"+gt.toBriefString()); this.top = gt.getAllele(0); this.bottom = gt.getAllele(1); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/QCRefWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/QCRefWalker.java new file mode 100644 index 0000000000..ab5324e39e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/QCRefWalker.java @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.qc; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.picard.reference.ReferenceSequence; +import net.sf.samtools.SAMSequenceRecord; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.RefWalker; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.collections.ExpandingArrayList; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.StingException; + +import java.io.PrintStream; +import java.util.Collections; +import java.util.List; + +/** + * Prints out counts of the number of reference ordered data objects encountered. + * + * + *

<h2>Input</h2>
+ * <p>
+ * One reference file only. And optionally -L intervals
+ * </p>
+ *
+ * <h2>Output</h2>
+ * <p>
+ * If ok, nothing, else will throw an exception at the site where there's been a problem
+ * </p>
+ *
+ * <h2>Examples</h2>
+ * <pre>
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T QCRefWalker
+ * </pre>
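// Illustrative sketch, not part of this patch: the core of the consistency check below, stated
// standalone with picard's IndexedFastaSequenceFile (the path and contig are placeholders):
//
//     IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new java.io.File("ref.fasta"));
//     byte[] uncached = fasta.getSubsequenceAt("20", 1, fasta.getSequence("20").length()).getBases();
//     // for every locus visited, the cached base handed to map() must equal uncached[pos - 1];
//     // any disagreement (or an irregular base) aborts the run with a StingException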
+ * + */ +public class QCRefWalker extends RefWalker { + @Output + public PrintStream out; + + String contigName = ""; + int contigStart, contigEnd; + IndexedFastaSequenceFile uncachedRef; + byte[] uncachedBases; + + @Override + public void initialize() { + super.initialize(); //To change body of overridden methods use File | Settings | File Templates. + uncachedRef = getToolkit().getReferenceDataSource().getReference(); + } + + private final void throwError(ReferenceContext ref, String message) { + throw new StingException(String.format("Site %s failed: %s", ref.getLocus(), message)); + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + final String locusContigName = ref.getLocus().getContig(); + if ( ! locusContigName.equals(contigName) ) { + contigName = locusContigName; + ReferenceSequence refSeq = uncachedRef.getSequence(contigName); + contigStart = 1; + contigEnd = contigStart + refSeq.length() - 1; + uncachedBases = uncachedRef.getSubsequenceAt(contigName, contigStart, contigEnd).getBases(); + logger.info(String.format("Loading contig %s (%d-%d)", contigName, contigStart, contigEnd)); + } + + final byte refBase = ref.getBase(); + if (! ( BaseUtils.isRegularBase(refBase) || isExtendFastaBase(refBase) ) ) + throwError(ref, String.format("Refbase isn't a regular base (%d %c)", refBase, (char)refBase)); + + // check bases are equal + final int pos = (int)context.getPosition() - contigStart; + if ( pos > contigEnd ) + throwError(ref, String.format("off contig (len=%d)", contigEnd)); + final byte uncachedBase = uncachedBases[pos]; + + if ( uncachedBase != refBase ) + throwError(ref, String.format("Provided refBase (%d %c) not equal to uncached one (%d %c)", + refBase, (char)refBase, uncachedBase, (char)uncachedBase)); + + return 1; + } + + private static final boolean isExtendFastaBase(final byte b) { + switch ( b ) { + case 'U': + case 'R': + case 'Y': + case 'K': + case 'M': + case 'S': + case 'W': + case 'B': + case 'D': + case 'H': + case 'V': + case 'N': + case 'X': + case '-': + return true; + default: + return false; + } + } + + public Integer reduceInit() { + return 0; + } + + public Integer reduce(Integer one, Integer sum) { + return one + sum; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ContextCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ContextCovariate.java new file mode 100644 index 0000000000..e1a7772dbc --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ContextCovariate.java @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2011 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.recalibration; + +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.Arrays; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: 9/26/11 + */ + +public class ContextCovariate implements ExperimentalCovariate { + + private int CONTEXT_SIZE; + private String allN = ""; + + // Initialize any member variables using the command-line arguments passed to the walkers + @Override + public void initialize(final RecalibrationArgumentCollection RAC) { + CONTEXT_SIZE = RAC.CONTEXT_SIZE; + + if (CONTEXT_SIZE <= 0) + throw new UserException("Context Size must be positive, if you don't want to use the context covariate, just turn it off instead"); + + // initialize allN given the size of the context + for (int i = 0; i < CONTEXT_SIZE; i++) + allN += "N"; + } + + @Override + public void getValues(final GATKSAMRecord read, final Comparable[] comparable) { + byte[] bases = read.getReadBases(); + for (int i = 0; i < read.getReadLength(); i++) + comparable[i] = (i < CONTEXT_SIZE) ? allN : new String(Arrays.copyOfRange(bases, i - CONTEXT_SIZE, i)); + } + + // Used to get the covariate's value from input csv file in TableRecalibrationWalker + @Override + public final Comparable getValue(final String str) { + return str; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java index 88a9668cce..a99f35f458 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java @@ -41,6 +41,7 @@ import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.io.PrintStream; @@ -76,20 +77,20 @@ *

<h2>Output</h2>
 * <p>
* A recalibration table file in CSV format that is used by the TableRecalibration walker. - * It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and its implied empirical quality score. + * It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and its implied empirical quality score. * - * The first 20 lines of such a file is shown below. + * The first 20 lines of such a file is shown below. * * The file begins with a series of comment lines describing: * ** The number of counted loci * ** The number of counted bases * ** The number of skipped loci and the fraction skipped, due to presence in dbSNP or bad reference bases - * - * * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records. + * + * * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records. * * * After the header, data records occur one per line until the end of the file. The first several items on a line are the values of the individual covariates and will change - * depending on which covariates were specified at runtime. The last three items are the data- that is, number of observations for this combination of covariates, number of + * depending on which covariates were specified at runtime. The last three items are the data- that is, number of observations for this combination of covariates, number of * reference mismatches, and the raw empirical quality score calculated by phred-scaling the mismatch rate. - * + * *
 * <pre>
  * # Counted Sites    19451059
  * # Counted Bases    56582018
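// Illustrative record, not from a real run: with the required plus standard covariates the header
// line is ReadGroup,QualityScore,Cycle,Dinuc,nObservations,nMismatches,Qempirical, and a data
// record could look like
//
//     SRR000123.7,25,7,AC,10232,28,26
//
// i.e. 28 mismatches in 10232 observations, phred-scaled to an empirical Q26.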
@@ -128,13 +129,14 @@
  *   -cov DinucCovariate \
  *   -recalFile my_reads.recal_data.csv
 * </pre>
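// Illustrative sketch, not part of this patch: "phred-scaling the mismatch rate" is assumed here
// to mean the usual Q = -10*log10(errorRate) transform:
//
//     final double rate = (double) nMismatches / (double) nObservations;
//     final int qEmpirical = (int) Math.round(-10.0 * Math.log10(Math.max(rate, 1e-10)));
//     // e.g. 28 mismatches in 10232 observations -> rate ~0.0027 -> Q26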
- * */ @BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN) -@By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file -@ReadFilters( {MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class} ) // Filter out all reads with zero or unavailable mapping quality -@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta +@By(DataSource.READS) // Only look at covered loci, not every loci of the reference file +@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class}) +// Filter out all reads with zero or unavailable mapping quality +@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES}) +// This walker requires both -I input.bam and -R reference.fasta @PartitionBy(PartitionType.LOCUS) public class CountCovariatesWalker extends LocusWalker implements TreeReducible { @@ -148,16 +150,19 @@ public class CountCovariatesWalker extends LocusWalker> knownSites = Collections.emptyList(); /** @@ -166,31 +171,31 @@ public class CountCovariatesWalker extends LocusWalker> covariateClasses = new PluginManager( Covariate.class ).getPlugins(); - final List> requiredClasses = new PluginManager( RequiredCovariate.class ).getPlugins(); - final List> standardClasses = new PluginManager( StandardCovariate.class ).getPlugins(); + final List> covariateClasses = new PluginManager(Covariate.class).getPlugins(); + final List> requiredClasses = new PluginManager(RequiredCovariate.class).getPlugins(); + final List> standardClasses = new PluginManager(StandardCovariate.class).getPlugins(); // Print and exit if that's what was requested - if ( LIST_ONLY ) { - logger.info( "Available covariates:" ); - for( Class covClass : covariateClasses ) { - logger.info( covClass.getSimpleName() ); + if (LIST_ONLY) { + logger.info("Available covariates:"); + for (Class covClass : covariateClasses) { + logger.info(covClass.getSimpleName()); } logger.info(""); - System.exit( 0 ); // Early exit here because user requested it + System.exit(0); // Early exit here because user requested it } // Warn the user if no dbSNP file or other variant mask was specified - if( knownSites.isEmpty() && !RUN_WITHOUT_DBSNP ) { + if (knownSites.isEmpty() && !RUN_WITHOUT_DBSNP) { throw new UserException.CommandLineException("This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation."); } // Initialize the requested covariates by parsing the -cov argument // First add the required covariates - if( requiredClasses.size() == 2) { // readGroup and reported quality score - requestedCovariates.add( new ReadGroupCovariate() ); // Order is important here - requestedCovariates.add( new QualityScoreCovariate() ); - } else { + if (requiredClasses.size() == 2) { // readGroup and reported quality score + requestedCovariates.add(new ReadGroupCovariate()); // Order is important here + requestedCovariates.add(new QualityScoreCovariate()); + } + else { throw new UserException.CommandLineException("There are more required covariates than expected. 
The instantiation list needs to be updated with the new required covariate and in the correct order."); } // Next add the standard covariates if -standard was specified by the user - if( USE_STANDARD_COVARIATES ) { + if (USE_STANDARD_COVARIATES) { // We want the standard covariates to appear in a consistent order but the packageUtils method gives a random order // A list of Classes can't be sorted, but a list of Class names can be final List standardClassNames = new ArrayList(); - for( Class covClass : standardClasses ) { - standardClassNames.add( covClass.getName() ); + for (Class covClass : standardClasses) { + standardClassNames.add(covClass.getName()); } Collections.sort(standardClassNames); // Sort the list of class names - for( String className : standardClassNames ) { - for( Class covClass : standardClasses ) { // Find the class that matches this class name - if( covClass.getName().equals( className ) ) { + for (String className : standardClassNames) { + for (Class covClass : standardClasses) { // Find the class that matches this class name + if (covClass.getName().equals(className)) { try { - final Covariate covariate = (Covariate)covClass.newInstance(); - requestedCovariates.add( covariate ); + final Covariate covariate = (Covariate) covClass.newInstance(); + requestedCovariates.add(covariate); } catch (Exception e) { throw new DynamicClassResolutionException(covClass, e); } @@ -299,17 +307,17 @@ public void initialize() { } } // Finally parse the -cov arguments that were provided, skipping over the ones already specified - if( COVARIATES != null ) { - for( String requestedCovariateString : COVARIATES ) { + if (COVARIATES != null) { + for (String requestedCovariateString : COVARIATES) { boolean foundClass = false; - for( Class covClass : covariateClasses ) { - if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class + for (Class covClass : covariateClasses) { + if (requestedCovariateString.equalsIgnoreCase(covClass.getSimpleName())) { // -cov argument matches the class name for an implementing class foundClass = true; - if( !requiredClasses.contains( covClass ) && (!USE_STANDARD_COVARIATES || !standardClasses.contains( covClass )) ) { + if (!requiredClasses.contains(covClass) && (!USE_STANDARD_COVARIATES || !standardClasses.contains(covClass))) { try { // Now that we've found a matching class, try to instantiate it - final Covariate covariate = (Covariate)covClass.newInstance(); - requestedCovariates.add( covariate ); + final Covariate covariate = (Covariate) covClass.newInstance(); + requestedCovariates.add(covariate); } catch (Exception e) { throw new DynamicClassResolutionException(covClass, e); } @@ -317,20 +325,19 @@ public void initialize() { } } - if( !foundClass ) { - throw new UserException.CommandLineException( "The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates." ); + if (!foundClass) { + throw new UserException.CommandLineException("The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. 
Use --list to see possible covariates."); } } } - logger.info( "The covariates being used here: " ); - for( Covariate cov : requestedCovariates ) { - logger.info( "\t" + cov.getClass().getSimpleName() ); - cov.initialize( RAC ); // Initialize any covariate member variables using the shared argument collection + logger.info("The covariates being used here: "); + for (Covariate cov : requestedCovariates) { + logger.info("\t" + cov.getClass().getSimpleName()); + cov.initialize(RAC); // Initialize any covariate member variables using the shared argument collection } } - //--------------------------------------------------------------------------------------------------------------- // // map @@ -339,63 +346,63 @@ public void initialize() { /** * For each read at this locus get the various covariate values and increment that location in the map based on - * whether or not the base matches the reference at this particular location + * whether or not the base matches the reference at this particular location + * * @param tracker The reference metadata tracker - * @param ref The reference context + * @param ref The reference context * @param context The alignment context * @return Returns 1, but this value isn't used in the reduce step */ - public CountedData map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { + public CountedData map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { // Only use data from non-dbsnp sites // Assume every mismatch at a non-dbsnp site is indicative of poor quality CountedData counter = new CountedData(); - if( tracker.getValues(knownSites).size() == 0 ) { // If something here is in one of the knownSites tracks then skip over it, otherwise proceed + if (tracker.getValues(knownSites).size() == 0) { // If something here is in one of the knownSites tracks then skip over it, otherwise proceed // For each read at this locus - for( final PileupElement p : context.getBasePileup() ) { - final GATKSAMRecord gatkRead = (GATKSAMRecord) p.getRead(); + for (final PileupElement p : context.getBasePileup()) { + final GATKSAMRecord gatkRead = p.getRead(); int offset = p.getOffset(); - if( gatkRead.containsTemporaryAttribute( SKIP_RECORD_ATTRIBUTE ) ) { + if (gatkRead.containsTemporaryAttribute(SKIP_RECORD_ATTRIBUTE)) { continue; } - if( !gatkRead.containsTemporaryAttribute( SEEN_ATTRIBUTE ) ) - { - gatkRead.setTemporaryAttribute( SEEN_ATTRIBUTE, true ); - RecalDataManager.parseSAMRecord( gatkRead, RAC ); + if (!gatkRead.containsTemporaryAttribute(SEEN_ATTRIBUTE)) { + gatkRead.setTemporaryAttribute(SEEN_ATTRIBUTE, true); + RecalDataManager.parseSAMRecord(gatkRead, RAC); // Skip over reads with no calls in the color space if the user requested it - if( !(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) && RecalDataManager.checkNoCallColorSpace( gatkRead ) ) { - gatkRead.setTemporaryAttribute( SKIP_RECORD_ATTRIBUTE, true); + if (!(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) && RecalDataManager.checkNoCallColorSpace(gatkRead)) { + gatkRead.setTemporaryAttribute(SKIP_RECORD_ATTRIBUTE, true); continue; } - RecalDataManager.parseColorSpace( gatkRead ); - gatkRead.setTemporaryAttribute( COVARS_ATTRIBUTE, - RecalDataManager.computeCovariates( gatkRead, requestedCovariates )); + RecalDataManager.parseColorSpace(gatkRead); + gatkRead.setTemporaryAttribute(COVARS_ATTRIBUTE, RecalDataManager.computeCovariates(gatkRead, requestedCovariates)); } - // Skip this 
position if base quality is zero - if( gatkRead.getBaseQualities()[offset] > 0 ) { + if (gatkRead.getBaseQualities()[offset] > 0) { byte[] bases = gatkRead.getReadBases(); byte refBase = ref.getBase(); // Skip if this base is an 'N' or etc. - if( BaseUtils.isRegularBase( bases[offset] ) ) { + if (BaseUtils.isRegularBase(bases[offset])) { // SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it - if( !gatkRead.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING || - !RecalDataManager.isInconsistentColorSpace( gatkRead, offset ) ) { + if (!gatkRead.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING || + !RecalDataManager.isInconsistentColorSpace(gatkRead, offset)) { // This base finally passed all the checks for a good base, so add it to the big data hashmap - updateDataFromRead( counter, gatkRead, offset, refBase ); + updateDataFromRead(counter, gatkRead, offset, refBase); - } else { // calculate SOLID reference insertion rate - if( refBase == bases[offset] ) { + } + else { // calculate SOLID reference insertion rate + if (refBase == bases[offset]) { counter.solidInsertedReferenceBases++; - } else { + } + else { counter.otherColorSpaceInconsistency++; } } @@ -403,7 +410,8 @@ public CountedData map( RefMetaDataTracker tracker, ReferenceContext ref, Alignm } } counter.countedSites++; - } else { // We skipped over the dbSNP site, and we are only processing every Nth locus + } + else { // We skipped over the dbSNP site, and we are only processing every Nth locus counter.skippedSites++; updateMismatchCounts(counter, context, ref.getBase()); // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable } @@ -411,7 +419,7 @@ public CountedData map( RefMetaDataTracker tracker, ReferenceContext ref, Alignm return counter; } - /** + /** * Update the mismatch / total_base counts for a given class of loci. * * @param counter The CountedData to be updated @@ -419,13 +427,13 @@ public CountedData map( RefMetaDataTracker tracker, ReferenceContext ref, Alignm * @param refBase The reference base */ private static void updateMismatchCounts(CountedData counter, final AlignmentContext context, final byte refBase) { - for( PileupElement p : context.getBasePileup() ) { + for (PileupElement p : context.getBasePileup()) { final byte readBase = p.getBase(); final int readBaseIndex = BaseUtils.simpleBaseToBaseIndex(readBase); - final int refBaseIndex = BaseUtils.simpleBaseToBaseIndex(refBase); + final int refBaseIndex = BaseUtils.simpleBaseToBaseIndex(refBase); - if( readBaseIndex != -1 && refBaseIndex != -1 ) { - if( readBaseIndex != refBaseIndex ) { + if (readBaseIndex != -1 && refBaseIndex != -1) { + if (readBaseIndex != refBaseIndex) { counter.novelCountsMM++; } counter.novelCountsBases++; @@ -437,13 +445,14 @@ private static void updateMismatchCounts(CountedData counter, final AlignmentCon * Major workhorse routine for this walker. 
* Loop through the list of requested covariates and pick out the value from the read, offset, and reference * Using the list of covariate values as a key, pick out the RecalDatum and increment, - * adding one to the number of observations and potentially one to the number of mismatches + * adding one to the number of observations and potentially one to the number of mismatches * Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls - * because pulling things out of the SAMRecord is an expensive operation. - * @param counter Data structure which holds the counted bases + * because pulling things out of the SAMRecord is an expensive operation. + * + * @param counter Data structure which holds the counted bases * @param gatkRead The SAMRecord holding all the data for this read - * @param offset The offset in the read for this locus - * @param refBase The reference base at this locus + * @param offset The offset in the read for this locus + * @param refBase The reference base at this locus */ private void updateDataFromRead(CountedData counter, final GATKSAMRecord gatkRead, final int offset, final byte refBase) { final Object[][] covars = (Comparable[][]) gatkRead.getTemporaryAttribute(COVARS_ATTRIBUTE); @@ -451,10 +460,10 @@ private void updateDataFromRead(CountedData counter, final GATKSAMRecord gatkRea // Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap final NestedHashMap data = dataManager.data; //optimization - create local reference - RecalDatumOptimized datum = (RecalDatumOptimized) data.get( key ); - if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it + RecalDatumOptimized datum = (RecalDatumOptimized) data.get(key); + if (datum == null) { // key doesn't exist yet in the map so make a new bucket and add it // initialized with zeros, will be incremented at end of method - datum = (RecalDatumOptimized)data.put( new RecalDatumOptimized(), true, (Object[])key ); + datum = (RecalDatumOptimized) data.put(new RecalDatumOptimized(), true, (Object[]) key); } // Need the bases to determine whether or not we have a mismatch @@ -462,13 +471,12 @@ private void updateDataFromRead(CountedData counter, final GATKSAMRecord gatkRea final long curMismatches = datum.getNumMismatches(); // Add one to the number of observations and potentially one to the number of mismatches - datum.incrementBaseCounts( base, refBase ); + datum.incrementBaseCounts(base, refBase); counter.countedBases++; counter.novelCountsBases++; counter.novelCountsMM += datum.getNumMismatches() - curMismatches; // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable } - //--------------------------------------------------------------------------------------------------------------- // // reduce @@ -477,6 +485,7 @@ private void updateDataFromRead(CountedData counter, final GATKSAMRecord gatkRea /** * Initialize the reduce step by creating a PrintStream from the filename specified as an argument to the walker. + * * @return returns A PrintStream created from the -recalFile filename argument specified to the walker */ public CountedData reduceInit() { @@ -485,11 +494,12 @@ public CountedData reduceInit() { /** * The Reduce method doesn't do anything for this walker. + * * @param mapped Result of the map. This value is immediately ignored. 
- * @param sum The summing CountedData used to output the CSV data + * @param sum The summing CountedData used to output the CSV data * @return returns The sum used to output the CSV data */ - public CountedData reduce( CountedData mapped, CountedData sum ) { + public CountedData reduce(CountedData mapped, CountedData sum) { // Do a dbSNP sanity check every so often return validatingDbsnpMismatchRate(sum.add(mapped)); } @@ -498,16 +508,15 @@ public CountedData reduce( CountedData mapped, CountedData sum ) { * Validate the dbSNP reference mismatch rates. */ private CountedData validatingDbsnpMismatchRate(CountedData counter) { - if( ++counter.lociSinceLastDbsnpCheck >= DBSNP_VALIDATION_CHECK_FREQUENCY ) { + if (++counter.lociSinceLastDbsnpCheck >= DBSNP_VALIDATION_CHECK_FREQUENCY) { counter.lociSinceLastDbsnpCheck = 0; - if( counter.novelCountsBases != 0L && counter.dbSNPCountsBases != 0L ) { - final double fractionMM_novel = (double)counter.novelCountsMM / (double)counter.novelCountsBases; - final double fractionMM_dbsnp = (double)counter.dbSNPCountsMM / (double)counter.dbSNPCountsBases; + if (counter.novelCountsBases != 0L && counter.dbSNPCountsBases != 0L) { + final double fractionMM_novel = (double) counter.novelCountsMM / (double) counter.novelCountsBases; + final double fractionMM_dbsnp = (double) counter.dbSNPCountsMM / (double) counter.dbSNPCountsBases; - if( fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel ) { - Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " + - String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel) ); + if (fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel) { + Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " + String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel)); DBSNP_VALIDATION_CHECK_FREQUENCY *= 2; // Don't annoyingly output the warning message every megabase of a large file } } @@ -516,47 +525,50 @@ private CountedData validatingDbsnpMismatchRate(CountedData counter) { return counter; } - public CountedData treeReduce( CountedData sum1, CountedData sum2 ) { + public CountedData treeReduce(CountedData sum1, CountedData sum2) { return validatingDbsnpMismatchRate(sum1.add(sum2)); } /** * Write out the full data hashmap to disk in CSV format + * * @param sum The CountedData to write out to RECAL_FILE */ - public void onTraversalDone( CountedData sum ) { - logger.info( "Writing raw recalibration data..." ); - if( sum.countedBases == 0L ) { + public void onTraversalDone(CountedData sum) { + logger.info("Writing raw recalibration data..."); + if (sum.countedBases == 0L) { throw new UserException.BadInput("Could not find any usable data in the input BAM file(s)."); } - outputToCSV( sum, RECAL_FILE ); - logger.info( "...done!" 
); + outputToCSV(sum, RECAL_FILE); + logger.info("...done!"); } /** * For each entry (key-value pair) in the data hashmap output the Covariate's values as well as the RecalDatum's data in CSV format + * * @param recalTableStream The PrintStream to write out to */ - private void outputToCSV( CountedData sum, final PrintStream recalTableStream ) { + private void outputToCSV(CountedData sum, final PrintStream recalTableStream) { recalTableStream.printf("# Counted Sites %d%n", sum.countedSites); recalTableStream.printf("# Counted Bases %d%n", sum.countedBases); recalTableStream.printf("# Skipped Sites %d%n", sum.skippedSites); - recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)sum.countedSites / sum.skippedSites); + recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double) sum.countedSites / sum.skippedSites); - if( sum.solidInsertedReferenceBases != 0 ) { + if (sum.solidInsertedReferenceBases != 0) { recalTableStream.printf("# Fraction SOLiD inserted reference 1 / %.0f bases%n", (double) sum.countedBases / sum.solidInsertedReferenceBases); recalTableStream.printf("# Fraction other color space inconsistencies 1 / %.0f bases%n", (double) sum.countedBases / sum.otherColorSpaceInconsistency); } // Output header saying which covariates were used and in what order - for( Covariate cov : requestedCovariates ) { - recalTableStream.print( cov.getClass().getSimpleName().split("Covariate")[0] + "," ); + for (Covariate cov : requestedCovariates) { + recalTableStream.print(cov.getClass().getSimpleName().split("Covariate")[0] + ","); } recalTableStream.println("nObservations,nMismatches,Qempirical"); - if( DONT_SORT_OUTPUT ) { + if (DONT_SORT_OUTPUT) { printMappings(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data); - } else { + } + else { printMappingsSorted(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data); } @@ -564,45 +576,47 @@ private void outputToCSV( CountedData sum, final PrintStream recalTableStream ) recalTableStream.println(TableRecalibrationWalker.EOF_MARKER); } - private void printMappingsSorted( final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) { + private void printMappingsSorted(final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) { final ArrayList keyList = new ArrayList(); - for( Object comp : data.keySet() ) { + for (Object comp : data.keySet()) { keyList.add((Comparable) comp); } Collections.sort(keyList); - for( Comparable comp : keyList ) { + for (Comparable comp : keyList) { key[curPos] = comp; final Object val = data.get(comp); - if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps + if (val instanceof RecalDatumOptimized) { // We are at the end of the nested hash maps // For each Covariate in the key - for( Object compToPrint : key ) { + for (Object compToPrint : key) { // Output the Covariate's value - recalTableStream.print( compToPrint + "," ); + recalTableStream.print(compToPrint + ","); } // Output the RecalDatum entry - recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() ); - } else { // Another layer in the nested hash map - printMappingsSorted( recalTableStream, curPos + 1, key, (Map) val ); + recalTableStream.println(((RecalDatumOptimized) val).outputToCSV()); + } + else { // Another layer in the nested hash map + printMappingsSorted(recalTableStream, curPos + 1, key, (Map) val); } } } - private void printMappings( final PrintStream recalTableStream, 
final int curPos, final Object[] key, final Map data) { - for( Object comp : data.keySet() ) { + private void printMappings(final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) { + for (Object comp : data.keySet()) { key[curPos] = comp; final Object val = data.get(comp); - if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps + if (val instanceof RecalDatumOptimized) { // We are at the end of the nested hash maps // For each Covariate in the key - for( Object compToPrint : key ) { + for (Object compToPrint : key) { // Output the Covariate's value - recalTableStream.print( compToPrint + "," ); + recalTableStream.print(compToPrint + ","); } // Output the RecalDatum entry - recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() ); - } else { // Another layer in the nested hash map - printMappings( recalTableStream, curPos + 1, key, (Map) val ); + recalTableStream.println(((RecalDatumOptimized) val).outputToCSV()); + } + else { // Another layer in the nested hash map + printMappings(recalTableStream, curPos + 1, key, (Map) val); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java index 46ce006ee2..9d5747023f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; -import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /* * Copyright (c) 2009 The Broad Institute @@ -32,24 +33,24 @@ * User: rpoplin * Date: Oct 30, 2009 * - * The Covariate interface. A Covariate is a feature used in the recalibration that can be picked out of the read, offset, and corresponding reference bases + * The Covariate interface. A Covariate is a feature used in the recalibration that can be picked out of the read. * In general most error checking and adjustments to the data are done before the call to the covariates getValue methods in order to speed up the code. * This unfortunately muddies the code, but most of these corrections can be done per read while the covariates get called per base, resulting in a big speed up. */ public interface Covariate { - public void initialize( RecalibrationArgumentCollection RAC ); // Initialize any member variables using the command-line arguments passed to the walkers - public Comparable getValue( String str ); // Used to get the covariate's value from input csv file in TableRecalibrationWalker - public void getValues( SAMRecord read, Comparable[] comparable ); //Takes an array of size (at least) read.getReadLength() and fills it with covariate - //values for each position in the read. This method was created as an optimization over calling getValue( read, offset ) for each offset and allows - //read-specific calculations to be done just once rather than for each offset. 
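// Illustrative sketch, not part of this patch: a toy covariate against the refactored interface,
// to make the contract concrete (the class name and behavior are hypothetical):
//
//     public class ReadLengthCovariate implements ExperimentalCovariate {
//         public void initialize(final RecalibrationArgumentCollection RAC) { /* nothing to configure */ }
//         public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
//             final int len = read.getReadLength();
//             for (int i = 0; i < len; i++)
//                 comparable[i] = len; // one value per read position, computed once per read
//         }
//         public Comparable getValue(final String str) { return Integer.parseInt(str); }
//     }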
-} + public void initialize(RecalibrationArgumentCollection RAC); // Initialize any member variables using the command-line arguments passed to the walkers -interface RequiredCovariate extends Covariate { -} + public Comparable getValue(String str); // Used to get the covariate's value from input csv file in TableRecalibrationWalker -interface StandardCovariate extends Covariate { + public void getValues(GATKSAMRecord read, Comparable[] comparable); + //Takes an array of size (at least) read.getReadLength() and fills it with covariate + //values for each position in the read. This method was created as an optimization over calling getValue( read, offset ) for each offset and allows + //read-specific calculations to be done just once rather than for each offset. } -interface ExperimentalCovariate extends Covariate { -} +interface RequiredCovariate extends Covariate {} + +interface StandardCovariate extends Covariate {} + +interface ExperimentalCovariate extends Covariate {} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java index 6b4fec04e8..b8d13ca10b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java @@ -1,9 +1,9 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.NGSPlatform; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.EnumSet; @@ -39,67 +39,69 @@ * Date: Oct 30, 2009 * * The Cycle covariate. 
- * For Solexa the cycle is simply the position in the read (counting backwards if it is a negative strand read) - * For 454 the cycle is the TACG flow cycle, that is, each flow grabs all the TACG's in order in a single cycle - * For example, for the read: AAACCCCGAAATTTTTACTG - * the cycle would be 11111111222333333344 - * For SOLiD the cycle is a more complicated mixture of ligation cycle and primer round + * For Solexa the cycle is simply the position in the read (counting backwards if it is a negative strand read) + * For 454 the cycle is the TACG flow cycle, that is, each flow grabs all the TACG's in order in a single cycle + * For example, for the read: AAACCCCGAAATTTTTACTG + * the cycle would be 11111111222333333344 + * For SOLiD the cycle is a more complicated mixture of ligation cycle and primer round */ public class CycleCovariate implements StandardCovariate { private final static EnumSet DISCRETE_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.ILLUMINA, NGSPlatform.SOLID, NGSPlatform.PACBIO, NGSPlatform.COMPLETE_GENOMICS); - private final static EnumSet FLOW_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.LS454, NGSPlatform.ION_TORRENT); + private final static EnumSet FLOW_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.LS454, NGSPlatform.ION_TORRENT); // Initialize any member variables using the command-line arguments passed to the walkers - public void initialize( final RecalibrationArgumentCollection RAC ) { - if( RAC.DEFAULT_PLATFORM != null ) { - if( RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "SLX" ) || RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "ILLUMINA" ) || - RAC.DEFAULT_PLATFORM.contains( "454" ) || RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "SOLID" ) || RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "ABI_SOLID" ) ) { + @Override + public void initialize(final RecalibrationArgumentCollection RAC) { + if (RAC.DEFAULT_PLATFORM != null) { + if (RAC.DEFAULT_PLATFORM.equalsIgnoreCase("SLX") || RAC.DEFAULT_PLATFORM.equalsIgnoreCase("ILLUMINA") || + RAC.DEFAULT_PLATFORM.contains("454") || RAC.DEFAULT_PLATFORM.equalsIgnoreCase("SOLID") || RAC.DEFAULT_PLATFORM.equalsIgnoreCase("ABI_SOLID")) { // nothing to do - } else { - throw new UserException.CommandLineException("The requested default platform (" + RAC.DEFAULT_PLATFORM +") is not a recognized platform. Implemented options are illumina, 454, and solid"); + } + else { + throw new UserException.CommandLineException("The requested default platform (" + RAC.DEFAULT_PLATFORM + ") is not a recognized platform. Implemented options are illumina, 454, and solid"); } } } // Used to pick out the covariate's value from attributes of the read - public void getValues(SAMRecord read, Comparable[] comparable) { + @Override + public void getValues(final GATKSAMRecord read, final Comparable[] comparable) { //----------------------------- // Illumina, Solid, PacBio, and Complete Genomics //----------------------------- - final NGSPlatform ngsPlatform = ((GATKSAMRecord)read).getNGSPlatform(); - if( DISCRETE_CYCLE_PLATFORMS.contains(ngsPlatform) ) { + final NGSPlatform ngsPlatform = read.getNGSPlatform(); + if (DISCRETE_CYCLE_PLATFORMS.contains(ngsPlatform)) { final int init; final int increment; - if( !read.getReadNegativeStrandFlag() ) { + if (!read.getReadNegativeStrandFlag()) { // Differentiate between first and second of pair. // The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group // to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair. 
// Therefore the cycle covariate must differentiate between first and second of pair reads. // This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because // the current sequential model would consider the effects independently instead of jointly. - if( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) { + if (read.getReadPairedFlag() && read.getSecondOfPairFlag()) { //second of pair, positive strand init = -1; increment = -1; } - else - { + else { //first of pair, positive strand init = 1; increment = 1; } - } else { - if( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) { + } + else { + if (read.getReadPairedFlag() && read.getSecondOfPairFlag()) { //second of pair, negative strand init = -read.getReadLength(); increment = 1; } - else - { + else { //first of pair, negative strand init = read.getReadLength(); increment = -1; @@ -107,7 +109,7 @@ public void getValues(SAMRecord read, Comparable[] comparable) { } int cycle = init; - for(int i = 0; i < read.getReadLength(); i++) { + for (int i = 0; i < read.getReadLength(); i++) { comparable[i] = cycle; cycle += increment; } @@ -116,7 +118,7 @@ public void getValues(SAMRecord read, Comparable[] comparable) { //----------------------------- // 454 and Ion Torrent //----------------------------- - else if( FLOW_CYCLE_PLATFORMS.contains(ngsPlatform) ) { + else if (FLOW_CYCLE_PLATFORMS.contains(ngsPlatform)) { final int readLength = read.getReadLength(); final byte[] bases = read.getReadBases(); @@ -133,38 +135,78 @@ else if( FLOW_CYCLE_PLATFORMS.contains(ngsPlatform) ) { // BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change // For example, AAAAAAA was probably read in two flow cycles but here we count it as one - if( !read.getReadNegativeStrandFlag() ) { // Forward direction + if (!read.getReadNegativeStrandFlag()) { // Forward direction int iii = 0; - while( iii < readLength ) - { - while( iii < readLength && bases[iii] == (byte)'T' ) { comparable[iii] = cycle; iii++; } - while( iii < readLength && bases[iii] == (byte)'A' ) { comparable[iii] = cycle; iii++; } - while( iii < readLength && bases[iii] == (byte)'C' ) { comparable[iii] = cycle; iii++; } - while( iii < readLength && bases[iii] == (byte)'G' ) { comparable[iii] = cycle; iii++; } - if( iii < readLength ) { if (multiplyByNegative1) cycle--; else cycle++; } - if( iii < readLength && !BaseUtils.isRegularBase(bases[iii]) ) { comparable[iii] = cycle; iii++; } + while (iii < readLength) { + while (iii < readLength && bases[iii] == (byte) 'T') { + comparable[iii] = cycle; + iii++; + } + while (iii < readLength && bases[iii] == (byte) 'A') { + comparable[iii] = cycle; + iii++; + } + while (iii < readLength && bases[iii] == (byte) 'C') { + comparable[iii] = cycle; + iii++; + } + while (iii < readLength && bases[iii] == (byte) 'G') { + comparable[iii] = cycle; + iii++; + } + if (iii < readLength) { + if (multiplyByNegative1) + cycle--; + else + cycle++; + } + if (iii < readLength && !BaseUtils.isRegularBase(bases[iii])) { + comparable[iii] = cycle; + iii++; + } } - } else { // Negative direction - int iii = readLength-1; - while( iii >= 0 ) - { - while( iii >= 0 && bases[iii] == (byte)'T' ) { comparable[iii] = cycle; iii--; } - while( iii >= 0 && bases[iii] == (byte)'A' ) { comparable[iii] = cycle; iii--; } - while( iii >= 0 && bases[iii] == (byte)'C' ) { comparable[iii] = cycle; iii--; } - while( iii >= 0 && 
bases[iii] == (byte)'G' ) { comparable[iii] = cycle; iii--; } - if( iii >= 0 ) { if (multiplyByNegative1) cycle--; else cycle++; } - if( iii >= 0 && !BaseUtils.isRegularBase(bases[iii]) ) { comparable[iii] = cycle; iii--; } + } + else { // Negative direction + int iii = readLength - 1; + while (iii >= 0) { + while (iii >= 0 && bases[iii] == (byte) 'T') { + comparable[iii] = cycle; + iii--; + } + while (iii >= 0 && bases[iii] == (byte) 'A') { + comparable[iii] = cycle; + iii--; + } + while (iii >= 0 && bases[iii] == (byte) 'C') { + comparable[iii] = cycle; + iii--; + } + while (iii >= 0 && bases[iii] == (byte) 'G') { + comparable[iii] = cycle; + iii--; + } + if (iii >= 0) { + if (multiplyByNegative1) + cycle--; + else + cycle++; + } + if (iii >= 0 && !BaseUtils.isRegularBase(bases[iii])) { + comparable[iii] = cycle; + iii--; + } } } } - else { - throw new IllegalStateException("This method hasn't been implemented yet for " + read.getReadGroup().getPlatform()); + else { + throw new UserException("The platform (" + read.getReadGroup().getPlatform() + ") associated with read group " + read.getReadGroup() + " is not a recognized platform. Implemented options are e.g. illumina, 454, and solid"); } } // Used to get the covariate's value from input csv file in TableRecalibrationWalker - public final Comparable getValue( final String str ) { - return Integer.parseInt( str ); + @Override + public final Comparable getValue(final String str) { + return Integer.parseInt(str); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java index a7717161a7..9a401d09f2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java @@ -1,7 +1,8 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.HashMap; @@ -42,63 +43,30 @@ public class DinucCovariate implements StandardCovariate { - private static final byte NO_CALL = (byte)'N'; + private static final byte NO_CALL = (byte) 'N'; private static final Dinuc NO_DINUC = new Dinuc(NO_CALL, NO_CALL); private HashMap dinucHashMap; // Initialize any member variables using the command-line arguments passed to the walkers - public void initialize( final RecalibrationArgumentCollection RAC ) { - final byte[] BASES = { (byte)'A', (byte)'C', (byte)'G', (byte)'T' }; + @Override + public void initialize(final RecalibrationArgumentCollection RAC) { + final byte[] BASES = {(byte) 'A', (byte) 'C', (byte) 'G', (byte) 'T'}; dinucHashMap = new HashMap(); - for( byte byte1 : BASES ) { - for( byte byte2: BASES ) { - dinucHashMap.put( Dinuc.hashBytes(byte1, byte2), new Dinuc(byte1, byte2) ); // This might seem silly, but Strings are too slow + for (byte byte1 : BASES) { + for (byte byte2 : BASES) { + dinucHashMap.put(Dinuc.hashBytes(byte1, byte2), new Dinuc(byte1, byte2)); // This might seem silly, but Strings are too slow } } // Add the "no dinuc" entry too - dinucHashMap.put( Dinuc.hashBytes(NO_CALL, NO_CALL), NO_DINUC ); + dinucHashMap.put(Dinuc.hashBytes(NO_CALL, NO_CALL), NO_DINUC); } - /* - // Used to pick out the covariate's value from attributes 
of the read - public final Comparable getValue( final SAMRecord read, final int offset ) { - - byte base; - byte prevBase; - final byte[] bases = read.getReadBases(); - // If this is a negative strand read then we need to reverse the direction for our previous base - if( read.getReadNegativeStrandFlag() ) { - // No dinuc at the beginning of the read - if( offset == bases.length-1 ) { - return NO_DINUC; - } - base = (byte)BaseUtils.simpleComplement( (char)(bases[offset]) ); - // Note: We are using the previous base in the read, not the previous base in the reference. This is done in part to be consistent with unmapped reads. - prevBase = (byte)BaseUtils.simpleComplement( (char)(bases[offset + 1]) ); - } else { - // No dinuc at the beginning of the read - if( offset == 0 ) { - return NO_DINUC; - } - base = bases[offset]; - // Note: We are using the previous base in the read, not the previous base in the reference. This is done in part to be consistent with unmapped reads. - prevBase = bases[offset - 1]; - } - - // Make sure the previous base is good - if( !BaseUtils.isRegularBase( prevBase ) ) { - return NO_DINUC; - } - - return dinucHashMap.get( Dinuc.hashBytes( prevBase, base ) ); - } - */ - /** * Takes an array of size (at least) read.getReadLength() and fills it with the covariate values for each position in the read. */ - public void getValues( SAMRecord read, Comparable[] result ) { + @Override + public void getValues(final GATKSAMRecord read, final Comparable[] comparable) { final HashMap dinucHashMapRef = this.dinucHashMap; //optimize access to dinucHashMap final int readLength = read.getReadLength(); final boolean negativeStrand = read.getReadNegativeStrandFlag(); @@ -108,50 +76,51 @@ public void getValues( SAMRecord read, Comparable[] result ) { int offset = 0; // If this is a negative strand read then we need to reverse the direction for our previous base - if(negativeStrand) { + if (negativeStrand) { bases = BaseUtils.simpleReverseComplement(bases); //this is NOT in-place } - result[0] = NO_DINUC; // No dinuc at the beginning of the read + comparable[0] = NO_DINUC; // No dinuc at the beginning of the read prevBase = bases[0]; offset++; - while(offset < readLength) { - // Note: We are using the previous base in the read, not the - // previous base in the reference. This is done in part to be consistent with unmapped reads. - base = bases[offset]; - if( BaseUtils.isRegularBase( prevBase ) ) { - result[offset] = dinucHashMapRef.get( Dinuc.hashBytes( prevBase, base ) ); - } else { - result[offset] = NO_DINUC; - } + while (offset < readLength) { + // Note: We are using the previous base in the read, not the + // previous base in the reference. This is done in part to be consistent with unmapped reads. 
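
To make the rewritten loop concrete, here is a minimal sketch of what it computes for a hypothetical forward-strand toy read; it reuses this class's dinucHashMap, NO_DINUC, and Dinuc.hashBytes names, while the surrounding method additionally reverse-complements negative-strand bases up front and reverses the result array at the end:

    // Toy read "ACGT": entry i pairs the previous read base with the current one.
    final byte[] toyBases = {(byte) 'A', (byte) 'C', (byte) 'G', (byte) 'T'};
    final Comparable[] toyValues = new Comparable[toyBases.length];
    toyValues[0] = NO_DINUC; // no previous base exists at offset 0
    for (int i = 1; i < toyBases.length; i++) {
        // fetches the cached Dinuc objects for (A,C), (C,G), (G,T)
        toyValues[i] = dinucHashMap.get(Dinuc.hashBytes(toyBases[i - 1], toyBases[i]));
    }

Hashing two base bytes into an int key keeps this per-position lookup allocation-free, which is the point of the "Strings are too slow" remark in initialize().
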
+ base = bases[offset]; + if (BaseUtils.isRegularBase(prevBase)) { + comparable[offset] = dinucHashMapRef.get(Dinuc.hashBytes(prevBase, base)); + } + else { + comparable[offset] = NO_DINUC; + } - offset++; - prevBase = base; + offset++; + prevBase = base; } - if(negativeStrand) { - reverse( result ); + if (negativeStrand) { + reverse(comparable); } } // Used to get the covariate's value from input csv file in TableRecalibrationWalker - public final Comparable getValue( final String str ) { + @Override + public final Comparable getValue(final String str) { byte[] bytes = str.getBytes(); - final Dinuc returnDinuc = dinucHashMap.get( Dinuc.hashBytes( bytes[0], bytes[1] ) ); - if( returnDinuc.compareTo(NO_DINUC) == 0 ) { + final Dinuc returnDinuc = dinucHashMap.get(Dinuc.hashBytes(bytes[0], bytes[1])); + if (returnDinuc.compareTo(NO_DINUC) == 0) { return null; } return returnDinuc; } - /** * Reverses the given array in place. * - * @param array + * @param array any array */ private static void reverse(final Comparable[] array) { final int arrayLength = array.length; - for(int l = 0, r = arrayLength - 1; l < r; l++, r--) { + for (int l = 0, r = arrayLength - 1; l < r; l++, r--) { final Comparable temp = array[l]; array[l] = array[r]; array[r] = temp; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/GCContentCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/GCContentCovariate.java index be4e4ebfcb..14ffd35a46 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/GCContentCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/GCContentCovariate.java @@ -1,6 +1,8 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /* * Copyright (c) 2010 The Broad Institute @@ -38,55 +40,57 @@ public class GCContentCovariate implements ExperimentalCovariate { - int numBack = 7; + private int numBack = 7; // Initialize any member variables using the command-line arguments passed to the walkers - public void initialize( final RecalibrationArgumentCollection RAC ) { + @Override + public void initialize(final RecalibrationArgumentCollection RAC) { numBack = RAC.HOMOPOLYMER_NBACK; } // Used to pick out the covariate's value from attributes of the read - public final Comparable getValue( final SAMRecord read, final int offset ) { + private Comparable getValue(final SAMRecord read, final int offset) { // ATTGCCCCGTAAAAAAAGAGAA // 0000123456654321001122 - if( read.getReadGroup().getPlatform().equalsIgnoreCase( "ILLUMINA" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "SLX" ) ) { + if (read.getReadGroup().getPlatform().equalsIgnoreCase("ILLUMINA") || read.getReadGroup().getPlatform().equalsIgnoreCase("SLX")) { int numGC = 0; - int startPos = 0; - int stopPos = 0; + int startPos; + int stopPos; final byte[] bases = read.getReadBases(); - if( !read.getReadNegativeStrandFlag() ) { // Forward direction + if (!read.getReadNegativeStrandFlag()) { // Forward direction startPos = Math.max(offset - numBack, 0); stopPos = Math.max(offset - 1, 0); - } else { // Negative direction + } + else { // Negative direction startPos = Math.min(offset + 2, bases.length); stopPos = Math.min(offset + numBack + 1, bases.length); } - for( int iii = startPos; iii < stopPos; iii++ ) { - if( bases[iii] == (byte)'G' || bases[iii] == (byte)'C' ) 
{ + for (int iii = startPos; iii < stopPos; iii++) { + if (bases[iii] == (byte) 'G' || bases[iii] == (byte) 'C') { numGC++; } } return numGC; - } else { // This effect is specific to the Illumina platform + } + else { // This effect is specific to the Illumina platform return -1; } } - - public void getValues(SAMRecord read, Comparable[] comparable) { - for(int iii = 0; iii < read.getReadLength(); iii++) { + + @Override + public void getValues(final GATKSAMRecord read, final Comparable[] comparable) { + for (int iii = 0; iii < read.getReadLength(); iii++) { comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized } } // Used to get the covariate's value from input csv file in TableRecalibrationWalker - public final Comparable getValue( final String str ) { - return Integer.parseInt( str ); + @Override + public final Comparable getValue(final String str) { + return Integer.parseInt(str); } - - - } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java index f9a75de6f9..004fb0bdb0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java @@ -1,6 +1,8 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /* * Copyright (c) 2009 The Broad Institute @@ -40,15 +42,16 @@ public class HomopolymerCovariate implements ExperimentalCovariate { - int numBack = 7; + private int numBack; // Initialize any member variables using the command-line arguments passed to the walkers - public void initialize( final RecalibrationArgumentCollection RAC ) { + @Override + public void initialize(final RecalibrationArgumentCollection RAC) { numBack = RAC.HOMOPOLYMER_NBACK; } // Used to pick out the covariate's value from attributes of the read - public final Comparable getValue( final SAMRecord read, final int offset ) { + private Comparable getValue(final SAMRecord read, final int offset) { // This block of code is for if you don't want to only count consecutive bases // ATTGCCCCGTAAAAAAAAATA @@ -75,13 +78,14 @@ public final Comparable getValue( final SAMRecord read, final int offset ) { int numAgree = 0; // The number of consecutive bases that agree with you in the previous numBack bases of the read final byte[] bases = read.getReadBases(); int iii = offset; - if( !read.getReadNegativeStrandFlag() ) { // Forward direction - while( iii <= bases.length-2 && bases[iii] == bases[iii+1] && numAgree < numBack ) { + if (!read.getReadNegativeStrandFlag()) { // Forward direction + while (iii <= bases.length - 2 && bases[iii] == bases[iii + 1] && numAgree < numBack) { numAgree++; iii++; } - } else { // Negative direction - while( iii >= 1 && bases[iii] == bases[iii-1] && numAgree < numBack ) { + } + else { // Negative direction + while (iii >= 1 && bases[iii] == bases[iii - 1] && numAgree < numBack) { numAgree++; iii--; } @@ -90,15 +94,16 @@ public final Comparable getValue( final SAMRecord read, final int offset ) { return numAgree; } - public void getValues(SAMRecord read, Comparable[] comparable) { - for(int iii = 0; iii < read.getReadLength(); iii++) { + @Override + public void getValues(final GATKSAMRecord read, 
final Comparable[] comparable) { + for (int iii = 0; iii < read.getReadLength(); iii++) { comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized } } // Used to get the covariate's value from input csv file in TableRecalibrationWalker - public final Comparable getValue( final String str ) { - return Integer.parseInt( str ); + @Override + public final Comparable getValue(final String str) { + return Integer.parseInt(str); } - } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java index f9149a528b..54fa18106e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; -import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /* * Copyright (c) 2009 The Broad Institute @@ -38,23 +39,25 @@ public class MappingQualityCovariate implements ExperimentalCovariate { // Initialize any member variables using the command-line arguments passed to the walkers - public void initialize( final RecalibrationArgumentCollection RAC ) { + @Override + public void initialize(final RecalibrationArgumentCollection RAC) { } // Used to pick out the covariate's value from attributes of the read - public final Comparable getValue( final SAMRecord read, final int offset ) { + private Comparable getValue(final GATKSAMRecord read) { return read.getMappingQuality(); } // Used to get the covariate's value from input csv file in TableRecalibrationWalker - public final Comparable getValue( final String str ) { - return Integer.parseInt( str ); + @Override + public final Comparable getValue(final String str) { + return Integer.parseInt(str); } - public void getValues(SAMRecord read, Comparable[] comparable) { - for(int iii = 0; iii < read.getReadLength(); iii++) { - comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized + @Override + public void getValues(final GATKSAMRecord read, final Comparable[] comparable) { + for (int iii = 0; iii < read.getReadLength(); iii++) { + comparable[iii] = getValue(read); // BUGBUG: this can be optimized } } - } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java index 64cae2b623..ecaa550060 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java @@ -1,6 +1,8 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /* * Copyright (c) 2009 The Broad Institute @@ -41,34 +43,37 @@ public class MinimumNQSCovariate implements ExperimentalCovariate { private int windowReach; // How far in each direction from the current base to look // Initialize any member variables using the command-line arguments passed to the walkers - public void initialize( final RecalibrationArgumentCollection RAC ) { + @Override + public void 
initialize(final RecalibrationArgumentCollection RAC) { windowReach = RAC.WINDOW_SIZE / 2; // integer division } // Used to pick out the covariate's value from attributes of the read - public final Comparable getValue( final SAMRecord read, final int offset ) { + private Comparable getValue(final SAMRecord read, final int offset) { // Loop over the list of base quality scores in the window and find the minimum final byte[] quals = read.getBaseQualities(); int minQual = quals[offset]; final int minIndex = Math.max(offset - windowReach, 0); final int maxIndex = Math.min(offset + windowReach, quals.length - 1); - for ( int iii = minIndex; iii < maxIndex; iii++ ) { - if( quals[iii] < minQual ) { + for (int iii = minIndex; iii < maxIndex; iii++) { + if (quals[iii] < minQual) { minQual = quals[iii]; } } return minQual; } - // Used to get the covariate's value from input csv file in TableRecalibrationWalker - public final Comparable getValue( final String str ) { - return Integer.parseInt( str ); - } - - public void getValues(SAMRecord read, Comparable[] comparable) { - for(int iii = 0; iii < read.getReadLength(); iii++) { + @Override + public void getValues(final GATKSAMRecord read, final Comparable[] comparable) { + for (int iii = 0; iii < read.getReadLength(); iii++) { comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized } } + + // Used to get the covariate's value from input csv file in TableRecalibrationWalker + @Override + public final Comparable getValue(final String str) { + return Integer.parseInt(str); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java index 2495df57ac..fd720697f5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java @@ -1,6 +1,8 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /* * Copyright (c) 2009 The Broad Institute @@ -39,27 +41,29 @@ public class PositionCovariate implements ExperimentalCovariate { // Initialize any member variables using the command-line arguments passed to the walkers - public void initialize( final RecalibrationArgumentCollection RAC ) { + @Override + public void initialize(final RecalibrationArgumentCollection RAC) { } // Used to pick out the covariate's value from attributes of the read - public final Comparable getValue( final SAMRecord read, final int offset ) { + private Comparable getValue(final SAMRecord read, final int offset) { int cycle = offset; - if( read.getReadNegativeStrandFlag() ) { + if (read.getReadNegativeStrandFlag()) { cycle = read.getReadLength() - (offset + 1); } return cycle; } - // Used to get the covariate's value from input csv file in TableRecalibrationWalker - public final Comparable getValue( final String str ) { - return Integer.parseInt( str ); - } - - public void getValues(SAMRecord read, Comparable[] comparable) { - for(int iii = 0; iii < read.getReadLength(); iii++) { + @Override + public void getValues(final GATKSAMRecord read, final Comparable[] comparable) { + for (int iii = 0; iii < read.getReadLength(); iii++) { comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized } } + // Used to get the 
covariate's value from input csv file in TableRecalibrationWalker + @Override + public final Comparable getValue(final String str) { + return Integer.parseInt(str); + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java index 23fdeebe3d..d6bdea5bfc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java @@ -1,6 +1,8 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /* * Copyright (c) 2009 The Broad Institute @@ -33,38 +35,42 @@ * Date: Nov 13, 2009 * * The Primer Round covariate. - * For Solexa and 454 this is the same value of the length of the read. - * For SOLiD this is different for each position according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf + * For Solexa and 454 this is the same value across the whole length of the read. + * For SOLiD this is different for each position according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf */ public class PrimerRoundCovariate implements ExperimentalCovariate { // Initialize any member variables using the command-line arguments passed to the walkers - public void initialize( final RecalibrationArgumentCollection RAC ) { + @Override + public void initialize(final RecalibrationArgumentCollection RAC) { } // Used to pick out the covariate's value from attributes of the read - public final Comparable getValue( final SAMRecord read, final int offset ) { - if( read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "ABI_SOLID" ) ) { + private Comparable getValue(final SAMRecord read, final int offset) { + if (read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") || read.getReadGroup().getPlatform().equalsIgnoreCase("ABI_SOLID")) { int pos = offset; - if( read.getReadNegativeStrandFlag() ) { + if (read.getReadNegativeStrandFlag()) { pos = read.getReadLength() - (offset + 1); } return pos % 5; // the primer round according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf - } else { + } + else { return 1; // nothing to do here because it is always the same } } - // Used to get the covariate's value from input csv file in TableRecalibrationWalker - public final Comparable getValue( final String str ) { - return Integer.parseInt( str ); - } - - public void getValues(SAMRecord read, Comparable[] comparable) { - for(int iii = 0; iii < read.getReadLength(); iii++) { + @Override + public void getValues(final GATKSAMRecord read, final Comparable[] comparable) { + for (int iii = 0; iii < read.getReadLength(); iii++) { comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized } } + + // Used to get the covariate's value from input csv file in TableRecalibrationWalker + @Override + public final Comparable getValue(final String str) { + return Integer.parseInt(str); + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java index df0101e18d..a29a0530c9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java @@ -1,6 +1,9 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; -import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.Arrays; /* * Copyright (c) 2009 The Broad Institute @@ -38,26 +41,21 @@ public class QualityScoreCovariate implements RequiredCovariate { // Initialize any member variables using the command-line arguments passed to the walkers - public void initialize( final RecalibrationArgumentCollection RAC ) { - } - - /* - // Used to pick out the covariate's value from attributes of the read - public final Comparable getValue( final SAMRecord read, final int offset ) { - return (int)(read.getBaseQualities()[offset]); + @Override + public void initialize(final RecalibrationArgumentCollection RAC) { } - */ - public void getValues(SAMRecord read, Comparable[] comparable) { + @Override + public void getValues(final GATKSAMRecord read, final Comparable[] comparable) { byte[] baseQualities = read.getBaseQualities(); - for(int i = 0; i < read.getReadLength(); i++) { + for (int i = 0; i < read.getReadLength(); i++) { comparable[i] = (int) baseQualities[i]; } } // Used to get the covariate's value from input csv file in TableRecalibrationWalker - public final Comparable getValue( final String str ) { - return Integer.parseInt( str ); + @Override + public final Comparable getValue(final String str) { + return Integer.parseInt(str); } - } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java index 0c853c349d..33adf44172 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; -import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /* * Copyright (c) 2009 The Broad Institute @@ -35,33 +36,26 @@ * The Read Group covariate. 
*/ -public class ReadGroupCovariate implements RequiredCovariate{ - - public static final String defaultReadGroup = "DefaultReadGroup"; +public class ReadGroupCovariate implements RequiredCovariate { // Initialize any member variables using the command-line arguments passed to the walkers - public void initialize( final RecalibrationArgumentCollection RAC ) { - } - - /* - // Used to pick out the covariate's value from attributes of the read - public final Comparable getValue( final SAMRecord read, final int offset ) { - return read.getReadGroup().getReadGroupId(); + @Override + public void initialize(final RecalibrationArgumentCollection RAC) { } - */ - public void getValues(SAMRecord read, Comparable[] comparable) { + @Override + public void getValues(final GATKSAMRecord read, final Comparable[] comparable) { final String readGroupId = read.getReadGroup().getReadGroupId(); - for(int i = 0; i < read.getReadLength(); i++) { + for (int i = 0; i < read.getReadLength(); i++) { comparable[i] = readGroupId; } } // Used to get the covariate's value from input csv file in TableRecalibrationWalker - public final Comparable getValue( final String str ) { + @Override + public final Comparable getValue(final String str) { return str; } - } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index a0c928afa0..1a6b8cfcb8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -25,8 +25,6 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; -import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMUtils; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.BaseUtils; @@ -34,9 +32,11 @@ import org.broadinstitute.sting.utils.collections.NestedHashMap; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; import java.util.ArrayList; import java.util.List; @@ -63,46 +63,60 @@ public class RecalDataManager { public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color - private static boolean warnUserNullReadGroup = false; private static boolean warnUserNullPlatform = false; public enum SOLID_RECAL_MODE { - /** Treat reference inserted bases as reference matching bases. Very unsafe! */ + /** + * Treat reference inserted bases as reference matching bases. Very unsafe! + */ DO_NOTHING, - /** Set reference inserted bases and the previous base (because of color space alignment details) to Q0. This is the default option. 
*/ + /** + * Set reference inserted bases and the previous base (because of color space alignment details) to Q0. This is the default option. + */ SET_Q_ZERO, - /** In addition to setting the quality scores to zero, also set the base itself to 'N'. This is useful to visualize in IGV. */ + /** + * In addition to setting the quality scores to zero, also set the base itself to 'N'. This is useful to visualize in IGV. + */ SET_Q_ZERO_BASE_N, - /** Look at the color quality scores and probabilistically decide to change the reference inserted base to be the base which is implied by the original color space instead of the reference. */ + /** + * Look at the color quality scores and probabilistically decide to change the reference inserted base to be the base which is implied by the original color space instead of the reference. + */ REMOVE_REF_BIAS } public enum SOLID_NOCALL_STRATEGY { - /** When a no call is detected throw an exception to alert the user that recalibrating this SOLiD data is unsafe. This is the default option. */ + /** + * When a no call is detected throw an exception to alert the user that recalibrating this SOLiD data is unsafe. This is the default option. + */ THROW_EXCEPTION, - /** Leave the read in the output bam completely untouched. This mode is only okay if the no calls are very rare. */ + /** + * Leave the read in the output bam completely untouched. This mode is only okay if the no calls are very rare. + */ LEAVE_READ_UNRECALIBRATED, - /** Mark these reads as failing vendor quality checks so they can be filtered out by downstream analyses. */ + /** + * Mark these reads as failing vendor quality checks so they can be filtered out by downstream analyses. + */ PURGE_READ } - RecalDataManager() { + public RecalDataManager() { data = new NestedHashMap(); dataCollapsedReadGroup = null; dataCollapsedQualityScore = null; dataCollapsedByCovariate = null; } - RecalDataManager( final boolean createCollapsedTables, final int numCovariates ) { - if( createCollapsedTables ) { // Initialize all the collapsed tables, only used by TableRecalibrationWalker + public RecalDataManager(final boolean createCollapsedTables, final int numCovariates) { + if (createCollapsedTables) { // Initialize all the collapsed tables, only used by TableRecalibrationWalker data = null; dataCollapsedReadGroup = new NestedHashMap(); dataCollapsedQualityScore = new NestedHashMap(); dataCollapsedByCovariate = new ArrayList(); - for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate - dataCollapsedByCovariate.add( new NestedHashMap() ); + for (int iii = 0; iii < numCovariates - 2; iii++) { // readGroup and QualityScore aren't counted here, their tables are separate + dataCollapsedByCovariate.add(new NestedHashMap()); } - } else { + } + else { data = new NestedHashMap(); dataCollapsedReadGroup = null; dataCollapsedQualityScore = null; @@ -112,54 +126,58 @@ public enum SOLID_NOCALL_STRATEGY { /** * Add the given mapping to all of the collapsed hash tables - * @param key The list of comparables that is the key for this mapping - * @param fullDatum The RecalDatum which is the data for this mapping + * + * @param key The list of comparables that is the key for this mapping + * @param fullDatum The RecalDatum which is the data for this mapping * @param PRESERVE_QSCORES_LESS_THAN The threshold in report quality for adding to the aggregate collapsed table */ - public final void addToAllTables( final Object[] key, final RecalDatum fullDatum, final 
int PRESERVE_QSCORES_LESS_THAN ) { + public final void addToAllTables(final Object[] key, final RecalDatum fullDatum, final int PRESERVE_QSCORES_LESS_THAN) { // The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around //data.put(key, thisDatum); // add the mapping to the main table - final int qualityScore = Integer.parseInt( key[1].toString() ); + final int qualityScore = Integer.parseInt(key[1].toString()); final Object[] readGroupCollapsedKey = new Object[1]; final Object[] qualityScoreCollapsedKey = new Object[2]; final Object[] covariateCollapsedKey = new Object[3]; RecalDatum collapsedDatum; // Create dataCollapsedReadGroup, the table where everything except read group has been collapsed - if( qualityScore >= PRESERVE_QSCORES_LESS_THAN ) { + if (qualityScore >= PRESERVE_QSCORES_LESS_THAN) { readGroupCollapsedKey[0] = key[0]; // Make a new key with just the read group - collapsedDatum = (RecalDatum) dataCollapsedReadGroup.get( readGroupCollapsedKey ); - if( collapsedDatum == null ) { - dataCollapsedReadGroup.put( new RecalDatum(fullDatum), readGroupCollapsedKey ); - } else { - collapsedDatum.combine( fullDatum ); // using combine instead of increment in order to calculate overall aggregateQReported + collapsedDatum = (RecalDatum) dataCollapsedReadGroup.get(readGroupCollapsedKey); + if (collapsedDatum == null) { + dataCollapsedReadGroup.put(new RecalDatum(fullDatum), readGroupCollapsedKey); + } + else { + collapsedDatum.combine(fullDatum); // using combine instead of increment in order to calculate overall aggregateQReported } } // Create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed qualityScoreCollapsedKey[0] = key[0]; // Make a new key with the read group ... qualityScoreCollapsedKey[1] = key[1]; // and quality score - collapsedDatum = (RecalDatum) dataCollapsedQualityScore.get( qualityScoreCollapsedKey ); - if( collapsedDatum == null ) { - dataCollapsedQualityScore.put( new RecalDatum(fullDatum), qualityScoreCollapsedKey ); - } else { - collapsedDatum.increment( fullDatum ); + collapsedDatum = (RecalDatum) dataCollapsedQualityScore.get(qualityScoreCollapsedKey); + if (collapsedDatum == null) { + dataCollapsedQualityScore.put(new RecalDatum(fullDatum), qualityScoreCollapsedKey); + } + else { + collapsedDatum.increment(fullDatum); } // Create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed - for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) { + for (int iii = 0; iii < dataCollapsedByCovariate.size(); iii++) { covariateCollapsedKey[0] = key[0]; // Make a new key with the read group ... covariateCollapsedKey[1] = key[1]; // and quality score ... 
final Object theCovariateElement = key[iii + 2]; // and the given covariate - if( theCovariateElement != null ) { + if (theCovariateElement != null) { covariateCollapsedKey[2] = theCovariateElement; - collapsedDatum = (RecalDatum) dataCollapsedByCovariate.get(iii).get( covariateCollapsedKey ); - if( collapsedDatum == null ) { - dataCollapsedByCovariate.get(iii).put( new RecalDatum(fullDatum), covariateCollapsedKey ); - } else { - collapsedDatum.increment( fullDatum ); + collapsedDatum = (RecalDatum) dataCollapsedByCovariate.get(iii).get(covariateCollapsedKey); + if (collapsedDatum == null) { + dataCollapsedByCovariate.get(iii).put(new RecalDatum(fullDatum), covariateCollapsedKey); + } + else { + collapsedDatum.increment(fullDatum); } } } @@ -167,150 +185,136 @@ public final void addToAllTables( final Object[] key, final RecalDatum fullDatum /** * Loop over all the collapsed tables and turn the recalDatums found there into an empirical quality score - * that will be used in the sequential calculation in TableRecalibrationWalker + * that will be used in the sequential calculation in TableRecalibrationWalker + * * @param smoothing The smoothing parameter that goes into empirical quality score calculation - * @param maxQual At which value to cap the quality scores + * @param maxQual At which value to cap the quality scores */ - public final void generateEmpiricalQualities( final int smoothing, final int maxQual ) { + public final void generateEmpiricalQualities(final int smoothing, final int maxQual) { recursivelyGenerateEmpiricalQualities(dataCollapsedReadGroup.data, smoothing, maxQual); recursivelyGenerateEmpiricalQualities(dataCollapsedQualityScore.data, smoothing, maxQual); - for( NestedHashMap map : dataCollapsedByCovariate ) { + for (NestedHashMap map : dataCollapsedByCovariate) { recursivelyGenerateEmpiricalQualities(map.data, smoothing, maxQual); checkForSingletons(map.data); } } - private void recursivelyGenerateEmpiricalQualities( final Map data, final int smoothing, final int maxQual ) { + private void recursivelyGenerateEmpiricalQualities(final Map data, final int smoothing, final int maxQual) { - for( Object comp : data.keySet() ) { + for (Object comp : data.keySet()) { final Object val = data.get(comp); - if( val instanceof RecalDatum ) { // We are at the end of the nested hash maps - ((RecalDatum)val).calcCombinedEmpiricalQuality(smoothing, maxQual); - } else { // Another layer in the nested hash map - recursivelyGenerateEmpiricalQualities( (Map) val, smoothing, maxQual); + if (val instanceof RecalDatum) { // We are at the end of the nested hash maps + ((RecalDatum) val).calcCombinedEmpiricalQuality(smoothing, maxQual); + } + else { // Another layer in the nested hash map + recursivelyGenerateEmpiricalQualities((Map) val, smoothing, maxQual); } } } - private void checkForSingletons( final Map data ) { + private void checkForSingletons(final Map data) { // todo -- this looks like it's better just as a data.valueSet() call? - for( Object comp : data.keySet() ) { + for (Object comp : data.keySet()) { final Object val = data.get(comp); - if( val instanceof RecalDatum ) { // We are at the end of the nested hash maps - if( data.keySet().size() == 1) { + if (val instanceof RecalDatum) { // We are at the end of the nested hash maps + if (data.keySet().size() == 1) { data.clear(); // don't TableRecalibrate a non-required covariate if it only has one element because that correction has already been done ... 
- // in a previous step of the sequential calculation model + // in a previous step of the sequential calculation model } - } else { // Another layer in the nested hash map - checkForSingletons( (Map) val ); + } + else { // Another layer in the nested hash map + checkForSingletons((Map) val); } } } /** * Get the appropriate collapsed table out of the set of all the tables held by this Object + * * @param covariate Which covariate indexes the desired collapsed HashMap * @return The desired collapsed HashMap */ - public final NestedHashMap getCollapsedTable( final int covariate ) { - if( covariate == 0) { + public final NestedHashMap getCollapsedTable(final int covariate) { + if (covariate == 0) { return dataCollapsedReadGroup; // Table where everything except read group has been collapsed - } else if( covariate == 1 ) { + } + else if (covariate == 1) { return dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed - } else { - return dataCollapsedByCovariate.get( covariate - 2 ); // Table where everything except read group, quality score, and given covariate has been collapsed + } + else { + return dataCollapsedByCovariate.get(covariate - 2); // Table where everything except read group, quality score, and given covariate has been collapsed } } /** * Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string + * * @param read The read to adjust - * @param RAC The list of shared command line arguments + * @param RAC The list of shared command line arguments */ - public static void parseSAMRecord( final SAMRecord read, final RecalibrationArgumentCollection RAC ) { - GATKSAMReadGroupRecord readGroup = ((GATKSAMRecord)read).getReadGroup(); - - // If there are no read groups we have to default to something, and that something could be specified by the user using command line arguments - if( readGroup == null ) { - if( RAC.DEFAULT_READ_GROUP != null && RAC.DEFAULT_PLATFORM != null) { - if( !warnUserNullReadGroup && RAC.FORCE_READ_GROUP == null ) { - Utils.warnUser("The input .bam file contains reads with no read group. " + - "Defaulting to read group ID = " + RAC.DEFAULT_READ_GROUP + " and platform = " + RAC.DEFAULT_PLATFORM + ". " + - "First observed at read with name = " + read.getReadName() ); - warnUserNullReadGroup = true; - } - // There is no readGroup so defaulting to these values - readGroup = new GATKSAMReadGroupRecord( RAC.DEFAULT_READ_GROUP ); - readGroup.setPlatform( RAC.DEFAULT_PLATFORM ); - ((GATKSAMRecord)read).setReadGroup( readGroup ); - } else { - throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no read group. 
First observed at read with name = " + read.getReadName() ); - } - } - - if( RAC.FORCE_READ_GROUP != null && !readGroup.getReadGroupId().equals(RAC.FORCE_READ_GROUP) ) { // Collapse all the read groups into a single common String provided by the user - final String oldPlatform = readGroup.getPlatform(); - readGroup = new GATKSAMReadGroupRecord( RAC.FORCE_READ_GROUP ); - readGroup.setPlatform( oldPlatform ); - ((GATKSAMRecord)read).setReadGroup( readGroup ); - } + public static void parseSAMRecord(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) { + GATKSAMReadGroupRecord readGroup = ((GATKSAMRecord) read).getReadGroup(); - if( RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) { - readGroup.setPlatform( RAC.FORCE_PLATFORM ); + if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) { + readGroup.setPlatform(RAC.FORCE_PLATFORM); } - if ( readGroup.getPlatform() == null ) { - if( RAC.DEFAULT_PLATFORM != null ) { - if( !warnUserNullPlatform ) { + if (readGroup.getPlatform() == null) { + if (RAC.DEFAULT_PLATFORM != null) { + if (!warnUserNullPlatform) { Utils.warnUser("The input .bam file contains reads with no platform information. " + - "Defaulting to platform = " + RAC.DEFAULT_PLATFORM + ". " + - "First observed at read with name = " + read.getReadName() ); + "Defaulting to platform = " + RAC.DEFAULT_PLATFORM + ". " + + "First observed at read with name = " + read.getReadName()); warnUserNullPlatform = true; } - readGroup.setPlatform( RAC.DEFAULT_PLATFORM ); - } else { - throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no platform information. First observed at read with name = " + read.getReadName() ); + readGroup.setPlatform(RAC.DEFAULT_PLATFORM); + } + else { + throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no platform information. First observed at read with name = " + read.getReadName()); } } } /** * Parse through the color space of the read and add a new tag to the SAMRecord that says which bases are inconsistent with the color space + * * @param read The SAMRecord to parse */ - public static void parseColorSpace( final SAMRecord read ) { + public static void parseColorSpace(final GATKSAMRecord read) { // If this is a SOLID read then we have to check if the color space is inconsistent. 
This is our only sign that SOLID has inserted the reference base - if( read.getReadGroup().getPlatform().toUpperCase().contains("SOLID") ) { - if( read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null ) { // Haven't calculated the inconsistency array yet for this read + if (ReadUtils.isSOLiDRead(read)) { + if (read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null) { // Haven't calculated the inconsistency array yet for this read final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); - if( attr != null ) { + if (attr != null) { byte[] colorSpace; - if( attr instanceof String ) { - colorSpace = ((String)attr).getBytes(); - } else { + if (attr instanceof String) { + colorSpace = ((String) attr).getBytes(); + } + else { throw new UserException.MalformedBAM(read, String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); } // Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read byte[] readBases = read.getReadBases(); - if( read.getReadNegativeStrandFlag() ) { - readBases = BaseUtils.simpleReverseComplement( read.getReadBases() ); + if (read.getReadNegativeStrandFlag()) { + readBases = BaseUtils.simpleReverseComplement(read.getReadBases()); } final byte[] inconsistency = new byte[readBases.length]; int iii; byte prevBase = colorSpace[0]; // The sentinel - for( iii = 0; iii < readBases.length; iii++ ) { - final byte thisBase = getNextBaseFromColor( read, prevBase, colorSpace[iii + 1] ); - inconsistency[iii] = (byte)( thisBase == readBases[iii] ? 0 : 1 ); + for (iii = 0; iii < readBases.length; iii++) { + final byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[iii + 1]); + inconsistency[iii] = (byte) (thisBase == readBases[iii] ? 0 : 1); prevBase = readBases[iii]; } - read.setAttribute( RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG, inconsistency ); + read.setAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG, inconsistency); - } else { + } + else { throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. 
First observed at read with name = " + read.getReadName() + - " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); + " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); } } } @@ -319,52 +323,57 @@ public static void parseColorSpace( final SAMRecord read ) { /** * Parse through the color space of the read and apply the desired --solid_recal_mode correction to the bases * This method doesn't add the inconsistent tag to the read like parseColorSpace does - * @param read The SAMRecord to parse + * + * @param read The SAMRecord to parse * @param originalQualScores The array of original quality scores to modify during the correction - * @param solidRecalMode Which mode of solid recalibration to apply - * @param refBases The reference for this read + * @param solidRecalMode Which mode of solid recalibration to apply + * @param refBases The reference for this read * @return A new array of quality scores that have been ref bias corrected */ - public static byte[] calcColorSpace( final SAMRecord read, byte[] originalQualScores, final SOLID_RECAL_MODE solidRecalMode, final byte[] refBases ) { + public static byte[] calcColorSpace(final GATKSAMRecord read, byte[] originalQualScores, final SOLID_RECAL_MODE solidRecalMode, final byte[] refBases) { final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); - if( attr != null ) { + if (attr != null) { byte[] colorSpace; - if( attr instanceof String ) { - colorSpace = ((String)attr).getBytes(); - } else { + if (attr instanceof String) { + colorSpace = ((String) attr).getBytes(); + } + else { throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); } // Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read byte[] readBases = read.getReadBases(); final byte[] colorImpliedBases = readBases.clone(); - byte[] refBasesDirRead = AlignmentUtils.alignmentToByteArray( read.getCigar(), read.getReadBases(), refBases ); //BUGBUG: This needs to change when read walkers are changed to give the aligned refBases - if( read.getReadNegativeStrandFlag() ) { - readBases = BaseUtils.simpleReverseComplement( read.getReadBases() ); - refBasesDirRead = BaseUtils.simpleReverseComplement( refBasesDirRead.clone() ); + byte[] refBasesDirRead = AlignmentUtils.alignmentToByteArray(read.getCigar(), read.getReadBases(), refBases); //BUGBUG: This needs to change when read walkers are changed to give the aligned refBases + if (read.getReadNegativeStrandFlag()) { + readBases = BaseUtils.simpleReverseComplement(read.getReadBases()); + refBasesDirRead = BaseUtils.simpleReverseComplement(refBasesDirRead.clone()); } final int[] inconsistency = new int[readBases.length]; byte prevBase = colorSpace[0]; // The sentinel - for( int iii = 0; iii < readBases.length; iii++ ) { - final byte thisBase = getNextBaseFromColor( read, prevBase, colorSpace[iii + 1] ); + for (int iii = 0; iii < readBases.length; iii++) { + final byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[iii + 1]); colorImpliedBases[iii] = thisBase; - inconsistency[iii] = ( thisBase == readBases[iii] ? 0 : 1 ); + inconsistency[iii] = (thisBase == readBases[iii] ? 
0 : 1); prevBase = readBases[iii]; } // Now that we have the inconsistency array apply the desired correction to the inconsistent bases - if( solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO ) { // Set inconsistent bases and the one before it to Q0 + if (solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO) { // Set inconsistent bases and the one before it to Q0 final boolean setBaseN = false; originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN); - } else if( solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO_BASE_N ) { + } + else if (solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO_BASE_N) { final boolean setBaseN = true; originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN); - } else if( solidRecalMode == SOLID_RECAL_MODE.REMOVE_REF_BIAS ) { // Use the color space quality to probabilistically remove ref bases at inconsistent color space bases + } + else if (solidRecalMode == SOLID_RECAL_MODE.REMOVE_REF_BIAS) { // Use the color space quality to probabilistically remove ref bases at inconsistent color space bases solidRecalRemoveRefBias(read, readBases, inconsistency, colorImpliedBases, refBasesDirRead); } - } else { + } + else { throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() + " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); } @@ -372,26 +381,28 @@ public static byte[] calcColorSpace( final SAMRecord read, byte[] originalQualSc return originalQualScores; } - public static boolean checkNoCallColorSpace( final SAMRecord read ) { - if( read.getReadGroup().getPlatform().toUpperCase().contains("SOLID") ) { + public static boolean checkNoCallColorSpace(final GATKSAMRecord read) { + if (ReadUtils.isSOLiDRead(read)) { final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); - if( attr != null ) { + if (attr != null) { byte[] colorSpace; - if( attr instanceof String ) { - colorSpace = ((String)attr).substring(1).getBytes(); // trim off the Sentinel - } else { + if (attr instanceof String) { + colorSpace = ((String) attr).substring(1).getBytes(); // trim off the Sentinel + } + else { throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); } - for( byte color : colorSpace ) { - if( color != (byte)'0' && color != (byte)'1' && color != (byte)'2' && color != (byte)'3' ) { + for (byte color : colorSpace) { + if (color != (byte) '0' && color != (byte) '1' && color != (byte) '2' && color != (byte) '3') { return true; // There is a bad color in this SOLiD read and the user wants to skip over it } } - } else { + } + else { throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() + - " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); + " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); } } @@ -400,90 +411,105 @@ public static boolean checkNoCallColorSpace( final SAMRecord read ) { /** * Perform the SET_Q_ZERO solid recalibration. 
Inconsistent color space bases and their previous base are set to quality zero - * @param read The SAMRecord to recalibrate - * @param readBases The bases in the read which have been RC'd if necessary - * @param inconsistency The array of 1/0 that says if this base is inconsistent with its color + * + * @param read The SAMRecord to recalibrate + * @param readBases The bases in the read which have been RC'd if necessary + * @param inconsistency The array of 1/0 that says if this base is inconsistent with its color * @param originalQualScores The array of original quality scores to set to zero if needed - * @param refBases The reference which has been RC'd if necessary - * @param setBaseN Should we also set the base to N as well as quality zero in order to visualize in IGV or something similar + * @param refBases The reference which has been RC'd if necessary + * @param setBaseN Should we also set the base to N as well as quality zero in order to visualize in IGV or something similar * @return The byte array of original quality scores some of which might have been set to zero */ - private static byte[] solidRecalSetToQZero( final SAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] originalQualScores, - final byte[] refBases, final boolean setBaseN ) { + private static byte[] solidRecalSetToQZero(final GATKSAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] originalQualScores, final byte[] refBases, final boolean setBaseN) { final boolean negStrand = read.getReadNegativeStrandFlag(); - for( int iii = 1; iii < originalQualScores.length; iii++ ) { - if( inconsistency[iii] == 1 ) { - if( readBases[iii] == refBases[iii] ) { - if( negStrand ) { originalQualScores[originalQualScores.length-(iii+1)] = (byte)0; } - else { originalQualScores[iii] = (byte)0; } - if( setBaseN ) { readBases[iii] = (byte)'N'; } + for (int iii = 1; iii < originalQualScores.length; iii++) { + if (inconsistency[iii] == 1) { + if (readBases[iii] == refBases[iii]) { + if (negStrand) { + originalQualScores[originalQualScores.length - (iii + 1)] = (byte) 0; + } + else { + originalQualScores[iii] = (byte) 0; + } + if (setBaseN) { + readBases[iii] = (byte) 'N'; + } } // Set the prev base to Q0 as well - if( readBases[iii-1] == refBases[iii-1] ) { - if( negStrand ) { originalQualScores[originalQualScores.length-iii] = (byte)0; } - else { originalQualScores[iii-1] = (byte)0; } - if( setBaseN ) { readBases[iii-1] = (byte)'N'; } + if (readBases[iii - 1] == refBases[iii - 1]) { + if (negStrand) { + originalQualScores[originalQualScores.length - iii] = (byte) 0; + } + else { + originalQualScores[iii - 1] = (byte) 0; + } + if (setBaseN) { + readBases[iii - 1] = (byte) 'N'; + } } } } - if( negStrand ) { - readBases = BaseUtils.simpleReverseComplement( readBases.clone() ); // Put the bases back in reverse order to stuff them back in the read + if (negStrand) { + readBases = BaseUtils.simpleReverseComplement(readBases.clone()); // Put the bases back in reverse order to stuff them back in the read } - read.setReadBases( readBases ); + read.setReadBases(readBases); return originalQualScores; } /** * Peform the REMOVE_REF_BIAS solid recalibration. 
Look at the color space qualities and probabilistically decide if the base should be change to match the color or left as reference - * @param read The SAMRecord to recalibrate - * @param readBases The bases in the read which have been RC'd if necessary - * @param inconsistency The array of 1/0 that says if this base is inconsistent with its color + * + * @param read The SAMRecord to recalibrate + * @param readBases The bases in the read which have been RC'd if necessary + * @param inconsistency The array of 1/0 that says if this base is inconsistent with its color * @param colorImpliedBases The bases implied by the color space, RC'd if necessary - * @param refBases The reference which has been RC'd if necessary + * @param refBases The reference which has been RC'd if necessary */ - private static void solidRecalRemoveRefBias( final SAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] colorImpliedBases, - final byte[] refBases) { + private static void solidRecalRemoveRefBias(final GATKSAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] colorImpliedBases, final byte[] refBases) { final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG); - if( attr != null ) { + if (attr != null) { byte[] colorSpaceQuals; - if( attr instanceof String ) { - String x = (String)attr; + if (attr instanceof String) { + String x = (String) attr; colorSpaceQuals = x.getBytes(); SAMUtils.fastqToPhred(colorSpaceQuals); - } else { + } + else { throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG, read.getReadName())); } - for( int iii = 1; iii < inconsistency.length - 1; iii++ ) { - if( inconsistency[iii] == 1 ) { - for( int jjj = iii - 1; jjj <= iii; jjj++ ) { // Correct this base and the one before it along the direction of the read - if( jjj == iii || inconsistency[jjj] == 0 ) { // Don't want to correct the previous base a second time if it was already corrected in the previous step - if( readBases[jjj] == refBases[jjj] ) { - if( colorSpaceQuals[jjj] == colorSpaceQuals[jjj+1] ) { // Equal evidence for the color implied base and the reference base, so flip a coin - final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt( 2 ); - if( rand == 0 ) { // The color implied base won the coin flip + for (int iii = 1; iii < inconsistency.length - 1; iii++) { + if (inconsistency[iii] == 1) { + for (int jjj = iii - 1; jjj <= iii; jjj++) { // Correct this base and the one before it along the direction of the read + if (jjj == iii || inconsistency[jjj] == 0) { // Don't want to correct the previous base a second time if it was already corrected in the previous step + if (readBases[jjj] == refBases[jjj]) { + if (colorSpaceQuals[jjj] == colorSpaceQuals[jjj + 1]) { // Equal evidence for the color implied base and the reference base, so flip a coin + final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt(2); + if (rand == 0) { // The color implied base won the coin flip readBases[jjj] = colorImpliedBases[jjj]; } - } else { - final int maxQuality = Math.max((int)colorSpaceQuals[jjj], (int)colorSpaceQuals[jjj+1]); - final int minQuality = Math.min((int)colorSpaceQuals[jjj], (int)colorSpaceQuals[jjj+1]); + } + else { + final int maxQuality = Math.max((int) colorSpaceQuals[jjj], (int) colorSpaceQuals[jjj + 1]); + final int minQuality = Math.min((int) colorSpaceQuals[jjj], (int) colorSpaceQuals[jjj + 1]); int diffInQuality = maxQuality - minQuality; int numLow 
= minQuality; - if( numLow == 0 ) { + if (numLow == 0) { numLow++; diffInQuality++; } - final int numHigh = Math.round( numLow * (float)Math.pow(10.0f, (float) diffInQuality / 10.0f) ); // The color with higher quality is exponentially more likely - final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt( numLow + numHigh ); - if( rand >= numLow ) { // higher q score won - if( maxQuality == (int)colorSpaceQuals[jjj] ) { + final int numHigh = Math.round(numLow * (float) Math.pow(10.0f, (float) diffInQuality / 10.0f)); // The color with higher quality is exponentially more likely + final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt(numLow + numHigh); + if (rand >= numLow) { // higher q score won + if (maxQuality == (int) colorSpaceQuals[jjj]) { readBases[jjj] = colorImpliedBases[jjj]; } // else ref color had higher q score, and won out, so nothing to do here - } else { // lower q score won - if( minQuality == (int)colorSpaceQuals[jjj] ) { + } + else { // lower q score won + if (minQuality == (int) colorSpaceQuals[jjj]) { readBases[jjj] = colorImpliedBases[jjj]; } // else ref color had lower q score, and won out, so nothing to do here } @@ -494,52 +520,56 @@ private static void solidRecalRemoveRefBias( final SAMRecord read, byte[] readBa } } - if( read.getReadNegativeStrandFlag() ) { - readBases = BaseUtils.simpleReverseComplement( readBases.clone() ); // Put the bases back in reverse order to stuff them back in the read + if (read.getReadNegativeStrandFlag()) { + readBases = BaseUtils.simpleReverseComplement(readBases.clone()); // Put the bases back in reverse order to stuff them back in the read } - read.setReadBases( readBases ); - } else { // No color space quality tag in file + read.setReadBases(readBases); + } + else { // No color space quality tag in file throw new UserException.MalformedBAM(read, "REMOVE_REF_BIAS recal mode requires color space qualities but they can't be found for read: " + read.getReadName()); } } /** * Given the base and the color calculate the next base in the sequence + * * @param prevBase The base - * @param color The color + * @param color The color * @return The next base in the sequence */ - private static byte getNextBaseFromColor( SAMRecord read, final byte prevBase, final byte color ) { - switch(color) { + private static byte getNextBaseFromColor(GATKSAMRecord read, final byte prevBase, final byte color) { + switch (color) { case '0': return prevBase; case '1': - return performColorOne( prevBase ); + return performColorOne(prevBase); case '2': - return performColorTwo( prevBase ); + return performColorTwo(prevBase); case '3': - return performColorThree( prevBase ); + return performColorThree(prevBase); default: - throw new UserException.MalformedBAM(read, "Unrecognized color space in SOLID read, color = " + (char)color + - " Unfortunately this bam file can not be recalibrated without full color space information because of potential reference bias."); + throw new UserException.MalformedBAM(read, "Unrecognized color space in SOLID read, color = " + (char) color + + " Unfortunately this bam file can not be recalibrated without full color space information because of potential reference bias."); } } /** * Check if this base is inconsistent with its color space. 
If it is then SOLID inserted the reference here and we should reduce the quality - * @param read The read which contains the color space to check against + * + * @param read The read which contains the color space to check against * @param offset The offset in the read at which to check * @return Returns true if the base was inconsistent with the color space */ - public static boolean isInconsistentColorSpace( final SAMRecord read, final int offset ) { + public static boolean isInconsistentColorSpace(final GATKSAMRecord read, final int offset) { final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG); - if( attr != null ) { - final byte[] inconsistency = (byte[])attr; + if (attr != null) { + final byte[] inconsistency = (byte[]) attr; // NOTE: The inconsistency array is in the direction of the read, not aligned to the reference! - if( read.getReadNegativeStrandFlag() ) { // Negative direction - return inconsistency[inconsistency.length - offset - 1] != (byte)0; - } else { // Forward direction - return inconsistency[offset] != (byte)0; + if (read.getReadNegativeStrandFlag()) { // Negative direction + return inconsistency[inconsistency.length - offset - 1] != (byte) 0; + } + else { // Forward direction + return inconsistency[offset] != (byte) 0; } // This block of code is for if you want to check both the offset and the next base for color space inconsistency @@ -557,7 +587,8 @@ public static boolean isInconsistentColorSpace( final SAMRecord read, final int // } //} - } else { // No inconsistency array, so nothing is inconsistent + } + else { // No inconsistency array, so nothing is inconsistent return false; } } @@ -566,36 +597,31 @@ public static boolean isInconsistentColorSpace( final SAMRecord read, final int * Computes all requested covariates for every offset in the given read * by calling covariate.getValues(..). * - * @param gatkRead The read for which to compute covariate values. + * @param gatkRead The read for which to compute covariate values. * @param requestedCovariates The list of requested covariates. * @return An array of covariate values where result[i][j] is the covariate - * value for the ith position in the read and the jth covariate in - * reqeustedCovariates list. + * value for the ith position in the read and the jth covariate in + * requestedCovariates list. */ - public static Comparable[][] computeCovariates(final GATKSAMRecord gatkRead, final List requestedCovariates) { - //compute all covariates for this read - final List requestedCovariatesRef = requestedCovariates; - final int numRequestedCovariates = requestedCovariatesRef.size(); - final int readLength = gatkRead.getReadLength(); - - final Comparable[][] covariateValues_offset_x_covar = new Comparable[readLength][numRequestedCovariates]; - final Comparable[] tempCovariateValuesHolder = new Comparable[readLength]; - - // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read - for( int i = 0; i < numRequestedCovariates; i++ ) { - requestedCovariatesRef.get(i).getValues( gatkRead, tempCovariateValuesHolder ); - for(int j = 0; j < readLength; j++) { - //copy values into a 2D array that allows all covar types to be extracted at once for - //an offset j by doing covariateValues_offset_x_covar[j]. This avoids the need to later iterate over covar types.
- covariateValues_offset_x_covar[j][i] = tempCovariateValuesHolder[j]; - } - } - - return covariateValues_offset_x_covar; - } + public static Comparable[][] computeCovariates(final GATKSAMRecord gatkRead, final List requestedCovariates) { + //compute all covariates for this read + final int numRequestedCovariates = requestedCovariates.size(); + final int readLength = gatkRead.getReadLength(); + + final Comparable[][] covariateValues_offset_x_covar = new Comparable[readLength][numRequestedCovariates]; + final Comparable[] tempCovariateValuesHolder = new Comparable[readLength]; + + for (int i = 0; i < numRequestedCovariates; i++) { // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read + requestedCovariates.get(i).getValues(gatkRead, tempCovariateValuesHolder); + for (int j = 0; j < readLength; j++) + covariateValues_offset_x_covar[j][i] = tempCovariateValuesHolder[j]; // copy values into a 2D array that allows all covar types to be extracted at once for an offset j by doing covariateValues_offset_x_covar[j]. This avoids the need to later iterate over covar types. + } + + return covariateValues_offset_x_covar; + } /** - * Perform a ceratin transversion (A <-> C or G <-> T) on the base. + * Perform a certain transversion (A <-> C or G <-> T) on the base. * * @param base the base [AaCcGgTt] * @return the transversion of the base, or the input base if it's not one of the understood ones @@ -603,14 +629,19 @@ public static Comparable[][] computeCovariates(final GATKSAMRecord gatkRead, fin private static byte performColorOne(byte base) { switch (base) { case 'A': - case 'a': return 'C'; + case 'a': + return 'C'; case 'C': - case 'c': return 'A'; + case 'c': + return 'A'; case 'G': - case 'g': return 'T'; + case 'g': + return 'T'; case 'T': - case 't': return 'G'; - default: return base; + case 't': + return 'G'; + default: + return base; } } @@ -623,14 +654,19 @@ private static byte performColorOne(byte base) { private static byte performColorTwo(byte base) { switch (base) { case 'A': - case 'a': return 'G'; + case 'a': + return 'G'; case 'C': - case 'c': return 'T'; + case 'c': + return 'T'; case 'G': - case 'g': return 'A'; + case 'g': + return 'A'; case 'T': - case 't': return 'C'; - default: return base; + case 't': + return 'C'; + default: + return base; } } @@ -643,14 +679,19 @@ private static byte performColorTwo(byte base) { private static byte performColorThree(byte base) { switch (base) { case 'A': - case 'a': return 'T'; + case 'a': + return 'T'; case 'C': - case 'c': return 'G'; + case 'c': + return 'G'; case 'G': - case 'g': return 'C'; + case 'g': + return 'C'; case 'T': - case 't': return 'A'; - default: return base; + case 't': + return 'A'; + default: + return base; } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java index 75de84cb40..9752b1deee 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java @@ -43,36 +43,20 @@ public class RecalibrationArgumentCollection { // Shared Command Line Arguments ////////////////////////////////// @Hidden - @Argument(fullName="default_read_group", shortName="dRG", required=false, doc="If a read has no read group then default to the 
provided String.") - public String DEFAULT_READ_GROUP = null; - @Hidden - @Argument(fullName="default_platform", shortName="dP", required=false, doc="If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") + @Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") public String DEFAULT_PLATFORM = null; @Hidden - @Argument(fullName="force_read_group", shortName="fRG", required=false, doc="If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group.") - public String FORCE_READ_GROUP = null; - @Hidden - @Argument(fullName="force_platform", shortName="fP", required=false, doc="If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") + @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") public String FORCE_PLATFORM = null; @Hidden - @Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false) + @Argument(fullName = "window_size_nqs", shortName = "nqs", doc = "The window size used by MinimumNQSCovariate for its calculation", required = false) public int WINDOW_SIZE = 5; - /** - * This window size tells the module in how big of a neighborhood around the current base it should look for the minimum base quality score. - */ - @Hidden - @Argument(fullName = "homopolymer_nback", shortName="nback", doc="The number of previous bases to look at in HomopolymerCovariate", required=false) - public int HOMOPOLYMER_NBACK = 7; - @Hidden - @Argument(fullName = "exception_if_no_tile", shortName="throwTileException", doc="If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1", required=false) - public boolean EXCEPTION_IF_NO_TILE = false; - /** * CountCovariates and TableRecalibration accept a --solid_recal_mode flag which governs how the recalibrator handles the * reads which have had the reference inserted because of color space inconsistencies. */ - @Argument(fullName="solid_recal_mode", shortName="sMode", required = false, doc="How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS") + @Argument(fullName = "solid_recal_mode", shortName = "sMode", required = false, doc = "How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS") public RecalDataManager.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalDataManager.SOLID_RECAL_MODE.SET_Q_ZERO; /** @@ -80,6 +64,19 @@ public class RecalibrationArgumentCollection { * no calls in the color space tag. Unfortunately because of the reference inserted bases mentioned above, reads with no calls in * their color space tag can not be recalibrated. */ - @Argument(fullName = "solid_nocall_strategy", shortName="solid_nocall_strategy", doc="Defines the behavior of the recalibrator when it encounters no calls in the color space. 
Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required=false) + @Argument(fullName = "solid_nocall_strategy", shortName = "solid_nocall_strategy", doc = "Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required = false) public RecalDataManager.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION; + + /** + * The context covariate will use a context of this size to calculate its covariate value + */ + @Argument(fullName = "context_size", shortName = "cs", doc = "size of the k-mer context to be used", required = false) + public int CONTEXT_SIZE = 8; + + /** + * This tells the HomopolymerCovariate how many previous bases to look at when computing its value. + */ + @Argument(fullName = "homopolymer_nback", shortName = "nback", doc = "The number of previous bases to look at in HomopolymerCovariate", required = false) + public int HOMOPOLYMER_NBACK = 7; + } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index 1ce02a3cf3..08151321fa 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -39,6 +39,7 @@ import org.broadinstitute.sting.utils.collections.NestedHashMap; import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.text.TextFormattingUtils; import org.broadinstitute.sting.utils.text.XReadLines; @@ -85,12 +86,12 @@ * -o my_reads.recal.bam \ * -recalFile my_reads.recal_data.csv * - * */ @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT) @WalkerName("TableRecalibration") -@Requires({ DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES }) // This walker requires -I input.bam, it also requires -R reference.fasta +@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES}) +// This walker requires -I input.bam, it also requires -R reference.fasta public class TableRecalibrationWalker extends ReadWalker { public static final String PROGRAM_RECORD_NAME = "GATK TableRecalibration"; @@ -98,7 +99,8 @@ public class TableRecalibrationWalker extends ReadWalker> classes = new PluginManager(Covariate.class).getPlugins(); @@ -205,31 +206,33 @@ public void initialize() { boolean foundAllCovariates = false; // Read in the data from the csv file and populate the data map and covariates list - logger.info( "Reading in the data from input csv file..."
); + logger.info("Reading in the data from input csv file..."); boolean sawEOF = false; try { - for ( String line : new XReadLines(RECAL_FILE) ) { + for (String line : new XReadLines(RECAL_FILE)) { lineNumber++; - if ( EOF_MARKER.equals(line) ) { + if (EOF_MARKER.equals(line)) { sawEOF = true; - } else if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches() ) { + } + else if (COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches()) { ; // Skip over the comment lines, (which start with '#') } // Read in the covariates that were used from the input file - else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data - if( foundAllCovariates ) { - throw new UserException.MalformedFile( RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE ); - } else { // Found the covariate list in input file, loop through all of them and instantiate them + else if (COVARIATE_PATTERN.matcher(line).matches()) { // The line string is either specifying a covariate or is giving csv data + if (foundAllCovariates) { + throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE); + } + else { // Found the covariate list in input file, loop through all of them and instantiate them String[] vals = line.split(","); - for( int iii = 0; iii < vals.length - 3; iii++ ) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical + for (int iii = 0; iii < vals.length - 3; iii++) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical boolean foundClass = false; - for( Class covClass : classes ) { - if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) { + for (Class covClass : classes) { + if ((vals[iii] + "Covariate").equalsIgnoreCase(covClass.getSimpleName())) { foundClass = true; try { - Covariate covariate = (Covariate)covClass.newInstance(); - requestedCovariates.add( covariate ); + Covariate covariate = (Covariate) covClass.newInstance(); + requestedCovariates.add(covariate); } catch (Exception e) { throw new DynamicClassResolutionException(covClass, e); } @@ -237,107 +240,110 @@ else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is eit } } - if( !foundClass ) { - throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." ); + if (!foundClass) { + throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option."); } } } - } else { // Found a line of data - if( !foundAllCovariates ) { + } + else { // Found a line of data + if (!foundAllCovariates) { foundAllCovariates = true; // At this point all the covariates should have been found and initialized - if( requestedCovariates.size() < 2 ) { - throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE ); + if (requestedCovariates.size() < 2) { + throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. 
Covariate names can't be found in file: " + RECAL_FILE); } final boolean createCollapsedTables = true; // Initialize any covariate member variables using the shared argument collection - for( Covariate cov : requestedCovariates ) { - cov.initialize( RAC ); + for (Covariate cov : requestedCovariates) { + cov.initialize(RAC); } // Initialize the data hashMaps - dataManager = new RecalDataManager( createCollapsedTables, requestedCovariates.size() ); + dataManager = new RecalDataManager(createCollapsedTables, requestedCovariates.size()); } addCSVData(RECAL_FILE, line); // Parse the line and add the data to the HashMap } } - } catch ( FileNotFoundException e ) { + } catch (FileNotFoundException e) { throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e); - } catch ( NumberFormatException e ) { + } catch (NumberFormatException e) { throw new UserException.MalformedFile(RECAL_FILE, "Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker."); } - logger.info( "...done!" ); + logger.info("...done!"); - if ( !sawEOF ) { + if (!sawEOF) { final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted or was generated with an old version of the CountCovariates tool."; - if ( REQUIRE_EOF ) + if (REQUIRE_EOF) throw new UserException.MalformedFile(RECAL_FILE, errorMessage); logger.warn(errorMessage); } - logger.info( "The covariates being used here: " ); - for( Covariate cov : requestedCovariates ) { - logger.info( "\t" + cov.getClass().getSimpleName() ); + logger.info("The covariates being used here: "); + for (Covariate cov : requestedCovariates) { + logger.info("\t" + cov.getClass().getSimpleName()); } - if( dataManager == null ) { + if (dataManager == null) { throw new UserException.MalformedFile(RECAL_FILE, "Can't initialize the data manager. Perhaps the recal csv file contains no data?"); } // Create the tables of empirical quality scores that will be used in the sequential calculation - logger.info( "Generating tables of empirical qualities for use in sequential calculation..." ); - dataManager.generateEmpiricalQualities( SMOOTHING, MAX_QUALITY_SCORE ); - logger.info( "...done!" 
); + logger.info("Generating tables of empirical qualities for use in sequential calculation..."); + dataManager.generateEmpiricalQualities(SMOOTHING, MAX_QUALITY_SCORE); + logger.info("...done!"); // Take the header of the input SAM file and tweak it by adding in a new programRecord with the version number and list of covariates that were used final SAMFileHeader header = getToolkit().getSAMFileHeader().clone(); - if( !NO_PG_TAG ) { + if (!NO_PG_TAG) { final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME); final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText"); try { final String version = headerInfo.getString("org.broadinstitute.sting.gatk.version"); programRecord.setProgramVersion(version); - } catch (MissingResourceException e) {} + } catch (MissingResourceException e) { + } StringBuffer sb = new StringBuffer(); sb.append(getToolkit().createApproximateCommandLineArgumentString(getToolkit(), this)); sb.append(" Covariates=["); - for( Covariate cov : requestedCovariates ) { + for (Covariate cov : requestedCovariates) { sb.append(cov.getClass().getSimpleName()); sb.append(", "); } - sb.setCharAt(sb.length()-2, ']'); - sb.setCharAt(sb.length()-1, ' '); + sb.setCharAt(sb.length() - 2, ']'); + sb.setCharAt(sb.length() - 1, ' '); programRecord.setCommandLine(sb.toString()); List oldRecords = header.getProgramRecords(); - List newRecords = new ArrayList(oldRecords.size()+1); - for ( SAMProgramRecord record : oldRecords ) { - if ( !record.getId().startsWith(PROGRAM_RECORD_NAME) ) + List newRecords = new ArrayList(oldRecords.size() + 1); + for (SAMProgramRecord record : oldRecords) { + if (!record.getId().startsWith(PROGRAM_RECORD_NAME)) newRecords.add(record); } newRecords.add(programRecord); header.setProgramRecords(newRecords); // Write out the new header - OUTPUT_BAM.writeHeader( header ); + OUTPUT_BAM.writeHeader(header); } } /** * For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches) + * * @param line A line of CSV data read from the recalibration table data file */ private void addCSVData(final File file, final String line) { final String[] vals = line.split(","); // Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly - if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical + if (vals.length != requestedCovariates.size() + 3) { // +3 because of nObservations, nMismatch, and Qempirical throw new UserException.MalformedFile(file, "Malformed input recalibration file. 
Found data line with too many fields: " + line + " --Perhaps the read group string contains a comma and isn't being parsed correctly."); } @@ -345,15 +351,15 @@ private void addCSVData(final File file, final String line) { final Object[] key = new Object[requestedCovariates.size()]; Covariate cov; int iii; - for( iii = 0; iii < requestedCovariates.size(); iii++ ) { - cov = requestedCovariates.get( iii ); - key[iii] = cov.getValue( vals[iii] ); + for (iii = 0; iii < requestedCovariates.size(); iii++) { + cov = requestedCovariates.get(iii); + key[iii] = cov.getValue(vals[iii]); } // Create a new datum using the number of observations, number of mismatches, and reported quality score - final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 ); + final RecalDatum datum = new RecalDatum(Long.parseLong(vals[iii]), Long.parseLong(vals[iii + 1]), Double.parseDouble(vals[1]), 0.0); // Add that datum to all the collapsed tables which will be used in the sequential calculation - dataManager.addToAllTables( key, datum, PRESERVE_QSCORES_LESS_THAN ); + dataManager.addToAllTables(key, datum, PRESERVE_QSCORES_LESS_THAN); } //--------------------------------------------------------------------------------------------------------------- @@ -366,64 +372,63 @@ private void addCSVData(final File file, final String line) { * For each base in the read calculate a new recalibrated quality score and replace the quality scores in the read * * @param refBases References bases over the length of the read - * @param read The read to be recalibrated + * @param read The read to be recalibrated * @return The read with quality scores replaced */ - public SAMRecord map( ReferenceContext refBases, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker ) { + public SAMRecord map(ReferenceContext refBases, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { - if( read.getReadLength() == 0 ) { // Some reads have '*' as the SEQ field and samtools returns length zero. We don't touch these reads. + if (read.getReadLength() == 0) { // Some reads have '*' as the SEQ field and samtools returns length zero. We don't touch these reads. 
return read; } - RecalDataManager.parseSAMRecord( read, RAC ); + RecalDataManager.parseSAMRecord(read, RAC); byte[] originalQuals = read.getBaseQualities(); final byte[] recalQuals = originalQuals.clone(); final String platform = read.getReadGroup().getPlatform(); - if( platform.toUpperCase().contains("SOLID") && !(RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING) ) { - if( !(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) ) { - final boolean badColor = RecalDataManager.checkNoCallColorSpace( read ); - if( badColor ) { + if (platform.toUpperCase().contains("SOLID") && !(RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING)) { + if (!(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION)) { + final boolean badColor = RecalDataManager.checkNoCallColorSpace(read); + if (badColor) { numReadsWithMalformedColorSpace++; - if( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED ) { + if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED) { return read; // can't recalibrate a SOLiD read with no calls in the color space, and the user wants to skip over them - } else if ( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ ) { + } + else if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ) { read.setReadFailsVendorQualityCheckFlag(true); return read; } } } - originalQuals = RecalDataManager.calcColorSpace( read, originalQuals, RAC.SOLID_RECAL_MODE, refBases == null ? null : refBases.getBases() ); + originalQuals = RecalDataManager.calcColorSpace(read, originalQuals, RAC.SOLID_RECAL_MODE, refBases == null ? null : refBases.getBases()); } //compute all covariate values for this read - final Comparable[][] covariateValues_offset_x_covar = - RecalDataManager.computeCovariates((GATKSAMRecord) read, requestedCovariates); + final Comparable[][] covariateValues_offset_x_covar = RecalDataManager.computeCovariates(read, requestedCovariates); // For each base in the read - for( int offset = 0; offset < read.getReadLength(); offset++ ) { + for (int offset = 0; offset < read.getReadLength(); offset++) { final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset]; Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey); - if(qualityScore == null) - { - qualityScore = performSequentialQualityCalculation( fullCovariateKey ); + if (qualityScore == null) { + qualityScore = performSequentialQualityCalculation(fullCovariateKey); qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey); } recalQuals[offset] = qualityScore; } - preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low + preserveQScores(originalQuals, recalQuals); // Overwrite the work done if original quality score is too low - read.setBaseQualities( recalQuals ); // Overwrite old qualities with new recalibrated qualities - if ( !DO_NOT_WRITE_OQ && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) { // Save the old qualities if the tag isn't already taken in the read + read.setBaseQualities(recalQuals); // Overwrite old qualities with new recalibrated qualities + if (!DO_NOT_WRITE_OQ && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null) { // Save the old qualities if the tag isn't already taken in the read read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, 
SAMUtils.phredToFastq(originalQuals)); } - if (! skipUQUpdate && refBases != null && read.getAttribute(SAMTag.UQ.name()) != null) { + if (!skipUQUpdate && refBases != null && read.getAttribute(SAMTag.UQ.name()) != null) { read.setAttribute(SAMTag.UQ.name(), SequenceUtil.sumQualitiesOfMismatches(read, refBases.getBases(), read.getAlignmentStart() - 1, false)); } @@ -440,27 +445,28 @@ public SAMRecord map( ReferenceContext refBases, GATKSAMRecord read, ReadMetaDat * * Given the full recalibration table, we perform the following preprocessing steps: * - * - calculate the global quality score shift across all data [DeltaQ] - * - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift - * -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual - * - The final shift equation is: + * - calculate the global quality score shift across all data [DeltaQ] + * - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift + * -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual + * - The final shift equation is: + * + * Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... ) * - * Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... ) * @param key The list of Comparables that were calculated from the covariates * @return A recalibrated quality score as a byte */ - private byte performSequentialQualityCalculation( final Object... key ) { + private byte performSequentialQualityCalculation(final Object... key) { - final byte qualFromRead = (byte)Integer.parseInt(key[1].toString()); + final byte qualFromRead = (byte) Integer.parseInt(key[1].toString()); final Object[] readGroupCollapsedKey = new Object[1]; final Object[] qualityScoreCollapsedKey = new Object[2]; final Object[] covariateCollapsedKey = new Object[3]; // The global quality shift (over the read group only) readGroupCollapsedKey[0] = key[0]; - final RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0).get( readGroupCollapsedKey )); + final RecalDatum globalRecalDatum = ((RecalDatum) dataManager.getCollapsedTable(0).get(readGroupCollapsedKey)); double globalDeltaQ = 0.0; - if( globalRecalDatum != null ) { + if (globalRecalDatum != null) { final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality(); final double aggregrateQReported = globalRecalDatum.getEstimatedQReported(); globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported; @@ -469,9 +475,9 @@ private byte performSequentialQualityCalculation( final Object... key ) { // The shift in quality between reported and empirical qualityScoreCollapsedKey[0] = key[0]; qualityScoreCollapsedKey[1] = key[1]; - final RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1).get( qualityScoreCollapsedKey )); + final RecalDatum qReportedRecalDatum = ((RecalDatum) dataManager.getCollapsedTable(1).get(qualityScoreCollapsedKey)); double deltaQReported = 0.0; - if( qReportedRecalDatum != null ) { + if (qReportedRecalDatum != null) { final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality(); deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ; } @@ -481,17 +487,17 @@ private byte performSequentialQualityCalculation( final Object... 
key ) { double deltaQCovariateEmpirical; covariateCollapsedKey[0] = key[0]; covariateCollapsedKey[1] = key[1]; - for( int iii = 2; iii < key.length; iii++ ) { - covariateCollapsedKey[2] = key[iii]; // The given covariate - final RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii).get( covariateCollapsedKey )); - if( covariateRecalDatum != null ) { + for (int iii = 2; iii < key.length; iii++) { + covariateCollapsedKey[2] = key[iii]; // The given covariate + final RecalDatum covariateRecalDatum = ((RecalDatum) dataManager.getCollapsedTable(iii).get(covariateCollapsedKey)); + if (covariateRecalDatum != null) { deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality(); - deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) ); + deltaQCovariates += (deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported)); } } final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; - return QualityUtils.boundQual( (int)Math.round(newQuality), (byte)MAX_QUALITY_SCORE ); + return QualityUtils.boundQual((int) Math.round(newQuality), (byte) MAX_QUALITY_SCORE); // Verbose printouts used to validate with old recalibrator //if(key.contains(null)) { @@ -508,12 +514,13 @@ private byte performSequentialQualityCalculation( final Object... key ) { /** * Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold + * * @param originalQuals The list of original base quality scores - * @param recalQuals A list of the new recalibrated quality scores + * @param recalQuals A list of the new recalibrated quality scores */ - private void preserveQScores( final byte[] originalQuals, final byte[] recalQuals ) { - for( int iii = 0; iii < recalQuals.length; iii++ ) { - if( originalQuals[iii] < PRESERVE_QSCORES_LESS_THAN ) { + private void preserveQScores(final byte[] originalQuals, final byte[] recalQuals) { + for (int iii = 0; iii < recalQuals.length; iii++) { + if (originalQuals[iii] < PRESERVE_QSCORES_LESS_THAN) { recalQuals[iii] = originalQuals[iii]; } } @@ -527,6 +534,7 @@ private void preserveQScores( final byte[] originalQuals, final byte[] recalQual /** * Start the reduce with a handle to the output bam file + * * @return A FileWriter pointing to a new bam file */ public SAMFileWriter reduceInit() { @@ -535,12 +543,13 @@ public SAMFileWriter reduceInit() { /** * Output each read to disk - * @param read The read to output + * + * @param read The read to output * @param output The FileWriter to write the read to * @return The FileWriter */ - public SAMFileWriter reduce( SAMRecord read, SAMFileWriter output ) { - if( output != null ) { + public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) { + if (output != null) { output.addAlignment(read); } return output; @@ -548,20 +557,22 @@ public SAMFileWriter reduce( SAMRecord read, SAMFileWriter output ) { /** * Do nothing + * * @param output The SAMFileWriter that outputs the bam file */ public void onTraversalDone(SAMFileWriter output) { - if( numReadsWithMalformedColorSpace != 0 ) { - if( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED ) { + if (numReadsWithMalformedColorSpace != 0) { + if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED) { Utils.warnUser("Discovered " + numReadsWithMalformedColorSpace + " SOLiD reads with no calls in the color space. 
Unfortunately these reads cannot be recalibrated with this recalibration algorithm " + - "because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " + - "for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " + - "These reads remain in the output bam file but haven't been corrected for reference bias. !!! USE AT YOUR OWN RISK !!!"); - } else if ( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ ) { + "because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " + + "for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " + + "These reads remain in the output bam file but haven't been corrected for reference bias. !!! USE AT YOUR OWN RISK !!!"); + } + else if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ) { Utils.warnUser("Discovered " + numReadsWithMalformedColorSpace + " SOLiD reads with no calls in the color space. Unfortunately these reads cannot be recalibrated with this recalibration algorithm " + - "because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " + - "for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " + - "These reads were completely removed from the output bam file."); + "because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " + + "for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " + + "These reads were completely removed from the output bam file."); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java index b27bef2650..1d7f92242a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java @@ -110,6 +110,13 @@ public class ValidationAmplicons extends RodWalker { @Argument(doc="Lower case SNPs rather than replacing with 'N'",fullName="lowerCaseSNPs",required=false) boolean lowerCaseSNPs = false; + /** + * If onlyOutputValidAmplicons is true, the output fasta file will contain only valid sequences. + * Useful for producing delivery-ready files. + */ + @Argument(doc="Only output valid sequences.",fullName="onlyOutputValidAmplicons",required=false) + boolean onlyOutputValidAmplicons = false; + /** * BWA single-end alignment is used as a primer specificity proxy. Low-complexity regions (that don't align back to themselves as a best hit) are lowercased. * This changes the size of the k-mer used for alignment. 
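(An aside for readers tracing the recalibration math in the TableRecalibrationWalker hunks above: the sequential delta arithmetic in performSequentialQualityCalculation can be exercised in isolation. The sketch below is not part of this patch; the class name is hypothetical and every empirical quality is an invented stand-in for a collapsed-table lookup.)

// Minimal standalone sketch of Qrecal = Qreported + DeltaQ + DeltaQ(qual) + sum of
// per-covariate deltas, mirroring performSequentialQualityCalculation. All empirical
// values are made-up stand-ins for lookups into the collapsed recalibration tables.
public class SequentialRecalSketch {
    public static void main(String[] args) {
        final double qReported = 30.0;           // reported quality from the read

        // Global shift for the read group: empirical minus aggregate reported quality
        final double globalEmpirical = 28.5;     // assumed table lookup
        final double aggregateReported = 30.2;   // assumed table lookup
        final double globalDeltaQ = globalEmpirical - aggregateReported;

        // Shift for this reported-quality bin, relative to the global shift
        final double qualBinEmpirical = 29.5;    // assumed table lookup
        final double deltaQReported = qualBinEmpirical - qReported - globalDeltaQ;

        // Each remaining covariate (cycle, dinuc, ...) adds its residual shift
        final double[] covariateEmpiricals = {29.0, 30.5}; // assumed table lookups
        double deltaQCovariates = 0.0;
        for (final double empirical : covariateEmpiricals)
            deltaQCovariates += empirical - qReported - (globalDeltaQ + deltaQReported);

        final double qRecal = qReported + globalDeltaQ + deltaQReported + deltaQCovariates;
        System.out.printf("recalibrated quality = %d%n", Math.round(qRecal));
    }
}

With these invented numbers the three deltas come out to -1.7, +1.2, and +0.5, so the read's reported Q30 stays at Q30; in the walker the result is additionally bounded via QualityUtils.boundQual against MAX_QUALITY_SCORE.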
@@ -127,6 +134,10 @@ public class ValidationAmplicons extends RodWalker { @Argument(doc="Use Sequenom output format instead of regular FASTA",fullName="sqnm",required=false) boolean sequenomOutput = false; + @Hidden + @Argument(doc="Use ILMN output format instead of regular FASTA",fullName="ilmn",required=false) + boolean ilmnOutput = false; + GenomeLoc prevInterval; GenomeLoc allelePos; @@ -134,6 +145,7 @@ public class ValidationAmplicons extends RodWalker { StringBuilder sequence; StringBuilder rawSequence; boolean sequenceInvalid; + boolean isSiteSNP; List invReason; int indelCounter; @@ -162,6 +174,9 @@ public void initialize() { header.setSequenceDictionary(referenceDictionary); header.setSortOrder(SAMFileHeader.SortOrder.unsorted); } + + if (ilmnOutput) + out.println("Locus_Name,Target_Type,Sequence,Chromosome,Coordinate,Genome_Build_Version,Source,Source_Version,Sequence_Orientation,Plus_Minus,Force_Infinium_I"); } public Integer reduceInit() { @@ -227,6 +242,8 @@ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentCo } rawSequence.append(Character.toUpperCase((char) ref.getBase())); } else if ( validate != null ) { + // record variant type in case it's needed in output format + isSiteSNP = (validate.isSNP()); // doesn't matter if there's a mask here too -- this is what we want to validate if ( validate.isFiltered() ) { logger.warn("You are attempting to validate a filtered site. Why are you attempting to validate a filtered site? You should not be attempting to validate a filtered site."); @@ -486,14 +503,22 @@ public void print() { valid = "Valid"; } - String seqIdentity = sequence.toString().replace('n', 'N').replace('i', 'I').replace('d', 'D'); - if (!sequenomOutput) - out.printf(">%s %s %s%n%s%n", allelePos != null ? allelePos.toString() : "multiple", valid, probeName, seqIdentity); - else { - seqIdentity = seqIdentity.replace("*",""); // identifier < 20 letters long, no * in ref allele, one line per record - probeName = probeName.replace("amplicon_","a"); - out.printf("%s_%s %s%n", allelePos != null ? allelePos.toString() : "multiple", probeName, seqIdentity); + if (!onlyOutputValidAmplicons || !sequenceInvalid) { + String seqIdentity = sequence.toString().replace('n', 'N').replace('i', 'I').replace('d', 'D'); + if (sequenomOutput) { + seqIdentity = seqIdentity.replace("*",""); // identifier < 20 letters long, no * in ref allele, one line per record + probeName = probeName.replace("amplicon_","a"); + out.printf("%s_%s %s%n", allelePos != null ? allelePos.toString() : "multiple", probeName, seqIdentity); + } + else if (ilmnOutput) { + String type = isSiteSNP?"SNP":"INDEL"; + seqIdentity = seqIdentity.replace("*",""); // no * in ref allele + out.printf("%s,%s,%s,%s,%d,37,1000G,ExomePhase1,Forward,Plus,FALSE%n",probeName,type,seqIdentity,allelePos.getContig(),allelePos.getStart()); + } + else{ + out.printf(">%s %s %s%n%s%n", allelePos != null ? 
allelePos.toString() : "multiple", valid, probeName, seqIdentity); + } } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelectorWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelectorWalker.java index ae11d8102e..cd4c571365 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelectorWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelectorWalker.java @@ -106,37 +106,70 @@ public enum SAMPLE_SELECTION_MODE { POLY_BASED_ON_GL } + /** + * The input VCF file + */ @Input(fullName="variant", shortName = "V", doc="Input VCF file, can be specified multiple times", required=true) public List> variants; + /** + * The output VCF file + */ @Output(doc="File to which variants should be written",required=true) protected VCFWriter vcfWriter = null; + /** + * Sample name(s) to subset the input VCF to, prior to selecting variants. -sn A -sn B subsets to samples A and B. + */ @Argument(fullName="sample_name", shortName="sn", doc="Include genotypes from this sample. Can be specified multiple times", required=false) public Set sampleNames = new HashSet(0); + /** + * Sample regexps to subset the input VCF to, prior to selecting variants. -se NA12* subsets to all samples with prefix NA12 + */ @Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times", required=false) public Set sampleExpressions ; + /** + * File containing a list of sample names to subset the input vcf to. Equivalent to specifying the contents of the file separately with -sn + */ @Input(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line) to include. Can be specified multiple times", required=false) public Set sampleFiles; + /** + * A mode for selecting sites based on sample-level data. See the wiki documentation for more information. + */ @Argument(fullName="sampleMode", shortName="sampleMode", doc="Sample selection mode", required=false) private SAMPLE_SELECTION_MODE sampleMode = SAMPLE_SELECTION_MODE.NONE; + /** + * A P[nonref] threshold for SAMPLE_SELECTION_MODE=POLY_BASED_ON_GL. See the wiki documentation for more information. + */ @Argument(shortName="samplePNonref",fullName="samplePNonref", doc="GL-based selection mode only: the probability" + " that a site is non-reference in the samples for which to include the site",required=false) private double samplePNonref = 0.99; + /** + * The number of sites in your validation set + */ @Argument(fullName="numValidationSites", shortName="numSites", doc="Number of output validation sites", required=true) private int numValidationSites; + /** + * Do not exclude filtered sites (e.g. not PASS or .) from consideration for validation + */ @Argument(fullName="includeFilteredSites", shortName="ifs", doc="If true, will include filtered sites in set to choose variants from", required=false) private boolean INCLUDE_FILTERED_SITES = false; + /** + * Argument for the frequency selection mode. (AC/AF/AN) are taken from VCF info field, not recalculated. Typically specified for sites-only VCFs that still have AC/AF/AN information.
+ */ @Argument(fullName="ignoreGenotypes", shortName="ignoreGenotypes", doc="If true, will ignore genotypes in VCF, will take AC,AF from annotations and will make no sample selection", required=false) private boolean IGNORE_GENOTYPES = false; + /** + * Argument for the frequency selection mode. Allows reference (non-polymorphic) sites to be included in the validation set. + */ @Argument(fullName="ignorePolymorphicStatus", shortName="ignorePolymorphicStatus", doc="If true, will ignore polymorphic status in VCF, and will take VCF record directly without pre-selection", required=false) private boolean IGNORE_POLYMORPHIC = false; @@ -145,19 +178,14 @@ public enum SAMPLE_SELECTION_MODE { private int numFrequencyBins = 20; /** - * This argument selects allele frequency selection mode: - * KEEP_AF_SPECTRUM will choose variants so that the resulting allele frequency spectrum matches as closely as possible the input set - * UNIFORM will choose variants uniformly without regard to their allele frequency. - * - */ + * This argument selects allele frequency selection mode. See the wiki for more information. + */ @Argument(fullName="frequencySelectionMode", shortName="freqMode", doc="Allele Frequency selection mode", required=false) private AF_COMPUTATION_MODE freqMode = AF_COMPUTATION_MODE.KEEP_AF_SPECTRUM; /** - * This argument selects particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria. - * When specified one or more times, a particular type of variant is selected. - * - */ + * This argument selects particular kinds of variants (e.g. SNP, INDEL) out of a list. If left unspecified, all types are considered. + */ @Argument(fullName="selectTypeToInclude", shortName="selectType", doc="Select only a certain type of variants from the input file. Valid types are INDEL, SNP, MIXED, MNP, SYMBOLIC, NO_VARIATION.
Can be specified multiple times", required=false) private List TYPES_TO_INCLUDE = new ArrayList(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index 74291e025b..d18c7e10a3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.varianteval; +import com.google.java.contract.Requires; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.util.IntervalTree; import net.sf.samtools.SAMSequenceRecord; @@ -19,11 +20,8 @@ import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator; import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.IntervalStratification; -import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.JexlExpression; import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier; import org.broadinstitute.sting.gatk.walkers.varianteval.util.*; -import org.broadinstitute.sting.gatk.walkers.variantrecalibration.Tranche; -import org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibrator; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.SampleUtils; @@ -32,7 +30,6 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; @@ -389,9 +386,9 @@ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentCo nec.apply(tracker, ref, context, comp, eval); } - // eval=null against all comps of different type + // eval=null against all comps of different type that aren't bound to another eval for ( VariantContext otherComp : compSet ) { - if ( otherComp != comp ) { + if ( otherComp != comp && ! 
compHasMatchingEval(otherComp, evalSetBySample) ) { synchronized (nec) { nec.apply(tracker, ref, context, otherComp, null); } @@ -409,6 +406,35 @@ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentCo return null; } + @Requires({"comp != null", "evals != null"}) + private boolean compHasMatchingEval(final VariantContext comp, final Collection evals) { + // check whether any of the evals matches this comp + for ( final VariantContext eval : evals ) { + if ( eval != null && doEvalAndCompMatch(comp, eval, requireStrictAlleleMatch) != EvalCompMatchType.NO_MATCH ) + return true; + } + + // nothing matched + return false; + } + + private enum EvalCompMatchType { NO_MATCH, STRICT, LENIENT } + + @Requires({"eval != null", "comp != null"}) + private EvalCompMatchType doEvalAndCompMatch(final VariantContext eval, final VariantContext comp, boolean requireStrictAlleleMatch) { + // the variant types must match for eval and comp to match at all + if ( comp.getType() != eval.getType() ) + return EvalCompMatchType.NO_MATCH; + + // find the comp which matches both the reference allele and alternate allele from eval + final Allele altEval = eval.getAlternateAlleles().size() == 0 ? null : eval.getAlternateAllele(0); + final Allele altComp = comp.getAlternateAlleles().size() == 0 ? null : comp.getAlternateAllele(0); + if ((altEval == null && altComp == null) || (altEval != null && altEval.equals(altComp) && eval.getReference().equals(comp.getReference()))) + return EvalCompMatchType.STRICT; + else + return requireStrictAlleleMatch ? EvalCompMatchType.NO_MATCH : EvalCompMatchType.LENIENT; + } + private VariantContext findMatchingComp(final VariantContext eval, final Collection comps) { // if no comps, return null if ( comps == null || comps.isEmpty() ) @@ -419,26 +445,21 @@ private VariantContext findMatchingComp(final VariantContext eval, final Collect return comps.iterator().next(); // find all of the matching comps - List matchingComps = new ArrayList(comps.size()); - for ( VariantContext comp : comps ) { - if ( comp.getType() == eval.getType() ) - matchingComps.add(comp); - } - - // if no matching comp, return null - if ( matchingComps.size() == 0 ) - return null; - - // find the comp which matches both the reference allele and alternate allele from eval - Allele altEval = eval.getAlternateAlleles().size() == 0 ? null : eval.getAlternateAllele(0); - for ( VariantContext comp : matchingComps ) { - Allele altComp = comp.getAlternateAlleles().size() == 0 ? null : comp.getAlternateAllele(0); - if ( (altEval == null && altComp == null) || (altEval != null && altEval.equals(altComp) && eval.getReference().equals(comp.getReference())) ) - return comp; + VariantContext lenientMatch = null; + for ( final VariantContext comp : comps ) { + switch ( doEvalAndCompMatch(comp, eval, requireStrictAlleleMatch) ) { + case STRICT: + return comp; + case LENIENT: + if ( lenientMatch == null ) lenientMatch = comp; + break; + case NO_MATCH: + ; + } } - // if none match, just return the first one unless we require a strict match - return (requireStrictAlleleMatch ?
null : matchingComps.get(0)); + // nothing matched, just return lenientMatch, which might be null + return lenientMatch; } public Integer treeReduce(Integer lhs, Integer rhs) { return null; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java index ea12ada484..f4369401b7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java @@ -80,6 +80,10 @@ public String toString() { return getName() + ": "; } + public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + return update2(eval,comp,tracker,ref,context,null); + } + public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, NewEvaluationContext group) { //public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariantEvalWalker.EvaluationContext group) { Reasons interesting = new Reasons(); @@ -115,7 +119,7 @@ public String update2(VariantContext eval, VariantContext comp, RefMetaDataTrack if (evalSampGenotypes != null) evalSampGt = evalSampGenotypes.get(samp); - if (compSampGt == null || evalSampGt == null) { // Since either comp or eval (or both) are missing the site, the best we can do is hope to preserve phase [if the non-missing one preserves phase] + if (compSampGt == null || evalSampGt == null || compSampGt.isNoCall() || evalSampGt.isNoCall()) { // Since either comp or eval (or both) are missing the site, the best we can do is hope to preserve phase [if the non-missing one preserves phase] // Having an unphased site breaks the phasing for the sample [does NOT permit "transitive phasing"] - hence, must reset phasing knowledge for both comp and eval [put a null CompEvalGenotypes]: if (isNonNullButUnphased(compSampGt) || isNonNullButUnphased(evalSampGt)) samplePrevGenotypes.put(samp, null); @@ -205,7 +209,7 @@ public static boolean isRelevantToPhasing(VariantContext vc) { } public boolean isNonNullButUnphased(Genotype gt) { - return (gt != null && !genotypesArePhasedAboveThreshold(gt)); + return (gt != null && !gt.isNoCall() && !genotypesArePhasedAboveThreshold(gt)); } public boolean genotypesArePhasedAboveThreshold(Genotype gt) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java index ccec9af126..6cf8b7c2c6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java @@ -15,7 +15,7 @@ * @Author chartl * @Date May 26, 2010 */ -@Analysis(name = "Indel length histograms", description = "Shows the distrbution of insertion/deletion event lengths (negative for deletion, positive for insertion)") +@Analysis(name = "Indel length histograms", description = "Shows the distribution of insertion/deletion event lengths (negative for deletion, positive for insertion)") public class IndelLengthHistogram extends VariantEvaluator { 
private static final int SIZE_LIMIT = 100; @DataPoint(description="Histogram of indel lengths") diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java new file mode 100644 index 0000000000..97aebc376a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.TableType; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.*; + +import java.util.*; + +@Analysis(description = "Evaluation summary for multi-allelic variants") +public class MultiallelicSummary extends VariantEvaluator { // implements StandardEval { + final protected static Logger logger = Logger.getLogger(MultiallelicSummary.class); + + public enum Type { + SNP, INDEL + } + + // basic counts on various rates found + @DataPoint(description = "Number of processed loci") + public long nProcessedLoci = 0; + + @DataPoint(description = "Number of SNPs") + public int nSNPs = 0; + @DataPoint(description = "Number of multi-allelic SNPs") + public int nMultiSNPs = 0; + @DataPoint(description = "% processed sites that are multi-allelic SNPs", format = "%.5f") + public double processedMultiSnpRatio = 0; + @DataPoint(description = "% SNP sites that are multi-allelic", format = "%.3f") + public double variantMultiSnpRatio = 0; + + @DataPoint(description = "Number of Indels") + public int nIndels = 0; + @DataPoint(description = "Number of multi-allelic Indels") + public int nMultiIndels = 0; + 
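// [Editor's sketch] The nMultiSNPs/nMultiIndels counters above are incremented whenever a
// record carries more than one ALT allele (the evaluator tests !eval.isBiallelic()). A
// minimal illustration of that test under the same assumption; MultiallelicSketch and
// isMultiAllelic are hypothetical names:
import java.util.Arrays;
import java.util.List;

public final class MultiallelicSketch {
    // A site is multi-allelic when it has two or more alternate alleles.
    static boolean isMultiAllelic(final List<String> altAlleles) {
        return altAlleles.size() > 1;
    }

    public static void main(final String[] args) {
        System.out.println(isMultiAllelic(Arrays.asList("T")));      // false (biallelic)
        System.out.println(isMultiAllelic(Arrays.asList("T", "G"))); // true
    }
}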
@DataPoint(description = "% processed sites that are multi-allelic Indels", format = "%.5f") + public double processedMultiIndelRatio = 0; + @DataPoint(description = "% Indel sites that are multi-allelic", format = "%.3f") + public double variantMultiIndelRatio = 0; + + @DataPoint(description = "Number of Transitions") + public int nTi = 0; + @DataPoint(description = "Number of Transversions") + public int nTv = 0; + @DataPoint(description = "Overall TiTv ratio", format = "%.2f") + public double TiTvRatio = 0; + + @DataPoint(description = "Multi-allelic SNPs partially known") + public int knownSNPsPartial = 0; + @DataPoint(description = "Multi-allelic SNPs completely known") + public int knownSNPsComplete = 0; + @DataPoint(description = "Multi-allelic SNP Novelty Rate") + public String SNPNoveltyRate = "NA"; + + //TODO -- implement me + //@DataPoint(description = "Multi-allelic Indels partially known") + public int knownIndelsPartial = 0; + //@DataPoint(description = "Multi-allelic Indels completely known") + public int knownIndelsComplete = 0; + //@DataPoint(description = "Multi-allelic Indel Novelty Rate") + public String indelNoveltyRate = "NA"; + + @DataPoint(description="Histogram of allele frequencies for most common SNP alternate allele") + AFHistogram AFhistogramMaxSnp = new AFHistogram(); + + @DataPoint(description="Histogram of allele frequencies for less common SNP alternate alleles") + AFHistogram AFhistogramMinSnp = new AFHistogram(); + + @DataPoint(description="Histogram of allele frequencies for most common Indel alternate allele") + AFHistogram AFhistogramMaxIndel = new AFHistogram(); + + @DataPoint(description="Histogram of allele frequencies for less common Indel alternate alleles") + AFHistogram AFhistogramMinIndel = new AFHistogram(); + + /* + * AF histogram table object + */ + static class AFHistogram implements TableType { + private Object[] rowKeys, colKeys = {"count"}; + private int[] AFhistogram; + + private static final double AFincrement = 0.01; + private static final int numBins = (int)(1.00 / AFincrement); + + public AFHistogram() { + rowKeys = initRowKeys(); + AFhistogram = new int[rowKeys.length]; + } + + public Object[] getColumnKeys() { + return colKeys; + } + + public Object[] getRowKeys() { + return rowKeys; + } + + public Object getCell(int row, int col) { + return AFhistogram[row]; + } + + private static Object[] initRowKeys() { + ArrayList keyList = new ArrayList(numBins + 1); + for ( double a = 0.00; a <= 1.01; a += AFincrement ) { + keyList.add(String.format("%.2f", a)); + } + return keyList.toArray(); + } + + public String getName() { return "AFHistTable"; } + + public void update(final double AF) { + final int bin = (int)(numBins * MathUtils.round(AF, 2)); + AFhistogram[bin]++; + } + } + + public void initialize(VariantEvalWalker walker) {} + + @Override public boolean enabled() { return true; } + + public int getComparisonOrder() { + return 2; + } + + public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + nProcessedLoci += context.getSkippedBases() + (ref == null ? 
0 : 1); + } + + public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( eval == null || eval.isMonomorphicInSamples() ) + return null; + + // update counts + switch ( eval.getType() ) { + case SNP: + nSNPs++; + if ( !eval.isBiallelic() ) { + nMultiSNPs++; + calculatePairwiseTiTv(eval); + calculateSNPPairwiseNovelty(eval, comp); + updateAFhistogram(eval, AFhistogramMaxSnp, AFhistogramMinSnp); + } + break; + case INDEL: + nIndels++; + if ( !eval.isBiallelic() ) { + nMultiIndels++; + calculateIndelPairwiseNovelty(eval, comp); + updateAFhistogram(eval, AFhistogramMaxIndel, AFhistogramMinIndel); + } + break; + default: + throw new UserException.BadInput("Unexpected variant context type: " + eval); + } + + return null; // we don't capture any interesting sites + } + + private void calculatePairwiseTiTv(VariantContext vc) { + for ( Allele alt : vc.getAlternateAlleles() ) { + if ( VariantContextUtils.isTransition(vc.getReference(), alt) ) + nTi++; + else + nTv++; + } + } + + private void calculateSNPPairwiseNovelty(VariantContext eval, VariantContext comp) { + if ( comp == null ) + return; + + int knownAlleles = 0; + for ( Allele alt : eval.getAlternateAlleles() ) { + if ( comp.getAlternateAlleles().contains(alt) ) + knownAlleles++; + } + + if ( knownAlleles == eval.getAlternateAlleles().size() ) + knownSNPsComplete++; + else if ( knownAlleles > 0 ) + knownSNPsPartial++; + } + + private void calculateIndelPairwiseNovelty(VariantContext eval, VariantContext comp) { + } + + private void updateAFhistogram(VariantContext vc, AFHistogram max, AFHistogram min) { + + final Object obj = vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY, null); + if ( obj == null || !(obj instanceof List) ) + return; + + List<String> list = (List<String>)obj; + ArrayList<Double> AFs = new ArrayList<Double>(list.size()); + for ( String str : list ) { + AFs.add(Double.valueOf(str)); + } + + Collections.sort(AFs); + max.update(AFs.get(AFs.size()-1)); + for ( int i = 0; i < AFs.size() - 1; i++ ) + min.update(AFs.get(i)); + } + + private final String noveltyRate(final int all, final int known) { + final int novel = all - known; + final double rate = (novel / (1.0 * all)); + return all == 0 ?
"NA" : String.format("%.2f", rate); + } + + public void finalizeEvaluation() { + processedMultiSnpRatio = (double)nMultiSNPs / (double)nProcessedLoci; + variantMultiSnpRatio = (double)nMultiSNPs / (double)nSNPs; + processedMultiIndelRatio = (double)nMultiIndels / (double)nProcessedLoci; + variantMultiIndelRatio = (double)nMultiIndels / (double)nIndels; + + TiTvRatio = (double)nTi / (double)nTv; + + SNPNoveltyRate = noveltyRate(nMultiSNPs, knownSNPsPartial + knownSNPsComplete); + indelNoveltyRate = noveltyRate(nMultiSNPs, knownIndelsPartial + knownIndelsComplete); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java index cb44ca5222..fdeb6919dd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java @@ -417,8 +417,6 @@ public ArrayList initializeStateKeys(HashMap data ) { } for( final VariantDatum datum : data ) { - final ArrayList pVarInGaussianLog10 = new ArrayList( gaussians.size() ); + final double[] pVarInGaussianLog10 = new double[gaussians.size()]; + int gaussianIndex = 0; for( final MultivariateGaussian gaussian : gaussians ) { final double pVarLog10 = gaussian.evaluateDatumLog10( datum ); - pVarInGaussianLog10.add( pVarLog10 ); + pVarInGaussianLog10[gaussianIndex++] = pVarLog10; } - final double[] pVarInGaussianNormalized = MathUtils.normalizeFromLog10( pVarInGaussianLog10 ); - int iii = 0; + final double[] pVarInGaussianNormalized = MathUtils.normalizeFromLog10( pVarInGaussianLog10, false ); + gaussianIndex = 0; for( final MultivariateGaussian gaussian : gaussians ) { - gaussian.assignPVarInGaussian( pVarInGaussianNormalized[iii++] ); //BUGBUG: to clean up + gaussian.assignPVarInGaussian( pVarInGaussianNormalized[gaussianIndex++] ); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 7cc5b16252..3cdcf4982e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -372,6 +372,8 @@ private void createVisualizationScript( final ExpandingArrayList r stream.println("library(ggplot2)"); // For compactPDF in R 2.13+ stream.println("library(tools)"); + // For graphical functions R 2.14.2+ + stream.println("library(grid)"); createArrangeFunction( stream ); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java index 6d2ac643ba..378765051a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java @@ -27,7 +27,6 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.exceptions.UserException; import java.util.List; @@ -126,7 +125,7 @@ private void variationalBayesExpectationMaximization( final GaussianMixtureModel iteration++; 
model.maximizationStep( data ); currentChangeInMixtureCoefficients = model.normalizePMixtureLog10(); - model.expectationStep(data); + model.expectationStep( data ); if( iteration % 5 == 0 ) { // cut down on the number of output lines so that users can read the warning messages logger.info("Finished iteration " + iteration + ". \tCurrent change in mixture coefficients = " + String.format("%.5f", currentChangeInMixtureCoefficients)); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 096085330d..684b9102a7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -105,7 +105,7 @@ public class CombineVariants extends RodWalker { * and each named argument will be labeled as such in the output (i.e., set=name rather than * set=variants2). The order of arguments does not matter except for the naming, so * if you provide an rod priority list and no explicit names then variants, variants2, etc - * are techincally order dependent. It is strongly recommended to provide explicit names when + * are technically order dependent. It is strongly recommended to provide explicit names when * a rod priority list is provided. */ @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true) @@ -120,6 +120,10 @@ public class CombineVariants extends RodWalker { @Argument(shortName="filteredRecordsMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields", required=false) public VariantContextUtils.FilteredRecordMergeType filteredRecordsMergeType = VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED; + @Hidden + @Argument(shortName="multipleAllelesMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different allele types (for example, SNP vs. indel)", required=false) + public VariantContextUtils.MultipleAllelesMergeType multipleAllelesMergeType = VariantContextUtils.MultipleAllelesMergeType.BY_TYPE; + /** * Used when taking the union of variants that contain genotypes. A complete priority list MUST be provided.
*/ @@ -236,13 +240,24 @@ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentCo return 0; List mergedVCs = new ArrayList(); - Map<VariantContext.Type, List<VariantContext>> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs); - // iterate over the types so that it's deterministic - for ( VariantContext.Type type : VariantContext.Type.values() ) { - if ( VCsByType.containsKey(type) ) - mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type), - priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, - SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); + + if (multipleAllelesMergeType == VariantContextUtils.MultipleAllelesMergeType.BY_TYPE) { + Map<VariantContext.Type, List<VariantContext>> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs); + // iterate over the types so that it's deterministic + for (VariantContext.Type type : VariantContext.Type.values()) { + if (VCsByType.containsKey(type)) + mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type), + priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, + SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); + } + } + else if (multipleAllelesMergeType == VariantContextUtils.MultipleAllelesMergeType.MIX_TYPES) { + mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), vcs, + priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, + SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); + } + else { + logger.warn("Ignoring all records at site " + ref.getLocus()); } for ( VariantContext mergedVC : mergedVCs ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 6d94ffe6da..204851e1fd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -25,22 +25,20 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.samples.Sample; +import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.utils.MendelianViolation; +import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.MendelianViolation; import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.utils.variantcontext.*; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import
org.broadinstitute.sting.utils.SampleUtils; import java.io.File; import java.io.FileNotFoundException; @@ -557,7 +555,7 @@ private boolean isDiscordant (VariantContext vc, Collection comp // Look for this sample in the all vcs of the comp ROD track. boolean foundVariant = false; for (VariantContext compVC : compVCs) { - if (sampleHasVariant(compVC.getGenotype(g.getSampleName()))) { + if (haveSameGenotypes(g, compVC.getGenotype(g.getSampleName()))) { foundVariant = true; break; } @@ -684,7 +682,7 @@ private VariantContext subsetRecord(VariantContext vc, Set samples) { for (String sample : sub.getSampleNames()) { Genotype g = sub.getGenotype(sample); - if (g.isNotFiltered() && g.isCalled()) { + if ( g.isNotFiltered() ) { String dp = (String) g.getAttribute("DP"); if (dp != null && ! dp.equals(VCFConstants.MISSING_DEPTH_v3) && ! dp.equals(VCFConstants.MISSING_VALUE_v4) ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java index fdfca982c9..530258fe07 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java @@ -128,13 +128,13 @@ private void validate(VariantContext vc, RefMetaDataTracker tracker, ReferenceCo // get the true reference allele Allele reportedRefAllele = vc.getReference(); - Allele observedRefAllele; + Allele observedRefAllele = null; // insertions if ( vc.isSimpleInsertion() ) { observedRefAllele = Allele.create(Allele.NULL_ALLELE_STRING); } // deletions - else if ( vc.isSimpleDeletion() || vc.isMixed() || vc.isMNP() ) { + else if ( vc.isSimpleDeletion() || vc.isMNP() ) { // we can't validate arbitrarily long deletions if ( reportedRefAllele.length() > 100 ) { logger.info(String.format("Reference allele is too long (%d) at position %s:%d; skipping that record.", reportedRefAllele.length(), vc.getChr(), vc.getStart())); @@ -143,16 +143,15 @@ else if ( vc.isSimpleDeletion() || vc.isMixed() || vc.isMNP() ) { // deletions are associated with the (position of) the last (preceding) non-deleted base; // hence to get actually deleted bases we need offset = 1 - int offset = 1 ; - if ( vc.isMNP() ) offset = 0; // if it's an MNP, the reported position IS the first modified base + int offset = vc.isMNP() ? 0 : 1; byte[] refBytes = ref.getBases(); byte[] trueRef = new byte[reportedRefAllele.length()]; for (int i = 0; i < reportedRefAllele.length(); i++) trueRef[i] = refBytes[i+offset]; observedRefAllele = Allele.create(trueRef, true); } - // SNPs, etc. - else { + // SNPs, etc. 
but not mixed types because they are too difficult + else if ( !vc.isMixed() ) { byte[] refByte = new byte[1]; refByte[0] = ref.getBase(); observedRefAllele = Allele.create(refByte, true); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToPed.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToPed.java new file mode 100644 index 0000000000..d8b01e91d8 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToPed.java @@ -0,0 +1,204 @@ +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.text.XReadLines; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.PrintStream; +import java.util.*; + +/** + * Yet another VCF to Ped converter. The world actually does need one that will + * work efficiently on large VCFs (or at least give a progress bar). This + * produces a binary ped file in SNP-major mode. + */ +public class VariantsToPed extends RodWalker { + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + + @ArgumentCollection + protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); + + @Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file. 
You may specify a .fam file (in which case it will be copied to the file you provide as fam output)") + File metaDataFile; + + @Output(shortName="bed",fullName = "bed",required=true,doc="output ped file") + PrintStream outBed; + + @Output(shortName="bim",fullName="bim",required=true,doc="output map file") + PrintStream outBim; + + @Output(shortName="fam",fullName="fam",required=true,doc="output fam file") + PrintStream outFam; + + @Argument(shortName="mgq",fullName="minGenotypeQuality",required=true,doc="If genotype quality is lower than this value, output NO_CALL") + int minGenotypeQuality = 0; + + private ValidateVariants vv = new ValidateVariants(); + + private static double APPROX_CM_PER_BP = 1000000.0/750000.0; + + private static final byte HOM_REF = 0x0; + private static final byte HOM_VAR = 0x3; + private static final byte HET = 0x2; + private static final byte NO_CALL = 0x1; + + // note that HET and NO_CALL are flipped from the documentation: that's because + // plink actually reads these in backwards, and we want to use a shift operator + // to put these in the appropriate location + + public void initialize() { + vv.variantCollection = variantCollection; + vv.dbsnp = dbsnp; + vv.DO_NOT_VALIDATE_FILTERED = true; + vv.type = ValidateVariants.ValidationType.REF; + // write magic bits into the ped file + try { + outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x1 }); + } catch (IOException e) { + throw new ReviewedStingException("error writing to output file."); + } + // write to the fam file, the first six columns of the standard ped file + // first, load data from the input meta data file + Map<String, Map<String, String>> metaValues = new HashMap<String, Map<String, String>>(); + try { + if ( metaDataFile.getAbsolutePath().endsWith(".fam") ) { + for ( String line : new XReadLines(metaDataFile) ) { + outFam.printf("%s%n",line); + } + } else { + for ( String line : new XReadLines(metaDataFile) ) { + String[] split = line.split("\\t"); + String sampleID = split[0]; + String keyVals = split[1]; + HashMap<String, String> values = new HashMap<String, String>(); + for ( String kvp : keyVals.split(";") ) { + String[] kvp_split = kvp.split("="); + values.put(kvp_split[0],kvp_split[1]); + } + metaValues.put(sampleID,values); + } + } + } catch (FileNotFoundException e) { + throw new UserException("Meta data file not found: "+metaDataFile.getAbsolutePath(),e); + } + // family ID, individual ID, Paternal ID, Maternal ID, Sex, Phenotype + int dummyID = 0; // increments for dummy parental and family IDs used + // want to be especially careful to maintain order here + Map<String, VCFHeader> headers = VCFUtils.getVCFHeadersFromRods(getToolkit()); + for ( Map.Entry<String, VCFHeader> header : headers.entrySet() ) { + if ( ! header.getKey().equals(variantCollection.variants.getName()) && ! metaDataFile.getAbsolutePath().endsWith(".fam") ) { + continue; + } + for ( String sample : header.getValue().getGenotypeSamples() ) { + Map<String, String> mVals = metaValues.get(sample); + if ( mVals == null ) { + throw new UserException("No metadata provided for sample "+sample); + } + if ( ! mVals.containsKey("phenotype") ) { + throw new UserException("No phenotype data provided for sample "+sample); + } + String fid = mVals.containsKey("fid") ? mVals.get("fid") : String.format("dummy_%d",++dummyID); + String pid = mVals.containsKey("dad") ? mVals.get("dad") : String.format("dummy_%d",++dummyID); + String mid = mVals.containsKey("mom") ? mVals.get("mom") : String.format("dummy_%d",++dummyID); + String sex = mVals.containsKey("sex") ?
mVals.get("sex") : "3"; + String pheno = mVals.get("phenotype"); + outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,sample,pid,mid,sex,pheno); + } + } + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null || ! tracker.hasValues(variantCollection.variants) || + tracker.getFirstValue(variantCollection.variants).isFiltered() || + ! tracker.getFirstValue(variantCollection.variants).isSNP() || + ! tracker.getFirstValue(variantCollection.variants).isBiallelic()) { + return 0; + } + try { + vv.map(tracker,ref,context); + } catch (UserException e) { + throw new UserException("Input VCF file is invalid; we cannot guarantee the resulting ped file. "+ + "Please run ValidateVariants for more detailed information."); + } + + VariantContext vc = tracker.getFirstValue(variantCollection.variants); + // write an entry into the map file + outBim.printf("%s\t%s\t%.2f\t%d\t%s\t%s%n",vc.getChr(),getID(vc),APPROX_CM_PER_BP*vc.getStart(),vc.getStart(), + vc.getReference().getBaseString(),vc.getAlternateAllele(0).getBaseString()); + // write an entry into the bed file + int buf = 0; + int idx = 0; + byte out = 0x0; + byte[] toWrite = new byte[1+(vc.getNSamples()/4)]; + for (Genotype g : vc.getGenotypes() ) { + out |= getEncoding(g,buf); + if ( buf == 3 ) { + toWrite[idx] = out; + buf = 0; + out = 0x0; + idx++; + } else { + buf++; + } + } + if ( out != 0x0 ) { + toWrite[idx]=out; + } + try { + outBed.write(toWrite); + } catch (IOException e) { + throw new ReviewedStingException("Error writing to output file"); + } + + return 1; + } + + public Integer reduce(Integer m, Integer r) { + return r + m; + } + + public Integer reduceInit() { + return 0; + } + + private byte getEncoding(Genotype g, int offset) { + byte b; + if ( g.hasAttribute(VCFConstants.GENOTYPE_QUALITY_KEY) && ((Integer) g.getAttribute(VCFConstants.GENOTYPE_QUALITY_KEY)) < minGenotypeQuality ) { + b = NO_CALL; + } else if ( g.isHomRef() ) { + b = HOM_REF; + } else if ( g.isHomVar() ) { + b = HOM_VAR; + } else if ( g.isHet() ) { + b = HET; + } else { + b = NO_CALL; + } + + return (byte) (b << (2*offset)); + } + + private static String getID(VariantContext v) { + if ( v.hasID() ) { + return v.getID(); + } else { + return String.format("SNP-%s-%d",v.getChr(),v.getStart()); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index 4b3aa4864b..4c8e8df5c3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -26,7 +26,6 @@ import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; @@ -49,7 +48,13 @@ * fields to print with the -F NAME, each of which appears as a single column in * the output file, with a header named NAME, and the value of this field in the VCF * one per line. NAME can be any standard VCF column (CHROM, ID, QUAL) or any binding - * in the INFO field (AC=10). Note that this tool does not support capturing any + * in the INFO field (AC=10). 
In addition, there are specially supported values like + * EVENTLENGTH (length of the event), TRANSITION (for SNPs), HET (count of het genotypes), + * HOM-REF (count of homozygous reference genotypes), HOM-VAR (count of homozygous variant + * genotypes), NO-CALL (count of no-call genotypes), TYPE (the type of event), VAR (count of + * non-reference genotypes), NSAMPLES (number of samples), NCALLED (number of called samples), + * GQ (from the genotype field; works only for a file with a single sample), and MULTI-ALLELIC + * (is the record from a multi-allelic site). Note that this tool does not support capturing any * GENOTYPE field values. If a VCF record is missing a value, then the tool by * default throws an error, but the special value NA can be emitted instead with * appropriate tool arguments. @@ -121,18 +126,13 @@ public class VariantsToTable extends RodWalker { int nRecords = 0; /** - * By default, only biallelic (REF=A, ALT=B) sites are including in the output. If this flag is provided, then - * VariantsToTable will emit field values for records with multiple ALT alleles. Note that in general this - * can make your resulting file unreadable and malformated according to tools like R, as the representation of - * multi-allelic INFO field values can be lists of values. + * By default, records with multiple ALT alleles will comprise just one line of output; note that in general this can make your resulting file + * unreadable/malformed for certain tools like R, as the representation of multi-allelic INFO field values is often a comma-separated list + * of values. Using the flag will cause multi-allelic records to be split into multiple lines of output (one for each allele in the ALT field); + * INFO field values that are not lists are copied for each of the output records while only the appropriate entry is used for lists. */ - @Advanced - @Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false) - public boolean keepMultiAllelic = false; - - @Hidden - @Argument(fullName="logACSum", shortName="logACSum", doc="Log sum of AC instead of max value in case of multiallelic variants", required=false) - public boolean logACSum = false; + @Argument(fullName="splitMultiAllelic", shortName="SMA", doc="If provided, we will split multi-allelic records into multiple lines of output", required=false) + public boolean splitMultiAllelic = false; /** * By default, this tool throws a UserException when it encounters a field without a value in some record.
This @@ -144,6 +144,7 @@ public class VariantsToTable extends RodWalker { @Advanced @Argument(fullName="allowMissingData", shortName="AMD", doc="If provided, we will not require every record to contain every field", required=false) public boolean ALLOW_MISSING_DATA = false; + private final static String MISSING_DATA = "NA"; public void initialize() { // print out the header @@ -155,9 +156,9 @@ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentCo return 0; for ( VariantContext vc : tracker.getValues(variantCollection.variants, context.getLocation())) { - if ( (keepMultiAllelic || vc.isBiallelic()) && ( showFiltered || vc.isNotFiltered() ) ) { - List<String> vals = extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA, keepMultiAllelic, logACSum); - out.println(Utils.join("\t", vals)); + if ( showFiltered || vc.isNotFiltered() ) { + for ( final List<String> record : extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA, splitMultiAllelic) ) + out.println(Utils.join("\t", record)); } } @@ -180,22 +181,25 @@ private static final boolean isWildCard(String s) { * * @param vc the VariantContext whose field values we want to capture * @param fields a non-null list of fields to capture from VC - * @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise - * provides a value of NA - * @param kma if true, multiallelic variants are to be kept - * @param logsum if true, AF and AC are computed based on sum of allele counts. Otherwise, based on allele with highest count. - * @return + * @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise provides a value of NA + * @param splitMultiAllelic if true, multiallelic variants are to be split into multiple records + * @return List of lists of field values */ - private static List<String> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData, boolean kma, boolean logsum) { - List<String> vals = new ArrayList<String>(); + private static List<List<String>> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData, boolean splitMultiAllelic) { + + final int numRecordsToProduce = splitMultiAllelic ? vc.getAlternateAlleles().size() : 1; + final List<List<String>> records = new ArrayList<List<String>>(numRecordsToProduce); + for ( int i = 0; i < numRecordsToProduce; i++ ) + records.add(new ArrayList<String>(fields.size())); for ( String field : fields ) { - String val = "NA"; - if ( getters.containsKey(field) ) { - val = getters.get(field).get(vc); + if ( splitMultiAllelic && field.equals("ALT") ) { // we need to special case the ALT field when splitting out multi-allelic records + addFieldValue(splitAltAlleles(vc), records); + } else if ( getters.containsKey(field) ) { + addFieldValue(getters.get(field).get(vc), records); } else if ( vc.hasAttribute(field) ) { - val = vc.getAttributeAsString(field, null); + addFieldValue(vc.getAttribute(field, null), records); } else if ( isWildCard(field) ) { Set<String> wildVals = new HashSet<String>(); for ( Map.Entry elt : vc.getAttributes().entrySet()) { @@ -204,51 +208,47 @@ private static List extractFields(VariantContext vc, List fields } } + String val = MISSING_DATA; if ( wildVals.size() > 0 ) { List<String> toVal = new ArrayList<String>(wildVals); Collections.sort(toVal); val = Utils.join(",", toVal); } + + addFieldValue(val, records); } else if ( !
allowMissingData ) { throw new UserException(String.format("Missing field %s in vc %s at %s", field, vc.getSource(), vc)); + } else { + addFieldValue(MISSING_DATA, records); } + } - if (field.equals("AF") || field.equals("AC")) { - String afo = val; - - double af=0; - if (afo.contains(",")) { - String[] afs = afo.split(","); - afs[0] = afs[0].substring(1,afs[0].length()); - afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1); - - double[] afd = new double[afs.length]; - - for (int k=0; k < afd.length; k++) - afd[k] = Double.valueOf(afs[k]); - - if (kma && logsum) - af = MathUtils.sum(afd); - else - af = MathUtils.arrayMax(afd); - //af = Double.valueOf(afs[0]); - - } - else - if (!afo.equals("NA")) - af = Double.valueOf(afo); - - val = Double.toString(af); + return records; + } - } - vals.add(val); + private static void addFieldValue(Object val, List<List<String>> result) { + final int numResultRecords = result.size(); + + // if we're trying to create a single output record, add it + if ( numResultRecords == 1 ) { + result.get(0).add(val.toString()); + } + // if this field is a list of the proper size, add the appropriate entry to each record + else if ( (val instanceof List) && ((List)val).size() == numResultRecords ) { + final List list = (List)val; + for ( int i = 0; i < numResultRecords; i++ ) + result.get(i).add(list.get(i).toString()); + } + // otherwise, add the original value to all of the records + else { + final String valStr = val.toString(); + for ( List record : result ) + record.add(valStr); } - - return vals; } - public static List<String> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData) { - return extractFields(vc, fields, allowMissingData, false, false); + public static List<List<String>> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData) { + return extractFields(vc, fields, allowMissingData, false); } // // default reduce -- doesn't do anything at all @@ -272,12 +272,9 @@ public static abstract class Getter { public abstract String get(VariantContext getters.put("POS", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getStart()); } }); getters.put("REF", new Getter() { public String get(VariantContext vc) { - String x = ""; - if ( vc.hasReferenceBaseForIndel() && !vc.isSNP() ) { - Byte refByte = vc.getReferenceBaseForIndel(); - x=x+new String(new byte[]{refByte}); - } - return x+vc.getReference().getDisplayString(); + StringBuilder x = new StringBuilder(); + x.append(getAlleleDisplayString(vc, vc.getReference())); + return x.toString(); } }); getters.put("ALT", new Getter() { @@ -285,14 +282,10 @@ public String get(VariantContext vc) { StringBuilder x = new StringBuilder(); int n = vc.getAlternateAlleles().size(); if ( n == 0 ) return "."; - if ( vc.hasReferenceBaseForIndel() && !vc.isSNP() ) { - Byte refByte = vc.getReferenceBaseForIndel(); - x.append(new String(new byte[]{refByte})); - } for ( int i = 0; i < n; i++ ) { if ( i != 0 ) x.append(","); - x.append(vc.getAlternateAllele(i).getDisplayString()); + x.append(getAlleleDisplayString(vc, vc.getAlternateAllele(i))); } return x.toString(); } @@ -324,10 +317,29 @@ public String get(VariantContext vc) { getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } }); getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } }); getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return
Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } }); + getters.put("MULTI-ALLELIC", new Getter() { public String get(VariantContext vc) { return Boolean.toString(vc.getAlternateAlleles().size() > 1); } }); getters.put("GQ", new Getter() { public String get(VariantContext vc) { if ( vc.getNSamples() > 1 ) throw new UserException("Cannot get GQ values for multi-sample VCF"); return String.format("%.2f", -10 * vc.getGenotype(0).getLog10PError()); }}); } - + + private static String getAlleleDisplayString(VariantContext vc, Allele allele) { + StringBuilder sb = new StringBuilder(); + if ( vc.hasReferenceBaseForIndel() && !vc.isSNP() ) + sb.append((char)vc.getReferenceBaseForIndel().byteValue()); + sb.append(allele.getDisplayString()); + return sb.toString(); + } + + private static Object splitAltAlleles(VariantContext vc) { + final int numAltAlleles = vc.getAlternateAlleles().size(); + if ( numAltAlleles == 1 ) + return getAlleleDisplayString(vc, vc.getAlternateAllele(0)); + + final List alleles = new ArrayList(numAltAlleles); + for ( Allele allele : vc.getAlternateAlleles() ) + alleles.add(getAlleleDisplayString(vc, allele)); + return alleles; + } } diff --git a/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java b/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java index d7b34a2530..f948a9bcfa 100644 --- a/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java +++ b/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java @@ -71,6 +71,14 @@ automatically autoRead(), and the API user will have to pass the public class LibBat { static { + // via Platform LSF Configuration Reference, by default quiet the BSUB output. + if ("Y".equals(System.getProperty("BSUB_QUIET", "Y"))) + LibC.setenv("BSUB_QUIET", "Y", 1); + String lsfLibDir = System.getenv("LSF_LIBDIR"); + if (lsfLibDir != null) { + NativeLibrary.addSearchPath("lsf", lsfLibDir); + NativeLibrary.addSearchPath("bat", lsfLibDir); + } /* LSF 7.0.6 on the mac is missing the unsatisfied exported symbol for environ which was removed on MacOS X 10.5+. nm $LSF_LIBDIR/liblsf.dylib | grep environ @@ -79,16 +87,14 @@ public class LibBat { */ if (Platform.isMac()) NativeLibrary.getInstance("environhack"); - String lsfLibDir = System.getenv("LSF_LIBDIR"); - if (lsfLibDir != null) { - NativeLibrary.addSearchPath("lsf", lsfLibDir); - NativeLibrary.addSearchPath("bat", lsfLibDir); - } - NativeLibrary.getInstance("lsf"); - // via Platform LSF Configuration Reference, by default quiet the BSUB output. - if ("Y".equals(System.getProperty("BSUB_QUIET", "Y"))) - LibC.setenv("BSUB_QUIET", "Y", 1); + NativeLibrary liblsf = NativeLibrary.getInstance("lsf"); Native.register("bat"); + // HACK: Running into a weird error: + // java.lang.UnsatisfiedLinkError: Unable to load library 'bat': <$LSF_LIBDIR>/libbat.so: undefined symbol: xdr_resourceInfoReq + // This function is very clearly unsatisfied by running 'nm $LSF_LIBDIR/libbat.so | grep xdr_resourceInfoReq' but is + // found in liblsf.so when running 'nm $LSF_LIBDIR/liblsf.so | grep xdr_resourceInfoReq'. For now holding on to a reference + // to the LSF lib just in case this is a problem with the NativeLibrary's internal WeakReferences and the library being unloaded? 
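// [Editor's sketch] The static initializer above resolves liblsf/libbat through
// $LSF_LIBDIR and pins a reference to liblsf so its symbols stay resolvable. A minimal,
// self-contained illustration of that load order using JNA's NativeLibrary API
// (NativeLoadSketch is a hypothetical class; running it requires LSF to be installed):
import com.sun.jna.NativeLibrary;

public final class NativeLoadSketch {
    // Strong reference keeps liblsf from being collected/unloaded while in use.
    private static final NativeLibrary LIBLSF;

    static {
        final String lsfLibDir = System.getenv("LSF_LIBDIR");
        if (lsfLibDir != null) {
            // Tell JNA where to look before asking it to load either library.
            NativeLibrary.addSearchPath("lsf", lsfLibDir);
            NativeLibrary.addSearchPath("bat", lsfLibDir);
        }
        // Load liblsf first so symbols like xdr_resourceInfoReq are available
        // when libbat (which depends on them) is loaded afterwards.
        LIBLSF = NativeLibrary.getInstance("lsf");
        NativeLibrary.getInstance("bat");
    }

    public static void main(final String[] args) {
        System.out.println("liblsf handle: " + LIBLSF);
    }
}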
+ liblsf.getFunction("xdr_resourceInfoReq").getName(); } // Via support@platform.com: diff --git a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java index cdfc329e81..00a6ac1ae8 100644 --- a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java +++ b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -70,17 +70,18 @@ else if(getShortFieldGetter().equals(getFieldName())) " * Short name of %1$s%n" + " * @return Short name of %1$s%n" + " */%n" + - "def %3$s = this.%1$s%n" + + "%5$sdef %3$s = this.%1$s%n" + "%n" + "/**%n" + " * Short name of %1$s%n" + " * @param value Short name of %1$s%n" + " */%n" + - "def %4$s(value: %2$s) { this.%1$s = value }%n", + "%5$sdef %4$s(value: %2$s) { this.%1$s = value }%n", getFieldName(), getFieldType(), getShortFieldGetter(), - getShortFieldSetter()); + getShortFieldSetter(), + getPrivacy()); } protected static final String REQUIRED_TEMPLATE = " + required(\"%1$s\", %3$s, spaceSeparated=true, escape=true, format=%2$s)"; @@ -135,11 +136,8 @@ private static List getArgumentFields(ArgumentDefinitio new IntervalFileArgumentField(argumentDefinition), new IntervalStringArgumentField(argumentDefinition)); - // ROD Bindings are set by the RodBindField - } else if (RodBindArgumentField.ROD_BIND_FIELD.equals(argumentDefinition.fullName) && argumentDefinition.ioType == ArgumentIOType.INPUT) { - // TODO: Once everyone is using @Allows and @Requires correctly, we can stop blindly allowing Triplets - return Arrays.asList(new RodBindArgumentField(argumentDefinition), new InputIndexesArgumentField(argumentDefinition, Tribble.STANDARD_INDEX_EXTENSION)); - //return Collections.emptyList(); + } else if (NumThreadsArgumentField.NUM_THREADS_FIELD.equals(argumentDefinition.fullName)) { + return Arrays.asList(new NumThreadsArgumentField(argumentDefinition)); } else if ("input_file".equals(argumentDefinition.fullName) && argumentDefinition.ioType == ArgumentIOType.INPUT) { return Arrays.asList(new InputTaggedFileDefinitionField(argumentDefinition), new InputIndexesArgumentField(argumentDefinition, BAMIndex.BAMIndexSuffix, ".bam")); @@ -166,10 +164,13 @@ else if (VCFWriter.class.isAssignableFrom(argumentDefinition.argumentType)) fields.add(new OutputArgumentField(argumentDefinition, gatherClass)); - if (SAMFileWriter.class.isAssignableFrom(argumentDefinition.argumentType)) + if (SAMFileWriter.class.isAssignableFrom(argumentDefinition.argumentType)) { fields.add(new SAMFileWriterIndexArgumentField(argumentDefinition)); - else if (VCFWriter.class.isAssignableFrom(argumentDefinition.argumentType)) + fields.add(new SAMFileWriterMD5ArgumentField(argumentDefinition)); + } + else if (VCFWriter.class.isAssignableFrom(argumentDefinition.argumentType)) { fields.add(new VCFWriterIndexArgumentField(argumentDefinition)); + } return fields; @@ -228,7 +229,7 @@ public IntervalStringArgumentField(ArgumentDefinition argumentDefinition) { @Override protected String getRawFieldName() { return super.getRawFieldName() + "String"; } @Override protected String getFullName() { return super.getFullName() + "String"; } @Override protected String 
getRawShortFieldName() { return super.getRawShortFieldName() + "String"; } - @Override protected String getFieldType() { return "List[String]"; } + @Override protected String getFieldType() { return "Seq[String]"; } @Override protected String getDefaultValue() { return "Nil"; } @Override public String getCommandLineTemplate() { return REPEAT_TEMPLATE; } @@ -250,7 +251,7 @@ public InputArgumentField(ArgumentDefinition argumentDefinition) { } @Override protected Class getInnerType() { return File.class; } - @Override protected String getFieldType() { return isMultiValued() ? "List[File]" : "File"; } + @Override protected String getFieldType() { return isMultiValued() ? "Seq[File]" : "File"; } @Override protected String getDefaultValue() { return isMultiValued() ? "Nil" : "_"; } } @@ -294,7 +295,7 @@ public MultiValuedArgumentField(ArgumentDefinition argumentDefinition) { } @Override protected Class getInnerType() { return mapType(argumentDefinition.componentType); } - @Override protected String getFieldType() { return String.format("List[%s]", getType(getInnerType())); } + @Override protected String getFieldType() { return String.format("Seq[%s]", getType(getInnerType())); } @Override protected String getDefaultValue() { return "Nil"; } @Override protected String getCommandLineTemplate() { return REPEAT_TEMPLATE; } } @@ -336,17 +337,16 @@ public DefaultArgumentField(ArgumentDefinition argumentDefinition, boolean useFo } // Allows the user to specify the track name, track type, and the file. - public static class RodBindArgumentField extends ArgumentDefinitionField { - public static final String ROD_BIND_FIELD = "rodBind"; + public static class NumThreadsArgumentField extends OptionedArgumentField { + public static final String NUM_THREADS_FIELD = "num_threads"; - public RodBindArgumentField(ArgumentDefinition argumentDefinition) { - super(argumentDefinition); + public NumThreadsArgumentField(ArgumentDefinition argumentDefinition) { + super(argumentDefinition, false); } - @Override protected Class getInnerType() { return null; } // RodBind does not need to be imported. - @Override protected String getFieldType() { return "List[RodBind]"; } - @Override protected String getDefaultValue() { return "Nil"; } - @Override protected String getCommandLineTemplate() { - return " + repeat(\"%1$s\", %3$s, formatPrefix=RodBind.formatCommandLineParameter, spaceSeparated=true, escape=true, format=%2$s)"; + + @Override + protected String getFreezeFields() { + return String.format("if (num_threads.isDefined) nCoresRequest = num_threads%n"); } } @@ -356,7 +356,7 @@ public InputTaggedFileDefinitionField(ArgumentDefinition argumentDefinition) { super(argumentDefinition); } @Override protected Class getInnerType() { return null; } // TaggedFile does not need to be imported. - @Override protected String getFieldType() { return argumentDefinition.isMultiValued ? "List[File]" : "File"; } + @Override protected String getFieldType() { return argumentDefinition.isMultiValued ? "Seq[File]" : "File"; } @Override protected String getDefaultValue() { return argumentDefinition.isMultiValued ? 
"Nil" : "_"; } @Override protected String getCommandLineTemplate() { if (argumentDefinition.isMultiValued) { @@ -395,10 +395,11 @@ public InputIndexesArgumentField(ArgumentDefinition originalArgumentDefinition, } @Override protected String getFullName() { return this.indexFieldName; } @Override protected boolean isRequired() { return false; } - @Override protected String getFieldType() { return "List[File]"; } + @Override protected String getFieldType() { return "Seq[File]"; } @Override protected String getDefaultValue() { return "Nil"; } @Override protected Class getInnerType() { return File.class; } @Override protected String getRawFieldName() { return this.indexFieldName; } + @Override protected String getPrivacy() { return "private "; } @Override protected String getFreezeFields() { if (originalIsMultiValued) { if (originalSuffix == null) { @@ -434,53 +435,69 @@ public InputIndexesArgumentField(ArgumentDefinition originalArgumentDefinition, } } - // Tracks an automatically generated index - private static abstract class OutputIndexArgumentField extends ArgumentField { - protected final String indexFieldName; + // Tracks an automatically generated index, md5, etc. + private static abstract class AuxilliaryOutputArgumentField extends ArgumentField { protected final String originalFieldName; - public OutputIndexArgumentField(ArgumentDefinition originalArgumentDefinition) { - this.indexFieldName = originalArgumentDefinition.fullName + "Index"; + protected final String auxFieldName; + protected final String auxFieldLabel; + public AuxilliaryOutputArgumentField(ArgumentDefinition originalArgumentDefinition, String auxFieldLabel) { this.originalFieldName = originalArgumentDefinition.fullName; + this.auxFieldName = originalArgumentDefinition.fullName + auxFieldLabel; + this.auxFieldLabel = auxFieldLabel; } @Override protected Class getAnnotationIOClass() { return Output.class; } @Override public String getCommandLineAddition() { return ""; } - @Override protected String getDoc() { return "Automatically generated index for " + this.originalFieldName; } - @Override protected String getFullName() { return this.indexFieldName; } + @Override protected String getDoc() { return String.format("Automatically generated %s for %s", auxFieldLabel.toLowerCase(), this.originalFieldName); } + @Override protected String getFullName() { return this.auxFieldName; } @Override protected boolean isRequired() { return false; } @Override protected String getFieldType() { return "File"; } @Override protected String getDefaultValue() { return "_"; } @Override protected Class getInnerType() { return File.class; } - @Override protected String getRawFieldName() { return this.indexFieldName; } + @Override protected String getRawFieldName() { return this.auxFieldName; } + @Override protected String getPrivacy() { return "private "; } @Override public boolean isGather() { return true; } @Override protected String getGatherAnnotation() { - return String.format("@Gather(classOf[AutoIndexGatherFunction])%n"); + return String.format("@Gather(enabled=false)%n"); } } - private static class VCFWriterIndexArgumentField extends OutputIndexArgumentField { + private static class VCFWriterIndexArgumentField extends AuxilliaryOutputArgumentField { public VCFWriterIndexArgumentField(ArgumentDefinition originalArgumentDefinition) { - super(originalArgumentDefinition); + super(originalArgumentDefinition, "Index"); } @Override protected String getFreezeFields() { return String.format( - ("if (%2$s != null)%n" + + ("if (%2$s != null && 
!org.broadinstitute.sting.utils.io.IOUtils.isSpecialFile(%2$s))%n" + " if (!org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor.isCompressed(%2$s.getPath))%n" + " %1$s = new File(%2$s.getPath + \"%3$s\")%n"), - indexFieldName, originalFieldName, Tribble.STANDARD_INDEX_EXTENSION); + auxFieldName, originalFieldName, Tribble.STANDARD_INDEX_EXTENSION); } } - private static class SAMFileWriterIndexArgumentField extends OutputIndexArgumentField { + private static class SAMFileWriterIndexArgumentField extends AuxilliaryOutputArgumentField { public SAMFileWriterIndexArgumentField(ArgumentDefinition originalArgumentDefinition) { - super(originalArgumentDefinition); + super(originalArgumentDefinition, "Index"); } @Override protected String getFreezeFields() { return String.format( - ("if (%2$s != null)%n" + + ("if (%2$s != null && !org.broadinstitute.sting.utils.io.IOUtils.isSpecialFile(%2$s))%n" + " if (!%3$s)%n" + " %1$s = new File(%2$s.getPath.stripSuffix(\".bam\") + \"%4$s\")%n"), - indexFieldName, originalFieldName, SAMFileWriterArgumentTypeDescriptor.DISABLE_INDEXING_FULLNAME, BAMIndex.BAMIndexSuffix); + auxFieldName, originalFieldName, SAMFileWriterArgumentTypeDescriptor.DISABLE_INDEXING_FULLNAME, BAMIndex.BAMIndexSuffix); + } + } + + private static class SAMFileWriterMD5ArgumentField extends AuxilliaryOutputArgumentField { + public SAMFileWriterMD5ArgumentField(ArgumentDefinition originalArgumentDefinition) { + super(originalArgumentDefinition, "MD5"); + } + @Override protected String getFreezeFields() { + return String.format( + ("if (%2$s != null && !org.broadinstitute.sting.utils.io.IOUtils.isSpecialFile(%2$s))%n" + + " if (%3$s)%n" + + " %1$s = new File(%2$s.getPath + \"%4$s\")%n"), + auxFieldName, originalFieldName, SAMFileWriterArgumentTypeDescriptor.ENABLE_MD5_FULLNAME, ".md5"); } } diff --git a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentField.java b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentField.java index e90933504a..2428a13a80 100644 --- a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentField.java +++ b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentField.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -56,7 +56,7 @@ public final String getArgumentAddition() { return String.format("%n" + "/** %s */%n" + "@%s(fullName=\"%s\", shortName=\"%s\", doc=\"%s\", required=%s, exclusiveOf=\"%s\", validation=\"%s\")%n" + - "%svar %s: %s = %s%n" + + "%s%svar %s: %s = %s%n" + "%s", getDoc(), getAnnotationIOClass().getSimpleName(), @@ -66,7 +66,7 @@ public final String getArgumentAddition() { isRequired(), getExclusiveOf(), getValidation(), - getGatherAnnotation(), getFieldName(), getFieldType(), getDefaultValue(), + getGatherAnnotation(), getPrivacy(), getFieldName(), getFieldType(), getDefaultValue(), getDefineAddition()); } @@ -143,6 +143,9 @@ protected Collection> getDependentClasses() { /** @return True if this field uses @Gather. */ public boolean isGather() { return false; } + /** @return Privacy for the field. */ + protected String getPrivacy() { return ""; } + /** @return The raw field name, which will be checked against scala build in types. */ protected abstract String getRawFieldName(); /** @return The field name checked against reserved words. 
*/ diff --git a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java index 9c40fb976a..a3f80af1c2 100644 --- a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java +++ b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -34,13 +34,11 @@ import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.WalkerManager; -import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.filters.FilterManager; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.io.stubs.OutputStreamArgumentTypeDescriptor; import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterArgumentTypeDescriptor; import org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; import org.broadinstitute.sting.gatk.walkers.PartitionBy; import org.broadinstitute.sting.gatk.walkers.PartitionType; import org.broadinstitute.sting.gatk.walkers.Walker; @@ -85,7 +83,7 @@ public class GATKExtensionsGenerator extends CommandLineProgram { "%n" + "/** A dynamically generated list of classes that the GATK Extensions depend on, but are not detected by default by BCEL. */%n" + "class %s {%n" + - "val types = List(%n%s)%n" + + "val types = Seq(%n%s)%n" + "}%n"; @Output(fullName="output_directory", shortName="outDir", doc="Directory to output the generated scala", required=true) @@ -95,10 +93,6 @@ public class GATKExtensionsGenerator extends CommandLineProgram { GenomeAnalysisEngine GATKEngine = new GenomeAnalysisEngine(); WalkerManager walkerManager = new WalkerManager(); FilterManager filterManager = new FilterManager(); - // HACK: We're currently relying on the fact that RMDTrackBuilder is used only from RMD type lookups, not - // RMD track location. Therefore, no sequence dictionary is required. In the future, we should separate - // RMD track lookups from track creation. - RMDTrackBuilder trackBuilder = new RMDTrackBuilder(null,null,ValidationExclusion.TYPE.ALL); /** * Required main method implementation. @@ -147,7 +141,7 @@ protected int execute() { String clpConstructor = String.format("analysisName = \"%s\"%njavaMainClass = \"%s\"%n", clpClassName, clp.getName()); writeClass("org.broadinstitute.sting.queue.function.JavaCommandLineFunction", clpClassName, - false, clpConstructor, ArgumentDefinitionField.getArgumentFields(parser,clp), dependents, false); + false, clpConstructor, ArgumentDefinitionField.getArgumentFields(parser,clp), dependents); if (clp == CommandLineGATK.class) { for (Entry>> walkersByPackage: walkerManager.getWalkerNamesByPackage(false).entrySet()) { @@ -169,7 +163,7 @@ protected int execute() { } writeClass(GATK_EXTENSIONS_PACKAGE_NAME + "."
+ clpClassName, walkerName, - isScatter, constructor, argumentFields, dependents, true); + isScatter, constructor, argumentFields, dependents); } catch (Exception e) { throw new ReviewedStingException("Error generating wrappers for walker " + walkerType, e); } @@ -242,8 +236,8 @@ private String getScatterClass(Class walkerType) { */ private void writeClass(String baseClass, String className, boolean isScatter, String constructor, List argumentFields, - Set> dependents, boolean isGATKWalker) throws IOException { - String content = getContent(CLASS_TEMPLATE, baseClass, className, constructor, isScatter, "", argumentFields, dependents, isGATKWalker); + Set> dependents) throws IOException { + String content = getContent(CLASS_TEMPLATE, baseClass, className, constructor, isScatter, "", argumentFields, dependents); writeFile(GATK_EXTENSIONS_PACKAGE_NAME + "." + className, content); } @@ -257,7 +251,7 @@ private void writeClass(String baseClass, String className, boolean isScatter, */ private void writeFilter(String className, List argumentFields, Set> dependents) throws IOException { String content = getContent(TRAIT_TEMPLATE, "org.broadinstitute.sting.queue.function.CommandLineFunction", - className, "", false, String.format(" + \" -read_filter %s\"", className), argumentFields, dependents, false); + className, "", false, String.format(" + \" -read_filter %s\"", className), argumentFields, dependents); writeFile(GATK_EXTENSIONS_PACKAGE_NAME + "." + className, content); } @@ -351,8 +345,7 @@ private void writeFile(String fullClassName, String content) throws IOException */ private static String getContent(String scalaTemplate, String baseClass, String className, String constructor, boolean isScatter, String commandLinePrefix, - List argumentFields, Set> dependents, - boolean isGATKWalker) { + List argumentFields, Set> dependents) { StringBuilder arguments = new StringBuilder(); StringBuilder commandLine = new StringBuilder(commandLinePrefix); @@ -376,9 +369,6 @@ private static String getContent(String scalaTemplate, String baseClass, String if (isGather) importSet.add("import org.broadinstitute.sting.commandline.Gather"); - // Needed for ShellUtils.escapeShellArgument() - importSet.add("import org.broadinstitute.sting.queue.util.ShellUtils"); - // Sort the imports so that the are always in the same order. 
List sortedImports = new ArrayList(importSet); Collections.sort(sortedImports); @@ -386,10 +376,8 @@ private static String getContent(String scalaTemplate, String baseClass, String StringBuffer freezeFieldOverride = new StringBuffer(); for (String freezeField: freezeFields) freezeFieldOverride.append(freezeField); - if (freezeFieldOverride.length() > 0 || isGATKWalker) { - freezeFieldOverride.insert(0, String.format("override def freezeFieldValues = {%nsuper.freezeFieldValues%n")); - if ( isGATKWalker ) - freezeFieldOverride.append(String.format("if ( num_threads.isDefined ) nCoresRequest = num_threads%n")); + if (freezeFieldOverride.length() > 0) { + freezeFieldOverride.insert(0, String.format("override def freezeFieldValues() {%nsuper.freezeFieldValues()%n")); freezeFieldOverride.append(String.format("}%n%n")); } diff --git a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java index 673b1524da..61812629ce 100644 --- a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java @@ -2,57 +2,59 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; - /** * BaseUtils contains some basic utilities for manipulating nucleotides. */ public class BaseUtils { - public final static byte A = (byte)'A'; - public final static byte C = (byte)'C'; - public final static byte G = (byte)'G'; - public final static byte T = (byte)'T'; + public final static byte A = (byte) 'A'; + public final static byte C = (byte) 'C'; + public final static byte G = (byte) 'G'; + public final static byte T = (byte) 'T'; - public final static byte N = (byte)'N'; - public final static byte D = (byte)'D'; + public final static byte N = (byte) 'N'; + public final static byte D = (byte) 'D'; // // todo -- we need a generalized base abstraction using the Base enum. // - public final static byte[] BASES = { 'A', 'C', 'G', 'T' }; - public final static byte[] EXTENDED_BASES = { 'A', 'C', 'G', 'T', 'N', 'D' }; + public final static byte[] BASES = {'A', 'C', 'G', 'T'}; + public final static byte[] EXTENDED_BASES = {'A', 'C', 'G', 'T', 'N', 'D'}; public enum Base { - A ( 'A', 0 ), - C ( 'C', 1 ), - G ( 'G', 2 ), - T ( 'T', 3 ); + A('A', 0), + C('C', 1), + G('G', 2), + T('T', 3); byte b; int index; + private Base(char base, int index) { - this.b = (byte)base; + this.b = (byte) base; this.index = index; } public byte getBase() { return b; } - public char getBaseAsChar() { return (char)b; } + + public char getBaseAsChar() { return (char) b; } + public int getIndex() { return index; } public boolean sameBase(byte o) { return b == o; } - public boolean sameBase(char o) { return b == (byte)o; } - public boolean sameBase(int i) { return index == i; } - } + public boolean sameBase(char o) { return b == (byte) o; } + + public boolean sameBase(int i) { return index == i; } + } // todo -- fix me (enums?) 
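// A quick illustration (sketch, not part of the patch) of the Base enum's byte/index
// pairing as defined just above:
//     BaseUtils.Base g = BaseUtils.Base.G;
//     g.sameBase((byte) 'G');   // true: compares the enum against a raw base byte
//     g.getIndex();             // 2, matching BaseUtils.simpleBaseToBaseIndex((byte) 'G')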
public static final byte DELETION_INDEX = 4; public static final byte NO_CALL_INDEX = 5; // (this is 'N') - public static int gIndex = BaseUtils.simpleBaseToBaseIndex((byte)'G'); - public static int cIndex = BaseUtils.simpleBaseToBaseIndex((byte)'C'); - public static int aIndex = BaseUtils.simpleBaseToBaseIndex((byte)'A'); - public static int tIndex = BaseUtils.simpleBaseToBaseIndex((byte)'T'); - + public static int gIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'G'); + public static int cIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'C'); + public static int aIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'A'); + public static int tIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'T'); /// In genetics, a transition is a mutation changing a purine to another purine nucleotide (A <-> G) or // a pyrimidine to another pyrimidine nucleotide (C <-> T). @@ -64,28 +66,31 @@ public enum BaseSubstitutionType { /** * Returns the base substitution type of the 2 state SNP + * * @param base1 * @param base2 * @return */ - public static BaseSubstitutionType SNPSubstitutionType( byte base1, byte base2 ) { + public static BaseSubstitutionType SNPSubstitutionType(byte base1, byte base2) { BaseSubstitutionType t = isTransition(base1, base2) ? BaseSubstitutionType.TRANSITION : BaseSubstitutionType.TRANSVERSION; //System.out.printf("SNPSubstitutionType( char %c, char %c ) => %s%n", base1, base2, t); return t; } - public static boolean isTransition( byte base1, byte base2 ) { + public static boolean isTransition(byte base1, byte base2) { int b1 = simpleBaseToBaseIndex(base1); int b2 = simpleBaseToBaseIndex(base2); return b1 == 0 && b2 == 2 || b1 == 2 && b2 == 0 || - b1 == 1 && b2 == 3 || b1 == 3 && b2 == 1; + b1 == 1 && b2 == 3 || b1 == 3 && b2 == 1; } - public static boolean isTransversion( byte base1, byte base2 ) { - return ! isTransition(base1, base2); + public static boolean isTransversion(byte base1, byte base2) { + return !isTransition(base1, base2); } - /** Private constructor. No instantiating this class! */ + /** + * Private constructor. No instantiating this class! 
+ */ private BaseUtils() {} static public boolean basesAreEqual(byte base1, byte base2) { @@ -96,7 +101,6 @@ static public boolean extendedBasesAreEqual(byte base1, byte base2) { return extendedBaseToBaseIndex(base1) == extendedBaseToBaseIndex(base2); } - /** * Converts a IUPAC nucleotide code to a pair of bases * @@ -163,33 +167,37 @@ static public char[] iupacToBases(char code) { /** * Converts a simple base to a base index * - * @param base [AaCcGgTt] + * @param base [AaCcGgTt] * @return 0, 1, 2, 3, or -1 if the base can't be understood */ static public int simpleBaseToBaseIndex(byte base) { switch (base) { case '*': // the wildcard character counts as an A case 'A': - case 'a': return 0; + case 'a': + return 0; case 'C': - case 'c': return 1; + case 'c': + return 1; case 'G': - case 'g': return 2; + case 'g': + return 2; case 'T': - case 't': return 3; + case 't': + return 3; - default: return -1; + default: + return -1; } } - /** * Converts a simple base to a base index * - * @param base [AaCcGgTt] + * @param base [AaCcGgTt] * @return 0, 1, 2, 3, or -1 if the base can't be understood */ @Deprecated @@ -197,29 +205,37 @@ static public int simpleBaseToBaseIndex(char base) { switch (base) { case '*': // the wildcard character counts as an A case 'A': - case 'a': return 0; + case 'a': + return 0; case 'C': - case 'c': return 1; + case 'c': + return 1; case 'G': - case 'g': return 2; + case 'g': + return 2; case 'T': - case 't': return 3; + case 't': + return 3; - default: return -1; + default: + return -1; } } static public int extendedBaseToBaseIndex(byte base) { switch (base) { case 'd': - case 'D': return DELETION_INDEX; + case 'D': + return DELETION_INDEX; case 'n': - case 'N': return NO_CALL_INDEX; + case 'N': + return NO_CALL_INDEX; - default: return simpleBaseToBaseIndex(base); + default: + return simpleBaseToBaseIndex(base); } } @@ -232,11 +248,6 @@ static public boolean isRegularBase(byte base) { return simpleBaseToBaseIndex(base) != -1; } - @Deprecated - static public boolean isNBase(char base) { - return isNBase((byte)base); - } - static public boolean isNBase(byte base) { return base == 'N' || base == 'n'; } @@ -244,68 +255,83 @@ static public boolean isNBase(byte base) { /** * Converts a base index to a simple base * - * @param baseIndex 0, 1, 2, 3 + * @param baseIndex 0, 1, 2, 3 * @return A, C, G, T, or '.' 
if the index can't be understood */ static public byte baseIndexToSimpleBase(int baseIndex) { switch (baseIndex) { - case 0: return 'A'; - case 1: return 'C'; - case 2: return 'G'; - case 3: return 'T'; - default: return '.'; + case 0: + return 'A'; + case 1: + return 'C'; + case 2: + return 'G'; + case 3: + return 'T'; + default: + return '.'; } } @Deprecated static public char baseIndexToSimpleBaseAsChar(int baseIndex) { - return (char)baseIndexToSimpleBase(baseIndex); + return (char) baseIndexToSimpleBase(baseIndex); } /** * Converts a base index to a base index representing its cross-talk partner * - * @param baseIndex 0, 1, 2, 3 + * @param baseIndex 0, 1, 2, 3 * @return 1, 0, 3, 2, or -1 if the index can't be understood */ static public int crossTalkPartnerIndex(int baseIndex) { switch (baseIndex) { - case 0: return 1; // A -> C - case 1: return 0; // C -> A - case 2: return 3; // G -> T - case 3: return 2; // T -> G - default: return -1; + case 0: + return 1; // A -> C + case 1: + return 0; // C -> A + case 2: + return 3; // G -> T + case 3: + return 2; // T -> G + default: + return -1; } } /** * Converts a base to the base representing its cross-talk partner * - * @param base [AaCcGgTt] + * @param base [AaCcGgTt] * @return C, A, T, G, or '.' if the base can't be understood */ @Deprecated static public char crossTalkPartnerBase(char base) { - return (char)baseIndexToSimpleBase(crossTalkPartnerIndex(simpleBaseToBaseIndex(base))); + return (char) baseIndexToSimpleBase(crossTalkPartnerIndex(simpleBaseToBaseIndex(base))); } /** * Return the complement of a base index. * - * @param baseIndex the base index (0:A, 1:C, 2:G, 3:T) + * @param baseIndex the base index (0:A, 1:C, 2:G, 3:T) * @return the complementary base index */ static public byte complementIndex(int baseIndex) { switch (baseIndex) { - case 0: return 3; // a -> t - case 1: return 2; // c -> g - case 2: return 1; // g -> c - case 3: return 0; // t -> a - default: return -1; // wtf? + case 0: + return 3; // a -> t + case 1: + return 2; // c -> g + case 2: + return 1; // g -> c + case 3: + return 0; // t -> a + default: + return -1; // wtf? } } - /** + /** * Return the complement (A <-> T or C <-> G) of a base, or the specified base if it can't be complemented (i.e. an ambiguous base). 
* * @param base the base [AaCcGgTt] @@ -314,20 +340,25 @@ static public byte complementIndex(int baseIndex) { static public byte simpleComplement(byte base) { switch (base) { case 'A': - case 'a': return 'T'; + case 'a': + return 'T'; case 'C': - case 'c': return 'G'; + case 'c': + return 'G'; case 'G': - case 'g': return 'C'; + case 'g': + return 'C'; case 'T': - case 't': return 'A'; - default: return base; + case 't': + return 'A'; + default: + return base; } } @Deprecated static public char simpleComplement(char base) { - return (char)simpleComplement((byte)base); + return (char) simpleComplement((byte) base); } /** @@ -349,7 +380,7 @@ static public byte[] simpleReverseComplement(byte[] bases) { /** * Complement a byte array of bases (that is, chars casted to bytes, *not* base indices in byte form) * - * @param bases the byte array of bases + * @param bases the byte array of bases * @return the complement of the base byte array */ static public byte[] simpleComplement(byte[] bases) { @@ -382,7 +413,7 @@ static public char[] simpleReverseComplement(char[] bases) { /** * Complement a char array of bases * - * @param bases the char array of bases + * @param bases the char array of bases * @return the complement of the base char array */ @Deprecated @@ -399,7 +430,7 @@ static public char[] simpleComplement(char[] bases) { /** * Reverse complement a String of bases. Preserves ambiguous bases. * - * @param bases the String of bases + * @param bases the String of bases * @return the reverse complement of the String */ @Deprecated @@ -407,11 +438,10 @@ static public String simpleReverseComplement(String bases) { return new String(simpleReverseComplement(bases.getBytes())); } - /** * Complement a String of bases. Preserves ambiguous bases. * - * @param bases the String of bases + * @param bases the String of bases * @return the complement of the String */ @Deprecated @@ -451,7 +481,7 @@ static public int mostFrequentBaseIndexNotRef(int[] baseCounts, byte refSimpleBa /** * Returns the most common base in the basecounts array. To be used with pileup.getBaseCounts. * - * @param baseCounts counts of a,c,g,t in order. + * @param baseCounts counts of a,c,g,t in order. * @return the most common base */ static public byte mostFrequentSimpleBase(int[] baseCounts) { @@ -461,13 +491,13 @@ static public byte mostFrequentSimpleBase(int[] baseCounts) { /** * For the most frequent base in the sequence, return the percentage of the read it constitutes. 
* - * @param sequence the read sequence - * @return the percentage of the read that's made up of the most frequent base + * @param sequence the read sequence + * @return the percentage of the read that's made up of the most frequent base */ static public double mostFrequentBaseFraction(byte[] sequence) { int[] baseCounts = new int[4]; - for ( byte base : sequence ) { + for (byte base : sequence) { int baseIndex = simpleBaseToBaseIndex(base); if (baseIndex >= 0) { @@ -477,7 +507,7 @@ static public double mostFrequentBaseFraction(byte[] sequence) { int mostFrequentBaseIndex = mostFrequentBaseIndex(baseCounts); - return ((double) baseCounts[mostFrequentBaseIndex])/((double) sequence.length); + return ((double) baseCounts[mostFrequentBaseIndex]) / ((double) sequence.length); } // -------------------------------------------------------------------------------- @@ -531,50 +561,50 @@ static public byte getRandomBase() { static public byte getRandomBase(char excludeBase) { return BaseUtils.baseIndexToSimpleBase(getRandomBaseIndex(BaseUtils.simpleBaseToBaseIndex(excludeBase))); } - - - /** Computes the smallest period >= minPeriod for the specified string. The period is defined as such p, + + /** + * Computes the smallest period >= minPeriod for the specified string. The period is defined as such p, * that for all i = 0... seq.length-1, seq[ i % p ] = seq[i] (or equivalently seq[i] = seq[i+p] for i=0...seq.length-1-p). - * The sequence does not have to contain whole number of periods. For instance, "ACACACAC" has a period - * of 2 (it has a period of 4 as well), and so does - * "ACACA"; similarly, smallest periods of "CTCCTC", "CTCCT", and "CTCC" are all equal to 3. The "trivial" period is + * The sequence does not have to contain whole number of periods. For instance, "ACACACAC" has a period + * of 2 (it has a period of 4 as well), and so does + * "ACACA"; similarly, smallest periods of "CTCCTC", "CTCCT", and "CTCC" are all equal to 3. The "trivial" period is * the length of the string itself, and it will always be returned if no smaller period can be found in the specified period range * or if specified minPeriod is greater than the sequence length. - * + * * @param seq * @return */ public static int sequencePeriod(byte[] seq, int minPeriod) { - int period = ( minPeriod > seq.length ? seq.length : minPeriod ); - // we assume that bases [0,period-1] repeat themselves and check this assumption - // until we find correct period - - for ( int pos = period ; pos < seq.length ; pos++ ) { - - int offset = pos % period; // we are currenlty 'offset' bases into the putative repeat of period 'period' - // if our current hypothesis holds, base[pos] must be the same as base[offset] - - if ( Character.toUpperCase( seq[pos] ) != - Character.toUpperCase( seq[offset] ) - ) { - - // period we have been trying so far does not work. - // two possibilities: - // A) offset = 0, i.e. current position pos must be start of the next repeat, but it is not; - // in this case only bases from start up to the current one, inclusive, may form a repeat, if at all; - // so period is at least pos+1 (remember, pos is 0-based), then on the next loop re-entrance - // pos will be autoincremented and we will be checking next base - // B) offset != 0, i.e. the current base breaks the repeat, but maybe it starts a new one? 
- // hence we should first check if it matches the first base of the sequence, and to do that - // we set period to pos (thus trying the hypothesis that bases from start up to the current one, - // non-inclusive are repeated hereafter), and decrement pos (this will re-test current base against the first base - // on the next loop re-entrance after pos is autoincremented) - if ( offset == 0 ) period = pos+1; - else period = pos-- ; - - } - } - return period; + int period = (minPeriod > seq.length ? seq.length : minPeriod); + // we assume that bases [0,period-1] repeat themselves and check this assumption + // until we find correct period + + for (int pos = period; pos < seq.length; pos++) { + + int offset = pos % period; // we are currenlty 'offset' bases into the putative repeat of period 'period' + // if our current hypothesis holds, base[pos] must be the same as base[offset] + + if (Character.toUpperCase(seq[pos]) != Character.toUpperCase(seq[offset])) { + + // period we have been trying so far does not work. + // two possibilities: + // A) offset = 0, i.e. current position pos must be start of the next repeat, but it is not; + // in this case only bases from start up to the current one, inclusive, may form a repeat, if at all; + // so period is at least pos+1 (remember, pos is 0-based), then on the next loop re-entrance + // pos will be autoincremented and we will be checking next base + // B) offset != 0, i.e. the current base breaks the repeat, but maybe it starts a new one? + // hence we should first check if it matches the first base of the sequence, and to do that + // we set period to pos (thus trying the hypothesis that bases from start up to the current one, + // non-inclusive are repeated hereafter), and decrement pos (this will re-test current base against the first base + // on the next loop re-entrance after pos is autoincremented) + if (offset == 0) + period = pos + 1; + else + period = pos--; + + } + } + return period; } } diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 345161416b..41ca58157b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -145,7 +145,7 @@ public GenomeLoc merge( GenomeLoc that ) throws ReviewedStingException { } return new GenomeLoc(getContig(), this.contigIndex, - Math.min(getStart(), that.getStart()), + Math.min( getStart(), that.getStart() ), Math.max( getStop(), that.getStop()) ); } @@ -436,7 +436,7 @@ public boolean endsAt(GenomeLoc that) { * never be < 1. */ @Ensures("result > 0") - public long size() { + public int size() { return stop - start + 1; } @@ -465,4 +465,8 @@ public final double reciprocialOverlapFraction(final GenomeLoc o) { private final static double overlapPercent(final GenomeLoc gl1, final GenomeLoc gl2) { return (1.0 * gl1.intersect(gl2).size()) / gl1.size(); } + + public long sizeOfOverlap( final GenomeLoc that ) { + return ( this.overlapsP(that) ? 
Math.min( getStop(), that.getStop() ) - Math.max( getStart(), that.getStart() ) + 1L : 0L ); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java index 26be0e59ef..d11adf9e3a 100755 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java @@ -127,6 +127,21 @@ public boolean isEmpty() { return mArray.isEmpty(); } + /** + * Determine if the given loc overlaps any loc in the sorted set + * + * @param loc the location to test + * @return + */ + public boolean overlaps(final GenomeLoc loc) { + for(final GenomeLoc e : mArray) { + if(e.overlapsP(loc)) { + return true; + } + } + return false; + } + /** * add a genomeLoc to the collection, simply inserting in order into the set * diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index df682f215f..085794babc 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -24,11 +24,16 @@ package org.broadinstitute.sting.utils; +import com.google.java.contract.Requires; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.variantcontext.Allele; import java.util.Arrays; +import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; @@ -109,6 +114,51 @@ public boolean isReference() { return isReference; } + @Requires({"refInsertLocation >= 0", "hapStartInRefCoords >= 0"}) + public byte[] insertAllele( final Allele refAllele, final Allele altAllele, int refInsertLocation, final int hapStartInRefCoords, final Cigar haplotypeCigar ) { + + if( refAllele.length() != altAllele.length() ) { refInsertLocation++; } + int haplotypeInsertLocation = getHaplotypeCoordinateForReferenceCoordinate(hapStartInRefCoords, haplotypeCigar, refInsertLocation); + if( haplotypeInsertLocation == -1 ) { // desired change falls inside deletion so don't bother creating a new haplotype + return bases.clone(); + } + byte[] newHaplotype; + + try { + if( refAllele.length() == altAllele.length() ) { // SNP or MNP + newHaplotype = bases.clone(); + for( int iii = 0; iii < altAllele.length(); iii++ ) { + newHaplotype[haplotypeInsertLocation+iii] = altAllele.getBases()[iii]; + } + } else if( refAllele.length() < altAllele.length() ) { // insertion + final int altAlleleLength = altAllele.length(); + newHaplotype = new byte[bases.length + altAlleleLength]; + for( int iii = 0; iii < bases.length; iii++ ) { + newHaplotype[iii] = bases[iii]; + } + for( int iii = newHaplotype.length - 1; iii > haplotypeInsertLocation + altAlleleLength - 1; iii-- ) { + newHaplotype[iii] = newHaplotype[iii-altAlleleLength]; + } + for( int iii = 0; iii < altAlleleLength; iii++ ) { + newHaplotype[haplotypeInsertLocation+iii] = altAllele.getBases()[iii]; + } + } else { // deletion + final int shift = refAllele.length() - altAllele.length(); + newHaplotype = new byte[bases.length - shift]; + for( int iii = 0; iii < haplotypeInsertLocation + altAllele.length(); iii++ ) { + newHaplotype[iii] = bases[iii]; + } + for( int iii = haplotypeInsertLocation + altAllele.length(); iii < newHaplotype.length; iii++ 
) { + newHaplotype[iii] = bases[iii+shift]; + } + } + } catch (Exception e) { // event already on haplotype is too large/complex to insert another allele, most likely because of not enough reference padding + return bases.clone(); + } + + return newHaplotype; + } + public static LinkedHashMap makeHaplotypeListFromAlleles(List alleleList, int startPos, ReferenceContext ref, final int haplotypeSize, final int numPrefBases) { @@ -165,4 +215,84 @@ public static LinkedHashMap makeHaplotypeListFromAlleles(List< return haplotypeMap; } + private static Integer getHaplotypeCoordinateForReferenceCoordinate( final int haplotypeStart, final Cigar haplotypeCigar, final int refCoord ) { + int readBases = 0; + int refBases = 0; + boolean fallsInsideDeletion = false; + + int goal = refCoord - haplotypeStart; // The goal is to move this many reference bases + boolean goalReached = refBases == goal; + + Iterator cigarElementIterator = haplotypeCigar.getCigarElements().iterator(); + while (!goalReached && cigarElementIterator.hasNext()) { + CigarElement cigarElement = cigarElementIterator.next(); + int shift = 0; + + if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP) { + if (refBases + cigarElement.getLength() < goal) + shift = cigarElement.getLength(); + else + shift = goal - refBases; + + refBases += shift; + } + goalReached = refBases == goal; + + if (!goalReached && cigarElement.getOperator().consumesReadBases()) + readBases += cigarElement.getLength(); + + if (goalReached) { + // Is this base's reference position within this cigar element? Or did we use it all? + boolean endsWithinCigar = shift < cigarElement.getLength(); + + // If it isn't, we need to check the next one. There should *ALWAYS* be a next one + // since we checked if the goal coordinate is within the read length, so this is just a sanity check. + if (!endsWithinCigar && !cigarElementIterator.hasNext()) + return -1; + + CigarElement nextCigarElement; + + // if we end inside the current cigar element, we just have to check if it is a deletion + if (endsWithinCigar) + fallsInsideDeletion = cigarElement.getOperator() == CigarOperator.DELETION; + + // if we end outside the current cigar element, we need to check if the next element is an insertion or deletion. + else { + nextCigarElement = cigarElementIterator.next(); + + // if it's an insertion, we need to clip the whole insertion before looking at the next element + if (nextCigarElement.getOperator() == CigarOperator.INSERTION) { + readBases += nextCigarElement.getLength(); + if (!cigarElementIterator.hasNext()) + return -1; + + nextCigarElement = cigarElementIterator.next(); + } + + // if it's a deletion, we will pass the information on to be handled downstream. 
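// Worked example (illustration only, not part of the patch): for a haplotype starting at
// reference position 100 with cigar 3M2D4M, reference coordinates 100-102 resolve to read
// offsets 0-2; coordinates 103-104 fall inside the 2D deletion, so the method ultimately
// returns -1 for them; and coordinate 105 resolves to read offset 3, the first base past
// the deletion.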
+ fallsInsideDeletion = nextCigarElement.getOperator() == CigarOperator.DELETION; + } + + // If we reached our goal outside a deletion, add the shift + if (!fallsInsideDeletion && cigarElement.getOperator().consumesReadBases()) + readBases += shift; + + // If we reached our goal inside a deletion, but the deletion is the next cigar element then we need + // to add the shift of the current cigar element but go back to it's last element to return the last + // base before the deletion (see warning in function contracts) + else if (fallsInsideDeletion && !endsWithinCigar) + readBases += shift - 1; + + // If we reached our goal inside a deletion then we must backtrack to the last base before the deletion + else if (fallsInsideDeletion && endsWithinCigar) + readBases--; + } + } + + if (!goalReached) + return -1; + + return (fallsInsideDeletion ? -1 : readBases); + } + } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 759e1649df..a96cbffc5f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -25,9 +25,11 @@ package org.broadinstitute.sting.utils; +import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import java.math.BigDecimal; @@ -39,40 +41,101 @@ * @author Kiran Garimella */ public class MathUtils { - /** Public constants - used for the Lanczos approximation to the factorial function - * (for the calculation of the binomial/multinomial probability in logspace) - * @param LANC_SEQ[] - an array holding the constants which correspond to the product - * of Chebyshev Polynomial coefficients, and points on the Gamma function (for interpolation) - * [see A Precision Approximation of the Gamma Function J. SIAM Numer. Anal. Ser. B, Vol. 1 1964. pp. 86-96] - * @param LANC_G - a value for the Lanczos approximation to the gamma function that works to - * high precision + + /** + * Private constructor. No instantiating this class! */ + private MathUtils() { + } + + public static final double[] log10Cache; + private static final double[] jacobianLogTable; + private static final double JACOBIAN_LOG_TABLE_STEP = 0.001; + private static final double MAX_JACOBIAN_TOLERANCE = 10.0; + private static final int JACOBIAN_LOG_TABLE_SIZE = (int) (MAX_JACOBIAN_TOLERANCE / JACOBIAN_LOG_TABLE_STEP) + 1; + private static final int MAXN = 11000; + private static final int LOG10_CACHE_SIZE = 4 * MAXN; // we need to be able to go up to 2*(2N) when calculating some of the coefficients + + static { + log10Cache = new double[LOG10_CACHE_SIZE]; + jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE]; + log10Cache[0] = Double.NEGATIVE_INFINITY; + for (int k = 1; k < LOG10_CACHE_SIZE; k++) + log10Cache[k] = Math.log10(k); - /** Private constructor. No instantiating this class! */ - private MathUtils() {} + for (int k = 0; k < JACOBIAN_LOG_TABLE_SIZE; k++) { + jacobianLogTable[k] = Math.log10(1.0 + Math.pow(10.0, -((double) k) * JACOBIAN_LOG_TABLE_STEP)); - @Requires({"d > 0.0"}) - public static int fastPositiveRound(double d) { - return (int) (d + 0.5); + } } + // A fast implementation of the Math.round() method. 
This method does not perform + // under/overflow checking, so this shouldn't be used in the general case (but is fine + // if one is already make those checks before calling in to the rounding). public static int fastRound(double d) { - if ( d > 0.0 ) { - return fastPositiveRound(d); - } else { - return -1*fastPositiveRound(-1*d); + return (d > 0) ? (int) (d + 0.5d) : (int) (d - 0.5d); + } + + public static double approximateLog10SumLog10(final double[] vals) { + return approximateLog10SumLog10(vals, vals.length); + } + + public static double approximateLog10SumLog10(final double[] vals, final int endIndex) { + + final int maxElementIndex = MathUtils.maxElementIndex(vals, endIndex); + double approxSum = vals[maxElementIndex]; + if (approxSum == Double.NEGATIVE_INFINITY) + return approxSum; + + for (int i = 0; i < endIndex; i++) { + if (i == maxElementIndex || vals[i] == Double.NEGATIVE_INFINITY) + continue; + + final double diff = approxSum - vals[i]; + if (diff < MathUtils.MAX_JACOBIAN_TOLERANCE) { + // See notes from the 2-inout implementation below + final int ind = fastRound(diff / MathUtils.JACOBIAN_LOG_TABLE_STEP); // hard rounding + approxSum += MathUtils.jacobianLogTable[ind]; + } + } + + return approxSum; + } + + public static double approximateLog10SumLog10(double small, double big) { + // make sure small is really the smaller value + if (small > big) { + final double t = big; + big = small; + small = t; } + + if (small == Double.NEGATIVE_INFINITY || big == Double.NEGATIVE_INFINITY) + return big; + + final double diff = big - small; + if (diff >= MathUtils.MAX_JACOBIAN_TOLERANCE) + return big; + + // OK, so |y-x| < tol: we use the following identity then: + // we need to compute log10(10^x + 10^y) + // By Jacobian logarithm identity, this is equal to + // max(x,y) + log10(1+10^-abs(x-y)) + // we compute the second term as a table lookup with integer quantization + // we have pre-stored correction for 0,0.1,0.2,... 10.0 + final int ind = fastRound(diff / MathUtils.JACOBIAN_LOG_TABLE_STEP); // hard rounding + return big + MathUtils.jacobianLogTable[ind]; } public static double sum(Collection numbers) { - return sum(numbers,false); + return sum(numbers, false); } - public static double sum( Collection numbers, boolean ignoreNan ) { + public static double sum(Collection numbers, boolean ignoreNan) { double sum = 0; - for ( Number n : numbers ) { - if ( ! ignoreNan || ! Double.isNaN(n.doubleValue())) { + for (Number n : numbers) { + if (!ignoreNan || !Double.isNaN(n.doubleValue())) { sum += n.doubleValue(); } } @@ -82,66 +145,84 @@ public static double sum( Collection numbers, boolean ignoreNan ) { public static int nonNanSize(Collection numbers) { int size = 0; - for ( Number n : numbers) { + for (Number n : numbers) { size += Double.isNaN(n.doubleValue()) ? 
0 : 1; } return size; } - public static double average( Collection numbers, boolean ignoreNan) { - if ( ignoreNan ) { - return sum(numbers,true)/nonNanSize(numbers); - } else { - return sum(numbers,false)/nonNanSize(numbers); + public static double average(Collection x) { + return (double) sum(x) / x.size(); + } + + public static double average(Collection numbers, boolean ignoreNan) { + if (ignoreNan) { + return sum(numbers, true) / nonNanSize(numbers); + } + else { + return sum(numbers, false) / nonNanSize(numbers); } } - public static double variance( Collection numbers, Number mean, boolean ignoreNan ) { + public static double variance(Collection numbers, Number mean, boolean ignoreNan) { double mn = mean.doubleValue(); double var = 0; - for ( Number n : numbers ) { var += ( ! ignoreNan || ! Double.isNaN(n.doubleValue())) ? (n.doubleValue()-mn)*(n.doubleValue()-mn) : 0; } - if ( ignoreNan ) { return var/(nonNanSize(numbers)-1); } - return var/(numbers.size()-1); + for (Number n : numbers) { + var += (!ignoreNan || !Double.isNaN(n.doubleValue())) ? (n.doubleValue() - mn) * (n.doubleValue() - mn) : 0; + } + if (ignoreNan) { + return var / (nonNanSize(numbers) - 1); + } + return var / (numbers.size() - 1); } public static double variance(Collection numbers, Number mean) { - return variance(numbers,mean,false); + return variance(numbers, mean, false); } public static double variance(Collection numbers, boolean ignoreNan) { - return variance(numbers,average(numbers,ignoreNan),ignoreNan); + return variance(numbers, average(numbers, ignoreNan), ignoreNan); } public static double variance(Collection numbers) { - return variance(numbers,average(numbers,false),false); + return variance(numbers, average(numbers, false), false); } public static double sum(double[] values) { double s = 0.0; - for ( double v : values) s += v; + for (double v : values) + s += v; return s; } + public static long sum(int[] x) { + long total = 0; + for (int v : x) + total += v; + return total; + } /** * Calculates the log10 cumulative sum of an array with log10 probabilities + * * @param log10p the array with log10 probabilites - * @param upTo index in the array to calculate the cumsum up to + * @param upTo index in the array to calculate the cumsum up to * @return the log10 of the cumulative sum */ - public static double log10CumulativeSumLog10(double [] log10p, int upTo) { + public static double log10CumulativeSumLog10(double[] log10p, int upTo) { return log10sumLog10(log10p, 0, upTo); } /** * Converts a real space array of probabilities into a log10 array + * * @param prRealSpace * @return */ public static double[] toLog10(double[] prRealSpace) { double[] log10s = new double[prRealSpace.length]; - for ( int i = 0; i < prRealSpace.length; i++ ) + for (int i = 0; i < prRealSpace.length; i++) log10s[i] = Math.log10(prRealSpace[i]); return log10s; } @@ -154,7 +235,7 @@ public static double log10sumLog10(double[] log10p, int start, int finish) { double sum = 0.0; double maxValue = Utils.findMaxEntry(log10p); - for ( int i = start; i < finish; i++ ) { + for (int i = start; i < finish; i++) { sum += Math.pow(10.0, log10p[i] - maxValue); } @@ -163,21 +244,23 @@ public static double log10sumLog10(double[] log10p, int start, int finish) { public static double sumDoubles(List values) { double s = 0.0; - for ( double v : values) s += v; + for (double v : values) + s += v; return s; } public static int sumIntegers(List values) { int s = 0; - for ( int v : values) s += v; + for (int v : values) + s += v; return s; } public static 
double sumLog10(double[] log10values) { return Math.pow(10.0, log10sumLog10(log10values)); -// double s = 0.0; -// for ( double v : log10values) s += Math.pow(10.0, v); -// return s; + // double s = 0.0; + // for ( double v : log10values) s += Math.pow(10.0, v); + // return s; } public static double log10sumLog10(double[] log10values) { @@ -185,11 +268,11 @@ public static double log10sumLog10(double[] log10values) { } public static boolean wellFormedDouble(double val) { - return ! Double.isInfinite(val) && ! Double.isNaN(val); + return !Double.isInfinite(val) && !Double.isNaN(val); } public static double bound(double value, double minBoundary, double maxBoundary) { - return Math.max(Math.min(value, maxBoundary), minBoundary); + return Math.max(Math.min(value, maxBoundary), minBoundary); } public static boolean isBounded(double val, double lower, double upper) { @@ -197,7 +280,7 @@ public static boolean isBounded(double val, double lower, double upper) { } public static boolean isPositive(double val) { - return ! isNegativeOrZero(val); + return !isNegativeOrZero(val); } public static boolean isPositiveOrZero(double val) { @@ -209,17 +292,19 @@ public static boolean isNegativeOrZero(double val) { } public static boolean isNegative(double val) { - return ! isPositiveOrZero(val); + return !isPositiveOrZero(val); } /** * Compares double values for equality (within 1e-6), or inequality. * - * @param a the first double value - * @param b the second double value - * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a. + * @param a the first double value + * @param b the second double value + * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a. */ - public static byte compareDoubles(double a, double b) { return compareDoubles(a, b, 1e-6); } + public static byte compareDoubles(double a, double b) { + return compareDoubles(a, b, 1e-6); + } /** * Compares double values for equality (within epsilon), or inequality. @@ -227,23 +312,28 @@ public static boolean isNegative(double val) { * @param a the first double value * @param b the second double value * @param epsilon the precision within which two double values will be considered equal - * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a. + * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a. */ - public static byte compareDoubles(double a, double b, double epsilon) - { - if (Math.abs(a - b) < epsilon) { return 0; } - if (a > b) { return -1; } + public static byte compareDoubles(double a, double b, double epsilon) { + if (Math.abs(a - b) < epsilon) { + return 0; + } + if (a > b) { + return -1; + } return 1; } /** * Compares float values for equality (within 1e-6), or inequality. * - * @param a the first float value - * @param b the second float value - * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a. + * @param a the first float value + * @param b the second float value + * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a. */ - public static byte compareFloats(float a, float b) { return compareFloats(a, b, 1e-6f); } + public static byte compareFloats(float a, float b) { + return compareFloats(a, b, 1e-6f); + } /** * Compares float values for equality (within epsilon), or inequality. 
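The Jacobian-table trick behind approximateLog10SumLog10 above is easy to check in isolation. A minimal, self-contained sketch follows; the step and tolerance constants mirror the patch, but the class and method names are illustrative only, and the patch's extra short-circuit for Double.NEGATIVE_INFINITY inputs is noted rather than reproduced:

// Approximates log10(10^a + 10^b) without leaving log space:
// max(a,b) + log10(1 + 10^-|a-b|), with the correction term precomputed in a table.
public class JacobianLogDemo {
    private static final double STEP = 0.001;          // JACOBIAN_LOG_TABLE_STEP
    private static final double MAX_TOLERANCE = 10.0;  // MAX_JACOBIAN_TOLERANCE
    private static final double[] TABLE = new double[(int) (MAX_TOLERANCE / STEP) + 1];

    static {
        for (int k = 0; k < TABLE.length; k++)
            TABLE[k] = Math.log10(1.0 + Math.pow(10.0, -((double) k) * STEP));
    }

    // (the patch additionally short-circuits when either input is Double.NEGATIVE_INFINITY)
    static double approximateLog10Sum(double big, double small) {
        if (small > big) { double t = big; big = small; small = t; }
        final double diff = big - small;
        if (diff >= MAX_TOLERANCE)
            return big;                                // smaller term is negligible
        return big + TABLE[(int) (diff / STEP + 0.5)]; // hard rounding into the table
    }

    public static void main(String[] args) {
        double exact = Math.log10(Math.pow(10.0, -3.0) + Math.pow(10.0, -3.7));
        System.out.printf("exact=%.6f approx=%.6f%n", exact, approximateLog10Sum(-3.0, -3.7));
        // the two values agree to several decimal places
    }
}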
@@ -251,47 +341,50 @@ public static byte compareDoubles(double a, double b, double epsilon) * @param a the first float value * @param b the second float value * @param epsilon the precision within which two float values will be considered equal - * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a. + * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a. */ - public static byte compareFloats(float a, float b, float epsilon) - { - if (Math.abs(a - b) < epsilon) { return 0; } - if (a > b) { return -1; } + public static byte compareFloats(float a, float b, float epsilon) { + if (Math.abs(a - b) < epsilon) { + return 0; + } + if (a > b) { + return -1; + } return 1; } - public static double NormalDistribution(double mean, double sd, double x) - { - double a = 1.0 / (sd*Math.sqrt(2.0 * Math.PI)); - double b = Math.exp(-1.0 * (Math.pow(x - mean,2.0)/(2.0 * sd * sd))); + public static double NormalDistribution(double mean, double sd, double x) { + double a = 1.0 / (sd * Math.sqrt(2.0 * Math.PI)); + double b = Math.exp(-1.0 * (Math.pow(x - mean, 2.0) / (2.0 * sd * sd))); return a * b; } - public static double binomialCoefficient (int n, int k) { + public static double binomialCoefficient(int n, int k) { return Math.pow(10, log10BinomialCoefficient(n, k)); } + /** * Computes a binomial probability. This is computed using the formula - * - * B(k; n; p) = [ n! / ( k! (n - k)! ) ] (p^k)( (1-p)^k ) - * + *
+ * <p/>
+ * B(k; n; p) = [ n! / ( k! (n - k)! ) ] (p^k)( (1-p)^(n - k) )
+ * <p/>
* where n is the number of trials, k is the number of successes, and p is the probability of success * - * @param n number of Bernoulli trials - * @param k number of successes - * @param p probability of success - * - * @return the binomial probability of the specified configuration. Computes values down to about 1e-237. + * @param n number of Bernoulli trials + * @param k number of successes + * @param p probability of success + * @return the binomial probability of the specified configuration. Computes values down to about 1e-237. */ - public static double binomialProbability (int n, int k, double p) { + public static double binomialProbability(int n, int k, double p) { return Math.pow(10, log10BinomialProbability(n, k, Math.log10(p))); } /** * Performs the cumulative sum of binomial probabilities, where the probability calculation is done in log space. - * @param start - start of the cumulant sum (over hits) - * @param end - end of the cumulant sum (over hits) - * @param total - number of attempts for the number of hits + * + * @param start - start of the cumulant sum (over hits) + * @param end - end of the cumulant sum (over hits) + * @param total - number of attempts for the number of hits * @param probHit - probability of a successful hit * @return - returns the cumulative probability */ @@ -300,11 +393,11 @@ public static double binomialCumulativeProbability(int start, int end, int total double prevProb; BigDecimal probCache = BigDecimal.ZERO; - for(int hits = start; hits < end; hits++) { + for (int hits = start; hits < end; hits++) { prevProb = cumProb; double probability = binomialProbability(total, hits, probHit); cumProb += probability; - if ( probability > 0 && cumProb - prevProb < probability/2 ) { // loss of precision + if (probability > 0 && cumProb - prevProb < probability / 2) { // loss of precision probCache = probCache.add(new BigDecimal(prevProb)); cumProb = 0.0; hits--; // repeat loop @@ -314,20 +407,20 @@ public static double binomialCumulativeProbability(int start, int end, int total return probCache.add(new BigDecimal(cumProb)).doubleValue(); } - + /** * Computes a multinomial coefficient efficiently avoiding overflow even for large numbers. * This is computed using the formula: - * - * M(x1,x2,...,xk; n) = [ n! / (x1! x2! ... xk!) ] - * + *
<p/>
+ * M(x1,x2,...,xk; n) = [ n! / (x1! x2! ... xk!) ]
+ * <p/>
* where xi represents the number of times outcome i was observed, n is the number of total observations. * In this implementation, the value of n is inferred as the sum over i of xi. * - * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed - * @return the multinomial of the specified configuration. + * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed + * @return the multinomial of the specified configuration. */ - public static double multinomialCoefficient (int [] k) { + public static double multinomialCoefficient(int[] k) { int n = 0; for (int xi : k) { n += xi; @@ -339,37 +432,38 @@ public static double multinomialCoefficient (int [] k) { /** * Computes a multinomial probability efficiently avoiding overflow even for large numbers. * This is computed using the formula: - * - * M(x1,x2,...,xk; n; p1,p2,...,pk) = [ n! / (x1! x2! ... xk!) ] (p1^x1)(p2^x2)(...)(pk^xk) - * + *
<p/>
+ * M(x1,x2,...,xk; n; p1,p2,...,pk) = [ n! / (x1! x2! ... xk!) ] (p1^x1)(p2^x2)(...)(pk^xk)
+ * <p/>
* where xi represents the number of times outcome i was observed, n is the number of total observations, and * pi represents the probability of the i-th outcome to occur. In this implementation, the value of n is * inferred as the sum over i of xi. * - * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed - * @param p a double[] of probabilities, where each element represents the probability a given outcome can occur - * @return the multinomial probability of the specified configuration. + * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed + * @param p a double[] of probabilities, where each element represents the probability a given outcome can occur + * @return the multinomial probability of the specified configuration. */ - public static double multinomialProbability (int[] k, double[] p) { + public static double multinomialProbability(int[] k, double[] p) { if (p.length != k.length) throw new UserException.BadArgumentValue("p and k", "Array of log10 probabilities must have the same size as the array of number of sucesses: " + p.length + ", " + k.length); int n = 0; - double [] log10P = new double[p.length]; - for (int i=0; i l) { double rms = 0.0; for (int i : l) - rms += i*i; + rms += i * i; rms /= l.size(); return Math.sqrt(rms); } - public static double distanceSquared( final double[] x, final double[] y ) { + public static double distanceSquared(final double[] x, final double[] y) { double dist = 0.0; - for(int iii = 0; iii < x.length; iii++) { + for (int iii = 0; iii < x.length; iii++) { dist += (x[iii] - y[iii]) * (x[iii] - y[iii]); } return dist; } public static double round(double num, int digits) { - double result = num * Math.pow(10.0, (double)digits); + double result = num * Math.pow(10.0, (double) digits); result = Math.round(result); - result = result / Math.pow(10.0, (double)digits); + result = result / Math.pow(10.0, (double) digits); return result; } - /** * normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE). * - * @param array the array to be normalized + * @param array the array to be normalized * @param takeLog10OfOutput if true, the output will be transformed back into log10 units - * * @return a newly allocated array corresponding the normalized values in array, maybe log10 transformed - */ + */ public static double[] normalizeFromLog10(double[] array, boolean takeLog10OfOutput) { return normalizeFromLog10(array, takeLog10OfOutput, false); } @@ -457,7 +550,7 @@ public static double[] normalizeFromLog10(double[] array, boolean takeLog10OfOut // all negative) the largest value; also, we need to convert to normal-space. 
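// A small worked example of the step below (illustration only): for array = {-1001.0,
// -1002.0, -1003.0}, computing 10^x directly would underflow to 0.0 in double precision;
// subtracting the max first gives {0.0, -1.0, -2.0} -> {1.0, 0.1, 0.01} in linear space,
// which normalizes to roughly {0.901, 0.090, 0.009}.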
double maxValue = Utils.findMaxEntry(array); - // we may decide to just normalize in log space with converting to linear space + // we may decide to just normalize in log space without converting to linear space if (keepInLogSpace) { for (int i = 0; i < array.length; i++) array[i] -= maxValue; @@ -476,67 +569,52 @@ public static double[] normalizeFromLog10(double[] array, boolean takeLog10OfOut sum += normalized[i]; for (int i = 0; i < array.length; i++) { double x = normalized[i] / sum; - if ( takeLog10OfOutput ) x = Math.log10(x); + if (takeLog10OfOutput) + x = Math.log10(x); normalized[i] = x; } return normalized; } - public static double[] normalizeFromLog10(List array, boolean takeLog10OfOutput) { - double[] normalized = new double[array.size()]; - - // for precision purposes, we need to add (or really subtract, since they're - // all negative) the largest value; also, we need to convert to normal-space. - double maxValue = MathUtils.arrayMaxDouble( array ); - for (int i = 0; i < array.size(); i++) - normalized[i] = Math.pow(10, array.get(i) - maxValue); - - // normalize - double sum = 0.0; - for (int i = 0; i < array.size(); i++) - sum += normalized[i]; - for (int i = 0; i < array.size(); i++) { - double x = normalized[i] / sum; - if ( takeLog10OfOutput ) x = Math.log10(x); - normalized[i] = x; - } - - return normalized; - } /** * normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE). * - * @param array the array to be normalized - * + * @param array the array to be normalized * @return a newly allocated array corresponding the normalized values in array - */ + */ public static double[] normalizeFromLog10(double[] array) { return normalizeFromLog10(array, false); } - public static double[] normalizeFromLog10(List array) { - return normalizeFromLog10(array, false); + public static int maxElementIndex(final double[] array) { + return maxElementIndex(array, array.length); } - public static int maxElementIndex(double[] array) { - if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); + public static int maxElementIndex(final double[] array, final int endIndex) { + if (array == null) + throw new IllegalArgumentException("Array cannot be null!"); int maxI = -1; - for ( int i = 0; i < array.length; i++ ) { - if ( maxI == -1 || array[i] > array[maxI] ) + for (int i = 0; i < endIndex; i++) { + if (maxI == -1 || array[i] > array[maxI]) maxI = i; } return maxI; } - public static int maxElementIndex(int[] array) { - if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); + public static int maxElementIndex(final int[] array) { + return maxElementIndex(array, array.length); + } + + public static int maxElementIndex(final int[] array, int endIndex) { + if (array == null) + throw new IllegalArgumentException("Array cannot be null!"); int maxI = -1; - for ( int i = 0; i < array.length; i++ ) { - if ( maxI == -1 || array[i] > array[maxI] ) + for (int i = 0; i < endIndex; i++) { + if (maxI == -1 || array[i] > array[maxI]) maxI = i; } @@ -551,16 +629,21 @@ public static double arrayMin(double[] array) { return array[minElementIndex(array)]; } + public static int arrayMin(int[] array) { + return array[minElementIndex(array)]; + } + public static byte arrayMin(byte[] array) { return array[minElementIndex(array)]; } public static int minElementIndex(double[] array) { - if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); + if (array == null) + throw new IllegalArgumentException("Array 
cannot be null!"); int minI = -1; - for ( int i = 0; i < array.length; i++ ) { - if ( minI == -1 || array[i] < array[minI] ) + for (int i = 0; i < array.length; i++) { + if (minI == -1 || array[i] < array[minI]) minI = i; } @@ -568,32 +651,52 @@ public static int minElementIndex(double[] array) { } public static int minElementIndex(byte[] array) { - if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); + if (array == null) + throw new IllegalArgumentException("Array cannot be null!"); int minI = -1; - for ( int i = 0; i < array.length; i++ ) { - if ( minI == -1 || array[i] < array[minI] ) + for (int i = 0; i < array.length; i++) { + if (minI == -1 || array[i] < array[minI]) minI = i; } return minI; - } + } + + public static int minElementIndex(int[] array) { + if (array == null) + throw new IllegalArgumentException("Array cannot be null!"); + + int minI = -1; + for (int i = 0; i < array.length; i++) { + if (minI == -1 || array[i] < array[minI]) + minI = i; + } + + return minI; + } public static int arrayMaxInt(List array) { - if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); - if ( array.size() == 0 ) throw new IllegalArgumentException("Array size cannot be 0!"); + if (array == null) + throw new IllegalArgumentException("Array cannot be null!"); + if (array.size() == 0) + throw new IllegalArgumentException("Array size cannot be 0!"); int m = array.get(0); - for ( int e : array ) m = Math.max(m, e); + for (int e : array) + m = Math.max(m, e); return m; } public static double arrayMaxDouble(List array) { - if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); - if ( array.size() == 0 ) throw new IllegalArgumentException("Array size cannot be 0!"); + if (array == null) + throw new IllegalArgumentException("Array cannot be null!"); + if (array.size() == 0) + throw new IllegalArgumentException("Array size cannot be 0!"); double m = array.get(0); - for ( double e : array ) m = Math.max(m, e); + for (double e : array) + m = Math.max(m, e); return m; } @@ -631,12 +734,19 @@ public static double average(List vals) { return average(vals, vals.size()); } + public static double average(int[] x) { + int sum = 0; + for (int v : x) + sum += v; + return (double) sum / x.length; + } + public static byte average(byte[] vals) { int sum = 0; for (byte v : vals) { sum += v; } - return (byte) Math.floor(sum/vals.length); + return (byte) Math.floor(sum / vals.length); } public static double averageDouble(List vals) { @@ -707,7 +817,6 @@ public int compare(Integer a, Integer b) { return permutation; } - public static int[] permuteArray(int[] array, Integer[] permutation) { int[] output = new int[array.length]; for (int i = 0; i < output.length; i++) { @@ -748,8 +857,9 @@ public static List permuteList(List list, Integer[] permutation) { return output; } - - /** Draw N random elements from list. */ + /** + * Draw N random elements from list. + */ public static List randomSubset(List list, int N) { if (list.size() <= N) { return list; @@ -770,6 +880,25 @@ public static List randomSubset(List list, int N) { return ans; } + /** + * Draw N random elements from an array. 
+ * + * @param array your objects + * @param n number of elements to select at random from the list + * @return a new list with the N randomly chosen elements from list + */ + @Requires({"array != null", "n>=0"}) + @Ensures({"result != null", "result.length == Math.min(n, array.length)"}) + public static Object[] randomSubset(final Object[] array, final int n) { + if (array.length <= n) + return array.clone(); + + Object[] shuffledArray = arrayShuffle(array); + Object[] result = new Object[n]; + System.arraycopy(shuffledArray, 0, result, 0, n); + return result; + } + public static double percentage(double x, double base) { return (base > 0 ? (x / base) * 100.0 : 0); } @@ -793,13 +922,14 @@ public static int countOccurrences(char c, String s) { public static int countOccurrences(T x, List l) { int count = 0; for (T y : l) { - if (x.equals(y)) count++; + if (x.equals(y)) + count++; } return count; } - public static int countOccurrences(byte element, byte [] array) { + public static int countOccurrences(byte element, byte[] array) { int count = 0; for (byte y : array) { if (element == y) @@ -814,13 +944,13 @@ public static int countOccurrences(byte element, byte [] array) { * Better than sorting if N (number of elements to return) is small * * @param array the array - * @param n number of top elements to return + * @param n number of top elements to return * @return the n larger elements of the array */ - public static Collection getNMaxElements(double [] array, int n) { + public static Collection getNMaxElements(double[] array, int n) { ArrayList maxN = new ArrayList(n); double lastMax = Double.MAX_VALUE; - for (int i=0; i getNMaxElements(double [] array, int n) { */ static public ArrayList sampleIndicesWithReplacement(int n, int k) { - ArrayList chosen_balls = new ArrayList (k); - for (int i=0; i< k; i++) { + ArrayList chosen_balls = new ArrayList(k); + for (int i = 0; i < k; i++) { //Integer chosen_ball = balls[rand.nextInt(k)]; chosen_balls.add(GenomeAnalysisEngine.getRandomGenerator().nextInt(n)); //balls.remove(chosen_ball); @@ -872,11 +1002,11 @@ static public ArrayList sampleIndicesWithoutReplacement(int n, int k) { /** * Given a list of indices into a list, return those elements of the list with the possibility of drawing list elements multiple times - - * @param indices the list of indices for elements to extract - * @param list the list from which the elements should be extracted - * @param the template type of the ArrayList - * @return a new ArrayList consisting of the elements at the specified indices + * + * @param indices the list of indices for elements to extract + * @param list the list from which the elements should be extracted + * @param the template type of the ArrayList + * @return a new ArrayList consisting of the elements at the specified indices */ static public ArrayList sliceListByIndices(List indices, List list) { ArrayList subset = new ArrayList(); @@ -898,27 +1028,28 @@ public static Comparable orderStatisticSearch(int orderStat, List li ArrayList equalToX = new ArrayList(); ArrayList greaterThanX = new ArrayList(); - for(Comparable y : list) { - if(x.compareTo(y) > 0) { + for (Comparable y : list) { + if (x.compareTo(y) > 0) { lessThanX.add(y); - } else if(x.compareTo(y) < 0) { + } + else if (x.compareTo(y) < 0) { greaterThanX.add(y); - } else + } + else equalToX.add(y); } - if(lessThanX.size() > orderStat) + if (lessThanX.size() > orderStat) return orderStatisticSearch(orderStat, lessThanX); - else if(lessThanX.size() + equalToX.size() >= orderStat) + else if 
(lessThanX.size() + equalToX.size() >= orderStat) return orderStat; else return orderStatisticSearch(orderStat - lessThanX.size() - equalToX.size(), greaterThanX); } - public static Object getMedian(List list) { - return orderStatisticSearch((int) Math.ceil(list.size()/2), list); + return orderStatisticSearch((int) Math.ceil(list.size() / 2), list); } public static byte getQScoreOrderStatistic(List reads, List offsets, int k) { @@ -926,7 +1057,7 @@ public static byte getQScoreOrderStatistic(List reads, List // list index maps to a q-score only through the offset index // returns the kth-largest q-score. - if( reads.size() == 0) { + if (reads.size() == 0) { return 0; } @@ -938,25 +1069,27 @@ public static byte getQScoreOrderStatistic(List reads, List final byte qk = reads.get(k).getBaseQualities()[offsets.get(k)]; - for(int iter = 0; iter < reads.size(); iter ++) { + for (int iter = 0; iter < reads.size(); iter++) { SAMRecord read = reads.get(iter); int offset = offsets.get(iter); byte quality = read.getBaseQualities()[offset]; - if(quality < qk) { + if (quality < qk) { lessThanQReads.add(read); lessThanQOffsets.add(offset); - } else if(quality > qk) { + } + else if (quality > qk) { greaterThanQReads.add(read); greaterThanQOffsets.add(offset); - } else { + } + else { equalToQReads.add(reads.get(iter)); } } - if(lessThanQReads.size() > k) + if (lessThanQReads.size() > k) return getQScoreOrderStatistic(lessThanQReads, lessThanQOffsets, k); - else if(equalToQReads.size() + lessThanQReads.size() >= k) + else if (equalToQReads.size() + lessThanQReads.size() >= k) return qk; else return getQScoreOrderStatistic(greaterThanQReads, greaterThanQOffsets, k - lessThanQReads.size() - equalToQReads.size()); @@ -964,10 +1097,18 @@ else if(equalToQReads.size() + lessThanQReads.size() >= k) } public static byte getQScoreMedian(List reads, List offsets) { - return getQScoreOrderStatistic(reads, offsets, (int)Math.floor(reads.size()/2.)); + return getQScoreOrderStatistic(reads, offsets, (int) Math.floor(reads.size() / 2.)); } - /** A utility class that computes on the fly average and standard deviation for a stream of numbers. + public static long sum(Collection x) { + long sum = 0; + for (int v : x) + sum += v; + return sum; + } + + /** + * A utility class that computes on the fly average and standard deviation for a stream of numbers. * The number of observations does not have to be known in advance, and can be also very big (so that * it could overflow any naive summation-based scheme or cause loss of precision). 
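The RunningAverage class described above implements Welford's online algorithm, which updates the mean and the sum of squared deviations one observation at a time. A minimal standalone sketch of the same update (variable names illustrative, not part of the patch):

    // After each observation x: mean_n = mean_{n-1} + (x - mean_{n-1}) / n,
    // s_n = s_{n-1} + (x - mean_{n-1}) * (x - mean_n); sample variance = s_n / (n - 1).
    long n = 0;
    double mean = 0.0, s = 0.0;
    for (double x : new double[]{2.0, 4.0, 6.0}) {
        n++;
        final double oldMean = mean;
        mean += (x - mean) / n;
        s += (x - oldMean) * (x - mean);
    }
    // mean == 4.0, s == 8.0, so var == s / (n - 1) == 4.0 and stddev == 2.0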
* Instead, adding a new number observed @@ -983,20 +1124,31 @@ public static class RunningAverage { public void add(double obs) { obs_count++; double oldMean = mean; - mean += ( obs - mean ) / obs_count; // update mean - s += ( obs - oldMean ) * ( obs - mean ); + mean += (obs - mean) / obs_count; // update mean + s += (obs - oldMean) * (obs - mean); } public void addAll(Collection col) { - for ( Number o : col ) { + for (Number o : col) { add(o.doubleValue()); } } - public double mean() { return mean; } - public double stddev() { return Math.sqrt(s/(obs_count - 1)); } - public double var() { return s/(obs_count - 1); } - public long observationCount() { return obs_count; } + public double mean() { + return mean; + } + + public double stddev() { + return Math.sqrt(s / (obs_count - 1)); + } + + public double var() { + return s / (obs_count - 1); + } + + public long observationCount() { + return obs_count; + } public RunningAverage clone() { RunningAverage ra = new RunningAverage(); @@ -1007,296 +1159,213 @@ public RunningAverage clone() { } public void merge(RunningAverage other) { - if ( this.obs_count > 0 || other.obs_count > 0 ) { // if we have any observations at all - this.mean = ( this.mean * this.obs_count + other.mean * other.obs_count ) / ( this.obs_count + other.obs_count ); + if (this.obs_count > 0 || other.obs_count > 0) { // if we have any observations at all + this.mean = (this.mean * this.obs_count + other.mean * other.obs_count) / (this.obs_count + other.obs_count); this.s += other.s; } this.obs_count += other.obs_count; } } - + // // useful common utility routines // - public static double rate(long n, long d) { return n / (1.0 * Math.max(d, 1)); } - public static double rate(int n, int d) { return n / (1.0 * Math.max(d, 1)); } - - public static long inverseRate(long n, long d) { return n == 0 ? 0 : d / Math.max(n, 1); } - public static long inverseRate(int n, int d) { return n == 0 ? 
0 : d / Math.max(n, 1); } - - public static double ratio(int num, int denom) { return ((double)num) / (Math.max(denom, 1)); } - public static double ratio(long num, long denom) { return ((double)num) / (Math.max(denom, 1)); } - - public static final double[] log10Cache; - public static final double[] jacobianLogTable; - public static final int JACOBIAN_LOG_TABLE_SIZE = 101; - public static final double JACOBIAN_LOG_TABLE_STEP = 0.1; - public static final double INV_JACOBIAN_LOG_TABLE_STEP = 1.0/JACOBIAN_LOG_TABLE_STEP; - public static final double MAX_JACOBIAN_TOLERANCE = 10.0; - private static final int MAXN = 11000; - private static final int LOG10_CACHE_SIZE = 4 * MAXN; // we need to be able to go up to 2*(2N) when calculating some of the coefficients - - static { - log10Cache = new double[LOG10_CACHE_SIZE]; - jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE]; - - log10Cache[0] = Double.NEGATIVE_INFINITY; - for (int k=1; k < LOG10_CACHE_SIZE; k++) - log10Cache[k] = Math.log10(k); - - for (int k=0; k < JACOBIAN_LOG_TABLE_SIZE; k++) { - jacobianLogTable[k] = Math.log10(1.0+Math.pow(10.0,-((double)k) - * JACOBIAN_LOG_TABLE_STEP)); - - } + public static double rate(long n, long d) { + return n / (1.0 * Math.max(d, 1)); } - static public double softMax(final double[] vec) { - double acc = vec[0]; - for (int k=1; k < vec.length; k++) - acc = softMax(acc,vec[k]); - - return acc; - + public static double rate(int n, int d) { + return n / (1.0 * Math.max(d, 1)); } - static public double max(double x0, double x1, double x2) { - double a = Math.max(x0,x1); - return Math.max(a,x2); + public static long inverseRate(long n, long d) { + return n == 0 ? 0 : d / Math.max(n, 1); } - - static public double softMax(final double x0, final double x1, final double x2) { - // compute naively log10(10^x[0] + 10^x[1]+...) - // return Math.log10(MathUtils.sumLog10(vec)); - // better approximation: do Jacobian logarithm function on data pairs - double a = softMax(x0,x1); - return softMax(a,x2); + public static long inverseRate(int n, int d) { + return n == 0 ? 
0 : d / Math.max(n, 1); } - static public double softMax(final double x, final double y) { - // we need to compute log10(10^x + 10^y) - // By Jacobian logarithm identity, this is equal to - // max(x,y) + log10(1+10^-abs(x-y)) - // we compute the second term as a table lookup - // with integer quantization - - // slow exact version: - // return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y)); + public static double ratio(int num, int denom) { + return ((double) num) / (Math.max(denom, 1)); + } - double diff = x-y; + public static double ratio(long num, long denom) { + return ((double) num) / (Math.max(denom, 1)); + } - if (diff > MAX_JACOBIAN_TOLERANCE) - return x; - else if (diff < -MAX_JACOBIAN_TOLERANCE) - return y; - else if (diff >= 0) { - int ind = (int)(diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5); - return x + jacobianLogTable[ind]; - } - else { - int ind = (int)(-diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5); - return y + jacobianLogTable[ind]; - } + static public double max(double x0, double x1, double x2) { + double a = Math.max(x0, x1); + return Math.max(a, x2); } - public static double phredScaleToProbability (byte q) { - return Math.pow(10,(-q)/10.0); + public static double phredScaleToProbability(byte q) { + return Math.pow(10, (-q) / 10.0); } - public static double phredScaleToLog10Probability (byte q) { - return ((-q)/10.0); + public static double phredScaleToLog10Probability(byte q) { + return ((-q) / 10.0); } /** * Returns the phred scaled value of probability p + * * @param p probability (between 0 and 1). * @return phred scaled probability of p */ - public static byte probabilityToPhredScale (double p) { + public static byte probabilityToPhredScale(double p) { return (byte) ((-10) * Math.log10(p)); } - public static double log10ProbabilityToPhredScale (double log10p) { + public static double log10ProbabilityToPhredScale(double log10p) { return (-10) * log10p; } /** * Converts LN to LOG10 + * * @param ln log(x) * @return log10(x) */ - public static double lnToLog10 (double ln) { + public static double lnToLog10(double ln) { return ln * Math.log10(Math.exp(1)); } /** * Constants to simplify the log gamma function calculation. 
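The softMax methods above compute log10(10^x + 10^y) without leaving log space, via the Jacobian logarithm identity max(x, y) + log10(1 + 10^-|x - y|), approximating the second term with a jacobianLogTable lookup (bins of JACOBIAN_LOG_TABLE_STEP, short-circuited once |x - y| exceeds MAX_JACOBIAN_TOLERANCE). A quick check of the identity against the slow exact form mentioned in the comments:

    double x = -3.2, y = -4.7;
    double exact = Math.log10(Math.pow(10.0, x) + Math.pow(10.0, y));
    double viaIdentity = Math.max(x, y) + Math.log10(1.0 + Math.pow(10.0, -Math.abs(x - y)));
    // exact == viaIdentity to machine precision; the table lookup trades a little
    // accuracy (0.1-wide bins) for avoiding both pow() calls on the hot path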
*/ - private static final double - zero = 0.0, - one = 1.0, - half = .5, - a0 = 7.72156649015328655494e-02, - a1 = 3.22467033424113591611e-01, - a2 = 6.73523010531292681824e-02, - a3 = 2.05808084325167332806e-02, - a4 = 7.38555086081402883957e-03, - a5 = 2.89051383673415629091e-03, - a6 = 1.19270763183362067845e-03, - a7 = 5.10069792153511336608e-04, - a8 = 2.20862790713908385557e-04, - a9 = 1.08011567247583939954e-04, - a10 = 2.52144565451257326939e-05, - a11 = 4.48640949618915160150e-05, - tc = 1.46163214496836224576e+00, - tf = -1.21486290535849611461e-01, - tt = -3.63867699703950536541e-18, - t0 = 4.83836122723810047042e-01, - t1 = -1.47587722994593911752e-01, - t2 = 6.46249402391333854778e-02, - t3 = -3.27885410759859649565e-02, - t4 = 1.79706750811820387126e-02, - t5 = -1.03142241298341437450e-02, - t6 = 6.10053870246291332635e-03, - t7 = -3.68452016781138256760e-03, - t8 = 2.25964780900612472250e-03, - t9 = -1.40346469989232843813e-03, - t10 = 8.81081882437654011382e-04, - t11 = -5.38595305356740546715e-04, - t12 = 3.15632070903625950361e-04, - t13 = -3.12754168375120860518e-04, - t14 = 3.35529192635519073543e-04, - u0 = -7.72156649015328655494e-02, - u1 = 6.32827064025093366517e-01, - u2 = 1.45492250137234768737e+00, - u3 = 9.77717527963372745603e-01, - u4 = 2.28963728064692451092e-01, - u5 = 1.33810918536787660377e-02, - v1 = 2.45597793713041134822e+00, - v2 = 2.12848976379893395361e+00, - v3 = 7.69285150456672783825e-01, - v4 = 1.04222645593369134254e-01, - v5 = 3.21709242282423911810e-03, - s0 = -7.72156649015328655494e-02, - s1 = 2.14982415960608852501e-01, - s2 = 3.25778796408930981787e-01, - s3 = 1.46350472652464452805e-01, - s4 = 2.66422703033638609560e-02, - s5 = 1.84028451407337715652e-03, - s6 = 3.19475326584100867617e-05, - r1 = 1.39200533467621045958e+00, - r2 = 7.21935547567138069525e-01, - r3 = 1.71933865632803078993e-01, - r4 = 1.86459191715652901344e-02, - r5 = 7.77942496381893596434e-04, - r6 = 7.32668430744625636189e-06, - w0 = 4.18938533204672725052e-01, - w1 = 8.33333333333329678849e-02, - w2 = -2.77777777728775536470e-03, - w3 = 7.93650558643019558500e-04, - w4 = -5.95187557450339963135e-04, - w5 = 8.36339918996282139126e-04, - w6 = -1.63092934096575273989e-03; + private static final double zero = 0.0, one = 1.0, half = .5, a0 = 7.72156649015328655494e-02, a1 = 3.22467033424113591611e-01, a2 = 6.73523010531292681824e-02, a3 = 2.05808084325167332806e-02, a4 = 7.38555086081402883957e-03, a5 = 2.89051383673415629091e-03, a6 = 1.19270763183362067845e-03, a7 = 5.10069792153511336608e-04, a8 = 2.20862790713908385557e-04, a9 = 1.08011567247583939954e-04, a10 = 2.52144565451257326939e-05, a11 = 4.48640949618915160150e-05, tc = 1.46163214496836224576e+00, tf = -1.21486290535849611461e-01, tt = -3.63867699703950536541e-18, t0 = 4.83836122723810047042e-01, t1 = -1.47587722994593911752e-01, t2 = 6.46249402391333854778e-02, t3 = -3.27885410759859649565e-02, t4 = 1.79706750811820387126e-02, t5 = -1.03142241298341437450e-02, t6 = 6.10053870246291332635e-03, t7 = -3.68452016781138256760e-03, t8 = 2.25964780900612472250e-03, t9 = -1.40346469989232843813e-03, t10 = 8.81081882437654011382e-04, t11 = -5.38595305356740546715e-04, t12 = 3.15632070903625950361e-04, t13 = -3.12754168375120860518e-04, t14 = 3.35529192635519073543e-04, u0 = -7.72156649015328655494e-02, u1 = 6.32827064025093366517e-01, u2 = 1.45492250137234768737e+00, u3 = 9.77717527963372745603e-01, u4 = 2.28963728064692451092e-01, u5 = 1.33810918536787660377e-02, v1 = 2.45597793713041134822e+00, v2 = 
2.12848976379893395361e+00, v3 = 7.69285150456672783825e-01, v4 = 1.04222645593369134254e-01, v5 = 3.21709242282423911810e-03, s0 = -7.72156649015328655494e-02, s1 = 2.14982415960608852501e-01, s2 = 3.25778796408930981787e-01, s3 = 1.46350472652464452805e-01, s4 = 2.66422703033638609560e-02, s5 = 1.84028451407337715652e-03, s6 = 3.19475326584100867617e-05, r1 = 1.39200533467621045958e+00, r2 = 7.21935547567138069525e-01, r3 = 1.71933865632803078993e-01, r4 = 1.86459191715652901344e-02, r5 = 7.77942496381893596434e-04, r6 = 7.32668430744625636189e-06, w0 = 4.18938533204672725052e-01, w1 = 8.33333333333329678849e-02, w2 = -2.77777777728775536470e-03, w3 = 7.93650558643019558500e-04, w4 = -5.95187557450339963135e-04, w5 = 8.36339918996282139126e-04, w6 = -1.63092934096575273989e-03; /** * Efficient rounding functions to simplify the log gamma function calculation - * double to long with 32 bit shift + * double to long with 32 bit shift */ - private static final int HI (double x) { - return (int)(Double.doubleToLongBits(x) >> 32); + private static final int HI(double x) { + return (int) (Double.doubleToLongBits(x) >> 32); } /** * Efficient rounding functions to simplify the log gamma function calculation - * double to long without shift + * double to long without shift */ - private static final int LO (double x) { - return (int)Double.doubleToLongBits(x); + private static final int LO(double x) { + return (int) Double.doubleToLongBits(x); } /** * Most efficent implementation of the lnGamma (FDLIBM) * Use via the log10Gamma wrapper method. */ - private static double lnGamma (double x) { - double t,y,z,p,p1,p2,p3,q,r,w; + private static double lnGamma(double x) { + double t, y, z, p, p1, p2, p3, q, r, w; int i; int hx = HI(x); int lx = LO(x); /* purge off +-inf, NaN, +-0, and negative arguments */ - int ix = hx&0x7fffffff; - if (ix >= 0x7ff00000) return Double.POSITIVE_INFINITY; - if ((ix|lx)==0 || hx < 0) return Double.NaN; - if (ix<0x3b900000) { /* |x|<2**-70, return -log(|x|) */ + int ix = hx & 0x7fffffff; + if (ix >= 0x7ff00000) + return Double.POSITIVE_INFINITY; + if ((ix | lx) == 0 || hx < 0) + return Double.NaN; + if (ix < 0x3b900000) { /* |x|<2**-70, return -log(|x|) */ return -Math.log(x); } /* purge off 1 and 2 */ - if((((ix-0x3ff00000)|lx)==0)||(((ix-0x40000000)|lx)==0)) r = 0; - /* for x < 2.0 */ - else if(ix<0x40000000) { - if(ix<=0x3feccccc) { /* lgamma(x) = lgamma(x+1)-log(x) */ + if ((((ix - 0x3ff00000) | lx) == 0) || (((ix - 0x40000000) | lx) == 0)) + r = 0; + /* for x < 2.0 */ + else if (ix < 0x40000000) { + if (ix <= 0x3feccccc) { /* lgamma(x) = lgamma(x+1)-log(x) */ r = -Math.log(x); - if(ix>=0x3FE76944) {y = one-x; i= 0;} - else if(ix>=0x3FCDA661) {y= x-(tc-one); i=1;} - else {y = x; i=2;} - } else { + if (ix >= 0x3FE76944) { + y = one - x; + i = 0; + } + else if (ix >= 0x3FCDA661) { + y = x - (tc - one); + i = 1; + } + else { + y = x; + i = 2; + } + } + else { r = zero; - if(ix>=0x3FFBB4C3) {y=2.0-x;i=0;} /* [1.7316,2] */ - else if(ix>=0x3FF3B4C4) {y=x-tc;i=1;} /* [1.23,1.73] */ - else {y=x-one;i=2;} + if (ix >= 0x3FFBB4C3) { + y = 2.0 - x; + i = 0; + } /* [1.7316,2] */ + else if (ix >= 0x3FF3B4C4) { + y = x - tc; + i = 1; + } /* [1.23,1.73] */ + else { + y = x - one; + i = 2; + } } - switch(i) { - case 0: - z = y*y; - p1 = a0+z*(a2+z*(a4+z*(a6+z*(a8+z*a10)))); - p2 = z*(a1+z*(a3+z*(a5+z*(a7+z*(a9+z*a11))))); - p = y*p1+p2; - r += (p-0.5*y); break; - case 1: - z = y*y; - w = z*y; - p1 = t0+w*(t3+w*(t6+w*(t9 +w*t12))); /* parallel comp */ - p2 = 
t1+w*(t4+w*(t7+w*(t10+w*t13))); - p3 = t2+w*(t5+w*(t8+w*(t11+w*t14))); - p = z*p1-(tt-w*(p2+y*p3)); - r += (tf + p); break; - case 2: - p1 = y*(u0+y*(u1+y*(u2+y*(u3+y*(u4+y*u5))))); - p2 = one+y*(v1+y*(v2+y*(v3+y*(v4+y*v5)))); - r += (-0.5*y + p1/p2); + switch (i) { + case 0: + z = y * y; + p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10)))); + p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11))))); + p = y * p1 + p2; + r += (p - 0.5 * y); + break; + case 1: + z = y * y; + w = z * y; + p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12))); /* parallel comp */ + p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13))); + p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14))); + p = z * p1 - (tt - w * (p2 + y * p3)); + r += (tf + p); + break; + case 2: + p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5))))); + p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5)))); + r += (-0.5 * y + p1 / p2); } } - else if(ix<0x40200000) { /* x < 8.0 */ - i = (int)x; + else if (ix < 0x40200000) { /* x < 8.0 */ + i = (int) x; t = zero; - y = x-(double)i; - p = y*(s0+y*(s1+y*(s2+y*(s3+y*(s4+y*(s5+y*s6)))))); - q = one+y*(r1+y*(r2+y*(r3+y*(r4+y*(r5+y*r6))))); - r = half*y+p/q; - z = one; /* lgamma(1+s) = log(s) + lgamma(s) */ - switch(i) { - case 7: z *= (y+6.0); /* FALLTHRU */ - case 6: z *= (y+5.0); /* FALLTHRU */ - case 5: z *= (y+4.0); /* FALLTHRU */ - case 4: z *= (y+3.0); /* FALLTHRU */ - case 3: z *= (y+2.0); /* FALLTHRU */ - r += Math.log(z); break; + y = x - (double) i; + p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6)))))); + q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6))))); + r = half * y + p / q; + z = one; /* lgamma(1+s) = log(s) + lgamma(s) */ + switch (i) { + case 7: + z *= (y + 6.0); /* FALLTHRU */ + case 6: + z *= (y + 5.0); /* FALLTHRU */ + case 5: + z *= (y + 4.0); /* FALLTHRU */ + case 4: + z *= (y + 3.0); /* FALLTHRU */ + case 3: + z *= (y + 2.0); /* FALLTHRU */ + r += Math.log(z); + break; } /* 8.0 <= x < 2**58 */ - } else if (ix < 0x43900000) { + } + else if (ix < 0x43900000) { t = Math.log(x); - z = one/x; - y = z*z; - w = w0+z*(w1+y*(w2+y*(w3+y*(w4+y*(w5+y*w6))))); - r = (x-half)*(t-one)+w; - } else + z = one / x; + y = z * z; + w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6))))); + r = (x - half) * (t - one) + w; + } + else /* 2**58 <= x <= inf */ - r = x*(Math.log(x)-one); + r = x * (Math.log(x) - one); return r; } @@ -1308,7 +1377,7 @@ else if(ix<0x40200000) { /* x < 8.0 */ * @param x the x parameter * @return the log10 of the gamma function at x. 
*/ - public static double log10Gamma (double x) { + public static double log10Gamma(double x) { return lnToLog10(lnGamma(x)); } @@ -1320,16 +1389,15 @@ public static double log10Gamma (double x) { * @param k number of successes * @return the log10 of the binomial coefficient */ - public static double log10BinomialCoefficient (int n, int k) { - return log10Gamma(n+1) - log10Gamma(k+1) - log10Gamma(n-k+1); + public static double log10BinomialCoefficient(int n, int k) { + return log10Gamma(n + 1) - log10Gamma(k + 1) - log10Gamma(n - k + 1); } - public static double log10BinomialProbability (int n, int k, double log10p) { - double log10OneMinusP = Math.log10(1-Math.pow(10,log10p)); - return log10BinomialCoefficient(n, k) + log10p*k + log10OneMinusP*(n-k); + public static double log10BinomialProbability(int n, int k, double log10p) { + double log10OneMinusP = Math.log10(1 - Math.pow(10, log10p)); + return log10BinomialCoefficient(n, k) + log10p * k + log10OneMinusP * (n - k); } - /** * Calculates the log10 of the multinomial coefficient. Designed to prevent * overflows even with very large numbers. @@ -1338,38 +1406,252 @@ public static double log10BinomialProbability (int n, int k, double log10p) { * @param k array of any size with the number of successes for each grouping (k1, k2, k3, ..., km) * @return */ - public static double log10MultinomialCoefficient (int n, int [] k) { + public static double log10MultinomialCoefficient(int n, int[] k) { double denominator = 0.0; for (int x : k) { - denominator += log10Gamma(x+1); + denominator += log10Gamma(x + 1); } - return log10Gamma(n+1) - denominator; + return log10Gamma(n + 1) - denominator; } /** * Computes the log10 of the multinomial distribution probability given a vector * of log10 probabilities. Designed to prevent overflows even with very large numbers. 
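Because log10BinomialCoefficient goes through log10Gamma rather than factorials, the binomial probability can be evaluated for trial counts that would overflow any direct computation. A worked sketch with hypothetical values, following log10BinomialProbability above:

    // P(k successes in n trials), all in log10 space:
    // log10 C(n, k) + k * log10(p) + (n - k) * log10(1 - p)
    int n = 1000, k = 10;
    double log10p = -2.0;                                   // p = 0.01
    double log10Coeff = log10Gamma(n + 1) - log10Gamma(k + 1) - log10Gamma(n - k + 1);
    double log10OneMinusP = Math.log10(1 - Math.pow(10, log10p));
    double result = log10Coeff + log10p * k + log10OneMinusP * (n - k);
    // result is log10 P; Math.pow(10, result) recovers the probability (~0.126 here)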
* - * @param n number of trials - * @param k array of number of successes for each possibility + * @param n number of trials + * @param k array of number of successes for each possibility * @param log10p array of log10 probabilities * @return */ - public static double log10MultinomialProbability (int n, int [] k, double [] log10p) { + public static double log10MultinomialProbability(int n, int[] k, double[] log10p) { if (log10p.length != k.length) throw new UserException.BadArgumentValue("p and k", "Array of log10 probabilities must have the same size as the array of number of sucesses: " + log10p.length + ", " + k.length); double log10Prod = 0.0; - for (int i=0; i Double[] vectorSum(E v1[], E v2[]) { + if (v1.length != v2.length) + throw new UserException("BUG: vectors v1, v2 of different size in vectorSum()"); + + Double[] result = new Double[v1.length]; + for (int k = 0; k < v1.length; k++) + result[k] = v1[k].doubleValue() + v2[k].doubleValue(); + + return result; + } + + public static Double[] scalarTimesVector(E a, E[] v1) { + + Double result[] = new Double[v1.length]; + for (int k = 0; k < v1.length; k++) + result[k] = a.doubleValue() * v1[k].doubleValue(); + + return result; + } + + public static Double dotProduct(E[] v1, E[] v2) { + if (v1.length != v2.length) + throw new UserException("BUG: vectors v1, v2 of different size in vectorSum()"); + + Double result = 0.0; + for (int k = 0; k < v1.length; k++) + result += v1[k].doubleValue() * v2[k].doubleValue(); + + return result; + + } + + public static double[] vectorLog10(double v1[]) { + double result[] = new double[v1.length]; + for (int k = 0; k < v1.length; k++) + result[k] = Math.log10(v1[k]); + + return result; + + } + + // todo - silly overloading, just because Java can't unbox/box arrays of primitive types, and we can't do generics with primitive types! + public static Double[] vectorLog10(Double v1[]) { + Double result[] = new Double[v1.length]; + for (int k = 0; k < v1.length; k++) + result[k] = Math.log10(v1[k]); + + return result; + + } + + /** + * Creates an integer out of a bitset + * + * @param bitSet the bitset + * @return an integer with the bitset representation + */ + public static long intFrom(final BitSet bitSet) { + long number = 0; + for (int bitIndex = bitSet.nextSetBit(0); bitIndex >= 0; bitIndex = bitSet.nextSetBit(bitIndex+1)) + number |= 1L << bitIndex; + + return number; + } + + /** + * Creates a BitSet representation of a given integer + * + * @param number the number to turn into a bitset + * @return a bitset representation of the integer + */ + public static BitSet bitSetFrom(long number) { + BitSet bitSet = new BitSet(); + int bitIndex = 0; + while (number > 0) { + if (number%2 > 0) + bitSet.set(bitIndex); + bitIndex++; + number /= 2; + } + return bitSet; + } + + /** + * Converts a BitSet into the dna string representation. + * + * Warning: This conversion is limited to long precision, therefore the dna sequence cannot + * be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create + * a bitSetFrom(BigNumber) method. + * + * We calculate the length of the resulting DNA sequence by looking at the sum(4^i) that exceeds the + * base_10 representation of the sequence. This is important for us to know how to bring the number + * to a quasi-canonical base_4 representation, and to fill in leading A's (since A's are represented + * as 0's and leading 0's are omitted). 
+ * + * quasi-canonical because A is represented by a 0, therefore, + * instead of : 0, 1, 2, 3, 10, 11, 12, ... + * we have : 0, 1, 2, 3, 00, 01, 02, ... + * + * but we can correctly decode it because we know the final length. + * + * @param bitSet the bitset representation of the dna sequence + * @return the dna sequence represented by the bitset + */ + public static String dnaFrom(final BitSet bitSet) { + long number = intFrom(bitSet); // the base_10 representation of the bit set + long preContext = 0; // the number of combinations skipped to get to the quasi-canonical representation (we keep it to subtract later) + long nextContext = 4; // the next context (we advance it so we know which one was preceding it). + int i = 1; // the calculated length of the DNA sequence given the base_10 representation of its BitSet. + while (nextContext <= number) { // find the length of the dna string (i) + preContext = nextContext; // keep track of the number of combinations in the preceding context + nextContext += Math.pow(4, ++i);// calculate the next context + } + number -= preContext; // subtract the the number of combinations of the preceding context from the number to get to the quasi-canonical representation + + String dna = ""; + while (number > 0) { // perform a simple base_10 to base_4 conversion (quasi-canonical) + byte base = (byte) (number % 4); + switch (base) { + case 0 : dna = "A" + dna; break; + case 1 : dna = "C" + dna; break; + case 2 : dna = "G" + dna; break; + case 3 : dna = "T" + dna; break; + } + number /= 4; + } + for (int j = dna.length(); j < i; j++) + dna = "A" + dna; // add leading A's as necessary (due to the "quasi" canonical status, see description above) + + return dna; + } + + /** + * Creates a BitSet representation of a given dna string. + * + * Warning: This conversion is limited to long precision, therefore the dna sequence cannot + * be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create + * a bitSetFrom(BigNumber) method. + * + * The bit representation of a dna string is the simple: + * 0 A 4 AA 8 CA + * 1 C 5 AC ... + * 2 G 6 AG 1343 TTGGT + * 3 T 7 AT 1364 TTTTT + * + * To convert from dna to number, we convert the dna string to base10 and add all combinations that + * preceded the string (with smaller lengths). + * + * @param dna the dna sequence + * @return the bitset representing the dna sequence + */ + public static BitSet bitSetFrom(String dna) { + if (dna.length() > 31) + throw new ReviewedStingException(String.format("DNA Length cannot be bigger than 31. dna: %s (%d)", dna, dna.length())); + + long baseTen = 0; // the number in base_10 that we are going to use to generate the bit set + long preContext = 0; // the sum of all combinations that preceded the length of the dna string + for (int i=0; i0) + preContext += Math.pow(4, i); // each length will have 4^i combinations (e.g 1 = 4, 2 = 16, 3 = 64, ...) + } + + return bitSetFrom(baseTen+preContext); // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length. 
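Putting dnaFrom and bitSetFrom together: the encoding enumerates sequences in length-then-lexicographic order with A as the zero digit, which is why the shorter-length combinations are added on encode and subtracted on decode. A round trip under that scheme (method names as added above):

    // A=0, C=1, G=2, T=3; all 1-mers precede all 2-mers, and so on
    BitSet bits = MathUtils.bitSetFrom("ACGT");
    long value = MathUtils.intFrom(bits);   // 27 (base-4 digits 0123) + 84 (the
                                            // 4 + 16 + 64 shorter sequences) = 111
    String dna = MathUtils.dnaFrom(bits);   // "ACGT": decode 27 back to "CGT",
                                            // then pad leading A's up to the
                                            // length implied by the subtracted 84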
} } diff --git a/public/java/src/org/broadinstitute/sting/utils/Median.java b/public/java/src/org/broadinstitute/sting/utils/Median.java new file mode 100644 index 0000000000..7ebe8d2d72 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/Median.java @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils; + +import java.util.*; + +/** + * Utility class for calculating median from a data set, potentially limiting the size of data to a + * fixed amount + * + * @author Your Name + * @since Date created + */ +public class Median { + final List values; + final int maxValuesToKeep; + boolean sorted = false; + + public Median() { + this(Integer.MAX_VALUE); + } + + public Median(final int maxValuesToKeep) { + this.maxValuesToKeep = maxValuesToKeep; + this.values = new ArrayList(); + } + + public boolean isFull() { + return values.size() >= maxValuesToKeep; + } + + public int size() { + return values.size(); + } + + public boolean isEmpty() { + return values.isEmpty(); + } + + public T getMedian() { + if ( isEmpty() ) + throw new IllegalStateException("Cannot get median value from empty array"); + return getMedian(null); // note that value null will never be used + } + + /** + * Returns the floor(n + 1 / 2) item from the list of values if the list + * has values, or defaultValue is the list is empty. + */ + public T getMedian(final T defaultValue) { + if ( isEmpty() ) + return defaultValue; + + if ( ! sorted ) { + sorted = true; + Collections.sort(values); + } + + final int offset = (int)Math.floor((values.size() + 1) * 0.5) - 1; + return values.get(offset); + } + + public boolean add(final T value) { + if ( ! 
isFull() ) { + sorted = false; + return values.add(value); + } + else + return false; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java b/public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java index 4f01f2b7aa..597dc48034 100644 --- a/public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java +++ b/public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java @@ -87,7 +87,7 @@ public static final NGSPlatform fromReadGroup(SAMReadGroupRecord rg) { /** * Returns the NGSPlatform corresponding to the PL tag in the read group * @param plFromRG -- the PL field (or equivalent) in a ReadGroup object - * @return an NGSPlatform object matching the PL field of the header, of UNKNOWN if there was no match + * @return an NGSPlatform object matching the PL field of the header, or UNKNOWN if there was no match */ public static final NGSPlatform fromReadGroupPL(final String plFromRG) { if ( plFromRG == null ) return UNKNOWN; @@ -105,4 +105,14 @@ public static final NGSPlatform fromReadGroupPL(final String plFromRG) { return UNKNOWN; } + + /** + * checks whether or not the requested platform is listed in the set (and is not unknown) + * + * @param platform the read group string that describes the platform used + * @return true if the platform is known (i.e. it's in the list and is not UNKNOWN) + */ + public static final boolean isKnown (final String platform) { + return fromReadGroupPL(platform) != UNKNOWN; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/PathUtils.java b/public/java/src/org/broadinstitute/sting/utils/PathUtils.java index 822d04dfd5..db655d25c3 100755 --- a/public/java/src/org/broadinstitute/sting/utils/PathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/PathUtils.java @@ -1,10 +1,14 @@ package org.broadinstitute.sting.utils; +import org.apache.commons.io.comparator.LastModifiedFileComparator; +import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.io.File; import java.io.FilenameFilter; import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; import java.util.List; /** @@ -17,6 +21,8 @@ * A set of static utility methods for common operations on paths. */ public class PathUtils { + private static Logger logger = Logger.getLogger(PathUtils.class); + /** * Constructor access disallowed...static utility methods only! */ @@ -36,7 +42,7 @@ public static List findFilesInPath(final File basePath, final String rel List filesInPath = new ArrayList(); FilenameFilter filter = new OrFilenameFilter(new DirectoryFilter(), - new ExtensionFilter(extension)); + new ExtensionFilter(extension)); File[] contents = basePath.listFiles( filter ); for (File content : contents) { String relativeFileName = relativePrefix.trim().length() != 0 ? @@ -118,4 +124,47 @@ public static void refreshVolume(File file) { } } -} + + /** + * Walk over the GATK released directories to find the most recent JAR files corresponding + * to the version prefix. 
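The new Median class accumulates values (optionally up to a fixed cap) and sorts lazily on the first query. A usage sketch, with the type parameter written out since the generics render poorly in this patch view:

    Median<Integer> median = new Median<Integer>();
    median.add(30); median.add(10); median.add(20);
    median.getMedian();        // 20: sorted offset is floor((3 + 1) * 0.5) - 1 == 1

    Median<Integer> capped = new Median<Integer>(2);
    capped.add(1); capped.add(2);
    capped.add(3);             // returns false: the cap silently rejects further adds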
For example, providing input "1.2" will + * return the full path to the most recent GenomeAnalysisTK.jar in the GATK_RELEASE_DIR + * in directories that match gatkReleaseDir/GenomeAnalysisTK-1.2* + * + * @param gatkReleaseDir Path to directory containing GATK release binaries (e.g., /humgen/gsa-hpprojects/GATK/bin/) + * @param releaseVersionNumber Desired GATK version number (e.g., 1.2) + * @return A file pointing to the most recent GATK file in the release directory with GATK release number + */ + public static File findMostRecentGATKVersion(final File gatkReleaseDir, final String releaseVersionNumber) { + final String versionString = "GenomeAnalysisTK-" + releaseVersionNumber; + + final List gatkJars = new ArrayList(); + for ( final String path : gatkReleaseDir.list(new isGATKVersion(versionString)) ) { + gatkJars.add(new File(gatkReleaseDir.getAbsolutePath() + "/" + path + "/GenomeAnalysisTK.jar")); + } + + if ( gatkJars.isEmpty() ) + return null; + else { + Collections.sort(gatkJars, LastModifiedFileComparator.LASTMODIFIED_REVERSE); + //for ( File jar : gatkJars ) logger.info(String.format("%s => %d", jar, jar.lastModified())); + final File last = gatkJars.get(0); + logger.debug(String.format("findMostRecentGATKVersion: Found %d jars for %s, keeping last one %s", + gatkJars.size(), releaseVersionNumber, last)); + return last; + } + } + + private final static class isGATKVersion implements FilenameFilter { + private final String versionString; + + private isGATKVersion(final String versionString) { + this.versionString = versionString; + } + + @Override + public boolean accept(final File file, final String s) { + return s.contains(versionString); + } + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java index 7ec6a74d73..7756ac71b3 100755 --- a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java @@ -10,6 +10,8 @@ */ public class QualityUtils { public final static byte MAX_QUAL_SCORE = SAMUtils.MAX_PHRED_SCORE; + public final static double ERROR_RATE_OF_MAX_QUAL_SCORE = qualToErrorProbRaw(MAX_QUAL_SCORE); + public final static double MIN_REASONABLE_ERROR = 0.0001; public final static byte MAX_REASONABLE_Q_SCORE = 40; public final static byte MIN_USABLE_Q_SCORE = 6; @@ -55,6 +57,14 @@ static public double qualToErrorProb(byte qual) { return qualToErrorProbCache[(int)qual & 0xff]; // Map: 127 -> 127; -128 -> 128; -1 -> 255; etc. } + static public double[] qualArrayToLog10ErrorProb(byte[] quals) { + double[] returnArray = new double[quals.length]; + for( int iii = 0; iii < quals.length; iii++ ) { + returnArray[iii] = ((double) quals[iii])/-10.0; + } + return returnArray; + } + /** * Convert a probability to a quality score. Note, this is capped at Q40. 
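The phred relationship these QualityUtils helpers encode is error probability = 10^(-Q/10), so the log10 form added here is just a division by -10 with no call to pow(). For instance:

    double err = QualityUtils.qualToErrorProb((byte) 20);            // 0.01
    double[] log10Err = QualityUtils.qualArrayToLog10ErrorProb(
            new byte[]{20, 30, 40});                                 // {-2.0, -3.0, -4.0}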
* diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java new file mode 100644 index 0000000000..6279e0061b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -0,0 +1,64 @@ +package org.broadinstitute.sting.utils.activeregion; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.HasGenomeLocation; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.ArrayList; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: 1/4/12 + */ + +public class ActiveRegion implements HasGenomeLocation { + + private final ArrayList reads = new ArrayList(); + private final GenomeLoc activeRegionLoc; + private final GenomeLoc extendedLoc; + private final int extension; + private GenomeLoc fullExtentReferenceLoc = null; + private final GenomeLocParser genomeLocParser; + public final boolean isActive; + + public ActiveRegion( final GenomeLoc activeRegionLoc, final boolean isActive, final GenomeLocParser genomeLocParser, final int extension ) { + this.activeRegionLoc = activeRegionLoc; + this.isActive = isActive; + this.genomeLocParser = genomeLocParser; + this.extension = extension; + extendedLoc = genomeLocParser.createGenomeLoc(activeRegionLoc.getContig(), activeRegionLoc.getStart() - extension, activeRegionLoc.getStop() + extension); + fullExtentReferenceLoc = extendedLoc; + } + + // add each read to the bin and extend the reference genome activeRegionLoc if needed + public void add( final GATKSAMRecord read ) { + fullExtentReferenceLoc = fullExtentReferenceLoc.union( genomeLocParser.createGenomeLoc( read ) ); + reads.add( read ); + } + + public ArrayList getReads() { return reads; } + + public byte[] getReference( final IndexedFastaSequenceFile referenceReader ) { + return getReference( referenceReader, 0 ); + } + + public byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { + return referenceReader.getSubsequenceAt( fullExtentReferenceLoc.getContig(), + Math.max(1, fullExtentReferenceLoc.getStart() - padding), + Math.min(referenceReader.getSequenceDictionary().getSequence(fullExtentReferenceLoc.getContig()).getSequenceLength(), fullExtentReferenceLoc.getStop() + padding) ).getBases(); + } + + @Override + public GenomeLoc getLocation() { return activeRegionLoc; } + public GenomeLoc getExtendedLoc() { return extendedLoc; } + public GenomeLoc getReferenceLoc() { return fullExtentReferenceLoc; } + + public int getExtension() { return extension; } + public int size() { return reads.size(); } + public void clearReads() { reads.clear(); } + public void remove( final GATKSAMRecord read ) { reads.remove( read ); } + public void removeAll( final ArrayList readsToRemove ) { reads.removeAll( readsToRemove ); } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java b/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java index 4f096f86e1..1864522942 100644 --- a/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java +++ b/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java @@ -673,7 +673,7 @@ public byte[] baqRead(SAMRecord read, IndexedFastaSequenceFile refReader, Calcul } /** - * Returns true if we don't think this read is eligable for the BAQ calculation. 
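For orientation, the new ActiveRegion pads its interval by the extension at construction and then unions each added read's span into getReferenceLoc(). A hypothetical usage (parser, read, and the coordinates are illustrative, not from the patch):

    GenomeLoc span = parser.createGenomeLoc("1", 1000, 1100);
    ActiveRegion region = new ActiveRegion(span, true, parser, 50);
    region.getExtendedLoc();   // 1:950-1150, the active span plus the extension
    region.add(read);          // grows getReferenceLoc() to cover the read as well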
Examples include non-PF reads, + * Returns true if we don't think this read is eligible for the BAQ calculation. Examples include non-PF reads, * duplicates, or unmapped reads. Used by baqRead to determine if a read should fall through the calculation. * * @param read diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/BAQSamIterator.java b/public/java/src/org/broadinstitute/sting/utils/baq/BAQSamIterator.java index 26356a4a4d..adfeef5180 100644 --- a/public/java/src/org/broadinstitute/sting/utils/baq/BAQSamIterator.java +++ b/public/java/src/org/broadinstitute/sting/utils/baq/BAQSamIterator.java @@ -34,7 +34,7 @@ public class BAQSamIterator implements StingSAMIterator { "cmode != null" , "qmode != null"}) public BAQSamIterator(IndexedFastaSequenceFile refReader, StingSAMIterator it, BAQ.CalculationMode cmode, BAQ.QualityMode qmode) { - if ( cmode == BAQ.CalculationMode.OFF) throw new ReviewedStingException("BUG: shouldn't create BAQSamIterator with calculation mode OFF"); + if ( cmode == BAQ.CalculationMode.OFF ) throw new ReviewedStingException("BUG: shouldn't create BAQSamIterator with calculation mode OFF"); if ( qmode == BAQ.QualityMode.DONT_MODIFY ) throw new ReviewedStingException("BUG: shouldn't create BAQSamIterator with quailty mode DONT_MODIFY"); this.refReader = refReader; diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java index 921a0a599b..62a67a1f2e 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java @@ -4,6 +4,7 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalDataManager; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -70,27 +71,27 @@ public GATKSAMRecord apply(ClippingRepresentation algorithm, GATKSAMRecord read) break; case SOFTCLIP_BASES: - if ( read.getReadUnmappedFlag() ) { + if (read.getReadUnmappedFlag()) { // we can't process unmapped reads throw new UserException("Read Clipper cannot soft clip unmapped reads"); } //System.out.printf("%d %d %d%n", stop, start, read.getReadLength()); int myStop = stop; - if ( (stop + 1 - start) == read.getReadLength() ) { + if ((stop + 1 - start) == read.getReadLength()) { // BAM representation issue -- we can't SOFTCLIP away all bases in a read, just leave it alone //Walker.logger.info(String.format("Warning, read %s has all bases clip but this can't be represented with SOFTCLIP_BASES, just leaving it alone", read.getReadName())); //break; myStop--; // just decrement stop } - if ( start > 0 && myStop != read.getReadLength() - 1 ) + if (start > 0 && myStop != read.getReadLength() - 1) throw new RuntimeException(String.format("Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d", read.getReadName(), start, myStop)); Cigar oldCigar = read.getCigar(); int scLeft = 0, scRight = read.getReadLength(); - if ( start == 0 ) + if (start == 0) scLeft = myStop + 1; else scRight = start; @@ -134,8 +135,7 @@ else if (matchesCount > 0) { unclippedCigar.add(new CigarElement(matchesCount, CigarOperator.MATCH_OR_MISMATCH)); matchesCount = 0; unclippedCigar.add(element); - } - else + } else unclippedCigar.add(element); } if 
(matchesCount > 0) @@ -284,10 +284,9 @@ private Cigar softClip(final Cigar __cigar, final int __startClipEnd, final int } @Requires({"start <= stop", "start == 0 || stop == read.getReadLength() - 1"}) - private GATKSAMRecord hardClip (GATKSAMRecord read, int start, int stop) { + private GATKSAMRecord hardClip(GATKSAMRecord read, int start, int stop) { if (start == 0 && stop == read.getReadLength() - 1) return GATKSAMRecord.emptyRead(read); -// return new GATKSAMRecord(read.getHeader()); // If the read is unmapped there is no Cigar string and neither should we create a new cigar string @@ -296,8 +295,8 @@ private GATKSAMRecord hardClip (GATKSAMRecord read, int start, int stop) { // the cigar may force a shift left or right (or both) in case we are left with insertions // starting or ending the read after applying the hard clip on start/stop. int newLength = read.getReadLength() - (stop - start + 1) - cigarShift.shiftFromStart - cigarShift.shiftFromEnd; - byte [] newBases = new byte[newLength]; - byte [] newQuals = new byte[newLength]; + byte[] newBases = new byte[newLength]; + byte[] newQuals = new byte[newLength]; int copyStart = (start == 0) ? stop + 1 + cigarShift.shiftFromStart : cigarShift.shiftFromStart; System.arraycopy(read.getReadBases(), copyStart, newBases, 0, newLength); @@ -316,16 +315,25 @@ private GATKSAMRecord hardClip (GATKSAMRecord read, int start, int stop) { if (start == 0) hardClippedRead.setAlignmentStart(read.getAlignmentStart() + calculateAlignmentStartShift(read.getCigar(), cigarShift.cigar)); + if (read.hasBaseIndelQualities()) { + byte[] newBaseInsertionQuals = new byte[newLength]; + byte[] newBaseDeletionQuals = new byte[newLength]; + System.arraycopy(read.getBaseInsertionQualities(), copyStart, newBaseInsertionQuals, 0, newLength); + System.arraycopy(read.getBaseDeletionQualities(), copyStart, newBaseDeletionQuals, 0, newLength); + hardClippedRead.setBaseQualities(newBaseInsertionQuals, RecalDataManager.BaseRecalibrationType.BASE_INSERTION); + hardClippedRead.setBaseQualities(newBaseDeletionQuals, RecalDataManager.BaseRecalibrationType.BASE_DELETION); + } + return hardClippedRead; } @Requires({"!cigar.isEmpty()"}) - private CigarShift hardClipCigar (Cigar cigar, int start, int stop) { + private CigarShift hardClipCigar(Cigar cigar, int start, int stop) { Cigar newCigar = new Cigar(); int index = 0; int totalHardClipCount = stop - start + 1; - int alignmentShift = 0; // caused by hard clipping insertions or deletions + int alignmentShift = 0; // caused by hard clipping deletions // hard clip the beginning of the cigar string if (start == 0) { @@ -353,7 +361,7 @@ private CigarShift hardClipCigar (Cigar cigar, int start, int stop) { // element goes beyond what we need to clip else if (index + shift > stop + 1) { int elementLengthAfterChopping = cigarElement.getLength() - (stop - index + 1); - alignmentShift += calculateHardClippingAlignmentShift(cigarElement, stop-index+1); + alignmentShift += calculateHardClippingAlignmentShift(cigarElement, stop - index + 1); newCigar.add(new CigarElement(totalHardClipCount + alignmentShift, CigarOperator.HARD_CLIP)); newCigar.add(new CigarElement(elementLengthAfterChopping, cigarElement.getOperator())); } @@ -388,7 +396,7 @@ else if (index + shift > stop + 1) { if (index + shift < start) newCigar.add(new CigarElement(cigarElement.getLength(), cigarElement.getOperator())); - // element goes beyond our clip starting position + // element goes beyond our clip starting position else { int elementLengthAfterChopping = start - index; 
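    // Here the element straddles the clip start: only its first (start - index)
    // bases survive. E.g. an element whose first base sits at read index 95,
    // clipped from start == 100, keeps a 5-base prefix; the clipped remainder is
    // passed to calculateHardClippingAlignmentShift, which (per the method below)
    // shifts by -clippedLength for insertions and by the full element length for
    // deletions.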
alignmentShift += calculateHardClippingAlignmentShift(cigarElement, cigarElement.getLength() - (start - index)); @@ -396,7 +404,7 @@ else if (index + shift > stop + 1) { // if this last element is a HARD CLIP operator, just merge it with our hard clip operator to be added later if (cigarElement.getOperator() == CigarOperator.HARD_CLIP) totalHardClipCount += elementLengthAfterChopping; - // otherwise, maintain what's left of this last operator + // otherwise, maintain what's left of this last operator else newCigar.add(new CigarElement(elementLengthAfterChopping, cigarElement.getOperator())); } @@ -408,7 +416,7 @@ else if (index + shift > stop + 1) { } // check if we are hard clipping indels - while(cigarElementIterator.hasNext()) { + while (cigarElementIterator.hasNext()) { cigarElement = cigarElementIterator.next(); alignmentShift += calculateHardClippingAlignmentShift(cigarElement, cigarElement.getLength()); @@ -444,34 +452,30 @@ private CigarShift cleanHardClippedCigar(Cigar cigar) { boolean readHasStarted = false; boolean addedHardClips = false; - while(!cigarStack.empty()) { + while (!cigarStack.empty()) { CigarElement cigarElement = cigarStack.pop(); - if ( !readHasStarted && - cigarElement.getOperator() != CigarOperator.INSERTION && + if (!readHasStarted && +// cigarElement.getOperator() != CigarOperator.INSERTION && cigarElement.getOperator() != CigarOperator.DELETION && cigarElement.getOperator() != CigarOperator.HARD_CLIP) readHasStarted = true; - else if ( !readHasStarted && cigarElement.getOperator() == CigarOperator.HARD_CLIP) + else if (!readHasStarted && cigarElement.getOperator() == CigarOperator.HARD_CLIP) totalHardClip += cigarElement.getLength(); - else if ( !readHasStarted && cigarElement.getOperator() == CigarOperator.INSERTION) - shift += cigarElement.getLength(); - - else if ( !readHasStarted && cigarElement.getOperator() == CigarOperator.DELETION) + else if (!readHasStarted && cigarElement.getOperator() == CigarOperator.DELETION) totalHardClip += cigarElement.getLength(); if (readHasStarted) { - if (i==1) { + if (i == 1) { if (!addedHardClips) { if (totalHardClip > 0) inverseCigarStack.push(new CigarElement(totalHardClip, CigarOperator.HARD_CLIP)); addedHardClips = true; } inverseCigarStack.push(cigarElement); - } - else { + } else { if (!addedHardClips) { if (totalHardClip > 0) cleanCigar.add(new CigarElement(totalHardClip, CigarOperator.HARD_CLIP)); @@ -498,7 +502,7 @@ private int calculateAlignmentStartShift(Cigar oldCigar, Cigar newCigar) { int newShift = 0; int oldShift = 0; - boolean readHasStarted = false; // if the new cigar is composed of S and H only, we have to traverse the entire old cigar to calculate the shift + boolean readHasStarted = false; // if the new cigar is composed of S and H only, we have to traverse the entire old cigar to calculate the shift for (CigarElement cigarElement : newCigar.getCigarElements()) { if (cigarElement.getOperator() == CigarOperator.HARD_CLIP || cigarElement.getOperator() == CigarOperator.SOFT_CLIP) newShift += cigarElement.getLength(); @@ -509,7 +513,7 @@ private int calculateAlignmentStartShift(Cigar oldCigar, Cigar newCigar) { } for (CigarElement cigarElement : oldCigar.getCigarElements()) { - if (cigarElement.getOperator() == CigarOperator.HARD_CLIP || cigarElement.getOperator() == CigarOperator.SOFT_CLIP ) + if (cigarElement.getOperator() == CigarOperator.HARD_CLIP || cigarElement.getOperator() == CigarOperator.SOFT_CLIP) oldShift += cigarElement.getLength(); else if (readHasStarted) break; @@ -522,7 +526,7 @@ 
private int calculateHardClippingAlignmentShift(CigarElement cigarElement, int c if (cigarElement.getOperator() == CigarOperator.INSERTION) return -clippedLength; - // Deletions should be added to the total hard clip count + // Deletions should be added to the total hard clip count else if (cigarElement.getOperator() == CigarOperator.DELETION) return cigarElement.getLength(); diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java index afe7fa9753..7a664bd616 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java @@ -374,24 +374,43 @@ public static GATKSAMRecord revertSoftClippedBases(GATKSAMRecord read) { * Generic functionality to hard clip a read, used internally by hardClipByReferenceCoordinatesLeftTail * and hardClipByReferenceCoordinatesRightTail. Should not be used directly. * + * Note, it REQUIRES you to give the directionality of your hard clip (i.e. whether you're clipping the + * left of right tail) by specifying either refStart < 0 or refStop < 0. + * * @param refStart first base to clip (inclusive) * @param refStop last base to clip (inclusive) * @return a new read, without the clipped bases */ - @Requires("!read.getReadUnmappedFlag()") // can't handle unmapped reads, as we're using reference coordinates to clip + @Requires({"!read.getReadUnmappedFlag()", "refStart < 0 || refStop < 0"}) // can't handle unmapped reads, as we're using reference coordinates to clip protected GATKSAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) { - int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL); - int stop = (refStop < 0) ? 
read.getReadLength() - 1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL); + if (read.isEmpty()) + return read; - if (read.isEmpty() || (start == 0 && stop == read.getReadLength() - 1)) - return GATKSAMRecord.emptyRead(read); -// return new GATKSAMRecord(read.getHeader()); + int start; + int stop; + + // Determine the read coordinate to start and stop hard clipping + if (refStart < 0) { + if (refStop < 0) + throw new ReviewedStingException("Only one of refStart or refStop must be < 0, not both (" + refStart + ", " + refStop + ")"); + start = 0; + stop = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL); + } + else { + if (refStop >= 0) + throw new ReviewedStingException("Either refStart or refStop must be < 0 (" + refStart + ", " + refStop + ")"); + start = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL); + stop = read.getReadLength() - 1; + } + +// if ((start == 0 && stop == read.getReadLength() - 1)) +// return GATKSAMRecord.emptyRead(read); if (start < 0 || stop > read.getReadLength() - 1) throw new ReviewedStingException("Trying to clip before the start or after the end of a read"); if ( start > stop ) - throw new ReviewedStingException("START > STOP -- this should never happen -- call Mauricio!"); + throw new ReviewedStingException(String.format("START (%d) > (%d) STOP -- this should never happen -- call Mauricio!", start, stop)); if ( start > 0 && stop < read.getReadLength() - 1) throw new ReviewedStingException(String.format("Trying to clip the middle of the read: start %d, stop %d, cigar: %s", start, stop, read.getCigarString())); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index e44c10f1f2..3c2ed18e45 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -18,6 +18,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { + public final static int MAX_ALLELE_SIZE_BEFORE_WARNING = (int)Math.pow(2, 20); protected final static Logger log = Logger.getLogger(VCFCodec.class); protected final static int NUM_STANDARD_FIELDS = 8; // INFO is the 8th column @@ -252,7 +253,7 @@ public Feature decode(String line) { // if we have don't have a header, or we have a header with no genotyping data check that we have eight columns. Otherwise check that we have nine (normal colummns + genotyping data) if (( (header == null || !header.hasGenotypingData()) && nParts != NUM_STANDARD_FIELDS) || - (header != null && header.hasGenotypingData() && nParts != (NUM_STANDARD_FIELDS + 1)) ) + (header != null && header.hasGenotypingData() && nParts != (NUM_STANDARD_FIELDS + 1)) ) throw new UserException.MalformedVCF("there aren't enough columns for line " + line + " (we expected " + (header == null ? 
NUM_STANDARD_FIELDS : NUM_STANDARD_FIELDS + 1) + " tokens, and saw " + nParts + " )", lineNo); @@ -518,8 +519,11 @@ protected static List parseAlleles(String ref, String alts, int lineNo) * @param lineNo the line number for this record */ private static void checkAllele(String allele, boolean isRef, int lineNo) { - if ( allele == null || allele.length() == 0 ) - generateException("Empty alleles are not permitted in VCF records", lineNo); + if ( allele == null || allele.length() == 0 ) + generateException("Empty alleles are not permitted in VCF records", lineNo); + + if ( MAX_ALLELE_SIZE_BEFORE_WARNING != -1 && allele.length() > MAX_ALLELE_SIZE_BEFORE_WARNING ) + log.warn(String.format("Allele detected with length %d exceeding max size %d at approximately line %d, likely resulting in degraded VCF processing performance", allele.length(), MAX_ALLELE_SIZE_BEFORE_WARNING, lineNo)); if ( isSymbolicAllele(allele) ) { if ( isRef ) { @@ -540,12 +544,15 @@ private static void checkAllele(String allele, boolean isRef, int lineNo) { } /** - * return true if this is a symbolic allele (e.g. ) otherwise false + * return true if this is a symbolic allele (e.g. ) or + * structural variation breakend (with [ or ]), otherwise false * @param allele the allele to check * @return true if the allele is a symbolic allele, otherwise false */ private static boolean isSymbolicAllele(String allele) { - return (allele != null && allele.startsWith("<") && allele.endsWith(">") && allele.length() > 2); + return (allele != null && allele.length() > 2 && + ((allele.startsWith("<") && allele.endsWith(">")) || + (allele.contains("[") || allele.contains("]")))); } /** @@ -572,12 +579,13 @@ protected static boolean isSingleNucleotideEvent(List alleles) { public static int computeForwardClipping(List unclippedAlleles, String ref) { boolean clipping = true; + final byte ref0 = (byte)ref.charAt(0); for ( Allele a : unclippedAlleles ) { if ( a.isSymbolic() ) continue; - if ( a.length() < 1 || (a.getBases()[0] != ref.getBytes()[0]) ) { + if ( a.length() < 1 || (a.getBases()[0] != ref0) ) { clipping = false; break; } @@ -604,7 +612,7 @@ protected static int computeReverseClipping(List unclippedAlleles, Strin stillClipping = false; else if ( ref.length() == clipping ) generateException("bad alleles encountered", lineNo); - else if ( a.getBases()[a.length()-clipping-1] != ref.getBytes()[ref.length()-clipping-1] ) + else if ( a.getBases()[a.length()-clipping-1] != ((byte)ref.charAt(ref.length()-clipping-1)) ) stillClipping = false; } if ( stillClipping ) @@ -613,6 +621,7 @@ else if ( a.getBases()[a.length()-clipping-1] != ref.getBytes()[ref.length()-cli return clipping; } + /** * clip the alleles, based on the reference * diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/SortingVCFWriterBase.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/SortingVCFWriterBase.java index c299511db5..84ecc7fcd2 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/SortingVCFWriterBase.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/SortingVCFWriterBase.java @@ -27,10 +27,8 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.util.Comparator; -import java.util.PriorityQueue; -import java.util.Set; -import java.util.TreeSet; +import java.util.*; +import java.util.concurrent.PriorityBlockingQueue; /** * This class writes VCF files, allowing records to be passed in unsorted. 
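The widened isSymbolicAllele check above now admits VCF breakend notation alongside the angle-bracket form, since breakend ALT strings carry '[' or ']' characters that would otherwise fail base-level validation. Roughly:

    isSymbolicAllele("<DEL>");          // true: bracketed symbolic ID, as before
    isSymbolicAllele("C[2:321682[");    // true now: breakend replacement string
    isSymbolicAllele("ACGT");           // false: ordinary bases, validated normally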
@@ -39,20 +37,26 @@ public abstract class SortingVCFWriterBase implements VCFWriter { // The VCFWriter to which to actually write the sorted VCF records - private VCFWriter innerWriter = null; + private final VCFWriter innerWriter; // the current queue of un-emitted records - private PriorityQueue<VCFRecord> queue = null; + private final Queue<VCFRecord> queue; // The locus until which we are permitted to write out (inclusive) protected Integer mostUpstreamWritableLoc; protected static final int BEFORE_MOST_UPSTREAM_LOC = 0; // No real locus index is <= 0 // The set of chromosomes already passed over and to which it is forbidden to return - private Set<String> finishedChromosomes = null; + private final Set<String> finishedChromosomes; // Should we call innerWriter.close() in close() - private boolean takeOwnershipOfInner; + private final boolean takeOwnershipOfInner; + + // -------------------------------------------------------------------------------- + // + // Constructors + // + // -------------------------------------------------------------------------------- /** * create a local-sorting VCF writer, given an inner VCF writer to write to @@ -62,16 +66,27 @@ public abstract class SortingVCFWriterBase implements VCFWriter { */ public SortingVCFWriterBase(VCFWriter innerWriter, boolean takeOwnershipOfInner) { this.innerWriter = innerWriter; - this.queue = new PriorityQueue<VCFRecord>(50, new VariantContextComparator()); - this.mostUpstreamWritableLoc = BEFORE_MOST_UPSTREAM_LOC; this.finishedChromosomes = new TreeSet<String>(); this.takeOwnershipOfInner = takeOwnershipOfInner; + + // has to be PriorityBlockingQueue to be thread-safe + // see http://getsatisfaction.com/gsa/topics/missing_loci_output_in_multi_thread_mode_when_implement_sortingvcfwriterbase?utm_content=topic_link&utm_medium=email&utm_source=new_topic + this.queue = new PriorityBlockingQueue<VCFRecord>(50, new VariantContextComparator()); + + this.mostUpstreamWritableLoc = BEFORE_MOST_UPSTREAM_LOC; } public SortingVCFWriterBase(VCFWriter innerWriter) { this(innerWriter, false); // by default, don't own inner } + // -------------------------------------------------------------------------------- + // + // public interface functions + // + // -------------------------------------------------------------------------------- + + @Override public void writeHeader(VCFHeader header) { innerWriter.writeHeader(header); } @@ -79,6 +94,7 @@ public void writeHeader(VCFHeader header) { /** * attempt to close the VCF file; we need to flush the queue first */ + @Override public void close() { stopWaitingToSort(); @@ -86,27 +102,14 @@ public void close() { innerWriter.close(); } - private void stopWaitingToSort() { - emitRecords(true); - mostUpstreamWritableLoc = BEFORE_MOST_UPSTREAM_LOC; - } - - protected void emitSafeRecords() { - emitRecords(false); - } - - protected void noteCurrentRecord(VariantContext vc) { - // did the user break the contract by giving a record too late?
- if (mostUpstreamWritableLoc != null && vc.getStart() < mostUpstreamWritableLoc) // went too far back, since may have already written anything that is <= mostUpstreamWritableLoc - throw new IllegalArgumentException("Permitted to write any record upstream of position " + mostUpstreamWritableLoc + ", but a record at " + vc.getChr() + ":" + vc.getStart() + " was just added."); - } /** * add a record to the file * * @param vc the Variant Context object */ - public void add(VariantContext vc) { + @Override + public synchronized void add(VariantContext vc) { /* Note that the code below does not prevent the successive add()-ing of: (chr1, 10), (chr20, 200), (chr15, 100) since there is no implicit ordering of chromosomes: */ @@ -125,7 +128,43 @@ public void add(VariantContext vc) { emitSafeRecords(); } - private void emitRecords(boolean emitUnsafe) { + /** + * Gets a string representation of this object. + * @return a string representation of this object + */ + @Override + public String toString() { + return getClass().getName(); + } + + // -------------------------------------------------------------------------------- + // + // protected interface functions for subclasses to use + // + // -------------------------------------------------------------------------------- + + private synchronized void stopWaitingToSort() { + emitRecords(true); + mostUpstreamWritableLoc = BEFORE_MOST_UPSTREAM_LOC; + } + + protected synchronized void emitSafeRecords() { + emitRecords(false); + } + + protected void noteCurrentRecord(VariantContext vc) { + // did the user break the contract by giving a record too late? + if (mostUpstreamWritableLoc != null && vc.getStart() < mostUpstreamWritableLoc) // went too far back, since may have already written anything that is <= mostUpstreamWritableLoc + throw new IllegalArgumentException("Permitted to write any record upstream of position " + mostUpstreamWritableLoc + ", but a record at " + vc.getChr() + ":" + vc.getStart() + " was just added."); + } + + // -------------------------------------------------------------------------------- + // + // private implementation functions + // + // -------------------------------------------------------------------------------- + + private synchronized void emitRecords(boolean emitUnsafe) { while (!queue.isEmpty()) { VCFRecord firstRec = queue.peek(); @@ -140,15 +179,6 @@ private void emitRecords(boolean emitUnsafe) { } } - /** - * Gets a string representation of this object. - * @return a string representation of this object - */ - @Override - public String toString() { - return getClass().getName(); - } - private static class VariantContextComparator implements Comparator { public int compare(VCFRecord r1, VCFRecord r2) { return r1.vc.getStart() - r2.vc.getStart(); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java index 453155be7e..01cc367c44 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java @@ -203,7 +203,7 @@ public LazyGenotypesContext.LazyData createGenotypeMap(String str, List if ( genotypeAlleleLocation > 0 ) generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present"); - List GTalleles = (genotypeAlleleLocation == -1 ? 
null : parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap)); + List GTalleles = (genotypeAlleleLocation == -1 ? new ArrayList(0) : parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap)); boolean phased = genotypeAlleleLocation != -1 && GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1; // add it to the list diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java index bb822f2edf..97166833b1 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.utils.codecs.vcf; +import org.broad.tribble.TribbleException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.Arrays; @@ -149,7 +150,11 @@ protected VCFCompoundHeaderLine(String line, VCFHeaderVersion version, Supported count = Integer.valueOf(numberStr); } - type = VCFHeaderLineType.valueOf(mapping.get("Type")); + try { + type = VCFHeaderLineType.valueOf(mapping.get("Type")); + } catch (Exception e) { + throw new TribbleException(mapping.get("Type") + " is not a valid type in the VCF specification (note that types are case-sensitive)"); + } if (type == VCFHeaderLineType.Flag && !allowFlagValues()) throw new IllegalArgumentException("Flag is an unsupported type for this kind of field"); diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java b/public/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java index d280ac8049..8652d3c282 100755 --- a/public/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java @@ -34,7 +34,7 @@ * Date: Dec 29, 2009 */ -public class NestedHashMap{ +public class NestedHashMap { public final Map data = new HashMap(); diff --git a/public/java/src/org/broadinstitute/sting/utils/crypt/CryptUtils.java b/public/java/src/org/broadinstitute/sting/utils/crypt/CryptUtils.java new file mode 100644 index 0000000000..e84b1432e0 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/crypt/CryptUtils.java @@ -0,0 +1,390 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.crypt; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.io.IOUtils; + +import javax.crypto.Cipher; +import java.io.File; +import java.io.InputStream; +import java.security.*; +import java.security.spec.InvalidKeySpecException; +import java.security.spec.KeySpec; +import java.security.spec.PKCS8EncodedKeySpec; +import java.security.spec.X509EncodedKeySpec; +import java.util.Arrays; + +/** + * A set of cryptographic utility methods and constants. + * + * Contains methods to: + * + * -Create a public/private key pair + * -Read and write public/private keys to/from files/streams + * -Load the GATK master private/public keys + * -Encrypt/decrypt data + * + * Also contains constants that control the cryptographic defaults + * throughout the GATK. + * + * @author David Roazen + */ +public class CryptUtils { + + // --------------------------------------------------------------------------------- + // Constants (these control the default cryptographic settings throughout the GATK): + // --------------------------------------------------------------------------------- + + /** + * Default key length in bits of newly-created keys. 2048 bits provides a good balance between + * security and speed. + */ + public static final int DEFAULT_KEY_LENGTH = 2048; + + /** + * Default encryption algorithm to use, when none is specified. + */ + public static final String DEFAULT_ENCRYPTION_ALGORITHM = "RSA"; + + /** + * Default random-number generation algorithm to use, when none is specified. + */ + public static final String DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM = "SHA1PRNG"; + + /** + * Name of the public key file distributed with the GATK. This file is packaged + * into the GATK jar, and we use the system ClassLoader to find it. + */ + public static final String GATK_DISTRIBUTED_PUBLIC_KEY_FILE_NAME = "GATK_public.key"; + + /** + * Location of the master copy of the GATK private key. + */ + public static final String GATK_MASTER_PRIVATE_KEY_FILE = "/humgen/gsa-hpprojects/GATK/data/gatk_master_keys/GATK_private.key"; + + /** + * Location of the master copy of the GATK public key. This file should always be the same as + * the public key file distributed with the GATK (and there are automated tests to ensure that it is). + */ + public static final String GATK_MASTER_PUBLIC_KEY_FILE = "/humgen/gsa-hpprojects/GATK/data/gatk_master_keys/GATK_public.key"; + + /** + * Directory where generated GATK user keys are stored. See the GATKKey class for more information. + */ + public static final String GATK_USER_KEY_DIRECTORY = "/humgen/gsa-hpprojects/GATK/data/gatk_user_keys/"; + + + // ----------------------- + // Utility Methods: + // ----------------------- + + /** + * Generate a new public/private key pair using the default encryption settings defined above. + * + * @return A new public/private key pair created using the default settings + */ + public static KeyPair generateKeyPair() { + return generateKeyPair(DEFAULT_KEY_LENGTH, DEFAULT_ENCRYPTION_ALGORITHM, DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM); + } + + /** + * Generate a new public/private key pair using custom encryption settings. 
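With the defaults above, a bare generateKeyPair() call asks the JDK for a 2048-bit RSA pair seeded from a SHA1PRNG source. A minimal usage sketch against the methods declared in this file (the 1024-bit override is purely illustrative):

    KeyPair pair = CryptUtils.generateKeyPair();                           // RSA, 2048 bits, SHA1PRNG
    KeyPair custom = CryptUtils.generateKeyPair(1024, "RSA", "SHA1PRNG");  // explicit parameters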
+ * + * @param keyLength Length of the key in bits + * @param encryptionAlgorithm Encryption algorithm to use + * @param randNumberAlgorithm Random-number generation algorithm to use + * @return A new public/private key pair, created according to the specified parameters + */ + public static KeyPair generateKeyPair( int keyLength, String encryptionAlgorithm, String randNumberAlgorithm ) { + try { + KeyPairGenerator keyGen = KeyPairGenerator.getInstance(encryptionAlgorithm); + SecureRandom randomnessSource = createRandomnessSource(randNumberAlgorithm); + + keyGen.initialize(keyLength, randomnessSource); + return keyGen.generateKeyPair(); + } + catch ( NoSuchAlgorithmException e ) { + throw new ReviewedStingException(String.format("Could not find an implementation of the requested encryption algorithm %s", encryptionAlgorithm), e); + } + catch ( Exception e ) { + throw new ReviewedStingException("Error while generating key pair", e); + } + } + + /** + * Create a source of randomness using the default random-number generation algorithm. + * + * @return A randomness source that uses the default algorithm + */ + public static SecureRandom createRandomnessSource() { + return createRandomnessSource(DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM); + } + + /** + * Create a source of randomness using a custom random-number generation algorithm. + * + * @param randAlgorithm The random-number generation algorithm to use + * @return A randomness source that uses the specified algorithm + */ + public static SecureRandom createRandomnessSource ( String randAlgorithm ) { + try { + return SecureRandom.getInstance(randAlgorithm); + } + catch ( NoSuchAlgorithmException e ) { + throw new ReviewedStingException(String.format("Could not find an implementation of the requested random-number generation algorithm %s", randAlgorithm), e); + } + } + + /** + * Writes a public/private key pair to disk + * + * @param keyPair The key pair we're writing to disk + * @param privateKeyFile Location to write the private key + * @param publicKeyFile Location to write the public key + */ + public static void writeKeyPair ( KeyPair keyPair, File privateKeyFile, File publicKeyFile ) { + writeKey(keyPair.getPrivate(), privateKeyFile); + writeKey(keyPair.getPublic(), publicKeyFile); + } + + /** + * Writes an arbitrary key to disk + * + * @param key The key to write + * @param destination Location to write the key to + */ + public static void writeKey ( Key key, File destination ) { + IOUtils.writeByteArrayToFile(key.getEncoded(), destination); + } + + /** + * Reads in a public key created using the default encryption algorithm from a file. + * + * @param source File containing the public key + * @return The public key read + */ + public static PublicKey readPublicKey ( File source ) { + return decodePublicKey(IOUtils.readFileIntoByteArray(source), DEFAULT_ENCRYPTION_ALGORITHM); + } + + /** + * Reads in a public key created using the default encryption algorithm from a stream. + * + * @param source Stream attached to the public key + * @return The public key read + */ + public static PublicKey readPublicKey ( InputStream source ) { + return decodePublicKey(IOUtils.readStreamIntoByteArray(source), DEFAULT_ENCRYPTION_ALGORITHM); + } + + /** + * Decodes the raw bytes of a public key into a usable object. + * + * @param rawKey The encoded bytes of a public key as read from, e.g., a file. The + * key must be in the standard X.509 format for a public key.
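Because writeKey() stores exactly Key.getEncoded() and readPublicKey() rebuilds the key from those same X.509 bytes, a write/read round trip should preserve the encoding bit-for-bit. A short usage sketch (file paths hypothetical):

    KeyPair pair = CryptUtils.generateKeyPair();
    CryptUtils.writeKeyPair(pair, new File("demo_private.key"), new File("demo_public.key"));
    PublicKey reloaded = CryptUtils.readPublicKey(new File("demo_public.key"));
    assert Arrays.equals(reloaded.getEncoded(), pair.getPublic().getEncoded());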
+ * @param encryptionAlgorithm The encryption algorithm used to create the public key + * @return The public key as a usable object + */ + public static PublicKey decodePublicKey ( byte[] rawKey, String encryptionAlgorithm ) { + try { + KeySpec keySpec = new X509EncodedKeySpec(rawKey); + KeyFactory keyFactory = KeyFactory.getInstance(encryptionAlgorithm); + return keyFactory.generatePublic(keySpec); + } + catch ( NoSuchAlgorithmException e ) { + throw new ReviewedStingException(String.format("Could not find an implementation of the requested encryption algorithm %s", encryptionAlgorithm), e); + } + catch ( InvalidKeySpecException e ) { + throw new ReviewedStingException("Unable to use X.509 key specification to decode the given key", e); + } + } + + /** + * Reads in a private key created using the default encryption algorithm from a file. + * + * @param source File containing the private key + * @return The private key read + */ + public static PrivateKey readPrivateKey ( File source ) { + return decodePrivateKey(IOUtils.readFileIntoByteArray(source), DEFAULT_ENCRYPTION_ALGORITHM); + } + + /** + * Reads in a private key created using the default encryption algorithm from a stream. + * + * @param source Stream attached to the private key + * @return The private key read + */ + public static PrivateKey readPrivateKey ( InputStream source ) { + return decodePrivateKey(IOUtils.readStreamIntoByteArray(source), DEFAULT_ENCRYPTION_ALGORITHM); + } + + /** + * Decodes the raw bytes of a private key into a usable object. + * + * @param rawKey The encoded bytes of a private key as read from, eg., a file. The + * key must be in the standard PKCS #8 format for a private key. + * @param encryptionAlgorithm The encryption algorithm used to create the private key + * @return The private key as a usable object + */ + public static PrivateKey decodePrivateKey ( byte[] rawKey, String encryptionAlgorithm ) { + try { + KeySpec keySpec = new PKCS8EncodedKeySpec(rawKey); + KeyFactory keyFactory = KeyFactory.getInstance(encryptionAlgorithm); + return keyFactory.generatePrivate(keySpec); + } + catch ( NoSuchAlgorithmException e ) { + throw new ReviewedStingException(String.format("Could not find an implementation of the requested encryption algorithm %s", encryptionAlgorithm), e); + } + catch ( InvalidKeySpecException e ) { + throw new ReviewedStingException("Unable to use the PKCS #8 key specification to decode the given key", e); + } + } + + /** + * Loads the copy of the GATK public key that is distributed with the GATK. Uses the system + * ClassLoader to locate the public key file, which should be stored at the root of the GATK + * jar file. + * + * @return The GATK public key as a usable object + */ + public static PublicKey loadGATKDistributedPublicKey() { + InputStream publicKeyInputStream = ClassLoader.getSystemResourceAsStream(GATK_DISTRIBUTED_PUBLIC_KEY_FILE_NAME); + + if ( publicKeyInputStream == null ) { + throw new ReviewedStingException(String.format("Could not locate the GATK public key %s in the classpath", + GATK_DISTRIBUTED_PUBLIC_KEY_FILE_NAME)); + } + + return readPublicKey(publicKeyInputStream); + } + + /** + * Loads the master copy of the GATK private key. You must have the appropriate UNIX permissions + * to do this! + * + * @return The GATK master private key as a usable object + */ + public static PrivateKey loadGATKMasterPrivateKey() { + return readPrivateKey(new File(GATK_MASTER_PRIVATE_KEY_FILE)); + } + + /** + * Loads the master copy of the GATK public key. 
This should always be the same as the + * public key distributed with the GATK returned by loadGATKDistributedPublicKey(). + * + * @return The GATK master public key as a usable object + */ + public static PublicKey loadGATKMasterPublicKey() { + return readPublicKey(new File(GATK_MASTER_PUBLIC_KEY_FILE)); + } + + /** + * Encrypts the given data using the key provided. + * + * @param data The data to encrypt, as a byte array + * @param encryptKey The key with which to encrypt the data + * @return The encrypted version of the provided data + */ + public static byte[] encryptData ( byte[] data, Key encryptKey ) { + return transformDataUsingCipher(data, encryptKey, Cipher.ENCRYPT_MODE); + } + + /** + * Decrypts the given data using the key provided. + * + * @param encryptedData Data to decrypt, as a byte array + * @param decryptKey The key with which to decrypt the data + * @return The decrypted version of the provided data + */ + public static byte[] decryptData ( byte[] encryptedData, Key decryptKey ) { + return transformDataUsingCipher(encryptedData, decryptKey, Cipher.DECRYPT_MODE); + } + + /** + * Helper method for encryption/decryption that takes data and processes it using + * the given key + * + * @param data Data to encrypt/decrypt + * @param key Key to use to encrypt/decrypt the data + * @param cipherMode Specifies whether we are encrypting or decrypting + * @return The encrypted/decrypted data + */ + private static byte[] transformDataUsingCipher ( byte[] data, Key key, int cipherMode ) { + try { + Cipher cipher = Cipher.getInstance(key.getAlgorithm()); + cipher.init(cipherMode, key); + return cipher.doFinal(data); + } + catch ( NoSuchAlgorithmException e ) { + throw new ReviewedStingException(String.format("Could not find an implementation of the requested algorithm %s", + key.getAlgorithm()), e); + } + catch ( InvalidKeyException e ) { + throw new ReviewedStingException("Key is invalid", e); + } + catch ( GeneralSecurityException e ) { + throw new ReviewedStingException("Error during encryption", e); + } + } + + /** + * Tests whether the public/private keys provided can each decrypt data encrypted by + * the other key -- ie., tests whether these two keys are part of the same public/private + * key pair. + * + * @param privateKey The private key to test + * @param publicKey The public key to test + * @return True if the keys are part of the same key pair and can decrypt each other's + * encrypted data, otherwise false. + */ + public static boolean keysDecryptEachOther ( PrivateKey privateKey, PublicKey publicKey ) { + byte[] plainText = "Test PlainText".getBytes(); + + byte[] dataEncryptedUsingPrivateKey = CryptUtils.encryptData(plainText, privateKey); + byte[] dataEncryptedUsingPublicKey = CryptUtils.encryptData(plainText, publicKey); + + byte[] privateKeyDataDecryptedWithPublicKey = CryptUtils.decryptData(dataEncryptedUsingPrivateKey, publicKey); + byte[] publicKeyDataDecryptedWithPrivateKey = CryptUtils.decryptData(dataEncryptedUsingPublicKey, privateKey); + + // Make sure we actually transformed the data during encryption: + if ( Arrays.equals(plainText, dataEncryptedUsingPrivateKey) || + Arrays.equals(plainText, dataEncryptedUsingPublicKey) || + Arrays.equals(dataEncryptedUsingPrivateKey, dataEncryptedUsingPublicKey) ) { + return false; + } + + // Make sure that we were able to recreate the original plaintext using + // both the public key on the private-key-encrypted data and the private + // key on the public-key-encrypted data: + if ( ! 
Arrays.equals(plainText, privateKeyDataDecryptedWithPublicKey) || + ! Arrays.equals(plainText, publicKeyDataDecryptedWithPrivateKey) ) { + return false; + } + + return true; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/crypt/GATKKey.java b/public/java/src/org/broadinstitute/sting/utils/crypt/GATKKey.java new file mode 100644 index 0000000000..408cb56aab --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/crypt/GATKKey.java @@ -0,0 +1,349 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.crypt; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.io.IOUtils; + +import java.io.*; +import java.security.*; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; + +/** + * Class to represent a GATK user key. + * + * A GATK user key contains an email address and a cryptographic signature. + * The signature is the SHA-1 hash of the email address encrypted using + * the GATK master private key. The GATK master public key (distributed + * with the GATK) is used to decrypt the signature and validate the key + * at the start of each GATK run that requires a key. + * + * Keys are cryptographically secure in that valid keys definitely come + * from us and cannot be fabricated, however nothing prevents keys from + * being shared between users. + * + * GATK user keys have the following on-disk format: + * + * GZIP Container: + * Email address + * NUL byte (delimiter) + * Cryptographic Signature (encrypted SHA-1 hash of email address) + * + * The key data is wrapped within a GZIP container to placate over-zealous + * email filters (since keys must often be emailed) and also to provide an + * additional integrity check via the built-in GZIP CRC. + * + * @author David Roazen + */ +public class GATKKey { + + /** + * Private key used to sign the GATK key. Required only when creating a new + * key from scratch, not when loading an existing key from disk. + */ + private PrivateKey privateKey; + + /** + * Public key used to validate the GATK key. + */ + private PublicKey publicKey; + + /** + * The user's email address, stored within the key and signed. + */ + private String emailAddress; + + /** + * The cryptographic signature of the email address. 
By default, this is + * the SHA-1 hash of the email address encrypted using the RSA algorithm. + */ + private byte[] signature; + + /** + * The combination of hash/encryption algorithms to use to generate the signature. + * By default this is "SHA1withRSA" + */ + private String signingAlgorithm; + + /** + * Default hash/encryption algorithms to use to sign the key. + */ + public static final String DEFAULT_SIGNING_ALGORITHM = "SHA1withRSA"; + + /** + * Byte value used to separate the email address from its signature in the key file. + */ + public static final byte GATK_KEY_SECTIONAL_DELIMITER = 0; + + + // ----------------------- + // Constructors: + // ----------------------- + + /** + * Constructor to create a new GATK key from scratch using an email address + * and public/private key pair. The private key is used for signing, and the + * public key is used to validate the newly-created key. + * + * @param privateKey Private key used to sign the new GATK key + * @param publicKey Public key used to validate the new GATK key + * @param emailAddress The user's email address, which we will store in the key and sign + */ + public GATKKey ( PrivateKey privateKey, PublicKey publicKey, String emailAddress ) { + this(privateKey, publicKey, emailAddress, DEFAULT_SIGNING_ALGORITHM); + } + + /** + * Constructor to create a new GATK key from scratch using an email address + * and public/private key pair, and additionally specify the signing algorithm + * to use. The private key is used for signing, and the public key is used to + * validate the newly-created key. + * + * @param privateKey Private key used to sign the new GATK key + * @param publicKey Public key used to validate the new GATK key + * @param emailAddress The user's email address, which we will store in the key and sign + * @param signingAlgorithm The combination of hash and encryption algorithms to use to sign the key + */ + public GATKKey ( PrivateKey privateKey, PublicKey publicKey, String emailAddress, String signingAlgorithm ) { + if ( privateKey == null || publicKey == null || emailAddress == null || emailAddress.length() == 0 || signingAlgorithm == null ) { + throw new ReviewedStingException("Cannot construct GATKKey using null/empty arguments"); + } + + this.privateKey = privateKey; + this.publicKey = publicKey; + this.emailAddress = emailAddress; + this.signingAlgorithm = signingAlgorithm; + + validateEmailAddress(); + generateSignature(); + + if ( ! isValid() ) { + throw new ReviewedStingException("Newly-generated GATK key fails validation -- this should never happen!"); + } + } + + /** + * Constructor to load an existing GATK key from a file. + * + * During loading, the key file is checked for integrity, but not cryptographic + * validity (which must be done through a subsequent call to isValid()). + * + * @param publicKey Public key that will be used to validate the loaded GATK key + * in subsequent calls to isValid() + * @param keyFile File containing the GATK key to load + */ + public GATKKey ( PublicKey publicKey, File keyFile ) { + this(publicKey, keyFile, DEFAULT_SIGNING_ALGORITHM); + } + + /** + * Constructor to load an existing GATK key from a file, and additionally specify + * the signing algorithm used to sign the key being loaded. + * + * During loading, the key file is checked for integrity, but not cryptographic + * validity (which must be done through a subsequent call to isValid()). 
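A minimal end-to-end sketch of the signing constructor above, paired with the file-loading constructor (email address and file name are hypothetical; writeKey() and isValid() are defined later in this file):

    KeyPair pair = CryptUtils.generateKeyPair();
    GATKKey key = new GATKKey(pair.getPrivate(), pair.getPublic(), "user@example.org");
    key.writeKey(new File("user_example.org.key"));

    // Reloading for validation requires only the public half of the pair:
    GATKKey reloaded = new GATKKey(pair.getPublic(), new File("user_example.org.key"));
    boolean ok = reloaded.isValid(); // true: the signature matches the stored email address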
+ * + * @param publicKey Public key that will be used to validate the loaded GATK key + * in subsequent calls to isValid() + * @param keyFile File containing the GATK key to load + * @param signingAlgorithm The combination of hash and encryption algorithms used to sign the key + */ + public GATKKey ( PublicKey publicKey, File keyFile, String signingAlgorithm ) { + if ( publicKey == null || keyFile == null || signingAlgorithm == null ) { + throw new ReviewedStingException("Cannot construct GATKKey using null arguments"); + } + + this.publicKey = publicKey; + this.signingAlgorithm = signingAlgorithm; + + readKey(keyFile); + } + + // ----------------------- + // Public API Methods: + // ----------------------- + + /** + * Writes out this key to a file in the format described at the top of this class, + * encapsulating the key within a GZIP container. + * + * @param destination File to write the key to + */ + public void writeKey ( File destination ) { + try { + byte[] keyBytes = marshalKeyData(); + IOUtils.writeByteArrayToStream(keyBytes, new GZIPOutputStream(new FileOutputStream(destination))); + } + catch ( IOException e ) { + throw new UserException.CouldNotCreateOutputFile(destination, e); + } + } + + /** + * Checks whether the signature of this key is cryptographically valid (ie., can be + * decrypted by the public key to produce a valid SHA-1 hash of the email address + * in the key). + * + * @return True if the key's signature passes validation, otherwise false + */ + public boolean isValid() { + try { + Signature sig = Signature.getInstance(signingAlgorithm); + sig.initVerify(publicKey); + sig.update(emailAddress.getBytes()); + return sig.verify(signature); + } + catch ( NoSuchAlgorithmException e ) { + throw new ReviewedStingException(String.format("Signing algorithm %s not found", signingAlgorithm), e); + } + catch ( InvalidKeyException e ) { + // If the GATK public key is invalid, it's likely our problem, not the user's: + throw new ReviewedStingException(String.format("Public key %s is invalid", publicKey), e); + } + catch ( SignatureException e ) { + throw new UserException.UnreadableKeyException("Signature is invalid or signing algorithm was unable to process the input data", e); + } + } + + // ----------------------- + // Private Helper Methods: + // ----------------------- + + /** + * Helper method that creates a signature for this key using the combination of + * hash/encryption algorithms specified at construction time. + */ + private void generateSignature() { + try { + Signature sig = Signature.getInstance(signingAlgorithm); + sig.initSign(privateKey, CryptUtils.createRandomnessSource()); + sig.update(emailAddress.getBytes()); + signature = sig.sign(); + } + catch ( NoSuchAlgorithmException e ) { + throw new ReviewedStingException(String.format("Signing algorithm %s not found", signingAlgorithm), e); + } + catch ( InvalidKeyException e ) { + throw new ReviewedStingException(String.format("Private key %s is invalid", privateKey), e); + } + catch ( SignatureException e ) { + throw new ReviewedStingException(String.format("Error creating signature for email address %s", emailAddress), e); + } + } + + /** + * Helper method that reads in a GATK key from a file. Should not be called directly -- + * use the appropriate constructor above. 
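generateSignature() and isValid() above are the two halves of the standard java.security.Signature protocol. Stripped of the GATK exception translation, that round trip looks like this (key pair and email string assumed from the surrounding code):

    Signature signer = Signature.getInstance("SHA1withRSA");
    signer.initSign(privateKey);
    signer.update(emailAddress.getBytes());
    byte[] sig = signer.sign();

    Signature verifier = Signature.getInstance("SHA1withRSA");
    verifier.initVerify(publicKey);
    verifier.update(emailAddress.getBytes());
    boolean valid = verifier.verify(sig); // false if the data or the signature was altered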
+ * + * @param source File to read the key from + */ + private void readKey ( File source ) { + try { + byte[] keyBytes = IOUtils.readStreamIntoByteArray(new GZIPInputStream(new FileInputStream(source))); + + // As a sanity check, compare the number of bytes read to the uncompressed file size + // stored in the GZIP ISIZE field. If they don't match, the key must be corrupt: + if ( keyBytes.length != IOUtils.getGZIPFileUncompressedSize(source) ) { + throw new UserException.UnreadableKeyException("Number of bytes read does not match the uncompressed size specified in the GZIP ISIZE field"); + } + + unmarshalKeyData(keyBytes); + } + catch ( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(source, e); + } + catch ( IOException e ) { + throw new UserException.UnreadableKeyException(source, e); + } + catch ( UserException.CouldNotReadInputFile e ) { + throw new UserException.UnreadableKeyException(source, e); + } + } + + /** + * Helper method that assembles the email address and signature into a format + * suitable for writing to disk. + * + * @return The aggregated key data, ready to be written to disk + */ + private byte[] marshalKeyData() { + byte[] emailAddressBytes = emailAddress.getBytes(); + byte[] assembledKey = new byte[emailAddressBytes.length + 1 + signature.length]; + + System.arraycopy(emailAddressBytes, 0, assembledKey, 0, emailAddressBytes.length); + assembledKey[emailAddressBytes.length] = GATK_KEY_SECTIONAL_DELIMITER; + System.arraycopy(signature, 0, assembledKey, emailAddressBytes.length + 1, signature.length); + + return assembledKey; + } + + /** + * Helper method that parses the raw key data from disk into its component + * email address and signature. Performs some basic validation in the process. + * + * @param keyBytes The raw, uncompressed key data read from disk + */ + private void unmarshalKeyData ( byte[] keyBytes ) { + int delimiterPosition = -1; + + for ( int i = 0; i < keyBytes.length; i++ ) { + if ( keyBytes[i] == GATK_KEY_SECTIONAL_DELIMITER ) { + delimiterPosition = i; + break; + } + } + + if ( delimiterPosition == -1 ) { + throw new UserException.UnreadableKeyException("Malformed GATK key contains no sectional delimiter"); + } + else if ( delimiterPosition == 0 ) { + throw new UserException.UnreadableKeyException("Malformed GATK key contains no email address"); + } + else if ( delimiterPosition == keyBytes.length - 1 ) { + throw new UserException.UnreadableKeyException("Malformed GATK key contains no signature"); + } + + byte[] emailAddressBytes = new byte[delimiterPosition]; + System.arraycopy(keyBytes, 0, emailAddressBytes, 0, delimiterPosition); + emailAddress = new String(emailAddressBytes); + + signature = new byte[keyBytes.length - delimiterPosition - 1]; + System.arraycopy(keyBytes, delimiterPosition + 1, signature, 0, keyBytes.length - delimiterPosition - 1); + } + + /** + * Helper method that ensures that the user's email address does not contain the NUL byte, which we + * reserve as a delimiter within each key file. 
+ */ + private void validateEmailAddress() { + for ( byte b : emailAddress.getBytes() ) { + if ( b == GATK_KEY_SECTIONAL_DELIMITER ) { + throw new UserException(String.format("Email address must not contain a byte with value %d", GATK_KEY_SECTIONAL_DELIMITER)); + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 2ece3b077d..d625cec202 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -30,6 +30,7 @@ import net.sf.samtools.SAMSequenceRecord; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.File; @@ -132,6 +133,10 @@ public CouldNotReadInputFile(File file, String message, Exception e) { public CouldNotReadInputFile(File file, Exception e) { this(file, e.getMessage()); } + + public CouldNotReadInputFile(String message) { + super(message); + } } @@ -151,6 +156,10 @@ public CouldNotCreateOutputFile(String filename, String message, Exception e) { public CouldNotCreateOutputFile(File file, Exception e) { super(String.format("Couldn't write file %s because exception %s", file.getAbsolutePath(), e.getMessage())); } + + public CouldNotCreateOutputFile(String message, Exception e) { + super(message, e); + } } public static class MissortedBAM extends UserException { @@ -265,7 +274,7 @@ public DeprecatedArgument(String param, String doc) { public static class IncompatibleSequenceDictionaries extends UserException { public IncompatibleSequenceDictionaries(String message, String name1, SAMSequenceDictionary dict1, String name2, SAMSequenceDictionary dict2) { super(String.format("Input files %s and %s have incompatible contigs: %s.\n %s contigs = %s\n %s contigs = %s", - name1, name2, message, name1, prettyPrintSequenceRecords(dict1), name2, prettyPrintSequenceRecords(dict2))); + name1, name2, message, name1, ReadUtils.prettyPrintSequenceRecords(dict1), name2, ReadUtils.prettyPrintSequenceRecords(dict2))); } } @@ -276,17 +285,11 @@ public LexicographicallySortedSequenceDictionary(String name, SAMSequenceDiction + "\nThis is because all distributed GATK resources are sorted in karyotypic order, and your processing will fail when you need to use these files." 
+ "\nYou can use the ReorderSam utility to fix this problem: http://www.broadinstitute.org/gsa/wiki/index.php/ReorderSam" + "\n %s contigs = %s", - name, name, prettyPrintSequenceRecords(dict))); + name, name, ReadUtils.prettyPrintSequenceRecords(dict))); } } - private static String prettyPrintSequenceRecords(SAMSequenceDictionary sequenceDictionary) { - String[] sequenceRecordNames = new String[sequenceDictionary.size()]; - int sequenceRecordIndex = 0; - for (SAMSequenceRecord sequenceRecord : sequenceDictionary.getSequences()) - sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName(); - return Arrays.deepToString(sequenceRecordNames); - } + public static class MissingWalker extends UserException { public MissingWalker(String walkerName, String message) { @@ -319,4 +322,32 @@ public CouldNotCreateReferenceIndexFileBecauseOfLock(File f) { "and try again.", null); } } + + public static class UnreadableKeyException extends UserException { + public UnreadableKeyException ( File f, Exception e ) { + super(String.format("Key file %s cannot be read (possibly the key file is corrupt?). Error was: %s. " + + "Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home for help.", + f.getAbsolutePath(), e.getMessage())); + } + + public UnreadableKeyException ( String message, Exception e ) { + this(String.format("%s. Error was: %s", message, e.getMessage())); + } + + public UnreadableKeyException ( String message ) { + super(String.format("Key file cannot be read (possibly the key file is corrupt?): %s. " + + "Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home for help.", + message)); + } + } + + public static class KeySignatureVerificationException extends UserException { + public KeySignatureVerificationException ( File f ) { + super(String.format("The signature in key file %s failed cryptographic verification. " + + "If this key was valid in the past, it's likely been revoked. 
" + + "Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home " + + "for help.", + f.getAbsolutePath())); + } + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java index 43ef4aa741..44b586bcd2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java +++ b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java @@ -167,7 +167,7 @@ public ReferenceSequence getSubsequenceAt( String contig, long start, long stop if ( start < myCache.start || stop > myCache.stop || myCache.seq == null || myCache.seq.getContigIndex() != contigInfo.getSequenceIndex() ) { cacheMisses++; myCache.start = Math.max(start - cacheMissBackup, 0); - myCache.stop = Math.min(myCache.start + cacheSize, contigInfo.getSequenceLength()); + myCache.stop = Math.min(start + cacheSize + cacheMissBackup, contigInfo.getSequenceLength()); myCache.seq = super.getSubsequenceAt(contig, myCache.start, myCache.stop); //System.out.printf("New cache at %s %d-%d%n", contig, cacheStart, cacheStop); } else { diff --git a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java index e5500ca213..eea45567f9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java @@ -1,9 +1,16 @@ package org.broadinstitute.sting.utils.fragments; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalDataManager; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; import java.util.*; @@ -121,4 +128,87 @@ public final static FragmentCollection create(List return create(reads, reads.size(), SamRecordGetter); } + public final static List mergeOverlappingPairedFragments( List overlappingPair ) { + final byte MIN_QUAL_BAD_OVERLAP = 16; + if( overlappingPair.size() != 2 ) { throw new ReviewedStingException("Found overlapping pair with " + overlappingPair.size() + " reads, but expecting exactly 2."); } + + GATKSAMRecord firstRead = overlappingPair.get(0); + GATKSAMRecord secondRead = overlappingPair.get(1); + if( !(secondRead.getUnclippedStart() <= firstRead.getUnclippedEnd() && secondRead.getUnclippedStart() >= firstRead.getUnclippedStart() && secondRead.getUnclippedEnd() >= firstRead.getUnclippedEnd()) ) { + firstRead = overlappingPair.get(1); + secondRead = overlappingPair.get(0); + } + if( !(secondRead.getUnclippedStart() <= firstRead.getUnclippedEnd() && secondRead.getUnclippedStart() >= firstRead.getUnclippedStart() && secondRead.getUnclippedEnd() >= firstRead.getUnclippedEnd()) ) { + return overlappingPair; // can't merge them, yet: AAAAAAAAAAA-BBBBBBBBBBB-AAAAAAAAAAAAAA, B is contained entirely inside A + } + if( firstRead.getCigarString().contains("I") || firstRead.getCigarString().contains("D") || secondRead.getCigarString().contains("I") || 
secondRead.getCigarString().contains("D") ) { + return overlappingPair; // fragments contain indels so don't merge them + } + + final Pair pair = ReadUtils.getReadCoordinateForReferenceCoordinate(firstRead, secondRead.getSoftStart()); + + final int firstReadStop = ( pair.getSecond() ? pair.getFirst() + 1 : pair.getFirst() ); + final int numBases = firstReadStop + secondRead.getReadLength(); + final byte[] bases = new byte[numBases]; + final byte[] quals = new byte[numBases]; + final byte[] insertionQuals = new byte[numBases]; + final byte[] deletionQuals = new byte[numBases]; + final byte[] firstReadBases = firstRead.getReadBases(); + final byte[] firstReadQuals = firstRead.getBaseQualities(); + final byte[] secondReadBases = secondRead.getReadBases(); + final byte[] secondReadQuals = secondRead.getBaseQualities(); + + for(int iii = 0; iii < firstReadStop; iii++) { + bases[iii] = firstReadBases[iii]; + quals[iii] = firstReadQuals[iii]; + } + for(int iii = firstReadStop; iii < firstRead.getReadLength(); iii++) { + if( firstReadQuals[iii] > MIN_QUAL_BAD_OVERLAP && secondReadQuals[iii-firstReadStop] > MIN_QUAL_BAD_OVERLAP && firstReadBases[iii] != secondReadBases[iii-firstReadStop] ) { + return overlappingPair;// high qual bases don't match exactly, probably indel in only one of the fragments, so don't merge them + } + bases[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadBases[iii] : secondReadBases[iii-firstReadStop] ); + quals[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadQuals[iii] : secondReadQuals[iii-firstReadStop] ); + } + for(int iii = firstRead.getReadLength(); iii < numBases; iii++) { + bases[iii] = secondReadBases[iii-firstReadStop]; + quals[iii] = secondReadQuals[iii-firstReadStop]; + } + + final GATKSAMRecord returnRead = new GATKSAMRecord(firstRead.getHeader()); + returnRead.setAlignmentStart(firstRead.getUnclippedStart()); + returnRead.setReadBases( bases ); + returnRead.setBaseQualities( quals ); + returnRead.setReadGroup( firstRead.getReadGroup() ); + returnRead.setReferenceName( firstRead.getReferenceName() ); + final CigarElement c = new CigarElement(bases.length, CigarOperator.M); + final ArrayList cList = new ArrayList(); + cList.add(c); + returnRead.setCigar( new Cigar( cList )); + returnRead.setMappingQuality( firstRead.getMappingQuality() ); + + if( firstRead.hasBaseIndelQualities() || secondRead.hasBaseIndelQualities() ) { + final byte[] firstReadInsertionQuals = firstRead.getBaseInsertionQualities(); + final byte[] firstReadDeletionQuals = firstRead.getBaseDeletionQualities(); + final byte[] secondReadInsertionQuals = secondRead.getBaseInsertionQualities(); + final byte[] secondReadDeletionQuals = secondRead.getBaseDeletionQualities(); + for(int iii = 0; iii < firstReadStop; iii++) { + insertionQuals[iii] = firstReadInsertionQuals[iii]; + deletionQuals[iii] = firstReadDeletionQuals[iii]; + } + for(int iii = firstReadStop; iii < firstRead.getReadLength(); iii++) { + insertionQuals[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadInsertionQuals[iii] : secondReadInsertionQuals[iii-firstReadStop] ); // Purposefully checking the highest *base* quality score + deletionQuals[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? 
firstReadDeletionQuals[iii] : secondReadDeletionQuals[iii-firstReadStop] ); // Purposefully checking the highest *base* quality score + } + for(int iii = firstRead.getReadLength(); iii < numBases; iii++) { + insertionQuals[iii] = secondReadInsertionQuals[iii-firstReadStop]; + deletionQuals[iii] = secondReadDeletionQuals[iii-firstReadStop]; + } + returnRead.setBaseQualities( insertionQuals, RecalDataManager.BaseRecalibrationType.BASE_INSERTION ); + returnRead.setBaseQualities( deletionQuals, RecalDataManager.BaseRecalibrationType.BASE_DELETION ); + } + + final ArrayList returnList = new ArrayList(); + returnList.add(returnRead); + return returnList; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index f8655f74a5..ea1eaeb514 100644 --- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -314,10 +314,10 @@ else if (file.exists()) * @param reference The reference for the intervals. * @return A map of contig names with their sizes. */ - public static Map getContigSizes(File reference) { + public static Map getContigSizes(File reference) { ReferenceDataSource referenceSource = new ReferenceDataSource(reference); List locs = GenomeLocSortedSet.createSetFromSequenceDictionary(referenceSource.getReference().getSequenceDictionary()).toList(); - Map lengths = new LinkedHashMap(); + Map lengths = new LinkedHashMap(); for (GenomeLoc loc: locs) lengths.put(loc.getContig(), loc.size()); return lengths; diff --git a/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java b/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java index b3fdb93d30..160df0e510 100644 --- a/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -29,14 +29,18 @@ import org.apache.commons.io.LineIterator; import org.apache.commons.lang.StringUtils; import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.UserException; import java.io.*; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.util.*; public class IOUtils { private static Logger logger = Logger.getLogger(IOUtils.class); + private static final File DEV_DIR = new File("/dev"); /** * Checks if the temp directory has been setup and throws an exception if they user hasn't set it correctly. @@ -301,12 +305,17 @@ public static List tail(File file, int count) throws IOException { } /** - * Tries to delete a file. Emits a warning if the file was unable to be deleted. + * Tries to delete a file. Emits a warning if the file + * is not a special file and was unable to be deleted. * * @param file File to delete. * @return true if the file was deleted. 
*/ public static boolean tryDelete(File file) { + if (isSpecialFile(file)) { + logger.debug("Not trying to delete " + file); + return false; + } boolean deleted = FileUtils.deleteQuietly(file); if (deleted) logger.debug("Deleted " + file); @@ -385,4 +394,182 @@ public static LineIterator lineIterator(File file) { } } + + /** + * Returns true if the file is a special file. + * @param file File path to check. + * @return true if the file is a special file. + */ + public static boolean isSpecialFile(File file) { + return file != null && (file.getAbsolutePath().startsWith("/dev/") || file.equals(DEV_DIR)); + } + + /** + * Reads the entirety of the given file into a byte array. Uses a read buffer size of 4096 bytes. + * + * @param source File to read + * @return The contents of the file as a byte array + */ + public static byte[] readFileIntoByteArray ( File source ) { + return readFileIntoByteArray(source, 4096); + } + + /** + * Reads the entirety of the given file into a byte array using the requested read buffer size. + * + * @param source File to read + * @param readBufferSize Number of bytes to read in at one time + * @return The contents of the file as a byte array + */ + public static byte[] readFileIntoByteArray ( File source, int readBufferSize ) { + if ( source == null ) { + throw new ReviewedStingException("Source file was null"); + } + + byte[] fileContents; + + try { + fileContents = readStreamIntoByteArray(new FileInputStream(source), readBufferSize); + } + catch ( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(source, e); + } + + if ( fileContents.length != source.length() ) { + throw new UserException.CouldNotReadInputFile(String.format("Unable to completely read file %s: read only %d/%d bytes", + source.getAbsolutePath(), fileContents.length, source.length())); + } + + return fileContents; + } + + /** + * Reads all data from the given stream into a byte array. Uses a read buffer size of 4096 bytes. + * + * @param in Stream to read data from + * @return The contents of the stream as a byte array + */ + public static byte[] readStreamIntoByteArray ( InputStream in ) { + return readStreamIntoByteArray(in, 4096); + } + + /** + * Reads all data from the given stream into a byte array using the requested read buffer size. 
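Usage of the readers in this file is symmetric; the file variant additionally checks the byte count against File.length(). A brief sketch (path hypothetical; both calls close the underlying stream when done):

    static void demo() throws IOException {
        byte[] wholeFile  = IOUtils.readFileIntoByteArray(new File("key.data"));                    // default 4096-byte read buffer
        byte[] fromStream = IOUtils.readStreamIntoByteArray(new FileInputStream("key.data"), 8192); // custom buffer size
    }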
+ * + * @param in Stream to read data from + * @param readBufferSize Number of bytes to read in at one time + * @return The contents of the stream as a byte array + */ + public static byte[] readStreamIntoByteArray ( InputStream in, int readBufferSize ) { + if ( in == null ) { + throw new ReviewedStingException("Input stream was null"); + } + else if ( readBufferSize <= 0 ) { + throw new ReviewedStingException("Read buffer size must be > 0"); + } + + // Use a fixed-size buffer for each read, but a dynamically-growing buffer + // to hold the accumulated contents of the file/stream: + byte[] readBuffer = new byte[readBufferSize]; + ByteArrayOutputStream fileBuffer = new ByteArrayOutputStream(readBufferSize * 4); + + try { + try { + int currentBytesRead; + + while ( (currentBytesRead = in.read(readBuffer, 0, readBuffer.length)) >= 0 ) { + fileBuffer.write(readBuffer, 0, currentBytesRead); + } + } + finally { + in.close(); + } + } + catch ( IOException e ) { + throw new UserException.CouldNotReadInputFile("I/O error reading from input stream", e); + } + + return fileBuffer.toByteArray(); + } + + /** + * Writes the given array of bytes to a file + * + * @param bytes Data to write + * @param destination File to write the data to + */ + public static void writeByteArrayToFile ( byte[] bytes, File destination ) { + if ( destination == null ) { + throw new ReviewedStingException("Destination file was null"); + } + + try { + writeByteArrayToStream(bytes, new FileOutputStream(destination)); + } + catch ( FileNotFoundException e ) { + throw new UserException.CouldNotCreateOutputFile(destination, e); + } + } + + /** + * Writes the given array of bytes to a stream + * + * @param bytes Data to write + * @param out Stream to write the data to + */ + public static void writeByteArrayToStream ( byte[] bytes, OutputStream out ) { + if ( bytes == null || out == null ) { + throw new ReviewedStingException("Data to write or output stream was null"); + } + + try { + try { + out.write(bytes); + } + finally { + out.close(); + } + } + catch ( IOException e ) { + throw new UserException.CouldNotCreateOutputFile("I/O error writing to output stream", e); + } + } + + /** + * Determines the uncompressed size of a GZIP file. Uses the GZIP ISIZE field in the last + * 4 bytes of the file to get this information. + * + * @param gzipFile GZIP-format file whose uncompressed size to determine + * @return The uncompressed size (in bytes) of the GZIP file + */ + public static int getGZIPFileUncompressedSize ( File gzipFile ) { + if ( gzipFile == null ) { + throw new ReviewedStingException("GZIP file to examine was null"); + } + + try { + // The GZIP ISIZE field holds the uncompressed size of the compressed data. 
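+            // (It is stored little-endian; for example, a member that inflates to
+            //  1,000,000 bytes carries the ISIZE bytes 0x40 0x42 0x0F 0x00, i.e.
+            //  0x000F4240 == 1,000,000.)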
+ // It occupies the last 4 bytes of any GZIP file: + RandomAccessFile in = new RandomAccessFile(gzipFile, "r"); + in.seek(gzipFile.length() - 4); + byte[] sizeBytes = new byte[4]; + in.read(sizeBytes, 0, 4); + in.close(); // don't leak the file handle + + ByteBuffer byteBuf = ByteBuffer.wrap(sizeBytes); + byteBuf.order(ByteOrder.LITTLE_ENDIAN); // The GZIP spec mandates little-endian byte order + int uncompressedSize = byteBuf.getInt(); + + // If the size read in is negative, we've overflowed our signed integer: + if ( uncompressedSize < 0 ) { + throw new UserException.CouldNotReadInputFile(String.format("Cannot accurately determine the uncompressed size of file %s " + + "because it's either larger than %d bytes or the GZIP ISIZE field is corrupt", + gzipFile.getAbsolutePath(), Integer.MAX_VALUE)); + } + + return uncompressedSize; + } + catch ( IOException e ) { + throw new UserException.CouldNotReadInputFile(gzipFile, e); + } + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index 18051ce92d..7c2a67aba0 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -40,7 +40,7 @@ * @author mhanna * @version 0.1 */ -public abstract class AbstractReadBackedPileup,PE extends PileupElement> implements ReadBackedPileup { +public abstract class AbstractReadBackedPileup, PE extends PileupElement> implements ReadBackedPileup { protected final GenomeLoc loc; protected final PileupElementTracker pileupElementTracker; @@ -55,23 +55,18 @@ public abstract class AbstractReadBackedPileup reads, List offsets ) { + public AbstractReadBackedPileup(GenomeLoc loc, List reads, List offsets) { this.loc = loc; - this.pileupElementTracker = readsOffsets2Pileup(reads,offsets); + this.pileupElementTracker = readsOffsets2Pileup(reads, offsets); } - public AbstractReadBackedPileup(GenomeLoc loc, List reads, int offset ) { - this.loc = loc; - this.pileupElementTracker = readsOffsets2Pileup(reads,offset); - } /** * Create a new version of a read backed pileup at loc without any aligned reads - * */ public AbstractReadBackedPileup(GenomeLoc loc) { this(loc, new UnifiedPileupElementTracker()); @@ -81,11 +76,10 @@ public AbstractReadBackedPileup(GenomeLoc loc) { * Create a new version of a read backed pileup at loc, using the reads and their corresponding * offsets. This lower level constructor assumes pileup is well-formed and merely keeps a * pointer to pileup. Don't go changing the data in pileup.
- * */ public AbstractReadBackedPileup(GenomeLoc loc, List pileup) { - if ( loc == null ) throw new ReviewedStingException("Illegal null genomeloc in ReadBackedPileup"); - if ( pileup == null ) throw new ReviewedStingException("Illegal null pileup in ReadBackedPileup"); + if (loc == null) throw new ReviewedStingException("Illegal null genomeloc in ReadBackedPileup"); + if (pileup == null) throw new ReviewedStingException("Illegal null pileup in ReadBackedPileup"); this.loc = loc; this.pileupElementTracker = new UnifiedPileupElementTracker(pileup); @@ -94,12 +88,13 @@ public AbstractReadBackedPileup(GenomeLoc loc, List pileup) { /** * Optimization of above constructor where all of the cached data is provided + * * @param loc * @param pileup */ public AbstractReadBackedPileup(GenomeLoc loc, List pileup, int size, int nDeletions, int nMQ0Reads) { - if ( loc == null ) throw new ReviewedStingException("Illegal null genomeloc in UnifiedReadBackedPileup"); - if ( pileup == null ) throw new ReviewedStingException("Illegal null pileup in UnifiedReadBackedPileup"); + if (loc == null) throw new ReviewedStingException("Illegal null genomeloc in UnifiedReadBackedPileup"); + if (pileup == null) throw new ReviewedStingException("Illegal null pileup in UnifiedReadBackedPileup"); this.loc = loc; this.pileupElementTracker = new UnifiedPileupElementTracker(pileup); @@ -115,16 +110,21 @@ protected AbstractReadBackedPileup(GenomeLoc loc, PileupElementTracker track calculateCachedData(); } - protected AbstractReadBackedPileup(GenomeLoc loc, Map> pileupsBySample) { + protected AbstractReadBackedPileup(GenomeLoc loc, Map> pileupsBySample) { this.loc = loc; PerSamplePileupElementTracker tracker = new PerSamplePileupElementTracker(); - for(Map.Entry> pileupEntry: pileupsBySample.entrySet()) { - tracker.addElements(pileupEntry.getKey(),pileupEntry.getValue().pileupElementTracker); + for (Map.Entry> pileupEntry : pileupsBySample.entrySet()) { + tracker.addElements(pileupEntry.getKey(), pileupEntry.getValue().pileupElementTracker); addPileupToCumulativeStats(pileupEntry.getValue()); } this.pileupElementTracker = tracker; } + public AbstractReadBackedPileup(GenomeLoc loc, List reads, int offset) { + this.loc = loc; + this.pileupElementTracker = readsOffsets2Pileup(reads, offset); + } + /** * Calculate cached sizes, nDeletion, and base counts for the pileup. 
This calculation is done upfront, * so you pay the cost at the start, but it's more efficient to do this rather than pay the cost of calling @@ -135,12 +135,12 @@ protected void calculateCachedData() { nDeletions = 0; nMQ0Reads = 0; - for ( PileupElement p : pileupElementTracker ) { + for (PileupElement p : pileupElementTracker) { size++; - if ( p.isDeletion() ) { + if (p.isDeletion()) { nDeletions++; } - if ( p.getRead().getMappingQuality() == 0 ) { + if (p.getRead().getMappingQuality() == 0) { nMQ0Reads++; } } @@ -148,12 +148,12 @@ protected void calculateCachedData() { protected void calculateAbstractSize() { abstractSize = 0; - for ( PileupElement p : pileupElementTracker ) { + for (PileupElement p : pileupElementTracker) { abstractSize += p.getRepresentativeCount(); } } - protected void addPileupToCumulativeStats(AbstractReadBackedPileup pileup) { + protected void addPileupToCumulativeStats(AbstractReadBackedPileup pileup) { size += pileup.getNumberOfElements(); abstractSize += pileup.depthOfCoverage(); nDeletions += pileup.getNumberOfDeletions(); @@ -167,14 +167,17 @@ protected void addPileupToCumulativeStats(AbstractReadBackedPileup pileu * @param offsets * @return */ - private PileupElementTracker readsOffsets2Pileup(List reads, List offsets ) { - if ( reads == null ) throw new ReviewedStingException("Illegal null read list in UnifiedReadBackedPileup"); - if ( offsets == null ) throw new ReviewedStingException("Illegal null offsets list in UnifiedReadBackedPileup"); - if ( reads.size() != offsets.size() ) throw new ReviewedStingException("Reads and offset lists have different sizes!"); + private PileupElementTracker readsOffsets2Pileup(List reads, List offsets) { + if (reads == null) throw new ReviewedStingException("Illegal null read list in UnifiedReadBackedPileup"); + if (offsets == null) throw new ReviewedStingException("Illegal null offsets list in UnifiedReadBackedPileup"); + if (reads.size() != offsets.size()) + throw new ReviewedStingException("Reads and offset lists have different sizes!"); UnifiedPileupElementTracker pileup = new UnifiedPileupElementTracker(); - for ( int i = 0; i < reads.size(); i++ ) { - pileup.add(createNewPileupElement(reads.get(i),offsets.get(i))); + for (int i = 0; i < reads.size(); i++) { + GATKSAMRecord read = reads.get(i); + int offset = offsets.get(i); + pileup.add(createNewPileupElement(read, offset, false, false, false, false)); // only used to create fake pileups for testing so ancillary information is not important } return pileup; @@ -187,20 +190,22 @@ private PileupElementTracker readsOffsets2Pileup(List reads, * @param offset * @return */ - private PileupElementTracker readsOffsets2Pileup(List reads, int offset ) { - if ( reads == null ) throw new ReviewedStingException("Illegal null read list in UnifiedReadBackedPileup"); - if ( offset < 0 ) throw new ReviewedStingException("Illegal offset < 0 UnifiedReadBackedPileup"); + private PileupElementTracker readsOffsets2Pileup(List reads, int offset) { + if (reads == null) throw new ReviewedStingException("Illegal null read list in UnifiedReadBackedPileup"); + if (offset < 0) throw new ReviewedStingException("Illegal offset < 0 UnifiedReadBackedPileup"); UnifiedPileupElementTracker pileup = new UnifiedPileupElementTracker(); - for ( int i = 0; i < reads.size(); i++ ) { - pileup.add(createNewPileupElement( reads.get(i), offset )); + for (GATKSAMRecord read : reads) { + pileup.add(createNewPileupElement(read, offset, false, false, false, false)); // only used to create fake pileups for testing so 
ancillary information is not important } return pileup; } - protected abstract AbstractReadBackedPileup createNewPileup(GenomeLoc loc, PileupElementTracker pileupElementTracker); - protected abstract PE createNewPileupElement(GATKSAMRecord read, int offset); + protected abstract AbstractReadBackedPileup createNewPileup(GenomeLoc loc, PileupElementTracker pileupElementTracker); + + protected abstract PE createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip); + protected abstract PE createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip, String nextEventBases, int nextEventLength ); // -------------------------------------------------------- // @@ -217,32 +222,31 @@ private PileupElementTracker readsOffsets2Pileup(List reads, */ @Override public RBP getPileupWithoutDeletions() { - if ( getNumberOfDeletions() > 0 ) { - if(pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; + if (getNumberOfDeletions() > 0) { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(final String sample: tracker.getSamples()) { + for (final String sample : tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); - AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPileupWithoutDeletions(); - filteredTracker.addElements(sample,pileup.pileupElementTracker); + AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getPileupWithoutDeletions(); + filteredTracker.addElements(sample, pileup.pileupElementTracker); } - return (RBP)createNewPileup(loc,filteredTracker); + return (RBP) createNewPileup(loc, filteredTracker); - } - else { - UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker)pileupElementTracker; + } else { + UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for ( PE p : tracker ) { - if ( !p.isDeletion() ) { + for (PE p : tracker) { + if (!p.isDeletion()) { filteredTracker.add(p); } } - return (RBP)createNewPileup(loc, filteredTracker); + return (RBP) createNewPileup(loc, filteredTracker); } } else { - return (RBP)this; + return (RBP) this; } } @@ -256,21 +260,20 @@ public RBP getPileupWithoutDeletions() { */ @Override public RBP getOverlappingFragmentFilteredPileup() { - if(pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(final String sample: tracker.getSamples()) { + for (final String sample : tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); - AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getOverlappingFragmentFilteredPileup(); - 
filteredTracker.addElements(sample,pileup.pileupElementTracker); + AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getOverlappingFragmentFilteredPileup(); + filteredTracker.addElements(sample, pileup.pileupElementTracker); } - return (RBP)createNewPileup(loc,filteredTracker); - } - else { - Map filteredPileup = new HashMap(); + return (RBP) createNewPileup(loc, filteredTracker); + } else { + Map filteredPileup = new HashMap(); - for ( PE p : pileupElementTracker ) { + for (PE p : pileupElementTracker) { String readName = p.getRead().getReadName(); // if we've never seen this read before, life is good @@ -292,10 +295,10 @@ public RBP getOverlappingFragmentFilteredPileup() { } UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for(PE filteredElement: filteredPileup.values()) + for (PE filteredElement : filteredPileup.values()) filteredTracker.add(filteredElement); - return (RBP)createNewPileup(loc,filteredTracker); + return (RBP) createNewPileup(loc, filteredTracker); } } @@ -309,264 +312,299 @@ public RBP getOverlappingFragmentFilteredPileup() { */ @Override public RBP getPileupWithoutMappingQualityZeroReads() { - if ( getNumberOfMappingQualityZeroReads() > 0 ) { - if(pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; + if (getNumberOfMappingQualityZeroReads() > 0) { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(final String sample: tracker.getSamples()) { + for (final String sample : tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); - AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPileupWithoutMappingQualityZeroReads(); - filteredTracker.addElements(sample,pileup.pileupElementTracker); + AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getPileupWithoutMappingQualityZeroReads(); + filteredTracker.addElements(sample, pileup.pileupElementTracker); } - return (RBP)createNewPileup(loc,filteredTracker); + return (RBP) createNewPileup(loc, filteredTracker); - } - else { - UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker)pileupElementTracker; + } else { + UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for ( PE p : tracker ) { - if ( p.getRead().getMappingQuality() > 0 ) { + for (PE p : tracker) { + if (p.getRead().getMappingQuality() > 0) { filteredTracker.add(p); } } - return (RBP)createNewPileup(loc, filteredTracker); + return (RBP) createNewPileup(loc, filteredTracker); } } else { - return (RBP)this; + return (RBP) this; } } public RBP getPositiveStrandPileup() { - if(pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(final String sample: tracker.getSamples()) { + for (final String sample : tracker.getSamples()) { 
PileupElementTracker perSampleElements = tracker.getElements(sample); - AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPositiveStrandPileup(); - filteredTracker.addElements(sample,pileup.pileupElementTracker); + AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getPositiveStrandPileup(); + filteredTracker.addElements(sample, pileup.pileupElementTracker); } - return (RBP)createNewPileup(loc,filteredTracker); - } - else { - UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker)pileupElementTracker; + return (RBP) createNewPileup(loc, filteredTracker); + } else { + UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for ( PE p : tracker ) { - if ( !p.getRead().getReadNegativeStrandFlag() ) { + for (PE p : tracker) { + if (!p.getRead().getReadNegativeStrandFlag()) { filteredTracker.add(p); } } - return (RBP)createNewPileup(loc, filteredTracker); + return (RBP) createNewPileup(loc, filteredTracker); } } /** * Gets the pileup consisting of only reads on the negative strand. + * * @return A read-backed pileup consisting only of reads on the negative strand. */ public RBP getNegativeStrandPileup() { - if(pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(final String sample: tracker.getSamples()) { + for (final String sample : tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); - AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getNegativeStrandPileup(); - filteredTracker.addElements(sample,pileup.pileupElementTracker); + AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getNegativeStrandPileup(); + filteredTracker.addElements(sample, pileup.pileupElementTracker); } - return (RBP)createNewPileup(loc,filteredTracker); - } - else { - UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker)pileupElementTracker; + return (RBP) createNewPileup(loc, filteredTracker); + } else { + UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for ( PE p : tracker ) { - if ( p.getRead().getReadNegativeStrandFlag() ) { + for (PE p : tracker) { + if (p.getRead().getReadNegativeStrandFlag()) { filteredTracker.add(p); } } - return (RBP)createNewPileup(loc, filteredTracker); + return (RBP) createNewPileup(loc, filteredTracker); } } /** * Gets a pileup consisting of all those elements passed by a given filter. + * * @param filter Filter to use when testing for elements. * @return a pileup without the given filtered elements. 
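* <p/>
* For example, a hypothetical filter that keeps only confident, non-deletion bases (illustrative,
* not part of this patch):
* <pre>
*   ReadBackedPileup confident = pileup.getFilteredPileup(new PileupElementFilter() {
*       public boolean allow(final PileupElement p) {
*           return !p.isDeletion() && p.getQual() >= 20;
*       }
*   });
* </pre>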
*/ public RBP getFilteredPileup(PileupElementFilter filter) { - if(pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(final String sample: tracker.getSamples()) { + for (final String sample : tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); - AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getFilteredPileup(filter); - filteredTracker.addElements(sample,pileup.pileupElementTracker); + AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getFilteredPileup(filter); + filteredTracker.addElements(sample, pileup.pileupElementTracker); } - return (RBP)createNewPileup(loc,filteredTracker); - } - else { + return (RBP) createNewPileup(loc, filteredTracker); + } else { UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for ( PE p : pileupElementTracker ) { - if( filter.allow(p) ) + for (PE p : pileupElementTracker) { + if (filter.allow(p)) filteredTracker.add(p); } - return (RBP)createNewPileup(loc, filteredTracker); + return (RBP) createNewPileup(loc, filteredTracker); } } - /** Returns subset of this pileup that contains only bases with quality >= minBaseQ, coming from + /** + * Returns subset of this pileup that contains only bases with quality >= minBaseQ, coming from * reads with mapping qualities >= minMapQ. This method allocates and returns a new instance of ReadBackedPileup. 
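* <p/>
* e.g. {@code pileup.getBaseAndMappingFilteredPileup(17, 20)} keeps bases with base quality >= 17 on
* reads with mapping quality >= 20 (illustrative thresholds). Passing -1 for either argument
* effectively disables that filter, which is how the two convenience methods below delegate here.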
+ * * @param minBaseQ * @param minMapQ * @return */ @Override - public RBP getBaseAndMappingFilteredPileup( int minBaseQ, int minMapQ ) { - if(pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; + public RBP getBaseAndMappingFilteredPileup(int minBaseQ, int minMapQ) { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(final String sample: tracker.getSamples()) { + for (final String sample : tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); - AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getBaseAndMappingFilteredPileup(minBaseQ,minMapQ); - filteredTracker.addElements(sample,pileup.pileupElementTracker); + AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getBaseAndMappingFilteredPileup(minBaseQ, minMapQ); + filteredTracker.addElements(sample, pileup.pileupElementTracker); } - return (RBP)createNewPileup(loc,filteredTracker); - } - else { + return (RBP) createNewPileup(loc, filteredTracker); + } else { UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for ( PE p : pileupElementTracker ) { - if ( p.getRead().getMappingQuality() >= minMapQ && + for (PE p : pileupElementTracker) { + if (p.getRead().getMappingQuality() >= minMapQ && (p.isDeletion() || - ((p instanceof ExtendedEventPileupElement) && ((ExtendedEventPileupElement)p).getType() == ExtendedEventPileupElement.Type.NOEVENT) || - p.getQual() >= minBaseQ) ) { + ((p instanceof ExtendedEventPileupElement) && ((ExtendedEventPileupElement) p).getType() == ExtendedEventPileupElement.Type.NOEVENT) || + p.getQual() >= minBaseQ)) { filteredTracker.add(p); } } - return (RBP)createNewPileup(loc, filteredTracker); + return (RBP) createNewPileup(loc, filteredTracker); } } - /** Returns subset of this pileup that contains only bases with quality >= minBaseQ. + /** + * Returns subset of this pileup that contains only bases with quality >= minBaseQ. * This method allocates and returns a new instance of ReadBackedPileup. + * * @param minBaseQ * @return */ @Override - public RBP getBaseFilteredPileup( int minBaseQ ) { + public RBP getBaseFilteredPileup(int minBaseQ) { return getBaseAndMappingFilteredPileup(minBaseQ, -1); } - /** Returns subset of this pileup that contains only bases coming from reads with mapping quality >= minMapQ. + /** + * Returns subset of this pileup that contains only bases coming from reads with mapping quality >= minMapQ. * This method allocates and returns a new instance of ReadBackedPileup. + * * @param minMapQ * @return */ @Override - public RBP getMappingFilteredPileup( int minMapQ ) { + public RBP getMappingFilteredPileup(int minMapQ) { return getBaseAndMappingFilteredPileup(-1, minMapQ); } /** * Gets a list of the read groups represented in this pileup. + * * @return */ @Override public Collection getReadGroups() { Set readGroups = new HashSet(); - for(PileupElement pileupElement: this) + for (PileupElement pileupElement : this) readGroups.add(pileupElement.getRead().getReadGroup().getReadGroupId()); return readGroups; } /** * Gets the pileup for a given read group. Horrendously inefficient at this point. + * * @param targetReadGroupId Identifier for the read group. 
* @return A read-backed pileup containing only the reads in the given read group. */ @Override public RBP getPileupForReadGroup(String targetReadGroupId) { - if(pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(final String sample: tracker.getSamples()) { + for (final String sample : tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); - AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPileupForReadGroup(targetReadGroupId); - if(pileup != null) - filteredTracker.addElements(sample,pileup.pileupElementTracker); + AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getPileupForReadGroup(targetReadGroupId); + if (pileup != null) + filteredTracker.addElements(sample, pileup.pileupElementTracker); } - return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null; - } - else { + return filteredTracker.size() > 0 ? (RBP) createNewPileup(loc, filteredTracker) : null; + } else { UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for(PE p: pileupElementTracker) { + for (PE p : pileupElementTracker) { GATKSAMRecord read = p.getRead(); - if(targetReadGroupId != null) { - if(read.getReadGroup() != null && targetReadGroupId.equals(read.getReadGroup().getReadGroupId())) + if (targetReadGroupId != null) { + if (read.getReadGroup() != null && targetReadGroupId.equals(read.getReadGroup().getReadGroupId())) + filteredTracker.add(p); + } else { + if (read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null) filteredTracker.add(p); } - else { - if(read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null) + } + return filteredTracker.size() > 0 ? (RBP) createNewPileup(loc, filteredTracker) : null; + } + } + + /** + * Gets the pileup for a set of read groups. Horrendously inefficient at this point. + * + * @param rgSet List of identifiers for the read groups. + * @return A read-backed pileup containing only the reads in the given read groups. + */ + @Override + public RBP getPileupForReadGroups(final HashSet rgSet) { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); + + for (final String sample : tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getPileupForReadGroups(rgSet); + if (pileup != null) + filteredTracker.addElements(sample, pileup.pileupElementTracker); + } + return filteredTracker.size() > 0 ? 
(RBP) createNewPileup(loc, filteredTracker) : null; + } else { + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + for (PE p : pileupElementTracker) { + GATKSAMRecord read = p.getRead(); + if (rgSet != null && !rgSet.isEmpty()) { + if (read.getReadGroup() != null && rgSet.contains(read.getReadGroup().getReadGroupId())) + filteredTracker.add(p); + } else { + if (read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null) filteredTracker.add(p); } } - return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null; + return filteredTracker.size() > 0 ? (RBP) createNewPileup(loc, filteredTracker) : null; } } @Override public RBP getPileupForLane(String laneID) { - if(pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(final String sample: tracker.getSamples()) { + for (final String sample : tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); - AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPileupForLane(laneID); - if(pileup != null) - filteredTracker.addElements(sample,pileup.pileupElementTracker); + AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getPileupForLane(laneID); + if (pileup != null) + filteredTracker.addElements(sample, pileup.pileupElementTracker); } - return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null; - } - else { + return filteredTracker.size() > 0 ? (RBP) createNewPileup(loc, filteredTracker) : null; + } else { UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for(PE p: pileupElementTracker) { + for (PE p : pileupElementTracker) { GATKSAMRecord read = p.getRead(); - if(laneID != null) { - if(read.getReadGroup() != null && - (read.getReadGroup().getReadGroupId().startsWith(laneID + ".")) || // lane is the same, but sample identifier is different - (read.getReadGroup().getReadGroupId().equals(laneID))) // in case there is no sample identifier, they have to be exactly the same + if (laneID != null) { + if (read.getReadGroup() != null && + (read.getReadGroup().getReadGroupId().startsWith(laneID + ".")) || // lane is the same, but sample identifier is different + (read.getReadGroup().getReadGroupId().equals(laneID))) // in case there is no sample identifier, they have to be exactly the same filteredTracker.add(p); - } - else { - if(read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null) + } else { + if (read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null) filteredTracker.add(p); } } - return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null; + return filteredTracker.size() > 0 ? 
(RBP) createNewPileup(loc, filteredTracker) : null; } } public Collection getSamples() { - if(pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; return new HashSet(tracker.getSamples()); - } - else { + } else { Collection sampleNames = new HashSet(); - for(PileupElement p: this) { + for (PileupElement p : this) { GATKSAMRecord read = p.getRead(); String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; sampleNames.add(sampleName); @@ -583,103 +621,98 @@ public Collection getSamples() { */ @Override public RBP getDownsampledPileup(int desiredCoverage) { - if ( getNumberOfElements() <= desiredCoverage ) - return (RBP)this; + if (getNumberOfElements() <= desiredCoverage) + return (RBP) this; // randomly choose numbers corresponding to positions in the reads list TreeSet positions = new TreeSet(); - for ( int i = 0; i < desiredCoverage; /* no update */ ) { - if ( positions.add(GenomeAnalysisEngine.getRandomGenerator().nextInt(size)) ) + for (int i = 0; i < desiredCoverage; /* no update */) { + if (positions.add(GenomeAnalysisEngine.getRandomGenerator().nextInt(size))) i++; } - if(pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); int current = 0; - for(final String sample: tracker.getSamples()) { + for (final String sample : tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); List filteredPileup = new ArrayList(); - for(PileupElement p: perSampleElements) { - if(positions.contains(current)) + for (PileupElement p : perSampleElements) { + if (positions.contains(current)) filteredPileup.add(p); } - if(!filteredPileup.isEmpty()) { - AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements); - filteredTracker.addElements(sample,pileup.pileupElementTracker); + if (!filteredPileup.isEmpty()) { + AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements); + filteredTracker.addElements(sample, pileup.pileupElementTracker); } current++; } - return (RBP)createNewPileup(loc,filteredTracker); - } - else { - UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker)pileupElementTracker; + return (RBP) createNewPileup(loc, filteredTracker); + } else { + UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); Iterator positionIter = positions.iterator(); - while ( positionIter.hasNext() ) { - int nextReadToKeep = (Integer)positionIter.next(); + while (positionIter.hasNext()) { + int nextReadToKeep = (Integer) positionIter.next(); filteredTracker.add(tracker.get(nextReadToKeep)); } - return (RBP)createNewPileup(getLocation(), filteredTracker); + return (RBP) createNewPileup(getLocation(), filteredTracker); } } @Override public RBP getPileupForSamples(Collection sampleNames) { - if(pileupElementTracker instanceof PerSamplePileupElementTracker) { - 
PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; PileupElementTracker filteredElements = tracker.getElements(sampleNames); - return filteredElements != null ? (RBP)createNewPileup(loc,filteredElements) : null; - } - else { + return filteredElements != null ? (RBP) createNewPileup(loc, filteredElements) : null; + } else { HashSet hashSampleNames = new HashSet(sampleNames); // to speed up the "contains" access in the for loop UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for(PE p: pileupElementTracker) { + for (PE p : pileupElementTracker) { GATKSAMRecord read = p.getRead(); - if(sampleNames != null) { // still checking on sampleNames because hashSampleNames will never be null. And empty means something else. - if(read.getReadGroup() != null && hashSampleNames.contains(read.getReadGroup().getSample())) + if (sampleNames != null) { // still checking on sampleNames because hashSampleNames will never be null. And empty means something else. + if (read.getReadGroup() != null && hashSampleNames.contains(read.getReadGroup().getSample())) filteredTracker.add(p); - } - else { - if(read.getReadGroup() == null || read.getReadGroup().getSample() == null) + } else { + if (read.getReadGroup() == null || read.getReadGroup().getSample() == null) filteredTracker.add(p); } } - return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null; + return filteredTracker.size() > 0 ? (RBP) createNewPileup(loc, filteredTracker) : null; } } @Override public RBP getPileupForSample(String sampleName) { - if(pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; PileupElementTracker filteredElements = tracker.getElements(sampleName); - return filteredElements != null ? (RBP)createNewPileup(loc,filteredElements) : null; - } - else { + return filteredElements != null ? (RBP) createNewPileup(loc, filteredElements) : null; + } else { UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for(PE p: pileupElementTracker) { + for (PE p : pileupElementTracker) { GATKSAMRecord read = p.getRead(); - if(sampleName != null) { - if(read.getReadGroup() != null && sampleName.equals(read.getReadGroup().getSample())) + if (sampleName != null) { + if (read.getReadGroup() != null && sampleName.equals(read.getReadGroup().getSample())) filteredTracker.add(p); - } - else { - if(read.getReadGroup() == null || read.getReadGroup().getSample() == null) + } else { + if (read.getReadGroup() == null || read.getReadGroup().getSample() == null) filteredTracker.add(p); } } - return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null; + return filteredTracker.size() > 0 ? (RBP) createNewPileup(loc, filteredTracker) : null; } } @@ -691,9 +724,9 @@ public RBP getPileupForSample(String sampleName) { /** * The best way to access PileupElements where you only care about the bases and quals in the pileup. - * + *
<p/>
* for (PileupElement p : this) { doSomething(p); } - * + *
<p/>
* Provides efficient iteration of the data. * * @return @@ -703,9 +736,17 @@ public Iterator iterator() { return new Iterator() { private final Iterator wrappedIterator = pileupElementTracker.iterator(); - public boolean hasNext() { return wrappedIterator.hasNext(); } - public PileupElement next() { return wrappedIterator.next(); } - public void remove() { throw new UnsupportedOperationException("Cannot remove from a pileup element iterator"); } + public boolean hasNext() { + return wrappedIterator.hasNext(); + } + + public PileupElement next() { + return wrappedIterator.next(); + } + + public void remove() { + throw new UnsupportedOperationException("Cannot remove from a pileup element iterator"); + } }; } @@ -748,7 +789,7 @@ public int getNumberOfElements() { */ @Override public int depthOfCoverage() { - if ( abstractSize == -1 ) + if (abstractSize == -1) calculateAbstractSize(); return abstractSize; } @@ -758,7 +799,7 @@ public int depthOfCoverage() { */ @Override public boolean isEmpty() { - return size==0; + return size == 0; } @@ -780,19 +821,18 @@ public GenomeLoc getLocation() { public int[] getBaseCounts() { int[] counts = new int[4]; - if(pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; - for(final String sample: tracker.getSamples()) { - int[] countsBySample = createNewPileup(loc,tracker.getElements(sample)).getBaseCounts(); - for(int i = 0; i < counts.length; i++) + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + for (final String sample : tracker.getSamples()) { + int[] countsBySample = createNewPileup(loc, tracker.getElements(sample)).getBaseCounts(); + for (int i = 0; i < counts.length; i++) counts[i] += countsBySample[i]; } - } - else { - for ( PileupElement pile : this ) { + } else { + for (PileupElement pile : this) { // skip deletion sites - if ( ! pile.isDeletion() ) { - int index = BaseUtils.simpleBaseToBaseIndex((char)pile.getBase()); + if (!pile.isDeletion()) { + int index = BaseUtils.simpleBaseToBaseIndex((char) pile.getBase()); if (index != -1) counts[index]++; } @@ -821,65 +861,80 @@ public String getPileupString(Character ref) { /** * Returns a list of the reads in this pileup. Note this call costs O(n) and allocates fresh lists each time + * * @return */ @Override public List getReads() { List reads = new ArrayList(getNumberOfElements()); - for ( PileupElement pile : this ) { reads.add(pile.getRead()); } + for (PileupElement pile : this) { + reads.add(pile.getRead()); + } return reads; } /** * Returns a list of the offsets in this pileup. Note this call costs O(n) and allocates fresh lists each time + * * @return */ @Override public List getOffsets() { List offsets = new ArrayList(getNumberOfElements()); - for ( PileupElement pile : this ) { offsets.add(pile.getOffset()); } + for (PileupElement pile : this) { + offsets.add(pile.getOffset()); + } return offsets; } /** * Returns an array of the bases in this pileup. Note this call costs O(n) and allocates fresh array each time + * * @return */ @Override public byte[] getBases() { byte[] v = new byte[getNumberOfElements()]; int pos = 0; - for ( PileupElement pile : pileupElementTracker ) { v[pos++] = pile.getBase(); } + for (PileupElement pile : pileupElementTracker) { + v[pos++] = pile.getBase(); + } return v; } /** * Returns an array of the quals in this pileup. 
Note this call costs O(n) and allocates fresh array each time + * * @return */ @Override public byte[] getQuals() { byte[] v = new byte[getNumberOfElements()]; int pos = 0; - for ( PileupElement pile : pileupElementTracker ) { v[pos++] = pile.getQual(); } + for (PileupElement pile : pileupElementTracker) { + v[pos++] = pile.getQual(); + } return v; } /** * Get an array of the mapping qualities + * * @return */ @Override public byte[] getMappingQuals() { byte[] v = new byte[getNumberOfElements()]; int pos = 0; - for ( PileupElement pile : pileupElementTracker ) { v[pos++] = (byte)pile.getRead().getMappingQuality(); } + for (PileupElement pile : pileupElementTracker) { + v[pos++] = (byte) pile.getRead().getMappingQuality(); + } return v; } - static String quals2String( byte[] quals ) { + static String quals2String(byte[] quals) { StringBuilder qualStr = new StringBuilder(); - for ( int qual : quals ) { + for (int qual : quals) { qual = Math.min(qual, 63); // todo: fixme, this isn't a good idea char qualChar = (char) (33 + qual); // todo: warning, this is illegal for qual > 63 qualStr.append(qualChar); diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java index 1e5e4d4e5a..8df0aa0b8c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java @@ -12,7 +12,7 @@ * are seen on the base-by-base basis (i.e. the pileup does keep the information about the current reference base being deleted * in some reads), but the information about the extended event (deletion length, string of all deleted bases) is not kept. * The insertions that may be present in some reads are not seen at all in such strict reference traversal mode. - * + *
<p/>
* By convention, any extended event (indel) is mapped onto the reference at the last base prior to the event (i.e. * last base before the insertion or deletion). If the special "extended" traversal mode is turned on and there is * an indel in at least one read that maps onto the reference position Z, the walker's map function will be called twice: @@ -22,15 +22,17 @@ * (covered) reference position. Note that if the extended event at Z was a deletion, the "standard" base pileup at * Z+1 and following bases may still contain deleted bases. However the fully extended event call will be performed * only once, at the position where the indel maps (starts). - * + *
<p/>
* This class wraps an "extended" event (indel) so that it can be added to a pileup of events at a given location. - * +
<p/>
* Created by IntelliJ IDEA. * User: asivache * Date: Dec 21, 2009 * Time: 2:57:55 PM * To change this template use File | Settings | File Templates. */ + +// Extended events are slated for removal public class ExtendedEventPileupElement extends PileupElement { public enum Type { NOEVENT, DELETION, INSERTION @@ -39,40 +41,52 @@ public enum Type { private Type type = null; private int eventLength = -1; private String eventBases = null; // if it is a deletion, we do not have information about the actual deleted bases - // in the read itself, so we fill the string with D's; for insertions we keep actual inserted bases + // in the read itself, so we fill the string with D's; for insertions we keep actual inserted bases private SAMRecord read; private int offset; // position in the read immediately BEFORE the event // This is broken! offset is always zero because these member variables are shadowed by base class - /** Constructor for extended pileup element (indel). + + public ExtendedEventPileupElement(GATKSAMRecord read, int offset, int eventLength, String eventBases, Type type) { + super(read, offset, type == Type.DELETION, false, false, false,null,-1); // extended events are slated for removal + this.read = read; + this.offset = offset; + this.eventLength = eventLength; + this.eventBases = eventBases; + this.type = type; + } + + /** + * Quick constructor for insertions. * - * @param read the read, in which the indel is observed - * @param offset position in the read immediately before the indel (can be -1 if read starts with an insertion) - * @param length length of the indel (number of inserted or deleted bases); length <=0 indicates that the read has no indel (NOEVENT) + * @param read the read, in which the indel is observed + * @param offset position in the read immediately before the indel (can be -1 if read starts with an insertion) + * @param length length of the indel (number of inserted or deleted bases); length <=0 indicates that the read has no indel (NOEVENT) * @param eventBases inserted bases. 
null indicates that the event is a deletion; ignored if length<=0 (noevent) */ - public ExtendedEventPileupElement( GATKSAMRecord read, int offset, int length, byte[] eventBases ) { - super(read, offset); - this.eventLength = length; - if ( length <= 0 ) type = Type.NOEVENT; - else { - if ( eventBases != null ) { - this.eventBases = new String(eventBases).toUpperCase(); - type = Type.INSERTION; - } else { - type = Type.DELETION; - } - } + public ExtendedEventPileupElement(GATKSAMRecord read, int offset, int length, byte[] eventBases) { + this(read, offset, length, new String(eventBases).toUpperCase(), Type.INSERTION); + } + + /** + * Quick constructor for non indels (matches) + * + * @param read the read + * @param offset where in the read the match is + */ + public ExtendedEventPileupElement(GATKSAMRecord read, int offset) { + this(read, offset, -1, null, Type.NOEVENT); } - /** Constructor for deletion or noevent calls - does not take event bases as an argument (as those should - * be null or are ignored in these cases anyway) - * @param read - * @param offset - * @param length + /** + * Quick constructor for deletions + * + * @param read the read + * @param offset the last base before the deletion starts (left aligned deletion) + * @param length length of this deletion */ - public ExtendedEventPileupElement( GATKSAMRecord read, int offset, int length ) { - this(read,offset, length, null); + public ExtendedEventPileupElement(GATKSAMRecord read, int offset, int length) { + this(read, offset, length, null, Type.DELETION); } public boolean isDeletion() { @@ -87,46 +101,54 @@ public boolean isIndel() { return isDeletion() || isInsertion(); } - public Type getType() { return type; } + public Type getType() { + return type; + } // The offset can be negative with insertions at the start of the read, but a valid base does exist at this position with // a valid base quality. The following code attempts to compensate for that.' @Override public byte getBase() { - return getBase(offset >= 0 ? offset : offset+eventLength); + return getBase(offset >= 0 ? offset : offset + eventLength); } @Override public int getBaseIndex() { - return getBaseIndex(offset >= 0 ? offset : offset+eventLength); + return getBaseIndex(offset >= 0 ? offset : offset + eventLength); } @Override public byte getQual() { - return getQual(offset >= 0 ? offset : offset+eventLength); + return getQual(offset >= 0 ? offset : offset + eventLength); } - /** Returns length of the event (number of inserted or deleted bases */ - public int getEventLength() { return eventLength; } + /** + * Returns length of the event (number of inserted or deleted bases + */ + public int getEventLength() { + return eventLength; + } - /** Returns actual sequence of inserted bases, or a null if the event is a deletion or if there is no event in the associated read. - * */ - public String getEventBases() { return eventBases; } + /** + * Returns actual sequence of inserted bases, or a null if the event is a deletion or if there is no event in the associated read. + */ + public String getEventBases() { + return eventBases; + } @Override public String toString() { char c = '.'; - String fillStr = null ; - if ( isDeletion() ) { + String fillStr = null; + if (isDeletion()) { c = '-'; - char [] filler = new char[eventLength]; + char[] filler = new char[eventLength]; Arrays.fill(filler, 'D'); fillStr = new String(filler); - } - else if ( isInsertion() ) c = '+'; - return String.format("%s @ %d = %c%s MQ%d", getRead().getReadName(), getOffset(), c, isIndel()? 
- (isInsertion() ? eventBases : fillStr ): "", getMappingQual()); + } else if (isInsertion()) c = '+'; + return String.format("%s @ %d = %c%s MQ%d", getRead().getReadName(), getOffset(), c, isIndel() ? + (isInsertion() ? eventBases : fillStr) : "", getMappingQual()); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index 2d13d6e59d..9dbfc52f30 100755 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -3,6 +3,8 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /** @@ -21,25 +23,84 @@ public class PileupElement implements Comparable { protected final GATKSAMRecord read; protected final int offset; - + protected final boolean isDeletion; + protected final boolean isBeforeDeletion; + protected final boolean isBeforeInsertion; + protected final boolean isNextToSoftClip; + protected final int eventLength; + protected final String eventBases; // if it is a deletion, we do not have information about the actual deleted bases + // in the read itself, so we fill the string with D's; for insertions we keep actual inserted bases + + + /** + * Creates a new pileup element. + * + * @param read the read we are adding to the pileup + * @param offset the position in the read for this base. All deletions must be left aligned! (-1 is only allowed for reads starting with insertions) + * @param isDeletion whether or not this base is a deletion + * @param isBeforeDeletion whether or not this base is before a deletion + * @param isBeforeInsertion whether or not this base is before an insertion + * @param isNextToSoftClip whether or not this base is next to a soft clipped base + * @param nextEventBases bases in event in case element comes before insertion or deletion + * @param nextEventLength length of next event in case it's insertion or deletion + */ @Requires({ "read != null", "offset >= -1", "offset <= read.getReadLength()"}) - public PileupElement( GATKSAMRecord read, int offset ) { + public PileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isBeforeInsertion, final boolean isNextToSoftClip, + final String nextEventBases, final int nextEventLength) { + if (offset < 0 && isDeletion) + throw new ReviewedStingException("Pileup Element cannot create a deletion with a negative offset"); + this.read = read; this.offset = offset; + this.isDeletion = isDeletion; + this.isBeforeDeletion = isBeforeDeletion; + this.isBeforeInsertion = isBeforeInsertion; + this.isNextToSoftClip = isNextToSoftClip; + if (isBeforeInsertion) + eventBases = nextEventBases; + else + eventBases = null; // ignore argument in any other case + if (isBeforeDeletion || isBeforeInsertion) + eventLength = nextEventLength; + else + eventLength = -1; } + public PileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isBeforeInsertion, final boolean isNextToSoftClip) { + this(read,offset, isDeletion, isBeforeDeletion, isBeforeInsertion, isNextToSoftClip, null, -1); + } public boolean isDeletion() { + 
return isDeletion; + } + + public boolean isBeforeDeletion() { + return isBeforeDeletion; + } + + public boolean isBeforeInsertion() { + return isBeforeInsertion; + } + + public boolean isNextToSoftClip() { + return isNextToSoftClip; + } + + public boolean isInsertionAtBeginningOfRead() { return offset == -1; } @Ensures("result != null") - public GATKSAMRecord getRead() { return read; } + public GATKSAMRecord getRead() { + return read; + } @Ensures("result == offset") - public int getOffset() { return offset; } + public int getOffset() { + return offset; + } public byte getBase() { return getBase(offset); @@ -52,6 +113,28 @@ public int getBaseIndex() { public byte getQual() { return getQual(offset); } + + public byte getBaseInsertionQual() { + return getBaseInsertionQual(offset); + } + + public byte getBaseDeletionQual() { + return getBaseDeletionQual(offset); + } + + /** + * Returns length of the event (number of inserted or deleted bases + */ + public int getEventLength() { + return eventLength; + } + + /** + * Returns actual sequence of inserted bases, or a null if the event is a deletion or if there is no event in the associated read. + */ + public String getEventBases() { + return eventBases; + } public int getMappingQual() { return read.getMappingQuality(); @@ -59,30 +142,38 @@ public int getMappingQual() { @Ensures("result != null") public String toString() { - return String.format("%s @ %d = %c Q%d", getRead().getReadName(), getOffset(), (char)getBase(), getQual()); + return String.format("%s @ %d = %c Q%d", getRead().getReadName(), getOffset(), (char) getBase(), getQual()); } protected byte getBase(final int offset) { - return isDeletion() ? DELETION_BASE : read.getReadBases()[offset]; + return (isDeletion() || isInsertionAtBeginningOfRead()) ? DELETION_BASE : read.getReadBases()[offset]; } protected int getBaseIndex(final int offset) { - return BaseUtils.simpleBaseToBaseIndex(isDeletion() ? DELETION_BASE : read.getReadBases()[offset]); + return BaseUtils.simpleBaseToBaseIndex((isDeletion() || isInsertionAtBeginningOfRead()) ? DELETION_BASE : read.getReadBases()[offset]); } protected byte getQual(final int offset) { - return isDeletion() ? DELETION_QUAL : read.getBaseQualities()[offset]; + return (isDeletion() || isInsertionAtBeginningOfRead()) ? DELETION_QUAL : read.getBaseQualities()[offset]; + } + + protected byte getBaseInsertionQual(final int offset) { + return (isDeletion() || isInsertionAtBeginningOfRead()) ? DELETION_QUAL : read.getBaseInsertionQualities()[offset]; + } + + protected byte getBaseDeletionQual(final int offset) { + return (isDeletion() || isInsertionAtBeginningOfRead()) ? 
DELETION_QUAL : read.getBaseDeletionQualities()[offset]; } @Override public int compareTo(final PileupElement pileupElement) { - if ( offset < pileupElement.offset ) + if (offset < pileupElement.offset) return -1; - else if ( offset > pileupElement.offset ) + else if (offset > pileupElement.offset) return 1; - else if ( read.getAlignmentStart() < pileupElement.read.getAlignmentStart() ) + else if (read.getAlignmentStart() < pileupElement.read.getAlignmentStart()) return -1; - else if ( read.getAlignmentStart() > pileupElement.read.getAlignmentStart() ) + else if (read.getAlignmentStart() > pileupElement.read.getAlignmentStart()) return 1; else return 0; @@ -94,13 +185,29 @@ else if ( read.getAlignmentStart() > pileupElement.read.getAlignmentStart() ) // // -------------------------------------------------------------------------- - public boolean isReducedRead() { - return read.isReducedRead(); - } - +// public boolean isReducedRead() { +// return read.isReducedRead(); +// } + + /** + * Returns the number of elements in the pileup element. + *
<p/>
+ * Unless this is a reduced read, the number of elements in a pileup element is one. In the event of + * this being a reduced read and a deletion, we return the average number of elements between the left + * and right elements to the deletion. We assume the deletion to be left aligned. + * + * @return + */ public int getRepresentativeCount() { - // TODO -- if we ever decide to reduce the representation of deletions then this will need to be fixed - return (!isDeletion() && isReducedRead()) ? read.getReducedCount(offset) : 1; + int representativeCount = 1; + + if (read.isReducedRead() && !isInsertionAtBeginningOfRead()) { + if (isDeletion() && (offset + 1 >= read.getReadLength()) ) // deletion in the end of the read + throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s", read.getReadName(), read.getCigarString())); + + representativeCount = (isDeletion()) ? Math.round((read.getReducedCount(offset) + read.getReducedCount(offset + 1)) / 2) : read.getReducedCount(offset); + } + return representativeCount; } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java index 43ad063523..e547534dd6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java @@ -30,33 +30,34 @@ import java.util.*; -public class ReadBackedExtendedEventPileupImpl extends AbstractReadBackedPileup implements ReadBackedExtendedEventPileup { +public class ReadBackedExtendedEventPileupImpl extends AbstractReadBackedPileup implements ReadBackedExtendedEventPileup { private int nInsertions; private int maxDeletionLength; // cached value of the length of the longest deletion observed at the site public ReadBackedExtendedEventPileupImpl(GenomeLoc loc, List pileupElements) { - super(loc,pileupElements); + super(loc, pileupElements); } public ReadBackedExtendedEventPileupImpl(GenomeLoc loc, PileupElementTracker tracker) { - super(loc,tracker); + super(loc, tracker); } /** * Optimization of above constructor where all of the cached data is provided + * * @param loc * @param pileup */ public ReadBackedExtendedEventPileupImpl(GenomeLoc loc, List pileup, int size, - int maxDeletionLength, int nInsertions, int nDeletions, int nMQ0Reads) { - super(loc,pileup,size,nDeletions,nMQ0Reads); + int maxDeletionLength, int nInsertions, int nDeletions, int nMQ0Reads) { + super(loc, pileup, size, nDeletions, nMQ0Reads); this.maxDeletionLength = maxDeletionLength; this.nInsertions = nInsertions; } // this is the good new one - public ReadBackedExtendedEventPileupImpl(GenomeLoc loc, Map pileupElementsBySample) { - super(loc,pileupElementsBySample); + public ReadBackedExtendedEventPileupImpl(GenomeLoc loc, Map pileupElementsBySample) { + super(loc, pileupElementsBySample); } /** @@ -71,31 +72,36 @@ protected void calculateCachedData() { nInsertions = 0; nMQ0Reads = 0; - for ( ExtendedEventPileupElement p : this.toExtendedIterable() ) { + for (ExtendedEventPileupElement p : this.toExtendedIterable()) { - if ( p.isDeletion() ) { + if (p.isDeletion()) { maxDeletionLength = Math.max(maxDeletionLength, p.getEventLength()); } else { - if ( p.isInsertion() ) nInsertions++; + if (p.isInsertion()) nInsertions++; } } } @Override - protected void addPileupToCumulativeStats(AbstractReadBackedPileup 
pileup) { + protected void addPileupToCumulativeStats(AbstractReadBackedPileup pileup) { super.addPileupToCumulativeStats(pileup); - ReadBackedExtendedEventPileup extendedEventPileup = ((ReadBackedExtendedEventPileup)pileup); + ReadBackedExtendedEventPileup extendedEventPileup = ((ReadBackedExtendedEventPileup) pileup); this.nInsertions += extendedEventPileup.getNumberOfInsertions(); this.maxDeletionLength += extendedEventPileup.getMaxDeletionLength(); } @Override protected ReadBackedExtendedEventPileupImpl createNewPileup(GenomeLoc loc, PileupElementTracker tracker) { - return new ReadBackedExtendedEventPileupImpl(loc,tracker); + return new ReadBackedExtendedEventPileupImpl(loc, tracker); } @Override - protected ExtendedEventPileupElement createNewPileupElement(GATKSAMRecord read, int offset) { + protected ExtendedEventPileupElement createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip) { + throw new UnsupportedOperationException("Not enough information provided to create a new pileup element"); + } + @Override + protected ExtendedEventPileupElement createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion, + boolean isNextToSoftClip,String nextEventBases, int nextEventLength) { throw new UnsupportedOperationException("Not enough information provided to create a new pileup element"); } @@ -110,10 +116,12 @@ public int getNumberOfInsertions() { return nInsertions; } - /** Returns the length of the longest deletion observed at the site this + /** + * Returns the length of the longest deletion observed at the site this * pileup is associated with (NOTE: by convention, both insertions and deletions * are associated with genomic location immediately before the actual event). If * there are no deletions at the site, returns 0. + * * @return */ @Override @@ -123,36 +131,47 @@ public int getMaxDeletionLength() { public Iterable toExtendedIterable() { return new Iterable() { - public Iterator iterator() { return pileupElementTracker.iterator(); } + public Iterator iterator() { + return pileupElementTracker.iterator(); + } }; } /** * Returns an array of the events in this pileup ('I', 'D', or '.'). Note this call costs O(n) and allocates fresh array each time + * * @return */ @Override public byte[] getEvents() { byte[] v = new byte[getNumberOfElements()]; int i = 0; - for ( ExtendedEventPileupElement e : this.toExtendedIterable() ) { - switch ( e.getType() ) { - case INSERTION: v[i] = 'I'; break; - case DELETION: v[i] = 'D'; break; - case NOEVENT: v[i] = '.'; break; - default: throw new ReviewedStingException("Unknown event type encountered: "+e.getType()); + for (ExtendedEventPileupElement e : this.toExtendedIterable()) { + switch (e.getType()) { + case INSERTION: + v[i] = 'I'; + break; + case DELETION: + v[i] = 'D'; + break; + case NOEVENT: + v[i] = '.'; + break; + default: + throw new ReviewedStingException("Unknown event type encountered: " + e.getType()); } i++; } return v; - } + } - /** A shortcut for getEventStringsWithCounts(null); + /** + * A shortcut for getEventStringsWithCounts(null); * * @return */ @Override - public List> getEventStringsWithCounts() { + public List> getEventStringsWithCounts() { return getEventStringsWithCounts(null); } @@ -163,44 +182,48 @@ public String getShortPileupString() { // insertion, deletion or no-event, respectively. 
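A minimal sketch of the representative-count rule from the getRepresentativeCount() hunk above, assuming a hypothetical per-base count array in place of GATKSAMRecord's reduced-read accessors (ReducedReadSketch and counts are illustrative names, not GATK API):

    // Sketch only: a reduced read stores one count per base; a deletion (assumed
    // left-aligned) is represented by the average of the two flanking counts.
    final class ReducedReadSketch {
        private final int[] counts; // hypothetical per-base reduced counts

        ReducedReadSketch(final int[] counts) {
            this.counts = counts;
        }

        int representativeCount(final int offset, final boolean isDeletion) {
            if (!isDeletion)
                return counts[offset]; // ordinary base: its own reduced count
            if (offset + 1 >= counts.length)
                throw new IllegalStateException("deletion at the end of the read");
            return (counts[offset] + counts[offset + 1]) / 2; // average of flanking elements
        }
    }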
return String.format("%s %s E %s", getLocation().getContig(), getLocation().getStart(), // chromosome name and coordinate - new String(getEvents()) ); + new String(getEvents())); } - /** Returns String representation of all distinct extended events (indels) at the site along with + /** + * Returns String representation of all distinct extended events (indels) at the site along with * observation counts (numbers of reads) for each distinct event. If refBases is null, a simple string representation for * deletions will be generated as "D" (i.e. "5D"); if the reference bases are provided, the actual * deleted sequence will be used in the string representation (e.g. "-AAC"). - * @param refBases reference bases, starting with the current locus (i.e. the one immediately before the indel), and - * extending far enough to accomodate the longest deletion (i.e. size of refBases must be at least 1+) + * + * @param refBases reference bases, starting with the current locus (i.e. the one immediately before the indel), and + * extending far enough to accomodate the longest deletion (i.e. size of refBases must be at least 1+) * @return list of distinct events; first element of a pair is a string representation of the event, second element - * gives the number of reads, in which that event was observed + * gives the number of reads, in which that event was observed */ @Override - public List> getEventStringsWithCounts(byte[] refBases) { - Map events = new HashMap(); + public List> getEventStringsWithCounts(byte[] refBases) { + Map events = new HashMap(); - for ( ExtendedEventPileupElement e : this.toExtendedIterable() ) { + for (ExtendedEventPileupElement e : this.toExtendedIterable()) { Integer cnt; String indel = null; - switch ( e.getType() ) { + switch (e.getType()) { case INSERTION: - indel = "+"+e.getEventBases(); + indel = "+" + e.getEventBases(); break; case DELETION: - indel = getDeletionString(e.getEventLength(),refBases); + indel = getDeletionString(e.getEventLength(), refBases); break; - case NOEVENT: continue; - default: throw new ReviewedStingException("Unknown event type encountered: "+e.getType()); + case NOEVENT: + continue; + default: + throw new ReviewedStingException("Unknown event type encountered: " + e.getType()); } cnt = events.get(indel); - if ( cnt == null ) events.put(indel,1); - else events.put(indel,cnt.intValue()+1); + if (cnt == null) events.put(indel, 1); + else events.put(indel, cnt.intValue() + 1); } - List> eventList = new ArrayList>(events.size()); - for ( Map.Entry m : events.entrySet() ) { - eventList.add( new Pair(m.getKey(),m.getValue())); + List> eventList = new ArrayList>(events.size()); + for (Map.Entry m : events.entrySet()) { + eventList.add(new Pair(m.getKey(), m.getValue())); } return eventList; } @@ -208,18 +231,19 @@ public List> getEventStringsWithCounts(byte[] refBases) { /** * Builds string representation of the deletion event. If refBases is null, the representation will be * "D" (e.g. "5D"); if the reference bases are provided, a verbose representation (e.g. "-AAC") - * will be generated. NOTE: refBases must start with the base prior to the actual deletion (i.e. deleted + * will be generated. NOTE: refBases must start with the base prior to the actual deletion (i.e. deleted * base(s) are refBase[1], refBase[2], ...), and the length of the passed array must be sufficient to accomodate the * deletion length (i.e. size of refBase must be at least length+1). 
+ * * @param length * @param refBases * @return */ private String getDeletionString(int length, byte[] refBases) { - if ( refBases == null ) { - return Integer.toString(length)+"D"; // if we do not have reference bases, we can only report something like "5D" + if (refBases == null) { + return Integer.toString(length) + "D"; // if we do not have reference bases, we can only report something like "5D" } else { - return "-"+new String(refBases,1,length).toUpperCase(); + return "-" + new String(refBases, 1, length).toUpperCase(); } } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java index 02767df7cd..ccd9d509fb 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java @@ -30,6 +30,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.Collection; +import java.util.HashSet; import java.util.List; /** @@ -129,6 +130,13 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca */ public ReadBackedPileup getPileupForReadGroup(String readGroupId); + /** + * Gets all the reads associated with a given read groups. + * @param rgSet Set of identifiers for the read group. + * @return A pileup containing only the reads in the given read groups. + */ + public ReadBackedPileup getPileupForReadGroups(final HashSet rgSet); + /** * Gets all reads in a given lane id. (Lane ID is the read group * id stripped of the last .XX sample identifier added by the GATK). diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java index b7445be8dd..759d64b2fe 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java @@ -29,48 +29,55 @@ import java.util.List; import java.util.Map; -public class ReadBackedPileupImpl extends AbstractReadBackedPileup implements ReadBackedPileup { +public class ReadBackedPileupImpl extends AbstractReadBackedPileup implements ReadBackedPileup { public ReadBackedPileupImpl(GenomeLoc loc) { super(loc); } - public ReadBackedPileupImpl(GenomeLoc loc, List reads, List offsets ) { - super(loc,reads,offsets); + public ReadBackedPileupImpl(GenomeLoc loc, List reads, List offsets) { + super(loc, reads, offsets); } - public ReadBackedPileupImpl(GenomeLoc loc, List reads, int offset ) { - super(loc,reads,offset); + public ReadBackedPileupImpl(GenomeLoc loc, List reads, int offset) { + super(loc, reads, offset); } public ReadBackedPileupImpl(GenomeLoc loc, List pileupElements) { - super(loc,pileupElements); + super(loc, pileupElements); } - public ReadBackedPileupImpl(GenomeLoc loc, Map pileupElementsBySample) { - super(loc,pileupElementsBySample); + public ReadBackedPileupImpl(GenomeLoc loc, Map pileupElementsBySample) { + super(loc, pileupElementsBySample); } /** * Optimization of above constructor where all of the cached data is provided + * * @param loc * @param pileup */ public ReadBackedPileupImpl(GenomeLoc loc, List pileup, int size, int nDeletions, int nMQ0Reads) { - super(loc,pileup,size,nDeletions,nMQ0Reads); + super(loc, pileup, size, nDeletions, nMQ0Reads); } protected ReadBackedPileupImpl(GenomeLoc loc, PileupElementTracker tracker) { - super(loc,tracker); + super(loc, 
tracker); } @Override protected ReadBackedPileupImpl createNewPileup(GenomeLoc loc, PileupElementTracker tracker) { - return new ReadBackedPileupImpl(loc,tracker); + return new ReadBackedPileupImpl(loc, tracker); } @Override - protected PileupElement createNewPileupElement(GATKSAMRecord read, int offset) { - return new PileupElement(read,offset); + protected PileupElement createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion, + boolean isNextToSoftClip) { + return new PileupElement(read, offset, isDeletion, isBeforeDeletion, isBeforeInsertion, isNextToSoftClip, null,0); + } + + protected PileupElement createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion, + boolean isNextToSoftClip,String nextEventBases, final int nextEventLength) { + return new PileupElement(read, offset, isDeletion, isBeforeDeletion, isBeforeInsertion, isNextToSoftClip, nextEventBases,nextEventLength); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRSamIterator.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRSamIterator.java new file mode 100644 index 0000000000..048f8e58ca --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRSamIterator.java @@ -0,0 +1,50 @@ +package org.broadinstitute.sting.utils.recalibration; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.Iterator; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: 2/13/12 + */ + +public class BQSRSamIterator implements StingSAMIterator { + private final StingSAMIterator it; + private final BaseRecalibration bqsr; + + /** + * Creates a new BQSRSamIterator and applies BQSR on the fly to incoming reads. 
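The wrapping pattern used by this iterator generalizes: any per-record transformation can be applied lazily by decorating an iterator. A hedged, generic sketch (TransformingIterator is an illustrative name, and java.util.function.Consumer is used here for brevity even though it postdates this code):

    import java.util.Iterator;
    import java.util.function.Consumer;

    // Applies a side-effecting transform to each element as it streams past.
    final class TransformingIterator<T> implements Iterator<T> {
        private final Iterator<T> inner;
        private final Consumer<T> transform;

        TransformingIterator(final Iterator<T> inner, final Consumer<T> transform) {
            if (inner == null || transform == null)
                throw new IllegalArgumentException("inner and transform must not be null");
            this.inner = inner;
            this.transform = transform;
        }

        public boolean hasNext() { return inner.hasNext(); }

        public T next() {
            final T element = inner.next();
            transform.accept(element); // e.g. recalibrate base qualities in place
            return element;
        }

        public void remove() { throw new UnsupportedOperationException(); }
    }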
+ * + * @param it The incoming SamIterator to wrap + * @param bqsr The object which holds the BQSR table information and knows how to apply it + */ + @Requires({ + "it != null", + "bqsr != null"}) + public BQSRSamIterator(StingSAMIterator it, BaseRecalibration bqsr) { + if ( bqsr == null ) throw new ReviewedStingException("BUG: shouldn't create BQSRSamIterator with null recalibration object"); + + this.it = it; + this.bqsr = bqsr; + } + + @Requires("hasNext()") + @Ensures("result != null") + public SAMRecord next() { + SAMRecord read = it.next(); + bqsr.recalibrateRead((GATKSAMRecord) read); + return read; + } + + public boolean hasNext() { return this.it.hasNext(); } + public void remove() { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); } + public void close() { it.close(); } + public Iterator iterator() { return this; } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java new file mode 100644 index 0000000000..74083ced26 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -0,0 +1,280 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.recalibration; + +import org.broadinstitute.sting.gatk.walkers.bqsr.*; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.collections.NestedHashMap; +import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.text.XReadLines; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.regex.Pattern; + +/** + * Utility methods to facilitate on-the-fly base quality score recalibration. 
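This class implements the sequential ("delta-delta") calculation spelled out further down in performSequentialQualityCalculation(). The arithmetic in isolation, as a sketch (the empirical/reported inputs stand in for RecalDatum table lookups; a missing table entry contributes a delta of zero):

    // Each level's delta is measured relative to the levels before it, so the
    // corrections can simply be summed at the end.
    static double recalibratedQuality(final double qReported,
                                      final double globalEmpirical, final double globalReported,
                                      final double qualityScoreEmpirical,
                                      final double[] covariateEmpiricals) {
        final double globalDeltaQ = globalEmpirical - globalReported;
        final double deltaQReported = qualityScoreEmpirical - qReported - globalDeltaQ;
        double deltaQCovariates = 0.0;
        for (final double covariateEmpirical : covariateEmpiricals)
            deltaQCovariates += covariateEmpirical - qReported - (globalDeltaQ + deltaQReported);
        return qReported + globalDeltaQ + deltaQReported + deltaQCovariates;
    }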
+ * + * User: rpoplin + * Date: 2/4/12 + */ + +public class BaseRecalibration { + + private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps + private final ArrayList requestedCovariates = new ArrayList(); // List of covariates to be used in this calculation + public static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); + public static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*"); + public static final String EOF_MARKER = "EOF"; + private static final int MAX_QUALITY_SCORE = 65; //BUGBUG: what value to use here? + private NestedHashMap qualityScoreByFullCovariateKey = new NestedHashMap(); // Caches the result of performSequentialQualityCalculation(...) for all sets of covariate values. + + public BaseRecalibration( final File RECAL_FILE ) { + // Get a list of all available covariates + final List> classes = new PluginManager(Covariate.class).getPlugins(); + + int lineNumber = 0; + boolean foundAllCovariates = false; + + // Read in the data from the csv file and populate the data map and covariates list + boolean sawEOF = false; + try { + for ( String line : new XReadLines(RECAL_FILE) ) { + lineNumber++; + if ( EOF_MARKER.equals(line) ) { + sawEOF = true; + } else if( COMMENT_PATTERN.matcher(line).matches() ) { + ; // Skip over the comment lines, (which start with '#') + } + // Read in the covariates that were used from the input file + else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data + if( foundAllCovariates ) { + throw new UserException.MalformedFile( RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE ); + } else { // Found the covariate list in input file, loop through all of them and instantiate them + String[] vals = line.split(","); + for( int iii = 0; iii < vals.length - 4; iii++ ) { // There are n-4 covariates. The last four items are ErrorModel, nObservations, nMismatch, and Qempirical + boolean foundClass = false; + for( Class covClass : classes ) { + if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) { + foundClass = true; + try { + Covariate covariate = (Covariate)covClass.newInstance(); + requestedCovariates.add( covariate ); + } catch (Exception e) { + throw new DynamicClassResolutionException(covClass, e); + } + + } + } + + if( !foundClass ) { + throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." ); + } + } + } + + } else { // Found a line of data + if( !foundAllCovariates ) { + foundAllCovariates = true; + + // At this point all the covariates should have been found and initialized + if( requestedCovariates.size() < 2 ) { + throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. 
Covariate names can't be found in file: " + RECAL_FILE ); + } + + final boolean createCollapsedTables = true; + + // Initialize any covariate member variables using the shared argument collection + RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); + for( Covariate cov : requestedCovariates ) { + cov.initialize( RAC ); + } + // Initialize the data hashMaps + dataManager = new RecalDataManager( createCollapsedTables, requestedCovariates.size() ); + + } + addCSVData(RECAL_FILE, line); // Parse the line and add the data to the HashMap + } + } + + } catch ( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e); + } catch ( NumberFormatException e ) { + throw new UserException.MalformedFile(RECAL_FILE, "Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker."); + } + + if ( !sawEOF ) { + final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted or was generated with an old version of the CountCovariates tool."; + throw new UserException.MalformedFile(RECAL_FILE, errorMessage); + } + + if( dataManager == null ) { + throw new UserException.MalformedFile(RECAL_FILE, "Can't initialize the data manager. Perhaps the recal csv file contains no data?"); + } + + dataManager.generateEmpiricalQualities( 1, MAX_QUALITY_SCORE ); + } + + /** + * For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches) + * @param line A line of CSV data read from the recalibration table data file + */ + private void addCSVData(final File file, final String line) { + final String[] vals = line.split(","); + + // Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly + if( vals.length != requestedCovariates.size() + 4 ) { // +4 because of ErrorModel, nObservations, nMismatch, and Qempirical + throw new UserException.MalformedFile(file, "Malformed input recalibration file. 
Found data line with too many fields: " + line + + " --Perhaps the read group string contains a comma and isn't being parsed correctly."); + } + + final Object[] key = new Object[requestedCovariates.size()]; + Covariate cov; + int iii; + for( iii = 0; iii < requestedCovariates.size(); iii++ ) { + cov = requestedCovariates.get( iii ); + key[iii] = cov.getValue( vals[iii] ); + } + final String modelString = vals[iii++]; + final RecalDataManager.BaseRecalibrationType errorModel = CovariateKeySet.getErrorModelFromString(modelString); + + // Create a new datum using the number of observations, number of mismatches, and reported quality score + final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 ); + // Add that datum to all the collapsed tables which will be used in the sequential calculation + + dataManager.addToAllTables( key, datum, QualityUtils.MIN_USABLE_Q_SCORE, errorModel ); //BUGBUG: used to be Q5 now is Q6, probably doesn't matter + } + + public void recalibrateRead( final GATKSAMRecord read ) { + + //compute all covariate values for this read + RecalDataManager.computeCovariates(read, requestedCovariates); + final CovariateKeySet covariateKeySet = RecalDataManager.getAllCovariateValuesFor( read ); + + for( final RecalDataManager.BaseRecalibrationType errorModel : RecalDataManager.BaseRecalibrationType.values() ) { + final byte[] originalQuals = read.getBaseQualities( errorModel ); + final byte[] recalQuals = originalQuals.clone(); + + // For each base in the read + for( int offset = 0; offset < read.getReadLength(); offset++ ) { + + final Object[] fullCovariateKeyWithErrorMode = covariateKeySet.getKeySet(offset, errorModel); + final Object[] fullCovariateKey = Arrays.copyOfRange(fullCovariateKeyWithErrorMode, 0, fullCovariateKeyWithErrorMode.length-1); // need to strip off the error mode which was appended to the list of covariates + + // BUGBUG: This caching seems to put the entire key set into memory which negates the benefits of storing the delta delta tables? + //Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKeyWithErrorMode); + //if( qualityScore == null ) { + final byte qualityScore = performSequentialQualityCalculation( errorModel, fullCovariateKey ); + // qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKeyWithErrorMode); + //} + + recalQuals[offset] = qualityScore; + } + + preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low + read.setBaseQualities( recalQuals, errorModel ); + } + } + + /** + * Implements a serial recalibration of the reads using the combinational table. + * First, we perform a positional recalibration, and then a subsequent dinuc correction. + * + * Given the full recalibration table, we perform the following preprocessing steps: + * + * - calculate the global quality score shift across all data [DeltaQ] + * - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift + * -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual + * - The final shift equation is: + * + * Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... 
) + * @param key The list of Comparables that were calculated from the covariates + * @return A recalibrated quality score as a byte + */ + private byte performSequentialQualityCalculation( final RecalDataManager.BaseRecalibrationType errorModel, final Object... key ) { + + final byte qualFromRead = (byte)Integer.parseInt(key[1].toString()); + final Object[] readGroupCollapsedKey = new Object[1]; + final Object[] qualityScoreCollapsedKey = new Object[2]; + final Object[] covariateCollapsedKey = new Object[3]; + + // The global quality shift (over the read group only) + readGroupCollapsedKey[0] = key[0]; + final RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0, errorModel).get( readGroupCollapsedKey )); + double globalDeltaQ = 0.0; + if( globalRecalDatum != null ) { + final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality(); + final double aggregrateQReported = globalRecalDatum.getEstimatedQReported(); + globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported; + } + + // The shift in quality between reported and empirical + qualityScoreCollapsedKey[0] = key[0]; + qualityScoreCollapsedKey[1] = key[1]; + final RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1, errorModel).get( qualityScoreCollapsedKey )); + double deltaQReported = 0.0; + if( qReportedRecalDatum != null ) { + final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality(); + deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ; + } + + // The shift in quality due to each covariate by itself in turn + double deltaQCovariates = 0.0; + double deltaQCovariateEmpirical; + covariateCollapsedKey[0] = key[0]; + covariateCollapsedKey[1] = key[1]; + for( int iii = 2; iii < key.length; iii++ ) { + covariateCollapsedKey[2] = key[iii]; // The given covariate + final RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii, errorModel).get( covariateCollapsedKey )); + if( covariateRecalDatum != null ) { + deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality(); + deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) ); + } + } + + final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; + return QualityUtils.boundQual( (int)Math.round(newQuality), (byte)MAX_QUALITY_SCORE ); + } + + /** + * Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold + * @param originalQuals The list of original base quality scores + * @param recalQuals A list of the new recalibrated quality scores + */ + private void preserveQScores( final byte[] originalQuals, final byte[] recalQuals ) { + for( int iii = 0; iii < recalQuals.length; iii++ ) { + if( originalQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ) { //BUGBUG: used to be Q5 now is Q6, probably doesn't matter + recalQuals[iii] = originalQuals[iii]; + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentStartWithNoTiesComparator.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentStartWithNoTiesComparator.java index 02512c8dc9..682c766170 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentStartWithNoTiesComparator.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentStartWithNoTiesComparator.java @@ -36,8 +36,10 @@ else if (r2.getReadUnmappedFlag()) result = cmpContig; else { - if (r1.getAlignmentStart() < 
r2.getAlignmentStart()) result = -1; - else result = 1; + if (r1.getAlignmentStart() < r2.getAlignmentStart()) + result = -1; + else + result = 1; } } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index b8e8921014..3b27364182 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -1,745 +1,774 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.sam; - -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.BitSet; - - -public class AlignmentUtils { - - public static class MismatchCount { - public int numMismatches = 0; - public long mismatchQualities = 0; - } - - public static long mismatchingQualities(SAMRecord r, byte[] refSeq, int refIndex) { - return getMismatchCount(r, refSeq, refIndex).mismatchQualities; - } - - public static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex) { - return getMismatchCount(r,refSeq,refIndex,0,r.getReadLength()); - } - - // todo -- this code and mismatchesInRefWindow should be combined and optimized into a single - // todo -- high performance implementation. 
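The counting loop carried over below boils down to one rule per CIGAR operator; condensed into a sketch (error handling and the startOnRead/endOnRead window are omitted):

    import net.sf.samtools.Cigar;
    import net.sf.samtools.CigarElement;

    // M advances both coordinates and compares bases; I/S advance only the read;
    // D/N advance only the reference; H/P consume neither.
    static int countMismatches(final byte[] read, final byte[] ref, int refIndex, final Cigar cigar) {
        int mismatches = 0;
        int readIdx = 0;
        for (final CigarElement ce : cigar.getCigarElements()) {
            switch (ce.getOperator()) {
                case M:
                    for (int j = 0; j < ce.getLength(); j++, readIdx++, refIndex++)
                        if (refIndex < ref.length && read[readIdx] != ref[refIndex])
                            mismatches++;
                    break;
                case I: case S: readIdx += ce.getLength(); break;
                case D: case N: refIndex += ce.getLength(); break;
                default: break;
            }
        }
        return mismatches;
    }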
We can do a lot better than this right now - public static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex, int startOnRead, int nReadBases) { - MismatchCount mc = new MismatchCount(); - - int readIdx = 0; - int endOnRead = startOnRead + nReadBases - 1; // index of the last base on read we want to count - byte[] readSeq = r.getReadBases(); - Cigar c = r.getCigar(); - for (int i = 0 ; i < c.numCigarElements() ; i++) { - - if ( readIdx > endOnRead ) break; - - CigarElement ce = c.getCigarElement(i); - switch ( ce.getOperator() ) { - case M: - for (int j = 0 ; j < ce.getLength() ; j++, refIndex++, readIdx++ ) { - if ( refIndex >= refSeq.length ) - continue; - if ( readIdx < startOnRead ) continue; - if ( readIdx > endOnRead ) break; - byte refChr = refSeq[refIndex]; - byte readChr = readSeq[readIdx]; - // Note: we need to count X/N's as mismatches because that's what SAM requires - //if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 || - // BaseUtils.simpleBaseToBaseIndex(refChr) == -1 ) - // continue; // do not count Ns/Xs/etc ? - if ( readChr != refChr ) { - mc.numMismatches++; - mc.mismatchQualities += r.getBaseQualities()[readIdx]; - } - } - break; - case I: - case S: - readIdx += ce.getLength(); - break; - case D: - case N: - refIndex += ce.getLength(); - break; - case H: - case P: - break; - default: throw new ReviewedStingException("The " + ce.getOperator() + " cigar element is not currently supported"); - } - - } - return mc; - } - - /** Returns the number of mismatches in the pileup within the given reference context. - * - * @param pileup the pileup with reads - * @param ref the reference context - * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window) - * @return the number of mismatches - */ - public static int mismatchesInRefWindow(ReadBackedPileup pileup, ReferenceContext ref, boolean ignoreTargetSite) { - int mismatches = 0; - for ( PileupElement p : pileup ) - mismatches += mismatchesInRefWindow(p, ref, ignoreTargetSite); - return mismatches; - } - - /** Returns the number of mismatches in the pileup element within the given reference context. - * - * @param p the pileup element - * @param ref the reference context - * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window) - * @return the number of mismatches - */ - public static int mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite) { - return mismatchesInRefWindow(p, ref, ignoreTargetSite, false); - } - - /** Returns the number of mismatches in the pileup element within the given reference context. - * - * @param p the pileup element - * @param ref the reference context - * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. 
the center of the window) - * @param qualitySumInsteadOfMismatchCount if true, return the quality score sum of the mismatches rather than the count - * @return the number of mismatches - */ - public static int mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite, boolean qualitySumInsteadOfMismatchCount) { - int sum = 0; - - int windowStart = ref.getWindow().getStart(); - int windowStop = ref.getWindow().getStop(); - byte[] refBases = ref.getBases(); - byte[] readBases = p.getRead().getReadBases(); - byte[] readQualities = p.getRead().getBaseQualities(); - Cigar c = p.getRead().getCigar(); - - int readIndex = 0; - int currentPos = p.getRead().getAlignmentStart(); - int refIndex = Math.max(0, currentPos - windowStart); - - for (int i = 0 ; i < c.numCigarElements() ; i++) { - CigarElement ce = c.getCigarElement(i); - int cigarElementLength = ce.getLength(); - switch ( ce.getOperator() ) { - case M: - for (int j = 0; j < cigarElementLength; j++, readIndex++, currentPos++) { - // are we past the ref window? - if ( currentPos > windowStop ) - break; - - // are we before the ref window? - if ( currentPos < windowStart ) - continue; - - byte refChr = refBases[refIndex++]; - - // do we need to skip the target site? - if ( ignoreTargetSite && ref.getLocus().getStart() == currentPos ) - continue; - - byte readChr = readBases[readIndex]; - if ( readChr != refChr ) - sum += (qualitySumInsteadOfMismatchCount) ? readQualities[readIndex] : 1; - } - break; - case I: - case S: - readIndex += cigarElementLength; - break; - case D: - case N: - currentPos += cigarElementLength; - if ( currentPos > windowStart ) - refIndex += Math.min(cigarElementLength, currentPos - windowStart); - break; - case H: - case P: - break; - } - } - - return sum; - } - - /** Returns the number of mismatches in the pileup element within the given reference context. 
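The windowed variant below (the one returning a BitSet) hides a small sliding-window trick. Isolated over a plain boolean mismatch mask, assuming the mask has already been built by the usual CIGAR walk:

    import java.util.BitSet;

    // A position is "good" when at most maxMismatches mismatches fall within
    // windowSize bases on either side of it, not counting the position itself.
    static BitSet goodBases(final boolean[] mismatch, final int windowSize, final int maxMismatches) {
        final int n = mismatch.length;
        final BitSet good = new BitSet(n);
        int count = 0;
        for (int right = 1; right <= windowSize && right < n; right++)
            if (mismatch[right]) count++;                     // seed the window around position 0
        if (count <= maxMismatches) good.set(0);
        for (int pos = 1; pos < n; pos++) {
            if (pos + windowSize < n && mismatch[pos + windowSize]) count++; // new right edge enters
            if (mismatch[pos - 1]) count++;                   // previous center is counted again
            if (mismatch[pos]) count--;                       // new center is never counted
            final int left = pos - windowSize - 1;
            if (left >= 0 && mismatch[left]) count--;         // old left edge drops out
            if (count <= maxMismatches) good.set(pos);
        }
        return good;
    }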
- * - * @param read the SAMRecord - * @param ref the reference context - * @param maxMismatches the maximum number of surrounding mismatches we tolerate to consider a base good - * @param windowSize window size (on each side) to test - * @return a bitset representing which bases are good - */ - public static BitSet mismatchesInRefWindow(SAMRecord read, ReferenceContext ref, int maxMismatches, int windowSize) { - // first determine the positions with mismatches - int readLength = read.getReadLength(); - BitSet mismatches = new BitSet(readLength); - - // it's possible we aren't starting at the beginning of a read, - // and we don't need to look at any of the previous context outside our window - // (although we do need future context) - int readStartPos = Math.max(read.getAlignmentStart(), ref.getLocus().getStart() - windowSize); - int currentReadPos = read.getAlignmentStart(); - - byte[] refBases = ref.getBases(); - int refIndex = readStartPos - ref.getWindow().getStart(); - if ( refIndex < 0 ) { - throw new IllegalStateException("When calculating mismatches, we somehow don't have enough previous reference context for read " + read.getReadName() + " at position " + ref.getLocus()); - } - - byte[] readBases = read.getReadBases(); - int readIndex = 0; - - Cigar c = read.getCigar(); - - for (int i = 0 ; i < c.numCigarElements() ; i++) { - CigarElement ce = c.getCigarElement(i); - int cigarElementLength = ce.getLength(); - switch ( ce.getOperator() ) { - case M: - for (int j = 0; j < cigarElementLength; j++, readIndex++) { - // skip over unwanted bases - if ( currentReadPos++ < readStartPos ) - continue; - - // this is possible if reads extend beyond the contig end - if ( refIndex >= refBases.length ) - break; - - byte refChr = refBases[refIndex]; - byte readChr = readBases[readIndex]; - if ( readChr != refChr ) - mismatches.set(readIndex); - - refIndex++; - } - break; - case I: - case S: - readIndex += cigarElementLength; - break; - case D: - case N: - if ( currentReadPos >= readStartPos ) - refIndex += cigarElementLength; - currentReadPos += cigarElementLength; - break; - case H: - case P: - break; - } - } - - // all bits are set to false by default - BitSet result = new BitSet(readLength); - - int currentPos = 0, leftPos = 0, rightPos; - int mismatchCount = 0; - - // calculate how many mismatches exist in the windows to the left/right - for ( rightPos = 1; rightPos <= windowSize && rightPos < readLength; rightPos++) { - if ( mismatches.get(rightPos) ) - mismatchCount++; - } - if ( mismatchCount <= maxMismatches ) - result.set(currentPos); - - // now, traverse over the read positions - while ( currentPos < readLength ) { - // add a new rightmost position - if ( rightPos < readLength && mismatches.get(rightPos++) ) - mismatchCount++; - // re-penalize the previous position - if ( mismatches.get(currentPos++) ) - mismatchCount++; - // don't penalize the current position - if ( mismatches.get(currentPos) ) - mismatchCount--; - // subtract the leftmost position - if ( leftPos < currentPos - windowSize && mismatches.get(leftPos++) ) - mismatchCount--; - - if ( mismatchCount <= maxMismatches ) - result.set(currentPos); - } - - return result; - } - /** Returns number of alignment blocks (continuous stretches of aligned bases) in the specified alignment. - * This method follows closely the SAMRecord::getAlignmentBlocks() implemented in samtools library, but - * it only counts blocks without actually allocating and filling the list of blocks themselves. 
Hence, this method is - * a much more efficient alternative to r.getAlignmentBlocks.size() in the situations when this number is all that is needed. - * Formally, this method simply returns the number of M elements in the cigar. - * @param r alignment - * @return number of continuous alignment blocks (i.e. 'M' elements of the cigar; all indel and clipping elements are ignored). - */ - public static int getNumAlignmentBlocks(final SAMRecord r) { - int n = 0; - final Cigar cigar = r.getCigar(); - if (cigar == null) return 0; - - for (final CigarElement e : cigar.getCigarElements()) { - if (e.getOperator() == CigarOperator.M ) n++; - } - - return n; - } - - public static int getNumAlignedBases(final SAMRecord r) { - int n = 0; - final Cigar cigar = r.getCigar(); - if (cigar == null) return 0; - - for (final CigarElement e : cigar.getCigarElements()) { - if (e.getOperator() == CigarOperator.M ) { n += e.getLength(); } - } - - return n; - } - - public static byte[] alignmentToByteArray( final Cigar cigar, final byte[] read, final byte[] ref ) { - - final byte[] alignment = new byte[read.length]; - int refPos = 0; - int alignPos = 0; - - for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) { - - final CigarElement ce = cigar.getCigarElement(iii); - final int elementLength = ce.getLength(); - - switch( ce.getOperator() ) { - case I: - case S: - for ( int jjj = 0; jjj < elementLength; jjj++ ) { - alignment[alignPos++] = '+'; - } - break; - case D: - case N: - refPos += elementLength; - break; - case M: - for ( int jjj = 0; jjj < elementLength; jjj++ ) { - alignment[alignPos] = ref[refPos]; - alignPos++; - refPos++; - } - break; - case H: - case P: - break; - default: - throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() ); - } - } - return alignment; - } - - public static int calcAlignmentByteArrayOffset( final Cigar cigar, int pileupOffset, final int alignmentStart, final int refLocus ) { - - boolean atDeletion = false; - if(pileupOffset == -1) { - atDeletion = true; - pileupOffset = refLocus - alignmentStart; - final CigarElement ce = cigar.getCigarElement(0); - if( ce.getOperator() == CigarOperator.S ) { - pileupOffset += ce.getLength(); - } - } - int pos = 0; - int alignmentPos = 0; - for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) { - - final CigarElement ce = cigar.getCigarElement(iii); - final int elementLength = ce.getLength(); - - switch( ce.getOperator() ) { - case I: - case S: - pos += elementLength; - if( pos >= pileupOffset ) { - return alignmentPos; - } - break; - case D: - case N: - if(!atDeletion) { - alignmentPos += elementLength; - } else { - if( pos + elementLength - 1 >= pileupOffset ) { - return alignmentPos + (pileupOffset - pos); - } else { - pos += elementLength; - alignmentPos += elementLength; - } - } - break; - case M: - if( pos + elementLength - 1 >= pileupOffset ) { - return alignmentPos + (pileupOffset - pos); - } else { - pos += elementLength; - alignmentPos += elementLength; - } - break; - case H: - case P: - break; - default: - throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() ); - } - } - return alignmentPos; - } - - public static byte[] readToAlignmentByteArray( final Cigar cigar, final byte[] read ) { - - int alignmentLength = 0; - for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) { - - final CigarElement ce = cigar.getCigarElement(iii); - final int elementLength = ce.getLength(); - - switch( ce.getOperator() ) { - case I: - case S: - break; - case D: - case N: - 
alignmentLength += elementLength; - break; - case M: - alignmentLength += elementLength; - break; - case H: - case P: - break; - default: - throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() ); - } - } - - final byte[] alignment = new byte[alignmentLength]; - int alignPos = 0; - int readPos = 0; - for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) { - - final CigarElement ce = cigar.getCigarElement(iii); - final int elementLength = ce.getLength(); - - switch( ce.getOperator() ) { - case I: - if( alignPos > 0 ) { - if( alignment[alignPos-1] == BaseUtils.A ) { alignment[alignPos-1] = PileupElement.A_FOLLOWED_BY_INSERTION_BASE; } - else if( alignment[alignPos-1] == BaseUtils.C ) { alignment[alignPos-1] = PileupElement.C_FOLLOWED_BY_INSERTION_BASE; } - else if( alignment[alignPos-1] == BaseUtils.T ) { alignment[alignPos-1] = PileupElement.T_FOLLOWED_BY_INSERTION_BASE; } - else if( alignment[alignPos-1] == BaseUtils.G ) { alignment[alignPos-1] = PileupElement.G_FOLLOWED_BY_INSERTION_BASE; } - } - case S: - for ( int jjj = 0; jjj < elementLength; jjj++ ) { - readPos++; - } - break; - case D: - case N: - for ( int jjj = 0; jjj < elementLength; jjj++ ) { - alignment[alignPos] = PileupElement.DELETION_BASE; - alignPos++; - } - break; - case M: - for ( int jjj = 0; jjj < elementLength; jjj++ ) { - alignment[alignPos] = read[readPos]; - alignPos++; - readPos++; - } - break; - case H: - case P: - break; - default: - throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() ); - } - } - return alignment; - } - - /** - * Due to (unfortunate) multiple ways to indicate that read is unmapped allowed by SAM format - * specification, one may need this convenience shortcut. Checks both 'read unmapped' flag and - * alignment reference index/start. - * @param r record - * @return true if read is unmapped - */ - public static boolean isReadUnmapped(final SAMRecord r) { - if ( r.getReadUnmappedFlag() ) return true; - - // our life would be so much easier if all sam files followed the specs. In reality, - // sam files (including those generated by maq or bwa) miss headers altogether. When - // reading such a SAM file, reference name is set, but since there is no sequence dictionary, - // null is always returned for referenceIndex. Let's be paranoid here, and make sure that - // we do not call the read "unmapped" when it has only reference name set with ref. index missing - // or vice versa. - if ( ( r.getReferenceIndex() != null && r.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX - || r.getReferenceName() != null && !r.getReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME) ) - && r.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START ) return false ; - return true; - } - - /** - * Due to (unfortunate) multiple ways to indicate that read/mate is unmapped allowed by SAM format - * specification, one may need this convenience shortcut. Checks both 'mate unmapped' flag and - * alignment reference index/start of the mate. - * @param r sam record for the read - * @return true if read's mate is unmapped - */ - public static boolean isMateUnmapped(final SAMRecord r) { - if ( r.getMateUnmappedFlag() ) return true; - - // our life would be so much easier if all sam files followed the specs. In reality, - // sam files (including those generated by maq or bwa) miss headers altogether. When - // reading such a SAM file, reference name is set, but since there is no sequence dictionary, - // null is always returned for referenceIndex. 
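Condensed, the paranoia described here amounts to: trust the unmapped flag, but only call a read mapped when both a real reference and a real alignment start are present. A sketch using the same SAMRecord accessors as the method below:

    import net.sf.samtools.SAMRecord;

    // A read counts as mapped only if the flag says so AND the record names a
    // real reference AND it carries a real alignment start.
    static boolean isEffectivelyUnmapped(final SAMRecord r) {
        if (r.getReadUnmappedFlag())
            return true;
        final boolean hasReference =
                (r.getReferenceIndex() != null && r.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX)
                || (r.getReferenceName() != null && !r.getReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME));
        return !(hasReference && r.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START);
    }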
Let's be paranoid here, and make sure that - // we do not call the read "unmapped" when it has only reference name set with ref. index missing - // or vice versa. - if ( ( r.getMateReferenceIndex() != null && r.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX - || r.getMateReferenceName() != null && !r.getMateReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME) ) - && r.getMateAlignmentStart() != SAMRecord.NO_ALIGNMENT_START ) return false ; - return true; - } - - /** Returns true is read is mapped and mapped uniquely (Q>0). - * - * @param read - * @return - */ - public static boolean isReadUniquelyMapped(SAMRecord read) { - return ( ! AlignmentUtils.isReadUnmapped(read) ) && read.getMappingQuality() > 0; - } - - /** Returns the array of base qualitites in the order the bases were read on the machine (i.e. always starting from - * cycle 1). In other words, if the read is unmapped or aligned in the forward direction, the read's own base - * qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array - * of read's base qualitites is inverted (in this case new array is allocated and returned). - * @param read - * @return - */ - public static byte [] getQualsInCycleOrder(SAMRecord read) { - if ( isReadUnmapped(read) || ! read.getReadNegativeStrandFlag() ) return read.getBaseQualities(); - - return Utils.reverse(read.getBaseQualities()); - } - - /** Returns the array of original base qualitites (before recalibration) in the order the bases were read on the machine (i.e. always starting from - * cycle 1). In other words, if the read is unmapped or aligned in the forward direction, the read's own base - * qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array - * of read's base qualitites is inverted (in this case new array is allocated and returned). If no original base qualities - * are available this method will throw a runtime exception. - * @param read - * @return - */ - public static byte [] getOriginalQualsInCycleOrder(SAMRecord read) { - if ( isReadUnmapped(read) || ! read.getReadNegativeStrandFlag() ) return read.getOriginalBaseQualities(); - - return Utils.reverse(read.getOriginalBaseQualities()); - } - - /** Takes the alignment of the read sequence readSeq to the reference sequence refSeq - * starting at 0-based position refIndex on the refSeq and specified by its cigar. - * The last argument readIndex specifies 0-based position on the read where the alignment described by the - * cigar starts. Usually cigars specify alignments of the whole read to the ref, so that readIndex is normally 0. - * Use non-zero readIndex only when the alignment cigar represents alignment of a part of the read. The refIndex in this case - * should be the position where the alignment of that part of the read starts at. In other words, both refIndex and readIndex are - * always the positions where the cigar starts on the ref and on the read, respectively. - * - * If the alignment has an indel, then this method attempts moving this indel left across a stretch of repetitive bases. For instance, if the original cigar - * specifies that (any) one AT is deleted from a repeat sequence TATATATA, the output cigar will always mark the leftmost AT - * as deleted. If there is no indel in the original cigar, or the indel position is determined unambiguously (i.e. inserted/deleted sequence - * is not repeated), the original cigar is returned. 
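The repeat-shifting idea behind leftAlignIndel is easiest to see on plain strings: keep sliding the deletion one base left while the resulting haplotype is unchanged. A deletion-only sketch (the real method below works on CIGARs and handles insertions as well):

    // Returns the leftmost start for a deletion of delLen bases that yields the
    // same haplotype; e.g. any single 2-base deletion inside "TATATATA" slides to offset 0.
    static int leftAlignDeletion(final String ref, int delStart, final int delLen) {
        final String haplotype = ref.substring(0, delStart) + ref.substring(delStart + delLen);
        while (delStart > 0) {
            final String shifted = ref.substring(0, delStart - 1) + ref.substring(delStart - 1 + delLen);
            if (!shifted.equals(haplotype))
                break; // shifting further would change the haplotype
            delStart--;
        }
        return delStart;
    }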
- * @param cigar structure of the original alignment - * @param refSeq reference sequence the read is aligned to - * @param readSeq read sequence - * @param refIndex 0-based alignment start position on ref - * @param readIndex 0-based alignment start position on read - * @return a cigar, in which indel is guaranteed to be placed at the leftmost possible position across a repeat (if any) - */ - public static Cigar leftAlignIndel(Cigar cigar, final byte[] refSeq, final byte[] readSeq, final int refIndex, final int readIndex) { - - int indexOfIndel = -1; - for ( int i = 0; i < cigar.numCigarElements(); i++ ) { - CigarElement ce = cigar.getCigarElement(i); - if ( ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I ) { - // if there is more than 1 indel, don't left align - if ( indexOfIndel != -1 ) - return cigar; - indexOfIndel = i; - } - } - - // if there is no indel or if the alignment starts with an insertion (so that there - // is no place on the read to move that insertion further left), we are done - if ( indexOfIndel < 1 ) return cigar; - - final int indelLength = cigar.getCigarElement(indexOfIndel).getLength(); - - byte[] altString = createIndelString(cigar, indexOfIndel, refSeq, readSeq, refIndex, readIndex); - if ( altString == null ) - return cigar; - - Cigar newCigar = cigar; - for ( int i = 0; i < indelLength; i++ ) { - newCigar = moveCigarLeft(newCigar, indexOfIndel); - byte[] newAltString = createIndelString(newCigar, indexOfIndel, refSeq, readSeq, refIndex, readIndex); - - // check to make sure we haven't run off the end of the read - boolean reachedEndOfRead = cigarHasZeroSizeElement(newCigar); - - if ( Arrays.equals(altString, newAltString) ) { - cigar = newCigar; - i = -1; - if ( reachedEndOfRead ) - cigar = cleanUpCigar(cigar); - } - - if ( reachedEndOfRead ) - break; - } - - return cigar; - } - - private static boolean cigarHasZeroSizeElement(Cigar c) { - for ( CigarElement ce : c.getCigarElements() ) { - if ( ce.getLength() == 0 ) - return true; - } - return false; - } - - private static Cigar cleanUpCigar(Cigar c) { - ArrayList elements = new ArrayList(c.numCigarElements()-1); - for ( CigarElement ce : c.getCigarElements() ) { - if ( ce.getLength() != 0 && - (elements.size() != 0 || ce.getOperator() != CigarOperator.D) ) { - elements.add(ce); - } - } - return new Cigar(elements); - } - - private static Cigar moveCigarLeft(Cigar cigar, int indexOfIndel) { - // get the first few elements - ArrayList elements = new ArrayList(cigar.numCigarElements()); - for ( int i = 0; i < indexOfIndel - 1; i++) - elements.add(cigar.getCigarElement(i)); - - // get the indel element and move it left one base - CigarElement ce = cigar.getCigarElement(indexOfIndel-1); - elements.add(new CigarElement(ce.getLength()-1, ce.getOperator())); - elements.add(cigar.getCigarElement(indexOfIndel)); - if ( indexOfIndel+1 < cigar.numCigarElements() ) { - ce = cigar.getCigarElement(indexOfIndel+1); - elements.add(new CigarElement(ce.getLength()+1, ce.getOperator())); - } else { - elements.add(new CigarElement(1, CigarOperator.M)); - } - - // get the last few elements - for ( int i = indexOfIndel + 2; i < cigar.numCigarElements(); i++) - elements.add(cigar.getCigarElement(i)); - return new Cigar(elements); - } - - private static byte[] createIndelString(final Cigar cigar, final int indexOfIndel, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) { - CigarElement indel = cigar.getCigarElement(indexOfIndel); - int indelLength = indel.getLength(); - - int 
totalRefBases = 0; - for ( int i = 0; i < indexOfIndel; i++ ) { - CigarElement ce = cigar.getCigarElement(i); - int length = ce.getLength(); - - switch( ce.getOperator() ) { - case M: - readIndex += length; - refIndex += length; - totalRefBases += length; - break; - case S: - readIndex += length; - break; - case N: - refIndex += length; - totalRefBases += length; - break; - default: - break; - } - } - - // sometimes, when there are very large known indels, we won't have enough reference sequence to cover them - if ( totalRefBases + indelLength > refSeq.length ) - indelLength -= (totalRefBases + indelLength - refSeq.length); - - // the indel-based reference string - byte[] alt = new byte[refSeq.length + (indelLength * (indel.getOperator() == CigarOperator.D ? -1 : 1))]; - - // add the bases before the indel, making sure it's not aligned off the end of the reference - if ( refIndex > alt.length || refIndex > refSeq.length ) - return null; - System.arraycopy(refSeq, 0, alt, 0, refIndex); - int currentPos = refIndex; - - // take care of the indel - if ( indel.getOperator() == CigarOperator.D ) { - refIndex += indelLength; - } else { - System.arraycopy(readSeq, readIndex, alt, currentPos, indelLength); - currentPos += indelLength; - } - - // add the bases after the indel, making sure it's not aligned off the end of the reference - if ( refSeq.length - refIndex > alt.length - currentPos ) - return null; - System.arraycopy(refSeq, refIndex, alt, currentPos, refSeq.length - refIndex); - - return alt; - } -} +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.BitSet; + + +public class AlignmentUtils { + + public static class MismatchCount { + public int numMismatches = 0; + public long mismatchQualities = 0; + } + + public static long mismatchingQualities(SAMRecord r, byte[] refSeq, int refIndex) { + return getMismatchCount(r, refSeq, refIndex).mismatchQualities; + } + + public static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex) { + return getMismatchCount(r, refSeq, refIndex, 0, r.getReadLength()); + } + + // todo -- this code and mismatchesInRefWindow should be combined and optimized into a single + // todo -- high performance implementation. We can do a lot better than this right now + public static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex, int startOnRead, int nReadBases) { + MismatchCount mc = new MismatchCount(); + + int readIdx = 0; + int endOnRead = startOnRead + nReadBases - 1; // index of the last base on read we want to count + byte[] readSeq = r.getReadBases(); + Cigar c = r.getCigar(); + for (int i = 0; i < c.numCigarElements(); i++) { + + if (readIdx > endOnRead) break; + + CigarElement ce = c.getCigarElement(i); + switch (ce.getOperator()) { + case M: + for (int j = 0; j < ce.getLength(); j++, refIndex++, readIdx++) { + if (refIndex >= refSeq.length) + continue; + if (readIdx < startOnRead) continue; + if (readIdx > endOnRead) break; + byte refChr = refSeq[refIndex]; + byte readChr = readSeq[readIdx]; + // Note: we need to count X/N's as mismatches because that's what SAM requires + //if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 || + // BaseUtils.simpleBaseToBaseIndex(refChr) == -1 ) + // continue; // do not count Ns/Xs/etc ? + if (readChr != refChr) { + mc.numMismatches++; + mc.mismatchQualities += r.getBaseQualities()[readIdx]; + } + } + break; + case I: + case S: + readIdx += ce.getLength(); + break; + case D: + case N: + refIndex += ce.getLength(); + break; + case H: + case P: + break; + default: + throw new ReviewedStingException("The " + ce.getOperator() + " cigar element is not currently supported"); + } + + } + return mc; + } + + /** + * Returns the number of mismatches in the pileup within the given reference context. + * + * @param pileup the pileup with reads + * @param ref the reference context + * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window) + * @return the number of mismatches + */ + public static int mismatchesInRefWindow(ReadBackedPileup pileup, ReferenceContext ref, boolean ignoreTargetSite) { + int mismatches = 0; + for (PileupElement p : pileup) + mismatches += mismatchesInRefWindow(p, ref, ignoreTargetSite); + return mismatches; + } + + /** + * Returns the number of mismatches in the pileup element within the given reference context. 
+ * + * @param p the pileup element + * @param ref the reference context + * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window) + * @return the number of mismatches + */ + public static int mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite) { + return mismatchesInRefWindow(p, ref, ignoreTargetSite, false); + } + + /** + * Returns the number of mismatches in the pileup element within the given reference context. + * + * @param p the pileup element + * @param ref the reference context + * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window) + * @param qualitySumInsteadOfMismatchCount + * if true, return the quality score sum of the mismatches rather than the count + * @return the number of mismatches + */ + public static int mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite, boolean qualitySumInsteadOfMismatchCount) { + int sum = 0; + + int windowStart = ref.getWindow().getStart(); + int windowStop = ref.getWindow().getStop(); + byte[] refBases = ref.getBases(); + byte[] readBases = p.getRead().getReadBases(); + byte[] readQualities = p.getRead().getBaseQualities(); + Cigar c = p.getRead().getCigar(); + + int readIndex = 0; + int currentPos = p.getRead().getAlignmentStart(); + int refIndex = Math.max(0, currentPos - windowStart); + + for (int i = 0; i < c.numCigarElements(); i++) { + CigarElement ce = c.getCigarElement(i); + int cigarElementLength = ce.getLength(); + switch (ce.getOperator()) { + case M: + for (int j = 0; j < cigarElementLength; j++, readIndex++, currentPos++) { + // are we past the ref window? + if (currentPos > windowStop) + break; + + // are we before the ref window? + if (currentPos < windowStart) + continue; + + byte refChr = refBases[refIndex++]; + + // do we need to skip the target site? + if (ignoreTargetSite && ref.getLocus().getStart() == currentPos) + continue; + + byte readChr = readBases[readIndex]; + if (readChr != refChr) + sum += (qualitySumInsteadOfMismatchCount) ? readQualities[readIndex] : 1; + } + break; + case I: + case S: + readIndex += cigarElementLength; + break; + case D: + case N: + currentPos += cigarElementLength; + if (currentPos > windowStart) + refIndex += Math.min(cigarElementLength, currentPos - windowStart); + break; + case H: + case P: + break; + } + } + + return sum; + } + + /** + * Determines which bases of the read are 'good', i.e. are surrounded by at most maxMismatches mismatches within windowSize bases on each side.
+ * + * @param read the SAMRecord + * @param ref the reference context + * @param maxMismatches the maximum number of surrounding mismatches we tolerate to consider a base good + * @param windowSize window size (on each side) to test + * @return a bitset representing which bases are good + */ + public static BitSet mismatchesInRefWindow(SAMRecord read, ReferenceContext ref, int maxMismatches, int windowSize) { + // first determine the positions with mismatches + int readLength = read.getReadLength(); + BitSet mismatches = new BitSet(readLength); + + // it's possible we aren't starting at the beginning of a read, + // and we don't need to look at any of the previous context outside our window + // (although we do need future context) + int readStartPos = Math.max(read.getAlignmentStart(), ref.getLocus().getStart() - windowSize); + int currentReadPos = read.getAlignmentStart(); + + byte[] refBases = ref.getBases(); + int refIndex = readStartPos - ref.getWindow().getStart(); + if (refIndex < 0) { + throw new IllegalStateException("When calculating mismatches, we somehow don't have enough previous reference context for read " + read.getReadName() + " at position " + ref.getLocus()); + } + + byte[] readBases = read.getReadBases(); + int readIndex = 0; + + Cigar c = read.getCigar(); + + for (int i = 0; i < c.numCigarElements(); i++) { + CigarElement ce = c.getCigarElement(i); + int cigarElementLength = ce.getLength(); + switch (ce.getOperator()) { + case M: + for (int j = 0; j < cigarElementLength; j++, readIndex++) { + // skip over unwanted bases + if (currentReadPos++ < readStartPos) + continue; + + // this is possible if reads extend beyond the contig end + if (refIndex >= refBases.length) + break; + + byte refChr = refBases[refIndex]; + byte readChr = readBases[readIndex]; + if (readChr != refChr) + mismatches.set(readIndex); + + refIndex++; + } + break; + case I: + case S: + readIndex += cigarElementLength; + break; + case D: + case N: + if (currentReadPos >= readStartPos) + refIndex += cigarElementLength; + currentReadPos += cigarElementLength; + break; + case H: + case P: + break; + } + } + + // all bits are set to false by default + BitSet result = new BitSet(readLength); + + int currentPos = 0, leftPos = 0, rightPos; + int mismatchCount = 0; + + // calculate how many mismatches exist in the windows to the left/right + for (rightPos = 1; rightPos <= windowSize && rightPos < readLength; rightPos++) { + if (mismatches.get(rightPos)) + mismatchCount++; + } + if (mismatchCount <= maxMismatches) + result.set(currentPos); + + // now, traverse over the read positions + while (currentPos < readLength) { + // add a new rightmost position + if (rightPos < readLength && mismatches.get(rightPos++)) + mismatchCount++; + // re-penalize the previous position + if (mismatches.get(currentPos++)) + mismatchCount++; + // don't penalize the current position + if (mismatches.get(currentPos)) + mismatchCount--; + // subtract the leftmost position + if (leftPos < currentPos - windowSize && mismatches.get(leftPos++)) + mismatchCount--; + + if (mismatchCount <= maxMismatches) + result.set(currentPos); + } + + return result; + } + + /** + * Returns number of alignment blocks (continuous stretches of aligned bases) in the specified alignment. + * This method follows closely the SAMRecord::getAlignmentBlocks() implemented in samtools library, but + * it only counts blocks without actually allocating and filling the list of blocks themselves. 
Hence, this method is + * a much more efficient alternative to r.getAlignmentBlocks.size() in the situations when this number is all that is needed. + * Formally, this method simply returns the number of M elements in the cigar. + * + * @param r alignment + * @return number of continuous alignment blocks (i.e. 'M' elements of the cigar; all indel and clipping elements are ignored). + */ + public static int getNumAlignmentBlocks(final SAMRecord r) { + int n = 0; + final Cigar cigar = r.getCigar(); + if (cigar == null) return 0; + + for (final CigarElement e : cigar.getCigarElements()) { + if (e.getOperator() == CigarOperator.M) n++; + } + + return n; + } + + public static int getNumAlignedBases(final SAMRecord r) { + int n = 0; + final Cigar cigar = r.getCigar(); + if (cigar == null) return 0; + + for (final CigarElement e : cigar.getCigarElements()) + if (e.getOperator() == CigarOperator.M) + n += e.getLength(); + + return n; + } + + public static byte[] alignmentToByteArray(final Cigar cigar, final byte[] read, final byte[] ref) { + + final byte[] alignment = new byte[read.length]; + int refPos = 0; + int alignPos = 0; + + for (int iii = 0; iii < cigar.numCigarElements(); iii++) { + + final CigarElement ce = cigar.getCigarElement(iii); + final int elementLength = ce.getLength(); + + switch (ce.getOperator()) { + case I: + case S: + for (int jjj = 0; jjj < elementLength; jjj++) { + alignment[alignPos++] = '+'; + } + break; + case D: + case N: + refPos += elementLength; + break; + case M: + for (int jjj = 0; jjj < elementLength; jjj++) { + alignment[alignPos] = ref[refPos]; + alignPos++; + refPos++; + } + break; + case H: + case P: + break; + default: + throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator()); + } + } + return alignment; + } + + public static int calcAlignmentByteArrayOffset(final Cigar cigar, PileupElement pileup, final int alignmentStart, final int refLocus) { + int pileupOffset = pileup.getOffset(); + + // Special case for reads starting with insertion + if (pileup.isInsertionAtBeginningOfRead()) + return 0; + + // Reassign the offset if we are in the middle of a deletion because of the modified representation of the read bases + if (pileup.isDeletion()) { + pileupOffset = refLocus - alignmentStart; + final CigarElement ce = cigar.getCigarElement(0); + if (ce.getOperator() == CigarOperator.S) { + pileupOffset += ce.getLength(); + } + } + + int pos = 0; + int alignmentPos = 0; + + for (int iii = 0; iii < cigar.numCigarElements(); iii++) { + final CigarElement ce = cigar.getCigarElement(iii); + final int elementLength = ce.getLength(); + + switch (ce.getOperator()) { + case I: + case S: + pos += elementLength; + if (pos >= pileupOffset) { + return alignmentPos; + } + break; + case D: + case N: + if (!pileup.isDeletion()) { + alignmentPos += elementLength; + } else { + if (pos + elementLength - 1 >= pileupOffset) { + return alignmentPos + (pileupOffset - pos); + } else { + pos += elementLength; + alignmentPos += elementLength; + } + } + break; + case M: + if (pos + elementLength - 1 >= pileupOffset) { + return alignmentPos + (pileupOffset - pos); + } else { + pos += elementLength; + alignmentPos += elementLength; + } + break; + case H: + case P: + break; + default: + throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator()); + } + } + + return alignmentPos; + } + + public static byte[] readToAlignmentByteArray(final Cigar cigar, final byte[] read) { + + int alignmentLength = 0; + for (int iii = 0; iii < 
cigar.numCigarElements(); iii++) { + + final CigarElement ce = cigar.getCigarElement(iii); + final int elementLength = ce.getLength(); + + switch (ce.getOperator()) { + case I: + case S: + break; + case D: + case N: + alignmentLength += elementLength; + break; + case M: + alignmentLength += elementLength; + break; + case H: + case P: + break; + default: + throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator()); + } + } + + final byte[] alignment = new byte[alignmentLength]; + int alignPos = 0; + int readPos = 0; + for (int iii = 0; iii < cigar.numCigarElements(); iii++) { + + final CigarElement ce = cigar.getCigarElement(iii); + final int elementLength = ce.getLength(); + + switch (ce.getOperator()) { + case I: + if (alignPos > 0) { + if (alignment[alignPos - 1] == BaseUtils.A) { + alignment[alignPos - 1] = PileupElement.A_FOLLOWED_BY_INSERTION_BASE; + } else if (alignment[alignPos - 1] == BaseUtils.C) { + alignment[alignPos - 1] = PileupElement.C_FOLLOWED_BY_INSERTION_BASE; + } else if (alignment[alignPos - 1] == BaseUtils.T) { + alignment[alignPos - 1] = PileupElement.T_FOLLOWED_BY_INSERTION_BASE; + } else if (alignment[alignPos - 1] == BaseUtils.G) { + alignment[alignPos - 1] = PileupElement.G_FOLLOWED_BY_INSERTION_BASE; + } + } + case S: + for (int jjj = 0; jjj < elementLength; jjj++) { + readPos++; + } + break; + case D: + case N: + for (int jjj = 0; jjj < elementLength; jjj++) { + alignment[alignPos] = PileupElement.DELETION_BASE; + alignPos++; + } + break; + case M: + for (int jjj = 0; jjj < elementLength; jjj++) { + alignment[alignPos] = read[readPos]; + alignPos++; + readPos++; + } + break; + case H: + case P: + break; + default: + throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator()); + } + } + return alignment; + } + + /** + * Due to (unfortunate) multiple ways to indicate that read is unmapped allowed by SAM format + * specification, one may need this convenience shortcut. Checks both 'read unmapped' flag and + * alignment reference index/start. + * + * @param r record + * @return true if read is unmapped + */ + public static boolean isReadUnmapped(final SAMRecord r) { + if (r.getReadUnmappedFlag()) return true; + + // our life would be so much easier if all sam files followed the specs. In reality, + // sam files (including those generated by maq or bwa) miss headers altogether. When + // reading such a SAM file, reference name is set, but since there is no sequence dictionary, + // null is always returned for referenceIndex. Let's be paranoid here, and make sure that + // we do not call the read "unmapped" when it has only reference name set with ref. index missing + // or vice versa. + if ((r.getReferenceIndex() != null && r.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX + || r.getReferenceName() != null && !r.getReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) + && r.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START) return false; + return true; + } + + /** + * Due to (unfortunate) multiple ways to indicate that read/mate is unmapped allowed by SAM format + * specification, one may need this convenience shortcut. Checks both 'mate unmapped' flag and + * alignment reference index/start of the mate. + * + * @param r sam record for the read + * @return true if read's mate is unmapped + */ + public static boolean isMateUnmapped(final SAMRecord r) { + if (r.getMateUnmappedFlag()) return true; + + // our life would be so much easier if all sam files followed the specs. 
In reality, + * sam files (including those generated by maq or bwa) miss headers altogether. When + * reading such a SAM file, reference name is set, but since there is no sequence dictionary, + * null is always returned for referenceIndex. Let's be paranoid here, and make sure that + * we do not call the mate "unmapped" when it has only reference name set with ref. index missing + * or vice versa. + if ((r.getMateReferenceIndex() != null && r.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX + || r.getMateReferenceName() != null && !r.getMateReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) + && r.getMateAlignmentStart() != SAMRecord.NO_ALIGNMENT_START) return false; + return true; + } + + /** + * Returns true if the read is mapped and mapped uniquely (Q>0). + * + * @param read + * @return + */ + public static boolean isReadUniquelyMapped(SAMRecord read) { + return (!AlignmentUtils.isReadUnmapped(read)) && read.getMappingQuality() > 0; + } + + /** + * Returns the array of base qualities in the order the bases were read on the machine (i.e. always starting from + * cycle 1). In other words, if the read is unmapped or aligned in the forward direction, the read's own base + * qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array + * of read's base qualities is inverted (in this case new array is allocated and returned). + * + * @param read + * @return + */ + public static byte[] getQualsInCycleOrder(SAMRecord read) { + if (isReadUnmapped(read) || !read.getReadNegativeStrandFlag()) return read.getBaseQualities(); + + return Utils.reverse(read.getBaseQualities()); + } + + /** + * Returns the array of original base qualities (before recalibration) in the order the bases were read on the machine (i.e. always starting from + * cycle 1). In other words, if the read is unmapped or aligned in the forward direction, the read's own base + * qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array + * of read's base qualities is inverted (in this case new array is allocated and returned). If no original base qualities + * are available this method will throw a runtime exception. + * + * @param read + * @return + */ + public static byte[] getOriginalQualsInCycleOrder(SAMRecord read) { + if (isReadUnmapped(read) || !read.getReadNegativeStrandFlag()) return read.getOriginalBaseQualities(); + + return Utils.reverse(read.getOriginalBaseQualities()); + }
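Since the cycle-order convention above is easy to get backwards, here is a small sketch of the intended behavior (illustration only, using the artificial-read helper from this same patch; the values are invented):

    // A mapped reverse-strand read: machine-cycle order is the reverse of
    // reference order, so the quality array comes back inverted. A new array
    // is returned; the record's own qualities are left untouched.
    GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(
            "ACGT".getBytes(), new byte[]{10, 20, 30, 40}, "4M");
    read.setReadNegativeStrandFlag(true);
    byte[] cycleQuals = AlignmentUtils.getQualsInCycleOrder(read);
    // cycleQuals == {40, 30, 20, 10}

+ + /** + * Takes the alignment of the read sequence readSeq to the reference sequence refSeq + * starting at 0-based position refIndex on the refSeq and specified by its cigar. + * The last argument readIndex specifies 0-based position on the read where the alignment described by the + * cigar starts. Usually cigars specify alignments of the whole read to the ref, so that readIndex is normally 0. + * Use non-zero readIndex only when the alignment cigar represents alignment of a part of the read. The refIndex in this case + * should be the position where the alignment of that part of the read starts. In other words, both refIndex and readIndex are + * always the positions where the cigar starts on the ref and on the read, respectively. + *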

+ * If the alignment has an indel, then this method attempts moving this indel left across a stretch of repetitive bases. For instance, if the original cigar + * specifies that (any) one AT is deleted from a repeat sequence TATATATA, the output cigar will always mark the leftmost AT + * as deleted. If there is no indel in the original cigar, or the indel position is determined unambiguously (i.e. inserted/deleted sequence + * is not repeated), the original cigar is returned. + * + * @param cigar structure of the original alignment + * @param refSeq reference sequence the read is aligned to + * @param readSeq read sequence + * @param refIndex 0-based alignment start position on ref + * @param readIndex 0-based alignment start position on read + * @return a cigar, in which indel is guaranteed to be placed at the leftmost possible position across a repeat (if any) + */ + public static Cigar leftAlignIndel(Cigar cigar, final byte[] refSeq, final byte[] readSeq, final int refIndex, final int readIndex) { + + int indexOfIndel = -1; + for (int i = 0; i < cigar.numCigarElements(); i++) { + CigarElement ce = cigar.getCigarElement(i); + if (ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I) { + // if there is more than 1 indel, don't left align + if (indexOfIndel != -1) + return cigar; + indexOfIndel = i; + } + } + + // if there is no indel or if the alignment starts with an insertion (so that there + // is no place on the read to move that insertion further left), we are done + if (indexOfIndel < 1) return cigar; + + final int indelLength = cigar.getCigarElement(indexOfIndel).getLength(); + + byte[] altString = createIndelString(cigar, indexOfIndel, refSeq, readSeq, refIndex, readIndex); + if (altString == null) + return cigar; + + Cigar newCigar = cigar; + for (int i = 0; i < indelLength; i++) { + newCigar = moveCigarLeft(newCigar, indexOfIndel); + byte[] newAltString = createIndelString(newCigar, indexOfIndel, refSeq, readSeq, refIndex, readIndex); + + // check to make sure we haven't run off the end of the read + boolean reachedEndOfRead = cigarHasZeroSizeElement(newCigar); + + if (Arrays.equals(altString, newAltString)) { + cigar = newCigar; + i = -1; + if (reachedEndOfRead) + cigar = cleanUpCigar(cigar); + } + + if (reachedEndOfRead) + break; + } + + return cigar; + } + + private static boolean cigarHasZeroSizeElement(Cigar c) { + for (CigarElement ce : c.getCigarElements()) { + if (ce.getLength() == 0) + return true; + } + return false; + } + + private static Cigar cleanUpCigar(Cigar c) { + ArrayList elements = new ArrayList(c.numCigarElements() - 1); + for (CigarElement ce : c.getCigarElements()) { + if (ce.getLength() != 0 && + (elements.size() != 0 || ce.getOperator() != CigarOperator.D)) { + elements.add(ce); + } + } + return new Cigar(elements); + } + + private static Cigar moveCigarLeft(Cigar cigar, int indexOfIndel) { + // get the first few elements + ArrayList elements = new ArrayList(cigar.numCigarElements()); + for (int i = 0; i < indexOfIndel - 1; i++) + elements.add(cigar.getCigarElement(i)); + + // get the indel element and move it left one base + CigarElement ce = cigar.getCigarElement(indexOfIndel - 1); + elements.add(new CigarElement(ce.getLength() - 1, ce.getOperator())); + elements.add(cigar.getCigarElement(indexOfIndel)); + if (indexOfIndel + 1 < cigar.numCigarElements()) { + ce = cigar.getCigarElement(indexOfIndel + 1); + elements.add(new CigarElement(ce.getLength() + 1, ce.getOperator())); + } else { + elements.add(new CigarElement(1, 
CigarOperator.M)); + } + + // get the last few elements + for (int i = indexOfIndel + 2; i < cigar.numCigarElements(); i++) + elements.add(cigar.getCigarElement(i)); + return new Cigar(elements); + } + + private static byte[] createIndelString(final Cigar cigar, final int indexOfIndel, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) { + CigarElement indel = cigar.getCigarElement(indexOfIndel); + int indelLength = indel.getLength(); + + int totalRefBases = 0; + for (int i = 0; i < indexOfIndel; i++) { + CigarElement ce = cigar.getCigarElement(i); + int length = ce.getLength(); + + switch (ce.getOperator()) { + case M: + readIndex += length; + refIndex += length; + totalRefBases += length; + break; + case S: + readIndex += length; + break; + case N: + refIndex += length; + totalRefBases += length; + break; + default: + break; + } + } + + // sometimes, when there are very large known indels, we won't have enough reference sequence to cover them + if (totalRefBases + indelLength > refSeq.length) + indelLength -= (totalRefBases + indelLength - refSeq.length); + + // the indel-based reference string + byte[] alt = new byte[refSeq.length + (indelLength * (indel.getOperator() == CigarOperator.D ? -1 : 1))]; + + // add the bases before the indel, making sure it's not aligned off the end of the reference + if (refIndex > alt.length || refIndex > refSeq.length) + return null; + System.arraycopy(refSeq, 0, alt, 0, refIndex); + int currentPos = refIndex; + + // take care of the indel + if (indel.getOperator() == CigarOperator.D) { + refIndex += indelLength; + } else { + System.arraycopy(readSeq, readIndex, alt, currentPos, indelLength); + currentPos += indelLength; + } + + // add the bases after the indel, making sure it's not aligned off the end of the reference + if (refSeq.length - refIndex > alt.length - currentPos) + return null; + System.arraycopy(refSeq, refIndex, alt, currentPos, refSeq.length - refIndex); + + return alt; + } +}
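The left-alignment loop above is the heart of this new file: it shifts the indel one base left at a time and keeps a shift only while the implied alternate sequence is unchanged. A worked sketch, illustration only, with invented sequences and the cigar built via samtools' TextCigarCodec:

    // ref:  ACTATATATG    read: ACTATATG (one unit of the TA repeat deleted)
    // 7M2D1M (right-shifted) and 2M2D6M (left-shifted) imply the same haplotype
    byte[] refSeq  = "ACTATATATG".getBytes();
    byte[] readSeq = "ACTATATG".getBytes();
    Cigar original    = TextCigarCodec.getSingleton().decode("7M2D1M");
    Cigar leftAligned = AlignmentUtils.leftAlignIndel(original, refSeq, readSeq, 0, 0);
    // leftAligned now encodes 2M2D6M: the leftmost placement of the deleted unit,
    // because every intermediate shift produced the identical alt string "ACTATATG"

diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java index cedd56bdfb..b17e325fca 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java @@ -27,7 +27,7 @@ public class ArtificialSAMUtils { * @param chromosomeSize how large each chromosome is * @param readsPerChomosome how many reads to make in each chromosome. They'll be aligned from position 1 to x (which is the number of reads) */ - public static void createArtificialBamFile( String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome ) { + public static void createArtificialBamFile(String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome) { SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize); File outFile = new File(filename); @@ -51,7 +51,7 @@ public static void createArtificialBamFile( String filename, int numberOfChromos * @param chromosomeSize how large each chromosome is * @param readsPerChomosome how many reads to make in each chromosome.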
They'll be aligned from position 1 to x (which is the number of reads) */ - public static void createArtificialSamFile( String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome ) { + public static void createArtificialSamFile(String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome) { SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize); File outFile = new File(filename); @@ -72,16 +72,15 @@ public static void createArtificialSamFile( String filename, int numberOfChromos * @param numberOfChromosomes the number of chromosomes to create * @param startingChromosome the starting number for the chromosome (most likely set to 1) * @param chromosomeSize the length of each chromosome - * * @return */ - public static SAMFileHeader createArtificialSamHeader( int numberOfChromosomes, int startingChromosome, int chromosomeSize ) { + public static SAMFileHeader createArtificialSamHeader(int numberOfChromosomes, int startingChromosome, int chromosomeSize) { SAMFileHeader header = new SAMFileHeader(); header.setSortOrder(net.sf.samtools.SAMFileHeader.SortOrder.coordinate); SAMSequenceDictionary dict = new SAMSequenceDictionary(); // make up some sequence records for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) { - SAMSequenceRecord rec = new SAMSequenceRecord("chr" + ( x ), chromosomeSize /* size */); + SAMSequenceRecord rec = new SAMSequenceRecord("chr" + (x), chromosomeSize /* size */); rec.setSequenceLength(chromosomeSize); dict.addSequence(rec); } @@ -95,10 +94,9 @@ public static SAMFileHeader createArtificialSamHeader( int numberOfChromosomes, * @param header the header to set * @param readGroupID the read group ID tag * @param sampleName the sample name - * * @return the adjusted SAMFileHeader */ - public static SAMFileHeader createDefaultReadGroup( SAMFileHeader header, String readGroupID, String sampleName ) { + public static SAMFileHeader createDefaultReadGroup(SAMFileHeader header, String readGroupID, String sampleName) { SAMReadGroupRecord rec = new SAMReadGroupRecord(readGroupID); rec.setSample(sampleName); List readGroups = new ArrayList(); @@ -113,10 +111,9 @@ public static SAMFileHeader createDefaultReadGroup( SAMFileHeader header, String * @param header the header to set * @param readGroupIDs the read group ID tags * @param sampleNames the sample names - * * @return the adjusted SAMFileHeader */ - public static SAMFileHeader createEnumeratedReadGroups( SAMFileHeader header, List readGroupIDs, List sampleNames ) { + public static SAMFileHeader createEnumeratedReadGroups(SAMFileHeader header, List readGroupIDs, List sampleNames) { if (readGroupIDs.size() != sampleNames.size()) { throw new ReviewedStingException("read group count and sample name count must be the same"); } @@ -137,18 +134,16 @@ public static SAMFileHeader createEnumeratedReadGroups( SAMFileHeader header, Li /** * Create an artificial read based on the parameters. The cigar string will be *M, where * is the length of the read * - * * @param header the SAM header to associate the read with * @param name the name of the read * @param refIndex the reference index, i.e. 
what chromosome to associate it with * @param alignmentStart where to start the alignment * @param length the length of the read - * * @return the artificial read */ public static GATKSAMRecord createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, int length) { - if( (refIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart != SAMRecord.NO_ALIGNMENT_START) || - (refIndex != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart == SAMRecord.NO_ALIGNMENT_START) ) + if ((refIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart != SAMRecord.NO_ALIGNMENT_START) || + (refIndex != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart == SAMRecord.NO_ALIGNMENT_START)) throw new ReviewedStingException("Invalid alignment start for artificial read, start = " + alignmentStart); GATKSAMRecord record = new GATKSAMRecord(header); record.setReadName(name); @@ -183,10 +178,9 @@ public static GATKSAMRecord createArtificialRead(SAMFileHeader header, String na * @param alignmentStart where to start the alignment * @param bases the sequence of the read * @param qual the qualities of the read - * * @return the artificial read */ - public static GATKSAMRecord createArtificialRead( SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual ) { + public static GATKSAMRecord createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual) { if (bases.length != qual.length) { throw new ReviewedStingException("Passed in read string is different length then the quality array"); } @@ -210,10 +204,9 @@ public static GATKSAMRecord createArtificialRead( SAMFileHeader header, String n * @param bases the sequence of the read * @param qual the qualities of the read * @param cigar the cigar string of the read - * * @return the artificial read */ - public static GATKSAMRecord createArtificialRead( SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual, String cigar ) { + public static GATKSAMRecord createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual, String cigar) { GATKSAMRecord rec = createArtificialRead(header, name, refIndex, alignmentStart, bases, qual); rec.setCigarString(cigar); return rec; @@ -221,24 +214,23 @@ public static GATKSAMRecord createArtificialRead( SAMFileHeader header, String n /** * Create an artificial read with the following default parameters : - * header: - * numberOfChromosomes = 1 - * startingChromosome = 1 - * chromosomeSize = 1000000 - * read: - * name = "default_read" - * refIndex = 0 - * alignmentStart = 1 - * - * @param bases the sequence of the read - * @param qual the qualities of the read - * @param cigar the cigar string of the read + * header: + * numberOfChromosomes = 1 + * startingChromosome = 1 + * chromosomeSize = 1000000 + * read: + * name = "default_read" + * refIndex = 0 + * alignmentStart = 1 * + * @param bases the sequence of the read + * @param qual the qualities of the read + * @param cigar the cigar string of the read * @return the artificial read */ - public static GATKSAMRecord createArtificialRead( byte[] bases, byte[] qual, String cigar ) { + public static GATKSAMRecord createArtificialRead(byte[] bases, byte[] qual, String cigar) { SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 1, bases, qual, cigar); + 
return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, qual, cigar); } @@ -253,7 +245,7 @@ public final static List createPair(SAMFileHeader header, String right.setProperPairFlag(true); left.setFirstOfPairFlag(leftIsFirst); - right.setFirstOfPairFlag(! leftIsFirst); + right.setFirstOfPairFlag(!leftIsFirst); left.setReadNegativeStrandFlag(leftIsNegative); left.setMateNegativeStrandFlag(!leftIsNegative); @@ -279,11 +271,10 @@ public final static List createPair(SAMFileHeader header, String * @param startingChr the chromosome (reference ID) to start from * @param endingChr the id to end with * @param readCount the number of reads per chromosome - * * @return StingSAMIterator representing the specified amount of fake data */ - public static StingSAMIterator mappedReadIterator( int startingChr, int endingChr, int readCount ) { - SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); + public static StingSAMIterator mappedReadIterator(int startingChr, int endingChr, int readCount) { + SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, 0, header); } @@ -295,11 +286,10 @@ public static StingSAMIterator mappedReadIterator( int startingChr, int endingCh * @param endingChr the id to end with * @param readCount the number of reads per chromosome * @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a sorted bam file - * * @return StingSAMIterator representing the specified amount of fake data */ - public static StingSAMIterator mappedAndUnmappedReadIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount ) { - SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); + public static StingSAMIterator mappedAndUnmappedReadIterator(int startingChr, int endingChr, int readCount, int unmappedReadCount) { + SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, unmappedReadCount, header); } @@ -310,11 +300,10 @@ public static StingSAMIterator mappedAndUnmappedReadIterator( int startingChr, i * @param startingChr the chromosome (reference ID) to start from * @param endingChr the id to end with * @param readCount the number of reads per chromosome - * * @return StingSAMIterator representing the specified amount of fake data */ - public static ArtificialSAMQueryIterator queryReadIterator( int startingChr, int endingChr, int readCount ) { - SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); + public static ArtificialSAMQueryIterator queryReadIterator(int startingChr, int endingChr, int readCount) { + SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, 0, header); }
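The default-parameter factory in the hunk above (note the alignment start moving from 1 to 10000) is easiest to see in a tiny sketch, illustration only:

    // Header: 1 chromosome of 1,000,000 bases; read named "default_read" on refIndex 0
    byte[] bases = "ACGTACGT".getBytes();
    byte[] quals = new byte[bases.length];
    java.util.Arrays.fill(quals, (byte) 20);
    GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, "8M");
    // read.getAlignmentStart() == 10000 with this patch (it used to be 1)

@@ -326,11 +315,10 @@ public static ArtificialSAMQueryIterator queryReadIterator( int startingChr, int * @param endingChr the id to end with * @param readCount the number of reads per chromosome * @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a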
sorted bam file - * * @return StingSAMIterator representing the specified amount of fake data */ - public static StingSAMIterator queryReadIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount ) { - SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); + public static StingSAMIterator queryReadIterator(int startingChr, int endingChr, int readCount, int unmappedReadCount) { + SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, unmappedReadCount, header); } @@ -345,6 +333,7 @@ private final static int ranIntInclusive(Random ran, int start, int stop) { * reads created that have readLen bases. Pairs are sampled from a gaussian distribution with mean insert * size of insertSize and variation of insertSize / 10. The first read will be in the pileup, and the second * may be, depending on where this sampled insertSize puts it. + * * @param header * @param loc * @param readLen @@ -360,22 +349,22 @@ public static ReadBackedPileup createReadBackedPileup(final SAMFileHeader header final int pos = loc.getStart(); final List pileupElements = new ArrayList(); - for ( int i = 0; i < pileupSize / 2; i++ ) { + for (int i = 0; i < pileupSize / 2; i++) { final String readName = "read" + i; final int leftStart = ranIntInclusive(ran, 1, pos); - final int fragmentSize = (int)(ran.nextGaussian() * insertSizeVariation + insertSize); + final int fragmentSize = (int) (ran.nextGaussian() * insertSizeVariation + insertSize); final int rightStart = leftStart + fragmentSize - readLen; - if ( rightStart <= 0 ) continue; + if (rightStart <= 0) continue; List pair = createPair(header, readName, readLen, leftStart, rightStart, leftIsFirst, leftIsNegative); final GATKSAMRecord left = pair.get(0); final GATKSAMRecord right = pair.get(1); - pileupElements.add(new PileupElement(left, pos - leftStart)); + pileupElements.add(new PileupElement(left, pos - leftStart, false, false, false, false)); - if ( pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd() ) { - pileupElements.add(new PileupElement(right, pos - rightStart)); + if (pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd()) { + pileupElements.add(new PileupElement(right, pos - rightStart, false, false, false, false)); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 96713edc26..648dafb816 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -24,10 +24,12 @@ package org.broadinstitute.sting.utils.sam; -import com.google.java.contract.Ensures; import net.sf.samtools.*; +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalDataManager; import org.broadinstitute.sting.utils.NGSPlatform; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import java.util.Arrays; import java.util.HashMap; import java.util.Map; @@ -44,7 +46,14 @@ * */ public class GATKSAMRecord extends BAMRecord { - public static final String REDUCED_READ_CONSENSUS_TAG = "RR"; + // ReduceReads specific attribute tags + public static final String REDUCED_READ_CONSENSUS_TAG = "RR"; // marks a synthetic read produced by the ReduceReads tool + public static final String 
REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT = "OP"; // reads that are clipped may use this attribute to keep track of their original alignment start + public static final String REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT = "OE"; // reads that are clipped may use this attribute to keep track of their original alignment end + + // Base Quality Score Recalibrator specific attribute tags + public static final String BQSR_BASE_INSERTION_QUALITIES = "BI"; + public static final String BQSR_BASE_DELETION_QUALITIES = "BD"; // the SAMRecord data we're caching private String mReadString = null; @@ -153,6 +162,64 @@ public boolean equals(Object o) { return super.equals(o); } + /** + * Setters and Accessors for base insertion and base deletion quality scores + */ + public void setBaseQualities( final byte[] quals, final RecalDataManager.BaseRecalibrationType errorModel ) { + switch( errorModel ) { + case BASE_SUBSTITUTION: + setBaseQualities(quals); + break; + case BASE_INSERTION: + setAttribute( GATKSAMRecord.BQSR_BASE_INSERTION_QUALITIES, SAMUtils.phredToFastq(quals) ); + break; + case BASE_DELETION: + setAttribute( GATKSAMRecord.BQSR_BASE_DELETION_QUALITIES, SAMUtils.phredToFastq(quals) ); + break; + default: + throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel ); + } + } + + public byte[] getBaseQualities( final RecalDataManager.BaseRecalibrationType errorModel ) { + switch( errorModel ) { + case BASE_SUBSTITUTION: + return getBaseQualities(); + case BASE_INSERTION: + return getBaseInsertionQualities(); + case BASE_DELETION: + return getBaseDeletionQualities(); + default: + throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel ); + } + } + + public boolean hasBaseIndelQualities() { + return getAttribute( BQSR_BASE_INSERTION_QUALITIES ) != null || getAttribute( BQSR_BASE_DELETION_QUALITIES ) != null; + } + + public byte[] getBaseInsertionQualities() { + byte[] quals = SAMUtils.fastqToPhred( getStringAttribute( BQSR_BASE_INSERTION_QUALITIES ) ); + if( quals == null ) { + quals = new byte[getBaseQualities().length]; + Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will + // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45 + setBaseQualities(quals, RecalDataManager.BaseRecalibrationType.BASE_INSERTION); + } + return quals; + } + + public byte[] getBaseDeletionQualities() { + byte[] quals = SAMUtils.fastqToPhred( getStringAttribute( BQSR_BASE_DELETION_QUALITIES ) ); + if( quals == null ) { + quals = new byte[getBaseQualities().length]; + Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will + // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45 + setBaseQualities(quals, RecalDataManager.BaseRecalibrationType.BASE_DELETION); + } + return quals; + } + /** * Efficient caching accessor that returns the GATK NGSPlatform of this read * @return */ @@ -184,18 +251,22 @@ public boolean isReducedRead() { return getReducedReadCounts() != null; }
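Because the BI/BD fallback above is surprising (a missing tag silently materializes flat Q45 qualities and writes the tag back onto the read), a small usage sketch may help. Illustration only, using the artificial-read helper from this patch:

    GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(
            "ACGT".getBytes(), new byte[]{30, 30, 30, 30}, "4M");
    byte[] insQuals = read.getBaseInsertionQualities(); // no BI tag yet -> flat {45, 45, 45, 45}
    boolean tagged = read.hasBaseIndelQualities();      // true: the getter set BI as a side effect
    byte[] delQuals = read.getBaseQualities(RecalDataManager.BaseRecalibrationType.BASE_DELETION);

+ /** + * The number of bases corresponding to the i'th base of the reduced read.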
+ * + * @param i the read based coordinate inside the read + * @return the number of bases corresponding to the i'th base of the reduced read + */ public final byte getReducedCount(final int i) { byte firstCount = getReducedReadCounts()[0]; byte offsetCount = getReducedReadCounts()[i]; return (i==0) ? firstCount : (byte) Math.min(firstCount + offsetCount, Byte.MAX_VALUE); } - /////////////////////////////////////////////////////////////////////////////// // *** GATKSAMRecord specific methods ***// /////////////////////////////////////////////////////////////////////////////// - /** * Checks whether an attribute has been set for the given key. * @@ -277,7 +348,6 @@ public void simplify () { * * @return the unclipped start of the read taking soft clips (but not hard clips) into account */ - @Ensures({"result >= getUnclippedStart()", "result <= getUnclippedEnd() || ReadUtils.readIsEntirelyInsertion(this)"}) public int getSoftStart() { int start = this.getUnclippedStart(); for (CigarElement cigarElement : this.getCigar().getCigarElements()) { @@ -286,17 +356,17 @@ public int getSoftStart() { else break; } + return start; } /** * Calculates the reference coordinate for the end of the read taking into account soft clips but not hard clips. * - * Note: getUnclippedStart() adds soft and hard clips, this function only adds soft clips. + * Note: getUnclippedEnd() adds soft and hard clips, this function only adds soft clips. * * @return the unclipped end of the read taking soft clips (but not hard clips) into account */ - @Ensures({"result >= getUnclippedStart()", "result <= getUnclippedEnd() || ReadUtils.readIsEntirelyInsertion(this)"}) public int getSoftEnd() { int stop = this.getUnclippedStart(); @@ -313,9 +383,40 @@ public int getSoftEnd() { else shift = 0; } + return (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ; } + /** + * Determines the original alignment start of a previously clipped read. + * + * This is useful for reads that have been trimmed to a variant region and lost the information of its original alignment start + * + * @return the alignment start of a read before it was clipped + */ + public int getOriginalAlignmentStart() { + int originalAlignmentStart = getUnclippedStart(); + Integer alignmentShift = (Integer) getAttribute(REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT); + if (alignmentShift != null) + originalAlignmentStart += alignmentShift; + return originalAlignmentStart; + } + + /** + * Determines the original alignment end of a previously clipped read. + * + * This is useful for reads that have been trimmed to a variant region and lost the information of its original alignment end + * + * @return the alignment end of a read before it was clipped + */ + public int getOriginalAlignmentEnd() { + int originalAlignmentEnd = getUnclippedEnd(); + Integer alignmentShift = (Integer) getAttribute(REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT); + if (alignmentShift != null) + originalAlignmentEnd -= alignmentShift; + return originalAlignmentEnd; + } + /** * Creates an empty GATKSAMRecord with the read's header, read group and mate * information, but empty (not-null) fields: @@ -358,4 +459,21 @@ public static GATKSAMRecord emptyRead(GATKSAMRecord read) { return emptyRead; }
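A quick sketch of the soft/unclipped coordinate bookkeeping above (illustration only, invented coordinates; the artificial-read helper comes from this same patch):

    // 2 hard-clipped and 3 soft-clipped bases in front of a 7M block starting at 10000
    GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(
            "AAACGTACGT".getBytes(), new byte[10], "2H3S7M");
    // read.getUnclippedStart() == 9995: backs out soft AND hard clips
    // read.getSoftStart()      == 9997: hard clips are added back, soft clips are not

+ /** + * Shallow copy of everything, except for the attribute list and the temporary attributes. + * A new list of the attributes is created for both, but the attributes themselves are copied by reference.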
+ * This should be safe because callers should never modify a mutable value returned by any of the get() methods anyway. + * + * @return a shallow copy of the GATKSAMRecord + * @throws CloneNotSupportedException + */ + @Override + public Object clone() throws CloneNotSupportedException { + final GATKSAMRecord clone = (GATKSAMRecord) super.clone(); + if (temporaryAttributes != null) { + for (Object attribute : temporaryAttributes.keySet()) + clone.setTemporaryAttribute(attribute, temporaryAttributes.get(attribute)); + } + return clone; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index f2e54713f3..91389f0bff 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -29,6 +29,7 @@ import com.google.java.contract.Requires; import net.sf.samtools.*; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -58,7 +59,7 @@ public enum ClippingTail { /** * A HashMap of the SAM spec read flag names - *

+ * * Note: This is not being used right now, but can be useful in the future */ private static final Map readFlagNames = new HashMap(); @@ -79,49 +80,47 @@ public enum ClippingTail { /** * This enum represents all the different ways in which a read can overlap an interval. - *

+ * * NO_OVERLAP_CONTIG: * read and interval are in different contigs. - *

+ * * NO_OVERLAP_LEFT: * the read does not overlap the interval. - *

- * |----------------| (interval) - * <----------------> (read) - *

+ * + * |----------------| (interval) + * <----------------> (read) + * * NO_OVERLAP_RIGHT: * the read does not overlap the interval. - *

- * |----------------| (interval) - * <----------------> (read) - *

+ * + * |----------------| (interval) + * <----------------> (read) + * * OVERLAP_LEFT: * the read starts before the beginning of the interval but ends inside of it - *

- * |----------------| (interval) - * <----------------> (read) - *

+ * + * |----------------| (interval) + * <----------------> (read) + * * OVERLAP_RIGHT: * the read starts inside the interval but ends outside of it - *

- * |----------------| (interval) - * <----------------> (read) - *

+ * + * |----------------| (interval) + * <----------------> (read) + * * OVERLAP_LEFT_AND_RIGHT: * the read starts before the interval and ends after the interval - *

- * |-----------| (interval) - * <-------------------> (read) - *

+ * + * |-----------| (interval) + * <-------------------> (read) + * * OVERLAP_CONTAINED: * the read starts and ends inside the interval - *

- * |----------------| (interval) - * <--------> (read) + * + * |----------------| (interval) + * <--------> (read) */ - public enum ReadAndIntervalOverlap { - NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, NO_OVERLAP_HARDCLIPPED_LEFT, NO_OVERLAP_HARDCLIPPED_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED - } + public enum ReadAndIntervalOverlap {NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, NO_OVERLAP_HARDCLIPPED_LEFT, NO_OVERLAP_HARDCLIPPED_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED} /** * Creates a SAMFileWriter with the given compression level if you request a bam file. Creates a regular @@ -141,15 +140,15 @@ public static SAMFileWriter createSAMFileWriterWithCompression(SAMFileHeader hea /** * is this base inside the adaptor of the read? - *

+ * * There are two cases to treat here: - *

+ * * 1) Read is in the negative strand => Adaptor boundary is on the left tail * 2) Read is in the positive strand => Adaptor boundary is on the right tail - *

+ * Note: We return false to all reads that are UNMAPPED or have a weird big insert size (probably due to mismapping or bigger event) * - * @param read the read to test + * @param read the read to test * @param basePos base position in REFERENCE coordinates (not read coordinates) * @return whether or not the base is in the adaptor */ @@ -166,38 +165,42 @@ public static boolean isBaseInsideAdaptor(final GATKSAMRecord read, long basePos * the read boundary. If the read is in the positive strand, this is the first base after the end of the * fragment (Picard calls it 'insert'), if the read is in the negative strand, this is the first base before the * beginning of the fragment. - *

+ * * There are two cases we need to treat here: - *

+ * * 1) Our read is in the reverse strand : - *

- * <----------------------| * - * |---------------------> - *

- * in these cases, the adaptor boundary is at the mate start (minus one) - *

+ * + * <----------------------| * + * |---------------------> + * + * in these cases, the adaptor boundary is at the mate start (minus one) + * * 2) Our read is in the forward strand : - *

- * |----------------------> * - * <----------------------| - *

- * in these cases the adaptor boundary is at the start of the read plus the inferred insert size (plus one) + * + * |----------------------> * + * <----------------------| + * + * in these cases the adaptor boundary is at the start of the read plus the inferred insert size (plus one) * * @param read the read being tested for the adaptor boundary * @return the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read. NULL if the read is unmapped or the mate is mapped to another contig. */ public static Integer getAdaptorBoundary(final SAMRecord read) { + final int MAXIMUM_ADAPTOR_LENGTH = 8; final int insertSize = Math.abs(read.getInferredInsertSize()); // the inferred insert size can be negative if the mate is mapped before the read (so we take the absolute value) - if (insertSize == 0 || read.getReadUnmappedFlag()) // no adaptors in reads with mates in another - return null; // chromosome or unmapped pairs - - int adaptorBoundary; // the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read) + if (insertSize == 0 || read.getReadUnmappedFlag()) // no adaptors in reads with mates in another chromosome or unmapped pairs + return null; + + Integer adaptorBoundary; // the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read) if (read.getReadNegativeStrandFlag()) adaptorBoundary = read.getMateAlignmentStart() - 1; // case 1 (see header) else adaptorBoundary = read.getAlignmentStart() + insertSize + 1; // case 2 (see header) + if ( (adaptorBoundary < read.getAlignmentStart() - MAXIMUM_ADAPTOR_LENGTH) || (adaptorBoundary > read.getAlignmentEnd() + MAXIMUM_ADAPTOR_LENGTH) ) + adaptorBoundary = null; // we are being conservative by not allowing the adaptor boundary to go beyond what we belive is the maximum size of an adaptor + return adaptorBoundary; } @@ -264,7 +267,7 @@ public final static List sortReadsByCoordinate(List + * * Warning: If the read has Hard or Soft clips before the insertion this function will return 0. * * @param read @@ -272,7 +275,7 @@ public final static List sortReadsByCoordinate(List + * * Warning: If the read has Hard or Soft clips after the insertion this function will return 0. * * @param read @@ -288,7 +291,7 @@ public final static int getFirstInsertionOffset(SAMRecord read) { */ public final static int getLastInsertionOffset(SAMRecord read) { CigarElement e = read.getCigar().getCigarElement(read.getCigarLength() - 1); - if (e.getOperator() == CigarOperator.I) + if ( e.getOperator() == CigarOperator.I ) return e.getLength(); else return 0; @@ -297,8 +300,7 @@ public final static int getLastInsertionOffset(SAMRecord read) { /** * Determines what is the position of the read in relation to the interval. * Note: This function uses the UNCLIPPED ENDS of the reads for the comparison. 
- * - * @param read the read + * @param read the read * @param interval the interval * @return the overlap type as described by ReadAndIntervalOverlap enum (see above) */ @@ -309,30 +311,30 @@ public static ReadAndIntervalOverlap getReadAndIntervalOverlapType(GATKSAMRecord int uStart = read.getUnclippedStart(); int uStop = read.getUnclippedEnd(); - if (!read.getReferenceName().equals(interval.getContig())) + if ( !read.getReferenceName().equals(interval.getContig()) ) return ReadAndIntervalOverlap.NO_OVERLAP_CONTIG; - else if (uStop < interval.getStart()) + else if ( uStop < interval.getStart() ) return ReadAndIntervalOverlap.NO_OVERLAP_LEFT; - else if (uStart > interval.getStop()) + else if ( uStart > interval.getStop() ) return ReadAndIntervalOverlap.NO_OVERLAP_RIGHT; - else if (sStop < interval.getStart()) + else if ( sStop < interval.getStart() ) return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_LEFT; - else if (sStart > interval.getStop()) + else if ( sStart > interval.getStop() ) return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_RIGHT; - else if ((sStart >= interval.getStart()) && - (sStop <= interval.getStop())) + else if ( (sStart >= interval.getStart()) && + (sStop <= interval.getStop()) ) return ReadAndIntervalOverlap.OVERLAP_CONTAINED; - else if ((sStart < interval.getStart()) && - (sStop > interval.getStop())) + else if ( (sStart < interval.getStart()) && + (sStop > interval.getStop()) ) return ReadAndIntervalOverlap.OVERLAP_LEFT_AND_RIGHT; - else if ((sStart < interval.getStart())) + else if ( (sStart < interval.getStart()) ) return ReadAndIntervalOverlap.OVERLAP_LEFT; else @@ -340,36 +342,52 @@ else if ((sStart < interval.getStart())) } /** - * Pre-processes the results of getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int) in case it falls in - * a deletion following the typical clipping needs. If clipping the left tail (beginning of the read) returns - * the base prior to the deletion. If clipping the right tail (end of the read) returns the base after the - * deletion. + * Pre-processes the results of getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int) to take care of + * two corner cases: + * + * 1. If we are clipping the right tail (end of the read) and getReadCoordinateForReferenceCoordinate falls inside + * a deletion, return the base after the deletion. If clipping the left tail (beginning of the read) it + * doesn't matter, because the function already returns the previous base by default. + * + * 2. If we are clipping the left tail (beginning of the read), the read starts with an insertion and we are + * requesting the first read-based coordinate, skip the leading insertion (because it has the same reference + * coordinate as the following base). * * @param read * @param refCoord * @param tail * @return the read coordinate corresponding to the requested reference coordinate for clipping. */ - @Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd()"}) + @Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd() || (read.getUnclippedEnd() < read.getUnclippedStart())"}) @Ensures({"result >= 0", "result < read.getReadLength()"}) public static int getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord, ClippingTail tail) { Pair result = getReadCoordinateForReferenceCoordinate(read, refCoord); int readCoord = result.getFirst(); + // Corner case one: if clipping the right tail falls on a deletion, move to the next + // read coordinate.
It is not a problem for the left tail because the default answer + // from getReadCoordinateForReferenceCoordinate is to give the previous read coordinate. if (result.getSecond() && tail == ClippingTail.RIGHT_TAIL) readCoord++; + // clipping the left tail and first base is insertion, go to the next read coordinate + // with the same reference coordinate. Advance to the next cigar element, or to the + // end of the read if there is no next element. + Pair firstElementIsInsertion = readStartsWithInsertion(read); + if (readCoord == 0 && tail == ClippingTail.LEFT_TAIL && firstElementIsInsertion.getFirst()) + readCoord = Math.min(firstElementIsInsertion.getSecond().getLength(), read.getReadLength() - 1); + return readCoord; } /** * Returns the read coordinate corresponding to the requested reference coordinate. - *

+ * * WARNING: if the requested reference coordinate happens to fall inside a deletion in the read, this function * will return the last read base before the deletion. This function returns a * Pair(int readCoord, boolean fallsInsideDeletion) so you can choose which readCoordinate to use when faced with * a deletion. - *

+ * SUGGESTION: Use getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int, ClippingTail) instead to get a * pre-processed result according to normal clipping needs. Or you can use this function and tailor the * behavior to your needs. @@ -421,7 +439,7 @@ public static Pair getReadCoordinateForReferenceCoordinate(GAT if (endsWithinCigar) fallsInsideDeletion = cigarElement.getOperator() == CigarOperator.DELETION; - // if we end outside the current cigar element, we need to check if the next element is an insertion or deletion. + // if we end outside the current cigar element, we need to check if the next element is an insertion or deletion. else { nextCigarElement = cigarElementIterator.next(); @@ -442,13 +460,13 @@ public static Pair getReadCoordinateForReferenceCoordinate(GAT if (!fallsInsideDeletion && cigarElement.getOperator().consumesReadBases()) readBases += shift; - // If we reached our goal inside a deletion, but the deletion is the next cigar element then we need - // to add the shift of the current cigar element but go back to it's last element to return the last - // base before the deletion (see warning in function contracts) + // If we reached our goal inside a deletion, but the deletion is the next cigar element then we need + // to add the shift of the current cigar element but go back to its last element to return the last + // base before the deletion (see warning in function contracts) else if (fallsInsideDeletion && !endsWithinCigar) readBases += shift - 1; - // If we reached our goal inside a deletion then we must backtrack to the last base before the deletion + // If we reached our goal inside a deletion then we must backtrack to the last base before the deletion else if (fallsInsideDeletion && endsWithinCigar) readBases--; } @@ -457,7 +475,6 @@ else if (fallsInsideDeletion && endsWithinCigar) if (!goalReached) throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?"); - return new Pair(readBases, fallsInsideDeletion); }
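A sketch of the two clipping-tail behaviors documented above (illustration only, invented coordinates; the artificial read aligns at 10000 with this patch's defaults):

    // 4M2D4M: read offsets 0-3 cover ref 10000-10003, the deletion covers
    // 10004-10005, and offsets 4-7 cover ref 10006-10009
    GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(
            "ACGTACGT".getBytes(), new byte[8], "4M2D4M");
    int leftCoord  = ReadUtils.getReadCoordinateForReferenceCoordinate(
            read, 10005, ReadUtils.ClippingTail.LEFT_TAIL);   // 3: last base before the gap
    int rightCoord = ReadUtils.getReadCoordinateForReferenceCoordinate(
            read, 10005, ReadUtils.ClippingTail.RIGHT_TAIL);  // 4: first base after the gap

@@ -465,12 +482,11 @@ else if (fallsInsideDeletion && endsWithinCigar) * Compares two SAMRecords only the basis on alignment start. Note that * comparisons are performed ONLY on the basis of alignment start; any * two SAM records with the same alignment start will be considered equal. - *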
<p/>
+ * * Unmapped alignments will all be considered equal. */ @Requires({"read1 != null", "read2 != null"}) - @Ensures("result == 0 || result == 1 || result == -1") public static int compareSAMRecords(GATKSAMRecord read1, GATKSAMRecord read2) { AlignmentStartComparator comp = new AlignmentStartComparator(); return comp.compare(read1, read2); } @@ -479,7 +495,7 @@ public static int compareSAMRecords(GATKSAMRecord read1, GATKSAMRecord read2) { /** * Is a base inside a read? * - * @param read the read to evaluate + * @param read the read to evaluate * @param referenceCoordinate the reference coordinate of the base to test * @return true if it is inside the read, false otherwise. */ @@ -502,4 +518,142 @@ public static boolean readIsEntirelyInsertion(GATKSAMRecord read) { } + /** + * Checks if a read starts with an insertion, looking beyond hard and soft clips + * if there are any. + * + * @param read the read to check + * @return a pair with the answer (true/false) and the insertion element (or null if it doesn't exist) + */ + public static Pair<Boolean, CigarElement> readStartsWithInsertion(GATKSAMRecord read) { + for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + if (cigarElement.getOperator() == CigarOperator.INSERTION) + return new Pair<Boolean, CigarElement>(true, cigarElement); + + else if (cigarElement.getOperator() != CigarOperator.HARD_CLIP && cigarElement.getOperator() != CigarOperator.SOFT_CLIP) + break; + } + return new Pair<Boolean, CigarElement>(false, null); + } + + /** + * Returns the coverage distribution of a list of reads within the desired region. + * + * See getCoverageDistributionOfRead for information on how the coverage is calculated. + * + * @param list the list of reads covering the region + * @param startLocation the first reference coordinate of the region (inclusive) + * @param stopLocation the last reference coordinate of the region (inclusive) + * @return an array with the coverage of each position from startLocation to stopLocation + */ + public static int [] getCoverageDistributionOfReads(List<GATKSAMRecord> list, int startLocation, int stopLocation) { + int [] totalCoverage = new int[stopLocation - startLocation + 1]; + + for (GATKSAMRecord read : list) { + int [] readCoverage = getCoverageDistributionOfRead(read, startLocation, stopLocation); + totalCoverage = MathUtils.addArrays(totalCoverage, readCoverage); + } + + return totalCoverage; + }
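Because the per-read calculation defined just below counts deletion-spanning (and soft-clipped) positions as coverage while insertions contribute nothing, a small worked example may help. This sketch is the editor's illustration only (class name and region are assumptions, and it assumes the statics live in ReadUtils); the read shape matches testIndelsInRegularPileup later in this patch:

import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;

public class CoverageDistributionSketch {
    public static void main(String[] args) {
        SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
        // a 12-base read with CIGAR 4M2I6M starting at reference position 2
        GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "during", 0, 2, 12);
        read.setCigarString("4M2I6M");
        // 4M + 6M consume reference positions 2..11; the 2I consumes none. Over the region 1..12
        // the distribution is therefore {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}.
        int[] coverage = ReadUtils.getCoverageDistributionOfRead(read, 1, 12);
        System.out.println(java.util.Arrays.toString(coverage));
    }
}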
+ /** + * Returns the coverage distribution of a single read within the desired region. + * + * Note: This function counts DELETIONS as coverage (since the main purpose is to downsample + * reads for variant regions, and deletions count as variants) + * + * @param read the read to get the coverage distribution of + * @param startLocation the first reference coordinate of the region (inclusive) + * @param stopLocation the last reference coordinate of the region (inclusive) + * @return an array with the coverage of each position from startLocation to stopLocation + */ + public static int [] getCoverageDistributionOfRead(GATKSAMRecord read, int startLocation, int stopLocation) { + int [] coverage = new int[stopLocation - startLocation + 1]; + int refLocation = read.getSoftStart(); + for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + switch (cigarElement.getOperator()) { + case S: + case M: + case EQ: + case N: + case X: + case D: + for (int i = 0; i < cigarElement.getLength(); i++) { + if (refLocation >= startLocation && refLocation <= stopLocation) { + int baseCount = read.isReducedRead() ? read.getReducedCount(refLocation - read.getSoftStart()) : 1; + coverage[refLocation - startLocation] += baseCount; // this may be a reduced read, so add the proper number of bases + } + refLocation++; + } + break; + + case P: + case I: + case H: + break; + } + + if (refLocation > stopLocation) + break; + } + return coverage; + } + + /** + * Makes association maps for the reads and loci coverage, as described below: + * + * - First: locusToReadMap -- a HashMap that describes, for each locus, which reads contribute to its coverage. + * Note: Locus is in reference coordinates. + * Example: Locus => {read1, read2, ..., readN} + * + * - Second: readToLocusMap -- a HashMap that describes, for each read, which loci it contributes coverage to. + * Note: Locus is a boolean array, indexed from 0 (= startLocation) to N (= stopLocation), with value==true meaning it contributes to the coverage. + * Example: Read => {true, true, false, ... false} + * + * @param readList the list of reads to generate the association mappings + * @param startLocation the first reference coordinate of the region (inclusive) + * @param stopLocation the last reference coordinate of the region (inclusive) + * @return the two hashmaps described above + */ + public static Pair<HashMap<Integer, HashSet<GATKSAMRecord>>, HashMap<GATKSAMRecord, Boolean[]>> getBothReadToLociMappings (List<GATKSAMRecord> readList, int startLocation, int stopLocation) { + int arraySize = stopLocation - startLocation + 1; + + HashMap<Integer, HashSet<GATKSAMRecord>> locusToReadMap = new HashMap<Integer, HashSet<GATKSAMRecord>>(2*(stopLocation - startLocation + 1), 0.5f); + HashMap<GATKSAMRecord, Boolean[]> readToLocusMap = new HashMap<GATKSAMRecord, Boolean[]>(2*readList.size(), 0.5f); + + for (int i = startLocation; i <= stopLocation; i++) + locusToReadMap.put(i, new HashSet<GATKSAMRecord>()); // Initialize the locusToRead map with empty lists + + for (GATKSAMRecord read : readList) { + readToLocusMap.put(read, new Boolean[arraySize]); // Initialize the readToLocus map with empty arrays + + int [] readCoverage = getCoverageDistributionOfRead(read, startLocation, stopLocation); + + for (int i = 0; i < arraySize; i++) { + int refLocation = i + startLocation; + if (readCoverage[i] > 0) { + // Update the hash for this locus + HashSet<GATKSAMRecord> readSet = locusToReadMap.get(refLocation); + readSet.add(read); + + // Add this locus to the read hash + readToLocusMap.get(read)[refLocation - startLocation] = true; + } + else + // Update the boolean array with a 'no coverage' from this read to this locus + readToLocusMap.get(read)[refLocation-startLocation] = false; + } + } + return new Pair<HashMap<Integer, HashSet<GATKSAMRecord>>, HashMap<GATKSAMRecord, Boolean[]>>(locusToReadMap, readToLocusMap); + } + + public static String prettyPrintSequenceRecords ( SAMSequenceDictionary sequenceDictionary ) { + String[] sequenceRecordNames = new String[sequenceDictionary.size()]; + int sequenceRecordIndex = 0; + for (SAMSequenceRecord sequenceRecord : sequenceDictionary.getSequences()) + sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName(); + return Arrays.deepToString(sequenceRecordNames); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java index c3f437f11b..52b4109fef 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java @@ -212,7 +212,13 @@ public static boolean wouldBeNoCallAllele(byte[] bases) { * @return true if the bases represent a symbolic allele */ public static boolean wouldBeSymbolicAllele(byte[] bases) { - return bases.length > 2 && bases[0] == '<' && bases[bases.length-1] == '>'; + if ( bases.length <= 2 ) + return false; + else { + final String strBases = new
String(bases); + return (bases[0] == '<' && bases[bases.length-1] == '>') || + (strBases.contains("[") || strBases.contains("]")); + } } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java index 1691129c94..747d83e6f9 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java @@ -29,7 +29,9 @@ public Genotype(String sampleName, List alleles, double log10PError, Set } public Genotype(String sampleName, List alleles, double log10PError, Set filters, Map attributes, boolean isPhased, double[] log10Likelihoods) { - if ( alleles != null ) + if ( alleles == null || alleles.isEmpty() ) + this.alleles = Collections.emptyList(); + else this.alleles = Collections.unmodifiableList(alleles); commonInfo = new CommonInfo(sampleName, log10PError, filters, attributes); if ( log10Likelihoods != null ) @@ -89,9 +91,6 @@ public List getAlleles() { } public List getAlleles(Allele allele) { - if ( getType() == Type.UNAVAILABLE ) - throw new ReviewedStingException("Requesting alleles for an UNAVAILABLE genotype"); - List al = new ArrayList(); for ( Allele a : alleles ) if ( a.equals(allele) ) @@ -112,7 +111,7 @@ public Allele getAllele(int i) { * @return the ploidy of this genotype */ public int getPloidy() { - if ( alleles == null ) + if ( alleles.size() == 0 ) throw new ReviewedStingException("Requesting ploidy for an UNAVAILABLE genotype"); return alleles.size(); } @@ -134,7 +133,7 @@ public Type getType() { } protected Type determineType() { - if ( alleles == null ) + if ( alleles.size() == 0 ) return Type.UNAVAILABLE; boolean sawNoCall = false, sawMultipleAlleles = false; @@ -234,8 +233,7 @@ private GenotypeLikelihoods getLikelihoods(String key, boolean asPL) { } public void validate() { - if ( alleles == null ) return; - if ( alleles.size() == 0) throw new IllegalArgumentException("BUG: alleles cannot be of size 0"); + if ( alleles.size() == 0) return; // int nNoCalls = 0; for ( Allele allele : alleles ) { @@ -254,7 +252,7 @@ public String getGenotypeString() { } public String getGenotypeString(boolean ignoreRefState) { - if ( alleles == null ) + if ( alleles.size() == 0 ) return null; // Notes: diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 247e412ddb..f5c57ca44f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -920,6 +920,9 @@ public void extraStrictValidation(Allele reference, Byte paddedRefBase, Set=maxAC1) { + for ( Allele a : getAlternateAlleles() ) { + final int ac = getCalledChrCount(a); + if ( ac >= maxAC1 ) { maxAC1 = ac; best = a; } @@ -1238,6 +1247,9 @@ public Allele getAltAlleleWithHighestAlleleCount() { } public int[] getGLIndecesOfAllele(Allele inputAllele) { + + // TODO -- this information is cached statically by the UnifiedGenotyperEngine; pull it out into a common utils class for all to use + int[] idxVector = new int[3]; int numAlleles = this.getAlleles().size(); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index c9a4965c1c..fc50df3a59 100755 --- 
a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -29,6 +29,7 @@ import org.apache.commons.jexl2.JexlEngine; import org.apache.log4j.Logger; import org.broad.tribble.util.popgen.HardyWeinbergCalculation; +import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -64,8 +65,10 @@ public class VariantContextUtils { * @return the attributes map provided as input, returned for programming convenience */ public static Map calculateChromosomeCounts(VariantContext vc, Map attributes, boolean removeStaleValues) { + final int AN = vc.getCalledChrCount(); + // if everyone is a no-call, remove the old attributes if requested - if ( vc.getCalledChrCount() == 0 && removeStaleValues ) { + if ( AN == 0 && removeStaleValues ) { if ( attributes.containsKey(VCFConstants.ALLELE_COUNT_KEY) ) attributes.remove(VCFConstants.ALLELE_COUNT_KEY); if ( attributes.containsKey(VCFConstants.ALLELE_FREQUENCY_KEY) ) @@ -76,19 +79,22 @@ public static Map calculateChromosomeCounts(VariantContext vc, M } if ( vc.hasGenotypes() ) { - attributes.put(VCFConstants.ALLELE_NUMBER_KEY, vc.getCalledChrCount()); + attributes.put(VCFConstants.ALLELE_NUMBER_KEY, AN); // if there are alternate alleles, record the relevant tags if ( vc.getAlternateAlleles().size() > 0 ) { - ArrayList alleleFreqs = new ArrayList(); - ArrayList alleleCounts = new ArrayList(); - double totalChromosomes = (double)vc.getCalledChrCount(); + final ArrayList alleleFreqs = new ArrayList(); + final ArrayList alleleCounts = new ArrayList(); for ( Allele allele : vc.getAlternateAlleles() ) { int altChromosomes = vc.getCalledChrCount(allele); alleleCounts.add(altChromosomes); - // todo -- this is a performance problem - String freq = String.format(makePrecisionFormatStringFromDenominatorValue(totalChromosomes), ((double)altChromosomes / totalChromosomes)); - alleleFreqs.add(freq); + if ( AN == 0 ) { + alleleFreqs.add("0.0"); + } else { + // todo -- this is a performance problem + final String freq = String.format(makePrecisionFormatStringFromDenominatorValue((double)AN), ((double)altChromosomes / (double)AN)); + alleleFreqs.add(freq); + } } attributes.put(VCFConstants.ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? 
alleleCounts.get(0) : alleleCounts); @@ -112,41 +118,8 @@ public static Map<String, Object> calculateChromosomeCounts(VariantContext vc, M */ public static void calculateChromosomeCounts(VariantContextBuilder builder, boolean removeStaleValues) { final VariantContext vc = builder.make(); - - // if everyone is a no-call, remove the old attributes if requested - if ( vc.getCalledChrCount() == 0 && removeStaleValues ) { - if ( vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) - builder.rmAttribute(VCFConstants.ALLELE_COUNT_KEY); - if ( vc.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) - builder.rmAttribute(VCFConstants.ALLELE_FREQUENCY_KEY); - if ( vc.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY) ) - builder.rmAttribute(VCFConstants.ALLELE_NUMBER_KEY); - return; - } - - if ( vc.hasGenotypes() ) { - builder.attribute(VCFConstants.ALLELE_NUMBER_KEY, vc.getCalledChrCount()); - - // if there are alternate alleles, record the relevant tags - if ( vc.getAlternateAlleles().size() > 0 ) { - ArrayList<String> alleleFreqs = new ArrayList<String>(); - ArrayList<Integer> alleleCounts = new ArrayList<Integer>(); - double totalChromosomes = (double)vc.getCalledChrCount(); - for ( Allele allele : vc.getAlternateAlleles() ) { - int altChromosomes = vc.getCalledChrCount(allele); - alleleCounts.add(altChromosomes); - String freq = String.format(makePrecisionFormatStringFromDenominatorValue(totalChromosomes), ((double)altChromosomes / totalChromosomes)); - alleleFreqs.add(freq); - } - - builder.attribute(VCFConstants.ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts); - builder.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, alleleFreqs.size() == 1 ? alleleFreqs.get(0) : alleleFreqs); - } - else { - builder.attribute(VCFConstants.ALLELE_COUNT_KEY, 0); - builder.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, 0.0); - } - } + final Map<String, Object> attrs = calculateChromosomeCounts(vc, new HashMap<String, Object>(vc.getAttributes()), removeStaleValues); + builder.attributes(attrs); } private static String makePrecisionFormatStringFromDenominatorValue(double maxValue) { @@ -464,7 +437,23 @@ public enum FilteredRecordMergeType { /** * Requires all records present at site to be unfiltered. VCF files that don't contain the record don't influence this. */ - KEEP_IF_ALL_UNFILTERED + KEEP_IF_ALL_UNFILTERED, + /** + * If any record is present at this site (regardless of possibly being filtered), then all such records are kept and the filters are reset. + */ + KEEP_UNCONDITIONAL + } + + @Hidden + public enum MultipleAllelesMergeType { + /** + * Combine only alleles of the same type (SNP, indel, etc.) into a single VCF record. + */ + BY_TYPE, + /** + * Merge all allele types at the same start position into the same VCF record. + */ + MIX_TYPES }
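A hypothetical illustration of the difference between the two modes (these records are invented for this example and are not from the patch): given a SNP and an insertion sharing a start position,

20   10000   .   A   G        (SNP record)
20   10000   .   A   AT       (insertion record)

BY_TYPE keeps them in separate output records, one per allele type, while MIX_TYPES merges them into a single record whose alternate alleles are G,AT.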
/** @@ -635,7 +624,7 @@ public static VariantContext simpleMerge(final GenomeLocParser genomeLocParser, } // if at least one record was unfiltered and we want a union, clear all of the filters - if ( filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size() ) + if ( (filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size()) || filteredRecordMergeType == FilteredRecordMergeType.KEEP_UNCONDITIONAL ) filters.clear(); @@ -1056,6 +1045,14 @@ public static boolean isTransversion(VariantContext context) { return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSVERSION; } + public static boolean isTransition(Allele ref, Allele alt) { + return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSITION; + } + + public static boolean isTransversion(Allele ref, Allele alt) { + return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSVERSION; + } + /** * create a genome location, given a variant context * @param genomeLocParser parser diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index 61829dcfc2..e33f6717a0 100755 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -1,13 +1,21 @@ package org.broadinstitute.sting; -import org.apache.log4j.*; +import org.apache.log4j.AppenderSkeleton; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.apache.log4j.PatternLayout; import org.apache.log4j.spi.LoggingEvent; import org.broadinstitute.sting.commandline.CommandLineUtils; +import org.broadinstitute.sting.utils.crypt.CryptUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.io.IOUtils; -import java.io.*; -import java.util.*; +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; /** * @@ -53,6 +61,8 @@ public abstract class BaseTest { public static final String annotationDataLocation = GATKDataLocation + "Annotations/"; public static final String b37GoodBAM = validationDataLocation + "/CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; + public static final String b37GoodNA12878BAM = validationDataLocation + "/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam"; + public static final String b37_NA12878_OMNI = validationDataLocation + "/NA12878.omni.vcf"; public static final String refseqAnnotationLocation = annotationDataLocation + "refseq/"; public static final String hg18Refseq = refseqAnnotationLocation + "refGene-big-table-hg18.txt"; @@ -80,6 +90,9 @@ public abstract class BaseTest { public static final File testDirFile = new File("public/testdata/"); public static final String testDir = testDirFile.getAbsolutePath() + "/"; + public static final String keysDataLocation = validationDataLocation + "keys/"; + public static final String gatkKeyFile = CryptUtils.GATK_USER_KEY_DIRECTORY + "gsamembers_broadinstitute.org.key"; + /** before the class starts up */ static { // setup a basic log configuration @@ -134,7 +147,7 @@ public abstract class BaseTest { */ public static class TestDataProvider { private static final Map<Class, List<Object>> tests = new HashMap<Class, List<Object>>(); - private String name; + protected String name; /** * Create a new TestDataProvider instance bound to the class
variable C diff --git a/public/java/test/org/broadinstitute/sting/MedianUnitTest.java b/public/java/test/org/broadinstitute/sting/MedianUnitTest.java new file mode 100644 index 0000000000..db89aee78b --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/MedianUnitTest.java @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +// our package +package org.broadinstitute.sting; + + +// the imports for unit testing. + + +import org.broadinstitute.sting.utils.Median; +import org.testng.Assert; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + + +public class MedianUnitTest extends BaseTest { + + // -------------------------------------------------------------------------------- + // + // Provider + // + // -------------------------------------------------------------------------------- + + private class MedianTestProvider extends TestDataProvider { + final List values = new ArrayList(); + final int cap; + final Integer expected; + + public MedianTestProvider(int expected, int cap, Integer ... 
values) { + super(MedianTestProvider.class); + this.expected = expected; + this.cap = cap; + this.values.addAll(Arrays.asList(values)); + this.name = String.format("values=%s expected=%d cap=%d", this.values, expected, cap); + } + } + + @DataProvider(name = "MedianTestProvider") + public Object[][] makeMedianTestProvider() { + new MedianTestProvider(1, 1000, 0, 1, 2); + new MedianTestProvider(1, 1000, 1, 0, 1, 2); + new MedianTestProvider(1, 1000, 0, 1, 2, 3); + new MedianTestProvider(2, 1000, 0, 1, 2, 3, 4); + new MedianTestProvider(2, 1000, 4, 1, 2, 3, 0); + new MedianTestProvider(1, 1000, 1); + new MedianTestProvider(2, 1000, 2); + new MedianTestProvider(1, 1000, 1, 2); + + new MedianTestProvider(1, 3, 1); + new MedianTestProvider(1, 3, 1, 2); + new MedianTestProvider(2, 3, 1, 2, 3); + new MedianTestProvider(2, 3, 1, 2, 3, 4); + new MedianTestProvider(2, 3, 1, 2, 3, 4, 5); + + new MedianTestProvider(1, 3, 1); + new MedianTestProvider(1, 3, 1, 2); + new MedianTestProvider(2, 3, 3, 2, 1); + new MedianTestProvider(3, 3, 4, 3, 2, 1); + new MedianTestProvider(4, 3, 5, 4, 3, 2, 1); + + return MedianTestProvider.getTests(MedianTestProvider.class); + } + + @Test(dataProvider = "MedianTestProvider") + public void testBasicLikelihoods(MedianTestProvider cfg) { + final Median median = new Median(cfg.cap); + + int nAdded = 0; + for ( final int value : cfg.values ) + if ( median.add(value) ) + nAdded++; + + Assert.assertEquals(nAdded, median.size()); + + Assert.assertEquals(cfg.values.isEmpty(), median.isEmpty()); + Assert.assertEquals(cfg.values.size() >= cfg.cap, median.isFull()); + Assert.assertEquals(median.getMedian(), cfg.expected, cfg.toString()); + } + + @Test(expectedExceptions = IllegalStateException.class) + public void testEmptyMedian() { + final Median median = new Median(); + Assert.assertTrue(median.isEmpty()); + final Integer d = 100; + Assert.assertEquals(median.getMedian(d), d); + median.getMedian(); + } + +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index ca7653b580..c9e3b6b1b7 100755 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -30,6 +30,7 @@ import org.broad.tribble.Tribble; import org.broad.tribble.index.Index; import org.broad.tribble.index.IndexFactory; +import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; import org.broadinstitute.sting.gatk.CommandLineExecutable; import org.broadinstitute.sting.gatk.CommandLineGATK; @@ -45,7 +46,7 @@ import java.util.*; public class WalkerTest extends BaseTest { - private static final boolean ENABLE_REPORTING = false; + private static final boolean ENABLE_PHONE_HOME_FOR_TESTS = false; @BeforeMethod public void initializeRandomGenerator() { @@ -121,11 +122,19 @@ public String buildCommandLine(String... arguments) { } public class WalkerTestSpec { + + // Arguments implicitly included in all Walker command lines, unless explicitly + // disabled using the disableImplicitArgs() method below. + final String IMPLICIT_ARGS = ENABLE_PHONE_HOME_FOR_TESTS ? 
+ String.format("-et %s", GATKRunReport.PhoneHomeOption.STANDARD) : + String.format("-et %s -K %s", GATKRunReport.PhoneHomeOption.NO_ET, gatkKeyFile); + String args = ""; int nOutputFiles = -1; List md5s = null; List exts = null; Class expectedException = null; + boolean includeImplicitArgs = true; // the default output path for the integration test private File outputFileLocation = null; @@ -159,6 +168,10 @@ public WalkerTestSpec(String args, int nOutputFiles, Class expectedException) { this.expectedException = expectedException; } + public String getArgsWithImplicitArgs() { + return args + (includeImplicitArgs ? " " + IMPLICIT_ARGS : ""); + } + public void setOutputFileLocation(File outputFileLocation) { this.outputFileLocation = outputFileLocation; } @@ -180,6 +193,9 @@ public void addAuxFile(String expectededMD5sum, File outputfile) { auxillaryFiles.put(expectededMD5sum, outputfile); } + public void disableImplicitArgs() { + includeImplicitArgs = false; + } } protected boolean parameterize() { @@ -213,7 +229,7 @@ protected Pair, List> executeTest(final String name, WalkerTe tmpFiles.add(fl); } - final String args = String.format(spec.args, tmpFiles.toArray()); + final String args = String.format(spec.getArgsWithImplicitArgs(), tmpFiles.toArray()); System.out.println(Utils.dupString('-', 80)); if ( spec.expectsException() ) { @@ -277,13 +293,10 @@ private Pair, List> executeTest(String name, File outputFileL * @param args the argument list * @param expectedException the expected exception or null */ - public static void executeTest(String name, String args, Class expectedException) { + private void executeTest(String name, String args, Class expectedException) { CommandLineGATK instance = new CommandLineGATK(); String[] command = Utils.escapeExpressions(args); - // add the logging level to each of the integration test commands - command = Utils.appendArray(command, "-et", ENABLE_REPORTING ? 
"STANDARD" : "NO_ET"); - // run the executable boolean gotAnException = false; try { diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java index 5da8cebf47..20f3e1e352 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java @@ -79,7 +79,8 @@ public void timeDownsampling(int reps) { false, BAQ.CalculationMode.OFF, BAQ.QualityMode.DONT_MODIFY, - null, + null, // no BAQ + null, // no BQSR (byte)0); GenomeLocParser genomeLocParser = new GenomeLocParser(reader.getFileHeader().getSequenceDictionary()); diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java index fde0ce62f5..8cf9f7ce08 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java @@ -27,6 +27,7 @@ import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMSequenceDictionary; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; @@ -91,4 +92,16 @@ public void testNumberAndSizeOfIndexLevels() { Assert.assertEquals(bamIndex.getLevelSize(5),37448-4681+1); } + @Test( expectedExceptions = UserException.MalformedFile.class ) + public void testDetectTruncatedBamIndexWordBoundary() { + GATKBAMIndex index = new GATKBAMIndex(new File(validationDataLocation + "truncated_at_word_boundary.bai")); + index.readReferenceSequence(0); + } + + @Test( expectedExceptions = UserException.MalformedFile.class ) + public void testDetectTruncatedBamIndexNonWordBoundary() { + GATKBAMIndex index = new GATKBAMIndex(new File(validationDataLocation + "truncated_at_non_word_boundary.bai")); + index.readReferenceSequence(0); + } + } diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index 4011594f32..7282d6c485 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -6,6 +6,7 @@ import net.sf.samtools.util.CloseableIterator; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; @@ -85,6 +86,55 @@ public void testIndelBaseQualityFiltering() { Assert.assertTrue(foundExtendedEventPileup,"Extended event pileup not found"); } + @Test + public void testIndelsInRegularPileup() { + final byte[] bases = new byte[] {'A','A','A','A','A','A','A','A','A','A'}; + final byte[] indelBases = new byte[] {'A','A','A','A','C','T','A','A','A','A','A','A'}; + + // create a test version of the Reads object + ReadProperties readAttributes = createTestReadProperties(); + 
JVMUtils.setFieldValue(JVMUtils.findField(ReadProperties.class,"generateExtendedEvents"),readAttributes,true); + + SAMRecord before = ArtificialSAMUtils.createArtificialRead(header,"before",0,1,10); + before.setReadBases(bases); + before.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); + before.setCigarString("10M"); + + SAMRecord during = ArtificialSAMUtils.createArtificialRead(header,"during",0,2,10); + during.setReadBases(indelBases); + during.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20}); + during.setCigarString("4M2I6M"); + + SAMRecord after = ArtificialSAMUtils.createArtificialRead(header,"after",0,3,10); + after.setReadBases(bases); + after.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); + after.setCigarString("10M"); + + List reads = Arrays.asList(before,during,after); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads,readAttributes); + + boolean foundIndel = false; + while (li.hasNext()) { + AlignmentContext context = li.next(); + if(!context.hasBasePileup()) + continue; + + ReadBackedPileup pileup = context.getBasePileup().getBaseFilteredPileup(10); + for (PileupElement p : pileup) { + if (p.isBeforeInsertion()) { + foundIndel = true; + Assert.assertEquals(p.getEventLength(), 2, "Wrong event length"); + Assert.assertEquals(p.getEventBases(), "CT", "Inserted bases are incorrect"); + break; + } + } + + } + + Assert.assertTrue(foundIndel,"Indel in pileup not found"); + } /** * Right now, the GATK's extended event pileup DOES NOT include reads which stop immediately before an insertion @@ -308,6 +358,7 @@ private static ReadProperties createTestReadProperties() { BAQ.CalculationMode.OFF, BAQ.QualityMode.DONT_MODIFY, null, // no BAQ + null, // no BQSR (byte) -1 ); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java index c9b81a9d35..b3b9ab555c 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java @@ -30,7 +30,7 @@ import org.testng.annotations.Test; public class GATKReportUnitTest extends BaseTest { - @Test + @Test(enabled = false) public void testParse() throws Exception { String reportPath = validationDataLocation + "exampleGATKReport.eval"; GATKReport report = new GATKReport(reportPath); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsWalkerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsUnitTest.java similarity index 74% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsWalkerUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsUnitTest.java index 8cd10048aa..0fcaad3bf6 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsWalkerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsUnitTest.java @@ -1,20 +1,19 @@ package org.broadinstitute.sting.gatk.walkers; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.sam.ArtificialReadsTraversal; import org.broadinstitute.sting.utils.sam.ArtificialSAMFileWriter; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import net.sf.samtools.SAMRecord; -import 
net.sf.samtools.SAMFileHeader; - -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertTrue; - import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; + /* * Copyright (c) 2009 The Broad Institute @@ -44,11 +43,11 @@ /** * @author aaron *
<p/>
- * Class PrintReadsWalkerUnitTest + * Class PrintReadsUnitTest *
<p/>
* This tests the print reads walker, using the artificial reads traversal */ -public class PrintReadsWalkerUnitTest extends BaseTest { +public class PrintReadsUnitTest extends BaseTest { /** * our private fake reads traversal. This traversal seeds the @@ -60,19 +59,23 @@ public class PrintReadsWalkerUnitTest extends BaseTest { private ReferenceContext bases = null; //private ReferenceContext ref = new ReferenceContext() + PrintReadsWalker walker; + ArtificialSAMFileWriter writer; + @BeforeMethod public void before() { trav = new ArtificialReadsTraversal(); readTotal = ( ( trav.endingChr - trav.startingChr ) + 1 ) * trav.readsPerChr + trav.unMappedReads; + + walker = new PrintReadsWalker(); + writer = new ArtificialSAMFileWriter(); + walker.out = writer; + walker.initialize(); } /** test that we get out the same number of reads we put in */ @Test public void testReadCount() { - PrintReadsWalker walker = new PrintReadsWalker(); - ArtificialSAMFileWriter writer = new ArtificialSAMFileWriter(); - walker.out = writer; - trav.traverse(walker, null, writer); assertEquals(writer.getRecords().size(), readTotal); } @@ -80,10 +83,6 @@ public void testReadCount() { /** test that we're ok with a null read */ @Test public void testNullRead() { - PrintReadsWalker walker = new PrintReadsWalker(); - ArtificialSAMFileWriter writer = new ArtificialSAMFileWriter(); - walker.out = writer; - SAMRecord rec = walker.map(bases, null, null); assertTrue(rec == null); } @@ -91,10 +90,6 @@ public void testNullRead() { /** tes that we get the read we put into the map function */ @Test public void testReturnRead() { - PrintReadsWalker walker = new PrintReadsWalker(); - ArtificialSAMFileWriter writer = new ArtificialSAMFileWriter(); - walker.out = writer; - SAMFileHeader head = ArtificialSAMUtils.createArtificialSamHeader(3,1,1000); GATKSAMRecord rec = ArtificialSAMUtils.createArtificialRead(head, "FakeRead", 1, 1, 50); SAMRecord ret = walker.map(bases, rec, null); @@ -102,20 +97,6 @@ public void testReturnRead() { assertTrue(ret.getReadName().equals(rec.getReadName())); } - /** test that the read makes it to the output source */ - @Test - public void testReducingRead() { - PrintReadsWalker walker = new PrintReadsWalker(); - ArtificialSAMFileWriter writer = new ArtificialSAMFileWriter(); - walker.out = writer; - - SAMFileHeader head = ArtificialSAMUtils.createArtificialSamHeader(3,1,1000); - SAMRecord rec = ArtificialSAMUtils.createArtificialRead(head, "FakeRead", 1, 1, 50); - SAMRecord ret = walker.map(bases, null,null); - walker.reduce(ret,writer); - - assertTrue(writer.getRecords().size() == 1); - } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 0aec946638..02026b375e 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -32,7 +32,7 @@ public void testHasAnnotsNotAsking2() { public void testHasAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("e70eb5f80c93e366dcbe3cf684c154e4")); + Arrays.asList("3b7796fa7c7dc94878bedadf7938db4c")); executeTest("test file 
has annotations, asking for annotations, #1", spec); } @@ -66,7 +66,7 @@ public void testNoAnnotsNotAsking2() { public void testNoAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("1e52761fdff73a5361b5eb0a6e5d9dad")); + Arrays.asList("279cace364f747f9bae7fe391b5026f0")); executeTest("test file doesn't have annotations, asking for annotations, #1", spec); } @@ -82,7 +82,7 @@ public void testNoAnnotsAsking2() { public void testExcludeAnnotations() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard -XA FisherStrand -XA ReadPosRankSumTest --variant:VCF3 " + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("bb4eebfaffc230cb8a31e62e7b53a300")); + Arrays.asList("e488abd05d6162758698a3a7579866a6")); executeTest("test exclude annotations", spec); } @@ -110,6 +110,14 @@ public void testDBTagWithDbsnp() { executeTest("getting DB tag with dbSNP", spec); } + @Test + public void testMultipleIdsWithDbsnp() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --alwaysAppendDbsnpId --dbsnp " + b36dbSNP129 + " -G Standard --variant " + validationDataLocation + "vcfexample3withIDs.vcf -L " + validationDataLocation + "vcfexample3withIDs.vcf", 1, + Arrays.asList("cd7e3d43b8f5579c461b3e588a295fa8")); + executeTest("adding multiple IDs with dbSNP", spec); + } + @Test public void testDBTagWithHapMap() { WalkerTestSpec spec = new WalkerTestSpec( @@ -171,7 +179,7 @@ public void testSnpEffAnnotationsUnsupportedVersion() { @Test public void testTDTAnnotation() { - final String MD5 = "204e67536a17af7eaa6bf0a910818997"; + final String MD5 = "a78c1e950740d3c13c0258960c5fa8e1"; WalkerTestSpec spec = new WalkerTestSpec( "-T VariantAnnotator -R " + b37KGReference + " -A TransmissionDisequilibriumTest --variant:vcf " + validationDataLocation + "ug.random50000.subset300bp.chr1.family.vcf" + " -L " + validationDataLocation + "ug.random50000.subset300bp.chr1.family.vcf -NO_HEADER -ped " + validationDataLocation + "ug.random50000.family.ped -o %s", 1, diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java index 1a01ef8e8e..9aae1f0ae9 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java @@ -41,7 +41,7 @@ public void testBeagleOutput() { "--beagleR2:BEAGLE " + beagleValidationDataLocation + "inttestbgl.r2 " + "--beagleProbs:BEAGLE " + beagleValidationDataLocation + "inttestbgl.gprobs " + "--beaglePhased:BEAGLE " + beagleValidationDataLocation + "inttestbgl.phased " + - "-o %s -NO_HEADER", 1, Arrays.asList("b445d280fd8fee1eeb4aacb3f5a54847")); + "-o %s -NO_HEADER", 1, Arrays.asList("6d0f213918e3b9ea33bc2f8a51a462f1")); executeTest("test BeagleOutputToVCF", spec); } @@ -72,7 +72,7 @@ public void testBeagleOutput2() { "--beagleR2:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.r2 "+ "--beagleProbs:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.gprobs.bgl "+ "--beaglePhased:beagle 
/humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.phased.bgl "+ - "-L 20:1-70000 -o %s -NO_HEADER ",1,Arrays.asList("51a57ea565176edd96d907906914b0ee")); + "-L 20:1-70000 -o %s -NO_HEADER ",1,Arrays.asList("ddbf490f1d9f37cc79fe414c8d40886f")); executeTest("testBeagleChangesSitesToRef",spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java new file mode 100644 index 0000000000..aa6a72ef97 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java @@ -0,0 +1,103 @@ +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.util.BitSet; +import java.util.Random; + +/** + * Short one line description of the walker. + * + *
<p/>
+ * [Long description of the walker]
+ * <p/>
+ *
+ *
+ * <h2>Input</h2>
+ * <p/>
+ * [Description of the Input]
+ * <p/>
+ *
+ * <h2>Output</h2>
+ * <p/>
+ * [Description of the Output]
+ * <p/>
+ *
+ * <h2>Examples</h2>
+ * <pre>
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T [walker name]
+ *  </pre>
+ * + * @author Mauricio Carneiro + * @since 3/1/12 + */ +public class ContextCovariateUnitTest { + ContextCovariate covariate; + RecalibrationArgumentCollection RAC; + Random random; + + @BeforeClass + public void init() { + RAC = new RecalibrationArgumentCollection(); + covariate = new ContextCovariate(); + random = GenomeAnalysisEngine.getRandomGenerator(); + covariate.initialize(RAC); + + } + + @Test(enabled = true) + public void testSimpleContexts() { + byte [] quals = createRandomReadQuals(101); + byte [] bbases = createRandomReadBases(101); + String bases = stringFrom(bbases); + GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M"); + CovariateValues values = covariate.getValues(read); + verifyCovariateArray((BitSet []) values.getMismatches(), RAC.MISMATCHES_CONTEXT_SIZE, bases); + verifyCovariateArray((BitSet []) values.getInsertions(), RAC.INSERTIONS_CONTEXT_SIZE, bases); + verifyCovariateArray((BitSet []) values.getDeletions(), RAC.DELETIONS_CONTEXT_SIZE, bases); + } + + private void verifyCovariateArray(BitSet[] values, int contextSize, String bases) { + for (int i=0; i= contextSize) + Assert.assertEquals(MathUtils.dnaFrom(values[i]), bases.substring(i-contextSize, i)); + else + Assert.assertNull(values[i]); + } + } + + private String stringFrom(byte [] array) { + String s = ""; + for (byte value : array) + s += (char) value; + return s; + } + + private byte [] createRandomReadQuals(int length) { + byte [] quals = new byte[length]; + for (int i=0; i e = new HashMap(); - e.put( "--min_base_quality_score 26", "7acb1a5aee5fdadb0cc0ea07a212efc6" ); - e.put( "--computeSLOD", "6172d2f3d370132f4c57a26aa94c256e" ); + e.put( "--min_base_quality_score 26", "258c1b33349eb3b2d395ec4d69302725" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -125,12 +132,20 @@ public void testCallingParameters() { } } + @Test + public void testSLOD() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, + Arrays.asList("6172d2f3d370132f4c57a26aa94c256e")); + executeTest("test SLOD", spec); + } + @Test public void testOutputParameter() { HashMap e = new HashMap(); e.put( "-sites_only", "44f3b5b40e6ad44486cddfdb7e0bfcd8" ); - e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "42e4ea7878ef8d96215accb3ba4e97b7" ); - e.put( "--output_mode EMIT_ALL_SITES", "e0443c720149647469f2a2f3fb73942f" ); + e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "553f6b4cbf380885bec9dd634cf68742" ); + e.put( "--output_mode EMIT_ALL_SITES", "6d8624e45ad9dae5803ac705b39e4ffa" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -164,8 +179,8 @@ public void testConfidence2() { @Test public void testHeterozyosity() { HashMap e = new HashMap(); - e.put( 0.01, "2cb2544739e01f6c08fd820112914317" ); - e.put( 1.0 / 1850, "730b2b83a4b1f6d46fc3b5cd7d90756c" ); + e.put( 0.01, "926b58038dd4989bf7eda697a847eea9" ); + e.put( 1.0 / 1850, "93f44105b43b65730a3b821e27b0fa16" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -189,7 +204,7 @@ public void testMultiTechnologies() { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("2b2729414ae855d390e7940956745bce")); + 
Arrays.asList("a1b75a7e12b160b0be823228c958573f")); executeTest(String.format("test multiple technologies"), spec); } @@ -208,7 +223,7 @@ public void testCallingWithBAQ() { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("95c6120efb92e5a325a5cec7d77c2dab")); + Arrays.asList("3bda1279cd6dcb47885f3e19466f11b9")); executeTest(String.format("test calling with BAQ"), spec); } @@ -227,7 +242,7 @@ public void testSimpleIndels() { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("b11df6587e4e16cb819d76a900446946")); + Arrays.asList("d9fc3ba94a0d46029778c7b457e7292a")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -242,7 +257,7 @@ public void testIndelsWithLowMinAlleleCnt() { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("2ad52c2e75b3ffbfd8f03237c444e8e6")); + Arrays.asList("b2e30ae3e5ffa6108f9f6178b1d2e679")); executeTest(String.format("test indel caller in SLX with low min allele count"), spec); } @@ -255,7 +270,7 @@ public void testMultiTechnologyIndels() { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("59068bc8888ad5f08790946066d76602")); + Arrays.asList("2cd182a84613fa91a6020466d2d327e2")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -265,7 +280,7 @@ public void testWithIndelAllelesPassedIn1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("fa4f3ee67d98b64102a8a3ec81a3bc81")); + Arrays.asList("9cd08dc412a007933381e9c76c073899")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec1); } @@ -275,7 +290,7 @@ public void testWithIndelAllelesPassedIn2() { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("df90890e43d735573a3b3e4f289ca46b")); + Arrays.asList("5ef1f007d3ef77c1b8f31e5e036eff53")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2); } @@ -285,7 +300,7 @@ public void testWithIndelAllelesPassedIn3() { WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1, - Arrays.asList("cff6dd0f4eb1ef0b6fc476da6ffead19")); + Arrays.asList("2609675a356f2dfc86f8a1d911210978")); executeTest("test MultiSample Pilot2 indels with complicated records", spec3); } @@ -294,7 +309,7 @@ public void testWithIndelAllelesPassedIn4() { WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec( baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation + "phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1, - Arrays.asList("1e2a4aab26e9ab0dae709d33a669e036")); + Arrays.asList("4fdd8da77167881b71b3547da5c13f94")); executeTest("test MultiSample Phase1 indels with complicated records", spec4); } diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java index 65de6697b3..b53daaf397 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java @@ -34,8 +34,8 @@ public Object[][] createCCTestData() { new CCTest( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "ab4940a16ab990181bd8368c76b23853" ); new CCTest( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "17d4b8001c982a70185e344929cf3941"); - new CCTest( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "36c0c467b6245c2c6c4e9c956443a154" ); - new CCTest( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "955a8fa2ddb2b04c406766ccd9ac45cc" ); + new CCTest( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "714e65d6cb51ae32221a77ce84cbbcdc" ); + new CCTest( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "932f0063abb2a23c22ec992ef8d36aa5" ); return CCTest.getTests(CCTest.class); } @@ -91,8 +91,8 @@ public String toString() { public Object[][] createTRTestData() { new TRTest( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "0b7123ae9f4155484b68e4a4f96c5504" ); new TRTest( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "d04cf1f6df486e45226ebfbf93a188a5"); - new TRTest( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "b2f4757bc47cf23bd9a09f756c250787" ); - new TRTest( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "502c7df4d4923c4d078b014bf78bed34" ); + new TRTest( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "74314e5562c1a65547bb0edaacffe602" ); + new TRTest( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "41c2f82f7789421f3690ed3c35b8f2e4" ); return TRTest.getTests(TRTest.class); } @@ -291,7 +291,7 @@ public void testCountCovariatesVCFPlusDBsnp() { @Test public void testCountCovariatesNoIndex() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "828d247c6e8ef5ebdf3603dc0ce79f61" ); + e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "aac7df368ca589dc0a66d5bd9ad007e3" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -317,7 +317,7 @@ public void testCountCovariatesNoIndex() { @Test(dependsOnMethods = "testCountCovariatesNoIndex") public void testTableRecalibratorNoIndex() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "991f093a0e610df235d28ada418ebf33" ); + e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "02249d9933481052df75c58a2a1a8e63" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index 5c3a43c96e..36c093e8fb 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -30,7 +30,7 @@ public void testFunctionClassWithSnpeff() { "-o %s" ), 1, - Arrays.asList("f909fd8374f663e983b9b3fda4cf5cf1") + Arrays.asList("c8d8bffa5c572df9dec7364f71a1b943") ); executeTest("testFunctionClassWithSnpeff", spec); } @@ -277,7 +277,7 @@ public void testSelect1() { " --eval " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf" + " --comp:comp_genotypes,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.head.vcf"; WalkerTestSpec spec = new WalkerTestSpec(withSelect(tests, "DP < 50", "DP50") + " " + extraArgs + " -ST CpG -o %s", - 1, Arrays.asList("4f60acc8a4b21c4b4ccb51ad9071449c")); + 1, Arrays.asList("c49e239292704447a36e01ee9a71e729")); executeTestParallel("testSelect1", spec); } @@ -335,7 +335,7 @@ public void testEvalTrackWithoutGenotypes() { " --dbsnp " + b37dbSNP132 + " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("190e1a171132832bf92fbca56a9c40bb")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("e42cda858649a35eaa9d14ea2d70a956")); executeTestParallel("testEvalTrackWithoutGenotypes",spec); } @@ -347,7 +347,7 @@ public void testMultipleEvalTracksWithoutGenotypes() { " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + " --eval:evalBC " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bc.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("08586d443fdcf3b7f63b8f9e3a943c62")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("9561cb4c7aa36dcf30ba253385299859")); executeTestParallel("testMultipleEvalTracksWithoutGenotypes",spec); } @@ -463,7 +463,7 @@ public void testModernVCFWithLargeIndels() { "-o %s" ), 1, - Arrays.asList("a6f8b32fa732632da13dfe3ddcc73cef") + Arrays.asList("397b0e77459b9b69d2e0dd1dac320c3c") ); executeTest("testModernVCFWithLargeIndels", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java index 5a4d6e6a1b..d74aac79d4 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -110,7 +110,7 @@ public void combinePLs(String file1, String file2, String md5) { " -priority NA19240_BGI,NA19240_ILLUMINA,NA19240_WUGSC,denovoInfo" + " -genotypeMergeOptions UNIQUIFY -L 1"), 1, - Arrays.asList("b14f8cbb5d03a2e613b12da4da9efd9a")); + Arrays.asList("ab72f4bfb16d3894942149173a087647")); executeTest("threeWayWithRefs", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 042de2a27d..900e3d489e 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -45,7 +45,7 @@ public void testRepeatedLineSelection() { WalkerTestSpec spec = new WalkerTestSpec( 
baseTestString(" -sn A -sn B -sn C --variant " + testfile), 1, - Arrays.asList("b74038779fe6485dbb8734ae48178356") + Arrays.asList("5085a2f8cddfeae9f6274f905025184f") ); executeTest("testRepeatedLineSelection--" + testfile, spec); @@ -58,7 +58,7 @@ public void testDiscordance() { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 --variant " + b37hapmapGenotypes + " -disc " + testFile + " -o %s -NO_HEADER", 1, - Arrays.asList("78e6842325f1f1bc9ab30d5e7737ee6e") + Arrays.asList("929bbb96381541c162dc7e5462e26ea2") ); executeTest("testDiscordance--" + testFile, spec); @@ -129,6 +129,19 @@ public void testMultipleRecordsAtOnePosition() { executeTest("testMultipleRecordsAtOnePositionFirstIsFiltered--" + testFile, spec); } + @Test + public void testNoGTs() { + String testFile = validationDataLocation + "vcf4.1.example.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b37KGReference + " --variant " + testFile + " -o %s -NO_HEADER", + 1, + Arrays.asList("f17885e5cbd5387edb99112047ea43c1") + ); + + executeTest("testMultipleRecordsAtOnePositionFirstIsFiltered--" + testFile, spec); + } + @Test public void testParallelization() { String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java index 19021c1c28..6188f22558 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java @@ -27,10 +27,8 @@ import org.broadinstitute.sting.WalkerTest; import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.annotations.Test; -import org.testng.annotations.DataProvider; import java.util.*; -import java.io.File; public class VariantsToTableIntegrationTest extends WalkerTest { private String variantsToTableCmd(String moreArgs) { @@ -38,7 +36,15 @@ private String variantsToTableCmd(String moreArgs) { " --variant:vcf " + validationDataLocation + "/soap_gatk_annotated.vcf" + " -T VariantsToTable" + " -F CHROM -F POS -F ID -F REF -F ALT -F QUAL -F FILTER -F TRANSITION -F DP -F SB -F set -F RankSumP -F refseq.functionalClass*" + - " -L chr1 -KMA -o %s" + moreArgs; + " -L chr1 -o %s" + moreArgs; + } + + private String variantsToTableMultiAllelicCmd(String moreArgs) { + return "-R " + b37KGReference + + " --variant " + validationDataLocation + "/multiallelic.vcf" + + " -T VariantsToTable" + + " -F CHROM -F POS -F ID -F REF -F ALT -F QUAL -F MULTI-ALLELIC -F AC -F AF" + + " -o %s" + moreArgs; } @Test(enabled = true) @@ -53,4 +59,18 @@ public void testComplexVariantsToTableFail() { WalkerTestSpec spec = new WalkerTestSpec(variantsToTableCmd(""), 1, UserException.class); executeTest("testComplexVariantsToTable-FAIL", spec); } + + @Test(enabled = true) + public void testMultiAllelicOneRecord() { + WalkerTestSpec spec = new WalkerTestSpec(variantsToTableMultiAllelicCmd(""), + Arrays.asList("13dd36c08be6c800f23988e6000d963e")); + executeTest("testMultiAllelicOneRecord", spec).getFirst(); + } + + @Test(enabled = true) + public void testMultiAllelicSplitRecords() { + WalkerTestSpec spec = new WalkerTestSpec(variantsToTableMultiAllelicCmd(" -SMA"), + Arrays.asList("17a0fc80409d2fc00ad2bbb94b3a346b")); + 
executeTest("testMultiAllelicSplitRecords", spec).getFirst(); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java new file mode 100644 index 0000000000..25bd7a2ebd --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils; + + +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * Basic unit test for Haplotype Class + */ +public class HaplotypeUnitTest extends BaseTest { + @BeforeClass + public void init() { + } + + @Test + public void testSimpleInsertionAllele() { + final String bases = "ACTGGTCAACTGGTCAACTGGTCAACTGGTCA"; + + final ArrayList h1CigarList = new ArrayList(); + h1CigarList.add(new CigarElement(bases.length(), CigarOperator.M)); + final Cigar h1Cigar = new Cigar(h1CigarList); + String h1bases = "AACTTCTGGTCAACTGGTCAACTGGTCAACTGGTCA"; + basicInsertTest("-", "ACTT", 1, h1Cigar, bases, h1bases); + h1bases = "ACTGGTCACTTAACTGGTCAACTGGTCAACTGGTCA"; + basicInsertTest("-", "ACTT", 7, h1Cigar, bases, h1bases); + h1bases = "ACTGGTCAACTGGTCAAACTTCTGGTCAACTGGTCA"; + basicInsertTest("-", "ACTT", 17, h1Cigar, bases, h1bases); + } + + @Test + public void testSimpleDeletionAllele() { + final String bases = "ACTGGTCAACTGGTCAACTGGTCAACTGGTCA"; + + final ArrayList h1CigarList = new ArrayList(); + h1CigarList.add(new CigarElement(bases.length(), CigarOperator.M)); + final Cigar h1Cigar = new Cigar(h1CigarList); + String h1bases = "ATCAACTGGTCAACTGGTCAACTGGTCA"; + basicInsertTest("ACTT", "-", 1, h1Cigar, bases, h1bases); + h1bases = "ACTGGTCGGTCAACTGGTCAACTGGTCA"; + basicInsertTest("ACTT", "-", 7, h1Cigar, bases, h1bases); + h1bases = "ACTGGTCAACTGGTCAATCAACTGGTCA"; + basicInsertTest("ACTT", "-", 17, h1Cigar, bases, h1bases); + } + + @Test + public void testSimpleSNPAllele() { + final String bases = "ACTGGTCAACTGGTCAACTGGTCAACTGGTCA"; + + final ArrayList h1CigarList = new ArrayList(); + h1CigarList.add(new CigarElement(bases.length(), CigarOperator.M)); + final Cigar 
h1Cigar = new Cigar(h1CigarList); + String h1bases = "AGTGGTCAACTGGTCAACTGGTCAACTGGTCA"; + basicInsertTest("C", "G", 1, h1Cigar, bases, h1bases); + h1bases = "ACTGGTCTACTGGTCAACTGGTCAACTGGTCA"; + basicInsertTest("A", "T", 7, h1Cigar, bases, h1bases); + h1bases = "ACTGGTCAACTGGTCAAATGGTCAACTGGTCA"; + basicInsertTest("C", "A", 17, h1Cigar, bases, h1bases); + } + + @Test + public void testComplexInsertionAllele() { + final String bases = "ATCG" + "CCGGCCGGCC" + "ATCGATCG" + "AGGGGGA" + "AGGC"; + + final ArrayList h1CigarList = new ArrayList(); + h1CigarList.add(new CigarElement(4, CigarOperator.M)); + h1CigarList.add(new CigarElement(10, CigarOperator.I)); + h1CigarList.add(new CigarElement(8, CigarOperator.M)); + h1CigarList.add(new CigarElement(3, CigarOperator.D)); + h1CigarList.add(new CigarElement(7, CigarOperator.M)); + h1CigarList.add(new CigarElement(4, CigarOperator.M)); + final Cigar h1Cigar = new Cigar(h1CigarList); + String h1bases = "AACTTTCG" + "CCGGCCGGCC" + "ATCGATCG" + "AGGGGGA" + "AGGC"; + basicInsertTest("-", "ACTT", 1, h1Cigar, bases, h1bases); + h1bases = "ATCG" + "CCGGCCGGCC" + "ATCACTTGATCG" + "AGGGGGA" + "AGGC"; + basicInsertTest("-", "ACTT", 7, h1Cigar, bases, h1bases); + h1bases = "ATCG" + "CCGGCCGGCC" + "ATCGATCG" + "AGACTTGGGGA" + "AGGC"; + basicInsertTest("-", "ACTT", 17, h1Cigar, bases, h1bases); + } + + @Test + public void testComplexDeletionAllele() { + final String bases = "ATCG" + "CCGGCCGGCC" + "ATCGATCG" + "AGGGGGA" + "AGGC"; + + final ArrayList h1CigarList = new ArrayList(); + h1CigarList.add(new CigarElement(4, CigarOperator.M)); + h1CigarList.add(new CigarElement(10, CigarOperator.I)); + h1CigarList.add(new CigarElement(8, CigarOperator.M)); + h1CigarList.add(new CigarElement(3, CigarOperator.D)); + h1CigarList.add(new CigarElement(7, CigarOperator.M)); + h1CigarList.add(new CigarElement(4, CigarOperator.M)); + final Cigar h1Cigar = new Cigar(h1CigarList); + String h1bases = "A" + "CGGCCGGCC" + "ATCGATCG" + "AGGGGGA" + "AGGC"; + basicInsertTest("ACTT", "-", 1, h1Cigar, bases, h1bases); + h1bases = "ATCG" + "CCGGCCGGCC" + "ATCG" + "AGGGGGA" + "AGGC"; + basicInsertTest("ACTT", "-", 7, h1Cigar, bases, h1bases); + h1bases = "ATCG" + "CCGGCCGGCC" + "ATCGATCG" + "AGA" + "AGGC"; + basicInsertTest("ACTT", "-", 17, h1Cigar, bases, h1bases); + } + + @Test + public void testComplexSNPAllele() { + final String bases = "ATCG" + "CCGGCCGGCC" + "ATCGATCG" + "AGGGGGA" + "AGGC"; + + final ArrayList h1CigarList = new ArrayList(); + h1CigarList.add(new CigarElement(4, CigarOperator.M)); + h1CigarList.add(new CigarElement(10, CigarOperator.I)); + h1CigarList.add(new CigarElement(8, CigarOperator.M)); + h1CigarList.add(new CigarElement(3, CigarOperator.D)); + h1CigarList.add(new CigarElement(7, CigarOperator.M)); + h1CigarList.add(new CigarElement(4, CigarOperator.M)); + final Cigar h1Cigar = new Cigar(h1CigarList); + String h1bases = "AGCG" + "CCGGCCGGCC" + "ATCGATCG" + "AGGGGGA" + "AGGC"; + basicInsertTest("T", "G", 1, h1Cigar, bases, h1bases); + h1bases = "ATCG" + "CCGGCCGGCC" + "ATCTATCG" + "AGGGGGA" + "AGGC"; + basicInsertTest("G", "T", 7, h1Cigar, bases, h1bases); + h1bases = "ATCG" + "CCGGCCGGCC" + "ATCGATCG" + "AGCGGGA" + "AGGC"; + basicInsertTest("G", "C", 17, h1Cigar, bases, h1bases); + } + + private void basicInsertTest(String ref, String alt, int loc, Cigar cigar, String hap, String newHap) { + final int INDEL_PADDING_BASE = (ref.length() == alt.length() ? 
0 : 1); + final Haplotype h = new Haplotype(hap.getBytes()); + final Allele h1refAllele = Allele.create(ref, true); + final Allele h1altAllele = Allele.create(alt, false); + final Haplotype h1 = new Haplotype( h.insertAllele(h1refAllele, h1altAllele, loc - INDEL_PADDING_BASE, 0, cigar) ); + final Haplotype h1expected = new Haplotype(newHap.getBytes()); + Assert.assertEquals(h1, h1expected); + + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java index 251c105c3a..1ba6c74d46 100755 --- a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java @@ -26,22 +26,20 @@ package org.broadinstitute.sting.utils; +import org.broadinstitute.sting.BaseTest; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; -import org.broadinstitute.sting.BaseTest; - -import java.util.List; -import java.util.ArrayList; -import java.util.Collections; +import java.util.*; /** * Basic unit test for MathUtils */ public class MathUtilsUnitTest extends BaseTest { @BeforeClass - public void init() { } + public void init() { + } /** * Tests that we get the right values from the binomial distribution @@ -66,20 +64,20 @@ public void testBinomialProbability() { public void testMultinomialProbability() { logger.warn("Executing testMultinomialProbability"); - int[] counts0 = { 2, 0, 1 }; - double[] probs0 = { 0.33, 0.33, 0.34 }; + int[] counts0 = {2, 0, 1}; + double[] probs0 = {0.33, 0.33, 0.34}; Assert.assertEquals(MathUtils.multinomialProbability(counts0, probs0), 0.111078, 1e-6); - int[] counts1 = { 10, 20, 30 }; - double[] probs1 = { 0.25, 0.25, 0.50 }; + int[] counts1 = {10, 20, 30}; + double[] probs1 = {0.25, 0.25, 0.50}; Assert.assertEquals(MathUtils.multinomialProbability(counts1, probs1), 0.002870301, 1e-9); - int[] counts2 = { 38, 82, 50, 36 }; - double[] probs2 = { 0.25, 0.25, 0.25, 0.25 }; + int[] counts2 = {38, 82, 50, 36}; + double[] probs2 = {0.25, 0.25, 0.25, 0.25}; Assert.assertEquals(MathUtils.multinomialProbability(counts2, probs2), 1.88221e-09, 1e-10); - int[] counts3 = { 1, 600, 1 }; - double[] probs3 = { 0.33, 0.33, 0.34 }; + int[] counts3 = {1, 600, 1}; + double[] probs3 = {0.33, 0.33, 0.34}; Assert.assertEquals(MathUtils.multinomialProbability(counts3, probs3), 5.20988e-285, 1e-286); } @@ -123,19 +121,21 @@ public void testSliceListByIndices() { Assert.assertTrue(FiveAlpha.containsAll(BigFiveAlpha)); } - /** Tests that we correctly compute mean and standard deviation from a stream of numbers */ + /** + * Tests that we correctly compute mean and standard deviation from a stream of numbers + */ @Test public void testRunningAverage() { logger.warn("Executing testRunningAverage"); - int [] numbers = {1,2,4,5,3,128,25678,-24}; + int[] numbers = {1, 2, 4, 5, 3, 128, 25678, -24}; MathUtils.RunningAverage r = new MathUtils.RunningAverage(); - for ( int i = 0 ; i < numbers.length ; i++ ) r.add((double)numbers[i]); + for (int i = 0; i < numbers.length; i++) r.add((double) numbers[i]); - Assert.assertEquals((long)numbers.length, r.observationCount()); - Assert.assertTrue(r.mean()- 3224.625 < 2e-10 ); - Assert.assertTrue(r.stddev()-9072.6515881128 < 2e-10); + Assert.assertEquals((long) numbers.length, r.observationCount()); + Assert.assertTrue(r.mean() - 3224.625 < 2e-10); + Assert.assertTrue(r.stddev() - 9072.6515881128 < 2e-10); } @Test @@ -174,4 +174,149 @@ public 
void testLog10Factorial() { Assert.assertEquals(MathUtils.log10Factorial(12342), 45138.26, 1e-1); } + @Test(enabled = true) + public void testRandomSubset() { + Integer[] x = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + Assert.assertEquals(MathUtils.randomSubset(x, 0).length, 0); + Assert.assertEquals(MathUtils.randomSubset(x, 1).length, 1); + Assert.assertEquals(MathUtils.randomSubset(x, 2).length, 2); + Assert.assertEquals(MathUtils.randomSubset(x, 3).length, 3); + Assert.assertEquals(MathUtils.randomSubset(x, 4).length, 4); + Assert.assertEquals(MathUtils.randomSubset(x, 5).length, 5); + Assert.assertEquals(MathUtils.randomSubset(x, 6).length, 6); + Assert.assertEquals(MathUtils.randomSubset(x, 7).length, 7); + Assert.assertEquals(MathUtils.randomSubset(x, 8).length, 8); + Assert.assertEquals(MathUtils.randomSubset(x, 9).length, 9); + Assert.assertEquals(MathUtils.randomSubset(x, 10).length, 10); + Assert.assertEquals(MathUtils.randomSubset(x, 11).length, 10); + + for (int i = 0; i < 25; i++) + Assert.assertTrue(hasUniqueElements(MathUtils.randomSubset(x, 5))); + + } + + @Test(enabled = true) + public void testArrayShuffle() { + Integer[] x = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + for (int i = 0; i < 25; i++) { + Object[] t = MathUtils.arrayShuffle(x); + Assert.assertTrue(hasUniqueElements(t)); + Assert.assertTrue(hasAllElements(x, t)); + } + } + + /** + * Private functions used by testArrayShuffle() + */ + private boolean hasUniqueElements(Object[] x) { + for (int i = 0; i < x.length; i++) + for (int j = i + 1; j < x.length; j++) + if (x[i].equals(x[j]) || x[i] == x[j]) + return false; + return true; + } + + private boolean hasAllElements(final Object[] expected, final Object[] actual) { + HashSet set = new HashSet(); + set.addAll(Arrays.asList(expected)); + set.removeAll(Arrays.asList(actual)); + return set.isEmpty(); + } + + @Test(enabled = true) + public void testIntAndBitSetConversion() { + Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(428)), 428); + Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(239847)), 239847); + Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(12726)), 12726); + Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(0)), 0); + Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(1)), 1); + Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(65536)), 65536); + Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(Long.MAX_VALUE)), Long.MAX_VALUE); + } + + @Test(enabled = true) + public void testDNAAndBitSetConversion() { + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("ACGT")), "ACGT"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AGGTGTTGT")), "AGGTGTTGT"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("A")), "A"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("C")), "C"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("G")), "G"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("T")), "T"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("CC")), "CC"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AA")), "AA"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AAAA")), "AAAA"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("CCCCCCCCCCCCCC")), "CCCCCCCCCCCCCC"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("GGGGGGGGGGGGGG")), "GGGGGGGGGGGGGG"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("TTTTTTTTTTTTTT")), "TTTTTTTTTTTTTT"); + 
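// A 2-bit-per-base packing would explain the precision ceiling asserted below: 31 bases * 2 bits = 62 bits, the most that fits in a 64-bit long with the sign bit to spare. This is an inference from these asserts (and from bitSetFrom accepting Long.MAX_VALUE above); MathUtils' actual encoding is not shown in this diff. +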
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("GTAGACCGATCTCAGCTAGT")), "GTAGACCGATCTCAGCTAGT"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AACGTCAATGCAGTCAAGTCAGACGTGGGTT")), "AACGTCAATGCAGTCAAGTCAGACGTGGGTT"); // testing max precision (length == 31) + } + + @Test + public void testApproximateLog10SumLog10() { + Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, 0.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, 0.0), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, -1.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-2.2, -3.5), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, -7.1), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(5.0, 6.2), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(38.1, 16.2), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-38.1, 6.2), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-19.1, -37.1), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-29.1, -27.6), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, -0.23456), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, -17.0101), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), 1e-3); + + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-1.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{0.0, -1.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-2.2, -3.5}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-1.0, -7.1}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{5.0, 6.2}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{38.1, 16.2}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-38.1, 6.2}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-19.1, -37.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-29.1, -27.6}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-0.12345, -0.23456}), 
Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-15.7654, -17.0101}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), 1e-3); + + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{0.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-1.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{0.0, -1.0, -2.5}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0) + Math.pow(10.0, -2.5)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-2.2, -3.5, -1.1}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5) + Math.pow(10.0, -1.1)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-1.0, -7.1, 0.5}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1) + Math.pow(10.0, 0.5)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{5.0, 6.2, 1.3}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2) + Math.pow(10.0, 1.3)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{38.1, 16.2, 18.1}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2) + Math.pow(10.0, 18.1)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-38.1, 6.2, 26.6}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2) + Math.pow(10.0, 26.6)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-19.1, -37.1, -45.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1) + Math.pow(10.0, -45.1)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-29.1, -27.6, -26.2}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-0.12345, -0.23456, -0.34567}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-15.7654, -17.0101, -17.9341}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), 1e-3); + } + + @Test + public void testNormalizeFromLog10() { + Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{0.0, 0.0, -1.0, -1.1, -7.8}, false, true), new double[]{0.0, 0.0, -1.0, -1.1, -7.8})); + Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{-1.0, -1.0, -1.0, -1.1, -7.8}, false, true), new double[]{0.0, 0.0, 0.0, -0.1, -6.8})); + Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{-10.0, -7.8, -10.5, -1.1, -10.0}, false, true), new double[]{-8.9, -6.7, -9.4, 0.0, -8.9})); + + Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{-1.0, -1.0, -1.0, -1.0}), new double[]{0.25, 0.25, 0.25, 0.25})); + Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{-1.0, -3.0, -1.0, -1.0}), new double[]{0.1 * 1.0 / 0.301, 0.001 * 1.0 / 0.301, 0.1 * 1.0 / 0.301, 0.1 * 1.0 / 0.301})); + Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{-1.0, -3.0, -1.0, -2.0}), new double[]{0.1 * 1.0 / 0.211, 0.001 * 1.0 / 0.211, 0.1 * 1.0 / 0.211, 0.01 * 1.0 / 0.211})); + } + + /** + * Private 
function used by testNormalizeFromLog10() + */ + private boolean compareDoubleArrays(double[] b1, double[] b2) { + if( b1.length != b2.length ) { + return false; // sanity check + } + + for( int i=0; i < b1.length; i++ ){ + if ( MathUtils.compareDoubles(b1[i], b2[i]) != 0 ) + return false; + } + return true; + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java index 18108e0a10..16b141bc34 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java @@ -112,8 +112,9 @@ private static boolean isCigarValid(Cigar cigar) { } } - if (startingOp != CigarOperator.DELETION && endingOp != CigarOperator.DELETION && startingOp != CigarOperator.INSERTION && endingOp != CigarOperator.INSERTION) - return true; // we don't accept reads starting or ending in deletions (add any other constraint here) +// if (startingOp != CigarOperator.DELETION && endingOp != CigarOperator.DELETION && startingOp != CigarOperator.INSERTION && endingOp != CigarOperator.INSERTION) + if (startingOp != CigarOperator.DELETION && endingOp != CigarOperator.DELETION) + return true; // we don't accept reads starting or ending in deletions (add any other constraint here) } return false; @@ -190,4 +191,18 @@ public static Cigar invertCigar (Cigar cigar) { return invertedCigar; } + /** + * Checks whether or not the read has any cigar element that is not H or S + * + * @param read the read whose cigar elements are inspected + * @return true if it has any M, I or D, false otherwise + */ + public static boolean readHasNonClippedBases(GATKSAMRecord read) { + for (CigarElement cigarElement : read.getCigar().getCigarElements()) + if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP) + return true; + return false; + } + + } diff --git a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java index 4dad68dc55..bc918c0a4f 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java @@ -30,12 +30,12 @@ import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; +import java.util.HashMap; import java.util.List; /** @@ -59,10 +59,11 @@ public void testHardClipBothEndsByReferenceCoordinates() { int alnStart = read.getAlignmentStart(); int alnEnd = read.getAlignmentEnd(); int readLength = alnStart - alnEnd; - for (int i=0; i<readLength/2; i++) { + for (int i = 0; i < readLength / 2; i++) { GATKSAMRecord clippedRead = ReadClipper.hardClipBothEndsByReferenceCoordinates(read, alnStart + i, alnEnd - i); Assert.assertTrue(clippedRead.getAlignmentStart() >= alnStart + i, String.format("Clipped alignment start is less than original read (minus %d): %s -> %s", i, read.getCigarString(), clippedRead.getCigarString())); Assert.assertTrue(clippedRead.getAlignmentEnd() <= alnEnd + i, String.format("Clipped alignment end is greater than original read (minus %d): %s -> %s", i, read.getCigarString(), clippedRead.getCigarString())); + assertUnclippedLimits(read, clippedRead); } } } @@ -72,12 +73,14 @@ public void testHardClipByReadCoordinates() { for (Cigar cigar : cigarList) {
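// Note: hardClipByReadCoordinates below takes 0-based read offsets -- (0, i) removes the first i + 1 bases and (i, readLength - 1) removes the last readLength - i -- unlike the reference-coordinate variants tested further down. (A reading of the calls in this test, not of ReadClipper's javadoc.)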
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); int readLength = read.getReadLength(); - for (int i=0; i<readLength; i++) { - GATKSAMRecord clipLeft = ReadClipper.hardClipByReadCoordinates(read, 0, i); - Assert.assertTrue(clipLeft.getReadLength() <= readLength - i, String.format("Clipped read length is greater than original read length (minus %d): %s -> %s", i, read.getCigarString(), clipLeft.getCigarString())); + for (int i = 0; i < readLength; i++) { + GATKSAMRecord clipLeft = ReadClipper.hardClipByReadCoordinates(read, 0, i); + Assert.assertTrue(clipLeft.getReadLength() <= readLength - i, String.format("Clipped read length is greater than original read length (minus %d): %s -> %s", i, read.getCigarString(), clipLeft.getCigarString())); + assertUnclippedLimits(read, clipLeft); - GATKSAMRecord clipRight = ReadClipper.hardClipByReadCoordinates(read, i, readLength-1); + GATKSAMRecord clipRight = ReadClipper.hardClipByReadCoordinates(read, i, readLength - 1); Assert.assertTrue(clipRight.getReadLength() <= i, String.format("Clipped read length is greater than original read length (minus %d): %s -> %s", i, read.getCigarString(), clipRight.getCigarString())); + assertUnclippedLimits(read, clipRight); } } } @@ -86,19 +89,27 @@ public void testHardClipByReferenceCoordinates() { for (Cigar cigar : cigarList) { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); - int alnStart = read.getAlignmentStart(); - int alnEnd = read.getAlignmentEnd(); - for (int i=alnStart; i<=alnEnd; i++) { - if (read.getSoftStart() == alnStart) { // we can't test left clipping if the read has hanging soft clips on the left side - GATKSAMRecord clipLeft = (new ReadClipper(read)).hardClipByReferenceCoordinates(alnStart, i); - if (!clipLeft.isEmpty()) - Assert.assertTrue(clipLeft.getAlignmentStart() >= i + 1, String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString())); + int start = read.getSoftStart(); + int stop = read.getSoftEnd(); + +// System.out.println(String.format("CIGAR: %s (%d, %d)", cigar.toString(), start, stop)); + +// if (ReadUtils.readIsEntirelyInsertion(read)) +// System.out.println("debug"); + + for (int i = start; i <= stop; i++) { + GATKSAMRecord clipLeft = (new ReadClipper(read)).hardClipByReferenceCoordinates(-1, i); + if (!clipLeft.isEmpty()) { +// System.out.println(String.format("\t left [%d] %s -> %s ", i-start+1, cigar.toString(), clipLeft.getCigarString())); + Assert.assertTrue(clipLeft.getAlignmentStart() >= Math.min(read.getAlignmentEnd(), i + 1), String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString())); + assertUnclippedLimits(read, clipLeft); } - if (read.getSoftEnd() == alnEnd) { // we can't test right clipping if the read has hanging soft clips on the right side - GATKSAMRecord clipRight = (new ReadClipper(read)).hardClipByReferenceCoordinates(i, alnEnd); - if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) // alnStart > alnEnd if the entire read is a soft clip now. We can't test those. - Assert.assertTrue(clipRight.getAlignmentEnd() <= i - 1, String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString())); + GATKSAMRecord clipRight = (new ReadClipper(read)).hardClipByReferenceCoordinates(i, -1); + if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) { // alnStart > alnEnd if the entire read is a soft clip now. We can't test those.
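+ // The -1 here appears to act as an open-ended sentinel -- (-1, i) clips from the soft start through i, and (i, -1) from i through the soft end -- inferred from how this test drives hardClipByReferenceCoordinates, not from the clipper's documentation.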
+// System.out.println(String.format("\t right [%d] %s -> %s ", i-start+1, cigar.toString(), clipRight.getCigarString())); + Assert.assertTrue(clipRight.getAlignmentEnd() <= Math.max(read.getAlignmentStart(), i - 1), String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString())); + assertUnclippedLimits(read, clipRight); } } } @@ -111,10 +122,14 @@ public void testHardClipByReferenceCoordinatesLeftTail() { int alnStart = read.getAlignmentStart(); int alnEnd = read.getAlignmentEnd(); if (read.getSoftStart() == alnStart) { // we can't test left clipping if the read has hanging soft clips on the left side - for (int i=alnStart; i<=alnEnd; i++) { - GATKSAMRecord clipLeft = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, i); - if (!clipLeft.isEmpty()) - Assert.assertTrue(clipLeft.getAlignmentStart() >= i + 1, String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString())); + for (int i = alnStart; i <= alnEnd; i++) { + GATKSAMRecord clipLeft = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, i); + + if (!clipLeft.isEmpty()) { +// System.out.println(String.format("Left Tail [%d]: %s (%d,%d,%d : %d,%d,%d) -> %s (%d,%d,%d : %d,%d,%d)", i, cigar.toString(), read.getUnclippedStart(), read.getSoftStart(), read.getAlignmentStart(), read.getAlignmentEnd(), read.getSoftEnd(), read.getUnclippedEnd(), clipLeft.getCigarString(), clipLeft.getUnclippedStart(), clipLeft.getSoftStart(), clipLeft.getAlignmentStart(), clipLeft.getAlignmentEnd(), clipLeft.getSoftEnd(), clipLeft.getUnclippedEnd())); + Assert.assertTrue(clipLeft.getAlignmentStart() >= i + 1, String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString())); + assertUnclippedLimits(read, clipLeft); + } } } } @@ -127,10 +142,12 @@ public void testHardClipByReferenceCoordinatesRightTail() { int alnStart = read.getAlignmentStart(); int alnEnd = read.getAlignmentEnd(); if (read.getSoftEnd() == alnEnd) { // we can't test right clipping if the read has hanging soft clips on the right side - for (int i=alnStart; i<=alnEnd; i++) { + for (int i = alnStart; i <= alnEnd; i++) { GATKSAMRecord clipRight = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, i); - if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) // alnStart > alnEnd if the entire read is a soft clip now. We can't test those. + if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) { // alnStart > alnEnd if the entire read is a soft clip now. We can't test those. 
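// Clipping the right tail at reference coordinate i should leave nothing aligned at or past i, so the surviving alignment end can be at most i - 1; the assertion below states exactly that.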
Assert.assertTrue(clipRight.getAlignmentEnd() <= i - 1, String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString())); + assertUnclippedLimits(read, clipRight); + } } } } @@ -145,43 +162,36 @@ public void testHardClipLowQualEnds() { for (Cigar cigar : cigarList) { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); int readLength = read.getReadLength(); - byte [] quals = new byte[readLength]; + byte[] quals = new byte[readLength]; for (int nLowQualBases = 0; nLowQualBases < readLength; nLowQualBases++) { - - // create a read with nLowQualBases in the left tail - Utils.fillArrayWithByte(quals, HIGH_QUAL); + Utils.fillArrayWithByte(quals, HIGH_QUAL); // create a read with nLowQualBases in the left tail for (int addLeft = 0; addLeft < nLowQualBases; addLeft++) quals[addLeft] = LOW_QUAL; read.setBaseQualities(quals); GATKSAMRecord clipLeft = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL); - // Tests - - // Make sure the low qualities are gone - assertNoLowQualBases(clipLeft, LOW_QUAL); + assertUnclippedLimits(read, clipLeft); // Make sure limits haven't changed + assertNoLowQualBases(clipLeft, LOW_QUAL); // Make sure the low qualities are gone + Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, // Make sure only low quality bases were clipped + String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipLeft.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipLeft.getCigarString())); - // Can't run this test with the current contract of no hanging insertions -// Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipLeft.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipLeft.getCigarString())); - // create a read with nLowQualBases in the right tail - Utils.fillArrayWithByte(quals, HIGH_QUAL); + Utils.fillArrayWithByte(quals, HIGH_QUAL); // create a read with nLowQualBases in the right tail for (int addRight = 0; addRight < nLowQualBases; addRight++) quals[readLength - addRight - 1] = LOW_QUAL; read.setBaseQualities(quals); GATKSAMRecord clipRight = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL); - // Tests - - // Make sure the low qualities are gone - assertNoLowQualBases(clipRight, LOW_QUAL); +// System.out.println(String.format("Debug [%d]: %s -> %s / %s", nLowQualBases, cigar.toString(), clipLeft.getCigarString(), clipRight.getCigarString())); - // Make sure we haven't clipped any high quals -- Can't run this test with the current contract of no hanging insertions - //Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipRight.getCigarString())); + assertUnclippedLimits(read, clipRight); // Make sure limits haven't changed + assertNoLowQualBases(clipRight, LOW_QUAL); // Make sure the low qualities are gone + Assert.assertEquals(clipRight.getReadLength(), readLength - nLowQualBases, // Make sure only low quality bases were clipped + String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipRight.getCigarString()));
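+ // Both-tails case: the readLength / 2 guard below keeps the two low-qual runs from overlapping, so exactly 2 * nLowQualBases bases should be clipped in total.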
- // create a read with nLowQualBases in the both tails - if (nLowQualBases <= readLength/2) { - Utils.fillArrayWithByte(quals, HIGH_QUAL); + if (nLowQualBases <= readLength / 2) { + Utils.fillArrayWithByte(quals, HIGH_QUAL); // create a read with nLowQualBases on both tails for (int addBoth = 0; addBoth < nLowQualBases; addBoth++) { quals[addBoth] = LOW_QUAL; quals[readLength - addBoth - 1] = LOW_QUAL; @@ -189,83 +199,25 @@ public void testHardClipLowQualEnds() { read.setBaseQualities(quals); GATKSAMRecord clipBoth = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL); - // Tests - - // Make sure the low qualities are gone - assertNoLowQualBases(clipBoth, LOW_QUAL); - - // Can't run this test with the current contract of no hanging insertions - //Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - (2*nLowQualBases), read.getCigarString(), clipBoth.getCigarString())); + assertUnclippedLimits(read, clipBoth); // Make sure limits haven't changed + assertNoLowQualBases(clipBoth, LOW_QUAL); // Make sure the low qualities are gone + Assert.assertEquals(clipBoth.getReadLength(), readLength - 2 * nLowQualBases, // Make sure only low quality bases were clipped + String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipBoth.getReadLength(), readLength - (2 * nLowQualBases), read.getCigarString(), clipBoth.getCigarString())); } } -// logger.warn(String.format("Testing %s for all combinations of low/high qual... PASSED", read.getCigarString())); } - - // ONE OFF Testing clipping that ends inside an insertion ( Ryan's bug ) - final byte[] BASES = {'A','C','G','T','A','C','G','T'}; - final byte[] QUALS = {2, 2, 2, 2, 20, 20, 20, 2}; - final String CIGAR = "1S1M5I1S"; - - final byte[] CLIPPED_BASES = {}; - final byte[] CLIPPED_QUALS = {}; - final String CLIPPED_CIGAR = ""; - - GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(BASES, QUALS, CIGAR); - GATKSAMRecord expected = ArtificialSAMUtils.createArtificialRead(CLIPPED_BASES, CLIPPED_QUALS, CLIPPED_CIGAR); - - ReadClipperTestUtils.assertEqualReads(ReadClipper.hardClipLowQualEnds(read, (byte) 2), expected); } @Test(enabled = true) public void testHardClipSoftClippedBases() { - - // Generate a list of cigars to test for (Cigar cigar : cigarList) { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); GATKSAMRecord clippedRead = ReadClipper.hardClipSoftClippedBases(read); + CigarCounter original = new CigarCounter(read); + CigarCounter clipped = new CigarCounter(clippedRead); - int sumHardClips = 0; - int sumMatches = 0; - boolean tail = true; - for (CigarElement element : read.getCigar().getCigarElements()) { - // Assuming cigars are well formed, if we see S or H, it means we're on the tail (left or right) - if (element.getOperator() == CigarOperator.HARD_CLIP || element.getOperator() == CigarOperator.SOFT_CLIP) - tail = true; - - // Adds all H, S and D's (next to hard/soft clips). - // All these should be hard clips after clipping.
- if (tail && (element.getOperator() == CigarOperator.HARD_CLIP || element.getOperator() == CigarOperator.SOFT_CLIP || element.getOperator() == CigarOperator.DELETION)) - sumHardClips += element.getLength(); - - // this means we're no longer on the tail (insertions can still potentially be the tail because - // of the current contract of clipping out hanging insertions - else if (element.getOperator() != CigarOperator.INSERTION) - tail = false; - - // Adds all matches to verify that they remain the same after clipping - if (element.getOperator() == CigarOperator.MATCH_OR_MISMATCH) - sumMatches += element.getLength(); - } - - for (CigarElement element : clippedRead.getCigar().getCigarElements()) { - // Test if clipped read has Soft Clips (shouldn't have any!) - Assert.assertTrue( element.getOperator() != CigarOperator.SOFT_CLIP, String.format("Cigar %s -> %s -- FAILED (resulting cigar has soft clips)", read.getCigarString(), clippedRead.getCigarString())); - - // Keep track of the total number of Hard Clips after clipping to make sure everything was accounted for - if (element.getOperator() == CigarOperator.HARD_CLIP) - sumHardClips -= element.getLength(); - - // Make sure all matches are still there - if (element.getOperator() == CigarOperator.MATCH_OR_MISMATCH) - sumMatches -= element.getLength(); - } - Assert.assertTrue( sumHardClips == 0, String.format("Cigar %s -> %s -- FAILED (number of hard clips mismatched by %d)", read.getCigarString(), clippedRead.getCigarString(), sumHardClips)); - Assert.assertTrue( sumMatches == 0, String.format("Cigar %s -> %s -- FAILED (number of matches mismatched by %d)", read.getCigarString(), clippedRead.getCigarString(), sumMatches)); - - -// logger.warn(String.format("Cigar %s -> %s -- PASSED!", read.getCigarString(), clippedRead.getCigarString())); + assertUnclippedLimits(read, clippedRead); // Make sure limits haven't changed + original.assertHardClippingSoftClips(clipped); // Make sure we have only clipped SOFT_CLIPS } } @@ -276,38 +228,39 @@ public void testHardClipLeadingInsertions() { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); GATKSAMRecord clippedRead = ReadClipper.hardClipLeadingInsertions(read); + assertUnclippedLimits(read, clippedRead); // Make sure limits haven't changed + int expectedLength = read.getReadLength() - leadingCigarElementLength(read.getCigar(), CigarOperator.INSERTION); if (cigarHasElementsDifferentThanInsertionsAndHardClips(read.getCigar())) expectedLength -= leadingCigarElementLength(ReadClipperTestUtils.invertCigar(read.getCigar()), CigarOperator.INSERTION); - if (! 
clippedRead.isEmpty()) { + Assert.assertEquals(expectedLength, clippedRead.getReadLength(), String.format("%s -> %s", read.getCigarString(), clippedRead.getCigarString())); // check that everything else is still there Assert.assertFalse(startsWithInsertion(clippedRead.getCigar())); // check that the insertions are gone - } - else + } else Assert.assertTrue(expectedLength == 0, String.format("expected length: %d", expectedLength)); // check that the read was expected to be fully clipped } } } @Test(enabled = true) - public void testRevertSoftClippedBases() - { - for (Cigar cigar: cigarList) { + public void testRevertSoftClippedBases() { + for (Cigar cigar : cigarList) { final int leadingSoftClips = leadingCigarElementLength(cigar, CigarOperator.SOFT_CLIP); final int tailSoftClips = leadingCigarElementLength(ReadClipperTestUtils.invertCigar(cigar), CigarOperator.SOFT_CLIP); final GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); final GATKSAMRecord unclipped = ReadClipper.revertSoftClippedBases(read); - if ( leadingSoftClips > 0 || tailSoftClips > 0) { + assertUnclippedLimits(read, unclipped); // Make sure limits haven't changed + + if (leadingSoftClips > 0 || tailSoftClips > 0) { final int expectedStart = read.getAlignmentStart() - leadingSoftClips; final int expectedEnd = read.getAlignmentEnd() + tailSoftClips; Assert.assertEquals(unclipped.getAlignmentStart(), expectedStart); Assert.assertEquals(unclipped.getAlignmentEnd(), expectedEnd); - } - else + } else Assert.assertEquals(read.getCigarString(), unclipped.getCigarString()); } } @@ -315,12 +268,25 @@ public void testRevertSoftClippedBases() private void assertNoLowQualBases(GATKSAMRecord read, byte low_qual) { if (!read.isEmpty()) { - byte [] quals = read.getBaseQualities(); - for (int i=0; i<quals.length; i++) - Assert.assertFalse(quals[i] <= low_qual); + byte[] quals = read.getBaseQualities(); + for (int i = 0; i < quals.length; i++) + Assert.assertFalse(quals[i] <= low_qual); } } + + private void assertUnclippedLimits(GATKSAMRecord original, GATKSAMRecord clipped) { + if (ReadClipperTestUtils.readHasNonClippedBases(clipped)) { + Assert.assertEquals(original.getUnclippedStart(), clipped.getUnclippedStart()); + Assert.assertEquals(original.getUnclippedEnd(), clipped.getUnclippedEnd()); + } + } + private boolean startsWithInsertion(Cigar cigar) { return leadingCigarElementLength(cigar, CigarOperator.INSERTION) > 0; } @@ -335,10 +301,46 @@ private int leadingCigarElementLength(Cigar cigar, CigarOperator operator) { return 0; } - private boolean cigarHasElementsDifferentThanInsertionsAndHardClips (Cigar cigar) { + private boolean cigarHasElementsDifferentThanInsertionsAndHardClips(Cigar cigar) { for (CigarElement cigarElement : cigar.getCigarElements()) if (cigarElement.getOperator() != CigarOperator.INSERTION && cigarElement.getOperator() != CigarOperator.HARD_CLIP) return true; return false; } + + private class CigarCounter { + private HashMap<CigarOperator, Integer> counter; + + public Integer getCounterForOp(CigarOperator operator) { + return counter.get(operator); + } + + public CigarCounter(GATKSAMRecord read) { + CigarOperator[] operators = CigarOperator.values(); + counter = new HashMap<CigarOperator, Integer>(operators.length); + + for (CigarOperator op : operators) + counter.put(op, 0); + + for (CigarElement cigarElement : read.getCigar().getCigarElements()) + counter.put(cigarElement.getOperator(), counter.get(cigarElement.getOperator()) + cigarElement.getLength()); + } + + public boolean assertHardClippingSoftClips(CigarCounter clipped) { + for (CigarOperator op : counter.keySet()) { + if (op == CigarOperator.HARD_CLIP || op == CigarOperator.SOFT_CLIP) { + int counterTotal = counter.get(CigarOperator.HARD_CLIP) + counter.get(CigarOperator.SOFT_CLIP); + int clippedHard = clipped.getCounterForOp(CigarOperator.HARD_CLIP); + int clippedSoft = clipped.getCounterForOp(CigarOperator.SOFT_CLIP); + + Assert.assertEquals(counterTotal, clippedHard); + Assert.assertTrue(clippedSoft == 0); + } else + Assert.assertEquals(counter.get(op), clipped.getCounterForOp(op)); + } + return true; + } + + } + } \ No newline at end of file diff --git
a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFCodecUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFCodecUnitTest.java new file mode 100644 index 0000000000..7681ed7d1a --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFCodecUnitTest.java @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +// our package +package org.broadinstitute.sting.utils.codecs.vcf; + + +// the imports for unit testing. + + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.variantcontext.*; +import org.testng.Assert; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + + +public class VCFCodecUnitTest extends BaseTest { + + // -------------------------------------------------------------------------------- + // + // Provider + // + // -------------------------------------------------------------------------------- + + private class AlleleClippingTestProvider extends TestDataProvider { + final String ref; + final List alleles = new ArrayList(); + final int expectedClip; + + private AlleleClippingTestProvider(final int expectedClip, final String ref, final String ... 
alleles) { + super(AlleleClippingTestProvider.class); + this.ref = ref; + for ( final String allele : alleles ) + this.alleles.add(Allele.create(allele)); + this.expectedClip = expectedClip; + } + + @Override + public String toString() { + return String.format("ref=%s allele=%s reverse clip %d", ref, alleles, expectedClip); + } + } + + @DataProvider(name = "AlleleClippingTestProvider") + public Object[][] MakeAlleleClippingTest() { + // pair clipping + new AlleleClippingTestProvider(0, "ATT", "CCG"); + new AlleleClippingTestProvider(1, "ATT", "CCT"); + new AlleleClippingTestProvider(2, "ATT", "CTT"); + new AlleleClippingTestProvider(2, "ATT", "ATT"); // cannot completely clip allele + + // triplets + new AlleleClippingTestProvider(0, "ATT", "CTT", "CGG"); + new AlleleClippingTestProvider(1, "ATT", "CTT", "CGT"); // the T can go + new AlleleClippingTestProvider(2, "ATT", "CTT", "CTT"); // both Ts can go + + return AlleleClippingTestProvider.getTests(AlleleClippingTestProvider.class); + } + + + @Test(dataProvider = "AlleleClippingTestProvider") + public void TestAlleleClipping(AlleleClippingTestProvider cfg) { + int result = AbstractVCFCodec.computeReverseClipping(cfg.alleles, cfg.ref, 0, 1); + Assert.assertEquals(result, cfg.expectedClip); + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java index c8a0c0ed66..b7bbae68da 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java @@ -9,7 +9,7 @@ public class VCFIntegrationTest extends WalkerTest { - @Test + @Test(enabled = true) public void testReadingAndWritingWitHNoChanges() { String md5ofInputVCF = "a990ba187a69ca44cb9bc2bb44d00447"; @@ -25,4 +25,28 @@ public void testReadingAndWritingWitHNoChanges() { WalkerTestSpec spec2 = new WalkerTestSpec(test2, 1, Arrays.asList(md5ofInputVCF)); executeTest("Test Variants To VCF from new output", spec2); } + + @Test + // See https://getsatisfaction.com/gsa/topics/support_vcf_4_1_structural_variation_breakend_alleles?utm_content=topic_link&utm_medium=email&utm_source=new_topic + public void testReadingAndWritingBreakpointAlleles() { + String testVCF = testDir + "breakpoint-example.vcf"; + //String testVCF = validationDataLocation + "multiallelic.vcf"; + + String baseCommand = "-R " + b37KGReference + " -NO_HEADER -o %s "; + + String test1 = baseCommand + "-T SelectVariants -V " + testVCF; + WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("76075307afd26b4db6234795d9fb3c2f")); + executeTest("Test reading and writing breakpoint VCF", spec1); + } + + @Test + public void testReadingAndWritingSamtools() { + String testVCF = validationDataLocation + "samtools.vcf"; + + String baseCommand = "-R " + b37KGReference + " -NO_HEADER -o %s "; + + String test1 = baseCommand + "-T SelectVariants -V " + testVCF; + WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("87d5b180ef5f9dc5aaee4b02601b43a2")); + executeTest("Test reading and writing samtools vcf", spec1); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/crypt/CryptUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/crypt/CryptUtilsUnitTest.java new file mode 100644 index 0000000000..f5cfea148e --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/crypt/CryptUtilsUnitTest.java @@ -0,0 +1,198 @@ +/* 
+ * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.crypt; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.SkipException; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.security.Key; +import java.security.KeyPair; +import java.security.PrivateKey; +import java.security.PublicKey; +import java.util.Arrays; + +public class CryptUtilsUnitTest extends BaseTest { + + @Test + public void testGenerateValidKeyPairWithDefaultSettings() { + KeyPair keyPair = CryptUtils.generateKeyPair(); + Assert.assertTrue(CryptUtils.keysDecryptEachOther(keyPair.getPrivate(), keyPair.getPublic())); + } + + @DataProvider( name = "InvalidKeyPairSettings" ) + public Object[][] invalidKeyPairSettingsDataProvider() { + return new Object[][] { + { -1, CryptUtils.DEFAULT_ENCRYPTION_ALGORITHM, CryptUtils.DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM}, + { CryptUtils.DEFAULT_KEY_LENGTH, "Made-up algorithm", CryptUtils.DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM}, + { CryptUtils.DEFAULT_KEY_LENGTH, CryptUtils.DEFAULT_ENCRYPTION_ALGORITHM, "Made-up algorithm"} + }; + } + + @Test( dataProvider = "InvalidKeyPairSettings", expectedExceptions = ReviewedStingException.class ) + public void testGenerateKeyPairWithInvalidSettings( int keyLength, String encryptionAlgorithm, String randomNumberGenerationAlgorithm ) { + KeyPair keyPair = CryptUtils.generateKeyPair(keyLength, encryptionAlgorithm, randomNumberGenerationAlgorithm); + } + + @Test + public void testGATKMasterKeyPairMutualDecryption() { + if ( gatkPrivateKeyExistsButReadPermissionDenied() ) { + throw new SkipException(String.format("Skipping test %s because we do not have permission to read the GATK private key", + "testGATKMasterKeyPairMutualDecryption")); + } + + Assert.assertTrue(CryptUtils.keysDecryptEachOther(CryptUtils.loadGATKMasterPrivateKey(), CryptUtils.loadGATKMasterPublicKey())); + } + + @Test + public void testGATKMasterPrivateKeyWithDistributedPublicKeyMutualDecryption() { + if ( gatkPrivateKeyExistsButReadPermissionDenied() ) { + throw new SkipException(String.format("Skipping test %s because we do not have permission to read the GATK 
private key", + "testGATKMasterPrivateKeyWithDistributedPublicKeyMutualDecryption")); + } + + Assert.assertTrue(CryptUtils.keysDecryptEachOther(CryptUtils.loadGATKMasterPrivateKey(), CryptUtils.loadGATKDistributedPublicKey())); + } + + @Test + public void testKeyPairWriteThenRead() { + KeyPair keyPair = CryptUtils.generateKeyPair(); + File privateKeyFile = createTempFile("testKeyPairWriteThenRead_private", "key"); + File publicKeyFile = createTempFile("testKeyPairWriteThenRead_public", "key"); + + CryptUtils.writeKeyPair(keyPair, privateKeyFile, publicKeyFile); + + assertKeysAreEqual(keyPair.getPrivate(), CryptUtils.readPrivateKey(privateKeyFile)); + assertKeysAreEqual(keyPair.getPublic(), CryptUtils.readPublicKey(publicKeyFile)); + } + + @Test + public void testPublicKeyWriteThenReadFromFile() { + File keyFile = createTempFile("testPublicKeyWriteThenReadFromFile", "key"); + PublicKey publicKey = CryptUtils.generateKeyPair().getPublic(); + + CryptUtils.writeKey(publicKey, keyFile); + + assertKeysAreEqual(publicKey, CryptUtils.readPublicKey(keyFile)); + } + + @Test + public void testPublicKeyWriteThenReadFromStream() throws IOException { + File keyFile = createTempFile("testPublicKeyWriteThenReadFromStream", "key"); + PublicKey publicKey = CryptUtils.generateKeyPair().getPublic(); + + CryptUtils.writeKey(publicKey, keyFile); + + assertKeysAreEqual(publicKey, CryptUtils.readPublicKey(new FileInputStream(keyFile))); + } + + @Test + public void testPrivateKeyWriteThenReadFromFile() { + File keyFile = createTempFile("testPrivateKeyWriteThenReadFromFile", "key"); + PrivateKey privateKey = CryptUtils.generateKeyPair().getPrivate(); + + CryptUtils.writeKey(privateKey, keyFile); + + assertKeysAreEqual(privateKey, CryptUtils.readPrivateKey(keyFile)); + } + + @Test + public void testPrivateKeyWriteThenReadFromStream() throws IOException { + File keyFile = createTempFile("testPrivateKeyWriteThenReadFromStream", "key"); + PrivateKey privateKey = CryptUtils.generateKeyPair().getPrivate(); + + CryptUtils.writeKey(privateKey, keyFile); + + assertKeysAreEqual(privateKey, CryptUtils.readPrivateKey(new FileInputStream(keyFile))); + } + + @Test( expectedExceptions = UserException.CouldNotReadInputFile.class ) + public void testReadNonExistentPublicKey() { + File nonExistentFile = new File("jdshgkdfhg.key"); + Assert.assertFalse(nonExistentFile.exists()); + + CryptUtils.readPublicKey(nonExistentFile); + } + + @Test( expectedExceptions = UserException.CouldNotReadInputFile.class ) + public void testReadNonExistentPrivateKey() { + File nonExistentFile = new File("jdshgkdfhg.key"); + Assert.assertFalse(nonExistentFile.exists()); + + CryptUtils.readPrivateKey(nonExistentFile); + } + + @Test + public void testDecodePublicKey() { + PublicKey originalKey = CryptUtils.generateKeyPair().getPublic(); + PublicKey decodedKey = CryptUtils.decodePublicKey(originalKey.getEncoded(), CryptUtils.DEFAULT_ENCRYPTION_ALGORITHM); + assertKeysAreEqual(originalKey, decodedKey); + } + + @Test + public void testDecodePrivateKey() { + PrivateKey originalKey = CryptUtils.generateKeyPair().getPrivate(); + PrivateKey decodedKey = CryptUtils.decodePrivateKey(originalKey.getEncoded(), CryptUtils.DEFAULT_ENCRYPTION_ALGORITHM); + assertKeysAreEqual(originalKey, decodedKey); + } + + @Test + public void testLoadGATKMasterPrivateKey() { + if ( gatkPrivateKeyExistsButReadPermissionDenied() ) { + throw new SkipException(String.format("Skipping test %s because we do not have permission to read the GATK private key", + 
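
keysDecryptEachOther(...) is the workhorse assertion in this file: it checks that a private and a public key actually form a pair. The diff does not show its implementation, so the following is only a plausible sketch under the default RSA settings: encrypt a small message with one key, decrypt with the other, and compare.

import java.security.Key;
import java.security.KeyPair;
import java.security.KeyPairGenerator;
import java.util.Arrays;
import javax.crypto.Cipher;

final class MutualDecryptSketch {
    // Assumption: "decrypt each other" means an encrypt/decrypt round trip across the pair.
    static boolean keysDecryptEachOther(Key encryptKey, Key decryptKey) throws Exception {
        byte[] plaintext = "GATK".getBytes("UTF-8");
        Cipher cipher = Cipher.getInstance("RSA");
        cipher.init(Cipher.ENCRYPT_MODE, encryptKey);
        byte[] ciphertext = cipher.doFinal(plaintext);
        cipher.init(Cipher.DECRYPT_MODE, decryptKey);
        return Arrays.equals(plaintext, cipher.doFinal(ciphertext));
    }

    public static void main(String[] args) throws Exception {
        KeyPairGenerator gen = KeyPairGenerator.getInstance("RSA");
        gen.initialize(2048);
        KeyPair pair = gen.generateKeyPair();
        // true only when the two keys belong to the same pair
        System.out.println(keysDecryptEachOther(pair.getPrivate(), pair.getPublic()));
    }
}
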
"testLoadGATKMasterPrivateKey")); + } + + PrivateKey gatkMasterPrivateKey = CryptUtils.loadGATKMasterPrivateKey(); + } + + @Test + public void testLoadGATKMasterPublicKey() { + PublicKey gatkMasterPublicKey = CryptUtils.loadGATKMasterPublicKey(); + } + + @Test + public void testLoadGATKDistributedPublicKey() { + PublicKey gatkDistributedPublicKey = CryptUtils.loadGATKDistributedPublicKey(); + } + + private void assertKeysAreEqual( Key originalKey, Key keyFromDisk ) { + Assert.assertTrue(Arrays.equals(originalKey.getEncoded(), keyFromDisk.getEncoded())); + Assert.assertEquals(originalKey.getAlgorithm(), keyFromDisk.getAlgorithm()); + Assert.assertEquals(originalKey.getFormat(), keyFromDisk.getFormat()); + } + + private boolean gatkPrivateKeyExistsButReadPermissionDenied() { + File gatkPrivateKey = new File(CryptUtils.GATK_MASTER_PRIVATE_KEY_FILE); + return gatkPrivateKey.exists() && ! gatkPrivateKey.canRead(); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/crypt/GATKKeyIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/crypt/GATKKeyIntegrationTest.java new file mode 100644 index 0000000000..8fb75ef381 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/crypt/GATKKeyIntegrationTest.java @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.crypt; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class GATKKeyIntegrationTest extends WalkerTest { + + public static final String BASE_COMMAND = String.format("-T PrintReads -R %s -I %s -o %%s", + testDir + "exampleFASTA.fasta", + testDir + "exampleBAM.bam"); + public static final String MD5_UPON_SUCCESSFUL_RUN = "b9dc5bf6753ca2819e70b056eaf61258"; + + + private void runGATKKeyTest ( String testName, String etArg, String keyArg, Class expectedException, String md5 ) { + String command = BASE_COMMAND + String.format(" %s %s", etArg, keyArg); + + WalkerTestSpec spec = expectedException != null ? 
+ new WalkerTestSpec(command, 1, expectedException) : + new WalkerTestSpec(command, 1, Arrays.asList(md5)); + + spec.disableImplicitArgs(); // Turn off automatic inclusion of -et/-K args by WalkerTest + executeTest(testName, spec); + } + + @Test + public void testValidKeyNoET() { + runGATKKeyTest("testValidKeyNoET", + "-et " + GATKRunReport.PhoneHomeOption.NO_ET, + "-K " + keysDataLocation + "valid.key", + null, + MD5_UPON_SUCCESSFUL_RUN); + } + + @Test + public void testValidKeyETStdout() { + runGATKKeyTest("testValidKeyETStdout", + "-et " + GATKRunReport.PhoneHomeOption.STDOUT, + "-K " + keysDataLocation + "valid.key", + null, + MD5_UPON_SUCCESSFUL_RUN); + } + + @Test + public void testValidKeyETStandard() { + runGATKKeyTest("testValidKeyETStandard", + "", + "-K " + keysDataLocation + "valid.key", + null, + MD5_UPON_SUCCESSFUL_RUN); + } + + @Test + public void testNoKeyNoET() { + runGATKKeyTest("testNoKeyNoET", + "-et " + GATKRunReport.PhoneHomeOption.NO_ET, + "", + UserException.class, + null); + } + + @Test + public void testNoKeyETStdout() { + runGATKKeyTest("testNoKeyETStdout", + "-et " + GATKRunReport.PhoneHomeOption.STDOUT, + "", + UserException.class, + null); + } + + @Test + public void testNoKeyETStandard() { + runGATKKeyTest("testNoKeyETStandard", + "", + "", + null, + MD5_UPON_SUCCESSFUL_RUN); + } + + @Test + public void testRevokedKey() { + runGATKKeyTest("testRevokedKey", + "-et " + GATKRunReport.PhoneHomeOption.NO_ET, + "-K " + keysDataLocation + "revoked.key", + UserException.KeySignatureVerificationException.class, + null); + } + + @DataProvider(name = "CorruptKeyTestData") + public Object[][] corruptKeyDataProvider() { + return new Object[][] { + { "corrupt_empty.key", UserException.UnreadableKeyException.class }, + { "corrupt_single_byte_file.key", UserException.UnreadableKeyException.class }, + { "corrupt_random_contents.key", UserException.UnreadableKeyException.class }, + { "corrupt_single_byte_deletion.key", UserException.UnreadableKeyException.class }, + { "corrupt_single_byte_insertion.key", UserException.UnreadableKeyException.class }, + { "corrupt_single_byte_change.key", UserException.UnreadableKeyException.class }, + { "corrupt_multi_byte_deletion.key", UserException.UnreadableKeyException.class }, + { "corrupt_multi_byte_insertion.key", UserException.UnreadableKeyException.class }, + { "corrupt_multi_byte_change.key", UserException.UnreadableKeyException.class }, + { "corrupt_bad_isize_field.key", UserException.UnreadableKeyException.class }, + { "corrupt_bad_crc.key", UserException.UnreadableKeyException.class }, + { "corrupt_no_email_address.key", UserException.UnreadableKeyException.class }, + { "corrupt_no_sectional_delimiter.key", UserException.KeySignatureVerificationException.class }, + { "corrupt_no_signature.key", UserException.UnreadableKeyException.class }, + { "corrupt_bad_signature.key", UserException.KeySignatureVerificationException.class }, + { "corrupt_non_gzipped_valid_key.key", UserException.UnreadableKeyException.class } + }; + } + + @Test(dataProvider = "CorruptKeyTestData") + public void testCorruptKey ( String corruptKeyName, Class expectedException ) { + runGATKKeyTest(String.format("testCorruptKey (%s)", corruptKeyName), + "-et " + GATKRunReport.PhoneHomeOption.NO_ET, + "-K " + keysDataLocation + corruptKeyName, + expectedException, + null); + } + + @Test + public void testCorruptButNonRequiredKey() { + runGATKKeyTest("testCorruptButNonRequiredKey", + "", + "-K " + keysDataLocation + "corrupt_random_contents.key", + null, + 
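
The corrupt-key cases in the data provider above rely on checked-in fixture files (corrupt_single_byte_change.key and friends). As an illustration only, such a fixture can be produced by flipping one byte of a valid key; the helper below is hypothetical and not part of this commit.

import java.io.File;
import java.nio.file.Files;

final class CorruptKeySketch {
    // Hypothetical fixture generator: XOR one byte of a valid key file
    // to produce something like corrupt_single_byte_change.key.
    static void corruptSingleByte(File validKey, File corruptKey, int offset) throws Exception {
        byte[] bytes = Files.readAllBytes(validKey.toPath());
        bytes[offset] ^= 0x01; // single-byte change
        Files.write(corruptKey.toPath(), bytes);
    }
}
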
MD5_UPON_SUCCESSFUL_RUN); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/crypt/GATKKeyUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/crypt/GATKKeyUnitTest.java new file mode 100644 index 0000000000..660f957969 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/crypt/GATKKeyUnitTest.java @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.crypt; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.SkipException; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.io.File; +import java.security.KeyPair; +import java.security.PrivateKey; +import java.security.PublicKey; + +public class GATKKeyUnitTest extends BaseTest { + + @Test + public void testCreateGATKKeyUsingMasterKeyPair() { + if ( gatkPrivateKeyExistsButReadPermissionDenied() ) { + throw new SkipException(String.format("Skipping test %s because we do not have permission to read the GATK private key", + "testCreateGATKKeyUsingMasterKeyPair")); + } + + PrivateKey masterPrivateKey = CryptUtils.loadGATKMasterPrivateKey(); + PublicKey masterPublicKey = CryptUtils.loadGATKMasterPublicKey(); + + // We should be able to create a valid GATKKey using our master key pair: + GATKKey key = new GATKKey(masterPrivateKey, masterPublicKey, "foo@bar.com"); + Assert.assertTrue(key.isValid()); + } + + @Test + public void testCreateGATKKeyUsingMasterPrivateKeyAndDistributedPublicKey() { + if ( gatkPrivateKeyExistsButReadPermissionDenied() ) { + throw new SkipException(String.format("Skipping test %s because we do not have permission to read the GATK private key", + "testCreateGATKKeyUsingMasterPrivateKeyAndDistributedPublicKey")); + } + + PrivateKey masterPrivateKey = CryptUtils.loadGATKMasterPrivateKey(); + PublicKey distributedPublicKey = CryptUtils.loadGATKDistributedPublicKey(); + + // We should also be able to create a valid GATKKey using our master private + // key and the public key we distribute with the GATK: + GATKKey key = new GATKKey(masterPrivateKey, distributedPublicKey, "foo@bar.com"); + Assert.assertTrue(key.isValid()); + } + + @Test( expectedExceptions = ReviewedStingException.class ) + public void testKeyPairMismatch() { + KeyPair firstKeyPair = 
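
A GATKKey couples an e-mail address to a signature made with the master private key; testEncryptionAlgorithmMismatch names SHA1withRSA as the signature algorithm. Assuming isValid() ultimately verifies that signature against the public key (an assumption, since GATKKey's body is not in this diff), the sign/verify cycle in plain JCA looks like:

import java.nio.charset.StandardCharsets;
import java.security.PrivateKey;
import java.security.PublicKey;
import java.security.Signature;

final class EmailSignatureSketch {
    static byte[] sign(String email, PrivateKey privateKey) throws Exception {
        Signature signer = Signature.getInstance("SHA1withRSA");
        signer.initSign(privateKey);
        signer.update(email.getBytes(StandardCharsets.UTF_8));
        return signer.sign();
    }

    static boolean verify(String email, byte[] signature, PublicKey publicKey) throws Exception {
        Signature verifier = Signature.getInstance("SHA1withRSA");
        verifier.initVerify(publicKey);
        verifier.update(email.getBytes(StandardCharsets.UTF_8));
        return verifier.verify(signature);
    }
}

Note that Signature.initSign rejects a DSA private key for SHA1withRSA with an InvalidKeyException, which is presumably the failure testEncryptionAlgorithmMismatch expects to surface as a ReviewedStingException.
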
CryptUtils.generateKeyPair(); + KeyPair secondKeyPair = CryptUtils.generateKeyPair(); + + // Attempting to create a GATK Key with private and public keys that aren't part of the + // same key pair should immediately trigger a validation failure: + GATKKey key = new GATKKey(firstKeyPair.getPrivate(), secondKeyPair.getPublic(), "foo@bar.com"); + } + + @Test( expectedExceptions = ReviewedStingException.class ) + public void testEncryptionAlgorithmMismatch() { + KeyPair keyPair = CryptUtils.generateKeyPair(CryptUtils.DEFAULT_KEY_LENGTH, "DSA", CryptUtils.DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM); + + // Attempting to use a DSA private key to create an RSA signature should throw an error: + GATKKey key = new GATKKey(keyPair.getPrivate(), keyPair.getPublic(), "foo@bar.com", "SHA1withRSA"); + } + + @Test( expectedExceptions = UserException.class ) + public void testInvalidEmailAddress() { + String emailAddressWithNulByte = new String(new byte[] { 0 }); + KeyPair keyPair = CryptUtils.generateKeyPair(); + + // Email addresses cannot contain the NUL byte, since it's used as a sectional delimiter in the key file: + GATKKey key = new GATKKey(keyPair.getPrivate(), keyPair.getPublic(), emailAddressWithNulByte); + } + + @Test + public void testCreateGATKKeyFromValidKeyFile() { + GATKKey key = new GATKKey(CryptUtils.loadGATKDistributedPublicKey(), new File(keysDataLocation + "valid.key")); + Assert.assertTrue(key.isValid()); + } + + @Test( expectedExceptions = UserException.UnreadableKeyException.class ) + public void testCreateGATKKeyFromCorruptKeyFile() { + GATKKey key = new GATKKey(CryptUtils.loadGATKDistributedPublicKey(), new File(keysDataLocation + "corrupt_random_contents.key")); + } + + @Test + public void testCreateGATKKeyFromRevokedKeyFile() { + GATKKey key = new GATKKey(CryptUtils.loadGATKDistributedPublicKey(), new File(keysDataLocation + "revoked.key")); + Assert.assertFalse(key.isValid()); + } + + @Test( expectedExceptions = UserException.CouldNotReadInputFile.class ) + public void testCreateGATKKeyFromNonExistentFile() { + File nonExistentFile = new File("ghfdkgsdhg.key"); + Assert.assertFalse(nonExistentFile.exists()); + + GATKKey key = new GATKKey(CryptUtils.loadGATKDistributedPublicKey(), nonExistentFile); + } + + private boolean gatkPrivateKeyExistsButReadPermissionDenied() { + File gatkPrivateKey = new File(CryptUtils.GATK_MASTER_PRIVATE_KEY_FILE); + return gatkPrivateKey.exists() && ! 
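
Back in CryptUtilsUnitTest, testDecodePublicKey and testDecodePrivateKey round-trip keys through getEncoded(). Java's default encodings, which assertKeysAreEqual's getFormat() check also leans on, are X.509 for public keys and PKCS#8 for private keys, so a minimal decode (illustrative; CryptUtils.decodePublicKey/decodePrivateKey may differ in error handling) is:

import java.security.KeyFactory;
import java.security.PrivateKey;
import java.security.PublicKey;
import java.security.spec.PKCS8EncodedKeySpec;
import java.security.spec.X509EncodedKeySpec;

final class KeyDecodeSketch {
    static PublicKey decodePublic(byte[] encoded, String algorithm) throws Exception {
        // public keys serialize in X.509 SubjectPublicKeyInfo form
        return KeyFactory.getInstance(algorithm).generatePublic(new X509EncodedKeySpec(encoded));
    }

    static PrivateKey decodePrivate(byte[] encoded, String algorithm) throws Exception {
        // private keys serialize in PKCS#8 form
        return KeyFactory.getInstance(algorithm).generatePrivate(new PKCS8EncodedKeySpec(encoded));
    }
}
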
gatkPrivateKey.canRead(); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index a9035ffd92..0a8caa8cc9 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -8,13 +8,12 @@ import org.broadinstitute.sting.commandline.IntervalBinding; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.testng.Assert; -import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; - +import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -341,7 +340,7 @@ public void testOverlappingIntervalsFromSameSourceWithIntersection() { @Test public void testGetContigLengths() { - Map lengths = IntervalUtils.getContigSizes(new File(BaseTest.hg18Reference)); + Map lengths = IntervalUtils.getContigSizes(new File(BaseTest.hg18Reference)); Assert.assertEquals((long)lengths.get("chr1"), 247249719); Assert.assertEquals((long)lengths.get("chr2"), 242951149); Assert.assertEquals((long)lengths.get("chr3"), 199501827); diff --git a/public/java/test/org/broadinstitute/sting/utils/io/IOUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/io/IOUtilsUnitTest.java index 4caf7f485c..941d2b14c5 100644 --- a/public/java/test/org/broadinstitute/sting/utils/io/IOUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/io/IOUtilsUnitTest.java @@ -1,14 +1,44 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + package org.broadinstitute.sting.utils.io; import org.apache.commons.io.FileUtils; import org.broadinstitute.sting.BaseTest; import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; import java.io.IOException; import java.util.Arrays; import java.util.List; +import java.util.Random; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; public class IOUtilsUnitTest extends BaseTest { @@ -194,4 +224,102 @@ public void testResourceProperties() { Assert.assertEquals(resource.getPath(), "foo"); Assert.assertEquals(resource.getRelativeClass(), Resource.class); } + + @Test + public void testIsSpecialFile() { + Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev"))); + Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/null"))); + Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/full"))); + Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/stdout"))); + Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/stderr"))); + Assert.assertFalse(IOUtils.isSpecialFile(null)); + Assert.assertFalse(IOUtils.isSpecialFile(new File("/home/user/my.file"))); + Assert.assertFalse(IOUtils.isSpecialFile(new File("/devfake/null"))); + } + + @DataProvider( name = "ByteArrayIOTestData") + public Object[][] byteArrayIOTestDataProvider() { + return new Object[][] { + // file size, read buffer size + { 0, 4096 }, + { 1, 4096 }, + { 2000, 4096 }, + { 4095, 4096 }, + { 4096, 4096 }, + { 4097, 4096 }, + { 6000, 4096 }, + { 8191, 4096 }, + { 8192, 4096 }, + { 8193, 4096 }, + { 10000, 4096 } + }; + } + + @Test( dataProvider = "ByteArrayIOTestData" ) + public void testWriteThenReadFileIntoByteArray ( int fileSize, int readBufferSize ) throws Exception { + File tempFile = createTempFile(String.format("testWriteThenReadFileIntoByteArray_%d_%d", fileSize, readBufferSize), "tmp"); + + byte[] dataWritten = getDeterministicRandomData(fileSize); + IOUtils.writeByteArrayToFile(dataWritten, tempFile); + byte[] dataRead = IOUtils.readFileIntoByteArray(tempFile, readBufferSize); + + Assert.assertEquals(dataRead.length, dataWritten.length); + Assert.assertTrue(Arrays.equals(dataRead, dataWritten)); + } + + @Test( dataProvider = "ByteArrayIOTestData" ) + public void testWriteThenReadStreamIntoByteArray ( int fileSize, int readBufferSize ) throws Exception { + File tempFile = createTempFile(String.format("testWriteThenReadStreamIntoByteArray_%d_%d", fileSize, readBufferSize), "tmp"); + + byte[] dataWritten = getDeterministicRandomData(fileSize); + IOUtils.writeByteArrayToStream(dataWritten, new FileOutputStream(tempFile)); + byte[] dataRead = IOUtils.readStreamIntoByteArray(new FileInputStream(tempFile), readBufferSize); + + Assert.assertEquals(dataRead.length, dataWritten.length); + Assert.assertTrue(Arrays.equals(dataRead, dataWritten)); + } + + @Test( expectedExceptions = UserException.CouldNotReadInputFile.class ) + public void testReadNonExistentFileIntoByteArray() { + File nonExistentFile = new File("djfhsdkjghdfk"); + Assert.assertFalse(nonExistentFile.exists()); + + IOUtils.readFileIntoByteArray(nonExistentFile); + } + + @Test( expectedExceptions = ReviewedStingException.class ) + public void testReadNullStreamIntoByteArray() { + IOUtils.readStreamIntoByteArray(null); + } + + @Test( expectedExceptions = 
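
The ByteArrayIOTestData provider deliberately brackets the 4096-byte buffer boundary (4095, 4096, 4097, and multiples) to catch off-by-one errors in the read loop. A plausible shape for the method under test, readStreamIntoByteArray (names and exception types are illustrative; the tests show the real method rejects a null stream and a non-positive buffer with ReviewedStingException):

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;

final class StreamReadSketch {
    static byte[] readStreamIntoByteArray(InputStream stream, int readBufferSize) throws IOException {
        if (stream == null)
            throw new IllegalArgumentException("stream must not be null");      // real code: ReviewedStingException
        if (readBufferSize <= 0)
            throw new IllegalArgumentException("buffer size must be positive"); // real code: ReviewedStingException
        ByteArrayOutputStream collected = new ByteArrayOutputStream();
        byte[] buffer = new byte[readBufferSize];
        int bytesRead;
        while ((bytesRead = stream.read(buffer)) != -1)
            collected.write(buffer, 0, bytesRead);
        return collected.toByteArray();
    }
}
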
ReviewedStingException.class ) + public void testReadStreamIntoByteArrayInvalidBufferSize() throws Exception { + IOUtils.readStreamIntoByteArray(new FileInputStream(createTempFile("testReadStreamIntoByteArrayInvalidBufferSize", "tmp")), + -1); + } + + @Test( expectedExceptions = UserException.CouldNotCreateOutputFile.class ) + public void testWriteByteArrayToUncreatableFile() { + IOUtils.writeByteArrayToFile(new byte[]{0}, new File("/dev/foo/bar")); + } + + @Test( expectedExceptions = ReviewedStingException.class ) + public void testWriteNullByteArrayToFile() { + IOUtils.writeByteArrayToFile(null, createTempFile("testWriteNullByteArrayToFile", "tmp")); + } + + @Test( expectedExceptions = ReviewedStingException.class ) + public void testWriteByteArrayToNullStream() { + IOUtils.writeByteArrayToStream(new byte[]{0}, null); + } + + private byte[] getDeterministicRandomData ( int size ) { + GenomeAnalysisEngine.resetRandomGenerator(); + Random rand = GenomeAnalysisEngine.getRandomGenerator(); + + byte[] randomData = new byte[size]; + rand.nextBytes(randomData); + + return randomData; + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java new file mode 100755 index 0000000000..520fb70405 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java @@ -0,0 +1,83 @@ +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + + +public class GATKSAMRecordUnitTest extends BaseTest { + GATKSAMRecord read, reducedRead; + final static String BASES = "ACTG"; + final static String QUALS = "!+5?"; + final private static byte[] REDUCED_READ_COUNTS = new byte[]{10, 20, 30, 40, 1}; + final private static byte[] REDUCED_READ_COUNTS_TAG = new byte[]{10, 10, 20, 30, -9}; // just the offsets + + @BeforeClass + public void init() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, BASES.length()); + read.setReadUnmappedFlag(true); + read.setReadBases(new String(BASES).getBytes()); + read.setBaseQualityString(new String(QUALS)); + + reducedRead = ArtificialSAMUtils.createArtificialRead(header, "reducedRead", 0, 1, BASES.length()); + reducedRead.setReadBases(BASES.getBytes()); + reducedRead.setBaseQualityString(QUALS); + reducedRead.setAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, REDUCED_READ_COUNTS_TAG); + } + + @Test + public void testReducedReads() { + Assert.assertFalse(read.isReducedRead(), "isReducedRead is false for normal read"); + Assert.assertEquals(read.getReducedReadCounts(), null, "No reduced read tag in normal read"); + + Assert.assertTrue(reducedRead.isReducedRead(), "isReducedRead is true for reduced read"); + for (int i = 0; i < reducedRead.getReadLength(); i++) { + Assert.assertEquals(reducedRead.getReducedCount(i), REDUCED_READ_COUNTS[i], "Reduced read count not set to the expected value at " + i); + } + } + + @Test + public void testReducedReadPileupElement() { + PileupElement readp = new PileupElement(read, 0, false, false, false, false); + PileupElement reducedreadp = new PileupElement(reducedRead, 0, false, false, false, false); + + Assert.assertFalse(readp.getRead().isReducedRead()); + + 
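
The two fixture arrays at the top of GATKSAMRecordUnitTest encode the same information: REDUCED_READ_COUNTS_TAG stores the first count literally and every later entry as an offset from it (hence the "// just the offsets" comment and the -9). Decoding, as inferred from the fixtures rather than quoted from GATKSAMRecord:

import java.util.Arrays;

final class ReducedCountsSketch {
    // counts[0] = tag[0]; counts[i] = tag[0] + tag[i] for i > 0.
    static byte[] decodeReducedReadCounts(byte[] tag) {
        byte[] counts = new byte[tag.length];
        counts[0] = tag[0];
        for (int i = 1; i < tag.length; i++)
            counts[i] = (byte) (tag[0] + tag[i]);
        return counts;
    }

    public static void main(String[] args) {
        byte[] tag = {10, 10, 20, 30, -9};
        // prints [10, 20, 30, 40, 1], matching REDUCED_READ_COUNTS above
        System.out.println(Arrays.toString(decodeReducedReadCounts(tag)));
    }
}
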
Assert.assertTrue(reducedreadp.getRead().isReducedRead()); + Assert.assertEquals(reducedreadp.getRepresentativeCount(), REDUCED_READ_COUNTS[0]); + Assert.assertEquals(reducedreadp.getQual(), readp.getQual()); + } + + @Test + public void testGetOriginalAlignments() { + final byte [] bases = {'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'}; + final byte [] quals = {20 , 20 , 20 , 20 , 20 , 20 , 20 , 20 }; + GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, "6M"); + + // A regular read with all matches + Assert.assertEquals(read.getAlignmentStart(), read.getOriginalAlignmentStart()); + Assert.assertEquals(read.getAlignmentEnd(), read.getOriginalAlignmentEnd()); + + // Alignment start shifted + int alignmentShift = 2; + read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, alignmentShift); + Assert.assertEquals(read.getAlignmentStart() + alignmentShift, read.getOriginalAlignmentStart()); + Assert.assertEquals(read.getAlignmentEnd(), read.getOriginalAlignmentEnd()); + + // Both alignments shifted + read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, alignmentShift); + Assert.assertEquals(read.getAlignmentStart() + alignmentShift, read.getOriginalAlignmentStart()); + Assert.assertEquals(read.getAlignmentEnd() - alignmentShift, read.getOriginalAlignmentEnd()); + + // Alignment end shifted + read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, null); + Assert.assertEquals(read.getAlignmentStart(), read.getOriginalAlignmentStart()); + Assert.assertEquals(read.getAlignmentEnd() - alignmentShift, read.getOriginalAlignmentEnd()); + + } + +} diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java index b9f831028b..7598f62a6e 100755 --- a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java @@ -1,57 +1,11 @@ package org.broadinstitute.sting.utils.sam; -import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.pileup.PileupElement; import org.testng.Assert; -import org.testng.annotations.BeforeTest; import org.testng.annotations.Test; public class ReadUtilsUnitTest extends BaseTest { - GATKSAMRecord read, reducedRead; - final static String BASES = "ACTG"; - final static String QUALS = "!+5?"; - final private static byte[] REDUCED_READ_COUNTS = new byte[]{10, 20, 30, 40, 1}; - final private static byte[] REDUCED_READ_COUNTS_TAG = new byte[]{10, 10, 20, 30, -9}; // just the offsets - - @BeforeTest - public void init() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); - read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, BASES.length()); - read.setReadUnmappedFlag(true); - read.setReadBases(new String(BASES).getBytes()); - read.setBaseQualityString(new String(QUALS)); - - reducedRead = ArtificialSAMUtils.createArtificialRead(header, "reducedRead", 0, 1, BASES.length()); - reducedRead.setReadBases(BASES.getBytes()); - reducedRead.setBaseQualityString(QUALS); - reducedRead.setAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, REDUCED_READ_COUNTS_TAG); - } - - @Test - public void testReducedReads() { - Assert.assertFalse(read.isReducedRead(), "isReducedRead is false for normal read"); - Assert.assertEquals(read.getReducedReadCounts(), null, "No reduced read tag in normal read"); - - 
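
testGetOriginalAlignments fixes the semantics of the two shift attributes: an absent attribute means no shift, the start shift is added to the alignment start, and the end shift is subtracted from the alignment end. Restated compactly (helper names are illustrative):

final class OriginalAlignmentSketch {
    // A null shift attribute means the alignment was not moved by reduction.
    static int originalAlignmentStart(int alignmentStart, Integer startShift) {
        return startShift == null ? alignmentStart : alignmentStart + startShift;
    }

    static int originalAlignmentEnd(int alignmentEnd, Integer endShift) {
        return endShift == null ? alignmentEnd : alignmentEnd - endShift;
    }
}
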
Assert.assertTrue(reducedRead.isReducedRead(), "isReducedRead is true for reduced read"); - for (int i = 0; i < reducedRead.getReadLength(); i++) { - Assert.assertEquals(reducedRead.getReducedCount(i), REDUCED_READ_COUNTS[i], "Reduced read count not set to the expected value at " + i); - } - } - - @Test - public void testReducedReadPileupElement() { - PileupElement readp = new PileupElement(read, 0); - PileupElement reducedreadp = new PileupElement(reducedRead, 0); - - Assert.assertFalse(readp.isReducedRead()); - - Assert.assertTrue(reducedreadp.isReducedRead()); - Assert.assertEquals(reducedreadp.getRepresentativeCount(), REDUCED_READ_COUNTS[0]); - Assert.assertEquals(reducedreadp.getQual(), readp.getQual()); - } - @Test public void testGetAdaptorBoundary() { final byte[] bases = {'A', 'C', 'G', 'T', 'A', 'C', 'G', 'T'}; @@ -103,10 +57,31 @@ public void testGetAdaptorBoundary() { read.setReadNegativeStrandFlag(false); boundary = ReadUtils.getAdaptorBoundary(read); Assert.assertNull(boundary); + read.setInferredInsertSize(10); // Test case 6: read is unmapped read.setReadUnmappedFlag(true); boundary = ReadUtils.getAdaptorBoundary(read); Assert.assertNull(boundary); + read.setReadUnmappedFlag(false); + + // Test case 7: reads don't overlap and look like this: + // <--------| + // |------> + // first read: + myStart = 980; + read.setAlignmentStart(myStart); + read.setInferredInsertSize(20); + read.setReadNegativeStrandFlag(true); + boundary = ReadUtils.getAdaptorBoundary(read); + Assert.assertNull(boundary); + + // second read: + myStart = 1000; + read.setAlignmentStart(myStart); + read.setMateAlignmentStart(980); + read.setReadNegativeStrandFlag(false); + boundary = ReadUtils.getAdaptorBoundary(read); + Assert.assertNull(boundary); } } diff --git a/public/keys/GATK_public.key b/public/keys/GATK_public.key new file mode 100644 index 0000000000..05cdde1c27 Binary files /dev/null and b/public/keys/GATK_public.key differ diff --git a/public/packages/GATKEngine.xml b/public/packages/GATKEngine.xml index 283b5eabfa..68459f6d2d 100644 --- a/public/packages/GATKEngine.xml +++ b/public/packages/GATKEngine.xml @@ -36,6 +36,8 @@ + + diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala index 621afe8170..e26541e987 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala @@ -29,14 +29,14 @@ class DataProcessingPipeline extends QScript { var reference: File = _ @Input(doc="dbsnp ROD to use (must be in VCF format)", fullName="dbsnp", shortName="D", required=true) - var dbSNP: List[File] = List() + var dbSNP: Seq[File] = Seq() /**************************************************************************** * Optional Parameters ****************************************************************************/ @Input(doc="extra VCF files to use as reference indels for Indel Realignment", fullName="extra_indels", shortName="indels", required=false) - var indels: List[File] = List() + var indels: Seq[File] = Seq() @Input(doc="The path to the binary of bwa (usually BAM files have already been mapped - but if you want to remap this is the option)", fullName="path_to_bwa", shortName="bwa", required=false) var bwaPath: File = _ @@ -118,13 +118,13 @@ class DataProcessingPipeline extends QScript { // Because the realignment only happens after 
these scripts are executed, in case you are using // bwa realignment, this function will operate over the original bam files and output over the // (to be realigned) bam files. - def createSampleFiles(bamFiles: List[File], realignedBamFiles: List[File]): Map[String, List[File]] = { + def createSampleFiles(bamFiles: Seq[File], realignedBamFiles: Seq[File]): Map[String, Seq[File]] = { // Creating a table with SAMPLE information from each input BAM file - val sampleTable = scala.collection.mutable.Map.empty[String, List[File]] + val sampleTable = scala.collection.mutable.Map.empty[String, Seq[File]] val realignedIterator = realignedBamFiles.iterator for (bam <- bamFiles) { - val rBam = realignedIterator.next // advance to next element in the realignedBam list so they're in sync. + val rBam = realignedIterator.next() // advance to next element in the realignedBam list so they're in sync. val samReader = new SAMFileReader(bam) val header = samReader.getFileHeader @@ -138,12 +138,12 @@ class DataProcessingPipeline extends QScript { for (rg <- readGroups) { val sample = rg.getSample if (!sampleTable.contains(sample)) - sampleTable(sample) = List(rBam) + sampleTable(sample) = Seq(rBam) else if ( !sampleTable(sample).contains(rBam)) sampleTable(sample) :+= rBam } } - return sampleTable.toMap + sampleTable.toMap } // Rebuilds the Read Group string to give BWA @@ -161,8 +161,8 @@ class DataProcessingPipeline extends QScript { // Takes a list of processed BAM files and realign them using the BWA option requested (bwase or bwape). // Returns a list of realigned BAM files. - def performAlignment(bams: List[File]): List[File] = { - var realignedBams: List[File] = List() + def performAlignment(bams: Seq[File]): Seq[File] = { + var realignedBams: Seq[File] = Seq() var index = 1 for (bam <- bams) { // first revert the BAM file to the original qualities @@ -194,10 +194,10 @@ class DataProcessingPipeline extends QScript { realignedBams :+= rgRealignedBamFile index = index + 1 } - return realignedBams + realignedBams } - def getIndelCleaningModel(): ConsensusDeterminationModel = { + def getIndelCleaningModel: ConsensusDeterminationModel = { if (cleaningModel == "KNOWNS_ONLY") ConsensusDeterminationModel.KNOWNS_ONLY else if (cleaningModel == "USE_SW") @@ -206,17 +206,17 @@ class DataProcessingPipeline extends QScript { ConsensusDeterminationModel.USE_READS } - def revertBams(bams: List[File], removeAlignmentInformation: Boolean): List[File] = { - var revertedBAMList: List[File] = List() + def revertBams(bams: Seq[File], removeAlignmentInformation: Boolean): Seq[File] = { + var revertedBAMList: Seq[File] = Seq() for (bam <- bams) revertedBAMList :+= revertBAM(bam, removeAlignmentInformation) - return revertedBAMList + revertedBAMList } def revertBAM(bam: File, removeAlignmentInformation: Boolean): File = { val revertedBAM = swapExt(bam, ".bam", ".reverted.bam") add(revert(bam, revertedBAM, removeAlignmentInformation)) - return revertedBAM + revertedBAM } /**************************************************************************** @@ -224,22 +224,22 @@ class DataProcessingPipeline extends QScript { ****************************************************************************/ - def script = { + def script() { // final output list of processed bam files - var cohortList: List[File] = List() + var cohortList: Seq[File] = Seq() // sets the model for the Indel Realigner - cleanModelEnum = getIndelCleaningModel() + cleanModelEnum = getIndelCleaningModel // keep a record of the number of contigs in the first bam file 
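
createSampleFiles above iterates the BAM headers and groups the realigned files by read-group sample, keeping the original and realigned file lists in sync as it goes. The same bookkeeping written out in Java against the net.sf.samtools API the pipeline already uses (a sketch of the logic, not a replacement for the Scala):

import java.io.File;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMReadGroupRecord;

final class SampleTableSketch {
    static Map<String, List<File>> createSampleFiles(List<File> bams, List<File> realignedBams) {
        Map<String, List<File>> sampleTable = new LinkedHashMap<String, List<File>>();
        for (int i = 0; i < bams.size(); i++) {
            File rBam = realignedBams.get(i); // keep original and realigned lists in sync
            SAMFileReader reader = new SAMFileReader(bams.get(i));
            try {
                for (SAMReadGroupRecord rg : reader.getFileHeader().getReadGroups()) {
                    String sample = rg.getSample();
                    if (!sampleTable.containsKey(sample))
                        sampleTable.put(sample, new ArrayList<File>());
                    if (!sampleTable.get(sample).contains(rBam))
                        sampleTable.get(sample).add(rBam);
                }
            } finally {
                reader.close();
            }
        }
        return sampleTable;
    }
}
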
in the list - val bams = QScriptUtils.createListFromFile(input) + val bams = QScriptUtils.createSeqFromFile(input) if (nContigs < 0) nContigs = QScriptUtils.getNumberOfContigs(bams(0)) val realignedBAMs = if (useBWApe || useBWAse || useBWAsw) {performAlignment(bams)} else {revertBams(bams, false)} // generate a BAM file per sample joining all per lane files if necessary - val sampleBAMFiles: Map[String, List[File]] = createSampleFiles(bams, realignedBAMs) + val sampleBAMFiles: Map[String, Seq[File]] = createSampleFiles(bams, realignedBAMs) // if this is a 'knowns only' indel realignment run, do it only once for all samples. val globalIntervals = new File(outputDir + projectName + ".intervals") @@ -317,7 +317,7 @@ class DataProcessingPipeline extends QScript { this.maxRecordsInRam = 100000 } - case class target (inBams: List[File], outIntervals: File) extends RealignerTargetCreator with CommandLineGATKArgs { + case class target (inBams: Seq[File], outIntervals: File) extends RealignerTargetCreator with CommandLineGATKArgs { if (cleanModelEnum != ConsensusDeterminationModel.KNOWNS_ONLY) this.input_file = inBams this.out = outIntervals @@ -330,7 +330,7 @@ class DataProcessingPipeline extends QScript { this.jobName = queueLogDir + outIntervals + ".target" } - case class clean (inBams: List[File], tIntervals: File, outBam: File) extends IndelRealigner with CommandLineGATKArgs { + case class clean (inBams: Seq[File], tIntervals: File, outBam: File) extends IndelRealigner with CommandLineGATKArgs { this.input_file = inBams this.targetIntervals = tIntervals this.out = outBam @@ -347,11 +347,11 @@ class DataProcessingPipeline extends QScript { case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs { this.knownSites ++= qscript.dbSNP - this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate") + this.covariate ++= Seq("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate") this.input_file :+= inBam this.recal_file = outRecalFile if (!defaultPlatform.isEmpty) this.default_platform = defaultPlatform - if (!qscript.intervalString.isEmpty()) this.intervalsString ++= List(qscript.intervalString) + if (!qscript.intervalString.isEmpty) this.intervalsString ++= Seq(qscript.intervalString) else if (qscript.intervals != null) this.intervals :+= qscript.intervals this.scatterCount = nContigs this.analysisName = queueLogDir + outRecalFile + ".covariates" @@ -363,7 +363,7 @@ class DataProcessingPipeline extends QScript { this.recal_file = inRecalFile this.baq = CalculationMode.CALCULATE_AS_NECESSARY this.out = outBam - if (!qscript.intervalString.isEmpty()) this.intervalsString ++= List(qscript.intervalString) + if (!qscript.intervalString.isEmpty) this.intervalsString ++= Seq(qscript.intervalString) else if (qscript.intervals != null) this.intervals :+= qscript.intervals this.no_pg_tag = qscript.testMode this.scatterCount = nContigs @@ -395,7 +395,7 @@ class DataProcessingPipeline extends QScript { this.jobName = queueLogDir + outBam + ".dedup" } - case class joinBams (inBams: List[File], outBam: File) extends MergeSamFiles with ExternalCommonArgs { + case class joinBams (inBams: Seq[File], outBam: File) extends MergeSamFiles with ExternalCommonArgs { this.input = inBams this.output = outBam this.analysisName = queueLogDir + outBam + ".joinBams" @@ -495,7 +495,7 @@ class DataProcessingPipeline extends QScript { this.jobName = queueLogDir + outBam + ".bwasw" } - case class writeList(inBams: 
List[File], outBamList: File) extends ListWriterFunction { + case class writeList(inBams: Seq[File], outBamList: File) extends ListWriterFunction { this.inputFiles = inBams this.listFile = outBamList this.analysisName = queueLogDir + outBamList + ".bamList" diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala index 8c9063c293..22ac524536 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala @@ -134,8 +134,8 @@ class GATKResourcesBundle extends QScript { addResource(new Resource("/humgen/1kg/processing/official_release/phase1/ALL.wgs.VQSR_consensus_biallelic.20101123.indels.sites.vcf", "1000G_biallelic.indels", b37, true, false)) - addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Mills_Devine_Indels_2011/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.vcf", - "Mills_Devine_2hit.indels", b37, true, true)) + addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/GoldStandardIndel/gold.standard.indel.MillsAnd1000G.b37.vcf", + "Mills_and_1000G_gold_standard.indels", b37, true, true)) // // example call set for wiki tutorial diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala index c06601a2d2..b860358cac 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala @@ -25,9 +25,6 @@ class MethodsDevelopmentCallingPipeline extends QScript { @Argument(shortName="noIndels", doc="do not call indels with the Unified Genotyper", required=false) var noIndels: Boolean = false - @Argument(shortName="LOCAL_ET", doc="Doesn't use the AWS S3 storage for ET option", required=false) - var LOCAL_ET: Boolean = false - @Argument(shortName="mbq", doc="The minimum Phred-Scaled quality score threshold to be considered a good base.", required=false) var minimumBaseQuality: Int = -1 @@ -46,20 +43,24 @@ class MethodsDevelopmentCallingPipeline extends QScript { val bamList: File, val goldStandard_VCF: File, val intervals: String, - val titvTarget: Double, - val trancheTarget: Double, + val indelTranchTarget: Double, + val snpTrancheTarget: Double, val isLowpass: Boolean, val isExome: Boolean, val nSamples: Int) { val name = qscript.outputDir + baseName val clusterFile = new File(name + ".clusters") - val rawVCF = new File(name + ".raw.vcf") + val rawSnpVCF = new File(name + ".raw.vcf") val rawIndelVCF = new File(name + ".raw.indel.vcf") val filteredIndelVCF = new File(name + ".filtered.indel.vcf") - val recalibratedVCF = new File(name + ".recalibrated.vcf") - val tranchesFile = new File(name + ".tranches") - val vqsrRscript = name + ".vqsr.r" - val recalFile = new File(name + ".tranches.recal") + val recalibratedSnpVCF = new File(name + ".snp.recalibrated.vcf") + val recalibratedIndelVCF = new File(name + ".indel.recalibrated.vcf") + val tranchesSnpFile = new File(name + ".snp.tranches") + val tranchesIndelFile = new File(name + ".indel.tranches") + val vqsrSnpRscript = name + ".snp.vqsr.r" + val vqsrIndelRscript = name + ".indel.vqsr.r" + val 
recalSnpFile = new File(name + ".snp.tranches.recal") + val recalIndelFile = new File(name + ".indel.tranches.recal") val goldStandardRecalibratedVCF = new File(name + "goldStandard.recalibrated.vcf") val goldStandardTranchesFile = new File(name + "goldStandard.tranches") val goldStandardRecalFile = new File(name + "goldStandard.tranches.recal") @@ -88,6 +89,7 @@ class MethodsDevelopmentCallingPipeline extends QScript { val training_1000G = "/humgen/1kg/processing/official_release/phase1/projectConsensus/phase1.wgs.projectConsensus.v2b.recal.highQuality.vcf" val badSites_1000G = "/humgen/1kg/processing/official_release/phase1/projectConsensus/phase1.wgs.projectConsensus.v2b.recal.terrible.vcf" val projectConsensus_1000G = "/humgen/1kg/processing/official_release/phase1/projectConsensus/ALL.wgs.projectConsensus_v2b.20101123.snps.sites.vcf" + val indelGoldStandardCallset = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/GoldStandardIndel/gold.standard.indel.MillsAnd1000G.b37.vcf" val lowPass: Boolean = true val exome: Boolean = true @@ -101,69 +103,69 @@ class MethodsDevelopmentCallingPipeline extends QScript { "NA12878_gold" -> new Target("NA12878.goldStandard", hg19, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/gsa-hpprojects/dev/carneiro/NA12878/data/goldStandard.list"), new File("/humgen/gsa-hpprojects/dev/carneiro/NA12878/analysis/snps/NA12878.HiSeq19.filtered.vcf"), // ** There is no gold standard for the gold standard ** - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.noChrY.hg19.intervals", 2.14, 99.0, lowPass, !exome, 391), + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.noChrY.hg19.intervals", 90.0, 99.0, lowPass, !exome, 391), "NA12878_wgs_b37" -> new Target("NA12878.HiSeq.WGS.b37", hg19, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.bam"), new File("/humgen/gsa-hpprojects/dev/carneiro/NA12878/analysis/snps/NA12878.HiSeq19.filtered.vcf"), - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.noChrY.hg19.intervals", 2.14, 99.0, !lowPass, !exome, 1), + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.noChrY.hg19.intervals", 90.0, 99.0, !lowPass, !exome, 1), "NA12878_wgs_decoy" -> new Target("NA12878.HiSeq.WGS.b37_decoy", b37_decoy, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.b37_decoy.NA12878.clean.dedup.recal.bam"), new File("/humgen/gsa-hpprojects/dev/carneiro/NA12878/analysis/snps/NA12878.HiSeq19.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.noChrY.hg19.intervals", 2.14, 99.0, !lowPass, !exome, 1), + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.noChrY.hg19.intervals", 90.0, 99.0, !lowPass, !exome, 1), "NA12878_wgs_hg18" -> new Target("NA12878.HiSeq.WGS.hg18", hg18, dbSNP_hg18_129, hapmap_hg18, "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.indels.10.mask", new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam"), new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.vcf"), - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg18.intervals", 2.14, 99.0, !lowPass, !exome, 1), + 
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg18.intervals", 90.0, 99.0, !lowPass, !exome, 1), "NA12878_wex_b37" -> new Target("NA12878.HiSeq.WEx.b37", hg19, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/seq/picard_aggregation/C339/NA12878/v3/NA12878.bam"), new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 3.3, 98.0, !lowPass, exome, 1), + "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 90.0, 98.0, !lowPass, exome, 1), "NA12878_wex_hg18" -> new Target("NA12878.HiSeq.WEx.hg18", hg18, dbSNP_hg18_129, hapmap_hg18, "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.indels.10.mask", new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.WEx.cleaned.recal.bam"), new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.vcf"), - "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.targets.interval_list", 3.3, 98.0, !lowPass, exome, 1), + "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.targets.interval_list", 90.0, 98.0, !lowPass, exome, 1), "NA12878_wex_decoy" -> new Target("NA12878.HiSeq.WEx.b37_decoy", b37_decoy, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WEx.b37_decoy.NA12878.clean.dedup.recal.bam"), new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 3.3, 98.0, !lowPass, exome, 1), + "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 90.0, 98.0, !lowPass, exome, 1), "CEUTrio_wex_b37" -> new Target("CEUTrio.HiSeq.WEx.b37", hg19, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WEx.bwa.cleaned.recal.bam"), new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), - "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 3.3, 98.0, !lowPass, exome, 3), + "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 90.0, 98.0, !lowPass, exome, 3), "CEUTrio_wgs_b37" -> new Target("CEUTrio.HiSeq.WGS.b37", hg19, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.bwa.cleaned.recal.bam"), new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - 
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.3, 99.0, !lowPass, !exome, 3), + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 90.0, 99.0, !lowPass, !exome, 3), "CEUTrio_wex_decoy" -> new Target("CEUTrio.HiSeq.WEx.b37_decoy", b37_decoy, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WEx.b37_decoy.list"), new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 3.3, 98.0, !lowPass, exome, 3), + "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 90.0, 98.0, !lowPass, exome, 3), "CEUTrio_wgs_decoy" -> new Target("CEUTrio.HiSeq.WGS.b37_decoy", b37_decoy, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.b37_decoy.list"), new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.3, 99.0, !lowPass, !exome, 3), + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 90.0, 99.0, !lowPass, !exome, 3), "GA2hg19" -> new Target("NA12878.GA2.hg19", hg19, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.GA2.WGS.bwa.cleaned.hg19.bam"), new File("/humgen/gsa-hpprojects/dev/carneiro/NA12878/analysis/snps/NA12878.GA2.hg19.filtered.vcf"), - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.14, 99.0, !lowPass, !exome, 1), + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 90.0, 99.0, !lowPass, !exome, 1), "FIN" -> new Target("FIN", b37, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/1kg/processing/pipeline_test_bams/FIN.79sample.Nov2010.chr20.bam"), new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, 99.0, lowPass, !exome, 79), + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 90.0, 99.0, lowPass, !exome, 79), "TGPWExGdA" -> new Target("1000G.WEx.GdA", b37, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/1kg/processing/pipeline_test_bams/Barcoded_1000G_WEx_Reduced_Plate_1.cleaned.list"), // BUGBUG: reduce from 60 to 20 people new File("/humgen/gsa-scr1/delangel/NewUG/calls/AugustRelease.filtered_Q50_QD5.0_SB0.0.allSamples.SNPs_hg19.WEx_UG_newUG_MQC.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 2.6, 99.0, !lowPass, exome, 96), + "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 90.0, 99.0, !lowPass, exome, 96), "LowPassN60" -> new Target("lowpass.N60", b36, dbSNP_b36, hapmap_b36, 
indelMask_b36, new File("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/lowpass.chr20.cleaned.matefixed.bam"), // the bam list to call from new File("/home/radon01/depristo/work/oneOffProjects/VQSRCutByNRS/lowpass.N60.chr20.filtered.vcf"), // the gold standard VCF file to run through the VQSR - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.b36.intervals", 2.3, 99.0, lowPass, !exome, 60), // chunked interval list to use with Queue's scatter/gather functionality + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.b36.intervals", 90.0, 99.0, lowPass, !exome, 60), // chunked interval list to use with Queue's scatter/gather functionality "LowPassEUR363Nov" -> new Target("EUR.nov2010", b37, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/1kg/processing/pipeline_test_bams/EUR.363sample.Nov2010.chr20.bam"), new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, 99.0, lowPass, !exome, 363) + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 90.0, 99.0, lowPass, !exome, 363) ) @@ -181,15 +183,15 @@ class MethodsDevelopmentCallingPipeline extends QScript { val goldStandard = true for (target <- targets) { if( !skipCalling ) { - if (!noIndels) add(new indelCall(target), new indelFilter(target), new indelEvaluation(target)) + if (!noIndels) add(new indelCall(target), new indelRecal(target), new indelCut(target), new indelEvaluation(target)) add(new snpCall(target)) - add(new VQSR(target, !goldStandard)) - add(new applyVQSR(target, !goldStandard)) + add(new snpRecal(target, !goldStandard)) + add(new snpCut(target, !goldStandard)) add(new snpEvaluation(target)) } if ( runGoldStandard ) { - add(new VQSR(target, goldStandard)) - add(new applyVQSR(target, goldStandard)) + add(new snpRecal(target, goldStandard)) + add(new snpCut(target, goldStandard)) } } } @@ -198,7 +200,7 @@ class MethodsDevelopmentCallingPipeline extends QScript { trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { logging_level = "INFO"; memoryLimit = 4; - phone_home = if ( LOCAL_ET ) GATKRunReport.PhoneHomeOption.STANDARD else GATKRunReport.PhoneHomeOption.AWS_S3 + phone_home = GATKRunReport.PhoneHomeOption.NO_ET } def bai(bam: File) = new File(bam + ".bai") @@ -222,7 +224,7 @@ class MethodsDevelopmentCallingPipeline extends QScript { this.min_base_quality_score = minimumBaseQuality if (qscript.deletions >= 0) this.max_deletion_fraction = qscript.deletions - this.out = t.rawVCF + this.out = t.rawSnpVCF this.glm = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.SNP this.baq = if (noBAQ || t.isExome) {org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.OFF} else {org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.CALCULATE_AS_NECESSARY} this.analysisName = t.name + "_UGs" @@ -257,79 +259,115 @@ class MethodsDevelopmentCallingPipeline extends QScript { this.jobName = queueLogDir + t.name + ".indelfilter" } - // 3.) 
Variant Quality Score Recalibration - Generate Recalibration table - class VQSR(t: Target, goldStandard: Boolean) extends VariantRecalibrator with UNIVERSAL_GATK_ARGS { + class VQSRBase(t: Target) extends VariantRecalibrator with UNIVERSAL_GATK_ARGS { this.nt = 2 this.reference_sequence = t.reference this.intervalsString ++= List(t.intervals) - this.input :+= ( if ( goldStandard ) { t.goldStandard_VCF } else { t.rawVCF } ) - this.resource :+= new TaggedFile( t.hapmapFile, "training=true,truth=true,prior=15.0" ) - this.resource :+= new TaggedFile( omni_b37, "training=true,truth=true,prior=12.0" ) - this.resource :+= new TaggedFile( training_1000G, "training=true,prior=10.0" ) + this.allPoly = true + this.tranche ++= List("100.0", "99.9", "99.5", "99.3", "99.0", "98.9", "98.8", "98.5", "98.4", "98.3", "98.2", "98.1", "98.0", "97.9", "97.8", "97.5", "97.0", "95.0", "90.0") + } + + class snpRecal(t: Target, goldStandard: Boolean) extends VQSRBase(t) with UNIVERSAL_GATK_ARGS { + this.input :+= ( if ( goldStandard ) { t.goldStandard_VCF } else { t.rawSnpVCF } ) + this.resource :+= new TaggedFile( t.hapmapFile, "known=false,training=true,truth=true,prior=15.0" ) + this.resource :+= new TaggedFile( omni_b37, "known=false,training=true,truth=true,prior=12.0" ) + this.resource :+= new TaggedFile( training_1000G, "known=false,training=true,prior=10.0" ) this.resource :+= new TaggedFile( t.dbsnpFile, "known=true,prior=2.0" ) this.resource :+= new TaggedFile( projectConsensus_1000G, "prior=8.0" ) this.use_annotation ++= List("QD", "HaplotypeScore", "MQRankSum", "ReadPosRankSum", "MQ", "FS") - if(t.nSamples >= 10) { // InbreedingCoeff is a population-wide statistic that requires at least 10 samples to calculate - this.use_annotation ++= List("InbreedingCoeff") - } - if(!t.isExome) { + if(t.nSamples >= 10) + this.use_annotation ++= List("InbreedingCoeff") // InbreedingCoeff is a population-wide statistic that requires at least 10 samples to calculate + if(!t.isExome) this.use_annotation ++= List("DP") - } else { // exome specific parameters + else { // exome specific parameters this.resource :+= new TaggedFile( badSites_1000G, "bad=true,prior=2.0" ) this.mG = 6 - if(t.nSamples <= 3) { // very few exome samples means very few variants + if(t.nSamples <= 3) { // very few exome samples means very few variants this.mG = 4 this.percentBad = 0.04 } } - this.tranches_file = if ( goldStandard ) { t.goldStandardTranchesFile } else { t.tranchesFile } - this.recal_file = if ( goldStandard ) { t.goldStandardRecalFile } else { t.recalFile } - this.allPoly = true - this.tranche ++= List("100.0", "99.9", "99.5", "99.3", "99.0", "98.9", "98.8", "98.5", "98.4", "98.3", "98.2", "98.1", "98.0", "97.9", "97.8", "97.5", "97.0", "95.0", "90.0") - this.rscript_file = t.vqsrRscript - this.analysisName = t.name + "_VQSR" - this.jobName = queueLogDir + t.name + ".VQSR" + this.tranches_file = if ( goldStandard ) { t.goldStandardTranchesFile } else { t.tranchesSnpFile } + this.recal_file = if ( goldStandard ) { t.goldStandardRecalFile } else { t.recalSnpFile } + this.rscript_file = t.vqsrSnpRscript + this.mode = org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.SNP + this.analysisName = t.name + "_VQSRs" + this.jobName = queueLogDir + t.name + ".snprecal" } + class indelRecal(t: Target) extends VQSRBase(t) with UNIVERSAL_GATK_ARGS { + this.input :+= t.rawIndelVCF + this.resource :+= new TaggedFile(indelGoldStandardCallset, "known=false,training=true,truth=true,prior=12.0" ) + 
this.resource :+= new TaggedFile( t.dbsnpFile, "known=true,prior=2.0" ) + this.use_annotation ++= List("QD", "HaplotypeScore", "ReadPosRankSum", "FS") + if(t.nSamples >= 10) + this.use_annotation ++= List("InbreedingCoeff") // InbreedingCoeff is a population-wide statistic that requires at least 10 samples to calculate + this.tranches_file = t.tranchesIndelFile + this.recal_file = t.recalIndelFile + this.rscript_file = t.vqsrIndelRscript + this.mode = org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.INDEL + this.analysisName = t.name + "_VQSRi" + this.jobName = queueLogDir + t.name + ".indelrecal" + } + + // 4.) Apply the recalibration table to the appropriate tranches - class applyVQSR (t: Target, goldStandard: Boolean) extends ApplyRecalibration with UNIVERSAL_GATK_ARGS { + class applyVQSRBase (t: Target) extends ApplyRecalibration with UNIVERSAL_GATK_ARGS { this.memoryLimit = 6 this.reference_sequence = t.reference this.intervalsString ++= List(t.intervals) - this.input :+= ( if ( goldStandard ) { t.goldStandard_VCF } else { t.rawVCF } ) - this.tranches_file = if ( goldStandard ) { t.goldStandardTranchesFile } else { t.tranchesFile} - this.recal_file = if ( goldStandard ) { t.goldStandardRecalFile } else { t.recalFile } - this.ts_filter_level = t.trancheTarget - this.out = t.recalibratedVCF - this.analysisName = t.name + "_AVQSR" - this.jobName = queueLogDir + t.name + ".applyVQSR" } + class snpCut (t: Target, goldStandard: Boolean) extends applyVQSRBase(t) { + this.input :+= ( if ( goldStandard ) { t.goldStandard_VCF } else { t.rawSnpVCF } ) + this.tranches_file = if ( goldStandard ) { t.goldStandardTranchesFile } else { t.tranchesSnpFile} + this.recal_file = if ( goldStandard ) { t.goldStandardRecalFile } else { t.recalSnpFile } + this.ts_filter_level = t.snpTrancheTarget + this.mode = org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.SNP + this.out = t.recalibratedSnpVCF + this.analysisName = t.name + "_AVQSRs" + this.jobName = queueLogDir + t.name + ".snpcut" + } + + class indelCut (t: Target) extends applyVQSRBase(t) { + this.input :+= t.rawIndelVCF + this.tranches_file = t.tranchesIndelFile + this.recal_file = t.recalIndelFile + this.ts_filter_level = t.indelTranchTarget + this.mode = org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.INDEL + this.out = t.recalibratedIndelVCF + this.analysisName = t.name + "_AVQSRi" + this.jobName = queueLogDir + t.name + ".indelcut" + } + + // 5.) Variant Evaluation Base(OPTIONAL) class EvalBase(t: Target) extends VariantEval with UNIVERSAL_GATK_ARGS { this.memoryLimit = 3 - this.reference_sequence = t.reference this.comp :+= new TaggedFile(t.hapmapFile, "hapmap" ) - this.intervalsString ++= List(t.intervals) this.D = new File(t.dbsnpFile) + this.reference_sequence = t.reference + this.intervalsString ++= List(t.intervals) this.sample = samples } // 5a.) SNP Evaluation (OPTIONAL) based on the cut vcf class snpEvaluation(t: Target) extends EvalBase(t) { if (t.reference == b37 || t.reference == hg19) this.comp :+= new TaggedFile( omni_b37, "omni" ) - this.eval :+= t.recalibratedVCF + this.eval :+= t.recalibratedSnpVCF this.out = t.evalFile this.analysisName = t.name + "_VEs" - this.jobName = queueLogDir + t.name + ".snp.eval" + this.jobName = queueLogDir + t.name + ".snpeval" } // 5b.) 
Indel Evaluation (OPTIONAL) class indelEvaluation(t: Target) extends EvalBase(t) { - this.eval :+= t.filteredIndelVCF - this.evalModule :+= "IndelStatistics" + this.eval :+= t.recalibratedIndelVCF + this.comp :+= new TaggedFile(indelGoldStandardCallset, "indelGS" ) + this.noEV = true + this.evalModule = List("CompOverlap", "CountVariants", "TiTvVariantEvaluator", "ValidationReport", "IndelStatistics") this.out = t.evalIndelFile this.analysisName = t.name + "_VEi" - this.jobName = queueLogDir + queueLogDir + t.name + ".indel.eval" + this.jobName = queueLogDir + t.name + ".indeleval" } } diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala index 4896eaed3c..c64eef7f76 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala @@ -1,12 +1,12 @@ package org.broadinstitute.sting.queue.qscripts import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.util.QScriptUtils import net.sf.samtools.SAMFileHeader.SortOrder import org.broadinstitute.sting.utils.exceptions.UserException import org.broadinstitute.sting.commandline.Hidden import org.broadinstitute.sting.queue.extensions.picard.{ReorderSam, SortSam, AddOrReplaceReadGroups} +import org.broadinstitute.sting.queue.extensions.gatk._ /** * Created by IntelliJ IDEA. @@ -53,19 +53,22 @@ class PacbioProcessingPipeline extends QScript { val queueLogDir: String = ".qlog/" - def script = { + def script() { - val fileList: List[File] = QScriptUtils.createListFromFile(input) + val fileList: Seq[File] = QScriptUtils.createSeqFromFile(input) for (file: File <- fileList) { var USE_BWA: Boolean = false + var resetQuals: Boolean = true - if (file.endsWith(".fasta") || file.endsWith(".fq")) { + if (file.endsWith(".fasta") || file.endsWith(".fq") || file.endsWith(".fastq")) { if (bwaPath == null) { throw new UserException("You provided a fasta/fastq file but didn't provide the path for BWA"); } USE_BWA = true + if (file.endsWith(".fq") || file.endsWith(".fastq")) + resetQuals = false } // FASTA -> BAM steps @@ -97,9 +100,9 @@ class PacbioProcessingPipeline extends QScript { val bam = if (BLASR_BAM) {mqBAM} else {bamBase} - add(cov(bam, recalFile1), + add(cov(bam, recalFile1, resetQuals), recal(bam, recalFile1, recalBam), - cov(recalBam, recalFile2), + cov(recalBam, recalFile2, false), analyzeCovariates(recalFile1, path1), analyzeCovariates(recalFile2, path2)) } @@ -158,8 +161,9 @@ class PacbioProcessingPipeline extends QScript { this.jobName = queueLogDir + outBam + ".rg" } - case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs { - this.DBQ = dbq + case class cov (inBam: File, outRecalFile: File, resetQuals: Boolean) extends CountCovariates with CommandLineGATKArgs { + if (resetQuals) + this.DBQ = dbq this.knownSites :+= dbSNP this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate") this.input_file :+= inBam diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/DevNullOutput.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/DevNullOutput.scala new file mode 100644 index 0000000000..d891ebaafd --- /dev/null +++ 
b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/DevNullOutput.scala @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.queue.qscripts.examples + +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.extensions.gatk._ + +/** + * Script used for testing output to /dev/null + */ +class DevNullOutput extends QScript { + @Input(doc="The reference file for the bam files.", shortName="R") + var referenceFile: File = _ + + @Input(doc="Bam file to genotype.", shortName="I") + var bamFile: File = _ + + def script() { + val genotyper = new UnifiedGenotyper + genotyper.reference_sequence = referenceFile + genotyper.memoryLimit = 2 + genotyper.scatterCount = 3 + genotyper.input_file :+= bamFile + genotyper.out = "/dev/null" + add(genotyper) + } +} diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/ChunkVCF.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/ChunkVCF.scala new file mode 100644 index 0000000000..0184b5d2c0 --- /dev/null +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/ChunkVCF.scala @@ -0,0 +1,90 @@ +package org.broadinstitute.sting.queue.qscripts.lib + +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.library.ipf.vcf.VCFExtractIntervals +import scala.collection.JavaConversions._ +import org.broadinstitute.sting.utils.text.XReadLines +import java.io.PrintStream +import org.broadinstitute.sting.queue.extensions.gatk.SelectVariants + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 2/2/12 + * Time: 12:13 PM + * To change this template use File | Settings | File Templates. + */ + +class ChunkVCF extends QScript { + + @Input(shortName="V",fullName="VCF",doc="The VCF you want to chunk",required=true) + var inVCF : File = _ + + @Input(shortName="N",fullName="numEntriesInChunk",doc="The number of variants per chunk",required=true) + var numEntries : Int = _ + + @Input(shortName="I",fullName="Intervals",doc="The SNP interval list to chunk. 
If not provided, one will be created for you to provide in a second run.",required=false) + var intervals : File = _ + + @Input(fullName="preserveChromosomes",doc="Restrict chunks to one chromosome (smaller chunk at end of chromosome)",required=false) + var preserve : Boolean = false + + @Input(fullName="reference",doc="The reference file",required=false) + var ref : File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") + + @Input(fullName="samples",doc="A file of sample IDs to condense VCF file to",required=false) + var extractSamples : File = _ + + val tmpdir : File = System.getProperty("java.io.tmpdir") + + def script = { + if ( intervals == null ) { + // create an interval list from the VCF + val ivals : File = swapExt(inVCF,".vcf",".intervals.list") + val extract : VCFExtractIntervals = new VCFExtractIntervals(inVCF,ivals,false) + add(extract) + } else { + var chunkNum = 1 + var numLinesInChunk = 0 + var chromosome : String = asScalaIterator(new XReadLines(intervals)).next().split(":")(0) + var chunkFile : File = new File(tmpdir,"ChunkVCF.chunk%d.intervals.list".format(chunkNum)) + var chunkWriter = new PrintStream(chunkFile) + asScalaIterator(new XReadLines(intervals)).foreach( int => { + // check new chromosome or full chunk + if ( ( preserve && ! int.split(":")(0).equals(chromosome) ) || numLinesInChunk > numEntries ) { + chunkWriter.close() + val chunkSelect : SelectVariants = new SelectVariants + chunkSelect.variant = inVCF + chunkSelect.reference_sequence = ref + chunkSelect.memoryLimit = 2 + chunkSelect.intervals :+= chunkFile + if ( extractSamples != null ) + chunkSelect.sample_file :+= extractSamples + chunkSelect.out = swapExt(inVCF,".vcf",".chunk%d.vcf".format(chunkNum)) + add(chunkSelect) + chunkNum += 1 + numLinesInChunk = 0 + chromosome = int.split(":")(0) + chunkFile = new File(tmpdir,"ChunkVCF.chunk%d.intervals.list".format(chunkNum)) + chunkWriter = new PrintStream(chunkFile) + } + chunkWriter.printf("%s%n",int) + numLinesInChunk += 1 + }) + // last chunk + if ( numLinesInChunk > 0 ) { + // some work to do + val chunkSelect : SelectVariants = new SelectVariants + chunkSelect.variant = inVCF + chunkSelect.reference_sequence = ref + chunkSelect.memoryLimit = 2 + chunkSelect.intervals :+= chunkFile + chunkWriter.close() + if ( extractSamples != null ) + chunkSelect.sample_file :+= extractSamples + chunkSelect.out = swapExt(inVCF,".vcf",".chunk%d.vcf".format(chunkNum)) + add(chunkSelect) + } + } + } +} \ No newline at end of file diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala new file mode 100644 index 0000000000..cad8af51d3 --- /dev/null +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala @@ -0,0 +1,200 @@ +package org.broadinstitute.sting.queue.qscripts.lib + +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.commandline.Input +import org.broadinstitute.sting.queue.library.ipf.vcf.VCFExtractIntervals +import org.broadinstitute.sting.utils.text.XReadLines +import collection.JavaConversions._ +import java.io._ +import org.broadinstitute.sting.queue.extensions.gatk.{SelectVariants, VariantsToPed} + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 1/31/12 + * Time: 10:46 PM + * To change this template use File | Settings | File Templates. 
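The chunking loop in ChunkVCF above keeps a running interval count, flushes a chunk whenever the count exceeds numEntries or (with preserveChromosomes) the chromosome changes, and emits one SelectVariants job per chunk. The grouping itself can be read as the following standalone sketch; IntervalChunker and its argument names are hypothetical, and unlike the script (whose "> numEntries" test lets a chunk reach numEntries + 1 intervals) it caps chunks at exactly maxEntries:

```scala
// Hypothetical sketch of ChunkVCF's grouping of "chr:start-stop" interval strings.
object IntervalChunker {
  def chunk(intervals: Seq[String], maxEntries: Int, preserveChromosomes: Boolean): Seq[Seq[String]] = {
    val chunks = Seq.newBuilder[Seq[String]]
    var current = Vector.empty[String]
    var chromosome: Option[String] = None
    for (interval <- intervals) {
      val chr = interval.split(":")(0)
      val crossesChromosome = preserveChromosomes && chromosome.exists(_ != chr)
      // Flush the current chunk when it is full or would span two chromosomes.
      if (current.nonEmpty && (crossesChromosome || current.size >= maxEntries)) {
        chunks += current
        current = Vector.empty
      }
      current :+= interval
      chromosome = Some(chr)
    }
    if (current.nonEmpty) chunks += current  // the final, possibly short, chunk
    chunks.result()
  }
}
```

For example, chunk(Seq("1:1-100", "1:200-300", "2:1-50"), 2, preserveChromosomes = true) yields Seq(Seq("1:1-100", "1:200-300"), Seq("2:1-50")).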
+ */ + +class VcfToPed extends QScript { + + @Input(shortName = "V", fullName="Variants", required=true,doc="VCF to convert to ped") + var variants : File = _ + + @Output(shortName = "B", fullName="Bed",required=true,doc="Name of the ped output file (fam and bim will use the root of this file)") + var bed : File = _ + + @Input(shortName = "M", fullName="Meta",required=true,doc="The sample metadata file, can be a .fam or [NAME]\\tkey1=val1;key2=val2") + var meta : File = _ + + @Input(shortName = "Int", fullName="Intervals",required=false,doc="Intervals. If not specified script will produce them and exit.") + var intervals : File = _ + + @Argument(shortName="R",fullName="Ref",required=false,doc="Reference file") + var ref : File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") + + @Argument(shortName="D",fullName="dbsnp",required=false,doc="dbsnp file") + var dbsnp : File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.vcf") + + @Argument(shortName="sf",fullName="sampleFile",required=false,doc="sample file") + var samFile : File = _ + + val tmpdir : File = System.getProperty("java.io.tmpdir") + + def script = { + if ( intervals == null ) { + val ivals : File = swapExt(variants,".vcf",".intervals.list") + val extract : VCFExtractIntervals = new VCFExtractIntervals(variants,ivals,false) + add(extract) + } else { + val IS_GZ : Boolean = variants.getName.endsWith(".vcf.gz") + var iXRL = new XReadLines(intervals) + var chunk = 1; + var subListFile : File = null + if ( IS_GZ ) + subListFile = swapExt(tmpdir,variants,".vcf.gz",".chunk%d.list".format(chunk)) + else + subListFile = swapExt(tmpdir,variants,".vcf",".chunk%d.list".format(chunk)) + var subList = new PrintStream(subListFile) + var nL = 0; + var bedOuts : List[File] = Nil; + var bimOuts : List[File] = Nil + var lastFam : File = null; + while ( iXRL.hasNext ) { + subList.printf("%s%n",iXRL.next()) + nL = nL + 1 + if ( nL > 10000 ) { + val toPed : VariantsToPed = new VariantsToPed + toPed.memoryLimit = 2 + toPed.reference_sequence = ref + toPed.intervals :+= subListFile + toPed.dbsnp = dbsnp + if ( samFile != null ) { + val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk) + val extract : SelectVariants = new SelectVariants + extract.reference_sequence = ref + extract.memoryLimit = 2 + extract.intervals :+= subListFile + extract.variant = variants + extract.out = new File(tmpdir,base+"_extract%d.vcf".format(chunk)) + extract.sample_file :+= samFile + add(extract) + toPed.variant = extract.out + } else { + toPed.variant = variants + } + toPed.metaData = meta + val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk) + val tBed = new File(tmpdir,base+".bed") + val bim = new File(tmpdir,base+".bim") + val fam = new File(tmpdir,base+".fam") + toPed.bed = tBed + toPed.bim = bim + toPed.fam = fam + add(toPed) + subList.close() + chunk = chunk + 1 + if ( IS_GZ ) + subListFile = swapExt(tmpdir,variants,".vcf.gz",".chunk%d.list".format(chunk)) + else + subListFile = swapExt(tmpdir,variants,".vcf",".chunk%d.list".format(chunk)) + subList = new PrintStream(subListFile) + bedOuts :+= tBed + bimOuts :+= bim + lastFam = fam + nL = 0; + } + } + + if ( nL > 0 ) { + val toPed : VariantsToPed = new VariantsToPed + toPed.reference_sequence = ref + toPed.intervals :+= new File(subListFile) + toPed.dbsnp = dbsnp + if ( samFile != null ) { + val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk) + val extract : SelectVariants = new SelectVariants + extract.reference_sequence = ref + 
extract.memoryLimit = 2 + extract.intervals :+= subListFile + extract.variant = variants + extract.out = new File(tmpdir,base+"_extract%d.vcf".format(chunk)) + extract.sample_file :+= samFile + add(extract) + toPed.variant = extract.out + } else { + toPed.variant = variants + } + toPed.metaData = meta + toPed.memoryLimit = 2 + val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk) + val tBed = new File(tmpdir,base+".bed") + val bim = new File(tmpdir,base+".bim") + val fam = new File(tmpdir,base+".fam") + toPed.bed = tBed + toPed.bim = bim + toPed.fam = fam + lastFam = fam + add(toPed) + subList.close() + bedOuts :+= tBed + bimOuts :+= bim + } + + var gatherUP = new MyPedGather + gatherUP.binPed = bedOuts + gatherUP.bim = bimOuts + gatherUP.outPed = bed + gatherUP.outBim = swapExt(bed,".bed",".bim") + + add(gatherUP) + + class copyFam extends InProcessFunction { + @Input(doc="fam") var inFam = lastFam + @Output(doc="fam") var outFam = swapExt(bed,".bed",".fam") + + def run = { + var stream = new PrintStream(outFam) + asScalaIterator(new XReadLines(inFam)).foreach( u => { + stream.printf("%s%n",u) + }) + stream.close() + } + } + + add(new copyFam) + } + + } + + class MyPedGather extends InProcessFunction { + @Input(doc="Peds to be merged") var binPed: List[File] = Nil + @Input(doc="Bims to be merged") var bim : List[File] = Nil + @Output(doc="The final Ped to write to") var outPed : File = _ + @Output(doc="The final bim to write to") var outBim : File = _ + + def run : Unit = { + var stream : PrintStream = new PrintStream(outPed) + stream.write((List[Byte](0x6c.toByte,0x1b.toByte,0x1.toByte)).toArray) + binPed.map(u => new FileInputStream(u) ).foreach( u => { + u.skip(3) + var b = u.read() + while ( b != -1 ) { + stream.write(b.toByte) + b = u.read() + } + }) + stream.close() + + stream = new PrintStream(outBim) + bim.map(u => new XReadLines(u)).foreach( u => { + asScalaIterator(u).foreach( x => { + stream.printf("%s%n",x) + }) + }) + + stream.close() + } + } + +} \ No newline at end of file diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala index 32913deb47..7a22e700b8 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -36,6 +36,7 @@ import org.broadinstitute.sting.utils.io.IOUtils import org.broadinstitute.sting.utils.help.ApplicationDetails import java.util.{ResourceBundle, Arrays} import org.broadinstitute.sting.utils.text.TextFormattingUtils +import org.apache.commons.io.FilenameUtils /** * Entry point of Queue. Compiles and runs QScripts passed in to the command line. 
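The byte-copy loop in MyPedGather is the fiddly part of that gather step: the stream must be read before testing for end-of-file, otherwise the -1 sentinel gets written out as a stray 0xFF byte (hence the read-then-test loop above). The same idiom with a buffer, as a self-contained sketch (StreamCopy is a hypothetical helper, not part of the patch):

```scala
import java.io.{InputStream, OutputStream}

// Read-then-test stream copy; buffered to avoid byte-at-a-time reads.
object StreamCopy {
  def copy(in: InputStream, out: OutputStream): Long = {
    val buffer = new Array[Byte](8192)
    var total = 0L
    var n = in.read(buffer)          // read first...
    while (n != -1) {                // ...then test for end-of-file
      out.write(buffer, 0, n)
      total += n
      n = in.read(buffer)
    }
    total                            // bytes copied
  }
}
```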
@@ -61,6 +62,7 @@ object QCommandLine extends Logging { CommandLineProgram.start(qCommandLine, argv) try { Runtime.getRuntime.removeShutdownHook(shutdownHook) + qCommandLine.shutdown() } catch { case _ => /* ignore, example 'java.lang.IllegalStateException: Shutdown in progress' */ } @@ -78,10 +80,10 @@ object QCommandLine extends Logging { class QCommandLine extends CommandLineProgram with Logging { @Input(fullName="script", shortName="S", doc="QScript scala file", required=true) @ClassType(classOf[File]) - private var scripts = List.empty[File] + var scripts = Seq.empty[File] @ArgumentCollection - private val settings = new QGraphSettings + val settings = new QGraphSettings private val qScriptManager = new QScriptManager private val qGraph = new QGraph @@ -91,7 +93,7 @@ class QCommandLine extends CommandLineProgram with Logging { private lazy val pluginManager = { qScriptClasses = IOUtils.tempDir("Q-Classes-", "", settings.qSettings.tempDirectory) qScriptManager.loadScripts(scripts, qScriptClasses) - new PluginManager[QScript](classOf[QScript], List(qScriptClasses.toURI.toURL)) + new PluginManager[QScript](classOf[QScript], Seq(qScriptClasses.toURI.toURL)) } QFunction.parsingEngine = new ParsingEngine(this) @@ -101,12 +103,16 @@ class QCommandLine extends CommandLineProgram with Logging { * functions, and then builds and runs a QGraph based on the dependencies. */ def execute = { + if (settings.qSettings.runName == null) + settings.qSettings.runName = FilenameUtils.removeExtension(scripts.head.getName) + qGraph.settings = settings val allQScripts = pluginManager.createAllTypes(); for (script <- allQScripts) { logger.info("Scripting " + pluginManager.getName(script.getClass.asSubclass(classOf[QScript]))) loadArgumentsIntoObject(script) + script.qSettings = settings.qSettings try { script.script() } catch { @@ -120,22 +126,34 @@ class QCommandLine extends CommandLineProgram with Logging { // Execute the job graph qGraph.run() + val functionsAndStatus = qGraph.getFunctionsAndStatus + val success = qGraph.success + // walk over each script, calling onExecutionDone for (script <- allQScripts) { - script.onExecutionDone(qGraph.getFunctionsAndStatus(script.functions), qGraph.success) - if ( ! 
settings.disableJobReport ) { - val jobStringName = (QScriptUtils.?(settings.jobReportFile)).getOrElse(settings.qSettings.jobNamePrefix + ".jobreport.txt") - - if (!shuttingDown) { - val reportFile = new File(jobStringName) - logger.info("Writing JobLogging GATKReport to file " + reportFile) - QJobReport.printReport(qGraph.getFunctionsAndStatus(script.functions), reportFile) - - if ( settings.run ) { - val pdfFile = new File(jobStringName + ".pdf") - logger.info("Plotting JobLogging GATKReport to file " + pdfFile) - QJobReport.plotReport(reportFile, pdfFile) - } + val scriptFunctions = functionsAndStatus.filterKeys(f => script.functions.contains(f)) + script.onExecutionDone(scriptFunctions, success) + } + + logger.info("Script %s with %d total jobs".format(if (success) "completed successfully" else "failed", functionsAndStatus.size)) + + if (!settings.disableJobReport) { + val jobStringName = { + if (settings.jobReportFile != null) + settings.jobReportFile + else + settings.qSettings.runName + ".jobreport.txt" + } + + if (!shuttingDown) { + val reportFile = IOUtils.absolute(settings.qSettings.runDirectory, jobStringName) + logger.info("Writing JobLogging GATKReport to file " + reportFile) + QJobReport.printReport(functionsAndStatus, reportFile) + + if (settings.run) { + val pdfFile = IOUtils.absolute(settings.qSettings.runDirectory, FilenameUtils.removeExtension(jobStringName) + ".pdf") + logger.info("Plotting JobLogging GATKReport to file " + pdfFile) + QJobReport.plotReport(reportFile, pdfFile) } } } @@ -179,20 +197,20 @@ class QCommandLine extends CommandLineProgram with Logging { override def getApplicationDetails : ApplicationDetails = { new ApplicationDetails(createQueueHeader(), - List.empty[String], + Seq.empty[String], ApplicationDetails.createDefaultRunningInstructions(getClass.asInstanceOf[Class[CommandLineProgram]]), "") } - private def createQueueHeader() : List[String] = { - List(String.format("Queue v%s, Compiled %s", getQueueVersion, getBuildTimestamp), - "Copyright (c) 2011 The Broad Institute", + private def createQueueHeader() : Seq[String] = { + Seq(String.format("Queue v%s, Compiled %s", getQueueVersion, getBuildTimestamp), + "Copyright (c) 2012 The Broad Institute", "Please view our documentation at http://www.broadinstitute.org/gsa/wiki", "For support, please view our support site at http://getsatisfaction.com/gsa") } private def getQueueVersion : String = { - var stingResources : ResourceBundle = TextFormattingUtils.loadResourceBundle("StingText") + val stingResources : ResourceBundle = TextFormattingUtils.loadResourceBundle("StingText") if ( stingResources.containsKey("org.broadinstitute.sting.queue.QueueVersion.version") ) { stingResources.getString("org.broadinstitute.sting.queue.QueueVersion.version") @@ -203,7 +221,7 @@ class QCommandLine extends CommandLineProgram with Logging { } private def getBuildTimestamp : String = { - var stingResources : ResourceBundle = TextFormattingUtils.loadResourceBundle("StingText") + val stingResources : ResourceBundle = TextFormattingUtils.loadResourceBundle("StingText") if ( stingResources.containsKey("build.timestamp") ) { stingResources.getString("build.timestamp") diff --git a/public/scala/src/org/broadinstitute/sting/queue/QScript.scala b/public/scala/src/org/broadinstitute/sting/queue/QScript.scala index fce65c9970..6f887ea002 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/QScript.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QScript.scala @@ -27,7 +27,6 @@ package 
org.broadinstitute.sting.queue import engine.JobRunInfo import org.broadinstitute.sting.queue.function.QFunction import annotation.target.field -import io.Source import util.{StringFileConversions, PrimitiveOptionConversions, Logging} /** @@ -53,6 +52,11 @@ trait QScript extends Logging with PrimitiveOptionConversions with StringFileCon type ArgumentCollection = org.broadinstitute.sting.commandline.ArgumentCollection @field type Gather = org.broadinstitute.sting.commandline.Gather @field + /** + * Default settings for QFunctions + */ + var qSettings: QSettings = _ + /** * Builds the CommandLineFunctions that will be used to run this script and adds them to this.functions directly or using the add() utility method. */ @@ -60,18 +64,14 @@ trait QScript extends Logging with PrimitiveOptionConversions with StringFileCon /** * A default handler for the onExecutionDone() function. By default this doesn't do anything - * except print out a fine status message. */ def onExecutionDone(jobs: Map[QFunction, JobRunInfo], success: Boolean) { - logger.info("Script %s with %d total jobs".format(if (success) "completed successfully" else "failed", jobs.size)) - // this is too much output - // for ( (f, info) <- jobs ) logger.info(" %s %s".format(f.jobName, info)) } /** * The command line functions that will be executed for this QScript. */ - var functions = List.empty[QFunction] + var functions = Seq.empty[QFunction] /** * Exchanges the extension on a file. @@ -98,22 +98,20 @@ trait QScript extends Logging with PrimitiveOptionConversions with StringFileCon * Adds one or more command line functions to be run. * @param functions Functions to add. */ - def add(functions: QFunction*) = { + def add(functions: QFunction*) { functions.foreach(function => function.addOrder = QScript.nextAddOrder) this.functions ++= functions } - def addAll(functions: List[QFunction]) { + def addAll(functions: Seq[QFunction]) { functions.foreach( f => add(f) ) } - - def extractFileEntries(in: File): List[File] = Source.fromFile(in).getLines().toList } object QScript { private var addOrder = 0 private def nextAddOrder = { addOrder += 1 - List(addOrder) + Seq(addOrder) } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/QScriptManager.scala b/public/scala/src/org/broadinstitute/sting/queue/QScriptManager.scala index 512a9f8dd1..74487917fc 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QScriptManager.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QScriptManager.scala @@ -20,7 +20,7 @@ class QScriptManager() extends Logging { * Compiles and loads the scripts in the files into the current classloader. 
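With the hunk above, the default onExecutionDone no longer prints anything (the per-run summary line moved into QCommandLine), so a script that wants its own end-of-run reporting overrides the hook. A minimal sketch against the signature shown above, assuming only the Queue types this patch already imports; ReportingScript is hypothetical:

```scala
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.engine.JobRunInfo
import org.broadinstitute.sting.queue.function.QFunction

// Sketch: restore the old per-script summary by overriding the now-empty hook.
class ReportingScript extends QScript {
  def script() { /* add(...) command line functions here */ }

  override def onExecutionDone(jobs: Map[QFunction, JobRunInfo], success: Boolean) {
    logger.info("Script %s with %d total jobs".format(
      if (success) "completed successfully" else "failed", jobs.size))
  }
}
```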
* Heavily based on scala/src/compiler/scala/tools/ant/Scalac.scala */ - def loadScripts(scripts: List[File], tempDir: File) { + def loadScripts(scripts: Seq[File], tempDir: File) { if (scripts.size > 0) { val settings = new Settings((error: String) => logger.error(error)) settings.deprecation.value = true @@ -36,7 +36,7 @@ class QScriptManager() extends Logging { logger.info("Compiling %s QScript%s".format(scripts.size, plural(scripts.size))) logger.debug("Compilation directory: " + settings.outdir.value) - run.compileFiles(scripts.map(new PlainFile(_))) + run.compileFiles(scripts.toList.map(new PlainFile(_))) reporter.printSummary() if (reporter.hasErrors) { diff --git a/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala b/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala index e8ac26a574..d9fed4ce8b 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -25,15 +25,14 @@ package org.broadinstitute.sting.queue import java.io.File -import org.broadinstitute.sting.commandline.{ArgumentCollection, Argument} -import org.broadinstitute.sting.queue.util.{SystemUtils, EmailSettings} +import org.broadinstitute.sting.commandline.Argument /** * Default settings settable on the command line and passed to CommandLineFunctions. */ class QSettings { - @Argument(fullName="job_name_prefix", shortName="jobPrefix", doc="Default name prefix for compute farm jobs.", required=false) - var jobNamePrefix: String = QSettings.processNamePrefix + @Argument(fullName="run_name", shortName="runName", doc="A name for this run used for various status messages.", required=false) + var runName: String = _ @Argument(fullName="job_project", shortName="jobProject", doc="Default project for compute farm jobs.", required=false) var jobProject: String = _ @@ -45,13 +44,13 @@ class QSettings { var jobPriority: Option[Int] = None @Argument(fullName="job_native_arg", shortName="jobNative", doc="Native arguments to pass to the job runner.", required=false) - var jobNativeArgs: List[String] = Nil + var jobNativeArgs: Seq[String] = Nil @Argument(fullName="job_resource_request", shortName="jobResReq", doc="Resource requests to pass to the job runner.", required=false) - var jobResourceRequests: List[String] = Nil + var jobResourceRequests: Seq[String] = Nil @Argument(fullName="job_environment_name", shortName="jobEnv", doc="Environment names for the job runner.", required=false) - var jobEnvironmentNames: List[String] = Nil + var jobEnvironmentNames: Seq[String] = Nil @Argument(fullName="memory_limit", shortName="memLimit", doc="Default memory limit for jobs, in gigabytes.", required=false) var memoryLimit: Option[Double] = None @@ -77,15 +76,4 @@ class QSettings { @Argument(fullName="job_scatter_gather_directory", shortName="jobSGDir", doc="Default directory to place scatter gather output for compute farm jobs.", required=false) var jobScatterGatherDirectory: File = _ - - @ArgumentCollection - val emailSettings = new EmailSettings -} - -/** - * Default settings settable on the command line and passed to CommandLineFunctions. - */ -object QSettings { - /** A semi-unique job prefix using the host name and the process id. 
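The recurring List-to-Seq change in these hunks (scripts, functions, jobNativeArgs, and so on) is an API-widening move: a Seq[T] parameter or field accepts any sequence implementation, and a concrete List is materialized only where an API demands one, as compileFiles does above via scripts.toList. A plain-Scala illustration with hypothetical names:

```scala
// Seq in the signature lets callers choose their collection; convert only at the boundary.
object SeqWidening {
  def describe(scripts: Seq[String]): String =
    "Compiling %s QScript%s".format(scripts.size, if (scripts.size == 1) "" else "s")

  def main(args: Array[String]) {
    println(describe(Vector("Pipeline.scala")))                // any Seq works...
    println(describe(List("A.scala", "B.scala")))              // ...including List
    val forLegacyApi: List[String] = Vector("C.scala").toList  // cf. scripts.toList above
  }
}
```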
*/ - private val processNamePrefix = "Q-" + SystemUtils.pidAtHost } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/FunctionEdge.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/FunctionEdge.scala index 55ed942672..8225d28ab3 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/FunctionEdge.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/FunctionEdge.scala @@ -1,3 +1,27 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.queue.engine import org.broadinstitute.sting.queue.function.QFunction @@ -28,15 +52,18 @@ class FunctionEdge(val function: QFunction, val inputs: QNode, val outputs: QNod val myRunInfo: JobRunInfo = JobRunInfo.default // purely for dryRun testing + /** + * When using reset status this variable tracks the old status + */ + var resetFromStatus: RunnerStatus.Value = null + /** * Initializes with the current status of the function. */ private var currentStatus = { - val isDone = function.isDone - val isFail = function.isFail - if (isFail.isDefined && isFail.get) + if (function.isFail) RunnerStatus.FAILED - else if (isDone.isDefined && isDone.get) + else if (function.isDone) RunnerStatus.DONE else RunnerStatus.PENDING @@ -136,13 +163,15 @@ class FunctionEdge(val function: QFunction, val inputs: QNode, val outputs: QNod * Resets the edge to pending status. */ def resetToPending(cleanOutputs: Boolean) { + if (resetFromStatus == null) + resetFromStatus = currentStatus currentStatus = RunnerStatus.PENDING if (cleanOutputs) function.deleteOutputs() runner = null } - override def dotString = function.dotString + override def shortDescription = function.shortDescription /** * Returns the path to the file to use for logging errors. 
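FunctionEdge's initial status above is a simple three-way priority, with a failure marker trumping a done marker (isDone and isFail apparently now return plain Booleans rather than Option[Boolean]). As a freestanding sketch:

```scala
// Sketch of FunctionEdge's initial-status decision: FAILED beats DONE, else PENDING.
object FunctionStatusSketch extends Enumeration {
  val PENDING, DONE, FAILED = Value

  def initialStatus(isDone: Boolean, isFail: Boolean): Value =
    if (isFail) FAILED
    else if (isDone) DONE
    else PENDING
}
```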
diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/InProcessRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/InProcessRunner.scala index d006cde4b4..be5622360b 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/InProcessRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/InProcessRunner.scala @@ -3,7 +3,8 @@ package org.broadinstitute.sting.queue.engine import org.broadinstitute.sting.queue.function.InProcessFunction import java.util.Date import org.broadinstitute.sting.utils.Utils -import org.apache.commons.io.FileUtils +import org.apache.commons.io.{IOUtils, FileUtils} +import java.io.PrintStream /** * Runs a function that executes in process and does not fork out an external process. @@ -16,12 +17,24 @@ class InProcessRunner(val function: InProcessFunction) extends JobRunner[InProce getRunInfo.exechosts = Utils.resolveHostname() runStatus = RunnerStatus.RUNNING - function.run() + function.jobOutputStream = new PrintStream(FileUtils.openOutputStream(function.jobOutputFile)) + function.jobErrorStream = { + if (function.jobErrorFile != null) + new PrintStream(FileUtils.openOutputStream(function.jobErrorFile)) + else + function.jobOutputStream + } + try { + function.run() + function.jobOutputStream.println("%s%nDone.".format(function.description)) + } finally { + IOUtils.closeQuietly(function.jobOutputStream) + if (function.jobErrorFile != null) + IOUtils.closeQuietly(function.jobErrorStream) + } - getRunInfo.doneTime = new Date() - val content = "%s%nDone.".format(function.description) - FileUtils.writeStringToFile(function.jobOutputFile, content) runStatus = RunnerStatus.DONE + getRunInfo.doneTime = new Date() } def status = runStatus diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/MappingEdge.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/MappingEdge.scala index 1d56009f33..17f0561faa 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/MappingEdge.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/MappingEdge.scala @@ -1,3 +1,27 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
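The InProcessRunner rework above opens the job's output stream explicitly, reuses it as the error stream when no separate error file is configured, and closes everything in a finally block. The same shape as a self-contained sketch using plain java.io (the commons-io calls in the hunk, FileUtils.openOutputStream and IOUtils.closeQuietly, add parent-directory creation and exception swallowing on top of this; RunWithStreams is hypothetical):

```scala
import java.io.{File, FileOutputStream, PrintStream}

// Sketch of the open/share-or-split/always-close stream lifecycle.
object RunWithStreams {
  def run(outputFile: File, errorFile: Option[File])(body: (PrintStream, PrintStream) => Unit) {
    val out = new PrintStream(new FileOutputStream(outputFile))
    val err = errorFile.map(f => new PrintStream(new FileOutputStream(f))).getOrElse(out)
    try {
      body(out, err)
      out.println("Done.")            // cf. the "Done." marker written by the runner
    } finally {
      out.close()
      if (err ne out) err.close()     // close err only when it is a separate stream
    }
  }
}
```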
+ */ + package org.broadinstitute.sting.queue.engine /** @@ -10,5 +34,5 @@ class MappingEdge(val inputs: QNode, val outputs: QNode) extends QEdge { * @return */ override def toString = "" - override def dotString = "" + override def shortDescription = "" } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QEdge.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/QEdge.scala index 1608e3c088..e40a868675 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/QEdge.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/QEdge.scala @@ -1,3 +1,27 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.queue.engine /** @@ -15,9 +39,9 @@ trait QEdge { def outputs: QNode /** - * The function description in .dot files + * The short description */ - def dotString = "" + def shortDescription = "" override def hashCode = inputs.hashCode + outputs.hashCode diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala index 42ddf91040..cee2c6e56a 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -30,7 +30,6 @@ import scala.collection.JavaConversions._ import org.jgrapht.alg.CycleDetector import org.jgrapht.EdgeFactory import org.jgrapht.ext.DOTExporter -import java.io.File import org.jgrapht.event.{TraversalListenerAdapter, EdgeTraversalEvent} import org.broadinstitute.sting.queue.QException import org.broadinstitute.sting.queue.function.{InProcessFunction, CommandLineFunction, QFunction} @@ -40,7 +39,8 @@ import collection.immutable.{TreeSet, TreeMap} import org.broadinstitute.sting.queue.function.scattergather.{ScatterFunction, CloneFunction, GatherFunction, ScatterGatherableFunction} import java.util.Date import org.broadinstitute.sting.utils.Utils -import org.broadinstitute.sting.utils.io.IOUtils +import org.apache.commons.io.{FileUtils, IOUtils} +import java.io.{OutputStreamWriter, File} /** * The internal dependency tracker between sets of function input and output files. 
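The comment above captures QGraph's central idea: jobs are never wired together explicitly; edges fall out of which files one function writes and another reads. A toy model of that wiring (Job and FileDependencies are hypothetical, not the patch's types):

```scala
import java.io.File

// Toy model: an edge producer -> consumer exists whenever their file sets overlap.
case class Job(name: String, inputs: Set[File], outputs: Set[File])

object FileDependencies {
  def dependencies(jobs: Seq[Job]): Seq[(Job, Job)] =
    for {
      producer <- jobs
      consumer <- jobs
      if producer != consumer && (producer.outputs & consumer.inputs).nonEmpty
    } yield (producer, consumer)
}
```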
@@ -69,7 +69,7 @@ class QGraph extends Logging { private val commandLinePluginManager = new CommandLinePluginManager private var commandLineManager: CommandLineJobManager[CommandLineJobRunner] = _ private val inProcessManager = new InProcessJobManager - private def managers = List[Any](inProcessManager, commandLineManager) + private def managers = Seq[Any](inProcessManager, commandLineManager) private class StatusCounts { var pending = 0 @@ -88,9 +88,14 @@ class QGraph extends Logging { runningLock.synchronized { if (running) { command.qSettings = settings.qSettings - command.freeze - val inputs = getQNode(command.inputs.toList.sorted(fileOrdering)) - val outputs = getQNode(command.outputs.toList.sorted(fileOrdering)) + command.freeze() + val inputFiles = command.inputs + var outputFiles = command.outputs + outputFiles :+= command.jobOutputFile + if (command.jobErrorFile != null) + outputFiles :+= command.jobErrorFile + val inputs = getQNode(inputFiles.sorted(fileOrdering)) + val outputs = getQNode(outputFiles.sorted(fileOrdering)) addEdge(new FunctionEdge(command, inputs, outputs)) } } @@ -106,8 +111,8 @@ class QGraph extends Logging { def run() { runningLock.synchronized { if (running) { - IOUtils.checkTempDir(settings.qSettings.tempDirectory) - fillGraph + org.broadinstitute.sting.utils.io.IOUtils.checkTempDir(settings.qSettings.tempDirectory) + fillGraph() val isReady = numMissingValues == 0 if (this.jobGraph.edgeSet.isEmpty) { @@ -133,11 +138,11 @@ class QGraph extends Logging { } } - private def fillGraph { + private def fillGraph() { logger.info("Generating graph.") - fill - if (settings.dotFile != null) - renderToDot(settings.dotFile) + fill() + if (settings.graphvizFile != null) + renderGraph(settings.graphvizFile) validate() if (running && numMissingValues == 0) { @@ -145,7 +150,7 @@ class QGraph extends Logging { if (!scatterGathers.isEmpty) { logger.info("Generating scatter gather jobs.") - var addedFunctions = List.empty[QFunction] + var addedFunctions = Seq.empty[QFunction] for (scatterGather <- scatterGathers) { val functions = scatterGather.asInstanceOf[FunctionEdge] .function.asInstanceOf[ScatterGatherableFunction] @@ -161,10 +166,10 @@ class QGraph extends Logging { addedFunctions.foreach(function => if (running) this.add(function)) logger.info("Regenerating graph.") - fill - val scatterGatherDotFile = if (settings.expandedDotFile != null) settings.expandedDotFile else settings.dotFile + fill() + val scatterGatherDotFile = if (settings.graphvizScatterGatherFile != null) settings.graphvizScatterGatherFile else settings.graphvizFile if (scatterGatherDotFile != null) - renderToDot(scatterGatherDotFile) + renderGraph(scatterGatherDotFile) validate() } } @@ -187,8 +192,8 @@ class QGraph extends Logging { * @param edge Graph edge to examine for the previous functions. * @return A list of prior function edges. */ - private def previousFunctions(edge: QEdge): List[FunctionEdge] = { - var previous = List.empty[FunctionEdge] + private def previousFunctions(edge: QEdge): Seq[FunctionEdge] = { + var previous = Seq.empty[FunctionEdge] val source = this.jobGraph.getEdgeSource(edge) for (incomingEdge <- this.jobGraph.incomingEdgesOf(source)) { incomingEdge match { @@ -208,8 +213,8 @@ class QGraph extends Logging { * @param edge Graph edge to examine for the next functions. * @return A list of prior function edges. 
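previousFunctions and nextFunctions above walk one step backwards or forwards through the graph, and getReadyJobs combines the backward walk with status: a job is ready when it is pending and every predecessor is done. That selection, sketched generically over any job type:

```scala
// Sketch of the getReadyJobs predicate over any job type J.
object ReadyJobs extends Enumeration {
  val PENDING, RUNNING, DONE, FAILED = Value

  def ready[J](jobs: Seq[J], predecessors: J => Seq[J], status: J => Value): Seq[J] =
    jobs.filter(j => status(j) == PENDING && predecessors(j).forall(status(_) == DONE))
}
```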
*/ - private def nextFunctions(edge: QEdge): List[FunctionEdge] = { - var next = List.empty[FunctionEdge] + private def nextFunctions(edge: QEdge): Seq[FunctionEdge] = { + var next = Seq.empty[FunctionEdge] val target = this.jobGraph.getEdgeTarget(edge) for (outgoingEdge <- this.jobGraph.outgoingEdgesOf(target)) { outgoingEdge match { @@ -238,7 +243,7 @@ class QGraph extends Logging { */ private def fillIn() { // clone since edgeSet is backed by the graph - asScalaSet(jobGraph.edgeSet).clone.foreach(edge => { + asScalaSet(jobGraph.edgeSet).clone().foreach(edge => { if (running) edge match { case cmd: FunctionEdge => { addCollectionOutputs(cmd.outputs) @@ -249,7 +254,7 @@ class QGraph extends Logging { }) } - private def getReadyJobs(): Set[FunctionEdge] = { + private def getReadyJobs: Set[FunctionEdge] = { jobGraph.edgeSet.filter{ case f: FunctionEdge => this.previousFunctions(f).forall(_.status == RunnerStatus.DONE) && f.status == RunnerStatus.PENDING @@ -317,33 +322,39 @@ class QGraph extends Logging { updateGraphStatus(false) - var readyJobs = getReadyJobs() + var readyJobs = getReadyJobs while (running && readyJobs.size > 0) { logger.debug("+++++++") - foreachFunction(readyJobs.toList, edge => { + foreachFunction(readyJobs.toSeq, edge => { if (running) { edge.myRunInfo.startTime = new Date() edge.getRunInfo.exechosts = Utils.resolveHostname() logEdge(edge) edge.myRunInfo.doneTime = new Date() - edge.markAsDone + edge.markAsDone() } }) - readyJobs = getReadyJobs() + readyJobs = getReadyJobs } } private def logEdge(edge: FunctionEdge) { logger.info("-------") + logger.info("%-8s %s".format(StringUtils.capitalize(edge.status.toString) + ":", edge.function.description)) if (logger.isDebugEnabled) { - logger.debug("Inputs: " + edge.inputs) + logger.debug("Inputs: " + edge.inputs) + logger.debug("Outputs: " + edge.outputs) + logger.debug("Done+: " + edge.function.doneOutputs.filter(_.exists())) + logger.debug("Done-: " + edge.function.doneOutputs.filterNot(_.exists())) + logger.debug("CmdDir: " + edge.function.commandDirectory) + logger.debug("Temp?: " + edge.function.isIntermediate) + logger.debug("Prev: " + + (if (edge.resetFromStatus == null) "none" else StringUtils.capitalize(edge.resetFromStatus.toString)) + + " (reset = " + (edge.resetFromStatus != null && edge.resetFromStatus != edge.status) + ")" ) } - logger.info(StringUtils.capitalize(edge.status.toString) + ": " + edge.function.description) - if (logger.isDebugEnabled) - logger.debug(edge.function.commandDirectory + " > " + edge.function.description) - logger.info("Log: " + edge.function.jobOutputFile.getAbsolutePath) + logger.info("Log: " + edge.function.jobOutputFile.getAbsolutePath) if (edge.function.jobErrorFile != null) - logger.info("Error: " + edge.function.jobErrorFile.getAbsolutePath) + logger.info("Error: " + edge.function.jobErrorFile.getAbsolutePath) } /** @@ -380,7 +391,7 @@ class QGraph extends Logging { updateGraphStatus(true) var readyJobs = TreeSet.empty[FunctionEdge](functionOrdering) - readyJobs ++= getReadyJobs() + readyJobs ++= getReadyJobs runningJobs = Set.empty[FunctionEdge] var lastRunningCheck = System.currentTimeMillis var logNextStatusCounts = true @@ -407,7 +418,7 @@ class QGraph extends Logging { statusCounts.running += startedJobs.size if (logNextStatusCounts) - logStatusCounts + logStatusCounts() logNextStatusCounts = false deleteCleanup(lastRunningCheck) @@ -456,10 +467,10 @@ class QGraph extends Logging { checkRetryJobs(failedJobs) } - readyJobs ++= getReadyJobs() + readyJobs ++= getReadyJobs } - 
logStatusCounts + logStatusCounts() deleteCleanup(-1) } catch { case e => @@ -476,7 +487,7 @@ private def nextRunningCheck(lastRunningCheck: Long) = ((30 * 1000L) - (System.currentTimeMillis - lastRunningCheck)) - private def logStatusCounts { + private def logStatusCounts() { logger.info("%d Pend, %d Run, %d Fail, %d Done".format( statusCounts.pending, statusCounts.running, statusCounts.failed, statusCounts.done)) } @@ -532,7 +543,8 @@ } if (edge.status == RunnerStatus.DONE || edge.status == RunnerStatus.SKIPPED) { - logger.debug("Already done: " + edge.function.description) + if (logger.isDebugEnabled) + logEdge(edge) addCleanup(edge) } } @@ -546,12 +558,12 @@ } /** - * Checks if the function should have their outptus removed after they finish running - * @param edges Function to check + * Checks if the function should have its outputs removed after it finishes running + * @param edge Function to check */ private def addCleanup(edge: FunctionEdge) { if (!settings.keepIntermediates) - if (edge.function.isIntermediate && edge.function.deleteIntermediateOutputs) + if (edge.function.isIntermediate) cleanupJobs += edge } @@ -601,14 +613,16 @@ * From the previous edges, resets any that are marked as skipped to pending. * If those that are reset have skipped edges, those skipped edges are recursively also set * to pending. + * Any edges after this edge are also reset to pending. * @param edge Dependent edge. * @param previous Previous edges that provide inputs to edge. - * @param cleanOutputs If true will clean up the output files when resetting skipped jobs to pending. + * @param cleanOutputs If true will clean up the output files when resetting jobs to pending. 
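Per the updated comment above, a reset now propagates in both directions: skipped predecessors are reawakened so their outputs exist again, and anything downstream that already ran is invalidated. The recursion, sketched over a mutable status map (the real resetToPending also deletes output files when cleanOutputs is set):

```scala
import scala.collection.mutable

// Sketch of the bidirectional reset walk from resetPreviousSkipped.
object ResetSketch extends Enumeration {
  val PENDING, DONE, FAILED, SKIPPED = Value

  def reset[J](edge: J,
               previous: J => Seq[J],
               next: J => Seq[J],
               status: mutable.Map[J, Value]) {
    val toReset = previous(edge).filter(status(_) == SKIPPED) ++
                  next(edge).filter(status(_) != PENDING)
    for (e <- toReset) {
      status(e) = PENDING        // real code: resetToPending(cleanOutputs)
      reset(e, previous, next, status)
    }
  }
}
```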
*/ - private def resetPreviousSkipped(edge: FunctionEdge, previous: List[FunctionEdge], cleanOutputs: Boolean) { - for (previousEdge <- previous.filter(_.status == RunnerStatus.SKIPPED)) { - previousEdge.resetToPending(cleanOutputs) - resetPreviousSkipped(previousEdge, this.previousFunctions(previousEdge), cleanOutputs) + private def resetPreviousSkipped(edge: FunctionEdge, previous: Seq[FunctionEdge], cleanOutputs: Boolean) { + val edges = previous.filter(_.status == RunnerStatus.SKIPPED) ++ this.nextFunctions(edge).filter(_.status != RunnerStatus.PENDING) + for (resetEdge <- edges) { + resetEdge.resetToPending(cleanOutputs) + resetPreviousSkipped(resetEdge, this.previousFunctions(resetEdge), cleanOutputs) } } @@ -628,9 +642,9 @@ class QGraph extends Logging { val emailMessage = new EmailMessage emailMessage.from = settings.statusEmailFrom emailMessage.to = settings.statusEmailTo - emailMessage.subject = "Queue function: Started: " + settings.qSettings.jobNamePrefix - addStartedFunctions(emailMessage, started.toList) - emailMessage.trySend(settings.qSettings.emailSettings) + emailMessage.subject = "Queue function: Started: " + settings.qSettings.runName + addStartedFunctions(emailMessage, started.toSeq) + emailMessage.trySend(settings.emailSettings) } } @@ -639,9 +653,9 @@ class QGraph extends Logging { val emailMessage = new EmailMessage emailMessage.from = settings.statusEmailFrom emailMessage.to = settings.statusEmailTo - emailMessage.subject = "Queue function: Failure: " + settings.qSettings.jobNamePrefix - addFailedFunctions(emailMessage, failed.toList) - emailMessage.trySend(settings.qSettings.emailSettings) + emailMessage.subject = "Queue function: Failure: " + settings.qSettings.runName + addFailedFunctions(emailMessage, failed.toSeq) + emailMessage.trySend(settings.emailSettings) } } @@ -665,7 +679,7 @@ class QGraph extends Logging { private def emailStatus() { if (running && settings.statusEmailTo.size > 0) { - var failed = List.empty[FunctionEdge] + var failed = Seq.empty[FunctionEdge] foreachFunction(edge => { if (edge.status == RunnerStatus.FAILED) { failed :+= edge @@ -677,16 +691,16 @@ class QGraph extends Logging { emailMessage.to = settings.statusEmailTo emailMessage.body = getStatus + nl if (failed.size == 0) { - emailMessage.subject = "Queue run: Success: " + settings.qSettings.jobNamePrefix + emailMessage.subject = "Queue run: Success: " + settings.qSettings.runName } else { - emailMessage.subject = "Queue run: Failure: " + settings.qSettings.jobNamePrefix + emailMessage.subject = "Queue run: Failure: " + settings.qSettings.runName addFailedFunctions(emailMessage, failed) } - emailMessage.trySend(settings.qSettings.emailSettings) + emailMessage.trySend(settings.emailSettings) } } - private def addStartedFunctions(emailMessage: EmailMessage, started: List[FunctionEdge]) { + private def addStartedFunctions(emailMessage: EmailMessage, started: Seq[FunctionEdge]) { if (emailMessage.body == null) emailMessage.body = "" emailMessage.body += """ @@ -697,7 +711,7 @@ class QGraph extends Logging { started.map(edge => emailDescription(edge)).mkString(nl+nl)) } - private def addFailedFunctions(emailMessage: EmailMessage, failed: List[FunctionEdge]) { + private def addFailedFunctions(emailMessage: EmailMessage, failed: Seq[FunctionEdge]) { val logs = failed.flatMap(edge => logFiles(edge)) if (emailMessage.body == null) @@ -725,7 +739,7 @@ class QGraph extends Logging { } private def logFiles(edge: FunctionEdge) = { - var failedOutputs = List.empty[File] + var failedOutputs = 
Seq.empty[File] failedOutputs :+= edge.function.jobOutputFile if (edge.function.jobErrorFile != null) failedOutputs :+= edge.function.jobErrorFile @@ -762,14 +776,14 @@ class QGraph extends Logging { private def getStatus = { val buffer = new StringBuilder doStatus(status => buffer.append(status).append(nl)) - buffer.toString + buffer.toString() } /** * Gets job statuses by traversing the graph and looking for status-related files */ - private def doStatus(statusFunc: String => Unit) = { - var statuses = List.empty[AnalysisStatus] + private def doStatus(statusFunc: String => Unit) { + var statuses = Seq.empty[AnalysisStatus] var maxWidth = 0 foreachFunction(edge => { val name = edge.function.analysisName @@ -860,7 +874,7 @@ class QGraph extends Logging { private def newGraph = new SimpleDirectedGraph[QNode, QEdge](new EdgeFactory[QNode, QEdge] { def createEdge(input: QNode, output: QNode) = new MappingEdge(input, output)}) - private def getQNode(files: List[File]) = { + private def getQNode(files: Seq[File]) = { nodeMap.get(files) match { case Some(node) => node @@ -888,7 +902,7 @@ class QGraph extends Logging { if (inputs.files.size > 1) for (file <- inputs.files) { if (running) { - val input = getQNode(List(file)) + val input = getQNode(Seq(file)) if (!jobGraph.containsEdge(input, inputs)) addEdge(new MappingEdge(input, inputs)) } @@ -903,7 +917,7 @@ class QGraph extends Logging { if (outputs.files.size > 1) for (file <- outputs.files) { if (running) { - val output = getQNode(List(file)) + val output = getQNode(Seq(file)) if (!jobGraph.containsEdge(outputs, output)) addEdge(new MappingEdge(outputs, output)) } @@ -937,37 +951,36 @@ class QGraph extends Logging { /** * Utility function for running a method over all function edges. - * @param edgeFunction Function to run for each FunctionEdge. + * @param f Function to run for each FunctionEdge. */ private def foreachFunction(f: (FunctionEdge) => Unit) { - foreachFunction(jobGraph.edgeSet.toList.filter(_.isInstanceOf[FunctionEdge]).asInstanceOf[List[FunctionEdge]], f) + foreachFunction(jobGraph.edgeSet.toSeq.filter(_.isInstanceOf[FunctionEdge]).asInstanceOf[Seq[FunctionEdge]], f) } /** * Utility function for running a method over a list of function edges. - * @param edegs Edges to traverse. - * @param edgeFunction Function to run for each FunctionEdge. + * @param edges Edges to traverse. + * @param f Function to run for each FunctionEdge. */ - private def foreachFunction(edges: List[FunctionEdge], f: (FunctionEdge) => Unit) { + private def foreachFunction(edges: Seq[FunctionEdge], f: (FunctionEdge) => Unit) { edges.sorted(functionOrdering).foreach(edge => if (running) f(edge)) } /** - * Utility function for running a method over all function edges. - * @param edgeFunction Function to run for each FunctionEdge. + * Utility function returning all function edges. */ - private def getFunctionEdges: List[FunctionEdge] = { - jobGraph.edgeSet.toList.filter(_.isInstanceOf[FunctionEdge]).asInstanceOf[List[FunctionEdge]] + private def getFunctionEdges: Seq[FunctionEdge] = { + jobGraph.edgeSet.toSeq.filter(_.isInstanceOf[FunctionEdge]).asInstanceOf[Seq[FunctionEdge]] } /** * Utility function for running a method over all functions, but traversing the nodes in order of dependency. - * @param edgeFunction Function to run for each FunctionEdge. + * @param f Function to run for each FunctionEdge. 
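The getQNode/addEdge hunks above route every multi-file node through single-file mapping nodes, one per member file. A stand-alone sketch of that bridging step with toy Node/Edge classes (not the Queue QNode/MappingEdge types):

import java.io.File

object BridgeSketch extends App {
  case class Node(files: Seq[File])
  case class Edge(from: Node, to: Node)

  var edges = Seq.empty[Edge]

  // Mirrors "val input = getQNode(Seq(file)); addEdge(new MappingEdge(input, inputs))":
  // a node holding several files gets one incoming edge per single-file node.
  def bridgeInputs(inputs: Node) {
    if (inputs.files.size > 1)
      for (file <- inputs.files)
        edges :+= Edge(Node(Seq(file)), inputs)
  }

  bridgeInputs(Node(Seq(new File("a.bam"), new File("b.bam"))))
  edges.foreach(println)
}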
*/ private def traverseFunctions(f: (FunctionEdge) => Unit) { val iterator = new TopologicalOrderIterator(this.jobGraph) iterator.addTraversalListener(new TraversalListenerAdapter[QNode, QEdge] { - override def edgeTraversed(event: EdgeTraversalEvent[QNode, QEdge]) = { + override def edgeTraversed(event: EdgeTraversalEvent[QNode, QEdge]) { if (running) { event.getEdge match { case functionEdge: FunctionEdge => f(functionEdge) @@ -980,23 +993,44 @@ } } /** - * Outputs the graph to a .dot file. + * Outputs the graph to a .gv DOT file. + * http://www.graphviz.org/Documentation.php * http://en.wikipedia.org/wiki/DOT_language - * @param file Path to output the .dot file. + * @param file Path to output the .gv file. */ - private def renderToDot(file: java.io.File) { - val out = new java.io.FileWriter(file) + private def renderGraph(file: java.io.File) { + val vertexIDProvider = new org.jgrapht.ext.VertexNameProvider[QNode] { + def getVertexName(node: QNode) = node.id.toString + } + + val vertexLabelProvider = new org.jgrapht.ext.VertexNameProvider[QNode] { + // The QGraph fills in single-file nodes between nodes that contain more than one file. + // We only need to display the single element nodes. + def getVertexName(node: QNode) = { + if (!node.files.isEmpty && node.files.tail.isEmpty) + node.files.head.getName + else + "" + } + } - // todo -- we need a nice way to visualize the key pieces of information about commands. Perhaps a - // todo -- visualizeString() command, or something that shows inputs / outputs - val ve = new org.jgrapht.ext.EdgeNameProvider[QEdge] { - def getEdgeName(function: QEdge) = if (function.dotString == null) "" else function.dotString.replace("\"", "\\\"") + val edgeNameProvider = new org.jgrapht.ext.EdgeNameProvider[QEdge] { + def getEdgeName(edge: QEdge) = { + if (edge.shortDescription != null) + edge.shortDescription.replace("\"", "\\\"") + else + "" + } } - //val iterator = new TopologicalOrderIterator(qGraph.jobGraph) - (new DOTExporter(new org.jgrapht.ext.IntegerNameProvider[QNode](), null, ve)).export(out, jobGraph) + val exporter = new DOTExporter(vertexIDProvider, vertexLabelProvider, edgeNameProvider) - out.close + val out = new OutputStreamWriter(FileUtils.openOutputStream(file)) + try { + exporter.export(out, jobGraph) + } finally { + IOUtils.closeQuietly(out) + } } /** @@ -1054,7 +1088,7 @@ class QGraph extends Logging { */ def isShutdown = !running - def getFunctionsAndStatus(functions: List[QFunction]): Map[QFunction, JobRunInfo] = { + def getFunctionsAndStatus: Map[QFunction, JobRunInfo] = { getFunctionEdges.map(edge => (edge.function, edge.getRunInfo)).toMap } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraphSettings.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/QGraphSettings.scala index 56d6975a51..6d81d4bd71 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraphSettings.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/QGraphSettings.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -26,7 +26,7 @@ package org.broadinstitute.sting.queue.engine import java.io.File import org.broadinstitute.sting.queue.QSettings -import org.broadinstitute.sting.queue.util.SystemUtils +import org.broadinstitute.sting.queue.util.{EmailSettings, SystemUtils} import
org.broadinstitute.sting.commandline.{Advanced, ArgumentCollection, Argument} /** @@ -58,16 +58,16 @@ class QGraphSettings { var keepIntermediates = false @Argument(fullName="status_email_to", shortName="statusTo", doc="Email address to send emails to upon completion or on error.", required=false) - var statusEmailTo: List[String] = Nil + var statusEmailTo: Seq[String] = Nil @Argument(fullName="status_email_from", shortName="statusFrom", doc="Email address to send emails from upon completion or on error.", required=false) var statusEmailFrom: String = System.getProperty("user.name") + "@" + SystemUtils.mailName - @Argument(fullName="dot_graph", shortName="dot", doc="Outputs the queue graph to a .dot file. See: http://en.wikipedia.org/wiki/DOT_language", required=false) - var dotFile: File = _ + @Argument(fullName="graphviz", shortName="gv", doc="Outputs the queue graph to a Graphviz .gv file. See: http://www.graphviz.org/Documentation.php", required=false) + var graphvizFile: File = _ - @Argument(fullName="expanded_dot_graph", shortName="expandedDot", doc="Outputs the queue graph of scatter gather to a .dot file. Otherwise overwrites the dot_graph", required=false) - var expandedDotFile: File = _ + @Argument(fullName="graphviz_scatter_gather", shortName="gvsg", doc="Outputs the scatter/gather queue graph to a Graphviz .gv file. Otherwise overwrites the --graphviz file.", required=false) + var graphvizScatterGatherFile: File = _ @Argument(fullName="jobReport", shortName="jobReport", doc="File where we will write the Queue job report", required=false) var jobReportFile: String = _ @@ -76,6 +76,9 @@ class QGraphSettings { @Argument(fullName="disableJobReport", shortName="disabpleJobReport", doc="If provided, we will not create a job report", required=false) var disableJobReport: Boolean = false + @ArgumentCollection + val emailSettings = new EmailSettings + @ArgumentCollection val qSettings = new QSettings } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QNode.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/QNode.scala index a86c08aae5..a5c039a530 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/QNode.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/QNode.scala @@ -1,3 +1,27 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
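With the @ArgumentCollection above, email configuration now hangs off QGraphSettings itself, which is why the trySend call sites changed from settings.qSettings.emailSettings to settings.emailSettings. A toy shape of the new wiring (the field names below are invented for illustration; this patch does not show EmailSettings' internals):

class EmailSettingsToy {
  var smtpHost: String = "localhost" // hypothetical field
  var smtpPort: Int = 25             // hypothetical field
}

class QGraphSettingsToy {
  // previously reached via qSettings; now a sibling collection
  val emailSettings = new EmailSettingsToy
}

object EmailWiringSketch extends App {
  val settings = new QGraphSettingsToy
  println(settings.emailSettings.smtpHost + ":" + settings.emailSettings.smtpPort)
}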
+ */ + package org.broadinstitute.sting.queue.engine import java.io.File @@ -6,7 +30,7 @@ import java.io.File * Represents a state between QFunctions the directed acyclic QGraph * @param files The list of files that represent this node state ordered by file name. */ -class QNode (val id: Int, val files: List[File]) { +class QNode (val id: Int, val files: Seq[File]) { override def equals(obj: Any) = { obj match { case other: QNode => this.id == other.id @@ -16,5 +40,5 @@ class QNode (val id: Int, val files: List[File]) { override def hashCode = id - override def toString = files.toString + override def toString = files.toString() } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala index fca92a7a17..239f834820 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -69,7 +69,7 @@ class GridEngineJobRunner(session: Session, function: CommandLineFunction) exten if ( function.nCoresRequest.getOrElse(1) > 1 ) { if ( function.qSettings.dontRequestMultipleCores ) logger.warn("Sending multicore job %s to farm without requesting appropriate number of cores (%d)".format( - function.jobName, function.nCoresRequest.get)) + function.shortDescription, function.nCoresRequest.get)) else nativeSpec += " -pe %s %d".format(function.qSettings.parallelEnvironmentName, function.nCoresRequest.get) } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala index 5ef78500c8..de996d1870 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -35,8 +35,8 @@ import org.broadinstitute.sting.queue.engine.{RunnerStatus, CommandLineJobRunner import java.util.regex.Pattern import java.lang.StringBuffer import java.util.Date -import com.sun.jna.{Pointer, Structure, StringArray, NativeLong} -import com.sun.jna.ptr.{PointerByReference, IntByReference} +import com.sun.jna.{Structure, StringArray, NativeLong} +import com.sun.jna.ptr.IntByReference /** * Runs jobs on an LSF compute cluster. @@ -60,7 +60,6 @@ class Lsf706JobRunner(val function: CommandLineFunction) extends CommandLineJobR /** * Dispatches the function on the LSF cluster. - * @param function Command to run. 
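QNode's equals/hashCode shown above key on id alone, so nodes carrying identical file lists remain distinct graph vertices. A small self-contained demonstration of that identity rule:

import java.io.File

object NodeIdentitySketch extends App {
  class Node(val id: Int, val files: Seq[File]) {
    override def equals(obj: Any) = obj match {
      case other: Node => this.id == other.id
      case _ => false
    }
    override def hashCode = id
  }

  val shared = Seq(new File("x.txt"))
  val a = new Node(1, shared)
  val b = new Node(2, shared)
  println(a == b)                    // false: same files, different ids
  println(Map(a -> "A").contains(b)) // false for the same reason
}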
*/ def start() { Lsf706JobRunner.lsfLibLock.synchronized { @@ -110,7 +109,7 @@ class Lsf706JobRunner(val function: CommandLineFunction) extends CommandLineJobR if ( function.nCoresRequest.getOrElse(1) > 1 ) { if ( function.qSettings.dontRequestMultipleCores ) logger.warn("Sending multicore job %s to farm without requesting appropriate number of cores (%d)".format( - function.jobName, function.nCoresRequest.get)) + function.shortDescription, function.nCoresRequest.get)) else { request.numProcessors = function.nCoresRequest.get request.maxNumProcessors = request.numProcessors @@ -298,7 +297,7 @@ object Lsf706JobRunner extends Logging { runner.getRunInfo.doneTime = new Date(jobInfo.endTime.longValue * 1000) val exHostsRaw = jobInfo.exHosts.getStringArray(0) //logger.warn("exHostsRaw = " + exHostsRaw) - val exHostsList = exHostsRaw.toList + val exHostsList = exHostsRaw.toSeq //logger.warn("exHostsList = " + exHostsList) val exHosts = exHostsList.reduceLeft(_ + "," + _) //logger.warn("exHosts = " + exHosts) @@ -363,7 +362,7 @@ object Lsf706JobRunner extends Logging { /** * Returns the run limit in seconds for the queue. * If the queue name is null returns the length of the default queue. - * @param queue Name of the queue or null for the default queue. + * @param queueName Name of the queue or null for the default queue. * @return the run limit in seconds for the queue. */ private def getRlimitRun(queueName: String) = { diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala index 9751012a46..6cd4b06bc5 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala @@ -53,6 +53,9 @@ class BamGatherFunction extends GatherFunction with PicardBamFunction { val disableIndex = QFunction.findField(originalFunction.getClass, SAMFileWriterArgumentTypeDescriptor.DISABLE_INDEXING_FULLNAME) this.createIndex = Some(!originalGATK.getFieldValue(disableIndex).asInstanceOf[Boolean]) - super.freezeFieldValues + val enableMD5 = QFunction.findField(originalFunction.getClass, SAMFileWriterArgumentTypeDescriptor.ENABLE_MD5_FULLNAME) + this.createMD5 = Some(originalGATK.getFieldValue(enableMD5).asInstanceOf[Boolean]) + + super.freezeFieldValues() } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala index b0483f0bb5..085e0b0085 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -32,9 +32,8 @@ import net.sf.samtools.SAMFileHeader import java.util.Collections import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocSortedSet, GenomeLocParser} -case class GATKIntervals(reference: File, intervals: List[String]) { +case class GATKIntervals(reference: File, intervals: Seq[String]) { private lazy val referenceDataSource = new ReferenceDataSource(reference) -// private var splitsBySize = Map.empty[Int, java.util.List[java.lang.Integer]] lazy val samFileHeader = 
{ val header = new SAMFileHeader @@ -55,13 +54,5 @@ case class GATKIntervals(reference: File, intervals: List[String]) { Collections.unmodifiableList(mergedLocs) } - lazy val contigs = locs.map(_.getContig).distinct.toList - -// def getSplits(size: Int) = { -// splitsBySize.getOrElse(size, { -// val splits: java.util.List[java.lang.Integer] = IntervalUtils.splitFixedIntervals(locs, size) -// splitsBySize += size -> splits -// splits -// }) -// } + lazy val contigs = locs.map(_.getContig).distinct.toSeq } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala index c9adff0264..28c3f41e98 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -26,7 +26,6 @@ package org.broadinstitute.sting.queue.extensions.gatk import org.broadinstitute.sting.utils.interval.IntervalUtils import java.io.File -import collection.JavaConversions._ import org.broadinstitute.sting.utils.io.IOUtils import org.broadinstitute.sting.queue.function.scattergather.{CloneFunction, ScatterFunction} import org.broadinstitute.sting.commandline.Output @@ -39,7 +38,7 @@ trait GATKScatterFunction extends ScatterFunction { private final val intervalsStringField = "intervalsString" @Output(doc="Scatter function outputs") - var scatterOutputFiles: List[File] = Nil + var scatterOutputFiles: Seq[File] = Nil /** The original GATK function. */ protected var originalGATK: CommandLineGATK = _ @@ -48,7 +47,7 @@ trait GATKScatterFunction extends ScatterFunction { protected var referenceSequence: File = _ /** The list of interval files ("/path/to/interval.list") or interval strings ("chr1", "chr2") to parse into smaller parts. */ - protected var intervals: List[String] = Nil + protected var intervals: Seq[String] = Nil /** Whether the last scatter job should also include any unmapped reads. 
*/ protected var includeUnmapped: Boolean = _ @@ -57,7 +56,7 @@ trait GATKScatterFunction extends ScatterFunction { this.originalGATK = this.originalFunction.asInstanceOf[CommandLineGATK] this.referenceSequence = this.originalGATK.reference_sequence if (this.originalGATK.intervals.isEmpty && (this.originalGATK.intervalsString == null || this.originalGATK.intervalsString.isEmpty)) { - this.intervals ++= GATKScatterFunction.getGATKIntervals(this.referenceSequence, List.empty[String]).contigs + this.intervals ++= GATKScatterFunction.getGATKIntervals(this.referenceSequence, Seq.empty[String]).contigs } else { this.intervals ++= this.originalGATK.intervals.map(_.toString) this.intervals ++= this.originalGATK.intervalsString.filterNot(interval => IntervalUtils.isUnmapped(interval)) @@ -70,16 +69,16 @@ trait GATKScatterFunction extends ScatterFunction { } override def initCloneInputs(cloneFunction: CloneFunction, index: Int) { - cloneFunction.setFieldValue(this.intervalsField, List(new File("scatter.intervals"))) + cloneFunction.setFieldValue(this.intervalsField, Seq(new File("scatter.intervals"))) if (index == this.scatterCount && this.includeUnmapped) - cloneFunction.setFieldValue(this.intervalsStringField, List("unmapped")) + cloneFunction.setFieldValue(this.intervalsStringField, Seq("unmapped")) else - cloneFunction.setFieldValue(this.intervalsStringField, List.empty[String]) + cloneFunction.setFieldValue(this.intervalsStringField, Seq.empty[String]) } override def bindCloneInputs(cloneFunction: CloneFunction, index: Int) { val scatterPart = cloneFunction.getFieldValue(this.intervalsField) - .asInstanceOf[List[File]] + .asInstanceOf[Seq[File]] .map(file => IOUtils.absolute(cloneFunction.commandDirectory, file)) cloneFunction.setFieldValue(this.intervalsField, scatterPart) this.scatterOutputFiles ++= scatterPart @@ -100,9 +99,9 @@ trait GATKScatterFunction extends ScatterFunction { } object GATKScatterFunction { - var gatkIntervals = List.empty[GATKIntervals] + var gatkIntervals = Seq.empty[GATKIntervals] - def getGATKIntervals(reference: File, intervals: List[String]) = { + def getGATKIntervals(reference: File, intervals: Seq[String]) = { gatkIntervals.find(gi => gi.reference == reference && gi.intervals == intervals) match { case Some(gi) => gi case None => diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/RodBind.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/RodBind.scala deleted file mode 100644 index deb83bf5a2..0000000000 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/RodBind.scala +++ /dev/null @@ -1,41 +0,0 @@ -package org.broadinstitute.sting.queue.extensions.gatk - -import java.io.File -import org.broadinstitute.sting.utils.io.FileExtension -import java.lang.String - -/** - * Used to provide -B rodBind arguments to the GATK. 
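GATKScatterFunction.getGATKIntervals above memoizes GATKIntervals instances by (reference, intervals), appending to the shared Seq on a miss. A toy equivalent of that find-or-create pattern:

import java.io.File

object IntervalCacheSketch extends App {
  case class Key(reference: File, intervals: Seq[String])

  var cache = Seq.empty[Key]

  def lookup(reference: File, intervals: Seq[String]): Key =
    cache.find(k => k.reference == reference && k.intervals == intervals) match {
      case Some(k) => k
      case None =>
        val k = Key(reference, intervals) // built once, then reused
        cache :+= k
        k
    }

  val ref = new File("ref.fasta")
  val first = lookup(ref, Seq("chr1"))
  val second = lookup(ref, Seq("chr1"))
  println(first eq second) // true: the second call hits the cache
}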
- */ -class RodBind(var trackName: String, var trackType: String, path: String, val tag: String) extends File(path) with FileExtension { - def this(trackName: String, trackType: String, path: String) = - this(trackName, trackType, path, null) - def this(trackName: String, trackType: String, file: File, tag: String) = - this(trackName, trackType, file.getPath, tag) - def this(trackName: String, trackType: String, file: File) = - this(trackName, trackType, file.getPath, null) - require(trackName != null, "RodBind trackName cannot be null") - require(trackType != null, "RodBind trackType cannot be null") - def withPath(newPath: String) = new RodBind(trackName, trackType, newPath, tag) -} - -/** - * Used to provide -B rodBind arguments to the GATK. - */ -object RodBind { - def apply(trackName: String, trackType: String, path: String, tag: String) = new RodBind(trackName, trackType, path, tag) - def apply(trackName: String, trackType: String, path: String) = new RodBind(trackName, trackType, path, null) - def apply(trackName: String, trackType: String, file: File, tag: String) = new RodBind(trackName, trackType, file, tag) - def apply(trackName: String, trackType: String, file: File) = new RodBind(trackName, trackType, file, null) - - def formatCommandLineParameter( cmdLineParam: String, value: Any ) = { - value match { - case rodBind: RodBind if (rodBind.tag != null) => - "%s:%s,%s,%s".format(cmdLineParam, rodBind.trackName, rodBind.trackType, rodBind.tag) - case rodBind: RodBind => - "%s:%s,%s".format(cmdLineParam, rodBind.trackName, rodBind.trackType) - case x => - "" - } - } -} diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala index 93735e4ac2..2faa659084 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala @@ -1,3 +1,27 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.queue.extensions.picard import org.broadinstitute.sting.commandline._ @@ -15,7 +39,7 @@ class AddOrReplaceReadGroups extends org.broadinstitute.sting.queue.function.Jav javaMainClass = "net.sf.picard.sam.AddOrReplaceReadGroups" @Input(doc="The input SAM or BAM files to analyze. 
Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true) - var input: List[File] = Nil + var input: Seq[File] = Nil @Output(doc="The output BAM file with the modified/added read groups", shortName = "output", fullName = "output_bam_file", required = true) var output: File = _ diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala index d73c556af7..06c6e3fdc1 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala @@ -1,3 +1,27 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.queue.extensions.picard import org.broadinstitute.sting.commandline._ @@ -15,7 +39,7 @@ class MarkDuplicates extends org.broadinstitute.sting.queue.function.JavaCommand javaMainClass = "net.sf.picard.sam.MarkDuplicates" @Input(doc="The input SAM or BAM files to analyze. 
Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true) - var input: List[File] = Nil + var input: Seq[File] = Nil @Output(doc="The output file to write marked records to", shortName = "output", fullName = "output_bam_file", required = true) var output: File = _ diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala index 036932cc68..8c23775775 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala @@ -1,9 +1,32 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.queue.extensions.picard import org.broadinstitute.sting.commandline._ import java.io.File -import org.broadinstitute.sting.queue.QScript._ /* * Created by IntelliJ IDEA. @@ -16,7 +39,7 @@ class MergeSamFiles extends org.broadinstitute.sting.queue.function.JavaCommandL javaMainClass = "net.sf.picard.sam.MergeSamFiles" @Input(doc="The input SAM or BAM files to analyze. 
Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true) - var input: List[File] = Nil + var input: Seq[File] = Nil @Output(doc="The output merged BAM file", shortName = "output", fullName = "output_bam_file", required = true) var output: File = _ diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardBamFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardBamFunction.scala index 76856dc366..defb43e4e8 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardBamFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardBamFunction.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -41,20 +41,22 @@ trait PicardBamFunction extends JavaCommandLineFunction { var sortOrder = SortOrder.coordinate var compressionLevel: Option[Int] = None var createIndex: Option[Boolean] = None + var createMD5: Option[Boolean] = None var maxRecordsInRam: Option[Int] = None var assumeSorted: Option[Boolean] = None - protected def inputBams: List[File] + protected def inputBams: Seq[File] protected def outputBam: File abstract override def commandLine = super.commandLine + - repeat("INPUT=", inputBams, spaceSeparated=false) + - required("TMP_DIR=" + jobTempDir) + - optional("OUTPUT=", outputBam, spaceSeparated=false) + - optional("COMPRESSION_LEVEL=", compressionLevel, spaceSeparated=false) + - optional("VALIDATION_STRINGENCY=", validationStringency, spaceSeparated=false) + - optional("SO=", sortOrder, spaceSeparated=false) + - optional("MAX_RECORDS_IN_RAM=", maxRecordsInRam, spaceSeparated=false) + - optional("ASSUME_SORTED=", assumeSorted, spaceSeparated=false) + - optional("CREATE_INDEX=", createIndex, spaceSeparated=false) + repeat("INPUT=", inputBams, spaceSeparated=false) + + required("TMP_DIR=" + jobTempDir) + + optional("OUTPUT=", outputBam, spaceSeparated=false) + + optional("COMPRESSION_LEVEL=", compressionLevel, spaceSeparated=false) + + optional("VALIDATION_STRINGENCY=", validationStringency, spaceSeparated=false) + + optional("SO=", sortOrder, spaceSeparated=false) + + optional("MAX_RECORDS_IN_RAM=", maxRecordsInRam, spaceSeparated=false) + + optional("ASSUME_SORTED=", assumeSorted, spaceSeparated=false) + + optional("CREATE_INDEX=", createIndex, spaceSeparated=false) + + optional("CREATE_MD5_FILE=", createMD5, spaceSeparated=false) } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala index b1968bee5a..46188586e7 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala @@ -1,3 +1,27 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + 
* + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.queue.extensions.picard import org.broadinstitute.sting.commandline._ @@ -14,7 +38,7 @@ class ReorderSam extends org.broadinstitute.sting.queue.function.JavaCommandLine javaMainClass = "net.sf.picard.sam.ReorderSam" @Input(doc="Input file (bam or sam) to extract reads from.", shortName = "input", fullName = "input_bam_files", required = true) - var input: List[File] = Nil + var input: Seq[File] = Nil @Output(doc="Output file (bam or sam) to write extracted reads to.", shortName = "output", fullName = "output_bam_file", required = true) var output: File = _ diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/RevertSam.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/RevertSam.scala index 60d8bfaf81..c2161b5518 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/RevertSam.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/RevertSam.scala @@ -1,3 +1,27 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + package org.broadinstitute.sting.queue.extensions.picard import org.broadinstitute.sting.commandline._ @@ -15,7 +39,7 @@ class RevertSam extends org.broadinstitute.sting.queue.function.JavaCommandLineF javaMainClass = "net.sf.picard.sam.RevertSam" @Input(shortName = "input", fullName = "input_bam_files", required = true, doc = "The input SAM or BAM files to revert.") - var input: List[File] = Nil + var input: Seq[File] = Nil @Output(shortName = "output", fullName = "output_bam_file", required = true, doc = "The reverted BAM or SAM output file.") var output: File = _ @@ -33,7 +57,7 @@ class RevertSam extends org.broadinstitute.sting.queue.function.JavaCommandLineF var removeAlignmentInformation: Boolean = true @Argument(shortName = "atc", fullName = "attributes_to_clear", required = false, doc = "When removing alignment information, the set of optional tags to remove.") - var attributesToClear: List[String] = Nil + var attributesToClear: Seq[String] = Nil @Argument(shortName = "sa", fullName = "sample_alias", required = false, doc = "The sample alias to use in the reverted output file. This will override the existing sample alias in the file and is used only if all the read groups in the input file have the same sample alias.") var sampleAlias: String = null diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SamToFastq.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SamToFastq.scala index 3eb4e8e064..6c658b1055 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SamToFastq.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SamToFastq.scala @@ -1,3 +1,27 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + package org.broadinstitute.sting.queue.extensions.picard import org.broadinstitute.sting.commandline._ @@ -15,7 +39,7 @@ class SamToFastq extends org.broadinstitute.sting.queue.function.JavaCommandLine javaMainClass = "net.sf.picard.sam.SamToFastq" @Input(shortName = "input", fullName = "input_bam_files", required = true, doc = "Input SAM/BAM file to extract reads from.") - var input: List[File] = Nil + var input: Seq[File] = Nil @Output(shortName = "fastq", fullName = "output_fastq_file", required = true, doc = "Output fastq file (single-end fastq or, if paired, first end of the pair fastq).") var fastq: File = _ diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala index a56093be8d..9257cc7c28 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala @@ -1,9 +1,32 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.queue.extensions.picard import org.broadinstitute.sting.commandline._ import java.io.File -import org.broadinstitute.sting.queue.QScript._ /* * Created by IntelliJ IDEA. 
@@ -16,7 +39,7 @@ class SortSam extends org.broadinstitute.sting.queue.function.JavaCommandLineFun javaMainClass = "net.sf.picard.sam.SortSam" @Input(doc="The input SAM or BAM files to sort.", shortName = "input", fullName = "input_bam_files", required = true) - var input: List[File] = Nil + var input: Seq[File] = Nil @Output(doc="The sorted BAM or SAM output file.", shortName = "output", fullName = "output_bam_file", required = true) var output: File = _ diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala index 030e4b07d3..43d4ab442a 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala @@ -1,3 +1,27 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.queue.extensions.picard import org.broadinstitute.sting.commandline._ @@ -17,7 +41,7 @@ class ValidateSamFile extends org.broadinstitute.sting.queue.function.JavaComman javaMainClass = "net.sf.picard.sam.ValidateSamFile" @Input(doc="The input SAM or BAM files to analyze. 
Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true) - var input: List[File] = Nil + var input: Seq[File] = Nil @Output(doc="Send output to a file instead of stdout", shortName = "output", fullName = "output_file", required = false) var output: File = _ @@ -26,7 +50,7 @@ class ValidateSamFile extends org.broadinstitute.sting.queue.function.JavaComman var MODE: Mode = Mode.VERBOSE @Argument(doc="List of validation error types to ignore.", shortName = "ignore", fullName = "ignore_error_types", required = false) - var IGNORE: List[String] = Nil + var IGNORE: Seq[String] = Nil @Argument(doc = "The maximum number of lines output in verbose mode.", shortName = "max", fullName = "max_output", required = false) var MAX_OUTPUT: Int = 100 diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsIndexFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsIndexFunction.scala index 83a03b904e..1ad758b585 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsIndexFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsIndexFunction.scala @@ -52,6 +52,4 @@ class SamtoolsIndexFunction extends SamtoolsCommandLineFunction { required("index") + required(bamFile) + required(bamFileIndex) - - override def dotString = "Index: %s".format(bamFile.getName) } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsMergeFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsMergeFunction.scala index aff9a25c0d..1949d9add8 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsMergeFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsMergeFunction.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -34,7 +34,7 @@ class SamtoolsMergeFunction extends SamtoolsCommandLineFunction { analysisName = "samtools merge" @Input(doc="BAM file input") - var inputBams: List[File] = Nil + var inputBams: Seq[File] = Nil @Output(doc="BAM file output") var outputBam: File = _ @@ -43,10 +43,10 @@ class SamtoolsMergeFunction extends SamtoolsCommandLineFunction { var region: String = _ @Input(doc="BAM file input indexes") - var inputBamIndexes: List[File] = Nil + var inputBamIndexes: Seq[File] = Nil - override def freezeFieldValues = { - super.freezeFieldValues + override def freezeFieldValues() { + super.freezeFieldValues() inputBamIndexes ++= inputBams .filter(orig => orig != null && orig.getName.endsWith(".bam")) .flatMap(orig => Array( diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala index 167dcb593f..eff4a2ba91 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala @@ -1,3 +1,27 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without 
limitation the rights to use, + copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the + Software is furnished to do so, subject to the following + conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.queue.function import org.broadinstitute.sting.queue.util._ @@ -27,13 +51,13 @@ trait CommandLineFunction extends QFunction with Logging { var jobQueue: String = _ /** Native arguments to pass to the job runner */ - var jobNativeArgs: List[String] = Nil + var jobNativeArgs: Seq[String] = Nil /** Native arguments to pass to the job runner */ - var jobResourceRequests: List[String] = Nil + var jobResourceRequests: Seq[String] = Nil /** Environment names to pass to the job runner */ - var jobEnvironmentNames: List[String] = Nil + var jobEnvironmentNames: Seq[String] = Nil override def copySettingsTo(function: QFunction) { super.copySettingsTo(function) @@ -270,7 +294,7 @@ trait CommandLineFunction extends QFunction with Logging { } // Trim leading and trailing whitespace off our three tokens, and unwrap Some(x) to x for the param - val trimmedValues : List[String] = List((if ( prefix != null ) prefix.trim else ""), + val trimmedValues : Seq[String] = Seq((if ( prefix != null ) prefix.trim else ""), (param match { case Some(x) => paramFormat.format(x).trim case x => paramFormat.format(x).trim diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/InProcessFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/InProcessFunction.scala index 783eef1bfb..653b87b2f9 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/InProcessFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/InProcessFunction.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -24,10 +24,24 @@ package org.broadinstitute.sting.queue.function +import java.io.PrintStream + + /** * Runs a function in process. */ trait InProcessFunction extends QFunction { + analysisName = this.getClass.getSimpleName + def run() - def description = this.getClass.getSimpleName + " " + this.commandOutputs.mkString(" ") + + /** + * During run() this stream will write to stdout. + */ + var jobOutputStream: PrintStream = null + + /** + * Write errors to this stream during run().
+ */ + var jobErrorStream: PrintStream = null } diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala index 5b19cf9b66..534d68069c 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -42,7 +42,7 @@ trait JavaCommandLineFunction extends CommandLineFunction { * Class path for the main class. * Defaults to the current classpath. */ - var javaClasspath: List[String] = Nil + var javaClasspath: Seq[String] = Nil /** * Memory limit for the java executable, or if None will use the default memoryLimit. @@ -82,5 +82,5 @@ trait JavaCommandLineFunction extends CommandLineFunction { object JavaCommandLineFunction { val currentClasspath = System.getProperty("java.class.path") - .split(File.pathSeparatorChar).map(path => IOUtils.absolute(new File(path)).getPath).toList + .split(File.pathSeparatorChar).map(path => IOUtils.absolute(new File(path)).getPath).toSeq } diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/ListWriterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/ListWriterFunction.scala index f60302ef48..becc64f04c 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/ListWriterFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/ListWriterFunction.scala @@ -1,3 +1,27 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.queue.function import org.broadinstitute.sting.commandline.{Input, Output} @@ -9,10 +33,12 @@ import org.apache.commons.io.IOUtils * Custom formats can override addFile. 
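JavaCommandLineFunction.currentClasspath above splits java.class.path and absolutizes each entry via the Sting IOUtils helper. The same idea using only the JDK, for readers without the Sting utilities on hand:

import java.io.File

object ClasspathSketch extends App {
  // getAbsolutePath stands in for IOUtils.absolute(new File(path)).getPath
  val currentClasspath = System.getProperty("java.class.path")
    .split(File.pathSeparatorChar)
    .map(path => new File(path).getAbsolutePath)
    .toSeq
  currentClasspath.foreach(println)
}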
*/ class ListWriterFunction extends InProcessFunction { - @Input(doc="input files") var inputFiles: List[File] = Nil + analysisName = "WriteList" + + @Input(doc="input files") var inputFiles: Seq[File] = Nil @Output(doc="output file") var listFile: File = _ - def run { + def run() { val writer = new PrintWriter(listFile) try { for (inputFile <- inputFiles) diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala index 59f2ada446..7d9debbdc6 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -29,7 +29,7 @@ import java.lang.annotation.Annotation import org.broadinstitute.sting.commandline._ import org.broadinstitute.sting.queue.{QException, QSettings} import collection.JavaConversions._ -import org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction +import java.lang.IllegalStateException import org.broadinstitute.sting.queue.util._ import org.broadinstitute.sting.utils.io.IOUtils @@ -39,13 +39,18 @@ import org.broadinstitute.sting.utils.io.IOUtils * Inputs are matched to other outputs by using .equals() */ trait QFunction extends Logging with QJobReport { - /** A short description of this step in the graph */ + /** + * A short description of what this class of function does. + * By default does not include the output specific to this function. + * See shortDescription for a description of what this instance of the function outputs. + */ var analysisName: String = "" - /** Prefix for automatic job name creation */ - var jobNamePrefix: String = _ - - /** The name name of the job */ + /** + * The name of the job, must be file system safe and unique to the graph. + * Defaults to "runName-". + * Use shortDescription for an alternative that is display friendly. + */ var jobName: String = _ /** Default settings */ @@ -58,7 +63,7 @@ trait QFunction extends Logging with QJobReport { var jobTempDir: File = null /** Order the function was added to the graph. */ - var addOrder: List[Int] = Nil + var addOrder: Seq[Int] = Nil /** Job priority */ var jobPriority: Option[Int] = None @@ -78,12 +83,6 @@ trait QFunction extends Logging with QJobReport { */ var isIntermediate = false - /** - * If true and isIntermediate is true, the files listed - * via outputs will deleted after the command completes. - */ - var deleteIntermediateOutputs = true - // ------------------------------------------------------- // // job run information // ------------------------------------------------------- @@ -95,8 +94,6 @@ trait QFunction extends Logging with QJobReport { * @param function QFunction to copy values to.
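ListWriterFunction.run() above is the canonical InProcessFunction: it writes each input path to the list file. A stand-alone approximation that also exercises the new jobOutputStream hook (here simply bound to System.out; in Queue the engine injects the stream before run()):

import java.io.{File, PrintStream, PrintWriter}

object ListWriterSketch extends App {
  var jobOutputStream: PrintStream = System.out // toy stand-in for the injected stream

  val inputFiles = Seq(new File("a.bam"), new File("b.bam"))
  val listFile = new File("bams.list")

  // Write one path per line, mirroring the run() body in the diff.
  val writer = new PrintWriter(listFile)
  try {
    inputFiles.foreach(f => writer.println(f.getPath))
  } finally {
    writer.close()
  }
  jobOutputStream.println("Wrote " + inputFiles.size + " paths to " + listFile)
}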
@@ -95,8 +94,6 @@ trait QFunction extends Logging with QJobReport { * @param function QFunction to copy values to. */ override def copySettingsTo(function: QFunction) { - function.analysisName = this.analysisName - function.jobName = this.jobName function.qSettings = this.qSettings function.commandDirectory = this.commandDirectory function.jobTempDir = this.jobTempDir @@ -105,79 +102,82 @@ function.jobRestartable = this.jobRestartable function.updateJobRun = this.updateJobRun function.isIntermediate = this.isIntermediate - function.deleteIntermediateOutputs = this.deleteIntermediateOutputs function.reportGroup = this.reportGroup function.reportFeatures = this.reportFeatures } /** File to redirect any output. Defaults to .out */ - @Output(doc="File to redirect any output", required=false) - @Gather(classOf[SimpleTextGatherFunction]) var jobOutputFile: File = _ /** File to redirect any errors. Defaults to .out */ - @Output(doc="File to redirect any errors", required=false) - @Gather(classOf[SimpleTextGatherFunction]) var jobErrorFile: File = _ /** * Description of this command line function. */ - def description: String + def description: String = "%s: %s > %s".format(analysisName, inputs, outputs) /** - * The function description in .dot files + * A short description of the function. */ - def dotString = jobName + " => " + description + def shortDescription = { + firstOutput match { + case file: File => analysisName + ": " + file.getName + case _ => analysisName + } + } /** - * Returns true if the function is done, false if it's - * not done and None if the done status is unknown. + * Returns true if the function is done. */ - def isDone = { + def isDone: Boolean = { val files = doneOutputs if (files.size == 0) - None - else - Some(files.forall(_.exists)) + throw new IllegalStateException("Function should have at least one output: " + analysisName) + files.forall(_.exists) } /** - * Returns true if the function has failed, false if it - * has not failed and None if the fail status is unknown. + * Returns true if the function has failed. */ - def isFail = { + def isFail: Boolean = { val files = failOutputs if (files.size == 0) - None - else - Some(files.exists(_.exists)) + throw new IllegalStateException("Function should have at least one output: " + analysisName) + files.exists(_.exists) } /** - * Returns true if the file is a log file for this function. + * Returns files to track for hidden done/fail files. + * @return Seq[File] files. */ - protected def isLogFile(file: File) = - file == jobOutputFile || file == jobErrorFile + protected def statusPaths = { + var paths = outputs + paths :+= jobOutputFile + if (jobErrorFile != null) + paths :+= jobErrorFile + paths + } /** - * Returns the output files for this function. - * @return Set[File] outputs for this function. + * Returns prefixes for hidden done/fail files. + * @return prefixes. */ - private def statusPaths = - commandOutputs.map(file => file.getParentFile + "/." + file.getName) - + private def statusPrefixes = statusPaths. + filter(file => !IOUtils.isSpecialFile(file)). + map(file => file.getParentFile + "/." + file.getName) + /** * Returns the output files for this function. - * @return Set[File] outputs for this function. + * @return outputs for this function. */ - def doneOutputs = statusPaths.map(path => new File(path + ".done")) + def doneOutputs: Seq[File] = statusPrefixes.map(path => new File(path + ".done")) /** * Returns the output files for this function. - * @return Set[File] outputs for this function. + * @return outputs for this function.
*/ - def failOutputs = statusPaths.map(path => new File(path + ".fail")) + def failOutputs: Seq[File] = statusPrefixes.map(path => new File(path + ".fail")) /** The complete list of fields on this CommandLineFunction. */ def functionFields = QFunction.classFields(this.functionFieldClass).functionFields @@ -195,21 +195,21 @@ trait QFunction extends Logging with QJobReport { /** * Returns the input files for this function. - * @return Set[File] inputs for this function. + * @return inputs for this function. */ - def inputs = getFieldFiles(inputFields) + def inputs: Seq[File] = getFieldFiles(inputFields) /** * Returns the output files for this function. - * @return Set[File] outputs for this function. + * @return outputs for this function. */ - def outputs = getFieldFiles(outputFields) + def outputs: Seq[File] = getFieldFiles(outputFields) /** - * Returns the non-log outputs for this function. - * @return the non-log outputs for this function. + * Returns the first output file. + * @return first output for this function. */ - def commandOutputs = outputs.filterNot(file => isLogFile(file)) + def firstOutput: File = outputs.headOption.getOrElse(null) /** * Returns the set of directories where files may be written. @@ -218,6 +218,9 @@ trait QFunction extends Logging with QJobReport { var dirs = Set.empty[File] dirs += commandDirectory dirs += jobTempDir + dirs += jobOutputFile.getParentFile + if (jobErrorFile != null) + dirs += jobErrorFile.getParentFile dirs ++= outputs.map(_.getParentFile) dirs } @@ -235,7 +238,7 @@ trait QFunction extends Logging with QJobReport { * Deletes the output files and all the status files for this function. */ def deleteOutputs() { - commandOutputs.foreach(file => IOUtils.tryDelete(file)) + outputs.filter(file => !IOUtils.isSpecialFile(file)).foreach(file => IOUtils.tryDelete(file)) doneOutputs.foreach(file => IOUtils.tryDelete(file)) failOutputs.foreach(file => IOUtils.tryDelete(file)) } @@ -252,63 +255,63 @@ trait QFunction extends Logging with QJobReport { /** * Returns fields that do not have values which are required. - * @return List[String] names of fields missing values. + * @return Seq[String] names of fields missing values. */ - def missingFields: List[String] = { + def missingFields: Seq[String] = { val missingInputs = missingFields(inputFields, classOf[Input]) val missingOutputs = missingFields(outputFields, classOf[Output]) val missingArguments = missingFields(argumentFields, classOf[Argument]) - (missingInputs | missingOutputs | missingArguments).toList.sorted + (missingInputs ++ missingOutputs ++ missingArguments).distinct.sorted } /** * Returns fields that do not have values which are required. * @param sources Fields to check. * @param annotation Annotation. - * @return Set[String] names of fields missing values. + * @return names of fields missing values. 
*/ - private def missingFields(sources: List[ArgumentSource], annotation: Class[_ <: Annotation]): Set[String] = { - var missing = Set.empty[String] + private def missingFields(sources: Seq[ArgumentSource], annotation: Class[_ <: Annotation]): Seq[String] = { + var missing: Seq[String] = Nil for (source <- sources) { if (isRequired(source, annotation)) if (!hasFieldValue(source)) if (!exclusiveOf(source, annotation).exists(otherSource => hasFieldValue(otherSource))) - missing += "@%s: %s - %s".format(annotation.getSimpleName, source.field.getName, doc(source, annotation)) + missing :+= "@%s: %s - %s".format(annotation.getSimpleName, source.field.getName, doc(source, annotation)) } missing } /** - * Gets the files from the fields. The fields must be a File, a FileExtension, or a List or Set of either. + * Gets the files from the fields. The fields must be a File, a FileExtension, or a Seq or Set of either. * @param fields Fields to get files. - * @return Set[File] for the fields. + * @return for the fields. */ - private def getFieldFiles(fields: List[ArgumentSource]): Set[File] = { - var files = Set.empty[File] + private def getFieldFiles(fields: Seq[ArgumentSource]): Seq[File] = { + var files: Seq[File] = Nil for (field <- fields) files ++= getFieldFiles(field) - files + files.distinct } /** - * Gets the files from the field. The field must be a File, a FileExtension, or a List or Set of either. - * @param fields Field to get files. - * @return Set[File] for the field. + * Gets the files from the field. The field must be a File, a FileExtension, or a Seq or Set of either. + * @param field Field to get files. + * @return for the field. */ - def getFieldFiles(field: ArgumentSource): Set[File] = { - var files = Set.empty[File] + def getFieldFiles(field: ArgumentSource): Seq[File] = { + var files: Seq[File] = Nil CollectionUtils.foreach(getFieldValue(field), (fieldValue) => { val file = fieldValueToFile(field, fieldValue) if (file != null) - files += file + files :+= file }) - files + files.distinct } /** - * Gets the file from the field. The field must be a File or a FileExtension and not a List or Set. + * Gets the file from the field. The field must be a File or a FileExtension and not a Seq or Set. * @param field Field to get the file. - * @return File for the field. + * @return for the field. */ def getFieldFile(field: ArgumentSource): File = fieldValueToFile(field, getFieldValue(field)) @@ -340,14 +343,15 @@ trait QFunction extends Logging with QJobReport { * Sets all field values. 
*/ def freezeFieldValues() { - if (jobNamePrefix == null) - jobNamePrefix = qSettings.jobNamePrefix - if (jobName == null) - jobName = QFunction.nextJobName(jobNamePrefix) + jobName = qSettings.runName + "-" + this.addOrder.mkString("-") - if (jobOutputFile == null) - jobOutputFile = new File(jobName + ".out") + if (jobOutputFile == null) { + jobOutputFile = firstOutput match { + case file: File if (!IOUtils.isSpecialFile(file)) => new File(file.getParentFile, file.getName + ".out") + case _ => new File(jobName + ".out") + } + } if (jobTempDir == null) jobTempDir = qSettings.tempDirectory @@ -378,6 +382,10 @@ trait QFunction extends Logging with QJobReport { fieldValue = CollectionUtils.updated(fieldValue, canon).asInstanceOf[AnyRef] this.setFieldValue(field, fieldValue) } + + this.jobOutputFile = canon(this.jobOutputFile).asInstanceOf[File] + if (this.jobErrorFile != null) + this.jobErrorFile = canon(this.jobErrorFile).asInstanceOf[File] } /** @@ -443,7 +451,7 @@ trait QFunction extends Logging with QJobReport { /** * Returns false if the value is null or an empty collection. - * @param value Value to test for null, or a collection to test if it is empty. + * @param param Value to test for null, or a collection to test if it is empty. * @return false if the value is null, or false if the collection is empty, otherwise true. */ protected def hasValue(param: Any) = CollectionUtils.isNotNullOrNotEmpty(param) @@ -472,28 +480,15 @@ trait QFunction extends Logging with QJobReport { } object QFunction { - /** Job index counter for this run of Queue. */ - private var jobIndex = 0 - var parsingEngine: ParsingEngine = _ - /** - * Returns the next job name using the prefix. - * @param prefix Prefix of the job name. - * @return the next job name. - */ - private def nextJobName(prefix: String) = { - jobIndex += 1 - prefix + "-" + jobIndex - } - /** * The list of fields defined on a class * @param clazz The class to lookup fields. */ private class ClassFields(clazz: Class[_]) { /** The complete list of fields on this CommandLineFunction. */ - val functionFields: List[ArgumentSource] = parsingEngine.extractArgumentSources(clazz).toList + val functionFields: Seq[ArgumentSource] = parsingEngine.extractArgumentSources(clazz).toSeq /** The @Input fields on this CommandLineFunction. */ val inputFields = functionFields.filter(source => ReflectionUtils.hasAnnotation(source.field, classOf[Input])) /** The @Output fields on this CommandLineFunction. 
*/ diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala index b5cef3d5c2..5b4f2b7e6d 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -25,7 +25,6 @@ package org.broadinstitute.sting.queue.function.scattergather import org.broadinstitute.sting.commandline.ArgumentSource -import java.io.File import org.broadinstitute.sting.queue.function.{QFunction, CommandLineFunction} /** @@ -62,9 +61,8 @@ class CloneFunction extends CommandLineFunction { } } - override def commandOutputs = withScatterPart(() => originalFunction.commandOutputs) - override def dotString = withScatterPart(() => originalFunction.dotString) override def description = withScatterPart(() => originalFunction.description) + override def shortDescription = withScatterPart(() => originalFunction.shortDescription) override protected def functionFieldClass = originalFunction.getClass def commandLine = withScatterPart(() => originalFunction.commandLine) @@ -75,30 +73,22 @@ class CloneFunction extends CommandLineFunction { } override def getFieldValue(source: ArgumentSource): AnyRef = { - source.field.getName match { - case "jobOutputFile" => jobOutputFile - case "jobErrorFile" => jobErrorFile - case _ => overriddenFields.get(source) match { - case Some(value) => value.asInstanceOf[AnyRef] - case None => { - val value = originalFunction.getFieldValue(source) - overriddenFields += source -> value - value - } + overriddenFields.get(source) match { + case Some(value) => value.asInstanceOf[AnyRef] + case None => { + val value = originalFunction.getFieldValue(source) + overriddenFields += source -> value + value } } } - def setFieldValue(field: String, value: Any): Unit = { + def setFieldValue(field: String, value: Any) { val source = QFunction.findField(originalFunction.getClass, field) setFieldValue(source, value) } - override def setFieldValue(source: ArgumentSource, value: Any): Unit = { - source.field.getName match { - case "jobOutputFile" => jobOutputFile = value.asInstanceOf[File] - case "jobErrorFile" => jobErrorFile = value.asInstanceOf[File] - case _ => overriddenFields += source -> value - } + override def setFieldValue(source: ArgumentSource, value: Any) { + overriddenFields += source -> value } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ConcatenateLogsFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ConcatenateLogsFunction.scala new file mode 100644 index 0000000000..9261dd7674 --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ConcatenateLogsFunction.scala @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to 
permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.queue.function.scattergather + +import org.broadinstitute.sting.queue.function.InProcessFunction +import org.broadinstitute.sting.queue.QException +import org.broadinstitute.sting.commandline.Input +import org.apache.commons.io.FileUtils +import java.io.File +import collection.JavaConversions._ + +/** + * Concatenate log files to the jobOutputFile. + */ +class ConcatenateLogsFunction extends InProcessFunction { + analysisName = "Concat" + + @Input(doc="Parts to gather back into the original output") + var logs: Seq[File] = Nil + + override def description = "%s: %s > %s".format(analysisName, logs, jobOutputFile) + override def shortDescription = analysisName + ": " + jobOutputFile.getName + + def run() { + val missing = org.broadinstitute.sting.utils.io.IOUtils.waitFor(logs, 120) + if (!missing.isEmpty) + throw new QException("Unable to find logs: " + missing.mkString(", ")) + logs.foreach(log => { + FileUtils.copyFile(log, this.jobOutputStream) + this.jobOutputStream.println() + }) + } +}
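For context, a self-contained sketch of the concatenation pattern run() uses above, with hypothetical file names; the waitFor NFS guard and the Queue wiring are omitted:

    import java.io.{File, FileOutputStream, PrintStream}
    import org.apache.commons.io.FileUtils

    object ConcatLogsSketch {
      def main(args: Array[String]) {
        val logs = Seq(new File("scatter.out"), new File("temp_1_of_2.out"))  // hypothetical paths
        val merged = new PrintStream(new FileOutputStream("merged.out"))
        try {
          for (log <- logs if log.exists) {
            FileUtils.copyFile(log, merged)  // same overload run() uses: copyFile(File, OutputStream)
            merged.println()                 // newline so the next log starts on a fresh line
          }
        } finally {
          merged.close()
        }
      }
    }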
diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/GatherFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/GatherFunction.scala index 6b8b5d143f..c8b9d52fbe 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/GatherFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/GatherFunction.scala @@ -1,3 +1,27 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.queue.function.scattergather import java.io.File @@ -11,22 +35,31 @@ import collection.JavaConversions._ * Base class for Gather command line functions. */ trait GatherFunction extends QFunction { + analysisName = "Gather" + var originalFunction: ScatterGatherableFunction = _ @Output(doc="The original output of the scattered function") var originalOutput: File = _ @Input(doc="Parts to gather back into the original output") - var gatherParts: List[File] = Nil - - @Input(doc="Other log files that will be gathered before this output", required=false) - var originalLogFiles: List[File] = Nil + var gatherParts: Seq[File] = Nil /** * Called to initialize the gather function values after all other values have been setup for this function. */ def init() {} + /** + * Don't include this @Gather's log file when tracking .done. + * The .done files for the original log file will suffice. + * + * The logs from the scatter/gather jobs are concatenated together into the original log. + * Without removing the logs here, a .done file would be created for them. If a ScatterGatherableFunction is switched + * from scatterCount=1 to >1, this Gather would be "missing" its logs and re-run. + */ + override protected def statusPaths = outputs + /** * Waits for gather parts to propagate over NFS or throws an exception. */ diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/GathererFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/GathererFunction.scala index c1204fd1d3..536bbf5fc6 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/GathererFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/GathererFunction.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -32,11 +32,12 @@ import collection.JavaConversions._ * Runs a Gatherer in process. */ class GathererFunction(gathererClass: Class[_ <: Gatherer]) extends InProcessFunction with GatherFunction { + analysisName = this.gathererClass.getSimpleName + def run() { val gatherer = gathererClass.newInstance if (gatherer.waitForInputs) - waitForGatherParts + waitForGatherParts() gatherer.gather(this.gatherParts, this.originalOutput) } - override def description = this.gathererClass.getSimpleName + " " + this.commandOutputs.mkString(" ") } diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterFunction.scala index 632e2d39fe..a407476719 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterFunction.scala @@ -32,14 +32,17 @@ import org.broadinstitute.sting.queue.function.QFunction * Base class for Scatter functions. */ trait ScatterFunction extends QFunction { + analysisName = "Scatter" + var originalFunction: ScatterGatherableFunction = _ @Input(doc="Original inputs to scatter") var originalInputs: Set[File] = _ + override def shortDescription = analysisName + ": %s ...".format(firstOutput.getName) + /** * Called to initialize scatter function values after all other values have been setup for this function.
- * @param originalFunction The original function to with inputs bind to this scatter function. */ def init() {} diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala index 402da4a7a2..4578f0e826 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -30,6 +30,7 @@ import org.broadinstitute.sting.commandline.{Gatherer, Gather, ArgumentSource} import org.broadinstitute.sting.queue.function.{QFunction, CommandLineFunction} import org.broadinstitute.sting.queue.QException import org.broadinstitute.sting.utils.io.IOUtils +import collection.immutable.ListMap /** * A function that can be run faster by splitting it up into pieces and then joining together the results. @@ -47,28 +48,28 @@ trait ScatterGatherableFunction extends CommandLineFunction { /** * Function that returns the class to use for gathering a directory. If it returns null then @Gather annotation will be used. - * @param gatherField Field that is to be gathered. + * PartialFunction param gatherField Field that is to be gathered. * @return The class of the GatherFunction to be used or null. */ var gatherClass: PartialFunction[ArgumentSource, Class[_ <: GatherFunction]] = _ /** * Allows external modification of the ScatterFunction that will create the scatter pieces in the temporary directories. - * @param scatterFunction The function that will create the scatter pieces in the temporary directories. + * PartialFunction param scatterFunction The function that will create the scatter pieces in the temporary directories. */ var setupScatterFunction: PartialFunction[ScatterFunction, Unit] = _ /** * Allows external modification of the GatherFunction that will collect the gather pieces in the temporary directories. - * @param gatherFunction The function that will merge the gather pieces from the temporary directories. - * @param gatherField The output field being gathered. + * PartialFunction param gatherFunction The function that will merge the gather pieces from the temporary directories. + * PartialFunction param gatherField The output field being gathered. */ var setupGatherFunction: PartialFunction[(GatherFunction, ArgumentSource), Unit] = _ /** * Allows external modification of the cloned function. - * @param cloneFunction A clone wrapper of this ScatterGatherableFunction - * @param index The one based index (from 1..scatterCount inclusive) of the scatter piece. + * PartialFunction param cloneFunction A clone wrapper of this ScatterGatherableFunction + * PartialFunction param index The one based index (from 1..scatterCount inclusive) of the scatter piece. 
*/ var setupCloneFunction: PartialFunction[(CloneFunction, Int), Unit] = _ @@ -108,8 +109,9 @@ trait ScatterGatherableFunction extends CommandLineFunction { scatterFunction.originalFunction = this scatterFunction.originalInputs = inputFiles scatterFunction.commandDirectory = this.scatterGatherTempDir("scatter") - scatterFunction.isIntermediate = true + scatterFunction.jobOutputFile = new File("scatter.out") scatterFunction.addOrder = this.addOrder :+ 1 + scatterFunction.isIntermediate = true initScatterFunction(scatterFunction) scatterFunction.absoluteCommandDirectory() @@ -121,69 +123,63 @@ trait ScatterGatherableFunction extends CommandLineFunction { * Returns a list of scatter / gather and clones of this function * that can be run in parallel to produce the same output as this * command line function. - * @return List[QFunction] to run instead of this function. + * @return Seq[QFunction] to run instead of this function. */ def generateFunctions() = { - var functions = List.empty[QFunction] - - // Only gather up fields that will have a value - val outputFieldsWithValues = this.outputFields.filter(hasFieldValue(_)) - - // Create the scatter function based on @Scatter - functions :+= scatterFunction - // Ask the scatter function how many clones to create. val numClones = scatterFunction.scatterCount - // List of the log files that are output by this function. - var logFiles = List(this.jobOutputFile) - if (this.jobErrorFile != null) - logFiles :+= this.jobErrorFile - // Create the gather functions for each output field - var gatherFunctions = Map.empty[ArgumentSource, GatherFunction] - var gatherOutputs = Map.empty[ArgumentSource, File] + var gatherFunctions = ListMap.empty[ArgumentSource, GatherFunction] + var gatherOutputs = ListMap.empty[ArgumentSource, File] var gatherAddOrder = numClones + 2 + + // Only track fields that will have an output file + val outputFieldsWithValues = this.outputFields. + filter(hasFieldValue(_)). 
+ filter(gatherField => !IOUtils.isSpecialFile(getFieldFile(gatherField))) + for (gatherField <- outputFieldsWithValues) { - val gatherOutput = getFieldFile(gatherField) + gatherOutputs += gatherField -> getFieldFile(gatherField) + } + + // Only gather fields that are @Gather(enabled=true) + val outputFieldsWithGathers = outputFieldsWithValues.filter(hasGatherFunction(_)) + + for (gatherField <- outputFieldsWithGathers) { + val gatherOutput = gatherOutputs(gatherField) val gatherFunction = this.newGatherFunction(gatherField) this.copySettingsTo(gatherFunction) gatherFunction.originalFunction = this gatherFunction.originalOutput = gatherOutput gatherFunction.commandDirectory = this.scatterGatherTempDir("gather-" + gatherField.field.getName) - // If this is a gather for a log file, make the gather intermediate just in case the log file name changes - // Otherwise have the regular output function wait on the log files to gather - if (isLogFile(gatherOutput)) { - gatherFunction.isIntermediate = true - // Only delete the log files if the original function is an intermediate - // and the intermediate files are supposed to be deleted - gatherFunction.deleteIntermediateOutputs = this.isIntermediate && this.deleteIntermediateOutputs - } else { - gatherFunction.originalLogFiles = logFiles - } + gatherFunction.jobOutputFile = new File("gather-" + gatherOutput.getName + ".out") gatherFunction.addOrder = this.addOrder :+ gatherAddOrder initGatherFunction(gatherFunction, gatherField) gatherFunction.absoluteCommandDirectory() gatherFunction.init() - functions :+= gatherFunction gatherFunctions += gatherField -> gatherFunction - gatherOutputs += gatherField -> gatherOutput gatherAddOrder += 1 } // Create the clone functions for running the parallel jobs - var cloneFunctions = List.empty[CloneFunction] + var cloneFunctions = Seq.empty[CloneFunction] + val dirFormat = "temp_%%0%dd_of_%d".format(numClones.toString.length(), numClones) for (i <- 1 to numClones) { val cloneFunction = this.newCloneFunction() this.copySettingsTo(cloneFunction) cloneFunction.originalFunction = this + cloneFunction.analysisName = this.analysisName cloneFunction.cloneIndex = i - cloneFunction.commandDirectory = this.scatterGatherTempDir("temp-"+i) + cloneFunction.commandDirectory = this.scatterGatherTempDir(dirFormat.format(i)) + cloneFunction.jobOutputFile = if (IOUtils.isSpecialFile(this.jobOutputFile)) this.jobOutputFile else new File(this.jobOutputFile.getName) + if (this.jobErrorFile != null) + cloneFunction.jobErrorFile = if (IOUtils.isSpecialFile(this.jobErrorFile)) this.jobErrorFile else new File(this.jobErrorFile.getName) cloneFunction.addOrder = this.addOrder :+ (i+1) cloneFunction.isIntermediate = true @@ -200,17 +196,39 @@ trait ScatterGatherableFunction extends CommandLineFunction { // If the command directory is relative, insert the run directory ahead of it. cloneFunction.absoluteCommandDirectory() - // Get absolute paths to the files and bind the sg functions to the clone function via the absolute paths. + // Allow the scatter function to set the specific input for this clone scatterFunction.bindCloneInputs(cloneFunction, i) + + // Set each of the clone outputs to be absolute paths. 
for (gatherField <- outputFieldsWithValues) { val gatherPart = IOUtils.absolute(cloneFunction.commandDirectory, cloneFunction.getFieldFile(gatherField)) cloneFunction.setFieldValue(gatherField, gatherPart) - gatherFunctions(gatherField).gatherParts :+= gatherPart + } + + // For the outputs that are being gathered, add this clone's output to the gather list. + for (gatherField <- outputFieldsWithGathers) { + gatherFunctions(gatherField).gatherParts :+= cloneFunction.getFieldFile(gatherField) } cloneFunctions :+= cloneFunction } - functions ++= cloneFunctions + + // Track the functions starting with the scatter function. + var functions: Seq[QFunction] = Seq(scatterFunction) ++ cloneFunctions ++ gatherFunctions.values + + // Make all log file paths absolute. + for (function <- functions) { + function.jobOutputFile = IOUtils.absolute(function.commandDirectory, function.jobOutputFile) + if (function.jobErrorFile != null) + function.jobErrorFile = IOUtils.absolute(function.commandDirectory, function.jobErrorFile) + } + + val jobOutputGather = gatherLogFile(_.jobOutputFile, functions, gatherAddOrder) + if (this.jobErrorFile != null) { + val jobErrorGather = gatherLogFile(_.jobErrorFile, functions, gatherAddOrder + 1) + functions :+= jobErrorGather + } + functions :+= jobOutputGather // Return all the various created functions. functions @@ -237,6 +255,25 @@ trait ScatterGatherableFunction extends CommandLineFunction { this.setupScatterFunction(scatterFunction) } + /** + * Returns true if the field should be gathered. + * @param gatherField Field that defined @Gather. + * @return true if the field should be gathered. + */ + protected def hasGatherFunction(gatherField: ArgumentSource) : Boolean = { + // Check if there is a function that will return the gather class for this field. + if (this.gatherClass != null && this.gatherClass.isDefinedAt(gatherField)) + true + + // Check for an annotation defining the gather class. + else if (ReflectionUtils.hasAnnotation(gatherField.field, classOf[Gather])) + ReflectionUtils.getAnnotation(gatherField.field, classOf[Gather]).enabled + + // Nothing else disables gathering for this field. + else + true + } + /** * Creates a new GatherFunction for the gatherField. * @param gatherField Field that defined @Gather. @@ -255,16 +292,18 @@ if (ReflectionUtils.hasAnnotation(gatherField.field, classOf[Gather])) { gatherClass = ReflectionUtils.getAnnotation(gatherField.field, classOf[Gather]).value } else { - throw new QException("Missing @Gather annotation: " + gatherField.field) + throw new QException("Missing @Gather annotation on %s".format(gatherField.field)) } } - if (classOf[GatherFunction].isAssignableFrom(gatherClass)) { + if (gatherClass == classOf[GatherFunction]) { + throw new QException("@Gather did not specify class type on %s".format(gatherField.field)) + } else if (classOf[GatherFunction].isAssignableFrom(gatherClass)) { gatherClass.newInstance.asInstanceOf[GatherFunction] } else if (classOf[Gatherer].isAssignableFrom(gatherClass)) { new GathererFunction(gatherClass.asSubclass(classOf[Gatherer])) } else { - throw new QException("Unsupported @Gather class type: " + gatherClass) + throw new QException("Unsupported @Gather class type on %s: %s".format(gatherField.field, gatherClass)) } }
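To illustrate the hasGatherFunction check above, a sketch of a QScript-style trait with one gathered and one deliberately ungathered output; the trait and field names are hypothetical, and this leans on the @Gather enabled element referenced in that code:

    import java.io.File
    import org.broadinstitute.sting.commandline.{Gather, Output}

    trait PerShardOutputs {
      // A normal output: gathered according to its @Gather annotation or the
      // gatherClass partial function, as decided by hasGatherFunction.
      @Output(doc="Merged calls")
      var calls: File = _

      // Opted out: enabled=false makes hasGatherFunction return false, so no
      // gather job is created and the per-shard files are left in place.
      @Output(doc="Per-shard metrics", required=false)
      @Gather(enabled=false)
      var metrics: File = _
    }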
@@ -298,10 +337,27 @@ trait ScatterGatherableFunction extends CommandLineFunction { this.setupCloneFunction(cloneFunction, index) } + /** + * Gathers up the log files from other functions. + * @param logFile Takes a QFunction and returns its log file. + * @param functions The functions for which the logs will be concatenated. + * @param addOrder The order this function should be added to the graph. + */ + private def gatherLogFile(logFile: (QFunction) => File, functions: Seq[QFunction], addOrder: Int) = { + val gatherLogFunction = new ConcatenateLogsFunction + this.copySettingsTo(gatherLogFunction) + gatherLogFunction.logs = functions.map(logFile).filter(_ != null) + gatherLogFunction.jobOutputFile = logFile(this) + gatherLogFunction.commandDirectory = this.scatterGatherTempDir() + gatherLogFunction.addOrder = this.addOrder :+ addOrder + gatherLogFunction.isIntermediate = false + gatherLogFunction + } + /** * Returns a temporary directory under this scatter gather directory. - * @param Sub directory under the scatter gather directory. + * @param subDir directory under the scatter gather directory. * @return temporary directory under this scatter gather directory. */ - private def scatterGatherTempDir(subDir: String) = IOUtils.absolute(this.scatterGatherDirectory, this.jobName + "-sg/" + subDir) + private def scatterGatherTempDir(subDir: String = "") = IOUtils.absolute(this.scatterGatherDirectory, this.jobName + "-sg/" + subDir) } diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala index 3935c2138d..3c7cd0a2d5 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala @@ -26,23 +26,23 @@ class VCFExtractIntervals(inVCF: File, outList: File, useFilterSites: Boolean) e var cur : String = null if ( elems.hasNext ) { cur = elems.next - } else { - out.printf("%s%n",prev) - } - while ( elems.hasNext ) { - out.printf("%s%n",prev) - while ( cur.equals(prev) && elems.hasNext && !cur.equals("") ) { - cur = elems.next - } - - if ( ! cur.equals(prev) ) { - if ( elems.hasNext ) { - prev = cur + while ( elems.hasNext ) { + out.printf("%s%n",prev) + while ( cur.equals(prev) && elems.hasNext && !cur.equals("") ) { cur = elems.next - } else { - out.printf("%s%n",cur) + } + + if ( ! 
cur.equals(prev) ) { + if ( elems.hasNext ) { + prev = cur + cur = elems.next + } } } + out.printf("%s%n",prev) + out.printf("%s%n",cur) + } else { + out.printf("%s%n",prev) } out.close diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala index 54e5411429..3179c327f9 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala @@ -6,7 +6,7 @@ import collection.JavaConversions._ import org.broadinstitute.sting.commandline._ import java.io.{PrintWriter, PrintStream, File} -class VCFExtractSamples(inVCF: File, outVCF: File, samples: List[String]) extends InProcessFunction { +class VCFExtractSamples(inVCF: File, outVCF: File, samples: List[String]) extends InProcessFunction { def this(in: File, out: File, samples: File) = this(in,out, (new XReadLines(samples)).readLines.toList) @Input(doc="VCF from which to extract samples") var inputVCF : File = inVCF diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/EmailMessage.scala b/public/scala/src/org/broadinstitute/sting/queue/util/EmailMessage.scala index cda981d29c..2ef7aa06f0 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/EmailMessage.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/EmailMessage.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -34,12 +34,12 @@ import scala.collection.JavaConversions._ */ class EmailMessage extends Logging { var from: String = _ - var to: List[String] = Nil - var cc: List[String] = Nil - var bcc: List[String] = Nil + var to: Seq[String] = Nil + var cc: Seq[String] = Nil + var bcc: Seq[String] = Nil var subject: String = _ var body: String = _ - var attachments: List[File] = Nil + var attachments: Seq[File] = Nil /** * Sends the email and throws an exception if the email can't be sent. @@ -111,10 +111,10 @@ class EmailMessage extends Logging { /** * Converts the email addresses to a collection of InternetAddress which can bypass client side validation, * specifically that the domain is specified. - * @param addresses List of email addresses. + * @param addresses Seq of email addresses. 
* @return java.util.List of InternetAddress'es */ - private def convert(addresses: List[String]): java.util.List[InternetAddress] = { + private def convert(addresses: Seq[String]): java.util.List[InternetAddress] = { addresses.map(address => new InternetAddress(address, false)) } diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala b/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala index 73d1c028a2..e548e5c5e1 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -28,9 +28,10 @@ import org.broadinstitute.sting.queue.function.QFunction import org.broadinstitute.sting.gatk.report.{GATKReportTable, GATKReport} import org.broadinstitute.sting.utils.exceptions.UserException import org.broadinstitute.sting.queue.engine.JobRunInfo -import java.io.{FileOutputStream, PrintStream, File} +import java.io.{PrintStream, File} import org.broadinstitute.sting.utils.R.{RScriptLibrary, RScriptExecutor} import org.broadinstitute.sting.utils.io.Resource +import org.apache.commons.io.{IOUtils, FileUtils} /** * A mixin to add Job info to the class @@ -67,7 +68,7 @@ trait QJobReport extends Logging { def getReportGroup = self.analysisName.replaceAll(GATKReportTable.INVALID_TABLE_NAME_REGEX, "_") def getReportFeatures = reportFeatures - def getReportFeatureNames: List[String] = getReportFeatures.keys.toList + def getReportFeatureNames: Seq[String] = getReportFeatures.keys.toSeq def getReportFeature(key: String): String = { getReportFeatures.get(key) match { case Some(x) => x @@ -102,9 +103,12 @@ object QJobReport { def printReport(jobsRaw: Map[QFunction, JobRunInfo], dest: File) { val jobs = jobsRaw.filter(_._2.isFilledIn).filter(_._1.includeInReport) jobs foreach {case (qf, info) => qf.setRunInfo(info)} - val stream = new PrintStream(new FileOutputStream(dest)) - printJobLogging(jobs.keys.toList, stream) - stream.close() + val stream = new PrintStream(FileUtils.openOutputStream(dest)) + try { + printJobLogging(jobs.keys.toSeq, stream) + } finally { + IOUtils.closeQuietly(stream) + } } def plotReport(reportFile: File, pdfFile: File) { @@ -129,7 +133,7 @@ object QJobReport { * Prints the JobLogging logs to a GATKReport. 
First splits up the * logs by group, and for each group generates a GATKReportTable */ - private def printJobLogging(logs: List[QFunction], stream: PrintStream) { + private def printJobLogging(logs: Seq[QFunction], stream: PrintStream) { // create the report val report: GATKReport = new GATKReport @@ -151,11 +155,11 @@ report.print(stream) } - private def groupLogs(logs: List[QFunction]): Map[String, List[QFunction]] = { + private def groupLogs(logs: Seq[QFunction]): Map[String, Seq[QFunction]] = { logs.groupBy(_.getReportGroup) } - private def logKeys(logs: List[QFunction]): Set[String] = { + private def logKeys(logs: Seq[QFunction]): Set[String] = { // the keys should be the same for each log, but we will check that val keys = Set[String](logs(0).getReportFeatureNames : _*)
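The printReport change above swaps a bare FileOutputStream for commons-io helpers; a self-contained sketch of the same open-write-close pattern, with a made-up file name and report body standing in for report.print(stream):

    import java.io.{File, PrintStream}
    import org.apache.commons.io.{FileUtils, IOUtils}

    object ReportStreamSketch {
      def writeReport(dest: File) {
        // openOutputStream creates any missing parent directories before opening,
        // which a bare new FileOutputStream(dest) would not do.
        val stream = new PrintStream(FileUtils.openOutputStream(dest))
        try {
          stream.println("report body goes here")
        } finally {
          // closeQuietly swallows exceptions on close, so a failed close
          // cannot mask an exception thrown while writing.
          IOUtils.closeQuietly(stream)
        }
      }
    }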
- if (in.toString.endsWith(".bam") || in.toString.endsWith(".fasta") || in.toString.endsWith(".fq")) - return List(in) + if (in.toString.toUpperCase.endsWith(".BAM") || + in.toString.toUpperCase.endsWith(".FASTA") || + in.toString.toUpperCase.endsWith(".FQ") || + in.toString.toUpperCase.endsWith("FASTQ") ) + return Seq(in) - var list: List[File] = List() - for (file <- fromFile(in).getLines) + var list: Seq[File] = Seq() + for (file <- fromFile(in).getLines()) if (!file.startsWith("#") && !file.isEmpty ) list :+= new File(file.trim()) list.sortWith(_.compareTo(_) < 0) @@ -55,8 +81,4 @@ object QScriptUtils { } false } - - - def ?[A <: AnyRef](ref: A): Option[A] = - if (ref eq null) None else Some(ref) -} \ No newline at end of file +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/ReflectionUtils.scala b/public/scala/src/org/broadinstitute/sting/queue/util/ReflectionUtils.scala index f6a174dd66..980a22e8e4 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/ReflectionUtils.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/ReflectionUtils.scala @@ -1,3 +1,27 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.queue.util import org.broadinstitute.sting.queue.QException @@ -64,17 +88,17 @@ object ReflectionUtils { /** * Returns all the declared fields on a class in order of sub type to super type. * @param clazz Base class to start looking for fields. - * @return List[Field] found on the class and all super classes. + * @return Seq[Field] found on the class and all super classes. */ - def getAllFields(clazz: Class[_]) = getAllTypes(clazz).map(_.getDeclaredFields).flatMap(_.toList) + def getAllFields(clazz: Class[_]) = getAllTypes(clazz).map(_.getDeclaredFields).flatMap(_.toSeq) /** * Gets all the types on a class in order of sub type to super type. * @param clazz Base class. - * @return List[Class] including the class and all super classes. + * @return Seq[Class] including the class and all super classes. 
*/ def getAllTypes(clazz: Class[_]) = { - var types = List.empty[Class[_]] + var types = Seq.empty[Class[_]] var c = clazz while (c != null) { types :+= c diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala b/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala index 58341a0a5e..6b615e6d9d 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala @@ -1,3 +1,27 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.queue.util import collection.JavaConversions._ @@ -14,32 +38,34 @@ class ScalaCompoundArgumentTypeDescriptor extends ArgumentTypeDescriptor { /** * Checks if the class type is a scala collection. * @param classType Class type to check. - * @return true if the class is a List, Set, or an Option. + * @return true if the class is a Seq, Set, or an Option. */ def supports(classType: Class[_]) = isCompound(classType) /** * Checks if the class type is a scala collection. * @param source Argument source to check. - * @return true if the source is a List, Set, or an Option. + * @return true if the source is a Seq, Set, or an Option. */ override def isMultiValued(source: ArgumentSource) = isCompound(source.field.getType) /** * Checks if the class type is a scala collection. * @param classType Class type to check. - * @return true if the class is a List, Set, or an Option. + * @return true if the class is a Seq, Set, or an Option. */ private def isCompound(classType: Class[_]) = { - classOf[List[_]].isAssignableFrom(classType) || + classOf[Seq[_]].isAssignableFrom(classType) || + classOf[List[_]].isAssignableFrom(classType) || // see comment below re: List vs. Seq classOf[Set[_]].isAssignableFrom(classType) || classOf[Option[_]].isAssignableFrom(classType) } /** * Parses the argument matches based on the class type of the argument source's field. + * @param parsingEngine Parsing engine. * @param source Argument source that contains the field being populated. - * @param classType Class type being parsed. + * @param typeType Type of the argument source's field. * @param argumentMatches The argument match strings that were found for this argument source. * @return The parsed object. 
*/ @@ -51,7 +77,15 @@ class ScalaCompoundArgumentTypeDescriptor extends ArgumentTypeDescriptor { val componentType = ReflectionUtils.getCollectionType(source.field) val componentArgumentParser = parsingEngine.selectBestTypeDescriptor(componentType) - if (classOf[List[_]].isAssignableFrom(classType)) { + if (classOf[Seq[_]].isAssignableFrom(classType)) { + var seq = Seq.empty[Any] + for (argumentMatch <- argumentMatches) + for (value <- argumentMatch) + seq :+= componentArgumentParser.parse(parsingEngine, source, componentType, new ArgumentMatches(value)) + seq + } else if (classOf[List[_]].isAssignableFrom(classType)) { + // QScripts should be using the interface Seq instead of the class List. + // Leaving this here for now for legacy support until the effects of switching have been tested for a while. -ks var list = List.empty[Any] for (argumentMatch <- argumentMatches) for (value <- argumentMatch) diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/StringFileConversions.scala b/public/scala/src/org/broadinstitute/sting/queue/util/StringFileConversions.scala index 62240b6045..2c6d62ae91 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/StringFileConversions.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/StringFileConversions.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -44,7 +44,7 @@ object StringFileConversions { // and mixins all correct so this doesn't have to be duplicated with concrete implementations? // http://programming-scala.labs.oreilly.com/ch12.html is your friend. - implicit def stringsAsFiles(x: List[Comparable[_ >: String with File <: Comparable[_ >: String with File <: Serializable] with Serializable] with Serializable]): List[File] = { + implicit def stringsAsFiles(x: Seq[Comparable[_ >: String with File <: Comparable[_ >: String with File <: Serializable] with Serializable] with Serializable]): Seq[File] = { x.map(_ match { case string: String => stringAsFile(string) case file: File => file @@ -52,7 +52,23 @@ object StringFileConversions { }) } - implicit def filesAsStrings(x: List[Comparable[_ >: File with String <: Comparable[_ >: File with String <: Serializable] with Serializable] with Serializable]): List[String] = { + implicit def filesAsStrings(x: Seq[Comparable[_ >: File with String <: Comparable[_ >: File with String <: Serializable] with Serializable] with Serializable]): Seq[String] = { + x.map(_ match { + case file: File => fileAsString(file) + case string: String => string + case null => null + }) + } + + implicit def stringsAsFilesList(x: List[Comparable[_ >: String with File <: Comparable[_ >: String with File <: Serializable] with Serializable] with Serializable]): List[File] = { + x.map(_ match { + case string: String => stringAsFile(string) + case file: File => file + case null => null + }) + } + + implicit def filesAsStringsList(x: List[Comparable[_ >: File with String <: Comparable[_ >: File with String <: Serializable] with Serializable] with Serializable]): List[String] = { x.map(_ match { case file: File => fileAsString(file) case string: String => string @@ -91,14 +107,22 @@ trait StringFileConversions { StringFileConversions.fileAsString(x) } - implicit def stringsAsFiles(x: List[Comparable[_ >: String with File <: Comparable[_ >: String with File <: Serializable] with Serializable] with Serializable]): 
List[File] = { + implicit def stringsAsFiles(x: Seq[Comparable[_ >: String with File <: Comparable[_ >: String with File <: Serializable] with Serializable] with Serializable]): Seq[File] = { StringFileConversions.stringsAsFiles(x) } - implicit def filesAsStrings(x: List[Comparable[_ >: File with String <: Comparable[_ >: File with String <: Serializable] with Serializable] with Serializable]): List[String] = { + implicit def filesAsStrings(x: Seq[Comparable[_ >: File with String <: Comparable[_ >: File with String <: Serializable] with Serializable] with Serializable]): Seq[String] = { StringFileConversions.filesAsStrings(x) } + implicit def stringsAsFilesList(x: List[Comparable[_ >: String with File <: Comparable[_ >: String with File <: Serializable] with Serializable] with Serializable]): List[File] = { + StringFileConversions.stringsAsFilesList(x) + } + + implicit def filesAsStringsList(x: List[Comparable[_ >: File with String <: Comparable[_ >: File with String <: Serializable] with Serializable] with Serializable]): List[String] = { + StringFileConversions.filesAsStringsList(x) + } + implicit def stringsAsFiles(x: Set[Comparable[_ >: File with String <: Comparable[_ >: File with String <: Comparable[_ >: File with String <: Serializable] with Serializable] with Serializable] with Serializable]): Set[File] = { StringFileConversions.stringsAsFiles(x) } diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/SystemUtils.scala b/public/scala/src/org/broadinstitute/sting/queue/util/SystemUtils.scala index 9002def78f..ed149f8a43 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/SystemUtils.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/SystemUtils.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -24,7 +24,6 @@ package org.broadinstitute.sting.queue.util -import java.lang.management.ManagementFactory import java.net.InetAddress import java.io.File import io.Source @@ -56,6 +55,4 @@ object SystemUtils extends Logging { else hostName.split('.').takeRight(2).mkString(".") } - - val pidAtHost = ManagementFactory.getRuntimeMXBean.getName.split('.').head } diff --git a/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala b/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala index 5383b3716b..b233505575 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -41,9 +41,6 @@ class GATKIntervalsUnitTest { createSetFromSequenceDictionary(new ReferenceDataSource(hg18Reference).getReference.getSequenceDictionary).toList private final lazy val hg19Reference = new File(BaseTest.hg19Reference) - private final lazy val hg19GenomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(hg19Reference)) - private final lazy val hg19ReferenceLocs = GenomeLocSortedSet. 
- createSetFromSequenceDictionary(new ReferenceDataSource(hg19Reference).getReference.getSequenceDictionary).toList @Test def testWithIntervals() { @@ -51,16 +48,14 @@ class GATKIntervalsUnitTest { val chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-3") val chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:3-5") - val gi = new GATKIntervals(hg18Reference, List("chr1:1-1", "chr2:2-3", "chr3:3-5")) - Assert.assertEquals(gi.locs.toList, List(chr1, chr2, chr3)) - Assert.assertEquals(gi.contigs, List("chr1", "chr2", "chr3")) -// Assert.assertEquals(gi.getSplits(2).toList, List(2, 3)) -// Assert.assertEquals(gi.getSplits(3).toList, List(1, 2, 3)) + val gi = new GATKIntervals(hg18Reference, Seq("chr1:1-1", "chr2:2-3", "chr3:3-5")) + Assert.assertEquals(gi.locs.toSeq, Seq(chr1, chr2, chr3)) + Assert.assertEquals(gi.contigs, Seq("chr1", "chr2", "chr3")) } @Test(timeOut = 30000L) def testIntervalFile() { - var gi = new GATKIntervals(hg19Reference, List(BaseTest.hg19Intervals)) + var gi = new GATKIntervals(hg19Reference, Seq(BaseTest.hg19Intervals)) Assert.assertEquals(gi.locs.size, 189894) // Timeout check is because of bad: // for(Item item: javaConvertedScalaList) @@ -75,15 +70,13 @@ class GATKIntervalsUnitTest { val gi = new GATKIntervals(hg18Reference, Nil) Assert.assertEquals(gi.locs, hg18ReferenceLocs) Assert.assertEquals(gi.contigs.size, hg18ReferenceLocs.size) -// Assert.assertEquals(gi.getSplits(2).toList, List(10, 45)) -// Assert.assertEquals(gi.getSplits(4).toList, List(5, 10, 16, 45)) } @Test def testContigCounts() { Assert.assertEquals(new GATKIntervals(hg18Reference, Nil).contigs, hg18ReferenceLocs.map(_.getContig)) - Assert.assertEquals(new GATKIntervals(hg18Reference, List("chr1", "chr2", "chr3")).contigs, List("chr1", "chr2", "chr3")) - Assert.assertEquals(new GATKIntervals(hg18Reference, List("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2")).contigs, List("chr1", "chr2", "chr3")) + Assert.assertEquals(new GATKIntervals(hg18Reference, Seq("chr1", "chr2", "chr3")).contigs, Seq("chr1", "chr2", "chr3")) + Assert.assertEquals(new GATKIntervals(hg18Reference, Seq("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2")).contigs, Seq("chr1", "chr2", "chr3")) } @Test @@ -96,6 +89,6 @@ class GATKIntervalsUnitTest { } private def testSortAndMergeIntervals(actual: Seq[String], expected: Seq[String]) { - Assert.assertEquals(new GATKIntervals(hg18Reference, actual.toList).locs.toSeq, expected.map(hg18GenomeLocParser.parseGenomeLoc(_))) + Assert.assertEquals(new GATKIntervals(hg18Reference, actual).locs.toSeq, expected.map(hg18GenomeLocParser.parseGenomeLoc(_))) } } diff --git a/public/scala/test/org/broadinstitute/sting/queue/function/CommandLineFunctionUnitTest.scala b/public/scala/test/org/broadinstitute/sting/queue/function/CommandLineFunctionUnitTest.scala index eb50c3a2ef..9c5b648d28 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/function/CommandLineFunctionUnitTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/function/CommandLineFunctionUnitTest.scala @@ -1,3 +1,27 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * 
conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.queue.function import org.testng.Assert @@ -114,20 +138,20 @@ class CommandLineFunctionUnitTest extends CommandLineFunction { @DataProvider( name = "repeatTestData" ) def repeatDataProvider = { - Array(Array("", List("a", "bc", "d"), "", " ", true, true, " 'a' 'bc' 'd' "), - Array("", List("a", "bc", "d"), "", " ", true, false, " a bc d "), - Array("", List("a", "bc", "d"), "", "", true, true, " 'a''bc''d' "), - Array("", List("a", "bc", "d"), "", "", true, false, " abcd "), - Array("-f", List("file1", "file2", "file3"), "", " ", true, true, " '-f' 'file1' '-f' 'file2' '-f' 'file3' "), - Array("-f", List("file1", "file2", "file3"), "", " ", true, false, " -f file1 -f file2 -f file3 "), - Array("-f", List("file1", "file2", "file3"), "", " ", false, true, " '-ffile1' '-ffile2' '-ffile3' "), - Array("-f", List("file1", "file2", "file3"), "", " ", false, false, " -ffile1 -ffile2 -ffile3 "), - Array("-f", List("file1", "file2", "file3"), "", "", false, true, " '-ffile1''-ffile2''-ffile3' "), - Array("-f", List("file1", "file2", "file3"), "", "", false, false, " -ffile1-ffile2-ffile3 "), - Array("-f", List("file1", "file2", "file3"), "suffix", " ", true, true, " '-f' 'file1' 'suffix' '-f' 'file2' 'suffix' '-f' 'file3' 'suffix' "), - Array("-f", List("file1", "file2", "file3"), "suffix", " ", true, false, " -f file1 suffix -f file2 suffix -f file3 suffix "), - Array("-f", List("file1", "file2", "file3"), "suffix", " ", false, true, " '-ffile1suffix' '-ffile2suffix' '-ffile3suffix' "), - Array("-f", List("file1", "file2", "file3"), "suffix", " ", false, false, " -ffile1suffix -ffile2suffix -ffile3suffix "), + Array(Array("", Seq("a", "bc", "d"), "", " ", true, true, " 'a' 'bc' 'd' "), + Array("", Seq("a", "bc", "d"), "", " ", true, false, " a bc d "), + Array("", Seq("a", "bc", "d"), "", "", true, true, " 'a''bc''d' "), + Array("", Seq("a", "bc", "d"), "", "", true, false, " abcd "), + Array("-f", Seq("file1", "file2", "file3"), "", " ", true, true, " '-f' 'file1' '-f' 'file2' '-f' 'file3' "), + Array("-f", Seq("file1", "file2", "file3"), "", " ", true, false, " -f file1 -f file2 -f file3 "), + Array("-f", Seq("file1", "file2", "file3"), "", " ", false, true, " '-ffile1' '-ffile2' '-ffile3' "), + Array("-f", Seq("file1", "file2", "file3"), "", " ", false, false, " -ffile1 -ffile2 -ffile3 "), + Array("-f", Seq("file1", "file2", "file3"), "", "", false, true, " '-ffile1''-ffile2''-ffile3' "), + Array("-f", Seq("file1", "file2", "file3"), "", "", false, false, " -ffile1-ffile2-ffile3 "), + Array("-f", Seq("file1", "file2", "file3"), "suffix", " ", true, true, " '-f' 'file1' 'suffix' '-f' 'file2' 'suffix' '-f' 'file3' 'suffix' "), + Array("-f", Seq("file1", "file2", "file3"), "suffix", " ", true, false, " -f file1 suffix -f file2 suffix -f file3 suffix "), + Array("-f", Seq("file1", "file2", "file3"), "suffix", " ", false, true, " 
'-ffile1suffix' '-ffile2suffix' '-ffile3suffix' "), + Array("-f", Seq("file1", "file2", "file3"), "suffix", " ", false, false, " -ffile1suffix -ffile2suffix -ffile3suffix "), Array("-f", null, "", " ", true, true, ""), Array("-f", Nil, "", " ", true, true, "") ) @@ -148,11 +172,11 @@ class CommandLineFunctionUnitTest extends CommandLineFunction { @DataProvider( name = "repeatWithPrefixFormattingTestData" ) def repeatWithPrefixFormattingDataProvider = { - Array(Array("-f", List("file1", "file2", "file3"), "", " ", true, true, (prefix: String, value: Any) => "%s:tag%s".format(prefix, value), + Array(Array("-f", Seq("file1", "file2", "file3"), "", " ", true, true, (prefix: String, value: Any) => "%s:tag%s".format(prefix, value), " '-f:tagfile1' 'file1' '-f:tagfile2' 'file2' '-f:tagfile3' 'file3' "), - Array("-f", List("file1", "file2", "file3"), "", " ", true, false, (prefix: String, value: Any) => "%s:tag%s".format(prefix, value), + Array("-f", Seq("file1", "file2", "file3"), "", " ", true, false, (prefix: String, value: Any) => "%s:tag%s".format(prefix, value), " -f:tagfile1 file1 -f:tagfile2 file2 -f:tagfile3 file3 "), - Array("", List("file1", "file2", "file3"), "", " ", true, true, (prefix: String, value: Any) => "-%s".format(value), + Array("", Seq("file1", "file2", "file3"), "", " ", true, true, (prefix: String, value: Any) => "-%s".format(value), " '-file1' 'file1' '-file2' 'file2' '-file3' 'file3' "), Array("-f", null, "", " ", true, true, (prefix: String, value: Any) => "%s:tag%s".format(prefix, value), ""), diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala index aedbc1cd39..f0feb207b8 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -47,10 +47,10 @@ object PipelineTest extends BaseTest with Logging { final val allJobRunners = { val commandLinePluginManager = new CommandLinePluginManager - commandLinePluginManager.getPlugins.map(commandLinePluginManager.getName(_)).toList + commandLinePluginManager.getPlugins.map(commandLinePluginManager.getName(_)).toSeq } - final val defaultJobRunners = List("Lsf706", "GridEngine") + final val defaultJobRunners = Seq("Lsf706", "GridEngine") /** * Returns the top level output path to this test. 
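The List-to-Seq migration running through these files follows one pattern: the new Seq branch in ScalaCompoundArgumentTypeDescriptor (above) builds the parsed collection by appending to an immutable Seq held in a var, exactly as the legacy List branch does. A minimal, self-contained sketch of that pattern follows; the parse helper and the sample nested matches are hypothetical stand-ins for the real componentArgumentParser and ArgumentMatches plumbing:

object SeqAppendSketch {
  // Hypothetical stand-in for componentArgumentParser.parse(...).
  def parse(value: String): Any = new java.io.File(value)

  def main(args: Array[String]) {
    // Hypothetical stand-in for iterating argumentMatches and their values.
    val argumentMatches = Seq(Seq("a.vcf", "b.vcf"), Seq("c.vcf"))
    var seq = Seq.empty[Any]
    for (argumentMatch <- argumentMatches)
      for (value <- argumentMatch)
        seq :+= parse(value)
    println(seq)  // List(a.vcf, b.vcf, c.vcf): the default Seq is backed by List
  }
}

Declaring arguments against the Seq interface rather than the List class leaves the parser free to choose the backing implementation, which is why QScripts written against List keep working during the transition.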
diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestEvalSpec.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestEvalSpec.scala index 33b8c1c392..3996f2ca3a 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestEvalSpec.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestEvalSpec.scala @@ -1,14 +1,38 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.queue.pipeline /** * Data validations to evaluate on a GATKReport. */ class PipelineTestEvalSpec { - /** List of eval modules to output. */ + /** Eval modules to output. */ var evalReport: String = _ /** Validations to assert. */ - var validations: List[PipelineValidation[_]] = Nil + var validations: Seq[PipelineValidation[_]] = Nil } /** A VariantEval JEXL and range of values to validate. */ diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestSpec.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestSpec.scala index a7b3f3a47f..0900246982 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestSpec.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestSpec.scala @@ -1,3 +1,27 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.queue.pipeline class PipelineTestSpec(var name: String = null) { @@ -9,7 +33,7 @@ class PipelineTestSpec(var name: String = null) { var jobQueue: String = _ /** Job runners to run the test. Default is null which means use the default. */ - var jobRunners: List[String] = _ + var jobRunners: Seq[String] = _ /** Expected MD5 results for each file path. */ var fileMD5s = Map.empty[String, String] diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/DevNullOutputPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/DevNullOutputPipelineTest.scala new file mode 100644 index 0000000000..9bb287ac44 --- /dev/null +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/DevNullOutputPipelineTest.scala @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.queue.pipeline.examples + +import org.testng.annotations.Test +import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} +import org.broadinstitute.sting.BaseTest + +class DevNullOutputPipelineTest { + @Test + def testDevNullOutput() { + val spec = new PipelineTestSpec + spec.name = "devnulloutput" + spec.args = Array( + " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/DevNullOutput.scala", + " -R " + BaseTest.testDir + "exampleFASTA.fasta", + " -I " + BaseTest.testDir + "exampleBAM.bam").mkString + spec.jobRunners = PipelineTest.allJobRunners + PipelineTest.executeTest(spec) + } +} diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala index d50673a1a9..f598402af5 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala @@ -39,6 +39,7 @@ class ExampleUnifiedGenotyperPipelineTest { " -I " + BaseTest.testDir + "exampleBAM.bam", " -filter QD", " -filterExpression 'QD < 2.0'").mkString + spec.jobRunners = PipelineTest.allJobRunners PipelineTest.executeTest(spec) } } diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala index f320cb3a6a..a43727ba60 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -38,11 +38,11 @@ class HelloWorldPipelineTest { } @Test - def testHelloWorldWithPrefix() { + def testHelloWorldWithRunName() { val spec = new PipelineTestSpec - spec.name = "HelloWorldWithPrefix" + spec.name = "HelloWorldWithRunName" spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + - " -jobPrefix HelloWorld" + " -runName HelloWorld" spec.jobRunners = PipelineTest.allJobRunners PipelineTest.executeTest(spec) } @@ -73,7 +73,7 @@ class HelloWorldPipelineTest { spec.name = "HelloWorldWithLsfResource" spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + " -jobResReq rusage[iodine_io=1] -jobResReq select[swp>0] -jobResReq order[swp]" - spec.jobRunners = List("Lsf706") + spec.jobRunners = Seq("Lsf706") PipelineTest.executeTest(spec) } @@ -83,7 +83,7 @@ class HelloWorldPipelineTest { spec.name = "HelloWorldWithLsfResourceAndMemoryLimit" spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + " -memLimit 1.25 -jobResReq rusage[iodine_io=1] -jobResReq select[swp>0] -jobResReq order[swp]" - spec.jobRunners = List("Lsf706") + spec.jobRunners = Seq("Lsf706") PipelineTest.executeTest(spec) } @@ -93,7 +93,7 @@ class HelloWorldPipelineTest { spec.name = "HelloWorldWithLsfEnvironment" spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + " -jobEnv tv" - spec.jobRunners = List("Lsf706") +
spec.jobRunners = Seq("Lsf706") PipelineTest.executeTest(spec) } @@ -103,7 +103,7 @@ class HelloWorldPipelineTest { spec.name = "HelloWorldWithGridEngineResource" spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + " -jobResReq s_core=1000M" - spec.jobRunners = List("GridEngine") + spec.jobRunners = Seq("GridEngine") PipelineTest.executeTest(spec) } @@ -113,7 +113,7 @@ class HelloWorldPipelineTest { spec.name = "HelloWorldWithGridEngineResourceAndMemoryLimit" spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + " -memLimit 1.25 -jobResReq s_core=1000M" - spec.jobRunners = List("GridEngine") + spec.jobRunners = Seq("GridEngine") PipelineTest.executeTest(spec) } @@ -123,7 +123,7 @@ class HelloWorldPipelineTest { spec.name = "HelloWorldWithGridEngineEnvironment" spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + " -jobEnv \"make 1\"" - spec.jobRunners = List("GridEngine") + spec.jobRunners = Seq("GridEngine") PipelineTest.executeTest(spec) } } diff --git a/public/scala/test/org/broadinstitute/sting/queue/util/StringFileConversionsUnitTest.scala b/public/scala/test/org/broadinstitute/sting/queue/util/StringFileConversionsUnitTest.scala index a735edebee..4d364040a8 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/util/StringFileConversionsUnitTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/util/StringFileConversionsUnitTest.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -50,40 +50,40 @@ class StringFileConversionsUnitTest { @Test def testStringToFileList() { - var files = List(new File("foo")) + var files = Seq(new File("foo")) files :+= "bar" - Assert.assertEquals(files, List(new File("foo"), new File("bar"))) + Assert.assertEquals(files, Seq(new File("foo"), new File("bar"))) - files = List(new File("foo")) + files = Seq(new File("foo")) files :+= null.asInstanceOf[String] - Assert.assertEquals(files, List(new File("foo"), null)) + Assert.assertEquals(files, Seq(new File("foo"), null)) - files = List[File](null) + files = Seq[File](null) files :+= "foo" - Assert.assertEquals(files, List(null, new File("foo"))) + Assert.assertEquals(files, Seq(null, new File("foo"))) - files = List[File](null) + files = Seq[File](null) files :+= null.asInstanceOf[String] - Assert.assertEquals(files, List(null, null)) + Assert.assertEquals(files, Seq(null, null)) } @Test def testFileToStringList() { - var strings = List("foo") + var strings = Seq("foo") strings :+= new File("bar") - Assert.assertEquals(strings, List("foo", "bar")) + Assert.assertEquals(strings, Seq("foo", "bar")) - strings = List("foo") + strings = Seq("foo") strings :+= null.asInstanceOf[File] - Assert.assertEquals(strings, List("foo", null)) + Assert.assertEquals(strings, Seq("foo", null)) - strings = List[String](null) + strings = Seq[String](null) strings :+= new File("foo") - Assert.assertEquals(strings, List(null, "foo")) + Assert.assertEquals(strings, Seq(null, "foo")) - strings = List[String](null) + strings = Seq[String](null) strings :+= null.asInstanceOf[File] - Assert.assertEquals(strings, List(null, null)) + Assert.assertEquals(strings, Seq(null, null)) } @Test @@ -126,40 +126,40 @@ class StringFileConversionsUnitTest { @Test def 
testStringListToFileList() { - var files = List(new File("foo")) - files ++= List("bar") - Assert.assertEquals(files, List(new File("foo"), new File("bar"))) + var files = Seq(new File("foo")) + files ++= Seq("bar") + Assert.assertEquals(files, Seq(new File("foo"), new File("bar"))) - files = List(new File("foo")) - files ++= List[String](null) - Assert.assertEquals(files, List(new File("foo"), null)) + files = Seq(new File("foo")) + files ++= Seq[String](null) + Assert.assertEquals(files, Seq(new File("foo"), null)) - files = List[File](null) - files ++= List("foo") - Assert.assertEquals(files, List(null, new File("foo"))) + files = Seq[File](null) + files ++= Seq("foo") + Assert.assertEquals(files, Seq(null, new File("foo"))) - files = List[File](null) - files ++= List[String](null) - Assert.assertEquals(files, List(null, null)) + files = Seq[File](null) + files ++= Seq[String](null) + Assert.assertEquals(files, Seq(null, null)) } @Test def testFileListToStringList() { - var strings = List("foo") - strings ++= List(new File("bar")) - Assert.assertEquals(strings, List("foo", "bar")) + var strings = Seq("foo") + strings ++= Seq(new File("bar")) + Assert.assertEquals(strings, Seq("foo", "bar")) - strings = List("foo") - strings ++= List[File](null) - Assert.assertEquals(strings, List("foo", null)) + strings = Seq("foo") + strings ++= Seq[File](null) + Assert.assertEquals(strings, Seq("foo", null)) - strings = List[String](null) - strings ++= List(new File("foo")) - Assert.assertEquals(strings, List(null, "foo")) + strings = Seq[String](null) + strings ++= Seq(new File("foo")) + Assert.assertEquals(strings, Seq(null, "foo")) - strings = List[String](null) - strings ++= List[File](null) - Assert.assertEquals(strings, List(null, null)) + strings = Seq[String](null) + strings ++= Seq[File](null) + Assert.assertEquals(strings, Seq(null, null)) } @Test diff --git a/public/testdata/breakpoint-example.vcf b/public/testdata/breakpoint-example.vcf new file mode 100644 index 0000000000..f015e1721b --- /dev/null +++ b/public/testdata/breakpoint-example.vcf @@ -0,0 +1,6 @@ +##fileformat=VCFv4.1 +#CHROM POS ID REF ALT QUAL FILTER INFO +22 50 bnd_W G G]22:6000] 6 PASS SVTYPE=BND;MATEID=bnd_Y +22 51 bnd_V T ]22:55]T 6 PASS SVTYPE=BND;MATEID=bnd_U +22 55 bnd_U C C[22:51[ 6 PASS SVTYPE=BND;MATEID=bnd_V +22 6000 bnd_Y A A]22:50] 6 PASS SVTYPE=BND;MATEID=bnd_W \ No newline at end of file diff --git a/settings/repository/edu.mit.broad/picard-private-parts-2164.jar b/settings/repository/edu.mit.broad/picard-private-parts-2181.jar similarity index 88% rename from settings/repository/edu.mit.broad/picard-private-parts-2164.jar rename to settings/repository/edu.mit.broad/picard-private-parts-2181.jar index 4465f91f57..ef33718276 100644 Binary files a/settings/repository/edu.mit.broad/picard-private-parts-2164.jar and b/settings/repository/edu.mit.broad/picard-private-parts-2181.jar differ diff --git a/settings/repository/edu.mit.broad/picard-private-parts-2164.xml b/settings/repository/edu.mit.broad/picard-private-parts-2181.xml similarity index 58% rename from settings/repository/edu.mit.broad/picard-private-parts-2164.xml rename to settings/repository/edu.mit.broad/picard-private-parts-2181.xml index 6a22ea2c36..d11423b598 100644 --- a/settings/repository/edu.mit.broad/picard-private-parts-2164.xml +++ b/settings/repository/edu.mit.broad/picard-private-parts-2181.xml @@ -1,3 +1,3 @@ - + diff --git a/settings/repository/net.sf/picard-1.58.1057.xml b/settings/repository/net.sf/picard-1.58.1057.xml deleted file mode 
100644 index 15c5b5620c..0000000000 --- a/settings/repository/net.sf/picard-1.58.1057.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/net.sf/picard-1.58.1057.jar b/settings/repository/net.sf/picard-1.59.1066.jar similarity index 91% rename from settings/repository/net.sf/picard-1.58.1057.jar rename to settings/repository/net.sf/picard-1.59.1066.jar index 4a82a3058e..1bbfd5a19e 100644 Binary files a/settings/repository/net.sf/picard-1.58.1057.jar and b/settings/repository/net.sf/picard-1.59.1066.jar differ diff --git a/settings/repository/net.sf/picard-1.59.1066.xml b/settings/repository/net.sf/picard-1.59.1066.xml new file mode 100644 index 0000000000..73bc3ffee3 --- /dev/null +++ b/settings/repository/net.sf/picard-1.59.1066.xml @@ -0,0 +1,3 @@ + + + diff --git a/settings/repository/net.sf/sam-1.58.1057.xml b/settings/repository/net.sf/sam-1.58.1057.xml deleted file mode 100644 index 4f0dfe44e3..0000000000 --- a/settings/repository/net.sf/sam-1.58.1057.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/net.sf/sam-1.58.1057.jar b/settings/repository/net.sf/sam-1.59.1066.jar similarity index 96% rename from settings/repository/net.sf/sam-1.58.1057.jar rename to settings/repository/net.sf/sam-1.59.1066.jar index 804e21b616..8380da8644 100644 Binary files a/settings/repository/net.sf/sam-1.58.1057.jar and b/settings/repository/net.sf/sam-1.59.1066.jar differ diff --git a/settings/repository/net.sf/sam-1.59.1066.xml b/settings/repository/net.sf/sam-1.59.1066.xml new file mode 100644 index 0000000000..75a327daa3 --- /dev/null +++ b/settings/repository/net.sf/sam-1.59.1066.xml @@ -0,0 +1,3 @@ + + + diff --git a/settings/repository/org.broad/tribble-46.jar b/settings/repository/org.broad/tribble-53.jar similarity index 87% rename from settings/repository/org.broad/tribble-46.jar rename to settings/repository/org.broad/tribble-53.jar index 401fcfc3a9..02865df435 100644 Binary files a/settings/repository/org.broad/tribble-46.jar and b/settings/repository/org.broad/tribble-53.jar differ diff --git a/settings/repository/org.broad/tribble-46.xml b/settings/repository/org.broad/tribble-53.xml similarity index 51% rename from settings/repository/org.broad/tribble-46.xml rename to settings/repository/org.broad/tribble-53.xml index bb8df5c876..cae6cf15ac 100644 --- a/settings/repository/org.broad/tribble-46.xml +++ b/settings/repository/org.broad/tribble-53.xml @@ -1,3 +1,3 @@ - +
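For reference, the records in the new breakpoint-example.vcf use VCF 4.1 breakend (BND) notation: each ALT allele embeds its mate's position and orientation, so G]22:6000] on bnd_W points at its MATEID partner bnd_Y at 22:6000, and the bracket direction encodes which side of each breakend is joined. As an illustration only (no such parser is part of this patch), a small Scala sketch that pulls the mate coordinate out of a breakend ALT string:

import scala.util.matching.Regex

object BreakendSketch {
  // Matches the bracketed mate locus in breakend ALT alleles such as
  // "G]22:6000]", "]22:55]T", or "C[22:51[".
  private val MateLocus: Regex = """[\[\]]([^\[\]:]+):(\d+)[\[\]]""".r

  def matePosition(alt: String): Option[(String, Int)] =
    MateLocus.findFirstMatchIn(alt).map(m => (m.group(1), m.group(2).toInt))

  def main(args: Array[String]) {
    Seq("G]22:6000]", "]22:55]T", "C[22:51[").foreach { alt =>
      println(alt + " -> " + matePosition(alt))  // e.g. G]22:6000] -> Some((22,6000))
    }
  }
}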